├── .gitignore ├── 00.data.preprocess ├── Makefile ├── README.org └── src │ ├── main │ ├── R │ │ ├── sa2.05.L1_cluster.R │ │ └── supple.01.update.sample.info.R │ ├── pipeline │ │ ├── alignment.Snakefile │ │ └── snapatac2.qc.Snakefile │ ├── python │ │ ├── sa2.01.preprocess.py │ │ ├── sa2.02.sum.pp.py │ │ ├── sa2.03.l1_embed.py │ │ ├── sa2.03.preL1Clustering.py │ │ ├── sa2.04.l1_knn.py │ │ └── sa2.05.l1_cluster.py │ ├── resource │ │ ├── cluster.json │ │ ├── config.json │ │ ├── mba.whole.sample.lst │ │ └── snapatac2.qc.config.yaml │ └── shell │ │ └── run_alignment.sh │ └── test │ └── python │ └── 01.test.makefragment.py ├── 01.clustering ├── L3_dlt2_cids.txt ├── L4_dlt2_cids.txt ├── README.org ├── Snakefile ├── config.yaml ├── makefile ├── meta │ ├── mba.test.sample │ ├── mba.whole.sample.lst │ └── mm10.blacklist.bed ├── post_script │ ├── build.sbt │ ├── rByMaxSilsL4.csv │ ├── sa2.L1.clustering.barcode2id.py │ ├── sa2.bmat.dlt.compare.py │ ├── sa2.dlt2.sumL3.R │ ├── sa2.dlt2.sumL4.R │ ├── sa2_L1_consensus.R │ ├── sa2_dlt2_L3_rByMaxSils.csv │ ├── sa2_dlt2_L3_sum.xlsx │ └── sa2_dlt2_prepareL2.scala ├── rerun.Snakefile ├── resource │ ├── bmatfile.csv │ ├── bmatfile_L0.csv │ ├── bmatfile_L0_condo.csv │ ├── sa2_L0_cluster2size.csv │ ├── sa2_L1_cluster2size.csv │ ├── sa2_dlt2_L1_cluster2size.csv │ └── sa2_dlt2_L1_cluster2size_test.csv ├── sa2.gmat.Snakefile ├── sa2.qc.dlt.Snakefile └── script │ ├── sa2.clustering.umap.py │ ├── sa2.embed.py │ ├── sa2.get.sample.gmat.py │ ├── sa2.knn.py │ ├── sa2.leiden.py │ ├── sa2.merge.gmat.py │ ├── sa2.merge.rmdlt.py │ ├── sa2.pre.anndataset.py │ ├── sa2.qc.dlt.py │ ├── sa2.rm.dlt.py │ ├── sa2.united.py │ ├── supple.sa2.add.barcode.to.unite.clustering.py │ ├── supple.sa2.bmat.dlt.py │ ├── supple.sa2.get.embed.py │ ├── supple.sa2.prepare.L1.py │ ├── supple.sa2.prepare.L2.in.memory.subset.py │ ├── supple.sa2.prepare.L2.py │ ├── supple.sa2.prepare.L3.R │ └── supple.sa2.prepare.L4.R ├── 02.integration ├── Makefile ├── README.org └── src │ ├── main │ ├── R │ │ ├── TransferLabel.R │ │ ├── analyzetf.R │ │ ├── annToS5.R │ │ ├── downsample.Allen.Seurat.on.subclass.level.R │ │ ├── downsample.sa2.Seurat.on.subclass.level.R │ │ ├── dp.seurat5.intgn.R │ │ ├── getIntUMAP.R │ │ ├── heavySummaryOfTranserlabel_IMN.R │ │ ├── heavySummaryOfTransferLabel.R │ │ ├── lightSummaryOfTransferLabel.R │ │ ├── mapSubclassNames.R │ │ ├── post_1st_tf.R │ │ ├── reciprocal.KNN.R │ │ ├── rknn2.R │ │ ├── rough.annot.L3.using.snATACv1.R │ │ ├── runPCA.R │ │ ├── simple.gene.list.of.allen.R │ │ └── sumSnapATAC2Meta.R │ ├── pipeline │ │ └── Seurat.TransferLabel.Snakefile │ ├── python │ │ ├── 01.extract.allen.py │ │ ├── 02.pseudobulk.allen.py │ │ ├── 02.pseudobulk.sa2.py │ │ ├── imneuron.py │ │ └── reduce.anndata.allen.sa2.py │ └── resource │ │ ├── AIT21_ReadMe.txt │ │ ├── AIT21_annotation.tsv │ │ ├── AIT21_annotation_freeze_081523.tsv │ │ ├── AIT21_cluster_markers.txt │ │ ├── AIT21_k8_markers.txt │ │ ├── AIT21_merfish_markers.txt │ │ ├── AllenIMNTopMajorRegionRelatedAllenCls.txt │ │ ├── AllenIMNTopMajorRegionRelatedL4s.txt │ │ ├── BICCN.BrainRegionMetadata.xlsx │ │ ├── allen_supple_v1 │ │ ├── ED_Table1_complete_CCFv3_ontology.csv │ │ ├── ED_Table5_select.markers.csv │ │ ├── TF263.csv │ │ └── TF499.txt │ │ ├── atac.subclass2size.v1.csv │ │ ├── sa2_dlt2_L3toSa1Annot.rough.csv │ │ └── subclass_nm_in_macs2_bigwig.txt │ └── test │ ├── R │ ├── analyze.Intgn.Seurat5.R │ ├── test.Seurat.with.anndata.R │ └── test.snakemake.wildcards.R │ ├── pipeline │ ├── Makefile │ └── R.Snakefile │ └── python │ └── 
prepare.intg.test.ann.py ├── 03.peakcalling ├── Makefile ├── bin │ └── merge_peaks └── src │ └── main │ ├── R │ ├── addpL4Info2atacMeta.R │ ├── filterPeakByscbgModel.R │ ├── filterPeakFromPseudoBulk.R │ ├── finalizedpeaks.R │ ├── fitbgmodel.R │ ├── iterativeMergePeak.R │ ├── preparePeakCallingByMergeNN.R │ ├── preparePeakCallingByMergeNeuron.R │ ├── subclass2peak.R │ └── sumReproducePeaks.R │ ├── pipeline │ ├── getsa2pmat.Snakefile │ ├── mergePeak.Snakefile │ ├── scfilter.Snakefile │ └── snap2.peakcalling.Snakefile │ ├── python │ ├── get_full_snap2.py │ ├── prepare_bedfiles.py │ ├── run_macs2.py │ ├── sa2_get_peakfrac.py │ └── sa2pmat.py │ ├── resource │ ├── all.pL4.meta.csv │ ├── all_pL4s.txt │ ├── config.yaml │ ├── mba.whole.sample.lst │ └── test_neuron_L4pc2sizes.csv │ └── shell │ ├── export_unionpeak.sh │ ├── get_reproduce_peak_within_cluster.sh │ └── intersect_mergepeak.sh ├── 04.nmf ├── Makefile └── src │ └── main │ ├── R │ ├── 01.prepare.nmf.R │ ├── 02.nmfATAC.plotH.R │ ├── 02.nmfATAC.plotW.R │ ├── 02.nmfATAC.statBox.R │ ├── 03.sumnmf.R │ ├── 04.nmf.plot.R │ └── 05.splitPeakByModule.R │ ├── pipeline │ └── nmf.Snakefile │ ├── python │ ├── 02.nmf.py │ ├── 02.nmfATAC.stat.py │ └── 02.post_nmf.py │ └── resource │ └── config.yaml ├── 05.cCREgene └── sa2.cicero │ ├── Makefile │ └── src │ └── main │ ├── R │ ├── 03.filterCiceroByShuf.R │ ├── 06.summaryDistalProximalConns.R │ ├── 07.cor.scRNAseq.R │ ├── 08.summarize.cor.R │ ├── 09.sa2.subclass.specific.ppdc.R │ ├── cicero_mouse_atlas.R │ ├── run_cicero.R │ ├── run_cicero_shuffle.R │ ├── sa2.pdc.of.globalpeaks.R │ ├── supple.07.01.get.pdc.and.rdm.gene2peak.R │ └── supple.07.02.get.RNA.ATAC.cpm.R │ ├── pipeline │ ├── pdc.Snakefile │ └── runCicero.Snakefile │ ├── resource │ └── config.yaml │ └── shell │ ├── 04.addTSSAnnot2Conns.sh │ ├── 05.mergeDistalProximalConns.sh │ ├── 09.get.pos.neg.pdc.info.sh │ ├── alignv1.to.bedpe.sh │ ├── sa2.all.distal.peaks.sh │ └── supple.02.annotPeakBasedOnTSS.sh ├── 06.motifanalysis ├── Makefile ├── README.org └── src │ └── main │ ├── R │ └── 05.splitPeakByModule.R │ ├── pipeline │ └── motif.Snakefile │ └── python │ └── test.scienicplus.py ├── 07.m3C ├── README.org ├── hic2 │ └── hic2.sh ├── runHiC.sh └── subclass.txt ├── 08.GRN ├── Makefile └── src │ └── main │ ├── pipeline │ └── celloracle.Snakefile │ ├── python │ ├── 01.runGimmemotifs.py │ ├── 02.mergeGimme.py │ ├── 03.seurat2anndata.py │ ├── 04.runGRN.py │ └── 05.plot.powerlaw.py │ └── resource │ ├── CisBP_ver2_Mus_musculus.motif2factors.txt │ ├── config.yaml │ └── sa2.allen.vf3281.gene.txt ├── 09.cCRE_conservation ├── 01.reciLiftOver.sh └── 02.orthologous.R ├── 10.cCRE_TE ├── 01.highTE.subclass.R └── 02.TE.variability.R ├── 11.deeplearning ├── README.org └── src │ └── main │ └── resource │ └── mappedHMB.txt ├── LICENSE ├── README.org ├── manuscript_figures ├── Fig1.R ├── Fig5.R ├── paper.R ├── sa2.Fig2.R ├── sa2.Fig3.R ├── sa2.Fig4.R └── sa2.sc2region.R ├── meta ├── BICCN.BrainRegionMetadata.xlsx ├── BrainRegion.Metadata.txt ├── allen.region.to.main.region.v2.txt ├── allen_subclass_RegionMeta.csv ├── atac_L4_MajorRegion.csv ├── dissect2time.csv ├── ensemble.genesymbol.allengenesymbol.csv ├── gencode.vM16.geneUp2k.bed ├── gencode.vM23.gene.tssUpDn1k.bed ├── getGeneUp2K.sh ├── makefile ├── mm10-blacklist.v2.bed ├── mm10.blacklist.bed ├── mm10.chrom.sizes ├── mm10.chrom.sizes.lite ├── mouse.modified.gencode.vM23.bed ├── mouse.modified.gencode.vM23.gene.up2k.bed ├── neuron_cell_markers.csv ├── sa2.subclass.names.map.csv ├── sa2.subclass.srt.txt ├── 
sa2.subclass2region2score.csv ├── sample2bamfile.csv ├── sample2rawbam.csv ├── subclass_and_genemarker_CEMBAv1.xlsx └── whole.brain.cellname.org ├── package ├── R │ ├── annot.R │ ├── bed.R │ ├── cembav2env.R │ ├── cicero.R │ ├── colors.R │ ├── dendro.R │ ├── gglot.theme.R │ ├── gmat.R │ ├── grn.R │ ├── hc.R │ ├── hdf5.R │ ├── igv.R │ ├── integration.R │ ├── loadSnap.R │ ├── peak.R │ ├── plot.R │ ├── prob.R │ ├── region.R │ └── utils.R ├── python │ ├── bedpe2bigwig.py │ ├── cembav2env.py │ ├── colors.py │ ├── leiden.py │ ├── myanndata.py │ ├── mycelloracle.py │ ├── mylog.py │ ├── mysnapatac2.py │ ├── snap2h5ad.py │ └── utils.py └── tasks │ ├── getAllL3SnapMat │ ├── L2GroupAll.csv │ ├── Makefile │ ├── Snakefile.template │ ├── bmat │ │ └── config.json │ ├── getSnapATACMat.R │ ├── profile.template │ │ ├── cluster.yaml │ │ └── config.yaml │ └── vM16gmat │ │ ├── L2GroupAll.csv │ │ └── config.json │ ├── getSnapATACMatByGroup │ ├── L1Group.csv │ ├── L2GroupAll.csv │ ├── L2GroupTest.csv │ ├── L2MultiGroup.csv │ ├── Makefile │ ├── Snakefile.template │ ├── config.json.template │ ├── configL1.json.template │ ├── configMultiGroup.json.template │ ├── getSnapATACMatByGroup.R │ └── profile.template │ │ ├── cluster.yaml │ │ └── config.yaml │ └── nmf │ ├── 01.prepare.nmf.R │ ├── 02.nmf.py │ ├── 02.nmfATAC.plotH.R │ ├── 02.nmfATAC.plotW.R │ ├── 02.nmfATAC.stat.py │ ├── 02.nmfATAC.statBox.R │ ├── 02.post_nmf.py │ ├── 03.sumnmf.R │ ├── 04.nmf.plot.R │ ├── 05.splitPeakByModule.R │ ├── Makefile │ ├── config.yaml │ ├── nmf.Snakefile │ ├── profile │ ├── .cluster.yaml.~undo-tree~ │ ├── cluster.yaml │ └── config.yaml │ └── supple.01.prepare.nmf.R ├── repo_figures ├── GraphAbstract.jpg ├── GraphAbstract.tif ├── snATAC-seq_analysis_pipeline.jpg └── snATAC-seq_analysis_pipeline.pdf ├── snakemake.template ├── Makefile ├── README.org ├── Snakefile ├── config.yaml ├── pbs.demo.sh └── profile │ ├── cluster.yaml │ └── config.yaml └── supple.datashare ├── paper.R ├── read.Supple.Nature.py ├── sa2.supplementary.tables.R └── share.peak.by.majorRegion.R /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | /meta/._subclass_and_genemarker_CEMBAv1.xlsx 131 | /repo_figures/._snATAC-seq_analysis_pipeline.jpg 132 | /repo_figures/._snATAC-seq_analysis_pipeline.pdf 133 | /repo_figures/._GraphAbstract.tif 134 | /repo_figures/._GraphAbstract.jpg 135 | /data/ 136 | /00.data.preprocess/out/ 137 | /00.data.preprocess/src/test/python/.tmprOXZWn/ 138 | *.Rhistory 139 | /._.gitignore 140 | -------------------------------------------------------------------------------- /00.data.preprocess/Makefile: -------------------------------------------------------------------------------- 1 | sa2_pp: snapatac2.qc.Snakefile 2 | snakemake -c 1 --config system=silencer \ 3 | --snakefile $< -R --rerun-incomplete --profile profile 4 | 5 | 6 | encoder_sa2_l1_embed: snapatac2.qc.Snakefile 7 | snakemake -c 4 --config system=silencer debug=1 \ 8 | --snakefile $< --until snapatac2_l1_embed -R --rerun-incomplete 9 | 10 | encoder_sa2_l1_knn_hora: snapatac2.qc.Snakefile 11 | snakemake -c 4 --config system=silencer debug=1 \ 12 | knn_method=hora \ 13 | --snakefile $< --until snapatac2_l1_knn -R --rerun-incomplete 14 | 15 | encoder_sa2_l1_knn_exact: snapatac2.qc.Snakefile 16 | snakemake -c 4 --config system=silencer debug=1 \ 17 | knn_method=exact \ 18 | --snakefile $< --until snapatac2_l1_knn -R --rerun-incomplete 19 | 20 | .PHONY: tscc_sa2_l1_embed 21 | tscc_sa2_l1_embed: snapatac2.qc.Snakefile snapatac2.qc.config.yaml 22 | -mkdir -p $@ 23 | cp $(word 2,$^) $@/$(word 2,$^) 24 | cp $< $@/$< 25 | cp -R profile/. 
$@/profile
26 | 	cd $@ && \
27 | 	snakemake -c 1 --config system=tscc debug=1 \
28 | 	--snakefile $< --until snapatac2_l1_embed -R \
29 | 	--rerun-incomplete --profile profile
30 | 
31 | .PHONY: tscc_sa2_l1_knn_exact
32 | l1_knn_dir := tscc_sa2_l1_embed
33 | tscc_sa2_l1_knn_exact: snapatac2.qc.Snakefile snapatac2.qc.config.yaml
34 | 	cp $(word 2,$^) ${l1_knn_dir}/$(word 2,$^)
35 | 	cp $< ${l1_knn_dir}/$<
36 | 	cp -R profile/. ${l1_knn_dir}/profile
37 | 	cd ${l1_knn_dir} && \
38 | 	snakemake -c 1 --config \
39 | 	system=tscc debug=1 \
40 | 	knn_method=exact \
41 | 	--snakefile $< --until snapatac2_l1_knn -R \
42 | 	--rerun-incomplete --profile profile
43 | 
44 | 
45 | # test
46 | test_sa2_pp: snapatac2.qc.Snakefile
47 | 	snakemake -c 1 --config system=imac debug=1 \
48 | 	--snakefile $< -R --rerun-incomplete
49 | 
50 | test_sa2_pp_tscc: snapatac2.qc.Snakefile
51 | 	snakemake -c 1 --config system=tscc debug=1 \
52 | 	--snakefile $< -R --rerun-incomplete
53 | 
54 | test_sa2_l1_embed: snapatac2.qc.Snakefile
55 | 	snakemake -c 2 --config system=imac debug=1 \
56 | 	--snakefile $< --until snapatac2_l1_embed -R --rerun-incomplete
57 | 
58 | test_sa2_l1_knn: snapatac2.qc.Snakefile
59 | 	snakemake -c 2 --config system=imac debug=1 \
60 | 	--snakefile $< --until snapatac2_l1_knn -R --rerun-incomplete
61 | 
62 | .PHONY: clean
63 | clean:
64 | 	-rm snapatac2_l1_embed..03*
65 | 	-rm snapatac2_pp.sample=*.o3*
66 | 	-rm -rf .tmp*
67 | 
-------------------------------------------------------------------------------- /00.data.preprocess/README.org: --------------------------------------------------------------------------------
1 | * Alignment
2 | - All the bam files are generated by snaptools, an older
3 |   Python package developed for the SnapATAC R package.
4 | - In snaptools, BWA is used for the alignment.
5 | - See details under src/main/pipeline/alignment.Snakefile
6 | - The config.json and cluster.json are under src/main/resource
7 | 
8 | * Quality Control and Doublet Removal
9 | - After generating the bam files, we use SnapATAC2 for our
10 |   downstream analysis.
11 | - See details under src/main/pipeline/snapatac2.qc.Snakefile.
12 | - The snapatac2.qc.config.yaml is under src/main/resource.
13 | ** Quality control
14 | - At the bulk level, we check the quality of the sequencing based on
15 |   the sequencing depth, the number of unique fragments, and so on. The
16 |   most important check is whether the fragment sizes in each
17 |   sample are enriched around 100 bp and 200 bp (see [[https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-1929-3][ref]]).
18 | - At the single-cell level, following the
19 |   [[https://www.encodeproject.org/atac-seq/][ENCODE pipeline]], we use two criteria to filter cells:
20 |   1. Number of unique fragments >= 1,000
21 |   2. Transcription start site enrichment (TSSe) >= 10
22 | ** Doublet removal
23 | - We use Scrublet for this; the gene expression imputed from the
24 |   snATAC-seq data is similar to what Seurat computes.
25 | 
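** Example: QC and doublet removal in SnapATAC2
A minimal sketch of the flow above, for orientation only; it is not
the pipeline code. The sample and file names are placeholders, and
SnapATAC2 function signatures vary slightly across versions. See
src/main/pipeline/snapatac2.qc.Snakefile and
src/main/resource/snapatac2.qc.config.yaml for the actual
implementation and parameters.
#+BEGIN_SRC python
import snapatac2 as sa2

# Import fragments for one sample into a backed AnnData file
# (the sample name here is a placeholder).
adata = sa2.pp.import_data(
    "CEMBA180110_4E.fragments.tsv.gz",
    chrom_sizes=sa2.genome.mm10,
    file="CEMBA180110_4E.h5ad",
    sorted_by_barcode=False,
)

# Single-cell QC: keep barcodes with >= 1,000 unique fragments
# and TSS enrichment >= 10, as stated above.
sa2.metrics.tsse(adata, sa2.genome.mm10)
sa2.pp.filter_cells(adata, min_counts=1000, min_tsse=10)

# Doublet removal with SnapATAC2's Scrublet implementation; the
# scripts in this repository keep cells with doublet
# probability <= 0.5.
sa2.pp.add_tile_matrix(adata)
sa2.pp.select_features(adata)
sa2.pp.scrublet(adata)
sa2.pp.filter_doublets(adata, probability_threshold=0.5)
adata.close()
#+END_SRC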
-------------------------------------------------------------------------------- /00.data.preprocess/src/main/R/supple.01.update.sample.info.R: --------------------------------------------------------------------------------
1 | library(data.table)
2 | 
3 | # Feedback from Hanqing
4 | ## {"AMY-1": "AMY-2", "AMY-2": "AMY-1"}, {"ACB-2": "CP-1", "CP-1": "ACB-2"}
5 | ## 1. For the AMY pair, I am not sure on whose side the swap happened, since the cell types are quite similar; but the two samples are definitely different.
6 | ## 2. For the CP pair, the swap should be on the ATAC side.
7 | 
8 | # For current sample data we have
9 | ## ACB-2: 4E, CEMBA180110_4E and CEMBA180111_4E
10 | ## CP-1: 4D, CEMBA171214_4D and CEMBA171219_4D
11 | ## AMY-1: 7H, CEMBA200820_7H and CEMBA200827_7H
12 | ## AMY-2: 8H, CEMBA200903_8H and CEMBA200910_8H
13 | 
14 | 
15 | # ATAC-seq experiments record:
16 | # https://docs.google.com/spreadsheets/d/1HbYP0tLpv4rPwkJn6uZPjR_M7dnIRglzeTcY9Os-CVA/edit#gid=1893884661
17 | 
18 | # ATAC-seq LIMS spreadsheet
19 | # https://docs.google.com/spreadsheets/d/1UPkKv3potJtNEbYxkpMY5X_V1xgkRbkvkW4Qi14o1x4/edit#gid=1307716493
20 | 
21 | # Comments from Yang
22 | ## The brain dissection IDs (4D: CP-1, etc.) are definitely consistent; there is no doubt about that.
23 | ## From the description above, it is unclear which region CP-1 was swapped with. We can check the lab records to see whether the two swapped samples were processed on the same day; if not, they are easy to tell apart.
24 | ## Assuming Marga did not mislabel the samples: after the tissue was sent to the two labs at the same time, one of the labs mislabeled its samples.
25 | ## So we cross-check the dates recorded by Marga, by Hanqing, and by us.
26 | 
27 | ## Experiments are usually run in batches of four samples, e.g., CP-1 plus three other samples on the 1st, and ACB-2 plus three other samples on the 5th.
28 | ## Hypothetically:
29 | ## Marga, Hanqing, Ren Lab
30 | ## CP-1 1 5 1
31 | ## others 1 5 1
32 | ## ACB-1 5 1 5
33 | ## Then you can tell where the error occurred.
34 | 
-------------------------------------------------------------------------------- /00.data.preprocess/src/main/python/sa2.04.l1_knn.py: --------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from pathlib import Path
4 | import logging
5 | import numpy as np
6 | 
7 | import argparse
8 | import shutil
9 | 
10 | import snapatac2 as sa2
11 | import pyprojroot
12 | code_root_dir = str(pyprojroot.here())
13 | pack_dir = f"{code_root_dir}/package/python"
14 | sys.path.insert(0, pack_dir)
15 | import utils # type: ignore # noqa: E402
16 | 
17 | parser = argparse.ArgumentParser("snapatac2 L1 KNN")
18 | parser.add_argument("--embed_file", type = str)
19 | parser.add_argument("--outf", type = str, default = "test_knn.hdf5")
20 | parser.add_argument("--kmethod", type = str, default = 'exact')
21 | parser.add_argument("--knn", type = int, default = 50)
22 | parser.add_argument("--logfile", type = str,
23 |                     default = "log/test_sa2_l1_embed_knn.log")
24 | parser.add_argument("--debug", type = int, default = 1)
25 | parser.add_argument("-i", "--ipython", action = "store_true")
26 | parser.add_argument("--simple-prompt", action = "store_true")
27 | 
28 | args = parser.parse_args()
29 | 
30 | # * set log
31 | logger = utils.set_file_logger(fnm = args.logfile, #type: ignore
32 |                                name = "sa2.04.l1_knn")
33 | if args.debug == 0:
34 |     debug = False
35 | else:
36 |     debug = True
37 |     logger.warning("DEBUG mode is on")
38 | 
39 | # * meta
40 | k = args.knn
41 | km = args.kmethod
42 | embed_file = args.embed_file
43 | if not os.path.exists(embed_file):
44 |     err_msg = f"{embed_file} is not found"
45 |     logger.error(err_msg)
46 |     sys.exit(err_msg)
47 | outf = args.outf
48 | outdir = os.path.dirname(outf)
49 | if os.path.exists(outf):
50 |     logger.warning(f"{outf} exists; removing it.")
51 |     os.remove(outf)
52 | else:
53 |     os.makedirs(outdir, exist_ok = True)
54 | 
55 | 
56 | # * main
57 | logger.info(f"Copy {embed_file} to {outf}")
58 | shutil.copyfile(src = embed_file,
59 |                 dst = outf)
60 | 
61 | logger.info(f"Read AnnData to RAM from: {outf}")
62 | sds = sa2.read(outf, backend = None)
63 | 
64 | logger.info("Start to run KNN.")
65 | sa2.pp.knn(
66 |     adata = sds,
67 |     n_neighbors = k,
68 |     use_dims = None,
69 |     use_rep = 'X_spectral',
70 |     method = km,
71 |     inplace = True,
72 |     random_state = 0
73 | )
74 | sds.close()
75 | logger.info("Done")
76 | 
--------------------------------------------------------------------------------
/00.data.preprocess/src/main/resource/cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "__default__" : 3 | { 4 | "time" : "walltime=48:00:00", 5 | "ppn" : "nodes=1:ppn=4", 6 | "queue" : "hotel" 7 | }, 8 | "snap_align" : 9 | { 10 | "time" : "walltime=48:00:00", 11 | "ppn" : "nodes=1:ppn=4", 12 | "queue" : "hotel" 13 | }, 14 | "snap_pre" : 15 | { 16 | "time" : "walltime=24:00:00", 17 | "ppn" : "nodes=1:ppn=4", 18 | "queue" : "hotel" 19 | }, 20 | "snap_add_bmat" : 21 | { 22 | "time" : "walltime=24:00:00", 23 | "ppn" : "nodes=1:ppn=2", 24 | "queue" : "hotel" 25 | }, 26 | "snap_add_gmat" : 27 | { 28 | "time" : "walltime=24:00:00", 29 | "ppn" : "nodes=1:ppn=2", 30 | "queue" : "hotel" 31 | }, 32 | "pre_sta" : 33 | { 34 | "time" : "walltime=24:00:00", 35 | "ppn" : "nodes=1:ppn=5", 36 | "queue" : "hotel" 37 | }, 38 | "cluster" : 39 | { 40 | "time" : "walltime=24:00:00", 41 | "ppn" : "nodes=1:ppn=6", 42 | "queue" : "hotel" 43 | }, 44 | "plotGene" : 45 | { 46 | "time" : "walltime=24:00:00", 47 | "ppn" : "nodes=1:ppn=2", 48 | "queue" : "hotel" 49 | }, 50 | "dump_frag" : 51 | { 52 | "time" : "walltime=24:00:00", 53 | "ppn" : "nodes=1:ppn=2", 54 | "queue" : "home" 55 | }, 56 | "tsse2depth" : 57 | { 58 | "time" : "walltime=24:00:00", 59 | "ppn" : "nodes=1:ppn=2", 60 | "queue" : "home" 61 | }, 62 | "bam2bedpe": 63 | { 64 | "time" : "walltime=24:00:00", 65 | "ppn" : "nodes=1:ppn=1", 66 | "queue" : "hotel" 67 | }, 68 | "snap2cb" : 69 | { 70 | "time" : "walltime=1:00:00", 71 | "ppn" : "nodes=1:ppn=1", 72 | "queue" : "glean" 73 | } 74 | } 75 | 76 | 77 | -------------------------------------------------------------------------------- /00.data.preprocess/src/main/resource/snapatac2.qc.config.yaml: -------------------------------------------------------------------------------- 1 | system: imac 2 | conda: 3 | imac: /Users/szu/mambaforge 4 | tscc: /projects/ps-renlab2/szu/miniconda3 5 | silencer: /projects/ps-renlab2/szu/miniconda3 6 | conda_env: 7 | imac: sa2dev 8 | silencer: sa2dev 9 | tscc: sa2dev_tscc 10 | python: 11 | imac: /Users/szu/mambaforge/envs/sa2dev/bin/python 12 | silencer: /projects/ps-renlab2/szu/miniconda3/envs/sa2dev/bin/python 13 | tscc: /projects/ps-renlab2/szu/miniconda3/envs/sa2dev/bin/python 14 | project_dir: 15 | imac: /Users/szu/git-recipes/mouseBrainAtlas/CEMBA2 16 | silencer: /projects/ps-renlab2/szu/projects/CEMBA2 17 | tscc: /projects/ps-renlab2/szu/projects/CEMBA2 18 | work_dir: 19 | imac: /Users/szu/git-recipes/mouseBrainAtlas/CEMBA2/00.data.preprocess 20 | silencer: /projects/ps-renlab2/szu/projects/CEMBA2/00.data.preprocess 21 | tscc: /projects/ps-renlab2/szu/projects/CEMBA2/00.data.preprocess 22 | out_dir: snapatac2_pp_out 23 | sample2bamfile: 24 | imac: 00.data.preprocess/test_sample2bamfile.csv 25 | silencer: meta/sample2rawbam.csv 26 | tscc: 00.data.preprocess/test_sample2bamfile_tscc.csv 27 | chrom_size_file: meta/mm10.chrom.sizes.lite 28 | gtf_file: meta/gencode.vM23.gene.annot2.gtf 29 | blacklist_file: meta/mm10.blacklist.bed 30 | samples_file: 31 | imac: 00.data.preprocess/test_samples.txt 32 | silencer: meta/CEMBA_all_samples.txt 33 | tscc: meta/CEMBA_all_samples.txt 34 | embed_nfeature: 500000 35 | embed_ncomps: 50 36 | embed_sample_size: 1.0 37 | distance_metric: cosine 38 | knn: 50 39 | knn_method: hora 40 | debug: 0 41 | 42 | -------------------------------------------------------------------------------- /00.data.preprocess/src/main/shell/run_alignment.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Author: Yang Li 4 | #File: run.sh 5 | #Create Date: Sat Feb 26 10:26:52 PST 2022 6 | 7 | snakemake -p --rerun-incomplete -k -j 128 --cluster "qsub -l {cluster.ppn} -l {cluster.time} -N {params.jobname} -q {cluster.queue} -o pbslog/{params.jobname}.pbs.out -e pbslog/{params.jobname}.pbs.err" --jobscript jobscript.pbs --jobname "{rulename}.{jobid}.pbs" --cluster-config cluster.json 2>run.log 8 | -------------------------------------------------------------------------------- /00.data.preprocess/src/test/python/01.test.makefragment.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import os 3 | import snapatac2 as sa2 4 | import pyprojroot 5 | 6 | proj_root = pyprojroot.here() 7 | raw_bam_dir: str = os.path.join(proj_root, "data", 8 | "raw_bam_test") 9 | dedup_bam_dir: str = os.path.join(proj_root, "data", 10 | "r") 11 | samples: List[str] = [ 12 | "CEMBA181023_6B", "CEMBA201210_10D"] 13 | outdir = os.path.join(proj_root, "00.data.preprocess", 14 | "out/test") 15 | 16 | for s in samples: 17 | stat = sa2.pp.make_fragment_file( 18 | bam_file = os.path.join(raw_bam_dir, f"{s}.bam"), 19 | output_file = os.path.join(outdir, 20 | f"{s}.frag.rawbam.tsv"), 21 | is_paired = True, 22 | barcode_regex = "^(\w+):.+", 23 | shift_left = 4, 24 | shift_right = -4, 25 | min_mapq = 30 26 | ) 27 | print(stat) 28 | 29 | for s in samples: 30 | stat = sa2.pp.make_fragment_file( 31 | bam_file = os.path.join(proj_root, "data", 32 | "filtered_dedup_bam", 33 | f"{s}.filtered_dedup.bam"), 34 | output_file = os.path.join(outdir, 35 | f"{s}.frag.filtered_debup_bam.tsv"), 36 | is_paired = True, 37 | barcode_regex = "^(\w+):.+", 38 | shift_left = 4, 39 | shift_right = -4, 40 | min_mapq = 30 41 | ) 42 | 43 | -------------------------------------------------------------------------------- /01.clustering/L3_dlt2_cids.txt: -------------------------------------------------------------------------------- 1 | 1-1 2 | 1-10 3 | 1-11 4 | 1-12 5 | 1-13 6 | 1-14 7 | 1-15 8 | 1-16 9 | 1-17 10 | 1-18 11 | 1-19 12 | 1-2 13 | 1-20 14 | 1-21 15 | 1-22 16 | 1-23 17 | 1-24 18 | 1-25 19 | 1-26 20 | 1-27 21 | 1-28 22 | 1-29 23 | 1-3 24 | 1-30 25 | 1-4 26 | 1-5 27 | 1-6 28 | 1-7 29 | 1-8 30 | 1-9 31 | 10-1 32 | 10-2 33 | 10-3 34 | 11-1 35 | 11-2 36 | 11-3 37 | 11-4 38 | 12-1 39 | 12-2 40 | 12-3 41 | 12-4 42 | 13-1 43 | 13-2 44 | 13-3 45 | 13-4 46 | 13-5 47 | 13-6 48 | 13-7 49 | 13-8 50 | 14-1 51 | 14-2 52 | 14-3 53 | 14-4 54 | 14-5 55 | 14-6 56 | 14-7 57 | 14-8 58 | 15-1 59 | 15-2 60 | 15-3 61 | 16-1 62 | 16-2 63 | 16-3 64 | 16-4 65 | 16-5 66 | 16-6 67 | 17-1 68 | 17-2 69 | 17-3 70 | 17-4 71 | 17-5 72 | 17-6 73 | 17-7 74 | 17-8 75 | 17-9 76 | 18-1 77 | 18-2 78 | 18-3 79 | 18-4 80 | 18-5 81 | 18-6 82 | 19-1 83 | 19-2 84 | 19-3 85 | 19-4 86 | 19-5 87 | 19-6 88 | 19-7 89 | 19-8 90 | 2-1 91 | 2-10 92 | 2-11 93 | 2-12 94 | 2-13 95 | 2-14 96 | 2-15 97 | 2-16 98 | 2-17 99 | 2-18 100 | 2-19 101 | 2-2 102 | 2-20 103 | 2-21 104 | 2-22 105 | 2-23 106 | 2-24 107 | 2-25 108 | 2-26 109 | 2-27 110 | 2-28 111 | 2-29 112 | 2-3 113 | 2-4 114 | 2-5 115 | 2-6 116 | 2-7 117 | 2-8 118 | 2-9 119 | 20-1 120 | 20-2 121 | 20-3 122 | 20-4 123 | 20-5 124 | 21-1 125 | 21-2 126 | 21-3 127 | 21-4 128 | 21-5 129 | 21-6 130 | 22-1 131 | 22-2 132 | 22-3 133 | 23-1 134 | 23-10 135 | 23-11 136 | 23-2 137 | 23-3 138 | 23-4 139 | 23-5 140 | 23-6 141 | 23-7 142 | 23-8 143 | 23-9 144 | 24-1 145 | 24-2 146 | 24-3 
147 | 24-4 148 | 24-5 149 | 25-1 150 | 25-2 151 | 25-3 152 | 25-4 153 | 25-5 154 | 26-1 155 | 26-2 156 | 26-3 157 | 26-4 158 | 26-5 159 | 27-1 160 | 27-2 161 | 28-1 162 | 28-2 163 | 28-3 164 | 28-4 165 | 28-5 166 | 28-6 167 | 28-7 168 | 29-1 169 | 29-2 170 | 3-1 171 | 3-2 172 | 3-3 173 | 3-4 174 | 3-5 175 | 30-1 176 | 30-2 177 | 30-3 178 | 30-4 179 | 30-5 180 | 30-6 181 | 30-7 182 | 30-8 183 | 31-1 184 | 31-2 185 | 32-1 186 | 32-2 187 | 33-1 188 | 33-2 189 | 33-3 190 | 34-1 191 | 34-2 192 | 35-1 193 | 35-2 194 | 35-3 195 | 35-4 196 | 36-1 197 | 36-2 198 | 37-1 199 | 37-2 200 | 37-3 201 | 37-4 202 | 4-1 203 | 4-2 204 | 4-3 205 | 4-4 206 | 5-1 207 | 5-2 208 | 5-3 209 | 5-4 210 | 6-1 211 | 6-10 212 | 6-2 213 | 6-3 214 | 6-4 215 | 6-5 216 | 6-6 217 | 6-7 218 | 6-8 219 | 6-9 220 | 7-1 221 | 7-10 222 | 7-11 223 | 7-12 224 | 7-13 225 | 7-2 226 | 7-3 227 | 7-4 228 | 7-5 229 | 7-6 230 | 7-7 231 | 7-8 232 | 7-9 233 | 8-1 234 | 8-10 235 | 8-2 236 | 8-3 237 | 8-4 238 | 8-5 239 | 8-6 240 | 8-7 241 | 8-8 242 | 8-9 243 | 9-1 244 | 9-2 245 | 9-3 246 | 9-4 247 | 9-5 248 | 9-6 249 | 
-------------------------------------------------------------------------------- /01.clustering/README.org: --------------------------------------------------------------------------------
1 | * CEMBA snATAC-seq clustering using SnapATAC2
2 | ** Installation of SnapATAC2
3 | #+BEGIN_SRC shell
4 | # Install a mamba environment named sa2 with python=3.10 or newer.
5 | # Then
6 | mamba activate sa2 \
7 | && mamba install -c conda-forge -c bioconda snakemake \
8 | && mamba install -c anaconda cmake
9 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh \
10 | && rustup default nightly
11 | git clone https://github.com/kaizhang/SnapATAC2.git ~/softwares/SnapATAC2 \
12 | && cd ~/softwares/SnapATAC2/snapatac2-python && pip install .
13 | # current numba depends on numpy=1.24
14 | pip install numpy==1.24 && pip install ipython pyprojroot matplotlib
15 | #+END_SRC
16 | 
17 | ** NOTE
18 | *** TODO save L1-level QC, doublet-removal, and clustering results
19 | *** Clustering is performed under ps-renlab.
20 | *** How to run the Scala file
21 | 1. enter post_script
22 | 2. run: ~sbt~
23 | 3. then run: ~run~, which runs Hello in sa2_dlt2_prepareL2.scala
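** Clustering workflow sketch
The per-round clustering implemented by the scripts under script/
(sa2.embed.py, sa2.knn.py, sa2.leiden.py, sa2.clustering.umap.py)
follows the standard SnapATAC2 flow: feature selection, spectral
embedding, KNN graph, Leiden clustering over a resolution sweep, and
UMAP. The sketch below is illustrative only; the parameter values
mirror the config.yaml that follows, the input file name is a
placeholder, and exact SnapATAC2 signatures may differ across versions.
#+BEGIN_SRC python
import snapatac2 as sa2

# Placeholder input; the pipeline reads real paths from config.yaml.
adata = sa2.read("merge_cemba_all.h5ad")

# Embedding (config: embed_nfeat 500000, embed_ncomp 30).
sa2.pp.select_features(adata, n_features=500_000)
sa2.tl.spectral(adata, n_comps=30, distance_metric="cosine")

# KNN graph on the embedding (config: knn n 50, method exact).
sa2.pp.knn(adata, n_neighbors=50, use_rep="X_spectral", method="exact")

# Leiden over the resolution sweep (config: minr 0.1, maxr 2.0, byr 0.1).
for i in range(1, 21):
    r = i / 10
    sa2.tl.leiden(adata, resolution=r,
                  objective_function="modularity",
                  key_added=f"leiden_r{r}")

# UMAP for visualization (config: min_dist 0.01, a 1.8956, b 0.8006).
sa2.tl.umap(adata, use_rep="X_spectral", random_state=0)
adata.close()
#+END_SRC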
-------------------------------------------------------------------------------- /01.clustering/config.yaml: --------------------------------------------------------------------------------
1 | system: imac
2 | ## clustering on which level:
3 | ## e.g., L1 means we perform L2-level clustering on the L1 clusters
4 | clustering_level: L1
5 | retries: 1
6 | conda:
7 |   imac: sa2
8 |   encoder: sa2
9 |   tscc: sa2
10 |   tscc_test: sa2
11 |   test_unite: sa2
12 | 
13 | project_dir:
14 |   imac: /Users/szu/git-recipes/mouseBrainAtlas/CEMBA2
15 |   encoder: /projects/ps-renlab/szu/projects/CEMBA2
16 |   tscc: /projects/ps-renlab/szu/projects/CEMBA2
17 |   tscc_test: /projects/ps-renlab/szu/projects/CEMBA2
18 |   test_unite: /Users/szu/git-recipes/mouseBrainAtlas/CEMBA2
19 | 
20 | work_dir:
21 |   imac: /Users/szu/git-recipes/mouseBrainAtlas/CEMBA2/17.snapatac2
22 |   encoder: /projects/ps-renlab/szu/projects/CEMBA2/17.snapatac2
23 |   #tscc: /oasis/tscc/scratch/szu/projects/CEMBA2/17.snapatac2
24 |   # use condo since scratch has slow IO.
25 |   tscc: /projects/ps-renlab/szu/projects/CEMBA2/17.snapatac2
26 |   tscc_test: /oasis/tscc/scratch/szu/projects/CEMBA2/17.snapatac2
27 |   test_unite: /users/szu/test/17.snapatac2
28 | out_dir: result
29 | blacklist_file: 17.snapatac2/meta/mm10.blacklist.bed
30 | barcode2id_file: 17.snapatac2/resource/barcode2id.csv
31 | sample2fragment_file: 17.snapatac2/resource/sample2fragment.csv
32 | pre_clustering_meta: 17.snapatac2/resource/cluster2size.csv
33 | cemba_anndata_file: 17.snapatac2/resource/merge_cemba_all.h5ad
34 | max_united_size: 200000
35 | embed_nfeat: 500000
36 | embed_nsample: 3000000
37 | embed_ncomp: 30
38 | embed_name: nfeat-default_nsample-all_nc30
39 | embed:
40 |   nfeat: 500000
41 |   ncomp: 30
42 |   embed_nsample: 3000000
43 |   name: allfeat_nc30
44 | knn:
45 |   n: 50
46 |   method: exact
47 |   name: knn50
48 | leiden:
49 |   n_iter: -1
50 |   weight: True
51 |   obj: modularity
52 |   min_size: 50
53 |   repeat: 1
54 |   minr: 0.1
55 |   maxr: 2.0
56 |   byr: 0.1
57 |   seed: 0
58 |   name: igraph_leiden_modularity
59 |   n_sample: 20000
60 | umap:
61 |   n_neigh: 15
62 |   n_comp: 2
63 |   metric: euclidean
64 |   init: spectral
65 |   min_dist: 0.01
66 |   a: 1.8956
67 |   b: 0.8006
68 |   seed: 0
69 | 
70 | 
71 | 
72 | 
-------------------------------------------------------------------------------- /01.clustering/meta/mba.test.sample: --------------------------------------------------------------------------------
1 | CEMBA171206_3C
2 | CEMBA171207_3C
-------------------------------------------------------------------------------- /01.clustering/post_script/build.sbt: --------------------------------------------------------------------------------
1 | ThisBuild / scalaVersion := "3.2.2"
-------------------------------------------------------------------------------- /01.clustering/post_script/sa2.L1.clustering.barcode2id.py: --------------------------------------------------------------------------------
1 | import pickle
2 | import os
3 | import sys
4 | import numpy as np
5 | import pandas as pd
6 | 
7 | with open(f"../result/clustering_sum_L1/sa2_clustering_L0_0.pkl", 'rb') as f:
8 |     L1_sum = pickle.load(f)
9 | 
10 | with open(f"../result/clustering_sum_L1/barcodes.txt", 'r') as f:
11 |     barcodes = [l.strip() for l in f.readlines()]
12 | 
13 | r = 0.4
14 | which_col = L1_sum['leiden_r'] == r
15 | leiden = L1_sum['leiden'][:, which_col]
16 | 
17 | with open(f"../result/clustering_sum_L1/sa2_L1_r0.4_barcodes2id.csv", 'w') as f:
18 |     f.writelines("barcode,L1\n")
19 |     f.writelines([f"{b},{i}\n" for b, i in zip(barcodes, leiden[:,0])])
20 | 
21 | # copy this file to 17.snapatac2/resource/sa2_dlt2_barcode2id.csv
22 | 
23 | # save umap
24 | umap = L1_sum['umap']
25 | barcode2umap = pd.DataFrame(data = {"barcode" : barcodes,
26 |                                     "UMAP1" : umap[:,0],
27 |                                     "UMAP2" : umap[:,1]})
28 | barcode2umap.to_csv("../result/clustering_sum_L1/L1_UMAP.csv",
29 |                     header = True, index = False)
-------------------------------------------------------------------------------- /01.clustering/post_script/sa2.bmat.dlt.compare.py: --------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from typing import List
4 | from pathlib import Path
5 | import itertools
6 | 
7 | import numpy as np
8 | import matplotlib.pyplot as plt
9 | import snapatac2 as sa2
10 | 
11 | with open("../meta/mba.whole.sample.lst", 'r') as f:
12 |     samples = [ l.strip() for l in f.readlines()]
13 | 
14 | # * summarize doublet removal under bmat
15 | bmat_dlt_dir = "../result/barcode2dltprob"
16 | dlt_prob_threshold = 0.5
17
| sample2dlt = {} 18 | for s in samples: 19 | with open(f"{bmat_dlt_dir}/{s}.txt", 'r') as f: 20 | lines = [l.strip() for l in f.readlines()] 21 | barcode2dltprob = [ 22 | (l.split(',')[0], float(l.split(',')[1])) for l in lines ] 23 | sample2dlt[s] = barcode2dltprob 24 | barcodes_all_bmat = list(itertools.chain.from_iterable( 25 | [sample2dlt[s] for s in samples] )) 26 | 27 | sample2barcodes = {} 28 | for s in samples: 29 | sample2barcodes[s] = [ 30 | v[0] for v in list(filter(lambda x: x[1] <= dlt_prob_threshold, 31 | sample2dlt[s]))] 32 | # 2355842 33 | nbarcodes_after_dlt = sum( 34 | [len(sample2barcodes[s]) for s in sample2barcodes.keys()]) 35 | 36 | # * load results from doublet removal under gmat 37 | gmat_dlt_dir = "../../00.data.preprocess/snapatac2_pp_out/pp_stat" 38 | sample2barcodes_gmat = {} 39 | for s in samples: 40 | with open(f"{gmat_dlt_dir}/{s}.qc.dlt.barcodes", 'r') as f: 41 | sample2barcodes_gmat[s] = [l.strip() for l in f.readlines()] 42 | 43 | # 2361710 44 | nbarcodes_gmat = sum( 45 | [len(sample2barcodes_gmat[s]) for s in sample2barcodes_gmat.keys()]) 46 | 47 | # ** joint between barcodes under bmat and gmat 48 | barcodes_bmat = list(itertools.chain.from_iterable( 49 | [sample2barcodes[s] for s in sample2barcodes.keys()])) 50 | barcodes_gmat = list(itertools.chain.from_iterable( 51 | [sample2barcodes_gmat[s] for s in sample2barcodes_gmat.keys()] 52 | )) 53 | 54 | # 2326359 55 | barcodes_both = list( 56 | set(barcodes_bmat).intersection(set(barcodes_gmat))) 57 | 58 | len(barcodes_both) / len(barcodes_bmat) # 98.75% 59 | 60 | # * compare with SnapATAC 61 | with open("../../supple.02.QC/sa1.qc.dlt.barcodes", 'r') as f: 62 | barcodes_sa1 = [l.strip() for l in f.readlines()] 63 | # 2204291 64 | barcodes_sa1_bmat = list( 65 | set(barcodes_sa1).intersection(set(barcodes_bmat)) 66 | ) 67 | 68 | # * draw dlt rate 69 | dl2r = {} 70 | for s in samples: 71 | dl2r[s] = 1 - len(sample2barcodes[s]) / len(sample2dlt[s]) 72 | 73 | with open("../../supple.02.QC/sample2biorep.csv", 'r') as f: 74 | lines = [l.strip() for l in f.readlines()] 75 | s2rep = {l.split(',')[0]: l.split(',')[1] for l in lines} 76 | 77 | dlt_early = [dl2r[s] for s in samples if s2rep[s] == 'early'] 78 | dlt_later = [dl2r[s] for s in samples if s2rep[s] == 'later'] 79 | fig = plt.figure() 80 | plt.boxplot([dlt_early, dlt_later]) 81 | plt.show() 82 | 83 | s_early = [s for s in samples if s2rep[s] == 'early'] 84 | s_later = [s for s in samples if s2rep[s] == 'later'] 85 | -------------------------------------------------------------------------------- /01.clustering/post_script/sa2_dlt2_L3_sum.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondpie/CEMBA_wmb_snATAC/4605d86a83fe45a38e656ba87df85fe99762f358/01.clustering/post_script/sa2_dlt2_L3_sum.xlsx -------------------------------------------------------------------------------- /01.clustering/post_script/sa2_dlt2_prepareL2.scala: -------------------------------------------------------------------------------- 1 | import scala.io.Source 2 | import scala.collection.immutable._ 3 | import java.io._ 4 | 5 | case class Cell(barcode: String, clusterId: Int) 6 | case class ClusterSum(head: Tuple2[String, String], cells: List[Cell]) 7 | 8 | def loadClusterSumFromFile(file: String, sep:String = ","): ClusterSum = { 9 | val lines= Source.fromFile(file).getLines.toList 10 | val firstline= lines.head.trim.split(sep) 11 | val head: Tuple2[String, String] = (firstline(0), firstline(1)) 12 | val cells = 
lines.tail.map(x => x.trim.split(sep)).map(x => Cell(x(0), x(1).toInt)) 13 | ClusterSum(head, cells) 14 | } 15 | 16 | def getClusterSize(x: ClusterSum): Map[Int, Int] = { 17 | x.cells.groupBy(_.clusterId).map(t => (t._1, t._2.length)) 18 | } 19 | 20 | def writeMap2csv(fnm: String, lines: Map[Int,Int]): Unit = { 21 | val file = new File(fnm) 22 | val bw = new BufferedWriter(new FileWriter(file)) 23 | lines.foreach(t => bw.write(s"${t._1},${t._2}\n")) 24 | bw.close() 25 | } 26 | 27 | @main def Hello(params: String*):Unit = { 28 | val clusterSum = loadClusterSumFromFile( 29 | file = "../result/clustering_sum_L1/sa2_L1_r0.4_barcodes2id.csv", sep = ",") 30 | val cluster2size = getClusterSize(clusterSum) 31 | writeMap2csv(fnm = "../resource/sa2_dlt2_L1_cluster2size.csv", lines = cluster2size) 32 | } 33 | -------------------------------------------------------------------------------- /01.clustering/resource/sa2_L0_cluster2size.csv: -------------------------------------------------------------------------------- 1 | 0,2355842 -------------------------------------------------------------------------------- /01.clustering/resource/sa2_L1_cluster2size.csv: -------------------------------------------------------------------------------- 1 | 0,169925 2 | 1,156136 3 | 2,151149 4 | 3,142675 5 | 4,132251 6 | 5,112122 7 | 6,107750 8 | 7,100876 9 | 8,95525 10 | 9,92969 11 | 10,82726 12 | 11,77547 13 | 12,77531 14 | 13,75507 15 | 14,66942 16 | 15,62223 17 | 16,55357 18 | 17,41114 19 | 18,41111 20 | 19,38504 21 | 20,36489 22 | 21,36276 23 | 22,34808 24 | 23,28705 25 | 24,24525 26 | 25,23299 27 | 26,22955 28 | 27,21946 29 | 28,21565 30 | 29,18470 31 | 30,17268 32 | 31,14926 33 | 32,14642 34 | 33,13659 35 | 34,13635 36 | 35,12601 37 | 36,12564 38 | 37,11548 39 | 38,11360 40 | 39,11208 41 | 40,8974 42 | 41,6967 43 | 42,6694 44 | 43,6329 45 | 44,6000 46 | 45,5501 47 | 46,5057 48 | 47,4894 49 | 48,4238 50 | 49,4072 51 | 50,3731 52 | 51,3713 53 | 52,3250 54 | 53,2854 55 | 54,2763 56 | 55,2459 57 | 56,1423 58 | 57,402 59 | -------------------------------------------------------------------------------- /01.clustering/resource/sa2_dlt2_L1_cluster2size.csv: -------------------------------------------------------------------------------- 1 | 0,400100 2 | 5,113203 3 | 10,93057 4 | 14,61697 5 | 1,253568 6 | 6,105544 7 | 9,94120 8 | 13,68477 9 | 2,174357 10 | 12,77165 11 | 7,101102 12 | 18,34666 13 | 11,88462 14 | 8,96925 15 | 4,120440 16 | 15,41035 17 | 24,14929 18 | 25,13648 19 | 20,24991 20 | 29,9342 21 | 28,10726 22 | 21,23196 23 | 33,5053 24 | 17,36437 25 | 32,5993 26 | 34,3650 27 | 22,22182 28 | 27,11293 29 | 3,135066 30 | 35,3244 31 | 16,37191 32 | 31,6418 33 | 26,12019 34 | 23,17396 35 | 36,3192 36 | 30,6907 37 | 19,29051 38 | -------------------------------------------------------------------------------- /01.clustering/resource/sa2_dlt2_L1_cluster2size_test.csv: -------------------------------------------------------------------------------- 1 | 35,3244 2 | 36,3192 3 | -------------------------------------------------------------------------------- /01.clustering/sa2.gmat.Snakefile: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict 3 | system: str = "encoder" 4 | project_dict: Dict[str, str] = { 5 | "imac": "/Users/szu/git-recipes/mouseBrainAtlas", 6 | "encoder": "/projects/ps-renlab/szu/projects/CEMBA2" 7 | } 8 | genome = "sa2default" 9 | 10 | project_dir = project_dict[system] 11 | rm_dlt_dir = 
f"{project_dir}/17.snapatac2/sa2_qc_dlt/rm_dlt" 12 | with open(f"{project_dir}/17.snapatac2/meta/mba.whole.sample.lst", 'r') as f: 13 | samples = [l.strip() for l in f.readlines()] 14 | # test only 15 | # samples = ["CEMBA171206_3C", "CEMBA171207_3C"] 16 | # samples = ["CEMBA171206_3C"] 17 | 18 | out_dir = f"{project_dir}/17.snapatac2/sa2_{genome}_gmat" 19 | log_dir = f"{out_dir}/log" 20 | flag_dir = f"{out_dir}/flag" 21 | # sample-level gmat 22 | sgmat_dir = f"{out_dir}/sgmat" 23 | for d in [out_dir, log_dir, flag_dir, sgmat_dir]: 24 | os.makedirs(d, exist_ok = True) 25 | 26 | 27 | def get_sample(wildcards): 28 | return wildcards.s 29 | 30 | rule all: 31 | input: 32 | expand("{f}/{s}_{g}_gmat.done", 33 | f = flag_dir, g = genome, s = samples), 34 | f"{flag_dir}/{genome}_gmat_merged.done" 35 | 36 | rule sgmat: 37 | input: 38 | snap_file = expand("{i}/{{s}}_rm_dlt.h5ad", i = rm_dlt_dir) 39 | output: 40 | gmat_file = expand("{o}/{{s}}_{g}_gmat.h5ad", 41 | o = sgmat_dir, g = genome), 42 | tag = touch(expand("{f}/{{s}}_{g}_gmat.done", 43 | f = flag_dir, g = genome)) 44 | log: 45 | expand("{l}/{{s}}_{g}_gmat.log", l = log_dir, g = genome) 46 | params: 47 | sample = get_sample, 48 | genome = genome 49 | threads: 1 50 | script: 51 | f"{project_dir}/17.snapatac2/script/sa2.get.sample.gmat.py" 52 | 53 | rule merge_sgmat: 54 | input: 55 | snap_files = expand("{o}/{s}_{g}_gmat.h5ad", 56 | o = sgmat_dir, s = samples, g = genome) 57 | output: 58 | merge_snap = f"{out_dir}/{genome}_gmat_merged.h5ad", 59 | tag = touch(f"{flag_dir}/{genome}_gmat_merged.done") 60 | params: 61 | genome = genome 62 | log: 63 | f"{log_dir}/{genome}_gmat_merged.log" 64 | threads: 4 65 | script: 66 | f"{project_dir}/17.snapatac2/script/sa2.merge.gmat.py" 67 | 68 | 69 | -------------------------------------------------------------------------------- /01.clustering/script/sa2.clustering.umap.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import math 4 | import pickle 5 | from typing import Dict, List 6 | from dataclasses import dataclass, field 7 | import numpy as np 8 | import matplotlib 9 | from matplotlib.figure import Figure 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | import pyprojroot 14 | code_root_dir = str(pyprojroot.here()) 15 | pack_dir = f"{code_root_dir}/package/python" 16 | sys.path.insert(0, pack_dir) 17 | from leiden import LeidenSum, ScatterPlot 18 | from leiden import draw_umap 19 | from leiden import init_LeidenSum_from_file 20 | from colors import SnapATACPalette 21 | 22 | 23 | if __name__ == '__main__': 24 | from_dir: str = sys.argv[1] 25 | cll: str = sys.argv[2] 26 | outdir: str = sys.argv[3] 27 | nsample: int = int(sys.argv[4]) 28 | # this will be used in parallel 29 | cid: str = sys.argv[5] 30 | ls: LeidenSum = init_LeidenSum_from_file( 31 | from_dir = from_dir, 32 | cll = cll, 33 | cid = cid) 34 | print(f"draw UMAP for {cll}: {cid}") 35 | scf = ScatterPlot(nsample = nsample) 36 | ## FIXME: put colors into scf 37 | draw_umap(t = ls, scf = scf, outdir = outdir, colors = list(set(SnapATACPalette))) 38 | 39 | -------------------------------------------------------------------------------- /01.clustering/script/sa2.get.sample.gmat.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import traceback 3 | from pathlib import Path 4 | 5 | import snapatac2 as sa2 6 | 7 | import pyprojroot 8 | code_root_dir = str(pyprojroot.here()) 9 | pack_dir = f"{code_root_dir}/package/python" 10 | 
sys.path.insert(0, pack_dir)
11 | import utils #pyright: ignore # noqa: F401, E402
12 | 
13 | # * log
14 | logger = utils.set_file_logger( #pyright: ignore
15 |     fnm = snakemake.log[0], #pyright: ignore # noqa: F821
16 |     name = "sa2.get.sample.gmat"
17 | )
18 | def handle_exception(exc_type, exc_value, exc_traceback):
19 |     if issubclass(exc_type, KeyboardInterrupt):
20 |         sys.__excepthook__(exc_type, exc_value, exc_traceback)
21 |         return
22 |     logger.error(''.join(["Uncaught exception: ",
23 |                           *traceback.format_exception(
24 |                               exc_type, exc_value, exc_traceback)
25 |                           ]))
26 | # Install exception handler
27 | sys.excepthook = handle_exception
28 | 
29 | snap_file = snakemake.input["snap_file"][0]
30 | genome = snakemake.params['genome']
31 | out_file = snakemake.output["gmat_file"][0]
32 | 
33 | logger.info(f"Load snap file {snap_file}")
34 | snap = sa2.read(snap_file, backed = 'r')
35 | # NOTE:
36 | # currently, only the sa2 default genome is supported (i.e., mm10 for mouse)
37 | logger.info(f"genome: {genome}.")
38 | logger.info(f"write gmat to file: {out_file}.")
39 | sa2.pp.make_gene_matrix(
40 |     adata = snap,
41 |     gene_anno = sa2.genome.mm10,
42 |     file = Path(out_file),
43 |     use_x = False,
44 |     id_type = "gene"
45 | )
46 | 
47 | logger.info("Done.")
-------------------------------------------------------------------------------- /01.clustering/script/sa2.knn.py: --------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 | import sys
4 | import traceback
5 | from typing import Dict
6 | 
7 | import snapatac2 as sa2
8 | import pyprojroot
9 | 
10 | code_root_dir = str(pyprojroot.here())
11 | pack_dir = f"{code_root_dir}/package/python"
12 | sys.path.insert(0, pack_dir)
13 | import utils #pyright: ignore # noqa: E402
14 | 
15 | # * log
16 | logger = utils.set_file_logger( #pyright: ignore # noqa
17 |     fnm = snakemake.log[0], #pyright:ignore # noqa: F821
18 |     name = "sa2.knn"
19 | )
20 | def handle_exception(exc_type, exc_value, exc_traceback):
21 |     if issubclass(exc_type, KeyboardInterrupt):
22 |         sys.__excepthook__(exc_type, exc_value, exc_traceback)
23 |         return
24 | 
25 |     logger.error(''.join(["Uncaught exception: ",
26 |                           *traceback.format_exception(exc_type, exc_value, exc_traceback)
27 |                           ])
28 |                  )
29 | # Install exception handler
30 | sys.excepthook = handle_exception
31 | 
32 | # * meta
33 | snap_file = snakemake.input["snap_file"][0] #pyright: ignore # noqa
34 | knn_params: Dict = snakemake.params["knn"] #pyright: ignore # noqa
35 | knn_nm: str = knn_params["name"]
36 | logger.info(f"Use {knn_nm} for knn.")
37 | logger.info(f"Load snapatac2 anndataset: {snap_file}.")
38 | sds = sa2.read(Path(snap_file))
39 | sa2.pp.knn(
40 |     adata = sds,
41 |     n_neighbors = knn_params["n"],
42 |     use_dims = None,
43 |     use_rep = "X_spectral",
44 |     method = knn_params["method"],
45 |     inplace = True,
46 |     random_state = 0
47 | )
48 | sds.close()
49 | logger.info("Run KNN done.")
50 | 
51 | 
52 | 
53 | 
-------------------------------------------------------------------------------- /01.clustering/script/sa2.merge.gmat.py: --------------------------------------------------------------------------------
1 | ## NOTE: this file is quite similar to sa2.merge.rmdlt.py
2 | ## only difference: how the sample names are derived
3 | ## TODO: merge sa2.merge.gmat and sa2.merge.rmdlt.py
4 | 
5 | import os
6 | import sys
7 | import traceback
8 | from pathlib import Path
9 | from typing import Dict
10 | 
11 | import numpy as np
12 | import snapatac2 as sa2
13 | 
14 | import pyprojroot
15 | 
code_root_dir = str(pyprojroot.here())
16 | pack_dir = f"{code_root_dir}/package/python"
17 | sys.path.insert(0, pack_dir)
18 | import utils #pyright: ignore # noqa: F401, E402
19 | 
20 | # * log
21 | logger = utils.set_file_logger( #pyright: ignore
22 |     fnm = snakemake.log[0], #pyright: ignore # noqa: F821
23 |     name = "sa2.merge.gmat"
24 | )
25 | def handle_exception(exc_type, exc_value, exc_traceback):
26 |     if issubclass(exc_type, KeyboardInterrupt):
27 |         sys.__excepthook__(exc_type, exc_value, exc_traceback)
28 |         return
29 |     logger.error(''.join(["Uncaught exception: ",
30 |                           *traceback.format_exception(
31 |                               exc_type, exc_value, exc_traceback)
32 |                           ]))
33 | # Install exception handler
34 | sys.excepthook = handle_exception
35 | 
36 | snap_files = snakemake.input["snap_files"]
37 | out_snap = snakemake.output["merge_snap"]
38 | tmp_snap = os.path.join(os.path.dirname(out_snap), "tmp.merge.snap.h5ad")
39 | genome = snakemake.params['genome']
40 | 
41 | logger.info(f"In total, {len(snap_files)} are detected.")
42 | 
43 | fnms = [os.path.basename(v) for v in snap_files]
44 | samples = [a.replace(f"_{genome}_gmat.h5ad", "") for a in fnms]
45 | 
46 | sample2files = [(s, f) for s, f in zip(samples, snap_files)]
47 | logger.info(f"Create AnnDataSet to tmp file: {tmp_snap}")
48 | sds = sa2.AnnDataSet(
49 |     adatas = sample2files,
50 |     filename = tmp_snap,
51 |     add_key = 'sample'
52 | )
53 | 
54 | logger.info(f"AnnDataSet to AnnData: {out_snap}")
55 | 
56 | snap = sds.to_adata(file = out_snap, copy_x = True)
57 | new_obs_names = utils.modify_obs_name(snap, obs_key = "sample")
58 | snap.obs_names = new_obs_names
59 | 
60 | snap.close()
61 | sds.close()
62 | logger.info(f"Delete tmp file: {tmp_snap}.")
63 | os.remove(tmp_snap)
64 | logger.info("Done")
-------------------------------------------------------------------------------- /01.clustering/script/sa2.merge.rmdlt.py: --------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import traceback
4 | from pathlib import Path
5 | from typing import Dict
6 | 
7 | import numpy as np
8 | import snapatac2 as sa2
9 | 
10 | import pyprojroot
11 | code_root_dir = str(pyprojroot.here())
12 | pack_dir = f"{code_root_dir}/package/python"
13 | sys.path.insert(0, pack_dir)
14 | import utils #pyright: ignore # noqa: F401, E402
15 | 
16 | # * log
17 | logger = utils.set_file_logger( #pyright: ignore
18 |     fnm = snakemake.log[0], #pyright: ignore # noqa: F821
19 |     name = "sa2.merge.rmdlt"
20 | )
21 | # logger = utils.set_file_logger( #pyright: ignore
22 | #     fnm = "test_qc_dlt.log", #pyright: ignore # noqa: F821
23 | #     name = "sa2.merge.rmdlt"
24 | # )
25 | def handle_exception(exc_type, exc_value, exc_traceback):
26 |     if issubclass(exc_type, KeyboardInterrupt):
27 |         sys.__excepthook__(exc_type, exc_value, exc_traceback)
28 |         return
29 |     logger.error(''.join(["Uncaught exception: ",
30 |                           *traceback.format_exception(
31 |                               exc_type, exc_value, exc_traceback)
32 |                           ]))
33 | # Install exception handler
34 | sys.excepthook = handle_exception
35 | 
36 | snap_files = snakemake.input["snap_files"]
37 | out_snap = snakemake.output["merge_snap"]
38 | tmp_snap = os.path.join(os.path.dirname(out_snap), "tmp.merge.snap.h5ad")
39 | 
40 | logger.info(f"In total, {len(snap_files)} are provided as input.")
41 | 
42 | fnms = [os.path.basename(v) for v in snap_files]
43 | samples = [a.replace("_rm_dlt.h5ad", "") for a in fnms]
44 | 
45 | sample2files = [(s, f) for s, f in zip(samples, snap_files)]
46 | logger.info(f"Create AnnDataSet to tmp file: {tmp_snap}")
47
| sds = sa2.AnnDataSet( 48 | adatas = sample2files, 49 | filename = tmp_snap, 50 | add_key = 'sample' 51 | ) 52 | 53 | logger.info(f"AnnDataSet to AnnData: {out_snap}") 54 | 55 | snap = sds.to_adata(file = out_snap, copy_x = True) 56 | new_obs_names = utils.modify_obs_name(snap, obs_key = "sample") 57 | snap.obs_names = new_obs_names 58 | 59 | snap.close() 60 | sds.close() 61 | logger.info(f"Delete tmp file: {tmp_snap}.") 62 | os.remove(tmp_snap) 63 | logger.info("Done") 64 | 65 | -------------------------------------------------------------------------------- /01.clustering/script/sa2.pre.anndataset.py: -------------------------------------------------------------------------------- 1 | # Deprecated 2 | # Now we use sa2.merge.rmdlt.py to get a complete bmat snap file. 3 | # then do subset on it directly. 4 | import os 5 | import sys 6 | import traceback 7 | from pathlib import Path 8 | import logging 9 | from typing import Dict, List 10 | import shutil 11 | 12 | 13 | from numba.core.errors import NumbaDeprecationWarning 14 | from numba.core.errors import NumbaPendingDeprecationWarning 15 | import warnings 16 | warnings.simplefilter('ignore', category=NumbaDeprecationWarning) 17 | warnings.simplefilter('ignore', category=NumbaPendingDeprecationWarning) 18 | 19 | import numpy as np 20 | import snapatac2 as sa2 21 | import pyprojroot 22 | code_root_dir = str(pyprojroot.here()) 23 | pack_dir = f"{code_root_dir}/package/python" 24 | sys.path.insert(0, pack_dir) 25 | import utils # noqa: E402 26 | from leiden import cemba #pyright: ignore # noqa: E401, E402, F401 27 | from leiden import cal_silhouette #pyright: ignore # noqa: E402, F401 28 | from leiden import umap #pyright: ignore # noqa: E402 29 | 30 | logger = utils.set_file_logger( #pyright: ignore 31 | fnm = snakemake.log[0], #pyright: ignore # noqa: F821 32 | name = "cemba.all.anndataset" 33 | ) 34 | def handle_exception(exc_type, exc_value, exc_traceback): 35 | if issubclass(exc_type, KeyboardInterrupt): 36 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 37 | return 38 | 39 | logger.error(''.join(["Uncaught exception: ", 40 | *traceback.format_exception(exc_type, exc_value, exc_traceback) 41 | ]) 42 | ) 43 | # Install exception handler 44 | sys.excepthook = handle_exception 45 | 46 | logger.info(f"Get CEMBA all anndata files into AnnDataSet.") 47 | 48 | 49 | sample2fragment_file = snakemake.input[0] 50 | cemba_all_file = snakemake.output[0] 51 | 52 | with open(sample2fragment_file, 'r') as f: 53 | files = [l.strip() for l in f.readlines()] 54 | samples = [os.path.basename(a).split(".")[0] for a in files] 55 | sample2files = [(s, f) for s, f in zip(samples, files)] 56 | logger.info(f"Load {len(samples)} samples into AnnDataSet.") 57 | sds = sa2.AnnDataSet( 58 | adatas = sample2files, 59 | filename = cemba_all_file 60 | ) 61 | logger.info("Update obs_names: [sample].[barcode] .") 62 | 63 | obs_names: List[str] = [f"{i}.{j}" 64 | for i, j in zip(sds.obs['sample'].to_list(), sds.obs_names)] 65 | sds.obs_names = obs_names 66 | 67 | sds.close() 68 | logger.info(f"CEMBA.all.AnnDataset is saved at: {cemba_all_file}.") 69 | 70 | -------------------------------------------------------------------------------- /01.clustering/script/sa2.rm.dlt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | from pathlib import Path 5 | from typing import Dict 6 | 7 | import numpy as np 8 | import snapatac2 as sa2 9 | 10 | import pyprojroot 11 | code_root_dir = 
str(pyprojroot.here())
12 | pack_dir = f"{code_root_dir}/package/python"
13 | sys.path.insert(0, pack_dir)
14 | import utils #pyright: ignore # noqa: F401, E402
15 | 
16 | # * log
17 | logger = utils.set_file_logger( #pyright: ignore
18 |     fnm = snakemake.log[0], #pyright: ignore # noqa: F821
19 |     name = "sa2.rm.dlt"
20 | )
21 | # logger = utils.set_file_logger( #pyright: ignore
22 | #     fnm = "test_qc_dlt.log", #pyright: ignore # noqa: F821
23 | #     name = "sa2.rm.dlt"
24 | # )
25 | def handle_exception(exc_type, exc_value, exc_traceback):
26 |     if issubclass(exc_type, KeyboardInterrupt):
27 |         sys.__excepthook__(exc_type, exc_value, exc_traceback)
28 |         return
29 |     logger.error(''.join(["Uncaught exception: ",
30 |                           *traceback.format_exception(
31 |                               exc_type, exc_value, exc_traceback)
32 |                           ]))
33 | # Install exception handler
34 | sys.excepthook = handle_exception
35 | 
36 | snap_file = snakemake.input["qc_dlt_file"][0] #pyright: ignore # noqa: F821
37 | out_file = snakemake.output["snap_file"][0] #pyright: ignore # noqa: F821
38 | 
39 | logger.info(f"Load snap file {snap_file}")
40 | snap = sa2.read(snap_file, backed = 'r')
41 | 
42 | barcodes: np.ndarray = np.array(snap.obs_names)
43 | dlt_probs = snap.obs['doublet_probability'].to_numpy()
44 | slt_index = (dlt_probs <= 0.5).tolist()
45 | 
46 | r = snap.subset(obs_indices = slt_index, out = out_file)
47 | r.close()
48 | snap.close()
49 | 
50 | 
-------------------------------------------------------------------------------- /01.clustering/script/supple.sa2.add.barcode.to.unite.clustering.py: --------------------------------------------------------------------------------
1 | """
2 | Fix unite clustering from L1:
3 | - loss of barcode information under summary picklefile
4 | """
5 | import os
6 | import pickle
7 | import snapatac2 as sa2
8 | 
9 | 
10 | def update_pkl(cid, cll:str = "L1",
11 |                from_dir: str = "L2_dlt2_encoder",
12 |                prefix = "sa2_clustering",
13 |                snap_prefix: str = "nfeat-top_nc50") -> None:
14 |     if not os.path.exists(from_dir):
15 |         raise FileNotFoundError(f"{from_dir} does not exist.")
16 |     pkl_fnm = os.path.join(from_dir, f"{prefix}_{cll}_{cid}.pkl")
17 |     if not os.path.exists(pkl_fnm):
18 |         raise FileNotFoundError(f"{pkl_fnm} does not exist.")
19 |     out_dir = os.path.join(from_dir, "supple.add.barcode")
20 |     os.makedirs(out_dir, exist_ok=True)
21 |     pkl2_fnm = os.path.join(out_dir, f"{prefix}_{cll}_{cid}.pkl")
22 | 
23 |     snap_fnm = os.path.join(from_dir, f"{snap_prefix}_{cid}_unite.h5ad")
24 |     if not os.path.exists(snap_fnm):
25 |         raise FileNotFoundError(f"{snap_fnm} does not exist.")
26 |     with open(pkl_fnm, 'rb') as f:
27 |         pkl_sum = pickle.load(f)
28 |     snap = sa2.read(snap_fnm, 'r')
29 |     if "barcode" in pkl_sum.keys():
30 |         print(f"barcode is already in {pkl_fnm}.")
31 |     else:
32 |         pkl_sum['barcode'] = snap.obs_names
33 |     snap.close()
34 |     with open(pkl2_fnm, 'wb') as f:
35 |         pickle.dump(pkl_sum, f)
36 | 
37 | 
38 | cids_unites = list(range(8,37))
39 | list(map(update_pkl, cids_unites))
40 | 
-------------------------------------------------------------------------------- /01.clustering/script/supple.sa2.bmat.dlt.py: --------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from typing import List
4 | from pathlib import Path
5 | 
6 | import numpy as np
7 | import snapatac2 as sa2
8 | 
9 | # test
10 | # work_dir = "/Users/szu/git-recipes/mouseBrainAtlas/CEMBA2"
11 | # sample = "CEMBA190718_8F"
12 | # qc_dlt_dir = f"{work_dir}/17.snapatac2/result/qc_bmat-dlt/"
13 | # out_dir = 
f"{work_dir}/17.snapatac2/result/qc_bmat-dlt_barcodes" 14 | 15 | work_dir = "/oasis/tscc/scratch/szu/projects/CEMBA2/" 16 | qc_dlt_dir = f"{work_dir}/17.snapatac2/sa2_qc_dlt/qc_dlt" 17 | sample = sys.argv[1] 18 | out_dir = f"{work_dir}/17.snapatac2/sa2_qc_dlt/barcode2dltprob" 19 | 20 | os.makedirs(out_dir, exist_ok = True) 21 | dlt_prob_threshold = 0.5 22 | 23 | outf = f"{out_dir}/{sample}.txt" 24 | snap = sa2.read(Path(f"{qc_dlt_dir}/{sample}_qc_dlt.h5ad"), backed = 'r') 25 | barcodes: np.ndarray = np.array([f"{sample}.{k}" for k in snap.obs_names]) 26 | dlt_probs = snap.obs['doublet_probability'].to_numpy().tolist() 27 | # barcodes_filtered:List[str] = barcodes[dlt_probs <= dlt_prob_threshold].tolist() 28 | with open(outf, 'w') as f: 29 | f.writelines('\n'.join([f"{b},{s}" for b, s in zip(barcodes, dlt_probs)])) 30 | snap.close() 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /01.clustering/script/supple.sa2.get.embed.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import math 4 | import pickle 5 | from typing import Dict, List 6 | from dataclasses import dataclass, field 7 | import numpy as np 8 | 9 | import pyprojroot 10 | code_root_dir = str(pyprojroot.here()) 11 | pack_dir = f"{code_root_dir}/package/python" 12 | sys.path.insert(0, pack_dir) 13 | from leiden import LeidenSum, ScatterPlot 14 | from leiden import draw_umap 15 | from leiden import init_LeidenSum_from_file 16 | from colors import SnapATACPalette 17 | import snapatac2 as sa2 18 | 19 | sa2_dir = os.path.join( 20 | "/projects/ps-renlab/szu/projects/CEMBA2", 21 | "17.snapatac2" 22 | ) 23 | 24 | sa2L1_fnm = os.path.join( 25 | sa2_dir, "L1_encoder", 26 | "nfeat-all_nsample-all_nc50_0_mult.h5ad") 27 | 28 | snapL1 = sa2.read( 29 | filename = sa2L1_fnm, backed = 'r') 30 | 31 | embed_mat: np.ndarray = snapL1.obsm['X_spectral'] 32 | barcodes: List[str] = snapL1.obs_names 33 | out_dir = os.path.join( 34 | "/projects/ps-renlab2/szu/projects/CEMBA2", 35 | "17.snapatac2", "resource", "sa2L1sum" 36 | ) 37 | if not os.path.exists(out_dir): 38 | os.makedirs(out_dir, exist_ok = True) 39 | 40 | np.savetxt( 41 | os.path.join(out_dir, "sa2.L1.embed_mat.csv"), 42 | embed_mat, delimiter = ',') 43 | 44 | # save barcodes to txt 45 | with open(os.path.join( 46 | out_dir, "sa2.L1.barcodes.txt"), 'w') as f: 47 | for bc in barcodes: 48 | f.write(bc + '\n') 49 | 50 | # save umap 51 | umap: np.ndarray = snapL1.obsm["X_umap"] 52 | np.savetxt(os.path.join(out_dir, "sa2.L1.umap_ab_spectral.csv"), 53 | umap, delimiter = ',') 54 | 55 | # calculate umap with default parameter 56 | from numba.core.errors import NumbaDeprecationWarning 57 | from numba.core.errors import NumbaPendingDeprecationWarning 58 | import warnings 59 | warnings.simplefilter('ignore', category=NumbaDeprecationWarning) 60 | warnings.simplefilter('ignore', category=NumbaPendingDeprecationWarning) 61 | from leiden import umap # pyright: ignore # noqa: E402 62 | 63 | # it will automatically use <20 CPUs 64 | # and cost <30G RAM 65 | # in encoder 66 | umap_default : np.ndarray = umap( 67 | adata = snapL1, 68 | use_rep = "X_spectral", 69 | inplace = False, 70 | a = None, 71 | b = None, 72 | init='spectral' 73 | ) 74 | 75 | np.savetxt(os.path.join(out_dir, "sa2.L1.umap_default.csv"), 76 | umap_default, delimiter = ',') 77 | snapL1.close() 78 | -------------------------------------------------------------------------------- 
/01.clustering/script/supple.sa2.prepare.L1.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from typing import List 4 | from pathlib import Path 5 | import itertools 6 | 7 | import numpy as np 8 | import snapatac2 as sa2 9 | 10 | # get barcodes after qc and doublet removal 11 | with open("../meta/mba.whole.sample.lst", 'r') as f: 12 | samples = [ l.strip() for l in f.readlines()] 13 | 14 | # * summarize doublet removal under bmat 15 | bmat_dlt_dir = "../result/barcode2dltprob" 16 | dlt_prob_threshold = 0.5 17 | sample2dlt = {} 18 | for s in samples: 19 | with open(f"{bmat_dlt_dir}/{s}.txt", 'r') as f: 20 | lines = [l.strip() for l in f.readlines()] 21 | barcode2dltprob = [ 22 | (l.split(',')[0], float(l.split(',')[1])) for l in lines ] 23 | sample2dlt[s] = barcode2dltprob 24 | barcodes_all_bmat = list(itertools.chain.from_iterable( 25 | [sample2dlt[s] for s in samples] )) 26 | 27 | sample2barcodes = {} 28 | for s in samples: 29 | sample2barcodes[s] = [ 30 | v[0] for v in list(filter(lambda x: x[1] <= dlt_prob_threshold, 31 | sample2dlt[s]))] 32 | # 2355842 barcodes remain after doublet removal 33 | nbarcodes_after_dlt = sum( 34 | [len(sample2barcodes[s]) for s in sample2barcodes.keys()]) 35 | 36 | barcodes = [] 37 | for s in samples: 38 | barcodes.extend(sample2barcodes[s]) 39 | 40 | with open("../resource/barcode2id_L0.csv", 'w') as f: 41 | f.writelines("barcode,L0\n") 42 | f.writelines('\n'.join([f"{v},0" for v in barcodes])) 43 | 44 | -------------------------------------------------------------------------------- /01.clustering/script/supple.sa2.prepare.L2.in.memory.subset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | import pandas as pd 5 | import snapatac2 as sa2 6 | 7 | clevel = "L1" 8 | out_dir = f"sa2_dlt2_{clevel}_subsets" 9 | os.makedirs(out_dir, exist_ok = False) 10 | # load snap data into memory: 11 | # - it consumes about 200G of RAM, and loading takes about 10 minutes. 12 | # - tscc has slow IO, so we load in memory to make subsetting faster. 13 | # - loading the file this way locks it, so it cannot be opened again by another reader. 14 | # - not sure whether reading with backed='r' and then calling to_memory() would be 15 | # better; per the docs, that creates a new in-memory object. 16 | # FIXME: after loading in memory like this, the object has no subset attribute (see the backed-mode sketch below). 
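# A hedged sketch (not run here) of that backed-mode alternative: reading with
# backed='r' keeps memory low and the returned object still exposes .subset()
# (the same call sa2.rm.dlt.py uses), at the cost of slower, disk-bound subsetting;
# the output file name below is illustrative.
# snap = sa2.read("resource/merge_cemba_all.h5ad", backed='r')
# b2id = pd.read_csv("resource/sa2_dlt2_barcode2id.csv")
# keep = set(b2id.loc[b2id[clevel] == 0, "barcode"])
# idx = [b in keep for b in snap.obs_names]
# sub = snap.subset(obs_indices=idx, out=f"{out_dir}/sa2_dlt2_{clevel}_0_backed.h5ad")
# sub.close()
# snap.close()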
17 | 18 | 19 | sds_all = sa2.read("resource/merge_cemba_all.h5ad", None) 20 | 21 | barcode2id: pd.DataFrame = pd.read_csv("resource/sa2_dlt2_barcode2id.csv") 22 | 23 | cid = 0 24 | snap_file = f"{out_dir}/sa2_dlt2_{clevel}_{cid}.h5ad" 25 | sub_barcodes = barcode2id[barcode2id[clevel] == int(cid)]['barcode'] 26 | all_barcodes = sds_all.obs_names 27 | a = set(sub_barcodes) 28 | is_in_sub = np.array([b in a for b in all_barcodes]) 29 | print(f"Found {is_in_sub.sum()} barcodes in cemba.") 30 | sds = sds_all.subset( 31 | obs_indices = is_in_sub, 32 | out = snap_file) 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /01.clustering/script/supple.sa2.prepare.L2.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | barcode2id_file = "../resource/barcode2id.csv" 4 | barcode2id = pd.read_csv(barcode2id_file, header = 0) 5 | cluster2size = barcode2id.L1.value_counts() 6 | with open("../resource/sa2_L1_cluster2size.csv", 'w') as f: 7 | f.writelines([f"{c},{s}\n" for c, s in zip( 8 | cluster2size.index.to_list(), cluster2size)]) 9 | 10 | 11 | -------------------------------------------------------------------------------- /01.clustering/script/supple.sa2.prepare.L4.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(stringr) 3 | library(dplyr) 4 | packdir <- file.path(here::here(), "package/R") 5 | import::from(.from = "cembav2env.R", .directory = packdir, 6 | cluSumBySa2) 7 | Sys.setenv("_R_USE_PIPEBIND_" = TRUE) 8 | 9 | # * load L3 clustering result 10 | L3Sums <- cluSumBySa2$loadL3Sums() 11 | L3Resos <- cluSumBySa2$loadL3Resos() 12 | barcode2L3 <- cluSumBySa2$loadbarcode2L3() 13 | 14 | # * prepare two files 15 | # 0. get L3 needed for L4 clustering 16 | uL3s <- unique(barcode2L3$L3) 17 | 18 | # nolint start 19 | L3Pattern <- with(L3Resos, L2nm[needL4 == "L4"]) |> 20 | x => gsub("sa2v1_", "", x) 21 | nMin <- 400 22 | # nolint end 23 | 24 | # filter uL3s that are in L3Pattern 25 | uL3toL2 <- gsub("-[0-9]+$", "", uL3s) 26 | uL3forL4 <- uL3s[uL3toL2 %in% L3Pattern] 27 | 28 | # 1. cluster size at L3-level in uL3forL4 29 | # and no less than nMin 30 | L3tosize <- table(barcode2L3$L3) |> 31 | as.data.frame(stringsAsFactors = FALSE) |> 32 | setNames(c("L3", "size")) |> 33 | x => x[x$L3 %in% uL3forL4, ] |> 34 | arrange(desc(size)) |> 35 | x => x[x$size >= nMin, ] 36 | # output L3tosize to csv file without header 37 | write.table(L3tosize, 38 | file = file.path("../resource", "sa2_dlt2_L3forL4_cluster2size.csv"), 39 | sep = ",", row.names = FALSE, col.names = FALSE, quote = FALSE) 40 | 41 | # 2. 
barcode2cluster at L3-level 42 | barcode2L3 |> 43 | x => x[x$L3 %in% uL3forL4, ] |> 44 | write.table( 45 | file = file.path("../resource", "sa2_dlt2_L3forL4_barcode2id.csv"), 46 | sep = ",", row.names = FALSE, col.names = TRUE, quote = FALSE) 47 | 48 | -------------------------------------------------------------------------------- /02.integration/src/main/R/annToS5.R: -------------------------------------------------------------------------------- 1 | library(BPCells) 2 | library(Seurat) 3 | options(Seurat.object.assay.version = "v5") 4 | library(stringr) 5 | library(purrr) 6 | Sys.setenv("_R_USE_PIPEBIND_" = TRUE) 7 | 8 | projdir <- here::here() 9 | rdir <- file.path(projdir, "package/R") 10 | import::from(.from = "utils.R", .directory = rdir, 11 | setupLogging, closeLogging) 12 | 13 | 14 | ## library(future) 15 | ## plan(multicore, workers = 2) 16 | ## options(future.globals.maxSize = 2e9) 17 | ## options(future.rng.onMisuse = "ignore") 18 | 19 | 20 | # * load snakemake configs 21 | annfnm <- snakemake@input[[1]] 22 | outfnm <- snakemake@output[[1]] 23 | logfnm <- snakemake@log[[1]] 24 | modality <- snakemake@params[["modality"]] 25 | ## py <- snakemake@params[["py"]] 26 | 27 | # * set logger 28 | setupLogging(logfnm) 29 | 30 | # * set python 31 | ## library(reticulate) 32 | ## reticulate::use_python(py) 33 | ## ad <- reticulate::import("anndata", convert = FALSE) 34 | 35 | # * function 36 | convertAnn2Seurat5 <- function(annfnm, 37 | modality, 38 | group = "X", 39 | outdir, 40 | overwrite = TRUE, 41 | assay = "RNA", 42 | isLogNorm = TRUE) { 43 | xlognorm <- BPCells::open_matrix_anndata_hdf5( 44 | path = annfnm, group = group) 45 | BPCells::write_matrix_dir(mat = xlognorm, dir = outdir, overwrite = overwrite) 46 | d <- BPCells::open_matrix_dir(outdir) 47 | s5 <- Seurat::CreateSeuratObject(counts = d, assay = assay) 48 | if (isLogNorm) { 49 | s5 <- Seurat::SetAssayData( 50 | object = s5, slot = "data", new.data = d) 51 | } 52 | s5$modality <- modality 53 | return(s5) 54 | } 55 | 56 | 57 | # * main 58 | outdir <- dirname(outfnm) 59 | outfprefix <- sub("\\.[^.]+$", "", basename(outfnm)) 60 | logger::log_info("output to: ", outdir) 61 | logger::log_info("out file prefix: ", outfprefix) 62 | 63 | logger::log_info("to Seuratv5: ", annfnm) 64 | s5 <- convertAnn2Seurat5( 65 | annfnm = annfnm, modality = modality, 66 | outdir = file.path(outdir, outfprefix), isLogNorm = TRUE) 67 | logger::log_info( 68 | "NOTE: [modality] column added to Seuratv5 with value: ", modality) 69 | logger::log_info("NOTE: obs meta data is ignored in Seuratv5.") 70 | 71 | logger::log_info("save seuratv5 to: ", outfnm) 72 | saveRDS(s5, outfnm) 73 | logger::log_info("to Seuratv5 done.") 74 | closeLogging() 75 | -------------------------------------------------------------------------------- /02.integration/src/main/R/downsample.Allen.Seurat.on.subclass.level.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(Seurat) 3 | options(Seurat.object.assay.version = "v5") 4 | options(future.globals.maxSize = 5e9) 5 | library(SeuratObject) 6 | library(Matrix) 7 | Sys.setenv("_R_USE_PIPEBIND_" = TRUE) 8 | library(future) 9 | 10 | projdir <- here::here() 11 | rdir <- file.path(projdir, "package/R") 12 | import::from(.from = "cembav2env.R", .directory = rdir, 13 | cluSumBySa2, Sa2Integration, cembav2env, Sa2PeakCalling) 14 | import::from(.from = "integration.R", .directory = rdir, 15 | convertAnn2Seurat5, get.downsample.fun, downsampleSeurat, 16 | toSeuratInMemory, 17 
| isOnDiskMat.Seurat, calVarOfFea.Seurat, setVariableFeatures) 18 | 19 | # * configs 20 | # tscc or encoder 21 | system <- "tscc" 22 | rscdir <- file.path(here::here(), "19.snap2_integration", 23 | "src/main/resource") 24 | outdir <- file.path(here::here(), "19.snap2_integration", 25 | paste0("out/transferLabel_", system)) 26 | if(!dir.exists(outdir)) { 27 | dir.create(outdir) 28 | } 29 | allenAnnotMeta <- Sa2Integration$loadAllenAnnot() 30 | allenAnnotMeta$subclass_label_v3 <- vapply(allenAnnotMeta$subclass_label, 31 | Sa2PeakCalling$renameAllenSubclass, "rename") 32 | 33 | neuronAllenSeu <- readRDS( 34 | file.path(outdir, "neuron_allen_noraw_seurat.rds")) 35 | neuronAllenSeu$subclass <- allenAnnotMeta[ 36 | as.character(neuronAllenSeu$cl), "subclass_label_v3"] 37 | neuronAllenSeu$barcode <- colnames(neuronAllenSeu) 38 | 39 | nnAllenSeu <- readRDS( 40 | file.path(outdir, "nn_allen_noraw_seurat.rds")) 41 | nnAllenSeu$subclass <- allenAnnotMeta[ 42 | as.character(nnAllenSeu$cl), "subclass_label_v3"] 43 | nnAllenSeu$barcode <- colnames(nnAllenSeu) 44 | 45 | # * downsample by subclass 46 | s <- merge(nnAllenSeu, neuronAllenSeu) 47 | nds <- 1000 48 | barcodes <- s@meta.data |> 49 | group_by(subclass) |> slice_sample(n = nds) |> 50 | x => x$barcode 51 | s <- s[, barcodes] 52 | mat1 <- as(s$RNA[["data.1"]], Class = "dgCMatrix") 53 | mat2 <- as(s$RNA[["data.2"]], Class = "dgCMatrix") 54 | mat <- SeuratObject::RowMergeSparseMatrices( 55 | mat1 = mat1, mat2 = mat2) 56 | meta <- s@meta.data 57 | r <- Seurat::CreateSeuratObject(counts = mat, assay = "RNA", 58 | meta.data = meta) 59 | r <- Seurat::SetAssayData(object = r, slot = "data", new.data = mat) 60 | saveRDS(r, file.path(outdir, "allen_ds1000_seurat.rds")) 61 | 62 | ## nds <- 5000 63 | ## neuron.meta.ds <- neuronAllenSeu@meta.data |> 64 | ## group_by(subclass) |> slice_sample(n = nds) 65 | ## neuronAllenSeu.ds <- neuronAllenSeu[, neuron.meta.ds$barcode] 66 | ## saveRDS(neuronAllenSeu.ds, file.path(outdir, 67 | ## "neuron_allen_ds5000_seurat.rds")) 68 | 69 | ## nn.meta.ds <- nnAllenSeu@meta.data |> 70 | ## group_by(subclass) |> slice_sample(n = nds) 71 | ## nnAllenSeu.ds <- nnAllenSeu[ , nn.meta.ds$barcode] 72 | ## saveRDS(nnAllenSeu.ds, file.path(outdir, 73 | ## "nn_allen_ds5000_seurat.rds")) 74 | 75 | -------------------------------------------------------------------------------- /02.integration/src/main/R/downsample.sa2.Seurat.on.subclass.level.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(Seurat) 3 | options(Seurat.object.assay.version = "v5") 4 | options(future.globals.maxSize = 5e9) 5 | library(SeuratObject) 6 | library(Matrix) 7 | Sys.setenv("_R_USE_PIPEBIND_" = TRUE) 8 | library(future) 9 | 10 | projdir <- here::here() 11 | rdir <- file.path(projdir, "package/R") 12 | import::from(.from = "cembav2env.R", .directory = rdir, 13 | cluSumBySa2, Sa2Integration, cembav2env, Sa2PeakCalling) 14 | import::from(.from = "integration.R", .directory = rdir, 15 | convertAnn2Seurat5, get.downsample.fun, downsampleSeurat, 16 | toSeuratInMemory, 17 | isOnDiskMat.Seurat, calVarOfFea.Seurat, setVariableFeatures) 18 | 19 | # * configs 20 | # tscc or encoder 21 | system <- "tscc" 22 | rscdir <- file.path(here::here(), "19.snap2_integration", 23 | "src/main/resource") 24 | outdir <- file.path(here::here(), "19.snap2_integration", 25 | paste0("out/transferLabel_", system)) 26 | 27 | # load atacmeta 28 | atacMeta <- readRDS(cembav2env$sa2metaFile) 29 | 
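The script stops after loading the meta table; the rest presumably mirrors the per-subclass downsampling in the Allen script above. A minimal sketch of that step (assumptions: `atacSeu` is a hypothetical name for the ATAC gmat Seurat object with barcodes as column names, `subclass_label_v3` is the subclass column of `atacMeta`, and the output file name is illustrative):

atacSeu$subclass <- atacMeta[colnames(atacSeu), "subclass_label_v3"]
atacSeu$barcode <- colnames(atacSeu)
nds <- 1000
barcodes <- atacSeu@meta.data |>
  group_by(subclass) |> slice_sample(n = nds) |>
  x => x$barcode
saveRDS(atacSeu[, barcodes], file.path(outdir, "sa2_ds1000_seurat.rds"))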
-------------------------------------------------------------------------------- /02.integration/src/main/R/getIntUMAP.R: -------------------------------------------------------------------------------- 1 | library(Seurat) 2 | options(Seurat.object.assay.version = "v5") 3 | library(stringr) 4 | library(purrr) 5 | Sys.setenv("_R_USE_PIPEBIND_" = TRUE) 6 | projdir <- here::here() 7 | rdir <- file.path(projdir, "package/R") 8 | import::from(.from = "cembav2env.R", .directory = rdir, 9 | Sa2Integration) 10 | import::from(.from = "utils.R", .directory = rdir, 11 | setupLogging, closeLogging) 12 | 13 | # * load snakemake configs 14 | intS5fnm <- snakemake@input[[1]] 15 | outfnm <- snakemake@output[[1]] 16 | nPCA <- snakemake@params$nPCA 17 | intMethod <- snakemake@wildcards$m 18 | 19 | # * set logger 20 | setupLogging(snakemake@log[[1]]) 21 | 22 | # * main 23 | logger::log_info("readRDS: ", intS5fnm) 24 | seu <- readRDS(intS5fnm) 25 | logger::log_info("After integration with: ", intMethod) 26 | logger::log_info("run UMAP on reduction: ", paste0("intgn.", intMethod)) 27 | 28 | seu <- Seurat::RunUMAP(seu, 29 | reduction = paste0("intgn.", intMethod), dims = 1:nPCA, 30 | reduction.name = paste0("umap.", intMethod)) 31 | 32 | logger::log_info("finish UMAP, and save to ", outfnm) 33 | saveRDS(seu, outfnm) 34 | logger::log_info("done") 35 | closeLogging() 36 | -------------------------------------------------------------------------------- /02.integration/src/main/R/mapSubclassNames.R: -------------------------------------------------------------------------------- 1 | projroot <- here::here() 2 | rdir <- file.path(projroot, "package/R") 3 | import::from(.from = "cembav2env.R", .directory = rdir, 4 | cembav2env, Sa2Integration, Sa2PeakCalling) 5 | import::from(.from = "utils.R", .directory = rdir, 6 | fastread.csv) 7 | 8 | 9 | allenMeta <- Sa2Integration$loadAllenAnnot() 10 | 11 | subclassMap <- unique(data.frame( 12 | subclass_id = allenMeta$subclass_id, 13 | subclass_id_label = allenMeta$subclass_id_label, 14 | subclass_label = allenMeta$subclass_label 15 | )) 16 | 17 | subclass_label_bw <- data.table::fread( 18 | file = "../resource/subclass_nm_in_macs2_bigwig.txt", 19 | header = FALSE, data.table = FALSE)$V1 20 | 21 | subclassMap$subclass_id_label_bw <- subclassMap$subclass_id_label |> 22 | gsub(" ", "_", x = _) |> 23 | gsub("/", ".", x = _) 24 | 25 | subclassMap$subclass_label_bw <- subclassMap$subclass_label |> 26 | gsub(" ", "_", x = _) |> 27 | gsub("/", ".", x = _) 28 | 29 | subclassMap$subclass_label_peak <- subclassMap$subclass_label |> 30 | Sa2PeakCalling$renameAllenSubclass() 31 | 32 | # check 33 | all(subclass_label_bw %in% subclassMap$subclass_id_label_bw) 34 | a <- fastread.csv(fnm = file.path(projroot, "18.snap2_peakcalling", 35 | "out/scfilter", "count_peakBysubclass.csv")) 36 | all(colnames(a) %in% subclassMap$subclass_label_peak) 37 | 38 | # save 39 | saveRDS(subclassMap, 40 | file.path(projroot, "meta", "sa2.subclass.names.map.rds")) 41 | write.table(subclassMap, 42 | file.path(projroot, "meta", "sa2.subclass.names.map.csv"), 43 | sep = ",", quote = FALSE, col.names = TRUE, row.names = FALSE) 44 | -------------------------------------------------------------------------------- /02.integration/src/main/R/runPCA.R: -------------------------------------------------------------------------------- 1 | library(Seurat) 2 | options(Seurat.object.assay.version = "v5") 3 | library(stringr) 4 | library(purrr) 5 | Sys.setenv("_R_USE_PIPEBIND_" = TRUE) 6 | projdir <- here::here() 7 | rdir <- 
file.path(projdir, "package/R") 8 | import::from(.from = "cembav2env.R", .directory = rdir, 9 | Sa2Integration) 10 | import::from(.from = "utils.R", .directory = rdir, 11 | setupLogging, closeLogging) 12 | import::from(.from = "integration.R", .directory = rdir, 13 | get.downsample.fun, downsampleSeurat) 14 | 15 | 16 | # * load snakemake configs 17 | infnm <- snakemake@input[[1]] 18 | outfnm <- snakemake@output[[1]] 19 | ft <- snakemake@wildcards$ft 20 | logfnm <- snakemake@log[[1]] 21 | nPCA <- snakemake@params$nPCA 22 | 23 | # * set logger 24 | setupLogging(logfnm) 25 | 26 | # * load features from Sa2Integration 27 | logger::log_info("load features from Sa2Integration: ", ft) 28 | geneList <- Sa2Integration$getMarkersList() 29 | fts <- geneList[[ft]] 30 | 31 | # * main 32 | logger::log_info("readRDS: ", infnm) 33 | seu <- readRDS(infnm) 34 | logger::log_info("ScaleData on feature: ", ft) 35 | Seurat::VariableFeatures(seu) <- fts 36 | seu <- Seurat::ScaleData(seu, features = fts) 37 | 38 | logger::log_info("runPCA") 39 | seu <- Seurat::RunPCA( 40 | seu, features = fts, verbose = TRUE, 41 | npcs = nPCA) 42 | 43 | logger::log_info("writeRDS: ", outfnm) 44 | saveRDS(seu, outfnm) 45 | logger::log_info("done") 46 | closeLogging() 47 | -------------------------------------------------------------------------------- /02.integration/src/main/R/simple.gene.list.of.allen.R: -------------------------------------------------------------------------------- 1 | library(purrr) 2 | Sys.setenv("_R_USE_PIPEBIND_" = TRUE) 3 | library(stringr) 4 | projdir <- here::here() 5 | resourcedir <- "../resource" 6 | rdir <- file.path(projdir, "package/R") 7 | import::from(.from = "cembav2env.R", .directory = rdir, 8 | Sa2Integration) 9 | 10 | # * simply extract allen's cluster marker genes 11 | annotAllen <- Sa2Integration$loadAllenAnnot() 12 | 13 | # * functions 14 | filterMarkers <- function(markers, sep = ",") { 15 | # filter space 16 | # filter gene symbols linked with ENSMUSG 17 | r <- map(markers, str_split_1, pattern = sep) |> 18 | unlist() |> sort() |> 19 | x => x[!grepl("ENSMUSG", x)] |> 20 | x => x[nzchar(x)] |> 21 | unique() 22 | message("found ", length(r), " markers.") 23 | message("unique ", length(unique(r)), " markers.") 24 | return(r) 25 | } 26 | getMarkers <- function(annotAllen, 27 | groupBy = "", 28 | groups = c(), 29 | markerCol = "cluster.markers", 30 | outfnm = "") { 31 | message("marker column: ", markerCol) 32 | markers <- if (nchar(groupBy) > 1) { 33 | message("select markers from column: ", groupBy) 34 | message("with values: ", paste(groups, collapse = " ")) 35 | gs <- annotAllen[[groupBy]] 36 | annotAllen[gs %in% groups, markerCol] 37 | } else { 38 | annotAllen[[markerCol]] 39 | } 40 | r <- filterMarkers(markers) 41 | if (nchar(outfnm) > 1) { 42 | message("save results to: ", outfnm) 43 | write.table(r, outfnm, append = FALSE, quote = FALSE, 44 | col.names = FALSE, row.names = FALSE) 45 | } 46 | return(r) 47 | } 48 | 49 | # * main 50 | allMarkers <- getMarkers( 51 | annotAllen, markerCol = "cluster.markers", 52 | outfnm = file.path(resourcedir, "AIT21_cluster_markers.txt") 53 | ) 54 | allMerfishMarkers <- getMarkers( 55 | annotAllen, markerCol = "merfish.markers", 56 | outfnm = file.path(resourcedir, "AIT21_merfish_markers.txt") 57 | ) 58 | 59 | # fix k8 markers 60 | rawk8markers <- read.table( 61 | Sa2Integration$AllenRaw8KMarkerFile, header = FALSE)$V1 62 | k8markers <- filterMarkers(rawk8markers) 63 | write.table(k8markers, 64 | file.path(resourcedir, "AIT21_k8_markers.txt"), 65 | 
append = FALSE, quote = FALSE, col.names = FALSE, row.names = FALSE) 66 | 67 | # * test markers 68 | geneList <- Sa2Integration$getMarkersList() 69 | -------------------------------------------------------------------------------- /02.integration/src/main/python/imneuron.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import importlib 4 | import pandas as pd 5 | import anndata as ad 6 | 7 | from pyprojroot import here 8 | proj_root: str = str(here()) 9 | sys.path.insert(0, os.path.join(proj_root, "package/python")) 10 | from utils import set_file_logger 11 | import cembav2env 12 | importlib.reload(cembav2env) 13 | 14 | # * configs 15 | rsc_dir = os.path.join(proj_root, 16 | "19.snap2_integration/" 17 | "src/main/resource") 18 | allen = cembav2env.Allen() 19 | sa2atac = cembav2env.Sa2ATAC() 20 | 21 | # * load reduced ann data 22 | reduced_allen_dir = os.path.join(rsc_dir, "norawdata_allen") 23 | neuron_allen_fnm = os.path.join(reduced_allen_dir, 24 | "neuron_male_allen_ann_noraw.h5ad") 25 | neuron_allen = ad.read_h5ad( 26 | filename=neuron_allen_fnm, backed="r") 27 | 28 | nn_allen_fnm = os.path.join(reduced_allen_dir, 29 | "nn_male_allen_ann_noraw.h5ad") 30 | nn_allen = ad.read_h5ad( 31 | filename=nn_allen_fnm, backed="r") 32 | 33 | -------------------------------------------------------------------------------- /02.integration/src/main/python/reduce.anndata.allen.sa2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import importlib 4 | import pandas as pd 5 | import anndata as ad 6 | 7 | from pyprojroot import here 8 | proj_root: str = str(here()) 9 | sys.path.insert(0, os.path.join(proj_root, "package/python")) 10 | from utils import set_file_logger 11 | import cembav2env 12 | importlib.reload(cembav2env) 13 | 14 | # * configs 15 | rsc_dir = os.path.join(proj_root, 16 | "19.snap2_integration/" 17 | "src/main/resource") 18 | allen = cembav2env.Allen() 19 | sa2atac = cembav2env.Sa2ATAC() 20 | 21 | # * reduce anndata by removing raw data 22 | # run only once 23 | # let's keep using log-norm data for later integration 24 | # since scaling is invariant towards normalization constant. 
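# A hedged sketch of what the get_reduced_ann calls below are assumed to do
# (the actual helper lives in package/python/cembav2env.py): keep the
# log-normalized X and drop the raw counts before writing, roughly
# def get_reduced_ann(ann, outfnm):
#     ann.raw = None                  # drop the raw snapshot, if any
#     for k in list(ann.layers):      # raw counts sit in layers per the AIT21 ReadMe
#         del ann.layers[k]
#     ann.write(outfnm)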
25 | 26 | reduced_allen_dir = os.path.join(rsc_dir, "norawdata_allen") 27 | if not os.path.exists(reduced_allen_dir): 28 | os.makedirs(reduced_allen_dir, exist_ok = False) 29 | ann_10xv3_nn = allen.get_10xv3_nn_ann() 30 | allen.get_reduced_ann( 31 | ann_10xv3_nn, 32 | outfnm = os.path.join(reduced_allen_dir, "nn_male_10xv3_ann_noraw.h5ad")) 33 | del ann_10xv3_nn 34 | 35 | ann_10xv3_neuron = allen.get_10xv3_neuron_ann() 36 | allen.get_reduced_ann( 37 | ann_10xv3_neuron, 38 | outfnm = os.path.join(reduced_allen_dir, 39 | "neuron_male_10xv3_ann_noraw.h5ad")) 40 | del ann_10xv3_neuron 41 | 42 | 43 | ann_all_nn = allen.get_allen_nn_ann() 44 | allen.get_reduced_ann( 45 | ann_all_nn, 46 | outfnm = os.path.join(reduced_allen_dir, 47 | "nn_male_allen_ann_noraw.h5ad")) 48 | del ann_all_nn 49 | 50 | ann_all_neuron = allen.get_allen_neuron_ann() 51 | allen.get_reduced_ann( 52 | ann_all_neuron, 53 | outfnm = os.path.join(reduced_allen_dir, 54 | "neuron_male_allen_ann_noraw.h5ad")) 55 | del ann_all_neuron 56 | 57 | # * remove raw from our atac gmat 58 | # run only once 59 | reduced_atac_dir = os.path.join(rsc_dir, "norawdata_atac") 60 | if not os.path.exists(reduced_atac_dir): 61 | os.makedirs(reduced_atac_dir, exist_ok=True) 62 | atac_ann = sa2atac.load_sa2gmat_ann() 63 | barcode2L3: pd.DataFrame = sa2atac.read_barcode2L3() 64 | sa2atac.add_L3_to_atac_ann(atac_ann, barcode2L3) 65 | 66 | 67 | atac_rough_annot = sa2atac.read_rough_annot() 68 | barcode2annot = atac_rough_annot.loc[barcode2L3["L3"]] 69 | barcode2annot.index = barcode2L3["barcode"] 70 | 71 | atac_nn_ann = sa2atac.get_nn_atac_ann( 72 | atac_ann, barcode2annot, 73 | outfnm = os.path.join(reduced_atac_dir, "nn_gmat_atac_ann.h5ad")) 74 | del atac_nn_ann 75 | 76 | atac_neuron_ann = sa2atac.get_neuron_atac_ann( 77 | atac_ann, barcode2annot, 78 | outfnm = os.path.join(reduced_atac_dir, "neuron_gmat_atac_ann.h5ad") 79 | ) 80 | del atac_neuron_ann 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /02.integration/src/main/resource/AIT21_ReadMe.txt: -------------------------------------------------------------------------------- 1 | This folder includes Mouse Whole Brain taxonomy's cellxgene matrix for scRNAseq data (10Xv2, 10Xv3) and snRNAseq data (Multiome) in h5ad file and the cluster annotation file. 2 | 3 | # cellxgene matrix 4 | 5 | - filename : AIT21_10Xv2.h5ad 6 | AIT21_10Xv3.h5ad 7 | AIT21_10Xmulti.h5ad 8 | 9 | - anndata$X : log-normalized count matrix 10 | $layers : raw count matrix 11 | $obs : cluster id 'cl' and sample related metadata 12 | 'cl' - cluster id to be used as matching key for cluster annotation file! 
13 | library_prep - library 14 | gene.counts.0 - number of detected genes 15 | doublet_score - doublet score 16 | roi - region 17 | umi.counts - number of detected UMIs 18 | method - 10Xv3 / 10Xv2 / 10Xmulti 19 | sex - 20 | external_donor_name - 21 | age - 22 | medical_conditions - Light/Dark 23 | 24 | # cluster annotation 25 | 26 | - filename : AIT21_annotation.tsv 27 | cl : cluster key matching all cluster-related tables 28 | cluster_id/cluster_label : id and label at cluster level 29 | supertype_id/supertype_label : id and label at supertype level 30 | subclass_id/subclass_label : id and label at subclass level 31 | class_id/class_label : id and label at class level 32 | anatomical_annotation : anatomical region that contributes the most cells to the cluster 33 | CCF_broad.freq/CCF_acronym.freq : fraction of cells from regions in CCF_broad or CCF_acronym 34 | v3.size/v2.size/multiome.size : number of cells or nuclei from 10Xv3, 10Xv2, and multiome data 35 | cluster.markers : markers of the cluster 36 | merfish.markers : markers of the cluster for merfish data 37 | max.region : The region that contributes the most cells to a given cluster 38 | max.region.ratio : The fraction of cells coming from the max.region 39 | F,M : sex distribution 40 | 41 | 42 | -------------------------------------------------------------------------------- /02.integration/src/main/resource/BICCN.BrainRegionMetadata.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondpie/CEMBA_wmb_snATAC/4605d86a83fe45a38e656ba87df85fe99762f358/02.integration/src/main/resource/BICCN.BrainRegionMetadata.xlsx -------------------------------------------------------------------------------- /02.integration/src/test/R/test.snakemake.wildcards.R: -------------------------------------------------------------------------------- 1 | logger::log_threshold(logger::TRACE) 2 | log_file <- snakemake@log[[1]] 3 | logger::log_appender(logger::appender_file(log_file)) 4 | 5 | 6 | afnm <- snakemake@input[["afnm"]] 7 | str(afnm) 8 | logger::log_info(afnm) 9 | 10 | bfnm <- snakemake@input[["bfnm"]] 11 | str(bfnm) 12 | print(bfnm) 13 | 14 | params <- snakemake@params[["a"]] 15 | str(params) 16 | print(params) 17 | 18 | print(snakemake@wildcards) 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /02.integration/src/test/pipeline/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: test_R_snakemake test_intgn_snakemake 2 | 3 | test_R_snakemake: R.Snakefile 4 | snakemake --snakefile $< -R -c 2\ 5 | --rerun-triggers mtime \ 6 | --skip-script-cleanup 7 | rm out/*.out 8 | 9 | conda_path := /home/szu/mambaforge/envs/seurat/bin 10 | test_intgn_snakemake: ../../main/pipeline/Seurat.Intgn.Snakefile 11 | -mkdir -p $@ 12 | cp $< $@/Snakefile 13 | cd $@ && \ 14 | snakemake --snakefile Snakefile -R -c 2 \ 15 | --config \ 16 | gp=nn \ 17 | debug=1 \ 18 | intgn_method='rpac,mnn' \ 19 | allen_techs='10xv3' \ 20 | --rerun-triggers mtime \ 21 | --skip-script-cleanup \ 22 | --profile pbs-torque-conda 23 | 24 | clean: 25 | -rm -rf out 26 | -rm -rf log 27 | -rm -rf .snakemake 28 | -------------------------------------------------------------------------------- /02.integration/src/test/pipeline/R.Snakefile: -------------------------------------------------------------------------------- 1 | import os 2 | a = ["a1", "a2", "a3"] 3 | 4 | out_dir = "out" 5 | log_dir = "log" 6 | for i in [out_dir, 
log_dir]: 7 | os.makedirs(i, exist_ok = True) 8 | 9 | script_dir = "../R" 10 | 11 | rule all: 12 | input: 13 | expand("{o}/{t}.out", o = out_dir, t = a) 14 | 15 | rule getOut: 16 | input: 17 | # afnm = lambda w: f"{out_dir}/{w.t}.in", 18 | # afnm = expand("{o}/{{t}}.in", o = out_dir), 19 | afnm = f"{out_dir}/{{t}}.in", 20 | # bfnm = lambda w: f"{out_dir}/{w.t}.in" 21 | bfnm = f"{out_dir}/{{t}}.in" 22 | # bfnm = expand("{o}/{{t}}.in", o = out_dir) 23 | output: 24 | touch(expand("{o}/{{t}}.out", o = out_dir)) 25 | log: 26 | f"{log_dir}/{{t}}.log" 27 | # wildcard in log field does not work 28 | # fnm = lambda w: f"{log_dir}/{w.t}.log" 29 | params: 30 | a = [1,2,3] 31 | script: 32 | f"{script_dir}/test.snakemake.wildcards.R" 33 | 34 | 35 | -------------------------------------------------------------------------------- /02.integration/src/test/python/prepare.intg.test.ann.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import importlib 4 | import anndata as ad 5 | import pandas as pd 6 | import numpy as np 7 | from pyprojroot import here 8 | 9 | proj_root = str(here()) 10 | sys.path.insert(0, os.path.join(proj_root, "package/python")) 11 | from utils import set_file_logger 12 | import cembav2env 13 | 14 | importlib.reload(cembav2env) 15 | 16 | rsc_dir = os.path.join(proj_root, "19.snap2_integration", 17 | "src/test/resource") 18 | out_allen_dir = os.path.join(rsc_dir, "noraw_allen") 19 | os.makedirs(out_allen_dir, exist_ok = True) 20 | out_atac_dir = os.path.join(rsc_dir, "noraw_atac") 21 | os.makedirs(out_atac_dir, exist_ok = True) 22 | 23 | allen = cembav2env.Allen() 24 | sa2atac = cembav2env.Sa2ATAC() 25 | 26 | nn_allen = allen.read_nn_10xv3_lognorm_ann() 27 | 28 | rdm_nn_allen = nn_allen[np.random.choice(nn_allen.n_obs, 5000, replace = False)] 29 | 30 | rdm_nn_allen.write( 31 | os.path.join(out_allen_dir, "nn_male_10xv3_ann_noraw.h5ad")) 32 | 33 | del nn_allen 34 | 35 | nn_atac = sa2atac.read_nn_gmat_lognorm_ann() 36 | rdm_nn_atac = nn_atac[np.random.choice(nn_atac.n_obs, 5000, replace = False)] 37 | 38 | rdm_nn_atac.write( 39 | os.path.join(out_atac_dir, "nn_gmat_atac_ann.h5ad")) 40 | del nn_atac 41 | 42 | -------------------------------------------------------------------------------- /03.peakcalling/bin/merge_peaks: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondpie/CEMBA_wmb_snATAC/4605d86a83fe45a38e656ba87df85fe99762f358/03.peakcalling/bin/merge_peaks -------------------------------------------------------------------------------- /03.peakcalling/src/main/R/addpL4Info2atacMeta.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(rlang) 3 | Sys.setenv("_R_USE_PIPEBIND_" = TRUE) 4 | projdir <- here::here() 5 | rdir <- file.path(projdir, "package/R") 6 | import::from(.from = "cembav2env.R", .directory = rdir, 7 | cembav2env) 8 | 9 | atacMeta <- readRDS(cembav2env$sa2metaFile) 10 | rownames(atacMeta) <- atacMeta$barcode2 11 | 12 | barcode2pL4_nn_fnm <- file.path(projdir, "18.snap2_peakcalling", 13 | "src/main/resource", 14 | "nn_barcode2cluster_bedtag-all.csv") 15 | 16 | barcode2pL4_nn <- data.table::fread( 17 | file = barcode2pL4_nn_fnm, header = FALSE, sep = ",", 18 | data.table = FALSE) 19 | colnames(barcode2pL4_nn) <- c("barcode", "pL4") 20 | barcode2pL4_nn$gpL4 <- paste0("nn.", barcode2pL4_nn$pL4) 21 | 22 | barcode2pL4_neuron_fnm <- file.path(projdir, "18.snap2_peakcalling", 23 
| "src/main/resource", 24 | "neuron_barcode2cluster_bedtag-all.csv") 25 | barcode2pL4_neuron <- data.table::fread( 26 | file = barcode2pL4_neuron_fnm, header = FALSE, sep = ",", 27 | data.table = FALSE) 28 | colnames(barcode2pL4_neuron) <- c("barcode", "pL4") 29 | barcode2pL4_neuron$gpL4 <- paste0("neuron.", barcode2pL4_neuron$pL4) 30 | 31 | barcode2pL4 <- rbind(barcode2pL4_nn, barcode2pL4_neuron) 32 | rownames(barcode2pL4) <- barcode2pL4$barcode 33 | 34 | atacMeta$pL4 <- barcode2pL4[rownames(atacMeta), "pL4"] 35 | atacMeta$gpL4 <- barcode2pL4[rownames(atacMeta), "gpL4"] 36 | saveRDS(atacMeta, 37 | file = file.path(projdir, "supple.02.annotation.all.in.one.meta", 38 | "mba.whole.cell.meta.v9.4.rds")) 39 | 40 | 41 | # * now it's v9.7 42 | atacMeta <- readRDS(cembav2env$sa2metaFile) 43 | 44 | table(atacMeta[grep("IMN", atacMeta$subclass_label_v3), c("L4", "NT_v3")], useNA = "ifany") 45 | table(atacMeta[grep("IMN", atacMeta$subclass_label_v3), c("L4", "mainclass.rough")], useNA = "ifany") 46 | -------------------------------------------------------------------------------- /03.peakcalling/src/main/R/fitbgmodel.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(reticulate) 3 | library(gamlss.dist) 4 | library(gamlss) 5 | library(fitdistrplus) 6 | 7 | usePython <- "/home/szu/mambaforge/envs/sa2/bin/python" 8 | reticulate::use_python(usePython) 9 | pd <- reticulate::import("pandas", convert = FALSE) 10 | 11 | projdir <- here::here() 12 | workdir <- file.path(projdir, "18.snap2_peakcalling") 13 | 14 | # * meta 15 | default_cutoff <- 0.001 16 | pep <- 0.001 17 | rnd_upbound <- 0.1 18 | outdir <- file.path(workdir, "out/scfilter/fitfrac_bg") 19 | 20 | 21 | # * read peakfrac data 22 | peakfrac_rnd <- pd$read_pickle( 23 | file.path(workdir, "out/scfilter", "peakfrac_rnd.pkl")) 24 | peakfrac_rnd <- reticulate::py_to_r(x = peakfrac_rnd) 25 | 26 | ## peakfrac_union <- pd$read_pickle( 27 | ## file.path(workdir, "out/scfilter", "peakfrac_union.pkl")) 28 | ## peakfrac_union <- reticulate::py_to_r(x = peakfrac_union) 29 | 30 | # * fit models for each pL4 using peakfrac_rnd 31 | i <- as.integer(commandArgs(trailingOnly = TRUE)[1]) 32 | pL4 <- rownames(peakfrac_rnd)[i] 33 | message("fit BEZI model for ", pL4) 34 | 35 | x <- unlist(peakfrac_rnd[i, ]) 36 | x <- x[x <= rnd_upbound] 37 | 38 | mod <- gamlss::gamlss( 39 | x ~ 1, sigma.formula = ~1, 40 | nu.formula = ~1, 41 | family = BEZI, 42 | control = gamlss::gamlss.control(n.cyc = 100, trace = FALSE), 43 | ) 44 | 45 | ## summary(mod) 46 | mufit <- fitted(mod, "mu")[1] 47 | sigmafit <- fitted(mod, "sigma")[1] 48 | nufit <- fitted(mod, "nu")[1] 49 | converged <- ifelse(mod$converged, 1, 0) 50 | 51 | cutoff <- if (mod$converged) { 52 | # find x axis value at p = pep (default 0.001) 53 | gamlss.dist::qBEZI((1 - pep), 54 | mu = mufit, sigma = sigmafit, nu = nufit, 55 | lower.tail = TRUE, log.p = FALSE) 56 | } else { 57 | message("Fitting BEZI dose not converge.", 58 | " Use default cutoff: ", default_cutoff) 59 | default_cutoff 60 | } 61 | 62 | para <- data.frame( 63 | n = length(x), 64 | mu = mufit, 65 | sigma = as.numeric(sigmafit), 66 | nu = nufit, 67 | pep = pep, 68 | cutoff = cutoff, 69 | Gdeviance = mod$G.deviance, 70 | converged = converged) 71 | 72 | data.table::fwrite(para, 73 | file.path(outdir, paste0(pL4, ".fitPeakModel.para.csv")), 74 | quote = FALSE, col.names = TRUE, row.names = FALSE, sep = ",") 75 | 76 | message("done.") 77 | 
-------------------------------------------------------------------------------- /03.peakcalling/src/main/R/subclass2peak.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(rlang) 3 | library(R6) 4 | Sys.setenv("_R_USE_PIPEBIND_" = TRUE) 5 | 6 | library(data.table) 7 | library(GenomicRanges) 8 | library(S4Vectors) 9 | 10 | rdir <- file.path(here::here(), "package/R") 11 | import::from(.from = "cembav2env.R", .directory = rdir, 12 | cembav2env, Sa2PeakCalling) 13 | import::from(.from = "peak.R", .directory = rdir, 14 | mapL4pc2L4) 15 | work_dir <- file.path(here::here(), "18.snap2_peakcalling") 16 | 17 | # * load union peaks 18 | unionPeakBed <- Sa2PeakCalling$readUnionPeakBedFile() 19 | clsUnionPeakBeds <- Sa2PeakCalling$loadpL4UnionPeakBeds() 20 | # check 21 | bedExists <- vapply(clsUnionPeakBeds, 22 | \(x) { 23 | all(rownames(x) %in% rownames(unionPeakBed)) 24 | }, TRUE) 25 | 26 | saveRDS(clsUnionPeakBeds, 27 | file = file.path(work_dir, "out/tscc/pL4UnionPeakBeds.rds")) 28 | 29 | -------------------------------------------------------------------------------- /03.peakcalling/src/main/R/sumReproducePeaks.R: -------------------------------------------------------------------------------- 1 | ## library(R.utils) 2 | ## library(purrr) 3 | args <- commandArgs(trailingOnly = TRUE) 4 | rpdpeakDir <- args[1] 5 | outdir <- args[2] 6 | 7 | # debug 8 | ## rpdpeakDir <- "/oasis/tscc/scratch/szu/projects/CEMBA2/18.snap2_peakcalling/out/tscc/rpdpeak" 9 | ## outdir <- "." 10 | 11 | recordSummitFiles <- file.path(outdir, "mba.whole.naiveSummitList.list") 12 | ## cl2npeakFile <- file.path(outdir, "mba.whole.L4.npeak4anno.txt") 13 | 14 | summitFiles <- list.files(path = rpdpeakDir, pattern = ".*naiveSummitList.bed", 15 | full.names = TRUE, include.dirs = TRUE, no.. 
= TRUE, ignore.case = FALSE) 16 | 17 | cls <- basename(summitFiles) |> gsub(".naiveSummitList.bed", "", x = _) 18 | ## npeaks <- map_int(.x = summitFiles, .f = countLines) 19 | 20 | r1 <- data.frame( 21 | cl = cls, 22 | path = summitFiles 23 | ) 24 | 25 | write.table(r1, file = recordSummitFiles, quote = FALSE, sep = "\t", 26 | row.names = FALSE, col.names = FALSE, append = FALSE) 27 | 28 | ## r2 <- data.frame( 29 | ## cl = cls, 30 | ## npeak = npeaks 31 | ## ) 32 | ## write.table(r2, file = cl2npeakFile, quote = FALSE, sep = "\t", 33 | ## row.names = FALSE, col.names = FALSE, append = FALSE) 34 | 35 | message("done.") 36 | 37 | -------------------------------------------------------------------------------- /03.peakcalling/src/main/pipeline/getsa2pmat.Snakefile: -------------------------------------------------------------------------------- 1 | #envvars: 2 | # "PATH" 3 | import os 4 | import pyprojroot 5 | import pandas as pd 6 | from typing import List 7 | 8 | proj_dir = str(pyprojroot.here()) 9 | work_dir = os.path.join(proj_dir, "18.snap2_peakcalling") 10 | script_dir = os.path.join(work_dir, "src/main") 11 | 12 | union_bed_file = os.path.join(work_dir, "out/tscc", 13 | "mba.whole.union.peak.srt.bed") 14 | if not os.path.exists(union_bed_file): 15 | raise FileNotFoundError(union_bed_file) 16 | 17 | rnd_bed_file = os.path.join(work_dir, "out/tscc", 18 | "mba.whole.shuffle.removeovlp.bed") 19 | if not os.path.exists(rnd_bed_file): 20 | raise FileNotFoundError(rnd_bed_file) 21 | 22 | with open(f"{proj_dir}/17.snapatac2/meta/mba.whole.sample.lst", 'r') as f: 23 | samples = [l.strip() for l in f.readlines()] 24 | 25 | snap2_dir = os.path.join(proj_dir, 26 | "17.snapatac2", 27 | "sa2_qc_dlt/rm_dlt") 28 | 29 | fdir = os.path.join(work_dir, "out", "tscc/sa2pmat" , "flag") 30 | ldir = os.path.join(work_dir, "out", "tscc/sa2pmat" , "log") 31 | odir_union = os.path.join(work_dir, "out", "tscc/sa2pmat" , "union_pmat") 32 | odir_rnd = os.path.join(work_dir, "out", "tscc/sa2pmat" , "union_pmat_rnd") 33 | 34 | for d in [fdir, ldir, odir_union, odir_rnd]: 35 | os.makedirs(d, exist_ok = True) 36 | 37 | rule all: 38 | input: 39 | expand("{f}/{s}.sa2pmat.flag", f=fdir, s=samples), 40 | expand("{f}/{s}.sa2pmat_rnd.flag", f=fdir, s=samples) 41 | 42 | rule sa2pmat_union: 43 | input: 44 | bedfnm=union_bed_file 45 | output: 46 | touch(f"{fdir}/{{s}}.sa2pmat.flag") 47 | log: 48 | f"{ldir}/{{s}}.sa2pmat.log" 49 | params: 50 | snap2_dir = snap2_dir, 51 | suffix = "_rm_dlt.h5ad", 52 | out_dir = odir_union, 53 | out_suffix = "_union_pmat.h5ad" 54 | #conda: "sa22" 55 | threads: 2 56 | resources: 57 | walltime="1:00:00", 58 | queue="glean" 59 | script: 60 | f"{script_dir}/python/sa2pmat.py" 61 | 62 | rule sa2pmat_rnd: 63 | input: 64 | bedfnm=rnd_bed_file 65 | output: 66 | touch(f"{fdir}/{{s}}.sa2pmat_rnd.flag") 67 | log: 68 | f"{ldir}/{{s}}.sa2pmat_rnd.log" 69 | params: 70 | snap2_dir = snap2_dir, 71 | suffix = "_rm_dlt.h5ad", 72 | out_dir = odir_rnd, 73 | out_suffix = "_rnd_pmat.h5ad" 74 | #conda: "sa22" 75 | threads: 2 76 | resources: 77 | walltime="1:00:00", 78 | queue="glean" 79 | script: 80 | f"{script_dir}/python/sa2pmat.py" 81 | -------------------------------------------------------------------------------- /03.peakcalling/src/main/pipeline/scfilter.Snakefile: -------------------------------------------------------------------------------- 1 | envvars: 2 | "PATH" 3 | 4 | import os 5 | import pyprojroot 6 | import pandas as pd 7 | from typing import List 8 | 9 | rscript = 
"/home/szu/mambaforge/envs/seurat/bin/Rscript" 10 | proj_dir = str(pyprojroot.here()) 11 | work_dir = os.path.join(proj_dir, "18.snap2_peakcalling") 12 | script_dir = os.path.join(work_dir, "src/main") 13 | 14 | rdir = os.path.join(work_dir, "src/main/resource") 15 | odir = os.path.join(work_dir, "out/scfilter") 16 | fdir = os.path.join(odir, "flag") 17 | ldir = os.path.join(odir, "log") 18 | odir_peakfrac = os.path.join(odir, "peakfrac") 19 | odir_fitbgmodel = os.path.join(odir, "fitfrac_bg") 20 | odir_clfilter = os.path.join(odir, "clfileter") 21 | 22 | for d in [odir, fdir, ldir, odir_peakfrac, odir_fitbgmodel, 23 | odir_clfilter]: 24 | os.makedirs(d, exist_ok = True) 25 | 26 | def get_pL4(L4_fnm:str, prefix:str = "nn.") -> List[str]: 27 | d: pd.DataFrame = pd.read_csv( 28 | L4_fnm, sep = ",", header = None, 29 | names = ['cluster', 'size', 'early_size', 'later_size'], 30 | index_col = None 31 | ) 32 | r = [f"{prefix}{i}" for i in d['cluster'].to_list()] 33 | return r 34 | 35 | nn_L4_fnm = os.path.join(rdir, "nn_L4pc2sizes_cca-k49-cl_v1.csv") 36 | nn_pL4s: List[str] = get_pL4(nn_L4_fnm, prefix = "nn.") 37 | neuron_L4_fnm = os.path.join(rdir, "neuron_L4pc2sizes_cca-k50-cl_v1.csv") 38 | neuron_pL4s: List[str] = get_pL4(neuron_L4_fnm, prefix = "neuron.") 39 | pL4s: List[str] = nn_pL4s + neuron_pL4s 40 | print(f"{len(pL4s)} pL4 cls for filtering peaks.") 41 | 42 | # pL4ids = list(range(1,3)) 43 | pL4ids = list(range(1,1464)) 44 | 45 | # sa2pmatd = os.path.join(work_dir, "out/tscc/sa2pmat") 46 | # sa2pmatd_union = os.path.join(sa2pmatd, "union_pmat") 47 | # sa2pmatd_rnd = os.path.join(sa2pmatd, "union_pmat_rnd") 48 | 49 | rule all: 50 | input: 51 | # f"{fdir}/peakfrac.done", 52 | expand("{f}/fitbgmodel_{cl}.done", f = fdir, cl = pL4ids) 53 | 54 | # let's do this in our script interactively. 55 | # rule get_peakfrac: 56 | # output: 57 | # touch(f"{fdir}/peakfrac.done") 58 | # params: 59 | # sa2pmatd_fg = sa2pmatd_union, 60 | # sa2pmatd_bg = sa2pmatd_rnd, 61 | # outdir = odir_peakfrac 62 | # threads: 63 | # 20 64 | # resources: 65 | # walltime = 8, 66 | # queue = "glean", 67 | # mail = "a", 68 | # tag = "icelake:mem1024" 69 | # conda: 70 | # "sa22" 71 | # script: 72 | # "{script_dir}/python/sa2_get_peakfrac.py" 73 | 74 | rule fit_bgmodel: 75 | # input: 76 | # f"{fdir}/peakfrac.done" 77 | output: 78 | touch(f"{fdir}/fitbgmodel_{{cl}}.done") 79 | log: 80 | f"{ldir}/fitbgmodel_{{cl}}.done" 81 | threads: 5 82 | resources: 83 | walltime = 4, 84 | queue = "condo", 85 | mail = "a" 86 | conda: 87 | "seurat" 88 | shell: 89 | """ 90 | {rscript} {script_dir}/R/fitbgmodel.R {wildcards.cl} 2> {log} 91 | """ 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /03.peakcalling/src/main/python/get_full_snap2.py: -------------------------------------------------------------------------------- 1 | """ 2 | Get snap2 file with all the samples. 
3 | 4 | """ 5 | import os 6 | import sys 7 | import logging 8 | from pathlib import Path 9 | from typing import Dict, List 10 | 11 | import numpy as np 12 | import snapatac2 as sa2 13 | 14 | import pyprojroot 15 | 16 | proj_dir = str(pyprojroot.here()) 17 | pack_dir = f"{proj_dir}/package/python" 18 | sys.path.insert(0, pack_dir) 19 | from mylog import StreamToLogger, set_file_logger 20 | from mysnapatac2 import modify_obs_name 21 | 22 | # * snakemake 23 | # log_fnm=snakemake.log[0] 24 | # snap2_files: List[str] = snakemake.input["snap2_files"] 25 | # out_snap2: str = snakemake.output["snap2"] 26 | 27 | # * log 28 | log_dir = os.path.join(proj_dir, "18.snap2_peakcalling", "out/log") 29 | os.makedirs(log_dir, exist_ok=True) 30 | log_fnm = os.path.join(log_dir, "get_full_snap2.log") 31 | logger = set_file_logger(log_fnm, name="sa2.get_full_snap2") 32 | sys.stdout = StreamToLogger(logger=logger, level=logging.INFO) 33 | sys.stderr = StreamToLogger(logger=logger, level=logging.ERROR) 34 | 35 | # * meta 36 | rdir = os.path.join(proj_dir, "18.snap2_peakcalling", "src/main/resource") 37 | with open(f"{rdir}/mba.whole.sample.lst", "r") as f: 38 | samples = [l.strip() for l in f.readlines()] 39 | out_snap2_fnm = f"{rdir}/cemba.snap2.with.fragment.hdf5" 40 | 41 | snap2s_dir = os.path.join(proj_dir, "17.snapatac2", "sa2_qc_dlt", "rm_dlt") 42 | snap2_files = [f"{snap2s_dir}/{s}_rm_dlt.h5ad" for s in samples] 43 | 44 | logger.info(f"In total, {len(snap2_files)} are inputed.") 45 | 46 | fnms = [os.path.basename(v) for v in snap2_files] 47 | 48 | logger.info(f"get full snap anndataset object to: {out_snap2_fnm}") 49 | sample2files = [(s, f) for s, f in zip(samples, snap2_files)] 50 | sds = sa2.AnnDataSet(adatas=sample2files, 51 | filename=out_snap2_fnm, add_key="sample") 52 | new_obs_names = modify_obs_name(sds, obs_key="sample") 53 | sds.obs_names = new_obs_names 54 | sds.close() 55 | logger.info("Done.") 56 | -------------------------------------------------------------------------------- /03.peakcalling/src/main/python/sa2pmat.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import logging 4 | from pathlib import Path 5 | from typing import Dict, List 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import snapatac2 as sa2 10 | 11 | import pyprojroot 12 | 13 | proj_dir = str(pyprojroot.here()) 14 | pack_dir = f"{proj_dir}/package/python" 15 | sys.path.insert(0, pack_dir) 16 | from mylog import StreamToLogger, set_file_logger 17 | from mysnapatac2 import modify_obs_name 18 | 19 | # * snakemake 20 | log_fnm: str =snakemake.log[0] 21 | bedfnm: str = snakemake.input["bedfnm"] 22 | snap2_dir: str = snakemake.params["snap2_dir"] 23 | suffix: str = snakemake.params["suffix"] 24 | out_dir: str = snakemake.params["out_dir"] 25 | out_suffix: str = snakemake.params["out_suffix"] 26 | sample: str = snakemake.wildcards["s"] 27 | 28 | # * debug 29 | # log_fnm = "test_pmat.log" 30 | # work_dir = os.path.join(proj_dir, "18.snap2_peakcalling") 31 | # bedfnm = os.path.join(proj_dir, "18.snap2_peakcalling", 32 | # "out/tscc", "mba.whole.union.peak.srt.bed") 33 | # with open(f"{proj_dir}/17.snapatac2/meta/mba.whole.sample.lst", 'r') as f: 34 | # samples = [l.strip() for l in f.readlines()] 35 | 36 | # snap2_dir = os.path.join(proj_dir, 37 | # "17.snapatac2", 38 | # "sa2_qc_dlt/rm_dlt") 39 | # out_dir = "." 
40 | # suffix = "_rm_dlt.h5ad" 41 | # out_suffix = "_union_pmat.h5ad" 42 | # sample = samples[0] 43 | 44 | # * log 45 | logger = set_file_logger(log_fnm, name="sa2.get_full_snap2") 46 | sys.stdout = StreamToLogger(logger=logger, level=logging.INFO) 47 | sys.stderr = StreamToLogger(logger=logger, level=logging.ERROR) 48 | 49 | # * main 50 | # ** load snap2 51 | logger.info(f"Loading snap2 for {sample}...") 52 | snap_file = os.path.join(snap2_dir, f"{sample}{suffix}") 53 | if not os.path.exists(snap_file): 54 | raise FileNotFoundError(f"{snap_file} not found.") 55 | snap2: sa2.AnnData = sa2.read(snap_file, backed = 'r') 56 | 57 | # ** read bed file 58 | logger.info("Reading bed file...") 59 | if not os.path.exists(bedfnm): 60 | raise FileNotFoundError(f"{bedfnm} not found.") 61 | 62 | 63 | # ** get pmat 64 | logger.info(f"Getting pmat for {sample}...") 65 | outfnm = os.path.join(out_dir, f"{sample}{out_suffix}") 66 | sa2.pp.make_peak_matrix(adata = snap2, 67 | inplace = False, 68 | file = outfnm, 69 | backend = "hdf5", 70 | peak_file = bedfnm, 71 | chunk_size = 10000, 72 | use_x = False) 73 | snap2.close() 74 | logger.info(f"Done.") 75 | 76 | 77 | -------------------------------------------------------------------------------- /03.peakcalling/src/main/resource/config.yaml: -------------------------------------------------------------------------------- 1 | system: imac 2 | project_dir: 3 | imac: /Users/szu/git-recipes/mouseBrainAtlas/CEMBA2 4 | encoder: /projects/ps-renlab/szu/projects/CEMBA2 5 | tscc: /projects/ps-renlab/szu/projects/CEMBA2 6 | subset_dir: subset 7 | callpeak_dir: callpeak 8 | cemba_all_dataset: src/main/resource/cemba.sa2.dlt2.anndataset.h5ad 9 | subset_barcode2group: src/main/resource/subset.barcode2group.csv 10 | 11 | -------------------------------------------------------------------------------- /03.peakcalling/src/main/resource/test_neuron_L4pc2sizes.csv: -------------------------------------------------------------------------------- 1 | 12-1-1-2,8982,4505,4477 2 | 12-1-1-3,6194,3110,3084 3 | -------------------------------------------------------------------------------- /03.peakcalling/src/main/shell/export_unionpeak.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | union_peakset=$1 3 | if [ ! -f ${union_peakset} ];then 4 | echo "${union_peakset} does not exist." 5 | exit 1 6 | fi 7 | 8 | sed -e "1d " ${union_peakset} | cut -f 1-3 > $2 9 | 10 | echo "done." 11 | -------------------------------------------------------------------------------- /03.peakcalling/src/main/shell/intersect_mergepeak.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # echo $1 4 | # echo $2 5 | # echo $3 6 | # echo done 7 | 8 | if [ ! -f $1 ]; then 9 | echo "$1 does not exist." 10 | exit 1 11 | fi 12 | 13 | if [ ! -f $2 ]; then 14 | echo "$2 does not exist." 15 | exit 1 16 | fi 17 | 18 | echo "intersect merge peak for $3 ." 19 | 20 | intersectBed -wa -a $1 -b <(sed -e "1d" $2) -nonamecheck \ 21 | | sort -k1,1 -k2,2n | uniq > $3 22 | 23 | echo "done." 
24 | -------------------------------------------------------------------------------- /04.nmf/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: allpeak_nmf novlpeak_nmf ppdcpeak_nmf 2 | # * run non-negative matrix factorization 3 | 4 | allpeak_nmf: src/main/pipeline/nmf.Snakefile src/main/resource/config.yaml 5 | -mkdir -p build/$@ 6 | cp $(word 2,$^) build/$@/config.yaml 7 | cp $< build/$@/Snakefile 8 | cd build/$@ && \ 9 | snakemake --config \ 10 | system=encoder \ 11 | tag=allpeak \ 12 | out=out/allpeak_nmf \ 13 | mod_from=150 mod_to=152 mod_by=2 n_rerun=3 \ 14 | subclass_order_meta=data/sa2.subclass.srt.txt \ 15 | peak_nm_file=data/sa2.final.peak.nms.txt \ 16 | cluster_nm_file=data/sa2.allpeak.subclass.nms.txt \ 17 | mat_pbyc_h5=data/sa2.allpeak.cbyp.mat.h5 \ 18 | -c 10 \ 19 | --snakefile Snakefile -R --rerun-incomplete \ 20 | --rerun-triggers mtime \ 21 | --skip-script-cleanup \ 22 | # --profile pbs-torque-conda 23 | 24 | 25 | novlpeak_nmf: src/main/pipeline/nmf.Snakefile src/main/resource/config.yaml 26 | -mkdir -p build/$@ 27 | cp $(word 2,$^) build/$@/config.yaml 28 | cp $< build/$@/Snakefile 29 | cd build/$@ && \ 30 | snakemake --config \ 31 | system=encoder \ 32 | tag=novlpeak \ 33 | out=out/novlppeak_nmf \ 34 | mod_from=150 mod_to=152 mod_by=2 n_rerun=3 \ 35 | subclass_order_meta=data/sa2.subclass.srt.txt \ 36 | peak_nm_file=data/sa2.novlpDHS.peak.nms.txt \ 37 | cluster_nm_file=data/sa2.novlpDHS.subclass.nms.txt \ 38 | mat_pbyc_h5=data/sa2.novlpDHS.cbyp.mat.h5 \ 39 | -c 10 \ 40 | --snakefile Snakefile -R --rerun-incomplete \ 41 | --rerun-triggers mtime \ 42 | --skip-script-cleanup \ 43 | # --profile pbs-torque-conda 44 | 45 | 46 | 47 | ppdcpeak_nmf: src/main/pipeline/nmf.Snakefile src/main/resource/config.yaml 48 | -mkdir -p build/$@ 49 | cp $(word 2,$^) build/$@/config.yaml 50 | cp $< build/$@/Snakefile 51 | cd build/$@ && \ 52 | snakemake --config \ 53 | system=encoder \ 54 | tag=ppdcpeak \ 55 | out=out/ppdcpeak_nmf \ 56 | mod_from=54 mod_to=56 mod_by=2 n_rerun=3 \ 57 | subclass_order_meta=data/sa2.subclass.srt.txt \ 58 | peak_nm_file=data/sa2.ppdc.peak.nms.txt \ 59 | cluster_nm_file=data/sa2.ppdc.subclass.nms.txt \ 60 | mat_pbyc_h5=data/sa2.ppdc.cbyp.mat.h5 \ 61 | -c 10 \ 62 | --snakefile Snakefile -R --rerun-incomplete \ 63 | --rerun-triggers mtime \ 64 | --skip-script-cleanup \ 65 | # --profile pbs-torque-conda 66 | -------------------------------------------------------------------------------- /04.nmf/src/main/R/02.nmfATAC.plotH.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # read results from ksklearn 4 | 5 | suppressPackageStartupMessages(library("argparse")) 6 | 7 | # create parser object 8 | parser <- ArgumentParser() 9 | 10 | # specify our desired options 11 | # by default ArgumentParser will add an help option 12 | parser$add_argument("-i", "--input", required=TRUE, help="input matrix") 13 | parser$add_argument("-o", "--output", required=TRUE, help="output file prefix") 14 | # get command line options, if help option encountered print help and exit, 15 | # otherwise if options not found on command line then set defaults, 16 | args <- parser$parse_args() 17 | 18 | 19 | library(data.table) 20 | dataH <- fread(args$input,sep="\t") 21 | 22 | library(pheatmap) 23 | library(RColorBrewer) 24 | library(viridis) 25 | library(dendsort) 26 | 27 | # scale by column 28 | #mx <- apply(dataH,2,scale) 29 | 30 | normUnity <- function(x){ 31 | sum <- sum(x) 32 
| out <- x / sum(x) 33 | } 34 | 35 | mx <- apply(dataH,2,normUnity) 36 | 37 | sort_hclust <- function(...) as.hclust(dendsort(as.dendrogram(...))) 38 | #mat_cluster_rows_H <- sort_hclust(hclust(dist(dataH))) 39 | mat_cluster_cols_H <- sort_hclust(hclust(dist(t(mx)))) 40 | 41 | quantile_breaks <- function(xs, n = 30) { 42 | breaks <- quantile(xs, probs = seq(0, 1, length.out = n)) 43 | breaks[!duplicated(breaks)] 44 | } 45 | 46 | # mat_breaks_H <- quantile_breaks(t(mx), n = 30) 47 | 48 | pdf(paste(args$output,".H.pdf",sep='')) 49 | pheatmap( 50 | mat = mx, 51 | scale = 'none', 52 | color = viridis(30), 53 | # color = viridis(length(mat_breaks_H) - 1), 54 | # breaks = mat_breaks_H, 55 | border_color = NA, 56 | cluster_cols = mat_cluster_cols_H, 57 | cluster_rows = F, 58 | # cluster_rows = mat_cluster_rows_H, 59 | show_colnames = TRUE, 60 | show_rownames = FALSE, 61 | drop_levels = TRUE, 62 | fontsize = 14, 63 | main = "decomp H" 64 | ) 65 | dev.off() 66 | 67 | 68 | nor01 <- function(x){ 69 | min <- min(x) 70 | max <- max(x) 71 | out <- (x - min) / (max - min) 72 | } 73 | 74 | -------------------------------------------------------------------------------- /04.nmf/src/main/R/02.nmfATAC.plotW.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # read results from ksklearn 4 | 5 | suppressPackageStartupMessages(library("argparse")) 6 | 7 | # create parser object 8 | parser <- ArgumentParser() 9 | 10 | # specify our desired options 11 | # by default ArgumentParser will add an help option 12 | parser$add_argument("-i", "--input", required=TRUE, help="input matrix") 13 | parser$add_argument("-o", "--output", required=TRUE, help="output file prefix") 14 | # get command line options, if help option encountered print help and exit, 15 | # otherwise if options not found on command line then set defaults, 16 | args <- parser$parse_args() 17 | 18 | 19 | library(data.table) 20 | dataW <- fread(args$input,sep="\t") 21 | 22 | library(pheatmap) 23 | library(RColorBrewer) 24 | library(viridis) 25 | library(dendsort) 26 | 27 | # scale by column 28 | #tmp <- apply(dataW,2,scale) 29 | 30 | normUnity <- function(x){ 31 | sum <- sum(x) 32 | out <- x / sum(x) 33 | } 34 | 35 | tmp <- apply(dataW,1,normUnity) 36 | tmp <- t(tmp) 37 | mx <- tmp[sample(nrow(tmp), 5000), ] 38 | 39 | sort_hclust <- function(...) 
-------------------------------------------------------------------------------- /04.nmf/src/main/R/02.nmfATAC.statBox.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # read results from sklearn NMF 4 | 5 | suppressPackageStartupMessages(library("argparse")) 6 | 7 | # create parser object 8 | parser <- ArgumentParser() 9 | 10 | # specify our desired options 11 | # by default ArgumentParser will add a help option 12 | parser$add_argument("-i", "--input", required=TRUE, help="input statH") 13 | parser$add_argument("-o", "--output", required=TRUE, help="output file prefix") 14 | # get command line options; if the help option is encountered, print help and exit; 15 | # otherwise, options missing from the command line fall back to their defaults 16 | args <- parser$parse_args() 17 | 18 | data <- read.table(args$input,sep="\t",head=F) 19 | 20 | staoutmx <- data.frame(row.names=c("Min","Q1","Median","Mean","Q3","Max","TopWhisker","BottomWhisker","Box1","Box2","Box3","UpWhisker","DnWhisker")) 21 | for (i in c(5,6,7)){ 22 | x <- data[,i] 23 | boxMx <- matrix(summary(x)) 24 | rownames(boxMx) <- c("Min","Q1","Median","Mean","Q3","Max") 25 | iqr <- IQR(x) 26 | q1 <- summary(x)[2] 27 | q3 <- summary(x)[5] 28 | TopWhisker <- min(max(x), q3 + 1.5 * iqr) 29 | BottomWhisker <- max(min(x), q1 - 1.5 * iqr) 30 | Box1 <- boxMx["Q1",] 31 | Box2 <- boxMx["Median",] - boxMx["Q1",] 32 | Box3 <- boxMx["Q3",] - boxMx["Median",] 33 | UpWhisker <- TopWhisker - boxMx["Q3",] 34 | DnWhisker <- boxMx["Q1",] - BottomWhisker 35 | boxMx <- rbind(boxMx,TopWhisker,BottomWhisker,Box1,Box2,Box3,UpWhisker,DnWhisker) 36 | colnames(boxMx) <- i 37 | staoutmx <- cbind(staoutmx,boxMx) 38 | } 39 | colnames(staoutmx) <- c("contributes","sparseness","entropy") 40 | 41 | cat(median(data$V6),"\n") 42 | 43 | k <- max(data$V3) 44 | n <- nrow(data) 45 | normInfoGain <- 1 - sum(data$V7) / (n * log2(k)) 46 | cat(normInfoGain) 47 | 48 | write.table(staoutmx, file=paste(args$output,".box.sta",sep=''), sep="\t", quote=F, col.names=T, row.names=T) 49 | 50 | 
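51 | # Derivation note: column V7 holds each row's entropy over its k module 52 | # loadings (0 when a row loads on a single module, log2(k) when spread evenly 53 | # over all k modules), so 54 | # normInfoGain <- 1 - sum(data$V7) / (n * log2(k)) 55 | # equals 1 for a perfectly modular factorization and 0 for an uninformative one.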
-------------------------------------------------------------------------------- /04.nmf/src/main/R/05.splitPeakByModule.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(optparse) 3 | packdir <- file.path(here::here(), "package/R") 4 | import::from(.from = "utils.R", .directory = packdir, 5 | checkArgsExistOrStop, prepareOutdir, checkFileExistOrStop) 6 | import::from(.from = "peak.R", .directory = packdir, 7 | loadStatPeak.NMF) 8 | import::from(.from = "cembav2env.R", .directory = packdir, 9 | cembav2env) 10 | 11 | op <- list( 12 | make_option(c("--nmfDir"), type = "character", 13 | default = "nmf_ppdc/out"), 14 | make_option(c("--module"), type = "integer", 15 | default = 54), 16 | make_option(c("--tag"), type = "character", default = "ppdc") 17 | ) 18 | 19 | args <- parse_args(OptionParser(option_list = op)) 20 | checkArgsExistOrStop(args) 21 | 22 | if(!dir.exists(args$nmfDir)) { 23 | stop(args$nmfDir, " does not exist.") 24 | } 25 | 26 | mod.nmf <- args$module 27 | tag <- args$tag 28 | 29 | outDir <- file.path(args$nmfDir, 30 | paste("nmf", tag, paste0("r", mod.nmf), "motif", sep = ".")) 31 | prepareOutdir(outDir) 32 | 33 | # * functions 34 | convertPeakToBed <- function(peakBed, peaknms, outFile = NULL) { 35 | r <- peakBed[peaknms, ] 36 | if(!is.null(outFile)) { 37 | write.table(x = r, file = outFile, quote = FALSE, sep = "\t", 38 | row.names = FALSE, col.names = FALSE) 39 | } 40 | return(r) 41 | } 42 | 43 | # * load peaks 44 | peakBed <- data.table::fread(cembav2env$peakBedFile, 45 | header = FALSE, sep = "\t", data.table = FALSE) 46 | colnames(peakBed) <- c("chrom", "start", "end", "name") 47 | rownames(peakBed) <- peakBed$name 48 | 49 | # * nmf modules 50 | nmfPeakStat <- loadStatPeak.NMF( 51 | file = file.path(args$nmfDir, 52 | paste("nmfPmat", tag, 53 | paste0("r", mod.nmf), "n0", "statW", sep = "."))) 54 | 55 | modules <- unique(nmfPeakStat$class0 + 1) 56 | 57 | # * save peaks from each module to a separate bed file 58 | invisible(lapply(modules, function(i) { 59 | outFile <- file.path(outDir, 60 | paste0("r", mod.nmf, "_n", i, ".cCREs.bed")) 61 | message("Writing peak bed file to: ", outFile) 62 | peaks <- with(nmfPeakStat, peak[class0 == (i-1)]) 63 | convertPeakToBed(peakBed = peakBed, peaknms = peaks, outFile = outFile) 64 | })) 65 | 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /04.nmf/src/main/resource/config.yaml: -------------------------------------------------------------------------------- 1 | system: imac 2 | python: 3 | imac: /Users/szu/mambaforge/bin/python 4 | tscc: /projects/ps-renlab/szu/miniconda3/envs/snATAC2/bin/python 5 | encoder: /projects/ps-renlab2/szu/miniconda3/envs/cicero/bin/python 6 | Rscript: 7 | imac: /usr/local/bin/Rscript 8 | tscc: /projects/ps-renlab/szu/miniconda3/envs/snATAC2/bin/Rscript 9 | encoder: /projects/ps-renlab2/szu/miniconda3/envs/cicero/bin/Rscript 10 | code_dir: 11 | imac: /Users/szu/git-recipes/mouseBrainAtlas/cembaV2 12 | tscc: /projects/ps-renlab2/szu/projects/CEMBA2 13 | encoder: /projects/ps-renlab2/szu/projects/CEMBA2 14 | work_dir: 15 | imac: /Users/szu/git-recipes/mouseBrainAtlas/cembaV2 16 | tscc: /oasis/tscc/scratch/szu/projects/CEMBA2 17 | encoder: /projects/ps-renlab2/szu/projects/CEMBA2 18 | homer: 19 | imac: /Users/szu/mambaforge/envs/bio/bin/findMotifsGenome.pl 20 | tscc: /projects/ps-renlab2/szu/miniconda3/envs/cicero/bin/findMotifsGenome.pl 21 | encoder: /projects/ps-renlab2/szu/miniconda3/envs/cicero/bin/findMotifsGenome.pl 22 | subclass_order_meta: meta/subclass.order.hc.csv 23 | peak_nm_file: data/peaks.txt 24 | cluster_nm_file: data/clusters.txt 25 | mat_pbyc_h5: data/cpm.cbyp.ppdc.h5 26 | tag: ppdc 27 | out: nmf_ppdc 28 | n_rerun: 3 29 | mod_from: 78 30 | mod_to: 80 31 | mod_by: 2 32 | use_detailed_mod: 1 33 | detailed_mod: 78a80 34 | mod_split: a 35 | module: 54 36 | 
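37 | # NOTE (assumption, inferred from the keys above rather than documented): when 38 | # use_detailed_mod is 1, detailed_mod lists explicit module numbers joined by 39 | # mod_split, e.g. "78a80" with mod_split "a" means modules 78 and 80, replacing 40 | # the mod_from/mod_to/mod_by arithmetic scan.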
-------------------------------------------------------------------------------- /05.cCREgene/sa2.cicero/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: run_cicero pdc_cicero 2 | 3 | snakemake := /home/szu/miniforge3/bin/snakemake 4 | # snakemake := /home/szu/mambaforge/envs/seurat/bin/snakemake 5 | run_cicero: src/main/pipeline/runCicero.Snakefile 6 | ${snakemake} --snakefile $< -R -c 9 \ 7 | --config \ 8 | threads=8 \ 9 | debug=0 \ 10 | queue=hotel \ 11 | walltime=16 \ 12 | --rerun-triggers mtime \ 13 | --skip-script-cleanup \ 14 | --profile pbs-torque-conda 15 | 16 | pdc_cicero: src/main/pipeline/pdc.Snakefile src/main/resource/config.yaml 17 | -mkdir -p build/$@ 18 | cp $(word 2,$^) build/$@/config.yaml 19 | cp $< build/$@/Snakefile 20 | cd build/$@ && \ 21 | ${snakemake} -c 20 --config \ 22 | threads=1 \ 23 | debug=0 \ 24 | queue=glean \ 25 | walltime=2 \ 26 | --snakefile Snakefile -R --rerun-incomplete \ 27 | --rerun-triggers mtime \ 28 | --skip-script-cleanup \ 29 | # --profile pbs-torque-conda 30 | 31 | getppdc_bedpe: src/main/shell/09.get.pos.neg.pdc.info.sh 32 | bash $< 33 | -------------------------------------------------------------------------------- /05.cCREgene/sa2.cicero/src/main/R/cicero_mouse_atlas.R: -------------------------------------------------------------------------------- 1 | # This script is prepared by Sai. 2 | library(Matrix) 3 | library(monocle3) 4 | # should be cicero for monocle3 5 | # devtools::install_github( 6 | # "cole-trapnell-lab/cicero-release", ref = "monocle3") 7 | library(cicero) 8 | 9 | projdir <- here::here() 10 | rscdir <- file.path(projdir, "src/main/resource") 11 | 12 | args <- commandArgs(trailingOnly = TRUE) 13 | CellType <- args[1] 14 | print(CellType) 15 | path <- "/oasis/tscc/scratch/smamde/mouse_atlas/mtx_files/" 16 | # Read in matrix data using the Matrix package 17 | indata <- Matrix::readMM(paste0(path,CellType,".mtx")) 18 | # Binarize the matrix 19 | indata@x[indata@x > 0] <- 1 20 | indata <- t(indata) 21 | # Format cell info 22 | cellinfo <- read.table(paste0(path,CellType,"_Barcodes.tsv")) 23 | row.names(cellinfo) <- cellinfo$X0 24 | names(cellinfo) <- "cells" 25 | 26 | # Format peak info 27 | peakinfo <- read.table("/oasis/tscc/scratch/smamde/mouse_atlas/peaks.tsv") 28 | names(peakinfo) <- c("chr", "bp1", "bp2") 29 | peakinfo$site_name <- paste(peakinfo$chr, peakinfo$bp1, peakinfo$bp2, sep="_") 30 | row.names(peakinfo) <- peakinfo$site_name 31 | 32 | row.names(indata) <- row.names(peakinfo) 33 | colnames(indata) <- row.names(cellinfo) 34 | 35 | # Make CDS 36 | input_cds <- suppressWarnings(new_cell_data_set(indata, 37 | cell_metadata = cellinfo, 38 | gene_metadata = peakinfo)) 39 | 40 | input_cds <- monocle3::detect_genes(input_cds) 41 | 42 | #Ensure there are no peaks included with zero reads 43 | input_cds <- input_cds[Matrix::rowSums(exprs(input_cds)) != 0,] 44 | 45 | input_cds <- detect_genes(input_cds) 46 | input_cds <- estimate_size_factors(input_cds) 47 | input_cds <- preprocess_cds(input_cds, method = "LSI") 48 | 49 | input_cds <- reduce_dimension(input_cds, reduction_method = 'UMAP', 50 | preprocess_method = "LSI") 51 | umap_coords <- reducedDims(input_cds)$UMAP 52 | 53 | 54 | cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = umap_coords) 55 | 56 | chromosome_length <- read.table("/oasis/tscc/scratch/smamde/mouse_atlas/mm10_chromosome_length.txt") 57 | 58 | conns <- run_cicero(cicero_cds, chromosome_length) 59 | 60 | 61 | 
saveRDS(conns,paste0(path,CellType,"_cicero_connections.Rds")) 62 | 63 | all_peaks <- row.names(exprs(input_cds)) 64 | write.csv(x = all_peaks, file = paste0(path,CellType,"_all_peaks.csv")) 65 | write.csv(x = conns, file = paste0(path,CellType,"_cicero_connections.csv")) 66 | 67 | -------------------------------------------------------------------------------- /05.cCREgene/sa2.cicero/src/main/R/run_cicero.R: -------------------------------------------------------------------------------- 1 | 2 | library(Matrix) 3 | library(monocle3) 4 | library(cicero) 5 | 6 | 7 | #CellType = "ABC_NN" 8 | args <- commandArgs(trailingOnly = TRUE) 9 | CellType <- args[1] 10 | print(CellType) 11 | path = "/oasis/tscc/scratch/smamde/mouse_atlas/mtx_files/" 12 | # Read in matrix data using the Matrix package 13 | indata <- Matrix::readMM(paste0(path,CellType,".mtx")) 14 | # Binarize the matrix 15 | indata@x[indata@x > 0] <- 1 16 | indata <- t(indata) 17 | # Format cell info 18 | cellinfo <- read.table(paste0(path,CellType,"_Barcodes.tsv")) 19 | row.names(cellinfo) <- cellinfo$X0 20 | names(cellinfo) <- "cells" 21 | 22 | # Format peak info 23 | peakinfo <- read.table("/oasis/tscc/scratch/smamde/mouse_atlas/peaks.tsv") 24 | names(peakinfo) <- c("chr", "bp1", "bp2") 25 | peakinfo$site_name <- paste(peakinfo$chr, peakinfo$bp1, peakinfo$bp2, sep="_") 26 | row.names(peakinfo) <- peakinfo$site_name 27 | 28 | row.names(indata) <- row.names(peakinfo) 29 | colnames(indata) <- row.names(cellinfo) 30 | 31 | # Make CDS 32 | input_cds <- suppressWarnings(new_cell_data_set(indata, 33 | cell_metadata = cellinfo, 34 | gene_metadata = peakinfo)) 35 | 36 | input_cds <- monocle3::detect_genes(input_cds) 37 | 38 | #Ensure there are no peaks included with zero reads 39 | input_cds <- input_cds[Matrix::rowSums(exprs(input_cds)) != 0,] 40 | 41 | input_cds <- detect_genes(input_cds) 42 | input_cds <- estimate_size_factors(input_cds) 43 | input_cds <- preprocess_cds(input_cds, method = "LSI") 44 | 45 | input_cds <- reduce_dimension(input_cds, reduction_method = 'UMAP', 46 | preprocess_method = "LSI") 47 | umap_coords <- reducedDims(input_cds)$UMAP 48 | 49 | 50 | cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = umap_coords) 51 | 52 | chromosome_length <- read.table("/oasis/tscc/scratch/smamde/mouse_atlas/mm10_chromosome_length.txt") 53 | 54 | conns <- run_cicero(cicero_cds, chromosome_length) 55 | 56 | 57 | saveRDS(conns,paste0(path,CellType,"_cicero_connections.Rds")) 58 | 59 | all_peaks <- row.names(exprs(input_cds)) 60 | write.csv(x = all_peaks, file = paste0(path,CellType,"_all_peaks.csv")) 61 | write.csv(x = conns, file = paste0(path,CellType,"_cicero_connections.csv")) 62 | 63 | -------------------------------------------------------------------------------- /05.cCREgene/sa2.cicero/src/main/R/run_cicero_shuffle.R: -------------------------------------------------------------------------------- 1 | library(Matrix) 2 | library(monocle3) 3 | library(cicero) 4 | 5 | #CellType = "ABC_NN" 6 | args <- commandArgs(trailingOnly = TRUE) 7 | CellType <- args[1] 8 | print(CellType) 9 | path = "/oasis/tscc/scratch/smamde/mouse_atlas/mtx_files/" 10 | path_save = "/oasis/tscc/scratch/smamde/mouse_atlas/shuffle_cicero/cicero_shuffle_results/" 11 | # Read in matrix data using the Matrix package 12 | indata <- Matrix::readMM(paste0(path,CellType,".mtx") ) 13 | 14 | # Format cell info 15 | cellinfo <- read.table(paste0(path,CellType,"_Barcodes.tsv")) 16 | row.names(cellinfo) <- cellinfo$X0 17 | names(cellinfo) <- "cells" 18 | 19 | # 
Format peak info 20 | peakinfo <- read.table("/oasis/tscc/scratch/smamde/mouse_atlas/peaks.tsv") 21 | names(peakinfo) <- c("chr", "bp1", "bp2") 22 | peakinfo$site_name <- paste(peakinfo$chr, peakinfo$bp1, peakinfo$bp2, sep="_") 23 | row.names(peakinfo) <- peakinfo$site_name 24 | 25 | row.names(indata) <- row.names(cellinfo) 26 | colnames(indata) <- row.names(peakinfo) 27 | 28 | mat<-indata 29 | r <- mat[ , sample(ncol(mat))] 30 | colnames(r) <- colnames(mat) 31 | rownames(r) <- rownames(mat) 32 | 33 | indata<-r 34 | indata@x[indata@x > 0] <- 1 35 | indata <- t(indata) 36 | 37 | # Make CDS 38 | input_cds <- suppressWarnings(new_cell_data_set(indata)) 39 | 40 | input_cds <- monocle3::detect_genes(input_cds) 41 | 42 | #Ensure there are no peaks included with zero reads 43 | input_cds <- input_cds[Matrix::rowSums(exprs(input_cds)) != 0,] 44 | 45 | input_cds <- detect_genes(input_cds) 46 | input_cds <- estimate_size_factors(input_cds) 47 | input_cds <- preprocess_cds(input_cds, method = "LSI") 48 | 49 | input_cds <- reduce_dimension(input_cds, reduction_method = 'UMAP', 50 | preprocess_method = "LSI") 51 | umap_coords <- reducedDims(input_cds)$UMAP 52 | 53 | 54 | cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = umap_coords) 55 | 56 | chromosome_length <- read.table("/oasis/tscc/scratch/smamde/mouse_atlas/mm10_chromosome_length.txt") 57 | 58 | conns <- run_cicero(cicero_cds, chromosome_length) 59 | 60 | 61 | saveRDS(conns,paste0(path_save,CellType,"_cicero_shuffle_connections.Rds")) 62 | 63 | all_peaks <- row.names(exprs(input_cds)) 64 | write.csv(x = all_peaks, file = paste0(path_save,CellType,"_all_shuffle_peaks.csv")) 65 | write.csv(x = conns, file = paste0(path_save,CellType,"_cicero_shuffle_connections.csv")) 66 | 67 | -------------------------------------------------------------------------------- /05.cCREgene/sa2.cicero/src/main/R/sa2.pdc.of.globalpeaks.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | projdir <- here::here() 3 | rpack <- file.path(projdir, "package/R") 4 | import::from(.from = "cembav2env.R", .directory = rpack, 5 | Sa2PeakCalling, cembav2env) 6 | globalPeaks <- Sa2PeakCalling$readBed4File( 7 | file.path(projdir, "20.nmf", "out", 8 | "peaks.from.global.modules.bed") 9 | ) 10 | 11 | pdc <- data.table::fread( 12 | file = cembav2env$sa2.all.pdc.info, header = TRUE, sep = "\t", 13 | data.table = FALSE) 14 | 15 | global.pdc <- pdc[pdc$cre2 %in% globalPeaks$name, ] |> 16 | group_by(cre1, cre2) |> 17 | slice_max(coaccess, n = 1) 18 | 19 | global.pdc.srt <- global.pdc |> arrange(desc(coaccess)) 20 | 21 | proximal <- vapply(global.pdc.srt$cre1, 22 | str_split_1, FUN.VALUE = rep("1",3), pattern = ":|-") |> 23 | t() |> 24 | as.data.frame() 25 | 26 | distal <- vapply(global.pdc.srt$cre2, 27 | str_split_1, FUN.VALUE = rep("1",3), pattern = ":|-") |> 28 | t() |> 29 | as.data.frame() 30 | 31 | global.pdc.srt.bedpe <- cbind(proximal, distal) 32 | global.pdc.srt.bedpe$coaccess <- global.pdc.srt$coaccess 33 | global.pdc.srt.bedpe$gene <- global.pdc.srt$gene 34 | 35 | write.table(global.pdc.srt.bedpe, 36 | file = file.path(projdir, "04.cCREgene", "sa2.cicero", 37 | "out/sa2pdcsum", "mba.whole.sa2subclass.globalbynmf.pdc.bedpe"), 38 | quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE) 39 | -------------------------------------------------------------------------------- /05.cCREgene/sa2.cicero/src/main/pipeline/runCicero.Snakefile: 
-------------------------------------------------------------------------------- 1 | envvars: 2 | "PATH" 3 | 4 | import os 5 | 6 | with open("subclass-name.txt", 'r') as f: 7 | subclasses = [l.strip() for l in f.readlines()] 8 | 9 | rscript_bin = "/home/szu/mambaforge/bin/Rscript" 10 | work_dir = "/projects/ps-renlab2/szu/projects/CEMBA_wmb_snATAC/05.cCREgene/sa2.cicero" 11 | flag_dir = os.path.join(work_dir, "flag_dir") 12 | log_dir = os.path.join(work_dir, "log_dir") 13 | 14 | walltime = config["walltime"] 15 | queue = config["queue"] 16 | if queue == "glean": 17 | if walltime > 8: 18 | walltime = 8 19 | 20 | rule all: 21 | input: 22 | # expand("{f}/runcicero_{sc}.done", f = flag_dir, sc = subclasses) 23 | expand("{f}/runcicero_{sc}_shuffle.done", f = flag_dir, sc = subclasses) 24 | 25 | rule run_cicero: 26 | output: 27 | touch(f"{flag_dir}/runcicero_{{sc}}.done") 28 | log: 29 | f"{log_dir}/runcicero_{{sc}}.log" 30 | threads: config["threads"] 31 | resources: 32 | walltime = walltime, 33 | queue = queue 34 | shell: 35 | """ 36 | {rscript_bin} {work_dir}/src/main/R/run_cicero.R {wildcards.sc} > {log} 2>&1 37 | """ 38 | 39 | 40 | rule run_cicero_shuf: 41 | output: 42 | touch(f"{flag_dir}/runcicero_{{sc}}_shuffle.done") 43 | log: 44 | f"{log_dir}/runcicero_{{sc}}_shuffle.log" 45 | threads: config["threads"] 46 | resources: 47 | walltime = walltime, 48 | queue = queue 49 | shell: 50 | """ 51 | {rscript_bin} {work_dir}/src/main/R/run_cicero_shuffle.R {wildcards.sc} > {log} 2>&1 52 | """ 53 | 
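54 | # A minimal sketch of the expand() pattern used above: snakemake fills the 55 | # braces itself, so the template must stay a plain string (an f-string would 56 | # interpolate at parse time, and "f" is also the file handle name above). E.g. 57 | # expand("{f}/runcicero_{sc}.done", f="flags", sc=["Sst_Gaba", "Oligo_NN"]) 58 | # == ["flags/runcicero_Sst_Gaba.done", "flags/runcicero_Oligo_NN.done"] 59 | # (subclass names here are illustrative)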
-------------------------------------------------------------------------------- /05.cCREgene/sa2.cicero/src/main/resource/config.yaml: -------------------------------------------------------------------------------- 1 | n_ds: 1000 2 | n_core: 2 3 | mm10_file: meta/mm10.chrom.sizes.lite 4 | tssUpDn1k_file: meta/gencode.vM23.gene.tssUpDn1k.bed 5 | peakannot_file: 04.cCREgene/sa2.cicero/src/main/resource/mba.whole.sa2.peakOvlpTSS.proximal.distal.ciceroPeakCoord.bed 6 | k_cicero: 50 7 | reduct_cicero: "UMAP" 8 | preprocess_cicero: "LSI" 9 | debug: 0 10 | atac_subclass_cpm_file: 04.cCREgene/sa2.cicero/out/sa2pdcsum/sa2.cpm.pmat.pbysc.distal.rds 11 | rdm_pdc_file: 04.cCREgene/sa2.cicero/out/sa2pdcsum/sa2.rdm.g2p.pdc.rds 12 | allen_l2_cpm_file: 04.cCREgene/sa2.cicero/out/sa2pdcsum/sa2.allen.avg.logCPM.gbysc.rds 13 | pdc_file: 04.cCREgene/sa2.cicero/out/sa2pdcsum/sa2.g2p.pdc.pair.rds 14 | chunk_size: 50000 15 | 16 | -------------------------------------------------------------------------------- /05.cCREgene/sa2.cicero/src/main/shell/05.mergeDistalProximalConns.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | usage() { 4 | echo "Usage: $0 -c [cicero_dir] -m [meta_col]" 5 | exit 1 6 | } 7 | 8 | while getopts ":c:m:" o; do 9 | case "${o}" in 10 | c) 11 | cicero_dir=${OPTARG};; 12 | m) 13 | meta_col=${OPTARG};; 14 | *) 15 | usage ;; 16 | esac 17 | done 18 | 19 | shift $((OPTIND-1)) 20 | 21 | if [ -z "${cicero_dir}" ] || [ -z "${meta_col}" ]; then 22 | usage 23 | fi 24 | 25 | echo "Merge distal-proximal conns: ${meta_col}" 26 | out_all_conns_sta="${cicero_dir}/mba.whole.${meta_col}.merge.conns.sta.all" 27 | cat ${cicero_dir}/${meta_col}.*.conns.sta \ 28 | | sed -e "1i cluster\ttotConns\tddConns\tppConns\tpdConns\tgeneN\tcreN" \ 29 | > ${out_all_conns_sta} 30 | out_all_pdc_sta="${cicero_dir}/mba.whole.${meta_col}.merge.pdc.sta.all" 31 | cat ${cicero_dir}/${meta_col}.*.pdc.sta \ 32 | | sed -e "1i cluster\ttotc\tgeneN\tcreN" \ 33 | > ${out_all_pdc_sta} 34 | 35 | ## All pdc without any filtering 36 | out_all_pdc="${cicero_dir}/mba.whole.${meta_col}.merge.pdc.all" 37 | cat ${cicero_dir}/${meta_col}.*.fitConns.res.alignv1.pdc \ 38 | | grep -v 'anno1' \ 39 | | sed -e "1i peak1\tcre1\tanno1\tpeak2\tcre2\tanno2\tcoaccess\tpval\tfdr\tcluster\tgene" \ 40 | > ${out_all_pdc} 41 | 42 | # NOTE: since different subclasses may give different scores to the same 43 | # pdc, there are repeated pdcs in this file. 44 | out_all_bedpe="${cicero_dir}/mba.whole.${meta_col}.merge.bedpe.all" 45 | sed '1d' ${out_all_pdc} \ 46 | | awk 'BEGIN{FS=OFS="\t"}{split($1,a,"_"); split($4,b,"_"); print a[1],a[2],a[3],b[1],b[2],b[3],$11"|"$5,$7,".","."}'\ 47 | | sort -k1,1 -k2,2n | uniq > ${out_all_bedpe} 48 | 49 | out_all_pdcpair="${cicero_dir}/mba.whole.${meta_col}.merge.pdc.pair.all" 50 | awk 'BEGIN{FS=OFS="\t"}{print $11"|"$5,$2}' ${out_all_pdc} \ 51 | | sed '1d' | sort | uniq > ${out_all_pdcpair} 52 | 53 | echo "# of cCRE" 54 | cut -f 5 ${out_all_pdc} | sed '1d' | sort | uniq | wc -l 55 | echo "# of genes" 56 | cut -f 11 ${out_all_pdc} | sed '1d' | sort | uniq | wc -l 57 | echo "# of pairs" 58 | sed '1d' ${out_all_pdc} | sort | uniq | wc -l 59 | 60 | echo "Merge all proximal-distal pairs' distances" 61 | out_all_dist="${cicero_dir}/mba.whole.${meta_col}.merge.pdc.dist.all" 62 | cat ${cicero_dir}/${meta_col}.*.pdc.dist \ 63 | | sed -e "1i conns\tdistance\tcluster" > ${out_all_dist} 64 | 65 | echo "Merge all genes' stat per cluster" 66 | out_all_peak2gene="${cicero_dir}/mba.whole.${meta_col}.merge.pdc.peak2gene.all" 67 | cat ${cicero_dir}/${meta_col}.*.pdc.peak2gene \ 68 | | sed -e "1i gene\tcnt\tcluster" > ${out_all_peak2gene} 69 | 70 | echo "Merge all peaks' stat per cluster" 71 | out_all_gene2peak="${cicero_dir}/mba.whole.${meta_col}.merge.pdc.gene2peak.all" 72 | cat ${cicero_dir}/${meta_col}.*.pdc.gene2peak \ 73 | | sed -e "1i cCREs\tcnt\tcluster" > ${out_all_gene2peak} 74 | 
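75 | # Example invocation (illustrative paths): 76 | # bash 05.mergeDistalProximalConns.sh -c out/sa2pdcsum -m sa2subclass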
-------------------------------------------------------------------------------- /05.cCREgene/sa2.cicero/src/main/shell/09.get.pos.neg.pdc.info.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | proj_dir="/Users/szu/git-recipes/mouseBrainAtlas/CEMBA2" 4 | cicero_dir="${proj_dir}/04.cCREgene/sa2.cicero/out/sa2pdcsum" 5 | meta_col="sa2subclass" 6 | allpeak="${proj_dir}/supple.07.peakcalling.allinone/mba.whole.sa2.final.peak.srt.bed" 7 | pdc_bedpe=${cicero_dir}/mba.whole.${meta_col}.merge.bedpe.all 8 | pdc_pair=${cicero_dir}/mba.whole.${meta_col}.merge.pdc.pair.all 9 | 10 | # * function 11 | sum_pos_or_neg_pdc () { 12 | local class=$1 13 | local cor_method=$2 14 | local class_pdc=${cicero_dir}/mba.whole.${meta_col}.${cor_method}.${class}.pdc.alignv1.tsv 15 | 16 | # outfiles 17 | local class_bedpe=${cicero_dir}/mba.whole.${meta_col}.${cor_method}.${class}.pdc.bedpe 18 | local class_pair=${cicero_dir}/mba.whole.${meta_col}.${cor_method}.${class}.pdc.pair 19 | local class_CREs=${cicero_dir}/mba.whole.${meta_col}.${cor_method}.${class}.pdc.CREs 20 | local class_bed=${cicero_dir}/mba.whole.${meta_col}.${cor_method}.${class}.pdc.CREs.bed 21 | 22 | # * process 23 | echo "$class under correlation method: ${cor_method}" 24 | 25 | # get bedpe 26 | # $8 is the co-accessibility score 27 | # the results have repeats since co-accessibility is inferred at the subclass level and then 28 | # pooled together 29 | join -1 1 -2 7 <(cut -f 1 ${class_pdc} | sort) <(sort -k7,7 ${pdc_bedpe}) -t$'\t' \ 30 | | awk 'BEGIN{FS=OFS="\t"}{print $2,$3,$4,$5,$6,$7,$1,$8,$9,$10}' \ 31 | | sort -k1,1 -k2,2n | uniq > ${class_bedpe} 32 | # get pair 33 | join -1 1 -2 1 <(cut -f 1 ${class_pdc} | sort) <(sort -k1,1 ${pdc_pair}) -t$'\t' \ 34 | | sort -k1,1 -k2,2n | uniq > ${class_pair} 35 | # get CREs 36 | cut -f 1 ${class_pdc} | tr '|' '\t' | cut -f 2 | sort | uniq | sed '1d'> ${class_CREs} 37 | # get bed 38 | join -1 1 -2 4 ${class_CREs} <(sort -k4,4 ${allpeak}) -t$'\t' \ 39 | | awk 'BEGIN{FS=OFS="\t"}{print $2,$3,$4,$1}' \ 40 | | sort -k1,1 -k2,2n | uniq > ${class_bed} 41 | echo 'Done' 42 | } 43 | 44 | sum_pos_or_neg_pdc pos pearson 45 | sum_pos_or_neg_pdc neg pearson 46 | 47 | 48 | -------------------------------------------------------------------------------- /05.cCREgene/sa2.cicero/src/main/shell/alignv1.to.bedpe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | all_pdc_bedpe=$1 4 | pdc_alignv1=$2 5 | 6 | join -1 1 -2 7 <(cut -f 1 ${pdc_alignv1} | sort) <(sort -k7,7 ${all_pdc_bedpe}) -t$'\t' \ 7 | | awk 'BEGIN{FS=OFS="\t"}{print $2,$3,$4,$5,$6,$7,$1,$8,$9,$10}' \ 8 | | sort -u -k1,1 -k2,2n -k8,8nr 9 | 
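10 | # Usage sketch (illustrative file names; the sorted bedpe goes to stdout): 11 | # bash alignv1.to.bedpe.sh mba.whole.sa2subclass.merge.bedpe.all \ 12 | #   mba.whole.sa2subclass.pearson.pos.pdc.alignv1.tsv > pos.pdc.srt.bedpe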
-------------------------------------------------------------------------------- /05.cCREgene/sa2.cicero/src/main/shell/sa2.all.distal.peaks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sum_dir="/Users/szu/git-recipes/mouseBrainAtlas/CEMBA2/04.cCREgene/sa2.cicero/out/sa2pdcsum" 4 | all_pdc_file="${sum_dir}/mba.whole.sa2subclass.merge.pdc.all" 5 | distal_peak_nm="${sum_dir}/mba.whole.sa2subclass.all.distal.peaks.from.pdc.all.txt" 6 | distal_peak_bed="${sum_dir}/mba.whole.sa2subclass.all.distal.peaks.from.pdc.all.bed" 7 | 8 | awk 'BEGIN{FS=OFS="\t"} NR>1 {print $5}' ${all_pdc_file} \ 9 | | sort | uniq > ${distal_peak_nm} 10 | 11 | awk -F '[:-]' 'BEGIN{OFS="\t"}{print $1,$2,$3}' ${distal_peak_nm} \ 12 | > ${distal_peak_bed} 13 | 14 | 15 | -------------------------------------------------------------------------------- /05.cCREgene/sa2.cicero/src/main/shell/supple.02.annotPeakBasedOnTSS.sh: -------------------------------------------------------------------------------- 1 | projroot="/projects/ps-renlab2/szu/projects/CEMBA2" 2 | allpeakFile="${projroot}/supple.07.peakcalling.allinone/mba.whole.sa2.final.peak.srt.bed" 3 | tssBed="${projroot}/meta/gencode.vM23.gene.tssUpDn1k.bed" 4 | outdir="${projroot}/04.cCREgene/sa2.cicero/src/main/resource" 5 | allpeakOvlpTssBed1="${outdir}/mba.whole.sa2.peakOvlpTSS.bed" 6 | allpeakOvlpTssBed2="${outdir}/mba.whole.sa2.peakOvlpTSS.proximal.distal.bed" 7 | allpeakOvlpTssBed3="${outdir}/mba.whole.sa2.peakOvlpTSS.proximal.distal.ciceroPeakCoord.bed" 8 | 9 | 10 | ## annotate distal vs proximal peaks 11 | intersectBed -wao -f 0.5 -a ${allpeakFile} -b ${tssBed} > ${allpeakOvlpTssBed1} 12 | 13 | awk 'BEGIN{FS=OFS="\t"}{if($5 != ".") {print $1,$2,$3,$4,"proximal",$11} else {print $1,$2,$3,$4,"distal","nan"}}' \ 14 | ${allpeakOvlpTssBed1} \ 15 | | sort -k1,1 -k2,2n | uniq > ${allpeakOvlpTssBed2} 16 | 17 | awk 'BEGIN{FS=OFS="\t"}{print $1"_"$2"_"$3,$4,$5,$6}' ${allpeakOvlpTssBed2} \ 18 | | sort -k1,1 > ${allpeakOvlpTssBed3} 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /06.motifanalysis/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: ppdcpeak_motif 2 | snakemake := /home/szu/miniforge3/bin/snakemake 3 | # * run HOMER known-motif enrichment on the NMF peak modules 4 | ppdcpeak_motif: src/main/pipeline/motif.Snakefile 5 | -mkdir -p build/$@ 6 | cp $< build/$@/Snakefile 7 | cd build/$@ && \ 8 | ${snakemake} --config \ 9 | system=mediator \ 10 | tag=ppdcpeak \ 11 | out=ppdcpeak \ 12 | njob=4 \ 13 | module=54 \ 14 | -c 20 \ 15 | --snakefile Snakefile -R --rerun-incomplete \ 16 | --rerun-triggers mtime \ 17 | --skip-script-cleanup \ 18 | # --profile pbs-torque-conda 19 | 20 | clean_homer_tag: 21 | -rm out/ppdcpeak/flag/nmf.ppdcpeak.homer.*.n*.done 22 | 
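23 | # Example: clear stale HOMER flags, then rerun motif enrichment on the 54 24 | # ppdc NMF modules: 25 | # make clean_homer_tag && make ppdcpeak_motif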
-------------------------------------------------------------------------------- /06.motifanalysis/README.org: -------------------------------------------------------------------------------- 1 | * SCENICPLUS 2 | ** motif database: 3 | - */oasis/tscc/scratch/smamde/scenic_plus_mouse_database* 4 | ** NOTE: 5 | - the outputs from scenicplus may share the same file names, so we need to check 6 | the outputs to make sure no conflicting files are generated across different tasks. 7 | 8 | 9 | -------------------------------------------------------------------------------- /06.motifanalysis/src/main/R/05.splitPeakByModule.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(optparse) 3 | packdir <- file.path(here::here(), "package/R") 4 | import::from(.from = "utils.R", .directory = packdir, 5 | checkArgsExistOrStop, prepareOutdir, checkFileExistOrStop) 6 | import::from(.from = "peak.R", .directory = packdir, 7 | loadStatPeak.NMF) 8 | import::from(.from = "cembav2env.R", .directory = packdir, 9 | cembav2env, Sa2PeakCalling) 10 | 11 | op <- list( 12 | make_option(c("--nmfDir"), type = "character", 13 | default = "nmf_ppdc/out"), 14 | make_option(c("--module"), type = "integer", 15 | default = 54), 16 | make_option(c("--tag"), type = "character", default = "ppdc") 17 | ) 18 | 19 | args <- parse_args(OptionParser(option_list = op)) 20 | checkArgsExistOrStop(args) 21 | 22 | if(!dir.exists(args$nmfDir)) { 23 | stop(args$nmfDir, " does not exist.") 24 | } 25 | 26 | mod.nmf <- args$module 27 | tag <- args$tag 28 | 29 | outDir <- file.path(args$nmfDir, 30 | paste("nmf", tag, paste0("r", mod.nmf), "motif", sep = ".")) 31 | prepareOutdir(outDir) 32 | 33 | # * functions 34 | convertPeakToBed <- function(peakBed, peaknms, outFile = NULL) { 35 | r <- peakBed[peaknms, ] 36 | if(!is.null(outFile)) { 37 | write.table(x = r, file = outFile, quote = FALSE, sep = "\t", 38 | row.names = FALSE, col.names = FALSE) 39 | } 40 | return(r) 41 | } 42 | 43 | # * load peaks 44 | peakBed <- data.table::fread(Sa2PeakCalling$finalpeakBedFile, 45 | header = FALSE, sep = "\t", data.table = FALSE) 46 | colnames(peakBed) <- c("chrom", "start", "end", "name") 47 | rownames(peakBed) <- peakBed$name 48 | 49 | # * nmf modules 50 | nmfPeakStat <- loadStatPeak.NMF( 51 | file = file.path(args$nmfDir, 52 | paste("nmfPmat", tag, 53 | paste0("r", mod.nmf), "n0", "statW", sep = "."))) 54 | 55 | modules <- unique(nmfPeakStat$class0 + 1) 56 | 57 | # * save peaks from each module to a separate bed file 58 | invisible(lapply(modules, function(i) { 59 | outFile <- file.path(outDir, 60 | paste0("r", mod.nmf, "_n", i, ".cCREs.bed")) 61 | message("Writing peak bed file to: ", outFile) 62 | peaks <- with(nmfPeakStat, peak[class0 == (i-1)]) 63 | convertPeakToBed(peakBed = peakBed, peaknms = peaks, outFile = outFile) 64 | })) 65 | 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /06.motifanalysis/src/main/pipeline/motif.Snakefile: -------------------------------------------------------------------------------- 1 | envvars: 2 | "PATH" 3 | 4 | import os 5 | import pyprojroot 6 | import sys 7 | 8 | proj_dir = str(pyprojroot.here()) 9 | 10 | mod = config['module'] 11 | myrange = list(range(1, mod + 1)) 12 | 13 | Rscript_bin = "/home/szu/miniforge3/envs/r/bin/Rscript" 14 | homer_bin = "/home/szu/miniforge3/envs/r/bin/findMotifsGenome.pl" 15 | code_project_dir = "/projects/ps-renlab2/szu/projects/CEMBA2" 16 | motif_sub_dir = "24.motifanalysis" 17 | 18 | code_dir = os.path.join(code_project_dir, motif_sub_dir, "src/main") 19 | work_dir = os.path.join(proj_dir, "24.motifanalysis", "out") 20 | tag = config['tag'] 21 | nmf_dir = os.path.join(proj_dir, "20.nmf", "out", 22 | f"{tag}_nmf", "out") 23 | out = config['out'] 24 | out_dir = os.path.join(work_dir, out) 25 | log_dir = f"{work_dir}/{out}/log" 26 | flag_dir = f"{work_dir}/{out}/flag" 27 | motif_dir = f"{out_dir}/nmf.{tag}.r{mod}.motif" 28 | for d in [out_dir, log_dir, flag_dir, motif_dir]: 29 | os.makedirs(d, exist_ok = True) 30 | njob = config["njob"] 31 | nmf_bed_dir = os.path.join(nmf_dir, f"nmf.{tag}.r{mod}.motif") 32 | 33 | rule all: 34 | input: 35 | f"{flag_dir}/nmf.{tag}.splitPeakByModule.{mod}.done", 36 | expand("{d}/nmf.{t}.homer.{m}.n{i}.done", 37 | d = flag_dir, t = tag, m = mod, i = myrange) 38 | 39 | 40 | rule splitPeakByModule: 41 | output: 42 | touch(f"{flag_dir}/nmf.{tag}.splitPeakByModule.{mod}.done") 43 | log: 44 | f"{log_dir}/nmf.{tag}.splitPeakByModule.{mod}.log" 45 | shell: 46 | """ 47 | {Rscript_bin} {code_dir}/R/05.splitPeakByModule.R --nmfDir {nmf_dir} \ 48 | --module {mod} --tag {tag} 2> {log} 49 | """ 50 | rule findMotif: 51 | input: 52 | f"{flag_dir}/nmf.{tag}.splitPeakByModule.{mod}.done" 53 | output: 54 | touch(expand("{d}/nmf.{t}.homer.{m}.n{{i}}.done", d = flag_dir, t = tag, m = mod)) 55 | log: 56 | expand("{d}/nmf.{t}.homer.{m}.n{{i}}.log", d = log_dir, t = tag, m = mod) 57 | shell: 58 | """ 59 | {homer_bin} {nmf_bed_dir}/r{mod}_n{wildcards.i}.cCREs.bed \ 60 | mm10 {motif_dir}/homer_n{wildcards.i} \ 61 | -nomotif -size given -p {njob} 2> {log} 62 | """ 63 | 
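64 | # HOMER flag notes: "-size given" scores each cCRE over its actual coordinates 65 | # instead of a fixed window around the peak center, and "-nomotif" skips de novo 66 | # motif discovery so only known-motif enrichment is reported.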
-------------------------------------------------------------------------------- /06.motifanalysis/src/main/python/test.scienicplus.py: -------------------------------------------------------------------------------- 1 | import pycistarget 2 | import pyranges as pr 3 | import pandas as pd 4 | from scenicplus.wrappers.run_pycistarget import run_pycistarget 5 | import glob 6 | import os 7 | from pycistarget.utils import region_names_to_coordinates 8 | 9 | import pyprojroot 10 | 11 | proj_dir = str(pyprojroot.here()) 12 | 13 | nvolp_peak_fnm = os.path.join(proj_dir, "18.snap2_peakcalling", 14 | "out/scfilter", "cembav2.nonOvlpDHS.bed") 15 | 16 | peaks = pd.read_csv(nvolp_peak_fnm, sep = "\t", header = None) 17 | peaks = peaks.rename(columns = {0: "Chromosome", 1: "Start", 2: "End", 3: "Name"}) 18 | 19 | region_sets = {'DARs': 20 | {'novlp': pr.PyRanges(peaks[["Chromosome", "Start", "End"]])} 21 | } 22 | db_path = os.path.join(proj_dir, "24.scenicplus/data/scenicplus_database") 23 | rankings_db = os.path.join(db_path, "mm10_screen_v10_clust.regions_vs_motifs.rankings.feather") 24 | scores_db = os.path.join(db_path, "mm10_screen_v10_clust.regions_vs_motifs.scores.feather") 25 | motif_annotation = os.path.join(db_path, "motifs-v10nr_clust-nr.mgi-m0.001-o0.0.tbl") 26 | 27 | out_dir = os.path.join(proj_dir, "24.scenicplus/out/test_scenicplus") 28 | 29 | motif_res = run_pycistarget( 30 | region_sets = region_sets, 31 | species = "mus_musculus", 32 | save_path = out_dir, 33 | ctx_db_path = rankings_db, 34 | path_to_motif_annotations = motif_annotation, 35 | run_without_promoters = True, 36 | annotation_version = "v10nr_clust", 37 | n_cpu = 4, 38 | ignore_reinit_error = True, 39 | ) 40 | -------------------------------------------------------------------------------- /07.m3C/README.org: -------------------------------------------------------------------------------- 1 | * Proximal-distal pairs verification from m3C data. 2 | - Sst Gaba 3 | - Pvalb Gaba 4 | - CBX MLI Megf11 Gaba 5 | - Vip Gaba 6 | - CA1-ProS Glut 7 | - CB Granule Glut 8 | - L6 CT CTX Glut 9 | - L2/3 IT CTX Glut 10 | - Oligo NN 11 | - Astro-TE NN 12 | - Microglia NN 13 | - Bergmann NN 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /07.m3C/hic2/hic2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | while read s; 3 | do nohup bash /projects/ps-renlab2/y2xie/projects/44.colab/szu_mba/scripts/hic_APA_subtype.sh $s & 4 | done < subclass.txt -------------------------------------------------------------------------------- /08.GRN/src/main/python/02.mergeGimme.py: -------------------------------------------------------------------------------- 18 | def load_tfidf(f) -> pd.DataFrame: 19 | import pyarrow.parquet as pq 20 | r = pq.read_table(f) 21 | return r.to_pandas() 22 | 23 | # * load groups 24 | proj_dir = str(pyprojroot.here()) 25 | group_file = os.path.join(proj_dir, "meta", "sa2.subclass.srt.txt") 26 | with open(group_file) as f: 27 | lines = f.readlines() 28 | groups = [l.strip() for l in lines if len(l.strip()) > 1] 29 | groups = [g for g in groups if "Hypendymal_NN" not in g] 30 | 31 | with open("pdc.suffix.txt", mode = "r") as f: 32 | parts = [l.strip() for l in f.readlines() if len(l) > 1] 33 | 34 | # * check gimmemotifs 35 | tfscan_dir = os.path.join(proj_dir, "22.sa2GRN", "out/tfscan") 36 | prefix = "sa2subclass" 37 | 38 | # about 100G RAM 39 | tfidfs = [load_tfidf(f"{tfscan_dir}/{prefix}.{i}.pdc.baseGRN.df.parquet") 40 | for i in groups] 41 | # another 100G RAM 42 | r = pd.concat(tfidfs, axis = 0, ignore_index= True) 43 | r = r.fillna(int(0)) 44 | del tfidfs 45 | # 520M 46 | r.to_parquet(f"{tfscan_dir}/{prefix}.all.baseGRN.df.parquet") 47 | 48 | # stat 49 | tfCount = r.sum(axis = 0, numeric_only = True) 50 | peakCount = r.sum(axis = 1, numeric_only = True) 51 | 52 | 53 | 54 | 55 | 
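56 | # Shape note (descriptive): each per-subclass parquet holds a peak-by-TF 57 | # motif-hit table, so after the concat above tfCount gives, per TF, the number 58 | # of peaks carrying its motif (column sums), and peakCount gives, per peak, the 59 | # number of TF motifs scanned into it (row sums).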
-------------------------------------------------------------------------------- /08.GRN/src/main/python/05.plot.powerlaw.py: -------------------------------------------------------------------------------- 1 | from os.path import exists 2 | import pandas as pd 3 | 4 | # pin numpy at 1.23.0 because: 5 | # - numba (a celloracle dependency) needs numpy <= 1.24, but 6 | # - seaborn does not work with numpy == 1.24.0 7 | 8 | import numpy as np 9 | import os, sys, shutil, importlib, glob 10 | from tqdm.notebook import tqdm 11 | import celloracle as co 12 | from celloracle import motif_analysis as ma 13 | from genomepy import Genome 14 | import scanpy as sc 15 | import pyarrow.parquet as pq 16 | import pyprojroot 17 | import matplotlib.pyplot as plt 18 | 19 | proj_root = pyprojroot.here() 20 | print(proj_root) 21 | co_base_sc_dir = os.path.join(proj_root, "22.sa2GRN", 22 | "out", "GRN", 23 | "baseGRN_subclass") 24 | 25 | out_dir = os.path.join(proj_root, "22.sa2GRN", 26 | "out", "powerlaw") 27 | if not os.path.exists(out_dir): 28 | os.makedirs(out_dir, exist_ok = True) 29 | 30 | subclass = "Astro-TE_NN"  # do not reuse "sc": it is the scanpy alias above 31 | link_sc = co.utility.load_hdf5( 32 | os.path.join(co_base_sc_dir, f"GRN.{subclass}.celloracle.links")) 33 | 34 | print(link_sc)  # quick inspection of the Links object 35 | print(link_sc.cluster) 36 | 37 | # use default parameters for filtering 38 | # p=0.001, weight="coef_abs", threshold_number=10000 39 | link_sc.filter_links() 40 | link_sc.plot_degree_distributions(plot_model=True, 41 | save = os.path.join(out_dir, subclass)) 42 | -------------------------------------------------------------------------------- /08.GRN/src/main/resource/config.yaml: -------------------------------------------------------------------------------- 1 | system: mediator 2 | python_bin: 3 | imac: /Users/szu/mambaforge/envs/celloracle/bin/python 4 | tscc: /projects/ps-renlab2/szu/miniconda3/envs/celloracle/bin/python 5 | encoder: /projects/ps-renlab2/szu/miniconda3/envs/celloracle/bin/python 6 | mediator: /home/szu/miniforge3/envs/sa2/bin/python 7 | code_project_dir: 8 | imac: /Users/szu/git-recipes/mouseBrainAtlas/cembaV2 9 | tscc: /projects/ps-renlab2/szu/projects/CEMBA2 10 | encoder: /projects/ps-renlab2/szu/projects/CEMBA2 11 | mediator: /projects/ps-renlab2/szu/projects/CEMBA2 12 | work_project_dir: 13 | imac: /Users/szu/git-recipes/mouseBrainAtlas/cembaV2 14 | tscc: /oasis/tscc/scratch/szu/projects/CEMBA2 15 | encoder: /projects/ps-renlab2/szu/projects/CEMBA2 16 | mediator: /projects/ps-renlab2/szu/projects/CEMBA2 17 | local_dir: 22.sa2GRN 18 | pdc_suffix: pdc.suffix.txt 19 | tfscan_dir: out/tfscan 20 | threshold_gimme: 10 21 | subclass4GRN: allen.subclass.chosen.txt 22 | allenRNA_dir: src/main/resource/sa2.allen.logCPM.vf3281.ds1000.subclass.specific 23 | GRN_dir: out/GRN 24 | 25 | -------------------------------------------------------------------------------- /09.cCRE_conservation/01.reciLiftOver.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # * mm10 lift to hg38 4 | liftOver whole.mouse.brain.cCREs.bed mm10ToHg38.over.chain.gz -minMatch=0.5 peak.conserve0.5.bed peak.unMapped0.5 5 | 6 | # * hg38 lift back to mm10 7 | liftOver peak.conserve0.5.bed hg38ToMm10.over.chain.gz -minMatch=0.5 peak.conserve0.5.reciprocal0.5.bed peak.conserve0.5.unMapped0.5 8 | 9 | -------------------------------------------------------------------------------- /11.deeplearning/README.org: -------------------------------------------------------------------------------- 1 | * Install tensorflow 2 | From Kangli: 3 | this is how I installed and tested TensorFlow on mediator: 4 | #+BEGIN_SRC sh 5 | conda create -n tensorflow python=3.10 6 | conda activate tensorflow 7 | mamba install -c conda-forge cudatoolkit=11.8.0 8 | python3 -m pip install nvidia-cudnn-cu11==8.6.0.163 tensorflow==2.13.* 9 | mkdir -p $CONDA_PREFIX/etc/conda/activate.d 10 | echo 'CUDNN_PATH=$(dirname $(python -c "import nvidia.cudnn;print(nvidia.cudnn.__file__)"))' >> $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh 11 | echo 'export LD_LIBRARY_PATH=$CUDNN_PATH/lib:$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH' >> $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh 12 | source $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh 13 | # Verify install: 14 | python3 -c "import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))" 15 | # for no TensorRT 16 | pip install nvidia-pyindex 17 | pip install nvidia-tensorrt 18 | python3 -c "import tensorrt; import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))" 19 | #+END_SRC 20 | -------------------------------------------------------------------------------- /11.deeplearning/src/main/resource/mappedHMB.txt: -------------------------------------------------------------------------------- 1 | ACBGM 2 | ASCNT_1 3 | ASCNT_2 4 | ASCNT_3 5 | ASCT_1 6 | ASCT_2 7 | ASCT_3 8 | CBGRC 9 | CNGA_1 10 | CNGA_2 11 | COP 12 | CT_1 13 | CT_2 14 | D12NAC 15 | D1CaB 16 | D1Pu 17 | D2CaB 18 | D2Pu 19 | ET 20 | ICGA_2 21 | ITL23_1 22 | ITL23_2 23 | ITL23_3 24 | 
ITL23_5 25 | ITL45_1 26 | ITL45_2 27 | ITL5_1 28 | ITL5_2 29 | ITL5_3 30 | ITL5_4 31 | ITL6_1_1 32 | ITL6_1_2 33 | ITL6_2_1 34 | ITL6_2_2 35 | L6B_1 36 | L6B_2 37 | LAMP5 38 | MGC_1 39 | MGC_2 40 | MSN_1 41 | NP_1 42 | NP_3 43 | OGC_1 44 | OGC_2 45 | OGC_3 46 | OPC 47 | PIR 48 | PVALB_1 49 | PVALB_2 50 | PVALB_3 51 | PVALB_4 52 | PV_ChCs 53 | SNCG_1 54 | SNCG_3 55 | SNCG_4 56 | SNCG_5 57 | SST_1 58 | SST_2 59 | SST_3 60 | SST_4 61 | SST_5 62 | VIP_1 63 | VIP_2 64 | VIP_3 65 | VIP_4 66 | VIP_5 67 | VIP_6 68 | VIP_7 69 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Songpeng Zu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /manuscript_figures/sa2.sc2region.R: -------------------------------------------------------------------------------- 1 | source("paper.R") 2 | scnmMap <- cembav2env$load.sa2.subclass.names() 3 | rownames(scnmMap) <- scnmMap$subclass_id_label 4 | allenSubclass2Region <- Sa2Integration$getAllenSubclass2Region() 5 | rownames(allenSubclass2Region) <- allenSubclass2Region$subclass_id_label 6 | 7 | mrMeta <- list( 8 | Telencephalon = c("Isocortex", "HPF", "OLF", "AMY", "STR", "PAL"), 9 | Diencephalon = c("TH", "HY"), 10 | Midbrain = "MB", 11 | Hindbrain = c("Pons", "MY"), 12 | Cerebellum = "CB" 13 | ) 14 | mr2r <- data.frame( 15 | mr = c( 16 | c("Isocortex", "HPF", "OLF", "AMY", "STR", "PAL", "CNU"), 17 | c("TH", "HY"), 18 | c("MB", "MB-PONS"), 19 | c("Pons", "MY", "HB"), 20 | "CB"), 21 | r = c(rep("Telencephalon", 7), 22 | rep("Diencephalon", 2), 23 | rep("Midbrain", 2), 24 | rep("Hindbrain", 3), 25 | "Cerebellum") 26 | ) 27 | rownames(mr2r) <- mr2r$mr 28 | 29 | r2Color <- data.frame( 30 | r = c("Telencephalon", "Diencephalon", "Midbrain", 31 | "Hindbrain", "Cerebellum", "Non-Telencephalon"), 32 | color = c("#00688B", "#F15F30", "#74E44B", 33 | "#788FC8", "#DEB34C", "#999999") 34 | ) 35 | 36 | # use this to update mr2r 37 | allenmrs <- allenSubclass2Region[, "MajorRegionRateTop3"] |> 38 | lapply(X = _, Sa2Integration$extractAllenRegionWithScore) |> 39 | do.call(rbind, args = _) |> 40 | (\(x) x[!is.na(x[, 2]), ])() |> 41 | (\(x) x$region)() |> 42 | unique() 43 | 44 | # map subclass to region based on their top region information 45 | sc2r <- allenSubclass2Region[ , "MajorRegionRateTop3"] |> 46 | lapply(X = _, \(x) { 47 | r2score <- Sa2Integration$extractAllenRegionWithScore(x) |> 48 | mutate(r = mr2r[region, "r"]) |> 49 | group_by(r) |> 50 | summarise(r_s = sum(score)) |> 51 | slice_max(r_s, n = 1) 52 | }) |> do.call(rbind, args = _) |> 53 | as.data.frame() 54 | 55 | rownames(scnmMap) <- scnmMap$subclass_id_label 56 | rownames(sc2r) <- scnmMap[allenSubclass2Region$subclass_id_label, 57 | "subclass_label_peak"] 58 | 59 | write.table(sc2r, 60 | file = file.path(projdir, "meta", "sa2.subclass2region2score.csv"), 61 | sep = ",", 62 | col.names = TRUE, row.names = TRUE, quote = FALSE) 63 | 64 | -------------------------------------------------------------------------------- /meta/BICCN.BrainRegionMetadata.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondpie/CEMBA_wmb_snATAC/4605d86a83fe45a38e656ba87df85fe99762f358/meta/BICCN.BrainRegionMetadata.xlsx -------------------------------------------------------------------------------- /meta/allen.region.to.main.region.v2.txt: -------------------------------------------------------------------------------- 1 | allenRegionLabel@RegionFromJingtian@MajorRegionCEMBA@MajorRegionLabel@MajorRegionColor 2 | PONS@Pons@Pons@Pons@#78cbe6 3 | SS-GU-VISC@CTX@Isocortex@Isocortex@#176CDB 4 | MB@MB@MB@MB@#74e44b 5 | HY@HY@HY@HY@#d68790 6 | AUD-TEa-PERI-ECT@CTX@Isocortex@Isocortex@#176CDB 7 | MY@MY@MY@MY@#66458d 8 | RSP@CTX@Isocortex@Isocortex@#176CDB 9 | sAMY@CTXsp@AMY@AMY@#508eda 10 | AI@CTX@Isocortex@Isocortex@#176CDB 11 | ACA@CTX@Isocortex@Isocortex@#176CDB 12 | CB@CB@CB@CB@#deb34c 13 | HIP@HIP@HPF@HPF@#d62728 14 | VIS@CTX@Isocortex@Isocortex@#176CDB 15 | LSX@PAL@PAL@PAL@#a83795 16 | PL-ILA-ORB@CTX@Isocortex@Isocortex@#176CDB 17 | CTXsp@PIR@OLF@OLF@#ff7f0e 18 | OLF@OLF@OLF@OLF@#ff7f0e 19 | 
MO-FRP@CTX@Isocortex@Isocortex@#176CDB 20 | MOp@CTX@Isocortex@Isocortex@#176CDB 21 | TH@TH@TH@TH@#9f4020 22 | STRv@STR@STR@STR@#7B42CD 23 | STRd@STR@STR@STR@#7B42CD 24 | ENT@RHP@HPF@HPF@#d62728 25 | PAL@PAL@PAL@PAL@#a83795 26 | VIS-PTLp@CTX@Isocortex@Isocortex@#176CDB 27 | RHP@RHP@HPF@HPF@#d62728 28 | PAR-POST-PRE-SUB-ProS@RHP@HPF@HPF@#d62728 29 | CNU@NA@NA@CNU@#923DB1 30 | STR@STR@STR@STR@#7B42CD 31 | HY LZ@HY@HY@HY@#d68790 32 | AI-CLA@CTX@Isocortex@Isocortex@#176CDB 33 | VISp@CTX@Isocortex@Isocortex@#176CDB 34 | HB@NA@NA@HB@#6F88BA 35 | MB-PONS@NA@NA@MB-PONS@#76D899 36 | SSs-GU-VISC-AIp@CTX@Isocortex@Isocortex@#176CDB 37 | VISl@CTX@Isocortex@Isocortex@#176CDB 38 | MOs-FRP@CTX@Isocortex@Isocortex@#176CDB 39 | PTLp@CTX@Isocortex@Isocortex@#176CDB 40 | VISm@CTX@Isocortex@Isocortex@#176CDB 41 | SSp@CTX@Isocortex@Isocortex@#176CDB 42 | AUD@CTX@Isocortex@Isocortex@#176CDB 43 | VISpos@CTX@Isocortex@Isocortex@#176CDB 44 | VISa@CTX@Isocortex@Isocortex@#176CDB 45 | AId-AIv@CTX@Isocortex@Isocortex@#176CDB 46 | TEa-PERI-ECT@CTX@Isocortex@Isocortex@#176CDB 47 | AId-AIv-AIp@CTX@Isocortex@Isocortex@#176CDB 48 | 49 | -------------------------------------------------------------------------------- /meta/dissect2time.csv: -------------------------------------------------------------------------------- 1 | early,later 2 | 3C,171206,171207 3 | 4B,171213,180104 4 | 4D,171214,171219 5 | 3F,180105,180109 6 | 4E,180110,180111 7 | 1B,180119,180213 8 | 2A,180123,190207 9 | 3A,180129,180130 10 | 4A,180205,180206 11 | 1C,180208,180212 12 | 2E,180222,190207 13 | 1A,180226,180227 14 | 2B,180305,180306 15 | 3B,180308,180312 16 | 2D,180313,180319 17 | 3E,180320,180326 18 | 4F,180329,180402 19 | 2C,180409,180410 20 | 3D,180412,180416 21 | 4C,180417,180419 22 | 8B,180426,180430 23 | 5B,180514,180529 24 | 5D,180612,180618 25 | 4G,180619,180723 26 | 4H,180724,180730 27 | 5A,180731,180807 28 | 5E,180813,180820 29 | 5J,180904,180910 30 | 5C,181001,181002 31 | 5G,181008,181009 32 | 5H,181015,181016 33 | 6B,181022,181023 34 | 5F,181218,181220 35 | 6A,190108,190117 36 | 7B,190110,190115 37 | 6C,190122,190124 38 | 6D,190131,190205 39 | 9H,190212,190219 40 | 9J,190212,190219 41 | 11E,190214,190305 42 | 11F,190214,190305 43 | 11B,190314,190325 44 | 12B,190314,190325 45 | 9B,190326,190404 46 | 9D,190326,190404 47 | 10C,190411,190418 48 | 9A,190411,190418 49 | 10A,190423,190523 50 | 10G,190423,190523 51 | 11C,190530,190620 52 | 9C,190530,190620 53 | 10E,190625,190627 54 | 10F,190625,190627 55 | 7G,190702,190709 56 | 8E,190711,190716 57 | 8J,190711,190716 58 | 6E,190718,190723 59 | 8F,190718,190723 60 | 10B,190725,190730 61 | 13A,190725,190730 62 | 12A,191008,191017 63 | 7A,191008,191017 64 | 11D,191024,191031 65 | 12D,191024,191031 66 | 11A,191107,191114 67 | 8A,191107,191114 68 | 7C,191205,191212 69 | 8C,191205,191212 70 | 7J,200305,200520 71 | 9L,200305,200520 72 | 6H,200312,200319 73 | 8K,200312,200319 74 | 8D,20200707,200709 75 | 9E,20200707,200709 76 | 6G,200714,20200721 77 | 7D,200714,20200721 78 | 6F,200723,200728 79 | 7F,200723,200728 80 | 11H,200730,200813 81 | 12H,200730,200813 82 | 12E,200820,200827 83 | 7H,200820,200827 84 | 8H,200903,200910 85 | 9G,200903,200910 86 | 13D,200917,200924 87 | 14C,200917,200924 88 | 8L,201001,201008 89 | 9F,201001,201008 90 | 11K,201015,201022 91 | 12C,201015,201022 92 | 7E,201029,201105 93 | 8G,201029,201105 94 | 9Ka,201112,201119 95 | 9Kb,201112,201119 96 | 10D,201203,201210 97 | 13B,201203,201210 98 | 14B,201217,210107 99 | 18A,201217,210107 100 | 10H,210114,210121 101 | 
10J,210114,210121 102 | 11G,210128,210204 103 | 11J,210128,210204 104 | 12F,210211,210218 105 | 12G,210211,210218 106 | 12J,210225,210304 107 | 13C,210225,210304 108 | 13E,210311,210318 109 | 13F,210311,210318 110 | 14A,210325,210401 111 | 14D,210325,210401 112 | 15C,210408,210415 113 | 16C,210408,210415 114 | 17B,210422,210429 115 | 18B,210422,210429 116 | 15B,210805,210810 117 | 16A,210805,210812 118 | 17A,210810,210812 119 | -------------------------------------------------------------------------------- /meta/getGeneUp2K.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gtf="modified_gencode.vM23.primary_assembly.annotation.gtf" 4 | bedfile="mouse.modified.gencode.vM23.bed" 5 | geneUp2kfile="mouse.modified.gencode.vM23.gene.up2k.bed" 6 | 7 | awk 'BEGIN{FS=OFS="\t"}($3=="gene"){split($9,a,"\""); print $1,$4-1,$5,a[6],$6,$7}' ${gtf}\ 8 | | sort -k1,1 -k2,2n > ${bedfile} 9 | 10 | awk 'BEGIN{FS=OFS="\t"}{if($6=="+" && $2-2000>0){print $1,$2-2000,$3,$4,$5,$6}else if($6=="+" && $2-2000<0){print $1,0,$3,$4,$5,$6}else if($6=="-"){print $1,$2,$3+2000,$4,$5,$6}}' ${bedfile} > ${geneUp2kfile} 11 | -------------------------------------------------------------------------------- /meta/mm10-blacklist.v2.bed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondpie/CEMBA_wmb_snATAC/4605d86a83fe45a38e656ba87df85fe99762f358/meta/mm10-blacklist.v2.bed -------------------------------------------------------------------------------- /meta/mm10.blacklist.bed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondpie/CEMBA_wmb_snATAC/4605d86a83fe45a38e656ba87df85fe99762f358/meta/mm10.blacklist.bed -------------------------------------------------------------------------------- /meta/mm10.chrom.sizes: -------------------------------------------------------------------------------- 1 | chr1 195471971 2 | chr2 182113224 3 | chrX 171031299 4 | chr3 160039680 5 | chr4 156508116 6 | chr5 151834684 7 | chr6 149736546 8 | chr7 145441459 9 | chr10 130694993 10 | chr8 129401213 11 | chr14 124902244 12 | chr9 124595110 13 | chr11 122082543 14 | chr13 120421639 15 | chr12 120129022 16 | chr15 104043685 17 | chr16 98207768 18 | chr17 94987271 19 | chrY 91744698 20 | chr18 90702639 21 | chr19 61431566 22 | chr5_JH584299_random 953012 23 | chrX_GL456233_random 336933 24 | chrY_JH584301_random 259875 25 | chr1_GL456211_random 241735 26 | chr4_GL456350_random 227966 27 | chr4_JH584293_random 207968 28 | chr1_GL456221_random 206961 29 | chr5_JH584297_random 205776 30 | chr5_JH584296_random 199368 31 | chr5_GL456354_random 195993 32 | chr4_JH584294_random 191905 33 | chr5_JH584298_random 184189 34 | chrY_JH584300_random 182347 35 | chr7_GL456219_random 175968 36 | chr1_GL456210_random 169725 37 | chrY_JH584303_random 158099 38 | chrY_JH584302_random 155838 39 | chr1_GL456212_random 153618 40 | chrUn_JH584304 114452 41 | chrUn_GL456379 72385 42 | chr4_GL456216_random 66673 43 | chrUn_GL456393 55711 44 | chrUn_GL456366 47073 45 | chrUn_GL456367 42057 46 | chrUn_GL456239 40056 47 | chr1_GL456213_random 39340 48 | chrUn_GL456383 38659 49 | chrUn_GL456385 35240 50 | chrUn_GL456360 31704 51 | chrUn_GL456378 31602 52 | chrUn_GL456389 28772 53 | chrUn_GL456372 28664 54 | chrUn_GL456370 26764 55 | chrUn_GL456381 25871 56 | chrUn_GL456387 24685 57 | chrUn_GL456390 24668 58 | chrUn_GL456394 24323 59 | chrUn_GL456392 23629 60 | chrUn_GL456382 
23158 61 | chrUn_GL456359 22974 62 | chrUn_GL456396 21240 63 | chrUn_GL456368 20208 64 | chrM 16299 65 | chr4_JH584292_random 14945 66 | chr4_JH584295_random 1976 67 | -------------------------------------------------------------------------------- /meta/mm10.chrom.sizes.lite: -------------------------------------------------------------------------------- 1 | chr1 195471971 2 | chr2 182113224 3 | chrX 171031299 4 | chr3 160039680 5 | chr4 156508116 6 | chr5 151834684 7 | chr6 149736546 8 | chr7 145441459 9 | chr10 130694993 10 | chr8 129401213 11 | chr14 124902244 12 | chr9 124595110 13 | chr11 122082543 14 | chr13 120421639 15 | chr12 120129022 16 | chr15 104043685 17 | chr16 98207768 18 | chr17 94987271 19 | chrY 91744698 20 | chr18 90702639 21 | chr19 61431566 22 | -------------------------------------------------------------------------------- /meta/subclass_and_genemarker_CEMBAv1.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondpie/CEMBA_wmb_snATAC/4605d86a83fe45a38e656ba87df85fe99762f358/meta/subclass_and_genemarker_CEMBAv1.xlsx -------------------------------------------------------------------------------- /package/R/cicero.R: -------------------------------------------------------------------------------- 1 | #' @examples 2 | #' File format: 3 | #' pdc,cor 4 | #' 0610009B22Rik@peak217336,0.0602667683204699 5 | #' 0610009B22Rik@peak217343,-0.0929365338771179 6 | #' @export 7 | loadCor <- function(f, split = "@") { 8 | r <- data.table::fread(file = f, header = TRUE, sep = ",", 9 | data.table = FALSE) 10 | gene.peak <- vapply(r$pdc, function(i) { 11 | t <- strsplit(i, split = split, fixed = TRUE) 12 | t[[1]] 13 | }, c("a", "peak1")) 14 | r$gene <- gene.peak[1,] 15 | r$peak <- gene.peak[2,] 16 | return(r) 17 | } 18 | -------------------------------------------------------------------------------- /package/R/gglot.theme.R: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | 3 | theme.no.axis <- theme( 4 | axis.text = ggplot2::element_blank(), 5 | axis.title = ggplot2::element_blank(), 6 | axis.ticks = ggplot2::element_blank(), 7 | panel.border = ggplot2::element_blank(), 8 | panel.background = element_blank(), 9 | plot.margin = unit(c(0,0,0,0), "cm"), 10 | legend.title = element_text(size = 12), 11 | legend.text = element_text(size = 9) 12 | ) 13 | 14 | theme_my_minimal <- theme_minimal() + 15 | theme( 16 | panel.grid = element_blank(), 17 | panel.border = element_blank(), 18 | legend.position = "none", 19 | axis.title = element_blank(), 20 | axis.text = element_text(colour = "black"), 21 | ) 22 | 23 | #'@export 24 | setGlobalTheme <- function(newTheme){ 25 | oldTheme <- ggplot2::theme_get() 26 | ggplot2::theme_set(newTheme) 27 | return(oldTheme) 28 | } 29 | -------------------------------------------------------------------------------- /package/R/hdf5.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | splitByChunk <- function(nondecVec, chunkSize = 2012) { 3 | n <- ceiling(max(nondecVec) / chunkSize) 4 | r <- lapply(seq(n), function(i) { 5 | left <- chunkSize * (i-1) 6 | right <- chunkSize *i 7 | index <- (nondecVec > left) & (nondecVec <= right) 8 | if(sum(index) < 1) { 9 | return(NULL) 10 | } else { 11 | return(nondecVec[index]) 12 | } 13 | }) 14 | rr <- r[!sapply(r, is.null)] 15 | return(rr) 16 | } 17 | 18 | #' @export 19 | h5adToMat.Sparse <- function(filenm) { 20 | conn <- 
20 | conn <- hdf5r::H5File$new(filename = filenm, 21 | mode = "r") 22 | data <- conn[["X"]][["data"]][] 23 | indices <- conn[["X"]][["indices"]][] 24 | indptr <- conn[["X"]][["indptr"]][] 25 | obs <- conn[["obs"]][["_index"]][] 26 | var <- conn[["var"]][["_index"]][] 27 | mat <- Matrix::sparseMatrix(i = indices, 28 | p = indptr, x = data, index1 = FALSE, 29 | dims = c(length(obs), length(var))) 30 | rownames(mat) <- obs 31 | colnames(mat) <- var 32 | return(mat) 33 | } 34 | 35 | #' @export 36 | subset.snap.gmat.h5ad <- function(barcodes, h5adDir, ncore = 2, 37 | is.parallel = FALSE) { 38 | sampleFiles <- list.files(path = h5adDir, 39 | full.names = TRUE, no.. = TRUE) 40 | tmp <- h5adToMat.Sparse(filenm = sampleFiles[1]) 41 | genes <- colnames(tmp) 42 | rm(tmp) 43 | 44 | matList <- if (is.parallel) { 45 | parallel::mclapply( 46 | sampleFiles, function(f){ 47 | subset.snap.gmat.h5ad.single( 48 | barcodes = barcodes, h5adFile = f, colnms = genes) 49 | }, mc.cores = ncore) 50 | } else { 51 | lapply(sampleFiles, function(f) { 52 | subset.snap.gmat.h5ad.single( 53 | barcodes = barcodes, h5adFile = f, colnms = genes) 54 | }) 55 | } 56 | 57 | left <- !sapply(matList, is.null) 58 | if(sum(left) < 1) { 59 | stop("No barcodes found in any of the sample files.") 60 | } 61 | matList <- matList[left] 62 | mat <- do.call(rbind, matList) 63 | ## removed this: 64 | ## - we have colnames and rownames, which will be kept 65 | ## after rbind 66 | ## - there may be fewer barcodes 67 | ## rownames(mat) <- barcodes 68 | ## colnames(mat) <- genes 69 | if(length(barcodes) != nrow(mat)) { 70 | message(paste(nrow(mat), "of", length(barcodes), "barcodes found.")) 71 | } else { 72 | message("All barcodes were found.") 73 | } 74 | return(mat) 75 | } 76 | 77 | #' @export 78 | subset.snap.gmat.h5ad.single <- function(barcodes, h5adFile, colnms = NULL) { 79 | if (!file.exists(h5adFile)){ 80 | stop(h5adFile, " does not exist.") 81 | } 82 | message("working on: ", basename(h5adFile)) 83 | mat <- h5adToMat.Sparse(filenm = h5adFile) 84 | index <- barcodes %in% rownames(mat) 85 | if(sum(index) < 1) { 86 | message("No barcodes found.") 87 | return(NULL) 88 | } 89 | r <- mat[barcodes[index], , drop = FALSE] 90 | if (!is.null(colnms)) { 91 | colnames(r) <- colnms 92 | } 93 | return(r) 94 | } 95 | -------------------------------------------------------------------------------- /package/R/prob.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pvalOfBEZI <- function(x, mu, sigma, nu, lower = TRUE){ 3 | pval <- (1 - gamlss.dist::pBEZI( 4 | x, mu = mu, sigma = sigma, nu = nu, lower.tail = lower, log.p = FALSE)) 5 | return(pval) 6 | } 7 | -------------------------------------------------------------------------------- /package/python/bedpe2bigwig.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import os 3 | import pandas as pd 4 | from typing import Dict, List 5 | 6 | def generate_bedpe_worker(meta: str, 7 | sample: str, 8 | infnm: str, 9 | outdir:str,) -> bool: 10 | """Partition barcodes from a sample bedpe to different groups. 
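    Expected meta format (inferred from the positional indexing below,
    not stated in the original docstring): tab-separated with a header,
    whose first three columns are barcode, sample and group; the barcode
    itself sits in the 7th column of the input bedpe.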
11 |     12 | Outfile format: 13 | fnm = f"{outdir}/{sample}.{g}.bedpe.gz" 14 | """ 15 | # * read meta data 16 | if not infnm.endswith('.bedpe.gz'): 17 | raise FileNotFoundError( 18 | f"{infnm} must end with .bedpe.gz") 19 | r: pd.DataFrame = pd.read_csv(meta, sep = "\t", header = 0) 20 | barcode = pd.Series( 21 | data = r.iloc[:, 2].values, 22 | index = r.iloc[:, 0]) 23 | cell2group :Dict[str, str] = barcode.to_dict() 24 | # * set up outfiles 25 | s = r.iloc[:, [1,2]].groupby(by = r.iloc[:,1]).apply( 26 | lambda x: list(pd.unique(x.iloc[:,1]))) 27 | o2n :Dict[str, List[str]] = s.to_dict() 28 | ng2f :Dict[str, gzip.GzipFile] = dict() 29 | 30 | if not os.path.exists(outdir): 31 | print(f"{outdir} does not exist; creating it.") 32 | os.makedirs(name = outdir, exist_ok = True) 33 | for g in o2n[sample]: 34 | fnm = f"{outdir}/{sample}.{g}.bedpe.gz" 35 | if os.path.exists(fnm): 36 | print(f"{fnm} exists; removing it.") 37 | os.remove(fnm) 38 | ng2f[g] = gzip.open(f"{fnm}", mode = "wb") 39 | 40 | # * main 41 | with gzip.open(infnm, "rb") as f: 42 | for l in f: 43 | barcode: str = l.decode().split("\t")[6] 44 | if barcode in cell2group: 45 | ng:str = cell2group[barcode] 46 | ## debug 47 | # print(f"{barcode} -> {ng}") 48 | ng2f[ng].write(l) 49 | # close all the output files 50 | for g in o2n[sample]: 51 | if not ng2f[g].closed: 52 | ng2f[g].close() 53 | return True 54 | -------------------------------------------------------------------------------- /package/python/mycelloracle.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os, sys, shutil, importlib, glob 4 | from tqdm.notebook import tqdm 5 | import celloracle as co 6 | from celloracle import motif_analysis as ma 7 | from genomepy import Genome 8 | import scanpy as sc 9 | 10 | def download_refgenome(ref = "mm10") -> None: 11 | genome_installation = ma.is_genome_installed(ref_genome = ref) 12 | if not genome_installation: 13 | import genomepy 14 | ## fasta: /Users/szu/.local/share/genomes/mm10/mm10.fa 15 | genomepy.install_genome(name = ref, provider = "UCSC") 16 | else: 17 | print(f"{ref} is installed") 18 | 19 | 20 | def decompose_chrstr(peak_str): 21 | """ 22 | Args: 23 | peak_str (str): peak_str. e.g. 'chr1_3094484_3095479' 24 | 25 | Returns: 26 | tuple: chromosome name, start position, end position 27 | """ 28 | *chr_, start, end = peak_str.split("_") 29 | chr_ = "_".join(chr_) 30 | return chr_, start, end 31 | 32 | def check_peak_format(peaks_df, ref_genome): 33 | """ 34 | Check peak format. 35 | (1) Check chromosome name. 
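        (names are checked against the chromosomes of the installed ref_genome, loaded via genomepy)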
36 | (2) Check peak size (length) and remove short DNA sequences (<5bp) 37 | 38 | """ 39 | df = peaks_df.copy() 40 | n_peaks_before = df.shape[0] 41 | # Decompose peaks and make df 42 | decomposed = [decompose_chrstr(peak_str) for peak_str in df["peak_id"]] 43 | df_decomposed = pd.DataFrame(np.array(decomposed), index=peaks_df.index) 44 | df_decomposed.columns = ["chr", "start", "end"] 45 | df_decomposed["start"] = df_decomposed["start"].astype(int) 46 | df_decomposed["end"] = df_decomposed["end"].astype(int) 47 | # Load genome data 48 | genome_data = Genome(ref_genome) 49 | all_chr_list = list(genome_data.keys()) 50 | # DNA length check 51 | lengths = np.abs(df_decomposed["end"] - df_decomposed["start"]) 52 | # Filter peaks with invalid length or chromosome name 53 | n_threshold = 5 54 | df = df[(lengths >= n_threshold) & df_decomposed.chr.isin(all_chr_list)] 55 | # (the lengths computed above are reused for the counts below) 56 | 57 | 58 | # Data counting 59 | n_invalid_length = len(lengths[lengths < n_threshold]) 60 | n_peaks_invalid_chr = n_peaks_before - df_decomposed.chr.isin(all_chr_list).sum() 61 | n_peaks_after = df.shape[0] 62 | print("Peaks before filtering: ", n_peaks_before) 63 | print("Peaks with invalid chr_name: ", n_peaks_invalid_chr) 64 | print("Peaks with invalid length: ", n_invalid_length) 65 | print("Peaks after filtering: ", n_peaks_after) 66 | return df 67 | 68 | def load_tfidf(f) -> pd.DataFrame: 69 | import pyarrow.parquet as pq 70 | r = pq.read_table(f) 71 | return r.to_pandas() 72 | -------------------------------------------------------------------------------- /package/python/mylog.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import logging 3 | 4 | class StreamToLogger(object): 5 | """ 6 | Fake file-like stream object that redirects writes to a logger instance. 
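    Typical use, as a sketch (with `logger` as returned by set_file_logger below):
        sys.stdout = StreamToLogger(logger, logging.INFO)
        sys.stderr = StreamToLogger(logger, logging.ERROR)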
7 | Ref: 8 | https://stackoverflow.com/questions/19425736/how-to-redirect-stdout-and-stderr-to-logger-in-python 9 | """ 10 | def __init__(self, logger, level): 11 | self.logger = logger 12 | self.level = level 13 | self.linebuf = '' 14 | 15 | def write(self, buf): 16 | for line in buf.rstrip().splitlines(): 17 | self.logger.log(self.level, line.rstrip()) 18 | 19 | def flush(self): 20 | pass 21 | 22 | def set_file_logger(fnm:str, 23 | fmode:str = 'a', 24 | name:str = 'sa2_pp', 25 | log_level: int = logging.DEBUG) -> logging.Logger: 26 | logger = logging.getLogger(name) 27 | logger.setLevel(log_level) 28 | fh = logging.FileHandler(filename = fnm, 29 | mode = fmode) 30 | fm = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 31 | fh.setFormatter(fm) 32 | logger.addHandler(fh) 33 | return logger 34 | 35 | def handle_exception(logger, exc_type, exc_value, exc_traceback): 36 | import traceback 37 | if issubclass(exc_type, KeyboardInterrupt): 38 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 39 | return 40 | logger.error(''.join(["Uncaught exception: ", 41 | *traceback.format_exception( 42 | exc_type, exc_value, exc_traceback) 43 | ])) 44 | -------------------------------------------------------------------------------- /package/python/mysnapatac2.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import snapatac2 as sa2 3 | 4 | 5 | def modify_obs_name(sds: sa2.AnnData | sa2.AnnDataSet, 6 | obs_key = "sample") -> List[str]: 7 | obs_names: List[str] = [f"{i}.{j}" 8 | for i, j in zip( 9 | sds.obs[obs_key].to_list(), sds.obs_names)] 10 | return obs_names 11 | 12 | def clean_AnnDataSet(d: str): 13 | import os 14 | os.remove(f"{d}/_dataset.h5ads") 15 | import shutil 16 | shutil.rmtree(f"{d}/anndatas", ignore_errors = True) 17 | os.removedirs(d) 18 | 19 | # deprecated 20 | # use to_adata directly for subsetting data. 21 | def get_subset_from_AnnDataSet(adata, outf, 22 | obs_index = None, var_index = None, 23 | logger = None): 24 | import os 25 | tmp_dir, _ = os.path.splitext(outf) 26 | os.makedirs(tmp_dir, exist_ok = True) 27 | adata_subset = adata.subset( 28 | obs_indices = obs_index, 29 | var_indices = var_index, 30 | out = tmp_dir 31 | ) 32 | adata_subset = adata_subset[0] 33 | if logger is not None: 34 | logger.info(f"create subset from {tmp_dir} to {outf}") 35 | r = adata_subset.to_adata( 36 | copy_x = True, 37 | file = outf 38 | ) 39 | adata_subset.close() 40 | # remove adata_subset 41 | os.remove(f"{tmp_dir}/_dataset.h5ads") 42 | import shutil 43 | shutil.rmtree(f"{tmp_dir}/anndatas", ignore_errors = True) 44 | os.removedirs(tmp_dir) 45 | if logger is not None: 46 | logger.info(f"clean tmp {tmp_dir}.") 47 | return r 48 | -------------------------------------------------------------------------------- /package/python/snap2h5ad.py: -------------------------------------------------------------------------------- 1 | """ 2 | Save mat from snap file to h5ad file. 3 | """ 4 | import anndata as ad 5 | import h5py 6 | import numpy as np 7 | from scipy import sparse 8 | 9 | def snapbmat2h5ad(snap_file, bin_size:int = 5000, 10 | prefix:str = '', 11 | outfnm:str = '') -> ad.AnnData: 12 | """Save bmat from snap file to h5ad. 
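    As implemented below: bin_size selects the /AM/{bin_size} matrix inside
    the snap file; a non-empty prefix is prepended to each barcode as
    "{prefix}.{barcode}"; a non-empty outfnm additionally writes the AnnData
    to disk with gzip compression.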
13 | """ 14 | with h5py.File(snap_file, mode = 'r') as f: 15 | barcode = [b.decode('utf-8') for b in f['/BD/name']] 16 | chrom = [i.decode('utf-8') 17 | for i in f[f'/AM/{bin_size}/binChrom']] 18 | start = f[f'/AM/{bin_size}/binStart'][:] 19 | end = start + bin_size - 1 20 | name = [f"{i}:{s}-{t}" for i, s, t in 21 | zip(chrom, start, end)] 22 | idx = np.array(f[f'/AM/{bin_size}/idx'], 23 | dtype = np.uintc) 24 | idy = np.array(f[f'/AM/{bin_size}/idy'], 25 | dtype = np.uintc) 26 | count = np.array(f[f'/AM/{bin_size}/count'], 27 | dtype = np.float32) 28 | cscmat = sparse.csc_matrix( 29 | (count, (idx - 1, idy - 1)), 30 | shape = (len(barcode), len(name)), dtype = np.float32) 31 | adata = ad.AnnData(X = cscmat) 32 | if len(prefix) > 0: 33 | fullname = [f"{prefix}.{b}" for b in barcode] 34 | else: 35 | fullname = barcode 36 | adata.obs_names = fullname 37 | adata.var_names = name 38 | if len(outfnm) > 0 : 39 | adata.write(outfnm, compression = "gzip") 40 | return adata 41 | 42 | 43 | def snapgmat2h5ad(snap_file, prefix:str = '', 44 | outfnm:str = '') -> ad.AnnData: 45 | """Save gmat from snap file to h5ad. 46 | """ 47 | with h5py.File(snap_file, mode = 'r') as f: 48 | barcode = [b.decode('utf-8') for b in f['/BD/name']] 49 | genm = [g.decode('utf-8') for g in f['/GM/name']] 50 | idx = np.array(f["/GM/idx"], dtype = np.uintc) 51 | idy = np.array(f["/GM/idy"], dtype = np.uintc) 52 | count = np.array(f["/GM/count"], dtype = np.float32) 53 | cscmat = sparse.csc_matrix( 54 | (count, (idx -1, idy-1)), 55 | shape = (len(barcode), len(genm)), 56 | dtype = np.float32 57 | ) 58 | adata = ad.AnnData(X = cscmat) 59 | if len(prefix) > 0: 60 | fullname = [f"{prefix}.{b}" for b in barcode] 61 | else: 62 | fullname = barcode 63 | adata.obs_names = fullname 64 | adata.var_names = genm 65 | if len(outfnm) > 0 : 66 | adata.write(outfnm, compression = "gzip") 67 | return adata 68 | -------------------------------------------------------------------------------- /package/tasks/getAllL3SnapMat/L2GroupAll.csv: -------------------------------------------------------------------------------- 1 | GABA_1 2 | GABA_10 3 | GABA_11 4 | GABA_12 5 | GABA_13 6 | GABA_14 7 | GABA_15 8 | GABA_16 9 | GABA_17 10 | GABA_18 11 | GABA_19 12 | GABA_2 13 | GABA_20 14 | GABA_21 15 | GABA_22 16 | GABA_23 17 | GABA_24 18 | GABA_25 19 | GABA_26 20 | GABA_27 21 | GABA_28 22 | GABA_29 23 | GABA_3 24 | GABA_30 25 | GABA_31 26 | GABA_32 27 | GABA_33 28 | GABA_4 29 | GABA_5 30 | GABA_6 31 | GABA_7 32 | GABA_8 33 | GABA_9 34 | GLUT_1 35 | GLUT_11 36 | GLUT_13 37 | GLUT_14 38 | GLUT_15 39 | GLUT_16.25 40 | GLUT_17 41 | GLUT_18 42 | GLUT_19 43 | GLUT_2.7 44 | GLUT_20 45 | GLUT_21 46 | GLUT_22.24 47 | GLUT_23 48 | GLUT_26 49 | GLUT_27 50 | GLUT_28 51 | GLUT_3 52 | GLUT_4 53 | GLUT_5 54 | GLUT_6 55 | GLUT_8.10 56 | GLUT_9.12 57 | NonN_1 58 | NonN_10 59 | NonN_11 60 | NonN_12 61 | NonN_13 62 | NonN_2 63 | NonN_3 64 | NonN_4 65 | NonN_5 66 | NonN_6 67 | NonN_7 68 | NonN_8 69 | NonN_9 70 | -------------------------------------------------------------------------------- /package/tasks/getAllL3SnapMat/Makefile: -------------------------------------------------------------------------------- 1 | mat := vM16gmat 2 | 3 | define screenSnakemake 4 | screen -dmS ${1} snakemake -c 1 -p --snakefile Snakefile --profile profile -R --rerun-incomplete 5 | endef 6 | 7 | run: Snakefile.template L2GroupAll.csv 8 | cp $< ${mat}/Snakefile 9 | cp -R profile.template/. 
${mat}/profile 10 | cp $(word 2,$^) ${mat}/group.csv 11 | cd ${mat} && $(call screenSnakemake,${mat}) 12 | -------------------------------------------------------------------------------- /package/tasks/getAllL3SnapMat/Snakefile.template: -------------------------------------------------------------------------------- 1 | import json 2 | config_file = "config.json" 3 | 4 | with open(config_file, "r") as f: 5 | config = json.load(f) 6 | matFlagdir = config["matFlagdir"] 7 | matLogdir = config["matLogdir"] 8 | conda = config["conda"] 9 | Rscript = f"{conda}/bin/Rscript" 10 | atacMatScript = config["atacMatScript"] 11 | groupFile = config["groupFile"] 12 | 13 | with open(groupFile, "r") as f: 14 | lines = f.readlines() 15 | groups = [l.strip() for l in lines if len(l.strip()) > 1] 16 | print(groups) 17 | 18 | rule all: 19 | input: 20 | expand("{d}/{g}.done", d = matFlagdir, g = groups) 21 | 22 | rule getMat: 23 | output: 24 | touch(expand("{d}/{{g}}.done", d = matFlagdir)) 25 | log: 26 | expand("{d}/{{g}}.log", d = matLogdir) 27 | shell: 28 | """ 29 | {Rscript} {atacMatScript} --config {config_file} \ 30 | --group {wildcards.g} 2> {log} 31 | """ 32 | -------------------------------------------------------------------------------- /package/tasks/getAllL3SnapMat/bmat/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "gencodevM23snapFile": "../snap/cemba.whole.gencode23.snap.rds", 3 | "gencodevM16snapFile": "../snap/snapWithL3Extra.rds", 4 | "ncores": 2, 5 | "matDir": "out", 6 | "matLogdir": "log", 7 | "matFlagdir": "flagdir", 8 | "splitGroup": 0, 9 | "conda": "/projects/ps-renlab/szu/miniconda3/envs/snATAC2", 10 | "atacMatScript": "../getSnapATACMat.R", 11 | "mat": "bmat", 12 | "groupType": "L2", 13 | "groupFile": "group.csv", 14 | "gencode": "vM16" 15 | } 16 | -------------------------------------------------------------------------------- /package/tasks/getAllL3SnapMat/profile.template/cluster.yaml: -------------------------------------------------------------------------------- 1 | __default__: 2 | jobname: "{rule}.{wildcards}" 3 | nodes: 1 4 | ppn: 1 5 | walltime: "15:00:00" 6 | account: "ren-group" 7 | queue: "hotel" 8 | email: "debug.pie@gmail.com" 9 | mailon: "ae" 10 | jobout: "oe" 11 | log: "{rule}.{wildcards}.tscc.log" 12 | logdir: "qsub_tscc_log/" 13 | mem: "50gb" 14 | 15 | getMat: 16 | mem: "100gb" 17 | ppn: 2 18 | queue: "hotel" 19 | walltime: "15:00:00" 20 | -------------------------------------------------------------------------------- /package/tasks/getAllL3SnapMat/profile.template/config.yaml: -------------------------------------------------------------------------------- 1 | cluster-config: "profile/cluster.yaml" 2 | cluster: "qsub -N {cluster.jobname} -l nodes={cluster.nodes}:ppn={cluster.ppn},mem={cluster.mem},walltime={cluster.walltime} -A {cluster.account} -q {cluster.queue} -M {cluster.email} -m {cluster.mailon} -j {cluster.jobout} -e {cluster.logdir} -V " 3 | jobs: 100 4 | verbose: true 5 | notemp: true 6 | -------------------------------------------------------------------------------- /package/tasks/getAllL3SnapMat/vM16gmat/L2GroupAll.csv: -------------------------------------------------------------------------------- 1 | GABA_1 2 | GABA_10 3 | GABA_11 4 | GABA_12 5 | GABA_13 6 | GABA_14 7 | GABA_15 8 | GABA_16 9 | GABA_17 10 | GABA_18 11 | GABA_19 12 | GABA_2 13 | GABA_20 14 | GABA_21 15 | GABA_22 16 | GABA_23 17 | GABA_24 18 | GABA_25 19 | GABA_26 20 | GABA_27 21 | GABA_28 22 | GABA_29 23 | GABA_3 24 | GABA_30 25 
| GABA_31 26 | GABA_32 27 | GABA_33 28 | GABA_4 29 | GABA_5 30 | GABA_6 31 | GABA_7 32 | GABA_8 33 | GABA_9 34 | GLUT_1 35 | GLUT_11 36 | GLUT_13 37 | GLUT_14 38 | GLUT_15 39 | GLUT_16.25 40 | GLUT_17 41 | GLUT_18 42 | GLUT_19 43 | GLUT_2.7 44 | GLUT_20 45 | GLUT_21 46 | GLUT_22.24 47 | GLUT_23 48 | GLUT_26 49 | GLUT_27 50 | GLUT_28 51 | GLUT_3 52 | GLUT_4 53 | GLUT_5 54 | GLUT_6 55 | GLUT_8.10 56 | GLUT_9.12 57 | NonN_1 58 | NonN_10 59 | NonN_11 60 | NonN_12 61 | NonN_13 62 | NonN_2 63 | NonN_3 64 | NonN_4 65 | NonN_5 66 | NonN_6 67 | NonN_7 68 | NonN_8 69 | NonN_9 70 | -------------------------------------------------------------------------------- /package/tasks/getAllL3SnapMat/vM16gmat/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "gencodevM23snapFile": "../snap/cemba.whole.gencode23.snap.rds", 3 | "gencodevM16snapFile": "../snap/snapWithL3Extra.rds", 4 | "ncores": 2, 5 | "matDir": "out", 6 | "matLogdir": "log", 7 | "matFlagdir": "flagdir", 8 | "splitGroup": 0, 9 | "conda": "/projects/ps-renlab/szu/miniconda3/envs/snATAC2", 10 | "atacMatScript": "../getSnapATACMat.R", 11 | "mat": "gmat", 12 | "groupType": "L2", 13 | "groupFile": "group.csv", 14 | "gencode": "vM16" 15 | } 16 | -------------------------------------------------------------------------------- /package/tasks/getSnapATACMatByGroup/L1Group.csv: -------------------------------------------------------------------------------- 1 | NonN 2 | GABA 3 | GLUT -------------------------------------------------------------------------------- /package/tasks/getSnapATACMatByGroup/L2GroupAll.csv: -------------------------------------------------------------------------------- 1 | GABA_1 2 | GABA_10 3 | GABA_11 4 | GABA_12 5 | GABA_13 6 | GABA_14 7 | GABA_15 8 | GABA_16 9 | GABA_17 10 | GABA_18 11 | GABA_19 12 | GABA_2 13 | GABA_20 14 | GABA_21 15 | GABA_22 16 | GABA_23 17 | GABA_24 18 | GABA_25 19 | GABA_26 20 | GABA_27 21 | GABA_28 22 | GABA_29 23 | GABA_3 24 | GABA_30 25 | GABA_31 26 | GABA_32 27 | GABA_33 28 | GABA_4 29 | GABA_5 30 | GABA_6 31 | GABA_7 32 | GABA_8 33 | GABA_9 34 | GLUT_1 35 | GLUT_11 36 | GLUT_13 37 | GLUT_14 38 | GLUT_15 39 | GLUT_16.25 40 | GLUT_17 41 | GLUT_18 42 | GLUT_19 43 | GLUT_2.7 44 | GLUT_20 45 | GLUT_21 46 | GLUT_22.24 47 | GLUT_23 48 | GLUT_26 49 | GLUT_27 50 | GLUT_28 51 | GLUT_3 52 | GLUT_4 53 | GLUT_5 54 | GLUT_6 55 | GLUT_8.10 56 | GLUT_9.12 57 | NonN_1 58 | NonN_10 59 | NonN_11 60 | NonN_12 61 | NonN_13 62 | NonN_2 63 | NonN_3 64 | NonN_4 65 | NonN_5 66 | NonN_6 67 | NonN_7 68 | NonN_8 69 | NonN_9 70 | -------------------------------------------------------------------------------- /package/tasks/getSnapATACMatByGroup/L2GroupTest.csv: -------------------------------------------------------------------------------- 1 | NonN_2 2 | NonN_3 3 | NonN_11 4 | -------------------------------------------------------------------------------- /package/tasks/getSnapATACMatByGroup/L2MultiGroup.csv: -------------------------------------------------------------------------------- 1 | NonN_2-NonN_3-NonN_11 -------------------------------------------------------------------------------- /package/tasks/getSnapATACMatByGroup/Makefile: -------------------------------------------------------------------------------- 1 | mat := L2vM16gmat 2 | 3 | define screenSnakemake 4 | screen -dmS ${1} snakemake -c 1 -p --snakefile Snakefile --profile profile -R --rerun-incomplete 5 | endef 6 | 7 | run: Snakefile.template L2GroupAll.csv config.json.template 8 | -mkdir -p ${mat} 9 | 
cp $< ${mat}/Snakefile 10 | cp -R profile.template/. ${mat}/profile 11 | cp L2GroupAll.csv ${mat}/group.csv 12 | cp config.json.template ${mat}/config.json 13 | cd ${mat} && $(call screenSnakemake,${mat}) 14 | 15 | test: Snakefile.template L2GroupTest.csv config.json.template 16 | -mkdir -p ${mat} 17 | cp $< ${mat}/Snakefile 18 | cp -R profile.template/. ${mat}/profile 19 | cp L2GroupTest.csv ${mat}/group.csv 20 | cp config.json.template ${mat}/config.json 21 | cd ${mat} && $(call screenSnakemake,${mat}) 22 | 23 | l1mat := L1vM16gmat 24 | runL1: Snakefile.template L1Group.csv configL1.json.template 25 | -mkdir -p ${l1mat} 26 | cp $< ${l1mat}/Snakefile 27 | cp -R profile.template/. ${l1mat}/profile 28 | cp $(word 2,$^) ${l1mat}/group.csv 29 | cp $(word 3,$^) ${l1mat}/config.json 30 | cd ${l1mat} && $(call screenSnakemake,${l1mat}) 31 | 32 | multiL2 := L2MultvM16gmat 33 | runMultiL2: Snakefile.template L2MultiGroup.csv configMultiGroup.json.template 34 | -mkdir -p ${multiL2} 35 | cp $< ${multiL2}/Snakefile 36 | cp -R profile.template/. ${multiL2}/profile 37 | cp $(word 2,$^) ${multiL2}/group.csv 38 | cp $(word 3,$^) ${multiL2}/config.json 39 | cd ${multiL2} && $(call screenSnakemake,${multiL2}) 40 | -------------------------------------------------------------------------------- /package/tasks/getSnapATACMatByGroup/Snakefile.template: -------------------------------------------------------------------------------- 1 | import json 2 | config_file = "config.json" 3 | 4 | with open(config_file, "r") as f: 5 | config = json.load(f) 6 | matFlagdir = config["matFlagdir"] 7 | matLogdir = config["matLogdir"] 8 | conda = config["conda"] 9 | Rscript = f"{conda}/bin/Rscript" 10 | atacMatScript = config["atacMatScript"] 11 | groupFile = config["groupFile"] 12 | 13 | with open(groupFile, "r") as f: 14 | lines = f.readlines() 15 | groups = [l.strip() for l in lines if len(l.strip()) > 1] 16 | print(groups) 17 | 18 | rule all: 19 | input: 20 | expand("{d}/{g}.done", d = matFlagdir, g = groups) 21 | 22 | rule getMat: 23 | output: 24 | touch(expand("{d}/{{g}}.done", d = matFlagdir)) 25 | log: 26 | expand("{d}/{{g}}.log", d = matLogdir) 27 | shell: 28 | """ 29 | {Rscript} {atacMatScript} --config {config_file} \ 30 | --group {wildcards.g} 2> {log} 31 | """ 32 | -------------------------------------------------------------------------------- /package/tasks/getSnapATACMatByGroup/config.json.template: -------------------------------------------------------------------------------- 1 | { 2 | "gencodevM23snapFile": "../snap/cemba.whole.gencode23.snap.rds", 3 | "gencodevM16snapFile": "../snap/snapWithL3Extra.rds", 4 | "ncores": 4, 5 | "matDir": "out", 6 | "matLogdir": "log", 7 | "matFlagdir": "flagdir", 8 | "ndpL2": 30000, 9 | "ndpL3": 10000, 10 | "conda": "/projects/ps-renlab/szu/miniconda3/envs/snATAC2", 11 | "atacMatScript": "../getSnapATACMatByGroup.R", 12 | "mat": "gmat", 13 | "groupType": "L2", 14 | "groupFile": "group.csv", 15 | "gencode": "vM16", 16 | "requireNextLevel": 0 17 | } 18 | -------------------------------------------------------------------------------- /package/tasks/getSnapATACMatByGroup/configL1.json.template: -------------------------------------------------------------------------------- 1 | { 2 | "gencodevM23snapFile": "../snap/cemba.whole.gencode23.snap.rds", 3 | "gencodevM16snapFile": "../snap/snapL3WithIds.rds", 4 | "ncores": 4, 5 | "matDir": "out", 6 | "matLogdir": "log", 7 | "matFlagdir": "flagdir", 8 | "ndp": 30000, 9 | "conda": "/projects/ps-renlab/szu/miniconda3/envs/snATAC2", 10 | 
"atacMatScript": "../getSnapATACMatByGroup.R", 11 | "mat": "gmat", 12 | "groupType": "L1", 13 | "groupFile": "group.csv", 14 | "gencode": "vM16", 15 | "requireNextLevel": 0 16 | } 17 | -------------------------------------------------------------------------------- /package/tasks/getSnapATACMatByGroup/configMultiGroup.json.template: -------------------------------------------------------------------------------- 1 | { 2 | "gencodevM23snapFile": "../snap/cemba.whole.gencode23.snap.rds", 3 | "gencodevM16snapFile": "../snap/snapWithL3Extra.rds", 4 | "ncores": 4, 5 | "matDir": "out", 6 | "matLogdir": "log", 7 | "matFlagdir": "flagdir", 8 | "splitGroup": 0, 9 | "ndpL2": 30000, 10 | "ndpL3": 10000, 11 | "conda": "/projects/ps-renlab/szu/miniconda3/envs/snATAC2", 12 | "atacMatScript": "../getSnapATACMatByGroup.R", 13 | "mat": "gmat", 14 | "groupType": "L2Extra", 15 | "groupFile": "group.csv", 16 | "gencode": "vM16", 17 | "requireNextLevel": 1 18 | } 19 | -------------------------------------------------------------------------------- /package/tasks/getSnapATACMatByGroup/profile.template/cluster.yaml: -------------------------------------------------------------------------------- 1 | __default__: 2 | jobname: "{rule}.{wildcards}" 3 | nodes: 1 4 | ppn: 1 5 | walltime: "15:00:00" 6 | account: "ren-group" 7 | queue: "hotel" 8 | email: "debug.pie@gmail.com" 9 | mailon: "ae" 10 | jobout: "oe" 11 | log: "{rule}.{wildcards}.tscc.log" 12 | logdir: "qsub_tscc_log/" 13 | mem: "50gb" 14 | 15 | getMat: 16 | mem: "50gb" 17 | ppn: 2 18 | queue: "glean" 19 | walltime: 04:00:00 20 | -------------------------------------------------------------------------------- /package/tasks/getSnapATACMatByGroup/profile.template/config.yaml: -------------------------------------------------------------------------------- 1 | cluster-config: "profile/cluster.yaml" 2 | cluster: "qsub -N {cluster.jobname} -l nodes={cluster.nodes}:ppn={cluster.ppn},mem={cluster.mem},walltime={cluster.walltime} -A {cluster.account} -q {cluster.queue} -M {cluster.email} -m {cluster.mailon} -j {cluster.jobout} -e {cluster.logdir} -V " 3 | jobs: 100 4 | verbose: true 5 | notemp: true 6 | -------------------------------------------------------------------------------- /package/tasks/nmf/01.prepare.nmf.R: -------------------------------------------------------------------------------- 1 | # Prepare data for nmf. 2 | # TODO: This script can be generalised. 3 | # Currently, I use it case by case. 4 | library(data.table) 5 | library(hdf5r) 6 | 7 | packdir <- file.path(here::here(), "package", "R") 8 | import::from(.from = "cembav2env.R", .directory = packdir, 9 | cembav2env) 10 | 11 | # * configs 12 | outdir <- "data" 13 | outh5 <- file.path(outdir, "cpm.cbyp.Intv2.h5") 14 | outPeakCoordFile <- file.path(outdir, "peaks.Intv2.txt") 15 | outClusterFile <- file.path(outdir, "clusters.Intv2.txt") 16 | 17 | # * load atac cpm. 
18 | cpm.scbyp <- readRDS(cembav2env$subclassPmatCPMIntv2File) 19 | 20 | ## # * load cCREs from positive pdc 21 | ## cCREs.ppdc <- data.table::fread( 22 | ## "../out/AllenAnnotConcat/mba.whole.AllenAnnotConcat.pearson.pos.pdc.CREs", 23 | ## header = FALSE, data.table = FALSE 24 | ## )$V1 25 | 26 | # * main 27 | ## cpm.ppdc <- cpm.pdc[cCREs.ppdc, ] 28 | ## cpm.pbysc <- cpm.ppdc 29 | 30 | ## set cap for too high values 31 | upValue <- quantile(cpm.scbyp, 0.9999) 32 | cpm.capped <- cpm.scbyp 33 | cpm.capped[cpm.scbyp > upValue] <- upValue 34 | 35 | ## change to cluster by peak mat 36 | ## for later saving to hdf5 37 | peaks <- colnames(cpm.capped) 38 | clusters <- rownames(cpm.capped) 39 | 40 | # * save mat to hdf5 format for python handling. 41 | conn <- hdf5r::H5File$new(outh5, mode = "w") 42 | data.grp <- conn$create_group("X") 43 | # NOTE: hdf5r will transpose the mat 44 | # https://github.com/hhoeflin/hdf5r/issues/81 45 | data.grp[["mat"]] <- cpm.capped 46 | # colnames corresponds to cpm.capped 47 | data.grp[["colnames"]] <- peaks 48 | # rownames corresponds to cpm.capped 49 | data.grp[["rownames"]] <- clusters 50 | conn$close_all() 51 | 52 | write.table(peaks, file = outPeakCoordFile, quote = FALSE, 53 | row.names = FALSE, col.names = FALSE) 54 | write.table(clusters, file = outClusterFile, quote = FALSE, 55 | row.names = FALSE, col.names = FALSE) 56 | -------------------------------------------------------------------------------- /package/tasks/nmf/02.nmfATAC.plotH.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # read results from sklearn 4 | 5 | suppressPackageStartupMessages(library("argparse")) 6 | 7 | # create parser object 8 | parser <- ArgumentParser() 9 | 10 | # specify our desired options 11 | # by default ArgumentParser will add a help option 12 | parser$add_argument("-i", "--input", required=TRUE, help="input matrix") 13 | parser$add_argument("-o", "--output", required=TRUE, help="output file prefix") 14 | # get command line options, if help option encountered print help and exit, 15 | # otherwise if options not found on command line then set defaults. 16 | args <- parser$parse_args() 17 | 18 | 19 | library(data.table) 20 | dataH <- fread(args$input,sep="\t") 21 | 22 | library(pheatmap) 23 | library(RColorBrewer) 24 | library(viridis) 25 | library(dendsort) 26 | 27 | # scale by column 28 | #mx <- apply(dataH,2,scale) 29 | 30 | normUnity <- function(x){ 31 | total <- sum(x) 32 | x / total 33 | } 34 | 35 | mx <- apply(dataH,2,normUnity) 36 | 
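## after this apply, every column of mx sums to one, e.g.
## apply(matrix(1:4, nrow = 2), 2, normUnity) gives the columns
## c(1/3, 2/3) and c(3/7, 4/7)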
37 | sort_hclust <- function(...) as.hclust(dendsort(as.dendrogram(...))) 38 | #mat_cluster_rows_H <- sort_hclust(hclust(dist(dataH))) 39 | mat_cluster_cols_H <- sort_hclust(hclust(dist(t(mx)))) 40 | 41 | quantile_breaks <- function(xs, n = 30) { 42 | breaks <- quantile(xs, probs = seq(0, 1, length.out = n)) 43 | breaks[!duplicated(breaks)] 44 | } 45 | 46 | # mat_breaks_H <- quantile_breaks(t(mx), n = 30) 47 | 48 | pdf(paste(args$output,".H.pdf",sep='')) 49 | pheatmap( 50 | mat = mx, 51 | scale = 'none', 52 | color = viridis(30), 53 | # color = viridis(length(mat_breaks_H) - 1), 54 | # breaks = mat_breaks_H, 55 | border_color = NA, 56 | cluster_cols = mat_cluster_cols_H, 57 | cluster_rows = F, 58 | # cluster_rows = mat_cluster_rows_H, 59 | show_colnames = TRUE, 60 | show_rownames = FALSE, 61 | drop_levels = TRUE, 62 | fontsize = 14, 63 | main = "decomp H" 64 | ) 65 | dev.off() 66 | 67 | 68 | nor01 <- function(x){ 69 | min <- min(x) 70 | max <- max(x) 71 | out <- (x - min) / (max - min) 72 | } 73 | 74 | -------------------------------------------------------------------------------- /package/tasks/nmf/02.nmfATAC.plotW.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # read results from sklearn 4 | 5 | suppressPackageStartupMessages(library("argparse")) 6 | 7 | # create parser object 8 | parser <- ArgumentParser() 9 | 10 | # specify our desired options 11 | # by default ArgumentParser will add a help option 12 | parser$add_argument("-i", "--input", required=TRUE, help="input matrix") 13 | parser$add_argument("-o", "--output", required=TRUE, help="output file prefix") 14 | # get command line options, if help option encountered print help and exit, 15 | # otherwise if options not found on command line then set defaults. 16 | args <- parser$parse_args() 17 | 18 | 19 | library(data.table) 20 | dataW <- fread(args$input,sep="\t") 21 | 22 | library(pheatmap) 23 | library(RColorBrewer) 24 | library(viridis) 25 | library(dendsort) 26 | 27 | # scale by column 28 | #tmp <- apply(dataW,2,scale) 29 | 30 | normUnity <- function(x){ 31 | total <- sum(x) 32 | x / total 33 | } 34 | 35 | tmp <- apply(dataW,1,normUnity) 36 | tmp <- t(tmp) 37 | mx <- tmp[sample(nrow(tmp), 5000), ] 38 | 
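## heatmap input: subsample 5000 rows of W so the dist/hclust calls below
## stay tractable (note sample() errors if W has fewer than 5000 rows)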
39 | sort_hclust <- function(...) as.hclust(dendsort(as.dendrogram(...))) 40 | mat_cluster_rows_W <- sort_hclust(hclust(dist(mx))) 41 | #mat_cluster_cols_W <- sort_hclust(hclust(dist(t(mx)))) 42 | 43 | quantile_breaks <- function(xs, n = 30) { 44 | breaks <- quantile(xs, probs = seq(0, 1, length.out = n)) 45 | breaks[!duplicated(breaks)] 46 | } 47 | 48 | #mat_breaks_W <- quantile_breaks(t(mx), n = 30) 49 | 50 | pdf(paste(args$output,".W.pdf",sep='')) 51 | pheatmap( 52 | mat = mx, 53 | scale = 'none', 54 | color = viridis(30), 55 | # color = viridis(length(mat_breaks_W) - 1), 56 | # breaks = mat_breaks_W, 57 | border_color = NA, 58 | # cluster_cols = mat_cluster_cols_W, 59 | cluster_cols = F, 60 | cluster_rows = mat_cluster_rows_W, 61 | show_colnames = FALSE, 62 | show_rownames = FALSE, 63 | drop_levels = TRUE, 64 | fontsize = 14, 65 | main = "decomp W" 66 | ) 67 | dev.off() 68 | 69 | 70 | norm01 <- function(x){ 71 | min <- min(x) 72 | max <- max(x) 73 | out <- (x - min) / (max - min) 74 | } 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /package/tasks/nmf/02.nmfATAC.statBox.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # read results from sklearn 4 | 5 | suppressPackageStartupMessages(library("argparse")) 6 | 7 | # create parser object 8 | parser <- ArgumentParser() 9 | 10 | # specify our desired options 11 | # by default ArgumentParser will add a help option 12 | parser$add_argument("-i", "--input", required=TRUE, help="input statH") 13 | parser$add_argument("-o", "--output", required=TRUE, help="output file prefix") 14 | # get command line options, if help option encountered print help and exit, 15 | # otherwise if options not found on command line then set defaults. 16 | args <- parser$parse_args() 17 | 18 | data <- read.table(args$input,sep="\t",head=F) 19 | 20 | staoutmx <- data.frame(row.names=c("Min","Q1","Median","Mean","Q3","Max","TopWhisker","BottomWhisker","Box1","Box2","Box3","UpWhisker","DnWhisker")) 21 | for (i in c(5,6,7)){ 22 | x <- data[,i] 23 | boxMx <- matrix(summary(x)) 24 | rownames(boxMx) <- c("Min","Q1","Median","Mean","Q3","Max") 25 | iqr <- IQR(x) 26 | q1 <- summary(x)[2] 27 | q3 <- summary(x)[5] 28 | TopWhisker <- min(max(x), q3 + 1.5 * iqr) 29 | BottomWhisker <- max(min(x), q1 - 1.5 * iqr) 30 | Box1 <- boxMx["Q1",] 31 | Box2 <- boxMx["Median",] - boxMx["Q1",] 32 | Box3 <- boxMx["Q3",] - boxMx["Median",] 33 | UpWhisker <- TopWhisker - boxMx["Q3",] 34 | DnWhisker <- boxMx["Q1",] - BottomWhisker 35 | boxMx <- rbind(boxMx,TopWhisker,BottomWhisker,Box1,Box2,Box3,UpWhisker,DnWhisker) 36 | colnames(boxMx) <- i 37 | staoutmx <- cbind(staoutmx,boxMx) 38 | } 39 | colnames(staoutmx) <- c("contributes","sparseness","entropy") 40 | 41 | cat(median(data$V6),"\n") 42 | 43 | k <- max(data$V3) 44 | n <- nrow(data) 45 | normInfoGain <- 1 - sum(data$V7) / (n * log2(k)) 46 | cat(normInfoGain) 47 | 48 | write.table(staoutmx, file=paste(args$output,".box.sta",sep=''), sep="\t", quote=F, col.names=T, row.names=T) 49 | 50 | -------------------------------------------------------------------------------- /package/tasks/nmf/05.splitPeakByModule.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(optparse) 3 | packdir <- file.path(here::here(), "package/R") 4 | import::from(.from = "utils.R", .directory = packdir, 5 | checkArgsExistOrStop, prepareOutdir, checkFileExistOrStop) 6 | import::from(.from = "peak.R", .directory = 
packdir, 7 | loadStatPeak.NMF) 8 | import::from(.from = "cembav2env.R", .directory = packdir, 9 | cembav2env) 10 | 11 | op <- list( 12 | make_option(c("--nmfDir"), type = "character", 13 | default = "nmf_ppdc/out"), 14 | make_option(c("--module"), type = "integer", 15 | default = 54), 16 | make_option(c("--tag"), type = "character", default = "ppdc") 17 | ) 18 | 19 | args <- parse_args(OptionParser(option_list = op)) 20 | checkArgsExistOrStop(args) 21 | 22 | if(!dir.exists(args$nmfDir)) { 23 | stop(args$nmfDir, " does not exist.") 24 | } 25 | 26 | mod.nmf <- args$module 27 | tag <- args$tag 28 | 29 | outDir <- file.path(args$nmfDir, 30 | paste("nmf", tag, paste0("r", mod.nmf), "motif", sep = ".")) 31 | prepareOutdir(outDir) 32 | 33 | # * functions 34 | convertPeakToBed <- function(peakBed, peaknms, outFile = NULL) { 35 | r <- peakBed[peaknms, ] 36 | if(!is.null(outFile)) { 37 | write.table(x = r, file = outFile, quote = FALSE, sep = "\t", 38 | row.names = FALSE, col.names = FALSE) 39 | } 40 | return(r) 41 | } 42 | 43 | # * load peaks 44 | peakBed <- data.table::fread(cembav2env$peakBedFile, 45 | header = FALSE, sep = "\t", data.table = FALSE) 46 | colnames(peakBed) <- c("chrom", "start", "end", "name") 47 | rownames(peakBed) <- peakBed$name 48 | 49 | # * nmf modules 50 | nmfPeakStat <- loadStatPeak.NMF( 51 | file = file.path(args$nmfDir, 52 | paste("nmfPmat", tag, 53 | paste0("r", mod.nmf), "n0", "statW", sep = "."))) 54 | 55 | modules <- unique(nmfPeakStat$class0 + 1) 56 | 57 | # * save peaks from each module to a separate bed file 58 | invisible(lapply(modules, function(i) { 59 | outFile <- file.path(outDir, 60 | paste0("r", mod.nmf, "_n", i, ".cCREs.bed")) 61 | message("Writing peak bed file to: ", outFile) 62 | peaks <- with(nmfPeakStat, peak[class0 == (i-1)]) 63 | convertPeakToBed(peakBed = peakBed, peaknms = peaks, outFile = outFile) 64 | })) 65 | 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /package/tasks/nmf/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: nmf_subclassIntv2_encoder 2 | nmf_subclassIntv2_encoder: nmf.Snakefile 3 | -mkdir -p $@ 4 | cp config.yaml $@/config.yaml 5 | cp $< $@/Snakefile 6 | cp -R profile/. $@/profile 7 | cd $@ && \ 8 | snakemake --config \ 9 | system=encoder \ 10 | tag=all.Intv2 \ 11 | module=150 \ 12 | n_rerun=2 \ 13 | out=$@ \ 14 | mat_pbyc_h5=data/cpm.cbyp.Intv2.h5 \ 15 | peak_nm_file=data/peaks.Intv2.txt \ 16 | cluster_nm_file=data/clusters.Intv2.txt \ 17 | local_dir=package/tasks/nmf \ 18 | -c 2 -p --snakefile Snakefile -R --rerun-incomplete 19 | 20 | .PHONY: nmf_subclassIntv2_novlp_encoder 21 | nmf_subclassIntv2_novlp_encoder: nmf.Snakefile 22 | -mkdir -p $@ 23 | cp config.yaml $@/config.yaml 24 | cp $< $@/Snakefile 25 | cp -R profile/. $@/profile 26 | cd $@ && \ 27 | snakemake --config \ 28 | system=encoder \ 29 | tag=all.Intv2 \ 30 | mod_from=150 \ 31 | mod_to=151 \ 32 | mod_by=1 \ 33 | n_rerun=2 \ 34 | out=$@ \ 35 | mat_pbyc_h5=data/cpm.cbyp.novlp.Intv2.h5 \ 36 | peak_nm_file=data/peaks.novlp.Intv2.txt \ 37 | cluster_nm_file=data/clusters.novlp.Intv2.txt \ 38 | local_dir=package/tasks/nmf \ 39 | -c 4 -p --snakefile Snakefile -R --rerun-incomplete 40 | 41 | 42 | .PHONY: nmf_subclassIntv2_novlp_tscc 43 | nmf_subclassIntv2_novlp_tscc: nmf.Snakefile 44 | -mkdir -p $@ 45 | cp config.yaml $@/config.yaml 46 | cp $< $@/Snakefile 47 | cp -R profile/. 
$@/profile 48 | cd $@ && \ 49 | snakemake --config \ 50 | system=tscc \ 51 | tag=novlp.Intv2 \ 52 | module=150 \ 53 | n_rerun=2 \ 54 | out=$@ \ 55 | mat_pbyc_h5=data/cpm.cbyp.novlp.Intv2.h5 \ 56 | peak_nm_file=data/peaks.novlp.Intv2.txt \ 57 | cluster_nm_file=data/clusters.novlp.Intv2.txt \ 58 | local_dir=package/tasks/nmf \ 59 | -c 1 -p --snakefile Snakefile -R --rerun-incomplete 60 | 61 | .PHONY: nmf_subclassIntv2_all_tscc 62 | nmf_subclassIntv2_all_tscc: nmf.Snakefile 63 | -mkdir -p $@ 64 | cp config.yaml $@/config.yaml 65 | cp $< $@/Snakefile 66 | cp -R profile/. $@/profile 67 | cd $@ && \ 68 | snakemake --config \ 69 | system=tscc \ 70 | tag=all.Intv2 \ 71 | module=150 \ 72 | n_rerun=2 \ 73 | out=$@ \ 74 | mat_pbyc_h5=data/cpm.cbyp.Intv2.h5 \ 75 | peak_nm_file=data/peaks.Intv2.txt \ 76 | cluster_nm_file=data/clusters.Intv2.txt \ 77 | local_dir=package/tasks/nmf \ 78 | -c 2 -p --snakefile Snakefile -R --rerun-incomplete --profile profile 79 | 80 | 81 | clean: 82 | -rm -rf nmf_test 83 | -rm -rf nmf_test_local 84 | -rm -rf nmf_test_qsub 85 | -------------------------------------------------------------------------------- /package/tasks/nmf/config.yaml: -------------------------------------------------------------------------------- 1 | system: imac 2 | python: 3 | imac: /Users/szu/mambaforge/bin/python 4 | tscc: /projects/ps-renlab/szu/miniconda3/envs/snATAC2/bin/python 5 | encoder: /projects/ps-renlab2/szu/miniconda3/envs/cicero/bin/python 6 | Rscript: 7 | imac: /usr/local/bin/Rscript 8 | tscc: /projects/ps-renlab/szu/miniconda3/envs/snATAC2/bin/Rscript 9 | encoder: /projects/ps-renlab2/szu/miniconda3/envs/cicero/bin/Rscript 10 | code_dir: 11 | imac: /Users/szu/git-recipes/mouseBrainAtlas/cembaV2 12 | tscc: /projects/ps-renlab2/szu/projects/CEMBA2 13 | encoder: /projects/ps-renlab2/szu/projects/CEMBA2 14 | work_dir: 15 | imac: /Users/szu/git-recipes/mouseBrainAtlas/cembaV2 16 | tscc: /oasis/tscc/scratch/szu/projects/CEMBA2 17 | encoder: /projects/ps-renlab2/szu/projects/CEMBA2 18 | homer: 19 | imac: /Users/szu/mambaforge/envs/bio/bin/findMotifsGenome.pl 20 | tscc: /projects/ps-renlab2/szu/miniconda3/envs/cicero/bin/findMotifsGenome.pl 21 | encoder: /projects/ps-renlab2/szu/miniconda3/envs/cicero/bin/findMotifsGenome.pl 22 | subclass_order_meta: meta/subclass.order.hc.csv 23 | local_dir: package/tasks/nmf 24 | peak_nm_file: data/peaks.txt 25 | cluster_nm_file: data/clusters.txt 26 | mat_pbyc_h5: data/cpm.cbyp.ppdc.h5 27 | tag: Intv2 28 | out: nmf_Intv2 29 | n_rerun: 2 30 | mod_from: 150 31 | mod_to: 151 32 | mod_by: 1 33 | use_detailed_mod: 1 34 | detailed_mod: 40a80 35 | mod_split: a 36 | module: 150 37 | --------------------------------------------------------------------------------
/package/tasks/nmf/profile/cluster.yaml: -------------------------------------------------------------------------------- 1 | __default__: 2 | jobname: "{rule}.{wildcards}" 3 | nodes: 1 4 | ppn: 1 5 | walltime: "04:00:00" 6 | account: "ren-group" 7 | queue: "hotel" 8 | email: "debug.pie@gmail.com" 9 | mailon: "ae" 10 | jobout: "oe" 11 | log: "{rule}.{wildcards}.tscc.log" 12 | pmem: "5gb" 13 | 14 | nmf: 15 | queue: "hotel" 16 | nodes: 1 17 | ppn: 4 18 | walltime: "24:00:00" 19 | 20 | post_nmf: 21 | queue: "hotel" 22 | nodes: 1 23 | ppn: 1 24 | walltime: "01:00:00" 25 | 26 | stat: 27 | queue: "hotel" 28 | nodes: 1 29 | ppn: 1 30 | walltime: "01:00:00" 31 | 32 | plot_nmf: 33 | queue: "hotel" 34 | nodes: 1 35 | ppn: 1 36 | walltime: "01:00:00" 37 | 38 | sumNMF: 39 | queue: "hotel" 40 | nodes: 1 41 | ppn: 1 42 | walltime: "01:00:00" 43 | -------------------------------------------------------------------------------- /package/tasks/nmf/profile/config.yaml: -------------------------------------------------------------------------------- 1 | cluster-config: "profile/cluster.yaml" 2 | cluster: "qsub -N {cluster.jobname} -l nodes={cluster.nodes}:ppn={cluster.ppn},pmem={cluster.pmem},walltime={cluster.walltime} -A {cluster.account} -q {cluster.queue} -M {cluster.email} -m {cluster.mailon} -j {cluster.jobout} -o {cluster.log} -V " 3 | jobs: 100 4 | verbose: true 5 | notemp: true 6 | -------------------------------------------------------------------------------- /package/tasks/nmf/supple.01.prepare.nmf.R: -------------------------------------------------------------------------------- 1 | # Prepare data for nmf. 2 | # TODO: This script can be generalised. 3 | # Currently, I use it case by case. 4 | library(data.table) 5 | library(hdf5r) 6 | 7 | packdir <- file.path(here::here(), "package", "R") 8 | import::from(.from = "cembav2env.R", .directory = packdir, 9 | cembav2env) 10 | 11 | # * configs 12 | outdir <- "data" 13 | outh5 <- file.path(outdir, "cpm.cbyp.novlp.Intv2.h5") 14 | outPeakCoordFile <- file.path(outdir, "peaks.novlp.Intv2.txt") 15 | outClusterFile <- file.path(outdir, "clusters.novlp.Intv2.txt") 16 | 17 | # * load atac cpm. 18 | cpm.scbyp <- readRDS(cembav2env$subclassPmatCPMIntv2File) 19 | 20 | # * load non-overlapping cCREs 21 | novlp.CREs <- data.table::fread( 22 | file = cembav2env$nonOvlpDHSPeakBedFile, 23 | sep = "\t", 24 | header = FALSE, 25 | data.table = FALSE 26 | ) 27 | colnames(novlp.CREs) <- c("chr", "start", "end", "name") 28 | novlpCoords <- with(novlp.CREs, 29 | paste(chr, paste(start, end, sep = "-"), sep = ":")) 30 | 31 | cpm.scbyp <- cpm.scbyp[ , novlpCoords] 32 | 33 | ## set cap for too high values 34 | upValue <- quantile(cpm.scbyp, 0.9999) 35 | cpm.capped <- cpm.scbyp 36 | cpm.capped[cpm.scbyp > upValue] <- upValue 37 | 38 | ## change to cluster by peak mat 39 | ## for later saving to hdf5 40 | peaks <- colnames(cpm.capped) 41 | clusters <- rownames(cpm.capped) 42 | 43 | # * save mat to hdf5 format for python handling. 
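## A sketch of reading this file back from Python (assuming h5py is
## available; the group/dataset names match what is written below, and the
## matrix arrives transposed because hdf5r stores column-major):
##   import h5py
##   with h5py.File("data/cpm.cbyp.novlp.Intv2.h5", "r") as f:
##       mat = f["X/mat"][:]  # transposed relative to the R matrix
##       peaks = [s.decode() for s in f["X/colnames"][:]]
##       clusters = [s.decode() for s in f["X/rownames"][:]]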
44 | conn <- hdf5r::H5File$new(outh5, mode = "w") 45 | data.grp <- conn$create_group("X") 46 | # NOTE: hdf5r will transpose the mat 47 | # https://github.com/hhoeflin/hdf5r/issues/81 48 | data.grp[["mat"]] <- cpm.capped 49 | # colnames corresponds to cpm.capped 50 | data.grp[["colnames"]] <- peaks 51 | # rownames corresponds to cpm.capped 52 | data.grp[["rownames"]] <- clusters 53 | conn$close_all() 54 | 55 | write.table(peaks, file = outPeakCoordFile, quote = FALSE, 56 | row.names = FALSE, col.names = FALSE) 57 | write.table(clusters, file = outClusterFile, quote = FALSE, 58 | row.names = FALSE, col.names = FALSE) 59 | -------------------------------------------------------------------------------- /repo_figures/GraphAbstract.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondpie/CEMBA_wmb_snATAC/4605d86a83fe45a38e656ba87df85fe99762f358/repo_figures/GraphAbstract.jpg -------------------------------------------------------------------------------- /repo_figures/GraphAbstract.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondpie/CEMBA_wmb_snATAC/4605d86a83fe45a38e656ba87df85fe99762f358/repo_figures/GraphAbstract.tif -------------------------------------------------------------------------------- /repo_figures/snATAC-seq_analysis_pipeline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondpie/CEMBA_wmb_snATAC/4605d86a83fe45a38e656ba87df85fe99762f358/repo_figures/snATAC-seq_analysis_pipeline.jpg -------------------------------------------------------------------------------- /repo_figures/snATAC-seq_analysis_pipeline.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondpie/CEMBA_wmb_snATAC/4605d86a83fe45a38e656ba87df85fe99762f358/repo_figures/snATAC-seq_analysis_pipeline.pdf -------------------------------------------------------------------------------- /snakemake.template/Makefile: -------------------------------------------------------------------------------- 1 | test_dir: 2 | -mkdir -p $@ 3 | 4 | .PHONY: test_snakemake 5 | test_snakemake : test_dir Snakefile 6 | cp config.yaml $ str: 53 | t = apL4.replace("p", "") 54 | tt: List[str] = t.split(sep="_") 55 | if len(tt) < 2: 56 | return L4meta.loc[t]["Subclass label"] 57 | l3: str = re.sub(r"-\d+$", "", tt[0]) 58 | l4s: List[str] = [f"{l3}-{i}" for i in tt[1:]] 59 | l4s.append(tt[0]) 60 | scs: List[str] = L4meta.loc[l4s]["Subclass label"].unique().tolist() 61 | if len(scs) > 1: 62 | print(f"{apL4} has multiple subclasses.") 63 | return scs[0] 64 | 65 | 66 | scs: List[str] = [map_pL4_to_sc(k) for k in pL4] 67 | 68 | # subclass to CREs 69 | L4toCRE.insert(0, column="sc", value=scs) 70 | sctoCRE = L4toCRE.groupby(by="sc").apply( 71 | lambda x: x[CREs].sum(axis = 0)) 72 | 73 | # output each subclass's CREs 74 | outdir = os.path.join("/Users/szu/git-recipes", 75 | "mouseBrainAtlas/cCRE_heatmap/data", 76 | "subclass2CRE") 77 | def outscCRE(sc: str) -> None: 78 | scnm = sc.replace('/', '-').replace(' ', '_') 79 | outfnm = os.path.join(outdir, f"{scnm}.cCREs.txt") 80 | pd.Series(sctoCRE.columns[sctoCRE.loc[sc] > 0]).to_csv( 81 | outfnm, sep = "\t", header = False, index = False 82 | ) 83 | 84 | for sc in sctoCRE.index.tolist(): 85 | print(sc) 86 | outscCRE(sc) 87 | --------------------------------------------------------------------------------