├── .gitignore ├── 00.data.preprocess ├── Makefile ├── README.org └── src │ ├── main │ ├── R │ │ ├── sa2.05.L1_cluster.R │ │ └── supple.01.update.sample.info.R │ ├── pipeline │ │ ├── alignment.Snakefile │ │ └── snapatac2.qc.Snakefile │ ├── python │ │ ├── sa2.01.preprocess.py │ │ ├── sa2.02.sum.pp.py │ │ ├── sa2.03.l1_embed.py │ │ ├── sa2.03.preL1Clustering.py │ │ ├── sa2.04.l1_knn.py │ │ └── sa2.05.l1_cluster.py │ ├── resource │ │ ├── cluster.json │ │ ├── config.json │ │ ├── mba.whole.sample.lst │ │ └── snapatac2.qc.config.yaml │ └── shell │ │ └── run_alignment.sh │ └── test │ └── python │ └── 01.test.makefragment.py ├── 01.clustering ├── L3_dlt2_cids.txt ├── L4_dlt2_cids.txt ├── README.org ├── Snakefile ├── config.yaml ├── makefile ├── meta │ ├── mba.test.sample │ ├── mba.whole.sample.lst │ └── mm10.blacklist.bed ├── post_script │ ├── build.sbt │ ├── rByMaxSilsL4.csv │ ├── sa2.L1.clustering.barcode2id.py │ ├── sa2.bmat.dlt.compare.py │ ├── sa2.dlt2.sumL3.R │ ├── sa2.dlt2.sumL4.R │ ├── sa2_L1_consensus.R │ ├── sa2_dlt2_L3_rByMaxSils.csv │ ├── sa2_dlt2_L3_sum.xlsx │ └── sa2_dlt2_prepareL2.scala ├── rerun.Snakefile ├── resource │ ├── bmatfile.csv │ ├── bmatfile_L0.csv │ ├── bmatfile_L0_condo.csv │ ├── sa2_L0_cluster2size.csv │ ├── sa2_L1_cluster2size.csv │ ├── sa2_dlt2_L1_cluster2size.csv │ └── sa2_dlt2_L1_cluster2size_test.csv ├── sa2.gmat.Snakefile ├── sa2.qc.dlt.Snakefile └── script │ ├── sa2.clustering.umap.py │ ├── sa2.embed.py │ ├── sa2.get.sample.gmat.py │ ├── sa2.knn.py │ ├── sa2.leiden.py │ ├── sa2.merge.gmat.py │ ├── sa2.merge.rmdlt.py │ ├── sa2.pre.anndataset.py │ ├── sa2.qc.dlt.py │ ├── sa2.rm.dlt.py │ ├── sa2.united.py │ ├── supple.sa2.add.barcode.to.unite.clustering.py │ ├── supple.sa2.bmat.dlt.py │ ├── supple.sa2.get.embed.py │ ├── supple.sa2.prepare.L1.py │ ├── supple.sa2.prepare.L2.in.memory.subset.py │ ├── supple.sa2.prepare.L2.py │ ├── supple.sa2.prepare.L3.R │ └── supple.sa2.prepare.L4.R ├── 02.integration ├── Makefile ├── README.org └── src │ ├── main │ ├── R │ │ ├── TransferLabel.R │ │ ├── analyzetf.R │ │ ├── annToS5.R │ │ ├── downsample.Allen.Seurat.on.subclass.level.R │ │ ├── downsample.sa2.Seurat.on.subclass.level.R │ │ ├── dp.seurat5.intgn.R │ │ ├── getIntUMAP.R │ │ ├── heavySummaryOfTranserlabel_IMN.R │ │ ├── heavySummaryOfTransferLabel.R │ │ ├── lightSummaryOfTransferLabel.R │ │ ├── mapSubclassNames.R │ │ ├── post_1st_tf.R │ │ ├── reciprocal.KNN.R │ │ ├── rknn2.R │ │ ├── rough.annot.L3.using.snATACv1.R │ │ ├── runPCA.R │ │ ├── simple.gene.list.of.allen.R │ │ └── sumSnapATAC2Meta.R │ ├── pipeline │ │ └── Seurat.TransferLabel.Snakefile │ ├── python │ │ ├── 01.extract.allen.py │ │ ├── 02.pseudobulk.allen.py │ │ ├── 02.pseudobulk.sa2.py │ │ ├── imneuron.py │ │ └── reduce.anndata.allen.sa2.py │ └── resource │ │ ├── AIT21_ReadMe.txt │ │ ├── AIT21_annotation.tsv │ │ ├── AIT21_annotation_freeze_081523.tsv │ │ ├── AIT21_cluster_markers.txt │ │ ├── AIT21_k8_markers.txt │ │ ├── AIT21_merfish_markers.txt │ │ ├── AllenIMNTopMajorRegionRelatedAllenCls.txt │ │ ├── AllenIMNTopMajorRegionRelatedL4s.txt │ │ ├── BICCN.BrainRegionMetadata.xlsx │ │ ├── allen_supple_v1 │ │ ├── ED_Table1_complete_CCFv3_ontology.csv │ │ ├── ED_Table5_select.markers.csv │ │ ├── TF263.csv │ │ └── TF499.txt │ │ ├── atac.subclass2size.v1.csv │ │ ├── sa2_dlt2_L3toSa1Annot.rough.csv │ │ └── subclass_nm_in_macs2_bigwig.txt │ └── test │ ├── R │ ├── analyze.Intgn.Seurat5.R │ ├── test.Seurat.with.anndata.R │ └── test.snakemake.wildcards.R │ ├── pipeline │ ├── Makefile │ └── R.Snakefile │ └── python │ └── 
prepare.intg.test.ann.py ├── 03.peakcalling ├── Makefile ├── bin │ └── merge_peaks └── src │ └── main │ ├── R │ ├── addpL4Info2atacMeta.R │ ├── filterPeakByscbgModel.R │ ├── filterPeakFromPseudoBulk.R │ ├── finalizedpeaks.R │ ├── fitbgmodel.R │ ├── iterativeMergePeak.R │ ├── preparePeakCallingByMergeNN.R │ ├── preparePeakCallingByMergeNeuron.R │ ├── subclass2peak.R │ └── sumReproducePeaks.R │ ├── pipeline │ ├── getsa2pmat.Snakefile │ ├── mergePeak.Snakefile │ ├── scfilter.Snakefile │ └── snap2.peakcalling.Snakefile │ ├── python │ ├── get_full_snap2.py │ ├── prepare_bedfiles.py │ ├── run_macs2.py │ ├── sa2_get_peakfrac.py │ └── sa2pmat.py │ ├── resource │ ├── all.pL4.meta.csv │ ├── all_pL4s.txt │ ├── config.yaml │ ├── mba.whole.sample.lst │ └── test_neuron_L4pc2sizes.csv │ └── shell │ ├── export_unionpeak.sh │ ├── get_reproduce_peak_within_cluster.sh │ └── intersect_mergepeak.sh ├── 04.nmf ├── Makefile └── src │ └── main │ ├── R │ ├── 01.prepare.nmf.R │ ├── 02.nmfATAC.plotH.R │ ├── 02.nmfATAC.plotW.R │ ├── 02.nmfATAC.statBox.R │ ├── 03.sumnmf.R │ ├── 04.nmf.plot.R │ └── 05.splitPeakByModule.R │ ├── pipeline │ └── nmf.Snakefile │ ├── python │ ├── 02.nmf.py │ ├── 02.nmfATAC.stat.py │ └── 02.post_nmf.py │ └── resource │ └── config.yaml ├── 05.cCREgene └── sa2.cicero │ ├── Makefile │ └── src │ └── main │ ├── R │ ├── 03.filterCiceroByShuf.R │ ├── 06.summaryDistalProximalConns.R │ ├── 07.cor.scRNAseq.R │ ├── 08.summarize.cor.R │ ├── 09.sa2.subclass.specific.ppdc.R │ ├── cicero_mouse_atlas.R │ ├── run_cicero.R │ ├── run_cicero_shuffle.R │ ├── sa2.pdc.of.globalpeaks.R │ ├── supple.07.01.get.pdc.and.rdm.gene2peak.R │ └── supple.07.02.get.RNA.ATAC.cpm.R │ ├── pipeline │ ├── pdc.Snakefile │ └── runCicero.Snakefile │ ├── resource │ └── config.yaml │ └── shell │ ├── 04.addTSSAnnot2Conns.sh │ ├── 05.mergeDistalProximalConns.sh │ ├── 09.get.pos.neg.pdc.info.sh │ ├── alignv1.to.bedpe.sh │ ├── sa2.all.distal.peaks.sh │ └── supple.02.annotPeakBasedOnTSS.sh ├── 06.motifanalysis ├── Makefile ├── README.org └── src │ └── main │ ├── R │ └── 05.splitPeakByModule.R │ ├── pipeline │ └── motif.Snakefile │ └── python │ └── test.scienicplus.py ├── 07.m3C ├── README.org ├── hic2 │ └── hic2.sh ├── runHiC.sh └── subclass.txt ├── 08.GRN ├── Makefile └── src │ └── main │ ├── pipeline │ └── celloracle.Snakefile │ ├── python │ ├── 01.runGimmemotifs.py │ ├── 02.mergeGimme.py │ ├── 03.seurat2anndata.py │ ├── 04.runGRN.py │ └── 05.plot.powerlaw.py │ └── resource │ ├── CisBP_ver2_Mus_musculus.motif2factors.txt │ ├── config.yaml │ └── sa2.allen.vf3281.gene.txt ├── 09.cCRE_conservation ├── 01.reciLiftOver.sh └── 02.orthologous.R ├── 10.cCRE_TE ├── 01.highTE.subclass.R └── 02.TE.variability.R ├── 11.deeplearning ├── README.org └── src │ └── main │ └── resource │ └── mappedHMB.txt ├── LICENSE ├── README.org ├── manuscript_figures ├── Fig1.R ├── Fig5.R ├── paper.R ├── sa2.Fig2.R ├── sa2.Fig3.R ├── sa2.Fig4.R └── sa2.sc2region.R ├── meta ├── BICCN.BrainRegionMetadata.xlsx ├── BrainRegion.Metadata.txt ├── allen.region.to.main.region.v2.txt ├── allen_subclass_RegionMeta.csv ├── atac_L4_MajorRegion.csv ├── dissect2time.csv ├── ensemble.genesymbol.allengenesymbol.csv ├── gencode.vM16.geneUp2k.bed ├── gencode.vM23.gene.tssUpDn1k.bed ├── getGeneUp2K.sh ├── makefile ├── mm10-blacklist.v2.bed ├── mm10.blacklist.bed ├── mm10.chrom.sizes ├── mm10.chrom.sizes.lite ├── mouse.modified.gencode.vM23.bed ├── mouse.modified.gencode.vM23.gene.up2k.bed ├── neuron_cell_markers.csv ├── sa2.subclass.names.map.csv ├── sa2.subclass.srt.txt ├── 
sa2.subclass2region2score.csv ├── sample2bamfile.csv ├── sample2rawbam.csv ├── subclass_and_genemarker_CEMBAv1.xlsx └── whole.brain.cellname.org ├── package ├── R │ ├── annot.R │ ├── bed.R │ ├── cembav2env.R │ ├── cicero.R │ ├── colors.R │ ├── dendro.R │ ├── gglot.theme.R │ ├── gmat.R │ ├── grn.R │ ├── hc.R │ ├── hdf5.R │ ├── igv.R │ ├── integration.R │ ├── loadSnap.R │ ├── peak.R │ ├── plot.R │ ├── prob.R │ ├── region.R │ └── utils.R ├── python │ ├── bedpe2bigwig.py │ ├── cembav2env.py │ ├── colors.py │ ├── leiden.py │ ├── myanndata.py │ ├── mycelloracle.py │ ├── mylog.py │ ├── mysnapatac2.py │ ├── snap2h5ad.py │ └── utils.py └── tasks │ ├── getAllL3SnapMat │ ├── L2GroupAll.csv │ ├── Makefile │ ├── Snakefile.template │ ├── bmat │ │ └── config.json │ ├── getSnapATACMat.R │ ├── profile.template │ │ ├── cluster.yaml │ │ └── config.yaml │ └── vM16gmat │ │ ├── L2GroupAll.csv │ │ └── config.json │ ├── getSnapATACMatByGroup │ ├── L1Group.csv │ ├── L2GroupAll.csv │ ├── L2GroupTest.csv │ ├── L2MultiGroup.csv │ ├── Makefile │ ├── Snakefile.template │ ├── config.json.template │ ├── configL1.json.template │ ├── configMultiGroup.json.template │ ├── getSnapATACMatByGroup.R │ └── profile.template │ │ ├── cluster.yaml │ │ └── config.yaml │ └── nmf │ ├── 01.prepare.nmf.R │ ├── 02.nmf.py │ ├── 02.nmfATAC.plotH.R │ ├── 02.nmfATAC.plotW.R │ ├── 02.nmfATAC.stat.py │ ├── 02.nmfATAC.statBox.R │ ├── 02.post_nmf.py │ ├── 03.sumnmf.R │ ├── 04.nmf.plot.R │ ├── 05.splitPeakByModule.R │ ├── Makefile │ ├── config.yaml │ ├── nmf.Snakefile │ ├── profile │ ├── .cluster.yaml.~undo-tree~ │ ├── cluster.yaml │ └── config.yaml │ └── supple.01.prepare.nmf.R ├── repo_figures ├── GraphAbstract.jpg ├── GraphAbstract.tif ├── snATAC-seq_analysis_pipeline.jpg └── snATAC-seq_analysis_pipeline.pdf ├── snakemake.template ├── Makefile ├── README.org ├── Snakefile ├── config.yaml ├── pbs.demo.sh └── profile │ ├── cluster.yaml │ └── config.yaml └── supple.datashare ├── paper.R ├── read.Supple.Nature.py ├── sa2.supplementary.tables.R └── share.peak.by.majorRegion.R /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | /meta/._subclass_and_genemarker_CEMBAv1.xlsx 131 | /repo_figures/._snATAC-seq_analysis_pipeline.jpg 132 | /repo_figures/._snATAC-seq_analysis_pipeline.pdf 133 | /repo_figures/._GraphAbstract.tif 134 | /repo_figures/._GraphAbstract.jpg 135 | /data/ 136 | /00.data.preprocess/out/ 137 | /00.data.preprocess/src/test/python/.tmprOXZWn/ 138 | *.Rhistory 139 | /._.gitignore 140 | -------------------------------------------------------------------------------- /00.data.preprocess/Makefile: -------------------------------------------------------------------------------- 1 | sa2_pp: snapatac2.qc.Snakefile 2 | snakemake -c 1 --config system=silencer \ 3 | --snakefile $< -R --rerun-incomplete --profile profile 4 | 5 | 6 | encoder_sa2_l1_embed: snapatac2.qc.Snakefile 7 | snakemake -c 4 --config system=silencer debug=1 \ 8 | --snakefile $< --until snapatac2_l1_embed -R --rerun-incomplete 9 | 10 | encoder_sa2_l1_knn_hora: snapatac2.qc.Snakefile 11 | snakemake -c 4 --config system=silencer debug=1 \ 12 | knn_method=hora \ 13 | --snakefile $< --until snapatac2_l1_knn -R --rerun-incomplete 14 | 15 | encoder_sa2_l1_knn_exact: snapatac2.qc.Snakefile 16 | snakemake -c 4 --config system=silencer debug=1 \ 17 | knn_method=exact \ 18 | --snakefile $< --until snapatac2_l1_knn -R --rerun-incomplete 19 | 20 | .PHONY: tscc_sa2_l1_embed 21 | tscc_sa2_l1_embed: snapatac2.qc.Snakefile snapatac2.qc.config.yaml 22 | -mkdir -p $@ 23 | cp $(word 2,$^) $@/$(word 2,$^) 24 | cp $< $@/$< 25 | cp -R profile/. 
$@/profile
26 | 	cd $@ && \
27 | 	snakemake -c 1 --config system=tscc debug=1 \
28 | 	--snakefile $< --until snapatac2_l1_embed -R \
29 | 	--rerun-incomplete --profile profile
30 | 
31 | .PHONY: tscc_sa2_l1_knn_exact
32 | l1_knn_dir := tscc_sa2_l1_embed
33 | tscc_sa2_l1_knn_exact: snapatac2.qc.Snakefile snapatac2.qc.config.yaml
34 | 	cp $(word 2,$^) ${l1_knn_dir}/$(word 2,$^)
35 | 	cp $< ${l1_knn_dir}/$<
36 | 	cp -R profile/. ${l1_knn_dir}/profile
37 | 	cd ${l1_knn_dir} && \
38 | 	snakemake -c 1 --config \
39 | 	system=tscc debug=1 \
40 | 	knn_method=exact \
41 | 	--snakefile $< --until snapatac2_l1_knn -R \
42 | 	--rerun-incomplete --profile profile
43 | 
44 | 
45 | # test
46 | test_sa2_pp: snapatac2.qc.Snakefile
47 | 	snakemake -c 1 --config system=imac debug=1 \
48 | 	--snakefile $< -R --rerun-incomplete
49 | 
50 | test_sa2_pp_tscc: snapatac2.qc.Snakefile
51 | 	snakemake -c 1 --config system=tscc debug=1 \
52 | 	--snakefile $< -R --rerun-incomplete
53 | 
54 | test_sa2_l1_embed: snapatac2.qc.Snakefile
55 | 	snakemake -c 2 --config system=imac debug=1 \
56 | 	--snakefile $< --until snapatac2_l1_embed -R --rerun-incomplete
57 | 
58 | test_sa2_l1_knn: snapatac2.qc.Snakefile
59 | 	snakemake -c 2 --config system=imac debug=1 \
60 | 	--snakefile $< --until snapatac2_l1_knn -R --rerun-incomplete
61 | 
62 | .PHONY: clean
63 | clean:
64 | 	-rm snapatac2_l1_embed..03*
65 | 	-rm snapatac2_pp.sample=*.o3*
66 | 	-rm -rf .tmp*
67 | 
-------------------------------------------------------------------------------- /00.data.preprocess/README.org: --------------------------------------------------------------------------------
1 | * Alignment
2 | - All the bam files are generated by snaptools, an older
3 |   Python package developed for the SnapATAC R package.
4 | - In snaptools, BWA is used for the alignment.
5 | - See details under src/main/pipeline/alignment.Snakefile
6 | - The config.json and cluster.json are under src/main/resource
7 | 
8 | * Quality Control and Doublet Removal
9 | - After generating the bam files, we use SnapATAC2 for our
10 |   downstream analysis.
11 | - See details under src/main/pipeline/snapatac2.qc.Snakefile.
12 | - The snapatac2.qc.config.yaml is under src/main/resource.
13 | ** Quality control
14 | - At the bulk level, we check the quality of the sequencing based on
15 |   the sequencing depth, the number of unique fragments, and so on. The
16 |   most important check is whether the fragment sizes in each
17 |   sample are enriched around 100 bp and 200 bp (see [[https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-1929-3][ref]]).
18 | - At the single-cell level, following the
19 |   [[https://www.encodeproject.org/atac-seq/][ENCODE pipeline]], we use two criteria to filter cells:
20 |   1. Number of unique fragments >= 1,000
21 |   2. Transcription start site enrichment (TSSe) >= 10
22 | ** Doublet removal
23 | - We use Scrublet for this; the gene expression imputed from the
24 |   snATAC-seq data is similar to what Seurat computes.
25 | 
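** Example: QC and doublet removal in SnapATAC2
A minimal sketch of the flow above, for orientation only; it is not
the pipeline code. The sample and file names are placeholders, and
SnapATAC2 function signatures vary slightly across versions. See
src/main/pipeline/snapatac2.qc.Snakefile and
src/main/resource/snapatac2.qc.config.yaml for the actual
implementation and parameters.
#+BEGIN_SRC python
import snapatac2 as sa2

# Import fragments for one sample into a backed AnnData file
# (the sample name here is a placeholder).
adata = sa2.pp.import_data(
    "CEMBA180110_4E.fragments.tsv.gz",
    chrom_sizes=sa2.genome.mm10,
    file="CEMBA180110_4E.h5ad",
    sorted_by_barcode=False,
)

# Single-cell QC: keep barcodes with >= 1,000 unique fragments
# and TSS enrichment >= 10, as stated above.
sa2.metrics.tsse(adata, sa2.genome.mm10)
sa2.pp.filter_cells(adata, min_counts=1000, min_tsse=10)

# Doublet removal with SnapATAC2's Scrublet implementation; the
# scripts in this repository keep cells with doublet
# probability <= 0.5.
sa2.pp.add_tile_matrix(adata)
sa2.pp.select_features(adata)
sa2.pp.scrublet(adata)
sa2.pp.filter_doublets(adata, probability_threshold=0.5)
adata.close()
#+END_SRC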
-------------------------------------------------------------------------------- /00.data.preprocess/src/main/R/supple.01.update.sample.info.R: --------------------------------------------------------------------------------
1 | library(data.table)
2 | 
3 | # Feedback from Hanqing
4 | ## {"AMY-1": "AMY-2", "AMY-2": "AMY-1"}, {"ACB-2": "CP-1", "CP-1": "ACB-2"}
5 | ## 1. For the AMY pair, I am not sure on whose side the swap happened, since the cell types are quite similar; but the two samples are definitely different.
6 | ## 2. For the CP pair, the swap should be on the ATAC side.
7 | 
8 | # For current sample data we have
9 | ## ACB-2: 4E, CEMBA180110_4E and CEMBA180111_4E
10 | ## CP-1: 4D, CEMBA171214_4D and CEMBA171219_4D
11 | ## AMY-1: 7H, CEMBA200820_7H and CEMBA200827_7H
12 | ## AMY-2: 8H, CEMBA200903_8H and CEMBA200910_8H
13 | 
14 | 
15 | # ATAC-seq experiments record:
16 | # https://docs.google.com/spreadsheets/d/1HbYP0tLpv4rPwkJn6uZPjR_M7dnIRglzeTcY9Os-CVA/edit#gid=1893884661
17 | 
18 | # ATAC-seq LIMS spreadsheet
19 | # https://docs.google.com/spreadsheets/d/1UPkKv3potJtNEbYxkpMY5X_V1xgkRbkvkW4Qi14o1x4/edit#gid=1307716493
20 | 
21 | # Comments from Yang
22 | ## The brain dissection IDs (4D: CP-1, etc.) are definitely consistent; there is no doubt about that.
23 | ## From the description above, it is unclear which region CP-1 was swapped with. We can check the lab records to see whether the two swapped samples were processed on the same day; if not, they are easy to tell apart.
24 | ## Assuming Marga did not mislabel the samples: after the tissue was sent to the two labs at the same time, one of the labs mislabeled its samples.
25 | ## So we cross-check the dates recorded by Marga, by Hanqing, and by us.
26 | 
27 | ## Experiments are usually run in batches of four samples, e.g., CP-1 plus three other samples on the 1st, and ACB-2 plus three other samples on the 5th.
28 | ## Hypothetically:
29 | ## Marga, Hanqing, Ren Lab
30 | ## CP-1 1 5 1
31 | ## others 1 5 1
32 | ## ACB-1 5 1 5
33 | ## Then you can tell where the error occurred.
34 | 
-------------------------------------------------------------------------------- /00.data.preprocess/src/main/python/sa2.04.l1_knn.py: --------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from pathlib import Path
4 | import logging
5 | import numpy as np
6 | 
7 | import argparse
8 | import shutil
9 | 
10 | import snapatac2 as sa2
11 | import pyprojroot
12 | code_root_dir = str(pyprojroot.here())
13 | pack_dir = f"{code_root_dir}/package/python"
14 | sys.path.insert(0, pack_dir)
15 | import utils # type: ignore # noqa: E402
16 | 
17 | parser = argparse.ArgumentParser("snapatac2 L1 KNN")
18 | parser.add_argument("--embed_file", type = str)
19 | parser.add_argument("--outf", type = str, default = "test_knn.hdf5")
20 | parser.add_argument("--kmethod", type = str, default = 'exact')
21 | parser.add_argument("--knn", type = int, default = 50)
22 | parser.add_argument("--logfile", type = str,
23 |                     default = "log/test_sa2_l1_embed_knn.log")
24 | parser.add_argument("--debug", type = int, default = 1)
25 | parser.add_argument("-i", "--ipython", action = "store_true")
26 | parser.add_argument("--simple-prompt", action = "store_true")
27 | 
28 | args = parser.parse_args()
29 | 
30 | # * set log
31 | logger = utils.set_file_logger(fnm = args.logfile, #type: ignore
32 |                                name = "sa2.04.l1_knn")
33 | if args.debug == 0:
34 |     debug = False
35 | else:
36 |     debug = True
37 |     logger.warning("DEBUG mode is on")
38 | 
39 | # * meta
40 | k = args.knn
41 | km = args.kmethod
42 | embed_file = args.embed_file
43 | if not os.path.exists(embed_file):
44 |     err_msg = f"{embed_file} is not found"
45 |     logger.error(err_msg)
46 |     sys.exit(err_msg)
47 | outf = args.outf
48 | outdir = os.path.dirname(outf)
49 | if os.path.exists(outf):
50 |     logger.warning(f"{outf} exists; removing it.")
51 |     os.remove(outf)
52 | else:
53 |     os.makedirs(outdir, exist_ok = True)
54 | 
55 | 
56 | # * main
57 | logger.info(f"Copy {embed_file} to {outf}")
58 | shutil.copyfile(src = embed_file,
59 |                 dst = outf)
60 | 
61 | logger.info(f"Read AnnData to RAM from: {outf}")
62 | sds = sa2.read(outf, backend = None)
63 | 
64 | logger.info("Start to run KNN.")
65 | sa2.pp.knn(
66 |     adata = sds,
67 |     n_neighbors = k,
68 |     use_dims = None,
69 |     use_rep = 'X_spectral',
70 |     method = km,
71 |     inplace = True,
72 |     random_state = 0
73 | )
74 | sds.close()
75 | logger.info("Done")
76 | 
--------------------------------------------------------------------------------
/00.data.preprocess/src/main/resource/cluster.json: -------------------------------------------------------------------------------- 1 | { 2 | "__default__" : 3 | { 4 | "time" : "walltime=48:00:00", 5 | "ppn" : "nodes=1:ppn=4", 6 | "queue" : "hotel" 7 | }, 8 | "snap_align" : 9 | { 10 | "time" : "walltime=48:00:00", 11 | "ppn" : "nodes=1:ppn=4", 12 | "queue" : "hotel" 13 | }, 14 | "snap_pre" : 15 | { 16 | "time" : "walltime=24:00:00", 17 | "ppn" : "nodes=1:ppn=4", 18 | "queue" : "hotel" 19 | }, 20 | "snap_add_bmat" : 21 | { 22 | "time" : "walltime=24:00:00", 23 | "ppn" : "nodes=1:ppn=2", 24 | "queue" : "hotel" 25 | }, 26 | "snap_add_gmat" : 27 | { 28 | "time" : "walltime=24:00:00", 29 | "ppn" : "nodes=1:ppn=2", 30 | "queue" : "hotel" 31 | }, 32 | "pre_sta" : 33 | { 34 | "time" : "walltime=24:00:00", 35 | "ppn" : "nodes=1:ppn=5", 36 | "queue" : "hotel" 37 | }, 38 | "cluster" : 39 | { 40 | "time" : "walltime=24:00:00", 41 | "ppn" : "nodes=1:ppn=6", 42 | "queue" : "hotel" 43 | }, 44 | "plotGene" : 45 | { 46 | "time" : "walltime=24:00:00", 47 | "ppn" : "nodes=1:ppn=2", 48 | "queue" : "hotel" 49 | }, 50 | "dump_frag" : 51 | { 52 | "time" : "walltime=24:00:00", 53 | "ppn" : "nodes=1:ppn=2", 54 | "queue" : "home" 55 | }, 56 | "tsse2depth" : 57 | { 58 | "time" : "walltime=24:00:00", 59 | "ppn" : "nodes=1:ppn=2", 60 | "queue" : "home" 61 | }, 62 | "bam2bedpe": 63 | { 64 | "time" : "walltime=24:00:00", 65 | "ppn" : "nodes=1:ppn=1", 66 | "queue" : "hotel" 67 | }, 68 | "snap2cb" : 69 | { 70 | "time" : "walltime=1:00:00", 71 | "ppn" : "nodes=1:ppn=1", 72 | "queue" : "glean" 73 | } 74 | } 75 | 76 | 77 | -------------------------------------------------------------------------------- /00.data.preprocess/src/main/resource/snapatac2.qc.config.yaml: -------------------------------------------------------------------------------- 1 | system: imac 2 | conda: 3 | imac: /Users/szu/mambaforge 4 | tscc: /projects/ps-renlab2/szu/miniconda3 5 | silencer: /projects/ps-renlab2/szu/miniconda3 6 | conda_env: 7 | imac: sa2dev 8 | silencer: sa2dev 9 | tscc: sa2dev_tscc 10 | python: 11 | imac: /Users/szu/mambaforge/envs/sa2dev/bin/python 12 | silencer: /projects/ps-renlab2/szu/miniconda3/envs/sa2dev/bin/python 13 | tscc: /projects/ps-renlab2/szu/miniconda3/envs/sa2dev/bin/python 14 | project_dir: 15 | imac: /Users/szu/git-recipes/mouseBrainAtlas/CEMBA2 16 | silencer: /projects/ps-renlab2/szu/projects/CEMBA2 17 | tscc: /projects/ps-renlab2/szu/projects/CEMBA2 18 | work_dir: 19 | imac: /Users/szu/git-recipes/mouseBrainAtlas/CEMBA2/00.data.preprocess 20 | silencer: /projects/ps-renlab2/szu/projects/CEMBA2/00.data.preprocess 21 | tscc: /projects/ps-renlab2/szu/projects/CEMBA2/00.data.preprocess 22 | out_dir: snapatac2_pp_out 23 | sample2bamfile: 24 | imac: 00.data.preprocess/test_sample2bamfile.csv 25 | silencer: meta/sample2rawbam.csv 26 | tscc: 00.data.preprocess/test_sample2bamfile_tscc.csv 27 | chrom_size_file: meta/mm10.chrom.sizes.lite 28 | gtf_file: meta/gencode.vM23.gene.annot2.gtf 29 | blacklist_file: meta/mm10.blacklist.bed 30 | samples_file: 31 | imac: 00.data.preprocess/test_samples.txt 32 | silencer: meta/CEMBA_all_samples.txt 33 | tscc: meta/CEMBA_all_samples.txt 34 | embed_nfeature: 500000 35 | embed_ncomps: 50 36 | embed_sample_size: 1.0 37 | distance_metric: cosine 38 | knn: 50 39 | knn_method: hora 40 | debug: 0 41 | 42 | -------------------------------------------------------------------------------- /00.data.preprocess/src/main/shell/run_alignment.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #Author: Yang Li 4 | #File: run.sh 5 | #Create Date: Sat Feb 26 10:26:52 PST 2022 6 | 7 | snakemake -p --rerun-incomplete -k -j 128 --cluster "qsub -l {cluster.ppn} -l {cluster.time} -N {params.jobname} -q {cluster.queue} -o pbslog/{params.jobname}.pbs.out -e pbslog/{params.jobname}.pbs.err" --jobscript jobscript.pbs --jobname "{rulename}.{jobid}.pbs" --cluster-config cluster.json 2>run.log 8 | -------------------------------------------------------------------------------- /00.data.preprocess/src/test/python/01.test.makefragment.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import os 3 | import snapatac2 as sa2 4 | import pyprojroot 5 | 6 | proj_root = pyprojroot.here() 7 | raw_bam_dir: str = os.path.join(proj_root, "data", 8 | "raw_bam_test") 9 | dedup_bam_dir: str = os.path.join(proj_root, "data", 10 | "r") 11 | samples: List[str] = [ 12 | "CEMBA181023_6B", "CEMBA201210_10D"] 13 | outdir = os.path.join(proj_root, "00.data.preprocess", 14 | "out/test") 15 | 16 | for s in samples: 17 | stat = sa2.pp.make_fragment_file( 18 | bam_file = os.path.join(raw_bam_dir, f"{s}.bam"), 19 | output_file = os.path.join(outdir, 20 | f"{s}.frag.rawbam.tsv"), 21 | is_paired = True, 22 | barcode_regex = "^(\w+):.+", 23 | shift_left = 4, 24 | shift_right = -4, 25 | min_mapq = 30 26 | ) 27 | print(stat) 28 | 29 | for s in samples: 30 | stat = sa2.pp.make_fragment_file( 31 | bam_file = os.path.join(proj_root, "data", 32 | "filtered_dedup_bam", 33 | f"{s}.filtered_dedup.bam"), 34 | output_file = os.path.join(outdir, 35 | f"{s}.frag.filtered_debup_bam.tsv"), 36 | is_paired = True, 37 | barcode_regex = "^(\w+):.+", 38 | shift_left = 4, 39 | shift_right = -4, 40 | min_mapq = 30 41 | ) 42 | 43 | -------------------------------------------------------------------------------- /01.clustering/L3_dlt2_cids.txt: -------------------------------------------------------------------------------- 1 | 1-1 2 | 1-10 3 | 1-11 4 | 1-12 5 | 1-13 6 | 1-14 7 | 1-15 8 | 1-16 9 | 1-17 10 | 1-18 11 | 1-19 12 | 1-2 13 | 1-20 14 | 1-21 15 | 1-22 16 | 1-23 17 | 1-24 18 | 1-25 19 | 1-26 20 | 1-27 21 | 1-28 22 | 1-29 23 | 1-3 24 | 1-30 25 | 1-4 26 | 1-5 27 | 1-6 28 | 1-7 29 | 1-8 30 | 1-9 31 | 10-1 32 | 10-2 33 | 10-3 34 | 11-1 35 | 11-2 36 | 11-3 37 | 11-4 38 | 12-1 39 | 12-2 40 | 12-3 41 | 12-4 42 | 13-1 43 | 13-2 44 | 13-3 45 | 13-4 46 | 13-5 47 | 13-6 48 | 13-7 49 | 13-8 50 | 14-1 51 | 14-2 52 | 14-3 53 | 14-4 54 | 14-5 55 | 14-6 56 | 14-7 57 | 14-8 58 | 15-1 59 | 15-2 60 | 15-3 61 | 16-1 62 | 16-2 63 | 16-3 64 | 16-4 65 | 16-5 66 | 16-6 67 | 17-1 68 | 17-2 69 | 17-3 70 | 17-4 71 | 17-5 72 | 17-6 73 | 17-7 74 | 17-8 75 | 17-9 76 | 18-1 77 | 18-2 78 | 18-3 79 | 18-4 80 | 18-5 81 | 18-6 82 | 19-1 83 | 19-2 84 | 19-3 85 | 19-4 86 | 19-5 87 | 19-6 88 | 19-7 89 | 19-8 90 | 2-1 91 | 2-10 92 | 2-11 93 | 2-12 94 | 2-13 95 | 2-14 96 | 2-15 97 | 2-16 98 | 2-17 99 | 2-18 100 | 2-19 101 | 2-2 102 | 2-20 103 | 2-21 104 | 2-22 105 | 2-23 106 | 2-24 107 | 2-25 108 | 2-26 109 | 2-27 110 | 2-28 111 | 2-29 112 | 2-3 113 | 2-4 114 | 2-5 115 | 2-6 116 | 2-7 117 | 2-8 118 | 2-9 119 | 20-1 120 | 20-2 121 | 20-3 122 | 20-4 123 | 20-5 124 | 21-1 125 | 21-2 126 | 21-3 127 | 21-4 128 | 21-5 129 | 21-6 130 | 22-1 131 | 22-2 132 | 22-3 133 | 23-1 134 | 23-10 135 | 23-11 136 | 23-2 137 | 23-3 138 | 23-4 139 | 23-5 140 | 23-6 141 | 23-7 142 | 23-8 143 | 23-9 144 | 24-1 145 | 24-2 146 | 24-3 
147 | 24-4 148 | 24-5 149 | 25-1 150 | 25-2 151 | 25-3 152 | 25-4 153 | 25-5 154 | 26-1 155 | 26-2 156 | 26-3 157 | 26-4 158 | 26-5 159 | 27-1 160 | 27-2 161 | 28-1 162 | 28-2 163 | 28-3 164 | 28-4 165 | 28-5 166 | 28-6 167 | 28-7 168 | 29-1 169 | 29-2 170 | 3-1 171 | 3-2 172 | 3-3 173 | 3-4 174 | 3-5 175 | 30-1 176 | 30-2 177 | 30-3 178 | 30-4 179 | 30-5 180 | 30-6 181 | 30-7 182 | 30-8 183 | 31-1 184 | 31-2 185 | 32-1 186 | 32-2 187 | 33-1 188 | 33-2 189 | 33-3 190 | 34-1 191 | 34-2 192 | 35-1 193 | 35-2 194 | 35-3 195 | 35-4 196 | 36-1 197 | 36-2 198 | 37-1 199 | 37-2 200 | 37-3 201 | 37-4 202 | 4-1 203 | 4-2 204 | 4-3 205 | 4-4 206 | 5-1 207 | 5-2 208 | 5-3 209 | 5-4 210 | 6-1 211 | 6-10 212 | 6-2 213 | 6-3 214 | 6-4 215 | 6-5 216 | 6-6 217 | 6-7 218 | 6-8 219 | 6-9 220 | 7-1 221 | 7-10 222 | 7-11 223 | 7-12 224 | 7-13 225 | 7-2 226 | 7-3 227 | 7-4 228 | 7-5 229 | 7-6 230 | 7-7 231 | 7-8 232 | 7-9 233 | 8-1 234 | 8-10 235 | 8-2 236 | 8-3 237 | 8-4 238 | 8-5 239 | 8-6 240 | 8-7 241 | 8-8 242 | 8-9 243 | 9-1 244 | 9-2 245 | 9-3 246 | 9-4 247 | 9-5 248 | 9-6 249 | 
-------------------------------------------------------------------------------- /01.clustering/README.org: --------------------------------------------------------------------------------
1 | * CEMBA snATAC-seq clustering using SnapATAC2
2 | ** Installation of SnapATAC2
3 | #+BEGIN_SRC shell
4 | # Install a mamba environment named sa2 with python=3.10 or newer.
5 | # Then
6 | mamba activate sa2 \
7 | && mamba install -c conda-forge -c bioconda snakemake \
8 | && mamba install -c anaconda cmake
9 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh \
10 | && rustup default nightly
11 | git clone https://github.com/kaizhang/SnapATAC2.git ~/softwares/SnapATAC2 \
12 | && cd ~/softwares/SnapATAC2/snapatac2-python && pip install .
13 | # current numba depends on numpy=1.24
14 | pip install numpy==1.24 && pip install ipython pyprojroot matplotlib
15 | #+END_SRC
16 | 
17 | ** NOTE
18 | *** TODO save L1-level QC, doublet-removal, and clustering results
19 | *** Clustering is performed under ps-renlab.
20 | *** How to run the Scala file
21 | 1. enter post_script
22 | 2. run: ~sbt~
23 | 3. then run: ~run~, which runs Hello in sa2_dlt2_prepareL2.scala
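** Clustering workflow sketch
The per-round clustering implemented by the scripts under script/
(sa2.embed.py, sa2.knn.py, sa2.leiden.py, sa2.clustering.umap.py)
follows the standard SnapATAC2 flow: feature selection, spectral
embedding, KNN graph, Leiden clustering over a resolution sweep, and
UMAP. The sketch below is illustrative only; the parameter values
mirror the config.yaml that follows, the input file name is a
placeholder, and exact SnapATAC2 signatures may differ across versions.
#+BEGIN_SRC python
import snapatac2 as sa2

# Placeholder input; the pipeline reads real paths from config.yaml.
adata = sa2.read("merge_cemba_all.h5ad")

# Embedding (config: embed_nfeat 500000, embed_ncomp 30).
sa2.pp.select_features(adata, n_features=500_000)
sa2.tl.spectral(adata, n_comps=30, distance_metric="cosine")

# KNN graph on the embedding (config: knn n 50, method exact).
sa2.pp.knn(adata, n_neighbors=50, use_rep="X_spectral", method="exact")

# Leiden over the resolution sweep (config: minr 0.1, maxr 2.0, byr 0.1).
for i in range(1, 21):
    r = i / 10
    sa2.tl.leiden(adata, resolution=r,
                  objective_function="modularity",
                  key_added=f"leiden_r{r}")

# UMAP for visualization (config: min_dist 0.01, a 1.8956, b 0.8006).
sa2.tl.umap(adata, use_rep="X_spectral", random_state=0)
adata.close()
#+END_SRC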
-------------------------------------------------------------------------------- /01.clustering/config.yaml: --------------------------------------------------------------------------------
1 | system: imac
2 | ## clustering on which level:
3 | ## e.g., L1 means we perform L2-level clustering on the L1 clusters
4 | clustering_level: L1
5 | retries: 1
6 | conda:
7 |   imac: sa2
8 |   encoder: sa2
9 |   tscc: sa2
10 |   tscc_test: sa2
11 |   test_unite: sa2
12 | 
13 | project_dir:
14 |   imac: /Users/szu/git-recipes/mouseBrainAtlas/CEMBA2
15 |   encoder: /projects/ps-renlab/szu/projects/CEMBA2
16 |   tscc: /projects/ps-renlab/szu/projects/CEMBA2
17 |   tscc_test: /projects/ps-renlab/szu/projects/CEMBA2
18 |   test_unite: /Users/szu/git-recipes/mouseBrainAtlas/CEMBA2
19 | 
20 | work_dir:
21 |   imac: /Users/szu/git-recipes/mouseBrainAtlas/CEMBA2/17.snapatac2
22 |   encoder: /projects/ps-renlab/szu/projects/CEMBA2/17.snapatac2
23 |   #tscc: /oasis/tscc/scratch/szu/projects/CEMBA2/17.snapatac2
24 |   # use condo since scratch has slow IO.
25 |   tscc: /projects/ps-renlab/szu/projects/CEMBA2/17.snapatac2
26 |   tscc_test: /oasis/tscc/scratch/szu/projects/CEMBA2/17.snapatac2
27 |   test_unite: /users/szu/test/17.snapatac2
28 | out_dir: result
29 | blacklist_file: 17.snapatac2/meta/mm10.blacklist.bed
30 | barcode2id_file: 17.snapatac2/resource/barcode2id.csv
31 | sample2fragment_file: 17.snapatac2/resource/sample2fragment.csv
32 | pre_clustering_meta: 17.snapatac2/resource/cluster2size.csv
33 | cemba_anndata_file: 17.snapatac2/resource/merge_cemba_all.h5ad
34 | max_united_size: 200000
35 | embed_nfeat: 500000
36 | embed_nsample: 3000000
37 | embed_ncomp: 30
38 | embed_name: nfeat-default_nsample-all_nc30
39 | embed:
40 |   nfeat: 500000
41 |   ncomp: 30
42 |   embed_nsample: 3000000
43 |   name: allfeat_nc30
44 | knn:
45 |   n: 50
46 |   method: exact
47 |   name: knn50
48 | leiden:
49 |   n_iter: -1
50 |   weight: True
51 |   obj: modularity
52 |   min_size: 50
53 |   repeat: 1
54 |   minr: 0.1
55 |   maxr: 2.0
56 |   byr: 0.1
57 |   seed: 0
58 |   name: igraph_leiden_modularity
59 |   n_sample: 20000
60 | umap:
61 |   n_neigh: 15
62 |   n_comp: 2
63 |   metric: euclidean
64 |   init: spectral
65 |   min_dist: 0.01
66 |   a: 1.8956
67 |   b: 0.8006
68 |   seed: 0
69 | 
70 | 
71 | 
72 | 
-------------------------------------------------------------------------------- /01.clustering/meta/mba.test.sample: --------------------------------------------------------------------------------
1 | CEMBA171206_3C
2 | CEMBA171207_3C
-------------------------------------------------------------------------------- /01.clustering/post_script/build.sbt: --------------------------------------------------------------------------------
1 | ThisBuild / scalaVersion := "3.2.2"
-------------------------------------------------------------------------------- /01.clustering/post_script/sa2.L1.clustering.barcode2id.py: --------------------------------------------------------------------------------
1 | import pickle
2 | import os
3 | import sys
4 | import numpy as np
5 | import pandas as pd
6 | 
7 | with open(f"../result/clustering_sum_L1/sa2_clustering_L0_0.pkl", 'rb') as f:
8 |     L1_sum = pickle.load(f)
9 | 
10 | with open(f"../result/clustering_sum_L1/barcodes.txt", 'r') as f:
11 |     barcodes = [l.strip() for l in f.readlines()]
12 | 
13 | r = 0.4
14 | which_col = L1_sum['leiden_r'] == r
15 | leiden = L1_sum['leiden'][:, which_col]
16 | 
17 | with open(f"../result/clustering_sum_L1/sa2_L1_r0.4_barcodes2id.csv", 'w') as f:
18 |     f.writelines("barcode,L1\n")
19 |     f.writelines([f"{b},{i}\n" for b, i in zip(barcodes, leiden[:,0])])
20 | 
21 | # copy this file to 17.snapatac2/resource/sa2_dlt2_barcode2id.csv
22 | 
23 | # save umap
24 | umap = L1_sum['umap']
25 | barcode2umap = pd.DataFrame(data = {"barcode" : barcodes,
26 |                                     "UMAP1" : umap[:,0],
27 |                                     "UMAP2" : umap[:,1]})
28 | barcode2umap.to_csv("../result/clustering_sum_L1/L1_UMAP.csv",
29 |                     header = True, index = False)
-------------------------------------------------------------------------------- /01.clustering/post_script/sa2.bmat.dlt.compare.py: --------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from typing import List
4 | from pathlib import Path
5 | import itertools
6 | 
7 | import numpy as np
8 | import matplotlib.pyplot as plt
9 | import snapatac2 as sa2
10 | 
11 | with open("../meta/mba.whole.sample.lst", 'r') as f:
12 |     samples = [ l.strip() for l in f.readlines()]
13 | 
14 | # * summarize doublet removal under bmat
15 | bmat_dlt_dir = "../result/barcode2dltprob"
16 | dlt_prob_threshold = 0.5
17
| sample2dlt = {} 18 | for s in samples: 19 | with open(f"{bmat_dlt_dir}/{s}.txt", 'r') as f: 20 | lines = [l.strip() for l in f.readlines()] 21 | barcode2dltprob = [ 22 | (l.split(',')[0], float(l.split(',')[1])) for l in lines ] 23 | sample2dlt[s] = barcode2dltprob 24 | barcodes_all_bmat = list(itertools.chain.from_iterable( 25 | [sample2dlt[s] for s in samples] )) 26 | 27 | sample2barcodes = {} 28 | for s in samples: 29 | sample2barcodes[s] = [ 30 | v[0] for v in list(filter(lambda x: x[1] <= dlt_prob_threshold, 31 | sample2dlt[s]))] 32 | # 2355842 33 | nbarcodes_after_dlt = sum( 34 | [len(sample2barcodes[s]) for s in sample2barcodes.keys()]) 35 | 36 | # * load results from doublet removal under gmat 37 | gmat_dlt_dir = "../../00.data.preprocess/snapatac2_pp_out/pp_stat" 38 | sample2barcodes_gmat = {} 39 | for s in samples: 40 | with open(f"{gmat_dlt_dir}/{s}.qc.dlt.barcodes", 'r') as f: 41 | sample2barcodes_gmat[s] = [l.strip() for l in f.readlines()] 42 | 43 | # 2361710 44 | nbarcodes_gmat = sum( 45 | [len(sample2barcodes_gmat[s]) for s in sample2barcodes_gmat.keys()]) 46 | 47 | # ** joint between barcodes under bmat and gmat 48 | barcodes_bmat = list(itertools.chain.from_iterable( 49 | [sample2barcodes[s] for s in sample2barcodes.keys()])) 50 | barcodes_gmat = list(itertools.chain.from_iterable( 51 | [sample2barcodes_gmat[s] for s in sample2barcodes_gmat.keys()] 52 | )) 53 | 54 | # 2326359 55 | barcodes_both = list( 56 | set(barcodes_bmat).intersection(set(barcodes_gmat))) 57 | 58 | len(barcodes_both) / len(barcodes_bmat) # 98.75% 59 | 60 | # * compare with SnapATAC 61 | with open("../../supple.02.QC/sa1.qc.dlt.barcodes", 'r') as f: 62 | barcodes_sa1 = [l.strip() for l in f.readlines()] 63 | # 2204291 64 | barcodes_sa1_bmat = list( 65 | set(barcodes_sa1).intersection(set(barcodes_bmat)) 66 | ) 67 | 68 | # * draw dlt rate 69 | dl2r = {} 70 | for s in samples: 71 | dl2r[s] = 1 - len(sample2barcodes[s]) / len(sample2dlt[s]) 72 | 73 | with open("../../supple.02.QC/sample2biorep.csv", 'r') as f: 74 | lines = [l.strip() for l in f.readlines()] 75 | s2rep = {l.split(',')[0]: l.split(',')[1] for l in lines} 76 | 77 | dlt_early = [dl2r[s] for s in samples if s2rep[s] == 'early'] 78 | dlt_later = [dl2r[s] for s in samples if s2rep[s] == 'later'] 79 | fig = plt.figure() 80 | plt.boxplot([dlt_early, dlt_later]) 81 | plt.show() 82 | 83 | s_early = [s for s in samples if s2rep[s] == 'early'] 84 | s_later = [s for s in samples if s2rep[s] == 'later'] 85 | -------------------------------------------------------------------------------- /01.clustering/post_script/sa2_dlt2_L3_sum.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondpie/CEMBA_wmb_snATAC/4605d86a83fe45a38e656ba87df85fe99762f358/01.clustering/post_script/sa2_dlt2_L3_sum.xlsx -------------------------------------------------------------------------------- /01.clustering/post_script/sa2_dlt2_prepareL2.scala: -------------------------------------------------------------------------------- 1 | import scala.io.Source 2 | import scala.collection.immutable._ 3 | import java.io._ 4 | 5 | case class Cell(barcode: String, clusterId: Int) 6 | case class ClusterSum(head: Tuple2[String, String], cells: List[Cell]) 7 | 8 | def loadClusterSumFromFile(file: String, sep:String = ","): ClusterSum = { 9 | val lines= Source.fromFile(file).getLines.toList 10 | val firstline= lines.head.trim.split(sep) 11 | val head: Tuple2[String, String] = (firstline(0), firstline(1)) 12 | val cells = 
lines.tail.map(x => x.trim.split(sep)).map(x => Cell(x(0), x(1).toInt)) 13 | ClusterSum(head, cells) 14 | } 15 | 16 | def getClusterSize(x: ClusterSum): Map[Int, Int] = { 17 | x.cells.groupBy(_.clusterId).map(t => (t._1, t._2.length)) 18 | } 19 | 20 | def writeMap2csv(fnm: String, lines: Map[Int,Int]): Unit = { 21 | val file = new File(fnm) 22 | val bw = new BufferedWriter(new FileWriter(file)) 23 | lines.foreach(t => bw.write(s"${t._1},${t._2}\n")) 24 | bw.close() 25 | } 26 | 27 | @main def Hello(params: String*):Unit = { 28 | val clusterSum = loadClusterSumFromFile( 29 | file = "../result/clustering_sum_L1/sa2_L1_r0.4_barcodes2id.csv", sep = ",") 30 | val cluster2size = getClusterSize(clusterSum) 31 | writeMap2csv(fnm = "../resource/sa2_dlt2_L1_cluster2size.csv", lines = cluster2size) 32 | } 33 | -------------------------------------------------------------------------------- /01.clustering/resource/sa2_L0_cluster2size.csv: -------------------------------------------------------------------------------- 1 | 0,2355842 -------------------------------------------------------------------------------- /01.clustering/resource/sa2_L1_cluster2size.csv: -------------------------------------------------------------------------------- 1 | 0,169925 2 | 1,156136 3 | 2,151149 4 | 3,142675 5 | 4,132251 6 | 5,112122 7 | 6,107750 8 | 7,100876 9 | 8,95525 10 | 9,92969 11 | 10,82726 12 | 11,77547 13 | 12,77531 14 | 13,75507 15 | 14,66942 16 | 15,62223 17 | 16,55357 18 | 17,41114 19 | 18,41111 20 | 19,38504 21 | 20,36489 22 | 21,36276 23 | 22,34808 24 | 23,28705 25 | 24,24525 26 | 25,23299 27 | 26,22955 28 | 27,21946 29 | 28,21565 30 | 29,18470 31 | 30,17268 32 | 31,14926 33 | 32,14642 34 | 33,13659 35 | 34,13635 36 | 35,12601 37 | 36,12564 38 | 37,11548 39 | 38,11360 40 | 39,11208 41 | 40,8974 42 | 41,6967 43 | 42,6694 44 | 43,6329 45 | 44,6000 46 | 45,5501 47 | 46,5057 48 | 47,4894 49 | 48,4238 50 | 49,4072 51 | 50,3731 52 | 51,3713 53 | 52,3250 54 | 53,2854 55 | 54,2763 56 | 55,2459 57 | 56,1423 58 | 57,402 59 | -------------------------------------------------------------------------------- /01.clustering/resource/sa2_dlt2_L1_cluster2size.csv: -------------------------------------------------------------------------------- 1 | 0,400100 2 | 5,113203 3 | 10,93057 4 | 14,61697 5 | 1,253568 6 | 6,105544 7 | 9,94120 8 | 13,68477 9 | 2,174357 10 | 12,77165 11 | 7,101102 12 | 18,34666 13 | 11,88462 14 | 8,96925 15 | 4,120440 16 | 15,41035 17 | 24,14929 18 | 25,13648 19 | 20,24991 20 | 29,9342 21 | 28,10726 22 | 21,23196 23 | 33,5053 24 | 17,36437 25 | 32,5993 26 | 34,3650 27 | 22,22182 28 | 27,11293 29 | 3,135066 30 | 35,3244 31 | 16,37191 32 | 31,6418 33 | 26,12019 34 | 23,17396 35 | 36,3192 36 | 30,6907 37 | 19,29051 38 | -------------------------------------------------------------------------------- /01.clustering/resource/sa2_dlt2_L1_cluster2size_test.csv: -------------------------------------------------------------------------------- 1 | 35,3244 2 | 36,3192 3 | -------------------------------------------------------------------------------- /01.clustering/sa2.gmat.Snakefile: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict 3 | system: str = "encoder" 4 | project_dict: Dict[str, str] = { 5 | "imac": "/Users/szu/git-recipes/mouseBrainAtlas", 6 | "encoder": "/projects/ps-renlab/szu/projects/CEMBA2" 7 | } 8 | genome = "sa2default" 9 | 10 | project_dir = project_dict[system] 11 | rm_dlt_dir = 
f"{project_dir}/17.snapatac2/sa2_qc_dlt/rm_dlt" 12 | with open(f"{project_dir}/17.snapatac2/meta/mba.whole.sample.lst", 'r') as f: 13 | samples = [l.strip() for l in f.readlines()] 14 | # test only 15 | # samples = ["CEMBA171206_3C", "CEMBA171207_3C"] 16 | # samples = ["CEMBA171206_3C"] 17 | 18 | out_dir = f"{project_dir}/17.snapatac2/sa2_{genome}_gmat" 19 | log_dir = f"{out_dir}/log" 20 | flag_dir = f"{out_dir}/flag" 21 | # sample-level gmat 22 | sgmat_dir = f"{out_dir}/sgmat" 23 | for d in [out_dir, log_dir, flag_dir, sgmat_dir]: 24 | os.makedirs(d, exist_ok = True) 25 | 26 | 27 | def get_sample(wildcards): 28 | return wildcards.s 29 | 30 | rule all: 31 | input: 32 | expand("{f}/{s}_{g}_gmat.done", 33 | f = flag_dir, g = genome, s = samples), 34 | f"{flag_dir}/{genome}_gmat_merged.done" 35 | 36 | rule sgmat: 37 | input: 38 | snap_file = expand("{i}/{{s}}_rm_dlt.h5ad", i = rm_dlt_dir) 39 | output: 40 | gmat_file = expand("{o}/{{s}}_{g}_gmat.h5ad", 41 | o = sgmat_dir, g = genome), 42 | tag = touch(expand("{f}/{{s}}_{g}_gmat.done", 43 | f = flag_dir, g = genome)) 44 | log: 45 | expand("{l}/{{s}}_{g}_gmat.log", l = log_dir, g = genome) 46 | params: 47 | sample = get_sample, 48 | genome = genome 49 | threads: 1 50 | script: 51 | f"{project_dir}/17.snapatac2/script/sa2.get.sample.gmat.py" 52 | 53 | rule merge_sgmat: 54 | input: 55 | snap_files = expand("{o}/{s}_{g}_gmat.h5ad", 56 | o = sgmat_dir, s = samples, g = genome) 57 | output: 58 | merge_snap = f"{out_dir}/{genome}_gmat_merged.h5ad", 59 | tag = touch(f"{flag_dir}/{genome}_gmat_merged.done") 60 | params: 61 | genome = genome 62 | log: 63 | f"{log_dir}/{genome}_gmat_merged.log" 64 | threads: 4 65 | script: 66 | f"{project_dir}/17.snapatac2/script/sa2.merge.gmat.py" 67 | 68 | 69 | -------------------------------------------------------------------------------- /01.clustering/script/sa2.clustering.umap.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import math 4 | import pickle 5 | from typing import Dict, List 6 | from dataclasses import dataclass, field 7 | import numpy as np 8 | import matplotlib 9 | from matplotlib.figure import Figure 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | import pyprojroot 14 | code_root_dir = str(pyprojroot.here()) 15 | pack_dir = f"{code_root_dir}/package/python" 16 | sys.path.insert(0, pack_dir) 17 | from leiden import LeidenSum, ScatterPlot 18 | from leiden import draw_umap 19 | from leiden import init_LeidenSum_from_file 20 | from colors import SnapATACPalette 21 | 22 | 23 | if __name__ == '__main__': 24 | from_dir: str = sys.argv[1] 25 | cll: str = sys.argv[2] 26 | outdir: str = sys.argv[3] 27 | nsample: int = int(sys.argv[4]) 28 | # this will be used in parallel 29 | cid: str = sys.argv[5] 30 | ls: LeidenSum = init_LeidenSum_from_file( 31 | from_dir = from_dir, 32 | cll = cll, 33 | cid = cid) 34 | print(f"draw UMAP for {cll}: {cid}") 35 | scf = ScatterPlot(nsample = nsample) 36 | ## FIXME: put colors into scf 37 | draw_umap(t = ls, scf = scf, outdir = outdir, colors = list(set(SnapATACPalette))) 38 | 39 | -------------------------------------------------------------------------------- /01.clustering/script/sa2.get.sample.gmat.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import traceback 3 | from pathlib import Path 4 | 5 | import snapatac2 as sa2 6 | 7 | import pyprojroot 8 | code_root_dir = str(pyprojroot.here()) 9 | pack_dir = f"{code_root_dir}/package/python" 10 | 
sys.path.insert(0, pack_dir)
11 | import utils #pyright: ignore # noqa: F401, E402
12 | 
13 | # * log
14 | logger = utils.set_file_logger( #pyright: ignore
15 |     fnm = snakemake.log[0], #pyright: ignore # noqa: F821
16 |     name = "sa2.get.sample.gmat"
17 | )
18 | def handle_exception(exc_type, exc_value, exc_traceback):
19 |     if issubclass(exc_type, KeyboardInterrupt):
20 |         sys.__excepthook__(exc_type, exc_value, exc_traceback)
21 |         return
22 |     logger.error(''.join(["Uncaught exception: ",
23 |                           *traceback.format_exception(
24 |                               exc_type, exc_value, exc_traceback)
25 |                           ]))
26 | # Install exception handler
27 | sys.excepthook = handle_exception
28 | 
29 | snap_file = snakemake.input["snap_file"][0]
30 | genome = snakemake.params['genome']
31 | out_file = snakemake.output["gmat_file"][0]
32 | 
33 | logger.info(f"Load snap file {snap_file}")
34 | snap = sa2.read(snap_file, backed = 'r')
35 | # NOTE:
36 | # currently, only the sa2 default genome is supported (i.e., mm10 for mouse)
37 | logger.info(f"genome: {genome}.")
38 | logger.info(f"write gmat to file: {out_file}.")
39 | sa2.pp.make_gene_matrix(
40 |     adata = snap,
41 |     gene_anno = sa2.genome.mm10,
42 |     file = Path(out_file),
43 |     use_x = False,
44 |     id_type = "gene"
45 | )
46 | 
47 | logger.info("Done.")
-------------------------------------------------------------------------------- /01.clustering/script/sa2.knn.py: --------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 | import sys
4 | import traceback
5 | from typing import Dict
6 | 
7 | import snapatac2 as sa2
8 | import pyprojroot
9 | 
10 | code_root_dir = str(pyprojroot.here())
11 | pack_dir = f"{code_root_dir}/package/python"
12 | sys.path.insert(0, pack_dir)
13 | import utils #pyright: ignore # noqa: E402
14 | 
15 | # * log
16 | logger = utils.set_file_logger( #pyright: ignore # noqa
17 |     fnm = snakemake.log[0], #pyright:ignore # noqa: F821
18 |     name = "sa2.knn"
19 | )
20 | def handle_exception(exc_type, exc_value, exc_traceback):
21 |     if issubclass(exc_type, KeyboardInterrupt):
22 |         sys.__excepthook__(exc_type, exc_value, exc_traceback)
23 |         return
24 | 
25 |     logger.error(''.join(["Uncaught exception: ",
26 |                           *traceback.format_exception(exc_type, exc_value, exc_traceback)
27 |                           ])
28 |                  )
29 | # Install exception handler
30 | sys.excepthook = handle_exception
31 | 
32 | # * meta
33 | snap_file = snakemake.input["snap_file"][0] #pyright: ignore # noqa
34 | knn_params: Dict = snakemake.params["knn"] #pyright: ignore # noqa
35 | knn_nm: str = knn_params["name"]
36 | logger.info(f"Use {knn_nm} for knn.")
37 | logger.info(f"Load snapatac2 anndataset: {snap_file}.")
38 | sds = sa2.read(Path(snap_file))
39 | sa2.pp.knn(
40 |     adata = sds,
41 |     n_neighbors = knn_params["n"],
42 |     use_dims = None,
43 |     use_rep = "X_spectral",
44 |     method = knn_params["method"],
45 |     inplace = True,
46 |     random_state = 0
47 | )
48 | sds.close()
49 | logger.info("Run KNN done.")
50 | 
51 | 
52 | 
53 | 
-------------------------------------------------------------------------------- /01.clustering/script/sa2.merge.gmat.py: --------------------------------------------------------------------------------
1 | ## NOTE: this file is quite similar to sa2.merge.rmdlt.py
2 | ## only difference: how the sample names are derived
3 | ## TODO: merge sa2.merge.gmat and sa2.merge.rmdlt.py
4 | 
5 | import os
6 | import sys
7 | import traceback
8 | from pathlib import Path
9 | from typing import Dict
10 | 
11 | import numpy as np
12 | import snapatac2 as sa2
13 | 
14 | import pyprojroot
15 | 
code_root_dir = str(pyprojroot.here())
16 | pack_dir = f"{code_root_dir}/package/python"
17 | sys.path.insert(0, pack_dir)
18 | import utils #pyright: ignore # noqa: F401, E402
19 | 
20 | # * log
21 | logger = utils.set_file_logger( #pyright: ignore
22 |     fnm = snakemake.log[0], #pyright: ignore # noqa: F821
23 |     name = "sa2.merge.gmat"
24 | )
25 | def handle_exception(exc_type, exc_value, exc_traceback):
26 |     if issubclass(exc_type, KeyboardInterrupt):
27 |         sys.__excepthook__(exc_type, exc_value, exc_traceback)
28 |         return
29 |     logger.error(''.join(["Uncaught exception: ",
30 |                           *traceback.format_exception(
31 |                               exc_type, exc_value, exc_traceback)
32 |                           ]))
33 | # Install exception handler
34 | sys.excepthook = handle_exception
35 | 
36 | snap_files = snakemake.input["snap_files"]
37 | out_snap = snakemake.output["merge_snap"]
38 | tmp_snap = os.path.join(os.path.dirname(out_snap), "tmp.merge.snap.h5ad")
39 | genome = snakemake.params['genome']
40 | 
41 | logger.info(f"In total, {len(snap_files)} are detected.")
42 | 
43 | fnms = [os.path.basename(v) for v in snap_files]
44 | samples = [a.replace(f"_{genome}_gmat.h5ad", "") for a in fnms]
45 | 
46 | sample2files = [(s, f) for s, f in zip(samples, snap_files)]
47 | logger.info(f"Create AnnDataSet to tmp file: {tmp_snap}")
48 | sds = sa2.AnnDataSet(
49 |     adatas = sample2files,
50 |     filename = tmp_snap,
51 |     add_key = 'sample'
52 | )
53 | 
54 | logger.info(f"AnnDataSet to AnnData: {out_snap}")
55 | 
56 | snap = sds.to_adata(file = out_snap, copy_x = True)
57 | new_obs_names = utils.modify_obs_name(snap, obs_key = "sample")
58 | snap.obs_names = new_obs_names
59 | 
60 | snap.close()
61 | sds.close()
62 | logger.info(f"Delete tmp file: {tmp_snap}.")
63 | os.remove(tmp_snap)
64 | logger.info("Done")
-------------------------------------------------------------------------------- /01.clustering/script/sa2.merge.rmdlt.py: --------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import traceback
4 | from pathlib import Path
5 | from typing import Dict
6 | 
7 | import numpy as np
8 | import snapatac2 as sa2
9 | 
10 | import pyprojroot
11 | code_root_dir = str(pyprojroot.here())
12 | pack_dir = f"{code_root_dir}/package/python"
13 | sys.path.insert(0, pack_dir)
14 | import utils #pyright: ignore # noqa: F401, E402
15 | 
16 | # * log
17 | logger = utils.set_file_logger( #pyright: ignore
18 |     fnm = snakemake.log[0], #pyright: ignore # noqa: F821
19 |     name = "sa2.merge.rmdlt"
20 | )
21 | # logger = utils.set_file_logger( #pyright: ignore
22 | #     fnm = "test_qc_dlt.log", #pyright: ignore # noqa: F821
23 | #     name = "sa2.merge.rmdlt"
24 | # )
25 | def handle_exception(exc_type, exc_value, exc_traceback):
26 |     if issubclass(exc_type, KeyboardInterrupt):
27 |         sys.__excepthook__(exc_type, exc_value, exc_traceback)
28 |         return
29 |     logger.error(''.join(["Uncaught exception: ",
30 |                           *traceback.format_exception(
31 |                               exc_type, exc_value, exc_traceback)
32 |                           ]))
33 | # Install exception handler
34 | sys.excepthook = handle_exception
35 | 
36 | snap_files = snakemake.input["snap_files"]
37 | out_snap = snakemake.output["merge_snap"]
38 | tmp_snap = os.path.join(os.path.dirname(out_snap), "tmp.merge.snap.h5ad")
39 | 
40 | logger.info(f"In total, {len(snap_files)} are provided as input.")
41 | 
42 | fnms = [os.path.basename(v) for v in snap_files]
43 | samples = [a.replace("_rm_dlt.h5ad", "") for a in fnms]
44 | 
45 | sample2files = [(s, f) for s, f in zip(samples, snap_files)]
46 | logger.info(f"Create AnnDataSet to tmp file: {tmp_snap}")
47
| sds = sa2.AnnDataSet( 48 | adatas = sample2files, 49 | filename = tmp_snap, 50 | add_key = 'sample' 51 | ) 52 | 53 | logger.info(f"AnnDataSet to AnnData: {out_snap}") 54 | 55 | snap = sds.to_adata(file = out_snap, copy_x = True) 56 | new_obs_names = utils.modify_obs_name(snap, obs_key = "sample") 57 | snap.obs_names = new_obs_names 58 | 59 | snap.close() 60 | sds.close() 61 | logger.info(f"Delete tmp file: {tmp_snap}.") 62 | os.remove(tmp_snap) 63 | logger.info("Done") 64 | 65 | -------------------------------------------------------------------------------- /01.clustering/script/sa2.pre.anndataset.py: -------------------------------------------------------------------------------- 1 | # Deprecated 2 | # Now we use sa2.merge.rmdlt.py to get a complete bmat snap file. 3 | # then do subset on it directly. 4 | import os 5 | import sys 6 | import traceback 7 | from pathlib import Path 8 | import logging 9 | from typing import Dict, List 10 | import shutil 11 | 12 | 13 | from numba.core.errors import NumbaDeprecationWarning 14 | from numba.core.errors import NumbaPendingDeprecationWarning 15 | import warnings 16 | warnings.simplefilter('ignore', category=NumbaDeprecationWarning) 17 | warnings.simplefilter('ignore', category=NumbaPendingDeprecationWarning) 18 | 19 | import numpy as np 20 | import snapatac2 as sa2 21 | import pyprojroot 22 | code_root_dir = str(pyprojroot.here()) 23 | pack_dir = f"{code_root_dir}/package/python" 24 | sys.path.insert(0, pack_dir) 25 | import utils # noqa: E402 26 | from leiden import cemba #pyright: ignore # noqa: E401, E402, F401 27 | from leiden import cal_silhouette #pyright: ignore # noqa: E402, F401 28 | from leiden import umap #pyright: ignore # noqa: E402 29 | 30 | logger = utils.set_file_logger( #pyright: ignore 31 | fnm = snakemake.log[0], #pyright: ignore # noqa: F821 32 | name = "cemba.all.anndataset" 33 | ) 34 | def handle_exception(exc_type, exc_value, exc_traceback): 35 | if issubclass(exc_type, KeyboardInterrupt): 36 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 37 | return 38 | 39 | logger.error(''.join(["Uncaught exception: ", 40 | *traceback.format_exception(exc_type, exc_value, exc_traceback) 41 | ]) 42 | ) 43 | # Install exception handler 44 | sys.excepthook = handle_exception 45 | 46 | logger.info(f"Get CEMBA all anndata files into AnnDataSet.") 47 | 48 | 49 | sample2fragment_file = snakemake.input[0] 50 | cemba_all_file = snakemake.output[0] 51 | 52 | with open(sample2fragment_file, 'r') as f: 53 | files = [l.strip() for l in f.readlines()] 54 | samples = [os.path.basename(a).split(".")[0] for a in files] 55 | sample2files = [(s, f) for s, f in zip(samples, files)] 56 | logger.info(f"Load {len(samples)} samples into AnnDataSet.") 57 | sds = sa2.AnnDataSet( 58 | adatas = sample2files, 59 | filename = cemba_all_file 60 | ) 61 | logger.info("Update obs_names: [sample].[barcode] .") 62 | 63 | obs_names: List[str] = [f"{i}.{j}" 64 | for i, j in zip(sds.obs['sample'].to_list(), sds.obs_names)] 65 | sds.obs_names = obs_names 66 | 67 | sds.close() 68 | logger.info(f"CEMBA.all.AnnDataset is saved at: {cemba_all_file}.") 69 | 70 | -------------------------------------------------------------------------------- /01.clustering/script/sa2.rm.dlt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import traceback 4 | from pathlib import Path 5 | from typing import Dict 6 | 7 | import numpy as np 8 | import snapatac2 as sa2 9 | 10 | import pyprojroot 11 | code_root_dir = 
str(pyprojroot.here())
12 | pack_dir = f"{code_root_dir}/package/python"
13 | sys.path.insert(0, pack_dir)
14 | import utils #pyright: ignore # noqa: F401, E402
15 | 
16 | # * log
17 | logger = utils.set_file_logger( #pyright: ignore
18 |     fnm = snakemake.log[0], #pyright: ignore # noqa: F821
19 |     name = "sa2.rm.dlt"
20 | )
21 | # logger = utils.set_file_logger( #pyright: ignore
22 | #     fnm = "test_qc_dlt.log", #pyright: ignore # noqa: F821
23 | #     name = "sa2.rm.dlt"
24 | # )
25 | def handle_exception(exc_type, exc_value, exc_traceback):
26 |     if issubclass(exc_type, KeyboardInterrupt):
27 |         sys.__excepthook__(exc_type, exc_value, exc_traceback)
28 |         return
29 |     logger.error(''.join(["Uncaught exception: ",
30 |                           *traceback.format_exception(
31 |                               exc_type, exc_value, exc_traceback)
32 |                           ]))
33 | # Install exception handler
34 | sys.excepthook = handle_exception
35 | 
36 | snap_file = snakemake.input["qc_dlt_file"][0] #pyright: ignore # noqa: F821
37 | out_file = snakemake.output["snap_file"][0] #pyright: ignore # noqa: F821
38 | 
39 | logger.info(f"Load snap file {snap_file}")
40 | snap = sa2.read(snap_file, backed = 'r')
41 | 
42 | barcodes: np.ndarray = np.array(snap.obs_names)
43 | dlt_probs = snap.obs['doublet_probability'].to_numpy()
44 | slt_index = (dlt_probs <= 0.5).tolist()
45 | 
46 | r = snap.subset(obs_indices = slt_index, out = out_file)
47 | r.close()
48 | snap.close()
49 | 
50 | 
-------------------------------------------------------------------------------- /01.clustering/script/supple.sa2.add.barcode.to.unite.clustering.py: --------------------------------------------------------------------------------
1 | """
2 | Fix unite clustering from L1:
3 | - loss of barcode information under summary picklefile
4 | """
5 | import os
6 | import pickle
7 | import snapatac2 as sa2
8 | 
9 | 
10 | def update_pkl(cid, cll:str = "L1",
11 |                from_dir: str = "L2_dlt2_encoder",
12 |                prefix = "sa2_clustering",
13 |                snap_prefix: str = "nfeat-top_nc50") -> None:
14 |     if not os.path.exists(from_dir):
15 |         raise FileNotFoundError(f"{from_dir} does not exist.")
16 |     pkl_fnm = os.path.join(from_dir, f"{prefix}_{cll}_{cid}.pkl")
17 |     if not os.path.exists(pkl_fnm):
18 |         raise FileNotFoundError(f"{pkl_fnm} does not exist.")
19 |     out_dir = os.path.join(from_dir, "supple.add.barcode")
20 |     os.makedirs(out_dir, exist_ok=True)
21 |     pkl2_fnm = os.path.join(out_dir, f"{prefix}_{cll}_{cid}.pkl")
22 | 
23 |     snap_fnm = os.path.join(from_dir, f"{snap_prefix}_{cid}_unite.h5ad")
24 |     if not os.path.exists(snap_fnm):
25 |         raise FileNotFoundError(f"{snap_fnm} does not exist.")
26 |     with open(pkl_fnm, 'rb') as f:
27 |         pkl_sum = pickle.load(f)
28 |     snap = sa2.read(snap_fnm, 'r')
29 |     if "barcode" in pkl_sum.keys():
30 |         print(f"barcode is already in {pkl_fnm}.")
31 |     else:
32 |         pkl_sum['barcode'] = snap.obs_names
33 |     snap.close()
34 |     with open(pkl2_fnm, 'wb') as f:
35 |         pickle.dump(pkl_sum, f)
36 | 
37 | 
38 | cids_unites = list(range(8,37))
39 | list(map(update_pkl, cids_unites))
40 | 
-------------------------------------------------------------------------------- /01.clustering/script/supple.sa2.bmat.dlt.py: --------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from typing import List
4 | from pathlib import Path
5 | 
6 | import numpy as np
7 | import snapatac2 as sa2
8 | 
9 | # test
10 | # work_dir = "/Users/szu/git-recipes/mouseBrainAtlas/CEMBA2"
11 | # sample = "CEMBA190718_8F"
12 | # qc_dlt_dir = f"{work_dir}/17.snapatac2/result/qc_bmat-dlt/"
13 | # out_dir = 
f"{work_dir}/17.snapatac2/result/qc_bmat-dlt_barcodes" 14 | 15 | work_dir = "/oasis/tscc/scratch/szu/projects/CEMBA2/" 16 | qc_dlt_dir = f"{work_dir}/17.snapatac2/sa2_qc_dlt/qc_dlt" 17 | sample = sys.argv[1] 18 | out_dir = f"{work_dir}/17.snapatac2/sa2_qc_dlt/barcode2dltprob" 19 | 20 | os.makedirs(out_dir, exist_ok = True) 21 | dlt_prob_threshold = 0.5 22 | 23 | outf = f"{out_dir}/{sample}.txt" 24 | snap = sa2.read(Path(f"{qc_dlt_dir}/{sample}_qc_dlt.h5ad"), backed = 'r') 25 | barcodes: np.ndarray = np.array([f"{sample}.{k}" for k in snap.obs_names]) 26 | dlt_probs = snap.obs['doublet_probability'].to_numpy().tolist() 27 | # barcodes_filtered:List[str] = barcodes[dlt_probs <= dlt_prob_threshold].tolist() 28 | with open(outf, 'w') as f: 29 | f.writelines('\n'.join([f"{b},{s}" for b, s in zip(barcodes, dlt_probs)])) 30 | snap.close() 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /01.clustering/script/supple.sa2.get.embed.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import math 4 | import pickle 5 | from typing import Dict, List 6 | from dataclasses import dataclass, field 7 | import numpy as np 8 | 9 | import pyprojroot 10 | code_root_dir = str(pyprojroot.here()) 11 | pack_dir = f"{code_root_dir}/package/python" 12 | sys.path.insert(0, pack_dir) 13 | from leiden import LeidenSum, ScatterPlot 14 | from leiden import draw_umap 15 | from leiden import init_LeidenSum_from_file 16 | from colors import SnapATACPalette 17 | import snapatac2 as sa2 18 | 19 | sa2_dir = os.path.join( 20 | "/projects/ps-renlab/szu/projects/CEMBA2", 21 | "17.snapatac2" 22 | ) 23 | 24 | sa2L1_fnm = os.path.join( 25 | sa2_dir, "L1_encoder", 26 | "nfeat-all_nsample-all_nc50_0_mult.h5ad") 27 | 28 | snapL1 = sa2.read( 29 | filename = sa2L1_fnm, backed = 'r') 30 | 31 | embed_mat: np.ndarray = snapL1.obsm['X_spectral'] 32 | barcodes: List[str] = snapL1.obs_names 33 | out_dir = os.path.join( 34 | "/projects/ps-renlab2/szu/projects/CEMBA2", 35 | "17.snapatac2", "resource", "sa2L1sum" 36 | ) 37 | if not os.path.exists(out_dir): 38 | os.makedirs(out_dir, exist_ok = True) 39 | 40 | np.savetxt( 41 | os.path.join(out_dir, "sa2.L1.embed_mat.csv"), 42 | embed_mat, delimiter = ',') 43 | 44 | # save barcodes to txt 45 | with open(os.path.join( 46 | out_dir, "sa2.L1.barcodes.txt"), 'w') as f: 47 | for bc in barcodes: 48 | f.write(bc + '\n') 49 | 50 | # save umap 51 | umap: np.ndarray = snapL1.obsm["X_umap"] 52 | np.savetxt(os.path.join(out_dir, "sa2.L1.umap_ab_spectral.csv"), 53 | umap, delimiter = ',') 54 | 55 | # calculate umap with default parameter 56 | from numba.core.errors import NumbaDeprecationWarning 57 | from numba.core.errors import NumbaPendingDeprecationWarning 58 | import warnings 59 | warnings.simplefilter('ignore', category=NumbaDeprecationWarning) 60 | warnings.simplefilter('ignore', category=NumbaPendingDeprecationWarning) 61 | from leiden import umap # pyright: ignore # noqa: E402 62 | 63 | # it will automatically use <20 CPUs 64 | # and cost <30G RAM 65 | # in encoder 66 | umap_default : np.ndarray = umap( 67 | adata = snapL1, 68 | use_rep = "X_spectral", 69 | inplace = False, 70 | a = None, 71 | b = None, 72 | init='spectral' 73 | ) 74 | 75 | np.savetxt(os.path.join(out_dir, "sa2.L1.umap_default.csv"), 76 | umap_default, delimiter = ',') 77 | snapL1.close() 78 | -------------------------------------------------------------------------------- 
/01.clustering/script/supple.sa2.prepare.L1.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from typing import List 4 | from pathlib import Path 5 | import itertools 6 | 7 | import numpy as np 8 | import snapatac2 as sa2 9 | 10 | # get barcodes after qc and doublet removal 11 | with open("../meta/mba.whole.sample.lst", 'r') as f: 12 | samples = [ l.strip() for l in f.readlines()] 13 | 14 | # * summarize doublet removal under bmat 15 | bmat_dlt_dir = "../result/barcode2dltprob" 16 | dlt_prob_threshold = 0.5 17 | sample2dlt = {} 18 | for s in samples: 19 | with open(f"{bmat_dlt_dir}/{s}.txt", 'r') as f: 20 | lines = [l.strip() for l in f.readlines()] 21 | barcode2dltprob = [ 22 | (l.split(',')[0], float(l.split(',')[1])) for l in lines ] 23 | sample2dlt[s] = barcode2dltprob 24 | barcodes_all_bmat = list(itertools.chain.from_iterable( 25 | [sample2dlt[s] for s in samples] )) 26 | 27 | sample2barcodes = {} 28 | for s in samples: 29 | sample2barcodes[s] = [ 30 | v[0] for v in list(filter(lambda x: x[1] <= dlt_prob_threshold, 31 | sample2dlt[s]))] 32 | # 2355842 barcodes remain after doublet removal 33 | nbarcodes_after_dlt = sum( 34 | [len(sample2barcodes[s]) for s in sample2barcodes.keys()]) 35 | 36 | barcodes = [] 37 | for s in samples: 38 | barcodes.extend(sample2barcodes[s]) 39 | 40 | with open("../resource/barcode2id_L0.csv", 'w') as f: 41 | f.writelines("barcode,L0\n") 42 | f.writelines('\n'.join([f"{v},0" for v in barcodes])) 43 | 44 | -------------------------------------------------------------------------------- /01.clustering/script/supple.sa2.prepare.L2.in.memory.subset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | import pandas as pd 5 | import snapatac2 as sa2 6 | 7 | clevel = "L1" 8 | out_dir = f"sa2_dlt2_{clevel}_subsets" 9 | os.makedirs(out_dir, exist_ok = False) 10 | # load snap data into memory: 11 | # - it consumes about 200G of RAM, and loading takes about 10 minutes. 12 | # - tscc has slow IO, so we load in memory to make subsetting faster. 13 | # - loading the file this way locks it, so it cannot be opened again by another reader. 14 | # - not sure whether reading with backed='r' and then calling to_memory() would be 15 | # better; per the docs, that creates a new in-memory object. 16 | # FIXME: after loading in memory like this, the object has no subset attribute (see the backed-mode sketch below). 
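# A hedged sketch (not run here) of that backed-mode alternative: reading with
# backed='r' keeps memory low and the returned object still exposes .subset()
# (the same call sa2.rm.dlt.py uses), at the cost of slower, disk-bound subsetting;
# the output file name below is illustrative.
# snap = sa2.read("resource/merge_cemba_all.h5ad", backed='r')
# b2id = pd.read_csv("resource/sa2_dlt2_barcode2id.csv")
# keep = set(b2id.loc[b2id[clevel] == 0, "barcode"])
# idx = [b in keep for b in snap.obs_names]
# sub = snap.subset(obs_indices=idx, out=f"{out_dir}/sa2_dlt2_{clevel}_0_backed.h5ad")
# sub.close()
# snap.close()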
17 | 18 | 19 | sds_all = sa2.read("resource/merge_cemba_all.h5ad", None) 20 | 21 | barcode2id: pd.DataFrame = pd.read_csv("resource/sa2_dlt2_barcode2id.csv") 22 | 23 | cid = 0 24 | snap_file = f"{out_dir}/sa2_dlt2_{clevel}_{cid}.h5ad" 25 | sub_barcodes = barcode2id[barcode2id[clevel] == int(cid)]['barcode'] 26 | all_barcodes = sds_all.obs_names 27 | a = set(sub_barcodes) 28 | is_in_sub = np.array([b in a for b in all_barcodes]) 29 | print(f"Found {is_in_sub.sum()} barcodes in cemba.") 30 | sds = sds_all.subset( 31 | obs_indices = is_in_sub, 32 | out = snap_file) 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /01.clustering/script/supple.sa2.prepare.L2.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | barcode2id_file = "../resource/barcode2id.csv" 4 | barcode2id = pd.read_csv(barcode2id_file, header = 0) 5 | cluster2size = barcode2id.L1.value_counts() 6 | with open("../resource/sa2_L1_cluster2size.csv", 'w') as f: 7 | f.writelines([f"{c},{s}\n" for c, s in zip( 8 | cluster2size.index.to_list(), cluster2size)]) 9 | 10 | 11 | -------------------------------------------------------------------------------- /01.clustering/script/supple.sa2.prepare.L4.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(stringr) 3 | library(dplyr) 4 | packdir <- file.path(here::here(), "package/R") 5 | import::from(.from = "cembav2env.R", .directory = packdir, 6 | cluSumBySa2) 7 | Sys.setenv("_R_USE_PIPEBIND_" = TRUE) 8 | 9 | # * load L3 clustering result 10 | L3Sums <- cluSumBySa2$loadL3Sums() 11 | L3Resos <- cluSumBySa2$loadL3Resos() 12 | barcode2L3 <- cluSumBySa2$loadbarcode2L3() 13 | 14 | # * prepare two files 15 | # 0. get L3 needed for L4 clustering 16 | uL3s <- unique(barcode2L3$L3) 17 | 18 | # nolint start 19 | L3Pattern <- with(L3Resos, L2nm[needL4 == "L4"]) |> 20 | x => gsub("sa2v1_", "", x) 21 | nMin <- 400 22 | # nolint end 23 | 24 | # filter uL3s that are in L3Pattern 25 | uL3toL2 <- gsub("-[0-9]+$", "", uL3s) 26 | uL3forL4 <- uL3s[uL3toL2 %in% L3Pattern] 27 | 28 | # 1. cluster size at L3-level in uL3forL4 29 | # and no less than nMin 30 | L3tosize <- table(barcode2L3$L3) |> 31 | as.data.frame(stringsAsFactors = FALSE) |> 32 | setNames(c("L3", "size")) |> 33 | x => x[x$L3 %in% uL3forL4, ] |> 34 | arrange(desc(size)) |> 35 | x => x[x$size >= nMin, ] 36 | # output L3tosize to csv file without header 37 | write.table(L3tosize, 38 | file = file.path("../resource", "sa2_dlt2_L3forL4_cluster2size.csv"), 39 | sep = ",", row.names = FALSE, col.names = FALSE, quote = FALSE) 40 | 41 | # 2. 
barcode2cluster at L3-level 42 | barcode2L3 |> 43 | x => x[x$L3 %in% uL3forL4, ] |> 44 | write.table( 45 | file = file.path("../resource", "sa2_dlt2_L3forL4_barcode2id.csv"), 46 | sep = ",", row.names = FALSE, col.names = TRUE, quote = FALSE) 47 | 48 | -------------------------------------------------------------------------------- /02.integration/src/main/R/annToS5.R: -------------------------------------------------------------------------------- 1 | library(BPCells) 2 | library(Seurat) 3 | options(Seurat.object.assay.version = "v5") 4 | library(stringr) 5 | library(purrr) 6 | Sys.setenv("_R_USE_PIPEBIND_" = TRUE) 7 | 8 | projdir <- here::here() 9 | rdir <- file.path(projdir, "package/R") 10 | import::from(.from = "utils.R", .directory = rdir, 11 | setupLogging, closeLogging) 12 | 13 | 14 | ## library(future) 15 | ## plan(multicore, workers = 2) 16 | ## options(future.globals.maxSize = 2e9) 17 | ## options(future.rng.onMisuse = "ignore") 18 | 19 | 20 | # * load snakemake configs 21 | annfnm <- snakemake@input[[1]] 22 | outfnm <- snakemake@output[[1]] 23 | logfnm <- snakemake@log[[1]] 24 | modality <- snakemake@params[["modality"]] 25 | ## py <- snakemake@params[["py"]] 26 | 27 | # * set logger 28 | setupLogging(logfnm) 29 | 30 | # * set python 31 | ## library(reticulate) 32 | ## reticulate::use_python(py) 33 | ## ad <- reticulate::import("anndata", convert = FALSE) 34 | 35 | # * function 36 | convertAnn2Seurat5 <- function(annfnm, 37 | modality, 38 | group = "X", 39 | outdir, 40 | overwrite = TRUE, 41 | assay = "RNA", 42 | isLogNorm = TRUE) { 43 | xlognorm <- BPCells::open_matrix_anndata_hdf5( 44 | path = annfnm, group = group) 45 | BPCells::write_matrix_dir(mat = xlognorm, dir = outdir, overwrite = overwrite) 46 | d <- BPCells::open_matrix_dir(outdir) 47 | s5 <- Seurat::CreateSeuratObject(counts = d, assay = assay) 48 | if (isLogNorm) { 49 | s5 <- Seurat::SetAssayData( 50 | object = s5, slot = "data", new.data = d) 51 | } 52 | s5$modality <- modality 53 | return(s5) 54 | } 55 | 56 | 57 | # * main 58 | outdir <- dirname(outfnm) 59 | outfprefix <- sub("\\.[^.]+$", "", basename(outfnm)) 60 | logger::log_info("output to: ", outdir) 61 | logger::log_info("out file prefix: ", outfprefix) 62 | 63 | logger::log_info("to Seuratv5: ", annfnm) 64 | s5 <- convertAnn2Seurat5( 65 | annfnm = annfnm, modality = modality, 66 | outdir = file.path(outdir, outfprefix), isLogNorm = TRUE) 67 | logger::log_info( 68 | "NOTE: [modality] column added to Seuratv5 with value: ", modality) 69 | logger::log_info("NOTE: obs meta data is ignored in Seuratv5.") 70 | 71 | logger::log_info("save seuratv5 to: ", outfnm) 72 | saveRDS(s5, outfnm) 73 | logger::log_info("to Seuratv5 done.") 74 | closeLogging() 75 | -------------------------------------------------------------------------------- /02.integration/src/main/R/downsample.Allen.Seurat.on.subclass.level.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(Seurat) 3 | options(Seurat.object.assay.version = "v5") 4 | options(future.globals.maxSize = 5e9) 5 | library(SeuratObject) 6 | library(Matrix) 7 | Sys.setenv("_R_USE_PIPEBIND_" = TRUE) 8 | library(future) 9 | 10 | projdir <- here::here() 11 | rdir <- file.path(projdir, "package/R") 12 | import::from(.from = "cembav2env.R", .directory = rdir, 13 | cluSumBySa2, Sa2Integration, cembav2env, Sa2PeakCalling) 14 | import::from(.from = "integration.R", .directory = rdir, 15 | convertAnn2Seurat5, get.downsample.fun, downsampleSeurat, 16 | toSeuratInMemory, 17 
| isOnDiskMat.Seurat, calVarOfFea.Seurat, setVariableFeatures) 18 | 19 | # * configs 20 | # tscc or encoder 21 | system <- "tscc" 22 | rscdir <- file.path(here::here(), "19.snap2_integration", 23 | "src/main/resource") 24 | outdir <- file.path(here::here(), "19.snap2_integration", 25 | paste0("out/transferLabel_", system)) 26 | if(!dir.exists(outdir)) { 27 | dir.create(outdir) 28 | } 29 | allenAnnotMeta <- Sa2Integration$loadAllenAnnot() 30 | allenAnnotMeta$subclass_label_v3 <- vapply(allenAnnotMeta$subclass_label, 31 | Sa2PeakCalling$renameAllenSubclass, "rename") 32 | 33 | neuronAllenSeu <- readRDS( 34 | file.path(outdir, "neuron_allen_noraw_seurat.rds")) 35 | neuronAllenSeu$subclass <- allenAnnotMeta[ 36 | as.character(neuronAllenSeu$cl), "subclass_label_v3"] 37 | neuronAllenSeu$barcode <- colnames(neuronAllenSeu) 38 | 39 | nnAllenSeu <- readRDS( 40 | file.path(outdir, "nn_allen_noraw_seurat.rds")) 41 | nnAllenSeu$subclass <- allenAnnotMeta[ 42 | as.character(nnAllenSeu$cl), "subclass_label_v3"] 43 | nnAllenSeu$barcode <- colnames(nnAllenSeu) 44 | 45 | # * downsample by subclass 46 | s <- merge(nnAllenSeu, neuronAllenSeu) 47 | nds <- 1000 48 | barcodes <- s@meta.data |> 49 | group_by(subclass) |> slice_sample(n = nds) |> 50 | x => x$barcode 51 | s <- s[, barcodes] 52 | mat1 <- as(s$RNA[["data.1"]], Class = "dgCMatrix") 53 | mat2 <- as(s$RNA[["data.2"]], Class = "dgCMatrix") 54 | mat <- SeuratObject::RowMergeSparseMatrices( 55 | mat1 = mat1, mat2 = mat2) 56 | meta <- s@meta.data 57 | r <- Seurat::CreateSeuratObject(counts = mat, assay = "RNA", 58 | meta.data = meta) 59 | r <- Seurat::SetAssayData(object = r, slot = "data", new.data = mat) 60 | saveRDS(r, file.path(outdir, "allen_ds1000_seurat.rds")) 61 | 62 | ## nds <- 5000 63 | ## neuron.meta.ds <- neuronAllenSeu@meta.data |> 64 | ## group_by(subclass) |> slice_sample(n = nds) 65 | ## neuronAllenSeu.ds <- neuronAllenSeu[, neuron.meta.ds$barcode] 66 | ## saveRDS(neuronAllenSeu.ds, file.path(outdir, 67 | ## "neuron_allen_ds5000_seurat.rds")) 68 | 69 | ## nn.meta.ds <- nnAllenSeu@meta.data |> 70 | ## group_by(subclass) |> slice_sample(n = nds) 71 | ## nnAllenSeu.ds <- nnAllenSeu[ , nn.meta.ds$barcode] 72 | ## saveRDS(nnAllenSeu.ds, file.path(outdir, 73 | ## "nn_allen_ds5000_seurat.rds")) 74 | 75 | -------------------------------------------------------------------------------- /02.integration/src/main/R/downsample.sa2.Seurat.on.subclass.level.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(Seurat) 3 | options(Seurat.object.assay.version = "v5") 4 | options(future.globals.maxSize = 5e9) 5 | library(SeuratObject) 6 | library(Matrix) 7 | Sys.setenv("_R_USE_PIPEBIND_" = TRUE) 8 | library(future) 9 | 10 | projdir <- here::here() 11 | rdir <- file.path(projdir, "package/R") 12 | import::from(.from = "cembav2env.R", .directory = rdir, 13 | cluSumBySa2, Sa2Integration, cembav2env, Sa2PeakCalling) 14 | import::from(.from = "integration.R", .directory = rdir, 15 | convertAnn2Seurat5, get.downsample.fun, downsampleSeurat, 16 | toSeuratInMemory, 17 | isOnDiskMat.Seurat, calVarOfFea.Seurat, setVariableFeatures) 18 | 19 | # * configs 20 | # tscc or encoder 21 | system <- "tscc" 22 | rscdir <- file.path(here::here(), "19.snap2_integration", 23 | "src/main/resource") 24 | outdir <- file.path(here::here(), "19.snap2_integration", 25 | paste0("out/transferLabel_", system)) 26 | 27 | # load atacmeta 28 | atacMeta <- readRDS(cembav2env$sa2metaFile) 29 | 
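The script stops after loading the meta table; the rest presumably mirrors the per-subclass downsampling in the Allen script above. A minimal sketch of that step (assumptions: `atacSeu` is a hypothetical name for the ATAC gmat Seurat object with barcodes as column names, `subclass_label_v3` is the subclass column of `atacMeta`, and the output file name is illustrative):

atacSeu$subclass <- atacMeta[colnames(atacSeu), "subclass_label_v3"]
atacSeu$barcode <- colnames(atacSeu)
nds <- 1000
barcodes <- atacSeu@meta.data |>
  group_by(subclass) |> slice_sample(n = nds) |>
  x => x$barcode
saveRDS(atacSeu[, barcodes], file.path(outdir, "sa2_ds1000_seurat.rds"))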
-------------------------------------------------------------------------------- /02.integration/src/main/R/getIntUMAP.R: -------------------------------------------------------------------------------- 1 | library(Seurat) 2 | options(Seurat.object.assay.version = "v5") 3 | library(stringr) 4 | library(purrr) 5 | Sys.setenv("_R_USE_PIPEBIND_" = TRUE) 6 | projdir <- here::here() 7 | rdir <- file.path(projdir, "package/R") 8 | import::from(.from = "cembav2env.R", .directory = rdir, 9 | Sa2Integration) 10 | import::from(.from = "utils.R", .directory = rdir, 11 | setupLogging, closeLogging) 12 | 13 | # * load snakemake configs 14 | intS5fnm <- snakemake@input[[1]] 15 | outfnm <- snakemake@output[[1]] 16 | nPCA <- snakemake@params$nPCA 17 | intMethod <- snakemake@wildcards$m 18 | 19 | # * set logger 20 | setupLogging(snakemake@log[[1]]) 21 | 22 | # * main 23 | logger::log_info("readRDS: ", intS5fnm) 24 | seu <- readRDS(intS5fnm) 25 | logger::log_info("After integration with: ", intMethod) 26 | logger::log_info("run UMAP on reduction: ", paste0("intgn.", intMethod)) 27 | 28 | seu <- Seurat::RunUMAP(seu, 29 | reduction = paste0("intgn.", intMethod), dims = 1:nPCA, 30 | reduction.name = paste0("umap.", intMethod)) 31 | 32 | logger::log_info("finish UMAP, and save to ", outfnm) 33 | saveRDS(seu, outfnm) 34 | logger::log_info("done") 35 | closeLogging() 36 | -------------------------------------------------------------------------------- /02.integration/src/main/R/mapSubclassNames.R: -------------------------------------------------------------------------------- 1 | projroot <- here::here() 2 | rdir <- file.path(projroot, "package/R") 3 | import::from(.from = "cembav2env.R", .directory = rdir, 4 | cembav2env, Sa2Integration, Sa2PeakCalling) 5 | import::from(.from = "utils.R", .directory = rdir, 6 | fastread.csv) 7 | 8 | 9 | allenMeta <- Sa2Integration$loadAllenAnnot() 10 | 11 | subclassMap <- unique(data.frame( 12 | subclass_id = allenMeta$subclass_id, 13 | subclass_id_label = allenMeta$subclass_id_label, 14 | subclass_label = allenMeta$subclass_label 15 | )) 16 | 17 | subclass_label_bw <- data.table::fread( 18 | file = "../resource/subclass_nm_in_macs2_bigwig.txt", 19 | header = FALSE, data.table = FALSE)$V1 20 | 21 | subclassMap$subclass_id_label_bw <- subclassMap$subclass_id_label |> 22 | gsub(" ", "_", x = _) |> 23 | gsub("/", ".", x = _) 24 | 25 | subclassMap$subclass_label_bw <- subclassMap$subclass_label |> 26 | gsub(" ", "_", x = _) |> 27 | gsub("/", ".", x = _) 28 | 29 | subclassMap$subclass_label_peak <- subclassMap$subclass_label |> 30 | Sa2PeakCalling$renameAllenSubclass() 31 | 32 | # check 33 | all(subclass_label_bw %in% subclassMap$subclass_id_label_bw) 34 | a <- fastread.csv(fnm = file.path(projroot, "18.snap2_peakcalling", 35 | "out/scfilter", "count_peakBysubclass.csv")) 36 | all(colnames(a) %in% subclassMap$subclass_label_peak) 37 | 38 | # save 39 | saveRDS(subclassMap, 40 | file.path(projroot, "meta", "sa2.subclass.names.map.rds")) 41 | write.table(subclassMap, 42 | file.path(projroot, "meta", "sa2.subclass.names.map.csv"), 43 | sep = ",", quote = FALSE, col.names = TRUE, row.names = FALSE) 44 | -------------------------------------------------------------------------------- /02.integration/src/main/R/runPCA.R: -------------------------------------------------------------------------------- 1 | library(Seurat) 2 | options(Seurat.object.assay.version = "v5") 3 | library(stringr) 4 | library(purrr) 5 | Sys.setenv("_R_USE_PIPEBIND_" = TRUE) 6 | projdir <- here::here() 7 | rdir <- 
file.path(projdir, "package/R") 8 | import::from(.from = "cembav2env.R", .directory = rdir, 9 | Sa2Integration) 10 | import::from(.from = "utils.R", .directory = rdir, 11 | setupLogging, closeLogging) 12 | import::from(.from = "integration.R", .directory = rdir, 13 | get.downsample.fun, downsampleSeurat) 14 | 15 | 16 | # * load snakemake configs 17 | infnm <- snakemake@input[[1]] 18 | outfnm <- snakemake@output[[1]] 19 | ft <- snakemake@wildcards$ft 20 | logfnm <- snakemake@log[[1]] 21 | nPCA <- snakemake@params$nPCA 22 | 23 | # * set logger 24 | setupLogging(logfnm) 25 | 26 | # * load features from Sa2Integration 27 | logger::log_info("load features from Sa2Integration: ", ft) 28 | geneList <- Sa2Integration$getMarkersList() 29 | fts <- geneList[[ft]] 30 | 31 | # * main 32 | logger::log_info("readRDS: ", infnm) 33 | seu <- readRDS(infnm) 34 | logger::log_info("ScaleData on feature: ", ft) 35 | Seurat::VariableFeatures(seu) <- fts 36 | seu <- Seurat::ScaleData(seu, features = fts) 37 | 38 | logger::log_info("runPCA") 39 | seu <- Seurat::RunPCA( 40 | seu, features = fts, verbose = TRUE, 41 | npcs = nPCA) 42 | 43 | logger::log_info("writeRDS: ", outfnm) 44 | saveRDS(seu, outfnm) 45 | logger::log_info("done") 46 | closeLogging() 47 | -------------------------------------------------------------------------------- /02.integration/src/main/R/simple.gene.list.of.allen.R: -------------------------------------------------------------------------------- 1 | library(purrr) 2 | Sys.setenv("_R_USE_PIPEBIND_" = TRUE) 3 | library(stringr) 4 | projdir <- here::here() 5 | resourcedir <- "../resource" 6 | rdir <- file.path(projdir, "package/R") 7 | import::from(.from = "cembav2env.R", .directory = rdir, 8 | Sa2Integration) 9 | 10 | # * simply extract allen's cluster marker genes 11 | annotAllen <- Sa2Integration$loadAllenAnnot() 12 | 13 | # * functions 14 | filterMarkers <- function(markers, sep = ",") { 15 | # filter space 16 | # filter gene symbols linked with ENSMUSG 17 | r <- map(markers, str_split_1, pattern = sep) |> 18 | unlist() |> sort() |> 19 | x => x[!grepl("ENSMUSG", x)] |> 20 | x => x[nzchar(x)] |> 21 | unique() 22 | message("found ", length(r), " markers.") 23 | message("unique ", length(unique(r)), " markers.") 24 | return(r) 25 | } 26 | getMarkers <- function(annotAllen, 27 | groupBy = "", 28 | groups = c(), 29 | markerCol = "cluster.markers", 30 | outfnm = "") { 31 | message("marker column: ", markerCol) 32 | markers <- if (nchar(groupBy) > 1) { 33 | message("select markers from column: ", groupBy) 34 | message("with values: ", paste(groups, collapse = " ")) 35 | gs <- annotAllen[[groupBy]] 36 | annotAllen[gs %in% groups, markerCol] 37 | } else { 38 | annotAllen[[markerCol]] 39 | } 40 | r <- filterMarkers(markers) 41 | if (nchar(outfnm) > 1) { 42 | message("save results to: ", outfnm) 43 | write.table(r, outfnm, append = FALSE, quote = FALSE, 44 | col.names = FALSE, row.names = FALSE) 45 | } 46 | return(r) 47 | } 48 | 49 | # * main 50 | allMarkers <- getMarkers( 51 | annotAllen, markerCol = "cluster.markers", 52 | outfnm = file.path(resourcedir, "AIT21_cluster_markers.txt") 53 | ) 54 | allMerfishMarkers <- getMarkers( 55 | annotAllen, markerCol = "merfish.markers", 56 | outfnm = file.path(resourcedir, "AIT21_merfish_markers.txt") 57 | ) 58 | 59 | # fix k8 markers 60 | rawk8markers <- read.table( 61 | Sa2Integration$AllenRaw8KMarkerFile, header = FALSE)$V1 62 | k8markers <- filterMarkers(rawk8markers) 63 | write.table(k8markers, 64 | file.path(resourcedir, "AIT21_k8_markers.txt"), 65 | 
append = FALSE, quote = FALSE, col.names = FALSE, row.names = FALSE) 66 | 67 | # * test markers 68 | geneList <- Sa2Integration$getMarkersList() 69 | -------------------------------------------------------------------------------- /02.integration/src/main/python/imneuron.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import importlib 4 | import pandas as pd 5 | import anndata as ad 6 | 7 | from pyprojroot import here 8 | proj_root: str = str(here()) 9 | sys.path.insert(0, os.path.join(proj_root, "package/python")) 10 | from utils import set_file_logger 11 | import cembav2env 12 | importlib.reload(cembav2env) 13 | 14 | # * configs 15 | rsc_dir = os.path.join(proj_root, 16 | "19.snap2_integration/" 17 | "src/main/resource") 18 | allen = cembav2env.Allen() 19 | sa2atac = cembav2env.Sa2ATAC() 20 | 21 | # * load reduced ann data 22 | reduced_allen_dir = os.path.join(rsc_dir, "norawdata_allen") 23 | neuron_allen_fnm = os.path.join(reduced_allen_dir, 24 | "neuron_male_allen_ann_noraw.h5ad") 25 | neuron_allen = ad.read_h5ad( 26 | filename=neuron_allen_fnm, backed="r") 27 | 28 | nn_allen_fnm = os.path.join(reduced_allen_dir, 29 | "nn_male_allen_ann_noraw.h5ad") 30 | nn_allen = ad.read_h5ad( 31 | filename=nn_allen_fnm, backed="r") 32 | 33 | -------------------------------------------------------------------------------- /02.integration/src/main/python/reduce.anndata.allen.sa2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import importlib 4 | import pandas as pd 5 | import anndata as ad 6 | 7 | from pyprojroot import here 8 | proj_root: str = str(here()) 9 | sys.path.insert(0, os.path.join(proj_root, "package/python")) 10 | from utils import set_file_logger 11 | import cembav2env 12 | importlib.reload(cembav2env) 13 | 14 | # * configs 15 | rsc_dir = os.path.join(proj_root, 16 | "19.snap2_integration/" 17 | "src/main/resource") 18 | allen = cembav2env.Allen() 19 | sa2atac = cembav2env.Sa2ATAC() 20 | 21 | # * reduce anndata by removing raw data 22 | # run only once 23 | # let's keep using log-norm data for later integration 24 | # since scaling is invariant towards normalization constant. 
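# A hedged sketch of what the get_reduced_ann calls below are assumed to do
# (the actual helper lives in package/python/cembav2env.py): keep the
# log-normalized X and drop the raw counts before writing, roughly
# def get_reduced_ann(ann, outfnm):
#     ann.raw = None                  # drop the raw snapshot, if any
#     for k in list(ann.layers):      # raw counts sit in layers per the AIT21 ReadMe
#         del ann.layers[k]
#     ann.write(outfnm)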
25 | 26 | reduced_allen_dir = os.path.join(rsc_dir, "norawdata_allen") 27 | if not os.path.exists(reduced_allen_dir): 28 | os.makedirs(reduced_allen_dir, exist_ok = False) 29 | ann_10xv3_nn = allen.get_10xv3_nn_ann() 30 | allen.get_reduced_ann( 31 | ann_10xv3_nn, 32 | outfnm = os.path.join(reduced_allen_dir, "nn_male_10xv3_ann_noraw.h5ad")) 33 | del ann_10xv3_nn 34 | 35 | ann_10xv3_neuron = allen.get_10xv3_neuron_ann() 36 | allen.get_reduced_ann( 37 | ann_10xv3_neuron, 38 | outfnm = os.path.join(reduced_allen_dir, 39 | "neuron_male_10xv3_ann_noraw.h5ad")) 40 | del ann_10xv3_neuron 41 | 42 | 43 | ann_all_nn = allen.get_allen_nn_ann() 44 | allen.get_reduced_ann( 45 | ann_all_nn, 46 | outfnm = os.path.join(reduced_allen_dir, 47 | "nn_male_allen_ann_noraw.h5ad")) 48 | del ann_all_nn 49 | 50 | ann_all_neuron = allen.get_allen_neuron_ann() 51 | allen.get_reduced_ann( 52 | ann_all_neuron, 53 | outfnm = os.path.join(reduced_allen_dir, 54 | "neuron_male_allen_ann_noraw.h5ad")) 55 | del ann_all_neuron 56 | 57 | # * remove raw from our atac gmat 58 | # run only once 59 | reduced_atac_dir = os.path.join(rsc_dir, "norawdata_atac") 60 | if not os.path.exists(reduced_atac_dir): 61 | os.makedirs(reduced_atac_dir, exist_ok=True) 62 | atac_ann = sa2atac.load_sa2gmat_ann() 63 | barcode2L3: pd.DataFrame = sa2atac.read_barcode2L3() 64 | sa2atac.add_L3_to_atac_ann(atac_ann, barcode2L3) 65 | 66 | 67 | atac_rough_annot = sa2atac.read_rough_annot() 68 | barcode2annot = atac_rough_annot.loc[barcode2L3["L3"]] 69 | barcode2annot.index = barcode2L3["barcode"] 70 | 71 | atac_nn_ann = sa2atac.get_nn_atac_ann( 72 | atac_ann, barcode2annot, 73 | outfnm = os.path.join(reduced_atac_dir, "nn_gmat_atac_ann.h5ad")) 74 | del atac_nn_ann 75 | 76 | atac_neuron_ann = sa2atac.get_neuron_atac_ann( 77 | atac_ann, barcode2annot, 78 | outfnm = os.path.join(reduced_atac_dir, "neuron_gmat_atac_ann.h5ad") 79 | ) 80 | del atac_neuron_ann 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /02.integration/src/main/resource/AIT21_ReadMe.txt: -------------------------------------------------------------------------------- 1 | This folder includes Mouse Whole Brain taxonomy's cellxgene matrix for scRNAseq data (10Xv2, 10Xv3) and snRNAseq data (Multiome) in h5ad file and the cluster annotation file. 2 | 3 | # cellxgene matrix 4 | 5 | - filename : AIT21_10Xv2.h5ad 6 | AIT21_10Xv3.h5ad 7 | AIT21_10Xmulti.h5ad 8 | 9 | - anndata$X : log-normalized count matrix 10 | $layers : raw count matrix 11 | $obs : cluster id 'cl' and sample related metadata 12 | 'cl' - cluster id to be used as matching key for cluster annotation file! 
13 | library_prep - library 14 | gene.counts.0 - number of detected genes 15 | doublet_score - doublet score 16 | roi - region 17 | umi.counts - number of detected UMIs 18 | method - 10Xv3 / 10Xv2 / 10Xmulti 19 | sex - 20 | external_donor_name - 21 | age - 22 | medical_conditions - Light/Dark 23 | 24 | # cluster annotation 25 | 26 | - filename : AIT21_annotation.tsv 27 | cl : cluster key matching all cluster-related tables 28 | cluster_id/cluster_label : id and label at cluster level 29 | supertype_id/supertype_label : id and label at supertype level 30 | subclass_id/subclass_label : id and label at subclass level 31 | class_id/class_label : id and label at class level 32 | anatomical_annotation : anatomical region that contributes the most cells to the cluster 33 | CCF_broad.freq/CCF_acronym.freq : fraction of cells from regions in CCF_broad or CCF_acronym 34 | v3.size/v2.size/multiome.size : number of cells or nuclei from 10Xv3, 10Xv2, and multiome data 35 | cluster.markers : markers of the cluster 36 | merfish.markers : markers of the cluster for merfish data 37 | max.region : The region that contributes the most cells to a given cluster 38 | max.region.ratio : The fraction of cells coming from the max.region 39 | F,M : sex distribution 40 | 41 | 42 | -------------------------------------------------------------------------------- /02.integration/src/main/resource/BICCN.BrainRegionMetadata.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondpie/CEMBA_wmb_snATAC/4605d86a83fe45a38e656ba87df85fe99762f358/02.integration/src/main/resource/BICCN.BrainRegionMetadata.xlsx -------------------------------------------------------------------------------- /02.integration/src/test/R/test.snakemake.wildcards.R: -------------------------------------------------------------------------------- 1 | logger::log_threshold(logger::TRACE) 2 | log_file <- snakemake@log[[1]] 3 | logger::log_appender(logger::appender_file(log_file)) 4 | 5 | 6 | afnm <- snakemake@input[["afnm"]] 7 | str(afnm) 8 | logger::log_info(afnm) 9 | 10 | bfnm <- snakemake@input[["bfnm"]] 11 | str(bfnm) 12 | print(bfnm) 13 | 14 | params <- snakemake@params[["a"]] 15 | str(params) 16 | print(params) 17 | 18 | print(snakemake@wildcards) 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /02.integration/src/test/pipeline/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: test_R_snakemake test_intgn_snakemake 2 | 3 | test_R_snakemake: R.Snakefile 4 | snakemake --snakefile $< -R -c 2\ 5 | --rerun-triggers mtime \ 6 | --skip-script-cleanup 7 | rm out/*.out 8 | 9 | conda_path := /home/szu/mambaforge/envs/seurat/bin 10 | test_intgn_snakemake: ../../main/pipeline/Seurat.Intgn.Snakefile 11 | -mkdir -p $@ 12 | cp $< $@/Snakefile 13 | cd $@ && \ 14 | snakemake --snakefile Snakefile -R -c 2 \ 15 | --config \ 16 | gp=nn \ 17 | debug=1 \ 18 | intgn_method='rpac,mnn' \ 19 | allen_techs='10xv3' \ 20 | --rerun-triggers mtime \ 21 | --skip-script-cleanup \ 22 | --profile pbs-torque-conda 23 | 24 | clean: 25 | -rm -rf out 26 | -rm -rf log 27 | -rm -rf .snakemake 28 | -------------------------------------------------------------------------------- /02.integration/src/test/pipeline/R.Snakefile: -------------------------------------------------------------------------------- 1 | import os 2 | a = ["a1", "a2", "a3"] 3 | 4 | out_dir = "out" 5 | log_dir = "log" 6 | for i in [out_dir, 
log_dir]: 7 | os.makedirs(i, exist_ok = True) 8 | 9 | script_dir = "../R" 10 | 11 | rule all: 12 | input: 13 | expand("{o}/{t}.out", o = out_dir, t = a) 14 | 15 | rule getOut: 16 | input: 17 | # afnm = lambda w: f"{out_dir}/{w.t}.in", 18 | # afnm = expand("{o}/{{t}}.in", o = out_dir), 19 | afnm = f"{out_dir}/{{t}}.in", 20 | # bfnm = lambda w: f"{out_dir}/{w.t}.in" 21 | bfnm = f"{out_dir}/{{t}}.in" 22 | # bfnm = expand("{o}/{{t}}.in", o = out_dir) 23 | output: 24 | touch(expand("{o}/{{t}}.out", o = out_dir)) 25 | log: 26 | f"{log_dir}/{{t}}.log" 27 | # wildcard in log field does not work 28 | # fnm = lambda w: f"{log_dir}/{w.t}.log" 29 | params: 30 | a = [1,2,3] 31 | script: 32 | f"{script_dir}/test.snakemake.wildcards.R" 33 | 34 | 35 | -------------------------------------------------------------------------------- /02.integration/src/test/python/prepare.intg.test.ann.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import importlib 4 | import anndata as ad 5 | import pandas as pd 6 | import numpy as np 7 | from pyprojroot import here 8 | 9 | proj_root = str(here()) 10 | sys.path.insert(0, os.path.join(proj_root, "package/python")) 11 | from utils import set_file_logger 12 | import cembav2env 13 | 14 | importlib.reload(cembav2env) 15 | 16 | rsc_dir = os.path.join(proj_root, "19.snap2_integration", 17 | "src/test/resource") 18 | out_allen_dir = os.path.join(rsc_dir, "noraw_allen") 19 | os.makedirs(out_allen_dir, exist_ok = True) 20 | out_atac_dir = os.path.join(rsc_dir, "noraw_atac") 21 | os.makedirs(out_atac_dir, exist_ok = True) 22 | 23 | allen = cembav2env.Allen() 24 | sa2atac = cembav2env.Sa2ATAC() 25 | 26 | nn_allen = allen.read_nn_10xv3_lognorm_ann() 27 | 28 | rdm_nn_allen = nn_allen[np.random.choice(nn_allen.n_obs, 5000, replace = False)] 29 | 30 | rdm_nn_allen.write( 31 | os.path.join(out_allen_dir, "nn_male_10xv3_ann_noraw.h5ad")) 32 | 33 | del nn_allen 34 | 35 | nn_atac = sa2atac.read_nn_gmat_lognorm_ann() 36 | rdm_nn_atac = nn_atac[np.random.choice(nn_atac.n_obs, 5000, replace = False)] 37 | 38 | rdm_nn_atac.write( 39 | os.path.join(out_atac_dir, "nn_gmat_atac_ann.h5ad")) 40 | del nn_atac 41 | 42 | -------------------------------------------------------------------------------- /03.peakcalling/bin/merge_peaks: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondpie/CEMBA_wmb_snATAC/4605d86a83fe45a38e656ba87df85fe99762f358/03.peakcalling/bin/merge_peaks -------------------------------------------------------------------------------- /03.peakcalling/src/main/R/addpL4Info2atacMeta.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(rlang) 3 | Sys.setenv("_R_USE_PIPEBIND_" = TRUE) 4 | projdir <- here::here() 5 | rdir <- file.path(projdir, "package/R") 6 | import::from(.from = "cembav2env.R", .directory = rdir, 7 | cembav2env) 8 | 9 | atacMeta <- readRDS(cembav2env$sa2metaFile) 10 | rownames(atacMeta) <- atacMeta$barcode2 11 | 12 | barcode2pL4_nn_fnm <- file.path(projdir, "18.snap2_peakcalling", 13 | "src/main/resource", 14 | "nn_barcode2cluster_bedtag-all.csv") 15 | 16 | barcode2pL4_nn <- data.table::fread( 17 | file = barcode2pL4_nn_fnm, header = FALSE, sep = ",", 18 | data.table = FALSE) 19 | colnames(barcode2pL4_nn) <- c("barcode", "pL4") 20 | barcode2pL4_nn$gpL4 <- paste0("nn.", barcode2pL4_nn$pL4) 21 | 22 | barcode2pL4_neuron_fnm <- file.path(projdir, "18.snap2_peakcalling", 23 
| "src/main/resource", 24 | "neuron_barcode2cluster_bedtag-all.csv") 25 | barcode2pL4_neuron <- data.table::fread( 26 | file = barcode2pL4_neuron_fnm, header = FALSE, sep = ",", 27 | data.table = FALSE) 28 | colnames(barcode2pL4_neuron) <- c("barcode", "pL4") 29 | barcode2pL4_neuron$gpL4 <- paste0("neuron.", barcode2pL4_neuron$pL4) 30 | 31 | barcode2pL4 <- rbind(barcode2pL4_nn, barcode2pL4_neuron) 32 | rownames(barcode2pL4) <- barcode2pL4$barcode 33 | 34 | atacMeta$pL4 <- barcode2pL4[rownames(atacMeta), "pL4"] 35 | atacMeta$gpL4 <- barcode2pL4[rownames(atacMeta), "gpL4"] 36 | saveRDS(atacMeta, 37 | file = file.path(projdir, "supple.02.annotation.all.in.one.meta", 38 | "mba.whole.cell.meta.v9.4.rds")) 39 | 40 | 41 | # * now it's v9.7 42 | atacMeta <- readRDS(cembav2env$sa2metaFile) 43 | 44 | table(atacMeta[grep("IMN", atacMeta$subclass_label_v3), c("L4", "NT_v3")], useNA = "ifany") 45 | table(atacMeta[grep("IMN", atacMeta$subclass_label_v3), c("L4", "mainclass.rough")], useNA = "ifany") 46 | -------------------------------------------------------------------------------- /03.peakcalling/src/main/R/fitbgmodel.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(reticulate) 3 | library(gamlss.dist) 4 | library(gamlss) 5 | library(fitdistrplus) 6 | 7 | usePython <- "/home/szu/mambaforge/envs/sa2/bin/python" 8 | reticulate::use_python(usePython) 9 | pd <- reticulate::import("pandas", convert = FALSE) 10 | 11 | projdir <- here::here() 12 | workdir <- file.path(projdir, "18.snap2_peakcalling") 13 | 14 | # * meta 15 | default_cutoff <- 0.001 16 | pep <- 0.001 17 | rnd_upbound <- 0.1 18 | outdir <- file.path(workdir, "out/scfilter/fitfrac_bg") 19 | 20 | 21 | # * read peakfrac data 22 | peakfrac_rnd <- pd$read_pickle( 23 | file.path(workdir, "out/scfilter", "peakfrac_rnd.pkl")) 24 | peakfrac_rnd <- reticulate::py_to_r(x = peakfrac_rnd) 25 | 26 | ## peakfrac_union <- pd$read_pickle( 27 | ## file.path(workdir, "out/scfilter", "peakfrac_union.pkl")) 28 | ## peakfrac_union <- reticulate::py_to_r(x = peakfrac_union) 29 | 30 | # * fit models for each pL4 using peakfrac_rnd 31 | i <- as.integer(commandArgs(trailingOnly = TRUE)[1]) 32 | pL4 <- rownames(peakfrac_rnd)[i] 33 | message("fit BEZI model for ", pL4) 34 | 35 | x <- unlist(peakfrac_rnd[i, ]) 36 | x <- x[x <= rnd_upbound] 37 | 38 | mod <- gamlss::gamlss( 39 | x ~ 1, sigma.formula = ~1, 40 | nu.formula = ~1, 41 | family = BEZI, 42 | control = gamlss::gamlss.control(n.cyc = 100, trace = FALSE), 43 | ) 44 | 45 | ## summary(mod) 46 | mufit <- fitted(mod, "mu")[1] 47 | sigmafit <- fitted(mod, "sigma")[1] 48 | nufit <- fitted(mod, "nu")[1] 49 | converged <- ifelse(mod$converged, 1, 0) 50 | 51 | cutoff <- if (mod$converged) { 52 | # find x axis value at p = pep (default 0.001) 53 | gamlss.dist::qBEZI((1 - pep), 54 | mu = mufit, sigma = sigmafit, nu = nufit, 55 | lower.tail = TRUE, log.p = FALSE) 56 | } else { 57 | message("Fitting BEZI dose not converge.", 58 | " Use default cutoff: ", default_cutoff) 59 | default_cutoff 60 | } 61 | 62 | para <- data.frame( 63 | n = length(x), 64 | mu = mufit, 65 | sigma = as.numeric(sigmafit), 66 | nu = nufit, 67 | pep = pep, 68 | cutoff = cutoff, 69 | Gdeviance = mod$G.deviance, 70 | converged = converged) 71 | 72 | data.table::fwrite(para, 73 | file.path(outdir, paste0(pL4, ".fitPeakModel.para.csv")), 74 | quote = FALSE, col.names = TRUE, row.names = FALSE, sep = ",") 75 | 76 | message("done.") 77 | 
-------------------------------------------------------------------------------- /03.peakcalling/src/main/R/subclass2peak.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(rlang) 3 | library(R6) 4 | Sys.setenv("_R_USE_PIPEBIND_" = TRUE) 5 | 6 | library(data.table) 7 | library(GenomicRanges) 8 | library(S4Vectors) 9 | 10 | rdir <- file.path(here::here(), "package/R") 11 | import::from(.from = "cembav2env.R", .directory = rdir, 12 | cembav2env, Sa2PeakCalling) 13 | import::from(.from = "peak.R", .directory = rdir, 14 | mapL4pc2L4) 15 | work_dir <- file.path(here::here(), "18.snap2_peakcalling") 16 | 17 | # * load union peaks 18 | unionPeakBed <- Sa2PeakCalling$readUnionPeakBedFile() 19 | clsUnionPeakBeds <- Sa2PeakCalling$loadpL4UnionPeakBeds() 20 | # check 21 | bedExists <- vapply(clsUnionPeakBeds, 22 | \(x) { 23 | all(rownames(x) %in% rownames(unionPeakBed)) 24 | }, TRUE) 25 | 26 | saveRDS(clsUnionPeakBeds, 27 | file = file.path(work_dir, "out/tscc/pL4UnionPeakBeds.rds")) 28 | 29 | -------------------------------------------------------------------------------- /03.peakcalling/src/main/R/sumReproducePeaks.R: -------------------------------------------------------------------------------- 1 | ## library(R.utils) 2 | ## library(purrr) 3 | args <- commandArgs(trailingOnly = TRUE) 4 | rpdpeakDir <- args[1] 5 | outdir <- args[2] 6 | 7 | # debug 8 | ## rpdpeakDir <- "/oasis/tscc/scratch/szu/projects/CEMBA2/18.snap2_peakcalling/out/tscc/rpdpeak" 9 | ## outdir <- "." 10 | 11 | recordSummitFiles <- file.path(outdir, "mba.whole.naiveSummitList.list") 12 | ## cl2npeakFile <- file.path(outdir, "mba.whole.L4.npeak4anno.txt") 13 | 14 | summitFiles <- list.files(path = rpdpeakDir, pattern = ".*naiveSummitList.bed", 15 | full.names = TRUE, include.dirs = TRUE, no.. 
= TRUE, ignore.case = FALSE) 16 | 17 | cls <- basename(summitFiles) |> gsub(".naiveSummitList.bed", "", x = _) 18 | ## npeaks <- map_int(.x = summitFiles, .f = countLines) 19 | 20 | r1 <- data.frame( 21 | cl = cls, 22 | path = summitFiles 23 | ) 24 | 25 | write.table(r1, file = recordSummitFiles, quote = FALSE, sep = "\t", 26 | row.names = FALSE, col.names = FALSE, append = FALSE) 27 | 28 | ## r2 <- data.frame( 29 | ## cl = cls, 30 | ## npeak = npeaks 31 | ## ) 32 | ## write.table(r2, file = cl2npeakFile, quote = FALSE, sep = "\t", 33 | ## row.names = FALSE, col.names = FALSE, append = FALSE) 34 | 35 | message("done.") 36 | 37 | -------------------------------------------------------------------------------- /03.peakcalling/src/main/pipeline/getsa2pmat.Snakefile: -------------------------------------------------------------------------------- 1 | #envvars: 2 | # "PATH" 3 | import os 4 | import pyprojroot 5 | import pandas as pd 6 | from typing import List 7 | 8 | proj_dir = str(pyprojroot.here()) 9 | work_dir = os.path.join(proj_dir, "18.snap2_peakcalling") 10 | script_dir = os.path.join(work_dir, "src/main") 11 | 12 | union_bed_file = os.path.join(work_dir, "out/tscc", 13 | "mba.whole.union.peak.srt.bed") 14 | if not os.path.exists(union_bed_file): 15 | raise FileNotFoundError(union_bed_file) 16 | 17 | rnd_bed_file = os.path.join(work_dir, "out/tscc", 18 | "mba.whole.shuffle.removeovlp.bed") 19 | if not os.path.exists(rnd_bed_file): 20 | raise FileNotFoundError(rnd_bed_file) 21 | 22 | with open(f"{proj_dir}/17.snapatac2/meta/mba.whole.sample.lst", 'r') as f: 23 | samples = [l.strip() for l in f.readlines()] 24 | 25 | snap2_dir = os.path.join(proj_dir, 26 | "17.snapatac2", 27 | "sa2_qc_dlt/rm_dlt") 28 | 29 | fdir = os.path.join(work_dir, "out", "tscc/sa2pmat" , "flag") 30 | ldir = os.path.join(work_dir, "out", "tscc/sa2pmat" , "log") 31 | odir_union = os.path.join(work_dir, "out", "tscc/sa2pmat" , "union_pmat") 32 | odir_rnd = os.path.join(work_dir, "out", "tscc/sa2pmat" , "union_pmat_rnd") 33 | 34 | for d in [fdir, ldir, odir_union, odir_rnd]: 35 | os.makedirs(d, exist_ok = True) 36 | 37 | rule all: 38 | input: 39 | expand("{f}/{s}.sa2pmat.flag", f=fdir, s=samples), 40 | expand("{f}/{s}.sa2pmat_rnd.flag", f=fdir, s=samples) 41 | 42 | rule sa2pmat_union: 43 | input: 44 | bedfnm=union_bed_file 45 | output: 46 | touch(f"{fdir}/{{s}}.sa2pmat.flag") 47 | log: 48 | f"{ldir}/{{s}}.sa2pmat.log" 49 | params: 50 | snap2_dir = snap2_dir, 51 | suffix = "_rm_dlt.h5ad", 52 | out_dir = odir_union, 53 | out_suffix = "_union_pmat.h5ad" 54 | #conda: "sa22" 55 | threads: 2 56 | resources: 57 | walltime="1:00:00", 58 | queue="glean" 59 | script: 60 | f"{script_dir}/python/sa2pmat.py" 61 | 62 | rule sa2pmat_rnd: 63 | input: 64 | bedfnm=rnd_bed_file 65 | output: 66 | touch(f"{fdir}/{{s}}.sa2pmat_rnd.flag") 67 | log: 68 | f"{ldir}/{{s}}.sa2pmat_rnd.log" 69 | params: 70 | snap2_dir = snap2_dir, 71 | suffix = "_rm_dlt.h5ad", 72 | out_dir = odir_rnd, 73 | out_suffix = "_rnd_pmat.h5ad" 74 | #conda: "sa22" 75 | threads: 2 76 | resources: 77 | walltime="1:00:00", 78 | queue="glean" 79 | script: 80 | f"{script_dir}/python/sa2pmat.py" 81 | -------------------------------------------------------------------------------- /03.peakcalling/src/main/pipeline/scfilter.Snakefile: -------------------------------------------------------------------------------- 1 | envvars: 2 | "PATH" 3 | 4 | import os 5 | import pyprojroot 6 | import pandas as pd 7 | from typing import List 8 | 9 | rscript = 
"/home/szu/mambaforge/envs/seurat/bin/Rscript" 10 | proj_dir = str(pyprojroot.here()) 11 | work_dir = os.path.join(proj_dir, "18.snap2_peakcalling") 12 | script_dir = os.path.join(work_dir, "src/main") 13 | 14 | rdir = os.path.join(work_dir, "src/main/resource") 15 | odir = os.path.join(work_dir, "out/scfilter") 16 | fdir = os.path.join(odir, "flag") 17 | ldir = os.path.join(odir, "log") 18 | odir_peakfrac = os.path.join(odir, "peakfrac") 19 | odir_fitbgmodel = os.path.join(odir, "fitfrac_bg") 20 | odir_clfilter = os.path.join(odir, "clfileter") 21 | 22 | for d in [odir, fdir, ldir, odir_peakfrac, odir_fitbgmodel, 23 | odir_clfilter]: 24 | os.makedirs(d, exist_ok = True) 25 | 26 | def get_pL4(L4_fnm:str, prefix:str = "nn.") -> List[str]: 27 | d: pd.DataFrame = pd.read_csv( 28 | L4_fnm, sep = ",", header = None, 29 | names = ['cluster', 'size', 'early_size', 'later_size'], 30 | index_col = None 31 | ) 32 | r = [f"{prefix}{i}" for i in d['cluster'].to_list()] 33 | return r 34 | 35 | nn_L4_fnm = os.path.join(rdir, "nn_L4pc2sizes_cca-k49-cl_v1.csv") 36 | nn_pL4s: List[str] = get_pL4(nn_L4_fnm, prefix = "nn.") 37 | neuron_L4_fnm = os.path.join(rdir, "neuron_L4pc2sizes_cca-k50-cl_v1.csv") 38 | neuron_pL4s: List[str] = get_pL4(neuron_L4_fnm, prefix = "neuron.") 39 | pL4s: List[str] = nn_pL4s + neuron_pL4s 40 | print(f"{len(pL4s)} pL4 cls for filtering peaks.") 41 | 42 | # pL4ids = list(range(1,3)) 43 | pL4ids = list(range(1,1464)) 44 | 45 | # sa2pmatd = os.path.join(work_dir, "out/tscc/sa2pmat") 46 | # sa2pmatd_union = os.path.join(sa2pmatd, "union_pmat") 47 | # sa2pmatd_rnd = os.path.join(sa2pmatd, "union_pmat_rnd") 48 | 49 | rule all: 50 | input: 51 | # f"{fdir}/peakfrac.done", 52 | expand("{f}/fitbgmodel_{cl}.done", f = fdir, cl = pL4ids) 53 | 54 | # let's do this in our script interactively. 55 | # rule get_peakfrac: 56 | # output: 57 | # touch(f"{fdir}/peakfrac.done") 58 | # params: 59 | # sa2pmatd_fg = sa2pmatd_union, 60 | # sa2pmatd_bg = sa2pmatd_rnd, 61 | # outdir = odir_peakfrac 62 | # threads: 63 | # 20 64 | # resources: 65 | # walltime = 8, 66 | # queue = "glean", 67 | # mail = "a", 68 | # tag = "icelake:mem1024" 69 | # conda: 70 | # "sa22" 71 | # script: 72 | # "{script_dir}/python/sa2_get_peakfrac.py" 73 | 74 | rule fit_bgmodel: 75 | # input: 76 | # f"{fdir}/peakfrac.done" 77 | output: 78 | touch(f"{fdir}/fitbgmodel_{{cl}}.done") 79 | log: 80 | f"{ldir}/fitbgmodel_{{cl}}.done" 81 | threads: 5 82 | resources: 83 | walltime = 4, 84 | queue = "condo", 85 | mail = "a" 86 | conda: 87 | "seurat" 88 | shell: 89 | """ 90 | {rscript} {script_dir}/R/fitbgmodel.R {wildcards.cl} 2> {log} 91 | """ 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /03.peakcalling/src/main/python/get_full_snap2.py: -------------------------------------------------------------------------------- 1 | """ 2 | Get snap2 file with all the samples. 
3 | 4 | """ 5 | import os 6 | import sys 7 | import logging 8 | from pathlib import Path 9 | from typing import Dict, List 10 | 11 | import numpy as np 12 | import snapatac2 as sa2 13 | 14 | import pyprojroot 15 | 16 | proj_dir = str(pyprojroot.here()) 17 | pack_dir = f"{proj_dir}/package/python" 18 | sys.path.insert(0, pack_dir) 19 | from mylog import StreamToLogger, set_file_logger 20 | from mysnapatac2 import modify_obs_name 21 | 22 | # * snakemake 23 | # log_fnm=snakemake.log[0] 24 | # snap2_files: List[str] = snakemake.input["snap2_files"] 25 | # out_snap2: str = snakemake.output["snap2"] 26 | 27 | # * log 28 | log_dir = os.path.join(proj_dir, "18.snap2_peakcalling", "out/log") 29 | os.makedirs(log_dir, exist_ok=True) 30 | log_fnm = os.path.join(log_dir, "get_full_snap2.log") 31 | logger = set_file_logger(log_fnm, name="sa2.get_full_snap2") 32 | sys.stdout = StreamToLogger(logger=logger, level=logging.INFO) 33 | sys.stderr = StreamToLogger(logger=logger, level=logging.ERROR) 34 | 35 | # * meta 36 | rdir = os.path.join(proj_dir, "18.snap2_peakcalling", "src/main/resource") 37 | with open(f"{rdir}/mba.whole.sample.lst", "r") as f: 38 | samples = [l.strip() for l in f.readlines()] 39 | out_snap2_fnm = f"{rdir}/cemba.snap2.with.fragment.hdf5" 40 | 41 | snap2s_dir = os.path.join(proj_dir, "17.snapatac2", "sa2_qc_dlt", "rm_dlt") 42 | snap2_files = [f"{snap2s_dir}/{s}_rm_dlt.h5ad" for s in samples] 43 | 44 | logger.info(f"In total, {len(snap2_files)} are inputed.") 45 | 46 | fnms = [os.path.basename(v) for v in snap2_files] 47 | 48 | logger.info(f"get full snap anndataset object to: {out_snap2_fnm}") 49 | sample2files = [(s, f) for s, f in zip(samples, snap2_files)] 50 | sds = sa2.AnnDataSet(adatas=sample2files, 51 | filename=out_snap2_fnm, add_key="sample") 52 | new_obs_names = modify_obs_name(sds, obs_key="sample") 53 | sds.obs_names = new_obs_names 54 | sds.close() 55 | logger.info("Done.") 56 | -------------------------------------------------------------------------------- /03.peakcalling/src/main/python/sa2pmat.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import logging 4 | from pathlib import Path 5 | from typing import Dict, List 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import snapatac2 as sa2 10 | 11 | import pyprojroot 12 | 13 | proj_dir = str(pyprojroot.here()) 14 | pack_dir = f"{proj_dir}/package/python" 15 | sys.path.insert(0, pack_dir) 16 | from mylog import StreamToLogger, set_file_logger 17 | from mysnapatac2 import modify_obs_name 18 | 19 | # * snakemake 20 | log_fnm: str =snakemake.log[0] 21 | bedfnm: str = snakemake.input["bedfnm"] 22 | snap2_dir: str = snakemake.params["snap2_dir"] 23 | suffix: str = snakemake.params["suffix"] 24 | out_dir: str = snakemake.params["out_dir"] 25 | out_suffix: str = snakemake.params["out_suffix"] 26 | sample: str = snakemake.wildcards["s"] 27 | 28 | # * debug 29 | # log_fnm = "test_pmat.log" 30 | # work_dir = os.path.join(proj_dir, "18.snap2_peakcalling") 31 | # bedfnm = os.path.join(proj_dir, "18.snap2_peakcalling", 32 | # "out/tscc", "mba.whole.union.peak.srt.bed") 33 | # with open(f"{proj_dir}/17.snapatac2/meta/mba.whole.sample.lst", 'r') as f: 34 | # samples = [l.strip() for l in f.readlines()] 35 | 36 | # snap2_dir = os.path.join(proj_dir, 37 | # "17.snapatac2", 38 | # "sa2_qc_dlt/rm_dlt") 39 | # out_dir = "." 
40 | # suffix = "_rm_dlt.h5ad" 41 | # out_suffix = "_union_pmat.h5ad" 42 | # sample = samples[0] 43 | 44 | # * log 45 | logger = set_file_logger(log_fnm, name="sa2.get_full_snap2") 46 | sys.stdout = StreamToLogger(logger=logger, level=logging.INFO) 47 | sys.stderr = StreamToLogger(logger=logger, level=logging.ERROR) 48 | 49 | # * main 50 | # ** load snap2 51 | logger.info(f"Loading snap2 for {sample}...") 52 | snap_file = os.path.join(snap2_dir, f"{sample}{suffix}") 53 | if not os.path.exists(snap_file): 54 | raise FileNotFoundError(f"{snap_file} not found.") 55 | snap2: sa2.AnnData = sa2.read(snap_file, backed = 'r') 56 | 57 | # ** read bed file 58 | logger.info("Reading bed file...") 59 | if not os.path.exists(bedfnm): 60 | raise FileNotFoundError(f"{bedfnm} not found.") 61 | 62 | 63 | # ** get pmat 64 | logger.info(f"Getting pmat for {sample}...") 65 | outfnm = os.path.join(out_dir, f"{sample}{out_suffix}") 66 | sa2.pp.make_peak_matrix(adata = snap2, 67 | inplace = False, 68 | file = outfnm, 69 | backend = "hdf5", 70 | peak_file = bedfnm, 71 | chunk_size = 10000, 72 | use_x = False) 73 | snap2.close() 74 | logger.info(f"Done.") 75 | 76 | 77 | -------------------------------------------------------------------------------- /03.peakcalling/src/main/resource/config.yaml: -------------------------------------------------------------------------------- 1 | system: imac 2 | project_dir: 3 | imac: /Users/szu/git-recipes/mouseBrainAtlas/CEMBA2 4 | encoder: /projects/ps-renlab/szu/projects/CEMBA2 5 | tscc: /projects/ps-renlab/szu/projects/CEMBA2 6 | subset_dir: subset 7 | callpeak_dir: callpeak 8 | cemba_all_dataset: src/main/resource/cemba.sa2.dlt2.anndataset.h5ad 9 | subset_barcode2group: src/main/resource/subset.barcode2group.csv 10 | 11 | -------------------------------------------------------------------------------- /03.peakcalling/src/main/resource/test_neuron_L4pc2sizes.csv: -------------------------------------------------------------------------------- 1 | 12-1-1-2,8982,4505,4477 2 | 12-1-1-3,6194,3110,3084 3 | -------------------------------------------------------------------------------- /03.peakcalling/src/main/shell/export_unionpeak.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | union_peakset=$1 3 | if [ ! -f ${union_peakset} ];then 4 | echo "${union_peakset} does not exist." 5 | exit 1 6 | fi 7 | 8 | sed -e "1d " ${union_peakset} | cut -f 1-3 > $2 9 | 10 | echo "done." 11 | -------------------------------------------------------------------------------- /03.peakcalling/src/main/shell/intersect_mergepeak.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # echo $1 4 | # echo $2 5 | # echo $3 6 | # echo done 7 | 8 | if [ ! -f $1 ]; then 9 | echo "$1 does not exist." 10 | exit 1 11 | fi 12 | 13 | if [ ! -f $2 ]; then 14 | echo "$2 does not exist." 15 | exit 1 16 | fi 17 | 18 | echo "intersect merge peak for $3 ." 19 | 20 | intersectBed -wa -a $1 -b <(sed -e "1d" $2) -nonamecheck \ 21 | | sort -k1,1 -k2,2n | uniq > $3 22 | 23 | echo "done." 
24 | -------------------------------------------------------------------------------- /04.nmf/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: allpeak_nmf novlpeak_nmf ppdcpeak_nmf 2 | # * run non-negative matrix factorization 3 | 4 | allpeak_nmf: src/main/pipeline/nmf.Snakefile src/main/resource/config.yaml 5 | -mkdir -p build/$@ 6 | cp $(word 2,$^) build/$@/config.yaml 7 | cp $< build/$@/Snakefile 8 | cd build/$@ && \ 9 | snakemake --config \ 10 | system=encoder \ 11 | tag=allpeak \ 12 | out=out/allpeak_nmf \ 13 | mod_from=150 mod_to=152 mod_by=2 n_rerun=3 \ 14 | subclass_order_meta=data/sa2.subclass.srt.txt \ 15 | peak_nm_file=data/sa2.final.peak.nms.txt \ 16 | cluster_nm_file=data/sa2.allpeak.subclass.nms.txt \ 17 | mat_pbyc_h5=data/sa2.allpeak.cbyp.mat.h5 \ 18 | -c 10 \ 19 | --snakefile Snakefile -R --rerun-incomplete \ 20 | --rerun-triggers mtime \ 21 | --skip-script-cleanup \ 22 | # --profile pbs-torque-conda 23 | 24 | 25 | novlpeak_nmf: src/main/pipeline/nmf.Snakefile src/main/resource/config.yaml 26 | -mkdir -p build/$@ 27 | cp $(word 2,$^) build/$@/config.yaml 28 | cp $< build/$@/Snakefile 29 | cd build/$@ && \ 30 | snakemake --config \ 31 | system=encoder \ 32 | tag=novlpeak \ 33 | out=out/novlppeak_nmf \ 34 | mod_from=150 mod_to=152 mod_by=2 n_rerun=3 \ 35 | subclass_order_meta=data/sa2.subclass.srt.txt \ 36 | peak_nm_file=data/sa2.novlpDHS.peak.nms.txt \ 37 | cluster_nm_file=data/sa2.novlpDHS.subclass.nms.txt \ 38 | mat_pbyc_h5=data/sa2.novlpDHS.cbyp.mat.h5 \ 39 | -c 10 \ 40 | --snakefile Snakefile -R --rerun-incomplete \ 41 | --rerun-triggers mtime \ 42 | --skip-script-cleanup \ 43 | # --profile pbs-torque-conda 44 | 45 | 46 | 47 | ppdcpeak_nmf: src/main/pipeline/nmf.Snakefile src/main/resource/config.yaml 48 | -mkdir -p build/$@ 49 | cp $(word 2,$^) build/$@/config.yaml 50 | cp $< build/$@/Snakefile 51 | cd build/$@ && \ 52 | snakemake --config \ 53 | system=encoder \ 54 | tag=ppdcpeak \ 55 | out=out/ppdcpeak_nmf \ 56 | mod_from=54 mod_to=56 mod_by=2 n_rerun=3 \ 57 | subclass_order_meta=data/sa2.subclass.srt.txt \ 58 | peak_nm_file=data/sa2.ppdc.peak.nms.txt \ 59 | cluster_nm_file=data/sa2.ppdc.subclass.nms.txt \ 60 | mat_pbyc_h5=data/sa2.ppdc.cbyp.mat.h5 \ 61 | -c 10 \ 62 | --snakefile Snakefile -R --rerun-incomplete \ 63 | --rerun-triggers mtime \ 64 | --skip-script-cleanup \ 65 | # --profile pbs-torque-conda 66 | -------------------------------------------------------------------------------- /04.nmf/src/main/R/02.nmfATAC.plotH.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # read results from ksklearn 4 | 5 | suppressPackageStartupMessages(library("argparse")) 6 | 7 | # create parser object 8 | parser <- ArgumentParser() 9 | 10 | # specify our desired options 11 | # by default ArgumentParser will add an help option 12 | parser$add_argument("-i", "--input", required=TRUE, help="input matrix") 13 | parser$add_argument("-o", "--output", required=TRUE, help="output file prefix") 14 | # get command line options, if help option encountered print help and exit, 15 | # otherwise if options not found on command line then set defaults, 16 | args <- parser$parse_args() 17 | 18 | 19 | library(data.table) 20 | dataH <- fread(args$input,sep="\t") 21 | 22 | library(pheatmap) 23 | library(RColorBrewer) 24 | library(viridis) 25 | library(dendsort) 26 | 27 | # scale by column 28 | #mx <- apply(dataH,2,scale) 29 | 30 | normUnity <- function(x){ 31 | sum <- sum(x) 32 
| out <- x / sum(x) 33 | } 34 | 35 | mx <- apply(dataH,2,normUnity) 36 | 37 | sort_hclust <- function(...) as.hclust(dendsort(as.dendrogram(...))) 38 | #mat_cluster_rows_H <- sort_hclust(hclust(dist(dataH))) 39 | mat_cluster_cols_H <- sort_hclust(hclust(dist(t(mx)))) 40 | 41 | quantile_breaks <- function(xs, n = 30) { 42 | breaks <- quantile(xs, probs = seq(0, 1, length.out = n)) 43 | breaks[!duplicated(breaks)] 44 | } 45 | 46 | # mat_breaks_H <- quantile_breaks(t(mx), n = 30) 47 | 48 | pdf(paste(args$output,".H.pdf",sep='')) 49 | pheatmap( 50 | mat = mx, 51 | scale = 'none', 52 | color = viridis(30), 53 | # color = viridis(length(mat_breaks_H) - 1), 54 | # breaks = mat_breaks_H, 55 | border_color = NA, 56 | cluster_cols = mat_cluster_cols_H, 57 | cluster_rows = F, 58 | # cluster_rows = mat_cluster_rows_H, 59 | show_colnames = TRUE, 60 | show_rownames = FALSE, 61 | drop_levels = TRUE, 62 | fontsize = 14, 63 | main = "decomp H" 64 | ) 65 | dev.off() 66 | 67 | 68 | nor01 <- function(x){ 69 | min <- min(x) 70 | max <- max(x) 71 | out <- (x - min) / (max - min) 72 | } 73 | 74 | -------------------------------------------------------------------------------- /04.nmf/src/main/R/02.nmfATAC.plotW.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # read results from ksklearn 4 | 5 | suppressPackageStartupMessages(library("argparse")) 6 | 7 | # create parser object 8 | parser <- ArgumentParser() 9 | 10 | # specify our desired options 11 | # by default ArgumentParser will add an help option 12 | parser$add_argument("-i", "--input", required=TRUE, help="input matrix") 13 | parser$add_argument("-o", "--output", required=TRUE, help="output file prefix") 14 | # get command line options, if help option encountered print help and exit, 15 | # otherwise if options not found on command line then set defaults, 16 | args <- parser$parse_args() 17 | 18 | 19 | library(data.table) 20 | dataW <- fread(args$input,sep="\t") 21 | 22 | library(pheatmap) 23 | library(RColorBrewer) 24 | library(viridis) 25 | library(dendsort) 26 | 27 | # scale by column 28 | #tmp <- apply(dataW,2,scale) 29 | 30 | normUnity <- function(x){ 31 | sum <- sum(x) 32 | out <- x / sum(x) 33 | } 34 | 35 | tmp <- apply(dataW,1,normUnity) 36 | tmp <- t(tmp) 37 | mx <- tmp[sample(nrow(tmp), 5000), ] 38 | 39 | sort_hclust <- function(...) 
-------------------------------------------------------------------------------- /04.nmf/src/main/R/02.nmfATAC.statBox.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # read results from sklearn NMF 4 | 5 | suppressPackageStartupMessages(library("argparse")) 6 | 7 | # create parser object 8 | parser <- ArgumentParser() 9 | 10 | # specify our desired options 11 | # by default ArgumentParser will add a help option 12 | parser$add_argument("-i", "--input", required=TRUE, help="input statH") 13 | parser$add_argument("-o", "--output", required=TRUE, help="output file prefix") 14 | # get command line options; if the help option is encountered, print help and exit; 15 | # otherwise, options missing from the command line fall back to their defaults 16 | args <- parser$parse_args() 17 | 18 | data <- read.table(args$input,sep="\t",head=F) 19 | 20 | staoutmx <- data.frame(row.names=c("Min","Q1","Median","Mean","Q3","Max","TopWhisker","BottomWhisker","Box1","Box2","Box3","UpWhisker","DnWhisker")) 21 | for (i in c(5,6,7)){ 22 | x <- data[,i] 23 | boxMx <- matrix(summary(x)) 24 | rownames(boxMx) <- c("Min","Q1","Median","Mean","Q3","Max") 25 | iqr <- IQR(x) 26 | q1 <- summary(x)[2] 27 | q3 <- summary(x)[5] 28 | TopWhisker <- min(max(x), q3 + 1.5 * iqr) 29 | BottomWhisker <- max(min(x), q1 - 1.5 * iqr) 30 | Box1 <- boxMx["Q1",] 31 | Box2 <- boxMx["Median",] - boxMx["Q1",] 32 | Box3 <- boxMx["Q3",] - boxMx["Median",] 33 | UpWhisker <- TopWhisker - boxMx["Q3",] 34 | DnWhisker <- boxMx["Q1",] - BottomWhisker 35 | boxMx <- rbind(boxMx,TopWhisker,BottomWhisker,Box1,Box2,Box3,UpWhisker,DnWhisker) 36 | colnames(boxMx) <- i 37 | staoutmx <- cbind(staoutmx,boxMx) 38 | } 39 | colnames(staoutmx) <- c("contributes","sparseness","entropy") 40 | 41 | cat(median(data$V6),"\n") 42 | 43 | k <- max(data$V3) 44 | n <- nrow(data) 45 | normInfoGain <- 1 - sum(data$V7) / (n * log2(k)) 46 | cat(normInfoGain) 47 | 48 | write.table(staoutmx, file=paste(args$output,".box.sta",sep=''), sep="\t", quote=F, col.names=T, row.names=T) 49 | 50 | 
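51 | # Derivation note: column V7 holds each row's entropy over its k module 52 | # loadings (0 when a row loads on a single module, log2(k) when spread evenly 53 | # over all k modules), so 54 | # normInfoGain <- 1 - sum(data$V7) / (n * log2(k)) 55 | # equals 1 for a perfectly modular factorization and 0 for an uninformative one.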
-------------------------------------------------------------------------------- /04.nmf/src/main/R/05.splitPeakByModule.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(optparse) 3 | packdir <- file.path(here::here(), "package/R") 4 | import::from(.from = "utils.R", .directory = packdir, 5 | checkArgsExistOrStop, prepareOutdir, checkFileExistOrStop) 6 | import::from(.from = "peak.R", .directory = packdir, 7 | loadStatPeak.NMF) 8 | import::from(.from = "cembav2env.R", .directory = packdir, 9 | cembav2env) 10 | 11 | op <- list( 12 | make_option(c("--nmfDir"), type = "character", 13 | default = "nmf_ppdc/out"), 14 | make_option(c("--module"), type = "integer", 15 | default = 54), 16 | make_option(c("--tag"), type = "character", default = "ppdc") 17 | ) 18 | 19 | args <- parse_args(OptionParser(option_list = op)) 20 | checkArgsExistOrStop(args) 21 | 22 | if(!dir.exists(args$nmfDir)) { 23 | stop(args$nmfDir, " does not exist.") 24 | } 25 | 26 | mod.nmf <- args$module 27 | tag <- args$tag 28 | 29 | outDir <- file.path(args$nmfDir, 30 | paste("nmf", tag, paste0("r", mod.nmf), "motif", sep = ".")) 31 | prepareOutdir(outDir) 32 | 33 | # * functions 34 | convertPeakToBed <- function(peakBed, peaknms, outFile = NULL) { 35 | r <- peakBed[peaknms, ] 36 | if(!is.null(outFile)) { 37 | write.table(x = r, file = outFile, quote = FALSE, sep = "\t", 38 | row.names = FALSE, col.names = FALSE) 39 | } 40 | return(r) 41 | } 42 | 43 | # * load peaks 44 | peakBed <- data.table::fread(cembav2env$peakBedFile, 45 | header = FALSE, sep = "\t", data.table = FALSE) 46 | colnames(peakBed) <- c("chrom", "start", "end", "name") 47 | rownames(peakBed) <- peakBed$name 48 | 49 | # * nmf modules 50 | nmfPeakStat <- loadStatPeak.NMF( 51 | file = file.path(args$nmfDir, 52 | paste("nmfPmat", tag, 53 | paste0("r", mod.nmf), "n0", "statW", sep = "."))) 54 | 55 | modules <- unique(nmfPeakStat$class0 + 1) 56 | 57 | # * save peaks from each module to a separate bed file 58 | invisible(lapply(modules, function(i) { 59 | outFile <- file.path(outDir, 60 | paste0("r", mod.nmf, "_n", i, ".cCREs.bed")) 61 | message("Writing peak bed file to: ", outFile) 62 | peaks <- with(nmfPeakStat, peak[class0 == (i-1)]) 63 | convertPeakToBed(peakBed = peakBed, peaknms = peaks, outFile = outFile) 64 | })) 65 | 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /04.nmf/src/main/resource/config.yaml: -------------------------------------------------------------------------------- 1 | system: imac 2 | python: 3 | imac: /Users/szu/mambaforge/bin/python 4 | tscc: /projects/ps-renlab/szu/miniconda3/envs/snATAC2/bin/python 5 | encoder: /projects/ps-renlab2/szu/miniconda3/envs/cicero/bin/python 6 | Rscript: 7 | imac: /usr/local/bin/Rscript 8 | tscc: /projects/ps-renlab/szu/miniconda3/envs/snATAC2/bin/Rscript 9 | encoder: /projects/ps-renlab2/szu/miniconda3/envs/cicero/bin/Rscript 10 | code_dir: 11 | imac: /Users/szu/git-recipes/mouseBrainAtlas/cembaV2 12 | tscc: /projects/ps-renlab2/szu/projects/CEMBA2 13 | encoder: /projects/ps-renlab2/szu/projects/CEMBA2 14 | work_dir: 15 | imac: /Users/szu/git-recipes/mouseBrainAtlas/cembaV2 16 | tscc: /oasis/tscc/scratch/szu/projects/CEMBA2 17 | encoder: /projects/ps-renlab2/szu/projects/CEMBA2 18 | homer: 19 | imac: /Users/szu/mambaforge/envs/bio/bin/findMotifsGenome.pl 20 | tscc: /projects/ps-renlab2/szu/miniconda3/envs/cicero/bin/findMotifsGenome.pl 21 | encoder: /projects/ps-renlab2/szu/miniconda3/envs/cicero/bin/findMotifsGenome.pl 22 | subclass_order_meta: meta/subclass.order.hc.csv 23 | peak_nm_file: data/peaks.txt 24 | cluster_nm_file: data/clusters.txt 25 | mat_pbyc_h5: data/cpm.cbyp.ppdc.h5 26 | tag: ppdc 27 | out: nmf_ppdc 28 | n_rerun: 3 29 | mod_from: 78 30 | mod_to: 80 31 | mod_by: 2 32 | use_detailed_mod: 1 33 | detailed_mod: 78a80 34 | mod_split: a 35 | module: 54 36 | 
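37 | # NOTE (assumption, inferred from the keys above rather than documented): when 38 | # use_detailed_mod is 1, detailed_mod lists explicit module numbers joined by 39 | # mod_split, e.g. "78a80" with mod_split "a" means modules 78 and 80, replacing 40 | # the mod_from/mod_to/mod_by arithmetic scan.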
-------------------------------------------------------------------------------- /05.cCREgene/sa2.cicero/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: run_cicero pdc_cicero 2 | 3 | snakemake := /home/szu/miniforge3/bin/snakemake 4 | # snakemake := /home/szu/mambaforge/envs/seurat/bin/snakemake 5 | run_cicero: src/main/pipeline/runCicero.Snakefile 6 | ${snakemake} --snakefile $< -R -c 9 \ 7 | --config \ 8 | threads=8 \ 9 | debug=0 \ 10 | queue=hotel \ 11 | walltime=16 \ 12 | --rerun-triggers mtime \ 13 | --skip-script-cleanup \ 14 | --profile pbs-torque-conda 15 | 16 | pdc_cicero: src/main/pipeline/pdc.Snakefile src/main/resource/config.yaml 17 | -mkdir -p build/$@ 18 | cp $(word 2,$^) build/$@/config.yaml 19 | cp $< build/$@/Snakefile 20 | cd build/$@ && \ 21 | ${snakemake} -c 20 --config \ 22 | threads=1 \ 23 | debug=0 \ 24 | queue=glean \ 25 | walltime=2 \ 26 | --snakefile Snakefile -R --rerun-incomplete \ 27 | --rerun-triggers mtime \ 28 | --skip-script-cleanup \ 29 | # --profile pbs-torque-conda 30 | 31 | getppdc_bedpe: src/main/shell/09.get.pos.neg.pdc.info.sh 32 | bash $< 33 | -------------------------------------------------------------------------------- /05.cCREgene/sa2.cicero/src/main/R/cicero_mouse_atlas.R: -------------------------------------------------------------------------------- 1 | # This script is prepared by Sai. 2 | library(Matrix) 3 | library(monocle3) 4 | # should be cicero for monocle3 5 | # devtools::install_github( 6 | # "cole-trapnell-lab/cicero-release", ref = "monocle3") 7 | library(cicero) 8 | 9 | projdir <- here::here() 10 | rscdir <- file.path(projdir, "src/main/resource") 11 | 12 | args <- commandArgs(trailingOnly = TRUE) 13 | CellType <- args[1] 14 | print(CellType) 15 | path <- "/oasis/tscc/scratch/smamde/mouse_atlas/mtx_files/" 16 | # Read in matrix data using the Matrix package 17 | indata <- Matrix::readMM(paste0(path,CellType,".mtx")) 18 | # Binarize the matrix 19 | indata@x[indata@x > 0] <- 1 20 | indata <- t(indata) 21 | # Format cell info 22 | cellinfo <- read.table(paste0(path,CellType,"_Barcodes.tsv")) 23 | row.names(cellinfo) <- cellinfo$X0 24 | names(cellinfo) <- "cells" 25 | 26 | # Format peak info 27 | peakinfo <- read.table("/oasis/tscc/scratch/smamde/mouse_atlas/peaks.tsv") 28 | names(peakinfo) <- c("chr", "bp1", "bp2") 29 | peakinfo$site_name <- paste(peakinfo$chr, peakinfo$bp1, peakinfo$bp2, sep="_") 30 | row.names(peakinfo) <- peakinfo$site_name 31 | 32 | row.names(indata) <- row.names(peakinfo) 33 | colnames(indata) <- row.names(cellinfo) 34 | 35 | # Make CDS 36 | input_cds <- suppressWarnings(new_cell_data_set(indata, 37 | cell_metadata = cellinfo, 38 | gene_metadata = peakinfo)) 39 | 40 | input_cds <- monocle3::detect_genes(input_cds) 41 | 42 | #Ensure there are no peaks included with zero reads 43 | input_cds <- input_cds[Matrix::rowSums(exprs(input_cds)) != 0,] 44 | 45 | input_cds <- detect_genes(input_cds) 46 | input_cds <- estimate_size_factors(input_cds) 47 | input_cds <- preprocess_cds(input_cds, method = "LSI") 48 | 49 | input_cds <- reduce_dimension(input_cds, reduction_method = 'UMAP', 50 | preprocess_method = "LSI") 51 | umap_coords <- reducedDims(input_cds)$UMAP 52 | 53 | 54 | cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = umap_coords) 55 | 56 | chromosome_length <- read.table("/oasis/tscc/scratch/smamde/mouse_atlas/mm10_chromosome_length.txt") 57 | 58 | conns <- run_cicero(cicero_cds, chromosome_length) 59 | 60 | 61 | 
saveRDS(conns,paste0(path,CellType,"_cicero_connections.Rds")) 62 | 63 | all_peaks <- row.names(exprs(input_cds)) 64 | write.csv(x = all_peaks, file = paste0(path,CellType,"_all_peaks.csv")) 65 | write.csv(x = conns, file = paste0(path,CellType,"_cicero_connections.csv")) 66 | 67 | -------------------------------------------------------------------------------- /05.cCREgene/sa2.cicero/src/main/R/run_cicero.R: -------------------------------------------------------------------------------- 1 | 2 | library(Matrix) 3 | library(monocle3) 4 | library(cicero) 5 | 6 | 7 | #CellType = "ABC_NN" 8 | args <- commandArgs(trailingOnly = TRUE) 9 | CellType <- args[1] 10 | print(CellType) 11 | path = "/oasis/tscc/scratch/smamde/mouse_atlas/mtx_files/" 12 | # Read in matrix data using the Matrix package 13 | indata <- Matrix::readMM(paste0(path,CellType,".mtx")) 14 | # Binarize the matrix 15 | indata@x[indata@x > 0] <- 1 16 | indata <- t(indata) 17 | # Format cell info 18 | cellinfo <- read.table(paste0(path,CellType,"_Barcodes.tsv")) 19 | row.names(cellinfo) <- cellinfo$X0 20 | names(cellinfo) <- "cells" 21 | 22 | # Format peak info 23 | peakinfo <- read.table("/oasis/tscc/scratch/smamde/mouse_atlas/peaks.tsv") 24 | names(peakinfo) <- c("chr", "bp1", "bp2") 25 | peakinfo$site_name <- paste(peakinfo$chr, peakinfo$bp1, peakinfo$bp2, sep="_") 26 | row.names(peakinfo) <- peakinfo$site_name 27 | 28 | row.names(indata) <- row.names(peakinfo) 29 | colnames(indata) <- row.names(cellinfo) 30 | 31 | # Make CDS 32 | input_cds <- suppressWarnings(new_cell_data_set(indata, 33 | cell_metadata = cellinfo, 34 | gene_metadata = peakinfo)) 35 | 36 | input_cds <- monocle3::detect_genes(input_cds) 37 | 38 | #Ensure there are no peaks included with zero reads 39 | input_cds <- input_cds[Matrix::rowSums(exprs(input_cds)) != 0,] 40 | 41 | input_cds <- detect_genes(input_cds) 42 | input_cds <- estimate_size_factors(input_cds) 43 | input_cds <- preprocess_cds(input_cds, method = "LSI") 44 | 45 | input_cds <- reduce_dimension(input_cds, reduction_method = 'UMAP', 46 | preprocess_method = "LSI") 47 | umap_coords <- reducedDims(input_cds)$UMAP 48 | 49 | 50 | cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = umap_coords) 51 | 52 | chromosome_length <- read.table("/oasis/tscc/scratch/smamde/mouse_atlas/mm10_chromosome_length.txt") 53 | 54 | conns <- run_cicero(cicero_cds, chromosome_length) 55 | 56 | 57 | saveRDS(conns,paste0(path,CellType,"_cicero_connections.Rds")) 58 | 59 | all_peaks <- row.names(exprs(input_cds)) 60 | write.csv(x = all_peaks, file = paste0(path,CellType,"_all_peaks.csv")) 61 | write.csv(x = conns, file = paste0(path,CellType,"_cicero_connections.csv")) 62 | 63 | -------------------------------------------------------------------------------- /05.cCREgene/sa2.cicero/src/main/R/run_cicero_shuffle.R: -------------------------------------------------------------------------------- 1 | library(Matrix) 2 | library(monocle3) 3 | library(cicero) 4 | 5 | #CellType = "ABC_NN" 6 | args <- commandArgs(trailingOnly = TRUE) 7 | CellType <- args[1] 8 | print(CellType) 9 | path = "/oasis/tscc/scratch/smamde/mouse_atlas/mtx_files/" 10 | path_save = "/oasis/tscc/scratch/smamde/mouse_atlas/shuffle_cicero/cicero_shuffle_results/" 11 | # Read in matrix data using the Matrix package 12 | indata <- Matrix::readMM(paste0(path,CellType,".mtx") ) 13 | 14 | # Format cell info 15 | cellinfo <- read.table(paste0(path,CellType,"_Barcodes.tsv")) 16 | row.names(cellinfo) <- cellinfo$X0 17 | names(cellinfo) <- "cells" 18 | 19 | # 
Format peak info 20 | peakinfo <- read.table("/oasis/tscc/scratch/smamde/mouse_atlas/peaks.tsv") 21 | names(peakinfo) <- c("chr", "bp1", "bp2") 22 | peakinfo$site_name <- paste(peakinfo$chr, peakinfo$bp1, peakinfo$bp2, sep="_") 23 | row.names(peakinfo) <- peakinfo$site_name 24 | 25 | row.names(indata) <- row.names(cellinfo) 26 | colnames(indata) <- row.names(peakinfo) 27 | 28 | mat<-indata 29 | r <- mat[ , sample(ncol(mat))] 30 | colnames(r) <- colnames(mat) 31 | rownames(r) <- rownames(mat) 32 | 33 | indata<-r 34 | indata@x[indata@x > 0] <- 1 35 | indata <- t(indata) 36 | 37 | # Make CDS 38 | input_cds <- suppressWarnings(new_cell_data_set(indata)) 39 | 40 | input_cds <- monocle3::detect_genes(input_cds) 41 | 42 | #Ensure there are no peaks included with zero reads 43 | input_cds <- input_cds[Matrix::rowSums(exprs(input_cds)) != 0,] 44 | 45 | input_cds <- detect_genes(input_cds) 46 | input_cds <- estimate_size_factors(input_cds) 47 | input_cds <- preprocess_cds(input_cds, method = "LSI") 48 | 49 | input_cds <- reduce_dimension(input_cds, reduction_method = 'UMAP', 50 | preprocess_method = "LSI") 51 | umap_coords <- reducedDims(input_cds)$UMAP 52 | 53 | 54 | cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = umap_coords) 55 | 56 | chromosome_length <- read.table("/oasis/tscc/scratch/smamde/mouse_atlas/mm10_chromosome_length.txt") 57 | 58 | conns <- run_cicero(cicero_cds, chromosome_length) 59 | 60 | 61 | saveRDS(conns,paste0(path_save,CellType,"_cicero_shuffle_connections.Rds")) 62 | 63 | all_peaks <- row.names(exprs(input_cds)) 64 | write.csv(x = all_peaks, file = paste0(path_save,CellType,"_all_shuffle_peaks.csv")) 65 | write.csv(x = conns, file = paste0(path_save,CellType,"_cicero_shuffle_connections.csv")) 66 | 67 | -------------------------------------------------------------------------------- /05.cCREgene/sa2.cicero/src/main/R/sa2.pdc.of.globalpeaks.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | projdir <- here::here() 3 | rpack <- file.path(projdir, "package/R") 4 | import::from(.from = "cembav2env.R", .directory = rpack, 5 | Sa2PeakCalling, cembav2env) 6 | globalPeaks <- Sa2PeakCalling$readBed4File( 7 | file.path(projdir, "20.nmf", "out", 8 | "peaks.from.global.modules.bed") 9 | ) 10 | 11 | pdc <- data.table::fread( 12 | file = cembav2env$sa2.all.pdc.info, header = TRUE, sep = "\t", 13 | data.table = FALSE) 14 | 15 | global.pdc <- pdc[pdc$cre2 %in% globalPeaks$name, ] |> 16 | group_by(cre1, cre2) |> 17 | slice_max(coaccess, n = 1) 18 | 19 | global.pdc.srt <- global.pdc |> arrange(desc(coaccess)) 20 | 21 | proximal <- vapply(global.pdc.srt$cre1, 22 | str_split_1, FUN.VALUE = rep("1",3), pattern = ":|-") |> 23 | t() |> 24 | as.data.frame() 25 | 26 | distal <- vapply(global.pdc.srt$cre2, 27 | str_split_1, FUN.VALUE = rep("1",3), pattern = ":|-") |> 28 | t() |> 29 | as.data.frame() 30 | 31 | global.pdc.srt.bedpe <- cbind(proximal, distal) 32 | global.pdc.srt.bedpe$coaccess <- global.pdc.srt$coaccess 33 | global.pdc.srt.bedpe$gene <- global.pdc.srt$gene 34 | 35 | write.table(global.pdc.srt.bedpe, 36 | file = file.path(projdir, "04.cCREgene", "sa2.cicero", 37 | "out/sa2pdcsum", "mba.whole.sa2subclass.globalbynmf.pdc.bedpe"), 38 | quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE) 39 | -------------------------------------------------------------------------------- /05.cCREgene/sa2.cicero/src/main/pipeline/runCicero.Snakefile: 
-------------------------------------------------------------------------------- 1 | envvars: 2 | "PATH" 3 | 4 | import os 5 | 6 | with open("subclass-name.txt", 'r') as f: 7 | subclasses = [l.strip() for l in f.readlines()] 8 | 9 | rscript_bin = "/home/szu/mambaforge/bin/Rscript" 10 | work_dir = "/projects/ps-renlab2/szu/projects/CEMBA_wmb_snATAC/05.cCREgene/sa2.cicero" 11 | flag_dir = os.path.join(work_dir, "flag_dir") 12 | log_dir = os.path.join(work_dir, "log_dir") 13 | 14 | walltime = config["walltime"] 15 | queue = config["queue"] 16 | if queue == "glean": 17 | if walltime > 8: 18 | walltime = 8 19 | 20 | rule all: 21 | input: 22 | # expand("{f}/runcicero_{sc}.done", f = flag_dir, sc = subclasses) 23 | expand("{f}/runcicero_{sc}_shuffle.done", f = flag_dir, sc = subclasses) 24 | 25 | rule run_cicero: 26 | output: 27 | touch(f"{flag_dir}/runcicero_{{sc}}.done") 28 | log: 29 | f"{log_dir}/runcicero_{{sc}}.log" 30 | threads: config["threads"] 31 | resources: 32 | walltime = walltime, 33 | queue = queue 34 | shell: 35 | """ 36 | {rscript_bin} {work_dir}/src/main/R/run_cicero.R {wildcards.sc} > {log} 2>&1 37 | """ 38 | 39 | 40 | rule run_cicero_shuf: 41 | output: 42 | touch(f"{flag_dir}/runcicero_{{sc}}_shuffle.done") 43 | log: 44 | f"{log_dir}/runcicero_{{sc}}_shuffle.log" 45 | threads: config["threads"] 46 | resources: 47 | walltime = walltime, 48 | queue = queue 49 | shell: 50 | """ 51 | {rscript_bin} {work_dir}/src/main/R/run_cicero_shuffle.R {wildcards.sc} > {log} 2>&1 52 | """ 53 | 
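54 | # A minimal sketch of the expand() pattern used above: snakemake fills the 55 | # braces itself, so the template must stay a plain string (an f-string would 56 | # interpolate at parse time, and "f" is also the file handle name above). E.g. 57 | # expand("{f}/runcicero_{sc}.done", f="flags", sc=["Sst_Gaba", "Oligo_NN"]) 58 | # == ["flags/runcicero_Sst_Gaba.done", "flags/runcicero_Oligo_NN.done"] 59 | # (subclass names here are illustrative)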
-------------------------------------------------------------------------------- /05.cCREgene/sa2.cicero/src/main/resource/config.yaml: -------------------------------------------------------------------------------- 1 | n_ds: 1000 2 | n_core: 2 3 | mm10_file: meta/mm10.chrom.sizes.lite 4 | tssUpDn1k_file: meta/gencode.vM23.gene.tssUpDn1k.bed 5 | peakannot_file: 04.cCREgene/sa2.cicero/src/main/resource/mba.whole.sa2.peakOvlpTSS.proximal.distal.ciceroPeakCoord.bed 6 | k_cicero: 50 7 | reduct_cicero: "UMAP" 8 | preprocess_cicero: "LSI" 9 | debug: 0 10 | atac_subclass_cpm_file: 04.cCREgene/sa2.cicero/out/sa2pdcsum/sa2.cpm.pmat.pbysc.distal.rds 11 | rdm_pdc_file: 04.cCREgene/sa2.cicero/out/sa2pdcsum/sa2.rdm.g2p.pdc.rds 12 | allen_l2_cpm_file: 04.cCREgene/sa2.cicero/out/sa2pdcsum/sa2.allen.avg.logCPM.gbysc.rds 13 | pdc_file: 04.cCREgene/sa2.cicero/out/sa2pdcsum/sa2.g2p.pdc.pair.rds 14 | chunk_size: 50000 15 | 16 | -------------------------------------------------------------------------------- /05.cCREgene/sa2.cicero/src/main/shell/05.mergeDistalProximalConns.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | usage() { 4 | echo "Usage: $0 -c [cicero_dir] -m [meta_col]" 5 | exit 1 6 | } 7 | 8 | while getopts ":c:m:" o; do 9 | case "${o}" in 10 | c) 11 | cicero_dir=${OPTARG};; 12 | m) 13 | meta_col=${OPTARG};; 14 | *) 15 | usage ;; 16 | esac 17 | done 18 | 19 | shift $((OPTIND-1)) 20 | 21 | if [ -z "${cicero_dir}" ] || [ -z "${meta_col}" ]; then 22 | usage 23 | fi 24 | 25 | echo "Merge distal-proximal conns: ${meta_col}" 26 | out_all_conns_sta="${cicero_dir}/mba.whole.${meta_col}.merge.conns.sta.all" 27 | cat ${cicero_dir}/${meta_col}.*.conns.sta \ 28 | | sed -e "1i cluster\ttotConns\tddConns\tppConns\tpdConns\tgeneN\tcreN" \ 29 | > ${out_all_conns_sta} 30 | out_all_pdc_sta="${cicero_dir}/mba.whole.${meta_col}.merge.pdc.sta.all" 31 | cat ${cicero_dir}/${meta_col}.*.pdc.sta \ 32 | | sed -e "1i cluster\ttotc\tgeneN\tcreN" \ 33 | > ${out_all_pdc_sta} 34 | 35 | ## All pdc without any filtering 36 | out_all_pdc="${cicero_dir}/mba.whole.${meta_col}.merge.pdc.all" 37 | cat ${cicero_dir}/${meta_col}.*.fitConns.res.alignv1.pdc \ 38 | | grep -v 'anno1' \ 39 | | sed -e "1i peak1\tcre1\tanno1\tpeak2\tcre2\tanno2\tcoaccess\tpval\tfdr\tcluster\tgene" \ 40 | > ${out_all_pdc} 41 | 42 | # NOTE: since different subclasses may give different scores to the same 43 | # pdc, there are repeated pdcs in this file. 44 | out_all_bedpe="${cicero_dir}/mba.whole.${meta_col}.merge.bedpe.all" 45 | sed '1d' ${out_all_pdc} \ 46 | | awk 'BEGIN{FS=OFS="\t"}{split($1,a,"_"); split($4,b,"_"); print a[1],a[2],a[3],b[1],b[2],b[3],$11"|"$5,$7,".","."}'\ 47 | | sort -k1,1 -k2,2n | uniq > ${out_all_bedpe} 48 | 49 | out_all_pdcpair="${cicero_dir}/mba.whole.${meta_col}.merge.pdc.pair.all" 50 | awk 'BEGIN{FS=OFS="\t"}{print $11"|"$5,$2}' ${out_all_pdc} \ 51 | | sed '1d' | sort | uniq > ${out_all_pdcpair} 52 | 53 | echo "# of cCRE" 54 | cut -f 5 ${out_all_pdc} | sed '1d' | sort | uniq | wc -l 55 | echo "# of genes" 56 | cut -f 11 ${out_all_pdc} | sed '1d' | sort | uniq | wc -l 57 | echo "# of pairs" 58 | sed '1d' ${out_all_pdc} | sort | uniq | wc -l 59 | 60 | echo "Merge all proximal-distal pairs' distances" 61 | out_all_dist="${cicero_dir}/mba.whole.${meta_col}.merge.pdc.dist.all" 62 | cat ${cicero_dir}/${meta_col}.*.pdc.dist \ 63 | | sed -e "1i conns\tdistance\tcluster" > ${out_all_dist} 64 | 65 | echo "Merge all genes' stat per cluster" 66 | out_all_peak2gene="${cicero_dir}/mba.whole.${meta_col}.merge.pdc.peak2gene.all" 67 | cat ${cicero_dir}/${meta_col}.*.pdc.peak2gene \ 68 | | sed -e "1i gene\tcnt\tcluster" > ${out_all_peak2gene} 69 | 70 | echo "Merge all peaks' stat per cluster" 71 | out_all_gene2peak="${cicero_dir}/mba.whole.${meta_col}.merge.pdc.gene2peak.all" 72 | cat ${cicero_dir}/${meta_col}.*.pdc.gene2peak \ 73 | | sed -e "1i cCREs\tcnt\tcluster" > ${out_all_gene2peak} 74 | 
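75 | # Example invocation (illustrative paths): 76 | # bash 05.mergeDistalProximalConns.sh -c out/sa2pdcsum -m sa2subclass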
-------------------------------------------------------------------------------- /05.cCREgene/sa2.cicero/src/main/shell/09.get.pos.neg.pdc.info.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | proj_dir="/Users/szu/git-recipes/mouseBrainAtlas/CEMBA2" 4 | cicero_dir="${proj_dir}/04.cCREgene/sa2.cicero/out/sa2pdcsum" 5 | meta_col="sa2subclass" 6 | allpeak="${proj_dir}/supple.07.peakcalling.allinone/mba.whole.sa2.final.peak.srt.bed" 7 | pdc_bedpe=${cicero_dir}/mba.whole.${meta_col}.merge.bedpe.all 8 | pdc_pair=${cicero_dir}/mba.whole.${meta_col}.merge.pdc.pair.all 9 | 10 | # * function 11 | sum_pos_or_neg_pdc () { 12 | local class=$1 13 | local cor_method=$2 14 | local class_pdc=${cicero_dir}/mba.whole.${meta_col}.${cor_method}.${class}.pdc.alignv1.tsv 15 | 16 | # outfiles 17 | local class_bedpe=${cicero_dir}/mba.whole.${meta_col}.${cor_method}.${class}.pdc.bedpe 18 | local class_pair=${cicero_dir}/mba.whole.${meta_col}.${cor_method}.${class}.pdc.pair 19 | local class_CREs=${cicero_dir}/mba.whole.${meta_col}.${cor_method}.${class}.pdc.CREs 20 | local class_bed=${cicero_dir}/mba.whole.${meta_col}.${cor_method}.${class}.pdc.CREs.bed 21 | 22 | # * process 23 | echo "$class under correlation method: ${cor_method}" 24 | 25 | # get bedpe 26 | # $8 is the co-accessibility score 27 | # the results have repeats since co-accessibility is inferred at the subclass level and then 28 | # pooled together 29 | join -1 1 -2 7 <(cut -f 1 ${class_pdc} | sort) <(sort -k7,7 ${pdc_bedpe}) -t$'\t' \ 30 | | awk 'BEGIN{FS=OFS="\t"}{print $2,$3,$4,$5,$6,$7,$1,$8,$9,$10}' \ 31 | | sort -k1,1 -k2,2n | uniq > ${class_bedpe} 32 | # get pair 33 | join -1 1 -2 1 <(cut -f 1 ${class_pdc} | sort) <(sort -k1,1 ${pdc_pair}) -t$'\t' \ 34 | | sort -k1,1 -k2,2n | uniq > ${class_pair} 35 | # get CREs 36 | cut -f 1 ${class_pdc} | tr '|' '\t' | cut -f 2 | sort | uniq | sed '1d'> ${class_CREs} 37 | # get bed 38 | join -1 1 -2 4 ${class_CREs} <(sort -k4,4 ${allpeak}) -t$'\t' \ 39 | | awk 'BEGIN{FS=OFS="\t"}{print $2,$3,$4,$1}' \ 40 | | sort -k1,1 -k2,2n | uniq > ${class_bed} 41 | echo 'Done' 42 | } 43 | 44 | sum_pos_or_neg_pdc pos pearson 45 | sum_pos_or_neg_pdc neg pearson 46 | 47 | 48 | -------------------------------------------------------------------------------- /05.cCREgene/sa2.cicero/src/main/shell/alignv1.to.bedpe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | all_pdc_bedpe=$1 4 | pdc_alignv1=$2 5 | 6 | join -1 1 -2 7 <(cut -f 1 ${pdc_alignv1} | sort) <(sort -k7,7 ${all_pdc_bedpe}) -t$'\t' \ 7 | | awk 'BEGIN{FS=OFS="\t"}{print $2,$3,$4,$5,$6,$7,$1,$8,$9,$10}' \ 8 | | sort -u -k1,1 -k2,2n -k8,8nr 9 | 
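10 | # Usage sketch (illustrative file names; the sorted bedpe goes to stdout): 11 | # bash alignv1.to.bedpe.sh mba.whole.sa2subclass.merge.bedpe.all \ 12 | #   mba.whole.sa2subclass.pearson.pos.pdc.alignv1.tsv > pos.pdc.srt.bedpe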
-------------------------------------------------------------------------------- /05.cCREgene/sa2.cicero/src/main/shell/sa2.all.distal.peaks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | sum_dir="/Users/szu/git-recipes/mouseBrainAtlas/CEMBA2/04.cCREgene/sa2.cicero/out/sa2pdcsum" 4 | all_pdc_file="${sum_dir}/mba.whole.sa2subclass.merge.pdc.all" 5 | distal_peak_nm="${sum_dir}/mba.whole.sa2subclass.all.distal.peaks.from.pdc.all.txt" 6 | distal_peak_bed="${sum_dir}/mba.whole.sa2subclass.all.distal.peaks.from.pdc.all.bed" 7 | 8 | awk 'BEGIN{FS=OFS="\t"} NR>1 {print $5}' ${all_pdc_file} \ 9 | | sort | uniq > ${distal_peak_nm} 10 | 11 | awk -F '[:-]' 'BEGIN{OFS="\t"}{print $1,$2,$3}' ${distal_peak_nm} \ 12 | > ${distal_peak_bed} 13 | 14 | 15 | -------------------------------------------------------------------------------- /05.cCREgene/sa2.cicero/src/main/shell/supple.02.annotPeakBasedOnTSS.sh: -------------------------------------------------------------------------------- 1 | projroot="/projects/ps-renlab2/szu/projects/CEMBA2" 2 | allpeakFile="${projroot}/supple.07.peakcalling.allinone/mba.whole.sa2.final.peak.srt.bed" 3 | tssBed="${projroot}/meta/gencode.vM23.gene.tssUpDn1k.bed" 4 | outdir="${projroot}/04.cCREgene/sa2.cicero/src/main/resource" 5 | allpeakOvlpTssBed1="${outdir}/mba.whole.sa2.peakOvlpTSS.bed" 6 | allpeakOvlpTssBed2="${outdir}/mba.whole.sa2.peakOvlpTSS.proximal.distal.bed" 7 | allpeakOvlpTssBed3="${outdir}/mba.whole.sa2.peakOvlpTSS.proximal.distal.ciceroPeakCoord.bed" 8 | 9 | 10 | ## annotate distal vs proximal peaks 11 | intersectBed -wao -f 0.5 -a ${allpeakFile} -b ${tssBed} > ${allpeakOvlpTssBed1} 12 | 13 | awk 'BEGIN{FS=OFS="\t"}{if($5 != ".") {print $1,$2,$3,$4,"proximal",$11} else {print $1,$2,$3,$4,"distal","nan"}}' \ 14 | ${allpeakOvlpTssBed1} \ 15 | | sort -k1,1 -k2,2n | uniq > ${allpeakOvlpTssBed2} 16 | 17 | awk 'BEGIN{FS=OFS="\t"}{print $1"_"$2"_"$3,$4,$5,$6}' ${allpeakOvlpTssBed2} \ 18 | | sort -k1,1 > ${allpeakOvlpTssBed3} 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /06.motifanalysis/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: ppdcpeak_motif 2 | snakemake := /home/szu/miniforge3/bin/snakemake 3 | # * run HOMER known-motif enrichment on the NMF peak modules 4 | ppdcpeak_motif: src/main/pipeline/motif.Snakefile 5 | -mkdir -p build/$@ 6 | cp $< build/$@/Snakefile 7 | cd build/$@ && \ 8 | ${snakemake} --config \ 9 | system=mediator \ 10 | tag=ppdcpeak \ 11 | out=ppdcpeak \ 12 | njob=4 \ 13 | module=54 \ 14 | -c 20 \ 15 | --snakefile Snakefile -R --rerun-incomplete \ 16 | --rerun-triggers mtime \ 17 | --skip-script-cleanup \ 18 | # --profile pbs-torque-conda 19 | 20 | clean_homer_tag: 21 | -rm out/ppdcpeak/flag/nmf.ppdcpeak.homer.*.n*.done 22 | 
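23 | # Example: clear stale HOMER flags, then rerun motif enrichment on the 54 24 | # ppdc NMF modules: 25 | # make clean_homer_tag && make ppdcpeak_motif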
-------------------------------------------------------------------------------- /06.motifanalysis/README.org: -------------------------------------------------------------------------------- 1 | * SCENICPLUS 2 | ** motif database: 3 | - */oasis/tscc/scratch/smamde/scenic_plus_mouse_database* 4 | ** NOTE: 5 | - the outputs from scenicplus may share the same file names, so we need to check 6 | the outputs to make sure no conflicting files are generated across different tasks. 7 | 8 | 9 | -------------------------------------------------------------------------------- /06.motifanalysis/src/main/R/05.splitPeakByModule.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(optparse) 3 | packdir <- file.path(here::here(), "package/R") 4 | import::from(.from = "utils.R", .directory = packdir, 5 | checkArgsExistOrStop, prepareOutdir, checkFileExistOrStop) 6 | import::from(.from = "peak.R", .directory = packdir, 7 | loadStatPeak.NMF) 8 | import::from(.from = "cembav2env.R", .directory = packdir, 9 | cembav2env, Sa2PeakCalling) 10 | 11 | op <- list( 12 | make_option(c("--nmfDir"), type = "character", 13 | default = "nmf_ppdc/out"), 14 | make_option(c("--module"), type = "integer", 15 | default = 54), 16 | make_option(c("--tag"), type = "character", default = "ppdc") 17 | ) 18 | 19 | args <- parse_args(OptionParser(option_list = op)) 20 | checkArgsExistOrStop(args) 21 | 22 | if(!dir.exists(args$nmfDir)) { 23 | stop(args$nmfDir, " does not exist.") 24 | } 25 | 26 | mod.nmf <- args$module 27 | tag <- args$tag 28 | 29 | outDir <- file.path(args$nmfDir, 30 | paste("nmf", tag, paste0("r", mod.nmf), "motif", sep = ".")) 31 | prepareOutdir(outDir) 32 | 33 | # * functions 34 | convertPeakToBed <- function(peakBed, peaknms, outFile = NULL) { 35 | r <- peakBed[peaknms, ] 36 | if(!is.null(outFile)) { 37 | write.table(x = r, file = outFile, quote = FALSE, sep = "\t", 38 | row.names = FALSE, col.names = FALSE) 39 | } 40 | return(r) 41 | } 42 | 43 | # * load peaks 44 | peakBed <- data.table::fread(Sa2PeakCalling$finalpeakBedFile, 45 | header = FALSE, sep = "\t", data.table = FALSE) 46 | colnames(peakBed) <- c("chrom", "start", "end", "name") 47 | rownames(peakBed) <- peakBed$name 48 | 49 | # * nmf modules 50 | nmfPeakStat <- loadStatPeak.NMF( 51 | file = file.path(args$nmfDir, 52 | paste("nmfPmat", tag, 53 | paste0("r", mod.nmf), "n0", "statW", sep = "."))) 54 | 55 | modules <- unique(nmfPeakStat$class0 + 1) 56 | 57 | # * save peaks from each module to a separate bed file 58 | invisible(lapply(modules, function(i) { 59 | outFile <- file.path(outDir, 60 | paste0("r", mod.nmf, "_n", i, ".cCREs.bed")) 61 | message("Writing peak bed file to: ", outFile) 62 | peaks <- with(nmfPeakStat, peak[class0 == (i-1)]) 63 | convertPeakToBed(peakBed = peakBed, peaknms = peaks, outFile = outFile) 64 | })) 65 | 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /06.motifanalysis/src/main/pipeline/motif.Snakefile: -------------------------------------------------------------------------------- 1 | envvars: 2 | "PATH" 3 | 4 | import os 5 | import pyprojroot 6 | import sys 7 | 8 | proj_dir = str(pyprojroot.here()) 9 | 10 | mod = config['module'] 11 | myrange = list(range(1, mod + 1)) 12 | 13 | Rscript_bin = "/home/szu/miniforge3/envs/r/bin/Rscript" 14 | homer_bin = "/home/szu/miniforge3/envs/r/bin/findMotifsGenome.pl" 15 | code_project_dir = "/projects/ps-renlab2/szu/projects/CEMBA2" 16 | motif_sub_dir = "24.motifanalysis" 17 | 18 | code_dir = os.path.join(code_project_dir, motif_sub_dir, "src/main") 19 | work_dir = os.path.join(proj_dir, "24.motifanalysis", "out") 20 | tag = config['tag'] 21 | nmf_dir = os.path.join(proj_dir, "20.nmf", "out", 22 | f"{tag}_nmf", "out") 23 | out = config['out'] 24 | out_dir = os.path.join(work_dir, out) 25 | log_dir = f"{work_dir}/{out}/log" 26 | flag_dir = f"{work_dir}/{out}/flag" 27 | motif_dir = f"{out_dir}/nmf.{tag}.r{mod}.motif" 28 | for d in [out_dir, log_dir, flag_dir, motif_dir]: 29 | os.makedirs(d, exist_ok = True) 30 | njob = config["njob"] 31 | nmf_bed_dir = os.path.join(nmf_dir, f"nmf.{tag}.r{mod}.motif") 32 | 33 | rule all: 34 | input: 35 | f"{flag_dir}/nmf.{tag}.splitPeakByModule.{mod}.done", 36 | expand("{d}/nmf.{t}.homer.{m}.n{i}.done", 37 | d = flag_dir, t = tag, m = mod, i = myrange) 38 | 39 | 40 | rule splitPeakByModule: 41 | output: 42 | touch(f"{flag_dir}/nmf.{tag}.splitPeakByModule.{mod}.done") 43 | log: 44 | f"{log_dir}/nmf.{tag}.splitPeakByModule.{mod}.log" 45 | shell: 46 | """ 47 | {Rscript_bin} {code_dir}/R/05.splitPeakByModule.R --nmfDir {nmf_dir} \ 48 | --module {mod} --tag {tag} 2> {log} 49 | """ 50 | rule findMotif: 51 | input: 52 | f"{flag_dir}/nmf.{tag}.splitPeakByModule.{mod}.done" 53 | output: 54 | touch(expand("{d}/nmf.{t}.homer.{m}.n{{i}}.done", d = flag_dir, t = tag, m = mod)) 55 | log: 56 | expand("{d}/nmf.{t}.homer.{m}.n{{i}}.log", d = log_dir, t = tag, m = mod) 57 | shell: 58 | """ 59 | {homer_bin} {nmf_bed_dir}/r{mod}_n{wildcards.i}.cCREs.bed \ 60 | mm10 {motif_dir}/homer_n{wildcards.i} \ 61 | -nomotif -size given -p {njob} 2> {log} 62 | """ 63 | 
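64 | # HOMER flag notes: "-size given" scores each cCRE over its actual coordinates 65 | # instead of a fixed window around the peak center, and "-nomotif" skips de novo 66 | # motif discovery so only known-motif enrichment is reported.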
-------------------------------------------------------------------------------- /06.motifanalysis/src/main/python/test.scienicplus.py: -------------------------------------------------------------------------------- 1 | import pycistarget 2 | import pyranges as pr 3 | import pandas as pd 4 | from scenicplus.wrappers.run_pycistarget import run_pycistarget 5 | import glob 6 | import os 7 | from pycistarget.utils import region_names_to_coordinates 8 | 9 | import pyprojroot 10 | 11 | proj_dir = str(pyprojroot.here()) 12 | 13 | nvolp_peak_fnm = os.path.join(proj_dir, "18.snap2_peakcalling", 14 | "out/scfilter", "cembav2.nonOvlpDHS.bed") 15 | 16 | peaks = pd.read_csv(nvolp_peak_fnm, sep = "\t", header = None) 17 | peaks = peaks.rename(columns = {0: "Chromosome", 1: "Start", 2: "End", 3: "Name"}) 18 | 19 | region_sets = {'DARs': 20 | {'novlp': pr.PyRanges(peaks[["Chromosome", "Start", "End"]])} 21 | } 22 | db_path = os.path.join(proj_dir, "24.scenicplus/data/scenicplus_database") 23 | rankings_db = os.path.join(db_path, "mm10_screen_v10_clust.regions_vs_motifs.rankings.feather") 24 | scores_db = os.path.join(db_path, "mm10_screen_v10_clust.regions_vs_motifs.scores.feather") 25 | motif_annotation = os.path.join(db_path, "motifs-v10nr_clust-nr.mgi-m0.001-o0.0.tbl") 26 | 27 | out_dir = os.path.join(proj_dir, "24.scenicplus/out/test_scenicplus") 28 | 29 | motif_res = run_pycistarget( 30 | region_sets = region_sets, 31 | species = "mus_musculus", 32 | save_path = out_dir, 33 | ctx_db_path = rankings_db, 34 | path_to_motif_annotations = motif_annotation, 35 | run_without_promoters = True, 36 | annotation_version = "v10nr_clust", 37 | n_cpu = 4, 38 | ignore_reinit_error = True, 39 | ) 40 | -------------------------------------------------------------------------------- /07.m3C/README.org: -------------------------------------------------------------------------------- 1 | * Proximal-distal pairs verification from m3C data. 2 | - Sst Gaba 3 | - Pvalb Gaba 4 | - CBX MLI Megf11 Gaba 5 | - Vip Gaba 6 | - CA1-ProS Glut 7 | - CB Granule Glut 8 | - L6 CT CTX Glut 9 | - L2/3 IT CTX Glut 10 | - Oligo NN 11 | - Astro-TE NN 12 | - Microglia NN 13 | - Bergmann NN 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /07.m3C/hic2/hic2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | while read s; 3 | do nohup bash /projects/ps-renlab2/y2xie/projects/44.colab/szu_mba/scripts/hic_APA_subtype.sh $s & 4 | done < subclass.txt -------------------------------------------------------------------------------- /08.GRN/src/main/python/02.mergeGimme.py: -------------------------------------------------------------------------------- 18 | def load_tfidf(f) -> pd.DataFrame: 19 | import pyarrow.parquet as pq 20 | r = pq.read_table(f) 21 | return r.to_pandas() 22 | 23 | # * load groups 24 | proj_dir = str(pyprojroot.here()) 25 | group_file = os.path.join(proj_dir, "meta", "sa2.subclass.srt.txt") 26 | with open(group_file) as f: 27 | lines = f.readlines() 28 | groups = [l.strip() for l in lines if len(l.strip()) > 1] 29 | groups = [g for g in groups if "Hypendymal_NN" not in g] 30 | 31 | with open("pdc.suffix.txt", mode = "r") as f: 32 | parts = [l.strip() for l in f.readlines() if len(l) > 1] 33 | 34 | # * check gimmemotifs 35 | tfscan_dir = os.path.join(proj_dir, "22.sa2GRN", "out/tfscan") 36 | prefix = "sa2subclass" 37 | 38 | # about 100G RAM 39 | tfidfs = [load_tfidf(f"{tfscan_dir}/{prefix}.{i}.pdc.baseGRN.df.parquet") 40 | for i in groups] 41 | # another 100G RAM 42 | r = pd.concat(tfidfs, axis = 0, ignore_index= True) 43 | r = r.fillna(int(0)) 44 | del tfidfs 45 | # 520M 46 | r.to_parquet(f"{tfscan_dir}/{prefix}.all.baseGRN.df.parquet") 47 | 48 | # stat 49 | tfCount = r.sum(axis = 0, numeric_only = True) 50 | peakCount = r.sum(axis = 1, numeric_only = True) 51 | 52 | 53 | 54 | 55 | 
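56 | # Shape note (descriptive): each per-subclass parquet holds a peak-by-TF 57 | # motif-hit table, so after the concat above tfCount gives, per TF, the number 58 | # of peaks carrying its motif (column sums), and peakCount gives, per peak, the 59 | # number of TF motifs scanned into it (row sums).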
-------------------------------------------------------------------------------- /08.GRN/src/main/python/05.plot.powerlaw.py: -------------------------------------------------------------------------------- 1 | from os.path import exists 2 | import pandas as pd 3 | 4 | # pin numpy at 1.23.0 because: 5 | # - numba (a celloracle dependency) needs numpy <= 1.24, but 6 | # - seaborn does not work with numpy == 1.24.0 7 | 8 | import numpy as np 9 | import os, sys, shutil, importlib, glob 10 | from tqdm.notebook import tqdm 11 | import celloracle as co 12 | from celloracle import motif_analysis as ma 13 | from genomepy import Genome 14 | import scanpy as sc 15 | import pyarrow.parquet as pq 16 | import pyprojroot 17 | import matplotlib.pyplot as plt 18 | 19 | proj_root = pyprojroot.here() 20 | print(proj_root) 21 | co_base_sc_dir = os.path.join(proj_root, "22.sa2GRN", 22 | "out", "GRN", 23 | "baseGRN_subclass") 24 | 25 | out_dir = os.path.join(proj_root, "22.sa2GRN", 26 | "out", "powerlaw") 27 | if not os.path.exists(out_dir): 28 | os.makedirs(out_dir, exist_ok = True) 29 | 30 | subclass = "Astro-TE_NN"  # do not reuse "sc": it is the scanpy alias above 31 | link_sc = co.utility.load_hdf5( 32 | os.path.join(co_base_sc_dir, f"GRN.{subclass}.celloracle.links")) 33 | 34 | print(link_sc)  # quick inspection of the Links object 35 | print(link_sc.cluster) 36 | 37 | # use default parameters for filtering 38 | # p=0.001, weight="coef_abs", threshold_number=10000 39 | link_sc.filter_links() 40 | link_sc.plot_degree_distributions(plot_model=True, 41 | save = os.path.join(out_dir, subclass)) 42 | -------------------------------------------------------------------------------- /08.GRN/src/main/resource/config.yaml: -------------------------------------------------------------------------------- 1 | system: mediator 2 | python_bin: 3 | imac: /Users/szu/mambaforge/envs/celloracle/bin/python 4 | tscc: /projects/ps-renlab2/szu/miniconda3/envs/celloracle/bin/python 5 | encoder: /projects/ps-renlab2/szu/miniconda3/envs/celloracle/bin/python 6 | mediator: /home/szu/miniforge3/envs/sa2/bin/python 7 | code_project_dir: 8 | imac: /Users/szu/git-recipes/mouseBrainAtlas/cembaV2 9 | tscc: /projects/ps-renlab2/szu/projects/CEMBA2 10 | encoder: /projects/ps-renlab2/szu/projects/CEMBA2 11 | mediator: /projects/ps-renlab2/szu/projects/CEMBA2 12 | work_project_dir: 13 | imac: /Users/szu/git-recipes/mouseBrainAtlas/cembaV2 14 | tscc: /oasis/tscc/scratch/szu/projects/CEMBA2 15 | encoder: /projects/ps-renlab2/szu/projects/CEMBA2 16 | mediator: /projects/ps-renlab2/szu/projects/CEMBA2 17 | local_dir: 22.sa2GRN 18 | pdc_suffix: pdc.suffix.txt 19 | tfscan_dir: out/tfscan 20 | threshold_gimme: 10 21 | subclass4GRN: allen.subclass.chosen.txt 22 | allenRNA_dir: src/main/resource/sa2.allen.logCPM.vf3281.ds1000.subclass.specific 23 | GRN_dir: out/GRN 24 | 25 | -------------------------------------------------------------------------------- /09.cCRE_conservation/01.reciLiftOver.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # * mm10 lift to hg38 4 | liftOver whole.mouse.brain.cCREs.bed mm10ToHg38.over.chain.gz -minMatch=0.5 peak.conserve0.5.bed peak.unMapped0.5 5 | 6 | # * hg38 lift back to mm10 7 | liftOver peak.conserve0.5.bed hg38ToMm10.over.chain.gz -minMatch=0.5 peak.conserve0.5.reciprocal0.5.bed peak.conserve0.5.unMapped0.5 8 | 9 | -------------------------------------------------------------------------------- /11.deeplearning/README.org: -------------------------------------------------------------------------------- 1 | * Install tensorflow 2 | From Kangli: 3 | this is how I installed and tested TensorFlow on mediator: 4 | #+BEGIN_SRC sh 5 | conda create -n tensorflow python=3.10 6 | conda activate tensorflow 7 | mamba install -c conda-forge cudatoolkit=11.8.0 8 | python3 -m pip install nvidia-cudnn-cu11==8.6.0.163 tensorflow==2.13.* 9 | mkdir -p $CONDA_PREFIX/etc/conda/activate.d 10 | echo 'CUDNN_PATH=$(dirname $(python -c "import nvidia.cudnn;print(nvidia.cudnn.__file__)"))' >> $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh 11 | echo 'export LD_LIBRARY_PATH=$CUDNN_PATH/lib:$CONDA_PREFIX/lib/:$LD_LIBRARY_PATH' >> $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh 12 | source $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh 13 | # Verify install: 14 | python3 -c "import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))" 15 | # for no TensorRT 16 | pip install nvidia-pyindex 17 | pip install nvidia-tensorrt 18 | python3 -c "import tensorrt; import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))" 19 | #+END_SRC 20 | -------------------------------------------------------------------------------- /11.deeplearning/src/main/resource/mappedHMB.txt: -------------------------------------------------------------------------------- 1 | ACBGM 2 | ASCNT_1 3 | ASCNT_2 4 | ASCNT_3 5 | ASCT_1 6 | ASCT_2 7 | ASCT_3 8 | CBGRC 9 | CNGA_1 10 | CNGA_2 11 | COP 12 | CT_1 13 | CT_2 14 | D12NAC 15 | D1CaB 16 | D1Pu 17 | D2CaB 18 | D2Pu 19 | ET 20 | ICGA_2 21 | ITL23_1 22 | ITL23_2 23 | ITL23_3 24 | 
ITL23_5 25 | ITL45_1 26 | ITL45_2 27 | ITL5_1 28 | ITL5_2 29 | ITL5_3 30 | ITL5_4 31 | ITL6_1_1 32 | ITL6_1_2 33 | ITL6_2_1 34 | ITL6_2_2 35 | L6B_1 36 | L6B_2 37 | LAMP5 38 | MGC_1 39 | MGC_2 40 | MSN_1 41 | NP_1 42 | NP_3 43 | OGC_1 44 | OGC_2 45 | OGC_3 46 | OPC 47 | PIR 48 | PVALB_1 49 | PVALB_2 50 | PVALB_3 51 | PVALB_4 52 | PV_ChCs 53 | SNCG_1 54 | SNCG_3 55 | SNCG_4 56 | SNCG_5 57 | SST_1 58 | SST_2 59 | SST_3 60 | SST_4 61 | SST_5 62 | VIP_1 63 | VIP_2 64 | VIP_3 65 | VIP_4 66 | VIP_5 67 | VIP_6 68 | VIP_7 69 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Songpeng Zu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /manuscript_figures/sa2.sc2region.R: -------------------------------------------------------------------------------- 1 | source("paper.R") 2 | scnmMap <- cembav2env$load.sa2.subclass.names() 3 | rownames(scnmMap) <- scnmMap$subclass_id_label 4 | allenSubclass2Region <- Sa2Integration$getAllenSubclass2Region() 5 | rownames(allenSubclass2Region) <- allenSubclass2Region$subclass_id_label 6 | 7 | mrMeta <- list( 8 | Telencephalon = c("Isocortex", "HPF", "OLF", "AMY", "STR", "PAL"), 9 | Diencephalon = c("TH", "HY"), 10 | Midbrain = "MB", 11 | Hindbrain = c("Pons", "MY"), 12 | Cerebellum = "CB" 13 | ) 14 | mr2r <- data.frame( 15 | mr = c( 16 | c("Isocortex", "HPF", "OLF", "AMY", "STR", "PAL", "CNU"), 17 | c("TH", "HY"), 18 | c("MB", "MB-PONS"), 19 | c("Pons", "MY", "HB"), 20 | "CB"), 21 | r = c(rep("Telencephalon", 7), 22 | rep("Diencephalon", 2), 23 | rep("Midbrain", 2), 24 | rep("Hindbrain", 3), 25 | "Cerebellum") 26 | ) 27 | rownames(mr2r) <- mr2r$mr 28 | 29 | r2Color <- data.frame( 30 | r = c("Telencephalon", "Diencephalon", "Midbrain", 31 | "Hindbrain", "Cerebellum", "Non-Telencephalon"), 32 | color = c("#00688B", "#F15F30", "#74E44B", 33 | "#788FC8", "#DEB34C", "#999999") 34 | ) 35 | 36 | # use this to update mr2r 37 | allenmrs <- allenSubclass2Region[, "MajorRegionRateTop3"] |> 38 | lapply(X = _, Sa2Integration$extractAllenRegionWithScore) |> 39 | do.call(rbind, args = _) |> 40 | (\(x) x[!is.na(x[, 2]), ])() |> 41 | (\(x) x$region)() |> 42 | unique() 43 | 44 | # map subclass to region based on their top region information 45 | sc2r <- allenSubclass2Region[ , "MajorRegionRateTop3"] |> 46 | lapply(X = _, \(x) { 47 | r2score <- Sa2Integration$extractAllenRegionWithScore(x) |> 48 | mutate(r = mr2r[region, "r"]) |> 49 | group_by(r) |> 50 | summarise(r_s = sum(score)) |> 51 | slice_max(r_s, n = 1) 52 | }) |> do.call(rbind, args = _) |> 53 | as.data.frame() 54 | 55 | rownames(scnmMap) <- scnmMap$subclass_id_label 56 | rownames(sc2r) <- scnmMap[allenSubclass2Region$subclass_id_label, 57 | "subclass_label_peak"] 58 | 59 | write.table(sc2r, 60 | file = file.path(projdir, "meta", "sa2.subclass2region2score.csv"), 61 | sep = ",", 62 | col.names = TRUE, row.names = TRUE, quote = FALSE) 63 | 64 | -------------------------------------------------------------------------------- /meta/BICCN.BrainRegionMetadata.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondpie/CEMBA_wmb_snATAC/4605d86a83fe45a38e656ba87df85fe99762f358/meta/BICCN.BrainRegionMetadata.xlsx -------------------------------------------------------------------------------- /meta/allen.region.to.main.region.v2.txt: -------------------------------------------------------------------------------- 1 | allenRegionLabel@RegionFromJingtian@MajorRegionCEMBA@MajorRegionLabel@MajorRegionColor 2 | PONS@Pons@Pons@Pons@#78cbe6 3 | SS-GU-VISC@CTX@Isocortex@Isocortex@#176CDB 4 | MB@MB@MB@MB@#74e44b 5 | HY@HY@HY@HY@#d68790 6 | AUD-TEa-PERI-ECT@CTX@Isocortex@Isocortex@#176CDB 7 | MY@MY@MY@MY@#66458d 8 | RSP@CTX@Isocortex@Isocortex@#176CDB 9 | sAMY@CTXsp@AMY@AMY@#508eda 10 | AI@CTX@Isocortex@Isocortex@#176CDB 11 | ACA@CTX@Isocortex@Isocortex@#176CDB 12 | CB@CB@CB@CB@#deb34c 13 | HIP@HIP@HPF@HPF@#d62728 14 | VIS@CTX@Isocortex@Isocortex@#176CDB 15 | LSX@PAL@PAL@PAL@#a83795 16 | PL-ILA-ORB@CTX@Isocortex@Isocortex@#176CDB 17 | CTXsp@PIR@OLF@OLF@#ff7f0e 18 | OLF@OLF@OLF@OLF@#ff7f0e 19 | 
MO-FRP@CTX@Isocortex@Isocortex@#176CDB 20 | MOp@CTX@Isocortex@Isocortex@#176CDB 21 | TH@TH@TH@TH@#9f4020 22 | STRv@STR@STR@STR@#7B42CD 23 | STRd@STR@STR@STR@#7B42CD 24 | ENT@RHP@HPF@HPF@#d62728 25 | PAL@PAL@PAL@PAL@#a83795 26 | VIS-PTLp@CTX@Isocortex@Isocortex@#176CDB 27 | RHP@RHP@HPF@HPF@#d62728 28 | PAR-POST-PRE-SUB-ProS@RHP@HPF@HPF@#d62728 29 | CNU@NA@NA@CNU@#923DB1 30 | STR@STR@STR@STR@#7B42CD 31 | HY LZ@HY@HY@HY@#d68790 32 | AI-CLA@CTX@Isocortex@Isocortex@#176CDB 33 | VISp@CTX@Isocortex@Isocortex@#176CDB 34 | HB@NA@NA@HB@#6F88BA 35 | MB-PONS@NA@NA@MB-PONS@#76D899 36 | SSs-GU-VISC-AIp@CTX@Isocortex@Isocortex@#176CDB 37 | VISl@CTX@Isocortex@Isocortex@#176CDB 38 | MOs-FRP@CTX@Isocortex@Isocortex@#176CDB 39 | PTLp@CTX@Isocortex@Isocortex@#176CDB 40 | VISm@CTX@Isocortex@Isocortex@#176CDB 41 | SSp@CTX@Isocortex@Isocortex@#176CDB 42 | AUD@CTX@Isocortex@Isocortex@#176CDB 43 | VISpos@CTX@Isocortex@Isocortex@#176CDB 44 | VISa@CTX@Isocortex@Isocortex@#176CDB 45 | AId-AIv@CTX@Isocortex@Isocortex@#176CDB 46 | TEa-PERI-ECT@CTX@Isocortex@Isocortex@#176CDB 47 | AId-AIv-AIp@CTX@Isocortex@Isocortex@#176CDB 48 | 49 | -------------------------------------------------------------------------------- /meta/dissect2time.csv: -------------------------------------------------------------------------------- 1 | early,later 2 | 3C,171206,171207 3 | 4B,171213,180104 4 | 4D,171214,171219 5 | 3F,180105,180109 6 | 4E,180110,180111 7 | 1B,180119,180213 8 | 2A,180123,190207 9 | 3A,180129,180130 10 | 4A,180205,180206 11 | 1C,180208,180212 12 | 2E,180222,190207 13 | 1A,180226,180227 14 | 2B,180305,180306 15 | 3B,180308,180312 16 | 2D,180313,180319 17 | 3E,180320,180326 18 | 4F,180329,180402 19 | 2C,180409,180410 20 | 3D,180412,180416 21 | 4C,180417,180419 22 | 8B,180426,180430 23 | 5B,180514,180529 24 | 5D,180612,180618 25 | 4G,180619,180723 26 | 4H,180724,180730 27 | 5A,180731,180807 28 | 5E,180813,180820 29 | 5J,180904,180910 30 | 5C,181001,181002 31 | 5G,181008,181009 32 | 5H,181015,181016 33 | 6B,181022,181023 34 | 5F,181218,181220 35 | 6A,190108,190117 36 | 7B,190110,190115 37 | 6C,190122,190124 38 | 6D,190131,190205 39 | 9H,190212,190219 40 | 9J,190212,190219 41 | 11E,190214,190305 42 | 11F,190214,190305 43 | 11B,190314,190325 44 | 12B,190314,190325 45 | 9B,190326,190404 46 | 9D,190326,190404 47 | 10C,190411,190418 48 | 9A,190411,190418 49 | 10A,190423,190523 50 | 10G,190423,190523 51 | 11C,190530,190620 52 | 9C,190530,190620 53 | 10E,190625,190627 54 | 10F,190625,190627 55 | 7G,190702,190709 56 | 8E,190711,190716 57 | 8J,190711,190716 58 | 6E,190718,190723 59 | 8F,190718,190723 60 | 10B,190725,190730 61 | 13A,190725,190730 62 | 12A,191008,191017 63 | 7A,191008,191017 64 | 11D,191024,191031 65 | 12D,191024,191031 66 | 11A,191107,191114 67 | 8A,191107,191114 68 | 7C,191205,191212 69 | 8C,191205,191212 70 | 7J,200305,200520 71 | 9L,200305,200520 72 | 6H,200312,200319 73 | 8K,200312,200319 74 | 8D,20200707,200709 75 | 9E,20200707,200709 76 | 6G,200714,20200721 77 | 7D,200714,20200721 78 | 6F,200723,200728 79 | 7F,200723,200728 80 | 11H,200730,200813 81 | 12H,200730,200813 82 | 12E,200820,200827 83 | 7H,200820,200827 84 | 8H,200903,200910 85 | 9G,200903,200910 86 | 13D,200917,200924 87 | 14C,200917,200924 88 | 8L,201001,201008 89 | 9F,201001,201008 90 | 11K,201015,201022 91 | 12C,201015,201022 92 | 7E,201029,201105 93 | 8G,201029,201105 94 | 9Ka,201112,201119 95 | 9Kb,201112,201119 96 | 10D,201203,201210 97 | 13B,201203,201210 98 | 14B,201217,210107 99 | 18A,201217,210107 100 | 10H,210114,210121 101 | 
10J,210114,210121 102 | 11G,210128,210204 103 | 11J,210128,210204 104 | 12F,210211,210218 105 | 12G,210211,210218 106 | 12J,210225,210304 107 | 13C,210225,210304 108 | 13E,210311,210318 109 | 13F,210311,210318 110 | 14A,210325,210401 111 | 14D,210325,210401 112 | 15C,210408,210415 113 | 16C,210408,210415 114 | 17B,210422,210429 115 | 18B,210422,210429 116 | 15B,210805,210810 117 | 16A,210805,210812 118 | 17A,210810,210812 119 | -------------------------------------------------------------------------------- /meta/getGeneUp2K.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gtf="modified_gencode.vM23.primary_assembly.annotation.gtf" 4 | bedfile="mouse.modified.gencode.vM23.bed" 5 | geneUp2kfile="mouse.modified.gencode.vM23.gene.up2k.bed" 6 | 7 | awk 'BEGIN{FS=OFS="\t"}($3=="gene"){split($9,a,"\""); print $1,$4-1,$5,a[6],$6,$7}' ${gtf}\ 8 | | sort -k1,1 -k2,2n > ${bedfile} 9 | 10 | awk 'BEGIN{FS=OFS="\t"}{if($6=="+" && $2-2000>0){print $1,$2-2000,$3,$4,$5,$6}else if($6=="+" && $2-2000<0){print $1,0,$3,$4,$5,$6}else if($6=="-"){print $1,$2,$3+2000,$4,$5,$6}}' ${bedfile} > ${geneUp2kfile} 11 | -------------------------------------------------------------------------------- /meta/mm10-blacklist.v2.bed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondpie/CEMBA_wmb_snATAC/4605d86a83fe45a38e656ba87df85fe99762f358/meta/mm10-blacklist.v2.bed -------------------------------------------------------------------------------- /meta/mm10.blacklist.bed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondpie/CEMBA_wmb_snATAC/4605d86a83fe45a38e656ba87df85fe99762f358/meta/mm10.blacklist.bed -------------------------------------------------------------------------------- /meta/mm10.chrom.sizes: -------------------------------------------------------------------------------- 1 | chr1 195471971 2 | chr2 182113224 3 | chrX 171031299 4 | chr3 160039680 5 | chr4 156508116 6 | chr5 151834684 7 | chr6 149736546 8 | chr7 145441459 9 | chr10 130694993 10 | chr8 129401213 11 | chr14 124902244 12 | chr9 124595110 13 | chr11 122082543 14 | chr13 120421639 15 | chr12 120129022 16 | chr15 104043685 17 | chr16 98207768 18 | chr17 94987271 19 | chrY 91744698 20 | chr18 90702639 21 | chr19 61431566 22 | chr5_JH584299_random 953012 23 | chrX_GL456233_random 336933 24 | chrY_JH584301_random 259875 25 | chr1_GL456211_random 241735 26 | chr4_GL456350_random 227966 27 | chr4_JH584293_random 207968 28 | chr1_GL456221_random 206961 29 | chr5_JH584297_random 205776 30 | chr5_JH584296_random 199368 31 | chr5_GL456354_random 195993 32 | chr4_JH584294_random 191905 33 | chr5_JH584298_random 184189 34 | chrY_JH584300_random 182347 35 | chr7_GL456219_random 175968 36 | chr1_GL456210_random 169725 37 | chrY_JH584303_random 158099 38 | chrY_JH584302_random 155838 39 | chr1_GL456212_random 153618 40 | chrUn_JH584304 114452 41 | chrUn_GL456379 72385 42 | chr4_GL456216_random 66673 43 | chrUn_GL456393 55711 44 | chrUn_GL456366 47073 45 | chrUn_GL456367 42057 46 | chrUn_GL456239 40056 47 | chr1_GL456213_random 39340 48 | chrUn_GL456383 38659 49 | chrUn_GL456385 35240 50 | chrUn_GL456360 31704 51 | chrUn_GL456378 31602 52 | chrUn_GL456389 28772 53 | chrUn_GL456372 28664 54 | chrUn_GL456370 26764 55 | chrUn_GL456381 25871 56 | chrUn_GL456387 24685 57 | chrUn_GL456390 24668 58 | chrUn_GL456394 24323 59 | chrUn_GL456392 23629 60 | chrUn_GL456382 
23158 61 | chrUn_GL456359 22974 62 | chrUn_GL456396 21240 63 | chrUn_GL456368 20208 64 | chrM 16299 65 | chr4_JH584292_random 14945 66 | chr4_JH584295_random 1976 67 | -------------------------------------------------------------------------------- /meta/mm10.chrom.sizes.lite: -------------------------------------------------------------------------------- 1 | chr1 195471971 2 | chr2 182113224 3 | chrX 171031299 4 | chr3 160039680 5 | chr4 156508116 6 | chr5 151834684 7 | chr6 149736546 8 | chr7 145441459 9 | chr10 130694993 10 | chr8 129401213 11 | chr14 124902244 12 | chr9 124595110 13 | chr11 122082543 14 | chr13 120421639 15 | chr12 120129022 16 | chr15 104043685 17 | chr16 98207768 18 | chr17 94987271 19 | chrY 91744698 20 | chr18 90702639 21 | chr19 61431566 22 | -------------------------------------------------------------------------------- /meta/subclass_and_genemarker_CEMBAv1.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondpie/CEMBA_wmb_snATAC/4605d86a83fe45a38e656ba87df85fe99762f358/meta/subclass_and_genemarker_CEMBAv1.xlsx -------------------------------------------------------------------------------- /package/R/cicero.R: -------------------------------------------------------------------------------- 1 | #' @examples 2 | #' File format: 3 | #' pdc,cor 4 | #' 0610009B22Rik@peak217336,0.0602667683204699 5 | #' 0610009B22Rik@peak217343,-0.0929365338771179 6 | #' @export 7 | loadCor <- function(f, split = "@") { 8 | r <- data.table::fread(file = f, header = TRUE, sep = ",", 9 | data.table = FALSE) 10 | gene.peak <- vapply(r$pdc, function(i) { 11 | t <- strsplit(i, split = split, fixed = TRUE) 12 | t[[1]] 13 | }, c("a", "peak1")) 14 | r$gene <- gene.peak[1,] 15 | r$peak <- gene.peak[2,] 16 | return(r) 17 | } 18 | -------------------------------------------------------------------------------- /package/R/gglot.theme.R: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | 3 | theme.no.axis <- theme( 4 | axis.text = ggplot2::element_blank(), 5 | axis.title = ggplot2::element_blank(), 6 | axis.ticks = ggplot2::element_blank(), 7 | panel.border = ggplot2::element_blank(), 8 | panel.background = element_blank(), 9 | plot.margin = unit(c(0,0,0,0), "cm"), 10 | legend.title = element_text(size = 12), 11 | legend.text = element_text(size = 9) 12 | ) 13 | 14 | theme_my_minimal <- theme_minimal() + 15 | theme( 16 | panel.grid = element_blank(), 17 | panel.border = element_blank(), 18 | legend.position = "none", 19 | axis.title = element_blank(), 20 | axis.text = element_text(colour = "black"), 21 | ) 22 | 23 | #'@export 24 | setGlobalTheme <- function(newTheme){ 25 | oldTheme <- ggplot2::theme_get() 26 | ggplot2::theme_set(newTheme) 27 | return(oldTheme) 28 | } 29 | -------------------------------------------------------------------------------- /package/R/hdf5.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | splitByChunk <- function(nondecVec, chunkSize = 2012) { 3 | n <- ceiling(max(nondecVec) / chunkSize) 4 | r <- lapply(seq(n), function(i) { 5 | left <- chunkSize * (i-1) 6 | right <- chunkSize *i 7 | index <- (nondecVec > left) & (nondecVec <= right) 8 | if(sum(index) < 1) { 9 | return(NULL) 10 | } else { 11 | return(nondecVec[index]) 12 | } 13 | }) 14 | rr <- r[!sapply(r, is.null)] 15 | return(rr) 16 | } 17 | 18 | #' @export 19 | h5adToMat.Sparse <- function(filenm) { 20 | conn <- 
20 | conn <- hdf5r::H5File$new(filename = filenm, 21 | mode = "r") 22 | data <- conn[["X"]][["data"]][] 23 | indices <- conn[["X"]][["indices"]][] 24 | indptr <- conn[["X"]][["indptr"]][] 25 | obs <- conn[["obs"]][["_index"]][] 26 | var <- conn[["var"]][["_index"]][] 27 | mat <- Matrix::sparseMatrix(i = indices, 28 | p = indptr, x = data, index1 = FALSE, 29 | dims = c(length(obs), length(var))) 30 | rownames(mat) <- obs 31 | colnames(mat) <- var 32 | return(mat) 33 | } 34 | 35 | #' @export 36 | subset.snap.gmat.h5ad <- function(barcodes, h5adDir, ncore = 2, 37 | is.parallel = FALSE) { 38 | sampleFiles <- list.files(path = h5adDir, 39 | full.names = TRUE, no.. = TRUE) 40 | tmp <- h5adToMat.Sparse(filenm = sampleFiles[1]) 41 | genes <- colnames(tmp) 42 | rm(tmp) 43 | 44 | matList <- if (is.parallel) { 45 | parallel::mclapply( 46 | sampleFiles, function(f){ 47 | subset.snap.gmat.h5ad.single( 48 | barcodes = barcodes, h5adFile = f, colnms = genes) 49 | }, mc.cores = ncore) 50 | } else { 51 | lapply(sampleFiles, function(f) { 52 | subset.snap.gmat.h5ad.single( 53 | barcodes = barcodes, h5adFile = f, colnms = genes) 54 | }) 55 | } 56 | 57 | left <- !sapply(matList, is.null) 58 | if(sum(left) < 1) { 59 | stop("No barcodes found in any of the sample files.") 60 | } 61 | matList <- matList[left] 62 | mat <- do.call(rbind, matList) 63 | ## removed this: 64 | ## - we have colnames and rownames, which will be kept 65 | ## after rbind 66 | ## - there may be fewer barcodes 67 | ## rownames(mat) <- barcodes 68 | ## colnames(mat) <- genes 69 | if(length(barcodes) != nrow(mat)) { 70 | message(paste(nrow(mat), "of", length(barcodes), "barcodes found.")) 71 | } else { 72 | message("All barcodes were found.") 73 | } 74 | return(mat) 75 | } 76 | 77 | #' @export 78 | subset.snap.gmat.h5ad.single <- function(barcodes, h5adFile, colnms = NULL) { 79 | if (!file.exists(h5adFile)){ 80 | stop(h5adFile, " does not exist.") 81 | } 82 | message("working on: ", basename(h5adFile)) 83 | mat <- h5adToMat.Sparse(filenm = h5adFile) 84 | index <- barcodes %in% rownames(mat) 85 | if(sum(index) < 1) { 86 | message("No barcodes found.") 87 | return(NULL) 88 | } 89 | r <- mat[barcodes[index], , drop = FALSE] 90 | if (!is.null(colnms)) { 91 | colnames(r) <- colnms 92 | } 93 | return(r) 94 | } 95 | -------------------------------------------------------------------------------- /package/R/prob.R: -------------------------------------------------------------------------------- 1 | #' @export 2 | pvalOfBEZI <- function(x, mu, sigma, nu, lower = TRUE){ 3 | pval <- (1 - gamlss.dist::pBEZI( 4 | x, mu = mu, sigma = sigma, nu = nu, lower.tail = lower, log.p = FALSE)) 5 | return(pval) 6 | } 7 | -------------------------------------------------------------------------------- /package/python/bedpe2bigwig.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import os 3 | import pandas as pd 4 | from typing import Dict, List 5 | 6 | def generate_bedpe_worker(meta: str, 7 | sample: str, 8 | infnm: str, 9 | outdir:str,) -> bool: 10 | """Partition barcodes from a sample bedpe to different groups. 
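    Expected meta format (inferred from the positional indexing below,
    not stated in the original docstring): tab-separated with a header,
    whose first three columns are barcode, sample and group; the barcode
    itself sits in the 7th column of the input bedpe.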
11 |     12 | Outfile format: 13 | fnm = f"{outdir}/{sample}.{g}.bedpe.gz" 14 | """ 15 | # * read meta data 16 | if not infnm.endswith('.bedpe.gz'): 17 | raise FileNotFoundError( 18 | f"{infnm} must end with .bedpe.gz") 19 | r: pd.DataFrame = pd.read_csv(meta, sep = "\t", header = 0) 20 | barcode = pd.Series( 21 | data = r.iloc[:, 2].values, 22 | index = r.iloc[:, 0]) 23 | cell2group :Dict[str, str] = barcode.to_dict() 24 | # * set up outfiles 25 | s = r.iloc[:, [1,2]].groupby(by = r.iloc[:,1]).apply( 26 | lambda x: list(pd.unique(x.iloc[:,1]))) 27 | o2n :Dict[str, List[str]] = s.to_dict() 28 | ng2f :Dict[str, gzip.GzipFile] = dict() 29 | 30 | if not os.path.exists(outdir): 31 | print(f"{outdir} does not exist; creating it.") 32 | os.makedirs(name = outdir, exist_ok = True) 33 | for g in o2n[sample]: 34 | fnm = f"{outdir}/{sample}.{g}.bedpe.gz" 35 | if os.path.exists(fnm): 36 | print(f"{fnm} exists; removing it.") 37 | os.remove(fnm) 38 | ng2f[g] = gzip.open(f"{fnm}", mode = "wb") 39 | 40 | # * main 41 | with gzip.open(infnm, "rb") as f: 42 | for l in f: 43 | barcode: str = l.decode().split("\t")[6] 44 | if barcode in cell2group: 45 | ng:str = cell2group[barcode] 46 | ## debug 47 | # print(f"{barcode} -> {ng}") 48 | ng2f[ng].write(l) 49 | # close all the output files 50 | for g in o2n[sample]: 51 | if not ng2f[g].closed: 52 | ng2f[g].close() 53 | return True 54 | -------------------------------------------------------------------------------- /package/python/mycelloracle.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os, sys, shutil, importlib, glob 4 | from tqdm.notebook import tqdm 5 | import celloracle as co 6 | from celloracle import motif_analysis as ma 7 | from genomepy import Genome 8 | import scanpy as sc 9 | 10 | def download_refgenome(ref = "mm10") -> None: 11 | genome_installation = ma.is_genome_installed(ref_genome = ref) 12 | if not genome_installation: 13 | import genomepy 14 | ## fasta: /Users/szu/.local/share/genomes/mm10/mm10.fa 15 | genomepy.install_genome(name = ref, provider = "UCSC") 16 | else: 17 | print(f"{ref} is installed") 18 | 19 | 20 | def decompose_chrstr(peak_str): 21 | """ 22 | Args: 23 | peak_str (str): peak_str. e.g. 'chr1_3094484_3095479' 24 | 25 | Returns: 26 | tuple: chromosome name, start position, end position 27 | """ 28 | *chr_, start, end = peak_str.split("_") 29 | chr_ = "_".join(chr_) 30 | return chr_, start, end 31 | 32 | def check_peak_format(peaks_df, ref_genome): 33 | """ 34 | Check peak format. 35 | (1) Check chromosome name. 
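        (names are checked against the chromosomes of the installed ref_genome, loaded via genomepy)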
36 | (2) Check peak size (length) and remove short DNA sequences (<5bp) 37 | 38 | """ 39 | df = peaks_df.copy() 40 | n_peaks_before = df.shape[0] 41 | # Decompose peaks and make df 42 | decomposed = [decompose_chrstr(peak_str) for peak_str in df["peak_id"]] 43 | df_decomposed = pd.DataFrame(np.array(decomposed), index=peaks_df.index) 44 | df_decomposed.columns = ["chr", "start", "end"] 45 | df_decomposed["start"] = df_decomposed["start"].astype(int) 46 | df_decomposed["end"] = df_decomposed["end"].astype(int) 47 | # Load genome data 48 | genome_data = Genome(ref_genome) 49 | all_chr_list = list(genome_data.keys()) 50 | # DNA length check 51 | lengths = np.abs(df_decomposed["end"] - df_decomposed["start"]) 52 | # Filter peaks with invalid length or chromosome name 53 | n_threshold = 5 54 | df = df[(lengths >= n_threshold) & df_decomposed.chr.isin(all_chr_list)] 55 | # (the lengths computed above are reused for the counts below) 56 | 57 | 58 | # Data counting 59 | n_invalid_length = len(lengths[lengths < n_threshold]) 60 | n_peaks_invalid_chr = n_peaks_before - df_decomposed.chr.isin(all_chr_list).sum() 61 | n_peaks_after = df.shape[0] 62 | print("Peaks before filtering: ", n_peaks_before) 63 | print("Peaks with invalid chr_name: ", n_peaks_invalid_chr) 64 | print("Peaks with invalid length: ", n_invalid_length) 65 | print("Peaks after filtering: ", n_peaks_after) 66 | return df 67 | 68 | def load_tfidf(f) -> pd.DataFrame: 69 | import pyarrow.parquet as pq 70 | r = pq.read_table(f) 71 | return r.to_pandas() 72 | -------------------------------------------------------------------------------- /package/python/mylog.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import logging 3 | 4 | class StreamToLogger(object): 5 | """ 6 | Fake file-like stream object that redirects writes to a logger instance. 
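    Typical use, as a sketch (with `logger` as returned by set_file_logger below):
        sys.stdout = StreamToLogger(logger, logging.INFO)
        sys.stderr = StreamToLogger(logger, logging.ERROR)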
7 | Ref: 8 | https://stackoverflow.com/questions/19425736/how-to-redirect-stdout-and-stderr-to-logger-in-python 9 | """ 10 | def __init__(self, logger, level): 11 | self.logger = logger 12 | self.level = level 13 | self.linebuf = '' 14 | 15 | def write(self, buf): 16 | for line in buf.rstrip().splitlines(): 17 | self.logger.log(self.level, line.rstrip()) 18 | 19 | def flush(self): 20 | pass 21 | 22 | def set_file_logger(fnm:str, 23 | fmode:str = 'a', 24 | name:str = 'sa2_pp', 25 | log_level: int = logging.DEBUG) -> logging.Logger: 26 | logger = logging.getLogger(name) 27 | logger.setLevel(log_level) 28 | fh = logging.FileHandler(filename = fnm, 29 | mode = fmode) 30 | fm = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 31 | fh.setFormatter(fm) 32 | logger.addHandler(fh) 33 | return logger 34 | 35 | def handle_exception(logger, exc_type, exc_value, exc_traceback): 36 | import traceback 37 | if issubclass(exc_type, KeyboardInterrupt): 38 | sys.__excepthook__(exc_type, exc_value, exc_traceback) 39 | return 40 | logger.error(''.join(["Uncaught exception: ", 41 | *traceback.format_exception( 42 | exc_type, exc_value, exc_traceback) 43 | ])) 44 | -------------------------------------------------------------------------------- /package/python/mysnapatac2.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import snapatac2 as sa2 3 | 4 | 5 | def modify_obs_name(sds: sa2.AnnData | sa2.AnnDataSet, 6 | obs_key = "sample") -> List[str]: 7 | obs_names: List[str] = [f"{i}.{j}" 8 | for i, j in zip( 9 | sds.obs[obs_key].to_list(), sds.obs_names)] 10 | return obs_names 11 | 12 | def clean_AnnDataSet(d: str): 13 | import os 14 | os.remove(f"{d}/_dataset.h5ads") 15 | import shutil 16 | shutil.rmtree(f"{d}/anndatas", ignore_errors = True) 17 | os.removedirs(d) 18 | 19 | # deprecated 20 | # use to_adata directly for subsetting data. 21 | def get_subset_from_AnnDataSet(adata, outf, 22 | obs_index = None, var_index = None, 23 | logger = None): 24 | import os 25 | tmp_dir, _ = os.path.splitext(outf) 26 | os.makedirs(tmp_dir, exist_ok = True) 27 | adata_subset = adata.subset( 28 | obs_indices = obs_index, 29 | var_indices = var_index, 30 | out = tmp_dir 31 | ) 32 | adata_subset = adata_subset[0] 33 | if logger is not None: 34 | logger.info(f"create subset from {tmp_dir} to {outf}") 35 | r = adata_subset.to_adata( 36 | copy_x = True, 37 | file = outf 38 | ) 39 | adata_subset.close() 40 | # remove adata_subset 41 | os.remove(f"{tmp_dir}/_dataset.h5ads") 42 | import shutil 43 | shutil.rmtree(f"{tmp_dir}/anndatas", ignore_errors = True) 44 | os.removedirs(tmp_dir) 45 | if logger is not None: 46 | logger.info(f"clean tmp {tmp_dir}.") 47 | return r 48 | -------------------------------------------------------------------------------- /package/python/snap2h5ad.py: -------------------------------------------------------------------------------- 1 | """ 2 | Save mat from snap file to h5ad file. 3 | """ 4 | import anndata as ad 5 | import h5py 6 | import numpy as np 7 | from scipy import sparse 8 | 9 | def snapbmat2h5ad(snap_file, bin_size:int = 5000, 10 | prefix:str = '', 11 | outfnm:str = '') -> ad.AnnData: 12 | """Save bmat from snap file to h5ad. 
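    As implemented below: bin_size selects the /AM/{bin_size} matrix inside
    the snap file; a non-empty prefix is prepended to each barcode as
    "{prefix}.{barcode}"; a non-empty outfnm additionally writes the AnnData
    to disk with gzip compression.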
13 | """ 14 | with h5py.File(snap_file, mode = 'r') as f: 15 | barcode = [b.decode('utf-8') for b in f['/BD/name']] 16 | chrom = [i.decode('utf-8') 17 | for i in f[f'/AM/{bin_size}/binChrom']] 18 | start = f[f'/AM/{bin_size}/binStart'][:] 19 | end = start + bin_size - 1 20 | name = [f"{i}:{s}-{t}" for i, s, t in 21 | zip(chrom, start, end)] 22 | idx = np.array(f[f'/AM/{bin_size}/idx'], 23 | dtype = np.uintc) 24 | idy = np.array(f[f'/AM/{bin_size}/idy'], 25 | dtype = np.uintc) 26 | count = np.array(f[f'/AM/{bin_size}/count'], 27 | dtype = np.float32) 28 | cscmat = sparse.csc_matrix( 29 | (count, (idx - 1, idy - 1)), 30 | shape = (len(barcode), len(name)), dtype = np.float32) 31 | adata = ad.AnnData(X = cscmat) 32 | if len(prefix) > 0: 33 | fullname = [f"{prefix}.{b}" for b in barcode] 34 | else: 35 | fullname = barcode 36 | adata.obs_names = fullname 37 | adata.var_names = name 38 | if len(outfnm) > 0 : 39 | adata.write(outfnm, compression = "gzip") 40 | return adata 41 | 42 | 43 | def snapgmat2h5ad(snap_file, prefix:str = '', 44 | outfnm:str = '') -> ad.AnnData: 45 | """Save gmat from snap file to h5ad. 46 | """ 47 | with h5py.File(snap_file, mode = 'r') as f: 48 | barcode = [b.decode('utf-8') for b in f['/BD/name']] 49 | genm = [g.decode('utf-8') for g in f['/GM/name']] 50 | idx = np.array(f["/GM/idx"], dtype = np.uintc) 51 | idy = np.array(f["/GM/idy"], dtype = np.uintc) 52 | count = np.array(f["/GM/count"], dtype = np.float32) 53 | cscmat = sparse.csc_matrix( 54 | (count, (idx -1, idy-1)), 55 | shape = (len(barcode), len(genm)), 56 | dtype = np.float32 57 | ) 58 | adata = ad.AnnData(X = cscmat) 59 | if len(prefix) > 0: 60 | fullname = [f"{prefix}.{b}" for b in barcode] 61 | else: 62 | fullname = barcode 63 | adata.obs_names = fullname 64 | adata.var_names = genm 65 | if len(outfnm) > 0 : 66 | adata.write(outfnm, compression = "gzip") 67 | return adata 68 | -------------------------------------------------------------------------------- /package/tasks/getAllL3SnapMat/L2GroupAll.csv: -------------------------------------------------------------------------------- 1 | GABA_1 2 | GABA_10 3 | GABA_11 4 | GABA_12 5 | GABA_13 6 | GABA_14 7 | GABA_15 8 | GABA_16 9 | GABA_17 10 | GABA_18 11 | GABA_19 12 | GABA_2 13 | GABA_20 14 | GABA_21 15 | GABA_22 16 | GABA_23 17 | GABA_24 18 | GABA_25 19 | GABA_26 20 | GABA_27 21 | GABA_28 22 | GABA_29 23 | GABA_3 24 | GABA_30 25 | GABA_31 26 | GABA_32 27 | GABA_33 28 | GABA_4 29 | GABA_5 30 | GABA_6 31 | GABA_7 32 | GABA_8 33 | GABA_9 34 | GLUT_1 35 | GLUT_11 36 | GLUT_13 37 | GLUT_14 38 | GLUT_15 39 | GLUT_16.25 40 | GLUT_17 41 | GLUT_18 42 | GLUT_19 43 | GLUT_2.7 44 | GLUT_20 45 | GLUT_21 46 | GLUT_22.24 47 | GLUT_23 48 | GLUT_26 49 | GLUT_27 50 | GLUT_28 51 | GLUT_3 52 | GLUT_4 53 | GLUT_5 54 | GLUT_6 55 | GLUT_8.10 56 | GLUT_9.12 57 | NonN_1 58 | NonN_10 59 | NonN_11 60 | NonN_12 61 | NonN_13 62 | NonN_2 63 | NonN_3 64 | NonN_4 65 | NonN_5 66 | NonN_6 67 | NonN_7 68 | NonN_8 69 | NonN_9 70 | -------------------------------------------------------------------------------- /package/tasks/getAllL3SnapMat/Makefile: -------------------------------------------------------------------------------- 1 | mat := vM16gmat 2 | 3 | define screenSnakemake 4 | screen -dmS ${1} snakemake -c 1 -p --snakefile Snakefile --profile profile -R --rerun-incomplete 5 | endef 6 | 7 | run: Snakefile.template L2GroupAll.csv 8 | cp $< ${mat}/Snakefile 9 | cp -R profile.template/. 
${mat}/profile 10 | cp $(word 2,$^) ${mat}/group.csv 11 | cd ${mat} && $(call screenSnakemake,${mat}) 12 | -------------------------------------------------------------------------------- /package/tasks/getAllL3SnapMat/Snakefile.template: -------------------------------------------------------------------------------- 1 | import json 2 | config_file = "config.json" 3 | 4 | with open(config_file, "r") as f: 5 | config = json.load(f) 6 | matFlagdir = config["matFlagdir"] 7 | matLogdir = config["matLogdir"] 8 | conda = config["conda"] 9 | Rscript = f"{conda}/bin/Rscript" 10 | atacMatScript = config["atacMatScript"] 11 | groupFile = config["groupFile"] 12 | 13 | with open(groupFile, "r") as f: 14 | lines = f.readlines() 15 | groups = [l.strip() for l in lines if len(l.strip()) > 1] 16 | print(groups) 17 | 18 | rule all: 19 | input: 20 | expand("{d}/{g}.done", d = matFlagdir, g = groups) 21 | 22 | rule getMat: 23 | output: 24 | touch(expand("{d}/{{g}}.done", d = matFlagdir)) 25 | log: 26 | expand("{d}/{{g}}.log", d = matLogdir) 27 | shell: 28 | """ 29 | {Rscript} {atacMatScript} --config {config_file} \ 30 | --group {wildcards.g} 2> {log} 31 | """ 32 | -------------------------------------------------------------------------------- /package/tasks/getAllL3SnapMat/bmat/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "gencodevM23snapFile": "../snap/cemba.whole.gencode23.snap.rds", 3 | "gencodevM16snapFile": "../snap/snapWithL3Extra.rds", 4 | "ncores": 2, 5 | "matDir": "out", 6 | "matLogdir": "log", 7 | "matFlagdir": "flagdir", 8 | "splitGroup": 0, 9 | "conda": "/projects/ps-renlab/szu/miniconda3/envs/snATAC2", 10 | "atacMatScript": "../getSnapATACMat.R", 11 | "mat": "bmat", 12 | "groupType": "L2", 13 | "groupFile": "group.csv", 14 | "gencode": "vM16" 15 | } 16 | -------------------------------------------------------------------------------- /package/tasks/getAllL3SnapMat/profile.template/cluster.yaml: -------------------------------------------------------------------------------- 1 | __default__: 2 | jobname: "{rule}.{wildcards}" 3 | nodes: 1 4 | ppn: 1 5 | walltime: "15:00:00" 6 | account: "ren-group" 7 | queue: "hotel" 8 | email: "debug.pie@gmail.com" 9 | mailon: "ae" 10 | jobout: "oe" 11 | log: "{rule}.{wildcards}.tscc.log" 12 | logdir: "qsub_tscc_log/" 13 | mem: "50gb" 14 | 15 | getMat: 16 | mem: "100gb" 17 | ppn: 2 18 | queue: "hotel" 19 | walltime: "15:00:00" 20 | -------------------------------------------------------------------------------- /package/tasks/getAllL3SnapMat/profile.template/config.yaml: -------------------------------------------------------------------------------- 1 | cluster-config: "profile/cluster.yaml" 2 | cluster: "qsub -N {cluster.jobname} -l nodes={cluster.nodes}:ppn={cluster.ppn},mem={cluster.mem},walltime={cluster.walltime} -A {cluster.account} -q {cluster.queue} -M {cluster.email} -m {cluster.mailon} -j {cluster.jobout} -e {cluster.logdir} -V " 3 | jobs: 100 4 | verbose: true 5 | notemp: true 6 | -------------------------------------------------------------------------------- /package/tasks/getAllL3SnapMat/vM16gmat/L2GroupAll.csv: -------------------------------------------------------------------------------- 1 | GABA_1 2 | GABA_10 3 | GABA_11 4 | GABA_12 5 | GABA_13 6 | GABA_14 7 | GABA_15 8 | GABA_16 9 | GABA_17 10 | GABA_18 11 | GABA_19 12 | GABA_2 13 | GABA_20 14 | GABA_21 15 | GABA_22 16 | GABA_23 17 | GABA_24 18 | GABA_25 19 | GABA_26 20 | GABA_27 21 | GABA_28 22 | GABA_29 23 | GABA_3 24 | GABA_30 25 
| GABA_31 26 | GABA_32 27 | GABA_33 28 | GABA_4 29 | GABA_5 30 | GABA_6 31 | GABA_7 32 | GABA_8 33 | GABA_9 34 | GLUT_1 35 | GLUT_11 36 | GLUT_13 37 | GLUT_14 38 | GLUT_15 39 | GLUT_16.25 40 | GLUT_17 41 | GLUT_18 42 | GLUT_19 43 | GLUT_2.7 44 | GLUT_20 45 | GLUT_21 46 | GLUT_22.24 47 | GLUT_23 48 | GLUT_26 49 | GLUT_27 50 | GLUT_28 51 | GLUT_3 52 | GLUT_4 53 | GLUT_5 54 | GLUT_6 55 | GLUT_8.10 56 | GLUT_9.12 57 | NonN_1 58 | NonN_10 59 | NonN_11 60 | NonN_12 61 | NonN_13 62 | NonN_2 63 | NonN_3 64 | NonN_4 65 | NonN_5 66 | NonN_6 67 | NonN_7 68 | NonN_8 69 | NonN_9 70 | -------------------------------------------------------------------------------- /package/tasks/getAllL3SnapMat/vM16gmat/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "gencodevM23snapFile": "../snap/cemba.whole.gencode23.snap.rds", 3 | "gencodevM16snapFile": "../snap/snapWithL3Extra.rds", 4 | "ncores": 2, 5 | "matDir": "out", 6 | "matLogdir": "log", 7 | "matFlagdir": "flagdir", 8 | "splitGroup": 0, 9 | "conda": "/projects/ps-renlab/szu/miniconda3/envs/snATAC2", 10 | "atacMatScript": "../getSnapATACMat.R", 11 | "mat": "gmat", 12 | "groupType": "L2", 13 | "groupFile": "group.csv", 14 | "gencode": "vM16" 15 | } 16 | -------------------------------------------------------------------------------- /package/tasks/getSnapATACMatByGroup/L1Group.csv: -------------------------------------------------------------------------------- 1 | NonN 2 | GABA 3 | GLUT -------------------------------------------------------------------------------- /package/tasks/getSnapATACMatByGroup/L2GroupAll.csv: -------------------------------------------------------------------------------- 1 | GABA_1 2 | GABA_10 3 | GABA_11 4 | GABA_12 5 | GABA_13 6 | GABA_14 7 | GABA_15 8 | GABA_16 9 | GABA_17 10 | GABA_18 11 | GABA_19 12 | GABA_2 13 | GABA_20 14 | GABA_21 15 | GABA_22 16 | GABA_23 17 | GABA_24 18 | GABA_25 19 | GABA_26 20 | GABA_27 21 | GABA_28 22 | GABA_29 23 | GABA_3 24 | GABA_30 25 | GABA_31 26 | GABA_32 27 | GABA_33 28 | GABA_4 29 | GABA_5 30 | GABA_6 31 | GABA_7 32 | GABA_8 33 | GABA_9 34 | GLUT_1 35 | GLUT_11 36 | GLUT_13 37 | GLUT_14 38 | GLUT_15 39 | GLUT_16.25 40 | GLUT_17 41 | GLUT_18 42 | GLUT_19 43 | GLUT_2.7 44 | GLUT_20 45 | GLUT_21 46 | GLUT_22.24 47 | GLUT_23 48 | GLUT_26 49 | GLUT_27 50 | GLUT_28 51 | GLUT_3 52 | GLUT_4 53 | GLUT_5 54 | GLUT_6 55 | GLUT_8.10 56 | GLUT_9.12 57 | NonN_1 58 | NonN_10 59 | NonN_11 60 | NonN_12 61 | NonN_13 62 | NonN_2 63 | NonN_3 64 | NonN_4 65 | NonN_5 66 | NonN_6 67 | NonN_7 68 | NonN_8 69 | NonN_9 70 | -------------------------------------------------------------------------------- /package/tasks/getSnapATACMatByGroup/L2GroupTest.csv: -------------------------------------------------------------------------------- 1 | NonN_2 2 | NonN_3 3 | NonN_11 4 | -------------------------------------------------------------------------------- /package/tasks/getSnapATACMatByGroup/L2MultiGroup.csv: -------------------------------------------------------------------------------- 1 | NonN_2-NonN_3-NonN_11 -------------------------------------------------------------------------------- /package/tasks/getSnapATACMatByGroup/Makefile: -------------------------------------------------------------------------------- 1 | mat := L2vM16gmat 2 | 3 | define screenSnakemake 4 | screen -dmS ${1} snakemake -c 1 -p --snakefile Snakefile --profile profile -R --rerun-incomplete 5 | endef 6 | 7 | run: Snakefile.template L2GroupAll.csv config.json.template 8 | -mkdir -p ${mat} 9 | 
cp $< ${mat}/Snakefile 10 | cp -R profile.template/. ${mat}/profile 11 | cp L2GroupAll.csv ${mat}/group.csv 12 | cp config.json.template ${mat}/config.json 13 | cd ${mat} && $(call screenSnakemake,${mat}) 14 | 15 | test: Snakefile.template L2GroupTest.csv config.json.template 16 | -mkdir -p ${mat} 17 | cp $< ${mat}/Snakefile 18 | cp -R profile.template/. ${mat}/profile 19 | cp L2GroupTest.csv ${mat}/group.csv 20 | cp config.json.template ${mat}/config.json 21 | cd ${mat} && $(call screenSnakemake,${mat}) 22 | 23 | l1mat := L1vM16gmat 24 | runL1: Snakefile.template L1Group.csv configL1.json.template 25 | -mkdir -p ${l1mat} 26 | cp $< ${l1mat}/Snakefile 27 | cp -R profile.template/. ${l1mat}/profile 28 | cp $(word 2,$^) ${l1mat}/group.csv 29 | cp $(word 3,$^) ${l1mat}/config.json 30 | cd ${l1mat} && $(call screenSnakemake,${l1mat}) 31 | 32 | multiL2 := L2MultvM16gmat 33 | runMultiL2: Snakefile.template L2MultiGroup.csv configMultiGroup.json.template 34 | -mkdir -p ${multiL2} 35 | cp $< ${multiL2}/Snakefile 36 | cp -R profile.template/. ${multiL2}/profile 37 | cp $(word 2,$^) ${multiL2}/group.csv 38 | cp $(word 3,$^) ${multiL2}/config.json 39 | cd ${multiL2} && $(call screenSnakemake,${multiL2}) 40 | -------------------------------------------------------------------------------- /package/tasks/getSnapATACMatByGroup/Snakefile.template: -------------------------------------------------------------------------------- 1 | import json 2 | config_file = "config.json" 3 | 4 | with open(config_file, "r") as f: 5 | config = json.load(f) 6 | matFlagdir = config["matFlagdir"] 7 | matLogdir = config["matLogdir"] 8 | conda = config["conda"] 9 | Rscript = f"{conda}/bin/Rscript" 10 | atacMatScript = config["atacMatScript"] 11 | groupFile = config["groupFile"] 12 | 13 | with open(groupFile, "r") as f: 14 | lines = f.readlines() 15 | groups = [l.strip() for l in lines if len(l.strip()) > 1] 16 | print(groups) 17 | 18 | rule all: 19 | input: 20 | expand("{d}/{g}.done", d = matFlagdir, g = groups) 21 | 22 | rule getMat: 23 | output: 24 | touch(expand("{d}/{{g}}.done", d = matFlagdir)) 25 | log: 26 | expand("{d}/{{g}}.log", d = matLogdir) 27 | shell: 28 | """ 29 | {Rscript} {atacMatScript} --config {config_file} \ 30 | --group {wildcards.g} 2> {log} 31 | """ 32 | -------------------------------------------------------------------------------- /package/tasks/getSnapATACMatByGroup/config.json.template: -------------------------------------------------------------------------------- 1 | { 2 | "gencodevM23snapFile": "../snap/cemba.whole.gencode23.snap.rds", 3 | "gencodevM16snapFile": "../snap/snapWithL3Extra.rds", 4 | "ncores": 4, 5 | "matDir": "out", 6 | "matLogdir": "log", 7 | "matFlagdir": "flagdir", 8 | "ndpL2": 30000, 9 | "ndpL3": 10000, 10 | "conda": "/projects/ps-renlab/szu/miniconda3/envs/snATAC2", 11 | "atacMatScript": "../getSnapATACMatByGroup.R", 12 | "mat": "gmat", 13 | "groupType": "L2", 14 | "groupFile": "group.csv", 15 | "gencode": "vM16", 16 | "requireNextLevel": 0 17 | } 18 | -------------------------------------------------------------------------------- /package/tasks/getSnapATACMatByGroup/configL1.json.template: -------------------------------------------------------------------------------- 1 | { 2 | "gencodevM23snapFile": "../snap/cemba.whole.gencode23.snap.rds", 3 | "gencodevM16snapFile": "../snap/snapL3WithIds.rds", 4 | "ncores": 4, 5 | "matDir": "out", 6 | "matLogdir": "log", 7 | "matFlagdir": "flagdir", 8 | "ndp": 30000, 9 | "conda": "/projects/ps-renlab/szu/miniconda3/envs/snATAC2", 10 | 
"atacMatScript": "../getSnapATACMatByGroup.R", 11 | "mat": "gmat", 12 | "groupType": "L1", 13 | "groupFile": "group.csv", 14 | "gencode": "vM16", 15 | "requireNextLevel": 0 16 | } 17 | -------------------------------------------------------------------------------- /package/tasks/getSnapATACMatByGroup/configMultiGroup.json.template: -------------------------------------------------------------------------------- 1 | { 2 | "gencodevM23snapFile": "../snap/cemba.whole.gencode23.snap.rds", 3 | "gencodevM16snapFile": "../snap/snapWithL3Extra.rds", 4 | "ncores": 4, 5 | "matDir": "out", 6 | "matLogdir": "log", 7 | "matFlagdir": "flagdir", 8 | "splitGroup": 0, 9 | "ndpL2": 30000, 10 | "ndpL3": 10000, 11 | "conda": "/projects/ps-renlab/szu/miniconda3/envs/snATAC2", 12 | "atacMatScript": "../getSnapATACMatByGroup.R", 13 | "mat": "gmat", 14 | "groupType": "L2Extra", 15 | "groupFile": "group.csv", 16 | "gencode": "vM16", 17 | "requireNextLevel": 1 18 | } 19 | -------------------------------------------------------------------------------- /package/tasks/getSnapATACMatByGroup/profile.template/cluster.yaml: -------------------------------------------------------------------------------- 1 | __default__: 2 | jobname: "{rule}.{wildcards}" 3 | nodes: 1 4 | ppn: 1 5 | walltime: "15:00:00" 6 | account: "ren-group" 7 | queue: "hotel" 8 | email: "debug.pie@gmail.com" 9 | mailon: "ae" 10 | jobout: "oe" 11 | log: "{rule}.{wildcards}.tscc.log" 12 | logdir: "qsub_tscc_log/" 13 | mem: "50gb" 14 | 15 | getMat: 16 | mem: "50gb" 17 | ppn: 2 18 | queue: "glean" 19 | walltime: 04:00:00 20 | -------------------------------------------------------------------------------- /package/tasks/getSnapATACMatByGroup/profile.template/config.yaml: -------------------------------------------------------------------------------- 1 | cluster-config: "profile/cluster.yaml" 2 | cluster: "qsub -N {cluster.jobname} -l nodes={cluster.nodes}:ppn={cluster.ppn},mem={cluster.mem},walltime={cluster.walltime} -A {cluster.account} -q {cluster.queue} -M {cluster.email} -m {cluster.mailon} -j {cluster.jobout} -e {cluster.logdir} -V " 3 | jobs: 100 4 | verbose: true 5 | notemp: true 6 | -------------------------------------------------------------------------------- /package/tasks/nmf/01.prepare.nmf.R: -------------------------------------------------------------------------------- 1 | # Prepare data for nmf. 2 | # TODO: This script can be generalised. 3 | # Currently, I use it case by case. 4 | library(data.table) 5 | library(hdf5r) 6 | 7 | packdir <- file.path(here::here(), "package", "R") 8 | import::from(.from = "cembav2env.R", .directory = packdir, 9 | cembav2env) 10 | 11 | # * configs 12 | outdir <- "data" 13 | outh5 <- file.path(outdir, "cpm.cbyp.Intv2.h5") 14 | outPeakCoordFile <- file.path(outdir, "peaks.Intv2.txt") 15 | outClusterFile <- file.path(outdir, "clusters.Intv2.txt") 16 | 17 | # * load atac cpm. 
18 | cpm.scbyp <- readRDS(cembav2env$subclassPmatCPMIntv2File) 19 | 20 | ## # * load cCREs from positive pdc 21 | ## cCREs.ppdc <- data.table::fread( 22 | ## "../out/AllenAnnotConcat/mba.whole.AllenAnnotConcat.pearson.pos.pdc.CREs", 23 | ## header = FALSE, data.table = FALSE 24 | ## )$V1 25 | 26 | # * main 27 | ## cpm.ppdc <- cpm.pdc[cCREs.ppdc, ] 28 | ## cpm.pbysc <- cpm.ppdc 29 | 30 | ## set cap for too high values 31 | upValue <- quantile(cpm.scbyp, 0.9999) 32 | cpm.capped <- cpm.scbyp 33 | cpm.capped[cpm.scbyp > upValue] <- upValue 34 | 35 | ## change to cluster by peak mat 36 | ## for later saving to hdf5 37 | peaks <- colnames(cpm.capped) 38 | clusters <- rownames(cpm.capped) 39 | 40 | # * save mat to hdf5 format for python handling. 41 | conn <- hdf5r::H5File$new(outh5, mode = "w") 42 | data.grp <- conn$create_group("X") 43 | # NOTE: hdf5r will transpose the mat 44 | # https://github.com/hhoeflin/hdf5r/issues/81 45 | data.grp[["mat"]] <- cpm.capped 46 | # colnames corresponds to cpm.capped 47 | data.grp[["colnames"]] <- peaks 48 | # rownames corresponds to cpm.capped 49 | data.grp[["rownames"]] <- clusters 50 | conn$close_all() 51 | 52 | write.table(peaks, file = outPeakCoordFile, quote = FALSE, 53 | row.names = FALSE, col.names = FALSE) 54 | write.table(clusters, file = outClusterFile, quote = FALSE, 55 | row.names = FALSE, col.names = FALSE) 56 | -------------------------------------------------------------------------------- /package/tasks/nmf/02.nmfATAC.plotH.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # read results from sklearn 4 | 5 | suppressPackageStartupMessages(library("argparse")) 6 | 7 | # create parser object 8 | parser <- ArgumentParser() 9 | 10 | # specify our desired options 11 | # by default ArgumentParser will add a help option 12 | parser$add_argument("-i", "--input", required=TRUE, help="input matrix") 13 | parser$add_argument("-o", "--output", required=TRUE, help="output file prefix") 14 | # get command line options, if help option encountered print help and exit, 15 | # otherwise if options not found on command line then set defaults. 16 | args <- parser$parse_args() 17 | 18 | 19 | library(data.table) 20 | dataH <- fread(args$input,sep="\t") 21 | 22 | library(pheatmap) 23 | library(RColorBrewer) 24 | library(viridis) 25 | library(dendsort) 26 | 27 | # scale by column 28 | #mx <- apply(dataH,2,scale) 29 | 30 | normUnity <- function(x){ 31 | total <- sum(x) 32 | x / total 33 | } 34 | 35 | mx <- apply(dataH,2,normUnity) 36 | 
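## after this apply, every column of mx sums to one, e.g.
## apply(matrix(1:4, nrow = 2), 2, normUnity) gives the columns
## c(1/3, 2/3) and c(3/7, 4/7)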
37 | sort_hclust <- function(...) as.hclust(dendsort(as.dendrogram(...))) 38 | #mat_cluster_rows_H <- sort_hclust(hclust(dist(dataH))) 39 | mat_cluster_cols_H <- sort_hclust(hclust(dist(t(mx)))) 40 | 41 | quantile_breaks <- function(xs, n = 30) { 42 | breaks <- quantile(xs, probs = seq(0, 1, length.out = n)) 43 | breaks[!duplicated(breaks)] 44 | } 45 | 46 | # mat_breaks_H <- quantile_breaks(t(mx), n = 30) 47 | 48 | pdf(paste(args$output,".H.pdf",sep='')) 49 | pheatmap( 50 | mat = mx, 51 | scale = 'none', 52 | color = viridis(30), 53 | # color = viridis(length(mat_breaks_H) - 1), 54 | # breaks = mat_breaks_H, 55 | border_color = NA, 56 | cluster_cols = mat_cluster_cols_H, 57 | cluster_rows = F, 58 | # cluster_rows = mat_cluster_rows_H, 59 | show_colnames = TRUE, 60 | show_rownames = FALSE, 61 | drop_levels = TRUE, 62 | fontsize = 14, 63 | main = "decomp H" 64 | ) 65 | dev.off() 66 | 67 | 68 | nor01 <- function(x){ 69 | min <- min(x) 70 | max <- max(x) 71 | out <- (x - min) / (max - min) 72 | } 73 | 74 | -------------------------------------------------------------------------------- /package/tasks/nmf/02.nmfATAC.plotW.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # read results from sklearn 4 | 5 | suppressPackageStartupMessages(library("argparse")) 6 | 7 | # create parser object 8 | parser <- ArgumentParser() 9 | 10 | # specify our desired options 11 | # by default ArgumentParser will add a help option 12 | parser$add_argument("-i", "--input", required=TRUE, help="input matrix") 13 | parser$add_argument("-o", "--output", required=TRUE, help="output file prefix") 14 | # get command line options, if help option encountered print help and exit, 15 | # otherwise if options not found on command line then set defaults. 16 | args <- parser$parse_args() 17 | 18 | 19 | library(data.table) 20 | dataW <- fread(args$input,sep="\t") 21 | 22 | library(pheatmap) 23 | library(RColorBrewer) 24 | library(viridis) 25 | library(dendsort) 26 | 27 | # scale by column 28 | #tmp <- apply(dataW,2,scale) 29 | 30 | normUnity <- function(x){ 31 | total <- sum(x) 32 | x / total 33 | } 34 | 35 | tmp <- apply(dataW,1,normUnity) 36 | tmp <- t(tmp) 37 | mx <- tmp[sample(nrow(tmp), 5000), ] 38 | 
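## heatmap input: subsample 5000 rows of W so the dist/hclust calls below
## stay tractable (note sample() errors if W has fewer than 5000 rows)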
39 | sort_hclust <- function(...) as.hclust(dendsort(as.dendrogram(...))) 40 | mat_cluster_rows_W <- sort_hclust(hclust(dist(mx))) 41 | #mat_cluster_cols_W <- sort_hclust(hclust(dist(t(mx)))) 42 | 43 | quantile_breaks <- function(xs, n = 30) { 44 | breaks <- quantile(xs, probs = seq(0, 1, length.out = n)) 45 | breaks[!duplicated(breaks)] 46 | } 47 | 48 | #mat_breaks_W <- quantile_breaks(t(mx), n = 30) 49 | 50 | pdf(paste(args$output,".W.pdf",sep='')) 51 | pheatmap( 52 | mat = mx, 53 | scale = 'none', 54 | color = viridis(30), 55 | # color = viridis(length(mat_breaks_W) - 1), 56 | # breaks = mat_breaks_W, 57 | border_color = NA, 58 | # cluster_cols = mat_cluster_cols_W, 59 | cluster_cols = F, 60 | cluster_rows = mat_cluster_rows_W, 61 | show_colnames = FALSE, 62 | show_rownames = FALSE, 63 | drop_levels = TRUE, 64 | fontsize = 14, 65 | main = "decomp W" 66 | ) 67 | dev.off() 68 | 69 | 70 | norm01 <- function(x){ 71 | min <- min(x) 72 | max <- max(x) 73 | out <- (x - min) / (max - min) 74 | } 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /package/tasks/nmf/02.nmfATAC.statBox.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | # read results from sklearn 4 | 5 | suppressPackageStartupMessages(library("argparse")) 6 | 7 | # create parser object 8 | parser <- ArgumentParser() 9 | 10 | # specify our desired options 11 | # by default ArgumentParser will add a help option 12 | parser$add_argument("-i", "--input", required=TRUE, help="input statH") 13 | parser$add_argument("-o", "--output", required=TRUE, help="output file prefix") 14 | # get command line options, if help option encountered print help and exit, 15 | # otherwise if options not found on command line then set defaults. 16 | args <- parser$parse_args() 17 | 18 | data <- read.table(args$input,sep="\t",head=F) 19 | 20 | staoutmx <- data.frame(row.names=c("Min","Q1","Median","Mean","Q3","Max","TopWhisker","BottomWhisker","Box1","Box2","Box3","UpWhisker","DnWhisker")) 21 | for (i in c(5,6,7)){ 22 | x <- data[,i] 23 | boxMx <- matrix(summary(x)) 24 | rownames(boxMx) <- c("Min","Q1","Median","Mean","Q3","Max") 25 | iqr <- IQR(x) 26 | q1 <- summary(x)[2] 27 | q3 <- summary(x)[5] 28 | TopWhisker <- min(max(x), q3 + 1.5 * iqr) 29 | BottomWhisker <- max(min(x), q1 - 1.5 * iqr) 30 | Box1 <- boxMx["Q1",] 31 | Box2 <- boxMx["Median",] - boxMx["Q1",] 32 | Box3 <- boxMx["Q3",] - boxMx["Median",] 33 | UpWhisker <- TopWhisker - boxMx["Q3",] 34 | DnWhisker <- boxMx["Q1",] - BottomWhisker 35 | boxMx <- rbind(boxMx,TopWhisker,BottomWhisker,Box1,Box2,Box3,UpWhisker,DnWhisker) 36 | colnames(boxMx) <- i 37 | staoutmx <- cbind(staoutmx,boxMx) 38 | } 39 | colnames(staoutmx) <- c("contributes","sparseness","entropy") 40 | 41 | cat(median(data$V6),"\n") 42 | 43 | k <- max(data$V3) 44 | n <- nrow(data) 45 | normInfoGain <- 1 - sum(data$V7) / (n * log2(k)) 46 | cat(normInfoGain) 47 | 48 | write.table(staoutmx, file=paste(args$output,".box.sta",sep=''), sep="\t", quote=F, col.names=T, row.names=T) 49 | 50 | -------------------------------------------------------------------------------- /package/tasks/nmf/05.splitPeakByModule.R: -------------------------------------------------------------------------------- 1 | library(data.table) 2 | library(optparse) 3 | packdir <- file.path(here::here(), "package/R") 4 | import::from(.from = "utils.R", .directory = packdir, 5 | checkArgsExistOrStop, prepareOutdir, checkFileExistOrStop) 6 | import::from(.from = "peak.R", .directory = 
packdir, 7 | loadStatPeak.NMF) 8 | import::from(.from = "cembav2env.R", .directory = packdir, 9 | cembav2env) 10 | 11 | op <- list( 12 | make_option(c("--nmfDir"), type = "character", 13 | default = "nmf_ppdc/out"), 14 | make_option(c("--module"), type = "integer", 15 | default = 54), 16 | make_option(c("--tag"), type = "character", default = "ppdc") 17 | ) 18 | 19 | args <- parse_args(OptionParser(option_list = op)) 20 | checkArgsExistOrStop(args) 21 | 22 | if(!dir.exists(args$nmfDir)) { 23 | stop(args$nmfDir, " does not exist.") 24 | } 25 | 26 | mod.nmf <- args$module 27 | tag <- args$tag 28 | 29 | outDir <- file.path(args$nmfDir, 30 | paste("nmf", tag, paste0("r", mod.nmf), "motif", sep = ".")) 31 | prepareOutdir(outDir) 32 | 33 | # * functions 34 | convertPeakToBed <- function(peakBed, peaknms, outFile = NULL) { 35 | r <- peakBed[peaknms, ] 36 | if(!is.null(outFile)) { 37 | write.table(x = r, file = outFile, quote = FALSE, sep = "\t", 38 | row.names = FALSE, col.names = FALSE) 39 | } 40 | return(r) 41 | } 42 | 43 | # * load peaks 44 | peakBed <- data.table::fread(cembav2env$peakBedFile, 45 | header = FALSE, sep = "\t", data.table = FALSE) 46 | colnames(peakBed) <- c("chrom", "start", "end", "name") 47 | rownames(peakBed) <- peakBed$name 48 | 49 | # * nmf modules 50 | nmfPeakStat <- loadStatPeak.NMF( 51 | file = file.path(args$nmfDir, 52 | paste("nmfPmat", tag, 53 | paste0("r", mod.nmf), "n0", "statW", sep = "."))) 54 | 55 | modules <- unique(nmfPeakStat$class0 + 1) 56 | 57 | # * save peaks from each module to a separate bed file 58 | invisible(lapply(modules, function(i) { 59 | outFile <- file.path(outDir, 60 | paste0("r", mod.nmf, "_n", i, ".cCREs.bed")) 61 | message("Writing peak bed file to: ", outFile) 62 | peaks <- with(nmfPeakStat, peak[class0 == (i-1)]) 63 | convertPeakToBed(peakBed = peakBed, peaknms = peaks, outFile = outFile) 64 | })) 65 | 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /package/tasks/nmf/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: nmf_subclassIntv2_encoder 2 | nmf_subclassIntv2_encoder: nmf.Snakefile 3 | -mkdir -p $@ 4 | cp config.yaml $@/config.yaml 5 | cp $< $@/Snakefile 6 | cp -R profile/. $@/profile 7 | cd $@ && \ 8 | snakemake --config \ 9 | system=encoder \ 10 | tag=all.Intv2 \ 11 | module=150 \ 12 | n_rerun=2 \ 13 | out=$@ \ 14 | mat_pbyc_h5=data/cpm.cbyp.Intv2.h5 \ 15 | peak_nm_file=data/peaks.Intv2.txt \ 16 | cluster_nm_file=data/clusters.Intv2.txt \ 17 | local_dir=package/tasks/nmf \ 18 | -c 2 -p --snakefile Snakefile -R --rerun-incomplete 19 | 20 | .PHONY: nmf_subclassIntv2_novlp_encoder 21 | nmf_subclassIntv2_novlp_encoder: nmf.Snakefile 22 | -mkdir -p $@ 23 | cp config.yaml $@/config.yaml 24 | cp $< $@/Snakefile 25 | cp -R profile/. $@/profile 26 | cd $@ && \ 27 | snakemake --config \ 28 | system=encoder \ 29 | tag=all.Intv2 \ 30 | mod_from=150 \ 31 | mod_to=151 \ 32 | mod_by=1 \ 33 | n_rerun=2 \ 34 | out=$@ \ 35 | mat_pbyc_h5=data/cpm.cbyp.novlp.Intv2.h5 \ 36 | peak_nm_file=data/peaks.novlp.Intv2.txt \ 37 | cluster_nm_file=data/clusters.novlp.Intv2.txt \ 38 | local_dir=package/tasks/nmf \ 39 | -c 4 -p --snakefile Snakefile -R --rerun-incomplete 40 | 41 | 42 | .PHONY: nmf_subclassIntv2_novlp_tscc 43 | nmf_subclassIntv2_novlp_tscc: nmf.Snakefile 44 | -mkdir -p $@ 45 | cp config.yaml $@/config.yaml 46 | cp $< $@/Snakefile 47 | cp -R profile/. 
$@/profile 48 | cd $@ && \ 49 | snakemake --config \ 50 | system=tscc \ 51 | tag=novlp.Intv2 \ 52 | module=150 \ 53 | n_rerun=2 \ 54 | out=$@ \ 55 | mat_pbyc_h5=data/cpm.cbyp.novlp.Intv2.h5 \ 56 | peak_nm_file=data/peaks.novlp.Intv2.txt \ 57 | cluster_nm_file=data/clusters.novlp.Intv2.txt \ 58 | local_dir=package/tasks/nmf \ 59 | -c 1 -p --snakefile Snakefile -R --rerun-incomplete 60 | 61 | .PHONY: nmf_subclassIntv2_all_tscc 62 | nmf_subclassIntv2_all_tscc: nmf.Snakefile 63 | -mkdir -p $@ 64 | cp config.yaml $@/config.yaml 65 | cp $< $@/Snakefile 66 | cp -R profile/. $@/profile 67 | cd $@ && \ 68 | snakemake --config \ 69 | system=tscc \ 70 | tag=all.Intv2 \ 71 | module=150 \ 72 | n_rerun=2 \ 73 | out=$@ \ 74 | mat_pbyc_h5=data/cpm.cbyp.Intv2.h5 \ 75 | peak_nm_file=data/peaks.Intv2.txt \ 76 | cluster_nm_file=data/clusters.Intv2.txt \ 77 | local_dir=package/tasks/nmf \ 78 | -c 2 -p --snakefile Snakefile -R --rerun-incomplete --profile profile 79 | 80 | 81 | clean: 82 | -rm -rf nmf_test 83 | -rm -rf nmf_test_local 84 | -rm -rf nmf_test_qsub 85 | -------------------------------------------------------------------------------- /package/tasks/nmf/config.yaml: -------------------------------------------------------------------------------- 1 | system: imac 2 | python: 3 | imac: /Users/szu/mambaforge/bin/python 4 | tscc: /projects/ps-renlab/szu/miniconda3/envs/snATAC2/bin/python 5 | encoder: /projects/ps-renlab2/szu/miniconda3/envs/cicero/bin/python 6 | Rscript: 7 | imac: /usr/local/bin/Rscript 8 | tscc: /projects/ps-renlab/szu/miniconda3/envs/snATAC2/bin/Rscript 9 | encoder: /projects/ps-renlab2/szu/miniconda3/envs/cicero/bin/Rscript 10 | code_dir: 11 | imac: /Users/szu/git-recipes/mouseBrainAtlas/cembaV2 12 | tscc: /projects/ps-renlab2/szu/projects/CEMBA2 13 | encoder: /projects/ps-renlab2/szu/projects/CEMBA2 14 | work_dir: 15 | imac: /Users/szu/git-recipes/mouseBrainAtlas/cembaV2 16 | tscc: /oasis/tscc/scratch/szu/projects/CEMBA2 17 | encoder: /projects/ps-renlab2/szu/projects/CEMBA2 18 | homer: 19 | imac: /Users/szu/mambaforge/envs/bio/bin/findMotifsGenome.pl 20 | tscc: /projects/ps-renlab2/szu/miniconda3/envs/cicero/bin/findMotifsGenome.pl 21 | encoder: /projects/ps-renlab2/szu/miniconda3/envs/cicero/bin/findMotifsGenome.pl 22 | subclass_order_meta: meta/subclass.order.hc.csv 23 | local_dir: package/tasks/nmf 24 | peak_nm_file: data/peaks.txt 25 | cluster_nm_file: data/clusters.txt 26 | mat_pbyc_h5: data/cpm.cbyp.ppdc.h5 27 | tag: Intv2 28 | out: nmf_Intv2 29 | n_rerun: 2 30 | mod_from: 150 31 | mod_to: 151 32 | mod_by: 1 33 | use_detailed_mod: 1 34 | detailed_mod: 40a80 35 | mod_split: a 36 | module: 150 37 | --------------------------------------------------------------------------------
/package/tasks/nmf/profile/cluster.yaml: -------------------------------------------------------------------------------- 1 | __default__: 2 | jobname: "{rule}.{wildcards}" 3 | nodes: 1 4 | ppn: 1 5 | walltime: "04:00:00" 6 | account: "ren-group" 7 | queue: "hotel" 8 | email: "debug.pie@gmail.com" 9 | mailon: "ae" 10 | jobout: "oe" 11 | log: "{rule}.{wildcards}.tscc.log" 12 | pmem: "5gb" 13 | 14 | nmf: 15 | queue: "hotel" 16 | nodes: 1 17 | ppn: 4 18 | walltime: "24:00:00" 19 | 20 | post_nmf: 21 | queue: "hotel" 22 | nodes: 1 23 | ppn: 1 24 | walltime: "01:00:00" 25 | 26 | stat: 27 | queue: "hotel" 28 | nodes: 1 29 | ppn: 1 30 | walltime: "01:00:00" 31 | 32 | plot_nmf: 33 | queue: "hotel" 34 | nodes: 1 35 | ppn: 1 36 | walltime: "01:00:00" 37 | 38 | sumNMF: 39 | queue: "hotel" 40 | nodes: 1 41 | ppn: 1 42 | walltime: "01:00:00" 43 | -------------------------------------------------------------------------------- /package/tasks/nmf/profile/config.yaml: -------------------------------------------------------------------------------- 1 | cluster-config: "profile/cluster.yaml" 2 | cluster: "qsub -N {cluster.jobname} -l nodes={cluster.nodes}:ppn={cluster.ppn},pmem={cluster.pmem},walltime={cluster.walltime} -A {cluster.account} -q {cluster.queue} -M {cluster.email} -m {cluster.mailon} -j {cluster.jobout} -o {cluster.log} -V " 3 | jobs: 100 4 | verbose: true 5 | notemp: true 6 | -------------------------------------------------------------------------------- /package/tasks/nmf/supple.01.prepare.nmf.R: -------------------------------------------------------------------------------- 1 | # Prepare data for nmf. 2 | # TODO: This script can be generalised. 3 | # Currently, I use it case by case. 4 | library(data.table) 5 | library(hdf5r) 6 | 7 | packdir <- file.path(here::here(), "package", "R") 8 | import::from(.from = "cembav2env.R", .directory = packdir, 9 | cembav2env) 10 | 11 | # * configs 12 | outdir <- "data" 13 | outh5 <- file.path(outdir, "cpm.cbyp.novlp.Intv2.h5") 14 | outPeakCoordFile <- file.path(outdir, "peaks.novlp.Intv2.txt") 15 | outClusterFile <- file.path(outdir, "clusters.novlp.Intv2.txt") 16 | 17 | # * load atac cpm. 18 | cpm.scbyp <- readRDS(cembav2env$subclassPmatCPMIntv2File) 19 | 20 | # * load non-overlapping cCREs 21 | novlp.CREs <- data.table::fread( 22 | file = cembav2env$nonOvlpDHSPeakBedFile, 23 | sep = "\t", 24 | header = FALSE, 25 | data.table = FALSE 26 | ) 27 | colnames(novlp.CREs) <- c("chr", "start", "end", "name") 28 | novlpCoords <- with(novlp.CREs, 29 | paste(chr, paste(start, end, sep = "-"), sep = ":")) 30 | 31 | cpm.scbyp <- cpm.scbyp[ , novlpCoords] 32 | 33 | ## set cap for too high values 34 | upValue <- quantile(cpm.scbyp, 0.9999) 35 | cpm.capped <- cpm.scbyp 36 | cpm.capped[cpm.scbyp > upValue] <- upValue 37 | 38 | ## change to cluster by peak mat 39 | ## for later saving to hdf5 40 | peaks <- colnames(cpm.capped) 41 | clusters <- rownames(cpm.capped) 42 | 43 | # * save mat to hdf5 format for python handling. 
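## A sketch of reading this file back from Python (assuming h5py is
## available; the group/dataset names match what is written below, and the
## matrix arrives transposed because hdf5r stores column-major):
##   import h5py
##   with h5py.File("data/cpm.cbyp.novlp.Intv2.h5", "r") as f:
##       mat = f["X/mat"][:]  # transposed relative to the R matrix
##       peaks = [s.decode() for s in f["X/colnames"][:]]
##       clusters = [s.decode() for s in f["X/rownames"][:]]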
44 | conn <- hdf5r::H5File$new(outh5, mode = "w") 45 | data.grp <- conn$create_group("X") 46 | # NOTE: hdf5r will transpose the mat 47 | # https://github.com/hhoeflin/hdf5r/issues/81 48 | data.grp[["mat"]] <- cpm.capped 49 | # colnames corresponds to cpm.capped 50 | data.grp[["colnames"]] <- peaks 51 | # rownames corresponds to cpm.capped 52 | data.grp[["rownames"]] <- clusters 53 | conn$close_all() 54 | 55 | write.table(peaks, file = outPeakCoordFile, quote = FALSE, 56 | row.names = FALSE, col.names = FALSE) 57 | write.table(clusters, file = outClusterFile, quote = FALSE, 58 | row.names = FALSE, col.names = FALSE) 59 | -------------------------------------------------------------------------------- /repo_figures/GraphAbstract.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondpie/CEMBA_wmb_snATAC/4605d86a83fe45a38e656ba87df85fe99762f358/repo_figures/GraphAbstract.jpg -------------------------------------------------------------------------------- /repo_figures/GraphAbstract.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondpie/CEMBA_wmb_snATAC/4605d86a83fe45a38e656ba87df85fe99762f358/repo_figures/GraphAbstract.tif -------------------------------------------------------------------------------- /repo_figures/snATAC-seq_analysis_pipeline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondpie/CEMBA_wmb_snATAC/4605d86a83fe45a38e656ba87df85fe99762f358/repo_figures/snATAC-seq_analysis_pipeline.jpg -------------------------------------------------------------------------------- /repo_figures/snATAC-seq_analysis_pipeline.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondpie/CEMBA_wmb_snATAC/4605d86a83fe45a38e656ba87df85fe99762f358/repo_figures/snATAC-seq_analysis_pipeline.pdf -------------------------------------------------------------------------------- /snakemake.template/Makefile: -------------------------------------------------------------------------------- 1 | test_dir: 2 | -mkdir -p $@ 3 | 4 | .PHONY: test_snakemake 5 | test_snakemake : test_dir Snakefile 6 | cp config.yaml $ str: 53 | t = apL4.replace("p", "") 54 | tt: List[str] = t.split(sep="_") 55 | if len(tt) < 2: 56 | return L4meta.loc[t]["Subclass label"] 57 | l3: str = re.sub(r"-\d+$", "", tt[0]) 58 | l4s: List[str] = [f"{l3}-{i}" for i in tt[1:]] 59 | l4s.append(tt[0]) 60 | scs: List[str] = L4meta.loc[l4s]["Subclass label"].unique().tolist() 61 | if len(scs) > 1: 62 | print(f"{apL4} has multiple subclasses.") 63 | return scs[0] 64 | 65 | 66 | scs: List[str] = [map_pL4_to_sc(k) for k in pL4] 67 | 68 | # subclass to CREs 69 | L4toCRE.insert(0, column="sc", value=scs) 70 | sctoCRE = L4toCRE.groupby(by="sc").apply( 71 | lambda x: x[CREs].sum(axis = 0)) 72 | 73 | # output each subclass's CREs 74 | outdir = os.path.join("/Users/szu/git-recipes", 75 | "mouseBrainAtlas/cCRE_heatmap/data", 76 | "subclass2CRE") 77 | def outscCRE(sc: str) -> None: 78 | scnm = sc.replace('/', '-').replace(' ', '_') 79 | outfnm = os.path.join(outdir, f"{scnm}.cCREs.txt") 80 | pd.Series(sctoCRE.columns[sctoCRE.loc[sc] > 0]).to_csv( 81 | outfnm, sep = "\t", header = False, index = False 82 | ) 83 | 84 | for sc in sctoCRE.index.tolist(): 85 | print(sc) 86 | outscCRE(sc) 87 | --------------------------------------------------------------------------------