├── .gitignore ├── LICENSE ├── README.md ├── config ├── config.yaml ├── prior_cats.json └── slurm │ ├── CookieCutter.py │ ├── config.yaml │ ├── settings.json │ ├── slurm-jobscript.sh │ ├── slurm-sidecar.py │ ├── slurm-status.py │ ├── slurm-submit.py │ └── slurm_utils.py ├── docs ├── dbs.md ├── dts.md └── mth.md └── workflow ├── Snakefile ├── envs ├── celloracle.def ├── dictys.def ├── dictys.yaml ├── figr.def ├── granie.def ├── gretabench.def ├── pando.def └── scenicplus.def ├── rules ├── anl │ ├── dbs.smk │ ├── dts.smk │ ├── metrics │ │ ├── mech.smk │ │ ├── pred.smk │ │ ├── prior.smk │ │ └── utils.smk │ ├── pair.smk │ ├── stab.smk │ ├── topo.smk │ └── tss.smk ├── dbs │ ├── c2g.smk │ ├── cre.smk │ ├── gen.smk │ ├── gst.smk │ ├── ont.smk │ ├── prt.smk │ ├── tfb.smk │ ├── tfm.smk │ ├── tfp.smk │ └── tss.smk ├── dts │ ├── brain.smk │ ├── fakepair.smk │ ├── general.smk │ ├── heartatlas.smk │ ├── pbmc10k.smk │ ├── pitunpair.smk │ ├── pitupair.smk │ └── reprofibro.smk ├── img │ └── img.smk ├── mth │ ├── celloracle.smk │ ├── dictys.smk │ ├── figr.smk │ ├── granie.smk │ ├── grn.smk │ ├── pando.smk │ ├── random.smk │ ├── scenic.smk │ └── scenicplus.smk └── plt │ ├── comb.smk │ ├── dbs.smk │ ├── eval.smk │ ├── figs.smk │ ├── pair.smk │ └── stab.smk └── scripts ├── anl ├── dbs │ ├── ocoef.py │ ├── stats.py │ └── terms.py ├── dts │ └── qcstats.py ├── metrics │ ├── aggregate.py │ ├── mech │ │ ├── prt.py │ │ ├── sim.py │ │ ├── tfa.py │ │ └── tfm.py │ ├── pred │ │ ├── gsets.py │ │ └── omics.py │ ├── prior │ │ ├── gnm.py │ │ ├── tfm.py │ │ └── tfp.py │ ├── test.py │ └── utils.py ├── pair │ ├── fake_stats.py │ ├── pairsim.py │ ├── real_cors.py │ └── realqc.py ├── stab │ ├── ovsd.py │ ├── run_stab.py │ └── seeds.py ├── topo │ ├── fvsd.py │ ├── inter.py │ └── run_pair_sim.py ├── tss │ ├── dist.py │ └── gocoef.py └── utils.py ├── dbs ├── c2g │ ├── eqtlcat_gene.py │ └── eqtlcat_smpl.py ├── cre │ ├── gwascatalogue.py │ └── promoters.R ├── gen │ ├── genome │ │ └── celloracle.py │ ├── gid │ │ └── ensmbl.R │ ├── pid │ │ └── uniprot.R │ └── tss │ │ ├── celloracle.py │ │ ├── dictys.py │ │ ├── figr.R │ │ ├── granie.R │ │ ├── hummus.R │ │ ├── pando.R │ │ └── scenicplus.py ├── gst │ └── pways.py ├── ont │ └── bto.py ├── tfb │ ├── aggregate.py │ ├── chipatlas_meta.py │ ├── chipatlas_tf.py │ ├── remap2022_meta.py │ ├── remap2022_raw.py │ └── unibind_raw.py ├── tfm │ └── hpa.py └── tfp │ ├── europmc.py │ ├── europmc_raw.py │ └── intact.py ├── dts ├── brain │ ├── brain.py │ └── prc_annot.py ├── callpeaks.py ├── extract_case.py ├── fakepair │ ├── coembedd.R │ ├── fakepair.py │ └── paircells.R ├── format_frags.sh ├── heartatlas │ ├── heart_annot.py │ └── heartatlas.py ├── pbmc10k │ ├── pbmc10k.py │ └── prc_annot.py ├── pitunpair │ ├── coembedd.R │ ├── paircells.R │ └── pitunpair.py ├── pitupair │ └── pitupair.py └── reprofibro │ ├── prc_annot.py │ └── reprofibro.py ├── mth ├── celloracle │ ├── mdl.py │ ├── p2g.R │ ├── p2g.py │ ├── pre.py │ ├── src.R │ ├── src.py │ └── tfb.py ├── dictys │ ├── before_mdl.py │ ├── extract_data.py │ ├── frag_to_bam.py │ ├── mdl.sh │ ├── p2g.py │ ├── pre.py │ └── tfb.sh ├── figr │ ├── mdl.R │ ├── p2g.R │ ├── pre.R │ ├── src.R │ └── tfb.R ├── granie │ ├── mdl.R │ ├── p2g.R │ ├── pre.R │ ├── pre.py │ ├── pre_post.py │ ├── src.R │ └── tfb.R ├── grn.py ├── pando │ ├── get_granges.R │ ├── mdl.R │ ├── p2g.R │ ├── pre.R │ ├── pre.py │ ├── src.R │ └── tfb.R ├── prc_prior_grn.py ├── random │ └── grn.py ├── scenic │ ├── loom.py │ └── process_grn.py └── scenicplus │ ├── egrn.py │ ├── mdata.py │ ├── mdl.sh │ 
├── motifs.py │ ├── o_mdl.sh │ ├── p2g.sh │ ├── pre.py │ ├── tfb.py │ └── topics.py └── plt ├── comb └── sims.py ├── dbs └── stats.py ├── eval └── eval.py ├── pair ├── fake.py └── pair.py ├── stab ├── cors.py ├── links.py ├── sims.py └── stab.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # singularity images 132 | *.sif 133 | 134 | # vs code 135 | .vscode/ 136 | 137 | # DS_Store 138 | .DS_Store 139 | workflow/.DS_Store 140 | workflow/scripts/.DS_Store 141 | workflow/scripts/methods/.DS_Store 142 | 143 | 144 | .snakemake/ 145 | benchmarks/ 146 | /datasets/ 147 | gdata/ 148 | workflow/scripts/methods/scenic+/s1.py 149 | logs/ 150 | *.ipynb 151 | -------------------------------------------------------------------------------- /config/slurm/CookieCutter.py: -------------------------------------------------------------------------------- 1 | # 2 | # Based on lsf CookieCutter.py 3 | # 4 | import os 5 | import json 6 | 7 | d = os.path.dirname(__file__) 8 | with open(os.path.join(d, "settings.json")) as fh: 9 | settings = json.load(fh) 10 | 11 | 12 | def from_entry_or_env(values, key): 13 | """Return value from ``values`` and override with environment variables.""" 14 | if key in os.environ: 15 | return os.environ[key] 16 | else: 17 | return values[key] 18 | 19 | 20 | class CookieCutter: 21 | 22 | SBATCH_DEFAULTS = from_entry_or_env(settings, "SBATCH_DEFAULTS") 23 | CLUSTER_NAME = from_entry_or_env(settings, "CLUSTER_NAME") 24 | CLUSTER_CONFIG = from_entry_or_env(settings, "CLUSTER_CONFIG") 25 | 26 | @staticmethod 27 | def get_cluster_option() -> str: 28 | cluster = CookieCutter.CLUSTER_NAME 29 | if cluster != "": 30 | return f"--cluster={cluster}" 31 | return "" 32 | 33 | @staticmethod 34 | def get_cluster_logpath() -> str: 35 | return "logs/%r/%j" 36 | 37 | @staticmethod 38 | def get_cluster_jobname() -> str: 39 | return "%r_%w" 40 | -------------------------------------------------------------------------------- /config/slurm/config.yaml: -------------------------------------------------------------------------------- 1 | cluster-sidecar: "slurm-sidecar.py" 2 | cluster-cancel: "scancel" 3 | jobscript: "slurm-jobscript.sh" 4 | cluster: "slurm-submit.py" 5 | cluster-status: "slurm-status.py" 6 | restart-times: 5 7 | max-jobs-per-second: 5 8 | max-status-checks-per-second: 5 9 | local-cores: 1 10 | latency-wait: 15 11 | use-conda: True 12 | use-singularity: True 13 | jobs: 64 14 | printshellcmds: True 15 | keep-incomplete: True 16 | notemp: True 17 | rerun-incomplete: False 18 | default-resources: 19 | - runtime=720 20 | - mem_mb=64000 21 | - partition=cpu-single 22 | - threads=1 23 | -------------------------------------------------------------------------------- /config/slurm/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "SBATCH_DEFAULTS": "", 3 | "CLUSTER_NAME": "", 4 | "CLUSTER_CONFIG": "" 5 | } 6 | -------------------------------------------------------------------------------- /config/slurm/slurm-jobscript.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # properties = {properties} 3 | {exec_job} 4 | 
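The SLURM profile above (`CookieCutter.py`, `config.yaml`, `slurm-submit.py` and the companion scripts) is what Snakemake consumes through its `--profile` option. As an illustrative sketch only (these commands are not shipped with the repository; they assume Snakemake with profile support is installed, the working directory is the repository root, and that the final target `plt/figs.txt` produced by rule `plt_figs` further down is the desired endpoint):
```
# Hypothetical usage sketch, not a command from the repo.
# Dry-run first to see which jobs would be submitted to SLURM:
snakemake --profile config/slurm -n
# Then build an example final target (rule plt_figs touches plt/figs.txt):
snakemake --profile config/slurm plt/figs.txt
```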
-------------------------------------------------------------------------------- /docs/dbs.md: -------------------------------------------------------------------------------- 1 | ## Adding databases 2 | 3 | Databases are sorted into different categories: 4 | - c2g: CRE to gene databases (e.g. eQTL studies) 5 | - cre: CRE annotation databases (e.g. ENCODE) 6 | - gen: general genome annotations (e.g. Lambert, ENSEMBL) 7 | - gst: Gene set databases (e.g. REACTOME) 8 | - ont: Ontology databases 9 | - prt: TF perturbation databases (e.g. KnockTF) 10 | - tfb: TF binding databases (e.g. ChIP-Atlas) 11 | - tfm: TF marker databases (e.g. TF-Marker) 12 | - tfp: TF-TF interaction databases (e.g. IntAct) 13 | - tss: TSS databases (e.g. ENSEMBL) 14 | 15 | A URL from which to download the database should be provided in the `config/config.yaml` file. 16 | ``` 17 | # Databases 18 | dbs: 19 | hg38: 20 | gen: 21 | ... 22 | prt: 23 | ... 24 | gst: 25 | ... 26 | newdatabase: 'https:// ...' 27 | mm10: 28 | ... 29 | ``` 30 | Note that databases are divided by organism. 31 | 32 | Rules for each of these categories can be found in `workflow/rules/dbs/`. New databases should be added to their corresponding rule file. 33 | If a database does not fit any of these categories, a new rule file can be created. 34 | 35 | Here is an example of a rule for a CRE database: 36 | ``` 37 | rule cre_encode: 38 | threads: 1 39 | output: 'dbs/hg38/cre/encode/encode.bed' 40 | params: 41 | url=config['dbs']['hg38']['cre']['encode'] 42 | shell: 43 | """ 44 | ... 45 | """ 46 | ``` 47 | 48 | Rules should follow this naming convention: `{dbtype}_{dbname}`, in this case `cre_encode`. 49 | The output should be stored using this path format: `dbs/{organism}/{dbtype}/{dbname}/{dbname}.bed`. 50 | When possible, use the `.bed` format; otherwise use `.csv`. 51 | -------------------------------------------------------------------------------- /workflow/envs/celloracle.def: -------------------------------------------------------------------------------- 1 | Bootstrap: docker 2 | From: ubuntu:20.04 3 | 4 | 5 | %environment 6 | export PATH=/opt/:$PATH 7 | . "/opt/conda/etc/profile.d/conda.sh" 8 | . "/opt/conda/etc/profile.d/mamba.sh" 9 | conda activate env 10 | 11 | %post 12 | 13 | # update apt 14 | apt update -y 15 | 16 | # basic packages (~2 min) 17 | DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata 18 | apt install -y build-essential \ 19 | gcc \ 20 | libstdc++6 \ 21 | cmake \ 22 | wget \ 23 | curl \ 24 | libcurl4-openssl-dev \ 25 | libssl-dev \ 26 | libxml2-dev \ 27 | libcairo2-dev \ 28 | libxt-dev \ 29 | libopenblas-dev \ 30 | bedtools 31 | 32 | # conda 33 | wget -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" 34 | bash Miniforge3.sh -b -p "/opt/conda" 35 | . "/opt/conda/etc/profile.d/conda.sh" 36 | . 
"/opt/conda/etc/profile.d/mamba.sh" 37 | conda activate 38 | 39 | # Create env 40 | mamba create -y -n=env -c conda-forge -c bioconda \ 41 | python=3.10 \ 42 | r-base==4.2 \ 43 | r-monocle3 \ 44 | r-vgam \ 45 | r-glasso \ 46 | bioconductor-gviz \ 47 | bioconductor-genomicranges \ 48 | bioconductor-rtracklayer \ 49 | bioconductor-rhdf5 \ 50 | r-devtools \ 51 | pip \ 52 | cython \ 53 | pybedtools \ 54 | muon 55 | 56 | # Install cicero 57 | conda activate env 58 | Rscript -e "devtools::install_github('cole-trapnell-lab/cicero-release', ref = 'monocle3', upgrade = 'never')" 59 | Rscript -e "remove.packages('irlba'); install.packages('irlba', repos = 'https://cloud.r-project.org')" 60 | 61 | # Install CellOracle 62 | pip install celloracle==0.16.0 pybedtools==0.9.0 scikit-learn==1.1.3 63 | 64 | # Remove cache for lighter containers 65 | pip cache purge 66 | conda clean -a -y 67 | -------------------------------------------------------------------------------- /workflow/envs/dictys.def: -------------------------------------------------------------------------------- 1 | Bootstrap: docker 2 | From: ubuntu:20.04 3 | 4 | 5 | %environment 6 | export PATH=/opt/:$PATH 7 | . "/opt/conda/etc/profile.d/conda.sh" 8 | . "/opt/conda/etc/profile.d/mamba.sh" 9 | conda activate env 10 | 11 | %post 12 | 13 | # update apt 14 | apt update -y 15 | 16 | # basic packages (~2 min) 17 | DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata 18 | apt install -y build-essential \ 19 | gcc \ 20 | libstdc++6 \ 21 | cmake \ 22 | wget \ 23 | curl \ 24 | libcurl4-openssl-dev \ 25 | libssl-dev \ 26 | libxml2-dev \ 27 | libcairo2-dev \ 28 | libxt-dev \ 29 | libopenblas-dev \ 30 | bedtools 31 | 32 | # conda 33 | wget -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" 34 | bash Miniforge3.sh -b -p "/opt/conda" 35 | . "/opt/conda/etc/profile.d/conda.sh" 36 | . "/opt/conda/etc/profile.d/mamba.sh" 37 | conda activate 38 | 39 | # Create env 40 | mamba create -y -n=env -c lingfeiwang -c conda-forge -c bioconda -c pytorch \ 41 | python=3.10 \ 42 | pip \ 43 | dictys \ 44 | pytorch \ 45 | torchvision \ 46 | torchaudio \ 47 | cpuonly \ 48 | jupyterlab \ 49 | mudata 50 | 51 | # Remove cache for lighter containers 52 | pip cache purge 53 | conda clean -a -y 54 | -------------------------------------------------------------------------------- /workflow/envs/dictys.yaml: -------------------------------------------------------------------------------- 1 | name: dictys 2 | channels: 3 | - lingfeiwang 4 | - bioconda 5 | - conda-forge 6 | - pytorch 7 | - nvidia 8 | dependencies: 9 | - python=3.10 10 | - dictys 11 | - pytorch 12 | - torchvision 13 | - torchaudio 14 | - pytorch-cuda=11.7 15 | - mudata 16 | - pip 17 | - pip: 18 | - torch --index-url https://download.pytorch.org/whl/cu118 19 | - torchvision --index-url https://download.pytorch.org/whl/cu118 20 | - torchaudio --index-url https://download.pytorch.org/whl/cu118 21 | -------------------------------------------------------------------------------- /workflow/envs/granie.def: -------------------------------------------------------------------------------- 1 | Bootstrap: docker 2 | From: ubuntu:20.04 3 | 4 | 5 | %environment 6 | export PATH=/opt/:$PATH 7 | . "/opt/conda/etc/profile.d/conda.sh" 8 | . 
"/opt/conda/etc/profile.d/mamba.sh" 9 | conda activate env 10 | 11 | %post 12 | 13 | # update apt 14 | apt update -y 15 | 16 | # basic packages (~2 min) 17 | DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata 18 | apt install -y build-essential \ 19 | gcc \ 20 | cmake \ 21 | wget \ 22 | curl \ 23 | libcurl4-openssl-dev \ 24 | libssl-dev \ 25 | libxml2-dev \ 26 | libcairo2-dev \ 27 | libxt-dev \ 28 | libopenblas-dev \ 29 | bedtools 30 | 31 | # conda 32 | wget -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" 33 | bash Miniforge3.sh -b -p "/opt/conda" 34 | . "/opt/conda/etc/profile.d/conda.sh" 35 | . "/opt/conda/etc/profile.d/mamba.sh" 36 | conda activate 37 | 38 | # Create env 39 | mamba create -y -n=env -c conda-forge -c bioconda \ 40 | python=3.10 \ 41 | r-base==4.3 \ 42 | r-futile.logger \ 43 | r-checkmate \ 44 | r-patchwork \ 45 | r-reshape2 \ 46 | r-data.table \ 47 | r-matrixstats \ 48 | r-matrix \ 49 | bioconductor-genomicranges \ 50 | r-rcolorbrewer \ 51 | bioconductor-complexheatmap \ 52 | bioconductor-deseq2 \ 53 | r-circlize \ 54 | r-progress \ 55 | r-stringr \ 56 | r-scales \ 57 | r-igraph \ 58 | bioconductor-s4vectors \ 59 | r-ggplot2 \ 60 | r-rlang \ 61 | bioconductor-biostrings \ 62 | bioconductor-genomeinfodb \ 63 | bioconductor-summarizedexperiment \ 64 | r-forcats \ 65 | r-gridextra \ 66 | bioconductor-limma \ 67 | r-tidyselect \ 68 | r-readr \ 69 | r-tidyr \ 70 | r-dplyr \ 71 | r-magrittr \ 72 | r-tibble \ 73 | r-viridis \ 74 | r-colorspace \ 75 | bioconductor-biomart \ 76 | bioconductor-topgo \ 77 | bioconductor-annotationhub \ 78 | bioconductor-ensembldb \ 79 | r-devtools \ 80 | bioconductor-rhdf5 \ 81 | r-irkernel \ 82 | mudata \ 83 | decoupler-py==1.8.0 \ 84 | jupyterlab \ 85 | r-tidyverse \ 86 | bioconductor-org.hs.eg.db \ 87 | bioconductor-txdb.hsapiens.ucsc.hg38.knowngene \ 88 | bioconductor-bsgenome.hsapiens.ucsc.hg38 \ 89 | r-batchtools 90 | 91 | # Install granie 92 | conda activate env 93 | Rscript -e "devtools::install_gitlab('grp-zaugg/GRaNIE@6f1f4ddd96f2932e15ca60fb8554e74de842f7e4', host = 'git.embl.de', subdir = 'src/GRaNIE', upgrade = 'never')" 94 | 95 | # Remove cache for lighter containers 96 | pip cache purge 97 | conda clean -a -y 98 | -------------------------------------------------------------------------------- /workflow/envs/gretabench.def: -------------------------------------------------------------------------------- 1 | Bootstrap: docker 2 | From: ubuntu:20.04 3 | 4 | 5 | %environment 6 | export PATH=/opt/:$PATH 7 | . "/opt/conda/etc/profile.d/conda.sh" 8 | . "/opt/conda/etc/profile.d/mamba.sh" 9 | conda activate env 10 | 11 | %post 12 | 13 | # update apt 14 | apt update -y 15 | 16 | # basic packages (~2 min) 17 | DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata 18 | apt install -y build-essential \ 19 | gcc \ 20 | libstdc++6 \ 21 | cmake \ 22 | wget \ 23 | curl \ 24 | git \ 25 | libcurl4-openssl-dev \ 26 | libssl-dev \ 27 | libxml2-dev \ 28 | libcairo2-dev \ 29 | libxt-dev \ 30 | libopenblas-dev \ 31 | bedtools \ 32 | tabix 33 | 34 | # conda 35 | wget -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" 36 | bash Miniforge3.sh -b -p "/opt/conda" 37 | . "/opt/conda/etc/profile.d/conda.sh" 38 | . 
"/opt/conda/etc/profile.d/mamba.sh" 39 | conda activate 40 | 41 | # Create env 42 | mamba create -y -n=env -c conda-forge -c bioconda -c colomoto \ 43 | python=3.10 \ 44 | pip \ 45 | muon==0.1.5 \ 46 | scanpy==1.9.8 \ 47 | leidenalg \ 48 | harmonypy \ 49 | jupyterlab \ 50 | r-base==4.3 \ 51 | bioconductor-biomart \ 52 | cython \ 53 | polars \ 54 | hmmlearn \ 55 | plotly \ 56 | pooch \ 57 | python-kaleido \ 58 | multiprocess \ 59 | pyarrow \ 60 | rustworkx \ 61 | dill \ 62 | macs3 \ 63 | scrublet \ 64 | decoupler-py==1.7.0 \ 65 | py-xgboost \ 66 | pyranges \ 67 | statannotations \ 68 | numba==0.59.1 \ 69 | pyboolnet 70 | 71 | conda activate env 72 | pip install mofapy2 marsilea==0.3.2 snapatac2==2.6.0 celloracle==0.18.0 scipy==1.12.0 ipykernel 73 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib 74 | 75 | # Remove cache for lighter containers 76 | pip cache purge 77 | conda clean -a -y 78 | -------------------------------------------------------------------------------- /workflow/rules/anl/dbs.smk: -------------------------------------------------------------------------------- 1 | rule dbs_stats: 2 | threads: 1 3 | input: 4 | paths_prt=expand('dbs/hg38/prt/{prt}/meta.csv', prt=config['dbs']['hg38']['prt'].keys()), 5 | paths_gst=expand('dbs/hg38/gst/{gst}.csv', gst=config['dbs']['hg38']['gst'].keys()), 6 | paths_tfm=expand('dbs/hg38/tfm/{tfm}/{tfm}.tsv', tfm=config['dbs']['hg38']['tfm'].keys()), 7 | paths_tfp=expand('dbs/hg38/tfp/{tfp}/{tfp}.tsv', tfp=config['dbs']['hg38']['tfp'].keys()), 8 | paths_tfb=expand('dbs/hg38/tfb/{tfb}/{tfb}.bed', tfb=config['dbs']['hg38']['tfb'].keys()), 9 | paths_cre=expand('dbs/hg38/cre/{cre}/{cre}.bed', cre=config['dbs']['hg38']['cre'].keys()), 10 | paths_c2g=expand('dbs/hg38/c2g/{c2g}/{c2g}.bed', c2g=config['dbs']['hg38']['c2g'].keys()), 11 | output: 'anl/dbs/stats.csv' 12 | resources: 13 | mem_mb=32000 14 | shell: 15 | """ 16 | python workflow/scripts/anl/dbs/stats.py \ 17 | -p {input.paths_prt} \ 18 | -g {input.paths_gst} \ 19 | -m {input.paths_tfm} \ 20 | -t {input.paths_tfp} \ 21 | -b {input.paths_tfb} \ 22 | -c {input.paths_cre} \ 23 | -e {input.paths_c2g} \ 24 | -o {output} 25 | """ 26 | 27 | 28 | rule dbs_terms: 29 | threads: 1 30 | singularity: 'workflow/envs/gretabench.sif' 31 | input: 32 | paths_prt=expand('dbs/hg38/prt/{prt}/meta.csv', prt=config['dbs']['hg38']['prt'].keys()), 33 | paths_tfm=expand('dbs/hg38/tfm/{tfm}/{tfm}.tsv', tfm=config['dbs']['hg38']['tfm'].keys()), 34 | paths_tfb=expand('dbs/hg38/tfb/{tfb}/{tfb}.bed', tfb=config['dbs']['hg38']['tfb'].keys()), 35 | paths_cre=expand('dbs/hg38/cre/{cre}/{cre}.bed', cre=config['dbs']['hg38']['cre'].keys()), 36 | paths_c2g=expand('dbs/hg38/c2g/{c2g}/{c2g}.bed', c2g=config['dbs']['hg38']['c2g'].keys()), 37 | output: 'anl/dbs/terms.csv' 38 | resources: 39 | mem_mb=64000 40 | shell: 41 | """ 42 | python workflow/scripts/anl/dbs/terms.py -i {input} -o {output} 43 | """ 44 | 45 | 46 | rule dbs_ocoef: 47 | threads: 1 48 | singularity: 'workflow/envs/gretabench.sif' 49 | input: 'anl/dbs/stats.csv', 50 | output: 'anl/dbs/ocoef.csv', 51 | shell: 52 | """ 53 | python workflow/scripts/anl/dbs/ocoef.py {output} 54 | """ 55 | -------------------------------------------------------------------------------- /workflow/rules/anl/dts.smk: -------------------------------------------------------------------------------- 1 | localrules: dts_qcstats 2 | 3 | 4 | rule dts_qcstats: 5 | threads: 1 6 | singularity: 'workflow/envs/gretabench.sif' 7 | input: rules.extract_case.output.mdata 8 | output: 9 | 
qc='anl/dts/{dat}.{case}.qc.csv', 10 | nc='anl/dts/{dat}.{case}.nc.csv', 11 | shell: 12 | """ 13 | python workflow/scripts/anl/dts/qcstats.py \ 14 | {input} {output.qc} {output.nc} 15 | """ 16 | -------------------------------------------------------------------------------- /workflow/rules/anl/metrics/mech.smk: -------------------------------------------------------------------------------- 1 | localrules: mech_tfa 2 | rule mech_tfa: 3 | threads: 1 4 | singularity: 'workflow/envs/gretabench.sif' 5 | input: 6 | grn=lambda wildcards: rules.grn_run.output.out.format(**wildcards), 7 | rsc=rules.prt_knocktf.output.dir, 8 | output: 9 | out='anl/metrics/mech/tfa/{db}/{dat}.{case}/{pre}.{p2g}.{tfb}.{mdl}.scores.csv' 10 | shell: 11 | """ 12 | python workflow/scripts/anl/metrics/mech/tfa.py \ 13 | -i {input.grn} \ 14 | -b {input.rsc} \ 15 | -o {output.out} 16 | """ 17 | 18 | 19 | rule mech_prt: 20 | threads: 16 21 | singularity: 'workflow/envs/gretabench.sif' 22 | input: 23 | grn=lambda wildcards: rules.grn_run.output.out.format(**wildcards), 24 | rsc=rules.prt_knocktf.output.dir, 25 | output: 26 | out='anl/metrics/mech/prt/{db}/{dat}.{case}/{pre}.{p2g}.{tfb}.{mdl}.scores.csv' 27 | resources: 28 | mem_mb=restart_mem, 29 | runtime=config['max_mins_per_step'] * 2, 30 | shell: 31 | """ 32 | set +e 33 | timeout $(({resources.runtime}-20))m \ 34 | python workflow/scripts/anl/metrics/mech/prt.py \ 35 | -i {input.grn} \ 36 | -b {input.rsc} \ 37 | -o {output.out} 38 | if [ $? -eq 124 ]; then 39 | awk 'BEGIN {{ print "name,prc,rcl,f01" }}' > {output.out} 40 | fi 41 | """ 42 | 43 | 44 | rule extract_mech_tfm: 45 | threads: 1 46 | singularity: 'workflow/envs/gretabench.sif' 47 | input: 48 | mdata=rules.extract_case.output.mdata, 49 | tf=rules.gen_tfs_lambert.output, 50 | output: 'anl/metrics/mech/sss/sss/{dat}.{case}/tfm.csv' 51 | shell: 52 | """ 53 | python workflow/scripts/anl/metrics/mech/tfm.py {input.mdata} {input.tf} {output} 54 | """ 55 | 56 | 57 | rule mech_sss: 58 | threads: 1 59 | singularity: 'workflow/envs/gretabench.sif' 60 | input: 61 | grn=lambda wildcards: rules.grn_run.output.out.format(**wildcards), 62 | tfm=rules.extract_mech_tfm.output, 63 | output: 64 | out='anl/metrics/mech/sss/sss/{dat}.{case}/{pre}.{p2g}.{tfb}.{mdl}.scores.csv' 65 | params: 66 | thr_pval=0.01, 67 | resources: 68 | mem_mb=8000, 69 | runtime=60, 70 | shell: 71 | """ 72 | set +e 73 | timeout $(({resources.runtime}-20))m \ 74 | python workflow/scripts/anl/metrics/mech/sim.py {input.grn} {input.tfm} {params.thr_pval} {output.out} 75 | if [ $? 
-eq 124 ]; then 76 | awk 'BEGIN {{ print "name,prc,rcl,f01" }}' > {output.out} 77 | fi 78 | """ 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /workflow/rules/anl/metrics/pred.smk: -------------------------------------------------------------------------------- 1 | rule pred_omics: 2 | threads: 1 3 | singularity: 'workflow/envs/gretabench.sif' 4 | input: 5 | grn=lambda w: rules.grn_run.output.out.format(**w), 6 | output: 7 | out='anl/metrics/pred/omics/{db}/{dat}.{case}/{pre}.{p2g}.{tfb}.{mdl}.scores.csv' 8 | params: 9 | col_source=lambda w: 'cre' if w.db == 'gcre' else 'source', 10 | col_target=lambda w: 'cre' if w.db == 'cretf' else 'target', 11 | mod_source=lambda w: 'atac' if w.db == 'gcre' else 'rna', 12 | mod_target=lambda w: 'atac' if w.db == 'cretf' else 'rna', 13 | shell: 14 | """ 15 | python workflow/scripts/anl/metrics/pred/omics.py \ 16 | -a {input.grn} \ 17 | -b {params.col_source} \ 18 | -c {params.col_target} \ 19 | -d {params.mod_source} \ 20 | -e {params.mod_target} \ 21 | -f {output} 22 | """ 23 | 24 | 25 | rule pred_gsets: 26 | threads: 1 27 | singularity: 'workflow/envs/gretabench.sif' 28 | input: 29 | grn=lambda w: rules.grn_run.output.out.format(**w), 30 | rsc='dbs/hg38/gst/{db}.csv' 31 | output: 32 | out='anl/metrics/pred/gsets/{db}/{dat}.{case}/{pre}.{p2g}.{tfb}.{mdl}.scores.csv' 33 | shell: 34 | """ 35 | python workflow/scripts/anl/metrics/pred/gsets.py \ 36 | -i {input.grn} \ 37 | -p {input.rsc} \ 38 | -o {output} 39 | """ 40 | -------------------------------------------------------------------------------- /workflow/rules/anl/metrics/prior.smk: -------------------------------------------------------------------------------- 1 | localrules: prior_tfm, prior_tfp, prior_cre 2 | 3 | 4 | rule prior_tfm: 5 | threads: 1 6 | singularity: 'workflow/envs/gretabench.sif' 7 | input: 8 | grn=lambda wildcards: rules.grn_run.output.out.format(**wildcards), 9 | db='dbs/hg38/tfm/{db}/{db}.tsv', 10 | output: 11 | out='anl/metrics/prior/tfm/{db}/{dat}.{case}/{pre}.{p2g}.{tfb}.{mdl}.scores.csv' 12 | shell: 13 | """ 14 | python workflow/scripts/anl/metrics/prior/tfm.py \ 15 | -a {input.grn} \ 16 | -b {input.db} \ 17 | -f {output.out} 18 | """ 19 | 20 | 21 | rule prior_tfp: 22 | threads: 1 23 | singularity: 'workflow/envs/gretabench.sif' 24 | input: 25 | grn=lambda wildcards: rules.grn_run.output.out.format(**wildcards), 26 | db='dbs/hg38/tfp/{db}/{db}.tsv', 27 | output: 28 | out='anl/metrics/prior/tfp/{db}/{dat}.{case}/{pre}.{p2g}.{tfb}.{mdl}.scores.csv' 29 | params: 30 | thr_p=0.01, 31 | shell: 32 | """ 33 | python workflow/scripts/anl/metrics/prior/tfp.py \ 34 | {input.grn} {input.db} {params.thr_p} {output.out} 35 | """ 36 | 37 | 38 | rule prior_tfb: 39 | threads: 1 40 | singularity: 'workflow/envs/gretabench.sif' 41 | input: 42 | grn=lambda wildcards: rules.grn_run.output.out.format(**wildcards), 43 | db='dbs/hg38/tfb/{db}/{db}.bed', 44 | output: 45 | out='anl/metrics/prior/tfb/{db}/{dat}.{case}/{pre}.{p2g}.{tfb}.{mdl}.scores.csv' 46 | params: 47 | grp='source', 48 | shell: 49 | """ 50 | python workflow/scripts/anl/metrics/prior/gnm.py \ 51 | -a {input.grn} \ 52 | -b {input.db} \ 53 | -d {params.grp} \ 54 | -f {output} 55 | """ 56 | 57 | 58 | rule prior_cre: 59 | threads: 1 60 | singularity: 'workflow/envs/gretabench.sif' 61 | input: 62 | grn=lambda wildcards: rules.grn_run.output.out.format(**wildcards), 63 | db='dbs/hg38/cre/{db}/{db}.bed', 64 | output: 65 | 
out='anl/metrics/prior/cre/{db}/{dat}.{case}/{pre}.{p2g}.{tfb}.{mdl}.scores.csv' 66 | shell: 67 | """ 68 | python workflow/scripts/anl/metrics/prior/gnm.py \ 69 | -a {input.grn} \ 70 | -b {input.db} \ 71 | -f {output} 72 | """ 73 | 74 | 75 | rule prior_c2g: 76 | threads: 1 77 | singularity: 'workflow/envs/gretabench.sif' 78 | input: 79 | grn=lambda wildcards: rules.grn_run.output.out.format(**wildcards), 80 | resource='dbs/hg38/c2g/{db}/{db}.bed', 81 | output: 82 | out='anl/metrics/prior/c2g/{db}/{dat}.{case}/{pre}.{p2g}.{tfb}.{mdl}.scores.csv' 83 | params: 84 | grp='target', 85 | shell: 86 | """ 87 | python workflow/scripts/anl/metrics/prior/gnm.py \ 88 | -a {input.grn} \ 89 | -b {input.resource} \ 90 | -d {params.grp} \ 91 | -f {output} 92 | """ 93 | -------------------------------------------------------------------------------- /workflow/rules/anl/metrics/utils.smk: -------------------------------------------------------------------------------- 1 | localrules: aggr_metric, metric_summ 2 | 3 | 4 | rule aggr_metric: 5 | threads: 1 6 | input: 7 | lambda w: make_combs_rules(w=w, mthds=mthds, baselines=baselines, rule_name='{typ}_{tsk}'.format(typ=w.type, tsk=w.task)) 8 | output: 9 | 'anl/metrics/{type}/{task}/{db}/{dat}.{case}.scores.csv' 10 | shell: 11 | """ 12 | python workflow/scripts/anl/metrics/aggregate.py \ 13 | -i {input} \ 14 | -o {output} 15 | """ 16 | 17 | 18 | rule metric_summ: 19 | threads: 1 20 | singularity: 'workflow/envs/gretabench.sif' 21 | input: 22 | [ 23 | 'anl/metrics/mech/prt/knocktf/{dat}.{case}.scores.csv', 24 | 'anl/metrics/mech/tfa/knocktf/{dat}.{case}.scores.csv', 25 | 'anl/metrics/mech/sss/sss/{dat}.{case}.scores.csv', 26 | 'anl/metrics/pred/omics/gtf/{dat}.{case}.scores.csv', 27 | 'anl/metrics/pred/omics/cretf/{dat}.{case}.scores.csv', 28 | 'anl/metrics/pred/omics/gcre/{dat}.{case}.scores.csv', 29 | 'anl/metrics/pred/gsets/kegg/{dat}.{case}.scores.csv', 30 | 'anl/metrics/pred/gsets/hall/{dat}.{case}.scores.csv', 31 | 'anl/metrics/pred/gsets/reac/{dat}.{case}.scores.csv', 32 | 'anl/metrics/pred/gsets/prog/{dat}.{case}.scores.csv', 33 | 'anl/metrics/prior/tfm/hpa/{dat}.{case}.scores.csv', 34 | 'anl/metrics/prior/tfm/tfmdb/{dat}.{case}.scores.csv', 35 | 'anl/metrics/prior/tfp/europmc/{dat}.{case}.scores.csv', 36 | 'anl/metrics/prior/tfp/intact/{dat}.{case}.scores.csv', 37 | 'anl/metrics/prior/tfb/chipatlas/{dat}.{case}.scores.csv', 38 | 'anl/metrics/prior/tfb/remap2022/{dat}.{case}.scores.csv', 39 | 'anl/metrics/prior/tfb/unibind/{dat}.{case}.scores.csv', 40 | 'anl/metrics/prior/cre/blacklist/{dat}.{case}.scores.csv', 41 | 'anl/metrics/prior/cre/encode/{dat}.{case}.scores.csv', 42 | 'anl/metrics/prior/cre/gwascatalogue/{dat}.{case}.scores.csv', 43 | 'anl/metrics/prior/cre/phastcons/{dat}.{case}.scores.csv', 44 | 'anl/metrics/prior/cre/zhang21/{dat}.{case}.scores.csv', 45 | 'anl/metrics/prior/cre/promoters/{dat}.{case}.scores.csv', 46 | 'anl/metrics/prior/c2g/eqtlcatalogue/{dat}.{case}.scores.csv', 47 | ] 48 | output: 'anl/metrics/summary/{dat}.{case}.csv' 49 | shell: 50 | """ 51 | python workflow/scripts/anl/metrics/test.py -m {input} -o {output} 52 | """ 53 | -------------------------------------------------------------------------------- /workflow/rules/anl/pair.smk: -------------------------------------------------------------------------------- 1 | localrules: pair_realsim, pair_fakesim 2 | 3 | 4 | rule pair_real_cor: 5 | threads: 1 6 | singularity: 'workflow/envs/gretabench.sif' 7 | input: 8 | pair='dts/{dname}pair/cases/{case}/mdata.h5mu', 9 | 
npair='dts/{dname}npair/cases/{case}/mdata.h5mu', 10 | output: 11 | cors='anl/pair/{dname}.{case}.real_corvals.csv', 12 | stat='anl/pair/{dname}.{case}.real_corsstat.csv', 13 | singularity: 14 | 'workflow/envs/gretabench.sif' 15 | shell: 16 | """ 17 | python workflow/scripts/anl/pair/real_cors.py \ 18 | -a {input.pair} \ 19 | -b {input.npair} \ 20 | -c {output.cors} \ 21 | -d {output.stat} 22 | """ 23 | 24 | 25 | rule pair_fake_stats: 26 | threads: 1 27 | singularity: 'workflow/envs/gretabench.sif' 28 | input: 29 | mdata='dts/{dname}pair/cases/{case}/mdata.h5mu', 30 | barmap='dts/fake{dname}pair/barmap.csv', 31 | output: 32 | knn='anl/pair/{dname}.{case}.fake_knn.csv', 33 | cor='anl/pair/{dname}.{case}.fake_cor.csv', 34 | prp='anl/pair/{dname}.{case}.fake_prp.csv', 35 | singularity: 36 | 'workflow/envs/gretabench.sif' 37 | shell: 38 | """ 39 | python workflow/scripts/anl/pair/fake_stats.py \ 40 | -a {input.mdata} \ 41 | -b {input.barmap} \ 42 | -c {output.knn} \ 43 | -d {output.cor} \ 44 | -e {output.prp} 45 | """ 46 | 47 | 48 | rule pair_realsim: 49 | threads: 1 50 | singularity: 'workflow/envs/gretabench.sif' 51 | input: 52 | p='anl/topo/{dname}pair.{case}.sims_mult.csv', 53 | n='anl/topo/{dname}npair.{case}.sims_mult.csv', 54 | output: 'anl/pair/{dname}.{case}.pvsn.csv' 55 | shell: 56 | """ 57 | python workflow/scripts/anl/pair/pairsim.py \ 58 | -a {input.p} \ 59 | -b {input.n} \ 60 | -o {output} 61 | """ 62 | 63 | 64 | rule pair_fakesim: 65 | threads: 1 66 | singularity: 'workflow/envs/gretabench.sif' 67 | input: 68 | p='anl/topo/{dname}pair.{case}.sims_mult.csv', 69 | f='anl/topo/fake{dname}pair.{case}.sims_mult.csv', 70 | output: 'anl/pair/{dname}.{case}.pvsf.csv' 71 | shell: 72 | """ 73 | python workflow/scripts/anl/pair/pairsim.py \ 74 | -a {input.p} \ 75 | -b {input.f} \ 76 | -o {output} 77 | """ 78 | 79 | 80 | rule pair_real_qc: 81 | threads: 1 82 | singularity: 'workflow/envs/gretabench.sif' 83 | input: 84 | pair='dts/{dname}pair/cases/{case}/mdata.h5mu', 85 | npair='dts/{dname}npair/cases/{case}/mdata.h5mu', 86 | output: 87 | qc='anl/pair/{dname}.{case}.qc.csv', 88 | nc='anl/pair/{dname}.{case}.ncells.csv' 89 | shell: 90 | """ 91 | python workflow/scripts/anl/pair/realqc.py {input.pair} {input.npair} {output.qc} {output.nc} 92 | """ 93 | -------------------------------------------------------------------------------- /workflow/rules/anl/topo.smk: -------------------------------------------------------------------------------- 1 | localrules: topo_inter, topo_fvsd 2 | 3 | 4 | rule topo_mult: 5 | threads: 4 6 | singularity: 'workflow/envs/gretabench.sif' 7 | input: 8 | lambda w: make_combs_rules(w=w, mthds=mthds, baselines=baselines, rule_name='grn_run') 9 | output: 10 | stats='anl/topo/{dat}.{case}.stats_mult.csv', 11 | sims='anl/topo/{dat}.{case}.sims_mult.csv', 12 | resources: 13 | mem_mb=128000 14 | shell: 15 | """ 16 | python workflow/scripts/anl/topo/run_pair_sim.py \ 17 | -t {output.stats} \ 18 | -s {output.sims} 19 | """ 20 | 21 | 22 | rule topo_fvsd: 23 | threads: 4 24 | singularity: 'workflow/envs/gretabench.sif' 25 | input: 26 | stats=rules.topo_mult.output.stats, 27 | sims=rules.topo_mult.output.sims, 28 | output: 'anl/topo/{dat}.{case}.fvsd.csv', 29 | shell: 30 | """ 31 | python workflow/scripts/anl/topo/fvsd.py {input.sims} {input.stats} {output} 32 | """ 33 | 34 | 35 | rule topo_inter: 36 | threads: 1 37 | input: 38 | lambda w: make_combs_rules(w=w, mthds=mthds, baselines=baselines, rule_name='grn_run') 39 | output: 'anl/topo/{dat}.{case}.inter.csv', 40 | params: 
min_prop=config['topo_min_prop'] 41 | shell: 42 | """ 43 | python workflow/scripts/anl/topo/inter.py \ 44 | -g {input} \ 45 | -b {baselines} \ 46 | -p {params.min_prop} \ 47 | -o {output} 48 | """ 49 | -------------------------------------------------------------------------------- /workflow/rules/anl/tss.smk: -------------------------------------------------------------------------------- 1 | localrules: tss_aggr 2 | 3 | 4 | rule tss_gocoef: 5 | threads: 1 6 | singularity: 'workflow/envs/gretabench.sif' 7 | input: 8 | tss_a='dbs/hg38/gen/tss/{mth_a}.bed', 9 | tss_b='dbs/hg38/gen/tss/{mth_b}.bed', 10 | output: temp(local('anl/tss/ocoef/{mth_a}.{mth_b}.csv')) 11 | resources: 12 | mem_mb=2000, 13 | runtime=config['max_mins_per_step'], 14 | shell: 15 | """ 16 | python workflow/scripts/anl/tss/gocoef.py \ 17 | -a {input.tss_a} \ 18 | -b {input.tss_b} \ 19 | -o {output} 20 | """ 21 | 22 | 23 | tss_paths = [f'anl/tss/ocoef/{mth_a}.{mth_b}.csv' for mth_a, mth_b in combinations([x for x in mthds + baselines], 2)] 24 | rule tss_aggr: 25 | threads: 1 26 | singularity: 'workflow/envs/gretabench.sif' 27 | input: tss_paths 28 | output: "anl/tss/ocoef.csv" 29 | shell: 30 | """ 31 | python -c "import pandas as pd; import sys; \ 32 | tss_paths = sys.argv[1:]; \ 33 | df = pd.concat([pd.read_csv(tss_path) for tss_path in tss_paths]); \ 34 | df.to_csv('{output}', index=False);" {input} 35 | """ 36 | 37 | 38 | rule tss_dist: 39 | threads: 1 40 | singularity: 'workflow/envs/gretabench.sif' 41 | input: 42 | c=rules.tss_aggr.output, 43 | g='anl/topo/{dat}.{case}.stats_mult.csv' 44 | output: "anl/tss/{dat}.{case}.dist.csv" 45 | resources: 46 | mem_mb=restart_mem, 47 | runtime=config['max_mins_per_step'], 48 | params: 49 | b=baselines, 50 | shell: 51 | """ 52 | python workflow/scripts/anl/tss/dist.py \ 53 | -g {input.g} \ 54 | -b {params.b} \ 55 | -o {output} 56 | """ 57 | -------------------------------------------------------------------------------- /workflow/rules/dbs/cre.smk: -------------------------------------------------------------------------------- 1 | localrules: cre_blacklist, cre_encode, cre_gwascatalogue, cre_phastcons, cre_promoters, cre_zhang21 2 | 3 | 4 | rule cre_blacklist: 5 | threads: 1 6 | output: 'dbs/hg38/cre/blacklist/blacklist.bed' 7 | params: 8 | url=config['dbs']['hg38']['cre']['blacklist'] 9 | shell: 10 | """ 11 | wget --no-verbose -O - "{params.url}" | zcat > {output} 12 | """ 13 | 14 | 15 | rule cre_encode: 16 | threads: 1 17 | singularity: 'workflow/envs/gretabench.sif' 18 | output: 'dbs/hg38/cre/encode/encode.bed' 19 | params: 20 | url=config['dbs']['hg38']['cre']['encode'] 21 | shell: 22 | """ 23 | wget --no-verbose '{params.url}' -O {output}.tmp && \ 24 | cat {output}.tmp | sort -k 1,1 -k2,2n | bedtools merge -c 6 -o distinct > {output} && \ 25 | rm {output}.tmp 26 | """ 27 | 28 | 29 | rule cre_gwascatalogue: 30 | threads: 1 31 | singularity: 'workflow/envs/gretabench.sif' 32 | output: 'dbs/hg38/cre/gwascatalogue/gwascatalogue.bed' 33 | params: 34 | url=config['dbs']['hg38']['cre']['gwascatalogue'] 35 | shell: 36 | """ 37 | wget --no-verbose '{params.url}' -O {output} && \ 38 | python workflow/scripts/dbs/cre/gwascatalogue.py -i {output} && \ 39 | sort -k 1,1 -k2,2n {output} | bedtools merge -i - -c 4,5 -o distinct,distinct -delim "|" > {output}.tmp && \ 40 | mv {output}.tmp {output} 41 | """ 42 | 43 | 44 | rule cre_phastcons: 45 | threads: 1 46 | singularity: 'workflow/envs/pando.sif' 47 | output: 'dbs/hg38/cre/phastcons/phastcons.bed' 48 | params: 49 | 
url=config['dbs']['hg38']['cre']['phastcons'] 50 | shell: 51 | """ 52 | wget --no-verbose '{params.url}' -O {output}.tmp && \ 53 | Rscript -e " \ 54 | df <- get(load('{output}.tmp')); \ 55 | df <- GenomicRanges::reduce(df); \ 56 | df <- as.data.frame(df)[, c('seqnames', 'start', 'end')]; \ 57 | write.table(x=df, file='{output}.tmp', sep = '\t', quote=FALSE, row.names=FALSE, col.names=FALSE)" && \ 58 | sort -k 1,1 -k2,2n {output}.tmp > {output} && \ 59 | rm {output}.tmp 60 | """ 61 | 62 | 63 | rule cre_promoters: 64 | threads: 1 65 | singularity: 'workflow/envs/gretabench.sif' 66 | output: 'dbs/hg38/cre/promoters/promoters.bed' 67 | params: 68 | wsize=config['cre_prom_size'] 69 | shell: 70 | """ 71 | Rscript workflow/scripts/dbs/cre/promoters.R \ 72 | {params.wsize} \ 73 | {output} 74 | """ 75 | 76 | 77 | rule cre_zhang21: 78 | threads: 1 79 | singularity: 'workflow/envs/gretabench.sif' 80 | output: 'dbs/hg38/cre/zhang21/zhang21.bed' 81 | params: 82 | url=config['dbs']['hg38']['cre']['zhang21'] 83 | shell: 84 | """ 85 | wget --no-verbose '{params.url}' -O {output}.tmp && \ 86 | zcat {output}.tmp | bedtools merge > {output} && \ 87 | rm {output}.tmp 88 | """ 89 | -------------------------------------------------------------------------------- /workflow/rules/dbs/ont.smk: -------------------------------------------------------------------------------- 1 | localrules: ont_bto 2 | 3 | 4 | checkpoint ont_bto: 5 | threads: 1 6 | singularity: 'workflow/envs/gretabench.sif' 7 | output: 'dbs/ont/bto.tsv' 8 | params: 9 | url=config['dbs']['ont']['bto'], 10 | shell: 11 | """ 12 | wget --no-verbose '{params.url}' -O - | \ 13 | python workflow/scripts/dbs/ont/bto.py {output} 14 | """ -------------------------------------------------------------------------------- /workflow/rules/dbs/prt.smk: -------------------------------------------------------------------------------- 1 | localrules: prt_knocktf 2 | 3 | 4 | rule prt_knocktf: 5 | threads: 1 6 | output: 7 | meta='dbs/hg38/prt/knocktf/meta.csv', 8 | diff='dbs/hg38/prt/knocktf/diff.csv', 9 | dir=directory('dbs/hg38/prt/knocktf/') 10 | params: 11 | url_m=config['dbs']['hg38']['prt']['knocktf']['meta'], 12 | url_d=config['dbs']['hg38']['prt']['knocktf']['diff'], 13 | shell: 14 | """ 15 | wget --no-verbose '{params.url_m}' -O {output.meta} && \ 16 | wget --no-verbose '{params.url_d}' -O {output.diff} 17 | """ 18 | -------------------------------------------------------------------------------- /workflow/rules/dbs/tfm.smk: -------------------------------------------------------------------------------- 1 | localrules: tfm_hpa, tfm_tfmdb 2 | 3 | 4 | rule tfm_hpa: 5 | threads: 1 6 | singularity: 'workflow/envs/gretabench.sif' 7 | input: rules.gen_tfs_lambert.output 8 | output: 'dbs/hg38/tfm/hpa/hpa.tsv' 9 | params: 10 | url=config['dbs']['hg38']['tfm']['hpa'] 11 | shell: 12 | """ 13 | wget --no-verbose '{params.url}' -O {output}.zip && \ 14 | python workflow/scripts/dbs/tfm/hpa.py \ 15 | -i {output}.zip \ 16 | -t {input} \ 17 | -o {output} 18 | """ 19 | 20 | 21 | rule tfm_tfmdb: 22 | threads: 1 23 | singularity: 'workflow/envs/gretabench.sif' 24 | output: 'dbs/hg38/tfm/tfmdb/tfmdb.tsv' 25 | params: 26 | url=config['dbs']['hg38']['tfm']['tfmdb'] 27 | shell: 28 | """ 29 | wget --no-verbose '{params.url}' -O {output} && \ 30 | python -c "import pandas as pd; \ 31 | import sys; \ 32 | df = pd.read_csv(sys.argv[1]); \ 33 | df = df[['Gene Name', 'Cell Name', 'Tissue Type']]; \ 34 | df['ctype'] = df['Cell Name'] + ',' + df['Tissue Type']; \ 35 | df = df.groupby('Gene 
Name', as_index=False)['ctype'].apply(lambda x: ','.join(x)); \ 36 | df['ctype'] = [','.join(sorted(set(s.split(',')))) for s in df['ctype']]; \ 37 | df = df.drop_duplicates(['Gene Name', 'ctype']); \ 38 | df = df.rename(columns={{'Gene Name': 'gene'}}); \ 39 | df = df.sort_values(['gene', 'ctype']); \ 40 | df.to_csv(sys.argv[1], sep='\\t', index=False, header=None)" {output} 41 | """ 42 | -------------------------------------------------------------------------------- /workflow/rules/dbs/tfp.smk: -------------------------------------------------------------------------------- 1 | localrules: download_intact, tfp_intact, tfp_europmc 2 | 3 | 4 | rule download_intact: 5 | output: 6 | temp("dbs/hg38/tfp/intact/raw/intact.txt") 7 | params: 8 | url=config['dbs']['hg38']['tfp']['intact'] 9 | shell: 10 | """ 11 | wget --no-verbose {params.url} -O {output}.zip && \ 12 | unzip -o {output}.zip -d $( dirname {output} ) && \ 13 | rm {output}.zip 14 | """ 15 | 16 | 17 | rule tfp_intact: 18 | input: 19 | inc=rules.download_intact.output, 20 | lmb=rules.gen_tfs_lambert.output, 21 | pid=rules.gen_pid_uniprot.output, 22 | output: 'dbs/hg38/tfp/intact/intact.tsv' 23 | shell: 24 | """ 25 | python workflow/scripts/dbs/tfp/intact.py \ 26 | {input.inc} {input.lmb} {input.pid} {output} 27 | """ 28 | 29 | 30 | rule tfp_europmc_raw: 31 | threads: 1 32 | singularity: 'workflow/envs/gretabench.sif' 33 | input: rules.gen_tfs_lambert.output, 34 | output: 35 | single='dbs/hg38/tfp/europmc/raw/single.csv', 36 | pairs='dbs/hg38/tfp/europmc/raw/pairs.csv' 37 | params: 38 | min_chars=2, 39 | min_n=49 40 | resources: 41 | runtime=config['max_mins_per_step'] * 2, 42 | shell: 43 | """ 44 | python workflow/scripts/dbs/tfp/europmc_raw.py \ 45 | {input} {params.min_chars} {params.min_n} {output.single} {output.pairs} 46 | """ 47 | 48 | 49 | rule tfp_europmc: 50 | threads: 1 51 | singularity: 'workflow/envs/gretabench.sif' 52 | input: 53 | single=rules.tfp_europmc_raw.output.single, 54 | pairs=rules.tfp_europmc_raw.output.pairs, 55 | output: 'dbs/hg38/tfp/europmc/europmc.tsv' 56 | params: 57 | pval_thr=2.2e-16, 58 | min_odds=5, 59 | shell: 60 | """ 61 | python workflow/scripts/dbs/tfp/europmc.py \ 62 | {input.single} {input.pairs} {params.pval_thr} {params.min_odds} {output} 63 | """ 64 | -------------------------------------------------------------------------------- /workflow/rules/dts/fakepair.smk: -------------------------------------------------------------------------------- 1 | localrules: index_frags_fakepair 2 | 3 | 4 | rule index_frags_fakepair: 5 | threads: 1 6 | input: 7 | frags=lambda w: map_rules('download', w_name='{dname}pair'.format(dname=w.dname), out='frags'), 8 | tbis=lambda w: map_rules('download', w_name='{dname}pair'.format(dname=w.dname), out='tbis'), 9 | output: 10 | frags=temp(local('dts/fake{dname}pair/smpl.frags.tsv.gz')), 11 | tbis=temp(local('dts/fake{dname}pair/smpl.frags.tsv.gz.tbi')), 12 | shell: 13 | """ 14 | cp {input.frags} {output.frags} 15 | cp {input.tbis} {output.tbis} 16 | """ 17 | 18 | 19 | rule coem_fakepair: 20 | threads: 32 21 | singularity: 'workflow/envs/figr.sif' 22 | input: 23 | gex=lambda w: map_rules(rule_prefix='download', w_name='{dname}pair'.format(dname=w.dname), out='gex'), 24 | peaks=lambda w: map_rules('callpeaks', w_name='{dname}pair'.format(dname=w.dname), out='peaks'), 25 | frags=rules.index_frags_fakepair.output.frags, 26 | tbis=rules.index_frags_fakepair.output.tbis, 27 | output: 28 | cca=temp(local('dts/fake{dname}pair/cca.rds')) 29 | resources: mem_mb=128000 30 | 
shell: 31 | """ 32 | Rscript workflow/scripts/dts/fakepair/coembedd.R \ 33 | {input.gex} \ 34 | {input.peaks} \ 35 | {input.frags} \ 36 | {output.cca} 37 | """ 38 | 39 | 40 | rule pair_fakepair: 41 | threads: 1 42 | singularity: 'workflow/envs/figr.sif' 43 | input: 44 | cca=rules.coem_fakepair.output.cca, 45 | annot=lambda w: map_rules(rule_prefix='download', w_name='{dname}pair'.format(dname=w.dname), out='annot'), 46 | output: barmap=temp(local('dts/fake{dname}pair/barmap.csv')) 47 | shell: 48 | """ 49 | Rscript workflow/scripts/dts/fakepair/paircells.R \ 50 | {input.cca} \ 51 | {input.annot} \ 52 | {output.barmap} 53 | """ 54 | 55 | localrules: annotate_fakepitupair 56 | rule annotate_fakepitupair: 57 | threads: 1 58 | singularity: 'workflow/envs/gretabench.sif' 59 | input: 60 | mdata=rules.annotate_pitupair.output.out, 61 | barmap='dts/fakepitupair/barmap.csv', 62 | output: 63 | out='dts/fakepitupair/annotated.h5mu' 64 | shell: 65 | """ 66 | python workflow/scripts/dts/fakepair/fakepair.py \ 67 | -m {input.mdata} \ 68 | -b {input.barmap} \ 69 | -o {output.out} 70 | """ 71 | -------------------------------------------------------------------------------- /workflow/rules/dts/general.smk: -------------------------------------------------------------------------------- 1 | rule extract_case: 2 | threads: 32 3 | singularity: 'workflow/envs/gretabench.sif' 4 | input: lambda w: map_rules('annotate', w.dat) 5 | output: 6 | mdata='dts/{dat}/cases/{case}/mdata.h5mu', 7 | params: 8 | celltypes=lambda w: config['dts'][w.dat]['cases'][w.case]['celltypes'], 9 | n_sample=lambda w: config['dts'][w.dat]['cases'][w.case]['n_sample'] if 'n_sample' in config['dts'][w.dat]['cases'][w.case] else '0', 10 | seed=lambda w: config['dts'][w.dat]['cases'][w.case]['seed'] if 'n_sample' in config['dts'][w.dat]['cases'][w.case] else '0', 11 | n_hvg=lambda w: config['dts'][w.dat]['cases'][w.case]['n_hvg'], 12 | n_hvr=lambda w: config['dts'][w.dat]['cases'][w.case]['n_hvr'], 13 | root=lambda w: config['dts'][w.dat]['cases'][w.case]['root'] if 'root' in config['dts'][w.dat]['cases'][w.case] else 'None', 14 | shell: 15 | """ 16 | python workflow/scripts/dts/extract_case.py \ 17 | -i '{input}' \ 18 | -c '{params.celltypes}' \ 19 | -s '{params.n_sample}' \ 20 | -d '{params.seed}' \ 21 | -g '{params.n_hvg}' \ 22 | -r '{params.n_hvr}' \ 23 | -t '{params.root}' \ 24 | -o '{output.mdata}' 25 | """ 26 | -------------------------------------------------------------------------------- /workflow/rules/dts/pbmc10k.smk: -------------------------------------------------------------------------------- 1 | rule download_pbmc10k: 2 | threads: 1 3 | singularity: 'workflow/envs/figr.sif' 4 | output: 5 | frags='dts/pbmc10k/smpl.frags.tsv.gz', 6 | tbis='dts/pbmc10k/smpl.frags.tsv.gz.tbi', 7 | params: 8 | matrix=config['dts']['pbmc10k']['url']['matrix'], 9 | atac_frags=config['dts']['pbmc10k']['url']['atac_frags'], 10 | shell: 11 | """ 12 | wget --no-verbose '{params.atac_frags}' -O '{output.frags}' 13 | bash workflow/scripts/dts/format_frags.sh {output.frags} 14 | """ 15 | 16 | 17 | rule prcannot_pbmc10k: 18 | threads: 1 19 | singularity: 'workflow/envs/gretabench.sif' 20 | output: annot=temp(local('dts/pbmc10k/annot.csv')), 21 | shell: 22 | "python workflow/scripts/dts/pbmc10k/prc_annot.py -a {output.annot}" 23 | 24 | 25 | rule callpeaks_pbmc10k: 26 | threads: 32 27 | singularity: 'workflow/envs/gretabench.sif' 28 | input: 29 | frags=rules.download_pbmc10k.output.frags, 30 | annot=rules.prcannot_pbmc10k.output.annot, 31 | output: 
peaks=temp(local('dts/pbmc10k/peaks.h5ad')) 32 | resources: mem_mb=64000 33 | shell: 34 | """ 35 | python workflow/scripts/dts/callpeaks.py \ 36 | -f {input.frags} \ 37 | -a {input.annot} \ 38 | -t '/tmp/pbcm10k/' \ 39 | -n {threads} \ 40 | -o {output.peaks} 41 | """ 42 | 43 | 44 | rule annotate_pbmc10k: 45 | threads: 1 46 | singularity: 'workflow/envs/gretabench.sif' 47 | input: 48 | annot=rules.prcannot_pbmc10k.output.annot, 49 | peaks=rules.callpeaks_pbmc10k.output.peaks, 50 | gid=rules.gen_gid_ensmbl.output, 51 | output: out='dts/pbmc10k/annotated.h5mu' 52 | resources: mem_mb=32000 53 | shell: 54 | """ 55 | python workflow/scripts/dts/pbmc10k/pbmc10k.py \ 56 | -b {input.annot} \ 57 | -c {input.gid} \ 58 | -e {input.peaks} \ 59 | -f {output.out} 60 | """ 61 | -------------------------------------------------------------------------------- /workflow/rules/dts/pitupair.smk: -------------------------------------------------------------------------------- 1 | rule download_pitupair: 2 | threads: 1 3 | singularity: 'workflow/envs/figr.sif' 4 | output: 5 | gex=temp(local('dts/pitupair/multiome_original.h5')), 6 | frags='dts/pitupair/smpl.frags.tsv.gz', 7 | tbis='dts/pitupair/smpl.frags.tsv.gz.tbi', 8 | annot=temp(local('dts/pitupair/annot.csv')) 9 | params: 10 | gex=config['dts']['pitupair']['url']['gex'], 11 | frags=config['dts']['pitupair']['url']['frags'], 12 | annot=config['dts']['pitupair']['url']['annot'] 13 | shell: 14 | """ 15 | wget --no-verbose '{params.frags}' -O '{output.frags}' && \ 16 | bash workflow/scripts/dts/format_frags.sh {output.frags} && \ 17 | wget --no-verbose '{params.gex}' -O '{output.gex}' && \ 18 | wget --no-verbose '{params.annot}' -O '{output.annot}' && \ 19 | awk 'BEGIN {{FS=OFS=","}} NR==1 {{print $0; next}} {{gsub(/-[0-9]+$/, "", $1); print $3"_"$1,$2,$3}}' {output.annot} > {output.annot}.tmp && \ 20 | mv {output.annot}.tmp {output.annot} 21 | """ 22 | 23 | 24 | rule callpeaks_pitupair: 25 | threads: 32 26 | singularity: 'workflow/envs/gretabench.sif' 27 | input: 28 | frags=rules.download_pitupair.output.frags, 29 | annot=rules.download_pitupair.output.annot, 30 | output: peaks=temp(local('dts/pitupair/peaks.h5ad')) 31 | resources: mem_mb=64000 32 | shell: 33 | """ 34 | python workflow/scripts/dts/callpeaks.py \ 35 | -f {input.frags} \ 36 | -a {input.annot} \ 37 | -t '/tmp/pitupair/' \ 38 | -n {threads} \ 39 | -o {output.peaks} 40 | """ 41 | 42 | 43 | rule annotate_pitupair: 44 | threads: 1 45 | singularity: 'workflow/envs/gretabench.sif' 46 | input: 47 | annot=rules.download_pitupair.output.annot, 48 | peaks=rules.callpeaks_pitupair.output.peaks, 49 | gex=rules.download_pitupair.output.gex, 50 | gid=rules.gen_gid_ensmbl.output, 51 | output: out='dts/pitupair/annotated.h5mu' 52 | resources: mem_mb=32000 53 | shell: 54 | """ 55 | python workflow/scripts/dts/pitupair/pitupair.py \ 56 | -b {input.annot} \ 57 | -c {input.gid} \ 58 | -e {input.peaks} \ 59 | -f {output} \ 60 | -g {input.gex} 61 | """ 62 | -------------------------------------------------------------------------------- /workflow/rules/img/img.smk: -------------------------------------------------------------------------------- 1 | localrules: dwn_image 2 | 3 | rule dwn_image: 4 | threads: 1 5 | singularity: None 6 | output: 'workflow/envs/{name_img}.sif' 7 | resources: 8 | mem_mb=8000, 9 | runtime=config['max_mins_per_step'], 10 | shell: 11 | """ 12 | wget "https://zenodo.org/records/15058660/files/{wildcards.name_img}.sif?download=1" -O {output} 13 | """ 14 | 
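Rule `dwn_image` above fetches the prebuilt `.sif` containers from Zenodo that the `singularity:` directives throughout the workflow point at. As a hedged aside (not part of the repository; it assumes Singularity/Apptainer is available locally), a downloaded image can also be entered by hand, which helps when debugging an individual script outside of Snakemake:
```
# Hypothetical debugging sketch, assuming singularity (or apptainer) is installed.
# Fetch one image the same way rule dwn_image does:
wget "https://zenodo.org/records/15058660/files/gretabench.sif?download=1" -O workflow/envs/gretabench.sif
# Open an interactive shell inside the container to try scripts manually:
singularity shell workflow/envs/gretabench.sif
```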
-------------------------------------------------------------------------------- /workflow/rules/mth/grn.smk: -------------------------------------------------------------------------------- 1 | localrules: grn_run 2 | 3 | 4 | rule grn_run: 5 | threads: 1 6 | singularity: 'workflow/envs/gretabench.sif' 7 | input: lambda wildcards: map_rules('mdl', wildcards.mdl), 8 | output: 9 | out='dts/{dat}/cases/{case}/runs/{pre}.{p2g}.{tfb}.{mdl}.grn.csv' 10 | shell: 11 | """ 12 | python workflow/scripts/mth/grn.py \ 13 | -i {input} \ 14 | -o {output.out} 15 | """ 16 | 17 | 18 | rule mdl_collectri: 19 | threads: 1 20 | singularity: 'workflow/envs/gretabench.sif' 21 | input: 22 | mdata=rules.extract_case.output.mdata, 23 | grn=rules.gst_collectri.output, 24 | proms=rules.cre_promoters.output, 25 | output: 26 | out='dts/{dat}/cases/{case}/runs/collectri.collectri.collectri.collectri.mdl.csv' 27 | resources: 28 | mem_mb=restart_mem, 29 | runtime=config['max_mins_per_step'], 30 | shell: 31 | """ 32 | python workflow/scripts/mth/prc_prior_grn.py \ 33 | -g {input.grn} \ 34 | -d {input.mdata} \ 35 | -p {input.proms} \ 36 | -o {output.out} 37 | """ 38 | 39 | 40 | rule mdl_dorothea: 41 | threads: 1 42 | singularity: 'workflow/envs/gretabench.sif' 43 | input: 44 | mdata=rules.extract_case.output.mdata, 45 | grn=rules.gst_dorothea.output, 46 | proms=rules.cre_promoters.output, 47 | output: 48 | out='dts/{dat}/cases/{case}/runs/dorothea.dorothea.dorothea.dorothea.mdl.csv' 49 | resources: 50 | mem_mb=restart_mem, 51 | runtime=config['max_mins_per_step'], 52 | shell: 53 | """ 54 | python workflow/scripts/mth/prc_prior_grn.py \ 55 | -g {input.grn} \ 56 | -d {input.mdata} \ 57 | -p {input.proms} \ 58 | -o {output.out} 59 | """ 60 | -------------------------------------------------------------------------------- /workflow/rules/mth/random.smk: -------------------------------------------------------------------------------- 1 | rule mdl_random: 2 | threads: 1 3 | singularity: 'workflow/envs/gretabench.sif' 4 | input: 5 | mdata=rules.extract_case.output.mdata, 6 | tf=rules.gen_tfs_lambert.output, 7 | cg=rules.cre_promoters.output, 8 | output: out='dts/{dat}/cases/{case}/runs/random.random.random.random.mdl.csv' 9 | params: 10 | g_perc=0.25, 11 | scale=1, 12 | tf_g_ratio=0.10, 13 | w_size=250000, 14 | seed=lambda w: config['dts'][w.dat]['cases'][w.case].get('seed', 42), 15 | resources: 16 | mem_mb=restart_mem, 17 | runtime=config['max_mins_per_step'], 18 | shell: 19 | """ 20 | python workflow/scripts/mth/random/grn.py \ 21 | -i {input.mdata} \ 22 | -t {input.tf} \ 23 | -c {input.cg} \ 24 | -g {params.g_perc} \ 25 | -n {params.scale} \ 26 | -r {params.tf_g_ratio} \ 27 | -w {params.w_size} \ 28 | -s {params.seed} \ 29 | -o {output.out} 30 | """ 31 | -------------------------------------------------------------------------------- /workflow/rules/mth/scenic.smk: -------------------------------------------------------------------------------- 1 | rule mdl_scenic: 2 | threads: 16 3 | singularity: 'workflow/envs/scenicplus.sif' 4 | input: 5 | img='workflow/envs/scenicplus.sif', 6 | mdata=rules.extract_case.output.mdata, 7 | tf=rules.gen_tfs_scenic.output, 8 | proms=rules.cre_promoters.output, 9 | ranking_small=rules.gen_motif_scenic_rnk.output.sml, 10 | ranking_big=rules.gen_motif_scenic_rnk.output.big, 11 | motifs=rules.gen_motif_scenic.output 12 | output: 13 | adj=temp(local('dts/{dat}/cases/{case}/runs/adj_tmp.tsv')), 14 | t=temp(local('dts/{dat}/cases/{case}/runs/scenic_tmp.loom')), 15 | 
reg=temp(local('dts/{dat}/cases/{case}/runs/scenic_reg.csv')), 16 | out='dts/{dat}/cases/{case}/runs/scenic.scenic.scenic.scenic.mdl.csv' 17 | resources: 18 | mem_mb=restart_mem, 19 | runtime=config['max_mins_per_step'] * 2, 20 | shell: 21 | """ 22 | # Step 1: Create Loom file 23 | python workflow/scripts/mth/scenic/loom.py \ 24 | -i {input.mdata} \ 25 | -o {output.t} 26 | echo "Created loom" 27 | 28 | # Step 2: Run pyscenic GRN 29 | arboreto_with_multiprocessing.py {output.t} {input.tf} -o {output.adj} --num_workers {threads} --seed 42 30 | echo "Generated adj" 31 | 32 | # Step 3: Run CTX 33 | pyscenic ctx {output.adj} \ 34 | {input.ranking_small} \ 35 | {input.ranking_big} \ 36 | --annotations_fname {input.motifs} \ 37 | --expression_mtx_fname {output.t} \ 38 | --output {output.reg} \ 39 | --mask_dropouts \ 40 | --num_workers {threads} 41 | echo "Filtered TFs by motifs" 42 | 43 | # Step 4: Process GRN 44 | python workflow/scripts/mth/scenic/process_grn.py \ 45 | -o {output.out} \ 46 | -p {input.proms} \ 47 | -g {output.adj} \ 48 | -r {output.reg} 49 | echo "Done" 50 | """ 51 | -------------------------------------------------------------------------------- /workflow/rules/plt/comb.smk: -------------------------------------------------------------------------------- 1 | #localrules: fig_comb 2 | 3 | 4 | rule fig_comb: 5 | threads: 1 6 | singularity: 'workflow/envs/gretabench.sif' 7 | input: 8 | mdta='dts/pbmc10k/cases/all/mdata.h5mu', 9 | qc='anl/dts/pbmc10k.all.qc.csv', 10 | nc='anl/dts/pbmc10k.all.nc.csv', 11 | sims='anl/topo/pbmc10k.all.sims_mult.csv', 12 | stat='anl/topo/pbmc10k.all.stats_mult.csv', 13 | fvsd='anl/topo/pbmc10k.all.fvsd.csv', 14 | stab='anl/stab/pbmc10k.all.ovsd.csv', 15 | output: 'plt/comb/fig.pdf' 16 | shell: 17 | """ 18 | python workflow/scripts/plt/comb/sims.py {input.mdta} \ 19 | {input.nc} {input.qc} {input.sims} {input.stat} {input.fvsd} {input.stab} {output} 20 | """ 21 | -------------------------------------------------------------------------------- /workflow/rules/plt/dbs.smk: -------------------------------------------------------------------------------- 1 | localrules: fig_dbs 2 | 3 | 4 | rule fig_dbs: 5 | threads: 1 6 | singularity: 'workflow/envs/gretabench.sif' 7 | input: 8 | sts='anl/dbs/stats.csv', 9 | ovc='anl/dbs/ocoef.csv', 10 | output: 'plt/dbs/fig.pdf' 11 | shell: 12 | """ 13 | python workflow/scripts/plt/dbs/stats.py {input.sts} {input.ovc} {output} 14 | """ 15 | -------------------------------------------------------------------------------- /workflow/rules/plt/eval.smk: -------------------------------------------------------------------------------- 1 | localrules: fig_eval 2 | 3 | 4 | rule fig_eval: 5 | threads: 1 6 | singularity: 'workflow/envs/gretabench.sif' 7 | input: 8 | smr='anl/metrics/summary/pbmc10k.all.csv', 9 | dct='anl/stab/unsmthds/pbmc10k.scores.csv', 10 | output: 'plt/eval/fig.pdf' 11 | shell: 12 | """ 13 | python workflow/scripts/plt/eval/eval.py {input} {output} 14 | """ 15 | -------------------------------------------------------------------------------- /workflow/rules/plt/figs.smk: -------------------------------------------------------------------------------- 1 | localrules: plt_figs 2 | 3 | 4 | rule plt_figs: 5 | threads: 1 6 | input: ['plt/stab/fig.pdf', 'plt/pair/fig.pdf', 'plt/comb/fig.pdf', 'plt/dbs/fig.pdf', 'plt/eval/fig.pdf'] 7 | output: 'plt/figs.txt' 8 | shell: 9 | """ 10 | touch {output} 11 | echo 'Done' 12 | """ -------------------------------------------------------------------------------- 
/workflow/rules/plt/pair.smk: -------------------------------------------------------------------------------- 1 | localrules: plt_npair, plt_fake, fig_pair 2 | 3 | 4 | rule plt_npair: 5 | threads: 1 6 | singularity: 'workflow/envs/gretabench.sif' 7 | input: 8 | pmd='dts/pitupair/cases/all/mdata.h5mu', 9 | nmd='dts/pitunpair/cases/all/mdata.h5mu', 10 | ral='anl/pair/pitu.all.real_corvals.csv', 11 | qc='anl/pair/pitu.all.qc.csv', 12 | nc='anl/pair/pitu.all.ncells.csv', 13 | oc='anl/pair/pitu.all.pvsn.csv', 14 | output: 'plt/pair/npair.pdf' 15 | shell: 16 | """ 17 | python workflow/scripts/plt/pair/pair.py {input.pmd} {input.nmd} {input.ral} {input.qc} {input.nc} {input.oc} {output} 18 | """ 19 | 20 | 21 | rule plt_fake: 22 | threads: 1 23 | singularity: 'workflow/envs/gretabench.sif' 24 | input: 25 | knn='anl/pair/pitu.all.fake_knn.csv', 26 | ctp='anl/pair/pitu.all.fake_prp.csv', 27 | cor='anl/pair/pitu.all.fake_cor.csv', 28 | ocf='anl/pair/pitu.all.pvsf.csv', 29 | output: 'plt/pair/fake.pdf' 30 | shell: 31 | """ 32 | python workflow/scripts/plt/pair/fake.py {input.knn} {input.ctp} {input.cor} {input.ocf} {output} 33 | """ 34 | 35 | 36 | rule fig_pair: 37 | threads: 1 38 | input: ['plt/pair/npair.pdf', 'plt/pair/fake.pdf'] 39 | output: 'plt/pair/fig.pdf' 40 | shell: 41 | """ 42 | gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile={output} {input} 43 | """ 44 | -------------------------------------------------------------------------------- /workflow/rules/plt/stab.smk: -------------------------------------------------------------------------------- 1 | localrules: plt_dwns, plt_sims, plt_AREG, fig_stability 2 | 3 | 4 | rule plt_dwns: 5 | threads: 1 6 | singularity: 'workflow/envs/gretabench.sif' 7 | input: 8 | ovc='anl/stab/pitupair.ovc.csv', 9 | auc='anl/stab/pitupair.auc.csv', 10 | wgt='anl/stab/pitupair.wgt.csv', 11 | cor='anl/stab/pitupair.cor.csv', 12 | output: 13 | stab='plt/stab/dwns.pdf', 14 | cors='plt/stab/cors.pdf', 15 | shell: 16 | """ 17 | python workflow/scripts/plt/stab/stab.py {input.ovc} {input.auc} {output.stab} 18 | python workflow/scripts/plt/stab/cors.py {input.wgt} {input.cor} {output.cors} 19 | """ 20 | 21 | 22 | rule plt_sims: 23 | threads: 1 24 | singularity: 'workflow/envs/gretabench.sif' 25 | input: 26 | sims='anl/topo/pitupair.all.sims_mult.csv', 27 | stats='anl/topo/pitupair.all.stats_mult.csv', 28 | tss=rules.tss_aggr.output, 29 | dst='anl/tss/pitupair.all.dist.csv', 30 | net='anl/topo/pitupair.all.inter.csv', 31 | output: 'plt/stab/sims.pdf' 32 | shell: 33 | """ 34 | python workflow/scripts/plt/stab/sims.py \ 35 | {input.sims} {input.stats} {input.tss} {input.dst} {input.net} {output} 36 | """ 37 | 38 | 39 | rule plt_AREG: 40 | threads: 1 41 | singularity: 'workflow/envs/gretabench.sif' 42 | input: 43 | sims='anl/topo/pitupair.all.sims_mult.csv', 44 | gann='dbs/hg38/gen/ann/dictys/ann.bed', 45 | output: 'plt/stab/links_AREG.pdf' 46 | params: 47 | gene='AREG', 48 | tfs=['FOSL1', 'FOSL2', 'JUNB'], 49 | wsize=250000 50 | shell: 51 | """ 52 | python workflow/scripts/plt/stab/links.py \ 53 | -s {input.sims} \ 54 | -g {params.gene} \ 55 | -t {params.tfs} \ 56 | -a {input.gann} \ 57 | -w {params.wsize} \ 58 | -o {output} 59 | """ 60 | 61 | 62 | rule fig_stability: 63 | threads: 1 64 | input: 65 | stab='plt/stab/dwns.pdf', 66 | cors='plt/stab/cors.pdf', 67 | sims='plt/stab/sims.pdf', 68 | areg='plt/stab/links_AREG.pdf' 69 | output: 'plt/stab/fig.pdf' 70 | shell: 71 | """ 72 | gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile={output} {input.stab} {input.cors} 
{input.sims} {input.areg} 73 | """ 74 | -------------------------------------------------------------------------------- /workflow/scripts/anl/dbs/terms.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from tqdm import tqdm 4 | import os 5 | import argparse 6 | 7 | 8 | # Init args 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('-i','--db_paths', required=True, nargs='+') 11 | parser.add_argument('-o','--path_out', required=True) 12 | args = vars(parser.parse_args()) 13 | 14 | db_paths = args['db_paths'] 15 | path_out = args['path_out'] 16 | 17 | non_term_dbs = ['blacklist', 'encode', 'promoters', 'zhang21', 'phastcons'] 18 | df = [] 19 | for db_path in db_paths: 20 | db_name = os.path.basename(os.path.dirname(db_path)) 21 | task = os.path.basename(os.path.dirname(os.path.dirname(db_path))) 22 | if db_name not in non_term_dbs: 23 | if task == 'tfb': 24 | db = pd.read_csv(db_path, header=None, sep='\t', usecols=[4])[4] 25 | terms = set() 26 | for r in tqdm(db): 27 | terms.update(r.split(',')) 28 | terms = sorted(terms) 29 | elif task == 'tfm': 30 | db = pd.read_csv(db_path, sep='\t', header=None, usecols=[1])[1] 31 | terms = set() 32 | for r in db: 33 | terms.update(r.split(',')) 34 | terms = sorted(terms) 35 | elif task == 'prt': 36 | db = pd.read_csv(db_path) 37 | terms = np.sort(db['Tissue.Type'].unique()) 38 | elif 'catalogue' in db_name: 39 | db = pd.read_csv(db_path, header=None, sep='\t', usecols=[4])[4] 40 | terms = set() 41 | for r in tqdm(db): 42 | r = r.split(',') 43 | if isinstance(r, str): 44 | r = [r] 45 | for s_r in r: 46 | terms.update(s_r.split('|')) 47 | terms = sorted(terms) 48 | else: 49 | raise ValueError('db {db} of task {task} has no defined terms'.format(db=db_name, task=task)) 50 | for term in terms: 51 | df.append([db_name, term]) 52 | df = pd.DataFrame(df, columns=['db_name', 'term']) 53 | 54 | # Write 55 | df.to_csv(path_out, index=False) 56 | -------------------------------------------------------------------------------- /workflow/scripts/anl/dts/qcstats.py: -------------------------------------------------------------------------------- 1 | import mudata as mu 2 | import pandas as pd 3 | import numpy as np 4 | import scanpy as sc 5 | import sys 6 | 7 | 8 | mdata = mu.read(sys.argv[1]) 9 | 10 | 11 | def get_qc_omic(mdata, omic): 12 | adata = mdata.mod[omic] 13 | adata.X = adata.layers['counts'] 14 | obs, _ = sc.pp.calculate_qc_metrics( 15 | adata, percent_top=None, log1p=True 16 | ) 17 | qc = obs.assign(omic=omic) 18 | qc = pd.merge(qc.reset_index(names='barcode'), mdata.obs.reset_index(names='barcode')[['barcode', 'celltype']], on=['barcode'], how='inner') 19 | return qc 20 | 21 | 22 | def extract_n_cells(mdata): 23 | return mdata.obs.groupby('celltype', as_index=False).size().sort_values('celltype') 24 | 25 | 26 | # Compute qc 27 | omics = ['rna', 'atac'] 28 | n_ctps = extract_n_cells(mdata) 29 | qc = [] 30 | for omic in omics: 31 | qc.append(get_qc_omic(mdata, omic)) 32 | qc = pd.concat(qc) 33 | 34 | # Write 35 | qc.to_csv(sys.argv[2], index=False) 36 | n_ctps.to_csv(sys.argv[3], index=False) 37 | -------------------------------------------------------------------------------- /workflow/scripts/anl/metrics/aggregate.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os 4 | import argparse 5 | 6 | 7 | # Init args 8 | parser = argparse.ArgumentParser() 9 | 
parser.add_argument('-i','--path_input', required=True, nargs='+') 10 | parser.add_argument('-a','--add_info', required=False, action="store_true") 11 | parser.add_argument('-o','--path_out', required=True) 12 | args = vars(parser.parse_args()) 13 | 14 | df_paths = args['path_input'] 15 | add_info = args['add_info'] 16 | path_out = args['path_out'] 17 | 18 | df = [] 19 | for df_path in df_paths: 20 | tmp = pd.read_csv(df_path) 21 | if add_info: 22 | dts = os.path.basename(os.path.dirname(df_path)) 23 | db = os.path.basename(os.path.dirname(os.path.dirname(df_path))) 24 | task = os.path.basename(os.path.dirname(os.path.dirname(os.path.dirname(df_path)))) 25 | metric = os.path.basename(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(df_path))))) 26 | tmp[['metric', 'task', 'db', 'dts']] = [metric, task, db, dts] 27 | tmp = tmp[['metric', 'task', 'db', 'dts', 'name', 'prc', 'rcl', 'f01']] 28 | df.append(tmp) 29 | df = pd.concat(df) 30 | 31 | # Write 32 | df.to_csv(path_out, index=False) -------------------------------------------------------------------------------- /workflow/scripts/anl/metrics/mech/tfa.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import decoupler as dc 4 | import mudata as mu 5 | import sys 6 | import os 7 | import re 8 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 9 | from utils import load_cats, f_beta_score 10 | import argparse 11 | 12 | 13 | # Init args 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('-i','--grn_path', required=True) 16 | parser.add_argument('-b','--bnc_path', required=True) 17 | parser.add_argument('-o','--out_path', required=True) 18 | args = vars(parser.parse_args()) 19 | 20 | grn_path = args['grn_path'] 21 | bnc_path = args['bnc_path'] 22 | out_path = args['out_path'] 23 | 24 | # Extract names and path 25 | grn_name = os.path.basename(grn_path).replace('.grn.csv', '') 26 | data_path = os.path.join(os.path.dirname(os.path.dirname(grn_path)), 'mdata.h5mu') 27 | dataset = os.path.basename(os.path.dirname(os.path.dirname(os.path.dirname(data_path)))) 28 | case = os.path.basename(os.path.dirname(data_path)) 29 | rsc_name = os.path.basename(bnc_path) 30 | 31 | # Read GRN 32 | grn = pd.read_csv(grn_path) 33 | grn = grn.drop_duplicates(['source', 'target'], keep='first') 34 | 35 | if grn.shape[0] > 0: 36 | # Read dataset 37 | rna = mu.read(os.path.join(data_path, 'mod', 'rna')) 38 | 39 | # Read benchmark data 40 | mat = pd.read_csv(os.path.join(bnc_path, 'diff.csv'), index_col=0) 41 | obs = pd.read_csv(os.path.join(bnc_path, 'meta.csv'), index_col=0) 42 | 43 | # Subset bench data to dataset 44 | cats = load_cats(dataset, case) 45 | cats = [re.escape(c) for c in cats[rsc_name]] 46 | msk = obs['Tissue.Type'].isin(cats) & obs['TF'].isin(rna.var_names) & (obs['logFC'] < -0.5) 47 | obs = obs.loc[msk, :] 48 | mat = mat.loc[msk, :] 49 | 50 | # Compute TF activities 51 | acts = [] 52 | pvals = [] 53 | for dataset in obs.index: 54 | tf = obs.loc[dataset, 'TF'] 55 | tf_mat = mat.loc[[dataset], :] 56 | tf_grn = grn[grn['source'] == tf] 57 | try: 58 | act, pval = dc.run_ulm( 59 | mat=tf_mat, 60 | net=tf_grn, 61 | weight='score', 62 | min_n=3, 63 | ) 64 | act, pval = act.values[0, 0], pval.values[0, 0] 65 | acts.append(act) 66 | pvals.append(pval) 67 | except: 68 | pass 69 | 70 | # Compute recall 71 | acts = np.array(acts) 72 | pvals = np.array(pvals) 73 | padj = dc.p_adjust_fdr(pvals) 74 | tp = np.sum((acts < 
0) & (padj < 0.05)) 75 | if tp > 0: 76 | prc = tp / acts.size 77 | rcl = tp / obs.shape[0] 78 | f01 = f_beta_score(prc, rcl) 79 | else: 80 | prc, rcl, f01 = 0., 0., 0. 81 | 82 | df = pd.DataFrame([[grn_name, prc, rcl, f01]], columns=['name', 'prc', 'rcl', 'f01']) 83 | else: 84 | df = pd.DataFrame([[grn_name, np.nan, np.nan, np.nan]], columns=['name', 'prc', 'rcl', 'f01']) 85 | 86 | # Write 87 | df.to_csv(out_path, index=False) 88 | -------------------------------------------------------------------------------- /workflow/scripts/anl/metrics/mech/tfm.py: -------------------------------------------------------------------------------- 1 | import scanpy as sc 2 | import pandas as pd 3 | import numpy as np 4 | import mudata as mu 5 | import os 6 | import h5py 7 | import sys 8 | 9 | 10 | path_mdata = sys.argv[1] 11 | path_tfs = sys.argv[2] 12 | path_out = sys.argv[3] 13 | 14 | # Read 15 | tfs = pd.read_csv(path_tfs, header=None)[0].values 16 | rna = mu.read(os.path.join(path_mdata, 'mod', 'rna')) 17 | 18 | # Filter and update 19 | inter = rna.var_names.intersection(tfs) 20 | rna = rna[:, inter].copy() 21 | rna.obs = mu.read(path_mdata).obs.loc[:, ['celltype']].copy() 22 | 23 | # Extract DEG tfs 24 | sc.tl.rank_genes_groups(rna, groupby='celltype', method='wilcoxon') 25 | df = sc.get.rank_genes_groups_df(rna, group=None) 26 | 27 | # Filter results 28 | df = df[(df['pvals_adj'] < 2.22e-16) & (df['logfoldchanges'] > 2.)] 29 | n_group = df.groupby('group', as_index=False).size() 30 | n_group = n_group[n_group['size'] >= 1] 31 | groups = n_group['group'].values 32 | df['group'] = df['group'].astype(str) 33 | df = df[df['group'].isin(groups)] 34 | df = df[['group', 'names']] 35 | df.columns = ['celltype', 'tf'] 36 | 37 | # Write 38 | df.to_csv(path_out, index=False) 39 | -------------------------------------------------------------------------------- /workflow/scripts/anl/metrics/pred/gsets.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import mudata as mu 4 | import decoupler as dc 5 | import argparse 6 | import sys 7 | import os 8 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 9 | from utils import f_beta_score 10 | 11 | 12 | # Init args 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('-i','--grn_path', required=True) 15 | parser.add_argument('-p','--ptw_path', required=True) 16 | parser.add_argument('-o','--out_path', required=True) 17 | args = vars(parser.parse_args()) 18 | 19 | grn_path = args['grn_path'] 20 | ptw_path = args['ptw_path'] 21 | out_path = args['out_path'] 22 | 23 | grn_name = os.path.basename(grn_path).replace('.grn.csv', '') 24 | data_path = os.path.join(os.path.dirname(os.path.dirname(grn_path)), 'mdata.h5mu') 25 | 26 | grn = pd.read_csv(grn_path) 27 | 28 | def get_sig_pws(grn, db, thr_pval): 29 | sig_pws = set() 30 | for tf in grn['source'].unique(): 31 | df = grn[grn['source'] == tf].set_index('target') 32 | pws = dc.get_ora_df( 33 | df=df, 34 | net=db, 35 | ) 36 | sig_pws.update(pws[pws['FDR p-value'] < thr_pval]['Term']) 37 | sig_pws = np.array(list(sig_pws)) 38 | return sig_pws 39 | 40 | 41 | def eval_metrics(y_pred, y): 42 | tp = np.intersect1d(y_pred, y).size 43 | if tp > 0.: 44 | fp = np.setdiff1d(y_pred, y).size 45 | fn = np.setdiff1d(y, y_pred).size 46 | prc = tp / (tp + fp) 47 | rcl = tp / (tp + fn) 48 | f1 = f_beta_score(prc, rcl) 49 | else: 50 | prc, rcl, f1 = 0., 0., 0. 
51 | return prc, rcl, f1 52 | 53 | 54 | def eval_grn(data, grn, db, thr_pval=0.01, thr_prop=0.2): 55 | hits = get_pw_hits(data, thr_pval, thr_prop) 56 | sig_pws = get_sig_pws(grn, db, thr_pval) 57 | prc, rcl, f1 = eval_metrics(y_pred=sig_pws, y=hits) 58 | return prc, rcl, f1 59 | 60 | 61 | def get_pw_hits(data, thr_pval, thr_prop): 62 | pvals = data.obsm['ulm_pvals'].copy() 63 | pvals.loc[:, :] = dc.p_adjust_fdr(pvals.values.ravel()).reshape(pvals.shape) 64 | acts = data.obsm['ulm_estimate'].copy() 65 | hits = ((pvals < thr_pval) & (acts > 0)).sum(0).sort_values(ascending=False) / pvals.shape[0] 66 | hits = hits[hits > thr_prop].index.values.astype('U') 67 | return hits 68 | 69 | 70 | if grn.shape[0] > 0: 71 | ptw = pd.read_csv(ptw_path) 72 | rna = mu.read(os.path.join(data_path, 'mod', 'rna')) 73 | # Infer pathway activities 74 | dc.run_ulm( 75 | mat=rna, 76 | net=ptw, 77 | weight=None, 78 | use_raw=False, 79 | verbose=True 80 | ) 81 | prc, rcl, f01 = eval_grn(rna, grn, ptw, thr_pval=0.01, thr_prop=0.2) 82 | df = pd.DataFrame([[grn_name, prc, rcl, f01]], columns=['name', 'prc', 'rcl', 'f01']) 83 | else: 84 | df = pd.DataFrame([[grn_name, np.nan, np.nan, np.nan]], columns=['name', 'prc', 'rcl', 'f01']) 85 | 86 | # Write 87 | df.to_csv(out_path, index=False) 88 | -------------------------------------------------------------------------------- /workflow/scripts/anl/metrics/prior/tfm.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import mudata as mu 4 | from tqdm import tqdm 5 | import sys 6 | import os 7 | import re 8 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 9 | from utils import load_cats, f_beta_score 10 | import argparse 11 | 12 | 13 | # Init args 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('-a','--grn_path', required=True) 16 | parser.add_argument('-b','--resource_path', required=True) 17 | parser.add_argument('-f','--out_path', required=True) 18 | args = vars(parser.parse_args()) 19 | 20 | grn_path = args['grn_path'] 21 | resource_path = args['resource_path'] 22 | out_path = args['out_path'] 23 | 24 | 25 | grn_name = os.path.basename(grn_path).replace('.grn.csv', '') 26 | data_path = os.path.join(os.path.dirname(os.path.dirname(grn_path)), 'mdata.h5mu') 27 | dataset = os.path.basename(os.path.dirname(os.path.dirname(os.path.dirname(data_path)))) 28 | case = os.path.basename(os.path.dirname(data_path)) 29 | resource_name = os.path.basename(resource_path).replace('.csv', '') 30 | 31 | # Read grn 32 | grn = pd.read_csv(grn_path) 33 | 34 | if grn.shape[0] > 0: 35 | # Read resource and filter by cats 36 | db = pd.read_csv(resource_path, header=None, sep='\t') 37 | db.columns = ['gene', 'ctype'] 38 | cats = load_cats(dataset, case) 39 | if resource_name in cats: 40 | cats = [re.escape(c) for c in cats[resource_name]] 41 | print('Filtering for {0} cats'.format(len(cats))) 42 | db = db[db['ctype'].str.contains('|'.join(cats))] 43 | 44 | # Filter resource by measured genes 45 | genes = mu.read(os.path.join(data_path, 'mod', 'rna')).var_names.astype('U') 46 | db = db[db['gene'].astype('U').isin(genes)] 47 | 48 | # Compute evaluation 49 | y_pred = grn['source'].unique().astype('U') 50 | y = db['gene'].unique().astype('U') 51 | tp = np.intersect1d(y_pred, y).size 52 | if tp > 0.: 53 | fp = np.setdiff1d(y_pred, y).size 54 | fn = np.setdiff1d(y, y_pred).size 55 | prc = tp / (tp + fp) 56 | rcl = tp / (tp + fn) 57 | f01 = f_beta_score(prc, rcl) 58 | 
else: 59 | prc, rcl, f01 = 0., 0., 0., 60 | df = pd.DataFrame([[grn_name, prc, rcl, f01]], columns=['name', 'prc', 'rcl', 'f01']) 61 | else: 62 | df = pd.DataFrame([[grn_name, np.nan, np.nan, np.nan]], columns=['name', 'prc', 'rcl', 'f01']) 63 | 64 | # Write 65 | df.to_csv(out_path, index=False) 66 | -------------------------------------------------------------------------------- /workflow/scripts/anl/metrics/prior/tfp.py: -------------------------------------------------------------------------------- 1 | from itertools import combinations 2 | import scipy.stats as ss 3 | import numpy as np 4 | import pandas as pd 5 | import sys 6 | import os 7 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 8 | from utils import f_beta_score 9 | 10 | 11 | def compute_pval(tf_a, tf_b, grn): 12 | trg_a = set(grn[grn['source'] == tf_a]['target']) 13 | trg_b = set(grn[grn['source'] == tf_b]['target']) 14 | total = set(grn['target']) 15 | a = len(trg_a & trg_b) 16 | if a > 0: 17 | b = len(trg_a - trg_b) 18 | c = len(trg_b - trg_a) 19 | d = len(total - (trg_a | trg_b)) 20 | s, p = ss.fisher_exact([[a, b], [c, d]], alternative='greater') 21 | else: 22 | s, p = 0, np.nan 23 | return s, p 24 | 25 | 26 | def find_pairs(grn, thr_pval): 27 | df = [] 28 | for tf_a, tf_b in combinations(grn['source'].unique(), r=2): 29 | s, p = compute_pval(tf_a, tf_b, grn) 30 | df.append([tf_a, tf_b, s, p]) 31 | df = pd.DataFrame(df, columns=['tf_a', 'tf_b', 'stat', 'pval']).dropna() 32 | if df.shape[0] > 0: 33 | df['padj'] = ss.false_discovery_control(df['pval'], method='bh') 34 | df = df[df['padj'] < thr_pval] 35 | pairs = set(['|'.join(sorted([a, b])) for a, b in zip(df['tf_a'], df['tf_b'])]) 36 | else: 37 | pairs = set() 38 | return pairs 39 | 40 | 41 | # Read 42 | grn = pd.read_csv(sys.argv[1]).drop_duplicates(['source', 'target']) 43 | tfp = pd.read_csv(sys.argv[2], sep='\t', header=None) 44 | 45 | # Process 46 | tfs = set(tfp[0]) | set(tfp[1]) 47 | grn = grn[grn['source'].isin(tfs)] 48 | tfp = set(['|'.join(sorted([a, b])) for a, b in zip(tfp[0], tfp[1])]) 49 | grn_name = os.path.basename(sys.argv[1]).replace('.grn.csv', '') 50 | 51 | if grn.shape[0] > 1: # Need at least 2 TFs in grn 52 | # Find pairs 53 | p_grn = find_pairs(grn, thr_pval=float(sys.argv[3])) 54 | 55 | # Compute F score 56 | tp = len(p_grn & tfp) 57 | if tp > 0: 58 | fp = len(p_grn - tfp) 59 | fn = len(tfp - p_grn) 60 | rcl = tp / (tp + fn) 61 | prc = tp / (tp + fp) 62 | f01 = f_beta_score(prc, rcl) 63 | else: 64 | prc, rcl, f01 = 0., 0., 0. 
65 | df = pd.DataFrame([[grn_name, prc, rcl, f01]], columns=['name', 'prc', 'rcl', 'f01']) 66 | else: 67 | df = pd.DataFrame([[grn_name, np.nan, np.nan, np.nan]], columns=['name', 'prc', 'rcl', 'f01']) 68 | 69 | # Write 70 | df.to_csv(sys.argv[4], index=False) 71 | -------------------------------------------------------------------------------- /workflow/scripts/anl/metrics/test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import argparse 4 | import os 5 | 6 | 7 | def read_eval(m_path): 8 | db_name = os.path.basename(os.path.dirname(m_path)) 9 | task = os.path.basename(os.path.dirname(os.path.dirname(m_path))) 10 | metric = os.path.basename(os.path.dirname(os.path.dirname(os.path.dirname(m_path)))) 11 | case = os.path.basename(m_path).replace('.scores.csv', '') 12 | df = pd.read_csv('anl/metrics/{0}/{1}/{2}/{3}.scores.csv'.format(metric, task, db_name, case)).sort_values('f01', ascending=False) 13 | df[['pre', 'p2g', 'tfb', 'mdl']] = df['name'].str.split('.', n=4, expand=True) 14 | df = df[~df['pre'].str.startswith('o_')] 15 | df = df.reset_index(drop=True).reset_index(names='rank') 16 | df['fixed'] = [np.unique(n.split('.')).size == 1 for n in df['name']] 17 | return metric, task, db_name, case, df 18 | 19 | 20 | def test_rank(df): 21 | import decoupler as dc 22 | steps = ['pre', 'p2g', 'tfb', 'mdl'] 23 | mthds = df['pre'].unique() 24 | net = [] 25 | sts = [] 26 | for step in steps: 27 | sts.append(df.groupby([step], as_index=False)['f01'].mean().rename(columns={step: 'name'}).assign(stp=step)) 28 | for mth in mthds: 29 | for name in df[df[step] == mth]['name']: 30 | net.append(['{0}.{1}'.format(step, mth), name]) 31 | net = pd.DataFrame(net, columns=['source', 'target']) 32 | sts = pd.concat(sts) 33 | res = dc.get_gsea_df( 34 | df=df.dropna().set_index('name'), 35 | stat='f01', 36 | net=net, 37 | times=1000 38 | ) 39 | res['padj'] = np.where(res['ES'] > 0, res['FDR p-value'], 1) 40 | res[['stp', 'name']] = res['Term'].str.split('.', n=2, expand=True) 41 | res = res[['stp', 'name', 'padj']] 42 | res = pd.merge(res, sts, how='left', on=['stp', 'name']) 43 | return res 44 | 45 | 46 | parser = argparse.ArgumentParser() 47 | parser.add_argument('-m', '--path_mtr', nargs='+', required=True) 48 | parser.add_argument('-o', '--path_out', required=True) 49 | args = parser.parse_args() 50 | 51 | # Test each metric-database 52 | df = [] 53 | for m_path in args.path_mtr: 54 | metric, task, db_name, case, m_df = read_eval(m_path) 55 | m_df = test_rank(m_df) 56 | m_df[['metric', 'task', 'db', 'case']] = metric, task, db_name, case 57 | df.append(m_df) 58 | df = pd.concat(df) 59 | df = df[['metric', 'task', 'db', 'stp', 'name', 'case', 'padj', 'f01']] 60 | df = df.sort_values(['metric', 'task', 'db', 'stp', 'name']) 61 | 62 | # Write 63 | df.to_csv(args.path_out, index=False) 64 | -------------------------------------------------------------------------------- /workflow/scripts/anl/metrics/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | def load_cats(dataset, case): 5 | with open('config/prior_cats.json') as f: 6 | cats = json.load(f) 7 | if (dataset == 'pbmc10k'): 8 | for i in range(4): 9 | cats[dataset][str(i)] = cats[dataset]['all'].copy() 10 | cats = cats[dataset][case] 11 | return cats 12 | 13 | def f_beta_score(prc, rcl, beta=0.1): 14 | if prc + rcl == 0: 15 | return 0 16 | return (1 + beta**2) * (prc * rcl) / ((prc * beta**2) + rcl) 17 | 
-------------------------------------------------------------------------------- /workflow/scripts/anl/pair/pairsim.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import yaml 4 | import sys 5 | import os 6 | from tqdm import tqdm 7 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 8 | from utils import ( 9 | ocoeff, 10 | ) 11 | import glob 12 | import argparse 13 | 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('-a','--path_a', required=True) 17 | parser.add_argument('-b','--path_b', required=True) 18 | parser.add_argument('-o','--path_out', required=True) 19 | args = vars(parser.parse_args()) 20 | 21 | path_a = args['path_a'] 22 | path_b = args['path_b'] 23 | path_out = args['path_out'] 24 | 25 | # Find paths 26 | dname_a, case_a = os.path.basename(path_a).split('.')[:2] 27 | dname_b, case_b = os.path.basename(path_b).split('.')[:2] 28 | dname_a = dname_a.replace('pair', '') 29 | dname_b = dname_b.replace('pair', '') 30 | path_pair = sorted(glob.glob(f'dts/{dname_a}pair/cases/{case_b}/runs/*.grn.csv')) 31 | path_npair = sorted(glob.glob(f'dts/{dname_b}pair/cases/{case_b}/runs/*.grn.csv')) 32 | 33 | # Compute ocoef 34 | df = [] 35 | for i in tqdm(range(len(path_pair))): 36 | p_path, n_path = path_pair[i], path_npair[i] 37 | assert os.path.basename(p_path) == os.path.basename(n_path) 38 | p_grn, n_grn = pd.read_csv(p_path), pd.read_csv(n_path) 39 | val = ocoeff(p_grn, n_grn, on=['source', 'target']) 40 | df.append([os.path.basename(p_path).replace('.grn.csv', ''), val]) 41 | df = pd.DataFrame(df, columns=['mth', 'ocoef']) 42 | 43 | # Write 44 | df.to_csv(path_out, index=False) 45 | -------------------------------------------------------------------------------- /workflow/scripts/anl/pair/realqc.py: -------------------------------------------------------------------------------- 1 | import mudata as mu 2 | import pandas as pd 3 | import numpy as np 4 | import scanpy as sc 5 | import sys 6 | 7 | 8 | pmdata = mu.read(sys.argv[1]) 9 | nmdata = mu.read(sys.argv[2]) 10 | 11 | 12 | def get_qc_omic(mdata, omic, tpe): 13 | adata = mdata.mod[omic] 14 | adata.X = adata.layers['counts'] 15 | obs, _ = sc.pp.calculate_qc_metrics( 16 | adata, percent_top=None, log1p=True 17 | ) 18 | qc = obs.assign(omic=omic, type=t) 19 | qc = pd.merge(qc.reset_index(names='barcode'), mdata.obs.reset_index(names='barcode')[['barcode', 'celltype']], on=['barcode'], how='inner') 20 | return qc 21 | 22 | 23 | def extract_n_cells(mdata, tpe): 24 | return mdata.obs.groupby('celltype', as_index=False).size().sort_values('celltype').assign(type=tpe) 25 | 26 | 27 | # Compute qc 28 | types = ['paired', 'upaired'] 29 | omics = ['rna', 'atac'] 30 | qc = [] 31 | n_ctps = [] 32 | for mdata, t in zip([pmdata, nmdata], types): 33 | n_ctps.append(extract_n_cells(mdata, t)) 34 | for omic in omics: 35 | qc.append(get_qc_omic(mdata, omic, t)) 36 | qc = pd.concat(qc) 37 | n_ctps = pd.concat(n_ctps) 38 | 39 | # Write 40 | qc.to_csv(sys.argv[3], index=False) 41 | n_ctps.to_csv(sys.argv[4], index=False) 42 | -------------------------------------------------------------------------------- /workflow/scripts/anl/stab/ovsd.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import scipy.stats as ss 3 | import sys 4 | import os 5 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 6 | from utils import 
read_config, ocoeff 7 | 8 | 9 | # Extract dat and case 10 | dat, case = os.path.basename(sys.argv[1]).split('.')[:2] 11 | 12 | # Read config 13 | config = read_config() 14 | palette = config['colors']['nets'] 15 | mthds = list(config['methods'].keys()) 16 | baselines = config['baselines'] 17 | 18 | # Compute ocoeff and pearson 19 | df = [] 20 | for mth in mthds: 21 | ref = pd.read_csv(f'dts/{dat}/cases/{case}/runs/o_{mth}.o_{mth}.o_{mth}.o_{mth}.grn.csv') 22 | net = pd.read_csv(f'dts/{dat}/cases/{case}/runs/{mth}.{mth}.{mth}.{mth}.grn.csv') 23 | inter = pd.merge(ref, net, on=['source', 'target'], how='inner') 24 | s, p = ss.pearsonr(inter['score_x'], inter['score_y']) 25 | df.append([mth, ocoeff(ref, net, on=['source', 'target']), s, p]) 26 | df = pd.DataFrame(df, columns=['mth', 'ocoeff', 'stat', 'pval']) 27 | df['padj'] = ss.false_discovery_control(df['pval']) 28 | 29 | # Write 30 | df.to_csv(sys.argv[2], index=False) 31 | -------------------------------------------------------------------------------- /workflow/scripts/anl/stab/seeds.py: -------------------------------------------------------------------------------- 1 | import scipy.stats as sts 2 | import pandas as pd 3 | import numpy as np 4 | import sys 5 | import os 6 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 7 | from utils import read_config 8 | 9 | 10 | # Read config 11 | config = read_config() 12 | palette = config['colors']['nets'] 13 | mthds = list(config['methods'].keys()) 14 | baselines = config['baselines'] 15 | 16 | path_df = sys.argv[1] 17 | dname = os.path.basename(path_df).split('.')[0] 18 | df = pd.read_csv(path_df) 19 | mthds = df[df['cat'] == 'full'].groupby('mth', as_index=False)['e_ocoeff'].mean() 20 | mthds = mthds[mthds['e_ocoeff'] < 1.]['mth'].values 21 | 22 | # Find inter across seeds 23 | seeds = [0, 1, 2] 24 | dfs = [] 25 | for mth in mthds: 26 | if mth not in baselines: 27 | mth = 'o_' + mth 28 | df = [] 29 | for i, seed_a in enumerate(seeds): 30 | seed_a = str(seed_a) 31 | path_a = f'dts/{dname}/cases/16384_16384_{seed_a}/runs/{mth}.{mth}.{mth}.{mth}.grn.csv' 32 | grn_a = pd.read_csv(path_a)[['source', 'target', 'score']] 33 | for seed_b in seeds[i + 1:]: 34 | path_b = f'dts/{dname}/cases/16384_16384_{seed_b}/runs/{mth}.{mth}.{mth}.{mth}.grn.csv' 35 | grn_b = pd.read_csv(path_b)[['source', 'target', 'score']] 36 | df.append(pd.merge(grn_a, grn_b, how='inner', on=['source', 'target']).assign(comp=f'{seed_a}_{seed_b}')) 37 | mth = mth.replace('o_', '') 38 | df = pd.concat(df) 39 | if df.shape[0] > 1: 40 | df.insert(0, 'mth', mth) 41 | else: 42 | df.loc[0, :] = [np.nan for c in df.columns] 43 | df['mth'] = mth 44 | dfs.append(df) 45 | df = pd.concat(dfs) 46 | 47 | # Cors 48 | pairs = ['0_1', '0_2', '1_2'] 49 | cors = [] 50 | for mth in df['mth'].unique(): 51 | tmp = df[df['mth'] == mth] 52 | for pair in pairs: 53 | comp = tmp[tmp['comp'] == pair] 54 | if comp.shape[0] > 1: 55 | r, p = sts.pearsonr(comp['score_x'], comp['score_y']) 56 | else: 57 | r, p = np.nan, 1 58 | cors.append([mth, r, p, pair]) 59 | cors = pd.DataFrame(cors, columns=['mth', 'stat', 'pval', 'comp']) 60 | cors['padj'] = sts.false_discovery_control(cors['pval']) 61 | 62 | # Write 63 | df.to_csv(sys.argv[2], index=False) 64 | cors.to_csv(sys.argv[3], index=False) 65 | 66 | -------------------------------------------------------------------------------- /workflow/scripts/anl/topo/fvsd.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import 
numpy as np 3 | import sys 4 | import os 5 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 6 | from utils import read_config 7 | 8 | 9 | def fixed_pip(mthds, sts, mat, title): 10 | res = [] 11 | steps = ['pre', 'c2g', 'tfb', 'mdl'] 12 | for mth in mthds: 13 | # Extract steps 14 | msk_mth = (sts['pre'] == mth) & (sts['c2g'] == mth) & (sts['tfb'] == mth) & (sts['mdl'] == mth) 15 | msk_pre = (sts['pre'] != mth) & (sts['c2g'] == mth) & (sts['tfb'] == mth) & (sts['mdl'] == mth) 16 | msk_c2g = (sts['pre'] == mth) & (sts['c2g'] != mth) & (sts['tfb'] == mth) & (sts['mdl'] == mth) 17 | msk_tfb = (sts['pre'] == mth) & (sts['c2g'] == mth) & (sts['tfb'] != mth) & (sts['mdl'] == mth) 18 | msk_mdl = (sts['pre'] == mth) & (sts['c2g'] == mth) & (sts['tfb'] == mth) & (sts['mdl'] != mth) 19 | 20 | # Build df 21 | df = pd.concat([ 22 | mat.loc[sts[msk_pre].index, sts[msk_mth].index].assign(step=0), 23 | mat.loc[sts[msk_c2g].index, sts[msk_mth].index].assign(step=1), 24 | mat.loc[sts[msk_tfb].index, sts[msk_mth].index].assign(step=2), 25 | mat.loc[sts[msk_mdl].index, sts[msk_mth].index].assign(step=3), 26 | ]).reset_index().rename(columns={'{m}.{m}.{m}.{m}'.format(m=mth): 'ocoeff', 'name_a': 'rest'}) 27 | 28 | # Format df 29 | df['rest'] = [n.split('.')[i] for n,i in zip(df['rest'], df['step'])] 30 | df['step'] = [steps[i] for i in df['step']] 31 | df['mth'] = mth 32 | df = df[['mth', 'step', 'rest', 'ocoeff']] 33 | res.append(df) 34 | res = pd.concat(res) 35 | res = res.rename(columns={'ocoeff': title}) 36 | return res 37 | 38 | 39 | # Read 40 | sim = pd.read_csv(sys.argv[1]) 41 | sts = pd.read_csv(sys.argv[2]) 42 | config = read_config() 43 | mthds = list(config['methods'].keys()) 44 | 45 | # Remove original runs and baselines 46 | sim = sim[~(sim['name_a'].str.startswith('o_') | sim['name_b'].str.startswith('o_'))] 47 | sim = sim[(sim['name_a'].str.split('.', expand=True)[0].isin(mthds) & sim['name_b'].str.split('.', expand=True)[0].isin(mthds))] 48 | 49 | # Find ocoeffs for fixed vs one step change 50 | df = None 51 | for oc in ['tf_oc', 'edge_oc', 'target_oc']: 52 | mat = sim.dropna().pivot(index='name_a', columns='name_b', values=oc).fillna(0) 53 | mat = mat + mat.T 54 | np.fill_diagonal(mat.values, 1) 55 | t_sts = sts.set_index('name').loc[mat.index].rename(columns={'p2g': 'c2g'}) 56 | t_sts[['pre', 'c2g', 'tfb', 'mdl']] = t_sts.reset_index()['name_a'].str.split('.', n=4, expand=True).values 57 | if df is None: 58 | df = fixed_pip(mthds, t_sts, mat, title=oc) 59 | else: 60 | df = pd.merge(df, fixed_pip(mthds, t_sts, mat, title=oc), on=['mth', 'step', 'rest']) 61 | 62 | # Write 63 | df.to_csv(sys.argv[3], index=False) 64 | -------------------------------------------------------------------------------- /workflow/scripts/anl/topo/inter.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import argparse 4 | 5 | 6 | # Init args 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('-g','--paths_grns', required=True, nargs='+') 9 | parser.add_argument('-b','--baselines', required=True, nargs='+') 10 | parser.add_argument('-p','--min_prop', required=True, type=float) 11 | parser.add_argument('-o','--path_out', required=True) 12 | args = parser.parse_args() 13 | 14 | grns = [] 15 | blns = [] 16 | for grn_path in args.paths_grns: 17 | name = grn_path.split('.')[-3] 18 | if name.startswith('o_') and (name not in args.baselines): 19 | grn = 
pd.read_csv(grn_path).drop_duplicates(['source', 'target']) 20 | grn['name'] = name.replace('o_', '') 21 | grns.append(grn) 22 | elif name in args.baselines: 23 | grn = pd.read_csv(grn_path).drop_duplicates(['source', 'target']).drop(columns='cre') 24 | grn['name'] = name 25 | blns.append(grn) 26 | 27 | min_n = np.floor(args.min_prop * len(grns)) 28 | grns = pd.concat(grns) 29 | blns = pd.concat(blns) 30 | shared = grns.groupby(['source', 'target'], as_index=False).size().sort_values('size', ascending=False) 31 | shared = shared[shared['size'] > min_n] 32 | 33 | 34 | shared_grn = ( 35 | pd.merge(grns, shared, how='inner', on=['source', 'target']) 36 | .sort_values(['name', 'source', 'target', 'pval']) 37 | [['name', 'source', 'target', 'score']] 38 | ) 39 | nodes = set(shared_grn['source']) | set(shared_grn['target']) 40 | msk = blns['source'].isin(nodes) & blns['target'].isin(nodes) 41 | blns = blns.loc[msk, :] 42 | 43 | shared_grn = pd.concat([ 44 | shared_grn.assign(type='mth'), 45 | blns.assign(type='bsl') 46 | ]) 47 | 48 | # Write 49 | shared_grn.to_csv(args.path_out, index=False) 50 | -------------------------------------------------------------------------------- /workflow/scripts/anl/topo/run_pair_sim.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | import pandas as pd 3 | import numpy as np 4 | import os 5 | import glob 6 | from tqdm import tqdm 7 | from functools import partial 8 | import sys 9 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 10 | from utils import ( 11 | ocoeff, 12 | get_grn_name, 13 | get_grn_stats 14 | ) 15 | import argparse 16 | 17 | 18 | # Init args 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('-t','--stat_path', required=True) 21 | parser.add_argument('-s','--sim_path', required=True) 22 | args = vars(parser.parse_args()) 23 | 24 | stat_path = args['stat_path'] 25 | sim_path = args['sim_path'] 26 | 27 | dat, case = os.path.basename(stat_path).split('.')[:2] 28 | paths = glob.glob(os.path.join('dts', dat, 'cases', case, 'runs', '*.grn.csv')) 29 | 30 | print('Reading and computing grns stats...') 31 | names = [] 32 | dfs = [] 33 | stats = [] 34 | tfs = [] 35 | edges = [] 36 | genes = [] 37 | 38 | for path in tqdm(paths): 39 | name = get_grn_name(path) 40 | names.append(name) 41 | df = pd.read_csv(path).drop_duplicates(['source', 'target'], keep='first') 42 | stat = get_grn_stats(df) 43 | stats.append([name] + list(stat)) 44 | tfs.append(set(df['source'])) 45 | edges.append(set(df['source'] + '|' + df['target'])) 46 | genes.append(set(df['target'])) 47 | 48 | 49 | # Store as df 50 | cols = ['name', 'n_tfs', 'n_edges', 'n_targets', 'odegree', 'betweenc', 'eigv'] 51 | stats = pd.DataFrame(stats, columns=cols) 52 | 53 | print('Computing pairwise overlap coefficients...') 54 | 55 | 56 | def set_ocoef(a, b): 57 | min_s = min(len(a), len(b)) 58 | if min_s == 0: 59 | return np.nan 60 | else: 61 | inter = len(a & b) 62 | return inter / min_s 63 | 64 | 65 | names_a = [] 66 | names_b = [] 67 | tf_coefs = [] 68 | edge_coefs = [] 69 | target_coefs = [] 70 | for i in tqdm(range(len(names))): 71 | name_a = names[i] 72 | tf_a = tfs[i] 73 | ed_a = edges[i] 74 | gn_a = genes[i] 75 | for j in range(i, len(names)): 76 | name_b = names[j] 77 | tf_b = tfs[j] 78 | ed_b = edges[j] 79 | gn_b = genes[j] 80 | names_a.append(name_a) 81 | names_b.append(name_b) 82 | tf_coefs.append(set_ocoef(tf_a, tf_b)) 83 | edge_coefs.append(set_ocoef(ed_a, ed_b)) 84 | 
target_coefs.append(set_ocoef(gn_a, gn_b)) 85 | 86 | 87 | # Store as df 88 | sims = pd.DataFrame() 89 | sims['name_a'] = names_a 90 | sims['name_b'] = names_b 91 | sims['tf_oc'] = tf_coefs 92 | sims['edge_oc'] = edge_coefs 93 | sims['target_oc'] = target_coefs 94 | 95 | # Write 96 | stats.to_csv(stat_path, index=False) 97 | sims.to_csv(sim_path, index=False) 98 | -------------------------------------------------------------------------------- /workflow/scripts/anl/tss/dist.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pyranges as pr 3 | from tqdm import tqdm 4 | import os 5 | import glob 6 | import argparse 7 | 8 | 9 | # Parse args 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('-g', '--path_cmp', required=True) 12 | parser.add_argument('-b', '--baselines', required=True, nargs='+') 13 | parser.add_argument('-o', '--path_out', required=True) 14 | args = parser.parse_args() 15 | path_cmp = args.path_cmp 16 | baselines = args.baselines 17 | path_out = args.path_out 18 | 19 | # Set variables 20 | dname, case = os.path.basename(path_cmp).split('.')[:2] 21 | path_grns = glob.glob(os.path.join('dts', dname, 'cases', case, 'runs', '*.grn.csv')) 22 | def compute_dist_tss(path, mth): 23 | if mth.startswith('o_'): 24 | grn = pd.read_csv(path) 25 | cre_grn = pd.read_csv(path.replace('o_', '')).rename(columns={'tf': 'source', 'gene': 'target'}) 26 | grn = pd.merge(grn, cre_grn[['source', 'cre', 'target']]) 27 | else: 28 | grn = pd.read_csv(path) 29 | mth = mth.replace('o_', '') 30 | grn = grn.drop_duplicates(['cre', 'target']) 31 | grn[['Chromosome', 'Start', 'End']] = grn['cre'].str.split('-', expand=True) 32 | grn = pr.PyRanges(grn[['Chromosome', 'Start', 'End', 'target']].rename(columns={'target': 'Name'})) 33 | tss = pd.read_csv(f'dbs/hg38/gen/tss/{mth}.bed', sep='\t', header=None) 34 | tss.columns = ['Chromosome', 'Start', 'End', 'Name'] 35 | tss = pr.PyRanges(tss) 36 | genes = grn.df['Name'].unique().astype('U') 37 | dists = [] 38 | for g in genes: 39 | g_grn = grn[grn.Name == g] 40 | g_tss = tss[tss.Name == g] 41 | dists.append(g_grn.nearest(g_tss, overlap=True).df[['Chromosome', 'Start', 'End', 'Distance']].assign(gene=g)) 42 | dists = pd.concat(dists).rename(columns={'Distance': 'dist'}) 43 | dists['mth'] = mth 44 | dists['cre'] = dists['Chromosome'].astype(str) + '-' + dists['Start'].astype(str) + '-' + dists['End'].astype(str) 45 | dists = dists[['mth', 'cre', 'gene', 'dist']] 46 | return dists 47 | 48 | # Compute dists 49 | dists = [] 50 | path_grns = [p for p in path_grns if (os.path.basename(p).startswith('o_')) or (os.path.basename(p).split('.')[0] in baselines)] 51 | print(path_grns) 52 | for path_grn in tqdm(path_grns): 53 | mth = os.path.basename(path_grn).split('.')[0] # Assume all stp equal 54 | dists.append(compute_dist_tss(path_grn, mth)) 55 | dists = pd.concat(dists) 56 | 57 | # Write 58 | dists.to_csv(path_out, index=False) 59 | -------------------------------------------------------------------------------- /workflow/scripts/anl/tss/gocoef.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from tqdm import tqdm 3 | import pyranges as pr 4 | import os 5 | import argparse 6 | 7 | 8 | # Parse args 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('-a', '--path_tss_a', required=True) 11 | parser.add_argument('-b', '--path_tss_b', required=True) 12 | parser.add_argument('-o', '--path_out', required=True) 13 | args = 
parser.parse_args() 14 | path_tss_a = args.path_tss_a 15 | path_tss_b = args.path_tss_b 16 | out_path = args.path_out 17 | 18 | 19 | # Read 20 | names = [] 21 | pr_tss = [] 22 | for path in [path_tss_a, path_tss_b]: 23 | name = os.path.basename(path).replace('.bed', '') 24 | tss = pd.read_csv(path, sep='\t', header=None) 25 | tss.columns = ['Chromosome', 'Start', 'End', 'Name'] 26 | tss = tss.sort_values(['Chromosome', 'Start', 'End', 'Name']) 27 | tss = pr.PyRanges(tss) 28 | names.append(name) 29 | pr_tss.append(tss) 30 | 31 | # Find shared genes 32 | genes = set().union(pr_tss[0].Name).intersection(pr_tss[1].Name) 33 | 34 | # Find genomic overlap coef 35 | def overlap_coef_per_gene(gene, tss_a, tss_b): 36 | ftss_a = tss_a[tss_a.Name == gene].merge() 37 | ftss_b = tss_b[tss_b.Name == gene].merge() 38 | if ftss_a.empty or ftss_b.empty: 39 | raise ValueError('Gene has to be in tss') 40 | overlap = ftss_a.intersect(ftss_b) 41 | if overlap.empty: 42 | return 0. 43 | else: 44 | l = overlap.length 45 | if l == 0: 46 | return 1 47 | else: 48 | return l / min(ftss_a.length, ftss_b.length) 49 | 50 | 51 | df = [] 52 | 53 | tss_a = pr_tss[0] 54 | tss_a = tss_a[tss_a.Name.isin(genes)] 55 | name_a = names[0] 56 | 57 | tss_b = pr_tss[1] 58 | tss_b = tss_b[tss_b.Name.isin(genes)] 59 | name_b = names[1] 60 | 61 | for gene in tqdm(list(genes)): 62 | val = overlap_coef_per_gene(gene, tss_a, tss_b) 63 | df.append([name_a, name_b, gene, val]) 64 | 65 | df = pd.DataFrame(df, columns=['tss_a', 'tss_b', 'gene', 'ocoef']) 66 | 67 | # Write 68 | df.to_csv(out_path, index=False) 69 | -------------------------------------------------------------------------------- /workflow/scripts/anl/utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os 4 | 5 | 6 | def read_config(path_config='config/config.yaml'): 7 | import yaml 8 | with open(path_config, 'r') as file: 9 | config = yaml.safe_load(file) 10 | return config 11 | 12 | 13 | def get_grn_name(grn_path): 14 | name = os.path.basename(grn_path).replace('.grn.csv', '').replace('.csv', '') 15 | return name 16 | 17 | 18 | def get_grn_stats(grn): 19 | import igraph as ig 20 | if len(grn) == 0: 21 | return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan 22 | n_s = grn['source'].unique().size 23 | n_e = grn.shape[0] 24 | n_t = grn['target'].unique().size 25 | 26 | g = ig.Graph.TupleList(list(zip(grn['source'], grn['target'])), directed=True) 27 | tf_bet = np.mean(g.betweenness()) 28 | tf_odg = grn.groupby(['source']).size().mean() 29 | if not g.is_acyclic(): 30 | tf_eig = np.mean(g.eigenvector_centrality()) 31 | else: 32 | tf_eig = 0. 33 | 34 | return n_s, n_e, n_t, tf_odg, tf_bet, tf_eig 35 | 36 | 37 | def ocoeff(df_a, df_b, on=['source', 'target']): 38 | """Compute overlap coefficient between two dfs""" 39 | tmp_a, tmp_b = df_a.drop_duplicates(on), df_b.drop_duplicates(on) 40 | a_size, b_size = tmp_a.shape[0], tmp_b.shape[0] 41 | if (a_size > 0) and (b_size > 0): 42 | inter = pd.merge(tmp_a, tmp_b, on=on, how='inner') 43 | i_size = inter.shape[0] 44 | coeff = i_size / np.min([a_size, b_size]) 45 | else: 46 | coeff = 0. 
47 | return coeff 48 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/c2g/eqtlcat_gene.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pandas as pd 4 | from tqdm import tqdm 5 | from concurrent.futures import ProcessPoolExecutor 6 | 7 | 8 | mta = pd.read_csv(sys.argv[1], sep='\t', header=None) 9 | mta['smpl'] = mta[0] + '.' + mta[1] 10 | mta = mta.set_index('smpl')[2].to_dict() 11 | 12 | file_data = {} 13 | 14 | for line in tqdm(sys.stdin): 15 | chrm, start, end, gene, smpl = line.strip().split('\t') 16 | start, end = int(start), int(end) 17 | ctype = mta[smpl] 18 | 19 | if gene not in file_data: 20 | file_data[gene] = "" 21 | file_data[gene] += f'{chrm}\t{start}\t{end}\t{gene}\t{ctype}\n' 22 | 23 | 24 | def write_gene_file(gene, lines, output_dir): 25 | with open(os.path.join(output_dir, f'{gene}.bed'), 'w') as f: 26 | f.writelines(lines) 27 | 28 | 29 | with ProcessPoolExecutor(max_workers=32) as executor: 30 | futures = {executor.submit(write_gene_file, gene, lines, sys.argv[2]): gene for gene, lines in file_data.items()} 31 | for future in tqdm(futures, total=len(futures)): 32 | future.result() -------------------------------------------------------------------------------- /workflow/scripts/dbs/c2g/eqtlcat_smpl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pandas as pd 4 | from tqdm import tqdm 5 | 6 | 7 | gdict = pd.read_csv(sys.argv[1]).set_index('id')['symbol'].to_dict() 8 | thr_pval = float(sys.argv[2]) 9 | name = os.path.basename(sys.argv[3]).replace('.bed', '') 10 | with open(sys.argv[3], 'w') as f: 11 | next(sys.stdin) # skip first line 12 | for line in tqdm(sys.stdin): 13 | line = line.strip().split('\t') 14 | gene, coords, pval = line[1], line[3], float(line[7]) 15 | chrm, start = coords.split('_')[:2] 16 | start, end = int(start), int(start) 17 | valid = (pval < thr_pval) and (gene in gdict) and ('_' not in chrm) 18 | if valid: 19 | gene = gdict[gene] 20 | f.write(f'{chrm}\t{start}\t{end}\t{gene}\t{name}\n') 21 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/cre/gwascatalogue.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from tqdm import tqdm 4 | import argparse 5 | 6 | 7 | # Init args 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('-i','--inp_path', required=True) 10 | args = vars(parser.parse_args()) 11 | 12 | inp_path = args['inp_path'] 13 | 14 | # Read tsv input 15 | df = pd.read_csv(inp_path, sep='\t', dtype={9: 'str', 12: 'str', 23: 'str', 26: 'str'}) 16 | 17 | # Remove nans 18 | df = df[~df['CHR_POS'].isna()] 19 | df = df[~df['SNP_ID_CURRENT'].isna()] 20 | df = df[~df['MAPPED_TRAIT_URI'].isna()] 21 | 22 | # Drop one special case with multiple snp ids 23 | df = df[~df['SNP_ID_CURRENT'].astype(str).str.contains(';')] 24 | 25 | # Split urls and obtain key, take care of multiple terms separated by commas 26 | df['MAPPED_TRAIT_URI'] = [', '.join([x.split('/')[-1] for x in url.split(',')]) for url in df['MAPPED_TRAIT_URI']] 27 | 28 | # Exctracts the risk allele and sets anything else from ATGC to unknown 29 | str_alleles = [] 30 | bases = np.array(['A', 'T', 'G', 'C']) 31 | for snp in tqdm(df['STRONGEST SNP-RISK ALLELE']): 32 | snp = snp.split('-')[-1].upper() 33 | has_bases = np.all(np.isin([l for l in 
snp], bases)) 34 | if has_bases and snp != '': 35 | str_alleles.append(snp) 36 | else: 37 | str_alleles.append('?') 38 | df['STRONGEST SNP-RISK ALLELE'] = str_alleles 39 | df['CHR_POS_2'] = df['CHR_POS'].copy() 40 | 41 | # Subset by important cols 42 | cols = ['CHR_ID', 'CHR_POS', 'CHR_POS_2', 'STRONGEST SNP-RISK ALLELE', 43 | 'P-VALUE', 'MAPPED_TRAIT', 'MAPPED_TRAIT_URI', 'PUBMEDID'] 44 | df = df[cols] 45 | 46 | # Transform to correct data types 47 | df['CHR_ID'] = 'chr' + df['CHR_ID'].astype(str) 48 | df['CHR_POS'] = df['CHR_POS'].astype(int) 49 | df['CHR_POS_2'] = df['CHR_POS_2'].astype(int) 50 | df['P-VALUE'] = df['P-VALUE'].astype(float) 51 | df['MAPPED_TRAIT'] = df['MAPPED_TRAIT'].astype(str) 52 | df['MAPPED_TRAIT_URI'] = df['MAPPED_TRAIT_URI'].astype(str) 53 | df['PUBMEDID'] = df['PUBMEDID'].astype(str) 54 | 55 | # Summarize when multiple p-values are given 56 | df = df.groupby(list(df.columns[df.columns != 'P-VALUE'])).mean(numeric_only=True).reset_index() 57 | 58 | # Rename and sort 59 | df = df.rename(columns={ 60 | 'CHR_ID': 'chr_id', 61 | 'CHR_POS': 'chr_start', 62 | 'CHR_POS_2': 'chr_end', 63 | 'STRONGEST SNP-RISK ALLELE': 'eff_allele', 64 | 'MAPPED_TRAIT': 'trait_name', 65 | 'MAPPED_TRAIT_URI': 'trait_uri', 66 | 'PUBMEDID': 'pubmedid', 67 | 'P-VALUE': 'pval' 68 | }) 69 | 70 | # Save 71 | df = df[['chr_id', 'chr_start', 'chr_end', 'eff_allele', 'trait_name']] 72 | df['trait_name'] = df['trait_name'].str.strip() 73 | df.to_csv(inp_path, index=False, header=None, sep='\t') 74 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/cre/promoters.R: -------------------------------------------------------------------------------- 1 | library(biomaRt) 2 | library(dplyr) 3 | 4 | # Parse args 5 | args <- commandArgs(trailingOnly = F) 6 | window_size <- as.numeric(args[6]) 7 | out_path <- args[7] 8 | 9 | 10 | ensembl <- useMart( 11 | "ensembl", 12 | dataset = "hsapiens_gene_ensembl", 13 | host = "http://www.ensembl.org" 14 | ) 15 | 16 | gene_data <- getBM( 17 | attributes = c("ensembl_gene_id", "external_gene_name", "chromosome_name", "transcription_start_site"), 18 | mart = ensembl 19 | ) 20 | 21 | gene_data <- gene_data %>% 22 | mutate( 23 | promoter_start = transcription_start_site - window_size, 24 | promoter_end = transcription_start_site + window_size - 1, 25 | promoter_start = pmax(promoter_start, 0) # Ensure non-negative values 26 | ) 27 | 28 | standard_chromosomes <- c(1:23, "X", "Y") 29 | bed_data <- gene_data %>% 30 | filter(chromosome_name %in% standard_chromosomes & external_gene_name != "") %>% 31 | distinct(external_gene_name, .keep_all = TRUE) %>% 32 | transmute( 33 | chrom = paste0("chr", chromosome_name), 34 | chromStart = promoter_start - 1, # BED format is 0-based 35 | chromEnd = promoter_end, 36 | name = external_gene_name 37 | ) 38 | 39 | bed_data <- bed_data %>% 40 | arrange( 41 | factor(chrom, levels = paste0("chr", c(1:23, "X", "Y"))), 42 | chromStart 43 | ) 44 | 45 | # Write to output file 46 | write.table(bed_data, file = out_path, sep = "\t", quote = FALSE, col.names = FALSE, row.names = FALSE) 47 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/gen/genome/celloracle.py: -------------------------------------------------------------------------------- 1 | from genomepy import install_genome 2 | import os 3 | import re 4 | import argparse 5 | 6 | 7 | # Init args 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('-o','--orgms', required=True, 
nargs='+') 10 | args = vars(parser.parse_args()) 11 | 12 | # Get dir 13 | orgms = args['orgms'] 14 | 15 | # Install genomes 16 | for path_org in orgms: 17 | org = re.search(r'^dbs/([^/]+)/.*$', path_org).group(1) 18 | install_genome(name=org, genomes_dir=path_org, provider="UCSC") 19 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/gen/gid/ensmbl.R: -------------------------------------------------------------------------------- 1 | library(biomaRt) 2 | 3 | # Parse args 4 | orgms <- commandArgs(trailingOnly = TRUE) 5 | 6 | get_gene_table <- function(dataset){ 7 | # Connect to the Ensembl database 8 | ensembl <- useEnsembl( 9 | biomart = 'genes', 10 | dataset = dataset, 11 | version = 111 12 | ) 13 | # Specify the attributes to retrieve 14 | attributes <- c("ensembl_gene_id", "external_gene_name") 15 | # Retrieve the data 16 | gene_data <- getBM( 17 | attributes = attributes, 18 | mart = ensembl, 19 | useCache=FALSE, 20 | verbose=FALSE 21 | ) 22 | colnames(gene_data) <- c('id', 'symbol') 23 | return(gene_data) 24 | } 25 | 26 | org_table <- list( 27 | 'hg38'='hsapiens_gene_ensembl', 28 | 'mm10'='mmusculus_gene_ensembl' 29 | ) 30 | 31 | for (path_org in orgms) { 32 | org <- sub('^dbs/([^/]+)/.*$', '\\1', path_org) 33 | org <- org_table[org] 34 | gid <- get_gene_table(org) 35 | gid <- gid[gid$symbol != "", ] 36 | write.csv(x = gid, file = path_org, row.names=FALSE, quote=FALSE) 37 | } 38 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/gen/pid/uniprot.R: -------------------------------------------------------------------------------- 1 | library(biomaRt) 2 | 3 | # Parse args 4 | orgms <- commandArgs(trailingOnly = TRUE) 5 | 6 | get_gene_table <- function(dataset){ 7 | # Connect to the Ensembl database 8 | ensembl <- useEnsembl( 9 | biomart = 'genes', 10 | dataset = dataset, 11 | version = 111 12 | ) 13 | # Specify the attributes to retrieve 14 | attributes <- c("uniprotswissprot", "external_gene_name") 15 | # Retrieve the data 16 | gene_data <- getBM( 17 | attributes = attributes, 18 | mart = ensembl, 19 | useCache=FALSE, 20 | verbose=FALSE 21 | ) 22 | colnames(gene_data) <- c('uniprot_id', 'symbol') 23 | return(gene_data) 24 | } 25 | 26 | org_table <- list( 27 | 'hg38'='hsapiens_gene_ensembl', 28 | 'mm10'='mmusculus_gene_ensembl' 29 | ) 30 | 31 | for (path_org in orgms) { 32 | org <- sub('^dbs/([^/]+)/.*$', '\\1', path_org) 33 | org <- org_table[org] 34 | gid <- get_gene_table(org) 35 | gid <- gid[(gid$symbol != "") & (gid$uniprot_id != ""), ] # Exclude rows with empty gene symbols 36 | write.csv(x = gid, file = path_org, row.names=FALSE, quote=FALSE) 37 | } 38 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/gen/tss/celloracle.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import requests 3 | from io import StringIO 4 | import argparse 5 | 6 | # Initiate args 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('-o', '--path_out', required=True) 9 | args = parser.parse_args() 10 | out_path = args.path_out 11 | 12 | # Download bed file 13 | url = "https://github.com/morris-lab/CellOracle/blob/e5ae78e93272da7d772378e60ae6cd4602f24be6/celloracle/motif_analysis/tss_ref_data/hg38_tss_info.bed?raw=true" 14 | response = requests.get(url) 15 | bed = pd.read_csv(StringIO(response.text), sep='\t', header=None)[[0, 1, 2, 3]].dropna().sort_values([0, 1, 2]) 16 | 
17 | # Save file 18 | bed.to_csv(out_path, sep="\t", index=False, header=False) 19 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/gen/tss/dictys.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument('-o', '--path_out', required=True) 6 | parser.add_argument('-i', '--path_input', required=True) 7 | args = parser.parse_args() 8 | out_path = args.path_out 9 | input_path = args.path_input 10 | 11 | # Read file 12 | bed = pd.read_csv(input_path, sep='\t', header=None) 13 | 14 | # Process columns 15 | bed.columns = ['Chromosome', 'Start', 'End', 'Name', 'score', 'strand'] 16 | bed = bed[['Chromosome', 'Start', 'End', 'Name']] 17 | bed['Start'] = bed['Start'] - 1 18 | bed['End'] = bed['End'] - 1 19 | 20 | # Save file 21 | bed.to_csv(out_path, sep="\t", index=False, header=None) 22 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/gen/tss/figr.R: -------------------------------------------------------------------------------- 1 | library(FigR) 2 | library(GenomicRanges) 3 | 4 | 5 | # Add arguments 6 | args <- commandArgs(trailingOnly = F) 7 | path_out <- args[6] 8 | 9 | # Extract TSS annotations 10 | TSSg <- FigR::hg38TSSRanges 11 | chr <- as.character(seqnames(TSSg)) 12 | start_pos <- start(TSSg) 13 | end_pos <- end(TSSg) 14 | gene_names <- mcols(TSSg)$gene_name 15 | 16 | # Transform it into a data frame 17 | data <- data.frame(Chromosome = chr, Start = start_pos - 1, End = end_pos - 1, Name = gene_names) 18 | 19 | # Write 20 | write.table(x = data, file = path_out, sep = '\t', row.names = FALSE, quote = FALSE, col.names = FALSE) 21 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/gen/tss/granie.R: -------------------------------------------------------------------------------- 1 | library(AnnotationHub) 2 | 3 | 4 | # Initiate args 5 | args <- commandArgs(trailingOnly = F) 6 | path_out <- args[6] 7 | 8 | 9 | # Load db 10 | ah <- AnnotationHub() 11 | 12 | # Get the newest version of annotation 13 | results = AnnotationHub::query(ah, c("EnsDb", "Homo sapiens")) 14 | annotationDatasets <- as.data.frame(mcols(results)) 15 | newestAnno.title = tail(annotationDatasets$title, 1) 16 | newestAnno.ID = tail(rownames(annotationDatasets), 1) 17 | ensdb.newest <- ah[[newestAnno.ID]] 18 | 19 | # Read 20 | gr <- ensembldb::genes(ensdb.newest) 21 | 22 | # Merge overlaps 23 | merged <- unlist(reduce(split(gr, gr$gene_name)), use.names = TRUE) 24 | 25 | # To df 26 | chr_names <- paste0("chr", as.character(seqnames(merged))) 27 | start_pos <- start(merged) - 1 28 | end_pos <- end(merged) - 1 29 | gene_names <- names(merged) 30 | bed <- data.frame(Chromosome = chr_names, Start = start_pos, End = end_pos, Name = gene_names) 31 | 32 | # Filter empty names 33 | bed <- bed[bed$Name != '', ] 34 | 35 | # Sort 36 | bed <- bed[order(bed$Chromosome, bed$Start, bed$End), ] 37 | 38 | # Write 39 | write.table(x = bed, file = path_out, sep = '\t', row.names = FALSE, quote = FALSE, col.names = FALSE) 40 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/gen/tss/hummus.R: -------------------------------------------------------------------------------- 1 | # Initiate arguments 2 | args <- commandArgs(trailingOnly = F) 3 | path_out <- args[6] 4 | 5 | 6 | library(HuMMuS) 7 | 
library(EnsDb.Hsapiens.v86) 8 | library(dplyr) 9 | 10 | # Extract TSS 11 | gene_range = get_genome_annotations(EnsDb.Hsapiens.v86) 12 | chr <- as.character(seqnames(gene_range)) 13 | start_pos <- start(gene_range) 14 | end_pos <- end(gene_range) 15 | gene_names <- mcols(gene_range)$gene_name 16 | gene_type <- mcols(gene_range)$gene_biotype 17 | 18 | 19 | # Build dataframe in .csv 20 | data <- data.frame(Chromosome = chr, Start = start_pos, End = end_pos, Name = gene_names, gene.type = gene_type) 21 | 22 | 23 | # Filter only protein coding genes 24 | data <- data %>% filter(gene.type == "protein_coding") 25 | data <- data %>% 26 | dplyr::select(Chromosome, Start, End, Name) 27 | 28 | 29 | write.csv(x = data, file = path_out) 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/gen/tss/pando.R: -------------------------------------------------------------------------------- 1 | library(EnsDb.Hsapiens.v86) 2 | library(dplyr) 3 | 4 | 5 | # Parse args 6 | args <- commandArgs(trailingOnly = F) 7 | path_out <- args[6] 8 | 9 | 10 | # Read 11 | gr <- Signac::GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86) 12 | 13 | # Merge overlaps 14 | merged <- unlist(reduce(split(gr, gr$gene_name)), use.names = TRUE) 15 | 16 | # To df 17 | chr_names <- paste0("chr", as.character(seqnames(merged))) 18 | start_pos <- start(merged) 19 | end_pos <- end(merged) 20 | gene_names <- names(merged) 21 | bed <- data.frame(Chromosome = chr_names, Start = start_pos, End = end_pos, Name = gene_names) 22 | bed <- dplyr::arrange(bed, Chromosome, Start, End) 23 | 24 | # Write 25 | write.table(x = bed, file = path_out, sep = '\t', row.names = FALSE, quote = FALSE, col.names = FALSE) 26 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/gen/tss/scenicplus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[22]: 5 | 6 | 7 | import pybiomart as pbm 8 | import argparse 9 | import numpy as np 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('-j', '--path_out', required=True) 13 | args = parser.parse_args() 14 | out_path = args.path_out 15 | 16 | dataset = pbm.Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org') 17 | 18 | annot = dataset.query(attributes=['chromosome_name', 'start_position', 'end_position', 19 | 'strand', 'external_gene_name', 'transcription_start_site', 'transcript_biotype']) 20 | annot['chromosome_name'] = annot['chromosome_name'].to_numpy(dtype=str) 21 | filter = annot['chromosome_name'].str.contains('CHR|GL|JH|MT', case=False) 22 | annot = annot[~filter] 23 | annot['chromosome_name'] = annot['chromosome_name'].str.replace(r'(\b\S)', r'chr\1') 24 | annot.columns = ['Chromosome', 'Start', 'End', 'Strand', 'Name', 'Transcription_Start_Site', 'Transcript_type'] 25 | annot["Strand"] = annot["Strand"].replace({1: "+", -1: "-"}) 26 | annot.Start = annot.Start.astype(np.int32) 27 | annot['Chromosome'] = 'chr' + annot['Chromosome'].astype(str) 28 | annot.dropna(inplace=True) 29 | annot = annot[['Chromosome', 'Start', 'End', 'Name']] 30 | 31 | # Save the file 32 | annot.to_csv(out_path, sep="\t", index=False) 33 | 34 | 35 | 36 | # In[ ]: 37 | 38 | 39 | 40 | 41 | 42 | # In[ ]: 43 | 44 | 45 | 46 | 47 | 48 | # In[ ]: 49 | 50 | 51 | 52 | 53 | 54 | # In[ ]: 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- 
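Note on the pathway collection script below: it expects decoupler's GMT reader to return a long table with one row per (gene set, gene) pair, i.e. columns 'source' and 'target', which is what the prefix stripping and the CSV writes operate on. The following is a minimal illustrative sketch of that expected shape, assuming the standard GMT layout (set name, description, then member genes, all tab-separated); the helper name read_gmt_sketch is hypothetical and not part of the workflow.

import pandas as pd

def read_gmt_sketch(path: str) -> pd.DataFrame:
    # A GMT line is: set name, description, gene1, gene2, ... (tab-separated)
    rows = []
    with open(path) as f:
        for line in f:
            fields = line.rstrip('\n').split('\t')
            gset, genes = fields[0], fields[2:]
            rows.extend((gset, g) for g in genes)
    # Long format mirroring what pways.py expects from dc.read_gmt
    return pd.DataFrame(rows, columns=['source', 'target'])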
/workflow/scripts/dbs/gst/pways.py: -------------------------------------------------------------------------------- 1 | import decoupler as dc 2 | import pandas as pd 3 | import argparse 4 | 5 | # Init args 6 | parser = argparse.ArgumentParser() 7 | for flag in ['path_reac', 'path_hall', 'path_kegg', 'path_tfs', 'path_prg', 'path_out_hall', 'path_out_kegg', 'path_out_prg', 'path_out_reac']: 8 | parser.add_argument('--' + flag, required=True) 9 | args = vars(parser.parse_args()) 10 | 11 | path_reac = args['path_reac'] 12 | path_hall = args['path_hall'] 13 | path_kegg = args['path_kegg'] 14 | path_tfs = args['path_tfs'] 15 | path_prg = args['path_prg'] 16 | path_out_hall = args['path_out_hall'] 17 | path_out_kegg = args['path_out_kegg'] 18 | path_out_prg = args['path_out_prg'] 19 | path_out_reac = args['path_out_reac'] 20 | 21 | # Process hallmark 22 | hall = dc.read_gmt(path_hall) 23 | hall['source'] = hall['source'].str.replace('HALLMARK_', '') 24 | 25 | # Process kegg 26 | kegg = dc.read_gmt(path_kegg) 27 | kegg['source'] = kegg['source'].str.replace('KEGG_', '') 28 | 29 | # Process progeny 30 | prg = pd.read_csv(path_prg) 31 | prg = prg.rename(columns={'gene': 'target', 'pathway': 'source', 'p.value': 'pval'}) 32 | prg = prg[['source', 'target', 'weight', 'pval']] 33 | prg = prg[prg['pval'] < 0.05] 34 | prg = prg.sort_values(['source', 'pval']) 35 | prg = prg.rename(columns={'source': 'pathway', 'target': 'gene'}) 36 | 37 | # Process reactome 38 | reac = dc.read_gmt(path_reac) 39 | reac['source'] = reac['source'].str.replace('REACTOME_', '') 40 | 41 | # Write 42 | kegg.to_csv(path_out_kegg, index=False) 43 | prg.to_csv(path_out_prg, index=False) 44 | hall.to_csv(path_out_hall, index=False) 45 | reac.to_csv(path_out_reac, index=False) 46 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/ont/bto.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pandas as pd 3 | 4 | b_dict = dict() 5 | read = False 6 | 7 | for line in sys.stdin: 8 | line = line.strip() 9 | if line.startswith('<owl:Class'): 10 | read = True 11 | continue 12 | elif line.startswith('<oboInOwl:id') and read: # assumed id element tag, value like 'BTO:0000000' 13 | key = line.split('>')[1].split('<')[0] 14 | continue 15 | elif line.startswith('<rdfs:label') and read: 16 | val = line.split('>')[1].split('<')[0] 17 | continue 18 | elif line.startswith('</owl:Class>') and read: 19 | b_dict[key] = val 20 | read = False 21 | 22 | b_dict = pd.DataFrame(list(b_dict.items())) 23 | b_dict.to_csv(sys.argv[1], sep='\t', index=False, header=None) 24 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/tfb/aggregate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | for line in sys.stdin: 3 | line = line.replace('\n', '').split('\t') 4 | chrm, start, end, tf, ctype = line[0], line[1], line[2], line[3], line[4] 5 | ctype = ','.join(sorted(set(ctype.split(',')))) 6 | print(f'{chrm}\t{start}\t{end}\t{tf}\t{ctype}') 7 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/tfb/chipatlas_meta.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import sys 4 | 5 | 6 | df = pd.read_csv(sys.argv[1], sep='\t', usecols=[0, 1, 2, 3, 4, 5], header=None) 7 | tfs = pd.read_csv(sys.argv[2], header=None).values.ravel() 8 | org = sys.argv[1].split(os.sep)[1] 9 | msk_org = df[1] == org 10 | msk_tfs = df[3].isin(tfs) 11 | msk_unc = ~(df[4] == 'Unclassified') 12 | msk = msk_org & msk_tfs & msk_unc 13 | df = df.loc[msk, :].dropna() 14 | df['ctype'] = df[4] + ',' + df[5] 15 | df = df[[0, 3, 'ctype']] 16 | df.to_csv(sys.argv[1],
sep='\t', index=False, header=None) 17 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/tfb/chipatlas_tf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import re 4 | import pandas as pd 5 | 6 | 7 | tf = os.path.basename(sys.argv[1]).replace('.bed', '') 8 | meta = pd.read_csv(sys.argv[2], sep='\t', header=None).set_index(0) 9 | pattern = r'ID=(.*?);' 10 | for line in sys.stdin: 11 | if line.startswith('chr'): 12 | line = line.replace('\n', '').split('\t') 13 | chrm, start, end, sample_id = line[0], line[1], line[2], line[3] 14 | sample_id = re.search(pattern, sample_id).group(1) 15 | if (sample_id in meta.index) and ('_' not in chrm): 16 | m_tf = meta.loc[sample_id, 1] 17 | ctype = meta.loc[sample_id, 2] 18 | start, end = int(start), int(end) 19 | if (m_tf == tf) and ((end - start) < int(sys.argv[3])): 20 | print(f'{chrm}\t{start}\t{end}\t{tf}\t{ctype}') 21 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/tfb/remap2022_meta.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from io import BytesIO 3 | import sys 4 | 5 | binary_data = BytesIO(sys.stdin.buffer.read()) 6 | df = pd.read_excel(pd.ExcelFile(binary_data), sheet_name=0) 7 | df = df[['biotype', 'identifiants/0/BTO_id']].dropna() 8 | df = df.rename(columns={'identifiants/0/BTO_id': 'id'}) 9 | df['id'] = df['id'].str.replace('_', ':') 10 | bto = pd.read_csv(sys.argv[1], sep='\t', header=None).set_index(0)[1].to_dict() 11 | df['term'] = [bto[i] for i in df['id']] 12 | df[['biotype', 'term']].to_csv(sys.argv[2], sep='\t', index=False, header=None) 13 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/tfb/remap2022_raw.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pandas as pd 4 | from tqdm import tqdm 5 | 6 | 7 | tfs = set(pd.read_csv(sys.argv[1], header=None).iloc[:, 0].astype('U')) 8 | mta = pd.read_csv(sys.argv[2], header=None, sep='\t', index_col=0).iloc[:, 0].to_dict() 9 | file_handles = {} 10 | for line in tqdm(sys.stdin): 11 | if line.startswith('chr'): 12 | chrm, start, end, tf_ctype = line.strip().split('\t')[:4] 13 | tf, ctype = tf_ctype.split(':') 14 | start, end = int(start), int(end) 15 | if tf in tfs and '_' not in chrm and (end - start) < int(sys.argv[3]): 16 | ctypes = [mta[c] for c in ctype.split(',') if c in mta] 17 | if ctypes: 18 | if tf not in file_handles: 19 | file_handles[tf] = open(os.path.join(sys.argv[4], f'{tf}.bed'), 'w') 20 | file_handles[tf].write(f'{chrm}\t{start}\t{end}\t{tf}\t{",".join(ctypes)}\n') 21 | for tf in file_handles: 22 | file_handles[tf].close() 23 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/tfb/unibind_raw.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pandas as pd 4 | from tqdm import tqdm 5 | 6 | 7 | tfs = set(pd.read_csv(sys.argv[1], header=None).iloc[:, 0].astype('U')) 8 | file_handles = {} 9 | for line in tqdm(sys.stdin): 10 | chrm, start, end, tmp = line.strip().split('\t')[:4] 11 | tmp = tmp.split('_') 12 | if len(tmp) == 4: 13 | _, ctype, tf, _ = tmp 14 | start, end = int(start), int(end) 15 | ctype = ctype.replace('-', ' ').replace(',', ' ').strip() 16 | tf = tf.strip() 17 |
valid = (tf in tfs) and ('_' not in chrm) and ((end - start) < int(sys.argv[2])) 18 | if valid: 19 | if tf not in file_handles: 20 | file_handles[tf] = open(os.path.join(sys.argv[3], f'{tf}.bed'), 'w') 21 | file_handles[tf].write(f'{chrm}\t{start}\t{end}\t{tf}\t{ctype}\n') 22 | for tf in file_handles: 23 | file_handles[tf].close() -------------------------------------------------------------------------------- /workflow/scripts/dbs/tfm/hpa.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import argparse 4 | 5 | 6 | # Init args 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('-i','--inp_path', required=True) 9 | parser.add_argument('-t','--tfs_path', required=True) 10 | parser.add_argument('-o','--out_path', required=True) 11 | args = vars(parser.parse_args()) 12 | 13 | inp_path = args['inp_path'] 14 | tfs_path = args['tfs_path'] 15 | out_path = args['out_path'] 16 | 17 | # Read 18 | c_cols = ['Tissue expression cluster', 'Cell line expression cluster', 'Single cell expression cluster'] 19 | df = pd.read_csv(inp_path, sep='\t').dropna(subset=c_cols) 20 | 21 | # Read tfs 22 | tfs = pd.read_csv(tfs_path, sep='\t', header=None).values.ravel().astype('U') 23 | 24 | # Filter 25 | msk_evd = (df['Evidence'] == 'Evidence at protein level').values 26 | msk_loc = np.array(['Nucle' in str(s) for s in df['Subcellular location']]) 27 | msk_tissue = ~df['Tissue expression cluster'].str.contains('Non-specific -') 28 | msk_celine = ~df['Cell line expression cluster'].str.contains('Non-specific -') 29 | msk_cetype = ~df['Single cell expression cluster'].str.contains('Non-specific -') 30 | msk_tf = df['Gene'].isin(tfs) 31 | msk = msk_tf & msk_evd & msk_loc & msk_tissue & msk_celine & msk_cetype 32 | df = df.loc[msk, :].copy() 33 | 34 | # Format names 35 | for col in c_cols: 36 | df[col] = [s.split(':')[1].split('-')[0].strip() for s in df[col]] 37 | df['ctype'] = [','.join(sorted(set(lst))) for lst in df[c_cols].values] 38 | df = df.rename(columns={'Gene': 'gene'})[['gene', 'ctype']] 39 | df = df.sort_values(['gene', 'ctype']) 40 | 41 | # Write 42 | df.to_csv(out_path, sep='\t', index=False, header=None) 43 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/tfp/europmc.py: -------------------------------------------------------------------------------- 1 | import scipy.stats as ss 2 | import pandas as pd 3 | import sys 4 | 5 | 6 | # Vars 7 | path_single = sys.argv[1] 8 | path_pairs = sys.argv[2] 9 | pval_thr = float(sys.argv[3]) 10 | min_odds = float(sys.argv[4]) 11 | path_out = sys.argv[5] 12 | 13 | # Read 14 | single = pd.read_csv(path_single) 15 | total = single['n'].sum() 16 | single = single.set_index('tf')['n'].to_dict() 17 | pairs = pd.read_csv(path_pairs) 18 | 19 | # Compute one-sided Fisher test 20 | df = [] 21 | for row in pairs.values: 22 | tf_a, tf_b, n = row 23 | only_a = single[tf_a] - n 24 | only_b = single[tf_b] - n 25 | backgr = total - (single[tf_a] + single[tf_b]) 26 | s, p = ss.fisher_exact([[n, only_a], [only_b, backgr]], alternative='greater') 27 | df.append([tf_a, tf_b, s, p]) 28 | df = pd.DataFrame(df, columns=['tf_a', 'tf_b', 'stat', 'pval']) 29 | df['padj'] = ss.false_discovery_control(df['pval']) 30 | 31 | # Filter 32 | df = df[(df['padj'] < pval_thr) & (df['stat'] > min_odds)].copy() 33 | df['name'] = ['|'.join(sorted([a, b])) for a, b in zip(df['tf_a'], df['tf_b'])] 34 | df[['tf_a', 'tf_b']] = df['name'].str.split('|', expand=True)
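# Sorting each TF pair alphabetically before splitting canonicalises (A, B) and (B, A) into a single orientation; the helper 'name' column is dropped right below.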
35 | df = df.drop(columns='name') 36 | 37 | # Save 38 | df.to_csv(path_out, index=False, header=False, sep='\t') 39 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/tfp/europmc_raw.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import pandas as pd 3 | import requests 4 | import re 5 | import time 6 | import sys 7 | 8 | 9 | def do_query(query): 10 | base = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search' 11 | url = f"{base}?query={query}&format=json" 12 | res = requests.get(url) 13 | while res.status_code != 200: 14 | print(url, flush=True) 15 | time.sleep(1) 16 | res = requests.get(url) 17 | n = int(res.json()['hitCount']) 18 | return n 19 | 20 | 21 | def get_n_pairs(tf_a, tf_b): 22 | query = f'(TITLE:"{tf_a}"+OR+ABSTRACT:"{tf_a}")+AND+(TITLE:"{tf_b}"+OR+ABSTRACT:"{tf_b}")' 23 | return do_query(query) 24 | 25 | 26 | def get_n_single(tf): 27 | query = f'(ABSTRACT:"{tf}"+OR+TITLE:"{tf}")' 28 | return do_query(query) 29 | 30 | 31 | # Read args 32 | path_tfs = sys.argv[1] 33 | min_chars = int(sys.argv[2]) 34 | min_n = int(sys.argv[3]) 35 | path_single = sys.argv[4] 36 | path_pairs = sys.argv[5] 37 | 38 | # Open tfs 39 | tfs = pd.read_csv(path_tfs, sep='\t', header=None)[0].values.astype('U') 40 | 41 | # Find unique tfs with enough publications (min_n) and characters (min_chars) 42 | single_tfs = [] 43 | for tf in tqdm(tfs): 44 | if len(tf) > min_chars: 45 | single_tfs.append([tf, get_n_single(tf)]) 46 | single_tfs = pd.DataFrame(single_tfs, columns=['tf', 'n']).sort_values('n') 47 | single_tfs = single_tfs[single_tfs['n'] > min_n] 48 | tfs = single_tfs['tf'].sort_values().unique() 49 | single_tfs.to_csv(path_single, index=False) 50 | 51 | # Find pairs 52 | df = [] 53 | for i in tqdm(range(tfs.size)): 54 | tf_a = tfs[i] 55 | for j in range(i + 1, tfs.size): 56 | tf_b = tfs[j] 57 | n = get_n_pairs(tf_a, tf_b) 58 | if n > 0: 59 | df.append([tf_a, tf_b, n]) 60 | df = pd.DataFrame(df, columns=['tf_a', 'tf_b', 'n']) 61 | df.to_csv(path_pairs, index=False) 62 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/tfp/intact.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import sys 3 | 4 | 5 | # Read 6 | db = pd.read_csv(sys.argv[1], sep="\t", usecols=['#ID(s) interactor A', 'ID(s) interactor B', 'Confidence value(s)']) 7 | tfs = pd.read_csv(sys.argv[2], header=None)[0].to_list() 8 | pid = pd.read_csv(sys.argv[3]) 9 | 10 | # Format 11 | p_to_g = pid.set_index('uniprot_id')['symbol'].to_dict() 12 | db = db.rename(columns={ 13 | '#ID(s) interactor A': 'tf_a', 14 | 'ID(s) interactor B': 'tf_b', 15 | 'Confidence value(s)': 'score', 16 | }) 17 | db['tf_a'] = db['tf_a'].str.extract(r'uniprotkb:(\w+)')[0].map(p_to_g) 18 | db['tf_b'] = db['tf_b'].str.extract(r'uniprotkb:(\w+)')[0].map(p_to_g) 19 | db['score'] = db['score'].str.extract(r'intact-miscore:(\d+\.\d+)').astype(float) 20 | 21 | # Filter 22 | db = db[db['score'] > 0.75].dropna() 23 | db = db[db['tf_a'].isin(tfs) & db['tf_b'].isin(tfs)] 24 | db = db[db['tf_a'] != db['tf_b']].copy() 25 | db['str'] = ['|'.join(sorted([a, b])) for a, b in zip(db['tf_a'], db['tf_b'])] 26 | db = db.drop_duplicates('str').sort_values('score', ascending=False) 27 | db[['tf_a', 'tf_b']] = db['str'].str.split('|', expand=True) 28 | db = db.drop(columns=['str']) 29 | 30 | # Write 31 | db.to_csv(sys.argv[4], index=False, header=False, 
sep='\t') 32 | -------------------------------------------------------------------------------- /workflow/scripts/dts/brain/brain.py: -------------------------------------------------------------------------------- 1 | import os 2 | import scanpy as sc 3 | import snapatac2 as snap 4 | from snapatac2.datasets import _datasets, datasets 5 | from pathlib import Path 6 | import pandas as pd 7 | import numpy as np 8 | import anndata as ad 9 | import mudata as md 10 | import argparse 11 | 12 | 13 | # Init args 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('-a','--path_gex', nargs='+', required=True) 16 | parser.add_argument('-b','--path_peaks', required=True) 17 | parser.add_argument('-c','--path_annot', required=True) 18 | parser.add_argument('-d','--path_geneids', required=True) 19 | parser.add_argument('-f','--path_output', required=True) 20 | args = vars(parser.parse_args()) 21 | 22 | path_gex = args['path_gex'] 23 | path_peaks = args['path_peaks'] 24 | path_annot = args['path_annot'] 25 | path_geneids = args['path_geneids'] 26 | path_output = args['path_output'] 27 | 28 | # Read annots 29 | obs = pd.read_csv(path_annot, index_col=0) 30 | 31 | def read_sample(path_gex, obs, geneids): 32 | rna = sc.read_10x_h5(path_gex) 33 | rna.obs_names_make_unique() 34 | sample_id = os.path.basename(path_gex).split('_')[0] 35 | rna.obs_names = [sample_id + '_' + b.split('-1')[0] for b in rna.obs_names] 36 | 37 | # Filter faulty gene symbols 38 | ensmbls = np.array([geneids[g] if g in geneids else '' for g in rna.var_names]) 39 | msk = ensmbls != '' 40 | rna = rna[:, msk].copy() 41 | 42 | # Basic QC 43 | sc.pp.filter_cells(rna, min_genes=100) 44 | sc.pp.filter_genes(rna, min_cells=3) 45 | del rna.obs['n_genes'] 46 | 47 | # Remove duplicated genes based on num of cells 48 | to_remove = [] 49 | for dup in rna.var.index[rna.var.index.duplicated()]: 50 | tmp = rna.var.loc[dup] 51 | max_idx = tmp.set_index('gene_ids')['n_cells'].idxmax() 52 | to_remove.extend(tmp['gene_ids'][tmp['gene_ids'] != max_idx].values) 53 | rna = rna[:, ~rna.var['gene_ids'].isin(to_remove)].copy() 54 | return rna 55 | 56 | 57 | # Read gene ids 58 | geneids = pd.read_csv(path_geneids).set_index('symbol')['id'].to_dict() 59 | 60 | # Read samples 61 | rna = [] 62 | for p in path_gex: 63 | rna.append(read_sample(p, obs, geneids)) 64 | rna = ad.concat(rna, join='outer') 65 | 66 | # Read atac data 67 | atac = ad.read_h5ad(path_peaks) 68 | rna = rna[atac.obs_names].copy() 69 | rna.X.sort_indices() 70 | atac = atac[rna.obs_names].copy() 71 | 72 | # Create mdata 73 | mdata = md.MuData( 74 | {'rna': rna, 'atac': atac,}, 75 | obs=obs 76 | ) 77 | 78 | # Write 79 | mdata.write(path_output) 80 | -------------------------------------------------------------------------------- /workflow/scripts/dts/brain/prc_annot.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os 4 | import argparse 5 | 6 | 7 | # Init args 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('-a', '--path_rannot', required=True) 10 | parser.add_argument('-b','--samples', required=True, nargs='+') 11 | parser.add_argument('-c', '--path_annot', required=True) 12 | args = vars(parser.parse_args()) 13 | 14 | path_rannot = args['path_rannot'] 15 | samples = args['samples'] 16 | path_annot = args['path_annot'] 17 | 18 | annot = pd.read_csv(path_rannot) 19 | annot = annot[annot['batch'].isin(samples)] 20 | annot['barcode'] = annot['batch'] + '_' + annot['barcode'] 21 | 
annot = annot.set_index('barcode', drop=True) 22 | annot.to_csv(path_annot, header=True) 23 | -------------------------------------------------------------------------------- /workflow/scripts/dts/fakepair/coembedd.R: -------------------------------------------------------------------------------- 1 | library(Signac) 2 | library(EnsDb.Hsapiens.v86) 3 | library(ggplot2) 4 | library(cowplot) 5 | library(dplyr) 6 | library(Seurat) 7 | library(SingleCellExperiment) 8 | library(rhdf5) 9 | 10 | 11 | # Parse args 12 | args <- commandArgs(trailingOnly = F) 13 | path_gex <- args[6] 14 | path_peaks <- args[7] 15 | path_frags <- args[8] 16 | path_cca_out <- args[9] 17 | 18 | 19 | # Load RNA and ATAC seq matrix 20 | 21 | # Process RNA 22 | rna <- Read10X_h5(path_gex)[[1]] 23 | data.rna <- CreateSeuratObject(counts = rna, project = "RNA", assay = "RNA") 24 | data.rna <- NormalizeData(data.rna) 25 | data.rna <- FindVariableFeatures(data.rna) 26 | data.rna <- ScaleData(data.rna) 27 | 28 | # Process ATAC 29 | indata <- H5Fopen(path_peaks, flags='H5F_ACC_RDONLY') 30 | indices <- indata$X$indices 31 | indptr <- indata$X$indptr 32 | data <- as.numeric(indata$X$data) 33 | atac <- Matrix::sparseMatrix(i=indices, p=indptr, x=data, index1 = FALSE) 34 | colnames(atac) <- indata$obs$`_index` 35 | rownames(atac) <- indata$var$`_index` 36 | h5closeAll() 37 | grange.counts <- StringToGRanges(rownames(atac), sep = c(":", "-")) 38 | grange.use <- seqnames(grange.counts) %in% standardChromosomes(grange.counts) 39 | atac <- atac[as.vector(grange.use), ] 40 | annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86) 41 | seqlevelsStyle(annotations) <- 'UCSC' 42 | genome(annotations) <- "hg38" 43 | chrom_assay <- CreateChromatinAssay( 44 | counts = atac, 45 | sep = c(":", "-"), 46 | genome = 'hg38', 47 | fragments = path_frags, 48 | annotation = annotations 49 | ) 50 | data.atac <- CreateSeuratObject(counts = chrom_assay, assay = "ATAC", project = "ATAC") 51 | data.atac <- RunTFIDF(data.atac) 52 | data.atac <- FindTopFeatures(data.atac, min.cutoff = "q0") 53 | data.atac <- ScaleData(data.atac) 54 | 55 | # Infer gene scores 56 | gene.activities <- GeneActivity(data.atac, features = VariableFeatures(data.rna)) 57 | data.atac[["ACTIVITY"]] <- CreateAssayObject(counts = gene.activities) 58 | DefaultAssay(data.atac) <- "ACTIVITY" 59 | data.atac <- NormalizeData(data.atac) 60 | data.atac <- ScaleData(data.atac, features = rownames(data.atac)) 61 | data.atac <- FindVariableFeatures(data.atac) 62 | 63 | # Run CCA 64 | data.cca <- RunCCA( 65 | data.rna, 66 | data.atac, 67 | assay1 = "RNA", 68 | assay2 = "ACTIVITY", 69 | num.cc = 50 70 | ) 71 | 72 | CCA_PCs <- Embeddings(data.cca, reduction = "cca") 73 | saveRDS(CCA_PCs, file = path_cca_out) 74 | -------------------------------------------------------------------------------- /workflow/scripts/dts/fakepair/fakepair.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import mudata as mu 3 | import argparse 4 | import sys 5 | import os 6 | 7 | 8 | # Init args 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('-m','--path_mdata', required=True) 11 | parser.add_argument('-b','--path_barmap', required=True) 12 | parser.add_argument('-o','--path_output', required=True) 13 | args = vars(parser.parse_args()) 14 | 15 | path_mdata = args['path_mdata'] 16 | path_barmap = args['path_barmap'] 17 | path_output = args['path_output'] 18 | 19 | # Read 20 | mdata = mu.read('dts/pitupair/annotated.h5mu') 21 | barmap = 
pd.read_csv('dts/fakepitupair/barmap.csv') 22 | 23 | # Format RNA barmap 24 | barmap.loc[:, 'RNA'] = ['smpl_' + b.replace('-1', '') for b in barmap['RNA']] 25 | 26 | # Make sure intersection of all 27 | inter = set(barmap['RNA']) & set(barmap['ATAC']) & set(mdata.obs_names) 28 | msk = barmap['ATAC'].isin(inter) & barmap['RNA'].isin(inter) 29 | barmap = barmap.loc[msk, :].reset_index(drop=True) 30 | mdata = mdata[list(inter), :].copy() 31 | 32 | # Create new fake object 33 | fmdata = mdata[barmap['ATAC'], :].copy() 34 | 35 | # Populate with predicted RNA 36 | fmdata.mod['rna'].X = mdata.mod['rna'][barmap['RNA'].values, :].X 37 | 38 | # Update metadata 39 | obs = barmap.set_index('ATAC') 40 | obs.index.name = None 41 | fmdata.obs = obs 42 | 43 | # Write 44 | fmdata.write(path_output) 45 | -------------------------------------------------------------------------------- /workflow/scripts/dts/fakepair/paircells.R: -------------------------------------------------------------------------------- 1 | library(doParallel) 2 | library(FigR) 3 | library(BSgenome.Hsapiens.UCSC.hg38) 4 | library(SingleCellExperiment) 5 | options("optmatch_max_problem_size" = Inf) 6 | optmatch::setMaxProblemSize(size = Inf) 7 | 8 | 9 | # Parse args 10 | args <- commandArgs(trailingOnly = F) 11 | path_cca <- args[6] 12 | path_ctypes <- args[7] 13 | path_barMap_out <- args[8] 14 | 15 | 16 | # Load Data 17 | CCA_PCs <- readRDS(path_cca) 18 | isATAC <- grepl("^smpl_",rownames(CCA_PCs)) 19 | ATAC_PCs <- CCA_PCs[isATAC,] 20 | RNA_PCs <- CCA_PCs[!isATAC,] 21 | 22 | # Pair with FigR 23 | pairing <- pairCells( 24 | ATAC = ATAC_PCs, 25 | RNA = RNA_PCs, 26 | keepUnique = TRUE 27 | ) 28 | 29 | # Filter paired object 30 | #euc.dist <- function(x1, x2) sqrt(sum((x1 - x2) ^ 2)) 31 | #pairing$dist <- apply(pairing, 1, function(x) { euc.dist(ATAC_PCs[x[1],1:ncol(ATAC_PCs)],RNA_PCs[x[2],1:ncol(RNA_PCs)])}) 32 | pairing <- pairing[order(pairing$dist, decreasing = FALSE), ] 33 | pairing <- pairing[!duplicated(pairing$ATAC),] 34 | #atac_pairing <- pairing[!duplicated(pairing$ATAC),] 35 | #rna_pairing <- pairing[!duplicated(pairing$RNA),] 36 | #pairing <- merge(atac_pairing, rna_pairing) 37 | 38 | # Merge ctype info 39 | ctypes <- read.csv(path_ctypes) 40 | pairing <- merge(pairing, ctypes, by.x='ATAC', by.y='barcode') 41 | pairing['batch'] <- 'smpl' 42 | pairing <- pairing[, c('ATAC', 'RNA', 'batch', 'celltype', 'dist')] 43 | 44 | # Write 45 | write.csv(pairing, path_barMap_out, row.names = FALSE) -------------------------------------------------------------------------------- /workflow/scripts/dts/format_frags.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FILE_PATH=$1 4 | SAMPLE_NAME=$(basename "$FILE_PATH" .frags.tsv.gz) 5 | echo 'File path: ' $FILE_PATH 6 | echo 'Sample id: ' $SAMPLE_NAME 7 | 8 | # Process, modify, compress to bgzip format, and index 9 | zcat "$FILE_PATH" | \ 10 | awk -v sample="$SAMPLE_NAME" '$0 !~ /^#/ {print $1"\t"$2"\t"$3"\t"sample"_"($4 ~ /-1$/ ? 
substr($4, 1, length($4)-2) : $4)"\t"$5}' | \ 11 | bgzip > "${FILE_PATH}_modified.frags.tsv.bgz" 12 | 13 | # Index the bgzipped file with tabix 14 | tabix -p bed "${FILE_PATH}_modified.frags.tsv.bgz" 15 | 16 | # (Optional) Replace original file with the new bgzipped file 17 | mv "${FILE_PATH}_modified.frags.tsv.bgz" "$FILE_PATH" 18 | mv "${FILE_PATH}_modified.frags.tsv.bgz.tbi" "$FILE_PATH.tbi" 19 | -------------------------------------------------------------------------------- /workflow/scripts/dts/heartatlas/heart_annot.py: -------------------------------------------------------------------------------- 1 | import scanpy as sc 2 | import anndata as ad 3 | import pandas as pd 4 | import numpy as np 5 | import argparse 6 | 7 | # Init args 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('-i','--path_atac', required=True) 10 | parser.add_argument('-o','--path_annot', required=True) 11 | args = vars(parser.parse_args()) 12 | 13 | 14 | path_atac = args['path_atac'] 15 | path_annot = args['path_annot'] 16 | 17 | # Get obs 18 | atac = ad.read_h5ad(path_atac).obs 19 | atac = atac[atac['region'] == 'LV'][['combinedID', 'cell_type']].copy() 20 | atac[['sangerID', 'batch']] = atac['combinedID'].str.split('_', expand=True) 21 | atac.index = [b + '_' + i.split('-')[0].split('_')[-1] for i, b in zip(atac.index, atac['batch'])] 22 | atac = atac.rename(columns={'cell_type': 'celltype', 'sangerID': 'sangerid'}) 23 | atac = atac[['celltype', 'batch', 'sangerid']] 24 | ctype_counts = atac.groupby('celltype', as_index=False).size() 25 | ctypes = ctype_counts[ctype_counts['size'] >= 100]['celltype'].values.astype(str) 26 | atac = atac[atac['celltype'].isin(ctypes)] 27 | 28 | # Write 29 | atac.to_csv(path_annot) -------------------------------------------------------------------------------- /workflow/scripts/dts/heartatlas/heartatlas.py: -------------------------------------------------------------------------------- 1 | import os 2 | import scanpy as sc 3 | from pathlib import Path 4 | import pandas as pd 5 | import numpy as np 6 | import anndata as ad 7 | import mudata as md 8 | import argparse 9 | 10 | 11 | # Init args 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('-a','--path_gex', required=True) 14 | parser.add_argument('-b','--path_peaks', required=True) 15 | parser.add_argument('-c','--path_annot', required=True) 16 | parser.add_argument('-e','--path_geneids', required=True) 17 | parser.add_argument('-f','--path_output', required=True) 18 | args = vars(parser.parse_args()) 19 | 20 | path_gex = args['path_gex'] 21 | path_peaks = args['path_peaks'] 22 | path_annot = args['path_annot'] 23 | path_geneids = args['path_geneids'] 24 | path_output = args['path_output'] 25 | 26 | # Read annots 27 | obs = pd.read_csv(path_annot, index_col=0) 28 | sngr_dict = {a: b for a, b in zip(obs['sangerid'], obs['batch'])} 29 | 30 | # Read gene ids 31 | geneids = pd.read_csv(path_geneids).set_index('id')['symbol'].to_dict() 32 | 33 | # Read rna 34 | rna = sc.read_h5ad(path_gex) 35 | rna = rna[rna.obs['sangerID'].isin(sngr_dict.keys()), :].copy() 36 | rna.obs_names = [sngr_dict[s] + '_' + i.replace('-1', '').split('_')[-1] for i, s in zip(rna.obs_names, rna.obs['sangerID'])] 37 | rna.obs = rna.obs[['cell_type']] 38 | #rna.var_names = rna.var['gene_name-new'].astype(str).values 39 | 40 | # Filter faulty gene symbols 41 | msk = rna.var_names.isin(geneids) 42 | rna = rna[:, msk].copy() 43 | msk = np.array([True if geneids[e] == g else False for e, g in zip(rna.var_names, 
rna.var['gene_name-new'])]) 44 | rna = rna[:, msk].copy() 45 | 46 | # Basic QC 47 | sc.pp.filter_cells(rna, min_genes=100) 48 | sc.pp.filter_genes(rna, min_cells=3) 49 | del rna.obs['n_genes'] 50 | 51 | # Remove duplicated genes based on num of cells 52 | to_remove = [] 53 | for dup in rna.var['gene_name-new'].values[rna.var['gene_name-new'].duplicated()]: 54 | tmp = rna.var[rna.var['gene_name-new'] == dup] 55 | max_idx = tmp['n_cells'].idxmax() 56 | to_remove.extend(tmp.index[tmp.index != max_idx].values) 57 | rna = rna[:, ~rna.var_names.isin(to_remove)].copy() 58 | 59 | # Update gene names 60 | rna.var_names = [geneids[g] for g in rna.var_names] 61 | 62 | # Read atac data 63 | atac = ad.read_h5ad(path_peaks) 64 | rna = rna[atac.obs_names].copy() 65 | atac = atac[rna.obs_names].copy() 66 | obs = obs.loc[atac.obs_names] 67 | del rna.obs 68 | del rna.var 69 | del rna.uns 70 | del rna.obsm 71 | del rna.obsp 72 | 73 | # Create mdata 74 | mdata = md.MuData( 75 | {'rna': rna, 'atac': atac,}, 76 | obs=obs 77 | ) 78 | 79 | # Write 80 | mdata.write(path_output) 81 | -------------------------------------------------------------------------------- /workflow/scripts/dts/pbmc10k/pbmc10k.py: -------------------------------------------------------------------------------- 1 | import os 2 | import scanpy as sc 3 | import snapatac2 as snap 4 | from snapatac2.datasets import _datasets, datasets 5 | from pathlib import Path 6 | import pandas as pd 7 | import numpy as np 8 | import anndata as ad 9 | import mudata as md 10 | import argparse 11 | 12 | 13 | # Init args 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('-b','--path_annot', required=True) 16 | parser.add_argument('-c','--path_geneids', required=True) 17 | parser.add_argument('-e','--path_peaks', required=True) 18 | parser.add_argument('-f','--path_output', required=True) 19 | args = vars(parser.parse_args()) 20 | 21 | path_annot = args['path_annot'] 22 | path_geneids = args['path_geneids'] 23 | path_peaks = args['path_peaks'] 24 | path_output = args['path_output'] 25 | 26 | # Read gene ids 27 | geneids = pd.read_csv(path_geneids).set_index('symbol')['id'].to_dict() 28 | 29 | # Change default cache dir 30 | _datasets = datasets() 31 | _datasets.path = Path('/tmp/') 32 | 33 | # Download 34 | rna = snap.read(snap.datasets.pbmc10k_multiome(modality='RNA', type='h5ad'), backed=None) 35 | del rna.obs 36 | rna.var.index.name = None 37 | 38 | # Read annot 39 | obs = pd.read_csv(path_annot, index_col=0) 40 | 41 | # Add celltype annotation 42 | rna.obs_names = ['smpl_' + i.replace('-1', '') for i in rna.obs_names] 43 | rna = rna[obs.index, :].copy() 44 | rna.obs = obs 45 | 46 | # Filter faulty gene symbols 47 | ensmbls = np.array([geneids[g] if g in geneids else '' for g in rna.var_names]) 48 | msk = ensmbls != '' 49 | rna = rna[:, msk].copy() 50 | # Basic QC 51 | sc.pp.filter_cells(rna, min_genes=100) 52 | sc.pp.filter_genes(rna, min_cells=3) 53 | del rna.obs['n_genes'] 54 | # Remove duplicated genes based on num of cells 55 | to_remove = [] 56 | for dup in rna.var.index[rna.var.index.duplicated()]: 57 | tmp = rna.var.loc[dup] 58 | max_idx = tmp.set_index('gene_ids')['n_cells'].idxmax() 59 | to_remove.extend(tmp['gene_ids'][tmp['gene_ids'] != max_idx].values) 60 | rna = rna[:, ~rna.var['gene_ids'].isin(to_remove)].copy() 61 | del rna.obs 62 | del rna.var 63 | 64 | # Read atac data 65 | atac = ad.read_h5ad(path_peaks) 66 | atac = atac[rna.obs_names].copy() 67 | 68 | # Create mdata 69 | mdata = md.MuData( 70 | {'rna': rna, 'atac': atac,}, 71 
| obs=obs 72 | ) 73 | 74 | # Write 75 | mdata.write(path_output) 76 | -------------------------------------------------------------------------------- /workflow/scripts/dts/pbmc10k/prc_annot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import snapatac2 as snap 3 | from snapatac2.datasets import _datasets, datasets 4 | from pathlib import Path 5 | import pandas as pd 6 | import argparse 7 | 8 | 9 | # Init args 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('-a','--path_annot', required=True) 12 | args = vars(parser.parse_args()) 13 | 14 | path_annot = args['path_annot'] 15 | 16 | # Change default cache dir 17 | _datasets = datasets() 18 | _datasets.path = Path('/tmp/') 19 | 20 | # Download 21 | rna = snap.read(snap.datasets.pbmc10k_multiome(modality='RNA', type='h5ad'), backed=None) 22 | 23 | # Extract annot 24 | rna.obs['batch'] = 'smpl' 25 | annot = rna.obs.rename(columns={'cell_type': 'celltype'})[['batch', 'celltype']] 26 | annot.index.name = None 27 | annot.index = ['smpl_' + i.replace('-1', '') for i in annot.index] 28 | 29 | # Write 30 | annot.to_csv(path_annot) 31 | -------------------------------------------------------------------------------- /workflow/scripts/dts/pitunpair/coembedd.R: -------------------------------------------------------------------------------- 1 | library(Signac) 2 | library(EnsDb.Hsapiens.v86) 3 | library(ggplot2) 4 | library(cowplot) 5 | library(dplyr) 6 | library(Seurat) 7 | 8 | 9 | # Parse args 10 | args <- commandArgs(trailingOnly = F) 11 | path_gex <- args[6] 12 | path_celltypes <- args[7] 13 | path_peaks <- args[8] 14 | path_frags <- args[9] 15 | path_cca_out <- args[10] 16 | 17 | 18 | # RNA 19 | rna <- Read10X_h5(path_gex) 20 | data.rna <- CreateSeuratObject(counts = rna, project = "RNA", assay = "RNA") 21 | celltypes <- read.csv(path_celltypes) 22 | cells_to_remove <- Cells(data.rna)[!Cells(data.rna) %in% celltypes$X] 23 | data.rna <- subset(data.rna, cells = setdiff(Cells(data.rna), cells_to_remove)) 24 | data.rna <- NormalizeData(data.rna) 25 | data.rna <- FindVariableFeatures(data.rna) 26 | data.rna <- ScaleData(data.rna) 27 | 28 | # ATAC 29 | atac <- Read10X_h5(path_peaks) 30 | grange.counts <- StringToGRanges(rownames(atac), sep = c(":", "-")) 31 | grange.use <- seqnames(grange.counts) %in% standardChromosomes(grange.counts) 32 | atac <- atac[as.vector(grange.use), ] 33 | annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86) 34 | seqlevelsStyle(annotations) <- 'UCSC' 35 | genome(annotations) <- "hg38" 36 | colnames(atac) <- gsub("-[0-9]+$", "", colnames(atac)) 37 | colnames(atac) <- paste0("smpl_", colnames(atac)) 38 | chrom_assay <- CreateChromatinAssay( 39 | counts = atac, 40 | sep = c(":", "-"), 41 | genome = 'hg38', 42 | fragments = path_frags, 43 | min.cells = 10, 44 | annotation = annotations 45 | ) 46 | data.atac <- CreateSeuratObject(counts = chrom_assay, assay = "ATAC", project = "ATAC") 47 | data.atac <- RunTFIDF(data.atac) 48 | data.atac <- FindTopFeatures(data.atac, min.cutoff = "q0") 49 | data.atac <- ScaleData(data.atac) 50 | 51 | # Infer gene scores 52 | gene.activities <- GeneActivity(data.atac, features = VariableFeatures(data.rna)) 53 | data.atac[["ACTIVITY"]] <- CreateAssayObject(counts = gene.activities) 54 | DefaultAssay(data.atac) <- "ACTIVITY" 55 | data.atac <- NormalizeData(data.atac) 56 | data.atac <- ScaleData(data.atac, features = rownames(data.atac)) 57 | data.atac <- FindVariableFeatures(data.atac) 58 | 59 | # Run CCA 60 | data.cca 
<- RunCCA( 61 | data.rna, 62 | data.atac, 63 | assay1 = "RNA", 64 | assay2 = "ACTIVITY", 65 | num.cc = 50 66 | ) 67 | 68 | CCA_PCs <- Embeddings(data.cca, reduction = "cca") 69 | saveRDS(CCA_PCs, file = path_cca_out) 70 | 71 | -------------------------------------------------------------------------------- /workflow/scripts/dts/pitunpair/paircells.R: -------------------------------------------------------------------------------- 1 | library(doParallel) 2 | library(FigR) 3 | library(BSgenome.Hsapiens.UCSC.hg38) 4 | library(SingleCellExperiment) 5 | options("optmatch_max_problem_size" = Inf) 6 | optmatch::setMaxProblemSize(size = Inf) 7 | 8 | 9 | # Parse args 10 | args <- commandArgs(trailingOnly = F) 11 | path_cca <- args[6] 12 | path_ctypes <- args[7] 13 | path_barMap_out <- args[8] 14 | 15 | 16 | # Load Data 17 | CCA_PCs <- readRDS(path_cca) 18 | isATAC <- grepl("^smpl_",rownames(CCA_PCs)) 19 | ATAC_PCs <- CCA_PCs[isATAC,] 20 | RNA_PCs <- CCA_PCs[!isATAC,] 21 | 22 | # Pair with FigR 23 | pairing <- pairCells( 24 | ATAC = ATAC_PCs, 25 | RNA = RNA_PCs, 26 | keepUnique = TRUE 27 | ) 28 | 29 | # Filter paired object 30 | pairing <- pairing[order(pairing$dist, decreasing = FALSE), ] 31 | pairing <- pairing[!duplicated(pairing$ATAC),] 32 | 33 | # Merge ctype info 34 | ctypes <- read.csv(path_ctypes) 35 | pairing <- merge(pairing, ctypes, by.x='RNA', by.y='X') 36 | pairing['batch'] <- 'smpl' 37 | pairing <- pairing[, c('ATAC', 'RNA', 'batch', 'celltype', 'dist')] 38 | rownames(pairing) <- pairing$ATAC 39 | 40 | # Write 41 | write.csv(pairing, path_barMap_out, row.names = FALSE) 42 | -------------------------------------------------------------------------------- /workflow/scripts/dts/pitunpair/pitunpair.py: -------------------------------------------------------------------------------- 1 | import os 2 | import scanpy as sc 3 | from pathlib import Path 4 | import pandas as pd 5 | import numpy as np 6 | import anndata as ad 7 | import mudata as md 8 | import argparse 9 | 10 | 11 | # Init args 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('-c','--path_geneids', required=True) 14 | parser.add_argument('-e','--path_peaks', required=True) 15 | parser.add_argument('-f','--path_output', required=True) 16 | parser.add_argument('-g','--path_expr', required=True) 17 | parser.add_argument('-i', '--path_barmap', required=True) 18 | args = vars(parser.parse_args()) 19 | 20 | path_barmap = args['path_barmap'] 21 | path_geneids = args['path_geneids'] 22 | path_peaks = args['path_peaks'] 23 | path_output = args['path_output'] 24 | path_expr = args['path_expr'] 25 | 26 | # Read gene ids 27 | geneids = pd.read_csv(path_geneids).set_index('symbol')['id'].to_dict() 28 | 29 | # Read barmap 30 | barmap = pd.read_csv(path_barmap, index_col=0) 31 | barmap.index.name = None 32 | 33 | # Read data 34 | rna = sc.read_10x_h5(path_expr, genome="GRCh38") 35 | del rna.obs 36 | rna.var.index.name = None 37 | 38 | # Filter RNA data based on barmap 39 | rna = rna[barmap['RNA'].values, :] 40 | print(barmap) 41 | rna.obs_names = barmap.index 42 | 43 | # Filter faulty gene symbols 44 | ensmbls = np.array([geneids[g] if g in geneids else '' for g in rna.var_names]) 45 | msk = ensmbls != '' 46 | rna = rna[:, msk].copy() 47 | 48 | # Basic QC 49 | sc.pp.filter_cells(rna, min_genes=100) 50 | sc.pp.filter_genes(rna, min_cells=3) 51 | del rna.obs['n_genes'] 52 | 53 | # Remove duplicated genes based on num of cells 54 | to_remove = [] 55 | for dup in rna.var.index[rna.var.index.duplicated()]: 56 | tmp = rna.var.loc[dup] 
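# Among entries sharing a duplicated symbol, keep the Ensembl id ('gene_ids') detected in the most cells and queue the rest for removal.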
57 | max_idx = tmp.set_index('gene_ids')['n_cells'].idxmax() 58 | to_remove.extend(tmp['gene_ids'][tmp['gene_ids'] != max_idx].values) 59 | rna = rna[:, ~rna.var['gene_ids'].isin(to_remove)].copy() 60 | del rna.var 61 | 62 | # Read atac data 63 | atac = ad.read_h5ad(path_peaks) 64 | 65 | # Filter ATAC data based on barmap and RNA 66 | atac = atac[rna.obs_names, :] 67 | 68 | # Create mdata 69 | mdata = md.MuData( 70 | {'rna': rna, 'atac': atac,}, 71 | obs=barmap 72 | ) 73 | 74 | # Write 75 | mdata.write(path_output) 76 | -------------------------------------------------------------------------------- /workflow/scripts/dts/pitupair/pitupair.py: -------------------------------------------------------------------------------- 1 | import os 2 | import scanpy as sc 3 | import snapatac2 as snap 4 | from snapatac2.datasets import _datasets, datasets 5 | from pathlib import Path 6 | import pandas as pd 7 | import numpy as np 8 | import anndata as ad 9 | import mudata as md 10 | import argparse 11 | 12 | 13 | # Init args 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('-b','--path_annot', required=True) 16 | parser.add_argument('-c','--path_geneids', required=True) 17 | parser.add_argument('-e','--path_peaks', required=True) 18 | parser.add_argument('-f','--path_output', required=True) 19 | parser.add_argument('-g','--path_multi', required=True) 20 | args = vars(parser.parse_args()) 21 | 22 | path_annot = args['path_annot'] 23 | path_geneids = args['path_geneids'] 24 | path_peaks = args['path_peaks'] 25 | path_output = args['path_output'] 26 | path_multi = args['path_multi'] 27 | 28 | # Read gene ids 29 | geneids = pd.read_csv(path_geneids).set_index('symbol')['id'].to_dict() 30 | 31 | # Read annots 32 | obs = pd.read_csv(path_annot, index_col=0) 33 | 34 | # Read data 35 | rna = sc.read_10x_h5(path_multi, genome="GRCh38", gex_only=True) 36 | del rna.obs 37 | rna.var.index.name = None 38 | 39 | # Rename barcodes RNA 40 | sample_id = 'smpl' 41 | rna.obs_names = [sample_id + '_' + o.split('-1')[0] for o in rna.obs_names] 42 | 43 | # Filter faulty gene symbols 44 | ensmbls = np.array([geneids[g] if g in geneids else '' for g in rna.var_names]) 45 | msk = ensmbls != '' 46 | rna = rna[:, msk].copy() 47 | 48 | # Basic QC 49 | sc.pp.filter_cells(rna, min_genes=100) 50 | sc.pp.filter_genes(rna, min_cells=3) 51 | del rna.obs['n_genes'] 52 | 53 | # Remove duplicated genes based on num of cells 54 | to_remove = [] 55 | for dup in rna.var.index[rna.var.index.duplicated()]: 56 | tmp = rna.var.loc[dup] 57 | max_idx = tmp.set_index('gene_ids')['n_cells'].idxmax() 58 | to_remove.extend(tmp['gene_ids'][tmp['gene_ids'] != max_idx].values) 59 | rna = rna[:, ~rna.var['gene_ids'].isin(to_remove)].copy() 60 | del rna.var 61 | del rna.obs 62 | 63 | # Read atac data 64 | atac = ad.read_h5ad(path_peaks) 65 | 66 | # Filter 67 | rna = rna[atac.obs_names, :].copy() 68 | obs = obs.loc[atac.obs_names, :] 69 | 70 | # Create mdata 71 | mdata = md.MuData( 72 | {'rna': rna, 'atac': atac,}, 73 | obs=obs 74 | ) 75 | 76 | # Write 77 | mdata.write(path_output) 78 | -------------------------------------------------------------------------------- /workflow/scripts/dts/reprofibro/prc_annot.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import zipfile 4 | import argparse 5 | 6 | 7 | # Init args 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('-a','--path_annot', required=True) 10 | args = vars(parser.parse_args()) 11 | 12 
| path_annot = args['path_annot'] 13 | 14 | archive = zipfile.ZipFile(path_annot, 'r') 15 | obs = pd.read_csv(archive.open('multiome/snATAC/cells.tsv'), sep='\t') 16 | obs = obs[['barcode', 'sample', 'cluster']].set_index('barcode').rename(columns={'sample': 'batch', 'cluster': 'celltype'}) 17 | obs.index.name = None 18 | obs = obs[obs['batch'] != 'D2'] 19 | annot = { 20 | 1: 'Fibroblast', 21 | 2: 'Fibroblast-like', 22 | 3: 'Fibroblast-like', 23 | 4: 'Fibroblast-like', 24 | 5: 'Fibroblast-like', 25 | 6: 'Keratinocyte-like', 26 | 7: 'hOSK', 27 | 8: 'xOSK', 28 | 9: 'Intermediate', 29 | 10: 'Partially-reprogrammed', 30 | 11: 'Intermediate', 31 | 12: 'Intermediate', 32 | 13: 'Pre-iPSC', 33 | 14: 'Pre-iPSC', 34 | 15: 'iPSC', 35 | } 36 | obs['celltype'] = [annot[c] for c in obs['celltype']] 37 | obs.to_csv(path_annot) 38 | -------------------------------------------------------------------------------- /workflow/scripts/mth/celloracle/mdl.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt # Import else compiling error 2 | import numpy as np 3 | import pandas as pd 4 | import muon as mu 5 | import celloracle as co 6 | import os 7 | import argparse 8 | 9 | 10 | # Init args 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('-m','--path_mdata', required=True) 13 | parser.add_argument('-g','--path_p2g', required=True) 14 | parser.add_argument('-t','--path_tfb', required=True) 15 | parser.add_argument('-a','--alpha', required=True) 16 | parser.add_argument('-p','--pthr', required=True) 17 | parser.add_argument('-n','--top_n', required=True) 18 | parser.add_argument('-o','--path_out', required=True) 19 | args = vars(parser.parse_args()) 20 | 21 | path_mdata = args['path_mdata'] 22 | path_p2g = args['path_p2g'] 23 | path_tfb = args['path_tfb'] 24 | alpha = float(args['alpha']) 25 | pthr = float(args['pthr']) 26 | top_n = int(args['top_n']) 27 | path_out = args['path_out'] 28 | 29 | # Process base GRN 30 | p2g = pd.read_csv(path_p2g) 31 | tfb = pd.read_csv(path_tfb) 32 | if (p2g.shape[0] == 0) or (tfb.shape[0] == 0): 33 | grn = pd.DataFrame(columns=['source', 'target', 'score', 'pval']) 34 | grn.to_csv(path_out, index=False) 35 | exit() 36 | tfb['score'] = 1 37 | p2g = p2g[['cre', 'gene']] 38 | base_grn = pd.merge( 39 | p2g, 40 | tfb 41 | .pivot(index='cre', columns='tf') 42 | .fillna(0) 43 | .droplevel(0, axis=1) 44 | .reset_index() 45 | ) 46 | base_grn = base_grn.rename(columns={'cre': 'peak_id', 'gene': 'gene_short_name'}) 47 | base_grn['peak_id'] = base_grn['peak_id'].str.replace('-', '_') 48 | 49 | # Init oracle object 50 | oracle = co.Oracle() 51 | oracle.adata = mu.read(path_mdata)['rna'].copy() 52 | oracle.adata.obsm['X_umap'] = np.zeros((oracle.adata.shape[0], 2)) 53 | oracle.adata.layers['imputed_count'] = oracle.adata.X 54 | oracle.adata.obs['cluster'] = 'cluster' 55 | oracle.cluster_column_name = 'cluster' 56 | oracle.embedding_name = 'X_umap' 57 | oracle.pcs = np.zeros((oracle.adata.shape[0], 2)) 58 | oracle.knn = True 59 | oracle.k_knn_imputation = True 60 | oracle.import_TF_data(TF_info_matrix=base_grn) 61 | 62 | # Model TF ~ G 63 | print('Modeling GRN...') 64 | links = oracle.get_links( 65 | cluster_name_for_GRN_unit="cluster", 66 | alpha=alpha, 67 | n_jobs=32, 68 | ) 69 | print('Modeling Done!') 70 | print('Filtering links...') 71 | links.filter_links( 72 | p=pthr, 73 | weight="coef_abs", 74 | threshold_number=top_n 75 | ) 76 | print('Filtering done!') 77 | 78 | # Extract grn 79 | grn = 
links.filtered_links['cluster'].dropna()[['source', 'target', 'coef_mean', 'p']] 80 | grn = grn.rename(columns={'coef_mean': 'score', 'p': 'pval'}) 81 | grn = grn.sort_values(['source', 'target', 'pval']) 82 | 83 | # Write 84 | grn.to_csv(path_out, index=False) 85 | 86 | print('Done') 87 | os._exit(0) # Add this else it gets stuck -------------------------------------------------------------------------------- /workflow/scripts/mth/celloracle/p2g.R: -------------------------------------------------------------------------------- 1 | library(cicero) 2 | library(rhdf5) 3 | 4 | 5 | # Parse args 6 | args <- commandArgs(trailingOnly = F) 7 | path_data <- args[6] 8 | path_genome <- args[7] 9 | ext <- as.numeric(args[8]) 10 | path_all_peaks <- args[9] 11 | path_connections <- args[10] 12 | 13 | # Read genome 14 | org <- sub('^dbs/([^/]+)/.*$', '\\1', path_genome) 15 | path_chr_sizes <- file.path(path_genome, org, sprintf('%s.fa.sizes', org)) 16 | genome <- read.table(path_chr_sizes) 17 | 18 | # Process mudata 19 | indata <- H5Fopen(path_data, flags='H5F_ACC_RDONLY') 20 | data <- indata$mod$atac$X 21 | barcodes <- indata$mod$atac$obs$`_index` 22 | peaks <- indata$mod$atac$var$`_index` 23 | h5closeAll() 24 | 25 | # Format cell info 26 | cellinfo <- data.frame(row.names=barcodes, cells=barcodes) 27 | 28 | # Format peak info 29 | peakinfo <- data.frame(row.names=peaks, site_name=peaks) 30 | peakinfo <- tidyr::separate(data = peakinfo, col = 'site_name', into = c("chr", "bp1", "bp2"), sep = "-", remove=FALSE) 31 | 32 | # Add names 33 | row.names(data) <- row.names(peakinfo) 34 | colnames(data) <- row.names(cellinfo) 35 | 36 | # Make CDS 37 | input_cds <- suppressWarnings( 38 | new_cell_data_set(data, 39 | cell_metadata = cellinfo, 40 | gene_metadata = peakinfo) 41 | ) 42 | 43 | # Data preprocessing 44 | set.seed(2017) 45 | 46 | # Run cicero 47 | print("Starting Cicero") 48 | print("Calculating distance_parameter value") 49 | distance_parameters <- estimate_distance_parameter( 50 | input_cds, 51 | window=ext, 52 | maxit=100, 53 | sample_num = 100, 54 | distance_constraint = round(ext / 2), 55 | distance_parameter_convergence = 1e-22, 56 | genomic_coords = genome 57 | ) 58 | mean_distance_parameter <- mean(unlist(distance_parameters)) 59 | print("Running models") 60 | cicero_out <- generate_cicero_models( 61 | input_cds, 62 | distance_parameter = mean_distance_parameter, 63 | window = ext, 64 | genomic_coords = genome 65 | ) 66 | print("Assembling connections") 67 | conns <- assemble_connections(cicero_out, silent=FALSE) 68 | 69 | # Save 70 | all_peaks <- row.names(exprs(input_cds)) 71 | write.csv(x = all_peaks, file = file.path(path_all_peaks)) 72 | write.csv(x = conns, file = file.path(path_connections)) 73 | -------------------------------------------------------------------------------- /workflow/scripts/mth/celloracle/p2g.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt # Import else compiling error 2 | import pandas as pd 3 | import numpy as np 4 | from celloracle import motif_analysis as ma 5 | import celloracle as co 6 | import mudata as mu 7 | import os 8 | import re 9 | import argparse 10 | 11 | 12 | # Init args 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('-d','--path_data', required=True) 15 | parser.add_argument('-a','--all_peaks', required=True) 16 | parser.add_argument('-c','--connections', required=True) 17 | parser.add_argument('-o','--organism', required=True) 18 | 
parser.add_argument('-t','--thr', required=True) 19 | parser.add_argument('-p','--path_out', required=True) 20 | args = vars(parser.parse_args()) 21 | 22 | path_data = args['path_data'] 23 | path_all_peaks = args['all_peaks'] 24 | path_connections = args['connections'] 25 | organism = args['organism'] 26 | thr_coaccess = float(args['thr']) 27 | path_out = args['path_out'] 28 | 29 | # Process organism 30 | organism = re.search(r'^dbs/([^/]+)/.*$', organism).group(1) 31 | 32 | # Load scATAC-seq peak list 33 | peaks = pd.read_csv(path_all_peaks, index_col=0).x.values.astype('U') 34 | peaks = np.char.replace(peaks, '-', '_') 35 | 36 | # Load Cicero coaccessibility scores 37 | cicero_connections = pd.read_csv(path_connections, index_col=0) 38 | cicero_connections['Peak1'] = np.char.replace(cicero_connections['Peak1'].values.astype('U'), '-', '_') 39 | cicero_connections['Peak2'] = np.char.replace(cicero_connections['Peak2'].values.astype('U'), '-', '_') 40 | 41 | # Extract tss information 42 | tss_annotated = ma.get_tss_info( 43 | peak_str_list=peaks, 44 | ref_genome=organism 45 | ) 46 | 47 | # Integrate 48 | integrated = ma.integrate_tss_peak_with_cicero( 49 | tss_peak=tss_annotated, 50 | cicero_connections=cicero_connections 51 | ) 52 | 53 | # Process 54 | integrated = integrated[integrated['coaccess'] >= thr_coaccess] 55 | integrated['peak_id'] = integrated['peak_id'].str.replace('_', '-') 56 | integrated = integrated.rename(columns={'peak_id': 'cre', 'gene_short_name': 'gene', 'coaccess': 'score'}) 57 | integrated = integrated.sort_values(['cre', 'score'], ascending=[True, False]) 58 | 59 | # Remove unexpressed genes 60 | genes = mu.read(os.path.join(path_data, 'rna')).var.index.values.astype('U') 61 | integrated = integrated[integrated['gene'].isin(genes)] 62 | 63 | # Write 64 | integrated.to_csv(path_out, index=False) 65 | -------------------------------------------------------------------------------- /workflow/scripts/mth/celloracle/pre.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import pandas as pd 3 | import numpy as np 4 | import celloracle as co 5 | import muon as mu 6 | import scipy 7 | import os 8 | import argparse 9 | 10 | 11 | # Init args 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('-i','--path_input', required=True) 14 | parser.add_argument('-k','--knn', required=True) 15 | parser.add_argument('-o','--path_out', required=True) 16 | args = vars(parser.parse_args()) 17 | 18 | path_input = args['path_input'] 19 | k = int(args['knn']) 20 | path_out = args['path_out'] 21 | 22 | # Read rna adata 23 | mdata = mu.read(path_input) 24 | 25 | # Extract raw counts data and assign labels 26 | adata = mdata.mod['rna'].copy() 27 | adata.layers['lognorm'] = adata.X.copy() 28 | adata.X = adata.layers['counts'].copy() 29 | adata.obs['celltype'] = mdata.obs['celltype'] 30 | adata.obsm['X_pca'] = mdata.obsm['X_spectral'] 31 | 32 | # Instantiate Oracle object 33 | oracle = co.Oracle() 34 | oracle.import_anndata_as_raw_count( 35 | adata=adata, 36 | cluster_column_name="celltype", 37 | embedding_name="X_pca" 38 | ) 39 | 40 | # Compute PCA and select top pcs 41 | oracle.perform_PCA() 42 | n_comps = np.where(np.diff(np.diff(np.cumsum(oracle.pca.explained_variance_ratio_))>0.002))[0][0] 43 | n_comps = min(n_comps, 50) 44 | 45 | # Run imputation 46 | oracle.knn_imputation( 47 | n_pca_dims=n_comps, 48 | k=k, 49 | balanced=True, 50 | b_sight=k*8, 51 | b_maxl=k*4, 52 | n_jobs=os.cpu_count(), 53 | ) 54 | 55 
| # Update object with imputet counts 56 | mdata['rna'].X = oracle.adata.layers['imputed_count'] 57 | 58 | # Write 59 | mdata.write(path_out) 60 | -------------------------------------------------------------------------------- /workflow/scripts/mth/celloracle/src.R: -------------------------------------------------------------------------------- 1 | library(cicero) 2 | library(rhdf5) 3 | 4 | 5 | # Parse args 6 | args <- commandArgs(trailingOnly = F) 7 | path_data <- args[6] 8 | path_genome <- args[7] 9 | ext <- as.numeric(args[8]) 10 | path_all_peaks <- args[9] 11 | path_connections <- args[10] 12 | 13 | # Read genome 14 | org <- sub('^dbs/([^/]+)/.*$', '\\1', path_genome) 15 | path_chr_sizes <- file.path(path_genome, org, sprintf('%s.fa.sizes', org)) 16 | genome <- read.table(path_chr_sizes) 17 | 18 | # Process mudata 19 | indata <- H5Fopen(path_data, flags='H5F_ACC_RDONLY') 20 | data <- indata$mod$atac$X 21 | barcodes <- indata$mod$atac$obs$`_index` 22 | peaks <- indata$mod$atac$var$`_index` 23 | h5closeAll() 24 | 25 | # Format cell info 26 | cellinfo <- data.frame(row.names=barcodes, cells=barcodes) 27 | 28 | # Format peak info 29 | peakinfo <- data.frame(row.names=peaks, site_name=peaks) 30 | peakinfo <- tidyr::separate(data = peakinfo, col = 'site_name', into = c("chr", "bp1", "bp2"), sep = "-", remove=FALSE) 31 | 32 | # Add names 33 | row.names(data) <- row.names(peakinfo) 34 | colnames(data) <- row.names(cellinfo) 35 | 36 | # Binarize 37 | data[data != 0] <- 1 38 | 39 | # Make CDS 40 | input_cds <- suppressWarnings( 41 | new_cell_data_set(data, 42 | cell_metadata = cellinfo, 43 | gene_metadata = peakinfo) 44 | ) 45 | 46 | # Data preprocessing 47 | set.seed(2017) 48 | input_cds <- estimate_size_factors(input_cds) 49 | input_cds <- preprocess_cds(input_cds, method = "LSI") 50 | 51 | # Dimensional reduction with umap 52 | input_cds <- reduce_dimension( 53 | input_cds, 54 | reduction_method = 'UMAP', 55 | preprocess_method = "LSI" 56 | ) 57 | umap_coords <- reducedDims(input_cds)$UMAP 58 | 59 | # Build "metacells" 60 | cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = umap_coords) 61 | 62 | # Run cicero 63 | print("Starting Cicero") 64 | print("Calculating distance_parameter value") 65 | distance_parameters <- estimate_distance_parameter( 66 | input_cds, 67 | window=ext, 68 | maxit=100, 69 | sample_num = 100, 70 | distance_constraint = round(ext / 2), 71 | distance_parameter_convergence = 1e-22, 72 | genomic_coords = genome 73 | ) 74 | mean_distance_parameter <- mean(unlist(distance_parameters)) 75 | print("Running models") 76 | cicero_out <- generate_cicero_models( 77 | input_cds, 78 | distance_parameter = mean_distance_parameter, 79 | window = ext, 80 | genomic_coords = genome 81 | ) 82 | print("Assembling connections") 83 | conns <- assemble_connections(cicero_out, silent=FALSE) 84 | 85 | # Save 86 | all_peaks <- row.names(exprs(input_cds)) 87 | write.csv(x = all_peaks, file = file.path(path_all_peaks)) 88 | write.csv(x = conns, file = file.path(path_connections)) 89 | -------------------------------------------------------------------------------- /workflow/scripts/mth/dictys/before_mdl.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import mudata as mu 4 | import sys 5 | import os 6 | import dictys 7 | 8 | 9 | # Read and process gex 10 | rna_path = os.path.join(sys.argv[1], 'mod', 'rna') 11 | rna = mu.read(rna_path) 12 | rna.X = rna.layers['counts'] 13 | rna = rna.to_df().T 14 | 
rna.to_csv(sys.argv[2], sep='\t', compression='gzip') 15 | name_pre = sys.argv[1].split('/runs/')[1].split('.')[0] 16 | if 'dictys' not in name_pre: 17 | dictys.preproc.qc_reads(sys.argv[2], sys.argv[2], 50, 10, 0, 200, 100, 0) 18 | rna = pd.read_csv(sys.argv[2], header=0, index_col=0, sep='\t') 19 | 20 | # Read and process peaks 21 | use_peaks = bool(sys.argv[3]) 22 | if use_peaks: 23 | peaks = pd.read_csv(sys.argv[4])['cre'].unique() 24 | else: 25 | atac_path = os.path.join(sys.argv[1], 'mod', 'atac') 26 | peaks = mu.read(atac_path).var_names 27 | peaks = np.array([p.replace('-', ':') for p in peaks]) 28 | peaks = pd.DataFrame(np.zeros((peaks.size, 1)), index=peaks, columns=['placeholder']) 29 | peaks.to_csv(sys.argv[5], sep='\t', compression='gzip') 30 | 31 | # Read tfb 32 | tfb = pd.read_csv(sys.argv[6]) 33 | tfb['cre'] = tfb['cre'].str.replace('-', ':') 34 | tfb = tfb[tfb['tf'].isin(rna.index) & tfb['cre'].isin(peaks.index)] 35 | output_tfb = tfb.rename(columns={'cre': 'loc', 'tf': 'TF'})[['TF', 'loc', 'score']] 36 | output_tfb.to_csv(sys.argv[7], sep='\t', index=False) 37 | -------------------------------------------------------------------------------- /workflow/scripts/mth/dictys/extract_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import mudata as mu 4 | import os 5 | import argparse 6 | 7 | 8 | parser = argparse.ArgumentParser(description="", usage="") 9 | parser.add_argument('--pre_path', required=True) 10 | parser.add_argument('--p2g_path', required=True) 11 | parser.add_argument('--exp_path', required=True) 12 | parser.add_argument('--pks_path', required=True) 13 | parser.add_argument('--use_p2g' , required=True) 14 | 15 | args = vars(parser.parse_args()) 16 | pre_path = args['pre_path'] 17 | p2g_path = args['p2g_path'] 18 | exp_path = args['exp_path'] 19 | pks_path = args['pks_path'] 20 | use_p2g = args['use_p2g' ] 21 | 22 | 23 | # Write the RNA matrix 24 | pre_type = os.path.basename(pre_path).split('.')[0] 25 | data = mu.read(pre_path) 26 | rna_X = pd.DataFrame(np.array(data['rna'].layers['counts'].todense()).T, columns=data['rna'].obs.index, index=data['rna'].var.index) 27 | rna_X.to_csv(exp_path, sep="\t", compression="gzip") 28 | 29 | if use_p2g: 30 | # Read in p2g and keep only peaks that are wide enough for footprinting 31 | all_atac_peak = np.unique(pd.read_csv(p2g_path)['cre']) 32 | else: 33 | # From the consensus peak list, keep only peaks that are wide enough for footprinting 34 | all_atac_peak = np.unique([n.replace(':', '-') for n in data['atac'].var.index]) 35 | 36 | all_atac_peak = pd.DataFrame([n.split('-') for n in all_atac_peak]) 37 | all_atac_peak.columns = ['chr', 'srt', 'end'] 38 | all_atac_peak['srt'] = all_atac_peak['srt'].astype(int) 39 | all_atac_peak['end'] = all_atac_peak['end'].astype(int) 40 | all_atac_peak = all_atac_peak[(all_atac_peak.end - all_atac_peak.srt) >= 100] 41 | all_atac_peak = all_atac_peak.sort_values(by=['chr', 'srt', 'end']) 42 | all_atac_peak.to_csv(pks_path, sep='\t', header=False, index=False) 43 | 44 | # Store clusters 45 | clus = sorted(data.obs['celltype'].unique()) 46 | for c in clus: 47 | if pre_type == 'granie': 48 | ctype_ids = data['rna'].uns['rna_b_per_c'][c] 49 | else: 50 | ctype_ids = data[data.obs['celltype'] == c].obs.index 51 | c = c.replace(' ', '_') 52 | with open(os.path.join(os.path.dirname(exp_path), f'barcodes_{c}.txt'), "w") as f: 53 | for i in ctype_ids: 54 | f.write(f"{i}\n") 55 | 
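# Note: the loop above writes one barcodes_<celltype>.txt file per cluster next to the expression matrix (spaces in cluster names are replaced by underscores); these headerless, single-column barcode lists are the kind of input frag_to_bam.py reads via --barcodes.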
-------------------------------------------------------------------------------- /workflow/scripts/mth/dictys/frag_to_bam.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import argparse, os, sys 3 | import gzip 4 | 5 | 6 | parser = argparse.ArgumentParser(description="Splits fragment file by annotated cell clusters and builds .bam file", usage="") 7 | parser.add_argument('--fnames', required=True, nargs='+') 8 | parser.add_argument('--barcodes', required=True) 9 | 10 | args = vars(parser.parse_args()) 11 | atac_fnames = args['fnames'] 12 | barcodes = args['barcodes'] 13 | 14 | fwflag = 99 # 1 + 2 + 32 + 64 15 | bwflag = 147 # 1 + 2 + 16 + 128 16 | mapq = 60 17 | rnext = '=' 18 | lshift = +4 19 | rshift = -5 20 | seqlen = 50 21 | cigar = f'{seqlen}M' 22 | seq = 'N' * seqlen 23 | qual = 'F' * seqlen 24 | valid_chr = [f"chr{i}" for i in range(1,23)] + ['chrX', 'chrY'] 25 | valid_chr = dict([(i,0) for i in valid_chr]) 26 | 27 | sam_header_string = """@HD SO:coordinate 28 | @SQ SN:chr1 LN:248956422 29 | @SQ SN:chr10 LN:133797422 30 | @SQ SN:chr11 LN:135086622 31 | @SQ SN:chr12 LN:133275309 32 | @SQ SN:chr13 LN:114364328 33 | @SQ SN:chr14 LN:107043718 34 | @SQ SN:chr15 LN:101991189 35 | @SQ SN:chr16 LN:90338345 36 | @SQ SN:chr17 LN:83257441 37 | @SQ SN:chr18 LN:80373285 38 | @SQ SN:chr19 LN:58617616 39 | @SQ SN:chr2 LN:242193529 40 | @SQ SN:chr20 LN:64444167 41 | @SQ SN:chr21 LN:46709983 42 | @SQ SN:chr22 LN:50818468 43 | @SQ SN:chr3 LN:198295559 44 | @SQ SN:chr4 LN:190214555 45 | @SQ SN:chr5 LN:181538259 46 | @SQ SN:chr6 LN:170805979 47 | @SQ SN:chr7 LN:159345973 48 | @SQ SN:chr8 LN:145138636 49 | @SQ SN:chr9 LN:138394717 50 | @SQ SN:chrX LN:156040895 51 | @SQ SN:chrY LN:57227415 52 | """ 53 | 54 | def format_sam(s, barcodes): 55 | [chrom, srt, end, bc, rpt] = s.strip().split('\t') 56 | if (chrom.lower() not in valid_chr) or (bc not in barcodes): 57 | return 58 | qname = f"{chrom}:{srt}:{end}:{bc}" 59 | fwpos = int(srt) - lshift + 1 # fragment is 0-index, sam is 1-index (bam is 0-index) 60 | bwpos = int(end) - rshift + 1 - seqlen # reverse strand, left-most position 61 | tlen = bwpos + seqlen - fwpos 62 | for c in range(int(rpt)): 63 | sys.stdout.write(f"{qname}:{c}\t{fwflag}\t{chrom}\t{fwpos}\t{mapq}\t" + 64 | f"{cigar}\t{rnext}\t{bwpos}\t{tlen}\t{seq}\t{qual}\tCB:Z:{bc}\n") 65 | sys.stdout.write(f"{qname}:{c}\t{bwflag}\t{chrom}\t{bwpos}\t{mapq}\t" + 66 | f"{cigar}\t{rnext}\t{fwpos}\t{tlen*-1}\t{seq}\t{qual}\tCB:Z:{bc}\n") 67 | 68 | def filter_fragment_file(atac_fname, barcodes): 69 | with gzip.open(atac_fname, 'rt', encoding='utf-8') as f: 70 | for line in f: 71 | format_sam(line, barcodes) 72 | 73 | sys.stdout.write(sam_header_string) 74 | barcodes = set(pd.read_csv(barcodes, header=None)[0].values) 75 | for atac_fname in atac_fnames: 76 | filter_fragment_file(atac_fname, barcodes) 77 | 78 | -------------------------------------------------------------------------------- /workflow/scripts/mth/dictys/mdl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # Parse command-line arguments 5 | while [[ "$#" -gt 0 ]]; do 6 | case $1 in 7 | --output_d) output_d="$2"; shift ;; 8 | --pre_path) pre_path="$2"; shift ;; 9 | --p2g_path) p2g_path="$2"; shift ;; 10 | --tfb_path) tfb_path="$2"; shift ;; 11 | --annot) annot="$2"; shift ;; 12 | --distance) distance="$2"; shift ;; 13 | --n_p2g_links) n_p2g_links="$2"; shift ;; 14 | --threads) threads="$2"; shift ;; 15 | --device) 
device="$2"; shift ;; 16 | --thr_score) thr_score="$2"; shift ;; 17 | --use_p2g) use_p2g="$2"; shift ;; 18 | --out_path) out_path="$2"; shift ;; 19 | *) echo "Unknown parameter passed: $1"; exit 1 ;; 20 | esac 21 | shift 22 | done 23 | 24 | if [ $(wc -l < $p2g_path) -eq 1 ] || [ $(wc -l < $tfb_path) -eq 1 ] || [ $(basename $pre_path | grep -q '^granie'; echo $?) -eq 0 ]; then 25 | echo "source,target,score,pval" > "$out_path" 26 | mkdir -p "$output_d" 27 | exit 0 28 | fi && \ 29 | mkdir -p "$output_d" && \ 30 | python -c "import torch; print('Cuda enabled:', torch.cuda.is_available())" && \ 31 | python workflow/scripts/mth/dictys/before_mdl.py $pre_path $output_d/expr.tsv.gz $use_p2g $p2g_path $output_d/peaks.tsv.gz $tfb_path $output_d/tfb.tsv.gz && \ 32 | python -m dictys chromatin tssdist --cut $distance $output_d/expr.tsv.gz $output_d/peaks.tsv.gz $annot $output_d/tssdist.tsv.gz && \ 33 | echo 'Finished tssdist' && \ 34 | python -m dictys chromatin linking $output_d/tfb.tsv.gz $output_d/tssdist.tsv.gz $output_d/linking.tsv.gz && \ 35 | echo 'Finished chromatin linking' && \ 36 | python -m dictys chromatin binlinking $output_d/linking.tsv.gz $output_d/binlinking.tsv.gz $n_p2g_links && \ 37 | echo 'Finished chromatin binlinking' && \ 38 | python -m dictys network reconstruct --device $device --nth $threads $output_d/expr.tsv.gz $output_d/binlinking.tsv.gz $output_d/net_weight.tsv.gz $output_d/net_meanvar.tsv.gz $output_d/net_covfactor.tsv.gz $output_d/net_loss.tsv.gz $output_d/net_stats.tsv.gz && \ 39 | echo 'Finished network reconstruct' && \ 40 | python -m dictys network normalize --nth $threads $output_d/net_weight.tsv.gz $output_d/net_meanvar.tsv.gz $output_d/net_covfactor.tsv.gz $output_d/net_nweight.tsv.gz && \ 41 | echo 'Finished network normalize' && \ 42 | python -c "import pandas as pd, numpy as np, sys, os; \ 43 | weights = pd.read_csv(sys.argv[1], sep='\t', index_col=0); \ 44 | mask = pd.read_csv(sys.argv[2], sep='\t', index_col=0); \ 45 | mask = mask.loc[weights.index, weights.columns]; \ 46 | df = [(weights.index[i], weights.columns[j], weights.iloc[i, j]) for i in np.arange(weights.shape[0]) for j in np.arange(weights.shape[1]) if mask.iloc[i, j]]; \ 47 | df = np.array(df); \ 48 | df = pd.DataFrame(df, columns=['source', 'target', 'score']); \ 49 | df['pval'] = 0.01; \ 50 | df['score'] = df['score'].astype(float); \ 51 | df = df[df['score'].abs() > float(sys.argv[3])]; \ 52 | df.to_csv(sys.argv[4], index=False)" $output_d/net_nweight.tsv.gz $output_d/binlinking.tsv.gz $thr_score $out_path 53 | -------------------------------------------------------------------------------- /workflow/scripts/mth/dictys/p2g.py: -------------------------------------------------------------------------------- 1 | import argparse, os, sys 2 | import pandas as pd 3 | import numpy as np 4 | import mudata as md 5 | 6 | 7 | # Init args 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('-d', '--path_data', required=True) 10 | parser.add_argument('-t', '--tmp_path', required=True) 11 | parser.add_argument('-p', '--path_out', required=True) 12 | parser.add_argument('-g', '--gene_annotation', required=True) 13 | parser.add_argument('-e', '--ext', required=True) 14 | args = vars(parser.parse_args()) 15 | 16 | path_data = args['path_data'] 17 | tmp_path = args['tmp_path'] 18 | annot = args['gene_annotation'] 19 | path_out = args['path_out'] 20 | distance = int(args['ext']) 21 | 22 | 23 | # Write the RNA matrix and ATAC matrix to working directory 24 | rna_filename = os.path.join(tmp_path, 
"expression.tsv.gz") 25 | atac_filename = os.path.join(tmp_path, "atac_peak.tsv.gz") 26 | dist_filename = os.path.join(tmp_path, "tssdist.tsv.gz") 27 | data = md.read(path_data) 28 | rna_X = pd.DataFrame(np.array(data['rna'].X).T, columns=data['rna'].obs.index, index=data['rna'].var.index) 29 | rna_X.to_csv(rna_filename, sep="\t", compression="gzip") 30 | 31 | atac_peak_names = [n.replace('-', ':') for n in data['atac'].var.index] 32 | atac_X = pd.DataFrame(np.zeros((data['atac'].var.index.shape[0], 1)), index=atac_peak_names, columns=['placeholder']) 33 | atac_X.to_csv(atac_filename, sep="\t", compression="gzip") 34 | 35 | # Identify all peaks that are within Xbp of annotated TSS 36 | os.system(f'python3 -m dictys chromatin tssdist --cut {distance} {rna_filename} {atac_filename} {annot} {dist_filename}') 37 | 38 | # Convert distance to score for p2g 39 | df = pd.read_csv(dist_filename, sep='\t').rename(columns={'region': 'cre', 'target': 'gene', 'dist': 'score'}) 40 | df['score'] = -np.abs(df['score']) 41 | df['cre'] = df['cre'].str.replace(':', '-') 42 | df = df.sort_values('score', ascending=False).reset_index(drop=True).reset_index(names='rank') 43 | df['score'] = (1 - (df['rank'] / df['rank'].max())) 44 | df[['cre', 'gene', 'score']].to_csv(path_out, index=False) 45 | -------------------------------------------------------------------------------- /workflow/scripts/mth/dictys/pre.py: -------------------------------------------------------------------------------- 1 | import argparse, os, sys 2 | import pandas as pd 3 | import numpy as np 4 | import mudata as md 5 | import dictys 6 | 7 | 8 | # Init args 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('-m','--mudata_path', required=True) 11 | parser.add_argument('-t','--tmp_path', required=True) 12 | parser.add_argument('-o','--out_path', required=True) 13 | args = vars(parser.parse_args()) 14 | 15 | mudata_path = args['mudata_path'] 16 | tmp_path = args['tmp_path'] 17 | out_path = args['out_path'] 18 | 19 | # Read 20 | mdata = md.read(mudata_path) 21 | 22 | # Process rna 23 | pd.DataFrame( 24 | np.array(mdata.mod['rna'].layers['counts'].todense()).T, 25 | columns=mdata.mod['rna'].obs.index, 26 | index=mdata.mod['rna'].var.index 27 | ).to_csv(tmp_path, sep="\t", compression="gzip") 28 | 29 | dictys.preproc.qc_reads(tmp_path, tmp_path, 50, 10, 0, 200, 100, 0) 30 | rna_df = pd.read_csv(tmp_path, sep='\t', compression="gzip", index_col=0) 31 | genes, barcodes = rna_df.index.values.astype('U'), rna_df.columns.values.astype('U') 32 | rna = mdata.mod['rna'] 33 | rna = rna[barcodes, :][:, genes].copy() 34 | rna.X = rna.layers['counts'].todense().A.copy() 35 | 36 | # Process atac 37 | atac = mdata.mod['atac'] 38 | atac.X = atac.layers['counts'].todense().A.copy() 39 | 40 | # Update 41 | mdata.mod['rna'] = rna 42 | mdata.mod['atac'] = atac 43 | mdata.update() 44 | 45 | # Write 46 | mdata.write(out_path) 47 | -------------------------------------------------------------------------------- /workflow/scripts/mth/figr/pre.R: -------------------------------------------------------------------------------- 1 | library(rhdf5) 2 | library(dplyr) 3 | library(doParallel) 4 | 5 | # Parse args 6 | args <- commandArgs(trailingOnly = F) 7 | path_data <- args[6] 8 | nCores <- as.numeric(args[7]) 9 | 10 | # Read data 11 | print('Open object') 12 | indata <- H5Fopen(path_data) 13 | 14 | # RNA 15 | rna_data <- as.data.frame(indata$mod$rna$X) 16 | colnames(rna_data) <- indata$obs$`_index` 17 | rownames(rna_data) <- indata$mod$rna$var$`_index` 18 | 
19 | # ATAC 20 | atac_data <- Matrix::sparseMatrix( 21 | i=indata$mod$atac$layers$counts$indices, 22 | p=indata$mod$atac$layers$counts$indptr, 23 | x=as.numeric(indata$mod$atac$layers$counts$data), 24 | index1 = FALSE 25 | ) 26 | colnames(atac_data) <- indata$obs$`_index` 27 | rownames(atac_data) <- indata$mod$atac$var$`_index` 28 | 29 | # Normalize ATAC data 30 | atac_data <- as.matrix(FigR::centerCounts(atac_data, chunkSize = 100000)) 31 | colnames(atac_data) <- as.character(colnames(atac_data)) 32 | rownames(atac_data) <- as.character(rownames(atac_data)) 33 | 34 | # Write 35 | h5write(atac_data, name="mod/atac/X", file=indata) 36 | 37 | # Close 38 | h5closeAll() 39 | -------------------------------------------------------------------------------- /workflow/scripts/mth/granie/pre.R: -------------------------------------------------------------------------------- 1 | library(rhdf5) 2 | 3 | # Parse args 4 | args <- commandArgs(trailingOnly = F) 5 | path_data <- args[6] 6 | 7 | # Read data 8 | print('Open object') 9 | indata <- H5Fopen(path_data) 10 | 11 | # RNA 12 | rna_data <- indata$mod$rna$X 13 | colnames(rna_data) <- indata$obs$`_index` 14 | rownames(rna_data) <- indata$mod$rna$var$`_index` 15 | 16 | ### ATAC 17 | atac_data <- indata$mod$atac$X 18 | colnames(atac_data) <- indata$obs$`_index` 19 | rownames(atac_data) <- indata$mod$atac$var$`_index` 20 | 21 | # Normalize data 22 | norm_data <- function(data, norm){ 23 | if (norm == 'deseq2'){ 24 | data <- DESeq2::DESeqDataSetFromMatrix( 25 | countData = data, 26 | colData = data.frame(sampleID = colnames(data)), 27 | design = stats::as.formula(" ~ 1") 28 | ) 29 | data <- DESeq2::estimateSizeFactors(data) 30 | data <- DESeq2::counts(data, normalized = TRUE) 31 | } 32 | if (norm == 'limma'){ 33 | data <- limma::normalizeBetweenArrays( 34 | data, 35 | method = 'quantile' 36 | ) 37 | } 38 | return(data) 39 | } 40 | # Add pseudocounts for sparsity and normalize 41 | rna_data <- norm_data(rna_data, 'limma') 42 | atac_data <- norm_data(atac_data, 'deseq2') 43 | 44 | # Write 45 | h5write(rna_data, name="mod/rna/X", file=indata) 46 | h5write(atac_data, name="mod/atac/X", file=indata) 47 | 48 | # Close 49 | h5closeAll() 50 | -------------------------------------------------------------------------------- /workflow/scripts/mth/granie/pre.py: -------------------------------------------------------------------------------- 1 | import decoupler as dc 2 | import pandas as pd 3 | import numpy as np 4 | import mudata as mu 5 | import scipy.sparse as ss 6 | import argparse 7 | 8 | 9 | # Init args 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('-i','--path_input', required=True) 12 | parser.add_argument('-o','--path_out', required=True) 13 | args = vars(parser.parse_args()) 14 | 15 | path_input = args['path_input'] 16 | path_out = args['path_out'] 17 | 18 | # Read rna adata 19 | mdata = mu.read(path_input) 20 | rna = mdata.mod['rna'].copy() 21 | 22 | # Psbulk rna 23 | rna.obs['batch'] = mdata.obs['batch'] 24 | rna.obs['celltype'] = mdata.obs['celltype'] 25 | rna_b_per_c = ( 26 | rna.obs.reset_index() 27 | .groupby('celltype', as_index=False)['index'] 28 | .agg(list).set_index('celltype')['index'] 29 | .to_dict() 30 | ) 31 | rna = dc.get_pseudobulk( 32 | adata=rna, 33 | sample_col='batch', 34 | groups_col='celltype', 35 | layer='counts', 36 | mode='sum', 37 | min_cells=10, 38 | min_counts=1000, 39 | ) 40 | del rna.obs['psbulk_n_cells'] 41 | del rna.obs['psbulk_counts'] 42 | del rna.layers['psbulk_props'] 43 | rna.layers['counts'] = 
ss.csr_matrix(rna.X.copy()) 44 | rna.uns['rna_b_per_c'] = rna_b_per_c 45 | 46 | # Psbulk atac 47 | atac = mdata.mod['atac'].copy() 48 | atac.obs['batch'] = mdata.obs['batch'] 49 | atac.obs['celltype'] = mdata.obs['celltype'] 50 | atac_b_per_c = ( 51 | atac.obs.reset_index() 52 | .groupby('celltype', as_index=False)['index'] 53 | .agg(list).set_index('celltype')['index'] 54 | .to_dict() 55 | ) 56 | atac = dc.get_pseudobulk( 57 | adata=atac, 58 | sample_col='batch', 59 | groups_col='celltype', 60 | layer='counts', 61 | mode='sum', 62 | min_cells=10, 63 | min_counts=1000, 64 | ) 65 | del atac.obs['psbulk_n_cells'] 66 | del atac.obs['psbulk_counts'] 67 | del atac.layers['psbulk_props'] 68 | atac.layers['counts'] = ss.csr_matrix(atac.X.copy()) 69 | atac.uns['atac_b_per_c'] = atac_b_per_c 70 | 71 | # Intersect and generate new object 72 | inter = np.intersect1d(rna.obs_names, atac.obs_names) 73 | mdata = mu.MuData({ 74 | 'rna': rna[inter, :].copy(), 75 | 'atac': atac[inter, :].copy(), 76 | }) 77 | mdata.obs = mdata.mod['rna'].obs.copy() 78 | del mdata.mod['rna'].obs 79 | del mdata.mod['atac'].obs 80 | 81 | # Write 82 | mdata.write(path_out) 83 | -------------------------------------------------------------------------------- /workflow/scripts/mth/granie/pre_post.py: -------------------------------------------------------------------------------- 1 | import decoupler as dc 2 | import pandas as pd 3 | import numpy as np 4 | import mudata as mu 5 | import scipy.sparse as ss 6 | import argparse 7 | 8 | 9 | # Init args 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('-i','--path_input', required=True) 12 | parser.add_argument('-o','--path_out', required=True) 13 | args = vars(parser.parse_args()) 14 | 15 | path_input = args['path_input'] 16 | path_out = args['path_out'] 17 | 18 | # Read data 19 | print(path_input) 20 | mdata = mu.read(path_input) 21 | 22 | # Remove all equal features 23 | msk = np.any(np.diff(mdata.mod['rna'].X, axis=0), axis=0) 24 | rna = mdata.mod['rna'][:, msk].copy() 25 | 26 | msk = np.any(np.diff(mdata.mod['atac'].X, axis=0), axis=0) 27 | atac = mdata.mod['atac'][:, msk].copy() 28 | 29 | # Save 30 | obs=mdata.obs.copy() 31 | mdata = mu.MuData({ 32 | 'rna': rna, 33 | 'atac': atac, 34 | }) 35 | mdata.obs = obs 36 | 37 | # Write 38 | mdata.write(path_out) 39 | -------------------------------------------------------------------------------- /workflow/scripts/mth/grn.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import argparse 4 | 5 | # Init args 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('-i', '--path_input', required=True) 8 | parser.add_argument('-o', '--path_out', required=True) 9 | args = vars(parser.parse_args()) 10 | 11 | mdl_path = args['path_input'] 12 | path_out = args['path_out'] 13 | 14 | # Find paths 15 | path = os.path.dirname(mdl_path) 16 | names = os.path.basename(mdl_path) 17 | lst = names.replace('.mdl.csv', '').split('.') 18 | 19 | # Read in chunks to reduce memory usage 20 | chunksize = 100_000 # Adjust based on available memory 21 | dtype_dict = {'source': 'category', 'target': 'category', 'score': 'float32', 'pval': 'float32'} 22 | mdl_chunks = pd.read_csv(mdl_path, dtype=dtype_dict, chunksize=chunksize) 23 | mdl = pd.concat(mdl_chunks, ignore_index=True) 24 | 25 | # Skip if empty 26 | if mdl.empty: 27 | grn = pd.DataFrame(columns=['source', 'cre', 'target', 'score', 'pval']) 28 | grn.to_csv(path_out, index=False) 29 | os._exit(0) 30 | 31 | # 
Limit to the 100k highest-scoring edges 32 | mdl = mdl.nlargest(100_000, 'score', keep='all').reset_index(drop=True) 33 | tfs = set(mdl['source'].unique()) 34 | gns = set(mdl['target'].unique()) 35 | 36 | # Skip baselines 37 | baselines = {'collectri', 'dorothea', 'random', 'scenic'} 38 | if lst[0] in baselines or lst[0].startswith('o_'): 39 | mdl.to_csv(path_out, index=False) 40 | os._exit(0) 41 | 42 | # Read paths 43 | pre_name, p2g_name, tfb_name, mdl_name = lst 44 | p2g_path = os.path.join(path, f'{pre_name}.{p2g_name}.p2g.csv') 45 | tfb_path = os.path.join(path, f'{pre_name}.{p2g_name}.{tfb_name}.tfb.csv') 46 | 47 | # Read relevant columns with filtering 48 | usecols_tfb = ['tf', 'cre'] 49 | usecols_p2g = ['cre', 'gene'] 50 | 51 | tfb_chunks = pd.read_csv(tfb_path, usecols=usecols_tfb, dtype={'tf': 'category', 'cre': 'category'}, chunksize=chunksize) 52 | tfb = pd.concat((chunk[chunk['tf'].isin(tfs)] for chunk in tfb_chunks), ignore_index=True) 53 | 54 | p2g_chunks = pd.read_csv(p2g_path, usecols=usecols_p2g, dtype={'cre': 'category', 'gene': 'category'}, chunksize=chunksize) 55 | p2g = pd.concat((chunk[chunk['gene'].isin(gns)] for chunk in p2g_chunks), ignore_index=True) 56 | 57 | # Merge in an optimized manner 58 | grn = tfb.merge(p2g, on='cre', how='inner') 59 | grn = grn.rename(columns={'tf': 'source', 'gene': 'target'}) 60 | grn = grn.merge(mdl, on=['source', 'target'], how='inner') 61 | grn = grn.sort_values(['source', 'target', 'cre']).reset_index(drop=True) 62 | grn = grn[['source', 'cre', 'target', 'score', 'pval']] 63 | 64 | grn.to_csv(path_out, index=False) 65 | -------------------------------------------------------------------------------- /workflow/scripts/mth/pando/get_granges.R: -------------------------------------------------------------------------------- 1 | # Parse args 2 | args <- commandArgs(trailingOnly = F) 3 | path_hg <- args[6] 4 | path_mm <- args[7] 5 | 6 | library(EnsDb.Hsapiens.v86) 7 | gene.ranges_hg <- Signac::GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86) 8 | write.csv(gene.ranges_hg, path_hg, row.names=FALSE) 9 | 10 | library(EnsDb.Mmusculus.v79) 11 | gene.ranges_mm <- Signac::GetGRangesFromEnsDb(ensdb = EnsDb.Mmusculus.v79) 12 | write.csv(gene.ranges_mm, path_mm, row.names=FALSE) 13 | -------------------------------------------------------------------------------- /workflow/scripts/mth/pando/p2g.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(rhdf5) 3 | library(Pando) 4 | library(GenomicRanges) 5 | 6 | 7 | # Parse args 8 | args <- commandArgs(trailingOnly = F) 9 | path_data <- args[6] 10 | path_ann <- args[7] 11 | extend <- as.numeric(args[8]) 12 | path_out <- args[9] 13 | 14 | # Set genome 15 | annot <- read.csv(path_ann) 16 | annot <- GenomicRanges::makeGRangesFromDataFrame(annot, keep.extra.columns=TRUE) 17 | GenomeInfoDb::seqlevelsStyle(annot) <- 'UCSC' 18 | 19 | # Read peaks and genes 20 | indata <- H5Fopen(path_data, flags='H5F_ACC_RDONLY') 21 | peaks <- indata$mod$atac$var$`_index` 22 | genes <- indata$mod$rna$var$`_index` 23 | h5closeAll() 24 | peaks <- data.frame(seqnames=peaks) 25 | peaks <- tidyr::separate(data = peaks, col = 'seqnames', into = c("seqnames", "start", "end"), sep = "-", remove=FALSE) 26 | peaks <- GenomicRanges::makeGRangesFromDataFrame(peaks) 27 | 28 | # Filter annot by seen genes 29 | annot <- annot[annot$gene_name %in% intersect(genes, annot$gene_name), ] 30 | 31 | # Find peak2gene links 32 | peaks_near_gene <- find_peaks_near_genes( 33 | peaks =
peaks, 34 | genes = annot, 35 | method = 'GREAT', 36 | upstream = round(extend / 2), 37 | downstream = round(extend / 2), 38 | ) 39 | peaks2gene <- aggregate_matrix(t(peaks_near_gene), groups=colnames(peaks_near_gene), fun='sum') 40 | 41 | # Convert from sparse mat to df 42 | sparse <- summary(peaks2gene) 43 | df <- data.frame( 44 | cre = colnames(peaks2gene)[sparse$j], 45 | gene = rownames(peaks2gene)[sparse$i], 46 | score = sparse$x 47 | ) 48 | df <- df %>% arrange(cre, desc(score)) 49 | 50 | # Write 51 | write.csv(x = df, file = path_out, row.names=FALSE) 52 | -------------------------------------------------------------------------------- /workflow/scripts/mth/pando/pre.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(rhdf5) 3 | library(Pando) 4 | library(GenomicRanges) 5 | 6 | 7 | # Parse args 8 | args <- commandArgs(trailingOnly = F) 9 | path_data <- args[6] 10 | path_ann <- args[7] 11 | exclude_exons <- args[8] 12 | path_cand <- args[9] 13 | path_matches <- args[10] 14 | 15 | # Set genome 16 | data('phastConsElements20Mammals.UCSC.hg38') 17 | regions <- phastConsElements20Mammals.UCSC.hg38 18 | annot <- read.csv(path_ann) 19 | annot <- GenomicRanges::makeGRangesFromDataFrame(annot, keep.extra.columns=TRUE) 20 | GenomeInfoDb::seqlevelsStyle(annot) <- 'UCSC' 21 | 22 | # Read peaks 23 | indata <- H5Fopen(path_data, flags='H5F_ACC_RDONLY') 24 | peaks <- indata$mod$atac$var$`_index` 25 | h5closeAll() 26 | peaks <- data.frame(seqnames=peaks) 27 | peaks <- tidyr::separate(data = peaks, col = 'seqnames', into = c("seqnames", "start", "end"), sep = "-", remove=FALSE) 28 | peaks <- GenomicRanges::makeGRangesFromDataFrame(peaks) 29 | 30 | # Read exons 31 | exons <- annot[annot$type=='exon', ] 32 | names(exons@ranges) <- NULL 33 | exons <- IRanges::intersect(exons, exons) 34 | exons <- GenomicRanges::GRanges( 35 | seqnames = exons@seqnames, 36 | ranges = exons@ranges 37 | ) 38 | 39 | # Intersect by only shared chromosomes 40 | seqnames <- intersect(intersect(levels(peaks@seqnames), levels(regions@seqnames)), levels(exons@seqnames)) 41 | peaks <- keepSeqlevels(peaks, seqnames, pruning.mode = "coarse") 42 | exons <- keepSeqlevels(exons, seqnames, pruning.mode = "coarse") 43 | regions <- keepSeqlevels(regions, seqnames, pruning.mode = "coarse") 44 | 45 | # Filter by evo cons regions 46 | hits <- GenomicRanges::findOverlaps(regions, peaks) 47 | cand <- GenomicRanges::pintersect( 48 | peaks[S4Vectors::subjectHits(hits)], 49 | regions[S4Vectors::queryHits(hits)] 50 | ) 51 | 52 | # Substract exons 53 | if (exclude_exons){ 54 | cand <- GenomicRanges::subtract(cand, exons, ignore.strand=TRUE) %>% unlist() 55 | } 56 | 57 | # Find matches of new peaks to old peaks 58 | matches <- S4Vectors::subjectHits(GenomicRanges::findOverlaps(cand, peaks)) 59 | 60 | # Write 61 | write.csv(x = cand, file = path_cand, row.names=FALSE) 62 | write.csv(x = matches, file = path_matches, row.names=FALSE) 63 | -------------------------------------------------------------------------------- /workflow/scripts/mth/pando/pre.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import mudata as mu 4 | import argparse 5 | 6 | 7 | # Init args 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('-i','--path_input', required=True) 10 | parser.add_argument('-p','--path_peaks', required=True) 11 | parser.add_argument('-m','--path_matches', required=True) 12 | 
parser.add_argument('-o','--path_out', required=True) 13 | args = vars(parser.parse_args()) 14 | 15 | path_input = args['path_input'] 16 | path_peaks = args['path_peaks'] 17 | path_matches = args['path_matches'] 18 | path_out = args['path_out'] 19 | 20 | # Read 21 | mdata = mu.read(path_input) 22 | df = pd.read_csv(path_peaks) 23 | matches = pd.read_csv(path_matches).iloc[:, 0].values - 1 24 | 25 | # Format peaks 26 | new_peaks = (df['seqnames'].astype(str) + '-' + df['start'].astype(str) + '-' + df['end'].astype(str)).values.astype('U') 27 | 28 | # Filter 29 | atac = mdata.mod['atac'][:, matches].copy() 30 | atac.var_names = new_peaks 31 | msk = np.sum(atac.X, axis=1) != 0 32 | atac = atac[msk, :].copy() 33 | 34 | # Remove mismatched obs 35 | rna = mdata.mod['rna'].copy() 36 | inter = np.intersect1d(rna.obs_names, atac.obs_names) 37 | x_spectral = mdata[inter, :].obsm['X_spectral'].copy() 38 | x_umap = mdata[inter, :].obsm['X_umap'].copy() 39 | obs = mdata.obs.copy() 40 | obs = obs.loc[inter, :] 41 | mdata = mu.MuData( 42 | { 43 | 'rna': rna[inter, :].copy(), 44 | 'atac': atac[inter, :].copy(), 45 | } 46 | ) 47 | mdata.obsm['X_spectral'] = x_spectral 48 | mdata.obsm['X_umap'] = x_umap 49 | mdata.obs = obs 50 | 51 | # Write 52 | mdata.write(path_out) 53 | -------------------------------------------------------------------------------- /workflow/scripts/mth/pando/tfb.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(rhdf5) 3 | library(Pando) 4 | 5 | # Parse args 6 | args <- commandArgs(trailingOnly = F) 7 | path_data <- args[6] 8 | organism <- args[7] 9 | path_p2g <- args[8] 10 | path_out <- args[9] 11 | 12 | # Read genome 13 | if (organism == 'hg38'){ 14 | library(BSgenome.Hsapiens.UCSC.hg38) 15 | genome <- BSgenome.Hsapiens.UCSC.hg38 16 | } else { 17 | library(BSgenome.Mmusculus.UCSC.mm10) 18 | genome <- BSgenome.Mmusculus.UCSC.mm10 19 | } 20 | 21 | # Read p2g 22 | p2g <- read.csv(path_p2g) 23 | if (nrow(p2g) == 0){ 24 | tfb <- data.frame(cre=character(), tf=character(), score=numeric()) 25 | write.csv(x = tfb, file = path_out, row.names=FALSE) 26 | quit(save="no") 27 | } 28 | 29 | # Read genes 30 | indata <- H5Fopen(path_data, flags='H5F_ACC_RDONLY') 31 | genes <- indata$mod$rna$var$`_index` 32 | h5closeAll() 33 | 34 | # Transform motif2tf to mat 35 | data('motif2tf') 36 | motif2tf <- motif2tf %>% select('motif'=1,'tf'=2) %>% 37 | distinct() %>% mutate(val=1) %>% 38 | tidyr::pivot_wider(names_from = 'tf', values_from=val, values_fill=0) %>% 39 | tibble::column_to_rownames('motif') %>% 40 | as.matrix() %>% Matrix::Matrix(sparse=TRUE) 41 | 42 | # Subset motifs to tfs in data 43 | data('motifs') 44 | motif2tf <- motif2tf[, intersect(genes, colnames(motif2tf))] 45 | motifs <- motifs[rownames(motif2tf)[Matrix::rowSums(motif2tf) != 0 ]] 46 | 47 | # Transform peaks to GRanges 48 | peaks <- data.frame(seqnames=p2g$cre) %>% distinct() 49 | peaks <- tidyr::separate(data = peaks, col = 'seqnames', into = c("seqnames", "start", "end"), sep = "-", remove=FALSE) 50 | peaks <- GenomicRanges::makeGRangesFromDataFrame(peaks) 51 | 52 | # Run motif enrichment using motifmatchr (MOODS) 53 | peak_motifs <- Signac::CreateMotifMatrix( 54 | features = peaks, 55 | pwm = motifs, 56 | genome = genome, 57 | use.counts = FALSE, 58 | score=TRUE 59 | ) 60 | 61 | # Extract list of motifs to tfs 62 | sparse <- summary(motif2tf) 63 | motif2tf_lst <- data.frame( 64 | motif = rownames(motif2tf)[sparse$i], 65 | tf = colnames(motif2tf)[sparse$j] 66 | ) %>%
67 | group_by(motif) %>% 68 | summarize(values = list(tf)) %>% 69 | deframe() 70 | 71 | # Convert from sparse mat to df 72 | sparse <- summary(peak_motifs) 73 | df <- data.frame( 74 | cre = rownames(peak_motifs)[sparse$i], 75 | tf = colnames(peak_motifs)[sparse$j], 76 | score = sparse$x 77 | ) %>% 78 | mutate(tf=motif2tf_lst[tf]) %>% 79 | unnest(tf) %>% 80 | summarize(score = max(score), .by=c(cre, tf)) %>% 81 | mutate(score = ifelse(score < 0, 0, score)) %>% # Sometimes MOODs returns negative values 82 | arrange(cre, desc(score)) 83 | 84 | # Write 85 | write.csv(x = df, file = path_out, row.names=FALSE) 86 | -------------------------------------------------------------------------------- /workflow/scripts/mth/prc_prior_grn.py: -------------------------------------------------------------------------------- 1 | import pyranges as pr 2 | import pandas as pd 3 | import numpy as np 4 | import os 5 | import mudata as mu 6 | import argparse 7 | 8 | 9 | # Init args 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('-g','--grn_path', required=True) 12 | parser.add_argument('-d','--data_path', required=True) 13 | parser.add_argument('-p','--proms_path', required=True) 14 | parser.add_argument('-o','--out_path', required=True) 15 | args = vars(parser.parse_args()) 16 | 17 | grn_path = args['grn_path'] 18 | data_path = args['data_path'] 19 | proms_path = args['proms_path'] 20 | out_path = args['out_path'] 21 | 22 | # Read 23 | grn = pd.read_csv(grn_path) 24 | genes = mu.read(os.path.join(data_path, 'mod', 'rna')).var_names.astype('U') 25 | peaks = mu.read(os.path.join(data_path, 'mod', 'atac')).var_names.astype('U') 26 | proms = pr.read_bed(proms_path) 27 | 28 | # Transform peaks 29 | peaks = pd.DataFrame(peaks, columns=['cre']) 30 | peaks[['Chromosome', 'Start', 'End']] = peaks['cre'].str.split('-', n=2, expand=True) 31 | peaks = pr.PyRanges(peaks[['Chromosome', 'Start', 'End']]) 32 | 33 | # Filter by genes 34 | grn = grn[grn['source'].astype('U').isin(genes) & grn['target'].astype('U').isin(genes)] 35 | proms = proms[proms.Name.astype('U').isin(genes)] 36 | 37 | # Filter by peaks 38 | proms = proms.overlap(peaks) 39 | proms.cre = proms.df['Chromosome'].astype(str) + '-' + proms.df['Start'].astype(str) + '-' + proms.df['End'].astype(str) 40 | proms = proms.df[['cre', 'Name']].rename(columns={'Name': 'target'}) 41 | 42 | # Merge 43 | grn = pd.merge(grn, proms, how='inner')[['source', 'cre', 'target', 'weight']] 44 | grn = grn.sort_values(['source', 'target', 'cre']).rename(columns={'weight': 'score'}) 45 | 46 | # Filter regulons with less than 5 targets 47 | n_targets = grn.groupby(['source']).size().reset_index(name='counts') 48 | n_targets = n_targets[n_targets['counts'] > 5] 49 | grn = grn[grn['source'].isin(n_targets['source'])] 50 | 51 | # Write 52 | grn.to_csv(out_path, index=False) 53 | -------------------------------------------------------------------------------- /workflow/scripts/mth/scenic/loom.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | import mudata as mu 5 | import loompy as lp 6 | import argparse 7 | 8 | 9 | # Init args 10 | parser = argparse.ArgumentParser() 11 | 12 | parser.add_argument('-i','--data', required=True) 13 | parser.add_argument('-o','--path_out', required=True) 14 | args = vars(parser.parse_args()) 15 | 16 | path_input = args['data'] 17 | path_out = args['path_out'] 18 | 19 | # Extract raw counts data and assign labels 20 | mdata = 
mu.read(path_input) 21 | adata = mdata.mod['rna'].copy() 22 | adata.layers['lognorm'] = adata.X.copy() 23 | adata.X = adata.layers['counts'].copy() 24 | adata.obs['celltype'] = mdata.obs['celltype'] 25 | adata.obsm['X_pca'] = mdata.obsm['X_spectral'] 26 | 27 | # create basic row and column attributes for the loom file: 28 | row_attrs = { 29 | "Gene": np.array(adata.var_names) , 30 | } 31 | col_attrs = { 32 | "CellID": np.array(adata.obs_names) , 33 | "nGene": np.array(np.sum(adata.X.transpose() > 0, axis=0)).flatten() , 34 | "nUMI": np.array(np.sum(adata.X.transpose(), axis=0)).flatten() , 35 | } 36 | lp.create(path_out, adata.X.transpose(), row_attrs, col_attrs) 37 | -------------------------------------------------------------------------------- /workflow/scripts/mth/scenic/process_grn.py: -------------------------------------------------------------------------------- 1 | import pyranges as pr 2 | import pandas as pd 3 | import numpy as np 4 | import os 5 | import argparse 6 | 7 | 8 | # Init args 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('-g','--grn_path', required=True) 11 | parser.add_argument('-p','--proms_path', required=True) 12 | parser.add_argument('-o','--out_path', required=True) 13 | parser.add_argument('-r','--reg_path', required=True) 14 | args = vars(parser.parse_args()) 15 | 16 | grn_path = args['grn_path'] 17 | proms_path = args['proms_path'] 18 | out_path = args['out_path'] 19 | reg_path = args['reg_path'] 20 | 21 | # Read 22 | grn = pd.read_csv(grn_path, index_col=False, sep='\t').rename(columns={'TF': 'source', 'importance': 'score'}) 23 | proms = pr.read_bed(proms_path).df 24 | proms['cre'] = proms['Chromosome'].astype(str) + '-' + proms['Start'].astype(str) + '-' + proms['End'].astype(str) 25 | proms = proms[['cre', 'Name']].rename(columns={'Name': 'target'}) 26 | reg = pd.read_csv(reg_path) 27 | 28 | # Filter by enriched TFs 29 | reg = reg.iloc[2:, [0, 8]] 30 | reg.columns = ['source', 'target'] 31 | reg['target'] = reg['target'].str.split(',') 32 | reg_exp = reg.explode('target') 33 | reg_exp['target'] = reg_exp['target'].str.replace(r"[\[\(\)' ]", "", regex=True) 34 | 35 | # Merge 36 | grn = pd.merge(grn, reg_exp, on=['source', 'target'], how='inner') 37 | grn = pd.merge(grn, proms, how='inner')[['source', 'cre', 'target', 'score']] 38 | grn = grn[grn["score"] > 0.001] 39 | grn = grn.sort_values(['source', 'target', 'cre']) 40 | 41 | # Write 42 | grn.to_csv(out_path, index=False) 43 | -------------------------------------------------------------------------------- /workflow/scripts/mth/scenicplus/egrn.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import sys 3 | 4 | 5 | df = pd.read_table(sys.argv[1]) 6 | if df.shape[0] > 0: 7 | df = df[df['regulation'] != 0.] 
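# The lines below collapse region-level triplets into TF-gene pairs (averaging 'regulation' and 'triplet_rank'), then convert the triplet rank into a score in [0, 1] whose sign follows the mean regulation direction.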
8 | df = df[['TF', 'Region', 'Gene', 'regulation', 'triplet_rank']].groupby(['TF', 'Gene'], as_index=False).mean(numeric_only=True).sort_values('triplet_rank') 9 | df = df.reset_index(drop=True).reset_index(names='rank') 10 | df['score'] = (1 - (df['rank'] / df['rank'].max())) * df['regulation'] 11 | df = df[['TF', 'Gene', 'score']] 12 | df.columns = ['source', 'target', 'score'] 13 | df['pval'] = 0.01 14 | else: 15 | df = pd.DataFrame(columns=['source', 'target', 'score', 'pval']) 16 | df.to_csv(sys.argv[2], index=False) 17 | -------------------------------------------------------------------------------- /workflow/scripts/mth/scenicplus/mdata.py: -------------------------------------------------------------------------------- 1 | import mudata, sys 2 | 3 | m = mudata.read(sys.argv[1]) 4 | m.mod['scRNA'] = m.mod['rna'] 5 | del m.mod['rna'] 6 | m.mod['scATAC'] = m.mod['atac'] 7 | del m.mod['atac'] 8 | m.mod['scATAC'].var_names = m.mod['scATAC'].var_names.str.replace('-', ':', 1) 9 | m.write(sys.argv[2]) 10 | -------------------------------------------------------------------------------- /workflow/scripts/mth/scenicplus/mdl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # Parse command-line arguments 5 | while [[ "$#" -gt 0 ]]; do 6 | case $1 in 7 | --new_dir) new_dir="$2"; shift ;; 8 | --path_pre) path_pre="$2"; shift ;; 9 | --path_p2g) path_p2g="$2"; shift ;; 10 | --path_tfb) path_tfb="$2"; shift ;; 11 | --path_rnk) path_rnk="$2"; shift ;; 12 | --threads) threads="$2"; shift ;; 13 | --path_out) path_out="$2"; shift ;; 14 | *) echo "Unknown parameter passed: $1"; exit 1 ;; 15 | esac 16 | shift 17 | done 18 | 19 | if [ $(wc -l < $path_p2g) -eq 1 ] || [ $(wc -l < $path_tfb) -eq 1 ]; then 20 | echo "source,target,score" > "$path_out" 21 | exit 0 22 | fi 23 | 24 | # Transform pre to scenicplus mdata format 25 | python workflow/scripts/mth/scenicplus/mdata.py $path_pre $new_dir/mdata.h5mu 26 | 27 | # Extract unique tfs 28 | python -c "import pandas as pd; \ 29 | import sys; \ 30 | tfb = pd.DataFrame(pd.read_csv(sys.argv[1])['tf'].unique().reshape(-1, 1), columns=['tf']); \ 31 | tfb.to_csv(sys.argv[2], index=False, header=False)" $path_tfb $new_dir/tfs.txt 32 | 33 | # Infer tg links 34 | scenicplus grn_inference TF_to_gene \ 35 | --multiome_mudata_fname $new_dir/mdata.h5mu \ 36 | --tf_names $new_dir/tfs.txt \ 37 | --temp_dir $TMPDIR \ 38 | --out_tf_to_gene_adjacencies $new_dir/tg_adj.tsv \ 39 | --method GBM \ 40 | --n_cpu $threads 41 | 42 | # Transform p2g to scenicplus format 43 | python -c "import pandas as pd; \ 44 | import numpy as np; \ 45 | import sys; \ 46 | p2g = pd.read_csv(sys.argv[1]); \ 47 | p2g['region'] = p2g['cre'].str.replace('-', ':', 1); \ 48 | p2g['target'] = p2g['gene']; \ 49 | p2g['importance'] = p2g['score'].abs(); \ 50 | p2g['rho'] = np.sign(p2g['score']); \ 51 | p2g['importance_x_rho'] = p2g['score']; \ 52 | p2g['importance_x_abs_rho'] = p2g['score']; \ 53 | p2g = p2g[['region', 'target', 'importance', 'rho', 'importance_x_rho', 'importance_x_abs_rho']]; \ 54 | p2g.to_csv(sys.argv[2], sep='\\t', index=False)" $path_p2g $new_dir/rg_adj.tsv 55 | 56 | # Transform tfb to scenicplus format 57 | python workflow/scripts/mth/scenicplus/motifs.py $path_tfb $path_rnk $new_dir/motifs.h5ad 58 | 59 | # Egrn inference 60 | dichotomize=$( python -c "import sys, pandas; print('') if (pandas.read_csv(sys.argv[1])['score'] < 0).any() else print('--do_not_rho_dichotomize_r2g --do_not_rho_dichotomize_eRegulon');" $path_p2g ) 
61 | echo "$dichotomize" 62 | scenicplus grn_inference eGRN \ 63 | --TF_to_gene_adj_fname $new_dir/tg_adj.tsv \ 64 | --region_to_gene_adj_fname $new_dir/rg_adj.tsv \ 65 | --cistromes_fname $new_dir/motifs.h5ad \ 66 | --ranking_db_fname $path_rnk \ 67 | --eRegulon_out_fname $new_dir/egrn.tsv \ 68 | --temp_dir $TMPDIR \ 69 | --min_target_genes 10 \ 70 | --n_cpu $threads $dichotomize 71 | 72 | # Transform grn into greta format 73 | python workflow/scripts/mth/scenicplus/egrn.py $new_dir/egrn.tsv $path_out 74 | -------------------------------------------------------------------------------- /workflow/scripts/mth/scenicplus/motifs.py: -------------------------------------------------------------------------------- 1 | from scenicplus.triplet_score import get_max_rank_of_motif_for_each_TF 2 | from pycistarget.motif_enrichment_cistarget import cisTargetDatabase 3 | from pycistarget.utils import load_motif_annotations 4 | import scipy.sparse as ss 5 | import numpy as np 6 | import pandas as pd 7 | import pyranges as pr 8 | import anndata as ad 9 | import sys 10 | 11 | 12 | def get_pr(index): 13 | df = index.str.replace(':', '-') 14 | df = df.str.split('-').tolist() 15 | df = pd.DataFrame(df, columns=['Chromosome', 'Start', 'End']) 16 | return pr.PyRanges(df) 17 | 18 | 19 | def get_motifs_for_TF(tf_names, annotation_to_use, motif_to_tf): 20 | motif_to_tf = motif_to_tf.fillna("")[annotation_to_use].agg(", ".join, axis = 1).apply(lambda x: [x for x in x.split(", ") if len(x) > 0]) 21 | motif_to_tf = motif_to_tf.loc[[len(x) > 0 for x in motif_to_tf]] 22 | tf_to_motif = motif_to_tf.explode().reset_index().drop_duplicates().groupby(0)["MotifID"].apply(lambda x: ','.join(list(x))) 23 | tf_names = pd.Index(tf_names) 24 | tf_names = tf_names.intersection(tf_to_motif.index) 25 | return tf_to_motif.loc[tf_names].to_dict() 26 | 27 | 28 | path_tfb = sys.argv[1] 29 | path_db = sys.argv[2] 30 | path_out = sys.argv[3] 31 | 32 | # Read 33 | tfb = pd.read_csv(path_tfb) 34 | tfb['cre'] = tfb['cre'].str.replace('-', ':', 1) 35 | var_names = tfb['tf'].unique() 36 | obs_names = tfb['cre'].unique() 37 | 38 | # Create anndata 39 | motifs = ad.AnnData( 40 | obs=pd.DataFrame(index=obs_names), 41 | var=pd.DataFrame(index=var_names), 42 | X=ss.lil_matrix((obs_names.size, var_names.size), dtype=bool), 43 | ) 44 | for cre, tfs in tfb.groupby('cre')['tf'].apply(lambda x: np.array(x)).items(): 45 | motifs[cre, tfs].X = True 46 | motifs.X = ss.csr_matrix(motifs.X) 47 | 48 | # Find motif annots 49 | motif_to_tf = load_motif_annotations( 50 | specie = "homo_sapiens", 51 | version = "v10nr_clust", 52 | motif_similarity_fdr = 0.001, 53 | orthologous_identity_threshold = 0.0) 54 | 55 | # Remove ann if they are not in db or cres not in db 56 | ctx_db = cisTargetDatabase( 57 | fname=path_db, 58 | region_sets=get_pr(motifs.obs_names) 59 | ) 60 | inter = motif_to_tf.index.intersection(ctx_db.db_rankings.index) 61 | motif_to_tf = motif_to_tf.loc[inter] 62 | motif_to_tf.index.name = 'MotifID' 63 | 64 | # Find motif anns per tf gene name 65 | tf_to_motif = get_motifs_for_TF( 66 | tf_names = motifs.var_names, 67 | annotation_to_use = ["Direct_annot", "Orthology_annot"], 68 | motif_to_tf = motif_to_tf 69 | ) 70 | 71 | # Subset and add annots 72 | m_msk = motifs.var_names.isin(tf_to_motif) 73 | motifs = motifs[:, m_msk].copy() 74 | motifs.var.loc[:, 'motifs'] = [tf_to_motif[v] for v in motifs.var_names] 75 | 76 | # Remove regions not found in db 77 | df = get_max_rank_of_motif_for_each_TF(motifs, path_db) 78 | inter = 
motifs.obs_names.intersection(df.index) 79 | motifs = motifs[inter, :].copy() 80 | 81 | # Write 82 | motifs.write(path_out) 83 | -------------------------------------------------------------------------------- /workflow/scripts/mth/scenicplus/p2g.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # Parse command-line arguments 5 | while [[ "$#" -gt 0 ]]; do 6 | case $1 in 7 | --new_dir) new_dir="$2"; shift ;; 8 | --path_pre) path_pre="$2"; shift ;; 9 | --path_ann) path_ann="$2"; shift ;; 10 | --path_csz) path_csz="$2"; shift ;; 11 | --ext) ext="$2"; shift ;; 12 | --threads) threads="$2"; shift ;; 13 | --path_out) path_out="$2"; shift ;; 14 | *) echo "Unknown parameter passed: $1"; exit 1 ;; 15 | esac 16 | shift 17 | done 18 | 19 | python -c "import mudata, sys; \ 20 | m = mudata.read(sys.argv[1]); \ 21 | m.mod['scRNA'] = m.mod['rna']; \ 22 | del m.mod['rna']; \ 23 | m.mod['scATAC'] = m.mod['atac']; \ 24 | del m.mod['atac']; \ 25 | m.mod['scATAC'].var_names = m.mod['scATAC'].var_names.str.replace('-', ':', 1); \ 26 | m.write(sys.argv[2])" $path_pre $new_dir/mdata.h5mu 27 | 28 | scenicplus prepare_data search_spance \ 29 | --multiome_mudata_fname $new_dir/mdata.h5mu \ 30 | --gene_annotation_fname $path_ann \ 31 | --chromsizes_fname $path_csz \ 32 | --upstream 1000 $ext \ 33 | --downstream 1000 $ext \ 34 | --out_fname $new_dir/space.tsv 35 | 36 | scenicplus grn_inference region_to_gene \ 37 | --multiome_mudata_fname $new_dir/mdata.h5mu \ 38 | --search_space_fname $new_dir/space.tsv \ 39 | --temp_dir $TMPDIR \ 40 | --out_region_to_gene_adjacencies $new_dir/rg_adj.tsv \ 41 | --n_cpu $threads 42 | 43 | python -c "import pandas as pd; \ 44 | import sys; \ 45 | tab = pd.read_table(sys.argv[1]); \ 46 | tab = tab[tab['importance_x_rho'].abs() > 1e-16]; \ 47 | tab = tab[['region', 'target', 'importance_x_rho']]; \ 48 | tab['region'] = tab['region'].str.replace(':', '-'); \ 49 | tab.columns = ['cre', 'gene', 'score']; \ 50 | tab.to_csv(sys.argv[2], index=False)" $new_dir/rg_adj.tsv $path_out 51 | -------------------------------------------------------------------------------- /workflow/scripts/mth/scenicplus/pre.py: -------------------------------------------------------------------------------- 1 | import mudata as mu 2 | import scipy.sparse as scs 3 | import scanpy as sc 4 | import sys 5 | 6 | 7 | path_ann = sys.argv[1] 8 | path_scn = sys.argv[2] 9 | path_out = sys.argv[3] 10 | 11 | # Read 12 | ann = mu.read(path_ann) 13 | scn = mu.read(path_scn) 14 | scn.var.index = scn.var_names.str.replace(':', '-') 15 | 16 | # Match 17 | inter_var = ann.var_names.intersection(scn.var_names) 18 | inter_obs = ann.obs_names.intersection(scn.obs_names) 19 | ann = ann[inter_obs, inter_var].copy() 20 | scn = scn[inter_obs, inter_var] 21 | 22 | # Update atac counts with topic ones 23 | ann.mod['atac'].layers['counts'] = scs.csr_matrix(scn.mod['scATAC'].X) 24 | ann.mod['atac'].X = scn.mod['scATAC'].X 25 | 26 | # Write 27 | ann.write(path_out) 28 | -------------------------------------------------------------------------------- /workflow/scripts/mth/scenicplus/tfb.py: -------------------------------------------------------------------------------- 1 | import scipy.sparse as scs 2 | import anndata as ad 3 | import pyranges as pr 4 | import h5py 5 | import pandas as pd 6 | import mudata as mu 7 | import sys 8 | 9 | def get_pr(index): 10 | df = index.str.replace(':', '-') 11 | df = df.str.split('-').tolist() 12 | df = pd.DataFrame(df, columns=['Chromosome', 
'Start', 'End']) 13 | return pr.PyRanges(df) 14 | 15 | 16 | def get_vars(df): 17 | chrm = df.df['Chromosome'].astype(str) 18 | strt = df.df['Start'].astype(str) 19 | end = df.df['End'].astype(str) 20 | return pd.Index(chrm + ':' + strt + '-' + end) 21 | 22 | 23 | path_pre = sys.argv[1] 24 | path_p2g = sys.argv[2] 25 | path_motifs = sys.argv[3] 26 | path_out = sys.argv[4] 27 | 28 | # Read 29 | motifs = mu.read(path_motifs) 30 | p2g = pd.read_csv(path_p2g) 31 | if p2g.shape[0] == 0: 32 | tfb = pd.DataFrame(columns=['cre', 'tf', 'score']) 33 | tfb.to_csv(path_out, index=False) 34 | exit() 35 | 36 | # Subset by tf genes 37 | with h5py.File(path_pre, 'r') as f: 38 | genes = f['mod']['rna']['var']['_index'][:].astype('U') 39 | tf_msk = motifs.var_names.isin(genes) 40 | motifs = motifs[:, tf_msk] 41 | 42 | # Find shared regions 43 | mtf_pr = get_pr(motifs.obs_names) 44 | p2g_pr = get_pr(pd.Index(p2g['cre'].unique())) 45 | inter = mtf_pr.join(p2g_pr) 46 | inter_motifs = get_vars(inter[['Chromosome', 'Start', 'End']]) 47 | inter_p2g = get_vars(pr.PyRanges(inter.df[['Chromosome', 'Start_b', 'End_b']].rename(columns={'Start_b': 'Start', 'End_b': 'End'}))) 48 | 49 | # Create matching motif anndata 50 | new_motifs = ad.AnnData( 51 | var=pd.DataFrame(index=motifs.var_names), 52 | obs=pd.DataFrame(index=inter_p2g), 53 | X=scs.csr_matrix((inter_p2g.size, motifs.var_names.size)) 54 | ) 55 | new_motifs[inter_p2g, :].X = motifs[inter_motifs, :].X 56 | 57 | # Build df 58 | new_motifs.X = new_motifs.X.tocoo() 59 | tfb = pd.DataFrame() 60 | tfb['cre'] = new_motifs.obs_names[new_motifs.X.row] 61 | tfb['tf'] = new_motifs.var_names[new_motifs.X.col] 62 | tfb['score'] = 5. 63 | tfb['cre'] = tfb['cre'].str.replace(':', '-') 64 | 65 | # Write 66 | tfb.to_csv(path_out, index=False) 67 | -------------------------------------------------------------------------------- /workflow/scripts/plt/stab/cors.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import seaborn as sns 3 | import pandas as pd 4 | import numpy as np 5 | import sys 6 | import os 7 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 8 | from utils import read_config, savefigs 9 | 10 | 11 | # Read config 12 | config = read_config() 13 | palette = config['colors']['nets'] 14 | mthds = list(config['methods'].keys()) 15 | baselines = config['baselines'] 16 | 17 | path_repl_wgt = sys.argv[1] 18 | path_repl_cor = sys.argv[2] 19 | repl_wgt = pd.read_csv(path_repl_wgt) 20 | repl_cor = pd.read_csv(path_repl_cor) 21 | 22 | figs = [] 23 | for mth in repl_wgt['mth'].unique(): 24 | tmp = repl_wgt[repl_wgt['mth'] == mth] 25 | if tmp.shape[0] > 1: 26 | fig, ax = plt.subplots(1, 1, figsize=(2, 2), dpi=150) 27 | max_n = np.max([tmp['score_x'].abs().max(), tmp['score_y'].abs().max()]) 28 | max_n = max_n + (max_n * 0.05) 29 | sns.histplot( 30 | data=tmp, 31 | x='score_x', 32 | y='score_y', 33 | cbar=False, 34 | cmap='magma', 35 | stat='proportion', 36 | vmin=0., 37 | vmax=1e-2, 38 | bins=(50, 50), 39 | cbar_kws=dict(label='Proportion', shrink=0.5, aspect=5, orientation='horizontal') 40 | ) 41 | ax.set_xlabel('Run A edge score') 42 | ax.set_ylabel('Run B edge score') 43 | ax.set_xlim(-max_n, max_n) 44 | ax.set_ylim(-max_n, max_n) 45 | ax.set_title(mth) 46 | figs.append(fig) 47 | 48 | 49 | fig, ax = plt.subplots(1, 1, figsize=(1.5, 1), dpi=150) 50 | order = mthds + baselines 51 | order = [m for m in order if m in repl_cor['mth'].unique()] 52 | sns.boxplot(data=repl_cor, 
x='stat', y='mth', hue='mth', fill=None, ax=ax, palette=palette, order=order) 53 | sns.stripplot(data=repl_cor, x='stat', y='mth', hue='mth', ax=ax, palette=palette, order=order) 54 | ax.set_xlabel('Pearson ρ') 55 | ax.set_ylabel('') 56 | ax.set_xticks([0, 0.5, 1]) 57 | ax.set_xlim(-0.05, 1.05) 58 | figs.append(fig) 59 | 60 | # Write 61 | savefigs(figs, sys.argv[3]) 62 | -------------------------------------------------------------------------------- /workflow/scripts/plt/utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def read_config(path_config='config/config.yaml'): 4 | import yaml 5 | with open(path_config, 'r') as file: 6 | config = yaml.safe_load(file) 7 | return config 8 | 9 | 10 | def savefigs(lst_figs, path_fname, index_pngs=[]): 11 | import matplotlib.backends.backend_pdf 12 | import io 13 | from PIL import Image 14 | import matplotlib.pyplot as plt 15 | pdf = matplotlib.backends.backend_pdf.PdfPages(path_fname) 16 | for i, fig in enumerate(lst_figs): 17 | if i not in index_pngs: 18 | pdf.savefig(fig, bbox_inches='tight') 19 | else: 20 | buf = io.BytesIO() 21 | fig.savefig(buf, format='png', dpi=300, bbox_inches='tight') 22 | plt.close(fig) 23 | buf.seek(0) 24 | image = Image.open(buf) 25 | new_fig, ax = plt.subplots(figsize=(image.width / 100, image.height / 100), dpi=300) 26 | ax.imshow(image) 27 | ax.axis("off") 28 | pdf.savefig(new_fig, bbox_inches='tight') 29 | plt.close(new_fig) 30 | pdf.close() 31 | --------------------------------------------------------------------------------