├── .gitignore ├── LICENSE ├── README.md ├── config ├── config.yaml ├── prior_cats.json └── slurm │ ├── CookieCutter.py │ ├── config.yaml │ ├── settings.json │ ├── slurm-jobscript.sh │ ├── slurm-sidecar.py │ ├── slurm-status.py │ ├── slurm-submit.py │ └── slurm_utils.py ├── docs ├── dbs.md ├── dts.md └── mth.md └── workflow ├── Snakefile ├── envs ├── celloracle.def ├── dictys.def ├── dictys.yaml ├── figr.def ├── granie.def ├── gretabench.def ├── pando.def └── scenicplus.def ├── rules ├── anl │ ├── dbs.smk │ ├── dts.smk │ ├── metrics │ │ ├── mech.smk │ │ ├── pred.smk │ │ ├── prior.smk │ │ └── utils.smk │ ├── pair.smk │ ├── stab.smk │ ├── topo.smk │ └── tss.smk ├── dbs │ ├── c2g.smk │ ├── cre.smk │ ├── gen.smk │ ├── gst.smk │ ├── ont.smk │ ├── prt.smk │ ├── tfb.smk │ ├── tfm.smk │ ├── tfp.smk │ └── tss.smk ├── dts │ ├── brain.smk │ ├── fakepair.smk │ ├── general.smk │ ├── heartatlas.smk │ ├── pbmc10k.smk │ ├── pitunpair.smk │ ├── pitupair.smk │ └── reprofibro.smk ├── img │ └── img.smk ├── mth │ ├── celloracle.smk │ ├── dictys.smk │ ├── figr.smk │ ├── granie.smk │ ├── grn.smk │ ├── pando.smk │ ├── random.smk │ ├── scenic.smk │ └── scenicplus.smk └── plt │ ├── comb.smk │ ├── dbs.smk │ ├── eval.smk │ ├── figs.smk │ ├── pair.smk │ └── stab.smk └── scripts ├── anl ├── dbs │ ├── ocoef.py │ ├── stats.py │ └── terms.py ├── dts │ └── qcstats.py ├── metrics │ ├── aggregate.py │ ├── mech │ │ ├── prt.py │ │ ├── sim.py │ │ ├── tfa.py │ │ └── tfm.py │ ├── pred │ │ ├── gsets.py │ │ └── omics.py │ ├── prior │ │ ├── gnm.py │ │ ├── tfm.py │ │ └── tfp.py │ ├── test.py │ └── utils.py ├── pair │ ├── fake_stats.py │ ├── pairsim.py │ ├── real_cors.py │ └── realqc.py ├── stab │ ├── ovsd.py │ ├── run_stab.py │ └── seeds.py ├── topo │ ├── fvsd.py │ ├── inter.py │ └── run_pair_sim.py ├── tss │ ├── dist.py │ └── gocoef.py └── utils.py ├── dbs ├── c2g │ ├── eqtlcat_gene.py │ └── eqtlcat_smpl.py ├── cre │ ├── gwascatalogue.py │ └── promoters.R ├── gen │ ├── genome │ │ └── celloracle.py │ ├── gid │ │ └── ensmbl.R │ ├── pid │ │ └── uniprot.R │ └── tss │ │ ├── celloracle.py │ │ ├── dictys.py │ │ ├── figr.R │ │ ├── granie.R │ │ ├── hummus.R │ │ ├── pando.R │ │ └── scenicplus.py ├── gst │ └── pways.py ├── ont │ └── bto.py ├── tfb │ ├── aggregate.py │ ├── chipatlas_meta.py │ ├── chipatlas_tf.py │ ├── remap2022_meta.py │ ├── remap2022_raw.py │ └── unibind_raw.py ├── tfm │ └── hpa.py └── tfp │ ├── europmc.py │ ├── europmc_raw.py │ └── intact.py ├── dts ├── brain │ ├── brain.py │ └── prc_annot.py ├── callpeaks.py ├── extract_case.py ├── fakepair │ ├── coembedd.R │ ├── fakepair.py │ └── paircells.R ├── format_frags.sh ├── heartatlas │ ├── heart_annot.py │ └── heartatlas.py ├── pbmc10k │ ├── pbmc10k.py │ └── prc_annot.py ├── pitunpair │ ├── coembedd.R │ ├── paircells.R │ └── pitunpair.py ├── pitupair │ └── pitupair.py └── reprofibro │ ├── prc_annot.py │ └── reprofibro.py ├── mth ├── celloracle │ ├── mdl.py │ ├── p2g.R │ ├── p2g.py │ ├── pre.py │ ├── src.R │ ├── src.py │ └── tfb.py ├── dictys │ ├── before_mdl.py │ ├── extract_data.py │ ├── frag_to_bam.py │ ├── mdl.sh │ ├── p2g.py │ ├── pre.py │ └── tfb.sh ├── figr │ ├── mdl.R │ ├── p2g.R │ ├── pre.R │ ├── src.R │ └── tfb.R ├── granie │ ├── mdl.R │ ├── p2g.R │ ├── pre.R │ ├── pre.py │ ├── pre_post.py │ ├── src.R │ └── tfb.R ├── grn.py ├── pando │ ├── get_granges.R │ ├── mdl.R │ ├── p2g.R │ ├── pre.R │ ├── pre.py │ ├── src.R │ └── tfb.R ├── prc_prior_grn.py ├── random │ └── grn.py ├── scenic │ ├── loom.py │ └── process_grn.py └── scenicplus │ ├── egrn.py │ ├── mdata.py │ ├── mdl.sh │ 
├── motifs.py │ ├── o_mdl.sh │ ├── p2g.sh │ ├── pre.py │ ├── tfb.py │ └── topics.py └── plt ├── comb └── sims.py ├── dbs └── stats.py ├── eval └── eval.py ├── pair ├── fake.py └── pair.py ├── stab ├── cors.py ├── links.py ├── sims.py └── stab.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # singularity images 132 | *.sif 133 | 134 | # vs code 135 | .vscode/ 136 | 137 | # DS_Store 138 | .DS_Store 139 | workflow/.DS_Store 140 | workflow/scripts/.DS_Store 141 | workflow/scripts/methods/.DS_Store 142 | 143 | 144 | .snakemake/ 145 | benchmarks/ 146 | /datasets/ 147 | gdata/ 148 | workflow/scripts/methods/scenic+/s1.py 149 | logs/ 150 | *.ipynb 151 | -------------------------------------------------------------------------------- /config/slurm/CookieCutter.py: -------------------------------------------------------------------------------- 1 | # 2 | # Based on lsf CookieCutter.py 3 | # 4 | import os 5 | import json 6 | 7 | d = os.path.dirname(__file__) 8 | with open(os.path.join(d, "settings.json")) as fh: 9 | settings = json.load(fh) 10 | 11 | 12 | def from_entry_or_env(values, key): 13 | """Return value from ``values`` and override with environment variables.""" 14 | if key in os.environ: 15 | return os.environ[key] 16 | else: 17 | return values[key] 18 | 19 | 20 | class CookieCutter: 21 | 22 | SBATCH_DEFAULTS = from_entry_or_env(settings, "SBATCH_DEFAULTS") 23 | CLUSTER_NAME = from_entry_or_env(settings, "CLUSTER_NAME") 24 | CLUSTER_CONFIG = from_entry_or_env(settings, "CLUSTER_CONFIG") 25 | 26 | @staticmethod 27 | def get_cluster_option() -> str: 28 | cluster = CookieCutter.CLUSTER_NAME 29 | if cluster != "": 30 | return f"--cluster={cluster}" 31 | return "" 32 | 33 | @staticmethod 34 | def get_cluster_logpath() -> str: 35 | return "logs/%r/%j" 36 | 37 | @staticmethod 38 | def get_cluster_jobname() -> str: 39 | return "%r_%w" 40 | -------------------------------------------------------------------------------- /config/slurm/config.yaml: -------------------------------------------------------------------------------- 1 | cluster-sidecar: "slurm-sidecar.py" 2 | cluster-cancel: "scancel" 3 | jobscript: "slurm-jobscript.sh" 4 | cluster: "slurm-submit.py" 5 | cluster-status: "slurm-status.py" 6 | restart-times: 5 7 | max-jobs-per-second: 5 8 | max-status-checks-per-second: 5 9 | local-cores: 1 10 | latency-wait: 15 11 | use-conda: True 12 | use-singularity: True 13 | jobs: 64 14 | printshellcmds: True 15 | keep-incomplete: True 16 | notemp: True 17 | rerun-incomplete: False 18 | default-resources: 19 | - runtime=720 20 | - mem_mb=64000 21 | - partition=cpu-single 22 | - threads=1 23 | -------------------------------------------------------------------------------- /config/slurm/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "SBATCH_DEFAULTS": "", 3 | "CLUSTER_NAME": "", 4 | "CLUSTER_CONFIG": "" 5 | } 6 | -------------------------------------------------------------------------------- /config/slurm/slurm-jobscript.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # properties = {properties} 3 | {exec_job} 4 | 
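The SLURM profile above (`CookieCutter.py`, `config.yaml`, `slurm-submit.py` and the companion scripts) is what Snakemake consumes through its `--profile` option. As an illustrative sketch only (these commands are not shipped with the repository; they assume Snakemake with profile support is installed, the working directory is the repository root, and that the final target `plt/figs.txt` produced by rule `plt_figs` further down is the desired endpoint):
```
# Hypothetical usage sketch, not a command from the repo.
# Dry-run first to see which jobs would be submitted to SLURM:
snakemake --profile config/slurm -n
# Then build an example final target (rule plt_figs touches plt/figs.txt):
snakemake --profile config/slurm plt/figs.txt
```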
-------------------------------------------------------------------------------- /docs/dbs.md: -------------------------------------------------------------------------------- 1 | ## Adding databases 2 | 3 | Databases are sorted into different categories: 4 | - c2g: CRE to gene databases (e.g. eQTL studies) 5 | - cre: CRE annotation databases (e.g. ENCODE) 6 | - gen: general genome annotations (e.g. Lambert, ENSEMBL) 7 | - gst: Gene set databases (e.g. REACTOME) 8 | - ont: Ontology databases 9 | - prt: TF perturbation databases (e.g. KnockTF) 10 | - tfb: TF binding databases (e.g. ChIP-Atlas) 11 | - tfm: TF marker databases (e.g. TF-Marker) 12 | - tfp: TF-TF interaction databases (e.g. IntAct) 13 | - tss: TSS databases (e.g. ENSEMBL) 14 | 15 | A URL from which to download the database should be provided in the `config/config.yaml` file. 16 | ``` 17 | # Databases 18 | dbs: 19 | hg38: 20 | gen: 21 | ... 22 | prt: 23 | ... 24 | gst: 25 | ... 26 | newdatabase: 'https:// ...' 27 | mm10: 28 | ... 29 | ``` 30 | Note that databases are divided by organism. 31 | 32 | Rules for each of these categories can be found in `workflow/rules/dbs/`. New databases should be added to their corresponding rule file. 33 | If a database does not fit any of these categories, a new rule file can be created. 34 | 35 | Here is an example of a rule for a CRE database: 36 | ``` 37 | rule cre_encode: 38 | threads: 1 39 | output: 'dbs/hg38/cre/encode/encode.bed' 40 | params: 41 | url=config['dbs']['hg38']['cre']['encode'] 42 | shell: 43 | """ 44 | ... 45 | """ 46 | ``` 47 | 48 | Rules should follow this naming convention: `{dbtype}_{dbname}`, in this case `cre_encode`. 49 | The output should be stored using this path format: `dbs/{organism}/{dbtype}/{dbname}/{dbname}.bed`. 50 | When possible, use the `.bed` format; otherwise use `.csv`. 51 | -------------------------------------------------------------------------------- /workflow/envs/celloracle.def: -------------------------------------------------------------------------------- 1 | Bootstrap: docker 2 | From: ubuntu:20.04 3 | 4 | 5 | %environment 6 | export PATH=/opt/:$PATH 7 | . "/opt/conda/etc/profile.d/conda.sh" 8 | . "/opt/conda/etc/profile.d/mamba.sh" 9 | conda activate env 10 | 11 | %post 12 | 13 | # update apt 14 | apt update -y 15 | 16 | # basic packages (~2 min) 17 | DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata 18 | apt install -y build-essential \ 19 | gcc \ 20 | libstdc++6 \ 21 | cmake \ 22 | wget \ 23 | curl \ 24 | libcurl4-openssl-dev \ 25 | libssl-dev \ 26 | libxml2-dev \ 27 | libcairo2-dev \ 28 | libxt-dev \ 29 | libopenblas-dev \ 30 | bedtools 31 | 32 | # conda 33 | wget -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" 34 | bash Miniforge3.sh -b -p "/opt/conda" 35 | . "/opt/conda/etc/profile.d/conda.sh" 36 | . 
"/opt/conda/etc/profile.d/mamba.sh" 37 | conda activate 38 | 39 | # Create env 40 | mamba create -y -n=env -c conda-forge -c bioconda \ 41 | python=3.10 \ 42 | r-base==4.2 \ 43 | r-monocle3 \ 44 | r-vgam \ 45 | r-glasso \ 46 | bioconductor-gviz \ 47 | bioconductor-genomicranges \ 48 | bioconductor-rtracklayer \ 49 | bioconductor-rhdf5 \ 50 | r-devtools \ 51 | pip \ 52 | cython \ 53 | pybedtools \ 54 | muon 55 | 56 | # Install cicero 57 | conda activate env 58 | Rscript -e "devtools::install_github('cole-trapnell-lab/cicero-release', ref = 'monocle3', upgrade = 'never')" 59 | Rscript -e "remove.packages('irlba'); install.packages('irlba', repos = 'https://cloud.r-project.org')" 60 | 61 | # Install CellOracle 62 | pip install celloracle==0.16.0 pybedtools==0.9.0 scikit-learn==1.1.3 63 | 64 | # Remove cache for lighter containers 65 | pip cache purge 66 | conda clean -a -y 67 | -------------------------------------------------------------------------------- /workflow/envs/dictys.def: -------------------------------------------------------------------------------- 1 | Bootstrap: docker 2 | From: ubuntu:20.04 3 | 4 | 5 | %environment 6 | export PATH=/opt/:$PATH 7 | . "/opt/conda/etc/profile.d/conda.sh" 8 | . "/opt/conda/etc/profile.d/mamba.sh" 9 | conda activate env 10 | 11 | %post 12 | 13 | # update apt 14 | apt update -y 15 | 16 | # basic packages (~2 min) 17 | DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata 18 | apt install -y build-essential \ 19 | gcc \ 20 | libstdc++6 \ 21 | cmake \ 22 | wget \ 23 | curl \ 24 | libcurl4-openssl-dev \ 25 | libssl-dev \ 26 | libxml2-dev \ 27 | libcairo2-dev \ 28 | libxt-dev \ 29 | libopenblas-dev \ 30 | bedtools 31 | 32 | # conda 33 | wget -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" 34 | bash Miniforge3.sh -b -p "/opt/conda" 35 | . "/opt/conda/etc/profile.d/conda.sh" 36 | . "/opt/conda/etc/profile.d/mamba.sh" 37 | conda activate 38 | 39 | # Create env 40 | mamba create -y -n=env -c lingfeiwang -c conda-forge -c bioconda -c pytorch \ 41 | python=3.10 \ 42 | pip \ 43 | dictys \ 44 | pytorch \ 45 | torchvision \ 46 | torchaudio \ 47 | cpuonly \ 48 | jupyterlab \ 49 | mudata 50 | 51 | # Remove cache for lighter containers 52 | pip cache purge 53 | conda clean -a -y 54 | -------------------------------------------------------------------------------- /workflow/envs/dictys.yaml: -------------------------------------------------------------------------------- 1 | name: dictys 2 | channels: 3 | - lingfeiwang 4 | - bioconda 5 | - conda-forge 6 | - pytorch 7 | - nvidia 8 | dependencies: 9 | - python=3.10 10 | - dictys 11 | - pytorch 12 | - torchvision 13 | - torchaudio 14 | - pytorch-cuda=11.7 15 | - mudata 16 | - pip 17 | - pip: 18 | - torch --index-url https://download.pytorch.org/whl/cu118 19 | - torchvision --index-url https://download.pytorch.org/whl/cu118 20 | - torchaudio --index-url https://download.pytorch.org/whl/cu118 21 | -------------------------------------------------------------------------------- /workflow/envs/granie.def: -------------------------------------------------------------------------------- 1 | Bootstrap: docker 2 | From: ubuntu:20.04 3 | 4 | 5 | %environment 6 | export PATH=/opt/:$PATH 7 | . "/opt/conda/etc/profile.d/conda.sh" 8 | . 
"/opt/conda/etc/profile.d/mamba.sh" 9 | conda activate env 10 | 11 | %post 12 | 13 | # update apt 14 | apt update -y 15 | 16 | # basic packages (~2 min) 17 | DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata 18 | apt install -y build-essential \ 19 | gcc \ 20 | cmake \ 21 | wget \ 22 | curl \ 23 | libcurl4-openssl-dev \ 24 | libssl-dev \ 25 | libxml2-dev \ 26 | libcairo2-dev \ 27 | libxt-dev \ 28 | libopenblas-dev \ 29 | bedtools 30 | 31 | # conda 32 | wget -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" 33 | bash Miniforge3.sh -b -p "/opt/conda" 34 | . "/opt/conda/etc/profile.d/conda.sh" 35 | . "/opt/conda/etc/profile.d/mamba.sh" 36 | conda activate 37 | 38 | # Create env 39 | mamba create -y -n=env -c conda-forge -c bioconda \ 40 | python=3.10 \ 41 | r-base==4.3 \ 42 | r-futile.logger \ 43 | r-checkmate \ 44 | r-patchwork \ 45 | r-reshape2 \ 46 | r-data.table \ 47 | r-matrixstats \ 48 | r-matrix \ 49 | bioconductor-genomicranges \ 50 | r-rcolorbrewer \ 51 | bioconductor-complexheatmap \ 52 | bioconductor-deseq2 \ 53 | r-circlize \ 54 | r-progress \ 55 | r-stringr \ 56 | r-scales \ 57 | r-igraph \ 58 | bioconductor-s4vectors \ 59 | r-ggplot2 \ 60 | r-rlang \ 61 | bioconductor-biostrings \ 62 | bioconductor-genomeinfodb \ 63 | bioconductor-summarizedexperiment \ 64 | r-forcats \ 65 | r-gridextra \ 66 | bioconductor-limma \ 67 | r-tidyselect \ 68 | r-readr \ 69 | r-tidyr \ 70 | r-dplyr \ 71 | r-magrittr \ 72 | r-tibble \ 73 | r-viridis \ 74 | r-colorspace \ 75 | bioconductor-biomart \ 76 | bioconductor-topgo \ 77 | bioconductor-annotationhub \ 78 | bioconductor-ensembldb \ 79 | r-devtools \ 80 | bioconductor-rhdf5 \ 81 | r-irkernel \ 82 | mudata \ 83 | decoupler-py==1.8.0 \ 84 | jupyterlab \ 85 | r-tidyverse \ 86 | bioconductor-org.hs.eg.db \ 87 | bioconductor-txdb.hsapiens.ucsc.hg38.knowngene \ 88 | bioconductor-bsgenome.hsapiens.ucsc.hg38 \ 89 | r-batchtools 90 | 91 | # Install granie 92 | conda activate env 93 | Rscript -e "devtools::install_gitlab('grp-zaugg/GRaNIE@6f1f4ddd96f2932e15ca60fb8554e74de842f7e4', host = 'git.embl.de', subdir = 'src/GRaNIE', upgrade = 'never')" 94 | 95 | # Remove cache for lighter containers 96 | pip cache purge 97 | conda clean -a -y 98 | -------------------------------------------------------------------------------- /workflow/envs/gretabench.def: -------------------------------------------------------------------------------- 1 | Bootstrap: docker 2 | From: ubuntu:20.04 3 | 4 | 5 | %environment 6 | export PATH=/opt/:$PATH 7 | . "/opt/conda/etc/profile.d/conda.sh" 8 | . "/opt/conda/etc/profile.d/mamba.sh" 9 | conda activate env 10 | 11 | %post 12 | 13 | # update apt 14 | apt update -y 15 | 16 | # basic packages (~2 min) 17 | DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata 18 | apt install -y build-essential \ 19 | gcc \ 20 | libstdc++6 \ 21 | cmake \ 22 | wget \ 23 | curl \ 24 | git \ 25 | libcurl4-openssl-dev \ 26 | libssl-dev \ 27 | libxml2-dev \ 28 | libcairo2-dev \ 29 | libxt-dev \ 30 | libopenblas-dev \ 31 | bedtools \ 32 | tabix 33 | 34 | # conda 35 | wget -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" 36 | bash Miniforge3.sh -b -p "/opt/conda" 37 | . "/opt/conda/etc/profile.d/conda.sh" 38 | . 
"/opt/conda/etc/profile.d/mamba.sh" 39 | conda activate 40 | 41 | # Create env 42 | mamba create -y -n=env -c conda-forge -c bioconda -c colomoto \ 43 | python=3.10 \ 44 | pip \ 45 | muon==0.1.5 \ 46 | scanpy==1.9.8 \ 47 | leidenalg \ 48 | harmonypy \ 49 | jupyterlab \ 50 | r-base==4.3 \ 51 | bioconductor-biomart \ 52 | cython \ 53 | polars \ 54 | hmmlearn \ 55 | plotly \ 56 | pooch \ 57 | python-kaleido \ 58 | multiprocess \ 59 | pyarrow \ 60 | rustworkx \ 61 | dill \ 62 | macs3 \ 63 | scrublet \ 64 | decoupler-py==1.7.0 \ 65 | py-xgboost \ 66 | pyranges \ 67 | statannotations \ 68 | numba==0.59.1 \ 69 | pyboolnet 70 | 71 | conda activate env 72 | pip install mofapy2 marsilea==0.3.2 snapatac2==2.6.0 celloracle==0.18.0 scipy==1.12.0 ipykernel 73 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib 74 | 75 | # Remove cache for lighter containers 76 | pip cache purge 77 | conda clean -a -y 78 | -------------------------------------------------------------------------------- /workflow/rules/anl/dbs.smk: -------------------------------------------------------------------------------- 1 | rule dbs_stats: 2 | threads: 1 3 | input: 4 | paths_prt=expand('dbs/hg38/prt/{prt}/meta.csv', prt=config['dbs']['hg38']['prt'].keys()), 5 | paths_gst=expand('dbs/hg38/gst/{gst}.csv', gst=config['dbs']['hg38']['gst'].keys()), 6 | paths_tfm=expand('dbs/hg38/tfm/{tfm}/{tfm}.tsv', tfm=config['dbs']['hg38']['tfm'].keys()), 7 | paths_tfp=expand('dbs/hg38/tfp/{tfp}/{tfp}.tsv', tfp=config['dbs']['hg38']['tfp'].keys()), 8 | paths_tfb=expand('dbs/hg38/tfb/{tfb}/{tfb}.bed', tfb=config['dbs']['hg38']['tfb'].keys()), 9 | paths_cre=expand('dbs/hg38/cre/{cre}/{cre}.bed', cre=config['dbs']['hg38']['cre'].keys()), 10 | paths_c2g=expand('dbs/hg38/c2g/{c2g}/{c2g}.bed', c2g=config['dbs']['hg38']['c2g'].keys()), 11 | output: 'anl/dbs/stats.csv' 12 | resources: 13 | mem_mb=32000 14 | shell: 15 | """ 16 | python workflow/scripts/anl/dbs/stats.py \ 17 | -p {input.paths_prt} \ 18 | -g {input.paths_gst} \ 19 | -m {input.paths_tfm} \ 20 | -t {input.paths_tfp} \ 21 | -b {input.paths_tfb} \ 22 | -c {input.paths_cre} \ 23 | -e {input.paths_c2g} \ 24 | -o {output} 25 | """ 26 | 27 | 28 | rule dbs_terms: 29 | threads: 1 30 | singularity: 'workflow/envs/gretabench.sif' 31 | input: 32 | paths_prt=expand('dbs/hg38/prt/{prt}/meta.csv', prt=config['dbs']['hg38']['prt'].keys()), 33 | paths_tfm=expand('dbs/hg38/tfm/{tfm}/{tfm}.tsv', tfm=config['dbs']['hg38']['tfm'].keys()), 34 | paths_tfb=expand('dbs/hg38/tfb/{tfb}/{tfb}.bed', tfb=config['dbs']['hg38']['tfb'].keys()), 35 | paths_cre=expand('dbs/hg38/cre/{cre}/{cre}.bed', cre=config['dbs']['hg38']['cre'].keys()), 36 | paths_c2g=expand('dbs/hg38/c2g/{c2g}/{c2g}.bed', c2g=config['dbs']['hg38']['c2g'].keys()), 37 | output: 'anl/dbs/terms.csv' 38 | resources: 39 | mem_mb=64000 40 | shell: 41 | """ 42 | python workflow/scripts/anl/dbs/terms.py -i {input} -o {output} 43 | """ 44 | 45 | 46 | rule dbs_ocoef: 47 | threads: 1 48 | singularity: 'workflow/envs/gretabench.sif' 49 | input: 'anl/dbs/stats.csv', 50 | output: 'anl/dbs/ocoef.csv', 51 | shell: 52 | """ 53 | python workflow/scripts/anl/dbs/ocoef.py {output} 54 | """ 55 | -------------------------------------------------------------------------------- /workflow/rules/anl/dts.smk: -------------------------------------------------------------------------------- 1 | localrules: dts_qcstats 2 | 3 | 4 | rule dts_qcstats: 5 | threads: 1 6 | singularity: 'workflow/envs/gretabench.sif' 7 | input: rules.extract_case.output.mdata 8 | output: 9 | 
qc='anl/dts/{dat}.{case}.qc.csv', 10 | nc='anl/dts/{dat}.{case}.nc.csv', 11 | shell: 12 | """ 13 | python workflow/scripts/anl/dts/qcstats.py \ 14 | {input} {output.qc} {output.nc} 15 | """ 16 | -------------------------------------------------------------------------------- /workflow/rules/anl/metrics/mech.smk: -------------------------------------------------------------------------------- 1 | localrules: mech_tfa 2 | rule mech_tfa: 3 | threads: 1 4 | singularity: 'workflow/envs/gretabench.sif' 5 | input: 6 | grn=lambda wildcards: rules.grn_run.output.out.format(**wildcards), 7 | rsc=rules.prt_knocktf.output.dir, 8 | output: 9 | out='anl/metrics/mech/tfa/{db}/{dat}.{case}/{pre}.{p2g}.{tfb}.{mdl}.scores.csv' 10 | shell: 11 | """ 12 | python workflow/scripts/anl/metrics/mech/tfa.py \ 13 | -i {input.grn} \ 14 | -b {input.rsc} \ 15 | -o {output.out} 16 | """ 17 | 18 | 19 | rule mech_prt: 20 | threads: 16 21 | singularity: 'workflow/envs/gretabench.sif' 22 | input: 23 | grn=lambda wildcards: rules.grn_run.output.out.format(**wildcards), 24 | rsc=rules.prt_knocktf.output.dir, 25 | output: 26 | out='anl/metrics/mech/prt/{db}/{dat}.{case}/{pre}.{p2g}.{tfb}.{mdl}.scores.csv' 27 | resources: 28 | mem_mb=restart_mem, 29 | runtime=config['max_mins_per_step'] * 2, 30 | shell: 31 | """ 32 | set +e 33 | timeout $(({resources.runtime}-20))m \ 34 | python workflow/scripts/anl/metrics/mech/prt.py \ 35 | -i {input.grn} \ 36 | -b {input.rsc} \ 37 | -o {output.out} 38 | if [ $? -eq 124 ]; then 39 | awk 'BEGIN {{ print "name,prc,rcl,f01" }}' > {output.out} 40 | fi 41 | """ 42 | 43 | 44 | rule extract_mech_tfm: 45 | threads: 1 46 | singularity: 'workflow/envs/gretabench.sif' 47 | input: 48 | mdata=rules.extract_case.output.mdata, 49 | tf=rules.gen_tfs_lambert.output, 50 | output: 'anl/metrics/mech/sss/sss/{dat}.{case}/tfm.csv' 51 | shell: 52 | """ 53 | python workflow/scripts/anl/metrics/mech/tfm.py {input.mdata} {input.tf} {output} 54 | """ 55 | 56 | 57 | rule mech_sss: 58 | threads: 1 59 | singularity: 'workflow/envs/gretabench.sif' 60 | input: 61 | grn=lambda wildcards: rules.grn_run.output.out.format(**wildcards), 62 | tfm=rules.extract_mech_tfm.output, 63 | output: 64 | out='anl/metrics/mech/sss/sss/{dat}.{case}/{pre}.{p2g}.{tfb}.{mdl}.scores.csv' 65 | params: 66 | thr_pval=0.01, 67 | resources: 68 | mem_mb=8000, 69 | runtime=60, 70 | shell: 71 | """ 72 | set +e 73 | timeout $(({resources.runtime}-20))m \ 74 | python workflow/scripts/anl/metrics/mech/sim.py {input.grn} {input.tfm} {params.thr_pval} {output.out} 75 | if [ $? 
-eq 124 ]; then 76 | awk 'BEGIN {{ print "name,prc,rcl,f01" }}' > {output.out} 77 | fi 78 | """ 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /workflow/rules/anl/metrics/pred.smk: -------------------------------------------------------------------------------- 1 | rule pred_omics: 2 | threads: 1 3 | singularity: 'workflow/envs/gretabench.sif' 4 | input: 5 | grn=lambda w: rules.grn_run.output.out.format(**w), 6 | output: 7 | out='anl/metrics/pred/omics/{db}/{dat}.{case}/{pre}.{p2g}.{tfb}.{mdl}.scores.csv' 8 | params: 9 | col_source=lambda w: 'cre' if w.db == 'gcre' else 'source', 10 | col_target=lambda w: 'cre' if w.db == 'cretf' else 'target', 11 | mod_source=lambda w: 'atac' if w.db == 'gcre' else 'rna', 12 | mod_target=lambda w: 'atac' if w.db == 'cretf' else 'rna', 13 | shell: 14 | """ 15 | python workflow/scripts/anl/metrics/pred/omics.py \ 16 | -a {input.grn} \ 17 | -b {params.col_source} \ 18 | -c {params.col_target} \ 19 | -d {params.mod_source} \ 20 | -e {params.mod_target} \ 21 | -f {output} 22 | """ 23 | 24 | 25 | rule pred_gsets: 26 | threads: 1 27 | singularity: 'workflow/envs/gretabench.sif' 28 | input: 29 | grn=lambda w: rules.grn_run.output.out.format(**w), 30 | rsc='dbs/hg38/gst/{db}.csv' 31 | output: 32 | out='anl/metrics/pred/gsets/{db}/{dat}.{case}/{pre}.{p2g}.{tfb}.{mdl}.scores.csv' 33 | shell: 34 | """ 35 | python workflow/scripts/anl/metrics/pred/gsets.py \ 36 | -i {input.grn} \ 37 | -p {input.rsc} \ 38 | -o {output} 39 | """ 40 | -------------------------------------------------------------------------------- /workflow/rules/anl/metrics/prior.smk: -------------------------------------------------------------------------------- 1 | localrules: prior_tfm, prior_tfp, prior_cre 2 | 3 | 4 | rule prior_tfm: 5 | threads: 1 6 | singularity: 'workflow/envs/gretabench.sif' 7 | input: 8 | grn=lambda wildcards: rules.grn_run.output.out.format(**wildcards), 9 | db='dbs/hg38/tfm/{db}/{db}.tsv', 10 | output: 11 | out='anl/metrics/prior/tfm/{db}/{dat}.{case}/{pre}.{p2g}.{tfb}.{mdl}.scores.csv' 12 | shell: 13 | """ 14 | python workflow/scripts/anl/metrics/prior/tfm.py \ 15 | -a {input.grn} \ 16 | -b {input.db} \ 17 | -f {output.out} 18 | """ 19 | 20 | 21 | rule prior_tfp: 22 | threads: 1 23 | singularity: 'workflow/envs/gretabench.sif' 24 | input: 25 | grn=lambda wildcards: rules.grn_run.output.out.format(**wildcards), 26 | db='dbs/hg38/tfp/{db}/{db}.tsv', 27 | output: 28 | out='anl/metrics/prior/tfp/{db}/{dat}.{case}/{pre}.{p2g}.{tfb}.{mdl}.scores.csv' 29 | params: 30 | thr_p=0.01, 31 | shell: 32 | """ 33 | python workflow/scripts/anl/metrics/prior/tfp.py \ 34 | {input.grn} {input.db} {params.thr_p} {output.out} 35 | """ 36 | 37 | 38 | rule prior_tfb: 39 | threads: 1 40 | singularity: 'workflow/envs/gretabench.sif' 41 | input: 42 | grn=lambda wildcards: rules.grn_run.output.out.format(**wildcards), 43 | db='dbs/hg38/tfb/{db}/{db}.bed', 44 | output: 45 | out='anl/metrics/prior/tfb/{db}/{dat}.{case}/{pre}.{p2g}.{tfb}.{mdl}.scores.csv' 46 | params: 47 | grp='source', 48 | shell: 49 | """ 50 | python workflow/scripts/anl/metrics/prior/gnm.py \ 51 | -a {input.grn} \ 52 | -b {input.db} \ 53 | -d {params.grp} \ 54 | -f {output} 55 | """ 56 | 57 | 58 | rule prior_cre: 59 | threads: 1 60 | singularity: 'workflow/envs/gretabench.sif' 61 | input: 62 | grn=lambda wildcards: rules.grn_run.output.out.format(**wildcards), 63 | db='dbs/hg38/cre/{db}/{db}.bed', 64 | output: 65 | 
out='anl/metrics/prior/cre/{db}/{dat}.{case}/{pre}.{p2g}.{tfb}.{mdl}.scores.csv' 66 | shell: 67 | """ 68 | python workflow/scripts/anl/metrics/prior/gnm.py \ 69 | -a {input.grn} \ 70 | -b {input.db} \ 71 | -f {output} 72 | """ 73 | 74 | 75 | rule prior_c2g: 76 | threads: 1 77 | singularity: 'workflow/envs/gretabench.sif' 78 | input: 79 | grn=lambda wildcards: rules.grn_run.output.out.format(**wildcards), 80 | resource='dbs/hg38/c2g/{db}/{db}.bed', 81 | output: 82 | out='anl/metrics/prior/c2g/{db}/{dat}.{case}/{pre}.{p2g}.{tfb}.{mdl}.scores.csv' 83 | params: 84 | grp='target', 85 | shell: 86 | """ 87 | python workflow/scripts/anl/metrics/prior/gnm.py \ 88 | -a {input.grn} \ 89 | -b {input.resource} \ 90 | -d {params.grp} \ 91 | -f {output} 92 | """ 93 | -------------------------------------------------------------------------------- /workflow/rules/anl/metrics/utils.smk: -------------------------------------------------------------------------------- 1 | localrules: aggr_metric, metric_summ 2 | 3 | 4 | rule aggr_metric: 5 | threads: 1 6 | input: 7 | lambda w: make_combs_rules(w=w, mthds=mthds, baselines=baselines, rule_name='{typ}_{tsk}'.format(typ=w.type, tsk=w.task)) 8 | output: 9 | 'anl/metrics/{type}/{task}/{db}/{dat}.{case}.scores.csv' 10 | shell: 11 | """ 12 | python workflow/scripts/anl/metrics/aggregate.py \ 13 | -i {input} \ 14 | -o {output} 15 | """ 16 | 17 | 18 | rule metric_summ: 19 | threads: 1 20 | singularity: 'workflow/envs/gretabench.sif' 21 | input: 22 | [ 23 | 'anl/metrics/mech/prt/knocktf/{dat}.{case}.scores.csv', 24 | 'anl/metrics/mech/tfa/knocktf/{dat}.{case}.scores.csv', 25 | 'anl/metrics/mech/sss/sss/{dat}.{case}.scores.csv', 26 | 'anl/metrics/pred/omics/gtf/{dat}.{case}.scores.csv', 27 | 'anl/metrics/pred/omics/cretf/{dat}.{case}.scores.csv', 28 | 'anl/metrics/pred/omics/gcre/{dat}.{case}.scores.csv', 29 | 'anl/metrics/pred/gsets/kegg/{dat}.{case}.scores.csv', 30 | 'anl/metrics/pred/gsets/hall/{dat}.{case}.scores.csv', 31 | 'anl/metrics/pred/gsets/reac/{dat}.{case}.scores.csv', 32 | 'anl/metrics/pred/gsets/prog/{dat}.{case}.scores.csv', 33 | 'anl/metrics/prior/tfm/hpa/{dat}.{case}.scores.csv', 34 | 'anl/metrics/prior/tfm/tfmdb/{dat}.{case}.scores.csv', 35 | 'anl/metrics/prior/tfp/europmc/{dat}.{case}.scores.csv', 36 | 'anl/metrics/prior/tfp/intact/{dat}.{case}.scores.csv', 37 | 'anl/metrics/prior/tfb/chipatlas/{dat}.{case}.scores.csv', 38 | 'anl/metrics/prior/tfb/remap2022/{dat}.{case}.scores.csv', 39 | 'anl/metrics/prior/tfb/unibind/{dat}.{case}.scores.csv', 40 | 'anl/metrics/prior/cre/blacklist/{dat}.{case}.scores.csv', 41 | 'anl/metrics/prior/cre/encode/{dat}.{case}.scores.csv', 42 | 'anl/metrics/prior/cre/gwascatalogue/{dat}.{case}.scores.csv', 43 | 'anl/metrics/prior/cre/phastcons/{dat}.{case}.scores.csv', 44 | 'anl/metrics/prior/cre/zhang21/{dat}.{case}.scores.csv', 45 | 'anl/metrics/prior/cre/promoters/{dat}.{case}.scores.csv', 46 | 'anl/metrics/prior/c2g/eqtlcatalogue/{dat}.{case}.scores.csv', 47 | ] 48 | output: 'anl/metrics/summary/{dat}.{case}.csv' 49 | shell: 50 | """ 51 | python workflow/scripts/anl/metrics/test.py -m {input} -o {output} 52 | """ 53 | -------------------------------------------------------------------------------- /workflow/rules/anl/pair.smk: -------------------------------------------------------------------------------- 1 | localrules: pair_realsim, pair_fakesim 2 | 3 | 4 | rule pair_real_cor: 5 | threads: 1 6 | singularity: 'workflow/envs/gretabench.sif' 7 | input: 8 | pair='dts/{dname}pair/cases/{case}/mdata.h5mu', 9 | 
npair='dts/{dname}npair/cases/{case}/mdata.h5mu', 10 | output: 11 | cors='anl/pair/{dname}.{case}.real_corvals.csv', 12 | stat='anl/pair/{dname}.{case}.real_corsstat.csv', 13 | singularity: 14 | 'workflow/envs/gretabench.sif' 15 | shell: 16 | """ 17 | python workflow/scripts/anl/pair/real_cors.py \ 18 | -a {input.pair} \ 19 | -b {input.npair} \ 20 | -c {output.cors} \ 21 | -d {output.stat} 22 | """ 23 | 24 | 25 | rule pair_fake_stats: 26 | threads: 1 27 | singularity: 'workflow/envs/gretabench.sif' 28 | input: 29 | mdata='dts/{dname}pair/cases/{case}/mdata.h5mu', 30 | barmap='dts/fake{dname}pair/barmap.csv', 31 | output: 32 | knn='anl/pair/{dname}.{case}.fake_knn.csv', 33 | cor='anl/pair/{dname}.{case}.fake_cor.csv', 34 | prp='anl/pair/{dname}.{case}.fake_prp.csv', 35 | singularity: 36 | 'workflow/envs/gretabench.sif' 37 | shell: 38 | """ 39 | python workflow/scripts/anl/pair/fake_stats.py \ 40 | -a {input.mdata} \ 41 | -b {input.barmap} \ 42 | -c {output.knn} \ 43 | -d {output.cor} \ 44 | -e {output.prp} 45 | """ 46 | 47 | 48 | rule pair_realsim: 49 | threads: 1 50 | singularity: 'workflow/envs/gretabench.sif' 51 | input: 52 | p='anl/topo/{dname}pair.{case}.sims_mult.csv', 53 | n='anl/topo/{dname}npair.{case}.sims_mult.csv', 54 | output: 'anl/pair/{dname}.{case}.pvsn.csv' 55 | shell: 56 | """ 57 | python workflow/scripts/anl/pair/pairsim.py \ 58 | -a {input.p} \ 59 | -b {input.n} \ 60 | -o {output} 61 | """ 62 | 63 | 64 | rule pair_fakesim: 65 | threads: 1 66 | singularity: 'workflow/envs/gretabench.sif' 67 | input: 68 | p='anl/topo/{dname}pair.{case}.sims_mult.csv', 69 | f='anl/topo/fake{dname}pair.{case}.sims_mult.csv', 70 | output: 'anl/pair/{dname}.{case}.pvsf.csv' 71 | shell: 72 | """ 73 | python workflow/scripts/anl/pair/pairsim.py \ 74 | -a {input.p} \ 75 | -b {input.f} \ 76 | -o {output} 77 | """ 78 | 79 | 80 | rule pair_real_qc: 81 | threads: 1 82 | singularity: 'workflow/envs/gretabench.sif' 83 | input: 84 | pair='dts/{dname}pair/cases/{case}/mdata.h5mu', 85 | npair='dts/{dname}npair/cases/{case}/mdata.h5mu', 86 | output: 87 | qc='anl/pair/{dname}.{case}.qc.csv', 88 | nc='anl/pair/{dname}.{case}.ncells.csv' 89 | shell: 90 | """ 91 | python workflow/scripts/anl/pair/realqc.py {input.pair} {input.npair} {output.qc} {output.nc} 92 | """ 93 | -------------------------------------------------------------------------------- /workflow/rules/anl/topo.smk: -------------------------------------------------------------------------------- 1 | localrules: topo_inter, topo_fvsd 2 | 3 | 4 | rule topo_mult: 5 | threads: 4 6 | singularity: 'workflow/envs/gretabench.sif' 7 | input: 8 | lambda w: make_combs_rules(w=w, mthds=mthds, baselines=baselines, rule_name='grn_run') 9 | output: 10 | stats='anl/topo/{dat}.{case}.stats_mult.csv', 11 | sims='anl/topo/{dat}.{case}.sims_mult.csv', 12 | resources: 13 | mem_mb=128000 14 | shell: 15 | """ 16 | python workflow/scripts/anl/topo/run_pair_sim.py \ 17 | -t {output.stats} \ 18 | -s {output.sims} 19 | """ 20 | 21 | 22 | rule topo_fvsd: 23 | threads: 4 24 | singularity: 'workflow/envs/gretabench.sif' 25 | input: 26 | stats=rules.topo_mult.output.stats, 27 | sims=rules.topo_mult.output.sims, 28 | output: 'anl/topo/{dat}.{case}.fvsd.csv', 29 | shell: 30 | """ 31 | python workflow/scripts/anl/topo/fvsd.py {input.sims} {input.stats} {output} 32 | """ 33 | 34 | 35 | rule topo_inter: 36 | threads: 1 37 | input: 38 | lambda w: make_combs_rules(w=w, mthds=mthds, baselines=baselines, rule_name='grn_run') 39 | output: 'anl/topo/{dat}.{case}.inter.csv', 40 | params: 
min_prop=config['topo_min_prop'] 41 | shell: 42 | """ 43 | python workflow/scripts/anl/topo/inter.py \ 44 | -g {input} \ 45 | -b {baselines} \ 46 | -p {params.min_prop} \ 47 | -o {output} 48 | """ 49 | -------------------------------------------------------------------------------- /workflow/rules/anl/tss.smk: -------------------------------------------------------------------------------- 1 | localrules: tss_aggr 2 | 3 | 4 | rule tss_gocoef: 5 | threads: 1 6 | singularity: 'workflow/envs/gretabench.sif' 7 | input: 8 | tss_a='dbs/hg38/gen/tss/{mth_a}.bed', 9 | tss_b='dbs/hg38/gen/tss/{mth_b}.bed', 10 | output: temp(local('anl/tss/ocoef/{mth_a}.{mth_b}.csv')) 11 | resources: 12 | mem_mb=2000, 13 | runtime=config['max_mins_per_step'], 14 | shell: 15 | """ 16 | python workflow/scripts/anl/tss/gocoef.py \ 17 | -a {input.tss_a} \ 18 | -b {input.tss_b} \ 19 | -o {output} 20 | """ 21 | 22 | 23 | tss_paths = [f'anl/tss/ocoef/{mth_a}.{mth_b}.csv' for mth_a, mth_b in combinations([x for x in mthds + baselines], 2)] 24 | rule tss_aggr: 25 | threads: 1 26 | singularity: 'workflow/envs/gretabench.sif' 27 | input: tss_paths 28 | output: "anl/tss/ocoef.csv" 29 | shell: 30 | """ 31 | python -c "import pandas as pd; import sys; \ 32 | tss_paths = sys.argv[1:]; \ 33 | df = pd.concat([pd.read_csv(tss_path) for tss_path in tss_paths]); \ 34 | df.to_csv('{output}', index=False);" {input} 35 | """ 36 | 37 | 38 | rule tss_dist: 39 | threads: 1 40 | singularity: 'workflow/envs/gretabench.sif' 41 | input: 42 | c=rules.tss_aggr.output, 43 | g='anl/topo/{dat}.{case}.stats_mult.csv' 44 | output: "anl/tss/{dat}.{case}.dist.csv" 45 | resources: 46 | mem_mb=restart_mem, 47 | runtime=config['max_mins_per_step'], 48 | params: 49 | b=baselines, 50 | shell: 51 | """ 52 | python workflow/scripts/anl/tss/dist.py \ 53 | -g {input.g} \ 54 | -b {params.b} \ 55 | -o {output} 56 | """ 57 | -------------------------------------------------------------------------------- /workflow/rules/dbs/cre.smk: -------------------------------------------------------------------------------- 1 | localrules: cre_blacklist, cre_encode, cre_gwascatalogue, cre_phastcons, cre_promoters, cre_zhang21 2 | 3 | 4 | rule cre_blacklist: 5 | threads: 1 6 | output: 'dbs/hg38/cre/blacklist/blacklist.bed' 7 | params: 8 | url=config['dbs']['hg38']['cre']['blacklist'] 9 | shell: 10 | """ 11 | wget --no-verbose -O - "{params.url}" | zcat > {output} 12 | """ 13 | 14 | 15 | rule cre_encode: 16 | threads: 1 17 | singularity: 'workflow/envs/gretabench.sif' 18 | output: 'dbs/hg38/cre/encode/encode.bed' 19 | params: 20 | url=config['dbs']['hg38']['cre']['encode'] 21 | shell: 22 | """ 23 | wget --no-verbose '{params.url}' -O {output}.tmp && \ 24 | cat {output}.tmp | sort -k 1,1 -k2,2n | bedtools merge -c 6 -o distinct > {output} && \ 25 | rm {output}.tmp 26 | """ 27 | 28 | 29 | rule cre_gwascatalogue: 30 | threads: 1 31 | singularity: 'workflow/envs/gretabench.sif' 32 | output: 'dbs/hg38/cre/gwascatalogue/gwascatalogue.bed' 33 | params: 34 | url=config['dbs']['hg38']['cre']['gwascatalogue'] 35 | shell: 36 | """ 37 | wget --no-verbose '{params.url}' -O {output} && \ 38 | python workflow/scripts/dbs/cre/gwascatalogue.py -i {output} && \ 39 | sort -k 1,1 -k2,2n {output} | bedtools merge -i - -c 4,5 -o distinct,distinct -delim "|" > {output}.tmp && \ 40 | mv {output}.tmp {output} 41 | """ 42 | 43 | 44 | rule cre_phastcons: 45 | threads: 1 46 | singularity: 'workflow/envs/pando.sif' 47 | output: 'dbs/hg38/cre/phastcons/phastcons.bed' 48 | params: 49 | 
url=config['dbs']['hg38']['cre']['phastcons'] 50 | shell: 51 | """ 52 | wget --no-verbose '{params.url}' -O {output}.tmp && \ 53 | Rscript -e " \ 54 | df <- get(load('{output}.tmp')); \ 55 | df <- GenomicRanges::reduce(df); \ 56 | df <- as.data.frame(df)[, c('seqnames', 'start', 'end')]; \ 57 | write.table(x=df, file='{output}.tmp', sep = '\t', quote=FALSE, row.names=FALSE, col.names=FALSE)" && \ 58 | sort -k 1,1 -k2,2n {output}.tmp > {output} && \ 59 | rm {output}.tmp 60 | """ 61 | 62 | 63 | rule cre_promoters: 64 | threads: 1 65 | singularity: 'workflow/envs/gretabench.sif' 66 | output: 'dbs/hg38/cre/promoters/promoters.bed' 67 | params: 68 | wsize=config['cre_prom_size'] 69 | shell: 70 | """ 71 | Rscript workflow/scripts/dbs/cre/promoters.R \ 72 | {params.wsize} \ 73 | {output} 74 | """ 75 | 76 | 77 | rule cre_zhang21: 78 | threads: 1 79 | singularity: 'workflow/envs/gretabench.sif' 80 | output: 'dbs/hg38/cre/zhang21/zhang21.bed' 81 | params: 82 | url=config['dbs']['hg38']['cre']['zhang21'] 83 | shell: 84 | """ 85 | wget --no-verbose '{params.url}' -O {output}.tmp && \ 86 | zcat {output}.tmp | bedtools merge > {output} && \ 87 | rm {output}.tmp 88 | """ 89 | -------------------------------------------------------------------------------- /workflow/rules/dbs/ont.smk: -------------------------------------------------------------------------------- 1 | localrules: ont_bto 2 | 3 | 4 | checkpoint ont_bto: 5 | threads: 1 6 | singularity: 'workflow/envs/gretabench.sif' 7 | output: 'dbs/ont/bto.tsv' 8 | params: 9 | url=config['dbs']['ont']['bto'], 10 | shell: 11 | """ 12 | wget --no-verbose '{params.url}' -O - | \ 13 | python workflow/scripts/dbs/ont/bto.py {output} 14 | """ -------------------------------------------------------------------------------- /workflow/rules/dbs/prt.smk: -------------------------------------------------------------------------------- 1 | localrules: prt_knocktf 2 | 3 | 4 | rule prt_knocktf: 5 | threads: 1 6 | output: 7 | meta='dbs/hg38/prt/knocktf/meta.csv', 8 | diff='dbs/hg38/prt/knocktf/diff.csv', 9 | dir=directory('dbs/hg38/prt/knocktf/') 10 | params: 11 | url_m=config['dbs']['hg38']['prt']['knocktf']['meta'], 12 | url_d=config['dbs']['hg38']['prt']['knocktf']['diff'], 13 | shell: 14 | """ 15 | wget --no-verbose '{params.url_m}' -O {output.meta} && \ 16 | wget --no-verbose '{params.url_d}' -O {output.diff} 17 | """ 18 | -------------------------------------------------------------------------------- /workflow/rules/dbs/tfm.smk: -------------------------------------------------------------------------------- 1 | localrules: tfm_hpa, tfm_tfmdb 2 | 3 | 4 | rule tfm_hpa: 5 | threads: 1 6 | singularity: 'workflow/envs/gretabench.sif' 7 | input: rules.gen_tfs_lambert.output 8 | output: 'dbs/hg38/tfm/hpa/hpa.tsv' 9 | params: 10 | url=config['dbs']['hg38']['tfm']['hpa'] 11 | shell: 12 | """ 13 | wget --no-verbose '{params.url}' -O {output}.zip && \ 14 | python workflow/scripts/dbs/tfm/hpa.py \ 15 | -i {output}.zip \ 16 | -t {input} \ 17 | -o {output} 18 | """ 19 | 20 | 21 | rule tfm_tfmdb: 22 | threads: 1 23 | singularity: 'workflow/envs/gretabench.sif' 24 | output: 'dbs/hg38/tfm/tfmdb/tfmdb.tsv' 25 | params: 26 | url=config['dbs']['hg38']['tfm']['tfmdb'] 27 | shell: 28 | """ 29 | wget --no-verbose '{params.url}' -O {output} && \ 30 | python -c "import pandas as pd; \ 31 | import sys; \ 32 | df = pd.read_csv(sys.argv[1]); \ 33 | df = df[['Gene Name', 'Cell Name', 'Tissue Type']]; \ 34 | df['ctype'] = df['Cell Name'] + ',' + df['Tissue Type']; \ 35 | df = df.groupby('Gene 
Name', as_index=False)['ctype'].apply(lambda x: ','.join(x)); \ 36 | df['ctype'] = [','.join(sorted(set(s.split(',')))) for s in df['ctype']]; \ 37 | df = df.drop_duplicates(['Gene Name', 'ctype']); \ 38 | df = df.rename(columns={{'Gene Name': 'gene'}}); \ 39 | df = df.sort_values(['gene', 'ctype']); \ 40 | df.to_csv(sys.argv[1], sep='\\t', index=False, header=None)" {output} 41 | """ 42 | -------------------------------------------------------------------------------- /workflow/rules/dbs/tfp.smk: -------------------------------------------------------------------------------- 1 | localrules: download_intact, tfp_intact, tfp_europmc 2 | 3 | 4 | rule download_intact: 5 | output: 6 | temp("dbs/hg38/tfp/intact/raw/intact.txt") 7 | params: 8 | url=config['dbs']['hg38']['tfp']['intact'] 9 | shell: 10 | """ 11 | wget --no-verbose {params.url} -O {output}.zip && \ 12 | unzip -o {output}.zip -d $( dirname {output} ) && \ 13 | rm {output}.zip 14 | """ 15 | 16 | 17 | rule tfp_intact: 18 | input: 19 | inc=rules.download_intact.output, 20 | lmb=rules.gen_tfs_lambert.output, 21 | pid=rules.gen_pid_uniprot.output, 22 | output: 'dbs/hg38/tfp/intact/intact.tsv' 23 | shell: 24 | """ 25 | python workflow/scripts/dbs/tfp/intact.py \ 26 | {input.inc} {input.lmb} {input.pid} {output} 27 | """ 28 | 29 | 30 | rule tfp_europmc_raw: 31 | threads: 1 32 | singularity: 'workflow/envs/gretabench.sif' 33 | input: rules.gen_tfs_lambert.output, 34 | output: 35 | single='dbs/hg38/tfp/europmc/raw/single.csv', 36 | pairs='dbs/hg38/tfp/europmc/raw/pairs.csv' 37 | params: 38 | min_chars=2, 39 | min_n=49 40 | resources: 41 | runtime=config['max_mins_per_step'] * 2, 42 | shell: 43 | """ 44 | python workflow/scripts/dbs/tfp/europmc_raw.py \ 45 | {input} {params.min_chars} {params.min_n} {output.single} {output.pairs} 46 | """ 47 | 48 | 49 | rule tfp_europmc: 50 | threads: 1 51 | singularity: 'workflow/envs/gretabench.sif' 52 | input: 53 | single=rules.tfp_europmc_raw.output.single, 54 | pairs=rules.tfp_europmc_raw.output.pairs, 55 | output: 'dbs/hg38/tfp/europmc/europmc.tsv' 56 | params: 57 | pval_thr=2.2e-16, 58 | min_odds=5, 59 | shell: 60 | """ 61 | python workflow/scripts/dbs/tfp/europmc.py \ 62 | {input.single} {input.pairs} {params.pval_thr} {params.min_odds} {output} 63 | """ 64 | -------------------------------------------------------------------------------- /workflow/rules/dts/fakepair.smk: -------------------------------------------------------------------------------- 1 | localrules: index_frags_fakepair 2 | 3 | 4 | rule index_frags_fakepair: 5 | threads: 1 6 | input: 7 | frags=lambda w: map_rules('download', w_name='{dname}pair'.format(dname=w.dname), out='frags'), 8 | tbis=lambda w: map_rules('download', w_name='{dname}pair'.format(dname=w.dname), out='tbis'), 9 | output: 10 | frags=temp(local('dts/fake{dname}pair/smpl.frags.tsv.gz')), 11 | tbis=temp(local('dts/fake{dname}pair/smpl.frags.tsv.gz.tbi')), 12 | shell: 13 | """ 14 | cp {input.frags} {output.frags} 15 | cp {input.tbis} {output.tbis} 16 | """ 17 | 18 | 19 | rule coem_fakepair: 20 | threads: 32 21 | singularity: 'workflow/envs/figr.sif' 22 | input: 23 | gex=lambda w: map_rules(rule_prefix='download', w_name='{dname}pair'.format(dname=w.dname), out='gex'), 24 | peaks=lambda w: map_rules('callpeaks', w_name='{dname}pair'.format(dname=w.dname), out='peaks'), 25 | frags=rules.index_frags_fakepair.output.frags, 26 | tbis=rules.index_frags_fakepair.output.tbis, 27 | output: 28 | cca=temp(local('dts/fake{dname}pair/cca.rds')) 29 | resources: mem_mb=128000 30 | 
shell: 31 | """ 32 | Rscript workflow/scripts/dts/fakepair/coembedd.R \ 33 | {input.gex} \ 34 | {input.peaks} \ 35 | {input.frags} \ 36 | {output.cca} 37 | """ 38 | 39 | 40 | rule pair_fakepair: 41 | threads: 1 42 | singularity: 'workflow/envs/figr.sif' 43 | input: 44 | cca=rules.coem_fakepair.output.cca, 45 | annot=lambda w: map_rules(rule_prefix='download', w_name='{dname}pair'.format(dname=w.dname), out='annot'), 46 | output: barmap=temp(local('dts/fake{dname}pair/barmap.csv')) 47 | shell: 48 | """ 49 | Rscript workflow/scripts/dts/fakepair/paircells.R \ 50 | {input.cca} \ 51 | {input.annot} \ 52 | {output.barmap} 53 | """ 54 | 55 | localrules: annotate_fakepitupair 56 | rule annotate_fakepitupair: 57 | threads: 1 58 | singularity: 'workflow/envs/gretabench.sif' 59 | input: 60 | mdata=rules.annotate_pitupair.output.out, 61 | barmap='dts/fakepitupair/barmap.csv', 62 | output: 63 | out='dts/fakepitupair/annotated.h5mu' 64 | shell: 65 | """ 66 | python workflow/scripts/dts/fakepair/fakepair.py \ 67 | -m {input.mdata} \ 68 | -b {input.barmap} \ 69 | -o {output.out} 70 | """ 71 | -------------------------------------------------------------------------------- /workflow/rules/dts/general.smk: -------------------------------------------------------------------------------- 1 | rule extract_case: 2 | threads: 32 3 | singularity: 'workflow/envs/gretabench.sif' 4 | input: lambda w: map_rules('annotate', w.dat) 5 | output: 6 | mdata='dts/{dat}/cases/{case}/mdata.h5mu', 7 | params: 8 | celltypes=lambda w: config['dts'][w.dat]['cases'][w.case]['celltypes'], 9 | n_sample=lambda w: config['dts'][w.dat]['cases'][w.case]['n_sample'] if 'n_sample' in config['dts'][w.dat]['cases'][w.case] else '0', 10 | seed=lambda w: config['dts'][w.dat]['cases'][w.case]['seed'] if 'n_sample' in config['dts'][w.dat]['cases'][w.case] else '0', 11 | n_hvg=lambda w: config['dts'][w.dat]['cases'][w.case]['n_hvg'], 12 | n_hvr=lambda w: config['dts'][w.dat]['cases'][w.case]['n_hvr'], 13 | root=lambda w: config['dts'][w.dat]['cases'][w.case]['root'] if 'root' in config['dts'][w.dat]['cases'][w.case] else 'None', 14 | shell: 15 | """ 16 | python workflow/scripts/dts/extract_case.py \ 17 | -i '{input}' \ 18 | -c '{params.celltypes}' \ 19 | -s '{params.n_sample}' \ 20 | -d '{params.seed}' \ 21 | -g '{params.n_hvg}' \ 22 | -r '{params.n_hvr}' \ 23 | -t '{params.root}' \ 24 | -o '{output.mdata}' 25 | """ 26 | -------------------------------------------------------------------------------- /workflow/rules/dts/pbmc10k.smk: -------------------------------------------------------------------------------- 1 | rule download_pbmc10k: 2 | threads: 1 3 | singularity: 'workflow/envs/figr.sif' 4 | output: 5 | frags='dts/pbmc10k/smpl.frags.tsv.gz', 6 | tbis='dts/pbmc10k/smpl.frags.tsv.gz.tbi', 7 | params: 8 | matrix=config['dts']['pbmc10k']['url']['matrix'], 9 | atac_frags=config['dts']['pbmc10k']['url']['atac_frags'], 10 | shell: 11 | """ 12 | wget --no-verbose '{params.atac_frags}' -O '{output.frags}' 13 | bash workflow/scripts/dts/format_frags.sh {output.frags} 14 | """ 15 | 16 | 17 | rule prcannot_pbmc10k: 18 | threads: 1 19 | singularity: 'workflow/envs/gretabench.sif' 20 | output: annot=temp(local('dts/pbmc10k/annot.csv')), 21 | shell: 22 | "python workflow/scripts/dts/pbmc10k/prc_annot.py -a {output.annot}" 23 | 24 | 25 | rule callpeaks_pbmc10k: 26 | threads: 32 27 | singularity: 'workflow/envs/gretabench.sif' 28 | input: 29 | frags=rules.download_pbmc10k.output.frags, 30 | annot=rules.prcannot_pbmc10k.output.annot, 31 | output: 
peaks=temp(local('dts/pbmc10k/peaks.h5ad')) 32 | resources: mem_mb=64000 33 | shell: 34 | """ 35 | python workflow/scripts/dts/callpeaks.py \ 36 | -f {input.frags} \ 37 | -a {input.annot} \ 38 | -t '/tmp/pbcm10k/' \ 39 | -n {threads} \ 40 | -o {output.peaks} 41 | """ 42 | 43 | 44 | rule annotate_pbmc10k: 45 | threads: 1 46 | singularity: 'workflow/envs/gretabench.sif' 47 | input: 48 | annot=rules.prcannot_pbmc10k.output.annot, 49 | peaks=rules.callpeaks_pbmc10k.output.peaks, 50 | gid=rules.gen_gid_ensmbl.output, 51 | output: out='dts/pbmc10k/annotated.h5mu' 52 | resources: mem_mb=32000 53 | shell: 54 | """ 55 | python workflow/scripts/dts/pbmc10k/pbmc10k.py \ 56 | -b {input.annot} \ 57 | -c {input.gid} \ 58 | -e {input.peaks} \ 59 | -f {output.out} 60 | """ 61 | -------------------------------------------------------------------------------- /workflow/rules/dts/pitupair.smk: -------------------------------------------------------------------------------- 1 | rule download_pitupair: 2 | threads: 1 3 | singularity: 'workflow/envs/figr.sif' 4 | output: 5 | gex=temp(local('dts/pitupair/multiome_original.h5')), 6 | frags='dts/pitupair/smpl.frags.tsv.gz', 7 | tbis='dts/pitupair/smpl.frags.tsv.gz.tbi', 8 | annot=temp(local('dts/pitupair/annot.csv')) 9 | params: 10 | gex=config['dts']['pitupair']['url']['gex'], 11 | frags=config['dts']['pitupair']['url']['frags'], 12 | annot=config['dts']['pitupair']['url']['annot'] 13 | shell: 14 | """ 15 | wget --no-verbose '{params.frags}' -O '{output.frags}' && \ 16 | bash workflow/scripts/dts/format_frags.sh {output.frags} && \ 17 | wget --no-verbose '{params.gex}' -O '{output.gex}' && \ 18 | wget --no-verbose '{params.annot}' -O '{output.annot}' && \ 19 | awk 'BEGIN {{FS=OFS=","}} NR==1 {{print $0; next}} {{gsub(/-[0-9]+$/, "", $1); print $3"_"$1,$2,$3}}' {output.annot} > {output.annot}.tmp && \ 20 | mv {output.annot}.tmp {output.annot} 21 | """ 22 | 23 | 24 | rule callpeaks_pitupair: 25 | threads: 32 26 | singularity: 'workflow/envs/gretabench.sif' 27 | input: 28 | frags=rules.download_pitupair.output.frags, 29 | annot=rules.download_pitupair.output.annot, 30 | output: peaks=temp(local('dts/pitupair/peaks.h5ad')) 31 | resources: mem_mb=64000 32 | shell: 33 | """ 34 | python workflow/scripts/dts/callpeaks.py \ 35 | -f {input.frags} \ 36 | -a {input.annot} \ 37 | -t '/tmp/pitupair/' \ 38 | -n {threads} \ 39 | -o {output.peaks} 40 | """ 41 | 42 | 43 | rule annotate_pitupair: 44 | threads: 1 45 | singularity: 'workflow/envs/gretabench.sif' 46 | input: 47 | annot=rules.download_pitupair.output.annot, 48 | peaks=rules.callpeaks_pitupair.output.peaks, 49 | gex=rules.download_pitupair.output.gex, 50 | gid=rules.gen_gid_ensmbl.output, 51 | output: out='dts/pitupair/annotated.h5mu' 52 | resources: mem_mb=32000 53 | shell: 54 | """ 55 | python workflow/scripts/dts/pitupair/pitupair.py \ 56 | -b {input.annot} \ 57 | -c {input.gid} \ 58 | -e {input.peaks} \ 59 | -f {output} \ 60 | -g {input.gex} 61 | """ 62 | -------------------------------------------------------------------------------- /workflow/rules/img/img.smk: -------------------------------------------------------------------------------- 1 | localrules: dwn_image 2 | 3 | rule dwn_image: 4 | threads: 1 5 | singularity: None 6 | output: 'workflow/envs/{name_img}.sif' 7 | resources: 8 | mem_mb=8000, 9 | runtime=config['max_mins_per_step'], 10 | shell: 11 | """ 12 | wget "https://zenodo.org/records/15058660/files/{wildcards.name_img}.sif?download=1" -O {output} 13 | """ 14 | 
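Rule `dwn_image` above fetches the prebuilt `.sif` containers from Zenodo that the `singularity:` directives throughout the workflow point at. As a hedged aside (not part of the repository; it assumes Singularity/Apptainer is available locally), a downloaded image can also be entered by hand, which helps when debugging an individual script outside of Snakemake:
```
# Hypothetical debugging sketch, assuming singularity (or apptainer) is installed.
# Fetch one image the same way rule dwn_image does:
wget "https://zenodo.org/records/15058660/files/gretabench.sif?download=1" -O workflow/envs/gretabench.sif
# Open an interactive shell inside the container to try scripts manually:
singularity shell workflow/envs/gretabench.sif
```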
-------------------------------------------------------------------------------- /workflow/rules/mth/grn.smk: -------------------------------------------------------------------------------- 1 | localrules: grn_run 2 | 3 | 4 | rule grn_run: 5 | threads: 1 6 | singularity: 'workflow/envs/gretabench.sif' 7 | input: lambda wildcards: map_rules('mdl', wildcards.mdl), 8 | output: 9 | out='dts/{dat}/cases/{case}/runs/{pre}.{p2g}.{tfb}.{mdl}.grn.csv' 10 | shell: 11 | """ 12 | python workflow/scripts/mth/grn.py \ 13 | -i {input} \ 14 | -o {output.out} 15 | """ 16 | 17 | 18 | rule mdl_collectri: 19 | threads: 1 20 | singularity: 'workflow/envs/gretabench.sif' 21 | input: 22 | mdata=rules.extract_case.output.mdata, 23 | grn=rules.gst_collectri.output, 24 | proms=rules.cre_promoters.output, 25 | output: 26 | out='dts/{dat}/cases/{case}/runs/collectri.collectri.collectri.collectri.mdl.csv' 27 | resources: 28 | mem_mb=restart_mem, 29 | runtime=config['max_mins_per_step'], 30 | shell: 31 | """ 32 | python workflow/scripts/mth/prc_prior_grn.py \ 33 | -g {input.grn} \ 34 | -d {input.mdata} \ 35 | -p {input.proms} \ 36 | -o {output.out} 37 | """ 38 | 39 | 40 | rule mdl_dorothea: 41 | threads: 1 42 | singularity: 'workflow/envs/gretabench.sif' 43 | input: 44 | mdata=rules.extract_case.output.mdata, 45 | grn=rules.gst_dorothea.output, 46 | proms=rules.cre_promoters.output, 47 | output: 48 | out='dts/{dat}/cases/{case}/runs/dorothea.dorothea.dorothea.dorothea.mdl.csv' 49 | resources: 50 | mem_mb=restart_mem, 51 | runtime=config['max_mins_per_step'], 52 | shell: 53 | """ 54 | python workflow/scripts/mth/prc_prior_grn.py \ 55 | -g {input.grn} \ 56 | -d {input.mdata} \ 57 | -p {input.proms} \ 58 | -o {output.out} 59 | """ 60 | -------------------------------------------------------------------------------- /workflow/rules/mth/random.smk: -------------------------------------------------------------------------------- 1 | rule mdl_random: 2 | threads: 1 3 | singularity: 'workflow/envs/gretabench.sif' 4 | input: 5 | mdata=rules.extract_case.output.mdata, 6 | tf=rules.gen_tfs_lambert.output, 7 | cg=rules.cre_promoters.output, 8 | output: out='dts/{dat}/cases/{case}/runs/random.random.random.random.mdl.csv' 9 | params: 10 | g_perc=0.25, 11 | scale=1, 12 | tf_g_ratio=0.10, 13 | w_size=250000, 14 | seed=lambda w: config['dts'][w.dat]['cases'][w.case].get('seed', 42), 15 | resources: 16 | mem_mb=restart_mem, 17 | runtime=config['max_mins_per_step'], 18 | shell: 19 | """ 20 | python workflow/scripts/mth/random/grn.py \ 21 | -i {input.mdata} \ 22 | -t {input.tf} \ 23 | -c {input.cg} \ 24 | -g {params.g_perc} \ 25 | -n {params.scale} \ 26 | -r {params.tf_g_ratio} \ 27 | -w {params.w_size} \ 28 | -s {params.seed} \ 29 | -o {output.out} 30 | """ 31 | -------------------------------------------------------------------------------- /workflow/rules/mth/scenic.smk: -------------------------------------------------------------------------------- 1 | rule mdl_scenic: 2 | threads: 16 3 | singularity: 'workflow/envs/scenicplus.sif' 4 | input: 5 | img='workflow/envs/scenicplus.sif', 6 | mdata=rules.extract_case.output.mdata, 7 | tf=rules.gen_tfs_scenic.output, 8 | proms=rules.cre_promoters.output, 9 | ranking_small=rules.gen_motif_scenic_rnk.output.sml, 10 | ranking_big=rules.gen_motif_scenic_rnk.output.big, 11 | motifs=rules.gen_motif_scenic.output 12 | output: 13 | adj=temp(local('dts/{dat}/cases/{case}/runs/adj_tmp.tsv')), 14 | t=temp(local('dts/{dat}/cases/{case}/runs/scenic_tmp.loom')), 15 | 
reg=temp(local('dts/{dat}/cases/{case}/runs/scenic_reg.csv')), 16 | out='dts/{dat}/cases/{case}/runs/scenic.scenic.scenic.scenic.mdl.csv' 17 | resources: 18 | mem_mb=restart_mem, 19 | runtime=config['max_mins_per_step'] * 2, 20 | shell: 21 | """ 22 | # Step 1: Create Loom file 23 | python workflow/scripts/mth/scenic/loom.py \ 24 | -i {input.mdata} \ 25 | -o {output.t} 26 | echo "Created loom" 27 | 28 | # Step 2: Run pyscenic GRN 29 | arboreto_with_multiprocessing.py {output.t} {input.tf} -o {output.adj} --num_workers {threads} --seed 42 30 | echo "Generated adj" 31 | 32 | # Step 3: Run CTX 33 | pyscenic ctx {output.adj} \ 34 | {input.ranking_small} \ 35 | {input.ranking_big} \ 36 | --annotations_fname {input.motifs} \ 37 | --expression_mtx_fname {output.t} \ 38 | --output {output.reg} \ 39 | --mask_dropouts \ 40 | --num_workers {threads} 41 | echo "Filtered TFs by motifs" 42 | 43 | # Step 4: Process GRN 44 | python workflow/scripts/mth/scenic/process_grn.py \ 45 | -o {output.out} \ 46 | -p {input.proms} \ 47 | -g {output.adj} \ 48 | -r {output.reg} 49 | echo "Done" 50 | """ 51 | -------------------------------------------------------------------------------- /workflow/rules/plt/comb.smk: -------------------------------------------------------------------------------- 1 | #localrules: fig_comb 2 | 3 | 4 | rule fig_comb: 5 | threads: 1 6 | singularity: 'workflow/envs/gretabench.sif' 7 | input: 8 | mdta='dts/pbmc10k/cases/all/mdata.h5mu', 9 | qc='anl/dts/pbmc10k.all.qc.csv', 10 | nc='anl/dts/pbmc10k.all.nc.csv', 11 | sims='anl/topo/pbmc10k.all.sims_mult.csv', 12 | stat='anl/topo/pbmc10k.all.stats_mult.csv', 13 | fvsd='anl/topo/pbmc10k.all.fvsd.csv', 14 | stab='anl/stab/pbmc10k.all.ovsd.csv', 15 | output: 'plt/comb/fig.pdf' 16 | shell: 17 | """ 18 | python workflow/scripts/plt/comb/sims.py {input.mdta} \ 19 | {input.nc} {input.qc} {input.sims} {input.stat} {input.fvsd} {input.stab} {output} 20 | """ 21 | -------------------------------------------------------------------------------- /workflow/rules/plt/dbs.smk: -------------------------------------------------------------------------------- 1 | localrules: fig_dbs 2 | 3 | 4 | rule fig_dbs: 5 | threads: 1 6 | singularity: 'workflow/envs/gretabench.sif' 7 | input: 8 | sts='anl/dbs/stats.csv', 9 | ovc='anl/dbs/ocoef.csv', 10 | output: 'plt/dbs/fig.pdf' 11 | shell: 12 | """ 13 | python workflow/scripts/plt/dbs/stats.py {input.sts} {input.ovc} {output} 14 | """ 15 | -------------------------------------------------------------------------------- /workflow/rules/plt/eval.smk: -------------------------------------------------------------------------------- 1 | localrules: fig_eval 2 | 3 | 4 | rule fig_eval: 5 | threads: 1 6 | singularity: 'workflow/envs/gretabench.sif' 7 | input: 8 | smr='anl/metrics/summary/pbmc10k.all.csv', 9 | dct='anl/stab/unsmthds/pbmc10k.scores.csv', 10 | output: 'plt/eval/fig.pdf' 11 | shell: 12 | """ 13 | python workflow/scripts/plt/eval/eval.py {input} {output} 14 | """ 15 | -------------------------------------------------------------------------------- /workflow/rules/plt/figs.smk: -------------------------------------------------------------------------------- 1 | localrules: plt_figs 2 | 3 | 4 | rule plt_figs: 5 | threads: 1 6 | input: ['plt/stab/fig.pdf', 'plt/pair/fig.pdf', 'plt/comb/fig.pdf', 'plt/dbs/fig.pdf', 'plt/eval/fig.pdf'] 7 | output: 'plt/figs.txt' 8 | shell: 9 | """ 10 | touch {output} 11 | echo 'Done' 12 | """ -------------------------------------------------------------------------------- 
/workflow/rules/plt/pair.smk: -------------------------------------------------------------------------------- 1 | localrules: plt_npair, plt_fake, fig_pair 2 | 3 | 4 | rule plt_npair: 5 | threads: 1 6 | singularity: 'workflow/envs/gretabench.sif' 7 | input: 8 | pmd='dts/pitupair/cases/all/mdata.h5mu', 9 | nmd='dts/pitunpair/cases/all/mdata.h5mu', 10 | ral='anl/pair/pitu.all.real_corvals.csv', 11 | qc='anl/pair/pitu.all.qc.csv', 12 | nc='anl/pair/pitu.all.ncells.csv', 13 | oc='anl/pair/pitu.all.pvsn.csv', 14 | output: 'plt/pair/npair.pdf' 15 | shell: 16 | """ 17 | python workflow/scripts/plt/pair/pair.py {input.pmd} {input.nmd} {input.ral} {input.qc} {input.nc} {input.oc} {output} 18 | """ 19 | 20 | 21 | rule plt_fake: 22 | threads: 1 23 | singularity: 'workflow/envs/gretabench.sif' 24 | input: 25 | knn='anl/pair/pitu.all.fake_knn.csv', 26 | ctp='anl/pair/pitu.all.fake_prp.csv', 27 | cor='anl/pair/pitu.all.fake_cor.csv', 28 | ocf='anl/pair/pitu.all.pvsf.csv', 29 | output: 'plt/pair/fake.pdf' 30 | shell: 31 | """ 32 | python workflow/scripts/plt/pair/fake.py {input.knn} {input.ctp} {input.cor} {input.ocf} {output} 33 | """ 34 | 35 | 36 | rule fig_pair: 37 | threads: 1 38 | input: ['plt/pair/npair.pdf', 'plt/pair/fake.pdf'] 39 | output: 'plt/pair/fig.pdf' 40 | shell: 41 | """ 42 | gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile={output} {input} 43 | """ 44 | -------------------------------------------------------------------------------- /workflow/rules/plt/stab.smk: -------------------------------------------------------------------------------- 1 | localrules: plt_dwns, plt_sims, plt_AREG, fig_stability 2 | 3 | 4 | rule plt_dwns: 5 | threads: 1 6 | singularity: 'workflow/envs/gretabench.sif' 7 | input: 8 | ovc='anl/stab/pitupair.ovc.csv', 9 | auc='anl/stab/pitupair.auc.csv', 10 | wgt='anl/stab/pitupair.wgt.csv', 11 | cor='anl/stab/pitupair.cor.csv', 12 | output: 13 | stab='plt/stab/dwns.pdf', 14 | cors='plt/stab/cors.pdf', 15 | shell: 16 | """ 17 | python workflow/scripts/plt/stab/stab.py {input.ovc} {input.auc} {output.stab} 18 | python workflow/scripts/plt/stab/cors.py {input.wgt} {input.cor} {output.cors} 19 | """ 20 | 21 | 22 | rule plt_sims: 23 | threads: 1 24 | singularity: 'workflow/envs/gretabench.sif' 25 | input: 26 | sims='anl/topo/pitupair.all.sims_mult.csv', 27 | stats='anl/topo/pitupair.all.stats_mult.csv', 28 | tss=rules.tss_aggr.output, 29 | dst='anl/tss/pitupair.all.dist.csv', 30 | net='anl/topo/pitupair.all.inter.csv', 31 | output: 'plt/stab/sims.pdf' 32 | shell: 33 | """ 34 | python workflow/scripts/plt/stab/sims.py \ 35 | {input.sims} {input.stats} {input.tss} {input.dst} {input.net} {output} 36 | """ 37 | 38 | 39 | rule plt_AREG: 40 | threads: 1 41 | singularity: 'workflow/envs/gretabench.sif' 42 | input: 43 | sims='anl/topo/pitupair.all.sims_mult.csv', 44 | gann='dbs/hg38/gen/ann/dictys/ann.bed', 45 | output: 'plt/stab/links_AREG.pdf' 46 | params: 47 | gene='AREG', 48 | tfs=['FOSL1', 'FOSL2', 'JUNB'], 49 | wsize=250000 50 | shell: 51 | """ 52 | python workflow/scripts/plt/stab/links.py \ 53 | -s {input.sims} \ 54 | -g {params.gene} \ 55 | -t {params.tfs} \ 56 | -a {input.gann} \ 57 | -w {params.wsize} \ 58 | -o {output} 59 | """ 60 | 61 | 62 | rule fig_stability: 63 | threads: 1 64 | input: 65 | stab='plt/stab/dwns.pdf', 66 | cors='plt/stab/cors.pdf', 67 | sims='plt/stab/sims.pdf', 68 | areg='plt/stab/links_AREG.pdf' 69 | output: 'plt/stab/fig.pdf' 70 | shell: 71 | """ 72 | gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile={output} {input.stab} {input.cors} 
{input.sims} {input.areg} 73 | """ 74 | -------------------------------------------------------------------------------- /workflow/scripts/anl/dbs/terms.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from tqdm import tqdm 4 | import os 5 | import argparse 6 | 7 | 8 | # Init args 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('-i','--db_paths', required=True, nargs='+') 11 | parser.add_argument('-o','--path_out', required=True) 12 | args = vars(parser.parse_args()) 13 | 14 | db_paths = args['db_paths'] 15 | path_out = args['path_out'] 16 | 17 | non_term_dbs = ['blacklist', 'encode', 'promoters', 'zhang21', 'phastcons'] 18 | df = [] 19 | for db_path in db_paths: 20 | db_name = os.path.basename(os.path.dirname(db_path)) 21 | task = os.path.basename(os.path.dirname(os.path.dirname(db_path))) 22 | if db_name not in non_term_dbs: 23 | if task == 'tfb': 24 | db = pd.read_csv(db_path, header=None, sep='\t', usecols=[4])[4] 25 | terms = set() 26 | for r in tqdm(db): 27 | terms.update(r.split(',')) 28 | terms = sorted(terms) 29 | elif task == 'tfm': 30 | db = pd.read_csv(db_path, sep='\t', header=None, usecols=[1])[1] 31 | terms = set() 32 | for r in db: 33 | terms.update(r.split(',')) 34 | terms = sorted(terms) 35 | elif task == 'prt': 36 | db = pd.read_csv(db_path) 37 | terms = np.sort(db['Tissue.Type'].unique()) 38 | elif 'catalogue' in db_name: 39 | db = pd.read_csv(db_path, header=None, sep='\t', usecols=[4])[4] 40 | terms = set() 41 | for r in tqdm(db): 42 | r = r.split(',') 43 | if isinstance(r, str): 44 | r = [r] 45 | for s_r in r: 46 | terms.update(s_r.split('|')) 47 | terms = sorted(terms) 48 | else: 49 | raise ValueError('db {db} of task {task} has no defined terms'.format(db=db_name, task=task)) 50 | for term in terms: 51 | df.append([db_name, term]) 52 | df = pd.DataFrame(df, columns=['db_name', 'term']) 53 | 54 | # Write 55 | df.to_csv(path_out, index=False) 56 | -------------------------------------------------------------------------------- /workflow/scripts/anl/dts/qcstats.py: -------------------------------------------------------------------------------- 1 | import mudata as mu 2 | import pandas as pd 3 | import numpy as np 4 | import scanpy as sc 5 | import sys 6 | 7 | 8 | mdata = mu.read(sys.argv[1]) 9 | 10 | 11 | def get_qc_omic(mdata, omic): 12 | adata = mdata.mod[omic] 13 | adata.X = adata.layers['counts'] 14 | obs, _ = sc.pp.calculate_qc_metrics( 15 | adata, percent_top=None, log1p=True 16 | ) 17 | qc = obs.assign(omic=omic) 18 | qc = pd.merge(qc.reset_index(names='barcode'), mdata.obs.reset_index(names='barcode')[['barcode', 'celltype']], on=['barcode'], how='inner') 19 | return qc 20 | 21 | 22 | def extract_n_cells(mdata): 23 | return mdata.obs.groupby('celltype', as_index=False).size().sort_values('celltype') 24 | 25 | 26 | # Compute qc 27 | omics = ['rna', 'atac'] 28 | n_ctps = extract_n_cells(mdata) 29 | qc = [] 30 | for omic in omics: 31 | qc.append(get_qc_omic(mdata, omic)) 32 | qc = pd.concat(qc) 33 | 34 | # Write 35 | qc.to_csv(sys.argv[2], index=False) 36 | n_ctps.to_csv(sys.argv[3], index=False) 37 | -------------------------------------------------------------------------------- /workflow/scripts/anl/metrics/aggregate.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os 4 | import argparse 5 | 6 | 7 | # Init args 8 | parser = argparse.ArgumentParser() 9 | 
parser.add_argument('-i','--path_input', required=True, nargs='+') 10 | parser.add_argument('-a','--add_info', required=False, action="store_true") 11 | parser.add_argument('-o','--path_out', required=True) 12 | args = vars(parser.parse_args()) 13 | 14 | df_paths = args['path_input'] 15 | add_info = args['add_info'] 16 | path_out = args['path_out'] 17 | 18 | df = [] 19 | for df_path in df_paths: 20 | tmp = pd.read_csv(df_path) 21 | if add_info: 22 | dts = os.path.basename(os.path.dirname(df_path)) 23 | db = os.path.basename(os.path.dirname(os.path.dirname(df_path))) 24 | task = os.path.basename(os.path.dirname(os.path.dirname(os.path.dirname(df_path)))) 25 | metric = os.path.basename(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(df_path))))) 26 | tmp[['metric', 'task', 'db', 'dts']] = [metric, task, db, dts] 27 | tmp = tmp[['metric', 'task', 'db', 'dts', 'name', 'prc', 'rcl', 'f01']] 28 | df.append(tmp) 29 | df = pd.concat(df) 30 | 31 | # Write 32 | df.to_csv(path_out, index=False) -------------------------------------------------------------------------------- /workflow/scripts/anl/metrics/mech/tfa.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import decoupler as dc 4 | import mudata as mu 5 | import sys 6 | import os 7 | import re 8 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 9 | from utils import load_cats, f_beta_score 10 | import argparse 11 | 12 | 13 | # Init args 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('-i','--grn_path', required=True) 16 | parser.add_argument('-b','--bnc_path', required=True) 17 | parser.add_argument('-o','--out_path', required=True) 18 | args = vars(parser.parse_args()) 19 | 20 | grn_path = args['grn_path'] 21 | bnc_path = args['bnc_path'] 22 | out_path = args['out_path'] 23 | 24 | # Extract names and path 25 | grn_name = os.path.basename(grn_path).replace('.grn.csv', '') 26 | data_path = os.path.join(os.path.dirname(os.path.dirname(grn_path)), 'mdata.h5mu') 27 | dataset = os.path.basename(os.path.dirname(os.path.dirname(os.path.dirname(data_path)))) 28 | case = os.path.basename(os.path.dirname(data_path)) 29 | rsc_name = os.path.basename(bnc_path) 30 | 31 | # Read GRN 32 | grn = pd.read_csv(grn_path) 33 | grn = grn.drop_duplicates(['source', 'target'], keep='first') 34 | 35 | if grn.shape[0] > 0: 36 | # Read dataset 37 | rna = mu.read(os.path.join(data_path, 'mod', 'rna')) 38 | 39 | # Read benchmark data 40 | mat = pd.read_csv(os.path.join(bnc_path, 'diff.csv'), index_col=0) 41 | obs = pd.read_csv(os.path.join(bnc_path, 'meta.csv'), index_col=0) 42 | 43 | # Subset bench data to dataset 44 | cats = load_cats(dataset, case) 45 | cats = [re.escape(c) for c in cats[rsc_name]] 46 | msk = obs['Tissue.Type'].isin(cats) & obs['TF'].isin(rna.var_names) & (obs['logFC'] < -0.5) 47 | obs = obs.loc[msk, :] 48 | mat = mat.loc[msk, :] 49 | 50 | # Compute TF activities 51 | acts = [] 52 | pvals = [] 53 | for dataset in obs.index: 54 | tf = obs.loc[dataset, 'TF'] 55 | tf_mat = mat.loc[[dataset], :] 56 | tf_grn = grn[grn['source'] == tf] 57 | try: 58 | act, pval = dc.run_ulm( 59 | mat=tf_mat, 60 | net=tf_grn, 61 | weight='score', 62 | min_n=3, 63 | ) 64 | act, pval = act.values[0, 0], pval.values[0, 0] 65 | acts.append(act) 66 | pvals.append(pval) 67 | except: 68 | pass 69 | 70 | # Compute recall 71 | acts = np.array(acts) 72 | pvals = np.array(pvals) 73 | padj = dc.p_adjust_fdr(pvals) 74 | tp = np.sum((acts < 
0) & (padj < 0.05)) 75 | if tp > 0: 76 | prc = tp / acts.size 77 | rcl = tp / obs.shape[0] 78 | f01 = f_beta_score(prc, rcl) 79 | else: 80 | prc, rcl, f01 = 0., 0., 0. 81 | 82 | df = pd.DataFrame([[grn_name, prc, rcl, f01]], columns=['name', 'prc', 'rcl', 'f01']) 83 | else: 84 | df = pd.DataFrame([[grn_name, np.nan, np.nan, np.nan]], columns=['name', 'prc', 'rcl', 'f01']) 85 | 86 | # Write 87 | df.to_csv(out_path, index=False) 88 | -------------------------------------------------------------------------------- /workflow/scripts/anl/metrics/mech/tfm.py: -------------------------------------------------------------------------------- 1 | import scanpy as sc 2 | import pandas as pd 3 | import numpy as np 4 | import mudata as mu 5 | import os 6 | import h5py 7 | import sys 8 | 9 | 10 | path_mdata = sys.argv[1] 11 | path_tfs = sys.argv[2] 12 | path_out = sys.argv[3] 13 | 14 | # Read 15 | tfs = pd.read_csv(path_tfs, header=None)[0].values 16 | rna = mu.read(os.path.join(path_mdata, 'mod', 'rna')) 17 | 18 | # Filter and update 19 | inter = rna.var_names.intersection(tfs) 20 | rna = rna[:, inter].copy() 21 | rna.obs = mu.read(path_mdata).obs.loc[:, ['celltype']].copy() 22 | 23 | # Extract DEG tfs 24 | sc.tl.rank_genes_groups(rna, groupby='celltype', method='wilcoxon') 25 | df = sc.get.rank_genes_groups_df(rna, group=None) 26 | 27 | # Filter results 28 | df = df[(df['pvals_adj'] < 2.22e-16) & (df['logfoldchanges'] > 2.)] 29 | n_group = df.groupby('group', as_index=False).size() 30 | n_group = n_group[n_group['size'] >= 1] 31 | groups = n_group['group'].values 32 | df['group'] = df['group'].astype(str) 33 | df = df[df['group'].isin(groups)] 34 | df = df[['group', 'names']] 35 | df.columns = ['celltype', 'tf'] 36 | 37 | # Write 38 | df.to_csv(path_out, index=False) 39 | -------------------------------------------------------------------------------- /workflow/scripts/anl/metrics/pred/gsets.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import mudata as mu 4 | import decoupler as dc 5 | import argparse 6 | import sys 7 | import os 8 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 9 | from utils import f_beta_score 10 | 11 | 12 | # Init args 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('-i','--grn_path', required=True) 15 | parser.add_argument('-p','--ptw_path', required=True) 16 | parser.add_argument('-o','--out_path', required=True) 17 | args = vars(parser.parse_args()) 18 | 19 | grn_path = args['grn_path'] 20 | ptw_path = args['ptw_path'] 21 | out_path = args['out_path'] 22 | 23 | grn_name = os.path.basename(grn_path).replace('.grn.csv', '') 24 | data_path = os.path.join(os.path.dirname(os.path.dirname(grn_path)), 'mdata.h5mu') 25 | 26 | grn = pd.read_csv(grn_path) 27 | 28 | def get_sig_pws(grn, db, thr_pval): 29 | sig_pws = set() 30 | for tf in grn['source'].unique(): 31 | df = grn[grn['source'] == tf].set_index('target') 32 | pws = dc.get_ora_df( 33 | df=df, 34 | net=db, 35 | ) 36 | sig_pws.update(pws[pws['FDR p-value'] < thr_pval]['Term']) 37 | sig_pws = np.array(list(sig_pws)) 38 | return sig_pws 39 | 40 | 41 | def eval_metrics(y_pred, y): 42 | tp = np.intersect1d(y_pred, y).size 43 | if tp > 0.: 44 | fp = np.setdiff1d(y_pred, y).size 45 | fn = np.setdiff1d(y, y_pred).size 46 | prc = tp / (tp + fp) 47 | rcl = tp / (tp + fn) 48 | f1 = f_beta_score(prc, rcl) 49 | else: 50 | prc, rcl, f1 = 0., 0., 0. 
51 | return prc, rcl, f1 52 | 53 | 54 | def eval_grn(data, grn, db, thr_pval=0.01, thr_prop=0.2): 55 | hits = get_pw_hits(data, thr_pval, thr_prop) 56 | sig_pws = get_sig_pws(grn, db, thr_pval) 57 | prc, rcl, f1 = eval_metrics(y_pred=sig_pws, y=hits) 58 | return prc, rcl, f1 59 | 60 | 61 | def get_pw_hits(data, thr_pval, thr_prop): 62 | pvals = data.obsm['ulm_pvals'].copy() 63 | pvals.loc[:, :] = dc.p_adjust_fdr(pvals.values.ravel()).reshape(pvals.shape) 64 | acts = data.obsm['ulm_estimate'].copy() 65 | hits = ((pvals < thr_pval) & (acts > 0)).sum(0).sort_values(ascending=False) / pvals.shape[0] 66 | hits = hits[hits > thr_prop].index.values.astype('U') 67 | return hits 68 | 69 | 70 | if grn.shape[0] > 0: 71 | ptw = pd.read_csv(ptw_path) 72 | rna = mu.read(os.path.join(data_path, 'mod', 'rna')) 73 | # Infer pathway activities 74 | dc.run_ulm( 75 | mat=rna, 76 | net=ptw, 77 | weight=None, 78 | use_raw=False, 79 | verbose=True 80 | ) 81 | prc, rcl, f01 = eval_grn(rna, grn, ptw, thr_pval=0.01, thr_prop=0.2) 82 | df = pd.DataFrame([[grn_name, prc, rcl, f01]], columns=['name', 'prc', 'rcl', 'f01']) 83 | else: 84 | df = pd.DataFrame([[grn_name, np.nan, np.nan, np.nan]], columns=['name', 'prc', 'rcl', 'f01']) 85 | 86 | # Write 87 | df.to_csv(out_path, index=False) 88 | -------------------------------------------------------------------------------- /workflow/scripts/anl/metrics/prior/tfm.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import mudata as mu 4 | from tqdm import tqdm 5 | import sys 6 | import os 7 | import re 8 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 9 | from utils import load_cats, f_beta_score 10 | import argparse 11 | 12 | 13 | # Init args 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('-a','--grn_path', required=True) 16 | parser.add_argument('-b','--resource_path', required=True) 17 | parser.add_argument('-f','--out_path', required=True) 18 | args = vars(parser.parse_args()) 19 | 20 | grn_path = args['grn_path'] 21 | resource_path = args['resource_path'] 22 | out_path = args['out_path'] 23 | 24 | 25 | grn_name = os.path.basename(grn_path).replace('.grn.csv', '') 26 | data_path = os.path.join(os.path.dirname(os.path.dirname(grn_path)), 'mdata.h5mu') 27 | dataset = os.path.basename(os.path.dirname(os.path.dirname(os.path.dirname(data_path)))) 28 | case = os.path.basename(os.path.dirname(data_path)) 29 | resource_name = os.path.basename(resource_path).replace('.csv', '') 30 | 31 | # Read grn 32 | grn = pd.read_csv(grn_path) 33 | 34 | if grn.shape[0] > 0: 35 | # Read resource and filter by cats 36 | db = pd.read_csv(resource_path, header=None, sep='\t') 37 | db.columns = ['gene', 'ctype'] 38 | cats = load_cats(dataset, case) 39 | if resource_name in cats: 40 | cats = [re.escape(c) for c in cats[resource_name]] 41 | print('Filtering for {0} cats'.format(len(cats))) 42 | db = db[db['ctype'].str.contains('|'.join(cats))] 43 | 44 | # Filter resource by measured genes 45 | genes = mu.read(os.path.join(data_path, 'mod', 'rna')).var_names.astype('U') 46 | db = db[db['gene'].astype('U').isin(genes)] 47 | 48 | # Compute evaluation 49 | y_pred = grn['source'].unique().astype('U') 50 | y = db['gene'].unique().astype('U') 51 | tp = np.intersect1d(y_pred, y).size 52 | if tp > 0.: 53 | fp = np.setdiff1d(y_pred, y).size 54 | fn = np.setdiff1d(y, y_pred).size 55 | prc = tp / (tp + fp) 56 | rcl = tp / (tp + fn) 57 | f01 = f_beta_score(prc, rcl) 58 | 
else: 59 | prc, rcl, f01 = 0., 0., 0., 60 | df = pd.DataFrame([[grn_name, prc, rcl, f01]], columns=['name', 'prc', 'rcl', 'f01']) 61 | else: 62 | df = pd.DataFrame([[grn_name, np.nan, np.nan, np.nan]], columns=['name', 'prc', 'rcl', 'f01']) 63 | 64 | # Write 65 | df.to_csv(out_path, index=False) 66 | -------------------------------------------------------------------------------- /workflow/scripts/anl/metrics/prior/tfp.py: -------------------------------------------------------------------------------- 1 | from itertools import combinations 2 | import scipy.stats as ss 3 | import numpy as np 4 | import pandas as pd 5 | import sys 6 | import os 7 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 8 | from utils import f_beta_score 9 | 10 | 11 | def compute_pval(tf_a, tf_b, grn): 12 | trg_a = set(grn[grn['source'] == tf_a]['target']) 13 | trg_b = set(grn[grn['source'] == tf_b]['target']) 14 | total = set(grn['target']) 15 | a = len(trg_a & trg_b) 16 | if a > 0: 17 | b = len(trg_a - trg_b) 18 | c = len(trg_b - trg_a) 19 | d = len(total - (trg_a | trg_b)) 20 | s, p = ss.fisher_exact([[a, b], [c, d]], alternative='greater') 21 | else: 22 | s, p = 0, np.nan 23 | return s, p 24 | 25 | 26 | def find_pairs(grn, thr_pval): 27 | df = [] 28 | for tf_a, tf_b in combinations(grn['source'].unique(), r=2): 29 | s, p = compute_pval(tf_a, tf_b, grn) 30 | df.append([tf_a, tf_b, s, p]) 31 | df = pd.DataFrame(df, columns=['tf_a', 'tf_b', 'stat', 'pval']).dropna() 32 | if df.shape[0] > 0: 33 | df['padj'] = ss.false_discovery_control(df['pval'], method='bh') 34 | df = df[df['padj'] < thr_pval] 35 | pairs = set(['|'.join(sorted([a, b])) for a, b in zip(df['tf_a'], df['tf_b'])]) 36 | else: 37 | pairs = set() 38 | return pairs 39 | 40 | 41 | # Read 42 | grn = pd.read_csv(sys.argv[1]).drop_duplicates(['source', 'target']) 43 | tfp = pd.read_csv(sys.argv[2], sep='\t', header=None) 44 | 45 | # Process 46 | tfs = set(tfp[0]) | set(tfp[1]) 47 | grn = grn[grn['source'].isin(tfs)] 48 | tfp = set(['|'.join(sorted([a, b])) for a, b in zip(tfp[0], tfp[1])]) 49 | grn_name = os.path.basename(sys.argv[1]).replace('.grn.csv', '') 50 | 51 | if grn.shape[0] > 1: # Need at least 2 TFs in grn 52 | # Find pairs 53 | p_grn = find_pairs(grn, thr_pval=float(sys.argv[3])) 54 | 55 | # Compute F score 56 | tp = len(p_grn & tfp) 57 | if tp > 0: 58 | fp = len(p_grn - tfp) 59 | fn = len(tfp - p_grn) 60 | rcl = tp / (tp + fn) 61 | prc = tp / (tp + fp) 62 | f01 = f_beta_score(prc, rcl) 63 | else: 64 | prc, rcl, f01 = 0., 0., 0. 
65 | df = pd.DataFrame([[grn_name, prc, rcl, f01]], columns=['name', 'prc', 'rcl', 'f01']) 66 | else: 67 | df = pd.DataFrame([[grn_name, np.nan, np.nan, np.nan]], columns=['name', 'prc', 'rcl', 'f01']) 68 | 69 | # Write 70 | df.to_csv(sys.argv[4], index=False) 71 | -------------------------------------------------------------------------------- /workflow/scripts/anl/metrics/test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import argparse 4 | import os 5 | 6 | 7 | def read_eval(m_path): 8 | db_name = os.path.basename(os.path.dirname(m_path)) 9 | task = os.path.basename(os.path.dirname(os.path.dirname(m_path))) 10 | metric = os.path.basename(os.path.dirname(os.path.dirname(os.path.dirname(m_path)))) 11 | case = os.path.basename(m_path).replace('.scores.csv', '') 12 | df = pd.read_csv('anl/metrics/{0}/{1}/{2}/{3}.scores.csv'.format(metric, task, db_name, case)).sort_values('f01', ascending=False) 13 | df[['pre', 'p2g', 'tfb', 'mdl']] = df['name'].str.split('.', n=4, expand=True) 14 | df = df[~df['pre'].str.startswith('o_')] 15 | df = df.reset_index(drop=True).reset_index(names='rank') 16 | df['fixed'] = [np.unique(n.split('.')).size == 1 for n in df['name']] 17 | return metric, task, db_name, case, df 18 | 19 | 20 | def test_rank(df): 21 | import decoupler as dc 22 | steps = ['pre', 'p2g', 'tfb', 'mdl'] 23 | mthds = df['pre'].unique() 24 | net = [] 25 | sts = [] 26 | for step in steps: 27 | sts.append(df.groupby([step], as_index=False)['f01'].mean().rename(columns={step: 'name'}).assign(stp=step)) 28 | for mth in mthds: 29 | for name in df[df[step] == mth]['name']: 30 | net.append(['{0}.{1}'.format(step, mth), name]) 31 | net = pd.DataFrame(net, columns=['source', 'target']) 32 | sts = pd.concat(sts) 33 | res = dc.get_gsea_df( 34 | df=df.dropna().set_index('name'), 35 | stat='f01', 36 | net=net, 37 | times=1000 38 | ) 39 | res['padj'] = np.where(res['ES'] > 0, res['FDR p-value'], 1) 40 | res[['stp', 'name']] = res['Term'].str.split('.', n=2, expand=True) 41 | res = res[['stp', 'name', 'padj']] 42 | res = pd.merge(res, sts, how='left', on=['stp', 'name']) 43 | return res 44 | 45 | 46 | parser = argparse.ArgumentParser() 47 | parser.add_argument('-m', '--path_mtr', nargs='+', required=True) 48 | parser.add_argument('-o', '--path_out', required=True) 49 | args = parser.parse_args() 50 | 51 | # Test each metric-database 52 | df = [] 53 | for m_path in args.path_mtr: 54 | metric, task, db_name, case, m_df = read_eval(m_path) 55 | m_df = test_rank(m_df) 56 | m_df[['metric', 'task', 'db', 'case']] = metric, task, db_name, case 57 | df.append(m_df) 58 | df = pd.concat(df) 59 | df = df[['metric', 'task', 'db', 'stp', 'name', 'case', 'padj', 'f01']] 60 | df = df.sort_values(['metric', 'task', 'db', 'stp', 'name']) 61 | 62 | # Write 63 | df.to_csv(args.path_out, index=False) 64 | -------------------------------------------------------------------------------- /workflow/scripts/anl/metrics/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | def load_cats(dataset, case): 5 | with open('config/prior_cats.json') as f: 6 | cats = json.load(f) 7 | if (dataset == 'pbmc10k'): 8 | for i in range(4): 9 | cats[dataset][str(i)] = cats[dataset]['all'].copy() 10 | cats = cats[dataset][case] 11 | return cats 12 | 13 | def f_beta_score(prc, rcl, beta=0.1): 14 | if prc + rcl == 0: 15 | return 0 16 | return (1 + beta**2) * (prc * rcl) / ((prc * beta**2) + rcl) 17 | 
-------------------------------------------------------------------------------- /workflow/scripts/anl/pair/pairsim.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import yaml 4 | import sys 5 | import os 6 | from tqdm import tqdm 7 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 8 | from utils import ( 9 | ocoeff, 10 | ) 11 | import glob 12 | import argparse 13 | 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('-a','--path_a', required=True) 17 | parser.add_argument('-b','--path_b', required=True) 18 | parser.add_argument('-o','--path_out', required=True) 19 | args = vars(parser.parse_args()) 20 | 21 | path_a = args['path_a'] 22 | path_b = args['path_b'] 23 | path_out = args['path_out'] 24 | 25 | # Find paths 26 | dname_a, case_a = os.path.basename(path_a).split('.')[:2] 27 | dname_b, case_b = os.path.basename(path_b).split('.')[:2] 28 | dname_a = dname_a.replace('pair', '') 29 | dname_b = dname_b.replace('pair', '') 30 | path_pair = sorted(glob.glob(f'dts/{dname_a}pair/cases/{case_b}/runs/*.grn.csv')) 31 | path_npair = sorted(glob.glob(f'dts/{dname_b}pair/cases/{case_b}/runs/*.grn.csv')) 32 | 33 | # Compute ocoef 34 | df = [] 35 | for i in tqdm(range(len(path_pair))): 36 | p_path, n_path = path_pair[i], path_npair[i] 37 | assert os.path.basename(p_path) == os.path.basename(n_path) 38 | p_grn, n_grn = pd.read_csv(p_path), pd.read_csv(n_path) 39 | val = ocoeff(p_grn, n_grn, on=['source', 'target']) 40 | df.append([os.path.basename(p_path).replace('.grn.csv', ''), val]) 41 | df = pd.DataFrame(df, columns=['mth', 'ocoef']) 42 | 43 | # Write 44 | df.to_csv(path_out, index=False) 45 | -------------------------------------------------------------------------------- /workflow/scripts/anl/pair/realqc.py: -------------------------------------------------------------------------------- 1 | import mudata as mu 2 | import pandas as pd 3 | import numpy as np 4 | import scanpy as sc 5 | import sys 6 | 7 | 8 | pmdata = mu.read(sys.argv[1]) 9 | nmdata = mu.read(sys.argv[2]) 10 | 11 | 12 | def get_qc_omic(mdata, omic, tpe): 13 | adata = mdata.mod[omic] 14 | adata.X = adata.layers['counts'] 15 | obs, _ = sc.pp.calculate_qc_metrics( 16 | adata, percent_top=None, log1p=True 17 | ) 18 | qc = obs.assign(omic=omic, type=t) 19 | qc = pd.merge(qc.reset_index(names='barcode'), mdata.obs.reset_index(names='barcode')[['barcode', 'celltype']], on=['barcode'], how='inner') 20 | return qc 21 | 22 | 23 | def extract_n_cells(mdata, tpe): 24 | return mdata.obs.groupby('celltype', as_index=False).size().sort_values('celltype').assign(type=tpe) 25 | 26 | 27 | # Compute qc 28 | types = ['paired', 'upaired'] 29 | omics = ['rna', 'atac'] 30 | qc = [] 31 | n_ctps = [] 32 | for mdata, t in zip([pmdata, nmdata], types): 33 | n_ctps.append(extract_n_cells(mdata, t)) 34 | for omic in omics: 35 | qc.append(get_qc_omic(mdata, omic, t)) 36 | qc = pd.concat(qc) 37 | n_ctps = pd.concat(n_ctps) 38 | 39 | # Write 40 | qc.to_csv(sys.argv[3], index=False) 41 | n_ctps.to_csv(sys.argv[4], index=False) 42 | -------------------------------------------------------------------------------- /workflow/scripts/anl/stab/ovsd.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import scipy.stats as ss 3 | import sys 4 | import os 5 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 6 | from utils import 
read_config, ocoeff 7 | 8 | 9 | # Extract dat and case 10 | dat, case = os.path.basename(sys.argv[1]).split('.')[:2] 11 | 12 | # Read config 13 | config = read_config() 14 | palette = config['colors']['nets'] 15 | mthds = list(config['methods'].keys()) 16 | baselines = config['baselines'] 17 | 18 | # Compute ocoeff and pearson 19 | df = [] 20 | for mth in mthds: 21 | ref = pd.read_csv(f'dts/{dat}/cases/{case}/runs/o_{mth}.o_{mth}.o_{mth}.o_{mth}.grn.csv') 22 | net = pd.read_csv(f'dts/{dat}/cases/{case}/runs/{mth}.{mth}.{mth}.{mth}.grn.csv') 23 | inter = pd.merge(ref, net, on=['source', 'target'], how='inner') 24 | s, p = ss.pearsonr(inter['score_x'], inter['score_y']) 25 | df.append([mth, ocoeff(ref, net, on=['source', 'target']), s, p]) 26 | df = pd.DataFrame(df, columns=['mth', 'ocoeff', 'stat', 'pval']) 27 | df['padj'] = ss.false_discovery_control(df['pval']) 28 | 29 | # Write 30 | df.to_csv(sys.argv[2], index=False) 31 | -------------------------------------------------------------------------------- /workflow/scripts/anl/stab/seeds.py: -------------------------------------------------------------------------------- 1 | import scipy.stats as sts 2 | import pandas as pd 3 | import numpy as np 4 | import sys 5 | import os 6 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 7 | from utils import read_config 8 | 9 | 10 | # Read config 11 | config = read_config() 12 | palette = config['colors']['nets'] 13 | mthds = list(config['methods'].keys()) 14 | baselines = config['baselines'] 15 | 16 | path_df = sys.argv[1] 17 | dname = os.path.basename(path_df).split('.')[0] 18 | df = pd.read_csv(path_df) 19 | mthds = df[df['cat'] == 'full'].groupby('mth', as_index=False)['e_ocoeff'].mean() 20 | mthds = mthds[mthds['e_ocoeff'] < 1.]['mth'].values 21 | 22 | # Find inter across seeds 23 | seeds = [0, 1, 2] 24 | dfs = [] 25 | for mth in mthds: 26 | if mth not in baselines: 27 | mth = 'o_' + mth 28 | df = [] 29 | for i, seed_a in enumerate(seeds): 30 | seed_a = str(seed_a) 31 | path_a = f'dts/{dname}/cases/16384_16384_{seed_a}/runs/{mth}.{mth}.{mth}.{mth}.grn.csv' 32 | grn_a = pd.read_csv(path_a)[['source', 'target', 'score']] 33 | for seed_b in seeds[i + 1:]: 34 | path_b = f'dts/{dname}/cases/16384_16384_{seed_b}/runs/{mth}.{mth}.{mth}.{mth}.grn.csv' 35 | grn_b = pd.read_csv(path_b)[['source', 'target', 'score']] 36 | df.append(pd.merge(grn_a, grn_b, how='inner', on=['source', 'target']).assign(comp=f'{seed_a}_{seed_b}')) 37 | mth = mth.replace('o_', '') 38 | df = pd.concat(df) 39 | if df.shape[0] > 1: 40 | df.insert(0, 'mth', mth) 41 | else: 42 | df.loc[0, :] = [np.nan for c in df.columns] 43 | df['mth'] = mth 44 | dfs.append(df) 45 | df = pd.concat(dfs) 46 | 47 | # Cors 48 | pairs = ['0_1', '0_2', '1_2'] 49 | cors = [] 50 | for mth in df['mth'].unique(): 51 | tmp = df[df['mth'] == mth] 52 | for pair in pairs: 53 | comp = tmp[tmp['comp'] == pair] 54 | if comp.shape[0] > 1: 55 | r, p = sts.pearsonr(comp['score_x'], comp['score_y']) 56 | else: 57 | r, p = np.nan, 1 58 | cors.append([mth, r, p, pair]) 59 | cors = pd.DataFrame(cors, columns=['mth', 'stat', 'pval', 'comp']) 60 | cors['padj'] = sts.false_discovery_control(cors['pval']) 61 | 62 | # Write 63 | df.to_csv(sys.argv[2], index=False) 64 | cors.to_csv(sys.argv[3], index=False) 65 | 66 | -------------------------------------------------------------------------------- /workflow/scripts/anl/topo/fvsd.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import 
numpy as np 3 | import sys 4 | import os 5 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 6 | from utils import read_config 7 | 8 | 9 | def fixed_pip(mthds, sts, mat, title): 10 | res = [] 11 | steps = ['pre', 'c2g', 'tfb', 'mdl'] 12 | for mth in mthds: 13 | # Extract steps 14 | msk_mth = (sts['pre'] == mth) & (sts['c2g'] == mth) & (sts['tfb'] == mth) & (sts['mdl'] == mth) 15 | msk_pre = (sts['pre'] != mth) & (sts['c2g'] == mth) & (sts['tfb'] == mth) & (sts['mdl'] == mth) 16 | msk_c2g = (sts['pre'] == mth) & (sts['c2g'] != mth) & (sts['tfb'] == mth) & (sts['mdl'] == mth) 17 | msk_tfb = (sts['pre'] == mth) & (sts['c2g'] == mth) & (sts['tfb'] != mth) & (sts['mdl'] == mth) 18 | msk_mdl = (sts['pre'] == mth) & (sts['c2g'] == mth) & (sts['tfb'] == mth) & (sts['mdl'] != mth) 19 | 20 | # Build df 21 | df = pd.concat([ 22 | mat.loc[sts[msk_pre].index, sts[msk_mth].index].assign(step=0), 23 | mat.loc[sts[msk_c2g].index, sts[msk_mth].index].assign(step=1), 24 | mat.loc[sts[msk_tfb].index, sts[msk_mth].index].assign(step=2), 25 | mat.loc[sts[msk_mdl].index, sts[msk_mth].index].assign(step=3), 26 | ]).reset_index().rename(columns={'{m}.{m}.{m}.{m}'.format(m=mth): 'ocoeff', 'name_a': 'rest'}) 27 | 28 | # Format df 29 | df['rest'] = [n.split('.')[i] for n,i in zip(df['rest'], df['step'])] 30 | df['step'] = [steps[i] for i in df['step']] 31 | df['mth'] = mth 32 | df = df[['mth', 'step', 'rest', 'ocoeff']] 33 | res.append(df) 34 | res = pd.concat(res) 35 | res = res.rename(columns={'ocoeff': title}) 36 | return res 37 | 38 | 39 | # Read 40 | sim = pd.read_csv(sys.argv[1]) 41 | sts = pd.read_csv(sys.argv[2]) 42 | config = read_config() 43 | mthds = list(config['methods'].keys()) 44 | 45 | # Remove original runs and baselines 46 | sim = sim[~(sim['name_a'].str.startswith('o_') | sim['name_b'].str.startswith('o_'))] 47 | sim = sim[(sim['name_a'].str.split('.', expand=True)[0].isin(mthds) & sim['name_b'].str.split('.', expand=True)[0].isin(mthds))] 48 | 49 | # Find ocoeffs for fixed vs one step change 50 | df = None 51 | for oc in ['tf_oc', 'edge_oc', 'target_oc']: 52 | mat = sim.dropna().pivot(index='name_a', columns='name_b', values=oc).fillna(0) 53 | mat = mat + mat.T 54 | np.fill_diagonal(mat.values, 1) 55 | t_sts = sts.set_index('name').loc[mat.index].rename(columns={'p2g': 'c2g'}) 56 | t_sts[['pre', 'c2g', 'tfb', 'mdl']] = t_sts.reset_index()['name_a'].str.split('.', n=4, expand=True).values 57 | if df is None: 58 | df = fixed_pip(mthds, t_sts, mat, title=oc) 59 | else: 60 | df = pd.merge(df, fixed_pip(mthds, t_sts, mat, title=oc), on=['mth', 'step', 'rest']) 61 | 62 | # Write 63 | df.to_csv(sys.argv[3], index=False) 64 | -------------------------------------------------------------------------------- /workflow/scripts/anl/topo/inter.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import argparse 4 | 5 | 6 | # Init args 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('-g','--paths_grns', required=True, nargs='+') 9 | parser.add_argument('-b','--baselines', required=True, nargs='+') 10 | parser.add_argument('-p','--min_prop', required=True, type=float) 11 | parser.add_argument('-o','--path_out', required=True) 12 | args = parser.parse_args() 13 | 14 | grns = [] 15 | blns = [] 16 | for grn_path in args.paths_grns: 17 | name = grn_path.split('.')[-3] 18 | if name.startswith('o_') and (name not in args.baselines): 19 | grn = 
pd.read_csv(grn_path).drop_duplicates(['source', 'target']) 20 | grn['name'] = name.replace('o_', '') 21 | grns.append(grn) 22 | elif name in args.baselines: 23 | grn = pd.read_csv(grn_path).drop_duplicates(['source', 'target']).drop(columns='cre') 24 | grn['name'] = name 25 | blns.append(grn) 26 | 27 | min_n = np.floor(args.min_prop * len(grns)) 28 | grns = pd.concat(grns) 29 | blns = pd.concat(blns) 30 | shared = grns.groupby(['source', 'target'], as_index=False).size().sort_values('size', ascending=False) 31 | shared = shared[shared['size'] > min_n] 32 | 33 | 34 | shared_grn = ( 35 | pd.merge(grns, shared, how='inner', on=['source', 'target']) 36 | .sort_values(['name', 'source', 'target', 'pval']) 37 | [['name', 'source', 'target', 'score']] 38 | ) 39 | nodes = set(shared_grn['source']) | set(shared_grn['target']) 40 | msk = blns['source'].isin(nodes) & blns['target'].isin(nodes) 41 | blns = blns.loc[msk, :] 42 | 43 | shared_grn = pd.concat([ 44 | shared_grn.assign(type='mth'), 45 | blns.assign(type='bsl') 46 | ]) 47 | 48 | # Write 49 | shared_grn.to_csv(args.path_out, index=False) 50 | -------------------------------------------------------------------------------- /workflow/scripts/anl/topo/run_pair_sim.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | import pandas as pd 3 | import numpy as np 4 | import os 5 | import glob 6 | from tqdm import tqdm 7 | from functools import partial 8 | import sys 9 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 10 | from utils import ( 11 | ocoeff, 12 | get_grn_name, 13 | get_grn_stats 14 | ) 15 | import argparse 16 | 17 | 18 | # Init args 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('-t','--stat_path', required=True) 21 | parser.add_argument('-s','--sim_path', required=True) 22 | args = vars(parser.parse_args()) 23 | 24 | stat_path = args['stat_path'] 25 | sim_path = args['sim_path'] 26 | 27 | dat, case = os.path.basename(stat_path).split('.')[:2] 28 | paths = glob.glob(os.path.join('dts', dat, 'cases', case, 'runs', '*.grn.csv')) 29 | 30 | print('Reading and computing grns stats...') 31 | names = [] 32 | dfs = [] 33 | stats = [] 34 | tfs = [] 35 | edges = [] 36 | genes = [] 37 | 38 | for path in tqdm(paths): 39 | name = get_grn_name(path) 40 | names.append(name) 41 | df = pd.read_csv(path).drop_duplicates(['source', 'target'], keep='first') 42 | stat = get_grn_stats(df) 43 | stats.append([name] + list(stat)) 44 | tfs.append(set(df['source'])) 45 | edges.append(set(df['source'] + '|' + df['target'])) 46 | genes.append(set(df['target'])) 47 | 48 | 49 | # Store as df 50 | cols = ['name', 'n_tfs', 'n_edges', 'n_targets', 'odegree', 'betweenc', 'eigv'] 51 | stats = pd.DataFrame(stats, columns=cols) 52 | 53 | print('Computing pairwise overlap coefficients...') 54 | 55 | 56 | def set_ocoef(a, b): 57 | min_s = min(len(a), len(b)) 58 | if min_s == 0: 59 | return np.nan 60 | else: 61 | inter = len(a & b) 62 | return inter / min_s 63 | 64 | 65 | names_a = [] 66 | names_b = [] 67 | tf_coefs = [] 68 | edge_coefs = [] 69 | target_coefs = [] 70 | for i in tqdm(range(len(names))): 71 | name_a = names[i] 72 | tf_a = tfs[i] 73 | ed_a = edges[i] 74 | gn_a = genes[i] 75 | for j in range(i, len(names)): 76 | name_b = names[j] 77 | tf_b = tfs[j] 78 | ed_b = edges[j] 79 | gn_b = genes[j] 80 | names_a.append(name_a) 81 | names_b.append(name_b) 82 | tf_coefs.append(set_ocoef(tf_a, tf_b)) 83 | edge_coefs.append(set_ocoef(ed_a, ed_b)) 84 | 
target_coefs.append(set_ocoef(gn_a, gn_b)) 85 | 86 | 87 | # Store as df 88 | sims = pd.DataFrame() 89 | sims['name_a'] = names_a 90 | sims['name_b'] = names_b 91 | sims['tf_oc'] = tf_coefs 92 | sims['edge_oc'] = edge_coefs 93 | sims['target_oc'] = target_coefs 94 | 95 | # Write 96 | stats.to_csv(stat_path, index=False) 97 | sims.to_csv(sim_path, index=False) 98 | -------------------------------------------------------------------------------- /workflow/scripts/anl/tss/dist.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pyranges as pr 3 | from tqdm import tqdm 4 | import os 5 | import glob 6 | import argparse 7 | 8 | 9 | # Parse args 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('-g', '--path_cmp', required=True) 12 | parser.add_argument('-b', '--baselines', required=True, nargs='+') 13 | parser.add_argument('-o', '--path_out', required=True) 14 | args = parser.parse_args() 15 | path_cmp = args.path_cmp 16 | baselines = args.baselines 17 | path_out = args.path_out 18 | 19 | # Set variables 20 | dname, case = os.path.basename(path_cmp).split('.')[:2] 21 | path_grns = glob.glob(os.path.join('dts', dname, 'cases', case, 'runs', '*.grn.csv')) 22 | def compute_dist_tss(path, mth): 23 | if mth.startswith('o_'): 24 | grn = pd.read_csv(path) 25 | cre_grn = pd.read_csv(path.replace('o_', '')).rename(columns={'tf': 'source', 'gene': 'target'}) 26 | grn = pd.merge(grn, cre_grn[['source', 'cre', 'target']]) 27 | else: 28 | grn = pd.read_csv(path) 29 | mth = mth.replace('o_', '') 30 | grn = grn.drop_duplicates(['cre', 'target']) 31 | grn[['Chromosome', 'Start', 'End']] = grn['cre'].str.split('-', expand=True) 32 | grn = pr.PyRanges(grn[['Chromosome', 'Start', 'End', 'target']].rename(columns={'target': 'Name'})) 33 | tss = pd.read_csv(f'dbs/hg38/gen/tss/{mth}.bed', sep='\t', header=None) 34 | tss.columns = ['Chromosome', 'Start', 'End', 'Name'] 35 | tss = pr.PyRanges(tss) 36 | genes = grn.df['Name'].unique().astype('U') 37 | dists = [] 38 | for g in genes: 39 | g_grn = grn[grn.Name == g] 40 | g_tss = tss[tss.Name == g] 41 | dists.append(g_grn.nearest(g_tss, overlap=True).df[['Chromosome', 'Start', 'End', 'Distance']].assign(gene=g)) 42 | dists = pd.concat(dists).rename(columns={'Distance': 'dist'}) 43 | dists['mth'] = mth 44 | dists['cre'] = dists['Chromosome'].astype(str) + '-' + dists['Start'].astype(str) + '-' + dists['End'].astype(str) 45 | dists = dists[['mth', 'cre', 'gene', 'dist']] 46 | return dists 47 | 48 | # Compute dists 49 | dists = [] 50 | path_grns = [p for p in path_grns if (os.path.basename(p).startswith('o_')) or (os.path.basename(p).split('.')[0] in baselines)] 51 | print(path_grns) 52 | for path_grn in tqdm(path_grns): 53 | mth = os.path.basename(path_grn).split('.')[0] # Assume all stp equal 54 | dists.append(compute_dist_tss(path_grn, mth)) 55 | dists = pd.concat(dists) 56 | 57 | # Write 58 | dists.to_csv(path_out, index=False) 59 | -------------------------------------------------------------------------------- /workflow/scripts/anl/tss/gocoef.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from tqdm import tqdm 3 | import pyranges as pr 4 | import os 5 | import argparse 6 | 7 | 8 | # Parse args 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('-a', '--path_tss_a', required=True) 11 | parser.add_argument('-b', '--path_tss_b', required=True) 12 | parser.add_argument('-o', '--path_out', required=True) 13 | args = 
parser.parse_args() 14 | path_tss_a = args.path_tss_a 15 | path_tss_b = args.path_tss_b 16 | out_path = args.path_out 17 | 18 | 19 | # Read 20 | names = [] 21 | pr_tss = [] 22 | for path in [path_tss_a, path_tss_b]: 23 | name = os.path.basename(path).replace('.bed', '') 24 | tss = pd.read_csv(path, sep='\t', header=None) 25 | tss.columns = ['Chromosome', 'Start', 'End', 'Name'] 26 | tss = tss.sort_values(['Chromosome', 'Start', 'End', 'Name']) 27 | tss = pr.PyRanges(tss) 28 | names.append(name) 29 | pr_tss.append(tss) 30 | 31 | # Find shared genes 32 | genes = set().union(pr_tss[0].Name).intersection(pr_tss[1].Name) 33 | 34 | # Find genomic overlap coef 35 | def overlap_coef_per_gene(gene, tss_a, tss_b): 36 | ftss_a = tss_a[tss_a.Name == gene].merge() 37 | ftss_b = tss_b[tss_b.Name == gene].merge() 38 | if ftss_a.empty or ftss_b.empty: 39 | raise ValueError('Gene has to be in tss') 40 | overlap = ftss_a.intersect(ftss_b) 41 | if overlap.empty: 42 | return 0. 43 | else: 44 | l = overlap.length 45 | if l == 0: 46 | return 1 47 | else: 48 | return l / min(ftss_a.length, ftss_b.length) 49 | 50 | 51 | df = [] 52 | 53 | tss_a = pr_tss[0] 54 | tss_a = tss_a[tss_a.Name.isin(genes)] 55 | name_a = names[0] 56 | 57 | tss_b = pr_tss[1] 58 | tss_b = tss_b[tss_b.Name.isin(genes)] 59 | name_b = names[1] 60 | 61 | for gene in tqdm(list(genes)): 62 | val = overlap_coef_per_gene(gene, tss_a, tss_b) 63 | df.append([name_a, name_b, gene, val]) 64 | 65 | df = pd.DataFrame(df, columns=['tss_a', 'tss_b', 'gene', 'ocoef']) 66 | 67 | # Write 68 | df.to_csv(out_path, index=False) 69 | -------------------------------------------------------------------------------- /workflow/scripts/anl/utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os 4 | 5 | 6 | def read_config(path_config='config/config.yaml'): 7 | import yaml 8 | with open(path_config, 'r') as file: 9 | config = yaml.safe_load(file) 10 | return config 11 | 12 | 13 | def get_grn_name(grn_path): 14 | name = os.path.basename(grn_path).replace('.grn.csv', '').replace('.csv', '') 15 | return name 16 | 17 | 18 | def get_grn_stats(grn): 19 | import igraph as ig 20 | if len(grn) == 0: 21 | return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan 22 | n_s = grn['source'].unique().size 23 | n_e = grn.shape[0] 24 | n_t = grn['target'].unique().size 25 | 26 | g = ig.Graph.TupleList(list(zip(grn['source'], grn['target'])), directed=True) 27 | tf_bet = np.mean(g.betweenness()) 28 | tf_odg = grn.groupby(['source']).size().mean() 29 | if not g.is_acyclic(): 30 | tf_eig = np.mean(g.eigenvector_centrality()) 31 | else: 32 | tf_eig = 0. 33 | 34 | return n_s, n_e, n_t, tf_odg, tf_bet, tf_eig 35 | 36 | 37 | def ocoeff(df_a, df_b, on=['source', 'target']): 38 | """Compute overlap coefficient between two dfs""" 39 | tmp_a, tmp_b = df_a.drop_duplicates(on), df_b.drop_duplicates(on) 40 | a_size, b_size = tmp_a.shape[0], tmp_b.shape[0] 41 | if (a_size > 0) and (b_size > 0): 42 | inter = pd.merge(tmp_a, tmp_b, on=on, how='inner') 43 | i_size = inter.shape[0] 44 | coeff = i_size / np.min([a_size, b_size]) 45 | else: 46 | coeff = 0. 
47 | return coeff 48 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/c2g/eqtlcat_gene.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pandas as pd 4 | from tqdm import tqdm 5 | from concurrent.futures import ProcessPoolExecutor 6 | 7 | 8 | mta = pd.read_csv(sys.argv[1], sep='\t', header=None) 9 | mta['smpl'] = mta[0] + '.' + mta[1] 10 | mta = mta.set_index('smpl')[2].to_dict() 11 | 12 | file_data = {} 13 | 14 | for line in tqdm(sys.stdin): 15 | chrm, start, end, gene, smpl = line.strip().split('\t') 16 | start, end = int(start), int(end) 17 | ctype = mta[smpl] 18 | 19 | if gene not in file_data: 20 | file_data[gene] = "" 21 | file_data[gene] += f'{chrm}\t{start}\t{end}\t{gene}\t{ctype}\n' 22 | 23 | 24 | def write_gene_file(gene, lines, output_dir): 25 | with open(os.path.join(output_dir, f'{gene}.bed'), 'w') as f: 26 | f.writelines(lines) 27 | 28 | 29 | with ProcessPoolExecutor(max_workers=32) as executor: 30 | futures = {executor.submit(write_gene_file, gene, lines, sys.argv[2]): gene for gene, lines in file_data.items()} 31 | for future in tqdm(futures, total=len(futures)): 32 | future.result() -------------------------------------------------------------------------------- /workflow/scripts/dbs/c2g/eqtlcat_smpl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pandas as pd 4 | from tqdm import tqdm 5 | 6 | 7 | gdict = pd.read_csv(sys.argv[1]).set_index('id')['symbol'].to_dict() 8 | thr_pval = float(sys.argv[2]) 9 | name = os.path.basename(sys.argv[3]).replace('.bed', '') 10 | with open(sys.argv[3], 'w') as f: 11 | next(sys.stdin) # skip first line 12 | for line in tqdm(sys.stdin): 13 | line = line.strip().split('\t') 14 | gene, coords, pval = line[1], line[3], float(line[7]) 15 | chrm, start = coords.split('_')[:2] 16 | start, end = int(start), int(start) 17 | valid = (pval < thr_pval) and (gene in gdict) and ('_' not in chrm) 18 | if valid: 19 | gene = gdict[gene] 20 | f.write(f'{chrm}\t{start}\t{end}\t{gene}\t{name}\n') 21 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/cre/gwascatalogue.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from tqdm import tqdm 4 | import argparse 5 | 6 | 7 | # Init args 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('-i','--inp_path', required=True) 10 | args = vars(parser.parse_args()) 11 | 12 | inp_path = args['inp_path'] 13 | 14 | # Read tsv input 15 | df = pd.read_csv(inp_path, sep='\t', dtype={9: 'str', 12: 'str', 23: 'str', 26: 'str'}) 16 | 17 | # Remove nans 18 | df = df[~df['CHR_POS'].isna()] 19 | df = df[~df['SNP_ID_CURRENT'].isna()] 20 | df = df[~df['MAPPED_TRAIT_URI'].isna()] 21 | 22 | # Drop one special case with multiple snp ids 23 | df = df[~df['SNP_ID_CURRENT'].astype(str).str.contains(';')] 24 | 25 | # Split urls and obtain key, take care of multiple terms separated by commas 26 | df['MAPPED_TRAIT_URI'] = [', '.join([x.split('/')[-1] for x in url.split(',')]) for url in df['MAPPED_TRAIT_URI']] 27 | 28 | # Exctracts the risk allele and sets anything else from ATGC to unknown 29 | str_alleles = [] 30 | bases = np.array(['A', 'T', 'G', 'C']) 31 | for snp in tqdm(df['STRONGEST SNP-RISK ALLELE']): 32 | snp = snp.split('-')[-1].upper() 33 | has_bases = np.all(np.isin([l for l in 
snp], bases)) 34 | if has_bases and snp != '': 35 | str_alleles.append(snp) 36 | else: 37 | str_alleles.append('?') 38 | df['STRONGEST SNP-RISK ALLELE'] = str_alleles 39 | df['CHR_POS_2'] = df['CHR_POS'].copy() 40 | 41 | # Subset by important cols 42 | cols = ['CHR_ID', 'CHR_POS', 'CHR_POS_2', 'STRONGEST SNP-RISK ALLELE', 43 | 'P-VALUE', 'MAPPED_TRAIT', 'MAPPED_TRAIT_URI', 'PUBMEDID'] 44 | df = df[cols] 45 | 46 | # Transform to correct data types 47 | df['CHR_ID'] = 'chr' + df['CHR_ID'].astype(str) 48 | df['CHR_POS'] = df['CHR_POS'].astype(int) 49 | df['CHR_POS_2'] = df['CHR_POS_2'].astype(int) 50 | df['P-VALUE'] = df['P-VALUE'].astype(float) 51 | df['MAPPED_TRAIT'] = df['MAPPED_TRAIT'].astype(str) 52 | df['MAPPED_TRAIT_URI'] = df['MAPPED_TRAIT_URI'].astype(str) 53 | df['PUBMEDID'] = df['PUBMEDID'].astype(str) 54 | 55 | # Summarize when multiple p-values are given 56 | df = df.groupby(list(df.columns[df.columns != 'P-VALUE'])).mean(numeric_only=True).reset_index() 57 | 58 | # Rename and sort 59 | df = df.rename(columns={ 60 | 'CHR_ID': 'chr_id', 61 | 'CHR_POS': 'chr_start', 62 | 'CHR_POS_2': 'chr_end', 63 | 'STRONGEST SNP-RISK ALLELE': 'eff_allele', 64 | 'MAPPED_TRAIT': 'trait_name', 65 | 'MAPPED_TRAIT_URI': 'trait_uri', 66 | 'PUBMEDID': 'pubmedid', 67 | 'P-VALUE': 'pval' 68 | }) 69 | 70 | # Save 71 | df = df[['chr_id', 'chr_start', 'chr_end', 'eff_allele', 'trait_name']] 72 | df['trait_name'] = df['trait_name'].str.strip() 73 | df.to_csv(inp_path, index=False, header=None, sep='\t') 74 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/cre/promoters.R: -------------------------------------------------------------------------------- 1 | library(biomaRt) 2 | library(dplyr) 3 | 4 | # Parse args 5 | args <- commandArgs(trailingOnly = F) 6 | window_size <- as.numeric(args[6]) 7 | out_path <- args[7] 8 | 9 | 10 | ensembl <- useMart( 11 | "ensembl", 12 | dataset = "hsapiens_gene_ensembl", 13 | host = "http://www.ensembl.org" 14 | ) 15 | 16 | gene_data <- getBM( 17 | attributes = c("ensembl_gene_id", "external_gene_name", "chromosome_name", "transcription_start_site"), 18 | mart = ensembl 19 | ) 20 | 21 | gene_data <- gene_data %>% 22 | mutate( 23 | promoter_start = transcription_start_site - window_size, 24 | promoter_end = transcription_start_site + window_size - 1, 25 | promoter_start = pmax(promoter_start, 0) # Ensure non-negative values 26 | ) 27 | 28 | standard_chromosomes <- c(1:23, "X", "Y") 29 | bed_data <- gene_data %>% 30 | filter(chromosome_name %in% standard_chromosomes & external_gene_name != "") %>% 31 | distinct(external_gene_name, .keep_all = TRUE) %>% 32 | transmute( 33 | chrom = paste0("chr", chromosome_name), 34 | chromStart = promoter_start - 1, # BED format is 0-based 35 | chromEnd = promoter_end, 36 | name = external_gene_name 37 | ) 38 | 39 | bed_data <- bed_data %>% 40 | arrange( 41 | factor(chrom, levels = paste0("chr", c(1:23, "X", "Y"))), 42 | chromStart 43 | ) 44 | 45 | # Write to output file 46 | write.table(bed_data, file = out_path, sep = "\t", quote = FALSE, col.names = FALSE, row.names = FALSE) 47 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/gen/genome/celloracle.py: -------------------------------------------------------------------------------- 1 | from genomepy import install_genome 2 | import os 3 | import re 4 | import argparse 5 | 6 | 7 | # Init args 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('-o','--orgms', required=True, 
nargs='+') 10 | args = vars(parser.parse_args()) 11 | 12 | # Get dir 13 | orgms = args['orgms'] 14 | 15 | # Install genomes 16 | for path_org in orgms: 17 | org = re.search(r'^dbs/([^/]+)/.*$', path_org).group(1) 18 | install_genome(name=org, genomes_dir=path_org, provider="UCSC") 19 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/gen/gid/ensmbl.R: -------------------------------------------------------------------------------- 1 | library(biomaRt) 2 | 3 | # Parse args 4 | orgms <- commandArgs(trailingOnly = TRUE) 5 | 6 | get_gene_table <- function(dataset){ 7 | # Connect to the Ensembl database 8 | ensembl <- useEnsembl( 9 | biomart = 'genes', 10 | dataset = dataset, 11 | version = 111 12 | ) 13 | # Specify the attributes to retrieve 14 | attributes <- c("ensembl_gene_id", "external_gene_name") 15 | # Retrieve the data 16 | gene_data <- getBM( 17 | attributes = attributes, 18 | mart = ensembl, 19 | useCache=FALSE, 20 | verbose=FALSE 21 | ) 22 | colnames(gene_data) <- c('id', 'symbol') 23 | return(gene_data) 24 | } 25 | 26 | org_table <- list( 27 | 'hg38'='hsapiens_gene_ensembl', 28 | 'mm10'='mmusculus_gene_ensembl' 29 | ) 30 | 31 | for (path_org in orgms) { 32 | org <- sub('^dbs/([^/]+)/.*$', '\\1', path_org) 33 | org <- org_table[org] 34 | gid <- get_gene_table(org) 35 | gid <- gid[gid$symbol != "", ] 36 | write.csv(x = gid, file = path_org, row.names=FALSE, quote=FALSE) 37 | } 38 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/gen/pid/uniprot.R: -------------------------------------------------------------------------------- 1 | library(biomaRt) 2 | 3 | # Parse args 4 | orgms <- commandArgs(trailingOnly = TRUE) 5 | 6 | get_gene_table <- function(dataset){ 7 | # Connect to the Ensembl database 8 | ensembl <- useEnsembl( 9 | biomart = 'genes', 10 | dataset = dataset, 11 | version = 111 12 | ) 13 | # Specify the attributes to retrieve 14 | attributes <- c("uniprotswissprot", "external_gene_name") 15 | # Retrieve the data 16 | gene_data <- getBM( 17 | attributes = attributes, 18 | mart = ensembl, 19 | useCache=FALSE, 20 | verbose=FALSE 21 | ) 22 | colnames(gene_data) <- c('uniprot_id', 'symbol') 23 | return(gene_data) 24 | } 25 | 26 | org_table <- list( 27 | 'hg38'='hsapiens_gene_ensembl', 28 | 'mm10'='mmusculus_gene_ensembl' 29 | ) 30 | 31 | for (path_org in orgms) { 32 | org <- sub('^dbs/([^/]+)/.*$', '\\1', path_org) 33 | org <- org_table[org] 34 | gid <- get_gene_table(org) 35 | gid <- gid[(gid$symbol != "") & (gid$uniprot_id != ""), ] # Exclude rows with empty gene symbols 36 | write.csv(x = gid, file = path_org, row.names=FALSE, quote=FALSE) 37 | } 38 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/gen/tss/celloracle.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import requests 3 | from io import StringIO 4 | import argparse 5 | 6 | # Initiate args 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('-o', '--path_out', required=True) 9 | args = parser.parse_args() 10 | out_path = args.path_out 11 | 12 | # Download bed file 13 | url = "https://github.com/morris-lab/CellOracle/blob/e5ae78e93272da7d772378e60ae6cd4602f24be6/celloracle/motif_analysis/tss_ref_data/hg38_tss_info.bed?raw=true" 14 | response = requests.get(url) 15 | bed = pd.read_csv(StringIO(response.text), sep='\t', header=None)[[0, 1, 2, 3]].dropna().sort_values([0, 1, 2]) 16 | 
17 | # Save file 18 | bed.to_csv(out_path, sep="\t", index=False, header=False) 19 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/gen/tss/dictys.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument('-o', '--path_out', required=True) 6 | parser.add_argument('-i', '--path_input', required=True) 7 | args = parser.parse_args() 8 | out_path = args.path_out 9 | input_path = args.path_input 10 | 11 | # Read file 12 | bed = pd.read_csv(input_path, sep='\t', header=None) 13 | 14 | # Process columns 15 | bed.columns = ['Chromosome', 'Start', 'End', 'Name', 'score', 'strand'] 16 | bed = bed[['Chromosome', 'Start', 'End', 'Name']] 17 | bed['Start'] = bed['Start'] - 1 18 | bed['End'] = bed['End'] - 1 19 | 20 | # Save file 21 | bed.to_csv(out_path, sep="\t", index=False, header=None) 22 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/gen/tss/figr.R: -------------------------------------------------------------------------------- 1 | library(FigR) 2 | library(GenomicRanges) 3 | 4 | 5 | # Add arguments 6 | args <- commandArgs(trailingOnly = F) 7 | path_out <- args[6] 8 | 9 | # Extract TSS annotations 10 | TSSg <- FigR::hg38TSSRanges 11 | chr <- as.character(seqnames(TSSg)) 12 | start_pos <- start(TSSg) 13 | end_pos <- end(TSSg) 14 | gene_names <- mcols(TSSg)$gene_name 15 | 16 | # Transform it into a data frame 17 | data <- data.frame(Chromosome = chr, Start = start_pos - 1, End = end_pos - 1, Name = gene_names) 18 | 19 | # Write 20 | write.table(x = data, file = path_out, sep = '\t', row.names = FALSE, quote = FALSE, col.names = FALSE) 21 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/gen/tss/granie.R: -------------------------------------------------------------------------------- 1 | library(AnnotationHub) 2 | 3 | 4 | # Initiate args 5 | args <- commandArgs(trailingOnly = F) 6 | path_out <- args[6] 7 | 8 | 9 | # Load db 10 | ah <- AnnotationHub() 11 | 12 | # Get the newest version of annotation 13 | results = AnnotationHub::query(ah, c("EnsDb", "Homo sapiens")) 14 | annotationDatasets <- as.data.frame(mcols(results)) 15 | newestAnno.title = tail(annotationDatasets$title, 1) 16 | newestAnno.ID = tail(rownames(annotationDatasets), 1) 17 | ensdb.newest <- ah[[newestAnno.ID]] 18 | 19 | # Read 20 | gr <- ensembldb::genes(ensdb.newest) 21 | 22 | # Merge overlaps 23 | merged <- unlist(reduce(split(gr, gr$gene_name)), use.names = TRUE) 24 | 25 | # To df 26 | chr_names <- paste0("chr", as.character(seqnames(merged))) 27 | start_pos <- start(merged) - 1 28 | end_pos <- end(merged) - 1 29 | gene_names <- names(merged) 30 | bed <- data.frame(Chromosome = chr_names, Start = start_pos, End = end_pos, Name = gene_names) 31 | 32 | # Filter empty names 33 | bed <- bed[bed$Name != '', ] 34 | 35 | # Sort 36 | bed <- bed[order(bed$Chromosome, bed$Start, bed$End), ] 37 | 38 | # Write 39 | write.table(x = bed, file = path_out, sep = '\t', row.names = FALSE, quote = FALSE, col.names = FALSE) 40 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/gen/tss/hummus.R: -------------------------------------------------------------------------------- 1 | # Initiate arguments 2 | args <- commandArgs(trailingOnly = F) 3 | path_out <- args[6] 4 | 5 | 6 | library(HuMMuS) 7 | 
library(EnsDb.Hsapiens.v86) 8 | library(dplyr) 9 | 10 | # Extract TSS 11 | gene_range = get_genome_annotations(EnsDb.Hsapiens.v86) 12 | chr <- as.character(seqnames(gene_range)) 13 | start_pos <- start(gene_range) 14 | end_pos <- end(gene_range) 15 | gene_names <- mcols(gene_range)$gene_name 16 | gene_type <- mcols(gene_range)$gene_biotype 17 | 18 | 19 | # Build dataframe in .csv 20 | data <- data.frame(Chromosome = chr, Start = start_pos, End = end_pos, Name = gene_names, gene.type = gene_type) 21 | 22 | 23 | # Filter only protein coding genes 24 | data <- data %>% filter(gene.type == "protein_coding") 25 | data <- data %>% 26 | dplyr::select(Chromosome, Start, End, Name) 27 | 28 | 29 | write.csv(x = data, file = path_out) 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/gen/tss/pando.R: -------------------------------------------------------------------------------- 1 | library(EnsDb.Hsapiens.v86) 2 | library(dplyr) 3 | 4 | 5 | # Parse args 6 | args <- commandArgs(trailingOnly = F) 7 | path_out <- args[6] 8 | 9 | 10 | # Read 11 | gr <- Signac::GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86) 12 | 13 | # Merge overlaps 14 | merged <- unlist(reduce(split(gr, gr$gene_name)), use.names = TRUE) 15 | 16 | # To df 17 | chr_names <- paste0("chr", as.character(seqnames(merged))) 18 | start_pos <- start(merged) 19 | end_pos <- end(merged) 20 | gene_names <- names(merged) 21 | bed <- data.frame(Chromosome = chr_names, Start = start_pos, End = end_pos, Name = gene_names) 22 | bed <- dplyr::arrange(bed, Chromosome, Start, End) 23 | 24 | # Write 25 | write.table(x = bed, file = path_out, sep = '\t', row.names = FALSE, quote = FALSE, col.names = FALSE) 26 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/gen/tss/scenicplus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[22]: 5 | 6 | 7 | import pybiomart as pbm 8 | import argparse 9 | import numpy as np 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('-j', '--path_out', required=True) 13 | args = parser.parse_args() 14 | out_path = args.path_out 15 | 16 | dataset = pbm.Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org') 17 | 18 | annot = dataset.query(attributes=['chromosome_name', 'start_position', 'end_position', 19 | 'strand', 'external_gene_name', 'transcription_start_site', 'transcript_biotype']) 20 | annot['chromosome_name'] = annot['chromosome_name'].to_numpy(dtype=str) 21 | filter = annot['chromosome_name'].str.contains('CHR|GL|JH|MT', case=False) 22 | annot = annot[~filter] 23 | annot['chromosome_name'] = annot['chromosome_name'].str.replace(r'(\b\S)', r'chr\1') 24 | annot.columns = ['Chromosome', 'Start', 'End', 'Strand', 'Name', 'Transcription_Start_Site', 'Transcript_type'] 25 | annot["Strand"] = annot["Strand"].replace({1: "+", -1: "-"}) 26 | annot.Start = annot.Start.astype(np.int32) 27 | annot['Chromosome'] = 'chr' + annot['Chromosome'].astype(str) 28 | annot.dropna(inplace=True) 29 | annot = annot[['Chromosome', 'Start', 'End', 'Name']] 30 | 31 | # Save the file 32 | annot.to_csv(out_path, sep="\t", index=False) 33 | 34 | 35 | 36 | # In[ ]: 37 | 38 | 39 | 40 | 41 | 42 | # In[ ]: 43 | 44 | 45 | 46 | 47 | 48 | # In[ ]: 49 | 50 | 51 | 52 | 53 | 54 | # In[ ]: 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- 
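Note on the pathway collection script below: it expects decoupler's GMT reader to return a long table with one row per (gene set, gene) pair, i.e. columns 'source' and 'target', which is what the prefix stripping and the CSV writes operate on. The following is a minimal illustrative sketch of that expected shape, assuming the standard GMT layout (set name, description, then member genes, all tab-separated); the helper name read_gmt_sketch is hypothetical and not part of the workflow.

import pandas as pd

def read_gmt_sketch(path: str) -> pd.DataFrame:
    # A GMT line is: set name, description, gene1, gene2, ... (tab-separated)
    rows = []
    with open(path) as f:
        for line in f:
            fields = line.rstrip('\n').split('\t')
            gset, genes = fields[0], fields[2:]
            rows.extend((gset, g) for g in genes)
    # Long format mirroring what pways.py expects from dc.read_gmt
    return pd.DataFrame(rows, columns=['source', 'target'])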
/workflow/scripts/dbs/gst/pways.py: -------------------------------------------------------------------------------- 1 | import decoupler as dc 2 | import pandas as pd 3 | import argparse 4 | 5 | # Init args 6 | parser = argparse.ArgumentParser() 7 | for flag in ['path_reac', 'path_hall', 'path_kegg', 'path_tfs', 'path_prg', 'path_out_hall', 'path_out_kegg', 'path_out_prg', 'path_out_reac']: 8 | parser.add_argument('--' + flag, required=True) 9 | args = vars(parser.parse_args()) 10 | 11 | path_reac = args['path_reac'] 12 | path_hall = args['path_hall'] 13 | path_kegg = args['path_kegg'] 14 | path_tfs = args['path_tfs'] 15 | path_prg = args['path_prg'] 16 | path_out_hall = args['path_out_hall'] 17 | path_out_kegg = args['path_out_kegg'] 18 | path_out_prg = args['path_out_prg'] 19 | path_out_reac = args['path_out_reac'] 20 | 21 | # Process hallmark 22 | hall = dc.read_gmt(path_hall) 23 | hall['source'] = hall['source'].str.replace('HALLMARK_', '') 24 | 25 | # Process kegg 26 | kegg = dc.read_gmt(path_kegg) 27 | kegg['source'] = kegg['source'].str.replace('KEGG_', '') 28 | 29 | # Process progeny 30 | prg = pd.read_csv(path_prg) 31 | prg = prg.rename(columns={'gene': 'target', 'pathway': 'source', 'p.value': 'pval'}) 32 | prg = prg[['source', 'target', 'weight', 'pval']] 33 | prg = prg[prg['pval'] < 0.05] 34 | prg = prg.sort_values(['source', 'pval']) 35 | prg = prg.rename(columns={'source': 'pathway', 'target': 'gene'}) 36 | 37 | # Process reactome 38 | reac = dc.read_gmt(path_reac) 39 | reac['source'] = reac['source'].str.replace('REACTOME_', '') 40 | 41 | # Write 42 | kegg.to_csv(path_out_kegg, index=False) 43 | prg.to_csv(path_out_prg, index=False) 44 | hall.to_csv(path_out_hall, index=False) 45 | reac.to_csv(path_out_reac, index=False) 46 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/ont/bto.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pandas as pd 3 | 4 | b_dict = dict() 5 | read = False 6 | 7 | for line in sys.stdin: 8 | line = line.strip() 9 | if line.startswith('<owl:Class'): 10 | read = True 11 | continue 12 | elif line.startswith('<oboInOwl:id') and read: # assumed id element tag, value like 'BTO:0000000' 13 | key = line.split('>')[1].split('<')[0] 14 | continue 15 | elif line.startswith('<rdfs:label') and read: 16 | val = line.split('>')[1].split('<')[0] 17 | continue 18 | elif line.startswith('</owl:Class>') and read: 19 | b_dict[key] = val 20 | read = False 21 | 22 | b_dict = pd.DataFrame(list(b_dict.items())) 23 | b_dict.to_csv(sys.argv[1], sep='\t', index=False, header=None) 24 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/tfb/aggregate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | for line in sys.stdin: 3 | line = line.replace('\n', '').split('\t') 4 | chrm, start, end, tf, ctype = line[0], line[1], line[2], line[3], line[4] 5 | ctype = ','.join(sorted(set(ctype.split(',')))) 6 | print(f'{chrm}\t{start}\t{end}\t{tf}\t{ctype}') 7 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/tfb/chipatlas_meta.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import sys 4 | 5 | 6 | df = pd.read_csv(sys.argv[1], sep='\t', usecols=[0, 1, 2, 3, 4, 5], header=None) 7 | tfs = pd.read_csv(sys.argv[2], header=None).values.ravel() 8 | org = sys.argv[1].split(os.sep)[1] 9 | msk_org = df[1] == org 10 | msk_tfs = df[3].isin(tfs) 11 | msk_unc = ~(df[4] == 'Unclassified') 12 | msk = msk_org & msk_tfs & msk_unc 13 | df = df.loc[msk, :].dropna() 14 | df['ctype'] = df[4] + ',' + df[5] 15 | df = df[[0, 3, 'ctype']] 16 | df.to_csv(sys.argv[1],
sep='\t', index=False, header=None) 17 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/tfb/chipatlas_tf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import re 4 | import pandas as pd 5 | 6 | 7 | tf = os.path.basename(sys.argv[1]).replace('.bed', '') 8 | meta = pd.read_csv(sys.argv[2], sep='\t', header=None).set_index(0) 9 | pattern = r'ID=(.*?);' 10 | for line in sys.stdin: 11 | if line.startswith('chr'): 12 | line = line.replace('\n', '').split('\t') 13 | chrm, start, end, sample_id = line[0], line[1], line[2], line[3] 14 | sample_id = re.search(pattern, sample_id).group(1) 15 | if (sample_id in meta.index) and ('_' not in chrm): 16 | m_tf = meta.loc[sample_id, 1] 17 | ctype = meta.loc[sample_id, 2] 18 | start, end = int(start), int(end) 19 | if (m_tf == tf) and ((end - start) < int(sys.argv[3])): 20 | print(f'{chrm}\t{start}\t{end}\t{tf}\t{ctype}') 21 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/tfb/remap2022_meta.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from io import BytesIO 3 | import sys 4 | 5 | binary_data = BytesIO(sys.stdin.buffer.read()) 6 | df = pd.read_excel(pd.ExcelFile(binary_data), sheet_name=0) 7 | df = df[['biotype', 'identifiants/0/BTO_id']].dropna() 8 | df = df.rename(columns={'identifiants/0/BTO_id': 'id'}) 9 | df['id'] = df['id'].str.replace('_', ':') 10 | bto = pd.read_csv(sys.argv[1], sep='\t', header=None).set_index(0)[1].to_dict() 11 | df['term'] = [bto[i] for i in df['id']] 12 | df[['biotype', 'term']].to_csv(sys.argv[2], sep='\t', index=False, header=None) 13 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/tfb/remap2022_raw.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pandas as pd 4 | from tqdm import tqdm 5 | 6 | 7 | tfs = set(pd.read_csv(sys.argv[1], header=None).iloc[:, 0].astype('U')) 8 | mta = pd.read_csv(sys.argv[2], header=None, sep='\t', index_col=0).iloc[:, 0].to_dict() 9 | file_handles = {} 10 | for line in tqdm(sys.stdin): 11 | if line.startswith('chr'): 12 | chrm, start, end, tf_ctype = line.strip().split('\t')[:4] 13 | tf, ctype = tf_ctype.split(':') 14 | start, end = int(start), int(end) 15 | if tf in tfs and '_' not in chrm and (end - start) < int(sys.argv[3]): 16 | ctypes = [mta[c] for c in ctype.split(',') if c in mta] 17 | if ctypes: 18 | if tf not in file_handles: 19 | file_handles[tf] = open(os.path.join(sys.argv[4], f'{tf}.bed'), 'w') 20 | file_handles[tf].write(f'{chrm}\t{start}\t{end}\t{tf}\t{",".join(ctypes)}\n') 21 | for tf in file_handles: 22 | file_handles[tf].close() 23 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/tfb/unibind_raw.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pandas as pd 4 | from tqdm import tqdm 5 | 6 | 7 | tfs = set(pd.read_csv(sys.argv[1], header=None).iloc[:, 0].astype('U')) 8 | file_handles = {} 9 | for line in tqdm(sys.stdin): 10 | chrm, start, end, tmp = line.strip().split('\t')[:4] 11 | tmp = tmp.split('_') 12 | if len(tmp) == 4: 13 | _, ctype, tf, _ = tmp 14 | start, end = int(start), int(end) 15 | ctype = ctype.replace('-', ' ').replace(',', ' ').strip() 16 | tf = tf.strip() 17 |
valid = (tf in tfs) and ('_' not in chrm) and ((end - start) < int(sys.argv[2])) 18 | if valid: 19 | if tf not in file_handles: 20 | file_handles[tf] = open(os.path.join(sys.argv[3], f'{tf}.bed'), 'w') 21 | file_handles[tf].write(f'{chrm}\t{start}\t{end}\t{tf}\t{ctype}\n') 22 | for tf in file_handles: 23 | file_handles[tf].close() -------------------------------------------------------------------------------- /workflow/scripts/dbs/tfm/hpa.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import argparse 4 | 5 | 6 | # Init args 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('-i','--inp_path', required=True) 9 | parser.add_argument('-t','--tfs_path', required=True) 10 | parser.add_argument('-o','--out_path', required=True) 11 | args = vars(parser.parse_args()) 12 | 13 | inp_path = args['inp_path'] 14 | tfs_path = args['tfs_path'] 15 | out_path = args['out_path'] 16 | 17 | # Read 18 | c_cols = ['Tissue expression cluster', 'Cell line expression cluster', 'Single cell expression cluster'] 19 | df = pd.read_csv(inp_path, sep='\t').dropna(subset=c_cols) 20 | 21 | # Read tfs 22 | tfs = pd.read_csv(tfs_path, sep='\t', header=None).values.ravel().astype('U') 23 | 24 | # Filter 25 | msk_evd = (df['Evidence'] == 'Evidence at protein level').values 26 | msk_loc = np.array(['Nucle' in str(s) for s in df['Subcellular location']]) 27 | msk_tissue = ~df['Tissue expression cluster'].str.contains('Non-specific -') 28 | msk_celine = ~df['Cell line expression cluster'].str.contains('Non-specific -') 29 | msk_cetype = ~df['Single cell expression cluster'].str.contains('Non-specific -') 30 | msk_tf = df['Gene'].isin(tfs) 31 | msk = msk_tf & msk_evd & msk_loc & msk_tissue & msk_celine & msk_cetype 32 | df = df.loc[msk, :].copy() 33 | 34 | # Format names 35 | for col in c_cols: 36 | df[col] = [s.split(':')[1].split('-')[0].strip() for s in df[col]] 37 | df['ctype'] = [','.join(sorted(set(lst))) for lst in df[c_cols].values] 38 | df = df.rename(columns={'Gene': 'gene'})[['gene', 'ctype']] 39 | df = df.sort_values(['gene', 'ctype']) 40 | 41 | # Write 42 | df.to_csv(out_path, sep='\t', index=False, header=None) 43 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/tfp/europmc.py: -------------------------------------------------------------------------------- 1 | import scipy.stats as ss 2 | import pandas as pd 3 | import sys 4 | 5 | 6 | # Vars 7 | path_single = sys.argv[1] 8 | path_pairs = sys.argv[2] 9 | pval_thr = float(sys.argv[3]) 10 | min_odds = float(sys.argv[4]) 11 | path_out = sys.argv[5] 12 | 13 | # Read 14 | single = pd.read_csv(path_single) 15 | total = single['n'].sum() 16 | single = single.set_index('tf')['n'].to_dict() 17 | pairs = pd.read_csv(path_pairs) 18 | 19 | # Compute one-sided Fisher test 20 | df = [] 21 | for row in pairs.values: 22 | tf_a, tf_b, n = row 23 | only_a = single[tf_a] - n 24 | only_b = single[tf_b] - n 25 | backgr = total - (single[tf_a] + single[tf_b]) 26 | s, p = ss.fisher_exact([[n, only_a], [only_b, backgr]], alternative='greater') 27 | df.append([tf_a, tf_b, s, p]) 28 | df = pd.DataFrame(df, columns=['tf_a', 'tf_b', 'stat', 'pval']) 29 | df['padj'] = ss.false_discovery_control(df['pval']) 30 | 31 | # Filter 32 | df = df[(df['padj'] < pval_thr) & (df['stat'] > min_odds)].copy() 33 | df['name'] = ['|'.join(sorted([a, b])) for a, b in zip(df['tf_a'], df['tf_b'])] 34 | df[['tf_a', 'tf_b']] = df['name'].str.split('|', expand=True)
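# Sorting each TF pair alphabetically before splitting canonicalises (A, B) and (B, A) into a single orientation; the helper 'name' column is dropped right below.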
35 | df = df.drop(columns='name') 36 | 37 | # Save 38 | df.to_csv(path_out, index=False, header=False, sep='\t') 39 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/tfp/europmc_raw.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import pandas as pd 3 | import requests 4 | import re 5 | import time 6 | import sys 7 | 8 | 9 | def do_query(query): 10 | base = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search' 11 | url = f"{base}?query={query}&format=json" 12 | res = requests.get(url) 13 | while res.status_code != 200: 14 | print(url, flush=True) 15 | time.sleep(1) 16 | res = requests.get(url) 17 | n = int(res.json()['hitCount']) 18 | return n 19 | 20 | 21 | def get_n_pairs(tf_a, tf_b): 22 | query = f'(TITLE:"{tf_a}"+OR+ABSTRACT:"{tf_a}")+AND+(TITLE:"{tf_b}"+OR+ABSTRACT:"{tf_b}")' 23 | return do_query(query) 24 | 25 | 26 | def get_n_single(tf): 27 | query = f'(ABSTRACT:"{tf}"+OR+TITLE:"{tf}")' 28 | return do_query(query) 29 | 30 | 31 | # Read args 32 | path_tfs = sys.argv[1] 33 | min_chars = int(sys.argv[2]) 34 | min_n = int(sys.argv[3]) 35 | path_single = sys.argv[4] 36 | path_pairs = sys.argv[5] 37 | 38 | # Open tfs 39 | tfs = pd.read_csv(path_tfs, sep='\t', header=None)[0].values.astype('U') 40 | 41 | # Find unique tfs with enough publications (min_n) and characters (min_chars) 42 | single_tfs = [] 43 | for tf in tqdm(tfs): 44 | if len(tf) > min_chars: 45 | single_tfs.append([tf, get_n_single(tf)]) 46 | single_tfs = pd.DataFrame(single_tfs, columns=['tf', 'n']).sort_values('n') 47 | single_tfs = single_tfs[single_tfs['n'] > min_n] 48 | tfs = single_tfs['tf'].sort_values().unique() 49 | single_tfs.to_csv(path_single, index=False) 50 | 51 | # Find pairs 52 | df = [] 53 | for i in tqdm(range(tfs.size)): 54 | tf_a = tfs[i] 55 | for j in range(i + 1, tfs.size): 56 | tf_b = tfs[j] 57 | n = get_n_pairs(tf_a, tf_b) 58 | if n > 0: 59 | df.append([tf_a, tf_b, n]) 60 | df = pd.DataFrame(df, columns=['tf_a', 'tf_b', 'n']) 61 | df.to_csv(path_pairs, index=False) 62 | -------------------------------------------------------------------------------- /workflow/scripts/dbs/tfp/intact.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import sys 3 | 4 | 5 | # Read 6 | db = pd.read_csv(sys.argv[1], sep="\t", usecols=['#ID(s) interactor A', 'ID(s) interactor B', 'Confidence value(s)']) 7 | tfs = pd.read_csv(sys.argv[2], header=None)[0].to_list() 8 | pid = pd.read_csv(sys.argv[3]) 9 | 10 | # Format 11 | p_to_g = pid.set_index('uniprot_id')['symbol'].to_dict() 12 | db = db.rename(columns={ 13 | '#ID(s) interactor A': 'tf_a', 14 | 'ID(s) interactor B': 'tf_b', 15 | 'Confidence value(s)': 'score', 16 | }) 17 | db['tf_a'] = db['tf_a'].str.extract(r'uniprotkb:(\w+)')[0].map(p_to_g) 18 | db['tf_b'] = db['tf_b'].str.extract(r'uniprotkb:(\w+)')[0].map(p_to_g) 19 | db['score'] = db['score'].str.extract(r'intact-miscore:(\d+\.\d+)').astype(float) 20 | 21 | # Filter 22 | db = db[db['score'] > 0.75].dropna() 23 | db = db[db['tf_a'].isin(tfs) & db['tf_b'].isin(tfs)] 24 | db = db[db['tf_a'] != db['tf_b']].copy() 25 | db['str'] = ['|'.join(sorted([a, b])) for a, b in zip(db['tf_a'], db['tf_b'])] 26 | db = db.drop_duplicates('str').sort_values('score', ascending=False) 27 | db[['tf_a', 'tf_b']] = db['str'].str.split('|', expand=True) 28 | db = db.drop(columns=['str']) 29 | 30 | # Write 31 | db.to_csv(sys.argv[4], index=False, header=False, 
sep='\t') 32 | -------------------------------------------------------------------------------- /workflow/scripts/dts/brain/brain.py: -------------------------------------------------------------------------------- 1 | import os 2 | import scanpy as sc 3 | import snapatac2 as snap 4 | from snapatac2.datasets import _datasets, datasets 5 | from pathlib import Path 6 | import pandas as pd 7 | import numpy as np 8 | import anndata as ad 9 | import mudata as md 10 | import argparse 11 | 12 | 13 | # Init args 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('-a','--path_gex', nargs='+', required=True) 16 | parser.add_argument('-b','--path_peaks', required=True) 17 | parser.add_argument('-c','--path_annot', required=True) 18 | parser.add_argument('-d','--path_geneids', required=True) 19 | parser.add_argument('-f','--path_output', required=True) 20 | args = vars(parser.parse_args()) 21 | 22 | path_gex = args['path_gex'] 23 | path_peaks = args['path_peaks'] 24 | path_annot = args['path_annot'] 25 | path_geneids = args['path_geneids'] 26 | path_output = args['path_output'] 27 | 28 | # Read annots 29 | obs = pd.read_csv(path_annot, index_col=0) 30 | 31 | def read_sample(path_gex, obs, geneids): 32 | rna = sc.read_10x_h5(path_gex) 33 | rna.obs_names_make_unique() 34 | sample_id = os.path.basename(path_gex).split('_')[0] 35 | rna.obs_names = [sample_id + '_' + b.split('-1')[0] for b in rna.obs_names] 36 | 37 | # Filter faulty gene symbols 38 | ensmbls = np.array([geneids[g] if g in geneids else '' for g in rna.var_names]) 39 | msk = ensmbls != '' 40 | rna = rna[:, msk].copy() 41 | 42 | # Basic QC 43 | sc.pp.filter_cells(rna, min_genes=100) 44 | sc.pp.filter_genes(rna, min_cells=3) 45 | del rna.obs['n_genes'] 46 | 47 | # Remove duplicated genes based on num of cells 48 | to_remove = [] 49 | for dup in rna.var.index[rna.var.index.duplicated()]: 50 | tmp = rna.var.loc[dup] 51 | max_idx = tmp.set_index('gene_ids')['n_cells'].idxmax() 52 | to_remove.extend(tmp['gene_ids'][tmp['gene_ids'] != max_idx].values) 53 | rna = rna[:, ~rna.var['gene_ids'].isin(to_remove)].copy() 54 | return rna 55 | 56 | 57 | # Read gene ids 58 | geneids = pd.read_csv(path_geneids).set_index('symbol')['id'].to_dict() 59 | 60 | # Read samples 61 | rna = [] 62 | for p in path_gex: 63 | rna.append(read_sample(p, obs, geneids)) 64 | rna = ad.concat(rna, join='outer') 65 | 66 | # Read atac data 67 | atac = ad.read_h5ad(path_peaks) 68 | rna = rna[atac.obs_names].copy() 69 | rna.X.sort_indices() 70 | atac = atac[rna.obs_names].copy() 71 | 72 | # Create mdata 73 | mdata = md.MuData( 74 | {'rna': rna, 'atac': atac,}, 75 | obs=obs 76 | ) 77 | 78 | # Write 79 | mdata.write(path_output) 80 | -------------------------------------------------------------------------------- /workflow/scripts/dts/brain/prc_annot.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import os 4 | import argparse 5 | 6 | 7 | # Init args 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('-a', '--path_rannot', required=True) 10 | parser.add_argument('-b','--samples', required=True, nargs='+') 11 | parser.add_argument('-c', '--path_annot', required=True) 12 | args = vars(parser.parse_args()) 13 | 14 | path_rannot = args['path_rannot'] 15 | samples = args['samples'] 16 | path_annot = args['path_annot'] 17 | 18 | annot = pd.read_csv(path_rannot) 19 | annot = annot[annot['batch'].isin(samples)] 20 | annot['barcode'] = annot['batch'] + '_' + annot['barcode'] 21 | 
annot = annot.set_index('barcode', drop=True) 22 | annot.to_csv(path_annot, header=True) 23 | -------------------------------------------------------------------------------- /workflow/scripts/dts/fakepair/coembedd.R: -------------------------------------------------------------------------------- 1 | library(Signac) 2 | library(EnsDb.Hsapiens.v86) 3 | library(ggplot2) 4 | library(cowplot) 5 | library(dplyr) 6 | library(Seurat) 7 | library(SingleCellExperiment) 8 | library(rhdf5) 9 | 10 | 11 | # Parse args 12 | args <- commandArgs(trailingOnly = F) 13 | path_gex <- args[6] 14 | path_peaks <- args[7] 15 | path_frags <- args[8] 16 | path_cca_out <- args[9] 17 | 18 | 19 | # Load RNA and ATAC seq matrix 20 | 21 | # Process RNA 22 | rna <- Read10X_h5(path_gex)[[1]] 23 | data.rna <- CreateSeuratObject(counts = rna, project = "RNA", assay = "RNA") 24 | data.rna <- NormalizeData(data.rna) 25 | data.rna <- FindVariableFeatures(data.rna) 26 | data.rna <- ScaleData(data.rna) 27 | 28 | # Process ATAC 29 | indata <- H5Fopen(path_peaks, flags='H5F_ACC_RDONLY') 30 | indices <- indata$X$indices 31 | indptr <- indata$X$indptr 32 | data <- as.numeric(indata$X$data) 33 | atac <- Matrix::sparseMatrix(i=indices, p=indptr, x=data, index1 = FALSE) 34 | colnames(atac) <- indata$obs$`_index` 35 | rownames(atac) <- indata$var$`_index` 36 | h5closeAll() 37 | grange.counts <- StringToGRanges(rownames(atac), sep = c(":", "-")) 38 | grange.use <- seqnames(grange.counts) %in% standardChromosomes(grange.counts) 39 | atac <- atac[as.vector(grange.use), ] 40 | annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86) 41 | seqlevelsStyle(annotations) <- 'UCSC' 42 | genome(annotations) <- "hg38" 43 | chrom_assay <- CreateChromatinAssay( 44 | counts = atac, 45 | sep = c(":", "-"), 46 | genome = 'hg38', 47 | fragments = path_frags, 48 | annotation = annotations 49 | ) 50 | data.atac <- CreateSeuratObject(counts = chrom_assay, assay = "ATAC", project = "ATAC") 51 | data.atac <- RunTFIDF(data.atac) 52 | data.atac <- FindTopFeatures(data.atac, min.cutoff = "q0") 53 | data.atac <- ScaleData(data.atac) 54 | 55 | # Infer gene scores 56 | gene.activities <- GeneActivity(data.atac, features = VariableFeatures(data.rna)) 57 | data.atac[["ACTIVITY"]] <- CreateAssayObject(counts = gene.activities) 58 | DefaultAssay(data.atac) <- "ACTIVITY" 59 | data.atac <- NormalizeData(data.atac) 60 | data.atac <- ScaleData(data.atac, features = rownames(data.atac)) 61 | data.atac <- FindVariableFeatures(data.atac) 62 | 63 | # Run CCA 64 | data.cca <- RunCCA( 65 | data.rna, 66 | data.atac, 67 | assay1 = "RNA", 68 | assay2 = "ACTIVITY", 69 | num.cc = 50 70 | ) 71 | 72 | CCA_PCs <- Embeddings(data.cca, reduction = "cca") 73 | saveRDS(CCA_PCs, file = path_cca_out) 74 | -------------------------------------------------------------------------------- /workflow/scripts/dts/fakepair/fakepair.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import mudata as mu 3 | import argparse 4 | import sys 5 | import os 6 | 7 | 8 | # Init args 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('-m','--path_mdata', required=True) 11 | parser.add_argument('-b','--path_barmap', required=True) 12 | parser.add_argument('-o','--path_output', required=True) 13 | args = vars(parser.parse_args()) 14 | 15 | path_mdata = args['path_mdata'] 16 | path_barmap = args['path_barmap'] 17 | path_output = args['path_output'] 18 | 19 | # Read 20 | mdata = mu.read('dts/pitupair/annotated.h5mu') 21 | barmap = 
pd.read_csv('dts/fakepitupair/barmap.csv') 22 | 23 | # Format RNA barmap 24 | barmap.loc[:, 'RNA'] = ['smpl_' + b.replace('-1', '') for b in barmap['RNA']] 25 | 26 | # Make sure intersection of all 27 | inter = set(barmap['RNA']) & set(barmap['ATAC']) & set(mdata.obs_names) 28 | msk = barmap['ATAC'].isin(inter) & barmap['RNA'].isin(inter) 29 | barmap = barmap.loc[msk, :].reset_index(drop=True) 30 | mdata = mdata[list(inter), :].copy() 31 | 32 | # Create new fake object 33 | fmdata = mdata[barmap['ATAC'], :].copy() 34 | 35 | # Populate with predicted RNA 36 | fmdata.mod['rna'].X = mdata.mod['rna'][barmap['RNA'].values, :].X 37 | 38 | # Update metadata 39 | obs = barmap.set_index('ATAC') 40 | obs.index.name = None 41 | fmdata.obs = obs 42 | 43 | # Write 44 | fmdata.write(path_output) 45 | -------------------------------------------------------------------------------- /workflow/scripts/dts/fakepair/paircells.R: -------------------------------------------------------------------------------- 1 | library(doParallel) 2 | library(FigR) 3 | library(BSgenome.Hsapiens.UCSC.hg38) 4 | library(SingleCellExperiment) 5 | options("optmatch_max_problem_size" = Inf) 6 | optmatch::setMaxProblemSize(size = Inf) 7 | 8 | 9 | # Parse args 10 | args <- commandArgs(trailingOnly = F) 11 | path_cca <- args[6] 12 | path_ctypes <- args[7] 13 | path_barMap_out <- args[8] 14 | 15 | 16 | # Load Data 17 | CCA_PCs <- readRDS(path_cca) 18 | isATAC <- grepl("^smpl_",rownames(CCA_PCs)) 19 | ATAC_PCs <- CCA_PCs[isATAC,] 20 | RNA_PCs <- CCA_PCs[!isATAC,] 21 | 22 | # Pair with FigR 23 | pairing <- pairCells( 24 | ATAC = ATAC_PCs, 25 | RNA = RNA_PCs, 26 | keepUnique = TRUE 27 | ) 28 | 29 | # Filter paired object 30 | #euc.dist <- function(x1, x2) sqrt(sum((x1 - x2) ^ 2)) 31 | #pairing$dist <- apply(pairing, 1, function(x) { euc.dist(ATAC_PCs[x[1],1:ncol(ATAC_PCs)],RNA_PCs[x[2],1:ncol(RNA_PCs)])}) 32 | pairing <- pairing[order(pairing$dist, decreasing = FALSE), ] 33 | pairing <- pairing[!duplicated(pairing$ATAC),] 34 | #atac_pairing <- pairing[!duplicated(pairing$ATAC),] 35 | #rna_pairing <- pairing[!duplicated(pairing$RNA),] 36 | #pairing <- merge(atac_pairing, rna_pairing) 37 | 38 | # Merge ctype info 39 | ctypes <- read.csv(path_ctypes) 40 | pairing <- merge(pairing, ctypes, by.x='ATAC', by.y='barcode') 41 | pairing['batch'] <- 'smpl' 42 | pairing <- pairing[, c('ATAC', 'RNA', 'batch', 'celltype', 'dist')] 43 | 44 | # Write 45 | write.csv(pairing, path_barMap_out, row.names = FALSE) -------------------------------------------------------------------------------- /workflow/scripts/dts/format_frags.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FILE_PATH=$1 4 | SAMPLE_NAME=$(basename "$FILE_PATH" .frags.tsv.gz) 5 | echo 'File path: ' $FILE_PATH 6 | echo 'Sample id: ' $SAMPLE_NAME 7 | 8 | # Process, modify, compress to bgzip format, and index 9 | zcat "$FILE_PATH" | \ 10 | awk -v sample="$SAMPLE_NAME" '$0 !~ /^#/ {print $1"\t"$2"\t"$3"\t"sample"_"($4 ~ /-1$/ ? 
substr($4, 1, length($4)-2) : $4)"\t"$5}' | \ 11 | bgzip > "${FILE_PATH}_modified.frags.tsv.bgz" 12 | 13 | # Index the bgzipped file with tabix 14 | tabix -p bed "${FILE_PATH}_modified.frags.tsv.bgz" 15 | 16 | # (Optional) Replace original file with the new bgzipped file 17 | mv "${FILE_PATH}_modified.frags.tsv.bgz" "$FILE_PATH" 18 | mv "${FILE_PATH}_modified.frags.tsv.bgz.tbi" "$FILE_PATH.tbi" 19 | -------------------------------------------------------------------------------- /workflow/scripts/dts/heartatlas/heart_annot.py: -------------------------------------------------------------------------------- 1 | import scanpy as sc 2 | import anndata as ad 3 | import pandas as pd 4 | import numpy as np 5 | import argparse 6 | 7 | # Init args 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('-i','--path_atac', required=True) 10 | parser.add_argument('-o','--path_annot', required=True) 11 | args = vars(parser.parse_args()) 12 | 13 | 14 | path_atac = args['path_atac'] 15 | path_annot = args['path_annot'] 16 | 17 | # Get obs 18 | atac = ad.read_h5ad(path_atac).obs 19 | atac = atac[atac['region'] == 'LV'][['combinedID', 'cell_type']].copy() 20 | atac[['sangerID', 'batch']] = atac['combinedID'].str.split('_', expand=True) 21 | atac.index = [b + '_' + i.split('-')[0].split('_')[-1] for i, b in zip(atac.index, atac['batch'])] 22 | atac = atac.rename(columns={'cell_type': 'celltype', 'sangerID': 'sangerid'}) 23 | atac = atac[['celltype', 'batch', 'sangerid']] 24 | ctype_counts = atac.groupby('celltype', as_index=False).size() 25 | ctypes = ctype_counts[ctype_counts['size'] >= 100]['celltype'].values.astype(str) 26 | atac = atac[atac['celltype'].isin(ctypes)] 27 | 28 | # Write 29 | atac.to_csv(path_annot) -------------------------------------------------------------------------------- /workflow/scripts/dts/heartatlas/heartatlas.py: -------------------------------------------------------------------------------- 1 | import os 2 | import scanpy as sc 3 | from pathlib import Path 4 | import pandas as pd 5 | import numpy as np 6 | import anndata as ad 7 | import mudata as md 8 | import argparse 9 | 10 | 11 | # Init args 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('-a','--path_gex', required=True) 14 | parser.add_argument('-b','--path_peaks', required=True) 15 | parser.add_argument('-c','--path_annot', required=True) 16 | parser.add_argument('-e','--path_geneids', required=True) 17 | parser.add_argument('-f','--path_output', required=True) 18 | args = vars(parser.parse_args()) 19 | 20 | path_gex = args['path_gex'] 21 | path_peaks = args['path_peaks'] 22 | path_annot = args['path_annot'] 23 | path_geneids = args['path_geneids'] 24 | path_output = args['path_output'] 25 | 26 | # Read annots 27 | obs = pd.read_csv(path_annot, index_col=0) 28 | sngr_dict = {a: b for a, b in zip(obs['sangerid'], obs['batch'])} 29 | 30 | # Read gene ids 31 | geneids = pd.read_csv(path_geneids).set_index('id')['symbol'].to_dict() 32 | 33 | # Read rna 34 | rna = sc.read_h5ad(path_gex) 35 | rna = rna[rna.obs['sangerID'].isin(sngr_dict.keys()), :].copy() 36 | rna.obs_names = [sngr_dict[s] + '_' + i.replace('-1', '').split('_')[-1] for i, s in zip(rna.obs_names, rna.obs['sangerID'])] 37 | rna.obs = rna.obs[['cell_type']] 38 | #rna.var_names = rna.var['gene_name-new'].astype(str).values 39 | 40 | # Filter faulty gene symbols 41 | msk = rna.var_names.isin(geneids) 42 | rna = rna[:, msk].copy() 43 | msk = np.array([True if geneids[e] == g else False for e, g in zip(rna.var_names, 
rna.var['gene_name-new'])]) 44 | rna = rna[:, msk].copy() 45 | 46 | # Basic QC 47 | sc.pp.filter_cells(rna, min_genes=100) 48 | sc.pp.filter_genes(rna, min_cells=3) 49 | del rna.obs['n_genes'] 50 | 51 | # Remove duplicated genes based on num of cells 52 | to_remove = [] 53 | for dup in rna.var['gene_name-new'].values[rna.var['gene_name-new'].duplicated()]: 54 | tmp = rna.var[rna.var['gene_name-new'] == dup] 55 | max_idx = tmp['n_cells'].idxmax() 56 | to_remove.extend(tmp.index[tmp.index != max_idx].values) 57 | rna = rna[:, ~rna.var_names.isin(to_remove)].copy() 58 | 59 | # Update gene names 60 | rna.var_names = [geneids[g] for g in rna.var_names] 61 | 62 | # Read atac data 63 | atac = ad.read_h5ad(path_peaks) 64 | rna = rna[atac.obs_names].copy() 65 | atac = atac[rna.obs_names].copy() 66 | obs = obs.loc[atac.obs_names] 67 | del rna.obs 68 | del rna.var 69 | del rna.uns 70 | del rna.obsm 71 | del rna.obsp 72 | 73 | # Create mdata 74 | mdata = md.MuData( 75 | {'rna': rna, 'atac': atac,}, 76 | obs=obs 77 | ) 78 | 79 | # Write 80 | mdata.write(path_output) 81 | -------------------------------------------------------------------------------- /workflow/scripts/dts/pbmc10k/pbmc10k.py: -------------------------------------------------------------------------------- 1 | import os 2 | import scanpy as sc 3 | import snapatac2 as snap 4 | from snapatac2.datasets import _datasets, datasets 5 | from pathlib import Path 6 | import pandas as pd 7 | import numpy as np 8 | import anndata as ad 9 | import mudata as md 10 | import argparse 11 | 12 | 13 | # Init args 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('-b','--path_annot', required=True) 16 | parser.add_argument('-c','--path_geneids', required=True) 17 | parser.add_argument('-e','--path_peaks', required=True) 18 | parser.add_argument('-f','--path_output', required=True) 19 | args = vars(parser.parse_args()) 20 | 21 | path_annot = args['path_annot'] 22 | path_geneids = args['path_geneids'] 23 | path_peaks = args['path_peaks'] 24 | path_output = args['path_output'] 25 | 26 | # Read gene ids 27 | geneids = pd.read_csv(path_geneids).set_index('symbol')['id'].to_dict() 28 | 29 | # Change default cache dir 30 | _datasets = datasets() 31 | _datasets.path = Path('/tmp/') 32 | 33 | # Download 34 | rna = snap.read(snap.datasets.pbmc10k_multiome(modality='RNA', type='h5ad'), backed=None) 35 | del rna.obs 36 | rna.var.index.name = None 37 | 38 | # Read annot 39 | obs = pd.read_csv(path_annot, index_col=0) 40 | 41 | # Add celltype annotation 42 | rna.obs_names = ['smpl_' + i.replace('-1', '') for i in rna.obs_names] 43 | rna = rna[obs.index, :].copy() 44 | rna.obs = obs 45 | 46 | # Filter faulty gene symbols 47 | ensmbls = np.array([geneids[g] if g in geneids else '' for g in rna.var_names]) 48 | msk = ensmbls != '' 49 | rna = rna[:, msk].copy() 50 | # Basic QC 51 | sc.pp.filter_cells(rna, min_genes=100) 52 | sc.pp.filter_genes(rna, min_cells=3) 53 | del rna.obs['n_genes'] 54 | # Remove duplicated genes based on num of cells 55 | to_remove = [] 56 | for dup in rna.var.index[rna.var.index.duplicated()]: 57 | tmp = rna.var.loc[dup] 58 | max_idx = tmp.set_index('gene_ids')['n_cells'].idxmax() 59 | to_remove.extend(tmp['gene_ids'][tmp['gene_ids'] != max_idx].values) 60 | rna = rna[:, ~rna.var['gene_ids'].isin(to_remove)].copy() 61 | del rna.obs 62 | del rna.var 63 | 64 | # Read atac data 65 | atac = ad.read_h5ad(path_peaks) 66 | atac = atac[rna.obs_names].copy() 67 | 68 | # Create mdata 69 | mdata = md.MuData( 70 | {'rna': rna, 'atac': atac,}, 71 
| obs=obs 72 | ) 73 | 74 | # Write 75 | mdata.write(path_output) 76 | -------------------------------------------------------------------------------- /workflow/scripts/dts/pbmc10k/prc_annot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import snapatac2 as snap 3 | from snapatac2.datasets import _datasets, datasets 4 | from pathlib import Path 5 | import pandas as pd 6 | import argparse 7 | 8 | 9 | # Init args 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('-a','--path_annot', required=True) 12 | args = vars(parser.parse_args()) 13 | 14 | path_annot = args['path_annot'] 15 | 16 | # Change default cache dir 17 | _datasets = datasets() 18 | _datasets.path = Path('/tmp/') 19 | 20 | # Download 21 | rna = snap.read(snap.datasets.pbmc10k_multiome(modality='RNA', type='h5ad'), backed=None) 22 | 23 | # Extract annot 24 | rna.obs['batch'] = 'smpl' 25 | annot = rna.obs.rename(columns={'cell_type': 'celltype'})[['batch', 'celltype']] 26 | annot.index.name = None 27 | annot.index = ['smpl_' + i.replace('-1', '') for i in annot.index] 28 | 29 | # Write 30 | annot.to_csv(path_annot) 31 | -------------------------------------------------------------------------------- /workflow/scripts/dts/pitunpair/coembedd.R: -------------------------------------------------------------------------------- 1 | library(Signac) 2 | library(EnsDb.Hsapiens.v86) 3 | library(ggplot2) 4 | library(cowplot) 5 | library(dplyr) 6 | library(Seurat) 7 | 8 | 9 | # Parse args 10 | args <- commandArgs(trailingOnly = F) 11 | path_gex <- args[6] 12 | path_celltypes <- args[7] 13 | path_peaks <- args[8] 14 | path_frags <- args[9] 15 | path_cca_out <- args[10] 16 | 17 | 18 | # RNA 19 | rna <- Read10X_h5(path_gex) 20 | data.rna <- CreateSeuratObject(counts = rna, project = "RNA", assay = "RNA") 21 | celltypes <- read.csv(path_celltypes) 22 | cells_to_remove <- Cells(data.rna)[!Cells(data.rna) %in% celltypes$X] 23 | data.rna <- subset(data.rna, cells = setdiff(Cells(data.rna), cells_to_remove)) 24 | data.rna <- NormalizeData(data.rna) 25 | data.rna <- FindVariableFeatures(data.rna) 26 | data.rna <- ScaleData(data.rna) 27 | 28 | # ATAC 29 | atac <- Read10X_h5(path_peaks) 30 | grange.counts <- StringToGRanges(rownames(atac), sep = c(":", "-")) 31 | grange.use <- seqnames(grange.counts) %in% standardChromosomes(grange.counts) 32 | atac <- atac[as.vector(grange.use), ] 33 | annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86) 34 | seqlevelsStyle(annotations) <- 'UCSC' 35 | genome(annotations) <- "hg38" 36 | colnames(atac) <- gsub("-[0-9]+$", "", colnames(atac)) 37 | colnames(atac) <- paste0("smpl_", colnames(atac)) 38 | chrom_assay <- CreateChromatinAssay( 39 | counts = atac, 40 | sep = c(":", "-"), 41 | genome = 'hg38', 42 | fragments = path_frags, 43 | min.cells = 10, 44 | annotation = annotations 45 | ) 46 | data.atac <- CreateSeuratObject(counts = chrom_assay, assay = "ATAC", project = "ATAC") 47 | data.atac <- RunTFIDF(data.atac) 48 | data.atac <- FindTopFeatures(data.atac, min.cutoff = "q0") 49 | data.atac <- ScaleData(data.atac) 50 | 51 | # Infer gene scores 52 | gene.activities <- GeneActivity(data.atac, features = VariableFeatures(data.rna)) 53 | data.atac[["ACTIVITY"]] <- CreateAssayObject(counts = gene.activities) 54 | DefaultAssay(data.atac) <- "ACTIVITY" 55 | data.atac <- NormalizeData(data.atac) 56 | data.atac <- ScaleData(data.atac, features = rownames(data.atac)) 57 | data.atac <- FindVariableFeatures(data.atac) 58 | 59 | # Run CCA 60 | data.cca 
<- RunCCA( 61 | data.rna, 62 | data.atac, 63 | assay1 = "RNA", 64 | assay2 = "ACTIVITY", 65 | num.cc = 50 66 | ) 67 | 68 | CCA_PCs <- Embeddings(data.cca, reduction = "cca") 69 | saveRDS(CCA_PCs, file = path_cca_out) 70 | 71 | -------------------------------------------------------------------------------- /workflow/scripts/dts/pitunpair/paircells.R: -------------------------------------------------------------------------------- 1 | library(doParallel) 2 | library(FigR) 3 | library(BSgenome.Hsapiens.UCSC.hg38) 4 | library(SingleCellExperiment) 5 | options("optmatch_max_problem_size" = Inf) 6 | optmatch::setMaxProblemSize(size = Inf) 7 | 8 | 9 | # Parse args 10 | args <- commandArgs(trailingOnly = F) 11 | path_cca <- args[6] 12 | path_ctypes <- args[7] 13 | path_barMap_out <- args[8] 14 | 15 | 16 | # Load Data 17 | CCA_PCs <- readRDS(path_cca) 18 | isATAC <- grepl("^smpl_",rownames(CCA_PCs)) 19 | ATAC_PCs <- CCA_PCs[isATAC,] 20 | RNA_PCs <- CCA_PCs[!isATAC,] 21 | 22 | # Pair with FigR 23 | pairing <- pairCells( 24 | ATAC = ATAC_PCs, 25 | RNA = RNA_PCs, 26 | keepUnique = TRUE 27 | ) 28 | 29 | # Filter paired object 30 | pairing <- pairing[order(pairing$dist, decreasing = FALSE), ] 31 | pairing <- pairing[!duplicated(pairing$ATAC),] 32 | 33 | # Merge ctype info 34 | ctypes <- read.csv(path_ctypes) 35 | pairing <- merge(pairing, ctypes, by.x='RNA', by.y='X') 36 | pairing['batch'] <- 'smpl' 37 | pairing <- pairing[, c('ATAC', 'RNA', 'batch', 'celltype', 'dist')] 38 | rownames(pairing) <- pairing$ATAC 39 | 40 | # Write 41 | write.csv(pairing, path_barMap_out, row.names = FALSE) 42 | -------------------------------------------------------------------------------- /workflow/scripts/dts/pitunpair/pitunpair.py: -------------------------------------------------------------------------------- 1 | import os 2 | import scanpy as sc 3 | from pathlib import Path 4 | import pandas as pd 5 | import numpy as np 6 | import anndata as ad 7 | import mudata as md 8 | import argparse 9 | 10 | 11 | # Init args 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('-c','--path_geneids', required=True) 14 | parser.add_argument('-e','--path_peaks', required=True) 15 | parser.add_argument('-f','--path_output', required=True) 16 | parser.add_argument('-g','--path_expr', required=True) 17 | parser.add_argument('-i', '--path_barmap', required=True) 18 | args = vars(parser.parse_args()) 19 | 20 | path_barmap = args['path_barmap'] 21 | path_geneids = args['path_geneids'] 22 | path_peaks = args['path_peaks'] 23 | path_output = args['path_output'] 24 | path_expr = args['path_expr'] 25 | 26 | # Read gene ids 27 | geneids = pd.read_csv(path_geneids).set_index('symbol')['id'].to_dict() 28 | 29 | # Read barmap 30 | barmap = pd.read_csv(path_barmap, index_col=0) 31 | barmap.index.name = None 32 | 33 | # Read data 34 | rna = sc.read_10x_h5(path_expr, genome="GRCh38") 35 | del rna.obs 36 | rna.var.index.name = None 37 | 38 | # Filter RNA data based on barmap 39 | rna = rna[barmap['RNA'].values, :] 40 | print(barmap) 41 | rna.obs_names = barmap.index 42 | 43 | # Filter faulty gene symbols 44 | ensmbls = np.array([geneids[g] if g in geneids else '' for g in rna.var_names]) 45 | msk = ensmbls != '' 46 | rna = rna[:, msk].copy() 47 | 48 | # Basic QC 49 | sc.pp.filter_cells(rna, min_genes=100) 50 | sc.pp.filter_genes(rna, min_cells=3) 51 | del rna.obs['n_genes'] 52 | 53 | # Remove duplicated genes based on num of cells 54 | to_remove = [] 55 | for dup in rna.var.index[rna.var.index.duplicated()]: 56 | tmp = rna.var.loc[dup] 
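# Among entries sharing a duplicated symbol, keep the Ensembl id ('gene_ids') detected in the most cells and queue the rest for removal.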
57 | max_idx = tmp.set_index('gene_ids')['n_cells'].idxmax() 58 | to_remove.extend(tmp['gene_ids'][tmp['gene_ids'] != max_idx].values) 59 | rna = rna[:, ~rna.var['gene_ids'].isin(to_remove)].copy() 60 | del rna.var 61 | 62 | # Read atac data 63 | atac = ad.read_h5ad(path_peaks) 64 | 65 | # Filter ATAC data based on barmap and RNA 66 | atac = atac[rna.obs_names, :] 67 | 68 | # Create mdata 69 | mdata = md.MuData( 70 | {'rna': rna, 'atac': atac,}, 71 | obs=barmap 72 | ) 73 | 74 | # Write 75 | mdata.write(path_output) 76 | -------------------------------------------------------------------------------- /workflow/scripts/dts/pitupair/pitupair.py: -------------------------------------------------------------------------------- 1 | import os 2 | import scanpy as sc 3 | import snapatac2 as snap 4 | from snapatac2.datasets import _datasets, datasets 5 | from pathlib import Path 6 | import pandas as pd 7 | import numpy as np 8 | import anndata as ad 9 | import mudata as md 10 | import argparse 11 | 12 | 13 | # Init args 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('-b','--path_annot', required=True) 16 | parser.add_argument('-c','--path_geneids', required=True) 17 | parser.add_argument('-e','--path_peaks', required=True) 18 | parser.add_argument('-f','--path_output', required=True) 19 | parser.add_argument('-g','--path_multi', required=True) 20 | args = vars(parser.parse_args()) 21 | 22 | path_annot = args['path_annot'] 23 | path_geneids = args['path_geneids'] 24 | path_peaks = args['path_peaks'] 25 | path_output = args['path_output'] 26 | path_multi = args['path_multi'] 27 | 28 | # Read gene ids 29 | geneids = pd.read_csv(path_geneids).set_index('symbol')['id'].to_dict() 30 | 31 | # Read annots 32 | obs = pd.read_csv(path_annot, index_col=0) 33 | 34 | # Read data 35 | rna = sc.read_10x_h5(path_multi, genome="GRCh38", gex_only=True) 36 | del rna.obs 37 | rna.var.index.name = None 38 | 39 | # Rename barcodes RNA 40 | sample_id = 'smpl' 41 | rna.obs_names = [sample_id + '_' + o.split('-1')[0] for o in rna.obs_names] 42 | 43 | # Filter faulty gene symbols 44 | ensmbls = np.array([geneids[g] if g in geneids else '' for g in rna.var_names]) 45 | msk = ensmbls != '' 46 | rna = rna[:, msk].copy() 47 | 48 | # Basic QC 49 | sc.pp.filter_cells(rna, min_genes=100) 50 | sc.pp.filter_genes(rna, min_cells=3) 51 | del rna.obs['n_genes'] 52 | 53 | # Remove duplicated genes based on num of cells 54 | to_remove = [] 55 | for dup in rna.var.index[rna.var.index.duplicated()]: 56 | tmp = rna.var.loc[dup] 57 | max_idx = tmp.set_index('gene_ids')['n_cells'].idxmax() 58 | to_remove.extend(tmp['gene_ids'][tmp['gene_ids'] != max_idx].values) 59 | rna = rna[:, ~rna.var['gene_ids'].isin(to_remove)].copy() 60 | del rna.var 61 | del rna.obs 62 | 63 | # Read atac data 64 | atac = ad.read_h5ad(path_peaks) 65 | 66 | # Filter 67 | rna = rna[atac.obs_names, :].copy() 68 | obs = obs.loc[atac.obs_names, :] 69 | 70 | # Create mdata 71 | mdata = md.MuData( 72 | {'rna': rna, 'atac': atac,}, 73 | obs=obs 74 | ) 75 | 76 | # Write 77 | mdata.write(path_output) 78 | -------------------------------------------------------------------------------- /workflow/scripts/dts/reprofibro/prc_annot.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import zipfile 4 | import argparse 5 | 6 | 7 | # Init args 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('-a','--path_annot', required=True) 10 | args = vars(parser.parse_args()) 11 | 12 
| path_annot = args['path_annot'] 13 | 14 | archive = zipfile.ZipFile(path_annot, 'r') 15 | obs = pd.read_csv(archive.open('multiome/snATAC/cells.tsv'), sep='\t') 16 | obs = obs[['barcode', 'sample', 'cluster']].set_index('barcode').rename(columns={'sample': 'batch', 'cluster': 'celltype'}) 17 | obs.index.name = None 18 | obs = obs[obs['batch'] != 'D2'] 19 | annot = { 20 | 1: 'Fibroblast', 21 | 2: 'Fibroblast-like', 22 | 3: 'Fibroblast-like', 23 | 4: 'Fibroblast-like', 24 | 5: 'Fibroblast-like', 25 | 6: 'Keratinocyte-like', 26 | 7: 'hOSK', 27 | 8: 'xOSK', 28 | 9: 'Intermediate', 29 | 10: 'Partially-reprogrammed', 30 | 11: 'Intermediate', 31 | 12: 'Intermediate', 32 | 13: 'Pre-iPSC', 33 | 14: 'Pre-iPSC', 34 | 15: 'iPSC', 35 | } 36 | obs['celltype'] = [annot[c] for c in obs['celltype']] 37 | obs.to_csv(path_annot) 38 | -------------------------------------------------------------------------------- /workflow/scripts/mth/celloracle/mdl.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt # Import else compiling error 2 | import numpy as np 3 | import pandas as pd 4 | import muon as mu 5 | import celloracle as co 6 | import os 7 | import argparse 8 | 9 | 10 | # Init args 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('-m','--path_mdata', required=True) 13 | parser.add_argument('-g','--path_p2g', required=True) 14 | parser.add_argument('-t','--path_tfb', required=True) 15 | parser.add_argument('-a','--alpha', required=True) 16 | parser.add_argument('-p','--pthr', required=True) 17 | parser.add_argument('-n','--top_n', required=True) 18 | parser.add_argument('-o','--path_out', required=True) 19 | args = vars(parser.parse_args()) 20 | 21 | path_mdata = args['path_mdata'] 22 | path_p2g = args['path_p2g'] 23 | path_tfb = args['path_tfb'] 24 | alpha = float(args['alpha']) 25 | pthr = float(args['pthr']) 26 | top_n = int(args['top_n']) 27 | path_out = args['path_out'] 28 | 29 | # Process base GRN 30 | p2g = pd.read_csv(path_p2g) 31 | tfb = pd.read_csv(path_tfb) 32 | if (p2g.shape[0] == 0) or (tfb.shape[0] == 0): 33 | grn = pd.DataFrame(columns=['source', 'target', 'score', 'pval']) 34 | grn.to_csv(path_out, index=False) 35 | exit() 36 | tfb['score'] = 1 37 | p2g = p2g[['cre', 'gene']] 38 | base_grn = pd.merge( 39 | p2g, 40 | tfb 41 | .pivot(index='cre', columns='tf') 42 | .fillna(0) 43 | .droplevel(0, axis=1) 44 | .reset_index() 45 | ) 46 | base_grn = base_grn.rename(columns={'cre': 'peak_id', 'gene': 'gene_short_name'}) 47 | base_grn['peak_id'] = base_grn['peak_id'].str.replace('-', '_') 48 | 49 | # Init oracle object 50 | oracle = co.Oracle() 51 | oracle.adata = mu.read(path_mdata)['rna'].copy() 52 | oracle.adata.obsm['X_umap'] = np.zeros((oracle.adata.shape[0], 2)) 53 | oracle.adata.layers['imputed_count'] = oracle.adata.X 54 | oracle.adata.obs['cluster'] = 'cluster' 55 | oracle.cluster_column_name = 'cluster' 56 | oracle.embedding_name = 'X_umap' 57 | oracle.pcs = np.zeros((oracle.adata.shape[0], 2)) 58 | oracle.knn = True 59 | oracle.k_knn_imputation = True 60 | oracle.import_TF_data(TF_info_matrix=base_grn) 61 | 62 | # Model TF ~ G 63 | print('Modeling GRN...') 64 | links = oracle.get_links( 65 | cluster_name_for_GRN_unit="cluster", 66 | alpha=alpha, 67 | n_jobs=32, 68 | ) 69 | print('Modeling Done!') 70 | print('Filtering links...') 71 | links.filter_links( 72 | p=pthr, 73 | weight="coef_abs", 74 | threshold_number=top_n 75 | ) 76 | print('Filtering done!') 77 | 78 | # Extract grn 79 | grn = 
links.filtered_links['cluster'].dropna()[['source', 'target', 'coef_mean', 'p']] 80 | grn = grn.rename(columns={'coef_mean': 'score', 'p': 'pval'}) 81 | grn = grn.sort_values(['source', 'target', 'pval']) 82 | 83 | # Write 84 | grn.to_csv(path_out, index=False) 85 | 86 | print('Done') 87 | os._exit(0) # Add this else it gets stuck -------------------------------------------------------------------------------- /workflow/scripts/mth/celloracle/p2g.R: -------------------------------------------------------------------------------- 1 | library(cicero) 2 | library(rhdf5) 3 | 4 | 5 | # Parse args 6 | args <- commandArgs(trailingOnly = F) 7 | path_data <- args[6] 8 | path_genome <- args[7] 9 | ext <- as.numeric(args[8]) 10 | path_all_peaks <- args[9] 11 | path_connections <- args[10] 12 | 13 | # Read genome 14 | org <- sub('^dbs/([^/]+)/.*$', '\\1', path_genome) 15 | path_chr_sizes <- file.path(path_genome, org, sprintf('%s.fa.sizes', org)) 16 | genome <- read.table(path_chr_sizes) 17 | 18 | # Process mudata 19 | indata <- H5Fopen(path_data, flags='H5F_ACC_RDONLY') 20 | data <- indata$mod$atac$X 21 | barcodes <- indata$mod$atac$obs$`_index` 22 | peaks <- indata$mod$atac$var$`_index` 23 | h5closeAll() 24 | 25 | # Format cell info 26 | cellinfo <- data.frame(row.names=barcodes, cells=barcodes) 27 | 28 | # Format peak info 29 | peakinfo <- data.frame(row.names=peaks, site_name=peaks) 30 | peakinfo <- tidyr::separate(data = peakinfo, col = 'site_name', into = c("chr", "bp1", "bp2"), sep = "-", remove=FALSE) 31 | 32 | # Add names 33 | row.names(data) <- row.names(peakinfo) 34 | colnames(data) <- row.names(cellinfo) 35 | 36 | # Make CDS 37 | input_cds <- suppressWarnings( 38 | new_cell_data_set(data, 39 | cell_metadata = cellinfo, 40 | gene_metadata = peakinfo) 41 | ) 42 | 43 | # Data preprocessing 44 | set.seed(2017) 45 | 46 | # Run cicero 47 | print("Starting Cicero") 48 | print("Calculating distance_parameter value") 49 | distance_parameters <- estimate_distance_parameter( 50 | input_cds, 51 | window=ext, 52 | maxit=100, 53 | sample_num = 100, 54 | distance_constraint = round(ext / 2), 55 | distance_parameter_convergence = 1e-22, 56 | genomic_coords = genome 57 | ) 58 | mean_distance_parameter <- mean(unlist(distance_parameters)) 59 | print("Running models") 60 | cicero_out <- generate_cicero_models( 61 | input_cds, 62 | distance_parameter = mean_distance_parameter, 63 | window = ext, 64 | genomic_coords = genome 65 | ) 66 | print("Assembling connections") 67 | conns <- assemble_connections(cicero_out, silent=FALSE) 68 | 69 | # Save 70 | all_peaks <- row.names(exprs(input_cds)) 71 | write.csv(x = all_peaks, file = file.path(path_all_peaks)) 72 | write.csv(x = conns, file = file.path(path_connections)) 73 | -------------------------------------------------------------------------------- /workflow/scripts/mth/celloracle/p2g.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt # Import else compiling error 2 | import pandas as pd 3 | import numpy as np 4 | from celloracle import motif_analysis as ma 5 | import celloracle as co 6 | import mudata as mu 7 | import os 8 | import re 9 | import argparse 10 | 11 | 12 | # Init args 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('-d','--path_data', required=True) 15 | parser.add_argument('-a','--all_peaks', required=True) 16 | parser.add_argument('-c','--connections', required=True) 17 | parser.add_argument('-o','--organism', required=True) 18 | 
parser.add_argument('-t','--thr', required=True) 19 | parser.add_argument('-p','--path_out', required=True) 20 | args = vars(parser.parse_args()) 21 | 22 | path_data = args['path_data'] 23 | path_all_peaks = args['all_peaks'] 24 | path_connections = args['connections'] 25 | organism = args['organism'] 26 | thr_coaccess = float(args['thr']) 27 | path_out = args['path_out'] 28 | 29 | # Process organism 30 | organism = re.search(r'^dbs/([^/]+)/.*$', organism).group(1) 31 | 32 | # Load scATAC-seq peak list 33 | peaks = pd.read_csv(path_all_peaks, index_col=0).x.values.astype('U') 34 | peaks = np.char.replace(peaks, '-', '_') 35 | 36 | # Load Cicero coaccessibility scores 37 | cicero_connections = pd.read_csv(path_connections, index_col=0) 38 | cicero_connections['Peak1'] = np.char.replace(cicero_connections['Peak1'].values.astype('U'), '-', '_') 39 | cicero_connections['Peak2'] = np.char.replace(cicero_connections['Peak2'].values.astype('U'), '-', '_') 40 | 41 | # Extract tss information 42 | tss_annotated = ma.get_tss_info( 43 | peak_str_list=peaks, 44 | ref_genome=organism 45 | ) 46 | 47 | # Integrate 48 | integrated = ma.integrate_tss_peak_with_cicero( 49 | tss_peak=tss_annotated, 50 | cicero_connections=cicero_connections 51 | ) 52 | 53 | # Process 54 | integrated = integrated[integrated['coaccess'] >= thr_coaccess] 55 | integrated['peak_id'] = integrated['peak_id'].str.replace('_', '-') 56 | integrated = integrated.rename(columns={'peak_id': 'cre', 'gene_short_name': 'gene', 'coaccess': 'score'}) 57 | integrated = integrated.sort_values(['cre', 'score'], ascending=[True, False]) 58 | 59 | # Remove unexpressed genes 60 | genes = mu.read(os.path.join(path_data, 'rna')).var.index.values.astype('U') 61 | integrated = integrated[integrated['gene'].isin(genes)] 62 | 63 | # Write 64 | integrated.to_csv(path_out, index=False) 65 | -------------------------------------------------------------------------------- /workflow/scripts/mth/celloracle/pre.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import pandas as pd 3 | import numpy as np 4 | import celloracle as co 5 | import muon as mu 6 | import scipy 7 | import os 8 | import argparse 9 | 10 | 11 | # Init args 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('-i','--path_input', required=True) 14 | parser.add_argument('-k','--knn', required=True) 15 | parser.add_argument('-o','--path_out', required=True) 16 | args = vars(parser.parse_args()) 17 | 18 | path_input = args['path_input'] 19 | k = int(args['knn']) 20 | path_out = args['path_out'] 21 | 22 | # Read rna adata 23 | mdata = mu.read(path_input) 24 | 25 | # Extract raw counts data and assign labels 26 | adata = mdata.mod['rna'].copy() 27 | adata.layers['lognorm'] = adata.X.copy() 28 | adata.X = adata.layers['counts'].copy() 29 | adata.obs['celltype'] = mdata.obs['celltype'] 30 | adata.obsm['X_pca'] = mdata.obsm['X_spectral'] 31 | 32 | # Instantiate Oracle object 33 | oracle = co.Oracle() 34 | oracle.import_anndata_as_raw_count( 35 | adata=adata, 36 | cluster_column_name="celltype", 37 | embedding_name="X_pca" 38 | ) 39 | 40 | # Compute PCA and select top pcs 41 | oracle.perform_PCA() 42 | n_comps = np.where(np.diff(np.diff(np.cumsum(oracle.pca.explained_variance_ratio_))>0.002))[0][0] 43 | n_comps = min(n_comps, 50) 44 | 45 | # Run imputation 46 | oracle.knn_imputation( 47 | n_pca_dims=n_comps, 48 | k=k, 49 | balanced=True, 50 | b_sight=k*8, 51 | b_maxl=k*4, 52 | n_jobs=os.cpu_count(), 53 | ) 54 | 55 
| # Update object with imputet counts 56 | mdata['rna'].X = oracle.adata.layers['imputed_count'] 57 | 58 | # Write 59 | mdata.write(path_out) 60 | -------------------------------------------------------------------------------- /workflow/scripts/mth/celloracle/src.R: -------------------------------------------------------------------------------- 1 | library(cicero) 2 | library(rhdf5) 3 | 4 | 5 | # Parse args 6 | args <- commandArgs(trailingOnly = F) 7 | path_data <- args[6] 8 | path_genome <- args[7] 9 | ext <- as.numeric(args[8]) 10 | path_all_peaks <- args[9] 11 | path_connections <- args[10] 12 | 13 | # Read genome 14 | org <- sub('^dbs/([^/]+)/.*$', '\\1', path_genome) 15 | path_chr_sizes <- file.path(path_genome, org, sprintf('%s.fa.sizes', org)) 16 | genome <- read.table(path_chr_sizes) 17 | 18 | # Process mudata 19 | indata <- H5Fopen(path_data, flags='H5F_ACC_RDONLY') 20 | data <- indata$mod$atac$X 21 | barcodes <- indata$mod$atac$obs$`_index` 22 | peaks <- indata$mod$atac$var$`_index` 23 | h5closeAll() 24 | 25 | # Format cell info 26 | cellinfo <- data.frame(row.names=barcodes, cells=barcodes) 27 | 28 | # Format peak info 29 | peakinfo <- data.frame(row.names=peaks, site_name=peaks) 30 | peakinfo <- tidyr::separate(data = peakinfo, col = 'site_name', into = c("chr", "bp1", "bp2"), sep = "-", remove=FALSE) 31 | 32 | # Add names 33 | row.names(data) <- row.names(peakinfo) 34 | colnames(data) <- row.names(cellinfo) 35 | 36 | # Binarize 37 | data[data != 0] <- 1 38 | 39 | # Make CDS 40 | input_cds <- suppressWarnings( 41 | new_cell_data_set(data, 42 | cell_metadata = cellinfo, 43 | gene_metadata = peakinfo) 44 | ) 45 | 46 | # Data preprocessing 47 | set.seed(2017) 48 | input_cds <- estimate_size_factors(input_cds) 49 | input_cds <- preprocess_cds(input_cds, method = "LSI") 50 | 51 | # Dimensional reduction with umap 52 | input_cds <- reduce_dimension( 53 | input_cds, 54 | reduction_method = 'UMAP', 55 | preprocess_method = "LSI" 56 | ) 57 | umap_coords <- reducedDims(input_cds)$UMAP 58 | 59 | # Build "metacells" 60 | cicero_cds <- make_cicero_cds(input_cds, reduced_coordinates = umap_coords) 61 | 62 | # Run cicero 63 | print("Starting Cicero") 64 | print("Calculating distance_parameter value") 65 | distance_parameters <- estimate_distance_parameter( 66 | input_cds, 67 | window=ext, 68 | maxit=100, 69 | sample_num = 100, 70 | distance_constraint = round(ext / 2), 71 | distance_parameter_convergence = 1e-22, 72 | genomic_coords = genome 73 | ) 74 | mean_distance_parameter <- mean(unlist(distance_parameters)) 75 | print("Running models") 76 | cicero_out <- generate_cicero_models( 77 | input_cds, 78 | distance_parameter = mean_distance_parameter, 79 | window = ext, 80 | genomic_coords = genome 81 | ) 82 | print("Assembling connections") 83 | conns <- assemble_connections(cicero_out, silent=FALSE) 84 | 85 | # Save 86 | all_peaks <- row.names(exprs(input_cds)) 87 | write.csv(x = all_peaks, file = file.path(path_all_peaks)) 88 | write.csv(x = conns, file = file.path(path_connections)) 89 | -------------------------------------------------------------------------------- /workflow/scripts/mth/dictys/before_mdl.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import mudata as mu 4 | import sys 5 | import os 6 | import dictys 7 | 8 | 9 | # Read and process gex 10 | rna_path = os.path.join(sys.argv[1], 'mod', 'rna') 11 | rna = mu.read(rna_path) 12 | rna.X = rna.layers['counts'] 13 | rna = rna.to_df().T 14 | 
rna.to_csv(sys.argv[2], sep='\t', compression='gzip') 15 | name_pre = sys.argv[1].split('/runs/')[1].split('.')[0] 16 | if 'dictys' not in name_pre: 17 | dictys.preproc.qc_reads(sys.argv[2], sys.argv[2], 50, 10, 0, 200, 100, 0) 18 | rna = pd.read_csv(sys.argv[2], header=0, index_col=0, sep='\t') 19 | 20 | # Read and process peaks 21 | use_peaks = bool(sys.argv[3]) 22 | if use_peaks: 23 | peaks = pd.read_csv(sys.argv[4])['cre'].unique() 24 | else: 25 | atac_path = os.path.join(sys.argv[1], 'mod', 'atac') 26 | peaks = mu.read(atac_path).var_names 27 | peaks = np.array([p.replace('-', ':') for p in peaks]) 28 | peaks = pd.DataFrame(np.zeros((peaks.size, 1)), index=peaks, columns=['placeholder']) 29 | peaks.to_csv(sys.argv[5], sep='\t', compression='gzip') 30 | 31 | # Read tfb 32 | tfb = pd.read_csv(sys.argv[6]) 33 | tfb['cre'] = tfb['cre'].str.replace('-', ':') 34 | tfb = tfb[tfb['tf'].isin(rna.index) & tfb['cre'].isin(peaks.index)] 35 | output_tfb = tfb.rename(columns={'cre': 'loc', 'tf': 'TF'})[['TF', 'loc', 'score']] 36 | output_tfb.to_csv(sys.argv[7], sep='\t', index=False) 37 | -------------------------------------------------------------------------------- /workflow/scripts/mth/dictys/extract_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import mudata as mu 4 | import os 5 | import argparse 6 | 7 | 8 | parser = argparse.ArgumentParser(description="", usage="") 9 | parser.add_argument('--pre_path', required=True) 10 | parser.add_argument('--p2g_path', required=True) 11 | parser.add_argument('--exp_path', required=True) 12 | parser.add_argument('--pks_path', required=True) 13 | parser.add_argument('--use_p2g' , required=True) 14 | 15 | args = vars(parser.parse_args()) 16 | pre_path = args['pre_path'] 17 | p2g_path = args['p2g_path'] 18 | exp_path = args['exp_path'] 19 | pks_path = args['pks_path'] 20 | use_p2g = args['use_p2g' ] 21 | 22 | 23 | # Write the RNA matrix 24 | pre_type = os.path.basename(pre_path).split('.')[0] 25 | data = mu.read(pre_path) 26 | rna_X = pd.DataFrame(np.array(data['rna'].layers['counts'].todense()).T, columns=data['rna'].obs.index, index=data['rna'].var.index) 27 | rna_X.to_csv(exp_path, sep="\t", compression="gzip") 28 | 29 | if use_p2g: 30 | # Read in p2g and keep only peaks that are wide enough for footprinting 31 | all_atac_peak = np.unique(pd.read_csv(p2g_path)['cre']) 32 | else: 33 | # From the consensus peak list, keep only peaks that are wide enough for footprinting 34 | all_atac_peak = np.unique([n.replace(':', '-') for n in data['atac'].var.index]) 35 | 36 | all_atac_peak = pd.DataFrame([n.split('-') for n in all_atac_peak]) 37 | all_atac_peak.columns = ['chr', 'srt', 'end'] 38 | all_atac_peak['srt'] = all_atac_peak['srt'].astype(int) 39 | all_atac_peak['end'] = all_atac_peak['end'].astype(int) 40 | all_atac_peak = all_atac_peak[(all_atac_peak.end - all_atac_peak.srt) >= 100] 41 | all_atac_peak = all_atac_peak.sort_values(by=['chr', 'srt', 'end']) 42 | all_atac_peak.to_csv(pks_path, sep='\t', header=False, index=False) 43 | 44 | # Store clusters 45 | clus = sorted(data.obs['celltype'].unique()) 46 | for c in clus: 47 | if pre_type == 'granie': 48 | ctype_ids = data['rna'].uns['rna_b_per_c'][c] 49 | else: 50 | ctype_ids = data[data.obs['celltype'] == c].obs.index 51 | c = c.replace(' ', '_') 52 | with open(os.path.join(os.path.dirname(exp_path), f'barcodes_{c}.txt'), "w") as f: 53 | for i in ctype_ids: 54 | f.write(f"{i}\n") 55 | 
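# Note: the loop above writes one barcodes_<celltype>.txt file per cluster next to the expression matrix (spaces in cluster names are replaced by underscores); these headerless, single-column barcode lists are the kind of input frag_to_bam.py reads via --barcodes.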
-------------------------------------------------------------------------------- /workflow/scripts/mth/dictys/frag_to_bam.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import argparse, os, sys 3 | import gzip 4 | 5 | 6 | parser = argparse.ArgumentParser(description="Splits fragment file by annotated cell clusters and builds .bam file", usage="") 7 | parser.add_argument('--fnames', required=True, nargs='+') 8 | parser.add_argument('--barcodes', required=True) 9 | 10 | args = vars(parser.parse_args()) 11 | atac_fnames = args['fnames'] 12 | barcodes = args['barcodes'] 13 | 14 | fwflag = 99 # 1 + 2 + 32 + 64 15 | bwflag = 147 # 1 + 2 + 16 + 128 16 | mapq = 60 17 | rnext = '=' 18 | lshift = +4 19 | rshift = -5 20 | seqlen = 50 21 | cigar = f'{seqlen}M' 22 | seq = 'N' * seqlen 23 | qual = 'F' * seqlen 24 | valid_chr = [f"chr{i}" for i in range(1,23)] + ['chrX', 'chrY'] 25 | valid_chr = dict([(i,0) for i in valid_chr]) 26 | 27 | sam_header_string = """@HD SO:coordinate 28 | @SQ SN:chr1 LN:248956422 29 | @SQ SN:chr10 LN:133797422 30 | @SQ SN:chr11 LN:135086622 31 | @SQ SN:chr12 LN:133275309 32 | @SQ SN:chr13 LN:114364328 33 | @SQ SN:chr14 LN:107043718 34 | @SQ SN:chr15 LN:101991189 35 | @SQ SN:chr16 LN:90338345 36 | @SQ SN:chr17 LN:83257441 37 | @SQ SN:chr18 LN:80373285 38 | @SQ SN:chr19 LN:58617616 39 | @SQ SN:chr2 LN:242193529 40 | @SQ SN:chr20 LN:64444167 41 | @SQ SN:chr21 LN:46709983 42 | @SQ SN:chr22 LN:50818468 43 | @SQ SN:chr3 LN:198295559 44 | @SQ SN:chr4 LN:190214555 45 | @SQ SN:chr5 LN:181538259 46 | @SQ SN:chr6 LN:170805979 47 | @SQ SN:chr7 LN:159345973 48 | @SQ SN:chr8 LN:145138636 49 | @SQ SN:chr9 LN:138394717 50 | @SQ SN:chrX LN:156040895 51 | @SQ SN:chrY LN:57227415 52 | """ 53 | 54 | def format_sam(s, barcodes): 55 | [chrom, srt, end, bc, rpt] = s.strip().split('\t') 56 | if (chrom.lower() not in valid_chr) or (bc not in barcodes): 57 | return 58 | qname = f"{chrom}:{srt}:{end}:{bc}" 59 | fwpos = int(srt) - lshift + 1 # fragment is 0-index, sam is 1-index (bam is 0-index) 60 | bwpos = int(end) - rshift + 1 - seqlen # reverse strand, left-most position 61 | tlen = bwpos + seqlen - fwpos 62 | for c in range(int(rpt)): 63 | sys.stdout.write(f"{qname}:{c}\t{fwflag}\t{chrom}\t{fwpos}\t{mapq}\t" + 64 | f"{cigar}\t{rnext}\t{bwpos}\t{tlen}\t{seq}\t{qual}\tCB:Z:{bc}\n") 65 | sys.stdout.write(f"{qname}:{c}\t{bwflag}\t{chrom}\t{bwpos}\t{mapq}\t" + 66 | f"{cigar}\t{rnext}\t{fwpos}\t{tlen*-1}\t{seq}\t{qual}\tCB:Z:{bc}\n") 67 | 68 | def filter_fragment_file(atac_fname, barcodes): 69 | with gzip.open(atac_fname, 'rt', encoding='utf-8') as f: 70 | for line in f: 71 | format_sam(line, barcodes) 72 | 73 | sys.stdout.write(sam_header_string) 74 | barcodes = set(pd.read_csv(barcodes, header=None)[0].values) 75 | for atac_fname in atac_fnames: 76 | filter_fragment_file(atac_fname, barcodes) 77 | 78 | -------------------------------------------------------------------------------- /workflow/scripts/mth/dictys/mdl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # Parse command-line arguments 5 | while [[ "$#" -gt 0 ]]; do 6 | case $1 in 7 | --output_d) output_d="$2"; shift ;; 8 | --pre_path) pre_path="$2"; shift ;; 9 | --p2g_path) p2g_path="$2"; shift ;; 10 | --tfb_path) tfb_path="$2"; shift ;; 11 | --annot) annot="$2"; shift ;; 12 | --distance) distance="$2"; shift ;; 13 | --n_p2g_links) n_p2g_links="$2"; shift ;; 14 | --threads) threads="$2"; shift ;; 15 | --device) 
device="$2"; shift ;; 16 | --thr_score) thr_score="$2"; shift ;; 17 | --use_p2g) use_p2g="$2"; shift ;; 18 | --out_path) out_path="$2"; shift ;; 19 | *) echo "Unknown parameter passed: $1"; exit 1 ;; 20 | esac 21 | shift 22 | done 23 | 24 | if [ $(wc -l < $p2g_path) -eq 1 ] || [ $(wc -l < $tfb_path) -eq 1 ] || [ $(basename $pre_path | grep -q '^granie'; echo $?) -eq 0 ]; then 25 | echo "source,target,score,pval" > "$out_path" 26 | mkdir -p "$output_d" 27 | exit 0 28 | fi && \ 29 | mkdir -p "$output_d" && \ 30 | python -c "import torch; print('Cuda enabled:', torch.cuda.is_available())" && \ 31 | python workflow/scripts/mth/dictys/before_mdl.py $pre_path $output_d/expr.tsv.gz $use_p2g $p2g_path $output_d/peaks.tsv.gz $tfb_path $output_d/tfb.tsv.gz && \ 32 | python -m dictys chromatin tssdist --cut $distance $output_d/expr.tsv.gz $output_d/peaks.tsv.gz $annot $output_d/tssdist.tsv.gz && \ 33 | echo 'Finished tssdist' && \ 34 | python -m dictys chromatin linking $output_d/tfb.tsv.gz $output_d/tssdist.tsv.gz $output_d/linking.tsv.gz && \ 35 | echo 'Finished chromatin linking' && \ 36 | python -m dictys chromatin binlinking $output_d/linking.tsv.gz $output_d/binlinking.tsv.gz $n_p2g_links && \ 37 | echo 'Finished chromatin binlinking' && \ 38 | python -m dictys network reconstruct --device $device --nth $threads $output_d/expr.tsv.gz $output_d/binlinking.tsv.gz $output_d/net_weight.tsv.gz $output_d/net_meanvar.tsv.gz $output_d/net_covfactor.tsv.gz $output_d/net_loss.tsv.gz $output_d/net_stats.tsv.gz && \ 39 | echo 'Finished network reconstruct' && \ 40 | python -m dictys network normalize --nth $threads $output_d/net_weight.tsv.gz $output_d/net_meanvar.tsv.gz $output_d/net_covfactor.tsv.gz $output_d/net_nweight.tsv.gz && \ 41 | echo 'Finished network normalize' && \ 42 | python -c "import pandas as pd, numpy as np, sys, os; \ 43 | weights = pd.read_csv(sys.argv[1], sep='\t', index_col=0); \ 44 | mask = pd.read_csv(sys.argv[2], sep='\t', index_col=0); \ 45 | mask = mask.loc[weights.index, weights.columns]; \ 46 | df = [(weights.index[i], weights.columns[j], weights.iloc[i, j]) for i in np.arange(weights.shape[0]) for j in np.arange(weights.shape[1]) if mask.iloc[i, j]]; \ 47 | df = np.array(df); \ 48 | df = pd.DataFrame(df, columns=['source', 'target', 'score']); \ 49 | df['pval'] = 0.01; \ 50 | df['score'] = df['score'].astype(float); \ 51 | df = df[df['score'].abs() > float(sys.argv[3])]; \ 52 | df.to_csv(sys.argv[4], index=False)" $output_d/net_nweight.tsv.gz $output_d/binlinking.tsv.gz $thr_score $out_path 53 | -------------------------------------------------------------------------------- /workflow/scripts/mth/dictys/p2g.py: -------------------------------------------------------------------------------- 1 | import argparse, os, sys 2 | import pandas as pd 3 | import numpy as np 4 | import mudata as md 5 | 6 | 7 | # Init args 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('-d', '--path_data', required=True) 10 | parser.add_argument('-t', '--tmp_path', required=True) 11 | parser.add_argument('-p', '--path_out', required=True) 12 | parser.add_argument('-g', '--gene_annotation', required=True) 13 | parser.add_argument('-e', '--ext', required=True) 14 | args = vars(parser.parse_args()) 15 | 16 | path_data = args['path_data'] 17 | tmp_path = args['tmp_path'] 18 | annot = args['gene_annotation'] 19 | path_out = args['path_out'] 20 | distance = int(args['ext']) 21 | 22 | 23 | # Write the RNA matrix and ATAC matrix to working directory 24 | rna_filename = os.path.join(tmp_path, 
"expression.tsv.gz") 25 | atac_filename = os.path.join(tmp_path, "atac_peak.tsv.gz") 26 | dist_filename = os.path.join(tmp_path, "tssdist.tsv.gz") 27 | data = md.read(path_data) 28 | rna_X = pd.DataFrame(np.array(data['rna'].X).T, columns=data['rna'].obs.index, index=data['rna'].var.index) 29 | rna_X.to_csv(rna_filename, sep="\t", compression="gzip") 30 | 31 | atac_peak_names = [n.replace('-', ':') for n in data['atac'].var.index] 32 | atac_X = pd.DataFrame(np.zeros((data['atac'].var.index.shape[0], 1)), index=atac_peak_names, columns=['placeholder']) 33 | atac_X.to_csv(atac_filename, sep="\t", compression="gzip") 34 | 35 | # Identify all peaks that are within Xbp of annotated TSS 36 | os.system(f'python3 -m dictys chromatin tssdist --cut {distance} {rna_filename} {atac_filename} {annot} {dist_filename}') 37 | 38 | # Convert distance to score for p2g 39 | df = pd.read_csv(dist_filename, sep='\t').rename(columns={'region': 'cre', 'target': 'gene', 'dist': 'score'}) 40 | df['score'] = -np.abs(df['score']) 41 | df['cre'] = df['cre'].str.replace(':', '-') 42 | df = df.sort_values('score', ascending=False).reset_index(drop=True).reset_index(names='rank') 43 | df['score'] = (1 - (df['rank'] / df['rank'].max())) 44 | df[['cre', 'gene', 'score']].to_csv(path_out, index=False) 45 | -------------------------------------------------------------------------------- /workflow/scripts/mth/dictys/pre.py: -------------------------------------------------------------------------------- 1 | import argparse, os, sys 2 | import pandas as pd 3 | import numpy as np 4 | import mudata as md 5 | import dictys 6 | 7 | 8 | # Init args 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('-m','--mudata_path', required=True) 11 | parser.add_argument('-t','--tmp_path', required=True) 12 | parser.add_argument('-o','--out_path', required=True) 13 | args = vars(parser.parse_args()) 14 | 15 | mudata_path = args['mudata_path'] 16 | tmp_path = args['tmp_path'] 17 | out_path = args['out_path'] 18 | 19 | # Read 20 | mdata = md.read(mudata_path) 21 | 22 | # Process rna 23 | pd.DataFrame( 24 | np.array(mdata.mod['rna'].layers['counts'].todense()).T, 25 | columns=mdata.mod['rna'].obs.index, 26 | index=mdata.mod['rna'].var.index 27 | ).to_csv(tmp_path, sep="\t", compression="gzip") 28 | 29 | dictys.preproc.qc_reads(tmp_path, tmp_path, 50, 10, 0, 200, 100, 0) 30 | rna_df = pd.read_csv(tmp_path, sep='\t', compression="gzip", index_col=0) 31 | genes, barcodes = rna_df.index.values.astype('U'), rna_df.columns.values.astype('U') 32 | rna = mdata.mod['rna'] 33 | rna = rna[barcodes, :][:, genes].copy() 34 | rna.X = rna.layers['counts'].todense().A.copy() 35 | 36 | # Process atac 37 | atac = mdata.mod['atac'] 38 | atac.X = atac.layers['counts'].todense().A.copy() 39 | 40 | # Update 41 | mdata.mod['rna'] = rna 42 | mdata.mod['atac'] = atac 43 | mdata.update() 44 | 45 | # Write 46 | mdata.write(out_path) 47 | -------------------------------------------------------------------------------- /workflow/scripts/mth/figr/pre.R: -------------------------------------------------------------------------------- 1 | library(rhdf5) 2 | library(dplyr) 3 | library(doParallel) 4 | 5 | # Parse args 6 | args <- commandArgs(trailingOnly = F) 7 | path_data <- args[6] 8 | nCores <- as.numeric(args[7]) 9 | 10 | # Read data 11 | print('Open object') 12 | indata <- H5Fopen(path_data) 13 | 14 | # RNA 15 | rna_data <- as.data.frame(indata$mod$rna$X) 16 | colnames(rna_data) <- indata$obs$`_index` 17 | rownames(rna_data) <- indata$mod$rna$var$`_index` 18 | 
19 | # ATAC 20 | atac_data <- Matrix::sparseMatrix( 21 | i=indata$mod$atac$layers$counts$indices, 22 | p=indata$mod$atac$layers$counts$indptr, 23 | x=as.numeric(indata$mod$atac$layers$counts$data), 24 | index1 = FALSE 25 | ) 26 | colnames(atac_data) <- indata$obs$`_index` 27 | rownames(atac_data) <- indata$mod$atac$var$`_index` 28 | 29 | # Normalize ATAC data 30 | atac_data <- as.matrix(FigR::centerCounts(atac_data, chunkSize = 100000)) 31 | colnames(atac_data) <- as.character(colnames(atac_data)) 32 | rownames(atac_data) <- as.character(rownames(atac_data)) 33 | 34 | # Write 35 | h5write(atac_data, name="mod/atac/X", file=indata) 36 | 37 | # Close 38 | h5closeAll() 39 | -------------------------------------------------------------------------------- /workflow/scripts/mth/granie/pre.R: -------------------------------------------------------------------------------- 1 | library(rhdf5) 2 | 3 | # Parse args 4 | args <- commandArgs(trailingOnly = F) 5 | path_data <- args[6] 6 | 7 | # Read data 8 | print('Open object') 9 | indata <- H5Fopen(path_data) 10 | 11 | # RNA 12 | rna_data <- indata$mod$rna$X 13 | colnames(rna_data) <- indata$obs$`_index` 14 | rownames(rna_data) <- indata$mod$rna$var$`_index` 15 | 16 | ### ATAC 17 | atac_data <- indata$mod$atac$X 18 | colnames(atac_data) <- indata$obs$`_index` 19 | rownames(atac_data) <- indata$mod$atac$var$`_index` 20 | 21 | # Normalize data 22 | norm_data <- function(data, norm){ 23 | if (norm == 'deseq2'){ 24 | data <- DESeq2::DESeqDataSetFromMatrix( 25 | countData = data, 26 | colData = data.frame(sampleID = colnames(data)), 27 | design = stats::as.formula(" ~ 1") 28 | ) 29 | data <- DESeq2::estimateSizeFactors(data) 30 | data <- DESeq2::counts(data, normalized = TRUE) 31 | } 32 | if (norm == 'limma'){ 33 | data <- limma::normalizeBetweenArrays( 34 | data, 35 | method = 'quantile' 36 | ) 37 | } 38 | return(data) 39 | } 40 | # Add pseudocounts for sparsity and normalize 41 | rna_data <- norm_data(rna_data, 'limma') 42 | atac_data <- norm_data(atac_data, 'deseq2') 43 | 44 | # Write 45 | h5write(rna_data, name="mod/rna/X", file=indata) 46 | h5write(atac_data, name="mod/atac/X", file=indata) 47 | 48 | # Close 49 | h5closeAll() 50 | -------------------------------------------------------------------------------- /workflow/scripts/mth/granie/pre.py: -------------------------------------------------------------------------------- 1 | import decoupler as dc 2 | import pandas as pd 3 | import numpy as np 4 | import mudata as mu 5 | import scipy.sparse as ss 6 | import argparse 7 | 8 | 9 | # Init args 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('-i','--path_input', required=True) 12 | parser.add_argument('-o','--path_out', required=True) 13 | args = vars(parser.parse_args()) 14 | 15 | path_input = args['path_input'] 16 | path_out = args['path_out'] 17 | 18 | # Read rna adata 19 | mdata = mu.read(path_input) 20 | rna = mdata.mod['rna'].copy() 21 | 22 | # Psbulk rna 23 | rna.obs['batch'] = mdata.obs['batch'] 24 | rna.obs['celltype'] = mdata.obs['celltype'] 25 | rna_b_per_c = ( 26 | rna.obs.reset_index() 27 | .groupby('celltype', as_index=False)['index'] 28 | .agg(list).set_index('celltype')['index'] 29 | .to_dict() 30 | ) 31 | rna = dc.get_pseudobulk( 32 | adata=rna, 33 | sample_col='batch', 34 | groups_col='celltype', 35 | layer='counts', 36 | mode='sum', 37 | min_cells=10, 38 | min_counts=1000, 39 | ) 40 | del rna.obs['psbulk_n_cells'] 41 | del rna.obs['psbulk_counts'] 42 | del rna.layers['psbulk_props'] 43 | rna.layers['counts'] = 
ss.csr_matrix(rna.X.copy()) 44 | rna.uns['rna_b_per_c'] = rna_b_per_c 45 | 46 | # Psbulk atac 47 | atac = mdata.mod['atac'].copy() 48 | atac.obs['batch'] = mdata.obs['batch'] 49 | atac.obs['celltype'] = mdata.obs['celltype'] 50 | atac_b_per_c = ( 51 | atac.obs.reset_index() 52 | .groupby('celltype', as_index=False)['index'] 53 | .agg(list).set_index('celltype')['index'] 54 | .to_dict() 55 | ) 56 | atac = dc.get_pseudobulk( 57 | adata=atac, 58 | sample_col='batch', 59 | groups_col='celltype', 60 | layer='counts', 61 | mode='sum', 62 | min_cells=10, 63 | min_counts=1000, 64 | ) 65 | del atac.obs['psbulk_n_cells'] 66 | del atac.obs['psbulk_counts'] 67 | del atac.layers['psbulk_props'] 68 | atac.layers['counts'] = ss.csr_matrix(atac.X.copy()) 69 | atac.uns['atac_b_per_c'] = atac_b_per_c 70 | 71 | # Intersect and generate new object 72 | inter = np.intersect1d(rna.obs_names, atac.obs_names) 73 | mdata = mu.MuData({ 74 | 'rna': rna[inter, :].copy(), 75 | 'atac': atac[inter, :].copy(), 76 | }) 77 | mdata.obs = mdata.mod['rna'].obs.copy() 78 | del mdata.mod['rna'].obs 79 | del mdata.mod['atac'].obs 80 | 81 | # Write 82 | mdata.write(path_out) 83 | -------------------------------------------------------------------------------- /workflow/scripts/mth/granie/pre_post.py: -------------------------------------------------------------------------------- 1 | import decoupler as dc 2 | import pandas as pd 3 | import numpy as np 4 | import mudata as mu 5 | import scipy.sparse as ss 6 | import argparse 7 | 8 | 9 | # Init args 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('-i','--path_input', required=True) 12 | parser.add_argument('-o','--path_out', required=True) 13 | args = vars(parser.parse_args()) 14 | 15 | path_input = args['path_input'] 16 | path_out = args['path_out'] 17 | 18 | # Read data 19 | print(path_input) 20 | mdata = mu.read(path_input) 21 | 22 | # Remove all equal features 23 | msk = np.any(np.diff(mdata.mod['rna'].X, axis=0), axis=0) 24 | rna = mdata.mod['rna'][:, msk].copy() 25 | 26 | msk = np.any(np.diff(mdata.mod['atac'].X, axis=0), axis=0) 27 | atac = mdata.mod['atac'][:, msk].copy() 28 | 29 | # Save 30 | obs=mdata.obs.copy() 31 | mdata = mu.MuData({ 32 | 'rna': rna, 33 | 'atac': atac, 34 | }) 35 | mdata.obs = obs 36 | 37 | # Write 38 | mdata.write(path_out) 39 | -------------------------------------------------------------------------------- /workflow/scripts/mth/grn.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import argparse 4 | 5 | # Init args 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('-i', '--path_input', required=True) 8 | parser.add_argument('-o', '--path_out', required=True) 9 | args = vars(parser.parse_args()) 10 | 11 | mdl_path = args['path_input'] 12 | path_out = args['path_out'] 13 | 14 | # Find paths 15 | path = os.path.dirname(mdl_path) 16 | names = os.path.basename(mdl_path) 17 | lst = names.replace('.mdl.csv', '').split('.') 18 | 19 | # Read in chunks to reduce memory usage 20 | chunksize = 100_000 # Adjust based on available memory 21 | dtype_dict = {'source': 'category', 'target': 'category', 'score': 'float32', 'pval': 'float32'} 22 | mdl_chunks = pd.read_csv(mdl_path, dtype=dtype_dict, chunksize=chunksize) 23 | mdl = pd.concat(mdl_chunks, ignore_index=True) 24 | 25 | # Skip if empty 26 | if mdl.empty: 27 | grn = pd.DataFrame(columns=['source', 'cre', 'target', 'score', 'pval']) 28 | grn.to_csv(path_out, index=False) 29 | os._exit(0) 30 | 31 | # 
Limit to the 100k highest-scoring edges 32 | mdl = mdl.nlargest(100_000, 'score', keep='all').reset_index(drop=True) 33 | tfs = set(mdl['source'].unique()) 34 | gns = set(mdl['target'].unique()) 35 | 36 | # Skip baselines 37 | baselines = {'collectri', 'dorothea', 'random', 'scenic'} 38 | if lst[0] in baselines or lst[0].startswith('o_'): 39 | mdl.to_csv(path_out, index=False) 40 | os._exit(0) 41 | 42 | # Read paths 43 | pre_name, p2g_name, tfb_name, mdl_name = lst 44 | p2g_path = os.path.join(path, f'{pre_name}.{p2g_name}.p2g.csv') 45 | tfb_path = os.path.join(path, f'{pre_name}.{p2g_name}.{tfb_name}.tfb.csv') 46 | 47 | # Read relevant columns with filtering 48 | usecols_tfb = ['tf', 'cre'] 49 | usecols_p2g = ['cre', 'gene'] 50 | 51 | tfb_chunks = pd.read_csv(tfb_path, usecols=usecols_tfb, dtype={'tf': 'category', 'cre': 'category'}, chunksize=chunksize) 52 | tfb = pd.concat((chunk[chunk['tf'].isin(tfs)] for chunk in tfb_chunks), ignore_index=True) 53 | 54 | p2g_chunks = pd.read_csv(p2g_path, usecols=usecols_p2g, dtype={'cre': 'category', 'gene': 'category'}, chunksize=chunksize) 55 | p2g = pd.concat((chunk[chunk['gene'].isin(gns)] for chunk in p2g_chunks), ignore_index=True) 56 | 57 | # Merge in an optimized manner 58 | grn = tfb.merge(p2g, on='cre', how='inner') 59 | grn = grn.rename(columns={'tf': 'source', 'gene': 'target'}) 60 | grn = grn.merge(mdl, on=['source', 'target'], how='inner') 61 | grn = grn.sort_values(['source', 'target', 'cre']).reset_index(drop=True) 62 | grn = grn[['source', 'cre', 'target', 'score', 'pval']] 63 | 64 | grn.to_csv(path_out, index=False) 65 | -------------------------------------------------------------------------------- /workflow/scripts/mth/pando/get_granges.R: -------------------------------------------------------------------------------- 1 | # Parse args 2 | args <- commandArgs(trailingOnly = F) 3 | path_hg <- args[6] 4 | path_mm <- args[7] 5 | 6 | library(EnsDb.Hsapiens.v86) 7 | gene.ranges_hg <- Signac::GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86) 8 | write.csv(gene.ranges_hg, path_hg, row.names=FALSE) 9 | 10 | library(EnsDb.Mmusculus.v79) 11 | gene.ranges_mm <- Signac::GetGRangesFromEnsDb(ensdb = EnsDb.Mmusculus.v79) 12 | write.csv(gene.ranges_mm, path_mm, row.names=FALSE) 13 | -------------------------------------------------------------------------------- /workflow/scripts/mth/pando/p2g.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(rhdf5) 3 | library(Pando) 4 | library(GenomicRanges) 5 | 6 | 7 | # Parse args 8 | args <- commandArgs(trailingOnly = F) 9 | path_data <- args[6] 10 | path_ann <- args[7] 11 | extend <- as.numeric(args[8]) 12 | path_out <- args[9] 13 | 14 | # Set genome 15 | annot <- read.csv(path_ann) 16 | annot <- GenomicRanges::makeGRangesFromDataFrame(annot, keep.extra.columns=TRUE) 17 | GenomeInfoDb::seqlevelsStyle(annot) <- 'UCSC' 18 | 19 | # Read peaks and genes 20 | indata <- H5Fopen(path_data, flags='H5F_ACC_RDONLY') 21 | peaks <- indata$mod$atac$var$`_index` 22 | genes <- indata$mod$rna$var$`_index` 23 | h5closeAll() 24 | peaks <- data.frame(seqnames=peaks) 25 | peaks <- tidyr::separate(data = peaks, col = 'seqnames', into = c("seqnames", "start", "end"), sep = "-", remove=FALSE) 26 | peaks <- GenomicRanges::makeGRangesFromDataFrame(peaks) 27 | 28 | # Filter annot by seen genes 29 | annot <- annot[annot$gene_name %in% intersect(genes, annot$gene_name), ] 30 | 31 | # Find peak2gene links 32 | peaks_near_gene <- find_peaks_near_genes( 33 | peaks =
peaks, 34 | genes = annot, 35 | method = 'GREAT', 36 | upstream = round(extend / 2), 37 | downstream = round(extend / 2), 38 | ) 39 | peaks2gene <- aggregate_matrix(t(peaks_near_gene), groups=colnames(peaks_near_gene), fun='sum') 40 | 41 | # Convert from sparse mat to df 42 | sparse <- summary(peaks2gene) 43 | df <- data.frame( 44 | cre = colnames(peaks2gene)[sparse$j], 45 | gene = rownames(peaks2gene)[sparse$i], 46 | score = sparse$x 47 | ) 48 | df <- df %>% arrange(cre, desc(score)) 49 | 50 | # Write 51 | write.csv(x = df, file = path_out, row.names=FALSE) 52 | -------------------------------------------------------------------------------- /workflow/scripts/mth/pando/pre.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(rhdf5) 3 | library(Pando) 4 | library(GenomicRanges) 5 | 6 | 7 | # Parse args 8 | args <- commandArgs(trailingOnly = F) 9 | path_data <- args[6] 10 | path_ann <- args[7] 11 | exclude_exons <- args[8] 12 | path_cand <- args[9] 13 | path_matches <- args[10] 14 | 15 | # Set genome 16 | data('phastConsElements20Mammals.UCSC.hg38') 17 | regions <- phastConsElements20Mammals.UCSC.hg38 18 | annot <- read.csv(path_ann) 19 | annot <- GenomicRanges::makeGRangesFromDataFrame(annot, keep.extra.columns=TRUE) 20 | GenomeInfoDb::seqlevelsStyle(annot) <- 'UCSC' 21 | 22 | # Read peaks 23 | indata <- H5Fopen(path_data, flags='H5F_ACC_RDONLY') 24 | peaks <- indata$mod$atac$var$`_index` 25 | h5closeAll() 26 | peaks <- data.frame(seqnames=peaks) 27 | peaks <- tidyr::separate(data = peaks, col = 'seqnames', into = c("seqnames", "start", "end"), sep = "-", remove=FALSE) 28 | peaks <- GenomicRanges::makeGRangesFromDataFrame(peaks) 29 | 30 | # Read exons 31 | exons <- annot[annot$type=='exon', ] 32 | names(exons@ranges) <- NULL 33 | exons <- IRanges::intersect(exons, exons) 34 | exons <- GenomicRanges::GRanges( 35 | seqnames = exons@seqnames, 36 | ranges = exons@ranges 37 | ) 38 | 39 | # Intersect by only shared chromosomes 40 | seqnames <- intersect(intersect(levels(peaks@seqnames), levels(regions@seqnames)), levels(exons@seqnames)) 41 | peaks <- keepSeqlevels(peaks, seqnames, pruning.mode = "coarse") 42 | exons <- keepSeqlevels(exons, seqnames, pruning.mode = "coarse") 43 | regions <- keepSeqlevels(regions, seqnames, pruning.mode = "coarse") 44 | 45 | # Filter by evo cons regions 46 | hits <- GenomicRanges::findOverlaps(regions, peaks) 47 | cand <- GenomicRanges::pintersect( 48 | peaks[S4Vectors::subjectHits(hits)], 49 | regions[S4Vectors::queryHits(hits)] 50 | ) 51 | 52 | # Substract exons 53 | if (exclude_exons){ 54 | cand <- GenomicRanges::subtract(cand, exons, ignore.strand=TRUE) %>% unlist() 55 | } 56 | 57 | # Find matches of new peaks to old peaks 58 | matches <- S4Vectors::subjectHits(GenomicRanges::findOverlaps(cand, peaks)) 59 | 60 | # Write 61 | write.csv(x = cand, file = path_cand, row.names=FALSE) 62 | write.csv(x = matches, file = path_matches, row.names=FALSE) 63 | -------------------------------------------------------------------------------- /workflow/scripts/mth/pando/pre.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import mudata as mu 4 | import argparse 5 | 6 | 7 | # Init args 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('-i','--path_input', required=True) 10 | parser.add_argument('-p','--path_peaks', required=True) 11 | parser.add_argument('-m','--path_matches', required=True) 12 | 
parser.add_argument('-o','--path_out', required=True) 13 | args = vars(parser.parse_args()) 14 | 15 | path_input = args['path_input'] 16 | path_peaks = args['path_peaks'] 17 | path_matches = args['path_matches'] 18 | path_out = args['path_out'] 19 | 20 | # Read 21 | mdata = mu.read(path_input) 22 | df = pd.read_csv(path_peaks) 23 | matches = pd.read_csv(path_matches).iloc[:, 0].values - 1 24 | 25 | # Format peaks 26 | new_peaks = (df['seqnames'].astype(str) + '-' + df['start'].astype(str) + '-' + df['end'].astype(str)).values.astype('U') 27 | 28 | # Filter 29 | atac = mdata.mod['atac'][:, matches].copy() 30 | atac.var_names = new_peaks 31 | msk = np.sum(atac.X, axis=1) != 0 32 | atac = atac[msk, :].copy() 33 | 34 | # Remove mismatched obs 35 | rna = mdata.mod['rna'].copy() 36 | inter = np.intersect1d(rna.obs_names, atac.obs_names) 37 | x_spectral = mdata[inter, :].obsm['X_spectral'].copy() 38 | x_umap = mdata[inter, :].obsm['X_umap'].copy() 39 | obs = mdata.obs.copy() 40 | obs = obs.loc[inter, :] 41 | mdata = mu.MuData( 42 | { 43 | 'rna': rna[inter, :].copy(), 44 | 'atac': atac[inter, :].copy(), 45 | } 46 | ) 47 | mdata.obsm['X_spectral'] = x_spectral 48 | mdata.obsm['X_umap'] = x_umap 49 | mdata.obs = obs 50 | 51 | # Write 52 | mdata.write(path_out) 53 | -------------------------------------------------------------------------------- /workflow/scripts/mth/pando/tfb.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | library(rhdf5) 3 | library(Pando) 4 | 5 | # Parse args 6 | args <- commandArgs(trailingOnly = F) 7 | path_data <- args[6] 8 | organism <- args[7] 9 | path_p2g <- args[8] 10 | path_out <- args[9] 11 | 12 | # Read genome 13 | if (organism == 'hg38'){ 14 | library(BSgenome.Hsapiens.UCSC.hg38) 15 | genome <- BSgenome.Hsapiens.UCSC.hg38 16 | } else { 17 | library(BSgenome.Mmusculus.UCSC.mm10) 18 | genome <- BSgenome.Mmusculus.UCSC.mm10 19 | } 20 | 21 | # Read p2g 22 | p2g <- read.csv(path_p2g) 23 | if (nrow(p2g) == 0){ 24 | tfb <- data.frame(cre=character(), tf=character(), score=numeric()) 25 | write.csv(x = tfb, file = path_out, row.names=FALSE) 26 | quit(save="no") 27 | } 28 | 29 | # Read genes 30 | indata <- H5Fopen(path_data, flags='H5F_ACC_RDONLY') 31 | genes <- indata$mod$rna$var$`_index` 32 | h5closeAll() 33 | 34 | # Transform motif2tf to mat 35 | data('motif2tf') 36 | motif2tf <- motif2tf %>% select('motif'=1,'tf'=2) %>% 37 | distinct() %>% mutate(val=1) %>% 38 | tidyr::pivot_wider(names_from = 'tf', values_from=val, values_fill=0) %>% 39 | tibble::column_to_rownames('motif') %>% 40 | as.matrix() %>% Matrix::Matrix(sparse=TRUE) 41 | 42 | # Subset motifs to tfs in data 43 | data('motifs') 44 | motif2tf <- motif2tf[, intersect(genes, colnames(motif2tf))] 45 | motifs <- motifs[rownames(motif2tf)[Matrix::rowSums(motif2tf) != 0 ]] 46 | 47 | # Transform peaks to GRanges 48 | peaks <- data.frame(seqnames=p2g$cre) %>% distinct() 49 | peaks <- tidyr::separate(data = peaks, col = 'seqnames', into = c("seqnames", "start", "end"), sep = "-", remove=FALSE) 50 | peaks <- GenomicRanges::makeGRangesFromDataFrame(peaks) 51 | 52 | # Run motif enrichment using motifmatchr (MOODS) 53 | peak_motifs <- Signac::CreateMotifMatrix( 54 | features = peaks, 55 | pwm = motifs, 56 | genome = genome, 57 | use.counts = FALSE, 58 | score=TRUE 59 | ) 60 | 61 | # Extract list of motifs to tfs 62 | sparse <- summary(motif2tf) 63 | motif2tf_lst <- data.frame( 64 | motif = rownames(motif2tf)[sparse$i], 65 | tf = colnames(motif2tf)[sparse$j] 66 | ) %>%
67 | group_by(motif) %>% 68 | summarize(values = list(tf)) %>% 69 | deframe() 70 | 71 | # Convert from sparse mat to df 72 | sparse <- summary(peak_motifs) 73 | df <- data.frame( 74 | cre = rownames(peak_motifs)[sparse$i], 75 | tf = colnames(peak_motifs)[sparse$j], 76 | score = sparse$x 77 | ) %>% 78 | mutate(tf=motif2tf_lst[tf]) %>% 79 | unnest(tf) %>% 80 | summarize(score = max(score), .by=c(cre, tf)) %>% 81 | mutate(score = ifelse(score < 0, 0, score)) %>% # Sometimes MOODs returns negative values 82 | arrange(cre, desc(score)) 83 | 84 | # Write 85 | write.csv(x = df, file = path_out, row.names=FALSE) 86 | -------------------------------------------------------------------------------- /workflow/scripts/mth/prc_prior_grn.py: -------------------------------------------------------------------------------- 1 | import pyranges as pr 2 | import pandas as pd 3 | import numpy as np 4 | import os 5 | import mudata as mu 6 | import argparse 7 | 8 | 9 | # Init args 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('-g','--grn_path', required=True) 12 | parser.add_argument('-d','--data_path', required=True) 13 | parser.add_argument('-p','--proms_path', required=True) 14 | parser.add_argument('-o','--out_path', required=True) 15 | args = vars(parser.parse_args()) 16 | 17 | grn_path = args['grn_path'] 18 | data_path = args['data_path'] 19 | proms_path = args['proms_path'] 20 | out_path = args['out_path'] 21 | 22 | # Read 23 | grn = pd.read_csv(grn_path) 24 | genes = mu.read(os.path.join(data_path, 'mod', 'rna')).var_names.astype('U') 25 | peaks = mu.read(os.path.join(data_path, 'mod', 'atac')).var_names.astype('U') 26 | proms = pr.read_bed(proms_path) 27 | 28 | # Transform peaks 29 | peaks = pd.DataFrame(peaks, columns=['cre']) 30 | peaks[['Chromosome', 'Start', 'End']] = peaks['cre'].str.split('-', n=2, expand=True) 31 | peaks = pr.PyRanges(peaks[['Chromosome', 'Start', 'End']]) 32 | 33 | # Filter by genes 34 | grn = grn[grn['source'].astype('U').isin(genes) & grn['target'].astype('U').isin(genes)] 35 | proms = proms[proms.Name.astype('U').isin(genes)] 36 | 37 | # Filter by peaks 38 | proms = proms.overlap(peaks) 39 | proms.cre = proms.df['Chromosome'].astype(str) + '-' + proms.df['Start'].astype(str) + '-' + proms.df['End'].astype(str) 40 | proms = proms.df[['cre', 'Name']].rename(columns={'Name': 'target'}) 41 | 42 | # Merge 43 | grn = pd.merge(grn, proms, how='inner')[['source', 'cre', 'target', 'weight']] 44 | grn = grn.sort_values(['source', 'target', 'cre']).rename(columns={'weight': 'score'}) 45 | 46 | # Filter regulons with less than 5 targets 47 | n_targets = grn.groupby(['source']).size().reset_index(name='counts') 48 | n_targets = n_targets[n_targets['counts'] > 5] 49 | grn = grn[grn['source'].isin(n_targets['source'])] 50 | 51 | # Write 52 | grn.to_csv(out_path, index=False) 53 | -------------------------------------------------------------------------------- /workflow/scripts/mth/scenic/loom.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | import mudata as mu 5 | import loompy as lp 6 | import argparse 7 | 8 | 9 | # Init args 10 | parser = argparse.ArgumentParser() 11 | 12 | parser.add_argument('-i','--data', required=True) 13 | parser.add_argument('-o','--path_out', required=True) 14 | args = vars(parser.parse_args()) 15 | 16 | path_input = args['data'] 17 | path_out = args['path_out'] 18 | 19 | # Extract raw counts data and assign labels 20 | mdata = 
mu.read(path_input) 21 | adata = mdata.mod['rna'].copy() 22 | adata.layers['lognorm'] = adata.X.copy() 23 | adata.X = adata.layers['counts'].copy() 24 | adata.obs['celltype'] = mdata.obs['celltype'] 25 | adata.obsm['X_pca'] = mdata.obsm['X_spectral'] 26 | 27 | # create basic row and column attributes for the loom file: 28 | row_attrs = { 29 | "Gene": np.array(adata.var_names) , 30 | } 31 | col_attrs = { 32 | "CellID": np.array(adata.obs_names) , 33 | "nGene": np.array(np.sum(adata.X.transpose() > 0, axis=0)).flatten() , 34 | "nUMI": np.array(np.sum(adata.X.transpose(), axis=0)).flatten() , 35 | } 36 | lp.create(path_out, adata.X.transpose(), row_attrs, col_attrs) 37 | -------------------------------------------------------------------------------- /workflow/scripts/mth/scenic/process_grn.py: -------------------------------------------------------------------------------- 1 | import pyranges as pr 2 | import pandas as pd 3 | import numpy as np 4 | import os 5 | import argparse 6 | 7 | 8 | # Init args 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('-g','--grn_path', required=True) 11 | parser.add_argument('-p','--proms_path', required=True) 12 | parser.add_argument('-o','--out_path', required=True) 13 | parser.add_argument('-r','--reg_path', required=True) 14 | args = vars(parser.parse_args()) 15 | 16 | grn_path = args['grn_path'] 17 | proms_path = args['proms_path'] 18 | out_path = args['out_path'] 19 | reg_path = args['reg_path'] 20 | 21 | # Read 22 | grn = pd.read_csv(grn_path, index_col=False, sep='\t').rename(columns={'TF': 'source', 'importance': 'score'}) 23 | proms = pr.read_bed(proms_path).df 24 | proms['cre'] = proms['Chromosome'].astype(str) + '-' + proms['Start'].astype(str) + '-' + proms['End'].astype(str) 25 | proms = proms[['cre', 'Name']].rename(columns={'Name': 'target'}) 26 | reg = pd.read_csv(reg_path) 27 | 28 | # Filter by enriched TFs 29 | reg = reg.iloc[2:, [0, 8]] 30 | reg.columns = ['source', 'target'] 31 | reg['target'] = reg['target'].str.split(',') 32 | reg_exp = reg.explode('target') 33 | reg_exp['target'] = reg_exp['target'].str.replace(r"[\[\(\)' ]", "", regex=True) 34 | 35 | # Merge 36 | grn = pd.merge(grn, reg_exp, on=['source', 'target'], how='inner') 37 | grn = pd.merge(grn, proms, how='inner')[['source', 'cre', 'target', 'score']] 38 | grn = grn[grn["score"] > 0.001] 39 | grn = grn.sort_values(['source', 'target', 'cre']) 40 | 41 | # Write 42 | grn.to_csv(out_path, index=False) 43 | -------------------------------------------------------------------------------- /workflow/scripts/mth/scenicplus/egrn.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import sys 3 | 4 | 5 | df = pd.read_table(sys.argv[1]) 6 | if df.shape[0] > 0: 7 | df = df[df['regulation'] != 0.] 
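# The lines below collapse region-level triplets into TF-gene pairs (averaging 'regulation' and 'triplet_rank'), then convert the triplet rank into a score in [0, 1] whose sign follows the mean regulation direction.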
8 | df = df[['TF', 'Region', 'Gene', 'regulation', 'triplet_rank']].groupby(['TF', 'Gene'], as_index=False).mean(numeric_only=True).sort_values('triplet_rank') 9 | df = df.reset_index(drop=True).reset_index(names='rank') 10 | df['score'] = (1 - (df['rank'] / df['rank'].max())) * df['regulation'] 11 | df = df[['TF', 'Gene', 'score']] 12 | df.columns = ['source', 'target', 'score'] 13 | df['pval'] = 0.01 14 | else: 15 | df = pd.DataFrame(columns=['source', 'target', 'score', 'pval']) 16 | df.to_csv(sys.argv[2], index=False) 17 | -------------------------------------------------------------------------------- /workflow/scripts/mth/scenicplus/mdata.py: -------------------------------------------------------------------------------- 1 | import mudata, sys 2 | 3 | m = mudata.read(sys.argv[1]) 4 | m.mod['scRNA'] = m.mod['rna'] 5 | del m.mod['rna'] 6 | m.mod['scATAC'] = m.mod['atac'] 7 | del m.mod['atac'] 8 | m.mod['scATAC'].var_names = m.mod['scATAC'].var_names.str.replace('-', ':', 1) 9 | m.write(sys.argv[2]) 10 | -------------------------------------------------------------------------------- /workflow/scripts/mth/scenicplus/mdl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # Parse command-line arguments 5 | while [[ "$#" -gt 0 ]]; do 6 | case $1 in 7 | --new_dir) new_dir="$2"; shift ;; 8 | --path_pre) path_pre="$2"; shift ;; 9 | --path_p2g) path_p2g="$2"; shift ;; 10 | --path_tfb) path_tfb="$2"; shift ;; 11 | --path_rnk) path_rnk="$2"; shift ;; 12 | --threads) threads="$2"; shift ;; 13 | --path_out) path_out="$2"; shift ;; 14 | *) echo "Unknown parameter passed: $1"; exit 1 ;; 15 | esac 16 | shift 17 | done 18 | 19 | if [ $(wc -l < $path_p2g) -eq 1 ] || [ $(wc -l < $path_tfb) -eq 1 ]; then 20 | echo "source,target,score" > "$path_out" 21 | exit 0 22 | fi 23 | 24 | # Transform pre to scenicplus mdata format 25 | python workflow/scripts/mth/scenicplus/mdata.py $path_pre $new_dir/mdata.h5mu 26 | 27 | # Extract unique tfs 28 | python -c "import pandas as pd; \ 29 | import sys; \ 30 | tfb = pd.DataFrame(pd.read_csv(sys.argv[1])['tf'].unique().reshape(-1, 1), columns=['tf']); \ 31 | tfb.to_csv(sys.argv[2], index=False, header=False)" $path_tfb $new_dir/tfs.txt 32 | 33 | # Infer tg links 34 | scenicplus grn_inference TF_to_gene \ 35 | --multiome_mudata_fname $new_dir/mdata.h5mu \ 36 | --tf_names $new_dir/tfs.txt \ 37 | --temp_dir $TMPDIR \ 38 | --out_tf_to_gene_adjacencies $new_dir/tg_adj.tsv \ 39 | --method GBM \ 40 | --n_cpu $threads 41 | 42 | # Transform p2g to scenicplus format 43 | python -c "import pandas as pd; \ 44 | import numpy as np; \ 45 | import sys; \ 46 | p2g = pd.read_csv(sys.argv[1]); \ 47 | p2g['region'] = p2g['cre'].str.replace('-', ':', 1); \ 48 | p2g['target'] = p2g['gene']; \ 49 | p2g['importance'] = p2g['score'].abs(); \ 50 | p2g['rho'] = np.sign(p2g['score']); \ 51 | p2g['importance_x_rho'] = p2g['score']; \ 52 | p2g['importance_x_abs_rho'] = p2g['score']; \ 53 | p2g = p2g[['region', 'target', 'importance', 'rho', 'importance_x_rho', 'importance_x_abs_rho']]; \ 54 | p2g.to_csv(sys.argv[2], sep='\\t', index=False)" $path_p2g $new_dir/rg_adj.tsv 55 | 56 | # Transform tfb to scenicplus format 57 | python workflow/scripts/mth/scenicplus/motifs.py $path_tfb $path_rnk $new_dir/motifs.h5ad 58 | 59 | # Egrn inference 60 | dichotomize=$( python -c "import sys, pandas; print('') if (pandas.read_csv(sys.argv[1])['score'] < 0).any() else print('--do_not_rho_dichotomize_r2g --do_not_rho_dichotomize_eRegulon');" $path_p2g ) 
61 | echo "$dichotomize" 62 | scenicplus grn_inference eGRN \ 63 | --TF_to_gene_adj_fname $new_dir/tg_adj.tsv \ 64 | --region_to_gene_adj_fname $new_dir/rg_adj.tsv \ 65 | --cistromes_fname $new_dir/motifs.h5ad \ 66 | --ranking_db_fname $path_rnk \ 67 | --eRegulon_out_fname $new_dir/egrn.tsv \ 68 | --temp_dir $TMPDIR \ 69 | --min_target_genes 10 \ 70 | --n_cpu $threads $dichotomize 71 | 72 | # Transform grn into greta format 73 | python workflow/scripts/mth/scenicplus/egrn.py $new_dir/egrn.tsv $path_out 74 | -------------------------------------------------------------------------------- /workflow/scripts/mth/scenicplus/motifs.py: -------------------------------------------------------------------------------- 1 | from scenicplus.triplet_score import get_max_rank_of_motif_for_each_TF 2 | from pycistarget.motif_enrichment_cistarget import cisTargetDatabase 3 | from pycistarget.utils import load_motif_annotations 4 | import scipy.sparse as ss 5 | import numpy as np 6 | import pandas as pd 7 | import pyranges as pr 8 | import anndata as ad 9 | import sys 10 | 11 | 12 | def get_pr(index): 13 | df = index.str.replace(':', '-') 14 | df = df.str.split('-').tolist() 15 | df = pd.DataFrame(df, columns=['Chromosome', 'Start', 'End']) 16 | return pr.PyRanges(df) 17 | 18 | 19 | def get_motifs_for_TF(tf_names, annotation_to_use, motif_to_tf): 20 | motif_to_tf = motif_to_tf.fillna("")[annotation_to_use].agg(", ".join, axis = 1).apply(lambda x: [x for x in x.split(", ") if len(x) > 0]) 21 | motif_to_tf = motif_to_tf.loc[[len(x) > 0 for x in motif_to_tf]] 22 | tf_to_motif = motif_to_tf.explode().reset_index().drop_duplicates().groupby(0)["MotifID"].apply(lambda x: ','.join(list(x))) 23 | tf_names = pd.Index(tf_names) 24 | tf_names = tf_names.intersection(tf_to_motif.index) 25 | return tf_to_motif.loc[tf_names].to_dict() 26 | 27 | 28 | path_tfb = sys.argv[1] 29 | path_db = sys.argv[2] 30 | path_out = sys.argv[3] 31 | 32 | # Read 33 | tfb = pd.read_csv(path_tfb) 34 | tfb['cre'] = tfb['cre'].str.replace('-', ':', 1) 35 | var_names = tfb['tf'].unique() 36 | obs_names = tfb['cre'].unique() 37 | 38 | # Create anndata 39 | motifs = ad.AnnData( 40 | obs=pd.DataFrame(index=obs_names), 41 | var=pd.DataFrame(index=var_names), 42 | X=ss.lil_matrix((obs_names.size, var_names.size), dtype=bool), 43 | ) 44 | for cre, tfs in tfb.groupby('cre')['tf'].apply(lambda x: np.array(x)).items(): 45 | motifs[cre, tfs].X = True 46 | motifs.X = ss.csr_matrix(motifs.X) 47 | 48 | # Find motif annots 49 | motif_to_tf = load_motif_annotations( 50 | specie = "homo_sapiens", 51 | version = "v10nr_clust", 52 | motif_similarity_fdr = 0.001, 53 | orthologous_identity_threshold = 0.0) 54 | 55 | # Remove ann if they are not in db or cres not in db 56 | ctx_db = cisTargetDatabase( 57 | fname=path_db, 58 | region_sets=get_pr(motifs.obs_names) 59 | ) 60 | inter = motif_to_tf.index.intersection(ctx_db.db_rankings.index) 61 | motif_to_tf = motif_to_tf.loc[inter] 62 | motif_to_tf.index.name = 'MotifID' 63 | 64 | # Find motif anns per tf gene name 65 | tf_to_motif = get_motifs_for_TF( 66 | tf_names = motifs.var_names, 67 | annotation_to_use = ["Direct_annot", "Orthology_annot"], 68 | motif_to_tf = motif_to_tf 69 | ) 70 | 71 | # Subset and add annots 72 | m_msk = motifs.var_names.isin(tf_to_motif) 73 | motifs = motifs[:, m_msk].copy() 74 | motifs.var.loc[:, 'motifs'] = [tf_to_motif[v] for v in motifs.var_names] 75 | 76 | # Remove regions not found in db 77 | df = get_max_rank_of_motif_for_each_TF(motifs, path_db) 78 | inter = 
motifs.obs_names.intersection(df.index) 79 | motifs = motifs[inter, :].copy() 80 | 81 | # Write 82 | motifs.write(path_out) 83 | -------------------------------------------------------------------------------- /workflow/scripts/mth/scenicplus/p2g.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # Parse command-line arguments 5 | while [[ "$#" -gt 0 ]]; do 6 | case $1 in 7 | --new_dir) new_dir="$2"; shift ;; 8 | --path_pre) path_pre="$2"; shift ;; 9 | --path_ann) path_ann="$2"; shift ;; 10 | --path_csz) path_csz="$2"; shift ;; 11 | --ext) ext="$2"; shift ;; 12 | --threads) threads="$2"; shift ;; 13 | --path_out) path_out="$2"; shift ;; 14 | *) echo "Unknown parameter passed: $1"; exit 1 ;; 15 | esac 16 | shift 17 | done 18 | 19 | python -c "import mudata, sys; \ 20 | m = mudata.read(sys.argv[1]); \ 21 | m.mod['scRNA'] = m.mod['rna']; \ 22 | del m.mod['rna']; \ 23 | m.mod['scATAC'] = m.mod['atac']; \ 24 | del m.mod['atac']; \ 25 | m.mod['scATAC'].var_names = m.mod['scATAC'].var_names.str.replace('-', ':', 1); \ 26 | m.write(sys.argv[2])" $path_pre $new_dir/mdata.h5mu 27 | 28 | scenicplus prepare_data search_spance \ 29 | --multiome_mudata_fname $new_dir/mdata.h5mu \ 30 | --gene_annotation_fname $path_ann \ 31 | --chromsizes_fname $path_csz \ 32 | --upstream 1000 $ext \ 33 | --downstream 1000 $ext \ 34 | --out_fname $new_dir/space.tsv 35 | 36 | scenicplus grn_inference region_to_gene \ 37 | --multiome_mudata_fname $new_dir/mdata.h5mu \ 38 | --search_space_fname $new_dir/space.tsv \ 39 | --temp_dir $TMPDIR \ 40 | --out_region_to_gene_adjacencies $new_dir/rg_adj.tsv \ 41 | --n_cpu $threads 42 | 43 | python -c "import pandas as pd; \ 44 | import sys; \ 45 | tab = pd.read_table(sys.argv[1]); \ 46 | tab = tab[tab['importance_x_rho'].abs() > 1e-16]; \ 47 | tab = tab[['region', 'target', 'importance_x_rho']]; \ 48 | tab['region'] = tab['region'].str.replace(':', '-'); \ 49 | tab.columns = ['cre', 'gene', 'score']; \ 50 | tab.to_csv(sys.argv[2], index=False)" $new_dir/rg_adj.tsv $path_out 51 | -------------------------------------------------------------------------------- /workflow/scripts/mth/scenicplus/pre.py: -------------------------------------------------------------------------------- 1 | import mudata as mu 2 | import scipy.sparse as scs 3 | import scanpy as sc 4 | import sys 5 | 6 | 7 | path_ann = sys.argv[1] 8 | path_scn = sys.argv[2] 9 | path_out = sys.argv[3] 10 | 11 | # Read 12 | ann = mu.read(path_ann) 13 | scn = mu.read(path_scn) 14 | scn.var.index = scn.var_names.str.replace(':', '-') 15 | 16 | # Match 17 | inter_var = ann.var_names.intersection(scn.var_names) 18 | inter_obs = ann.obs_names.intersection(scn.obs_names) 19 | ann = ann[inter_obs, inter_var].copy() 20 | scn = scn[inter_obs, inter_var] 21 | 22 | # Update atac counts with topic ones 23 | ann.mod['atac'].layers['counts'] = scs.csr_matrix(scn.mod['scATAC'].X) 24 | ann.mod['atac'].X = scn.mod['scATAC'].X 25 | 26 | # Write 27 | ann.write(path_out) 28 | -------------------------------------------------------------------------------- /workflow/scripts/mth/scenicplus/tfb.py: -------------------------------------------------------------------------------- 1 | import scipy.sparse as scs 2 | import anndata as ad 3 | import pyranges as pr 4 | import h5py 5 | import pandas as pd 6 | import mudata as mu 7 | import sys 8 | 9 | def get_pr(index): 10 | df = index.str.replace(':', '-') 11 | df = df.str.split('-').tolist() 12 | df = pd.DataFrame(df, columns=['Chromosome', 
'Start', 'End']) 13 | return pr.PyRanges(df) 14 | 15 | 16 | def get_vars(df): 17 | chrm = df.df['Chromosome'].astype(str) 18 | strt = df.df['Start'].astype(str) 19 | end = df.df['End'].astype(str) 20 | return pd.Index(chrm + ':' + strt + '-' + end) 21 | 22 | 23 | path_pre = sys.argv[1] 24 | path_p2g = sys.argv[2] 25 | path_motifs = sys.argv[3] 26 | path_out = sys.argv[4] 27 | 28 | # Read 29 | motifs = mu.read(path_motifs) 30 | p2g = pd.read_csv(path_p2g) 31 | if p2g.shape[0] == 0: 32 | tfb = pd.DataFrame(columns=['cre', 'tf', 'score']) 33 | tfb.to_csv(path_out, index=False) 34 | exit() 35 | 36 | # Subset by tf genes 37 | with h5py.File(path_pre, 'r') as f: 38 | genes = f['mod']['rna']['var']['_index'][:].astype('U') 39 | tf_msk = motifs.var_names.isin(genes) 40 | motifs = motifs[:, tf_msk] 41 | 42 | # Find shared regions 43 | mtf_pr = get_pr(motifs.obs_names) 44 | p2g_pr = get_pr(pd.Index(p2g['cre'].unique())) 45 | inter = mtf_pr.join(p2g_pr) 46 | inter_motifs = get_vars(inter[['Chromosome', 'Start', 'End']]) 47 | inter_p2g = get_vars(pr.PyRanges(inter.df[['Chromosome', 'Start_b', 'End_b']].rename(columns={'Start_b': 'Start', 'End_b': 'End'}))) 48 | 49 | # Create matching motif anndata 50 | new_motifs = ad.AnnData( 51 | var=pd.DataFrame(index=motifs.var_names), 52 | obs=pd.DataFrame(index=inter_p2g), 53 | X=scs.csr_matrix((inter_p2g.size, motifs.var_names.size)) 54 | ) 55 | new_motifs[inter_p2g, :].X = motifs[inter_motifs, :].X 56 | 57 | # Build df 58 | new_motifs.X = new_motifs.X.tocoo() 59 | tfb = pd.DataFrame() 60 | tfb['cre'] = new_motifs.obs_names[new_motifs.X.row] 61 | tfb['tf'] = new_motifs.var_names[new_motifs.X.col] 62 | tfb['score'] = 5. 63 | tfb['cre'] = tfb['cre'].str.replace(':', '-') 64 | 65 | # Write 66 | tfb.to_csv(path_out, index=False) 67 | -------------------------------------------------------------------------------- /workflow/scripts/plt/stab/cors.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import seaborn as sns 3 | import pandas as pd 4 | import numpy as np 5 | import sys 6 | import os 7 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 8 | from utils import read_config, savefigs 9 | 10 | 11 | # Read config 12 | config = read_config() 13 | palette = config['colors']['nets'] 14 | mthds = list(config['methods'].keys()) 15 | baselines = config['baselines'] 16 | 17 | path_repl_wgt = sys.argv[1] 18 | path_repl_cor = sys.argv[2] 19 | repl_wgt = pd.read_csv(path_repl_wgt) 20 | repl_cor = pd.read_csv(path_repl_cor) 21 | 22 | figs = [] 23 | for mth in repl_wgt['mth'].unique(): 24 | tmp = repl_wgt[repl_wgt['mth'] == mth] 25 | if tmp.shape[0] > 1: 26 | fig, ax = plt.subplots(1, 1, figsize=(2, 2), dpi=150) 27 | max_n = np.max([tmp['score_x'].abs().max(), tmp['score_y'].abs().max()]) 28 | max_n = max_n + (max_n * 0.05) 29 | sns.histplot( 30 | data=tmp, 31 | x='score_x', 32 | y='score_y', 33 | cbar=False, 34 | cmap='magma', 35 | stat='proportion', 36 | vmin=0., 37 | vmax=1e-2, 38 | bins=(50, 50), 39 | cbar_kws=dict(label='Proportion', shrink=0.5, aspect=5, orientation='horizontal') 40 | ) 41 | ax.set_xlabel('Run A edge score') 42 | ax.set_ylabel('Run B edge score') 43 | ax.set_xlim(-max_n, max_n) 44 | ax.set_ylim(-max_n, max_n) 45 | ax.set_title(mth) 46 | figs.append(fig) 47 | 48 | 49 | fig, ax = plt.subplots(1, 1, figsize=(1.5, 1), dpi=150) 50 | order = mthds + baselines 51 | order = [m for m in order if m in repl_cor['mth'].unique()] 52 | sns.boxplot(data=repl_cor, 
x='stat', y='mth', hue='mth', fill=None, ax=ax, palette=palette, order=order) 53 | sns.stripplot(data=repl_cor, x='stat', y='mth', hue='mth', ax=ax, palette=palette, order=order) 54 | ax.set_xlabel('Pearson ρ') 55 | ax.set_ylabel('') 56 | ax.set_xticks([0, 0.5, 1]) 57 | ax.set_xlim(-0.05, 1.05) 58 | figs.append(fig) 59 | 60 | # Write 61 | savefigs(figs, sys.argv[3]) 62 | -------------------------------------------------------------------------------- /workflow/scripts/plt/utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def read_config(path_config='config/config.yaml'): 4 | import yaml 5 | with open(path_config, 'r') as file: 6 | config = yaml.safe_load(file) 7 | return config 8 | 9 | 10 | def savefigs(lst_figs, path_fname, index_pngs=[]): 11 | import matplotlib.backends.backend_pdf 12 | import io 13 | from PIL import Image 14 | import matplotlib.pyplot as plt 15 | pdf = matplotlib.backends.backend_pdf.PdfPages(path_fname) 16 | for i, fig in enumerate(lst_figs): 17 | if i not in index_pngs: 18 | pdf.savefig(fig, bbox_inches='tight') 19 | else: 20 | buf = io.BytesIO() 21 | fig.savefig(buf, format='png', dpi=300, bbox_inches='tight') 22 | plt.close(fig) 23 | buf.seek(0) 24 | image = Image.open(buf) 25 | new_fig, ax = plt.subplots(figsize=(image.width / 100, image.height / 100), dpi=300) 26 | ax.imshow(image) 27 | ax.axis("off") 28 | pdf.savefig(new_fig, bbox_inches='tight') 29 | plt.close(new_fig) 30 | pdf.close() 31 | --------------------------------------------------------------------------------