├── .gitignore
├── LICENSE
├── README.Rmd
├── README.md
├── bin
    ├── .gitignore
    ├── README.md
    └── init
├── resources
    ├── github_mark.svg
    └── orcid_id.svg
├── sample_data
    └── predict_modality
    │   ├── openproblems_bmmc_cite_starter
    │       ├── openproblems_bmmc_cite_starter.test_mod1.h5ad
    │       ├── openproblems_bmmc_cite_starter.test_mod2.h5ad
    │       ├── openproblems_bmmc_cite_starter.train_mod1.h5ad
    │       └── openproblems_bmmc_cite_starter.train_mod2.h5ad
    │   └── openproblems_bmmc_multiome_starter
    │       ├── openproblems_bmmc_multiome_starter.test_mod1.h5ad
    │       ├── openproblems_bmmc_multiome_starter.test_mod2.h5ad
    │       ├── openproblems_bmmc_multiome_starter.train_mod1.h5ad
    │       └── openproblems_bmmc_multiome_starter.train_mod2.h5ad
└── src
    ├── joint_embedding
        └── methods
        │   ├── Guanlab-dengkw
        │       ├── run
        │       │   ├── config.vsh.yaml
        │       │   └── script.py
        │       └── test.sh
        │   ├── jae
        │       ├── README.md
        │       ├── model_architecture.png
        │       ├── resources
        │       │   └── utils.py
        │       ├── run
        │       │   ├── config.vsh.yaml
        │       │   └── script.py
        │       ├── test.sh
        │       └── train
        │       │   ├── config.vsh.yaml
        │       │   └── script.py
        │   └── lsl_ae
        │       ├── run
        │           ├── config.vsh.yaml
        │           └── script.py
        │       └── test.sh
    ├── match_modality
        └── methods
        │   ├── clue
        │       ├── README.md
        │       ├── clue_architecture.jpg
        │       ├── resources
        │       │   ├── scglue-0.1.1-py3-none-any.whl
        │       │   └── utils.py
        │       ├── run
        │       │   ├── config.vsh.yaml
        │       │   └── script.py
        │       ├── test.sh
        │       └── train
        │       │   ├── config.vsh.yaml
        │       │   └── script.py
        │   └── novel
        │       ├── README.md
        │       ├── novel_architecture1.png
        │       ├── novel_architecture2.png
        │       ├── resources
        │           ├── catalyst_tools.py
        │           ├── config_ADT2GEX.py
        │           ├── config_ATAC2GEX.py
        │           ├── data.py
        │           ├── models.py
        │           ├── postprocessing.py
        │           └── preprocessing.py
        │       ├── run
        │           ├── config.vsh.yaml
        │           └── script.py
        │       ├── test.sh
        │       └── train
        │           ├── config.vsh.yaml
        │           └── script.py
    ├── predict_modality
        └── methods
        │   ├── AXX
        │       ├── .gitignore
        │       ├── README.md
        │       ├── resources
        │       │   ├── const.py
        │       │   ├── models.py
        │       │   ├── predict.py
        │       │   ├── test.py
        │       │   ├── train.py
        │       │   ├── utils.py
        │       │   └── yaml
        │       │   │   ├── mlp_ADT2GEX.yaml
        │       │   │   ├── mlp_ATAC2GEX.yaml
        │       │   │   └── mlp_GEX2ADT.yaml
        │       ├── run
        │       │   ├── config.vsh.yaml
        │       │   └── script.py
        │       ├── test.sh
        │       └── train
        │       │   ├── config.vsh.yaml
        │       │   └── script.py
        │   ├── DANCE
        │       ├── resources
        │       │   ├── baseline.py
        │       │   └── graph_util.py
        │       ├── run
        │       │   ├── config.vsh.yaml
        │       │   └── script.py
        │       ├── test.sh
        │       └── train
        │       │   ├── config.vsh.yaml
        │       │   ├── generate_extra_files.py
        │       │   ├── h.all.v7.4.entrez.gmt
        │       │   ├── h.all.v7.4.symbols.gmt
        │       │   ├── hetero_arg_version_v5.py
        │       │   └── script.sh
        │   ├── Guanlab-dengkw
        │       ├── run
        │       │   ├── config.vsh.yaml
        │       │   └── script.py
        │       └── test.sh
        │   ├── LS_lab
        │       ├── run
        │       │   ├── config.vsh.yaml
        │       │   └── script.py
        │       └── test.sh
        │   ├── cajal
        │       ├── run
        │       │   ├── config.vsh.yaml
        │       │   └── script.py
        │       ├── test.sh
        │       └── train
        │       │   ├── ADT_list_df_updated.csv
        │       │   ├── config.vsh.yaml
        │       │   └── script.py
        │   ├── novel
        │       ├── README.md
        │       ├── novel_architecture.jpg
        │       ├── resources
        │       │   └── helper_functions.py
        │       ├── run
        │       │   ├── config.vsh.yaml
        │       │   └── script.py
        │       ├── test.sh
        │       └── train
        │       │   ├── config.vsh.yaml
        │       │   └── script.py
        │   └── scJoint
        │       ├── .gitignore
        │       ├── README.md
        │       ├── resources
        │           ├── modules
        │           │   └── model_ae.py
        │           ├── opts.py
        │           ├── preprocess
        │           │   ├── save_filter_genes.py
        │           │   ├── save_highlyvar_genes.py
        │           │   └── save_idf_matrix.py
        │           ├── train.py
        │           ├── trainer
        │           │   ├── __init__.py
        │           │   ├── trainer_batchgan.py
        │           │   ├── trainer_cycle.py
        │           │   └── trainer_nn.py
        │           └── utils
        │           │   ├── __init__.py
        │           │   ├── dataloader.py
        │           │   ├── loss.py
        │           │   └── metric.py
        │       ├── run
        │           ├── config.vsh.yaml
        │           └── script.py
        │       ├── test.sh
        │       └── train
        │           ├── config.vsh.yaml
        │           └── train.sh
    ├── resources
        ├── nextflow.config
        └── nextflow_moremem.config
    └── sync_datasets.sh


/.gitignore:
--------------------------------------------------------------------------------
1 | output
2 | *.pyc
3 | target
4 | work
5 | .nextflow*
6 | log.txt
7 | README.html
8 | bin/build_for_release.sh
9 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Open Problems in Single-Cell Analysis
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/bin/.gitignore:
--------------------------------------------------------------------------------
1 | fetch
2 | viash*
3 | nextflow
4 | 


--------------------------------------------------------------------------------
/bin/README.md:
--------------------------------------------------------------------------------
1 | These executables were generated by running the `bin/init` executable.
2 | 


--------------------------------------------------------------------------------
/bin/init:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Check Java installed
 4 | if ! command -v java --version &> /dev/null
 5 | then
 6 |     echo "Please ensure Java Runtime ≥8 is installed. You can find an open source installer here: https://adoptopenjdk.net/?variant=openjdk8&jvmVariant=hotspot"
 7 |     exit
 8 | fi
 9 | 
10 | # Check Docker installed
11 | if ! command -v docker --version &> /dev/null
12 | then
13 |     echo "Please ensure Docker is installed and up-to-date. Instructions at https://www.docker.com/get-started"
14 |     exit
15 | fi
16 | 
17 | # get the root of the directory
18 | REPO_ROOT=$(git rev-parse --show-toplevel)
19 | 
20 | # ensure that the command below is run from the root of the repository
21 | cd "$REPO_ROOT"
22 | 
23 | curl -fsSL http://get.viash.io | bash -s -- \
24 |   --viash bin/viash \
25 |   --registry openproblems \
26 |   --tag 0.5.5 \
27 |   --log check_results/results.tsv \
28 |   --config_mod '.platforms[.type == "nextflow"].separate_multiple_outputs := false' \
29 |   --config_mod '.platforms[.type == "nextflow"].directive_memory := "10GB"' \
30 |   --config_mod '.platforms[.type == "nextflow"].directive_time := "10 m"'
31 | 
32 | cd bin
33 | 
34 | curl -s https://get.nextflow.io | bash
35 | 


--------------------------------------------------------------------------------
/resources/github_mark.svg:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
 2 | <svg
 3 |    xmlns:dc="http://purl.org/dc/elements/1.1/"
 4 |    xmlns:cc="http://creativecommons.org/ns#"
 5 |    xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
 6 |    xmlns:svg="http://www.w3.org/2000/svg"
 7 |    xmlns="http://www.w3.org/2000/svg"
 8 |    version="1.1"
 9 |    id="svg2"
10 |    xml:space="preserve"
11 |    width="46"
12 |    height="46"
13 |    viewBox="0 0 46 46"><metadata
14 |      id="metadata8"><rdf:RDF><cc:Work
15 |          rdf:about=""><dc:format>image/svg+xml</dc:format><dc:type
16 |            rdf:resource="http://purl.org/dc/dcmitype/StillImage" /><dc:title></dc:title></cc:Work></rdf:RDF></metadata><defs
17 |      id="defs6" /><g
18 |      id="g10"
19 |      transform="matrix(1.3333333,0,0,-1.3333333,315.90273,489.92247)"><g
20 |        id="g12"
21 |        transform="scale(0.1)"><path
22 |          d="m -2024.2705,3329.4186 h -345 v 345 h 345 v -345"
23 |          style="fill:#231f20;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:0.0901238"
24 |          id="path14" /><path
25 |          d="m -2196.7855,3660.7929 c -89.94,0 -162.88,-72.93 -162.88,-162.9 0,-71.97 46.67,-133.02 111.4,-154.56 8.15,-1.5 11.12,3.53 11.12,7.85 0,3.87 -0.14,14.11 -0.22,27.7 -45.31,-9.84 -54.87,21.84 -54.87,21.84 -7.41,18.82 -18.09,23.83 -18.09,23.83 -14.79,10.1 1.12,9.9 1.12,9.9 16.35,-1.15 24.95,-16.79 24.95,-16.79 14.53,-24.89 38.13,-17.7 47.41,-13.53 1.48,10.52 5.69,17.7 10.34,21.77 -36.17,4.11 -74.2,18.09 -74.2,80.51 0,17.78 6.35,32.33 16.77,43.71 -1.68,4.12 -7.27,20.69 1.6,43.11 0,0 13.67,4.38 44.79,-16.7 12.99,3.62 26.93,5.42 40.78,5.49 13.84,-0.07 27.77,-1.87 40.78,-5.49 31.1,21.08 44.75,16.7 44.75,16.7 8.89,-22.42 3.3,-38.99 1.63,-43.11 10.44,-11.38 16.74,-25.93 16.74,-43.71 0,-62.58 -38.09,-76.35 -74.37,-80.38 5.84,-5.03 11.05,-14.97 11.05,-30.16 0,-21.78 -0.2,-39.35 -0.2,-44.69 0,-4.36 2.94,-9.43 11.2,-7.84 64.68,21.59 111.31,82.6 111.31,154.55 0,89.97 -72.94,162.9 -162.91,162.9"
26 |          style="fill:#ffffff;fill-opacity:1;fill-rule:evenodd;stroke:none"
27 |          id="path106" /></g></g></svg>
28 | 


--------------------------------------------------------------------------------
/resources/orcid_id.svg:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <!-- Generator: Adobe Illustrator 19.1.0, SVG Export Plug-In . SVG Version: 6.00 Build 0)  -->
 3 | <svg version="1.1" id="Layer_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px" viewBox="0 0 256 256" style="enable-background:new 0 0 256 256;" xml:space="preserve">
 4 | <style type="text/css">
 5 | 	.st0{fill:#A6CE39;}
 6 | 	.st1{fill:#FFFFFF;}
 7 | </style>
 8 | <path class="st0" d="M256,128c0,70.7-57.3,128-128,128C57.3,256,0,198.7,0,128C0,57.3,57.3,0,128,0C198.7,0,256,57.3,256,128z"/>
 9 | <g>
10 | 	<path class="st1" d="M86.3,186.2H70.9V79.1h15.4v48.4V186.2z"/>
11 | 	<path class="st1" d="M108.9,79.1h41.6c39.6,0,57,28.3,57,53.6c0,27.5-21.5,53.6-56.8,53.6h-41.8V79.1z M124.3,172.4h24.5   c34.9,0,42.9-26.5,42.9-39.7c0-21.5-13.7-39.7-43.7-39.7h-23.7V172.4z"/>
12 | 	<path class="st1" d="M88.7,56.8c0,5.5-4.5,10.1-10.1,10.1c-5.6,0-10.1-4.6-10.1-10.1c0-5.6,4.5-10.1,10.1-10.1   C84.2,46.7,88.7,51.3,88.7,56.8z"/>
13 | </g>
14 | </svg>


--------------------------------------------------------------------------------
/sample_data/predict_modality/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.test_mod1.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/sample_data/predict_modality/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.test_mod1.h5ad


--------------------------------------------------------------------------------
/sample_data/predict_modality/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.test_mod2.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/sample_data/predict_modality/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.test_mod2.h5ad


--------------------------------------------------------------------------------
/sample_data/predict_modality/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.train_mod1.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/sample_data/predict_modality/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.train_mod1.h5ad


--------------------------------------------------------------------------------
/sample_data/predict_modality/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.train_mod2.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/sample_data/predict_modality/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.train_mod2.h5ad


--------------------------------------------------------------------------------
/sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod1.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod1.h5ad


--------------------------------------------------------------------------------
/sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod2.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod2.h5ad


--------------------------------------------------------------------------------
/sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod1.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod1.h5ad


--------------------------------------------------------------------------------
/sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod2.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod2.h5ad


--------------------------------------------------------------------------------
/src/joint_embedding/methods/Guanlab-dengkw/run/config.vsh.yaml:
--------------------------------------------------------------------------------
 1 | functionality:
 2 |   name: guanlab_dengkw_je
 3 |   namespace: joint_embedding_methods
 4 | 
 5 |   # metadata for your method
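 6 |   description: Reduces each modality with truncated SVD, standardises the components of each cell, and concatenates the two embeddings (100 dimensions in total).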
 7 |   info:
 8 |     method_label: "Guanlab-dengkw"
 9 |     submission_id: "170795"
10 |     team_name: Guanlab-dengkw
11 |     # project_url: https://github.com/foo/bar
12 |     # publication_doi: 10.1101/0123.45.67.890123
13 |     # publication_url: https://arxiv.org/abs/1234.56789
14 | 
15 |   authors:
16 |     - name: Kaiwen Deng
17 |       email: dengkw@umich.edu
18 |       roles: [ author, maintainer ]
19 |       props: { github: nonztalk }
20 | 
21 |   # parameters
22 |   arguments:
23 |     # required inputs
24 |     - name: "--input_mod1"
25 |       type: "file"
26 |       example: "dataset_mod1.h5ad"
27 |       description: Modality 1 dataset.
28 |       required: true
29 |     - name: "--input_mod2"
30 |       type: "file"
31 |       example: "dataset_mod2.h5ad"
32 |       description: Modality 2 dataset.
33 |       required: true
34 |     # required outputs
35 |     - name: "--output"
36 |       type: "file"
37 |       direction: "output"
38 |       example: "output.h5ad"
39 |       description: Data for all cells in mod1 and mod2 embedded to ≤100 dimensions.
40 |       required: true
41 | 
42 | 
43 |   # files your script needs
44 |   resources:
45 |     - type: python_script
46 |       path: script.py
47 |   
48 | # target platforms
49 | platforms:
50 |   - type: docker
51 |     image: dataintuitive/randpy:py3.8
52 |     setup:
53 |       - type: python
54 |         packages:
55 |           - anndata
56 |           - umap-learn
57 |   - type: nextflow
58 |     labels: [ vhighmem, hightime, vhighcpu ]
59 | 


--------------------------------------------------------------------------------
/src/joint_embedding/methods/Guanlab-dengkw/run/script.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import anndata as ad
 3 | import numpy as np
 4 | 
 5 | from sklearn.decomposition import TruncatedSVD
 6 | 
 7 | logging.basicConfig(level=logging.INFO)
 8 | 
## VIASH START
# Development-time defaults for the two dictionaries the rest of this script
# reads: `par` (input/output file paths) and `meta` (component metadata).
# NOTE(review): presumably viash replaces everything between the
# VIASH START / VIASH END markers with the real command-line values when the
# component is built -- confirm against the viash documentation.
dataset_path = "output/datasets/joint_embedding/openproblems_bmmc_cite_phase2/openproblems_bmmc_cite_phase2.censor_dataset.output_"

par = {
    'input_mod1': f'{dataset_path}mod1.h5ad',
    'input_mod2': f'{dataset_path}mod2.h5ad',
    'output': 'output.h5ad'
}
meta = {
    'resources_dir': '.',
    # NOTE(review): config.vsh.yaml names this component `guanlab_dengkw_je`;
    # this default still carries the submission id.
    'functionality_name': 'submission_170795'
}
## VIASH END
22 | 
def normalize(arr):
    """Standardise every row of a 2-D array to zero mean and unit variance.

    Parameters
    ----------
    arr : numpy.ndarray
        2-D array; statistics are computed along axis 1 (per row).

    Returns
    -------
    numpy.ndarray
        Array of the same shape where each row has had its mean subtracted
        and has been divided by its (population) standard deviation. Rows
        with zero standard deviation are returned as all zeros instead of
        NaN/inf.
    """
    arr_mean = np.mean(arr, axis=1, keepdims=True)
    arr_sd = np.std(arr, axis=1, keepdims=True)
    # A constant row has sd == 0; dividing by it would yield NaN. Since the
    # centred row is already all zeros, dividing by 1 keeps it at zero.
    arr_sd = np.where(arr_sd == 0, 1.0, arr_sd)
    return (arr - arr_mean) / arr_sd
27 | 
# ---------------------------------------------------------------------------
# Main flow: read both modalities, reduce each with truncated SVD, standardise
# the components per cell, concatenate them and write the joint embedding.
# ---------------------------------------------------------------------------
logging.info('Reading `h5ad` files...')
ad_mod1 = ad.read_h5ad(par['input_mod1'])
ad_mod2 = ad.read_h5ad(par['input_mod2'])

logging.info('Determine parameters by the modalities')
# `.iloc[0]` selects the first feature by position. Plain `[0]` on a
# label-indexed Series relies on a deprecated positional fallback.
mod1_type = ad_mod1.var.feature_types.iloc[0].upper()
mod2_type = ad_mod2.var.feature_types.iloc[0].upper()

# Number of SVD components per modality, keyed by (mod1 type, mod2 type).
# Every pair sums to 100 components in the final embedding.
components_by_modalities = {
    ("GEX", "ADT"): (73, 27),
    ("ADT", "GEX"): (27, 73),
    ("GEX", "ATAC"): (38, 62),
    ("ATAC", "GEX"): (62, 38),
}
# Unknown modality pairs fall back to an even split.
n_mod1, n_mod2 = components_by_modalities.get((mod1_type, mod2_type), (50, 50))

logging.info('Performing dimensionality reduction on modality 1 values...')
embedder_mod1 = TruncatedSVD(n_components=n_mod1)
mod1_pca = embedder_mod1.fit_transform(ad_mod1.X)
# Keep only what is needed for the output so the full matrix can be freed.
mod1_obs = ad_mod1.obs
mod1_uns = ad_mod1.uns
del ad_mod1

logging.info('Performing dimensionality reduction on modality 2 values...')
embedder_mod2 = TruncatedSVD(n_components=n_mod2)
mod2_pca = embedder_mod2.fit_transform(ad_mod2.X)
del ad_mod2

logging.info('Concatenating datasets')
pca_combined = np.concatenate([normalize(mod1_pca), normalize(mod2_pca)], axis=1)

logging.info('Storing output to file')
adata = ad.AnnData(
    X=pca_combined,
    obs=mod1_obs,
    uns={
        'dataset_id': mod1_uns['dataset_id'],
        'method_id': meta['functionality_name'],
    },
)
adata.write_h5ad(par['output'], compression="gzip")
81 | 


--------------------------------------------------------------------------------
/src/joint_embedding/methods/Guanlab-dengkw/test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash"
 4 | export NXF_VER=21.04.1
 5 | export PIPELINE_VERSION=1.4.0
 6 | method_id=submission_170795
 7 | task_id=joint_embedding
 8 | 
 9 | # CITE
10 | dataset_id=openproblems_bmmc_cite_phase2
11 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
12 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
13 | 
14 | target/docker/${task_id}_methods/${method_id}/${method_id} \
15 |   --input_mod1 ${dataset_path}.output_mod1.h5ad \
16 |   --input_mod2 ${dataset_path}.output_mod2.h5ad \
17 |   --output ${pred_path}.${method_id}.output.h5ad
18 | 
19 | # MULTIOME
20 | dataset_id=openproblems_bmmc_multiome_phase2
21 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
22 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
23 | 
24 | target/docker/${task_id}_methods/${method_id}/${method_id} \
25 |   --input_mod1 ${dataset_path}.output_mod1.h5ad \
26 |   --input_mod2 ${dataset_path}.output_mod2.h5ad \
27 |   --output ${pred_path}.${method_id}.output.h5ad
28 | 
29 | 
30 | # RUN EVALUATION
31 | bin/nextflow run "$PIPELINE_REPO" \
32 |   -r "$PIPELINE_VERSION" \
33 |   -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \
34 |   --solutionDir "output/datasets/$task_id" \
35 |   --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \
36 |   --publishDir "output/evaluation/$task_id/$method_id/" \
37 |   -latest \
38 |   -resume \
39 |   -c "src/resources/nextflow_moremem.config"
40 | 
41 | cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json"


--------------------------------------------------------------------------------
/src/joint_embedding/methods/jae/README.md:
--------------------------------------------------------------------------------
 1 | # Single cell joint embedding with an autoencoder (JAE)
 2 | 
 3 | **Team**: Amateur
 4 | 
 5 | **Team members**: Qiao Liu, Wanwen Zeng, Chencheng Xu
 6 | 
 7 | **Project URL**: https://github.com/kimmo1019/JAE
 8 | 
 9 | <img src="model_architecture.png" width="70%">
10 | 
11 | In brief, we built an autoencoder for joint embedding (JAE). Each modality is first SVD-transformed, and the results are concatenated (denoted as x). The major difference from a standard autoencoder is that we incorporate information from cell annotations (e.g., cell label, cell cycle score, and cell batch) to constrain the structure of the latent features. We want some latent features (c) to predict the cell type and some to predict the cell cycle score. Notably, we want the batch feature (b) to predict the batch label as randomly as possible, to potentially eliminate the batch effect. The remaining features (z) are left unconstrained to preserve the flexibility of the neural network.
12 | 
13 | In the pretraining stage, JAE is trained on the exploration data, for which cell annotations (cell type, cell cycle phase score) are available. In the test stage, where cell annotations are not available, we only minimize the reconstruction loss of the autoencoder, using a smaller learning rate (fine-tuning).
14 | 
15 | 
16 | Feel free to contact `liuqiao@stanford.edu` if you have any problems with the JAE model.
17 | 
18 | 
19 | 
20 | 
21 | 
22 | 


--------------------------------------------------------------------------------
/src/joint_embedding/methods/jae/model_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/src/joint_embedding/methods/jae/model_architecture.png


--------------------------------------------------------------------------------
/src/joint_embedding/methods/jae/resources/utils.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Keras components for the JAE method: an early-stopping callback and the joint-embedding autoencoder model.
  3 | """
  4 | 
  5 | import numpy as np
  6 | import tensorflow as tf
  7 | 
  8 | class EarlyStoppingAtMinLoss(tf.keras.callbacks.Callback):
  9 |     def __init__(self, patience=0):
 10 |         super(EarlyStoppingAtMinLoss, self).__init__()
 11 |         self.patience = patience
 12 |         self.best_weights = None
 13 | 
 14 |     def on_train_begin(self, logs=None):
 15 |         self.wait = 0
 16 |         self.stopped_epoch = 0
 17 |         self.best = np.Inf
 18 | 
 19 |     def on_epoch_end(self, epoch, logs=None):
 20 |         current = logs.get("val_loss")
 21 |         if np.less(current, self.best):
 22 |             self.best = current
 23 |             self.wait = 0
 24 |             self.best_weights = self.model.get_weights()
 25 |         else:
 26 |             self.wait += 1
 27 |             if self.wait >= self.patience:
 28 |                 self.stopped_epoch = epoch
 29 |                 self.model.stop_training = True
 30 |                 print("Restoring model weights from the end of the best epoch.")
 31 |                 self.model.set_weights(self.best_weights)
 32 | 
 33 |     def on_train_end(self, logs=None):
 34 |         if self.stopped_epoch > 0:
 35 |             print("Epoch %05d: early stopping" % (self.stopped_epoch + 1))
 36 | 
 37 | 
class JointEmbeddingModel(tf.keras.Model):
    """Autoencoder whose latent vector is partly exposed as classifier outputs.

    The model is assembled from three sub-models built in `__init__`: an
    encoder, a decoder and a "classifier" that only slices the latent vector.

    `params` is a dict; the keys read by this class are:
      - 'dim': size of the input (and of the reconstructed output),
      - 'hidden_units': list of layer widths; the last entry is the latent size,
      - 'use_batch': selects which outputs `call` returns,
      - 'nb_cell_types', 'nb_batches', 'nb_phases': widths of the three
        consecutive latent slices returned by the classifier.
    """
    def __init__(self, params, name=None):
        """Store `params` and build the encoder, decoder and classifier."""
        super(JointEmbeddingModel, self).__init__(name=name)
        self.params = params
        # The encoder is always built with the default use_resnet=True.
        self.encoder = self.create_encoder()
        self.decoder = self.create_decoder()
        self.classifier = self.create_classifier()

    def get_config(self):
        """Return the constructor configuration (only `params`)."""
        return {
                "params": self.params,
        }
    def call(self, inputs, training):
        """Encode `inputs`, reconstruct them and slice the latent vector.

        Returns `(decoded, cell_type, batch, phase)` when params['use_batch']
        is truthy, otherwise `(decoded, cell_type)`.

        `training` is accepted but not referenced in this method body.
        """
        encoded = self.encoder(inputs)
        decoded = self.decoder(encoded)
        digits_cell_type, digits_batch, digits_phase = self.classifier(encoded)
        if self.params['use_batch']:
            return decoded, digits_cell_type, digits_batch, digits_phase
        else:
            # Batch and phase slices are computed above but not returned here.
            return decoded, digits_cell_type

    def create_encoder(self, use_resnet=True):
        """Build the encoder mapping params['dim'] inputs to the latent vector.

        One stage is created per entry of hidden_units[:-1], followed by a
        final ReLU Dense layer of width hidden_units[-1]. With `use_resnet`
        each stage has a skip connection; otherwise it is a plain
        Dense -> Dropout -> BatchNormalization stack.

        NOTE(review): if hidden_units has a single entry the loop body never
        runs and `x` is unbound when the final Dense layer is applied.
        """
        if use_resnet:
            inputs = tf.keras.layers.Input(shape=(self.params['dim'],))
            for i, n_unit in enumerate(self.params['hidden_units'][:-1]):
                # x_init is the stage input after a ReLU Dense projection; it
                # is also the skip branch added back below.
                if i==0:
                    x_init = tf.keras.layers.Dense(n_unit, activation='relu')(inputs)
                else:
                    x_init = tf.keras.layers.Dense(n_unit, activation='relu')(x)
                x = tf.keras.layers.Dropout(0.1)(x_init)
                x = tf.keras.layers.BatchNormalization()(x)
                # Linear Dense (no activation) before the residual addition.
                x = tf.keras.layers.Dense(n_unit)(x)
                x = tf.keras.layers.Add()([x,x_init])
                x = tf.keras.layers.Activation(activation='relu')(x)
            encoded = tf.keras.layers.Dense(self.params['hidden_units'][-1], activation='relu')(x)
        else:
            inputs = tf.keras.layers.Input(shape=(self.params['dim'],))
            for i, n_unit in enumerate(self.params['hidden_units'][:-1]):
                if i==0:
                    x = tf.keras.layers.Dense(n_unit, activation='relu')(inputs)
                else:
                    x = tf.keras.layers.Dense(n_unit, activation='relu')(x)
                x = tf.keras.layers.Dropout(0.1)(x)
                x = tf.keras.layers.BatchNormalization()(x)
            encoded = tf.keras.layers.Dense(self.params['hidden_units'][-1], activation='relu')(x)
        return tf.keras.Model(inputs=inputs, outputs=encoded, name='encoder')

    def create_decoder(self):
        """Build the decoder mapping the latent vector back to params['dim'].

        Applies ReLU Dense layers with the widths of hidden_units[:-1] in
        reverse order, then a ReLU Dense layer of width params['dim'].
        """
        inputs = tf.keras.layers.Input(shape=(self.params['hidden_units'][-1],))
        for i, n_unit in enumerate(self.params['hidden_units'][:-1][::-1]):
            if i==0:
                x = tf.keras.layers.Dense(n_unit, activation='relu')(inputs)
            else:
                x = tf.keras.layers.Dense(n_unit, activation='relu')(x)
        # NOTE(review): the ReLU output makes reconstructions non-negative;
        # confirm this matches the range of the training inputs.
        decoded = tf.keras.layers.Dense(self.params['dim'], activation='relu')(x)
        return tf.keras.Model(inputs=inputs, outputs=decoded, name='decoder')

    def create_classifier(self):
        """Build a layer-free model that slices the latent vector in three.

        The leading nb_cell_types columns are the cell-type outputs, the next
        nb_batches columns the batch outputs, and the following nb_phases
        columns the phase outputs. Any remaining latent columns are not
        returned by this model.
        """
        inputs = tf.keras.layers.Input(shape=(self.params['hidden_units'][-1],))
        digits_cell_type = inputs[:,:self.params['nb_cell_types']]
        digits_batch = inputs[:,self.params['nb_cell_types']:(self.params['nb_cell_types']+self.params['nb_batches'])]
        digits_phase = inputs[:,(self.params['nb_cell_types']+self.params['nb_batches']):(self.params['nb_cell_types']+self.params['nb_batches']+self.params['nb_phases'])]
        return tf.keras.Model(inputs=inputs, outputs=[digits_cell_type, digits_batch, digits_phase], name='classifier')
101 | 
102 | 


--------------------------------------------------------------------------------
/src/joint_embedding/methods/jae/run/config.vsh.yaml:
--------------------------------------------------------------------------------
 1 | functionality:
 2 |   name: jae
 3 |   namespace: joint_embedding_methods
 4 | 
 5 |   # metadata for your method
 6 |   description: In brief, we built an autoencoder for joint embedding (JAE). Each modality is first SVD-transformed, and the results are concatenated. The major difference from a standard autoencoder is that we incorporate information from cell annotations (e.g., cell label, cell cycle score, and cell batch) to constrain the structure of the latent features. We want some latent features to predict the cell type and some to predict the cell cycle score. Notably, we want the feature corresponding to the batch effect to predict the batch label as randomly as possible, to potentially eliminate the batch effect. Several further nodes are left unconstrained to preserve the flexibility of the neural network.
 7 |   info:
 8 |     method_label: "JAE"
 9 |     submission_id: "170936/171079"
10 |     team_name: Amateur
11 |     project_url: https://github.com/kimmo1019/JAE
12 | 
13 |   authors:
14 |     - name: Qiao Liu 
15 |       email: liuqiao@stanford.edu
16 |       roles: [ author, maintainer ]
17 |       props: { github: kimmo1019, orcid: "0000-0002-9781-3360", url: "http://liuqiao.me" }
18 |     - name: Wanwen Zeng
19 |       email: wanwen@stanford.edu
20 |       roles: [ author ]
21 |       props: { github: wanwenzeng, orcid: "0000-0003-3426-0890", url: "https://scholar.google.com/citations?user=MbeOhkgAAAAJ&hl=zh-CN" }
22 |     - name: Chencheng Xu
23 |       roles: [ author ]
24 |       props: { github: Zoesgithub, orcid: "0000-0002-2262-6966" }
25 | 
26 |   # parameters
27 |   arguments:
28 |     # required inputs
29 |     - name: "--input_mod1"
30 |       type: "file"
31 |       example: "dataset_mod1.h5ad"
32 |       description: Modality 1 dataset.
33 |       required: true
34 |     - name: "--input_mod2"
35 |       type: "file"
36 |       example: "dataset_mod2.h5ad"
37 |       description: Modality 2 dataset.
38 |       required: true
39 |     - name: "--input_pretrain"
40 |       type: "file"
41 |       example: "pretrain_model"
42 |       description: Path to the directory containing a pretrained model.
43 |       required: true
44 |     # required outputs
45 |     - name: "--output"
46 |       type: "file"
47 |       direction: "output"
48 |       example: "output.h5ad"
49 |       description: Data for all cells in mod1 and mod2 embedded to ≤100 dimensions.
50 |       required: true
51 | 
52 |   # files your script needs
53 |   resources:
54 |     - type: python_script
55 |       path: script.py
56 |     - path: '../resources/utils.py'
57 | 
58 | # target platforms
59 | platforms:
60 |   - type: docker
61 |     image: tensorflow/tensorflow:latest-gpu
62 |     run_args: [ "--gpus all" ]
63 |     setup:
64 |       - type: python
65 |         packages:
66 |           - anndata
67 |           - umap-learn
68 |           - scanpy
69 |   - type: nextflow
70 |     labels: [ vhighmem, vhightime, vhighcpu, gpu ]
71 | 


--------------------------------------------------------------------------------
/src/joint_embedding/methods/jae/run/script.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import logging
  4 | import json
  5 | import anndata as ad
  6 | import numpy as np
  7 | from sklearn.preprocessing import normalize
  8 | import tensorflow as tf
  9 | import pickle as pk
 10 | import scipy
 11 | 
 12 | logging.basicConfig(level=logging.INFO)
 13 | 
 14 | ## VIASH START
 15 | dataset_path = 'sample_data/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.'
 16 | # Development defaults for the script parameters (NOTE(review): presumably replaced by viash at build time — confirm).
 17 | par = {
 18 |     'input_mod1': dataset_path + 'mod1.h5ad',
 19 |     'input_mod2': dataset_path + 'mod2.h5ad',
 20 |     'input_pretrain': '...',  # placeholder: directory holding svd_mod1.pkl, hyperparams.json and weights.h5 (read below)
 21 |     'output': 'output.h5ad',
 22 | }
 23 | # `resources_dir` is appended to sys.path below; `functionality_name` is written as `method_id` into the output file.
 24 | meta = { 'resources_dir': '.', 'functionality_name': 'submission_171079' }
 25 | ## VIASH END
 26 | 
 27 | sys.path.append(meta['resources_dir'])
 28 | from utils import JointEmbeddingModel
 29 | 
 30 | logging.info('Reading `h5ad` files...')
 31 | ad_mod1 = ad.read_h5ad(par['input_mod1'])
 32 | ad_mod2 = ad.read_h5ad(par['input_mod2'])
 33 | mod1_obs = ad_mod1.obs
 34 | mod1_uns = ad_mod1.uns
 35 | 
 36 | ad_mod2_var = ad_mod2.var
 37 | 
 38 | mod_type = ad_mod2_var['feature_types'][0]
 39 | 
 40 | mod1_mat = ad_mod1.layers["counts"]
 41 | mod2_mat = ad_mod2.layers["counts"]
 42 | 
 43 | del ad_mod2, ad_mod1
 44 | 
 45 | if mod_type == 'ATAC':
 46 |     mod1_svd = pk.load(open(os.path.join(par['input_pretrain'], 'svd_mod1.pkl'),'rb'))
 47 |     mod2_svd = pk.load(open(os.path.join(par['input_pretrain'], 'svd_mod2.pkl'),'rb'))
 48 | else:
 49 |     mod1_svd = pk.load(open(os.path.join(par['input_pretrain'], 'svd_mod1.pkl'),'rb'))
 50 |     mod2_svd = None
 51 | 
 52 | def svd_transform(mod1_data, mod2_data, mod1_svd, mod2_svd, scale=1e4):
 53 |     mod1_data = scale * normalize(mod1_data, norm='l1', axis=1)
 54 |     mod2_data = scale * normalize(mod2_data, norm='l1', axis=1)
 55 |     mod1_data = scipy.sparse.csr_matrix.log1p(mod1_data) / np.log(10)
 56 |     mod2_data = scipy.sparse.csr_matrix.log1p(mod2_data) / np.log(10)
 57 |     pca_data_mod1 = mod1_svd.transform(mod1_data)
 58 | 
 59 |     if mod_type == 'ADT':
 60 |         pca_data_mod2 = mod2_data.toarray()
 61 |     else:
 62 |         pca_data_mod2 = mod2_svd.transform(mod2_data)
 63 |     return pca_data_mod1, pca_data_mod2
 64 | 
 65 | mod1_pca, mod2_pca = svd_transform(mod1_mat, mod2_mat, mod1_svd, mod2_svd)
 66 | # The raw count matrices are no longer needed once projected.
 67 | del mod1_mat, mod2_mat
 68 | # Model input: both projections concatenated column-wise.
 69 | pca_combined = np.concatenate([mod1_pca, mod2_pca],axis=1)
 70 | del mod1_pca, mod2_pca
 71 | # Number of fine-tuning epochs: two when mod2 is ATAC, one otherwise.
 72 | if mod_type == 'ATAC':
 73 |     epochs = 2
 74 | else:
 75 |     epochs = 1
 76 | # Loss weights, in the order of the `loss` list below: only the first (reconstruction MSE) term is active.
 77 | coeff = [1.0, 0.0, 0.0, 0.0]
 78 | 
 79 | with open(os.path.join(par['input_pretrain'], 'hyperparams.json'), 'r') as file:
 80 |      params = json.load(file)
 81 | # Build the model and call it once on a dummy batch (presumably so the weights exist before load_weights — confirm).
 82 | mymodel = JointEmbeddingModel(params)
 83 | mymodel(np.zeros((2, params['dim'])))
 84 | 
 85 | mymodel.compile(tf.keras.optimizers.Adam(learning_rate = params["lr"]), 
 86 |             loss = [tf.keras.losses.MeanSquaredError(), 
 87 |                     tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
 88 |                     tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
 89 |                     tf.keras.losses.MeanSquaredError()
 90 |                     ],
 91 |             loss_weights=coeff, run_eagerly=True)
 92 | 
 93 | # Load the pretrained weights from the pretrain directory
 94 | mymodel.load_weights(os.path.join(par['input_pretrain'], 'weights.h5'))
 95 | 
 96 | # Targets: the input itself plus random integer labels for the three outputs whose losses are weighted 0.
 97 | X_train = pca_combined
 98 | c_fakes = np.random.randint(low=0, high=params['nb_cell_types'],size=pca_combined.shape[0])
 99 | b_fakes = np.random.randint(low=0, high=params['nb_batches'],size=pca_combined.shape[0])
100 | p_fakes = np.random.randint(low=0, high=params['nb_phases'],size=pca_combined.shape[0])
101 | Y_train = [pca_combined, c_fakes, b_fakes, p_fakes]
102 | 
103 | # Fine-tune on the test data (the same data that is embedded afterwards)
104 | mymodel.fit(x=X_train, y=Y_train,
105 |             epochs = epochs,
106 |             batch_size = 32,
107 |             shuffle=True)
108 | 
109 | embeds = mymodel.encoder.predict(pca_combined)
110 | print(embeds.shape)
111 | 
112 | adata = ad.AnnData(
113 |     X=embeds,
114 |     obs=mod1_obs,
115 | 	uns={
116 |         'dataset_id': mod1_uns['dataset_id'],
117 |         'method_id': meta['functionality_name'],
118 |     },
119 | )
120 | adata.write_h5ad(par['output'], compression="gzip")
121 | 


--------------------------------------------------------------------------------
/src/joint_embedding/methods/jae/test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # End-to-end check for the JAE method: train and predict on both datasets, then score the predictions.
 3 | export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash"
 4 | export NXF_VER=21.04.1
 5 | export PIPELINE_VERSION=1.4.0
 6 | method_id=jae
 7 | task_id=joint_embedding
 8 | 
 9 | # CITE: dataset locations and output paths
10 | dataset_id=openproblems_bmmc_cite_phase2
11 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
12 | dataset_train_path=output/datasets_phase2_public/$task_id/$dataset_id/$dataset_id.censor_dataset
13 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
14 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
15 | # Train. NOTE(review): --input_sol is not declared in train/config.vsh.yaml — confirm the component accepts it.
16 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
17 |   --input_mod1 ${dataset_train_path}.output_mod1.h5ad \
18 |   --input_mod2 ${dataset_train_path}.output_mod2.h5ad \
19 |   --input_explore_mod1 output/datasets_explore/cite/cite_gex_processed_training.h5ad \
20 |   --input_explore_mod2 output/datasets_explore/cite/cite_adt_processed_training.h5ad \
21 |   --input_sol ${dataset_path}.output_sol.h5ad \
22 |   --output_pretrain ${pretrain_path}
23 | # Predict with the model written to ${pretrain_path}
24 | target/docker/${task_id}_methods/${method_id}/${method_id} \
25 |   --input_mod1 ${dataset_path}.output_mod1.h5ad \
26 |   --input_mod2 ${dataset_path}.output_mod2.h5ad \
27 |   --input_pretrain ${pretrain_path} \
28 |   --output ${pred_path}.${method_id}.output.h5ad
29 | 
30 | # MULTIOME: same train/predict steps with the multiome paths
31 | dataset_id=openproblems_bmmc_multiome_phase2
32 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
33 | dataset_train_path=output/datasets_phase2_public/$task_id/$dataset_id/$dataset_id.censor_dataset
34 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
35 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
36 | # Train (see the --input_sol note in the CITE section)
37 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
38 |   --input_mod1 ${dataset_train_path}.output_mod1.h5ad \
39 |   --input_mod2 ${dataset_train_path}.output_mod2.h5ad \
40 |   --input_explore_mod1 output/datasets_explore/multiome/multiome_gex_processed_training.h5ad \
41 |   --input_explore_mod2 output/datasets_explore/multiome/multiome_atac_processed_training.h5ad \
42 |   --input_sol ${dataset_path}.output_sol.h5ad \
43 |   --output_pretrain ${pretrain_path}
44 | # Predict
45 | target/docker/${task_id}_methods/${method_id}/${method_id} \
46 |   --input_mod1 ${dataset_path}.output_mod1.h5ad \
47 |   --input_mod2 ${dataset_path}.output_mod2.h5ad \
48 |   --input_pretrain ${pretrain_path} \
49 |   --output ${pred_path}.${method_id}.output.h5ad
50 | 
51 | # RUN EVALUATION: score every *.${method_id}.output.h5ad prediction against output/datasets/$task_id
52 | bin/nextflow run "$PIPELINE_REPO" \
53 |   -r "$PIPELINE_VERSION" \
54 |   -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \
55 |   --solutionDir "output/datasets/$task_id" \
56 |   --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \
57 |   --publishDir "output/evaluation/$task_id/$method_id/" \
58 |   -latest \
59 |   -resume \
60 |   -c "src/resources/nextflow_moremem.config"
61 | # Print the final scores
62 | cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json"


--------------------------------------------------------------------------------
/src/joint_embedding/methods/jae/train/config.vsh.yaml:
--------------------------------------------------------------------------------
 1 | functionality:
 2 |   name: jae_train
 3 |   namespace: joint_embedding_methods
 4 | 
 5 |   # metadata for your method
 6 |   description: In brief, we built an autoencoder for joint embedding (JAE). Each modality is first SVD-transformed and the results are concatenated. The major difference from a standard AE is that we incorporate information from cell annotations (e.g., cell label, cell cycle score, and cell batch) to constrain the structure of the latent features. We want some latent features to predict the cell type information and some to predict the cell cycle score. Notably, for the features corresponding to the batch effect, we want them to predict the batch label as randomly as possible, to potentially eliminate the batch effect. There are also several nodes that have no constraint at all, to ensure the flexibility of the neural network.
 7 |   authors:
 8 |     - name: Qiao Liu 
 9 |       email: liuqiao@stanford.edu
10 |       roles: [ author, maintainer ]
11 |       props: { github: kimmo1019, orcid: "0000-0002-9781-3360", url: "http://liuqiao.me" }
12 |     - name: Wanwen Zeng
13 |       email: wanwen@stanford.edu
14 |       roles: [ author ]
15 |       props: { github: wanwenzeng, orcid: "0000-0003-3426-0890", url: "https://scholar.google.com/citations?user=MbeOhkgAAAAJ&hl=zh-CN" }
16 |     - name: Chencheng Xu
17 |       roles: [ author ]
18 |       props: { github: Zoesgithub, orcid: "0000-0002-2262-6966" }
19 | 
20 |   # parameters
21 |   arguments:
22 |     # required inputs
23 |     - name: "--input_mod1"
24 |       type: "file"
25 |       example: "dataset_mod1.h5ad"
26 |       description: Modality 1 dataset.
27 |       required: true
28 |     - name: "--input_mod2"
29 |       type: "file"
30 |       example: "dataset_mod2.h5ad"
31 |       description: Modality 2 dataset.
32 |       required: true
33 |     - name: "--input_explore_mod1"
34 |       type: "file"
35 |       example: "dataset_mod1.h5ad"
36 |       description: Explore version of the modality 1 dataset.
37 |       required: true
38 |     - name: "--input_explore_mod2"
39 |       type: "file"
40 |       example: "dataset_mod2.h5ad"
41 |       description: Explore version of the modality 2 dataset.
42 |       required: true
43 |     - name: "--tf_seed"
44 |       type: "integer"
45 |       default: 46
46 |       description: ...
47 |     - name: "--np_seed"
48 |       type: "integer"
49 |       default: 56
50 |       description: ...
51 |     
52 |     # required outputs
53 |     - name: "--output_pretrain"
54 |       type: "file"
55 |       direction: "output"
56 |       example: "pretrain_model"
57 |       description: Path to the directory containing a pretrained model.
58 |       required: true
59 | 
60 |   # files your script needs
61 |   resources:
62 |     - type: python_script
63 |       path: script.py
64 |     - path: '../resources/utils.py'
65 | 
66 | # target platforms
67 | platforms:
68 |   - type: docker
69 |     image: tensorflow/tensorflow:latest-gpu
70 |     run_args: [ "--gpus all" ]
71 |     setup:
72 |       - type: python
73 |         packages:
74 |           - anndata
75 |           - umap-learn
76 |           - scanpy
77 |   - type: nextflow
78 |     labels: [ vhighmem, vhightime, vhighcpu, gpu ]
79 | 


--------------------------------------------------------------------------------
/src/joint_embedding/methods/lsl_ae/run/config.vsh.yaml:
--------------------------------------------------------------------------------
 1 | functionality:
 2 |   name: lsl_ae
 3 |   namespace: joint_embedding_methods
 4 | 
 5 |   # metadata for your method
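 6 |   description: Autoencoder with one orthogonality-regularised encoder per modality; the 64-dimensional linear bottleneck over the concatenated encodings is used as the joint embedding.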
 7 |   info:
 8 |     method_label: "LSL_AE"
 9 |     submission_id: "170825"
10 |     team_name: Living-Systems-Lab
11 |     # project_url: https://github.com/foo/bar
12 |     # publication_doi: 10.1101/0123.45.67.890123
13 |     # publication_url: https://arxiv.org/abs/1234.56789
14 | 
15 |   authors:
16 |     - name: Sumeer Khan
17 |       email: sumeer.khan@kaust.edu.sa
18 |       roles: [ author, maintainer ]
19 |     - name: Robert Lehman
20 |       email: robert.lehman@kaust.edu.sa 
21 |       roles: [ author, maintainer ]
22 |     - name: Xabier Martinez De Morentin
23 |       email: xavier.martinez.demorentin@navarra.es 
24 |       roles: [ author, maintainer ]
25 |     - name: Aidyn Ubingazhibov
26 |       email: aidyn.ubingazhibov@nu.edu.kz
27 |       roles: [ author, maintainer ]
28 |     - name: Minxing Pang
29 |       email: minxing.pang@kaust.edu.sa 
30 |       roles: [ author, maintainer ]
31 | 
32 |   # parameters
33 |   arguments:
34 |     # required inputs
35 |     - name: "--input_mod1"
36 |       type: "file"
37 |       example: "dataset_mod1.h5ad"
38 |       description: Modality 1 dataset.
39 |       required: true
40 |     - name: "--input_mod2"
41 |       type: "file"
42 |       example: "dataset_mod2.h5ad"
43 |       description: Modality 2 dataset.
44 |       required: true
45 |     # required outputs
46 |     - name: "--output"
47 |       type: "file"
48 |       direction: "output"
49 |       example: "output.h5ad"
50 |       description: Data for all cells in mod1 and mod2 embedded to ≤100 dimensions.
51 |       required: true
52 | 
53 |   # files your script needs
54 |   resources:
55 |     - type: python_script
56 |       path: script.py
57 | 
58 | # target platforms
59 | platforms:
60 |   - type: docker
61 |     image: nvcr.io/nvidia/tensorflow:20.10-tf1-py3
62 |     run_args: [ "--gpus all" ]
63 |     setup:
64 |       - type: python
65 |         packages:
66 |           - anndata
67 |           - umap-learn
68 |           - keras
69 |           - matplotlib
70 |           - scanpy
71 |           - scipy
72 |   - type: nextflow
73 |     labels: [ vhighmem, vvhightime, vhighcpu, gpu ]
74 | 


--------------------------------------------------------------------------------
/src/joint_embedding/methods/lsl_ae/run/script.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import anndata as ad
  3 | import pandas as pd
  4 | from tensorflow.keras.layers import Input, Dense, Dropout
  5 | from tensorflow.keras.layers import concatenate
  6 | from tensorflow.keras.models import Model
  7 | from tensorflow.keras.callbacks import EarlyStopping
  8 | from tensorflow import keras
  9 | import warnings
 10 | warnings.filterwarnings('ignore')
 11 | import scanpy as sc
 12 | #from keras import backend as K
 13 | from tensorflow.keras.constraints import Constraint
 14 | import tensorflow.keras.backend as K
 15 | 
 16 | from tensorflow.keras.optimizers import Adam
 17 | from tensorflow.keras.models import Model
 18 | import warnings
 19 | from numpy.random import seed
 20 | seed(1)
 21 | import tensorflow as tf
 22 | tf.compat.v1.random.set_random_seed(2)
 23 | 
 24 | 
 25 | print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
 26 | 
 27 | 
 28 | warnings.filterwarnings('ignore')
 29 | 
 30 | ## VIASH START
 31 | dataset_path = "output/datasets/joint_embedding/openproblems_bmmc_cite_phase2/openproblems_bmmc_cite_phase2.censor_dataset.output_"
 32 | # Development defaults for the script parameters (NOTE(review): presumably replaced by viash at build time — confirm).
 33 | par = {
 34 |     'input_mod1': f'{dataset_path}mod1.h5ad',
 35 |     'input_mod2': f'{dataset_path}mod2.h5ad',
 36 |     'output': 'output.h5ad'
 37 | }
 38 | meta = {
 39 |     'resources_dir': '.',
 40 |     'functionality_name': 'submission_170795'  # written as `method_id` into the output; NOTE(review): config lists submission 170825
 41 | }
 42 | ## VIASH END
 43 | 
 44 | 
 45 | logging.info('Reading `h5ad` files...')
 46 | ad_mod1 = ad.read_h5ad(par['input_mod1'])
 47 | ad_mod2 = ad.read_h5ad(par['input_mod2'])
 48 | 
 49 | # high variable gene calculation
 50 | min_cells = int(ad_mod2.shape[0] * 0.03)
 51 | sc.pp.highly_variable_genes(ad_mod1, batch_key ='batch', subset = True)
 52 | sc.pp.filter_genes(ad_mod2, min_cells=min_cells)
 53 | 
 54 | ad_mod_1 = ad_mod1[:, ad_mod1.var.highly_variable]
 55 | 
 56 | ## Convert to  csv for AE training
 57 | scRNAseq1 = ad_mod_1.X.toarray()
 58 | scRNAseq2 = ad_mod2.X.toarray()
 59 | 
 60 | 
 61 | class WeightsOrthogonalityConstraint(Constraint):
 62 |     def __init__(self, encoding_dim, weightage = 1.0, axis = 0):
 63 |         self.encoding_dim = encoding_dim
 64 |         self.weightage = weightage
 65 |         self.axis = axis
 66 |         
 67 |     def weights_orthogonality(self, w):
 68 |         if(self.axis==1):
 69 |             w = K.transpose(w)
 70 |         if(self.encoding_dim > 1):
 71 |             m = K.dot(K.transpose(w), w) - K.eye(self.encoding_dim)
 72 |             return self.weightage * K.sqrt(K.sum(K.square(m)))
 73 |         else:
 74 |             m = K.sum(w ** 2) - 1.
 75 |             return m
 76 | 
 77 |     def __call__(self, w):
 78 |         return self.weights_orthogonality(w)
 79 | 
 80 | 
 81 | # Input layers: one per modality, sized to the number of selected features
 82 | ncol_scRNAseq1 = scRNAseq1.shape[1]
 83 | input_dim_scRNAseq1 = Input(shape = (ncol_scRNAseq1, ), name = "scRNAseq1")
 84 | ncol_scRNAseq2 = scRNAseq2.shape[1]
 85 | input_dim_scRNAseq2 = Input(shape = (ncol_scRNAseq2, ), name = "scRNAseq2")
 86 | # Width of each modality-specific encoding
 87 | encoding_dim_scRNAseq1 = 64
 88 | encoding_dim_scRNAseq2 = 64
 89 | # Input dropout (rate 0.1) in front of each encoder
 90 | dropout_scRNAseq1 = Dropout(0.1, name = "Dropout_scRNAseq1")(input_dim_scRNAseq1)
 91 | dropout_scRNAseq2 = Dropout(0.1, name = "Dropout_scRNAseq2")(input_dim_scRNAseq2)
 92 | # Per-modality encoders; the orthogonality penalty is passed as kernel_regularizer with a hard-coded size of 64
 93 | encoded_scRNAseq1 = Dense(encoding_dim_scRNAseq1, activation = 'relu', name = "Encoder_scRNAseq1", use_bias=True, kernel_regularizer=WeightsOrthogonalityConstraint(64, weightage=1., axis=0))(dropout_scRNAseq1) #300 #prv 256 
 94 | encoded_scRNAseq2 = Dense(encoding_dim_scRNAseq2, activation = 'relu', name = "Encoder_scRNAseq2", use_bias=True, kernel_regularizer=WeightsOrthogonalityConstraint(64, weightage=1., axis=0))(dropout_scRNAseq2)
 95 | # Join both encodings
 96 | merge = concatenate([encoded_scRNAseq1,  encoded_scRNAseq2])
 97 | # Shared 64-unit linear bottleneck; its activations are later exported as the joint embedding
 98 | bottleneck = Dense(64, kernel_initializer = 'uniform', activation = 'linear', name = "Bottleneck")(merge) #50
 99 | # Decoder: expand back to the concatenated width, then one reconstruction head per modality
100 | merge_inverse = Dense(encoding_dim_scRNAseq1 + encoding_dim_scRNAseq2, activation = 'relu', name = "Concatenate_Inverse")(bottleneck)
101 | 
102 | decoded_scRNAseq1 = Dense(ncol_scRNAseq1, activation = 'relu', name = "Decoder_scRNAseq1")(merge_inverse) #sigmoid
103 | 
104 | decoded_scRNAseq2 = Dense(ncol_scRNAseq2, activation = 'relu', name = "Decoder_scRNAseq2")(merge_inverse)
105 | # Two inputs, two reconstructions
106 | autoencoder = Model([input_dim_scRNAseq1, input_dim_scRNAseq2],  [decoded_scRNAseq1, decoded_scRNAseq2])
107 | 
108 | opt = Adam(lr=0.0001)
109 | autoencoder.compile(optimizer = opt, loss={'Decoder_scRNAseq1': 'mean_squared_error', 'Decoder_scRNAseq2': 'mean_squared_error'}) #loss_weights = [1., 1.]
110 | autoencoder.summary()
111 | 
112 | es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=20)  # patience of 20 epochs on val_loss; restore_best_weights is not set
113 | # Autoencoder training: the inputs are also the targets; validation_split holds out 20% for val_loss
114 | estimator = autoencoder.fit([scRNAseq1, scRNAseq2], [scRNAseq1, scRNAseq2], epochs = 600, batch_size = 32, validation_split = 0.2, shuffle = True, verbose = 1, callbacks=[es]) #prev 64 BS prev 32
115 | 
116 | # Sub-model from the same inputs up to the "Bottleneck" layer; its prediction is the embedding
117 | encoder = Model([input_dim_scRNAseq1, input_dim_scRNAseq2], bottleneck)
118 | bottleneck_representation = encoder.predict([scRNAseq1, scRNAseq2])
119 | 
120 | embd = pd.DataFrame(bottleneck_representation)
121 | #embd  = scipy.sparse.csr_matrix(RNA_ATAC_Latent.values)
122 | 
123 | mod1_obs = ad_mod1.obs
124 | mod1_uns = ad_mod1.uns
125 | logging.info('Storing output to file')
126 | adata = ad.AnnData(
127 |     X=embd.values,
128 |     obs=mod1_obs,
129 |     uns={
130 |         'dataset_id': mod1_uns['dataset_id'],
131 |         'method_id': meta['functionality_name'],
132 |     },
133 | )
134 | adata.write_h5ad(par['output'], compression="gzip")
135 | 


--------------------------------------------------------------------------------
/src/joint_embedding/methods/lsl_ae/test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # End-to-end check for the LSL_AE method: predict on both datasets, then score the predictions.
 3 | export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash"
 4 | export NXF_VER=21.04.1
 5 | export PIPELINE_VERSION=1.4.0
 6 | method_id=lsl_ae  # component name from run/config.vsh.yaml; it forms the executable path used below
 7 | task_id=joint_embedding
 8 | 
 9 | # CITE
10 | dataset_id=openproblems_bmmc_cite_phase2
11 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
12 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
13 | 
14 | target/docker/${task_id}_methods/${method_id}/${method_id} \
15 |   --input_mod1 ${dataset_path}.output_mod1.h5ad \
16 |   --input_mod2 ${dataset_path}.output_mod2.h5ad \
17 |   --output ${pred_path}.${method_id}.output.h5ad
18 | 
19 | # MULTIOME
20 | dataset_id=openproblems_bmmc_multiome_phase2
21 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
22 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
23 | 
24 | target/docker/${task_id}_methods/${method_id}/${method_id} \
25 |   --input_mod1 ${dataset_path}.output_mod1.h5ad \
26 |   --input_mod2 ${dataset_path}.output_mod2.h5ad \
27 |   --output ${pred_path}.${method_id}.output.h5ad
28 | 
29 | 
30 | # RUN EVALUATION: score every *.${method_id}.output.h5ad prediction against output/datasets/$task_id
31 | bin/nextflow run "$PIPELINE_REPO" \
32 |   -r "$PIPELINE_VERSION" \
33 |   -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \
34 |   --solutionDir "output/datasets/$task_id" \
35 |   --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \
36 |   --publishDir "output/evaluation/$task_id/$method_id/" \
37 |   -latest \
38 |   -resume \
39 |   -c "src/resources/nextflow_moremem.config"
40 | # Print the final scores
41 | cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json"


--------------------------------------------------------------------------------
/src/match_modality/methods/clue/README.md:
--------------------------------------------------------------------------------
 1 | # CLUE (Cross-Linked Unified Embedding)
 2 | 
 3 | Team GLUE: Zhi-Jie Cao, Xin-Ming Tu, Chen-Rui Xia
 4 | 
 5 | **CLUE** is a semi-supervised single-cell multi-omics integration model. It employs variational autoencoders to project cells from different modalities into a unified low-dimensional embedding space, where modality matching can be performed. Specifically, we model data in each modality as generated from a modality-specific subspace of the complete cell embedding. Through a matrix of cross-encoders, CLUE projects cells in each modality into all modality-specific subspaces, which are then concatenated to build a comprehensive embedding, allowing the model to capture both shared and modality-specific information.
 6 | 
 7 | <img src="clue_architecture.jpg" width="50%">
 8 | 
 9 | **General architecture of CLUE ⤴️**
10 | 
11 | > CLUE is implemented as part of the `scglue` Python package. A pre-release containing the CLUE model is available as `resources/scglue-0.1.1-py3-none-any.whl`. A formal release will be made available later on PyPI and Anaconda. Stay tuned at [https://github.com/gao-lab/GLUE](https://github.com/gao-lab/GLUE).
12 | 


--------------------------------------------------------------------------------
/src/match_modality/methods/clue/clue_architecture.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/src/match_modality/methods/clue/clue_architecture.jpg


--------------------------------------------------------------------------------
/src/match_modality/methods/clue/resources/scglue-0.1.1-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/src/match_modality/methods/clue/resources/scglue-0.1.1-py3-none-any.whl


--------------------------------------------------------------------------------
/src/match_modality/methods/clue/run/config.vsh.yaml:
--------------------------------------------------------------------------------
 1 | functionality:
 2 |   name: clue
 3 |   namespace: match_modality_methods
 4 | 
 5 |   # metadata for your method
 6 | 
 7 |   description: Cross-linked unified embedding for single-cell multi-omics data integration
 8 |   info:
 9 |     method_label: "CLUE"
10 |     submission_id: "169959"
11 |     team_name: GLUE
12 |     project_url: https://github.com/gao-lab/GLUE
13 |     # publication_doi: 10.1101/2021.08.22.457275
14 |     # publication_url: https://arxiv.org/abs/1234.56789
15 | 
16 |   authors:
17 |     - name: Zhi-Jie Cao
18 |       email: caozj@mail.cbi.pku.edu.cn
19 |       roles: [ author, maintainer ]
20 |       props: { github: Jeff1995, orcid: "0000-0002-0026-671X" }
21 |     - name: Xin-Ming Tu
22 |       email: xinmingtu@pku.edu.cn
23 |       roles: [ author, maintainer ]
24 |       props: { github: XinmingTu }
25 |     - name: Chen-Rui Xia
26 |       email: xiachenrui@mail.cbi.pku.edu.cn
27 |       roles: [ author, maintainer ]
28 |       props: { github: xiachenrui }
29 | 
30 |   # parameters
31 |   arguments:
32 |     # required inputs
33 |     - name: "--input_train_mod1"
34 |       type: "file"
35 |       example: "dataset_censored.h5ad"
36 |       description: "The censored shuffled train mod1 profiles."
37 |       required: true
38 |     - name: "--input_train_mod2"
39 |       type: "file"
40 |       example: "dataset_censored.h5ad"
41 |       description: "The censored shuffled train mod2 profiles."
42 |       required: true
43 |     - name: "--input_train_sol"
44 |       type: "file"
45 |       example: "dataset_solution.h5ad"
46 |       description: "The pairing of train mod1&mod2 profiles."
47 |       required: true
48 |     - name: "--input_test_mod1"
49 |       type: "file"
50 |       example: "dataset_censored.h5ad"
51 |       description: "The censored shuffled test mod1 profiles."
52 |       required: true
53 |     - name: "--input_test_mod2"
54 |       type: "file"
55 |       example: "dataset_censored.h5ad"
56 |       description: "The censored shuffled test mod2 profiles."
57 |       required: true
58 |     - name: "--input_pretrain"
59 |       type: "file"
60 |       example: "pretrain_model"
61 |       description: Path to the directory containing a pretrained model.
62 |       required: true
63 | 
64 |     # required outputs
65 |     - name: "--output"
66 |       type: "file"
67 |       direction: "output"
68 |       example: "output.h5ad"
69 |       description: "The predicted pairing of test mod1&mod2 profiles."
70 |       required: true
71 | 
72 |   # files your script needs
73 |   resources:
74 |     - type: python_script
75 |       path: script.py
76 |     - path: ../resources/utils.py
77 |     - path: ../resources/scglue-0.1.1-py3-none-any.whl
78 | 
79 | # target platforms
80 | platforms:
81 |   - type: docker
82 |     image: nvidia/cuda:11.0-cudnn8-runtime-ubuntu18.04
83 |     setup:
84 |       - type: apt
85 |         packages:
86 |           - python3-pip
87 |           - python3.8-dev
88 |       - type: docker
89 |         run:
90 |           - update-alternatives --install /usr/bin/python python /usr/bin/python3.8 10
91 |           - update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 10
92 |           - python -m pip install --upgrade pip
93 |           - pip install scglue-0.1.1-py3-none-any.whl
94 |           - pip install pyyaml
95 |         resources:
96 |           - scglue-0.1.1-py3-none-any.whl scglue-0.1.1-py3-none-any.whl
97 | 
98 |   - type: nextflow
99 |     labels: [ highmem, hightime, highcpu, gpu ]


--------------------------------------------------------------------------------
/src/match_modality/methods/clue/test.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash"
  4 | export NXF_VER=21.04.1
  5 | export PIPELINE_VERSION=1.4.0
  6 | method_id=clue
  7 | task_id=match_modality
  8 | 
  9 | # CITE GEX2ADT
 10 | dataset_id=openproblems_bmmc_cite_phase2_rna
 11 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
 12 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
 13 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
 14 | 
 15 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
 16 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 17 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 18 |   --input_train_sol ${dataset_path}.output_train_sol.h5ad \
 19 |   --output_pretrain ${pretrain_path}
 20 | 
 21 | target/docker/${task_id}_methods/${method_id}/${method_id} \
 22 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 23 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 24 |   --input_train_sol ${dataset_path}.output_train_sol.h5ad \
 25 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
 26 |   --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \
 27 |   --input_pretrain ${pretrain_path} \
 28 |   --output ${pred_path}.${method_id}.output.h5ad
 29 | 
 30 | # CITE ADT2GEX
 31 | dataset_id=openproblems_bmmc_cite_phase2_mod2
 32 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
 33 | # can reuse same pretrain
 34 | # pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
 35 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
 36 | 
 37 | # target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
 38 | #   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 39 | #   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 40 | #   --input_train_sol ${dataset_path}.output_train_sol.h5ad \
 41 | #   --output_pretrain ${pretrain_path}
 42 | 
 43 | target/docker/${task_id}_methods/${method_id}/${method_id} \
 44 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 45 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 46 |   --input_train_sol ${dataset_path}.output_train_sol.h5ad \
 47 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
 48 |   --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \
 49 |   --input_pretrain ${pretrain_path} \
 50 |   --output ${pred_path}.${method_id}.output.h5ad
 51 | 
 52 | 
 53 | # MULTIOME GEX2ATAC
 54 | dataset_id=openproblems_bmmc_multiome_phase2_rna
 55 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
 56 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
 57 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
 58 | 
 59 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
 60 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 61 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 62 |   --input_train_sol ${dataset_path}.output_train_sol.h5ad \
 63 |   --output_pretrain ${pretrain_path}
 64 | 
 65 | target/docker/${task_id}_methods/${method_id}/${method_id} \
 66 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 67 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 68 |   --input_train_sol ${dataset_path}.output_train_sol.h5ad \
 69 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
 70 |   --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \
 71 |   --input_pretrain ${pretrain_path} \
 72 |   --output ${pred_path}.${method_id}.output.h5ad
 73 | 
 74 | # MULTIOME ATAC2GEX
 75 | dataset_id=openproblems_bmmc_multiome_phase2_mod2
 76 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
 77 | # can reuse same pretrains
 78 | # pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
 79 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
 80 | 
 81 | # target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
 82 | #   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 83 | #   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 84 | #   --input_train_sol ${dataset_path}.output_train_sol.h5ad \
 85 | #   --output_pretrain ${pretrain_path}
 86 | 
 87 | target/docker/${task_id}_methods/${method_id}/${method_id} \
 88 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 89 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 90 |   --input_train_sol ${dataset_path}.output_train_sol.h5ad \
 91 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
 92 |   --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \
 93 |   --input_pretrain ${pretrain_path} \
 94 |   --output ${pred_path}.${method_id}.output.h5ad
 95 | 
 96 | # RUN EVALUATION
 97 | bin/nextflow run "$PIPELINE_REPO" \
 98 |   -r "$PIPELINE_VERSION" \
 99 |   -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \
100 |   --solutionDir "output/datasets/$task_id" \
101 |   --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \
102 |   --publishDir "output/evaluation/$task_id/$method_id/" \
103 |   -latest \
104 |   -resume \
105 |   -c "src/resources/nextflow_moremem.config"
106 | 
107 | cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json"


--------------------------------------------------------------------------------
/src/match_modality/methods/clue/train/config.vsh.yaml:
--------------------------------------------------------------------------------
 1 | functionality:
 2 |   name: clue_train
 3 |   namespace: match_modality_methods
 4 | 
 5 |   # metadata for your method
 6 |   description: Cross-linked unified embedding for single-cell multi-omics data integration
 7 | 
 8 |   info:
 9 |     submission_id: "169959"
10 |     team_name: GLUE
11 |     # project_url: https://github.com/foo/bar
12 |     # publication_doi: 10.1101/0123.45.67.890123
13 |     # publication_url: https://arxiv.org/abs/1234.56789
14 | 
15 |   authors:
16 |     - name: Zhi-Jie Cao
17 |       email: caozj@mail.cbi.pku.edu.cn
18 |       roles: [ author, maintainer ]
19 |       props: { github: Jeff1995, orcid: "0000-0002-0026-671X" }
20 |     - name: Xin-Ming Tu
21 |       email: xinmingtu@pku.edu.cn
22 |       roles: [ author, maintainer ]
23 |       props: { github: XinmingTu }
24 |     - name: Chen-Rui Xia
25 |       email: xiachenrui@mail.cbi.pku.edu.cn
26 |       roles: [ author, maintainer ]
27 |       props: { github: xiachenrui }
28 | 
29 |   # parameters
30 |   arguments:
31 |     # required inputs
32 |     - name: "--input_train_mod1"
33 |       type: "file"
34 |       example: "dataset_censored.h5ad"
35 |       description: "The censored shuffled train mod1 profiles."
36 |       required: true
37 |     - name: "--input_train_mod2"
38 |       type: "file"
39 |       example: "dataset_censored.h5ad"
40 |       description: "The censored shuffled train mod2 profiles."
41 |       required: true
42 |     - name: "--input_train_sol"
43 |       type: "file"
44 |       example: "dataset_solution.h5ad"
45 |       description: "The pairing of train mod1&mod2 profiles."
46 |       required: true
47 | 
48 |     # required outputs
49 |     - name: "--output_pretrain"
50 |       type: "file"
51 |       example: "pretrain_model"
52 |       direction: "output"
53 |       description: Path to the directory containing a pretrained model.
54 |       required: true
55 | 
56 |   # files your script needs
57 |   resources:
58 |     - type: python_script
59 |       path: script.py
60 |     - path: ../resources/utils.py
61 |     - path: ../resources/scglue-0.1.1-py3-none-any.whl
62 | 
63 | # target platforms
64 | platforms:
65 |   - type: docker
66 |     image: nvidia/cuda:11.0-cudnn8-runtime-ubuntu18.04
67 |     run_args: [ "--gpus all" ]
68 |     setup:
69 |       - type: apt
70 |         packages:
71 |           - python3-pip
72 |           - python3.8-dev
73 |       - type: docker
74 |         run:
75 |           - update-alternatives --install /usr/bin/python python /usr/bin/python3.8 10
76 |           - update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 10
77 |           - python -m pip install --upgrade pip
78 |           - pip install scglue-0.1.1-py3-none-any.whl
79 |           - pip install pyyaml scikit-misc
80 |         resources:
81 |           - scglue-0.1.1-py3-none-any.whl scglue-0.1.1-py3-none-any.whl
82 | 
83 |   - type: nextflow
84 |     labels: [ highmem, hightime, highcpu, gpu ]


--------------------------------------------------------------------------------
/src/match_modality/methods/novel/README.md:
--------------------------------------------------------------------------------
1 | # NeurIPS-Single-Cell-MultiModality
2 | 
3 | Team Novel: Gleb Ryazantsev, Nikolay Russkikh, Igor I
4 | 
5 | The approach uses sample representations learned in the same way as in the CLIP model. The encoders for all modalities are fully connected; the dimensionality of the GEX and ATAC data is reduced via an LSI transform (ADT is left as-is). Then, to obtain sample pairings, a maximum-weight matching is performed on a bipartite graph whose edge weights are the cosine similarities between sample embeddings.
6 | 
7 | <img src="novel_architecture1.png" width="100%">
8 | 
9 | <img src="novel_architecture2.png" width="100%">


--------------------------------------------------------------------------------
/src/match_modality/methods/novel/novel_architecture1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/src/match_modality/methods/novel/novel_architecture1.png


--------------------------------------------------------------------------------
/src/match_modality/methods/novel/novel_architecture2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/src/match_modality/methods/novel/novel_architecture2.png


--------------------------------------------------------------------------------
/src/match_modality/methods/novel/resources/catalyst_tools.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn.functional as F
 3 | from catalyst import runners, metrics
 4 | from models import symmetric_npair_loss
 5 | 
 6 | 
 7 | import numpy as np
 8 | import torch.nn.functional as F
 9 | from tqdm.notebook import tqdm
10 | 
11 | from networkx.algorithms import bipartite
12 | from scipy import sparse
13 | 
14 | 
15 | 
class scRNARunner(runners.Runner):
    """Runner that feeds paired modality features through the model.

    Each batch is a mapping with ``'features_first'`` and
    ``'features_second'`` entries; both are passed to ``self.model`` in one
    call.
    """

    def handle_batch(self, batch):
        """Run the model on one batch, record loss/temperature and expose
        the inputs and outputs on the runner.

        Parameters
        ----------
        batch
            Mapping holding ``'features_first'`` and ``'features_second'``.
        """
        first, second = batch['features_first'], batch['features_second']

        scores, emb_first, emb_second = self.model(first, second)
        # Row i of the score matrix is trained against class index i.
        targets = torch.arange(scores.shape[0]).to(scores.device)

        # Plain Python float of exp(logit_scale) for this batch.
        batch_temperature = self.model.logit_scale.exp().item()

        self.batch_metrics.update({
            "loss": symmetric_npair_loss(scores, targets),
            "T": batch_temperature,
        })

        self.batch = dict(
            features_first=first,
            features_second=second,
            embeddings_first=emb_first,
            embeddings_second=emb_second,
            scores=scores,
            targets=targets,
            temperature=batch_temperature,
        )
        self.input = dict(features_first=first, features_second=second)
        self.output = dict(
            scores=scores,
            embeddings_first=emb_first,
            embeddings_second=emb_second,
        )
48 |         
class CustomMetric(metrics.ICallbackLoaderMetric):
    """Loader-level matching accuracy computed from all collected embeddings.

    ``update`` gathers the per-batch embeddings of both modalities;
    ``compute_key_value`` concatenates them, builds the full score matrix and
    reports how often the highest-scoring partner sits on the diagonal, in
    both directions.
    """

    def __init__(self, compute_on_call: bool = True, prefix: str = None, suffix: str = None):
        """Init."""
        super().__init__(compute_on_call=compute_on_call)
        # NOTE(review): prefix/suffix are stored but not applied to the metric
        # names returned by compute_key_value.
        self.prefix = prefix or ""
        self.suffix = suffix or ""
        self.embeddings_list_first = []
        self.embeddings_list_second = []

    def reset(self, num_batches: int, num_samples: int) -> None:
        """Drop the embeddings collected so far and release cached GPU memory."""
        self.embeddings_list_first = []
        self.embeddings_list_second = []
        torch.cuda.empty_cache()

    def update(self, *args, **kwargs) -> None:
        """Store one batch of embeddings.

        Reads ``embeddings_first``, ``embeddings_second`` and ``temperature``
        from ``kwargs``; the first embeddings are scaled by the temperature.

        The tensors are detached and moved to the CPU right away. Keeping
        them attached and on the compute device for the whole loader would
        hold every batch (and its autograd graph) in device memory until
        ``compute_key_value`` runs, which only ever uses detached CPU copies.
        """
        embeddings_first = kwargs['embeddings_first']
        embeddings_second = kwargs['embeddings_second']
        temperature = kwargs['temperature']
        self.embeddings_list_first.append((temperature * embeddings_first).detach().cpu())
        self.embeddings_list_second.append(embeddings_second.detach().cpu())

    def compute(self):
        """Not supported; use ``compute_key_value``."""
        raise NotImplementedError('This method is not supported')

    def compute_key_value(self):
        """Return ``forward_acc``, ``backward_acc`` and their mean ``avg_acc``.

        ``forward_acc`` is the fraction of rows whose arg-max column equals
        the row index; ``backward_acc`` is the same along columns.
        """
        # The stored batches are already detached CPU tensors (see update).
        all_embeddings_first = torch.cat(self.embeddings_list_first)
        all_embeddings_second = torch.cat(self.embeddings_list_second)
        logits = all_embeddings_first @ all_embeddings_second.T
        labels = torch.arange(logits.shape[0])

        # Free the concatenated copies before the arg-max passes.
        del all_embeddings_first
        del all_embeddings_second

        forward_accuracy = (torch.argmax(logits, dim=1) == labels).float().mean().item()
        backward_accuracy = (torch.argmax(logits, dim=0) == labels).float().mean().item()
        del logits

        avg_accuracy = 0.5 * (forward_accuracy + backward_accuracy)

        loader_metrics = {
            'forward_acc': forward_accuracy,
            'backward_acc': backward_accuracy,
            'avg_acc': avg_accuracy
        }
        return loader_metrics


--------------------------------------------------------------------------------
/src/match_modality/methods/novel/resources/config_ADT2GEX.py:
--------------------------------------------------------------------------------
 1 | LR = 7.79984e-05
 2 | OPTIM = 'AdamW'
 3 | weight_decay=0
 4 | 
 5 | EMBEDDING_DIM = 64
 6 | 
 7 | DROPOUT_RATES_FIRST = [0.0221735, 0.296919]
 8 | DROPOUT_RATES_GEX = [0.0107121,0.254689]
 9 | 
10 | LAYERS_DIM_FIRST = [512, 2048]
11 | LAYERS_DIM_GEX = [1024, 512]
12 | 
13 | LOG_T = 3.463735
14 | 
15 | N_LSI_COMPONENTS_GEX = 128
16 | N_EPOCHS = 7000
17 | 
18 | BATCH_SIZE = 2048
19 | 
20 | SWAP_RATE_FIRST = 0.
21 | SWAP_RATE_GEX = 0.
22 | 


--------------------------------------------------------------------------------
/src/match_modality/methods/novel/resources/config_ATAC2GEX.py:
--------------------------------------------------------------------------------
 1 | #optimizer
 2 | LR = 0.000585
 3 | OPTIM = 'AdamW'
 4 | weight_decay=0
 5 | EMBEDDING_DIM = 256
 6 | 
 7 | DROPOUT_RATES_FIRST = [0.661]
 8 | DROPOUT_RATES_GEX = [ 0.541, 0.396]
 9 | 
10 | LAYERS_DIM_FIRST = [2048]
11 | LAYERS_DIM_GEX = [1024, 1024]
12 | 
13 | LOG_T = 3.065016	
14 | 
15 | 
16 | N_LSI_COMPONENTS_FIRST= 512
17 | N_LSI_COMPONENTS_GEX = 64
18 | 
19 | N_EPOCHS = 7000
20 | 
21 | BATCH_SIZE = 16384


--------------------------------------------------------------------------------
/src/match_modality/methods/novel/resources/data.py:
--------------------------------------------------------------------------------
 1 | from torch.utils.data import Dataset,DataLoader
 2 | 
 3 | class ModalityMatchingDataset(Dataset):
 4 |     def __init__(
 5 |         self, df_modality1, df_modality2
 6 |     ):
 7 |         super().__init__()
 8 |         
 9 |         self.df_modality1 = df_modality1.values
10 |         self.df_modality2 = df_modality2.values
11 |         
12 |     
13 |     def __len__(self):
14 |         return self.df_modality1.shape[0]
15 |     
16 |     def __getitem__(self, index: int):
17 |         x_modality_1 = self.df_modality1[index]
18 |         x_modality_2 = self.df_modality2[index]
19 |         return {'features_first':x_modality_1, 'features_second':x_modality_2}
20 |     
def get_dataloaders(mod1_train, mod2_train, sol_train,
                         mod1_test, mod2_test, sol_test, NUM_WORKERS, BATCH_SIZE):
    """Build the train and test loaders of paired modality profiles.

    For each split, the mod2 rows are reordered with the per-row arg-max of
    the solution matrix, so that row ``i`` of mod2 is the one the solution
    assigns to row ``i`` of mod1. The train loader shuffles; the test loader
    keeps the order.

    Returns
    -------
    (data_train, data_test)
        Two ``DataLoader`` objects over ``ModalityMatchingDataset``.
    """
    splits = (
        (mod1_train, mod2_train, sol_train, True),
        (mod1_test, mod2_test, sol_test, False),
    )

    loaders = []
    for mod1, mod2, sol, do_shuffle in splits:
        paired_mod2 = mod2.iloc[sol.values.argmax(1)]
        dataset = ModalityMatchingDataset(mod1, paired_mod2)
        loaders.append(
            DataLoader(dataset, BATCH_SIZE, shuffle=do_shuffle, num_workers=NUM_WORKERS)
        )

    data_train, data_test = loaders
    return data_train, data_test
34 | 
35 | 


--------------------------------------------------------------------------------
/src/match_modality/methods/novel/resources/models.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from torch.utils.data import Dataset,DataLoader
 3 | import torch.nn.functional as F
 4 | from torch import nn
 5 | 
 6 | 
 7 | class BatchSwapNoise(nn.Module):
 8 |     """Swap Noise module"""
 9 |     def __init__(self, p):
10 |         super().__init__()
11 |         self.p = p
12 | 
13 |     def forward(self, x):
14 |         if self.training:
15 |             mask = torch.rand(x.size()) > (1 - self.p)
16 |             idx = torch.add(torch.arange(x.nelement()),
17 |                             (torch.floor(torch.rand(x.size()) * x.size(0)).type(torch.LongTensor) *
18 |                              (mask.type(torch.LongTensor) * x.size(1))).view(-1))
19 |             idx[idx>=x.nelement()] = idx[idx>=x.nelement()]-x.nelement()
20 |             return x.view(-1)[idx].view(x.size())
21 |         else:
22 |             return x
23 | 
24 | 
class Encoder(nn.Module):
    """Fully connected encoder: ELU-activated hidden layers, each optionally
    followed by dropout, and a final linear projection to the embedding.

    Parameters
    ----------
    n_input
        Width of the input features.
    embedding_size
        Width of the output embedding.
    dropout_rates
        Dropout probability for the i-th hidden layer; hidden layers beyond
        the end of this list get no dropout.
    dims_layers
        Widths of the hidden layers (at least one).
    swap_noise_ratio
        NOTE(review): accepted but not used anywhere in this class.
    """

    def __init__(self, n_input, embedding_size, dropout_rates, dims_layers, swap_noise_ratio):
        super(Encoder, self).__init__()
        first_width, last_width = dims_layers[0], dims_layers[-1]

        # Linear layers are created in input -> hidden -> output order.
        linears = [nn.Linear(n_input, first_width)]
        for width_in, width_out in zip(dims_layers[:-1], dims_layers[1:]):
            linears.append(nn.Linear(width_in, width_out))
        dropouts = [nn.Dropout(p=rate) for rate in dropout_rates]
        linears.append(nn.Linear(last_width, embedding_size))

        self.fc_list = nn.ModuleList(linears)
        self.dropout_list = nn.ModuleList(dropouts)

    def forward(self, x):
        *hidden_layers, projection = list(self.fc_list)
        n_dropouts = len(self.dropout_list)

        for depth, layer in enumerate(hidden_layers):
            x = F.elu(layer(x))
            if depth < n_dropouts:
                x = self.dropout_list[depth](x)

        # No activation on the final projection.
        return projection(x)
50 |         
class Modality_CLIP(nn.Module):
    """Two-encoder contrastive model with a learnable logit scale.

    Parameters
    ----------
    Encoder
        Encoder class; called as
        ``Encoder(n_input, output_dim, dropout_rates, layer_dims, swap_rate)``.
    layers_dims, dropout_rates
        Two-element sequences: entry 0 configures the first modality's
        encoder, entry 1 the second's.
    dim_mod1, dim_mod2
        Input widths of the two modalities.
    output_dim
        Shared embedding width.
    T
        Initial value of ``logit_scale`` (exponentiated in ``forward``).
    swap_rate_1, swap_rate_2
        Forwarded to the respective encoder.
    """

    def __init__(self, Encoder, layers_dims, dropout_rates, dim_mod1, dim_mod2, output_dim, T, swap_rate_1=0., swap_rate_2=0.):
        super(Modality_CLIP, self).__init__()

        self.encoder_modality1 = Encoder(
            dim_mod1, output_dim, dropout_rates[0], layers_dims[0], swap_rate_1
        )
        self.encoder_modality2 = Encoder(
            dim_mod2, output_dim, dropout_rates[1], layers_dims[1], swap_rate_2
        )

        self.logit_scale = nn.Parameter(torch.ones([]) * T)

    def forward(self, features_first, features_second):
        """Return ``(logits, unit_embeddings_first, unit_embeddings_second)``."""

        def to_unit_length(embedding):
            # L2-normalise along the feature axis.
            return embedding / torch.norm(embedding, p=2, dim=-1, keepdim=True)

        unit_first = to_unit_length(self.encoder_modality1(features_first))
        unit_second = to_unit_length(self.encoder_modality2(features_second))

        scale = self.logit_scale.exp()
        # Scale the first embeddings, then take all pairwise dot products.
        logits = (scale * unit_first) @ unit_second.T

        return logits, unit_first, unit_second
72 |     
def symmetric_npair_loss(logits, targets):
    """Mean of the cross-entropy over the rows of ``logits`` and the
    cross-entropy over its columns (``logits.T``), both against ``targets``."""
    row_loss = F.cross_entropy(logits, targets)
    column_loss = F.cross_entropy(logits.T, targets)
    return 0.5 * (row_loss + column_loss)


--------------------------------------------------------------------------------
/src/match_modality/methods/novel/resources/postprocessing.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import torch.nn.functional as F
 3 | from tqdm.notebook import tqdm
 4 | 
 5 | from networkx.algorithms import bipartite
 6 | from scipy import sparse
 7 | 
 8 | 
 9 | 
def calculate_rank(mask, logits):
    """Locate each row's target column in that row's ascending score order.

    Parameters
    ----------
    mask
        2-D array; the arg-max of each row marks the target column.
    logits
        2-D tensor of scores with one row per row of ``mask``.

    Returns
    -------
    list
        For every row, the position of the target column in the ascending
        argsort of the softmax scores (the last position is the best score).
    """
    # dim=1 is spelled out: it is what the implicit-dim softmax selected for
    # 2-D input, but the implicit form is deprecated and emits a warning.
    soft_logits = F.softmax(logits, dim=1).detach().cpu().numpy()
    indexes_target = np.argmax(mask, axis=1)
    rank = soft_logits.argsort()

    return [
        np.where(indexes_target[i] == rank[i])[0][0]
        for i in range(len(rank))
    ]
20 | 
21 |     
22 |     
def topN_logits(logits, topn, ):
    """Keep the ``topn`` largest scores of every row and mask the rest.

    Parameters
    ----------
    logits
        2-D score matrix (tensor, or anything ``torch.as_tensor`` accepts).
    topn
        Number of highest-scoring columns to keep per row.

    Returns
    -------
    torch.Tensor
        float64 CPU tensor shaped like ``logits``: kept entries carry their
        original score, all other entries are ``-100000``.
    """
    # Imported here because this module does not import torch at file level;
    # without it the function fails with NameError on first use.
    import torch

    scores = torch.as_tensor(logits).cpu()
    # Ascending argsort, so the last `topn` columns are the largest scores.
    top_idx = scores.argsort()[:, -topn:]

    n_logits = torch.full(tuple(scores.shape), -100000.0, dtype=torch.float64)
    # One gather/scatter replaces the per-element Python loop (and the
    # progress bar that the loop needed).
    n_logits.scatter_(1, top_idx, scores.gather(1, top_idx).to(torch.float64))

    return n_logits
36 | 
def get_bipartite_matching_adjacency_matrix(raw_logits, threshold_quantile=0.995):
    """One-to-one row/column assignment from a score matrix.

    Scores below both their column's and their row's ``threshold_quantile``
    quantile are zeroed, the remaining scores are negated, and a minimum
    weight full matching is computed on the resulting bipartite graph
    (minimising negated scores favours high original scores).

    Parameters
    ----------
    raw_logits
        2-D numpy array of scores; it is copied, not modified.
    threshold_quantile
        Quantile used for the per-row and per-column cut-offs.

    Returns
    -------
    numpy.ndarray
        Array shaped like ``raw_logits`` with a single 1 per row at the
        matched column and 0 elsewhere.
    """
    n_rows = raw_logits.shape[0]

    # Prune unpromising graph connections.
    weights = raw_logits.copy()
    column_cutoff = np.quantile(weights, threshold_quantile, axis=0, keepdims=True)
    row_cutoff = np.quantile(weights, threshold_quantile, axis=1, keepdims=True)
    weights[weights < np.minimum(column_cutoff, row_cutoff)] = 0

    graph = bipartite.matrix.from_biadjacency_matrix(sparse.csr_matrix(-weights))

    # The top node set is passed explicitly so that networkx does not have
    # to infer it from the graph's connected components.
    top_nodes = [
        node for node, attrs in graph.nodes(data=True) if attrs['bipartite'] == 0
    ]
    matches = bipartite.matching.minimum_weight_full_matching(graph, top_nodes=top_nodes)

    # Bottom nodes are numbered after the top nodes; shift back to columns.
    n_top = len(top_nodes)
    best_matches = np.array([matches[node] - n_top for node in top_nodes])

    bipartite_matching_adjacency = np.zeros(raw_logits.shape)
    bipartite_matching_adjacency[np.arange(n_rows), best_matches] = 1
    return bipartite_matching_adjacency


--------------------------------------------------------------------------------
/src/match_modality/methods/novel/resources/preprocessing.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from itertools import chain
  3 | from typing import Callable, List, Mapping, Optional
  4 | 
  5 | import anndata
  6 | import numpy as np
  7 | import pandas as pd
  8 | import scipy.sparse
  9 | import sklearn.decomposition
 10 | import sklearn.feature_extraction.text
 11 | import sklearn.preprocessing
 12 | import sklearn.neighbors
 13 | import sklearn.utils.extmath
 14 | 
 15 | def tfidf(X):
 16 |     r"""
 17 |     TF-IDF normalization (following the Seurat v3 approach)
 18 |     Parameters
 19 |     ----------
 20 |     X
 21 |         Input matrix
 22 |     Returns
 23 |     -------
 24 |     X_tfidf
 25 |         TF-IDF normalized matrix
 26 |     """
 27 |     idf = X.shape[0] / X.sum(axis=0)
 28 |     if scipy.sparse.issparse(X):
 29 |         tf = X.multiply(1 / X.sum(axis=1))
 30 |         return tf.multiply(idf)
 31 |     else:
 32 |         tf = X / X.sum(axis=1, keepdims=True)
 33 |         return tf * idf
 34 | 
 35 | class tfidfTransformer():
 36 |     def __init__(self):
 37 |         self.idf = None
 38 |         self.fitted = False
 39 | 
 40 |     def fit(self, X):
 41 |         self.idf = X.shape[0] / X.sum(axis=0)
 42 |         self.fitted = True
 43 | 
 44 |     def transform(self, X):
 45 |         if not self.fitted:
 46 |             raise RuntimeError('Transformer was not fitted on any data')
 47 |         if scipy.sparse.issparse(X):
 48 |             tf = X.multiply(1 / X.sum(axis=1))
 49 |             return tf.multiply(self.idf)
 50 |         else:
 51 |             tf = X / X.sum(axis=1, keepdims=True)
 52 |             return tf * self.idf
 53 | 
 54 |     def fit_transform(self, X):
 55 |         self.fit(X)
 56 |         return self.transform(X)
 57 |  
 58 | 
 59 | class lsiTransformer():
 60 |     def __init__(self,
 61 |                  n_components: int = 20,
 62 |                  drop_first=True,
 63 |                  use_highly_variable = None
 64 |                 ):
 65 |         
 66 |         self.drop_first=drop_first
 67 |         self.n_components = n_components+drop_first
 68 |         self.use_highly_variable = use_highly_variable
 69 |         self.tfidfTransformer = tfidfTransformer()
 70 |         self.normalizer =  sklearn.preprocessing.Normalizer(norm="l1")
 71 |         self.pcaTransformer = sklearn.decomposition.TruncatedSVD(n_components = self.n_components, random_state=777)
 72 |         self.fitted = None
 73 |         
 74 |     def fit(self, adata: anndata.AnnData):
 75 |         if self.use_highly_variable is None:
 76 |             self.use_highly_variable = "highly_variable" in adata.var
 77 |         adata_use = adata[:, adata.var["highly_variable"]] if self.use_highly_variable else adata
 78 |         X = self.tfidfTransformer.fit_transform(adata_use.X)
 79 |         X_norm = self.normalizer.fit_transform(X)
 80 |         X_norm = np.log1p(X_norm * 1e4)
 81 |         X_lsi = self.pcaTransformer.fit_transform(X_norm)
 82 |         self.fitted = True
 83 |     
 84 |     def transform(self, adata):
 85 |         if not self.fitted:
 86 |             raise RuntimeError('Transformer was not fitted on any data')
 87 |         adata_use = adata[:, adata.var["highly_variable"]] if self.use_highly_variable else adata
 88 |         X = self.tfidfTransformer.transform(adata_use.X)
 89 |         X_norm = self.normalizer.transform(X)
 90 |         X_norm = np.log1p(X_norm * 1e4)
 91 |         X_lsi = self.pcaTransformer.transform(X_norm)
 92 |         X_lsi -= X_lsi.mean(axis=1, keepdims=True)
 93 |         X_lsi /= X_lsi.std(axis=1, ddof=1, keepdims=True)
 94 |         lsi_df = pd.DataFrame(X_lsi, index = adata_use.obs_names).iloc[:,int(self.drop_first):]
 95 |         return lsi_df
 96 |     
 97 |     def fit_transform(self, adata):
 98 |         self.fit(adata)
 99 |         return self.transform(adata)
100 |         
101 |         
102 |                  
103 |     
def lsi(
        adata: anndata.AnnData, n_components: int = 20,
        use_highly_variable: Optional[bool] = None, **kwargs
) -> None:
    r"""
    LSI analysis (following the Seurat v3 approach)

    The result is written to ``adata.obsm['X_lsi']``; nothing is returned.

    Parameters
    ----------
    adata
        Input dataset
    n_components
        Number of dimensions to use
    use_highly_variable
        Whether to use highly variable features only, stored in
        ``adata.var['highly_variable']``. By default uses them if they
        have been determined beforehand.
    **kwargs
        Additional keyword arguments are passed to
        :func:`sklearn.utils.extmath.randomized_svd`
    """
    if use_highly_variable is None:
        use_highly_variable = "highly_variable" in adata.var

    if use_highly_variable:
        adata_use = adata[:, adata.var["highly_variable"]]
    else:
        adata_use = adata

    l1_normalizer = sklearn.preprocessing.Normalizer(norm="l1")
    X_norm = np.log1p(l1_normalizer.fit_transform(tfidf(adata_use.X)) * 1e4)

    # Element 0 of the randomized_svd result is used as the embedding.
    svd_parts = sklearn.utils.extmath.randomized_svd(
        X_norm, n_components, random_state=777, **kwargs
    )
    X_lsi = svd_parts[0]

    # Standardize every row across its components.
    X_lsi -= X_lsi.mean(axis=1, keepdims=True)
    X_lsi /= X_lsi.std(axis=1, ddof=1, keepdims=True)
    adata.obsm["X_lsi"] = X_lsi


--------------------------------------------------------------------------------
/src/match_modality/methods/novel/run/config.vsh.yaml:
--------------------------------------------------------------------------------
 1 | functionality:
 2 |   name: novel
 3 |   namespace: match_modality_methods
 4 |   
 5 |   description: The approach utilizes sample representations, learned in the same way as in the CLIP model. Encoders for all of the modalities are fully connected, the dimensionality of GEX and ATAC data is reduced via LSI transform (ADT is left as-is). Then, to obtain sample pairings, a maximum weight matching on a bipartite graph is performed, where weights are cosine similarities between sample embeddings.
 6 |   info:
 7 |     method_label: "Novel"
 8 |     submission_id: "169594/170690"
 9 |     team_name: Novel
10 |   
11 |   authors:
12 |     - name: Gleb Ryazantsev
13 |       email: ryazantsev.gleb@gmail.com
14 |       roles: [ author, maintainer ]
15 |     - name: Nikolay Russkikh
16 |       email: russkikh.nikolay@gmail.com
17 |       roles: [ author, maintainer ]
18 |     - name: Igor I
19 |       email: herri.i.67@gmail.com
20 |       roles: [ author, maintainer ]
21 |       
22 |   # parameters
23 |   arguments:
24 |     # required inputs
25 |     - name: "--input_train_mod1"
26 |       type: "file"
27 |       example: "dataset_censored.h5ad"
28 |       description: "The censored shuffled train mod1 profiles."
29 |       required: true
30 |     - name: "--input_train_mod2"
31 |       type: "file"
32 |       example: "dataset_censored.h5ad"
33 |       description: "The censored shuffled train mod2 profiles."
34 |       required: true
35 |     - name: "--input_train_sol"
36 |       type: "file"
37 |       example: "dataset_solution.h5ad"
38 |       description: "The pairing of train mod1&mod2 profiles."
39 |       required: true
40 |     - name: "--input_test_mod1"
41 |       type: "file"
42 |       example: "dataset_censored.h5ad"
43 |       description: "The censored shuffled test mod1 profiles."
44 |       required: true
45 |     - name: "--input_test_mod2"
46 |       type: "file"
47 |       example: "dataset_censored.h5ad"
48 |       description: "The censored shuffled test mod2 profiles."
49 |       required: true
50 |     - name: "--input_pretrain"
51 |       type: "file"
52 |       example: "pretrain_model"
53 |       description: Path to the directory containing a pretrained model.
54 |       required: true
55 | 
56 |     # required outputs
57 |     - name: "--output"
58 |       type: "file"
59 |       direction: "output"
60 |       example: "output.h5ad"
61 |       description: "The predicted pairing of test mod1&mod2 profiles."
62 |       required: true
63 |       
64 |   # files your script needs
65 |   resources:
66 |     - type: python_script
67 |       path: script.py
68 |     - path: ../resources/catalyst_tools.py
69 |     - path: ../resources/config_ADT2GEX.py
70 |     - path: ../resources/config_ATAC2GEX.py
71 |     - path: ../resources/data.py
72 |     - path: ../resources/models.py
73 |     - path: ../resources/postprocessing.py
74 |     - path: ../resources/preprocessing.py
75 |       
76 | # target platforms
77 | platforms:
78 |   - type: docker
79 |     image: "pytorch/pytorch:1.9.0-cuda10.2-cudnn7-runtime"
80 |     run_args: [ "--gpus all --shm-size=5G" ]
81 |     setup:
82 |       - type: python
83 |         packages:
84 |           - catalyst
85 |           - anndata
86 |           - scikit-learn
87 |           - networkx
88 | 
89 |   - type: nextflow
90 |     labels: [ vhighmem, vvhightime, vhighcpu, gpu]
91 | 


--------------------------------------------------------------------------------
/src/match_modality/methods/novel/test.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash"
  4 | export NXF_VER=21.04.1
  5 | export PIPELINE_VERSION=1.4.0
  6 | method_id=novel
  7 | task_id=match_modality
  8 | 
  9 | 
 10 | # CITE ADT2GEX
 11 | dataset_id=openproblems_bmmc_cite_phase2_mod2
 12 | dataset_id_val=openproblems_bmmc_cite_phase2_mod2
 13 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
 14 | dataset_path_val=output/datasets/$task_id/$dataset_id_val/$dataset_id_val.censor_dataset
 15 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
 16 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
 17 | 
 18 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
 19 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 20 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 21 |   --input_train_sol ${dataset_path}.output_train_sol.h5ad \
 22 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
 23 |   --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \
 24 |   --input_test_sol ${dataset_path}.output_test_sol.h5ad \
 25 |   --output_pretrain ${pretrain_path}
 26 |   
 27 | target/docker/${task_id}_methods/${method_id}/${method_id} \
 28 |   --input_train_mod1 ${dataset_path_val}.output_train_mod1.h5ad \
 29 |   --input_train_mod2 ${dataset_path_val}.output_train_mod2.h5ad \
 30 |   --input_train_sol ${dataset_path_val}.output_train_sol.h5ad \
 31 |   --input_test_mod1 ${dataset_path_val}.output_test_mod1.h5ad \
 32 |   --input_test_mod2 ${dataset_path_val}.output_test_mod2.h5ad \
 33 |   --input_pretrain ${pretrain_path} \
 34 |   --output ${pred_path}.${method_id}.output.h5ad
 35 |   
 36 | #CITE GEX2ADT
 37 | dataset_id=openproblems_bmmc_cite_phase2_rna
 38 | pretrain_dataset_id=openproblems_bmmc_cite_phase2_mod2
 39 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
 40 | pretrain_path=output/pretrain/$task_id/$method_id/$pretrain_dataset_id.${method_id}_train.output_pretrain/
 41 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
 42 | 
 43 | target/docker/${task_id}_methods/${method_id}/${method_id} \
 44 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 45 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 46 |   --input_train_sol ${dataset_path}.output_train_sol.h5ad \
 47 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
 48 |   --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \
 49 |   --input_pretrain ${pretrain_path} \
 50 |   --output ${pred_path}.${method_id}.output.h5ad
 51 | 
 52 | 
 53 | 
 54 | # MULTIOME ATAC2GEX
 55 | dataset_id=openproblems_bmmc_multiome_phase2_mod2
 56 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
 57 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
 58 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
 59 | 
 60 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
 61 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 62 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 63 |   --input_train_sol ${dataset_path}.output_train_sol.h5ad \
 64 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
 65 |   --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \
 66 |   --input_test_sol ${dataset_path}.output_test_sol.h5ad \
 67 |   --output_pretrain ${pretrain_path}
 68 |   
 69 | target/docker/${task_id}_methods/${method_id}/${method_id} \
 70 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 71 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 72 |   --input_train_sol ${dataset_path}.output_train_sol.h5ad \
 73 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
 74 |   --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \
 75 |   --input_pretrain ${pretrain_path} \
 76 |   --output ${pred_path}.${method_id}.output.h5ad
 77 | 
 78 | # MULTIOME GEX2ATAC
 79 | dataset_id=openproblems_bmmc_multiome_phase2_rna
 80 | pretrain_dataset_id=openproblems_bmmc_multiome_phase2_mod2
 81 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
 82 | pretrain_path=output/pretrain/$task_id/$method_id/$pretrain_dataset_id.${method_id}_train.output_pretrain/
 83 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
 84 | 
 85 | target/docker/${task_id}_methods/${method_id}/${method_id} \
 86 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 87 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 88 |   --input_train_sol ${dataset_path}.output_train_sol.h5ad \
 89 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
 90 |   --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \
 91 |   --input_pretrain ${pretrain_path} \
 92 |   --output ${pred_path}.${method_id}.output.h5ad
 93 | 
 94 | # RUN EVALUATION
 95 | bin/nextflow run "$PIPELINE_REPO" \
 96 |   -r "$PIPELINE_VERSION" \
 97 |   -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \
 98 |   --solutionDir "output/datasets/$task_id" \
 99 |   --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \
100 |   --publishDir "output/evaluation/$task_id/$method_id/" \
101 |   -latest \
102 |   -resume \
103 |   -c "src/resources/nextflow_moremem.config"
104 | 
105 | cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json"


--------------------------------------------------------------------------------
/src/match_modality/methods/novel/train/config.vsh.yaml:
--------------------------------------------------------------------------------
 1 | functionality:
 2 |   name: novel_train
 3 |   namespace: match_modality_methods
 4 |   
 5 |   # metadata for your method
 6 |   
 7 |   description: The approach utilizes sample representations, learned in the same way as in the CLIP model. Encoders for all of the modalities are fully connected, the dimensionality of GEX and ATAC data is reduced via LSI transform (ADT is left as-is). Then, to obtain sample pairings, a maximum weight matching on a bipartite graph is performed, where weights are cosine similarities between sample embeddings.
 8 |   
 9 |   authors:
10 |     - name: Gleb Ryazantsev
11 |       email: ryazantsev.gleb@gmail.com
12 |       roles: [ author, maintainer ]
13 |     - name: Nikolay Russkikh
14 |       email: russkikh.nikolay@gmail.com
15 |       roles: [ author, maintainer ]
16 |     - name: Igor I
17 |       email: herri.i.67@gmail.com
18 |       roles: [ author, maintainer ]
19 |       
20 |   # parameters
21 |   arguments:
22 |     # required inputs
23 |     - name: "--input_train_mod1"
24 |       type: "file"
25 |       example: "dataset_mod1.h5ad"
26 |       description: Censored dataset, training cells.
27 |       required: true
28 |     - name: "--input_train_mod2"
29 |       type: "file"
30 |       example: "dataset_mod2.h5ad"
31 |       description: Censored dataset.
32 |       required: true
33 |     - name: "--input_train_sol"
34 |       type: "file"
35 |       example: "dataset_solution.h5ad"
36 |       description: "The pairing of train mod1&mod2 profiles."
37 |       required: true
38 |     - name: "--input_test_mod1"
39 |       type: "file"
40 |       example: "dataset_test_mod1.h5ad"
41 |       description: Censored dataset, test cells.
42 |       required: true
43 |     - name: "--input_test_mod2"
44 |       type: "file"
45 |       example: "dataset_test_mod2.h5ad"
46 |       description: Censored dataset.
47 |       required: true
48 |     - name: "--input_test_sol"
49 |       type: "file"
50 |       example: "dataset_solution.h5ad"
51 |       description: "The pairing of test mod1&mod2 profiles."
52 |       required: true
53 | 
54 |     # required outputs
55 |     - name: "--output_pretrain"
56 |       type: "file"
57 |       direction: "output"
58 |       example: "pretrain_model"
59 |       description: Path to the directory containing a pretrained model.
60 |       required: true
61 |       
62 |   # files your script needs
63 |   resources:
64 |     - type: python_script
65 |       path: script.py
66 |     - path: ../resources/catalyst_tools.py
67 |     - path: ../resources/config_ADT2GEX.py
68 |     - path: ../resources/config_ATAC2GEX.py
69 |     - path: ../resources/data.py
70 |     - path: ../resources/models.py
71 |     - path: ../resources/postprocessing.py
72 |     - path: ../resources/preprocessing.py
73 |       
74 | # target platforms
75 | platforms:
76 |   - type: docker
77 |     image: "pytorch/pytorch:1.9.0-cuda10.2-cudnn7-runtime"
78 |     run_args: [ "--gpus all --shm-size=5G" ]
79 |     setup:
80 |       - type: python
81 |         packages:
82 |           - catalyst
83 |           - anndata
84 |           - scikit-learn
85 |           - networkx
86 | 
87 |   - type: nextflow
88 |     labels: [ vhighmem, vvhightime, vhighcpu, gpu]
89 | 


--------------------------------------------------------------------------------
/src/predict_modality/methods/AXX/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
2 | 


--------------------------------------------------------------------------------
/src/predict_modality/methods/AXX/README.md:
--------------------------------------------------------------------------------
1 | # NeurIPS-Single-Cell-MultiModality
2 | 
3 | Team: [Xueer Chen](https://github.com/xuerchen), [Jiwei Liu](https://github.com/daxiongshu)
4 | 
5 | This folder contains our solution to the [OpenProblems-NeurIPS2021 Single-Cell Multimodal Data Integration](https://eval.ai/web/challenges/challenge-page/1111/overview) competition. Our team AXX took [4th place in the modality prediction task](https://eval.ai/web/challenges/challenge-page/1111/leaderboard/2860) in terms of the overall ranking across its 4 subtasks, namely `GEX to ADT`, `ADT to GEX`, `GEX to ATAC` and `ATAC to GEX`. Specifically, our methods ranked **3rd** in `GEX to ATAC` and **4th** in `GEX to ADT`. More details about the task can be found on the [competition webpage](https://openproblems.bio/neurips_docs/about_tasks/task1_modality_prediction/). 
6 | 
7 | 
8 | 


--------------------------------------------------------------------------------
/src/predict_modality/methods/AXX/resources/const.py:
--------------------------------------------------------------------------------
1 | PATH = '.'
2 | OUT_PATH = '.'
3 | 


--------------------------------------------------------------------------------
/src/predict_modality/methods/AXX/resources/models.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import pytorch_lightning as pl
 3 | import torch.nn as nn
 4 | import torch.nn.functional as F
 5 | from torch.utils.data import TensorDataset,DataLoader
 6 | 
 7 | class MLP(pl.LightningModule):
 8 |     def __init__(self,in_dim,out_dim,ymean,config):
 9 |         super(MLP, self).__init__()
10 |         self.ymean = ymean.cuda()
11 |         H1 = config.H1
12 |         H2 = config.H2
13 |         p = config.dropout
14 |         self.config = config
15 |         self.fc1 = nn.Linear(in_dim, H1)
16 |         self.fc2 = nn.Linear(H1,H2)
17 |         self.fc3 = nn.Linear(H1+H2, out_dim)
18 |         self.dp2 = nn.Dropout(p=p)
19 | 
20 |     def forward(self, x):
21 |         x0 = x
22 |         x1 = F.relu(self.fc1(x))
23 |         x1 = self.dp2(x1)
24 |         x = F.relu(self.fc2(x1))
25 |         x = torch.cat([x,x1],dim=1)
26 |         x = self.fc3(x)
27 |         x = self.apply_mask(x)
28 |         return x
29 |     
30 |     def apply_mask(self,yp):
31 |         tmp = torch.ones_like(yp).float()*self.ymean
32 |         mask = tmp<self.config.threshold
33 |         mask = mask.float()
34 |         return yp*(1-mask) + tmp*mask
35 |     
36 |     def training_step(self, batch, batch_nb):
37 |         x,y = batch
38 |         yp = self(x)
39 |         criterion = nn.MSELoss()
40 |         loss = criterion(yp, y)
41 |         self.log('train_loss', loss, prog_bar=True)
42 |         return loss
43 |     
44 |     def validation_step(self, batch, batch_idx):
45 |         x,y = batch
46 |         yp = self(x)
47 |         criterion = nn.MSELoss()
48 |         loss = criterion(yp, y)
49 |         self.log('valid_RMSE', loss**0.5, prog_bar=True)
50 |         return loss
51 |     
52 |     def predict_step(self, batch, batch_idx):
53 |         if len(batch) == 2:
54 |             x,_ = batch
55 |         else:
56 |             x = batch
57 |         return self(x)
58 |     
59 |     def configure_optimizers(self):
60 |         lr = self.config.lr
61 |         wd = float(self.config.wd)
62 |         adam = torch.optim.Adam(self.parameters(), lr=lr, weight_decay=wd)
63 |         if self.config.lr_schedule == 'adam':
64 |             return adam
65 |         elif self.config.lr_schedule == 'adam_cosin':
66 |             slr = torch.optim.lr_scheduler.CosineAnnealingLR(adam, self.config.epochs)
67 |             return [adam], [slr]
68 |         else:
69 |             assert 0


--------------------------------------------------------------------------------
/src/predict_modality/methods/AXX/resources/predict.py:
--------------------------------------------------------------------------------
 1 | from glob import glob
 2 | import os
 3 | import torch
 4 | import utils
 5 | import anndata as ad
 6 | from torch.utils.data import TensorDataset,DataLoader
 7 | from pathlib import Path
 8 | import numpy as np
 9 | from models import MLP
10 | from const import PATH, OUT_PATH
11 | 
12 | 
13 | def _predict(model,dl):
14 |     model = model.cuda()
15 |     model.eval()
16 |     yps = []
17 |     for x in dl:
18 |         with torch.no_grad():
19 |             yp = model(x[0].cuda())
20 |             yps.append(yp.detach().cpu().numpy())
21 |     yp = np.vstack(yps)
22 |     return yp
23 |             
def predict(ymean,test_data_path,folds,cp,wp):
    """Average the predictions of the per-fold checkpoints on a test file.

    Args:
        ymean: NumPy array of per-output means, handed to each ``MLP``.
        test_data_path: path of the ``.h5ad`` file whose ``X`` is predicted on.
        folds: fold indices whose checkpoints are ensembled.
        cp: directory containing ``yaml/mlp_<task>.yaml``.
        wp: directory containing ``<task>_fold_<fold>/version_0/checkpoints/``.

    Returns:
        NumPy array with the mean prediction over all folds.

    Raises:
        ValueError: if ``folds`` is empty.
        FileNotFoundError: if a fold has no checkpoint file.
    """
    if len(folds) == 0:
        raise ValueError("`folds` must contain at least one fold index")

    ymean = torch.from_numpy(ymean).float()
    y_dim,task = utils.get_y_dim(test_data_path)
    yaml_path=f"{cp}/yaml/mlp_{task}.yaml"
    config = utils.load_yaml(yaml_path)
    te1 = ad.read_h5ad(test_data_path)
    # Accept both sparse and already-dense matrices.
    X = te1.X.toarray() if hasattr(te1.X, 'toarray') else np.asarray(te1.X)
    X = torch.from_numpy(X).float()
    
    # The loader does not depend on the fold, so build it once.
    te_ds = TensorDataset(X)
    te_loader = DataLoader(te_ds, batch_size=config.batch_size,num_workers=0,
                    shuffle=False, drop_last=False)
    
    yp = 0
    for fold in folds:
        load_path = f'{wp}/{task}_fold_{fold}/version_0/checkpoints/*'
        print(load_path)
        # glob order is arbitrary; sort so the chosen checkpoint is deterministic.
        ckpts = sorted(glob(load_path))
        if not ckpts:
            raise FileNotFoundError(f"No checkpoint found matching {load_path}")
        model_inf = MLP.load_from_checkpoint(ckpts[0],in_dim=X.shape[1],
                                             out_dim=y_dim,
                                             ymean=ymean,
                                             config=config)
        yp = yp + _predict(model_inf, te_loader)
    return yp/len(folds)
48 | 
if __name__ == '__main__':
    # This module defines no standalone entry point (the previously called
    # `sanity_check` does not exist here and raised NameError); exit with a
    # clear message instead.
    raise SystemExit("predict.py is a library module: import `predict` from it "
                     "instead of running it directly.")
51 | 


--------------------------------------------------------------------------------
/src/predict_modality/methods/AXX/resources/test.py:
--------------------------------------------------------------------------------
 1 | from os import path
 2 | import subprocess
 3 | import anndata as ad
 4 | from scipy.sparse import issparse
 5 | 
## VIASH START
# This code block will be replaced by viash at runtime.
meta = { 'functionality_name': 'python_starter_kit' }
## VIASH END

# The component under test is expected as an executable named after the
# functionality, located in the current working directory.
method_id = meta['functionality_name']
command = "./" + method_id

# Build the sample-data file names from the dataset tag and modality suffix.

tag = 'cite'
#tag = 'multiome'
#mod = 'rna'
mod = 'mod2'
# NOTE(review): the two "train" entries point at the *test* h5ad files
# (`output_test_mod1` / `output_test_mod2`) -- presumably to keep the sample
# data small; confirm this is intended.
testpar = {
    'input_train_mod1': f'sample_data/openproblems_bmmc_{tag}_phase1v2_{mod}/openproblems_bmmc_{tag}_phase1v2_{mod}.censor_dataset.output_test_mod1.h5ad',
    'input_train_mod2': f'sample_data/openproblems_bmmc_{tag}_phase1v2_{mod}/openproblems_bmmc_{tag}_phase1v2_{mod}.censor_dataset.output_test_mod2.h5ad',
    'input_test_mod1': f'sample_data/openproblems_bmmc_{tag}_phase1v2_{mod}/openproblems_bmmc_{tag}_phase1v2_{mod}.censor_dataset.output_test_mod1.h5ad',
    'input_test_mod2': f'sample_data/openproblems_bmmc_{tag}_phase1v2_{mod}/openproblems_bmmc_{tag}_phase1v2_{mod}.censor_dataset.output_test_mod2.h5ad',
   "output": "output.h5ad"
}

# Run the component; check_output raises CalledProcessError on a non-zero exit.
# NOTE(review): no `--input_pretrain` is passed although the run component's
# config.vsh.yaml marks that argument as required -- confirm this test can pass.
print("> Running method")
out = subprocess.check_output([
  command,
  "--input_train_mod1", testpar['input_train_mod1'],
  "--input_train_mod2", testpar['input_train_mod2'],
  "--input_test_mod1", testpar['input_test_mod1'],
  "--output", testpar['output']
]).decode("utf-8")

print("> Checking whether output files were created")
assert path.exists(testpar['output'])

# `input_test_mod2` serves as the reference the prediction is compared against.
print("> Reading h5ad files")
ad_sol = ad.read_h5ad(testpar['input_test_mod2'])
ad_pred = ad.read_h5ad(testpar['output'])

print("> Checking dataset id")
assert ad_pred.uns['dataset_id'] == ad_sol.uns['dataset_id']

print("> Checking method id", ad_pred.uns['method_id'], method_id)
assert ad_pred.uns['method_id'] == method_id

# The prediction must be sparse and match the reference in shape and in
# observation/variable names (including their order).
print("> Checking X")
assert issparse(ad_pred.X)
assert ad_pred.n_obs == ad_sol.n_obs
assert ad_pred.n_vars == ad_sol.n_vars
assert all(ad_pred.obs_names == ad_sol.obs_names)
assert all(ad_pred.var_names == ad_sol.var_names)

print("> Test succeeded!")


--------------------------------------------------------------------------------
/src/predict_modality/methods/AXX/resources/train.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import pytorch_lightning as pl
 3 | from torch.utils.data import TensorDataset,DataLoader
 4 | from models import MLP
 5 | from pytorch_lightning.callbacks import ModelCheckpoint
 6 | from pytorch_lightning.loggers import TensorBoardLogger,WandbLogger
 7 | from pathlib import Path
 8 | import utils
 9 | import anndata as ad
10 | import numpy as np
11 | import json
12 | from const import PATH, OUT_PATH
13 | 
def _train(X, y, Xt, yt, enable_ckpt, logger, yaml_path):
    """Fit one MLP on (X, y), validate on (Xt, yt) and return its RMSE.

    Args:
        X, y: NumPy arrays with training inputs / targets.
        Xt, yt: NumPy arrays with validation inputs / targets.
        enable_ckpt: when true, train for ``config.epochs`` epochs with a
            checkpoint callback monitoring ``valid_RMSE`` and predict from the
            best checkpoint; otherwise train a single epoch without callbacks.
        logger: logger object handed to ``pl.Trainer``.
        yaml_path: path of the YAML config read through ``utils.load_yaml``.

    Returns:
        ``(score, predictions)``: the validation RMSE and the validation
        predictions as a NumPy array.
    """
    config = utils.load_yaml(yaml_path)

    # Convert everything to float tensors.
    X = torch.from_numpy(X).float()
    y = torch.from_numpy(y).float()
    Xt = torch.from_numpy(Xt).float()
    yt = torch.from_numpy(yt).float()
    ymean = torch.mean(y, dim=0, keepdim=True)

    # Training loader: no worker processes when the config path contains 'ATAC'.
    if 'ATAC' in yaml_path:
        n_workers = 0
    else:
        n_workers = 4
    tr_loader = DataLoader(TensorDataset(X, y),
                           batch_size=config.batch_size,
                           num_workers=n_workers,
                           shuffle=True,
                           drop_last=True)

    # Validation loader keeps order and every sample.
    te_loader = DataLoader(TensorDataset(Xt, yt),
                           batch_size=config.batch_size,
                           num_workers=0,
                           shuffle=False,
                           drop_last=False)

    checkpoint_callback = ModelCheckpoint(monitor='valid_RMSE')
    epochs = config.epochs if enable_ckpt else 1
    cb = [checkpoint_callback] if enable_ckpt else None

    trainer = pl.Trainer(enable_checkpointing=enable_ckpt,
                         logger=logger,
                         gpus=1,
                         max_epochs=epochs,
                         callbacks=cb,
                         progress_bar_refresh_rate=5)

    net = MLP(X.shape[1], y.shape[1], ymean, config)
    trainer.fit(net, tr_loader, te_loader)

    # Predict from the best checkpoint only when checkpointing was enabled.
    if enable_ckpt:
        ckpt_choice = 'best'
    else:
        ckpt_choice = None
    batches = trainer.predict(net, te_loader, ckpt_path=ckpt_choice)
    yp = torch.cat(batches, dim=0)

    score = ((yp - yt) ** 2).mean() ** 0.5
    print(f"VALID RMSE {score:.3f}")
    del trainer
    return score, yp.detach().numpy()
55 | 
56 | 
def train(task,cp,wp,tr1,tr2):
    """Train one MLP per fold (three folds) and report the validation RMSE.

    Args:
        task: task tag; selects ``{cp}/yaml/mlp_{task}.yaml`` and names each
            fold's run directory.
        cp: directory containing the ``yaml`` config folder.
        wp: root directory; each fold logs into ``{wp}/{task}_fold_{fold}``.
        tr1, tr2: training data forwarded unchanged to ``utils.split``
            (presumably the mod1/mod2 training sets -- confirm against utils).

    Returns:
        dict mapping ``'Fold <i>'`` and ``'Overall'`` to the RMSE formatted
        with three decimals. (This summary used to be built and discarded.)
    """
    yaml_path = f'{cp}/yaml/mlp_{task}.yaml'
    scores = []

    msgs = {}
    for fold in range(3):

        run_name = f"{task}_fold_{fold}"
        save_path = f'{wp}/{run_name}'
        Path(save_path).mkdir(parents=True, exist_ok=True)   

        X,y,Xt,yt = utils.split(tr1, tr2, fold)
        logger = TensorBoardLogger(save_path, name='') 
        
        enable_ckpt = True
        
        # The per-fold predictions are not needed here, only the score.
        score, _ = _train(X, y, Xt, yt, enable_ckpt, logger, yaml_path)
        # Store a plain float so np.mean does not depend on tensor-to-array
        # conversion of the returned score.
        score = float(score)
        scores.append(score)
        msg = f"{task} Fold {fold} RMSE {score:.3f}"
        msgs[f'Fold {fold}'] = f'{score:.3f}'
        print(msg)

    score = np.mean(scores)
    msgs['Overall'] = f'{score:.3f}'
    print('Overall', f'{score:.3f}')
    return msgs


--------------------------------------------------------------------------------
/src/predict_modality/methods/AXX/resources/yaml/mlp_ADT2GEX.yaml:
--------------------------------------------------------------------------------
 1 | # sample config defaults file
 2 | epochs:
 3 |   desc: Number of epochs to train over
 4 |   value: 10
 5 | batch_size:
 6 |   desc: Size of each mini-batch
 7 |   value: 512
 8 | H1:
 9 |   desc: Number of hidden neurons in 1st layer of MLP
10 |   value: 256
11 | H2:
12 |   desc: Number of hidden neurons in 2nd layer of MLP
13 |   value: 128
14 | dropout:
15 |   desc: probs of zeroing values
16 |   value: 0
17 | lr:
18 |   desc: learning rate
19 |   value: 0.001
20 | wd:
21 |   desc: weight decay
22 |   value: 1e-5
23 | threshold:
24 |   desc: outputs whose training mean is below this threshold are predicted as that mean
25 |   value: 0
26 | lr_schedule:
27 |   desc: learning rate scheduler
28 |   value: adam


--------------------------------------------------------------------------------
/src/predict_modality/methods/AXX/resources/yaml/mlp_ATAC2GEX.yaml:
--------------------------------------------------------------------------------
 1 | # sample config defaults file
 2 | epochs:
 3 |   desc: Number of epochs to train over
 4 |   value: 10
 5 | batch_size:
 6 |   desc: Size of each mini-batch
 7 |   value: 512
 8 | H1:
 9 |   desc: Number of hidden neurons in 1st layer of MLP
10 |   value: 256
11 | H2:
12 |   desc: Number of hidden neurons in 2nd layer of MLP
13 |   value: 128
14 | dropout:
15 |   desc: probs of zeroing values
16 |   value: 0.5
17 | lr:
18 |   desc: learning rate
19 |   value: 0.001
20 | wd:
21 |   desc: weight decay
22 |   value: 1e-5
23 | threshold:
24 |   desc: outputs whose training mean is below this threshold are predicted as that mean
25 |   value: 0
26 | lr_schedule:
27 |   desc: learning rate scheduler
28 |   value: adam


--------------------------------------------------------------------------------
/src/predict_modality/methods/AXX/resources/yaml/mlp_GEX2ADT.yaml:
--------------------------------------------------------------------------------
 1 | # sample config defaults file
 2 | epochs:
 3 |   desc: Number of epochs to train over
 4 |   value: 10 
 5 | batch_size:
 6 |   desc: Size of each mini-batch
 7 |   value: 512
 8 | H1:
 9 |   desc: Number of hidden neurons in 1st layer of MLP
10 |   value: 1024 
11 | H2:
12 |   desc: Number of hidden neurons in 2nd layer of MLP
13 |   value: 512
14 | dropout:
15 |   desc: probs of zeroing values
16 |   value: 0
17 | lr:
18 |   desc: learning rate
19 |   value: 0.001
20 | wd:
21 |   desc: weight decay
22 |   value: 1e-5
23 | threshold:
24 |   desc: outputs whose training mean is below this threshold are predicted as that mean
25 |   value: 0.05
26 | lr_schedule:
27 |   desc: learning rate scheduler
28 |   value: adam_cosin


--------------------------------------------------------------------------------
/src/predict_modality/methods/AXX/run/config.vsh.yaml:
--------------------------------------------------------------------------------
  1 | functionality:
  2 |   name: simplemlp
  3 |   namespace: predict_modality_methods
  4 |   
  5 |   # metadata for your method
  6 |   description: Ensemble of MLPs trained on different sites
  7 |   info:
  8 |     method_label: SimpleMLP
  9 |     submission_id: "170812"
 10 |     team_name: AXX
 11 |     # project_url: https://github.com/foo/bar
 12 |     # publication_doi: 10.1101/0123.45.67.890123
 13 |     # publication_url: https://arxiv.org/abs/1234.56789
 14 | 
 15 |   authors:
 16 |     - name: Xueer Chen
 17 |       email: xc2579@columbia.edu
 18 |       roles: [ author, maintainer ]
 19 |       props: { github: xuerchen }
 20 |     - name: Jiwei Liu
 21 |       email: jiweil@nvidia.com
 22 |       roles: [ author, maintainer ]
 23 |       props: { github: daxiongshu, orcid: "0000-0002-8799-9763" }
 24 |       
 25 |   # parameters
 26 |   arguments:
 27 |     # required inputs
 28 |     - name: "--input_train_mod1"
 29 |       type: "file"
 30 |       example: "dataset_mod1.h5ad"
 31 |       description: Censored dataset, training cells.
 32 |       required: true
 33 |     - name: "--input_test_mod1"
 34 |       type: "file"
 35 |       example: "dataset_mod1.h5ad"
 36 |       description: Censored dataset, test cells.
 37 |       required: true
 38 |     - name: "--input_train_mod2"
 39 |       type: "file"
 40 |       example: "dataset_mod2.h5ad"
 41 |       description: Censored dataset.
 42 |       required: true
 43 |     - name: "--input_pretrain"
 44 |       type: "file"
 45 |       direction: "output"
 46 |       example: "pretrain_model"
 47 |       description: Path to the directory containing a pretrained model.
 48 |       required: true
 49 |     # required outputs
 50 |     - name: "--output"
 51 |       type: "file"
 52 |       direction: "output"
 53 |       example: "output.h5ad"
 54 |       description: Dataset with predicted values for modality2.
 55 |       required: true
 56 |    
 57 |       
 58 |   # files your script needs
 59 |   resources:
 60 |     - type: python_script
 61 |       path: script.py
 62 |     - path: ../resources/predict.py
 63 |     - path: ../resources/models.py
 64 |     - path: ../resources/utils.py
 65 |     - path: ../resources/const.py
 66 |     - path: ../resources/yaml
 67 |   
 68 |   # resources for unit testing your component
 69 |   tests:
 70 |     - type: python_script
 71 |       path: test.py
 72 |     - path: sample_data
 73 |       
 74 | # target platforms
 75 | platforms:
 76 | 
 77 |   # By specifying 'docker' platform, viash will build a standalone
 78 |   # executable which uses docker in the back end to run your method.
 79 |   - type: docker
 80 |     # you need to specify a base image that contains at least bash and python
 81 |     image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
 82 |     run_args: [ "--gpus all --ipc=host"]
 83 |     # You can specify additional dependencies with 'setup'. 
 84 |     # See https://viash.io/docs/reference_config/platform-docker/#setup-list
 85 |     # for more information on how to add more dependencies.
 86 |     setup:
 87 |       # - type: apt
 88 |       #   packages:
 89 |       #     - bash
 90 |       # - type: python
 91 |       #   packages:
 92 |       #     - scanpy
 93 |       - type: python
 94 |         packages:
 95 |           - scikit-learn
 96 |           - anndata
 97 |           - scanpy
 98 |           - pytorch-lightning
 99 | 
100 |   # By specifying a 'nextflow', viash will also build a viash module
101 |   # which uses the docker container built above to also be able to 
102 |   # run your method as part of a nextflow pipeline.
103 |   - type: nextflow
104 |     labels: [ highmem, hightime, highcpu, gpu]
105 | 
106 |   # used for saturn cloud
107 |   - type: native
108 | 


--------------------------------------------------------------------------------
/src/predict_modality/methods/AXX/run/script.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import anndata as ad
 3 | import sys
 4 | from scipy.sparse import csc_matrix
 5 | 
 6 | from sklearn.decomposition import TruncatedSVD
 7 | from sklearn.linear_model import LinearRegression
 8 | import numpy as np
 9 | 
10 | logging.basicConfig(level=logging.INFO)
11 | 
## VIASH START
# Anything within this block will be removed by `viash` and will be
# replaced with the parameters as specified in your config.vsh.yaml.
par = {
    'input_train_mod1': 'output/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_train_mod1.h5ad',
    'input_train_mod2': 'output/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_train_mod2.h5ad',
    'input_test_mod1': 'output/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_test_mod1.h5ad',
    'input_pretrain': 'path/to/model',
    'output': 'output.h5ad'
}
meta = {
    'resources_dir': 'src/predict_modality/methods/AXX/resources',
    # Needed further down for `uns['method_id']`; without it a standalone
    # (non-viash) run fails with KeyError. Matches `functionality.name`.
    'functionality_name': 'simplemlp'
}
## VIASH END
# The helper modules are shipped as resources, so make them importable first.
sys.path.append(meta['resources_dir'])
from predict import predict
from utils import get_y_dim

logging.info('Reading `h5ad` files...')
input_train_mod1 = ad.read_h5ad(par['input_train_mod1'])
input_train_mod2 = ad.read_h5ad(par['input_train_mod2'])
input_test_mod1 = ad.read_h5ad(par['input_test_mod1'])

y_dim,task = get_y_dim(par['input_test_mod1'])
# Column means of the training targets.
ymean = np.asarray(input_train_mod2.X.mean(axis=0))
if task == 'GEX2ATAC':
    # No model is used for this task: every test cell is predicted as the
    # column means of the training targets.
    y_pred = ymean*np.ones([input_test_mod1.shape[0],y_dim])
else:
    # Ensemble of the three per-fold checkpoints found under `input_pretrain`.
    # NOTE(review): config.vsh.yaml declares `--input_pretrain` with
    # direction "output" although it is only read here -- confirm.
    y_pred = predict(ymean,test_data_path=par['input_test_mod1'],
                     folds=[0,1,2],cp=meta['resources_dir'],
                     wp=par['input_pretrain'])

y_pred = csc_matrix(y_pred)

adata = ad.AnnData(
    X=y_pred,
    obs=input_test_mod1.obs,
    var=input_train_mod2.var,
    uns={
        'dataset_id': input_train_mod1.uns['dataset_id'],
        'method_id': meta['functionality_name'],
    },
)

logging.info('Storing annotated data...')
adata.write_h5ad(par['output'], compression = "gzip")
58 | 


--------------------------------------------------------------------------------
/src/predict_modality/methods/AXX/test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash"
 4 | export NXF_VER=21.04.1
 5 | export PIPELINE_VERSION=1.4.0
 6 | method_id=simplemlp
 7 | task_id=predict_modality
 8 | 
 9 | # CITE GEX2ADT
10 | dataset_id=openproblems_bmmc_cite_phase2_rna
11 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
12 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
13 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
14 | 
15 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
16 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
17 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
18 |   --output_pretrain ${pretrain_path}
19 | 
20 | target/docker/${task_id}_methods/${method_id}/${method_id} \
21 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
22 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
23 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
24 |   --input_pretrain ${pretrain_path} \
25 |   --output ${pred_path}.${method_id}.output.h5ad
26 | 
27 | # CITE ADT2GEX
28 | dataset_id=openproblems_bmmc_cite_phase2_mod2
29 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
30 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
31 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
32 | 
33 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
34 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
35 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
36 |   --output_pretrain ${pretrain_path}
37 | 
38 | target/docker/${task_id}_methods/${method_id}/${method_id} \
39 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
40 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
41 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
42 |   --input_pretrain ${pretrain_path} \
43 |   --output ${pred_path}.${method_id}.output.h5ad
44 | 
45 | # MULTIOME GEX2ATAC
46 | dataset_id=openproblems_bmmc_multiome_phase2_rna
47 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
48 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
49 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
50 | 
51 | target/docker/${task_id}_methods/${method_id}/${method_id} \
52 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
53 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
54 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
55 |   --input_pretrain ${pretrain_path} \
56 |   --output ${pred_path}.${method_id}.output.h5ad
57 | 
58 | # MULTIOME ATAC2GEX
59 | dataset_id=openproblems_bmmc_multiome_phase2_mod2
60 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
61 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
62 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
63 | 
64 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
65 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
66 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
67 |   --output_pretrain ${pretrain_path}
68 | 
69 | target/docker/${task_id}_methods/${method_id}/${method_id} \
70 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
71 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
72 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
73 |   --input_pretrain ${pretrain_path} \
74 |   --output ${pred_path}.${method_id}.output.h5ad
75 | 
76 | # RUN EVALUATION
77 | bin/nextflow run "$PIPELINE_REPO" \
78 |   -r "$PIPELINE_VERSION" \
79 |   -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \
80 |   --solutionDir "output/datasets/$task_id" \
81 |   --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \
82 |   --publishDir "output/evaluation/$task_id/$method_id/" \
83 |   -latest \
84 |   -resume \
85 |   -c "src/resources/nextflow_moremem.config"
86 | 
87 | cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json"


--------------------------------------------------------------------------------
/src/predict_modality/methods/AXX/train/config.vsh.yaml:
--------------------------------------------------------------------------------
 1 | functionality:
 2 |   name: simplemlp_train
 3 |   namespace: predict_modality_methods
 4 |   
 5 |   # metadata for your method
 6 |   description: Ensemble of MLPs trained on different sites
 7 |   info:
 8 |     submission_id: "170812"
 9 |     team_name: AXX
10 |     # project_url: https://github.com/foo/bar
11 |     # publication_doi: 10.1101/0123.45.67.890123
12 |     # publication_url: https://arxiv.org/abs/1234.56789
13 | 
14 |   authors:
15 |     - name: Xueer Chen
16 |       email: xc2579@columbia.edu
17 |       roles: [ author, maintainer ]
18 |       props: { github: xuerchen, orcid: "0000-0000-0000-0000" }
19 |     - name: Jiwei Liu
20 |       email: jiweil@nvidia.com
21 |       roles: [ author, maintainer ]
22 |       props: { github: daxiongshu, orcid: "0000-0002-8799-9763" }
23 | 
24 |       
25 |   # parameters
26 |   arguments:
27 |     # required inputs
28 |     - name: "--input_train_mod1"
29 |       type: "file"
30 |       example: "dataset_mod1.h5ad"
31 |       description: Censored dataset, training cells.
32 |       required: true
33 |     - name: "--input_train_mod2"
34 |       type: "file"
35 |       example: "dataset_mod2.h5ad"
36 |       description: Censored dataset.
37 |       required: true
38 |     # required outputs
39 |     - name: "--output_pretrain"
40 |       type: "file"
41 |       direction: "output"
42 |       example: "pretrain_model"
43 |       description: Path to the directory containing a pretrained model.
44 |       required: true
45 |       
46 |   # files your script needs
47 |   resources:
48 |     - type: python_script
49 |       path: script.py
50 |     - path: ../resources/train.py
51 |     - path: ../resources/models.py
52 |     - path: ../resources/utils.py
53 |     - path: ../resources/const.py
54 |     - path: ../resources/yaml
55 | 
56 | # target platforms
57 | platforms:
58 | 
59 |   # By specifying 'docker' platform, viash will build a standalone
60 |   # executable which uses docker in the back end to run your method.
61 |   - type: docker
62 |     # you need to specify a base image that contains at least bash and python
63 |     image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
64 |     run_args: [ "--gpus all --ipc=host"]
65 |     # You can specify additional dependencies with 'setup'. 
66 |     # See https://viash.io/docs/reference_config/platform-docker/#setup-list
67 |     # for more information on how to add more dependencies.
68 |     setup:
69 |       # - type: apt
70 |       #   packages:
71 |       #     - bash
72 |       # - type: python
73 |       #   packages:
74 |       #     - scanpy
75 |       - type: python
76 |         packages:
77 |           - scikit-learn
78 |           - anndata
79 |           - scanpy
80 |           - pytorch-lightning
81 | 
82 |   # By specifying a 'nextflow', viash will also build a viash module
83 |   # which uses the docker container built above to also be able to 
84 |   # run your method as part of a nextflow pipeline.
85 |   - type: nextflow
86 |     labels: [ highmem, hightime, highcpu, gpu]
87 | 
88 |   # used for saturn cloud
89 |   - type: native
90 | 


--------------------------------------------------------------------------------
/src/predict_modality/methods/AXX/train/script.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import logging
 3 | import anndata as ad
 4 | import pickle
 5 | import numpy as np
 6 | import pandas as pd
 7 | import scanpy as sc
 8 | from sklearn.preprocessing import binarize
 9 | 
10 | logging.basicConfig(level=logging.INFO)
11 | 
12 | ## VIASH START
13 | par = {
14 |     'input_train_mod1': 'output/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_train_mod1.h5ad',
15 |     'input_train_mod2': 'output/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_train_mod2.h5ad',
16 |     'output_pretrain': 'path/to/model'
17 | }
18 | 
19 | meta = {
20 |     'resources_dir': 'src/predict_modality/methods/AXX/resources'
21 | }
22 | ## VIASH END
23 | 
24 | import sys
25 | sys.path.append(meta['resources_dir'])
26 | from train import train
27 | 
28 | 
29 | input_train_mod1 = ad.read_h5ad(par['input_train_mod1'])
30 | input_train_mod2 = ad.read_h5ad(par['input_train_mod2'])
31 | 
32 | mod_1 = input_train_mod1.var["feature_types"][0]
33 | mod_2 = input_train_mod2.var["feature_types"][0]
34 | 
35 | os.makedirs(par['output_pretrain'], exist_ok=True)
36 | 
37 | task = f'{mod_1}2{mod_2}'
38 | train(task,cp=meta['resources_dir'],
39 |       wp=par['output_pretrain'],
40 |       tr1=input_train_mod1,
41 |       tr2=input_train_mod2)


--------------------------------------------------------------------------------
/src/predict_modality/methods/DANCE/resources/baseline.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | from sklearn.decomposition import TruncatedSVD
 4 | from sklearn.linear_model import LinearRegression
 5 | 
 6 | def baseline_linear(input_train_mod1, input_train_mod2, input_test_mod1):
 7 |     '''Baseline method training a linear regressor on the input data'''
 8 | 
 9 |     # Do PCA on the input data
10 |     logging.info('Performing dimensionality reduction on modality 1 values...')
11 |     embedder_mod1 = TruncatedSVD(n_components=50)
12 |     X_train = embedder_mod1.fit_transform(input_train_mod1)
13 |     X_test = embedder_mod1.transform(input_test_mod1)
14 | 
15 |     logging.info('Performing dimensionality reduction on modality 2 values...')
16 |     embedder_mod2 = TruncatedSVD(n_components=50)
17 |     y_train = embedder_mod2.fit_transform(input_train_mod2)
18 | 
19 |     logging.info('Running Linear regression...')
20 | 
21 |     reg = LinearRegression()
22 | 
23 |     # Train the model on the PCA reduced modality 1 and 2 data
24 |     reg.fit(X_train, y_train)
25 |     y_pred = reg.predict(X_test)
26 | 
27 |     # Project the predictions back to the modality 2 feature space
28 |     y_pred = y_pred @ embedder_mod2.components_
29 | 
30 |     return y_pred


--------------------------------------------------------------------------------
/src/predict_modality/methods/DANCE/run/config.vsh.yaml:
--------------------------------------------------------------------------------
  1 | functionality:
  2 |   name: dance
  3 |   namespace: predict_modality_methods
  4 |   
  5 |   # metadata for your method
  6 |   description: A description for your method.
  7 |   info:
  8 |     method_label: "DANCE"
  9 |     submission_id: "171129"
 10 |     team_name: DANCE
 11 |     # project_url: https://github.com/foo/bar
 12 |     # publication_doi: 10.1101/0123.45.67.890123
 13 |     # publication_url: https://arxiv.org/abs/1234.56789
 14 | 
 15 |   authors:
 16 |     - name: Hongzhi Wen
 17 |       email: wenhongz@msu.edu
 18 |       roles: [ author, maintainer ]
 19 |     - name: Jiayuan Ding
 20 |       email: dingjia5@msu.edu
 21 |       roles: [ author, maintainer ]
 22 |     - name: Wei Jin
 23 |       email: jinwei2@msu.edu
 24 |       roles: [ author ]
 25 |     - name: Xiaoyan Li
 26 |       email: lixiaoy5@msu.edu
 27 |       roles: [ author ]
 28 |     - name: Zhaoheng Li
 29 |       email: zli1@macalester.edu
 30 |       roles: [ author ]
 31 |     - name: Haoyu Han
 32 |       email: hanhaoy1@msu.edu
 33 |       roles: [ assistant ]
 34 |     - name: Yuying Xie
 35 |       email: xyy@msu.edu
 36 |       roles: [ advisor ]
 37 |     - name: Jiliang Tang
 38 |       email: tangjili@msu.edu
 39 |       roles: [ advisor ]
 40 |       
 41 |       
 42 |   # parameters
 43 |   arguments:
 44 |     # required inputs
 45 |     - name: "--input_train_mod1"
 46 |       type: "file"
 47 |       example: "dataset_mod1.h5ad"
 48 |       description: Censored dataset, training cells.
 49 |       required: true
 50 |     - name: "--input_test_mod1"
 51 |       type: "file"
 52 |       example: "dataset_mod1.h5ad"
 53 |       description: Censored dataset, test cells.
 54 |       required: true
 55 |     - name: "--input_train_mod2"
 56 |       type: "file"
 57 |       example: "dataset_mod2.h5ad"
 58 |       description: Censored dataset.
 59 |       required: true
 60 |     - name: "--input_pretrain"
 61 |       type: "file"
 62 |       example: "pretrain_model"
 63 |       description: Path to the directory containing a pretrained model.
 64 |       required: true
 65 |     # required outputs
 66 |     - name: "--output"
 67 |       type: "file"
 68 |       direction: "output"
 69 |       example: "output.h5ad"
 70 |       description: Dataset with predicted values for modality2.
 71 |       required: true
 72 |       
 73 |   # files your script needs
 74 |   resources:
 75 |     - type: python_script
 76 |       path: script.py
 77 |     - path: ../resources/baseline.py
 78 |     - path: ../resources/graph_util.py
 79 |       
 80 | # target platforms
 81 | platforms:
 82 |   - type: docker
 83 |     image: dataintuitive/randpy:py3.8
 84 |     setup:
 85 |       - type: docker
 86 |         run: [pip install scikit-learn==0.24.1]
 87 |         
 88 |       - type: python
 89 |         packages:
 90 |           #- scikit-learn
 91 |           - anndata
 92 |           - scanpy
 93 |           - numpy
 94 |           - torch
 95 |           - dgl
 96 |           - lightgbm
 97 |           - joblib
 98 | 
 99 |   - type: nextflow
100 |     labels: [ midmem, hightime, lowcpu ]


--------------------------------------------------------------------------------
/src/predict_modality/methods/DANCE/run/script.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import logging
  3 | import anndata as ad
  4 | import numpy as np
  5 | import json
  6 | import sys
  7 | import re
  8 | from scipy.sparse import csc_matrix
  9 | 
 10 | 
 11 | logging.basicConfig(level=logging.INFO)
 12 | 
 13 | ## VIASH START
 14 | dataset_path = "output/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_"
 15 | pretrain_path = "output/pretrain/predict_modality/dance/openproblems_bmmc_cite_phase2_rna.dance_train.output_pretrain/"
 16 | 
 17 | par = {
 18 |     'input_train_mod1': f'{dataset_path}train_mod1.h5ad',
 19 |     'input_train_mod2': f'{dataset_path}train_mod2.h5ad',
 20 |     'input_test_mod1': f'{dataset_path}test_mod1.h5ad',
 21 |     'input_pretrain': pretrain_path,
 22 |     'output': 'output.h5ad'
 23 | }
 24 | meta = {
 25 |     'resources_dir': 'src/predict_modality/methods/DANCE/resources',
 26 |     'functionality_name': '171129'
 27 | }
 28 | ## VIASH END
 29 | 
 30 | logging.info('Reading `h5ad` files...')
 31 | train_mod1 = ad.read_h5ad(par['input_train_mod1'])
 32 | mod1 = train_mod1.var['feature_types'][0]
 33 | dataset_id = train_mod1.uns['dataset_id']
 34 | input_train_mod1 = train_mod1.X
 35 | 
 36 | train_mod2 = ad.read_h5ad(par['input_train_mod2'])
 37 | var = train_mod2.var
 38 | mod2 = train_mod2.var['feature_types'][0]
 39 | input_train_mod2 = train_mod2.X
 40 | 
 41 | test_mod1 = ad.read_h5ad(par['input_test_mod1'])
 42 | obs = test_mod1.obs
 43 | input_test_mod1 = test_mod1.X
 44 | 
 45 | if mod1 == 'GEX':
 46 |     sys.path.append(meta['resources_dir'])
 47 |     from graph_util import graph_construction, WeightedGCN4
 48 |     
 49 |     import torch
 50 | 
 51 |     # # This will get passed to the method
 52 |     FEATURE_SIZE = train_mod1.shape[1]
 53 |     OUTPUT_SIZE = train_mod2.shape[1]
 54 |     TRAIN_SIZE = train_mod1.shape[0]
 55 |     TEST_SIZE = test_mod1.shape[0]
 56 |     
 57 |     g, bf = graph_construction(meta, train_mod1, train_mod2, test_mod1, pretrain_path=par['input_pretrain'])
 58 |     
 59 |     class Dict(dict):
 60 |         __setattr__ = dict.__setitem__
 61 |         __getattr__ = dict.__getitem__
 62 | 
 63 |     def dict2obj(dictObj):
 64 |         if not isinstance(dictObj, dict):
 65 |             return dictObj
 66 |         d = Dict()
 67 |         for k, v in dictObj.items():
 68 |             d[k] = dict2obj(v)
 69 |         return d
 70 |             
 71 |     def evaluate(mod, args):
 72 |         mod.eval()
 73 |         with torch.no_grad():
 74 |             logits = mod(g, bf, args)
 75 |             logits = logits[-TEST_SIZE:]
 76 |             return logits
 77 |     
 78 |     def build_args(LOG_FILE_PATH):
 79 |         string = open(LOG_FILE_PATH, 'r').readline()
 80 |         string = string.replace('Namespace', '').replace('=', ':').replace('(', '{ ').replace(')', '}').replace("'", '"').replace(',', ',\n').replace('True', 'true').replace('False','false')
 81 |         string = re.sub('[ ](.*?):',  r' "\1":', string)
 82 |         args = json.loads(string)
 83 |         return dict2obj(args)
 84 |     
 85 |     if mod2 == 'ADT':
 86 |         y_pred = []
 87 |         model_names = ['f_alpha_conv4_mean_fullbatch_12000_phase2_inductive_batch_speration.pkl', 'bf_alpha_conv4_mean_fullbatch_10000_phase2_inductive_gex2adt_2.pkl', 'bf_alpha_conv4_mean_fullbatch_12000_phase2_inductive_gex2adt_sep_2.pkl', 'bf_alpha_conv4_mean_fullbatch_15000_phase2_inductive.pkl']
 88 |         
 89 |         for model_name in model_names:
 90 |             args = build_args(os.path.join(par['input_pretrain'], model_name).replace('.pkl', '.log'))
 91 |             model = torch.load(os.path.join(par['input_pretrain'], model_name), map_location='cpu')
 92 |             y_pred.append(evaluate(model, args).numpy())
 93 |             del model, args
 94 | 
 95 |         y_pred = csc_matrix((y_pred[0]+y_pred[1]+y_pred[2]+y_pred[3])/4)
 96 |     
 97 |     elif mod2 == 'ATAC':
 98 |         y_pred = []
 99 |         model_names = ['bf_alpha_conv4_mean_fullbatch_8000_phase2_inductive_gex2atac_3.pkl', 'bf_alpha_conv4_mean_fullbatch_8000_phase2_inductive_gex2atac_2.pkl', 'bf_alpha_conv4_mean_fullbatch_8000_phase2_inductive_gex2atac.pkl', 'bf_alpha_conv4_mean_fullbatch_10000_phase2_inductive_gex2atac.pkl']
100 |         
101 |         for model_name in model_names:
102 |             args = build_args(os.path.join(par['input_pretrain'], model_name).replace('.pkl', '.log'))
103 |             model = torch.load(os.path.join(par['input_pretrain'], model_name), map_location='cpu')
104 |             y_pred.append(evaluate(model, args).numpy())
105 |             del model, args
106 |         
107 |         y_pred = csc_matrix((y_pred[0]+y_pred[1]+y_pred[2]+y_pred[3])/4)
108 | 
109 | elif mod1=='ATAC' and mod2=='GEX':
110 |     y_pred = csc_matrix(np.tile(np.mean(input_train_mod2.toarray(), 0), (input_test_mod1.shape[0], 1)))
111 |         
112 | else:
113 |     sys.path.append(meta['resources_dir'])
114 |     from baseline import baseline_linear
115 | 
116 |     input_train_mod1 = train_mod1[train_mod1.obs['batch']!='s3d1'].X
117 |     input_train_mod2 = train_mod2[train_mod2.obs['batch']!='s3d1'].X
118 |     y_pred = csc_matrix(baseline_linear(input_train_mod1, input_train_mod2, input_test_mod1))
119 | 
120 | adata = ad.AnnData(
121 |     X=y_pred,
122 |     obs=obs,
123 |     var=var,
124 |     uns={
125 |         'dataset_id': dataset_id,
126 |         'method_id': meta['functionality_name'],
127 |     },
128 | )
129 | 
130 | logging.info('Storing annotated data...')
131 | adata.write_h5ad(par['output'], compression = "gzip")
132 | 


--------------------------------------------------------------------------------
/src/predict_modality/methods/DANCE/test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash"
 4 | export NXF_VER=21.04.1
 5 | export PIPELINE_VERSION=1.4.0
 6 | method_id=dance
 7 | task_id=predict_modality
 8 | 
 9 | # GENERATE PRETRAIN
10 | pretrain_path=output/pretrain/$task_id/$method_id/pretrain.${method_id}_train.output_pretrain/
11 | 
12 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
13 |   --data_dir output/datasets/$task_id \
14 |   --output_pretrain ${pretrain_path}
15 | 
16 | # CITE GEX2ADT
17 | dataset_id=openproblems_bmmc_cite_phase2_rna
18 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
19 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
20 | 
21 | target/docker/${task_id}_methods/${method_id}/${method_id} \
22 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
23 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
24 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
25 |   --input_pretrain ${pretrain_path} \
26 |   --output ${pred_path}.${method_id}.output.h5ad
27 | 
28 | # CITE ADT2GEX
29 | dataset_id=openproblems_bmmc_cite_phase2_mod2
30 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
31 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
32 | 
33 | target/docker/${task_id}_methods/${method_id}/${method_id} \
34 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
35 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
36 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
37 |   --input_pretrain ${pretrain_path} \
38 |   --output ${pred_path}.${method_id}.output.h5ad
39 | 
40 | 
41 | # MULTIOME GEX2ATAC
42 | dataset_id=openproblems_bmmc_multiome_phase2_rna
43 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
44 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
45 | 
46 | target/docker/${task_id}_methods/${method_id}/${method_id} \
47 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
48 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
49 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
50 |   --input_pretrain ${pretrain_path} \
51 |   --output ${pred_path}.${method_id}.output.h5ad
52 | 
53 | # MULTIOME ATAC2GEX
54 | dataset_id=openproblems_bmmc_multiome_phase2_mod2
55 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
56 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
57 | 
58 | target/docker/${task_id}_methods/${method_id}/${method_id} \
59 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
60 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
61 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
62 |   --input_pretrain ${pretrain_path} \
63 |   --output ${pred_path}.${method_id}.output.h5ad
64 | 
65 | # RUN EVALUATION
66 | bin/nextflow run "$PIPELINE_REPO" \
67 |   -r "$PIPELINE_VERSION" \
68 |   -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \
69 |   --solutionDir "output/datasets/$task_id" \
70 |   --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \
71 |   --publishDir "output/evaluation/$task_id/$method_id/" \
72 |   -latest \
73 |   -resume \
74 |   -c "src/resources/nextflow_moremem.config"
75 | 
76 | cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json"


--------------------------------------------------------------------------------
/src/predict_modality/methods/DANCE/train/config.vsh.yaml:
--------------------------------------------------------------------------------
 1 | functionality:
 2 |   name: dance_train
 3 |   namespace: predict_modality_methods
 4 |   
 5 |   # metadata for your method
 6 |   
 7 |   description: A description for your method.
 8 |   authors:
 9 |     - name: Hongzhi Wen
10 |       email: wenhongz@msu.edu
11 |       roles: [ author, maintainer ]
12 |     - name: Jiayuan Ding
13 |       email: dingjia5@msu.edu
14 |       roles: [ author, maintainer ]
15 |     - name: Wei Jin
16 |       email: jinwei2@msu.edu
17 |       roles: [ author ]
18 |     - name: Xiaoyan Li
19 |       email: lixiaoy5@msu.edu
20 |       roles: [ author ]
21 |     - name: Zhaoheng Li
22 |       email: zli1@macalester.edu
23 |       roles: [ author ]
24 |     - name: Haoyu Han
25 |       email: hanhaoy1@msu.edu
26 |       roles: [ assistant ]
27 |     - name: Yuying Xie
28 |       email: xyy@msu.edu
29 |       roles: [ advisor ]
30 |     - name: Jiliang Tang
31 |       email: tangjili@msu.edu
32 |       roles: [ advisor ]
33 |       
34 |   # parameters
35 |   arguments:
36 |     # required inputs
37 |     - name: "--data_dir"
38 |       type: "file"
39 |       description: The path to the predict_modality datasets
40 |       required: true
41 | 
42 |     # required outputs
43 |     - name: "--output_pretrain"
44 |       type: "file"
45 |       direction: "output"
46 |       example: "pretrain_model"
47 |       description: Path to the directory containing the pretrained models.
48 |       required: true
49 |       
50 |   # files your script needs
51 |   resources:
52 |     - type: bash_script
53 |       path: script.sh
54 |     - path: hetero_arg_version_v5.py
55 |     - path: generate_extra_files.py
56 |     - path: h.all.v7.4.entrez.gmt
57 |     - path: h.all.v7.4.symbols.gmt
58 |     # suggestion: use same WeightedGCN4 as run component
59 |     # to use, uncomment the following line
60 |     # - path: ../resources/graph_util.py
61 |       
62 | # target platforms
63 | platforms:
64 |   - type: docker
65 |     image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
66 |     run_args: [ "--gpus all --shm-size=5G" ]
67 |     setup:
68 |       - type: docker
69 |         run: [pip install scikit-learn==0.24.1]
70 |         
71 |       - type: python
72 |         packages:
73 |           #- scikit-learn
74 |           - anndata
75 |           - scanpy
76 |           - numpy
77 |           - torch
78 |           - dgl-cu111
79 |           - lightgbm
80 |           - joblib
81 | 
82 |   - type: nextflow
83 |     labels: [ midmem, hightime, lowcpu, gpu ]
84 | 


--------------------------------------------------------------------------------
/src/predict_modality/methods/DANCE/train/generate_extra_files.py:
--------------------------------------------------------------------------------
  1 | import anndata as ad
  2 | import pickle
  3 | import numpy as np
  4 | from collections import defaultdict
  5 | import random
  6 | 
  7 | import argparse
  8 | 
  9 | 
 10 | parser = argparse.ArgumentParser()
 11 | parser.add_argument('-d', '--data_folder', default = './data/public/phase2-data/predict_modality/')
 12 | parser.add_argument('-ef', '--extra_files_folder', default = './')
 13 | 
 14 | args = parser.parse_args()
 15 | 
 16 | def load_pw():
 17 |     with open(args.extra_files_folder + '/h.all.v7.4.entrez.gmt') as gmt:
 18 |         gene_list = gmt.read().split()
 19 |     gene_sets_entrez = defaultdict(list)
 20 | 
 21 |     indicator = 0
 22 |     for ele in gene_list:
 23 |         if not ele.isnumeric() and indicator == 1:
 24 |             indicator = 0
 25 |             continue
 26 |         if not ele.isnumeric() and indicator == 0:
 27 |             indicator = 1
 28 |             gene_set_name = ele
 29 |         else:
 30 |             gene_sets_entrez[gene_set_name].append(ele)
 31 | 
 32 |     with open(args.extra_files_folder + '/h.all.v7.4.symbols.gmt') as gmt:
 33 |         gene_list = gmt.read().split()
 34 |     gene_sets_symbols = defaultdict(list)
 35 | 
 36 |     for ele in gene_list:
 37 |         if ele in gene_sets_entrez:
 38 |             gene_set_name = ele
 39 |         elif not ele.startswith( 'http://' ):
 40 |             gene_sets_symbols[gene_set_name].append(ele)
 41 | 
 42 |     return [i[1] for i in gene_sets_symbols.items()]
 43 | 
 44 | def graph_construct(train_mod1):
 45 |     counter = 0
 46 |     total = 0
 47 |     input_train_mod1 = train_mod1.X
 48 |     feature_index = train_mod1.var['feature_types'].index.tolist()
 49 |     new_pw = []
 50 |     for i in pw:
 51 |         new_pw.append([])
 52 |         for j in i:
 53 |             if j in feature_index:
 54 |                 new_pw[-1].append(feature_index.index(j))
 55 | 
 56 |     # cos similarity weight
 57 |     uu=[]
 58 |     vv=[]
 59 |     ee=[]
 60 |     for i in new_pw:
 61 |         for j in i:
 62 |             for k in i:
 63 |                 if j!=k:
 64 |                     uu.append(j)
 65 |                     vv.append(k)
 66 |                     sj = np.sqrt(np.dot(input_train_mod1[:,j].toarray().T, input_train_mod1[:,j].toarray()).item())
 67 |                     sk = np.sqrt(np.dot(input_train_mod1[:,k].toarray().T, input_train_mod1[:,k].toarray()).item())
 68 |                     jk = np.dot(input_train_mod1[:,j].toarray().T, input_train_mod1[:,k].toarray())
 69 |                     cossim = jk/sj/sk
 70 |                     ee.append(cossim)
 71 |                     
 72 |     return uu, vv, ee
 73 | 
 74 | print("Loading pw")
 75 | pw = load_pw()
 76 | 
 77 | print("Generating 'pw.pkl'")
 78 | # Generate pw.pkl
 79 | subtask = 'openproblems_bmmc_cite_phase2_rna'
 80 | subtask_folder = args.data_folder + '/' + subtask + '/'
 81 | subtask_filename = subtask_folder + subtask + '.censor_dataset.output_{}.h5ad'
 82 | uu, vv, ee = graph_construct(ad.read_h5ad(subtask_filename.format('train_mod1')))
 83 | pickle.dump([uu,vv,ee], open(args.extra_files_folder + '/pw.pkl', 'wb'))
 84 | 
 85 | print("Generating 'pw_multiome.pkl'")
 86 | subtask = 'openproblems_bmmc_multiome_phase2_rna'
 87 | subtask_folder = args.data_folder + '/' + subtask + '/'
 88 | subtask_filename = subtask_folder + subtask + '.censor_dataset.output_{}.h5ad'
 89 | uu, vv, ee = graph_construct(ad.read_h5ad(subtask_filename.format('train_mod1')))
 90 | pickle.dump([uu,vv,ee], open(args.extra_files_folder + '/pw_multiome.pkl', 'wb'))
 91 | 
 92 | print("Generating 'phase2_mask.pkl'")
 93 | subtasks = ['openproblems_bmmc_cite_phase2_rna', 'openproblems_bmmc_cite_phase2_mod2', 'openproblems_bmmc_multiome_phase2_rna', 'openproblems_bmmc_multiome_phase2_mod2']
 94 | task_names = ['gex2adt', 'adt2gex', 'gex2atac', 'atac2gex']
 95 | mask = {}
 96 | 
 97 | for ts in range(4):
 98 |     subtask = subtasks[ts]
 99 |     mask[subtask] = {}
100 |     subtask_folder = args.data_folder + '/' + subtask + '/'
101 |     subtask_filename = subtask_folder + subtask + '.censor_dataset.output_{}.h5ad'
102 |     train_mod1 = ad.read_h5ad(subtask_filename.format('train_mod1'))
103 |     l = list(range(train_mod1.X.shape[0]))
104 |     random.shuffle(l)
105 |     train_size = int(train_mod1.X.shape[0] * 0.85)
106 |     valid_size = train_mod1.X.shape[0] - train_size
107 |     mask[subtask]['train'] = l[:train_size]
108 |     mask[subtask]['test'] = l[-valid_size:]
109 | 
110 | import pickle
111 | pickle.dump(mask, open(args.extra_files_folder + '/phase2_mask.pkl','wb'))
112 | 
113 | print("Generating 'phase2_mask_sep.pkl'")
114 | subtask = 'openproblems_bmmc_cite_phase2_rna'
115 | subtask_folder = args.data_folder + '/' + subtask + '/'
116 | subtask_filename = subtask_folder + subtask + '.censor_dataset.output_{}.h5ad'
117 | 
118 | train_mod1 = ad.read_h5ad(subtask_filename.format('train_mod1'))
119 | 
120 | def get_index(batch):
121 |     index = []
122 |     for i in train_mod1[train_mod1.obs['batch']==batch].obs['batch'].index:
123 |         index.append(list(train_mod1.obs['batch'].index).index(i))
124 |     return index
125 | 
126 | s3d1 = get_index('s3d1')
127 | s3d7 = get_index('s3d7')
128 | s1d2 = get_index('s1d2')
129 | 
130 | test = s3d7+s1d2
131 | train = [i for i in range(train_mod1.X.shape[0]) if i not in (test + s3d1)]
132 | 
133 | gex2adt = {}
134 | gex2adt['test'] = test
135 | gex2adt['train'] = train
136 | 
137 | mask = {}
138 | mask['openproblems_bmmc_cite_phase2_rna'] = gex2adt
139 | pickle.dump(mask, open(args.extra_files_folder + '/phase2_mask_sep.pkl', 'wb'))


--------------------------------------------------------------------------------
/src/predict_modality/methods/Guanlab-dengkw/run/config.vsh.yaml:
--------------------------------------------------------------------------------
 1 | functionality:
 2 |   name: guanlab_dengkw_pm
 3 |   namespace: predict_modality_methods
 4 |   
 5 |   # metadata for your method
 6 |   description: A description for your method.
 7 |   info:
 8 |     method_label: "Guanlab-dengkw"
 9 |     submission_id: "170636"
10 |     team_name: Guanlab-dengkw
11 |     # project_url: https://github.com/foo/bar
12 |     # publication_doi: 10.1101/0123.45.67.890123
13 |     # publication_url: https://arxiv.org/abs/1234.56789
14 | 
15 |   authors:
16 |     - name: Kaiwen Deng
17 |       email: dengkw@umich.edu
18 |       roles: [ author, maintainer ]
19 |       props: { github: nonztalk }
20 |       
21 |   # parameters
22 |   arguments:
23 |     # required inputs
24 |     - name: "--input_train_mod1"
25 |       type: "file"
26 |       example: "dataset_mod1.h5ad"
27 |       description: Censored dataset, training cells.
28 |       required: true
29 |     - name: "--input_test_mod1"
30 |       type: "file"
31 |       example: "dataset_mod1.h5ad"
32 |       description: Censored dataset, test cells.
33 |       required: true
34 |     - name: "--input_train_mod2"
35 |       type: "file"
36 |       example: "dataset_mod2.h5ad"
37 |       description: Censored dataset.
38 |       required: true
39 |     # required outputs
40 |     - name: "--output"
41 |       type: "file"
42 |       direction: "output"
43 |       example: "output.h5ad"
44 |       description: Dataset with predicted values for modality2.
45 |       required: true
46 |     # additional parameters
47 |     - name: "--distance_method"
48 |       type: "string"
49 |       default: "minkowski"
50 |       description: The distance metric to use. Possible values include `euclidean` and `minkowski`.
51 |     - name: "--n_pcs"
52 |       type: "integer"
53 |       default: 50
54 |       description: Number of components to use for dimensionality reduction.
55 |       
56 |   # files your script needs
57 |   resources:
58 |     - type: python_script
59 |       path: script.py
60 |       
61 | # target platforms
62 | platforms:
63 |   - type: docker
64 |     image: dataintuitive/randpy:py3.8
65 |     setup:
66 | 
67 |       - type: python
68 |         packages:
69 |           - scikit-learn
70 |           - anndata
71 |           - pandas
72 |           - numpy
73 |           - scanpy
74 | 
75 |   - type: nextflow
76 |     labels: [ vhighmem, vvhightime, vhighcpu ]
77 | 


--------------------------------------------------------------------------------
/src/predict_modality/methods/Guanlab-dengkw/run/script.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import anndata as ad
  3 | import numpy as np
  4 | 
  5 | from scipy.sparse import csc_matrix
  6 | 
  7 | from sklearn.decomposition import TruncatedSVD
  8 | from sklearn.gaussian_process.kernels import RBF
  9 | from sklearn.kernel_ridge import KernelRidge
 10 | 
 11 | logging.basicConfig(level=logging.INFO)
 12 | 
 13 | ## VIASH START
 14 | par = {
 15 |     'input_train_mod1': 'sample_data/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod1.h5ad',
 16 |     'input_train_mod2': 'sample_data/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod2.h5ad',
 17 |     'input_test_mod1': 'sample_data/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod1.h5ad',
 18 |     'output': 'output.h5ad',
 19 | }
 20 | meta = { 'functionality_name': 'submission_170636' }
 21 | ## VIASH END
 22 | 
 23 | logging.info('Reading `h5ad` files...')
 24 | input_train_mod1 = ad.read_h5ad(par['input_train_mod1'])
 25 | input_train_mod2 = ad.read_h5ad(par['input_train_mod2'])
 26 | input_test_mod1 = ad.read_h5ad(par['input_test_mod1'])
 27 | 
 28 | pred_dimx = input_test_mod1.shape[0]
 29 | pred_dimy = input_train_mod2.shape[1]
 30 | 
 31 | feature_obs = input_train_mod1.obs
 32 | gs_obs = input_train_mod2.obs
 33 | 
 34 | batches = input_train_mod1.obs.batch.unique().tolist()
 35 | batch_len = len(batches)
 36 | 
 37 | obs = input_test_mod1.obs
 38 | var = input_train_mod2.var
 39 | dataset_id = input_train_mod1.uns['dataset_id']
 40 | 
 41 | input_train = ad.concat(
 42 |     {"train": input_train_mod1, "test": input_test_mod1},
 43 |     axis=0,
 44 |     join="outer",
 45 |     label="group",
 46 |     fill_value=0,
 47 |     index_unique="-"
 48 | )
 49 | 
 50 | logging.info('Determine parameters by the modalities')
 51 | mod1_type = input_train_mod1.var.feature_types[0]
 52 | mod1_type = mod1_type.upper()
 53 | mod2_type = input_train_mod2.var.feature_types[0]
 54 | mod2_type = mod2_type.upper()
 55 | n_comp_dict = {
 56 |         ("GEX", "ADT"): (300, 70, 10, 0.2),
 57 |         ("ADT", "GEX"): (None, 50, 10, 0.2),
 58 |         ("GEX", "ATAC"): (1000, 50, 10, 0.1),
 59 |         ("ATAC", "GEX"): (100, 70, 10, 0.1)
 60 |         }
 61 | logging.info(f"{mod1_type}, {mod2_type}")
 62 | n_mod1, n_mod2, scale, alpha = n_comp_dict[(mod1_type, mod2_type)]
 63 | logging.info(f"{n_mod1}, {n_mod2}, {scale}, {alpha}")
 64 | 
 65 | # Do PCA on the input data
 66 | logging.info('Models using the Truncated SVD to reduce the dimension')
 67 | 
 68 | if n_mod1 is not None and n_mod1 < input_train.shape[1]:
 69 |     embedder_mod1 = TruncatedSVD(n_components=n_mod1)
 70 |     mod1_pca = embedder_mod1.fit_transform(input_train.X).astype(np.float32)
 71 |     train_matrix = mod1_pca[input_train.obs['group'] == 'train']
 72 |     test_matrix = mod1_pca[input_train.obs['group'] == 'test']
 73 | else:
 74 |     train_matrix = input_train_mod1.to_df().values.astype(np.float32)
 75 |     test_matrix = input_test_mod1.to_df().values.astype(np.float32)
 76 | 
 77 | if n_mod2 is not None and n_mod2 < input_train_mod2.shape[1]:
 78 |     embedder_mod2 = TruncatedSVD(n_components=n_mod2)
 79 |     train_gs = embedder_mod2.fit_transform(input_train_mod2.X).astype(np.float32)
 80 | else:
 81 |     train_gs = input_train_mod2.to_df().values.astype(np.float32)
 82 | 
 83 | del input_train
 84 | del input_train_mod1
 85 | del input_train_mod2
 86 | del input_test_mod1
 87 | 
 88 | logging.info('Running normalization ...')
 89 | train_sd = np.std(train_matrix, axis=1).reshape(-1, 1)
 90 | train_sd[train_sd == 0] = 1
 91 | train_norm = (train_matrix - np.mean(train_matrix, axis=1).reshape(-1, 1)) / train_sd
 92 | train_norm = train_norm.astype(np.float32)
 93 | del train_matrix
 94 | 
 95 | test_sd = np.std(test_matrix, axis=1).reshape(-1, 1)
 96 | test_sd[test_sd == 0] = 1
 97 | test_norm = (test_matrix - np.mean(test_matrix, axis=1).reshape(-1, 1)) / test_sd
 98 | test_norm = test_norm.astype(np.float32)
 99 | del test_matrix
100 | 
101 | logging.info('Running KRR model ...')
102 | y_pred = np.zeros((pred_dimx, pred_dimy), dtype=np.float32)
103 | np.random.seed(1000)
104 | 
105 | for _ in range(5):
106 |     np.random.shuffle(batches)
107 |     for batch in [batches[:batch_len//2], batches[batch_len//2:]]:
108 |         # for passing the test
109 |         if not batch:
110 |             batch = [batches[0]]
111 | 
112 |         logging.info(batch)
113 |         kernel = RBF(length_scale = scale)
114 |         krr = KernelRidge(alpha=alpha, kernel=kernel)
115 |         logging.info('Fitting KRR ... ')
116 |         krr.fit(train_norm[feature_obs.batch.isin(batch)], 
117 |                 train_gs[gs_obs.batch.isin(batch)])
118 |         y_pred += (krr.predict(test_norm) @ embedder_mod2.components_)
119 | 
120 | np.clip(y_pred, a_min=0, a_max=None, out=y_pred)
121 | if mod2_type == "ATAC":
122 |     np.clip(y_pred, a_min=0, a_max=1, out=y_pred)
123 | 
124 | y_pred /= 10
125 | 
126 | # Store as sparse matrix to be efficient. Note that this might require
127 | # different classifiers/embedders before-hand. Not every class is able
128 | # to support such data structures.
129 | y_pred = csc_matrix(y_pred)
130 | 
131 | logging.info("Generate anndata object ...")
132 | adata = ad.AnnData(
133 |     X=y_pred,
134 |     obs=obs,
135 |     var=var,
136 |     uns={
137 |         'dataset_id': dataset_id,
138 |         'method_id': meta['functionality_name'],
139 |     },
140 | )
141 | 
142 | logging.info('Storing annotated data...')
143 | adata.write_h5ad(par['output'], compression = "gzip")
144 | 


--------------------------------------------------------------------------------
/src/predict_modality/methods/Guanlab-dengkw/test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Runs the built method on the four phase-2 datasets, then scores the predictions.
 3 | # The method's config declares no --input_pretrain argument, so none is passed.
 4 | 
 5 | export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash"
 6 | export NXF_VER=21.04.1
 7 | export PIPELINE_VERSION=1.4.0
 8 | method_id=guanlab_dengkw_pm
 9 | task_id=predict_modality
10 | 
11 | # CITE GEX2ADT
12 | dataset_id=openproblems_bmmc_cite_phase2_rna
13 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
14 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
15 | 
16 | target/docker/${task_id}_methods/${method_id}/${method_id} \
17 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
18 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
19 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
20 |   --output ${pred_path}.${method_id}.output.h5ad
21 | 
22 | # CITE ADT2GEX
23 | dataset_id=openproblems_bmmc_cite_phase2_mod2
24 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
25 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
26 | 
27 | target/docker/${task_id}_methods/${method_id}/${method_id} \
28 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
29 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
30 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
31 |   --output ${pred_path}.${method_id}.output.h5ad
32 | 
33 | # MULTIOME GEX2ATAC
34 | dataset_id=openproblems_bmmc_multiome_phase2_rna
35 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
36 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
37 | 
38 | target/docker/${task_id}_methods/${method_id}/${method_id} \
39 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
40 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
41 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
42 |   --output ${pred_path}.${method_id}.output.h5ad
43 | 
44 | # MULTIOME ATAC2GEX
45 | dataset_id=openproblems_bmmc_multiome_phase2_mod2
46 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
47 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
48 | 
49 | target/docker/${task_id}_methods/${method_id}/${method_id} \
50 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
51 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
52 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
53 |   --output ${pred_path}.${method_id}.output.h5ad
54 | 
55 | # RUN EVALUATION
56 | bin/nextflow run "$PIPELINE_REPO" \
57 |   -r "$PIPELINE_VERSION" \
58 |   -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \
59 |   --solutionDir "output/datasets/$task_id" \
60 |   --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \
61 |   --publishDir "output/evaluation/$task_id/$method_id/" \
62 |   -latest \
63 |   -resume \
64 |   -c "src/resources/nextflow_moremem.config"
65 | 
66 | cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json"
67 | 


--------------------------------------------------------------------------------
/src/predict_modality/methods/LS_lab/run/config.vsh.yaml:
--------------------------------------------------------------------------------
 1 | functionality:
 2 |   name: lslab
 3 |   namespace: predict_modality_methods
 4 |   
 5 |   # metadata for your method
 6 |   description: A description for your method.
 7 |   info:
 8 |     method_label: "LS_Lab"
 9 |     submission_id: "171123"
10 |     team_name: LS_lab
11 |     # project_url: https://github.com/foo/bar
12 |     # publication_doi: 10.1101/0123.45.67.890123
13 |     # publication_url: https://arxiv.org/abs/1234.56789
14 | 
15 |   authors:
16 |     - name: Aidyn Ubingazhibov
17 |       email: aidyn.ubingazhibov@nu.edu.kz
18 |       roles: [ author, maintainer ]
19 |       props: { github: aidynabirov }
20 |       
21 |   # parameters
22 |   arguments:
23 |     # required inputs
24 |     - name: "--input_train_mod1"
25 |       type: "file"
26 |       example: "dataset_mod1.h5ad"
27 |       description: Censored dataset, training cells.
28 |       required: true
29 |     - name: "--input_test_mod1"
30 |       type: "file"
31 |       example: "dataset_mod1.h5ad"
32 |       description: Censored dataset, test cells.
33 |       required: true
34 |     - name: "--input_train_mod2"
35 |       type: "file"
36 |       example: "dataset_mod2.h5ad"
37 |       description: Censored dataset.
38 |       required: true
39 |     # required outputs
40 |     - name: "--output"
41 |       type: "file"
42 |       direction: "output"
43 |       example: "output.h5ad"
44 |       description: Dataset with predicted values for modality2.
45 |       required: true
46 |       
47 |   # files your script needs
48 |   resources:
49 |     - type: python_script
50 |       path: script.py
51 |       
52 | # target platforms
53 | platforms:
54 |   - type: docker
55 |     image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
56 |     run_args: [ "--gpus all" ]
57 |     setup:
58 |       - type: python
59 |         packages:
60 |           - scikit-learn
61 |           - catboost
62 |           - anndata
63 |           - scanpy
64 |           - tqdm
65 |   - type: nextflow
66 |     labels: [ vhighmem, vvhightime, highcpu, gpu]
67 | 


--------------------------------------------------------------------------------
/src/predict_modality/methods/LS_lab/test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Runs the built method on the four phase-2 datasets, then scores the predictions.
 3 | # method_id must equal functionality.name in run/config.vsh.yaml (lslab); that config
 4 | # declares no --input_pretrain argument, so none is passed.
 5 | 
 6 | export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash"
 7 | export NXF_VER=21.04.1
 8 | export PIPELINE_VERSION=1.4.0
 9 | method_id=lslab
10 | task_id=predict_modality
11 | 
12 | # CITE GEX2ADT
13 | dataset_id=openproblems_bmmc_cite_phase2_rna
14 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
15 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
16 | 
17 | target/docker/${task_id}_methods/${method_id}/${method_id} \
18 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
19 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
20 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
21 |   --output ${pred_path}.${method_id}.output.h5ad
22 | 
23 | # CITE ADT2GEX
24 | dataset_id=openproblems_bmmc_cite_phase2_mod2
25 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
26 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
27 | 
28 | target/docker/${task_id}_methods/${method_id}/${method_id} \
29 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
30 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
31 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
32 |   --output ${pred_path}.${method_id}.output.h5ad
33 | 
34 | # MULTIOME GEX2ATAC
35 | dataset_id=openproblems_bmmc_multiome_phase2_rna
36 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
37 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
38 | 
39 | target/docker/${task_id}_methods/${method_id}/${method_id} \
40 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
41 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
42 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
43 |   --output ${pred_path}.${method_id}.output.h5ad
44 | 
45 | # MULTIOME ATAC2GEX
46 | dataset_id=openproblems_bmmc_multiome_phase2_mod2
47 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
48 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
49 | 
50 | target/docker/${task_id}_methods/${method_id}/${method_id} \
51 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
52 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
53 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
54 |   --output ${pred_path}.${method_id}.output.h5ad
55 | 
56 | # RUN EVALUATION
57 | bin/nextflow run "$PIPELINE_REPO" \
58 |   -r "$PIPELINE_VERSION" \
59 |   -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \
60 |   --solutionDir "output/datasets/$task_id" \
61 |   --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \
62 |   --publishDir "output/evaluation/$task_id/$method_id/" \
63 |   -latest \
64 |   -resume \
65 |   -c "src/resources/nextflow_moremem.config"
66 | 
67 | cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json"


--------------------------------------------------------------------------------
/src/predict_modality/methods/cajal/run/config.vsh.yaml:
--------------------------------------------------------------------------------
 1 | functionality:
 2 |   name: cajal
 3 |   namespace: predict_modality_methods
 4 |   
 5 |   # metadata for your method
 6 |   description: A description for your method.
 7 |   info:
 8 |     method_label: "Cajal"
 9 |     submission_id: "170613"
10 |     team_name: Cajal
11 |     # project_url: https://github.com/foo/bar
12 |     # publication_doi: 10.1101/0123.45.67.890123
13 |     # publication_url: https://arxiv.org/abs/1234.56789
14 | 
15 |   authors:
16 |     - name: Anna Laddach
17 |       email: anna.laddach@crick.ac.uk
18 |       roles: [ author, maintainer ]
19 |       props: { github: AnnaLaddach, orcid: "0000-0001-5552-6534" }
20 |     - name: Roman Laddach
21 |       email: roman.laddach@kcl.ac.uk
22 |       roles: [ author, maintainer ]
23 |       props: { github: rladdach, orcid: "0000-0002-0118-4548" }
24 |     - name: Michael Shapiro
25 |       email: michael.shapiro@crick.ac.uk
26 |       roles: [ author, maintainer ]
27 |       props: { github: michaeldshapiro, orcid: "0000-0002-2769-9320" }
28 |       
29 |   # parameters
30 |   arguments:
31 |     # required inputs
32 |     - name: "--input_train_mod1"
33 |       type: "file"
34 |       example: "dataset_mod1.h5ad"
35 |       description: Censored dataset, training cells.
36 |       required: true
37 |     - name: "--input_test_mod1"
38 |       type: "file"
39 |       example: "dataset_mod1.h5ad"
40 |       description: Censored dataset, test cells.
41 |       required: true
42 |     - name: "--input_train_mod2"
43 |       type: "file"
44 |       example: "dataset_mod2.h5ad"
45 |       description: Censored dataset.
46 |       required: true
47 |     - name: "--input_pretrain"
48 |       type: "file"
49 |       example: "pretrain_model"
50 |       description: Path to the directory containing a pretrained model.
51 |       required: true
52 | 
53 |     # required outputs
54 |     - name: "--output"
55 |       type: "file"
56 |       direction: "output"
57 |       example: "output.h5ad"
58 |       description: Dataset with predicted values for modality2.
59 |       required: true
60 |       
61 |   # files your script needs
62 |   resources:
63 |     - type: python_script
64 |       path: script.py
65 |       
66 | # target platforms
67 | platforms:
68 |   - type: docker
69 |     image: tensorflow/tensorflow:2.5.0-gpu
70 |     run_args: [ "--gpus all" ]
71 |     setup:
72 |       - type: python
73 |         packages:
74 |           - scikit-learn
75 |           - anndata
76 |           - scanpy
77 |           - tensorflow
78 |           - pandas
79 |   - type: nextflow
80 |     labels: [ vhighmem, vvhightime, highcpu, gpu]
81 | 


--------------------------------------------------------------------------------
/src/predict_modality/methods/cajal/run/script.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | import anndata as ad
  3 | import pickle
  4 | import numpy as np
  5 | 
  6 | from scipy.sparse import csc_matrix
  7 | 
  8 | import tensorflow as tf
  9 | import scanpy as sc
 10 | 
 11 | logging.basicConfig(level=logging.INFO)
 12 | 
 13 | ## VIASH START
 14 | par = {
 15 |     'input_train_mod1': 'sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod1.h5ad',
 16 |     'input_train_mod2': 'sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod2.h5ad',
 17 |     'input_test_mod1': 'sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod1.h5ad',
 18 |     'input_pretrain': 'path/to/model',
 19 |     'output': 'output.h5ad'
 20 | }
 21 | meta = { 'functionality_name': 'cajal' }
 22 | ## VIASH END
 23 | 
 24 | logging.info('Reading `h5ad` files...')
 25 | input_train_mod1 = ad.read_h5ad(par['input_train_mod1'])
 26 | input_train_mod2 = ad.read_h5ad(par['input_train_mod2'])
 27 | input_test_mod1 = ad.read_h5ad(par['input_test_mod1'])
 28 | 
 29 | #get modalities
 30 | mod_1 = input_train_mod1.var["feature_types"][0]
 31 | mod_2 = input_train_mod2.var["feature_types"][0]
 32 | 
 33 | 
 34 | test_total = np.sum(input_test_mod1.layers['counts'].toarray(), axis=1)
 35 | 
 36 | if mod_1 == "GEX":
 37 |     input_test_mod1.X = input_test_mod1.layers['counts']
 38 |     sc.pp.normalize_per_cell(input_test_mod1, counts_per_cell_after=1e6)
 39 |     sc.pp.log1p(input_test_mod1)
 40 | 
 41 | with open(par["input_pretrain"] + "/genes.pkl", "rb") as f:
 42 |         genes = pickle.load(f)
 43 |         input_test_mod1 = input_test_mod1[:,genes]
 44 | 
 45 | if mod_1 == "GEX":
 46 |     input_train_mod1.X = input_train_mod1.layers['counts']
 47 |     sc.pp.normalize_per_cell(input_train_mod1, counts_per_cell_after=1e6)
 48 |     sc.pp.log1p(input_train_mod1)
 49 | 
 50 | X_test = input_test_mod1.X.toarray()
 51 | 
 52 | test_batches = set(input_test_mod1.obs.batch)
 53 | 
 54 | input_test_mod1.obs["batch_median"] = 0
 55 | 
 56 | input_test_mod1.obs["batch_sd"] = 0
 57 | 
 58 | for batch in test_batches:
 59 |     input_test_mod1.obs["batch_median"][input_test_mod1.obs.batch == batch] = np.median(test_total[input_test_mod1.obs.batch == batch])
 60 |     input_test_mod1.obs["batch_sd"][input_test_mod1.obs.batch == batch] = np.std(test_total[input_test_mod1.obs.batch == batch])
 61 | 
 62 | 
 63 | for i in range(50):
 64 |     X_test = np.column_stack((X_test,test_total))
 65 | 
 66 | for i in range(50):
 67 |     X_test = np.column_stack((X_test,input_test_mod1.obs["batch_median"]))
 68 | 
 69 | for i in range(50):
 70 |     X_test = np.column_stack((X_test,input_test_mod1.obs["batch_sd"]))
 71 | 
 72 | with open(par["input_pretrain"] + "/transformation.pkl", "rb") as f:
 73 |         info = pickle.load(f)
 74 | 
 75 | X_test = X_test.T
 76 | X_test = (X_test - info["means"])/info["sds"]
 77 | X_test = X_test.T
 78 | 
 79 | 
 80 | #load pretrained model for correct modalities
 81 | model = tf.keras.models.load_model(par["input_pretrain"] + "/model.h5")
 82 | 
 83 | #make predictions for y
 84 | y_pred = model.predict(X_test)
 85 | 
 86 | #convert to sparse matrix
 87 | y_pred = csc_matrix(y_pred)
 88 | 
 89 | adata = ad.AnnData(
 90 |     X=y_pred,
 91 |     obs=input_test_mod1.obs,
 92 |     var=input_train_mod2.var,
 93 |     uns={
 94 |         'dataset_id': input_train_mod1.uns['dataset_id'],
 95 |         'method_id': "cajal",
 96 |     },
 97 | )
 98 | 
 99 | 
100 | logging.info('Storing annotated data...')
101 | adata.write_h5ad(par['output'], compression = "gzip")
102 | 


--------------------------------------------------------------------------------
/src/predict_modality/methods/cajal/test.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash"
  4 | export NXF_VER=21.04.1
  5 | export PIPELINE_VERSION=1.4.0
  6 | method_id=cajal
  7 | task_id=predict_modality
  8 | 
  9 | # CITE GEX2ADT
 10 | dataset_id=openproblems_bmmc_cite_phase2_rna
 11 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
 12 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
 13 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
 14 | 
 15 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
 16 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 17 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 18 |   --input_explore_mod1 output/datasets_explore/cite/cite_gex_processed_training.h5ad \
 19 |   --input_explore_mod2 output/datasets_explore/cite/cite_adt_processed_training.h5ad \
 20 |   --output_pretrain ${pretrain_path}
 21 | 
 22 | target/docker/${task_id}_methods/${method_id}/${method_id} \
 23 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 24 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 25 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
 26 |   --input_pretrain ${pretrain_path} \
 27 |   --output ${pred_path}.${method_id}.output.h5ad
 28 | 
 29 | # CITE ADT2GEX
 30 | dataset_id=openproblems_bmmc_cite_phase2_mod2
 31 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
 32 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
 33 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
 34 | 
 35 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
 36 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 37 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 38 |   --input_explore_mod1 output/datasets_explore/cite/cite_adt_processed_training.h5ad \
 39 |   --input_explore_mod2 output/datasets_explore/cite/cite_gex_processed_training.h5ad \
 40 |   --output_pretrain ${pretrain_path}
 41 | 
 42 | target/docker/${task_id}_methods/${method_id}/${method_id} \
 43 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 44 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 45 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
 46 |   --input_pretrain ${pretrain_path} \
 47 |   --output ${pred_path}.${method_id}.output.h5ad
 48 | 
 49 | 
 50 | # MULTIOME GEX2ATAC
 51 | dataset_id=openproblems_bmmc_multiome_phase2_rna
 52 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
 53 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
 54 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
 55 | 
 56 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
 57 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 58 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 59 |   --input_explore_mod1 output/datasets_explore/multiome/multiome_gex_processed_training.h5ad \
 60 |   --input_explore_mod2 output/datasets_explore/multiome/multiome_atac_processed_training.h5ad \
 61 |   --output_pretrain ${pretrain_path}
 62 | 
 63 | target/docker/${task_id}_methods/${method_id}/${method_id} \
 64 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 65 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 66 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
 67 |   --input_pretrain ${pretrain_path} \
 68 |   --output ${pred_path}.${method_id}.output.h5ad
 69 | 
 70 | # MULTIOME ATAC2GEX
 71 | dataset_id=openproblems_bmmc_multiome_phase2_mod2
 72 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
 73 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
 74 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
 75 | 
 76 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
 77 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 78 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 79 |   --input_explore_mod1 output/datasets_explore/multiome/multiome_atac_processed_training.h5ad \
 80 |   --input_explore_mod2 output/datasets_explore/multiome/multiome_gex_processed_training.h5ad \
 81 |   --output_pretrain ${pretrain_path}
 82 | 
 83 | target/docker/${task_id}_methods/${method_id}/${method_id} \
 84 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 85 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 86 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
 87 |   --input_pretrain ${pretrain_path} \
 88 |   --output ${pred_path}.${method_id}.output.h5ad
 89 | 
 90 | # RUN EVALUATION
 91 | bin/nextflow run "$PIPELINE_REPO" \
 92 |   -r "$PIPELINE_VERSION" \
 93 |   -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \
 94 |   --solutionDir "output/datasets/$task_id" \
 95 |   --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \
 96 |   --publishDir "output/evaluation/$task_id/$method_id/" \
 97 |   -latest \
 98 |   -resume \
 99 |   -c "src/resources/nextflow_moremem.config"
100 | 
101 | cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json"


--------------------------------------------------------------------------------
/src/predict_modality/methods/cajal/train/ADT_list_df_updated.csv:
--------------------------------------------------------------------------------
  1 | markers,gene_name,
  2 | CD86,CD86,
  3 | CD274,CD274,
  4 | CD270,TNFRSF14,
  5 | CD155,PVR,
  6 | CD112,NECTIN2,
  7 | CD47,CD47,
  8 | CD48,CD48,
  9 | CD40,CD40,
 10 | CD154,CD40LG,
 11 | CD52,CD52,
 12 | CD3,CD3E,ambiguity
 13 | CD8,CD8A,
 14 | CD56,NCAM1,
 15 | CD19,CD19,
 16 | CD33,CD33,
 17 | CD11c,ITGAX,
 18 | HLA-A-B-C,HLA-A,ambiguity
 19 | HLA-A-B-C,HLA-B,ambiguity
 20 | HLA-A-B-C,HLA-C,ambiguity
 21 | CD45RA,PTPRC,ambiguity
 22 | CD123,IL3RA,
 23 | CD7,CD7,
 24 | CD105,ENG,
 25 | CD49f,ITGA6,
 26 | CD194,CCR4,
 27 | CD4,CD4,
 28 | CD44,CD44,
 29 | CD14,CD14,
 30 | CD16,FCGR3A,
 31 | CD25,IL2RA,
 32 | CD45RO,PTPRC,ambiguity
 33 | CD279,PDCD1,
 34 | TIGIT,TIGIT,
 35 | CD20,MS4A1,
 36 | CD335,NCR1,
 37 | CD31,PECAM1,
 38 | Podoplanin,PDPN,
 39 | CD146,MCAM,
 40 | IgM,IGHM,
 41 | CD5,CD5,
 42 | CD195,CCR5,
 43 | CD32,FCGR2A,
 44 | CD196,CCR6,
 45 | CD185,CXCR5,
 46 | CD103,ITGAE,
 47 | CD69,CD69,
 48 | CD62L,SELL,
 49 | CD161,KLRB1,
 50 | CD152,CTLA4,
 51 | CD223,LAG3,
 52 | KLRG1,KLRG1,
 53 | CD27,CD27,
 54 | CD107a,LAMP1,
 55 | CD95,FAS,
 56 | CD134,TNFRSF4,
 57 | HLA-DR,HLA-DRB1,
 58 | CD1c,CD1C,
 59 | CD11b,ITGAM,
 60 | CD64,FCGR1A,
 61 | CD141,THBD,
 62 | CD1d,CD1D,
 63 | CD314,KLRK1,
 64 | CD35,CR1,
 65 | CD57,B3GAT1,
 66 | CD272,BTLA,
 67 | CD278,ICOS,
 68 | CD58,CD58,
 69 | CD39,ENTPD1,
 70 | CX3CR1,CX3CR1,
 71 | CD24,CD24,
 72 | CD21,CR2,
 73 | CD11a,ITGAL,
 74 | CD79b,CD79B,
 75 | CD244,CD244,
 76 | CD169,SIGLEC1,
 77 | integrinB7,ITGB7,
 78 | CD268,TNFRSF13C,
 79 | CD42b,GP1BA,
 80 | CD54,ICAM1,
 81 | CD62P,SELP,
 82 | CD119,IFNGR1,
 83 | TCR,TRA,
 84 | TCR,TRB,
 85 | TCR,TRG,
 86 | TCR,TRD,
 87 | CD192,CCR2,
 88 | CD122,IL2RB,
 89 | FceRIa,FCER1A,
 90 | CD41,ITGA2B,
 91 | CD137,TNFRSF9,
 92 | CD163,CD163,
 93 | CD83,CD83,
 94 | CD124,IL4R,
 95 | CD13,ANPEP,
 96 | CD2,CD2,
 97 | CD226,CD226,
 98 | CD29,ITGB1,
 99 | CD303,CLEC4C,
100 | CD49b,ITGA2,
101 | CD81,CD81,
102 | IgD,IGHD,
103 | CD18,ITGB2,
104 | CD28,CD28,
105 | CD38,CD38,
106 | CD127,IL7R,
107 | CD45,PTPRC,ambiguity
108 | CD22,CD22,
109 | CD71,TFRC,
110 | CD26,DPP4,
111 | CD115,CSF1R,
112 | CD63,CD63,
113 | CD304,NRP1,
114 | CD36,CD36,
115 | CD172a,SIRPA,
116 | CD72,CD72,
117 | CD158,KIR2DL3,
118 | CD93,CD93,
119 | CD49a,ITGA1,
120 | CD49d,ITGA4,
121 | CD73,NT5E,
122 | CD9,CD9,
123 | TCRVa7.2,?,ambiguity
124 | TCRVd2,?,ambiguity
125 | LOX-1,OLR1,
126 | CD158b,KIR2DL3,
127 | CD158e1,KIR3DL1,
128 | CD142,F3,
129 | CD319,SLAMF7,
130 | CD352,SLAMF6,
131 | CD94,KLRD1,
132 | CD162,SELPLG,
133 | CD85j,LILRB1,
134 | CD23,FCER2,
135 | CD328,SIGLEC7,
136 | HLA-E,HLA-E,
137 | CD82,CD82,
138 | CD101,CD101,
139 | CD88,C5AR1,
140 | CD224,GGT1,
141 | 


--------------------------------------------------------------------------------
/src/predict_modality/methods/cajal/train/config.vsh.yaml:
--------------------------------------------------------------------------------
 1 | functionality:
 2 |   name: cajal_train
 3 |   namespace: predict_modality_methods
 4 |   
 5 |   # metadata for your method
 6 |   description: A description for your method.
 7 |   info:
 8 |     submission_id: "170613"
 9 |     team_name: Cajal
10 |     # project_url: https://github.com/foo/bar
11 |     # publication_doi: 10.1101/0123.45.67.890123
12 |     # publication_url: https://arxiv.org/abs/1234.56789
13 | 
14 |   authors:
15 |     - name: Anna Laddach
16 |       email: anna.laddach@crick.ac.uk
17 |       roles: [ author, maintainer ]
18 |       props: { github: AnnaLaddach, orcid: "0000-0001-5552-6534" }
19 |     - name: Roman Laddach
20 |       email: roman.laddach@kcl.ac.uk
21 |       roles: [ author, maintainer ]
22 |       props: { github: rladdach, orcid: "0000-0002-0118-4548" }
23 |     - name: Michael Shapiro
24 |       email: michael.shapiro@crick.ac.uk
25 |       roles: [ author, maintainer ]
26 |       props: { github: michaeldshapiro, orcid: "0000-0002-2769-9320" }
27 |       
28 |   # parameters
29 |   arguments:
30 |     # required inputs
31 |     - name: "--input_train_mod1"
32 |       type: "file"
33 |       example: "dataset_mod1.h5ad"
34 |       description: Censored dataset, training cells.
35 |       required: true
36 |     - name: "--input_train_mod2"
37 |       type: "file"
38 |       example: "dataset_mod2.h5ad"
39 |       description: Censored dataset.
40 |       required: true
41 |     - name: "--input_explore_mod1"
42 |       type: "file"
43 |       example: "dataset_mod1.h5ad"
44 |       description: Explore version of the modality 1 dataset.
45 |       required: true
46 |     - name: "--input_explore_mod2"
47 |       type: "file"
48 |       example: "dataset_mod2.h5ad"
49 |       description: Explore version of the modality 2 dataset.
50 |       required: true
51 | 
52 |     # required outputs
53 |     - name: "--output_pretrain"
54 |       type: "file"
55 |       direction: "output"
56 |       example: "pretrain_model"
57 |       description: Path to the directory containing a pretrained model.
58 |       required: true
59 |       
60 |   # files your script needs
61 |   resources:
62 |     - type: python_script
63 |       path: script.py
64 |     - path: ADT_list_df_updated.csv
65 |       
66 | # target platforms
67 | platforms:
68 |   - type: docker
69 |     image: tensorflow/tensorflow:2.5.0-gpu
70 |     run_args: [ "--gpus all" ]
71 |     setup:
72 |       - type: python
73 |         packages:
74 |           - scikit-learn
75 |           - anndata
76 |           - scanpy
77 |           - tensorflow
78 |           - pandas
79 |   - type: nextflow
80 |     labels: [ vhighmem, vvhightime, highcpu, gpu]


--------------------------------------------------------------------------------
/src/predict_modality/methods/novel/README.md:
--------------------------------------------------------------------------------
1 | # NeurIPS-Single-Cell-MultiModality
2 | 
3 | Team Novel: Gleb Ryazantsev, Nikolay Russkikh, Igor I
4 | 
5 | The task is solved by training an encoder-decoder MLP model with one output neuron per component in the target. As input, the encoders use representations obtained from ATAC and GEX data via an LSI transform, and raw ADT data. The hyperparameters of the models were found via a broad hyperparameter search using the Optuna framework.
6 | 
7 | <img src="novel_architecture.jpg" width="100%">


--------------------------------------------------------------------------------
/src/predict_modality/methods/novel/novel_architecture.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/src/predict_modality/methods/novel/novel_architecture.jpg


--------------------------------------------------------------------------------
/src/predict_modality/methods/novel/run/config.vsh.yaml:
--------------------------------------------------------------------------------
 1 | functionality:
 2 |   name: novel
 3 |   namespace: predict_modality_methods
 4 |   
 5 |   # metadata for your method
 6 |   description: The task is solved via training encoder-decoder MLP model with one output neuron per component in the target. As an input, the encoders use representations obtained from ATAC and GEX data via LSI transform and raw ADT data. The hyperparameters of the models were found via broad hyperparameter search using the Optuna framework.
 7 |   info:
 8 |     method_label: "Novel"
 9 |     submission_id: "169769"
10 |     team_name: Novel
11 |     # project_url: https://github.com/foo/bar
12 |     # publication_doi: 10.1101/0123.45.67.890123
13 |     # publication_url: https://arxiv.org/abs/1234.56789
14 | 
15 |   authors:
16 |     - name: Gleb Ryazantsev
17 |       email: ryazantsev.gleb@gmail.com
18 |       roles: [ author, maintainer ]
19 |     - name: Nikolay Russkikh
20 |       email: russkikh.nikolay@gmail.com
21 |       roles: [ author, maintainer ]
22 |     - name: Igor I
23 |       email: herri.i.67@gmail.com
24 |       roles: [ author, maintainer ]
25 |       
26 |   # parameters
27 |   arguments:
28 |     # required inputs
29 |     - name: "--input_train_mod1"
30 |       type: "file"
31 |       example: "dataset_mod1.h5ad"
32 |       description: Censored dataset, training cells.
33 |       required: true
34 |     - name: "--input_test_mod1"
35 |       type: "file"
36 |       example: "dataset_mod1.h5ad"
37 |       description: Censored dataset, test cells.
38 |       required: true
39 |     - name: "--input_train_mod2"
40 |       type: "file"
41 |       example: "dataset_mod2.h5ad"
42 |       description: Censored dataset.
43 |       required: true
44 |     - name: "--input_pretrain"
45 |       type: "file"
46 |       example: "pretrain_model"
47 |       description: Path to the directory containing a pretrained model.
48 |       required: true
49 |     # required outputs
50 |     - name: "--output"
51 |       type: "file"
52 |       direction: "output"
53 |       example: "output.h5ad"
54 |       description: Dataset with predicted values for modality2.
55 |       required: true
56 |       
57 |   # files your script needs
58 |   resources:
59 |     - type: python_script
60 |       path: script.py
61 |     - path: ../resources/helper_functions.py
62 |       
63 | # target platforms
64 | platforms:
65 |   - type: docker
66 |     image: "pytorch/pytorch:1.9.0-cuda10.2-cudnn7-runtime"
67 |     setup:
68 |       - type: python
69 |         packages:
70 |           - anndata
71 |           - scikit-learn
72 |           - networkx
73 | 
74 |   - type: nextflow
75 |     labels: [ lowmem, lowtime, lowcpu ]
76 | 


--------------------------------------------------------------------------------
/src/predict_modality/methods/novel/run/script.py:
--------------------------------------------------------------------------------
  1 | import anndata as ad
  2 | import pickle
  3 | import torch
  4 | 
  5 | from torch.utils.data import DataLoader
  6 | 
  7 | import sys
  8 | 
  9 | import numpy as np
 10 | 
 11 | from scipy.sparse import csc_matrix
 12 | 
 13 | ## VIASH START
 14 | dataset_path = "output/datasets/match_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_"
 15 | pretrain_path = "output/pretrain/match_modality/clue/openproblems_bmmc_cite_phase2_rna.clue_train.output_pretrain/"
 16 | 
 17 | par = {
 18 |     'input_train_mod1': f'{dataset_path}train_mod1.h5ad',
 19 |     'input_train_mod2': f'{dataset_path}train_mod2.h5ad',
 20 |     'input_test_mod1': f'{dataset_path}test_mod1.h5ad',
 21 |     'input_pretrain': pretrain_path,
 22 |     'output': 'output.h5ad'
 23 | }
 24 | meta = {
 25 |     'resources_dir': '.',
 26 |     'functionality_name': '169769'
 27 | }
 28 | ## VIASH END
 29 | 
 30 | sys.path.append(meta['resources_dir'])
 31 | from helper_functions import ModelRegressionGex2Adt, ModelRegressionGex2Atac, ModelRegressionAtac2Gex, ModelRegressionAdt2Gex, ModalityMatchingDataset
 32 | 
 33 | 
 34 | input_test_mod1 = ad.read_h5ad(par['input_test_mod1'])
 35 | input_train_mod1 = ad.read_h5ad(par['input_train_mod1'])
 36 | input_train_mod2 = ad.read_h5ad(par['input_train_mod2'])
 37 | 
 38 | 
 39 | mod1 = input_train_mod1.var['feature_types'][0]
 40 | mod2 = input_train_mod2.var['feature_types'][0]
 41 | 
 42 | if mod1 == 'GEX' and mod2 == 'ADT':
 43 |     model = ModelRegressionGex2Adt(256,134)   
 44 |     weight = torch.load(par['input_pretrain'] + '/model.pt', map_location='cpu')
 45 |     with open(par['input_pretrain'] + '/lsi_transformer.pickle', 'rb') as f:
 46 |         lsi_transformer_gex = pickle.load(f)
 47 |     
 48 |         
 49 |     model.load_state_dict(weight)    
 50 |     input_test_mod1_ = lsi_transformer_gex.transform(input_test_mod1)
 51 | 
 52 | elif mod1 == 'GEX' and mod2 == 'ATAC':
 53 |     model = ModelRegressionGex2Atac(256,10000)   
 54 |     weight = torch.load(par['input_pretrain'] + '/model.pt', map_location='cpu')
 55 |     with open(par['input_pretrain'] + '/lsi_transformer.pickle', 'rb') as f:
 56 |         lsi_transformer_gex = pickle.load(f)
 57 |     
 58 |         
 59 |     model.load_state_dict(weight)    
 60 |     input_test_mod1_ = lsi_transformer_gex.transform(input_test_mod1)
 61 |     
 62 | elif mod1 == 'ATAC' and mod2 == 'GEX':
 63 |     model = ModelRegressionAtac2Gex(256,13431)   
 64 |     weight = torch.load(par['input_pretrain'] + '/model.pt', map_location='cpu')
 65 |     with open(par['input_pretrain'] + '/lsi_transformer.pickle', 'rb') as f:
 66 |         lsi_transformer_gex = pickle.load(f)
 67 |         
 68 |     model.load_state_dict(weight)    
 69 |     input_test_mod1_ = lsi_transformer_gex.transform(input_test_mod1)
 70 | 
 71 | elif mod1 == 'ADT' and mod2 == 'GEX':
 72 |     model = ModelRegressionAdt2Gex(134,13953)   
 73 |     weight = torch.load(par['input_pretrain'] + '/model.pt', map_location='cpu')
 74 |         
 75 |     model.load_state_dict(weight)    
 76 |     #input_test_mod1_ = lsi_transformer_gex.transform(input_test_mod1)
 77 |     input_test_mod1_ = input_test_mod1.to_df()
 78 |     
 79 | dataset_test = ModalityMatchingDataset(input_test_mod1_, None, is_train=False)
 80 | dataloader_test = DataLoader(dataset_test, 32, shuffle = False, num_workers = 4)
 81 | 
 82 | outputs = []
 83 | model.eval()
 84 | with torch.no_grad():
 85 |     for x in dataloader_test:
 86 |         output = model(x.float())
 87 |         outputs.append(output.detach().cpu().numpy())
 88 | 
 89 | outputs = np.concatenate(outputs)
 90 | outputs[outputs<0] = 0
 91 | outputs = csc_matrix(outputs)
 92 | 
 93 | adata = ad.AnnData(
 94 |     X=outputs,
 95 |     obs=input_test_mod1.obs,
 96 |     var=input_train_mod2.var,
 97 |     uns={
 98 |         'dataset_id': input_train_mod1.uns['dataset_id'],
 99 |         'method_id': meta['functionality_name'],
100 |     },
101 | )
102 | adata.write_h5ad(par['output'], compression = "gzip")


--------------------------------------------------------------------------------
/src/predict_modality/methods/novel/test.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash"
  4 | export NXF_VER=21.04.1
  5 | export PIPELINE_VERSION=1.4.0
  6 | method_id=novel
  7 | task_id=predict_modality
  8 | 
  9 | 
 10 | # CITE ADT2GEX
 11 | dataset_id=openproblems_bmmc_cite_phase1_mod2
 12 | dataset_id_val=openproblems_bmmc_cite_phase2_mod2
 13 | dataset_path=output/datasets_phase1/$task_id/$dataset_id/$dataset_id.censor_dataset
 14 | dataset_path_val=output/datasets/$task_id/$dataset_id_val/$dataset_id_val.censor_dataset
 15 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
 16 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
 17 | 
 18 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
 19 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 20 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 21 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
 22 |   --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \
 23 |   --output_pretrain ${pretrain_path}
 24 | 
 25 | target/docker/${task_id}_methods/${method_id}/${method_id} \
 26 |   --input_train_mod1 ${dataset_path_val}.output_train_mod1.h5ad \
 27 |   --input_train_mod2 ${dataset_path_val}.output_train_mod2.h5ad \
 28 |   --input_test_mod1 ${dataset_path_val}.output_test_mod1.h5ad \
 29 |   --input_pretrain ${pretrain_path} \
 30 |   --output ${pred_path}.${method_id}.output.h5ad
 31 | 
 32 | # CITE GEX2ADT
 33 | dataset_id=openproblems_bmmc_cite_phase2_rna
 34 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
 35 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
 36 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
 37 | 
 38 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
 39 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 40 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 41 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
 42 |   --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \
 43 |   --output_pretrain ${pretrain_path}
 44 | 
 45 | target/docker/${task_id}_methods/${method_id}/${method_id} \
 46 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 47 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 48 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
 49 |   --input_pretrain ${pretrain_path} \
 50 |   --output ${pred_path}.${method_id}.output.h5ad
 51 | 
 52 | # MULTIOME GEX2ATAC
 53 | dataset_id=openproblems_bmmc_multiome_phase2_rna
 54 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
 55 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
 56 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
 57 | 
 58 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
 59 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 60 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 61 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
 62 |   --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \
 63 |   --output_pretrain ${pretrain_path}
 64 | 
 65 | target/docker/${task_id}_methods/${method_id}/${method_id} \
 66 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 67 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 68 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
 69 |   --input_pretrain ${pretrain_path} \
 70 |   --output ${pred_path}.${method_id}.output.h5ad
 71 | 
 72 | # MULTIOME ATAC2GEX
 73 | dataset_id=openproblems_bmmc_multiome_phase2_mod2
 74 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
 75 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
 76 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
 77 | 
 78 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
 79 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 80 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 81 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
 82 |   --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \
 83 |   --output_pretrain ${pretrain_path}
 84 | 
 85 | target/docker/${task_id}_methods/${method_id}/${method_id} \
 86 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 87 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 88 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
 89 |   --input_pretrain ${pretrain_path} \
 90 |   --output ${pred_path}.${method_id}.output.h5ad
 91 | 
 92 | # RUN EVALUATION
 93 | bin/nextflow run "$PIPELINE_REPO" \
 94 |   -r "$PIPELINE_VERSION" \
 95 |   -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \
 96 |   --solutionDir "output/datasets/$task_id" \
 97 |   --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \
 98 |   --publishDir "output/evaluation/$task_id/$method_id/" \
 99 |   -latest \
100 |   -resume \
101 |   -c "src/resources/nextflow_moremem.config"
102 | 
103 | cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json"


--------------------------------------------------------------------------------
/src/predict_modality/methods/novel/train/config.vsh.yaml:
--------------------------------------------------------------------------------
 1 | functionality:
 2 |   name: novel_train
 3 |   namespace: predict_modality_methods
 4 |   
 5 |   # metadata for your method
 6 |   
 7 |   description: The task is solved via training encoder-decoder MLP model with one output neuron per component in the target. As an input, the encoders use representations obtained from ATAC and GEX data via LSI transform and raw ADT data. The hyperparameters of the models were found via broad hyperparameter search using the Optuna framework.
 8 |   
 9 |   authors:
10 |     - name: Gleb Ryazantsev
11 |       email: ryazantsev.gleb@gmail.com
12 |       roles: [ author, maintainer ]
13 |     - name: Nikolay Russkikh
14 |       email: russkikh.nikolay@gmail.com
15 |       roles: [ author, maintainer ]
16 |     - name: Igor I
17 |       email: herri.i.67@gmail.com
18 |       roles: [ author, maintainer ]
19 |       
20 |   # parameters
21 |   arguments:
22 |     # required inputs
23 |     - name: "--input_train_mod1"
24 |       type: "file"
25 |       example: "dataset_mod1.h5ad"
26 |       description: Censored dataset, training cells.
27 |       required: true
28 |     - name: "--input_train_mod2"
29 |       type: "file"
30 |       example: "dataset_mod2.h5ad"
31 |       description: Censored dataset.
32 |       required: true
33 |     - name: "--input_test_mod1"
34 |       type: "file"
35 |       example: "dataset_test_mod1.h5ad"
36 |       description: Censored dataset, test cells.
37 |       required: true
38 |     - name: "--input_test_mod2"
39 |       type: "file"
40 |       example: "dataset_test_mod2.h5ad"
41 |       description: Censored dataset.
42 |       required: true
43 | 
44 |     # required outputs
45 |     - name: "--output_pretrain"
46 |       type: "file"
47 |       direction: "output"
48 |       example: "pretrain_model"
49 |       description: Path to the directory containing a pretrained model.
50 |       required: true
51 |       
52 |   # files your script needs
53 |   resources:
54 |     - type: python_script
55 |       path: script.py
56 |     - path: ../resources/helper_functions.py
57 |       
58 | # target platforms
59 | platforms:
60 |   - type: docker
61 |     image: "pytorch/pytorch:1.9.0-cuda10.2-cudnn7-runtime"
62 |     run_args: [ "--gpus all --shm-size=5G" ]
63 |     setup:
64 |       - type: python
65 |         packages:
66 |           - anndata
67 |           - scikit-learn
68 |           - networkx
69 | 
70 |   - type: nextflow
71 |     labels: [ vhighmem, vvhightime, vhighcpu, gpu]
72 | 


--------------------------------------------------------------------------------
/src/predict_modality/methods/novel/train/script.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | 
  4 | import torch
  5 | from torch.utils.data import DataLoader
  6 | 
  7 | import anndata as ad
  8 | 
  9 | from sklearn.model_selection import train_test_split
 10 | 
 11 | import pickle
 12 | 
 13 | #check gpu available
 14 | if (torch.cuda.is_available()):
 15 |     device = 'cuda:0' #switch to current device
 16 |     print('current device: gpu')
 17 | else:
 18 |     device = 'cpu'
 19 |     print('current device: cpu')
 20 | 
 21 | 
 22 | ## VIASH START
 23 | dataset_path = "output/datasets/match_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_"
 24 | pretrain_path = "output/pretrain/match_modality/clue/openproblems_bmmc_cite_phase2_rna.clue_train.output_pretrain/"
 25 | 
 26 | par = {
 27 |     'input_train_mod1': f'{dataset_path}train_mod1.h5ad',
 28 |     'input_train_mod2': f'{dataset_path}train_mod2.h5ad',
 29 |     'input_test_mod1': f'{dataset_path}test_mod1.h5ad',
 30 |     'input_test_mod2': f'{dataset_path}test_mod2.h5ad',
 31 |     'output_pretrain': pretrain_path
 32 | }
 33 | meta = {
 34 |     'resources_dir': '.',
 35 |     'functionality_name': '171129'
 36 | }
 37 | ## VIASH END
 38 | 
 39 | sys.path.append(meta['resources_dir'])
 40 | from helper_functions import train_and_valid, lsiTransformer, ModalityMatchingDataset
 41 | from helper_functions import ModelRegressionAtac2Gex, ModelRegressionAdt2Gex, ModelRegressionGex2Adt, ModelRegressionGex2Atac
 42 | 
 43 | os.makedirs(par['output_pretrain'], exist_ok=True)
 44 | 
 45 | print("Start train")
 46 | 
 47 | input_train_mod1 = ad.read_h5ad(par['input_train_mod1'])
 48 | input_train_mod2 = ad.read_h5ad(par['input_train_mod2'])
 49 | 
 50 | mod1 = input_train_mod1.var['feature_types'][0]
 51 | mod2 = input_train_mod2.var['feature_types'][0]
 52 | if mod1 != "ADT":
 53 |     input_train_mod2_df = input_train_mod2.to_df()
 54 |     
 55 |     lsi_transformer_gex = lsiTransformer(n_components=256)
 56 |     gex_train = lsi_transformer_gex.fit_transform(input_train_mod1)
 57 |     
 58 |     train_mod1, test_mod1, train_mod2, test_mod2 = train_test_split(gex_train, input_train_mod2_df, test_size=0.25, random_state=666)
 59 |     input_train_mod2_df = input_train_mod2.to_df()
 60 | else:
 61 |     train_mod1 = input_train_mod1.to_df()
 62 |     train_mod2 = input_train_mod2.to_df()
 63 |     test_mod1 = ad.read_h5ad(par['input_test_mod1']).to_df()
 64 |     test_mod2 = ad.read_h5ad(par['input_test_mod2']).to_df()
 65 | 
 66 | 
 67 | if mod1 == 'ATAC' and mod2 == 'GEX':
 68 |     dataset_train = ModalityMatchingDataset(train_mod1, train_mod2)
 69 |     dataloader_train = DataLoader(dataset_train, 256, shuffle = True, num_workers = 8)
 70 | 
 71 |     dataset_test = ModalityMatchingDataset(test_mod1, test_mod2)
 72 |     dataloader_test = DataLoader(dataset_test, 64, shuffle = False, num_workers = 8)
 73 | 
 74 |     model = ModelRegressionAtac2Gex(256,13431).to(device)
 75 |     optimizer = torch.optim.AdamW(model.parameters(), lr=0.00008386597445284492,weight_decay=0.000684887347727808)
 76 |         
 77 | elif mod1 == 'ADT' and mod2 == 'GEX':
 78 |     dataset_train = ModalityMatchingDataset(train_mod1, train_mod2)
 79 |     dataloader_train = DataLoader(dataset_train, 64, shuffle = True, num_workers = 4)
 80 | 
 81 |     dataset_test = ModalityMatchingDataset(test_mod1, test_mod2)
 82 |     dataloader_test = DataLoader(dataset_test, 32, shuffle = False, num_workers = 4)
 83 | 
 84 |     model = ModelRegressionAdt2Gex(134,13953).to(device)
 85 |     optimizer = torch.optim.Adam(model.parameters(), lr=0.00041, weight_decay=0.0000139)
 86 | 
 87 | 
 88 | elif mod1 == 'GEX' and mod2 == 'ADT':
 89 |     dataset_train = ModalityMatchingDataset(train_mod1, train_mod2)
 90 |     dataloader_train = DataLoader(dataset_train, 32, shuffle = True, num_workers = 8)
 91 | 
 92 |     dataset_test = ModalityMatchingDataset(test_mod1, test_mod2)
 93 |     dataloader_test = DataLoader(dataset_test, 64, shuffle = False, num_workers = 8)
 94 | 
 95 |     model = ModelRegressionGex2Adt(256,134).to(device)
 96 |     optimizer = torch.optim.AdamW(model.parameters(), lr=0.000034609210829678734, weight_decay=0.0009965881574697426)
 97 | 
 98 | 
 99 | elif mod1 == 'GEX' and mod2 == 'ATAC':
100 |     dataset_train = ModalityMatchingDataset(train_mod1, train_mod2)
101 |     dataloader_train = DataLoader(dataset_train, 64, shuffle = True, num_workers = 8)
102 | 
103 |     dataset_test = ModalityMatchingDataset(test_mod1, test_mod2)
104 |     dataloader_test = DataLoader(dataset_test, 64, shuffle = False, num_workers = 8)
105 | 
106 |     model = ModelRegressionGex2Atac(256,10000).to(device)
107 |     optimizer = torch.optim.AdamW(model.parameters(), lr=0.00001806762345275399, weight_decay=0.0004084171379280058)
108 | 
109 | loss_fn = torch.nn.MSELoss()
110 | train_and_valid(model, optimizer, loss_fn, dataloader_train, dataloader_test, par['output_pretrain'] + '/model.pt', device)
111 | 
112 | if mod1 != "ADT":
113 |     with open(par['output_pretrain'] + '/lsi_transformer.pickle', 'wb') as f:
114 |         pickle.dump(lsi_transformer_gex, f)
115 | 
116 | print("End train")
117 | 


--------------------------------------------------------------------------------
/src/predict_modality/methods/scJoint/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints/
2 | pretrains_v10/
3 | run/results.py
4 | run/script_v10.5.py


--------------------------------------------------------------------------------
/src/predict_modality/methods/scJoint/README.md:
--------------------------------------------------------------------------------
 1 | [scJoint] Predict Modality
 2 | ===
 3 | Team scJoint: Yu-Hsiu Chen, Sheng Wan, Tung-Yu Wu
 4 | 
 5 | Project URL: https://github.com/itscassie/scJoint-neurips2021-modality-prediction
 6 | 
 7 | This folder contains our training pipeline and script used for the **NeurIPS 2021 Competition - Multimodal Single-Cell Data Integration**, the **Predict Modality** task. Our team **scJoint** took [3rd place in the modality prediction task](https://eval.ai/web/challenges/challenge-page/1111/leaderboard/2860) in terms of the overall ranking across 4 subtasks, namely `GEX to ADT`, `ADT to GEX`, `GEX to ATAC`, and `ATAC to GEX`. Specifically, our methods ranked 3rd in `GEX to ADT` and 4th in `ATAC to GEX`. More details about the training configurations can be found in our project ([link](https://github.com/itscassie/scJoint-neurips2021-modality-prediction)).
 8 | 
 9 | Full documentation for the competition, including the datasets, can be found at [openproblems.bio/neurips_docs/](https://openproblems.bio/neurips_docs/).
10 | 


--------------------------------------------------------------------------------
/src/predict_modality/methods/scJoint/resources/modules/model_ae.py:
--------------------------------------------------------------------------------
  1 | """ autoencoder based models """
  2 | import torch
  3 | import torch.nn as nn
  4 | 
  5 | 
  6 | class Encoder(nn.Module):
  7 |     """base encoder module"""
  8 | 
  9 |     def __init__(self, input_dim, out_dim, hidden_dim, dropout=0.2):
 10 |         super(Encoder, self).__init__()
 11 |         self.encoder = nn.Sequential(
 12 |             nn.Dropout(dropout),
 13 |             nn.Linear(input_dim, hidden_dim),
 14 |             nn.BatchNorm1d(hidden_dim),
 15 |             nn.LeakyReLU(0.2),
 16 |             nn.Linear(hidden_dim, hidden_dim),
 17 |             nn.BatchNorm1d(hidden_dim),
 18 |             nn.LeakyReLU(0.2),
 19 |             nn.Linear(hidden_dim, out_dim),
 20 |         )
 21 | 
 22 |     def forward(self, x_input):
 23 |         """forward propogation of the encoder arch"""
 24 |         x_emb = self.encoder(x_input)
 25 |         return x_emb
 26 | 
 27 | 
 28 | class Decoder(nn.Module):
 29 |     """base decoder module"""
 30 | 
 31 |     def __init__(self, input_dim, out_dim, hidden_dim):
 32 |         super(Decoder, self).__init__()
 33 |         self.decoder = nn.Sequential(
 34 |             nn.Linear(input_dim, hidden_dim // 2),
 35 |             nn.BatchNorm1d(hidden_dim // 2),
 36 |             nn.LeakyReLU(0.2),
 37 |             nn.Linear(hidden_dim // 2, hidden_dim),
 38 |             nn.BatchNorm1d(hidden_dim),
 39 |             nn.LeakyReLU(0.2),
 40 |             nn.Linear(hidden_dim, out_dim),
 41 |             nn.ReLU(),
 42 |         )
 43 | 
 44 |     def forward(self, x_emb):
 45 |         """forward propogation of the decoder arch"""
 46 |         x_rec = self.decoder(x_emb)
 47 |         return x_rec
 48 | 
 49 | 
 50 | class AutoEncoder(nn.Module):
 51 |     """autoencoder module"""
 52 | 
 53 |     def __init__(self, input_dim, out_dim, feat_dim, hidden_dim, dropout=0.2):
 54 |         super(AutoEncoder, self).__init__()
 55 |         self.encoder = Encoder(input_dim, feat_dim, hidden_dim, dropout)
 56 |         self.decoder = Decoder(feat_dim, out_dim, hidden_dim)
 57 | 
 58 |     def forward(self, x_input):
 59 |         """forward propogation of the autoencoder arch"""
 60 |         x_emb = self.encoder(x_input)
 61 |         x_rec = self.decoder(x_emb)
 62 |         return x_rec
 63 | 
 64 | 
 65 | class BatchClassifier(nn.Module):
 66 |     """base batch classifier class"""
 67 | 
 68 |     def __init__(self, input_dim, cls_num=6, hidden_dim=50):
 69 |         super(BatchClassifier, self).__init__()
 70 |         self.classifier = nn.Sequential(
 71 |             nn.Linear(input_dim, hidden_dim),
 72 |             nn.BatchNorm1d(hidden_dim),
 73 |             nn.LeakyReLU(0.2),
 74 |             nn.Linear(hidden_dim, cls_num),
 75 |             nn.LeakyReLU(0.2),
 76 |         )
 77 | 
 78 |     def forward(self, x_feat):
 79 |         """forward propogation of the batch classifier arch"""
 80 |         return self.classifier(x_feat)
 81 | 
 82 | 
 83 | class BatchRemovalGAN(nn.Module):
 84 |     """batch removal module"""
 85 | 
 86 |     def __init__(self, input_dim, out_dim, feat_dim, hidden_dim, cls_num=10, dropout=0.2):
 87 |         super(BatchRemovalGAN, self).__init__()
 88 |         self.encoder = Encoder(input_dim, feat_dim, hidden_dim, dropout)
 89 |         self.decoder = Decoder(feat_dim, out_dim, hidden_dim)
 90 |         self.classifier = BatchClassifier(feat_dim, cls_num=cls_num)
 91 | 
 92 |     def forward(self, x_input):
 93 |         """forward propogation of the batch removal gan arch"""
 94 |         x_feat = self.encoder(x_input)
 95 |         x_rec = self.decoder(x_feat)
 96 |         cls_prob = self.classifier(x_feat)
 97 | 
 98 |         return x_rec, cls_prob
 99 | 
100 | 
if __name__ == "__main__":
    # Smoke test: build a tiny AutoEncoder and push one random batch through it.
    # Use the GPU when one is available, otherwise fall back to the CPU so the
    # check also runs on machines without CUDA.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    bsz = 5
    in_d = 10
    out_d = 3
    feat_d = 2
    hid_d = 10

    x1 = torch.randn(bsz, in_d).to(device)

    model = AutoEncoder(in_d, out_d, feat_d, hid_d).to(device).float()
    print(model)
    output = model(x1)
    print(output.shape)
115 | 


--------------------------------------------------------------------------------
/src/predict_modality/methods/scJoint/resources/preprocess/save_highlyvar_genes.py:
--------------------------------------------------------------------------------
  1 | """ save highly variable using scanpy package """
  2 | import os
  3 | import argparse
  4 | import numpy as np
  5 | import anndata as ad
  6 | import scanpy as sc
  7 | import pandas as pd
  8 | 
  9 | parser = argparse.ArgumentParser()
 10 | parser.add_argument(
 11 |     "-d",
 12 |     "--data_dir",
 13 |     type=str,
 14 |     default="output/datasets/predict_modality",
 15 |     help="path to dataset directory",
 16 | )
 17 | parser.add_argument(
 18 |     "-o",
 19 |     "--output_dir",
 20 |     type=str,
 21 |     default="output/pretrain/predict_modality/scjoint",
 22 |     help="path to output directory",
 23 | )
 24 | parser.add_argument(
 25 |     "-p",
 26 |     "--phase",
 27 |     type=str,
 28 |     default="phase2",
 29 |     choices=["phase1", "phase1v2", "phase2"],
 30 |     help="dataset phase",
 31 | )
 32 | 
 33 | parser.add_argument(
 34 |     "-m",
 35 |     "--mode",
 36 |     nargs="*",
 37 |     type=str,
 38 |     default=["atac2gex"],
 39 |     help="modes for generating idf matrix",
 40 | )
 41 | 
 42 | parser.add_argument("-n", "--n_top", type=int, default=10000, help="returns n top highly variable genes")
 43 | args = parser.parse_args()
 44 | 
 45 | # datset path
 46 | ADT2GEX_ID = f"openproblems_bmmc_cite_{args.phase}_mod2"
 47 | GEX2ADT_ID = f"openproblems_bmmc_cite_{args.phase}_rna"
 48 | ATAC2GEX_ID = f"openproblems_bmmc_multiome_{args.phase}_mod2"
 49 | GEX2ATAC_ID = f"openproblems_bmmc_multiome_{args.phase}_rna"
 50 | 
 51 | # path to different modes
 52 | ADT2GEX_PTH = f"{args.data_dir}/{ADT2GEX_ID}/{ADT2GEX_ID}.censor_dataset"
 53 | GEX2ADT_PTH = f"{args.data_dir}/{GEX2ADT_ID}/{GEX2ADT_ID}.censor_dataset"
 54 | ATAC2GEX_PTH = f"{args.data_dir}/{ATAC2GEX_ID}/{ATAC2GEX_ID}.censor_dataset"
 55 | GEX2ATAC_PTH = f"{args.data_dir}/{GEX2ATAC_ID}/{GEX2ATAC_ID}.censor_dataset"
 56 | 
 57 | ADT2GEX = [
 58 |     f"{ADT2GEX_PTH}.output_train_mod1.h5ad",
 59 |     f"{ADT2GEX_PTH}.output_train_mod2.h5ad",
 60 |     f"{ADT2GEX_PTH}.output_test_mod1.h5ad",
 61 |     f"{ADT2GEX_PTH}.output_test_mod2.h5ad",
 62 |     f"{args.output_dir}/adt2gex_train.output_pretrain",
 63 | ]
 64 | 
 65 | GEX2ADT = [
 66 |     f"{GEX2ADT_PTH}.output_train_mod1.h5ad",
 67 |     f"{GEX2ADT_PTH}.output_train_mod2.h5ad",
 68 |     f"{GEX2ADT_PTH}.output_test_mod1.h5ad",
 69 |     f"{GEX2ADT_PTH}.output_test_mod2.h5ad",
 70 |     f"{args.output_dir}/gex2adt_train.output_pretrain",
 71 | ]
 72 | 
 73 | ATAC2GEX = [
 74 |     f"{ATAC2GEX_PTH}.output_train_mod1.h5ad",
 75 |     f"{ATAC2GEX_PTH}.output_train_mod2.h5ad",
 76 |     f"{ATAC2GEX_PTH}.output_test_mod1.h5ad",
 77 |     f"{ATAC2GEX_PTH}.output_test_mod2.h5ad",
 78 |     f"{args.output_dir}/atac2gex_train.output_pretrain",
 79 | ]
 80 | 
 81 | GEX2ATAC = [
 82 |     f"{GEX2ATAC_PTH}.output_train_mod1.h5ad",
 83 |     f"{GEX2ATAC_PTH}.output_train_mod2.h5ad",
 84 |     f"{GEX2ATAC_PTH}.output_test_mod1.h5ad",
 85 |     f"{GEX2ATAC_PTH}.output_test_mod2.h5ad",
 86 |     f"{args.output_dir}/gex2atac_train.output_pretrain",
 87 | ]
 88 | 
 89 | MODES = {"adt2gex": ADT2GEX, "gex2adt": GEX2ADT, "atac2gex": ATAC2GEX, "gex2atac": GEX2ATAC}
 90 | 
 91 | 
 92 | if __name__ == "__main__":
 93 |     # desired data path
 94 |     DATAPTH = [MODES[i] for i in args.mode]
 95 | 
 96 |     for (i, mode) in enumerate(DATAPTH):
 97 |         print(f"MODE [{i + 1} / {len(DATAPTH)}]: {args.mode[i]}")
 98 |         train_mod1_pth = mode[0]
 99 |         test_mod1_pth = mode[2]
100 |         train_mod1 = sc.read_h5ad(train_mod1_pth)
101 |         test_mod1 = sc.read_h5ad(test_mod1_pth)
102 | 
103 |         # concat train/test sets
104 |         X_raw = sc.concat(
105 |             {"train": train_mod1, "test": test_mod1},
106 |             axis=0,
107 |             join="outer",
108 |             label="group",
109 |             fill_value=0,
110 |             index_unique="-",
111 |         )
112 |         print(X_raw.shape)
113 | 
114 |         # collect highly variable genes
115 |         sc.pp.highly_variable_genes(X_raw, n_top_genes=args.n_top)
116 |         X_raw = X_raw[:, X_raw.var.highly_variable]
117 | 
118 |         train_highly = X_raw[: train_mod1.X.shape[0], :]
119 |         train_highly = ad.AnnData(
120 |             X=train_highly.X,
121 |             obs=train_highly.obs,
122 |             var=pd.DataFrame({"feature_types": train_mod1.var["feature_types"][X_raw.var_names]}),
123 |             uns=train_highly.uns,
124 |             layers=train_highly.layers,
125 |         )
126 | 
127 |         test_highly = X_raw[train_mod1.X.shape[0] :, :]
128 |         test_highly = ad.AnnData(
129 |             X=test_highly.X,
130 |             obs=test_highly.obs,
131 |             var=pd.DataFrame({"feature_types": test_mod1.var["feature_types"][X_raw.var_names]}),
132 |             uns=test_highly.uns,
133 |             layers=test_highly.layers,
134 |         )
135 |         print(train_highly)
136 |         print(test_highly)
137 | 
138 |         # save highly variable indexs
139 |         mod1_vars = np.array(train_mod1.var_names)
140 |         mod1_highly_idx = [
141 |             int(np.where(mod1_vars == np.array(X_raw.var_names[i]))[0])
142 |             for i in range(np.array(X_raw.var_names).shape[0])
143 |         ]
144 | 
145 |         file_path = f"{mode[4]}"
146 |         os.makedirs(file_path, exist_ok=True)
147 | 
148 |         with open(f"{file_path}/index_highly{args.n_top}.txt", "w", encoding="utf8") as index_file:
149 |             index_file.write(f"index num: {len(mod1_highly_idx)}\n")
150 |             for ind in mod1_highly_idx:
151 |                 index_file.write(str(ind) + "\n")
152 | 
153 |         print(f"finish saving {file_path}/index_highly{args.n_top}.txt")
154 | 


--------------------------------------------------------------------------------
/src/predict_modality/methods/scJoint/resources/preprocess/save_idf_matrix.py:
--------------------------------------------------------------------------------
"""Compute and save the IDF vector of the training mod1 counts for each mode."""
  2 | import os
  3 | import argparse
  4 | import numpy as np
  5 | import anndata as ad
  6 | 
  7 | parser = argparse.ArgumentParser()
  8 | parser.add_argument(
  9 |     "-d",
 10 |     "--data_dir",
 11 |     type=str,
 12 |     default="output/datasets/predict_modality",
 13 |     help="path to dataset directory",
 14 | )
 15 | parser.add_argument(
 16 |     "-o",
 17 |     "--output_dir",
 18 |     type=str,
 19 |     default="output/pretrain/predict_modality/scjoint",
 20 |     help="path to output directory",
 21 | )
 22 | parser.add_argument(
 23 |     "-p",
 24 |     "--phase",
 25 |     default="phase2",
 26 |     type=str,
 27 |     choices=["phase1", "phase1v2", "phase2"],
 28 |     help="dataset phase",
 29 | )
 30 | 
 31 | parser.add_argument(
 32 |     "-m",
 33 |     "--mode",
 34 |     type=str,
 35 |     nargs="*",
 36 |     default=["adt2gex", "gex2adt", "atac2gex", "gex2atac"],
 37 |     help="modes for generating idf matrix",
 38 | )
 39 | args = parser.parse_args()
 40 | 
 41 | # datset path
 42 | ADT2GEX_ID = f"openproblems_bmmc_cite_{args.phase}_mod2"
 43 | GEX2ADT_ID = f"openproblems_bmmc_cite_{args.phase}_rna"
 44 | ATAC2GEX_ID = f"openproblems_bmmc_multiome_{args.phase}_mod2"
 45 | GEX2ATAC_ID = f"openproblems_bmmc_multiome_{args.phase}_rna"
 46 | 
 47 | # path to different modes
 48 | ADT2GEX_PTH = f"{args.data_dir}/{ADT2GEX_ID}/{ADT2GEX_ID}.censor_dataset"
 49 | GEX2ADT_PTH = f"{args.data_dir}/{GEX2ADT_ID}/{GEX2ADT_ID}.censor_dataset"
 50 | ATAC2GEX_PTH = f"{args.data_dir}/{ATAC2GEX_ID}/{ATAC2GEX_ID}.censor_dataset"
 51 | GEX2ATAC_PTH = f"{args.data_dir}/{GEX2ATAC_ID}/{GEX2ATAC_ID}.censor_dataset"
 52 | 
 53 | ADT2GEX = [
 54 |     f"{ADT2GEX_PTH}.output_train_mod1.h5ad",
 55 |     f"{ADT2GEX_PTH}.output_train_mod2.h5ad",
 56 |     f"{ADT2GEX_PTH}.output_test_mod1.h5ad",
 57 |     f"{ADT2GEX_PTH}.output_test_mod2.h5ad",
 58 |     f"{args.output_dir}/adt2gex_train.output_pretrain",
 59 | ]
 60 | 
 61 | GEX2ADT = [
 62 |     f"{GEX2ADT_PTH}.output_train_mod1.h5ad",
 63 |     f"{GEX2ADT_PTH}.output_train_mod2.h5ad",
 64 |     f"{GEX2ADT_PTH}.output_test_mod1.h5ad",
 65 |     f"{GEX2ADT_PTH}.output_test_mod2.h5ad",
 66 |     f"{args.output_dir}/gex2adt_train.output_pretrain",
 67 | ]
 68 | 
 69 | ATAC2GEX = [
 70 |     f"{ATAC2GEX_PTH}.output_train_mod1.h5ad",
 71 |     f"{ATAC2GEX_PTH}.output_train_mod2.h5ad",
 72 |     f"{ATAC2GEX_PTH}.output_test_mod1.h5ad",
 73 |     f"{ATAC2GEX_PTH}.output_test_mod2.h5ad",
 74 |     f"{args.output_dir}/atac2gex_train.output_pretrain",
 75 | ]
 76 | 
 77 | GEX2ATAC = [
 78 |     f"{GEX2ATAC_PTH}.output_train_mod1.h5ad",
 79 |     f"{GEX2ATAC_PTH}.output_train_mod2.h5ad",
 80 |     f"{GEX2ATAC_PTH}.output_test_mod1.h5ad",
 81 |     f"{GEX2ATAC_PTH}.output_test_mod2.h5ad",
 82 |     f"{args.output_dir}/gex2atac_train.output_pretrain",
 83 | ]
 84 | 
 85 | MODES = {"adt2gex": ADT2GEX, "gex2adt": GEX2ADT, "atac2gex": ATAC2GEX, "gex2atac": GEX2ATAC}
 86 | 
 87 | 
 88 | def idf_matrix(x_raw):
 89 |     """returns idf matrix"""
 90 |     x_idf = np.zeros_like(x_raw).astype(np.single)
 91 |     x_idf[x_raw > 0] = 1
 92 |     idf = np.log(x_raw.shape[0] / (np.sum(x_idf, axis=0, keepdims=True) + 1))
 93 |     return idf
 94 | 
 95 | 
 96 | if __name__ == "__main__":
 97 |     # desired data path
 98 |     DATAPTH = [MODES[i] for i in args.mode]
 99 |     for (i, mode) in enumerate(DATAPTH):
100 |         print(f"MODE [{i + 1} / {len(DATAPTH)}]: {args.mode[i]}")
101 | 
102 |         train_mod1_pth = mode[0]
103 |         train_mod1 = ad.read_h5ad(train_mod1_pth)
104 | 
105 |         x_raw_matrix = train_mod1.layers["counts"].toarray().astype(np.float16)
106 |         print(f"train data shape: {x_raw_matrix.shape}")
107 | 
108 |         x_idf_matrix = idf_matrix(x_raw_matrix)
109 |         print(f"idf matrix shape: {x_idf_matrix.shape}")
110 | 
111 |         file_path = f"{mode[4]}"
112 |         print(f"output dir: {file_path}")
113 |         os.makedirs(file_path, exist_ok=True)
114 | 
115 |         np.save(f"{file_path}/mod1_idf.npy", x_idf_matrix)
116 |         print(f"finish saving {file_path}/mod1_idf.npy")
117 | 


--------------------------------------------------------------------------------
/src/predict_modality/methods/scJoint/resources/train.py:
--------------------------------------------------------------------------------
  1 | """ main training process """
  2 | import os
  3 | import logging
  4 | import argparse
  5 | from datetime import datetime
  6 | 
  7 | from trainer.trainer_nn import TrainProcess as TrainProcess_NN
  8 | from trainer.trainer_cycle import TrainProcess as TrainProcess_Cycle
  9 | from trainer.trainer_batchgan import TrainProcess as TrainProcess_BATCHGAN
 10 | 
 11 | from opts import DATASET, model_opts
 12 | from utils.dataloader import get_data_dim
 13 | 
 14 | if __name__ == "__main__":
 15 |     # config parser
 16 |     parser = argparse.ArgumentParser(add_help=False)
 17 |     model_opts(parser)
 18 |     args = parser.parse_known_args()[0]
 19 | 
 20 |     # exp name for train log, weights, model
 21 |     if args.train == "train":
 22 |         TIME_NOW = datetime.now().strftime("%b%d-%H-%M")
 23 |         exp_name = f"{args.arch}_{args.mode}"
 24 |         if args.selection:
 25 |             assert args.mod1_idx_path is not None, "need to specified --mod1_idx_path"
 26 |             SELECT_NUM = args.mod1_idx_path.split("/")[-1].replace(".txt", "").replace("index_", "")
 27 |             exp_name += f"_select{SELECT_NUM}"
 28 | 
 29 |         if args.tfidf != 0:
 30 |             assert args.idf_path is not None, "need to specified --idf_path"
 31 |             assert not args.gene_activity, "support either ga or tfidf != 0"
 32 |             if args.tfidf == 1:
 33 |                 exp_name += f"_tfidf"
 34 |             elif args.tfidf == 2:
 35 |                 exp_name += f"_tfidfconcat"
 36 |             elif args.tfidf == 3:
 37 |                 exp_name += f"_tfidfconcatga"
 38 |                 assert args.mode == "atac2gex" and args.phase in [
 39 |                     "phase1v2",
 40 |                     "phase2",
 41 |                 ], "gene activity mode support only atac2gex mode (p1v2 or p2)"
 42 |         elif args.gene_activity:
 43 |             exp_name += f"_ga"
 44 |             assert args.mode == "atac2gex" and args.phase in [
 45 |                 "phase1v2",
 46 |                 "phase2",
 47 |             ], "gene activity mode support only atac2gex mode (p1v2 or p2)"
 48 |         if args.norm:
 49 |             exp_name += f"_norm"
 50 |         if args.dropout != 0.2:
 51 |             exp_name += f"_dropout{args.dropout}"
 52 |         if args.name != "":
 53 |             exp_name += f"_{args.name}"
 54 |         else:
 55 |             exp_name += f"_{TIME_NOW}"
 56 | 
 57 |     # exp name for eval log file
 58 |     elif args.train == "eval":
 59 |         assert args.checkpoint is not None, "need to specified --checkpoint"
 60 |         exp_name = args.checkpoint.split("/")[-1].replace(".pt", "")
 61 |         exp_name += f"_{args.phase}"
 62 | 
 63 |     # loggings and logs
 64 |     if args.dryrun:
 65 |         handlers = [logging.StreamHandler()]
 66 |     else:
 67 |         os.makedirs(f"{args.output_dir}/logs/", exist_ok=True)
 68 |         os.makedirs(f"{DATASET[args.mode]['weight_dir']}", exist_ok=True)
 69 |         handlers = [
 70 |             logging.FileHandler(f"{args.output_dir}/logs/{args.train}_{exp_name}.log", mode="w"),
 71 |             logging.StreamHandler(),
 72 |         ]
 73 | 
 74 |     logging.basicConfig(level=logging.DEBUG, format="%(message)s", handlers=handlers)
 75 | 
 76 |     # load data
 77 |     MOD1_DIM, MOD2_DIM = get_data_dim(DATASET[args.mode], args)
 78 | 
 79 |     parser.add_argument("--mod1_dim", default=MOD1_DIM)
 80 |     parser.add_argument("--mod2_dim", default=MOD2_DIM)
 81 |     parser.add_argument("--exp_name", default=exp_name)
 82 |     args = parser.parse_args()
 83 | 
 84 |     logging.info("\nArgument:")
 85 |     for arg, value in vars(args).items():
 86 |         logging.info(f"{arg:20s}: {value}")
 87 |     logging.info("\n")
 88 | 
 89 |     # trainer
 90 |     if args.arch == "nn":
 91 |         trainer = TrainProcess_NN(args)
 92 |     elif args.arch == "cycle":
 93 |         trainer = TrainProcess_Cycle(args)
 94 |     elif args.arch == "batchgan":
 95 |         trainer = TrainProcess_BATCHGAN(args)
 96 | 
 97 |     if args.train == "train":
 98 |         trainer.run()
 99 |         trainer.eval()
100 | 
101 |     elif args.train == "eval":
102 |         trainer.load_checkpoint()
103 |         trainer.eval()
104 | 


--------------------------------------------------------------------------------
/src/predict_modality/methods/scJoint/resources/trainer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/src/predict_modality/methods/scJoint/resources/trainer/__init__.py


--------------------------------------------------------------------------------
/src/predict_modality/methods/scJoint/resources/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/src/predict_modality/methods/scJoint/resources/utils/__init__.py


--------------------------------------------------------------------------------
/src/predict_modality/methods/scJoint/resources/utils/loss.py:
--------------------------------------------------------------------------------
"""Custom loss functions."""
 2 | import torch
 3 | import torch.nn as nn
 4 | 
 5 | 
 6 | def cosine_sim(arr_1, arr_2):
 7 |     """ return consine similarity of 2 arrays """
 8 |     arr_1 = arr_1 / torch.norm(arr_1, dim=1, keepdim=True)
 9 |     arr_2 = arr_2 / torch.norm(arr_2, dim=1, keepdim=True)
10 |     sim = torch.matmul(arr_1, torch.transpose(arr_2, 0, 1))
11 | 
12 |     return sim
13 | 
14 | 
class CosineLoss(nn.Module):
    """Loss built from cosine similarities between embeddings and their residuals."""

    def __init__(self):
        super().__init__()

    def forward(self, emb1, emb2, emb1_resid, emb2_resid):
        """Return the mean absolute value of the two summed similarity matrices.

        ``emb1`` and ``emb2`` are cast to float; the residual tensors are used
        as given. Each similarity term is ``cosine_sim(embedding, residual)``.
        """
        emb1 = emb1.float()
        emb2 = emb2.float()
        sim_1 = cosine_sim(emb1, emb1_resid)
        sim_2 = cosine_sim(emb2, emb2_resid)
        return (sim_1 + sim_2).abs().mean()
27 | 
28 | 
class L1regularization(nn.Module):
    """Weight penalty based on the absolute values of a model's parameters."""

    def __init__(self, weight_decay=0.1):
        super().__init__()
        # scale applied to every per-parameter term
        self.weight_decay = weight_decay

    def forward(self, model):
        """Return the sum over ``model.parameters()`` of ``mean(|param|) * weight_decay``.

        Each parameter tensor contributes its *mean* absolute value, not the
        sum. A model without parameters yields the plain float ``0.0``.
        """
        penalties = (
            param.abs().mean() * self.weight_decay for param in model.parameters()
        )
        # explicit 0.0 start so the empty case returns a float
        return sum(penalties, 0.0)
42 | 


--------------------------------------------------------------------------------
/src/predict_modality/methods/scJoint/resources/utils/metric.py:
--------------------------------------------------------------------------------
 1 | """ calculate metrics """
 2 | 
 3 | import numpy as np
 4 | 
 5 | def rmse(mod2_sol, mod2_pred):
 6 |     """
 7 |     input: prediction / ans
 8 |     output: rmse
 9 |     """
10 |     tmp = mod2_sol - mod2_pred
11 |     rmse_out = np.sqrt(tmp.power(2).mean())
12 |     return rmse_out
13 | 


--------------------------------------------------------------------------------
/src/predict_modality/methods/scJoint/run/config.vsh.yaml:
--------------------------------------------------------------------------------
# viash component definition for the scJoint prediction step.
# test.sh invokes the built component as
# target/docker/<namespace>/<name>/<name>.
functionality:
  name: scjoint
  namespace: predict_modality_methods
  
  description: An ensemble method including pca, nn, feature extraction.

  # submission metadata
  info:
    method_label: "scJoint"
    submission_id: "171135"
    team_name: scJoint

  authors:
    - name: Yu-Hsiu Chen
      email: yhchen.cm06g@nctu.edu.tw
      roles: [ author, maintainer ]
      props: { github: itscassie }
    - name: Sheng Wan
      email: a5736735a.eecs99@g2.nctu.edu.tw
    - name: Tung-Yu Wu
      email: wtywty@gmail.com
      
  # parameters
  arguments:
    # required inputs
    - name: "--input_train_mod1"
      type: "file"
      example: "dataset_mod1.h5ad"
      description: Censored dataset, training cells.
      required: true
    - name: "--input_test_mod1"
      type: "file"
      example: "dataset_mod1.h5ad"
      description: Censored dataset, test cells.
      required: true
    - name: "--input_train_mod2"
      type: "file"
      example: "dataset_mod2.h5ad"
      description: Censored dataset.
      required: true
    # test.sh passes the per-mode "<mode>_train.output_pretrain/" directory here
    - name: "--input_pretrain"
      type: "file"
      example: "pretrain_model"
      description: Path to the directory containing a pretrained model.
      required: true
    # required outputs
    - name: "--output"
      type: "file"
      direction: "output"
      example: "output.h5ad"
      description: Dataset with predicted values for modality2.
      required: true
      
  # files your script needs
  resources:
    - type: python_script
      path: script.py
    # includes all code under resources/ (shared with the train component)
    - path: ../resources
59 | 
# target platforms
platforms:
  # docker: PyTorch CUDA runtime image plus the extra Python packages below;
  # run_args hands "--gpus all" to docker run
  - type: docker
    image: "pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime"
    run_args: [ "--gpus all" ] 
    setup:
      - type: python
        packages:
          - scikit-learn
          - anndata
          - scanpy
          - numpy

  # nextflow: label names match the withLabel selectors in
  # src/resources/nextflow.config
  - type: nextflow
    labels: [ highmem, hightime, midcpu, gpu ]
75 | 


--------------------------------------------------------------------------------
/src/predict_modality/methods/scJoint/test.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash"
  4 | export NXF_VER=21.04.1
  5 | export PIPELINE_VERSION=1.4.0
  6 | method_id=scjoint
  7 | task_id=predict_modality
  8 | pretrain_path=output/pretrain/$task_id/$method_id
  9 | 
 10 | # GENERATE PRETRAIN
 11 | echo ""
 12 | echo "######################################################################"
 13 | echo "##                Generating pretrain weights/files                 ##"
 14 | echo "######################################################################"
 15 | 
 16 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
 17 |   --data_dir output/datasets/$task_id \
 18 |   --output_pretrain ${pretrain_path}
 19 | 
 20 | echo ""
 21 | echo "######################################################################"
 22 | echo "##                   Generating prediction files                    ##"
 23 | echo "######################################################################"
 24 | 
 25 | # CITE GEX2ADT
 26 | echo ""
 27 | echo "CITE GEX to ADT"
 28 | dataset_id=openproblems_bmmc_cite_phase2_rna
 29 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
 30 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
 31 | 
 32 | target/docker/${task_id}_methods/${method_id}/${method_id} \
 33 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 34 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 35 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
 36 |   --input_pretrain "${pretrain_path}/gex2adt_train.output_pretrain/" \
 37 |   --output ${pred_path}.${method_id}.output.h5ad
 38 | 
 39 | # CITE ADT2GEX
 40 | echo ""
 41 | echo "CITE ADT to GEX"
 42 | dataset_id=openproblems_bmmc_cite_phase2_mod2
 43 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
 44 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
 45 | 
 46 | target/docker/${task_id}_methods/${method_id}/${method_id} \
 47 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 48 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 49 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
 50 |   --input_pretrain "${pretrain_path}/adt2gex_train.output_pretrain/" \
 51 |   --output ${pred_path}.${method_id}.output.h5ad
 52 | 
 53 | # MULTIOME GEX2ATAC
 54 | echo ""
 55 | echo "MULTIOME GEX to ATAC"
 56 | dataset_id=openproblems_bmmc_multiome_phase2_rna
 57 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
 58 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
 59 | 
 60 | target/docker/${task_id}_methods/${method_id}/${method_id} \
 61 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 62 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 63 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
 64 |   --input_pretrain "${pretrain_path}/gex2atac_train.output_pretrain/" \
 65 |   --output ${pred_path}.${method_id}.output.h5ad
 66 | 
 67 | # MULTIOME ATAC2GEX
 68 | echo ""
 69 | echo "MULTIOME ATAC to GEX"
 70 | dataset_id=openproblems_bmmc_multiome_phase2_mod2
 71 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
 72 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
 73 | 
 74 | target/docker/${task_id}_methods/${method_id}/${method_id} \
 75 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
 76 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
 77 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
 78 |   --input_pretrain "${pretrain_path}/atac2gex_train.output_pretrain/" \
 79 |   --output ${pred_path}.${method_id}.output.h5ad
 80 | 
 81 | # RUN EVALUATION
 82 | echo ""
 83 | echo "######################################################################"
 84 | echo "##                      Evaluating predictions                      ##"
 85 | echo "######################################################################"
 86 | bin/nextflow run "$PIPELINE_REPO" \
 87 |   -r "$PIPELINE_VERSION" \
 88 |   -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \
 89 |   --solutionDir "output/datasets/$task_id" \
 90 |   --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \
 91 |   --publishDir "output/evaluation/$task_id/$method_id/" \
 92 |   -latest \
 93 |   -resume \
 94 |   -c "src/resources/nextflow_moremem.config"
 95 | 
 96 | echo ""
 97 | echo "######################################################################"
 98 | echo "##                        Evaluation summary                        ##"
 99 | echo "######################################################################"
100 | cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json"


--------------------------------------------------------------------------------
/src/predict_modality/methods/scJoint/train/config.vsh.yaml:
--------------------------------------------------------------------------------
# viash component definition for the scJoint pretraining step.
# test.sh invokes the built component as
# target/docker/<namespace>/<name>/<name> with --data_dir and --output_pretrain.
functionality:
  name: scjoint_train
  namespace: predict_modality_methods
  
  # metadata for your method
  version: dev
  description: An ensemble method including pca, nn, feature extraction.
  authors:
    - name: Yu-Hsiu Chen
      email: yhchen.cm06g@nctu.edu.tw
      roles: [ author, maintainer ]
    - name: Sheng Wan
      email: a5736735a.eecs99@g2.nctu.edu.tw
    - name: Tung-Yu Wu
      email: wtywty@gmail.com

  # parameters
  arguments:
    # required inputs
    - name: "--data_dir"
      type: "file"
      description: The path to the predict_modality datasets
      required: true

    # required outputs
    # test.sh later reads "<mode>_train.output_pretrain/" sub-directories from this path
    - name: "--output_pretrain"
      type: "file"
      direction: "output"
      example: "pretrain_model"
      description: Path to the directory containing the pretrained models.
      required: true
      
  # files your script needs
  resources:
    # entry point is a shell script rather than a Python script
    - type: bash_script
      path: train.sh
    # includes all code under resources/ (shared with the run component)
    - path: ../resources
39 | 
# target platforms
platforms:

  # docker: PyTorch CUDA runtime image plus the extra Python packages below;
  # run_args hands "--gpus all" to docker run
  - type: docker
    image: "pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime"
    run_args: [ "--gpus all" ] 
    setup:
      - type: python
        packages:
          - scikit-learn
          - anndata
          - scanpy
          - numpy

  # nextflow: label names match the withLabel selectors in
  # src/resources/nextflow.config
  - type: nextflow
    labels: [ highmem, hightime, midcpu, gpu ]
56 | 


--------------------------------------------------------------------------------
/src/resources/nextflow.config:
--------------------------------------------------------------------------------
// Pull in the pipeline config that lives under the launch directory.
includeConfig "${launchDir}/target/nextflow/nextflow.config"

// Resource presets, selected through process labels.
process {
  // CPU tiers (lowcpu and midcpu currently share the same value)
  withLabel: lowcpu { cpus = 4 }
  withLabel: midcpu { cpus = 4 }
  withLabel: highcpu { cpus = 15 }
  withLabel: vhighcpu { cpus = 30 }
  // memory tiers (low/mid and high/vhigh currently share the same values)
  withLabel: lowmem { memory = 60.GB }
  withLabel: midmem { memory = 60.GB }
  withLabel: highmem { memory = 110.GB }
  withLabel: vhighmem { memory = 110.GB }
  // wall-time tiers
  withLabel: lowtime { time = "20m" }
  withLabel: midtime { time = "40m" }
  withLabel: hightime { time = "60m" }
  withLabel: vhightime { time = "120m" }
  withLabel: vvhightime { time = "360m" }
  // gpu: at most one such task at a time; the container gets "--gpus all"
  withLabel: gpu { maxForks = 1; containerOptions = '--gpus all' }
}

// Temp directory mounted into the containers; VIASH_TEMP overrides the /tmp/ default.
def viash_temp = System.getenv("VIASH_TEMP") ?: "/tmp/"
// Mount target/nextflow and the temp dir at identical paths, set shared
// memory to 5G, and run the containers with "--net none".
docker.runOptions = "-v ${launchDir}/target/nextflow:${launchDir}/target/nextflow -v $viash_temp:$viash_temp --shm-size=5G --net none"
22 | 


--------------------------------------------------------------------------------
/src/resources/nextflow_moremem.config:
--------------------------------------------------------------------------------
// Resource presets, selected through process labels. test.sh passes this
// file to the evaluation workflow with `-c`.
// NOTE(review): the values are identical to the process block in
// nextflow.config, despite the "moremem" name - confirm this is intended.
process {
  // CPU tiers (lowcpu and midcpu currently share the same value)
  withLabel: lowcpu { cpus = 4 }
  withLabel: midcpu { cpus = 4 }
  withLabel: highcpu { cpus = 15 }
  withLabel: vhighcpu { cpus = 30 }
  // memory tiers (low/mid and high/vhigh currently share the same values)
  withLabel: lowmem { memory = 60.GB }
  withLabel: midmem { memory = 60.GB }
  withLabel: highmem { memory = 110.GB }
  withLabel: vhighmem { memory = 110.GB }
  // wall-time tiers
  withLabel: lowtime { time = "20m" }
  withLabel: midtime { time = "40m" }
  withLabel: hightime { time = "60m" }
  withLabel: vhightime { time = "120m" }
  withLabel: vvhightime { time = "360m" }
  // gpu: at most one such task at a time; the container gets "--gpus all"
  withLabel: gpu { maxForks = 1; containerOptions = '--gpus all' }
}
17 | 


--------------------------------------------------------------------------------
/src/sync_datasets.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | function aws_s3 {
 4 |   CMD="$1"
 5 |   SOURCE="$2"
 6 |   DEST="$3"
 7 |   # use aws cli if installed
 8 |   if command -v aws &> /dev/null; then
 9 |     aws s3 "$CMD" --no-sign-request "$SOURCE" "$DEST"
10 |   # else use aws docker container instead
11 |   else
12 |     docker run \
13 |       --user $(id -u):$(id -g) \
14 |       --rm -it \
15 |       -v $(pwd)/output:/output \
16 |       -w / \
17 |       amazon/aws-cli \
18 |       s3 "$CMD" --no-sign-request "$SOURCE" "$DEST"
19 |   fi
20 | }
21 | 
22 | aws_s3 sync "s3://openproblems-bio/public/phase1-data/" "output/datasets_phase1"
23 | aws_s3 sync "s3://openproblems-bio/public/phase1v2-data/" "output/datasets_phase1v2"
24 | # aws_s3 sync "s3://openproblems-bio/public/phase2-data/" "output/datasets_phase2_public"
25 | aws_s3 sync "s3://openproblems-bio/public/phase2-data/joint_embedding/" "output/datasets_phase2_public/joint_embedding"
26 | aws_s3 sync "s3://openproblems-bio/public/phase2-private-data/" "output/datasets"
27 | aws_s3 sync "s3://openproblems-bio/public/explore/" "output/datasets_explore"


--------------------------------------------------------------------------------