├── .gitignore
├── LICENSE
├── README.Rmd
├── README.md
├── bin
├── .gitignore
├── README.md
└── init
├── resources
├── github_mark.svg
└── orcid_id.svg
├── sample_data
└── predict_modality
│ ├── openproblems_bmmc_cite_starter
│ ├── openproblems_bmmc_cite_starter.test_mod1.h5ad
│ ├── openproblems_bmmc_cite_starter.test_mod2.h5ad
│ ├── openproblems_bmmc_cite_starter.train_mod1.h5ad
│ └── openproblems_bmmc_cite_starter.train_mod2.h5ad
│ └── openproblems_bmmc_multiome_starter
│ ├── openproblems_bmmc_multiome_starter.test_mod1.h5ad
│ ├── openproblems_bmmc_multiome_starter.test_mod2.h5ad
│ ├── openproblems_bmmc_multiome_starter.train_mod1.h5ad
│ └── openproblems_bmmc_multiome_starter.train_mod2.h5ad
└── src
├── joint_embedding
└── methods
│ ├── Guanlab-dengkw
│ ├── run
│ │ ├── config.vsh.yaml
│ │ └── script.py
│ └── test.sh
│ ├── jae
│ ├── README.md
│ ├── model_architecture.png
│ ├── resources
│ │ └── utils.py
│ ├── run
│ │ ├── config.vsh.yaml
│ │ └── script.py
│ ├── test.sh
│ └── train
│ │ ├── config.vsh.yaml
│ │ └── script.py
│ └── lsl_ae
│ ├── run
│ ├── config.vsh.yaml
│ └── script.py
│ └── test.sh
├── match_modality
└── methods
│ ├── clue
│ ├── README.md
│ ├── clue_architecture.jpg
│ ├── resources
│ │ ├── scglue-0.1.1-py3-none-any.whl
│ │ └── utils.py
│ ├── run
│ │ ├── config.vsh.yaml
│ │ └── script.py
│ ├── test.sh
│ └── train
│ │ ├── config.vsh.yaml
│ │ └── script.py
│ └── novel
│ ├── README.md
│ ├── novel_architecture1.png
│ ├── novel_architecture2.png
│ ├── resources
│ ├── catalyst_tools.py
│ ├── config_ADT2GEX.py
│ ├── config_ATAC2GEX.py
│ ├── data.py
│ ├── models.py
│ ├── postprocessing.py
│ └── preprocessing.py
│ ├── run
│ ├── config.vsh.yaml
│ └── script.py
│ ├── test.sh
│ └── train
│ ├── config.vsh.yaml
│ └── script.py
├── predict_modality
└── methods
│ ├── AXX
│ ├── .gitignore
│ ├── README.md
│ ├── resources
│ │ ├── const.py
│ │ ├── models.py
│ │ ├── predict.py
│ │ ├── test.py
│ │ ├── train.py
│ │ ├── utils.py
│ │ └── yaml
│ │ │ ├── mlp_ADT2GEX.yaml
│ │ │ ├── mlp_ATAC2GEX.yaml
│ │ │ └── mlp_GEX2ADT.yaml
│ ├── run
│ │ ├── config.vsh.yaml
│ │ └── script.py
│ ├── test.sh
│ └── train
│ │ ├── config.vsh.yaml
│ │ └── script.py
│ ├── DANCE
│ ├── resources
│ │ ├── baseline.py
│ │ └── graph_util.py
│ ├── run
│ │ ├── config.vsh.yaml
│ │ └── script.py
│ ├── test.sh
│ └── train
│ │ ├── config.vsh.yaml
│ │ ├── generate_extra_files.py
│ │ ├── h.all.v7.4.entrez.gmt
│ │ ├── h.all.v7.4.symbols.gmt
│ │ ├── hetero_arg_version_v5.py
│ │ └── script.sh
│ ├── Guanlab-dengkw
│ ├── run
│ │ ├── config.vsh.yaml
│ │ └── script.py
│ └── test.sh
│ ├── LS_lab
│ ├── run
│ │ ├── config.vsh.yaml
│ │ └── script.py
│ └── test.sh
│ ├── cajal
│ ├── run
│ │ ├── config.vsh.yaml
│ │ └── script.py
│ ├── test.sh
│ └── train
│ │ ├── ADT_list_df_updated.csv
│ │ ├── config.vsh.yaml
│ │ └── script.py
│ ├── novel
│ ├── README.md
│ ├── novel_architecture.jpg
│ ├── resources
│ │ └── helper_functions.py
│ ├── run
│ │ ├── config.vsh.yaml
│ │ └── script.py
│ ├── test.sh
│ └── train
│ │ ├── config.vsh.yaml
│ │ └── script.py
│ └── scJoint
│ ├── .gitignore
│ ├── README.md
│ ├── resources
│ ├── modules
│ │ └── model_ae.py
│ ├── opts.py
│ ├── preprocess
│ │ ├── save_filter_genes.py
│ │ ├── save_highlyvar_genes.py
│ │ └── save_idf_matrix.py
│ ├── train.py
│ ├── trainer
│ │ ├── __init__.py
│ │ ├── trainer_batchgan.py
│ │ ├── trainer_cycle.py
│ │ └── trainer_nn.py
│ └── utils
│ │ ├── __init__.py
│ │ ├── dataloader.py
│ │ ├── loss.py
│ │ └── metric.py
│ ├── run
│ ├── config.vsh.yaml
│ └── script.py
│ ├── test.sh
│ └── train
│ ├── config.vsh.yaml
│ └── train.sh
├── resources
├── nextflow.config
└── nextflow_moremem.config
└── sync_datasets.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | output
2 | *.pyc
3 | target
4 | work
5 | .nextflow*
6 | log.txt
7 | README.html
8 | bin/build_for_release.sh
9 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Open Problems in Single-Cell Analysis
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/bin/.gitignore:
--------------------------------------------------------------------------------
1 | fetch
2 | viash*
3 | nextflow
4 |
--------------------------------------------------------------------------------
/bin/README.md:
--------------------------------------------------------------------------------
1 | These executables were generated by running the `bin/init` executable.
2 |
--------------------------------------------------------------------------------
/bin/init:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Bootstrap the local toolchain for this repository: verifies that Java and
# Docker are on PATH, then installs the viash helper executables and the
# nextflow launcher into bin/.

# Check Java installed.
# `command -v` only takes command names; a trailing `--version` would be looked
# up as a second command name rather than being passed to java.
if ! command -v java &> /dev/null
then
  echo "Please ensure Java Runtime ≥8 is installed. You can find an open source installer here: https://adoptopenjdk.net/?variant=openjdk8&jvmVariant=hotspot" >&2
  # A bare `exit` would return the status of the echo above (0) and hide the failure.
  exit 1
fi

# Check Docker installed
if ! command -v docker &> /dev/null
then
  echo "Please ensure Docker is installed and up-to-date. Instructions at https://www.docker.com/get-started" >&2
  exit 1
fi

# get the root of the directory
if ! REPO_ROOT=$(git rev-parse --show-toplevel)
then
  echo "bin/init must be run from inside the git repository." >&2
  exit 1
fi

# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT" || exit 1

# The installer is piped straight into bash, so fetch it over HTTPS instead of
# plain HTTP to avoid executing a tampered script.
curl -fsSL https://get.viash.io | bash -s -- \
  --viash bin/viash \
  --registry openproblems \
  --tag 0.5.5 \
  --log check_results/results.tsv \
  --config_mod '.platforms[.type == "nextflow"].separate_multiple_outputs := false' \
  --config_mod '.platforms[.type == "nextflow"].directive_memory := "10GB"' \
  --config_mod '.platforms[.type == "nextflow"].directive_time := "10 m"'

# nextflow installs itself into the current directory
cd bin || exit 1

# -f makes curl fail on an HTTP error instead of piping an error page into bash
curl -fsSL https://get.nextflow.io | bash
35 |
--------------------------------------------------------------------------------
/resources/github_mark.svg:
--------------------------------------------------------------------------------
1 |
2 |
28 |
--------------------------------------------------------------------------------
/resources/orcid_id.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/sample_data/predict_modality/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.test_mod1.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/sample_data/predict_modality/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.test_mod1.h5ad
--------------------------------------------------------------------------------
/sample_data/predict_modality/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.test_mod2.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/sample_data/predict_modality/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.test_mod2.h5ad
--------------------------------------------------------------------------------
/sample_data/predict_modality/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.train_mod1.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/sample_data/predict_modality/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.train_mod1.h5ad
--------------------------------------------------------------------------------
/sample_data/predict_modality/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.train_mod2.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/sample_data/predict_modality/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.train_mod2.h5ad
--------------------------------------------------------------------------------
/sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod1.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod1.h5ad
--------------------------------------------------------------------------------
/sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod2.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod2.h5ad
--------------------------------------------------------------------------------
/sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod1.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod1.h5ad
--------------------------------------------------------------------------------
/sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod2.h5ad:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod2.h5ad
--------------------------------------------------------------------------------
/src/joint_embedding/methods/Guanlab-dengkw/run/config.vsh.yaml:
--------------------------------------------------------------------------------
1 | functionality:
2 | name: guanlab_dengkw_je
3 | namespace: joint_embedding_methods
4 |
5 | # metadata for your method
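6 | description: Reduces each modality with truncated SVD, standardizes the components of every cell, and concatenates the two reductions into a joint embedding of 100 dimensions.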
7 | info:
8 | method_label: "Guanlab-dengkw"
9 | submission_id: "170795"
10 | team_name: Guanlab-dengkw
11 | # project_url: https://github.com/foo/bar
12 | # publication_doi: 10.1101/0123.45.67.890123
13 | # publication_url: https://arxiv.org/abs/1234.56789
14 |
15 | authors:
16 | - name: Kaiwen Deng
17 | email: dengkw@umich.edu
18 | roles: [ author, maintainer ]
19 | props: { github: nonztalk }
20 |
21 | # parameters
22 | arguments:
23 | # required inputs
24 | - name: "--input_mod1"
25 | type: "file"
26 | example: "dataset_mod1.h5ad"
27 | description: Modality 1 dataset.
28 | required: true
29 | - name: "--input_mod2"
30 | type: "file"
31 | example: "dataset_mod2.h5ad"
32 | description: Modality 2 dataset.
33 | required: true
34 | # required outputs
35 | - name: "--output"
36 | type: "file"
37 | direction: "output"
38 | example: "output.h5ad"
39 | description: Data for all cells in mod1 and mod2 embedded to ≤100 dimensions.
40 | required: true
41 |
42 |
43 | # files your script needs
44 | resources:
45 | - type: python_script
46 | path: script.py
47 |
48 | # target platforms
49 | platforms:
50 | - type: docker
51 | image: dataintuitive/randpy:py3.8
52 | setup:
53 | - type: python
54 | packages:
55 | - anndata
56 | - umap-learn
57 | - type: nextflow
58 | labels: [ vhighmem, hightime, vhighcpu ]
59 |
--------------------------------------------------------------------------------
/src/joint_embedding/methods/Guanlab-dengkw/run/script.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import anndata as ad
3 | import numpy as np
4 |
5 | from sklearn.decomposition import TruncatedSVD
6 |
7 | logging.basicConfig(level=logging.INFO)
8 |
9 | ## VIASH START
10 | dataset_path = "output/datasets/joint_embedding/openproblems_bmmc_cite_phase2/openproblems_bmmc_cite_phase2.censor_dataset.output_"
11 |
12 | par = {
13 | 'input_mod1': f'{dataset_path}mod1.h5ad',
14 | 'input_mod2': f'{dataset_path}mod2.h5ad',
15 | 'output': 'output.h5ad'
16 | }
17 | meta = {
18 | 'resources_dir': '.',
19 | 'functionality_name': 'submission_170795'
20 | }
21 | ## VIASH END
22 |
def normalize(arr):
    """Standardize every row of a 2-D array to zero mean and unit variance.

    Parameters
    ----------
    arr : numpy.ndarray
        Two-dimensional array; statistics are computed along axis 1 (per row).

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``arr``. A row whose values are all equal
        is mapped to zeros instead of NaN/inf.
    """
    arr_mean = np.mean(arr, axis=1).reshape(-1, 1)
    arr_sd = np.std(arr, axis=1).reshape(-1, 1)
    # A constant row has zero standard deviation; dividing by it would put
    # NaN/inf into the embedding. Dividing the (all-zero) centered row by 1
    # leaves it at zero instead.
    arr_sd = np.where(arr_sd == 0, 1.0, arr_sd)
    return (arr - arr_mean) / arr_sd
27 |
# Script body: reduce each modality with truncated SVD, standardize the
# components of every cell, concatenate both reductions and write the joint
# embedding to par['output'].

logging.info('Reading `h5ad` files...')
ad_mod1 = ad.read_h5ad(par['input_mod1'])
ad_mod2 = ad.read_h5ad(par['input_mod2'])

logging.info('Determine parameters by the modalities')
# `.iloc[0]` picks the first feature by position. Plain `[0]` on a Series that
# is indexed by feature name relies on pandas' deprecated positional fallback.
mod1_type = ad_mod1.var.feature_types.iloc[0].upper()
mod2_type = ad_mod2.var.feature_types.iloc[0].upper()

# Number of SVD components kept for (modality 1, modality 2); every pair sums
# to 100 columns in the final embedding.
components_by_modality_pair = {
    ("GEX", "ADT"): (73, 27),
    ("ADT", "GEX"): (27, 73),
    ("GEX", "ATAC"): (38, 62),
    ("ATAC", "GEX"): (62, 38),
}
# Any other combination falls back to an even split.
n_mod1, n_mod2 = components_by_modality_pair.get((mod1_type, mod2_type), (50, 50))

logging.info('Performing dimensionality reduction on modality 1 values...')
embedder_mod1 = TruncatedSVD(n_components=n_mod1)
mod1_pca = embedder_mod1.fit_transform(ad_mod1.X)
# Keep only what the output needs so the full AnnData can be released.
mod1_obs = ad_mod1.obs
mod1_uns = ad_mod1.uns
del ad_mod1

logging.info('Performing dimensionality reduction on modality 2 values...')
embedder_mod2 = TruncatedSVD(n_components=n_mod2)
mod2_pca = embedder_mod2.fit_transform(ad_mod2.X)
del ad_mod2

logging.info('Concatenating datasets')
pca_combined = np.concatenate([normalize(mod1_pca), normalize(mod2_pca)], axis=1)

logging.info('Storing output to file')
adata = ad.AnnData(
    X=pca_combined,
    obs=mod1_obs,
    uns={
        'dataset_id': mod1_uns['dataset_id'],
        'method_id': meta['functionality_name'],
    },
)
adata.write_h5ad(par['output'], compression="gzip")
81 |
--------------------------------------------------------------------------------
/src/joint_embedding/methods/Guanlab-dengkw/test.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Runs the Guanlab-dengkw joint embedding component on the CITE and MULTIOME
# phase 2 datasets, then scores the predictions with the evaluation pipeline.
# Expects to be started from the repository root after the component was built
# into target/docker/.

export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash"
export NXF_VER=21.04.1
export PIPELINE_VERSION=1.4.0
# Must equal `functionality.name` in run/config.vsh.yaml: the executable path
# below is target/docker/<namespace>/<name>/<name>. The original submission id
# (submission_170795) does not match that name.
method_id=guanlab_dengkw_je
task_id=joint_embedding

# CITE
dataset_id=openproblems_bmmc_cite_phase2
dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
pred_path=output/predictions/$task_id/$dataset_id/$dataset_id

target/docker/${task_id}_methods/${method_id}/${method_id} \
  --input_mod1 ${dataset_path}.output_mod1.h5ad \
  --input_mod2 ${dataset_path}.output_mod2.h5ad \
  --output ${pred_path}.${method_id}.output.h5ad

# MULTIOME
dataset_id=openproblems_bmmc_multiome_phase2
dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
pred_path=output/predictions/$task_id/$dataset_id/$dataset_id

target/docker/${task_id}_methods/${method_id}/${method_id} \
  --input_mod1 ${dataset_path}.output_mod1.h5ad \
  --input_mod2 ${dataset_path}.output_mod2.h5ad \
  --output ${pred_path}.${method_id}.output.h5ad


# RUN EVALUATION
bin/nextflow run "$PIPELINE_REPO" \
  -r "$PIPELINE_VERSION" \
  -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \
  --solutionDir "output/datasets/$task_id" \
  --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \
  --publishDir "output/evaluation/$task_id/$method_id/" \
  -latest \
  -resume \
  -c "src/resources/nextflow_moremem.config"

cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json"
--------------------------------------------------------------------------------
/src/joint_embedding/methods/jae/README.md:
--------------------------------------------------------------------------------
1 | # Single cell joint embedding with an autoencoder (JAE)
2 |
3 | **Team**: Amateur
4 |
5 | **Team members**: Qiao Liu, Wanwen Zeng, Chencheng Xu
6 |
7 | **Project URL**: https://github.com/kimmo1019/JAE
8 |
9 |
10 |
11 | In brief, we built an autoencoder for joint embedding (JAE). Each modality is first SVD-transformed, and the results are concatenated (denoted as x). The major difference from a standard autoencoder is that we incorporate information from cell annotations (e.g., cell label, cell cycle score, and cell batch) to constrain the structure of the latent features. We want some latent features (c) to predict the cell type information and some features to predict the cell cycle score. Notably, for the batch features (b), we want them to predict the batch label as randomly as possible, to potentially eliminate the batch effect. The remaining features (z) have no constraint at all, to preserve the flexibility of the neural network.
12 |
13 | In the pretrain stage, JAE was trained with exploration data where the cell annotation information (cell type, cell cycle phase score) is available. In the test stage where the cell annotation information is not available, we only minimize the reconstruction loss of the autoencoder with a smaller learning rate (fine-tune).
14 |
15 |
16 | Feel free to contact `liuqiao@stanford.edu` if you have any problems with the JAE model.
17 |
18 |
19 |
20 |
21 |
22 |
--------------------------------------------------------------------------------
/src/joint_embedding/methods/jae/model_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/src/joint_embedding/methods/jae/model_architecture.png
--------------------------------------------------------------------------------
/src/joint_embedding/methods/jae/resources/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Keras building blocks for JAE: an early-stopping callback and the joint embedding autoencoder model.
3 | """
4 |
5 | import numpy as np
6 | import tensorflow as tf
7 |
class EarlyStoppingAtMinLoss(tf.keras.callbacks.Callback):
    """Stop training when ``val_loss`` stops improving and restore the best weights.

    After every epoch the reported ``val_loss`` is compared with the lowest
    value seen so far. Once ``patience`` epochs in a row bring no improvement,
    training is stopped and the weights from the best epoch are put back on the
    model.

    Parameters
    ----------
    patience : int
        Number of consecutive non-improving epochs tolerated before stopping.
    """

    def __init__(self, patience=0):
        super(EarlyStoppingAtMinLoss, self).__init__()
        self.patience = patience
        # Weights of the epoch with the lowest val_loss; None until one is seen.
        self.best_weights = None

    def on_train_begin(self, logs=None):
        # Number of epochs since the last improvement.
        self.wait = 0
        # Epoch index at which training was stopped (0 means "not stopped").
        self.stopped_epoch = 0
        # `np.Inf` was removed in NumPy 2.0; `np.inf` works on every version.
        self.best = np.inf

    def on_epoch_end(self, epoch, logs=None):
        current = (logs or {}).get("val_loss")
        if current is None:
            # No validation loss was reported (e.g. fit() without validation
            # data). There is nothing to monitor, so skip instead of failing
            # inside np.less(None, ...).
            import warnings
            warnings.warn(
                "EarlyStoppingAtMinLoss requires `val_loss`, which is not available."
            )
            return
        if np.less(current, self.best):
            self.best = current
            self.wait = 0
            self.best_weights = self.model.get_weights()
        else:
            self.wait += 1
            if self.wait >= self.patience:
                self.stopped_epoch = epoch
                self.model.stop_training = True
                # best_weights is still None when no epoch ever improved on
                # +inf (e.g. val_loss was NaN from the start); set_weights(None)
                # would raise, so only restore when there is something to restore.
                if self.best_weights is not None:
                    print("Restoring model weights from the end of the best epoch.")
                    self.model.set_weights(self.best_weights)

    def on_train_end(self, logs=None):
        if self.stopped_epoch > 0:
            print("Epoch %05d: early stopping" % (self.stopped_epoch + 1))
36 |
37 |
class JointEmbeddingModel(tf.keras.Model):
    """Autoencoder whose latent vector is partly read out as annotation predictions.

    The model is made of three sub-models: an encoder, a decoder that
    reconstructs the input from the latent vector, and a "classifier" that
    has no weights of its own and only slices the latent vector into
    cell-type, batch and phase outputs.

    Parameters
    ----------
    params : dict
        Hyperparameters. Keys read by this class: ``dim`` (input width),
        ``hidden_units`` (layer widths, the last one being the latent width),
        ``nb_cell_types``, ``nb_batches``, ``nb_phases`` (widths of the latent
        slices) and ``use_batch`` (whether ``call`` returns the batch and
        phase outputs).
    name : str, optional
        Name forwarded to ``tf.keras.Model``.
    """

    def __init__(self, params, name=None):
        super(JointEmbeddingModel, self).__init__(name=name)
        self.params = params
        self.encoder = self.create_encoder()
        self.decoder = self.create_decoder()
        self.classifier = self.create_classifier()

    def get_config(self):
        return {
            "params": self.params,
        }

    def call(self, inputs, training=None):
        """Run the autoencoder and return the reconstruction plus latent slices.

        Returns ``(decoded, cell_type, batch, phase)`` when
        ``params['use_batch']`` is truthy, otherwise ``(decoded, cell_type)``.
        ``training`` defaults to None so the method can also be invoked
        without that argument.
        """
        encoded = self.encoder(inputs)
        decoded = self.decoder(encoded)
        digits_cell_type, digits_batch, digits_phase = self.classifier(encoded)
        if self.params['use_batch']:
            return decoded, digits_cell_type, digits_batch, digits_phase
        else:
            return decoded, digits_cell_type

    def create_encoder(self, use_resnet=True):
        """Build the encoder mapping ``dim`` inputs to the latent vector.

        With ``use_resnet`` every hidden width gets a residual unit
        (Dense -> Dropout -> BatchNorm -> Dense, added back to the first Dense
        output, then ReLU); otherwise a plain Dense -> Dropout -> BatchNorm
        stack is used.
        """
        inputs = tf.keras.layers.Input(shape=(self.params['dim'],))
        # Start from the input so the final Dense still has something to
        # consume when `hidden_units` holds only the latent width (the loop
        # below is then empty).
        x = inputs
        for n_unit in self.params['hidden_units'][:-1]:
            if use_resnet:
                x_init = tf.keras.layers.Dense(n_unit, activation='relu')(x)
                x = tf.keras.layers.Dropout(0.1)(x_init)
                x = tf.keras.layers.BatchNormalization()(x)
                x = tf.keras.layers.Dense(n_unit)(x)
                x = tf.keras.layers.Add()([x, x_init])
                x = tf.keras.layers.Activation(activation='relu')(x)
            else:
                x = tf.keras.layers.Dense(n_unit, activation='relu')(x)
                x = tf.keras.layers.Dropout(0.1)(x)
                x = tf.keras.layers.BatchNormalization()(x)
        encoded = tf.keras.layers.Dense(self.params['hidden_units'][-1], activation='relu')(x)
        return tf.keras.Model(inputs=inputs, outputs=encoded, name='encoder')

    def create_decoder(self):
        """Build the decoder: the encoder's hidden widths in reverse, then ``dim`` outputs."""
        inputs = tf.keras.layers.Input(shape=(self.params['hidden_units'][-1],))
        # Same guard as in the encoder for a single-entry `hidden_units`.
        x = inputs
        for n_unit in self.params['hidden_units'][:-1][::-1]:
            x = tf.keras.layers.Dense(n_unit, activation='relu')(x)
        decoded = tf.keras.layers.Dense(self.params['dim'], activation='relu')(x)
        return tf.keras.Model(inputs=inputs, outputs=decoded, name='decoder')

    def create_classifier(self):
        """Build the weight-free head that slices the latent vector.

        The first ``nb_cell_types`` columns are the cell-type output, the next
        ``nb_batches`` the batch output and the following ``nb_phases`` the
        phase output; any remaining latent columns are not returned.
        """
        inputs = tf.keras.layers.Input(shape=(self.params['hidden_units'][-1],))
        end_cell_type = self.params['nb_cell_types']
        end_batch = end_cell_type + self.params['nb_batches']
        end_phase = end_batch + self.params['nb_phases']
        digits_cell_type = inputs[:, :end_cell_type]
        digits_batch = inputs[:, end_cell_type:end_batch]
        digits_phase = inputs[:, end_batch:end_phase]
        return tf.keras.Model(inputs=inputs, outputs=[digits_cell_type, digits_batch, digits_phase], name='classifier')
101 |
102 |
--------------------------------------------------------------------------------
/src/joint_embedding/methods/jae/run/config.vsh.yaml:
--------------------------------------------------------------------------------
1 | functionality:
2 | name: jae
3 | namespace: joint_embedding_methods
4 |
5 | # metadata for your method
6 | description: In brief, we built an autoencoder for joint embedding (JAE). Each modality is first SVD-transformed, and the results are concatenated. The major difference from a standard autoencoder is that we incorporate information from cell annotations (e.g., cell label, cell cycle score, and cell batch) to constrain the structure of the latent features. We want some latent features to predict the cell type information and some features to predict the cell cycle score. Notably, for the features corresponding to the batch effect, we want them to predict the batch label as randomly as possible, to potentially eliminate the batch effect. There are also several nodes that have no constraint at all, to preserve the flexibility of the neural network.
7 | info:
8 | method_label: "JAE"
9 | submission_id: "170936/171079"
10 | team_name: Amateur
11 | project_url: https://github.com/kimmo1019/JAE
12 |
13 | authors:
14 | - name: Qiao Liu
15 | email: liuqiao@stanford.edu
16 | roles: [ author, maintainer ]
17 | props: { github: kimmo1019, orcid: "0000-0002-9781-3360", url: "http://liuqiao.me" }
18 | - name: Wanwen Zeng
19 | email: wanwen@stanford.edu
20 | roles: [ author ]
21 | props: { github: wanwenzeng, orcid: "0000-0003-3426-0890", url: "https://scholar.google.com/citations?user=MbeOhkgAAAAJ&hl=zh-CN" }
22 | - name: Chencheng Xu
23 | roles: [ author ]
24 | props: { github: Zoesgithub, orcid: "0000-0002-2262-6966" }
25 |
26 | # parameters
27 | arguments:
28 | # required inputs
29 | - name: "--input_mod1"
30 | type: "file"
31 | example: "dataset_mod1.h5ad"
32 | description: Modality 1 dataset.
33 | required: true
34 | - name: "--input_mod2"
35 | type: "file"
36 | example: "dataset_mod2.h5ad"
37 | description: Modality 2 dataset.
38 | required: true
39 | - name: "--input_pretrain"
40 | type: "file"
41 | example: "pretrain_model"
42 | description: Path to the directory containing a pretrained model.
43 | required: true
44 | # required outputs
45 | - name: "--output"
46 | type: "file"
47 | direction: "output"
48 | example: "output.h5ad"
49 | description: Data for all cells in mod1 and mod2 embedded to ≤100 dimensions.
50 | required: true
51 |
52 | # files your script needs
53 | resources:
54 | - type: python_script
55 | path: script.py
56 | - path: '../resources/utils.py'
57 |
58 | # target platforms
59 | platforms:
60 | - type: docker
61 | image: tensorflow/tensorflow:latest-gpu
62 | run_args: [ "--gpus all" ]
63 | setup:
64 | - type: python
65 | packages:
66 | - anndata
67 | - umap-learn
68 | - scanpy
69 | - type: nextflow
70 | labels: [ vhighmem, vhightime, vhighcpu, gpu ]
71 |
--------------------------------------------------------------------------------
/src/joint_embedding/methods/jae/run/script.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import logging
4 | import json
5 | import anndata as ad
6 | import numpy as np
7 | from sklearn.preprocessing import normalize
8 | import tensorflow as tf
9 | import pickle as pk
10 | import scipy
11 |
12 | logging.basicConfig(level=logging.INFO)
13 |
14 | ## VIASH START
15 | dataset_path = 'sample_data/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.'
16 |
17 | par = {
18 | 'input_mod1': dataset_path + 'mod1.h5ad',
19 | 'input_mod2': dataset_path + 'mod2.h5ad',
20 | 'input_pretrain': '...',
21 | 'output': 'output.h5ad',
22 | }
23 |
24 | meta = { 'resources_dir': '.', 'functionality_name': 'submission_171079' }
25 | ## VIASH END
26 |
27 | sys.path.append(meta['resources_dir'])
28 | from utils import JointEmbeddingModel
29 |
# Load both modalities, keep only the pieces needed later, and read the SVD
# transformers that were fitted during pretraining.

logging.info('Reading `h5ad` files...')
ad_mod1 = ad.read_h5ad(par['input_mod1'])
ad_mod2 = ad.read_h5ad(par['input_mod2'])
mod1_obs = ad_mod1.obs
mod1_uns = ad_mod1.uns

ad_mod2_var = ad_mod2.var

# `.iloc[0]` picks the first feature by position. Plain `[0]` on a Series that
# is indexed by feature name relies on pandas' deprecated positional fallback.
mod_type = ad_mod2_var['feature_types'].iloc[0]

mod1_mat = ad_mod1.layers["counts"]
mod2_mat = ad_mod2.layers["counts"]

del ad_mod2, ad_mod1

# NOTE(review): pickle can execute arbitrary code while loading; only point
# `input_pretrain` at directories produced by the matching train component.
# The files are opened in `with` blocks so the handles are closed right away.
with open(os.path.join(par['input_pretrain'], 'svd_mod1.pkl'), 'rb') as svd_file:
    mod1_svd = pk.load(svd_file)

if mod_type == 'ATAC':
    with open(os.path.join(par['input_pretrain'], 'svd_mod2.pkl'), 'rb') as svd_file:
        mod2_svd = pk.load(svd_file)
else:
    # No second transformer is loaded for non-ATAC data.
    mod2_svd = None
51 |
def svd_transform(mod1_data, mod2_data, mod1_svd, mod2_svd, scale=1e4):
    """Normalize both count matrices and project them with the pretrained SVDs.

    Each row is L1-normalized, multiplied by ``scale`` and log10(1 + x)
    transformed. Modality 1 is then projected with ``mod1_svd``. Modality 2 is
    returned as a dense array when the module-level ``mod_type`` is ``'ADT'``
    and projected with ``mod2_svd`` otherwise.

    Parameters
    ----------
    mod1_data, mod2_data
        Count matrices; they must provide a ``log1p()`` method after
        normalization (presumably scipy sparse matrices — confirm against the
        ``counts`` layer of the input files).
    mod1_svd, mod2_svd
        Fitted transformers exposing ``transform``; ``mod2_svd`` is not used
        for ADT data.
    scale : float
        Factor applied to the L1-normalized rows before the log transform.

    Returns
    -------
    tuple
        ``(pca_data_mod1, pca_data_mod2)``.
    """
    mod1_data = scale * normalize(mod1_data, norm='l1', axis=1)
    mod2_data = scale * normalize(mod2_data, norm='l1', axis=1)
    # Call log1p on the matrix itself: the unbound
    # `scipy.sparse.csr_matrix.log1p(...)` form only works if some other import
    # happened to load the `scipy.sparse` submodule, since the file only does
    # `import scipy`.
    mod1_data = mod1_data.log1p() / np.log(10)
    mod2_data = mod2_data.log1p() / np.log(10)
    pca_data_mod1 = mod1_svd.transform(mod1_data)

    # NOTE: `mod_type` is read from module scope, not passed in.
    if mod_type == 'ADT':
        pca_data_mod2 = mod2_data.toarray()
    else:
        pca_data_mod2 = mod2_svd.transform(mod2_data)
    return pca_data_mod1, pca_data_mod2
64 |
# Project both modalities, fine-tune the pretrained autoencoder on the
# reconstruction objective only, and write the encoder output as the embedding.

mod1_pca, mod2_pca = svd_transform(mod1_mat, mod2_mat, mod1_svd, mod2_svd)
del mod1_mat, mod2_mat

pca_combined = np.concatenate([mod1_pca, mod2_pca], axis=1)
del mod1_pca, mod2_pca

# ATAC data gets one more fine-tuning epoch than the other modality.
epochs = 2 if mod_type == 'ATAC' else 1

# Only the first output (the reconstruction) contributes to the loss.
coeff = [1.0, 0.0, 0.0, 0.0]

pretrain_dir = par['input_pretrain']
with open(os.path.join(pretrain_dir, 'hyperparams.json'), 'r') as file:
    params = json.load(file)

mymodel = JointEmbeddingModel(params)
# One forward pass on dummy input creates the variables before weights are loaded.
mymodel(np.zeros((2, params['dim'])))

optimizer = tf.keras.optimizers.Adam(learning_rate=params["lr"])
losses = [
    tf.keras.losses.MeanSquaredError(),
    tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    tf.keras.losses.MeanSquaredError(),
]
mymodel.compile(optimizer, loss=losses, loss_weights=coeff, run_eagerly=True)

# load pretrain model
mymodel.load_weights(os.path.join(pretrain_dir, 'weights.h5'))

n_cells = pca_combined.shape[0]
X_train = pca_combined
# Random placeholder targets for the annotation outputs; their loss weights
# are zero. Draw order (cell type, batch, phase) is kept as in the original.
c_fakes = np.random.randint(low=0, high=params['nb_cell_types'], size=n_cells)
b_fakes = np.random.randint(low=0, high=params['nb_batches'], size=n_cells)
p_fakes = np.random.randint(low=0, high=params['nb_phases'], size=n_cells)
Y_train = [pca_combined, c_fakes, b_fakes, p_fakes]

# finetune on the test data
mymodel.fit(
    x=X_train,
    y=Y_train,
    epochs=epochs,
    batch_size=32,
    shuffle=True,
)

embeds = mymodel.encoder.predict(pca_combined)
print(embeds.shape)

output_uns = {
    'dataset_id': mod1_uns['dataset_id'],
    'method_id': meta['functionality_name'],
}
adata = ad.AnnData(X=embeds, obs=mod1_obs, uns=output_uns)
adata.write_h5ad(par['output'], compression="gzip")
121 |
--------------------------------------------------------------------------------
/src/joint_embedding/methods/jae/test.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# End-to-end check for the `jae` joint-embedding method.
#
# For each phase-2 dataset (CITE, then MULTIOME) this script
#   1. runs the `jae_train` component to write a pretrained model directory,
#   2. runs the `jae` component, which loads that directory and writes the
#      embedding as an .h5ad prediction file,
# and finally launches the evaluation workflow of the pipeline repository via
# Nextflow and prints the resulting score JSON.
#
# It expects to be started from the repository root: every path below is
# relative (target/docker/..., output/..., bin/nextflow, src/resources/...).

# Read by the `nextflow run` invocation at the bottom (NXF_VER through the
# environment, the other two through explicit expansion).
export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash"
export NXF_VER=21.04.1
export PIPELINE_VERSION=1.4.0
method_id=jae
task_id=joint_embedding

# CITE
dataset_id=openproblems_bmmc_cite_phase2
dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
dataset_train_path=output/datasets_phase2_public/$task_id/$dataset_id/$dataset_id.censor_dataset
pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
pred_path=output/predictions/$task_id/$dataset_id/$dataset_id

# Only arguments declared in train/config.vsh.yaml are passed. The previous
# `--input_sol` flag is not declared there, so the built component had no way
# to receive it.
target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
  --input_mod1 ${dataset_train_path}.output_mod1.h5ad \
  --input_mod2 ${dataset_train_path}.output_mod2.h5ad \
  --input_explore_mod1 output/datasets_explore/cite/cite_gex_processed_training.h5ad \
  --input_explore_mod2 output/datasets_explore/cite/cite_adt_processed_training.h5ad \
  --output_pretrain ${pretrain_path}

target/docker/${task_id}_methods/${method_id}/${method_id} \
  --input_mod1 ${dataset_path}.output_mod1.h5ad \
  --input_mod2 ${dataset_path}.output_mod2.h5ad \
  --input_pretrain ${pretrain_path} \
  --output ${pred_path}.${method_id}.output.h5ad

# MULTIOME
dataset_id=openproblems_bmmc_multiome_phase2
dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
dataset_train_path=output/datasets_phase2_public/$task_id/$dataset_id/$dataset_id.censor_dataset
pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
pred_path=output/predictions/$task_id/$dataset_id/$dataset_id

# Same argument set as for CITE (see the note above about `--input_sol`).
target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
  --input_mod1 ${dataset_train_path}.output_mod1.h5ad \
  --input_mod2 ${dataset_train_path}.output_mod2.h5ad \
  --input_explore_mod1 output/datasets_explore/multiome/multiome_gex_processed_training.h5ad \
  --input_explore_mod2 output/datasets_explore/multiome/multiome_atac_processed_training.h5ad \
  --output_pretrain ${pretrain_path}

target/docker/${task_id}_methods/${method_id}/${method_id} \
  --input_mod1 ${dataset_path}.output_mod1.h5ad \
  --input_mod2 ${dataset_path}.output_mod2.h5ad \
  --input_pretrain ${pretrain_path} \
  --output ${pred_path}.${method_id}.output.h5ad

# RUN EVALUATION
# The glob picks up every prediction file written above for this method.
bin/nextflow run "$PIPELINE_REPO" \
  -r "$PIPELINE_VERSION" \
  -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \
  --solutionDir "output/datasets/$task_id" \
  --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \
  --publishDir "output/evaluation/$task_id/$method_id/" \
  -latest \
  -resume \
  -c "src/resources/nextflow_moremem.config"

cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json"
--------------------------------------------------------------------------------
/src/joint_embedding/methods/jae/train/config.vsh.yaml:
--------------------------------------------------------------------------------
1 | functionality:
2 | name: jae_train
3 | namespace: joint_embedding_methods
4 |
5 | # metadata for your method
6 | description: In brief, we built an autoencoder for joint embedding (JAE). Each modality will first be SVD transformed and concatenated together. The major difference from a standard AE is that we incorporated the information from cell annotations (e.g., cell label, cell cycle score, and cell batch) to constrain the structure of the latent features. We desire that some latent features predict the cell type information and some features predict the cell cycle score. Notably, for the feature corresponding to the batch effect, we want it to predict the batch label as randomly as possible to potentially eliminate the batch effect. There are also several nodes that have no constraint at all to ensure the flexibility of the neural network.
7 | authors:
8 | - name: Qiao Liu
9 | email: liuqiao@stanford.edu
10 | roles: [ author, maintainer ]
11 | props: { github: kimmo1019, orcid: "0000-0002-9781-3360", url: "http://liuqiao.me" }
12 | - name: Wanwen Zeng
13 | email: wanwen@stanford.edu
14 | roles: [ author ]
15 | props: { github: wanwenzeng, orcid: "0000-0003-3426-0890", url: "https://scholar.google.com/citations?user=MbeOhkgAAAAJ&hl=zh-CN" }
16 | - name: Chencheng Xu
17 | roles: [ author ]
18 | props: { github: Zoesgithub, orcid: "0000-0002-2262-6966" }
19 |
20 | # parameters
21 | arguments:
22 | # required inputs
23 | - name: "--input_mod1"
24 | type: "file"
25 | example: "dataset_mod1.h5ad"
26 | description: Modality 1 dataset.
27 | required: true
28 | - name: "--input_mod2"
29 | type: "file"
30 | example: "dataset_mod2.h5ad"
31 | description: Modality 2 dataset.
32 | required: true
33 | - name: "--input_explore_mod1"
34 | type: "file"
35 | example: "dataset_mod1.h5ad"
36 | description: Explore version of the modality 1 dataset.
37 | required: true
38 | - name: "--input_explore_mod2"
39 | type: "file"
40 | example: "dataset_mod2.h5ad"
41 | description: Explore version of the modality 2 dataset.
42 | required: true
43 | - name: "--tf_seed"
44 | type: "integer"
45 | default: 46
46 | description: ...
47 | - name: "--np_seed"
48 | type: "integer"
49 | default: 56
50 | description: ...
51 |
52 | # required outputs
53 | - name: "--output_pretrain"
54 | type: "file"
55 | direction: "output"
56 | example: "pretrain_model"
57 | description: Path to the directory containing a pretrained model.
58 | required: true
59 |
60 | # files your script needs
61 | resources:
62 | - type: python_script
63 | path: script.py
64 | - path: '../resources/utils.py'
65 |
66 | # target platforms
67 | platforms:
68 | - type: docker
69 | image: tensorflow/tensorflow:latest-gpu
70 | run_args: [ "--gpus all" ]
71 | setup:
72 | - type: python
73 | packages:
74 | - anndata
75 | - umap-learn
76 | - scanpy
77 | - type: nextflow
78 | labels: [ vhighmem, vhightime, vhighcpu, gpu ]
79 |
--------------------------------------------------------------------------------
/src/joint_embedding/methods/lsl_ae/run/config.vsh.yaml:
--------------------------------------------------------------------------------
1 | functionality:
2 | name: lsl_ae
3 | namespace: joint_embedding_methods
4 |
5 | # metadata for your method
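6 | description: A two-branch autoencoder for joint embedding. Each modality is passed through dropout and a 64-unit ReLU encoder with an orthogonality penalty on its weights; the two encodings are concatenated and compressed into a 64-dimensional linear bottleneck, which is used as the joint embedding. The network is trained to reconstruct both modalities with a mean-squared-error loss.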
7 | info:
8 | method_label: "LSL_AE"
9 | submission_id: "170825"
10 | team_name: Living-Systems-Lab
11 | # project_url: https://github.com/foo/bar
12 | # publication_doi: 10.1101/0123.45.67.890123
13 | # publication_url: https://arxiv.org/abs/1234.56789
14 |
15 | authors:
16 | - name: Sumeer Khan
17 | email: sumeer.khan@kaust.edu.sa
18 | roles: [ author, maintainer ]
19 | - name: Robert Lehman
20 | email: robert.lehman@kaust.edu.sa
21 | roles: [ author, maintainer ]
22 | - name: Xabier Martinez De Morentin
23 | email: xavier.martinez.demorentin@navarra.es
24 | roles: [ author, maintainer ]
25 | - name: Aidyn Ubingazhibov
26 | email: aidyn.ubingazhibov@nu.edu.kz
27 | roles: [ author, maintainer ]
28 | - name: Minxing Pang
29 | email: minxing.pang@kaust.edu.sa
30 | roles: [ author, maintainer ]
31 |
32 | # parameters
33 | arguments:
34 | # required inputs
35 | - name: "--input_mod1"
36 | type: "file"
37 | example: "dataset_mod1.h5ad"
38 | description: Modality 1 dataset.
39 | required: true
40 | - name: "--input_mod2"
41 | type: "file"
42 | example: "dataset_mod2.h5ad"
43 | description: Modality 2 dataset.
44 | required: true
45 | # required outputs
46 | - name: "--output"
47 | type: "file"
48 | direction: "output"
49 | example: "output.h5ad"
50 | description: Data for all cells in mod1 and mod2 embedded to ≤100 dimensions.
51 | required: true
52 |
53 | # files your script needs
54 | resources:
55 | - type: python_script
56 | path: script.py
57 |
58 | # target platforms
59 | platforms:
60 | - type: docker
61 | image: nvcr.io/nvidia/tensorflow:20.10-tf1-py3
62 | run_args: [ "--gpus all" ]
63 | setup:
64 | - type: python
65 | packages:
66 | - anndata
67 | - umap-learn
68 | - keras
69 | - matplotlib
70 | - scanpy
71 | - scipy
72 | - type: nextflow
73 | labels: [ vhighmem, vvhightime, vhighcpu, gpu ]
74 |
--------------------------------------------------------------------------------
/src/joint_embedding/methods/lsl_ae/run/script.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import anndata as ad
3 | import pandas as pd
4 | from tensorflow.keras.layers import Input, Dense, Dropout
5 | from tensorflow.keras.layers import concatenate
6 | from tensorflow.keras.models import Model
7 | from tensorflow.keras.callbacks import EarlyStopping
8 | from tensorflow import keras
9 | import warnings
10 | warnings.filterwarnings('ignore')
11 | import scanpy as sc
12 | #from keras import backend as K
13 | from tensorflow.keras.constraints import Constraint
14 | import tensorflow.keras.backend as K
15 |
16 | from tensorflow.keras.optimizers import Adam
17 | from tensorflow.keras.models import Model
18 | import warnings
19 | from numpy.random import seed
20 | seed(1)
21 | import tensorflow as tf
22 | tf.compat.v1.random.set_random_seed(2)
23 |
24 |
25 | print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
26 |
27 |
28 | warnings.filterwarnings('ignore')
29 |
30 | ## VIASH START
31 | dataset_path = "output/datasets/joint_embedding/openproblems_bmmc_cite_phase2/openproblems_bmmc_cite_phase2.censor_dataset.output_"
32 |
33 | par = {
34 | 'input_mod1': f'{dataset_path}mod1.h5ad',
35 | 'input_mod2': f'{dataset_path}mod2.h5ad',
36 | 'output': 'output.h5ad'
37 | }
38 | meta = {
39 | 'resources_dir': '.',
40 | 'functionality_name': 'submission_170795'
41 | }
42 | ## VIASH END
43 |
44 |
# Load both modalities from the paths supplied through `par`.
logging.info('Reading `h5ad` files...')
ad_mod1, ad_mod2 = (ad.read_h5ad(par[key]) for key in ('input_mod1', 'input_mod2'))

# Feature selection.
# Modality 1: keep only the batch-aware highly variable features
# (`subset=True` already drops the other columns in place).
sc.pp.highly_variable_genes(ad_mod1, batch_key='batch', subset=True)
# Modality 2: drop features detected in fewer than 3% of the cells.
min_cells = int(ad_mod2.shape[0] * 0.03)
sc.pp.filter_genes(ad_mod2, min_cells=min_cells)

hvg_mask = ad_mod1.var.highly_variable
ad_mod_1 = ad_mod1[:, hvg_mask]

# Dense matrices fed to the autoencoder (`.toarray()` assumes the `.X`
# matrices are stored as scipy sparse matrices).
scRNAseq1, scRNAseq2 = ad_mod_1.X.toarray(), ad_mod2.X.toarray()
59 |
60 |
class WeightsOrthogonalityConstraint(Constraint):
    """Callable that scores how far a weight matrix is from being orthonormal.

    Calling an instance with a weight tensor ``w`` returns a scalar:

    * ``encoding_dim > 1``: ``weightage * sqrt(sum((w^T w - I)^2))``, i.e. the
      Frobenius norm of the Gram-matrix residual, with ``I`` the
      ``encoding_dim`` x ``encoding_dim`` identity.
    * otherwise: ``sum(w^2) - 1`` (note: not multiplied by ``weightage``).

    With ``axis == 1`` the weight tensor is transposed before either formula is
    applied.
    """

    def __init__(self, encoding_dim, weightage = 1.0, axis = 0):
        self.axis = axis
        self.weightage = weightage
        self.encoding_dim = encoding_dim

    def weights_orthogonality(self, w):
        """Return the scalar orthogonality score for weight tensor ``w``."""
        kernel = K.transpose(w) if self.axis == 1 else w

        # Degenerate (single-unit) case: only the squared norm is compared to 1.
        if self.encoding_dim <= 1:
            return K.sum(kernel ** 2) - 1.

        gram_residual = K.dot(K.transpose(kernel), kernel) - K.eye(self.encoding_dim)
        return self.weightage * K.sqrt(K.sum(K.square(gram_residual)))

    def __call__(self, w):
        return self.weights_orthogonality(w)
79 |
80 |
# ---------------------------------------------------------------------------
# Network definition
# ---------------------------------------------------------------------------
# One input head per modality, sized from the dense matrices prepared above.
ncol_scRNAseq1 = scRNAseq1.shape[1]
ncol_scRNAseq2 = scRNAseq2.shape[1]
input_dim_scRNAseq1 = Input(shape=(ncol_scRNAseq1,), name="scRNAseq1")
input_dim_scRNAseq2 = Input(shape=(ncol_scRNAseq2,), name="scRNAseq2")

encoding_dim_scRNAseq1 = 64
encoding_dim_scRNAseq2 = 64

# Layers are instantiated in the order: dropouts, encoders, merge, bottleneck,
# inverse merge, decoders. Keep this order when editing so that seeded weight
# initialisation is not disturbed.
dropout_scRNAseq1 = Dropout(0.1, name="Dropout_scRNAseq1")(input_dim_scRNAseq1)
dropout_scRNAseq2 = Dropout(0.1, name="Dropout_scRNAseq2")(input_dim_scRNAseq2)

# Per-modality encoders; the orthogonality score of each kernel is added to
# the training loss through `kernel_regularizer`.
encoded_scRNAseq1 = Dense(
    encoding_dim_scRNAseq1,
    activation='relu',
    name="Encoder_scRNAseq1",
    use_bias=True,
    kernel_regularizer=WeightsOrthogonalityConstraint(64, weightage=1., axis=0),
)(dropout_scRNAseq1)
encoded_scRNAseq2 = Dense(
    encoding_dim_scRNAseq2,
    activation='relu',
    name="Encoder_scRNAseq2",
    use_bias=True,
    kernel_regularizer=WeightsOrthogonalityConstraint(64, weightage=1., axis=0),
)(dropout_scRNAseq2)

merged_code = concatenate([encoded_scRNAseq1, encoded_scRNAseq2])

# Shared linear bottleneck: its activations are the joint embedding.
bottleneck = Dense(
    64,
    kernel_initializer='uniform',
    activation='linear',
    name="Bottleneck",
)(merged_code)

merge_inverse = Dense(
    encoding_dim_scRNAseq1 + encoding_dim_scRNAseq2,
    activation='relu',
    name="Concatenate_Inverse",
)(bottleneck)

# One reconstruction head per modality.
decoded_scRNAseq1 = Dense(ncol_scRNAseq1, activation='relu', name="Decoder_scRNAseq1")(merge_inverse)
decoded_scRNAseq2 = Dense(ncol_scRNAseq2, activation='relu', name="Decoder_scRNAseq2")(merge_inverse)

model_inputs = [input_dim_scRNAseq1, input_dim_scRNAseq2]
autoencoder = Model(model_inputs, [decoded_scRNAseq1, decoded_scRNAseq2])

# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------
optimizer = Adam(lr=0.0001)
reconstruction_losses = {
    'Decoder_scRNAseq1': 'mean_squared_error',
    'Decoder_scRNAseq2': 'mean_squared_error',
}
autoencoder.compile(optimizer=optimizer, loss=reconstruction_losses)
autoencoder.summary()

# Stop once the validation loss has not improved for 20 consecutive epochs.
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=20)

# The autoencoder reconstructs its own inputs; 20% of the rows are held out
# for validation.
training_data = [scRNAseq1, scRNAseq2]
history = autoencoder.fit(
    training_data,
    [scRNAseq1, scRNAseq2],
    epochs=600,
    batch_size=32,
    validation_split=0.2,
    shuffle=True,
    verbose=1,
    callbacks=[early_stop],
)

# ---------------------------------------------------------------------------
# Embedding extraction and output
# ---------------------------------------------------------------------------
# Re-use the trained graph up to the bottleneck layer as the encoder.
encoder = Model(model_inputs, bottleneck)
bottleneck_representation = encoder.predict([scRNAseq1, scRNAseq2])

logging.info('Storing output to file')
adata = ad.AnnData(
    X=pd.DataFrame(bottleneck_representation).values,
    obs=ad_mod1.obs,
    uns={
        'dataset_id': ad_mod1.uns['dataset_id'],
        'method_id': meta['functionality_name'],
    },
)
adata.write_h5ad(par['output'], compression="gzip")
135 |
--------------------------------------------------------------------------------
/src/joint_embedding/methods/lsl_ae/test.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# End-to-end check for the `lsl_ae` joint-embedding method.
#
# Runs the built `lsl_ae` component on the CITE and MULTIOME phase-2 datasets,
# then launches the evaluation workflow of the pipeline repository via
# Nextflow and prints the resulting score JSON.
#
# It expects to be started from the repository root: every path below is
# relative (target/docker/..., output/..., bin/nextflow, src/resources/...).

export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash"
export NXF_VER=21.04.1
export PIPELINE_VERSION=1.4.0
# Must equal `functionality.name` in run/config.vsh.yaml (`lsl_ae`): the
# executable path target/docker/<namespace>/<name>/<name> below is derived from
# it. The former value `submission_170825` is the submission id, not the
# component name, so the path did not match the component declared there.
method_id=lsl_ae
task_id=joint_embedding

# CITE
dataset_id=openproblems_bmmc_cite_phase2
dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
pred_path=output/predictions/$task_id/$dataset_id/$dataset_id

target/docker/${task_id}_methods/${method_id}/${method_id} \
  --input_mod1 ${dataset_path}.output_mod1.h5ad \
  --input_mod2 ${dataset_path}.output_mod2.h5ad \
  --output ${pred_path}.${method_id}.output.h5ad

# MULTIOME
dataset_id=openproblems_bmmc_multiome_phase2
dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
pred_path=output/predictions/$task_id/$dataset_id/$dataset_id

target/docker/${task_id}_methods/${method_id}/${method_id} \
  --input_mod1 ${dataset_path}.output_mod1.h5ad \
  --input_mod2 ${dataset_path}.output_mod2.h5ad \
  --output ${pred_path}.${method_id}.output.h5ad


# RUN EVALUATION
# The glob picks up every prediction file written above for this method.
bin/nextflow run "$PIPELINE_REPO" \
  -r "$PIPELINE_VERSION" \
  -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \
  --solutionDir "output/datasets/$task_id" \
  --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \
  --publishDir "output/evaluation/$task_id/$method_id/" \
  -latest \
  -resume \
  -c "src/resources/nextflow_moremem.config"

cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json"
--------------------------------------------------------------------------------
/src/match_modality/methods/clue/README.md:
--------------------------------------------------------------------------------
1 | # CLUE (Cross-Linked Unified Embedding)
2 |
3 | Team GLUE: Zhi-Jie Cao, Xin-Ming Tu, Chen-Rui Xia
4 |
5 | **CLUE** is a semi-supervised single-cell multi-omics integration model. It employs variational autoencoders to project cells from different modalities into a unified low-dimensional embedding space, where modality matching can be performed. Specifically, we model data in each modality as generated from a modality-specific subspace of the complete cell embedding. Through a matrix of cross-encoders, CLUE projects cells in each modality into all modality-specific subspaces, which are then concatenated to build a comprehensive embedding, allowing the model to capture both shared and modality-specific information.
6 |
7 |
8 |
9 | **General architecture of CLUE ⤴️**
10 |
11 | > CLUE is implemented as part of the `scglue` Python package. A pre-release containing the CLUE model is available as `resources/scglue-0.1.1-py3-none-any.whl`. A formal release will be made available later on PyPI and Anaconda. Stay tuned at [https://github.com/gao-lab/GLUE](https://github.com/gao-lab/GLUE).
12 |
--------------------------------------------------------------------------------
/src/match_modality/methods/clue/clue_architecture.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/src/match_modality/methods/clue/clue_architecture.jpg
--------------------------------------------------------------------------------
/src/match_modality/methods/clue/resources/scglue-0.1.1-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/src/match_modality/methods/clue/resources/scglue-0.1.1-py3-none-any.whl
--------------------------------------------------------------------------------
/src/match_modality/methods/clue/run/config.vsh.yaml:
--------------------------------------------------------------------------------
1 | functionality:
2 | name: clue
3 | namespace: match_modality_methods
4 |
5 | # metadata for your method
6 |
7 | description: Cross-linked unified embedding for single-cell multi-omics data integration
8 | info:
9 | method_label: "CLUE"
10 | submission_id: "169959"
11 | team_name: GLUE
12 | project_url: https://github.com/gao-lab/GLUE
13 | # publication_doi: 10.1101/2021.08.22.457275
14 | # publication_url: https://arxiv.org/abs/1234.56789
15 |
16 | authors:
17 | - name: Zhi-Jie Cao
18 | email: caozj@mail.cbi.pku.edu.cn
19 | roles: [ author, maintainer ]
20 | props: { github: Jeff1995, orcid: "0000-0002-0026-671X" }
21 | - name: Xin-Ming Tu
22 | email: xinmingtu@pku.edu.cn
23 | roles: [ author, maintainer ]
24 | props: { github: XinmingTu }
25 | - name: Chen-Rui Xia
26 | email: xiachenrui@mail.cbi.pku.edu.cn
27 | roles: [ author, maintainer ]
28 | props: { github: xiachenrui }
29 |
30 | # parameters
31 | arguments:
32 | # required inputs
33 | - name: "--input_train_mod1"
34 | type: "file"
35 | example: "dataset_censored.h5ad"
36 | description: "The censored shuffled train mod1 profiles."
37 | required: true
38 | - name: "--input_train_mod2"
39 | type: "file"
40 | example: "dataset_censored.h5ad"
41 | description: "The censored shuffled train mod2 profiles."
42 | required: true
43 | - name: "--input_train_sol"
44 | type: "file"
45 | example: "dataset_solution.h5ad"
46 | description: "The pairing of train mod1&mod2 profiles."
47 | required: true
48 | - name: "--input_test_mod1"
49 | type: "file"
50 | example: "dataset_censored.h5ad"
51 | description: "The censored shuffled test mod1 profiles."
52 | required: true
53 | - name: "--input_test_mod2"
54 | type: "file"
55 | example: "dataset_censored.h5ad"
56 | description: "The censored shuffled test mod2 profiles."
57 | required: true
58 | - name: "--input_pretrain"
59 | type: "file"
60 | example: "pretrain_model"
61 | description: Path to the directory containing a pretrained model.
62 | required: true
63 |
64 | # required outputs
65 | - name: "--output"
66 | type: "file"
67 | direction: "output"
68 | example: "output.h5ad"
69 | description: "The predicted pairing of test mod1&mod2 profiles."
70 | required: true
71 |
72 | # files your script needs
73 | resources:
74 | - type: python_script
75 | path: script.py
76 | - path: ../resources/utils.py
77 | - path: ../resources/scglue-0.1.1-py3-none-any.whl
78 |
79 | # target platforms
80 | platforms:
81 | - type: docker
82 | image: nvidia/cuda:11.0-cudnn8-runtime-ubuntu18.04
83 | setup:
84 | - type: apt
85 | packages:
86 | - python3-pip
87 | - python3.8-dev
88 | - type: docker
89 | run:
90 | - update-alternatives --install /usr/bin/python python /usr/bin/python3.8 10
91 | - update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 10
92 | - python -m pip install --upgrade pip
93 | - pip install scglue-0.1.1-py3-none-any.whl
94 | - pip install pyyaml
95 | resources:
96 | - scglue-0.1.1-py3-none-any.whl scglue-0.1.1-py3-none-any.whl
97 |
98 | - type: nextflow
99 | labels: [ highmem, hightime, highcpu, gpu ]
--------------------------------------------------------------------------------
/src/match_modality/methods/clue/test.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# End-to-end check for the `clue` match-modality method.
#
# Four phase-2 datasets are processed in order. A model is trained on each
# `*_rna` dataset and reused for the matching `*_mod2` dataset, which therefore
# has no training step of its own. Afterwards the evaluation workflow of the
# pipeline repository is run via Nextflow and the score JSON is printed.

export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash"
export NXF_VER=21.04.1
export PIPELINE_VERSION=1.4.0
method_id=clue
task_id=match_modality

# Point dataset_id / dataset_path / pred_path at dataset "$1".
# pretrain_path is deliberately left untouched (see use_own_pretrain).
select_dataset() {
  dataset_id=$1
  dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
  pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
}

# Point pretrain_path at the model directory belonging to the current dataset.
use_own_pretrain() {
  pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
}

# Train on the current dataset and write the model to pretrain_path.
train_model() {
  target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
    --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
    --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
    --input_train_sol ${dataset_path}.output_train_sol.h5ad \
    --output_pretrain ${pretrain_path}
}

# Predict the test pairing for the current dataset with the model found at
# pretrain_path.
predict_pairing() {
  target/docker/${task_id}_methods/${method_id}/${method_id} \
    --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
    --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
    --input_train_sol ${dataset_path}.output_train_sol.h5ad \
    --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
    --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \
    --input_pretrain ${pretrain_path} \
    --output ${pred_path}.${method_id}.output.h5ad
}

# CITE GEX2ADT
select_dataset openproblems_bmmc_cite_phase2_rna
use_own_pretrain
train_model
predict_pairing

# CITE ADT2GEX
# can reuse same pretrain: no use_own_pretrain / train_model here, so
# pretrain_path still refers to the CITE GEX2ADT model.
select_dataset openproblems_bmmc_cite_phase2_mod2
predict_pairing

# MULTIOME GEX2ATAC
select_dataset openproblems_bmmc_multiome_phase2_rna
use_own_pretrain
train_model
predict_pairing

# MULTIOME ATAC2GEX
# can reuse same pretrain: pretrain_path still refers to the MULTIOME
# GEX2ATAC model.
select_dataset openproblems_bmmc_multiome_phase2_mod2
predict_pairing

# RUN EVALUATION
bin/nextflow run "$PIPELINE_REPO" \
  -r "$PIPELINE_VERSION" \
  -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \
  --solutionDir "output/datasets/$task_id" \
  --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \
  --publishDir "output/evaluation/$task_id/$method_id/" \
  -latest \
  -resume \
  -c "src/resources/nextflow_moremem.config"

cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json"
--------------------------------------------------------------------------------
/src/match_modality/methods/clue/train/config.vsh.yaml:
--------------------------------------------------------------------------------
1 | functionality:
2 | name: clue_train
3 | namespace: match_modality_methods
4 |
5 | # metadata for your method
6 | description: Cross-linked unified embedding for single-cell multi-omics data integration
7 |
8 | info:
9 | submission_id: "169959"
10 | team_name: GLUE
11 | # project_url: https://github.com/foo/bar
12 | # publication_doi: 10.1101/0123.45.67.890123
13 | # publication_url: https://arxiv.org/abs/1234.56789
14 |
15 | authors:
16 | - name: Zhi-Jie Cao
17 | email: caozj@mail.cbi.pku.edu.cn
18 | roles: [ author, maintainer ]
19 | props: { github: Jeff1995, orcid: "0000-0002-0026-671X" }
20 | - name: Xin-Ming Tu
21 | email: xinmingtu@pku.edu.cn
22 | roles: [ author, maintainer ]
23 | props: { github: XinmingTu }
24 | - name: Chen-Rui Xia
25 | email: xiachenrui@mail.cbi.pku.edu.cn
26 | roles: [ author, maintainer ]
27 | props: { github: xiachenrui }
28 |
29 | # parameters
30 | arguments:
31 | # required inputs
32 | - name: "--input_train_mod1"
33 | type: "file"
34 | example: "dataset_censored.h5ad"
35 | description: "The censored shuffled train mod1 profiles."
36 | required: true
37 | - name: "--input_train_mod2"
38 | type: "file"
39 | example: "dataset_censored.h5ad"
40 | description: "The censored shuffled train mod2 profiles."
41 | required: true
42 | - name: "--input_train_sol"
43 | type: "file"
44 | example: "dataset_solution.h5ad"
45 | description: "The pairing of train mod1&mod2 profiles."
46 | required: true
47 |
48 | # required outputs
49 | - name: "--output_pretrain"
50 | type: "file"
51 | example: "pretrain_model"
52 | direction: "output"
53 | description: Path to the directory containing a pretrained model.
54 | required: true
55 |
56 | # files your script needs
57 | resources:
58 | - type: python_script
59 | path: script.py
60 | - path: ../resources/utils.py
61 | - path: ../resources/scglue-0.1.1-py3-none-any.whl
62 |
63 | # target platforms
64 | platforms:
65 | - type: docker
66 | image: nvidia/cuda:11.0-cudnn8-runtime-ubuntu18.04
67 | run_args: [ "--gpus all" ]
68 | setup:
69 | - type: apt
70 | packages:
71 | - python3-pip
72 | - python3.8-dev
73 | - type: docker
74 | run:
75 | - update-alternatives --install /usr/bin/python python /usr/bin/python3.8 10
76 | - update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 10
77 | - python -m pip install --upgrade pip
78 | - pip install scglue-0.1.1-py3-none-any.whl
79 | - pip install pyyaml scikit-misc
80 | resources:
81 | - scglue-0.1.1-py3-none-any.whl scglue-0.1.1-py3-none-any.whl
82 |
83 | - type: nextflow
84 | labels: [ highmem, hightime, highcpu, gpu ]
--------------------------------------------------------------------------------
/src/match_modality/methods/novel/README.md:
--------------------------------------------------------------------------------
1 | # NeurIPS-Single-Cell-MultiModality
2 |
3 | Team Novel: Gleb Ryazantsev, Nikolay Russkikh, Igor I
4 |
5 | The approach utilizes sample representations, learned in the same way as in the CLIP model. Encoders for all of the modalities are fully connected; the dimensionality of GEX and ATAC data is reduced via an LSI transform (ADT is left as-is). Then, to obtain sample pairings, a maximum weight matching on a bipartite graph is performed, where weights are cosine similarities between sample embeddings.
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/src/match_modality/methods/novel/novel_architecture1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/src/match_modality/methods/novel/novel_architecture1.png
--------------------------------------------------------------------------------
/src/match_modality/methods/novel/novel_architecture2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/src/match_modality/methods/novel/novel_architecture2.png
--------------------------------------------------------------------------------
/src/match_modality/methods/novel/resources/catalyst_tools.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from catalyst import runners, metrics
4 | from models import symmetric_npair_loss
5 |
6 |
7 | import numpy as np
8 | import torch.nn.functional as F
9 | from tqdm.notebook import tqdm
10 |
11 | from networkx.algorithms import bipartite
12 | from scipy import sparse
13 |
14 |
15 |
class scRNARunner(runners.Runner):
    """Runner whose batch step feeds two feature tensors through the model.

    Each batch is expected to be a mapping with the keys ``'features_first'``
    and ``'features_second'``.
    """

    def handle_batch(self, batch):
        """Run the model on one batch, record metrics and expose the results.

        The model is called with both feature tensors and must return
        ``(logits, embeddings_first, embeddings_second)``. Targets are the
        indices ``0 .. n-1`` (``n`` = number of logit rows), placed on the
        logits' device, and are passed with the logits to
        ``symmetric_npair_loss``. The loss and the current temperature
        (``exp(logit_scale)``) are written to ``self.batch_metrics``; inputs
        and outputs are published through ``self.batch``, ``self.input`` and
        ``self.output``.
        """
        first_features = batch['features_first']
        second_features = batch['features_second']

        scores, first_embeddings, second_embeddings = self.model(first_features, second_features)
        pair_targets = torch.arange(scores.shape[0]).to(scores.device)
        pair_loss = symmetric_npair_loss(scores, pair_targets)

        # Plain Python float, so it can be logged without holding a tensor.
        temperature_value = self.model.logit_scale.exp().item()

        self.batch_metrics.update({"loss": pair_loss, "T": temperature_value})

        self.batch = dict(
            features_first=first_features,
            features_second=second_features,
            embeddings_first=first_embeddings,
            embeddings_second=second_embeddings,
            scores=scores,
            targets=pair_targets,
            temperature=temperature_value,
        )
        self.input = dict(
            features_first=first_features,
            features_second=second_features,
        )
        self.output = dict(
            scores=scores,
            embeddings_first=first_embeddings,
            embeddings_second=second_embeddings,
        )
48 |
class CustomMetric(metrics.ICallbackLoaderMetric):
    """Loader-level top-1 matching accuracy between two sets of embeddings.

    Per-batch embeddings are accumulated through :meth:`update`. Once the
    loader is exhausted, :meth:`compute_key_value` builds the full
    first-by-second score matrix and reports how often the arg-max of each
    row (``forward_acc``) and of each column (``backward_acc``) falls on the
    diagonal, together with their mean (``avg_acc``).
    """

    def __init__(self, compute_on_call: bool = True, prefix: str = None, suffix: str = None):
        """Init.

        Args:
            compute_on_call: forwarded unchanged to the base-class constructor.
            prefix: stored on the instance (``""`` when ``None``).
            suffix: stored on the instance (``""`` when ``None``).
        """
        super().__init__(compute_on_call=compute_on_call)
        self.prefix = prefix or ""
        self.suffix = suffix or ""
        self.embeddings_list_first = []
        self.embeddings_list_second = []

    def reset(self, num_batches: int, num_samples: int) -> None:
        """Drop everything accumulated so far and release cached CUDA memory."""
        self.embeddings_list_first = []
        self.embeddings_list_second = []
        torch.cuda.empty_cache()

    def update(self, *args, **kwargs) -> None:
        """Accumulate one batch.

        Reads ``embeddings_first``, ``embeddings_second`` and ``temperature``
        from ``kwargs``; the first embeddings are scaled by the temperature.
        """
        embeddings_first = kwargs['embeddings_first']
        embeddings_second = kwargs['embeddings_second']
        temperature = kwargs['temperature']
        # Detach and move to host memory right away. Keeping the live device
        # tensors in the lists would hold device memory (and any autograd
        # history attached to them) for every batch until the loader ends.
        self.embeddings_list_first.append((temperature * embeddings_first).detach().cpu())
        self.embeddings_list_second.append(embeddings_second.detach().cpu())

    def compute(self):
        """Not supported; use :meth:`compute_key_value`."""
        raise NotImplementedError('This method is not supported')

    def compute_key_value(self):
        """Return ``forward_acc``, ``backward_acc`` and ``avg_acc`` for the loader."""
        # Entries are already detached CPU tensors (see ``update``).
        all_embeddings_first = torch.cat(self.embeddings_list_first)
        all_embeddings_second = torch.cat(self.embeddings_list_second)
        logits = all_embeddings_first @ all_embeddings_second.T
        # The i-th first embedding is paired with the i-th second embedding.
        labels = torch.arange(logits.shape[0])

        # Free the concatenated copies before the arg-max passes.
        del all_embeddings_first
        del all_embeddings_second

        forward_accuracy = (torch.argmax(logits, dim=1) == labels).float().mean().item()
        backward_accuracy = (torch.argmax(logits, dim=0) == labels).float().mean().item()
        del logits

        avg_accuracy = 0.5 * (forward_accuracy + backward_accuracy)

        loader_metrics = {
            'forward_acc': forward_accuracy,
            'backward_acc': backward_accuracy,
            'avg_acc': avg_accuracy,
        }
        return loader_metrics
--------------------------------------------------------------------------------
/src/match_modality/methods/novel/resources/config_ADT2GEX.py:
--------------------------------------------------------------------------------
# Hyper-parameter constants for the ADT2GEX setting (per the file name) of the
# `novel` match_modality method.
# NOTE(review): the notes below are inferred from the constant names only; the
# code that consumes these values is not part of this file -- confirm there.

# Optimizer settings (presumably learning rate, optimizer name, weight decay).
LR = 7.79984e-05
OPTIM = 'AdamW'
weight_decay=0

# presumably the size of the embedding produced by both encoders -- TODO confirm
EMBEDDING_DIM = 64

# One dropout rate per entry; *_FIRST vs *_GEX presumably select the
# first-modality encoder and the GEX encoder respectively.
DROPOUT_RATES_FIRST = [0.0221735, 0.296919]
DROPOUT_RATES_GEX = [0.0107121,0.254689]

# Hidden layer widths, in order.
LAYERS_DIM_FIRST = [512, 2048]
LAYERS_DIM_GEX = [1024, 512]

# presumably the initial log-temperature (logit scale) -- TODO confirm
LOG_T = 3.463735

# NOTE(review): only the GEX side has an LSI component count in this file.
N_LSI_COMPONENTS_GEX = 128
N_EPOCHS = 7000

BATCH_SIZE = 2048

# presumably swap-noise probabilities per modality; 0. looks like "disabled"
SWAP_RATE_FIRST = 0.
SWAP_RATE_GEX = 0.
22 |
--------------------------------------------------------------------------------
/src/match_modality/methods/novel/resources/config_ATAC2GEX.py:
--------------------------------------------------------------------------------
# Hyper-parameter constants for the ATAC2GEX setting (per the file name) of the
# `novel` match_modality method.
# NOTE(review): the notes below are inferred from the constant names only; the
# code that consumes these values is not part of this file -- confirm there.
# NOTE(review): unlike config_ADT2GEX.py, this file defines no SWAP_RATE_FIRST /
# SWAP_RATE_GEX; verify the consumer does not read them for this setting.

#optimizer
LR = 0.000585
OPTIM = 'AdamW'
weight_decay=0
# presumably the size of the embedding produced by both encoders -- TODO confirm
EMBEDDING_DIM = 256

# One dropout rate per entry; *_FIRST vs *_GEX presumably select the
# first-modality encoder and the GEX encoder respectively.
DROPOUT_RATES_FIRST = [0.661]
DROPOUT_RATES_GEX = [ 0.541, 0.396]

# Hidden layer widths, in order.
LAYERS_DIM_FIRST = [2048]
LAYERS_DIM_GEX = [1024, 1024]

# presumably the initial log-temperature (logit scale) -- TODO confirm
LOG_T = 3.065016


# LSI component counts for the first modality and for GEX.
N_LSI_COMPONENTS_FIRST= 512
N_LSI_COMPONENTS_GEX = 64

N_EPOCHS = 7000

BATCH_SIZE = 16384
--------------------------------------------------------------------------------
/src/match_modality/methods/novel/resources/data.py:
--------------------------------------------------------------------------------
1 | from torch.utils.data import Dataset,DataLoader
2 |
class ModalityMatchingDataset(Dataset):
    """Row-aligned pair of modality tables exposed as a torch ``Dataset``.

    Both constructor arguments must expose ``.values``; only those underlying
    arrays are kept. Item ``i`` pairs row ``i`` of the first table with
    row ``i`` of the second.
    """

    def __init__(self, df_modality1, df_modality2):
        super().__init__()
        self.df_modality1, self.df_modality2 = df_modality1.values, df_modality2.values

    def __len__(self):
        # The first modality defines the dataset length.
        n_rows = self.df_modality1.shape[0]
        return n_rows

    def __getitem__(self, index: int):
        return dict(
            features_first=self.df_modality1[index],
            features_second=self.df_modality2[index],
        )
20 |
def get_dataloaders(mod1_train, mod2_train, sol_train,
                    mod1_test, mod2_test, sol_test, NUM_WORKERS, BATCH_SIZE):
    """Build the train and test ``DataLoader`` objects for paired modalities.

    For each split, the rows of the second modality are reordered with the
    per-row arg-max of the solution table, so that row ``i`` of modality 2
    lines up with row ``i`` of modality 1.

    Returns:
        ``(data_train, data_test)``; the train loader shuffles, the test
        loader keeps the row order.
    """
    # Column index of the largest entry in each solution row gives the
    # matching modality-2 row.
    train_order = sol_train.values.argmax(1)
    test_order = sol_test.values.argmax(1)
    aligned_mod2_train = mod2_train.iloc[train_order]
    aligned_mod2_test = mod2_test.iloc[test_order]

    loader_train = DataLoader(
        ModalityMatchingDataset(mod1_train, aligned_mod2_train),
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=NUM_WORKERS,
    )
    loader_test = DataLoader(
        ModalityMatchingDataset(mod1_test, aligned_mod2_test),
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=NUM_WORKERS,
    )
    return loader_train, loader_test
34 |
35 |
--------------------------------------------------------------------------------
/src/match_modality/methods/novel/resources/models.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.utils.data import Dataset,DataLoader
3 | import torch.nn.functional as F
4 | from torch import nn
5 |
6 |
class BatchSwapNoise(nn.Module):
    """Swap Noise module.

    In training mode each element of ``x`` is, with probability ``p``,
    replaced by the element at the same column position taken from a randomly
    shifted row of the same batch. In eval mode the input is returned
    untouched.
    """

    def __init__(self, p):
        """Store the per-element swap probability ``p``."""
        super().__init__()
        self.p = p

    def forward(self, x):
        """Apply swap noise to ``x`` (indexed as ``(rows, columns)``).

        All helper tensors are created on ``x.device`` so that no index tensor
        has to be built on the host and transferred on every call, and the
        input is flattened with ``reshape`` so non-contiguous inputs work too.
        """
        if not self.training:
            return x

        n_rows, n_cols = x.size(0), x.size(1)
        n_total = x.nelement()
        device = x.device

        # Which elements get swapped (drawn first, as in the original order).
        swap_mask = torch.rand(x.size(), device=device) > (1 - self.p)
        # Random row shift in [0, n_rows) for every element.
        row_shift = torch.floor(torch.rand(x.size(), device=device) * n_rows).long()

        # Shifting a flat index by a multiple of n_cols keeps the column.
        flat_idx = torch.arange(n_total, device=device) + \
            (row_shift * (swap_mask.long() * n_cols)).view(-1)
        # Wrap indices that ran past the end back to the start.
        flat_idx = torch.where(flat_idx >= n_total, flat_idx - n_total, flat_idx)

        return x.reshape(-1)[flat_idx].view(x.size())
23 |
24 |
25 | class Encoder(nn.Module):
26 | def __init__(self, n_input, embedding_size, dropout_rates, dims_layers, swap_noise_ratio):
27 | super(Encoder, self).__init__()
28 | dropout = []
29 | layers = []
30 | layers.append(nn.Linear(n_input, dims_layers[0]))
31 |
32 | for i in range(len(dims_layers)-1):
33 | layers.append(nn.Linear(dims_layers[i], dims_layers[i+1]))
34 | for i in range(len(dropout_rates)):
35 | dropout.append(nn.Dropout(p=dropout_rates[i]))
36 |
37 | layers.append(nn.Linear(dims_layers[-1], embedding_size))
38 |
39 | self.fc_list = nn.ModuleList(layers)
40 | self.dropout_list = nn.ModuleList(dropout)
41 |
42 | def forward(self, x):
43 | for i in range(len(self.fc_list)-1):
44 | x = F.elu(self.fc_list[i](x))
45 | if(i None:
108 | r"""
109 | LSI analysis (following the Seurat v3 approach)
110 | Parameters
111 | ----------
112 | adata
113 | Input dataset
114 | n_components
115 | Number of dimensions to use
116 | use_highly_variable
117 | Whether to use highly variable features only, stored in
118 | ``adata.var['highly_variable']``. By default uses them if they
119 | have been determined beforehand.
120 | **kwargs
121 | Additional keyword arguments are passed to
122 | :func:`sklearn.utils.extmath.randomized_svd`
123 | """
124 | if use_highly_variable is None:
125 | use_highly_variable = "highly_variable" in adata.var
126 | adata_use = adata[:, adata.var["highly_variable"]] if use_highly_variable else adata
127 | X = tfidf(adata_use.X)
128 | X_norm = sklearn.preprocessing.Normalizer(norm="l1").fit_transform(X)
129 | X_norm = np.log1p(X_norm * 1e4)
130 | X_lsi = sklearn.utils.extmath.randomized_svd(X_norm, n_components, random_state=777, **kwargs)[0]
131 | X_lsi -= X_lsi.mean(axis=1, keepdims=True)
132 | X_lsi /= X_lsi.std(axis=1, ddof=1, keepdims=True)
133 | adata.obsm["X_lsi"] = X_lsi
--------------------------------------------------------------------------------
/src/match_modality/methods/novel/run/config.vsh.yaml:
--------------------------------------------------------------------------------
1 | functionality:
2 | name: novel
3 | namespace: match_modality_methods
4 |
5 | description: The approach utilizes sample representations, learned in the same way as in the CLIP model. Encoders for all of the modalities are fully connected, the dimensionality of GEX and ATAC data is reduced via LSI transform (ADT is left as-is). Then, to obtain sample pairings, a maximum weight matching on a bipartite graph is performed, where weights are cosine similarities between sample embeddings.
6 | info:
7 | method_label: "Novel"
8 | submission_id: "169594/170690"
9 | team_name: Novel
10 |
11 | authors:
12 | - name: Gleb Ryazantsev
13 | email: ryazantsev.gleb@gmail.com
14 | roles: [ author, maintainer ]
15 | - name: Nikolay Russkikh
16 | email: russkikh.nikolay@gmail.com
17 | roles: [ author, maintainer ]
18 | - name: Igor I
19 | email: herri.i.67@gmail.com
20 | roles: [ author, maintainer ]
21 |
22 | # parameters
23 | arguments:
24 | # required inputs
25 | - name: "--input_train_mod1"
26 | type: "file"
27 | example: "dataset_censored.h5ad"
28 | description: "The censored shuffled train mod1 profiles."
29 | required: true
30 | - name: "--input_train_mod2"
31 | type: "file"
32 | example: "dataset_censored.h5ad"
33 | description: "The censored shuffled train mod2 profiles."
34 | required: true
35 | - name: "--input_train_sol"
36 | type: "file"
37 | example: "dataset_solution.h5ad"
38 | description: "The pairing of train mod1&mod2 profiles."
39 | required: true
40 | - name: "--input_test_mod1"
41 | type: "file"
42 | example: "dataset_censored.h5ad"
43 | description: "The censored shuffled test mod1 profiles."
44 | required: true
45 | - name: "--input_test_mod2"
46 | type: "file"
47 | example: "dataset_censored.h5ad"
48 | description: "The censored shuffled test mod2 profiles."
49 | required: true
50 | - name: "--input_pretrain"
51 | type: "file"
52 | example: "pretrain_model"
53 | description: Path to the directory containing a pretrained model.
54 | required: true
55 |
56 | # required outputs
57 | - name: "--output"
58 | type: "file"
59 | direction: "output"
60 | example: "output.h5ad"
61 | description: "The predicted pairing of test mod1&mod2 profiles."
62 | required: true
63 |
64 | # files your script needs
65 | resources:
66 | - type: python_script
67 | path: script.py
68 | - path: ../resources/catalyst_tools.py
69 | - path: ../resources/config_ADT2GEX.py
70 | - path: ../resources/config_ATAC2GEX.py
71 | - path: ../resources/data.py
72 | - path: ../resources/models.py
73 | - path: ../resources/postprocessing.py
74 | - path: ../resources/preprocessing.py
75 |
76 | # target platforms
77 | platforms:
78 | - type: docker
79 | image: "pytorch/pytorch:1.9.0-cuda10.2-cudnn7-runtime"
80 | run_args: [ "--gpus all --shm-size=5G" ]
81 | setup:
82 | - type: python
83 | packages:
84 | - catalyst
85 | - anndata
86 | - scikit-learn
87 | - networkx
88 |
89 | - type: nextflow
90 | labels: [ vhighmem, vvhightime, vhighcpu, gpu]
91 |
--------------------------------------------------------------------------------
/src/match_modality/methods/novel/test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash"
4 | export NXF_VER=21.04.1
5 | export PIPELINE_VERSION=1.4.0
6 | method_id=novel
7 | task_id=match_modality
8 |
9 |
10 | # CITE ADT2GEX
11 | dataset_id=openproblems_bmmc_cite_phase2_mod2
12 | dataset_id_val=openproblems_bmmc_cite_phase2_mod2
13 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
14 | dataset_path_val=output/datasets/$task_id/$dataset_id_val/$dataset_id_val.censor_dataset
15 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
16 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
17 |
18 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
19 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
20 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
21 | --input_train_sol ${dataset_path}.output_train_sol.h5ad \
22 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
23 | --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \
24 | --input_test_sol ${dataset_path}.output_test_sol.h5ad \
25 | --output_pretrain ${pretrain_path}
26 |
27 | target/docker/${task_id}_methods/${method_id}/${method_id} \
28 | --input_train_mod1 ${dataset_path_val}.output_train_mod1.h5ad \
29 | --input_train_mod2 ${dataset_path_val}.output_train_mod2.h5ad \
30 | --input_train_sol ${dataset_path_val}.output_train_sol.h5ad \
31 | --input_test_mod1 ${dataset_path_val}.output_test_mod1.h5ad \
32 | --input_test_mod2 ${dataset_path_val}.output_test_mod2.h5ad \
33 | --input_pretrain ${pretrain_path} \
34 | --output ${pred_path}.${method_id}.output.h5ad
35 |
36 | #CITE GEX2ADT
37 | dataset_id=openproblems_bmmc_cite_phase2_rna
38 | pretrain_dataset_id=openproblems_bmmc_cite_phase2_mod2
39 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
40 | pretrain_path=output/pretrain/$task_id/$method_id/$pretrain_dataset_id.${method_id}_train.output_pretrain/
41 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
42 |
43 | target/docker/${task_id}_methods/${method_id}/${method_id} \
44 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
45 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
46 | --input_train_sol ${dataset_path}.output_train_sol.h5ad \
47 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
48 | --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \
49 | --input_pretrain ${pretrain_path} \
50 | --output ${pred_path}.${method_id}.output.h5ad
51 |
52 |
53 |
54 | # MULTIOME ATAC2GEX
55 | dataset_id=openproblems_bmmc_multiome_phase2_mod2
56 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
57 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
58 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
59 |
60 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
61 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
62 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
63 | --input_train_sol ${dataset_path}.output_train_sol.h5ad \
64 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
65 | --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \
66 | --input_test_sol ${dataset_path}.output_test_sol.h5ad \
67 | --output_pretrain ${pretrain_path}
68 |
69 | target/docker/${task_id}_methods/${method_id}/${method_id} \
70 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
71 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
72 | --input_train_sol ${dataset_path}.output_train_sol.h5ad \
73 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
74 | --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \
75 | --input_pretrain ${pretrain_path} \
76 | --output ${pred_path}.${method_id}.output.h5ad
77 |
78 | # MULTIOME GEX2ATAC
79 | dataset_id=openproblems_bmmc_multiome_phase2_rna
80 | pretrain_dataset_id=openproblems_bmmc_multiome_phase2_mod2
81 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
82 | pretrain_path=output/pretrain/$task_id/$method_id/$pretrain_dataset_id.${method_id}_train.output_pretrain/
83 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
84 |
85 | target/docker/${task_id}_methods/${method_id}/${method_id} \
86 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
87 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
88 | --input_train_sol ${dataset_path}.output_train_sol.h5ad \
89 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
90 | --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \
91 | --input_pretrain ${pretrain_path} \
92 | --output ${pred_path}.${method_id}.output.h5ad
93 |
94 | # RUN EVALUATION
95 | bin/nextflow run "$PIPELINE_REPO" \
96 | -r "$PIPELINE_VERSION" \
97 | -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \
98 | --solutionDir "output/datasets/$task_id" \
99 | --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \
100 | --publishDir "output/evaluation/$task_id/$method_id/" \
101 | -latest \
102 | -resume \
103 | -c "src/resources/nextflow_moremem.config"
104 |
105 | cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json"
--------------------------------------------------------------------------------
/src/match_modality/methods/novel/train/config.vsh.yaml:
--------------------------------------------------------------------------------
1 | functionality:
2 | name: novel_train
3 | namespace: match_modality_methods
4 |
5 | # metadata for your method
6 |
7 | description: The approach utilizes sample representations, learned in the same way as in the CLIP model. Encoders for all of the modalities are fully connected, the dimensionality of GEX and ATAC data is reduced via LSI transform (ADT is left as-is). Then, to obtain sample pairings, a maximum weight matching on a bipartite graph is performed, where weights are cosine similarities between sample embeddings.
8 |
9 | authors:
10 | - name: Gleb Ryazantsev
11 | email: ryazantsev.gleb@gmail.com
12 | roles: [ author, maintainer ]
13 | - name: Nikolay Russkikh
14 | email: russkikh.nikolay@gmail.com
15 | roles: [ author, maintainer ]
16 | - name: Igor I
17 | email: herri.i.67@gmail.com
18 | roles: [ author, maintainer ]
19 |
20 | # parameters
21 | arguments:
22 | # required inputs
23 | - name: "--input_train_mod1"
24 | type: "file"
25 | example: "dataset_mod1.h5ad"
26 | description: Censored dataset, training cells.
27 | required: true
28 | - name: "--input_train_mod2"
29 | type: "file"
30 | example: "dataset_mod2.h5ad"
31 | description: Censored dataset.
32 | required: true
33 | - name: "--input_train_sol"
34 | type: "file"
35 | example: "dataset_solution.h5ad"
36 | description: "The pairing of train mod1&mod2 profiles."
37 | required: true
38 | - name: "--input_test_mod1"
39 | type: "file"
40 | example: "dataset_test_mod1.h5ad"
41 | description: Censored dataset, test cells.
42 | required: true
43 | - name: "--input_test_mod2"
44 | type: "file"
45 | example: "dataset_test_mod2.h5ad"
46 | description: Censored dataset.
47 | required: true
48 | - name: "--input_test_sol"
49 | type: "file"
50 | example: "dataset_solution.h5ad"
51 | description: "The pairing of test mod1&mod2 profiles."
52 | required: true
53 |
54 | # required outputs
55 | - name: "--output_pretrain"
56 | type: "file"
57 | direction: "output"
58 | example: "pretrain_model"
59 | description: Path to the directory containing a pretrained model.
60 | required: true
61 |
62 | # files your script needs
63 | resources:
64 | - type: python_script
65 | path: script.py
66 | - path: ../resources/catalyst_tools.py
67 | - path: ../resources/config_ADT2GEX.py
68 | - path: ../resources/config_ATAC2GEX.py
69 | - path: ../resources/data.py
70 | - path: ../resources/models.py
71 | - path: ../resources/postprocessing.py
72 | - path: ../resources/preprocessing.py
73 |
74 | # target platforms
75 | platforms:
76 | - type: docker
77 | image: "pytorch/pytorch:1.9.0-cuda10.2-cudnn7-runtime"
78 | run_args: [ "--gpus all --shm-size=5G" ]
79 | setup:
80 | - type: python
81 | packages:
82 | - catalyst
83 | - anndata
84 | - scikit-learn
85 | - networkx
86 |
87 | - type: nextflow
88 | labels: [ vhighmem, vvhightime, vhighcpu, gpu]
89 |
--------------------------------------------------------------------------------
/src/predict_modality/methods/AXX/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
2 |
--------------------------------------------------------------------------------
/src/predict_modality/methods/AXX/README.md:
--------------------------------------------------------------------------------
1 | # NeurIPS-Single-Cell-MultiModality
2 |
3 | Team: [Xueer Chen](https://github.com/xuerchen), [Jiwei Liu](https://github.com/daxiongshu)
4 |
5 | This folder contains our solution to the [OpenProblems-NeurIPS2021 Single-Cell Multimodal Data Integration](https://eval.ai/web/challenges/challenge-page/1111/overview). Our team AXX took the [4th place of the modality prediction task](https://eval.ai/web/challenges/challenge-page/1111/leaderboard/2860) in terms of overall ranking of 4 subtasks: namely `GEX to ADT`, `ADT to GEX`, `GEX to ATAC` and `ATAC to GEX`. Specifically, our methods ranked **3rd** in `GEX to ATAC` and **4th** in `GEX to ADT`. More details about the task can be found in the [competition webpage](https://openproblems.bio/neurips_docs/about_tasks/task1_modality_prediction/).
6 |
7 |
8 |
--------------------------------------------------------------------------------
/src/predict_modality/methods/AXX/resources/const.py:
--------------------------------------------------------------------------------
# Path constants; both default to the current working directory.
# NOTE(review): how these are consumed is not visible in this file -- the names
# suggest an input root and an output root; confirm against the importers.
PATH = '.'
OUT_PATH = '.'
3 |
--------------------------------------------------------------------------------
/src/predict_modality/methods/AXX/resources/models.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import pytorch_lightning as pl
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | from torch.utils.data import TensorDataset,DataLoader
6 |
7 | class MLP(pl.LightningModule):
8 | def __init__(self,in_dim,out_dim,ymean,config):
9 | super(MLP, self).__init__()
10 | self.ymean = ymean.cuda()
11 | H1 = config.H1
12 | H2 = config.H2
13 | p = config.dropout
14 | self.config = config
15 | self.fc1 = nn.Linear(in_dim, H1)
16 | self.fc2 = nn.Linear(H1,H2)
17 | self.fc3 = nn.Linear(H1+H2, out_dim)
18 | self.dp2 = nn.Dropout(p=p)
19 |
20 | def forward(self, x):
21 | x0 = x
22 | x1 = F.relu(self.fc1(x))
23 | x1 = self.dp2(x1)
24 | x = F.relu(self.fc2(x1))
25 | x = torch.cat([x,x1],dim=1)
26 | x = self.fc3(x)
27 | x = self.apply_mask(x)
28 | return x
29 |
30 | def apply_mask(self,yp):
31 | tmp = torch.ones_like(yp).float()*self.ymean
32 | mask = tmp Running method")
29 | out = subprocess.check_output([
30 | command,
31 | "--input_train_mod1", testpar['input_train_mod1'],
32 | "--input_train_mod2", testpar['input_train_mod2'],
33 | "--input_test_mod1", testpar['input_test_mod1'],
34 | "--output", testpar['output']
35 | ]).decode("utf-8")
36 |
37 | print("> Checking whether output files were created")
38 | assert path.exists(testpar['output'])
39 |
40 | print("> Reading h5ad files")
41 | ad_sol = ad.read_h5ad(testpar['input_test_mod2'])
42 | ad_pred = ad.read_h5ad(testpar['output'])
43 |
44 | print("> Checking dataset id")
45 | assert ad_pred.uns['dataset_id'] == ad_sol.uns['dataset_id']
46 |
47 | print("> Checking method id", ad_pred.uns['method_id'], method_id)
48 | assert ad_pred.uns['method_id'] == method_id
49 |
50 | print("> Checking X")
51 | assert issparse(ad_pred.X)
52 | assert ad_pred.n_obs == ad_sol.n_obs
53 | assert ad_pred.n_vars == ad_sol.n_vars
54 | assert all(ad_pred.obs_names == ad_sol.obs_names)
55 | assert all(ad_pred.var_names == ad_sol.var_names)
56 |
57 | print("> Test succeeded!")
--------------------------------------------------------------------------------
/src/predict_modality/methods/AXX/resources/train.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import pytorch_lightning as pl
3 | from torch.utils.data import TensorDataset,DataLoader
4 | from models import MLP
5 | from pytorch_lightning.callbacks import ModelCheckpoint
6 | from pytorch_lightning.loggers import TensorBoardLogger,WandbLogger
7 | from pathlib import Path
8 | import utils
9 | import anndata as ad
10 | import numpy as np
11 | import json
12 | from const import PATH, OUT_PATH
13 |
def _train(X, y, Xt, yt, enable_ckpt, logger, yaml_path):
    """Fit one MLP on ``(X, y)``, validate on ``(Xt, yt)`` and return the result.

    Args:
        X, y: training inputs / targets as numpy arrays.
        Xt, yt: validation inputs / targets as numpy arrays.
        enable_ckpt: when true, train for ``config.epochs`` epochs with a
            checkpoint callback monitoring ``valid_RMSE`` and predict from the
            best checkpoint; otherwise train a single epoch without callbacks.
        logger: logger object handed to the Lightning trainer.
        yaml_path: path of the YAML config loaded with ``utils.load_yaml``.

    Returns:
        ``(score, predictions)`` where ``score`` is the RMSE between the
        predictions and ``yt`` and ``predictions`` is a numpy array.
    """
    config = utils.load_yaml(yaml_path)

    train_x = torch.from_numpy(X).float()
    train_y = torch.from_numpy(y).float()
    # Per-column mean of the training targets, passed on to the model.
    ymean = torch.mean(train_y, dim=0, keepdim=True)

    # ATAC configs load data in the main process; others use 4 workers.
    if 'ATAC' in yaml_path:
        train_workers = 0
    else:
        train_workers = 4
    train_loader = DataLoader(
        TensorDataset(train_x, train_y),
        batch_size=config.batch_size,
        num_workers=train_workers,
        shuffle=True,
        drop_last=True,
    )

    valid_x = torch.from_numpy(Xt).float()
    valid_y = torch.from_numpy(yt).float()
    valid_loader = DataLoader(
        TensorDataset(valid_x, valid_y),
        batch_size=config.batch_size,
        num_workers=0,
        shuffle=False,
        drop_last=False,
    )

    checkpoint_callback = ModelCheckpoint(monitor='valid_RMSE')
    max_epochs = config.epochs if enable_ckpt else 1
    callbacks = [checkpoint_callback] if enable_ckpt else None

    trainer = pl.Trainer(
        enable_checkpointing=enable_ckpt,
        logger=logger,
        gpus=1,
        max_epochs=max_epochs,
        callbacks=callbacks,
        progress_bar_refresh_rate=5,
    )

    net = MLP(train_x.shape[1], train_y.shape[1], ymean, config)
    trainer.fit(net, train_loader, valid_loader)

    # Predict from the best checkpoint only when checkpointing was enabled.
    ckpt_choice = 'best' if enable_ckpt else None
    batch_preds = trainer.predict(net, valid_loader, ckpt_path=ckpt_choice)
    preds = torch.cat(batch_preds, dim=0)

    squared_error = (preds - valid_y) ** 2
    score = squared_error.mean() ** 0.5
    print(f"VALID RMSE {score:.3f}")
    del trainer
    return score, preds.detach().numpy()
55 |
56 |
def train(task,cp,wp,tr1,tr2):
    """Run 3-fold training for ``task`` and print per-fold and overall RMSE.

    Args:
        task: task name; selects ``{cp}/yaml/mlp_{task}.yaml`` and names the
            per-fold output directories.
        cp: directory containing the ``yaml`` config folder.
        wp: directory under which ``{task}_fold_{fold}`` folders are created.
        tr1, tr2: data objects handed to ``utils.split`` together with the
            fold index.
    """
    n_folds = 3
    yaml_path = f'{cp}/yaml/mlp_{task}.yaml'

    fold_preds, fold_scores = [], []
    msgs = {}

    for fold in range(n_folds):
        # One output directory per fold; created if missing.
        save_path = f'{wp}/{task}_fold_{fold}'
        Path(save_path).mkdir(parents=True, exist_ok=True)

        X, y, Xt, yt = utils.split(tr1, tr2, fold)
        fold_logger = TensorBoardLogger(save_path, name='')

        # Checkpointing is always enabled here.
        score, yp = _train(X, y, Xt, yt, True, fold_logger, yaml_path)
        fold_preds.append(yp)
        fold_scores.append(score)

        msgs[f'Fold {fold}'] = f'{score:.3f}'
        print(f"{task} Fold {fold} RMSE {score:.3f}")

    # Predictions of all folds stacked in fold order.
    yp = np.concatenate(fold_preds)
    score = np.mean(fold_scores)
    overall = f'{score:.3f}'
    msgs['Overall'] = overall
    print('Overall', overall)
--------------------------------------------------------------------------------
/src/predict_modality/methods/AXX/resources/yaml/mlp_ADT2GEX.yaml:
--------------------------------------------------------------------------------
1 | # sample config defaults file
2 | epochs:
3 | desc: Number of epochs to train over
4 | value: 10
5 | batch_size:
6 | desc: Size of each mini-batch
7 | value: 512
8 | H1:
9 | desc: Number of hidden neurons in 1st layer of MLP
10 | value: 256
11 | H2:
12 | desc: Number of hidden neurons in 2nd layer of MLP
13 | value: 128
14 | dropout:
15 | desc: probs of zeroing values
16 | value: 0
17 | lr:
18 | desc: learning rate
19 | value: 0.001
20 | wd:
21 | desc: weight decay
22 | value: 1e-5
23 | threshold:
24 | desc: threshold to set values to zero
25 | value: 0
26 | lr_schedule:
27 | desc: learning rate scheduler
28 | value: adam
--------------------------------------------------------------------------------
/src/predict_modality/methods/AXX/resources/yaml/mlp_ATAC2GEX.yaml:
--------------------------------------------------------------------------------
1 | # sample config defaults file
2 | epochs:
3 | desc: Number of epochs to train over
4 | value: 10
5 | batch_size:
6 | desc: Size of each mini-batch
7 | value: 512
8 | H1:
9 | desc: Number of hidden neurons in 1st layer of MLP
10 | value: 256
11 | H2:
12 | desc: Number of hidden neurons in 2nd layer of MLP
13 | value: 128
14 | dropout:
15 | desc: probs of zeroing values
16 | value: 0.5
17 | lr:
18 | desc: learning rate
19 | value: 0.001
20 | wd:
21 | desc: weight decay
22 | value: 1e-5
23 | threshold:
24 | desc: threshold to set values to zero
25 | value: 0
26 | lr_schedule:
27 | desc: learning rate scheduler
28 | value: adam
--------------------------------------------------------------------------------
/src/predict_modality/methods/AXX/resources/yaml/mlp_GEX2ADT.yaml:
--------------------------------------------------------------------------------
1 | # sample config defaults file
2 | epochs:
3 | desc: Number of epochs to train over
4 | value: 10
5 | batch_size:
6 | desc: Size of each mini-batch
7 | value: 512
8 | H1:
9 | desc: Number of hidden neurons in 1st layer of MLP
10 | value: 1024
11 | H2:
12 | desc: Number of hidden neurons in 2nd layer of MLP
13 | value: 512
14 | dropout:
15 | desc: probs of zeroing values
16 | value: 0
17 | lr:
18 | desc: learning rate
19 | value: 0.001
20 | wd:
21 | desc: weight decay
22 | value: 1e-5
23 | threshold:
24 | desc: threshold to set values to zero
25 | value: 0.05
26 | lr_schedule:
27 | desc: learning rate scheduler
28 | value: adam_cosin
--------------------------------------------------------------------------------
/src/predict_modality/methods/AXX/run/config.vsh.yaml:
--------------------------------------------------------------------------------
1 | functionality:
2 | name: simplemlp
3 | namespace: predict_modality_methods
4 |
5 | # metadata for your method
6 | description: Ensemble of MLPs trained on different sites
7 | info:
8 | method_label: SimpleMLP
9 | submission_id: "170812"
10 | team_name: AXX
11 | # project_url: https://github.com/foo/bar
12 | # publication_doi: 10.1101/0123.45.67.890123
13 | # publication_url: https://arxiv.org/abs/1234.56789
14 |
15 | authors:
16 | - name: Xueer Chen
17 | email: xc2579@columbia.edu
18 | roles: [ author, maintainer ]
19 | props: { github: xuerchen }
20 | - name: Jiwei Liu
21 | email: jiweil@nvidia.com
22 | roles: [ author, maintainer ]
23 | props: { github: daxiongshu, orcid: "0000-0002-8799-9763" }
24 |
25 | # parameters
26 | arguments:
27 | # required inputs
28 | - name: "--input_train_mod1"
29 | type: "file"
30 | example: "dataset_mod1.h5ad"
31 | description: Censored dataset, training cells.
32 | required: true
33 | - name: "--input_test_mod1"
34 | type: "file"
35 | example: "dataset_mod1.h5ad"
36 | description: Censored dataset, test cells.
37 | required: true
38 | - name: "--input_train_mod2"
39 | type: "file"
40 | example: "dataset_mod2.h5ad"
41 | description: Censored dataset.
42 | required: true
43 | - name: "--input_pretrain"
44 | type: "file"
45 | direction: "output"
46 | example: "pretrain_model"
47 | description: Path to the directory containing a pretrained model.
48 | required: true
49 | # required outputs
50 | - name: "--output"
51 | type: "file"
52 | direction: "output"
53 | example: "output.h5ad"
54 | description: Dataset with predicted values for modality2.
55 | required: true
56 |
57 |
58 | # files your script needs
59 | resources:
60 | - type: python_script
61 | path: script.py
62 | - path: ../resources/predict.py
63 | - path: ../resources/models.py
64 | - path: ../resources/utils.py
65 | - path: ../resources/const.py
66 | - path: ../resources/yaml
67 |
68 | # resources for unit testing your component
69 | tests:
70 | - type: python_script
71 | path: test.py
72 | - path: sample_data
73 |
74 | # target platforms
75 | platforms:
76 |
77 | # By specifying 'docker' platform, viash will build a standalone
78 | # executable which uses docker in the back end to run your method.
79 | - type: docker
80 | # you need to specify a base image that contains at least bash and python
81 | image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
82 | run_args: [ "--gpus all --ipc=host"]
83 | # You can specify additional dependencies with 'setup'.
84 | # See https://viash.io/docs/reference_config/platform-docker/#setup-list
85 | # for more information on how to add more dependencies.
86 | setup:
87 | # - type: apt
88 | # packages:
89 | # - bash
90 | # - type: python
91 | # packages:
92 | # - scanpy
93 | - type: python
94 | packages:
95 | - scikit-learn
96 | - anndata
97 | - scanpy
98 | - pytorch-lightning
99 |
100 | # By specifying a 'nextflow', viash will also build a viash module
101 | # which uses the docker container built above to also be able to
102 | # run your method as part of a nextflow pipeline.
103 | - type: nextflow
104 | labels: [ highmem, hightime, highcpu, gpu]
105 |
106 | # used for saturn cloud
107 | - type: native
108 |
--------------------------------------------------------------------------------
/src/predict_modality/methods/AXX/run/script.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import anndata as ad
3 | import sys
4 | from scipy.sparse import csc_matrix
5 |
6 | from sklearn.decomposition import TruncatedSVD
7 | from sklearn.linear_model import LinearRegression
8 | import numpy as np
9 |
logging.basicConfig(level=logging.INFO)

## VIASH START
# Anything within this block will be removed by `viash` and will be
# replaced with the parameters as specified in your config.vsh.yaml.
par = {
    'input_train_mod1': 'output/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_train_mod1.h5ad',
    'input_train_mod2': 'output/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_train_mod2.h5ad',
    'input_test_mod1': 'output/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_test_mod1.h5ad',
    'input_pretrain': 'path/to/model',
    'output': 'output.h5ad'
}
meta = {
    'resources_dir': 'src/predict_modality/methods/AXX/resources',
    # Read below when stamping `method_id` on the output; without this entry
    # a direct (non-viash) run of the script stops with a KeyError.
    'functionality_name': 'simplemlp'
}
## VIASH END

# The helper modules are shipped as component resources, so their directory
# has to be put on the import path before they can be imported.
sys.path.append(meta['resources_dir'])
from predict import predict
from utils import get_y_dim

logging.info('Reading `h5ad` files...')
input_train_mod1 = ad.read_h5ad(par['input_train_mod1'])
input_train_mod2 = ad.read_h5ad(par['input_train_mod2'])
input_test_mod1 = ad.read_h5ad(par['input_test_mod1'])

# `get_y_dim` is given the test file path and yields the output width and a
# task label such as 'GEX2ATAC' (its implementation is not in this script).
y_dim, task = get_y_dim(par['input_test_mod1'])

# Per-feature mean of the training targets, as a dense array.
ymean = np.asarray(input_train_mod2.X.mean(axis=0))

if task == 'GEX2ATAC':
    # No model is applied for this task: every test cell receives the
    # training mean profile.
    y_pred = ymean * np.ones([input_test_mod1.shape[0], y_dim])
else:
    # NOTE(review): `predict` presumably loads the weights stored under `wp`
    # and the YAML configs under `cp` for the listed folds -- confirm in
    # resources/predict.py.
    y_pred = predict(ymean, test_data_path=par['input_test_mod1'],
                     folds=[0, 1, 2], cp=meta['resources_dir'],
                     wp=par['input_pretrain'])

y_pred = csc_matrix(y_pred)

adata = ad.AnnData(
    X=y_pred,
    obs=input_test_mod1.obs,
    var=input_train_mod2.var,
    uns={
        'dataset_id': input_train_mod1.uns['dataset_id'],
        'method_id': meta['functionality_name'],
    },
)

logging.info('Storing annotated data...')
adata.write_h5ad(par['output'], compression = "gzip")
58 |
--------------------------------------------------------------------------------
/src/predict_modality/methods/AXX/test.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Trains and runs the `simplemlp` predict_modality component on the phase-2
# datasets, then scores the stored predictions with the evaluation workflow.

export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash"
export NXF_VER=21.04.1
export PIPELINE_VERSION=1.4.0
method_id=simplemlp
task_id=predict_modality

# Point the dataset-specific path variables at the dataset named in $1.
use_dataset() {
  dataset_id=$1
  dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
  pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
  pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
}

# Run the training component for the currently selected dataset.
train_model() {
  target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
    --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
    --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
    --output_pretrain ${pretrain_path}
}

# Run the prediction component for the currently selected dataset.
predict_dataset() {
  target/docker/${task_id}_methods/${method_id}/${method_id} \
    --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
    --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
    --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
    --input_pretrain ${pretrain_path} \
    --output ${pred_path}.${method_id}.output.h5ad
}

# CITE GEX2ADT
use_dataset openproblems_bmmc_cite_phase2_rna
train_model
predict_dataset

# CITE ADT2GEX
use_dataset openproblems_bmmc_cite_phase2_mod2
train_model
predict_dataset

# MULTIOME GEX2ATAC (prediction only; no training step is run for this dataset)
use_dataset openproblems_bmmc_multiome_phase2_rna
predict_dataset

# MULTIOME ATAC2GEX
use_dataset openproblems_bmmc_multiome_phase2_mod2
train_model
predict_dataset

# RUN EVALUATION
bin/nextflow run "$PIPELINE_REPO" \
  -r "$PIPELINE_VERSION" \
  -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \
  --solutionDir "output/datasets/$task_id" \
  --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \
  --publishDir "output/evaluation/$task_id/$method_id/" \
  -latest \
  -resume \
  -c "src/resources/nextflow_moremem.config"

cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json"
--------------------------------------------------------------------------------
/src/predict_modality/methods/AXX/train/config.vsh.yaml:
--------------------------------------------------------------------------------
1 | functionality:
2 | name: simplemlp_train
3 | namespace: predict_modality_methods
4 |
5 | # metadata for your method
6 | description: Ensemble of MLPs trained on different sites
7 | info:
8 | submission_id: "170812"
9 | team_name: AXX
10 | # project_url: https://github.com/foo/bar
11 | # publication_doi: 10.1101/0123.45.67.890123
12 | # publication_url: https://arxiv.org/abs/1234.56789
13 |
14 | authors:
15 | - name: Xueer Chen
16 | email: xc2579@columbia.edu
17 | roles: [ author, maintainer ]
18 | props: { github: xuerchen, orcid: "0000-0000-0000-0000" }
19 | - name: Jiwei Liu
20 | email: jiweil@nvidia.com
21 | roles: [ author, maintainer ]
22 | props: { github: daxiongshu, orcid: "0000-0002-8799-9763" }
23 |
24 |
25 | # parameters
26 | arguments:
27 | # required inputs
28 | - name: "--input_train_mod1"
29 | type: "file"
30 | example: "dataset_mod1.h5ad"
31 | description: Censored dataset, training cells.
32 | required: true
33 | - name: "--input_train_mod2"
34 | type: "file"
35 | example: "dataset_mod2.h5ad"
36 | description: Censored dataset.
37 | required: true
38 | # required outputs
39 | - name: "--output_pretrain"
40 | type: "file"
41 | direction: "output"
42 | example: "pretrain_model"
43 | description: Path to the directory containing a pretrained model.
44 | required: true
45 |
46 | # files your script needs
47 | resources:
48 | - type: python_script
49 | path: script.py
50 | - path: ../resources/train.py
51 | - path: ../resources/models.py
52 | - path: ../resources/utils.py
53 | - path: ../resources/const.py
54 | - path: ../resources/yaml
55 |
56 | # target platforms
57 | platforms:
58 |
59 | # By specifying 'docker' platform, viash will build a standalone
60 | # executable which uses docker in the back end to run your method.
61 | - type: docker
62 | # you need to specify a base image that contains at least bash and python
63 | image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
64 | run_args: [ "--gpus all --ipc=host"]
65 | # You can specify additional dependencies with 'setup'.
66 | # See https://viash.io/docs/reference_config/platform-docker/#setup-list
67 | # for more information on how to add more dependencies.
68 | setup:
69 | # - type: apt
70 | # packages:
71 | # - bash
72 | # - type: python
73 | # packages:
74 | # - scanpy
75 | - type: python
76 | packages:
77 | - scikit-learn
78 | - anndata
79 | - scanpy
80 | - pytorch-lightning
81 |
82 | # By specifying a 'nextflow', viash will also build a viash module
83 | # which uses the docker container built above to also be able to
84 | # run your method as part of a nextflow pipeline.
85 | - type: nextflow
86 | labels: [ highmem, hightime, highcpu, gpu]
87 |
88 | # used for saturn cloud
89 | - type: native
90 |
--------------------------------------------------------------------------------
/src/predict_modality/methods/AXX/train/script.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import anndata as ad
4 | import pickle
5 | import numpy as np
6 | import pandas as pd
7 | import scanpy as sc
8 | from sklearn.preprocessing import binarize
9 |
logging.basicConfig(level=logging.INFO)

## VIASH START
par = {
    'input_train_mod1': 'output/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_train_mod1.h5ad',
    'input_train_mod2': 'output/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_train_mod2.h5ad',
    'output_pretrain': 'path/to/model'
}

meta = {
    'resources_dir': 'src/predict_modality/methods/AXX/resources'
}
## VIASH END

# The training code is shipped as a component resource, so its directory has
# to be on the import path before `train` can be imported.
import sys
sys.path.append(meta['resources_dir'])
from train import train


input_train_mod1 = ad.read_h5ad(par['input_train_mod1'])
input_train_mod2 = ad.read_h5ad(par['input_train_mod2'])

# The modality label is taken from the first feature of each dataset. `.iloc`
# is used because `var` is indexed by feature name, and a bare `[0]` on such a
# Series depends on pandas' deprecated positional fallback.
mod_1 = input_train_mod1.var["feature_types"].iloc[0]
mod_2 = input_train_mod2.var["feature_types"].iloc[0]

os.makedirs(par['output_pretrain'], exist_ok=True)

# Task label in '<mod1>2<mod2>' form, built from the two modality labels.
task = f'{mod_1}2{mod_2}'
train(task, cp=meta['resources_dir'],
      wp=par['output_pretrain'],
      tr1=input_train_mod1,
      tr2=input_train_mod2)
--------------------------------------------------------------------------------
/src/predict_modality/methods/DANCE/resources/baseline.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from sklearn.decomposition import TruncatedSVD
4 | from sklearn.linear_model import LinearRegression
5 |
def baseline_linear(input_train_mod1, input_train_mod2, input_test_mod1, n_components=50):
    '''Baseline method training a linear regressor on SVD-reduced data.

    Both modalities are reduced with `TruncatedSVD`, a `LinearRegression` is
    fitted from the reduced modality 1 to the reduced modality 2, and the
    predictions for the test cells are mapped back to the modality 2 feature
    space through the SVD components.

    Parameters
    ----------
    input_train_mod1 : matrix of modality 1 values for the training cells.
    input_train_mod2 : matrix of modality 2 values for the training cells.
    input_test_mod1 : matrix of modality 1 values for the test cells.
    n_components : number of SVD components used for each modality. The
        default of 50 reproduces the previous fixed setting.

    Returns
    -------
    Array of predicted modality 2 values, one row per test cell.
    '''

    # Reduce modality 1; the test cells reuse the embedding fitted on training.
    logging.info('Performing dimensionality reduction on modality 1 values...')
    embedder_mod1 = TruncatedSVD(n_components=n_components)
    X_train = embedder_mod1.fit_transform(input_train_mod1)
    X_test = embedder_mod1.transform(input_test_mod1)

    logging.info('Performing dimensionality reduction on modality 2 values...')
    embedder_mod2 = TruncatedSVD(n_components=n_components)
    y_train = embedder_mod2.fit_transform(input_train_mod2)

    logging.info('Running Linear regression...')

    reg = LinearRegression()

    # Train the model on the reduced modality 1 and 2 data
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    # Project the predictions back to the modality 2 feature space
    y_pred = y_pred @ embedder_mod2.components_

    return y_pred
--------------------------------------------------------------------------------
/src/predict_modality/methods/DANCE/run/config.vsh.yaml:
--------------------------------------------------------------------------------
1 | functionality:
2 | name: dance
3 | namespace: predict_modality_methods
4 |
5 | # metadata for your method
6 | description: A description for your method.
7 | info:
8 | method_label: "DANCE"
9 | submission_id: "171129"
10 | team_name: DANCE
11 | # project_url: https://github.com/foo/bar
12 | # publication_doi: 10.1101/0123.45.67.890123
13 | # publication_url: https://arxiv.org/abs/1234.56789
14 |
15 | authors:
16 | - name: Hongzhi Wen
17 | email: wenhongz@msu.edu
18 | roles: [ author, maintainer ]
19 | - name: Jiayuan Ding
20 | email: dingjia5@msu.edu
21 | roles: [ author, maintainer ]
22 | - name: Wei Jin
23 | email: jinwei2@msu.edu
24 | roles: [ author ]
25 | - name: Xiaoyan Li
26 | email: lixiaoy5@msu.edu
27 | roles: [ author ]
28 | - name: Zhaoheng Li
29 | email: zli1@macalester.edu
30 | roles: [ author ]
31 | - name: Haoyu Han
32 | email: hanhaoy1@msu.edu
33 | roles: [ assistant ]
34 | - name: Yuying Xie
35 | email: xyy@msu.edu
36 | roles: [ advisor ]
37 | - name: Jiliang Tang
38 | email: tangjili@msu.edu
39 | roles: [ advisor ]
40 |
41 |
42 | # parameters
43 | arguments:
44 | # required inputs
45 | - name: "--input_train_mod1"
46 | type: "file"
47 | example: "dataset_mod1.h5ad"
48 | description: Censored dataset, training cells.
49 | required: true
50 | - name: "--input_test_mod1"
51 | type: "file"
52 | example: "dataset_mod1.h5ad"
53 | description: Censored dataset, test cells.
54 | required: true
55 | - name: "--input_train_mod2"
56 | type: "file"
57 | example: "dataset_mod2.h5ad"
58 | description: Censored dataset.
59 | required: true
60 | - name: "--input_pretrain"
61 | type: "file"
62 | example: "pretrain_model"
63 | description: Path to the directory containing a pretrained model.
64 | required: true
65 | # required outputs
66 | - name: "--output"
67 | type: "file"
68 | direction: "output"
69 | example: "output.h5ad"
70 | description: Dataset with predicted values for modality2.
71 | required: true
72 |
73 | # files your script needs
74 | resources:
75 | - type: python_script
76 | path: script.py
77 | - path: ../resources/baseline.py
78 | - path: ../resources/graph_util.py
79 |
80 | # target platforms
81 | platforms:
82 | - type: docker
83 | image: dataintuitive/randpy:py3.8
84 | setup:
85 | - type: docker
86 | run: [pip install scikit-learn==0.24.1]
87 |
88 | - type: python
89 | packages:
90 | #- scikit-learn
91 | - anndata
92 | - scanpy
93 | - numpy
94 | - torch
95 | - dgl
96 | - lightgbm
97 | - joblib
98 |
99 | - type: nextflow
100 | labels: [ midmem, hightime, lowcpu ]
--------------------------------------------------------------------------------
/src/predict_modality/methods/DANCE/run/script.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import anndata as ad
4 | import numpy as np
5 | import json
6 | import sys
7 | import re
8 | from scipy.sparse import csc_matrix
9 |
10 |
logging.basicConfig(level=logging.INFO)

## VIASH START
dataset_path = "output/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_"
pretrain_path = "output/pretrain/predict_modality/dance/openproblems_bmmc_cite_phase2_rna.dance_train.output_pretrain/"

par = {
    'input_train_mod1': f'{dataset_path}train_mod1.h5ad',
    'input_train_mod2': f'{dataset_path}train_mod2.h5ad',
    'input_test_mod1': f'{dataset_path}test_mod1.h5ad',
    'input_pretrain': pretrain_path,
    'output': 'output.h5ad'
}
meta = {
    'resources_dir': 'src/predict_modality/methods/DANCE/resources',
    'functionality_name': '171129'
}
## VIASH END

logging.info('Reading `h5ad` files...')
train_mod1 = ad.read_h5ad(par['input_train_mod1'])
# `.iloc[0]` instead of `[0]`: `var` is indexed by feature name, so an integer
# key would depend on pandas' deprecated positional fallback.
mod1 = train_mod1.var['feature_types'].iloc[0]
dataset_id = train_mod1.uns['dataset_id']
input_train_mod1 = train_mod1.X

train_mod2 = ad.read_h5ad(par['input_train_mod2'])
var = train_mod2.var
mod2 = train_mod2.var['feature_types'].iloc[0]
input_train_mod2 = train_mod2.X

test_mod1 = ad.read_h5ad(par['input_test_mod1'])
obs = test_mod1.obs
input_test_mod1 = test_mod1.X

if mod1 == 'GEX':
    # Checkpoints averaged for each supported target modality.
    ENSEMBLE_MODELS = {
        'ADT': [
            'f_alpha_conv4_mean_fullbatch_12000_phase2_inductive_batch_speration.pkl',
            'bf_alpha_conv4_mean_fullbatch_10000_phase2_inductive_gex2adt_2.pkl',
            'bf_alpha_conv4_mean_fullbatch_12000_phase2_inductive_gex2adt_sep_2.pkl',
            'bf_alpha_conv4_mean_fullbatch_15000_phase2_inductive.pkl',
        ],
        'ATAC': [
            'bf_alpha_conv4_mean_fullbatch_8000_phase2_inductive_gex2atac_3.pkl',
            'bf_alpha_conv4_mean_fullbatch_8000_phase2_inductive_gex2atac_2.pkl',
            'bf_alpha_conv4_mean_fullbatch_8000_phase2_inductive_gex2atac.pkl',
            'bf_alpha_conv4_mean_fullbatch_10000_phase2_inductive_gex2atac.pkl',
        ],
    }
    if mod2 not in ENSEMBLE_MODELS:
        # Previously this case fell through and failed later with a NameError
        # on `y_pred`; fail early with a message that names the cause.
        raise ValueError(f'No pretrained GEX models for target modality {mod2!r}')

    sys.path.append(meta['resources_dir'])
    # NOTE(review): `WeightedGCN4` is not referenced by name below; presumably
    # it must be importable so the pickled models can be restored -- confirm.
    from graph_util import graph_construction, WeightedGCN4

    import torch

    # # This will get passed to the method
    FEATURE_SIZE = train_mod1.shape[1]
    OUTPUT_SIZE = train_mod2.shape[1]
    TRAIN_SIZE = train_mod1.shape[0]
    TEST_SIZE = test_mod1.shape[0]

    g, bf = graph_construction(meta, train_mod1, train_mod2, test_mod1, pretrain_path=par['input_pretrain'])

    class Dict(dict):
        # Dictionary whose items can also be read and written as attributes.
        __setattr__ = dict.__setitem__
        __getattr__ = dict.__getitem__

    def dict2obj(dictObj):
        """Recursively convert nested plain dicts into `Dict` instances."""
        if not isinstance(dictObj, dict):
            return dictObj
        d = Dict()
        for k, v in dictObj.items():
            d[k] = dict2obj(v)
        return d

    def evaluate(mod, args):
        """Run `mod` on the graph without gradients; return the last TEST_SIZE rows."""
        mod.eval()
        with torch.no_grad():
            logits = mod(g, bf, args)
            logits = logits[-TEST_SIZE:]
            return logits

    def build_args(LOG_FILE_PATH):
        """Rebuild the training arguments from the first line of a log file.

        That line is rewritten from a `Namespace(...)` representation into
        JSON text, parsed, and wrapped so values can be read as attributes.
        """
        # Context manager so the log file handle is closed right after reading.
        with open(LOG_FILE_PATH, 'r') as log_file:
            string = log_file.readline()
        string = string.replace('Namespace', '').replace('=', ':').replace('(', '{ ').replace(')', '}').replace("'", '"').replace(',', ',\n').replace('True', 'true').replace('False','false')
        string = re.sub('[ ](.*?):', r' "\1":', string)
        args = json.loads(string)
        return dict2obj(args)

    y_pred = []
    for model_name in ENSEMBLE_MODELS[mod2]:
        model_path = os.path.join(par['input_pretrain'], model_name)
        args = build_args(model_path.replace('.pkl', '.log'))
        # SECURITY: `torch.load` unpickles the file and can execute arbitrary
        # code; only point `input_pretrain` at trusted model directories.
        model = torch.load(model_path, map_location='cpu')
        y_pred.append(evaluate(model, args).numpy())
        del model, args

    # Plain average of the ensemble members.
    y_pred = csc_matrix(sum(y_pred) / len(y_pred))

elif mod1=='ATAC' and mod2=='GEX':
    # No model for this direction: every test cell gets the training mean profile.
    y_pred = csc_matrix(np.tile(np.mean(input_train_mod2.toarray(), 0), (input_test_mod1.shape[0], 1)))

else:
    sys.path.append(meta['resources_dir'])
    from baseline import baseline_linear

    # The linear baseline is fitted without the cells of batch 's3d1'.
    input_train_mod1 = train_mod1[train_mod1.obs['batch']!='s3d1'].X
    input_train_mod2 = train_mod2[train_mod2.obs['batch']!='s3d1'].X
    y_pred = csc_matrix(baseline_linear(input_train_mod1, input_train_mod2, input_test_mod1))

adata = ad.AnnData(
    X=y_pred,
    obs=obs,
    var=var,
    uns={
        'dataset_id': dataset_id,
        'method_id': meta['functionality_name'],
    },
)

logging.info('Storing annotated data...')
adata.write_h5ad(par['output'], compression = "gzip")
132 |
--------------------------------------------------------------------------------
/src/predict_modality/methods/DANCE/test.sh:
--------------------------------------------------------------------------------
#!/bin/bash
#
# Generates the pretrained models for the `dance` predict_modality component,
# runs it on the four phase-2 datasets and scores the stored predictions.

export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash"
export NXF_VER=21.04.1
export PIPELINE_VERSION=1.4.0
method_id=dance
task_id=predict_modality

# GENERATE PRETRAIN
pretrain_path=output/pretrain/$task_id/$method_id/pretrain.${method_id}_train.output_pretrain/

target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
  --data_dir output/datasets/$task_id \
  --output_pretrain ${pretrain_path}

# Datasets, in order: CITE GEX2ADT, CITE ADT2GEX, MULTIOME GEX2ATAC, MULTIOME ATAC2GEX.
for dataset_id in \
  openproblems_bmmc_cite_phase2_rna \
  openproblems_bmmc_cite_phase2_mod2 \
  openproblems_bmmc_multiome_phase2_rna \
  openproblems_bmmc_multiome_phase2_mod2
do
  dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
  pred_path=output/predictions/$task_id/$dataset_id/$dataset_id

  # Every dataset is predicted from the same pretrain directory.
  target/docker/${task_id}_methods/${method_id}/${method_id} \
    --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
    --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
    --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
    --input_pretrain ${pretrain_path} \
    --output ${pred_path}.${method_id}.output.h5ad
done

# RUN EVALUATION
bin/nextflow run "$PIPELINE_REPO" \
  -r "$PIPELINE_VERSION" \
  -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \
  --solutionDir "output/datasets/$task_id" \
  --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \
  --publishDir "output/evaluation/$task_id/$method_id/" \
  -latest \
  -resume \
  -c "src/resources/nextflow_moremem.config"

cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json"
--------------------------------------------------------------------------------
/src/predict_modality/methods/DANCE/train/config.vsh.yaml:
--------------------------------------------------------------------------------
1 | functionality:
2 | name: dance_train
3 | namespace: predict_modality_methods
4 |
5 | # metadata for your method
6 |
7 | description: A description for your method.
8 | authors:
9 | - name: Hongzhi Wen
10 | email: wenhongz@msu.edu
11 | roles: [ author, maintainer ]
12 | - name: Jiayuan Ding
13 | email: dingjia5@msu.edu
14 | roles: [ author, maintainer ]
15 | - name: Wei Jin
16 | email: jinwei2@msu.edu
17 | roles: [ author ]
18 | - name: Xiaoyan Li
19 | email: lixiaoy5@msu.edu
20 | roles: [ author ]
21 | - name: Zhaoheng Li
22 | email: zli1@macalester.edu
23 | roles: [ author ]
24 | - name: Haoyu Han
25 | email: hanhaoy1@msu.edu
26 | roles: [ assistant ]
27 | - name: Yuying Xie
28 | email: xyy@msu.edu
29 | roles: [ advisor ]
30 | - name: Jiliang Tang
31 | email: tangjili@msu.edu
32 | roles: [ advisor ]
33 |
34 | # parameters
35 | arguments:
36 | # required inputs
37 | - name: "--data_dir"
38 | type: "file"
39 | description: The path to the predict_modality datasets
40 | required: true
41 |
42 | # required outputs
43 | - name: "--output_pretrain"
44 | type: "file"
45 | direction: "output"
46 | example: "pretrain_model"
47 | description: Path to the directory containing the pretrained models.
48 | required: true
49 |
50 | # files your script needs
51 | resources:
52 | - type: bash_script
53 | path: script.sh
54 | - path: hetero_arg_version_v5.py
55 | - path: generate_extra_files.py
56 | - path: h.all.v7.4.entrez.gmt
57 | - path: h.all.v7.4.symbols.gmt
58 | # suggestion: use same WeightedGCN4 as run component
59 | # to use, uncomment the following line
60 | # - path: ../resources/graph_util.py
61 |
62 | # target platforms
63 | platforms:
64 | - type: docker
65 | image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
66 | run_args: [ "--gpus all --shm-size=5G" ]
67 | setup:
68 | - type: docker
69 | run: [pip install scikit-learn==0.24.1]
70 |
71 | - type: python
72 | packages:
73 | #- scikit-learn
74 | - anndata
75 | - scanpy
76 | - numpy
77 | - torch
78 | - dgl-cu111
79 | - lightgbm
80 | - joblib
81 |
82 | - type: nextflow
83 | labels: [ midmem, hightime, lowcpu, gpu ]
84 |
--------------------------------------------------------------------------------
/src/predict_modality/methods/DANCE/train/generate_extra_files.py:
--------------------------------------------------------------------------------
1 | import anndata as ad
2 | import pickle
3 | import numpy as np
4 | from collections import defaultdict
5 | import random
6 |
7 | import argparse
8 |
9 |
# Command-line options: where the predict_modality datasets are read from and
# where the gene-set inputs and generated pickle files live.
parser = argparse.ArgumentParser()
parser.add_argument(
    '-d', '--data_folder',
    default='./data/public/phase2-data/predict_modality/',
)
parser.add_argument(
    '-ef', '--extra_files_folder',
    default='./',
)

args = parser.parse_args()
15 |
def load_pw():
    """Read the hallmark gene sets and return their gene symbols.

    The Entrez file is parsed first to learn the gene-set names; the symbols
    file is then split into one list of tokens per gene set.

    Returns:
        A list with one list of gene symbols per gene set, in the order the
        sets are first filled while reading the symbols file.
    """
    entrez_path = args.extra_files_folder + '/h.all.v7.4.entrez.gmt'
    with open(entrez_path) as gmt:
        tokens = gmt.read().split()
    gene_sets_entrez = defaultdict(list)

    # Non-numeric tokens alternate between a set name and a token that is
    # skipped; numeric tokens are the members of the current set.
    skip_next_text = False
    for token in tokens:
        if token.isnumeric():
            gene_sets_entrez[current_set].append(token)
        elif skip_next_text:
            skip_next_text = False
        else:
            skip_next_text = True
            current_set = token

    symbols_path = args.extra_files_folder + '/h.all.v7.4.symbols.gmt'
    with open(symbols_path) as gmt:
        tokens = gmt.read().split()
    gene_sets_symbols = defaultdict(list)

    # `current_set` deliberately carries over from the loop above, so tokens
    # seen before the first known set name are filed under the last Entrez set.
    for token in tokens:
        if token in gene_sets_entrez:
            current_set = token
            continue
        if token.startswith('http://'):
            continue
        gene_sets_symbols[current_set].append(token)

    return list(gene_sets_symbols.values())
43 |
def graph_construct(train_mod1, pathways=None):
    """Build cosine-similarity weighted edges between features sharing a gene set.

    Args:
        train_mod1: object exposing a sparse matrix `X` (supporting column
            slicing and `.toarray()`) and a `var` table whose index holds the
            feature names, in column order.
        pathways: iterable of gene sets, each an iterable of feature names.
            Defaults to the module-level `pw`, which was the only source
            before this parameter existed.

    Returns:
        A tuple `(uu, vv, ee)`: source column indices, target column indices,
        and one 1x1 array per edge holding the cosine similarity of the two
        columns. Each ordered pair of distinct features in a gene set gives
        one edge; names missing from the data are ignored.
    """
    if pathways is None:
        pathways = pw

    input_train_mod1 = train_mod1.X
    feature_names = train_mod1.var['feature_types'].index.tolist()

    # Name -> first column position, replacing a linear list search per gene.
    position = {}
    for pos, name in enumerate(feature_names):
        position.setdefault(name, pos)

    new_pw = [[position[name] for name in gene_set if name in position]
              for gene_set in pathways]

    def column(idx):
        # Dense column vector for feature `idx`.
        return input_train_mod1[:, idx].toarray()

    norms = {}

    def norm(idx):
        # Euclidean norm of a column, computed once per feature.
        if idx not in norms:
            col = column(idx)
            norms[idx] = np.sqrt(np.dot(col.T, col).item())
        return norms[idx]

    # cos similarity weight
    uu = []
    vv = []
    ee = []
    for members in new_pw:
        for j in members:
            col_j = column(j)
            sj = norm(j)
            for k in members:
                if j != k:
                    uu.append(j)
                    vv.append(k)
                    jk = np.dot(col_j.T, column(k))
                    ee.append(jk / sj / norm(k))

    return uu, vv, ee
73 |
print("Loading pw")
pw = load_pw()


def _train_mod1_path(subtask):
    """Path of the censored `train_mod1` file of `subtask` below the data folder."""
    subtask_folder = args.data_folder + '/' + subtask + '/'
    subtask_filename = subtask_folder + subtask + '.censor_dataset.output_{}.h5ad'
    return subtask_filename.format('train_mod1')


def _dump_pickle(obj, filename):
    """Pickle `obj` into the extra-files folder, closing the file afterwards."""
    with open(args.extra_files_folder + '/' + filename, 'wb') as handle:
        pickle.dump(obj, handle)


print("Generating 'pw.pkl'")
# Generate pw.pkl
subtask = 'openproblems_bmmc_cite_phase2_rna'
uu, vv, ee = graph_construct(ad.read_h5ad(_train_mod1_path(subtask)))
_dump_pickle([uu, vv, ee], 'pw.pkl')

print("Generating 'pw_multiome.pkl'")
subtask = 'openproblems_bmmc_multiome_phase2_rna'
uu, vv, ee = graph_construct(ad.read_h5ad(_train_mod1_path(subtask)))
_dump_pickle([uu, vv, ee], 'pw_multiome.pkl')

print("Generating 'phase2_mask.pkl'")
subtasks = ['openproblems_bmmc_cite_phase2_rna', 'openproblems_bmmc_cite_phase2_mod2', 'openproblems_bmmc_multiome_phase2_rna', 'openproblems_bmmc_multiome_phase2_mod2']
task_names = ['gex2adt', 'adt2gex', 'gex2atac', 'atac2gex']
mask = {}

for subtask in subtasks:
    mask[subtask] = {}
    train_mod1 = ad.read_h5ad(_train_mod1_path(subtask))
    # Random 85% / 15% split of the row positions. The shuffle is unseeded,
    # so the split differs from run to run.
    l = list(range(train_mod1.X.shape[0]))
    random.shuffle(l)
    train_size = int(train_mod1.X.shape[0] * 0.85)
    valid_size = train_mod1.X.shape[0] - train_size
    mask[subtask]['train'] = l[:train_size]
    mask[subtask]['test'] = l[-valid_size:]

_dump_pickle(mask, 'phase2_mask.pkl')
112 |
print("Generating 'phase2_mask_sep.pkl'")
subtask = 'openproblems_bmmc_cite_phase2_rna'
subtask_folder = args.data_folder + '/' + subtask + '/'
subtask_filename = subtask_folder + subtask + '.censor_dataset.output_{}.h5ad'

train_mod1 = ad.read_h5ad(subtask_filename.format('train_mod1'))


def get_index(batch):
    """Return the row positions of the cells in `train_mod1` whose batch is `batch`.

    The positions come straight from the boolean batch mask, avoiding a linear
    search of the cell index for every cell. For uniquely named cells this is
    the same list the name-based lookup produced.
    """
    in_batch = (train_mod1.obs['batch'] == batch).to_numpy()
    return np.flatnonzero(in_batch).tolist()


s3d1 = get_index('s3d1')
s3d7 = get_index('s3d7')
s1d2 = get_index('s1d2')

# Batches s3d7 and s1d2 form the test split; s3d1 is left out of both splits.
test = s3d7 + s1d2
excluded = set(test) | set(s3d1)
train = [i for i in range(train_mod1.X.shape[0]) if i not in excluded]

gex2adt = {}
gex2adt['test'] = test
gex2adt['train'] = train

mask = {}
mask['openproblems_bmmc_cite_phase2_rna'] = gex2adt
# Context manager so the pickle file is flushed and closed deterministically.
with open(args.extra_files_folder + '/phase2_mask_sep.pkl', 'wb') as handle:
    pickle.dump(mask, handle)
--------------------------------------------------------------------------------
/src/predict_modality/methods/Guanlab-dengkw/run/config.vsh.yaml:
--------------------------------------------------------------------------------
1 | functionality:
2 | name: guanlab_dengkw_pm
3 | namespace: predict_modality_methods
4 |
5 | # metadata for your method
6 | description: A description for your method.
7 | info:
8 | method_label: "Guanlab-dengkw"
9 | submission_id: "170636"
10 | team_name: Guanlab-dengkw
11 | # project_url: https://github.com/foo/bar
12 | # publication_doi: 10.1101/0123.45.67.890123
13 | # publication_url: https://arxiv.org/abs/1234.56789
14 |
15 | authors:
16 | - name: Kaiwen Deng
17 | email: dengkw@umich.edu
18 | roles: [ author, maintainer ]
19 | props: { github: nonztalk }
20 |
21 | # parameters
22 | arguments:
23 | # required inputs
24 | - name: "--input_train_mod1"
25 | type: "file"
26 | example: "dataset_mod1.h5ad"
27 | description: Censored dataset, training cells.
28 | required: true
29 | - name: "--input_test_mod1"
30 | type: "file"
31 | example: "dataset_mod1.h5ad"
32 | description: Censored dataset, test cells.
33 | required: true
34 | - name: "--input_train_mod2"
35 | type: "file"
36 | example: "dataset_mod2.h5ad"
37 | description: Censored dataset.
38 | required: true
39 | # required outputs
40 | - name: "--output"
41 | type: "file"
42 | direction: "output"
43 | example: "output.h5ad"
44 | description: Dataset with predicted values for modality2.
45 | required: true
46 | # additional parameters
47 | - name: "--distance_method"
48 | type: "string"
49 | default: "minkowski"
50 | description: The distance metric to use. Possible values include `euclidean` and `minkowski`.
51 | - name: "--n_pcs"
52 | type: "integer"
53 | default: 50
54 | description: Number of components to use for dimensionality reduction.
55 |
56 | # files your script needs
57 | resources:
58 | - type: python_script
59 | path: script.py
60 |
61 | # target platforms
62 | platforms:
63 | - type: docker
64 | image: dataintuitive/randpy:py3.8
65 | setup:
66 |
67 | - type: python
68 | packages:
69 | - scikit-learn
70 | - anndata
71 | - pandas
72 | - numpy
73 | - scanpy
74 |
75 | - type: nextflow
76 | labels: [ vhighmem, vvhightime, vhighcpu ]
77 |
--------------------------------------------------------------------------------
/src/predict_modality/methods/Guanlab-dengkw/run/script.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import anndata as ad
3 | import numpy as np
4 |
5 | from scipy.sparse import csc_matrix
6 |
7 | from sklearn.decomposition import TruncatedSVD
8 | from sklearn.gaussian_process.kernels import RBF
9 | from sklearn.kernel_ridge import KernelRidge
10 |
11 | logging.basicConfig(level=logging.INFO)
12 |
13 | ## VIASH START
14 | par = {
15 | 'input_train_mod1': 'sample_data/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod1.h5ad',
16 | 'input_train_mod2': 'sample_data/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod2.h5ad',
17 | 'input_test_mod1': 'sample_data/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod1.h5ad',
18 | 'output': 'output.h5ad',
19 | }
20 | meta = { 'functionality_name': 'submission_170636' }
21 | ## VIASH END
22 |
23 | logging.info('Reading `h5ad` files...')
24 | input_train_mod1 = ad.read_h5ad(par['input_train_mod1'])
25 | input_train_mod2 = ad.read_h5ad(par['input_train_mod2'])
26 | input_test_mod1 = ad.read_h5ad(par['input_test_mod1'])
27 |
28 | pred_dimx = input_test_mod1.shape[0]
29 | pred_dimy = input_train_mod2.shape[1]
30 |
31 | feature_obs = input_train_mod1.obs
32 | gs_obs = input_train_mod2.obs
33 |
34 | batches = input_train_mod1.obs.batch.unique().tolist()
35 | batch_len = len(batches)
36 |
37 | obs = input_test_mod1.obs
38 | var = input_train_mod2.var
39 | dataset_id = input_train_mod1.uns['dataset_id']
40 |
41 | input_train = ad.concat(
42 | {"train": input_train_mod1, "test": input_test_mod1},
43 | axis=0,
44 | join="outer",
45 | label="group",
46 | fill_value=0,
47 | index_unique="-"
48 | )
49 |
50 | logging.info('Determine parameters by the modalities')
51 | mod1_type = input_train_mod1.var.feature_types[0]
52 | mod1_type = mod1_type.upper()
53 | mod2_type = input_train_mod2.var.feature_types[0]
54 | mod2_type = mod2_type.upper()
55 | n_comp_dict = {
56 | ("GEX", "ADT"): (300, 70, 10, 0.2),
57 | ("ADT", "GEX"): (None, 50, 10, 0.2),
58 | ("GEX", "ATAC"): (1000, 50, 10, 0.1),
59 | ("ATAC", "GEX"): (100, 70, 10, 0.1)
60 | }
61 | logging.info(f"{mod1_type}, {mod2_type}")
62 | n_mod1, n_mod2, scale, alpha = n_comp_dict[(mod1_type, mod2_type)]
63 | logging.info(f"{n_mod1}, {n_mod2}, {scale}, {alpha}")
64 |
65 | # Do PCA on the input data
66 | logging.info('Models using the Truncated SVD to reduce the dimension')
67 |
68 | if n_mod1 is not None and n_mod1 < input_train.shape[1]:
69 | embedder_mod1 = TruncatedSVD(n_components=n_mod1)
70 | mod1_pca = embedder_mod1.fit_transform(input_train.X).astype(np.float32)
71 | train_matrix = mod1_pca[input_train.obs['group'] == 'train']
72 | test_matrix = mod1_pca[input_train.obs['group'] == 'test']
73 | else:
74 | train_matrix = input_train_mod1.to_df().values.astype(np.float32)
75 | test_matrix = input_test_mod1.to_df().values.astype(np.float32)
76 |
77 | if n_mod2 is not None and n_mod2 < input_train_mod2.shape[1]:
78 | embedder_mod2 = TruncatedSVD(n_components=n_mod2)
79 | train_gs = embedder_mod2.fit_transform(input_train_mod2.X).astype(np.float32)
80 | else:
81 | train_gs = input_train_mod2.to_df().values.astype(np.float32)
82 |
83 | del input_train
84 | del input_train_mod1
85 | del input_train_mod2
86 | del input_test_mod1
87 |
88 | logging.info('Running normalization ...')
89 | train_sd = np.std(train_matrix, axis=1).reshape(-1, 1)
90 | train_sd[train_sd == 0] = 1
91 | train_norm = (train_matrix - np.mean(train_matrix, axis=1).reshape(-1, 1)) / train_sd
92 | train_norm = train_norm.astype(np.float32)
93 | del train_matrix
94 |
95 | test_sd = np.std(test_matrix, axis=1).reshape(-1, 1)
96 | test_sd[test_sd == 0] = 1
97 | test_norm = (test_matrix - np.mean(test_matrix, axis=1).reshape(-1, 1)) / test_sd
98 | test_norm = test_norm.astype(np.float32)
99 | del test_matrix
100 |
101 | logging.info('Running KRR model ...')
102 | y_pred = np.zeros((pred_dimx, pred_dimy), dtype=np.float32)
103 | np.random.seed(1000)
104 |
105 | for _ in range(5):
106 | np.random.shuffle(batches)
107 | for batch in [batches[:batch_len//2], batches[batch_len//2:]]:
108 | # for passing the test
109 | if not batch:
110 | batch = [batches[0]]
111 |
112 | logging.info(batch)
113 | kernel = RBF(length_scale = scale)
114 | krr = KernelRidge(alpha=alpha, kernel=kernel)
115 | logging.info('Fitting KRR ... ')
116 | krr.fit(train_norm[feature_obs.batch.isin(batch)],
117 | train_gs[gs_obs.batch.isin(batch)])
118 | y_pred += (krr.predict(test_norm) @ embedder_mod2.components_)
119 |
120 | np.clip(y_pred, a_min=0, a_max=None, out=y_pred)
121 | if mod2_type == "ATAC":
122 | np.clip(y_pred, a_min=0, a_max=1, out=y_pred)
123 |
124 | y_pred /= 10
125 |
126 | # Store as sparse matrix to be efficient. Note that this might require
127 | # different classifiers/embedders before-hand. Not every class is able
128 | # to support such data structures.
129 | y_pred = csc_matrix(y_pred)
130 |
131 | logging.info("Generate anndata object ...")
132 | adata = ad.AnnData(
133 | X=y_pred,
134 | obs=obs,
135 | var=var,
136 | uns={
137 | 'dataset_id': dataset_id,
138 | 'method_id': meta['functionality_name'],
139 | },
140 | )
141 |
142 | logging.info('Storing annotated data...')
143 | adata.write_h5ad(par['output'], compression = "gzip")
144 |
--------------------------------------------------------------------------------
/src/predict_modality/methods/Guanlab-dengkw/test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Run the guanlab_dengkw_pm component on the four phase-2 datasets and score
4 | # the resulting predictions with the evaluation pipeline.
5 |
6 | export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash"
7 | export NXF_VER=21.04.1
8 | export PIPELINE_VERSION=1.4.0
9 | method_id=guanlab_dengkw_pm
10 | task_id=predict_modality
11 |
12 | # The component declares no `--input_pretrain` argument and this script never
13 | # sets a pretrain path, so that option is not passed to any invocation.
14 |
15 | # CITE GEX2ADT
16 | dataset_id=openproblems_bmmc_cite_phase2_rna
17 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
18 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
19 |
20 | target/docker/${task_id}_methods/${method_id}/${method_id} \
21 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
22 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
23 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
24 |   --output ${pred_path}.${method_id}.output.h5ad
25 |
26 | # CITE ADT2GEX
27 | dataset_id=openproblems_bmmc_cite_phase2_mod2
28 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
29 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
30 |
31 | target/docker/${task_id}_methods/${method_id}/${method_id} \
32 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
33 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
34 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
35 |   --output ${pred_path}.${method_id}.output.h5ad
36 |
37 | # MULTIOME GEX2ATAC
38 | dataset_id=openproblems_bmmc_multiome_phase2_rna
39 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
40 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
41 |
42 | target/docker/${task_id}_methods/${method_id}/${method_id} \
43 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
44 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
45 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
46 |   --output ${pred_path}.${method_id}.output.h5ad
47 |
48 | # MULTIOME ATAC2GEX
49 | dataset_id=openproblems_bmmc_multiome_phase2_mod2
50 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
51 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
52 |
53 | target/docker/${task_id}_methods/${method_id}/${method_id} \
54 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
55 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
56 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
57 |   --output ${pred_path}.${method_id}.output.h5ad
58 |
59 | # RUN EVALUATION
60 | bin/nextflow run "$PIPELINE_REPO" \
61 |   -r "$PIPELINE_VERSION" \
62 |   -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \
63 |   --solutionDir "output/datasets/$task_id" \
64 |   --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \
65 |   --publishDir "output/evaluation/$task_id/$method_id/" \
66 |   -latest \
67 |   -resume \
68 |   -c "src/resources/nextflow_moremem.config"
69 |
70 | cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json"
71 |
--------------------------------------------------------------------------------
/src/predict_modality/methods/LS_lab/run/config.vsh.yaml:
--------------------------------------------------------------------------------
1 | functionality:
2 | name: lslab
3 | namespace: predict_modality_methods
4 |
5 | # metadata for your method
6 | description: A description for your method.
7 | info:
8 | method_label: "LS_Lab"
9 | submission_id: "171123"
10 | team_name: LS_lab
11 | # project_url: https://github.com/foo/bar
12 | # publication_doi: 10.1101/0123.45.67.890123
13 | # publication_url: https://arxiv.org/abs/1234.56789
14 |
15 | authors:
16 | - name: Aidyn Ubingazhibov
17 | email: aidyn.ubingazhibov@nu.edu.kz
18 | roles: [ author, maintainer ]
19 | props: { github: aidynabirov }
20 |
21 | # parameters
22 | arguments:
23 | # required inputs
24 | - name: "--input_train_mod1"
25 | type: "file"
26 | example: "dataset_mod1.h5ad"
27 | description: Censored dataset, training cells.
28 | required: true
29 | - name: "--input_test_mod1"
30 | type: "file"
31 | example: "dataset_mod1.h5ad"
32 | description: Censored dataset, test cells.
33 | required: true
34 | - name: "--input_train_mod2"
35 | type: "file"
36 | example: "dataset_mod2.h5ad"
37 | description: Censored dataset.
38 | required: true
39 | # required outputs
40 | - name: "--output"
41 | type: "file"
42 | direction: "output"
43 | example: "output.h5ad"
44 | description: Dataset with predicted values for modality2.
45 | required: true
46 |
47 | # files your script needs
48 | resources:
49 | - type: python_script
50 | path: script.py
51 |
52 | # target platforms
53 | platforms:
54 | - type: docker
55 | image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
56 | run_args: [ "--gpus all" ]
57 | setup:
58 | - type: python
59 | packages:
60 | - scikit-learn
61 | - catboost
62 | - anndata
63 | - scanpy
64 | - tqdm
65 | - type: nextflow
66 | labels: [ vhighmem, vvhightime, highcpu, gpu]
67 |
--------------------------------------------------------------------------------
/src/predict_modality/methods/LS_lab/test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Run the lslab component on the four phase-2 datasets and score the
4 | # resulting predictions with the evaluation pipeline.
5 |
6 | export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash"
7 | export NXF_VER=21.04.1
8 | export PIPELINE_VERSION=1.4.0
9 | # Must match `functionality.name` in run/config.vsh.yaml, because the
10 | # executable path below is built from it.
11 | method_id=lslab
12 | task_id=predict_modality
13 |
14 | # The component declares no `--input_pretrain` argument and this script never
15 | # sets a pretrain path, so that option is not passed to any invocation.
16 |
17 | # CITE GEX2ADT
18 | dataset_id=openproblems_bmmc_cite_phase2_rna
19 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
20 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
21 |
22 | target/docker/${task_id}_methods/${method_id}/${method_id} \
23 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
24 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
25 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
26 |   --output ${pred_path}.${method_id}.output.h5ad
27 |
28 | # CITE ADT2GEX
29 | dataset_id=openproblems_bmmc_cite_phase2_mod2
30 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
31 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
32 |
33 | target/docker/${task_id}_methods/${method_id}/${method_id} \
34 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
35 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
36 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
37 |   --output ${pred_path}.${method_id}.output.h5ad
38 |
39 | # MULTIOME GEX2ATAC
40 | dataset_id=openproblems_bmmc_multiome_phase2_rna
41 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
42 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
43 |
44 | target/docker/${task_id}_methods/${method_id}/${method_id} \
45 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
46 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
47 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
48 |   --output ${pred_path}.${method_id}.output.h5ad
49 |
50 | # MULTIOME ATAC2GEX
51 | dataset_id=openproblems_bmmc_multiome_phase2_mod2
52 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
53 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
54 |
55 | target/docker/${task_id}_methods/${method_id}/${method_id} \
56 |   --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
57 |   --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
58 |   --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
59 |   --output ${pred_path}.${method_id}.output.h5ad
60 |
61 | # RUN EVALUATION
62 | bin/nextflow run "$PIPELINE_REPO" \
63 |   -r "$PIPELINE_VERSION" \
64 |   -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \
65 |   --solutionDir "output/datasets/$task_id" \
66 |   --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \
67 |   --publishDir "output/evaluation/$task_id/$method_id/" \
68 |   -latest \
69 |   -resume \
70 |   -c "src/resources/nextflow_moremem.config"
71 |
72 | cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json"
--------------------------------------------------------------------------------
/src/predict_modality/methods/cajal/run/config.vsh.yaml:
--------------------------------------------------------------------------------
1 | functionality:
2 | name: cajal
3 | namespace: predict_modality_methods
4 |
5 | # metadata for your method
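6 | description: Predicts modality 2 with a pretrained Keras model applied to standardized modality 1 features augmented with per-cell total counts and per-batch count statistics.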
7 | info:
8 | method_label: "Cajal"
9 | submission_id: "170613"
10 | team_name: Cajal
11 | # project_url: https://github.com/foo/bar
12 | # publication_doi: 10.1101/0123.45.67.890123
13 | # publication_url: https://arxiv.org/abs/1234.56789
14 |
15 | authors:
16 | - name: Anna Laddach
17 | email: anna.laddach@crick.ac.uk
18 | roles: [ author, maintainer ]
19 | props: { github: AnnaLaddach, orcid: "0000-0001-5552-6534" }
20 | - name: Roman Laddach
21 | email: roman.laddach@kcl.ac.uk
22 | roles: [ author, maintainer ]
23 | props: { github: rladdach, orcid: "0000-0002-0118-4548" }
24 | - name: Michael Shapiro
25 | email: michael.shapiro@crick.ac.uk
26 | roles: [ author, maintainer ]
27 | props: { github: michaeldshapiro, orcid: "0000-0002-2769-9320" }
28 |
29 | # parameters
30 | arguments:
31 | # required inputs
32 | - name: "--input_train_mod1"
33 | type: "file"
34 | example: "dataset_mod1.h5ad"
35 | description: Censored dataset, training cells.
36 | required: true
37 | - name: "--input_test_mod1"
38 | type: "file"
39 | example: "dataset_mod1.h5ad"
40 | description: Censored dataset, test cells.
41 | required: true
42 | - name: "--input_train_mod2"
43 | type: "file"
44 | example: "dataset_mod2.h5ad"
45 | description: Censored dataset.
46 | required: true
47 | - name: "--input_pretrain"
48 | type: "file"
49 | example: "pretrain_model"
50 | description: Path to the directory containing a pretrained model.
51 | required: true
52 |
53 | # required outputs
54 | - name: "--output"
55 | type: "file"
56 | direction: "output"
57 | example: "output.h5ad"
58 | description: Dataset with predicted values for modality2.
59 | required: true
60 |
61 | # files your script needs
62 | resources:
63 | - type: python_script
64 | path: script.py
65 |
66 | # target platforms
67 | platforms:
68 | - type: docker
69 | image: tensorflow/tensorflow:2.5.0-gpu
70 | run_args: [ "--gpus all" ]
71 | setup:
72 | - type: python
73 | packages:
74 | - scikit-learn
75 | - anndata
76 | - scanpy
77 | - tensorflow
78 | - pandas
79 | - type: nextflow
80 | labels: [ vhighmem, vvhightime, highcpu, gpu]
81 |
--------------------------------------------------------------------------------
/src/predict_modality/methods/cajal/run/script.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import anndata as ad
3 | import pickle
4 | import numpy as np
5 |
6 | from scipy.sparse import csc_matrix
7 |
8 | import tensorflow as tf
9 | import scanpy as sc
10 |
11 | logging.basicConfig(level=logging.INFO)
12 | # NOTE(review): the VIASH START/END block presumably holds development defaults that viash overrides — confirm.
13 | ## VIASH START
14 | par = {
15 | 'input_train_mod1': 'sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod1.h5ad',
16 | 'input_train_mod2': 'sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod2.h5ad',
17 | 'input_test_mod1': 'sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod1.h5ad',
18 | 'input_pretrain': 'path/to/model',
19 | 'output': 'output.h5ad'
20 | }
21 | meta = { 'functionality_name': 'cajal' }
22 | ## VIASH END
23 |
24 | logging.info('Reading `h5ad` files...')
25 | input_train_mod1 = ad.read_h5ad(par['input_train_mod1'])
26 | input_train_mod2 = ad.read_h5ad(par['input_train_mod2'])
27 | input_test_mod1 = ad.read_h5ad(par['input_test_mod1'])
28 |
29 | # Feature type of each modality, read from the first variable of each training set.
30 | mod_1 = input_train_mod1.var["feature_types"][0]
31 | mod_2 = input_train_mod2.var["feature_types"][0]
32 |
33 | # Per-cell sum of the raw 'counts' layer; appended to the feature matrix further down.
34 | test_total = np.sum(input_test_mod1.layers['counts'].toarray(), axis=1)
35 | # For GEX input: normalise the raw counts of each test cell to a total of 1e6, then log1p-transform.
36 | if mod_1 == "GEX":
37 | input_test_mod1.X = input_test_mod1.layers['counts']
38 | sc.pp.normalize_per_cell(input_test_mod1, counts_per_cell_after=1e6)
39 | sc.pp.log1p(input_test_mod1)
40 | # Keep only the genes listed in genes.pkl. NOTE(review): indentation is not visible here, so it is unclear whether this is GEX-only — confirm.
41 | with open(par["input_pretrain"] + "/genes.pkl", "rb") as f:
42 | genes = pickle.load(f)
43 | input_test_mod1 = input_test_mod1[:,genes]
44 | # NOTE(review): after this step only `input_train_mod1.uns['dataset_id']` is read, so the training-set normalisation looks unused — confirm.
45 | if mod_1 == "GEX":
46 | input_train_mod1.X = input_train_mod1.layers['counts']
47 | sc.pp.normalize_per_cell(input_train_mod1, counts_per_cell_after=1e6)
48 | sc.pp.log1p(input_train_mod1)
49 |
50 | X_test = input_test_mod1.X.toarray()
51 |
52 | test_batches = set(input_test_mod1.obs.batch)
53 | # Per-batch median and standard deviation of the total counts, stored on every test cell.
54 | input_test_mod1.obs["batch_median"] = 0
55 |
56 | input_test_mod1.obs["batch_sd"] = 0
57 | # NOTE(review): chained indexing `obs[col][mask] = ...` may write to a temporary copy in pandas; `obs.loc[mask, col] = ...` is unambiguous — confirm.
58 | for batch in test_batches:
59 | input_test_mod1.obs["batch_median"][input_test_mod1.obs.batch == batch] = np.median(test_total[input_test_mod1.obs.batch == batch])
60 | input_test_mod1.obs["batch_sd"][input_test_mod1.obs.batch == batch] = np.std(test_total[input_test_mod1.obs.batch == batch])
61 |
62 | # Append each of the three summary features (total counts, batch median, batch sd) as 50 identical columns.
63 | for i in range(50):
64 | X_test = np.column_stack((X_test,test_total))
65 |
66 | for i in range(50):
67 | X_test = np.column_stack((X_test,input_test_mod1.obs["batch_median"]))
68 |
69 | for i in range(50):
70 | X_test = np.column_stack((X_test,input_test_mod1.obs["batch_sd"]))
71 | # Load the means and standard deviations stored in the pretrain directory. NOTE(review): pickle.load can execute code; only load trusted files.
72 | with open(par["input_pretrain"] + "/transformation.pkl", "rb") as f:
73 | info = pickle.load(f)
74 | # Standardise the features. NOTE(review): the transposes imply `means`/`sds` broadcast against a (features, cells) array — confirm their shape.
75 | X_test = X_test.T
76 | X_test = (X_test - info["means"])/info["sds"]
77 | X_test = X_test.T
78 |
79 |
80 | # Load the pretrained Keras model from the pretrain directory.
81 | model = tf.keras.models.load_model(par["input_pretrain"] + "/model.h5")
82 |
83 | # Predict modality 2 for the test cells.
84 | y_pred = model.predict(X_test)
85 |
86 | # Convert the predictions to a sparse matrix.
87 | y_pred = csc_matrix(y_pred)
88 |
89 | adata = ad.AnnData(
90 | X=y_pred,
91 | obs=input_test_mod1.obs,
92 | var=input_train_mod2.var,
93 | uns={
94 | 'dataset_id': input_train_mod1.uns['dataset_id'],
95 | 'method_id': "cajal",
96 | },
97 | )
98 |
99 |
100 | logging.info('Storing annotated data...')
101 | adata.write_h5ad(par['output'], compression = "gzip")
102 |
--------------------------------------------------------------------------------
/src/predict_modality/methods/cajal/test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash"
4 | export NXF_VER=21.04.1
5 | export PIPELINE_VERSION=1.4.0
6 | method_id=cajal
7 | task_id=predict_modality
8 |
9 | # CITE GEX2ADT
10 | dataset_id=openproblems_bmmc_cite_phase2_rna
11 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
12 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
13 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
14 |
15 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
16 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
17 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
18 | --input_explore_mod1 output/datasets_explore/cite/cite_gex_processed_training.h5ad \
19 | --input_explore_mod2 output/datasets_explore/cite/cite_adt_processed_training.h5ad \
20 | --output_pretrain ${pretrain_path}
21 |
22 | target/docker/${task_id}_methods/${method_id}/${method_id} \
23 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
24 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
25 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
26 | --input_pretrain ${pretrain_path} \
27 | --output ${pred_path}.${method_id}.output.h5ad
28 |
29 | # CITE ADT2GEX
30 | dataset_id=openproblems_bmmc_cite_phase2_mod2
31 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
32 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
33 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
34 |
35 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
36 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
37 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
38 | --input_explore_mod1 output/datasets_explore/cite/cite_adt_processed_training.h5ad \
39 | --input_explore_mod2 output/datasets_explore/cite/cite_gex_processed_training.h5ad \
40 | --output_pretrain ${pretrain_path}
41 |
42 | target/docker/${task_id}_methods/${method_id}/${method_id} \
43 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
44 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
45 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
46 | --input_pretrain ${pretrain_path} \
47 | --output ${pred_path}.${method_id}.output.h5ad
48 |
49 |
50 | # MULTIOME GEX2ATAC
51 | dataset_id=openproblems_bmmc_multiome_phase2_rna
52 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
53 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
54 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
55 |
56 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
57 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
58 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
59 | --input_explore_mod1 output/datasets_explore/multiome/multiome_gex_processed_training.h5ad \
60 | --input_explore_mod2 output/datasets_explore/multiome/multiome_atac_processed_training.h5ad \
61 | --output_pretrain ${pretrain_path}
62 |
63 | target/docker/${task_id}_methods/${method_id}/${method_id} \
64 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
65 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
66 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
67 | --input_pretrain ${pretrain_path} \
68 | --output ${pred_path}.${method_id}.output.h5ad
69 |
70 | # MULTIOME ATAC2GEX
71 | dataset_id=openproblems_bmmc_multiome_phase2_mod2
72 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
73 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
74 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
75 |
76 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
77 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
78 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
79 | --input_explore_mod1 output/datasets_explore/multiome/multiome_atac_processed_training.h5ad \
80 | --input_explore_mod2 output/datasets_explore/multiome/multiome_gex_processed_training.h5ad \
81 | --output_pretrain ${pretrain_path}
82 |
83 | target/docker/${task_id}_methods/${method_id}/${method_id} \
84 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
85 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
86 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
87 | --input_pretrain ${pretrain_path} \
88 | --output ${pred_path}.${method_id}.output.h5ad
89 |
90 | # RUN EVALUATION
91 | bin/nextflow run "$PIPELINE_REPO" \
92 | -r "$PIPELINE_VERSION" \
93 | -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \
94 | --solutionDir "output/datasets/$task_id" \
95 | --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \
96 | --publishDir "output/evaluation/$task_id/$method_id/" \
97 | -latest \
98 | -resume \
99 | -c "src/resources/nextflow_moremem.config"
100 |
101 | cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json"
--------------------------------------------------------------------------------
/src/predict_modality/methods/cajal/train/ADT_list_df_updated.csv:
--------------------------------------------------------------------------------
1 | markers,gene_name,
2 | CD86,CD86,
3 | CD274,CD274,
4 | CD270,TNFRSF14,
5 | CD155,PVR,
6 | CD112,NECTIN2,
7 | CD47,CD47,
8 | CD48,CD48,
9 | CD40,CD40,
10 | CD154,CD40LG,
11 | CD52,CD52,
12 | CD3,CD3E,ambiguity
13 | CD8,CD8A,
14 | CD56,NCAM1,
15 | CD19,CD19,
16 | CD33,CD33,
17 | CD11c,ITGAX,
18 | HLA-A-B-C,HLA-A,ambiguity
19 | HLA-A-B-C,HLA-B,ambiguity
20 | HLA-A-B-C,HLA-C,ambiguity
21 | CD45RA,PTPRC,ambiguity
22 | CD123,IL3RA,
23 | CD7,CD7,
24 | CD105,ENG,
25 | CD49f,ITGA6,
26 | CD194,CCR4,
27 | CD4,CD4,
28 | CD44,CD44,
29 | CD14,CD14,
30 | CD16,FCGR3A,
31 | CD25,IL2RA,
32 | CD45RO,PTPRC,ambiguity
33 | CD279,PDCD1,
34 | TIGIT,TIGIT,
35 | CD20,MS4A1,
36 | CD335,NCR1,
37 | CD31,PECAM1,
38 | Podoplanin,PDPN,
39 | CD146,MCAM,
40 | IgM,IGHM,
41 | CD5,CD5,
42 | CD195,CCR5,
43 | CD32,FCGR2A,
44 | CD196,CCR6,
45 | CD185,CXCR5,
46 | CD103,ITGAE,
47 | CD69,CD69,
48 | CD62L,SELL,
49 | CD161,KLRB1,
50 | CD152,CTLA4,
51 | CD223,LAG3,
52 | KLRG1,KLRG1,
53 | CD27,CD27,
54 | CD107a,LAMP1,
55 | CD95,FAS,
56 | CD134,TNFRSF4,
57 | HLA-DR,HLA-DRB1,
58 | CD1c,CD1C,
59 | CD11b,ITGAM,
60 | CD64,FCGR1A,
61 | CD141,THBD,
62 | CD1d,CD1D,
63 | CD314,KLRK1,
64 | CD35,CR1,
65 | CD57,B3GAT1,
66 | CD272,BTLA,
67 | CD278,ICOS,
68 | CD58,CD58,
69 | CD39,ENTPD1,
70 | CX3CR1,CX3CR1,
71 | CD24,CD24,
72 | CD21,CR2,
73 | CD11a,ITGAL,
74 | CD79b,CD79B,
75 | CD244,CD244,
76 | CD169,SIGLEC1,
77 | integrinB7,ITGB7,
78 | CD268,TNFRSF13C,
79 | CD42b,GP1BA,
80 | CD54,ICAM1,
81 | CD62P,SELP,
82 | CD119,IFNGR1,
83 | TCR,TRA,
84 | TCR,TRB,
85 | TCR,TRG,
86 | TCR,TRD,
87 | CD192,CCR2,
88 | CD122,IL2RB,
89 | FceRIa,FCER1A,
90 | CD41,ITGA2B,
91 | CD137,TNFRSF9,
92 | CD163,CD163,
93 | CD83,CD83,
94 | CD124,IL4R,
95 | CD13,ANPEP,
96 | CD2,CD2,
97 | CD226,CD226,
98 | CD29,ITGB1,
99 | CD303,CLEC4C,
100 | CD49b,ITGA2,
101 | CD81,CD81,
102 | IgD,IGHD,
103 | CD18,ITGB2,
104 | CD28,CD28,
105 | CD38,CD38,
106 | CD127,IL7R,
107 | CD45,PTPRC,ambiguity
108 | CD22,CD22,
109 | CD71,TFRC,
110 | CD26,DPP4,
111 | CD115,CSF1R,
112 | CD63,CD63,
113 | CD304,NRP1,
114 | CD36,CD36,
115 | CD172a,SIRPA,
116 | CD72,CD72,
117 | CD158,KIR2DL3,
118 | CD93,CD93,
119 | CD49a,ITGA1,
120 | CD49d,ITGA4,
121 | CD73,NT5E,
122 | CD9,CD9,
123 | TCRVa7.2,?,ambiguity
124 | TCRVd2,?,ambiguity
125 | LOX-1,OLR1,
126 | CD158b,KIR2DL3,
127 | CD158e1,KIR3DL1,
128 | CD142,F3,
129 | CD319,SLAMF7,
130 | CD352,SLAMF6,
131 | CD94,KLRD1,
132 | CD162,SELPLG,
133 | CD85j,LILRB1,
134 | CD23,FCER2,
135 | CD328,SIGLEC7,
136 | HLA-E,HLA-E,
137 | CD82,CD82,
138 | CD101,CD101,
139 | CD88,C5AR1,
140 | CD224,GGT1,
141 |
--------------------------------------------------------------------------------
/src/predict_modality/methods/cajal/train/config.vsh.yaml:
--------------------------------------------------------------------------------
1 | functionality:
2 | name: cajal_train
3 | namespace: predict_modality_methods
4 |
5 | # metadata for your method
6 | description: A description for your method.
7 | info:
8 | submission_id: "170613"
9 | team_name: Cajal
10 | # project_url: https://github.com/foo/bar
11 | # publication_doi: 10.1101/0123.45.67.890123
12 | # publication_url: https://arxiv.org/abs/1234.56789
13 |
14 | authors:
15 | - name: Anna Laddach
16 | email: anna.laddach@crick.ac.uk
17 | roles: [ author, maintainer ]
18 | props: { github: AnnaLaddach, orcid: "0000-0001-5552-6534" }
19 | - name: Roman Laddach
20 | email: roman.laddach@kcl.ac.uk
21 | roles: [ author, maintainer ]
22 | props: { github: rladdach, orcid: "0000-0002-0118-4548" }
23 | - name: Michael Shapiro
24 | email: michael.shapiro@crick.ac.uk
25 | roles: [ author, maintainer ]
26 | props: { github: michaeldshapiro, orcid: "0000-0002-2769-9320" }
27 |
28 | # parameters
29 | arguments:
30 | # required inputs
31 | - name: "--input_train_mod1"
32 | type: "file"
33 | example: "dataset_mod1.h5ad"
34 | description: Censored dataset, training cells.
35 | required: true
36 | - name: "--input_train_mod2"
37 | type: "file"
38 | example: "dataset_mod2.h5ad"
39 | description: Censored dataset.
40 | required: true
41 | - name: "--input_explore_mod1"
42 | type: "file"
43 | example: "dataset_mod1.h5ad"
44 | description: Explore version of the modality 1 dataset.
45 | required: true
46 | - name: "--input_explore_mod2"
47 | type: "file"
48 | example: "dataset_mod2.h5ad"
49 | description: Explore version of the modality 2 dataset.
50 | required: true
51 |
52 | # required outputs
53 | - name: "--output_pretrain"
54 | type: "file"
55 | direction: "output"
56 | example: "pretrain_model"
57 | description: Path to the directory containing a pretrained model.
58 | required: true
59 |
60 | # files your script needs
61 | resources:
62 | - type: python_script
63 | path: script.py
64 | - path: ADT_list_df_updated.csv
65 |
66 | # target platforms
67 | platforms:
68 | - type: docker
69 | image: tensorflow/tensorflow:2.5.0-gpu
70 | run_args: [ "--gpus all" ]
71 | setup:
72 | - type: python
73 | packages:
74 | - scikit-learn
75 | - anndata
76 | - scanpy
77 | - tensorflow
78 | - pandas
79 | - type: nextflow
80 | labels: [ vhighmem, vvhightime, highcpu, gpu]
--------------------------------------------------------------------------------
/src/predict_modality/methods/novel/README.md:
--------------------------------------------------------------------------------
1 | # NeurIPS-Single-Cell-MultiModality
2 |
3 | Team Novel: Gleb Ryazantsev, Nikolay Russkikh, Igor I
4 |
5 | The task is solved by training an encoder-decoder MLP model with one output neuron per component of the target. As input, the encoders use LSI-transformed representations of the ATAC and GEX data, and the raw ADT data. The hyperparameters of the models were found via a broad hyperparameter search using the Optuna framework.
6 |
7 |
--------------------------------------------------------------------------------
/src/predict_modality/methods/novel/novel_architecture.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/src/predict_modality/methods/novel/novel_architecture.jpg
--------------------------------------------------------------------------------
/src/predict_modality/methods/novel/run/config.vsh.yaml:
--------------------------------------------------------------------------------
1 | functionality:
2 | name: novel
3 | namespace: predict_modality_methods
4 |
5 | # metadata for your method
6 | description: The task is solved via training encoder-decoder MLP model with one output neuron per component in the target. As an input, the encoders use representations obtained from ATAC and GEX data via LSI transform and raw ADT data. The hyperparameters of the models were found via broad hyperparameter search using the Optuna framework.
7 | info:
8 | method_label: "Novel"
9 | submission_id: "169769"
10 | team_name: Novel
11 | # project_url: https://github.com/foo/bar
12 | # publication_doi: 10.1101/0123.45.67.890123
13 | # publication_url: https://arxiv.org/abs/1234.56789
14 |
15 | authors:
16 | - name: Gleb Ryazantsev
17 | email: ryazantsev.gleb@gmail.com
18 | roles: [ author, maintainer ]
19 | - name: Nikolay Russkikh
20 | email: russkikh.nikolay@gmail.com
21 | roles: [ author, maintainer ]
22 | - name: Igor I
23 | email: herri.i.67@gmail.com
24 | roles: [ author, maintainer ]
25 |
26 | # parameters
27 | arguments:
28 | # required inputs
29 | - name: "--input_train_mod1"
30 | type: "file"
31 | example: "dataset_mod1.h5ad"
32 | description: Censored dataset, training cells.
33 | required: true
34 | - name: "--input_test_mod1"
35 | type: "file"
36 | example: "dataset_mod1.h5ad"
37 | description: Censored dataset, test cells.
38 | required: true
39 | - name: "--input_train_mod2"
40 | type: "file"
41 | example: "dataset_mod2.h5ad"
42 | description: Censored dataset.
43 | required: true
44 | - name: "--input_pretrain"
45 | type: "file"
46 | example: "pretrain_model"
47 | description: Path to the directory containing a pretrained model.
48 | required: true
49 | # required outputs
50 | - name: "--output"
51 | type: "file"
52 | direction: "output"
53 | example: "output.h5ad"
54 | description: Dataset with predicted values for modality2.
55 | required: true
56 |
57 | # files your script needs
58 | resources:
59 | - type: python_script
60 | path: script.py
61 | - path: ../resources/helper_functions.py
62 |
63 | # target platforms
64 | platforms:
65 | - type: docker
66 | image: "pytorch/pytorch:1.9.0-cuda10.2-cudnn7-runtime"
67 | setup:
68 | - type: python
69 | packages:
70 | - anndata
71 | - scikit-learn
72 | - networkx
73 |
74 | - type: nextflow
75 | labels: [ lowmem, lowtime, lowcpu ]
76 |
--------------------------------------------------------------------------------
/src/predict_modality/methods/novel/run/script.py:
--------------------------------------------------------------------------------
1 | import anndata as ad
2 | import pickle
3 | import torch
4 |
5 | from torch.utils.data import DataLoader
6 |
7 | import sys
8 |
9 | import numpy as np
10 |
11 | from scipy.sparse import csc_matrix
12 |
13 | ## VIASH START
14 | dataset_path = "output/datasets/match_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_"
15 | pretrain_path = "output/pretrain/match_modality/clue/openproblems_bmmc_cite_phase2_rna.clue_train.output_pretrain/"
16 |
17 | par = {
18 | 'input_train_mod1': f'{dataset_path}train_mod1.h5ad',
19 | 'input_train_mod2': f'{dataset_path}train_mod2.h5ad',
20 | 'input_test_mod1': f'{dataset_path}test_mod1.h5ad',
21 | 'input_pretrain': pretrain_path,
22 | 'output': 'output.h5ad'
23 | }
24 | meta = {
25 | 'resources_dir': '.',
26 | 'functionality_name': '169769'
27 | }
28 | ## VIASH END
29 |
30 | sys.path.append(meta['resources_dir'])
31 | from helper_functions import ModelRegressionGex2Adt, ModelRegressionGex2Atac, ModelRegressionAtac2Gex, ModelRegressionAdt2Gex, ModalityMatchingDataset
32 |
33 |
34 | input_test_mod1 = ad.read_h5ad(par['input_test_mod1'])
35 | input_train_mod1 = ad.read_h5ad(par['input_train_mod1'])
36 | input_train_mod2 = ad.read_h5ad(par['input_train_mod2'])
37 |
38 |
39 | mod1 = input_train_mod1.var['feature_types'][0]
40 | mod2 = input_train_mod2.var['feature_types'][0]
41 |
42 | if mod1 == 'GEX' and mod2 == 'ADT':
43 | model = ModelRegressionGex2Adt(256,134)
44 | weight = torch.load(par['input_pretrain'] + '/model.pt', map_location='cpu')
45 | with open(par['input_pretrain'] + '/lsi_transformer.pickle', 'rb') as f:
46 | lsi_transformer_gex = pickle.load(f)
47 |
48 |
49 | model.load_state_dict(weight)
50 | input_test_mod1_ = lsi_transformer_gex.transform(input_test_mod1)
51 |
52 | elif mod1 == 'GEX' and mod2 == 'ATAC':
53 | model = ModelRegressionGex2Atac(256,10000)
54 | weight = torch.load(par['input_pretrain'] + '/model.pt', map_location='cpu')
55 | with open(par['input_pretrain'] + '/lsi_transformer.pickle', 'rb') as f:
56 | lsi_transformer_gex = pickle.load(f)
57 |
58 |
59 | model.load_state_dict(weight)
60 | input_test_mod1_ = lsi_transformer_gex.transform(input_test_mod1)
61 |
62 | elif mod1 == 'ATAC' and mod2 == 'GEX':
63 | model = ModelRegressionAtac2Gex(256,13431)
64 | weight = torch.load(par['input_pretrain'] + '/model.pt', map_location='cpu')
65 | with open(par['input_pretrain'] + '/lsi_transformer.pickle', 'rb') as f:
66 | lsi_transformer_gex = pickle.load(f)
67 |
68 | model.load_state_dict(weight)
69 | input_test_mod1_ = lsi_transformer_gex.transform(input_test_mod1)
70 |
71 | elif mod1 == 'ADT' and mod2 == 'GEX':
72 | model = ModelRegressionAdt2Gex(134,13953)
73 | weight = torch.load(par['input_pretrain'] + '/model.pt', map_location='cpu')
74 |
75 | model.load_state_dict(weight)
76 | #input_test_mod1_ = lsi_transformer_gex.transform(input_test_mod1)
77 | input_test_mod1_ = input_test_mod1.to_df()
78 |
79 | dataset_test = ModalityMatchingDataset(input_test_mod1_, None, is_train=False)
80 | dataloader_test = DataLoader(dataset_test, 32, shuffle = False, num_workers = 4)
81 |
82 | outputs = []
83 | model.eval()
84 | with torch.no_grad():
85 | for x in dataloader_test:
86 | output = model(x.float())
87 | outputs.append(output.detach().cpu().numpy())
88 |
89 | outputs = np.concatenate(outputs)
90 | outputs[outputs<0] = 0
91 | outputs = csc_matrix(outputs)
92 |
93 | adata = ad.AnnData(
94 | X=outputs,
95 | obs=input_test_mod1.obs,
96 | var=input_train_mod2.var,
97 | uns={
98 | 'dataset_id': input_train_mod1.uns['dataset_id'],
99 | 'method_id': meta['functionality_name'],
100 | },
101 | )
102 | adata.write_h5ad(par['output'], compression = "gzip")
--------------------------------------------------------------------------------
/src/predict_modality/methods/novel/test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash"
4 | export NXF_VER=21.04.1
5 | export PIPELINE_VERSION=1.4.0
6 | method_id=novel
7 | task_id=predict_modality
8 |
9 |
10 | # CITE ADT2GEX
11 | dataset_id=openproblems_bmmc_cite_phase1_mod2
12 | dataset_id_val=openproblems_bmmc_cite_phase2_mod2
13 | dataset_path=output/datasets_phase1/$task_id/$dataset_id/$dataset_id.censor_dataset
14 | dataset_path_val=output/datasets/$task_id/$dataset_id_val/$dataset_id_val.censor_dataset
15 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
16 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
17 |
18 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
19 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
20 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
21 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
22 | --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \
23 | --output_pretrain ${pretrain_path}
24 |
25 | target/docker/${task_id}_methods/${method_id}/${method_id} \
26 | --input_train_mod1 ${dataset_path_val}.output_train_mod1.h5ad \
27 | --input_train_mod2 ${dataset_path_val}.output_train_mod2.h5ad \
28 | --input_test_mod1 ${dataset_path_val}.output_test_mod1.h5ad \
29 | --input_pretrain ${pretrain_path} \
30 | --output ${pred_path}.${method_id}.output.h5ad
31 |
32 | # CITE GEX2ADT
33 | dataset_id=openproblems_bmmc_cite_phase2_rna
34 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
35 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
36 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
37 |
38 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
39 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
40 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
41 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
42 | --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \
43 | --output_pretrain ${pretrain_path}
44 |
45 | target/docker/${task_id}_methods/${method_id}/${method_id} \
46 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
47 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
48 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
49 | --input_pretrain ${pretrain_path} \
50 | --output ${pred_path}.${method_id}.output.h5ad
51 |
52 | # MULTIOME GEX2ATAC
53 | dataset_id=openproblems_bmmc_multiome_phase2_rna
54 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
55 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
56 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
57 |
58 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
59 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
60 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
61 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
62 | --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \
63 | --output_pretrain ${pretrain_path}
64 |
65 | target/docker/${task_id}_methods/${method_id}/${method_id} \
66 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
67 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
68 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
69 | --input_pretrain ${pretrain_path} \
70 | --output ${pred_path}.${method_id}.output.h5ad
71 |
72 | # MULTIOME ATAC2GEX
73 | dataset_id=openproblems_bmmc_multiome_phase2_mod2
74 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
75 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
76 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id
77 |
78 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
79 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
80 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
81 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
82 | --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \
83 | --output_pretrain ${pretrain_path}
84 |
85 | target/docker/${task_id}_methods/${method_id}/${method_id} \
86 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
87 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
88 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
89 | --input_pretrain ${pretrain_path} \
90 | --output ${pred_path}.${method_id}.output.h5ad
91 |
92 | # RUN EVALUATION
93 | bin/nextflow run "$PIPELINE_REPO" \
94 | -r "$PIPELINE_VERSION" \
95 | -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \
96 | --solutionDir "output/datasets/$task_id" \
97 | --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \
98 | --publishDir "output/evaluation/$task_id/$method_id/" \
99 | -latest \
100 | -resume \
101 | -c "src/resources/nextflow_moremem.config"
102 |
103 | cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json"
--------------------------------------------------------------------------------
/src/predict_modality/methods/novel/train/config.vsh.yaml:
--------------------------------------------------------------------------------
1 | functionality:
2 | name: novel_train
3 | namespace: predict_modality_methods
4 |
5 | # metadata for your method
6 |
7 | description: The task is solved via training encoder-decoder MLP model with one output neuron per component in the target. As an input, the encoders use representations obtained from ATAC and GEX data via LSI transform and raw ADT data. The hyperparameters of the models were found via broad hyperparameter search using the Optuna framework.
8 |
9 | authors:
10 | - name: Gleb Ryazantsev
11 | email: ryazantsev.gleb@gmail.com
12 | roles: [ author, maintainer ]
13 | - name: Nikolay Russkikh
14 | email: russkikh.nikolay@gmail.com
15 | roles: [ author, maintainer ]
16 | - name: Igor I
17 | email: herri.i.67@gmail.com
18 | roles: [ author, maintainer ]
19 |
20 | # parameters
21 | arguments:
22 | # required inputs
23 | - name: "--input_train_mod1"
24 | type: "file"
25 | example: "dataset_mod1.h5ad"
26 | description: Censored dataset, training cells.
27 | required: true
28 | - name: "--input_train_mod2"
29 | type: "file"
30 | example: "dataset_mod2.h5ad"
31 | description: Censored dataset.
32 | required: true
33 | - name: "--input_test_mod1"
34 | type: "file"
35 | example: "dataset_test_mod1.h5ad"
36 | description: Censored dataset, test cells.
37 | required: true
38 | - name: "--input_test_mod2"
39 | type: "file"
40 | example: "dataset_test_mod2.h5ad"
41 | description: Censored dataset.
42 | required: true
43 |
44 | # required outputs
45 | - name: "--output_pretrain"
46 | type: "file"
47 | direction: "output"
48 | example: "pretrain_model"
49 | description: Path to the directory containing a pretrained model.
50 | required: true
51 |
52 | # files your script needs
53 | resources:
54 | - type: python_script
55 | path: script.py
56 | - path: ../resources/helper_functions.py
57 |
58 | # target platforms
59 | platforms:
60 | - type: docker
61 | image: "pytorch/pytorch:1.9.0-cuda10.2-cudnn7-runtime"
62 | run_args: [ "--gpus all --shm-size=5G" ]
63 | setup:
64 | - type: python
65 | packages:
66 | - anndata
67 | - scikit-learn
68 | - networkx
69 |
70 | - type: nextflow
71 | labels: [ vhighmem, vvhightime, vhighcpu, gpu]
72 |
--------------------------------------------------------------------------------
/src/predict_modality/methods/novel/train/script.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | import torch
5 | from torch.utils.data import DataLoader
6 |
7 | import anndata as ad
8 |
9 | from sklearn.model_selection import train_test_split
10 |
11 | import pickle
12 |
13 | #check gpu available
14 | if (torch.cuda.is_available()):
15 | device = 'cuda:0' #switch to current device
16 | print('current device: gpu')
17 | else:
18 | device = 'cpu'
19 | print('current device: cpu')
20 |
21 |
22 | ## VIASH START
23 | dataset_path = "output/datasets/match_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_"
24 | pretrain_path = "output/pretrain/match_modality/clue/openproblems_bmmc_cite_phase2_rna.clue_train.output_pretrain/"
25 |
26 | par = {
27 | 'input_train_mod1': f'{dataset_path}train_mod1.h5ad',
28 | 'input_train_mod2': f'{dataset_path}train_mod2.h5ad',
29 | 'input_test_mod1': f'{dataset_path}test_mod1.h5ad',
30 | 'input_test_mod2': f'{dataset_path}test_mod2.h5ad',
31 | 'output_pretrain': pretrain_path
32 | }
33 | meta = {
34 | 'resources_dir': '.',
35 | 'functionality_name': '171129'
36 | }
37 | ## VIASH END
38 |
39 | sys.path.append(meta['resources_dir'])
40 | from helper_functions import train_and_valid, lsiTransformer, ModalityMatchingDataset
41 | from helper_functions import ModelRegressionAtac2Gex, ModelRegressionAdt2Gex, ModelRegressionGex2Adt, ModelRegressionGex2Atac
42 |
43 | os.makedirs(par['output_pretrain'], exist_ok=True)
44 |
45 | print("Start train")
46 |
47 | input_train_mod1 = ad.read_h5ad(par['input_train_mod1'])
48 | input_train_mod2 = ad.read_h5ad(par['input_train_mod2'])
49 |
50 | mod1 = input_train_mod1.var['feature_types'][0]
51 | mod2 = input_train_mod2.var['feature_types'][0]
52 | if mod1 != "ADT":
53 | input_train_mod2_df = input_train_mod2.to_df()
54 |
55 | lsi_transformer_gex = lsiTransformer(n_components=256)
56 | gex_train = lsi_transformer_gex.fit_transform(input_train_mod1)
57 |
58 | train_mod1, test_mod1, train_mod2, test_mod2 = train_test_split(gex_train, input_train_mod2_df, test_size=0.25, random_state=666)
59 | input_train_mod2_df = input_train_mod2.to_df()
60 | else:
61 | train_mod1 = input_train_mod1.to_df()
62 | train_mod2 = input_train_mod2.to_df()
63 | test_mod1 = ad.read_h5ad(par['input_test_mod1']).to_df()
64 | test_mod2 = ad.read_h5ad(par['input_test_mod2']).to_df()
65 |
66 |
67 | if mod1 == 'ATAC' and mod2 == 'GEX':
68 | dataset_train = ModalityMatchingDataset(train_mod1, train_mod2)
69 | dataloader_train = DataLoader(dataset_train, 256, shuffle = True, num_workers = 8)
70 |
71 | dataset_test = ModalityMatchingDataset(test_mod1, test_mod2)
72 | dataloader_test = DataLoader(dataset_test, 64, shuffle = False, num_workers = 8)
73 |
74 | model = ModelRegressionAtac2Gex(256,13431).to(device)
75 | optimizer = torch.optim.AdamW(model.parameters(), lr=0.00008386597445284492,weight_decay=0.000684887347727808)
76 |
77 | elif mod1 == 'ADT' and mod2 == 'GEX':
78 | dataset_train = ModalityMatchingDataset(train_mod1, train_mod2)
79 | dataloader_train = DataLoader(dataset_train, 64, shuffle = True, num_workers = 4)
80 |
81 | dataset_test = ModalityMatchingDataset(test_mod1, test_mod2)
82 | dataloader_test = DataLoader(dataset_test, 32, shuffle = False, num_workers = 4)
83 |
84 | model = ModelRegressionAdt2Gex(134,13953).to(device)
85 | optimizer = torch.optim.Adam(model.parameters(), lr=0.00041, weight_decay=0.0000139)
86 |
87 |
88 | elif mod1 == 'GEX' and mod2 == 'ADT':
89 | dataset_train = ModalityMatchingDataset(train_mod1, train_mod2)
90 | dataloader_train = DataLoader(dataset_train, 32, shuffle = True, num_workers = 8)
91 |
92 | dataset_test = ModalityMatchingDataset(test_mod1, test_mod2)
93 | dataloader_test = DataLoader(dataset_test, 64, shuffle = False, num_workers = 8)
94 |
95 | model = ModelRegressionGex2Adt(256,134).to(device)
96 | optimizer = torch.optim.AdamW(model.parameters(), lr=0.000034609210829678734, weight_decay=0.0009965881574697426)
97 |
98 |
99 | elif mod1 == 'GEX' and mod2 == 'ATAC':
100 | dataset_train = ModalityMatchingDataset(train_mod1, train_mod2)
101 | dataloader_train = DataLoader(dataset_train, 64, shuffle = True, num_workers = 8)
102 |
103 | dataset_test = ModalityMatchingDataset(test_mod1, test_mod2)
104 | dataloader_test = DataLoader(dataset_test, 64, shuffle = False, num_workers = 8)
105 |
106 | model = ModelRegressionGex2Atac(256,10000).to(device)
107 | optimizer = torch.optim.AdamW(model.parameters(), lr=0.00001806762345275399, weight_decay=0.0004084171379280058)
108 |
109 | loss_fn = torch.nn.MSELoss()
110 | train_and_valid(model, optimizer, loss_fn, dataloader_train, dataloader_test, par['output_pretrain'] + '/model.pt', device)
111 |
112 | if mod1 != "ADT":
113 | with open(par['output_pretrain'] + '/lsi_transformer.pickle', 'wb') as f:
114 | pickle.dump(lsi_transformer_gex, f)
115 |
116 | print("End train")
117 |
--------------------------------------------------------------------------------
/src/predict_modality/methods/scJoint/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints/
2 | pretrains_v10/
3 | run/results.py
4 | run/script_v10.5.py
--------------------------------------------------------------------------------
/src/predict_modality/methods/scJoint/README.md:
--------------------------------------------------------------------------------
1 | [scJoint] Predict Modality
2 | ===
3 | Team scJoint: Yu-Hsiu Chen, Sheng Wan, Tung-Yu Wu
4 |
5 | Project URL: https://github.com/itscassie/scJoint-neurips2021-modality-prediction
6 |
7 | This folder contains our training pipeline and script used for the **NeurIPS 2021 Competition - Multimodal Single-Cell Data Integration**, the **Predict Modality** task. Our team **scJoint** took [3rd place of the modality prediction task](https://eval.ai/web/challenges/challenge-page/1111/leaderboard/2860) in terms of the overall ranking of 4 subtasks: namely `GEX to ADT`, `ADT to GEX`, `GEX to ATAC`, and `ATAC to GEX`. Specifically, our methods ranked 3rd in `GEX to ADT` and 4th in `ATAC to GEX`. More details about the training configurations can be found in our project ([link](https://github.com/itscassie/scJoint-neurips2021-modality-prediction)).
8 |
9 | Full documentation for the competition, including dataset, can be found at [openproblems.bio/neurips_docs/](https://openproblems.bio/neurips_docs/).
10 |
--------------------------------------------------------------------------------
/src/predict_modality/methods/scJoint/resources/modules/model_ae.py:
--------------------------------------------------------------------------------
1 | """ autoencoder based models """
2 | import torch
3 | import torch.nn as nn
4 |
5 |
6 | class Encoder(nn.Module):
7 | """base encoder module"""
8 |
9 | def __init__(self, input_dim, out_dim, hidden_dim, dropout=0.2):
10 | super(Encoder, self).__init__()
11 | self.encoder = nn.Sequential(
12 | nn.Dropout(dropout),
13 | nn.Linear(input_dim, hidden_dim),
14 | nn.BatchNorm1d(hidden_dim),
15 | nn.LeakyReLU(0.2),
16 | nn.Linear(hidden_dim, hidden_dim),
17 | nn.BatchNorm1d(hidden_dim),
18 | nn.LeakyReLU(0.2),
19 | nn.Linear(hidden_dim, out_dim),
20 | )
21 |
22 | def forward(self, x_input):
23 | """forward propogation of the encoder arch"""
24 | x_emb = self.encoder(x_input)
25 | return x_emb
26 |
27 |
28 | class Decoder(nn.Module):
29 | """base decoder module"""
30 |
31 | def __init__(self, input_dim, out_dim, hidden_dim):
32 | super(Decoder, self).__init__()
33 | self.decoder = nn.Sequential(
34 | nn.Linear(input_dim, hidden_dim // 2),
35 | nn.BatchNorm1d(hidden_dim // 2),
36 | nn.LeakyReLU(0.2),
37 | nn.Linear(hidden_dim // 2, hidden_dim),
38 | nn.BatchNorm1d(hidden_dim),
39 | nn.LeakyReLU(0.2),
40 | nn.Linear(hidden_dim, out_dim),
41 | nn.ReLU(),
42 | )
43 |
44 | def forward(self, x_emb):
45 | """forward propogation of the decoder arch"""
46 | x_rec = self.decoder(x_emb)
47 | return x_rec
48 |
49 |
50 | class AutoEncoder(nn.Module):
51 | """autoencoder module"""
52 |
53 | def __init__(self, input_dim, out_dim, feat_dim, hidden_dim, dropout=0.2):
54 | super(AutoEncoder, self).__init__()
55 | self.encoder = Encoder(input_dim, feat_dim, hidden_dim, dropout)
56 | self.decoder = Decoder(feat_dim, out_dim, hidden_dim)
57 |
58 | def forward(self, x_input):
59 | """forward propogation of the autoencoder arch"""
60 | x_emb = self.encoder(x_input)
61 | x_rec = self.decoder(x_emb)
62 | return x_rec
63 |
64 |
65 | class BatchClassifier(nn.Module):
66 | """base batch classifier class"""
67 |
68 | def __init__(self, input_dim, cls_num=6, hidden_dim=50):
69 | super(BatchClassifier, self).__init__()
70 | self.classifier = nn.Sequential(
71 | nn.Linear(input_dim, hidden_dim),
72 | nn.BatchNorm1d(hidden_dim),
73 | nn.LeakyReLU(0.2),
74 | nn.Linear(hidden_dim, cls_num),
75 | nn.LeakyReLU(0.2),
76 | )
77 |
78 | def forward(self, x_feat):
79 | """forward propogation of the batch classifier arch"""
80 | return self.classifier(x_feat)
81 |
82 |
83 | class BatchRemovalGAN(nn.Module):
84 | """batch removal module"""
85 |
86 | def __init__(self, input_dim, out_dim, feat_dim, hidden_dim, cls_num=10, dropout=0.2):
87 | super(BatchRemovalGAN, self).__init__()
88 | self.encoder = Encoder(input_dim, feat_dim, hidden_dim, dropout)
89 | self.decoder = Decoder(feat_dim, out_dim, hidden_dim)
90 | self.classifier = BatchClassifier(feat_dim, cls_num=cls_num)
91 |
92 | def forward(self, x_input):
93 | """forward propogation of the batch removal gan arch"""
94 | x_feat = self.encoder(x_input)
95 | x_rec = self.decoder(x_feat)
96 | cls_prob = self.classifier(x_feat)
97 |
98 | return x_rec, cls_prob
99 |
100 |
101 | if __name__ == "__main__":
102 |
103 | bsz = 5
104 | in_d = 10
105 | out_d = 3
106 | feat_d = 2
107 | hid_d = 10
108 |
109 | x1 = torch.randn(bsz, in_d).cuda()
110 |
111 | model = AutoEncoder(in_d, out_d, feat_d, hid_d).cuda().float()
112 | print(model)
113 | output = model(x1)
114 | print(output.shape)
115 |
--------------------------------------------------------------------------------
/src/predict_modality/methods/scJoint/resources/preprocess/save_highlyvar_genes.py:
--------------------------------------------------------------------------------
1 | """ save highly variable using scanpy package """
2 | import os
3 | import argparse
4 | import numpy as np
5 | import anndata as ad
6 | import scanpy as sc
7 | import pandas as pd
8 |
9 | parser = argparse.ArgumentParser()
10 | parser.add_argument(
11 | "-d",
12 | "--data_dir",
13 | type=str,
14 | default="output/datasets/predict_modality",
15 | help="path to dataset directory",
16 | )
17 | parser.add_argument(
18 | "-o",
19 | "--output_dir",
20 | type=str,
21 | default="output/pretrain/predict_modality/scjoint",
22 | help="path to output directory",
23 | )
24 | parser.add_argument(
25 | "-p",
26 | "--phase",
27 | type=str,
28 | default="phase2",
29 | choices=["phase1", "phase1v2", "phase2"],
30 | help="dataset phase",
31 | )
32 |
33 | parser.add_argument(
34 | "-m",
35 | "--mode",
36 | nargs="*",
37 | type=str,
38 | default=["atac2gex"],
39 | help="modes for generating idf matrix",
40 | )
41 |
42 | parser.add_argument("-n", "--n_top", type=int, default=10000, help="returns n top highly variable genes")
43 | args = parser.parse_args()
44 |
45 | # datset path
46 | ADT2GEX_ID = f"openproblems_bmmc_cite_{args.phase}_mod2"
47 | GEX2ADT_ID = f"openproblems_bmmc_cite_{args.phase}_rna"
48 | ATAC2GEX_ID = f"openproblems_bmmc_multiome_{args.phase}_mod2"
49 | GEX2ATAC_ID = f"openproblems_bmmc_multiome_{args.phase}_rna"
50 |
51 | # path to different modes
52 | ADT2GEX_PTH = f"{args.data_dir}/{ADT2GEX_ID}/{ADT2GEX_ID}.censor_dataset"
53 | GEX2ADT_PTH = f"{args.data_dir}/{GEX2ADT_ID}/{GEX2ADT_ID}.censor_dataset"
54 | ATAC2GEX_PTH = f"{args.data_dir}/{ATAC2GEX_ID}/{ATAC2GEX_ID}.censor_dataset"
55 | GEX2ATAC_PTH = f"{args.data_dir}/{GEX2ATAC_ID}/{GEX2ATAC_ID}.censor_dataset"
56 |
57 | ADT2GEX = [
58 | f"{ADT2GEX_PTH}.output_train_mod1.h5ad",
59 | f"{ADT2GEX_PTH}.output_train_mod2.h5ad",
60 | f"{ADT2GEX_PTH}.output_test_mod1.h5ad",
61 | f"{ADT2GEX_PTH}.output_test_mod2.h5ad",
62 | f"{args.output_dir}/adt2gex_train.output_pretrain",
63 | ]
64 |
65 | GEX2ADT = [
66 | f"{GEX2ADT_PTH}.output_train_mod1.h5ad",
67 | f"{GEX2ADT_PTH}.output_train_mod2.h5ad",
68 | f"{GEX2ADT_PTH}.output_test_mod1.h5ad",
69 | f"{GEX2ADT_PTH}.output_test_mod2.h5ad",
70 | f"{args.output_dir}/gex2adt_train.output_pretrain",
71 | ]
72 |
73 | ATAC2GEX = [
74 | f"{ATAC2GEX_PTH}.output_train_mod1.h5ad",
75 | f"{ATAC2GEX_PTH}.output_train_mod2.h5ad",
76 | f"{ATAC2GEX_PTH}.output_test_mod1.h5ad",
77 | f"{ATAC2GEX_PTH}.output_test_mod2.h5ad",
78 | f"{args.output_dir}/atac2gex_train.output_pretrain",
79 | ]
80 |
81 | GEX2ATAC = [
82 | f"{GEX2ATAC_PTH}.output_train_mod1.h5ad",
83 | f"{GEX2ATAC_PTH}.output_train_mod2.h5ad",
84 | f"{GEX2ATAC_PTH}.output_test_mod1.h5ad",
85 | f"{GEX2ATAC_PTH}.output_test_mod2.h5ad",
86 | f"{args.output_dir}/gex2atac_train.output_pretrain",
87 | ]
88 |
89 | MODES = {"adt2gex": ADT2GEX, "gex2adt": GEX2ADT, "atac2gex": ATAC2GEX, "gex2atac": GEX2ATAC}
90 |
91 |
92 | if __name__ == "__main__":
93 | # desired data path
94 | DATAPTH = [MODES[i] for i in args.mode]
95 |
96 | for (i, mode) in enumerate(DATAPTH):
97 | print(f"MODE [{i + 1} / {len(DATAPTH)}]: {args.mode[i]}")
98 | train_mod1_pth = mode[0]
99 | test_mod1_pth = mode[2]
100 | train_mod1 = sc.read_h5ad(train_mod1_pth)
101 | test_mod1 = sc.read_h5ad(test_mod1_pth)
102 |
103 | # concat train/test sets
104 | X_raw = sc.concat(
105 | {"train": train_mod1, "test": test_mod1},
106 | axis=0,
107 | join="outer",
108 | label="group",
109 | fill_value=0,
110 | index_unique="-",
111 | )
112 | print(X_raw.shape)
113 |
114 | # collect highly variable genes
115 | sc.pp.highly_variable_genes(X_raw, n_top_genes=args.n_top)
116 | X_raw = X_raw[:, X_raw.var.highly_variable]
117 |
118 | train_highly = X_raw[: train_mod1.X.shape[0], :]
119 | train_highly = ad.AnnData(
120 | X=train_highly.X,
121 | obs=train_highly.obs,
122 | var=pd.DataFrame({"feature_types": train_mod1.var["feature_types"][X_raw.var_names]}),
123 | uns=train_highly.uns,
124 | layers=train_highly.layers,
125 | )
126 |
127 | test_highly = X_raw[train_mod1.X.shape[0] :, :]
128 | test_highly = ad.AnnData(
129 | X=test_highly.X,
130 | obs=test_highly.obs,
131 | var=pd.DataFrame({"feature_types": test_mod1.var["feature_types"][X_raw.var_names]}),
132 | uns=test_highly.uns,
133 | layers=test_highly.layers,
134 | )
135 | print(train_highly)
136 | print(test_highly)
137 |
138 | # save highly variable indexs
139 | mod1_vars = np.array(train_mod1.var_names)
140 | mod1_highly_idx = [
141 | int(np.where(mod1_vars == np.array(X_raw.var_names[i]))[0])
142 | for i in range(np.array(X_raw.var_names).shape[0])
143 | ]
144 |
145 | file_path = f"{mode[4]}"
146 | os.makedirs(file_path, exist_ok=True)
147 |
148 | with open(f"{file_path}/index_highly{args.n_top}.txt", "w", encoding="utf8") as index_file:
149 | index_file.write(f"index num: {len(mod1_highly_idx)}\n")
150 | for ind in mod1_highly_idx:
151 | index_file.write(str(ind) + "\n")
152 |
153 | print(f"finish saving {file_path}/index_highly{args.n_top}.txt")
154 |
--------------------------------------------------------------------------------
/src/predict_modality/methods/scJoint/resources/preprocess/save_idf_matrix.py:
--------------------------------------------------------------------------------
1 | """ this function save idf matrixs from the dataset """
2 | import os
3 | import argparse
4 | import numpy as np
5 | import anndata as ad
6 |
# Command-line options, declared as (flags, keyword arguments) pairs and
# registered on the parser in this order.
_CLI_OPTIONS = [
    (
        ("-d", "--data_dir"),
        {
            "type": str,
            "default": "output/datasets/predict_modality",
            "help": "path to dataset directory",
        },
    ),
    (
        ("-o", "--output_dir"),
        {
            "type": str,
            "default": "output/pretrain/predict_modality/scjoint",
            "help": "path to output directory",
        },
    ),
    (
        ("-p", "--phase"),
        {
            "default": "phase2",
            "type": str,
            "choices": ["phase1", "phase1v2", "phase2"],
            "help": "dataset phase",
        },
    ),
    (
        ("-m", "--mode"),
        {
            "type": str,
            "nargs": "*",
            "default": ["adt2gex", "gex2adt", "atac2gex", "gex2atac"],
            "help": "modes for generating idf matrix",
        },
    ),
]

parser = argparse.ArgumentParser()
for _flags, _kwargs in _CLI_OPTIONS:
    parser.add_argument(*_flags, **_kwargs)

# parsed at import time; the path constants below are derived from it
args = parser.parse_args()
40 |
# dataset identifiers, one per prediction direction
ADT2GEX_ID = f"openproblems_bmmc_cite_{args.phase}_mod2"
GEX2ADT_ID = f"openproblems_bmmc_cite_{args.phase}_rna"
ATAC2GEX_ID = f"openproblems_bmmc_multiome_{args.phase}_mod2"
GEX2ATAC_ID = f"openproblems_bmmc_multiome_{args.phase}_rna"


def _censor_prefix(dataset_id):
    """Return the common path prefix of the censored files of one dataset."""
    return f"{args.data_dir}/{dataset_id}/{dataset_id}.censor_dataset"


def _mode_files(prefix, mode_name):
    """Return [train_mod1, train_mod2, test_mod1, test_mod2, pretrain_dir]."""
    files = [
        f"{prefix}.output_{split}.h5ad"
        for split in ("train_mod1", "train_mod2", "test_mod1", "test_mod2")
    ]
    files.append(f"{args.output_dir}/{mode_name}_train.output_pretrain")
    return files


# path prefixes of the different modes
ADT2GEX_PTH = _censor_prefix(ADT2GEX_ID)
GEX2ADT_PTH = _censor_prefix(GEX2ADT_ID)
ATAC2GEX_PTH = _censor_prefix(ATAC2GEX_ID)
GEX2ATAC_PTH = _censor_prefix(GEX2ATAC_ID)

# per-mode file lists; index 4 is the output directory for pretrain files
ADT2GEX = _mode_files(ADT2GEX_PTH, "adt2gex")
GEX2ADT = _mode_files(GEX2ADT_PTH, "gex2adt")
ATAC2GEX = _mode_files(ATAC2GEX_PTH, "atac2gex")
GEX2ATAC = _mode_files(GEX2ATAC_PTH, "gex2atac")

# lookup from the --mode names to their file lists
MODES = {
    "adt2gex": ADT2GEX,
    "gex2adt": GEX2ADT,
    "atac2gex": ATAC2GEX,
    "gex2atac": GEX2ATAC,
}
86 |
87 |
def idf_matrix(x_raw):
    """Return the per-column inverse document frequency of a count array.

    Rows of ``x_raw`` are treated as documents and columns as terms. For
    every column the value is
    ``log(n_rows / (number_of_rows_with_a_positive_entry + 1))``.

    Args:
        x_raw: dense numpy array of counts, rows first.

    Returns:
        ``np.single`` array with the row axis reduced to length 1, i.e.
        shape ``(1, n_columns)`` for a 2-D input. Despite the function
        name this is a single row of idf values, not a full matrix.
    """
    # Count the positive entries straight from the boolean mask. This avoids
    # allocating a zero matrix plus a float32 copy of the whole input, which
    # is only needed to obtain one number per column.
    doc_freq = np.count_nonzero(x_raw > 0, axis=0, keepdims=True).astype(np.single)
    # +1 keeps the denominator non-zero for columns without positive entries
    return np.log(x_raw.shape[0] / (doc_freq + 1))
94 |
95 |
if __name__ == "__main__":
    # file lists of the requested modes, in command-line order
    DATAPTH = [MODES[i] for i in args.mode]
    n_modes = len(DATAPTH)

    for step, (mode_name, mode) in enumerate(zip(args.mode, DATAPTH), start=1):
        print(f"MODE [{step} / {n_modes}]: {mode_name}")

        # entry 0 is the mod1 training file, entry 4 the output directory
        train_mod1_pth, file_path = mode[0], f"{mode[4]}"

        train_mod1 = ad.read_h5ad(train_mod1_pth)
        # densify the raw counts before computing the idf values
        x_raw_matrix = train_mod1.layers["counts"].toarray().astype(np.float16)
        print(f"train data shape: {x_raw_matrix.shape}")

        x_idf_matrix = idf_matrix(x_raw_matrix)
        print(f"idf matrix shape: {x_idf_matrix.shape}")

        print(f"output dir: {file_path}")
        os.makedirs(file_path, exist_ok=True)

        idf_file = f"{file_path}/mod1_idf.npy"
        np.save(idf_file, x_idf_matrix)
        print(f"finish saving {idf_file}")
117 |
--------------------------------------------------------------------------------
/src/predict_modality/methods/scJoint/resources/train.py:
--------------------------------------------------------------------------------
1 | """ main training process """
2 | import os
3 | import logging
4 | import argparse
5 | from datetime import datetime
6 |
7 | from trainer.trainer_nn import TrainProcess as TrainProcess_NN
8 | from trainer.trainer_cycle import TrainProcess as TrainProcess_Cycle
9 | from trainer.trainer_batchgan import TrainProcess as TrainProcess_BATCHGAN
10 |
11 | from opts import DATASET, model_opts
12 | from utils.dataloader import get_data_dim
13 |
if __name__ == "__main__":
    # First pass: parse only the options registered by model_opts and ignore
    # anything else on the command line.
    parser = argparse.ArgumentParser(add_help=False)
    model_opts(parser)
    args = parser.parse_known_args()[0]

    # phases and message shared by the two gene-activity checks below
    GA_PHASES = ["phase1v2", "phase2"]
    GA_MESSAGE = "gene activity mode support only atac2gex mode (p1v2 or p2)"

    # exp name for train log, weights, model
    if args.train == "train":
        TIME_NOW = datetime.now().strftime("%b%d-%H-%M")
        exp_name = f"{args.arch}_{args.mode}"
        if args.selection:
            assert args.mod1_idx_path is not None, "need to specified --mod1_idx_path"
            # keep the file stem without its "index_" prefix,
            # e.g. ".../index_highly1000.txt" -> "highly1000"
            SELECT_NUM = args.mod1_idx_path.split("/")[-1].replace(".txt", "").replace("index_", "")
            exp_name += f"_select{SELECT_NUM}"

        if args.tfidf != 0:
            assert args.idf_path is not None, "need to specified --idf_path"
            assert not args.gene_activity, "support either ga or tfidf != 0"
            if args.tfidf == 1:
                exp_name += "_tfidf"
            elif args.tfidf == 2:
                exp_name += "_tfidfconcat"
            elif args.tfidf == 3:
                exp_name += "_tfidfconcatga"
                assert args.mode == "atac2gex" and args.phase in GA_PHASES, GA_MESSAGE
        elif args.gene_activity:
            exp_name += "_ga"
            assert args.mode == "atac2gex" and args.phase in GA_PHASES, GA_MESSAGE
        if args.norm:
            exp_name += "_norm"
        if args.dropout != 0.2:
            # only non-default dropout values are encoded in the name
            exp_name += f"_dropout{args.dropout}"
        if args.name != "":
            exp_name += f"_{args.name}"
        else:
            # no explicit name given: fall back to the start timestamp
            exp_name += f"_{TIME_NOW}"

    # exp name for eval log file
    elif args.train == "eval":
        assert args.checkpoint is not None, "need to specified --checkpoint"
        exp_name = args.checkpoint.split("/")[-1].replace(".pt", "")
        exp_name += f"_{args.phase}"

    else:
        # Without this branch exp_name stays unbound and the failure only
        # shows up later as a NameError.
        raise ValueError(f"unsupported --train value: {args.train!r} (expected 'train' or 'eval')")

    # loggings and logs
    if args.dryrun:
        # dry run: log to the console only and create no directories
        handlers = [logging.StreamHandler()]
    else:
        os.makedirs(f"{args.output_dir}/logs/", exist_ok=True)
        os.makedirs(f"{DATASET[args.mode]['weight_dir']}", exist_ok=True)
        handlers = [
            logging.FileHandler(f"{args.output_dir}/logs/{args.train}_{exp_name}.log", mode="w"),
            logging.StreamHandler(),
        ]

    logging.basicConfig(level=logging.DEBUG, format="%(message)s", handlers=handlers)

    # load data
    MOD1_DIM, MOD2_DIM = get_data_dim(DATASET[args.mode], args)

    # Second pass: expose the derived values as defaults of extra options so
    # that they travel inside args; this parse is strict about unknown options.
    parser.add_argument("--mod1_dim", default=MOD1_DIM)
    parser.add_argument("--mod2_dim", default=MOD2_DIM)
    parser.add_argument("--exp_name", default=exp_name)
    args = parser.parse_args()

    logging.info("\nArgument:")
    for arg, value in vars(args).items():
        logging.info(f"{arg:20s}: {value}")
    logging.info("\n")

    # trainer
    TRAINERS = {
        "nn": TrainProcess_NN,
        "cycle": TrainProcess_Cycle,
        "batchgan": TrainProcess_BATCHGAN,
    }
    if args.arch not in TRAINERS:
        # previously an unknown arch left `trainer` unbound (NameError below)
        raise ValueError(f"unsupported --arch value: {args.arch!r} (expected one of {sorted(TRAINERS)})")
    trainer = TRAINERS[args.arch](args)

    if args.train == "train":
        trainer.run()
        trainer.eval()

    elif args.train == "eval":
        trainer.load_checkpoint()
        trainer.eval()
104 |
--------------------------------------------------------------------------------
/src/predict_modality/methods/scJoint/resources/trainer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/src/predict_modality/methods/scJoint/resources/trainer/__init__.py
--------------------------------------------------------------------------------
/src/predict_modality/methods/scJoint/resources/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/src/predict_modality/methods/scJoint/resources/utils/__init__.py
--------------------------------------------------------------------------------
/src/predict_modality/methods/scJoint/resources/utils/loss.py:
--------------------------------------------------------------------------------
1 | """ define custom loss functions in this file """
2 | import torch
3 | import torch.nn as nn
4 |
5 |
def cosine_sim(arr_1, arr_2):
    """Return the pairwise cosine similarity between the rows of two tensors.

    Args:
        arr_1: floating-point tensor whose rows (dim 1 holds the features)
            are the first set of vectors.
        arr_2: floating-point tensor with the same feature size as ``arr_1``.

    Returns:
        Tensor ``sim`` where ``sim[i, j]`` is the cosine similarity between
        row ``i`` of ``arr_1`` and row ``j`` of ``arr_2``.
    """
    # normalize() divides by max(norm, eps), so an all-zero row stays zero
    # (similarity 0) instead of becoming NaN through a 0 / 0 division and
    # propagating into every loss built on top of this function.
    arr_1 = nn.functional.normalize(arr_1, p=2, dim=1)
    arr_2 = nn.functional.normalize(arr_2, p=2, dim=1)
    sim = torch.matmul(arr_1, torch.transpose(arr_2, 0, 1))

    return sim
13 |
14 |
class CosineLoss(nn.Module):
    """Loss built from the cosine similarity of embeddings and their residuals."""

    def __init__(self):
        super().__init__()

    def forward(self, emb1, emb2, emb1_resid, emb2_resid):
        """Return mean(|cos(emb1, emb1_resid) + cos(emb2, emb2_resid)|).

        Only ``emb1`` and ``emb2`` are cast to float32; the residual tensors
        are used as passed in.
        """
        emb1 = emb1.float()
        emb2 = emb2.float()
        # the two pairwise similarity matrices are summed before abs / mean
        total_sim = cosine_sim(emb1, emb1_resid) + cosine_sim(emb2, emb2_resid)
        return torch.mean(torch.abs(total_sim))
27 |
28 |
class L1regularization(nn.Module):
    """L1 penalty over all parameters of a model."""

    def __init__(self, weight_decay=0.1):
        super().__init__()
        # factor applied to the mean absolute value of every parameter tensor
        self.weight_decay = weight_decay

    def forward(self, model):
        """Return the sum over parameters of mean(|param|) * weight_decay.

        The result is the float ``0.0`` when the model has no parameters.
        """
        return sum(
            (torch.mean(weights.abs()) * self.weight_decay for weights in model.parameters()),
            0.0,
        )
42 |
--------------------------------------------------------------------------------
/src/predict_modality/methods/scJoint/resources/utils/metric.py:
--------------------------------------------------------------------------------
1 | """ calculate metrics """
2 |
3 | import numpy as np
4 |
def rmse(mod2_sol, mod2_pred):
    """Return the root mean squared error between solution and prediction.

    Args:
        mod2_sol: ground-truth values, SciPy sparse matrix or dense array.
        mod2_pred: predicted values with the same shape as ``mod2_sol``,
            SciPy sparse matrix or dense array.

    Returns:
        ``sqrt(mean((mod2_sol - mod2_pred) ** 2))`` over all entries.
    """
    diff = mod2_sol - mod2_pred
    if hasattr(diff, "power"):
        # sparse result: element-wise power provided by SciPy sparse matrices
        mean_sq = diff.power(2).mean()
    else:
        # Dense result (ndarray, or np.matrix from a sparse/dense mix) has no
        # .power method; square element-wise instead.
        mean_sq = np.mean(np.square(np.asarray(diff)), dtype=np.float64)
    rmse_out = np.sqrt(mean_sq)
    return rmse_out
13 |
--------------------------------------------------------------------------------
/src/predict_modality/methods/scJoint/run/config.vsh.yaml:
--------------------------------------------------------------------------------
1 | functionality:
2 | name: scjoint
3 | namespace: predict_modality_methods
4 |
5 | description: An ensemble method including pca, nn, feature extraction.
6 |
7 | info:
8 | method_label: "scJoint"
9 | submission_id: "171135"
10 | team_name: scJoint
11 |
12 | authors:
13 | - name: Yu-Hsiu Chen
14 | email: yhchen.cm06g@nctu.edu.tw
15 | roles: [ author, maintainer ]
16 | props: { github: itscassie }
17 | - name: Sheng Wan
18 | email: a5736735a.eecs99@g2.nctu.edu.tw
19 | - name: Tung-Yu Wu
20 | email: wtywty@gmail.com
21 |
22 | # parameters
23 | arguments:
24 | # required inputs
25 | - name: "--input_train_mod1"
26 | type: "file"
27 | example: "dataset_mod1.h5ad"
28 | description: Censored dataset, training cells.
29 | required: true
30 | - name: "--input_test_mod1"
31 | type: "file"
32 | example: "dataset_mod1.h5ad"
33 | description: Censored dataset, test cells.
34 | required: true
35 | - name: "--input_train_mod2"
36 | type: "file"
37 | example: "dataset_mod2.h5ad"
38 | description: Censored dataset.
39 | required: true
40 | - name: "--input_pretrain"
41 | type: "file"
42 | example: "pretrain_model"
43 | description: Path to the directory containing a pretrained model.
44 | required: true
45 | # required outputs
46 | - name: "--output"
47 | type: "file"
48 | direction: "output"
49 | example: "output.h5ad"
50 | description: Dataset with predicted values for modality2.
51 | required: true
52 |
53 | # files your script needs
54 | resources:
55 | - type: python_script
56 | path: script.py
57 | # includes all code under resources/
58 | - path: ../resources
59 |
60 | # target platforms
61 | platforms:
62 | - type: docker
63 | image: "pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime"
64 | run_args: [ "--gpus all" ]
65 | setup:
66 | - type: python
67 | packages:
68 | - scikit-learn
69 | - anndata
70 | - scanpy
71 | - numpy
72 |
73 | - type: nextflow
74 | labels: [ highmem, hightime, midcpu, gpu ]
75 |
--------------------------------------------------------------------------------
/src/predict_modality/methods/scJoint/test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
# pipeline used by the evaluation step at the end of this script
export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash"
export NXF_VER=21.04.1
export PIPELINE_VERSION=1.4.0
method_id=scjoint
task_id=predict_modality
pretrain_path=output/pretrain/$task_id/$method_id

banner_rule="######################################################################"

# Print an empty line followed by a three-line banner; $1 is the middle line.
print_banner() {
  echo ""
  echo "$banner_rule"
  echo "$1"
  echo "$banner_rule"
}

# Run the prediction component on one dataset.
#   $1: label echoed before the run
#   $2: dataset id
#   $3: direction prefix of the pretrain directory (e.g. gex2adt)
predict_dataset() {
  echo ""
  echo "$1"
  dataset_id=$2
  dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
  pred_path=output/predictions/$task_id/$dataset_id/$dataset_id

  target/docker/${task_id}_methods/${method_id}/${method_id} \
    --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \
    --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \
    --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \
    --input_pretrain "${pretrain_path}/${3}_train.output_pretrain/" \
    --output ${pred_path}.${method_id}.output.h5ad
}

# GENERATE PRETRAIN
print_banner "## Generating pretrain weights/files ##"

target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \
  --data_dir output/datasets/$task_id \
  --output_pretrain ${pretrain_path}

# GENERATE PREDICTIONS (one run per dataset / direction)
print_banner "## Generating prediction files ##"

predict_dataset "CITE GEX to ADT" openproblems_bmmc_cite_phase2_rna gex2adt
predict_dataset "CITE ADT to GEX" openproblems_bmmc_cite_phase2_mod2 adt2gex
predict_dataset "MULTIOME GEX to ATAC" openproblems_bmmc_multiome_phase2_rna gex2atac
predict_dataset "MULTIOME ATAC to GEX" openproblems_bmmc_multiome_phase2_mod2 atac2gex

# RUN EVALUATION
print_banner "## Evaluating predictions ##"
bin/nextflow run "$PIPELINE_REPO" \
  -r "$PIPELINE_VERSION" \
  -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \
  --solutionDir "output/datasets/$task_id" \
  --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \
  --publishDir "output/evaluation/$task_id/$method_id/" \
  -latest \
  -resume \
  -c "src/resources/nextflow_moremem.config"

# SHOW SCORES
print_banner "## Evaluation summary ##"
cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json"
--------------------------------------------------------------------------------
/src/predict_modality/methods/scJoint/train/config.vsh.yaml:
--------------------------------------------------------------------------------
1 | functionality:
2 | name: scjoint_train
3 | namespace: predict_modality_methods
4 |
5 | # metadata for your method
6 | version: dev
7 | description: An ensemble method including pca, nn, feature extraction.
8 | authors:
9 | - name: Yu-Hsiu Chen
10 | email: yhchen.cm06g@nctu.edu.tw
11 | roles: [ author, maintainer ]
12 | - name: Sheng Wan
13 | email: a5736735a.eecs99@g2.nctu.edu.tw
14 | - name: Tung-Yu Wu
15 | email: wtywty@gmail.com
16 |
17 | # parameters
18 | arguments:
19 | # required inputs
20 | - name: "--data_dir"
21 | type: "file"
22 | description: The path to the predict_modality datasets
23 | required: true
24 |
25 | # required outputs
26 | - name: "--output_pretrain"
27 | type: "file"
28 | direction: "output"
29 | example: "pretrain_model"
30 | description: Path to the directory containing the pretrained models.
31 | required: true
32 |
33 | # files your script needs
34 | resources:
35 | - type: bash_script
36 | path: train.sh
37 | # includes all code under resources/
38 | - path: ../resources
39 |
40 | # target platforms
41 | platforms:
42 |
43 | - type: docker
44 | image: "pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime"
45 | run_args: [ "--gpus all" ]
46 | setup:
47 | - type: python
48 | packages:
49 | - scikit-learn
50 | - anndata
51 | - scanpy
52 | - numpy
53 |
54 | - type: nextflow
55 | labels: [ highmem, hightime, midcpu, gpu ]
56 |
--------------------------------------------------------------------------------
/src/resources/nextflow.config:
--------------------------------------------------------------------------------
1 | includeConfig "${launchDir}/target/nextflow/nextflow.config"
2 |
3 | process { // resource settings applied to processes by label
4 | withLabel: lowcpu { cpus = 4 }
5 | withLabel: midcpu { cpus = 4 } // same value as lowcpu
6 | withLabel: highcpu { cpus = 15 }
7 | withLabel: vhighcpu { cpus = 30 }
8 | withLabel: lowmem { memory = 60.GB }
9 | withLabel: midmem { memory = 60.GB } // same value as lowmem
10 | withLabel: highmem { memory = 110.GB }
11 | withLabel: vhighmem { memory = 110.GB } // same value as highmem
12 | withLabel: lowtime { time = "20m" }
13 | withLabel: midtime { time = "40m" }
14 | withLabel: hightime { time = "60m" }
15 | withLabel: vhightime { time = "120m" }
16 | withLabel: vvhightime { time = "360m" }
17 | withLabel: gpu { maxForks = 1; containerOptions = '--gpus all' } // one gpu-labelled task at a time; '--gpus all' is passed to the container engine
18 | }
19 |
20 | def viash_temp = System.getenv("VIASH_TEMP") ?: "/tmp/" // falls back to "/tmp/" when VIASH_TEMP is unset or empty
21 | docker.runOptions = "-v ${launchDir}/target/nextflow:${launchDir}/target/nextflow -v $viash_temp:$viash_temp --shm-size=5G --net none" // mounts target/nextflow and the temp dir at identical paths in the container; 5G shared memory; '--net none' disables container networking
22 |
--------------------------------------------------------------------------------
/src/resources/nextflow_moremem.config:
--------------------------------------------------------------------------------
1 | process { // resource settings applied to processes by label
2 | withLabel: lowcpu { cpus = 4 }
3 | withLabel: midcpu { cpus = 4 } // same value as lowcpu
4 | withLabel: highcpu { cpus = 15 }
5 | withLabel: vhighcpu { cpus = 30 }
6 | withLabel: lowmem { memory = 60.GB }
7 | withLabel: midmem { memory = 60.GB } // same value as lowmem
8 | withLabel: highmem { memory = 110.GB }
9 | withLabel: vhighmem { memory = 110.GB } // same value as highmem
10 | withLabel: lowtime { time = "20m" }
11 | withLabel: midtime { time = "40m" }
12 | withLabel: hightime { time = "60m" }
13 | withLabel: vhightime { time = "120m" }
14 | withLabel: vvhightime { time = "360m" }
15 | withLabel: gpu { maxForks = 1; containerOptions = '--gpus all' } // one gpu-labelled task at a time; '--gpus all' is passed to the container engine
16 | }
17 |
--------------------------------------------------------------------------------
/src/sync_datasets.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
# Run `aws s3 <cmd> --no-sign-request <source> <dest>`.
#   $1: s3 sub-command (e.g. sync)
#   $2: source location
#   $3: destination location
# Uses the local aws cli when it is installed and the amazon/aws-cli docker
# image otherwise. In the docker case ./output is mounted at /output and the
# working directory is /, so a relative "output/..." path refers to the
# mounted directory.
function aws_s3 {
  # locals keep the arguments from leaking into the caller's environment
  local cmd="$1"
  local src="$2"
  local dest="$3"
  # use aws cli if installed
  if command -v aws &> /dev/null; then
    aws s3 "$cmd" --no-sign-request "$src" "$dest"
  # else use aws docker container instead
  else
    # `docker run -it` aborts when stdin is not a terminal (CI, cron, pipes),
    # so only request an interactive tty when one is actually attached
    local tty_flags=()
    if [ -t 0 ] && [ -t 1 ]; then
      tty_flags=(-it)
    fi
    # quoting keeps working directories that contain spaces intact
    docker run \
      --user "$(id -u):$(id -g)" \
      --rm "${tty_flags[@]}" \
      -v "$(pwd)/output:/output" \
      -w / \
      amazon/aws-cli \
      s3 "$cmd" --no-sign-request "$src" "$dest"
  fi
}
21 |
# "<s3 source> <local destination>" pairs, synced in this order.
# The full phase2-data sync is disabled; only its joint_embedding part is fetched:
# aws_s3 sync "s3://openproblems-bio/public/phase2-data/" "output/datasets_phase2_public"
sync_jobs=(
  "s3://openproblems-bio/public/phase1-data/ output/datasets_phase1"
  "s3://openproblems-bio/public/phase1v2-data/ output/datasets_phase1v2"
  "s3://openproblems-bio/public/phase2-data/joint_embedding/ output/datasets_phase2_public/joint_embedding"
  "s3://openproblems-bio/public/phase2-private-data/ output/datasets"
  "s3://openproblems-bio/public/explore/ output/datasets_explore"
)

for job in "${sync_jobs[@]}"; do
  # split the pair at its single space: source before it, destination after it
  aws_s3 sync "${job%% *}" "${job#* }"
done
--------------------------------------------------------------------------------