├── .gitignore ├── LICENSE ├── README.Rmd ├── README.md ├── bin ├── .gitignore ├── README.md └── init ├── resources ├── github_mark.svg └── orcid_id.svg ├── sample_data └── predict_modality │ ├── openproblems_bmmc_cite_starter │ ├── openproblems_bmmc_cite_starter.test_mod1.h5ad │ ├── openproblems_bmmc_cite_starter.test_mod2.h5ad │ ├── openproblems_bmmc_cite_starter.train_mod1.h5ad │ └── openproblems_bmmc_cite_starter.train_mod2.h5ad │ └── openproblems_bmmc_multiome_starter │ ├── openproblems_bmmc_multiome_starter.test_mod1.h5ad │ ├── openproblems_bmmc_multiome_starter.test_mod2.h5ad │ ├── openproblems_bmmc_multiome_starter.train_mod1.h5ad │ └── openproblems_bmmc_multiome_starter.train_mod2.h5ad └── src ├── joint_embedding └── methods │ ├── Guanlab-dengkw │ ├── run │ │ ├── config.vsh.yaml │ │ └── script.py │ └── test.sh │ ├── jae │ ├── README.md │ ├── model_architecture.png │ ├── resources │ │ └── utils.py │ ├── run │ │ ├── config.vsh.yaml │ │ └── script.py │ ├── test.sh │ └── train │ │ ├── config.vsh.yaml │ │ └── script.py │ └── lsl_ae │ ├── run │ ├── config.vsh.yaml │ └── script.py │ └── test.sh ├── match_modality └── methods │ ├── clue │ ├── README.md │ ├── clue_architecture.jpg │ ├── resources │ │ ├── scglue-0.1.1-py3-none-any.whl │ │ └── utils.py │ ├── run │ │ ├── config.vsh.yaml │ │ └── script.py │ ├── test.sh │ └── train │ │ ├── config.vsh.yaml │ │ └── script.py │ └── novel │ ├── README.md │ ├── novel_architecture1.png │ ├── novel_architecture2.png │ ├── resources │ ├── catalyst_tools.py │ ├── config_ADT2GEX.py │ ├── config_ATAC2GEX.py │ ├── data.py │ ├── models.py │ ├── postprocessing.py │ └── preprocessing.py │ ├── run │ ├── config.vsh.yaml │ └── script.py │ ├── test.sh │ └── train │ ├── config.vsh.yaml │ └── script.py ├── predict_modality └── methods │ ├── AXX │ ├── .gitignore │ ├── README.md │ ├── resources │ │ ├── const.py │ │ ├── models.py │ │ ├── predict.py │ │ ├── test.py │ │ ├── train.py │ │ ├── utils.py │ │ └── yaml │ │ │ ├── mlp_ADT2GEX.yaml │ │ │ 
├── mlp_ATAC2GEX.yaml │ │ │ └── mlp_GEX2ADT.yaml │ ├── run │ │ ├── config.vsh.yaml │ │ └── script.py │ ├── test.sh │ └── train │ │ ├── config.vsh.yaml │ │ └── script.py │ ├── DANCE │ ├── resources │ │ ├── baseline.py │ │ └── graph_util.py │ ├── run │ │ ├── config.vsh.yaml │ │ └── script.py │ ├── test.sh │ └── train │ │ ├── config.vsh.yaml │ │ ├── generate_extra_files.py │ │ ├── h.all.v7.4.entrez.gmt │ │ ├── h.all.v7.4.symbols.gmt │ │ ├── hetero_arg_version_v5.py │ │ └── script.sh │ ├── Guanlab-dengkw │ ├── run │ │ ├── config.vsh.yaml │ │ └── script.py │ └── test.sh │ ├── LS_lab │ ├── run │ │ ├── config.vsh.yaml │ │ └── script.py │ └── test.sh │ ├── cajal │ ├── run │ │ ├── config.vsh.yaml │ │ └── script.py │ ├── test.sh │ └── train │ │ ├── ADT_list_df_updated.csv │ │ ├── config.vsh.yaml │ │ └── script.py │ ├── novel │ ├── README.md │ ├── novel_architecture.jpg │ ├── resources │ │ └── helper_functions.py │ ├── run │ │ ├── config.vsh.yaml │ │ └── script.py │ ├── test.sh │ └── train │ │ ├── config.vsh.yaml │ │ └── script.py │ └── scJoint │ ├── .gitignore │ ├── README.md │ ├── resources │ ├── modules │ │ └── model_ae.py │ ├── opts.py │ ├── preprocess │ │ ├── save_filter_genes.py │ │ ├── save_highlyvar_genes.py │ │ └── save_idf_matrix.py │ ├── train.py │ ├── trainer │ │ ├── __init__.py │ │ ├── trainer_batchgan.py │ │ ├── trainer_cycle.py │ │ └── trainer_nn.py │ └── utils │ │ ├── __init__.py │ │ ├── dataloader.py │ │ ├── loss.py │ │ └── metric.py │ ├── run │ ├── config.vsh.yaml │ └── script.py │ ├── test.sh │ └── train │ ├── config.vsh.yaml │ └── train.sh ├── resources ├── nextflow.config └── nextflow_moremem.config └── sync_datasets.sh /.gitignore: -------------------------------------------------------------------------------- 1 | output 2 | *.pyc 3 | target 4 | work 5 | .nextflow* 6 | log.txt 7 | README.html 8 | bin/build_for_release.sh 9 | -------------------------------------------------------------------------------- /LICENSE: 
#!/bin/bash
#
# Bootstrap the local toolchain for this repository: verifies that Java and
# Docker are available, then installs the viash executables into bin/ and
# downloads the nextflow launcher into bin/.
#
# Exits with a non-zero status when a prerequisite is missing or when the
# repository root cannot be determined, so callers and CI can detect failure.

# Check Java installed
if ! command -v java &> /dev/null
then
  echo "Please ensure Java Runtime ≥8 is installed. You can find an open source installer here: https://adoptopenjdk.net/?variant=openjdk8&jvmVariant=hotspot"
  exit 1
fi

# Check Docker installed
if ! command -v docker &> /dev/null
then
  echo "Please ensure Docker is installed and up-to-date. Instructions at https://www.docker.com/get-started"
  exit 1
fi

# get the root of the directory
if ! REPO_ROOT=$(git rev-parse --show-toplevel)
then
  echo "Could not determine the repository root; please run this script from inside the git repository."
  exit 1
fi

# ensure that the command below is run from the root of the repository
cd "$REPO_ROOT" || exit 1

# NOTE(review): the viash installer is fetched over plain HTTP and piped
# straight into bash; consider switching to HTTPS once confirmed that the
# endpoint serves it.
curl -fsSL http://get.viash.io | bash -s -- \
  --viash bin/viash \
  --registry openproblems \
  --tag 0.5.5 \
  --log check_results/results.tsv \
  --config_mod '.platforms[.type == "nextflow"].separate_multiple_outputs := false' \
  --config_mod '.platforms[.type == "nextflow"].directive_memory := "10GB"' \
  --config_mod '.platforms[.type == "nextflow"].directive_time := "10 m"'

# the nextflow installer drops its launcher into the current directory
cd bin || exit 1

# -f makes curl fail on HTTP errors instead of piping an error page into bash
curl -fsSL https://get.nextflow.io | bash
https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/sample_data/predict_modality/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.test_mod1.h5ad -------------------------------------------------------------------------------- /sample_data/predict_modality/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.test_mod2.h5ad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/sample_data/predict_modality/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.test_mod2.h5ad -------------------------------------------------------------------------------- /sample_data/predict_modality/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.train_mod1.h5ad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/sample_data/predict_modality/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.train_mod1.h5ad -------------------------------------------------------------------------------- /sample_data/predict_modality/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.train_mod2.h5ad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/sample_data/predict_modality/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.train_mod2.h5ad -------------------------------------------------------------------------------- /sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod1.h5ad: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod1.h5ad -------------------------------------------------------------------------------- /sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod2.h5ad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod2.h5ad -------------------------------------------------------------------------------- /sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod1.h5ad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod1.h5ad -------------------------------------------------------------------------------- /sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod2.h5ad: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod2.h5ad -------------------------------------------------------------------------------- 
/src/joint_embedding/methods/Guanlab-dengkw/run/config.vsh.yaml: -------------------------------------------------------------------------------- 1 | functionality: 2 | name: guanlab_dengkw_je 3 | namespace: joint_embedding_methods 4 | 5 | # metadata for your method 6 | description: A description for your method. 7 | info: 8 | method_label: "Guanlab-dengkw" 9 | submission_id: "170795" 10 | team_name: Guanlab-dengkw 11 | # project_url: https://github.com/foo/bar 12 | # publication_doi: 10.1101/0123.45.67.890123 13 | # publication_url: https://arxiv.org/abs/1234.56789 14 | 15 | authors: 16 | - name: Kaiwen Deng 17 | email: dengkw@umich.edu 18 | roles: [ author, maintainer ] 19 | props: { github: nonztalk } 20 | 21 | # parameters 22 | arguments: 23 | # required inputs 24 | - name: "--input_mod1" 25 | type: "file" 26 | example: "dataset_mod1.h5ad" 27 | description: Modality 1 dataset. 28 | required: true 29 | - name: "--input_mod2" 30 | type: "file" 31 | example: "dataset_mod2.h5ad" 32 | description: Modality 2 dataset. 33 | required: true 34 | # required outputs 35 | - name: "--output" 36 | type: "file" 37 | direction: "output" 38 | example: "output.h5ad" 39 | description: Data for all cells in mod1 and mod2 embedded to ≤100 dimensions. 
"""Joint embedding baseline: per-modality truncated SVD, row-normalised and concatenated.

Reads two modality `h5ad` files, reduces each with TruncatedSVD (the number of
components depends on the modality pair and always sums to 100), standardises
every cell's component vector, concatenates both blocks and writes the result
as an AnnData `h5ad` file.
"""
import logging
import anndata as ad
import numpy as np

from sklearn.decomposition import TruncatedSVD

logging.basicConfig(level=logging.INFO)

## VIASH START
dataset_path = "output/datasets/joint_embedding/openproblems_bmmc_cite_phase2/openproblems_bmmc_cite_phase2.censor_dataset.output_"

par = {
    'input_mod1': f'{dataset_path}mod1.h5ad',
    'input_mod2': f'{dataset_path}mod2.h5ad',
    'output': 'output.h5ad'
}
meta = {
    'resources_dir': '.',
    'functionality_name': 'submission_170795'
}
## VIASH END

# (mod1 type, mod2 type) -> (components for mod1, components for mod2).
# Every pair sums to 100 components.
COMPONENTS_BY_MODALITY = {
    ("GEX", "ADT"): (73, 27),
    ("ADT", "GEX"): (27, 73),
    ("GEX", "ATAC"): (38, 62),
    ("ATAC", "GEX"): (62, 38),
}
# Used for any modality pair not listed above.
DEFAULT_COMPONENTS = (50, 50)


def normalize(arr):
    """Standardise each row of `arr` to zero mean and unit standard deviation.

    Rows whose standard deviation is exactly zero are only centred (they
    become all zeros) instead of being divided by zero, which would
    otherwise put NaN/inf values into the embedding.

    Args:
        arr: 2-D array; rows are normalised independently (axis=1).

    Returns:
        Array of the same shape as `arr`.
    """
    arr_mean = np.mean(arr, axis=1, keepdims=True)
    arr_sd = np.std(arr, axis=1, keepdims=True)
    arr_sd[arr_sd == 0] = 1.0
    return (arr - arr_mean) / arr_sd


logging.info('Reading `h5ad` files...')
ad_mod1 = ad.read_h5ad(par['input_mod1'])
ad_mod2 = ad.read_h5ad(par['input_mod2'])

logging.info('Determine parameters by the modalities')
# `.iloc[0]` selects the first feature by position explicitly; plain `[0]`
# on a Series relies on a positional fallback that pandas has deprecated.
mod1_type = ad_mod1.var["feature_types"].iloc[0].upper()
mod2_type = ad_mod2.var["feature_types"].iloc[0].upper()

n_mod1, n_mod2 = COMPONENTS_BY_MODALITY.get((mod1_type, mod2_type), DEFAULT_COMPONENTS)

logging.info('Performing dimensionality reduction on modality 1 values...')
embedder_mod1 = TruncatedSVD(n_components=n_mod1)
mod1_pca = embedder_mod1.fit_transform(ad_mod1.X)
# keep only what is needed for the output, then release the modality-1 object
mod1_obs = ad_mod1.obs
mod1_uns = ad_mod1.uns
del ad_mod1

logging.info('Performing dimensionality reduction on modality 2 values...')
embedder_mod2 = TruncatedSVD(n_components=n_mod2)
mod2_pca = embedder_mod2.fit_transform(ad_mod2.X)
del ad_mod2

logging.info('Concatenating datasets')
pca_combined = np.concatenate([normalize(mod1_pca), normalize(mod2_pca)], axis=1)

logging.info('Storing output to file')
adata = ad.AnnData(
    X=pca_combined,
    obs=mod1_obs,
    uns={
        'dataset_id': mod1_uns['dataset_id'],
        'method_id': meta['functionality_name'],
    },
)
adata.write_h5ad(par['output'], compression="gzip")
dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 22 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 23 | 24 | target/docker/${task_id}_methods/${method_id}/${method_id} \ 25 | --input_mod1 ${dataset_path}.output_mod1.h5ad \ 26 | --input_mod2 ${dataset_path}.output_mod2.h5ad \ 27 | --output ${pred_path}.${method_id}.output.h5ad 28 | 29 | 30 | # RUN EVALUATION 31 | bin/nextflow run "$PIPELINE_REPO" \ 32 | -r "$PIPELINE_VERSION" \ 33 | -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \ 34 | --solutionDir "output/datasets/$task_id" \ 35 | --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \ 36 | --publishDir "output/evaluation/$task_id/$method_id/" \ 37 | -latest \ 38 | -resume \ 39 | -c "src/resources/nextflow_moremem.config" 40 | 41 | cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json" -------------------------------------------------------------------------------- /src/joint_embedding/methods/jae/README.md: -------------------------------------------------------------------------------- 1 | # Single cell joint embedding with an autoencoder (JAE) 2 | 3 | **Team**: Amateur 4 | 5 | **Team members**: Qiao Liu, Wanwen Zeng, Chencheng Xu 6 | 7 | **Project URL**: https://github.com/kimmo1019/JAE 8 | 9 | 10 | 11 | In brief, we built an autoencoder for joint embedding (JAE). Each modality will first be SVD transformed and concatenated together (denoted as x). The major difference from standard AE is that we incorporated the information from cell annotations (e.g., cell label, cell cycle score, and cell batch) to constrain the structure of latent features. We desire that some latent features (c) predict the cell type information, some features predict the cell cycle score. Noticeably, for feature (b), we want it to predict the batch label as randomly as possible to potentially eliminate the batch effect. 
"""
Utilities for consistent data preprocessing
"""

import numpy as np
import tensorflow as tf


class EarlyStoppingAtMinLoss(tf.keras.callbacks.Callback):
    """Stop training when `val_loss` stops improving and restore the best weights.

    Args:
        patience: Number of consecutive epochs without improvement after
            which training is stopped.
    """

    def __init__(self, patience=0):
        super(EarlyStoppingAtMinLoss, self).__init__()
        self.patience = patience
        # Weights captured at the epoch with the lowest `val_loss` so far.
        self.best_weights = None

    def on_train_begin(self, logs=None):
        """Reset the counters and the best loss at the start of training."""
        self.wait = 0
        self.stopped_epoch = 0
        # `np.inf` rather than the `np.Inf` alias, which NumPy 2.0 removed.
        self.best = np.inf

    def on_epoch_end(self, epoch, logs=None):
        """Track `val_loss`; stop and restore the best weights once patience runs out."""
        current = (logs or {}).get("val_loss")
        if current is None:
            # No validation loss was reported for this epoch, so there is
            # nothing to compare against; leave training untouched.
            return
        if np.less(current, self.best):
            self.best = current
            self.wait = 0
            self.best_weights = self.model.get_weights()
        else:
            self.wait += 1
            if self.wait >= self.patience:
                self.stopped_epoch = epoch
                self.model.stop_training = True
                # best_weights is still None if no epoch ever improved on
                # the initial value (e.g. the loss was NaN from the start).
                if self.best_weights is not None:
                    print("Restoring model weights from the end of the best epoch.")
                    self.model.set_weights(self.best_weights)

    def on_train_end(self, logs=None):
        """Report the epoch at which training was stopped early, if any."""
        if self.stopped_epoch > 0:
            print("Epoch %05d: early stopping" % (self.stopped_epoch + 1))


class JointEmbeddingModel(tf.keras.Model):
    """Autoencoder whose latent vector is partly read out as classifier logits.

    The model is made of three sub-models:
      * `encoder`: maps an input of width `params['dim']` to a latent
        vector of width `params['hidden_units'][-1]`;
      * `decoder`: maps the latent vector back to width `params['dim']`;
      * `classifier`: has no layers of its own; it slices the latent vector
        into cell-type, batch and phase segments.

    Args:
        params: dict read with the keys `dim`, `hidden_units`,
            `nb_cell_types`, `nb_batches`, `nb_phases` and `use_batch`.
        name: optional Keras model name.
    """

    def __init__(self, params, name=None):
        super(JointEmbeddingModel, self).__init__(name=name)
        self.params = params
        self.encoder = self.create_encoder()
        self.decoder = self.create_decoder()
        self.classifier = self.create_classifier()

    def get_config(self):
        return {
            "params": self.params,
        }

    def call(self, inputs, training=None):
        """Run the autoencoder and the latent read-outs.

        Returns:
            `(decoded, cell_type, batch, phase)` when `params['use_batch']`
            is truthy, otherwise `(decoded, cell_type)`.
        """
        encoded = self.encoder(inputs)
        decoded = self.decoder(encoded)
        digits_cell_type, digits_batch, digits_phase = self.classifier(encoded)
        if self.params['use_batch']:
            return decoded, digits_cell_type, digits_batch, digits_phase
        else:
            return decoded, digits_cell_type

    def create_encoder(self, use_resnet=True):
        """Build the encoder sub-model.

        Args:
            use_resnet: when True every hidden width gets a residual block
                (Dense -> Dropout -> BatchNorm -> Dense, added to the first
                Dense output, then ReLU); otherwise a plain
                Dense -> Dropout -> BatchNorm stack.
        """
        inputs = tf.keras.layers.Input(shape=(self.params['dim'],))
        # Start from the input so a single-entry `hidden_units` (no hidden
        # layers) still yields a valid model instead of an unbound `x`.
        x = inputs
        for n_unit in self.params['hidden_units'][:-1]:
            if use_resnet:
                x_init = tf.keras.layers.Dense(n_unit, activation='relu')(x)
                x = tf.keras.layers.Dropout(0.1)(x_init)
                x = tf.keras.layers.BatchNormalization()(x)
                x = tf.keras.layers.Dense(n_unit)(x)
                x = tf.keras.layers.Add()([x, x_init])
                x = tf.keras.layers.Activation(activation='relu')(x)
            else:
                x = tf.keras.layers.Dense(n_unit, activation='relu')(x)
                x = tf.keras.layers.Dropout(0.1)(x)
                x = tf.keras.layers.BatchNormalization()(x)
        encoded = tf.keras.layers.Dense(self.params['hidden_units'][-1], activation='relu')(x)
        return tf.keras.Model(inputs=inputs, outputs=encoded, name='encoder')

    def create_decoder(self):
        """Build the decoder: the hidden widths in reverse order, then the output layer."""
        inputs = tf.keras.layers.Input(shape=(self.params['hidden_units'][-1],))
        x = inputs
        for n_unit in self.params['hidden_units'][:-1][::-1]:
            x = tf.keras.layers.Dense(n_unit, activation='relu')(x)
        decoded = tf.keras.layers.Dense(self.params['dim'], activation='relu')(x)
        return tf.keras.Model(inputs=inputs, outputs=decoded, name='decoder')

    def create_classifier(self):
        """Build the read-out model that slices the latent vector.

        The first `nb_cell_types` latent units are the cell-type output, the
        next `nb_batches` units the batch output and the following
        `nb_phases` units the phase output. Remaining units are not read out.
        """
        n_types = self.params['nb_cell_types']
        n_batches = self.params['nb_batches']
        n_phases = self.params['nb_phases']
        inputs = tf.keras.layers.Input(shape=(self.params['hidden_units'][-1],))
        digits_cell_type = inputs[:, :n_types]
        digits_batch = inputs[:, n_types:(n_types + n_batches)]
        digits_phase = inputs[:, (n_types + n_batches):(n_types + n_batches + n_phases)]
        return tf.keras.Model(inputs=inputs, outputs=[digits_cell_type, digits_batch, digits_phase], name='classifier')
We desire that some latent features predict the cell type information, some features predict the cell cycle score. Notably, for the features corresponding to the batch effect, we want them to predict the batch label as randomly as possible to potentially eliminate the batch effect. There are also several nodes that have no constraint at all to ensure the flexibility of the neural network. 7 | info: 8 | method_label: "JAE" 9 | submission_id: "170936/171079" 10 | team_name: Amateur 11 | project_url: https://github.com/kimmo1019/JAE 12 | 13 | authors: 14 | - name: Qiao Liu 15 | email: liuqiao@stanford.edu 16 | roles: [ author, maintainer ] 17 | props: { github: kimmo1019, orcid: "0000-0002-9781-3360", url: "http://liuqiao.me" } 18 | - name: Wanwen Zeng 19 | email: wanwen@stanford.edu 20 | roles: [ author ] 21 | props: { github: wanwenzeng, orcid: "0000-0003-3426-0890", url: "https://scholar.google.com/citations?user=MbeOhkgAAAAJ&hl=zh-CN" } 22 | - name: Chencheng Xu 23 | roles: [ author ] 24 | props: { github: Zoesgithub, orcid: "0000-0002-2262-6966" } 25 | 26 | # parameters 27 | arguments: 28 | # required inputs 29 | - name: "--input_mod1" 30 | type: "file" 31 | example: "dataset_mod1.h5ad" 32 | description: Modality 1 dataset. 33 | required: true 34 | - name: "--input_mod2" 35 | type: "file" 36 | example: "dataset_mod2.h5ad" 37 | description: Modality 2 dataset. 38 | required: true 39 | - name: "--input_pretrain" 40 | type: "file" 41 | example: "pretrain_model" 42 | description: Path to the directory containing a pretrained model. 43 | required: true 44 | # required outputs 45 | - name: "--output" 46 | type: "file" 47 | direction: "output" 48 | example: "output.h5ad" 49 | description: Data for all cells in mod1 and mod2 embedded to ≤100 dimensions. 
"""Run the pretrained JAE model on a pair of modality files.

Loads the fitted SVD transformers, hyper-parameters and weights from the
pretrain directory, projects both modalities, fine-tunes the autoencoder on
the reconstruction loss only and writes the encoder output as an `h5ad` file.
"""
import os
import sys
import logging
import json
import anndata as ad
import numpy as np
from sklearn.preprocessing import normalize
import tensorflow as tf
import pickle as pk
import scipy

logging.basicConfig(level=logging.INFO)

## VIASH START
dataset_path = 'sample_data/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.'

par = {
    'input_mod1': dataset_path + 'mod1.h5ad',
    'input_mod2': dataset_path + 'mod2.h5ad',
    'input_pretrain': '...',
    'output': 'output.h5ad',
}

meta = { 'resources_dir': '.', 'functionality_name': 'submission_171079' }
## VIASH END

sys.path.append(meta['resources_dir'])
from utils import JointEmbeddingModel


def load_pickle(path):
    """Load a pickled object from `path`, closing the file afterwards.

    NOTE(review): unpickling executes arbitrary code; only point
    `--input_pretrain` at directories produced by a trusted training run.
    """
    with open(path, 'rb') as handle:
        return pk.load(handle)


def svd_transform(mod1_data, mod2_data, mod1_svd, mod2_svd, scale=1e4, mod2_type=None):
    """Normalise both count matrices and project them to their embeddings.

    Each matrix is L1-normalised per row, multiplied by `scale` and
    transformed with log10(1 + x). Modality 1 is projected with `mod1_svd`.
    Modality 2 is densified as-is for ADT and projected with `mod2_svd`
    otherwise.

    Args:
        mod1_data, mod2_data: count matrices (sparse; `.log1p()` and, for
            ADT, `.toarray()` are called on the normalised result).
        mod1_svd, mod2_svd: fitted transformers exposing `.transform`;
            `mod2_svd` is not used for ADT.
        scale: factor applied after the per-row L1 normalisation.
        mod2_type: feature type of modality 2; defaults to the module-level
            `mod_type`.

    Returns:
        Tuple `(pca_data_mod1, pca_data_mod2)`.

    Raises:
        ValueError: if modality 2 is not ADT and no `mod2_svd` is available.
    """
    if mod2_type is None:
        mod2_type = mod_type
    mod1_data = scale * normalize(mod1_data, norm='l1', axis=1)
    mod2_data = scale * normalize(mod2_data, norm='l1', axis=1)
    mod1_data = mod1_data.log1p() / np.log(10)
    mod2_data = mod2_data.log1p() / np.log(10)
    pca_data_mod1 = mod1_svd.transform(mod1_data)

    if mod2_type == 'ADT':
        pca_data_mod2 = mod2_data.toarray()
    elif mod2_svd is None:
        raise ValueError(
            "No SVD transformer available for modality 2 of type %r" % (mod2_type,)
        )
    else:
        pca_data_mod2 = mod2_svd.transform(mod2_data)
    return pca_data_mod1, pca_data_mod2


logging.info('Reading `h5ad` files...')
ad_mod1 = ad.read_h5ad(par['input_mod1'])
ad_mod2 = ad.read_h5ad(par['input_mod2'])
mod1_obs = ad_mod1.obs
mod1_uns = ad_mod1.uns

ad_mod2_var = ad_mod2.var

# `.iloc[0]` selects the first feature by position explicitly; plain `[0]`
# on a Series relies on a positional fallback that pandas has deprecated.
mod_type = ad_mod2_var['feature_types'].iloc[0]

mod1_mat = ad_mod1.layers["counts"]
mod2_mat = ad_mod2.layers["counts"]

del ad_mod2, ad_mod1

# A second SVD transformer is only loaded when modality 2 is ATAC.
mod1_svd = load_pickle(os.path.join(par['input_pretrain'], 'svd_mod1.pkl'))
if mod_type == 'ATAC':
    mod2_svd = load_pickle(os.path.join(par['input_pretrain'], 'svd_mod2.pkl'))
else:
    mod2_svd = None

mod1_pca, mod2_pca = svd_transform(mod1_mat, mod2_mat, mod1_svd, mod2_svd)

del mod1_mat, mod2_mat

pca_combined = np.concatenate([mod1_pca, mod2_pca],axis=1)
del mod1_pca, mod2_pca

if mod_type == 'ATAC':
    epochs = 2
else:
    epochs = 1

# Only the first (reconstruction) loss contributes during fine-tuning.
coeff = [1.0, 0.0, 0.0, 0.0]

with open(os.path.join(par['input_pretrain'], 'hyperparams.json'), 'r') as file:
    params = json.load(file)

mymodel = JointEmbeddingModel(params)
# Call the model once on a dummy batch before loading the weights.
mymodel(np.zeros((2, params['dim'])))

mymodel.compile(tf.keras.optimizers.Adam(learning_rate = params["lr"]),
            loss = [tf.keras.losses.MeanSquaredError(),
                    tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                    tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                    tf.keras.losses.MeanSquaredError()
                    ],
            loss_weights=coeff, run_eagerly=True)

#load pretrain model
mymodel.load_weights(os.path.join(par['input_pretrain'], 'weights.h5'))


X_train = pca_combined
# Random placeholder labels: their losses carry weight 0.0 in `coeff`.
# NOTE(review): four targets are supplied, but the model only returns four
# outputs when params['use_batch'] is truthy - confirm the pretrained
# hyper-parameters always set it.
c_fakes = np.random.randint(low=0, high=params['nb_cell_types'],size=pca_combined.shape[0])
b_fakes = np.random.randint(low=0, high=params['nb_batches'],size=pca_combined.shape[0])
p_fakes = np.random.randint(low=0, high=params['nb_phases'],size=pca_combined.shape[0])
Y_train = [pca_combined, c_fakes, b_fakes, p_fakes]

#finetune on the test data
mymodel.fit(x=X_train, y=Y_train,
                epochs = epochs,
                batch_size = 32,
                shuffle=True)

embeds = mymodel.encoder.predict(pca_combined)
print(embeds.shape)

adata = ad.AnnData(
    X=embeds,
    obs=mod1_obs,
    uns={
        'dataset_id': mod1_uns['dataset_id'],
        'method_id': meta['functionality_name'],
    },
)
adata.write_h5ad(par['output'], compression="gzip")
dataset_id=openproblems_bmmc_cite_phase2 11 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 12 | dataset_train_path=output/datasets_phase2_public/$task_id/$dataset_id/$dataset_id.censor_dataset 13 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/ 14 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 15 | 16 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \ 17 | --input_mod1 ${dataset_train_path}.output_mod1.h5ad \ 18 | --input_mod2 ${dataset_train_path}.output_mod2.h5ad \ 19 | --input_explore_mod1 output/datasets_explore/cite/cite_gex_processed_training.h5ad \ 20 | --input_explore_mod2 output/datasets_explore/cite/cite_adt_processed_training.h5ad \ 21 | --input_sol ${dataset_path}.output_sol.h5ad \ 22 | --output_pretrain ${pretrain_path} 23 | 24 | target/docker/${task_id}_methods/${method_id}/${method_id} \ 25 | --input_mod1 ${dataset_path}.output_mod1.h5ad \ 26 | --input_mod2 ${dataset_path}.output_mod2.h5ad \ 27 | --input_pretrain ${pretrain_path} \ 28 | --output ${pred_path}.${method_id}.output.h5ad 29 | 30 | # MULTIOME 31 | dataset_id=openproblems_bmmc_multiome_phase2 32 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 33 | dataset_train_path=output/datasets_phase2_public/$task_id/$dataset_id/$dataset_id.censor_dataset 34 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/ 35 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 36 | 37 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \ 38 | --input_mod1 ${dataset_train_path}.output_mod1.h5ad \ 39 | --input_mod2 ${dataset_train_path}.output_mod2.h5ad \ 40 | --input_explore_mod1 output/datasets_explore/multiome/multiome_gex_processed_training.h5ad \ 41 | --input_explore_mod2 output/datasets_explore/multiome/multiome_atac_processed_training.h5ad \ 42 | --input_sol 
${dataset_path}.output_sol.h5ad \ 43 | --output_pretrain ${pretrain_path} 44 | 45 | target/docker/${task_id}_methods/${method_id}/${method_id} \ 46 | --input_mod1 ${dataset_path}.output_mod1.h5ad \ 47 | --input_mod2 ${dataset_path}.output_mod2.h5ad \ 48 | --input_pretrain ${pretrain_path} \ 49 | --output ${pred_path}.${method_id}.output.h5ad 50 | 51 | # RUN EVALUATION 52 | bin/nextflow run "$PIPELINE_REPO" \ 53 | -r "$PIPELINE_VERSION" \ 54 | -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \ 55 | --solutionDir "output/datasets/$task_id" \ 56 | --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \ 57 | --publishDir "output/evaluation/$task_id/$method_id/" \ 58 | -latest \ 59 | -resume \ 60 | -c "src/resources/nextflow_moremem.config" 61 | 62 | cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json" -------------------------------------------------------------------------------- /src/joint_embedding/methods/jae/train/config.vsh.yaml: -------------------------------------------------------------------------------- 1 | functionality: 2 | name: jae_train 3 | namespace: joint_embedding_methods 4 | 5 | # metadata for your method 6 | description: In brief, we built an autoencoder for joint embedding (JAE). Each modality is first SVD-transformed, and the results are concatenated. The major difference from a standard AE is that we incorporate information from cell annotations (e.g., cell label, cell cycle score, and cell batch) to constrain the structure of the latent features. We want some latent features to predict the cell type information and others to predict the cell cycle score. Notably, for the features corresponding to the batch effect, we want them to predict the batch label as randomly as possible, to potentially eliminate the batch effect. Several latent nodes are left without any constraint to preserve the flexibility of the neural network.
7 | authors: 8 | - name: Qiao Liu 9 | email: liuqiao@stanford.edu 10 | roles: [ author, maintainer ] 11 | props: { github: kimmo1019, orcid: "0000-0002-9781-3360", url: "http://liuqiao.me" } 12 | - name: Wanwen Zeng 13 | email: wanwen@stanford.edu 14 | roles: [ author ] 15 | props: { github: wanwenzeng, orcid: "0000-0003-3426-0890", url: "https://scholar.google.com/citations?user=MbeOhkgAAAAJ&hl=zh-CN" } 16 | - name: Chencheng Xu 17 | roles: [ author ] 18 | props: { github: Zoesgithub, orcid: "0000-0002-2262-6966" } 19 | 20 | # parameters 21 | arguments: 22 | # required inputs 23 | - name: "--input_mod1" 24 | type: "file" 25 | example: "dataset_mod1.h5ad" 26 | description: Modality 1 dataset. 27 | required: true 28 | - name: "--input_mod2" 29 | type: "file" 30 | example: "dataset_mod2.h5ad" 31 | description: Modality 2 dataset. 32 | required: true 33 | - name: "--input_explore_mod1" 34 | type: "file" 35 | example: "dataset_mod1.h5ad" 36 | description: Explore version of the modality 1 dataset. 37 | required: true 38 | - name: "--input_explore_mod2" 39 | type: "file" 40 | example: "dataset_mod2.h5ad" 41 | description: Explore version of the modality 2 dataset. 42 | required: true 43 | - name: "--tf_seed" 44 | type: "integer" 45 | default: 46 46 | description: ... 47 | - name: "--np_seed" 48 | type: "integer" 49 | default: 56 50 | description: ... 51 | 52 | # required outputs 53 | - name: "--output_pretrain" 54 | type: "file" 55 | direction: "output" 56 | example: "pretrain_model" 57 | description: Path to the directory containing a pretrained model. 
58 | required: true 59 | 60 | # files your script needs 61 | resources: 62 | - type: python_script 63 | path: script.py 64 | - path: '../resources/utils.py' 65 | 66 | # target platforms 67 | platforms: 68 | - type: docker 69 | image: tensorflow/tensorflow:latest-gpu 70 | run_args: [ "--gpus all" ] 71 | setup: 72 | - type: python 73 | packages: 74 | - anndata 75 | - umap-learn 76 | - scanpy 77 | - type: nextflow 78 | labels: [ vhighmem, vhightime, vhighcpu, gpu ] 79 | -------------------------------------------------------------------------------- /src/joint_embedding/methods/lsl_ae/run/config.vsh.yaml: -------------------------------------------------------------------------------- 1 | functionality: 2 | name: lsl_ae 3 | namespace: joint_embedding_methods 4 | 5 | # metadata for your method 6 | description: A description for your method. 7 | info: 8 | method_label: "LSL_AE" 9 | submission_id: "170825" 10 | team_name: Living-Systems-Lab 11 | # project_url: https://github.com/foo/bar 12 | # publication_doi: 10.1101/0123.45.67.890123 13 | # publication_url: https://arxiv.org/abs/1234.56789 14 | 15 | authors: 16 | - name: Sumeer Khan 17 | email: sumeer.khan@kaust.edu.sa 18 | roles: [ author, maintainer ] 19 | - name: Robert Lehman 20 | email: robert.lehman@kaust.edu.sa 21 | roles: [ author, maintainer ] 22 | - name: Xabier Martinez De Morentin 23 | email: xavier.martinez.demorentin@navarra.es 24 | roles: [ author, maintainer ] 25 | - name: Aidyn Ubingazhibov 26 | email: aidyn.ubingazhibov@nu.edu.kz 27 | roles: [ author, maintainer ] 28 | - name: Minxing Pang 29 | email: minxing.pang@kaust.edu.sa 30 | roles: [ author, maintainer ] 31 | 32 | # parameters 33 | arguments: 34 | # required inputs 35 | - name: "--input_mod1" 36 | type: "file" 37 | example: "dataset_mod1.h5ad" 38 | description: Modality 1 dataset. 39 | required: true 40 | - name: "--input_mod2" 41 | type: "file" 42 | example: "dataset_mod2.h5ad" 43 | description: Modality 2 dataset. 
44 | required: true 45 | # required outputs 46 | - name: "--output" 47 | type: "file" 48 | direction: "output" 49 | example: "output.h5ad" 50 | description: Data for all cells in mod1 and mod2 embedded to ≤100 dimensions. 51 | required: true 52 | 53 | # files your script needs 54 | resources: 55 | - type: python_script 56 | path: script.py 57 | 58 | # target platforms 59 | platforms: 60 | - type: docker 61 | image: nvcr.io/nvidia/tensorflow:20.10-tf1-py3 62 | run_args: [ "--gpus all" ] 63 | setup: 64 | - type: python 65 | packages: 66 | - anndata 67 | - umap-learn 68 | - keras 69 | - matplotlib 70 | - scanpy 71 | - scipy 72 | - type: nextflow 73 | labels: [ vhighmem, vvhightime, vhighcpu, gpu ] 74 | -------------------------------------------------------------------------------- /src/joint_embedding/methods/lsl_ae/run/script.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import anndata as ad 3 | import pandas as pd 4 | from tensorflow.keras.layers import Input, Dense, Dropout 5 | from tensorflow.keras.layers import concatenate 6 | from tensorflow.keras.models import Model 7 | from tensorflow.keras.callbacks import EarlyStopping 8 | from tensorflow import keras 9 | import warnings 10 | warnings.filterwarnings('ignore') 11 | import scanpy as sc 12 | #from keras import backend as K 13 | from tensorflow.keras.constraints import Constraint 14 | import tensorflow.keras.backend as K 15 | 16 | from tensorflow.keras.optimizers import Adam 17 | from tensorflow.keras.models import Model 18 | import warnings 19 | from numpy.random import seed 20 | seed(1) 21 | import tensorflow as tf 22 | tf.compat.v1.random.set_random_seed(2) 23 | 24 | 25 | print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU'))) 26 | 27 | 28 | warnings.filterwarnings('ignore') 29 | 30 | ## VIASH START 31 | dataset_path = 
"output/datasets/joint_embedding/openproblems_bmmc_cite_phase2/openproblems_bmmc_cite_phase2.censor_dataset.output_" 32 | 33 | par = { 34 | 'input_mod1': f'{dataset_path}mod1.h5ad', 35 | 'input_mod2': f'{dataset_path}mod2.h5ad', 36 | 'output': 'output.h5ad' 37 | } 38 | meta = { 39 | 'resources_dir': '.', 40 | 'functionality_name': 'submission_170795' 41 | } 42 | ## VIASH END 43 | 44 | 45 | logging.info('Reading `h5ad` files...') 46 | ad_mod1 = ad.read_h5ad(par['input_mod1']) 47 | ad_mod2 = ad.read_h5ad(par['input_mod2']) 48 | 49 | # high variable gene calculation 50 | min_cells = int(ad_mod2.shape[0] * 0.03) 51 | sc.pp.highly_variable_genes(ad_mod1, batch_key ='batch', subset = True) 52 | sc.pp.filter_genes(ad_mod2, min_cells=min_cells) 53 | 54 | ad_mod_1 = ad_mod1[:, ad_mod1.var.highly_variable] 55 | 56 | ## Convert to csv for AE training 57 | scRNAseq1 = ad_mod_1.X.toarray() 58 | scRNAseq2 = ad_mod2.X.toarray() 59 | 60 | 61 | class WeightsOrthogonalityConstraint(Constraint): 62 | def __init__(self, encoding_dim, weightage = 1.0, axis = 0): 63 | self.encoding_dim = encoding_dim 64 | self.weightage = weightage 65 | self.axis = axis 66 | 67 | def weights_orthogonality(self, w): 68 | if(self.axis==1): 69 | w = K.transpose(w) 70 | if(self.encoding_dim > 1): 71 | m = K.dot(K.transpose(w), w) - K.eye(self.encoding_dim) 72 | return self.weightage * K.sqrt(K.sum(K.square(m))) 73 | else: 74 | m = K.sum(w ** 2) - 1. 
75 | return m 76 | 77 | def __call__(self, w): 78 | return self.weights_orthogonality(w) 79 | 80 | 81 | # Input Layer 82 | ncol_scRNAseq1 = scRNAseq1.shape[1] 83 | input_dim_scRNAseq1 = Input(shape = (ncol_scRNAseq1, ), name = "scRNAseq1") 84 | ncol_scRNAseq2 = scRNAseq2.shape[1] 85 | input_dim_scRNAseq2 = Input(shape = (ncol_scRNAseq2, ), name = "scRNAseq2") 86 | 87 | encoding_dim_scRNAseq1 = 64 88 | encoding_dim_scRNAseq2 = 64 89 | 90 | dropout_scRNAseq1 = Dropout(0.1, name = "Dropout_scRNAseq1")(input_dim_scRNAseq1) 91 | dropout_scRNAseq2 = Dropout(0.1, name = "Dropout_scRNAseq2")(input_dim_scRNAseq2) 92 | 93 | encoded_scRNAseq1 = Dense(encoding_dim_scRNAseq1, activation = 'relu', name = "Encoder_scRNAseq1", use_bias=True, kernel_regularizer=WeightsOrthogonalityConstraint(64, weightage=1., axis=0))(dropout_scRNAseq1) #300 #prv 256 94 | encoded_scRNAseq2 = Dense(encoding_dim_scRNAseq2, activation = 'relu', name = "Encoder_scRNAseq2", use_bias=True, kernel_regularizer=WeightsOrthogonalityConstraint(64, weightage=1., axis=0))(dropout_scRNAseq2) 95 | 96 | merge = concatenate([encoded_scRNAseq1, encoded_scRNAseq2]) 97 | 98 | bottleneck = Dense(64, kernel_initializer = 'uniform', activation = 'linear', name = "Bottleneck")(merge) #50 99 | 100 | merge_inverse = Dense(encoding_dim_scRNAseq1 + encoding_dim_scRNAseq2, activation = 'relu', name = "Concatenate_Inverse")(bottleneck) 101 | 102 | decoded_scRNAseq1 = Dense(ncol_scRNAseq1, activation = 'relu', name = "Decoder_scRNAseq1")(merge_inverse) #sigmoid 103 | 104 | decoded_scRNAseq2 = Dense(ncol_scRNAseq2, activation = 'relu', name = "Decoder_scRNAseq2")(merge_inverse) 105 | 106 | autoencoder = Model([input_dim_scRNAseq1, input_dim_scRNAseq2], [decoded_scRNAseq1, decoded_scRNAseq2]) 107 | 108 | opt = Adam(lr=0.0001) 109 | autoencoder.compile(optimizer = opt, loss={'Decoder_scRNAseq1': 'mean_squared_error', 'Decoder_scRNAseq2': 'mean_squared_error'}) #loss_weights = [1., 1.] 
110 | autoencoder.summary() 111 | 112 | es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=20) 113 | # Autoencoder training 114 | estimator = autoencoder.fit([scRNAseq1, scRNAseq2], [scRNAseq1, scRNAseq2], epochs = 600, batch_size = 32, validation_split = 0.2, shuffle = True, verbose = 1, callbacks=[es]) #prev 64 BS prev 32 115 | 116 | 117 | encoder = Model([input_dim_scRNAseq1, input_dim_scRNAseq2], bottleneck) 118 | bottleneck_representation = encoder.predict([scRNAseq1, scRNAseq2]) 119 | 120 | embd = pd.DataFrame(bottleneck_representation) 121 | #embd = scipy.sparse.csr_matrix(RNA_ATAC_Latent.values) 122 | 123 | mod1_obs = ad_mod1.obs 124 | mod1_uns = ad_mod1.uns 125 | logging.info('Storing output to file') 126 | adata = ad.AnnData( 127 | X=embd.values, 128 | obs=mod1_obs, 129 | uns={ 130 | 'dataset_id': mod1_uns['dataset_id'], 131 | 'method_id': meta['functionality_name'], 132 | }, 133 | ) 134 | adata.write_h5ad(par['output'], compression="gzip") 135 | -------------------------------------------------------------------------------- /src/joint_embedding/methods/lsl_ae/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash" 4 | export NXF_VER=21.04.1 5 | export PIPELINE_VERSION=1.4.0 6 | method_id=lsl_ae 7 | task_id=joint_embedding 8 | # NOTE: method_id must equal functionality.name in run/config.vsh.yaml (lsl_ae); it selects the executable under target/docker/${task_id}_methods/ and names the prediction files. 9 | # CITE 10 | dataset_id=openproblems_bmmc_cite_phase2 11 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 12 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 13 | 14 | target/docker/${task_id}_methods/${method_id}/${method_id} \ 15 | --input_mod1 ${dataset_path}.output_mod1.h5ad \ 16 | --input_mod2 ${dataset_path}.output_mod2.h5ad \ 17 | --output ${pred_path}.${method_id}.output.h5ad 18 | 19 | # MULTIOME 20 | dataset_id=openproblems_bmmc_multiome_phase2 21 | 
dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 22 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 23 | 24 | target/docker/${task_id}_methods/${method_id}/${method_id} \ 25 | --input_mod1 ${dataset_path}.output_mod1.h5ad \ 26 | --input_mod2 ${dataset_path}.output_mod2.h5ad \ 27 | --output ${pred_path}.${method_id}.output.h5ad 28 | 29 | 30 | # RUN EVALUATION 31 | bin/nextflow run "$PIPELINE_REPO" \ 32 | -r "$PIPELINE_VERSION" \ 33 | -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \ 34 | --solutionDir "output/datasets/$task_id" \ 35 | --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \ 36 | --publishDir "output/evaluation/$task_id/$method_id/" \ 37 | -latest \ 38 | -resume \ 39 | -c "src/resources/nextflow_moremem.config" 40 | 41 | cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json" -------------------------------------------------------------------------------- /src/match_modality/methods/clue/README.md: -------------------------------------------------------------------------------- 1 | # CLUE (Cross-Linked Unified Embedding) 2 | 3 | Team GLUE: Zhi-Jie Cao, Xin-Ming Tu, Chen-Rui Xia 4 | 5 | **CLUE** is a semi-supervised single-cell multi-omics integration model. It employs variational autoencoders to project cells from different modalities into a unified low-dimensional embedding space, where modality matching can be performed. Specifically, we model data in each modality as generated from a modality-specific subspace of the complete cell embedding. Through a matrix of cross-encoders, CLUE projects cells in each modality into all modality-specific subspaces, which are then concatenated to build a comprehensive embedding, allowing the model to capture both shared and modality-specific information. 6 | 7 | 8 | 9 | **General architecture of CLUE ⤴️** 10 | 11 | > CLUE is implemented as part of the `scglue` Python package.
A pre-release containing the CLUE model is available as `resources/scglue-0.1.1-py3-none-any.whl`. A formal release will be made available later on PyPI and Anaconda. Stay tuned at [https://github.com/gao-lab/GLUE](https://github.com/gao-lab/GLUE). 12 | -------------------------------------------------------------------------------- /src/match_modality/methods/clue/clue_architecture.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/src/match_modality/methods/clue/clue_architecture.jpg -------------------------------------------------------------------------------- /src/match_modality/methods/clue/resources/scglue-0.1.1-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/src/match_modality/methods/clue/resources/scglue-0.1.1-py3-none-any.whl -------------------------------------------------------------------------------- /src/match_modality/methods/clue/run/config.vsh.yaml: -------------------------------------------------------------------------------- 1 | functionality: 2 | name: clue 3 | namespace: match_modality_methods 4 | 5 | # metadata for your method 6 | 7 | description: Cross-linked unified embedding for single-cell multi-omics data integration 8 | info: 9 | method_label: "CLUE" 10 | submission_id: "169959" 11 | team_name: GLUE 12 | project_url: https://github.com/gao-lab/GLUE 13 | # publication_doi: 10.1101/2021.08.22.457275 14 | # publication_url: https://arxiv.org/abs/1234.56789 15 | 16 | authors: 17 | - name: Zhi-Jie Cao 18 | email: caozj@mail.cbi.pku.edu.cn 19 | roles: [ author, maintainer ] 20 | props: { github: Jeff1995, orcid: "0000-0002-0026-671X" } 21 | - name: Xin-Ming Tu 22 | email: xinmingtu@pku.edu.cn 23 | roles: [ 
author, maintainer ] 24 | props: { github: XinmingTu } 25 | - name: Chen-Rui Xia 26 | email: xiachenrui@mail.cbi.pku.edu.cn 27 | roles: [ author, maintainer ] 28 | props: { github: xiachenrui } 29 | 30 | # parameters 31 | arguments: 32 | # required inputs 33 | - name: "--input_train_mod1" 34 | type: "file" 35 | example: "dataset_censored.h5ad" 36 | description: "The censored shuffled train mod1 profiles." 37 | required: true 38 | - name: "--input_train_mod2" 39 | type: "file" 40 | example: "dataset_censored.h5ad" 41 | description: "The censored shuffled train mod2 profiles." 42 | required: true 43 | - name: "--input_train_sol" 44 | type: "file" 45 | example: "dataset_solution.h5ad" 46 | description: "The pairing of train mod1&mod2 profiles." 47 | required: true 48 | - name: "--input_test_mod1" 49 | type: "file" 50 | example: "dataset_censored.h5ad" 51 | description: "The censored shuffled test mod1 profiles." 52 | required: true 53 | - name: "--input_test_mod2" 54 | type: "file" 55 | example: "dataset_censored.h5ad" 56 | description: "The censored shuffled test mod2 profiles." 57 | required: true 58 | - name: "--input_pretrain" 59 | type: "file" 60 | example: "pretrain_model" 61 | description: Path to the directory containing a pretrained model. 62 | required: true 63 | 64 | # required outputs 65 | - name: "--output" 66 | type: "file" 67 | direction: "output" 68 | example: "output.h5ad" 69 | description: "The predicted pairing of test mod1&mod2 profiles." 
70 | required: true 71 | 72 | # files your script needs 73 | resources: 74 | - type: python_script 75 | path: script.py 76 | - path: ../resources/utils.py 77 | - path: ../resources/scglue-0.1.1-py3-none-any.whl 78 | 79 | # target platforms 80 | platforms: 81 | - type: docker 82 | image: nvidia/cuda:11.0-cudnn8-runtime-ubuntu18.04 83 | setup: 84 | - type: apt 85 | packages: 86 | - python3-pip 87 | - python3.8-dev 88 | - type: docker 89 | run: 90 | - update-alternatives --install /usr/bin/python python /usr/bin/python3.8 10 91 | - update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 10 92 | - python -m pip install --upgrade pip 93 | - pip install scglue-0.1.1-py3-none-any.whl 94 | - pip install pyyaml 95 | resources: 96 | - scglue-0.1.1-py3-none-any.whl scglue-0.1.1-py3-none-any.whl 97 | 98 | - type: nextflow 99 | labels: [ highmem, hightime, highcpu, gpu ] -------------------------------------------------------------------------------- /src/match_modality/methods/clue/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash" 4 | export NXF_VER=21.04.1 5 | export PIPELINE_VERSION=1.4.0 6 | method_id=clue 7 | task_id=match_modality 8 | 9 | # CITE GEX2ADT 10 | dataset_id=openproblems_bmmc_cite_phase2_rna 11 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 12 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/ 13 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 14 | 15 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \ 16 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 17 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 18 | --input_train_sol ${dataset_path}.output_train_sol.h5ad \ 19 | --output_pretrain ${pretrain_path} 20 | 21 | 
target/docker/${task_id}_methods/${method_id}/${method_id} \ 22 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 23 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 24 | --input_train_sol ${dataset_path}.output_train_sol.h5ad \ 25 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \ 26 | --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \ 27 | --input_pretrain ${pretrain_path} \ 28 | --output ${pred_path}.${method_id}.output.h5ad 29 | 30 | # CITE ADT2GEX 31 | dataset_id=openproblems_bmmc_cite_phase2_mod2 32 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 33 | # can reuse same pretrain 34 | # pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/ 35 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 36 | 37 | # target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \ 38 | # --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 39 | # --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 40 | # --input_train_sol ${dataset_path}.output_train_sol.h5ad \ 41 | # --output_pretrain ${pretrain_path} 42 | 43 | target/docker/${task_id}_methods/${method_id}/${method_id} \ 44 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 45 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 46 | --input_train_sol ${dataset_path}.output_train_sol.h5ad \ 47 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \ 48 | --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \ 49 | --input_pretrain ${pretrain_path} \ 50 | --output ${pred_path}.${method_id}.output.h5ad 51 | 52 | 53 | # MULTIOME GEX2ATAC 54 | dataset_id=openproblems_bmmc_multiome_phase2_rna 55 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 56 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/ 57 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 58 | 59 | 
target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \ 60 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 61 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 62 | --input_train_sol ${dataset_path}.output_train_sol.h5ad \ 63 | --output_pretrain ${pretrain_path} 64 | 65 | target/docker/${task_id}_methods/${method_id}/${method_id} \ 66 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 67 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 68 | --input_train_sol ${dataset_path}.output_train_sol.h5ad \ 69 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \ 70 | --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \ 71 | --input_pretrain ${pretrain_path} \ 72 | --output ${pred_path}.${method_id}.output.h5ad 73 | 74 | # MULTIOME ATAC2GEX 75 | dataset_id=openproblems_bmmc_multiome_phase2_mod2 76 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 77 | # can reuse same pretrains 78 | # pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/ 79 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 80 | 81 | # target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \ 82 | # --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 83 | # --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 84 | # --input_train_sol ${dataset_path}.output_train_sol.h5ad \ 85 | # --output_pretrain ${pretrain_path} 86 | 87 | target/docker/${task_id}_methods/${method_id}/${method_id} \ 88 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 89 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 90 | --input_train_sol ${dataset_path}.output_train_sol.h5ad \ 91 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \ 92 | --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \ 93 | --input_pretrain ${pretrain_path} \ 94 | --output ${pred_path}.${method_id}.output.h5ad 95 | 96 | # RUN EVALUATION 97 
| bin/nextflow run "$PIPELINE_REPO" \ 98 | -r "$PIPELINE_VERSION" \ 99 | -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \ 100 | --solutionDir "output/datasets/$task_id" \ 101 | --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \ 102 | --publishDir "output/evaluation/$task_id/$method_id/" \ 103 | -latest \ 104 | -resume \ 105 | -c "src/resources/nextflow_moremem.config" 106 | 107 | cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json" -------------------------------------------------------------------------------- /src/match_modality/methods/clue/train/config.vsh.yaml: -------------------------------------------------------------------------------- 1 | functionality: 2 | name: clue_train 3 | namespace: match_modality_methods 4 | 5 | # metadata for your method 6 | description: Cross-linked unified embedding for single-cell multi-omics data integration 7 | 8 | info: 9 | submission_id: "169959" 10 | team_name: GLUE 11 | # project_url: https://github.com/foo/bar 12 | # publication_doi: 10.1101/0123.45.67.890123 13 | # publication_url: https://arxiv.org/abs/1234.56789 14 | 15 | authors: 16 | - name: Zhi-Jie Cao 17 | email: caozj@mail.cbi.pku.edu.cn 18 | roles: [ author, maintainer ] 19 | props: { github: Jeff1995, orcid: "0000-0002-0026-671X" } 20 | - name: Xin-Ming Tu 21 | email: xinmingtu@pku.edu.cn 22 | roles: [ author, maintainer ] 23 | props: { github: XinmingTu } 24 | - name: Chen-Rui Xia 25 | email: xiachenrui@mail.cbi.pku.edu.cn 26 | roles: [ author, maintainer ] 27 | props: { github: xiachenrui } 28 | 29 | # parameters 30 | arguments: 31 | # required inputs 32 | - name: "--input_train_mod1" 33 | type: "file" 34 | example: "dataset_censored.h5ad" 35 | description: "The censored shuffled train mod1 profiles." 36 | required: true 37 | - name: "--input_train_mod2" 38 | type: "file" 39 | example: "dataset_censored.h5ad" 40 | description: "The censored shuffled train mod2 profiles." 
41 | required: true 42 | - name: "--input_train_sol" 43 | type: "file" 44 | example: "dataset_solution.h5ad" 45 | description: "The pairing of train mod1&mod2 profiles." 46 | required: true 47 | 48 | # required outputs 49 | - name: "--output_pretrain" 50 | type: "file" 51 | example: "pretrain_model" 52 | direction: "output" 53 | description: Path to the directory containing a pretrained model. 54 | required: true 55 | 56 | # files your script needs 57 | resources: 58 | - type: python_script 59 | path: script.py 60 | - path: ../resources/utils.py 61 | - path: ../resources/scglue-0.1.1-py3-none-any.whl 62 | 63 | # target platforms 64 | platforms: 65 | - type: docker 66 | image: nvidia/cuda:11.0-cudnn8-runtime-ubuntu18.04 67 | run_args: [ "--gpus all" ] 68 | setup: 69 | - type: apt 70 | packages: 71 | - python3-pip 72 | - python3.8-dev 73 | - type: docker 74 | run: 75 | - update-alternatives --install /usr/bin/python python /usr/bin/python3.8 10 76 | - update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 10 77 | - python -m pip install --upgrade pip 78 | - pip install scglue-0.1.1-py3-none-any.whl 79 | - pip install pyyaml scikit-misc 80 | resources: 81 | - scglue-0.1.1-py3-none-any.whl scglue-0.1.1-py3-none-any.whl 82 | 83 | - type: nextflow 84 | labels: [ highmem, hightime, highcpu, gpu ] -------------------------------------------------------------------------------- /src/match_modality/methods/novel/README.md: -------------------------------------------------------------------------------- 1 | # NeurIPS-Single-Cell-MultiModality 2 | 3 | Team Novel: Gleb Ryazantsev, Nikolay Russkikh, Igor I 4 | 5 | The approach utilizes sample representations, learned in the same way as in the CLIP model. Encoders for all of the modalities are fully connected; the dimensionality of GEX and ATAC data is reduced via an LSI transform (ADT is left as-is).
Then, to obtain sample pairings, a maximum weight matching on a bipartite graph is performed, where weights are cosine similarities between sample embeddings. 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /src/match_modality/methods/novel/novel_architecture1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/src/match_modality/methods/novel/novel_architecture1.png -------------------------------------------------------------------------------- /src/match_modality/methods/novel/novel_architecture2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/src/match_modality/methods/novel/novel_architecture2.png -------------------------------------------------------------------------------- /src/match_modality/methods/novel/resources/catalyst_tools.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from catalyst import runners, metrics 4 | from models import symmetric_npair_loss 5 | 6 | 7 | import numpy as np 8 | import torch.nn.functional as F 9 | from tqdm.notebook import tqdm 10 | 11 | from networkx.algorithms import bipartite 12 | from scipy import sparse 13 | 14 | 15 | 16 | class scRNARunner(runners.Runner): 17 | def handle_batch(self, batch): 18 | features_first = batch['features_first'] 19 | features_second = batch['features_second'] 20 | 21 | logits, embeddings_first, embeddings_second = self.model(features_first, features_second) 22 | targets = torch.arange(logits.shape[0]).to(logits.device) 23 | 24 | loss = symmetric_npair_loss(logits, targets) 25 | 26 | batch_temperature = self.model.logit_scale.exp().item() 
27 | 28 | self.batch_metrics.update({"loss": loss}) 29 | self.batch_metrics.update({"T": batch_temperature}) 30 | 31 | self.batch = { 32 | 'features_first': features_first, 33 | 'features_second': features_second, 34 | 'embeddings_first': embeddings_first, 35 | 'embeddings_second': embeddings_second, 36 | 'scores': logits, 37 | 'targets': targets, 38 | 'temperature': batch_temperature 39 | 40 | } 41 | self.input = { 'features_first': features_first, 42 | 'features_second': features_second, 43 | } 44 | self.output = {'scores': logits, 45 | 'embeddings_first': embeddings_first, 46 | 'embeddings_second': embeddings_second 47 | } 48 | 49 | class CustomMetric(metrics.ICallbackLoaderMetric): 50 | def __init__(self, compute_on_call: bool = True, prefix: str = None, suffix: str = None): 51 | """Init.""" 52 | super().__init__(compute_on_call=compute_on_call) 53 | self.prefix = prefix or "" 54 | self.suffix = suffix or "" 55 | self.embeddings_list_first = [] 56 | self.embeddings_list_second = [] 57 | 58 | def reset(self, num_batches: int, num_samples: int) -> None: 59 | self.embeddings_list_first = [] 60 | self.embeddings_list_second = [] 61 | torch.cuda.empty_cache() 62 | 63 | def update(self, *args, **kwargs) -> None: 64 | embeddings_first = kwargs['embeddings_first'] 65 | embeddings_second = kwargs['embeddings_second'] 66 | temperature = kwargs['temperature'] 67 | self.embeddings_list_first.append(temperature*embeddings_first) 68 | self.embeddings_list_second.append(embeddings_second) 69 | 70 | def compute(self): 71 | raise NotImplementedError('This method is not supported') 72 | 73 | 74 | def compute_key_value(self): 75 | all_embeddings_first = torch.cat(self.embeddings_list_first).detach().cpu() 76 | all_embeddings_second = torch.cat(self.embeddings_list_second).detach().cpu() 77 | logits = all_embeddings_first@all_embeddings_second.T 78 | #labels = torch.arange(logits.shape[0]).to(logits.device) 79 | labels = torch.arange(logits.shape[0]) 80 | 81 | 
del(all_embeddings_first) 82 | del(all_embeddings_second) 83 | 84 | forward_accuracy = (torch.argmax(logits, dim=1)==labels).float().mean().item() 85 | backward_accuracy = (torch.argmax(logits, dim=0)==labels).float().mean().item() 86 | del(logits) 87 | 88 | avg_accuracy = 0.5*(forward_accuracy+backward_accuracy) 89 | 90 | loader_metrics = { 91 | 92 | 'forward_acc':forward_accuracy, 93 | 'backward_acc':backward_accuracy, 94 | 'avg_acc': avg_accuracy 95 | } 96 | return loader_metrics -------------------------------------------------------------------------------- /src/match_modality/methods/novel/resources/config_ADT2GEX.py: -------------------------------------------------------------------------------- 1 | LR = 7.79984e-05 2 | OPTIM = 'AdamW' 3 | weight_decay=0 4 | 5 | EMBEDDING_DIM = 64 6 | 7 | DROPOUT_RATES_FIRST = [0.0221735, 0.296919] 8 | DROPOUT_RATES_GEX = [0.0107121,0.254689] 9 | 10 | LAYERS_DIM_FIRST = [512, 2048] 11 | LAYERS_DIM_GEX = [1024, 512] 12 | 13 | LOG_T = 3.463735 14 | 15 | N_LSI_COMPONENTS_GEX = 128 16 | N_EPOCHS = 7000 17 | 18 | BATCH_SIZE = 2048 19 | 20 | SWAP_RATE_FIRST = 0. 21 | SWAP_RATE_GEX = 0. 
22 | -------------------------------------------------------------------------------- /src/match_modality/methods/novel/resources/config_ATAC2GEX.py: -------------------------------------------------------------------------------- 1 | #optimizer 2 | LR = 0.000585 3 | OPTIM = 'AdamW' 4 | weight_decay=0 5 | EMBEDDING_DIM = 256 6 | 7 | DROPOUT_RATES_FIRST = [0.661] 8 | DROPOUT_RATES_GEX = [ 0.541, 0.396] 9 | 10 | LAYERS_DIM_FIRST = [2048] 11 | LAYERS_DIM_GEX = [1024, 1024] 12 | 13 | LOG_T = 3.065016 14 | 15 | 16 | N_LSI_COMPONENTS_FIRST= 512 17 | N_LSI_COMPONENTS_GEX = 64 18 | 19 | N_EPOCHS = 7000 20 | 21 | BATCH_SIZE = 16384 -------------------------------------------------------------------------------- /src/match_modality/methods/novel/resources/data.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset,DataLoader 2 | 3 | class ModalityMatchingDataset(Dataset): 4 | def __init__( 5 | self, df_modality1, df_modality2 6 | ): 7 | super().__init__() 8 | 9 | self.df_modality1 = df_modality1.values 10 | self.df_modality2 = df_modality2.values 11 | 12 | 13 | def __len__(self): 14 | return self.df_modality1.shape[0] 15 | 16 | def __getitem__(self, index: int): 17 | x_modality_1 = self.df_modality1[index] 18 | x_modality_2 = self.df_modality2[index] 19 | return {'features_first':x_modality_1, 'features_second':x_modality_2} 20 | 21 | def get_dataloaders(mod1_train, mod2_train, sol_train, 22 | mod1_test, mod2_test, sol_test, NUM_WORKERS, BATCH_SIZE): 23 | 24 | mod2_train = mod2_train.iloc[sol_train.values.argmax(1)] 25 | mod2_test = mod2_test.iloc[sol_test.values.argmax(1)] 26 | 27 | dataset_train = ModalityMatchingDataset(mod1_train, mod2_train) 28 | data_train = DataLoader(dataset_train, BATCH_SIZE, shuffle = True, num_workers = NUM_WORKERS) 29 | 30 | dataset_test = ModalityMatchingDataset(mod1_test, mod2_test) 31 | data_test = DataLoader(dataset_test, BATCH_SIZE, shuffle = False, num_workers = 
NUM_WORKERS) 32 | 33 | return data_train, data_test 34 | 35 | -------------------------------------------------------------------------------- /src/match_modality/methods/novel/resources/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import Dataset,DataLoader 3 | import torch.nn.functional as F 4 | from torch import nn 5 | 6 | 7 | class BatchSwapNoise(nn.Module): 8 | """Swap Noise module""" 9 | def __init__(self, p): 10 | super().__init__() 11 | self.p = p 12 | 13 | def forward(self, x): 14 | if self.training: 15 | mask = torch.rand(x.size()) > (1 - self.p) 16 | idx = torch.add(torch.arange(x.nelement()), 17 | (torch.floor(torch.rand(x.size()) * x.size(0)).type(torch.LongTensor) * 18 | (mask.type(torch.LongTensor) * x.size(1))).view(-1)) 19 | idx[idx>=x.nelement()] = idx[idx>=x.nelement()]-x.nelement() 20 | return x.view(-1)[idx].view(x.size()) 21 | else: 22 | return x 23 | 24 | 25 | class Encoder(nn.Module): 26 | def __init__(self, n_input, embedding_size, dropout_rates, dims_layers, swap_noise_ratio): 27 | super(Encoder, self).__init__() 28 | dropout = [] 29 | layers = [] 30 | layers.append(nn.Linear(n_input, dims_layers[0])) 31 | 32 | for i in range(len(dims_layers)-1): 33 | layers.append(nn.Linear(dims_layers[i], dims_layers[i+1])) 34 | for i in range(len(dropout_rates)): 35 | dropout.append(nn.Dropout(p=dropout_rates[i])) 36 | 37 | layers.append(nn.Linear(dims_layers[-1], embedding_size)) 38 | 39 | self.fc_list = nn.ModuleList(layers) 40 | self.dropout_list = nn.ModuleList(dropout) 41 | 42 | def forward(self, x): 43 | for i in range(len(self.fc_list)-1): 44 | x = F.elu(self.fc_list[i](x)) 45 | if(i None: 108 | r""" 109 | LSI analysis (following the Seurat v3 approach) 110 | Parameters 111 | ---------- 112 | adata 113 | Input dataset 114 | n_components 115 | Number of dimensions to use 116 | use_highly_variable 117 | Whether to use highly variable features only, stored in 
118 | ``adata.var['highly_variable']``. By default uses them if they 119 | have been determined beforehand. 120 | **kwargs 121 | Additional keyword arguments are passed to 122 | :func:`sklearn.utils.extmath.randomized_svd` 123 | """ 124 | if use_highly_variable is None: 125 | use_highly_variable = "highly_variable" in adata.var 126 | adata_use = adata[:, adata.var["highly_variable"]] if use_highly_variable else adata 127 | X = tfidf(adata_use.X) 128 | X_norm = sklearn.preprocessing.Normalizer(norm="l1").fit_transform(X) 129 | X_norm = np.log1p(X_norm * 1e4) 130 | X_lsi = sklearn.utils.extmath.randomized_svd(X_norm, n_components, random_state=777, **kwargs)[0] 131 | X_lsi -= X_lsi.mean(axis=1, keepdims=True) 132 | X_lsi /= X_lsi.std(axis=1, ddof=1, keepdims=True) 133 | adata.obsm["X_lsi"] = X_lsi -------------------------------------------------------------------------------- /src/match_modality/methods/novel/run/config.vsh.yaml: -------------------------------------------------------------------------------- 1 | functionality: 2 | name: novel 3 | namespace: match_modality_methods 4 | 5 | description: The approach utilizes sample representations, learned in the same way as in the CLIP model. Encoders for all of the modalities are fully connected, the dimensionality of GEX and ATAC data is reduced via LSI transform (ADT is left as-is). Then, to obtain sample pairings, a maximum weight matching on a bipartite graph is performed, where weights are cosine similarities between sample embeddings. 
6 | info: 7 | method_label: "Novel" 8 | submission_id: "169594/170690" 9 | team_name: Novel 10 | 11 | authors: 12 | - name: Gleb Ryazantsev 13 | email: ryazantsev.gleb@gmail.com 14 | roles: [ author, maintainer ] 15 | - name: Nikolay Russkikh 16 | email: russkikh.nikolay@gmail.com 17 | roles: [ author, maintainer ] 18 | - name: Igor I 19 | email: herri.i.67@gmail.com 20 | roles: [ author, maintainer ] 21 | 22 | # parameters 23 | arguments: 24 | # required inputs 25 | - name: "--input_train_mod1" 26 | type: "file" 27 | example: "dataset_censored.h5ad" 28 | description: "The censored shuffled train mod1 profiles." 29 | required: true 30 | - name: "--input_train_mod2" 31 | type: "file" 32 | example: "dataset_censored.h5ad" 33 | description: "The censored shuffled train mod2 profiles." 34 | required: true 35 | - name: "--input_train_sol" 36 | type: "file" 37 | example: "dataset_solution.h5ad" 38 | description: "The pairing of train mod1&mod2 profiles." 39 | required: true 40 | - name: "--input_test_mod1" 41 | type: "file" 42 | example: "dataset_censored.h5ad" 43 | description: "The censored shuffled test mod1 profiles." 44 | required: true 45 | - name: "--input_test_mod2" 46 | type: "file" 47 | example: "dataset_censored.h5ad" 48 | description: "The censored shuffled test mod2 profiles." 49 | required: true 50 | - name: "--input_pretrain" 51 | type: "file" 52 | example: "pretrain_model" 53 | description: Path to the directory containing a pretrained model. 54 | required: true 55 | 56 | # required outputs 57 | - name: "--output" 58 | type: "file" 59 | direction: "output" 60 | example: "output.h5ad" 61 | description: "The predicted pairing of test mod1&mod2 profiles." 
62 | required: true 63 | 64 | # files your script needs 65 | resources: 66 | - type: python_script 67 | path: script.py 68 | - path: ../resources/catalyst_tools.py 69 | - path: ../resources/config_ADT2GEX.py 70 | - path: ../resources/config_ATAC2GEX.py 71 | - path: ../resources/data.py 72 | - path: ../resources/models.py 73 | - path: ../resources/postprocessing.py 74 | - path: ../resources/preprocessing.py 75 | 76 | # target platforms 77 | platforms: 78 | - type: docker 79 | image: "pytorch/pytorch:1.9.0-cuda10.2-cudnn7-runtime" 80 | run_args: [ "--gpus all --shm-size=5G" ] 81 | setup: 82 | - type: python 83 | packages: 84 | - catalyst 85 | - anndata 86 | - scikit-learn 87 | - networkx 88 | 89 | - type: nextflow 90 | labels: [ vhighmem, vvhightime, vhighcpu, gpu] 91 | -------------------------------------------------------------------------------- /src/match_modality/methods/novel/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash" 4 | export NXF_VER=21.04.1 5 | export PIPELINE_VERSION=1.4.0 6 | method_id=novel 7 | task_id=match_modality 8 | 9 | 10 | # CITE ADT2GEX 11 | dataset_id=openproblems_bmmc_cite_phase2_mod2 12 | dataset_id_val=openproblems_bmmc_cite_phase2_mod2 13 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 14 | dataset_path_val=output/datasets/$task_id/$dataset_id_val/$dataset_id_val.censor_dataset 15 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/ 16 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 17 | 18 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \ 19 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 20 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 21 | --input_train_sol ${dataset_path}.output_train_sol.h5ad \ 22 | --input_test_mod1 
${dataset_path}.output_test_mod1.h5ad \ 23 | --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \ 24 | --input_test_sol ${dataset_path}.output_test_sol.h5ad \ 25 | --output_pretrain ${pretrain_path} 26 | 27 | target/docker/${task_id}_methods/${method_id}/${method_id} \ 28 | --input_train_mod1 ${dataset_path_val}.output_train_mod1.h5ad \ 29 | --input_train_mod2 ${dataset_path_val}.output_train_mod2.h5ad \ 30 | --input_train_sol ${dataset_path_val}.output_train_sol.h5ad \ 31 | --input_test_mod1 ${dataset_path_val}.output_test_mod1.h5ad \ 32 | --input_test_mod2 ${dataset_path_val}.output_test_mod2.h5ad \ 33 | --input_pretrain ${pretrain_path} \ 34 | --output ${pred_path}.${method_id}.output.h5ad 35 | 36 | #CITE GEX2ADT 37 | dataset_id=openproblems_bmmc_cite_phase2_rna 38 | pretrain_dataset_id=openproblems_bmmc_cite_phase2_mod2 39 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 40 | pretrain_path=output/pretrain/$task_id/$method_id/$pretrain_dataset_id.${method_id}_train.output_pretrain/ 41 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 42 | 43 | target/docker/${task_id}_methods/${method_id}/${method_id} \ 44 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 45 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 46 | --input_train_sol ${dataset_path}.output_train_sol.h5ad \ 47 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \ 48 | --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \ 49 | --input_pretrain ${pretrain_path} \ 50 | --output ${pred_path}.${method_id}.output.h5ad 51 | 52 | 53 | 54 | # MULTIOME ATAC2GEX 55 | dataset_id=openproblems_bmmc_multiome_phase2_mod2 56 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 57 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/ 58 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 59 | 60 | 
target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \ 61 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 62 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 63 | --input_train_sol ${dataset_path}.output_train_sol.h5ad \ 64 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \ 65 | --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \ 66 | --input_test_sol ${dataset_path}.output_test_sol.h5ad \ 67 | --output_pretrain ${pretrain_path} 68 | 69 | target/docker/${task_id}_methods/${method_id}/${method_id} \ 70 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 71 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 72 | --input_train_sol ${dataset_path}.output_train_sol.h5ad \ 73 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \ 74 | --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \ 75 | --input_pretrain ${pretrain_path} \ 76 | --output ${pred_path}.${method_id}.output.h5ad 77 | 78 | # MULTIOME GEX2ATAC 79 | dataset_id=openproblems_bmmc_multiome_phase2_rna 80 | pretrain_dataset_id=openproblems_bmmc_multiome_phase2_mod2 81 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 82 | pretrain_path=output/pretrain/$task_id/$method_id/$pretrain_dataset_id.${method_id}_train.output_pretrain/ 83 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 84 | 85 | target/docker/${task_id}_methods/${method_id}/${method_id} \ 86 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 87 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 88 | --input_train_sol ${dataset_path}.output_train_sol.h5ad \ 89 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \ 90 | --input_test_mod2 ${dataset_path}.output_test_mod2.h5ad \ 91 | --input_pretrain ${pretrain_path} \ 92 | --output ${pred_path}.${method_id}.output.h5ad 93 | 94 | # RUN EVALUATION 95 | bin/nextflow run "$PIPELINE_REPO" \ 96 | -r "$PIPELINE_VERSION" \ 97 | -main-script 
"src/$task_id/workflows/evaluate_submission/main.nf" \ 98 | --solutionDir "output/datasets/$task_id" \ 99 | --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \ 100 | --publishDir "output/evaluation/$task_id/$method_id/" \ 101 | -latest \ 102 | -resume \ 103 | -c "src/resources/nextflow_moremem.config" 104 | 105 | cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json" -------------------------------------------------------------------------------- /src/match_modality/methods/novel/train/config.vsh.yaml: -------------------------------------------------------------------------------- 1 | functionality: 2 | name: novel_train 3 | namespace: match_modality_methods 4 | 5 | # metadata for your method 6 | 7 | description: The approach utilizes sample representations, learned in the same way as in the CLIP model. Encoders for all of the modalities are fully connected, the dimensionality of GEX and ATAC data is reduces via LSI transform (ADT is left as-is). Then, to obtain sample pairings, a maximum weight matching on a bipartite graph is performed, where weights are cosine similarities between sample embeddings. 8 | 9 | authors: 10 | - name: Gleb Ryazantsev 11 | email: ryazantsev.gleb@gmail.com 12 | roles: [ author, maintainer ] 13 | - name: Nikolay Russkikh 14 | email: russkikh.nikolay@gmail.com 15 | roles: [ author, maintainer ] 16 | - name: Igor I 17 | email: herri.i.67@gmail.com 18 | roles: [ author, maintainer ] 19 | 20 | # parameters 21 | arguments: 22 | # required inputs 23 | - name: "--input_train_mod1" 24 | type: "file" 25 | example: "dataset_mod1.h5ad" 26 | description: Censored dataset, training cells. 27 | required: true 28 | - name: "--input_train_mod2" 29 | type: "file" 30 | example: "dataset_mod2.h5ad" 31 | description: Censored dataset. 32 | required: true 33 | - name: "--input_train_sol" 34 | type: "file" 35 | example: "dataset_solution.h5ad" 36 | description: "The pairing of train mod1&mod2 profiles." 
37 | required: true 38 | - name: "--input_test_mod1" 39 | type: "file" 40 | example: "dataset_test_mod1.h5ad" 41 | description: Censored dataset, test cells. 42 | required: true 43 | - name: "--input_test_mod2" 44 | type: "file" 45 | example: "dataset_test_mod2.h5ad" 46 | description: Censored dataset. 47 | required: true 48 | - name: "--input_test_sol" 49 | type: "file" 50 | example: "dataset_solution.h5ad" 51 | description: "The pairing of test mod1&mod2 profiles." 52 | required: true 53 | 54 | # required outputs 55 | - name: "--output_pretrain" 56 | type: "file" 57 | direction: "output" 58 | example: "pretrain_model" 59 | description: Path to the directory containing a pretrained model. 60 | required: true 61 | 62 | # files your script needs 63 | resources: 64 | - type: python_script 65 | path: script.py 66 | - path: ../resources/catalyst_tools.py 67 | - path: ../resources/config_ADT2GEX.py 68 | - path: ../resources/config_ATAC2GEX.py 69 | - path: ../resources/data.py 70 | - path: ../resources/models.py 71 | - path: ../resources/postprocessing.py 72 | - path: ../resources/preprocessing.py 73 | 74 | # target platforms 75 | platforms: 76 | - type: docker 77 | image: "pytorch/pytorch:1.9.0-cuda10.2-cudnn7-runtime" 78 | run_args: [ "--gpus all --shm-size=5G" ] 79 | setup: 80 | - type: python 81 | packages: 82 | - catalyst 83 | - anndata 84 | - scikit-learn 85 | - networkx 86 | 87 | - type: nextflow 88 | labels: [ vhighmem, vvhightime, vhighcpu, gpu] 89 | -------------------------------------------------------------------------------- /src/predict_modality/methods/AXX/.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | -------------------------------------------------------------------------------- /src/predict_modality/methods/AXX/README.md: -------------------------------------------------------------------------------- 1 | # NeurIPS-Single-Cell-MultiModality 2 | 3 | Team: [Xueer 
Chen](https://github.com/xuerchen), [Jiwei Liu](https://github.com/daxiongshu) 4 | 5 | This folder contains our solution to the [OpenProblems-NeurIPS2021 Single-Cell Multimodal Data Integration](https://eval.ai/web/challenges/challenge-page/1111/overview). Our team AXX took the [4th place of the modality prediction task](https://eval.ai/web/challenges/challenge-page/1111/leaderboard/2860) in terms of overall ranking of 4 subtasks: namely `GEX to ADT`, `ADT to GEX`, `GEX to ATAC` and `ATAC to GEX`. Specifically, our methods ranked **3rd** in `GEX to ATAC` and **4th** in `GEX to ADT`. More details about the task can be found in the [competition webpage](https://openproblems.bio/neurips_docs/about_tasks/task1_modality_prediction/). 6 | 7 | 8 | -------------------------------------------------------------------------------- /src/predict_modality/methods/AXX/resources/const.py: -------------------------------------------------------------------------------- 1 | PATH = '.' 2 | OUT_PATH = '.' 3 | -------------------------------------------------------------------------------- /src/predict_modality/methods/AXX/resources/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pytorch_lightning as pl 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.utils.data import TensorDataset,DataLoader 6 | 7 | class MLP(pl.LightningModule): 8 | def __init__(self,in_dim,out_dim,ymean,config): 9 | super(MLP, self).__init__() 10 | self.ymean = ymean.cuda() 11 | H1 = config.H1 12 | H2 = config.H2 13 | p = config.dropout 14 | self.config = config 15 | self.fc1 = nn.Linear(in_dim, H1) 16 | self.fc2 = nn.Linear(H1,H2) 17 | self.fc3 = nn.Linear(H1+H2, out_dim) 18 | self.dp2 = nn.Dropout(p=p) 19 | 20 | def forward(self, x): 21 | x0 = x 22 | x1 = F.relu(self.fc1(x)) 23 | x1 = self.dp2(x1) 24 | x = F.relu(self.fc2(x1)) 25 | x = torch.cat([x,x1],dim=1) 26 | x = self.fc3(x) 27 | x = self.apply_mask(x) 28 | 
return x 29 | 30 | def apply_mask(self,yp): 31 | tmp = torch.ones_like(yp).float()*self.ymean 32 | mask = tmp Running method") 29 | out = subprocess.check_output([ 30 | command, 31 | "--input_train_mod1", testpar['input_train_mod1'], 32 | "--input_train_mod2", testpar['input_train_mod2'], 33 | "--input_test_mod1", testpar['input_test_mod1'], 34 | "--output", testpar['output'] 35 | ]).decode("utf-8") 36 | 37 | print("> Checking whether output files were created") 38 | assert path.exists(testpar['output']) 39 | 40 | print("> Reading h5ad files") 41 | ad_sol = ad.read_h5ad(testpar['input_test_mod2']) 42 | ad_pred = ad.read_h5ad(testpar['output']) 43 | 44 | print("> Checking dataset id") 45 | assert ad_pred.uns['dataset_id'] == ad_sol.uns['dataset_id'] 46 | 47 | print("> Checking method id", ad_pred.uns['method_id'], method_id) 48 | assert ad_pred.uns['method_id'] == method_id 49 | 50 | print("> Checking X") 51 | assert issparse(ad_pred.X) 52 | assert ad_pred.n_obs == ad_sol.n_obs 53 | assert ad_pred.n_vars == ad_sol.n_vars 54 | assert all(ad_pred.obs_names == ad_sol.obs_names) 55 | assert all(ad_pred.var_names == ad_sol.var_names) 56 | 57 | print("> Test succeeded!") -------------------------------------------------------------------------------- /src/predict_modality/methods/AXX/resources/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import pytorch_lightning as pl 3 | from torch.utils.data import TensorDataset,DataLoader 4 | from models import MLP 5 | from pytorch_lightning.callbacks import ModelCheckpoint 6 | from pytorch_lightning.loggers import TensorBoardLogger,WandbLogger 7 | from pathlib import Path 8 | import utils 9 | import anndata as ad 10 | import numpy as np 11 | import json 12 | from const import PATH, OUT_PATH 13 | 14 | def _train(X, y, Xt, yt, enable_ckpt, logger, yaml_path): 15 | config = utils.load_yaml(yaml_path) 16 | X = torch.from_numpy(X).float() 17 | y = 
torch.from_numpy(y).float() 18 | ymean = torch.mean(y,dim=0,keepdim=True) 19 | 20 | tr_ds = TensorDataset(X,y) 21 | nw = 0 if 'ATAC' in yaml_path else 4 22 | tr_loader = DataLoader(tr_ds, batch_size=config.batch_size,num_workers=nw, 23 | shuffle=True, drop_last=True) 24 | 25 | Xt = torch.from_numpy(Xt).float() 26 | yt = torch.from_numpy(yt).float() 27 | te_ds = TensorDataset(Xt,yt) 28 | te_loader = DataLoader(te_ds, batch_size=config.batch_size,num_workers=0, 29 | shuffle=False, drop_last=False) 30 | 31 | checkpoint_callback = ModelCheckpoint(monitor='valid_RMSE') 32 | if enable_ckpt: 33 | epochs = config.epochs 34 | cb = [checkpoint_callback] 35 | else: 36 | epochs = 1 37 | cb = None 38 | 39 | trainer = pl.Trainer(enable_checkpointing=enable_ckpt, logger=logger, 40 | gpus=1, max_epochs=epochs, 41 | callbacks=cb, 42 | progress_bar_refresh_rate=5) 43 | 44 | net = MLP(X.shape[1],y.shape[1],ymean,config) 45 | trainer.fit(net, tr_loader, te_loader) 46 | 47 | cp = 'best' if enable_ckpt else None 48 | yp = trainer.predict(net,te_loader,ckpt_path=cp) 49 | yp = torch.cat(yp,dim=0) 50 | 51 | score = ((yp-yt)**2).mean()**0.5 52 | print(f"VALID RMSE {score:.3f}") 53 | del trainer 54 | return score,yp.detach().numpy() 55 | 56 | 57 | def train(task,cp,wp,tr1,tr2): 58 | yaml_path = f'{cp}/yaml/mlp_{task}.yaml' 59 | yps = [] 60 | scores = [] 61 | 62 | msgs = {} 63 | for fold in range(3): 64 | 65 | run_name = f"{task}_fold_{fold}" 66 | save_path = f'{wp}/{run_name}' 67 | Path(save_path).mkdir(parents=True, exist_ok=True) 68 | 69 | X,y,Xt,yt = utils.split(tr1, tr2, fold) 70 | run_name = f'fold_{fold}' 71 | logger = TensorBoardLogger(save_path, name='') 72 | 73 | enable_ckpt = True 74 | 75 | score, yp = _train(X, y, Xt, yt, enable_ckpt, logger, yaml_path) 76 | yps.append(yp) 77 | scores.append(score) 78 | msg = f"{task} Fold {fold} RMSE {score:.3f}" 79 | msgs[f'Fold {fold}'] = f'{score:.3f}' 80 | print(msg) 81 | 82 | yp = np.concatenate(yps) 83 | score = np.mean(scores) 84 | 
msgs['Overall'] = f'{score:.3f}' 85 | print('Overall', f'{score:.3f}') -------------------------------------------------------------------------------- /src/predict_modality/methods/AXX/resources/yaml/mlp_ADT2GEX.yaml: -------------------------------------------------------------------------------- 1 | # sample config defaults file 2 | epochs: 3 | desc: Number of epochs to train over 4 | value: 10 5 | batch_size: 6 | desc: Size of each mini-batch 7 | value: 512 8 | H1: 9 | desc: Number of hidden neurons in 1st layer of MLP 10 | value: 256 11 | H2: 12 | desc: Number of hidden neurons in 2nd layer of MLP 13 | value: 128 14 | dropout: 15 | desc: probs of zeroing values 16 | value: 0 17 | lr: 18 | desc: learning rate 19 | value: 0.001 20 | wd: 21 | desc: weight decay 22 | value: 1e-5 23 | threshold: 24 | desc: threshold to set values to zero 25 | value: 0 26 | lr_schedule: 27 | desc: learning rate scheduler 28 | value: adam -------------------------------------------------------------------------------- /src/predict_modality/methods/AXX/resources/yaml/mlp_ATAC2GEX.yaml: -------------------------------------------------------------------------------- 1 | # sample config defaults file 2 | epochs: 3 | desc: Number of epochs to train over 4 | value: 10 5 | batch_size: 6 | desc: Size of each mini-batch 7 | value: 512 8 | H1: 9 | desc: Number of hidden neurons in 1st layer of MLP 10 | value: 256 11 | H2: 12 | desc: Number of hidden neurons in 2nd layer of MLP 13 | value: 128 14 | dropout: 15 | desc: probs of zeroing values 16 | value: 0.5 17 | lr: 18 | desc: learning rate 19 | value: 0.001 20 | wd: 21 | desc: weight decay 22 | value: 1e-5 23 | threshold: 24 | desc: threshold to set values to zero 25 | value: 0 26 | lr_schedule: 27 | desc: learning rate scheduler 28 | value: adam -------------------------------------------------------------------------------- /src/predict_modality/methods/AXX/resources/yaml/mlp_GEX2ADT.yaml: 
-------------------------------------------------------------------------------- 1 | # sample config defaults file 2 | epochs: 3 | desc: Number of epochs to train over 4 | value: 10 5 | batch_size: 6 | desc: Size of each mini-batch 7 | value: 512 8 | H1: 9 | desc: Number of hidden neurons in 1st layer of MLP 10 | value: 1024 11 | H2: 12 | desc: Number of hidden neurons in 2nd layer of MLP 13 | value: 512 14 | dropout: 15 | desc: probs of zeroing values 16 | value: 0 17 | lr: 18 | desc: learning rate 19 | value: 0.001 20 | wd: 21 | desc: weight decay 22 | value: 1e-5 23 | threshold: 24 | desc: threshold to set values to zero 25 | value: 0.05 26 | lr_schedule: 27 | desc: learning rate scheduler 28 | value: adam_cosin -------------------------------------------------------------------------------- /src/predict_modality/methods/AXX/run/config.vsh.yaml: -------------------------------------------------------------------------------- 1 | functionality: 2 | name: simplemlp 3 | namespace: predict_modality_methods 4 | 5 | # metadata for your method 6 | description: Ensemble of MLPs trained on different sites 7 | info: 8 | method_label: SimpleMLP 9 | submission_id: "170812" 10 | team_name: AXX 11 | # project_url: https://github.com/foo/bar 12 | # publication_doi: 10.1101/0123.45.67.890123 13 | # publication_url: https://arxiv.org/abs/1234.56789 14 | 15 | authors: 16 | - name: Xueer Chen 17 | email: xc2579@columbia.edu 18 | roles: [ author, maintainer ] 19 | props: { github: xuerchen } 20 | - name: Jiwei Liu 21 | email: jiweil@nvidia.com 22 | roles: [ author, maintainer ] 23 | props: { github: daxiongshu, orcid: "0000-0002-8799-9763" } 24 | 25 | # parameters 26 | arguments: 27 | # required inputs 28 | - name: "--input_train_mod1" 29 | type: "file" 30 | example: "dataset_mod1.h5ad" 31 | description: Censored dataset, training cells. 
32 | required: true 33 | - name: "--input_test_mod1" 34 | type: "file" 35 | example: "dataset_mod1.h5ad" 36 | description: Censored dataset, test cells. 37 | required: true 38 | - name: "--input_train_mod2" 39 | type: "file" 40 | example: "dataset_mod2.h5ad" 41 | description: Censored dataset. 42 | required: true 43 | - name: "--input_pretrain" 44 | type: "file" 45 | direction: "input" 46 | example: "pretrain_model" 47 | description: Path to the directory containing a pretrained model. 48 | required: true 49 | # required outputs 50 | - name: "--output" 51 | type: "file" 52 | direction: "output" 53 | example: "output.h5ad" 54 | description: Dataset with predicted values for modality2. 55 | required: true 56 | 57 | 58 | # files your script needs 59 | resources: 60 | - type: python_script 61 | path: script.py 62 | - path: ../resources/predict.py 63 | - path: ../resources/models.py 64 | - path: ../resources/utils.py 65 | - path: ../resources/const.py 66 | - path: ../resources/yaml 67 | 68 | # resources for unit testing your component 69 | tests: 70 | - type: python_script 71 | path: test.py 72 | - path: sample_data 73 | 74 | # target platforms 75 | platforms: 76 | 77 | # By specifying 'docker' platform, viash will build a standalone 78 | # executable which uses docker in the back end to run your method. 79 | - type: docker 80 | # you need to specify a base image that contains at least bash and python 81 | image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime 82 | run_args: [ "--gpus all --ipc=host"] 83 | # You can specify additional dependencies with 'setup'. 84 | # See https://viash.io/docs/reference_config/platform-docker/#setup-list 85 | # for more information on how to add more dependencies. 
86 | setup: 87 | # - type: apt 88 | # packages: 89 | # - bash 90 | # - type: python 91 | # packages: 92 | # - scanpy 93 | - type: python 94 | packages: 95 | - scikit-learn 96 | - anndata 97 | - scanpy 98 | - pytorch-lightning 99 | 100 | # By specifying a 'nextflow', viash will also build a viash module 101 | # which uses the docker container built above to also be able to 102 | # run your method as part of a nextflow pipeline. 103 | - type: nextflow 104 | labels: [ highmem, hightime, highcpu, gpu] 105 | 106 | # used for saturn cloud 107 | - type: native 108 | -------------------------------------------------------------------------------- /src/predict_modality/methods/AXX/run/script.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import anndata as ad 3 | import sys 4 | from scipy.sparse import csc_matrix 5 | 6 | from sklearn.decomposition import TruncatedSVD 7 | from sklearn.linear_model import LinearRegression 8 | import numpy as np 9 | 10 | logging.basicConfig(level=logging.INFO) 11 | 12 | ## VIASH START 13 | # Anything within this block will be removed by `viash` and will be 14 | # replaced with the parameters as specified in your config.vsh.yaml. 
15 | par = { 16 | 'input_train_mod1': 'output/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_train_mod1.h5ad', 17 | 'input_train_mod2': 'output/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_train_mod2.h5ad', 18 | 'input_test_mod1': 'output/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_test_mod1.h5ad', 19 | 'input_pretrain': 'path/to/model', 20 | 'output': 'output.h5ad' 21 | } 22 | meta = { 23 | 'resources_dir': 'src/predict_modality/methods/AXX/resources' 24 | } 25 | ## VIASH END 26 | sys.path.append(meta['resources_dir']) 27 | from predict import predict 28 | from utils import get_y_dim 29 | 30 | logging.info('Reading `h5ad` files...') 31 | input_train_mod1 = ad.read_h5ad(par['input_train_mod1']) 32 | input_train_mod2 = ad.read_h5ad(par['input_train_mod2']) 33 | input_test_mod1 = ad.read_h5ad(par['input_test_mod1']) 34 | 35 | y_dim,task = get_y_dim(par['input_test_mod1']) 36 | ymean = np.asarray(input_train_mod2.X.mean(axis=0)) 37 | if task == 'GEX2ATAC': 38 | y_pred = ymean*np.ones([input_test_mod1.shape[0],y_dim]) 39 | else: 40 | y_pred = predict(ymean,test_data_path=par['input_test_mod1'], 41 | folds=[0,1,2],cp=meta['resources_dir'], 42 | wp=par['input_pretrain']) 43 | 44 | y_pred = csc_matrix(y_pred) 45 | 46 | adata = ad.AnnData( 47 | X=y_pred, 48 | obs=input_test_mod1.obs, 49 | var=input_train_mod2.var, 50 | uns={ 51 | 'dataset_id': input_train_mod1.uns['dataset_id'], 52 | 'method_id': meta['functionality_name'], 53 | }, 54 | ) 55 | 56 | logging.info('Storing annotated data...') 57 | adata.write_h5ad(par['output'], compression = "gzip") 58 | -------------------------------------------------------------------------------- /src/predict_modality/methods/AXX/test.sh: -------------------------------------------------------------------------------- 1 | 
#!/bin/bash
#
# End-to-end smoke test for the `simplemlp` predict_modality method:
# train + predict on every phase-2 dataset, then run the evaluation workflow
# and print the final scores.

# Abort on the first failing command, on use of an unset variable, and on a
# failure anywhere in a pipeline. Without this a failed training or prediction
# step would be silently ignored and the evaluation would run on stale or
# missing prediction files.
set -euo pipefail

export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash"
export NXF_VER=21.04.1
export PIPELINE_VERSION=1.4.0
method_id=simplemlp
task_id=predict_modality

# run_dataset DATASET_ID [train|no_train]
#
# Runs the (optional) training component followed by the prediction component
# for one dataset. Paths are derived from DATASET_ID exactly as in the
# per-dataset stanzas this function replaces.
run_dataset() {
  local dataset_id="$1"
  local train_mode="${2:-train}"
  local dataset_path="output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset"
  local pretrain_path="output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/"
  local pred_path="output/predictions/$task_id/$dataset_id/$dataset_id"

  if [ "$train_mode" = "train" ]; then
    "target/docker/${task_id}_methods/${method_id}_train/${method_id}_train" \
      --input_train_mod1 "${dataset_path}.output_train_mod1.h5ad" \
      --input_train_mod2 "${dataset_path}.output_train_mod2.h5ad" \
      --output_pretrain "${pretrain_path}"
  fi

  "target/docker/${task_id}_methods/${method_id}/${method_id}" \
    --input_train_mod1 "${dataset_path}.output_train_mod1.h5ad" \
    --input_train_mod2 "${dataset_path}.output_train_mod2.h5ad" \
    --input_test_mod1 "${dataset_path}.output_test_mod1.h5ad" \
    --input_pretrain "${pretrain_path}" \
    --output "${pred_path}.${method_id}.output.h5ad"
}

# CITE GEX2ADT
run_dataset openproblems_bmmc_cite_phase2_rna

# CITE ADT2GEX
run_dataset openproblems_bmmc_cite_phase2_mod2

# MULTIOME GEX2ATAC
# No training step: the run script predicts the training-set mean for the
# GEX2ATAC task, so only the prediction component is executed here.
run_dataset openproblems_bmmc_multiome_phase2_rna no_train

# MULTIOME ATAC2GEX
run_dataset openproblems_bmmc_multiome_phase2_mod2

# RUN EVALUATION
bin/nextflow run "$PIPELINE_REPO" \
  -r "$PIPELINE_VERSION" \
  -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \
  --solutionDir "output/datasets/$task_id" \
  --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \
  --publishDir "output/evaluation/$task_id/$method_id/" \
  -latest \
  -resume \
  -c "src/resources/nextflow_moremem.config"

cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json"
44 | required: true 45 | 46 | # files your script needs 47 | resources: 48 | - type: python_script 49 | path: script.py 50 | - path: ../resources/train.py 51 | - path: ../resources/models.py 52 | - path: ../resources/utils.py 53 | - path: ../resources/const.py 54 | - path: ../resources/yaml 55 | 56 | # target platforms 57 | platforms: 58 | 59 | # By specifying 'docker' platform, viash will build a standalone 60 | # executable which uses docker in the back end to run your method. 61 | - type: docker 62 | # you need to specify a base image that contains at least bash and python 63 | image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime 64 | run_args: [ "--gpus all --ipc=host"] 65 | # You can specify additional dependencies with 'setup'. 66 | # See https://viash.io/docs/reference_config/platform-docker/#setup-list 67 | # for more information on how to add more dependencies. 68 | setup: 69 | # - type: apt 70 | # packages: 71 | # - bash 72 | # - type: python 73 | # packages: 74 | # - scanpy 75 | - type: python 76 | packages: 77 | - scikit-learn 78 | - anndata 79 | - scanpy 80 | - pytorch-lightning 81 | 82 | # By specifying a 'nextflow', viash will also build a viash module 83 | # which uses the docker container built above to also be able to 84 | # run your method as part of a nextflow pipeline. 
import os
import logging
import anndata as ad
import pickle
import numpy as np
import pandas as pd
import scanpy as sc
from sklearn.preprocessing import binarize

logging.basicConfig(level=logging.INFO)

## VIASH START
# Placeholder values used when running the script directly; the section
# between the VIASH markers is a stand-in for values supplied at build time.
par = {
    'input_train_mod1': 'output/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_train_mod1.h5ad',
    'input_train_mod2': 'output/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_train_mod2.h5ad',
    'output_pretrain': 'path/to/model'
}

meta = {
    'resources_dir': 'src/predict_modality/methods/AXX/resources'
}
## VIASH END

# The training code lives in the resources directory, so that directory has
# to be on the import path before `train` can be imported.
import sys
sys.path.append(meta['resources_dir'])
from train import train


input_train_mod1 = ad.read_h5ad(par['input_train_mod1'])
input_train_mod2 = ad.read_h5ad(par['input_train_mod2'])

# `var["feature_types"]` is a Series indexed by feature name. Plain `[0]` on
# such a Series relies on the deprecated "integer key treated as position"
# fallback, so the first entry is taken explicitly by position with `.iloc`.
mod_1 = input_train_mod1.var["feature_types"].iloc[0]
mod_2 = input_train_mod2.var["feature_types"].iloc[0]

os.makedirs(par['output_pretrain'], exist_ok=True)

# Task identifier of the form "<mod1>2<mod2>", built from the first feature
# type of each training modality.
task = f'{mod_1}2{mod_2}'
# NOTE(review): `cp` / `wp` presumably mean "config path" / "weights path";
# confirm against resources/train.py.
train(task,
      cp=meta['resources_dir'],
      wp=par['output_pretrain'],
      tr1=input_train_mod1,
      tr2=input_train_mod2)
def baseline_linear(input_train_mod1, input_train_mod2, input_test_mod1, n_components=50):
    '''Baseline method training a linear regressor on the input data.

    Both modalities are reduced with TruncatedSVD, a linear regression is
    fitted from the reduced modality 1 space to the reduced modality 2 space,
    and the predictions for the test cells are projected back to the
    modality 2 feature space.

    Args:
        input_train_mod1: training matrix of modality 1 (rows are cells).
        input_train_mod2: training matrix of modality 2 (rows are cells).
        input_test_mod1: test matrix of modality 1 (rows are cells).
        n_components: requested number of SVD components per modality
            (default 50, the value that used to be hard-coded). It is capped
            at ``n_features - 1`` for each modality so that narrow inputs do
            not make TruncatedSVD raise.

    Returns:
        Predicted modality 2 values for the test cells, with one column per
        modality 2 feature.
    '''

    # TruncatedSVD rejects n_components >= n_features in the scikit-learn
    # release pinned by this method, so cap the request per modality. Inputs
    # with more than `n_components` features are unaffected.
    n_comp_mod1 = max(1, min(n_components, input_train_mod1.shape[1] - 1))
    n_comp_mod2 = max(1, min(n_components, input_train_mod2.shape[1] - 1))

    # Do PCA on the input data
    logging.info('Performing dimensionality reduction on modality 1 values...')
    embedder_mod1 = TruncatedSVD(n_components=n_comp_mod1)
    X_train = embedder_mod1.fit_transform(input_train_mod1)
    # The test cells are projected with the embedding fitted on training cells.
    X_test = embedder_mod1.transform(input_test_mod1)

    logging.info('Performing dimensionality reduction on modality 2 values...')
    embedder_mod2 = TruncatedSVD(n_components=n_comp_mod2)
    y_train = embedder_mod2.fit_transform(input_train_mod2)

    logging.info('Running Linear regression...')

    reg = LinearRegression()

    # Train the model on the PCA reduced modality 1 and 2 data
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    # Project the predictions back to the modality 2 feature space
    y_pred = y_pred @ embedder_mod2.components_

    return y_pred
7 | info: 8 | method_label: "DANCE" 9 | submission_id: "171129" 10 | team_name: DANCE 11 | # project_url: https://github.com/foo/bar 12 | # publication_doi: 10.1101/0123.45.67.890123 13 | # publication_url: https://arxiv.org/abs/1234.56789 14 | 15 | authors: 16 | - name: Hongzhi Wen 17 | email: wenhongz@msu.edu 18 | roles: [ author, maintainer ] 19 | - name: Jiayuan Ding 20 | email: dingjia5@msu.edu 21 | roles: [ author, maintainer ] 22 | - name: Wei Jin 23 | email: jinwei2@msu.edu 24 | roles: [ author ] 25 | - name: Xiaoyan Li 26 | email: lixiaoy5@msu.edu 27 | roles: [ author ] 28 | - name: Zhaoheng Li 29 | email: zli1@macalester.edu 30 | roles: [ author ] 31 | - name: Haoyu Han 32 | email: hanhaoy1@msu.edu 33 | roles: [ assistant ] 34 | - name: Yuying Xie 35 | email: xyy@msu.edu 36 | roles: [ advisor ] 37 | - name: Jiliang Tang 38 | email: tangjili@msu.edu 39 | roles: [ advisor ] 40 | 41 | 42 | # parameters 43 | arguments: 44 | # required inputs 45 | - name: "--input_train_mod1" 46 | type: "file" 47 | example: "dataset_mod1.h5ad" 48 | description: Censored dataset, training cells. 49 | required: true 50 | - name: "--input_test_mod1" 51 | type: "file" 52 | example: "dataset_mod1.h5ad" 53 | description: Censored dataset, test cells. 54 | required: true 55 | - name: "--input_train_mod2" 56 | type: "file" 57 | example: "dataset_mod2.h5ad" 58 | description: Censored dataset. 59 | required: true 60 | - name: "--input_pretrain" 61 | type: "file" 62 | example: "pretrain_model" 63 | description: Path to the directory containing a pretrained model. 64 | required: true 65 | # required outputs 66 | - name: "--output" 67 | type: "file" 68 | direction: "output" 69 | example: "output.h5ad" 70 | description: Dataset with predicted values for modality2. 
import os
import logging
import anndata as ad
import numpy as np
import json
import sys
import re
from scipy.sparse import csc_matrix


logging.basicConfig(level=logging.INFO)

## VIASH START
# Placeholder values used when running the script directly.
dataset_path = "output/datasets/predict_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_"
pretrain_path = "output/pretrain/predict_modality/dance/openproblems_bmmc_cite_phase2_rna.dance_train.output_pretrain/"

par = {
    'input_train_mod1': f'{dataset_path}train_mod1.h5ad',
    'input_train_mod2': f'{dataset_path}train_mod2.h5ad',
    'input_test_mod1': f'{dataset_path}test_mod1.h5ad',
    'input_pretrain': pretrain_path,
    'output': 'output.h5ad'
}
meta = {
    'resources_dir': 'src/predict_modality/methods/DANCE/resources',
    'functionality_name': '171129'
}
## VIASH END

logging.info('Reading `h5ad` files...')
train_mod1 = ad.read_h5ad(par['input_train_mod1'])
# `.iloc[0]` takes the first feature type by position; plain `[0]` on a
# label-indexed Series relies on a deprecated positional fallback.
mod1 = train_mod1.var['feature_types'].iloc[0]
dataset_id = train_mod1.uns['dataset_id']
input_train_mod1 = train_mod1.X

train_mod2 = ad.read_h5ad(par['input_train_mod2'])
var = train_mod2.var
mod2 = train_mod2.var['feature_types'].iloc[0]
input_train_mod2 = train_mod2.X

test_mod1 = ad.read_h5ad(par['input_test_mod1'])
obs = test_mod1.obs
input_test_mod1 = test_mod1.X

if mod1 == 'GEX':
    # GEX -> ADT / ATAC: average the predictions of four pretrained models.
    sys.path.append(meta['resources_dir'])
    # WeightedGCN4 is not referenced by name below.
    # NOTE(review): it is presumably imported so the pickled models can be
    # restored by torch.load; confirm before removing.
    from graph_util import graph_construction, WeightedGCN4

    import torch

    # # This will get passed to the method
    FEATURE_SIZE = train_mod1.shape[1]
    OUTPUT_SIZE = train_mod2.shape[1]
    TRAIN_SIZE = train_mod1.shape[0]
    TEST_SIZE = test_mod1.shape[0]

    g, bf = graph_construction(meta, train_mod1, train_mod2, test_mod1, pretrain_path=par['input_pretrain'])

    class Dict(dict):
        """dict whose items are also readable and writable as attributes."""
        __setattr__ = dict.__setitem__
        __getattr__ = dict.__getitem__

    def dict2obj(dictObj):
        """Recursively convert nested dicts into attribute-accessible `Dict`s."""
        if not isinstance(dictObj, dict):
            return dictObj
        d = Dict()
        for k, v in dictObj.items():
            d[k] = dict2obj(v)
        return d

    def evaluate(mod, args):
        """Run `mod` on the graph and return the rows of the last TEST_SIZE nodes."""
        mod.eval()
        with torch.no_grad():
            logits = mod(g, bf, args)
            logits = logits[-TEST_SIZE:]
        return logits

    def build_args(LOG_FILE_PATH):
        """Rebuild the training arguments from the first line of a log file.

        The first line is rewritten from its `Namespace(...)` form into JSON
        text, parsed, and returned as an attribute-accessible `Dict`.
        """
        # Context manager so the log file handle is closed deterministically.
        with open(LOG_FILE_PATH, 'r') as log_file:
            string = log_file.readline()
        string = string.replace('Namespace', '').replace('=', ':').replace('(', '{ ').replace(')', '}').replace("'", '"').replace(',', ',\n').replace('True', 'true').replace('False','false')
        string = re.sub('[ ](.*?):', r' "\1":', string)
        args = json.loads(string)
        return dict2obj(args)

    # Pretrained checkpoints that make up the ensemble for each target modality.
    ensemble_models = {
        'ADT': ['f_alpha_conv4_mean_fullbatch_12000_phase2_inductive_batch_speration.pkl', 'bf_alpha_conv4_mean_fullbatch_10000_phase2_inductive_gex2adt_2.pkl', 'bf_alpha_conv4_mean_fullbatch_12000_phase2_inductive_gex2adt_sep_2.pkl', 'bf_alpha_conv4_mean_fullbatch_15000_phase2_inductive.pkl'],
        'ATAC': ['bf_alpha_conv4_mean_fullbatch_8000_phase2_inductive_gex2atac_3.pkl', 'bf_alpha_conv4_mean_fullbatch_8000_phase2_inductive_gex2atac_2.pkl', 'bf_alpha_conv4_mean_fullbatch_8000_phase2_inductive_gex2atac.pkl', 'bf_alpha_conv4_mean_fullbatch_10000_phase2_inductive_gex2atac.pkl'],
    }

    if mod2 not in ensemble_models:
        # Previously this case fell through and failed later with a NameError
        # on `y_pred`; fail early with an explicit message instead.
        raise ValueError(f"No pretrained models available for GEX -> {mod2}")

    model_names = ensemble_models[mod2]
    y_sum = None
    for model_name in model_names:
        model_path = os.path.join(par['input_pretrain'], model_name)
        # Each checkpoint has a sibling `.log` file holding its arguments.
        args = build_args(model_path.replace('.pkl', '.log'))
        # SECURITY NOTE: torch.load unpickles arbitrary objects; only point
        # `input_pretrain` at trusted checkpoint directories.
        model = torch.load(model_path, map_location='cpu')
        pred = evaluate(model, args).numpy()
        # Accumulate in list order so the sum matches y0 + y1 + y2 + y3.
        y_sum = pred if y_sum is None else y_sum + pred
        del model, args

    y_pred = csc_matrix(y_sum / len(model_names))

elif mod1=='ATAC' and mod2=='GEX':
    # ATAC -> GEX: predict the per-feature training mean for every test cell.
    y_pred = csc_matrix(np.tile(np.mean(input_train_mod2.toarray(), 0), (input_test_mod1.shape[0], 1)))

else:
    # Remaining tasks: linear baseline fitted without the 's3d1' batch.
    sys.path.append(meta['resources_dir'])
    from baseline import baseline_linear

    input_train_mod1 = train_mod1[train_mod1.obs['batch']!='s3d1'].X
    input_train_mod2 = train_mod2[train_mod2.obs['batch']!='s3d1'].X
    y_pred = csc_matrix(baseline_linear(input_train_mod1, input_train_mod2, input_test_mod1))

adata = ad.AnnData(
    X=y_pred,
    obs=obs,
    var=var,
    uns={
        'dataset_id': dataset_id,
        'method_id': meta['functionality_name'],
    },
)

logging.info('Storing annotated data...')
adata.write_h5ad(par['output'], compression = "gzip")
#!/bin/bash
#
# End-to-end smoke test for the `dance` predict_modality method: generate the
# pretrained models once, predict on every phase-2 dataset, then run the
# evaluation workflow and print the final scores.

# Abort on the first failing command, on use of an unset variable, and on a
# failure anywhere in a pipeline, so a failed step cannot be followed by an
# evaluation of stale or missing prediction files.
set -euo pipefail

export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash"
export NXF_VER=21.04.1
export PIPELINE_VERSION=1.4.0
method_id=dance
task_id=predict_modality

# GENERATE PRETRAIN
# One pretrain directory is shared by all four datasets below.
pretrain_path="output/pretrain/$task_id/$method_id/pretrain.${method_id}_train.output_pretrain/"

"target/docker/${task_id}_methods/${method_id}_train/${method_id}_train" \
  --data_dir "output/datasets/$task_id" \
  --output_pretrain "${pretrain_path}"

# run_dataset DATASET_ID
#
# Runs the prediction component for one dataset. Paths are derived from
# DATASET_ID exactly as in the per-dataset stanzas this function replaces.
run_dataset() {
  local dataset_id="$1"
  local dataset_path="output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset"
  local pred_path="output/predictions/$task_id/$dataset_id/$dataset_id"

  "target/docker/${task_id}_methods/${method_id}/${method_id}" \
    --input_train_mod1 "${dataset_path}.output_train_mod1.h5ad" \
    --input_train_mod2 "${dataset_path}.output_train_mod2.h5ad" \
    --input_test_mod1 "${dataset_path}.output_test_mod1.h5ad" \
    --input_pretrain "${pretrain_path}" \
    --output "${pred_path}.${method_id}.output.h5ad"
}

# CITE GEX2ADT
run_dataset openproblems_bmmc_cite_phase2_rna

# CITE ADT2GEX
run_dataset openproblems_bmmc_cite_phase2_mod2

# MULTIOME GEX2ATAC
run_dataset openproblems_bmmc_multiome_phase2_rna

# MULTIOME ATAC2GEX
run_dataset openproblems_bmmc_multiome_phase2_mod2

# RUN EVALUATION
bin/nextflow run "$PIPELINE_REPO" \
  -r "$PIPELINE_VERSION" \
  -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \
  --solutionDir "output/datasets/$task_id" \
  --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \
  --publishDir "output/evaluation/$task_id/$method_id/" \
  -latest \
  -resume \
  -c "src/resources/nextflow_moremem.config"

cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json"
8 | authors: 9 | - name: Hongzhi Wen 10 | email: wenhongz@msu.edu 11 | roles: [ author, maintainer ] 12 | - name: Jiayuan Ding 13 | email: dingjia5@msu.edu 14 | roles: [ author, maintainer ] 15 | - name: Wei Jin 16 | email: jinwei2@msu.edu 17 | roles: [ author ] 18 | - name: Xiaoyan Li 19 | email: lixiaoy5@msu.edu 20 | roles: [ author ] 21 | - name: Zhaoheng Li 22 | email: zli1@macalester.edu 23 | roles: [ author ] 24 | - name: Haoyu Han 25 | email: hanhaoy1@msu.edu 26 | roles: [ assistant ] 27 | - name: Yuying Xie 28 | email: xyy@msu.edu 29 | roles: [ advisor ] 30 | - name: Jiliang Tang 31 | email: tangjili@msu.edu 32 | roles: [ advisor ] 33 | 34 | # parameters 35 | arguments: 36 | # required inputs 37 | - name: "--data_dir" 38 | type: "file" 39 | description: The path to the predict_modality datasets 40 | required: true 41 | 42 | # required outputs 43 | - name: "--output_pretrain" 44 | type: "file" 45 | direction: "output" 46 | example: "pretrain_model" 47 | description: Path to the directory containing the pretrained models. 
import anndata as ad
import pickle
import numpy as np
from collections import defaultdict
import random

import argparse


parser = argparse.ArgumentParser()
parser.add_argument('-d', '--data_folder', default = './data/public/phase2-data/predict_modality/')
parser.add_argument('-ef', '--extra_files_folder', default = './')
# Optional and backward compatible: without --seed the random train/validation
# split below is unseeded, exactly as before.
parser.add_argument('-s', '--seed', type=int, default=None,
                    help='Seed for the random split stored in phase2_mask.pkl (default: unseeded)')

args = parser.parse_args()


def _extra_file(name):
    """Path of `name` inside the extra-files folder."""
    return args.extra_files_folder + '/' + name


def _read_train_mod1(subtask):
    """Read the `train_mod1` h5ad file of `subtask` from the data folder."""
    subtask_folder = args.data_folder + '/' + subtask + '/'
    subtask_filename = subtask_folder + subtask + '.censor_dataset.output_{}.h5ad'
    return ad.read_h5ad(subtask_filename.format('train_mod1'))


def _dump(obj, name):
    """Pickle `obj` into the extra-files folder, closing the file afterwards."""
    with open(_extra_file(name), 'wb') as handle:
        pickle.dump(obj, handle)


def load_pw():
    """Load the gene sets from the two GMT files in the extra-files folder.

    The entrez file is read first to collect the gene-set names; the symbols
    file is then split into one list of tokens per gene set.

    Returns:
        A list with one list of symbol tokens per gene set.
    """
    with open(_extra_file('h.all.v7.4.entrez.gmt')) as gmt:
        gene_list = gmt.read().split()
    gene_sets_entrez = defaultdict(list)

    # Non-numeric tokens alternate between "start of a gene set" (indicator
    # 0 -> 1, token kept as the set name) and "token to skip" (indicator
    # 1 -> 0). Numeric tokens are appended to the current set.
    indicator = 0
    for ele in gene_list:
        if not ele.isnumeric() and indicator == 1:
            indicator = 0
            continue
        if not ele.isnumeric() and indicator == 0:
            indicator = 1
            gene_set_name = ele
        else:
            gene_sets_entrez[gene_set_name].append(ele)

    with open(_extra_file('h.all.v7.4.symbols.gmt')) as gmt:
        gene_list = gmt.read().split()
    gene_sets_symbols = defaultdict(list)

    for ele in gene_list:
        if ele in gene_sets_entrez:
            # A known gene-set name starts a new set.
            gene_set_name = ele
        elif not ele.startswith( 'http://' ):
            gene_sets_symbols[gene_set_name].append(ele)

    return list(gene_sets_symbols.values())


def graph_construct(train_mod1):
    """Build weighted feature-feature edges from the module-level gene sets `pw`.

    For every gene set, every ordered pair of distinct features that are both
    present in `train_mod1` becomes an edge weighted by the cosine similarity
    of the two feature columns of `train_mod1.X`.

    Returns:
        (uu, vv, ee): source column indices, target column indices and the
        edge weights. Each weight is the 1x1 array produced by the column dot
        product, as before.
    """
    input_train_mod1 = train_mod1.X
    feature_index = train_mod1.var['feature_types'].index.tolist()

    # Map each feature name to its first position once, instead of calling
    # `list.index` (a linear scan) for every gene of every gene set.
    feature_pos = {}
    for pos, name in enumerate(feature_index):
        feature_pos.setdefault(name, pos)

    new_pw = [[feature_pos[j] for j in i if j in feature_pos] for i in pw]

    # cos similarity weight
    uu=[]
    vv=[]
    ee=[]
    for i in new_pw:
        # Densify each column of this gene set once and reuse it for all of
        # its pairs. The values are computed by the same expressions as
        # before, so the resulting weights are unchanged.
        cols = {j: input_train_mod1[:,j].toarray() for j in set(i)}
        norms = {j: np.sqrt(np.dot(col.T, col).item()) for j, col in cols.items()}
        for j in i:
            for k in i:
                if j!=k:
                    uu.append(j)
                    vv.append(k)
                    jk = np.dot(cols[j].T, cols[k])
                    cossim = jk/norms[j]/norms[k]
                    ee.append(cossim)

    return uu, vv, ee


print("Loading pw")
pw = load_pw()

print("Generating 'pw.pkl'")
# Generate pw.pkl
uu, vv, ee = graph_construct(_read_train_mod1('openproblems_bmmc_cite_phase2_rna'))
_dump([uu,vv,ee], 'pw.pkl')

print("Generating 'pw_multiome.pkl'")
uu, vv, ee = graph_construct(_read_train_mod1('openproblems_bmmc_multiome_phase2_rna'))
_dump([uu,vv,ee], 'pw_multiome.pkl')

print("Generating 'phase2_mask.pkl'")
subtasks = ['openproblems_bmmc_cite_phase2_rna', 'openproblems_bmmc_cite_phase2_mod2', 'openproblems_bmmc_multiome_phase2_rna', 'openproblems_bmmc_multiome_phase2_mod2']
task_names = ['gex2adt', 'adt2gex', 'gex2atac', 'atac2gex']
mask = {}

if args.seed is not None:
    random.seed(args.seed)

for subtask in subtasks:
    mask[subtask] = {}
    train_mod1 = _read_train_mod1(subtask)
    n_cells = train_mod1.X.shape[0]
    l = list(range(n_cells))
    random.shuffle(l)
    # Random 85% / 15% split of the cell positions.
    train_size = int(n_cells * 0.85)
    mask[subtask]['train'] = l[:train_size]
    # Same elements as the former `l[-valid_size:]`, without the `-0` slice
    # pitfall.
    mask[subtask]['test'] = l[train_size:]

_dump(mask, 'phase2_mask.pkl')

print("Generating 'phase2_mask_sep.pkl'")
train_mod1 = _read_train_mod1('openproblems_bmmc_cite_phase2_rna')


def get_index(batch):
    """Positions in the module-level `train_mod1` of the cells from `batch`."""
    # Build the name -> first position map once per call, instead of
    # re-creating the full index list and scanning it for every cell.
    first_pos = {}
    for pos, name in enumerate(train_mod1.obs['batch'].index):
        first_pos.setdefault(name, pos)
    selected = train_mod1[train_mod1.obs['batch']==batch].obs['batch'].index
    return [first_pos[i] for i in selected]


s3d1 = get_index('s3d1')
s3d7 = get_index('s3d7')
s1d2 = get_index('s1d2')

# Batches s3d7 and s1d2 form the test split; s3d1 is left out of both splits.
test = s3d7+s1d2
# Build the exclusion set once; the former `i not in (test + s3d1)` rebuilt
# and scanned the concatenated list for every cell.
excluded = set(test + s3d1)
train = [i for i in range(train_mod1.X.shape[0]) if i not in excluded]

gex2adt = {}
gex2adt['test'] = test
gex2adt['train'] = train

mask = {}
mask['openproblems_bmmc_cite_phase2_rna'] = gex2adt
_dump(mask, 'phase2_mask_sep.pkl')
55 | 56 | # files your script needs 57 | resources: 58 | - type: python_script 59 | path: script.py 60 | 61 | # target platforms 62 | platforms: 63 | - type: docker 64 | image: dataintuitive/randpy:py3.8 65 | setup: 66 | 67 | - type: python 68 | packages: 69 | - scikit-learn 70 | - anndata 71 | - pandas 72 | - numpy 73 | - scanpy 74 | 75 | - type: nextflow 76 | labels: [ vhighmem, vvhightime, vhighcpu ] 77 | -------------------------------------------------------------------------------- /src/predict_modality/methods/Guanlab-dengkw/run/script.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import anndata as ad 3 | import numpy as np 4 | 5 | from scipy.sparse import csc_matrix 6 | 7 | from sklearn.decomposition import TruncatedSVD 8 | from sklearn.gaussian_process.kernels import RBF 9 | from sklearn.kernel_ridge import KernelRidge 10 | 11 | logging.basicConfig(level=logging.INFO) 12 | 13 | ## VIASH START 14 | par = { 15 | 'input_train_mod1': 'sample_data/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod1.h5ad', 16 | 'input_train_mod2': 'sample_data/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod2.h5ad', 17 | 'input_test_mod1': 'sample_data/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod1.h5ad', 18 | 'output': 'output.h5ad', 19 | } 20 | meta = { 'functionality_name': 'submission_170636' } 21 | ## VIASH END 22 | 23 | logging.info('Reading `h5ad` files...') 24 | input_train_mod1 = ad.read_h5ad(par['input_train_mod1']) 25 | input_train_mod2 = ad.read_h5ad(par['input_train_mod2']) 26 | input_test_mod1 = ad.read_h5ad(par['input_test_mod1']) 27 | 28 | pred_dimx = input_test_mod1.shape[0] 29 | pred_dimy = input_train_mod2.shape[1] 30 | 31 | feature_obs = input_train_mod1.obs 32 | gs_obs = input_train_mod2.obs 33 | 34 | batches = input_train_mod1.obs.batch.unique().tolist() 35 | batch_len = len(batches) 36 | 37 | obs = 
input_test_mod1.obs 38 | var = input_train_mod2.var 39 | dataset_id = input_train_mod1.uns['dataset_id'] 40 | 41 | input_train = ad.concat( 42 | {"train": input_train_mod1, "test": input_test_mod1}, 43 | axis=0, 44 | join="outer", 45 | label="group", 46 | fill_value=0, 47 | index_unique="-" 48 | ) 49 | 50 | logging.info('Determine parameters by the modalities') 51 | mod1_type = input_train_mod1.var.feature_types[0] 52 | mod1_type = mod1_type.upper() 53 | mod2_type = input_train_mod2.var.feature_types[0] 54 | mod2_type = mod2_type.upper() 55 | n_comp_dict = { 56 | ("GEX", "ADT"): (300, 70, 10, 0.2), 57 | ("ADT", "GEX"): (None, 50, 10, 0.2), 58 | ("GEX", "ATAC"): (1000, 50, 10, 0.1), 59 | ("ATAC", "GEX"): (100, 70, 10, 0.1) 60 | } 61 | logging.info(f"{mod1_type}, {mod2_type}") 62 | n_mod1, n_mod2, scale, alpha = n_comp_dict[(mod1_type, mod2_type)] 63 | logging.info(f"{n_mod1}, {n_mod2}, {scale}, {alpha}") 64 | 65 | # Do PCA on the input data 66 | logging.info('Models using the Truncated SVD to reduce the dimension') 67 | 68 | if n_mod1 is not None and n_mod1 < input_train.shape[1]: 69 | embedder_mod1 = TruncatedSVD(n_components=n_mod1) 70 | mod1_pca = embedder_mod1.fit_transform(input_train.X).astype(np.float32) 71 | train_matrix = mod1_pca[input_train.obs['group'] == 'train'] 72 | test_matrix = mod1_pca[input_train.obs['group'] == 'test'] 73 | else: 74 | train_matrix = input_train_mod1.to_df().values.astype(np.float32) 75 | test_matrix = input_test_mod1.to_df().values.astype(np.float32) 76 | 77 | if n_mod2 is not None and n_mod2 < input_train_mod2.shape[1]: 78 | embedder_mod2 = TruncatedSVD(n_components=n_mod2) 79 | train_gs = embedder_mod2.fit_transform(input_train_mod2.X).astype(np.float32) 80 | else: 81 | train_gs = input_train_mod2.to_df().values.astype(np.float32) 82 | 83 | del input_train 84 | del input_train_mod1 85 | del input_train_mod2 86 | del input_test_mod1 87 | 88 | logging.info('Running normalization ...') 89 | train_sd = np.std(train_matrix, 
axis=1).reshape(-1, 1) 90 | train_sd[train_sd == 0] = 1 91 | train_norm = (train_matrix - np.mean(train_matrix, axis=1).reshape(-1, 1)) / train_sd 92 | train_norm = train_norm.astype(np.float32) 93 | del train_matrix 94 | 95 | test_sd = np.std(test_matrix, axis=1).reshape(-1, 1) 96 | test_sd[test_sd == 0] = 1 97 | test_norm = (test_matrix - np.mean(test_matrix, axis=1).reshape(-1, 1)) / test_sd 98 | test_norm = test_norm.astype(np.float32) 99 | del test_matrix 100 | 101 | logging.info('Running KRR model ...') 102 | y_pred = np.zeros((pred_dimx, pred_dimy), dtype=np.float32) 103 | np.random.seed(1000) 104 | 105 | for _ in range(5): 106 | np.random.shuffle(batches) 107 | for batch in [batches[:batch_len//2], batches[batch_len//2:]]: 108 | # for passing the test 109 | if not batch: 110 | batch = [batches[0]] 111 | 112 | logging.info(batch) 113 | kernel = RBF(length_scale = scale) 114 | krr = KernelRidge(alpha=alpha, kernel=kernel) 115 | logging.info('Fitting KRR ... ') 116 | krr.fit(train_norm[feature_obs.batch.isin(batch)], 117 | train_gs[gs_obs.batch.isin(batch)]) 118 | y_pred += (krr.predict(test_norm) @ embedder_mod2.components_) 119 | 120 | np.clip(y_pred, a_min=0, a_max=None, out=y_pred) 121 | if mod2_type == "ATAC": 122 | np.clip(y_pred, a_min=0, a_max=1, out=y_pred) 123 | 124 | y_pred /= 10 125 | 126 | # Store as sparse matrix to be efficient. Note that this might require 127 | # different classifiers/embedders before-hand. Not every class is able 128 | # to support such data structures. 
129 | y_pred = csc_matrix(y_pred) 130 | 131 | logging.info("Generate anndata object ...") 132 | adata = ad.AnnData( 133 | X=y_pred, 134 | obs=obs, 135 | var=var, 136 | uns={ 137 | 'dataset_id': dataset_id, 138 | 'method_id': meta['functionality_name'], 139 | }, 140 | ) 141 | 142 | logging.info('Storing annotated data...') 143 | adata.write_h5ad(par['output'], compression = "gzip") 144 | -------------------------------------------------------------------------------- /src/predict_modality/methods/Guanlab-dengkw/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash" 4 | export NXF_VER=21.04.1 5 | export PIPELINE_VERSION=1.4.0 6 | method_id=guanlab_dengkw_pm 7 | task_id=predict_modality 8 | 9 | # CITE GEX2ADT 10 | dataset_id=openproblems_bmmc_cite_phase2_rna 11 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 12 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 13 | 14 | target/docker/${task_id}_methods/${method_id}/${method_id} \ 15 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 16 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 17 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \ 18 | --output ${pred_path}.${method_id}.output.h5ad 19 | 20 | # CITE ADT2GEX 21 | dataset_id=openproblems_bmmc_cite_phase2_mod2 22 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 23 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 24 | 25 | target/docker/${task_id}_methods/${method_id}/${method_id} \ 26 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 27 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 28 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \ 29 | --output ${pred_path}.${method_id}.output.h5ad 30 | 31 | 32 | # MULTIOME GEX2ATAC 33 | 
dataset_id=openproblems_bmmc_multiome_phase2_rna 34 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 35 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 36 | 37 | target/docker/${task_id}_methods/${method_id}/${method_id} \ 38 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 39 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 40 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \ 41 | --output ${pred_path}.${method_id}.output.h5ad 42 | 43 | 44 | # MULTIOME ATAC2GEX 45 | dataset_id=openproblems_bmmc_multiome_phase2_mod2 46 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 47 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 48 | 49 | target/docker/${task_id}_methods/${method_id}/${method_id} \ 50 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 51 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 52 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \ 53 | --output ${pred_path}.${method_id}.output.h5ad 54 | 55 | 56 | # RUN EVALUATION 57 | bin/nextflow run "$PIPELINE_REPO" \ 58 | -r "$PIPELINE_VERSION" \ 59 | -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \ 60 | --solutionDir "output/datasets/$task_id" \ 61 | --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \ 62 | --publishDir "output/evaluation/$task_id/$method_id/" \ 63 | -latest \ 64 | -resume \ 65 | -c "src/resources/nextflow_moremem.config" 66 | 67 | cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json" 68 | -------------------------------------------------------------------------------- /src/predict_modality/methods/LS_lab/run/config.vsh.yaml: -------------------------------------------------------------------------------- 1 | functionality: 2 | name: lslab 3 | namespace: predict_modality_methods 4 | 5 | # metadata for 
your method 6 | description: A description for your method. 7 | info: 8 | method_label: "LS_Lab" 9 | submission_id: "171123" 10 | team_name: LS_lab 11 | # project_url: https://github.com/foo/bar 12 | # publication_doi: 10.1101/0123.45.67.890123 13 | # publication_url: https://arxiv.org/abs/1234.56789 14 | 15 | authors: 16 | - name: Aidyn Ubingazhibov 17 | email: aidyn.ubingazhibov@nu.edu.kz 18 | roles: [ author, maintainer ] 19 | props: { github: aidynabirov } 20 | 21 | # parameters 22 | arguments: 23 | # required inputs 24 | - name: "--input_train_mod1" 25 | type: "file" 26 | example: "dataset_mod1.h5ad" 27 | description: Censored dataset, training cells. 28 | required: true 29 | - name: "--input_test_mod1" 30 | type: "file" 31 | example: "dataset_mod1.h5ad" 32 | description: Censored dataset, test cells. 33 | required: true 34 | - name: "--input_train_mod2" 35 | type: "file" 36 | example: "dataset_mod2.h5ad" 37 | description: Censored dataset. 38 | required: true 39 | # required outputs 40 | - name: "--output" 41 | type: "file" 42 | direction: "output" 43 | example: "output.h5ad" 44 | description: Dataset with predicted values for modality2. 
45 | required: true 46 | 47 | # files your script needs 48 | resources: 49 | - type: python_script 50 | path: script.py 51 | 52 | # target platforms 53 | platforms: 54 | - type: docker 55 | image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime 56 | run_args: [ "--gpus all" ] 57 | setup: 58 | - type: python 59 | packages: 60 | - scikit-learn 61 | - catboost 62 | - anndata 63 | - scanpy 64 | - tqdm 65 | - type: nextflow 66 | labels: [ vhighmem, vvhightime, highcpu, gpu] 67 | -------------------------------------------------------------------------------- /src/predict_modality/methods/LS_lab/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash" 4 | export NXF_VER=21.04.1 5 | export PIPELINE_VERSION=1.4.0 6 | method_id=lslab 7 | task_id=predict_modality 8 | 9 | # CITE GEX2ADT 10 | dataset_id=openproblems_bmmc_cite_phase2_rna 11 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 12 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 13 | 14 | target/docker/${task_id}_methods/${method_id}/${method_id} \ 15 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 16 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 17 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \ 18 | --output ${pred_path}.${method_id}.output.h5ad 19 | 20 | # CITE ADT2GEX 21 | dataset_id=openproblems_bmmc_cite_phase2_mod2 22 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 23 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 24 | 25 | target/docker/${task_id}_methods/${method_id}/${method_id} \ 26 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 27 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 28 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \ 29 | --output 
${pred_path}.${method_id}.output.h5ad 30 | 31 | 32 | # MULTIOME GEX2ATAC 33 | dataset_id=openproblems_bmmc_multiome_phase2_rna 34 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 35 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 36 | 37 | target/docker/${task_id}_methods/${method_id}/${method_id} \ 38 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 39 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 40 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \ 41 | --output ${pred_path}.${method_id}.output.h5ad 42 | 43 | 44 | # MULTIOME ATAC2GEX 45 | dataset_id=openproblems_bmmc_multiome_phase2_mod2 46 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 47 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 48 | 49 | target/docker/${task_id}_methods/${method_id}/${method_id} \ 50 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 51 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 52 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \ 53 | --output ${pred_path}.${method_id}.output.h5ad 54 | 55 | 56 | # RUN EVALUATION 57 | bin/nextflow run "$PIPELINE_REPO" \ 58 | -r "$PIPELINE_VERSION" \ 59 | -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \ 60 | --solutionDir "output/datasets/$task_id" \ 61 | --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \ 62 | --publishDir "output/evaluation/$task_id/$method_id/" \ 63 | -latest \ 64 | -resume \ 65 | -c "src/resources/nextflow_moremem.config" 66 | 67 | cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json" -------------------------------------------------------------------------------- /src/predict_modality/methods/cajal/run/config.vsh.yaml: -------------------------------------------------------------------------------- 1 | functionality: 2 | name: 
cajal 3 | namespace: predict_modality_methods 4 | 5 | # metadata for your method 6 | description: A description for your method. 7 | info: 8 | method_label: "Cajal" 9 | submission_id: "170613" 10 | team_name: Cajal 11 | # project_url: https://github.com/foo/bar 12 | # publication_doi: 10.1101/0123.45.67.890123 13 | # publication_url: https://arxiv.org/abs/1234.56789 14 | 15 | authors: 16 | - name: Anna Laddach 17 | email: anna.laddach@crick.ac.uk 18 | roles: [ author, maintainer ] 19 | props: { github: AnnaLaddach, orcid: "0000-0001-5552-6534" } 20 | - name: Roman Laddach 21 | email: roman.laddach@kcl.ac.uk 22 | roles: [ author, maintainer ] 23 | props: { github: rladdach, orcid: "0000-0002-0118-4548" } 24 | - name: Michael Shapiro 25 | email: michael.shapiro@crick.ac.uk 26 | roles: [ author, maintainer ] 27 | props: { github: michaeldshapiro, orcid: "0000-0002-2769-9320" } 28 | 29 | # parameters 30 | arguments: 31 | # required inputs 32 | - name: "--input_train_mod1" 33 | type: "file" 34 | example: "dataset_mod1.h5ad" 35 | description: Censored dataset, training cells. 36 | required: true 37 | - name: "--input_test_mod1" 38 | type: "file" 39 | example: "dataset_mod1.h5ad" 40 | description: Censored dataset, test cells. 41 | required: true 42 | - name: "--input_train_mod2" 43 | type: "file" 44 | example: "dataset_mod2.h5ad" 45 | description: Censored dataset. 46 | required: true 47 | - name: "--input_pretrain" 48 | type: "file" 49 | example: "pretrain_model" 50 | description: Path to the directory containing a pretrained model. 51 | required: true 52 | 53 | # required outputs 54 | - name: "--output" 55 | type: "file" 56 | direction: "output" 57 | example: "output.h5ad" 58 | description: Dataset with predicted values for modality2. 
59 | required: true 60 | 61 | # files your script needs 62 | resources: 63 | - type: python_script 64 | path: script.py 65 | 66 | # target platforms 67 | platforms: 68 | - type: docker 69 | image: tensorflow/tensorflow:2.5.0-gpu 70 | run_args: [ "--gpus all" ] 71 | setup: 72 | - type: python 73 | packages: 74 | - scikit-learn 75 | - anndata 76 | - scanpy 77 | - tensorflow 78 | - pandas 79 | - type: nextflow 80 | labels: [ vhighmem, vvhightime, highcpu, gpu] 81 | -------------------------------------------------------------------------------- /src/predict_modality/methods/cajal/run/script.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import anndata as ad 3 | import pickle 4 | import numpy as np 5 | 6 | from scipy.sparse import csc_matrix 7 | 8 | import tensorflow as tf 9 | import scanpy as sc 10 | 11 | logging.basicConfig(level=logging.INFO) 12 | 13 | ## VIASH START 14 | par = { 15 | 'input_train_mod1': 'sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod1.h5ad', 16 | 'input_train_mod2': 'sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod2.h5ad', 17 | 'input_test_mod1': 'sample_data/predict_modality/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod1.h5ad', 18 | 'input_pretrain': 'path/to/model', 19 | 'output': 'output.h5ad' 20 | } 21 | meta = { 'functionality_name': 'cajal' } 22 | ## VIASH END 23 | 24 | logging.info('Reading `h5ad` files...') 25 | input_train_mod1 = ad.read_h5ad(par['input_train_mod1']) 26 | input_train_mod2 = ad.read_h5ad(par['input_train_mod2']) 27 | input_test_mod1 = ad.read_h5ad(par['input_test_mod1']) 28 | 29 | #get modalities 30 | mod_1 = input_train_mod1.var["feature_types"][0] 31 | mod_2 = input_train_mod2.var["feature_types"][0] 32 | 33 | 34 | test_total = np.sum(input_test_mod1.layers['counts'].toarray(), axis=1) 35 | 36 | if mod_1 == 
"GEX": 37 | input_test_mod1.X = input_test_mod1.layers['counts'] 38 | sc.pp.normalize_per_cell(input_test_mod1, counts_per_cell_after=1e6) 39 | sc.pp.log1p(input_test_mod1) 40 | 41 | with open(par["input_pretrain"] + "/genes.pkl", "rb") as f: 42 | genes = pickle.load(f) 43 | input_test_mod1 = input_test_mod1[:,genes] 44 | 45 | if mod_1 == "GEX": 46 | input_train_mod1.X = input_train_mod1.layers['counts'] 47 | sc.pp.normalize_per_cell(input_train_mod1, counts_per_cell_after=1e6) 48 | sc.pp.log1p(input_train_mod1) 49 | 50 | X_test = input_test_mod1.X.toarray() 51 | 52 | test_batches = set(input_test_mod1.obs.batch) 53 | 54 | input_test_mod1.obs["batch_median"] = 0 55 | 56 | input_test_mod1.obs["batch_sd"] = 0 57 | 58 | for batch in test_batches: 59 | input_test_mod1.obs["batch_median"][input_test_mod1.obs.batch == batch] = np.median(test_total[input_test_mod1.obs.batch == batch]) 60 | input_test_mod1.obs["batch_sd"][input_test_mod1.obs.batch == batch] = np.std(test_total[input_test_mod1.obs.batch == batch]) 61 | 62 | 63 | for i in range(50): 64 | X_test = np.column_stack((X_test,test_total)) 65 | 66 | for i in range(50): 67 | X_test = np.column_stack((X_test,input_test_mod1.obs["batch_median"])) 68 | 69 | for i in range(50): 70 | X_test = np.column_stack((X_test,input_test_mod1.obs["batch_sd"])) 71 | 72 | with open(par["input_pretrain"] + "/transformation.pkl", "rb") as f: 73 | info = pickle.load(f) 74 | 75 | X_test = X_test.T 76 | X_test = (X_test - info["means"])/info["sds"] 77 | X_test = X_test.T 78 | 79 | 80 | #load pretrained model for correct modalities 81 | model = tf.keras.models.load_model(par["input_pretrain"] + "/model.h5") 82 | 83 | #make predictions for y 84 | y_pred = model.predict(X_test) 85 | 86 | #convert to sparse matrix 87 | y_pred = csc_matrix(y_pred) 88 | 89 | adata = ad.AnnData( 90 | X=y_pred, 91 | obs=input_test_mod1.obs, 92 | var=input_train_mod2.var, 93 | uns={ 94 | 'dataset_id': input_train_mod1.uns['dataset_id'], 95 | 'method_id': "cajal", 
96 | }, 97 | ) 98 | 99 | 100 | logging.info('Storing annotated data...') 101 | adata.write_h5ad(par['output'], compression = "gzip") 102 | -------------------------------------------------------------------------------- /src/predict_modality/methods/cajal/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash" 4 | export NXF_VER=21.04.1 5 | export PIPELINE_VERSION=1.4.0 6 | method_id=cajal 7 | task_id=predict_modality 8 | 9 | # CITE GEX2ADT 10 | dataset_id=openproblems_bmmc_cite_phase2_rna 11 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 12 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/ 13 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 14 | 15 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \ 16 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 17 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 18 | --input_explore_mod1 output/datasets_explore/cite/cite_gex_processed_training.h5ad \ 19 | --input_explore_mod2 output/datasets_explore/cite/cite_adt_processed_training.h5ad \ 20 | --output_pretrain ${pretrain_path} 21 | 22 | target/docker/${task_id}_methods/${method_id}/${method_id} \ 23 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 24 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 25 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \ 26 | --input_pretrain ${pretrain_path} \ 27 | --output ${pred_path}.${method_id}.output.h5ad 28 | 29 | # CITE ADT2GEX 30 | dataset_id=openproblems_bmmc_cite_phase2_mod2 31 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 32 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/ 33 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 34 | 35 
| target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \ 36 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 37 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 38 | --input_explore_mod1 output/datasets_explore/cite/cite_adt_processed_training.h5ad \ 39 | --input_explore_mod2 output/datasets_explore/cite/cite_gex_processed_training.h5ad \ 40 | --output_pretrain ${pretrain_path} 41 | 42 | target/docker/${task_id}_methods/${method_id}/${method_id} \ 43 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 44 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 45 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \ 46 | --input_pretrain ${pretrain_path} \ 47 | --output ${pred_path}.${method_id}.output.h5ad 48 | 49 | 50 | # MULTIOME GEX2ATAC 51 | dataset_id=openproblems_bmmc_multiome_phase2_rna 52 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 53 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/ 54 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 55 | 56 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \ 57 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 58 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 59 | --input_explore_mod1 output/datasets_explore/multiome/multiome_gex_processed_training.h5ad \ 60 | --input_explore_mod2 output/datasets_explore/multiome/multiome_atac_processed_training.h5ad \ 61 | --output_pretrain ${pretrain_path} 62 | 63 | target/docker/${task_id}_methods/${method_id}/${method_id} \ 64 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 65 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 66 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \ 67 | --input_pretrain ${pretrain_path} \ 68 | --output ${pred_path}.${method_id}.output.h5ad 69 | 70 | # MULTIOME ATAC2GEX 71 | 
dataset_id=openproblems_bmmc_multiome_phase2_mod2 72 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 73 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/ 74 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 75 | 76 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \ 77 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 78 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 79 | --input_explore_mod1 output/datasets_explore/multiome/multiome_atac_processed_training.h5ad \ 80 | --input_explore_mod2 output/datasets_explore/multiome/multiome_gex_processed_training.h5ad \ 81 | --output_pretrain ${pretrain_path} 82 | 83 | target/docker/${task_id}_methods/${method_id}/${method_id} \ 84 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 85 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 86 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \ 87 | --input_pretrain ${pretrain_path} \ 88 | --output ${pred_path}.${method_id}.output.h5ad 89 | 90 | # RUN EVALUATION 91 | bin/nextflow run "$PIPELINE_REPO" \ 92 | -r "$PIPELINE_VERSION" \ 93 | -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \ 94 | --solutionDir "output/datasets/$task_id" \ 95 | --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \ 96 | --publishDir "output/evaluation/$task_id/$method_id/" \ 97 | -latest \ 98 | -resume \ 99 | -c "src/resources/nextflow_moremem.config" 100 | 101 | cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json" -------------------------------------------------------------------------------- /src/predict_modality/methods/cajal/train/ADT_list_df_updated.csv: -------------------------------------------------------------------------------- 1 | markers,gene_name, 2 | CD86,CD86, 3 | CD274,CD274, 4 | CD270,TNFRSF14, 5 | CD155,PVR, 6 | CD112,NECTIN2, 7 | CD47,CD47, 8 | 
CD48,CD48, 9 | CD40,CD40, 10 | CD154,CD40LG, 11 | CD52,CD52, 12 | CD3,CD3E,ambiguity 13 | CD8,CD8A, 14 | CD56,NCAM1, 15 | CD19,CD19, 16 | CD33,CD33, 17 | CD11c,ITGAX, 18 | HLA-A-B-C,HLA-A,ambiguity 19 | HLA-A-B-C,HLA-B,ambiguity 20 | HLA-A-B-C,HLA-C,ambiguity 21 | CD45RA,PTPRC,ambiguity 22 | CD123,IL3RA, 23 | CD7,CD7, 24 | CD105,ENG, 25 | CD49f,ITGA6, 26 | CD194,CCR4, 27 | CD4,CD4, 28 | CD44,CD44, 29 | CD14,CD14, 30 | CD16,FCGR3A, 31 | CD25,IL2RA, 32 | CD45RO,PTPRC,ambiguity 33 | CD279,PDCD1, 34 | TIGIT,TIGIT, 35 | CD20,MS4A1, 36 | CD335,NCR1, 37 | CD31,PECAM1, 38 | Podoplanin,PDPN, 39 | CD146,MCAM, 40 | IgM,IGHM, 41 | CD5,CD5, 42 | CD195,CCR5, 43 | CD32,FCGR2A, 44 | CD196,CCR6, 45 | CD185,CXCR5, 46 | CD103,ITGAE, 47 | CD69,CD69, 48 | CD62L,SELL, 49 | CD161,KLRB1, 50 | CD152,CTLA4, 51 | CD223,LAG3, 52 | KLRG1,KLRG1, 53 | CD27,CD27, 54 | CD107a,LAMP1, 55 | CD95,FAS, 56 | CD134,TNFRSF4, 57 | HLA-DR,HLA-DRB1, 58 | CD1c,CD1C, 59 | CD11b,ITGAM, 60 | CD64,FCGR1A, 61 | CD141,THBD, 62 | CD1d,CD1D, 63 | CD314,KLRK1, 64 | CD35,CR1, 65 | CD57,B3GAT1, 66 | CD272,BTLA, 67 | CD278,ICOS, 68 | CD58,CD58, 69 | CD39,ENTPD1, 70 | CX3CR1,CX3CR1, 71 | CD24,CD24, 72 | CD21,CR2, 73 | CD11a,ITGAL, 74 | CD79b,CD79B, 75 | CD244,CD244, 76 | CD169,SIGLEC1, 77 | integrinB7,ITGB7, 78 | CD268,TNFRSF13C, 79 | CD42b,GP1BA, 80 | CD54,ICAM1, 81 | CD62P,SELP, 82 | CD119,IFNGR1, 83 | TCR,TRA, 84 | TCR,TRB, 85 | TCR,TRG, 86 | TCR,TRD, 87 | CD192,CCR2, 88 | CD122,IL2RB, 89 | FceRIa,FCER1A, 90 | CD41,ITGA2B, 91 | CD137,TNFRSF9, 92 | CD163,CD163, 93 | CD83,CD83, 94 | CD124,IL4R, 95 | CD13,ANPEP, 96 | CD2,CD2, 97 | CD226,CD226, 98 | CD29,ITGB1, 99 | CD303,CLEC4C, 100 | CD49b,ITGA2, 101 | CD81,CD81, 102 | IgD,IGHD, 103 | CD18,ITGB2, 104 | CD28,CD28, 105 | CD38,CD38, 106 | CD127,IL7R, 107 | CD45,PTPRC,ambiguity 108 | CD22,CD22, 109 | CD71,TFRC, 110 | CD26,DPP4, 111 | CD115,CSF1R, 112 | CD63,CD63, 113 | CD304,NRP1, 114 | CD36,CD36, 115 | CD172a,SIRPA, 116 | CD72,CD72, 117 | CD158,KIR2DL3, 118 | CD93,CD93, 
119 | CD49a,ITGA1, 120 | CD49d,ITGA4, 121 | CD73,NT5E, 122 | CD9,CD9, 123 | TCRVa7.2,?,ambiguity 124 | TCRVd2,?,ambiguity 125 | LOX-1,OLR1, 126 | CD158b,KIR2DL3, 127 | CD158e1,KIR3DL1, 128 | CD142,F3, 129 | CD319,SLAMF7, 130 | CD352,SLAMF6, 131 | CD94,KLRD1, 132 | CD162,SELPLG, 133 | CD85j,LILRB1, 134 | CD23,FCER2, 135 | CD328,SIGLEC7, 136 | HLA-E,HLA-E, 137 | CD82,CD82, 138 | CD101,CD101, 139 | CD88,C5AR1, 140 | CD224,GGT1, 141 | -------------------------------------------------------------------------------- /src/predict_modality/methods/cajal/train/config.vsh.yaml: -------------------------------------------------------------------------------- 1 | functionality: 2 | name: cajal_train 3 | namespace: predict_modality_methods 4 | 5 | # metadata for your method 6 | description: A description for your method. 7 | info: 8 | submission_id: "170613" 9 | team_name: Cajal 10 | # project_url: https://github.com/foo/bar 11 | # publication_doi: 10.1101/0123.45.67.890123 12 | # publication_url: https://arxiv.org/abs/1234.56789 13 | 14 | authors: 15 | - name: Anna Laddach 16 | email: anna.laddach@crick.ac.uk 17 | roles: [ author, maintainer ] 18 | props: { github: AnnaLaddach, orcid: "0000-0001-5552-6534" } 19 | - name: Roman Laddach 20 | email: roman.laddach@kcl.ac.uk 21 | roles: [ author, maintainer ] 22 | props: { github: rladdach, orcid: "0000-0002-0118-4548" } 23 | - name: Michael Shapiro 24 | email: michael.shapiro@crick.ac.uk 25 | roles: [ author, maintainer ] 26 | props: { github: michaeldshapiro, orcid: "0000-0002-2769-9320" } 27 | 28 | # parameters 29 | arguments: 30 | # required inputs 31 | - name: "--input_train_mod1" 32 | type: "file" 33 | example: "dataset_mod1.h5ad" 34 | description: Censored dataset, training cells. 35 | required: true 36 | - name: "--input_train_mod2" 37 | type: "file" 38 | example: "dataset_mod2.h5ad" 39 | description: Censored dataset. 
40 | required: true 41 | - name: "--input_explore_mod1" 42 | type: "file" 43 | example: "dataset_mod1.h5ad" 44 | description: Explore version of the modality 1 dataset. 45 | required: true 46 | - name: "--input_explore_mod2" 47 | type: "file" 48 | example: "dataset_mod2.h5ad" 49 | description: Explore version of the modality 2 dataset. 50 | required: true 51 | 52 | # required outputs 53 | - name: "--output_pretrain" 54 | type: "file" 55 | direction: "output" 56 | example: "pretrain_model" 57 | description: Path to the directory containing a pretrained model. 58 | required: true 59 | 60 | # files your script needs 61 | resources: 62 | - type: python_script 63 | path: script.py 64 | - path: ADT_list_df_updated.csv 65 | 66 | # target platforms 67 | platforms: 68 | - type: docker 69 | image: tensorflow/tensorflow:2.5.0-gpu 70 | run_args: [ "--gpus all" ] 71 | setup: 72 | - type: python 73 | packages: 74 | - scikit-learn 75 | - anndata 76 | - scanpy 77 | - tensorflow 78 | - pandas 79 | - type: nextflow 80 | labels: [ vhighmem, vvhightime, highcpu, gpu] -------------------------------------------------------------------------------- /src/predict_modality/methods/novel/README.md: -------------------------------------------------------------------------------- 1 | # NeurIPS-Single-Cell-MultiModality 2 | 3 | Team Novel: Gleb Ryazantsev, Nikolay Russkikh, Igor I 4 | 5 | The task is solved via training encoder-decoder MLP model with one output neuron per component in the target. As an input, the encoders use representations obtained from ATAC and GEX data via LSI transform and raw ADT data. 
The hyperparameters of the models were found via broad hyperparameter search using the Optuna framework 6 | 7 | -------------------------------------------------------------------------------- /src/predict_modality/methods/novel/novel_architecture.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openproblems-bio/neurips2021_multimodal_topmethods/5782a87a3bb46b30eb264d85cca999724aaaf7d2/src/predict_modality/methods/novel/novel_architecture.jpg -------------------------------------------------------------------------------- /src/predict_modality/methods/novel/run/config.vsh.yaml: -------------------------------------------------------------------------------- 1 | functionality: 2 | name: novel 3 | namespace: predict_modality_methods 4 | 5 | # metadata for your method 6 | description: The task is solved via training encoder-decoder MLP model with one output neuron per component in the target. As an input, the encoders use representations obtained from ATAC and GEX data via LSI transform and raw ADT data. The hyperparameters of the models were found via broad hyperparameter search using the Optuna framework. 7 | info: 8 | method_label: "Novel" 9 | submission_id: "169769" 10 | team_name: Novel 11 | # project_url: https://github.com/foo/bar 12 | # publication_doi: 10.1101/0123.45.67.890123 13 | # publication_url: https://arxiv.org/abs/1234.56789 14 | 15 | authors: 16 | - name: Gleb Ryazantsev 17 | email: ryazantsev.gleb@gmail.com 18 | roles: [ author, maintainer ] 19 | - name: Nikolay Russkikh 20 | email: russkikh.nikolay@gmail.com 21 | roles: [ author, maintainer ] 22 | - name: Igor I 23 | email: herri.i.67@gmail.com 24 | roles: [ author, maintainer ] 25 | 26 | # parameters 27 | arguments: 28 | # required inputs 29 | - name: "--input_train_mod1" 30 | type: "file" 31 | example: "dataset_mod1.h5ad" 32 | description: Censored dataset, training cells. 
33 | required: true 34 | - name: "--input_test_mod1" 35 | type: "file" 36 | example: "dataset_mod1.h5ad" 37 | description: Censored dataset, test cells. 38 | required: true 39 | - name: "--input_train_mod2" 40 | type: "file" 41 | example: "dataset_mod2.h5ad" 42 | description: Censored dataset. 43 | required: true 44 | - name: "--input_pretrain" 45 | type: "file" 46 | example: "pretrain_model" 47 | description: Path to the directory containing a pretrained model. 48 | required: true 49 | # required outputs 50 | - name: "--output" 51 | type: "file" 52 | direction: "output" 53 | example: "output.h5ad" 54 | description: Dataset with predicted values for modality2. 55 | required: true 56 | 57 | # files your script needs 58 | resources: 59 | - type: python_script 60 | path: script.py 61 | - path: ../resources/helper_functions.py 62 | 63 | # target platforms 64 | platforms: 65 | - type: docker 66 | image: "pytorch/pytorch:1.9.0-cuda10.2-cudnn7-runtime" 67 | setup: 68 | - type: python 69 | packages: 70 | - anndata 71 | - scikit-learn 72 | - networkx 73 | 74 | - type: nextflow 75 | labels: [ lowmem, lowtime, lowcpu ] 76 | -------------------------------------------------------------------------------- /src/predict_modality/methods/novel/run/script.py: -------------------------------------------------------------------------------- 1 | import anndata as ad 2 | import pickle 3 | import torch 4 | 5 | from torch.utils.data import DataLoader 6 | 7 | import sys 8 | 9 | import numpy as np 10 | 11 | from scipy.sparse import csc_matrix 12 | 13 | ## VIASH START 14 | dataset_path = "output/datasets/match_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_" 15 | pretrain_path = "output/pretrain/match_modality/clue/openproblems_bmmc_cite_phase2_rna.clue_train.output_pretrain/" 16 | 17 | par = { 18 | 'input_train_mod1': f'{dataset_path}train_mod1.h5ad', 19 | 'input_train_mod2': f'{dataset_path}train_mod2.h5ad', 20 | 'input_test_mod1': 
f'{dataset_path}test_mod1.h5ad', 21 | 'input_pretrain': pretrain_path, 22 | 'output': 'output.h5ad' 23 | } 24 | meta = { 25 | 'resources_dir': '.', 26 | 'functionality_name': '169769' 27 | } 28 | ## VIASH END 29 | 30 | sys.path.append(meta['resources_dir']) 31 | from helper_functions import ModelRegressionGex2Adt, ModelRegressionGex2Atac, ModelRegressionAtac2Gex, ModelRegressionAdt2Gex, ModalityMatchingDataset 32 | 33 | 34 | input_test_mod1 = ad.read_h5ad(par['input_test_mod1']) 35 | input_train_mod1 = ad.read_h5ad(par['input_train_mod1']) 36 | input_train_mod2 = ad.read_h5ad(par['input_train_mod2']) 37 | 38 | 39 | mod1 = input_train_mod1.var['feature_types'][0] 40 | mod2 = input_train_mod2.var['feature_types'][0] 41 | 42 | if mod1 == 'GEX' and mod2 == 'ADT': 43 | model = ModelRegressionGex2Adt(256,134) 44 | weight = torch.load(par['input_pretrain'] + '/model.pt', map_location='cpu') 45 | with open(par['input_pretrain'] + '/lsi_transformer.pickle', 'rb') as f: 46 | lsi_transformer_gex = pickle.load(f) 47 | 48 | 49 | model.load_state_dict(weight) 50 | input_test_mod1_ = lsi_transformer_gex.transform(input_test_mod1) 51 | 52 | elif mod1 == 'GEX' and mod2 == 'ATAC': 53 | model = ModelRegressionGex2Atac(256,10000) 54 | weight = torch.load(par['input_pretrain'] + '/model.pt', map_location='cpu') 55 | with open(par['input_pretrain'] + '/lsi_transformer.pickle', 'rb') as f: 56 | lsi_transformer_gex = pickle.load(f) 57 | 58 | 59 | model.load_state_dict(weight) 60 | input_test_mod1_ = lsi_transformer_gex.transform(input_test_mod1) 61 | 62 | elif mod1 == 'ATAC' and mod2 == 'GEX': 63 | model = ModelRegressionAtac2Gex(256,13431) 64 | weight = torch.load(par['input_pretrain'] + '/model.pt', map_location='cpu') 65 | with open(par['input_pretrain'] + '/lsi_transformer.pickle', 'rb') as f: 66 | lsi_transformer_gex = pickle.load(f) 67 | 68 | model.load_state_dict(weight) 69 | input_test_mod1_ = lsi_transformer_gex.transform(input_test_mod1) 70 | 71 | elif mod1 == 'ADT' and mod2 == 
'GEX': 72 | model = ModelRegressionAdt2Gex(134,13953) 73 | weight = torch.load(par['input_pretrain'] + '/model.pt', map_location='cpu') 74 | 75 | model.load_state_dict(weight) 76 | #input_test_mod1_ = lsi_transformer_gex.transform(input_test_mod1) 77 | input_test_mod1_ = input_test_mod1.to_df() 78 | 79 | dataset_test = ModalityMatchingDataset(input_test_mod1_, None, is_train=False) 80 | dataloader_test = DataLoader(dataset_test, 32, shuffle = False, num_workers = 4) 81 | 82 | outputs = [] 83 | model.eval() 84 | with torch.no_grad(): 85 | for x in dataloader_test: 86 | output = model(x.float()) 87 | outputs.append(output.detach().cpu().numpy()) 88 | 89 | outputs = np.concatenate(outputs) 90 | outputs[outputs<0] = 0 91 | outputs = csc_matrix(outputs) 92 | 93 | adata = ad.AnnData( 94 | X=outputs, 95 | obs=input_test_mod1.obs, 96 | var=input_train_mod2.var, 97 | uns={ 98 | 'dataset_id': input_train_mod1.uns['dataset_id'], 99 | 'method_id': meta['functionality_name'], 100 | }, 101 | ) 102 | adata.write_h5ad(par['output'], compression = "gzip") -------------------------------------------------------------------------------- /src/predict_modality/methods/novel/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash" 4 | export NXF_VER=21.04.1 5 | export PIPELINE_VERSION=1.4.0 6 | method_id=novel 7 | task_id=predict_modality 8 | 9 | 10 | # CITE ADT2GEX 11 | dataset_id=openproblems_bmmc_cite_phase1_mod2 12 | dataset_id_val=openproblems_bmmc_cite_phase2_mod2 13 | dataset_path=output/datasets_phase1/$task_id/$dataset_id/$dataset_id.censor_dataset 14 | dataset_path_val=output/datasets/$task_id/$dataset_id_val/$dataset_id_val.censor_dataset 15 | pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/ 16 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 17 | 18 | 
# Train a model and then predict with it.
# Arguments:
#   $1  path prefix of the censored dataset used for training
#   $2  path prefix of the censored dataset used for prediction
#   $3  directory the trained model is written to and then read from
#   $4  path prefix of the prediction file
# Uses the task_id and method_id variables assigned at the top of this script.
# All expansions are quoted so that paths containing spaces stay one argument.
train_and_predict() {
  local train_prefix="$1"
  local predict_prefix="$2"
  local pretrain_dir="$3"
  local pred_prefix="$4"

  "target/docker/${task_id}_methods/${method_id}_train/${method_id}_train" \
    --input_train_mod1 "${train_prefix}.output_train_mod1.h5ad" \
    --input_train_mod2 "${train_prefix}.output_train_mod2.h5ad" \
    --input_test_mod1 "${train_prefix}.output_test_mod1.h5ad" \
    --input_test_mod2 "${train_prefix}.output_test_mod2.h5ad" \
    --output_pretrain "${pretrain_dir}"

  "target/docker/${task_id}_methods/${method_id}/${method_id}" \
    --input_train_mod1 "${predict_prefix}.output_train_mod1.h5ad" \
    --input_train_mod2 "${predict_prefix}.output_train_mod2.h5ad" \
    --input_test_mod1 "${predict_prefix}.output_test_mod1.h5ad" \
    --input_pretrain "${pretrain_dir}" \
    --output "${pred_prefix}.${method_id}.output.h5ad"
}

# CITE ADT2GEX: the variables for this subtask are assigned just above.
# Training reads $dataset_path, prediction reads $dataset_path_val.
train_and_predict "$dataset_path" "$dataset_path_val" "$pretrain_path" "$pred_path"

# CITE GEX2ADT
dataset_id=openproblems_bmmc_cite_phase2_rna
dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
pred_path=output/predictions/$task_id/$dataset_id/$dataset_id

train_and_predict "$dataset_path" "$dataset_path" "$pretrain_path" "$pred_path"

# MULTIOME GEX2ATAC
dataset_id=openproblems_bmmc_multiome_phase2_rna
dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
pred_path=output/predictions/$task_id/$dataset_id/$dataset_id

train_and_predict "$dataset_path" "$dataset_path" "$pretrain_path" "$pred_path"

# MULTIOME ATAC2GEX
dataset_id=openproblems_bmmc_multiome_phase2_mod2
dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset
pretrain_path=output/pretrain/$task_id/$method_id/$dataset_id.${method_id}_train.output_pretrain/
pred_path=output/predictions/$task_id/$dataset_id/$dataset_id

train_and_predict "$dataset_path" "$dataset_path" "$pretrain_path" "$pred_path"

# RUN
EVALUATION 93 | bin/nextflow run "$PIPELINE_REPO" \ 94 | -r "$PIPELINE_VERSION" \ 95 | -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \ 96 | --solutionDir "output/datasets/$task_id" \ 97 | --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \ 98 | --publishDir "output/evaluation/$task_id/$method_id/" \ 99 | -latest \ 100 | -resume \ 101 | -c "src/resources/nextflow_moremem.config" 102 | 103 | cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json" -------------------------------------------------------------------------------- /src/predict_modality/methods/novel/train/config.vsh.yaml: -------------------------------------------------------------------------------- 1 | functionality: 2 | name: novel_train 3 | namespace: predict_modality_methods 4 | 5 | # metadata for your method 6 | 7 | description: The task is solved via training encoder-decoder MLP model with one output neuron per component in the target. As an input, the encoders use representations obtained from ATAC and GEX data via LSI transform and raw ADT data. The hyperparameters of the models were found via broad hyperparameter search using the Optuna framework. 8 | 9 | authors: 10 | - name: Gleb Ryazantsev 11 | email: ryazantsev.gleb@gmail.com 12 | roles: [ author, maintainer ] 13 | - name: Nikolay Russkikh 14 | email: russkikh.nikolay@gmail.com 15 | roles: [ author, maintainer ] 16 | - name: Igor I 17 | email: herri.i.67@gmail.com 18 | roles: [ author, maintainer ] 19 | 20 | # parameters 21 | arguments: 22 | # required inputs 23 | - name: "--input_train_mod1" 24 | type: "file" 25 | example: "dataset_mod1.h5ad" 26 | description: Censored dataset, training cells. 27 | required: true 28 | - name: "--input_train_mod2" 29 | type: "file" 30 | example: "dataset_mod2.h5ad" 31 | description: Censored dataset. 
32 | required: true 33 | - name: "--input_test_mod1" 34 | type: "file" 35 | example: "dataset_test_mod1.h5ad" 36 | description: Censored dataset, test cells. 37 | required: true 38 | - name: "--input_test_mod2" 39 | type: "file" 40 | example: "dataset_test_mod2.h5ad" 41 | description: Censored dataset. 42 | required: true 43 | 44 | # required outputs 45 | - name: "--output_pretrain" 46 | type: "file" 47 | direction: "output" 48 | example: "pretrain_model" 49 | description: Path to the directory containing a pretrained model. 50 | required: true 51 | 52 | # files your script needs 53 | resources: 54 | - type: python_script 55 | path: script.py 56 | - path: ../resources/helper_functions.py 57 | 58 | # target platforms 59 | platforms: 60 | - type: docker 61 | image: "pytorch/pytorch:1.9.0-cuda10.2-cudnn7-runtime" 62 | run_args: [ "--gpus all --shm-size=5G" ] 63 | setup: 64 | - type: python 65 | packages: 66 | - anndata 67 | - scikit-learn 68 | - networkx 69 | 70 | - type: nextflow 71 | labels: [ vhighmem, vvhightime, vhighcpu, gpu] 72 | -------------------------------------------------------------------------------- /src/predict_modality/methods/novel/train/script.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import torch 5 | from torch.utils.data import DataLoader 6 | 7 | import anndata as ad 8 | 9 | from sklearn.model_selection import train_test_split 10 | 11 | import pickle 12 | 13 | #check gpu available 14 | if (torch.cuda.is_available()): 15 | device = 'cuda:0' #switch to current device 16 | print('current device: gpu') 17 | else: 18 | device = 'cpu' 19 | print('current device: cpu') 20 | 21 | 22 | ## VIASH START 23 | dataset_path = "output/datasets/match_modality/openproblems_bmmc_cite_phase2_rna/openproblems_bmmc_cite_phase2_rna.censor_dataset.output_" 24 | pretrain_path = "output/pretrain/match_modality/clue/openproblems_bmmc_cite_phase2_rna.clue_train.output_pretrain/" 25 | 26 |
par = {
    'input_train_mod1': f'{dataset_path}train_mod1.h5ad',
    'input_train_mod2': f'{dataset_path}train_mod2.h5ad',
    'input_test_mod1': f'{dataset_path}test_mod1.h5ad',
    'input_test_mod2': f'{dataset_path}test_mod2.h5ad',
    'output_pretrain': pretrain_path
}
meta = {
    'resources_dir': '.',
    'functionality_name': '171129'
}
## VIASH END

sys.path.append(meta['resources_dir'])
from helper_functions import train_and_valid, lsiTransformer, ModalityMatchingDataset
from helper_functions import ModelRegressionAtac2Gex, ModelRegressionAdt2Gex, ModelRegressionGex2Adt, ModelRegressionGex2Atac

# Training recipe for every supported (mod1, mod2) direction: model class and
# its two constructor arguments, loader batch sizes and worker count, and the
# optimizer class with its learning rate and weight decay.
TRAIN_SETUPS = {
    ('ATAC', 'GEX'): {
        'model_cls': ModelRegressionAtac2Gex, 'dims': (256, 13431),
        'train_batch': 256, 'test_batch': 64, 'workers': 8,
        'optim_cls': torch.optim.AdamW,
        'lr': 0.00008386597445284492, 'weight_decay': 0.000684887347727808,
    },
    ('ADT', 'GEX'): {
        'model_cls': ModelRegressionAdt2Gex, 'dims': (134, 13953),
        'train_batch': 64, 'test_batch': 32, 'workers': 4,
        'optim_cls': torch.optim.Adam,
        'lr': 0.00041, 'weight_decay': 0.0000139,
    },
    ('GEX', 'ADT'): {
        'model_cls': ModelRegressionGex2Adt, 'dims': (256, 134),
        'train_batch': 32, 'test_batch': 64, 'workers': 8,
        'optim_cls': torch.optim.AdamW,
        'lr': 0.000034609210829678734, 'weight_decay': 0.0009965881574697426,
    },
    ('GEX', 'ATAC'): {
        'model_cls': ModelRegressionGex2Atac, 'dims': (256, 10000),
        'train_batch': 64, 'test_batch': 64, 'workers': 8,
        'optim_cls': torch.optim.AdamW,
        'lr': 0.00001806762345275399, 'weight_decay': 0.0004084171379280058,
    },
}

os.makedirs(par['output_pretrain'], exist_ok=True)

print("Start train")

input_train_mod1 = ad.read_h5ad(par['input_train_mod1'])
input_train_mod2 = ad.read_h5ad(par['input_train_mod2'])

# .iloc[0] reads the first row by position; plain [0] on a label-indexed
# Series relies on a positional fallback that pandas has deprecated.
mod1 = input_train_mod1.var['feature_types'].iloc[0]
mod2 = input_train_mod2.var['feature_types'].iloc[0]

# Fail early and clearly: previously an unsupported pair skipped every branch
# and only surfaced later as a NameError on `model`.
if (mod1, mod2) not in TRAIN_SETUPS:
    supported = ', '.join(f'{a}->{b}' for a, b in TRAIN_SETUPS)
    raise ValueError(
        f'Unsupported modality pair {mod1}->{mod2}; supported pairs: {supported}'
    )
setup = TRAIN_SETUPS[(mod1, mod2)]

if mod1 != "ADT":
    # Non-ADT input: fit the LSI transform on mod1 and hold out a quarter of
    # the training cells for validation (fixed seed for reproducibility).
    lsi_transformer_gex = lsiTransformer(n_components=256)
    gex_train = lsi_transformer_gex.fit_transform(input_train_mod1)

    train_mod1, test_mod1, train_mod2, test_mod2 = train_test_split(
        gex_train, input_train_mod2.to_df(), test_size=0.25, random_state=666
    )
else:
    # ADT input is used as-is; the files passed as test inputs serve as the
    # validation set.
    train_mod1 = input_train_mod1.to_df()
    train_mod2 = input_train_mod2.to_df()
    test_mod1 = ad.read_h5ad(par['input_test_mod1']).to_df()
    test_mod2 = ad.read_h5ad(par['input_test_mod2']).to_df()

dataset_train = ModalityMatchingDataset(train_mod1, train_mod2)
dataloader_train = DataLoader(
    dataset_train, setup['train_batch'], shuffle=True, num_workers=setup['workers']
)

dataset_test = ModalityMatchingDataset(test_mod1, test_mod2)
dataloader_test = DataLoader(
    dataset_test, setup['test_batch'], shuffle=False, num_workers=setup['workers']
)

model = setup['model_cls'](*setup['dims']).to(device)
optimizer = setup['optim_cls'](
    model.parameters(), lr=setup['lr'], weight_decay=setup['weight_decay']
)

loss_fn = torch.nn.MSELoss()
train_and_valid(model, optimizer, loss_fn, dataloader_train, dataloader_test, par['output_pretrain'] + '/model.pt', device)

# The fitted transform is stored next to the model so the same projection can
# be applied again when predicting.
if mod1 != "ADT":
    with open(par['output_pretrain'] + '/lsi_transformer.pickle', 'wb') as f:
        pickle.dump(lsi_transformer_gex, f)

print("End train")
""" autoencoder based models """
import torch
import torch.nn as nn


class Encoder(nn.Module):
    """Base encoder: input dropout, two hidden blocks, linear projection.

    Args:
        input_dim: number of input features.
        out_dim: size of the produced embedding.
        hidden_dim: width of both hidden layers.
        dropout: dropout probability applied to the raw input.
    """

    def __init__(self, input_dim, out_dim, hidden_dim, dropout=0.2):
        super().__init__()
        # Layer order and the attribute name are kept unchanged so that
        # previously saved state dicts still load.
        self.encoder = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_dim, out_dim),
        )

    def forward(self, x_input):
        """Forward propagation of the encoder; returns the embedding."""
        return self.encoder(x_input)


class Decoder(nn.Module):
    """Base decoder: widens from hidden_dim // 2 to hidden_dim, ReLU output.

    Args:
        input_dim: size of the incoming embedding.
        out_dim: number of reconstructed features.
        hidden_dim: width of the second hidden layer (the first uses half).
    """

    def __init__(self, input_dim, out_dim, hidden_dim):
        super().__init__()
        self.decoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim // 2),
            nn.BatchNorm1d(hidden_dim // 2),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_dim // 2, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_dim, out_dim),
            nn.ReLU(),  # final ReLU keeps the reconstruction non-negative
        )

    def forward(self, x_emb):
        """Forward propagation of the decoder; returns the reconstruction."""
        return self.decoder(x_emb)


class AutoEncoder(nn.Module):
    """Encoder followed by decoder.

    Args:
        input_dim: number of input features.
        out_dim: number of output features.
        feat_dim: size of the bottleneck embedding.
        hidden_dim: hidden width shared by encoder and decoder.
        dropout: input dropout probability of the encoder.
    """

    def __init__(self, input_dim, out_dim, feat_dim, hidden_dim, dropout=0.2):
        super().__init__()
        self.encoder = Encoder(input_dim, feat_dim, hidden_dim, dropout)
        self.decoder = Decoder(feat_dim, out_dim, hidden_dim)

    def forward(self, x_input):
        """Forward propagation of the autoencoder; returns the reconstruction."""
        return self.decoder(self.encoder(x_input))


class BatchClassifier(nn.Module):
    """Base batch classifier: one hidden block, then cls_num outputs.

    Args:
        input_dim: size of the incoming feature vector.
        cls_num: number of output classes.
        hidden_dim: width of the hidden layer.
    """

    def __init__(self, input_dim, cls_num=6, hidden_dim=50):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_dim, cls_num),
            nn.LeakyReLU(0.2),
        )

    def forward(self, x_feat):
        """Forward propagation of the batch classifier; returns class scores."""
        return self.classifier(x_feat)


class BatchRemovalGAN(nn.Module):
    """Autoencoder with a batch classifier attached to the embedding.

    Args:
        input_dim: number of input features.
        out_dim: number of output features.
        feat_dim: size of the bottleneck embedding.
        hidden_dim: hidden width shared by encoder and decoder.
        cls_num: number of classes of the batch classifier.
        dropout: input dropout probability of the encoder.
    """

    def __init__(self, input_dim, out_dim, feat_dim, hidden_dim, cls_num=10, dropout=0.2):
        super().__init__()
        self.encoder = Encoder(input_dim, feat_dim, hidden_dim, dropout)
        self.decoder = Decoder(feat_dim, out_dim, hidden_dim)
        self.classifier = BatchClassifier(feat_dim, cls_num=cls_num)

    def forward(self, x_input):
        """Forward propagation; returns (reconstruction, class scores)."""
        x_feat = self.encoder(x_input)
        x_rec = self.decoder(x_feat)
        cls_prob = self.classifier(x_feat)

        return x_rec, cls_prob


if __name__ == "__main__":
    # Smoke test. Falls back to the CPU when CUDA is unavailable instead of
    # crashing on the previously unconditional .cuda() calls.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    bsz = 5
    in_d = 10
    out_d = 3
    feat_d = 2
    hid_d = 10

    x1 = torch.randn(bsz, in_d).to(device)

    model = AutoEncoder(in_d, out_d, feat_d, hid_d).to(device).float()
    print(model)
    output = model(x1)
    print(output.shape)
""" save highly variable gene indices using the scanpy package """
import os
import argparse
import numpy as np
import anndata as ad
import scanpy as sc
import pandas as pd

parser = argparse.ArgumentParser()
parser.add_argument(
    "-d",
    "--data_dir",
    type=str,
    default="output/datasets/predict_modality",
    help="path to dataset directory",
)
parser.add_argument(
    "-o",
    "--output_dir",
    type=str,
    default="output/pretrain/predict_modality/scjoint",
    help="path to output directory",
)
parser.add_argument(
    "-p",
    "--phase",
    type=str,
    default="phase2",
    choices=["phase1", "phase1v2", "phase2"],
    help="dataset phase",
)

parser.add_argument(
    "-m",
    "--mode",
    nargs="*",
    type=str,
    default=["atac2gex"],
    # The previous help text was copied from the idf script.
    help="modes for saving highly variable gene indices",
)

parser.add_argument("-n", "--n_top", type=int, default=10000, help="returns n top highly variable genes")
args = parser.parse_args()


def _mode_files(prefix, mode_name):
    """Return [train_mod1, train_mod2, test_mod1, test_mod2, pretrain_dir] for one mode."""
    return [
        f"{prefix}.output_train_mod1.h5ad",
        f"{prefix}.output_train_mod2.h5ad",
        f"{prefix}.output_test_mod1.h5ad",
        f"{prefix}.output_test_mod2.h5ad",
        f"{args.output_dir}/{mode_name}_train.output_pretrain",
    ]


def _positions_in_reference(reference_names, selected_names):
    """Map each selected name to its position in reference_names.

    Builds one lookup table instead of scanning the reference once per name.
    If a name occurs more than once in the reference, its first position wins.

    Raises:
        KeyError: if a selected name does not occur in reference_names.
    """
    position_of = {}
    for pos, name in enumerate(reference_names):
        position_of.setdefault(name, pos)

    missing = [name for name in selected_names if name not in position_of]
    if missing:
        raise KeyError(
            f"{len(missing)} selected feature(s) not present in the training set, "
            f"e.g. {missing[0]!r}"
        )
    return [position_of[name] for name in selected_names]


# dataset path
ADT2GEX_ID = f"openproblems_bmmc_cite_{args.phase}_mod2"
GEX2ADT_ID = f"openproblems_bmmc_cite_{args.phase}_rna"
ATAC2GEX_ID = f"openproblems_bmmc_multiome_{args.phase}_mod2"
GEX2ATAC_ID = f"openproblems_bmmc_multiome_{args.phase}_rna"

# path to different modes
ADT2GEX_PTH = f"{args.data_dir}/{ADT2GEX_ID}/{ADT2GEX_ID}.censor_dataset"
GEX2ADT_PTH = f"{args.data_dir}/{GEX2ADT_ID}/{GEX2ADT_ID}.censor_dataset"
ATAC2GEX_PTH = f"{args.data_dir}/{ATAC2GEX_ID}/{ATAC2GEX_ID}.censor_dataset"
GEX2ATAC_PTH = f"{args.data_dir}/{GEX2ATAC_ID}/{GEX2ATAC_ID}.censor_dataset"

ADT2GEX = _mode_files(ADT2GEX_PTH, "adt2gex")
GEX2ADT = _mode_files(GEX2ADT_PTH, "gex2adt")
ATAC2GEX = _mode_files(ATAC2GEX_PTH, "atac2gex")
GEX2ATAC = _mode_files(GEX2ATAC_PTH, "gex2atac")

MODES = {"adt2gex": ADT2GEX, "gex2adt": GEX2ADT, "atac2gex": ATAC2GEX, "gex2atac": GEX2ATAC}


if __name__ == "__main__":
    # desired data path
    DATAPTH = [MODES[i] for i in args.mode]

    for (i, mode) in enumerate(DATAPTH):
        print(f"MODE [{i + 1} / {len(DATAPTH)}]: {args.mode[i]}")
        train_mod1_pth = mode[0]
        test_mod1_pth = mode[2]
        train_mod1 = sc.read_h5ad(train_mod1_pth)
        test_mod1 = sc.read_h5ad(test_mod1_pth)
        n_train = train_mod1.X.shape[0]

        # concat train/test sets
        X_raw = sc.concat(
            {"train": train_mod1, "test": test_mod1},
            axis=0,
            join="outer",
            label="group",
            fill_value=0,
            index_unique="-",
        )
        print(X_raw.shape)

        # collect highly variable genes
        sc.pp.highly_variable_genes(X_raw, n_top_genes=args.n_top)
        X_raw = X_raw[:, X_raw.var.highly_variable]

        # The train rows come first in the concatenation, then the test rows.
        train_highly = X_raw[:n_train, :]
        train_highly = ad.AnnData(
            X=train_highly.X,
            obs=train_highly.obs,
            var=pd.DataFrame({"feature_types": train_mod1.var["feature_types"][X_raw.var_names]}),
            uns=train_highly.uns,
            layers=train_highly.layers,
        )

        test_highly = X_raw[n_train:, :]
        test_highly = ad.AnnData(
            X=test_highly.X,
            obs=test_highly.obs,
            var=pd.DataFrame({"feature_types": test_mod1.var["feature_types"][X_raw.var_names]}),
            uns=test_highly.uns,
            layers=test_highly.layers,
        )
        print(train_highly)
        print(test_highly)

        # save highly variable indexes (positions within the training features)
        mod1_highly_idx = _positions_in_reference(train_mod1.var_names, X_raw.var_names)

        file_path = mode[4]
        os.makedirs(file_path, exist_ok=True)

        with open(f"{file_path}/index_highly{args.n_top}.txt", "w", encoding="utf8") as index_file:
            index_file.write(f"index num: {len(mod1_highly_idx)}\n")
            for ind in mod1_highly_idx:
                index_file.write(str(ind) + "\n")

        print(f"finish saving {file_path}/index_highly{args.n_top}.txt")
""" this script saves idf matrices computed from the dataset """
import os
import argparse
import numpy as np
import anndata as ad
from scipy import sparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "-d",
    "--data_dir",
    type=str,
    default="output/datasets/predict_modality",
    help="path to dataset directory",
)
parser.add_argument(
    "-o",
    "--output_dir",
    type=str,
    default="output/pretrain/predict_modality/scjoint",
    help="path to output directory",
)
parser.add_argument(
    "-p",
    "--phase",
    default="phase2",
    type=str,
    choices=["phase1", "phase1v2", "phase2"],
    help="dataset phase",
)

parser.add_argument(
    "-m",
    "--mode",
    type=str,
    nargs="*",
    default=["adt2gex", "gex2adt", "atac2gex", "gex2atac"],
    help="modes for generating idf matrix",
)
args = parser.parse_args()


def _mode_files(prefix, mode_name):
    """Return [train_mod1, train_mod2, test_mod1, test_mod2, pretrain_dir] for one mode."""
    return [
        f"{prefix}.output_train_mod1.h5ad",
        f"{prefix}.output_train_mod2.h5ad",
        f"{prefix}.output_test_mod1.h5ad",
        f"{prefix}.output_test_mod2.h5ad",
        f"{args.output_dir}/{mode_name}_train.output_pretrain",
    ]


# dataset path
ADT2GEX_ID = f"openproblems_bmmc_cite_{args.phase}_mod2"
GEX2ADT_ID = f"openproblems_bmmc_cite_{args.phase}_rna"
ATAC2GEX_ID = f"openproblems_bmmc_multiome_{args.phase}_mod2"
GEX2ATAC_ID = f"openproblems_bmmc_multiome_{args.phase}_rna"

# path to different modes
ADT2GEX_PTH = f"{args.data_dir}/{ADT2GEX_ID}/{ADT2GEX_ID}.censor_dataset"
GEX2ADT_PTH = f"{args.data_dir}/{GEX2ADT_ID}/{GEX2ADT_ID}.censor_dataset"
ATAC2GEX_PTH = f"{args.data_dir}/{ATAC2GEX_ID}/{ATAC2GEX_ID}.censor_dataset"
GEX2ATAC_PTH = f"{args.data_dir}/{GEX2ATAC_ID}/{GEX2ATAC_ID}.censor_dataset"

ADT2GEX = _mode_files(ADT2GEX_PTH, "adt2gex")
GEX2ADT = _mode_files(GEX2ADT_PTH, "gex2adt")
ATAC2GEX = _mode_files(ATAC2GEX_PTH, "atac2gex")
GEX2ATAC = _mode_files(GEX2ATAC_PTH, "gex2atac")

MODES = {"adt2gex": ADT2GEX, "gex2adt": GEX2ADT, "atac2gex": ATAC2GEX, "gex2atac": GEX2ATAC}


def idf_matrix(x_raw):
    """Return the inverse document frequency of every column of x_raw.

    For each column the number of rows with a value > 0 is counted and
    log(n_rows / (count + 1)) is returned.

    Args:
        x_raw: 2-D count matrix, either a numpy array or a scipy sparse
            matrix. Sparse input is counted directly, without densifying.

    Returns:
        float32 numpy array of shape (1, n_columns).
    """
    n_rows = x_raw.shape[0]
    if sparse.issparse(x_raw):
        # Summing a sparse boolean mask may return a matrix or a 1-D array
        # depending on the sparse class; normalise to a (1, n) ndarray.
        doc_freq = np.asarray((x_raw > 0).sum(axis=0)).reshape(1, -1)
    else:
        doc_freq = (np.asarray(x_raw) > 0).sum(axis=0, keepdims=True)

    # Single precision keeps the saved file identical in dtype to before.
    doc_freq = doc_freq.astype(np.single)
    return np.log(n_rows / (doc_freq + 1))


if __name__ == "__main__":
    # desired data path
    DATAPTH = [MODES[i] for i in args.mode]
    for (i, mode) in enumerate(DATAPTH):
        print(f"MODE [{i + 1} / {len(DATAPTH)}]: {args.mode[i]}")

        train_mod1_pth = mode[0]
        train_mod1 = ad.read_h5ad(train_mod1_pth)

        # Passed through as stored: idf_matrix only needs to know which
        # entries are > 0, so no dense float16 copy of the counts is made.
        x_raw_matrix = train_mod1.layers["counts"]
        print(f"train data shape: {x_raw_matrix.shape}")

        x_idf_matrix = idf_matrix(x_raw_matrix)
        print(f"idf matrix shape: {x_idf_matrix.shape}")

        file_path = mode[4]
        print(f"output dir: {file_path}")
        os.makedirs(file_path, exist_ok=True)

        np.save(f"{file_path}/mod1_idf.npy", x_idf_matrix)
        print(f"finish saving {file_path}/mod1_idf.npy")
""" main training process """
import os
import logging
import argparse
from datetime import datetime

from trainer.trainer_nn import TrainProcess as TrainProcess_NN
from trainer.trainer_cycle import TrainProcess as TrainProcess_Cycle
from trainer.trainer_batchgan import TrainProcess as TrainProcess_BATCHGAN

from opts import DATASET, model_opts
from utils.dataloader import get_data_dim

# NOTE(review): the block nesting below was reconstructed from a listing whose
# indentation was lost — confirm against the upstream file before merging.

# Suffix appended to the experiment name for each supported --tfidf value.
TFIDF_SUFFIX = {1: "_tfidf", 2: "_tfidfconcat", 3: "_tfidfconcatga"}

# Trainer class for each supported --arch value.
TRAINERS = {
    "nn": TrainProcess_NN,
    "cycle": TrainProcess_Cycle,
    "batchgan": TrainProcess_BATCHGAN,
}


def _assert_gene_activity_supported(args):
    """Gene activity features require atac2gex mode on phase1v2 or phase2."""
    assert args.mode == "atac2gex" and args.phase in [
        "phase1v2",
        "phase2",
    ], "gene activity mode support only atac2gex mode (p1v2 or p2)"


if __name__ == "__main__":
    # config parser
    parser = argparse.ArgumentParser(add_help=False)
    model_opts(parser)
    args = parser.parse_known_args()[0]

    # exp name for train log, weights, model
    if args.train == "train":
        TIME_NOW = datetime.now().strftime("%b%d-%H-%M")
        exp_name = f"{args.arch}_{args.mode}"
        if args.selection:
            assert args.mod1_idx_path is not None, "need to specified --mod1_idx_path"
            SELECT_NUM = args.mod1_idx_path.split("/")[-1].replace(".txt", "").replace("index_", "")
            exp_name += f"_select{SELECT_NUM}"

        if args.tfidf != 0:
            assert args.idf_path is not None, "need to specified --idf_path"
            assert not args.gene_activity, "support either ga or tfidf != 0"
            # Values without an entry add no suffix, as before.
            exp_name += TFIDF_SUFFIX.get(args.tfidf, "")
            if args.tfidf == 3:
                _assert_gene_activity_supported(args)
        elif args.gene_activity:
            exp_name += "_ga"
            _assert_gene_activity_supported(args)
        if args.norm:
            exp_name += "_norm"
        if args.dropout != 0.2:
            exp_name += f"_dropout{args.dropout}"
        if args.name != "":
            exp_name += f"_{args.name}"
        else:
            # Unnamed runs are told apart by their start time.
            exp_name += f"_{TIME_NOW}"

    # exp name for eval log file
    elif args.train == "eval":
        assert args.checkpoint is not None, "need to specified --checkpoint"
        exp_name = args.checkpoint.split("/")[-1].replace(".pt", "")
        exp_name += f"_{args.phase}"

    else:
        # Previously this fell through and failed later with a NameError on
        # exp_name.
        raise ValueError(f"unsupported --train value: {args.train!r}")

    # loggings and logs
    if args.dryrun:
        handlers = [logging.StreamHandler()]
    else:
        os.makedirs(f"{args.output_dir}/logs/", exist_ok=True)
        os.makedirs(f"{DATASET[args.mode]['weight_dir']}", exist_ok=True)
        handlers = [
            logging.FileHandler(f"{args.output_dir}/logs/{args.train}_{exp_name}.log", mode="w"),
            logging.StreamHandler(),
        ]

    logging.basicConfig(level=logging.DEBUG, format="%(message)s", handlers=handlers)

    # load data
    MOD1_DIM, MOD2_DIM = get_data_dim(DATASET[args.mode], args)

    # The derived values are registered as defaults and the command line is
    # parsed again so they appear on args next to the user-supplied options.
    parser.add_argument("--mod1_dim", default=MOD1_DIM)
    parser.add_argument("--mod2_dim", default=MOD2_DIM)
    parser.add_argument("--exp_name", default=exp_name)
    args = parser.parse_args()

    logging.info("\nArgument:")
    for arg, value in vars(args).items():
        logging.info(f"{arg:20s}: {value}")
    logging.info("\n")

    # trainer
    if args.arch not in TRAINERS:
        # Previously an unknown value failed later with a NameError on trainer.
        raise ValueError(f"unsupported --arch value: {args.arch!r}")
    trainer = TRAINERS[args.arch](args)

    if args.train == "train":
        trainer.run()
        trainer.eval()

    elif args.train == "eval":
        trainer.load_checkpoint()
        trainer.eval()
torch.norm(arr_2, dim=1, keepdim=True) 10 | sim = torch.matmul(arr_1, torch.transpose(arr_2, 0, 1)) 11 | 12 | return sim 13 | 14 | 15 | class CosineLoss(nn.Module): 16 | """ custom loss for mean cosine similarity """ 17 | def __init__(self): 18 | super(CosineLoss, self).__init__() 19 | 20 | def forward(self, emb1, emb2, emb1_resid, emb2_resid): 21 | """ define cosine loss """ 22 | emb1, emb2 = emb1.float(), emb2.float() 23 | cosine_loss = torch.mean( 24 | torch.abs(cosine_sim(emb1, emb1_resid) + cosine_sim(emb2, emb2_resid)) 25 | ) 26 | return cosine_loss 27 | 28 | 29 | class L1regularization(nn.Module): 30 | """ l1 regularization loss for model """ 31 | def __init__(self, weight_decay=0.1): 32 | super(L1regularization, self).__init__() 33 | self.weight_decay = weight_decay 34 | 35 | def forward(self, model): 36 | """ define l1 reg loss """ 37 | regularization_loss = 0.0 38 | for param in model.parameters(): 39 | regularization_loss += torch.mean(abs(param)) * self.weight_decay 40 | 41 | return regularization_loss 42 | -------------------------------------------------------------------------------- /src/predict_modality/methods/scJoint/resources/utils/metric.py: -------------------------------------------------------------------------------- 1 | """ calculate metrics """ 2 | 3 | import numpy as np 4 | 5 | def rmse(mod2_sol, mod2_pred): 6 | """ 7 | input: prediction / ans 8 | output: rmse 9 | """ 10 | tmp = mod2_sol - mod2_pred 11 | rmse_out = np.sqrt(tmp.power(2).mean()) 12 | return rmse_out 13 | -------------------------------------------------------------------------------- /src/predict_modality/methods/scJoint/run/config.vsh.yaml: -------------------------------------------------------------------------------- 1 | functionality: 2 | name: scjoint 3 | namespace: predict_modality_methods 4 | 5 | description: An ensemble method including pca, nn, feature extraction. 
6 | 7 | info: 8 | method_label: "scJoint" 9 | submission_id: "171135" 10 | team_name: scJoint 11 | 12 | authors: 13 | - name: Yu-Hsiu Chen 14 | email: yhchen.cm06g@nctu.edu.tw 15 | roles: [ author, maintainer ] 16 | props: { github: itscassie } 17 | - name: Sheng Wan 18 | email: a5736735a.eecs99@g2.nctu.edu.tw 19 | - name: Tung-Yu Wu 20 | email: wtywty@gmail.com 21 | 22 | # parameters 23 | arguments: 24 | # required inputs 25 | - name: "--input_train_mod1" 26 | type: "file" 27 | example: "dataset_mod1.h5ad" 28 | description: Censored dataset, training cells. 29 | required: true 30 | - name: "--input_test_mod1" 31 | type: "file" 32 | example: "dataset_mod1.h5ad" 33 | description: Censored dataset, test cells. 34 | required: true 35 | - name: "--input_train_mod2" 36 | type: "file" 37 | example: "dataset_mod2.h5ad" 38 | description: Censored dataset. 39 | required: true 40 | - name: "--input_pretrain" 41 | type: "file" 42 | example: "pretrain_model" 43 | description: Path to the directory containing a pretrained model. 44 | required: true 45 | # required outputs 46 | - name: "--output" 47 | type: "file" 48 | direction: "output" 49 | example: "output.h5ad" 50 | description: Dataset with predicted values for modality2. 
51 | required: true 52 | 53 | # files your script needs 54 | resources: 55 | - type: python_script 56 | path: script.py 57 | # includes all code under resources/ 58 | - path: ../resources 59 | 60 | # target platforms 61 | platforms: 62 | - type: docker 63 | image: "pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime" 64 | run_args: [ "--gpus all" ] 65 | setup: 66 | - type: python 67 | packages: 68 | - scikit-learn 69 | - anndata 70 | - scanpy 71 | - numpy 72 | 73 | - type: nextflow 74 | labels: [ highmem, hightime, midcpu, gpu ] 75 | -------------------------------------------------------------------------------- /src/predict_modality/methods/scJoint/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export PIPELINE_REPO="openproblems-bio/neurips2021_multimodal_viash" 4 | export NXF_VER=21.04.1 5 | export PIPELINE_VERSION=1.4.0 6 | method_id=scjoint 7 | task_id=predict_modality 8 | pretrain_path=output/pretrain/$task_id/$method_id 9 | 10 | # GENERATE PRETRAIN 11 | echo "" 12 | echo "######################################################################" 13 | echo "## Generating pretrain weights/files ##" 14 | echo "######################################################################" 15 | 16 | target/docker/${task_id}_methods/${method_id}_train/${method_id}_train \ 17 | --data_dir output/datasets/$task_id \ 18 | --output_pretrain ${pretrain_path} 19 | 20 | echo "" 21 | echo "######################################################################" 22 | echo "## Generating prediction files ##" 23 | echo "######################################################################" 24 | 25 | # CITE GEX2ADT 26 | echo "" 27 | echo "CITE GEX to ADT" 28 | dataset_id=openproblems_bmmc_cite_phase2_rna 29 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 30 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 31 | 32 | target/docker/${task_id}_methods/${method_id}/${method_id} \ 33 | 
--input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 34 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 35 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \ 36 | --input_pretrain "${pretrain_path}/gex2adt_train.output_pretrain/" \ 37 | --output ${pred_path}.${method_id}.output.h5ad 38 | 39 | # CITE ADT2GEX 40 | echo "" 41 | echo "CITE ADT to GEX" 42 | dataset_id=openproblems_bmmc_cite_phase2_mod2 43 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 44 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 45 | 46 | target/docker/${task_id}_methods/${method_id}/${method_id} \ 47 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 48 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 49 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \ 50 | --input_pretrain "${pretrain_path}/adt2gex_train.output_pretrain/" \ 51 | --output ${pred_path}.${method_id}.output.h5ad 52 | 53 | # MULTIOME GEX2ATAC 54 | echo "" 55 | echo "MULTIOME GEX to ATAC" 56 | dataset_id=openproblems_bmmc_multiome_phase2_rna 57 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 58 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 59 | 60 | target/docker/${task_id}_methods/${method_id}/${method_id} \ 61 | --input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 62 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 63 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \ 64 | --input_pretrain "${pretrain_path}/gex2atac_train.output_pretrain/" \ 65 | --output ${pred_path}.${method_id}.output.h5ad 66 | 67 | # MULTIOME ATAC2GEX 68 | echo "" 69 | echo "MULTIOME ATAC to GEX" 70 | dataset_id=openproblems_bmmc_multiome_phase2_mod2 71 | dataset_path=output/datasets/$task_id/$dataset_id/$dataset_id.censor_dataset 72 | pred_path=output/predictions/$task_id/$dataset_id/$dataset_id 73 | 74 | target/docker/${task_id}_methods/${method_id}/${method_id} \ 75 | 
--input_train_mod1 ${dataset_path}.output_train_mod1.h5ad \ 76 | --input_train_mod2 ${dataset_path}.output_train_mod2.h5ad \ 77 | --input_test_mod1 ${dataset_path}.output_test_mod1.h5ad \ 78 | --input_pretrain "${pretrain_path}/atac2gex_train.output_pretrain/" \ 79 | --output ${pred_path}.${method_id}.output.h5ad 80 | 81 | # RUN EVALUATION 82 | echo "" 83 | echo "######################################################################" 84 | echo "## Evaluating predictions ##" 85 | echo "######################################################################" 86 | bin/nextflow run "$PIPELINE_REPO" \ 87 | -r "$PIPELINE_VERSION" \ 88 | -main-script "src/$task_id/workflows/evaluate_submission/main.nf" \ 89 | --solutionDir "output/datasets/$task_id" \ 90 | --predictions "output/predictions/$task_id/**.${method_id}.output.h5ad" \ 91 | --publishDir "output/evaluation/$task_id/$method_id/" \ 92 | -latest \ 93 | -resume \ 94 | -c "src/resources/nextflow_moremem.config" 95 | 96 | echo "" 97 | echo "######################################################################" 98 | echo "## Evaluation summary ##" 99 | echo "######################################################################" 100 | cat "output/evaluation/$task_id/$method_id/output.final_scores.output_json.json" -------------------------------------------------------------------------------- /src/predict_modality/methods/scJoint/train/config.vsh.yaml: -------------------------------------------------------------------------------- 1 | functionality: 2 | name: scjoint_train 3 | namespace: predict_modality_methods 4 | 5 | # metadata for your method 6 | version: dev 7 | description: An ensemble method including pca, nn, feature extraction. 
8 | authors: 9 | - name: Yu-Hsiu Chen 10 | email: yhchen.cm06g@nctu.edu.tw 11 | roles: [ author, maintainer ] 12 | - name: Sheng Wan 13 | email: a5736735a.eecs99@g2.nctu.edu.tw 14 | - name: Tung-Yu Wu 15 | email: wtywty@gmail.com 16 | 17 | # parameters 18 | arguments: 19 | # required inputs 20 | - name: "--data_dir" 21 | type: "file" 22 | description: The path to the predict_modality datasets 23 | required: true 24 | 25 | # required outputs 26 | - name: "--output_pretrain" 27 | type: "file" 28 | direction: "output" 29 | example: "pretrain_model" 30 | description: Path to the directory containing the pretrained models. 31 | required: true 32 | 33 | # files your script needs 34 | resources: 35 | - type: bash_script 36 | path: train.sh 37 | # includes all code under resources/ 38 | - path: ../resources 39 | 40 | # target platforms 41 | platforms: 42 | 43 | - type: docker 44 | image: "pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime" 45 | run_args: [ "--gpus all" ] 46 | setup: 47 | - type: python 48 | packages: 49 | - scikit-learn 50 | - anndata 51 | - scanpy 52 | - numpy 53 | 54 | - type: nextflow 55 | labels: [ highmem, hightime, midcpu, gpu ] 56 | -------------------------------------------------------------------------------- /src/resources/nextflow.config: -------------------------------------------------------------------------------- 1 | includeConfig "${launchDir}/target/nextflow/nextflow.config" 2 | 3 | process { 4 | withLabel: lowcpu { cpus = 4 } 5 | withLabel: midcpu { cpus = 4 } 6 | withLabel: highcpu { cpus = 15 } 7 | withLabel: vhighcpu { cpus = 30 } 8 | withLabel: lowmem { memory = 60.GB } 9 | withLabel: midmem { memory = 60.GB } 10 | withLabel: highmem { memory = 110.GB } 11 | withLabel: vhighmem { memory = 110.GB } 12 | withLabel: lowtime { time = "20m" } 13 | withLabel: midtime { time = "40m" } 14 | withLabel: hightime { time = "60m" } 15 | withLabel: vhightime { time = "120m" } 16 | withLabel: vvhightime { time = "360m" } 17 | withLabel: gpu { maxForks 
= 1; containerOptions = '--gpus all' } 18 | } 19 | 20 | def viash_temp = System.getenv("VIASH_TEMP") ?: "/tmp/" 21 | docker.runOptions = "-v ${launchDir}/target/nextflow:${launchDir}/target/nextflow -v $viash_temp:$viash_temp --shm-size=5G --net none" 22 | -------------------------------------------------------------------------------- /src/resources/nextflow_moremem.config: -------------------------------------------------------------------------------- 1 | process { 2 | withLabel: lowcpu { cpus = 4 } 3 | withLabel: midcpu { cpus = 4 } 4 | withLabel: highcpu { cpus = 15 } 5 | withLabel: vhighcpu { cpus = 30 } 6 | withLabel: lowmem { memory = 60.GB } 7 | withLabel: midmem { memory = 60.GB } 8 | withLabel: highmem { memory = 110.GB } 9 | withLabel: vhighmem { memory = 110.GB } 10 | withLabel: lowtime { time = "20m" } 11 | withLabel: midtime { time = "40m" } 12 | withLabel: hightime { time = "60m" } 13 | withLabel: vhightime { time = "120m" } 14 | withLabel: vvhightime { time = "360m" } 15 | withLabel: gpu { maxForks = 1; containerOptions = '--gpus all' } 16 | } 17 | -------------------------------------------------------------------------------- /src/sync_datasets.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | function aws_s3 { 4 | CMD="$1" 5 | SOURCE="$2" 6 | DEST="$3" 7 | # use aws cli if installed 8 | if command -v aws &> /dev/null; then 9 | aws s3 "$CMD" --no-sign-request "$SOURCE" "$DEST" 10 | # else use aws docker container instead 11 | else 12 | docker run \ 13 | --user $(id -u):$(id -g) \ 14 | --rm -it \ 15 | -v $(pwd)/output:/output \ 16 | -w / \ 17 | amazon/aws-cli \ 18 | s3 "$CMD" --no-sign-request "$SOURCE" "$DEST" 19 | fi 20 | } 21 | 22 | aws_s3 sync "s3://openproblems-bio/public/phase1-data/" "output/datasets_phase1" 23 | aws_s3 sync "s3://openproblems-bio/public/phase1v2-data/" "output/datasets_phase1v2" 24 | # aws_s3 sync "s3://openproblems-bio/public/phase2-data/" 
"output/datasets_phase2_public" 25 | aws_s3 sync "s3://openproblems-bio/public/phase2-data/joint_embedding/" "output/datasets_phase2_public/joint_embedding" 26 | aws_s3 sync "s3://openproblems-bio/public/phase2-private-data/" "output/datasets" 27 | aws_s3 sync "s3://openproblems-bio/public/explore/" "output/datasets_explore" --------------------------------------------------------------------------------