├── .gitignore ├── slides.pdf ├── files ├── domains.png ├── profiling.png ├── application.png └── cell_painting.png ├── LICENSE ├── cytodata-toolkit ├── python │ └── cytodata.py ├── datasets.csv └── R │ ├── Create-Submission_R.Rmd │ ├── Create-Submission_R-day2.ipynb │ └── Create-Submission_R-day2-CP.ipynb └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | cytodata-toolkit/R/.ipynb_checkpoints/ 2 | -------------------------------------------------------------------------------- /slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cytodata/cytodata-hackathon-2018/HEAD/slides.pdf -------------------------------------------------------------------------------- /files/domains.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cytodata/cytodata-hackathon-2018/HEAD/files/domains.png -------------------------------------------------------------------------------- /files/profiling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cytodata/cytodata-hackathon-2018/HEAD/files/profiling.png -------------------------------------------------------------------------------- /files/application.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cytodata/cytodata-hackathon-2018/HEAD/files/application.png -------------------------------------------------------------------------------- /files/cell_painting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cytodata/cytodata-hackathon-2018/HEAD/files/cell_painting.png -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 cytodata 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
def load_dataset(dataset_id, partition, features):
    """Load and concatenate every profile CSV registered for a dataset.

    The module-level ``dataset_files`` index (columns ``Dataset``,
    ``Partition``, ``Features`` and ``Link``) is filtered on the three
    arguments; each matching ``Link`` is downloaded, parsed as UTF-8 CSV
    and the resulting frames are stacked row-wise.

    Parameters
    ----------
    dataset_id: string
        Dataset ID, matched against the ``Dataset`` column
    partition: string
        Partition can be "Train" or "Test"
    features: string
        Feature type can be "CellProfiler" or "DeepLearning"

    Returns
    -------
    dataframe or None
        All rows read from the matching links, re-indexed from 0, or
        ``None`` (after printing a message) when no index row matches.

    """
    # ``import urllib`` by itself does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    selected = dataset_files[
        (dataset_files["Dataset"] == dataset_id)
        & (dataset_files["Partition"] == partition)
        & (dataset_files["Features"] == features)
    ]

    if selected.empty:
        print("No such partition {} for dataset {} with features {}".format(partition, dataset_id, features))
        return None

    dataframes = []

    # Passing ``total`` lets the progress bar show a fraction instead of a
    # bare counter.
    for link in tqdm(selected["Link"], total=len(selected)):
        # Index entries may carry stray surrounding blanks; strip them so
        # the URL is well-formed. The ``with`` block closes the connection
        # as soon as the payload has been read.
        with urllib.request.urlopen(str(link).strip()) as response:
            data = response.read()
        dataframes.append(pd.read_csv(io.StringIO(data.decode("utf-8"))))

    return pd.concat(dataframes, ignore_index=True)
BBBC037,Train,CellProfiler,https://s3.amazonaws.com/cytodata/evaluation/TA-ORF-BBBC037-Rohban/profiles_cp/bbbc037_train.csv 4 | BBBC043,Test,CellProfiler,https://s3.amazonaws.com/cytodata/evaluation/LUAD-BBBC043-Caicedo/profiles_cp/bbbc043_test.csv 5 | BBBC043,Train,CellProfiler,https://s3.amazonaws.com/cytodata/evaluation/LUAD-BBBC043-Caicedo/profiles_cp/bbbc043_train.csv 6 | BBBC037,Test,DeepLearning,https://s3.amazonaws.com/cytodata/evaluation/TA-ORF-BBBC037-Rohban/profiles_dp/bbbc037_test.csv 7 | BBBC037,Train,DeepLearning,https://s3.amazonaws.com/cytodata/evaluation/TA-ORF-BBBC037-Rohban/profiles_dp/bbbc037_train.csv 8 | BBBC043,Test,DeepLearning,https://s3.amazonaws.com/cytodata/evaluation/LUAD-BBBC043-Caicedo/profiles_dp/bbbc043_test.csv 9 | BBBC043,Train,DeepLearning,https://s3.amazonaws.com/cytodata/evaluation/LUAD-BBBC043-Caicedo/profiles_dp/bbbc043_train.csv 10 | BBBC036,Test,DeepLearning,https://s3.amazonaws.com/cytodata/evaluation/CDRPBIO-BBBC036-Bray/profiles_dp/bbbc036_test.csv 11 | BBBC036,Train,DeepLearning,https://s3.amazonaws.com/cytodata/evaluation/CDRPBIO-BBBC036-Bray/profiles_dp/bbbc036_train.csv 12 | BBBC036,Test,CellProfiler, https://s3.amazonaws.com/cytodata/evaluation/CDRPBIO-BBBC036-Bray/profiles_cp/bbbc036_test.csv 13 | BBBC036,Train,CellProfiler,https://s3.amazonaws.com/cytodata/evaluation/CDRPBIO-BBBC036-Bray/profiles_cp/bbbc036_train.csv 14 | BBBC022,Test,CellProfiler,https://s3.amazonaws.com/cytodata/evaluation/Bioactives-BBBC022-Gustafsdottir/profiles_cp/bbbc022_test.csv 15 | BBBC022,Train,CellProfiler,https://s3.amazonaws.com/cytodata/evaluation/Bioactives-BBBC022-Gustafsdottir/profiles_cp/bbbc022_train.csv 16 | BBBC022,Test,DeepLearning,https://s3.amazonaws.com/cytodata/evaluation/Bioactives-BBBC022-Gustafsdottir/profiles_dp/bbbc022_test.csv 17 | BBBC022,Train,DeepLearning,https://s3.amazonaws.com/cytodata/evaluation/Bioactives-BBBC022-Gustafsdottir/profiles_dp/bbbc022_train.csv 
-------------------------------------------------------------------------------- /cytodata-toolkit/R/Create-Submission_R.Rmd: -------------------------------------------------------------------------------- 1 | ```{r, message=FALSE} 2 | library(tidyverse) 3 | library(cytominer) 4 | library(magrittr) 5 | library(RCurl) 6 | ``` 7 | 8 | ```{r, message=FALSE} 9 | load_dataset <- function(partition, dataset,feature){ 10 | file_name <- read_csv("../datasets.csv") 11 | x <- file_name %>% filter( 12 | Partition == partition, 13 | Dataset == dataset, 14 | Features == feature) %>% 15 | extract2("Link") 16 | 17 | return(read_csv(x) %>% 18 | mutate(Metadata_dataset = dataset) %>% 19 | mutate(Metadata_partition = partition) %>% 20 | mutate(Metadata_features = feature) 21 | ) 22 | } 23 | ``` 24 | 25 | # Load data 26 | We load training and test datasets for both genetic perturbation experiments 27 | 28 | ```{r, message=FALSE} 29 | # bbbc37 data 30 | bbbc037_train <- load_dataset("Train","BBBC037","CellProfiler") %>% 31 | mutate(Metadata_x_mutation_status = "none") %>% 32 | filter(str_detect(Metadata_pert_name, "WT") | Metadata_ASSAY_WELL_ROLE %in% c("Untreated", "CTRL")) 33 | 34 | bbbc037_test <- load_dataset("Test","BBBC037","CellProfiler") %>% 35 | mutate(Metadata_x_mutation_status = "none") %>% 36 | filter(str_detect(Metadata_pert_name, "WT") | Metadata_ASSAY_WELL_ROLE %in% c("Untreated", "CTRL")) 37 | 38 | bbbc037 <- 39 | bind_rows(bbbc037_train, bbbc037_test) 40 | ``` 41 | 42 | ```{r, message=FALSE} 43 | # bbbc043 data 44 | bbbc043_train <- load_dataset("Train","BBBC043","CellProfiler") 45 | 46 | bbbc043_test <- load_dataset("Test","BBBC043","CellProfiler") 47 | 48 | bbbc043 <- bind_rows(bbbc043_train, bbbc043_test) 49 | ``` 50 | 51 | ## Check dimensionality 52 | 53 | ```{r} 54 | dim(bbbc043) 55 | dim(bbbc037) 56 | ``` 57 | 58 | ## Extract common features 59 | 60 | ```{r} 61 | colnames_bbbc037 <- colnames(bbbc037) 62 | colnames_bbbc043 <- colnames(bbbc043) 63 | 64 | 65 | 
Metadata_names_bbbc037 <- c( 66 | stringr::str_subset(colnames_bbbc037, "^Meta") 67 | ) 68 | 69 | Metadata_names_bbbc043 <- c( 70 | stringr::str_subset(colnames_bbbc043, "^Meta") 71 | ) 72 | 73 | common_metadata <- intersect(Metadata_names_bbbc037, Metadata_names_bbbc043) 74 | common_features <- setdiff(intersect(colnames_bbbc037, colnames_bbbc043),common_metadata) 75 | 76 | ``` 77 | 78 | # Concatenate data sets 79 | 80 | ```{r} 81 | population <- bind_rows( 82 | bbbc037 %>% 83 | select(c(common_metadata, common_features)), 84 | bbbc043 %>% 85 | select(c(common_metadata, common_features)) 86 | ) %>% 87 | mutate(Metadata_perturbation = "genetic") %>% 88 | select(matches("^Meta"), everything()) 89 | ``` 90 | 91 | ## Important: update column names! 92 | 93 | ```{r} 94 | colnames_combined <- colnames(population) 95 | 96 | common_metadata <- c( 97 | stringr::str_subset(colnames_combined, "^Meta") 98 | ) 99 | 100 | common_features <- setdiff(colnames_combined, common_metadata) 101 | ``` 102 | 103 | Cytominer has problems handling column names '1', '2' so we rename them to 'Feature_1', ... 104 | 105 | ```{r} 106 | common_features <- paste0("Feature_",common_features) 107 | colnames(population) <- c(common_metadata, common_features) 108 | ``` 109 | 110 | # Normalize data 111 | We use cytominer to normalize both datasets with respect to the controls, i.e. 
EMPTY genes 112 | 113 | ```{r} 114 | population_normalized <- cytominer::normalize( 115 | population, 116 | variables = common_features, 117 | strata = c("Metadata_perturbation"), 118 | sample = population %>% 119 | filter( 120 | Metadata_gene_name == 'EMPTY', 121 | Metadata_partition == "Train" 122 | ), 123 | operation = "standardize" 124 | ) 125 | ``` 126 | 127 | ```{r} 128 | population_normalized %>% dim() %>% print 129 | ``` 130 | 131 | # Aggregate data 132 | 133 | ```{r} 134 | population_aggregated <- cytominer::aggregate( 135 | population = population_normalized, 136 | variables = common_features, 137 | strata = c("Metadata_gene_name","Metadata_dataset","Metadata_x_mutation_status"), 138 | operation = "mean" 139 | ) 140 | ``` 141 | 142 | ```{r} 143 | population_normalized %>% extract2("Metadata_gene_name") %>% print 144 | ``` 145 | 146 | ```{r} 147 | population_aggregated %>% slice(1:2) %>% print 148 | ``` 149 | 150 | # Correlation matrix 151 | 152 | ```{r} 153 | cor_matrix <- cor( 154 | x = population_aggregated %>% 155 | filter(Metadata_dataset == 'BBBC037') %>% 156 | select(common_features) %>% 157 | as.matrix() %>% 158 | t, 159 | y = population_aggregated %>% 160 | filter(Metadata_dataset == 'BBBC043') %>% 161 | select(common_features) %>% 162 | as.matrix() %>% 163 | t, 164 | use = "complete.obs" 165 | ) 166 | ``` 167 | 168 | # Submision file 169 | 170 | ```{r} 171 | # set column names 172 | colnames(cor_matrix) <- population_aggregated %>% 173 | filter(Metadata_dataset == 'BBBC043') %>% 174 | extract2("Metadata_x_mutation_status") 175 | 176 | # set row names 177 | rownames(cor_matrix) <- population_aggregated %>% 178 | filter(Metadata_dataset == 'BBBC037') %>% 179 | extract2("Metadata_gene_name") 180 | 181 | 182 | df <- cor_matrix %>% as_data_frame() %>% 183 | mutate(Metadata_gene_name = population_aggregated %>% 184 | filter(Metadata_dataset == 'BBBC037') %>% 185 | extract2("Metadata_gene_name")) %>% 186 | select(Metadata_gene_name, everything()) 187 | 
188 | # write submission file 189 | write.csv(df,"../cytodata-baseline_R.csv",row.names = FALSE) 190 | ``` 191 | 192 | ```{r} 193 | ``` -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # :microscope: CytoData - 2018 Challenge 2 | 3 | If we want to retrieve "matching" profiles from a large collection of image-based profiling experiments (for example to find similar drugs, similar genes, or drug-gene or drug-disease combinations), how do we ensure that the profiles are aligned well enough? 4 | The CytoData 2018 Challenge addresses this, featuring batch effect correction and cross dataset profile matching :cd: :twisted_rightwards_arrows: :dvd:. 5 | The challenge involves the transformation of signatures using machine learning :space_invader: or statistical methods :bar_chart:. 6 | You will be given two datasets of image-based signatures :cd: :heavy_plus_sign: :dvd: acquired at different times :date: :clock130: and with different experimental conditions :pill: :syringe: with the goal of retrieving correct matches accurately :dart:. 7 | See http://cytodata.org/ for details of the event. 8 | 9 | ## Table of Contents 10 | 11 | [Background](#tv-background) 12 | 13 | [Challenge](#checkered_flag-challenge) 14 | 15 | [Data](#dvd-data) 16 | 17 | [Format](#performing_arts-format) 18 | 19 | [Resources](#wrench-resources) 20 | 21 | 22 | # :tv: Background 23 | 24 | :alien: **: What is image-based profiling?** 25 | 26 | :sunglasses: : In the study of biological systems, microscopy images are used to measure the response of cells to treatments or perturbations. 27 | Cell state can be observed and quantitatively measured using images by following a computational workflow known as profiling. 28 | Single cells are first identified in all images, and then their main characteristics are represented in feature vectors. 
29 | The information of a population of cells is aggregated into a single vector, also called profile, containing summary statistics of the features of all cells. 30 | These profiles encode the morphological changes of cell populations exposed to treatments. 31 | Image-based profiles can be used to compare the response of cells to different treatments, and to map their similarities. 32 | 33 |

34 | Profiling 35 |

36 | 37 | 38 | :alien: **: Is image-based profiling the same as image-based screening?** 39 | 40 | :sunglasses: : Screening and profiling are different. 41 | Screening uses images to identify phenotype(s) of interest known beforehand. 42 | Profiling measures as many cell properties as possible, using all the phenotypes to identify relationships among multiple different samples. 43 | 44 | 45 | :alien: **: What are the applications of image-based profiling?** 46 | 47 | :sunglasses: : Image-based profiles can be used for drug discovery and functional genomics applications. 48 | There are many types of biological studies that can be conducted using image-based profiling. 49 | In the CytoData challenge, we use data from chemical and genetic perturbation experiments (see below). 50 | 51 |

52 | Applications 53 |

54 | 55 | 56 | :alien: **: What imaging assays can be used for profiling?** 57 | 58 | :sunglasses: : Virtually any imaging assay can be used for profiling, especially high-content assays. 59 | In the 2018 CytoData challenge, we use an imaging assay called Cell Painting, that paints the cells with 6 stains, imaged in 5 channels, highlighting 8 cellular compartments. 60 | This is an unbiased, general purpose assay that maximizes information content for profiling, but the assay can be adapted to meet the needs of a research project. 61 | 62 |

63 | Applications 64 |

65 | 66 | 67 | # :checkered_flag: Challenge 68 | 69 | As in many biological experiments, imaging data may be subject to batch effects and undesired artifacts :scream:. 70 | More specifically, given two batches of microscopy images with the same treatments :pill:, but acquired under different technical conditions :a::vs::b:, a difference in the quantitative measures is likely to be observed :x:. 71 | These differences are not due to meaningful biological variations and can be removed using computational methods :computer:. 72 | 73 | The goal of the challenge :checkered_flag: is to analyze the profiles of two different batches of data :a::b: and design computational methods to correct batch effects :white_check_mark:. 74 | A successful method :trophy: will be able to align the information content of both batches :ab:, 75 | making profiles of the same treatment have similar measurements without distorting the relationships among other treatments :smiley:. 76 | The following metrics will be used to assess the quality of entries :triangular_ruler:: 77 | 78 | 1. :arrow_upper_right::arrow_upper_right: Replicate correlation 79 | 2. :top::arrows_counterclockwise: Enrichment of biologically relevant matches in the top connections 80 | 3. :id::white_check_mark: Correct association of treatment type 81 | 82 | ## :bulb: Tip 83 | 84 | From the data analysis perspective, the problem can be formulated in various ways, including 85 | manifold learning, domain adaptation, subspace alignment, and transfer learning. 86 | 87 |

88 | Domains 89 |

90 | 91 | 92 | # :dvd: Data 93 | 94 | We are glad to announce that four datasets will be provided during the CytoData 2018 Challenge :tada::tada::tada::tada:. 95 | All of them were acquired using the Cell Painting assay, at high-throughput, in 384 well plates :microscope:, as part of the research 96 | conducted at the Broad Institute of MIT and Harvard. 97 | The following table describes the experimental details of each dataset. 98 | 99 | | Dataset :dvd: | Type :syringe: :pill: | Number of treatments :hash: | Cell line :cancer: | 100 | |---|---|---|---| 101 | | BBBC037 | Genetic perturbations. ORF over-expression | 200 wild type genes | U2OS | 102 | | BBBC043 | Genetic perturbations. ORF over-expression | 596 alleles of 53 genes | A549 | 103 | | BBBC022 | Chemical perturbations. Bioactive compounds | 1,600 compounds | U2OS | 104 | | BBBC036 | Chemical perturbations. Bioactive compounds | 5,000 compounds | U2OS | 105 | 106 | Notice that two datasets represent genetic perturbations and the other two represent chemical perturbations. 107 | The challenge will consider the cross-dataset matching problem across each of the two pairs :cd::twisted_rightwards_arrows::dvd:, 108 | i.e., profiles in BBBC037 have to be matched with profiles in BBBC043 because both contain genetic perturbations. 109 | Similarly, profiles in BBBC022 have to be matched with profiles in BBBC036 because both contain chemical perturbations. 110 | 111 | The imaging data for all three datasets is more than 3TB of data :boom:, which will be available to everyone during and after the challenge. 112 | However, to facilitate the analysis of treatment profiles and to focus on the cross-dataset matching problem, all the datasets have been processed 113 | beforehand using the profiling workflow described above :sunglasses:. 114 | In particular, two versions of well-level population profiles will be available during the challenge: 115 | 1. 
Classical features computed with the CellProfiler software using pipelines optimized for Cell Painting images. 116 | 2. Deep learning features computed with a convolutional neural network pretrained on ImageNet. 117 | 118 | ## Data available on AWS 119 | 120 | As of 2024, all data has moved to the [Cell Painting Gallery](https://github.com/broadinstitute/cellpainting-gallery) at `s3://cellpainting-gallery`. 121 | The folder structure has changed slightly from the original structure to comply with [Cell Painting Gallery formatting](https://github.com/broadinstitute/cellpainting-gallery/blob/main/folder_structure.md). 122 | 123 | The datasets have undergone the following renaming in the Cell Painting Gallery: 124 | Bioactives-BBBC022-Gustafsdottir => `cpg0030-gustafsdottir-cellpainting` 125 | CDRPBIO-BBBC036-Bray => `cpg0012-wawer-bioactivecompoundprofiling` 126 | LUAD-BBBC041-Caicedo => `cpg0031-caicedo-cmvip` 127 | 128 | 129 | During the Cytodata hackathon the data was available as an Amazon Public Data Set on https://registry.opendata.aws/cell-painting-image-collection/ at `s3://cytodata`. 130 | All image data and extracted single cell features and aggregated profiles were found in `s3://cytodata/datasets/` with the following structure: 131 | ``` 132 | . 
133 | ├── Bioactives-BBBC022-Gustafsdottir 134 | │ ├── profiles 135 | │ │ └── Bioactives-BBBC022-Gustafsdottir 136 | │ ├── images 137 | │ │ └── Bioactives-BBBC022-Gustafsdottir 138 | │ └── metadata 139 | │ └── Bioactives-BBBC022-Gustafsdottir 140 | ├── CDRPBIO-BBBC036-Bray 141 | │ ├── profiles 142 | │ │ └── CDRPBIO-BBBC036-Bray 143 | │ ├── images 144 | │ │ └── CDRPBIO-BBBC036-Bray 145 | │ └── metadata 146 | │ └── CDRPBIO-BBBC036-Bray 147 | ├── LUAD-BBBC041-Caicedo 148 | │ ├── profiles 149 | │ │ └── LUAD-BBBC041-Caicedo 150 | │ ├── images 151 | │ │ └── LUAD-BBBC041-Caicedo 152 | │ └── metadata 153 | │ └── LUAD-BBBC041-Caicedo 154 | └── TA-ORF-BBBC037-Rohban 155 | ├── profiles 156 | │ └── TA-ORF-BBBC037-Rohban 157 | ├── images 158 | │ └── TA-ORF-BBBC037-Rohban 159 | └── metadata 160 | └── TA-ORF-BBBC037-Rohban 161 | ``` 162 | 163 | The subfolders contain the following information: 164 | * the directory `images` contains Cell Painting images as tiff files 165 | * the directory `profiles` contains single cell data in sqlite format and profiles aggregated to replicate level as csv files (aggregated as mean profiles per well) 166 | * the metadata directory contains information about the platemaps and the used perturbations. 167 | 168 | 169 | # :performing_arts: Format 170 | 171 | The CytoData 2018 challenge will be a collaborative hackathon :sparkles::computer:, with participants forming teams to discuss and implement solutions to the problem. 172 | The challenge will run for two days only, so participants are encouraged to investigate and plan some solutions before the event starts :pencil:. 173 | In order to meet other participants, we will provide a slack channel to make general announcements and allow participants to organize teams and exchange ideas :bulb:. 
174 | It's also a great idea to start discussing methods here in this GitHub repository :octocat:: 175 | 176 | ```add issues with relevant links if you want to suggest a methodology and discuss it with other participants!``` 177 | 178 | 179 | Teams will have no fewer than three :three: and no more than five :five: participants, ideally from different institutions. 180 | Teams will compete with each other :rage1: to improve the three performance metrics mentioned above :bowling:. 181 | Participants of the team will be able to upload solutions to a scoreboard to check that everything is running properly and to get feedback on performance :ok_hand:. 182 | The best performing solutions will win prizes provided by our sponsors! :trophy::clap: 183 | 184 | 185 | # :wrench: Resources 186 | 187 | The following resources will be provided during the challenge: 188 | 189 | 1. :satellite: Internet connection. 190 | 2. :dvd: Access to all files of the four datasets, including pre-computed profiles. 191 | 3. :octocat: A toolkit, written in R and Python, to load the pre-computed profiles, run a baseline model and create a submission. 192 | 4. :chart_with_upwards_trend: An account in the scoreboard to evaluate the generated submissions. 193 | 5. :computer: Teams will be given access to pre-configured virtual machines in the Amazon Cloud to run experiments. 194 | 195 | Participants of the challenge can make use of their own computational resources (laptops, servers, etc) to run experiments during the challenge. 
196 | -------------------------------------------------------------------------------- /cytodata-toolkit/R/Create-Submission_R-day2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": { 7 | "scrolled": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "library(tidyverse)\n", 12 | "library(cytominer)\n", 13 | "library(magrittr)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 5, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "load_dataset <- function(partition, dataset,feature){\n", 23 | " file_name <- read_csv(\"../datasets.csv\") \n", 24 | " x <- file_name %>% filter(\n", 25 | " Partition == partition,\n", 26 | " Dataset == dataset,\n", 27 | " Features == feature) %>% \n", 28 | " extract2(\"Link\")\n", 29 | "\n", 30 | " return(read_csv(x) %>% \n", 31 | " mutate(Metadata_dataset = dataset) %>%\n", 32 | " mutate(Metadata_partition = partition) %>% \n", 33 | " mutate(Metadata_features = feature) \n", 34 | " )\n", 35 | " }" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "# Load data \n", 43 | "We load training and test datasets for both genetic perturbation experiments " 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stderr", 53 | "output_type": "stream", 54 | "text": [ 55 | "Parsed with column specification:\n", 56 | "cols(\n", 57 | " Dataset = col_character(),\n", 58 | " Partition = col_character(),\n", 59 | " Features = col_character(),\n", 60 | " Link = col_character()\n", 61 | ")\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "# bbbc37 data \n", 67 | "bbbc036_train <- load_dataset(\"Train\",\"BBBC036\",\"DeepLearning\") %>% \n", 68 | " mutate(Metadata_x_mutation_status = \"none\")\n", 69 | "\n", 70 | "bbbc036_test <- 
load_dataset(\"Test\",\"BBBC036\",\"DeepLearning\") %>% \n", 71 | " mutate(Metadata_x_mutation_status = \"none\")\n", 72 | "\n", 73 | "bbbc036 <- rbind(bbbc036_train, bbbc036_test)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "bbbc036_train %>% dim()\n", 83 | "bbbc036_test %>% dim()" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "# bbbc043 data \n", 93 | "bbbc022_train <- load_dataset(\"Train\",\"BBBC022\",\"DeepLearning\")\n", 94 | " \n", 95 | "bbbc022_test <- load_dataset(\"Test\",\"BBBC022\",\"DeepLearning\")\n", 96 | "\n", 97 | "bbbc022 <- rbind(bbbc022_train, bbbc022_test)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "## Check dimensionality" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "dim(bbbc022)\n", 114 | "dim(bbbc036)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "## Extract common features " 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "colnames_bbbc022 <- colnames(bbbc022)\n", 131 | "colnames_bbbc036 <- colnames(bbbc036)\n", 132 | "\n", 133 | "\n", 134 | "Metadata_names_bbbc022 <- c(\n", 135 | " stringr::str_subset(colnames_bbbc022, \"^Meta\")\n", 136 | ") \n", 137 | "\n", 138 | "Metadata_names_bbbc036 <- c(\n", 139 | " stringr::str_subset(colnames_bbbc036, \"^Meta\")\n", 140 | ") \n", 141 | "\n", 142 | "common_metadata <- intersect(Metadata_names_bbbc022, Metadata_names_bbbc036) \n", 143 | "common_features <- setdiff(intersect(colnames_bbbc022, colnames_bbbc036),common_metadata)\n", 144 | "\n", 145 | "colnames_bbbc036 %>% length()" 146 | ] 147 | 
}, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "bbbc022_na_feature <- cytominer::drop_na_columns(\n", 155 | " population = bbbc022 %>% \n", 156 | " filter(\n", 157 | " Metadata_broad_sample == \"DMSO\"\n", 158 | " ) %>% \n", 159 | " slice(1:100),\n", 160 | " variables = common_features,\n", 161 | " cutoff = 0\n", 162 | " )\n", 163 | "\n", 164 | "#bbbc036_na_feature <- cytominer::drop_na_columns(\n", 165 | "# population = bbbc036,\n", 166 | "# variables = common_features,\n", 167 | "# cutoff = 0\n", 168 | "# )" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "bbbc022_na_feature %>% print" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "features_to_remove <- cytominer::variance_threshold(\n", 187 | " variables = common_features,\n", 188 | " sample = bbbc022 %>% \n", 189 | " filter(\n", 190 | " Metadata_broad_sample == \"DMSO\"\n", 191 | " ) %>% \n", 192 | " slice(1:100)\n", 193 | ")" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "# Concatenate data sets" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "population <- rbind(\n", 217 | " bbbc022 %>% \n", 218 | " select(c(common_metadata, common_features)),\n", 219 | " bbbc036 %>% \n", 220 | " select(c(common_metadata, common_features))\n", 221 | " ) %>% \n", 222 | " mutate(Metadata_perturbation = 'chemical') %>% \n", 223 | " select(Metadata_perturbation, everything())" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": 
{}, 229 | "source": [ 230 | "## Important: update column names! " 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "colnames_combined <- colnames(population)\n", 240 | "\n", 241 | "common_metadata <- c(\n", 242 | " stringr::str_subset(colnames_combined, \"^Meta\")\n", 243 | ") \n", 244 | "\n", 245 | "common_features <- setdiff(colnames_combined, common_metadata)\n" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "Cytominer has problems handling column names '1', '2' so we rename them to 'Feature_1', ... " 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "common_features <- paste0(\"Feature_\",common_features)\n", 262 | "colnames(population) <- c(common_metadata, common_features)" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "# Normalize data\n", 270 | "We use cytominer to normalize both datasets with respect to the controls, i.e. 
EMPTY genes" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "population_normalized <- cytominer::normalize(\n", 280 | " population, \n", 281 | " variables = common_features, \n", 282 | " strata = c(\"Metadata_perturbation\"), \n", 283 | " sample = population %>% \n", 284 | " filter(\n", 285 | " Metadata_broad_sample == \"DMSO\"\n", 286 | " ) %>% \n", 287 | " slice(1:100), \n", 288 | " operation = \"standardize\"\n", 289 | ")" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "population_normalized %>% dim() %>% print" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "# Aggregate data " 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "population_aggregated <- cytominer::aggregate(\n", 315 | " population = population_normalized, \n", 316 | " variables = common_features, \n", 317 | " strata = c(\"Metadata_broad_sample\",\"Metadata_dataset\"), \n", 318 | " operation = \"mean\"\n", 319 | ") " 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "population_normalized %>% extract2(\"Metadata_broad_sample\") %>% print" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "population_aggregated %>% slice(1:2) %>% print" 338 | ] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "metadata": {}, 343 | "source": [ 344 | "# Correlation matrix " 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "metadata": {}, 351 | "outputs": [], 352 | "source": [ 353 | "cor_matrix <- cor(\n", 354 | " x = 
population_aggregated %>% \n", 355 | " filter(Metadata_dataset == 'BBBC022') %>% \n", 356 | " select(common_features) %>% \n", 357 | " as.matrix() %>% \n", 358 | " t, \n", 359 | " y = population_aggregated %>% \n", 360 | " filter(Metadata_dataset == 'BBBC036') %>% \n", 361 | " select(common_features) %>% \n", 362 | " as.matrix() %>% \n", 363 | " t,\n", 364 | " use = \"complete.obs\"\n", 365 | " ) \n" 366 | ] 367 | }, 368 | { 369 | "cell_type": "markdown", 370 | "metadata": {}, 371 | "source": [ 372 | "# Submission file " 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": {}, 379 | "outputs": [], 380 | "source": [ 381 | "# set column names \n", 382 | "colnames(cor_matrix) <- population_aggregated %>% \n", 383 | " filter(Metadata_dataset == 'BBBC036') %>%\n", 384 | " extract2(\"Metadata_pert_id\")\n", 385 | "\n", 386 | "# set row names \n", 387 | "#rownames(cor_matrix) <- population_aggregated %>% \n", 388 | "# filter(Metadata_dataset == 'BBBC036') %>%\n", 389 | "# extract2(\"Metadata_broad_sample\")#\n" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "df <- cor_matrix %>% as_data_frame() %>% \n", 399 | " mutate(Metadata_pert_id = population_aggregated %>% \n", 400 | " filter(Metadata_dataset == 'BBBC022') %>%\n", 401 | " extract2(\"Metadata_pert_id\")) %>% \n", 402 | " select(Metadata_pert_id, everything())\n", 403 | "\n", 404 | "# write submission file\n", 405 | "write.csv(df,\"../cytodata-baseline_R_day_2.csv\",row.names = FALSE)" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": null, 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [ 414 | "df %>% print" 415 | ] 416 | } 417 | ], 418 | "metadata": { 419 | "kernelspec": { 420 | "display_name": "R", 421 | "language": "R", 422 | "name": "ir" 423 | }, 424 | "language_info": { 425 | "codemirror_mode": "r", 426 | "file_extension": 
".r", 427 | "mimetype": "text/x-r-source", 428 | "name": "R", 429 | "pygments_lexer": "r", 430 | "version": "3.4.4" 431 | } 432 | }, 433 | "nbformat": 4, 434 | "nbformat_minor": 2 435 | } 436 | -------------------------------------------------------------------------------- /cytodata-toolkit/R/Create-Submission_R-day2-CP.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "scrolled": true 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stderr", 12 | "output_type": "stream", 13 | "text": [ 14 | "── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──\n", 15 | "✔ ggplot2 3.0.0 ✔ purrr 0.2.5\n", 16 | "✔ tibble 1.4.2 ✔ dplyr 0.7.6\n", 17 | "✔ tidyr 0.8.1 ✔ stringr 1.3.1\n", 18 | "✔ readr 1.1.1 ✔ forcats 0.3.0\n", 19 | "── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──\n", 20 | "✖ dplyr::filter() masks stats::filter()\n", 21 | "✖ dplyr::lag() masks stats::lag()\n", 22 | "\n", 23 | "Attaching package: ‘cytominer’\n", 24 | "\n", 25 | "The following object is masked from ‘package:stats’:\n", 26 | "\n", 27 | " aggregate\n", 28 | "\n", 29 | "The following object is masked from ‘package:base’:\n", 30 | "\n", 31 | " transform\n", 32 | "\n", 33 | "\n", 34 | "Attaching package: ‘magrittr’\n", 35 | "\n", 36 | "The following object is masked from ‘package:purrr’:\n", 37 | "\n", 38 | " set_names\n", 39 | "\n", 40 | "The following object is masked from ‘package:tidyr’:\n", 41 | "\n", 42 | " extract\n", 43 | "\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "library(tidyverse)\n", 49 | "library(cytominer)\n", 50 | "library(magrittr)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "# function to load different data sets\n", 58 | "This function also adds the Metdata columns Metadata_dataset, Metadata_partition and Metadata_features" 59 | ] 60 | }, 61 | { 
62 | "cell_type": "code", 63 | "execution_count": 2, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "load_dataset <- function(partition, dataset,feature){\n", 68 | " file_name <- read_csv(\"../datasets.csv\") \n", 69 | " x <- file_name %>% filter(\n", 70 | " Partition == partition,\n", 71 | " Dataset == dataset,\n", 72 | " Features == feature) %>% \n", 73 | " extract2(\"Link\")\n", 74 | "\n", 75 | " return(read_csv(x) %>% \n", 76 | " mutate(Metadata_dataset = dataset) %>%\n", 77 | " mutate(Metadata_partition = partition) %>% \n", 78 | " mutate(Metadata_features = feature) \n", 79 | " )\n", 80 | " }" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "# Load data \n", 88 | "We load the training and test datasets for the BBBC036 / CDRP data set and select only important Metadata columns" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 3, 94 | "metadata": { 95 | "scrolled": true 96 | }, 97 | "outputs": [ 98 | { 99 | "name": "stderr", 100 | "output_type": "stream", 101 | "text": [ 102 | "Parsed with column specification:\n", 103 | "cols(\n", 104 | " Dataset = col_character(),\n", 105 | " Partition = col_character(),\n", 106 | " Features = col_character(),\n", 107 | " Link = col_character()\n", 108 | ")\n", 109 | "Parsed with column specification:\n", 110 | "cols(\n", 111 | " .default = col_double(),\n", 112 | " Metadata_Plate = col_integer(),\n", 113 | " Metadata_Well = col_character(),\n", 114 | " Metadata_Assay_Plate_Barcode = col_integer(),\n", 115 | " Metadata_Plate_Map_Name = col_character(),\n", 116 | " Metadata_well_position = col_character(),\n", 117 | " Metadata_ASSAY_WELL_ROLE = col_character(),\n", 118 | " Metadata_broad_sample = col_character(),\n", 119 | " Metadata_solvent = col_character(),\n", 120 | " Metadata_pert_id = col_character(),\n", 121 | " Metadata_pert_mfc_id = col_character(),\n", 122 | " Metadata_pert_well = col_character(),\n", 123 | " Metadata_pert_id_vendor = 
col_character(),\n", 124 | " Metadata_cell_id = col_character(),\n", 125 | " Metadata_broad_sample_type = col_character(),\n", 126 | " Metadata_pert_vehicle = col_character(),\n", 127 | " Metadata_pert_type = col_character(),\n", 128 | " Cells_AreaShape_EulerNumber = col_integer(),\n", 129 | " Cells_Children_Cytoplasm_Count = col_integer(),\n", 130 | " Cells_Neighbors_FirstClosestObjectNumber_5 = col_integer(),\n", 131 | " Cells_Neighbors_FirstClosestObjectNumber_Adjacent = col_integer()\n", 132 | " # ... with 20 more columns\n", 133 | ")\n", 134 | "See spec(...) for full column specifications.\n", 135 | "Warning message in rbind(names(probs), probs_f):\n", 136 | "“number of columns of result is not a multiple of vector length (arg 1)”Warning message:\n", 137 | "“2201 parsing failures.\n", 138 | "row # A tibble: 5 x 5 col row col expected actual file expected actual 1 1013 Cells_Neighbors_Fir… no trailing … .5 'https://s3.amazonaws.com/cy… file 2 1013 Cells_Neighbors_Fir… no trailing … .5 'https://s3.amazonaws.com/cy… row 3 1013 Nuclei_AreaShape_Ar… no trailing … .5 'https://s3.amazonaws.com/cy… col 4 1013 Nuclei_Neighbors_Se… no trailing … .5 'https://s3.amazonaws.com/cy… expected 5 1017 Nuclei_AreaShape_Ar… no trailing … .5 'https://s3.amazonaws.com/cy…\n", 139 | "... ................. ... ............................................................................... ........ ............................................................................... ...... ............................................................................... .... ............................................................................... ... ............................................................................... ... ............................................................................... ........ ...............................................................................\n", 140 | "See problems(...) 
for more details.\n", 141 | "”Parsed with column specification:\n", 142 | "cols(\n", 143 | " Dataset = col_character(),\n", 144 | " Partition = col_character(),\n", 145 | " Features = col_character(),\n", 146 | " Link = col_character()\n", 147 | ")\n", 148 | "Parsed with column specification:\n", 149 | "cols(\n", 150 | " .default = col_double(),\n", 151 | " Metadata_Plate = col_integer(),\n", 152 | " Metadata_Well = col_character(),\n", 153 | " Metadata_Assay_Plate_Barcode = col_integer(),\n", 154 | " Metadata_Plate_Map_Name = col_character(),\n", 155 | " Metadata_well_position = col_character(),\n", 156 | " Metadata_ASSAY_WELL_ROLE = col_character(),\n", 157 | " Metadata_broad_sample = col_character(),\n", 158 | " Metadata_solvent = col_character(),\n", 159 | " Metadata_pert_id = col_character(),\n", 160 | " Metadata_pert_mfc_id = col_character(),\n", 161 | " Metadata_pert_well = col_character(),\n", 162 | " Metadata_pert_id_vendor = col_character(),\n", 163 | " Metadata_cell_id = col_character(),\n", 164 | " Metadata_broad_sample_type = col_character(),\n", 165 | " Metadata_pert_vehicle = col_character(),\n", 166 | " Metadata_pert_type = col_character(),\n", 167 | " Cells_AreaShape_EulerNumber = col_integer(),\n", 168 | " Cells_Children_Cytoplasm_Count = col_integer(),\n", 169 | " Cytoplasm_AreaShape_EulerNumber = col_integer(),\n", 170 | " Cytoplasm_Correlation_Manders_AGP_DNA = col_integer()\n", 171 | " # ... with 31 more columns\n", 172 | ")\n", 173 | "See spec(...) 
for full column specifications.\n", 174 | "Warning message in rbind(names(probs), probs_f):\n", 175 | "“number of columns of result is not a multiple of vector length (arg 1)”Warning message:\n", 176 | "“33 parsing failures.\n", 177 | "row # A tibble: 5 x 5 col row col expected actual file expected actual 1 1103 Nuclei_Correlat… no trailing … .8570798… 'https://s3.amazonaws.com/cyt… file 2 1103 Nuclei_Correlat… no trailing … .5790690… 'https://s3.amazonaws.com/cyt… row 3 1103 Nuclei_Correlat… no trailing … .9671379… 'https://s3.amazonaws.com/cyt… col 4 1103 Nuclei_Correlat… no trailing … .6148584… 'https://s3.amazonaws.com/cyt… expected 5 1104 Nuclei_Correlat… no trailing … .9769246… 'https://s3.amazonaws.com/cyt…\n", 178 | "... ................. ... ............................................................................... ........ ............................................................................... ...... ............................................................................... .... ............................................................................... ... ............................................................................... ... ............................................................................... ........ ...............................................................................\n", 179 | "See problems(...) 
for more details.\n", 180 | "”" 181 | ] 182 | } 183 | ], 184 | "source": [ 185 | "# bbbc36 data \n", 186 | "bbbc036_train <- load_dataset(\"Train\",\"BBBC036\",\"CellProfiler\") %>% \n", 187 | " mutate(Metadata_x_mutation_status = \"none\")\n", 188 | "\n", 189 | "bbbc036_test <- load_dataset(\"Test\",\"BBBC036\",\"CellProfiler\") %>% \n", 190 | " mutate(Metadata_x_mutation_status = \"none\")\n" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 4, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "bbbc036 <- rbind(bbbc036_train, bbbc036_test) " 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "# How large are the data sets? " 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 5, 212 | "metadata": {}, 213 | "outputs": [ 214 | { 215 | "data": { 216 | "text/html": [ 217 | "
    \n", 218 | "\t
  1. 18929
  2. \n", 219 | "\t
  3. 1805
  4. \n", 220 | "
\n" 221 | ], 222 | "text/latex": [ 223 | "\\begin{enumerate*}\n", 224 | "\\item 18929\n", 225 | "\\item 1805\n", 226 | "\\end{enumerate*}\n" 227 | ], 228 | "text/markdown": [ 229 | "1. 18929\n", 230 | "2. 1805\n", 231 | "\n", 232 | "\n" 233 | ], 234 | "text/plain": [ 235 | "[1] 18929 1805" 236 | ] 237 | }, 238 | "metadata": {}, 239 | "output_type": "display_data" 240 | }, 241 | { 242 | "data": { 243 | "text/html": [ 244 | "
    \n", 245 | "\t
  1. 2177
  2. \n", 246 | "\t
  3. 1805
  4. \n", 247 | "
\n" 248 | ], 249 | "text/latex": [ 250 | "\\begin{enumerate*}\n", 251 | "\\item 2177\n", 252 | "\\item 1805\n", 253 | "\\end{enumerate*}\n" 254 | ], 255 | "text/markdown": [ 256 | "1. 2177\n", 257 | "2. 1805\n", 258 | "\n", 259 | "\n" 260 | ], 261 | "text/plain": [ 262 | "[1] 2177 1805" 263 | ] 264 | }, 265 | "metadata": {}, 266 | "output_type": "display_data" 267 | } 268 | ], 269 | "source": [ 270 | "bbbc036_train %>% dim()\n", 271 | "bbbc036_test %>% dim()" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "# What are the Metadata colums?" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 6, 284 | "metadata": {}, 285 | "outputs": [ 286 | { 287 | "name": "stdout", 288 | "output_type": "stream", 289 | "text": [ 290 | "# A tibble: 5 x 4\n", 291 | " Metadata_Plate Metadata_Well Metadata_Plate_Map_Name Metadata_pert_id\n", 292 | " \n", 293 | "1 24277 a01 H-BIOA-004-3 BRD-K18250272 \n", 294 | "2 24277 a02 H-BIOA-004-3 BRD-K18316707 \n", 295 | "3 24277 a03 H-BIOA-004-3 BRD-K18438502 \n", 296 | "4 24277 a04 H-BIOA-004-3 BRD-K18550767 \n", 297 | "5 24277 a05 H-BIOA-004-3 BRD-K18574842 \n" 298 | ] 299 | } 300 | ], 301 | "source": [ 302 | "bbbc036 %>% \n", 303 | " select(Metadata_Plate, Metadata_Well, Metadata_Plate_Map_Name, Metadata_pert_id) %>% \n", 304 | " slice(1:5) %>% \n", 305 | " print()" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 27, 311 | "metadata": {}, 312 | "outputs": [ 313 | { 314 | "name": "stdout", 315 | "output_type": "stream", 316 | "text": [ 317 | "# A tibble: 5 x 4\n", 318 | " Metadata_pert_id Metadata_broad_sample_ty… Metadata_dataset Metadata_partiti…\n", 319 | " \n", 320 | "1 BRD-K18250272 trt BBBC036 Train \n", 321 | "2 BRD-K18316707 trt BBBC036 Train \n", 322 | "3 BRD-K18438502 trt BBBC036 Train \n", 323 | "4 BRD-K18550767 trt BBBC036 Train \n", 324 | "5 BRD-K18574842 trt BBBC036 Train \n" 325 | ] 326 | } 327 | ], 328 | "source": [ 329 | 
"bbbc036 %>% \n", 330 | " select( Metadata_pert_id, Metadata_broad_sample_type,Metadata_dataset, Metadata_partition) %>% \n", 331 | " slice(1:5) %>% \n", 332 | " print()" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "# how many replicates do we have? " 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 8, 345 | "metadata": {}, 346 | "outputs": [ 347 | { 348 | "data": { 349 | "text/html": [ 350 | "\n", 351 | "\n", 352 | "\n", 353 | "\t\n", 354 | "\t\n", 355 | "\t\n", 356 | "\t\n", 357 | "\t\n", 358 | "\t\n", 359 | "\t\n", 360 | "\n", 361 | "
Metadata_Plate_Map_Namemean_replicates
H-BIOA-001-39.600000
H-BIOA-002-38.373832
H-BIOA-003-39.554517
H-BIOA-004-39.570093
H-BIOA-005-39.563863
H-BIOA-006-39.554517
H-BIOA-007-39.593750
\n" 362 | ], 363 | "text/latex": [ 364 | "\\begin{tabular}{r|ll}\n", 365 | " Metadata\\_Plate\\_Map\\_Name & mean\\_replicates\\\\\n", 366 | "\\hline\n", 367 | "\t H-BIOA-001-3 & 9.600000 \\\\\n", 368 | "\t H-BIOA-002-3 & 8.373832 \\\\\n", 369 | "\t H-BIOA-003-3 & 9.554517 \\\\\n", 370 | "\t H-BIOA-004-3 & 9.570093 \\\\\n", 371 | "\t H-BIOA-005-3 & 9.563863 \\\\\n", 372 | "\t H-BIOA-006-3 & 9.554517 \\\\\n", 373 | "\t H-BIOA-007-3 & 9.593750 \\\\\n", 374 | "\\end{tabular}\n" 375 | ], 376 | "text/markdown": [ 377 | "\n", 378 | "Metadata_Plate_Map_Name | mean_replicates | \n", 379 | "|---|---|---|---|---|---|---|\n", 380 | "| H-BIOA-001-3 | 9.600000 | \n", 381 | "| H-BIOA-002-3 | 8.373832 | \n", 382 | "| H-BIOA-003-3 | 9.554517 | \n", 383 | "| H-BIOA-004-3 | 9.570093 | \n", 384 | "| H-BIOA-005-3 | 9.563863 | \n", 385 | "| H-BIOA-006-3 | 9.554517 | \n", 386 | "| H-BIOA-007-3 | 9.593750 | \n", 387 | "\n", 388 | "\n" 389 | ], 390 | "text/plain": [ 391 | " Metadata_Plate_Map_Name mean_replicates\n", 392 | "1 H-BIOA-001-3 9.600000 \n", 393 | "2 H-BIOA-002-3 8.373832 \n", 394 | "3 H-BIOA-003-3 9.554517 \n", 395 | "4 H-BIOA-004-3 9.570093 \n", 396 | "5 H-BIOA-005-3 9.563863 \n", 397 | "6 H-BIOA-006-3 9.554517 \n", 398 | "7 H-BIOA-007-3 9.593750 " 399 | ] 400 | }, 401 | "metadata": {}, 402 | "output_type": "display_data" 403 | } 404 | ], 405 | "source": [ 406 | "bbbc036 %>% \n", 407 | " group_by(Metadata_Plate_Map_Name, Metadata_pert_id) %>%\n", 408 | " summarise(n_groups = n()) %>%\n", 409 | " summarise(mean_replicates = mean(n_groups))" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": 9, 415 | "metadata": { 416 | "scrolled": true 417 | }, 418 | "outputs": [ 419 | { 420 | "name": "stderr", 421 | "output_type": "stream", 422 | "text": [ 423 | "Parsed with column specification:\n", 424 | "cols(\n", 425 | " Dataset = col_character(),\n", 426 | " Partition = col_character(),\n", 427 | " Features = col_character(),\n", 428 | " Link = 
col_character()\n", 429 | ")\n", 430 | "Parsed with column specification:\n", 431 | "cols(\n", 432 | " .default = col_double(),\n", 433 | " Metadata_Plate = col_integer(),\n", 434 | " Metadata_Well = col_character(),\n", 435 | " Metadata_Assay_Plate_Barcode = col_integer(),\n", 436 | " Metadata_Plate_Map_Name = col_character(),\n", 437 | " Metadata_well_position = col_character(),\n", 438 | " Metadata_broad_sample = col_character(),\n", 439 | " Metadata_source_name = col_character(),\n", 440 | " Metadata_compound_name = col_character(),\n", 441 | " Metadata_smiles = col_character(),\n", 442 | " Metadata_solvent = col_character(),\n", 443 | " Metadata_pert_id = col_character(),\n", 444 | " Metadata_pert_mfc_id = col_character(),\n", 445 | " Metadata_pert_well = col_character(),\n", 446 | " Metadata_pert_id_vendor = col_character(),\n", 447 | " Metadata_cell_id = col_character(),\n", 448 | " Metadata_broad_sample_type = col_character(),\n", 449 | " Metadata_pert_vehicle = col_character(),\n", 450 | " Metadata_pert_type = col_character(),\n", 451 | " Metadata_exp = col_character()\n", 452 | ")\n", 453 | "See spec(...) 
for full column specifications.\n", 454 | "Parsed with column specification:\n", 455 | "cols(\n", 456 | " Dataset = col_character(),\n", 457 | " Partition = col_character(),\n", 458 | " Features = col_character(),\n", 459 | " Link = col_character()\n", 460 | ")\n", 461 | "Parsed with column specification:\n", 462 | "cols(\n", 463 | " .default = col_double(),\n", 464 | " Metadata_Plate = col_integer(),\n", 465 | " Metadata_Well = col_character(),\n", 466 | " Metadata_Assay_Plate_Barcode = col_integer(),\n", 467 | " Metadata_Plate_Map_Name = col_character(),\n", 468 | " Metadata_well_position = col_character(),\n", 469 | " Metadata_broad_sample = col_character(),\n", 470 | " Metadata_source_name = col_character(),\n", 471 | " Metadata_compound_name = col_character(),\n", 472 | " Metadata_smiles = col_character(),\n", 473 | " Metadata_solvent = col_character(),\n", 474 | " Metadata_pert_id = col_character(),\n", 475 | " Metadata_pert_mfc_id = col_character(),\n", 476 | " Metadata_pert_well = col_character(),\n", 477 | " Metadata_pert_id_vendor = col_character(),\n", 478 | " Metadata_cell_id = col_character(),\n", 479 | " Metadata_broad_sample_type = col_character(),\n", 480 | " Metadata_pert_vehicle = col_character(),\n", 481 | " Metadata_pert_type = col_character(),\n", 482 | " Metadata_exp = col_character()\n", 483 | ")\n", 484 | "See spec(...) 
for full column specifications.\n" 485 | ] 486 | } 487 | ], 488 | "source": [ 489 | "# bbbc022 data \n", 490 | "bbbc022_train <- load_dataset(\"Train\",\"BBBC022\",\"CellProfiler\")\n", 491 | " \n", 492 | "bbbc022_test <- load_dataset(\"Test\",\"BBBC022\",\"CellProfiler\")\n", 493 | "\n", 494 | "bbbc022 <- rbind(bbbc022_train, bbbc022_test) %>%\n", 495 | " select(Metadata_Plate, Metadata_Well, Metadata_Plate_Map_Name, \n", 496 | " Metadata_pert_id, Metadata_broad_sample_type,\n", 497 | " Metadata_dataset,Metadata_partition, \n", 498 | " everything()\n", 499 | " ) " 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": {}, 505 | "source": [ 506 | "# How large are the training and test partitions for BBBC022?" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": 10, 512 | "metadata": {}, 513 | "outputs": [ 514 | { 515 | "data": { 516 | "text/html": [ 517 | "
    \n", 518 | "\t
  1. 6462
  2. \n", 519 | "\t
  3. 1806
  4. \n", 520 | "
\n" 521 | ], 522 | "text/latex": [ 523 | "\\begin{enumerate*}\n", 524 | "\\item 6462\n", 525 | "\\item 1806\n", 526 | "\\end{enumerate*}\n" 527 | ], 528 | "text/markdown": [ 529 | "1. 6462\n", 530 | "2. 1806\n", 531 | "\n", 532 | "\n" 533 | ], 534 | "text/plain": [ 535 | "[1] 6462 1806" 536 | ] 537 | }, 538 | "metadata": {}, 539 | "output_type": "display_data" 540 | }, 541 | { 542 | "data": { 543 | "text/html": [ 544 | "
    \n", 545 | "\t
  1. 1120
  2. \n", 546 | "\t
  3. 1806
  4. \n", 547 | "
\n" 548 | ], 549 | "text/latex": [ 550 | "\\begin{enumerate*}\n", 551 | "\\item 1120\n", 552 | "\\item 1806\n", 553 | "\\end{enumerate*}\n" 554 | ], 555 | "text/markdown": [ 556 | "1. 1120\n", 557 | "2. 1806\n", 558 | "\n", 559 | "\n" 560 | ], 561 | "text/plain": [ 562 | "[1] 1120 1806" 563 | ] 564 | }, 565 | "metadata": {}, 566 | "output_type": "display_data" 567 | } 568 | ], 569 | "source": [ 570 | "bbbc022_train %>% dim()\n", 571 | "bbbc022_test %>% dim()" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": 11, 577 | "metadata": {}, 578 | "outputs": [ 579 | { 580 | "data": { 581 | "text/html": [ 582 | "\n", 583 | "\n", 584 | "\n", 585 | "\t\n", 586 | "\t\n", 587 | "\t\n", 588 | "\t\n", 589 | "\t\n", 590 | "\n", 591 | "
Metadata_Plate_Map_Namemean_replicates
H-BIOA-002-14.860759
H-BIOA-003-14.780255
H-BIOA-004-14.830128
H-BIOA-005-14.816456
H-BIOA-006-14.797468
\n" 592 | ], 593 | "text/latex": [ 594 | "\\begin{tabular}{r|ll}\n", 595 | " Metadata\\_Plate\\_Map\\_Name & mean\\_replicates\\\\\n", 596 | "\\hline\n", 597 | "\t H-BIOA-002-1 & 4.860759 \\\\\n", 598 | "\t H-BIOA-003-1 & 4.780255 \\\\\n", 599 | "\t H-BIOA-004-1 & 4.830128 \\\\\n", 600 | "\t H-BIOA-005-1 & 4.816456 \\\\\n", 601 | "\t H-BIOA-006-1 & 4.797468 \\\\\n", 602 | "\\end{tabular}\n" 603 | ], 604 | "text/markdown": [ 605 | "\n", 606 | "Metadata_Plate_Map_Name | mean_replicates | \n", 607 | "|---|---|---|---|---|\n", 608 | "| H-BIOA-002-1 | 4.860759 | \n", 609 | "| H-BIOA-003-1 | 4.780255 | \n", 610 | "| H-BIOA-004-1 | 4.830128 | \n", 611 | "| H-BIOA-005-1 | 4.816456 | \n", 612 | "| H-BIOA-006-1 | 4.797468 | \n", 613 | "\n", 614 | "\n" 615 | ], 616 | "text/plain": [ 617 | " Metadata_Plate_Map_Name mean_replicates\n", 618 | "1 H-BIOA-002-1 4.860759 \n", 619 | "2 H-BIOA-003-1 4.780255 \n", 620 | "3 H-BIOA-004-1 4.830128 \n", 621 | "4 H-BIOA-005-1 4.816456 \n", 622 | "5 H-BIOA-006-1 4.797468 " 623 | ] 624 | }, 625 | "metadata": {}, 626 | "output_type": "display_data" 627 | } 628 | ], 629 | "source": [ 630 | "bbbc022 %>% \n", 631 | " group_by(Metadata_Plate_Map_Name, Metadata_pert_id) %>%\n", 632 | " summarise(n_groups = n()) %>%\n", 633 | " summarise(mean_replicates = mean(n_groups))" 634 | ] 635 | }, 636 | { 637 | "cell_type": "markdown", 638 | "metadata": {}, 639 | "source": [ 640 | "# How large are the combined data sets? " 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "execution_count": 12, 646 | "metadata": {}, 647 | "outputs": [ 648 | { 649 | "data": { 650 | "text/html": [ 651 | "
    \n", 652 | "\t
  1. 7582
  2. \n", 653 | "\t
  3. 1806
  4. \n", 654 | "
\n" 655 | ], 656 | "text/latex": [ 657 | "\\begin{enumerate*}\n", 658 | "\\item 7582\n", 659 | "\\item 1806\n", 660 | "\\end{enumerate*}\n" 661 | ], 662 | "text/markdown": [ 663 | "1. 7582\n", 664 | "2. 1806\n", 665 | "\n", 666 | "\n" 667 | ], 668 | "text/plain": [ 669 | "[1] 7582 1806" 670 | ] 671 | }, 672 | "metadata": {}, 673 | "output_type": "display_data" 674 | }, 675 | { 676 | "data": { 677 | "text/html": [ 678 | "
    \n", 679 | "\t
  1. 21106
  2. \n", 680 | "\t
  3. 1805
  4. \n", 681 | "
\n" 682 | ], 683 | "text/latex": [ 684 | "\\begin{enumerate*}\n", 685 | "\\item 21106\n", 686 | "\\item 1805\n", 687 | "\\end{enumerate*}\n" 688 | ], 689 | "text/markdown": [ 690 | "1. 21106\n", 691 | "2. 1805\n", 692 | "\n", 693 | "\n" 694 | ], 695 | "text/plain": [ 696 | "[1] 21106 1805" 697 | ] 698 | }, 699 | "metadata": {}, 700 | "output_type": "display_data" 701 | } 702 | ], 703 | "source": [ 704 | "dim(bbbc022)\n", 705 | "dim(bbbc036)" 706 | ] 707 | }, 708 | { 709 | "cell_type": "markdown", 710 | "metadata": {}, 711 | "source": [ 712 | "## Extract common features and common metadata" 713 | ] 714 | }, 715 | { 716 | "cell_type": "code", 717 | "execution_count": 13, 718 | "metadata": {}, 719 | "outputs": [], 720 | "source": [ 721 | "colnames_bbbc022 <- colnames(bbbc022)\n", 722 | "colnames_bbbc036 <- colnames(bbbc036)\n", 723 | "\n", 724 | "Metadata_names_bbbc022 <- c(\n", 725 | " stringr::str_subset(colnames_bbbc022, \"^Meta\")\n", 726 | ") \n", 727 | "\n", 728 | "Metadata_names_bbbc036 <- c(\n", 729 | " stringr::str_subset(colnames_bbbc036, \"^Meta\")\n", 730 | ") \n", 731 | "\n", 732 | "common_metadata <- intersect(Metadata_names_bbbc022, Metadata_names_bbbc036) \n", 733 | "common_features <- setdiff(intersect(colnames_bbbc022, colnames_bbbc036),common_metadata)" 734 | ] 735 | }, 736 | { 737 | "cell_type": "markdown", 738 | "metadata": {}, 739 | "source": [ 740 | "# Concatenate data sets" 741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": 14, 746 | "metadata": {}, 747 | "outputs": [], 748 | "source": [ 749 | "population <- rbind(\n", 750 | " bbbc022 %>% \n", 751 | " select(c(common_metadata, common_features)),\n", 752 | " bbbc036 %>% \n", 753 | " select(c(common_metadata, common_features))\n", 754 | " ) %>% \n", 755 | " mutate(Metadata_perturbation = 'chemical') %>% \n", 756 | " select(Metadata_perturbation, everything())" 757 | ] 758 | }, 759 | { 760 | "cell_type": "markdown", 761 | "metadata": {}, 762 | "source": [ 763 | "## 
Important: update column names! " 764 | ] 765 | }, 766 | { 767 | "cell_type": "code", 768 | "execution_count": 15, 769 | "metadata": {}, 770 | "outputs": [], 771 | "source": [ 772 | "colnames_combined <- colnames(population)\n", 773 | "\n", 774 | "common_metadata <- c(\n", 775 | " stringr::str_subset(colnames_combined, \"^Meta\")\n", 776 | ") \n", 777 | "\n", 778 | "common_features <- setdiff(colnames_combined, common_metadata)" 779 | ] 780 | }, 781 | { 782 | "cell_type": "markdown", 783 | "metadata": {}, 784 | "source": [ 785 | "Cytominer has problems handling column names '1', '2' so we rename them to 'Feature_1', ... " 786 | ] 787 | }, 788 | { 789 | "cell_type": "code", 790 | "execution_count": 16, 791 | "metadata": {}, 792 | "outputs": [], 793 | "source": [ 794 | "common_features <- paste0(\"Feature_\",common_features)\n", 795 | "colnames(population) <- c(common_metadata, common_features)" 796 | ] 797 | }, 798 | { 799 | "cell_type": "markdown", 800 | "metadata": {}, 801 | "source": [ 802 | "# Normalize data\n", 803 | "We use cytominer to normalize both datasets with respect to the controls, i.e. DMSO samples from the training partition" 804 | ] 805 | }, 806 | { 807 | "cell_type": "code", 808 | "execution_count": 17, 809 | "metadata": {}, 810 | "outputs": [], 811 | "source": [ 812 | "population_normalized <- cytominer::normalize(\n", 813 | " population, \n", 814 | " variables = common_features, \n", 815 | " strata = c(\"Metadata_perturbation\"), \n", 816 | " sample = population %>% \n", 817 | " filter(\n", 818 | " Metadata_broad_sample == \"DMSO\",\n", 819 | " Metadata_partition == \"Train\"\n", 820 | " ), \n", 821 | " operation = \"standardize\"\n", 822 | ")" 823 | ] 824 | }, 825 | { 826 | "cell_type": "markdown", 827 | "metadata": {}, 828 | "source": [ 829 | "# Aggregate data " 830 | ] 831 | }, 832 | { 833 | "cell_type": "code", 834 | "execution_count": 18, 835 | "metadata": {}, 836 | "outputs": [ 837 | { 838 | "data": { 839 | "text/html": [ 840 | "
    \n", 841 | "\t
  1. 3782
  2. \n", 842 | "\t
  3. 1786
  4. \n", 843 | "
\n" 844 | ], 845 | "text/latex": [ 846 | "\\begin{enumerate*}\n", 847 | "\\item 3782\n", 848 | "\\item 1786\n", 849 | "\\end{enumerate*}\n" 850 | ], 851 | "text/markdown": [ 852 | "1. 3782\n", 853 | "2. 1786\n", 854 | "\n", 855 | "\n" 856 | ], 857 | "text/plain": [ 858 | "[1] 3782 1786" 859 | ] 860 | }, 861 | "metadata": {}, 862 | "output_type": "display_data" 863 | } 864 | ], 865 | "source": [ 866 | "population_aggregated <- cytominer::aggregate(\n", 867 | " population = population_normalized, \n", 868 | " variables = common_features, \n", 869 | " strata = c(\"Metadata_pert_id\",\"Metadata_dataset\",\"Metadata_partition\"), \n", 870 | " operation = \"mean\"\n", 871 | ") \n", 872 | "\n", 873 | "population_aggregated %>% dim()" 874 | ] 875 | }, 876 | { 877 | "cell_type": "code", 878 | "execution_count": 19, 879 | "metadata": {}, 880 | "outputs": [ 881 | { 882 | "data": { 883 | "text/html": [ 884 | "
    \n", 885 | "\t
  1. 1543
  2. \n", 886 | "\t
  3. 1786
  4. \n", 887 | "
\n" 888 | ], 889 | "text/latex": [ 890 | "\\begin{enumerate*}\n", 891 | "\\item 1543\n", 892 | "\\item 1786\n", 893 | "\\end{enumerate*}\n" 894 | ], 895 | "text/markdown": [ 896 | "1. 1543\n", 897 | "2. 1786\n", 898 | "\n", 899 | "\n" 900 | ], 901 | "text/plain": [ 902 | "[1] 1543 1786" 903 | ] 904 | }, 905 | "metadata": {}, 906 | "output_type": "display_data" 907 | }, 908 | { 909 | "data": { 910 | "text/html": [ 911 | "
    \n", 912 | "\t
  1. 2239
  2. \n", 913 | "\t
  3. 1786
  4. \n", 914 | "
\n" 915 | ], 916 | "text/latex": [ 917 | "\\begin{enumerate*}\n", 918 | "\\item 2239\n", 919 | "\\item 1786\n", 920 | "\\end{enumerate*}\n" 921 | ], 922 | "text/markdown": [ 923 | "1. 2239\n", 924 | "2. 1786\n", 925 | "\n", 926 | "\n" 927 | ], 928 | "text/plain": [ 929 | "[1] 2239 1786" 930 | ] 931 | }, 932 | "metadata": {}, 933 | "output_type": "display_data" 934 | } 935 | ], 936 | "source": [ 937 | "population_aggregated %>% filter(Metadata_dataset == \"BBBC022\") %>% dim()\n", 938 | "population_aggregated %>% filter(Metadata_dataset == \"BBBC036\") %>% dim()" 939 | ] 940 | }, 941 | { 942 | "cell_type": "markdown", 943 | "metadata": {}, 944 | "source": [ 945 | "# Correlation matrix " 946 | ] 947 | }, 948 | { 949 | "cell_type": "code", 950 | "execution_count": 20, 951 | "metadata": {}, 952 | "outputs": [ 953 | { 954 | "data": { 955 | "text/html": [ 956 | "
    \n", 957 | "\t
  1. 1543
  2. \n", 958 | "\t
  3. 2239
  4. \n", 959 | "
\n" 960 | ], 961 | "text/latex": [ 962 | "\\begin{enumerate*}\n", 963 | "\\item 1543\n", 964 | "\\item 2239\n", 965 | "\\end{enumerate*}\n" 966 | ], 967 | "text/markdown": [ 968 | "1. 1543\n", 969 | "2. 2239\n", 970 | "\n", 971 | "\n" 972 | ], 973 | "text/plain": [ 974 | "[1] 1543 2239" 975 | ] 976 | }, 977 | "metadata": {}, 978 | "output_type": "display_data" 979 | } 980 | ], 981 | "source": [ 982 | "cor_matrix <- cor(\n", 983 | " x = population_aggregated %>% \n", 984 | " filter(Metadata_dataset == 'BBBC022') %>% \n", 985 | " select(common_features) %>% \n", 986 | " as.matrix() %>% \n", 987 | " t, \n", 988 | " y = population_aggregated %>% \n", 989 | " filter(Metadata_dataset == 'BBBC036') %>% \n", 990 | " select(common_features) %>% \n", 991 | " as.matrix() %>% \n", 992 | " t,\n", 993 | " use = \"complete.obs\"\n", 994 | " ) \n", 995 | "\n", 996 | "cor_matrix %>% dim()" 997 | ] 998 | }, 999 | { 1000 | "cell_type": "markdown", 1001 | "metadata": {}, 1002 | "source": [ 1003 | "# Submission file " 1004 | ] 1005 | }, 1006 | { 1007 | "cell_type": "code", 1008 | "execution_count": 21, 1009 | "metadata": {}, 1010 | "outputs": [ 1011 | { 1012 | "data": { 1013 | "text/html": [ 1014 | "
    \n", 1015 | "\t
  1. 1543
  2. \n", 1016 | "\t
  3. 2239
  4. \n", 1017 | "
\n" 1018 | ], 1019 | "text/latex": [ 1020 | "\\begin{enumerate*}\n", 1021 | "\\item 1543\n", 1022 | "\\item 2239\n", 1023 | "\\end{enumerate*}\n" 1024 | ], 1025 | "text/markdown": [ 1026 | "1. 1543\n", 1027 | "2. 2239\n", 1028 | "\n", 1029 | "\n" 1030 | ], 1031 | "text/plain": [ 1032 | "[1] 1543 2239" 1033 | ] 1034 | }, 1035 | "metadata": {}, 1036 | "output_type": "display_data" 1037 | } 1038 | ], 1039 | "source": [ 1040 | "# set column names \n", 1041 | "colnames(cor_matrix) <- population_aggregated %>% \n", 1042 | " filter(Metadata_dataset == 'BBBC036') %>%\n", 1043 | " extract2(\"Metadata_pert_id\")\n", 1044 | "\n", 1045 | "# set row names \n", 1046 | "#rownames(cor_matrix) <- population_aggregated %>% \n", 1047 | "# filter(Metadata_dataset == 'BBBC036') %>%\n", 1048 | "# extract2(\"Metadata_broad_sample\")#\n", 1049 | "\n", 1050 | "cor_matrix %>% dim()" 1051 | ] 1052 | }, 1053 | { 1054 | "cell_type": "code", 1055 | "execution_count": 23, 1056 | "metadata": {}, 1057 | "outputs": [], 1058 | "source": [ 1059 | "df <- cor_matrix %>% as_data_frame() %>% \n", 1060 | " mutate(Metadata_pert_id = population_aggregated %>% \n", 1061 | " filter(Metadata_dataset == 'BBBC022') %>%\n", 1062 | " extract2(\"Metadata_pert_id\")) %>% \n", 1063 | " select(Metadata_pert_id, everything())\n", 1064 | "\n", 1065 | "# write submission file\n", 1066 | "write.csv(df,\"../cytodata-baseline_R_day_2_CP.csv\",row.names = FALSE)" 1067 | ] 1068 | }, 1069 | { 1070 | "cell_type": "code", 1071 | "execution_count": 25, 1072 | "metadata": {}, 1073 | "outputs": [ 1074 | { 1075 | "data": { 1076 | "text/html": [ 1077 | "
    \n", 1078 | "\t
  1. 1543
  2. \n", 1079 | "\t
  3. 2240
  4. \n", 1080 | "
\n" 1081 | ], 1082 | "text/latex": [ 1083 | "\\begin{enumerate*}\n", 1084 | "\\item 1543\n", 1085 | "\\item 2240\n", 1086 | "\\end{enumerate*}\n" 1087 | ], 1088 | "text/markdown": [ 1089 | "1. 1543\n", 1090 | "2. 2240\n", 1091 | "\n", 1092 | "\n" 1093 | ], 1094 | "text/plain": [ 1095 | "[1] 1543 2240" 1096 | ] 1097 | }, 1098 | "metadata": {}, 1099 | "output_type": "display_data" 1100 | } 1101 | ], 1102 | "source": [ 1103 | "dim(df)" 1104 | ] 1105 | } 1106 | ], 1107 | "metadata": { 1108 | "kernelspec": { 1109 | "display_name": "R", 1110 | "language": "R", 1111 | "name": "ir" 1112 | }, 1113 | "language_info": { 1114 | "codemirror_mode": "r", 1115 | "file_extension": ".r", 1116 | "mimetype": "text/x-r-source", 1117 | "name": "R", 1118 | "pygments_lexer": "r", 1119 | "version": "3.4.4" 1120 | } 1121 | }, 1122 | "nbformat": 4, 1123 | "nbformat_minor": 2 1124 | } 1125 | --------------------------------------------------------------------------------