├── .gitignore
├── slides.pdf
├── files
    ├── domains.png
    ├── profiling.png
    ├── application.png
    └── cell_painting.png
├── LICENSE
├── cytodata-toolkit
    ├── python
    │   └── cytodata.py
    ├── datasets.csv
    └── R
    │   ├── Create-Submission_R.Rmd
    │   ├── Create-Submission_R-day2.ipynb
    │   └── Create-Submission_R-day2-CP.ipynb
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | cytodata-toolkit/R/.ipynb_checkpoints/
2 | 


--------------------------------------------------------------------------------
/slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cytodata/cytodata-hackathon-2018/HEAD/slides.pdf


--------------------------------------------------------------------------------
/files/domains.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cytodata/cytodata-hackathon-2018/HEAD/files/domains.png


--------------------------------------------------------------------------------
/files/profiling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cytodata/cytodata-hackathon-2018/HEAD/files/profiling.png


--------------------------------------------------------------------------------
/files/application.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cytodata/cytodata-hackathon-2018/HEAD/files/application.png


--------------------------------------------------------------------------------
/files/cell_painting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cytodata/cytodata-hackathon-2018/HEAD/files/cell_painting.png


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 cytodata
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/cytodata-toolkit/python/cytodata.py:
--------------------------------------------------------------------------------
 1 | import urllib
 2 | import io
 3 | import pandas as pd
 4 | from tqdm import tqdm
 5 | 
 6 | 
## dataset_files: Pandas dataframe
## Index of the downloadable profile files, read from ../datasets.csv.
## load_dataset() filters it on the Dataset, Partition and Features columns
## and downloads the URLs found in the Link column.
## The path is relative, so it is resolved against the current working
## directory at the moment this module is imported.

dataset_files = pd.read_csv("../datasets.csv")
11 | 
12 | 
def load_dataset(dataset_id, partition, features):
    """Load the profiles of one dataset/partition/feature-type combination.

    The module-level ``dataset_files`` table is filtered on its ``Dataset``,
    ``Partition`` and ``Features`` columns. Every matching ``Link`` is
    downloaded, parsed as CSV, and the resulting frames are concatenated.

    Parameters
    ----------
    dataset_id: string
        Dataset ID
    partition: string
        Partition can be "Train" or "Test"
    features: string
        Feature type can be "CellProfiler" or "DeepLearning"

    Returns
    -------
    dataframe or None
        All rows read from the matching links, concatenated with a fresh
        integer index. ``None`` (after printing a message) when no row of
        ``dataset_files`` matches the requested combination.

    """
    # ``import urllib`` on its own does not load the ``request`` submodule;
    # import it explicitly so the call below does not depend on some other
    # package having imported it first.
    import urllib.request

    selected = dataset_files[
        (dataset_files["Dataset"] == dataset_id)
        & (dataset_files["Partition"] == partition)
        & (dataset_files["Features"] == features)
    ]

    if selected.empty:
        print("No such partition {} for dataset {} with features {}".format(partition, dataset_id, features))
        return None

    dataframes = []

    # Iterating the Link column (which has a length) lets tqdm show a total.
    for link in tqdm(selected["Link"]):
        # strip(): the CSV index may carry stray whitespace around a URL.
        # The context manager closes the HTTP response once it has been read.
        with urllib.request.urlopen(link.strip()) as response:
            data = response.read()
        dataframes.append(pd.read_csv(io.StringIO(data.decode("utf-8"))))

    return pd.concat(dataframes, ignore_index=True)
50 | 
51 | 
52 | 


--------------------------------------------------------------------------------
/cytodata-toolkit/datasets.csv:
--------------------------------------------------------------------------------
 1 | Dataset,Partition,Features,Link
 2 | BBBC037,Test,CellProfiler,https://s3.amazonaws.com/cytodata/evaluation/TA-ORF-BBBC037-Rohban/profiles_cp/bbbc037_test.csv
 3 | BBBC037,Train,CellProfiler,https://s3.amazonaws.com/cytodata/evaluation/TA-ORF-BBBC037-Rohban/profiles_cp/bbbc037_train.csv
 4 | BBBC043,Test,CellProfiler,https://s3.amazonaws.com/cytodata/evaluation/LUAD-BBBC043-Caicedo/profiles_cp/bbbc043_test.csv
 5 | BBBC043,Train,CellProfiler,https://s3.amazonaws.com/cytodata/evaluation/LUAD-BBBC043-Caicedo/profiles_cp/bbbc043_train.csv
 6 | BBBC037,Test,DeepLearning,https://s3.amazonaws.com/cytodata/evaluation/TA-ORF-BBBC037-Rohban/profiles_dp/bbbc037_test.csv
 7 | BBBC037,Train,DeepLearning,https://s3.amazonaws.com/cytodata/evaluation/TA-ORF-BBBC037-Rohban/profiles_dp/bbbc037_train.csv
 8 | BBBC043,Test,DeepLearning,https://s3.amazonaws.com/cytodata/evaluation/LUAD-BBBC043-Caicedo/profiles_dp/bbbc043_test.csv
 9 | BBBC043,Train,DeepLearning,https://s3.amazonaws.com/cytodata/evaluation/LUAD-BBBC043-Caicedo/profiles_dp/bbbc043_train.csv
10 | BBBC036,Test,DeepLearning,https://s3.amazonaws.com/cytodata/evaluation/CDRPBIO-BBBC036-Bray/profiles_dp/bbbc036_test.csv
11 | BBBC036,Train,DeepLearning,https://s3.amazonaws.com/cytodata/evaluation/CDRPBIO-BBBC036-Bray/profiles_dp/bbbc036_train.csv
12 | BBBC036,Test,CellProfiler,https://s3.amazonaws.com/cytodata/evaluation/CDRPBIO-BBBC036-Bray/profiles_cp/bbbc036_test.csv
13 | BBBC036,Train,CellProfiler,https://s3.amazonaws.com/cytodata/evaluation/CDRPBIO-BBBC036-Bray/profiles_cp/bbbc036_train.csv
14 | BBBC022,Test,CellProfiler,https://s3.amazonaws.com/cytodata/evaluation/Bioactives-BBBC022-Gustafsdottir/profiles_cp/bbbc022_test.csv
15 | BBBC022,Train,CellProfiler,https://s3.amazonaws.com/cytodata/evaluation/Bioactives-BBBC022-Gustafsdottir/profiles_cp/bbbc022_train.csv
16 | BBBC022,Test,DeepLearning,https://s3.amazonaws.com/cytodata/evaluation/Bioactives-BBBC022-Gustafsdottir/profiles_dp/bbbc022_test.csv
17 | BBBC022,Train,DeepLearning,https://s3.amazonaws.com/cytodata/evaluation/Bioactives-BBBC022-Gustafsdottir/profiles_dp/bbbc022_train.csv


--------------------------------------------------------------------------------
/cytodata-toolkit/R/Create-Submission_R.Rmd:
--------------------------------------------------------------------------------
  1 | ```{r, message=FALSE}
  2 | library(tidyverse)
  3 | library(cytominer)
  4 | library(magrittr)
  5 | library(RCurl)
  6 | ```
  7 | 
  8 | ```{r, message=FALSE}
  9 | load_dataset  <- function(partition, dataset,feature){
 10 |     file_name  <- read_csv("../datasets.csv") 
 11 |     x  <-  file_name %>% filter(
 12 |          Partition == partition,
 13 |          Dataset == dataset,
 14 |          Features == feature) %>% 
 15 |          extract2("Link")
 16 | 
 17 |     return(read_csv(x) %>% 
 18 |           mutate(Metadata_dataset = dataset) %>%
 19 |           mutate(Metadata_partition = partition) %>% 
 20 |           mutate(Metadata_features = feature) 
 21 |           )
 22 |     }
 23 | ```
 24 | 
 25 | # Load data 
 26 | We load training and test datasets for both genetic perturbation experiments 
 27 | 
 28 | ```{r, message=FALSE}
# BBBC037 data: CellProfiler profiles, train and test partitions.
# Each partition gets a constant Metadata_x_mutation_status = "none" column
# and is then reduced to the rows whose Metadata_pert_name contains "WT" or
# whose Metadata_ASSAY_WELL_ROLE is "Untreated" or "CTRL".
bbbc037_train  <- load_dataset("Train","BBBC037","CellProfiler")  %>% 
    mutate(Metadata_x_mutation_status = "none")  %>%
  filter(str_detect(Metadata_pert_name, "WT") | Metadata_ASSAY_WELL_ROLE %in% c("Untreated", "CTRL")) 

bbbc037_test <- load_dataset("Test","BBBC037","CellProfiler")  %>% 
    mutate(Metadata_x_mutation_status = "none")  %>%
  filter(str_detect(Metadata_pert_name, "WT") | Metadata_ASSAY_WELL_ROLE %in% c("Untreated", "CTRL")) 

# Stack the train and test rows into a single table.
bbbc037 <- 
  bind_rows(bbbc037_train, bbbc037_test)
 40 | ```
 41 | 
 42 | ```{r, message=FALSE}
# BBBC043 data: CellProfiler profiles, train and test partitions,
# loaded without any additional filtering.
bbbc043_train  <- load_dataset("Train","BBBC043","CellProfiler")
 
bbbc043_test <- load_dataset("Test","BBBC043","CellProfiler")

# Stack the train and test rows into a single table.
bbbc043  <- bind_rows(bbbc043_train, bbbc043_test)
 49 | ```
 50 | 
 51 | ## Check dimensionality
 52 | 
 53 | ```{r}
# Show the number of rows and columns of each combined table.
dim(bbbc043)
dim(bbbc037)
 56 | ```
 57 | 
 58 | ## Extract common features 
 59 | 
 60 | ```{r}
# Column names of the two combined tables.
colnames_bbbc037 <- colnames(bbbc037)
colnames_bbbc043 <- colnames(bbbc043)


# Metadata columns are recognised by the name prefix "Meta".
Metadata_names_bbbc037 <- c(
   stringr::str_subset(colnames_bbbc037, "^Meta")
) 

Metadata_names_bbbc043 <- c(
   stringr::str_subset(colnames_bbbc043, "^Meta")
) 

# Metadata columns present in both tables, and the remaining (non-metadata)
# columns present in both tables.
common_metadata  <- intersect(Metadata_names_bbbc037, Metadata_names_bbbc043)  
common_features  <- setdiff(intersect(colnames_bbbc037, colnames_bbbc043),common_metadata)

 75 | 
 76 | ```
 77 | 
 78 | # Concatenate data sets
 79 | 
 80 | ```{r}
# Restrict both tables to the shared columns, stack them, tag every row as a
# genetic perturbation and move all "Meta"-prefixed columns to the front.
# The metadata-first column order is relied upon when the columns are renamed
# further below.
population  <- bind_rows(
    bbbc037 %>% 
        select(c(common_metadata, common_features)),
    bbbc043 %>% 
        select(c(common_metadata, common_features))
    ) %>% 
    mutate(Metadata_perturbation = "genetic") %>% 
    select(matches("^Meta"), everything())
 89 | ```
 90 | 
 91 | ## Important: update column names! 
 92 | 
 93 | ```{r}
# Recompute the metadata / feature name vectors from the combined table,
# because Metadata_perturbation was added after the first computation.
colnames_combined  <- colnames(population)

common_metadata  <- c(
   stringr::str_subset(colnames_combined, "^Meta")
) 

common_features  <- setdiff(colnames_combined, common_metadata)
101 | ```
102 | 
103 | Cytominer has problems handling column names '1', '2' so we rename them to 'Feature_1', ... 
104 | 
105 | ```{r}
# Prefix every feature name with "Feature_" and assign the new names.
# The positional assignment assumes the metadata columns come first in
# `population`, which the select(matches("^Meta"), everything()) step arranged.
common_features  <- paste0("Feature_",common_features)
colnames(population)  <- c(common_metadata, common_features)
108 | ```
109 | 
110 | # Normalize data
111 | We use cytominer to normalize both datasets with respect to the controls, i.e. EMPTY genes
112 | 
113 | ```{r}
# Normalize the feature columns with cytominer's "standardize" operation.
# The reference sample is the set of training-partition rows whose
# Metadata_gene_name is 'EMPTY'; the operation is stratified by
# Metadata_perturbation.
population_normalized  <- cytominer::normalize(
    population, 
    variables = common_features, 
    strata = c("Metadata_perturbation"), 
    sample = population %>% 
                filter(
                    Metadata_gene_name == 'EMPTY',
                    Metadata_partition == "Train"
                ), 
    operation = "standardize"
)
125 | ```
126 | 
127 | ```{r}
# Print the dimensions of the normalized table.
population_normalized %>% dim() %>% print
129 | ```
130 | 
131 | # Aggregate data 
132 | 
133 | ```{r}
# Aggregate the normalized features with the "mean" operation, one profile
# per combination of gene name, dataset and mutation status.
population_aggregated  <- cytominer::aggregate(
    population = population_normalized, 
    variables = common_features, 
    strata = c("Metadata_gene_name","Metadata_dataset","Metadata_x_mutation_status"), 
    operation = "mean"
) 
140 | ```
141 | 
142 | ```{r}
# Print the Metadata_gene_name column of the normalized (not aggregated) table.
population_normalized %>% extract2("Metadata_gene_name") %>% print
144 | ```
145 | 
146 | ```{r}
# Print the first two aggregated profiles.
population_aggregated %>% slice(1:2) %>% print
148 | ```
149 | 
150 | # Correlation matrix 
151 | 
152 | ```{r}
# Correlate every aggregated BBBC037 profile with every aggregated BBBC043
# profile. Both feature matrices are transposed so that each profile becomes
# a column; cor() then returns one row per BBBC037 profile and one column per
# BBBC043 profile.
cor_matrix  <- cor(
    x = population_aggregated %>% 
        filter(Metadata_dataset == 'BBBC037') %>% 
        select(common_features) %>% 
        as.matrix() %>% 
        t, 
    y = population_aggregated %>% 
        filter(Metadata_dataset == 'BBBC043') %>% 
        select(common_features) %>% 
        as.matrix() %>% 
        t,
    use  = "complete.obs"
    ) 
166 | ```
167 | 
168 | # Submission file 
169 | 
170 | ```{r}
# set column names: one column per BBBC043 profile, labelled with its
# Metadata_x_mutation_status value
colnames(cor_matrix)  <- population_aggregated %>% 
                            filter(Metadata_dataset == 'BBBC043') %>%
                            extract2("Metadata_x_mutation_status")

# set row names: one row per BBBC037 profile, labelled with its gene name
rownames(cor_matrix)  <- population_aggregated %>% 
                            filter(Metadata_dataset == 'BBBC037') %>%
                            extract2("Metadata_gene_name")


# Convert the matrix to a data frame and add the BBBC037 gene names as an
# explicit first column, Metadata_gene_name.
# NOTE(review): as_data_frame() is believed to be deprecated in newer tibble
# releases in favour of as_tibble() - confirm before upgrading packages.
df  <- cor_matrix %>% as_data_frame() %>% 
            mutate(Metadata_gene_name = population_aggregated %>% 
                            filter(Metadata_dataset == 'BBBC037') %>%
                            extract2("Metadata_gene_name")) %>% 
            select(Metadata_gene_name, everything())

# write submission file (path is relative to the working directory)
write.csv(df,"../cytodata-baseline_R.csv",row.names = FALSE)
190 | ```
191 | 
192 | ```{r}
193 | ```


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # :microscope: CytoData - 2018 Challenge 
  2 | 
  3 | If we want to retrieve "matching" profiles from a large collection of image-based profiling experiments (for example to find similar drugs, similar genes, or drug-gene or drug-disease combinations), how do we ensure that the profiles are aligned well enough? 
  4 | The CytoData 2018 Challenge addresses this, featuring batch effect correction and cross dataset profile matching :cd: :twisted_rightwards_arrows: :dvd:. 
  5 | The challenge involves the transformation of signatures using machine learning :space_invader: or statistical methods :bar_chart:. 
  6 | You will be given two datasets of image-based signatures :cd: :heavy_plus_sign: :dvd: acquired at different times :date: :clock130: and with different experimental conditions :pill: :syringe: with the goal of retrieving correct matches accurately :dart:. 
  7 | See http://cytodata.org/ for details of the event.
  8 | 
  9 | ## Table of Contents
 10 | 
 11 | [Background](#tv-background)
 12 | 
 13 | [Challenge](#checkered_flag-challenge)
 14 | 
 15 | [Data](#dvd-data)
 16 | 
 17 | [Format](#performing_arts-format)
 18 | 
 19 | [Resources](#wrench-resources)
 20 | 
 21 | 
 22 | # :tv: Background
 23 | 
 24 | :alien: **: What is image-based profiling?**
 25 | 
 26 | :sunglasses: : In the study of biological systems, microscopy images are used to measure the response of cells to treatments or perturbations. 
 27 | Cell state can be observed and quantitatively measured using images by following a computational workflow known as profiling. 
 28 | Single cells are first identified in all images, and then their main characteristics are represented in feature vectors.
 29 | The information of a population of cells is aggregated into a single vector, also called profile, containing summary statistics of the features of all cells.
 30 | These profiles encode the morphological changes of cell populations exposed to treatments.
 31 | Image-based profiles can be used to compare the response of cells to different treatments, and to map their similarities.
 32 | 
 33 | <p align="center">
 34 |   <img src="files/profiling.png?raw=True" alt="Profiling"/>
 35 | </p>
 36 | 
 37 | 
 38 | :alien: **: Is image-based profiling the same as image-based screening?**
 39 | 
 40 | :sunglasses: : Screening and profiling are different. 
 41 | Screening uses images to identify phenotype(s) of interest known beforehand.
 42 | Profiling measures as many cell properties as possible, using all the phenotypes to identify relationships among multiple different samples.
 43 | 
 44 | 
 45 | :alien: **: What are the applications of image-based profiling?**
 46 | 
 47 | :sunglasses: : Image-based profiles can be used for drug discovery and functional genomics applications. 
 48 | There are many types of biological studies that can be conducted using image-based profiling.
 49 | In the CytoData challenge, we use data from chemical and genetic perturbation experiments (see below).
 50 | 
 51 | <p align="center">
 52 |   <img src="files/application.png?raw=True" alt="Applications"/>
 53 | </p>
 54 | 
 55 | 
 56 | :alien: **: What imaging assays can be used for profiling?**
 57 | 
 58 | :sunglasses: : Virtually any imaging assay can be used for profiling, especially high-content assays.
 59 | In the 2018 CytoData challenge, we use an imaging assay called Cell Painting, that paints the cells with 6 stains, imaged in 5 channels, highlighting 8 cellular compartments. 
 60 | This is an unbiased, general purpose assay that maximizes information content for profiling, but the assay can be adapted to meet the needs of a research project.
 61 | 
 62 | <p align="center">
 63 |   <img src="files/cell_painting.png?raw=True" alt="Cell Painting"/>
 64 | </p>
 65 | 
 66 | 
 67 | # :checkered_flag: Challenge 
 68 | 
 69 | As in many biological experiments, imaging data may be subject to batch effects and undesired artifacts :scream:.
 70 | More specifically, given two batches of microscopy images with the same treatments :pill:, but acquired under different technical conditions :a::vs::b:, a difference in the quantitative measures is likely to be observed :x:.
 71 | These differences are not due to meaningful biological variations and can be removed using computational methods :computer:.
 72 | 
 73 | The goal of the challenge :checkered_flag: is to analyze the profiles of two different batches of data :a::b: and design computational methods to correct batch effects :white_check_mark:.
 74 | A successful method :trophy: will be able to align the information content of both batches :ab:, 
 75 | making profiles of the same treatment have similar measurements without distorting the relationships among other treatments :smiley:.
 76 | The following metrics will be used to assess the quality of entries :triangular_ruler::
 77 | 
 78 | 1. :arrow_upper_right::arrow_upper_right: Replicate correlation
 79 | 2. :top::arrows_counterclockwise: Enrichment of biologically relevant matches in the top connections
 80 | 3. :id::white_check_mark: Correct association of treatment type
 81 | 
 82 | ## :bulb: Tip
 83 | 
 84 | From the data analysis perspective, the problem can be formulated in various ways, including 
 85 | manifold learning, domain adaptation, subspace alignment, and transfer learning.
 86 | 
 87 | <p align="center">
 88 |   <img src="files/domains.png?raw=True" alt="Domains"/>
 89 | </p>
 90 | 
 91 | 
 92 | # :dvd: Data
 93 | 
 94 | We are glad to announce that four datasets will be provided during the CytoData 2018 Challenge :tada::tada::tada::tada:. 
 95 | All of them were acquired using the Cell Painting assay, at high-throughput, in 384 well plates :microscope:, as part of the research 
 96 | conducted in the Broad Institute of MIT and Harvard. 
 97 | The following table describes the experimental details of each dataset.
 98 | 
 99 | | Dataset :dvd: | Type :syringe: :pill: | Number of treatments :hash: | Cell line :cancer: |
100 | |---|---|---|---|
101 | | BBBC037 | Genetic perturbations. ORF over-expression | 200 wild type genes | U2OS |
102 | | BBBC043 | Genetic perturbations. ORF over-expression | 596 alleles of 53 genes | A549 |
103 | | BBBC022 | Chemical perturbations. Bioactive compounds | 1,600 compounds | U2OS |
104 | | BBBC036 | Chemical perturbations. Bioactive compounds | 5,000 compounds | U2OS |
105 | 
106 | Notice that two datasets represent genetic perturbations and the other two represent chemical perturbations.
107 | The challenge will consider the cross-dataset matching problem across each of the two pairs :cd::twisted_rightwards_arrows::dvd:, 
108 | i.e., profiles in BBBC037 have to be matched with profiles in BBBC043 because both contain genetic perturbations.
109 | Similarly, profiles in BBBC022 have to be matched with profiles in BBBC036 because both contain chemical perturbations.
110 | 
111 | The imaging data for all four datasets is more than 3TB of data :boom:, which will be available to everyone during and after the challenge.
112 | However, to facilitate the analysis of treatment profiles and to focus on the cross-dataset matching problem, all the datasets have been processed
113 | before-hand using the profiling workflow described above :sunglasses:.
114 | In particular, two versions of well-level population profiles will be available during the challenge: 
115 | 1. Classical features computed with the CellProfiler software using pipelines optimized for Cell Painting images.
116 | 2. Deep learning features computed with a convolutional neural network pretrained on ImageNet.
117 | 
118 | ## Data available on AWS 
119 | 
120 | As of 2024, all data has moved to the [Cell Painting Gallery](https://github.com/broadinstitute/cellpainting-gallery) at `s3://cellpainting-gallery`.
121 | The folder structure has changed slightly from the original structure to comply with [Cell Painting Gallery formatting](https://github.com/broadinstitute/cellpainting-gallery/blob/main/folder_structure.md).
122 | 
123 | The datasets have undergone the following renaming in the Cell Painting Gallery:  
124 | Bioactives-BBBC022-Gustafsdottir => `cpg0030-gustafsdottir-cellpainting`  
125 | CDRPBIO-BBBC036-Bray => `cpg0012-wawer-bioactivecompoundprofiling`  
126 | LUAD-BBBC041-Caicedo => `cpg0031-caicedo-cmvip`  
127 | 
128 | 
129 | During the Cytodata hackathon the data was available as Amazon Public Data Set on https://registry.opendata.aws/cell-painting-image-collection/ at `s3://cytodata`.
130 | All image data and extracted single cell features and aggregated profiles were found in `s3://cytodata/datasets/` with the following structure:
131 | ```
132 | .
133 | ├── Bioactives-BBBC022-Gustafsdottir
134 | │   ├── profiles
135 | │   │   └── Bioactives-BBBC022-Gustafsdottir
136 | │   ├── images
137 | │   │   └── Bioactives-BBBC022-Gustafsdottir
138 | │   └── metadata
139 | │       └── Bioactives-BBBC022-Gustafsdottir
140 | ├── CDRPBIO-BBBC036-Bray
141 | │   ├── profiles
142 | │   │   └── CDRPBIO-BBBC036-Bray
143 | │   ├── images
144 | │   │   └── CDRPBIO-BBBC036-Bray
145 | │   └── metadata
146 | │       └── CDRPBIO-BBBC036-Bray
147 | ├── LUAD-BBBC041-Caicedo
148 | │   ├── profiles
149 | │   │   └── LUAD-BBBC041-Caicedo
150 | │   ├── images
151 | │   │   └── LUAD-BBBC041-Caicedo
152 | │   └── metadata
153 | │       └── LUAD-BBBC041-Caicedo
154 | └── TA-ORF-BBBC037-Rohban
155 |     ├── profiles
156 |     │   └── TA-ORF-BBBC037-Rohban
157 |     ├── images
158 |     │   └── TA-ORF-BBBC037-Rohban
159 |     └── metadata
160 |         └── TA-ORF-BBBC037-Rohban
161 | ```
162 | 
163 | The subfolders contain the following information: 
164 | * the directory `images` contains Cell Painting images as tiff files 
165 | * the directory `profiles` contains single cell data in sqlite format and profiles aggregated to replicate level as csv files (aggregated as mean profiles per well) 
166 | * the metadata directory contains information about the platemaps and the used perturbations. 
167 | 
168 | 
169 | # :performing_arts: Format
170 | 
171 | The CytoData 2018 challenge will be a collaborative hackathon :sparkles::computer:, with participants forming teams to discuss and implement solutions to the problem.
172 | The challenge will run for two days only, so participants are encouraged to investigate and plan some solutions before the event starts :pencil:.
173 | In order to meet other participants, we will provide a slack channel to make general announcements and allow participants to organize teams and exchange ideas :bulb:.
174 | It's also a great idea to start discussing methods here in this GitHub repository :octocat:: 
175 | 
176 | ```add issues with relevant links if you want to suggest a methodology and discuss it with other participants!```
177 | 
178 | 
179 | Teams will have no fewer than three :three: and no more than five :five: participants, ideally from different institutions. 
180 | Teams will compete with each other :rage1: to improve the three performance metrics mentioned above :bowling:.
181 | Participants of the team will be able to upload solutions to a scoreboard to check that everything is running properly and to get feedback on performance :ok_hand:.
182 | The best performing solutions will win prizes provided by our sponsors! :trophy::clap:
183 | 
184 | 
185 | # :wrench: Resources
186 | 
187 | The following resources will be provided during the challenge:
188 | 
189 | 1. :satellite: Internet connection.
190 | 2. :dvd: Access to all files of the four datasets, including pre-computed profiles.
191 | 3. :octocat: A toolkit, written in R and Python, to load the pre-computed profiles, run a baseline model and create a submission.
192 | 4. :chart_with_upwards_trend: An account in the scoreboard to evaluate the generated submissions.
193 | 5. :computer: Teams will be given access to pre-configured virtual machines in the Amazon Cloud to run experiments.
194 | 
195 | Participants of the challenge can make use of their own computational resources (laptops, servers, etc) to run experiments during the challenge.
196 | 


--------------------------------------------------------------------------------
/cytodata-toolkit/R/Create-Submission_R-day2.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 4,
  6 |    "metadata": {
  7 |     "scrolled": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "library(tidyverse)\n",
 12 |     "library(cytominer)\n",
 13 |     "library(magrittr)"
 14 |    ]
 15 |   },
 16 |   {
 17 |    "cell_type": "code",
 18 |    "execution_count": 5,
 19 |    "metadata": {},
 20 |    "outputs": [],
 21 |    "source": [
 22 |     "load_dataset  <- function(partition, dataset,feature){\n",
 23 |     "    file_name  <- read_csv(\"../datasets.csv\") \n",
 24 |     "    x  <-  file_name %>% filter(\n",
 25 |     "         Partition == partition,\n",
 26 |     "         Dataset == dataset,\n",
 27 |     "         Features == feature) %>% \n",
 28 |     "         extract2(\"Link\")\n",
 29 |     "\n",
 30 |     "    return(read_csv(x) %>% \n",
 31 |     "          mutate(Metadata_dataset = dataset) %>%\n",
 32 |     "          mutate(Metadata_partition = partition) %>% \n",
 33 |     "          mutate(Metadata_features = feature) \n",
 34 |     "          )\n",
 35 |     "    }"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "markdown",
 40 |    "metadata": {},
 41 |    "source": [
 42 |     "# Load data \n",
 43 |     "We load training and test datasets for both genetic perturbation experiments "
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": null,
 49 |    "metadata": {},
 50 |    "outputs": [
 51 |     {
 52 |      "name": "stderr",
 53 |      "output_type": "stream",
 54 |      "text": [
 55 |       "Parsed with column specification:\n",
 56 |       "cols(\n",
 57 |       "  Dataset = col_character(),\n",
 58 |       "  Partition = col_character(),\n",
 59 |       "  Features = col_character(),\n",
 60 |       "  Link = col_character()\n",
 61 |       ")\n"
 62 |      ]
 63 |     }
 64 |    ],
 65 |    "source": [
 66 |     "# bbbc036 data \n",
 67 |     "bbbc036_train  <- load_dataset(\"Train\",\"BBBC036\",\"DeepLearning\")  %>% \n",
 68 |     "    mutate(Metadata_x_mutation_status = \"none\")\n",
 69 |     "\n",
 70 |     "bbbc036_test <- load_dataset(\"Test\",\"BBBC036\",\"DeepLearning\")  %>% \n",
 71 |     "    mutate(Metadata_x_mutation_status = \"none\")\n",
 72 |     "\n",
 73 |     "bbbc036  <- rbind(bbbc036_train, bbbc036_test)"
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "code",
 78 |    "execution_count": null,
 79 |    "metadata": {},
 80 |    "outputs": [],
 81 |    "source": [
 82 |     "bbbc036_train %>% dim()\n",
 83 |     "bbbc036_test %>% dim()"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": null,
 89 |    "metadata": {},
 90 |    "outputs": [],
 91 |    "source": [
 92 |     "# bbbc022 data \n",
 93 |     "bbbc022_train  <- load_dataset(\"Train\",\"BBBC022\",\"DeepLearning\")\n",
 94 |     " \n",
 95 |     "bbbc022_test <- load_dataset(\"Test\",\"BBBC022\",\"DeepLearning\")\n",
 96 |     "\n",
 97 |     "bbbc022  <- rbind(bbbc022_train, bbbc022_test)"
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "markdown",
102 |    "metadata": {},
103 |    "source": [
104 |     "## Check dimensionality"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "code",
109 |    "execution_count": null,
110 |    "metadata": {},
111 |    "outputs": [],
112 |    "source": [
113 |     "dim(bbbc022)\n",
114 |     "dim(bbbc036)"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "markdown",
119 |    "metadata": {},
120 |    "source": [
121 |     "## Extract common features "
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": null,
127 |    "metadata": {},
128 |    "outputs": [],
129 |    "source": [
130 |     "colnames_bbbc022 <- colnames(bbbc022)\n",
131 |     "colnames_bbbc036 <- colnames(bbbc036)\n",
132 |     "\n",
133 |     "\n",
134 |     "Metadata_names_bbbc022 <- c(\n",
135 |     "   stringr::str_subset(colnames_bbbc022, \"^Meta\")\n",
136 |     ") \n",
137 |     "\n",
138 |     "Metadata_names_bbbc036 <- c(\n",
139 |     "   stringr::str_subset(colnames_bbbc036, \"^Meta\")\n",
140 |     ") \n",
141 |     "\n",
142 |     "common_metadata  <- intersect(Metadata_names_bbbc022, Metadata_names_bbbc036)  \n",
143 |     "common_features  <- setdiff(intersect(colnames_bbbc022, colnames_bbbc036),common_metadata)\n",
144 |     "\n",
145 |     "colnames_bbbc036 %>% length()"
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "code",
150 |    "execution_count": null,
151 |    "metadata": {},
152 |    "outputs": [],
153 |    "source": [
154 |     "bbbc022_na_feature  <- cytominer::drop_na_columns(\n",
155 |     "    population = bbbc022  %>% \n",
156 |     "                filter(\n",
157 |     "                    Metadata_broad_sample == \"DMSO\"\n",
158 |     "                ) %>% \n",
159 |     "                slice(1:100),\n",
160 |     "    variables = common_features,\n",
161 |     "    cutoff = 0\n",
162 |     "    )\n",
163 |     "\n",
164 |     "#bbbc036_na_feature  <- cytominer::drop_na_columns(\n",
165 |     "#    population = bbbc036,\n",
166 |     "#    variables = common_features,\n",
167 |     "#    cutoff = 0\n",
168 |     "#    )"
169 |    ]
170 |   },
171 |   {
172 |    "cell_type": "code",
173 |    "execution_count": null,
174 |    "metadata": {},
175 |    "outputs": [],
176 |    "source": [
177 |     "bbbc022_na_feature %>% print"
178 |    ]
179 |   },
180 |   {
181 |    "cell_type": "code",
182 |    "execution_count": null,
183 |    "metadata": {},
184 |    "outputs": [],
185 |    "source": [
186 |     "features_to_remove  <- cytominer::variance_threshold(\n",
187 |     "    variables = common_features,\n",
188 |     "    sample = bbbc022  %>% \n",
189 |     "                filter(\n",
190 |     "                    Metadata_broad_sample == \"DMSO\"\n",
191 |     "                ) %>% \n",
192 |     "                slice(1:100)\n",
193 |     ")"
194 |    ]
195 |   },
196 |   {
197 |    "cell_type": "code",
198 |    "execution_count": null,
199 |    "metadata": {},
200 |    "outputs": [],
201 |    "source": []
202 |   },
203 |   {
204 |    "cell_type": "markdown",
205 |    "metadata": {},
206 |    "source": [
207 |     "# Concatenate data sets"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "code",
212 |    "execution_count": null,
213 |    "metadata": {},
214 |    "outputs": [],
215 |    "source": [
216 |     "population  <- rbind(\n",
217 |     "    bbbc022 %>% \n",
218 |     "        select(c(common_metadata, common_features)),\n",
219 |     "    bbbc036 %>% \n",
220 |     "        select(c(common_metadata, common_features))\n",
221 |     "    ) %>% \n",
222 |     "    mutate(Metadata_perturbation = 'chemical') %>% \n",
223 |     "    select(Metadata_perturbation, everything())"
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "markdown",
228 |    "metadata": {},
229 |    "source": [
230 |     "## Important: update column names! "
231 |    ]
232 |   },
233 |   {
234 |    "cell_type": "code",
235 |    "execution_count": null,
236 |    "metadata": {},
237 |    "outputs": [],
238 |    "source": [
239 |     "colnames_combined  <- colnames(population)\n",
240 |     "\n",
241 |     "common_metadata  <- c(\n",
242 |     "   stringr::str_subset(colnames_combined, \"^Meta\")\n",
243 |     ") \n",
244 |     "\n",
245 |     "common_features  <- setdiff(colnames_combined, common_metadata)\n"
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "markdown",
250 |    "metadata": {},
251 |    "source": [
252 |     "Cytominer has problems handling column names '1', '2' so we rename them to 'Feature_1', ... "
253 |    ]
254 |   },
255 |   {
256 |    "cell_type": "code",
257 |    "execution_count": null,
258 |    "metadata": {},
259 |    "outputs": [],
260 |    "source": [
261 |     "common_features  <- paste0(\"Feature_\",common_features)\n",
262 |     "colnames(population)  <- c(common_metadata, common_features)"
263 |    ]
264 |   },
265 |   {
266 |    "cell_type": "markdown",
267 |    "metadata": {},
268 |    "source": [
269 |     "# Normalize data\n",
270 |     "We use cytominer to normalize both datasets with respect to the controls, i.e. the DMSO-treated samples"
271 |    ]
272 |   },
273 |   {
274 |    "cell_type": "code",
275 |    "execution_count": null,
276 |    "metadata": {},
277 |    "outputs": [],
278 |    "source": [
279 |     "population_normalized  <- cytominer::normalize(\n",
280 |     "    population, \n",
281 |     "    variables = common_features, \n",
282 |     "    strata = c(\"Metadata_perturbation\"), \n",
283 |     "    sample = population %>% \n",
284 |     "                filter(\n",
285 |     "                    Metadata_broad_sample == \"DMSO\"\n",
286 |     "                ) %>% \n",
287 |     "                slice(1:100), \n",
288 |     "    operation = \"standardize\"\n",
289 |     ")"
290 |    ]
291 |   },
292 |   {
293 |    "cell_type": "code",
294 |    "execution_count": null,
295 |    "metadata": {},
296 |    "outputs": [],
297 |    "source": [
298 |     "population_normalized %>% dim() %>% print"
299 |    ]
300 |   },
301 |   {
302 |    "cell_type": "markdown",
303 |    "metadata": {},
304 |    "source": [
305 |     "# Aggregate data "
306 |    ]
307 |   },
308 |   {
309 |    "cell_type": "code",
310 |    "execution_count": null,
311 |    "metadata": {},
312 |    "outputs": [],
313 |    "source": [
314 |     "population_aggregated  <- cytominer::aggregate(\n",
315 |     "    population = population_normalized, \n",
316 |     "    variables = common_features, \n",
317 |     "    strata = c(\"Metadata_broad_sample\",\"Metadata_dataset\"), \n",
318 |     "    operation = \"mean\"\n",
319 |     ") "
320 |    ]
321 |   },
322 |   {
323 |    "cell_type": "code",
324 |    "execution_count": null,
325 |    "metadata": {},
326 |    "outputs": [],
327 |    "source": [
328 |     "population_normalized %>% extract2(\"Metadata_broad_sample\") %>% print"
329 |    ]
330 |   },
331 |   {
332 |    "cell_type": "code",
333 |    "execution_count": null,
334 |    "metadata": {},
335 |    "outputs": [],
336 |    "source": [
337 |     "population_aggregated %>% slice(1:2) %>% print"
338 |    ]
339 |   },
340 |   {
341 |    "cell_type": "markdown",
342 |    "metadata": {},
343 |    "source": [
344 |     "# Correlation matrix "
345 |    ]
346 |   },
347 |   {
348 |    "cell_type": "code",
349 |    "execution_count": null,
350 |    "metadata": {},
351 |    "outputs": [],
352 |    "source": [
353 |     "cor_matrix  <- cor(\n",
354 |     "    x = population_aggregated %>% \n",
355 |     "        filter(Metadata_dataset == 'BBBC022') %>% \n",
356 |     "        select(common_features) %>% \n",
357 |     "        as.matrix() %>% \n",
358 |     "        t, \n",
359 |     "    y = population_aggregated %>% \n",
360 |     "        filter(Metadata_dataset == 'BBBC036') %>% \n",
361 |     "        select(common_features) %>% \n",
362 |     "        as.matrix() %>% \n",
363 |     "        t,\n",
364 |     "    use  = \"complete.obs\"\n",
365 |     "    ) \n"
366 |    ]
367 |   },
368 |   {
369 |    "cell_type": "markdown",
370 |    "metadata": {},
371 |    "source": [
372 |     "# Submission file "
373 |    ]
374 |   },
375 |   {
376 |    "cell_type": "code",
377 |    "execution_count": null,
378 |    "metadata": {},
379 |    "outputs": [],
380 |    "source": [
381 |     "# set column names \n",
382 |     "colnames(cor_matrix)  <- population_aggregated %>% \n",
383 |     "                            filter(Metadata_dataset == 'BBBC036') %>%\n",
384 |     "                            extract2(\"Metadata_pert_id\")\n",
385 |     "\n",
386 |     "# set row names \n",
387 |     "#rownames(cor_matrix)  <- population_aggregated %>% \n",
388 |     "#                            filter(Metadata_dataset == 'BBBC036') %>%\n",
389 |     "#                            extract2(\"Metadata_broad_sample\")#\n"
390 |    ]
391 |   },
392 |   {
393 |    "cell_type": "code",
394 |    "execution_count": null,
395 |    "metadata": {},
396 |    "outputs": [],
397 |    "source": [
398 |     "df  <- cor_matrix %>% as_data_frame() %>% \n",
399 |     "            mutate(Metadata_pert_id = population_aggregated %>% \n",
400 |     "                            filter(Metadata_dataset == 'BBBC022') %>%\n",
401 |     "                            extract2(\"Metadata_pert_id\")) %>% \n",
402 |     "            select(Metadata_pert_id, everything())\n",
403 |     "\n",
404 |     "# write submission file\n",
405 |     "write.csv(df,\"../cytodata-baseline_R_day_2.csv\",row.names = FALSE)"
406 |    ]
407 |   },
408 |   {
409 |    "cell_type": "code",
410 |    "execution_count": null,
411 |    "metadata": {},
412 |    "outputs": [],
413 |    "source": [
414 |     "df %>% print"
415 |    ]
416 |   }
417 |  ],
418 |  "metadata": {
419 |   "kernelspec": {
420 |    "display_name": "R",
421 |    "language": "R",
422 |    "name": "ir"
423 |   },
424 |   "language_info": {
425 |    "codemirror_mode": "r",
426 |    "file_extension": ".r",
427 |    "mimetype": "text/x-r-source",
428 |    "name": "R",
429 |    "pygments_lexer": "r",
430 |    "version": "3.4.4"
431 |   }
432 |  },
433 |  "nbformat": 4,
434 |  "nbformat_minor": 2
435 | }
436 | 


--------------------------------------------------------------------------------
/cytodata-toolkit/R/Create-Submission_R-day2-CP.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "code",
   5 |    "execution_count": 1,
   6 |    "metadata": {
   7 |     "scrolled": true
   8 |    },
   9 |    "outputs": [
  10 |     {
  11 |      "name": "stderr",
  12 |      "output_type": "stream",
  13 |      "text": [
  14 |       "── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──\n",
  15 |       "✔ ggplot2 3.0.0     ✔ purrr   0.2.5\n",
  16 |       "✔ tibble  1.4.2     ✔ dplyr   0.7.6\n",
  17 |       "✔ tidyr   0.8.1     ✔ stringr 1.3.1\n",
  18 |       "✔ readr   1.1.1     ✔ forcats 0.3.0\n",
  19 |       "── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──\n",
  20 |       "✖ dplyr::filter() masks stats::filter()\n",
  21 |       "✖ dplyr::lag()    masks stats::lag()\n",
  22 |       "\n",
  23 |       "Attaching package: ‘cytominer’\n",
  24 |       "\n",
  25 |       "The following object is masked from ‘package:stats’:\n",
  26 |       "\n",
  27 |       "    aggregate\n",
  28 |       "\n",
  29 |       "The following object is masked from ‘package:base’:\n",
  30 |       "\n",
  31 |       "    transform\n",
  32 |       "\n",
  33 |       "\n",
  34 |       "Attaching package: ‘magrittr’\n",
  35 |       "\n",
  36 |       "The following object is masked from ‘package:purrr’:\n",
  37 |       "\n",
  38 |       "    set_names\n",
  39 |       "\n",
  40 |       "The following object is masked from ‘package:tidyr’:\n",
  41 |       "\n",
  42 |       "    extract\n",
  43 |       "\n"
  44 |      ]
  45 |     }
  46 |    ],
  47 |    "source": [
  48 |     "library(tidyverse)\n",
  49 |     "library(cytominer)\n",
  50 |     "library(magrittr)"
  51 |    ]
  52 |   },
  53 |   {
  54 |    "cell_type": "markdown",
  55 |    "metadata": {},
  56 |    "source": [
  57 |     "# Function to load different data sets\n",
  58 |     "This function also adds the Metadata columns Metadata_dataset, Metadata_partition and Metadata_features"
  59 |    ]
  60 |   },
  61 |   {
  62 |    "cell_type": "code",
  63 |    "execution_count": 2,
  64 |    "metadata": {},
  65 |    "outputs": [],
  66 |    "source": [
  67 |     "load_dataset  <- function(partition, dataset,feature){\n",
  68 |     "    file_name  <- read_csv(\"../datasets.csv\") \n",
  69 |     "    x  <-  file_name %>% filter(\n",
  70 |     "         Partition == partition,\n",
  71 |     "         Dataset == dataset,\n",
  72 |     "         Features == feature) %>% \n",
  73 |     "         extract2(\"Link\")\n",
  74 |     "\n",
  75 |     "    return(read_csv(x) %>% \n",
  76 |     "          mutate(Metadata_dataset = dataset) %>%\n",
  77 |     "          mutate(Metadata_partition = partition) %>% \n",
  78 |     "          mutate(Metadata_features = feature) \n",
  79 |     "          )\n",
  80 |     "    }"
  81 |    ]
  82 |   },
  83 |   {
  84 |    "cell_type": "markdown",
  85 |    "metadata": {},
  86 |    "source": [
  87 |     "# Load data \n",
  88 |     "We load the training and test partitions of the BBBC036 / CDRP data set and select only the important Metadata columns"
  89 |    ]
  90 |   },
  91 |   {
  92 |    "cell_type": "code",
  93 |    "execution_count": 3,
  94 |    "metadata": {
  95 |     "scrolled": true
  96 |    },
  97 |    "outputs": [
  98 |     {
  99 |      "name": "stderr",
 100 |      "output_type": "stream",
 101 |      "text": [
 102 |       "Parsed with column specification:\n",
 103 |       "cols(\n",
 104 |       "  Dataset = col_character(),\n",
 105 |       "  Partition = col_character(),\n",
 106 |       "  Features = col_character(),\n",
 107 |       "  Link = col_character()\n",
 108 |       ")\n",
 109 |       "Parsed with column specification:\n",
 110 |       "cols(\n",
 111 |       "  .default = col_double(),\n",
 112 |       "  Metadata_Plate = col_integer(),\n",
 113 |       "  Metadata_Well = col_character(),\n",
 114 |       "  Metadata_Assay_Plate_Barcode = col_integer(),\n",
 115 |       "  Metadata_Plate_Map_Name = col_character(),\n",
 116 |       "  Metadata_well_position = col_character(),\n",
 117 |       "  Metadata_ASSAY_WELL_ROLE = col_character(),\n",
 118 |       "  Metadata_broad_sample = col_character(),\n",
 119 |       "  Metadata_solvent = col_character(),\n",
 120 |       "  Metadata_pert_id = col_character(),\n",
 121 |       "  Metadata_pert_mfc_id = col_character(),\n",
 122 |       "  Metadata_pert_well = col_character(),\n",
 123 |       "  Metadata_pert_id_vendor = col_character(),\n",
 124 |       "  Metadata_cell_id = col_character(),\n",
 125 |       "  Metadata_broad_sample_type = col_character(),\n",
 126 |       "  Metadata_pert_vehicle = col_character(),\n",
 127 |       "  Metadata_pert_type = col_character(),\n",
 128 |       "  Cells_AreaShape_EulerNumber = col_integer(),\n",
 129 |       "  Cells_Children_Cytoplasm_Count = col_integer(),\n",
 130 |       "  Cells_Neighbors_FirstClosestObjectNumber_5 = col_integer(),\n",
 131 |       "  Cells_Neighbors_FirstClosestObjectNumber_Adjacent = col_integer()\n",
 132 |       "  # ... with 20 more columns\n",
 133 |       ")\n",
 134 |       "See spec(...) for full column specifications.\n",
 135 |       "Warning message in rbind(names(probs), probs_f):\n",
 136 |       "“number of columns of result is not a multiple of vector length (arg 1)”Warning message:\n",
 137 |       "“2201 parsing failures.\n",
 138 |       "row # A tibble: 5 x 5 col     row col                  expected      actual file                          expected   <int> <chr>                <chr>         <chr>  <chr>                         actual 1  1013 Cells_Neighbors_Fir… no trailing … .5     'https://s3.amazonaws.com/cy… file 2  1013 Cells_Neighbors_Fir… no trailing … .5     'https://s3.amazonaws.com/cy… row 3  1013 Nuclei_AreaShape_Ar… no trailing … .5     'https://s3.amazonaws.com/cy… col 4  1013 Nuclei_Neighbors_Se… no trailing … .5     'https://s3.amazonaws.com/cy… expected 5  1017 Nuclei_AreaShape_Ar… no trailing … .5     'https://s3.amazonaws.com/cy…\n",
 139 |       "... ................. ... ............................................................................... ........ ............................................................................... ...... ............................................................................... .... ............................................................................... ... ............................................................................... ... ............................................................................... ........ ...............................................................................\n",
 140 |       "See problems(...) for more details.\n",
 141 |       "”Parsed with column specification:\n",
 142 |       "cols(\n",
 143 |       "  Dataset = col_character(),\n",
 144 |       "  Partition = col_character(),\n",
 145 |       "  Features = col_character(),\n",
 146 |       "  Link = col_character()\n",
 147 |       ")\n",
 148 |       "Parsed with column specification:\n",
 149 |       "cols(\n",
 150 |       "  .default = col_double(),\n",
 151 |       "  Metadata_Plate = col_integer(),\n",
 152 |       "  Metadata_Well = col_character(),\n",
 153 |       "  Metadata_Assay_Plate_Barcode = col_integer(),\n",
 154 |       "  Metadata_Plate_Map_Name = col_character(),\n",
 155 |       "  Metadata_well_position = col_character(),\n",
 156 |       "  Metadata_ASSAY_WELL_ROLE = col_character(),\n",
 157 |       "  Metadata_broad_sample = col_character(),\n",
 158 |       "  Metadata_solvent = col_character(),\n",
 159 |       "  Metadata_pert_id = col_character(),\n",
 160 |       "  Metadata_pert_mfc_id = col_character(),\n",
 161 |       "  Metadata_pert_well = col_character(),\n",
 162 |       "  Metadata_pert_id_vendor = col_character(),\n",
 163 |       "  Metadata_cell_id = col_character(),\n",
 164 |       "  Metadata_broad_sample_type = col_character(),\n",
 165 |       "  Metadata_pert_vehicle = col_character(),\n",
 166 |       "  Metadata_pert_type = col_character(),\n",
 167 |       "  Cells_AreaShape_EulerNumber = col_integer(),\n",
 168 |       "  Cells_Children_Cytoplasm_Count = col_integer(),\n",
 169 |       "  Cytoplasm_AreaShape_EulerNumber = col_integer(),\n",
 170 |       "  Cytoplasm_Correlation_Manders_AGP_DNA = col_integer()\n",
 171 |       "  # ... with 31 more columns\n",
 172 |       ")\n",
 173 |       "See spec(...) for full column specifications.\n",
 174 |       "Warning message in rbind(names(probs), probs_f):\n",
 175 |       "“number of columns of result is not a multiple of vector length (arg 1)”Warning message:\n",
 176 |       "“33 parsing failures.\n",
 177 |       "row # A tibble: 5 x 5 col     row col              expected      actual    file                           expected   <int> <chr>            <chr>         <chr>     <chr>                          actual 1  1103 Nuclei_Correlat… no trailing … .8570798… 'https://s3.amazonaws.com/cyt… file 2  1103 Nuclei_Correlat… no trailing … .5790690… 'https://s3.amazonaws.com/cyt… row 3  1103 Nuclei_Correlat… no trailing … .9671379… 'https://s3.amazonaws.com/cyt… col 4  1103 Nuclei_Correlat… no trailing … .6148584… 'https://s3.amazonaws.com/cyt… expected 5  1104 Nuclei_Correlat… no trailing … .9769246… 'https://s3.amazonaws.com/cyt…\n",
 178 |       "... ................. ... ............................................................................... ........ ............................................................................... ...... ............................................................................... .... ............................................................................... ... ............................................................................... ... ............................................................................... ........ ...............................................................................\n",
 179 |       "See problems(...) for more details.\n",
 180 |       "”"
 181 |      ]
 182 |     }
 183 |    ],
 184 |    "source": [
 185 |     "# bbbc036 data \n",
 186 |     "bbbc036_train  <- load_dataset(\"Train\",\"BBBC036\",\"CellProfiler\")  %>% \n",
 187 |     "    mutate(Metadata_x_mutation_status = \"none\")\n",
 188 |     "\n",
 189 |     "bbbc036_test <- load_dataset(\"Test\",\"BBBC036\",\"CellProfiler\")  %>% \n",
 190 |     "    mutate(Metadata_x_mutation_status = \"none\")\n"
 191 |    ]
 192 |   },
 193 |   {
 194 |    "cell_type": "code",
 195 |    "execution_count": 4,
 196 |    "metadata": {},
 197 |    "outputs": [],
 198 |    "source": [
 199 |     "bbbc036  <- rbind(bbbc036_train, bbbc036_test)  "
 200 |    ]
 201 |   },
 202 |   {
 203 |    "cell_type": "markdown",
 204 |    "metadata": {},
 205 |    "source": [
 206 |     "# How large are the data sets? "
 207 |    ]
 208 |   },
 209 |   {
 210 |    "cell_type": "code",
 211 |    "execution_count": 5,
 212 |    "metadata": {},
 213 |    "outputs": [
 214 |     {
 215 |      "data": {
 216 |       "text/html": [
 217 |        "<ol class=list-inline>\n",
 218 |        "\t<li>18929</li>\n",
 219 |        "\t<li>1805</li>\n",
 220 |        "</ol>\n"
 221 |       ],
 222 |       "text/latex": [
 223 |        "\\begin{enumerate*}\n",
 224 |        "\\item 18929\n",
 225 |        "\\item 1805\n",
 226 |        "\\end{enumerate*}\n"
 227 |       ],
 228 |       "text/markdown": [
 229 |        "1. 18929\n",
 230 |        "2. 1805\n",
 231 |        "\n",
 232 |        "\n"
 233 |       ],
 234 |       "text/plain": [
 235 |        "[1] 18929  1805"
 236 |       ]
 237 |      },
 238 |      "metadata": {},
 239 |      "output_type": "display_data"
 240 |     },
 241 |     {
 242 |      "data": {
 243 |       "text/html": [
 244 |        "<ol class=list-inline>\n",
 245 |        "\t<li>2177</li>\n",
 246 |        "\t<li>1805</li>\n",
 247 |        "</ol>\n"
 248 |       ],
 249 |       "text/latex": [
 250 |        "\\begin{enumerate*}\n",
 251 |        "\\item 2177\n",
 252 |        "\\item 1805\n",
 253 |        "\\end{enumerate*}\n"
 254 |       ],
 255 |       "text/markdown": [
 256 |        "1. 2177\n",
 257 |        "2. 1805\n",
 258 |        "\n",
 259 |        "\n"
 260 |       ],
 261 |       "text/plain": [
 262 |        "[1] 2177 1805"
 263 |       ]
 264 |      },
 265 |      "metadata": {},
 266 |      "output_type": "display_data"
 267 |     }
 268 |    ],
 269 |    "source": [
 270 |     "bbbc036_train %>% dim()\n",
 271 |     "bbbc036_test %>% dim()"
 272 |    ]
 273 |   },
 274 |   {
 275 |    "cell_type": "markdown",
 276 |    "metadata": {},
 277 |    "source": [
 278 |     "# What are the Metadata columns?"
 279 |    ]
 280 |   },
 281 |   {
 282 |    "cell_type": "code",
 283 |    "execution_count": 6,
 284 |    "metadata": {},
 285 |    "outputs": [
 286 |     {
 287 |      "name": "stdout",
 288 |      "output_type": "stream",
 289 |      "text": [
 290 |       "# A tibble: 5 x 4\n",
 291 |       "  Metadata_Plate Metadata_Well Metadata_Plate_Map_Name Metadata_pert_id\n",
 292 |       "           <int> <chr>         <chr>                   <chr>           \n",
 293 |       "1          24277 a01           H-BIOA-004-3            BRD-K18250272   \n",
 294 |       "2          24277 a02           H-BIOA-004-3            BRD-K18316707   \n",
 295 |       "3          24277 a03           H-BIOA-004-3            BRD-K18438502   \n",
 296 |       "4          24277 a04           H-BIOA-004-3            BRD-K18550767   \n",
 297 |       "5          24277 a05           H-BIOA-004-3            BRD-K18574842   \n"
 298 |      ]
 299 |     }
 300 |    ],
 301 |    "source": [
 302 |     "bbbc036 %>% \n",
 303 |     "    select(Metadata_Plate, Metadata_Well, Metadata_Plate_Map_Name, Metadata_pert_id) %>% \n",
 304 |     "    slice(1:5) %>% \n",
 305 |     "    print()"
 306 |    ]
 307 |   },
 308 |   {
 309 |    "cell_type": "code",
 310 |    "execution_count": 27,
 311 |    "metadata": {},
 312 |    "outputs": [
 313 |     {
 314 |      "name": "stdout",
 315 |      "output_type": "stream",
 316 |      "text": [
 317 |       "# A tibble: 5 x 4\n",
 318 |       "  Metadata_pert_id Metadata_broad_sample_ty… Metadata_dataset Metadata_partiti…\n",
 319 |       "  <chr>            <chr>                     <chr>            <chr>            \n",
 320 |       "1 BRD-K18250272    trt                       BBBC036          Train            \n",
 321 |       "2 BRD-K18316707    trt                       BBBC036          Train            \n",
 322 |       "3 BRD-K18438502    trt                       BBBC036          Train            \n",
 323 |       "4 BRD-K18550767    trt                       BBBC036          Train            \n",
 324 |       "5 BRD-K18574842    trt                       BBBC036          Train            \n"
 325 |      ]
 326 |     }
 327 |    ],
 328 |    "source": [
 329 |     "bbbc036 %>% \n",
 330 |     "    select( Metadata_pert_id, Metadata_broad_sample_type,Metadata_dataset, Metadata_partition) %>% \n",
 331 |     "    slice(1:5) %>% \n",
 332 |     "    print()"
 333 |    ]
 334 |   },
 335 |   {
 336 |    "cell_type": "markdown",
 337 |    "metadata": {},
 338 |    "source": [
 339 |     "# How many replicates do we have? "
 340 |    ]
 341 |   },
 342 |   {
 343 |    "cell_type": "code",
 344 |    "execution_count": 8,
 345 |    "metadata": {},
 346 |    "outputs": [
 347 |     {
 348 |      "data": {
 349 |       "text/html": [
 350 |        "<table>\n",
 351 |        "<thead><tr><th scope=col>Metadata_Plate_Map_Name</th><th scope=col>mean_replicates</th></tr></thead>\n",
 352 |        "<tbody>\n",
 353 |        "\t<tr><td>H-BIOA-001-3</td><td>9.600000    </td></tr>\n",
 354 |        "\t<tr><td>H-BIOA-002-3</td><td>8.373832    </td></tr>\n",
 355 |        "\t<tr><td>H-BIOA-003-3</td><td>9.554517    </td></tr>\n",
 356 |        "\t<tr><td>H-BIOA-004-3</td><td>9.570093    </td></tr>\n",
 357 |        "\t<tr><td>H-BIOA-005-3</td><td>9.563863    </td></tr>\n",
 358 |        "\t<tr><td>H-BIOA-006-3</td><td>9.554517    </td></tr>\n",
 359 |        "\t<tr><td>H-BIOA-007-3</td><td>9.593750    </td></tr>\n",
 360 |        "</tbody>\n",
 361 |        "</table>\n"
 362 |       ],
 363 |       "text/latex": [
 364 |        "\\begin{tabular}{r|ll}\n",
 365 |        " Metadata\\_Plate\\_Map\\_Name & mean\\_replicates\\\\\n",
 366 |        "\\hline\n",
 367 |        "\t H-BIOA-001-3 & 9.600000    \\\\\n",
 368 |        "\t H-BIOA-002-3 & 8.373832    \\\\\n",
 369 |        "\t H-BIOA-003-3 & 9.554517    \\\\\n",
 370 |        "\t H-BIOA-004-3 & 9.570093    \\\\\n",
 371 |        "\t H-BIOA-005-3 & 9.563863    \\\\\n",
 372 |        "\t H-BIOA-006-3 & 9.554517    \\\\\n",
 373 |        "\t H-BIOA-007-3 & 9.593750    \\\\\n",
 374 |        "\\end{tabular}\n"
 375 |       ],
 376 |       "text/markdown": [
 377 |        "\n",
 378 |        "Metadata_Plate_Map_Name | mean_replicates | \n",
 379 |        "|---|---|---|---|---|---|---|\n",
 380 |        "| H-BIOA-001-3 | 9.600000     | \n",
 381 |        "| H-BIOA-002-3 | 8.373832     | \n",
 382 |        "| H-BIOA-003-3 | 9.554517     | \n",
 383 |        "| H-BIOA-004-3 | 9.570093     | \n",
 384 |        "| H-BIOA-005-3 | 9.563863     | \n",
 385 |        "| H-BIOA-006-3 | 9.554517     | \n",
 386 |        "| H-BIOA-007-3 | 9.593750     | \n",
 387 |        "\n",
 388 |        "\n"
 389 |       ],
 390 |       "text/plain": [
 391 |        "  Metadata_Plate_Map_Name mean_replicates\n",
 392 |        "1 H-BIOA-001-3            9.600000       \n",
 393 |        "2 H-BIOA-002-3            8.373832       \n",
 394 |        "3 H-BIOA-003-3            9.554517       \n",
 395 |        "4 H-BIOA-004-3            9.570093       \n",
 396 |        "5 H-BIOA-005-3            9.563863       \n",
 397 |        "6 H-BIOA-006-3            9.554517       \n",
 398 |        "7 H-BIOA-007-3            9.593750       "
 399 |       ]
 400 |      },
 401 |      "metadata": {},
 402 |      "output_type": "display_data"
 403 |     }
 404 |    ],
 405 |    "source": [
 406 |     "bbbc036 %>% \n",
 407 |     "    group_by(Metadata_Plate_Map_Name, Metadata_pert_id) %>%\n",
 408 |     "    summarise(n_groups = n()) %>%\n",
 409 |     "    summarise(mean_replicates = mean(n_groups))"
 410 |    ]
 411 |   },
 412 |   {
 413 |    "cell_type": "code",
 414 |    "execution_count": 9,
 415 |    "metadata": {
 416 |     "scrolled": true
 417 |    },
 418 |    "outputs": [
 419 |     {
 420 |      "name": "stderr",
 421 |      "output_type": "stream",
 422 |      "text": [
 423 |       "Parsed with column specification:\n",
 424 |       "cols(\n",
 425 |       "  Dataset = col_character(),\n",
 426 |       "  Partition = col_character(),\n",
 427 |       "  Features = col_character(),\n",
 428 |       "  Link = col_character()\n",
 429 |       ")\n",
 430 |       "Parsed with column specification:\n",
 431 |       "cols(\n",
 432 |       "  .default = col_double(),\n",
 433 |       "  Metadata_Plate = col_integer(),\n",
 434 |       "  Metadata_Well = col_character(),\n",
 435 |       "  Metadata_Assay_Plate_Barcode = col_integer(),\n",
 436 |       "  Metadata_Plate_Map_Name = col_character(),\n",
 437 |       "  Metadata_well_position = col_character(),\n",
 438 |       "  Metadata_broad_sample = col_character(),\n",
 439 |       "  Metadata_source_name = col_character(),\n",
 440 |       "  Metadata_compound_name = col_character(),\n",
 441 |       "  Metadata_smiles = col_character(),\n",
 442 |       "  Metadata_solvent = col_character(),\n",
 443 |       "  Metadata_pert_id = col_character(),\n",
 444 |       "  Metadata_pert_mfc_id = col_character(),\n",
 445 |       "  Metadata_pert_well = col_character(),\n",
 446 |       "  Metadata_pert_id_vendor = col_character(),\n",
 447 |       "  Metadata_cell_id = col_character(),\n",
 448 |       "  Metadata_broad_sample_type = col_character(),\n",
 449 |       "  Metadata_pert_vehicle = col_character(),\n",
 450 |       "  Metadata_pert_type = col_character(),\n",
 451 |       "  Metadata_exp = col_character()\n",
 452 |       ")\n",
 453 |       "See spec(...) for full column specifications.\n",
 454 |       "Parsed with column specification:\n",
 455 |       "cols(\n",
 456 |       "  Dataset = col_character(),\n",
 457 |       "  Partition = col_character(),\n",
 458 |       "  Features = col_character(),\n",
 459 |       "  Link = col_character()\n",
 460 |       ")\n",
 461 |       "Parsed with column specification:\n",
 462 |       "cols(\n",
 463 |       "  .default = col_double(),\n",
 464 |       "  Metadata_Plate = col_integer(),\n",
 465 |       "  Metadata_Well = col_character(),\n",
 466 |       "  Metadata_Assay_Plate_Barcode = col_integer(),\n",
 467 |       "  Metadata_Plate_Map_Name = col_character(),\n",
 468 |       "  Metadata_well_position = col_character(),\n",
 469 |       "  Metadata_broad_sample = col_character(),\n",
 470 |       "  Metadata_source_name = col_character(),\n",
 471 |       "  Metadata_compound_name = col_character(),\n",
 472 |       "  Metadata_smiles = col_character(),\n",
 473 |       "  Metadata_solvent = col_character(),\n",
 474 |       "  Metadata_pert_id = col_character(),\n",
 475 |       "  Metadata_pert_mfc_id = col_character(),\n",
 476 |       "  Metadata_pert_well = col_character(),\n",
 477 |       "  Metadata_pert_id_vendor = col_character(),\n",
 478 |       "  Metadata_cell_id = col_character(),\n",
 479 |       "  Metadata_broad_sample_type = col_character(),\n",
 480 |       "  Metadata_pert_vehicle = col_character(),\n",
 481 |       "  Metadata_pert_type = col_character(),\n",
 482 |       "  Metadata_exp = col_character()\n",
 483 |       ")\n",
 484 |       "See spec(...) for full column specifications.\n"
 485 |      ]
 486 |     }
 487 |    ],
 488 |    "source": [
 489 |     "# bbbc022 data \n",
 490 |     "bbbc022_train  <- load_dataset(\"Train\",\"BBBC022\",\"CellProfiler\")\n",
 491 |     " \n",
 492 |     "bbbc022_test <- load_dataset(\"Test\",\"BBBC022\",\"CellProfiler\")\n",
 493 |     "\n",
 494 |     "bbbc022  <- rbind(bbbc022_train, bbbc022_test) %>%\n",
 495 |     "  select(Metadata_Plate, Metadata_Well, Metadata_Plate_Map_Name, \n",
 496 |     "         Metadata_pert_id, Metadata_broad_sample_type,\n",
 497 |     "         Metadata_dataset,Metadata_partition, \n",
 498 |     "         everything()\n",
 499 |     "        ) "
 500 |    ]
 501 |   },
 502 |   {
 503 |    "cell_type": "markdown",
 504 |    "metadata": {},
 505 |    "source": [
 506 |     "# How large are the training and test partitions for BBBC022?"
 507 |    ]
 508 |   },
 509 |   {
 510 |    "cell_type": "code",
 511 |    "execution_count": 10,
 512 |    "metadata": {},
 513 |    "outputs": [
 514 |     {
 515 |      "data": {
 516 |       "text/html": [
 517 |        "<ol class=list-inline>\n",
 518 |        "\t<li>6462</li>\n",
 519 |        "\t<li>1806</li>\n",
 520 |        "</ol>\n"
 521 |       ],
 522 |       "text/latex": [
 523 |        "\\begin{enumerate*}\n",
 524 |        "\\item 6462\n",
 525 |        "\\item 1806\n",
 526 |        "\\end{enumerate*}\n"
 527 |       ],
 528 |       "text/markdown": [
 529 |        "1. 6462\n",
 530 |        "2. 1806\n",
 531 |        "\n",
 532 |        "\n"
 533 |       ],
 534 |       "text/plain": [
 535 |        "[1] 6462 1806"
 536 |       ]
 537 |      },
 538 |      "metadata": {},
 539 |      "output_type": "display_data"
 540 |     },
 541 |     {
 542 |      "data": {
 543 |       "text/html": [
 544 |        "<ol class=list-inline>\n",
 545 |        "\t<li>1120</li>\n",
 546 |        "\t<li>1806</li>\n",
 547 |        "</ol>\n"
 548 |       ],
 549 |       "text/latex": [
 550 |        "\\begin{enumerate*}\n",
 551 |        "\\item 1120\n",
 552 |        "\\item 1806\n",
 553 |        "\\end{enumerate*}\n"
 554 |       ],
 555 |       "text/markdown": [
 556 |        "1. 1120\n",
 557 |        "2. 1806\n",
 558 |        "\n",
 559 |        "\n"
 560 |       ],
 561 |       "text/plain": [
 562 |        "[1] 1120 1806"
 563 |       ]
 564 |      },
 565 |      "metadata": {},
 566 |      "output_type": "display_data"
 567 |     }
 568 |    ],
 569 |    "source": [
 570 |     "bbbc022_train %>% dim()\n",
 571 |     "bbbc022_test %>% dim()"
 572 |    ]
 573 |   },
 574 |   {
 575 |    "cell_type": "code",
 576 |    "execution_count": 11,
 577 |    "metadata": {},
 578 |    "outputs": [
 579 |     {
 580 |      "data": {
 581 |       "text/html": [
 582 |        "<table>\n",
 583 |        "<thead><tr><th scope=col>Metadata_Plate_Map_Name</th><th scope=col>mean_replicates</th></tr></thead>\n",
 584 |        "<tbody>\n",
 585 |        "\t<tr><td>H-BIOA-002-1</td><td>4.860759    </td></tr>\n",
 586 |        "\t<tr><td>H-BIOA-003-1</td><td>4.780255    </td></tr>\n",
 587 |        "\t<tr><td>H-BIOA-004-1</td><td>4.830128    </td></tr>\n",
 588 |        "\t<tr><td>H-BIOA-005-1</td><td>4.816456    </td></tr>\n",
 589 |        "\t<tr><td>H-BIOA-006-1</td><td>4.797468    </td></tr>\n",
 590 |        "</tbody>\n",
 591 |        "</table>\n"
 592 |       ],
 593 |       "text/latex": [
 594 |        "\\begin{tabular}{r|ll}\n",
 595 |        " Metadata\\_Plate\\_Map\\_Name & mean\\_replicates\\\\\n",
 596 |        "\\hline\n",
 597 |        "\t H-BIOA-002-1 & 4.860759    \\\\\n",
 598 |        "\t H-BIOA-003-1 & 4.780255    \\\\\n",
 599 |        "\t H-BIOA-004-1 & 4.830128    \\\\\n",
 600 |        "\t H-BIOA-005-1 & 4.816456    \\\\\n",
 601 |        "\t H-BIOA-006-1 & 4.797468    \\\\\n",
 602 |        "\\end{tabular}\n"
 603 |       ],
 604 |       "text/markdown": [
 605 |        "\n",
 606 |        "Metadata_Plate_Map_Name | mean_replicates | \n",
 607 |        "|---|---|---|---|---|\n",
 608 |        "| H-BIOA-002-1 | 4.860759     | \n",
 609 |        "| H-BIOA-003-1 | 4.780255     | \n",
 610 |        "| H-BIOA-004-1 | 4.830128     | \n",
 611 |        "| H-BIOA-005-1 | 4.816456     | \n",
 612 |        "| H-BIOA-006-1 | 4.797468     | \n",
 613 |        "\n",
 614 |        "\n"
 615 |       ],
 616 |       "text/plain": [
 617 |        "  Metadata_Plate_Map_Name mean_replicates\n",
 618 |        "1 H-BIOA-002-1            4.860759       \n",
 619 |        "2 H-BIOA-003-1            4.780255       \n",
 620 |        "3 H-BIOA-004-1            4.830128       \n",
 621 |        "4 H-BIOA-005-1            4.816456       \n",
 622 |        "5 H-BIOA-006-1            4.797468       "
 623 |       ]
 624 |      },
 625 |      "metadata": {},
 626 |      "output_type": "display_data"
 627 |     }
 628 |    ],
 629 |    "source": [
 630 |     "bbbc022 %>% \n",
 631 |     "    group_by(Metadata_Plate_Map_Name, Metadata_pert_id) %>%\n",
 632 |     "    summarise(n_groups = n()) %>%\n",
 633 |     "    summarise(mean_replicates = mean(n_groups))"
 634 |    ]
 635 |   },
 636 |   {
 637 |    "cell_type": "markdown",
 638 |    "metadata": {},
 639 |    "source": [
 640 |     "# How large are the combined data sets? "
 641 |    ]
 642 |   },
 643 |   {
 644 |    "cell_type": "code",
 645 |    "execution_count": 12,
 646 |    "metadata": {},
 647 |    "outputs": [
 648 |     {
 649 |      "data": {
 650 |       "text/html": [
 651 |        "<ol class=list-inline>\n",
 652 |        "\t<li>7582</li>\n",
 653 |        "\t<li>1806</li>\n",
 654 |        "</ol>\n"
 655 |       ],
 656 |       "text/latex": [
 657 |        "\\begin{enumerate*}\n",
 658 |        "\\item 7582\n",
 659 |        "\\item 1806\n",
 660 |        "\\end{enumerate*}\n"
 661 |       ],
 662 |       "text/markdown": [
 663 |        "1. 7582\n",
 664 |        "2. 1806\n",
 665 |        "\n",
 666 |        "\n"
 667 |       ],
 668 |       "text/plain": [
 669 |        "[1] 7582 1806"
 670 |       ]
 671 |      },
 672 |      "metadata": {},
 673 |      "output_type": "display_data"
 674 |     },
 675 |     {
 676 |      "data": {
 677 |       "text/html": [
 678 |        "<ol class=list-inline>\n",
 679 |        "\t<li>21106</li>\n",
 680 |        "\t<li>1805</li>\n",
 681 |        "</ol>\n"
 682 |       ],
 683 |       "text/latex": [
 684 |        "\\begin{enumerate*}\n",
 685 |        "\\item 21106\n",
 686 |        "\\item 1805\n",
 687 |        "\\end{enumerate*}\n"
 688 |       ],
 689 |       "text/markdown": [
 690 |        "1. 21106\n",
 691 |        "2. 1805\n",
 692 |        "\n",
 693 |        "\n"
 694 |       ],
 695 |       "text/plain": [
 696 |        "[1] 21106  1805"
 697 |       ]
 698 |      },
 699 |      "metadata": {},
 700 |      "output_type": "display_data"
 701 |     }
 702 |    ],
 703 |    "source": [
 704 |     "dim(bbbc022)\n",
 705 |     "dim(bbbc036)"
 706 |    ]
 707 |   },
 708 |   {
 709 |    "cell_type": "markdown",
 710 |    "metadata": {},
 711 |    "source": [
 712 |     "## Extract common features and common metadata"
 713 |    ]
 714 |   },
 715 |   {
 716 |    "cell_type": "code",
 717 |    "execution_count": 13,
 718 |    "metadata": {},
 719 |    "outputs": [],
 720 |    "source": [
 721 |     "colnames_bbbc022 <- colnames(bbbc022)\n",
 722 |     "colnames_bbbc036 <- colnames(bbbc036)\n",
 723 |     "\n",
 724 |     "Metadata_names_bbbc022 <- c(\n",
 725 |     "   stringr::str_subset(colnames_bbbc022, \"^Meta\")\n",
 726 |     ") \n",
 727 |     "\n",
 728 |     "Metadata_names_bbbc036 <- c(\n",
 729 |     "   stringr::str_subset(colnames_bbbc036, \"^Meta\")\n",
 730 |     ") \n",
 731 |     "\n",
 732 |     "common_metadata  <- intersect(Metadata_names_bbbc022, Metadata_names_bbbc036)  \n",
 733 |     "common_features  <- setdiff(intersect(colnames_bbbc022, colnames_bbbc036),common_metadata)"
 734 |    ]
 735 |   },
 736 |   {
 737 |    "cell_type": "markdown",
 738 |    "metadata": {},
 739 |    "source": [
 740 |     "# Concatenate data sets"
 741 |    ]
 742 |   },
 743 |   {
 744 |    "cell_type": "code",
 745 |    "execution_count": 14,
 746 |    "metadata": {},
 747 |    "outputs": [],
 748 |    "source": [
 749 |     "population  <- rbind(\n",
 750 |     "    bbbc022 %>% \n",
 751 |     "        select(c(common_metadata, common_features)),\n",
 752 |     "    bbbc036 %>% \n",
 753 |     "        select(c(common_metadata, common_features))\n",
 754 |     "    ) %>% \n",
 755 |     "    mutate(Metadata_perturbation = 'chemical') %>% \n",
 756 |     "    select(Metadata_perturbation, everything())"
 757 |    ]
 758 |   },
 759 |   {
 760 |    "cell_type": "markdown",
 761 |    "metadata": {},
 762 |    "source": [
 763 |     "## Important: update column names! "
 764 |    ]
 765 |   },
 766 |   {
 767 |    "cell_type": "code",
 768 |    "execution_count": 15,
 769 |    "metadata": {},
 770 |    "outputs": [],
 771 |    "source": [
 772 |     "colnames_combined  <- colnames(population)\n",
 773 |     "\n",
 774 |     "common_metadata  <- c(\n",
 775 |     "   stringr::str_subset(colnames_combined, \"^Meta\")\n",
 776 |     ") \n",
 777 |     "\n",
 778 |     "common_features  <- setdiff(colnames_combined, common_metadata)"
 779 |    ]
 780 |   },
 781 |   {
 782 |    "cell_type": "markdown",
 783 |    "metadata": {},
 784 |    "source": [
 785 |     "Cytominer has problems handling column names '1', '2' so we rename them to 'Feature_1', ... "
 786 |    ]
 787 |   },
 788 |   {
 789 |    "cell_type": "code",
 790 |    "execution_count": 16,
 791 |    "metadata": {},
 792 |    "outputs": [],
 793 |    "source": [
 794 |     "common_features  <- paste0(\"Feature_\",common_features)\n",
 795 |     "colnames(population)  <- c(common_metadata, common_features)"
 796 |    ]
 797 |   },
 798 |   {
 799 |    "cell_type": "markdown",
 800 |    "metadata": {},
 801 |    "source": [
 802 |     "# Normalize data\n",
 803 |     "We use cytominer to normalize both datasets with respect to the controls, i.e. the DMSO samples in the training partition"
 804 |    ]
 805 |   },
 806 |   {
 807 |    "cell_type": "code",
 808 |    "execution_count": 17,
 809 |    "metadata": {},
 810 |    "outputs": [],
 811 |    "source": [
 812 |     "population_normalized  <- cytominer::normalize(\n",
 813 |     "    population, \n",
 814 |     "    variables = common_features, \n",
 815 |     "    strata = c(\"Metadata_perturbation\"), \n",
 816 |     "    sample = population %>% \n",
 817 |     "                filter(\n",
 818 |     "                    Metadata_broad_sample == \"DMSO\",\n",
 819 |     "                    Metadata_partition == \"Train\"\n",
 820 |     "                ), \n",
 821 |     "    operation = \"standardize\"\n",
 822 |     ")"
 823 |    ]
 824 |   },
 825 |   {
 826 |    "cell_type": "markdown",
 827 |    "metadata": {},
 828 |    "source": [
 829 |     "# Aggregate data "
 830 |    ]
 831 |   },
 832 |   {
 833 |    "cell_type": "code",
 834 |    "execution_count": 18,
 835 |    "metadata": {},
 836 |    "outputs": [
 837 |     {
 838 |      "data": {
 839 |       "text/html": [
 840 |        "<ol class=list-inline>\n",
 841 |        "\t<li>3782</li>\n",
 842 |        "\t<li>1786</li>\n",
 843 |        "</ol>\n"
 844 |       ],
 845 |       "text/latex": [
 846 |        "\\begin{enumerate*}\n",
 847 |        "\\item 3782\n",
 848 |        "\\item 1786\n",
 849 |        "\\end{enumerate*}\n"
 850 |       ],
 851 |       "text/markdown": [
 852 |        "1. 3782\n",
 853 |        "2. 1786\n",
 854 |        "\n",
 855 |        "\n"
 856 |       ],
 857 |       "text/plain": [
 858 |        "[1] 3782 1786"
 859 |       ]
 860 |      },
 861 |      "metadata": {},
 862 |      "output_type": "display_data"
 863 |     }
 864 |    ],
 865 |    "source": [
 866 |     "population_aggregated  <- cytominer::aggregate(\n",
 867 |     "    population = population_normalized, \n",
 868 |     "    variables = common_features, \n",
 869 |     "    strata = c(\"Metadata_pert_id\",\"Metadata_dataset\",\"Metadata_partition\"), \n",
 870 |     "    operation = \"mean\"\n",
 871 |     ") \n",
 872 |     "\n",
 873 |     "population_aggregated %>% dim()"
 874 |    ]
 875 |   },
 876 |   {
 877 |    "cell_type": "code",
 878 |    "execution_count": 19,
 879 |    "metadata": {},
 880 |    "outputs": [
 881 |     {
 882 |      "data": {
 883 |       "text/html": [
 884 |        "<ol class=list-inline>\n",
 885 |        "\t<li>1543</li>\n",
 886 |        "\t<li>1786</li>\n",
 887 |        "</ol>\n"
 888 |       ],
 889 |       "text/latex": [
 890 |        "\\begin{enumerate*}\n",
 891 |        "\\item 1543\n",
 892 |        "\\item 1786\n",
 893 |        "\\end{enumerate*}\n"
 894 |       ],
 895 |       "text/markdown": [
 896 |        "1. 1543\n",
 897 |        "2. 1786\n",
 898 |        "\n",
 899 |        "\n"
 900 |       ],
 901 |       "text/plain": [
 902 |        "[1] 1543 1786"
 903 |       ]
 904 |      },
 905 |      "metadata": {},
 906 |      "output_type": "display_data"
 907 |     },
 908 |     {
 909 |      "data": {
 910 |       "text/html": [
 911 |        "<ol class=list-inline>\n",
 912 |        "\t<li>2239</li>\n",
 913 |        "\t<li>1786</li>\n",
 914 |        "</ol>\n"
 915 |       ],
 916 |       "text/latex": [
 917 |        "\\begin{enumerate*}\n",
 918 |        "\\item 2239\n",
 919 |        "\\item 1786\n",
 920 |        "\\end{enumerate*}\n"
 921 |       ],
 922 |       "text/markdown": [
 923 |        "1. 2239\n",
 924 |        "2. 1786\n",
 925 |        "\n",
 926 |        "\n"
 927 |       ],
 928 |       "text/plain": [
 929 |        "[1] 2239 1786"
 930 |       ]
 931 |      },
 932 |      "metadata": {},
 933 |      "output_type": "display_data"
 934 |     }
 935 |    ],
 936 |    "source": [
 937 |     "population_aggregated %>% filter(Metadata_dataset == \"BBBC022\") %>% dim()\n",
 938 |     "population_aggregated %>% filter(Metadata_dataset == \"BBBC036\") %>% dim()"
 939 |    ]
 940 |   },
 941 |   {
 942 |    "cell_type": "markdown",
 943 |    "metadata": {},
 944 |    "source": [
 945 |     "# Correlation matrix "
 946 |    ]
 947 |   },
 948 |   {
 949 |    "cell_type": "code",
 950 |    "execution_count": 20,
 951 |    "metadata": {},
 952 |    "outputs": [
 953 |     {
 954 |      "data": {
 955 |       "text/html": [
 956 |        "<ol class=list-inline>\n",
 957 |        "\t<li>1543</li>\n",
 958 |        "\t<li>2239</li>\n",
 959 |        "</ol>\n"
 960 |       ],
 961 |       "text/latex": [
 962 |        "\\begin{enumerate*}\n",
 963 |        "\\item 1543\n",
 964 |        "\\item 2239\n",
 965 |        "\\end{enumerate*}\n"
 966 |       ],
 967 |       "text/markdown": [
 968 |        "1. 1543\n",
 969 |        "2. 2239\n",
 970 |        "\n",
 971 |        "\n"
 972 |       ],
 973 |       "text/plain": [
 974 |        "[1] 1543 2239"
 975 |       ]
 976 |      },
 977 |      "metadata": {},
 978 |      "output_type": "display_data"
 979 |     }
 980 |    ],
 981 |    "source": [
 982 |     "cor_matrix  <- cor(\n",
 983 |     "    x = population_aggregated %>% \n",
 984 |     "        filter(Metadata_dataset == 'BBBC022') %>% \n",
 985 |     "        select(common_features) %>% \n",
 986 |     "        as.matrix() %>% \n",
 987 |     "        t, \n",
 988 |     "    y = population_aggregated %>% \n",
 989 |     "        filter(Metadata_dataset == 'BBBC036') %>% \n",
 990 |     "        select(common_features) %>% \n",
 991 |     "        as.matrix() %>% \n",
 992 |     "        t,\n",
 993 |     "    use  = \"complete.obs\"\n",
 994 |     "    ) \n",
 995 |     "\n",
 996 |     "cor_matrix %>% dim()"
 997 |    ]
 998 |   },
 999 |   {
1000 |    "cell_type": "markdown",
1001 |    "metadata": {},
1002 |    "source": [
1003 |     "# Submission file "
1004 |    ]
1005 |   },
1006 |   {
1007 |    "cell_type": "code",
1008 |    "execution_count": 21,
1009 |    "metadata": {},
1010 |    "outputs": [
1011 |     {
1012 |      "data": {
1013 |       "text/html": [
1014 |        "<ol class=list-inline>\n",
1015 |        "\t<li>1543</li>\n",
1016 |        "\t<li>2239</li>\n",
1017 |        "</ol>\n"
1018 |       ],
1019 |       "text/latex": [
1020 |        "\\begin{enumerate*}\n",
1021 |        "\\item 1543\n",
1022 |        "\\item 2239\n",
1023 |        "\\end{enumerate*}\n"
1024 |       ],
1025 |       "text/markdown": [
1026 |        "1. 1543\n",
1027 |        "2. 2239\n",
1028 |        "\n",
1029 |        "\n"
1030 |       ],
1031 |       "text/plain": [
1032 |        "[1] 1543 2239"
1033 |       ]
1034 |      },
1035 |      "metadata": {},
1036 |      "output_type": "display_data"
1037 |     }
1038 |    ],
1039 |    "source": [
1040 |     "# set column names \n",
1041 |     "colnames(cor_matrix)  <- population_aggregated %>% \n",
1042 |     "                            filter(Metadata_dataset == 'BBBC036') %>%\n",
1043 |     "                            extract2(\"Metadata_pert_id\")\n",
1044 |     "\n",
1045 |     "# set row names \n",
1046 |     "#rownames(cor_matrix)  <- population_aggregated %>% \n",
1047 |     "#                            filter(Metadata_dataset == 'BBBC036') %>%\n",
1048 |     "#                            extract2(\"Metadata_broad_sample\")#\n",
1049 |     "\n",
1050 |     "cor_matrix %>% dim()"
1051 |    ]
1052 |   },
1053 |   {
1054 |    "cell_type": "code",
1055 |    "execution_count": 23,
1056 |    "metadata": {},
1057 |    "outputs": [],
1058 |    "source": [
1059 |     "df  <- cor_matrix %>% as_data_frame() %>% \n",
1060 |     "            mutate(Metadata_pert_id = population_aggregated %>% \n",
1061 |     "                            filter(Metadata_dataset == 'BBBC022') %>%\n",
1062 |     "                            extract2(\"Metadata_pert_id\")) %>% \n",
1063 |     "            select(Metadata_pert_id, everything())\n",
1064 |     "\n",
1065 |     "# write submission file\n",
1066 |     "write.csv(df,\"../cytodata-baseline_R_day_2_CP.csv\",row.names = FALSE)"
1067 |    ]
1068 |   },
1069 |   {
1070 |    "cell_type": "code",
1071 |    "execution_count": 25,
1072 |    "metadata": {},
1073 |    "outputs": [
1074 |     {
1075 |      "data": {
1076 |       "text/html": [
1077 |        "<ol class=list-inline>\n",
1078 |        "\t<li>1543</li>\n",
1079 |        "\t<li>2240</li>\n",
1080 |        "</ol>\n"
1081 |       ],
1082 |       "text/latex": [
1083 |        "\\begin{enumerate*}\n",
1084 |        "\\item 1543\n",
1085 |        "\\item 2240\n",
1086 |        "\\end{enumerate*}\n"
1087 |       ],
1088 |       "text/markdown": [
1089 |        "1. 1543\n",
1090 |        "2. 2240\n",
1091 |        "\n",
1092 |        "\n"
1093 |       ],
1094 |       "text/plain": [
1095 |        "[1] 1543 2240"
1096 |       ]
1097 |      },
1098 |      "metadata": {},
1099 |      "output_type": "display_data"
1100 |     }
1101 |    ],
1102 |    "source": [
1103 |     "dim(df)"
1104 |    ]
1105 |   }
1106 |  ],
1107 |  "metadata": {
1108 |   "kernelspec": {
1109 |    "display_name": "R",
1110 |    "language": "R",
1111 |    "name": "ir"
1112 |   },
1113 |   "language_info": {
1114 |    "codemirror_mode": "r",
1115 |    "file_extension": ".r",
1116 |    "mimetype": "text/x-r-source",
1117 |    "name": "R",
1118 |    "pygments_lexer": "r",
1119 |    "version": "3.4.4"
1120 |   }
1121 |  },
1122 |  "nbformat": 4,
1123 |  "nbformat_minor": 2
1124 | }
1125 | 


--------------------------------------------------------------------------------