├── .gitignore
├── slides.pdf
├── files
├── domains.png
├── profiling.png
├── application.png
└── cell_painting.png
├── LICENSE
├── cytodata-toolkit
├── python
│ └── cytodata.py
├── datasets.csv
└── R
│ ├── Create-Submission_R.Rmd
│ ├── Create-Submission_R-day2.ipynb
│ └── Create-Submission_R-day2-CP.ipynb
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | cytodata-toolkit/R/.ipynb_checkpoints/
2 |
--------------------------------------------------------------------------------
/slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cytodata/cytodata-hackathon-2018/HEAD/slides.pdf
--------------------------------------------------------------------------------
/files/domains.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cytodata/cytodata-hackathon-2018/HEAD/files/domains.png
--------------------------------------------------------------------------------
/files/profiling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cytodata/cytodata-hackathon-2018/HEAD/files/profiling.png
--------------------------------------------------------------------------------
/files/application.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cytodata/cytodata-hackathon-2018/HEAD/files/application.png
--------------------------------------------------------------------------------
/files/cell_painting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cytodata/cytodata-hackathon-2018/HEAD/files/cell_painting.png
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 cytodata
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/cytodata-toolkit/python/cytodata.py:
--------------------------------------------------------------------------------
1 | import urllib
2 | import io
3 | import pandas as pd
4 | from tqdm import tqdm
5 |
6 |
7 | ## dataset_files: Pandas dataframe
8 | ## Index of the input csv files to read, with columns Dataset,Partition,Features,Link
9 |
10 | dataset_files = pd.read_csv("../datasets.csv")
11 |
12 |
def load_dataset(dataset_id, partition, features):
    """Load dataset from any collection of csv files

    Rows of the module-level ``dataset_files`` table are selected where the
    "Dataset", "Partition" and "Features" columns all equal the given
    arguments. The csv file behind every selected "Link" is downloaded and
    the resulting tables are concatenated.

    Parameters
    ----------
    dataset_id: string
        Dataset ID, compared against the "Dataset" column
    partition: string
        Partition can be "Train" or "Test"
    features: string
        Feature type can be "CellProfiler" or "DeepLearning"

    Returns
    -------
    dataframe or None
        All read selected features from the bucket with given ID and
        partition, concatenated with a fresh index. None (after printing a
        message) when no row of ``dataset_files`` matches.

    """
    # A bare `import urllib` does not guarantee that the `request` submodule
    # is loaded; import it explicitly so `urllib.request.urlopen` always
    # resolves.
    import urllib.request

    selected = dataset_files[
        (dataset_files["Dataset"] == dataset_id)
        & (dataset_files["Partition"] == partition)
        & (dataset_files["Features"] == features)
    ]

    if selected.empty:
        print("No such partition {} for dataset {} with features {}".format(partition, dataset_id, features))
        return None

    dataframes = []

    # Iterating the column (rather than a row generator) lets tqdm know how
    # many downloads there are.
    for link in tqdm(selected["Link"]):
        # The context manager closes the HTTP response even if reading fails.
        with urllib.request.urlopen(link) as response:
            data = response.read()
        dataframes.append(pd.read_csv(io.StringIO(data.decode('utf-8'))))

    return pd.concat(dataframes, ignore_index=True)
50 |
51 |
52 |
--------------------------------------------------------------------------------
/cytodata-toolkit/datasets.csv:
--------------------------------------------------------------------------------
1 | Dataset,Partition,Features,Link
2 | BBBC037,Test,CellProfiler,https://s3.amazonaws.com/cytodata/evaluation/TA-ORF-BBBC037-Rohban/profiles_cp/bbbc037_test.csv
3 | BBBC037,Train,CellProfiler,https://s3.amazonaws.com/cytodata/evaluation/TA-ORF-BBBC037-Rohban/profiles_cp/bbbc037_train.csv
4 | BBBC043,Test,CellProfiler,https://s3.amazonaws.com/cytodata/evaluation/LUAD-BBBC043-Caicedo/profiles_cp/bbbc043_test.csv
5 | BBBC043,Train,CellProfiler,https://s3.amazonaws.com/cytodata/evaluation/LUAD-BBBC043-Caicedo/profiles_cp/bbbc043_train.csv
6 | BBBC037,Test,DeepLearning,https://s3.amazonaws.com/cytodata/evaluation/TA-ORF-BBBC037-Rohban/profiles_dp/bbbc037_test.csv
7 | BBBC037,Train,DeepLearning,https://s3.amazonaws.com/cytodata/evaluation/TA-ORF-BBBC037-Rohban/profiles_dp/bbbc037_train.csv
8 | BBBC043,Test,DeepLearning,https://s3.amazonaws.com/cytodata/evaluation/LUAD-BBBC043-Caicedo/profiles_dp/bbbc043_test.csv
9 | BBBC043,Train,DeepLearning,https://s3.amazonaws.com/cytodata/evaluation/LUAD-BBBC043-Caicedo/profiles_dp/bbbc043_train.csv
10 | BBBC036,Test,DeepLearning,https://s3.amazonaws.com/cytodata/evaluation/CDRPBIO-BBBC036-Bray/profiles_dp/bbbc036_test.csv
11 | BBBC036,Train,DeepLearning,https://s3.amazonaws.com/cytodata/evaluation/CDRPBIO-BBBC036-Bray/profiles_dp/bbbc036_train.csv
12 | BBBC036,Test,CellProfiler,https://s3.amazonaws.com/cytodata/evaluation/CDRPBIO-BBBC036-Bray/profiles_cp/bbbc036_test.csv
13 | BBBC036,Train,CellProfiler,https://s3.amazonaws.com/cytodata/evaluation/CDRPBIO-BBBC036-Bray/profiles_cp/bbbc036_train.csv
14 | BBBC022,Test,CellProfiler,https://s3.amazonaws.com/cytodata/evaluation/Bioactives-BBBC022-Gustafsdottir/profiles_cp/bbbc022_test.csv
15 | BBBC022,Train,CellProfiler,https://s3.amazonaws.com/cytodata/evaluation/Bioactives-BBBC022-Gustafsdottir/profiles_cp/bbbc022_train.csv
16 | BBBC022,Test,DeepLearning,https://s3.amazonaws.com/cytodata/evaluation/Bioactives-BBBC022-Gustafsdottir/profiles_dp/bbbc022_test.csv
17 | BBBC022,Train,DeepLearning,https://s3.amazonaws.com/cytodata/evaluation/Bioactives-BBBC022-Gustafsdottir/profiles_dp/bbbc022_train.csv
--------------------------------------------------------------------------------
/cytodata-toolkit/R/Create-Submission_R.Rmd:
--------------------------------------------------------------------------------
1 | ```{r, message=FALSE}
2 | library(tidyverse)
3 | library(cytominer)
4 | library(magrittr)
5 | library(RCurl)
6 | ```
7 |
8 | ```{r, message=FALSE}
# Load one table of profiles listed in ../datasets.csv.
#
# Arguments:
#   partition: value compared against the Partition column of the index
#   dataset:   value compared against the Dataset column of the index
#   feature:   value compared against the Features column of the index
#
# Returns the csv read from the matching Link, with the three arguments
# recorded in the columns Metadata_dataset, Metadata_partition and
# Metadata_features.
load_dataset <- function(partition, dataset, feature) {
  index <- read_csv("../datasets.csv")

  # Keep only the index rows describing the requested file.
  matching <- filter(
    index,
    Partition == partition,
    Dataset == dataset,
    Features == feature
  )
  link <- extract2(matching, "Link")

  profiles <- read_csv(link)

  # Tag every row with where it came from.
  mutate(
    profiles,
    Metadata_dataset = dataset,
    Metadata_partition = partition,
    Metadata_features = feature
  )
}
23 | ```
24 |
25 | # Load data
26 | We load training and test datasets for both genetic perturbation experiments
27 |
28 | ```{r, message=FALSE}
29 | # bbbc37 data
30 | bbbc037_train <- load_dataset("Train","BBBC037","CellProfiler") %>%
31 | mutate(Metadata_x_mutation_status = "none") %>%
32 | filter(str_detect(Metadata_pert_name, "WT") | Metadata_ASSAY_WELL_ROLE %in% c("Untreated", "CTRL"))
33 |
34 | bbbc037_test <- load_dataset("Test","BBBC037","CellProfiler") %>%
35 | mutate(Metadata_x_mutation_status = "none") %>%
36 | filter(str_detect(Metadata_pert_name, "WT") | Metadata_ASSAY_WELL_ROLE %in% c("Untreated", "CTRL"))
37 |
38 | bbbc037 <-
39 | bind_rows(bbbc037_train, bbbc037_test)
40 | ```
41 |
42 | ```{r, message=FALSE}
43 | # bbbc043 data
44 | bbbc043_train <- load_dataset("Train","BBBC043","CellProfiler")
45 |
46 | bbbc043_test <- load_dataset("Test","BBBC043","CellProfiler")
47 |
48 | bbbc043 <- bind_rows(bbbc043_train, bbbc043_test)
49 | ```
50 |
51 | ## Check dimensionality
52 |
53 | ```{r}
54 | dim(bbbc043)
55 | dim(bbbc037)
56 | ```
57 |
58 | ## Extract common features
59 |
60 | ```{r}
61 | colnames_bbbc037 <- colnames(bbbc037)
62 | colnames_bbbc043 <- colnames(bbbc043)
63 |
64 |
65 | Metadata_names_bbbc037 <- c(
66 | stringr::str_subset(colnames_bbbc037, "^Meta")
67 | )
68 |
69 | Metadata_names_bbbc043 <- c(
70 | stringr::str_subset(colnames_bbbc043, "^Meta")
71 | )
72 |
73 | common_metadata <- intersect(Metadata_names_bbbc037, Metadata_names_bbbc043)
74 | common_features <- setdiff(intersect(colnames_bbbc037, colnames_bbbc043),common_metadata)
75 |
76 | ```
77 |
78 | # Concatenate data sets
79 |
80 | ```{r}
81 | population <- bind_rows(
82 | bbbc037 %>%
83 | select(c(common_metadata, common_features)),
84 | bbbc043 %>%
85 | select(c(common_metadata, common_features))
86 | ) %>%
87 | mutate(Metadata_perturbation = "genetic") %>%
88 | select(matches("^Meta"), everything())
89 | ```
90 |
91 | ## Important: update column names!
92 |
93 | ```{r}
94 | colnames_combined <- colnames(population)
95 |
96 | common_metadata <- c(
97 | stringr::str_subset(colnames_combined, "^Meta")
98 | )
99 |
100 | common_features <- setdiff(colnames_combined, common_metadata)
101 | ```
102 |
103 | Cytominer has problems handling column names '1', '2' so we rename them to 'Feature_1', ...
104 |
105 | ```{r}
106 | common_features <- paste0("Feature_",common_features)
107 | colnames(population) <- c(common_metadata, common_features)
108 | ```
109 |
110 | # Normalize data
111 | We use cytominer to normalize both datasets with respect to the controls, i.e. EMPTY genes
112 |
113 | ```{r}
114 | population_normalized <- cytominer::normalize(
115 | population,
116 | variables = common_features,
117 | strata = c("Metadata_perturbation"),
118 | sample = population %>%
119 | filter(
120 | Metadata_gene_name == 'EMPTY',
121 | Metadata_partition == "Train"
122 | ),
123 | operation = "standardize"
124 | )
125 | ```
126 |
127 | ```{r}
128 | population_normalized %>% dim() %>% print
129 | ```
130 |
131 | # Aggregate data
132 |
133 | ```{r}
134 | population_aggregated <- cytominer::aggregate(
135 | population = population_normalized,
136 | variables = common_features,
137 | strata = c("Metadata_gene_name","Metadata_dataset","Metadata_x_mutation_status"),
138 | operation = "mean"
139 | )
140 | ```
141 |
142 | ```{r}
143 | population_normalized %>% extract2("Metadata_gene_name") %>% print
144 | ```
145 |
146 | ```{r}
147 | population_aggregated %>% slice(1:2) %>% print
148 | ```
149 |
150 | # Correlation matrix
151 |
152 | ```{r}
153 | cor_matrix <- cor(
154 | x = population_aggregated %>%
155 | filter(Metadata_dataset == 'BBBC037') %>%
156 | select(common_features) %>%
157 | as.matrix() %>%
158 | t,
159 | y = population_aggregated %>%
160 | filter(Metadata_dataset == 'BBBC043') %>%
161 | select(common_features) %>%
162 | as.matrix() %>%
163 | t,
164 | use = "complete.obs"
165 | )
166 | ```
167 |
168 | # Submission file
169 |
170 | ```{r}
171 | # set column names
172 | colnames(cor_matrix) <- population_aggregated %>%
173 | filter(Metadata_dataset == 'BBBC043') %>%
174 | extract2("Metadata_x_mutation_status")
175 |
176 | # set row names
177 | rownames(cor_matrix) <- population_aggregated %>%
178 | filter(Metadata_dataset == 'BBBC037') %>%
179 | extract2("Metadata_gene_name")
180 |
181 |
182 | df <- cor_matrix %>% as_data_frame() %>%
183 | mutate(Metadata_gene_name = population_aggregated %>%
184 | filter(Metadata_dataset == 'BBBC037') %>%
185 | extract2("Metadata_gene_name")) %>%
186 | select(Metadata_gene_name, everything())
187 |
188 | # write submission file
189 | write.csv(df,"../cytodata-baseline_R.csv",row.names = FALSE)
190 | ```
191 |
192 | ```{r}
193 | ```
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # :microscope: CytoData - 2018 Challenge
2 |
3 | If we want to retrieve "matching" profiles from a large collection of image-based profiling experiments (for example to find similar drugs, similar genes, or drug-gene or drug-disease combinations), how do we ensure that the profiles are aligned well enough?
4 | The CytoData 2018 Challenge addresses this, featuring batch effect correction and cross dataset profile matching :cd: :twisted_rightwards_arrows: :dvd:.
5 | The challenge involves the transformation of signatures using machine learning :space_invader: or statistical methods :bar_chart:.
6 | You will be given two datasets of image-based signatures :cd: :heavy_plus_sign: :dvd: acquired at different times :date: :clock130: and with different experimental conditions :pill: :syringe: with the goal of retrieving correct matches accurately :dart:.
7 | See http://cytodata.org/ for details of the event.
8 |
9 | ## Table of Contents
10 |
11 | [Background](#tv-background)
12 |
13 | [Challenge](#checkered_flag-challenge)
14 |
15 | [Data](#dvd-data)
16 |
17 | [Format](#performing_arts-format)
18 |
19 | [Resources](#wrench-resources)
20 |
21 |
22 | # :tv: Background
23 |
24 | :alien: **: What is image-based profiling?**
25 |
26 | :sunglasses: : In the study of biological systems, microscopy images are used to measure the response of cells to treatments or perturbations.
27 | Cell state can be observed and quantitatively measured using images by following a computational workflow known as profiling.
28 | Single cells are first identified in all images, and then their main characteristics are represented in feature vectors.
29 | The information of a population of cells is aggregated into a single vector, also called profile, containing summary statistics of the features of all cells.
30 | These profiles encode the morphological changes of cell populations exposed to treatments.
31 | Image-based profiles can be used to compare the response of cells to different treatments, and to map their similarities.
32 |
33 |
34 |
35 |
36 |
37 |
38 | :alien: **: Is image-based profiling the same as image-based screening?**
39 |
40 | :sunglasses: : Screening and profiling are different.
41 | Screening uses images to identify phenotype(s) of interest known beforehand.
42 | Profiling measures as many cell properties as possible, using all the phenotypes to identify relationships among multiple different samples.
43 |
44 |
45 | :alien: **: What are the applications of image-based profiling?**
46 |
47 | :sunglasses: : Image-based profiles can be used for drug discovery and functional genomics applications.
48 | There are many types of biological studies that can be conducted using image-based profiling.
49 | In the CytoData challenge, we use data from chemical and genetic perturbation experiments (see below).
50 |
51 |
52 |
53 |
54 |
55 |
56 | :alien: **: What imaging assays can be used for profiling?**
57 |
58 | :sunglasses: : Virtually any imaging assay can be used for profiling, especially high-content assays.
59 | In the 2018 CytoData challenge, we use an imaging assay called Cell Painting, that paints the cells with 6 stains, imaged in 5 channels, highlighting 8 cellular compartments.
60 | This is an unbiased, general purpose assay that maximizes information content for profiling, but the assay can be adapted to meet the needs of a research project.
61 |
62 |
63 |
64 |
65 |
66 |
67 | # :checkered_flag: Challenge
68 |
69 | As in many biological experiments, imaging data may be subject to batch effects and undesired artifacts :scream:.
70 | More specifically, given two batches of microscopy images with the same treatments :pill:, but acquired under different technical conditions :a::vs::b:, a difference in the quantitative measures is likely to be observed :x:.
71 | These differences are not due to meaningful biological variations and can be removed using computational methods :computer:.
72 |
73 | The goal of the challenge :checkered_flag: is to analyze the profiles of two different batches of data :a::b: and design computational methods to correct batch effects :white_check_mark:.
74 | A successful method :trophy: will be able to align the information content of both batches :ab:,
75 | making profiles of the same treatment have similar measurements without distorting the relationships among other treatments :smiley:.
76 | The following metrics will be used to assess the quality of entries :triangular_ruler::
77 |
78 | 1. :arrow_upper_right::arrow_upper_right: Replicate correlation
79 | 2. :top::arrows_counterclockwise: Enrichment of biologically relevant matches in the top connections
80 | 3. :id::white_check_mark: Correct association of treatment type
81 |
82 | ## :bulb: Tip
83 |
84 | From the data analysis perspective, the problem can be formulated in various ways, including
85 | manifold learning, domain adaptation, subspace alignment, and transfer learning.
86 |
87 |
88 |
89 |
90 |
91 |
92 | # :dvd: Data
93 |
94 | We are glad to announce that four datasets will be provided during the CytoData 2018 Challenge :tada::tada::tada::tada:.
95 | All of them were acquired using the Cell Painting assay, at high-throughput, in 384 well plates :microscope:, as part of the research
96 | conducted in the Broad Institute of MIT and Harvard.
97 | The following table describes the experimental details of each dataset.
98 |
99 | | Dataset :dvd: | Type :syringe: :pill: | Number of treatments :hash: | Cell line :cancer: |
100 | |---|---|---|---|
101 | | BBBC037 | Genetic perturbations. ORF over-expression | 200 wild type genes | U2OS |
102 | | BBBC043 | Genetic perturbations. ORF over-expression | 596 alleles of 53 genes | A549 |
103 | | BBBC022 | Chemical perturbations. Bioactive compounds | 1,600 compounds | U2OS |
104 | | BBBC036 | Chemical perturbations. Bioactive compounds | 5,000 compounds | U2OS |
105 |
106 | Notice that two datasets represent genetic perturbations and the other two represent chemical perturbations.
107 | The challenge will consider the cross-dataset matching problem across each of the two pairs :cd::twisted_rightwards_arrows::dvd:,
108 | i.e., profiles in BBBC037 have to be matched with profiles in BBBC043 because both contain genetic perturbations.
109 | Similarly, profiles in BBBC022 have to be matched with profiles in BBBC036 because both contain chemical perturbations.
110 |
111 | The imaging data for all three datasets is more than 3TB of data :boom:, which will be available to everyone during and after the challenge.
112 | However, to facilitate the analysis of treatment profiles and to focus on the cross-dataset matching problem, all the datasets have been processed
113 | beforehand using the profiling workflow described above :sunglasses:.
114 | In particular, two versions of well-level population profiles will be available during the challenge:
115 | 1. Classical features computed with the CellProfiler software using pipelines optimized for Cell Painting images.
116 | 2. Deep learning features computed with a convolutional neural network pretrained on ImageNet.
117 |
118 | ## Data available on AWS
119 |
120 | As of 2024, all data has moved to the [Cell Painting Gallery](https://github.com/broadinstitute/cellpainting-gallery) at `s3://cellpainting-gallery`.
121 | The folder structure has changed slightly from the original structure to comply with [Cell Painting Gallery formatting](https://github.com/broadinstitute/cellpainting-gallery/blob/main/folder_structure.md).
122 |
123 | The datasets have undergone the following renaming in the Cell Painting Gallery:
124 | * Bioactives-BBBC022-Gustafsdottir => `cpg0030-gustafsdottir-cellpainting`
125 | * CDRPBIO-BBBC036-Bray => `cpg0012-wawer-bioactivecompoundprofiling`
126 | * LUAD-BBBC041-Caicedo => `cpg0031-caicedo-cmvip`
127 |
128 |
129 | During the Cytodata hackathon the data was available as Amazon Public Data Set on https://registry.opendata.aws/cell-painting-image-collection/ at `s3://cytodata`.
130 | All image data and extracted single cell features and aggregated profiles were found in `s3://cytodata/datasets/` with the following structure:
131 | ```
132 | .
133 | ├── Bioactives-BBBC022-Gustafsdottir
134 | │ ├── profiles
135 | │ │ └── Bioactives-BBBC022-Gustafsdottir
136 | │ ├── images
137 | │ │ └── Bioactives-BBBC022-Gustafsdottir
138 | │ └── metadata
139 | │ └── Bioactives-BBBC022-Gustafsdottir
140 | ├── CDRPBIO-BBBC036-Bray
141 | │ ├── profiles
142 | │ │ └── CDRPBIO-BBBC036-Bray
143 | │ ├── images
144 | │ │ └── CDRPBIO-BBBC036-Bray
145 | │ └── metadata
146 | │ └── CDRPBIO-BBBC036-Bray
147 | ├── LUAD-BBBC041-Caicedo
148 | │ ├── profiles
149 | │ │ └── LUAD-BBBC041-Caicedo
150 | │ ├── images
151 | │ │ └── LUAD-BBBC041-Caicedo
152 | │ └── metadata
153 | │ └── LUAD-BBBC041-Caicedo
154 | └── TA-ORF-BBBC037-Rohban
155 | ├── profiles
156 | │ └── TA-ORF-BBBC037-Rohban
157 | ├── images
158 | │ └── TA-ORF-BBBC037-Rohban
159 | └── metadata
160 | └── TA-ORF-BBBC037-Rohban
161 | ```
162 |
163 | The subfolders contain the following information:
164 | * the directory `images` contains Cell Painting images as tiff files
165 | * the directory `profiles` contains single cell data in sqlite format and profiles aggregated to replicate level as csv files (aggregated as mean profiles per well)
166 | * the directory `metadata` contains information about the platemaps and the perturbations used.
167 |
168 |
169 | # :performing_arts: Format
170 |
171 | The CytoData 2018 challenge will be a collaborative hackathon :sparkles::computer:, with participants forming teams to discuss and implement solutions to the problem.
172 | The challenge will run for two days only, so participants are encouraged to investigate and plan some solutions before the event starts :pencil:.
173 | In order to meet other participants, we will provide a slack channel to make general announcements and allow participants to organize teams and exchange ideas :bulb:.
174 | It's also a great idea to start discussing methods here in this GitHub repository :octocat::
175 |
176 | ```add issues with relevant links if you want to suggest a methodology and discuss it with other participants!```
177 |
178 |
179 | Teams will have no fewer than three :three: and no more than five :five: participants, ideally from different institutions.
180 | Teams will compete with each other :rage1: to improve the three performance metrics mentioned above :bowling:.
181 | Participants of the team will be able to upload solutions to a scoreboard to check that everything is running properly and to get feedback on performance :ok_hand:.
182 | The best performing solutions will win prizes provided by our sponsors! :trophy::clap:
183 |
184 |
185 | # :wrench: Resources
186 |
187 | The following resources will be provided during the challenge:
188 |
189 | 1. :satellite: Internet connection.
190 | 2. :dvd: Access to all files of the four datasets, including pre-computed profiles.
191 | 3. :octocat: A toolkit, written in R and Python, to load the pre-computed profiles, run a baseline model and create a submission.
192 | 4. :chart_with_upwards_trend: An account in the scoreboard to evaluate the generated submissions.
193 | 5. :computer: Teams will be given access to pre-configured virtual machines in the Amazon Cloud to run experiments.
194 |
195 | Participants of the challenge can make use of their own computational resources (laptops, servers, etc) to run experiments during the challenge.
196 |
--------------------------------------------------------------------------------
/cytodata-toolkit/R/Create-Submission_R-day2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 4,
6 | "metadata": {
7 | "scrolled": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "library(tidyverse)\n",
12 | "library(cytominer)\n",
13 | "library(magrittr)"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 5,
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "load_dataset <- function(partition, dataset,feature){\n",
23 | " file_name <- read_csv(\"../datasets.csv\") \n",
24 | " x <- file_name %>% filter(\n",
25 | " Partition == partition,\n",
26 | " Dataset == dataset,\n",
27 | " Features == feature) %>% \n",
28 | " extract2(\"Link\")\n",
29 | "\n",
30 | " return(read_csv(x) %>% \n",
31 | " mutate(Metadata_dataset = dataset) %>%\n",
32 | " mutate(Metadata_partition = partition) %>% \n",
33 | " mutate(Metadata_features = feature) \n",
34 | " )\n",
35 | " }"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "# Load data \n",
43 | "We load training and test datasets for both genetic perturbation experiments "
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {},
50 | "outputs": [
51 | {
52 | "name": "stderr",
53 | "output_type": "stream",
54 | "text": [
55 | "Parsed with column specification:\n",
56 | "cols(\n",
57 | " Dataset = col_character(),\n",
58 | " Partition = col_character(),\n",
59 | " Features = col_character(),\n",
60 | " Link = col_character()\n",
61 | ")\n"
62 | ]
63 | }
64 | ],
65 | "source": [
66 | "# bbbc036 data \n",
67 | "bbbc036_train <- load_dataset(\"Train\",\"BBBC036\",\"DeepLearning\") %>% \n",
68 | " mutate(Metadata_x_mutation_status = \"none\")\n",
69 | "\n",
70 | "bbbc036_test <- load_dataset(\"Test\",\"BBBC036\",\"DeepLearning\") %>% \n",
71 | " mutate(Metadata_x_mutation_status = \"none\")\n",
72 | "\n",
73 | "bbbc036 <- rbind(bbbc036_train, bbbc036_test)"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "bbbc036_train %>% dim()\n",
83 | "bbbc036_test %>% dim()"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "# bbbc022 data \n",
93 | "bbbc022_train <- load_dataset(\"Train\",\"BBBC022\",\"DeepLearning\")\n",
94 | " \n",
95 | "bbbc022_test <- load_dataset(\"Test\",\"BBBC022\",\"DeepLearning\")\n",
96 | "\n",
97 | "bbbc022 <- rbind(bbbc022_train, bbbc022_test)"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {},
103 | "source": [
104 | "## Check dimensionality"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "dim(bbbc022)\n",
114 | "dim(bbbc036)"
115 | ]
116 | },
117 | {
118 | "cell_type": "markdown",
119 | "metadata": {},
120 | "source": [
121 | "## Extract common features "
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "metadata": {},
128 | "outputs": [],
129 | "source": [
130 | "colnames_bbbc022 <- colnames(bbbc022)\n",
131 | "colnames_bbbc036 <- colnames(bbbc036)\n",
132 | "\n",
133 | "\n",
134 | "Metadata_names_bbbc022 <- c(\n",
135 | " stringr::str_subset(colnames_bbbc022, \"^Meta\")\n",
136 | ") \n",
137 | "\n",
138 | "Metadata_names_bbbc036 <- c(\n",
139 | " stringr::str_subset(colnames_bbbc036, \"^Meta\")\n",
140 | ") \n",
141 | "\n",
142 | "common_metadata <- intersect(Metadata_names_bbbc022, Metadata_names_bbbc036) \n",
143 | "common_features <- setdiff(intersect(colnames_bbbc022, colnames_bbbc036),common_metadata)\n",
144 | "\n",
145 | "colnames_bbbc036 %>% length()"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "metadata": {},
152 | "outputs": [],
153 | "source": [
154 | "bbbc022_na_feature <- cytominer::drop_na_columns(\n",
155 | " population = bbbc022 %>% \n",
156 | " filter(\n",
157 | " Metadata_broad_sample == \"DMSO\"\n",
158 | " ) %>% \n",
159 | " slice(1:100),\n",
160 | " variables = common_features,\n",
161 | " cutoff = 0\n",
162 | " )\n",
163 | "\n",
164 | "#bbbc036_na_feature <- cytominer::drop_na_columns(\n",
165 | "# population = bbbc036,\n",
166 | "# variables = common_features,\n",
167 | "# cutoff = 0\n",
168 | "# )"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "metadata": {},
175 | "outputs": [],
176 | "source": [
177 | "bbbc022_na_feature %>% print"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": null,
183 | "metadata": {},
184 | "outputs": [],
185 | "source": [
186 | "features_to_remove <- cytominer::variance_threshold(\n",
187 | " variables = common_features,\n",
188 | " sample = bbbc022 %>% \n",
189 | " filter(\n",
190 | " Metadata_broad_sample == \"DMSO\"\n",
191 | " ) %>% \n",
192 | " slice(1:100)\n",
193 | ")"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": null,
199 | "metadata": {},
200 | "outputs": [],
201 | "source": []
202 | },
203 | {
204 | "cell_type": "markdown",
205 | "metadata": {},
206 | "source": [
207 | "# Concatenate data sets"
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": null,
213 | "metadata": {},
214 | "outputs": [],
215 | "source": [
216 | "population <- rbind(\n",
217 | " bbbc022 %>% \n",
218 | " select(c(common_metadata, common_features)),\n",
219 | " bbbc036 %>% \n",
220 | " select(c(common_metadata, common_features))\n",
221 | " ) %>% \n",
222 | " mutate(Metadata_perturbation = 'chemical') %>% \n",
223 | " select(Metadata_perturbation, everything())"
224 | ]
225 | },
226 | {
227 | "cell_type": "markdown",
228 | "metadata": {},
229 | "source": [
230 | "## Important: update column names! "
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": null,
236 | "metadata": {},
237 | "outputs": [],
238 | "source": [
239 | "colnames_combined <- colnames(population)\n",
240 | "\n",
241 | "common_metadata <- c(\n",
242 | " stringr::str_subset(colnames_combined, \"^Meta\")\n",
243 | ") \n",
244 | "\n",
245 | "common_features <- setdiff(colnames_combined, common_metadata)\n"
246 | ]
247 | },
248 | {
249 | "cell_type": "markdown",
250 | "metadata": {},
251 | "source": [
252 | "Cytominer has problems handling column names '1', '2' so we rename them to 'Feature_1', ... "
253 | ]
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": null,
258 | "metadata": {},
259 | "outputs": [],
260 | "source": [
261 | "common_features <- paste0(\"Feature_\",common_features)\n",
262 | "colnames(population) <- c(common_metadata, common_features)"
263 | ]
264 | },
265 | {
266 | "cell_type": "markdown",
267 | "metadata": {},
268 | "source": [
269 | "# Normalize data\n",
270 | "We use cytominer to normalize both datasets with respect to the controls, i.e. DMSO samples"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": null,
276 | "metadata": {},
277 | "outputs": [],
278 | "source": [
279 | "population_normalized <- cytominer::normalize(\n",
280 | " population, \n",
281 | " variables = common_features, \n",
282 | " strata = c(\"Metadata_perturbation\"), \n",
283 | " sample = population %>% \n",
284 | " filter(\n",
285 | " Metadata_broad_sample == \"DMSO\"\n",
286 | " ) %>% \n",
287 | " slice(1:100), \n",
288 | " operation = \"standardize\"\n",
289 | ")"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": null,
295 | "metadata": {},
296 | "outputs": [],
297 | "source": [
298 | "population_normalized %>% dim() %>% print"
299 | ]
300 | },
301 | {
302 | "cell_type": "markdown",
303 | "metadata": {},
304 | "source": [
305 | "# Aggregate data "
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": null,
311 | "metadata": {},
312 | "outputs": [],
313 | "source": [
314 | "population_aggregated <- cytominer::aggregate(\n",
315 | " population = population_normalized, \n",
316 | " variables = common_features, \n",
317 | " strata = c(\"Metadata_broad_sample\",\"Metadata_dataset\"), \n",
318 | " operation = \"mean\"\n",
319 | ") "
320 | ]
321 | },
322 | {
323 | "cell_type": "code",
324 | "execution_count": null,
325 | "metadata": {},
326 | "outputs": [],
327 | "source": [
328 | "population_normalized %>% extract2(\"Metadata_broad_sample\") %>% print"
329 | ]
330 | },
331 | {
332 | "cell_type": "code",
333 | "execution_count": null,
334 | "metadata": {},
335 | "outputs": [],
336 | "source": [
337 | "population_aggregated %>% slice(1:2) %>% print"
338 | ]
339 | },
340 | {
341 | "cell_type": "markdown",
342 | "metadata": {},
343 | "source": [
344 | "# Correlation matrix "
345 | ]
346 | },
347 | {
348 | "cell_type": "code",
349 | "execution_count": null,
350 | "metadata": {},
351 | "outputs": [],
352 | "source": [
353 | "cor_matrix <- cor(\n",
354 | " x = population_aggregated %>% \n",
355 | " filter(Metadata_dataset == 'BBBC022') %>% \n",
356 | " select(common_features) %>% \n",
357 | " as.matrix() %>% \n",
358 | " t, \n",
359 | " y = population_aggregated %>% \n",
360 | " filter(Metadata_dataset == 'BBBC036') %>% \n",
361 | " select(common_features) %>% \n",
362 | " as.matrix() %>% \n",
363 | " t,\n",
364 | " use = \"complete.obs\"\n",
365 | " ) \n"
366 | ]
367 | },
368 | {
369 | "cell_type": "markdown",
370 | "metadata": {},
371 | "source": [
372 | "# Submission file "
373 | ]
374 | },
375 | {
376 | "cell_type": "code",
377 | "execution_count": null,
378 | "metadata": {},
379 | "outputs": [],
380 | "source": [
381 | "# set column names \n",
382 | "colnames(cor_matrix) <- population_aggregated %>% \n",
383 | " filter(Metadata_dataset == 'BBBC036') %>%\n",
384 | " extract2(\"Metadata_pert_id\")\n",
385 | "\n",
386 | "# set row names \n",
387 | "#rownames(cor_matrix) <- population_aggregated %>% \n",
388 | "# filter(Metadata_dataset == 'BBBC036') %>%\n",
389 | "# extract2(\"Metadata_broad_sample\")#\n"
390 | ]
391 | },
392 | {
393 | "cell_type": "code",
394 | "execution_count": null,
395 | "metadata": {},
396 | "outputs": [],
397 | "source": [
398 | "df <- cor_matrix %>% as_data_frame() %>% \n",
399 | " mutate(Metadata_pert_id = population_aggregated %>% \n",
400 | " filter(Metadata_dataset == 'BBBC022') %>%\n",
401 | " extract2(\"Metadata_pert_id\")) %>% \n",
402 | " select(Metadata_pert_id, everything())\n",
403 | "\n",
404 | "# write submission file\n",
405 | "write.csv(df,\"../cytodata-baseline_R_day_2.csv\",row.names = FALSE)"
406 | ]
407 | },
408 | {
409 | "cell_type": "code",
410 | "execution_count": null,
411 | "metadata": {},
412 | "outputs": [],
413 | "source": [
414 | "df %>% print"
415 | ]
416 | }
417 | ],
418 | "metadata": {
419 | "kernelspec": {
420 | "display_name": "R",
421 | "language": "R",
422 | "name": "ir"
423 | },
424 | "language_info": {
425 | "codemirror_mode": "r",
426 | "file_extension": ".r",
427 | "mimetype": "text/x-r-source",
428 | "name": "R",
429 | "pygments_lexer": "r",
430 | "version": "3.4.4"
431 | }
432 | },
433 | "nbformat": 4,
434 | "nbformat_minor": 2
435 | }
436 |
--------------------------------------------------------------------------------
/cytodata-toolkit/R/Create-Submission_R-day2-CP.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "scrolled": true
8 | },
9 | "outputs": [
10 | {
11 | "name": "stderr",
12 | "output_type": "stream",
13 | "text": [
14 | "── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──\n",
15 | "✔ ggplot2 3.0.0 ✔ purrr 0.2.5\n",
16 | "✔ tibble 1.4.2 ✔ dplyr 0.7.6\n",
17 | "✔ tidyr 0.8.1 ✔ stringr 1.3.1\n",
18 | "✔ readr 1.1.1 ✔ forcats 0.3.0\n",
19 | "── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──\n",
20 | "✖ dplyr::filter() masks stats::filter()\n",
21 | "✖ dplyr::lag() masks stats::lag()\n",
22 | "\n",
23 | "Attaching package: ‘cytominer’\n",
24 | "\n",
25 | "The following object is masked from ‘package:stats’:\n",
26 | "\n",
27 | " aggregate\n",
28 | "\n",
29 | "The following object is masked from ‘package:base’:\n",
30 | "\n",
31 | " transform\n",
32 | "\n",
33 | "\n",
34 | "Attaching package: ‘magrittr’\n",
35 | "\n",
36 | "The following object is masked from ‘package:purrr’:\n",
37 | "\n",
38 | " set_names\n",
39 | "\n",
40 | "The following object is masked from ‘package:tidyr’:\n",
41 | "\n",
42 | " extract\n",
43 | "\n"
44 | ]
45 | }
46 | ],
47 | "source": [
48 | "library(tidyverse)\n",
49 | "library(cytominer)\n",
50 | "library(magrittr)"
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {},
56 | "source": [
57 | "# Function to load different data sets\n",
58 | "This function also adds the Metadata columns Metadata_dataset, Metadata_partition and Metadata_features."
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 2,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "load_dataset <- function(partition, dataset,feature){\n",
68 | " file_name <- read_csv(\"../datasets.csv\") \n",
69 | " x <- file_name %>% filter(\n",
70 | " Partition == partition,\n",
71 | " Dataset == dataset,\n",
72 | " Features == feature) %>% \n",
73 | " extract2(\"Link\")\n",
74 | "\n",
75 | " return(read_csv(x) %>% \n",
76 | " mutate(Metadata_dataset = dataset) %>%\n",
77 | " mutate(Metadata_partition = partition) %>% \n",
78 | " mutate(Metadata_features = feature) \n",
79 | " )\n",
80 | " }"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "metadata": {},
86 | "source": [
87 | "# Load data \n",
88 | "We load the training and test partitions of the BBBC036 / CDRP data set and select only the important Metadata columns."
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 3,
94 | "metadata": {
95 | "scrolled": true
96 | },
97 | "outputs": [
98 | {
99 | "name": "stderr",
100 | "output_type": "stream",
101 | "text": [
102 | "Parsed with column specification:\n",
103 | "cols(\n",
104 | " Dataset = col_character(),\n",
105 | " Partition = col_character(),\n",
106 | " Features = col_character(),\n",
107 | " Link = col_character()\n",
108 | ")\n",
109 | "Parsed with column specification:\n",
110 | "cols(\n",
111 | " .default = col_double(),\n",
112 | " Metadata_Plate = col_integer(),\n",
113 | " Metadata_Well = col_character(),\n",
114 | " Metadata_Assay_Plate_Barcode = col_integer(),\n",
115 | " Metadata_Plate_Map_Name = col_character(),\n",
116 | " Metadata_well_position = col_character(),\n",
117 | " Metadata_ASSAY_WELL_ROLE = col_character(),\n",
118 | " Metadata_broad_sample = col_character(),\n",
119 | " Metadata_solvent = col_character(),\n",
120 | " Metadata_pert_id = col_character(),\n",
121 | " Metadata_pert_mfc_id = col_character(),\n",
122 | " Metadata_pert_well = col_character(),\n",
123 | " Metadata_pert_id_vendor = col_character(),\n",
124 | " Metadata_cell_id = col_character(),\n",
125 | " Metadata_broad_sample_type = col_character(),\n",
126 | " Metadata_pert_vehicle = col_character(),\n",
127 | " Metadata_pert_type = col_character(),\n",
128 | " Cells_AreaShape_EulerNumber = col_integer(),\n",
129 | " Cells_Children_Cytoplasm_Count = col_integer(),\n",
130 | " Cells_Neighbors_FirstClosestObjectNumber_5 = col_integer(),\n",
131 | " Cells_Neighbors_FirstClosestObjectNumber_Adjacent = col_integer()\n",
132 | " # ... with 20 more columns\n",
133 | ")\n",
134 | "See spec(...) for full column specifications.\n",
135 | "Warning message in rbind(names(probs), probs_f):\n",
136 | "“number of columns of result is not a multiple of vector length (arg 1)”Warning message:\n",
137 | "“2201 parsing failures.\n",
138 | "row # A tibble: 5 x 5 col row col expected actual file expected actual 1 1013 Cells_Neighbors_Fir… no trailing … .5 'https://s3.amazonaws.com/cy… file 2 1013 Cells_Neighbors_Fir… no trailing … .5 'https://s3.amazonaws.com/cy… row 3 1013 Nuclei_AreaShape_Ar… no trailing … .5 'https://s3.amazonaws.com/cy… col 4 1013 Nuclei_Neighbors_Se… no trailing … .5 'https://s3.amazonaws.com/cy… expected 5 1017 Nuclei_AreaShape_Ar… no trailing … .5 'https://s3.amazonaws.com/cy…\n",
139 | "... ................. ... ............................................................................... ........ ............................................................................... ...... ............................................................................... .... ............................................................................... ... ............................................................................... ... ............................................................................... ........ ...............................................................................\n",
140 | "See problems(...) for more details.\n",
141 | "”Parsed with column specification:\n",
142 | "cols(\n",
143 | " Dataset = col_character(),\n",
144 | " Partition = col_character(),\n",
145 | " Features = col_character(),\n",
146 | " Link = col_character()\n",
147 | ")\n",
148 | "Parsed with column specification:\n",
149 | "cols(\n",
150 | " .default = col_double(),\n",
151 | " Metadata_Plate = col_integer(),\n",
152 | " Metadata_Well = col_character(),\n",
153 | " Metadata_Assay_Plate_Barcode = col_integer(),\n",
154 | " Metadata_Plate_Map_Name = col_character(),\n",
155 | " Metadata_well_position = col_character(),\n",
156 | " Metadata_ASSAY_WELL_ROLE = col_character(),\n",
157 | " Metadata_broad_sample = col_character(),\n",
158 | " Metadata_solvent = col_character(),\n",
159 | " Metadata_pert_id = col_character(),\n",
160 | " Metadata_pert_mfc_id = col_character(),\n",
161 | " Metadata_pert_well = col_character(),\n",
162 | " Metadata_pert_id_vendor = col_character(),\n",
163 | " Metadata_cell_id = col_character(),\n",
164 | " Metadata_broad_sample_type = col_character(),\n",
165 | " Metadata_pert_vehicle = col_character(),\n",
166 | " Metadata_pert_type = col_character(),\n",
167 | " Cells_AreaShape_EulerNumber = col_integer(),\n",
168 | " Cells_Children_Cytoplasm_Count = col_integer(),\n",
169 | " Cytoplasm_AreaShape_EulerNumber = col_integer(),\n",
170 | " Cytoplasm_Correlation_Manders_AGP_DNA = col_integer()\n",
171 | " # ... with 31 more columns\n",
172 | ")\n",
173 | "See spec(...) for full column specifications.\n",
174 | "Warning message in rbind(names(probs), probs_f):\n",
175 | "“number of columns of result is not a multiple of vector length (arg 1)”Warning message:\n",
176 | "“33 parsing failures.\n",
177 | "row # A tibble: 5 x 5 col row col expected actual file expected actual 1 1103 Nuclei_Correlat… no trailing … .8570798… 'https://s3.amazonaws.com/cyt… file 2 1103 Nuclei_Correlat… no trailing … .5790690… 'https://s3.amazonaws.com/cyt… row 3 1103 Nuclei_Correlat… no trailing … .9671379… 'https://s3.amazonaws.com/cyt… col 4 1103 Nuclei_Correlat… no trailing … .6148584… 'https://s3.amazonaws.com/cyt… expected 5 1104 Nuclei_Correlat… no trailing … .9769246… 'https://s3.amazonaws.com/cyt…\n",
178 | "... ................. ... ............................................................................... ........ ............................................................................... ...... ............................................................................... .... ............................................................................... ... ............................................................................... ... ............................................................................... ........ ...............................................................................\n",
179 | "See problems(...) for more details.\n",
180 | "”"
181 | ]
182 | }
183 | ],
184 | "source": [
185 | "# bbbc36 data \n",
186 | "bbbc036_train <- load_dataset(\"Train\",\"BBBC036\",\"CellProfiler\") %>% \n",
187 | " mutate(Metadata_x_mutation_status = \"none\")\n",
188 | "\n",
189 | "bbbc036_test <- load_dataset(\"Test\",\"BBBC036\",\"CellProfiler\") %>% \n",
190 | " mutate(Metadata_x_mutation_status = \"none\")\n"
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": 4,
196 | "metadata": {},
197 | "outputs": [],
198 | "source": [
199 | "bbbc036 <- rbind(bbbc036_train, bbbc036_test) "
200 | ]
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "metadata": {},
205 | "source": [
206 | "# How large are the data sets? "
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": 5,
212 | "metadata": {},
213 | "outputs": [
214 | {
215 | "data": {
216 | "text/html": [
217 | "<ol class=list-inline>\n",
218 | "\t<li>18929</li>\n",
219 | "\t<li>1805</li>\n",
220 | "</ol>\n"
221 | ],
222 | "text/latex": [
223 | "\\begin{enumerate*}\n",
224 | "\\item 18929\n",
225 | "\\item 1805\n",
226 | "\\end{enumerate*}\n"
227 | ],
228 | "text/markdown": [
229 | "1. 18929\n",
230 | "2. 1805\n",
231 | "\n",
232 | "\n"
233 | ],
234 | "text/plain": [
235 | "[1] 18929 1805"
236 | ]
237 | },
238 | "metadata": {},
239 | "output_type": "display_data"
240 | },
241 | {
242 | "data": {
243 | "text/html": [
244 | "<ol class=list-inline>\n",
245 | "\t<li>2177</li>\n",
246 | "\t<li>1805</li>\n",
247 | "</ol>\n"
248 | ],
249 | "text/latex": [
250 | "\\begin{enumerate*}\n",
251 | "\\item 2177\n",
252 | "\\item 1805\n",
253 | "\\end{enumerate*}\n"
254 | ],
255 | "text/markdown": [
256 | "1. 2177\n",
257 | "2. 1805\n",
258 | "\n",
259 | "\n"
260 | ],
261 | "text/plain": [
262 | "[1] 2177 1805"
263 | ]
264 | },
265 | "metadata": {},
266 | "output_type": "display_data"
267 | }
268 | ],
269 | "source": [
270 | "bbbc036_train %>% dim()\n",
271 | "bbbc036_test %>% dim()"
272 | ]
273 | },
274 | {
275 | "cell_type": "markdown",
276 | "metadata": {},
277 | "source": [
278 | "# What are the Metadata columns?"
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": 6,
284 | "metadata": {},
285 | "outputs": [
286 | {
287 | "name": "stdout",
288 | "output_type": "stream",
289 | "text": [
290 | "# A tibble: 5 x 4\n",
291 | " Metadata_Plate Metadata_Well Metadata_Plate_Map_Name Metadata_pert_id\n",
292 | " <int> <chr> <chr> <chr> \n",
293 | "1 24277 a01 H-BIOA-004-3 BRD-K18250272 \n",
294 | "2 24277 a02 H-BIOA-004-3 BRD-K18316707 \n",
295 | "3 24277 a03 H-BIOA-004-3 BRD-K18438502 \n",
296 | "4 24277 a04 H-BIOA-004-3 BRD-K18550767 \n",
297 | "5 24277 a05 H-BIOA-004-3 BRD-K18574842 \n"
298 | ]
299 | }
300 | ],
301 | "source": [
302 | "bbbc036 %>% \n",
303 | " select(Metadata_Plate, Metadata_Well, Metadata_Plate_Map_Name, Metadata_pert_id) %>% \n",
304 | " slice(1:5) %>% \n",
305 | " print()"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": 27,
311 | "metadata": {},
312 | "outputs": [
313 | {
314 | "name": "stdout",
315 | "output_type": "stream",
316 | "text": [
317 | "# A tibble: 5 x 4\n",
318 | " Metadata_pert_id Metadata_broad_sample_ty… Metadata_dataset Metadata_partiti…\n",
319 | " <chr> <chr> <chr> <chr> \n",
320 | "1 BRD-K18250272 trt BBBC036 Train \n",
321 | "2 BRD-K18316707 trt BBBC036 Train \n",
322 | "3 BRD-K18438502 trt BBBC036 Train \n",
323 | "4 BRD-K18550767 trt BBBC036 Train \n",
324 | "5 BRD-K18574842 trt BBBC036 Train \n"
325 | ]
326 | }
327 | ],
328 | "source": [
329 | "bbbc036 %>% \n",
330 | " select( Metadata_pert_id, Metadata_broad_sample_type,Metadata_dataset, Metadata_partition) %>% \n",
331 | " slice(1:5) %>% \n",
332 | " print()"
333 | ]
334 | },
335 | {
336 | "cell_type": "markdown",
337 | "metadata": {},
338 | "source": [
339 | "# How many replicates do we have? "
340 | ]
341 | },
342 | {
343 | "cell_type": "code",
344 | "execution_count": 8,
345 | "metadata": {},
346 | "outputs": [
347 | {
348 | "data": {
349 | "text/html": [
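350 | "<table>\n",
351 | "<thead><tr><th scope=col>Metadata_Plate_Map_Name</th><th scope=col>mean_replicates</th></tr></thead>\n",
352 | "<tbody>\n",
353 | "\t<tr><td>H-BIOA-001-3</td><td>9.600000</td></tr>\n",
354 | "\t<tr><td>H-BIOA-002-3</td><td>8.373832</td></tr>\n",
355 | "\t<tr><td>H-BIOA-003-3</td><td>9.554517</td></tr>\n",
356 | "\t<tr><td>H-BIOA-004-3</td><td>9.570093</td></tr>\n",
357 | "\t<tr><td>H-BIOA-005-3</td><td>9.563863</td></tr>\n",
358 | "\t<tr><td>H-BIOA-006-3</td><td>9.554517</td></tr>\n",
359 | "\t<tr><td>H-BIOA-007-3</td><td>9.593750</td></tr>\n",
360 | "</tbody>\n",
361 | "</table>\n"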
362 | ],
363 | "text/latex": [
364 | "\\begin{tabular}{r|ll}\n",
365 | " Metadata\\_Plate\\_Map\\_Name & mean\\_replicates\\\\\n",
366 | "\\hline\n",
367 | "\t H-BIOA-001-3 & 9.600000 \\\\\n",
368 | "\t H-BIOA-002-3 & 8.373832 \\\\\n",
369 | "\t H-BIOA-003-3 & 9.554517 \\\\\n",
370 | "\t H-BIOA-004-3 & 9.570093 \\\\\n",
371 | "\t H-BIOA-005-3 & 9.563863 \\\\\n",
372 | "\t H-BIOA-006-3 & 9.554517 \\\\\n",
373 | "\t H-BIOA-007-3 & 9.593750 \\\\\n",
374 | "\\end{tabular}\n"
375 | ],
376 | "text/markdown": [
377 | "\n",
378 | "Metadata_Plate_Map_Name | mean_replicates | \n",
379 | "|---|---|---|---|---|---|---|\n",
380 | "| H-BIOA-001-3 | 9.600000 | \n",
381 | "| H-BIOA-002-3 | 8.373832 | \n",
382 | "| H-BIOA-003-3 | 9.554517 | \n",
383 | "| H-BIOA-004-3 | 9.570093 | \n",
384 | "| H-BIOA-005-3 | 9.563863 | \n",
385 | "| H-BIOA-006-3 | 9.554517 | \n",
386 | "| H-BIOA-007-3 | 9.593750 | \n",
387 | "\n",
388 | "\n"
389 | ],
390 | "text/plain": [
391 | " Metadata_Plate_Map_Name mean_replicates\n",
392 | "1 H-BIOA-001-3 9.600000 \n",
393 | "2 H-BIOA-002-3 8.373832 \n",
394 | "3 H-BIOA-003-3 9.554517 \n",
395 | "4 H-BIOA-004-3 9.570093 \n",
396 | "5 H-BIOA-005-3 9.563863 \n",
397 | "6 H-BIOA-006-3 9.554517 \n",
398 | "7 H-BIOA-007-3 9.593750 "
399 | ]
400 | },
401 | "metadata": {},
402 | "output_type": "display_data"
403 | }
404 | ],
405 | "source": [
406 | "bbbc036 %>% \n",
407 | " group_by(Metadata_Plate_Map_Name, Metadata_pert_id) %>%\n",
408 | " summarise(n_groups = n()) %>%\n",
409 | " summarise(mean_replicates = mean(n_groups))"
410 | ]
411 | },
412 | {
413 | "cell_type": "code",
414 | "execution_count": 9,
415 | "metadata": {
416 | "scrolled": true
417 | },
418 | "outputs": [
419 | {
420 | "name": "stderr",
421 | "output_type": "stream",
422 | "text": [
423 | "Parsed with column specification:\n",
424 | "cols(\n",
425 | " Dataset = col_character(),\n",
426 | " Partition = col_character(),\n",
427 | " Features = col_character(),\n",
428 | " Link = col_character()\n",
429 | ")\n",
430 | "Parsed with column specification:\n",
431 | "cols(\n",
432 | " .default = col_double(),\n",
433 | " Metadata_Plate = col_integer(),\n",
434 | " Metadata_Well = col_character(),\n",
435 | " Metadata_Assay_Plate_Barcode = col_integer(),\n",
436 | " Metadata_Plate_Map_Name = col_character(),\n",
437 | " Metadata_well_position = col_character(),\n",
438 | " Metadata_broad_sample = col_character(),\n",
439 | " Metadata_source_name = col_character(),\n",
440 | " Metadata_compound_name = col_character(),\n",
441 | " Metadata_smiles = col_character(),\n",
442 | " Metadata_solvent = col_character(),\n",
443 | " Metadata_pert_id = col_character(),\n",
444 | " Metadata_pert_mfc_id = col_character(),\n",
445 | " Metadata_pert_well = col_character(),\n",
446 | " Metadata_pert_id_vendor = col_character(),\n",
447 | " Metadata_cell_id = col_character(),\n",
448 | " Metadata_broad_sample_type = col_character(),\n",
449 | " Metadata_pert_vehicle = col_character(),\n",
450 | " Metadata_pert_type = col_character(),\n",
451 | " Metadata_exp = col_character()\n",
452 | ")\n",
453 | "See spec(...) for full column specifications.\n",
454 | "Parsed with column specification:\n",
455 | "cols(\n",
456 | " Dataset = col_character(),\n",
457 | " Partition = col_character(),\n",
458 | " Features = col_character(),\n",
459 | " Link = col_character()\n",
460 | ")\n",
461 | "Parsed with column specification:\n",
462 | "cols(\n",
463 | " .default = col_double(),\n",
464 | " Metadata_Plate = col_integer(),\n",
465 | " Metadata_Well = col_character(),\n",
466 | " Metadata_Assay_Plate_Barcode = col_integer(),\n",
467 | " Metadata_Plate_Map_Name = col_character(),\n",
468 | " Metadata_well_position = col_character(),\n",
469 | " Metadata_broad_sample = col_character(),\n",
470 | " Metadata_source_name = col_character(),\n",
471 | " Metadata_compound_name = col_character(),\n",
472 | " Metadata_smiles = col_character(),\n",
473 | " Metadata_solvent = col_character(),\n",
474 | " Metadata_pert_id = col_character(),\n",
475 | " Metadata_pert_mfc_id = col_character(),\n",
476 | " Metadata_pert_well = col_character(),\n",
477 | " Metadata_pert_id_vendor = col_character(),\n",
478 | " Metadata_cell_id = col_character(),\n",
479 | " Metadata_broad_sample_type = col_character(),\n",
480 | " Metadata_pert_vehicle = col_character(),\n",
481 | " Metadata_pert_type = col_character(),\n",
482 | " Metadata_exp = col_character()\n",
483 | ")\n",
484 | "See spec(...) for full column specifications.\n"
485 | ]
486 | }
487 | ],
488 | "source": [
489 | "# bbbc022 data \n",
490 | "bbbc022_train <- load_dataset(\"Train\",\"BBBC022\",\"CellProfiler\")\n",
491 | " \n",
492 | "bbbc022_test <- load_dataset(\"Test\",\"BBBC022\",\"CellProfiler\")\n",
493 | "\n",
494 | "bbbc022 <- rbind(bbbc022_train, bbbc022_test) %>%\n",
495 | " select(Metadata_Plate, Metadata_Well, Metadata_Plate_Map_Name, \n",
496 | " Metadata_pert_id, Metadata_broad_sample_type,\n",
497 | " Metadata_dataset,Metadata_partition, \n",
498 | " everything()\n",
499 | " ) "
500 | ]
501 | },
502 | {
503 | "cell_type": "markdown",
504 | "metadata": {},
505 | "source": [
506 | "# How large are the training and test partitions for BBBC022?"
507 | ]
508 | },
509 | {
510 | "cell_type": "code",
511 | "execution_count": 10,
512 | "metadata": {},
513 | "outputs": [
514 | {
515 | "data": {
516 | "text/html": [
517 | "<ol class=list-inline>\n",
518 | "\t<li>6462</li>\n",
519 | "\t<li>1806</li>\n",
520 | "</ol>\n"
521 | ],
522 | "text/latex": [
523 | "\\begin{enumerate*}\n",
524 | "\\item 6462\n",
525 | "\\item 1806\n",
526 | "\\end{enumerate*}\n"
527 | ],
528 | "text/markdown": [
529 | "1. 6462\n",
530 | "2. 1806\n",
531 | "\n",
532 | "\n"
533 | ],
534 | "text/plain": [
535 | "[1] 6462 1806"
536 | ]
537 | },
538 | "metadata": {},
539 | "output_type": "display_data"
540 | },
541 | {
542 | "data": {
543 | "text/html": [
544 | "<ol class=list-inline>\n",
545 | "\t<li>1120</li>\n",
546 | "\t<li>1806</li>\n",
547 | "</ol>\n"
548 | ],
549 | "text/latex": [
550 | "\\begin{enumerate*}\n",
551 | "\\item 1120\n",
552 | "\\item 1806\n",
553 | "\\end{enumerate*}\n"
554 | ],
555 | "text/markdown": [
556 | "1. 1120\n",
557 | "2. 1806\n",
558 | "\n",
559 | "\n"
560 | ],
561 | "text/plain": [
562 | "[1] 1120 1806"
563 | ]
564 | },
565 | "metadata": {},
566 | "output_type": "display_data"
567 | }
568 | ],
569 | "source": [
570 | "bbbc022_train %>% dim()\n",
571 | "bbbc022_test %>% dim()"
572 | ]
573 | },
574 | {
575 | "cell_type": "code",
576 | "execution_count": 11,
577 | "metadata": {},
578 | "outputs": [
579 | {
580 | "data": {
581 | "text/html": [
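582 | "<table>\n",
583 | "<thead><tr><th scope=col>Metadata_Plate_Map_Name</th><th scope=col>mean_replicates</th></tr></thead>\n",
584 | "<tbody>\n",
585 | "\t<tr><td>H-BIOA-002-1</td><td>4.860759</td></tr>\n",
586 | "\t<tr><td>H-BIOA-003-1</td><td>4.780255</td></tr>\n",
587 | "\t<tr><td>H-BIOA-004-1</td><td>4.830128</td></tr>\n",
588 | "\t<tr><td>H-BIOA-005-1</td><td>4.816456</td></tr>\n",
589 | "\t<tr><td>H-BIOA-006-1</td><td>4.797468</td></tr>\n",
590 | "</tbody>\n",
591 | "</table>\n"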
592 | ],
593 | "text/latex": [
594 | "\\begin{tabular}{r|ll}\n",
595 | " Metadata\\_Plate\\_Map\\_Name & mean\\_replicates\\\\\n",
596 | "\\hline\n",
597 | "\t H-BIOA-002-1 & 4.860759 \\\\\n",
598 | "\t H-BIOA-003-1 & 4.780255 \\\\\n",
599 | "\t H-BIOA-004-1 & 4.830128 \\\\\n",
600 | "\t H-BIOA-005-1 & 4.816456 \\\\\n",
601 | "\t H-BIOA-006-1 & 4.797468 \\\\\n",
602 | "\\end{tabular}\n"
603 | ],
604 | "text/markdown": [
605 | "\n",
606 | "Metadata_Plate_Map_Name | mean_replicates | \n",
607 | "|---|---|---|---|---|\n",
608 | "| H-BIOA-002-1 | 4.860759 | \n",
609 | "| H-BIOA-003-1 | 4.780255 | \n",
610 | "| H-BIOA-004-1 | 4.830128 | \n",
611 | "| H-BIOA-005-1 | 4.816456 | \n",
612 | "| H-BIOA-006-1 | 4.797468 | \n",
613 | "\n",
614 | "\n"
615 | ],
616 | "text/plain": [
617 | " Metadata_Plate_Map_Name mean_replicates\n",
618 | "1 H-BIOA-002-1 4.860759 \n",
619 | "2 H-BIOA-003-1 4.780255 \n",
620 | "3 H-BIOA-004-1 4.830128 \n",
621 | "4 H-BIOA-005-1 4.816456 \n",
622 | "5 H-BIOA-006-1 4.797468 "
623 | ]
624 | },
625 | "metadata": {},
626 | "output_type": "display_data"
627 | }
628 | ],
629 | "source": [
630 | "bbbc022 %>% \n",
631 | " group_by(Metadata_Plate_Map_Name, Metadata_pert_id) %>%\n",
632 | " summarise(n_groups = n()) %>%\n",
633 | " summarise(mean_replicates = mean(n_groups))"
634 | ]
635 | },
636 | {
637 | "cell_type": "markdown",
638 | "metadata": {},
639 | "source": [
640 | "# How large are the combined data sets? "
641 | ]
642 | },
643 | {
644 | "cell_type": "code",
645 | "execution_count": 12,
646 | "metadata": {},
647 | "outputs": [
648 | {
649 | "data": {
650 | "text/html": [
651 | "<ol class=list-inline>\n",
652 | "\t<li>7582</li>\n",
653 | "\t<li>1806</li>\n",
654 | "</ol>\n"
655 | ],
656 | "text/latex": [
657 | "\\begin{enumerate*}\n",
658 | "\\item 7582\n",
659 | "\\item 1806\n",
660 | "\\end{enumerate*}\n"
661 | ],
662 | "text/markdown": [
663 | "1. 7582\n",
664 | "2. 1806\n",
665 | "\n",
666 | "\n"
667 | ],
668 | "text/plain": [
669 | "[1] 7582 1806"
670 | ]
671 | },
672 | "metadata": {},
673 | "output_type": "display_data"
674 | },
675 | {
676 | "data": {
677 | "text/html": [
678 | "<ol class=list-inline>\n",
679 | "\t<li>21106</li>\n",
680 | "\t<li>1805</li>\n",
681 | "</ol>\n"
682 | ],
683 | "text/latex": [
684 | "\\begin{enumerate*}\n",
685 | "\\item 21106\n",
686 | "\\item 1805\n",
687 | "\\end{enumerate*}\n"
688 | ],
689 | "text/markdown": [
690 | "1. 21106\n",
691 | "2. 1805\n",
692 | "\n",
693 | "\n"
694 | ],
695 | "text/plain": [
696 | "[1] 21106 1805"
697 | ]
698 | },
699 | "metadata": {},
700 | "output_type": "display_data"
701 | }
702 | ],
703 | "source": [
704 | "dim(bbbc022)\n",
705 | "dim(bbbc036)"
706 | ]
707 | },
708 | {
709 | "cell_type": "markdown",
710 | "metadata": {},
711 | "source": [
712 | "## Extract common features and common metadata"
713 | ]
714 | },
715 | {
716 | "cell_type": "code",
717 | "execution_count": 13,
718 | "metadata": {},
719 | "outputs": [],
720 | "source": [
721 | "colnames_bbbc022 <- colnames(bbbc022)\n",
722 | "colnames_bbbc036 <- colnames(bbbc036)\n",
723 | "\n",
724 | "Metadata_names_bbbc022 <- c(\n",
725 | " stringr::str_subset(colnames_bbbc022, \"^Meta\")\n",
726 | ") \n",
727 | "\n",
728 | "Metadata_names_bbbc036 <- c(\n",
729 | " stringr::str_subset(colnames_bbbc036, \"^Meta\")\n",
730 | ") \n",
731 | "\n",
732 | "common_metadata <- intersect(Metadata_names_bbbc022, Metadata_names_bbbc036) \n",
733 | "common_features <- setdiff(intersect(colnames_bbbc022, colnames_bbbc036),common_metadata)"
734 | ]
735 | },
736 | {
737 | "cell_type": "markdown",
738 | "metadata": {},
739 | "source": [
740 | "# Concatenate data sets"
741 | ]
742 | },
743 | {
744 | "cell_type": "code",
745 | "execution_count": 14,
746 | "metadata": {},
747 | "outputs": [],
748 | "source": [
749 | "population <- rbind(\n",
750 | " bbbc022 %>% \n",
751 | " select(c(common_metadata, common_features)),\n",
752 | " bbbc036 %>% \n",
753 | " select(c(common_metadata, common_features))\n",
754 | " ) %>% \n",
755 | " mutate(Metadata_perturbation = 'chemical') %>% \n",
756 | " select(Metadata_perturbation, everything())"
757 | ]
758 | },
759 | {
760 | "cell_type": "markdown",
761 | "metadata": {},
762 | "source": [
763 | "## Important: update column names! "
764 | ]
765 | },
766 | {
767 | "cell_type": "code",
768 | "execution_count": 15,
769 | "metadata": {},
770 | "outputs": [],
771 | "source": [
772 | "colnames_combined <- colnames(population)\n",
773 | "\n",
774 | "common_metadata <- c(\n",
775 | " stringr::str_subset(colnames_combined, \"^Meta\")\n",
776 | ") \n",
777 | "\n",
778 | "common_features <- setdiff(colnames_combined, common_metadata)"
779 | ]
780 | },
781 | {
782 | "cell_type": "markdown",
783 | "metadata": {},
784 | "source": [
785 | "Cytominer has problems handling column names '1', '2' so we rename them to 'Feature_1', ... "
786 | ]
787 | },
788 | {
789 | "cell_type": "code",
790 | "execution_count": 16,
791 | "metadata": {},
792 | "outputs": [],
793 | "source": [
794 | "common_features <- paste0(\"Feature_\",common_features)\n",
795 | "colnames(population) <- c(common_metadata, common_features)"
796 | ]
797 | },
798 | {
799 | "cell_type": "markdown",
800 | "metadata": {},
801 | "source": [
802 | "# Normalize data\n",
803 | "We use cytominer to normalize both datasets with respect to the controls, i.e. the DMSO wells of the training partition."
804 | ]
805 | },
806 | {
807 | "cell_type": "code",
808 | "execution_count": 17,
809 | "metadata": {},
810 | "outputs": [],
811 | "source": [
812 | "population_normalized <- cytominer::normalize(\n",
813 | " population, \n",
814 | " variables = common_features, \n",
815 | " strata = c(\"Metadata_perturbation\"), \n",
816 | " sample = population %>% \n",
817 | " filter(\n",
818 | " Metadata_broad_sample == \"DMSO\",\n",
819 | " Metadata_partition == \"Train\"\n",
820 | " ), \n",
821 | " operation = \"standardize\"\n",
822 | ")"
823 | ]
824 | },
825 | {
826 | "cell_type": "markdown",
827 | "metadata": {},
828 | "source": [
829 | "# Aggregate data "
830 | ]
831 | },
832 | {
833 | "cell_type": "code",
834 | "execution_count": 18,
835 | "metadata": {},
836 | "outputs": [
837 | {
838 | "data": {
839 | "text/html": [
840 | "<ol class=list-inline>\n",
841 | "\t<li>3782</li>\n",
842 | "\t<li>1786</li>\n",
843 | "</ol>\n"
844 | ],
845 | "text/latex": [
846 | "\\begin{enumerate*}\n",
847 | "\\item 3782\n",
848 | "\\item 1786\n",
849 | "\\end{enumerate*}\n"
850 | ],
851 | "text/markdown": [
852 | "1. 3782\n",
853 | "2. 1786\n",
854 | "\n",
855 | "\n"
856 | ],
857 | "text/plain": [
858 | "[1] 3782 1786"
859 | ]
860 | },
861 | "metadata": {},
862 | "output_type": "display_data"
863 | }
864 | ],
865 | "source": [
866 | "population_aggregated <- cytominer::aggregate(\n",
867 | " population = population_normalized, \n",
868 | " variables = common_features, \n",
869 | " strata = c(\"Metadata_pert_id\",\"Metadata_dataset\",\"Metadata_partition\"), \n",
870 | " operation = \"mean\"\n",
871 | ") \n",
872 | "\n",
873 | "population_aggregated %>% dim()"
874 | ]
875 | },
876 | {
877 | "cell_type": "code",
878 | "execution_count": 19,
879 | "metadata": {},
880 | "outputs": [
881 | {
882 | "data": {
883 | "text/html": [
884 | "<ol class=list-inline>\n",
885 | "\t<li>1543</li>\n",
886 | "\t<li>1786</li>\n",
887 | "</ol>\n"
888 | ],
889 | "text/latex": [
890 | "\\begin{enumerate*}\n",
891 | "\\item 1543\n",
892 | "\\item 1786\n",
893 | "\\end{enumerate*}\n"
894 | ],
895 | "text/markdown": [
896 | "1. 1543\n",
897 | "2. 1786\n",
898 | "\n",
899 | "\n"
900 | ],
901 | "text/plain": [
902 | "[1] 1543 1786"
903 | ]
904 | },
905 | "metadata": {},
906 | "output_type": "display_data"
907 | },
908 | {
909 | "data": {
910 | "text/html": [
911 | "<ol class=list-inline>\n",
912 | "\t<li>2239</li>\n",
913 | "\t<li>1786</li>\n",
914 | "</ol>\n"
915 | ],
916 | "text/latex": [
917 | "\\begin{enumerate*}\n",
918 | "\\item 2239\n",
919 | "\\item 1786\n",
920 | "\\end{enumerate*}\n"
921 | ],
922 | "text/markdown": [
923 | "1. 2239\n",
924 | "2. 1786\n",
925 | "\n",
926 | "\n"
927 | ],
928 | "text/plain": [
929 | "[1] 2239 1786"
930 | ]
931 | },
932 | "metadata": {},
933 | "output_type": "display_data"
934 | }
935 | ],
936 | "source": [
937 | "population_aggregated %>% filter(Metadata_dataset == \"BBBC022\") %>% dim()\n",
938 | "population_aggregated %>% filter(Metadata_dataset == \"BBBC036\") %>% dim()"
939 | ]
940 | },
941 | {
942 | "cell_type": "markdown",
943 | "metadata": {},
944 | "source": [
945 | "# Correlation matrix "
946 | ]
947 | },
948 | {
949 | "cell_type": "code",
950 | "execution_count": 20,
951 | "metadata": {},
952 | "outputs": [
953 | {
954 | "data": {
955 | "text/html": [
956 | "<ol class=list-inline>\n",
957 | "\t<li>1543</li>\n",
958 | "\t<li>2239</li>\n",
959 | "</ol>\n"
960 | ],
961 | "text/latex": [
962 | "\\begin{enumerate*}\n",
963 | "\\item 1543\n",
964 | "\\item 2239\n",
965 | "\\end{enumerate*}\n"
966 | ],
967 | "text/markdown": [
968 | "1. 1543\n",
969 | "2. 2239\n",
970 | "\n",
971 | "\n"
972 | ],
973 | "text/plain": [
974 | "[1] 1543 2239"
975 | ]
976 | },
977 | "metadata": {},
978 | "output_type": "display_data"
979 | }
980 | ],
981 | "source": [
982 | "cor_matrix <- cor(\n",
983 | " x = population_aggregated %>% \n",
984 | " filter(Metadata_dataset == 'BBBC022') %>% \n",
985 | " select(common_features) %>% \n",
986 | " as.matrix() %>% \n",
987 | " t, \n",
988 | " y = population_aggregated %>% \n",
989 | " filter(Metadata_dataset == 'BBBC036') %>% \n",
990 | " select(common_features) %>% \n",
991 | " as.matrix() %>% \n",
992 | " t,\n",
993 | " use = \"complete.obs\"\n",
994 | " ) \n",
995 | "\n",
996 | "cor_matrix %>% dim()"
997 | ]
998 | },
999 | {
1000 | "cell_type": "markdown",
1001 | "metadata": {},
1002 | "source": [
1003 | "# Submission file "
1004 | ]
1005 | },
1006 | {
1007 | "cell_type": "code",
1008 | "execution_count": 21,
1009 | "metadata": {},
1010 | "outputs": [
1011 | {
1012 | "data": {
1013 | "text/html": [
1014 | "\n",
1015 | "\t- 1543
\n",
1016 | "\t- 2239
\n",
1017 | "
\n"
1018 | ],
1019 | "text/latex": [
1020 | "\\begin{enumerate*}\n",
1021 | "\\item 1543\n",
1022 | "\\item 2239\n",
1023 | "\\end{enumerate*}\n"
1024 | ],
1025 | "text/markdown": [
1026 | "1. 1543\n",
1027 | "2. 2239\n",
1028 | "\n",
1029 | "\n"
1030 | ],
1031 | "text/plain": [
1032 | "[1] 1543 2239"
1033 | ]
1034 | },
1035 | "metadata": {},
1036 | "output_type": "display_data"
1037 | }
1038 | ],
1039 | "source": [
1040 | "# set column names \n",
1041 | "colnames(cor_matrix) <- population_aggregated %>% \n",
1042 | " filter(Metadata_dataset == 'BBBC036') %>%\n",
1043 | " extract2(\"Metadata_pert_id\")\n",
1044 | "\n",
1045 | "# set row names \n",
1046 | "#rownames(cor_matrix) <- population_aggregated %>% \n",
1047 | "# filter(Metadata_dataset == 'BBBC036') %>%\n",
1048 | "# extract2(\"Metadata_broad_sample\")#\n",
1049 | "\n",
1050 | "cor_matrix %>% dim()"
1051 | ]
1052 | },
1053 | {
1054 | "cell_type": "code",
1055 | "execution_count": 23,
1056 | "metadata": {},
1057 | "outputs": [],
1058 | "source": [
1059 | "df <- cor_matrix %>% as_data_frame() %>% \n",
1060 | " mutate(Metadata_pert_id = population_aggregated %>% \n",
1061 | " filter(Metadata_dataset == 'BBBC022') %>%\n",
1062 | " extract2(\"Metadata_pert_id\")) %>% \n",
1063 | " select(Metadata_pert_id, everything())\n",
1064 | "\n",
1065 | "# write submission file\n",
1066 | "write.csv(df,\"../cytodata-baseline_R_day_2_CP.csv\",row.names = FALSE)"
1067 | ]
1068 | },
1069 | {
1070 | "cell_type": "code",
1071 | "execution_count": 25,
1072 | "metadata": {},
1073 | "outputs": [
1074 | {
1075 | "data": {
1076 | "text/html": [
1077 | "\n",
1078 | "\t- 1543
\n",
1079 | "\t- 2240
\n",
1080 | "
\n"
1081 | ],
1082 | "text/latex": [
1083 | "\\begin{enumerate*}\n",
1084 | "\\item 1543\n",
1085 | "\\item 2240\n",
1086 | "\\end{enumerate*}\n"
1087 | ],
1088 | "text/markdown": [
1089 | "1. 1543\n",
1090 | "2. 2240\n",
1091 | "\n",
1092 | "\n"
1093 | ],
1094 | "text/plain": [
1095 | "[1] 1543 2240"
1096 | ]
1097 | },
1098 | "metadata": {},
1099 | "output_type": "display_data"
1100 | }
1101 | ],
1102 | "source": [
1103 | "dim(df)"
1104 | ]
1105 | }
1106 | ],
1107 | "metadata": {
1108 | "kernelspec": {
1109 | "display_name": "R",
1110 | "language": "R",
1111 | "name": "ir"
1112 | },
1113 | "language_info": {
1114 | "codemirror_mode": "r",
1115 | "file_extension": ".r",
1116 | "mimetype": "text/x-r-source",
1117 | "name": "R",
1118 | "pygments_lexer": "r",
1119 | "version": "3.4.4"
1120 | }
1121 | },
1122 | "nbformat": 4,
1123 | "nbformat_minor": 2
1124 | }
1125 |
--------------------------------------------------------------------------------