├── .gitignore
├── .pre-commit-config.yaml
├── 0.dataset_creation
└── 0-preprocess_datasets.ipynb
├── 1.dataset_curation
├── README.md
├── curate_dataset.py
├── pyproject.toml
└── uv.lock
├── App1.single_feature_prediction
├── 1-single-Gene-CPfeature-prediction.ipynb
└── lookup_luad_images.ipynb
├── App2.MoA_prediction
├── 2a-Modality_Integration_CDRP-bio.ipynb
└── 2b-Modality_Integration_LINCS.ipynb
├── GO_terms_search
├── 4-GO-terms-search-analysis.ipynb
└── source
│ ├── GO_bp_cc_mf_direct_LUAD_975.txt
│ ├── GO_bp_cc_mf_direct_LUAD_976.txt
│ ├── GO_bp_cc_mf_direct_intersection_782.txt
│ ├── GO_bp_cc_mf_direct_intersection_782_completed.csv
│ ├── GO_bp_cc_mf_direct_union_1165.txt
│ ├── LUAD_geneSymbols_978.txt
│ ├── intersection_geneSymbols_785.txt
│ ├── top_100_luad.txt
│ ├── top_59_atleast_topIn3.txt
│ └── union_geneSymbols_1170.txt
├── LICENSE
├── README.md
├── environment.yml
├── etag.json
├── explore_the_link.ipynb
├── generate_paper_figures
└── generate_paper_figs.ipynb
├── idmap.xlsx
├── read_and_match_profiles.ipynb
├── results
├── DAVIDoutput_CytoScapeInput_Figure2d
│ ├── chart_UP_KEYWORDS_FunctionalAnot_all.txt
│ └── chart_UP_KEYWORDS_FunctionalAnot_top.txt
├── Figs_Source_Data.xlsx
├── MoAprediction
│ ├── JI_cdrpbio.txt
│ ├── JI_lincs.txt
│ ├── pred_moa.xlsx
│ ├── pred_moa_2.xlsx
│ ├── pred_moa_CDRP.xlsx
│ └── pred_moa_LINCS.xlsx
├── RepCor
│ └── RepCorrDF.xlsx
├── SingleCPfeatPred
│ └── scores_corrected.xlsx
├── SingleGenePred
│ ├── scores_corrected.xlsx
│ ├── scores_cross_dts_LU_LI.xlsx
│ └── supplementary_D.csv
└── SingleGenePred_cpCategoryMap
│ ├── CatMap-LINCS-25-lasso-ht.png
│ ├── CatMap-LUAD-9-MLP-keras-ht.pdf
│ ├── CatMap-LUAD-9-MLP-keras-ht.png
│ ├── CatMap-LUAD-9-lasso-ht.png
│ └── cat_scores_maps.xlsx
└── utils
├── pred_models.py
├── readProfiles.py
├── replicateCorrs.py
└── saveAsNewSheetToExistingFile.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # result folder
2 |
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | pip-wheel-metadata/
26 | share/python-wheels/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | MANIFEST
31 |
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 |
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 |
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .nox/
46 | .coverage
47 | .coverage.*
48 | .cache
49 | nosetests.xml
50 | coverage.xml
51 | *.cover
52 | *.py,cover
53 | .hypothesis/
54 | .pytest_cache/
55 |
56 | # Translations
57 | *.mo
58 | *.pot
59 |
60 | # Django stuff:
61 | *.log
62 | local_settings.py
63 | db.sqlite3
64 | db.sqlite3-journal
65 |
66 | # Flask stuff:
67 | instance/
68 | .webassets-cache
69 |
70 | # Scrapy stuff:
71 | .scrapy
72 |
73 | # Sphinx documentation
74 | docs/_build/
75 |
76 | # PyBuilder
77 | target/
78 |
79 | # Jupyter Notebook
80 | .ipynb_checkpoints
81 |
82 | # IPython
83 | profile_default/
84 | ipython_config.py
85 |
86 | # pyenv
87 | .python-version
88 |
89 | # pipenv
90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
93 | # install all needed dependencies.
94 | #Pipfile.lock
95 |
96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
97 | __pypackages__/
98 |
99 | # Celery stuff
100 | celerybeat-schedule
101 | celerybeat.pid
102 |
103 | # SageMath parsed files
104 | *.sage.py
105 |
106 | # Environments
107 | .env
108 | .venv
109 | env/
110 | venv/
111 | ENV/
112 | env.bak/
113 | venv.bak/
114 |
115 | # Spyder project settings
116 | .spyderproject
117 | .spyproject
118 |
119 | # Rope project settings
120 | .ropeproject
121 |
122 | # mkdocs documentation
123 | /site
124 |
125 | # mypy
126 | .mypy_cache/
127 | .dmypy.json
128 | dmypy.json
129 |
130 | # Pyre type checker
131 | .pyre/
132 | .Rproj.user
133 | .Rhistory
134 | .Rprofile
135 | *.nb.html
136 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pre-commit/pre-commit-hooks
3 | rev: v5.0.0
4 | hooks:
5 | - id: trailing-whitespace
6 | exclude: ^.*/$
7 |
8 |
--------------------------------------------------------------------------------
/1.dataset_curation/README.md:
--------------------------------------------------------------------------------
1 | # Dataset Curation
2 |
3 | `curate_dataset.py` selects, renames, and fixes columns from the preprocessed data to create a curated dataset.
4 |
5 | ## Structure
6 |
7 | Available at:
8 | `s3://cellpainting-gallery/cpg0003-rosetta/broad/workspace/curated_preprocessed_data`
9 |
10 | ```
11 | curated_preprocessed_data
12 | ├── CDRP-BBBC047-Bray
13 | │ ├── CellPainting
14 | │ │ └── replicate_level_cp_augmented.parquet
15 | │ └── L1000
16 | │ └── replicate_level_l1k.parquet
17 | ├── LINCS-Pilot1
18 | │ ├── CellPainting
19 | │ │ └── replicate_level_cp_augmented.parquet
20 | │ └── L1000
21 | │ └── replicate_level_l1k.parquet
22 | ├── LUAD-BBBC041-Caicedo
23 | │ ├── CellPainting
24 | │ │ └── replicate_level_cp_augmented.parquet
25 | │ └── L1000
26 | │ └── replicate_level_l1k.parquet
27 | └── TA-ORF-BBBC037-Rohban
28 | ├── CellPainting
29 | │ └── replicate_level_cp_augmented.parquet
30 | └── L1000
31 | └── replicate_level_l1k.parquet
32 | ```
33 |
34 | ## Curated columns
35 |
36 | - `Metadata_Plate` [All]: Identifier of the multi‐well plate (e.g., SQ00015156, PAC053_U2OS_6H_X2_B1_UNI4445R, TA.OE005_U2OS_72H_X1_B15).
37 | - `Metadata_Plate_Map_Name` [All CP]: Plate‐map identifier (e.g., C-7161-01-LM6-003).
38 | - `Metadata_ARP_ID` [LINCS-Pilot1 L1K, TA-ORF-BBBC037-Rohban L1K, LUAD-BBBC041-Caicedo L1K]: Internal plate identifier (e.g., AB00016187).
39 | - `Metadata_Well` [All except CDRP-BBBC047-Bray L1K]: Specific well position within the plate (e.g., A01, H11).
40 | - `Metadata_pert_id` [All]: Unique perturbation identifier (e.g., BRD-K50691590-001-02-2, TRCN0000471252, EMPTY).
41 | - `Metadata_pert_type` [All except CDRP-BBBC047-Bray L1K]: Perturbation type (e.g., trt_cp, ctl_vehicle, trt, control).
42 | - `Metadata_cell_id` [All]: Cell line used (e.g., A549, U2OS).
43 | - `Metadata_pert_timepoint` [All]: Time (in hours) from perturbation to measurement (e.g., 24, 48, 72, 96).
44 | - `Metadata_pert_dose_micromolar` [LINCS-Pilot1, CDRP-BBBC047-Bray]: Final compound concentration (µM) (e.g., 0.0411523, 10).
45 | - `Metadata_pert_iname` [LINCS-Pilot1, CDRP-BBBC047-Bray L1K]: Common name of the compound or control (e.g., bortezomib, DMSO).
46 | - `Metadata_SMILES` [LINCS-Pilot1 L1K, CDRP-BBBC047-Bray L1K]: SMILES string for the compound structure.
47 | - `Metadata_cdrp_group` [CDRP-BBBC047-Bray L1K]: Subset/group label in the CDRP compound library (e.g., DOS, BIO).
48 | - `Metadata_genesymbol_mutation` [TA-ORF-BBBC037-Rohban L1K+CP, LUAD-BBBC041-Caicedo CP]: Gene plus mutation notation (e.g., TP53_p.R248Q).
49 | - `Metadata_genesymbol` [TA-ORF-BBBC037-Rohban CP, LUAD-BBBC041-Caicedo CP]: Gene symbol alone (e.g., TP53, MAPK8).
50 | - `Metadata_transcriptdb` [LUAD-BBBC041-Caicedo L1K]: Reference to specific transcript/isoform (e.g., NM_001126112.2:c.796G>C).
51 |
52 |
--------------------------------------------------------------------------------
/1.dataset_curation/curate_dataset.py:
--------------------------------------------------------------------------------
1 | # ---
2 | # jupyter:
3 | # jupytext:
4 | # text_representation:
5 | # extension: .py
6 | # format_name: percent
7 | # format_version: '1.3'
8 | # jupytext_version: 1.15.2
9 | # kernelspec:
10 | # display_name: Python 3
11 | # language: python
12 | # name: python3
13 | # ---
14 |
15 | # %%
16 | import pandas as pd
17 | from pathlib import Path
18 | from IPython.display import display
19 |
20 | # %%
21 | # First download the data from
22 | # s3://cellpainting-gallery/cpg0003-rosetta/broad/workspace/preprocessed_data
23 | # and save it in the ./preprocessed_data folder
24 |
# Relative locations of the preprocessed replicate-level profiles, keyed first
# by dataset name and then by data type ("l1k" or "cp"). Every dataset follows
# the same directory layout, so the paths are derived from the dataset name.
dataset_paths = {
    dataset: {
        "l1k": f"./preprocessed_data/{dataset}/L1000/replicate_level_l1k.csv.gz",
        "cp": f"./preprocessed_data/{dataset}/CellPainting/replicate_level_cp_augmented.csv.gz",
    }
    for dataset in (
        "LINCS-Pilot1",
        "CDRP-BBBC047-Bray",
        "TA-ORF-BBBC037-Rohban",
        "LUAD-BBBC041-Caicedo",
    )
}
43 |
44 | # Define column mappings for each dataset and data type
45 | column_rename_mappings = {
46 | "CDRP-BBBC047-Bray": {
47 | "l1k": {
48 | "pert_id": "Metadata_pert_id",
49 | "pert_dose": "Metadata_pert_dose_micromolar",
50 | "det_plate": "Metadata_Plate",
51 | "CPD_NAME": "Metadata_pert_iname",
52 | "CPD_TYPE": "Metadata_cdrp_group",
53 | "CPD_SMILES": "Metadata_SMILES",
54 | },
55 | "cp": {
56 | "Metadata_broad_sample": "Metadata_pert_id",
57 | "Metadata_broad_sample_type": "Metadata_pert_type",
58 | "Metadata_mmoles_per_liter2": "Metadata_pert_dose_micromolar",
59 | },
60 | },
61 | "LINCS-Pilot1": {
62 | "l1k": {
63 | "pert_dose": "Metadata_pert_dose_micromolar",
64 | "det_plate": "Metadata_Plate",
65 | "cell_id": "Metadata_cell_id",
66 | "det_well": "Metadata_Well",
67 | "mfc_plate_name": "Metadata_ARP_ID",
68 | "pert_iname_x": "Metadata_pert_iname",
69 | "pert_time": "Metadata_pert_timepoint",
70 | "pert_mfc_id": "Metadata_pert_id",
71 | "pert_type_x": "Metadata_pert_type",
72 | "x_smiles": "Metadata_SMILES",
73 | },
74 | "cp": {
75 | "Metadata_broad_sample": "Metadata_pert_id",
76 | "Metadata_broad_sample_type": "Metadata_pert_type",
77 | "Metadata_mmoles_per_liter": "Metadata_pert_dose_micromolar",
78 | "pert_iname": "Metadata_pert_iname",
79 | },
80 | },
81 | "TA-ORF-BBBC037-Rohban": {
82 | "l1k": {
83 | "det_plate": "Metadata_Plate",
84 | "cell_id": "Metadata_cell_id",
85 | "det_well": "Metadata_Well",
86 | "mfc_plate_name": "Metadata_ARP_ID",
87 | "pert_time": "Metadata_pert_timepoint",
88 | "pert_mfc_id": "Metadata_pert_id",
89 | "pert_type": "Metadata_pert_type",
90 | "x_genesymbol_mutation": "Metadata_genesymbol_mutation",
91 | },
92 | "cp": {
93 | "Metadata_broad_sample": "Metadata_pert_id",
94 | "Metadata_broad_sample_type": "Metadata_pert_type",
95 | "Metadata_pert_name": "Metadata_genesymbol_mutation",
96 | "Metadata_gene_name": "Metadata_genesymbol",
97 | },
98 | },
99 | "LUAD-BBBC041-Caicedo": {
100 | "l1k": {
101 | "det_plate": "Metadata_Plate",
102 | "cell_id": "Metadata_cell_id",
103 | "det_well": "Metadata_Well",
104 | "mfc_plate_name": "Metadata_ARP_ID",
105 | "pert_time": "Metadata_pert_timepoint",
106 | "pert_mfc_id": "Metadata_pert_id",
107 | "pert_type": "Metadata_pert_type",
108 | "x_transcriptdb": "Metadata_transcriptdb",
109 | },
110 | "cp": {
111 | "Metadata_broad_sample": "Metadata_pert_id",
112 | "Metadata_broad_sample_type": "Metadata_pert_type",
113 | "x_mutation_status": "Metadata_genesymbol_mutation",
114 | "Symbol": "Metadata_genesymbol",
115 | },
116 | },
117 | }
118 |
119 | # Define the columns we want to keep for each dataset and data type
120 | columns_to_keep = {
121 | "CDRP-BBBC047-Bray": {
122 | "l1k": [
123 | "Metadata_Plate",
124 | "Metadata_pert_id",
125 | "Metadata_pert_iname",
126 | "Metadata_pert_dose_micromolar",
127 | "Metadata_cdrp_group",
128 | "Metadata_SMILES",
129 | ],
130 | "cp": [
131 | "Metadata_Plate_Map_Name",
132 | "Metadata_Plate",
133 | "Metadata_Well",
134 | "Metadata_pert_id",
135 | "Metadata_pert_dose_micromolar",
136 | "Metadata_pert_type",
137 | "Metadata_cell_id",
138 | ],
139 | },
140 | "LINCS-Pilot1": {
141 | "l1k": [
142 | "Metadata_Plate",
143 | "Metadata_Well",
144 | "Metadata_pert_id",
145 | "Metadata_pert_type",
146 | "Metadata_pert_dose_micromolar",
147 | "Metadata_cell_id",
148 | "Metadata_pert_iname",
149 | "Metadata_ARP_ID",
150 | "Metadata_pert_timepoint",
151 | "Metadata_SMILES",
152 | ],
153 | "cp": [
154 | "Metadata_Plate_Map_Name",
155 | "Metadata_Plate",
156 | "Metadata_Well",
157 | "Metadata_pert_id",
158 | "Metadata_pert_type",
159 | "Metadata_pert_dose_micromolar",
160 | "Metadata_cell_id",
161 | "Metadata_pert_iname",
162 | ],
163 | },
164 | "TA-ORF-BBBC037-Rohban": {
165 | "l1k": [
166 | "Metadata_Plate",
167 | "Metadata_Well",
168 | "Metadata_pert_id",
169 | "Metadata_pert_type",
170 | "Metadata_cell_id",
171 | "Metadata_ARP_ID",
172 | "Metadata_pert_timepoint",
173 | "Metadata_genesymbol_mutation",
174 | ],
175 | "cp": [
176 | "Metadata_Plate_Map_Name",
177 | "Metadata_Plate",
178 | "Metadata_Well",
179 | "Metadata_pert_id",
180 | "Metadata_pert_type",
181 | "Metadata_cell_id",
182 | "Metadata_genesymbol_mutation",
183 | "Metadata_genesymbol",
184 | ],
185 | },
186 | "LUAD-BBBC041-Caicedo": {
187 | "l1k": [
188 | "Metadata_Plate",
189 | "Metadata_Well",
190 | "Metadata_pert_id",
191 | "Metadata_pert_type",
192 | "Metadata_cell_id",
193 | "Metadata_ARP_ID",
194 | "Metadata_pert_timepoint",
195 | "Metadata_transcriptdb",
196 | ],
197 | "cp": [
198 | "Metadata_Plate_Map_Name",
199 | "Metadata_Plate",
200 | "Metadata_Well",
201 | "Metadata_pert_id",
202 | "Metadata_pert_type",
203 | "Metadata_cell_id",
204 | "Metadata_genesymbol_mutation",
205 | "Metadata_genesymbol",
206 | ],
207 | },
208 | }
209 |
# Load every profile table. A parquet copy is written next to each CSV the
# first time it is read and is used instead of the CSV on later runs.
dataset_data = {}
for dataset_name, paths in dataset_paths.items():
    frames_by_type = {}
    for data_type, dataset_path in paths.items():
        parquet_path = dataset_path.replace(".csv.gz", ".parquet")
        if Path(parquet_path).exists():
            data = pd.read_parquet(parquet_path)
        else:
            data = pd.read_csv(dataset_path, low_memory=False)
            data.to_parquet(parquet_path)
        frames_by_type[data_type] = data
    dataset_data[dataset_name] = frames_by_type
223 |
224 |
225 | # %%
226 |
# Rename metadata columns to their curated names, then keep only the curated
# metadata columns plus every feature column.
for dataset_name, data_types in dataset_data.items():
    for data_type, data in data_types.items():
        rename_spec = column_rename_mappings.get(dataset_name, {}).get(data_type)
        if rename_spec is None:
            # No curation rules for this table: leave it untouched.
            continue

        # Feature columns are recognised by naming pattern and never renamed.
        if data_type == "l1k":
            feature_mask = data.columns.str.endswith("_at")
        else:  # cp
            feature_mask = data.columns.str.startswith(
                ("Cells_", "Cytoplasm_", "Nuclei_")
            )
        feature_cols = data.columns[feature_mask]
        metadata_cols = data.columns[~feature_mask]

        # Restrict the mapping to source columns that are present as metadata.
        rename_mapping = {
            old: new for old, new in rename_spec.items() if old in metadata_cols
        }

        # A pre-existing column that already carries a target name would turn
        # into a duplicate after renaming, so it is dropped in favour of the
        # renamed column. Dropping without inplace leaves the loaded frame as is.
        clashing = [
            new
            for old, new in rename_mapping.items()
            if new != old and new in data.columns
        ]
        data = data.drop(columns=clashing).rename(columns=rename_mapping)

        # Keep only desired metadata columns plus all feature columns.
        # .copy() makes the stored frame independent of `data`, so the column
        # assignments performed by later cells do not raise
        # SettingWithCopyWarning on pandas versions without copy-on-write.
        keep_metadata = columns_to_keep[dataset_name][data_type]
        dataset_data[dataset_name][data_type] = data[
            keep_metadata + feature_cols.tolist()
        ].copy()
265 |
266 | # %%
267 |
# Normalise well identifiers to upper case wherever the column is present
for dataset_name, data_types in dataset_data.items():
    for data_type, data in data_types.items():
        if "Metadata_Well" in data.columns:
            data["Metadata_Well"] = data["Metadata_Well"].str.upper()


# The curated CDRP-BBBC047-Bray l1k table keeps no cell-line column, so
# "Metadata_cell_id" is set explicitly to U2OS for it.
dataset_data["CDRP-BBBC047-Bray"]["l1k"]["Metadata_cell_id"] = "U2OS"

# Set timepoints: one constant value per dataset and data type. Existing
# "Metadata_pert_timepoint" columns are overwritten, missing ones are created.
pert_timepoints = {
    "LINCS-Pilot1": {"cp": 48, "l1k": 24},
    "CDRP-BBBC047-Bray": {"cp": 48, "l1k": 6},
    "TA-ORF-BBBC037-Rohban": {"cp": 72, "l1k": 72},
    "LUAD-BBBC041-Caicedo": {"cp": 96, "l1k": 96},
}
for dataset_name, timepoint_by_type in pert_timepoints.items():
    for data_type, timepoint in timepoint_by_type.items():
        dataset_data[dataset_name][data_type]["Metadata_pert_timepoint"] = timepoint
290 |
291 | # %%
292 |
# Show five random rows of the metadata columns of every table
for dataset_name, data_types in dataset_data.items():
    for data_type, data in data_types.items():
        display(f"Dataset: {dataset_name}, Data Type: {data_type}")
        metadata_columns = data.columns[data.columns.str.startswith("Metadata")]
        sampled_rows = data.sample(5)
        display(sampled_rows[metadata_columns])
298 |
299 | # %%
300 |
301 | # %%
# Collapse the perturbation-type labels onto the shared "control"/"trt"
# vocabulary; values without an entry in the table are left as they are.
pert_type_labels = {"ctl_vehicle": "control", "trt_cp": "trt"}
for dataset_name, data_types in dataset_data.items():
    for data_type, data in data_types.items():
        if "Metadata_pert_type" not in data.columns:
            continue
        relabelled = data["Metadata_pert_type"].replace(pert_type_labels)
        data["Metadata_pert_type"] = relabelled
308 |
309 |
310 | # TA-ORF-BBBC037-Rohban cp does not correctly identify Metadata_pert_type, because it marks all as trt.
311 |
312 | # %%
313 |
# Report the metadata columns of each table
print("\nColumns in each dataset:")
for dataset_name, data_types in dataset_data.items():
    print(f"\n{dataset_name}:")
    for data_type, data in data_types.items():
        metadata_cols = [name for name in data.columns if name.startswith("Metadata")]
        print(f" {data_type}: {sorted(metadata_cols)}")

# Columns shared by every l1k table
l1k_common = set.intersection(
    *(set(tables["l1k"].columns) for tables in dataset_data.values())
)
l1k_metadata_common = sorted(
    name for name in l1k_common if name.startswith("Metadata")
)

# Columns shared by every cp table
cp_common = set.intersection(
    *(set(tables["cp"].columns) for tables in dataset_data.values())
)
cp_metadata_common = sorted(
    name for name in cp_common if name.startswith("Metadata")
)

# Columns shared by all tables, regardless of data type
all_common = l1k_common & cp_common
all_metadata_common = sorted(
    name for name in all_common if name.startswith("Metadata")
)

for heading, shared_columns in (
    ("\nCommon Metadata columns across L1K datasets:", l1k_metadata_common),
    ("\nCommon Metadata columns across CP datasets:", cp_metadata_common),
    ("\nCommon Metadata columns across ALL datasets:", all_metadata_common),
):
    print(heading)
    print(shared_columns)
344 |
345 | # %%
346 |
# Report any column label that occurs more than once within a table
for dataset_name, data_types in dataset_data.items():
    for data_type, data in data_types.items():
        repeated = data.columns.duplicated()
        if not repeated.any():
            continue
        print(f"Duplicate columns found in {dataset_name} {data_type}:")
        print(data.columns[repeated])
354 |
355 | # %%
# Build a markdown report holding a five-row random metadata sample per table.
# The pieces are collected in a list and joined once at the end.
markdown_parts = ["# Dataset Samples\n\n"]

for dataset_name, data_types in dataset_data.items():
    markdown_parts.append(f"## {dataset_name}\n\n")
    for data_type, data in data_types.items():
        markdown_parts.append(f"### {data_type.upper()} Data\n\n")
        # Convert sample to markdown table (metadata columns only)
        metadata_cols = [col for col in data.columns if col.startswith("Metadata")]
        sample_df = data.sample(5)[metadata_cols]
        markdown_parts.append(sample_df.to_markdown(index=False) + "\n\n")
        display(sample_df.head())

markdown_output = "".join(markdown_parts)

# Write to file. The encoding is fixed so the report does not depend on the
# platform default when metadata values contain non-ASCII characters.
with open("dataset_samples.md", "w", encoding="utf-8") as f:
    f.write(markdown_output)

print("Dataset samples have been written to dataset_samples.md")
375 |
376 | # %%
377 |
# Write each curated table as parquet under "curated/", mirroring the
# directory layout of the input files.
for dataset_name, data_types in dataset_data.items():
    for data_type, data in data_types.items():
        input_path = Path(dataset_paths[dataset_name][data_type])

        # Make sure the target directory exists before writing into it
        output_dir = Path("curated") / input_path.parent
        output_dir.mkdir(exist_ok=True, parents=True)

        # Same file name as the input, with the parquet extension
        output_path = output_dir / input_path.name.replace(".csv.gz", ".parquet")
        data.to_parquet(output_path, index=False)
        print(f"Saved {dataset_name} {data_type} data to {output_path}")
394 |
395 | # %%
396 |
--------------------------------------------------------------------------------
/1.dataset_curation/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "1-dataset-curation"
3 | version = "0.1.0"
4 | description = "Curate 2022_Haghighi_NatureMethods"
5 | readme = "README.md"
6 | requires-python = ">=3.12"
7 | dependencies = [
8 | "ipython>=8.31.0",
9 | "pandas>=2.2.3",
10 | "pyarrow>=19.0.0",
11 | "tabulate>=0.9.0",
12 | ]
13 |
--------------------------------------------------------------------------------
/1.dataset_curation/uv.lock:
--------------------------------------------------------------------------------
1 | version = 1
2 | requires-python = ">=3.12"
3 |
4 | [[package]]
5 | name = "1-dataset-curation"
6 | version = "0.1.0"
7 | source = { virtual = "." }
8 | dependencies = [
9 | { name = "ipython" },
10 | { name = "pandas" },
11 | { name = "pyarrow" },
12 | { name = "tabulate" },
13 | ]
14 |
15 | [package.metadata]
16 | requires-dist = [
17 | { name = "ipython", specifier = ">=8.31.0" },
18 | { name = "pandas", specifier = ">=2.2.3" },
19 | { name = "pyarrow", specifier = ">=19.0.0" },
20 | { name = "tabulate", specifier = ">=0.9.0" },
21 | ]
22 |
23 | [[package]]
24 | name = "asttokens"
25 | version = "3.0.0"
26 | source = { registry = "https://pypi.org/simple" }
27 | sdist = { url = "https://files.pythonhosted.org/packages/4a/e7/82da0a03e7ba5141f05cce0d302e6eed121ae055e0456ca228bf693984bc/asttokens-3.0.0.tar.gz", hash = "sha256:0dcd8baa8d62b0c1d118b399b2ddba3c4aff271d0d7a9e0d4c1681c79035bbc7", size = 61978 }
28 | wheels = [
29 | { url = "https://files.pythonhosted.org/packages/25/8a/c46dcc25341b5bce5472c718902eb3d38600a903b14fa6aeecef3f21a46f/asttokens-3.0.0-py3-none-any.whl", hash = "sha256:e3078351a059199dd5138cb1c706e6430c05eff2ff136af5eb4790f9d28932e2", size = 26918 },
30 | ]
31 |
32 | [[package]]
33 | name = "colorama"
34 | version = "0.4.6"
35 | source = { registry = "https://pypi.org/simple" }
36 | sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697 }
37 | wheels = [
38 | { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 },
39 | ]
40 |
41 | [[package]]
42 | name = "decorator"
43 | version = "5.1.1"
44 | source = { registry = "https://pypi.org/simple" }
45 | sdist = { url = "https://files.pythonhosted.org/packages/66/0c/8d907af351aa16b42caae42f9d6aa37b900c67308052d10fdce809f8d952/decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330", size = 35016 }
46 | wheels = [
47 | { url = "https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186", size = 9073 },
48 | ]
49 |
50 | [[package]]
51 | name = "executing"
52 | version = "2.2.0"
53 | source = { registry = "https://pypi.org/simple" }
54 | sdist = { url = "https://files.pythonhosted.org/packages/91/50/a9d80c47ff289c611ff12e63f7c5d13942c65d68125160cefd768c73e6e4/executing-2.2.0.tar.gz", hash = "sha256:5d108c028108fe2551d1a7b2e8b713341e2cb4fc0aa7dcf966fa4327a5226755", size = 978693 }
55 | wheels = [
56 | { url = "https://files.pythonhosted.org/packages/7b/8f/c4d9bafc34ad7ad5d8dc16dd1347ee0e507a52c3adb6bfa8887e1c6a26ba/executing-2.2.0-py2.py3-none-any.whl", hash = "sha256:11387150cad388d62750327a53d3339fad4888b39a6fe233c3afbb54ecffd3aa", size = 26702 },
57 | ]
58 |
59 | [[package]]
60 | name = "ipython"
61 | version = "8.31.0"
62 | source = { registry = "https://pypi.org/simple" }
63 | dependencies = [
64 | { name = "colorama", marker = "sys_platform == 'win32'" },
65 | { name = "decorator" },
66 | { name = "jedi" },
67 | { name = "matplotlib-inline" },
68 | { name = "pexpect", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
69 | { name = "prompt-toolkit" },
70 | { name = "pygments" },
71 | { name = "stack-data" },
72 | { name = "traitlets" },
73 | ]
74 | sdist = { url = "https://files.pythonhosted.org/packages/01/35/6f90fdddff7a08b7b715fccbd2427b5212c9525cd043d26fdc45bee0708d/ipython-8.31.0.tar.gz", hash = "sha256:b6a2274606bec6166405ff05e54932ed6e5cfecaca1fc05f2cacde7bb074d70b", size = 5501011 }
75 | wheels = [
76 | { url = "https://files.pythonhosted.org/packages/04/60/d0feb6b6d9fe4ab89fe8fe5b47cbf6cd936bfd9f1e7ffa9d0015425aeed6/ipython-8.31.0-py3-none-any.whl", hash = "sha256:46ec58f8d3d076a61d128fe517a51eb730e3aaf0c184ea8c17d16e366660c6a6", size = 821583 },
77 | ]
78 |
79 | [[package]]
80 | name = "jedi"
81 | version = "0.19.2"
82 | source = { registry = "https://pypi.org/simple" }
83 | dependencies = [
84 | { name = "parso" },
85 | ]
86 | sdist = { url = "https://files.pythonhosted.org/packages/72/3a/79a912fbd4d8dd6fbb02bf69afd3bb72cf0c729bb3063c6f4498603db17a/jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0", size = 1231287 }
87 | wheels = [
88 | { url = "https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9", size = 1572278 },
89 | ]
90 |
91 | [[package]]
92 | name = "matplotlib-inline"
93 | version = "0.1.7"
94 | source = { registry = "https://pypi.org/simple" }
95 | dependencies = [
96 | { name = "traitlets" },
97 | ]
98 | sdist = { url = "https://files.pythonhosted.org/packages/99/5b/a36a337438a14116b16480db471ad061c36c3694df7c2084a0da7ba538b7/matplotlib_inline-0.1.7.tar.gz", hash = "sha256:8423b23ec666be3d16e16b60bdd8ac4e86e840ebd1dd11a30b9f117f2fa0ab90", size = 8159 }
99 | wheels = [
100 | { url = "https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl", hash = "sha256:df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca", size = 9899 },
101 | ]
102 |
103 | [[package]]
104 | name = "numpy"
105 | version = "2.2.2"
106 | source = { registry = "https://pypi.org/simple" }
107 | sdist = { url = "https://files.pythonhosted.org/packages/ec/d0/c12ddfd3a02274be06ffc71f3efc6d0e457b0409c4481596881e748cb264/numpy-2.2.2.tar.gz", hash = "sha256:ed6906f61834d687738d25988ae117683705636936cc605be0bb208b23df4d8f", size = 20233295 }
108 | wheels = [
109 | { url = "https://files.pythonhosted.org/packages/0c/e6/847d15770ab7a01e807bdfcd4ead5bdae57c0092b7dc83878171b6af97bb/numpy-2.2.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ac9bea18d6d58a995fac1b2cb4488e17eceeac413af014b1dd26170b766d8467", size = 20912636 },
110 | { url = "https://files.pythonhosted.org/packages/d1/af/f83580891577b13bd7e261416120e036d0d8fb508c8a43a73e38928b794b/numpy-2.2.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:23ae9f0c2d889b7b2d88a3791f6c09e2ef827c2446f1c4a3e3e76328ee4afd9a", size = 14098403 },
111 | { url = "https://files.pythonhosted.org/packages/2b/86/d019fb60a9d0f1d4cf04b014fe88a9135090adfadcc31c1fadbb071d7fa7/numpy-2.2.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:3074634ea4d6df66be04f6728ee1d173cfded75d002c75fac79503a880bf3825", size = 5128938 },
112 | { url = "https://files.pythonhosted.org/packages/7a/1b/50985edb6f1ec495a1c36452e860476f5b7ecdc3fc59ea89ccad3c4926c5/numpy-2.2.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:8ec0636d3f7d68520afc6ac2dc4b8341ddb725039de042faf0e311599f54eb37", size = 6661937 },
113 | { url = "https://files.pythonhosted.org/packages/f4/1b/17efd94cad1b9d605c3f8907fb06bcffc4ce4d1d14d46b95316cccccf2b9/numpy-2.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2ffbb1acd69fdf8e89dd60ef6182ca90a743620957afb7066385a7bbe88dc748", size = 14049518 },
114 | { url = "https://files.pythonhosted.org/packages/5b/73/65d2f0b698df1731e851e3295eb29a5ab8aa06f763f7e4188647a809578d/numpy-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0349b025e15ea9d05c3d63f9657707a4e1d471128a3b1d876c095f328f8ff7f0", size = 16099146 },
115 | { url = "https://files.pythonhosted.org/packages/d5/69/308f55c0e19d4b5057b5df286c5433822e3c8039ede06d4051d96f1c2c4e/numpy-2.2.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:463247edcee4a5537841d5350bc87fe8e92d7dd0e8c71c995d2c6eecb8208278", size = 15246336 },
116 | { url = "https://files.pythonhosted.org/packages/f0/d8/d8d333ad0d8518d077a21aeea7b7c826eff766a2b1ce1194dea95ca0bacf/numpy-2.2.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:9dd47ff0cb2a656ad69c38da850df3454da88ee9a6fde0ba79acceee0e79daba", size = 17863507 },
117 | { url = "https://files.pythonhosted.org/packages/82/6e/0b84ad3103ffc16d6673e63b5acbe7901b2af96c2837174c6318c98e27ab/numpy-2.2.2-cp312-cp312-win32.whl", hash = "sha256:4525b88c11906d5ab1b0ec1f290996c0020dd318af8b49acaa46f198b1ffc283", size = 6276491 },
118 | { url = "https://files.pythonhosted.org/packages/fc/84/7f801a42a67b9772a883223a0a1e12069a14626c81a732bd70aac57aebc1/numpy-2.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:5acea83b801e98541619af398cc0109ff48016955cc0818f478ee9ef1c5c3dcb", size = 12616372 },
119 | { url = "https://files.pythonhosted.org/packages/e1/fe/df5624001f4f5c3e0b78e9017bfab7fdc18a8d3b3d3161da3d64924dd659/numpy-2.2.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b208cfd4f5fe34e1535c08983a1a6803fdbc7a1e86cf13dd0c61de0b51a0aadc", size = 20899188 },
120 | { url = "https://files.pythonhosted.org/packages/a9/80/d349c3b5ed66bd3cb0214be60c27e32b90a506946857b866838adbe84040/numpy-2.2.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d0bbe7dd86dca64854f4b6ce2ea5c60b51e36dfd597300057cf473d3615f2369", size = 14113972 },
121 | { url = "https://files.pythonhosted.org/packages/9d/50/949ec9cbb28c4b751edfa64503f0913cbfa8d795b4a251e7980f13a8a655/numpy-2.2.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:22ea3bb552ade325530e72a0c557cdf2dea8914d3a5e1fecf58fa5dbcc6f43cd", size = 5114294 },
122 | { url = "https://files.pythonhosted.org/packages/8d/f3/399c15629d5a0c68ef2aa7621d430b2be22034f01dd7f3c65a9c9666c445/numpy-2.2.2-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:128c41c085cab8a85dc29e66ed88c05613dccf6bc28b3866cd16050a2f5448be", size = 6648426 },
123 | { url = "https://files.pythonhosted.org/packages/2c/03/c72474c13772e30e1bc2e558cdffd9123c7872b731263d5648b5c49dd459/numpy-2.2.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:250c16b277e3b809ac20d1f590716597481061b514223c7badb7a0f9993c7f84", size = 14045990 },
124 | { url = "https://files.pythonhosted.org/packages/83/9c/96a9ab62274ffafb023f8ee08c88d3d31ee74ca58869f859db6845494fa6/numpy-2.2.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0c8854b09bc4de7b041148d8550d3bd712b5c21ff6a8ed308085f190235d7ff", size = 16096614 },
125 | { url = "https://files.pythonhosted.org/packages/d5/34/cd0a735534c29bec7093544b3a509febc9b0df77718a9b41ffb0809c9f46/numpy-2.2.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:b6fb9c32a91ec32a689ec6410def76443e3c750e7cfc3fb2206b985ffb2b85f0", size = 15242123 },
126 | { url = "https://files.pythonhosted.org/packages/5e/6d/541717a554a8f56fa75e91886d9b79ade2e595918690eb5d0d3dbd3accb9/numpy-2.2.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:57b4012e04cc12b78590a334907e01b3a85efb2107df2b8733ff1ed05fce71de", size = 17859160 },
127 | { url = "https://files.pythonhosted.org/packages/b9/a5/fbf1f2b54adab31510728edd06a05c1b30839f37cf8c9747cb85831aaf1b/numpy-2.2.2-cp313-cp313-win32.whl", hash = "sha256:4dbd80e453bd34bd003b16bd802fac70ad76bd463f81f0c518d1245b1c55e3d9", size = 6273337 },
128 | { url = "https://files.pythonhosted.org/packages/56/e5/01106b9291ef1d680f82bc47d0c5b5e26dfed15b0754928e8f856c82c881/numpy-2.2.2-cp313-cp313-win_amd64.whl", hash = "sha256:5a8c863ceacae696aff37d1fd636121f1a512117652e5dfb86031c8d84836369", size = 12609010 },
129 | { url = "https://files.pythonhosted.org/packages/9f/30/f23d9876de0f08dceb707c4dcf7f8dd7588266745029debb12a3cdd40be6/numpy-2.2.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:b3482cb7b3325faa5f6bc179649406058253d91ceda359c104dac0ad320e1391", size = 20924451 },
130 | { url = "https://files.pythonhosted.org/packages/6a/ec/6ea85b2da9d5dfa1dbb4cb3c76587fc8ddcae580cb1262303ab21c0926c4/numpy-2.2.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:9491100aba630910489c1d0158034e1c9a6546f0b1340f716d522dc103788e39", size = 14122390 },
131 | { url = "https://files.pythonhosted.org/packages/68/05/bfbdf490414a7dbaf65b10c78bc243f312c4553234b6d91c94eb7c4b53c2/numpy-2.2.2-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:41184c416143defa34cc8eb9d070b0a5ba4f13a0fa96a709e20584638254b317", size = 5156590 },
132 | { url = "https://files.pythonhosted.org/packages/f7/ec/fe2e91b2642b9d6544518388a441bcd65c904cea38d9ff998e2e8ebf808e/numpy-2.2.2-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:7dca87ca328f5ea7dafc907c5ec100d187911f94825f8700caac0b3f4c384b49", size = 6671958 },
133 | { url = "https://files.pythonhosted.org/packages/b1/6f/6531a78e182f194d33ee17e59d67d03d0d5a1ce7f6be7343787828d1bd4a/numpy-2.2.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0bc61b307655d1a7f9f4b043628b9f2b721e80839914ede634e3d485913e1fb2", size = 14019950 },
134 | { url = "https://files.pythonhosted.org/packages/e1/fb/13c58591d0b6294a08cc40fcc6b9552d239d773d520858ae27f39997f2ae/numpy-2.2.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fad446ad0bc886855ddf5909cbf8cb5d0faa637aaa6277fb4b19ade134ab3c7", size = 16079759 },
135 | { url = "https://files.pythonhosted.org/packages/2c/f2/f2f8edd62abb4b289f65a7f6d1f3650273af00b91b7267a2431be7f1aec6/numpy-2.2.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:149d1113ac15005652e8d0d3f6fd599360e1a708a4f98e43c9c77834a28238cb", size = 15226139 },
136 | { url = "https://files.pythonhosted.org/packages/aa/29/14a177f1a90b8ad8a592ca32124ac06af5eff32889874e53a308f850290f/numpy-2.2.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:106397dbbb1896f99e044efc90360d098b3335060375c26aa89c0d8a97c5f648", size = 17856316 },
137 | { url = "https://files.pythonhosted.org/packages/95/03/242ae8d7b97f4e0e4ab8dd51231465fb23ed5e802680d629149722e3faf1/numpy-2.2.2-cp313-cp313t-win32.whl", hash = "sha256:0eec19f8af947a61e968d5429f0bd92fec46d92b0008d0a6685b40d6adf8a4f4", size = 6329134 },
138 | { url = "https://files.pythonhosted.org/packages/80/94/cd9e9b04012c015cb6320ab3bf43bc615e248dddfeb163728e800a5d96f0/numpy-2.2.2-cp313-cp313t-win_amd64.whl", hash = "sha256:97b974d3ba0fb4612b77ed35d7627490e8e3dff56ab41454d9e8b23448940576", size = 12696208 },
139 | ]
140 |
141 | [[package]]
142 | name = "pandas"
143 | version = "2.2.3"
144 | source = { registry = "https://pypi.org/simple" }
145 | dependencies = [
146 | { name = "numpy" },
147 | { name = "python-dateutil" },
148 | { name = "pytz" },
149 | { name = "tzdata" },
150 | ]
151 | sdist = { url = "https://files.pythonhosted.org/packages/9c/d6/9f8431bacc2e19dca897724cd097b1bb224a6ad5433784a44b587c7c13af/pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667", size = 4399213 }
152 | wheels = [
153 | { url = "https://files.pythonhosted.org/packages/17/a3/fb2734118db0af37ea7433f57f722c0a56687e14b14690edff0cdb4b7e58/pandas-2.2.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9", size = 12529893 },
154 | { url = "https://files.pythonhosted.org/packages/e1/0c/ad295fd74bfac85358fd579e271cded3ac969de81f62dd0142c426b9da91/pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4", size = 11363475 },
155 | { url = "https://files.pythonhosted.org/packages/c6/2a/4bba3f03f7d07207481fed47f5b35f556c7441acddc368ec43d6643c5777/pandas-2.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3", size = 15188645 },
156 | { url = "https://files.pythonhosted.org/packages/38/f8/d8fddee9ed0d0c0f4a2132c1dfcf0e3e53265055da8df952a53e7eaf178c/pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fffb8ae78d8af97f849404f21411c95062db1496aeb3e56f146f0355c9989319", size = 12739445 },
157 | { url = "https://files.pythonhosted.org/packages/20/e8/45a05d9c39d2cea61ab175dbe6a2de1d05b679e8de2011da4ee190d7e748/pandas-2.2.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dfcb5ee8d4d50c06a51c2fffa6cff6272098ad6540aed1a76d15fb9318194d8", size = 16359235 },
158 | { url = "https://files.pythonhosted.org/packages/1d/99/617d07a6a5e429ff90c90da64d428516605a1ec7d7bea494235e1c3882de/pandas-2.2.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a", size = 14056756 },
159 | { url = "https://files.pythonhosted.org/packages/29/d4/1244ab8edf173a10fd601f7e13b9566c1b525c4f365d6bee918e68381889/pandas-2.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13", size = 11504248 },
160 | { url = "https://files.pythonhosted.org/packages/64/22/3b8f4e0ed70644e85cfdcd57454686b9057c6c38d2f74fe4b8bc2527214a/pandas-2.2.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f00d1345d84d8c86a63e476bb4955e46458b304b9575dcf71102b5c705320015", size = 12477643 },
161 | { url = "https://files.pythonhosted.org/packages/e4/93/b3f5d1838500e22c8d793625da672f3eec046b1a99257666c94446969282/pandas-2.2.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3508d914817e153ad359d7e069d752cdd736a247c322d932eb89e6bc84217f28", size = 11281573 },
162 | { url = "https://files.pythonhosted.org/packages/f5/94/6c79b07f0e5aab1dcfa35a75f4817f5c4f677931d4234afcd75f0e6a66ca/pandas-2.2.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22a9d949bfc9a502d320aa04e5d02feab689d61da4e7764b62c30b991c42c5f0", size = 15196085 },
163 | { url = "https://files.pythonhosted.org/packages/e8/31/aa8da88ca0eadbabd0a639788a6da13bb2ff6edbbb9f29aa786450a30a91/pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24", size = 12711809 },
164 | { url = "https://files.pythonhosted.org/packages/ee/7c/c6dbdb0cb2a4344cacfb8de1c5808ca885b2e4dcfde8008266608f9372af/pandas-2.2.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:800250ecdadb6d9c78eae4990da62743b857b470883fa27f652db8bdde7f6659", size = 16356316 },
165 | { url = "https://files.pythonhosted.org/packages/57/b7/8b757e7d92023b832869fa8881a992696a0bfe2e26f72c9ae9f255988d42/pandas-2.2.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6374c452ff3ec675a8f46fd9ab25c4ad0ba590b71cf0656f8b6daa5202bca3fb", size = 14022055 },
166 | { url = "https://files.pythonhosted.org/packages/3b/bc/4b18e2b8c002572c5a441a64826252ce5da2aa738855747247a971988043/pandas-2.2.3-cp313-cp313-win_amd64.whl", hash = "sha256:61c5ad4043f791b61dd4752191d9f07f0ae412515d59ba8f005832a532f8736d", size = 11481175 },
167 | { url = "https://files.pythonhosted.org/packages/76/a3/a5d88146815e972d40d19247b2c162e88213ef51c7c25993942c39dbf41d/pandas-2.2.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:3b71f27954685ee685317063bf13c7709a7ba74fc996b84fc6821c59b0f06468", size = 12615650 },
168 | { url = "https://files.pythonhosted.org/packages/9c/8c/f0fd18f6140ddafc0c24122c8a964e48294acc579d47def376fef12bcb4a/pandas-2.2.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:38cf8125c40dae9d5acc10fa66af8ea6fdf760b2714ee482ca691fc66e6fcb18", size = 11290177 },
169 | { url = "https://files.pythonhosted.org/packages/ed/f9/e995754eab9c0f14c6777401f7eece0943840b7a9fc932221c19d1abee9f/pandas-2.2.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ba96630bc17c875161df3818780af30e43be9b166ce51c9a18c1feae342906c2", size = 14651526 },
170 | { url = "https://files.pythonhosted.org/packages/25/b0/98d6ae2e1abac4f35230aa756005e8654649d305df9a28b16b9ae4353bff/pandas-2.2.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4", size = 11871013 },
171 | { url = "https://files.pythonhosted.org/packages/cc/57/0f72a10f9db6a4628744c8e8f0df4e6e21de01212c7c981d31e50ffc8328/pandas-2.2.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d", size = 15711620 },
172 | { url = "https://files.pythonhosted.org/packages/ab/5f/b38085618b950b79d2d9164a711c52b10aefc0ae6833b96f626b7021b2ed/pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a", size = 13098436 },
173 | ]
174 |
175 | [[package]]
176 | name = "parso"
177 | version = "0.8.4"
178 | source = { registry = "https://pypi.org/simple" }
179 | sdist = { url = "https://files.pythonhosted.org/packages/66/94/68e2e17afaa9169cf6412ab0f28623903be73d1b32e208d9e8e541bb086d/parso-0.8.4.tar.gz", hash = "sha256:eb3a7b58240fb99099a345571deecc0f9540ea5f4dd2fe14c2a99d6b281ab92d", size = 400609 }
180 | wheels = [
181 | { url = "https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl", hash = "sha256:a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18", size = 103650 },
182 | ]
183 |
184 | [[package]]
185 | name = "pexpect"
186 | version = "4.9.0"
187 | source = { registry = "https://pypi.org/simple" }
188 | dependencies = [
189 | { name = "ptyprocess" },
190 | ]
191 | sdist = { url = "https://files.pythonhosted.org/packages/42/92/cc564bf6381ff43ce1f4d06852fc19a2f11d180f23dc32d9588bee2f149d/pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f", size = 166450 }
192 | wheels = [
193 | { url = "https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", size = 63772 },
194 | ]
195 |
196 | [[package]]
197 | name = "prompt-toolkit"
198 | version = "3.0.50"
199 | source = { registry = "https://pypi.org/simple" }
200 | dependencies = [
201 | { name = "wcwidth" },
202 | ]
203 | sdist = { url = "https://files.pythonhosted.org/packages/a1/e1/bd15cb8ffdcfeeb2bdc215de3c3cffca11408d829e4b8416dcfe71ba8854/prompt_toolkit-3.0.50.tar.gz", hash = "sha256:544748f3860a2623ca5cd6d2795e7a14f3d0e1c3c9728359013f79877fc89bab", size = 429087 }
204 | wheels = [
205 | { url = "https://files.pythonhosted.org/packages/e4/ea/d836f008d33151c7a1f62caf3d8dd782e4d15f6a43897f64480c2b8de2ad/prompt_toolkit-3.0.50-py3-none-any.whl", hash = "sha256:9b6427eb19e479d98acff65196a307c555eb567989e6d88ebbb1b509d9779198", size = 387816 },
206 | ]
207 |
208 | [[package]]
209 | name = "ptyprocess"
210 | version = "0.7.0"
211 | source = { registry = "https://pypi.org/simple" }
212 | sdist = { url = "https://files.pythonhosted.org/packages/20/e5/16ff212c1e452235a90aeb09066144d0c5a6a8c0834397e03f5224495c4e/ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220", size = 70762 }
213 | wheels = [
214 | { url = "https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35", size = 13993 },
215 | ]
216 |
217 | [[package]]
218 | name = "pure-eval"
219 | version = "0.2.3"
220 | source = { registry = "https://pypi.org/simple" }
221 | sdist = { url = "https://files.pythonhosted.org/packages/cd/05/0a34433a064256a578f1783a10da6df098ceaa4a57bbeaa96a6c0352786b/pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42", size = 19752 }
222 | wheels = [
223 | { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842 },
224 | ]
225 |
226 | [[package]]
227 | name = "pyarrow"
228 | version = "19.0.0"
229 | source = { registry = "https://pypi.org/simple" }
230 | sdist = { url = "https://files.pythonhosted.org/packages/7b/01/fe1fd04744c2aa038e5a11c7a4adb3d62bce09798695e54f7274b5977134/pyarrow-19.0.0.tar.gz", hash = "sha256:8d47c691765cf497aaeed4954d226568563f1b3b74ff61139f2d77876717084b", size = 1129096 }
231 | wheels = [
232 | { url = "https://files.pythonhosted.org/packages/bc/2e/152885f5ef421e80dae68b9c133ab261934f93a6d5e16b61d79c0ed597fb/pyarrow-19.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:a7bbe7109ab6198688b7079cbad5a8c22de4d47c4880d8e4847520a83b0d1b68", size = 30667964 },
233 | { url = "https://files.pythonhosted.org/packages/80/c2/08bbee9a8610a47c9a1466845f405baf53a639ddd947c5133d8ba13544b6/pyarrow-19.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:4624c89d6f777c580e8732c27bb8e77fd1433b89707f17c04af7635dd9638351", size = 32125039 },
234 | { url = "https://files.pythonhosted.org/packages/d2/56/06994df823212f5688d3c8bf4294928b12c9be36681872853655724d28c6/pyarrow-19.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b6d3ce4288793350dc2d08d1e184fd70631ea22a4ff9ea5c4ff182130249d9b", size = 41140729 },
235 | { url = "https://files.pythonhosted.org/packages/94/65/38ad577c98140a9db71e9e1e594b6adb58a7478a5afec6456a8ca2df7f70/pyarrow-19.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:450a7d27e840e4d9a384b5c77199d489b401529e75a3b7a3799d4cd7957f2f9c", size = 42202267 },
236 | { url = "https://files.pythonhosted.org/packages/b6/1f/966b722251a7354114ccbb71cf1a83922023e69efd8945ebf628a851ec4c/pyarrow-19.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:a08e2a8a039a3f72afb67a6668180f09fddaa38fe0d21f13212b4aba4b5d2451", size = 40505858 },
237 | { url = "https://files.pythonhosted.org/packages/3b/5e/6bc81aa7fc9affc7d1c03b912fbcc984ca56c2a18513684da267715dab7b/pyarrow-19.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:f43f5aef2a13d4d56adadae5720d1fed4c1356c993eda8b59dace4b5983843c1", size = 42084973 },
238 | { url = "https://files.pythonhosted.org/packages/53/c3/2f56da818b6a4758cbd514957c67bd0f078ebffa5390ee2e2bf0f9e8defc/pyarrow-19.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:2f672f5364b2d7829ef7c94be199bb88bf5661dd485e21d2d37de12ccb78a136", size = 25241976 },
239 | { url = "https://files.pythonhosted.org/packages/f5/b9/ba07ed3dd6b6e4f379b78e9c47c50c8886e07862ab7fa6339ac38622d755/pyarrow-19.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:cf3bf0ce511b833f7bc5f5bb3127ba731e97222023a444b7359f3a22e2a3b463", size = 30651291 },
240 | { url = "https://files.pythonhosted.org/packages/ad/10/0d304243c8277035298a68a70807efb76199c6c929bb3363c92ac9be6a0d/pyarrow-19.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:4d8b0c0de0a73df1f1bf439af1b60f273d719d70648e898bc077547649bb8352", size = 32100461 },
241 | { url = "https://files.pythonhosted.org/packages/8a/61/bcfc5182e11831bca3f849945b9b106e09fd10ded773dff466658e972a45/pyarrow-19.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a92aff08e23d281c69835e4a47b80569242a504095ef6a6223c1f6bb8883431d", size = 41132491 },
242 | { url = "https://files.pythonhosted.org/packages/8e/87/2915a29049ec352dc69a967fbcbd76b0180319233de0daf8bd368df37099/pyarrow-19.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3b78eff5968a1889a0f3bc81ca57e1e19b75f664d9c61a42a604bf9d8402aae", size = 42192529 },
243 | { url = "https://files.pythonhosted.org/packages/48/18/44e5542b2707a8afaf78b5b88c608f261871ae77787eac07b7c679ca6f0f/pyarrow-19.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:b34d3bde38eba66190b215bae441646330f8e9da05c29e4b5dd3e41bde701098", size = 40495363 },
244 | { url = "https://files.pythonhosted.org/packages/ba/d6/5096deb7599bbd20bc2768058fe23bc725b88eb41bee58303293583a2935/pyarrow-19.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:5418d4d0fab3a0ed497bad21d17a7973aad336d66ad4932a3f5f7480d4ca0c04", size = 42074075 },
245 | { url = "https://files.pythonhosted.org/packages/2c/df/e3c839c04c284c9ec3d62b02a8c452b795d9b07b04079ab91ce33484d4c5/pyarrow-19.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:e82c3d5e44e969c217827b780ed8faf7ac4c53f934ae9238872e749fa531f7c9", size = 25239803 },
246 | { url = "https://files.pythonhosted.org/packages/6a/d3/a6d4088e906c7b5d47792256212606d2ae679046dc750eee0ae167338e5c/pyarrow-19.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:f208c3b58a6df3b239e0bb130e13bc7487ed14f39a9ff357b6415e3f6339b560", size = 30695401 },
247 | { url = "https://files.pythonhosted.org/packages/94/25/70040fd0e397dd1b937f459eaeeec942a76027357491dca0ada09d1322af/pyarrow-19.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:c751c1c93955b7a84c06794df46f1cec93e18610dcd5ab7d08e89a81df70a849", size = 32104680 },
248 | { url = "https://files.pythonhosted.org/packages/4e/f9/92783290cc0d80ca16d34b0c126305bfacca4b87dd889c8f16c6ef2a8fd7/pyarrow-19.0.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b903afaa5df66d50fc38672ad095806443b05f202c792694f3a604ead7c6ea6e", size = 41076754 },
249 | { url = "https://files.pythonhosted.org/packages/05/46/2c9870f50a495c72e2b8982ae29a9b1680707ea936edc0de444cec48f875/pyarrow-19.0.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a22a4bc0937856263df8b94f2f2781b33dd7f876f787ed746608e06902d691a5", size = 42163133 },
250 | { url = "https://files.pythonhosted.org/packages/7b/2f/437922b902549228fb15814e8a26105bff2787ece466a8d886eb6699efad/pyarrow-19.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:5e8a28b918e2e878c918f6d89137386c06fe577cd08d73a6be8dafb317dc2d73", size = 40452210 },
251 | { url = "https://files.pythonhosted.org/packages/36/ef/1d7975053af9d106da973bac142d0d4da71b7550a3576cc3e0b3f444d21a/pyarrow-19.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:29cd86c8001a94f768f79440bf83fee23963af5e7bc68ce3a7e5f120e17edf89", size = 42077618 },
252 | ]
253 |
254 | [[package]]
255 | name = "pygments"
256 | version = "2.19.1"
257 | source = { registry = "https://pypi.org/simple" }
258 | sdist = { url = "https://files.pythonhosted.org/packages/7c/2d/c3338d48ea6cc0feb8446d8e6937e1408088a72a39937982cc6111d17f84/pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f", size = 4968581 }
259 | wheels = [
260 | { url = "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", size = 1225293 },
261 | ]
262 |
263 | [[package]]
264 | name = "python-dateutil"
265 | version = "2.9.0.post0"
266 | source = { registry = "https://pypi.org/simple" }
267 | dependencies = [
268 | { name = "six" },
269 | ]
270 | sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432 }
271 | wheels = [
272 | { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892 },
273 | ]
274 |
275 | [[package]]
276 | name = "pytz"
277 | version = "2024.2"
278 | source = { registry = "https://pypi.org/simple" }
279 | sdist = { url = "https://files.pythonhosted.org/packages/3a/31/3c70bf7603cc2dca0f19bdc53b4537a797747a58875b552c8c413d963a3f/pytz-2024.2.tar.gz", hash = "sha256:2aa355083c50a0f93fa581709deac0c9ad65cca8a9e9beac660adcbd493c798a", size = 319692 }
280 | wheels = [
281 | { url = "https://files.pythonhosted.org/packages/11/c3/005fcca25ce078d2cc29fd559379817424e94885510568bc1bc53d7d5846/pytz-2024.2-py2.py3-none-any.whl", hash = "sha256:31c7c1817eb7fae7ca4b8c7ee50c72f93aa2dd863de768e1ef4245d426aa0725", size = 508002 },
282 | ]
283 |
284 | [[package]]
285 | name = "six"
286 | version = "1.17.0"
287 | source = { registry = "https://pypi.org/simple" }
288 | sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031 }
289 | wheels = [
290 | { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050 },
291 | ]
292 |
293 | [[package]]
294 | name = "stack-data"
295 | version = "0.6.3"
296 | source = { registry = "https://pypi.org/simple" }
297 | dependencies = [
298 | { name = "asttokens" },
299 | { name = "executing" },
300 | { name = "pure-eval" },
301 | ]
302 | sdist = { url = "https://files.pythonhosted.org/packages/28/e3/55dcc2cfbc3ca9c29519eb6884dd1415ecb53b0e934862d3559ddcb7e20b/stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9", size = 44707 }
303 | wheels = [
304 | { url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521 },
305 | ]
306 |
307 | [[package]]
308 | name = "tabulate"
309 | version = "0.9.0"
310 | source = { registry = "https://pypi.org/simple" }
311 | sdist = { url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c", size = 81090 }
312 | wheels = [
313 | { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252 },
314 | ]
315 |
316 | [[package]]
317 | name = "traitlets"
318 | version = "5.14.3"
319 | source = { registry = "https://pypi.org/simple" }
320 | sdist = { url = "https://files.pythonhosted.org/packages/eb/79/72064e6a701c2183016abbbfedaba506d81e30e232a68c9f0d6f6fcd1574/traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7", size = 161621 }
321 | wheels = [
322 | { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359 },
323 | ]
324 |
325 | [[package]]
326 | name = "tzdata"
327 | version = "2025.1"
328 | source = { registry = "https://pypi.org/simple" }
329 | sdist = { url = "https://files.pythonhosted.org/packages/43/0f/fa4723f22942480be4ca9527bbde8d43f6c3f2fe8412f00e7f5f6746bc8b/tzdata-2025.1.tar.gz", hash = "sha256:24894909e88cdb28bd1636c6887801df64cb485bd593f2fd83ef29075a81d694", size = 194950 }
330 | wheels = [
331 | { url = "https://files.pythonhosted.org/packages/0f/dd/84f10e23edd882c6f968c21c2434fe67bd4a528967067515feca9e611e5e/tzdata-2025.1-py2.py3-none-any.whl", hash = "sha256:7e127113816800496f027041c570f50bcd464a020098a3b6b199517772303639", size = 346762 },
332 | ]
333 |
334 | [[package]]
335 | name = "wcwidth"
336 | version = "0.2.13"
337 | source = { registry = "https://pypi.org/simple" }
338 | sdist = { url = "https://files.pythonhosted.org/packages/6c/63/53559446a878410fc5a5974feb13d31d78d752eb18aeba59c7fef1af7598/wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5", size = 101301 }
339 | wheels = [
340 | { url = "https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859", size = 34166 },
341 | ]
342 |
--------------------------------------------------------------------------------
/GO_terms_search/4-GO-terms-search-analysis.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "02b2641f",
6 | "metadata": {},
7 | "source": [
8 | "#### Here we investigate the relationship between:\n",
9 | " - mRNA level predictability of a landmark gene \n",
10 | " and \n",
11 | " - its known organelle level biological function using GO annotations"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "id": "367cc88b",
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "import pandas as pd\n",
22 | "import numpy as np\n",
23 | "def locations_of_substring(string, substring):\n",
24 | " \"\"\"Return a list of locations of a substring.\"\"\"\n",
25 | " substring_length = len(substring) \n",
26 | " def recurse(locations_found, start):\n",
27 | " location = string.find(substring, start)\n",
28 | " if location != -1:\n",
29 | " return recurse(locations_found + [location], location+substring_length)\n",
30 | " else:\n",
31 | " return locations_found\n",
32 | " return recurse([], 0)"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "id": "d80e3f38",
38 | "metadata": {},
39 | "source": [
40 | "#### For LUAD dataset:\n",
41 | "\n",
42 | "1 - Read predictability map of categorical features (using MLP model)\n",
43 | "\n",
44 | "2 - Assign the feature categories to compartments/stains\n",
45 | "\n",
46 | "3 - Read functional annotations of the reference set according to DAVID's output and add columns for each channel\n",
47 | " - Add channel specific annotation to each columns channel\n",
48 | " "
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": null,
54 | "id": "00bc67ee",
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "import sys\n",
59 | "sys.path.insert(0, '../utils/') \n",
60 | "from readProfiles import rename_affyprobe_to_genename\n",
61 | "from saveAsNewSheetToExistingFile import saveAsNewSheetToExistingFile\n",
62 | "########### 1 ###########\n",
63 | "filename='../results/SingleGenePred_cpCategoryMap/cat_scores_maps.xlsx'\n",
64 | "saved_scores=pd.read_excel(filename, sheet_name=None)\n",
65 | "# which_ds_model='LUAD-9-MLP-ht'\n",
66 | "which_ds_model='LUAD-9-MLP-keras-ht'\n",
67 | "# which_ds_model='LUAD-9-lasso-ht'\n",
68 | "dfcats=saved_scores[which_ds_model].rename(columns={'Unnamed: 0':'ID'})\n",
69 | "dfcats=dfcats[dfcats.columns[~dfcats.isna().any()].tolist()]\n",
70 | "dfcats2,_=rename_affyprobe_to_genename(dfcats.set_index('ID').T,dfcats.ID.tolist())\n",
71 | "dfcats=dfcats2.T.reset_index()\n",
72 | "\n",
73 | "\n",
74 | "########### 2 ###########\n",
75 | "Channelss=['DNA','RNA','AGP','Mito','ER']\n",
76 | "Channelss_cats=['DNA|Nuclei_AreaShape','RNA','AGP|Cytoplasm_AreaShape|Cells_AreaShape','Mito','ER']\n",
77 | "# Channelss_cats=['DNA|Nuclei_AreaShape','RNA','AGP|Cytoplasm_AreaShape','Mito','ER']\n",
78 | "\n",
79 | "for ci in range(len(Channelss)):\n",
80 | " dfcats['max_'+Channelss[ci]]=dfcats.loc[:,dfcats.columns.str.contains(Channelss_cats[ci])].max(axis=1)\n",
81 | "\n",
82 | " \n",
83 | "dfcats['top_channel']=dfcats[['max_'+Channelss[ci] for ci in range(len(Channelss))]].idxmax(axis=\"columns\")\n",
84 | "########### 3 ###########\n",
85 | "# gene_cats_bpcc=pd.read_csv('./go_BP_CC_MF_DIRECT_921.txt',delimiter='\\t')\n",
86 | "gene_cats_bpcc=pd.read_csv('./source/GO_bp_cc_mf_direct_LUAD_976.txt',delimiter='\\t')\n",
87 | "comps=['mitochondri','golgi','membrane','cytoskeleton','actin','endoplasmic','rna','nucleol',\\\n",
88 | " 'cell division','mitosis','mitotic','cell cycle']\n",
89 | "\n",
90 | "# GOTERM_BP_DIRECT\n",
91 | "# GOTERM_CC_DIRECT\n",
92 | "# GOTERM_MF_DIRECT\n",
93 | "for c in comps:\n",
94 | " gene_cats_bpcc[c]=gene_cats_bpcc['GOTERM_CC_DIRECT'].astype(str).str.lower().apply(lambda x:\\\n",
95 | " ''.join([x[:si].split('~')[-1]+x[si:].split('go')[0] for si in locations_of_substring(x,c)]) if c in x else '')+\\\n",
96 | " gene_cats_bpcc['GOTERM_MF_DIRECT'].astype(str).str.lower().apply(lambda x:\\\n",
97 | " ''.join([x[:si].split('~')[-1]+x[si:].split('go')[0] for si in locations_of_substring(x,c)]) if c in x else '')+\\\n",
98 | " gene_cats_bpcc['GOTERM_BP_DIRECT'].astype(str).str.lower().apply(lambda x:\\\n",
99 | " ''.join([x[:si].split('~')[-1]+x[si:].split('go')[0] for si in locations_of_substring(x,c)]) if c in x else '')#+\\\n",
100 | "# gene_cats_bpcc['UP_KW_BIOLOGICAL_PROCESS'].astype(str).str.lower().apply(lambda x:\\\n",
101 | "# ''.join([x[:si].split('~')[-1]+x[si:].split('kw')[0] for si in locations_of_substring(x,c)]) if c in x else '')+\\\n",
102 | "# gene_cats_bpcc['UP_KW_CELLULAR_COMPONENT'].astype(str).str.lower().apply(lambda x:\\\n",
103 | "# ''.join([x[:si].split('~')[-1]+x[si:].split('kw')[0] for si in locations_of_substring(x,c)]) if c in x else '')+\\\n",
104 | "# gene_cats_bpcc['UP_KW_MOLECULAR_FUNCTION'].astype(str).str.lower().apply(lambda x:\\\n",
105 | "# ''.join([x[:si].split('~')[-1]+x[si:].split('kw')[0] for si in locations_of_substring(x,c)]) if c in x else '')#+\\\n",
106 | "# # gene_cats_bpcc['UP_SEQ_FEATURE'].astype(str).str.lower().apply(lambda x:\\\n",
107 | "# ''.join([x[:si].split(':')[-1]+','+x[si:].split(',')[0] for si in locations_of_substring(x,c)]) if c in x else '')\n",
108 | " \n",
109 | "gene_cats_bpcc['RNA_nucleoli']=gene_cats_bpcc['rna']+gene_cats_bpcc['nucleol']\n",
110 | "gene_cats_bpcc['DNA']=gene_cats_bpcc['cell division']+gene_cats_bpcc['mitosis']+\\\n",
111 | "gene_cats_bpcc['mitotic']+gene_cats_bpcc['cell cycle']\n",
112 | "\n",
113 | "gene_cats_bpcc['cytoskeleton-actin']=gene_cats_bpcc['cytoskeleton']+gene_cats_bpcc['actin']+\\\n",
114 | "gene_cats_bpcc['golgi']+gene_cats_bpcc['membrane']\n",
115 | "# gene_cats_bpcc['ER']=gene_cats_bpcc['endoplasmic']\n",
116 | "gene_cats_bpcc['mitochondria']=gene_cats_bpcc['mitochondri']\n",
117 | "\n",
118 | "gene_cats_bpcc=pd.merge(gene_cats_bpcc,dfcats,how='inner',on='ID')\n",
119 | "\n",
120 | "#########################\n",
121 | "Channelss_dict={'DNA':'DNA','RNA_nucleoli':'RNA','cytoskeleton-actin':'AGP','mitochondria':'Mito','endoplasmic':'ER'}\n",
122 | "Chan_rev_dict = dict(zip(Channelss_dict.values(),Channelss_dict.keys()))\n",
123 | "\n",
124 | "comps2=['mitochondri','cytoskeleton-actin','endoplasmic','RNA_nucleoli','DNA']\n",
125 | "gene_cats_bpcc['any_comps']=''\n",
126 | "for co in comps2:\n",
127 | " gene_cats_bpcc['any_comps']=gene_cats_bpcc['any_comps']+gene_cats_bpcc[co]\n",
128 | " \n",
129 | " \n",
130 | " \n",
131 | "# gene_cats_bpcc=gene_cats_bpcc[gene_cats_bpcc['any_comps']!=''].reset_index(drop=True) \n",
132 | "from sklearn.metrics import confusion_matrix\n",
133 | "from scipy.stats import fisher_exact\n",
134 | "# top_bool=(gene_cats_bpcc['top58']==True).values\n",
135 | "table2=pd.DataFrame(index=Channelss_dict.keys(),columns=Channelss_dict.values())\n",
136 | "table3=pd.DataFrame(index=Channelss,columns=['odds ratio','med_restComp_oddsratio','restComp_oddsratio','anyComp_oddsratio','top-ratio'])\n",
137 | "# table3=pd.DataFrame(index=Channelss,columns=['Prevalence','anyComp-Prevalence','noComp-Prevalence','top-ratio'])\n",
138 | "\n",
139 | "table=[]\n",
140 | "for c in Channelss:\n",
141 | " print(c)\n",
142 | " c_r=list(set(Channelss)-set([c]))\n",
143 | "# gene_cats_bpcc_highP=gene_cats_bpcc[(gene_cats_bpcc['max_'+c]>0.7)]\n",
144 | "# gene_cats_bpcc_lowP=gene_cats_bpcc[(gene_cats_bpcc['max_'+c]<0.1)]\n",
145 | " \n",
146 | " low_ind=gene_cats_bpcc[['max_'+c]].sort_values(by='max_'+c)[:100].index\n",
147 | " high_ind=gene_cats_bpcc[['max_'+c]].sort_values(by='max_'+c)[-100:].index \n",
148 | " gene_cats_bpcc_highP=gene_cats_bpcc.loc[high_ind].reset_index(drop=True)\n",
149 | " gene_cats_bpcc_lowP=gene_cats_bpcc.loc[low_ind].reset_index(drop=True)\n",
150 | " gene_cats_bpcc2=gene_cats_bpcc.copy()\n",
151 | " \n",
152 | "# top_bool=(gene_cats_bpcc['max_'+c]>.6).values\n",
153 | " top_bool=(gene_cats_bpcc['max_'+c]<0).values\n",
154 | " \n",
155 | " print(np.sum(gene_cats_bpcc.loc[gene_cats_bpcc['max_'+c]<0,['max_'+Channelss[ci] for ci in range(len(Channelss))]].max(axis=1)>0.3))\n",
156 | " n_top=sum(top_bool)\n",
157 | " print(n_top)\n",
158 | " \n",
159 | " comps2=['mitochondria','cytoskeleton-actin','endoplasmic','RNA_nucleoli','DNA']\n",
160 | " table1=pd.DataFrame(index=comps2+['any comp','no comp'],columns=['Prevalence','p-value','odds ratio'])\n",
161 | " for co in comps2:\n",
162 | " \n",
163 | " enr_ratio=gene_cats_bpcc2[top_bool & (gene_cats_bpcc2[co]!='')].shape[0]/\\\n",
164 | " gene_cats_bpcc2[(gene_cats_bpcc2[co]!='')].shape[0]\n",
165 | " comp_bool=(gene_cats_bpcc2[co]!='').values\n",
166 | " oddsratio, pvalue = fisher_exact(confusion_matrix(top_bool, comp_bool))\n",
167 | " \n",
168 | " table1.loc[co,['Prevalence','p-value','odds ratio']]=enr_ratio*100,pvalue,oddsratio\n",
169 | "# print(co,\": \",np.round(enr_ratio*100,2),'% ','pvalue:', np.round(pvalue,2),' oddsratio:',np.round(oddsratio,2))\n",
170 | " table2.loc[co,c]=oddsratio\n",
171 | " \n",
172 | " \n",
173 | " enr_ratio=gene_cats_bpcc[top_bool & (gene_cats_bpcc['any_comps']!='')].shape[0]/\\\n",
174 | " gene_cats_bpcc[(gene_cats_bpcc['any_comps']!='')].shape[0]\n",
175 | " any_oddsratio, pvalue = fisher_exact(confusion_matrix(top_bool, (gene_cats_bpcc['any_comps']!='').values))\n",
176 | "# print(\"any_comps: \",np.round(enr_ratio*100,2),'% ','pvalue:', np.round(pvalue,2),' oddsratio:',np.round(oddsratio,2))\n",
177 | " table1.loc['any comp',['Prevalence','p-value','odds ratio']]=enr_ratio*100,pvalue,any_oddsratio\n",
178 | " print(any_oddsratio)\n",
179 | "\n",
180 | " nocomp_enr_ratio=gene_cats_bpcc[(top_bool) & (gene_cats_bpcc['any_comps']=='')].shape[0]/\\\n",
181 | "gene_cats_bpcc[(gene_cats_bpcc['any_comps']=='')].shape[0]\n",
182 | "\n",
183 | " comps2=['mitochondria','cytoskeleton-actin','endoplasmic','RNA_nucleoli','DNA']\n",
184 | " comps2.remove(Chan_rev_dict[c])\n",
185 | " \n",
186 | " med_restComp_oddsratio=table2.loc[comps2,c].median()\n",
187 | " gene_cats_bpcc['rest_comps']=''\n",
188 | " for co in comps2:\n",
189 | " gene_cats_bpcc['rest_comps']=gene_cats_bpcc['rest_comps']+gene_cats_bpcc[co] \n",
190 | " \n",
191 | " rest_enr_ratio=gene_cats_bpcc[(top_bool) & (gene_cats_bpcc['rest_comps']=='')].shape[0]/\\\n",
192 | "gene_cats_bpcc[(gene_cats_bpcc['rest_comps']=='')].shape[0] \n",
193 | " rest_oddsratio, pvalue = fisher_exact(confusion_matrix(top_bool, (gene_cats_bpcc['rest_comps']!='').values))\n",
194 | " \n",
195 | " table.append(table1)\n",
196 | "# print('num top ('+ str(n_top)+')/total genes (912): ', np.round((n_top/912)*100,2),'%')\n",
197 | " table3.loc[c,['odds ratio','med_restComp_oddsratio','restComp_oddsratio','anyComp_oddsratio','top-ratio']]=\\\n",
198 | " table1.loc[Chan_rev_dict[c],'odds ratio'],med_restComp_oddsratio,rest_oddsratio,\\\n",
199 | " any_oddsratio,np.round((n_top/912)*100,2)\n",
200 | " \n",
201 | "# table3.loc[c,['Prevalence','anyComp-Prevalence','noComp-Prevalence','top-ratio']]=table1.loc[Chan_rev_dict[c],'Prevalence'],\\\n",
202 | "# enr_ratio*100,nocomp_enr_ratio*100,np.round((n_top/912)*100,2) \n",
203 | " \n",
204 | "# print(Chan_rev_dict[c],': ',table1.loc[Chan_rev_dict[c],['Prevalence']].values)\n",
205 | " \n",
206 | "# # table3['dif']=table3['Prevalence']-table3['anyComp-Prevalence']\n",
207 | "table3['dif']=table3['odds ratio']-table3['restComp_oddsratio']\n",
208 | "table3['dif2']=table3['odds ratio']-table3['med_restComp_oddsratio']\n",
209 | "table3['dif3']=table3['odds ratio']-table3['anyComp_oddsratio']\n",
210 | "# print(table3['dif'].min(),table3['dif'].sum())\n",
211 | "# table3\n",
212 | "\n",
213 | "source_data_add='../results/Figs_Source_Data.xlsx'\n",
214 | "if 0:\n",
215 | " saveAsNewSheetToExistingFile(source_data_add,pd.concat([table3.astype(float).round(3)[['odds ratio','restComp_oddsratio']],x1],axis=1),'ExtendedData5')"
216 | ]
217 | },
218 | {
219 | "cell_type": "markdown",
220 | "id": "2a98dcc6",
221 | "metadata": {},
222 | "source": [
223 | "## GO terms search for overlap of highly predictable genes (top 58)"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "id": "87d28528",
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "# Enrichment of compartment-related GO terms among the top predictable genes.\n",
234 | "# For each gene, the names of GO terms containing a compartment keyword are\n",
235 | "# collected per keyword; Fisher's exact test then compares keyword presence\n",
236 | "# between the genes listed in the top-genes file and the remaining genes.\n",
237 | "import sys\n",
238 | "sys.path.insert(0, '../utils/')\n",
239 | "from readProfiles import rename_affyprobe_to_genename\n",
240 | "from saveAsNewSheetToExistingFile import saveAsNewSheetToExistingFile\n",
241 | "from sklearn.metrics import confusion_matrix\n",
242 | "from scipy.stats import fisher_exact\n",
243 | "\n",
244 | "top58=pd.read_csv('./source/top_59_atleast_topIn3.txt',header=None)[0].tolist()\n",
245 | "gene_cats_bpcc=pd.read_csv('./source/GO_bp_cc_mf_direct_intersection_782.txt',delimiter='\\t')\n",
246 | "\n",
247 | "# Lowercase keyword fragments searched for in the GO annotation text.\n",
248 | "comps=['mitochondri','golgi','membrane','cytoskeleton','actin','endoplasmic','rna','nucleol',\n",
249 | "       'cell division','mitosis','mitotic','cell cycle','cytokine','hormone']\n",
250 | "go_cols=['GOTERM_BP_DIRECT','GOTERM_CC_DIRECT','GOTERM_MF_DIRECT']\n",
251 | "\n",
252 | "def terms_with_keyword(annot, keyword):\n",
253 | "    \"\"\"Return the concatenated GO term names in lowercased `annot` that contain `keyword`.\"\"\"\n",
254 | "    if keyword not in annot:\n",
255 | "        return ''\n",
256 | "    # Widen every hit from the preceding '~' (start of the term name) up to the next\n",
257 | "    # 'go:' accession; assumes DAVID-style 'GO:<id>~<name>,' entries -- TODO confirm.\n",
258 | "    # The text is lowercased beforehand, so the former split on 'GO' could never match\n",
259 | "    # and every hit carried the whole remainder of the annotation string.\n",
260 | "    return ''.join(annot[:si].split('~')[-1]+annot[si:].split('go:')[0]\n",
261 | "                   for si in locations_of_substring(annot,keyword))\n",
262 | "\n",
263 | "lowered={col:gene_cats_bpcc[col].astype(str).str.lower() for col in go_cols}\n",
264 | "for c in comps:\n",
265 | "    gene_cats_bpcc[c]=''\n",
266 | "    for col in go_cols:\n",
267 | "        gene_cats_bpcc[c]=gene_cats_bpcc[c]+lowered[col].apply(lambda x: terms_with_keyword(x,c))\n",
268 | "\n",
269 | "# Merge related keywords into the compartments that are tested below.\n",
270 | "gene_cats_bpcc['RNA_nucleoli']=gene_cats_bpcc['rna']+gene_cats_bpcc['nucleol']\n",
271 | "gene_cats_bpcc['DNA']=(gene_cats_bpcc['cell division']+gene_cats_bpcc['mitosis']+\n",
272 | "                       gene_cats_bpcc['mitotic']+gene_cats_bpcc['cell cycle'])\n",
273 | "gene_cats_bpcc['cytoskeleton-actin']=gene_cats_bpcc['cytoskeleton']+gene_cats_bpcc['actin']\n",
274 | "\n",
275 | "# Flag the listed genes; the comparison with True below treats every other row as not-top.\n",
276 | "gene_cats_bpcc.loc[gene_cats_bpcc['ID'].isin(top58),'top58']=True\n",
277 | "\n",
278 | "#####################################\n",
279 | "comps2=['mitochondri','golgi','membrane','cytoskeleton-actin','endoplasmic','RNA_nucleoli','DNA']\n",
280 | "top_bool=(gene_cats_bpcc['top58']==True).values\n",
281 | "table1=pd.DataFrame(index=comps2+['any comp','no comp'],columns=['Prevalence','p-value','odds ratio'])\n",
282 | "\n",
283 | "def report_enrichment(row, group_bool, *label):\n",
284 | "    \"\"\"Write prevalence (%), Fisher p-value and odds ratio of a gene group into table1 and print them.\"\"\"\n",
285 | "    n_group=int(group_bool.sum())\n",
286 | "    # Prevalence: share of the group's genes that are top genes (NaN for an empty group).\n",
287 | "    enr_ratio=int((top_bool & group_bool).sum())/n_group if n_group else np.nan\n",
288 | "    # Fixed labels keep the contingency table 2x2 even when one class is absent.\n",
289 | "    contingency=confusion_matrix(top_bool, group_bool, labels=[False,True])\n",
290 | "    oddsratio, pvalue = fisher_exact(contingency)\n",
291 | "    table1.loc[row,['Prevalence','p-value','odds ratio']]=enr_ratio*100,pvalue,oddsratio\n",
292 | "    print(*label,np.round(enr_ratio*100,2),'% ','pvalue:', np.round(pvalue,2),' oddsratio:',np.round(oddsratio,2))\n",
293 | "\n",
294 | "# One row per compartment, then genes with any / without any compartment keyword.\n",
295 | "for co in comps2:\n",
296 | "    report_enrichment(co,(gene_cats_bpcc[co]!='').values,co,': ')\n",
297 | "\n",
298 | "gene_cats_bpcc['any_comps']=''\n",
299 | "for co in comps2:\n",
300 | "    gene_cats_bpcc['any_comps']=gene_cats_bpcc['any_comps']+gene_cats_bpcc[co]\n",
301 | "any_bool=(gene_cats_bpcc['any_comps']!='').values\n",
302 | "report_enrichment('any comp',any_bool,'any_comps: ')\n",
303 | "report_enrichment('no comp',~any_bool,'no comps: ')\n",
304 | "\n",
305 | "source_data_add='../results/Figs_Source_Data.xlsx'\n",
306 | "if 0:\n",
307 | "    saveAsNewSheetToExistingFile(source_data_add,table1,'ExtendedData6')\n",
308 | "\n",
309 | "# Optional export of the per-gene keyword table (kept disabled):\n",
310 | "# gene_cats_bpcc.to_csv('./source/GO_bp_cc_mf_direct_intersection_782_completed.csv',index=False)\n"
311 | ]
312 | },
313 | {
314 | "cell_type": "code",
315 | "execution_count": null,
316 | "id": "83ea53e3",
317 | "metadata": {},
318 | "outputs": [],
319 | "source": [
320 | "table1.loc[:, 'odds ratio'].astype(float).round(2)  # odds ratios as floats, two decimals"
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": null,
326 | "id": "4d4a04f6",
327 | "metadata": {},
328 | "outputs": [],
329 | "source": [
330 | "table1.loc[:, 'odds ratio'].astype(float).round(2).values  # same odds ratios as a plain array"
331 | ]
332 | }
333 | ],
334 | "metadata": {
335 | "kernelspec": {
336 | "display_name": "Python 3 (ipykernel)",
337 | "language": "python",
338 | "name": "python3"
339 | },
340 | "language_info": {
341 | "codemirror_mode": {
342 | "name": "ipython",
343 | "version": 3
344 | },
345 | "file_extension": ".py",
346 | "mimetype": "text/x-python",
347 | "name": "python",
348 | "nbconvert_exporter": "python",
349 | "pygments_lexer": "ipython3",
350 | "version": "3.8.12"
351 | }
352 | },
353 | "nbformat": 4,
354 | "nbformat_minor": 5
355 | }
356 |
--------------------------------------------------------------------------------
/GO_terms_search/source/LUAD_geneSymbols_978.txt:
--------------------------------------------------------------------------------
1 | AARS1
2 | ABCB6
3 | ABCC5
4 | ABCF1
5 | ABCF3
6 | ABHD4
7 | ABHD6
8 | ABL1
9 | ACAA1
10 | ACAT2
11 | ACBD3
12 | ACD
13 | ACLY
14 | ACOT9
15 | ADAM10
16 | ADAT1
17 | ADGRE2
18 | ADGRG1
19 | ADH5
20 | ADI1
21 | ADO
22 | ADRB2
23 | AGER
24 | AGL
25 | AKAP8
26 | AKAP8L
27 | AKR7A2
28 | AKT1
29 | ALAS1
30 | ALDH7A1
31 | ALDOA
32 | ALDOC
33 | AMDHD2
34 | ANKRD10
35 | ANO10
36 | ANXA7
37 | APBB2
38 | APOE
39 | APP
40 | APPBP2
41 | ARFIP2
42 | ARHGAP1
43 | ARHGEF12
44 | ARHGEF2
45 | ARID4B
46 | ARID5B
47 | ARL4C
48 | ARNT2
49 | ARPP19
50 | ASAH1
51 | ASCC3
52 | ATF1
53 | ATF5
54 | ATF6
55 | ATG3
56 | ATMIN
57 | ATP11B
58 | ATP1B1
59 | ATP2C1
60 | ATP6V0B
61 | ATP6V1D
62 | AURKA
63 | AURKB
64 | AXIN1
65 | B3GNT2
66 | BACE2
67 | BAD
68 | BAG3
69 | BAMBI
70 | BAX
71 | BCL2
72 | BCL7B
73 | BDH1
74 | BECN1
75 | BHLHE40
76 | BID
77 | BIRC2
78 | BIRC5
79 | BLCAP
80 | BLMH
81 | BLTP2
82 | BLVRA
83 | BMP4
84 | BNIP3
85 | BNIP3L
86 | BPHL
87 | BRCA1
88 | BTK
89 | BUB1B
90 | BZW2
91 | C2CD2
92 | C2CD2L
93 | C2CD5
94 | C5
95 | CAB39
96 | CALM1
97 | CALU
98 | CAMSAP2
99 | CANT1
100 | CAPN1
101 | CARMIL1
102 | CASC3
103 | CASK
104 | CASP10
105 | CASP2
106 | CASP3
107 | CASP7
108 | CAST
109 | CAT
110 | CBLB
111 | CBR1
112 | CBR3
113 | CCDC85B
114 | CCDC86
115 | CCDC92
116 | CCL2
117 | CCNA1
118 | CCNA2
119 | CCNB1
120 | CCNB2
121 | CCND1
122 | CCND3
123 | CCNE2
124 | CCNF
125 | CCNH
126 | CCP110
127 | CD320
128 | CD40
129 | CD44
130 | CD58
131 | CDC20
132 | CDC25A
133 | CDC25B
134 | CDC42
135 | CDC45
136 | CDCA4
137 | CDH3
138 | CDK19
139 | CDK2
140 | CDK4
141 | CDK5R1
142 | CDK6
143 | CDK7
144 | CDKN1A
145 | CDKN1B
146 | CDKN2A
147 | CEBPA
148 | CEBPD
149 | CEBPZ
150 | CEMIP2
151 | CENPE
152 | CEP57
153 | CERK
154 | CETN3
155 | CFLAR
156 | CGRRF1
157 | CHAC1
158 | CHEK1
159 | CHEK2
160 | CHERP
161 | CHIC2
162 | CHMP4A
163 | CHMP6
164 | CHN1
165 | CIAO3
166 | CIAPIN1
167 | CIRBP
168 | CISD1
169 | CLIC4
170 | CLPX
171 | CLSTN1
172 | CLTB
173 | CLTC
174 | CNDP2
175 | CNOT4
176 | CNPY3
177 | COASY
178 | COG2
179 | COG4
180 | COG7
181 | COL1A1
182 | COL4A1
183 | COPB2
184 | COPS7A
185 | COQ8A
186 | CORO1A
187 | CPNE3
188 | CPSF4
189 | CRAMP1
190 | CREB1
191 | CREG1
192 | CRELD2
193 | CRK
194 | CRKL
195 | CRTAP
196 | CRYZ
197 | CSK
198 | CSNK1A1
199 | CSNK1E
200 | CSNK2A2
201 | CSRP1
202 | CTNNAL1
203 | CTNND1
204 | CTSD
205 | CTSL
206 | CTTN
207 | CXCL2
208 | CXCR4
209 | CYB561
210 | CYCS
211 | CYTH1
212 | DAG1
213 | DAXX
214 | DCK
215 | DCTD
216 | DCUN1D4
217 | DDB2
218 | DDIT4
219 | DDR1
220 | DDX10
221 | DDX42
222 | DECR1
223 | DENND2D
224 | DERA
225 | DFFA
226 | DFFB
227 | DHDDS
228 | DHRS7
229 | DHX29
230 | DIPK1A
231 | DLD
232 | DMAC2L
233 | DMTF1
234 | DNAJA3
235 | DNAJB1
236 | DNAJB2
237 | DNAJB6
238 | DNAJC15
239 | DNM1
240 | DNM1L
241 | DNMT1
242 | DNMT3A
243 | DNTTIP2
244 | DPH2
245 | DRAP1
246 | DSG2
247 | DUSP11
248 | DUSP14
249 | DUSP22
250 | DUSP3
251 | DUSP4
252 | DUSP6
253 | DYNLT3
254 | DYRK3
255 | E2F2
256 | EAPP
257 | EBNA1BP2
258 | EBP
259 | ECD
260 | ECH1
261 | EDEM1
262 | EDN1
263 | EED
264 | EFCAB14
265 | EGF
266 | EGFR
267 | EGR1
268 | EIF4EBP1
269 | EIF4G1
270 | EIF5
271 | ELAC2
272 | ELAVL1
273 | ELOVL6
274 | ELP1
275 | EML3
276 | ENOPH1
277 | ENOSF1
278 | EPB41L2
279 | EPHA3
280 | EPHB2
281 | EPN2
282 | EPRS1
283 | ERBB2
284 | ERBB3
285 | ERO1A
286 | ETFB
287 | ETS1
288 | ETV1
289 | EVL
290 | EXOSC4
291 | EXT1
292 | EZH2
293 | FAH
294 | FAIM
295 | FAM20B
296 | FAS
297 | FASTKD5
298 | FAT1
299 | FBXL12
300 | FBXO11
301 | FBXO21
302 | FBXO7
303 | FCHO1
304 | FDFT1
305 | FEZ2
306 | FGFR2
307 | FGFR4
308 | FHL2
309 | FIS1
310 | FKBP14
311 | FKBP4
312 | FOS
313 | FOSL1
314 | FOXJ3
315 | FOXO3
316 | FOXO4
317 | FPGS
318 | FRS2
319 | FSD1
320 | FUT1
321 | FYN
322 | FZD1
323 | FZD7
324 | G3BP1
325 | GAA
326 | GABPB1
327 | GADD45A
328 | GADD45B
329 | GALE
330 | GAPDH
331 | GARRE1
332 | GATA2
333 | GATA3
334 | GDPD5
335 | GET1
336 | GFOD1
337 | GFPT1
338 | GFUS
339 | GHR
340 | GLI2
341 | GLOD4
342 | GLRX
343 | GMNN
344 | GNA11
345 | GNA15
346 | GNAI1
347 | GNAI2
348 | GNAS
349 | GNB5
350 | GNPDA1
351 | GOLT1B
352 | GPATCH8
353 | GPC1
354 | GPER1
355 | GRB10
356 | GRB7
357 | GRN
358 | GRWD1
359 | GSTM2
360 | GSTZ1
361 | GTF2A2
362 | GTF2E2
363 | GTPBP8
364 | H2AZ2
365 | H2BC12
366 | H2BC21
367 | HACD3
368 | HADH
369 | HAT1
370 | HDAC2
371 | HDAC6
372 | HDGFL3
373 | HEATR1
374 | HEBP1
375 | HERC6
376 | HERPUD1
377 | HES1
378 | HIF1A
379 | HK1
380 | HLA-DMA
381 | HLA-DRA
382 | HMG20B
383 | HMGA2
384 | HMGCR
385 | HMGCS1
386 | HMOX1
387 | HOMER2
388 | HOOK2
389 | HOXA10
390 | HOXA5
391 | HPRT1
392 | HS2ST1
393 | HSD17B10
394 | HSD17B11
395 | HSPA1A
396 | HSPA4
397 | HSPA8
398 | HSPB1
399 | HSPD1
400 | HTATSF1
401 | HTRA1
402 | HYOU1
403 | IARS2
404 | ICAM1
405 | ICAM3
406 | ICMT
407 | ID2
408 | IDE
409 | IER3
410 | IFNAR1
411 | IFRD2
412 | IGF1R
413 | IGF2BP2
414 | IGF2R
415 | IGFBP3
416 | IGHMBP2
417 | IKBKB
418 | IKBKE
419 | IKZF1
420 | IL13RA1
421 | IL1B
422 | IL4R
423 | ILK
424 | INPP1
425 | INPP4B
426 | INSIG1
427 | INTS3
428 | IPO13
429 | IQGAP1
430 | ISOC1
431 | ITFG1
432 | ITGAE
433 | ITGB1BP1
434 | ITGB5
435 | JADE2
436 | JMJD6
437 | JUN
438 | KAT6A
439 | KAT6B
440 | KCNK1
441 | KCTD5
442 | KDELR2
443 | KDM3A
444 | KDM5A
445 | KDM5B
446 | KEAP1
447 | KHDC4
448 | KIAA0753
449 | KIF14
450 | KIF20A
451 | KIF2C
452 | KIF5C
453 | KIFBP
454 | KIT
455 | KLHDC2
456 | KLHL21
457 | KLHL9
458 | KLK8
459 | KTN1
460 | LAGE3
461 | LAMA3
462 | LAP3
463 | LBR
464 | LGALS8
465 | LGMN
466 | LIG1
467 | LIPA
468 | LOXL1
469 | LPAR2
470 | LPGAT1
471 | LRP10
472 | LRPAP1
473 | LRRC41
474 | LSM5
475 | LSM6
476 | LSR
477 | LYN
478 | LYPLA1
479 | LYRM1
480 | MACF1
481 | MALT1
482 | MAMLD1
483 | MAN2B1
484 | MAP2K5
485 | MAP3K4
486 | MAP4K4
487 | MAP7
488 | MAPK13
489 | MAPK1IP1L
490 | MAPK9
491 | MAPKAPK2
492 | MAPKAPK3
493 | MAPKAPK5
494 | MAST2
495 | MAT2A
496 | MBNL1
497 | MBNL2
498 | MBOAT7
499 | MBTPS1
500 | MCM3
501 | MCOLN1
502 | MCUR1
503 | ME2
504 | MEF2C
505 | MELK
506 | MEST
507 | METRN
508 | MFSD10
509 | MICALL1
510 | MIF
511 | MINDY1
512 | MKNK1
513 | MLEC
514 | MLLT11
515 | MMP1
516 | MMP2
517 | MNAT1
518 | MPC2
519 | MPZL1
520 | MRPL12
521 | MRPL19
522 | MRPS16
523 | MRPS2
524 | MSH6
525 | MSRA
526 | MTA1
527 | MTERF3
528 | MTF2
529 | MTFR1
530 | MTHFD2
531 | MUC1
532 | MVP
533 | MYBL2
534 | MYC
535 | MYCBP
536 | MYCBP2
537 | MYL9
538 | MYLK
539 | MYO10
540 | NCAPD2
541 | NCK1
542 | NCK2
543 | NCOA3
544 | NENF
545 | NET1
546 | NFATC3
547 | NFATC4
548 | NFE2L2
549 | NFIL3
550 | NFKB2
551 | NFKBIA
552 | NFKBIB
553 | NFKBIE
554 | NGRN
555 | NIPSNAP1
556 | NISCH
557 | NIT1
558 | NMT1
559 | NNT
560 | NOL3
561 | NOLC1
562 | NOS3
563 | NOSIP
564 | NOTCH1
565 | NPC1
566 | NPDC1
567 | NPEPL1
568 | NPRL2
569 | NR1H2
570 | NR2F6
571 | NR3C1
572 | NRAS
573 | NRIP1
574 | NSDHL
575 | NT5DC2
576 | NUCB2
577 | NUDCD3
578 | NUDT9
579 | NUP133
580 | NUP62
581 | NUP85
582 | NUP88
583 | NUP93
584 | NUSAP1
585 | NVL
586 | ORC1
587 | OXA1L
588 | OXCT1
589 | OXSR1
590 | P4HA2
591 | P4HTM
592 | PACSIN3
593 | PAF1
594 | PAFAH1B1
595 | PAFAH1B3
596 | PAICS
597 | PAK1
598 | PAK4
599 | PAK6
600 | PAN2
601 | PARP1
602 | PARP2
603 | PAX8
604 | PCBD1
605 | PCCB
606 | PCK2
607 | PCM1
608 | PCMT1
609 | PCNA
610 | PDGFA
611 | PDHX
612 | PDIA5
613 | PDLIM1
614 | PDS5A
615 | PECR
616 | PEX11A
617 | PFKL
618 | PGAM1
619 | PGM1
620 | PGRMC1
621 | PHGDH
622 | PHKA1
623 | PHKB
624 | PHKG2
625 | PIGB
626 | PIH1D1
627 | PIK3C2B
628 | PIK3C3
629 | PIK3CA
630 | PIK3R3
631 | PIK3R4
632 | PIN1
633 | PIP4K2B
634 | PKIG
635 | PLA2G15
636 | PLA2G4A
637 | PLCB3
638 | PLEKHJ1
639 | PLEKHM1
640 | PLK1
641 | PLOD3
642 | PLP2
643 | PLS1
644 | PLSCR1
645 | PLSCR3
646 | PMAIP1
647 | PMM2
648 | PNKP
649 | POLB
650 | POLD1
651 | POLD4
652 | POLE2
653 | POLG2
654 | POLR1C
655 | POLR2I
656 | POLR2K
657 | POP4
658 | PPARD
659 | PPARG
660 | PPIC
661 | PPIE
662 | PPOX
663 | PPP1R13B
664 | PPP2R3C
665 | PPP2R5A
666 | PPP2R5E
667 | PRAF2
668 | PRCP
669 | PRKACA
670 | PRKAG2
671 | PRKCD
672 | PRKCH
673 | PRKCQ
674 | PRKX
675 | PROS1
676 | PRPF4
677 | PRR15L
678 | PRR7
679 | PRSS23
680 | PRUNE1
681 | PSIP1
682 | PSMB10
683 | PSMB8
684 | PSMD10
685 | PSMD2
686 | PSMD4
687 | PSMD9
688 | PSME1
689 | PSME2
690 | PSMF1
691 | PSMG1
692 | PSRC1
693 | PTGS2
694 | PTK2
695 | PTK2B
696 | PTPN1
697 | PTPN12
698 | PTPN6
699 | PTPRC
700 | PTPRF
701 | PTPRK
702 | PUF60
703 | PWP1
704 | PXMP2
705 | PXN
706 | PYCR1
707 | PYGL
708 | RAB11FIP2
709 | RAB21
710 | RAB27A
711 | RAB31
712 | RAB4A
713 | RAC2
714 | RAD51C
715 | RAD9A
716 | RAE1
717 | RAI14
718 | RALA
719 | RALB
720 | RALGDS
721 | RAP1GAP
722 | RASA1
723 | RB1
724 | RBKS
725 | RBM15B
726 | RBM34
727 | RBM6
728 | REEP5
729 | RELB
730 | RFC2
731 | RFC5
732 | RFNG
733 | RFX5
734 | RGS2
735 | RHEB
736 | RHOA
737 | RHOV
738 | RNF167
739 | RNH1
740 | RNMT
741 | RNPS1
742 | RPA1
743 | RPA2
744 | RPA3
745 | RPIA
746 | RPL39L
747 | RPN1
748 | RPP38
749 | RPS5
750 | RPS6
751 | RPS6KA1
752 | RRAGA
753 | RRP12
754 | RRP1B
755 | RRP8
756 | RRS1
757 | RSU1
758 | RTN2
759 | RUVBL1
760 | RXYLT1
761 | S100A13
762 | S100A4
763 | SACM1L
764 | SATB1
765 | SCAND1
766 | SCARB1
767 | SCCPDH
768 | SCP2
769 | SCRN1
770 | SCYL3
771 | SDHB
772 | SENP6
773 | SERPINE1
774 | SESN1
775 | SFN
776 | SGCB
777 | SH3BP5
778 | SHB
779 | SHC1
780 | SIRT3
781 | SKIC2
782 | SKIC8
783 | SKP1
784 | SLC11A2
785 | SLC1A4
786 | SLC25A13
787 | SLC25A14
788 | SLC25A4
789 | SLC25A46
790 | SLC27A3
791 | SLC2A6
792 | SLC35A1
793 | SLC35A3
794 | SLC35B1
795 | SLC35F2
796 | SLC37A4
797 | SLC5A6
798 | SMAD3
799 | SMARCA4
800 | SMARCC1
801 | SMARCD2
802 | SMC1A
803 | SMC3
804 | SMC4
805 | SMNDC1
806 | SNAP25
807 | SNCA
808 | SNX11
809 | SNX13
810 | SNX6
811 | SNX7
812 | SOCS2
813 | SORBS3
814 | SOX2
815 | SOX4
816 | SPAG4
817 | SPAG7
818 | SPDEF
819 | SPEN
820 | SPP1
821 | SPR
822 | SPRED2
823 | SPTAN1
824 | SPTLC2
825 | SQOR
826 | SQSTM1
827 | SRC
828 | SSBP2
829 | ST3GAL5
830 | ST6GALNAC2
831 | ST7
832 | STAMBP
833 | STAP2
834 | STAT1
835 | STAT3
836 | STAT5B
837 | STIMATE
838 | STK10
839 | STK25
840 | STMN1
841 | STUB1
842 | STX1A
843 | STX4
844 | STXBP1
845 | STXBP2
846 | SUPV3L1
847 | SUV39H1
848 | SUZ12
849 | SYK
850 | SYNE2
851 | SYNGR3
852 | SYPL1
853 | TARBP1
854 | TATDN2
855 | TBC1D31
856 | TBC1D9B
857 | TBP
858 | TBPL1
859 | TBX2
860 | TBXA2R
861 | TCEA2
862 | TCEAL4
863 | TCERG1
864 | TCFL5
865 | TCTA
866 | TCTN1
867 | TENT4A
868 | TERF2IP
869 | TERT
870 | TES
871 | TESK1
872 | TEX10
873 | TFAP2A
874 | TFDP1
875 | TGFB3
876 | TGFBR2
877 | THAP11
878 | TIAM1
879 | TICAM1
880 | TIMELESS
881 | TIMM17B
882 | TIMM22
883 | TIMM9
884 | TIMP2
885 | TIPARP
886 | TJP1
887 | TLCD3A
888 | TLE1
889 | TLK2
890 | TLR4
891 | TM9SF2
892 | TM9SF3
893 | TMCO1
894 | TMED10
895 | TMEM109
896 | TMEM50A
897 | TMEM97
898 | TNFRSF21
899 | TNIP1
900 | TOMM34
901 | TOMM70
902 | TOP2A
903 | TOPBP1
904 | TOR1A
905 | TP53
906 | TP53BP1
907 | TP53BP2
908 | TPD52L2
909 | TPM1
910 | TRAK2
911 | TRAM2
912 | TRAP1
913 | TRAPPC3
914 | TRAPPC6A
915 | TRIB1
916 | TRIB3
917 | TRIM13
918 | TRIM2
919 | TSC22D3
920 | TSEN2
921 | TSKU
922 | TSPAN3
923 | TSPAN4
924 | TSPAN6
925 | TUBB6
926 | TWF2
927 | TXLNA
928 | TXNDC9
929 | TXNL4B
930 | TXNRD1
931 | UBE2A
932 | UBE2C
933 | UBE2J1
934 | UBE2L6
935 | UBE3B
936 | UBE3C
937 | UBQLN2
938 | UBR7
939 | UFM1
940 | UGDH
941 | USP1
942 | USP14
943 | USP22
944 | USP6NL
945 | USP7
946 | UTP14A
947 | VAPB
948 | VAT1
949 | VAV3
950 | VDAC1
951 | VGLL4
952 | VPS28
953 | VPS72
954 | WASF3
955 | WASHC4
956 | WASHC5
957 | WDR7
958 | WDTC1
959 | WFS1
960 | WIPF2
961 | XBP1
962 | XPNPEP1
963 | XPO7
964 | YKT6
965 | YME1L1
966 | YTHDF1
967 | ZDHHC6
968 | ZFP36
969 | ZMIZ1
970 | ZMYM2
971 | ZNF131
972 | ZNF274
973 | ZNF318
974 | ZNF395
975 | ZNF451
976 | ZNF586
977 | ZNF589
978 | ZW10
979 |
--------------------------------------------------------------------------------
/GO_terms_search/source/intersection_geneSymbols_785.txt:
--------------------------------------------------------------------------------
1 | AARS1
2 | ABCB6
3 | ABCC5
4 | ABCF1
5 | ABCF3
6 | ABHD4
7 | ABHD6
8 | ABL1
9 | ACAA1
10 | ACAT2
11 | ACBD3
12 | ACD
13 | ACLY
14 | ACOT9
15 | ADAM10
16 | ADAT1
17 | ADGRE2
18 | ADGRG1
19 | ADH5
20 | ADI1
21 | ADO
22 | AGER
23 | AGL
24 | AKAP8
25 | AKAP8L
26 | AKR7A2
27 | AKT1
28 | ALAS1
29 | ALDH7A1
30 | ALDOC
31 | AMDHD2
32 | ANKRD10
33 | ANO10
34 | ANXA7
35 | APBB2
36 | APPBP2
37 | ARFIP2
38 | ARHGAP1
39 | ARHGEF2
40 | ARID4B
41 | ARID5B
42 | ARL4C
43 | ARNT2
44 | ARPP19
45 | ASCC3
46 | ATF5
47 | ATG3
48 | ATMIN
49 | ATP11B
50 | ATP1B1
51 | ATP2C1
52 | ATP6V0B
53 | ATP6V1D
54 | B3GNT2
55 | BACE2
56 | BAD
57 | BAG3
58 | BAMBI
59 | BCL2
60 | BCL7B
61 | BDH1
62 | BECN1
63 | BHLHE40
64 | BID
65 | BIRC5
66 | BLCAP
67 | BLMH
68 | BLTP2
69 | BLVRA
70 | BNIP3
71 | BPHL
72 | BUB1B
73 | BZW2
74 | C2CD2
75 | C2CD2L
76 | C2CD5
77 | CAB39
78 | CALU
79 | CAMSAP2
80 | CANT1
81 | CAPN1
82 | CARMIL1
83 | CASC3
84 | CASK
85 | CASP3
86 | CAST
87 | CAT
88 | CBR1
89 | CBR3
90 | CCDC85B
91 | CCDC86
92 | CCDC92
93 | CCNA2
94 | CCNB2
95 | CCNF
96 | CCP110
97 | CD320
98 | CD44
99 | CD58
100 | CDC45
101 | CDCA4
102 | CDH3
103 | CDK19
104 | CDK2
105 | CDKN1A
106 | CEBPD
107 | CEBPZ
108 | CEMIP2
109 | CENPE
110 | CEP57
111 | CERK
112 | CETN3
113 | CFLAR
114 | CGRRF1
115 | CHAC1
116 | CHEK2
117 | CHERP
118 | CHIC2
119 | CHMP4A
120 | CHMP6
121 | CHN1
122 | CIAO3
123 | CIAPIN1
124 | CIRBP
125 | CISD1
126 | CLIC4
127 | CLPX
128 | CLSTN1
129 | CLTB
130 | CNDP2
131 | CNOT4
132 | CNPY3
133 | COASY
134 | COG2
135 | COG4
136 | COG7
137 | COPB2
138 | COPS7A
139 | COQ8A
140 | CORO1A
141 | CPNE3
142 | CPSF4
143 | CRAMP1
144 | CREB1
145 | CREG1
146 | CRELD2
147 | CRKL
148 | CRTAP
149 | CRYZ
150 | CSK
151 | CSNK2A2
152 | CSRP1
153 | CTNNAL1
154 | CTNND1
155 | CTSD
156 | CTSL
157 | CTTN
158 | CXCR4
159 | CYB561
160 | CYTH1
161 | DCK
162 | DCTD
163 | DCUN1D4
164 | DDB2
165 | DDIT4
166 | DDR1
167 | DDX10
168 | DDX42
169 | DECR1
170 | DENND2D
171 | DERA
172 | DHDDS
173 | DHRS7
174 | DHX29
175 | DIPK1A
176 | DLD
177 | DMAC2L
178 | DMTF1
179 | DNAJA3
180 | DNAJB1
181 | DNAJB2
182 | DNAJB6
183 | DNAJC15
184 | DNM1
185 | DNM1L
186 | DNMT1
187 | DNTTIP2
188 | DPH2
189 | DRAP1
190 | DSG2
191 | DUSP11
192 | DUSP14
193 | DUSP22
194 | DYNLT3
195 | DYRK3
196 | EAPP
197 | EBNA1BP2
198 | EBP
199 | ECD
200 | ECH1
201 | EDEM1
202 | EFCAB14
203 | EGFR
204 | EIF5
205 | ELAC2
206 | ELAVL1
207 | ELOVL6
208 | ELP1
209 | EML3
210 | ENOPH1
211 | ENOSF1
212 | EPB41L2
213 | EPN2
214 | EPRS1
215 | ERBB2
216 | ETFB
217 | EVL
218 | EXOSC4
219 | EXT1
220 | EZH2
221 | FAH
222 | FAIM
223 | FAM20B
224 | FAS
225 | FASTKD5
226 | FAT1
227 | FBXL12
228 | FBXO21
229 | FBXO7
230 | FCHO1
231 | FDFT1
232 | FEZ2
233 | FHL2
234 | FIS1
235 | FKBP14
236 | FKBP4
237 | FOS
238 | FOXJ3
239 | FOXO4
240 | FPGS
241 | FSD1
242 | FUT1
243 | G3BP1
244 | GAA
245 | GABPB1
246 | GADD45A
247 | GADD45B
248 | GALE
249 | GARRE1
250 | GATA2
251 | GATA3
252 | GDPD5
253 | GET1
254 | GFOD1
255 | GFPT1
256 | GFUS
257 | GLOD4
258 | GLRX
259 | GMNN
260 | GNA11
261 | GNAI2
262 | GNAS
263 | GNB5
264 | GNPDA1
265 | GOLT1B
266 | GPATCH8
267 | GPC1
268 | GPER1
269 | GRB10
270 | GRN
271 | GRWD1
272 | GSTM2
273 | GSTZ1
274 | GTF2A2
275 | GTF2E2
276 | GTPBP8
277 | H2AZ2
278 | H2BC12
279 | H2BC21
280 | HACD3
281 | HADH
282 | HAT1
283 | HDAC2
284 | HDAC6
285 | HDGFL3
286 | HEATR1
287 | HEBP1
288 | HERC6
289 | HERPUD1
290 | HES1
291 | HK1
292 | HMG20B
293 | HMGCR
294 | HMGCS1
295 | HOMER2
296 | HOOK2
297 | HOXA10
298 | HOXA5
299 | HPRT1
300 | HS2ST1
301 | HSD17B10
302 | HSD17B11
303 | HSPA4
304 | HTATSF1
305 | HTRA1
306 | HYOU1
307 | IARS2
308 | ICAM3
309 | ICMT
310 | ID2
311 | IDE
312 | IER3
313 | IFRD2
314 | IGF1R
315 | IGF2R
316 | IGHMBP2
317 | IKBKB
318 | IKBKE
319 | IL13RA1
320 | IL4R
321 | ILK
322 | INPP1
323 | INPP4B
324 | INSIG1
325 | INTS3
326 | IPO13
327 | IQGAP1
328 | ISOC1
329 | ITFG1
330 | ITGAE
331 | ITGB1BP1
332 | ITGB5
333 | JADE2
334 | JMJD6
335 | JUN
336 | KAT6A
337 | KAT6B
338 | KCNK1
339 | KCTD5
340 | KDELR2
341 | KDM3A
342 | KDM5A
343 | KDM5B
344 | KEAP1
345 | KHDC4
346 | KIAA0753
347 | KIF14
348 | KIF20A
349 | KIF2C
350 | KIF5C
351 | KIFBP
352 | KIT
353 | KLHDC2
354 | KLHL21
355 | KLHL9
356 | KLK8
357 | KTN1
358 | LAGE3
359 | LAMA3
360 | LAP3
361 | LBR
362 | LGALS8
363 | LGMN
364 | LIG1
365 | LIPA
366 | LOXL1
367 | LPAR2
368 | LPGAT1
369 | LRP10
370 | LRPAP1
371 | LRRC41
372 | LSM5
373 | LSM6
374 | LSR
375 | LYN
376 | LYPLA1
377 | LYRM1
378 | MACF1
379 | MALT1
380 | MAMLD1
381 | MAN2B1
382 | MAP2K5
383 | MAP3K4
384 | MAP4K4
385 | MAP7
386 | MAPK13
387 | MAPK1IP1L
388 | MAPK9
389 | MAPKAPK3
390 | MAPKAPK5
391 | MAST2
392 | MBNL1
393 | MBNL2
394 | MBOAT7
395 | MBTPS1
396 | MCM3
397 | MCOLN1
398 | MCUR1
399 | ME2
400 | MELK
401 | MEST
402 | METRN
403 | MFSD10
404 | MICALL1
405 | MINDY1
406 | MLEC
407 | MLLT11
408 | MPC2
409 | MPZL1
410 | MRPL12
411 | MRPL19
412 | MRPS16
413 | MRPS2
414 | MSH6
415 | MSRA
416 | MTA1
417 | MTERF3
418 | MTF2
419 | MTFR1
420 | MTHFD2
421 | MVP
422 | MYBL2
423 | MYC
424 | MYCBP
425 | MYCBP2
426 | MYO10
427 | NCAPD2
428 | NCOA3
429 | NENF
430 | NET1
431 | NFE2L2
432 | NFIL3
433 | NGRN
434 | NIPSNAP1
435 | NISCH
436 | NIT1
437 | NMT1
438 | NNT
439 | NOL3
440 | NOLC1
441 | NOSIP
442 | NPC1
443 | NPDC1
444 | NPEPL1
445 | NPRL2
446 | NR1H2
447 | NR2F6
448 | NR3C1
449 | NRIP1
450 | NSDHL
451 | NT5DC2
452 | NUCB2
453 | NUDCD3
454 | NUDT9
455 | NUP133
456 | NUP62
457 | NUP85
458 | NUP88
459 | NUP93
460 | NUSAP1
461 | NVL
462 | ORC1
463 | OXA1L
464 | OXCT1
465 | OXSR1
466 | P4HA2
467 | P4HTM
468 | PACSIN3
469 | PAF1
470 | PAFAH1B1
471 | PAFAH1B3
472 | PAICS
473 | PAK4
474 | PAN2
475 | PARP2
476 | PAX8
477 | PCBD1
478 | PCCB
479 | PCK2
480 | PCM1
481 | PCMT1
482 | PDHX
483 | PDIA5
484 | PDLIM1
485 | PDS5A
486 | PECR
487 | PEX11A
488 | PGM1
489 | PGRMC1
490 | PHGDH
491 | PHKA1
492 | PHKB
493 | PHKG2
494 | PIGB
495 | PIH1D1
496 | PIK3C2B
497 | PIN1
498 | PIP4K2B
499 | PKIG
500 | PLA2G15
501 | PLEKHJ1
502 | PLEKHM1
503 | PLOD3
504 | PLP2
505 | PLS1
506 | PLSCR1
507 | PLSCR3
508 | PMAIP1
509 | PMM2
510 | PNKP
511 | POLB
512 | POLD1
513 | POLD4
514 | POLE2
515 | POLG2
516 | POLR1C
517 | POLR2I
518 | POLR2K
519 | POP4
520 | PPARG
521 | PPIC
522 | PPIE
523 | PPOX
524 | PPP2R3C
525 | PPP2R5A
526 | PPP2R5E
527 | PRAF2
528 | PRCP
529 | PRKACA
530 | PRKCD
531 | PRPF4
532 | PRR15L
533 | PRR7
534 | PRSS23
535 | PRUNE1
536 | PSIP1
537 | PSMD10
538 | PSMG1
539 | PSRC1
540 | PTK2
541 | PTPN1
542 | PTPN12
543 | PTPRF
544 | PTPRK
545 | PUF60
546 | PWP1
547 | PXMP2
548 | PXN
549 | PYCR1
550 | PYGL
551 | RAB11FIP2
552 | RAB21
553 | RAB27A
554 | RAB31
555 | RAB4A
556 | RAD51C
557 | RAD9A
558 | RAE1
559 | RAI14
560 | RAP1GAP
561 | RBKS
562 | RBM15B
563 | RBM34
564 | RBM6
565 | REEP5
566 | RELB
567 | RFC2
568 | RFC5
569 | RFNG
570 | RFX5
571 | RGS2
572 | RNF167
573 | RNH1
574 | RNMT
575 | RNPS1
576 | RPA1
577 | RPA2
578 | RPA3
579 | RPIA
580 | RPL39L
581 | RPN1
582 | RPP38
583 | RPS6KA1
584 | RRAGA
585 | RRP12
586 | RRP1B
587 | RRP8
588 | RRS1
589 | RSU1
590 | RTN2
591 | RUVBL1
592 | RXYLT1
593 | S100A13
594 | S100A4
595 | SACM1L
596 | SCAND1
597 | SCARB1
598 | SCCPDH
599 | SCP2
600 | SCRN1
601 | SCYL3
602 | SDHB
603 | SENP6
604 | SESN1
605 | SFN
606 | SGCB
607 | SH3BP5
608 | SHB
609 | SKIC2
610 | SKIC8
611 | SLC11A2
612 | SLC1A4
613 | SLC25A13
614 | SLC25A14
615 | SLC25A4
616 | SLC25A46
617 | SLC27A3
618 | SLC2A6
619 | SLC35A1
620 | SLC35A3
621 | SLC35B1
622 | SLC35F2
623 | SLC37A4
624 | SLC5A6
625 | SMAD3
626 | SMARCA4
627 | SMARCC1
628 | SMARCD2
629 | SMC1A
630 | SMC3
631 | SMC4
632 | SMNDC1
633 | SNX11
634 | SNX13
635 | SNX6
636 | SNX7
637 | SOCS2
638 | SORBS3
639 | SOX4
640 | SPAG4
641 | SPAG7
642 | SPDEF
643 | SPEN
644 | SPR
645 | SPRED2
646 | SPTLC2
647 | SQOR
648 | SSBP2
649 | ST3GAL5
650 | ST6GALNAC2
651 | ST7
652 | STAMBP
653 | STAP2
654 | STAT1
655 | STIMATE
656 | STK10
657 | STK25
658 | STMN1
659 | STUB1
660 | STX1A
661 | STX4
662 | STXBP1
663 | STXBP2
664 | SUPV3L1
665 | SYNE2
666 | SYNGR3
667 | SYPL1
668 | TARBP1
669 | TATDN2
670 | TBC1D31
671 | TBC1D9B
672 | TBPL1
673 | TBX2
674 | TBXA2R
675 | TCEA2
676 | TCEAL4
677 | TCERG1
678 | TCFL5
679 | TCTA
680 | TCTN1
681 | TENT4A
682 | TERF2IP
683 | TES
684 | TESK1
685 | TEX10
686 | TFAP2A
687 | THAP11
688 | TIAM1
689 | TIMELESS
690 | TIMM17B
691 | TIMM22
692 | TIMM9
693 | TIMP2
694 | TIPARP
695 | TJP1
696 | TLCD3A
697 | TLE1
698 | TLK2
699 | TM9SF2
700 | TM9SF3
701 | TMCO1
702 | TMED10
703 | TMEM109
704 | TMEM50A
705 | TMEM97
706 | TNFRSF21
707 | TNIP1
708 | TOMM34
709 | TOMM70
710 | TOP2A
711 | TOPBP1
712 | TOR1A
713 | TP53BP1
714 | TP53BP2
715 | TPD52L2
716 | TPM1
717 | TRAK2
718 | TRAM2
719 | TRAP1
720 | TRAPPC3
721 | TRAPPC6A
722 | TRIB1
723 | TRIB3
724 | TRIM13
725 | TRIM2
726 | TSC22D3
727 | TSEN2
728 | TSKU
729 | TSPAN3
730 | TSPAN4
731 | TSPAN6
732 | TUBB6
733 | TWF2
734 | TXLNA
735 | TXNDC9
736 | TXNL4B
737 | TXNRD1
738 | UBE2A
739 | UBE2C
740 | UBE2J1
741 | UBE2L6
742 | UBE3B
743 | UBE3C
744 | UBQLN2
745 | UBR7
746 | UFM1
747 | UGDH
748 | USP1
749 | USP14
750 | USP22
751 | USP6NL
752 | USP7
753 | UTP14A
754 | VAPB
755 | VAT1
756 | VAV3
757 | VDAC1
758 | VGLL4
759 | VPS28
760 | VPS72
761 | WASF3
762 | WASHC4
763 | WASHC5
764 | WDR7
765 | WDTC1
766 | WFS1
767 | WIPF2
768 | XBP1
769 | XPNPEP1
770 | XPO7
771 | YKT6
772 | YME1L1
773 | YTHDF1
774 | ZDHHC6
775 | ZFP36
776 | ZMIZ1
777 | ZMYM2
778 | ZNF131
779 | ZNF274
780 | ZNF318
781 | ZNF395
782 | ZNF451
783 | ZNF586
784 | ZNF589
785 | ZW10
786 |
--------------------------------------------------------------------------------
/GO_terms_search/source/top_100_luad.txt:
--------------------------------------------------------------------------------
1 | TPM1
2 | CDKN1A
3 | SERPINE1
4 | COL4A1
5 | RPIA
6 | BIRC5
7 | EDN1
8 | GADD45A
9 | CCNA2
10 | POLD1
11 | FHL2
12 | SLC2A6
13 | CTSL
14 | AURKA
15 | ATF1
16 | YKT6
17 | JUN
18 | DNAJB2
19 | ABHD4
20 | MTHFD2
21 | AURKB
22 | MMP1
23 | TOP2A
24 | UBE2C
25 | PAFAH1B3
26 | MRPL12
27 | HDAC2
28 | CTSD
29 | TSEN2
30 | SCARB1
31 | LBR
32 | POLE2
33 | PAICS
34 | PRSS23
35 | RGS2
36 | IER3
37 | HSPB1
38 | PTPN12
39 | CHEK2
40 | ARHGAP1
41 | ADGRG1
42 | MCM3
43 | POP4
44 | PXN
45 | HMOX1
46 | USP1
47 | RUVBL1
48 | DDX10
49 | DUSP6
50 | CCL2
51 | NUP88
52 | CDC25A
53 | TXNRD1
54 | HMGA2
55 | MYL9
56 | DUSP4
57 | CAT
58 | MVP
59 | SQSTM1
60 | TIMELESS
61 | DCK
62 | GPC1
63 | NIPSNAP1
64 | COL1A1
65 | C5
66 | NET1
67 | MPC2
68 | TIMP2
69 | TMEM97
70 | RAE1
71 | RPL39L
72 | EFCAB14
73 | MAN2B1
74 | RAI14
75 | ILK
76 | ABCB6
77 | TIPARP
78 | RNPS1
79 | PPIC
80 | CEBPD
81 | CCND3
82 | EZH2
83 | SOX4
84 | MYBL2
85 | SLC35A1
86 | TMEM109
87 | RSU1
88 | DAG1
89 | GRB10
90 | INPP1
91 | STAT1
92 | RRP12
93 | CREG1
94 | TES
95 | PDGFA
96 | SMC4
97 | ERBB2
98 | EIF4EBP1
99 | DPH2
100 | UBE2L6
101 |
--------------------------------------------------------------------------------
/GO_terms_search/source/top_59_atleast_topIn3.txt:
--------------------------------------------------------------------------------
1 | ATP1B1
2 | BCL7B
3 | BIRC5
4 | BUB1B
5 | CCNA2
6 | CDK4
7 | CISD1
8 | CLIC4
9 | COASY
10 | CPNE3
11 | DAG1
12 | DCK
13 | EBP
14 | EPRS1
15 | ERBB2
16 | FHL2
17 | GLRX
18 | GNPDA1
19 | HMOX1
20 | IER3
21 | IGF2R
22 | LBR
23 | LIG1
24 | MCM3
25 | MPZL1
26 | MRPL19
27 | MTHFD2
28 | MYC
29 | NFKBIB
30 | NIPSNAP1
31 | NPC1
32 | PAFAH1B3
33 | PAICS
34 | PAK4
35 | PSME1
36 | PSRC1
37 | RELB
38 | RPA1
39 | SACM1L
40 | SCARB1
41 | SERPINE1
42 | SESN1
43 | SLC25A4
44 | SMAD3
45 | SMC4
46 | SPP1
47 | STMN1
48 | STX1A
49 | TERF2IP
50 | TMEM50A
51 | TOP2A
52 | TPM1
53 | TRIB1
54 | TSC22D3
55 | TSKU
56 | TXNRD1
57 | UBE2C
58 | XBP1
59 | YKT6
60 |
--------------------------------------------------------------------------------
/GO_terms_search/source/union_geneSymbols_1170.txt:
--------------------------------------------------------------------------------
1 | AARS1
2 | ABCB6
3 | ABCC5
4 | ABCF1
5 | ABCF3
6 | ABHD4
7 | ABHD6
8 | ABL1
9 | ACAA1
10 | ACAT2
11 | ACBD3
12 | ACD
13 | ACLY
14 | ACOT9
15 | ADAM10
16 | ADAT1
17 | ADGRE2
18 | ADGRG1
19 | ADH5
20 | ADI1
21 | ADO
22 | ADRB2
23 | AGER
24 | AGL
25 | AKAP8
26 | AKAP8L
27 | AKR7A2
28 | AKT1
29 | ALAS1
30 | ALDH7A1
31 | ALDOA
32 | ALDOC
33 | AMDHD2
34 | ANKRD10
35 | ANO10
36 | ANXA7
37 | APBB2
38 | APOE
39 | APP
40 | APPBP2
41 | ARFIP2
42 | ARHGAP1
43 | ARHGEF12
44 | ARHGEF2
45 | ARID4B
46 | ARID5B
47 | ARL4C
48 | ARNT2
49 | ARPP19
50 | ASAH1
51 | ASCC3
52 | ATF1
53 | ATF5
54 | ATF6
55 | ATG3
56 | ATMIN
57 | ATP11B
58 | ATP1B1
59 | ATP2C1
60 | ATP6V0B
61 | ATP6V1D
62 | AURKA
63 | AURKB
64 | AXIN1
65 | B3GNT2
66 | BACE2
67 | BAD
68 | BAG3
69 | BAMBI
70 | BAX
71 | BCL2
72 | BCL7B
73 | BDH1
74 | BECN1
75 | BHLHE40
76 | BID
77 | BIRC2
78 | BIRC5
79 | BLCAP
80 | BLMH
81 | BLTP2
82 | BLVRA
83 | BMP4
84 | BNIP3
85 | BNIP3L
86 | BPHL
87 | BRCA1
88 | BTK
89 | BUB1B
90 | BZW2
91 | C2CD2
92 | C2CD2L
93 | C2CD5
94 | C5
95 | CAB39
96 | CALM1
97 | CALU
98 | CAMSAP2
99 | CANT1
100 | CAPN1
101 | CARMIL1
102 | CASC3
103 | CASK
104 | CASP10
105 | CASP2
106 | CASP3
107 | CASP7
108 | CAST
109 | CAT
110 | CBLB
111 | CBR1
112 | CBR3
113 | CCDC85B
114 | CCDC86
115 | CCDC92
116 | CCL2
117 | CCNA1
118 | CCNA2
119 | CCNB1
120 | CCNB2
121 | CCND1
122 | CCND3
123 | CCNE2
124 | CCNF
125 | CCNH
126 | CCP110
127 | CD320
128 | CD40
129 | CD44
130 | CD58
131 | CDC20
132 | CDC25A
133 | CDC25B
134 | CDC42
135 | CDC45
136 | CDCA4
137 | CDH3
138 | CDK19
139 | CDK2
140 | CDK4
141 | CDK5R1
142 | CDK6
143 | CDK7
144 | CDKN1A
145 | CDKN1B
146 | CDKN2A
147 | CEBPA
148 | CEBPD
149 | CEBPZ
150 | CEMIP2
151 | CENPE
152 | CEP57
153 | CERK
154 | CETN3
155 | CFLAR
156 | CGRRF1
157 | CHAC1
158 | CHEK1
159 | CHEK2
160 | CHERP
161 | CHIC2
162 | CHMP4A
163 | CHMP6
164 | CHN1
165 | CIAO3
166 | CIAPIN1
167 | CIRBP
168 | CISD1
169 | CLIC4
170 | CLPX
171 | CLSTN1
172 | CLTB
173 | CLTC
174 | CNDP2
175 | CNOT4
176 | CNPY3
177 | COASY
178 | COG2
179 | COG4
180 | COG7
181 | COL1A1
182 | COL4A1
183 | COPB2
184 | COPS7A
185 | COQ8A
186 | CORO1A
187 | CPNE3
188 | CPSF4
189 | CRAMP1
190 | CREB1
191 | CREG1
192 | CRELD2
193 | CRK
194 | CRKL
195 | CRTAP
196 | CRYZ
197 | CSK
198 | CSNK1A1
199 | CSNK1E
200 | CSNK2A2
201 | CSRP1
202 | CTNNAL1
203 | CTNND1
204 | CTSD
205 | CTSL
206 | CTTN
207 | CXCL2
208 | CXCR4
209 | CYB561
210 | CYCS
211 | CYTH1
212 | DAG1
213 | DAXX
214 | DCK
215 | DCTD
216 | DCUN1D4
217 | DDB2
218 | DDIT4
219 | DDR1
220 | DDX10
221 | DDX42
222 | DECR1
223 | DENND2D
224 | DERA
225 | DFFA
226 | DFFB
227 | DHDDS
228 | DHRS7
229 | DHX29
230 | DIPK1A
231 | DLD
232 | DMAC2L
233 | DMTF1
234 | DNAJA3
235 | DNAJB1
236 | DNAJB2
237 | DNAJB6
238 | DNAJC15
239 | DNM1
240 | DNM1L
241 | DNMT1
242 | DNMT3A
243 | DNTTIP2
244 | DPH2
245 | DRAP1
246 | DSG2
247 | DUSP11
248 | DUSP14
249 | DUSP22
250 | DUSP3
251 | DUSP4
252 | DUSP6
253 | DYNLT3
254 | DYRK3
255 | E2F2
256 | EAPP
257 | EBNA1BP2
258 | EBP
259 | ECD
260 | ECH1
261 | EDEM1
262 | EDN1
263 | EED
264 | EFCAB14
265 | EGF
266 | EGFR
267 | EGR1
268 | EIF4EBP1
269 | EIF4G1
270 | EIF5
271 | ELAC2
272 | ELAVL1
273 | ELOVL6
274 | ELP1
275 | EML3
276 | ENOPH1
277 | ENOSF1
278 | EPB41L2
279 | EPHA3
280 | EPHB2
281 | EPN2
282 | EPRS1
283 | ERBB2
284 | ERBB3
285 | ERO1A
286 | ETFB
287 | ETS1
288 | ETV1
289 | EVL
290 | EXOSC4
291 | EXT1
292 | EZH2
293 | FAH
294 | FAIM
295 | FAM20B
296 | FAS
297 | FASTKD5
298 | FAT1
299 | FBXL12
300 | FBXO11
301 | FBXO21
302 | FBXO7
303 | FCHO1
304 | FDFT1
305 | FEZ2
306 | FGFR2
307 | FGFR4
308 | FHL2
309 | FIS1
310 | FKBP14
311 | FKBP4
312 | FOS
313 | FOSL1
314 | FOXJ3
315 | FOXO3
316 | FOXO4
317 | FPGS
318 | FRS2
319 | FSD1
320 | FUT1
321 | FYN
322 | FZD1
323 | FZD7
324 | G3BP1
325 | GAA
326 | GABPB1
327 | GADD45A
328 | GADD45B
329 | GALE
330 | GAPDH
331 | GARRE1
332 | GATA2
333 | GATA3
334 | GDPD5
335 | GET1
336 | GFOD1
337 | GFPT1
338 | GFUS
339 | GHR
340 | GLI2
341 | GLOD4
342 | GLRX
343 | GMNN
344 | GNA11
345 | GNA15
346 | GNAI1
347 | GNAI2
348 | GNAS
349 | GNB5
350 | GNPDA1
351 | GOLT1B
352 | GPATCH8
353 | GPC1
354 | GPER1
355 | GRB10
356 | GRB7
357 | GRN
358 | GRWD1
359 | GSTM2
360 | GSTZ1
361 | GTF2A2
362 | GTF2E2
363 | GTPBP8
364 | H2AZ2
365 | H2BC12
366 | H2BC21
367 | HACD3
368 | HADH
369 | HAT1
370 | HDAC2
371 | HDAC6
372 | HDGFL3
373 | HEATR1
374 | HEBP1
375 | HERC6
376 | HERPUD1
377 | HES1
378 | HIF1A
379 | HK1
380 | HLA-DMA
381 | HLA-DRA
382 | HMG20B
383 | HMGA2
384 | HMGCR
385 | HMGCS1
386 | HMOX1
387 | HOMER2
388 | HOOK2
389 | HOXA10
390 | HOXA5
391 | HPRT1
392 | HS2ST1
393 | HSD17B10
394 | HSD17B11
395 | HSPA1A
396 | HSPA4
397 | HSPA8
398 | HSPB1
399 | HSPD1
400 | HTATSF1
401 | HTRA1
402 | HYOU1
403 | IARS2
404 | ICAM1
405 | ICAM3
406 | ICMT
407 | ID2
408 | IDE
409 | IER3
410 | IFNAR1
411 | IFRD2
412 | IGF1R
413 | IGF2BP2
414 | IGF2R
415 | IGFBP3
416 | IGHMBP2
417 | IKBKB
418 | IKBKE
419 | IKZF1
420 | IL13RA1
421 | IL1B
422 | IL4R
423 | ILK
424 | INPP1
425 | INPP4B
426 | INSIG1
427 | INTS3
428 | IPO13
429 | IQGAP1
430 | ISOC1
431 | ITFG1
432 | ITGAE
433 | ITGB1BP1
434 | ITGB5
435 | JADE2
436 | JMJD6
437 | JUN
438 | KAT6A
439 | KAT6B
440 | KCNK1
441 | KCTD5
442 | KDELR2
443 | KDM3A
444 | KDM5A
445 | KDM5B
446 | KEAP1
447 | KHDC4
448 | KIAA0753
449 | KIF14
450 | KIF20A
451 | KIF2C
452 | KIF5C
453 | KIFBP
454 | KIT
455 | KLHDC2
456 | KLHL21
457 | KLHL9
458 | KLK8
459 | KTN1
460 | LAGE3
461 | LAMA3
462 | LAP3
463 | LBR
464 | LGALS8
465 | LGMN
466 | LIG1
467 | LIPA
468 | LOXL1
469 | LPAR2
470 | LPGAT1
471 | LRP10
472 | LRPAP1
473 | LRRC41
474 | LSM5
475 | LSM6
476 | LSR
477 | LYN
478 | LYPLA1
479 | LYRM1
480 | MACF1
481 | MALT1
482 | MAMLD1
483 | MAN2B1
484 | MAP2K5
485 | MAP3K4
486 | MAP4K4
487 | MAP7
488 | MAPK13
489 | MAPK1IP1L
490 | MAPK9
491 | MAPKAPK2
492 | MAPKAPK3
493 | MAPKAPK5
494 | MAST2
495 | MAT2A
496 | MBNL1
497 | MBNL2
498 | MBOAT7
499 | MBTPS1
500 | MCM3
501 | MCOLN1
502 | MCUR1
503 | ME2
504 | MEF2C
505 | MELK
506 | MEST
507 | METRN
508 | MFSD10
509 | MICALL1
510 | MIF
511 | MINDY1
512 | MKNK1
513 | MLEC
514 | MLLT11
515 | MMP1
516 | MMP2
517 | MNAT1
518 | MPC2
519 | MPZL1
520 | MRPL12
521 | MRPL19
522 | MRPS16
523 | MRPS2
524 | MSH6
525 | MSRA
526 | MTA1
527 | MTERF3
528 | MTF2
529 | MTFR1
530 | MTHFD2
531 | MUC1
532 | MVP
533 | MYBL2
534 | MYC
535 | MYCBP
536 | MYCBP2
537 | MYL9
538 | MYLK
539 | MYO10
540 | NCAPD2
541 | NCK1
542 | NCK2
543 | NCOA3
544 | NENF
545 | NET1
546 | NFATC3
547 | NFATC4
548 | NFE2L2
549 | NFIL3
550 | NFKB2
551 | NFKBIA
552 | NFKBIB
553 | NFKBIE
554 | NGRN
555 | NIPSNAP1
556 | NISCH
557 | NIT1
558 | NMT1
559 | NNT
560 | NOL3
561 | NOLC1
562 | NOS3
563 | NOSIP
564 | NOTCH1
565 | NPC1
566 | NPDC1
567 | NPEPL1
568 | NPRL2
569 | NR1H2
570 | NR2F6
571 | NR3C1
572 | NRAS
573 | NRIP1
574 | NSDHL
575 | NT5DC2
576 | NUCB2
577 | NUDCD3
578 | NUDT9
579 | NUP133
580 | NUP62
581 | NUP85
582 | NUP88
583 | NUP93
584 | NUSAP1
585 | NVL
586 | ORC1
587 | OXA1L
588 | OXCT1
589 | OXSR1
590 | P4HA2
591 | P4HTM
592 | PACSIN3
593 | PAF1
594 | PAFAH1B1
595 | PAFAH1B3
596 | PAICS
597 | PAK1
598 | PAK4
599 | PAK6
600 | PAN2
601 | PARP1
602 | PARP2
603 | PAX8
604 | PCBD1
605 | PCCB
606 | PCK2
607 | PCM1
608 | PCMT1
609 | PCNA
610 | PDGFA
611 | PDHX
612 | PDIA5
613 | PDLIM1
614 | PDS5A
615 | PECR
616 | PEX11A
617 | PFKL
618 | PGAM1
619 | PGM1
620 | PGRMC1
621 | PHGDH
622 | PHKA1
623 | PHKB
624 | PHKG2
625 | PIGB
626 | PIH1D1
627 | PIK3C2B
628 | PIK3C3
629 | PIK3CA
630 | PIK3R3
631 | PIK3R4
632 | PIN1
633 | PIP4K2B
634 | PKIG
635 | PLA2G15
636 | PLA2G4A
637 | PLCB3
638 | PLEKHJ1
639 | PLEKHM1
640 | PLK1
641 | PLOD3
642 | PLP2
643 | PLS1
644 | PLSCR1
645 | PLSCR3
646 | PMAIP1
647 | PMM2
648 | PNKP
649 | POLB
650 | POLD1
651 | POLD4
652 | POLE2
653 | POLG2
654 | POLR1C
655 | POLR2I
656 | POLR2K
657 | POP4
658 | PPARD
659 | PPARG
660 | PPIC
661 | PPIE
662 | PPOX
663 | PPP1R13B
664 | PPP2R3C
665 | PPP2R5A
666 | PPP2R5E
667 | PRAF2
668 | PRCP
669 | PRKACA
670 | PRKAG2
671 | PRKCD
672 | PRKCH
673 | PRKCQ
674 | PRKX
675 | PROS1
676 | PRPF4
677 | PRR15L
678 | PRR7
679 | PRSS23
680 | PRUNE1
681 | PSIP1
682 | PSMB10
683 | PSMB8
684 | PSMD10
685 | PSMD2
686 | PSMD4
687 | PSMD9
688 | PSME1
689 | PSME2
690 | PSMF1
691 | PSMG1
692 | PSRC1
693 | PTGS2
694 | PTK2
695 | PTK2B
696 | PTPN1
697 | PTPN12
698 | PTPN6
699 | PTPRC
700 | PTPRF
701 | PTPRK
702 | PUF60
703 | PWP1
704 | PXMP2
705 | PXN
706 | PYCR1
707 | PYGL
708 | RAB11FIP2
709 | RAB21
710 | RAB27A
711 | RAB31
712 | RAB4A
713 | RAC2
714 | RAD51C
715 | RAD9A
716 | RAE1
717 | RAI14
718 | RALA
719 | RALB
720 | RALGDS
721 | RAP1GAP
722 | RASA1
723 | RB1
724 | RBKS
725 | RBM15B
726 | RBM34
727 | RBM6
728 | REEP5
729 | RELB
730 | RFC2
731 | RFC5
732 | RFNG
733 | RFX5
734 | RGS2
735 | RHEB
736 | RHOA
737 | RHOV
738 | RNF167
739 | RNH1
740 | RNMT
741 | RNPS1
742 | RPA1
743 | RPA2
744 | RPA3
745 | RPIA
746 | RPL39L
747 | RPN1
748 | RPP38
749 | RPS5
750 | RPS6
751 | RPS6KA1
752 | RRAGA
753 | RRP12
754 | RRP1B
755 | RRP8
756 | RRS1
757 | RSU1
758 | RTN2
759 | RUVBL1
760 | RXYLT1
761 | S100A13
762 | S100A4
763 | SACM1L
764 | SATB1
765 | SCAND1
766 | SCARB1
767 | SCCPDH
768 | SCP2
769 | SCRN1
770 | SCYL3
771 | SDHB
772 | SENP6
773 | SERPINE1
774 | SESN1
775 | SFN
776 | SGCB
777 | SH3BP5
778 | SHB
779 | SHC1
780 | SIRT3
781 | SKIC2
782 | SKIC8
783 | SKP1
784 | SLC11A2
785 | SLC1A4
786 | SLC25A13
787 | SLC25A14
788 | SLC25A4
789 | SLC25A46
790 | SLC27A3
791 | SLC2A6
792 | SLC35A1
793 | SLC35A3
794 | SLC35B1
795 | SLC35F2
796 | SLC37A4
797 | SLC5A6
798 | SMAD3
799 | SMARCA4
800 | SMARCC1
801 | SMARCD2
802 | SMC1A
803 | SMC3
804 | SMC4
805 | SMNDC1
806 | SNAP25
807 | SNCA
808 | SNX11
809 | SNX13
810 | SNX6
811 | SNX7
812 | SOCS2
813 | SORBS3
814 | SOX2
815 | SOX4
816 | SPAG4
817 | SPAG7
818 | SPDEF
819 | SPEN
820 | SPP1
821 | SPR
822 | SPRED2
823 | SPTAN1
824 | SPTLC2
825 | SQOR
826 | SQSTM1
827 | SRC
828 | SSBP2
829 | ST3GAL5
830 | ST6GALNAC2
831 | ST7
832 | STAMBP
833 | STAP2
834 | STAT1
835 | STAT3
836 | STAT5B
837 | STIMATE
838 | STK10
839 | STK25
840 | STMN1
841 | STUB1
842 | STX1A
843 | STX4
844 | STXBP1
845 | STXBP2
846 | SUPV3L1
847 | SUV39H1
848 | SUZ12
849 | SYK
850 | SYNE2
851 | SYNGR3
852 | SYPL1
853 | TARBP1
854 | TATDN2
855 | TBC1D31
856 | TBC1D9B
857 | TBP
858 | TBPL1
859 | TBX2
860 | TBXA2R
861 | TCEA2
862 | TCEAL4
863 | TCERG1
864 | TCFL5
865 | TCTA
866 | TCTN1
867 | TENT4A
868 | TERF2IP
869 | TERT
870 | TES
871 | TESK1
872 | TEX10
873 | TFAP2A
874 | TFDP1
875 | TGFB3
876 | TGFBR2
877 | THAP11
878 | TIAM1
879 | TICAM1
880 | TIMELESS
881 | TIMM17B
882 | TIMM22
883 | TIMM9
884 | TIMP2
885 | TIPARP
886 | TJP1
887 | TLCD3A
888 | TLE1
889 | TLK2
890 | TLR4
891 | TM9SF2
892 | TM9SF3
893 | TMCO1
894 | TMED10
895 | TMEM109
896 | TMEM50A
897 | TMEM97
898 | TNFRSF21
899 | TNIP1
900 | TOMM34
901 | TOMM70
902 | TOP2A
903 | TOPBP1
904 | TOR1A
905 | TP53
906 | TP53BP1
907 | TP53BP2
908 | TPD52L2
909 | TPM1
910 | TRAK2
911 | TRAM2
912 | TRAP1
913 | TRAPPC3
914 | TRAPPC6A
915 | TRIB1
916 | TRIB3
917 | TRIM13
918 | TRIM2
919 | TSC22D3
920 | TSEN2
921 | TSKU
922 | TSPAN3
923 | TSPAN4
924 | TSPAN6
925 | TUBB6
926 | TWF2
927 | TXLNA
928 | TXNDC9
929 | TXNL4B
930 | TXNRD1
931 | UBE2A
932 | UBE2C
933 | UBE2J1
934 | UBE2L6
935 | UBE3B
936 | UBE3C
937 | UBQLN2
938 | UBR7
939 | UFM1
940 | UGDH
941 | USP1
942 | USP14
943 | USP22
944 | USP6NL
945 | USP7
946 | UTP14A
947 | VAPB
948 | VAT1
949 | VAV3
950 | VDAC1
951 | VGLL4
952 | VPS28
953 | VPS72
954 | WASF3
955 | WASHC4
956 | WASHC5
957 | WDR7
958 | WDTC1
959 | WFS1
960 | WIPF2
961 | XBP1
962 | XPNPEP1
963 | XPO7
964 | YKT6
965 | YME1L1
966 | YTHDF1
967 | ZDHHC6
968 | ZFP36
969 | ZMIZ1
970 | ZMYM2
971 | ZNF131
972 | ZNF274
973 | ZNF318
974 | ZNF395
975 | ZNF451
976 | ZNF586
977 | ZNF589
978 | ZW10
979 | ABAT
980 | ACSL1
981 | ADCY9
982 | ADGRA3
983 | AFF1
984 | AKAP1
985 | ALMS1
986 | ANKRD49
987 | AREL1
988 | ARHGEF5
989 | ASAP2
990 | ASL
991 | ASMTL
992 | BAZ1B
993 | BAZ2B
994 | BCKDHA
995 | BTBD3
996 | BTG2
997 | BTG3
998 | CA12
999 | CABIN1
1000 | CD14
1001 | CD19
1002 | CEP55
1003 | CHCHD7
1004 | CISH
1005 | CKB
1006 | CLASRP
1007 | CLCN3
1008 | CMPK1
1009 | COL4A5
1010 | COQ3
1011 | CREB3L2
1012 | CTCF
1013 | CTNS
1014 | CTSV
1015 | DALRD3
1016 | DDX49
1017 | DHPS
1018 | DHX8
1019 | DLGAP5
1020 | DNAJB12
1021 | DOK4
1022 | DSE
1023 | DTNA
1024 | ECHDC1
1025 | EFNB3
1026 | EIF1B
1027 | ENTPD6
1028 | EPB41L4B
1029 | EPHA2
1030 | EPHB4
1031 | ERCC1
1032 | ERCC5
1033 | ERCC6L
1034 | ERLIN1
1035 | ETFA
1036 | F12
1037 | FADD
1038 | FAM162A
1039 | FAM3C
1040 | FANCA
1041 | FANCL
1042 | FBRS
1043 | FDX1
1044 | FEZ1
1045 | FTSJ1
1046 | FUZ
1047 | FZD5
1048 | GGA2
1049 | GM2A
1050 | GMDS
1051 | GOLIM4
1052 | GPRC5C
1053 | GRHPR
1054 | GYS1
1055 | H2BC5
1056 | HACD1
1057 | HADHB
1058 | HBE1
1059 | HDAC4
1060 | HLF
1061 | HMGCL
1062 | IFIT5
1063 | IL1RAP
1064 | IMPA2
1065 | KDELR3
1066 | KHDC1
1067 | LAMTOR3
1068 | LASP1
1069 | LHPP
1070 | LIMK2
1071 | LRRC40
1072 | LYPD3
1073 | MAD2L1BP
1074 | MAN2A2
1075 | MARK4
1076 | MEGF9
1077 | MGST2
1078 | MPDZ
1079 | NAA50
1080 | NIPBL
1081 | NSFL1C
1082 | PANK2
1083 | PARL
1084 | PARN
1085 | PDS5B
1086 | PEPD
1087 | PER2
1088 | PEX19
1089 | PHF2
1090 | PJA1
1091 | PKD1
1092 | PLPP1
1093 | POLR3B
1094 | PPIF
1095 | PPP2R3A
1096 | PPP2R5B
1097 | PPP3CA
1098 | PPP4R1
1099 | PSMD5
1100 | PTEN
1101 | QRSL1
1102 | RAB1B
1103 | RAB3GAP1
1104 | RABGGTA
1105 | RAD23B
1106 | RAP1A
1107 | RARA
1108 | RHOBTB1
1109 | RPF1
1110 | RTCA
1111 | SAP18
1112 | SCAF8
1113 | SEC14L1
1114 | SEC24B
1115 | SEC24C
1116 | SEC61A2
1117 | SENP5
1118 | SERTAD3
1119 | SETD1B
1120 | SETDB1
1121 | SFMBT1
1122 | SGK1
1123 | SIK1
1124 | SLC16A6
1125 | SLC2A1
1126 | SLC36A1
1127 | SMG7
1128 | SMYD3
1129 | SNRPD1
1130 | SNRPF
1131 | SNX2
1132 | SRF
1133 | SRPRB
1134 | SRRM1
1135 | SRSF8
1136 | STAT6
1137 | SWAP70
1138 | TARS1
1139 | TBCB
1140 | TCIRG1
1141 | TGDS
1142 | TLE3
1143 | TMEM127
1144 | TMEM131L
1145 | TNFAIP1
1146 | TOMM22
1147 | TP53TG1
1148 | TRIP10
1149 | TSN
1150 | TTF1
1151 | UBE4A
1152 | UGP2
1153 | USP20
1154 | UVRAG
1155 | VAMP7
1156 | VPS26A
1157 | WHRN
1158 | WRN
1159 | WWOX
1160 | YBX3
1161 | ZBTB14
1162 | ZBTB24
1163 | ZBTB5
1164 | ZC3H4
1165 | ZER1
1166 | ZGPAT
1167 | ZNF629
1168 | ZNF672
1169 | ZNF692
1170 | ZNF768
1171 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2021, carpenterlab
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | 3. Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # High-Dimensional Gene Expression and Morphology Profiles of Cells across 28,000 Genetic and Chemical Perturbations
2 | Populations of cells can be perturbed by various chemical and genetic treatments and the impact on the cells’ gene expression (transcription, i.e. mRNA levels) and morphology (in an image-based assay) can be measured in high dimensions.
3 | The patterns observed in this data can be used for more than a dozen applications in drug discovery and basic biology research.
4 | We provide a collection of four datasets where both gene expression and morphological data are available; roughly a thousand features are measured for each data type, across more than 28,000 chemical and genetic perturbations.
5 | We have defined a set of biological problems that can be investigated using these two data modalities and provided baseline analysis and evaluation metrics for addressing each.
6 |
7 | [Link to Paper](https://www.nature.com/articles/s41592-022-01667-0)
8 |
9 |
10 | # Data Modalities
11 |
12 | Click to expand
13 |
14 | ### Gene expression (GE) profiles
15 | Each cell has DNA in the nucleus which is transcribed into various mRNA molecules which are then translated into proteins that carry out functions in the cell.
16 | The levels of mRNA in the cell are often biologically meaningful - collectively, mRNA levels for a cell are known as its transcriptional state; each individual mRNA level is referred to as the corresponding gene's "expression".
17 | The L1000 assay was used to measure the transcriptional state of cells in the datasets here.
18 | The assay reports a sample's mRNA levels for 978 genes at high-throughput, from the bulk population of cells treated with a given perturbation.
19 | These 978 "landmark" genes capture approximately $80\%$ of the transcriptional variance for the entire genome.
20 | The data processing tools and workflows to produce these profiles are available at https://clue.io/.
21 |
22 |
23 | ### Cell Painting morphological (CP) profiles
24 | We used the Cell Painting assay to measure the morphological state of cells treated with a given perturbation.
25 | The assay captures fluorescence images of cells colored by six well-characterized fluorescent dyes to stain the nucleus, nucleoli, cytoplasmic RNA, endoplasmic reticulum, actin cytoskeleton, Golgi apparatus, plasma membrane and mitochondria.
26 | These eight labeled cell compartments are captured through five channels of high-resolution microscopy images (_DNA, RNA, ER, AGP_, and _Mito_).
27 | Images are then processed using [CellProfiler software](https://cellprofiler.org/) to extract thousands of features of each cell’s morphology and form a high-dimensional profile for each single cell.
28 | These features are based on various shape, intensity and texture statistics and are then aggregated across all the single cells in a "well" (a miniature test tube) to form what are called replicate-level profiles of perturbations.
29 | Aggregation of replicate-level profiles across all the wells or replicates of a perturbation is called a treatment-level profile.
30 | In our study, we used treatment-level profiles in all experiments but have provided replicate-level profiles for researchers interested in further data exploration.
31 |
32 |
33 |
34 | # Datasets
35 |
36 | - We have gathered the following five available data sets that had both Cell Painting morphological (CP) and L1000 gene expression (GE) profiles, preprocessed the data from different sources and in different formats in a unified .csv format.
37 |
38 | - CDRP-BBBC047-Bray-CP-GE (Cell line: U2OS)
39 | - CDRPBIO-BBBC036-Bray-CP-GE (Cell line: U2OS)
40 | - LUAD-BBBC041-Caicedo-CP-GE (Cell line: A549)
41 | - TA-ORF-BBBC037-Rohban-CP-GE (Cell line: U2OS)
42 | - LINCS-Pilot1-CP-GE (Cell line: A549)
43 |
44 | ## References to raw profiles and images
45 |
46 | Click to expand
47 |
48 | - CDRP-BBBC047-Bray-[CP](https://pubmed.ncbi.nlm.nih.gov/28327978/) - [GE](https://pubmed.ncbi.nlm.nih.gov/29195078/)
49 | - CDRP-bio-BBBC036-Bray-[CP](https://pubmed.ncbi.nlm.nih.gov/28327978/) - [GE](https://pubmed.ncbi.nlm.nih.gov/29195078/)
50 | - LUAD-BBBC041-Caicedo-[CP](https://registry.opendata.aws/cell-painting-image-collection/) - [GE](https://pubmed.ncbi.nlm.nih.gov/27478040/)
51 | - TA-ORF-BBBC037-Rohban-[CP](https://elifesciences.org/articles/24060) - [GE](https://github.com/carpenterlab/2017_rohban_elife/tree/master/input/TA-OE-L1000-B1)
52 | - LINCS-Pilot1-[CP](https://zenodo.org/record/3928744#.YNu3WzZKheV) - [GE](https://figshare.com/articles/dataset/L1000_data_for_profiling_comparison/13181966)
53 |
54 |
55 |
56 |
57 | ## Preprocessed publicly available profiles
58 | Preprocessed profiles (~9.5GB) are available on an S3 bucket.
59 | They can be downloaded at no cost and no need for registration of any sort, using the command:
60 |
61 | ```bash
62 | aws s3 sync \
63 | --no-sign-request \
64 | s3://cellpainting-gallery/cpg0003-rosetta/broad/workspace/preprocessed_data .
65 | ```
66 |
67 | See this [wiki](https://github.com/carpenterlab/2016_bray_natprot/wiki/What-do-Cell-Painting-features-mean%3F) for sample Cell Painting images and the meaning of ([CellProfiler](https://cellprofiler.org/)-derived) Cell Painting features.
68 |
69 | - AWS CLI installation instructions can be found [here](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html).
70 |
71 | ### Data version
72 |
73 | The [Etags](https://docs.aws.amazon.com/AmazonS3/latest/API/API_Object.html) of these files are listed [here](etag.json).
74 |
75 | They were generated using:
76 |
77 | ```sh
78 | aws s3api list-objects --bucket cellpainting-gallery --prefix rosetta/broad/workspace/preprocessed_data/
79 | ```
80 | ### CP-L1000 Profile descriptions
81 |
82 | We gathered four available data sets that had both Cell Painting morphological (CP) and L1000 gene expression (GE) profiles, preprocessed the data from different sources and in different formats into a unified .csv format, and made the data publicly available. Single cell morphological (CP) profiles were created using CellProfiler software and processed to form aggregated replicate and treatment levels using the R package [cytominer](https://github.com/cytomining/cytominer/blob/master/vignettes/cytominer-pipeline.Rmd).
83 | We made the following three types of profiles available for the Cell Painting modality of each of the four datasets:
84 |
85 |
86 | | Folder | File name | Description |
87 | | ------------ | -------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ |
88 | | CellPainting | `replicate_level_cp_augmented.csv` | Aggregated and Metadata annotated profiles which are the average of single cell profiles in each well. |
89 | | CellPainting | `replicate_level_cp_normalized.csv.gz` | Normalized profiles which are the z-scored aggregated profiles, where the scores are computed using the distribution of negative controls as the reference. |
90 | | CellPainting | `replicate_level_cp_normalized_variable_selected.csv.gz` | Normalized, variable-selected profiles, which are normalized profiles with feature selection applied. |
91 | | L1000 | `replicate_level_l1k.csv` | Aggregated and Metadata annotated profiles which are the average of single cell profiles in each well. |
92 |
93 |
94 |
95 | ### Metadata information
96 |
97 | This [spreadsheet](https://docs.google.com/spreadsheets/d/1EpqBLJqio8ptGlZe9Ywq1OUJahKSpYNb6S4lJ9yFc0o/edit#gid=174183831) contains a description of all the metadata fields across all 8 datasets.
98 |
99 | #### Keywords to match tables across modalities for each dataset
100 |
101 |
102 | | Dataset | perturbation match column<br>CP | perturbation match column<br>GE | Control perturbation value in each of columns<br>CP and GE |
103 | | :-------------------- | :------------------------------- | :------------------------------- | :---------------------------- |
104 | | CDRP-BBBC047-Bray | Metadata_Sample_Dose | pert_sample_dose | negcon |
105 | | CDRPBIO-BBBC036-Bray | Metadata_Sample_Dose | pert_sample_dose | negcon |
106 | | TA-ORF-BBBC037-Rohban | Metadata_broad_sample | pert_id | negcon |
107 | | LUAD-BBBC041-Caicedo | x_mutation_status | allele | negcon |
108 | | LINCS-Pilot1 | Metadata_pert_id_dose | pert_id_dose | negcon |
109 |
110 | * Two additional columns can also be used to filter for the "Control perturbation" in each data table:
111 | - **pert_type**, which can take 'trt' or 'control' values, and column control_type indicates negcon (otherwise empty).
112 | - **control_type**, which can take 'negcon' (for control) or NaN (for treatments) values.
113 |
114 | #### Number of features for each dataset
115 |
116 | | Dataset | GE | CP<br>`normalized` | CP<br>`normalized_variable_selected` |
117 | | -------- | --- | ------------------- | ------------------------------------- |
118 | | CDRP | 977 | 1565 | 727 |
119 | | CDRP-BIO | 977 | 1570 | 601 |
120 | | LUAD | 978 | 1569 | 291 |
121 | | TA-ORF | 978 | 1677 | 63 |
122 | | LINCS | 978 | 1670 | 119 |
123 |
124 |
125 | # Lookup table for L1000 genes predictability
126 |
127 | [Table](results/SingleGenePred/supplementary_D.csv)
128 |
129 |
130 | # License
131 |
132 | We license the data, results, and figures as [CC0 1.0](LICENSE_CC0.md) and the source code as BSD 3-Clause.
133 |
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: rosetta
2 | channels:
3 | - conda-forge
4 | dependencies:
5 | - pip=22.0.4
6 | - conda-forge::pandas=1.4.1
7 | - conda-forge::scikit-learn=1.0.2
8 | - conda-forge::umap-learn=0.5.2
9 | - conda-forge::jupyter=1.0.0
10 | - conda-forge::matplotlib=3.3.3
11 | - conda-forge::seaborn=0.11.2
12 | - conda-forge::openpyxl=3.0.9
13 |
--------------------------------------------------------------------------------
/etag.json:
--------------------------------------------------------------------------------
1 | {
2 | "Contents": [
3 | {
4 | "Key": "rosetta/broad/workspace/preprocessed_data/CDRP-BBBC047-Bray/CellPainting/replicate_level_cp_augmented.csv.gz",
5 | "LastModified": "2022-02-25T20:24:06.000Z",
6 | "ETag": "\"8367b77b245035279d21e083fb57564e-261\"",
7 | "Size": 2183033139,
8 | "StorageClass": "STANDARD",
9 | "Owner": {
10 | "DisplayName": "cellpainting",
11 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
12 | }
13 | },
14 | {
15 | "Key": "rosetta/broad/workspace/preprocessed_data/CDRP-BBBC047-Bray/CellPainting/replicate_level_cp_normalized.csv.gz",
16 | "LastModified": "2022-02-25T20:24:06.000Z",
17 | "ETag": "\"572869293e0cfacdd8882c2b758fac00-272\"",
18 | "Size": 2277911750,
19 | "StorageClass": "STANDARD",
20 | "Owner": {
21 | "DisplayName": "cellpainting",
22 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
23 | }
24 | },
25 | {
26 | "Key": "rosetta/broad/workspace/preprocessed_data/CDRP-BBBC047-Bray/CellPainting/replicate_level_cp_normalized_variable_selected.csv.gz",
27 | "LastModified": "2022-02-25T20:24:06.000Z",
28 | "ETag": "\"510f9c5a93436c8af2f36f0308c78be0-131\"",
29 | "Size": 1098352960,
30 | "StorageClass": "STANDARD",
31 | "Owner": {
32 | "DisplayName": "cellpainting",
33 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
34 | }
35 | },
36 | {
37 | "Key": "rosetta/broad/workspace/preprocessed_data/CDRP-BBBC047-Bray/L1000/replicate_level_l1k.csv.gz",
38 | "LastModified": "2022-02-25T20:24:06.000Z",
39 | "ETag": "\"40e1f7285238c5381b9d9fdeebb5a026-32\"",
40 | "Size": 262406281,
41 | "StorageClass": "STANDARD",
42 | "Owner": {
43 | "DisplayName": "cellpainting",
44 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
45 | }
46 | },
47 | {
48 | "Key": "rosetta/broad/workspace/preprocessed_data/CDRP-BBBC047-Bray/L1000/replicate_level_l1k_pclfc.csv.gz",
49 | "LastModified": "2022-02-25T20:24:06.000Z",
50 | "ETag": "\"630b98d69d185f530acfb0c272e82031-31\"",
51 | "Size": 258651159,
52 | "StorageClass": "STANDARD",
53 | "Owner": {
54 | "DisplayName": "cellpainting",
55 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
56 | }
57 | },
58 | {
59 | "Key": "rosetta/broad/workspace/preprocessed_data/CDRP-BBBC047-Bray/L1000/replicate_level_l1k_pczscore.csv.gz",
60 | "LastModified": "2022-02-25T20:24:13.000Z",
61 | "ETag": "\"5ad1f4b412c8ea9b9abb55a254a7ebbe-72\"",
62 | "Size": 603440498,
63 | "StorageClass": "STANDARD",
64 | "Owner": {
65 | "DisplayName": "cellpainting",
66 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
67 | }
68 | },
69 | {
70 | "Key": "rosetta/broad/workspace/preprocessed_data/CDRP-BBBC047-Bray/L1000/replicate_level_l1k_vczscore.csv.gz",
71 | "LastModified": "2022-02-25T20:24:13.000Z",
72 | "ETag": "\"b58b4d31e96964f28165f048bdfd60c8-73\"",
73 | "Size": 605293966,
74 | "StorageClass": "STANDARD",
75 | "Owner": {
76 | "DisplayName": "cellpainting",
77 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
78 | }
79 | },
80 | {
81 | "Key": "rosetta/broad/workspace/preprocessed_data/CDRP-BBBC047-Bray/L1000/treatment_level_l1k.csv.gz",
82 | "LastModified": "2022-02-25T20:24:27.000Z",
83 | "ETag": "\"e695e3d5f520553f516516ab8719719f-13\"",
84 | "Size": 107934871,
85 | "StorageClass": "STANDARD",
86 | "Owner": {
87 | "DisplayName": "cellpainting",
88 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
89 | }
90 | },
91 | {
92 | "Key": "rosetta/broad/workspace/preprocessed_data/CDRPBIO-BBBC036-Bray/CellPainting/replicate_level_cp_augmented.csv.gz",
93 | "LastModified": "2022-02-25T20:24:27.000Z",
94 | "ETag": "\"3e199aeba5209250e0d2c5948f5bd522-36\"",
95 | "Size": 298941736,
96 | "StorageClass": "STANDARD",
97 | "Owner": {
98 | "DisplayName": "cellpainting",
99 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
100 | }
101 | },
102 | {
103 | "Key": "rosetta/broad/workspace/preprocessed_data/CDRPBIO-BBBC036-Bray/CellPainting/replicate_level_cp_normalized.csv.gz",
104 | "LastModified": "2022-02-25T20:24:30.000Z",
105 | "ETag": "\"0b86065f8840aff626d64c6f52a8caf4-38\"",
106 | "Size": 311539701,
107 | "StorageClass": "STANDARD",
108 | "Owner": {
109 | "DisplayName": "cellpainting",
110 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
111 | }
112 | },
113 | {
114 | "Key": "rosetta/broad/workspace/preprocessed_data/CDRPBIO-BBBC036-Bray/CellPainting/replicate_level_cp_normalized_variable_selected.csv.gz",
115 | "LastModified": "2022-02-25T20:24:32.000Z",
116 | "ETag": "\"bffd9db9578fcc70bbd7d72e0dfff773-14\"",
117 | "Size": 117242590,
118 | "StorageClass": "STANDARD",
119 | "Owner": {
120 | "DisplayName": "cellpainting",
121 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
122 | }
123 | },
124 | {
125 | "Key": "rosetta/broad/workspace/preprocessed_data/CDRPBIO-BBBC036-Bray/L1000/replicate_level_l1k.csv.gz",
126 | "LastModified": "2022-02-25T20:24:35.000Z",
127 | "ETag": "\"5b45e5cb94f0466a2abb11fbac8a655e-4\"",
128 | "Size": 26842289,
129 | "StorageClass": "STANDARD",
130 | "Owner": {
131 | "DisplayName": "cellpainting",
132 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
133 | }
134 | },
135 | {
136 | "Key": "rosetta/broad/workspace/preprocessed_data/LINCS-Pilot1/CellPainting/replicate_level_cp_augmented.csv.gz",
137 | "LastModified": "2022-02-25T20:24:35.000Z",
138 | "ETag": "\"9bde4d7112c06ffa1849fbfa4efa22f1-36\"",
139 | "Size": 296762474,
140 | "StorageClass": "STANDARD",
141 | "Owner": {
142 | "DisplayName": "cellpainting",
143 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
144 | }
145 | },
146 | {
147 | "Key": "rosetta/broad/workspace/preprocessed_data/LINCS-Pilot1/CellPainting/replicate_level_cp_normalized.csv.gz",
148 | "LastModified": "2022-02-25T20:24:36.000Z",
149 | "ETag": "\"f42af6b4109ef9ed110004def49f6c2c-36\"",
150 | "Size": 299683743,
151 | "StorageClass": "STANDARD",
152 | "Owner": {
153 | "DisplayName": "cellpainting",
154 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
155 | }
156 | },
157 | {
158 | "Key": "rosetta/broad/workspace/preprocessed_data/LINCS-Pilot1/CellPainting/replicate_level_cp_normalized_variable_selected.csv.gz",
159 | "LastModified": "2022-02-25T20:24:38.000Z",
160 | "ETag": "\"33783625dc59b0de2bf16c299f5380dd-12\"",
161 | "Size": 94527797,
162 | "StorageClass": "STANDARD",
163 | "Owner": {
164 | "DisplayName": "cellpainting",
165 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
166 | }
167 | },
168 | {
169 | "Key": "rosetta/broad/workspace/preprocessed_data/LINCS-Pilot1/L1000/level_3.csv.gz",
170 | "LastModified": "2022-02-25T20:24:41.000Z",
171 | "ETag": "\"8491fe32e9b0b040f10c7d51225d6111-11\"",
172 | "Size": 89725093,
173 | "StorageClass": "STANDARD",
174 | "Owner": {
175 | "DisplayName": "cellpainting",
176 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
177 | }
178 | },
179 | {
180 | "Key": "rosetta/broad/workspace/preprocessed_data/LINCS-Pilot1/L1000/level_4.csv.gz",
181 | "LastModified": "2022-02-25T20:24:42.000Z",
182 | "ETag": "\"14679d4b4cae5e12a4e7be8255bd22ff-10\"",
183 | "Size": 78596325,
184 | "StorageClass": "STANDARD",
185 | "Owner": {
186 | "DisplayName": "cellpainting",
187 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
188 | }
189 | },
190 | {
191 | "Key": "rosetta/broad/workspace/preprocessed_data/LINCS-Pilot1/L1000/level_4W.csv.gz",
192 | "LastModified": "2022-02-25T20:24:43.000Z",
193 | "ETag": "\"370607c1f148942263037a7e26018303-17\"",
194 | "Size": 140912507,
195 | "StorageClass": "STANDARD",
196 | "Owner": {
197 | "DisplayName": "cellpainting",
198 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
199 | }
200 | },
201 | {
202 | "Key": "rosetta/broad/workspace/preprocessed_data/LINCS-Pilot1/L1000/level_5_modz.csv.gz",
203 | "LastModified": "2022-02-25T20:24:43.000Z",
204 | "ETag": "\"5967bd8a92d2c57242436330950f1cd2\"",
205 | "Size": 3631,
206 | "StorageClass": "STANDARD",
207 | "Owner": {
208 | "DisplayName": "cellpainting",
209 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
210 | }
211 | },
212 | {
213 | "Key": "rosetta/broad/workspace/preprocessed_data/LINCS-Pilot1/L1000/level_5_rank.csv.gz",
214 | "LastModified": "2022-02-25T20:24:43.000Z",
215 | "ETag": "\"83c8146ea2f8a2a6392643b3c4472727\"",
216 | "Size": 3631,
217 | "StorageClass": "STANDARD",
218 | "Owner": {
219 | "DisplayName": "cellpainting",
220 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
221 | }
222 | },
223 | {
224 | "Key": "rosetta/broad/workspace/preprocessed_data/LINCS-Pilot1/L1000/replicate_level_l1k.csv.gz",
225 | "LastModified": "2022-02-25T20:24:44.000Z",
226 | "ETag": "\"872c318560ba21c9d36e805fb97992a4-10\"",
227 | "Size": 78596337,
228 | "StorageClass": "STANDARD",
229 | "Owner": {
230 | "DisplayName": "cellpainting",
231 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
232 | }
233 | },
234 | {
235 | "Key": "rosetta/broad/workspace/preprocessed_data/LUAD-BBBC041-Caicedo/CellPainting/replicate_level_cp_augmented.csv.gz",
236 | "LastModified": "2022-02-25T20:24:44.000Z",
237 | "ETag": "\"11a0a26d299f09452455e0c7e44c571c-11\"",
238 | "Size": 85105940,
239 | "StorageClass": "STANDARD",
240 | "Owner": {
241 | "DisplayName": "cellpainting",
242 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
243 | }
244 | },
245 | {
246 | "Key": "rosetta/broad/workspace/preprocessed_data/LUAD-BBBC041-Caicedo/CellPainting/replicate_level_cp_normalized.csv.gz",
247 | "LastModified": "2022-02-25T20:24:46.000Z",
248 | "ETag": "\"f91d40a978c96834973f24b96b8a3b02-11\"",
249 | "Size": 88273100,
250 | "StorageClass": "STANDARD",
251 | "Owner": {
252 | "DisplayName": "cellpainting",
253 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
254 | }
255 | },
256 | {
257 | "Key": "rosetta/broad/workspace/preprocessed_data/LUAD-BBBC041-Caicedo/CellPainting/replicate_level_cp_normalized_variable_selected.csv.gz",
258 | "LastModified": "2022-02-25T20:24:47.000Z",
259 | "ETag": "\"1ba6936ab1188268850a798e30c4823f-2\"",
260 | "Size": 16570136,
261 | "StorageClass": "STANDARD",
262 | "Owner": {
263 | "DisplayName": "cellpainting",
264 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
265 | }
266 | },
267 | {
268 | "Key": "rosetta/broad/workspace/preprocessed_data/LUAD-BBBC041-Caicedo/L1000/replicate_level_l1k.csv.gz",
269 | "LastModified": "2022-02-25T20:24:47.000Z",
270 | "ETag": "\"c1b8cabef1934d213baf797b80c4c32c-2\"",
271 | "Size": 11448027,
272 | "StorageClass": "STANDARD",
273 | "Owner": {
274 | "DisplayName": "cellpainting",
275 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
276 | }
277 | },
278 | {
279 | "Key": "rosetta/broad/workspace/preprocessed_data/LUAD-BBBC041-Caicedo/L1000/replicate_level_l1k_Juan.csv.gz",
280 | "LastModified": "2022-02-25T20:24:47.000Z",
281 | "ETag": "\"587d00f75c5fa6164929e3592bf96080-4\"",
282 | "Size": 25582111,
283 | "StorageClass": "STANDARD",
284 | "Owner": {
285 | "DisplayName": "cellpainting",
286 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
287 | }
288 | },
289 | {
290 | "Key": "rosetta/broad/workspace/preprocessed_data/LUAD-BBBC041-Caicedo/L1000/treatment_level_l1k.csv.gz",
291 | "LastModified": "2022-02-25T20:24:48.000Z",
292 | "ETag": "\"c7f285af2a39efc64a4c8d57854d6a0e\"",
293 | "Size": 4575373,
294 | "StorageClass": "STANDARD",
295 | "Owner": {
296 | "DisplayName": "cellpainting",
297 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
298 | }
299 | },
300 | {
301 | "Key": "rosetta/broad/workspace/preprocessed_data/TA-ORF-BBBC037-Rohban/CellPainting/replicate_level_cp_augmented.csv.gz",
302 | "LastModified": "2022-02-25T20:24:48.000Z",
303 | "ETag": "\"9707bd02924cda850ed6f1e7eba33d9a-4\"",
304 | "Size": 27548449,
305 | "StorageClass": "STANDARD",
306 | "Owner": {
307 | "DisplayName": "cellpainting",
308 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
309 | }
310 | },
311 | {
312 | "Key": "rosetta/broad/workspace/preprocessed_data/TA-ORF-BBBC037-Rohban/CellPainting/replicate_level_cp_normalized.csv.gz",
313 | "LastModified": "2022-02-25T20:24:48.000Z",
314 | "ETag": "\"736ef2b85bf5406f27239153f3772218-4\"",
315 | "Size": 27482072,
316 | "StorageClass": "STANDARD",
317 | "Owner": {
318 | "DisplayName": "cellpainting",
319 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
320 | }
321 | },
322 | {
323 | "Key": "rosetta/broad/workspace/preprocessed_data/TA-ORF-BBBC037-Rohban/CellPainting/replicate_level_cp_normalized_variable_selected.csv.gz",
324 | "LastModified": "2022-02-25T20:24:48.000Z",
325 | "ETag": "\"1315c2fd175b265d10e929e51d9dfef0\"",
326 | "Size": 1106334,
327 | "StorageClass": "STANDARD",
328 | "Owner": {
329 | "DisplayName": "cellpainting",
330 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
331 | }
332 | },
333 | {
334 | "Key": "rosetta/broad/workspace/preprocessed_data/TA-ORF-BBBC037-Rohban/L1000/replicate_level_l1k.csv.gz",
335 | "LastModified": "2022-02-25T20:24:49.000Z",
336 | "ETag": "\"1e643bb1182555a8e7699230a0ea98d1\"",
337 | "Size": 2022367,
338 | "StorageClass": "STANDARD",
339 | "Owner": {
340 | "DisplayName": "cellpainting",
341 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
342 | }
343 | },
344 | {
345 | "Key": "rosetta/broad/workspace/preprocessed_data/TA-ORF-BBBC037-Rohban/L1000/replicate_level_l1k_QNORM.csv.gz",
346 | "LastModified": "2022-02-25T20:24:49.000Z",
347 | "ETag": "\"8ffb9c82772442cbbd138a6ab05a9a97\"",
348 | "Size": 1782302,
349 | "StorageClass": "STANDARD",
350 | "Owner": {
351 | "DisplayName": "cellpainting",
352 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
353 | }
354 | },
355 | {
356 | "Key": "rosetta/broad/workspace/preprocessed_data/TA-ORF-BBBC037-Rohban/L1000/replicate_level_l1k_ZSPCQNORM.csv.gz",
357 | "LastModified": "2022-02-25T20:24:49.000Z",
358 | "ETag": "\"36783d73bb48bec466aeda707384c7e5\"",
359 | "Size": 1997953,
360 | "StorageClass": "STANDARD",
361 | "Owner": {
362 | "DisplayName": "cellpainting",
363 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
364 | }
365 | }
366 | ]
367 | }
--------------------------------------------------------------------------------
/idmap.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/idmap.xlsx
--------------------------------------------------------------------------------
/read_and_match_profiles.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 35,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "The autoreload extension is already loaded. To reload it, use:\n",
13 | " %reload_ext autoreload\n"
14 | ]
15 | }
16 | ],
17 | "source": [
18 | "%matplotlib inline\n",
19 | "%load_ext autoreload\n",
20 | "%autoreload 2\n",
21 | "import numpy as np\n",
22 | "import pandas as pd\n",
23 | "import matplotlib.pyplot as plt\n",
24 | "import seaborn as sns\n",
25 | "from utils.readProfiles import *"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 4,
31 | "metadata": {},
32 | "outputs": [],
33 | "source": [
34 | "# ls"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {},
40 | "source": [
41 | "### Metadata column in each dataset to match perturbations across modalities\n",
42 | "\n",
43 | "Table 1.\n",
44 | "\n",
45 | "| Dataset | perturbation match column<br/>CP | perturbation match column<br/>GE | Control perturbation value<br/>CP/GE|\n",
46 | "|:----------------------|:-----------------|:-----------------------------|:--------------|\n",
47 | "| CDRP-BBBC047-Bray | Metadata_Sample_Dose | pert_sample_dose | negcon |\n",
48 | "| CDRPBIO-BBBC036-Bray | Metadata_Sample_Dose | pert_sample_dose | negcon |\n",
49 | "| TA-ORF-BBBC037-Rohban | Metadata_broad_sample | pert_id | negcon |\n",
50 | "| LUAD-BBBC041-Caicedo | x_mutation_status | allele | negcon|\n",
51 | "| LINCS-Pilot1 | Metadata_pert_id_dose | pert_id_dose | negcon |\n"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 36,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "ds_info_dict = {\n",
61 | " \"CDRP\": [\"CDRP-BBBC047-Bray\", [\"Metadata_Sample_Dose\", \"pert_sample_dose\"]],\n",
62 | " \"CDRP-bio\": [\"CDRPBIO-BBBC036-Bray\", [\"Metadata_Sample_Dose\", \"pert_sample_dose\"]],\n",
63 | " \"TAORF\": [\"TA-ORF-BBBC037-Rohban\", [\"Metadata_broad_sample\", \"pert_id\"]],\n",
64 | " \"LUAD\": [\"LUAD-BBBC041-Caicedo\", [\"x_mutation_status\", \"allele\"]],\n",
65 | " \"LINCS\": [\"LINCS-Pilot1\", [\"Metadata_pert_id_dose\", \"pert_id_dose\"]],\n",
66 | "}\n",
67 | "# pd.DataFrame(ds_info_dict.values(), index=ds_info_dict.keys()).to_markdown(index=False)"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": []
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "### In this notebook you can find examples of how to:\n",
82 | "- read replicate or treatment level profiles \n",
83 | "- match profiles across modalities\n",
84 | "\n",
85 | "\n",
86 | "\n",
87 | "* Functions used in this notebook:\n",
88 | "\n",
89 | " - Read **treatment** level data\n",
90 | " - read_treatment_level_profiles\n",
91 | " \n",
92 | " - Read and match **treatment** level data\n",
93 | " - read_paired_treatment_level_profiles\n",
94 | " \n",
95 | " - Read **Replicate** level data\n",
96 | " - read_replicate_level_profiles\n",
97 | " \n",
98 | " - Read and match **Replicate** level data\n",
99 | " - read_paired_replicate_level_profiles\n"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "### User input parameters"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": 37,
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "####################### Root directories ###############################################\n",
116 | "procProf_dir = \"/home/ubuntu/gallery/cpg0003-rosetta/broad/workspace/\"\n",
117 | "# procProf_dir = \"/home/ubuntu/bucket/projects/2018_04_20_Rosetta/workspace/\"\n",
118 | "\n",
119 | "############################# Dataset ##################################################\n",
120 | "# dataset options: 'LUAD', 'TAORF', 'LINCS', 'CDRP-bio', 'CDRP'\n",
121 | "dataset = \"LUAD\"\n",
122 | "\n",
123 | "####################### Type of cell painting profile to read ##########################\n",
124 | "# CP Profile Type options: 'augmented' , 'normalized', 'normalized_variable_selected'\n",
125 | "profileType = \"normalized_variable_selected\"\n",
126 | "\n",
127 | "############################ Filtering low quality samples option #######################\n",
128 | "# filtering to compounds which have high replicates for both GE and CP datasets\n",
129 | "# highRepOverlapEnabled=0\n",
130 | "# 'highRepUnion','highRepOverlap'\n",
131 | "filter_perts = \"highRepUnion\"\n",
132 | "repCorrFilePath = \"./results/RepCor/RepCorrDF.xlsx\"\n",
133 | "\n",
134 | "filter_repCorr_params = [filter_perts, repCorrFilePath]"
135 | ]
136 | },
137 | {
138 | "cell_type": "markdown",
139 | "metadata": {},
140 | "source": [
141 | "### Read Replicate level profiles"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": 14,
147 | "metadata": {},
148 | "outputs": [
149 | {
150 | "name": "stderr",
151 | "output_type": "stream",
152 | "text": [
153 | "/home/ubuntu/workspace_rosetta/workspace/software/2022_Haghighi_NatureMethods/utils/readProfiles.py:54: DtypeWarning: Columns (1023,1028,1032) have mixed types. Specify dtype option on import or set low_memory=False.\n",
154 | " l1k_data_repLevel = pd.read_csv(dataDir + \"/L1000/replicate_level_l1k.csv.gz\")\n"
155 | ]
156 | }
157 | ],
158 | "source": [
159 | "# dataset = \"LINCS\"\n",
160 | "per_plate_normalized_flag = 0\n",
161 | "[cp_data_repLevel, cp_features], [l1k_data_repLevel, l1k_features] = read_replicate_level_profiles(\n",
162 | " procProf_dir, dataset, profileType, per_plate_normalized_flag\n",
163 | ")"
164 | ]
165 | },
166 | {
167 | "cell_type": "markdown",
168 | "metadata": {},
169 | "source": [
170 | "### Read and pair Replicate level profiles"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": 6,
176 | "metadata": {},
177 | "outputs": [
178 | {
179 | "name": "stderr",
180 | "output_type": "stream",
181 | "text": [
182 | "/home/ubuntu/workspace_rosetta/workspace/software/2022_Haghighi_NatureMethods/utils/readProfiles.py:51: DtypeWarning: Columns (18,19,1249,1250) have mixed types. Specify dtype option on import or set low_memory=False.\n",
183 | " cp_data_repLevel = pd.read_csv(\n"
184 | ]
185 | },
186 | {
187 | "name": "stdout",
188 | "output_type": "stream",
189 | "text": [
190 | "LINCS: Replicate Level Shapes (nSamples x nFeatures): cp: 52223 , 119 , l1k: 27837 , 978\n",
191 | "l1k n of rep: 3.0\n",
192 | "cp n of rep: 5.0\n",
193 | "CP: from 9394 to 4647\n",
194 | "l1k: from 8369 to 2338\n",
195 | "CP and l1k high rep union: 5845\n"
196 | ]
197 | },
198 | {
199 | "name": "stderr",
200 | "output_type": "stream",
201 | "text": [
202 | "/home/ubuntu/workspace_rosetta/workspace/software/2022_Haghighi_NatureMethods/utils/readProfiles.py:376: FutureWarning: Passing 'suffixes' which cause duplicate columns {'pert_type_y'} in the result is deprecated and will raise a MergeError in a future version.\n",
203 | " mergedProfiles_repLevel = pd.merge(\n"
204 | ]
205 | }
206 | ],
207 | "source": [
208 | "nRep = 2\n",
209 | "per_plate_normalized_flag = 1\n",
210 | "mergedProfiles_repLevel, cp_features, l1k_features = read_paired_replicate_level_profiles(\n",
211 | " procProf_dir, dataset, profileType, nRep, filter_repCorr_params, per_plate_normalized_flag\n",
212 | ")"
213 | ]
214 | },
215 | {
216 | "cell_type": "markdown",
217 | "metadata": {},
218 | "source": [
219 | "### Read treatment level profiles"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": 7,
225 | "metadata": {},
226 | "outputs": [
227 | {
228 | "name": "stderr",
229 | "output_type": "stream",
230 | "text": [
231 | "/home/ubuntu/workspace_rosetta/workspace/software/2022_Haghighi_NatureMethods/utils/readProfiles.py:51: DtypeWarning: Columns (18,19,1249,1250) have mixed types. Specify dtype option on import or set low_memory=False.\n",
232 | " cp_data_repLevel = pd.read_csv(\n"
233 | ]
234 | },
235 | {
236 | "name": "stdout",
237 | "output_type": "stream",
238 | "text": [
239 | "LINCS: Replicate Level Shapes (nSamples x nFeatures): cp: 52223 , 119 , l1k: 27837 , 978\n",
240 | "l1k n of rep: 3.0\n",
241 | "cp n of rep: 5.0\n",
242 | "CP: from 9394 to 4647\n",
243 | "l1k: from 8369 to 2338\n",
244 | "CP and l1k high rep union: 5845\n"
245 | ]
246 | }
247 | ],
248 | "source": [
249 | "[cp_data_treatLevel, cp_features], [\n",
250 | " l1k_data_treatLevel,\n",
251 | " l1k_features,\n",
252 | "] = read_treatment_level_profiles(\n",
253 | " procProf_dir, dataset, profileType, filter_repCorr_params, per_plate_normalized_flag\n",
254 | ")"
255 | ]
256 | },
257 | {
258 | "cell_type": "markdown",
259 | "metadata": {},
260 | "source": [
261 | "### Read and pair treatment level profiles"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": 9,
267 | "metadata": {},
268 | "outputs": [
269 | {
270 | "name": "stderr",
271 | "output_type": "stream",
272 | "text": [
273 | "/home/ubuntu/workspace_rosetta/workspace/software/2022_Haghighi_NatureMethods/utils/readProfiles.py:51: DtypeWarning: Columns (18,19,1249,1250) have mixed types. Specify dtype option on import or set low_memory=False.\n",
274 | " cp_data_repLevel = pd.read_csv(\n"
275 | ]
276 | },
277 | {
278 | "name": "stdout",
279 | "output_type": "stream",
280 | "text": [
281 | "LINCS: Replicate Level Shapes (nSamples x nFeatures): cp: 52223 , 119 , l1k: 27837 , 978\n",
282 | "l1k n of rep: 3.0\n",
283 | "cp n of rep: 5.0\n",
284 | "CP: from 9394 to 4647\n",
285 | "l1k: from 8369 to 2338\n",
286 | "CP and l1k high rep union: 5845\n",
287 | "Treatment Level Shapes (nSamples x nFeatures+metadata): (5243, 122) (4431, 980) Merged Profiles Shape: (3828, 1101)\n"
288 | ]
289 | }
290 | ],
291 | "source": [
292 | "mergedProfiles_treatLevel, cp_features, l1k_features = read_paired_treatment_level_profiles(\n",
293 | " procProf_dir, dataset, profileType, filter_repCorr_params, per_plate_normalized_flag\n",
294 | ")"
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": null,
300 | "metadata": {},
301 | "outputs": [],
302 | "source": []
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": 40,
307 | "metadata": {
308 | "scrolled": false
309 | },
310 | "outputs": [],
311 | "source": [
312 | "# l1k_data_repLevel[ds_info_dict[dataset][1][1]].unique()\n",
313 | "# cp_data_repLevel[ds_info_dict[dataset][1][0]].unique()"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": 41,
319 | "metadata": {},
320 | "outputs": [],
321 | "source": [
322 | "# per_plate_normalized_flag"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": null,
328 | "metadata": {},
329 | "outputs": [],
330 | "source": []
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": null,
335 | "metadata": {},
336 | "outputs": [],
337 | "source": []
338 | }
339 | ],
340 | "metadata": {
341 | "kernelspec": {
342 | "display_name": "Python 3 (ipykernel)",
343 | "language": "python",
344 | "name": "python3"
345 | },
346 | "language_info": {
347 | "codemirror_mode": {
348 | "name": "ipython",
349 | "version": 3
350 | },
351 | "file_extension": ".py",
352 | "mimetype": "text/x-python",
353 | "name": "python",
354 | "nbconvert_exporter": "python",
355 | "pygments_lexer": "ipython3",
356 | "version": "3.9.0"
357 | },
358 | "latex_envs": {
359 | "LaTeX_envs_menu_present": true,
360 | "autoclose": false,
361 | "autocomplete": true,
362 | "bibliofile": "biblio.bib",
363 | "cite_by": "apalike",
364 | "current_citInitial": 1,
365 | "eqLabelWithNumbers": true,
366 | "eqNumInitial": 1,
367 | "hotkeys": {
368 | "equation": "Ctrl-E",
369 | "itemize": "Ctrl-I"
370 | },
371 | "labels_anchors": false,
372 | "latex_user_defs": false,
373 | "report_style_numbering": false,
374 | "user_envs_cfg": false
375 | },
376 | "varInspector": {
377 | "cols": {
378 | "lenName": 16,
379 | "lenType": 16,
380 | "lenVar": 40
381 | },
382 | "kernels_config": {
383 | "python": {
384 | "delete_cmd_postfix": "",
385 | "delete_cmd_prefix": "del ",
386 | "library": "var_list.py",
387 | "varRefreshCmd": "print(var_dic_list())"
388 | },
389 | "r": {
390 | "delete_cmd_postfix": ") ",
391 | "delete_cmd_prefix": "rm(",
392 | "library": "var_list.r",
393 | "varRefreshCmd": "cat(var_dic_list()) "
394 | }
395 | },
396 | "position": {
397 | "height": "438.212px",
398 | "left": "1507.78px",
399 | "right": "20px",
400 | "top": "120px",
401 | "width": "350px"
402 | },
403 | "types_to_exclude": [
404 | "module",
405 | "function",
406 | "builtin_function_or_method",
407 | "instance",
408 | "_Feature"
409 | ],
410 | "window_display": false
411 | }
412 | },
413 | "nbformat": 4,
414 | "nbformat_minor": 2
415 | }
416 |
--------------------------------------------------------------------------------
/results/DAVIDoutput_CytoScapeInput_Figure2d/chart_UP_KEYWORDS_FunctionalAnot_top.txt:
--------------------------------------------------------------------------------
1 | Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR
2 | UP_KEYWORDS Acetylation 30 54.54545454545454 5.124648987027806E-10 TOP2A, CLIC4, SLC35F2, NOLC1, GLRX, EBP, NNT, STMN1, ANXA7, PCBD1, HADH, LBR, LIG1, TXNRD1, USP22, TPM1, RPA1, PYCR1, DDX10, PAICS, HIST2H2BE, CCNA2, GNPDA1, MTHFD2, BIRC5, PSMG1, KIF2C, KIF20A, ARHGEF2, PAFAH1B3 55 3424 20581 3.2786214953271027 8.609409085647002E-8 8.609410298206715E-8 8.45567082859588E-8
3 | UP_KEYWORDS Phosphoprotein 36 65.45454545454545 2.0388780008061567E-4 TOP2A, CLIC4, SLC35F2, INPP1, MRPL19, FHL2, NOLC1, PSIP1, RELB, PGRMC1, OXCT1, STMN1, STX4, CPNE3, LBR, MPZL1, IER3, LIG1, TXNRD1, TPM1, RPA1, PYCR1, DDX10, PAICS, HIST2H2BE, CCNA2, TXLNA, GNPDA1, TCEA2, BIRC5, PSMG1, KIF2C, NCAPD2, KIF20A, ARHGEF2, PAFAH1B3 55 8246 20581 1.6336648071792386 0.03367652713909708 0.017126575206771716 0.016820743506650793
4 | UP_KEYWORDS Cell division 6 10.909090909090908 0.003440617247463365 CCNA2, LIG1, BIRC5, KIF2C, NCAPD2, ARHGEF2 55 388 20581 5.78659793814433 0.4395528677953666 0.19267456585794843 0.18923394861048506
5 | UP_KEYWORDS Mitosis 5 9.090909090909092 0.004928799319810609 CCNA2, BIRC5, KIF2C, NCAPD2, ARHGEF2 55 262 20581 7.141221374045802 0.5639880523221983 0.19369482099750268 0.19023598490826157
6 | UP_KEYWORDS Microtubule 5 9.090909090909092 0.006220048085963932 STMN1, BIRC5, KIF2C, KIF20A, ARHGEF2 55 280 20581 6.682142857142857 0.649442836226727 0.19369482099750268 0.19023598490826157
7 | UP_KEYWORDS Cell cycle 7 12.727272727272727 0.006917672178482239 CCNA2, LIG1, USP22, BIRC5, KIF2C, NCAPD2, ARHGEF2 55 650 20581 4.029846153846154 0.6884536989331027 0.19369482099750268 0.19023598490826157
8 | UP_KEYWORDS Transit peptide 6 10.909090909090908 0.013033979603529661 ALAS1, NNT, OXCT1, MTHFD2, MRPL19, HADH 55 536 20581 4.188805970149254 0.8896506576574663 0.3128155104847119 0.30722951922605635
9 | UP_KEYWORDS Cytoplasm 21 38.18181818181819 0.01679969816231558 TOP2A, CLIC4, ZNF274, TXNRD1, TPM1, FHL2, NOLC1, GLRX, RELB, CCNA2, GNPDA1, STMN1, BIRC5, PCBD1, PSMG1, KIF2C, NCAPD2, CPNE3, KIF20A, ARHGEF2, PAFAH1B3 55 4816 20581 1.631686046511628 0.9419427396195738 0.33607778138786243 0.3300763924345077
10 | UP_KEYWORDS Oxidoreductase 6 10.909090909090908 0.018004166860064057 NNT, MTHFD2, P4HA2, TXNRD1, PYCR1, HADH 55 582 20581 3.857731958762886 0.9527479826636567 0.33607778138786243 0.3300763924345077
11 | UP_KEYWORDS Mitochondrion 8 14.545454545454545 0.026249398106128065 CLIC4, ALAS1, NNT, OXCT1, MTHFD2, MRPL19, PYCR1, HADH 55 1119 20581 2.6752457551385165 0.9885390617335139 0.39814595131174274 0.3910362021811759
12 | UP_KEYWORDS Isopeptide bond 8 14.545454545454545 0.027734439083459617 TOP2A, LIG1, TPM1, FHL2, RPA1, TCEA2, NOLC1, HIST2H2BE 55 1132 20581 2.64452296819788 0.9911312247584608 0.39814595131174274 0.3910362021811759
13 | UP_KEYWORDS Cytoskeleton 8 14.545454545454545 0.028438996522267338 CLIC4, TPM1, STMN1, BIRC5, KIF2C, KIF20A, ARHGEF2, RELB 55 1138 20581 2.630579964850615 0.9921481397546121 0.39814595131174274 0.3910362021811759
14 | UP_KEYWORDS Ubl conjugation 10 18.181818181818183 0.03212534559864598 TOP2A, CCNA2, TXNRD1, FHL2, RPA1, TCEA2, NOLC1, BIRC5, KIF2C, HIST2H2BE 55 1705 20581 2.1947214076246335 0.9958541680959994 0.4151583123517327 0.4077447710597374
15 | UP_KEYWORDS Nucleus 21 38.18181818181819 0.04001475328204553 TOP2A, CLIC4, ZNF274, LIG1, TXNRD1, USP22, FHL2, RPA1, PSIP1, NOLC1, HIST2H2BE, RELB, CCNA2, EBP, TCEA2, BIRC5, PCBD1, KIF2C, NCAPD2, CPNE3, LBR 55 5244 20581 1.498512585812357 0.998951795861702 0.48017703938454637 0.4716024493955367
16 | UP_KEYWORDS Magnesium 5 9.090909090909092 0.05656549310361622 TOP2A, LIG1, INPP1, MTHFD2, ATP2C1 55 552 20581 3.3894927536231885 0.9999435615398904 0.6335335227605017 0.6222204241397784
17 | UP_KEYWORDS Disease mutation 12 21.818181818181817 0.06441555750669184 EBP, NPC1, NNT, OXCT1, P4HA2, TPM1, PYCR1, PCBD1, ATP2C1, HADH, SLC37A4, LBR 55 2550 20581 1.7609411764705882 0.9999861342755192 0.6763633538202644 0.6642854367877596
18 | UP_KEYWORDS ATP-binding 8 14.545454545454545 0.07015335858609392 TOP2A, LIG1, NOLC1, DDX10, KIF2C, KIF20A, ATP2C1, PAICS 55 1391 20581 2.152120776419842 0.9999950670933881 0.6932802495566929 0.6809002451003233
19 | UP_KEYWORDS NAD 3 5.454545454545454 0.07719857350347654 NNT, MTHFD2, HADH 55 175 20581 6.414857142857143 0.9999986253655966 0.7205200193657811 0.7076535904485349
20 | UP_KEYWORDS NADP 3 5.454545454545454 0.08568717461020739 NNT, TXNRD1, PYCR1 55 186 20581 6.035483870967742 0.9999997089762821 0.7356053928073722 0.7224695822215264
21 | UP_KEYWORDS Chromosome 4 7.2727272727272725 0.08757207057230622 BIRC5, KIF2C, NCAPD2, HIST2H2BE 55 400 20581 3.7419999999999995 0.9999997942405193 0.7356053928073722 0.7224695822215264
22 | UP_KEYWORDS Alternative splicing 34 61.81818181818181 0.09860418033718393 TOP2A, ZNF274, ALAS1, SLC35F2, FHL2, NOLC1, PSIP1, ATP2C1, PGRMC1, OXCT1, GPC1, STMN1, ANXA7, STX4, HADH, SLC37A4, MPZL1, LIG1, TXNRD1, USP22, TPM1, PYCR1, PAICS, GNPDA1, NPC1, P4HA2, MTHFD2, TCEA2, BIRC5, PSMG1, KIF2C, KIF20A, ARHGEF2, TRIB1 55 10587 20581 1.2017379805421742 0.9999999733430376 0.7888334426974715 0.7747471312207309
23 |
--------------------------------------------------------------------------------
/results/Figs_Source_Data.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/Figs_Source_Data.xlsx
--------------------------------------------------------------------------------
/results/MoAprediction/JI_cdrpbio.txt:
--------------------------------------------------------------------------------
1 | "CP" "GE" "Early Fusion" "RGCCA" "MCIA" "MOFA" "iCluster" "intNMF" "JIVE" "scikit-fusion"
2 | 0.278851699 0.21674894 0.259032794 0.31199107 0.244958406 0.245157412 0.166666667 0.273648354 0.26111079 0.202512456
3 | 0.281781202 0.217353574 0.272947479 0.299993438 0.237120438 0.24766634 0.166666667 0.284937711 0.271319067 0.204582565
4 | 0.2718746 0.219221298 0.288572557 0.300109206 0.245544432 0.239201534 0.176470588 0.284523564 0.264215061 0.20097943
5 | 0.274209543 0.214641334 0.266898688 0.298896298 0.248330213 0.247923298 0.166666667 0.292745201 0.256330169 0.192807023
6 | 0.265629351 0.217429769 0.279090052 0.307711408 0.239454065 0.239942443 1 0.285721498 0.26371381 0.199625524
7 | 0.275029708 0.2239464 0.28495489 0.301235572 0.2386484 0.254481673 0.153846154 0.282939774 0.264471233 0.1976511
8 | 0.288765182 0.217004773 0.282173851 0.306246664 0.246732016 0.254969989 0.4 0.294275281 0.263112267 0.200504611
9 | 0.278531041 0.223395083 0.275002364 0.293082233 0.24396531 0.242451334 0.333333333 0.291354281 0.258611036 0.209453967
10 | 0.272455248 0.231600555 0.262360363 0.295268069 0.241331141 0.255809239 0.5 0.296717155 0.261306837 0.207150644
11 | 0.275023829 0.225685487 0.271300656 0.295524507 0.246870483 0.238823953 0.16 0.274089946 0.265535744 0.20338892
12 | 0.28143867 0.224871523 0.260911678 0.303672023 0.236066059 0.239288014 0.171428571 0.294494354 0.261902235 0.209384306
13 | 0.282288852 0.223132721 0.282475537 0.290829442 0.244147491 0.256209037 0.2 0.282647138 0.259500099 0.19759714
14 | 0.287532178 0.222311311 0.26242836 0.295981315 0.243569808 0.243439708 0.157894737 0.275122015 0.260815605 0.202213235
15 | 0.301480301 0.215302881 0.270409742 0.300910495 0.239649912 0.247746601 0.1 0.28834381 0.251075084 0.204514447
16 | 0.277106418 0.217944084 0.281592592 0.309385645 0.246099862 0.247184454 0.083333333 0.286270538 0.25763249 0.207762257
17 | 0.260563304 0.229904461 0.266062905 0.301805175 0.238773842 0.237490211 0.117647059 0.285515992 0.252312053 0.20449294
--------------------------------------------------------------------------------
/results/MoAprediction/JI_lincs.txt:
--------------------------------------------------------------------------------
1 | "CP" "GE" "Early Fusion" "RGCCA" "MCIA" "MOFA" "iCluster" "intNMF" "JIVE" "scikit-fusion"
2 | 0.188998329 0.152892632 0.161685084 0.201019074 0.174152986 0.186524756 0.177777778 0.159674769 0.177569702 0.119184686
3 | 0.182455547 0.155460808 0.1663579 0.198310907 0.179608683 0.173331065 0.226415094 0.16362849 0.167412958 0.116937471
4 | 0.193899409 0.152630737 0.174071877 0.190282351 0.181639662 0.172031122 0.070588235 0.166882412 0.169969459 0.120875801
5 | 0.182039125 0.155030388 0.165013806 0.196283122 0.182571093 0.172776906 0.071005917 0.168034196 0.1757652 0.124514593
6 | 0.184550744 0.147509477 0.169153875 0.187533527 0.177913267 0.17130581 0.043478261 0.159508666 0.167613402 0.113748412
7 | 0.182496631 0.159455754 0.165133202 0.18379198 0.179867253 0.183528183 0.089430894 0.169158962 0.159833977 0.12420253
8 | 0.171946117 0.154463604 0.178991604 0.197523283 0.19030103 0.171250903 0.2 0.172767563 0.164891016 0.123406295
9 | 0.186694719 0.155486143 0.164093562 0.187112227 0.186310238 0.183258565 0.208333333 0.165169404 0.171638208 0.118322396
10 | 0.180338058 0.143601898 0.15861667 0.200142072 0.184137339 0.17660344 0.072164948 0.16547984 0.163073586 0.126149742
11 | 0.183605321 0.15741292 0.171381991 0.188784002 0.173252761 0.177570921 0.01369863 0.163314014 0.173093975 0.11873174
12 | 0.193222559 0.156995026 0.173813632 0.196182388 0.174394311 0.171367636 0.083333333 0.165245188 0.164822426 0.11878228
13 | 0.192671978 0.154919056 0.170624979 0.193073079 0.181786851 0.171575995 0.291139241 0.171691008 0.165878195 0.119526703
14 | 0.187967039 0.155867196 0.162878291 0.19683875 0.192637938 0.18045576 0.461538462 0.157649099 0.163766256 0.117880675
15 | 0.18544228 0.157804763 0.179560591 0.185849847 0.187052994 0.177218298 0.049382716 0.163521993 0.161072242 0.124717351
16 | 0.190641757 0.155245631 0.15904326 0.192819574 0.18380893 0.170782525 0.058823529 0.164331798 0.174827621 0.120163039
17 | 0.185672437 0.164612477 0.173447376 0.196032956 0.174881143 0.17674655 0.25 0.162468251 0.162594174 0.124472168
18 | 0.190136704 0.153092828 0.170318814 0.199112922 0.181481014 0.181176604 0.15625 0.169043792 0.158751472 0.118636228
19 | 0.184765443 0.157565332 0.173143963 0.198382252 0.179289922 0.173976218 0.45 0.172475028 0.164535448 0.11733305
20 | 0.192641148 0.154739976 0.171832001 0.197707877 0.179048991 0.182646244 0.138888889 0.169634439 0.170512163 0.120909347
21 | 0.183218664 0.156702842 0.17199648 0.189603926 0.180477667 0.179052585 0.057971014 0.171736775 0.152669412 0.116083564
22 | 0.187256874 0.147593792 0.170884185 0.198366585 0.180195479 0.175878718 0.058139535 0.169146685 0.159364501 0.118681457
23 | 0.187069918 0.149542804 0.16561384 0.187212941 0.183953164 0.182117759 0.046511628 0.166623749 0.170293955 0.120635555
24 | 0.183073028 0.153191771 0.156706199 0.198252086 0.178945469 0.183918728 0.01369863 0.164690939 0.168051618 0.12046373
25 | 0.188407875 0.158852641 0.155445963 0.199950627 0.178443793 0.179828374 0.079365079 0.167779642 0.166961766 0.119892686
26 | 0.175651634 0.171142449 0.158127203 0.191347613 0.195579897 0.178017005 0.2 0.170611727 0.166003424 0.11948773
27 | 0.17865041 0.155742474 0.18308929 0.187846662 0.185773521 0.183502339 0.06 0.167828001 0.164108882 0.120033978
28 | 0.194311319 0.158786831 0.175152329 0.188190067 0.18457021 0.17641168 0.0625 0.164302617 0.158739428 0.119227709
29 | 0.193254176 0.15960622 0.175409666 0.187384662 0.179096717 0.18390748 0.105263158 0.163219297 0.167798949 0.121511195
30 | 0.18617729 0.155678692 0.167753437 0.194289268 0.174803478 0.175536734 0.083333333 0.164659934 0.159521227 0.123949342
31 | 0.179601909 0.156356456 0.161356004 0.193791932 0.183172202 0.175497751 0.051724138 0.162390982 0.170494378 0.120299648
32 | 0.18963734 0.156625601 0.166336797 0.193811041 0.179803533 0.17315947 0.183333333 0.156586373 0.170302999 0.11827188
33 | 0.184352173 0.152024514 0.16540649 0.19131148 0.172935734 0.178541939 0.223529412 0.176281185 0.157959047 0.116664658
34 | 0.189699393 0.148682331 0.164026646 0.201137787 0.182479125 0.168811084 0.620689655 0.175702153 0.162975292 0.120901982
35 | 0.180999142 0.161272099 0.160666305 0.20022499 0.185258429 0.176733567 0.016666667 0.165612072 0.160725882 0.116746776
36 | 0.180157675 0.157840116 0.167903519 0.200828676 0.182595129 0.175124784 0.046875 0.170652949 0.16434307 0.121763081
37 | 0.175230255 0.155045636 0.166836702 0.198913695 0.177755714 0.181654699 0.051282051 0.161360514 0.162028228 0.116845951
38 | 0.184173371 0.155724161 0.16486378 0.186722298 0.172474892 0.176250337 0.058823529 0.167288187 0.161137409 0.118986834
39 | 0.187505371 0.162342476 0.170702512 0.18814401 0.180784286 0.175958655 0.043478261 0.168911293 0.158109144 0.11885624
40 | 0.184911138 0.152442416 0.170156448 0.194377939 0.174680795 0.184703302 0.111111111 0.1634296 0.157688838 0.117673344
41 | 0.200188868 0.167102791 0.175260347 0.207820825 0.180506812 0.181896395 0.452380952 0.177000251 0.159222812 0.118347626
42 | 0.185687782 0.145595456 0.166612367 0.194777853 0.180121812 0.18308245 0.06122449 0.163437656 0.167444436 0.116668697
43 | 0.184420505 0.164602208 0.162737406 0.195980252 0.179240673 0.182344867 0.117647059 0.170539941 0.166948171 0.124213738
44 | 0.189296239 0.15369093 0.175729401 0.197048519 0.181657677 0.172635318 0.060869565 0.166161194 0.160457602 0.122806357
45 | 0.185131847 0.153377751 0.157125437 0.198765646 0.176936046 0.177792184 0.1 0.159993478 0.166106644 0.123517094
46 | 0.191163865 0.153658909 0.173349824 0.196983778 0.175305658 0.184276093 0.063829787 0.165410291 0.164367546 0.122196122
47 | 0.182868061 0.152204389 0.164102408 0.193891247 0.179592803 0.171609376 0.051724138 0.162577 0.168739518 0.11732252
48 | 0.182594781 0.158816683 0.172826354 0.203075052 0.179938037 0.169869442 0.28 0.160315945 0.158795732 0.121053386
49 | 0.1857397 0.1627972 0.164690579 0.193372016 0.178605952 0.180772956 0.681818182 0.172442065 0.166283947 0.12059789
50 | 0.185656684 0.154339706 0.164199974 0.191689141 0.175449618 0.178893609 0.041666667 0.170628469 0.165995994 0.114831952
51 | 0.183158086 0.152878986 0.163597095 0.194775553 0.176895727 0.175380238 0.048 0.161713348 0.176740485 0.124128933
52 | 0.184734257 0.149737635 0.170376461 0.194929765 0.179171628 0.185745275 0.14 0.166058303 0.167017443 0.115490732
53 | 0.179232878 0.156775481 0.161159229 0.202649183 0.187129908 0.172128717 0.065420561 0.180383253 0.157901665 0.121413283
54 | 0.178903625 0.157511588 0.16991642 0.197078156 0.191806662 0.181379942 0.131578947 0.162768724 0.160685666 0.119430025
55 | 0.177152292 0.163682115 0.167529903 0.20160861 0.185839945 0.175390124 0.417721519 0.171189056 0.163199997 0.122366042
56 | 0.186724616 0.148527899 0.179844588 0.192022594 0.174337312 0.179788153 0.08 0.170737665 0.165948529 0.120087864
57 | 0.185444784 0.159524103 0.166411955 0.195164402 0.180475026 0.176058824 0.607142857 0.164282465 0.160104594 0.122952826
58 | 0.189335143 0.146887219 0.16845284 0.192139051 0.180729516 0.186868239 0.056603774 0.168942511 0.160584253 0.12571239
--------------------------------------------------------------------------------
/results/MoAprediction/pred_moa.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/MoAprediction/pred_moa.xlsx
--------------------------------------------------------------------------------
/results/MoAprediction/pred_moa_2.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/MoAprediction/pred_moa_2.xlsx
--------------------------------------------------------------------------------
/results/MoAprediction/pred_moa_CDRP.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/MoAprediction/pred_moa_CDRP.xlsx
--------------------------------------------------------------------------------
/results/MoAprediction/pred_moa_LINCS.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/MoAprediction/pred_moa_LINCS.xlsx
--------------------------------------------------------------------------------
/results/RepCor/RepCorrDF.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/RepCor/RepCorrDF.xlsx
--------------------------------------------------------------------------------
/results/SingleCPfeatPred/scores_corrected.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/SingleCPfeatPred/scores_corrected.xlsx
--------------------------------------------------------------------------------
/results/SingleGenePred/scores_corrected.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/SingleGenePred/scores_corrected.xlsx
--------------------------------------------------------------------------------
/results/SingleGenePred/scores_cross_dts_LU_LI.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/SingleGenePred/scores_cross_dts_LU_LI.xlsx
--------------------------------------------------------------------------------
/results/SingleGenePred_cpCategoryMap/CatMap-LINCS-25-lasso-ht.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/SingleGenePred_cpCategoryMap/CatMap-LINCS-25-lasso-ht.png
--------------------------------------------------------------------------------
/results/SingleGenePred_cpCategoryMap/CatMap-LUAD-9-MLP-keras-ht.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/SingleGenePred_cpCategoryMap/CatMap-LUAD-9-MLP-keras-ht.pdf
--------------------------------------------------------------------------------
/results/SingleGenePred_cpCategoryMap/CatMap-LUAD-9-MLP-keras-ht.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/SingleGenePred_cpCategoryMap/CatMap-LUAD-9-MLP-keras-ht.png
--------------------------------------------------------------------------------
/results/SingleGenePred_cpCategoryMap/CatMap-LUAD-9-lasso-ht.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/SingleGenePred_cpCategoryMap/CatMap-LUAD-9-lasso-ht.png
--------------------------------------------------------------------------------
/results/SingleGenePred_cpCategoryMap/cat_scores_maps.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/SingleGenePred_cpCategoryMap/cat_scores_maps.xlsx
--------------------------------------------------------------------------------
/utils/pred_models.py:
--------------------------------------------------------------------------------
1 | from sklearn.model_selection import (
2 | cross_val_score,
3 | cross_val_predict,
4 | GroupKFold,
5 | LeaveOneGroupOut,
6 | )
7 | from sklearn.model_selection import GridSearchCV
8 | from sklearn import metrics
9 | import numpy as np
10 | from sklearn import preprocessing
11 | from warnings import simplefilter
12 | from sklearn.neural_network import MLPRegressor
13 | from sklearn.exceptions import ConvergenceWarning
14 | from sklearn.model_selection import train_test_split
15 | from sklearn.svm import SVR
16 |
17 | # simplefilter("ignore", category=ConvergenceWarning)
18 | # from sklearn.exceptions import ConvergenceWarning
19 | # ConvergenceWarning('ignore')
20 |
21 |
22 | ########################## Lasso models
def lasso_cv(X, y, k, group_labels, rand_added_flag=True):
    """Score a fixed-alpha Lasso with grouped k-fold cross validation.

    Args:
        X: feature matrix, one row per sample (described in the original
            note as CP data shaped [perts/samples, features]).
        y: target value per sample (described in the original note as a
            landmark gene expression value). Must offer ``sample(frac=1)``
            (e.g. a pandas Series) when ``rand_added_flag`` is true.
        k: number of ``GroupKFold`` splits.
        group_labels: group id per sample, forwarded to the splitter.
        rand_added_flag: when true (the default, which is the historical
            behaviour of this function) the model is also scored against a
            shuffled copy of ``y``. When false that second run is skipped
            and ``0`` is returned in its place, mirroring the other
            cross-validation helpers in this module.

    Returns:
        (scores, scores_rand): per-fold ``cross_val_score`` results for the
        real target and for the shuffled target (or ``0``).
    """
    from sklearn import linear_model

    n_j = 3
    # build sklearn model
    clf = linear_model.Lasso(alpha=0.1, max_iter=10000)

    split_obj = GroupKFold(n_splits=k)

    # Perform k-fold cross validation
    scores = cross_val_score(clf, X, y, groups=group_labels, cv=split_obj, n_jobs=n_j)

    if not rand_added_flag:
        return scores, 0

    # Same cross validation on a shuffled target: y.sample(frac=1) only
    # permutes the values, giving a baseline with no X/y relationship.
    scores_rand = cross_val_score(
        clf, X, y.sample(frac=1), groups=group_labels, cv=split_obj, n_jobs=n_j
    )
    return scores, scores_rand
49 |
50 |
def lasso_cv_plus_model_selection(X0, y0, k, group_labels, rand_added_flag):
    """Grouped k-fold evaluation of a Lasso whose alpha is picked by LassoCV.

    For each ``GroupKFold`` split a ``LassoCV`` is fitted on the training rows
    (choosing alpha from a fixed grid spanning 0 to 0.5) and scored on the
    held-out rows.

    Args:
        X0: features, one row per sample; read through ``.values``.
        y0: target per sample; read through ``.values`` and, when
            ``rand_added_flag`` is true, shuffled with ``sample(frac=1)``.
        k: number of outer folds; also passed to ``LassoCV`` as ``n_jobs``.
        group_labels: group id per sample for the outer splitter.
        rand_added_flag: whether to also compute shuffled-target scores.

    Returns:
        (scores, scores_rand): an array with one held-out score per fold, and
        either the ``cross_val_score`` array of a fixed-alpha Lasso on the
        shuffled target or ``0`` when the flag is false.
    """
    from sklearn import linear_model

    # Denser grid on [0, 0.2], coarser on (0.2, 0.5]; the shared 0.2 end point
    # is dropped from the second piece so it is not listed twice.
    alpha_grid = np.concatenate(
        (np.linspace(0, 0.2, 20), np.linspace(0.2, 0.5, 10)[1:])
    )
    tuned_lasso = linear_model.LassoCV(
        alphas=alpha_grid, random_state=0, max_iter=1000, selection="random", n_jobs=k
    )
    folds = GroupKFold(n_splits=k)

    features, target = X0.values, y0.values

    fold_scores = []
    for fit_rows, held_out_rows in folds.split(features, target, group_labels):
        tuned_lasso.fit(features[fit_rows], target[fit_rows])
        fold_scores.append(
            tuned_lasso.score(features[held_out_rows], target[held_out_rows])
        )

    if not rand_added_flag:
        return np.array(fold_scores), 0

    # Baseline: fixed-alpha Lasso cross-validated on a shuffled target
    # (y0.sample(frac=1) only permutes the values).
    baseline = linear_model.Lasso(alpha=0.1, max_iter=1000)
    shuffled_scores = cross_val_score(
        baseline, X0, y0.sample(frac=1), groups=group_labels, cv=folds, n_jobs=3
    )
    return np.array(fold_scores), shuffled_scores
101 |
102 |
def ridge_cv_plus_model_selection(X0, y0, k, group_labels, rand_added_flag):
    """Grouped k-fold evaluation of a Ridge whose alpha is picked by RidgeCV.

    For each ``GroupKFold`` split a ``RidgeCV`` is fitted on the training rows
    (choosing alpha from a fixed grid spanning 0.1 to 0.5) and scored on the
    held-out rows.

    Args:
        X0: features, one row per sample; read through ``.values``.
        y0: target per sample; read through ``.values`` and, when
            ``rand_added_flag`` is true, shuffled with ``sample(frac=1)``.
        k: number of outer folds.
        group_labels: group id per sample for the outer splitter.
        rand_added_flag: whether to also compute shuffled-target scores.

    Returns:
        (scores, scores_rand): an array with one held-out score per fold, and
        either the ``cross_val_score`` array of a fixed-alpha Ridge on the
        shuffled target or ``0`` when the flag is false.
    """
    from sklearn import linear_model

    # Two linear pieces; the duplicated 0.2 end point is dropped from the
    # second one.
    alpha_grid = np.concatenate(
        (np.linspace(0.1, 0.2, 10), np.linspace(0.2, 0.5, 10)[1:])
    )
    tuned_ridge = linear_model.RidgeCV(alpha_grid)
    folds = GroupKFold(n_splits=k)

    features, target = X0.values, y0.values

    fold_scores = []
    for fit_rows, held_out_rows in folds.split(features, target, group_labels):
        tuned_ridge.fit(features[fit_rows], target[fit_rows])
        fold_scores.append(
            tuned_ridge.score(features[held_out_rows], target[held_out_rows])
        )

    if not rand_added_flag:
        return np.array(fold_scores), 0

    # Baseline: fixed-alpha Ridge cross-validated on a shuffled target
    # (y0.sample(frac=1) only permutes the values).
    baseline = linear_model.Ridge(alpha=0.1, max_iter=10000)
    shuffled_scores = cross_val_score(
        baseline, X0, y0.sample(frac=1), groups=group_labels, cv=folds, n_jobs=3
    )
    return np.array(fold_scores), shuffled_scores
154 |
155 |
156 | ########################## MLP
157 | # def MLP_cv(X,y,k,group_labels):
158 | # from sklearn.neural_network import MLPRegressor
159 |
160 | # n_j=-1
161 | # # hidden_layer_sizes=100,
162 | # # hidden_layer_sizes = (50, 20, 10)
163 | # regr = MLPRegressor(random_state=1,hidden_layer_sizes = (100), max_iter=10000,activation='tanh',early_stopping=True)
164 |
165 | # split_obj=GroupKFold(n_splits=k)
166 | # # Perform k-fold cross validation
167 | # scores = cross_val_score(regr, X, y, groups=group_labels,cv=split_obj,n_jobs=n_j)
168 |
169 | # # Perform k-fold cross validation on the shuffled vector of lm GE across samples
170 | # # y.sample(frac = 1) this just shuffles the vector
171 | # scores_rand = cross_val_score(regr, X, y.sample(frac = 1) ,groups=group_labels,cv=split_obj,n_jobs=n_j)
172 | # return scores, scores_rand
173 | # X is train samples and y is the corresponding labels
174 |
175 |
def MLP_cv(X, y, k, group_labels, rand_added_flag):
    """Score a fixed-architecture MLP regressor with grouped k-fold CV.

    Args:
        X: features, one row per sample.
        y: target per sample; must offer ``sample(frac=1)`` (e.g. a pandas
            Series) when ``rand_added_flag`` is true.
        k: number of ``GroupKFold`` splits.
        group_labels: group id per sample, forwarded to the splitter.
        rand_added_flag: whether to also score against a shuffled target.

    Returns:
        (scores, scores_rand): per-fold ``cross_val_score`` results for the
        real target, and for the shuffled target or ``0`` when the flag is
        false.
    """
    from sklearn.neural_network import MLPRegressor

    network = MLPRegressor(
        hidden_layer_sizes=(50, 10),
        activation="logistic",
        alpha=0.01,
        early_stopping=True,
    )
    # Shared by the real and the shuffled run so both use identical splits.
    cv_options = dict(groups=group_labels, cv=GroupKFold(n_splits=k), n_jobs=-1)

    scores = cross_val_score(network, X, y, **cv_options)

    # y.sample(frac=1) just permutes the target vector.
    scores_rand = (
        cross_val_score(network, X, y.sample(frac=1), **cv_options)
        if rand_added_flag
        else 0
    )
    return scores, scores_rand
204 |
205 |
def MLP_cv_plus_model_selection(X0, y0, k, group_labels, rand_added_flag):
    """Grouped k-fold evaluation of an MLP tuned by an inner grid search.

    For every outer ``GroupKFold`` split, a ``GridSearchCV`` (``cv=4``,
    ``n_jobs=k``, no group labels passed to the inner search) is fitted on the
    training rows and its best estimator is scored on the held-out rows. The
    selected parameters are printed for each fold.

    Args:
        X0: features, one row per sample. Converted with ``np.asarray`` so
            that NumPy arrays and pandas DataFrames can both be row-indexed
            by the fold indices (a DataFrame used to be indexed by column).
        y0: target per sample. Must offer ``sample(frac=1)`` (e.g. a pandas
            Series) when ``rand_added_flag`` is true.
        k: number of outer folds; also used as ``n_jobs`` of the grid search.
        group_labels: group id per sample for the outer splitter.
        rand_added_flag: whether to also compute shuffled-target scores.

    Returns:
        (scores, scores_rand): ``scores`` is a list with one held-out score
        per outer fold; ``scores_rand`` is the ``cross_val_score`` array of
        the untuned base MLP on a shuffled target, or ``0`` when the flag is
        false.
    """
    mlp_gs = MLPRegressor(
        random_state=0,
        early_stopping=True,
        n_iter_no_change=4,
        learning_rate="adaptive",
    )

    split_obj = GroupKFold(n_splits=k)

    parameter_space = {
        "max_iter": [10, 100, 300, 500],
        "hidden_layer_sizes": [
            (32, 64),
            (64, 32),
            (50, 10),
            (50, 10, 10),
            (20, 10),
            (),
        ],
        "activation": ["logistic", "tanh"],
        "alpha": [0.0005, 0.01, 0.3, 1, 2, 3, 4, 5, 6, 7],
    }

    clf = GridSearchCV(mlp_gs, parameter_space, n_jobs=k, cv=4)

    # Positional row indexing below needs plain arrays; asarray is a no-op
    # for inputs that already are ndarrays.
    X, y = np.asarray(X0), np.asarray(y0)

    scores = []
    for train_index, test_index in split_obj.split(X, y, group_labels):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf.fit(X_train, y_train)
        scores.append(clf.best_estimator_.score(X_test, y_test))
        print(clf.best_params_)

    if not rand_added_flag:
        return scores, 0

    # Baseline: the untuned base MLP cross-validated on a shuffled target
    # (y0.sample(frac=1) only permutes the values).
    scores_rand = cross_val_score(
        mlp_gs, X, y0.sample(frac=1), groups=group_labels, cv=split_obj, n_jobs=-1
    )
    return scores, scores_rand
274 |
275 |
def MLP_cv_plus_model_selection_keras(X0, y0, k, group_labels, rand_added_flag):
    """Grouped k-fold evaluation of a small fixed Keras network.

    Despite the name, no hyper-parameter search happens here: one network
    (Dense 16 -> Flatten -> Dropout 0.6 -> Dense 64 -> Dropout 0.2 -> Dense 1,
    MSE loss, Adam) is built once and, for every ``GroupKFold`` split, reset
    to its initial weights, trained with early stopping on a 10% validation
    split of the training rows, and scored with ``r2_score`` on the held-out
    rows.

    Args:
        X0: features, one row per sample. Converted with ``np.asarray`` so
            that pandas DataFrames are accepted as well as NumPy arrays.
        y0: target per sample (array-like).
        k: number of ``GroupKFold`` splits.
        group_labels: group id per sample for the splitter.
        rand_added_flag: accepted for signature parity with the other
            helpers; it is not used by this function.

    Returns:
        (scores, 0): a list with one R^2 value per fold, and a constant ``0``
        in the position where the other helpers return shuffled-target scores.
    """
    from keras.callbacks import EarlyStopping
    from keras.layers import Dense, Dropout, Flatten
    from keras.models import Sequential
    from sklearn.metrics import r2_score

    features = np.asarray(X0)
    n_features = features.shape[1]
    # Add a trailing axis of length 1: (samples, features, 1).
    X = features.reshape(features.shape[0], n_features, 1)
    y = np.asarray(y0)

    model = Sequential()
    model.add(Dense(16, activation="relu", input_shape=(n_features, 1)))
    model.add(Flatten())
    model.add(Dropout(0.6))
    model.add(Dense(64, activation="relu"))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    model.compile(loss="mse", optimizer="adam")

    es = EarlyStopping(monitor="val_loss", mode="min", verbose=0, patience=10)

    # Snapshot of the untrained weights, restored before every fold so that
    # folds do not inherit what earlier folds learned.
    # NOTE(review): only layer weights are restored; whether optimizer state
    # carried across folds matters here should be confirmed.
    Wsave = model.get_weights()

    split_obj = GroupKFold(n_splits=k)

    scores = []
    for train_index, test_index in split_obj.split(X, y, group_labels):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Unseeded 90/10 split of the training rows for early stopping.
        XTraining, XValidation, YTraining, YValidation = train_test_split(
            X_train, y_train, test_size=0.1
        )

        model.set_weights(Wsave)
        model.fit(
            XTraining,
            YTraining,
            batch_size=XTraining.shape[0],  # full-batch updates
            epochs=1000,
            validation_data=(XValidation, YValidation),
            callbacks=[es],
            verbose=0,
        )
        # verbose=0 keeps prediction as quiet as the training call above.
        ypred = model.predict(X_test, verbose=0)

        scores.append(r2_score(y_test, ypred))

    return scores, 0
328 |
329 |
def SVR_cv_plus_model_selection(X0, y0, k, group_labels, rand_added_flag):
    """Grouped k-fold evaluation of an SVR tuned by an inner grid search.

    For every outer ``GroupKFold`` split, a ``GridSearchCV`` (``cv=4``,
    ``n_jobs=k``, no group labels passed to the inner search) over kernel,
    ``C``, ``degree``, ``coef0`` and ``gamma`` is fitted on the training rows
    and its best estimator is scored on the held-out rows. The selected
    parameters are printed for each fold.

    Args:
        X0: features, one row per sample. Converted with ``np.asarray`` so
            that NumPy arrays and pandas DataFrames can both be row-indexed
            by the fold indices (a DataFrame used to be indexed by column).
        y0: target per sample. Must offer ``sample(frac=1)`` (e.g. a pandas
            Series) when ``rand_added_flag`` is true.
        k: number of outer folds; also used as ``n_jobs`` of the grid search.
        group_labels: group id per sample for the outer splitter.
        rand_added_flag: whether to also compute shuffled-target scores.

    Returns:
        (scores, scores_rand): ``scores`` is a list with one held-out score
        per outer fold; ``scores_rand`` is the ``cross_val_score`` array of
        the untuned base SVR on a shuffled target, or ``0`` when the flag is
        false.
    """
    svr_gs = SVR(epsilon=0.2)

    split_obj = GroupKFold(n_splits=k)

    parameter_space = {
        "kernel": ("poly", "rbf", "sigmoid"),
        "C": [1, 2, 3, 5, 20, 100, 500, 1000],
        "degree": [1, 2, 3, 4],
        "coef0": [0.01, 0.5, 1, 10],
        "gamma": ("auto", "scale"),
    }

    clf = GridSearchCV(svr_gs, parameter_space, n_jobs=k, cv=4)

    # Positional row indexing below needs plain arrays; asarray is a no-op
    # for inputs that already are ndarrays.
    X, y = np.asarray(X0), np.asarray(y0)

    scores = []
    for train_index, test_index in split_obj.split(X, y, group_labels):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf.fit(X_train, y_train)
        scores.append(clf.best_estimator_.score(X_test, y_test))
        print(clf.best_params_)

    if not rand_added_flag:
        return scores, 0

    # Baseline: the untuned base SVR cross-validated on a shuffled target
    # (y0.sample(frac=1) only permutes the values).
    scores_rand = cross_val_score(
        svr_gs, X, y0.sample(frac=1), groups=group_labels, cv=split_obj, n_jobs=-1
    )
    return scores, scores_rand
387 |
388 |
def MLP_cv_plus_model_selection_rand_test(X0, y0, k, group_labels, rand_added_flag):
    """Grouped k-fold evaluation of a grid-searched MLP (smaller alpha grid).

    Same procedure as the other grid-searched MLP helper in this module, with
    ``n_iter_no_change=20`` on the base estimator and a shorter ``alpha``
    list: for every outer ``GroupKFold`` split a ``GridSearchCV`` (``cv=4``,
    ``n_jobs=k``) is fitted on the training rows and its best estimator is
    scored on the held-out rows. The selected parameters are printed per fold.

    Args:
        X0: features, one row per sample. Converted with ``np.asarray`` so
            that NumPy arrays and pandas DataFrames can both be row-indexed
            by the fold indices (a DataFrame used to be indexed by column).
        y0: target per sample. Must offer ``sample(frac=1)`` (e.g. a pandas
            Series) when ``rand_added_flag`` is true.
        k: number of outer folds; also used as ``n_jobs`` of the grid search.
        group_labels: group id per sample for the outer splitter.
        rand_added_flag: whether to also compute shuffled-target scores.

    Returns:
        (scores, scores_rand): ``scores`` is a list with one held-out score
        per outer fold; ``scores_rand`` is the ``cross_val_score`` array of
        the untuned base MLP on a shuffled target, or ``0`` when the flag is
        false.
    """
    mlp_gs = MLPRegressor(random_state=0, early_stopping=True, n_iter_no_change=20)

    split_obj = GroupKFold(n_splits=k)

    parameter_space = {
        "max_iter": [10, 100, 300, 500],
        "hidden_layer_sizes": [
            (32, 64),
            (64, 32),
            (50, 10),
            (50, 10, 10),
            (20, 10),
        ],
        "activation": ["logistic", "tanh"],
        "alpha": [0.0005, 0.01, 0.3, 1, 2],
    }

    clf = GridSearchCV(mlp_gs, parameter_space, n_jobs=k, cv=4)

    # Positional row indexing below needs plain arrays; asarray is a no-op
    # for inputs that already are ndarrays.
    X, y = np.asarray(X0), np.asarray(y0)

    scores = []
    for train_index, test_index in split_obj.split(X, y, group_labels):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf.fit(X_train, y_train)
        scores.append(clf.best_estimator_.score(X_test, y_test))
        print(clf.best_params_)

    if not rand_added_flag:
        return scores, 0

    # Baseline: the untuned base MLP cross-validated on a shuffled target
    # (y0.sample(frac=1) only permutes the values).
    scores_rand = cross_val_score(
        mlp_gs, X, y0.sample(frac=1), groups=group_labels, cv=split_obj, n_jobs=-1
    )
    return scores, scores_rand
451 |
452 |
def MLP_cv_plus_model_selection_taorf(X0, y0, k, group_labels, rand_added_flag):
    """Grouped k-fold evaluation of a grid-searched MLP (``cv=2`` inner search).

    For every outer ``GroupKFold`` split, a ``GridSearchCV`` (``cv=2``,
    ``n_jobs=6``) over layer sizes, activation and ``alpha`` is fitted on the
    training rows and scored on the held-out rows through the refitted best
    estimator. The selected parameters are printed for each fold.

    The grid previously also listed ``learning_rate`` ("constant",
    "adaptive"). The base estimator keeps the default solver, for which
    scikit-learn documents ``learning_rate`` as unused, so that dimension only
    doubled the number of fits; it is omitted here and the default
    ("constant") applies. The printed parameter dict therefore no longer
    carries a ``learning_rate`` entry.

    Args:
        X0: features, one row per sample (array-like or pandas DataFrame).
        y0: target per sample. Must offer ``sample(frac=1)`` (e.g. a pandas
            Series) when ``rand_added_flag`` is true.
        k: number of outer folds.
        group_labels: group id per sample for the outer splitter.
        rand_added_flag: whether to also compute shuffled-target scores.

    Returns:
        (scores, scores_rand): ``scores`` is a list with one held-out score
        per outer fold; ``scores_rand`` is the ``cross_val_score`` array of
        the untuned base MLP on a shuffled target, or ``0`` when the flag is
        false.
    """
    mlp_gs = MLPRegressor(random_state=0, max_iter=1000)

    split_obj = GroupKFold(n_splits=k)

    parameter_space = {
        "hidden_layer_sizes": [(50,), (10, 30, 10), (50, 10), (50, 10, 10)],
        "activation": ["tanh", "relu", "logistic"],
        "alpha": [0.0001, 0.05, 0.01, 0.2, 0.5, 0.7],
    }

    clf = GridSearchCV(mlp_gs, parameter_space, n_jobs=6, cv=2)

    # Plain arrays for positional row indexing; equivalent to ``.values`` for
    # pandas inputs and a no-op for ndarrays.
    X, y = np.asarray(X0), np.asarray(y0)

    scores = []
    for train_index, test_index in split_obj.split(X, y, group_labels):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf.fit(X_train, y_train)
        scores.append(clf.score(X_test, y_test))
        print(clf.best_params_)

    if not rand_added_flag:
        return scores, 0

    # Baseline: the untuned base MLP cross-validated on a shuffled target
    # (y0.sample(frac=1) only permutes the values).
    scores_rand = cross_val_score(
        mlp_gs, X, y0.sample(frac=1), groups=group_labels, cv=split_obj, n_jobs=-1
    )
    return scores, scores_rand
518 |
519 |
520 | # from sklearn.model_selection import RandomizedSearchCV
521 | # # Number of trees in random forest
522 | # n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
523 | # # Number of features to consider at every split
524 | # max_features = ['auto', 'sqrt']
525 | # # Maximum number of levels in tree
526 | # max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
527 | # max_depth.append(None)
528 | # # Minimum number of samples required to split a node
529 | # min_samples_split = [2, 5, 10]
530 | # # Minimum number of samples required at each leaf node
531 | # min_samples_leaf = [1, 2, 4]
532 | # # Method of selecting samples for training each tree
533 | # bootstrap = [True, False]
534 | # # Create the random grid
535 | # random_grid = {'n_estimators': n_estimators,
536 | # 'max_features': max_features,
537 | # 'max_depth': max_depth,
538 | # 'min_samples_split': min_samples_split,
539 | # 'min_samples_leaf': min_samples_leaf,
540 | # 'bootstrap': bootstrap}
541 | # pprint(random_grid)
542 |
543 |
544 | ########################## Random Forest
def RFR_cv_plus_model_selection(X0, y0, k, group_labels, rand_added_flag):
    """
    Grouped k-fold evaluation of a random forest regressor with a grid search run
    inside each fold

    Inputs:
        X0: dataframe of predictors [perts/samples, features]
        y0: pandas series with the target value of each sample
        k: number of GroupKFold splits
        group_labels: group of each sample; GroupKFold keeps a group on one side of a split
        rand_added_flag: accepted for signature parity with the other *_cv_plus_model_selection
                         functions; it is not consulted here, the shuffled baseline is
                         always computed

    Output:
        scores: list holding, for each fold, the score of the fitted grid search on the
                held-out samples
        scores_rand: cross_val_score results of the untuned forest on a shuffled y0
    """
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import GridSearchCV

    n_j = -1

    parameter_space = {
        "max_depth": [10, 20, None],
        "min_samples_leaf": [1, 4],
        "min_samples_split": [2, 5, 10],
    }

    # max_features=1.0 uses every feature at each split, which is what "auto" meant
    # for regressors; recent scikit-learn releases no longer accept "auto".
    rfr_gs = RandomForestRegressor(bootstrap=True, max_features=1.0)

    split_obj = GroupKFold(n_splits=k)

    clf = GridSearchCV(rfr_gs, parameter_space, n_jobs=n_j, cv=2)

    X, y = X0.values, y0.values

    scores = []
    for train_index, test_index in split_obj.split(X, y, group_labels):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Tune and fit on the training part only; fitting on the full X, y would let
        # the held-out samples leak into the model that is scored on them.
        clf.fit(X_train, y_train)
        scores.append(clf.score(X_test, y_test))
        print(clf.best_params_)

    # Perform k-fold cross validation on the shuffled vector of lm GE across samples
    # y0.sample(frac=1) just shuffles the vector
    scores_rand = cross_val_score(
        rfr_gs, X0, y0.sample(frac=1), groups=group_labels, cv=split_obj, n_jobs=n_j
    )
    return scores, scores_rand
594 |
595 |
596 | ############################## Feature Ranking #########################
def linear_model_feature_ranking(X0, y0, k, group_labels, l1k_features_gn):
    """
    Score every CP feature against one target with two methods fitted on all samples:
    the coefficients of an ordinary least squares model and mutual information.

    Inputs:
        X0: CP data, dataframe [perts/samples, features]
        y0: lm gene expression value, pandas series [perts/samples]
        k, group_labels, l1k_features_gn: kept for interface compatibility; no cross
            validation is run here and these arguments are not used

    Returns:
        coefficients of the fitted LinearRegression,
        mutual_info_regression scores computed by SelectKBest (k="all")
    """
    from sklearn import linear_model
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import mutual_info_regression

    X, y = X0.values, y0.values

    # k="all" keeps every feature: SelectKBest is only used to obtain the scores
    fs = SelectKBest(score_func=mutual_info_regression, k="all")
    fs.fit(X, y)

    clf = linear_model.LinearRegression()
    clf.fit(X, y)
    return clf.coef_, fs.scores_
634 |
635 |
636 | # return ranking(np.abs(lasso_cv.coef_), l1k_features_gn)
637 |
638 |
ranks = {}


# Min-max scale the given scores (multiplied by `order`), round them to two decimals
# and return them as a {name: scaled score} dictionary
def ranking(ranks, names, order=1):
    score_column = order * np.array([ranks]).T
    scaled_scores = preprocessing.MinMaxScaler().fit_transform(score_column).T[0]
    rounded_scores = [round(score, 2) for score in scaled_scores]
    return dict(zip(names, rounded_scores))
648 |
--------------------------------------------------------------------------------
/utils/readProfiles.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import scipy.spatial
3 | import pandas as pd
4 | import sklearn.decomposition
5 | from sklearn import preprocessing
6 | from sklearn.metrics import pairwise_distances
7 |
8 | # from utils.normalize_funcs import standardize_per_catX
9 | # from normalize_funcs import standardize_per_catX
10 |
# dataset name -> [folder name of the dataset under "preprocessed_data",
#                  [CP perturbation column name, L1000 perturbation column name]]
# The two column names are the ones renamed to the shared label column when the
# profiles of both modalities are matched.
ds_info_dict = {
    "CDRP": ["CDRP-BBBC047-Bray", ["Metadata_Sample_Dose", "pert_sample_dose"]],
    "CDRP-bio": ["CDRPBIO-BBBC036-Bray", ["Metadata_Sample_Dose", "pert_sample_dose"]],
    "TAORF": [
        "TA-ORF-BBBC037-Rohban",
        [
            "Metadata_broad_sample",
            "pert_id",
        ],
    ],
    "LUAD": ["LUAD-BBBC041-Caicedo", ["x_mutation_status", "allele"]],
    "LINCS": ["LINCS-Pilot1", ["Metadata_pert_id_dose", "pert_id_dose"]],
}

# shared name given to the perturbation column of both modalities
labelCol = "PERT"
27 |
28 |
29 | ################################################################################
def _columns_with_many_nulls(data, columns, max_null_ratio):
    """Return the entries of `columns` (in order) whose fraction of null rows in `data` exceeds `max_null_ratio`."""
    null_ratio = data[columns].isnull().sum(axis=0) / data.shape[0]
    return null_ratio[null_ratio > max_null_ratio].index.tolist()


def read_replicate_level_profiles(
    dataset_rootDir, dataset, profileType, per_plate_normalized_flag
):
    """
    Reads replicate level CSV files in the form of a dataframe
    Extracts measurement column names for each modality
    Replaces +/-inf values by NaN
    Removes Cell Painting columns with too many nulls (ratio > null_vals_ratio) or
    with low variation (std < thrsh_std), then interpolates the remaining gaps
    Optionally standardizes both modalities per plate

    Inputs:
        dataset_rootDir: datasets root dir
        dataset: any from the available list of ['LUAD', 'TAORF', 'LINCS', 'CDRP-bio', 'CDRP']
        profileType: Cell Painting profile type that can be 'augmented' , 'normalized', 'normalized_variable_selected'
        per_plate_normalized_flag: if True it will standardize data per plate

    Output:
        [cp_data_repLevel, cp_features], [l1k_data_repLevel, l1k_features]:
        dataframes with all the annotations available in the raw data, each paired with
        the list of its measurement column names. Feature names keep the column order
        of the input files.
    """

    dataDir = dataset_rootDir + "/preprocessed_data/" + ds_info_dict[dataset][0] + "/"

    cp_data_repLevel = pd.read_csv(
        dataDir + "/CellPainting/replicate_level_cp_" + profileType + ".csv.gz"
    )
    l1k_data_repLevel = pd.read_csv(dataDir + "/L1000/replicate_level_l1k.csv.gz")

    cp_features, l1k_features = extract_feature_names(
        cp_data_repLevel, l1k_data_repLevel
    )

    ########## turn inf values into nan so they are handled like missing values
    l1k_data_repLevel = l1k_data_repLevel.replace([np.inf, -np.inf], np.nan)
    cp_data_repLevel = cp_data_repLevel.replace([np.inf, -np.inf], np.nan)

    null_vals_ratio = 0.05
    thrsh_std = 0.0001

    cols2remove_manyNulls = _columns_with_many_nulls(
        cp_data_repLevel, cp_features, null_vals_ratio
    )
    cp_std = cp_data_repLevel[cp_features].std()
    cols2remove_lowVars = cp_std[cp_std < thrsh_std].index.tolist()

    # Filter with a set for speed, but rebuild the lists from cp_features so the
    # feature order is the same on every run (a plain set difference is not ordered).
    removed = set(cols2remove_manyNulls) | set(cols2remove_lowVars)
    cols2removeCP = [f for f in cp_features if f in removed]
    cp_features = [f for f in cp_features if f not in removed]

    cp_data_repLevel = cp_data_repLevel.drop(cols2removeCP, axis=1)
    cp_data_repLevel[cp_features] = cp_data_repLevel[cp_features].interpolate()

    l1k_data_repLevel[l1k_features] = l1k_data_repLevel[l1k_features].interpolate()

    ################ Per plate scaling
    if per_plate_normalized_flag:
        cp_data_repLevel = standardize_per_catX(
            cp_data_repLevel, "Metadata_Plate", cp_features
        )
        l1k_data_repLevel = standardize_per_catX(
            l1k_data_repLevel, "det_plate", l1k_features
        )

        # scaling can create new nulls (e.g. a feature that is constant within a
        # plate), so the null filter is applied to the CP features once more
        removed = set(_columns_with_many_nulls(cp_data_repLevel, cp_features, 0.05))
        cols2removeCP = [f for f in cp_features if f in removed]
        cp_features = [f for f in cp_features if f not in removed]

        cp_data_repLevel = cp_data_repLevel.drop(cols2removeCP, axis=1)
        cp_data_repLevel[cp_features] = cp_data_repLevel[cp_features].interpolate()

    return [cp_data_repLevel, cp_features], [l1k_data_repLevel, l1k_features]
119 |
120 |
121 | ################################################################################
def extract_feature_names(cp_data_repLevel, l1k_data_repLevel):
    """
    Pick the measurement columns of each modality out of all column names

    A Cell Painting column counts as a measurement when its name contains
    'Cells_', 'Cytoplasm_' or 'Nuclei_'; an L1000 column when its name contains '_at'.

    Inputs:
        cp_data_repLevel, l1k_data_repLevel: dataframes with all the annotations available in the raw data

    Outputs: list of feature names for each modality (cp first, then l1k)

    """
    cp_columns = cp_data_repLevel.columns
    l1k_columns = l1k_data_repLevel.columns

    is_cp_measurement = cp_columns.str.contains("Cells_|Cytoplasm_|Nuclei_")
    is_l1k_measurement = l1k_columns.str.contains("_at")

    cp_features = cp_columns[is_cp_measurement].tolist()
    l1k_features = l1k_columns[is_l1k_measurement].tolist()
    return cp_features, l1k_features
141 |
142 |
143 | ################################################################################
def extract_metadata_column_names(cp_data, l1k_data):
    """
    Pick the metadata columns of each modality, i.e. every column that is not a measurement

    A Cell Painting column is metadata when its name contains none of
    'Cells_', 'Cytoplasm_', 'Nuclei_'; an L1000 column when its name does not contain '_at'.

    Inputs:
        cp_data, l1k_data: dataframes (any level of data) with annotations and measurements

    Outputs: list of metadata column names for each modality (cp first, then l1k)

    """
    cp_columns = cp_data.columns
    l1k_columns = l1k_data.columns

    is_cp_measurement = cp_columns.str.contains("Cells_|Cytoplasm_|Nuclei_")
    is_l1k_measurement = l1k_columns.str.contains("_at")

    cp_meta_col_names = cp_columns[~is_cp_measurement].tolist()
    l1k_meta_col_names = l1k_columns[~is_l1k_measurement].tolist()
    return cp_meta_col_names, l1k_meta_col_names
162 |
163 |
164 | ################################################################################
def read_treatment_level_profiles(
    dataset_rootDir,
    dataset,
    profileType,
    filter_repCorr_params,
    per_plate_normalized_flag,
):
    """
    Builds the treatment level profiles of both modalities for one dataset

    Steps:
        - read the replicate level profiles with read_replicate_level_profiles
        - rename the dataset specific perturbation column of each modality to PERT
        - optionally keep only the perturbations returned by highRepFinder (plus 'negcon')
        - average the feature columns over the replicates of each perturbation
        - merge each treatment level profile with the metadata columns kept for the dataset

    Inputs:
        dataset_rootDir: datasets root dir
        dataset: any from the available list of ['LUAD', 'TAORF', 'LINCS', 'CDRP-bio', 'CDRP']
        profileType: Cell Painting profile type that can be 'augmented' , 'normalized', 'normalized_variable_selected'
        filter_repCorr_params: [filter_perts, repCorrFilePath]
            filter_perts='highRepOverlap' filters with the 'intersection' mode of highRepFinder,
            filter_perts='highRepUnion' with its 'union' mode, any other value skips the filter;
            repCorrFilePath is the replicate correlation file passed to highRepFinder
        per_plate_normalized_flag: passed to read_replicate_level_profiles (per plate scaling)

    Output:
        [cp_data_treatLevel,cp_features], [l1k_data_treatLevel,l1k_features]
        each is a list of dataframe and feature names for each of modalities
    """

    pert_filter = filter_repCorr_params[0]
    rep_corr_file = filter_repCorr_params[1]

    cp_bundle, l1k_bundle = read_replicate_level_profiles(
        dataset_rootDir, dataset, profileType, per_plate_normalized_flag
    )
    cp_reps, cp_features = cp_bundle
    l1k_reps, l1k_features = l1k_bundle

    ############ rename columns that should match to PERT
    labelCol = "PERT"
    cp_reps = cp_reps.rename(columns={ds_info_dict[dataset][1][0]: labelCol})
    l1k_reps = l1k_reps.rename(columns={ds_info_dict[dataset][1][1]: labelCol})

    ###### print some data statistics
    print(
        dataset + ": Replicate Level Shapes (nSamples x nFeatures): cp: ",
        cp_reps.shape[0],
        ",",
        len(cp_features),
        ", l1k: ",
        l1k_reps.shape[0],
        ",",
        len(l1k_features),
    )

    print("l1k n of rep: ", l1k_reps.groupby([labelCol]).size().median())
    print("cp n of rep: ", cp_reps.groupby([labelCol]).size().median())

    ###### remove perts with low rep corr
    finder_mode = None
    if pert_filter == "highRepOverlap":
        finder_mode = "intersection"
    elif pert_filter == "highRepUnion":
        finder_mode = "union"

    if finder_mode is not None:
        # the negative control label is always kept
        kept_perts = highRepFinder(dataset, finder_mode, rep_corr_file) + ["negcon"]
        cp_reps = cp_reps[cp_reps["PERT"].isin(kept_perts)].reset_index()
        l1k_reps = l1k_reps[l1k_reps["PERT"].isin(kept_perts)].reset_index()

    ####### form treatment level profiles
    l1k_data_treatLevel = l1k_reps.groupby(labelCol)[l1k_features].mean().reset_index()
    cp_data_treatLevel = cp_reps.groupby(labelCol)[cp_features].mean().reset_index()

    ###### define metadata and merge treatment level profiles
    # dataset:[[cp_columns],[l1k_columns]]
    meta_dict = {
        "CDRP": [["Metadata_moa", "Metadata_target"], []],
        "CDRP-bio": [["Metadata_moa", "Metadata_target"], []],
        "TAORF": [[], []],
        "LUAD": [[], []],
        "LINCS": [["Metadata_moa", "Metadata_alternative_moa"], ["moa"]],
    }
    cp_meta_cols, l1k_meta_cols = meta_dict[dataset][0], meta_dict[dataset][1]

    meta_cp = cp_reps[[labelCol] + cp_meta_cols].drop_duplicates()
    meta_cp = meta_cp.reset_index(drop=True)
    meta_l1k = l1k_reps[[labelCol] + l1k_meta_cols].drop_duplicates()
    meta_l1k = meta_l1k.reset_index(drop=True)

    cp_data_treatLevel = pd.merge(
        cp_data_treatLevel, meta_cp, how="inner", on=[labelCol]
    )
    l1k_data_treatLevel = pd.merge(
        l1k_data_treatLevel, meta_l1k, how="inner", on=[labelCol]
    )

    return [cp_data_treatLevel, cp_features], [l1k_data_treatLevel, l1k_features]
291 |
292 |
293 | ################################################################################
def read_paired_treatment_level_profiles(
    dataset_rootDir,
    dataset,
    profileType,
    filter_repCorr_params,
    per_plate_normalized_flag,
):
    """
    Reads the treatment level profiles of both modalities and pairs them
    with an inner merge on the PERT column

    Inputs:
        dataset_rootDir: datasets root dir
        dataset: any from the available list of ['LUAD', 'TAORF', 'LINCS', 'CDRP-bio', 'CDRP']
        profileType: Cell Painting profile type that can be 'augmented' , 'normalized', 'normalized_variable_selected'
        filter_repCorr_params: [filter_perts, repCorrFilePath], forwarded to read_treatment_level_profiles
        per_plate_normalized_flag: True for scaling per plate

    Output:
        mergedProfiles_treatLevel: paired treatment level profiles
        cp_features,l1k_features list of feature names for each of modalities
    """

    cp_bundle, l1k_bundle = read_treatment_level_profiles(
        dataset_rootDir,
        dataset,
        profileType,
        filter_repCorr_params,
        per_plate_normalized_flag,
    )
    cp_data_treatLevel, cp_features = cp_bundle
    l1k_data_treatLevel, l1k_features = l1k_bundle

    # only perturbations present in both modalities survive the inner merge
    mergedProfiles_treatLevel = pd.merge(
        cp_data_treatLevel, l1k_data_treatLevel, how="inner", on=[labelCol]
    )

    print(
        "Treatment Level Shapes (nSamples x nFeatures+metadata):",
        cp_data_treatLevel.shape,
        l1k_data_treatLevel.shape,
        "Merged Profiles Shape:",
        mergedProfiles_treatLevel.shape,
    )

    return mergedProfiles_treatLevel, cp_features, l1k_features
340 |
341 |
342 | ################################################################################
def generate_random_match_of_replicate_pairs(cp_data_repLevel, l1k_data_repLevel, nRep):
    """
    Pairs Cell Painting and L1000 replicates that share the same perturbation

    Note that there is no match at the replicate level for this dataset, so pairs are
    formed by an inner merge on the PERT column, which combines every retained CP
    replicate of a perturbation with every retained L1000 replicate of it. We either:
        - keep ALL the replicates of both modalities (nRep='all' - string)
        - first randomly sample, without replacement, at most nRep replicates per
          perturbation in each modality (nRep -> int)

    Inputs:
        cp_data_repLevel, l1k_data_repLevel: replicate level dataframes that both have a PERT column
        nRep: 'all' or the maximum number of replicates sampled per perturbation

    Outputs:
        Randomly paired replicate level profiles

    """
    labelCol = "PERT"

    def sample_per_pert(data_repLevel):
        # Sample group by group rather than through groupby(...).apply(...): apply
        # operating on the grouping column is deprecated (pandas 2.2), and once that
        # column is excluded from the groups the PERT key needed by the merge is lost.
        sampled_groups = [
            group.sample(n=min(nRep, group.shape[0]))
            for _, group in data_repLevel.groupby(labelCol)
        ]
        if not sampled_groups:
            # nothing to sample from: return an empty frame with the same columns
            return data_repLevel.iloc[0:0].reset_index(drop=True)
        return pd.concat(sampled_groups).reset_index(drop=True)

    if nRep == "all":
        cp_data_n_repLevel = cp_data_repLevel
        l1k_data_n_repLevel = l1k_data_repLevel
    else:
        cp_data_n_repLevel = sample_per_pert(cp_data_repLevel)
        l1k_data_n_repLevel = sample_per_pert(l1k_data_repLevel)

    mergedProfiles_repLevel = pd.merge(
        cp_data_n_repLevel, l1k_data_n_repLevel, how="inner", on=[labelCol]
    )

    return mergedProfiles_repLevel
381 |
382 |
383 | ################################################################################
def highRepFinder(dataset, how, repCorrFilePath):
    """
    This function reads pre calculated and saved Replicate Correlation values file and filters perturbations
    using one of the following filters:
        - intersection: intersection of high quality profiles across both modalities
        - union: union of high quality profiles across both modalities

    * A High Quality profile is defined as a profile having replicate correlation more than 90th percentile of
      its null distribution (column 'RepCor' greater than column 'Rand90Perc')

    Inputs:
        dataset (str): dataset name; sheets 'cp-<dataset>' and 'l1k-<dataset>' (lower case) are read
        how (str): can be intersection or union
        repCorrFilePath (str): path of the Excel file with the replicate correlation sheets

    Output: list of high quality perturbations

    Raises:
        ValueError: if how is neither 'intersection' nor 'union'
    """
    # fail early, before reading the file, instead of hitting an unbound variable later
    if how not in ("intersection", "union"):
        raise ValueError(
            "how must be 'intersection' or 'union', got " + repr(how)
        )

    repCorDF = pd.read_excel(repCorrFilePath, sheet_name=None)

    # perturbation names are stored in the first, unnamed column of each sheet
    cpRepDF = repCorDF["cp-" + dataset.lower()]
    cpHighList = cpRepDF[cpRepDF["RepCor"] > cpRepDF["Rand90Perc"]][
        "Unnamed: 0"
    ].tolist()
    print("CP: from ", cpRepDF.shape[0], " to ", len(cpHighList))

    l1kRepDF = repCorDF["l1k-" + dataset.lower()]
    l1kHighList = l1kRepDF[l1kRepDF["RepCor"] > l1kRepDF["Rand90Perc"]][
        "Unnamed: 0"
    ].tolist()
    print("l1k: from ", l1kRepDF.shape[0], " to ", len(l1kHighList))

    if how == "intersection":
        highRepPerts = list(set(l1kHighList) & set(cpHighList))
        print("CP and l1k high rep overlap: ", len(highRepPerts))
    else:
        highRepPerts = list(set(l1kHighList) | set(cpHighList))
        print("CP and l1k high rep union: ", len(highRepPerts))

    return highRepPerts
424 |
425 |
426 | ################################################################################
def read_paired_replicate_level_profiles(
    dataset_rootDir,
    dataset,
    profileType,
    nRep,
    filter_repCorr_params,
    per_plate_normalized_flag,
):
    """
    Builds paired replicate level profiles of both modalities for one dataset

    Steps:
        - read the replicate level profiles with read_replicate_level_profiles
        - rename the dataset specific perturbation column of each modality to PERT
        - optionally keep only the perturbations returned by highRepFinder (plus 'negcon')
        - pair the replicates of both modalities with generate_random_match_of_replicate_pairs

    Inputs:
        dataset_rootDir: datasets root dir
        dataset: any from the available list of ['LUAD', 'TAORF', 'LINCS', 'CDRP-bio', 'CDRP']
        profileType: Cell Painting profile type that can be 'augmented' , 'normalized', 'normalized_variable_selected'
        nRep: 'all' or number of replicates, passed to generate_random_match_of_replicate_pairs
        filter_repCorr_params: [filter_perts, repCorrFilePath]
            filter_perts='highRepOverlap' filters with the 'intersection' mode of highRepFinder,
            filter_perts='highRepUnion' with its 'union' mode, any other value skips the filter;
            repCorrFilePath is the replicate correlation file passed to highRepFinder
        per_plate_normalized_flag: passed to read_replicate_level_profiles (per plate scaling)

    Output:
        mergedProfiles_repLevel: paired replicate level profiles
        cp_features,l1k_features list of feature names for each of modalities
    """

    pert_filter = filter_repCorr_params[0]
    rep_corr_file = filter_repCorr_params[1]

    cp_bundle, l1k_bundle = read_replicate_level_profiles(
        dataset_rootDir, dataset, profileType, per_plate_normalized_flag
    )
    cp_reps, cp_features = cp_bundle
    l1k_reps, l1k_features = l1k_bundle

    ############ rename columns that should match to PERT
    cp_reps = cp_reps.rename(columns={ds_info_dict[dataset][1][0]: labelCol})
    l1k_reps = l1k_reps.rename(columns={ds_info_dict[dataset][1][1]: labelCol})

    ###### print some data statistics
    print(
        dataset + ": Replicate Level Shapes (nSamples x nFeatures): cp: ",
        cp_reps.shape[0],
        ",",
        len(cp_features),
        ", l1k: ",
        l1k_reps.shape[0],
        ",",
        len(l1k_features),
    )

    print("l1k n of rep: ", l1k_reps.groupby([labelCol]).size().median())
    print("cp n of rep: ", cp_reps.groupby([labelCol]).size().median())

    ###### remove perts with low rep corr
    finder_mode = None
    if pert_filter == "highRepOverlap":
        finder_mode = "intersection"
    elif pert_filter == "highRepUnion":
        finder_mode = "union"

    if finder_mode is not None:
        # the negative control label is always kept
        kept_perts = highRepFinder(dataset, finder_mode, rep_corr_file) + ["negcon"]
        # NOTE(review): reset_index() without drop=True keeps the previous index as an
        # extra column in both frames, which then travels into the merged output
        cp_reps = cp_reps[cp_reps["PERT"].isin(kept_perts)].reset_index()
        l1k_reps = l1k_reps[l1k_reps["PERT"].isin(kept_perts)].reset_index()

    mergedProfiles_repLevel = generate_random_match_of_replicate_pairs(
        cp_reps, l1k_reps, nRep
    )

    return mergedProfiles_repLevel, cp_features, l1k_features
515 |
516 |
def rename_affyprobe_to_genename(l1k_data_df, l1k_features, map_source_address):
    """
    Map the column names of the input dataframe from affy probe ids to gene names

    Inputs:
        l1k_data_df: dataframe whose probe id columns should be renamed
        l1k_features: list of probe ids to translate; each must be present in the mapping
        map_source_address: Excel file with 'probe_id' and 'symbol' columns

    Outputs:
        the renamed dataframe and the gene names of l1k_features, in the same order
    """
    mapping_table = pd.read_excel(map_source_address)
    probe_to_symbol = dict(zip(mapping_table["probe_id"], mapping_table["symbol"]))

    l1k_features_gn = [probe_to_symbol[probe] for probe in l1k_features]
    renamed_df = l1k_data_df.rename(columns=probe_to_symbol)

    return renamed_df, l1k_features_gn
531 |
532 |
def rename_to_genename_list_to_affyprobe(
    l1k_features_gn, our_l1k_prob_list, map_source_address
):
    """
    Map a list of gene names to a list of affy probe ids

    Inputs:
        l1k_features_gn: list of gene names; each must be present in the mapping
        our_l1k_prob_list: not used by the current implementation
        map_source_address: Excel file with 'probe_id' and 'symbol' columns

    Outputs:
        list of probe ids in the order of l1k_features_gn; when a symbol occurs on
        several rows of the mapping, the probe id of its last row is used
    """
    mapping_table = pd.read_excel(map_source_address)
    symbol_to_probe = dict(zip(mapping_table["symbol"], mapping_table["probe_id"]))

    return [symbol_to_probe[gene] for gene in l1k_features_gn]
550 |
551 |
def standardize_per_catX(df, column_name, cp_features):
    """
    Standardize feature columns separately within each category of a column

    Every value of a feature is replaced by (value - group mean) / group std, where the
    group is made of the rows sharing the same value of column_name (e.g. the plate).

    Inputs:
        df: input dataframe; it is not modified
        column_name: name of the column defining the groups
        cp_features: list of the feature column names to standardize

    Output:
        copy of df in which the cp_features columns hold the standardized values
    """
    per_group = df[cp_features + [column_name]].groupby(column_name)
    standardized = per_group.transform(
        lambda feature: (feature - feature.mean()) / feature.std()
    )

    df_scaled_perPlate = df.copy()
    df_scaled_perPlate[cp_features] = standardized.values
    return df_scaled_perPlate
563 |
--------------------------------------------------------------------------------
/utils/replicateCorrs.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import scipy.spatial
3 | import pandas as pd
4 | import matplotlib.pyplot as plt
5 | import seaborn as sns
6 | from random import sample, choices
7 | from scipy.stats import pearsonr
8 |
9 | # sns.set_style("whitegrid")
10 | sns.set(rc={"lines.linewidth": 2})
11 |
12 |
def replicateCorrs(inDf, pertColName, featColNames, plotEnabled):
    """
    Calculates replicate correlations versus across-perturbation correlations

    For every perturbation with more than one replicate, the pairwise correlations
    between its replicate profiles are computed. The null distribution is formed by
    drawing one random profile per perturbation and correlating all pairs of them.

    Parameters:
    inDf   (pandas df): input dataframe contains metadata and features
    pertColName  (str): The column based on which we define replicates of a perturbation
    featColNames(list): The list of all columns corresponding to features
    plotEnabled (bool): If True or 1, plots the two distributions

    Returns:
    list: [randC_v2, repC, repCorrDf]
        randC_v2 (list): correlations between randomly drawn profiles of different
                         perturbations, NaN values removed
        repC     (list): median replicate correlation of each perturbation with more
                         than one replicate, NaN values removed
        repCorrDf (pandas df): indexed by perturbation; 'RepCor' is the mean replicate
                         correlation (left empty for single-replicate perturbations),
                         'Rand90Perc' the 90th percentile of randC_v2 and 'Rep10Perc'
                         the 10th percentile of repC (same value on every row)

    """

    df = inDf.copy()
    df[featColNames] = inDf[featColNames].interpolate()
    uniqPert = df[pertColName].unique().tolist()

    repCorrDf = pd.DataFrame(index=uniqPert, columns=["RepCor"])

    # only perturbations with at least two replicates have a replicate correlation
    repSizeDF = df.groupby([pertColName]).size().reset_index()
    highRepComp = repSizeDF[repSizeDF[0] > 1][pertColName].tolist()

    repC = []
    for u in highRepComp:
        df1 = df[df[pertColName] == u].drop_duplicates().reset_index(drop=True)

        # rows are replicates, so correlate the transposed feature matrix and keep
        # the strict upper triangle (each replicate pair once, no self correlation)
        repCorrPurtbs = df1.loc[:, featColNames].T.corr()
        repCorr = list(
            repCorrPurtbs.values[np.triu_indices(repCorrPurtbs.shape[0], k=1)]
        )
        repCorrDf.loc[u, "RepCor"] = np.nanmean(repCorr)
        repC.append(np.nanmedian(repCorr))

    # NOTE(review): the random profiles are drawn from inDf, not from the
    # interpolated copy df - confirm whether this is intended.
    uniqeSamplesFromEachPurt = inDf.groupby(pertColName)[featColNames].apply(
        lambda s: s.sample(1)
    )
    corrMatAcrossPurtbs = uniqeSamplesFromEachPurt.loc[:, featColNames].T.corr()
    randC_v2 = list(
        corrMatAcrossPurtbs.values[np.triu_indices(corrMatAcrossPurtbs.shape[0], k=1)]
    )

    repC = [c for c in repC if not np.isnan(c)]
    randC_v2 = [c for c in randC_v2 if not np.isnan(c)]

    rand90 = np.percentile(randC_v2, 90)
    rep10 = np.percentile(repC, 10)

    if plotEnabled:
        fig, axes = plt.subplots(figsize=(5, 4))
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
        # kept as is so that the generated figures do not change.
        sns.distplot(
            randC_v2,
            kde=True,
            hist=True,
            bins=100,
            label="random pairs",
            ax=axes,
            norm_hist=True,
        )
        sns.distplot(
            repC,
            kde=True,
            hist=True,
            bins=100,
            label="replicate pairs",
            ax=axes,
            norm_hist=True,
            color="r",
        )

        # dotted lines at the 90th percentile of the null distribution and at zero
        axes.axvline(x=rand90, linestyle=":")
        axes.axvline(x=0, linestyle=":")
        axes.legend(loc=2)
        axes.set_xlim(-1, 1)
        plt.tight_layout()

    repCorrDf["Rand90Perc"] = rand90
    repCorrDf["Rep10Perc"] = rep10
    return [randC_v2, repC, repCorrDf]
145 |
146 |
147 | # input is a list of dfs--> [cp,l1k,cp_cca,l1k_cca]
148 | #######
def plotRepCorrs(allData, pertName):
    """Collect replicate-pair and random-pair correlations for each dataset.

    Args:
        allData: Sequence whose entries are indexable as ``entry[0]`` (a
            DataFrame of profiles) and ``entry[1]`` (the list of feature
            column names to correlate over).
        pertName: Name of the column identifying the perturbation of each row.

    Returns:
        A list with one ``[randC, repC]`` pair per entry of ``allData``:

        * ``repC`` - Pearson correlations between every pair of distinct
          (de-duplicated) rows sharing a perturbation, sorted ascending
          within each perturbation.
        * ``randC`` - for every row of a perturbation, its Pearson correlation
          with the first row of a randomly drawn *other* perturbation.

        Both lists may contain NaN (e.g. for constant profiles); filtering is
        left to the caller.
    """
    corrAll = []
    for entry in allData:
        df, features = entry[0], entry[1]
        repC = []
        randC = []
        for pert in df[pertName].unique().tolist():
            isPert = df[pertName] == pert
            df1 = df[isPert].drop_duplicates().reset_index(drop=True)
            df2 = df[~isPert].drop_duplicates().reset_index(drop=True)

            # Take the strict upper triangle of the replicate correlation
            # matrix: this drops exactly the self-correlations and keeps one
            # value per pair.  (Dropping "the largest unique value" instead
            # silently merges equal correlations and mishandles NaNs or
            # diagonals that are not exactly 1.0.)
            repMat = df1.loc[:, features].T.corr().values
            pairIdx = np.triu_indices(repMat.shape[0], k=1)
            repC.extend(np.sort(repMat[pairIdx]).tolist())

            # Without any other perturbation there is nothing to pair with.
            if df1.empty or df2.empty:
                continue

            # One randomly chosen other perturbation per replicate row.
            randPerts = (
                df2[pertName]
                .drop_duplicates()
                .sample(df1.shape[0], replace=True)
                .tolist()
            )
            df3 = pd.concat(
                [df2[df2[pertName] == p].iloc[0:1, :] for p in randPerts],
                ignore_index=True,
            )
            # Correlate over the feature columns only, matching the replicate
            # computation above; metadata columns must not contribute.
            randCorr = df1.loc[:, features].corrwith(
                df3.loc[:, features], axis=1, method="pearson"
            )
            randC.extend(randCorr.values.tolist())

        corrAll.append([randC, repC])
    return corrAll
183 |
--------------------------------------------------------------------------------
/utils/saveAsNewSheetToExistingFile.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import openpyxl as pxl
3 | import os
4 |
5 | # ------------------------------------------------------
6 |
7 |
8 | # Save the input dataframe to the specified sheet name of filename file
def saveAsNewSheetToExistingFile(filename, newDF, newSheetName):
    """Write ``newDF`` to the sheet ``newSheetName`` of the Excel file ``filename``.

    If ``filename`` already exists, its other sheets are preserved and any
    existing sheet called ``newSheetName`` is replaced.  If the file does not
    exist, it is created with ``newSheetName`` as its only sheet.

    Args:
        filename: Path of the Excel workbook to create or update.
        newDF: DataFrame to store.
        newSheetName: Name of the target sheet.

    Note:
        The update path uses ``pd.ExcelWriter(..., mode="a",
        if_sheet_exists="replace")`` and therefore needs pandas >= 1.3 with
        the openpyxl engine installed.
    """
    if os.path.exists(filename):
        # Let pandas manage the existing workbook: append mode keeps the other
        # sheets and "replace" swaps out a same-named sheet.  Assigning
        # writer.book / writer.sheets by hand and calling writer.save() is not
        # supported by recent pandas releases.
        with pd.ExcelWriter(
            filename, engine="openpyxl", mode="a", if_sheet_exists="replace"
        ) as writer:
            newDF.to_excel(writer, sheet_name=newSheetName)
    else:
        newDF.to_excel(filename, sheet_name=newSheetName)

    print(newSheetName, " saved!")
34 |
35 |
36 | # ------------------------------------------------------
37 |
38 |
39 | # saveDF_to_CSV_GZ_no_timestamp
def saveDF_to_CSV_GZ_no_timestamp(df, filename):
    """Write ``df`` as a gzip-compressed CSV with a zeroed gzip timestamp.

    The gzip header's modification time is fixed to 0, so writing the same
    data to the same ``filename`` repeatedly yields byte-identical files.
    The index is not written.

    Args:
        df: DataFrame to serialise.
        filename: Path of the ``.csv.gz`` file to create (overwritten if present).
    """
    from gzip import GzipFile
    from io import TextIOWrapper

    # newline="" stops the text wrapper from translating the line terminator
    # that pandas already writes.
    with TextIOWrapper(
        GzipFile(filename, "w", mtime=0), encoding="utf-8", newline=""
    ) as fd:
        # The handle is already a gzip stream; passing compression= here has
        # no effect on a text handle and only triggers a pandas warning.
        df.to_csv(fd, index=False)
48 |
--------------------------------------------------------------------------------