├── .gitignore
├── .pre-commit-config.yaml
├── 0.dataset_creation
    └── 0-preprocess_datasets.ipynb
├── 1.dataset_curation
    ├── README.md
    ├── curate_dataset.py
    ├── pyproject.toml
    └── uv.lock
├── App1.single_feature_prediction
    ├── 1-single-Gene-CPfeature-prediction.ipynb
    └── lookup_luad_images.ipynb
├── App2.MoA_prediction
    ├── 2a-Modality_Integration_CDRP-bio.ipynb
    └── 2b-Modality_Integration_LINCS.ipynb
├── GO_terms_search
    ├── 4-GO-terms-search-analysis.ipynb
    └── source
    │   ├── GO_bp_cc_mf_direct_LUAD_975.txt
    │   ├── GO_bp_cc_mf_direct_LUAD_976.txt
    │   ├── GO_bp_cc_mf_direct_intersection_782.txt
    │   ├── GO_bp_cc_mf_direct_intersection_782_completed.csv
    │   ├── GO_bp_cc_mf_direct_union_1165.txt
    │   ├── LUAD_geneSymbols_978.txt
    │   ├── intersection_geneSymbols_785.txt
    │   ├── top_100_luad.txt
    │   ├── top_59_atleast_topIn3.txt
    │   └── union_geneSymbols_1170.txt
├── LICENSE
├── README.md
├── environment.yml
├── etag.json
├── explore_the_link.ipynb
├── generate_paper_figures
    └── generate_paper_figs.ipynb
├── idmap.xlsx
├── read_and_match_profiles.ipynb
├── results
    ├── DAVIDoutput_CytoScapeInput_Figure2d
    │   ├── chart_UP_KEYWORDS_FunctionalAnot_all.txt
    │   └── chart_UP_KEYWORDS_FunctionalAnot_top.txt
    ├── Figs_Source_Data.xlsx
    ├── MoAprediction
    │   ├── JI_cdrpbio.txt
    │   ├── JI_lincs.txt
    │   ├── pred_moa.xlsx
    │   ├── pred_moa_2.xlsx
    │   ├── pred_moa_CDRP.xlsx
    │   └── pred_moa_LINCS.xlsx
    ├── RepCor
    │   └── RepCorrDF.xlsx
    ├── SingleCPfeatPred
    │   └── scores_corrected.xlsx
    ├── SingleGenePred
    │   ├── scores_corrected.xlsx
    │   ├── scores_cross_dts_LU_LI.xlsx
    │   └── supplementary_D.csv
    └── SingleGenePred_cpCategoryMap
    │   ├── CatMap-LINCS-25-lasso-ht.png
    │   ├── CatMap-LUAD-9-MLP-keras-ht.pdf
    │   ├── CatMap-LUAD-9-MLP-keras-ht.png
    │   ├── CatMap-LUAD-9-lasso-ht.png
    │   └── cat_scores_maps.xlsx
└── utils
    ├── pred_models.py
    ├── readProfiles.py
    ├── replicateCorrs.py
    └── saveAsNewSheetToExistingFile.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # result folder
  2 | 
  3 | # Byte-compiled / optimized / DLL files
  4 | __pycache__/
  5 | *.py[cod]
  6 | *$py.class
  7 | 
  8 | # C extensions
  9 | *.so
 10 | 
 11 | # Distribution / packaging
 12 | .Python
 13 | build/
 14 | develop-eggs/
 15 | dist/
 16 | downloads/
 17 | eggs/
 18 | .eggs/
 19 | lib/
 20 | lib64/
 21 | parts/
 22 | sdist/
 23 | var/
 24 | wheels/
 25 | pip-wheel-metadata/
 26 | share/python-wheels/
 27 | *.egg-info/
 28 | .installed.cfg
 29 | *.egg
 30 | MANIFEST
 31 | 
 32 | # PyInstaller
 33 | #  Usually these files are written by a python script from a template
 34 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 35 | *.manifest
 36 | *.spec
 37 | 
 38 | # Installer logs
 39 | pip-log.txt
 40 | pip-delete-this-directory.txt
 41 | 
 42 | # Unit test / coverage reports
 43 | htmlcov/
 44 | .tox/
 45 | .nox/
 46 | .coverage
 47 | .coverage.*
 48 | .cache
 49 | nosetests.xml
 50 | coverage.xml
 51 | *.cover
 52 | *.py,cover
 53 | .hypothesis/
 54 | .pytest_cache/
 55 | 
 56 | # Translations
 57 | *.mo
 58 | *.pot
 59 | 
 60 | # Django stuff:
 61 | *.log
 62 | local_settings.py
 63 | db.sqlite3
 64 | db.sqlite3-journal
 65 | 
 66 | # Flask stuff:
 67 | instance/
 68 | .webassets-cache
 69 | 
 70 | # Scrapy stuff:
 71 | .scrapy
 72 | 
 73 | # Sphinx documentation
 74 | docs/_build/
 75 | 
 76 | # PyBuilder
 77 | target/
 78 | 
 79 | # Jupyter Notebook
 80 | .ipynb_checkpoints
 81 | 
 82 | # IPython
 83 | profile_default/
 84 | ipython_config.py
 85 | 
 86 | # pyenv
 87 | .python-version
 88 | 
 89 | # pipenv
 90 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 91 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 92 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 93 | #   install all needed dependencies.
 94 | #Pipfile.lock
 95 | 
 96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 97 | __pypackages__/
 98 | 
 99 | # Celery stuff
100 | celerybeat-schedule
101 | celerybeat.pid
102 | 
103 | # SageMath parsed files
104 | *.sage.py
105 | 
106 | # Environments
107 | .env
108 | .venv
109 | env/
110 | venv/
111 | ENV/
112 | env.bak/
113 | venv.bak/
114 | 
115 | # Spyder project settings
116 | .spyderproject
117 | .spyproject
118 | 
119 | # Rope project settings
120 | .ropeproject
121 | 
122 | # mkdocs documentation
123 | /site
124 | 
125 | # mypy
126 | .mypy_cache/
127 | .dmypy.json
128 | dmypy.json
129 | 
130 | # Pyre type checker
131 | .pyre/
132 | .Rproj.user
133 | .Rhistory
134 | .Rprofile
135 | *.nb.html
136 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pre-commit/pre-commit-hooks
3 |   rev: v5.0.0
4 |   hooks:
5 |   - id: trailing-whitespace
6 |     exclude: ^.*/$
7 | 
8 | 


--------------------------------------------------------------------------------
/1.dataset_curation/README.md:
--------------------------------------------------------------------------------
 1 | # Dataset Curation
 2 | 
 3 | `curate_dataset.py` selects, renames, and fixes columns from the preprocessed data to create a curated dataset.
 4 | 
 5 | ## Structure
 6 | 
 7 | Available at:
 8 | `s3://cellpainting-gallery/cpg0003-rosetta/broad/workspace/curated_preprocessed_data`
 9 | 
10 | ```
11 | curated_preprocessed_data
12 | ├── CDRP-BBBC047-Bray
13 | │   ├── CellPainting
14 | │   │   └── replicate_level_cp_augmented.parquet
15 | │   └── L1000
16 | │       └── replicate_level_l1k.parquet
17 | ├── LINCS-Pilot1
18 | │   ├── CellPainting
19 | │   │   └── replicate_level_cp_augmented.parquet
20 | │   └── L1000
21 | │       └── replicate_level_l1k.parquet
22 | ├── LUAD-BBBC041-Caicedo
23 | │   ├── CellPainting
24 | │   │   └── replicate_level_cp_augmented.parquet
25 | │   └── L1000
26 | │       └── replicate_level_l1k.parquet
27 | └── TA-ORF-BBBC037-Rohban
28 |     ├── CellPainting
29 |     │   └── replicate_level_cp_augmented.parquet
30 |     └── L1000
31 |         └── replicate_level_l1k.parquet
32 | ```
33 | 
34 | ## Curated columns
35 | 
36 | - `Metadata_Plate` [All]: Identifier of the multi‐well plate (e.g., SQ00015156, PAC053_U2OS_6H_X2_B1_UNI4445R, TA.OE005_U2OS_72H_X1_B15).
37 | - `Metadata_Plate_Map_Name` [All CP]: Plate‐map identifier (e.g., C-7161-01-LM6-003).
38 | - `Metadata_ARP_ID` [LINCS-Pilot1 L1K, TA-ORF-BBBC037-Rohban L1K, LUAD-BBBC041-Caicedo L1K]: Internal plate identifier (e.g., AB00016187).
39 | - `Metadata_Well` [All except CDRP-BBBC047-Bray L1K]: Specific well position within the plate (e.g., A01, H11).
40 | - `Metadata_pert_id` [All]: Unique perturbation identifier (e.g., BRD-K50691590-001-02-2, TRCN0000471252, EMPTY).
 41 | - `Metadata_pert_type` [All except CDRP-BBBC047-Bray L1K]: Perturbation type (e.g., trt, control); the raw labels `trt_cp` and `ctl_vehicle` are normalized to `trt` and `control`.
42 | - `Metadata_cell_id` [All]: Cell line used (e.g., A549, U2OS).
43 | - `Metadata_pert_timepoint` [All]: Time (in hours) from perturbation to measurement (e.g., 24, 48, 72, 96).
44 | - `Metadata_pert_dose_micromolar` [LINCS-Pilot1, CDRP-BBBC047-Bray]: Final compound concentration (µM) (e.g., 0.0411523, 10).
 45 | - `Metadata_pert_iname` [LINCS-Pilot1, CDRP-BBBC047-Bray L1K]: Common name of the compound or control (e.g., bortezomib, DMSO).
46 | - `Metadata_SMILES` [LINCS-Pilot1 L1K, CDRP-BBBC047-Bray L1K]: SMILES string for the compound structure.
47 | - `Metadata_cdrp_group` [CDRP-BBBC047-Bray L1K]: Subset/group label in the CDRP compound library (e.g., DOS, BIO).
48 | - `Metadata_genesymbol_mutation` [TA-ORF-BBBC037-Rohban L1K+CP, LUAD-BBBC041-Caicedo CP]: Gene plus mutation notation (e.g., TP53_p.R248Q).
49 | - `Metadata_genesymbol` [TA-ORF-BBBC037-Rohban CP, LUAD-BBBC041-Caicedo CP]: Gene symbol alone (e.g., TP53, MAPK8).
50 | - `Metadata_transcriptdb` [LUAD-BBBC041-Caicedo L1K]: Reference to specific transcript/isoform (e.g., NM_001126112.2:c.796G>C).
51 | 
52 | 


--------------------------------------------------------------------------------
/1.dataset_curation/curate_dataset.py:
--------------------------------------------------------------------------------
  1 | # ---
  2 | # jupyter:
  3 | #   jupytext:
  4 | #     text_representation:
  5 | #       extension: .py
  6 | #       format_name: percent
  7 | #       format_version: '1.3'
  8 | #       jupytext_version: 1.15.2
  9 | #   kernelspec:
 10 | #     display_name: Python 3
 11 | #     language: python
 12 | #     name: python3
 13 | # ---
 14 | 
 15 | # %%
 16 | import pandas as pd
 17 | from pathlib import Path
 18 | from IPython.display import display
 19 | 
 20 | # %%
 21 | # Prerequisite: download the preprocessed data from
 22 | # s3://cellpainting-gallery/cpg0003-rosetta/broad/workspace/preprocessed_data
 23 | # and save it in the ./preprocessed_data folder (relative to the working directory).
 24 | # Input locations, keyed by dataset name and then by data type ("l1k" or "cp").
 25 | dataset_paths = {
 26 |     "LINCS-Pilot1": {
 27 |         "l1k": "./preprocessed_data/LINCS-Pilot1/L1000/replicate_level_l1k.csv.gz",
 28 |         "cp": "./preprocessed_data/LINCS-Pilot1/CellPainting/replicate_level_cp_augmented.csv.gz",
 29 |     },
 30 |     "CDRP-BBBC047-Bray": {
 31 |         "l1k": "./preprocessed_data/CDRP-BBBC047-Bray/L1000/replicate_level_l1k.csv.gz",
 32 |         "cp": "./preprocessed_data/CDRP-BBBC047-Bray/CellPainting/replicate_level_cp_augmented.csv.gz",
 33 |     },
 34 |     "TA-ORF-BBBC037-Rohban": {
 35 |         "l1k": "./preprocessed_data/TA-ORF-BBBC037-Rohban/L1000/replicate_level_l1k.csv.gz",
 36 |         "cp": "./preprocessed_data/TA-ORF-BBBC037-Rohban/CellPainting/replicate_level_cp_augmented.csv.gz",
 37 |     },
 38 |     "LUAD-BBBC041-Caicedo": {
 39 |         "l1k": "./preprocessed_data/LUAD-BBBC041-Caicedo/L1000/replicate_level_l1k.csv.gz",
 40 |         "cp": "./preprocessed_data/LUAD-BBBC041-Caicedo/CellPainting/replicate_level_cp_augmented.csv.gz",
 41 |     },
 42 | }
 43 | 
 44 | # Rename table: original column name -> curated "Metadata_*" name, per dataset and data type
 45 | column_rename_mappings = {
 46 |     "CDRP-BBBC047-Bray": {
 47 |         "l1k": {
 48 |             "pert_id": "Metadata_pert_id",
 49 |             "pert_dose": "Metadata_pert_dose_micromolar",
 50 |             "det_plate": "Metadata_Plate",
 51 |             "CPD_NAME": "Metadata_pert_iname",
 52 |             "CPD_TYPE": "Metadata_cdrp_group",
 53 |             "CPD_SMILES": "Metadata_SMILES",
 54 |         },
 55 |         "cp": {
 56 |             "Metadata_broad_sample": "Metadata_pert_id",
 57 |             "Metadata_broad_sample_type": "Metadata_pert_type",
 58 |             "Metadata_mmoles_per_liter2": "Metadata_pert_dose_micromolar",
 59 |         },
 60 |     },
 61 |     "LINCS-Pilot1": {
 62 |         "l1k": {
 63 |             "pert_dose": "Metadata_pert_dose_micromolar",
 64 |             "det_plate": "Metadata_Plate",
 65 |             "cell_id": "Metadata_cell_id",
 66 |             "det_well": "Metadata_Well",
 67 |             "mfc_plate_name": "Metadata_ARP_ID",
 68 |             "pert_iname_x": "Metadata_pert_iname",
 69 |             "pert_time": "Metadata_pert_timepoint",
 70 |             "pert_mfc_id": "Metadata_pert_id",
 71 |             "pert_type_x": "Metadata_pert_type",
 72 |             "x_smiles": "Metadata_SMILES",
 73 |         },
 74 |         "cp": {
 75 |             "Metadata_broad_sample": "Metadata_pert_id",
 76 |             "Metadata_broad_sample_type": "Metadata_pert_type",
 77 |             "Metadata_mmoles_per_liter": "Metadata_pert_dose_micromolar",
 78 |             "pert_iname": "Metadata_pert_iname",
 79 |         },
 80 |     },
 81 |     "TA-ORF-BBBC037-Rohban": {
 82 |         "l1k": {
 83 |             "det_plate": "Metadata_Plate",
 84 |             "cell_id": "Metadata_cell_id",
 85 |             "det_well": "Metadata_Well",
 86 |             "mfc_plate_name": "Metadata_ARP_ID",
 87 |             "pert_time": "Metadata_pert_timepoint",
 88 |             "pert_mfc_id": "Metadata_pert_id",
 89 |             "pert_type": "Metadata_pert_type",
 90 |             "x_genesymbol_mutation": "Metadata_genesymbol_mutation",
 91 |         },
 92 |         "cp": {
 93 |             "Metadata_broad_sample": "Metadata_pert_id",
 94 |             "Metadata_broad_sample_type": "Metadata_pert_type",
 95 |             "Metadata_pert_name": "Metadata_genesymbol_mutation",
 96 |             "Metadata_gene_name": "Metadata_genesymbol",
 97 |         },
 98 |     },
 99 |     "LUAD-BBBC041-Caicedo": {
100 |         "l1k": {
101 |             "det_plate": "Metadata_Plate",
102 |             "cell_id": "Metadata_cell_id",
103 |             "det_well": "Metadata_Well",
104 |             "mfc_plate_name": "Metadata_ARP_ID",
105 |             "pert_time": "Metadata_pert_timepoint",
106 |             "pert_mfc_id": "Metadata_pert_id",
107 |             "pert_type": "Metadata_pert_type",
108 |             "x_transcriptdb": "Metadata_transcriptdb",
109 |         },
110 |         "cp": {
111 |             "Metadata_broad_sample": "Metadata_pert_id",
112 |             "Metadata_broad_sample_type": "Metadata_pert_type",
113 |             "x_mutation_status": "Metadata_genesymbol_mutation",
114 |             "Symbol": "Metadata_genesymbol",
115 |         },
116 |     },
117 | }
118 | 
119 | # Metadata columns (post-rename names) to keep per dataset and data type; feature columns are kept separately
120 | columns_to_keep = {
121 |     "CDRP-BBBC047-Bray": {
122 |         "l1k": [
123 |             "Metadata_Plate",
124 |             "Metadata_pert_id",
125 |             "Metadata_pert_iname",
126 |             "Metadata_pert_dose_micromolar",
127 |             "Metadata_cdrp_group",
128 |             "Metadata_SMILES",
129 |         ],
130 |         "cp": [
131 |             "Metadata_Plate_Map_Name",
132 |             "Metadata_Plate",
133 |             "Metadata_Well",
134 |             "Metadata_pert_id",
135 |             "Metadata_pert_dose_micromolar",
136 |             "Metadata_pert_type",
137 |             "Metadata_cell_id",
138 |         ],
139 |     },
140 |     "LINCS-Pilot1": {
141 |         "l1k": [
142 |             "Metadata_Plate",
143 |             "Metadata_Well",
144 |             "Metadata_pert_id",
145 |             "Metadata_pert_type",
146 |             "Metadata_pert_dose_micromolar",
147 |             "Metadata_cell_id",
148 |             "Metadata_pert_iname",
149 |             "Metadata_ARP_ID",
150 |             "Metadata_pert_timepoint",
151 |             "Metadata_SMILES",
152 |         ],
153 |         "cp": [
154 |             "Metadata_Plate_Map_Name",
155 |             "Metadata_Plate",
156 |             "Metadata_Well",
157 |             "Metadata_pert_id",
158 |             "Metadata_pert_type",
159 |             "Metadata_pert_dose_micromolar",
160 |             "Metadata_cell_id",
161 |             "Metadata_pert_iname",
162 |         ],
163 |     },
164 |     "TA-ORF-BBBC037-Rohban": {
165 |         "l1k": [
166 |             "Metadata_Plate",
167 |             "Metadata_Well",
168 |             "Metadata_pert_id",
169 |             "Metadata_pert_type",
170 |             "Metadata_cell_id",
171 |             "Metadata_ARP_ID",
172 |             "Metadata_pert_timepoint",
173 |             "Metadata_genesymbol_mutation",
174 |         ],
175 |         "cp": [
176 |             "Metadata_Plate_Map_Name",
177 |             "Metadata_Plate",
178 |             "Metadata_Well",
179 |             "Metadata_pert_id",
180 |             "Metadata_pert_type",
181 |             "Metadata_cell_id",
182 |             "Metadata_genesymbol_mutation",
183 |             "Metadata_genesymbol",
184 |         ],
185 |     },
186 |     "LUAD-BBBC041-Caicedo": {
187 |         "l1k": [
188 |             "Metadata_Plate",
189 |             "Metadata_Well",
190 |             "Metadata_pert_id",
191 |             "Metadata_pert_type",
192 |             "Metadata_cell_id",
193 |             "Metadata_ARP_ID",
194 |             "Metadata_pert_timepoint",
195 |             "Metadata_transcriptdb",
196 |         ],
197 |         "cp": [
198 |             "Metadata_Plate_Map_Name",
199 |             "Metadata_Plate",
200 |             "Metadata_Well",
201 |             "Metadata_pert_id",
202 |             "Metadata_pert_type",
203 |             "Metadata_cell_id",
204 |             "Metadata_genesymbol_mutation",
205 |             "Metadata_genesymbol",
206 |         ],
207 |     },
208 | }
209 | 
210 | # First load the data
211 | dataset_data = {}
212 | for dataset_name, paths in dataset_paths.items():
213 |     dataset_data[dataset_name] = {}
214 |     for data_type, dataset_path in paths.items():
215 |         parquet_path = dataset_path.replace(".csv.gz", ".parquet")
216 |         if not Path(parquet_path).exists():
217 |             data = pd.read_csv(dataset_path, low_memory=False)
218 |             data.to_parquet(parquet_path)
219 |             dataset_data[dataset_name][data_type] = data
220 |         else:
221 |             data = pd.read_parquet(parquet_path)
222 |             dataset_data[dataset_name][data_type] = data
223 | 
224 | 
225 | # %%
226 | 
227 | # Then apply the column renaming
228 | for dataset_name, data_types in dataset_data.items():
229 |     for data_type, data in data_types.items():
230 |         if (
231 |             dataset_name in column_rename_mappings
232 |             and data_type in column_rename_mappings[dataset_name]
233 |         ):
234 |             # First, identify feature columns we want to preserve
235 |             if data_type == "l1k":
236 |                 feature_mask = data.columns.str.endswith("_at")
237 |             else:  # cp
238 |                 feature_mask = (
239 |                     data.columns.str.startswith("Cells_")
240 |                     | data.columns.str.startswith("Cytoplasm_")
241 |                     | data.columns.str.startswith("Nuclei_")
242 |                 )
243 |             feature_cols = data.columns[feature_mask]
244 |             metadata_cols = data.columns[~feature_mask]
245 | 
246 |             # Apply renaming only to metadata columns
247 |             rename_mapping = {
248 |                 k: v
249 |                 for k, v in column_rename_mappings[dataset_name][data_type].items()
250 |                 if k in metadata_cols
251 |             }
252 | 
253 |             # Check if new name already exists and drop it if so
254 |             for old, new in rename_mapping.items():
255 |                 if new in data.columns and new != old:
256 |                     data.drop(columns=[new], inplace=True)
257 |             # Rename metadata columns
258 |             data = data.rename(columns=rename_mapping)
259 | 
260 |             # Keep only desired metadata columns plus all feature columns
261 |             keep_metadata = columns_to_keep[dataset_name][data_type]
262 |             dataset_data[dataset_name][data_type] = data[
263 |                 keep_metadata + feature_cols.tolist()
264 |             ]
265 | 
266 | # %%
267 | 
268 | # Make "Metadata_Well" uppercase
269 | for dataset_name, data_types in dataset_data.items():
270 |     for data_type, data in data_types.items():
271 |         if "Metadata_Well" in data.columns:
272 |             data["Metadata_Well"] = data["Metadata_Well"].str.upper()
273 | 
274 | 
275 | # Make "Metadata_cell_id" = U2OS for CDRP-BBBC047-Bray cp
276 | dataset_data["CDRP-BBBC047-Bray"]["l1k"]["Metadata_cell_id"] = "U2OS"
277 | 
278 | # Set timepoints
279 | dataset_data["LINCS-Pilot1"]["cp"]["Metadata_pert_timepoint"] = 48
280 | dataset_data["LINCS-Pilot1"]["l1k"]["Metadata_pert_timepoint"] = 24
281 | 
282 | dataset_data["CDRP-BBBC047-Bray"]["cp"]["Metadata_pert_timepoint"] = 48
283 | dataset_data["CDRP-BBBC047-Bray"]["l1k"]["Metadata_pert_timepoint"] = 6
284 | 
285 | dataset_data["TA-ORF-BBBC037-Rohban"]["cp"]["Metadata_pert_timepoint"] = 72
286 | dataset_data["TA-ORF-BBBC037-Rohban"]["l1k"]["Metadata_pert_timepoint"] = 72
287 | 
288 | dataset_data["LUAD-BBBC041-Caicedo"]["cp"]["Metadata_pert_timepoint"] = 96
289 | dataset_data["LUAD-BBBC041-Caicedo"]["l1k"]["Metadata_pert_timepoint"] = 96
290 | 
291 | # %%
292 | 
293 | # Display the datasets
294 | for dataset_name, data_types in dataset_data.items():
295 |     for data_type, data in data_types.items():
296 |         display(f"Dataset: {dataset_name}, Data Type: {data_type}")
297 |         display(data.sample(5)[data.columns[data.columns.str.startswith("Metadata")]])
298 | 
299 | # %%
300 | 
301 | # %%
302 | for dataset_name, data_types in dataset_data.items():
303 |     for data_type, data in data_types.items():
304 |         if "Metadata_pert_type" in data.columns:
305 |             data["Metadata_pert_type"] = data["Metadata_pert_type"].replace(
306 |                 {"ctl_vehicle": "control", "trt_cp": "trt"}
307 |             )
308 | 
309 | 
310 | # NOTE: TA-ORF-BBBC037-Rohban cp does not identify Metadata_pert_type correctly: it marks every row as trt.
311 | 
312 | # %%
313 | 
314 | # Print columns for each dataset and data type
315 | print("\nColumns in each dataset:")
316 | for dataset_name, data_types in dataset_data.items():
317 |     print(f"\n{dataset_name}:")
318 |     for data_type, data in data_types.items():
319 |         metadata_cols = [col for col in data.columns if col.startswith("Metadata")]
320 |         print(f"  {data_type}: {sorted(metadata_cols)}")
321 | 
322 | # Find common columns between l1k datasets
323 | l1k_common = set.intersection(
324 |     *[set(data_types["l1k"].columns) for data_types in dataset_data.values()]
325 | )
326 | l1k_metadata_common = sorted([col for col in l1k_common if col.startswith("Metadata")])
327 | 
328 | # Find common columns between cp datasets
329 | cp_common = set.intersection(
330 |     *[set(data_types["cp"].columns) for data_types in dataset_data.values()]
331 | )
332 | cp_metadata_common = sorted([col for col in cp_common if col.startswith("Metadata")])
333 | 
334 | # Find common columns across all datasets
335 | all_common = set.intersection(l1k_common, cp_common)
336 | all_metadata_common = sorted([col for col in all_common if col.startswith("Metadata")])
337 | 
338 | print("\nCommon Metadata columns across L1K datasets:")
339 | print(l1k_metadata_common)
340 | print("\nCommon Metadata columns across CP datasets:")
341 | print(cp_metadata_common)
342 | print("\nCommon Metadata columns across ALL datasets:")
343 | print(all_metadata_common)
344 | 
345 | # %%
346 | 
347 | # Check for duplicate columns within each dataset
348 | for dataset_name, data_types in dataset_data.items():
349 |     for data_type, data in data_types.items():
350 |         duplicate_cols = data.columns.duplicated()
351 |         if any(duplicate_cols):
352 |             print(f"Duplicate columns found in {dataset_name} {data_type}:")
353 |             print(data.columns[duplicate_cols])
354 | 
355 | # %%
356 | # Create markdown output for datasets
357 | markdown_output = "# Dataset Samples\n\n"
358 | 
359 | for dataset_name, data_types in dataset_data.items():
360 |     markdown_output += f"## {dataset_name}\n\n"
361 |     for data_type, data in data_types.items():
362 |         markdown_output += f"### {data_type.upper()} Data\n\n"
363 |         # Convert sample to markdown table
364 |         sample_df = data.sample(5)[
365 |             [col for col in data.columns if col.startswith("Metadata")]
366 |         ]
367 |         markdown_output += sample_df.to_markdown(index=False) + "\n\n"
368 |         display(sample_df.head())
369 | 
370 | # Write to file
371 | with open("dataset_samples.md", "w") as f:
372 |     f.write(markdown_output)
373 | 
374 | print("Dataset samples have been written to dataset_samples.md")
375 | 
376 | # %%
377 | 
378 | # Save processed datasets using same structure as input
379 | for dataset_name, data_types in dataset_data.items():
380 |     for data_type, data in data_types.items():
381 |         # Mirror the input path structure but with processed data
382 |         input_path = Path(dataset_paths[dataset_name][data_type])
383 |         output_path = (
384 |             Path("curated")
385 |             / input_path.parent
386 |             / input_path.name.replace(".csv.gz", ".parquet")
387 |         )
388 |         # Create the processed subdirectory if it doesn't exist
389 |         output_path.parent.mkdir(exist_ok=True, parents=True)
390 | 
391 |         # # Save the data
392 |         data.to_parquet(output_path, index=False)
393 |         print(f"Saved {dataset_name} {data_type} data to {output_path}")
394 | 
395 | # %%
396 | 


--------------------------------------------------------------------------------
/1.dataset_curation/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "1-dataset-curation"
 3 | version = "0.1.0"
 4 | description = "Curate 2022_Haghighi_NatureMethods"
 5 | readme = "README.md"
 6 | requires-python = ">=3.12"
 7 | dependencies = [
 8 |     "ipython>=8.31.0",
 9 |     "pandas>=2.2.3",
10 |     "pyarrow>=19.0.0",
11 |     "tabulate>=0.9.0",
12 | ]
13 | 


--------------------------------------------------------------------------------
/1.dataset_curation/uv.lock:
--------------------------------------------------------------------------------
  1 | version = 1
  2 | requires-python = ">=3.12"
  3 | 
  4 | [[package]]
  5 | name = "1-dataset-curation"
  6 | version = "0.1.0"
  7 | source = { virtual = "." }
  8 | dependencies = [
  9 |     { name = "ipython" },
 10 |     { name = "pandas" },
 11 |     { name = "pyarrow" },
 12 |     { name = "tabulate" },
 13 | ]
 14 | 
 15 | [package.metadata]
 16 | requires-dist = [
 17 |     { name = "ipython", specifier = ">=8.31.0" },
 18 |     { name = "pandas", specifier = ">=2.2.3" },
 19 |     { name = "pyarrow", specifier = ">=19.0.0" },
 20 |     { name = "tabulate", specifier = ">=0.9.0" },
 21 | ]
 22 | 
 23 | [[package]]
 24 | name = "asttokens"
 25 | version = "3.0.0"
 26 | source = { registry = "https://pypi.org/simple" }
 27 | sdist = { url = "https://files.pythonhosted.org/packages/4a/e7/82da0a03e7ba5141f05cce0d302e6eed121ae055e0456ca228bf693984bc/asttokens-3.0.0.tar.gz", hash = "sha256:0dcd8baa8d62b0c1d118b399b2ddba3c4aff271d0d7a9e0d4c1681c79035bbc7", size = 61978 }
 28 | wheels = [
 29 |     { url = "https://files.pythonhosted.org/packages/25/8a/c46dcc25341b5bce5472c718902eb3d38600a903b14fa6aeecef3f21a46f/asttokens-3.0.0-py3-none-any.whl", hash = "sha256:e3078351a059199dd5138cb1c706e6430c05eff2ff136af5eb4790f9d28932e2", size = 26918 },
 30 | ]
 31 | 
 32 | [[package]]
 33 | name = "colorama"
 34 | version = "0.4.6"
 35 | source = { registry = "https://pypi.org/simple" }
 36 | sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697 }
 37 | wheels = [
 38 |     { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 },
 39 | ]
 40 | 
 41 | [[package]]
 42 | name = "decorator"
 43 | version = "5.1.1"
 44 | source = { registry = "https://pypi.org/simple" }
 45 | sdist = { url = "https://files.pythonhosted.org/packages/66/0c/8d907af351aa16b42caae42f9d6aa37b900c67308052d10fdce809f8d952/decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330", size = 35016 }
 46 | wheels = [
 47 |     { url = "https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186", size = 9073 },
 48 | ]
 49 | 
 50 | [[package]]
 51 | name = "executing"
 52 | version = "2.2.0"
 53 | source = { registry = "https://pypi.org/simple" }
 54 | sdist = { url = "https://files.pythonhosted.org/packages/91/50/a9d80c47ff289c611ff12e63f7c5d13942c65d68125160cefd768c73e6e4/executing-2.2.0.tar.gz", hash = "sha256:5d108c028108fe2551d1a7b2e8b713341e2cb4fc0aa7dcf966fa4327a5226755", size = 978693 }
 55 | wheels = [
 56 |     { url = "https://files.pythonhosted.org/packages/7b/8f/c4d9bafc34ad7ad5d8dc16dd1347ee0e507a52c3adb6bfa8887e1c6a26ba/executing-2.2.0-py2.py3-none-any.whl", hash = "sha256:11387150cad388d62750327a53d3339fad4888b39a6fe233c3afbb54ecffd3aa", size = 26702 },
 57 | ]
 58 | 
 59 | [[package]]
 60 | name = "ipython"
 61 | version = "8.31.0"
 62 | source = { registry = "https://pypi.org/simple" }
 63 | dependencies = [
 64 |     { name = "colorama", marker = "sys_platform == 'win32'" },
 65 |     { name = "decorator" },
 66 |     { name = "jedi" },
 67 |     { name = "matplotlib-inline" },
 68 |     { name = "pexpect", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
 69 |     { name = "prompt-toolkit" },
 70 |     { name = "pygments" },
 71 |     { name = "stack-data" },
 72 |     { name = "traitlets" },
 73 | ]
 74 | sdist = { url = "https://files.pythonhosted.org/packages/01/35/6f90fdddff7a08b7b715fccbd2427b5212c9525cd043d26fdc45bee0708d/ipython-8.31.0.tar.gz", hash = "sha256:b6a2274606bec6166405ff05e54932ed6e5cfecaca1fc05f2cacde7bb074d70b", size = 5501011 }
 75 | wheels = [
 76 |     { url = "https://files.pythonhosted.org/packages/04/60/d0feb6b6d9fe4ab89fe8fe5b47cbf6cd936bfd9f1e7ffa9d0015425aeed6/ipython-8.31.0-py3-none-any.whl", hash = "sha256:46ec58f8d3d076a61d128fe517a51eb730e3aaf0c184ea8c17d16e366660c6a6", size = 821583 },
 77 | ]
 78 | 
 79 | [[package]]
 80 | name = "jedi"
 81 | version = "0.19.2"
 82 | source = { registry = "https://pypi.org/simple" }
 83 | dependencies = [
 84 |     { name = "parso" },
 85 | ]
 86 | sdist = { url = "https://files.pythonhosted.org/packages/72/3a/79a912fbd4d8dd6fbb02bf69afd3bb72cf0c729bb3063c6f4498603db17a/jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0", size = 1231287 }
 87 | wheels = [
 88 |     { url = "https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9", size = 1572278 },
 89 | ]
 90 | 
 91 | [[package]]
 92 | name = "matplotlib-inline"
 93 | version = "0.1.7"
 94 | source = { registry = "https://pypi.org/simple" }
 95 | dependencies = [
 96 |     { name = "traitlets" },
 97 | ]
 98 | sdist = { url = "https://files.pythonhosted.org/packages/99/5b/a36a337438a14116b16480db471ad061c36c3694df7c2084a0da7ba538b7/matplotlib_inline-0.1.7.tar.gz", hash = "sha256:8423b23ec666be3d16e16b60bdd8ac4e86e840ebd1dd11a30b9f117f2fa0ab90", size = 8159 }
 99 | wheels = [
100 |     { url = "https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl", hash = "sha256:df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca", size = 9899 },
101 | ]
102 | 
103 | [[package]]
104 | name = "numpy"
105 | version = "2.2.2"
106 | source = { registry = "https://pypi.org/simple" }
107 | sdist = { url = "https://files.pythonhosted.org/packages/ec/d0/c12ddfd3a02274be06ffc71f3efc6d0e457b0409c4481596881e748cb264/numpy-2.2.2.tar.gz", hash = "sha256:ed6906f61834d687738d25988ae117683705636936cc605be0bb208b23df4d8f", size = 20233295 }
108 | wheels = [
109 |     { url = "https://files.pythonhosted.org/packages/0c/e6/847d15770ab7a01e807bdfcd4ead5bdae57c0092b7dc83878171b6af97bb/numpy-2.2.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ac9bea18d6d58a995fac1b2cb4488e17eceeac413af014b1dd26170b766d8467", size = 20912636 },
110 |     { url = "https://files.pythonhosted.org/packages/d1/af/f83580891577b13bd7e261416120e036d0d8fb508c8a43a73e38928b794b/numpy-2.2.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:23ae9f0c2d889b7b2d88a3791f6c09e2ef827c2446f1c4a3e3e76328ee4afd9a", size = 14098403 },
111 |     { url = "https://files.pythonhosted.org/packages/2b/86/d019fb60a9d0f1d4cf04b014fe88a9135090adfadcc31c1fadbb071d7fa7/numpy-2.2.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:3074634ea4d6df66be04f6728ee1d173cfded75d002c75fac79503a880bf3825", size = 5128938 },
112 |     { url = "https://files.pythonhosted.org/packages/7a/1b/50985edb6f1ec495a1c36452e860476f5b7ecdc3fc59ea89ccad3c4926c5/numpy-2.2.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:8ec0636d3f7d68520afc6ac2dc4b8341ddb725039de042faf0e311599f54eb37", size = 6661937 },
113 |     { url = "https://files.pythonhosted.org/packages/f4/1b/17efd94cad1b9d605c3f8907fb06bcffc4ce4d1d14d46b95316cccccf2b9/numpy-2.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2ffbb1acd69fdf8e89dd60ef6182ca90a743620957afb7066385a7bbe88dc748", size = 14049518 },
114 |     { url = "https://files.pythonhosted.org/packages/5b/73/65d2f0b698df1731e851e3295eb29a5ab8aa06f763f7e4188647a809578d/numpy-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0349b025e15ea9d05c3d63f9657707a4e1d471128a3b1d876c095f328f8ff7f0", size = 16099146 },
115 |     { url = "https://files.pythonhosted.org/packages/d5/69/308f55c0e19d4b5057b5df286c5433822e3c8039ede06d4051d96f1c2c4e/numpy-2.2.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:463247edcee4a5537841d5350bc87fe8e92d7dd0e8c71c995d2c6eecb8208278", size = 15246336 },
116 |     { url = "https://files.pythonhosted.org/packages/f0/d8/d8d333ad0d8518d077a21aeea7b7c826eff766a2b1ce1194dea95ca0bacf/numpy-2.2.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:9dd47ff0cb2a656ad69c38da850df3454da88ee9a6fde0ba79acceee0e79daba", size = 17863507 },
117 |     { url = "https://files.pythonhosted.org/packages/82/6e/0b84ad3103ffc16d6673e63b5acbe7901b2af96c2837174c6318c98e27ab/numpy-2.2.2-cp312-cp312-win32.whl", hash = "sha256:4525b88c11906d5ab1b0ec1f290996c0020dd318af8b49acaa46f198b1ffc283", size = 6276491 },
118 |     { url = "https://files.pythonhosted.org/packages/fc/84/7f801a42a67b9772a883223a0a1e12069a14626c81a732bd70aac57aebc1/numpy-2.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:5acea83b801e98541619af398cc0109ff48016955cc0818f478ee9ef1c5c3dcb", size = 12616372 },
119 |     { url = "https://files.pythonhosted.org/packages/e1/fe/df5624001f4f5c3e0b78e9017bfab7fdc18a8d3b3d3161da3d64924dd659/numpy-2.2.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b208cfd4f5fe34e1535c08983a1a6803fdbc7a1e86cf13dd0c61de0b51a0aadc", size = 20899188 },
120 |     { url = "https://files.pythonhosted.org/packages/a9/80/d349c3b5ed66bd3cb0214be60c27e32b90a506946857b866838adbe84040/numpy-2.2.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d0bbe7dd86dca64854f4b6ce2ea5c60b51e36dfd597300057cf473d3615f2369", size = 14113972 },
121 |     { url = "https://files.pythonhosted.org/packages/9d/50/949ec9cbb28c4b751edfa64503f0913cbfa8d795b4a251e7980f13a8a655/numpy-2.2.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:22ea3bb552ade325530e72a0c557cdf2dea8914d3a5e1fecf58fa5dbcc6f43cd", size = 5114294 },
122 |     { url = "https://files.pythonhosted.org/packages/8d/f3/399c15629d5a0c68ef2aa7621d430b2be22034f01dd7f3c65a9c9666c445/numpy-2.2.2-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:128c41c085cab8a85dc29e66ed88c05613dccf6bc28b3866cd16050a2f5448be", size = 6648426 },
123 |     { url = "https://files.pythonhosted.org/packages/2c/03/c72474c13772e30e1bc2e558cdffd9123c7872b731263d5648b5c49dd459/numpy-2.2.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:250c16b277e3b809ac20d1f590716597481061b514223c7badb7a0f9993c7f84", size = 14045990 },
124 |     { url = "https://files.pythonhosted.org/packages/83/9c/96a9ab62274ffafb023f8ee08c88d3d31ee74ca58869f859db6845494fa6/numpy-2.2.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0c8854b09bc4de7b041148d8550d3bd712b5c21ff6a8ed308085f190235d7ff", size = 16096614 },
125 |     { url = "https://files.pythonhosted.org/packages/d5/34/cd0a735534c29bec7093544b3a509febc9b0df77718a9b41ffb0809c9f46/numpy-2.2.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:b6fb9c32a91ec32a689ec6410def76443e3c750e7cfc3fb2206b985ffb2b85f0", size = 15242123 },
126 |     { url = "https://files.pythonhosted.org/packages/5e/6d/541717a554a8f56fa75e91886d9b79ade2e595918690eb5d0d3dbd3accb9/numpy-2.2.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:57b4012e04cc12b78590a334907e01b3a85efb2107df2b8733ff1ed05fce71de", size = 17859160 },
127 |     { url = "https://files.pythonhosted.org/packages/b9/a5/fbf1f2b54adab31510728edd06a05c1b30839f37cf8c9747cb85831aaf1b/numpy-2.2.2-cp313-cp313-win32.whl", hash = "sha256:4dbd80e453bd34bd003b16bd802fac70ad76bd463f81f0c518d1245b1c55e3d9", size = 6273337 },
128 |     { url = "https://files.pythonhosted.org/packages/56/e5/01106b9291ef1d680f82bc47d0c5b5e26dfed15b0754928e8f856c82c881/numpy-2.2.2-cp313-cp313-win_amd64.whl", hash = "sha256:5a8c863ceacae696aff37d1fd636121f1a512117652e5dfb86031c8d84836369", size = 12609010 },
129 |     { url = "https://files.pythonhosted.org/packages/9f/30/f23d9876de0f08dceb707c4dcf7f8dd7588266745029debb12a3cdd40be6/numpy-2.2.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:b3482cb7b3325faa5f6bc179649406058253d91ceda359c104dac0ad320e1391", size = 20924451 },
130 |     { url = "https://files.pythonhosted.org/packages/6a/ec/6ea85b2da9d5dfa1dbb4cb3c76587fc8ddcae580cb1262303ab21c0926c4/numpy-2.2.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:9491100aba630910489c1d0158034e1c9a6546f0b1340f716d522dc103788e39", size = 14122390 },
131 |     { url = "https://files.pythonhosted.org/packages/68/05/bfbdf490414a7dbaf65b10c78bc243f312c4553234b6d91c94eb7c4b53c2/numpy-2.2.2-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:41184c416143defa34cc8eb9d070b0a5ba4f13a0fa96a709e20584638254b317", size = 5156590 },
132 |     { url = "https://files.pythonhosted.org/packages/f7/ec/fe2e91b2642b9d6544518388a441bcd65c904cea38d9ff998e2e8ebf808e/numpy-2.2.2-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:7dca87ca328f5ea7dafc907c5ec100d187911f94825f8700caac0b3f4c384b49", size = 6671958 },
133 |     { url = "https://files.pythonhosted.org/packages/b1/6f/6531a78e182f194d33ee17e59d67d03d0d5a1ce7f6be7343787828d1bd4a/numpy-2.2.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0bc61b307655d1a7f9f4b043628b9f2b721e80839914ede634e3d485913e1fb2", size = 14019950 },
134 |     { url = "https://files.pythonhosted.org/packages/e1/fb/13c58591d0b6294a08cc40fcc6b9552d239d773d520858ae27f39997f2ae/numpy-2.2.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fad446ad0bc886855ddf5909cbf8cb5d0faa637aaa6277fb4b19ade134ab3c7", size = 16079759 },
135 |     { url = "https://files.pythonhosted.org/packages/2c/f2/f2f8edd62abb4b289f65a7f6d1f3650273af00b91b7267a2431be7f1aec6/numpy-2.2.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:149d1113ac15005652e8d0d3f6fd599360e1a708a4f98e43c9c77834a28238cb", size = 15226139 },
136 |     { url = "https://files.pythonhosted.org/packages/aa/29/14a177f1a90b8ad8a592ca32124ac06af5eff32889874e53a308f850290f/numpy-2.2.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:106397dbbb1896f99e044efc90360d098b3335060375c26aa89c0d8a97c5f648", size = 17856316 },
137 |     { url = "https://files.pythonhosted.org/packages/95/03/242ae8d7b97f4e0e4ab8dd51231465fb23ed5e802680d629149722e3faf1/numpy-2.2.2-cp313-cp313t-win32.whl", hash = "sha256:0eec19f8af947a61e968d5429f0bd92fec46d92b0008d0a6685b40d6adf8a4f4", size = 6329134 },
138 |     { url = "https://files.pythonhosted.org/packages/80/94/cd9e9b04012c015cb6320ab3bf43bc615e248dddfeb163728e800a5d96f0/numpy-2.2.2-cp313-cp313t-win_amd64.whl", hash = "sha256:97b974d3ba0fb4612b77ed35d7627490e8e3dff56ab41454d9e8b23448940576", size = 12696208 },
139 | ]
140 | 
141 | [[package]]
142 | name = "pandas"
143 | version = "2.2.3"
144 | source = { registry = "https://pypi.org/simple" }
145 | dependencies = [
146 |     { name = "numpy" },
147 |     { name = "python-dateutil" },
148 |     { name = "pytz" },
149 |     { name = "tzdata" },
150 | ]
151 | sdist = { url = "https://files.pythonhosted.org/packages/9c/d6/9f8431bacc2e19dca897724cd097b1bb224a6ad5433784a44b587c7c13af/pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667", size = 4399213 }
152 | wheels = [
153 |     { url = "https://files.pythonhosted.org/packages/17/a3/fb2734118db0af37ea7433f57f722c0a56687e14b14690edff0cdb4b7e58/pandas-2.2.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9", size = 12529893 },
154 |     { url = "https://files.pythonhosted.org/packages/e1/0c/ad295fd74bfac85358fd579e271cded3ac969de81f62dd0142c426b9da91/pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4", size = 11363475 },
155 |     { url = "https://files.pythonhosted.org/packages/c6/2a/4bba3f03f7d07207481fed47f5b35f556c7441acddc368ec43d6643c5777/pandas-2.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3", size = 15188645 },
156 |     { url = "https://files.pythonhosted.org/packages/38/f8/d8fddee9ed0d0c0f4a2132c1dfcf0e3e53265055da8df952a53e7eaf178c/pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fffb8ae78d8af97f849404f21411c95062db1496aeb3e56f146f0355c9989319", size = 12739445 },
157 |     { url = "https://files.pythonhosted.org/packages/20/e8/45a05d9c39d2cea61ab175dbe6a2de1d05b679e8de2011da4ee190d7e748/pandas-2.2.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dfcb5ee8d4d50c06a51c2fffa6cff6272098ad6540aed1a76d15fb9318194d8", size = 16359235 },
158 |     { url = "https://files.pythonhosted.org/packages/1d/99/617d07a6a5e429ff90c90da64d428516605a1ec7d7bea494235e1c3882de/pandas-2.2.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a", size = 14056756 },
159 |     { url = "https://files.pythonhosted.org/packages/29/d4/1244ab8edf173a10fd601f7e13b9566c1b525c4f365d6bee918e68381889/pandas-2.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13", size = 11504248 },
160 |     { url = "https://files.pythonhosted.org/packages/64/22/3b8f4e0ed70644e85cfdcd57454686b9057c6c38d2f74fe4b8bc2527214a/pandas-2.2.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f00d1345d84d8c86a63e476bb4955e46458b304b9575dcf71102b5c705320015", size = 12477643 },
161 |     { url = "https://files.pythonhosted.org/packages/e4/93/b3f5d1838500e22c8d793625da672f3eec046b1a99257666c94446969282/pandas-2.2.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3508d914817e153ad359d7e069d752cdd736a247c322d932eb89e6bc84217f28", size = 11281573 },
162 |     { url = "https://files.pythonhosted.org/packages/f5/94/6c79b07f0e5aab1dcfa35a75f4817f5c4f677931d4234afcd75f0e6a66ca/pandas-2.2.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22a9d949bfc9a502d320aa04e5d02feab689d61da4e7764b62c30b991c42c5f0", size = 15196085 },
163 |     { url = "https://files.pythonhosted.org/packages/e8/31/aa8da88ca0eadbabd0a639788a6da13bb2ff6edbbb9f29aa786450a30a91/pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24", size = 12711809 },
164 |     { url = "https://files.pythonhosted.org/packages/ee/7c/c6dbdb0cb2a4344cacfb8de1c5808ca885b2e4dcfde8008266608f9372af/pandas-2.2.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:800250ecdadb6d9c78eae4990da62743b857b470883fa27f652db8bdde7f6659", size = 16356316 },
165 |     { url = "https://files.pythonhosted.org/packages/57/b7/8b757e7d92023b832869fa8881a992696a0bfe2e26f72c9ae9f255988d42/pandas-2.2.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6374c452ff3ec675a8f46fd9ab25c4ad0ba590b71cf0656f8b6daa5202bca3fb", size = 14022055 },
166 |     { url = "https://files.pythonhosted.org/packages/3b/bc/4b18e2b8c002572c5a441a64826252ce5da2aa738855747247a971988043/pandas-2.2.3-cp313-cp313-win_amd64.whl", hash = "sha256:61c5ad4043f791b61dd4752191d9f07f0ae412515d59ba8f005832a532f8736d", size = 11481175 },
167 |     { url = "https://files.pythonhosted.org/packages/76/a3/a5d88146815e972d40d19247b2c162e88213ef51c7c25993942c39dbf41d/pandas-2.2.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:3b71f27954685ee685317063bf13c7709a7ba74fc996b84fc6821c59b0f06468", size = 12615650 },
168 |     { url = "https://files.pythonhosted.org/packages/9c/8c/f0fd18f6140ddafc0c24122c8a964e48294acc579d47def376fef12bcb4a/pandas-2.2.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:38cf8125c40dae9d5acc10fa66af8ea6fdf760b2714ee482ca691fc66e6fcb18", size = 11290177 },
169 |     { url = "https://files.pythonhosted.org/packages/ed/f9/e995754eab9c0f14c6777401f7eece0943840b7a9fc932221c19d1abee9f/pandas-2.2.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ba96630bc17c875161df3818780af30e43be9b166ce51c9a18c1feae342906c2", size = 14651526 },
170 |     { url = "https://files.pythonhosted.org/packages/25/b0/98d6ae2e1abac4f35230aa756005e8654649d305df9a28b16b9ae4353bff/pandas-2.2.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4", size = 11871013 },
171 |     { url = "https://files.pythonhosted.org/packages/cc/57/0f72a10f9db6a4628744c8e8f0df4e6e21de01212c7c981d31e50ffc8328/pandas-2.2.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d", size = 15711620 },
172 |     { url = "https://files.pythonhosted.org/packages/ab/5f/b38085618b950b79d2d9164a711c52b10aefc0ae6833b96f626b7021b2ed/pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a", size = 13098436 },
173 | ]
174 | 
175 | [[package]]
176 | name = "parso"
177 | version = "0.8.4"
178 | source = { registry = "https://pypi.org/simple" }
179 | sdist = { url = "https://files.pythonhosted.org/packages/66/94/68e2e17afaa9169cf6412ab0f28623903be73d1b32e208d9e8e541bb086d/parso-0.8.4.tar.gz", hash = "sha256:eb3a7b58240fb99099a345571deecc0f9540ea5f4dd2fe14c2a99d6b281ab92d", size = 400609 }
180 | wheels = [
181 |     { url = "https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl", hash = "sha256:a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18", size = 103650 },
182 | ]
183 | 
184 | [[package]]
185 | name = "pexpect"
186 | version = "4.9.0"
187 | source = { registry = "https://pypi.org/simple" }
188 | dependencies = [
189 |     { name = "ptyprocess" },
190 | ]
191 | sdist = { url = "https://files.pythonhosted.org/packages/42/92/cc564bf6381ff43ce1f4d06852fc19a2f11d180f23dc32d9588bee2f149d/pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f", size = 166450 }
192 | wheels = [
193 |     { url = "https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", size = 63772 },
194 | ]
195 | 
196 | [[package]]
197 | name = "prompt-toolkit"
198 | version = "3.0.50"
199 | source = { registry = "https://pypi.org/simple" }
200 | dependencies = [
201 |     { name = "wcwidth" },
202 | ]
203 | sdist = { url = "https://files.pythonhosted.org/packages/a1/e1/bd15cb8ffdcfeeb2bdc215de3c3cffca11408d829e4b8416dcfe71ba8854/prompt_toolkit-3.0.50.tar.gz", hash = "sha256:544748f3860a2623ca5cd6d2795e7a14f3d0e1c3c9728359013f79877fc89bab", size = 429087 }
204 | wheels = [
205 |     { url = "https://files.pythonhosted.org/packages/e4/ea/d836f008d33151c7a1f62caf3d8dd782e4d15f6a43897f64480c2b8de2ad/prompt_toolkit-3.0.50-py3-none-any.whl", hash = "sha256:9b6427eb19e479d98acff65196a307c555eb567989e6d88ebbb1b509d9779198", size = 387816 },
206 | ]
207 | 
208 | [[package]]
209 | name = "ptyprocess"
210 | version = "0.7.0"
211 | source = { registry = "https://pypi.org/simple" }
212 | sdist = { url = "https://files.pythonhosted.org/packages/20/e5/16ff212c1e452235a90aeb09066144d0c5a6a8c0834397e03f5224495c4e/ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220", size = 70762 }
213 | wheels = [
214 |     { url = "https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35", size = 13993 },
215 | ]
216 | 
217 | [[package]]
218 | name = "pure-eval"
219 | version = "0.2.3"
220 | source = { registry = "https://pypi.org/simple" }
221 | sdist = { url = "https://files.pythonhosted.org/packages/cd/05/0a34433a064256a578f1783a10da6df098ceaa4a57bbeaa96a6c0352786b/pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42", size = 19752 }
222 | wheels = [
223 |     { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842 },
224 | ]
225 | 
226 | [[package]]
227 | name = "pyarrow"
228 | version = "19.0.0"
229 | source = { registry = "https://pypi.org/simple" }
230 | sdist = { url = "https://files.pythonhosted.org/packages/7b/01/fe1fd04744c2aa038e5a11c7a4adb3d62bce09798695e54f7274b5977134/pyarrow-19.0.0.tar.gz", hash = "sha256:8d47c691765cf497aaeed4954d226568563f1b3b74ff61139f2d77876717084b", size = 1129096 }
231 | wheels = [
232 |     { url = "https://files.pythonhosted.org/packages/bc/2e/152885f5ef421e80dae68b9c133ab261934f93a6d5e16b61d79c0ed597fb/pyarrow-19.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:a7bbe7109ab6198688b7079cbad5a8c22de4d47c4880d8e4847520a83b0d1b68", size = 30667964 },
233 |     { url = "https://files.pythonhosted.org/packages/80/c2/08bbee9a8610a47c9a1466845f405baf53a639ddd947c5133d8ba13544b6/pyarrow-19.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:4624c89d6f777c580e8732c27bb8e77fd1433b89707f17c04af7635dd9638351", size = 32125039 },
234 |     { url = "https://files.pythonhosted.org/packages/d2/56/06994df823212f5688d3c8bf4294928b12c9be36681872853655724d28c6/pyarrow-19.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b6d3ce4288793350dc2d08d1e184fd70631ea22a4ff9ea5c4ff182130249d9b", size = 41140729 },
235 |     { url = "https://files.pythonhosted.org/packages/94/65/38ad577c98140a9db71e9e1e594b6adb58a7478a5afec6456a8ca2df7f70/pyarrow-19.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:450a7d27e840e4d9a384b5c77199d489b401529e75a3b7a3799d4cd7957f2f9c", size = 42202267 },
236 |     { url = "https://files.pythonhosted.org/packages/b6/1f/966b722251a7354114ccbb71cf1a83922023e69efd8945ebf628a851ec4c/pyarrow-19.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:a08e2a8a039a3f72afb67a6668180f09fddaa38fe0d21f13212b4aba4b5d2451", size = 40505858 },
237 |     { url = "https://files.pythonhosted.org/packages/3b/5e/6bc81aa7fc9affc7d1c03b912fbcc984ca56c2a18513684da267715dab7b/pyarrow-19.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:f43f5aef2a13d4d56adadae5720d1fed4c1356c993eda8b59dace4b5983843c1", size = 42084973 },
238 |     { url = "https://files.pythonhosted.org/packages/53/c3/2f56da818b6a4758cbd514957c67bd0f078ebffa5390ee2e2bf0f9e8defc/pyarrow-19.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:2f672f5364b2d7829ef7c94be199bb88bf5661dd485e21d2d37de12ccb78a136", size = 25241976 },
239 |     { url = "https://files.pythonhosted.org/packages/f5/b9/ba07ed3dd6b6e4f379b78e9c47c50c8886e07862ab7fa6339ac38622d755/pyarrow-19.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:cf3bf0ce511b833f7bc5f5bb3127ba731e97222023a444b7359f3a22e2a3b463", size = 30651291 },
240 |     { url = "https://files.pythonhosted.org/packages/ad/10/0d304243c8277035298a68a70807efb76199c6c929bb3363c92ac9be6a0d/pyarrow-19.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:4d8b0c0de0a73df1f1bf439af1b60f273d719d70648e898bc077547649bb8352", size = 32100461 },
241 |     { url = "https://files.pythonhosted.org/packages/8a/61/bcfc5182e11831bca3f849945b9b106e09fd10ded773dff466658e972a45/pyarrow-19.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a92aff08e23d281c69835e4a47b80569242a504095ef6a6223c1f6bb8883431d", size = 41132491 },
242 |     { url = "https://files.pythonhosted.org/packages/8e/87/2915a29049ec352dc69a967fbcbd76b0180319233de0daf8bd368df37099/pyarrow-19.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3b78eff5968a1889a0f3bc81ca57e1e19b75f664d9c61a42a604bf9d8402aae", size = 42192529 },
243 |     { url = "https://files.pythonhosted.org/packages/48/18/44e5542b2707a8afaf78b5b88c608f261871ae77787eac07b7c679ca6f0f/pyarrow-19.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:b34d3bde38eba66190b215bae441646330f8e9da05c29e4b5dd3e41bde701098", size = 40495363 },
244 |     { url = "https://files.pythonhosted.org/packages/ba/d6/5096deb7599bbd20bc2768058fe23bc725b88eb41bee58303293583a2935/pyarrow-19.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:5418d4d0fab3a0ed497bad21d17a7973aad336d66ad4932a3f5f7480d4ca0c04", size = 42074075 },
245 |     { url = "https://files.pythonhosted.org/packages/2c/df/e3c839c04c284c9ec3d62b02a8c452b795d9b07b04079ab91ce33484d4c5/pyarrow-19.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:e82c3d5e44e969c217827b780ed8faf7ac4c53f934ae9238872e749fa531f7c9", size = 25239803 },
246 |     { url = "https://files.pythonhosted.org/packages/6a/d3/a6d4088e906c7b5d47792256212606d2ae679046dc750eee0ae167338e5c/pyarrow-19.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:f208c3b58a6df3b239e0bb130e13bc7487ed14f39a9ff357b6415e3f6339b560", size = 30695401 },
247 |     { url = "https://files.pythonhosted.org/packages/94/25/70040fd0e397dd1b937f459eaeeec942a76027357491dca0ada09d1322af/pyarrow-19.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:c751c1c93955b7a84c06794df46f1cec93e18610dcd5ab7d08e89a81df70a849", size = 32104680 },
248 |     { url = "https://files.pythonhosted.org/packages/4e/f9/92783290cc0d80ca16d34b0c126305bfacca4b87dd889c8f16c6ef2a8fd7/pyarrow-19.0.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b903afaa5df66d50fc38672ad095806443b05f202c792694f3a604ead7c6ea6e", size = 41076754 },
249 |     { url = "https://files.pythonhosted.org/packages/05/46/2c9870f50a495c72e2b8982ae29a9b1680707ea936edc0de444cec48f875/pyarrow-19.0.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a22a4bc0937856263df8b94f2f2781b33dd7f876f787ed746608e06902d691a5", size = 42163133 },
250 |     { url = "https://files.pythonhosted.org/packages/7b/2f/437922b902549228fb15814e8a26105bff2787ece466a8d886eb6699efad/pyarrow-19.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:5e8a28b918e2e878c918f6d89137386c06fe577cd08d73a6be8dafb317dc2d73", size = 40452210 },
251 |     { url = "https://files.pythonhosted.org/packages/36/ef/1d7975053af9d106da973bac142d0d4da71b7550a3576cc3e0b3f444d21a/pyarrow-19.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:29cd86c8001a94f768f79440bf83fee23963af5e7bc68ce3a7e5f120e17edf89", size = 42077618 },
252 | ]
253 | 
254 | [[package]]
255 | name = "pygments"
256 | version = "2.19.1"
257 | source = { registry = "https://pypi.org/simple" }
258 | sdist = { url = "https://files.pythonhosted.org/packages/7c/2d/c3338d48ea6cc0feb8446d8e6937e1408088a72a39937982cc6111d17f84/pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f", size = 4968581 }
259 | wheels = [
260 |     { url = "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", size = 1225293 },
261 | ]
262 | 
263 | [[package]]
264 | name = "python-dateutil"
265 | version = "2.9.0.post0"
266 | source = { registry = "https://pypi.org/simple" }
267 | dependencies = [
268 |     { name = "six" },
269 | ]
270 | sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432 }
271 | wheels = [
272 |     { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892 },
273 | ]
274 | 
275 | [[package]]
276 | name = "pytz"
277 | version = "2024.2"
278 | source = { registry = "https://pypi.org/simple" }
279 | sdist = { url = "https://files.pythonhosted.org/packages/3a/31/3c70bf7603cc2dca0f19bdc53b4537a797747a58875b552c8c413d963a3f/pytz-2024.2.tar.gz", hash = "sha256:2aa355083c50a0f93fa581709deac0c9ad65cca8a9e9beac660adcbd493c798a", size = 319692 }
280 | wheels = [
281 |     { url = "https://files.pythonhosted.org/packages/11/c3/005fcca25ce078d2cc29fd559379817424e94885510568bc1bc53d7d5846/pytz-2024.2-py2.py3-none-any.whl", hash = "sha256:31c7c1817eb7fae7ca4b8c7ee50c72f93aa2dd863de768e1ef4245d426aa0725", size = 508002 },
282 | ]
283 | 
284 | [[package]]
285 | name = "six"
286 | version = "1.17.0"
287 | source = { registry = "https://pypi.org/simple" }
288 | sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031 }
289 | wheels = [
290 |     { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050 },
291 | ]
292 | 
293 | [[package]]
294 | name = "stack-data"
295 | version = "0.6.3"
296 | source = { registry = "https://pypi.org/simple" }
297 | dependencies = [
298 |     { name = "asttokens" },
299 |     { name = "executing" },
300 |     { name = "pure-eval" },
301 | ]
302 | sdist = { url = "https://files.pythonhosted.org/packages/28/e3/55dcc2cfbc3ca9c29519eb6884dd1415ecb53b0e934862d3559ddcb7e20b/stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9", size = 44707 }
303 | wheels = [
304 |     { url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521 },
305 | ]
306 | 
307 | [[package]]
308 | name = "tabulate"
309 | version = "0.9.0"
310 | source = { registry = "https://pypi.org/simple" }
311 | sdist = { url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c", size = 81090 }
312 | wheels = [
313 |     { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252 },
314 | ]
315 | 
316 | [[package]]
317 | name = "traitlets"
318 | version = "5.14.3"
319 | source = { registry = "https://pypi.org/simple" }
320 | sdist = { url = "https://files.pythonhosted.org/packages/eb/79/72064e6a701c2183016abbbfedaba506d81e30e232a68c9f0d6f6fcd1574/traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7", size = 161621 }
321 | wheels = [
322 |     { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359 },
323 | ]
324 | 
325 | [[package]]
326 | name = "tzdata"
327 | version = "2025.1"
328 | source = { registry = "https://pypi.org/simple" }
329 | sdist = { url = "https://files.pythonhosted.org/packages/43/0f/fa4723f22942480be4ca9527bbde8d43f6c3f2fe8412f00e7f5f6746bc8b/tzdata-2025.1.tar.gz", hash = "sha256:24894909e88cdb28bd1636c6887801df64cb485bd593f2fd83ef29075a81d694", size = 194950 }
330 | wheels = [
331 |     { url = "https://files.pythonhosted.org/packages/0f/dd/84f10e23edd882c6f968c21c2434fe67bd4a528967067515feca9e611e5e/tzdata-2025.1-py2.py3-none-any.whl", hash = "sha256:7e127113816800496f027041c570f50bcd464a020098a3b6b199517772303639", size = 346762 },
332 | ]
333 | 
334 | [[package]]
335 | name = "wcwidth"
336 | version = "0.2.13"
337 | source = { registry = "https://pypi.org/simple" }
338 | sdist = { url = "https://files.pythonhosted.org/packages/6c/63/53559446a878410fc5a5974feb13d31d78d752eb18aeba59c7fef1af7598/wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5", size = 101301 }
339 | wheels = [
340 |     { url = "https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859", size = 34166 },
341 | ]
342 | 


--------------------------------------------------------------------------------
/GO_terms_search/4-GO-terms-search-analysis.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "02b2641f",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "#### Here we investigate the relationship between:\n",
  9 |     "    - mRNA level predictability of a landmark gene \n",
 10 |     "    and \n",
 11 |     "    - its known organelle level biological function using GO annotations"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": null,
 17 |    "id": "367cc88b",
 18 |    "metadata": {},
 19 |    "outputs": [],
 20 |    "source": [
 21 |     "import pandas as pd\n",
 22 |     "import numpy as np\n",
 23 |     "def locations_of_substring(string, substring):\n",
 24 |     "    \"\"\"Return a list of locations of a substring.\"\"\"\n",
 25 |     "    substring_length = len(substring)    \n",
 26 |     "    def recurse(locations_found, start):\n",
 27 |     "        location = string.find(substring, start)\n",
 28 |     "        if location != -1:\n",
 29 |     "            return recurse(locations_found + [location], location+substring_length)\n",
 30 |     "        else:\n",
 31 |     "            return locations_found\n",
 32 |     "    return recurse([], 0)"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "markdown",
 37 |    "id": "d80e3f38",
 38 |    "metadata": {},
 39 |    "source": [
 40 |     "#### For LUAD dataset:\n",
 41 |     "\n",
 42 |     "1 - Read predictability map of categorical features (using MLP model)\n",
 43 |     "\n",
 44 |     "2 - Assign the feature categories to compartments/stains\n",
 45 |     "\n",
 46 |     "3 - Read functional annotations of the reference set according to DAVID's output and add a column for each channel\n",
 47 |     "   - Add channel-specific annotations to each channel's column\n",
 48 |     "   "
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "code",
 53 |    "execution_count": null,
 54 |    "id": "00bc67ee",
 55 |    "metadata": {},
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "import sys\n",
 59 |     "sys.path.insert(0, '../utils/') \n",
 60 |     "from readProfiles import rename_affyprobe_to_genename\n",
 61 |     "from saveAsNewSheetToExistingFile import saveAsNewSheetToExistingFile\n",
 62 |     "########### 1 ###########\n",
 63 |     "filename='../results/SingleGenePred_cpCategoryMap/cat_scores_maps.xlsx'\n",
 64 |     "saved_scores=pd.read_excel(filename, sheet_name=None)\n",
 65 |     "# which_ds_model='LUAD-9-MLP-ht'\n",
 66 |     "which_ds_model='LUAD-9-MLP-keras-ht'\n",
 67 |     "# which_ds_model='LUAD-9-lasso-ht'\n",
 68 |     "dfcats=saved_scores[which_ds_model].rename(columns={'Unnamed: 0':'ID'})\n",
 69 |     "dfcats=dfcats[dfcats.columns[~dfcats.isna().any()].tolist()]\n",
 70 |     "dfcats2,_=rename_affyprobe_to_genename(dfcats.set_index('ID').T,dfcats.ID.tolist())\n",
 71 |     "dfcats=dfcats2.T.reset_index()\n",
 72 |     "\n",
 73 |     "\n",
 74 |     "########### 2 ###########\n",
 75 |     "Channelss=['DNA','RNA','AGP','Mito','ER']\n",
 76 |     "Channelss_cats=['DNA|Nuclei_AreaShape','RNA','AGP|Cytoplasm_AreaShape|Cells_AreaShape','Mito','ER']\n",
 77 |     "# Channelss_cats=['DNA|Nuclei_AreaShape','RNA','AGP|Cytoplasm_AreaShape','Mito','ER']\n",
 78 |     "\n",
 79 |     "for ci in range(len(Channelss)):\n",
 80 |     "    dfcats['max_'+Channelss[ci]]=dfcats.loc[:,dfcats.columns.str.contains(Channelss_cats[ci])].max(axis=1)\n",
 81 |     "\n",
 82 |     "    \n",
 83 |     "dfcats['top_channel']=dfcats[['max_'+Channelss[ci] for ci in range(len(Channelss))]].idxmax(axis=\"columns\")\n",
 84 |     "########### 3 ###########\n",
 85 |     "# gene_cats_bpcc=pd.read_csv('./go_BP_CC_MF_DIRECT_921.txt',delimiter='\\t')\n",
 86 |     "gene_cats_bpcc=pd.read_csv('./source/GO_bp_cc_mf_direct_LUAD_976.txt',delimiter='\\t')\n",
 87 |     "comps=['mitochondri','golgi','membrane','cytoskeleton','actin','endoplasmic','rna','nucleol',\\\n",
 88 |     "       'cell division','mitosis','mitotic','cell cycle']\n",
 89 |     "\n",
 90 |     "# GOTERM_BP_DIRECT\n",
 91 |     "# GOTERM_CC_DIRECT\n",
 92 |     "# GOTERM_MF_DIRECT\n",
 93 |     "for c in comps:\n",
 94 |     "    gene_cats_bpcc[c]=gene_cats_bpcc['GOTERM_CC_DIRECT'].astype(str).str.lower().apply(lambda x:\\\n",
 95 |     "    ''.join([x[:si].split('~')[-1]+x[si:].split('go')[0] for si in locations_of_substring(x,c)]) if c in x else '')+\\\n",
 96 |     "    gene_cats_bpcc['GOTERM_MF_DIRECT'].astype(str).str.lower().apply(lambda x:\\\n",
 97 |     "    ''.join([x[:si].split('~')[-1]+x[si:].split('go')[0] for si in locations_of_substring(x,c)]) if c in x else '')+\\\n",
 98 |     "    gene_cats_bpcc['GOTERM_BP_DIRECT'].astype(str).str.lower().apply(lambda x:\\\n",
 99 |     "    ''.join([x[:si].split('~')[-1]+x[si:].split('go')[0] for si in locations_of_substring(x,c)]) if c in x else '')#+\\\n",
100 |     "#     gene_cats_bpcc['UP_KW_BIOLOGICAL_PROCESS'].astype(str).str.lower().apply(lambda x:\\\n",
101 |     "#     ''.join([x[:si].split('~')[-1]+x[si:].split('kw')[0] for si in locations_of_substring(x,c)]) if c in x else '')+\\\n",
102 |     "#     gene_cats_bpcc['UP_KW_CELLULAR_COMPONENT'].astype(str).str.lower().apply(lambda x:\\\n",
103 |     "#     ''.join([x[:si].split('~')[-1]+x[si:].split('kw')[0] for si in locations_of_substring(x,c)]) if c in x else '')+\\\n",
104 |     "#     gene_cats_bpcc['UP_KW_MOLECULAR_FUNCTION'].astype(str).str.lower().apply(lambda x:\\\n",
105 |     "#     ''.join([x[:si].split('~')[-1]+x[si:].split('kw')[0] for si in locations_of_substring(x,c)]) if c in x else '')#+\\\n",
106 |     "# #     gene_cats_bpcc['UP_SEQ_FEATURE'].astype(str).str.lower().apply(lambda x:\\\n",
107 |     "#     ''.join([x[:si].split(':')[-1]+','+x[si:].split(',')[0] for si in locations_of_substring(x,c)]) if c in x else '')\n",
108 |     "    \n",
109 |     "gene_cats_bpcc['RNA_nucleoli']=gene_cats_bpcc['rna']+gene_cats_bpcc['nucleol']\n",
110 |     "gene_cats_bpcc['DNA']=gene_cats_bpcc['cell division']+gene_cats_bpcc['mitosis']+\\\n",
111 |     "gene_cats_bpcc['mitotic']+gene_cats_bpcc['cell cycle']\n",
112 |     "\n",
113 |     "gene_cats_bpcc['cytoskeleton-actin']=gene_cats_bpcc['cytoskeleton']+gene_cats_bpcc['actin']+\\\n",
114 |     "gene_cats_bpcc['golgi']+gene_cats_bpcc['membrane']\n",
115 |     "# gene_cats_bpcc['ER']=gene_cats_bpcc['endoplasmic']\n",
116 |     "gene_cats_bpcc['mitochondria']=gene_cats_bpcc['mitochondri']\n",
117 |     "\n",
118 |     "gene_cats_bpcc=pd.merge(gene_cats_bpcc,dfcats,how='inner',on='ID')\n",
119 |     "\n",
120 |     "#########################\n",
121 |     "Channelss_dict={'DNA':'DNA','RNA_nucleoli':'RNA','cytoskeleton-actin':'AGP','mitochondria':'Mito','endoplasmic':'ER'}\n",
122 |     "Chan_rev_dict = dict(zip(Channelss_dict.values(),Channelss_dict.keys()))\n",
123 |     "\n",
124 |     "comps2=['mitochondri','cytoskeleton-actin','endoplasmic','RNA_nucleoli','DNA']\n",
125 |     "gene_cats_bpcc['any_comps']=''\n",
126 |     "for co in comps2:\n",
127 |     "    gene_cats_bpcc['any_comps']=gene_cats_bpcc['any_comps']+gene_cats_bpcc[co]\n",
128 |     "        \n",
129 |     "        \n",
130 |     "        \n",
131 |     "# gene_cats_bpcc=gene_cats_bpcc[gene_cats_bpcc['any_comps']!=''].reset_index(drop=True)        \n",
132 |     "from sklearn.metrics import confusion_matrix\n",
133 |     "from scipy.stats import fisher_exact\n",
134 |     "# top_bool=(gene_cats_bpcc['top58']==True).values\n",
135 |     "table2=pd.DataFrame(index=Channelss_dict.keys(),columns=Channelss_dict.values())\n",
136 |     "table3=pd.DataFrame(index=Channelss,columns=['odds ratio','med_restComp_oddsratio','restComp_oddsratio','anyComp_oddsratio','top-ratio'])\n",
137 |     "# table3=pd.DataFrame(index=Channelss,columns=['Prevalence','anyComp-Prevalence','noComp-Prevalence','top-ratio'])\n",
138 |     "\n",
139 |     "table=[]\n",
140 |     "for c in Channelss:\n",
141 |     "    print(c)\n",
142 |     "    c_r=list(set(Channelss)-set([c]))\n",
143 |     "#     gene_cats_bpcc_highP=gene_cats_bpcc[(gene_cats_bpcc['max_'+c]>0.7)]\n",
144 |     "#     gene_cats_bpcc_lowP=gene_cats_bpcc[(gene_cats_bpcc['max_'+c]<0.1)]\n",
145 |     "    \n",
146 |     "    low_ind=gene_cats_bpcc[['max_'+c]].sort_values(by='max_'+c)[:100].index\n",
147 |     "    high_ind=gene_cats_bpcc[['max_'+c]].sort_values(by='max_'+c)[-100:].index    \n",
148 |     "    gene_cats_bpcc_highP=gene_cats_bpcc.loc[high_ind].reset_index(drop=True)\n",
149 |     "    gene_cats_bpcc_lowP=gene_cats_bpcc.loc[low_ind].reset_index(drop=True)\n",
150 |     "    gene_cats_bpcc2=gene_cats_bpcc.copy()\n",
151 |     "    \n",
152 |     "#     top_bool=(gene_cats_bpcc['max_'+c]>.6).values\n",
153 |     "    top_bool=(gene_cats_bpcc['max_'+c]<0).values\n",
154 |     "    \n",
155 |     "    print(np.sum(gene_cats_bpcc.loc[gene_cats_bpcc['max_'+c]<0,['max_'+Channelss[ci] for ci in range(len(Channelss))]].max(axis=1)>0.3))\n",
156 |     "    n_top=sum(top_bool)\n",
157 |     "    print(n_top)\n",
158 |     "    \n",
159 |     "    comps2=['mitochondria','cytoskeleton-actin','endoplasmic','RNA_nucleoli','DNA']\n",
160 |     "    table1=pd.DataFrame(index=comps2+['any comp','no comp'],columns=['Prevalence','p-value','odds ratio'])\n",
161 |     "    for co in comps2:\n",
162 |     "        \n",
163 |     "        enr_ratio=gene_cats_bpcc2[top_bool & (gene_cats_bpcc2[co]!='')].shape[0]/\\\n",
164 |     "        gene_cats_bpcc2[(gene_cats_bpcc2[co]!='')].shape[0]\n",
165 |     "        comp_bool=(gene_cats_bpcc2[co]!='').values\n",
166 |     "        oddsratio, pvalue = fisher_exact(confusion_matrix(top_bool, comp_bool))\n",
167 |     "        \n",
168 |     "        table1.loc[co,['Prevalence','p-value','odds ratio']]=enr_ratio*100,pvalue,oddsratio\n",
169 |     "#         print(co,\": \",np.round(enr_ratio*100,2),'%     ','pvalue:', np.round(pvalue,2),'  oddsratio:',np.round(oddsratio,2))\n",
170 |     "        table2.loc[co,c]=oddsratio\n",
171 |     "         \n",
172 |     "    \n",
173 |     "    enr_ratio=gene_cats_bpcc[top_bool & (gene_cats_bpcc['any_comps']!='')].shape[0]/\\\n",
174 |     "    gene_cats_bpcc[(gene_cats_bpcc['any_comps']!='')].shape[0]\n",
175 |     "    any_oddsratio, pvalue = fisher_exact(confusion_matrix(top_bool, (gene_cats_bpcc['any_comps']!='').values))\n",
176 |     "#     print(\"any_comps: \",np.round(enr_ratio*100,2),'%     ','pvalue:', np.round(pvalue,2),'  oddsratio:',np.round(oddsratio,2))\n",
177 |     "    table1.loc['any comp',['Prevalence','p-value','odds ratio']]=enr_ratio*100,pvalue,any_oddsratio\n",
178 |     "    print(any_oddsratio)\n",
179 |     "\n",
180 |     "    nocomp_enr_ratio=gene_cats_bpcc[(top_bool) & (gene_cats_bpcc['any_comps']=='')].shape[0]/\\\n",
181 |     "gene_cats_bpcc[(gene_cats_bpcc['any_comps']=='')].shape[0]\n",
182 |     "\n",
183 |     "    comps2=['mitochondria','cytoskeleton-actin','endoplasmic','RNA_nucleoli','DNA']\n",
184 |     "    comps2.remove(Chan_rev_dict[c])\n",
185 |     "    \n",
186 |     "    med_restComp_oddsratio=table2.loc[comps2,c].median()\n",
187 |     "    gene_cats_bpcc['rest_comps']=''\n",
188 |     "    for co in comps2:\n",
189 |     "        gene_cats_bpcc['rest_comps']=gene_cats_bpcc['rest_comps']+gene_cats_bpcc[co]    \n",
190 |     "    \n",
191 |     "    rest_enr_ratio=gene_cats_bpcc[(top_bool) & (gene_cats_bpcc['rest_comps']=='')].shape[0]/\\\n",
192 |     "gene_cats_bpcc[(gene_cats_bpcc['rest_comps']=='')].shape[0]    \n",
193 |     "    rest_oddsratio, pvalue = fisher_exact(confusion_matrix(top_bool, (gene_cats_bpcc['rest_comps']!='').values))\n",
194 |     "    \n",
195 |     "    table.append(table1)\n",
196 |     "#     print('num top ('+ str(n_top)+')/total genes (912): ', np.round((n_top/912)*100,2),'%')\n",
197 |     "    table3.loc[c,['odds ratio','med_restComp_oddsratio','restComp_oddsratio','anyComp_oddsratio','top-ratio']]=\\\n",
198 |     "    table1.loc[Chan_rev_dict[c],'odds ratio'],med_restComp_oddsratio,rest_oddsratio,\\\n",
199 |     "    any_oddsratio,np.round((n_top/912)*100,2)\n",
200 |     "    \n",
201 |     "#     table3.loc[c,['Prevalence','anyComp-Prevalence','noComp-Prevalence','top-ratio']]=table1.loc[Chan_rev_dict[c],'Prevalence'],\\\n",
202 |     "#     enr_ratio*100,nocomp_enr_ratio*100,np.round((n_top/912)*100,2)    \n",
203 |     "    \n",
204 |     "#     print(Chan_rev_dict[c],': ',table1.loc[Chan_rev_dict[c],['Prevalence']].values)\n",
205 |     "    \n",
206 |     "# # table3['dif']=table3['Prevalence']-table3['anyComp-Prevalence']\n",
207 |     "table3['dif']=table3['odds ratio']-table3['restComp_oddsratio']\n",
208 |     "table3['dif2']=table3['odds ratio']-table3['med_restComp_oddsratio']\n",
209 |     "table3['dif3']=table3['odds ratio']-table3['anyComp_oddsratio']\n",
210 |     "# print(table3['dif'].min(),table3['dif'].sum())\n",
211 |     "# table3\n",
212 |     "\n",
213 |     "source_data_add='../results/Figs_Source_Data.xlsx'\n",
214 |     "if 0:\n",
215 |     "    saveAsNewSheetToExistingFile(source_data_add,pd.concat([table3.astype(float).round(3)[['odds ratio','restComp_oddsratio']],x1],axis=1),'ExtendedData5')"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "markdown",
220 |    "id": "2a98dcc6",
221 |    "metadata": {},
222 |    "source": [
223 |     "## GO terms search for overlap of highly predictable genes (top 58)"
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "code",
228 |    "execution_count": null,
229 |    "id": "87d28528",
230 |    "metadata": {},
231 |    "outputs": [],
232 |    "source": [
233 |     "# top58=pd.read_csv('./top_58_common.txt',header=None)[0].tolist()\n",
234 |     "# gene_cats_bpcc=pd.read_csv('./go_bp_cc_D2021_each_gene_cat.txt',delimiter='\\t')\n",
235 |     "import sys\n",
236 |     "sys.path.insert(0, '../utils/') \n",
237 |     "from readProfiles import rename_affyprobe_to_genename\n",
238 |     "from saveAsNewSheetToExistingFile import saveAsNewSheetToExistingFile\n",
239 |     "top58=pd.read_csv('./source/top_59_atleast_topIn3.txt',header=None)[0].tolist()\n",
240 |     "gene_cats_bpcc=pd.read_csv('./source/GO_bp_cc_mf_direct_intersection_782.txt',delimiter='\\t')\n",
241 |     "\n",
242 |     "# comps=['mitochondri','Golgi','membrane','cytoskeleton','actin','endoplasmic','RNA','nucleol','cell division','mitosis','mitotic','cell cycle']\n",
243 |     "\n",
244 |     "# for c in comps:\n",
245 |     "#     gene_cats_bpcc[c]=gene_cats_bpcc['GOTERM_BP_DIRECT'].astype(str).apply(lambda x: x if c in x else '')+\\\n",
246 |     "#     gene_cats_bpcc['GOTERM_CC_DIRECT'].astype(str).apply(lambda x: x[:x.find(c)].split('~')[-1]+x[x.find(c):].split('GO')[0] if c in x else '')\n",
247 |     "comps=['mitochondri','golgi','membrane','cytoskeleton','actin','endoplasmic','rna','nucleol',\\\n",
248 |     "       'cell division','mitosis','mitotic','cell cycle','cytokine','hormone']\n",
249 |     "\n",
250 |     "for c in comps:\n",
251 |     "    gene_cats_bpcc[c]=gene_cats_bpcc['GOTERM_BP_DIRECT'].astype(str).str.lower().apply(lambda x:\\\n",
252 |     "    ''.join([x[:si].split('~')[-1]+x[si:].split('GO')[0] for si in locations_of_substring(x,c)]) if c in x else '')+\\\n",
253 |     "    gene_cats_bpcc['GOTERM_CC_DIRECT'].astype(str).str.lower().apply(lambda x:\\\n",
254 |     "    ''.join([x[:si].split('~')[-1]+x[si:].split('GO')[0] for si in locations_of_substring(x,c)]) if c in x else '')+\\\n",
255 |     "    gene_cats_bpcc['GOTERM_MF_DIRECT'].astype(str).str.lower().apply(lambda x:\\\n",
256 |     "    ''.join([x[:si].split('~')[-1]+x[si:].split('GO')[0] for si in locations_of_substring(x,c)]) if c in x else '')\n",
257 |     "    \n",
258 |     "gene_cats_bpcc['RNA_nucleoli']=gene_cats_bpcc['rna']+gene_cats_bpcc['nucleol']\n",
259 |     "gene_cats_bpcc['DNA']=gene_cats_bpcc['cell division']+gene_cats_bpcc['mitosis']+gene_cats_bpcc['mitotic']+gene_cats_bpcc['cell cycle']\n",
260 |     "\n",
261 |     "gene_cats_bpcc['cytoskeleton-actin']=gene_cats_bpcc['cytoskeleton']+gene_cats_bpcc['actin']\n",
262 |     "\n",
263 |     "gene_cats_bpcc.loc[gene_cats_bpcc['ID'].isin(top58),'top58']=True\n",
264 |     "\n",
265 |     "\n",
266 |     "\n",
267 |     "#####################################\n",
268 |     "comps2=['mitochondri','golgi','membrane','cytoskeleton-actin','endoplasmic','RNA_nucleoli','DNA']\n",
269 |     "from sklearn.metrics import confusion_matrix\n",
270 |     "from scipy.stats import fisher_exact\n",
271 |     "top_bool=(gene_cats_bpcc['top58']==True).values\n",
272 |     "\n",
273 |     "table1=pd.DataFrame(index=comps2+['any comp','no comp'],columns=['Prevalence','p-value','odds ratio'])\n",
274 |     "for co in comps2:\n",
275 |     "    enr_ratio=gene_cats_bpcc[(gene_cats_bpcc['top58']==True) & (gene_cats_bpcc[co]!='')].shape[0]/\\\n",
276 |     "    gene_cats_bpcc[(gene_cats_bpcc[co]!='')].shape[0]\n",
277 |     "    comp_bool=(gene_cats_bpcc[co]!='').values\n",
278 |     "    oddsratio, pvalue = fisher_exact(confusion_matrix(top_bool, comp_bool))\n",
279 |     "#     print(co, ':',gene_cats_bpcc[(gene_cats_bpcc['top58']==True) & (gene_cats_bpcc[co]!='')].shape[0],\\\n",
280 |     "#          ', ',gene_cats_bpcc[(gene_cats_bpcc[co]!='')].shape[0])\n",
281 |     "\n",
282 |     "    table1.loc[co,['Prevalence','p-value','odds ratio']]=enr_ratio*100,pvalue,oddsratio\n",
283 |     "    print(co,\": \",np.round(enr_ratio*100,2),'%     ','pvalue:', np.round(pvalue,2),'  oddsratio:',np.round(oddsratio,2))\n",
284 |     "    \n",
285 |     "gene_cats_bpcc['any_comps']=''\n",
286 |     "for co in comps2:\n",
287 |     "    gene_cats_bpcc['any_comps']=gene_cats_bpcc['any_comps']+gene_cats_bpcc[co]\n",
288 |     "     \n",
289 |     "    \n",
290 |     "enr_ratio=gene_cats_bpcc[(gene_cats_bpcc['top58']==True) & (gene_cats_bpcc['any_comps']!='')].shape[0]/\\\n",
291 |     "gene_cats_bpcc[(gene_cats_bpcc['any_comps']!='')].shape[0]\n",
292 |     "oddsratio, pvalue = fisher_exact(confusion_matrix(top_bool, (gene_cats_bpcc['any_comps']!='').values))\n",
293 |     "print(\"any_comps: \",np.round(enr_ratio*100,2),'%     ','pvalue:', np.round(pvalue,2),'  oddsratio:',np.round(oddsratio,2))\n",
294 |     "table1.loc['any comp',['Prevalence','p-value','odds ratio']]=enr_ratio*100,pvalue,oddsratio\n",
295 |     "\n",
296 |     "\n",
297 |     "enr_ratio=gene_cats_bpcc[(gene_cats_bpcc['top58']==True) & (gene_cats_bpcc['any_comps']=='')].shape[0]/\\\n",
298 |     "gene_cats_bpcc[(gene_cats_bpcc['any_comps']=='')].shape[0]\n",
299 |     "oddsratio, pvalue = fisher_exact(confusion_matrix(top_bool, (gene_cats_bpcc['any_comps']=='').values))\n",
300 |     "print(\"no comps: \",np.round(enr_ratio*100,2),'%     ','pvalue:', np.round(pvalue,2),'  oddsratio:',np.round(oddsratio,2))\n",
301 |     "table1.loc['no comp',['Prevalence','p-value','odds ratio']]=enr_ratio*100,pvalue,oddsratio\n",
302 |     "\n",
303 |     "# print('num top (52)/total genes (782): ', np.round((52/782)*100,2),'%')\n",
304 |     "\n",
305 |     "source_data_add='../results/Figs_Source_Data.xlsx'\n",
306 |     "if 0:\n",
307 |     "    saveAsNewSheetToExistingFile(source_data_add,table1,'ExtendedData6')\n",
308 |     "\n",
309 |     "# gene_cats_bpcc.to_csv('./GO/go_bp_cc_D2021_each_gene_cat_completed.csv',index=False)\n",
310 |     "# gene_cats_bpcc.to_csv('./source/GO_bp_cc_mf_direct_intersection_782_completed.csv',index=False)\n"
311 |    ]
312 |   },
313 |   {
314 |    "cell_type": "code",
315 |    "execution_count": null,
316 |    "id": "83ea53e3",
317 |    "metadata": {},
318 |    "outputs": [],
319 |    "source": [
320 |     "table1['odds ratio'].astype(float).round(2)"
321 |    ]
322 |   },
323 |   {
324 |    "cell_type": "code",
325 |    "execution_count": null,
326 |    "id": "4d4a04f6",
327 |    "metadata": {},
328 |    "outputs": [],
329 |    "source": [
330 |     "table1['odds ratio'].astype(float).round(2).values"
331 |    ]
332 |   }
333 |  ],
334 |  "metadata": {
335 |   "kernelspec": {
336 |    "display_name": "Python 3 (ipykernel)",
337 |    "language": "python",
338 |    "name": "python3"
339 |   },
340 |   "language_info": {
341 |    "codemirror_mode": {
342 |     "name": "ipython",
343 |     "version": 3
344 |    },
345 |    "file_extension": ".py",
346 |    "mimetype": "text/x-python",
347 |    "name": "python",
348 |    "nbconvert_exporter": "python",
349 |    "pygments_lexer": "ipython3",
350 |    "version": "3.8.12"
351 |   }
352 |  },
353 |  "nbformat": 4,
354 |  "nbformat_minor": 5
355 | }
356 | 


--------------------------------------------------------------------------------
/GO_terms_search/source/LUAD_geneSymbols_978.txt:
--------------------------------------------------------------------------------
  1 | AARS1
  2 | ABCB6
  3 | ABCC5
  4 | ABCF1
  5 | ABCF3
  6 | ABHD4
  7 | ABHD6
  8 | ABL1
  9 | ACAA1
 10 | ACAT2
 11 | ACBD3
 12 | ACD
 13 | ACLY
 14 | ACOT9
 15 | ADAM10
 16 | ADAT1
 17 | ADGRE2
 18 | ADGRG1
 19 | ADH5
 20 | ADI1
 21 | ADO
 22 | ADRB2
 23 | AGER
 24 | AGL
 25 | AKAP8
 26 | AKAP8L
 27 | AKR7A2
 28 | AKT1
 29 | ALAS1
 30 | ALDH7A1
 31 | ALDOA
 32 | ALDOC
 33 | AMDHD2
 34 | ANKRD10
 35 | ANO10
 36 | ANXA7
 37 | APBB2
 38 | APOE
 39 | APP
 40 | APPBP2
 41 | ARFIP2
 42 | ARHGAP1
 43 | ARHGEF12
 44 | ARHGEF2
 45 | ARID4B
 46 | ARID5B
 47 | ARL4C
 48 | ARNT2
 49 | ARPP19
 50 | ASAH1
 51 | ASCC3
 52 | ATF1
 53 | ATF5
 54 | ATF6
 55 | ATG3
 56 | ATMIN
 57 | ATP11B
 58 | ATP1B1
 59 | ATP2C1
 60 | ATP6V0B
 61 | ATP6V1D
 62 | AURKA
 63 | AURKB
 64 | AXIN1
 65 | B3GNT2
 66 | BACE2
 67 | BAD
 68 | BAG3
 69 | BAMBI
 70 | BAX
 71 | BCL2
 72 | BCL7B
 73 | BDH1
 74 | BECN1
 75 | BHLHE40
 76 | BID
 77 | BIRC2
 78 | BIRC5
 79 | BLCAP
 80 | BLMH
 81 | BLTP2
 82 | BLVRA
 83 | BMP4
 84 | BNIP3
 85 | BNIP3L
 86 | BPHL
 87 | BRCA1
 88 | BTK
 89 | BUB1B
 90 | BZW2
 91 | C2CD2
 92 | C2CD2L
 93 | C2CD5
 94 | C5
 95 | CAB39
 96 | CALM1
 97 | CALU
 98 | CAMSAP2
 99 | CANT1
100 | CAPN1
101 | CARMIL1
102 | CASC3
103 | CASK
104 | CASP10
105 | CASP2
106 | CASP3
107 | CASP7
108 | CAST
109 | CAT
110 | CBLB
111 | CBR1
112 | CBR3
113 | CCDC85B
114 | CCDC86
115 | CCDC92
116 | CCL2
117 | CCNA1
118 | CCNA2
119 | CCNB1
120 | CCNB2
121 | CCND1
122 | CCND3
123 | CCNE2
124 | CCNF
125 | CCNH
126 | CCP110
127 | CD320
128 | CD40
129 | CD44
130 | CD58
131 | CDC20
132 | CDC25A
133 | CDC25B
134 | CDC42
135 | CDC45
136 | CDCA4
137 | CDH3
138 | CDK19
139 | CDK2
140 | CDK4
141 | CDK5R1
142 | CDK6
143 | CDK7
144 | CDKN1A
145 | CDKN1B
146 | CDKN2A
147 | CEBPA
148 | CEBPD
149 | CEBPZ
150 | CEMIP2
151 | CENPE
152 | CEP57
153 | CERK
154 | CETN3
155 | CFLAR
156 | CGRRF1
157 | CHAC1
158 | CHEK1
159 | CHEK2
160 | CHERP
161 | CHIC2
162 | CHMP4A
163 | CHMP6
164 | CHN1
165 | CIAO3
166 | CIAPIN1
167 | CIRBP
168 | CISD1
169 | CLIC4
170 | CLPX
171 | CLSTN1
172 | CLTB
173 | CLTC
174 | CNDP2
175 | CNOT4
176 | CNPY3
177 | COASY
178 | COG2
179 | COG4
180 | COG7
181 | COL1A1
182 | COL4A1
183 | COPB2
184 | COPS7A
185 | COQ8A
186 | CORO1A
187 | CPNE3
188 | CPSF4
189 | CRAMP1
190 | CREB1
191 | CREG1
192 | CRELD2
193 | CRK
194 | CRKL
195 | CRTAP
196 | CRYZ
197 | CSK
198 | CSNK1A1
199 | CSNK1E
200 | CSNK2A2
201 | CSRP1
202 | CTNNAL1
203 | CTNND1
204 | CTSD
205 | CTSL
206 | CTTN
207 | CXCL2
208 | CXCR4
209 | CYB561
210 | CYCS
211 | CYTH1
212 | DAG1
213 | DAXX
214 | DCK
215 | DCTD
216 | DCUN1D4
217 | DDB2
218 | DDIT4
219 | DDR1
220 | DDX10
221 | DDX42
222 | DECR1
223 | DENND2D
224 | DERA
225 | DFFA
226 | DFFB
227 | DHDDS
228 | DHRS7
229 | DHX29
230 | DIPK1A
231 | DLD
232 | DMAC2L
233 | DMTF1
234 | DNAJA3
235 | DNAJB1
236 | DNAJB2
237 | DNAJB6
238 | DNAJC15
239 | DNM1
240 | DNM1L
241 | DNMT1
242 | DNMT3A
243 | DNTTIP2
244 | DPH2
245 | DRAP1
246 | DSG2
247 | DUSP11
248 | DUSP14
249 | DUSP22
250 | DUSP3
251 | DUSP4
252 | DUSP6
253 | DYNLT3
254 | DYRK3
255 | E2F2
256 | EAPP
257 | EBNA1BP2
258 | EBP
259 | ECD
260 | ECH1
261 | EDEM1
262 | EDN1
263 | EED
264 | EFCAB14
265 | EGF
266 | EGFR
267 | EGR1
268 | EIF4EBP1
269 | EIF4G1
270 | EIF5
271 | ELAC2
272 | ELAVL1
273 | ELOVL6
274 | ELP1
275 | EML3
276 | ENOPH1
277 | ENOSF1
278 | EPB41L2
279 | EPHA3
280 | EPHB2
281 | EPN2
282 | EPRS1
283 | ERBB2
284 | ERBB3
285 | ERO1A
286 | ETFB
287 | ETS1
288 | ETV1
289 | EVL
290 | EXOSC4
291 | EXT1
292 | EZH2
293 | FAH
294 | FAIM
295 | FAM20B
296 | FAS
297 | FASTKD5
298 | FAT1
299 | FBXL12
300 | FBXO11
301 | FBXO21
302 | FBXO7
303 | FCHO1
304 | FDFT1
305 | FEZ2
306 | FGFR2
307 | FGFR4
308 | FHL2
309 | FIS1
310 | FKBP14
311 | FKBP4
312 | FOS
313 | FOSL1
314 | FOXJ3
315 | FOXO3
316 | FOXO4
317 | FPGS
318 | FRS2
319 | FSD1
320 | FUT1
321 | FYN
322 | FZD1
323 | FZD7
324 | G3BP1
325 | GAA
326 | GABPB1
327 | GADD45A
328 | GADD45B
329 | GALE
330 | GAPDH
331 | GARRE1
332 | GATA2
333 | GATA3
334 | GDPD5
335 | GET1
336 | GFOD1
337 | GFPT1
338 | GFUS
339 | GHR
340 | GLI2
341 | GLOD4
342 | GLRX
343 | GMNN
344 | GNA11
345 | GNA15
346 | GNAI1
347 | GNAI2
348 | GNAS
349 | GNB5
350 | GNPDA1
351 | GOLT1B
352 | GPATCH8
353 | GPC1
354 | GPER1
355 | GRB10
356 | GRB7
357 | GRN
358 | GRWD1
359 | GSTM2
360 | GSTZ1
361 | GTF2A2
362 | GTF2E2
363 | GTPBP8
364 | H2AZ2
365 | H2BC12
366 | H2BC21
367 | HACD3
368 | HADH
369 | HAT1
370 | HDAC2
371 | HDAC6
372 | HDGFL3
373 | HEATR1
374 | HEBP1
375 | HERC6
376 | HERPUD1
377 | HES1
378 | HIF1A
379 | HK1
380 | HLA-DMA
381 | HLA-DRA
382 | HMG20B
383 | HMGA2
384 | HMGCR
385 | HMGCS1
386 | HMOX1
387 | HOMER2
388 | HOOK2
389 | HOXA10
390 | HOXA5
391 | HPRT1
392 | HS2ST1
393 | HSD17B10
394 | HSD17B11
395 | HSPA1A
396 | HSPA4
397 | HSPA8
398 | HSPB1
399 | HSPD1
400 | HTATSF1
401 | HTRA1
402 | HYOU1
403 | IARS2
404 | ICAM1
405 | ICAM3
406 | ICMT
407 | ID2
408 | IDE
409 | IER3
410 | IFNAR1
411 | IFRD2
412 | IGF1R
413 | IGF2BP2
414 | IGF2R
415 | IGFBP3
416 | IGHMBP2
417 | IKBKB
418 | IKBKE
419 | IKZF1
420 | IL13RA1
421 | IL1B
422 | IL4R
423 | ILK
424 | INPP1
425 | INPP4B
426 | INSIG1
427 | INTS3
428 | IPO13
429 | IQGAP1
430 | ISOC1
431 | ITFG1
432 | ITGAE
433 | ITGB1BP1
434 | ITGB5
435 | JADE2
436 | JMJD6
437 | JUN
438 | KAT6A
439 | KAT6B
440 | KCNK1
441 | KCTD5
442 | KDELR2
443 | KDM3A
444 | KDM5A
445 | KDM5B
446 | KEAP1
447 | KHDC4
448 | KIAA0753
449 | KIF14
450 | KIF20A
451 | KIF2C
452 | KIF5C
453 | KIFBP
454 | KIT
455 | KLHDC2
456 | KLHL21
457 | KLHL9
458 | KLK8
459 | KTN1
460 | LAGE3
461 | LAMA3
462 | LAP3
463 | LBR
464 | LGALS8
465 | LGMN
466 | LIG1
467 | LIPA
468 | LOXL1
469 | LPAR2
470 | LPGAT1
471 | LRP10
472 | LRPAP1
473 | LRRC41
474 | LSM5
475 | LSM6
476 | LSR
477 | LYN
478 | LYPLA1
479 | LYRM1
480 | MACF1
481 | MALT1
482 | MAMLD1
483 | MAN2B1
484 | MAP2K5
485 | MAP3K4
486 | MAP4K4
487 | MAP7
488 | MAPK13
489 | MAPK1IP1L
490 | MAPK9
491 | MAPKAPK2
492 | MAPKAPK3
493 | MAPKAPK5
494 | MAST2
495 | MAT2A
496 | MBNL1
497 | MBNL2
498 | MBOAT7
499 | MBTPS1
500 | MCM3
501 | MCOLN1
502 | MCUR1
503 | ME2
504 | MEF2C
505 | MELK
506 | MEST
507 | METRN
508 | MFSD10
509 | MICALL1
510 | MIF
511 | MINDY1
512 | MKNK1
513 | MLEC
514 | MLLT11
515 | MMP1
516 | MMP2
517 | MNAT1
518 | MPC2
519 | MPZL1
520 | MRPL12
521 | MRPL19
522 | MRPS16
523 | MRPS2
524 | MSH6
525 | MSRA
526 | MTA1
527 | MTERF3
528 | MTF2
529 | MTFR1
530 | MTHFD2
531 | MUC1
532 | MVP
533 | MYBL2
534 | MYC
535 | MYCBP
536 | MYCBP2
537 | MYL9
538 | MYLK
539 | MYO10
540 | NCAPD2
541 | NCK1
542 | NCK2
543 | NCOA3
544 | NENF
545 | NET1
546 | NFATC3
547 | NFATC4
548 | NFE2L2
549 | NFIL3
550 | NFKB2
551 | NFKBIA
552 | NFKBIB
553 | NFKBIE
554 | NGRN
555 | NIPSNAP1
556 | NISCH
557 | NIT1
558 | NMT1
559 | NNT
560 | NOL3
561 | NOLC1
562 | NOS3
563 | NOSIP
564 | NOTCH1
565 | NPC1
566 | NPDC1
567 | NPEPL1
568 | NPRL2
569 | NR1H2
570 | NR2F6
571 | NR3C1
572 | NRAS
573 | NRIP1
574 | NSDHL
575 | NT5DC2
576 | NUCB2
577 | NUDCD3
578 | NUDT9
579 | NUP133
580 | NUP62
581 | NUP85
582 | NUP88
583 | NUP93
584 | NUSAP1
585 | NVL
586 | ORC1
587 | OXA1L
588 | OXCT1
589 | OXSR1
590 | P4HA2
591 | P4HTM
592 | PACSIN3
593 | PAF1
594 | PAFAH1B1
595 | PAFAH1B3
596 | PAICS
597 | PAK1
598 | PAK4
599 | PAK6
600 | PAN2
601 | PARP1
602 | PARP2
603 | PAX8
604 | PCBD1
605 | PCCB
606 | PCK2
607 | PCM1
608 | PCMT1
609 | PCNA
610 | PDGFA
611 | PDHX
612 | PDIA5
613 | PDLIM1
614 | PDS5A
615 | PECR
616 | PEX11A
617 | PFKL
618 | PGAM1
619 | PGM1
620 | PGRMC1
621 | PHGDH
622 | PHKA1
623 | PHKB
624 | PHKG2
625 | PIGB
626 | PIH1D1
627 | PIK3C2B
628 | PIK3C3
629 | PIK3CA
630 | PIK3R3
631 | PIK3R4
632 | PIN1
633 | PIP4K2B
634 | PKIG
635 | PLA2G15
636 | PLA2G4A
637 | PLCB3
638 | PLEKHJ1
639 | PLEKHM1
640 | PLK1
641 | PLOD3
642 | PLP2
643 | PLS1
644 | PLSCR1
645 | PLSCR3
646 | PMAIP1
647 | PMM2
648 | PNKP
649 | POLB
650 | POLD1
651 | POLD4
652 | POLE2
653 | POLG2
654 | POLR1C
655 | POLR2I
656 | POLR2K
657 | POP4
658 | PPARD
659 | PPARG
660 | PPIC
661 | PPIE
662 | PPOX
663 | PPP1R13B
664 | PPP2R3C
665 | PPP2R5A
666 | PPP2R5E
667 | PRAF2
668 | PRCP
669 | PRKACA
670 | PRKAG2
671 | PRKCD
672 | PRKCH
673 | PRKCQ
674 | PRKX
675 | PROS1
676 | PRPF4
677 | PRR15L
678 | PRR7
679 | PRSS23
680 | PRUNE1
681 | PSIP1
682 | PSMB10
683 | PSMB8
684 | PSMD10
685 | PSMD2
686 | PSMD4
687 | PSMD9
688 | PSME1
689 | PSME2
690 | PSMF1
691 | PSMG1
692 | PSRC1
693 | PTGS2
694 | PTK2
695 | PTK2B
696 | PTPN1
697 | PTPN12
698 | PTPN6
699 | PTPRC
700 | PTPRF
701 | PTPRK
702 | PUF60
703 | PWP1
704 | PXMP2
705 | PXN
706 | PYCR1
707 | PYGL
708 | RAB11FIP2
709 | RAB21
710 | RAB27A
711 | RAB31
712 | RAB4A
713 | RAC2
714 | RAD51C
715 | RAD9A
716 | RAE1
717 | RAI14
718 | RALA
719 | RALB
720 | RALGDS
721 | RAP1GAP
722 | RASA1
723 | RB1
724 | RBKS
725 | RBM15B
726 | RBM34
727 | RBM6
728 | REEP5
729 | RELB
730 | RFC2
731 | RFC5
732 | RFNG
733 | RFX5
734 | RGS2
735 | RHEB
736 | RHOA
737 | RHOV
738 | RNF167
739 | RNH1
740 | RNMT
741 | RNPS1
742 | RPA1
743 | RPA2
744 | RPA3
745 | RPIA
746 | RPL39L
747 | RPN1
748 | RPP38
749 | RPS5
750 | RPS6
751 | RPS6KA1
752 | RRAGA
753 | RRP12
754 | RRP1B
755 | RRP8
756 | RRS1
757 | RSU1
758 | RTN2
759 | RUVBL1
760 | RXYLT1
761 | S100A13
762 | S100A4
763 | SACM1L
764 | SATB1
765 | SCAND1
766 | SCARB1
767 | SCCPDH
768 | SCP2
769 | SCRN1
770 | SCYL3
771 | SDHB
772 | SENP6
773 | SERPINE1
774 | SESN1
775 | SFN
776 | SGCB
777 | SH3BP5
778 | SHB
779 | SHC1
780 | SIRT3
781 | SKIC2
782 | SKIC8
783 | SKP1
784 | SLC11A2
785 | SLC1A4
786 | SLC25A13
787 | SLC25A14
788 | SLC25A4
789 | SLC25A46
790 | SLC27A3
791 | SLC2A6
792 | SLC35A1
793 | SLC35A3
794 | SLC35B1
795 | SLC35F2
796 | SLC37A4
797 | SLC5A6
798 | SMAD3
799 | SMARCA4
800 | SMARCC1
801 | SMARCD2
802 | SMC1A
803 | SMC3
804 | SMC4
805 | SMNDC1
806 | SNAP25
807 | SNCA
808 | SNX11
809 | SNX13
810 | SNX6
811 | SNX7
812 | SOCS2
813 | SORBS3
814 | SOX2
815 | SOX4
816 | SPAG4
817 | SPAG7
818 | SPDEF
819 | SPEN
820 | SPP1
821 | SPR
822 | SPRED2
823 | SPTAN1
824 | SPTLC2
825 | SQOR
826 | SQSTM1
827 | SRC
828 | SSBP2
829 | ST3GAL5
830 | ST6GALNAC2
831 | ST7
832 | STAMBP
833 | STAP2
834 | STAT1
835 | STAT3
836 | STAT5B
837 | STIMATE
838 | STK10
839 | STK25
840 | STMN1
841 | STUB1
842 | STX1A
843 | STX4
844 | STXBP1
845 | STXBP2
846 | SUPV3L1
847 | SUV39H1
848 | SUZ12
849 | SYK
850 | SYNE2
851 | SYNGR3
852 | SYPL1
853 | TARBP1
854 | TATDN2
855 | TBC1D31
856 | TBC1D9B
857 | TBP
858 | TBPL1
859 | TBX2
860 | TBXA2R
861 | TCEA2
862 | TCEAL4
863 | TCERG1
864 | TCFL5
865 | TCTA
866 | TCTN1
867 | TENT4A
868 | TERF2IP
869 | TERT
870 | TES
871 | TESK1
872 | TEX10
873 | TFAP2A
874 | TFDP1
875 | TGFB3
876 | TGFBR2
877 | THAP11
878 | TIAM1
879 | TICAM1
880 | TIMELESS
881 | TIMM17B
882 | TIMM22
883 | TIMM9
884 | TIMP2
885 | TIPARP
886 | TJP1
887 | TLCD3A
888 | TLE1
889 | TLK2
890 | TLR4
891 | TM9SF2
892 | TM9SF3
893 | TMCO1
894 | TMED10
895 | TMEM109
896 | TMEM50A
897 | TMEM97
898 | TNFRSF21
899 | TNIP1
900 | TOMM34
901 | TOMM70
902 | TOP2A
903 | TOPBP1
904 | TOR1A
905 | TP53
906 | TP53BP1
907 | TP53BP2
908 | TPD52L2
909 | TPM1
910 | TRAK2
911 | TRAM2
912 | TRAP1
913 | TRAPPC3
914 | TRAPPC6A
915 | TRIB1
916 | TRIB3
917 | TRIM13
918 | TRIM2
919 | TSC22D3
920 | TSEN2
921 | TSKU
922 | TSPAN3
923 | TSPAN4
924 | TSPAN6
925 | TUBB6
926 | TWF2
927 | TXLNA
928 | TXNDC9
929 | TXNL4B
930 | TXNRD1
931 | UBE2A
932 | UBE2C
933 | UBE2J1
934 | UBE2L6
935 | UBE3B
936 | UBE3C
937 | UBQLN2
938 | UBR7
939 | UFM1
940 | UGDH
941 | USP1
942 | USP14
943 | USP22
944 | USP6NL
945 | USP7
946 | UTP14A
947 | VAPB
948 | VAT1
949 | VAV3
950 | VDAC1
951 | VGLL4
952 | VPS28
953 | VPS72
954 | WASF3
955 | WASHC4
956 | WASHC5
957 | WDR7
958 | WDTC1
959 | WFS1
960 | WIPF2
961 | XBP1
962 | XPNPEP1
963 | XPO7
964 | YKT6
965 | YME1L1
966 | YTHDF1
967 | ZDHHC6
968 | ZFP36
969 | ZMIZ1
970 | ZMYM2
971 | ZNF131
972 | ZNF274
973 | ZNF318
974 | ZNF395
975 | ZNF451
976 | ZNF586
977 | ZNF589
978 | ZW10
979 | 


--------------------------------------------------------------------------------
/GO_terms_search/source/intersection_geneSymbols_785.txt:
--------------------------------------------------------------------------------
  1 | AARS1
  2 | ABCB6
  3 | ABCC5
  4 | ABCF1
  5 | ABCF3
  6 | ABHD4
  7 | ABHD6
  8 | ABL1
  9 | ACAA1
 10 | ACAT2
 11 | ACBD3
 12 | ACD
 13 | ACLY
 14 | ACOT9
 15 | ADAM10
 16 | ADAT1
 17 | ADGRE2
 18 | ADGRG1
 19 | ADH5
 20 | ADI1
 21 | ADO
 22 | AGER
 23 | AGL
 24 | AKAP8
 25 | AKAP8L
 26 | AKR7A2
 27 | AKT1
 28 | ALAS1
 29 | ALDH7A1
 30 | ALDOC
 31 | AMDHD2
 32 | ANKRD10
 33 | ANO10
 34 | ANXA7
 35 | APBB2
 36 | APPBP2
 37 | ARFIP2
 38 | ARHGAP1
 39 | ARHGEF2
 40 | ARID4B
 41 | ARID5B
 42 | ARL4C
 43 | ARNT2
 44 | ARPP19
 45 | ASCC3
 46 | ATF5
 47 | ATG3
 48 | ATMIN
 49 | ATP11B
 50 | ATP1B1
 51 | ATP2C1
 52 | ATP6V0B
 53 | ATP6V1D
 54 | B3GNT2
 55 | BACE2
 56 | BAD
 57 | BAG3
 58 | BAMBI
 59 | BCL2
 60 | BCL7B
 61 | BDH1
 62 | BECN1
 63 | BHLHE40
 64 | BID
 65 | BIRC5
 66 | BLCAP
 67 | BLMH
 68 | BLTP2
 69 | BLVRA
 70 | BNIP3
 71 | BPHL
 72 | BUB1B
 73 | BZW2
 74 | C2CD2
 75 | C2CD2L
 76 | C2CD5
 77 | CAB39
 78 | CALU
 79 | CAMSAP2
 80 | CANT1
 81 | CAPN1
 82 | CARMIL1
 83 | CASC3
 84 | CASK
 85 | CASP3
 86 | CAST
 87 | CAT
 88 | CBR1
 89 | CBR3
 90 | CCDC85B
 91 | CCDC86
 92 | CCDC92
 93 | CCNA2
 94 | CCNB2
 95 | CCNF
 96 | CCP110
 97 | CD320
 98 | CD44
 99 | CD58
100 | CDC45
101 | CDCA4
102 | CDH3
103 | CDK19
104 | CDK2
105 | CDKN1A
106 | CEBPD
107 | CEBPZ
108 | CEMIP2
109 | CENPE
110 | CEP57
111 | CERK
112 | CETN3
113 | CFLAR
114 | CGRRF1
115 | CHAC1
116 | CHEK2
117 | CHERP
118 | CHIC2
119 | CHMP4A
120 | CHMP6
121 | CHN1
122 | CIAO3
123 | CIAPIN1
124 | CIRBP
125 | CISD1
126 | CLIC4
127 | CLPX
128 | CLSTN1
129 | CLTB
130 | CNDP2
131 | CNOT4
132 | CNPY3
133 | COASY
134 | COG2
135 | COG4
136 | COG7
137 | COPB2
138 | COPS7A
139 | COQ8A
140 | CORO1A
141 | CPNE3
142 | CPSF4
143 | CRAMP1
144 | CREB1
145 | CREG1
146 | CRELD2
147 | CRKL
148 | CRTAP
149 | CRYZ
150 | CSK
151 | CSNK2A2
152 | CSRP1
153 | CTNNAL1
154 | CTNND1
155 | CTSD
156 | CTSL
157 | CTTN
158 | CXCR4
159 | CYB561
160 | CYTH1
161 | DCK
162 | DCTD
163 | DCUN1D4
164 | DDB2
165 | DDIT4
166 | DDR1
167 | DDX10
168 | DDX42
169 | DECR1
170 | DENND2D
171 | DERA
172 | DHDDS
173 | DHRS7
174 | DHX29
175 | DIPK1A
176 | DLD
177 | DMAC2L
178 | DMTF1
179 | DNAJA3
180 | DNAJB1
181 | DNAJB2
182 | DNAJB6
183 | DNAJC15
184 | DNM1
185 | DNM1L
186 | DNMT1
187 | DNTTIP2
188 | DPH2
189 | DRAP1
190 | DSG2
191 | DUSP11
192 | DUSP14
193 | DUSP22
194 | DYNLT3
195 | DYRK3
196 | EAPP
197 | EBNA1BP2
198 | EBP
199 | ECD
200 | ECH1
201 | EDEM1
202 | EFCAB14
203 | EGFR
204 | EIF5
205 | ELAC2
206 | ELAVL1
207 | ELOVL6
208 | ELP1
209 | EML3
210 | ENOPH1
211 | ENOSF1
212 | EPB41L2
213 | EPN2
214 | EPRS1
215 | ERBB2
216 | ETFB
217 | EVL
218 | EXOSC4
219 | EXT1
220 | EZH2
221 | FAH
222 | FAIM
223 | FAM20B
224 | FAS
225 | FASTKD5
226 | FAT1
227 | FBXL12
228 | FBXO21
229 | FBXO7
230 | FCHO1
231 | FDFT1
232 | FEZ2
233 | FHL2
234 | FIS1
235 | FKBP14
236 | FKBP4
237 | FOS
238 | FOXJ3
239 | FOXO4
240 | FPGS
241 | FSD1
242 | FUT1
243 | G3BP1
244 | GAA
245 | GABPB1
246 | GADD45A
247 | GADD45B
248 | GALE
249 | GARRE1
250 | GATA2
251 | GATA3
252 | GDPD5
253 | GET1
254 | GFOD1
255 | GFPT1
256 | GFUS
257 | GLOD4
258 | GLRX
259 | GMNN
260 | GNA11
261 | GNAI2
262 | GNAS
263 | GNB5
264 | GNPDA1
265 | GOLT1B
266 | GPATCH8
267 | GPC1
268 | GPER1
269 | GRB10
270 | GRN
271 | GRWD1
272 | GSTM2
273 | GSTZ1
274 | GTF2A2
275 | GTF2E2
276 | GTPBP8
277 | H2AZ2
278 | H2BC12
279 | H2BC21
280 | HACD3
281 | HADH
282 | HAT1
283 | HDAC2
284 | HDAC6
285 | HDGFL3
286 | HEATR1
287 | HEBP1
288 | HERC6
289 | HERPUD1
290 | HES1
291 | HK1
292 | HMG20B
293 | HMGCR
294 | HMGCS1
295 | HOMER2
296 | HOOK2
297 | HOXA10
298 | HOXA5
299 | HPRT1
300 | HS2ST1
301 | HSD17B10
302 | HSD17B11
303 | HSPA4
304 | HTATSF1
305 | HTRA1
306 | HYOU1
307 | IARS2
308 | ICAM3
309 | ICMT
310 | ID2
311 | IDE
312 | IER3
313 | IFRD2
314 | IGF1R
315 | IGF2R
316 | IGHMBP2
317 | IKBKB
318 | IKBKE
319 | IL13RA1
320 | IL4R
321 | ILK
322 | INPP1
323 | INPP4B
324 | INSIG1
325 | INTS3
326 | IPO13
327 | IQGAP1
328 | ISOC1
329 | ITFG1
330 | ITGAE
331 | ITGB1BP1
332 | ITGB5
333 | JADE2
334 | JMJD6
335 | JUN
336 | KAT6A
337 | KAT6B
338 | KCNK1
339 | KCTD5
340 | KDELR2
341 | KDM3A
342 | KDM5A
343 | KDM5B
344 | KEAP1
345 | KHDC4
346 | KIAA0753
347 | KIF14
348 | KIF20A
349 | KIF2C
350 | KIF5C
351 | KIFBP
352 | KIT
353 | KLHDC2
354 | KLHL21
355 | KLHL9
356 | KLK8
357 | KTN1
358 | LAGE3
359 | LAMA3
360 | LAP3
361 | LBR
362 | LGALS8
363 | LGMN
364 | LIG1
365 | LIPA
366 | LOXL1
367 | LPAR2
368 | LPGAT1
369 | LRP10
370 | LRPAP1
371 | LRRC41
372 | LSM5
373 | LSM6
374 | LSR
375 | LYN
376 | LYPLA1
377 | LYRM1
378 | MACF1
379 | MALT1
380 | MAMLD1
381 | MAN2B1
382 | MAP2K5
383 | MAP3K4
384 | MAP4K4
385 | MAP7
386 | MAPK13
387 | MAPK1IP1L
388 | MAPK9
389 | MAPKAPK3
390 | MAPKAPK5
391 | MAST2
392 | MBNL1
393 | MBNL2
394 | MBOAT7
395 | MBTPS1
396 | MCM3
397 | MCOLN1
398 | MCUR1
399 | ME2
400 | MELK
401 | MEST
402 | METRN
403 | MFSD10
404 | MICALL1
405 | MINDY1
406 | MLEC
407 | MLLT11
408 | MPC2
409 | MPZL1
410 | MRPL12
411 | MRPL19
412 | MRPS16
413 | MRPS2
414 | MSH6
415 | MSRA
416 | MTA1
417 | MTERF3
418 | MTF2
419 | MTFR1
420 | MTHFD2
421 | MVP
422 | MYBL2
423 | MYC
424 | MYCBP
425 | MYCBP2
426 | MYO10
427 | NCAPD2
428 | NCOA3
429 | NENF
430 | NET1
431 | NFE2L2
432 | NFIL3
433 | NGRN
434 | NIPSNAP1
435 | NISCH
436 | NIT1
437 | NMT1
438 | NNT
439 | NOL3
440 | NOLC1
441 | NOSIP
442 | NPC1
443 | NPDC1
444 | NPEPL1
445 | NPRL2
446 | NR1H2
447 | NR2F6
448 | NR3C1
449 | NRIP1
450 | NSDHL
451 | NT5DC2
452 | NUCB2
453 | NUDCD3
454 | NUDT9
455 | NUP133
456 | NUP62
457 | NUP85
458 | NUP88
459 | NUP93
460 | NUSAP1
461 | NVL
462 | ORC1
463 | OXA1L
464 | OXCT1
465 | OXSR1
466 | P4HA2
467 | P4HTM
468 | PACSIN3
469 | PAF1
470 | PAFAH1B1
471 | PAFAH1B3
472 | PAICS
473 | PAK4
474 | PAN2
475 | PARP2
476 | PAX8
477 | PCBD1
478 | PCCB
479 | PCK2
480 | PCM1
481 | PCMT1
482 | PDHX
483 | PDIA5
484 | PDLIM1
485 | PDS5A
486 | PECR
487 | PEX11A
488 | PGM1
489 | PGRMC1
490 | PHGDH
491 | PHKA1
492 | PHKB
493 | PHKG2
494 | PIGB
495 | PIH1D1
496 | PIK3C2B
497 | PIN1
498 | PIP4K2B
499 | PKIG
500 | PLA2G15
501 | PLEKHJ1
502 | PLEKHM1
503 | PLOD3
504 | PLP2
505 | PLS1
506 | PLSCR1
507 | PLSCR3
508 | PMAIP1
509 | PMM2
510 | PNKP
511 | POLB
512 | POLD1
513 | POLD4
514 | POLE2
515 | POLG2
516 | POLR1C
517 | POLR2I
518 | POLR2K
519 | POP4
520 | PPARG
521 | PPIC
522 | PPIE
523 | PPOX
524 | PPP2R3C
525 | PPP2R5A
526 | PPP2R5E
527 | PRAF2
528 | PRCP
529 | PRKACA
530 | PRKCD
531 | PRPF4
532 | PRR15L
533 | PRR7
534 | PRSS23
535 | PRUNE1
536 | PSIP1
537 | PSMD10
538 | PSMG1
539 | PSRC1
540 | PTK2
541 | PTPN1
542 | PTPN12
543 | PTPRF
544 | PTPRK
545 | PUF60
546 | PWP1
547 | PXMP2
548 | PXN
549 | PYCR1
550 | PYGL
551 | RAB11FIP2
552 | RAB21
553 | RAB27A
554 | RAB31
555 | RAB4A
556 | RAD51C
557 | RAD9A
558 | RAE1
559 | RAI14
560 | RAP1GAP
561 | RBKS
562 | RBM15B
563 | RBM34
564 | RBM6
565 | REEP5
566 | RELB
567 | RFC2
568 | RFC5
569 | RFNG
570 | RFX5
571 | RGS2
572 | RNF167
573 | RNH1
574 | RNMT
575 | RNPS1
576 | RPA1
577 | RPA2
578 | RPA3
579 | RPIA
580 | RPL39L
581 | RPN1
582 | RPP38
583 | RPS6KA1
584 | RRAGA
585 | RRP12
586 | RRP1B
587 | RRP8
588 | RRS1
589 | RSU1
590 | RTN2
591 | RUVBL1
592 | RXYLT1
593 | S100A13
594 | S100A4
595 | SACM1L
596 | SCAND1
597 | SCARB1
598 | SCCPDH
599 | SCP2
600 | SCRN1
601 | SCYL3
602 | SDHB
603 | SENP6
604 | SESN1
605 | SFN
606 | SGCB
607 | SH3BP5
608 | SHB
609 | SKIC2
610 | SKIC8
611 | SLC11A2
612 | SLC1A4
613 | SLC25A13
614 | SLC25A14
615 | SLC25A4
616 | SLC25A46
617 | SLC27A3
618 | SLC2A6
619 | SLC35A1
620 | SLC35A3
621 | SLC35B1
622 | SLC35F2
623 | SLC37A4
624 | SLC5A6
625 | SMAD3
626 | SMARCA4
627 | SMARCC1
628 | SMARCD2
629 | SMC1A
630 | SMC3
631 | SMC4
632 | SMNDC1
633 | SNX11
634 | SNX13
635 | SNX6
636 | SNX7
637 | SOCS2
638 | SORBS3
639 | SOX4
640 | SPAG4
641 | SPAG7
642 | SPDEF
643 | SPEN
644 | SPR
645 | SPRED2
646 | SPTLC2
647 | SQOR
648 | SSBP2
649 | ST3GAL5
650 | ST6GALNAC2
651 | ST7
652 | STAMBP
653 | STAP2
654 | STAT1
655 | STIMATE
656 | STK10
657 | STK25
658 | STMN1
659 | STUB1
660 | STX1A
661 | STX4
662 | STXBP1
663 | STXBP2
664 | SUPV3L1
665 | SYNE2
666 | SYNGR3
667 | SYPL1
668 | TARBP1
669 | TATDN2
670 | TBC1D31
671 | TBC1D9B
672 | TBPL1
673 | TBX2
674 | TBXA2R
675 | TCEA2
676 | TCEAL4
677 | TCERG1
678 | TCFL5
679 | TCTA
680 | TCTN1
681 | TENT4A
682 | TERF2IP
683 | TES
684 | TESK1
685 | TEX10
686 | TFAP2A
687 | THAP11
688 | TIAM1
689 | TIMELESS
690 | TIMM17B
691 | TIMM22
692 | TIMM9
693 | TIMP2
694 | TIPARP
695 | TJP1
696 | TLCD3A
697 | TLE1
698 | TLK2
699 | TM9SF2
700 | TM9SF3
701 | TMCO1
702 | TMED10
703 | TMEM109
704 | TMEM50A
705 | TMEM97
706 | TNFRSF21
707 | TNIP1
708 | TOMM34
709 | TOMM70
710 | TOP2A
711 | TOPBP1
712 | TOR1A
713 | TP53BP1
714 | TP53BP2
715 | TPD52L2
716 | TPM1
717 | TRAK2
718 | TRAM2
719 | TRAP1
720 | TRAPPC3
721 | TRAPPC6A
722 | TRIB1
723 | TRIB3
724 | TRIM13
725 | TRIM2
726 | TSC22D3
727 | TSEN2
728 | TSKU
729 | TSPAN3
730 | TSPAN4
731 | TSPAN6
732 | TUBB6
733 | TWF2
734 | TXLNA
735 | TXNDC9
736 | TXNL4B
737 | TXNRD1
738 | UBE2A
739 | UBE2C
740 | UBE2J1
741 | UBE2L6
742 | UBE3B
743 | UBE3C
744 | UBQLN2
745 | UBR7
746 | UFM1
747 | UGDH
748 | USP1
749 | USP14
750 | USP22
751 | USP6NL
752 | USP7
753 | UTP14A
754 | VAPB
755 | VAT1
756 | VAV3
757 | VDAC1
758 | VGLL4
759 | VPS28
760 | VPS72
761 | WASF3
762 | WASHC4
763 | WASHC5
764 | WDR7
765 | WDTC1
766 | WFS1
767 | WIPF2
768 | XBP1
769 | XPNPEP1
770 | XPO7
771 | YKT6
772 | YME1L1
773 | YTHDF1
774 | ZDHHC6
775 | ZFP36
776 | ZMIZ1
777 | ZMYM2
778 | ZNF131
779 | ZNF274
780 | ZNF318
781 | ZNF395
782 | ZNF451
783 | ZNF586
784 | ZNF589
785 | ZW10
786 | 


--------------------------------------------------------------------------------
/GO_terms_search/source/top_100_luad.txt:
--------------------------------------------------------------------------------
  1 | TPM1
  2 | CDKN1A
  3 | SERPINE1
  4 | COL4A1
  5 | RPIA
  6 | BIRC5
  7 | EDN1
  8 | GADD45A
  9 | CCNA2
 10 | POLD1
 11 | FHL2
 12 | SLC2A6
 13 | CTSL
 14 | AURKA
 15 | ATF1
 16 | YKT6
 17 | JUN
 18 | DNAJB2
 19 | ABHD4
 20 | MTHFD2
 21 | AURKB
 22 | MMP1
 23 | TOP2A
 24 | UBE2C
 25 | PAFAH1B3
 26 | MRPL12
 27 | HDAC2
 28 | CTSD
 29 | TSEN2
 30 | SCARB1
 31 | LBR
 32 | POLE2
 33 | PAICS
 34 | PRSS23
 35 | RGS2
 36 | IER3
 37 | HSPB1
 38 | PTPN12
 39 | CHEK2
 40 | ARHGAP1
 41 | ADGRG1
 42 | MCM3
 43 | POP4
 44 | PXN
 45 | HMOX1
 46 | USP1
 47 | RUVBL1
 48 | DDX10
 49 | DUSP6
 50 | CCL2
 51 | NUP88
 52 | CDC25A
 53 | TXNRD1
 54 | HMGA2
 55 | MYL9
 56 | DUSP4
 57 | CAT
 58 | MVP
 59 | SQSTM1
 60 | TIMELESS
 61 | DCK
 62 | GPC1
 63 | NIPSNAP1
 64 | COL1A1
 65 | C5
 66 | NET1
 67 | MPC2
 68 | TIMP2
 69 | TMEM97
 70 | RAE1
 71 | RPL39L
 72 | EFCAB14
 73 | MAN2B1
 74 | RAI14
 75 | ILK
 76 | ABCB6
 77 | TIPARP
 78 | RNPS1
 79 | PPIC
 80 | CEBPD
 81 | CCND3
 82 | EZH2
 83 | SOX4
 84 | MYBL2
 85 | SLC35A1
 86 | TMEM109
 87 | RSU1
 88 | DAG1
 89 | GRB10
 90 | INPP1
 91 | STAT1
 92 | RRP12
 93 | CREG1
 94 | TES
 95 | PDGFA
 96 | SMC4
 97 | ERBB2
 98 | EIF4EBP1
 99 | DPH2
100 | UBE2L6
101 | 


--------------------------------------------------------------------------------
/GO_terms_search/source/top_59_atleast_topIn3.txt:
--------------------------------------------------------------------------------
 1 | ATP1B1
 2 | BCL7B
 3 | BIRC5
 4 | BUB1B
 5 | CCNA2
 6 | CDK4
 7 | CISD1
 8 | CLIC4
 9 | COASY
10 | CPNE3
11 | DAG1
12 | DCK
13 | EBP
14 | EPRS1
15 | ERBB2
16 | FHL2
17 | GLRX
18 | GNPDA1
19 | HMOX1
20 | IER3
21 | IGF2R
22 | LBR
23 | LIG1
24 | MCM3
25 | MPZL1
26 | MRPL19
27 | MTHFD2
28 | MYC
29 | NFKBIB
30 | NIPSNAP1
31 | NPC1
32 | PAFAH1B3
33 | PAICS
34 | PAK4
35 | PSME1
36 | PSRC1
37 | RELB
38 | RPA1
39 | SACM1L
40 | SCARB1
41 | SERPINE1
42 | SESN1
43 | SLC25A4
44 | SMAD3
45 | SMC4
46 | SPP1
47 | STMN1
48 | STX1A
49 | TERF2IP
50 | TMEM50A
51 | TOP2A
52 | TPM1
53 | TRIB1
54 | TSC22D3
55 | TSKU
56 | TXNRD1
57 | UBE2C
58 | XBP1
59 | YKT6
60 | 


--------------------------------------------------------------------------------
/GO_terms_search/source/union_geneSymbols_1170.txt:
--------------------------------------------------------------------------------
   1 | AARS1
   2 | ABCB6
   3 | ABCC5
   4 | ABCF1
   5 | ABCF3
   6 | ABHD4
   7 | ABHD6
   8 | ABL1
   9 | ACAA1
  10 | ACAT2
  11 | ACBD3
  12 | ACD
  13 | ACLY
  14 | ACOT9
  15 | ADAM10
  16 | ADAT1
  17 | ADGRE2
  18 | ADGRG1
  19 | ADH5
  20 | ADI1
  21 | ADO
  22 | ADRB2
  23 | AGER
  24 | AGL
  25 | AKAP8
  26 | AKAP8L
  27 | AKR7A2
  28 | AKT1
  29 | ALAS1
  30 | ALDH7A1
  31 | ALDOA
  32 | ALDOC
  33 | AMDHD2
  34 | ANKRD10
  35 | ANO10
  36 | ANXA7
  37 | APBB2
  38 | APOE
  39 | APP
  40 | APPBP2
  41 | ARFIP2
  42 | ARHGAP1
  43 | ARHGEF12
  44 | ARHGEF2
  45 | ARID4B
  46 | ARID5B
  47 | ARL4C
  48 | ARNT2
  49 | ARPP19
  50 | ASAH1
  51 | ASCC3
  52 | ATF1
  53 | ATF5
  54 | ATF6
  55 | ATG3
  56 | ATMIN
  57 | ATP11B
  58 | ATP1B1
  59 | ATP2C1
  60 | ATP6V0B
  61 | ATP6V1D
  62 | AURKA
  63 | AURKB
  64 | AXIN1
  65 | B3GNT2
  66 | BACE2
  67 | BAD
  68 | BAG3
  69 | BAMBI
  70 | BAX
  71 | BCL2
  72 | BCL7B
  73 | BDH1
  74 | BECN1
  75 | BHLHE40
  76 | BID
  77 | BIRC2
  78 | BIRC5
  79 | BLCAP
  80 | BLMH
  81 | BLTP2
  82 | BLVRA
  83 | BMP4
  84 | BNIP3
  85 | BNIP3L
  86 | BPHL
  87 | BRCA1
  88 | BTK
  89 | BUB1B
  90 | BZW2
  91 | C2CD2
  92 | C2CD2L
  93 | C2CD5
  94 | C5
  95 | CAB39
  96 | CALM1
  97 | CALU
  98 | CAMSAP2
  99 | CANT1
 100 | CAPN1
 101 | CARMIL1
 102 | CASC3
 103 | CASK
 104 | CASP10
 105 | CASP2
 106 | CASP3
 107 | CASP7
 108 | CAST
 109 | CAT
 110 | CBLB
 111 | CBR1
 112 | CBR3
 113 | CCDC85B
 114 | CCDC86
 115 | CCDC92
 116 | CCL2
 117 | CCNA1
 118 | CCNA2
 119 | CCNB1
 120 | CCNB2
 121 | CCND1
 122 | CCND3
 123 | CCNE2
 124 | CCNF
 125 | CCNH
 126 | CCP110
 127 | CD320
 128 | CD40
 129 | CD44
 130 | CD58
 131 | CDC20
 132 | CDC25A
 133 | CDC25B
 134 | CDC42
 135 | CDC45
 136 | CDCA4
 137 | CDH3
 138 | CDK19
 139 | CDK2
 140 | CDK4
 141 | CDK5R1
 142 | CDK6
 143 | CDK7
 144 | CDKN1A
 145 | CDKN1B
 146 | CDKN2A
 147 | CEBPA
 148 | CEBPD
 149 | CEBPZ
 150 | CEMIP2
 151 | CENPE
 152 | CEP57
 153 | CERK
 154 | CETN3
 155 | CFLAR
 156 | CGRRF1
 157 | CHAC1
 158 | CHEK1
 159 | CHEK2
 160 | CHERP
 161 | CHIC2
 162 | CHMP4A
 163 | CHMP6
 164 | CHN1
 165 | CIAO3
 166 | CIAPIN1
 167 | CIRBP
 168 | CISD1
 169 | CLIC4
 170 | CLPX
 171 | CLSTN1
 172 | CLTB
 173 | CLTC
 174 | CNDP2
 175 | CNOT4
 176 | CNPY3
 177 | COASY
 178 | COG2
 179 | COG4
 180 | COG7
 181 | COL1A1
 182 | COL4A1
 183 | COPB2
 184 | COPS7A
 185 | COQ8A
 186 | CORO1A
 187 | CPNE3
 188 | CPSF4
 189 | CRAMP1
 190 | CREB1
 191 | CREG1
 192 | CRELD2
 193 | CRK
 194 | CRKL
 195 | CRTAP
 196 | CRYZ
 197 | CSK
 198 | CSNK1A1
 199 | CSNK1E
 200 | CSNK2A2
 201 | CSRP1
 202 | CTNNAL1
 203 | CTNND1
 204 | CTSD
 205 | CTSL
 206 | CTTN
 207 | CXCL2
 208 | CXCR4
 209 | CYB561
 210 | CYCS
 211 | CYTH1
 212 | DAG1
 213 | DAXX
 214 | DCK
 215 | DCTD
 216 | DCUN1D4
 217 | DDB2
 218 | DDIT4
 219 | DDR1
 220 | DDX10
 221 | DDX42
 222 | DECR1
 223 | DENND2D
 224 | DERA
 225 | DFFA
 226 | DFFB
 227 | DHDDS
 228 | DHRS7
 229 | DHX29
 230 | DIPK1A
 231 | DLD
 232 | DMAC2L
 233 | DMTF1
 234 | DNAJA3
 235 | DNAJB1
 236 | DNAJB2
 237 | DNAJB6
 238 | DNAJC15
 239 | DNM1
 240 | DNM1L
 241 | DNMT1
 242 | DNMT3A
 243 | DNTTIP2
 244 | DPH2
 245 | DRAP1
 246 | DSG2
 247 | DUSP11
 248 | DUSP14
 249 | DUSP22
 250 | DUSP3
 251 | DUSP4
 252 | DUSP6
 253 | DYNLT3
 254 | DYRK3
 255 | E2F2
 256 | EAPP
 257 | EBNA1BP2
 258 | EBP
 259 | ECD
 260 | ECH1
 261 | EDEM1
 262 | EDN1
 263 | EED
 264 | EFCAB14
 265 | EGF
 266 | EGFR
 267 | EGR1
 268 | EIF4EBP1
 269 | EIF4G1
 270 | EIF5
 271 | ELAC2
 272 | ELAVL1
 273 | ELOVL6
 274 | ELP1
 275 | EML3
 276 | ENOPH1
 277 | ENOSF1
 278 | EPB41L2
 279 | EPHA3
 280 | EPHB2
 281 | EPN2
 282 | EPRS1
 283 | ERBB2
 284 | ERBB3
 285 | ERO1A
 286 | ETFB
 287 | ETS1
 288 | ETV1
 289 | EVL
 290 | EXOSC4
 291 | EXT1
 292 | EZH2
 293 | FAH
 294 | FAIM
 295 | FAM20B
 296 | FAS
 297 | FASTKD5
 298 | FAT1
 299 | FBXL12
 300 | FBXO11
 301 | FBXO21
 302 | FBXO7
 303 | FCHO1
 304 | FDFT1
 305 | FEZ2
 306 | FGFR2
 307 | FGFR4
 308 | FHL2
 309 | FIS1
 310 | FKBP14
 311 | FKBP4
 312 | FOS
 313 | FOSL1
 314 | FOXJ3
 315 | FOXO3
 316 | FOXO4
 317 | FPGS
 318 | FRS2
 319 | FSD1
 320 | FUT1
 321 | FYN
 322 | FZD1
 323 | FZD7
 324 | G3BP1
 325 | GAA
 326 | GABPB1
 327 | GADD45A
 328 | GADD45B
 329 | GALE
 330 | GAPDH
 331 | GARRE1
 332 | GATA2
 333 | GATA3
 334 | GDPD5
 335 | GET1
 336 | GFOD1
 337 | GFPT1
 338 | GFUS
 339 | GHR
 340 | GLI2
 341 | GLOD4
 342 | GLRX
 343 | GMNN
 344 | GNA11
 345 | GNA15
 346 | GNAI1
 347 | GNAI2
 348 | GNAS
 349 | GNB5
 350 | GNPDA1
 351 | GOLT1B
 352 | GPATCH8
 353 | GPC1
 354 | GPER1
 355 | GRB10
 356 | GRB7
 357 | GRN
 358 | GRWD1
 359 | GSTM2
 360 | GSTZ1
 361 | GTF2A2
 362 | GTF2E2
 363 | GTPBP8
 364 | H2AZ2
 365 | H2BC12
 366 | H2BC21
 367 | HACD3
 368 | HADH
 369 | HAT1
 370 | HDAC2
 371 | HDAC6
 372 | HDGFL3
 373 | HEATR1
 374 | HEBP1
 375 | HERC6
 376 | HERPUD1
 377 | HES1
 378 | HIF1A
 379 | HK1
 380 | HLA-DMA
 381 | HLA-DRA
 382 | HMG20B
 383 | HMGA2
 384 | HMGCR
 385 | HMGCS1
 386 | HMOX1
 387 | HOMER2
 388 | HOOK2
 389 | HOXA10
 390 | HOXA5
 391 | HPRT1
 392 | HS2ST1
 393 | HSD17B10
 394 | HSD17B11
 395 | HSPA1A
 396 | HSPA4
 397 | HSPA8
 398 | HSPB1
 399 | HSPD1
 400 | HTATSF1
 401 | HTRA1
 402 | HYOU1
 403 | IARS2
 404 | ICAM1
 405 | ICAM3
 406 | ICMT
 407 | ID2
 408 | IDE
 409 | IER3
 410 | IFNAR1
 411 | IFRD2
 412 | IGF1R
 413 | IGF2BP2
 414 | IGF2R
 415 | IGFBP3
 416 | IGHMBP2
 417 | IKBKB
 418 | IKBKE
 419 | IKZF1
 420 | IL13RA1
 421 | IL1B
 422 | IL4R
 423 | ILK
 424 | INPP1
 425 | INPP4B
 426 | INSIG1
 427 | INTS3
 428 | IPO13
 429 | IQGAP1
 430 | ISOC1
 431 | ITFG1
 432 | ITGAE
 433 | ITGB1BP1
 434 | ITGB5
 435 | JADE2
 436 | JMJD6
 437 | JUN
 438 | KAT6A
 439 | KAT6B
 440 | KCNK1
 441 | KCTD5
 442 | KDELR2
 443 | KDM3A
 444 | KDM5A
 445 | KDM5B
 446 | KEAP1
 447 | KHDC4
 448 | KIAA0753
 449 | KIF14
 450 | KIF20A
 451 | KIF2C
 452 | KIF5C
 453 | KIFBP
 454 | KIT
 455 | KLHDC2
 456 | KLHL21
 457 | KLHL9
 458 | KLK8
 459 | KTN1
 460 | LAGE3
 461 | LAMA3
 462 | LAP3
 463 | LBR
 464 | LGALS8
 465 | LGMN
 466 | LIG1
 467 | LIPA
 468 | LOXL1
 469 | LPAR2
 470 | LPGAT1
 471 | LRP10
 472 | LRPAP1
 473 | LRRC41
 474 | LSM5
 475 | LSM6
 476 | LSR
 477 | LYN
 478 | LYPLA1
 479 | LYRM1
 480 | MACF1
 481 | MALT1
 482 | MAMLD1
 483 | MAN2B1
 484 | MAP2K5
 485 | MAP3K4
 486 | MAP4K4
 487 | MAP7
 488 | MAPK13
 489 | MAPK1IP1L
 490 | MAPK9
 491 | MAPKAPK2
 492 | MAPKAPK3
 493 | MAPKAPK5
 494 | MAST2
 495 | MAT2A
 496 | MBNL1
 497 | MBNL2
 498 | MBOAT7
 499 | MBTPS1
 500 | MCM3
 501 | MCOLN1
 502 | MCUR1
 503 | ME2
 504 | MEF2C
 505 | MELK
 506 | MEST
 507 | METRN
 508 | MFSD10
 509 | MICALL1
 510 | MIF
 511 | MINDY1
 512 | MKNK1
 513 | MLEC
 514 | MLLT11
 515 | MMP1
 516 | MMP2
 517 | MNAT1
 518 | MPC2
 519 | MPZL1
 520 | MRPL12
 521 | MRPL19
 522 | MRPS16
 523 | MRPS2
 524 | MSH6
 525 | MSRA
 526 | MTA1
 527 | MTERF3
 528 | MTF2
 529 | MTFR1
 530 | MTHFD2
 531 | MUC1
 532 | MVP
 533 | MYBL2
 534 | MYC
 535 | MYCBP
 536 | MYCBP2
 537 | MYL9
 538 | MYLK
 539 | MYO10
 540 | NCAPD2
 541 | NCK1
 542 | NCK2
 543 | NCOA3
 544 | NENF
 545 | NET1
 546 | NFATC3
 547 | NFATC4
 548 | NFE2L2
 549 | NFIL3
 550 | NFKB2
 551 | NFKBIA
 552 | NFKBIB
 553 | NFKBIE
 554 | NGRN
 555 | NIPSNAP1
 556 | NISCH
 557 | NIT1
 558 | NMT1
 559 | NNT
 560 | NOL3
 561 | NOLC1
 562 | NOS3
 563 | NOSIP
 564 | NOTCH1
 565 | NPC1
 566 | NPDC1
 567 | NPEPL1
 568 | NPRL2
 569 | NR1H2
 570 | NR2F6
 571 | NR3C1
 572 | NRAS
 573 | NRIP1
 574 | NSDHL
 575 | NT5DC2
 576 | NUCB2
 577 | NUDCD3
 578 | NUDT9
 579 | NUP133
 580 | NUP62
 581 | NUP85
 582 | NUP88
 583 | NUP93
 584 | NUSAP1
 585 | NVL
 586 | ORC1
 587 | OXA1L
 588 | OXCT1
 589 | OXSR1
 590 | P4HA2
 591 | P4HTM
 592 | PACSIN3
 593 | PAF1
 594 | PAFAH1B1
 595 | PAFAH1B3
 596 | PAICS
 597 | PAK1
 598 | PAK4
 599 | PAK6
 600 | PAN2
 601 | PARP1
 602 | PARP2
 603 | PAX8
 604 | PCBD1
 605 | PCCB
 606 | PCK2
 607 | PCM1
 608 | PCMT1
 609 | PCNA
 610 | PDGFA
 611 | PDHX
 612 | PDIA5
 613 | PDLIM1
 614 | PDS5A
 615 | PECR
 616 | PEX11A
 617 | PFKL
 618 | PGAM1
 619 | PGM1
 620 | PGRMC1
 621 | PHGDH
 622 | PHKA1
 623 | PHKB
 624 | PHKG2
 625 | PIGB
 626 | PIH1D1
 627 | PIK3C2B
 628 | PIK3C3
 629 | PIK3CA
 630 | PIK3R3
 631 | PIK3R4
 632 | PIN1
 633 | PIP4K2B
 634 | PKIG
 635 | PLA2G15
 636 | PLA2G4A
 637 | PLCB3
 638 | PLEKHJ1
 639 | PLEKHM1
 640 | PLK1
 641 | PLOD3
 642 | PLP2
 643 | PLS1
 644 | PLSCR1
 645 | PLSCR3
 646 | PMAIP1
 647 | PMM2
 648 | PNKP
 649 | POLB
 650 | POLD1
 651 | POLD4
 652 | POLE2
 653 | POLG2
 654 | POLR1C
 655 | POLR2I
 656 | POLR2K
 657 | POP4
 658 | PPARD
 659 | PPARG
 660 | PPIC
 661 | PPIE
 662 | PPOX
 663 | PPP1R13B
 664 | PPP2R3C
 665 | PPP2R5A
 666 | PPP2R5E
 667 | PRAF2
 668 | PRCP
 669 | PRKACA
 670 | PRKAG2
 671 | PRKCD
 672 | PRKCH
 673 | PRKCQ
 674 | PRKX
 675 | PROS1
 676 | PRPF4
 677 | PRR15L
 678 | PRR7
 679 | PRSS23
 680 | PRUNE1
 681 | PSIP1
 682 | PSMB10
 683 | PSMB8
 684 | PSMD10
 685 | PSMD2
 686 | PSMD4
 687 | PSMD9
 688 | PSME1
 689 | PSME2
 690 | PSMF1
 691 | PSMG1
 692 | PSRC1
 693 | PTGS2
 694 | PTK2
 695 | PTK2B
 696 | PTPN1
 697 | PTPN12
 698 | PTPN6
 699 | PTPRC
 700 | PTPRF
 701 | PTPRK
 702 | PUF60
 703 | PWP1
 704 | PXMP2
 705 | PXN
 706 | PYCR1
 707 | PYGL
 708 | RAB11FIP2
 709 | RAB21
 710 | RAB27A
 711 | RAB31
 712 | RAB4A
 713 | RAC2
 714 | RAD51C
 715 | RAD9A
 716 | RAE1
 717 | RAI14
 718 | RALA
 719 | RALB
 720 | RALGDS
 721 | RAP1GAP
 722 | RASA1
 723 | RB1
 724 | RBKS
 725 | RBM15B
 726 | RBM34
 727 | RBM6
 728 | REEP5
 729 | RELB
 730 | RFC2
 731 | RFC5
 732 | RFNG
 733 | RFX5
 734 | RGS2
 735 | RHEB
 736 | RHOA
 737 | RHOV
 738 | RNF167
 739 | RNH1
 740 | RNMT
 741 | RNPS1
 742 | RPA1
 743 | RPA2
 744 | RPA3
 745 | RPIA
 746 | RPL39L
 747 | RPN1
 748 | RPP38
 749 | RPS5
 750 | RPS6
 751 | RPS6KA1
 752 | RRAGA
 753 | RRP12
 754 | RRP1B
 755 | RRP8
 756 | RRS1
 757 | RSU1
 758 | RTN2
 759 | RUVBL1
 760 | RXYLT1
 761 | S100A13
 762 | S100A4
 763 | SACM1L
 764 | SATB1
 765 | SCAND1
 766 | SCARB1
 767 | SCCPDH
 768 | SCP2
 769 | SCRN1
 770 | SCYL3
 771 | SDHB
 772 | SENP6
 773 | SERPINE1
 774 | SESN1
 775 | SFN
 776 | SGCB
 777 | SH3BP5
 778 | SHB
 779 | SHC1
 780 | SIRT3
 781 | SKIC2
 782 | SKIC8
 783 | SKP1
 784 | SLC11A2
 785 | SLC1A4
 786 | SLC25A13
 787 | SLC25A14
 788 | SLC25A4
 789 | SLC25A46
 790 | SLC27A3
 791 | SLC2A6
 792 | SLC35A1
 793 | SLC35A3
 794 | SLC35B1
 795 | SLC35F2
 796 | SLC37A4
 797 | SLC5A6
 798 | SMAD3
 799 | SMARCA4
 800 | SMARCC1
 801 | SMARCD2
 802 | SMC1A
 803 | SMC3
 804 | SMC4
 805 | SMNDC1
 806 | SNAP25
 807 | SNCA
 808 | SNX11
 809 | SNX13
 810 | SNX6
 811 | SNX7
 812 | SOCS2
 813 | SORBS3
 814 | SOX2
 815 | SOX4
 816 | SPAG4
 817 | SPAG7
 818 | SPDEF
 819 | SPEN
 820 | SPP1
 821 | SPR
 822 | SPRED2
 823 | SPTAN1
 824 | SPTLC2
 825 | SQOR
 826 | SQSTM1
 827 | SRC
 828 | SSBP2
 829 | ST3GAL5
 830 | ST6GALNAC2
 831 | ST7
 832 | STAMBP
 833 | STAP2
 834 | STAT1
 835 | STAT3
 836 | STAT5B
 837 | STIMATE
 838 | STK10
 839 | STK25
 840 | STMN1
 841 | STUB1
 842 | STX1A
 843 | STX4
 844 | STXBP1
 845 | STXBP2
 846 | SUPV3L1
 847 | SUV39H1
 848 | SUZ12
 849 | SYK
 850 | SYNE2
 851 | SYNGR3
 852 | SYPL1
 853 | TARBP1
 854 | TATDN2
 855 | TBC1D31
 856 | TBC1D9B
 857 | TBP
 858 | TBPL1
 859 | TBX2
 860 | TBXA2R
 861 | TCEA2
 862 | TCEAL4
 863 | TCERG1
 864 | TCFL5
 865 | TCTA
 866 | TCTN1
 867 | TENT4A
 868 | TERF2IP
 869 | TERT
 870 | TES
 871 | TESK1
 872 | TEX10
 873 | TFAP2A
 874 | TFDP1
 875 | TGFB3
 876 | TGFBR2
 877 | THAP11
 878 | TIAM1
 879 | TICAM1
 880 | TIMELESS
 881 | TIMM17B
 882 | TIMM22
 883 | TIMM9
 884 | TIMP2
 885 | TIPARP
 886 | TJP1
 887 | TLCD3A
 888 | TLE1
 889 | TLK2
 890 | TLR4
 891 | TM9SF2
 892 | TM9SF3
 893 | TMCO1
 894 | TMED10
 895 | TMEM109
 896 | TMEM50A
 897 | TMEM97
 898 | TNFRSF21
 899 | TNIP1
 900 | TOMM34
 901 | TOMM70
 902 | TOP2A
 903 | TOPBP1
 904 | TOR1A
 905 | TP53
 906 | TP53BP1
 907 | TP53BP2
 908 | TPD52L2
 909 | TPM1
 910 | TRAK2
 911 | TRAM2
 912 | TRAP1
 913 | TRAPPC3
 914 | TRAPPC6A
 915 | TRIB1
 916 | TRIB3
 917 | TRIM13
 918 | TRIM2
 919 | TSC22D3
 920 | TSEN2
 921 | TSKU
 922 | TSPAN3
 923 | TSPAN4
 924 | TSPAN6
 925 | TUBB6
 926 | TWF2
 927 | TXLNA
 928 | TXNDC9
 929 | TXNL4B
 930 | TXNRD1
 931 | UBE2A
 932 | UBE2C
 933 | UBE2J1
 934 | UBE2L6
 935 | UBE3B
 936 | UBE3C
 937 | UBQLN2
 938 | UBR7
 939 | UFM1
 940 | UGDH
 941 | USP1
 942 | USP14
 943 | USP22
 944 | USP6NL
 945 | USP7
 946 | UTP14A
 947 | VAPB
 948 | VAT1
 949 | VAV3
 950 | VDAC1
 951 | VGLL4
 952 | VPS28
 953 | VPS72
 954 | WASF3
 955 | WASHC4
 956 | WASHC5
 957 | WDR7
 958 | WDTC1
 959 | WFS1
 960 | WIPF2
 961 | XBP1
 962 | XPNPEP1
 963 | XPO7
 964 | YKT6
 965 | YME1L1
 966 | YTHDF1
 967 | ZDHHC6
 968 | ZFP36
 969 | ZMIZ1
 970 | ZMYM2
 971 | ZNF131
 972 | ZNF274
 973 | ZNF318
 974 | ZNF395
 975 | ZNF451
 976 | ZNF586
 977 | ZNF589
 978 | ZW10
 979 | ABAT
 980 | ACSL1
 981 | ADCY9
 982 | ADGRA3
 983 | AFF1
 984 | AKAP1
 985 | ALMS1
 986 | ANKRD49
 987 | AREL1
 988 | ARHGEF5
 989 | ASAP2
 990 | ASL
 991 | ASMTL
 992 | BAZ1B
 993 | BAZ2B
 994 | BCKDHA
 995 | BTBD3
 996 | BTG2
 997 | BTG3
 998 | CA12
 999 | CABIN1
1000 | CD14
1001 | CD19
1002 | CEP55
1003 | CHCHD7
1004 | CISH
1005 | CKB
1006 | CLASRP
1007 | CLCN3
1008 | CMPK1
1009 | COL4A5
1010 | COQ3
1011 | CREB3L2
1012 | CTCF
1013 | CTNS
1014 | CTSV
1015 | DALRD3
1016 | DDX49
1017 | DHPS
1018 | DHX8
1019 | DLGAP5
1020 | DNAJB12
1021 | DOK4
1022 | DSE
1023 | DTNA
1024 | ECHDC1
1025 | EFNB3
1026 | EIF1B
1027 | ENTPD6
1028 | EPB41L4B
1029 | EPHA2
1030 | EPHB4
1031 | ERCC1
1032 | ERCC5
1033 | ERCC6L
1034 | ERLIN1
1035 | ETFA
1036 | F12
1037 | FADD
1038 | FAM162A
1039 | FAM3C
1040 | FANCA
1041 | FANCL
1042 | FBRS
1043 | FDX1
1044 | FEZ1
1045 | FTSJ1
1046 | FUZ
1047 | FZD5
1048 | GGA2
1049 | GM2A
1050 | GMDS
1051 | GOLIM4
1052 | GPRC5C
1053 | GRHPR
1054 | GYS1
1055 | H2BC5
1056 | HACD1
1057 | HADHB
1058 | HBE1
1059 | HDAC4
1060 | HLF
1061 | HMGCL
1062 | IFIT5
1063 | IL1RAP
1064 | IMPA2
1065 | KDELR3
1066 | KHDC1
1067 | LAMTOR3
1068 | LASP1
1069 | LHPP
1070 | LIMK2
1071 | LRRC40
1072 | LYPD3
1073 | MAD2L1BP
1074 | MAN2A2
1075 | MARK4
1076 | MEGF9
1077 | MGST2
1078 | MPDZ
1079 | NAA50
1080 | NIPBL
1081 | NSFL1C
1082 | PANK2
1083 | PARL
1084 | PARN
1085 | PDS5B
1086 | PEPD
1087 | PER2
1088 | PEX19
1089 | PHF2
1090 | PJA1
1091 | PKD1
1092 | PLPP1
1093 | POLR3B
1094 | PPIF
1095 | PPP2R3A
1096 | PPP2R5B
1097 | PPP3CA
1098 | PPP4R1
1099 | PSMD5
1100 | PTEN
1101 | QRSL1
1102 | RAB1B
1103 | RAB3GAP1
1104 | RABGGTA
1105 | RAD23B
1106 | RAP1A
1107 | RARA
1108 | RHOBTB1
1109 | RPF1
1110 | RTCA
1111 | SAP18
1112 | SCAF8
1113 | SEC14L1
1114 | SEC24B
1115 | SEC24C
1116 | SEC61A2
1117 | SENP5
1118 | SERTAD3
1119 | SETD1B
1120 | SETDB1
1121 | SFMBT1
1122 | SGK1
1123 | SIK1
1124 | SLC16A6
1125 | SLC2A1
1126 | SLC36A1
1127 | SMG7
1128 | SMYD3
1129 | SNRPD1
1130 | SNRPF
1131 | SNX2
1132 | SRF
1133 | SRPRB
1134 | SRRM1
1135 | SRSF8
1136 | STAT6
1137 | SWAP70
1138 | TARS1
1139 | TBCB
1140 | TCIRG1
1141 | TGDS
1142 | TLE3
1143 | TMEM127
1144 | TMEM131L
1145 | TNFAIP1
1146 | TOMM22
1147 | TP53TG1
1148 | TRIP10
1149 | TSN
1150 | TTF1
1151 | UBE4A
1152 | UGP2
1153 | USP20
1154 | UVRAG
1155 | VAMP7
1156 | VPS26A
1157 | WHRN
1158 | WRN
1159 | WWOX
1160 | YBX3
1161 | ZBTB14
1162 | ZBTB24
1163 | ZBTB5
1164 | ZC3H4
1165 | ZER1
1166 | ZGPAT
1167 | ZNF629
1168 | ZNF672
1169 | ZNF692
1170 | ZNF768
1171 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2021, carpenterlab
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | 1. Redistributions of source code must retain the above copyright notice, this
10 |    list of conditions and the following disclaimer.
11 | 
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 |    this list of conditions and the following disclaimer in the documentation
14 |    and/or other materials provided with the distribution.
15 | 
16 | 3. Neither the name of the copyright holder nor the names of its
17 |    contributors may be used to endorse or promote products derived from
18 |    this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # High-Dimensional Gene Expression and Morphology Profiles of Cells across 28,000 Genetic and Chemical Perturbations
  2 | Populations of cells can be perturbed by various chemical and genetic treatments and the impact on the cells’ gene expression (transcription, i.e. mRNA levels) and morphology (in an image-based assay) can be measured in high dimensions.
  3 | The patterns observed in this data can be used for more than a dozen applications in drug discovery and basic biology research.
  4 |  We provide a collection of four datasets where both gene expression and morphological data are available; roughly a thousand features are measured for each data type, across more than 28,000 chemical and genetic perturbations.
  5 |  We have defined a set of biological problems that can be investigated using these two data modalities and provided baseline analysis and evaluation metrics for addressing each.
  6 | 
  7 |  [Link to Paper](https://www.nature.com/articles/s41592-022-01667-0)
  8 | 
  9 | 
 10 | # Data Modalities
 11 | <details>
 12 | <summary>Click to expand</summary>
 13 | 
 14 | ### Gene expression (GE) profiles
 15 | Each cell has DNA in the nucleus which is transcribed into various mRNA molecules which are then translated into proteins that carry out functions in the cell.
 16 | The levels of mRNA in the cell are often biologically meaningful - collectively, mRNA levels for a cell are known as its transcriptional state; each individual mRNA level is referred to as the corresponding gene's "expression".
 17 | The L1000 assay was used to measure the transcriptional state of cells in the datasets here.
 18 | The assay reports a sample's mRNA levels for 978 genes at high-throughput, from the bulk population of cells treated with a given perturbation.
 19 | These 978 "landmark" genes capture approximately $80\%$ of the transcriptional variance for the entire genome.
 20 | The data processing tools and workflows to produce these profiles are available at https://clue.io/.
 21 | 
 22 | 
 23 | ### Cell Painting morphological (CP) profiles
 24 | We used the Cell Painting assay to measure the morphological state of cells treated with a given perturbation.
 25 | The assay captures fluorescence images of cells colored by six well-characterized fluorescent dyes to stain the nucleus, nucleoli, cytoplasmic RNA, endoplasmic reticulum, actin cytoskeleton, Golgi apparatus, plasma membrane and mitochondria.
 26 | These eight labeled cell compartments are captured through five channels of high-resolution microscopy images (_DNA, RNA, ER, AGP_, and _Mito_).
 27 | Images are then processed using [CellProfiler software](https://cellprofiler.org/) to extract thousands of features of each cell’s morphology and form a high-dimensional profile for each single cell.
 28 | These features are based on various shape, intensity and texture statistics and are then aggregated across all the single cells in a "well" (a miniature test tube); the resulting aggregates are called replicate-level profiles of perturbations.
 29 | Aggregation of replicate-level profiles across all the wells or replicates of a perturbation is called a treatment-level profile.
 30 | In our study, we used treatment-level profiles in all experiments but have provided replicate-level profiles for researchers interested in further data exploration.
 31 | 
 32 | </details>
 33 | 
 34 | # Datasets
 35 | 
 36 | - We have gathered the following five available datasets that have both Cell Painting morphological (CP) and L1000 gene expression (GE) profiles, and preprocessed the data, which came from different sources and in different formats, into a unified .csv format.
 37 | 
 38 |     - CDRP-BBBC047-Bray-CP-GE (Cell line: U2OS)
 39 |     - CDRPBIO-BBBC036-Bray-CP-GE (Cell line: U2OS)
 40 |     - LUAD-BBBC041-Caicedo-CP-GE (Cell line: A549)
 41 |     - TA-ORF-BBBC037-Rohban-CP-GE (Cell line: U2OS)
 42 |     - LINCS-Pilot1-CP-GE (Cell line: A549)
 43 | 
 44 | ## References to raw profiles and images
 45 | <details>
 46 | <summary>Click to expand</summary>
 47 | 
 48 | - CDRP-BBBC047-Bray-[CP](https://pubmed.ncbi.nlm.nih.gov/28327978/) - [GE](https://pubmed.ncbi.nlm.nih.gov/29195078/)
 49 | - CDRP-bio-BBBC036-Bray-[CP](https://pubmed.ncbi.nlm.nih.gov/28327978/) - [GE](https://pubmed.ncbi.nlm.nih.gov/29195078/)
 50 | - LUAD-BBBC041-Caicedo-[CP](https://registry.opendata.aws/cell-painting-image-collection/) - [GE](https://pubmed.ncbi.nlm.nih.gov/27478040/)
 51 | - TA-ORF-BBBC037-Rohban-[CP](https://elifesciences.org/articles/24060) - [GE](https://github.com/carpenterlab/2017_rohban_elife/tree/master/input/TA-OE-L1000-B1)
 52 | - LINCS-Pilot1-[CP](https://zenodo.org/record/3928744#.YNu3WzZKheV) - [GE](https://figshare.com/articles/dataset/L1000_data_for_profiling_comparison/13181966)
 53 | 
 54 | </details>
 55 | 
 56 | 
 57 | ## Preprocessed publicly available profiles
 58 | Preprocessed profiles (~9.5GB) are available on an S3 bucket.
 59 | They can be downloaded at no cost and without registration of any sort, using the command:
 60 | 
 61 | ```bash
 62 | aws s3 sync \
 63 |   --no-sign-request \
 64 |   s3://cellpainting-gallery/cpg0003-rosetta/broad/workspace/preprocessed_data .
 65 | ```
 66 | 
 67 | See this [wiki](https://github.com/carpenterlab/2016_bray_natprot/wiki/What-do-Cell-Painting-features-mean%3F) for sample Cell Painting images and the meaning of ([CellProfiler](https://cellprofiler.org/)-derived) Cell Painting features.
 68 | 
 69 | - AWS CLI installation instructions can be found [here](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html).
 70 | 
 71 | ### Data version
 72 | 
 73 | The [Etags](https://docs.aws.amazon.com/AmazonS3/latest/API/API_Object.html) of these files are listed [here](etag.json).
 74 | 
 75 | They were generated using:
 76 | 
 77 | ```sh
 78 | aws s3api list-objects --bucket cellpainting-gallery --prefix rosetta/broad/workspace/preprocessed_data/
 79 | ```
 80 | ### CP-L1000 Profile descriptions
 81 | 
 82 | We gathered five available data sets that had both Cell Painting morphological (CP) and L1000 gene expression (GE) profiles, preprocessed the data from different sources and in different formats into a unified .csv format, and made the data publicly available. Single cell morphological (CP) profiles were created using CellProfiler software and processed to form aggregated replicate and treatment levels using the R package [cytominer](https://github.com/cytomining/cytominer/blob/master/vignettes/cytominer-pipeline.Rmd).
 83 | We made the following three types of profiles available for the Cell Painting modality of each of the five datasets:
 84 | 
 85 | 
 86 | | Folder       | File name                                                | Description                                                                                                                                                  |
 87 | | ------------ | -------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 88 | | CellPainting | `replicate_level_cp_augmented.csv.gz`                    | Aggregated and Metadata annotated profiles which are the average of single cell profiles in each well.                                                       |
 89 | | CellPainting | `replicate_level_cp_normalized.csv.gz`                   | Normalized profiles which are the z-scored aggregated profiles, where the scores are computed using the distribution of negative controls as the reference.  |
 90 | | CellPainting | `replicate_level_cp_normalized_variable_selected.csv.gz` | Normalized variable-selected profiles, which are normalized profiles with feature selection applied                                                          |
 91 | | L1000        | `replicate_level_l1k.csv.gz`                             | Metadata annotated L1000 profiles, one per well (bulk mRNA measurements of the cells in that well).                                                          |
 92 | 
 93 | 
 94 | 
 95 | ### Metadata information
 96 | 
 97 | This [spreadsheet](https://docs.google.com/spreadsheets/d/1EpqBLJqio8ptGlZe9Ywq1OUJahKSpYNb6S4lJ9yFc0o/edit#gid=174183831) contains a description of all the metadata fields across all 8 datasets.
 98 | 
 99 | #### Keywords to match tables across modalities for each dataset
100 | 
101 | 
102 | | Dataset               | perturbation match column<br/>CP | perturbation match column<br/>GE | Control perturbation value in each of columns <br/>CP and GE |
103 | | :-------------------- | :------------------------------- | :------------------------------- | :---------------------------- |
104 | | CDRP-BBBC047-Bray     | Metadata_Sample_Dose             | pert_sample_dose                 | negcon                          |
105 | | CDRPBIO-BBBC036-Bray  | Metadata_Sample_Dose             | pert_sample_dose                 | negcon                          |
106 | | TA-ORF-BBBC037-Rohban | Metadata_broad_sample            | pert_id                          | negcon                          |
107 | | LUAD-BBBC041-Caicedo  | x_mutation_status                | allele                           | negcon                   |
108 | | LINCS-Pilot1          | Metadata_pert_id_dose            | pert_id_dose                     | negcon                          |
109 | 
110 | * Two additional columns can also be used to filter for the "Control perturbation" in each data table:
111 |    -  **pert_type**, which can take 'trt' or 'control' values; for controls, the control_type column indicates negcon (otherwise empty).
112 |    -  **control_type**, which can take 'negcon' (for control) or NaN (for treatments) values.
113 | 
114 | #### Number of features for each dataset
115 | 
116 | | Dataset  | GE  | CP<br/>`normalized` | CP<br/>`normalized_variable_selected` |
117 | | -------- | --- | ------------------- | ------------------------------------- |
118 | | CDRP     | 977 | 1565                | 727                                   |
119 | | CDRP-BIO | 977 | 1570                | 601                                   |
120 | | LUAD     | 978 | 1569                | 291                                   |
121 | | TA-ORF   | 978 | 1677                | 63                                    |
122 | | LINCS    | 978 | 1670                | 119                                   |
123 | 
124 | 
125 | # Lookup table for L1000 genes predictability
126 | 
127 | [Table](results/SingleGenePred/supplementary_D.csv)
128 | 
129 | 
130 | # License
131 | 
132 | We license the data, results, and figures as [CC0 1.0](LICENSE_CC0.md) and the source code as BSD 3-Clause.
133 | 


--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
 1 | name: rosetta
 2 | channels:
 3 |   - conda-forge
 4 | dependencies:
 5 |   - pip=22.0.4
 6 |   - conda-forge::pandas=1.4.1
 7 |   - conda-forge::scikit-learn=1.0.2
 8 |   - conda-forge::umap-learn=0.5.2
 9 |   - conda-forge::jupyter=1.0.0
10 |   - conda-forge::matplotlib=3.3.3
11 |   - conda-forge::seaborn=0.11.2
12 |   - conda-forge::openpyxl=3.0.9
13 | 


--------------------------------------------------------------------------------
/etag.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "Contents": [
  3 |         {
  4 |             "Key": "rosetta/broad/workspace/preprocessed_data/CDRP-BBBC047-Bray/CellPainting/replicate_level_cp_augmented.csv.gz",
  5 |             "LastModified": "2022-02-25T20:24:06.000Z",
  6 |             "ETag": "\"8367b77b245035279d21e083fb57564e-261\"",
  7 |             "Size": 2183033139,
  8 |             "StorageClass": "STANDARD",
  9 |             "Owner": {
 10 |                 "DisplayName": "cellpainting",
 11 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
 12 |             }
 13 |         },
 14 |         {
 15 |             "Key": "rosetta/broad/workspace/preprocessed_data/CDRP-BBBC047-Bray/CellPainting/replicate_level_cp_normalized.csv.gz",
 16 |             "LastModified": "2022-02-25T20:24:06.000Z",
 17 |             "ETag": "\"572869293e0cfacdd8882c2b758fac00-272\"",
 18 |             "Size": 2277911750,
 19 |             "StorageClass": "STANDARD",
 20 |             "Owner": {
 21 |                 "DisplayName": "cellpainting",
 22 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
 23 |             }
 24 |         },
 25 |         {
 26 |             "Key": "rosetta/broad/workspace/preprocessed_data/CDRP-BBBC047-Bray/CellPainting/replicate_level_cp_normalized_variable_selected.csv.gz",
 27 |             "LastModified": "2022-02-25T20:24:06.000Z",
 28 |             "ETag": "\"510f9c5a93436c8af2f36f0308c78be0-131\"",
 29 |             "Size": 1098352960,
 30 |             "StorageClass": "STANDARD",
 31 |             "Owner": {
 32 |                 "DisplayName": "cellpainting",
 33 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
 34 |             }
 35 |         },
 36 |         {
 37 |             "Key": "rosetta/broad/workspace/preprocessed_data/CDRP-BBBC047-Bray/L1000/replicate_level_l1k.csv.gz",
 38 |             "LastModified": "2022-02-25T20:24:06.000Z",
 39 |             "ETag": "\"40e1f7285238c5381b9d9fdeebb5a026-32\"",
 40 |             "Size": 262406281,
 41 |             "StorageClass": "STANDARD",
 42 |             "Owner": {
 43 |                 "DisplayName": "cellpainting",
 44 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
 45 |             }
 46 |         },
 47 |         {
 48 |             "Key": "rosetta/broad/workspace/preprocessed_data/CDRP-BBBC047-Bray/L1000/replicate_level_l1k_pclfc.csv.gz",
 49 |             "LastModified": "2022-02-25T20:24:06.000Z",
 50 |             "ETag": "\"630b98d69d185f530acfb0c272e82031-31\"",
 51 |             "Size": 258651159,
 52 |             "StorageClass": "STANDARD",
 53 |             "Owner": {
 54 |                 "DisplayName": "cellpainting",
 55 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
 56 |             }
 57 |         },
 58 |         {
 59 |             "Key": "rosetta/broad/workspace/preprocessed_data/CDRP-BBBC047-Bray/L1000/replicate_level_l1k_pczscore.csv.gz",
 60 |             "LastModified": "2022-02-25T20:24:13.000Z",
 61 |             "ETag": "\"5ad1f4b412c8ea9b9abb55a254a7ebbe-72\"",
 62 |             "Size": 603440498,
 63 |             "StorageClass": "STANDARD",
 64 |             "Owner": {
 65 |                 "DisplayName": "cellpainting",
 66 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
 67 |             }
 68 |         },
 69 |         {
 70 |             "Key": "rosetta/broad/workspace/preprocessed_data/CDRP-BBBC047-Bray/L1000/replicate_level_l1k_vczscore.csv.gz",
 71 |             "LastModified": "2022-02-25T20:24:13.000Z",
 72 |             "ETag": "\"b58b4d31e96964f28165f048bdfd60c8-73\"",
 73 |             "Size": 605293966,
 74 |             "StorageClass": "STANDARD",
 75 |             "Owner": {
 76 |                 "DisplayName": "cellpainting",
 77 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
 78 |             }
 79 |         },
 80 |         {
 81 |             "Key": "rosetta/broad/workspace/preprocessed_data/CDRP-BBBC047-Bray/L1000/treatment_level_l1k.csv.gz",
 82 |             "LastModified": "2022-02-25T20:24:27.000Z",
 83 |             "ETag": "\"e695e3d5f520553f516516ab8719719f-13\"",
 84 |             "Size": 107934871,
 85 |             "StorageClass": "STANDARD",
 86 |             "Owner": {
 87 |                 "DisplayName": "cellpainting",
 88 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
 89 |             }
 90 |         },
 91 |         {
 92 |             "Key": "rosetta/broad/workspace/preprocessed_data/CDRPBIO-BBBC036-Bray/CellPainting/replicate_level_cp_augmented.csv.gz",
 93 |             "LastModified": "2022-02-25T20:24:27.000Z",
 94 |             "ETag": "\"3e199aeba5209250e0d2c5948f5bd522-36\"",
 95 |             "Size": 298941736,
 96 |             "StorageClass": "STANDARD",
 97 |             "Owner": {
 98 |                 "DisplayName": "cellpainting",
 99 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
100 |             }
101 |         },
102 |         {
103 |             "Key": "rosetta/broad/workspace/preprocessed_data/CDRPBIO-BBBC036-Bray/CellPainting/replicate_level_cp_normalized.csv.gz",
104 |             "LastModified": "2022-02-25T20:24:30.000Z",
105 |             "ETag": "\"0b86065f8840aff626d64c6f52a8caf4-38\"",
106 |             "Size": 311539701,
107 |             "StorageClass": "STANDARD",
108 |             "Owner": {
109 |                 "DisplayName": "cellpainting",
110 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
111 |             }
112 |         },
113 |         {
114 |             "Key": "rosetta/broad/workspace/preprocessed_data/CDRPBIO-BBBC036-Bray/CellPainting/replicate_level_cp_normalized_variable_selected.csv.gz",
115 |             "LastModified": "2022-02-25T20:24:32.000Z",
116 |             "ETag": "\"bffd9db9578fcc70bbd7d72e0dfff773-14\"",
117 |             "Size": 117242590,
118 |             "StorageClass": "STANDARD",
119 |             "Owner": {
120 |                 "DisplayName": "cellpainting",
121 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
122 |             }
123 |         },
124 |         {
125 |             "Key": "rosetta/broad/workspace/preprocessed_data/CDRPBIO-BBBC036-Bray/L1000/replicate_level_l1k.csv.gz",
126 |             "LastModified": "2022-02-25T20:24:35.000Z",
127 |             "ETag": "\"5b45e5cb94f0466a2abb11fbac8a655e-4\"",
128 |             "Size": 26842289,
129 |             "StorageClass": "STANDARD",
130 |             "Owner": {
131 |                 "DisplayName": "cellpainting",
132 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
133 |             }
134 |         },
135 |         {
136 |             "Key": "rosetta/broad/workspace/preprocessed_data/LINCS-Pilot1/CellPainting/replicate_level_cp_augmented.csv.gz",
137 |             "LastModified": "2022-02-25T20:24:35.000Z",
138 |             "ETag": "\"9bde4d7112c06ffa1849fbfa4efa22f1-36\"",
139 |             "Size": 296762474,
140 |             "StorageClass": "STANDARD",
141 |             "Owner": {
142 |                 "DisplayName": "cellpainting",
143 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
144 |             }
145 |         },
146 |         {
147 |             "Key": "rosetta/broad/workspace/preprocessed_data/LINCS-Pilot1/CellPainting/replicate_level_cp_normalized.csv.gz",
148 |             "LastModified": "2022-02-25T20:24:36.000Z",
149 |             "ETag": "\"f42af6b4109ef9ed110004def49f6c2c-36\"",
150 |             "Size": 299683743,
151 |             "StorageClass": "STANDARD",
152 |             "Owner": {
153 |                 "DisplayName": "cellpainting",
154 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
155 |             }
156 |         },
157 |         {
158 |             "Key": "rosetta/broad/workspace/preprocessed_data/LINCS-Pilot1/CellPainting/replicate_level_cp_normalized_variable_selected.csv.gz",
159 |             "LastModified": "2022-02-25T20:24:38.000Z",
160 |             "ETag": "\"33783625dc59b0de2bf16c299f5380dd-12\"",
161 |             "Size": 94527797,
162 |             "StorageClass": "STANDARD",
163 |             "Owner": {
164 |                 "DisplayName": "cellpainting",
165 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
166 |             }
167 |         },
168 |         {
169 |             "Key": "rosetta/broad/workspace/preprocessed_data/LINCS-Pilot1/L1000/level_3.csv.gz",
170 |             "LastModified": "2022-02-25T20:24:41.000Z",
171 |             "ETag": "\"8491fe32e9b0b040f10c7d51225d6111-11\"",
172 |             "Size": 89725093,
173 |             "StorageClass": "STANDARD",
174 |             "Owner": {
175 |                 "DisplayName": "cellpainting",
176 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
177 |             }
178 |         },
179 |         {
180 |             "Key": "rosetta/broad/workspace/preprocessed_data/LINCS-Pilot1/L1000/level_4.csv.gz",
181 |             "LastModified": "2022-02-25T20:24:42.000Z",
182 |             "ETag": "\"14679d4b4cae5e12a4e7be8255bd22ff-10\"",
183 |             "Size": 78596325,
184 |             "StorageClass": "STANDARD",
185 |             "Owner": {
186 |                 "DisplayName": "cellpainting",
187 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
188 |             }
189 |         },
190 |         {
191 |             "Key": "rosetta/broad/workspace/preprocessed_data/LINCS-Pilot1/L1000/level_4W.csv.gz",
192 |             "LastModified": "2022-02-25T20:24:43.000Z",
193 |             "ETag": "\"370607c1f148942263037a7e26018303-17\"",
194 |             "Size": 140912507,
195 |             "StorageClass": "STANDARD",
196 |             "Owner": {
197 |                 "DisplayName": "cellpainting",
198 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
199 |             }
200 |         },
201 |         {
202 |             "Key": "rosetta/broad/workspace/preprocessed_data/LINCS-Pilot1/L1000/level_5_modz.csv.gz",
203 |             "LastModified": "2022-02-25T20:24:43.000Z",
204 |             "ETag": "\"5967bd8a92d2c57242436330950f1cd2\"",
205 |             "Size": 3631,
206 |             "StorageClass": "STANDARD",
207 |             "Owner": {
208 |                 "DisplayName": "cellpainting",
209 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
210 |             }
211 |         },
212 |         {
213 |             "Key": "rosetta/broad/workspace/preprocessed_data/LINCS-Pilot1/L1000/level_5_rank.csv.gz",
214 |             "LastModified": "2022-02-25T20:24:43.000Z",
215 |             "ETag": "\"83c8146ea2f8a2a6392643b3c4472727\"",
216 |             "Size": 3631,
217 |             "StorageClass": "STANDARD",
218 |             "Owner": {
219 |                 "DisplayName": "cellpainting",
220 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
221 |             }
222 |         },
223 |         {
224 |             "Key": "rosetta/broad/workspace/preprocessed_data/LINCS-Pilot1/L1000/replicate_level_l1k.csv.gz",
225 |             "LastModified": "2022-02-25T20:24:44.000Z",
226 |             "ETag": "\"872c318560ba21c9d36e805fb97992a4-10\"",
227 |             "Size": 78596337,
228 |             "StorageClass": "STANDARD",
229 |             "Owner": {
230 |                 "DisplayName": "cellpainting",
231 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
232 |             }
233 |         },
234 |         {
235 |             "Key": "rosetta/broad/workspace/preprocessed_data/LUAD-BBBC041-Caicedo/CellPainting/replicate_level_cp_augmented.csv.gz",
236 |             "LastModified": "2022-02-25T20:24:44.000Z",
237 |             "ETag": "\"11a0a26d299f09452455e0c7e44c571c-11\"",
238 |             "Size": 85105940,
239 |             "StorageClass": "STANDARD",
240 |             "Owner": {
241 |                 "DisplayName": "cellpainting",
242 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
243 |             }
244 |         },
245 |         {
246 |             "Key": "rosetta/broad/workspace/preprocessed_data/LUAD-BBBC041-Caicedo/CellPainting/replicate_level_cp_normalized.csv.gz",
247 |             "LastModified": "2022-02-25T20:24:46.000Z",
248 |             "ETag": "\"f91d40a978c96834973f24b96b8a3b02-11\"",
249 |             "Size": 88273100,
250 |             "StorageClass": "STANDARD",
251 |             "Owner": {
252 |                 "DisplayName": "cellpainting",
253 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
254 |             }
255 |         },
256 |         {
257 |             "Key": "rosetta/broad/workspace/preprocessed_data/LUAD-BBBC041-Caicedo/CellPainting/replicate_level_cp_normalized_variable_selected.csv.gz",
258 |             "LastModified": "2022-02-25T20:24:47.000Z",
259 |             "ETag": "\"1ba6936ab1188268850a798e30c4823f-2\"",
260 |             "Size": 16570136,
261 |             "StorageClass": "STANDARD",
262 |             "Owner": {
263 |                 "DisplayName": "cellpainting",
264 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
265 |             }
266 |         },
267 |         {
268 |             "Key": "rosetta/broad/workspace/preprocessed_data/LUAD-BBBC041-Caicedo/L1000/replicate_level_l1k.csv.gz",
269 |             "LastModified": "2022-02-25T20:24:47.000Z",
270 |             "ETag": "\"c1b8cabef1934d213baf797b80c4c32c-2\"",
271 |             "Size": 11448027,
272 |             "StorageClass": "STANDARD",
273 |             "Owner": {
274 |                 "DisplayName": "cellpainting",
275 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
276 |             }
277 |         },
278 |         {
279 |             "Key": "rosetta/broad/workspace/preprocessed_data/LUAD-BBBC041-Caicedo/L1000/replicate_level_l1k_Juan.csv.gz",
280 |             "LastModified": "2022-02-25T20:24:47.000Z",
281 |             "ETag": "\"587d00f75c5fa6164929e3592bf96080-4\"",
282 |             "Size": 25582111,
283 |             "StorageClass": "STANDARD",
284 |             "Owner": {
285 |                 "DisplayName": "cellpainting",
286 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
287 |             }
288 |         },
289 |         {
290 |             "Key": "rosetta/broad/workspace/preprocessed_data/LUAD-BBBC041-Caicedo/L1000/treatment_level_l1k.csv.gz",
291 |             "LastModified": "2022-02-25T20:24:48.000Z",
292 |             "ETag": "\"c7f285af2a39efc64a4c8d57854d6a0e\"",
293 |             "Size": 4575373,
294 |             "StorageClass": "STANDARD",
295 |             "Owner": {
296 |                 "DisplayName": "cellpainting",
297 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
298 |             }
299 |         },
300 |         {
301 |             "Key": "rosetta/broad/workspace/preprocessed_data/TA-ORF-BBBC037-Rohban/CellPainting/replicate_level_cp_augmented.csv.gz",
302 |             "LastModified": "2022-02-25T20:24:48.000Z",
303 |             "ETag": "\"9707bd02924cda850ed6f1e7eba33d9a-4\"",
304 |             "Size": 27548449,
305 |             "StorageClass": "STANDARD",
306 |             "Owner": {
307 |                 "DisplayName": "cellpainting",
308 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
309 |             }
310 |         },
311 |         {
312 |             "Key": "rosetta/broad/workspace/preprocessed_data/TA-ORF-BBBC037-Rohban/CellPainting/replicate_level_cp_normalized.csv.gz",
313 |             "LastModified": "2022-02-25T20:24:48.000Z",
314 |             "ETag": "\"736ef2b85bf5406f27239153f3772218-4\"",
315 |             "Size": 27482072,
316 |             "StorageClass": "STANDARD",
317 |             "Owner": {
318 |                 "DisplayName": "cellpainting",
319 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
320 |             }
321 |         },
322 |         {
323 |             "Key": "rosetta/broad/workspace/preprocessed_data/TA-ORF-BBBC037-Rohban/CellPainting/replicate_level_cp_normalized_variable_selected.csv.gz",
324 |             "LastModified": "2022-02-25T20:24:48.000Z",
325 |             "ETag": "\"1315c2fd175b265d10e929e51d9dfef0\"",
326 |             "Size": 1106334,
327 |             "StorageClass": "STANDARD",
328 |             "Owner": {
329 |                 "DisplayName": "cellpainting",
330 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
331 |             }
332 |         },
333 |         {
334 |             "Key": "rosetta/broad/workspace/preprocessed_data/TA-ORF-BBBC037-Rohban/L1000/replicate_level_l1k.csv.gz",
335 |             "LastModified": "2022-02-25T20:24:49.000Z",
336 |             "ETag": "\"1e643bb1182555a8e7699230a0ea98d1\"",
337 |             "Size": 2022367,
338 |             "StorageClass": "STANDARD",
339 |             "Owner": {
340 |                 "DisplayName": "cellpainting",
341 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
342 |             }
343 |         },
344 |         {
345 |             "Key": "rosetta/broad/workspace/preprocessed_data/TA-ORF-BBBC037-Rohban/L1000/replicate_level_l1k_QNORM.csv.gz",
346 |             "LastModified": "2022-02-25T20:24:49.000Z",
347 |             "ETag": "\"8ffb9c82772442cbbd138a6ab05a9a97\"",
348 |             "Size": 1782302,
349 |             "StorageClass": "STANDARD",
350 |             "Owner": {
351 |                 "DisplayName": "cellpainting",
352 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
353 |             }
354 |         },
355 |         {
356 |             "Key": "rosetta/broad/workspace/preprocessed_data/TA-ORF-BBBC037-Rohban/L1000/replicate_level_l1k_ZSPCQNORM.csv.gz",
357 |             "LastModified": "2022-02-25T20:24:49.000Z",
358 |             "ETag": "\"36783d73bb48bec466aeda707384c7e5\"",
359 |             "Size": 1997953,
360 |             "StorageClass": "STANDARD",
361 |             "Owner": {
362 |                 "DisplayName": "cellpainting",
363 |                 "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c"
364 |             }
365 |         }
366 |     ]
367 | }


--------------------------------------------------------------------------------
/idmap.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/idmap.xlsx


--------------------------------------------------------------------------------
/read_and_match_profiles.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 35,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "name": "stdout",
 10 |      "output_type": "stream",
 11 |      "text": [
 12 |       "The autoreload extension is already loaded. To reload it, use:\n",
 13 |       "  %reload_ext autoreload\n"
 14 |      ]
 15 |     }
 16 |    ],
 17 |    "source": [
 18 |     "%matplotlib inline\n",
 19 |     "%load_ext autoreload\n",
 20 |     "%autoreload 2\n",
 21 |     "import numpy as np\n",
 22 |     "import pandas as pd\n",
 23 |     "import matplotlib.pyplot as plt\n",
 24 |     "import seaborn as sns\n",
 25 |     "from utils.readProfiles import *"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 4,
 31 |    "metadata": {},
 32 |    "outputs": [],
 33 |    "source": [
 34 |     "# ls"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "markdown",
 39 |    "metadata": {},
 40 |    "source": [
 41 |     "### Metadata column in each dataset to match perturbations across modalities\n",
 42 |     "\n",
 43 |     "Table 1.\n",
 44 |     "\n",
 45 |     "| Dataset                  |  perturbation match column<br/>CP  | perturbation match column<br/>GE   | Control perturbation value <br/>CP/GE|\n",
 46 |     "|:----------------------|:-----------------|:-----------------------------|:--------------|\n",
 47 |     "| CDRP-BBBC047-Bray     |  Metadata_Sample_Dose | pert_sample_dose | negcon |\n",
 48 |     "| CDRPBIO-BBBC036-Bray  | Metadata_Sample_Dose | pert_sample_dose | negcon |\n",
 49 |     "| TA-ORF-BBBC037-Rohban | Metadata_broad_sample | pert_id        | negcon |\n",
 50 |     "| LUAD-BBBC041-Caicedo  |  x_mutation_status | allele             | negcon|\n",
 51 |     "| LINCS-Pilot1          | Metadata_pert_id_dose | pert_id_dose   | negcon |\n"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": 36,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "ds_info_dict = {\n",
 61 |     "    \"CDRP\": [\"CDRP-BBBC047-Bray\", [\"Metadata_Sample_Dose\", \"pert_sample_dose\"]],\n",
 62 |     "    \"CDRP-bio\": [\"CDRPBIO-BBBC036-Bray\", [\"Metadata_Sample_Dose\", \"pert_sample_dose\"]],\n",
 63 |     "    \"TAORF\": [\"TA-ORF-BBBC037-Rohban\", [\"Metadata_broad_sample\", \"pert_id\"]],\n",
 64 |     "    \"LUAD\": [\"LUAD-BBBC041-Caicedo\", [\"x_mutation_status\", \"allele\"]],\n",
 65 |     "    \"LINCS\": [\"LINCS-Pilot1\", [\"Metadata_pert_id_dose\", \"pert_id_dose\"]],\n",
 66 |     "}\n",
 67 |     "# pd.DataFrame(ds_info_dict.values(), index=ds_info_dict.keys()).to_markdown(index=False)"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": []
 76 |   },
 77 |   {
 78 |    "cell_type": "markdown",
 79 |    "metadata": {},
 80 |    "source": [
 81 |     "### In this notebook you can find examples of how to:\n",
 82 |     "- read replicate or treatment level profiles \n",
 83 |     "- match profiles across modalities\n",
 84 |     "\n",
 85 |     "\n",
 86 |     "\n",
 87 |     "* Functions used in this notebook:\n",
 88 |     "\n",
 89 |     "   - Read **treatment** level data\n",
 90 |     "      - read_treatment_level_profiles\n",
 91 |     "      \n",
 92 |     "   - Read and match **treatment** level data\n",
 93 |     "      - read_paired_treatment_level_profiles\n",
 94 |     "      \n",
 95 |     "   - Read **Replicate** level data\n",
 96 |     "      - read_replicate_level_profiles\n",
 97 |     "   \n",
 98 |     "   - Read and match **Replicate** level data\n",
 99 |     "      - read_paired_replicate_level_profiles\n"
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "markdown",
104 |    "metadata": {},
105 |    "source": [
106 |     "### User input parameters"
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "code",
111 |    "execution_count": 37,
112 |    "metadata": {},
113 |    "outputs": [],
114 |    "source": [
115 |     "####################### Root directories ###############################################\n",
116 |     "procProf_dir = \"/home/ubuntu/gallery/cpg0003-rosetta/broad/workspace/\"\n",
117 |     "# procProf_dir = \"/home/ubuntu/bucket/projects/2018_04_20_Rosetta/workspace/\"\n",
118 |     "\n",
119 |     "############################# Dataset ##################################################\n",
120 |     "# dataset options: 'LUAD', 'TAORF', 'LINCS', 'CDRP-bio', 'CDRP'\n",
121 |     "dataset = \"LUAD\"\n",
122 |     "\n",
123 |     "####################### Type of cell painting profile to read ##########################\n",
124 |     "# CP Profile Type options: 'augmented' , 'normalized', 'normalized_variable_selected'\n",
125 |     "profileType = \"normalized_variable_selected\"\n",
126 |     "\n",
127 |     "############################ Filtering low quality samples option #######################\n",
128 |     "# filtering to compounds which have high replicates for both GE and CP datasets\n",
129 |     "# highRepOverlapEnabled=0\n",
130 |     "# 'highRepUnion','highRepOverlap'\n",
131 |     "filter_perts = \"highRepUnion\"\n",
132 |     "repCorrFilePath = \"./results/RepCor/RepCorrDF.xlsx\"\n",
133 |     "\n",
134 |     "filter_repCorr_params = [filter_perts, repCorrFilePath]"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "markdown",
139 |    "metadata": {},
140 |    "source": [
141 |     "### Read Replicate level profiles"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": 14,
147 |    "metadata": {},
148 |    "outputs": [
149 |     {
150 |      "name": "stderr",
151 |      "output_type": "stream",
152 |      "text": [
153 |       "/home/ubuntu/workspace_rosetta/workspace/software/2022_Haghighi_NatureMethods/utils/readProfiles.py:54: DtypeWarning: Columns (1023,1028,1032) have mixed types. Specify dtype option on import or set low_memory=False.\n",
154 |       "  l1k_data_repLevel = pd.read_csv(dataDir + \"/L1000/replicate_level_l1k.csv.gz\")\n"
155 |      ]
156 |     }
157 |    ],
158 |    "source": [
159 |     "# dataset = \"LINCS\"\n",
160 |     "per_plate_normalized_flag = 0\n",
161 |     "[cp_data_repLevel, cp_features], [l1k_data_repLevel, l1k_features] = read_replicate_level_profiles(\n",
162 |     "    procProf_dir, dataset, profileType, per_plate_normalized_flag\n",
163 |     ")"
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "markdown",
168 |    "metadata": {},
169 |    "source": [
170 |     "### Read and pair Replicate level profiles"
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "code",
175 |    "execution_count": 6,
176 |    "metadata": {},
177 |    "outputs": [
178 |     {
179 |      "name": "stderr",
180 |      "output_type": "stream",
181 |      "text": [
182 |       "/home/ubuntu/workspace_rosetta/workspace/software/2022_Haghighi_NatureMethods/utils/readProfiles.py:51: DtypeWarning: Columns (18,19,1249,1250) have mixed types. Specify dtype option on import or set low_memory=False.\n",
183 |       "  cp_data_repLevel = pd.read_csv(\n"
184 |      ]
185 |     },
186 |     {
187 |      "name": "stdout",
188 |      "output_type": "stream",
189 |      "text": [
190 |       "LINCS: Replicate Level Shapes (nSamples x nFeatures): cp:  52223 , 119 ,  l1k:  27837 , 978\n",
191 |       "l1k n of rep:  3.0\n",
192 |       "cp n of rep:  5.0\n",
193 |       "CP: from  9394  to  4647\n",
194 |       "l1k: from  8369  to  2338\n",
195 |       "CP and l1k high rep union:  5845\n"
196 |      ]
197 |     },
198 |     {
199 |      "name": "stderr",
200 |      "output_type": "stream",
201 |      "text": [
202 |       "/home/ubuntu/workspace_rosetta/workspace/software/2022_Haghighi_NatureMethods/utils/readProfiles.py:376: FutureWarning: Passing 'suffixes' which cause duplicate columns {'pert_type_y'} in the result is deprecated and will raise a MergeError in a future version.\n",
203 |       "  mergedProfiles_repLevel = pd.merge(\n"
204 |      ]
205 |     }
206 |    ],
207 |    "source": [
208 |     "nRep = 2\n",
209 |     "per_plate_normalized_flag = 1\n",
210 |     "mergedProfiles_repLevel, cp_features, l1k_features = read_paired_replicate_level_profiles(\n",
211 |     "    procProf_dir, dataset, profileType, nRep, filter_repCorr_params, per_plate_normalized_flag\n",
212 |     ")"
213 |    ]
214 |   },
215 |   {
216 |    "cell_type": "markdown",
217 |    "metadata": {},
218 |    "source": [
219 |     "### Read treatment level profiles"
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "code",
224 |    "execution_count": 7,
225 |    "metadata": {},
226 |    "outputs": [
227 |     {
228 |      "name": "stderr",
229 |      "output_type": "stream",
230 |      "text": [
231 |       "/home/ubuntu/workspace_rosetta/workspace/software/2022_Haghighi_NatureMethods/utils/readProfiles.py:51: DtypeWarning: Columns (18,19,1249,1250) have mixed types. Specify dtype option on import or set low_memory=False.\n",
232 |       "  cp_data_repLevel = pd.read_csv(\n"
233 |      ]
234 |     },
235 |     {
236 |      "name": "stdout",
237 |      "output_type": "stream",
238 |      "text": [
239 |       "LINCS: Replicate Level Shapes (nSamples x nFeatures): cp:  52223 , 119 ,  l1k:  27837 , 978\n",
240 |       "l1k n of rep:  3.0\n",
241 |       "cp n of rep:  5.0\n",
242 |       "CP: from  9394  to  4647\n",
243 |       "l1k: from  8369  to  2338\n",
244 |       "CP and l1k high rep union:  5845\n"
245 |      ]
246 |     }
247 |    ],
248 |    "source": [
249 |     "[cp_data_treatLevel, cp_features], [\n",
250 |     "    l1k_data_treatLevel,\n",
251 |     "    l1k_features,\n",
252 |     "] = read_treatment_level_profiles(\n",
253 |     "    procProf_dir, dataset, profileType, filter_repCorr_params, per_plate_normalized_flag\n",
254 |     ")"
255 |    ]
256 |   },
257 |   {
258 |    "cell_type": "markdown",
259 |    "metadata": {},
260 |    "source": [
261 |     "### Read and pair treatment level profiles"
262 |    ]
263 |   },
264 |   {
265 |    "cell_type": "code",
266 |    "execution_count": 9,
267 |    "metadata": {},
268 |    "outputs": [
269 |     {
270 |      "name": "stderr",
271 |      "output_type": "stream",
272 |      "text": [
273 |       "/home/ubuntu/workspace_rosetta/workspace/software/2022_Haghighi_NatureMethods/utils/readProfiles.py:51: DtypeWarning: Columns (18,19,1249,1250) have mixed types. Specify dtype option on import or set low_memory=False.\n",
274 |       "  cp_data_repLevel = pd.read_csv(\n"
275 |      ]
276 |     },
277 |     {
278 |      "name": "stdout",
279 |      "output_type": "stream",
280 |      "text": [
281 |       "LINCS: Replicate Level Shapes (nSamples x nFeatures): cp:  52223 , 119 ,  l1k:  27837 , 978\n",
282 |       "l1k n of rep:  3.0\n",
283 |       "cp n of rep:  5.0\n",
284 |       "CP: from  9394  to  4647\n",
285 |       "l1k: from  8369  to  2338\n",
286 |       "CP and l1k high rep union:  5845\n",
287 |       "Treatment Level Shapes (nSamples x nFeatures+metadata): (5243, 122) (4431, 980) Merged Profiles Shape: (3828, 1101)\n"
288 |      ]
289 |     }
290 |    ],
291 |    "source": [
292 |     "mergedProfiles_treatLevel, cp_features, l1k_features = read_paired_treatment_level_profiles(\n",
293 |     "    procProf_dir, dataset, profileType, filter_repCorr_params, per_plate_normalized_flag\n",
294 |     ")"
295 |    ]
296 |   },
297 |   {
298 |    "cell_type": "code",
299 |    "execution_count": null,
300 |    "metadata": {},
301 |    "outputs": [],
302 |    "source": []
303 |   },
304 |   {
305 |    "cell_type": "code",
306 |    "execution_count": 40,
307 |    "metadata": {
308 |     "scrolled": false
309 |    },
310 |    "outputs": [],
311 |    "source": [
312 |     "# l1k_data_repLevel[ds_info_dict[dataset][1][1]].unique()\n",
313 |     "# cp_data_repLevel[ds_info_dict[dataset][1][0]].unique()"
314 |    ]
315 |   },
316 |   {
317 |    "cell_type": "code",
318 |    "execution_count": 41,
319 |    "metadata": {},
320 |    "outputs": [],
321 |    "source": [
322 |     "# per_plate_normalized_flag"
323 |    ]
324 |   },
325 |   {
326 |    "cell_type": "code",
327 |    "execution_count": null,
328 |    "metadata": {},
329 |    "outputs": [],
330 |    "source": []
331 |   },
332 |   {
333 |    "cell_type": "code",
334 |    "execution_count": null,
335 |    "metadata": {},
336 |    "outputs": [],
337 |    "source": []
338 |   }
339 |  ],
340 |  "metadata": {
341 |   "kernelspec": {
342 |    "display_name": "Python 3 (ipykernel)",
343 |    "language": "python",
344 |    "name": "python3"
345 |   },
346 |   "language_info": {
347 |    "codemirror_mode": {
348 |     "name": "ipython",
349 |     "version": 3
350 |    },
351 |    "file_extension": ".py",
352 |    "mimetype": "text/x-python",
353 |    "name": "python",
354 |    "nbconvert_exporter": "python",
355 |    "pygments_lexer": "ipython3",
356 |    "version": "3.9.0"
357 |   },
358 |   "latex_envs": {
359 |    "LaTeX_envs_menu_present": true,
360 |    "autoclose": false,
361 |    "autocomplete": true,
362 |    "bibliofile": "biblio.bib",
363 |    "cite_by": "apalike",
364 |    "current_citInitial": 1,
365 |    "eqLabelWithNumbers": true,
366 |    "eqNumInitial": 1,
367 |    "hotkeys": {
368 |     "equation": "Ctrl-E",
369 |     "itemize": "Ctrl-I"
370 |    },
371 |    "labels_anchors": false,
372 |    "latex_user_defs": false,
373 |    "report_style_numbering": false,
374 |    "user_envs_cfg": false
375 |   },
376 |   "varInspector": {
377 |    "cols": {
378 |     "lenName": 16,
379 |     "lenType": 16,
380 |     "lenVar": 40
381 |    },
382 |    "kernels_config": {
383 |     "python": {
384 |      "delete_cmd_postfix": "",
385 |      "delete_cmd_prefix": "del ",
386 |      "library": "var_list.py",
387 |      "varRefreshCmd": "print(var_dic_list())"
388 |     },
389 |     "r": {
390 |      "delete_cmd_postfix": ") ",
391 |      "delete_cmd_prefix": "rm(",
392 |      "library": "var_list.r",
393 |      "varRefreshCmd": "cat(var_dic_list()) "
394 |     }
395 |    },
396 |    "position": {
397 |     "height": "438.212px",
398 |     "left": "1507.78px",
399 |     "right": "20px",
400 |     "top": "120px",
401 |     "width": "350px"
402 |    },
403 |    "types_to_exclude": [
404 |     "module",
405 |     "function",
406 |     "builtin_function_or_method",
407 |     "instance",
408 |     "_Feature"
409 |    ],
410 |    "window_display": false
411 |   }
412 |  },
413 |  "nbformat": 4,
414 |  "nbformat_minor": 2
415 | }
416 | 


--------------------------------------------------------------------------------
/results/DAVIDoutput_CytoScapeInput_Figure2d/chart_UP_KEYWORDS_FunctionalAnot_top.txt:
--------------------------------------------------------------------------------
 1 | Category	Term	Count	%	PValue	Genes	List Total	Pop Hits	Pop Total	Fold Enrichment	Bonferroni	Benjamini	FDR
 2 | UP_KEYWORDS	Acetylation	30	54.54545454545454	5.124648987027806E-10	TOP2A, CLIC4, SLC35F2, NOLC1, GLRX, EBP, NNT, STMN1, ANXA7, PCBD1, HADH, LBR, LIG1, TXNRD1, USP22, TPM1, RPA1, PYCR1, DDX10, PAICS, HIST2H2BE, CCNA2, GNPDA1, MTHFD2, BIRC5, PSMG1, KIF2C, KIF20A, ARHGEF2, PAFAH1B3	55	3424	20581	3.2786214953271027	8.609409085647002E-8	8.609410298206715E-8	8.45567082859588E-8
 3 | UP_KEYWORDS	Phosphoprotein	36	65.45454545454545	2.0388780008061567E-4	TOP2A, CLIC4, SLC35F2, INPP1, MRPL19, FHL2, NOLC1, PSIP1, RELB, PGRMC1, OXCT1, STMN1, STX4, CPNE3, LBR, MPZL1, IER3, LIG1, TXNRD1, TPM1, RPA1, PYCR1, DDX10, PAICS, HIST2H2BE, CCNA2, TXLNA, GNPDA1, TCEA2, BIRC5, PSMG1, KIF2C, NCAPD2, KIF20A, ARHGEF2, PAFAH1B3	55	8246	20581	1.6336648071792386	0.03367652713909708	0.017126575206771716	0.016820743506650793
 4 | UP_KEYWORDS	Cell division	6	10.909090909090908	0.003440617247463365	CCNA2, LIG1, BIRC5, KIF2C, NCAPD2, ARHGEF2	55	388	20581	5.78659793814433	0.4395528677953666	0.19267456585794843	0.18923394861048506
 5 | UP_KEYWORDS	Mitosis	5	9.090909090909092	0.004928799319810609	CCNA2, BIRC5, KIF2C, NCAPD2, ARHGEF2	55	262	20581	7.141221374045802	0.5639880523221983	0.19369482099750268	0.19023598490826157
 6 | UP_KEYWORDS	Microtubule	5	9.090909090909092	0.006220048085963932	STMN1, BIRC5, KIF2C, KIF20A, ARHGEF2	55	280	20581	6.682142857142857	0.649442836226727	0.19369482099750268	0.19023598490826157
 7 | UP_KEYWORDS	Cell cycle	7	12.727272727272727	0.006917672178482239	CCNA2, LIG1, USP22, BIRC5, KIF2C, NCAPD2, ARHGEF2	55	650	20581	4.029846153846154	0.6884536989331027	0.19369482099750268	0.19023598490826157
 8 | UP_KEYWORDS	Transit peptide	6	10.909090909090908	0.013033979603529661	ALAS1, NNT, OXCT1, MTHFD2, MRPL19, HADH	55	536	20581	4.188805970149254	0.8896506576574663	0.3128155104847119	0.30722951922605635
 9 | UP_KEYWORDS	Cytoplasm	21	38.18181818181819	0.01679969816231558	TOP2A, CLIC4, ZNF274, TXNRD1, TPM1, FHL2, NOLC1, GLRX, RELB, CCNA2, GNPDA1, STMN1, BIRC5, PCBD1, PSMG1, KIF2C, NCAPD2, CPNE3, KIF20A, ARHGEF2, PAFAH1B3	55	4816	20581	1.631686046511628	0.9419427396195738	0.33607778138786243	0.3300763924345077
10 | UP_KEYWORDS	Oxidoreductase	6	10.909090909090908	0.018004166860064057	NNT, MTHFD2, P4HA2, TXNRD1, PYCR1, HADH	55	582	20581	3.857731958762886	0.9527479826636567	0.33607778138786243	0.3300763924345077
11 | UP_KEYWORDS	Mitochondrion	8	14.545454545454545	0.026249398106128065	CLIC4, ALAS1, NNT, OXCT1, MTHFD2, MRPL19, PYCR1, HADH	55	1119	20581	2.6752457551385165	0.9885390617335139	0.39814595131174274	0.3910362021811759
12 | UP_KEYWORDS	Isopeptide bond	8	14.545454545454545	0.027734439083459617	TOP2A, LIG1, TPM1, FHL2, RPA1, TCEA2, NOLC1, HIST2H2BE	55	1132	20581	2.64452296819788	0.9911312247584608	0.39814595131174274	0.3910362021811759
13 | UP_KEYWORDS	Cytoskeleton	8	14.545454545454545	0.028438996522267338	CLIC4, TPM1, STMN1, BIRC5, KIF2C, KIF20A, ARHGEF2, RELB	55	1138	20581	2.630579964850615	0.9921481397546121	0.39814595131174274	0.3910362021811759
14 | UP_KEYWORDS	Ubl conjugation	10	18.181818181818183	0.03212534559864598	TOP2A, CCNA2, TXNRD1, FHL2, RPA1, TCEA2, NOLC1, BIRC5, KIF2C, HIST2H2BE	55	1705	20581	2.1947214076246335	0.9958541680959994	0.4151583123517327	0.4077447710597374
15 | UP_KEYWORDS	Nucleus	21	38.18181818181819	0.04001475328204553	TOP2A, CLIC4, ZNF274, LIG1, TXNRD1, USP22, FHL2, RPA1, PSIP1, NOLC1, HIST2H2BE, RELB, CCNA2, EBP, TCEA2, BIRC5, PCBD1, KIF2C, NCAPD2, CPNE3, LBR	55	5244	20581	1.498512585812357	0.998951795861702	0.48017703938454637	0.4716024493955367
16 | UP_KEYWORDS	Magnesium	5	9.090909090909092	0.05656549310361622	TOP2A, LIG1, INPP1, MTHFD2, ATP2C1	55	552	20581	3.3894927536231885	0.9999435615398904	0.6335335227605017	0.6222204241397784
17 | UP_KEYWORDS	Disease mutation	12	21.818181818181817	0.06441555750669184	EBP, NPC1, NNT, OXCT1, P4HA2, TPM1, PYCR1, PCBD1, ATP2C1, HADH, SLC37A4, LBR	55	2550	20581	1.7609411764705882	0.9999861342755192	0.6763633538202644	0.6642854367877596
18 | UP_KEYWORDS	ATP-binding	8	14.545454545454545	0.07015335858609392	TOP2A, LIG1, NOLC1, DDX10, KIF2C, KIF20A, ATP2C1, PAICS	55	1391	20581	2.152120776419842	0.9999950670933881	0.6932802495566929	0.6809002451003233
19 | UP_KEYWORDS	NAD	3	5.454545454545454	0.07719857350347654	NNT, MTHFD2, HADH	55	175	20581	6.414857142857143	0.9999986253655966	0.7205200193657811	0.7076535904485349
20 | UP_KEYWORDS	NADP	3	5.454545454545454	0.08568717461020739	NNT, TXNRD1, PYCR1	55	186	20581	6.035483870967742	0.9999997089762821	0.7356053928073722	0.7224695822215264
21 | UP_KEYWORDS	Chromosome	4	7.2727272727272725	0.08757207057230622	BIRC5, KIF2C, NCAPD2, HIST2H2BE	55	400	20581	3.7419999999999995	0.9999997942405193	0.7356053928073722	0.7224695822215264
22 | UP_KEYWORDS	Alternative splicing	34	61.81818181818181	0.09860418033718393	TOP2A, ZNF274, ALAS1, SLC35F2, FHL2, NOLC1, PSIP1, ATP2C1, PGRMC1, OXCT1, GPC1, STMN1, ANXA7, STX4, HADH, SLC37A4, MPZL1, LIG1, TXNRD1, USP22, TPM1, PYCR1, PAICS, GNPDA1, NPC1, P4HA2, MTHFD2, TCEA2, BIRC5, PSMG1, KIF2C, KIF20A, ARHGEF2, TRIB1	55	10587	20581	1.2017379805421742	0.9999999733430376	0.7888334426974715	0.7747471312207309
23 | 


--------------------------------------------------------------------------------
/results/Figs_Source_Data.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/Figs_Source_Data.xlsx


--------------------------------------------------------------------------------
/results/MoAprediction/JI_cdrpbio.txt:
--------------------------------------------------------------------------------
 1 | "CP"	"GE"	"Early Fusion"	"RGCCA"	"MCIA"	"MOFA"	"iCluster"	"intNMF"	"JIVE"	"scikit-fusion"
 2 | 0.278851699	0.21674894	0.259032794	0.31199107	0.244958406	0.245157412	0.166666667	0.273648354	0.26111079	0.202512456
 3 | 0.281781202	0.217353574	0.272947479	0.299993438	0.237120438	0.24766634	0.166666667	0.284937711	0.271319067	0.204582565
 4 | 0.2718746	0.219221298	0.288572557	0.300109206	0.245544432	0.239201534	0.176470588	0.284523564	0.264215061	0.20097943
 5 | 0.274209543	0.214641334	0.266898688	0.298896298	0.248330213	0.247923298	0.166666667	0.292745201	0.256330169	0.192807023
 6 | 0.265629351	0.217429769	0.279090052	0.307711408	0.239454065	0.239942443	1	0.285721498	0.26371381	0.199625524
 7 | 0.275029708	0.2239464	0.28495489	0.301235572	0.2386484	0.254481673	0.153846154	0.282939774	0.264471233	0.1976511
 8 | 0.288765182	0.217004773	0.282173851	0.306246664	0.246732016	0.254969989	0.4	0.294275281	0.263112267	0.200504611
 9 | 0.278531041	0.223395083	0.275002364	0.293082233	0.24396531	0.242451334	0.333333333	0.291354281	0.258611036	0.209453967
10 | 0.272455248	0.231600555	0.262360363	0.295268069	0.241331141	0.255809239	0.5	0.296717155	0.261306837	0.207150644
11 | 0.275023829	0.225685487	0.271300656	0.295524507	0.246870483	0.238823953	0.16	0.274089946	0.265535744	0.20338892
12 | 0.28143867	0.224871523	0.260911678	0.303672023	0.236066059	0.239288014	0.171428571	0.294494354	0.261902235	0.209384306
13 | 0.282288852	0.223132721	0.282475537	0.290829442	0.244147491	0.256209037	0.2	0.282647138	0.259500099	0.19759714
14 | 0.287532178	0.222311311	0.26242836	0.295981315	0.243569808	0.243439708	0.157894737	0.275122015	0.260815605	0.202213235
15 | 0.301480301	0.215302881	0.270409742	0.300910495	0.239649912	0.247746601	0.1	0.28834381	0.251075084	0.204514447
16 | 0.277106418	0.217944084	0.281592592	0.309385645	0.246099862	0.247184454	0.083333333	0.286270538	0.25763249	0.207762257
17 | 0.260563304	0.229904461	0.266062905	0.301805175	0.238773842	0.237490211	0.117647059	0.285515992	0.252312053	0.20449294


--------------------------------------------------------------------------------
/results/MoAprediction/JI_lincs.txt:
--------------------------------------------------------------------------------
 1 | "CP"	"GE"	"Early Fusion"	"RGCCA"	"MCIA"	"MOFA"	"iCluster"	"intNMF"	"JIVE"	"scikit-fusion"
 2 | 0.188998329	0.152892632	0.161685084	0.201019074	0.174152986	0.186524756	0.177777778	0.159674769	0.177569702	0.119184686
 3 | 0.182455547	0.155460808	0.1663579	0.198310907	0.179608683	0.173331065	0.226415094	0.16362849	0.167412958	0.116937471
 4 | 0.193899409	0.152630737	0.174071877	0.190282351	0.181639662	0.172031122	0.070588235	0.166882412	0.169969459	0.120875801
 5 | 0.182039125	0.155030388	0.165013806	0.196283122	0.182571093	0.172776906	0.071005917	0.168034196	0.1757652	0.124514593
 6 | 0.184550744	0.147509477	0.169153875	0.187533527	0.177913267	0.17130581	0.043478261	0.159508666	0.167613402	0.113748412
 7 | 0.182496631	0.159455754	0.165133202	0.18379198	0.179867253	0.183528183	0.089430894	0.169158962	0.159833977	0.12420253
 8 | 0.171946117	0.154463604	0.178991604	0.197523283	0.19030103	0.171250903	0.2	0.172767563	0.164891016	0.123406295
 9 | 0.186694719	0.155486143	0.164093562	0.187112227	0.186310238	0.183258565	0.208333333	0.165169404	0.171638208	0.118322396
10 | 0.180338058	0.143601898	0.15861667	0.200142072	0.184137339	0.17660344	0.072164948	0.16547984	0.163073586	0.126149742
11 | 0.183605321	0.15741292	0.171381991	0.188784002	0.173252761	0.177570921	0.01369863	0.163314014	0.173093975	0.11873174
12 | 0.193222559	0.156995026	0.173813632	0.196182388	0.174394311	0.171367636	0.083333333	0.165245188	0.164822426	0.11878228
13 | 0.192671978	0.154919056	0.170624979	0.193073079	0.181786851	0.171575995	0.291139241	0.171691008	0.165878195	0.119526703
14 | 0.187967039	0.155867196	0.162878291	0.19683875	0.192637938	0.18045576	0.461538462	0.157649099	0.163766256	0.117880675
15 | 0.18544228	0.157804763	0.179560591	0.185849847	0.187052994	0.177218298	0.049382716	0.163521993	0.161072242	0.124717351
16 | 0.190641757	0.155245631	0.15904326	0.192819574	0.18380893	0.170782525	0.058823529	0.164331798	0.174827621	0.120163039
17 | 0.185672437	0.164612477	0.173447376	0.196032956	0.174881143	0.17674655	0.25	0.162468251	0.162594174	0.124472168
18 | 0.190136704	0.153092828	0.170318814	0.199112922	0.181481014	0.181176604	0.15625	0.169043792	0.158751472	0.118636228
19 | 0.184765443	0.157565332	0.173143963	0.198382252	0.179289922	0.173976218	0.45	0.172475028	0.164535448	0.11733305
20 | 0.192641148	0.154739976	0.171832001	0.197707877	0.179048991	0.182646244	0.138888889	0.169634439	0.170512163	0.120909347
21 | 0.183218664	0.156702842	0.17199648	0.189603926	0.180477667	0.179052585	0.057971014	0.171736775	0.152669412	0.116083564
22 | 0.187256874	0.147593792	0.170884185	0.198366585	0.180195479	0.175878718	0.058139535	0.169146685	0.159364501	0.118681457
23 | 0.187069918	0.149542804	0.16561384	0.187212941	0.183953164	0.182117759	0.046511628	0.166623749	0.170293955	0.120635555
24 | 0.183073028	0.153191771	0.156706199	0.198252086	0.178945469	0.183918728	0.01369863	0.164690939	0.168051618	0.12046373
25 | 0.188407875	0.158852641	0.155445963	0.199950627	0.178443793	0.179828374	0.079365079	0.167779642	0.166961766	0.119892686
26 | 0.175651634	0.171142449	0.158127203	0.191347613	0.195579897	0.178017005	0.2	0.170611727	0.166003424	0.11948773
27 | 0.17865041	0.155742474	0.18308929	0.187846662	0.185773521	0.183502339	0.06	0.167828001	0.164108882	0.120033978
28 | 0.194311319	0.158786831	0.175152329	0.188190067	0.18457021	0.17641168	0.0625	0.164302617	0.158739428	0.119227709
29 | 0.193254176	0.15960622	0.175409666	0.187384662	0.179096717	0.18390748	0.105263158	0.163219297	0.167798949	0.121511195
30 | 0.18617729	0.155678692	0.167753437	0.194289268	0.174803478	0.175536734	0.083333333	0.164659934	0.159521227	0.123949342
31 | 0.179601909	0.156356456	0.161356004	0.193791932	0.183172202	0.175497751	0.051724138	0.162390982	0.170494378	0.120299648
32 | 0.18963734	0.156625601	0.166336797	0.193811041	0.179803533	0.17315947	0.183333333	0.156586373	0.170302999	0.11827188
33 | 0.184352173	0.152024514	0.16540649	0.19131148	0.172935734	0.178541939	0.223529412	0.176281185	0.157959047	0.116664658
34 | 0.189699393	0.148682331	0.164026646	0.201137787	0.182479125	0.168811084	0.620689655	0.175702153	0.162975292	0.120901982
35 | 0.180999142	0.161272099	0.160666305	0.20022499	0.185258429	0.176733567	0.016666667	0.165612072	0.160725882	0.116746776
36 | 0.180157675	0.157840116	0.167903519	0.200828676	0.182595129	0.175124784	0.046875	0.170652949	0.16434307	0.121763081
37 | 0.175230255	0.155045636	0.166836702	0.198913695	0.177755714	0.181654699	0.051282051	0.161360514	0.162028228	0.116845951
38 | 0.184173371	0.155724161	0.16486378	0.186722298	0.172474892	0.176250337	0.058823529	0.167288187	0.161137409	0.118986834
39 | 0.187505371	0.162342476	0.170702512	0.18814401	0.180784286	0.175958655	0.043478261	0.168911293	0.158109144	0.11885624
40 | 0.184911138	0.152442416	0.170156448	0.194377939	0.174680795	0.184703302	0.111111111	0.1634296	0.157688838	0.117673344
41 | 0.200188868	0.167102791	0.175260347	0.207820825	0.180506812	0.181896395	0.452380952	0.177000251	0.159222812	0.118347626
42 | 0.185687782	0.145595456	0.166612367	0.194777853	0.180121812	0.18308245	0.06122449	0.163437656	0.167444436	0.116668697
43 | 0.184420505	0.164602208	0.162737406	0.195980252	0.179240673	0.182344867	0.117647059	0.170539941	0.166948171	0.124213738
44 | 0.189296239	0.15369093	0.175729401	0.197048519	0.181657677	0.172635318	0.060869565	0.166161194	0.160457602	0.122806357
45 | 0.185131847	0.153377751	0.157125437	0.198765646	0.176936046	0.177792184	0.1	0.159993478	0.166106644	0.123517094
46 | 0.191163865	0.153658909	0.173349824	0.196983778	0.175305658	0.184276093	0.063829787	0.165410291	0.164367546	0.122196122
47 | 0.182868061	0.152204389	0.164102408	0.193891247	0.179592803	0.171609376	0.051724138	0.162577	0.168739518	0.11732252
48 | 0.182594781	0.158816683	0.172826354	0.203075052	0.179938037	0.169869442	0.28	0.160315945	0.158795732	0.121053386
49 | 0.1857397	0.1627972	0.164690579	0.193372016	0.178605952	0.180772956	0.681818182	0.172442065	0.166283947	0.12059789
50 | 0.185656684	0.154339706	0.164199974	0.191689141	0.175449618	0.178893609	0.041666667	0.170628469	0.165995994	0.114831952
51 | 0.183158086	0.152878986	0.163597095	0.194775553	0.176895727	0.175380238	0.048	0.161713348	0.176740485	0.124128933
52 | 0.184734257	0.149737635	0.170376461	0.194929765	0.179171628	0.185745275	0.14	0.166058303	0.167017443	0.115490732
53 | 0.179232878	0.156775481	0.161159229	0.202649183	0.187129908	0.172128717	0.065420561	0.180383253	0.157901665	0.121413283
54 | 0.178903625	0.157511588	0.16991642	0.197078156	0.191806662	0.181379942	0.131578947	0.162768724	0.160685666	0.119430025
55 | 0.177152292	0.163682115	0.167529903	0.20160861	0.185839945	0.175390124	0.417721519	0.171189056	0.163199997	0.122366042
56 | 0.186724616	0.148527899	0.179844588	0.192022594	0.174337312	0.179788153	0.08	0.170737665	0.165948529	0.120087864
57 | 0.185444784	0.159524103	0.166411955	0.195164402	0.180475026	0.176058824	0.607142857	0.164282465	0.160104594	0.122952826
58 | 0.189335143	0.146887219	0.16845284	0.192139051	0.180729516	0.186868239	0.056603774	0.168942511	0.160584253	0.12571239


--------------------------------------------------------------------------------
/results/MoAprediction/pred_moa.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/MoAprediction/pred_moa.xlsx


--------------------------------------------------------------------------------
/results/MoAprediction/pred_moa_2.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/MoAprediction/pred_moa_2.xlsx


--------------------------------------------------------------------------------
/results/MoAprediction/pred_moa_CDRP.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/MoAprediction/pred_moa_CDRP.xlsx


--------------------------------------------------------------------------------
/results/MoAprediction/pred_moa_LINCS.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/MoAprediction/pred_moa_LINCS.xlsx


--------------------------------------------------------------------------------
/results/RepCor/RepCorrDF.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/RepCor/RepCorrDF.xlsx


--------------------------------------------------------------------------------
/results/SingleCPfeatPred/scores_corrected.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/SingleCPfeatPred/scores_corrected.xlsx


--------------------------------------------------------------------------------
/results/SingleGenePred/scores_corrected.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/SingleGenePred/scores_corrected.xlsx


--------------------------------------------------------------------------------
/results/SingleGenePred/scores_cross_dts_LU_LI.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/SingleGenePred/scores_cross_dts_LU_LI.xlsx


--------------------------------------------------------------------------------
/results/SingleGenePred_cpCategoryMap/CatMap-LINCS-25-lasso-ht.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/SingleGenePred_cpCategoryMap/CatMap-LINCS-25-lasso-ht.png


--------------------------------------------------------------------------------
/results/SingleGenePred_cpCategoryMap/CatMap-LUAD-9-MLP-keras-ht.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/SingleGenePred_cpCategoryMap/CatMap-LUAD-9-MLP-keras-ht.pdf


--------------------------------------------------------------------------------
/results/SingleGenePred_cpCategoryMap/CatMap-LUAD-9-MLP-keras-ht.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/SingleGenePred_cpCategoryMap/CatMap-LUAD-9-MLP-keras-ht.png


--------------------------------------------------------------------------------
/results/SingleGenePred_cpCategoryMap/CatMap-LUAD-9-lasso-ht.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/SingleGenePred_cpCategoryMap/CatMap-LUAD-9-lasso-ht.png


--------------------------------------------------------------------------------
/results/SingleGenePred_cpCategoryMap/cat_scores_maps.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/SingleGenePred_cpCategoryMap/cat_scores_maps.xlsx


--------------------------------------------------------------------------------
/utils/pred_models.py:
--------------------------------------------------------------------------------
  1 | from sklearn.model_selection import (
  2 |     cross_val_score,
  3 |     cross_val_predict,
  4 |     GroupKFold,
  5 |     LeaveOneGroupOut,
  6 | )
  7 | from sklearn.model_selection import GridSearchCV
  8 | from sklearn import metrics
  9 | import numpy as np
 10 | from sklearn import preprocessing
 11 | from warnings import simplefilter
 12 | from sklearn.neural_network import MLPRegressor
 13 | from sklearn.exceptions import ConvergenceWarning
 14 | from sklearn.model_selection import train_test_split
 15 | from sklearn.svm import SVR
 16 | 
 17 | # simplefilter("ignore", category=ConvergenceWarning)
 18 | # from sklearn.exceptions import ConvergenceWarning
 19 | # ConvergenceWarning('ignore')
 20 | 
 21 | 
 22 | ########################## Lasso models
 23 | def lasso_cv(X, y, k, group_labels):
 24 |     """
 25 |     X: CP data [perts/samples, features]
 26 |     y: lm gene expression value [perts/samples, 1 (feature value)]
 27 | 
 28 |     Returns:
 29 |         prediction scores, y permutated scores
 30 |     """
 31 |     from sklearn import linear_model
 32 | 
 33 |     n_j = 3
 34 |     # build sklearn model
 35 |     clf = linear_model.Lasso(alpha=0.1, max_iter=10000)
 36 | 
 37 |     #     k=np.unique(group_labels).shape[0]
 38 |     split_obj = GroupKFold(n_splits=k)
 39 |     #     split_obj = LeaveOneGroupOut()
 40 |     # Perform k-fold cross validation
 41 |     scores = cross_val_score(clf, X, y, groups=group_labels, cv=split_obj, n_jobs=n_j)
 42 | 
 43 |     # Perform k-fold cross validation on the shuffled vector of lm GE across samples
 44 |     # y.sample(frac = 1) this just shuffles the vector
 45 |     scores_rand = cross_val_score(
 46 |         clf, X, y.sample(frac=1), groups=group_labels, cv=split_obj, n_jobs=n_j
 47 |     )
 48 |     return scores, scores_rand
 49 | 
 50 | 
 51 | def lasso_cv_plus_model_selection(X0, y0, k, group_labels, rand_added_flag):
 52 |     """
 53 |     X: CP data [perts/samples, features]
 54 |     y: lm gene expression value [perts/samples, 1 (feature value)]
 55 | 
 56 |     Returns:
 57 |         prediction scores, y permutated scores
 58 |     """
 59 |     from sklearn import linear_model
 60 | 
 61 |     n_j = 3
 62 |     # build sklearn model
 63 |     clf = linear_model.Lasso(alpha=0.1, max_iter=1000)
 64 | 
 65 |     #     k=np.unique(group_labels).shape[0]
 66 |     split_obj = GroupKFold(n_splits=k)
 67 |     #     split_obj = LeaveOneGroupOut()
 68 |     # Perform k-fold cross validation
 69 | 
 70 |     #     alphas = np.linspace(0, 0.02, 11)
 71 |     alphas1 = np.linspace(0, 0.2, 20)
 72 |     alphas2 = np.linspace(0.2, 0.5, 10)[1:]
 73 |     alphas = np.concatenate((alphas1, alphas2))
 74 |     #     alphas = np.logspace(-4, -0.5, 30)
 75 |     lasso_cv = linear_model.LassoCV(
 76 |         alphas=alphas, random_state=0, max_iter=1000, selection="random", n_jobs=k
 77 |     )
 78 |     #     lasso_cv = linear_model.LassoLarsCV(cv=5)
 79 |     X, y = X0.values, y0.values
 80 | 
 81 |     #     scores=np.zeros(k,)
 82 |     scores = []
 83 |     for train_index, test_index in split_obj.split(X, y, group_labels):
 84 |         #         print("TRAIN:", train_index, "TEST:", test_index)
 85 |         X_train, X_test = X[train_index], X[test_index]
 86 |         y_train, y_test = y[train_index], y[test_index]
 87 | 
 88 |         lasso_cv.fit(X_train, y_train)
 89 |         scores.append(lasso_cv.score(X_test, y_test))
 90 |     #         print(lasso_cv.alpha_)
 91 | 
 92 |     # Perform k-fold cross validation on the shuffled vector of lm GE across samples
 93 |     # y.sample(frac = 1) this just shuffles the vector
 94 |     if rand_added_flag:
 95 |         scores_rand = cross_val_score(
 96 |             clf, X0, y0.sample(frac=1), groups=group_labels, cv=split_obj, n_jobs=n_j
 97 |         )
 98 |     else:
 99 |         scores_rand = 0
100 |     return np.array(scores), scores_rand
101 | 
102 | 
def ridge_cv_plus_model_selection(X0, y0, k, group_labels, rand_added_flag):
    """
    Grouped k-fold evaluation of a Ridge whose alpha is tuned by RidgeCV.

    X0: CP data [perts/samples, features]; `.values` is taken, so a pandas object
    y0: lm gene expression value [perts/samples, 1 (feature value)]; pandas object
    k: number of GroupKFold splits
    group_labels: per-sample group labels used by the outer GroupKFold
    rand_added_flag: when truthy, also score a fixed-alpha Ridge on a shuffled
        copy of y0

    Returns:
        (np.ndarray of per-fold scores, shuffled-target scores or 0)
    """
    from sklearn import linear_model

    folds = GroupKFold(n_splits=k)

    # Candidate alphas: 10 points on [0.1, 0.2] followed by 9 points on (0.2, 0.5]
    alpha_grid = np.concatenate(
        (np.linspace(0.1, 0.2, 10), np.linspace(0.2, 0.5, 10)[1:])
    )
    ridge_cv = linear_model.RidgeCV(alpha_grid)

    X, y = X0.values, y0.values

    # Refit on each training fold (fit returns the estimator) and score on the
    # held-out fold.
    scores = [
        ridge_cv.fit(X[train_idx], y[train_idx]).score(X[test_idx], y[test_idx])
        for train_idx, test_idx in folds.split(X, y, group_labels)
    ]

    if rand_added_flag:
        # Baseline on a shuffled target uses a plain Ridge with alpha fixed at 0.1
        baseline = linear_model.Ridge(alpha=0.1, max_iter=10000)
        scores_rand = cross_val_score(
            baseline, X0, y0.sample(frac=1), groups=group_labels, cv=folds, n_jobs=3
        )
    else:
        scores_rand = 0
    return np.array(scores), scores_rand
154 | 
155 | 
156 | ########################## MLP
157 | # def MLP_cv(X,y,k,group_labels):
158 | #     from sklearn.neural_network import MLPRegressor
159 | 
160 | #     n_j=-1
161 | # #     hidden_layer_sizes=100,
162 | # #     hidden_layer_sizes = (50, 20, 10)
163 | #     regr = MLPRegressor(random_state=1,hidden_layer_sizes = (100), max_iter=10000,activation='tanh',early_stopping=True)
164 | 
165 | #     split_obj=GroupKFold(n_splits=k)
166 | #     # Perform k-fold cross validation
167 | #     scores = cross_val_score(regr, X, y, groups=group_labels,cv=split_obj,n_jobs=n_j)
168 | 
169 | #     # Perform k-fold cross validation on the shuffled vector of lm GE across samples
170 | #     # y.sample(frac = 1) this just shuffles the vector
171 | #     scores_rand = cross_val_score(regr, X, y.sample(frac = 1) ,groups=group_labels,cv=split_obj,n_jobs=n_j)
172 | #     return scores, scores_rand
173 | # X is train samples and y is the corresponding labels
174 | 
175 | 
def MLP_cv(X, y, k, group_labels, rand_added_flag):
    """
    Score a fixed-configuration MLPRegressor with grouped k-fold cross validation.

    The network uses hidden layers of 50 and 10 units, logistic activation,
    alpha=0.01 and early stopping.

    X: training samples
    y: corresponding labels; must provide `.sample` when rand_added_flag is truthy
    k: number of GroupKFold splits
    group_labels: per-sample group labels handed to the splitter as `groups`
    rand_added_flag: when truthy, also score the model on a shuffled copy of y

    Returns:
        (per-fold scores, shuffled-target scores or 0)
    """
    from sklearn.neural_network import MLPRegressor

    folds = GroupKFold(n_splits=k)
    network = MLPRegressor(
        hidden_layer_sizes=(50, 10),
        activation="logistic",
        alpha=0.01,
        early_stopping=True,
    )
    # Keyword arguments shared by both cross-validation runs
    cv_kwargs = dict(groups=group_labels, cv=folds, n_jobs=-1)

    scores = cross_val_score(network, X, y, **cv_kwargs)

    if not rand_added_flag:
        return scores, 0

    # Baseline: identical procedure on a shuffled copy of the target
    scores_rand = cross_val_score(network, X, y.sample(frac=1), **cv_kwargs)
    return scores, scores_rand
204 | 
205 | 
def MLP_cv_plus_model_selection(X0, y0, k, group_labels, rand_added_flag):
    """
    Grouped k-fold evaluation of an MLPRegressor tuned by grid search per fold.

    X0: feature matrix; it is indexed directly with the integer fold indices
        (`X0[train_index]`), unlike y0 which is converted with `.values`
    y0: target values; pandas object (`.values` / `.sample` are used)
    k: number of GroupKFold splits (also passed to GridSearchCV as n_jobs)
    group_labels: per-sample group labels used by the outer GroupKFold
    rand_added_flag: when truthy, also score the untuned base MLP on a shuffled
        copy of y0

    Returns:
        (list of per-fold scores of the best estimator, shuffled-target scores or 0)

    Prints the best parameter set found for every fold.
    """
    base_mlp = MLPRegressor(
        random_state=0,
        early_stopping=True,
        n_iter_no_change=4,
        learning_rate="adaptive",
    )

    search_grid = {
        "max_iter": [10, 100, 300, 500],
        "hidden_layer_sizes": [
            (32, 64),
            (64, 32),
            (50, 10),
            (50, 10, 10),
            (20, 10),
            (),
        ],
        "activation": ["logistic", "tanh"],
        "alpha": [0.0005, 0.01, 0.3, 1, 2, 3, 4, 5, 6, 7],
    }
    # Inner model selection uses a plain 4-fold CV on each training fold
    tuner = GridSearchCV(base_mlp, search_grid, n_jobs=k, cv=4)

    folds = GroupKFold(n_splits=k)
    targets = y0.values

    scores = []
    for train_idx, test_idx in folds.split(X0, targets, group_labels):
        tuner.fit(X0[train_idx], targets[train_idx])
        best_model = tuner.best_estimator_
        scores.append(best_model.score(X0[test_idx], targets[test_idx]))
        print(tuner.best_params_)

    if not rand_added_flag:
        return scores, 0

    # Baseline: the untuned base MLP cross-validated on a shuffled target
    scores_rand = cross_val_score(
        base_mlp, X0, y0.sample(frac=1), groups=group_labels, cv=folds, n_jobs=-1
    )
    return scores, scores_rand
274 | 
275 | 
def MLP_cv_plus_model_selection_keras(X0, y0, k, group_labels, rand_added_flag):
    """
    Grouped k-fold evaluation of a small Keras network trained with early stopping.

    X0: feature matrix providing `.shape` and `.reshape`; it is reshaped to
        (X0.shape[0], X0.shape[1], 1) before being fed to the network
    y0: target values; `.values` is taken
    k: number of GroupKFold splits
    group_labels: per-sample group labels used by GroupKFold
    rand_added_flag: accepted but not used by this function

    Returns:
        (list of per-fold r2_score values, 0)
    """
    from keras.models import Sequential
    from keras.layers import Dense, Conv1D, Flatten, Dropout
    from sklearn.metrics import mean_squared_error, r2_score
    from keras.callbacks import EarlyStopping
    from keras import backend as K

    n_samples, n_features = X0.shape[0], X0.shape[1]
    inputs = X0.reshape(n_samples, n_features, 1)
    targets = y0.values

    # Layers are created and added in the same order as a sequence of add() calls
    network = Sequential()
    for layer in (
        Dense(16, activation="relu", input_shape=(n_features, 1)),
        Flatten(),
        Dropout(0.6),
        Dense(64, activation="relu"),
        Dropout(0.2),
        Dense(1),
    ):
        network.add(layer)
    network.compile(loss="mse", optimizer="adam")

    stopper = EarlyStopping(monitor="val_loss", mode="min", verbose=0, patience=10)

    # Snapshot of the freshly initialised weights, restored before every fold
    initial_weights = network.get_weights()

    folds = GroupKFold(n_splits=k)

    scores = []
    for train_idx, test_idx in folds.split(inputs, targets, group_labels):
        fold_X, held_out_X = inputs[train_idx], inputs[test_idx]
        fold_y, held_out_y = targets[train_idx], targets[test_idx]

        # 10% of the training fold is held back as the early-stopping validation set
        fit_X, val_X, fit_y, val_y = train_test_split(fold_X, fold_y, test_size=0.1)

        network.set_weights(initial_weights)
        network.fit(
            fit_X,
            fit_y,
            batch_size=fit_X.shape[0],  # one batch per epoch
            epochs=1000,
            validation_data=(val_X, val_y),
            callbacks=[stopper],
            verbose=0,
        )
        predictions = network.predict(held_out_X)
        scores.append(r2_score(held_out_y, predictions))

    return scores, 0
328 | 
329 | 
def SVR_cv_plus_model_selection(X0, y0, k, group_labels, rand_added_flag):
    """
    Grouped k-fold evaluation of an SVR (epsilon=0.2) tuned by grid search per fold.

    X0: feature matrix; it is indexed directly with the integer fold indices
        (`X0[train_index]`), unlike y0 which is converted with `.values`
    y0: target values; pandas object (`.values` / `.sample` are used)
    k: number of GroupKFold splits (also passed to GridSearchCV as n_jobs)
    group_labels: per-sample group labels used by the outer GroupKFold
    rand_added_flag: when truthy, also score the untuned base SVR on a shuffled
        copy of y0

    Returns:
        (list of per-fold scores of the best estimator, shuffled-target scores or 0)

    Prints the best parameter set found for every fold.
    """
    base_svr = SVR(epsilon=0.2)

    search_grid = {
        "kernel": ("poly", "rbf", "sigmoid"),
        "C": [1, 2, 3, 5, 20, 100, 500, 1000],
        "degree": [1, 2, 3, 4],
        "coef0": [0.01, 0.5, 1, 10],
        "gamma": ("auto", "scale"),
    }
    # Inner model selection uses a plain 4-fold CV on each training fold
    tuner = GridSearchCV(base_svr, search_grid, n_jobs=k, cv=4)

    folds = GroupKFold(n_splits=k)
    targets = y0.values

    scores = []
    for train_idx, test_idx in folds.split(X0, targets, group_labels):
        tuner.fit(X0[train_idx], targets[train_idx])
        best_model = tuner.best_estimator_
        scores.append(best_model.score(X0[test_idx], targets[test_idx]))
        print(tuner.best_params_)

    if not rand_added_flag:
        return scores, 0

    # Baseline: the untuned base SVR cross-validated on a shuffled target
    scores_rand = cross_val_score(
        base_svr, X0, y0.sample(frac=1), groups=group_labels, cv=folds, n_jobs=-1
    )
    return scores, scores_rand
387 | 
388 | 
def MLP_cv_plus_model_selection_rand_test(X0, y0, k, group_labels, rand_added_flag):
    """
    Grouped k-fold evaluation of an MLPRegressor tuned by grid search per fold.

    Differs from MLP_cv_plus_model_selection in the base estimator settings
    (n_iter_no_change=20, default learning rate) and in a smaller search grid.

    X0: feature matrix; it is indexed directly with the integer fold indices
        (`X0[train_index]`), unlike y0 which is converted with `.values`
    y0: target values; pandas object (`.values` / `.sample` are used)
    k: number of GroupKFold splits (also passed to GridSearchCV as n_jobs)
    group_labels: per-sample group labels used by the outer GroupKFold
    rand_added_flag: when truthy, also score the untuned base MLP on a shuffled
        copy of y0

    Returns:
        (list of per-fold scores of the best estimator, shuffled-target scores or 0)

    Prints the best parameter set found for every fold.
    """
    base_mlp = MLPRegressor(random_state=0, early_stopping=True, n_iter_no_change=20)

    search_grid = {
        "max_iter": [10, 100, 300, 500],
        "hidden_layer_sizes": [
            (32, 64),
            (64, 32),
            (50, 10),
            (50, 10, 10),
            (20, 10),
        ],
        "activation": ["logistic", "tanh"],
        "alpha": [0.0005, 0.01, 0.3, 1, 2],
    }
    # Inner model selection uses a plain 4-fold CV on each training fold
    tuner = GridSearchCV(base_mlp, search_grid, n_jobs=k, cv=4)

    folds = GroupKFold(n_splits=k)
    targets = y0.values

    scores = []
    for train_idx, test_idx in folds.split(X0, targets, group_labels):
        tuner.fit(X0[train_idx], targets[train_idx])
        best_model = tuner.best_estimator_
        scores.append(best_model.score(X0[test_idx], targets[test_idx]))
        print(tuner.best_params_)

    if not rand_added_flag:
        return scores, 0

    # Baseline: the untuned base MLP cross-validated on a shuffled target
    scores_rand = cross_val_score(
        base_mlp, X0, y0.sample(frac=1), groups=group_labels, cv=folds, n_jobs=-1
    )
    return scores, scores_rand
451 | 
452 | 
def MLP_cv_plus_model_selection_taorf(X0, y0, k, group_labels, rand_added_flag):
    """
    Grouped k-fold evaluation of an MLPRegressor tuned by grid search per fold.

    X0: feature matrix; pandas object (`.values` is taken)
    y0: target values; pandas object (`.values` / `.sample` are used)
    k: number of GroupKFold splits
    group_labels: per-sample group labels used by the outer GroupKFold
    rand_added_flag: when truthy, also score the untuned base MLP on a shuffled
        copy of y0

    Returns:
        (list of per-fold scores of the fitted grid search, shuffled-target
        scores or 0)

    Prints the best parameter set found for every fold.
    """
    from sklearn.neural_network import MLPRegressor

    base_mlp = MLPRegressor(random_state=0, max_iter=1000)

    search_grid = {
        "hidden_layer_sizes": [(50,), (10, 30, 10), (50, 10), (50, 10, 10)],
        "activation": ["tanh", "relu", "logistic"],
        "alpha": [0.0001, 0.05, 0.01, 0.2, 0.5, 0.7],
        "learning_rate": ["constant", "adaptive"],
    }
    # Inner model selection uses a plain 2-fold CV with 6 parallel jobs
    tuner = GridSearchCV(base_mlp, search_grid, n_jobs=6, cv=2)

    folds = GroupKFold(n_splits=k)
    features, targets = X0.values, y0.values

    scores = []
    for train_idx, test_idx in folds.split(features, targets, group_labels):
        tuner.fit(features[train_idx], targets[train_idx])
        scores.append(tuner.score(features[test_idx], targets[test_idx]))
        print(tuner.best_params_)

    if not rand_added_flag:
        return scores, 0

    # Baseline: the untuned base MLP cross-validated on a shuffled target
    scores_rand = cross_val_score(
        base_mlp,
        features,
        y0.sample(frac=1),
        groups=group_labels,
        cv=folds,
        n_jobs=-1,
    )
    return scores, scores_rand
518 | 
519 | 
520 | # from sklearn.model_selection import RandomizedSearchCV
521 | # # Number of trees in random forest
522 | # n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
523 | # # Number of features to consider at every split
524 | # max_features = ['auto', 'sqrt']
525 | # # Maximum number of levels in tree
526 | # max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
527 | # max_depth.append(None)
528 | # # Minimum number of samples required to split a node
529 | # min_samples_split = [2, 5, 10]
530 | # # Minimum number of samples required at each leaf node
531 | # min_samples_leaf = [1, 2, 4]
532 | # # Method of selecting samples for training each tree
533 | # bootstrap = [True, False]
534 | # # Create the random grid
535 | # random_grid = {'n_estimators': n_estimators,
536 | #                'max_features': max_features,
537 | #                'max_depth': max_depth,
538 | #                'min_samples_split': min_samples_split,
539 | #                'min_samples_leaf': min_samples_leaf,
540 | #                'bootstrap': bootstrap}
541 | # pprint(random_grid)
542 | 
543 | 
544 | ########################## Random Forest
def RFR_cv_plus_model_selection(X0, y0, k, group_labels, rand_added_flag):
    """
    Grouped k-fold evaluation of a RandomForestRegressor tuned by grid search.

    For every outer GroupKFold split the hyper-parameters are selected with
    GridSearchCV on the training fold only, and the refitted search object is
    scored on the held-out fold.

    X0: CP data [perts/samples, features]; pandas object (`.values` is taken)
    y0: target values; pandas object (`.values` / `.sample` are used)
    k: number of GroupKFold splits
    group_labels: per-sample group labels used by the outer GroupKFold
    rand_added_flag: accepted for signature parity with the other
        *_cv_plus_model_selection functions; the shuffled-target baseline is
        always computed here regardless of its value

    Returns:
        (list of per-fold scores, shuffled-target baseline scores)

    Prints the best parameter set found for every fold.
    """
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import GridSearchCV

    n_j = -1

    parameter_space = {
        "max_depth": [10, 20, None],
        "min_samples_leaf": [1, 4],
        "min_samples_split": [2, 5, 10],
    }

    # max_features=1.0 considers every feature at each split, which is what
    # "auto" meant for regressors; the "auto" string is rejected by newer
    # scikit-learn releases.
    rfr_gs = RandomForestRegressor(bootstrap=True, max_features=1.0)

    split_obj = GroupKFold(n_splits=k)

    clf = GridSearchCV(rfr_gs, parameter_space, n_jobs=-1, cv=2)

    X, y = X0.values, y0.values

    scores = []
    for train_index, test_index in split_obj.split(X, y, group_labels):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Tune and fit on the training fold only: fitting on the full data
        # would leak the held-out samples into the model being scored.
        clf.fit(X_train, y_train)
        scores.append(clf.score(X_test, y_test))
        print(clf.best_params_)

    # Baseline: the untuned forest cross-validated on a shuffled target
    # (y.sample(frac=1) just shuffles the vector)
    scores_rand = cross_val_score(
        rfr_gs, X0, y0.sample(frac=1), groups=group_labels, cv=split_obj, n_jobs=n_j
    )
    return scores, scores_rand
594 | 
595 | 
596 | ############################## Feature Ranking #########################
def linear_model_feature_ranking(X0, y0, k, group_labels, l1k_features_gn):
    """
    Compute two per-feature relevance measures of X0 with respect to y0.

    X0: CP data [perts/samples, features]; pandas object (`.values` is taken)
    y0: lm gene expression value [perts/samples, 1 (feature value)]; pandas object
    k, group_labels, l1k_features_gn: accepted for signature compatibility with
        the cross-validation helpers in this module; not used here (no
        cross-validation is performed, both models are fit on all samples)

    Returns:
        (coef, mi_scores):
        coef: `coef_` of an ordinary least squares LinearRegression fit on (X0, y0)
        mi_scores: `scores_` of SelectKBest using mutual_info_regression with
            k="all", i.e. one score per feature
    """
    from sklearn import linear_model
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import mutual_info_regression

    X, y = X0.values, y0.values

    # Mutual information between every feature and the target
    fs = SelectKBest(score_func=mutual_info_regression, k="all")
    fs.fit(X, y)

    # Ordinary least squares coefficients
    clf = linear_model.LinearRegression()
    clf.fit(X, y)
    return clf.coef_, fs.scores_
634 | 
635 | 
636 | #     return ranking(np.abs(lasso_cv.coef_), l1k_features_gn)
637 | 
638 | 
# Module-level rankings dictionary.
# NOTE(review): nothing in this module reads or writes it (`ranking` below
# shadows the name with its own parameter) -- confirm whether external callers
# rely on it before removing.
ranks = {}
640 | 
641 | 
# Build a {feature name: min-max scaled score} dictionary from raw ranking scores
def ranking(ranks, names, order=1):
    """
    Scale scores to [0, 1] with MinMaxScaler and map them to their names.

    ranks: sequence of raw scores, one per name
    names: iterable of feature names, paired with the scores by position
    order: multiplier applied before scaling (e.g. -1 to invert the ordering)

    Returns:
        dict mapping each name to its scaled score rounded to 2 decimals
    """
    # Column vector of shape (n, 1), as expected by the scaler
    column = order * np.array([ranks]).T
    scaled = preprocessing.MinMaxScaler().fit_transform(column).T[0]
    return {name: round(value, 2) for name, value in zip(names, scaled)}
648 | 


--------------------------------------------------------------------------------
/utils/readProfiles.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import scipy.spatial
  3 | import pandas as pd
  4 | import sklearn.decomposition
  5 | from sklearn import preprocessing
  6 | from sklearn.metrics import pairwise_distances
  7 | 
  8 | # from utils.normalize_funcs import standardize_per_catX
  9 | # from normalize_funcs import standardize_per_catX
 10 | 
# Per-dataset lookup table:
#   'dataset_name': ['folder_name', [cp_pert_col_name, l1k_pert_col_name]]
# folder_name is the dataset sub-directory under "<root>/preprocessed_data/";
# the two column names are the perturbation-identifier columns of the Cell
# Painting and L1000 tables (both are renamed to "PERT" when treatment-level
# profiles are read).
# NOTE(review): the original header comment also listed a third element,
# [cp_control_val, l1k_control_val]; no entry below defines it.
ds_info_dict = {
    "CDRP": ["CDRP-BBBC047-Bray", ["Metadata_Sample_Dose", "pert_sample_dose"]],
    "CDRP-bio": ["CDRPBIO-BBBC036-Bray", ["Metadata_Sample_Dose", "pert_sample_dose"]],
    "TAORF": [
        "TA-ORF-BBBC037-Rohban",
        [
            "Metadata_broad_sample",
            "pert_id",
        ],
    ],
    "LUAD": ["LUAD-BBBC041-Caicedo", ["x_mutation_status", "allele"]],
    "LINCS": ["LINCS-Pilot1", ["Metadata_pert_id_dose", "pert_id_dose"]],
}

# Common column name for the perturbation identifier in both modalities
labelCol = "PERT"
 27 | 
 28 | 
 29 | ################################################################################
 30 | def read_replicate_level_profiles(
 31 |     dataset_rootDir, dataset, profileType, per_plate_normalized_flag
 32 | ):
 33 |     """
 34 |     Reads replicate level CSV files in the form of a dataframe
 35 |     Extract measurments column names for each modalities
 36 |     Remove columns with low variance (<thrsh_var)
 37 |     Remove columns with more NaNs than a certain threshold (>null_vals_ratio)
 38 | 
 39 |     Inputs:
 40 |     dataset_rootDir: datasets root dir
 41 |     dataset: any from the available list of ['LUAD', 'TAORF', 'LINCS', 'CDRP-bio', 'CDRP']
 42 |     profileType:   Cell Painting profile type that can be 'augmented' , 'normalized', 'normalized_variable_selected'
 43 |     per_plate_normalized_flag: if True it will standardize data per plate
 44 | 
 45 |     Output:
 46 |     cp_data_repLevel, l1k_data_repLevel: dataframes with all the annotations available in the raw data
 47 |     """
 48 | 
 49 |     dataDir = dataset_rootDir + "/preprocessed_data/" + ds_info_dict[dataset][0] + "/"
 50 | 
 51 |     cp_data_repLevel = pd.read_csv(
 52 |         dataDir + "/CellPainting/replicate_level_cp_" + profileType + ".csv.gz"
 53 |     )
 54 |     l1k_data_repLevel = pd.read_csv(dataDir + "/L1000/replicate_level_l1k.csv.gz")
 55 | 
 56 |     cp_features, l1k_features = extract_feature_names(
 57 |         cp_data_repLevel, l1k_data_repLevel
 58 |     )
 59 | 
 60 |     ########## removes nan and inf values
 61 |     l1k_data_repLevel = l1k_data_repLevel.replace([np.inf, -np.inf], np.nan)
 62 |     cp_data_repLevel = cp_data_repLevel.replace([np.inf, -np.inf], np.nan)
 63 | 
 64 |     #
 65 |     null_vals_ratio = 0.05
 66 |     thrsh_std = 0.0001
 67 |     cols2remove_manyNulls = [
 68 |         i
 69 |         for i in cp_features
 70 |         if (cp_data_repLevel[i].isnull().sum(axis=0) / cp_data_repLevel.shape[0])
 71 |         > null_vals_ratio
 72 |     ]
 73 |     cols2remove_lowVars = (
 74 |         cp_data_repLevel[cp_features]
 75 |         .std()[cp_data_repLevel[cp_features].std() < thrsh_std]
 76 |         .index.tolist()
 77 |     )
 78 | 
 79 |     cols2removeCP = cols2remove_manyNulls + cols2remove_lowVars
 80 |     #     print(cols2removeCP)
 81 | 
 82 |     cp_features = list(set(cp_features) - set(cols2removeCP))
 83 |     cp_data_repLevel = cp_data_repLevel.drop(cols2removeCP, axis=1)
 84 |     cp_data_repLevel[cp_features] = cp_data_repLevel[cp_features].interpolate()
 85 | 
 86 |     #     cols2removeCP=[i for i in cp_features if cp_data_repLevel[i].isnull().sum(axis=0)>0]
 87 |     #     print(cols2removeCP)
 88 | 
 89 |     #     cp=cp.fillna(cp.median())
 90 | 
 91 |     # cols2removeGE=[i for i in l1k.columns if l1k[i].isnull().sum(axis=0)>0]
 92 |     # print(cols2removeGE)
 93 |     # l1k_features = list(set(l1k_features) - set(cols2removeGE))
 94 |     # print(len(l1k_features))
 95 |     # l1k=l1k.drop(cols2removeGE, axis=1);
 96 |     l1k_data_repLevel[l1k_features] = l1k_data_repLevel[l1k_features].interpolate()
 97 |     # l1k=l1k.fillna(l1k.median())
 98 | 
 99 |     ################ Per plate scaling
100 |     if per_plate_normalized_flag:
101 |         cp_data_repLevel = standardize_per_catX(
102 |             cp_data_repLevel, "Metadata_Plate", cp_features
103 |         )
104 |         l1k_data_repLevel = standardize_per_catX(
105 |             l1k_data_repLevel, "det_plate", l1k_features
106 |         )
107 | 
108 |         cols2removeCP = [
109 |             i
110 |             for i in cp_features
111 |             if (cp_data_repLevel[i].isnull().sum(axis=0) / cp_data_repLevel.shape[0])
112 |             > 0.05
113 |         ]
114 |         cp_data_repLevel = cp_data_repLevel.drop(cols2removeCP, axis=1)
115 |         cp_features = list(set(cp_features) - set(cols2removeCP))
116 |         cp_data_repLevel[cp_features] = cp_data_repLevel[cp_features].interpolate()
117 | 
118 |     return [cp_data_repLevel, cp_features], [l1k_data_repLevel, l1k_features]
119 | 
120 | 
121 | ################################################################################
def extract_feature_names(cp_data_repLevel, l1k_data_repLevel):
    """
    Pick the Cell Painting and L1000 measurement names out of the column names

    Inputs:
    cp_data_repLevel, l1k_data_repLevel: dataframes with all the annotations available in the raw data

    Outputs: (cp_features, l1k_features), the list of measurement column names
    of each modality, in column order. CP measurements are the columns whose
    name contains "Cells_", "Cytoplasm_" or "Nuclei_"; L1000 measurements are
    the columns whose name contains "_at".

    """
    cp_columns = cp_data_repLevel.columns
    l1k_columns = l1k_data_repLevel.columns

    # Boolean masks over the column names (patterns are matched anywhere in the name)
    is_cp_measurement = cp_columns.str.contains("Cells_|Cytoplasm_|Nuclei_")
    is_l1k_measurement = l1k_columns.str.contains("_at")

    cp_features = cp_columns[is_cp_measurement].tolist()
    l1k_features = l1k_columns[is_l1k_measurement].tolist()
    return cp_features, l1k_features
141 | 
142 | 
143 | ################################################################################
def extract_metadata_column_names(cp_data, l1k_data):
    """
    Pick the metadata column names out of the column names, for any level of data

    Inputs:
    cp_data, l1k_data: dataframes with all the annotations available in the raw data

    Outputs: (cp_meta_col_names, l1k_meta_col_names), the list of metadata
    column names of each modality, in column order. A CP column is metadata when
    its name contains none of "Cells_", "Cytoplasm_", "Nuclei_"; an L1000 column
    is metadata when its name does not contain "_at".

    """
    cp_columns = cp_data.columns
    l1k_columns = l1k_data.columns

    # Masks flag the measurement columns; metadata is everything else
    is_cp_measurement = cp_columns.str.contains("Cells_|Cytoplasm_|Nuclei_")
    is_l1k_measurement = l1k_columns.str.contains("_at")

    cp_meta_col_names = cp_columns[~is_cp_measurement].tolist()
    l1k_meta_col_names = l1k_columns[~is_l1k_measurement].tolist()
    return cp_meta_col_names, l1k_meta_col_names
162 | 
163 | 
164 | ################################################################################
def read_treatment_level_profiles(
    dataset_rootDir,
    dataset,
    profileType,
    filter_repCorr_params,
    per_plate_normalized_flag,
):
    """
    Build treatment level profiles for both modalities of one dataset.

    Steps:
    - Read the replicate level profiles through read_replicate_level_profiles
    - Rename the dataset-specific perturbation column of each modality to PERT
    - Optionally keep only perturbations with high replicate correlation in
      both modalities (filter_perts='highRepOverlap') or in at least one of
      them (filter_perts='highRepUnion'); "negcon" is always kept
    - Form treatment level profiles by averaging the replicates of each PERT
    - Merge each treatment level profile with the metadata columns selected
      for that dataset

    Inputs:
    dataset_rootDir: datasets root dir
    dataset: any from the available list of ['LUAD', 'TAORF', 'LINCS', 'CDRP-bio', 'CDRP']
    profileType:   Cell Painting profile type that can be 'augmented' , 'normalized', 'normalized_variable_selected'
    filter_repCorr_params: sequence of (filter_perts, repCorrFilePath); any
        filter_perts other than the two values above disables the filtering
    per_plate_normalized_flag: forwarded unchanged to read_replicate_level_profiles

    Output:
    [cp_data_treatLevel,cp_features], [l1k_data_treatLevel,l1k_features]
    each is a list of dataframe and feature names for each of modalities
    """

    labelCol = "PERT"

    filter_perts = filter_repCorr_params[0]
    repCorrFilePath = filter_repCorr_params[1]

    [cp_data_repLevel, cp_features], [
        l1k_data_repLevel,
        l1k_features,
    ] = read_replicate_level_profiles(
        dataset_rootDir, dataset, profileType, per_plate_normalized_flag
    )

    ############ rename columns that should match to PERT
    cp_data_repLevel = cp_data_repLevel.rename(
        columns={ds_info_dict[dataset][1][0]: labelCol}
    )
    l1k_data_repLevel = l1k_data_repLevel.rename(
        columns={ds_info_dict[dataset][1][1]: labelCol}
    )

    ###### print some data statistics
    print(
        dataset + ": Replicate Level Shapes (nSamples x nFeatures): cp: ",
        cp_data_repLevel.shape[0],
        ",",
        len(cp_features),
        ",  l1k: ",
        l1k_data_repLevel.shape[0],
        ",",
        len(l1k_features),
    )

    print("l1k n of rep: ", l1k_data_repLevel.groupby([labelCol]).size().median())
    print("cp n of rep: ", cp_data_repLevel.groupby([labelCol]).size().median())

    ###### remove perts with low rep corr
    # Both filters share the same code path; only the set operation differs.
    if filter_perts == "highRepOverlap":
        how = "intersection"
    elif filter_perts == "highRepUnion":
        how = "union"
    else:
        how = None  # no replicate-correlation filtering requested

    if how is not None:
        highRepPerts = highRepFinder(dataset, how, repCorrFilePath) + ["negcon"]

        # drop=True: the old row labels are not needed downstream and must not
        # be turned into an extra "index" column.
        cp_data_repLevel = cp_data_repLevel[
            cp_data_repLevel[labelCol].isin(highRepPerts)
        ].reset_index(drop=True)
        l1k_data_repLevel = l1k_data_repLevel[
            l1k_data_repLevel[labelCol].isin(highRepPerts)
        ].reset_index(drop=True)

    ####### form treatment level profiles
    l1k_data_treatLevel = (
        l1k_data_repLevel.groupby(labelCol)[l1k_features].mean().reset_index()
    )
    cp_data_treatLevel = (
        cp_data_repLevel.groupby(labelCol)[cp_features].mean().reset_index()
    )

    ###### define metadata and merge treatment level profiles
    #     dataset:[[cp_columns],[l1k_columns]]
    meta_dict = {
        "CDRP": [["Metadata_moa", "Metadata_target"], []],
        "CDRP-bio": [["Metadata_moa", "Metadata_target"], []],
        "TAORF": [[], []],
        "LUAD": [[], []],
        "LINCS": [["Metadata_moa", "Metadata_alternative_moa"], ["moa"]],
    }

    # One row per distinct (PERT, metadata...) combination found in the replicates.
    meta_cp = (
        cp_data_repLevel[[labelCol] + meta_dict[dataset][0]]
        .drop_duplicates()
        .reset_index(drop=True)
    )
    meta_l1k = (
        l1k_data_repLevel[[labelCol] + meta_dict[dataset][1]]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    cp_data_treatLevel = pd.merge(
        cp_data_treatLevel, meta_cp, how="inner", on=[labelCol]
    )
    l1k_data_treatLevel = pd.merge(
        l1k_data_treatLevel, meta_l1k, how="inner", on=[labelCol]
    )

    return [cp_data_treatLevel, cp_features], [l1k_data_treatLevel, l1k_features]
291 | 
292 | 
293 | ################################################################################
def read_paired_treatment_level_profiles(
    dataset_rootDir,
    dataset,
    profileType,
    filter_repCorr_params,
    per_plate_normalized_flag,
):
    """
    Reads treatment level profiles
    Merge dataframes by PERT column

    Inputs:
    dataset_rootDir: datasets root dir
    dataset: any from the available list of ['LUAD', 'TAORF', 'LINCS', 'CDRP-bio', 'CDRP']
    profileType:   Cell Painting profile type that can be 'augmented' , 'normalized', 'normalized_variable_selected'
    filter_repCorr_params: forwarded unchanged to read_treatment_level_profiles
    per_plate_normalized_flag: True for scaling per plate

    Output:
    mergedProfiles_treatLevel: paired treatment level profiles
    cp_features,l1k_features list of feature names for each of modalities
    """

    # Bound locally (as the sibling readers do) so the merge key does not
    # depend on a module-level name being defined.
    labelCol = "PERT"

    [cp_data_treatLevel, cp_features], [
        l1k_data_treatLevel,
        l1k_features,
    ] = read_treatment_level_profiles(
        dataset_rootDir,
        dataset,
        profileType,
        filter_repCorr_params,
        per_plate_normalized_flag,
    )

    # Inner join: only perturbations present in both modalities are kept.
    mergedProfiles_treatLevel = pd.merge(
        cp_data_treatLevel, l1k_data_treatLevel, how="inner", on=[labelCol]
    )

    print(
        "Treatment Level Shapes (nSamples x nFeatures+metadata):",
        cp_data_treatLevel.shape,
        l1k_data_treatLevel.shape,
        "Merged Profiles Shape:",
        mergedProfiles_treatLevel.shape,
    )

    return mergedProfiles_treatLevel, cp_features, l1k_features
340 | 
341 | 
342 | ################################################################################
def _sample_up_to_n_per_group(rep_df, group_col, n_max):
    """Randomly keep at most n_max rows of rep_df for each value of group_col."""
    # Iterating the groups (instead of groupby.apply) hands over complete
    # sub-frames, so the grouping column is kept on every pandas version.
    sampled = [
        group.sample(n=min(n_max, group.shape[0]))
        for _, group in rep_df.groupby(group_col)
    ]
    if not sampled:
        # No groups at all: return an empty frame with the same columns.
        return rep_df.iloc[0:0].copy()
    return pd.concat(sampled, ignore_index=True)


def generate_random_match_of_replicate_pairs(cp_data_repLevel, l1k_data_repLevel, nRep):
    """
    Note that there is no match at the replicate level for this dataset, we either:
        - Form ALL the possible pairs for replicate level data matching (nRep='all' - string)
        - Randomly sample up to nRep samples per perturbation in each modality
          and form all pairs of the sampled rows (nRep -> int)

    Inputs:
        cp_data_repLevel, l1k_data_repLevel: replicate level dataframes, each
            holding a "PERT" column identifying the perturbation
        nRep: 'all' or the maximum number of replicates sampled per
            perturbation (without replacement) in each modality

    Outputs:
        Randomly paired replicate level profiles (inner merge on "PERT")

    """
    labelCol = "PERT"

    if nRep == "all":
        cp_data_n_repLevel = cp_data_repLevel.copy()
        l1k_data_n_repLevel = l1k_data_repLevel.copy()
    else:
        cp_data_n_repLevel = _sample_up_to_n_per_group(
            cp_data_repLevel, labelCol, nRep
        )
        l1k_data_n_repLevel = _sample_up_to_n_per_group(
            l1k_data_repLevel, labelCol, nRep
        )

    mergedProfiles_repLevel = pd.merge(
        cp_data_n_repLevel, l1k_data_n_repLevel, how="inner", on=[labelCol]
    )

    return mergedProfiles_repLevel
381 | 
382 | 
383 | ################################################################################
def highRepFinder(dataset, how, repCorrFilePath):
    """
    This function reads pre calculated and saved Replicate Correlation values file and filters perturbations
    using one of the following filters:
        - intersection: intersection of high quality profiles across both modalities
        - union: union of high quality profiles across both modalities

    * A High Quality profile is defined here as a row whose "RepCor" value is
      larger than its "Rand90Perc" value

    Inputs:
        dataset (str): dataset name; the sheets "cp-<dataset>" and
            "l1k-<dataset>" (lower-cased) are read from the file
        how (str):  can be intersection or union
        repCorrFilePath: path of the Excel file holding the replicate
            correlation sheets

    Output: list of high quality perturbations (in arbitrary order)

    Raises:
        ValueError: if how is neither "intersection" nor "union"

    """
    # Validate before touching the file so a bad argument fails fast and clearly.
    if how not in ("intersection", "union"):
        raise ValueError(
            "how must be 'intersection' or 'union', got " + repr(how)
        )

    repCorDF = pd.read_excel(repCorrFilePath, sheet_name=None)
    sheet_suffix = dataset.lower()

    # The perturbation names are read from the "Unnamed: 0" column of each sheet.
    cpRepDF = repCorDF["cp-" + sheet_suffix]
    cpHighList = cpRepDF[cpRepDF["RepCor"] > cpRepDF["Rand90Perc"]][
        "Unnamed: 0"
    ].tolist()
    print("CP: from ", cpRepDF.shape[0], " to ", len(cpHighList))

    l1kRepDF = repCorDF["l1k-" + sheet_suffix]
    l1kHighList = l1kRepDF[l1kRepDF["RepCor"] > l1kRepDF["Rand90Perc"]][
        "Unnamed: 0"
    ].tolist()
    print("l1k: from ", l1kRepDF.shape[0], " to ", len(l1kHighList))

    if how == "intersection":
        highRepPerts = list(set(l1kHighList) & set(cpHighList))
        print("CP and l1k high rep overlap: ", len(highRepPerts))
    else:
        highRepPerts = list(set(l1kHighList) | set(cpHighList))
        print("CP and l1k high rep union: ", len(highRepPerts))

    return highRepPerts
424 | 
425 | 
426 | ################################################################################
def read_paired_replicate_level_profiles(
    dataset_rootDir,
    dataset,
    profileType,
    nRep,
    filter_repCorr_params,
    per_plate_normalized_flag,
):
    """
    Build paired replicate level profiles for one dataset.

    Steps:
    - Read the replicate level profiles through read_replicate_level_profiles
    - Rename the dataset-specific perturbation column of each modality to PERT
    - Optionally keep only perturbations with high replicate correlation in
      both modalities (filter_perts='highRepOverlap') or in at least one of
      them (filter_perts='highRepUnion'); "negcon" is always kept
    - Pair the replicates of the two modalities by PERT through
      generate_random_match_of_replicate_pairs

    Inputs:
    dataset_rootDir: datasets root dir
    dataset: any from the available list of ['LUAD', 'TAORF', 'LINCS', 'CDRP-bio', 'CDRP']
    profileType:   Cell Painting profile type that can be 'augmented' , 'normalized', 'normalized_variable_selected'
    nRep: forwarded unchanged to generate_random_match_of_replicate_pairs
    filter_repCorr_params: sequence of (filter_perts, repCorrFilePath); any
        filter_perts other than the two values above disables the filtering
    per_plate_normalized_flag: forwarded unchanged to read_replicate_level_profiles

    Output:
    mergedProfiles_repLevel: paired replicate level profiles
    cp_features,l1k_features list of feature names for each of modalities
    """

    # Bound locally (as the sibling readers do) so the renames and group-bys
    # below do not depend on a module-level name being defined.
    labelCol = "PERT"

    filter_perts = filter_repCorr_params[0]
    repCorrFilePath = filter_repCorr_params[1]

    [cp_data_repLevel, cp_features], [
        l1k_data_repLevel,
        l1k_features,
    ] = read_replicate_level_profiles(
        dataset_rootDir, dataset, profileType, per_plate_normalized_flag
    )

    ############ rename columns that should match to PERT
    cp_data_repLevel = cp_data_repLevel.rename(
        columns={ds_info_dict[dataset][1][0]: labelCol}
    )
    l1k_data_repLevel = l1k_data_repLevel.rename(
        columns={ds_info_dict[dataset][1][1]: labelCol}
    )

    ###### print some data statistics
    print(
        dataset + ": Replicate Level Shapes (nSamples x nFeatures): cp: ",
        cp_data_repLevel.shape[0],
        ",",
        len(cp_features),
        ",  l1k: ",
        l1k_data_repLevel.shape[0],
        ",",
        len(l1k_features),
    )

    print("l1k n of rep: ", l1k_data_repLevel.groupby([labelCol]).size().median())
    print("cp n of rep: ", cp_data_repLevel.groupby([labelCol]).size().median())

    ###### remove perts with low rep corr
    # Both filters share the same code path; only the set operation differs.
    if filter_perts == "highRepOverlap":
        how = "intersection"
    elif filter_perts == "highRepUnion":
        how = "union"
    else:
        how = None  # no replicate-correlation filtering requested

    if how is not None:
        highRepPerts = highRepFinder(dataset, how, repCorrFilePath) + ["negcon"]

        # NOTE(review): reset_index() without drop=True stores the previous row
        # labels in an "index" column that is carried into the merged result.
        # Kept as is so the returned columns do not change for existing callers.
        cp_data_repLevel = cp_data_repLevel[
            cp_data_repLevel[labelCol].isin(highRepPerts)
        ].reset_index()
        l1k_data_repLevel = l1k_data_repLevel[
            l1k_data_repLevel[labelCol].isin(highRepPerts)
        ].reset_index()

    mergedProfiles_repLevel = generate_random_match_of_replicate_pairs(
        cp_data_repLevel, l1k_data_repLevel, nRep
    )

    return mergedProfiles_repLevel, cp_features, l1k_features
515 | 
516 | 
def rename_affyprobe_to_genename(l1k_data_df, l1k_features, map_source_address):
    """
    Rename dataframe columns from affy probe ids to gene symbols.

    The mapping is read from the Excel file at map_source_address using its
    "probe_id" and "symbol" columns.

    Returns the renamed dataframe and the list of symbols corresponding, in
    order, to l1k_features (a probe id missing from the mapping raises
    KeyError).

    """
    id_table = pd.read_excel(map_source_address)

    # probe id -> gene symbol lookup
    probe_to_symbol = dict(zip(id_table["probe_id"], id_table["symbol"]))

    l1k_features_gn = []
    for probe_id in l1k_features:
        l1k_features_gn.append(probe_to_symbol[probe_id])

    renamed_df = l1k_data_df.rename(columns=probe_to_symbol)

    return renamed_df, l1k_features_gn
531 | 
532 | 
def rename_to_genename_list_to_affyprobe(
    l1k_features_gn, our_l1k_prob_list, map_source_address
):
    """
    Translate a list of gene symbols into the matching affy probe ids.

    The mapping is read from the Excel file at map_source_address using its
    "symbol" and "probe_id" columns. our_l1k_prob_list is accepted for
    interface compatibility but is not used. A symbol missing from the
    mapping raises KeyError.

    """
    id_table = pd.read_excel(map_source_address)

    # gene symbol -> probe id lookup
    symbol_to_probe = dict(zip(id_table["symbol"], id_table["probe_id"]))

    l1k_features = []
    for gene_symbol in l1k_features_gn:
        l1k_features.append(symbol_to_probe[gene_symbol])

    return l1k_features
550 | 
551 | 
def standardize_per_catX(df, column_name, cp_features):
    """
    Z-score the given feature columns separately within each category.

    Inputs:
    df: input dataframe
    column_name: column whose values define the groups (e.g. a plate column)
    cp_features: list of feature column names to standardize

    Output: a copy of df in which every cp_features column has had its group
    mean subtracted and been divided by its group standard deviation; the
    other columns and the input dataframe itself are left untouched
    """

    def _zscore(values):
        return (values - values.mean()) / values.std()

    per_category = df[cp_features + [column_name]].groupby(column_name)
    standardized_values = per_category.transform(_zscore).values

    df_scaled_perPlate = df.copy()
    # Assign the raw array so the values are written positionally.
    df_scaled_perPlate[cp_features] = standardized_values
    return df_scaled_perPlate
563 | 


--------------------------------------------------------------------------------
/utils/replicateCorrs.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import scipy.spatial
  3 | import pandas as pd
  4 | import matplotlib.pyplot as plt
  5 | import seaborn as sns
  6 | from random import sample, choices
  7 | from scipy.stats import pearsonr
  8 | 
# sns.set_style("whitegrid")
# Module-level side effect: runs once at import time and passes the
# "lines.linewidth" rc override (2) to seaborn's global settings.
sns.set(rc={"lines.linewidth": 2})
 11 | 
 12 | 
 13 | def replicateCorrs(inDf, pertColName, featColNames, plotEnabled):
 14 |     """
 15 |     Calculates replicate correlation versus across purtburtion correlations
 16 | 
 17 |     This function takes the input dataframe and output/plot replicate correlations.
 18 | 
 19 |     Parameters:
 20 |     inDf   (pandas df): input dataframe contains metadata and features
 21 |     pertColName  (str): The column based on which we define replicates of a purturbation
 22 |     featColNames(list): The list of all columns corresponding to features
 23 |     plotEnabled (bool): If True or 1, plots the curves
 24 | 
 25 |     Returns:
 26 |     repCorrDf   (list):
 27 | 
 28 |     """
 29 | 
 30 |     df = inDf.copy()
 31 |     df[featColNames] = inDf[featColNames].interpolate()
 32 |     uniqPert = df[pertColName].unique().tolist()
 33 |     repC = []
 34 |     randC = []
 35 | 
 36 |     repCorrDf = pd.DataFrame(index=uniqPert, columns=["RepCor"])
 37 | 
 38 |     repSizeDF = df.groupby([pertColName]).size().reset_index()
 39 |     highRepComp = repSizeDF[repSizeDF[0] > 1][pertColName].tolist()
 40 | 
 41 |     for u in highRepComp:
 42 |         df1 = df[df[pertColName] == u].drop_duplicates().reset_index(drop=True)
 43 |         #         df2=df[df[pertColName]!=u].drop_duplicates().reset_index(drop=True)
 44 | 
 45 |         repCorrPurtbs = df1.loc[:, featColNames].T.corr()
 46 |         repCorr = list(
 47 |             repCorrPurtbs.values[np.triu_indices(repCorrPurtbs.shape[0], k=1)]
 48 |         )
 49 |         #         print(repCorr)
 50 |         repCorrDf.loc[u, "RepCor"] = np.nanmean(repCorr)
 51 |         #         print(repCorr)
 52 |         #         repCorr=np.sort(np.unique(df1.loc[:,featColNames].T.corr().values))[:-1].tolist()
 53 |         #         repC=repC+repCorr
 54 |         repC = repC + [np.nanmedian(repCorr)]
 55 |     #         repC=repC+[np.median(repCorr)]
 56 |     # #         randPertbs=df2[pertColName].drop_duplicates().sample(df1.shape[0],replace=True).tolist()
 57 |     #         nS=np.min([len(df2[pertColName].unique().tolist()),df1.shape[0]])
 58 |     # #         nS=df1.shape[0]
 59 | 
 60 |     # #         print(nS,[len(df2[pertColName].unique().tolist()),df1.shape[0]])
 61 | 
 62 |     #         randPertbs=sample(df2[pertColName].unique().tolist(),k=nS)
 63 |     # #         print(randPertbs)
 64 |     #         df3=pd.concat([df2[df2[pertColName]==i].sample(1,replace=True) for i in randPertbs],ignore_index=True)
 65 |     # #         print(df1.sample(df3.shape[0],replace=False).shape,df3.shape)
 66 |     #         randCorr=df1[featColNames].sample(df3.shape[0],replace=False).reset_index(drop=True).\
 67 |     #     corrwith(df3[featColNames], axis = 1,method='pearson',drop=True).values.tolist()
 68 | 
 69 |     # #         x1=df1.sample(df3.shape[0],replace=False).values
 70 | 
 71 |     # #         randCorr=pearsonr()
 72 |     # #         randCorr = [x for x in randCorr if str(x) != 'nan']
 73 |     #         randC=randC+randCorr
 74 |     # #     print(randC)
 75 |     #     print('here3')
 76 |     randC_v2 = []
 77 |     for i in range(1):
 78 |         uniqeSamplesFromEachPurt = inDf.groupby(pertColName)[featColNames].apply(
 79 |             lambda s: s.sample(1)
 80 |         )
 81 |         corrMatAcrossPurtbs = uniqeSamplesFromEachPurt.loc[:, featColNames].T.corr()
 82 |         randCorrVals = list(
 83 |             corrMatAcrossPurtbs.values[
 84 |                 np.triu_indices(corrMatAcrossPurtbs.shape[0], k=1)
 85 |             ]
 86 |         )
 87 |     randC_v2 = randC_v2 + randCorrVals
 88 | 
 89 |     if 0:
 90 |         fig, axes = plt.subplots(figsize=(5, 3))
 91 |         sns.kdeplot(randC, bw=0.1, label="random pairs", ax=axes)
 92 |         sns.kdeplot(repC, bw=0.1, label="replicate pairs", ax=axes)
 93 |         axes.set_xlabel("CC")
 94 |         sns.kdeplot(randC_v2, bw=0.1, label="random v2 pairs", ax=axes)
 95 |         axes.set_xlabel("CC")
 96 |         #         perc5=np.percentile(repCC, 50);axes.axvline(x=perc5,linestyle=':',color='darkorange');
 97 |         #         perc95=np.percentile(randCC, 90);axes.axvline(x=perc95,linestyle=':');
 98 |         axes.legend()
 99 |         # axes.set_title('');
100 |         axes.set_xlim(-1.1, 1.1)
101 | 
102 |     repC = [repC for repC in repC if str(repC) != "nan"]
103 |     randC_v2 = [randC_v2 for randC_v2 in randC_v2 if str(randC_v2) != "nan"]
104 | 
105 |     perc95 = np.percentile(randC_v2, 90)
106 |     rep10 = np.percentile(repC, 10)
107 | 
108 |     if plotEnabled:
109 |         fig, axes = plt.subplots(figsize=(5, 4))
110 |         #         sns.kdeplot(randC_v2, bw=.1, label="random pairs",ax=axes);axes.set_xlabel('CC');
111 |         #         sns.kdeplot(repC, bw=.1, label="replicate pairs",ax=axes,color='r');axes.set_xlabel('CC');
112 |         sns.distplot(
113 |             randC_v2,
114 |             kde=True,
115 |             hist=True,
116 |             bins=100,
117 |             label="random pairs",
118 |             ax=axes,
119 |             norm_hist=True,
120 |         )
121 |         sns.distplot(
122 |             repC,
123 |             kde=True,
124 |             hist=True,
125 |             bins=100,
126 |             label="replicate pairs",
127 |             ax=axes,
128 |             norm_hist=True,
129 |             color="r",
130 |         )
131 | 
132 |         #         perc5=np.percentile(repCC, 50);axes.axvline(x=perc5,linestyle=':',color='darkorange');
133 |         axes.axvline(x=perc95, linestyle=":")
134 |         axes.axvline(x=0, linestyle=":")
135 |         axes.legend(loc=2)
136 |         # axes.set_title('');
137 |         axes.set_xlim(-1, 1)
138 |         plt.tight_layout()
139 | 
140 |     repCorrDf["Rand90Perc"] = perc95
141 |     repCorrDf["Rep10Perc"] = rep10
142 |     #     highRepPertbs=repCorrDf[repCorrDf['RepCor']>perc95].index.tolist()
143 |     #     return repCorrDf
144 |     return [randC_v2, repC, repCorrDf]
145 | 
146 | 
147 | # input is a list of dfs--> [cp,l1k,cp_cca,l1k_cca]
148 | #######
def plotRepCorrs(allData, pertName):
    """
    Collect replicate and random correlations for a list of datasets.

    Despite its name, this function does not draw anything; it only computes
    the correlation values.

    Parameters:
    allData (list): one [dataframe, feature_names] entry per dataset
    pertName (str): column identifying the perturbation of each row

    Returns:
    list: one [randC, repC] pair per entry of allData, where repC gathers the
        sorted unique within-perturbation correlation values with the largest
        one removed, and randC the row-wise correlations between each
        replicate and the first row of a randomly drawn other perturbation
    """
    corrAll = []
    for entry in allData:
        df = entry[0]
        features = entry[1]

        rep_values = []
        rand_values = []
        for pert in df[pertName].unique().tolist():
            same_pert = df[df[pertName] == pert].drop_duplicates().reset_index(drop=True)
            other_perts = df[df[pertName] != pert].drop_duplicates().reset_index(drop=True)

            # Unique correlation values in ascending order, largest one dropped.
            corr_values = same_pert.loc[:, features].T.corr().values
            rep_values = rep_values + np.sort(np.unique(corr_values))[:-1].tolist()

            # One other perturbation (drawn with replacement) per replicate row.
            drawn_perts = (
                other_perts[pertName]
                .drop_duplicates()
                .sample(same_pert.shape[0], replace=True)
                .tolist()
            )
            first_rows = [
                other_perts[other_perts[pertName] == name]
                .reset_index(drop=True)
                .iloc[0:1, :]
                for name in drawn_perts
            ]
            random_partners = pd.concat(first_rows, ignore_index=True)

            row_corrs = same_pert.corrwith(random_partners, axis=1, method="pearson")
            rand_values = rand_values + row_corrs.values.tolist()

        corrAll.append([rand_values, rep_values])
    return corrAll
183 | 


--------------------------------------------------------------------------------
/utils/saveAsNewSheetToExistingFile.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import openpyxl as pxl
 3 | import os
 4 | 
 5 | # ------------------------------------------------------
 6 | 
 7 | 
 8 | # Save the input dataframe to the specified sheet name of filename file
 9 | def saveAsNewSheetToExistingFile(filename, newDF, newSheetName):
10 | 
11 | 
12 | 
13 |     if os.path.exists(filename):
14 |         excel_book = pxl.load_workbook(filename)
15 | 
16 |         if newSheetName in excel_book.sheetnames:
17 |             del excel_book[newSheetName]
18 | 
19 |         with pd.ExcelWriter(filename, engine="openpyxl") as writer:
20 |             writer.book = excel_book
21 | 
22 |             writer.sheets = {
23 |                 worksheet.title: worksheet
24 |                 for worksheet in excel_book.worksheets
25 |                 if newSheetName not in worksheet
26 |             }
27 |             newDF.to_excel(writer, newSheetName)
28 |             writer.save()
29 |     else:
30 |         newDF.to_excel(filename, newSheetName)
31 | 
32 |     print(newSheetName, " saved!")
33 |     return
34 | 
35 | 
36 | # ------------------------------------------------------
37 | 
38 | 
39 | # saveDF_to_CSV_GZ_no_timestamp
def saveDF_to_CSV_GZ_no_timestamp(df, filename):
    """
    Write df (without its index) to filename as a gzip-compressed UTF-8 CSV.

    The gzip stream is opened with mtime=0 so no current timestamp is stored
    in the gzip header.

    Parameters:
    df (pandas df): dataframe to save
    filename (str): path of the output file
    """
    from gzip import GzipFile
    from io import TextIOWrapper

    # The compression is done by the GzipFile wrapper itself, so to_csv must
    # receive the text handle without a compression argument (pandas ignores
    # it for text handles and emits a RuntimeWarning).
    with TextIOWrapper(GzipFile(filename, "w", mtime=0), encoding="utf-8") as fd:
        df.to_csv(fd, index=False)

    return
48 | 


--------------------------------------------------------------------------------