├── .gitignore ├── .pre-commit-config.yaml ├── 0.dataset_creation └── 0-preprocess_datasets.ipynb ├── 1.dataset_curation ├── README.md ├── curate_dataset.py ├── pyproject.toml └── uv.lock ├── App1.single_feature_prediction ├── 1-single-Gene-CPfeature-prediction.ipynb └── lookup_luad_images.ipynb ├── App2.MoA_prediction ├── 2a-Modality_Integration_CDRP-bio.ipynb └── 2b-Modality_Integration_LINCS.ipynb ├── GO_terms_search ├── 4-GO-terms-search-analysis.ipynb └── source │ ├── GO_bp_cc_mf_direct_LUAD_975.txt │ ├── GO_bp_cc_mf_direct_LUAD_976.txt │ ├── GO_bp_cc_mf_direct_intersection_782.txt │ ├── GO_bp_cc_mf_direct_intersection_782_completed.csv │ ├── GO_bp_cc_mf_direct_union_1165.txt │ ├── LUAD_geneSymbols_978.txt │ ├── intersection_geneSymbols_785.txt │ ├── top_100_luad.txt │ ├── top_59_atleast_topIn3.txt │ └── union_geneSymbols_1170.txt ├── LICENSE ├── README.md ├── environment.yml ├── etag.json ├── explore_the_link.ipynb ├── generate_paper_figures └── generate_paper_figs.ipynb ├── idmap.xlsx ├── read_and_match_profiles.ipynb ├── results ├── DAVIDoutput_CytoScapeInput_Figure2d │ ├── chart_UP_KEYWORDS_FunctionalAnot_all.txt │ └── chart_UP_KEYWORDS_FunctionalAnot_top.txt ├── Figs_Source_Data.xlsx ├── MoAprediction │ ├── JI_cdrpbio.txt │ ├── JI_lincs.txt │ ├── pred_moa.xlsx │ ├── pred_moa_2.xlsx │ ├── pred_moa_CDRP.xlsx │ └── pred_moa_LINCS.xlsx ├── RepCor │ └── RepCorrDF.xlsx ├── SingleCPfeatPred │ └── scores_corrected.xlsx ├── SingleGenePred │ ├── scores_corrected.xlsx │ ├── scores_cross_dts_LU_LI.xlsx │ └── supplementary_D.csv └── SingleGenePred_cpCategoryMap │ ├── CatMap-LINCS-25-lasso-ht.png │ ├── CatMap-LUAD-9-MLP-keras-ht.pdf │ ├── CatMap-LUAD-9-MLP-keras-ht.png │ ├── CatMap-LUAD-9-lasso-ht.png │ └── cat_scores_maps.xlsx └── utils ├── pred_models.py ├── readProfiles.py ├── replicateCorrs.py └── saveAsNewSheetToExistingFile.py /.gitignore: -------------------------------------------------------------------------------- 1 | # result folder 2 | 3 | # Byte-compiled 
/ optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | .Rproj.user 133 | .Rhistory 134 | .Rprofile 135 | *.nb.html 136 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v5.0.0 4 | hooks: 5 | - id: trailing-whitespace 6 | exclude: ^.*/$ 7 | 8 | -------------------------------------------------------------------------------- /1.dataset_curation/README.md: -------------------------------------------------------------------------------- 1 | # Dataset Curation 2 | 3 | `curate_dataset.py` selects, renames, and fixes columns from the preprocessed data to create a curated dataset. 
4 | 5 | ## Structure 6 | 7 | Available at: 8 | `s3://cellpainting-gallery/cpg0003-rosetta/broad/workspace/curated_preprocessed_data` 9 | 10 | ``` 11 | curated_preprocessed_data 12 | ├── CDRP-BBBC047-Bray 13 | │ ├── CellPainting 14 | │ │ └── replicate_level_cp_augmented.parquet 15 | │ └── L1000 16 | │ └── replicate_level_l1k.parquet 17 | ├── LINCS-Pilot1 18 | │ ├── CellPainting 19 | │ │ └── replicate_level_cp_augmented.parquet 20 | │ └── L1000 21 | │ └── replicate_level_l1k.parquet 22 | ├── LUAD-BBBC041-Caicedo 23 | │ ├── CellPainting 24 | │ │ └── replicate_level_cp_augmented.parquet 25 | │ └── L1000 26 | │ └── replicate_level_l1k.parquet 27 | └── TA-ORF-BBBC037-Rohban 28 | ├── CellPainting 29 | │ └── replicate_level_cp_augmented.parquet 30 | └── L1000 31 | └── replicate_level_l1k.parquet 32 | ``` 33 | 34 | ## Curated columns 35 | 36 | - `Metadata_Plate` [All]: Identifier of the multi‐well plate (e.g., SQ00015156, PAC053_U2OS_6H_X2_B1_UNI4445R, TA.OE005_U2OS_72H_X1_B15). 37 | - `Metadata_Plate_Map_Name` [All CP]: Plate‐map identifier (e.g., C-7161-01-LM6-003). 38 | - `Metadata_ARP_ID` [LINCS-Pilot1 L1K, TA-ORF-BBBC037-Rohban L1K, LUAD-BBBC041-Caicedo L1K]: Internal plate identifier (e.g., AB00016187). 39 | - `Metadata_Well` [All except CDRP-BBBC047-Bray L1K]: Specific well position within the plate (e.g., A01, H11). 40 | - `Metadata_pert_id` [All]: Unique perturbation identifier (e.g., BRD-K50691590-001-02-2, TRCN0000471252, EMPTY). 41 | - `Metadata_pert_type` [All except CDRP-BBBC047-Bray L1K]: Perturbation type (e.g., trt_cp, ctl_vehicle, trt, control). 42 | - `Metadata_cell_id` [All]: Cell line used (e.g., A549, U2OS). 43 | - `Metadata_pert_timepoint` [All]: Time (in hours) from perturbation to measurement (e.g., 24, 48, 72, 96). 44 | - `Metadata_pert_dose_micromolar` [LINCS-Pilot1, CDRP-BBBC047-Bray]: Final compound concentration (µM) (e.g., 0.0411523, 10). 
45 | - `Metadata_pert_iname` [LINCS-Pilot1, CDRP-BBBC047-Bray L1K]: Common name of the compound or control (e.g., bortezomib, DMSO). 46 | - `Metadata_SMILES` [LINCS-Pilot1 L1K, CDRP-BBBC047-Bray L1K]: SMILES string for the compound structure. 47 | - `Metadata_cdrp_group` [CDRP-BBBC047-Bray L1K]: Subset/group label in the CDRP compound library (e.g., DOS, BIO). 48 | - `Metadata_genesymbol_mutation` [TA-ORF-BBBC037-Rohban L1K+CP, LUAD-BBBC041-Caicedo CP]: Gene plus mutation notation (e.g., TP53_p.R248Q). 49 | - `Metadata_genesymbol` [TA-ORF-BBBC037-Rohban CP, LUAD-BBBC041-Caicedo CP]: Gene symbol alone (e.g., TP53, MAPK8). 50 | - `Metadata_transcriptdb` [LUAD-BBBC041-Caicedo L1K]: Reference to specific transcript/isoform (e.g., NM_001126112.2:c.796G>C). 51 | 52 | -------------------------------------------------------------------------------- /1.dataset_curation/curate_dataset.py: -------------------------------------------------------------------------------- 1 | # --- 2 | # jupyter: 3 | # jupytext: 4 | # text_representation: 5 | # extension: .py 6 | # format_name: percent 7 | # format_version: '1.3' 8 | # jupytext_version: 1.15.2 9 | # kernelspec: 10 | # display_name: Python 3 11 | # language: python 12 | # name: python3 13 | # --- 14 | 15 | # %% 16 | import pandas as pd 17 | from pathlib import Path 18 | from IPython.display import display 19 | 20 | # %% 21 | # First download the data from 22 | # s3://cellpainting-gallery/cpg0003-rosetta/broad/workspace/preprocessed_data 23 | # and save it in the ./preprocessed_data folder 24 | 25 | dataset_paths = { 26 | "LINCS-Pilot1": { 27 | "l1k": "./preprocessed_data/LINCS-Pilot1/L1000/replicate_level_l1k.csv.gz", 28 | "cp": "./preprocessed_data/LINCS-Pilot1/CellPainting/replicate_level_cp_augmented.csv.gz", 29 | }, 30 | "CDRP-BBBC047-Bray": { 31 | "l1k": "./preprocessed_data/CDRP-BBBC047-Bray/L1000/replicate_level_l1k.csv.gz", 32 | "cp": "./preprocessed_data/CDRP-BBBC047-Bray/CellPainting/replicate_level_cp_augmented.csv.gz", 33 
| }, 34 | "TA-ORF-BBBC037-Rohban": { 35 | "l1k": "./preprocessed_data/TA-ORF-BBBC037-Rohban/L1000/replicate_level_l1k.csv.gz", 36 | "cp": "./preprocessed_data/TA-ORF-BBBC037-Rohban/CellPainting/replicate_level_cp_augmented.csv.gz", 37 | }, 38 | "LUAD-BBBC041-Caicedo": { 39 | "l1k": "./preprocessed_data/LUAD-BBBC041-Caicedo/L1000/replicate_level_l1k.csv.gz", 40 | "cp": "./preprocessed_data/LUAD-BBBC041-Caicedo/CellPainting/replicate_level_cp_augmented.csv.gz", 41 | }, 42 | } 43 | 44 | # Define column mappings for each dataset and data type 45 | column_rename_mappings = { 46 | "CDRP-BBBC047-Bray": { 47 | "l1k": { 48 | "pert_id": "Metadata_pert_id", 49 | "pert_dose": "Metadata_pert_dose_micromolar", 50 | "det_plate": "Metadata_Plate", 51 | "CPD_NAME": "Metadata_pert_iname", 52 | "CPD_TYPE": "Metadata_cdrp_group", 53 | "CPD_SMILES": "Metadata_SMILES", 54 | }, 55 | "cp": { 56 | "Metadata_broad_sample": "Metadata_pert_id", 57 | "Metadata_broad_sample_type": "Metadata_pert_type", 58 | "Metadata_mmoles_per_liter2": "Metadata_pert_dose_micromolar", 59 | }, 60 | }, 61 | "LINCS-Pilot1": { 62 | "l1k": { 63 | "pert_dose": "Metadata_pert_dose_micromolar", 64 | "det_plate": "Metadata_Plate", 65 | "cell_id": "Metadata_cell_id", 66 | "det_well": "Metadata_Well", 67 | "mfc_plate_name": "Metadata_ARP_ID", 68 | "pert_iname_x": "Metadata_pert_iname", 69 | "pert_time": "Metadata_pert_timepoint", 70 | "pert_mfc_id": "Metadata_pert_id", 71 | "pert_type_x": "Metadata_pert_type", 72 | "x_smiles": "Metadata_SMILES", 73 | }, 74 | "cp": { 75 | "Metadata_broad_sample": "Metadata_pert_id", 76 | "Metadata_broad_sample_type": "Metadata_pert_type", 77 | "Metadata_mmoles_per_liter": "Metadata_pert_dose_micromolar", 78 | "pert_iname": "Metadata_pert_iname", 79 | }, 80 | }, 81 | "TA-ORF-BBBC037-Rohban": { 82 | "l1k": { 83 | "det_plate": "Metadata_Plate", 84 | "cell_id": "Metadata_cell_id", 85 | "det_well": "Metadata_Well", 86 | "mfc_plate_name": "Metadata_ARP_ID", 87 | "pert_time": 
"Metadata_pert_timepoint", 88 | "pert_mfc_id": "Metadata_pert_id", 89 | "pert_type": "Metadata_pert_type", 90 | "x_genesymbol_mutation": "Metadata_genesymbol_mutation", 91 | }, 92 | "cp": { 93 | "Metadata_broad_sample": "Metadata_pert_id", 94 | "Metadata_broad_sample_type": "Metadata_pert_type", 95 | "Metadata_pert_name": "Metadata_genesymbol_mutation", 96 | "Metadata_gene_name": "Metadata_genesymbol", 97 | }, 98 | }, 99 | "LUAD-BBBC041-Caicedo": { 100 | "l1k": { 101 | "det_plate": "Metadata_Plate", 102 | "cell_id": "Metadata_cell_id", 103 | "det_well": "Metadata_Well", 104 | "mfc_plate_name": "Metadata_ARP_ID", 105 | "pert_time": "Metadata_pert_timepoint", 106 | "pert_mfc_id": "Metadata_pert_id", 107 | "pert_type": "Metadata_pert_type", 108 | "x_transcriptdb": "Metadata_transcriptdb", 109 | }, 110 | "cp": { 111 | "Metadata_broad_sample": "Metadata_pert_id", 112 | "Metadata_broad_sample_type": "Metadata_pert_type", 113 | "x_mutation_status": "Metadata_genesymbol_mutation", 114 | "Symbol": "Metadata_genesymbol", 115 | }, 116 | }, 117 | } 118 | 119 | # Define the columns we want to keep for each dataset and data type 120 | columns_to_keep = { 121 | "CDRP-BBBC047-Bray": { 122 | "l1k": [ 123 | "Metadata_Plate", 124 | "Metadata_pert_id", 125 | "Metadata_pert_iname", 126 | "Metadata_pert_dose_micromolar", 127 | "Metadata_cdrp_group", 128 | "Metadata_SMILES", 129 | ], 130 | "cp": [ 131 | "Metadata_Plate_Map_Name", 132 | "Metadata_Plate", 133 | "Metadata_Well", 134 | "Metadata_pert_id", 135 | "Metadata_pert_dose_micromolar", 136 | "Metadata_pert_type", 137 | "Metadata_cell_id", 138 | ], 139 | }, 140 | "LINCS-Pilot1": { 141 | "l1k": [ 142 | "Metadata_Plate", 143 | "Metadata_Well", 144 | "Metadata_pert_id", 145 | "Metadata_pert_type", 146 | "Metadata_pert_dose_micromolar", 147 | "Metadata_cell_id", 148 | "Metadata_pert_iname", 149 | "Metadata_ARP_ID", 150 | "Metadata_pert_timepoint", 151 | "Metadata_SMILES", 152 | ], 153 | "cp": [ 154 | "Metadata_Plate_Map_Name", 155 | 
"Metadata_Plate", 156 | "Metadata_Well", 157 | "Metadata_pert_id", 158 | "Metadata_pert_type", 159 | "Metadata_pert_dose_micromolar", 160 | "Metadata_cell_id", 161 | "Metadata_pert_iname", 162 | ], 163 | }, 164 | "TA-ORF-BBBC037-Rohban": { 165 | "l1k": [ 166 | "Metadata_Plate", 167 | "Metadata_Well", 168 | "Metadata_pert_id", 169 | "Metadata_pert_type", 170 | "Metadata_cell_id", 171 | "Metadata_ARP_ID", 172 | "Metadata_pert_timepoint", 173 | "Metadata_genesymbol_mutation", 174 | ], 175 | "cp": [ 176 | "Metadata_Plate_Map_Name", 177 | "Metadata_Plate", 178 | "Metadata_Well", 179 | "Metadata_pert_id", 180 | "Metadata_pert_type", 181 | "Metadata_cell_id", 182 | "Metadata_genesymbol_mutation", 183 | "Metadata_genesymbol", 184 | ], 185 | }, 186 | "LUAD-BBBC041-Caicedo": { 187 | "l1k": [ 188 | "Metadata_Plate", 189 | "Metadata_Well", 190 | "Metadata_pert_id", 191 | "Metadata_pert_type", 192 | "Metadata_cell_id", 193 | "Metadata_ARP_ID", 194 | "Metadata_pert_timepoint", 195 | "Metadata_transcriptdb", 196 | ], 197 | "cp": [ 198 | "Metadata_Plate_Map_Name", 199 | "Metadata_Plate", 200 | "Metadata_Well", 201 | "Metadata_pert_id", 202 | "Metadata_pert_type", 203 | "Metadata_cell_id", 204 | "Metadata_genesymbol_mutation", 205 | "Metadata_genesymbol", 206 | ], 207 | }, 208 | } 209 | 210 | # First load the data 211 | dataset_data = {} 212 | for dataset_name, paths in dataset_paths.items(): 213 | dataset_data[dataset_name] = {} 214 | for data_type, dataset_path in paths.items(): 215 | parquet_path = dataset_path.replace(".csv.gz", ".parquet") 216 | if not Path(parquet_path).exists(): 217 | data = pd.read_csv(dataset_path, low_memory=False) 218 | data.to_parquet(parquet_path) 219 | dataset_data[dataset_name][data_type] = data 220 | else: 221 | data = pd.read_parquet(parquet_path) 222 | dataset_data[dataset_name][data_type] = data 223 | 224 | 225 | # %% 226 | 227 | # Then apply the column renaming 228 | for dataset_name, data_types in dataset_data.items(): 229 | for data_type, data 
in data_types.items(): 230 | if ( 231 | dataset_name in column_rename_mappings 232 | and data_type in column_rename_mappings[dataset_name] 233 | ): 234 | # First, identify feature columns we want to preserve 235 | if data_type == "l1k": 236 | feature_mask = data.columns.str.endswith("_at") 237 | else: # cp 238 | feature_mask = ( 239 | data.columns.str.startswith("Cells_") 240 | | data.columns.str.startswith("Cytoplasm_") 241 | | data.columns.str.startswith("Nuclei_") 242 | ) 243 | feature_cols = data.columns[feature_mask] 244 | metadata_cols = data.columns[~feature_mask] 245 | 246 | # Apply renaming only to metadata columns 247 | rename_mapping = { 248 | k: v 249 | for k, v in column_rename_mappings[dataset_name][data_type].items() 250 | if k in metadata_cols 251 | } 252 | 253 | # Check if new name already exists and drop it if so 254 | for old, new in rename_mapping.items(): 255 | if new in data.columns and new != old: 256 | data.drop(columns=[new], inplace=True) 257 | # Rename metadata columns 258 | data = data.rename(columns=rename_mapping) 259 | 260 | # Keep only desired metadata columns plus all feature columns 261 | keep_metadata = columns_to_keep[dataset_name][data_type] 262 | dataset_data[dataset_name][data_type] = data[ 263 | keep_metadata + feature_cols.tolist() 264 | ] 265 | 266 | # %% 267 | 268 | # Make "Metadata_Well" uppercase 269 | for dataset_name, data_types in dataset_data.items(): 270 | for data_type, data in data_types.items(): 271 | if "Metadata_Well" in data.columns: 272 | data["Metadata_Well"] = data["Metadata_Well"].str.upper() 273 | 274 | 275 | # Make "Metadata_cell_id" = U2OS for CDRP-BBBC047-Bray l1k 276 | dataset_data["CDRP-BBBC047-Bray"]["l1k"]["Metadata_cell_id"] = "U2OS" 277 | 278 | # Set timepoints 279 | dataset_data["LINCS-Pilot1"]["cp"]["Metadata_pert_timepoint"] = 48 280 | dataset_data["LINCS-Pilot1"]["l1k"]["Metadata_pert_timepoint"] = 24 281 | 282 | dataset_data["CDRP-BBBC047-Bray"]["cp"]["Metadata_pert_timepoint"] = 48 283 | 
dataset_data["CDRP-BBBC047-Bray"]["l1k"]["Metadata_pert_timepoint"] = 6 284 | 285 | dataset_data["TA-ORF-BBBC037-Rohban"]["cp"]["Metadata_pert_timepoint"] = 72 286 | dataset_data["TA-ORF-BBBC037-Rohban"]["l1k"]["Metadata_pert_timepoint"] = 72 287 | 288 | dataset_data["LUAD-BBBC041-Caicedo"]["cp"]["Metadata_pert_timepoint"] = 96 289 | dataset_data["LUAD-BBBC041-Caicedo"]["l1k"]["Metadata_pert_timepoint"] = 96 290 | 291 | # %% 292 | 293 | # Display the datasets 294 | for dataset_name, data_types in dataset_data.items(): 295 | for data_type, data in data_types.items(): 296 | display(f"Dataset: {dataset_name}, Data Type: {data_type}") 297 | display(data.sample(5)[data.columns[data.columns.str.startswith("Metadata")]]) 298 | 299 | # %% 300 | 301 | # %% 302 | for dataset_name, data_types in dataset_data.items(): 303 | for data_type, data in data_types.items(): 304 | if "Metadata_pert_type" in data.columns: 305 | data["Metadata_pert_type"] = data["Metadata_pert_type"].replace( 306 | {"ctl_vehicle": "control", "trt_cp": "trt"} 307 | ) 308 | 309 | 310 | # TA-ORF-BBBC037-Rohban cp does not correctly identify Metadata_pert_type, because it marks all as trt. 
311 | 312 | # %% 313 | 314 | # Print columns for each dataset and data type 315 | print("\nColumns in each dataset:") 316 | for dataset_name, data_types in dataset_data.items(): 317 | print(f"\n{dataset_name}:") 318 | for data_type, data in data_types.items(): 319 | metadata_cols = [col for col in data.columns if col.startswith("Metadata")] 320 | print(f" {data_type}: {sorted(metadata_cols)}") 321 | 322 | # Find common columns between l1k datasets 323 | l1k_common = set.intersection( 324 | *[set(data_types["l1k"].columns) for data_types in dataset_data.values()] 325 | ) 326 | l1k_metadata_common = sorted([col for col in l1k_common if col.startswith("Metadata")]) 327 | 328 | # Find common columns between cp datasets 329 | cp_common = set.intersection( 330 | *[set(data_types["cp"].columns) for data_types in dataset_data.values()] 331 | ) 332 | cp_metadata_common = sorted([col for col in cp_common if col.startswith("Metadata")]) 333 | 334 | # Find common columns across all datasets 335 | all_common = set.intersection(l1k_common, cp_common) 336 | all_metadata_common = sorted([col for col in all_common if col.startswith("Metadata")]) 337 | 338 | print("\nCommon Metadata columns across L1K datasets:") 339 | print(l1k_metadata_common) 340 | print("\nCommon Metadata columns across CP datasets:") 341 | print(cp_metadata_common) 342 | print("\nCommon Metadata columns across ALL datasets:") 343 | print(all_metadata_common) 344 | 345 | # %% 346 | 347 | # Check for duplicate columns within each dataset 348 | for dataset_name, data_types in dataset_data.items(): 349 | for data_type, data in data_types.items(): 350 | duplicate_cols = data.columns.duplicated() 351 | if any(duplicate_cols): 352 | print(f"Duplicate columns found in {dataset_name} {data_type}:") 353 | print(data.columns[duplicate_cols]) 354 | 355 | # %% 356 | # Create markdown output for datasets 357 | markdown_output = "# Dataset Samples\n\n" 358 | 359 | for dataset_name, data_types in dataset_data.items(): 360 | 
markdown_output += f"## {dataset_name}\n\n" 361 | for data_type, data in data_types.items(): 362 | markdown_output += f"### {data_type.upper()} Data\n\n" 363 | # Convert sample to markdown table 364 | sample_df = data.sample(5)[ 365 | [col for col in data.columns if col.startswith("Metadata")] 366 | ] 367 | markdown_output += sample_df.to_markdown(index=False) + "\n\n" 368 | display(sample_df.head()) 369 | 370 | # Write to file 371 | with open("dataset_samples.md", "w") as f: 372 | f.write(markdown_output) 373 | 374 | print("Dataset samples have been written to dataset_samples.md") 375 | 376 | # %% 377 | 378 | # Save processed datasets using same structure as input 379 | for dataset_name, data_types in dataset_data.items(): 380 | for data_type, data in data_types.items(): 381 | # Mirror the input path structure but with processed data 382 | input_path = Path(dataset_paths[dataset_name][data_type]) 383 | output_path = ( 384 | Path("curated") 385 | / input_path.parent 386 | / input_path.name.replace(".csv.gz", ".parquet") 387 | ) 388 | # Create the processed subdirectory if it doesn't exist 389 | output_path.parent.mkdir(exist_ok=True, parents=True) 390 | 391 | # # Save the data 392 | data.to_parquet(output_path, index=False) 393 | print(f"Saved {dataset_name} {data_type} data to {output_path}") 394 | 395 | # %% 396 | -------------------------------------------------------------------------------- /1.dataset_curation/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "1-dataset-curation" 3 | version = "0.1.0" 4 | description = "Curate 2022_Haghighi_NatureMethods" 5 | readme = "README.md" 6 | requires-python = ">=3.12" 7 | dependencies = [ 8 | "ipython>=8.31.0", 9 | "pandas>=2.2.3", 10 | "pyarrow>=19.0.0", 11 | "tabulate>=0.9.0", 12 | ] 13 | -------------------------------------------------------------------------------- /1.dataset_curation/uv.lock: 
-------------------------------------------------------------------------------- 1 | version = 1 2 | requires-python = ">=3.12" 3 | 4 | [[package]] 5 | name = "1-dataset-curation" 6 | version = "0.1.0" 7 | source = { virtual = "." } 8 | dependencies = [ 9 | { name = "ipython" }, 10 | { name = "pandas" }, 11 | { name = "pyarrow" }, 12 | { name = "tabulate" }, 13 | ] 14 | 15 | [package.metadata] 16 | requires-dist = [ 17 | { name = "ipython", specifier = ">=8.31.0" }, 18 | { name = "pandas", specifier = ">=2.2.3" }, 19 | { name = "pyarrow", specifier = ">=19.0.0" }, 20 | { name = "tabulate", specifier = ">=0.9.0" }, 21 | ] 22 | 23 | [[package]] 24 | name = "asttokens" 25 | version = "3.0.0" 26 | source = { registry = "https://pypi.org/simple" } 27 | sdist = { url = "https://files.pythonhosted.org/packages/4a/e7/82da0a03e7ba5141f05cce0d302e6eed121ae055e0456ca228bf693984bc/asttokens-3.0.0.tar.gz", hash = "sha256:0dcd8baa8d62b0c1d118b399b2ddba3c4aff271d0d7a9e0d4c1681c79035bbc7", size = 61978 } 28 | wheels = [ 29 | { url = "https://files.pythonhosted.org/packages/25/8a/c46dcc25341b5bce5472c718902eb3d38600a903b14fa6aeecef3f21a46f/asttokens-3.0.0-py3-none-any.whl", hash = "sha256:e3078351a059199dd5138cb1c706e6430c05eff2ff136af5eb4790f9d28932e2", size = 26918 }, 30 | ] 31 | 32 | [[package]] 33 | name = "colorama" 34 | version = "0.4.6" 35 | source = { registry = "https://pypi.org/simple" } 36 | sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697 } 37 | wheels = [ 38 | { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, 39 | ] 40 | 41 | [[package]] 42 | name = "decorator" 43 | version = 
"5.1.1" 44 | source = { registry = "https://pypi.org/simple" } 45 | sdist = { url = "https://files.pythonhosted.org/packages/66/0c/8d907af351aa16b42caae42f9d6aa37b900c67308052d10fdce809f8d952/decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330", size = 35016 } 46 | wheels = [ 47 | { url = "https://files.pythonhosted.org/packages/d5/50/83c593b07763e1161326b3b8c6686f0f4b0f24d5526546bee538c89837d6/decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186", size = 9073 }, 48 | ] 49 | 50 | [[package]] 51 | name = "executing" 52 | version = "2.2.0" 53 | source = { registry = "https://pypi.org/simple" } 54 | sdist = { url = "https://files.pythonhosted.org/packages/91/50/a9d80c47ff289c611ff12e63f7c5d13942c65d68125160cefd768c73e6e4/executing-2.2.0.tar.gz", hash = "sha256:5d108c028108fe2551d1a7b2e8b713341e2cb4fc0aa7dcf966fa4327a5226755", size = 978693 } 55 | wheels = [ 56 | { url = "https://files.pythonhosted.org/packages/7b/8f/c4d9bafc34ad7ad5d8dc16dd1347ee0e507a52c3adb6bfa8887e1c6a26ba/executing-2.2.0-py2.py3-none-any.whl", hash = "sha256:11387150cad388d62750327a53d3339fad4888b39a6fe233c3afbb54ecffd3aa", size = 26702 }, 57 | ] 58 | 59 | [[package]] 60 | name = "ipython" 61 | version = "8.31.0" 62 | source = { registry = "https://pypi.org/simple" } 63 | dependencies = [ 64 | { name = "colorama", marker = "sys_platform == 'win32'" }, 65 | { name = "decorator" }, 66 | { name = "jedi" }, 67 | { name = "matplotlib-inline" }, 68 | { name = "pexpect", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" }, 69 | { name = "prompt-toolkit" }, 70 | { name = "pygments" }, 71 | { name = "stack-data" }, 72 | { name = "traitlets" }, 73 | ] 74 | sdist = { url = "https://files.pythonhosted.org/packages/01/35/6f90fdddff7a08b7b715fccbd2427b5212c9525cd043d26fdc45bee0708d/ipython-8.31.0.tar.gz", hash = 
"sha256:b6a2274606bec6166405ff05e54932ed6e5cfecaca1fc05f2cacde7bb074d70b", size = 5501011 } 75 | wheels = [ 76 | { url = "https://files.pythonhosted.org/packages/04/60/d0feb6b6d9fe4ab89fe8fe5b47cbf6cd936bfd9f1e7ffa9d0015425aeed6/ipython-8.31.0-py3-none-any.whl", hash = "sha256:46ec58f8d3d076a61d128fe517a51eb730e3aaf0c184ea8c17d16e366660c6a6", size = 821583 }, 77 | ] 78 | 79 | [[package]] 80 | name = "jedi" 81 | version = "0.19.2" 82 | source = { registry = "https://pypi.org/simple" } 83 | dependencies = [ 84 | { name = "parso" }, 85 | ] 86 | sdist = { url = "https://files.pythonhosted.org/packages/72/3a/79a912fbd4d8dd6fbb02bf69afd3bb72cf0c729bb3063c6f4498603db17a/jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0", size = 1231287 } 87 | wheels = [ 88 | { url = "https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9", size = 1572278 }, 89 | ] 90 | 91 | [[package]] 92 | name = "matplotlib-inline" 93 | version = "0.1.7" 94 | source = { registry = "https://pypi.org/simple" } 95 | dependencies = [ 96 | { name = "traitlets" }, 97 | ] 98 | sdist = { url = "https://files.pythonhosted.org/packages/99/5b/a36a337438a14116b16480db471ad061c36c3694df7c2084a0da7ba538b7/matplotlib_inline-0.1.7.tar.gz", hash = "sha256:8423b23ec666be3d16e16b60bdd8ac4e86e840ebd1dd11a30b9f117f2fa0ab90", size = 8159 } 99 | wheels = [ 100 | { url = "https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl", hash = "sha256:df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca", size = 9899 }, 101 | ] 102 | 103 | [[package]] 104 | name = "numpy" 105 | version = "2.2.2" 106 | source = { registry = "https://pypi.org/simple" } 107 | sdist = { url = 
"https://files.pythonhosted.org/packages/ec/d0/c12ddfd3a02274be06ffc71f3efc6d0e457b0409c4481596881e748cb264/numpy-2.2.2.tar.gz", hash = "sha256:ed6906f61834d687738d25988ae117683705636936cc605be0bb208b23df4d8f", size = 20233295 } 108 | wheels = [ 109 | { url = "https://files.pythonhosted.org/packages/0c/e6/847d15770ab7a01e807bdfcd4ead5bdae57c0092b7dc83878171b6af97bb/numpy-2.2.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ac9bea18d6d58a995fac1b2cb4488e17eceeac413af014b1dd26170b766d8467", size = 20912636 }, 110 | { url = "https://files.pythonhosted.org/packages/d1/af/f83580891577b13bd7e261416120e036d0d8fb508c8a43a73e38928b794b/numpy-2.2.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:23ae9f0c2d889b7b2d88a3791f6c09e2ef827c2446f1c4a3e3e76328ee4afd9a", size = 14098403 }, 111 | { url = "https://files.pythonhosted.org/packages/2b/86/d019fb60a9d0f1d4cf04b014fe88a9135090adfadcc31c1fadbb071d7fa7/numpy-2.2.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:3074634ea4d6df66be04f6728ee1d173cfded75d002c75fac79503a880bf3825", size = 5128938 }, 112 | { url = "https://files.pythonhosted.org/packages/7a/1b/50985edb6f1ec495a1c36452e860476f5b7ecdc3fc59ea89ccad3c4926c5/numpy-2.2.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:8ec0636d3f7d68520afc6ac2dc4b8341ddb725039de042faf0e311599f54eb37", size = 6661937 }, 113 | { url = "https://files.pythonhosted.org/packages/f4/1b/17efd94cad1b9d605c3f8907fb06bcffc4ce4d1d14d46b95316cccccf2b9/numpy-2.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2ffbb1acd69fdf8e89dd60ef6182ca90a743620957afb7066385a7bbe88dc748", size = 14049518 }, 114 | { url = "https://files.pythonhosted.org/packages/5b/73/65d2f0b698df1731e851e3295eb29a5ab8aa06f763f7e4188647a809578d/numpy-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0349b025e15ea9d05c3d63f9657707a4e1d471128a3b1d876c095f328f8ff7f0", size = 16099146 }, 115 | { url = 
"https://files.pythonhosted.org/packages/d5/69/308f55c0e19d4b5057b5df286c5433822e3c8039ede06d4051d96f1c2c4e/numpy-2.2.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:463247edcee4a5537841d5350bc87fe8e92d7dd0e8c71c995d2c6eecb8208278", size = 15246336 }, 116 | { url = "https://files.pythonhosted.org/packages/f0/d8/d8d333ad0d8518d077a21aeea7b7c826eff766a2b1ce1194dea95ca0bacf/numpy-2.2.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:9dd47ff0cb2a656ad69c38da850df3454da88ee9a6fde0ba79acceee0e79daba", size = 17863507 }, 117 | { url = "https://files.pythonhosted.org/packages/82/6e/0b84ad3103ffc16d6673e63b5acbe7901b2af96c2837174c6318c98e27ab/numpy-2.2.2-cp312-cp312-win32.whl", hash = "sha256:4525b88c11906d5ab1b0ec1f290996c0020dd318af8b49acaa46f198b1ffc283", size = 6276491 }, 118 | { url = "https://files.pythonhosted.org/packages/fc/84/7f801a42a67b9772a883223a0a1e12069a14626c81a732bd70aac57aebc1/numpy-2.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:5acea83b801e98541619af398cc0109ff48016955cc0818f478ee9ef1c5c3dcb", size = 12616372 }, 119 | { url = "https://files.pythonhosted.org/packages/e1/fe/df5624001f4f5c3e0b78e9017bfab7fdc18a8d3b3d3161da3d64924dd659/numpy-2.2.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b208cfd4f5fe34e1535c08983a1a6803fdbc7a1e86cf13dd0c61de0b51a0aadc", size = 20899188 }, 120 | { url = "https://files.pythonhosted.org/packages/a9/80/d349c3b5ed66bd3cb0214be60c27e32b90a506946857b866838adbe84040/numpy-2.2.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d0bbe7dd86dca64854f4b6ce2ea5c60b51e36dfd597300057cf473d3615f2369", size = 14113972 }, 121 | { url = "https://files.pythonhosted.org/packages/9d/50/949ec9cbb28c4b751edfa64503f0913cbfa8d795b4a251e7980f13a8a655/numpy-2.2.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:22ea3bb552ade325530e72a0c557cdf2dea8914d3a5e1fecf58fa5dbcc6f43cd", size = 5114294 }, 122 | { url = 
"https://files.pythonhosted.org/packages/8d/f3/399c15629d5a0c68ef2aa7621d430b2be22034f01dd7f3c65a9c9666c445/numpy-2.2.2-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:128c41c085cab8a85dc29e66ed88c05613dccf6bc28b3866cd16050a2f5448be", size = 6648426 }, 123 | { url = "https://files.pythonhosted.org/packages/2c/03/c72474c13772e30e1bc2e558cdffd9123c7872b731263d5648b5c49dd459/numpy-2.2.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:250c16b277e3b809ac20d1f590716597481061b514223c7badb7a0f9993c7f84", size = 14045990 }, 124 | { url = "https://files.pythonhosted.org/packages/83/9c/96a9ab62274ffafb023f8ee08c88d3d31ee74ca58869f859db6845494fa6/numpy-2.2.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0c8854b09bc4de7b041148d8550d3bd712b5c21ff6a8ed308085f190235d7ff", size = 16096614 }, 125 | { url = "https://files.pythonhosted.org/packages/d5/34/cd0a735534c29bec7093544b3a509febc9b0df77718a9b41ffb0809c9f46/numpy-2.2.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:b6fb9c32a91ec32a689ec6410def76443e3c750e7cfc3fb2206b985ffb2b85f0", size = 15242123 }, 126 | { url = "https://files.pythonhosted.org/packages/5e/6d/541717a554a8f56fa75e91886d9b79ade2e595918690eb5d0d3dbd3accb9/numpy-2.2.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:57b4012e04cc12b78590a334907e01b3a85efb2107df2b8733ff1ed05fce71de", size = 17859160 }, 127 | { url = "https://files.pythonhosted.org/packages/b9/a5/fbf1f2b54adab31510728edd06a05c1b30839f37cf8c9747cb85831aaf1b/numpy-2.2.2-cp313-cp313-win32.whl", hash = "sha256:4dbd80e453bd34bd003b16bd802fac70ad76bd463f81f0c518d1245b1c55e3d9", size = 6273337 }, 128 | { url = "https://files.pythonhosted.org/packages/56/e5/01106b9291ef1d680f82bc47d0c5b5e26dfed15b0754928e8f856c82c881/numpy-2.2.2-cp313-cp313-win_amd64.whl", hash = "sha256:5a8c863ceacae696aff37d1fd636121f1a512117652e5dfb86031c8d84836369", size = 12609010 }, 129 | { url = 
"https://files.pythonhosted.org/packages/9f/30/f23d9876de0f08dceb707c4dcf7f8dd7588266745029debb12a3cdd40be6/numpy-2.2.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:b3482cb7b3325faa5f6bc179649406058253d91ceda359c104dac0ad320e1391", size = 20924451 }, 130 | { url = "https://files.pythonhosted.org/packages/6a/ec/6ea85b2da9d5dfa1dbb4cb3c76587fc8ddcae580cb1262303ab21c0926c4/numpy-2.2.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:9491100aba630910489c1d0158034e1c9a6546f0b1340f716d522dc103788e39", size = 14122390 }, 131 | { url = "https://files.pythonhosted.org/packages/68/05/bfbdf490414a7dbaf65b10c78bc243f312c4553234b6d91c94eb7c4b53c2/numpy-2.2.2-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:41184c416143defa34cc8eb9d070b0a5ba4f13a0fa96a709e20584638254b317", size = 5156590 }, 132 | { url = "https://files.pythonhosted.org/packages/f7/ec/fe2e91b2642b9d6544518388a441bcd65c904cea38d9ff998e2e8ebf808e/numpy-2.2.2-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:7dca87ca328f5ea7dafc907c5ec100d187911f94825f8700caac0b3f4c384b49", size = 6671958 }, 133 | { url = "https://files.pythonhosted.org/packages/b1/6f/6531a78e182f194d33ee17e59d67d03d0d5a1ce7f6be7343787828d1bd4a/numpy-2.2.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0bc61b307655d1a7f9f4b043628b9f2b721e80839914ede634e3d485913e1fb2", size = 14019950 }, 134 | { url = "https://files.pythonhosted.org/packages/e1/fb/13c58591d0b6294a08cc40fcc6b9552d239d773d520858ae27f39997f2ae/numpy-2.2.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fad446ad0bc886855ddf5909cbf8cb5d0faa637aaa6277fb4b19ade134ab3c7", size = 16079759 }, 135 | { url = "https://files.pythonhosted.org/packages/2c/f2/f2f8edd62abb4b289f65a7f6d1f3650273af00b91b7267a2431be7f1aec6/numpy-2.2.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:149d1113ac15005652e8d0d3f6fd599360e1a708a4f98e43c9c77834a28238cb", size = 15226139 }, 136 | { url = 
"https://files.pythonhosted.org/packages/aa/29/14a177f1a90b8ad8a592ca32124ac06af5eff32889874e53a308f850290f/numpy-2.2.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:106397dbbb1896f99e044efc90360d098b3335060375c26aa89c0d8a97c5f648", size = 17856316 }, 137 | { url = "https://files.pythonhosted.org/packages/95/03/242ae8d7b97f4e0e4ab8dd51231465fb23ed5e802680d629149722e3faf1/numpy-2.2.2-cp313-cp313t-win32.whl", hash = "sha256:0eec19f8af947a61e968d5429f0bd92fec46d92b0008d0a6685b40d6adf8a4f4", size = 6329134 }, 138 | { url = "https://files.pythonhosted.org/packages/80/94/cd9e9b04012c015cb6320ab3bf43bc615e248dddfeb163728e800a5d96f0/numpy-2.2.2-cp313-cp313t-win_amd64.whl", hash = "sha256:97b974d3ba0fb4612b77ed35d7627490e8e3dff56ab41454d9e8b23448940576", size = 12696208 }, 139 | ] 140 | 141 | [[package]] 142 | name = "pandas" 143 | version = "2.2.3" 144 | source = { registry = "https://pypi.org/simple" } 145 | dependencies = [ 146 | { name = "numpy" }, 147 | { name = "python-dateutil" }, 148 | { name = "pytz" }, 149 | { name = "tzdata" }, 150 | ] 151 | sdist = { url = "https://files.pythonhosted.org/packages/9c/d6/9f8431bacc2e19dca897724cd097b1bb224a6ad5433784a44b587c7c13af/pandas-2.2.3.tar.gz", hash = "sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667", size = 4399213 } 152 | wheels = [ 153 | { url = "https://files.pythonhosted.org/packages/17/a3/fb2734118db0af37ea7433f57f722c0a56687e14b14690edff0cdb4b7e58/pandas-2.2.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9", size = 12529893 }, 154 | { url = "https://files.pythonhosted.org/packages/e1/0c/ad295fd74bfac85358fd579e271cded3ac969de81f62dd0142c426b9da91/pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4", size = 11363475 }, 155 | { url = 
"https://files.pythonhosted.org/packages/c6/2a/4bba3f03f7d07207481fed47f5b35f556c7441acddc368ec43d6643c5777/pandas-2.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3", size = 15188645 }, 156 | { url = "https://files.pythonhosted.org/packages/38/f8/d8fddee9ed0d0c0f4a2132c1dfcf0e3e53265055da8df952a53e7eaf178c/pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fffb8ae78d8af97f849404f21411c95062db1496aeb3e56f146f0355c9989319", size = 12739445 }, 157 | { url = "https://files.pythonhosted.org/packages/20/e8/45a05d9c39d2cea61ab175dbe6a2de1d05b679e8de2011da4ee190d7e748/pandas-2.2.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dfcb5ee8d4d50c06a51c2fffa6cff6272098ad6540aed1a76d15fb9318194d8", size = 16359235 }, 158 | { url = "https://files.pythonhosted.org/packages/1d/99/617d07a6a5e429ff90c90da64d428516605a1ec7d7bea494235e1c3882de/pandas-2.2.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a", size = 14056756 }, 159 | { url = "https://files.pythonhosted.org/packages/29/d4/1244ab8edf173a10fd601f7e13b9566c1b525c4f365d6bee918e68381889/pandas-2.2.3-cp312-cp312-win_amd64.whl", hash = "sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13", size = 11504248 }, 160 | { url = "https://files.pythonhosted.org/packages/64/22/3b8f4e0ed70644e85cfdcd57454686b9057c6c38d2f74fe4b8bc2527214a/pandas-2.2.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f00d1345d84d8c86a63e476bb4955e46458b304b9575dcf71102b5c705320015", size = 12477643 }, 161 | { url = "https://files.pythonhosted.org/packages/e4/93/b3f5d1838500e22c8d793625da672f3eec046b1a99257666c94446969282/pandas-2.2.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3508d914817e153ad359d7e069d752cdd736a247c322d932eb89e6bc84217f28", size = 11281573 }, 162 | { url = 
"https://files.pythonhosted.org/packages/f5/94/6c79b07f0e5aab1dcfa35a75f4817f5c4f677931d4234afcd75f0e6a66ca/pandas-2.2.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:22a9d949bfc9a502d320aa04e5d02feab689d61da4e7764b62c30b991c42c5f0", size = 15196085 }, 163 | { url = "https://files.pythonhosted.org/packages/e8/31/aa8da88ca0eadbabd0a639788a6da13bb2ff6edbbb9f29aa786450a30a91/pandas-2.2.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24", size = 12711809 }, 164 | { url = "https://files.pythonhosted.org/packages/ee/7c/c6dbdb0cb2a4344cacfb8de1c5808ca885b2e4dcfde8008266608f9372af/pandas-2.2.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:800250ecdadb6d9c78eae4990da62743b857b470883fa27f652db8bdde7f6659", size = 16356316 }, 165 | { url = "https://files.pythonhosted.org/packages/57/b7/8b757e7d92023b832869fa8881a992696a0bfe2e26f72c9ae9f255988d42/pandas-2.2.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6374c452ff3ec675a8f46fd9ab25c4ad0ba590b71cf0656f8b6daa5202bca3fb", size = 14022055 }, 166 | { url = "https://files.pythonhosted.org/packages/3b/bc/4b18e2b8c002572c5a441a64826252ce5da2aa738855747247a971988043/pandas-2.2.3-cp313-cp313-win_amd64.whl", hash = "sha256:61c5ad4043f791b61dd4752191d9f07f0ae412515d59ba8f005832a532f8736d", size = 11481175 }, 167 | { url = "https://files.pythonhosted.org/packages/76/a3/a5d88146815e972d40d19247b2c162e88213ef51c7c25993942c39dbf41d/pandas-2.2.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:3b71f27954685ee685317063bf13c7709a7ba74fc996b84fc6821c59b0f06468", size = 12615650 }, 168 | { url = "https://files.pythonhosted.org/packages/9c/8c/f0fd18f6140ddafc0c24122c8a964e48294acc579d47def376fef12bcb4a/pandas-2.2.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:38cf8125c40dae9d5acc10fa66af8ea6fdf760b2714ee482ca691fc66e6fcb18", size = 11290177 }, 169 | { url = 
"https://files.pythonhosted.org/packages/ed/f9/e995754eab9c0f14c6777401f7eece0943840b7a9fc932221c19d1abee9f/pandas-2.2.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ba96630bc17c875161df3818780af30e43be9b166ce51c9a18c1feae342906c2", size = 14651526 }, 170 | { url = "https://files.pythonhosted.org/packages/25/b0/98d6ae2e1abac4f35230aa756005e8654649d305df9a28b16b9ae4353bff/pandas-2.2.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4", size = 11871013 }, 171 | { url = "https://files.pythonhosted.org/packages/cc/57/0f72a10f9db6a4628744c8e8f0df4e6e21de01212c7c981d31e50ffc8328/pandas-2.2.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d", size = 15711620 }, 172 | { url = "https://files.pythonhosted.org/packages/ab/5f/b38085618b950b79d2d9164a711c52b10aefc0ae6833b96f626b7021b2ed/pandas-2.2.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a", size = 13098436 }, 173 | ] 174 | 175 | [[package]] 176 | name = "parso" 177 | version = "0.8.4" 178 | source = { registry = "https://pypi.org/simple" } 179 | sdist = { url = "https://files.pythonhosted.org/packages/66/94/68e2e17afaa9169cf6412ab0f28623903be73d1b32e208d9e8e541bb086d/parso-0.8.4.tar.gz", hash = "sha256:eb3a7b58240fb99099a345571deecc0f9540ea5f4dd2fe14c2a99d6b281ab92d", size = 400609 } 180 | wheels = [ 181 | { url = "https://files.pythonhosted.org/packages/c6/ac/dac4a63f978e4dcb3c6d3a78c4d8e0192a113d288502a1216950c41b1027/parso-0.8.4-py2.py3-none-any.whl", hash = "sha256:a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18", size = 103650 }, 182 | ] 183 | 184 | [[package]] 185 | name = "pexpect" 186 | version = "4.9.0" 187 | source = { registry = "https://pypi.org/simple" } 188 | dependencies = [ 189 | { name = "ptyprocess" }, 190 | ] 191 | 
sdist = { url = "https://files.pythonhosted.org/packages/42/92/cc564bf6381ff43ce1f4d06852fc19a2f11d180f23dc32d9588bee2f149d/pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f", size = 166450 } 192 | wheels = [ 193 | { url = "https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", size = 63772 }, 194 | ] 195 | 196 | [[package]] 197 | name = "prompt-toolkit" 198 | version = "3.0.50" 199 | source = { registry = "https://pypi.org/simple" } 200 | dependencies = [ 201 | { name = "wcwidth" }, 202 | ] 203 | sdist = { url = "https://files.pythonhosted.org/packages/a1/e1/bd15cb8ffdcfeeb2bdc215de3c3cffca11408d829e4b8416dcfe71ba8854/prompt_toolkit-3.0.50.tar.gz", hash = "sha256:544748f3860a2623ca5cd6d2795e7a14f3d0e1c3c9728359013f79877fc89bab", size = 429087 } 204 | wheels = [ 205 | { url = "https://files.pythonhosted.org/packages/e4/ea/d836f008d33151c7a1f62caf3d8dd782e4d15f6a43897f64480c2b8de2ad/prompt_toolkit-3.0.50-py3-none-any.whl", hash = "sha256:9b6427eb19e479d98acff65196a307c555eb567989e6d88ebbb1b509d9779198", size = 387816 }, 206 | ] 207 | 208 | [[package]] 209 | name = "ptyprocess" 210 | version = "0.7.0" 211 | source = { registry = "https://pypi.org/simple" } 212 | sdist = { url = "https://files.pythonhosted.org/packages/20/e5/16ff212c1e452235a90aeb09066144d0c5a6a8c0834397e03f5224495c4e/ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220", size = 70762 } 213 | wheels = [ 214 | { url = "https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35", size = 13993 }, 215 | ] 216 | 217 | [[package]] 218 | name = "pure-eval" 219 | version = 
"0.2.3" 220 | source = { registry = "https://pypi.org/simple" } 221 | sdist = { url = "https://files.pythonhosted.org/packages/cd/05/0a34433a064256a578f1783a10da6df098ceaa4a57bbeaa96a6c0352786b/pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42", size = 19752 } 222 | wheels = [ 223 | { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842 }, 224 | ] 225 | 226 | [[package]] 227 | name = "pyarrow" 228 | version = "19.0.0" 229 | source = { registry = "https://pypi.org/simple" } 230 | sdist = { url = "https://files.pythonhosted.org/packages/7b/01/fe1fd04744c2aa038e5a11c7a4adb3d62bce09798695e54f7274b5977134/pyarrow-19.0.0.tar.gz", hash = "sha256:8d47c691765cf497aaeed4954d226568563f1b3b74ff61139f2d77876717084b", size = 1129096 } 231 | wheels = [ 232 | { url = "https://files.pythonhosted.org/packages/bc/2e/152885f5ef421e80dae68b9c133ab261934f93a6d5e16b61d79c0ed597fb/pyarrow-19.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:a7bbe7109ab6198688b7079cbad5a8c22de4d47c4880d8e4847520a83b0d1b68", size = 30667964 }, 233 | { url = "https://files.pythonhosted.org/packages/80/c2/08bbee9a8610a47c9a1466845f405baf53a639ddd947c5133d8ba13544b6/pyarrow-19.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:4624c89d6f777c580e8732c27bb8e77fd1433b89707f17c04af7635dd9638351", size = 32125039 }, 234 | { url = "https://files.pythonhosted.org/packages/d2/56/06994df823212f5688d3c8bf4294928b12c9be36681872853655724d28c6/pyarrow-19.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b6d3ce4288793350dc2d08d1e184fd70631ea22a4ff9ea5c4ff182130249d9b", size = 41140729 }, 235 | { url = 
"https://files.pythonhosted.org/packages/94/65/38ad577c98140a9db71e9e1e594b6adb58a7478a5afec6456a8ca2df7f70/pyarrow-19.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:450a7d27e840e4d9a384b5c77199d489b401529e75a3b7a3799d4cd7957f2f9c", size = 42202267 }, 236 | { url = "https://files.pythonhosted.org/packages/b6/1f/966b722251a7354114ccbb71cf1a83922023e69efd8945ebf628a851ec4c/pyarrow-19.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:a08e2a8a039a3f72afb67a6668180f09fddaa38fe0d21f13212b4aba4b5d2451", size = 40505858 }, 237 | { url = "https://files.pythonhosted.org/packages/3b/5e/6bc81aa7fc9affc7d1c03b912fbcc984ca56c2a18513684da267715dab7b/pyarrow-19.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:f43f5aef2a13d4d56adadae5720d1fed4c1356c993eda8b59dace4b5983843c1", size = 42084973 }, 238 | { url = "https://files.pythonhosted.org/packages/53/c3/2f56da818b6a4758cbd514957c67bd0f078ebffa5390ee2e2bf0f9e8defc/pyarrow-19.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:2f672f5364b2d7829ef7c94be199bb88bf5661dd485e21d2d37de12ccb78a136", size = 25241976 }, 239 | { url = "https://files.pythonhosted.org/packages/f5/b9/ba07ed3dd6b6e4f379b78e9c47c50c8886e07862ab7fa6339ac38622d755/pyarrow-19.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:cf3bf0ce511b833f7bc5f5bb3127ba731e97222023a444b7359f3a22e2a3b463", size = 30651291 }, 240 | { url = "https://files.pythonhosted.org/packages/ad/10/0d304243c8277035298a68a70807efb76199c6c929bb3363c92ac9be6a0d/pyarrow-19.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:4d8b0c0de0a73df1f1bf439af1b60f273d719d70648e898bc077547649bb8352", size = 32100461 }, 241 | { url = "https://files.pythonhosted.org/packages/8a/61/bcfc5182e11831bca3f849945b9b106e09fd10ded773dff466658e972a45/pyarrow-19.0.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a92aff08e23d281c69835e4a47b80569242a504095ef6a6223c1f6bb8883431d", size = 41132491 }, 242 | { url = 
"https://files.pythonhosted.org/packages/8e/87/2915a29049ec352dc69a967fbcbd76b0180319233de0daf8bd368df37099/pyarrow-19.0.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3b78eff5968a1889a0f3bc81ca57e1e19b75f664d9c61a42a604bf9d8402aae", size = 42192529 }, 243 | { url = "https://files.pythonhosted.org/packages/48/18/44e5542b2707a8afaf78b5b88c608f261871ae77787eac07b7c679ca6f0f/pyarrow-19.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:b34d3bde38eba66190b215bae441646330f8e9da05c29e4b5dd3e41bde701098", size = 40495363 }, 244 | { url = "https://files.pythonhosted.org/packages/ba/d6/5096deb7599bbd20bc2768058fe23bc725b88eb41bee58303293583a2935/pyarrow-19.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:5418d4d0fab3a0ed497bad21d17a7973aad336d66ad4932a3f5f7480d4ca0c04", size = 42074075 }, 245 | { url = "https://files.pythonhosted.org/packages/2c/df/e3c839c04c284c9ec3d62b02a8c452b795d9b07b04079ab91ce33484d4c5/pyarrow-19.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:e82c3d5e44e969c217827b780ed8faf7ac4c53f934ae9238872e749fa531f7c9", size = 25239803 }, 246 | { url = "https://files.pythonhosted.org/packages/6a/d3/a6d4088e906c7b5d47792256212606d2ae679046dc750eee0ae167338e5c/pyarrow-19.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:f208c3b58a6df3b239e0bb130e13bc7487ed14f39a9ff357b6415e3f6339b560", size = 30695401 }, 247 | { url = "https://files.pythonhosted.org/packages/94/25/70040fd0e397dd1b937f459eaeeec942a76027357491dca0ada09d1322af/pyarrow-19.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:c751c1c93955b7a84c06794df46f1cec93e18610dcd5ab7d08e89a81df70a849", size = 32104680 }, 248 | { url = "https://files.pythonhosted.org/packages/4e/f9/92783290cc0d80ca16d34b0c126305bfacca4b87dd889c8f16c6ef2a8fd7/pyarrow-19.0.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b903afaa5df66d50fc38672ad095806443b05f202c792694f3a604ead7c6ea6e", size = 41076754 }, 249 | { url = 
"https://files.pythonhosted.org/packages/05/46/2c9870f50a495c72e2b8982ae29a9b1680707ea936edc0de444cec48f875/pyarrow-19.0.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a22a4bc0937856263df8b94f2f2781b33dd7f876f787ed746608e06902d691a5", size = 42163133 }, 250 | { url = "https://files.pythonhosted.org/packages/7b/2f/437922b902549228fb15814e8a26105bff2787ece466a8d886eb6699efad/pyarrow-19.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:5e8a28b918e2e878c918f6d89137386c06fe577cd08d73a6be8dafb317dc2d73", size = 40452210 }, 251 | { url = "https://files.pythonhosted.org/packages/36/ef/1d7975053af9d106da973bac142d0d4da71b7550a3576cc3e0b3f444d21a/pyarrow-19.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:29cd86c8001a94f768f79440bf83fee23963af5e7bc68ce3a7e5f120e17edf89", size = 42077618 }, 252 | ] 253 | 254 | [[package]] 255 | name = "pygments" 256 | version = "2.19.1" 257 | source = { registry = "https://pypi.org/simple" } 258 | sdist = { url = "https://files.pythonhosted.org/packages/7c/2d/c3338d48ea6cc0feb8446d8e6937e1408088a72a39937982cc6111d17f84/pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f", size = 4968581 } 259 | wheels = [ 260 | { url = "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", size = 1225293 }, 261 | ] 262 | 263 | [[package]] 264 | name = "python-dateutil" 265 | version = "2.9.0.post0" 266 | source = { registry = "https://pypi.org/simple" } 267 | dependencies = [ 268 | { name = "six" }, 269 | ] 270 | sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432 } 271 | wheels = [ 272 
| { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892 }, 273 | ] 274 | 275 | [[package]] 276 | name = "pytz" 277 | version = "2024.2" 278 | source = { registry = "https://pypi.org/simple" } 279 | sdist = { url = "https://files.pythonhosted.org/packages/3a/31/3c70bf7603cc2dca0f19bdc53b4537a797747a58875b552c8c413d963a3f/pytz-2024.2.tar.gz", hash = "sha256:2aa355083c50a0f93fa581709deac0c9ad65cca8a9e9beac660adcbd493c798a", size = 319692 } 280 | wheels = [ 281 | { url = "https://files.pythonhosted.org/packages/11/c3/005fcca25ce078d2cc29fd559379817424e94885510568bc1bc53d7d5846/pytz-2024.2-py2.py3-none-any.whl", hash = "sha256:31c7c1817eb7fae7ca4b8c7ee50c72f93aa2dd863de768e1ef4245d426aa0725", size = 508002 }, 282 | ] 283 | 284 | [[package]] 285 | name = "six" 286 | version = "1.17.0" 287 | source = { registry = "https://pypi.org/simple" } 288 | sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031 } 289 | wheels = [ 290 | { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050 }, 291 | ] 292 | 293 | [[package]] 294 | name = "stack-data" 295 | version = "0.6.3" 296 | source = { registry = "https://pypi.org/simple" } 297 | dependencies = [ 298 | { name = "asttokens" }, 299 | { name = "executing" }, 300 | { name = "pure-eval" }, 301 | ] 302 | sdist = { url = "https://files.pythonhosted.org/packages/28/e3/55dcc2cfbc3ca9c29519eb6884dd1415ecb53b0e934862d3559ddcb7e20b/stack_data-0.6.3.tar.gz", hash = 
"sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9", size = 44707 } 303 | wheels = [ 304 | { url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521 }, 305 | ] 306 | 307 | [[package]] 308 | name = "tabulate" 309 | version = "0.9.0" 310 | source = { registry = "https://pypi.org/simple" } 311 | sdist = { url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c", size = 81090 } 312 | wheels = [ 313 | { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252 }, 314 | ] 315 | 316 | [[package]] 317 | name = "traitlets" 318 | version = "5.14.3" 319 | source = { registry = "https://pypi.org/simple" } 320 | sdist = { url = "https://files.pythonhosted.org/packages/eb/79/72064e6a701c2183016abbbfedaba506d81e30e232a68c9f0d6f6fcd1574/traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7", size = 161621 } 321 | wheels = [ 322 | { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359 }, 323 | ] 324 | 325 | [[package]] 326 | name = "tzdata" 327 | version = "2025.1" 328 | source = { registry = "https://pypi.org/simple" } 329 | sdist = { url = "https://files.pythonhosted.org/packages/43/0f/fa4723f22942480be4ca9527bbde8d43f6c3f2fe8412f00e7f5f6746bc8b/tzdata-2025.1.tar.gz", hash = 
"sha256:24894909e88cdb28bd1636c6887801df64cb485bd593f2fd83ef29075a81d694", size = 194950 } 330 | wheels = [ 331 | { url = "https://files.pythonhosted.org/packages/0f/dd/84f10e23edd882c6f968c21c2434fe67bd4a528967067515feca9e611e5e/tzdata-2025.1-py2.py3-none-any.whl", hash = "sha256:7e127113816800496f027041c570f50bcd464a020098a3b6b199517772303639", size = 346762 }, 332 | ] 333 | 334 | [[package]] 335 | name = "wcwidth" 336 | version = "0.2.13" 337 | source = { registry = "https://pypi.org/simple" } 338 | sdist = { url = "https://files.pythonhosted.org/packages/6c/63/53559446a878410fc5a5974feb13d31d78d752eb18aeba59c7fef1af7598/wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5", size = 101301 } 339 | wheels = [ 340 | { url = "https://files.pythonhosted.org/packages/fd/84/fd2ba7aafacbad3c4201d395674fc6348826569da3c0937e75505ead3528/wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859", size = 34166 }, 341 | ] 342 | -------------------------------------------------------------------------------- /GO_terms_search/4-GO-terms-search-analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "02b2641f", 6 | "metadata": {}, 7 | "source": [ 8 | "#### Here we investigate the relashionship between:\n", 9 | " - mRNA level predictability of a landmark gene \n", 10 | " and \n", 11 | " - its known organelle level biological function using GO annotations" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "id": "367cc88b", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import pandas as pd\n", 22 | "import numpy as np\n", 23 | "def locations_of_substring(string, substring):\n", 24 | " \"\"\"Return a list of locations of a substring.\"\"\"\n", 25 | " substring_length = len(substring) \n", 26 | " def 
recurse(locations_found, start):\n", 27 | " location = string.find(substring, start)\n", 28 | " if location != -1:\n", 29 | " return recurse(locations_found + [location], location+substring_length)\n", 30 | " else:\n", 31 | " return locations_found\n", 32 | " return recurse([], 0)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "id": "d80e3f38", 38 | "metadata": {}, 39 | "source": [ 40 | "#### For LUAD dataset:\n", 41 | "\n", 42 | "1 - Read predictability map of categorical features (using MLP model)\n", 43 | "\n", 44 | "2 - Assign the feature categories to compartments/stains\n", 45 | "\n", 46 | "3 - Read functional annotations of the reference set according to DAVID's output and add columns for each channel\n", 47 | " - Add channel-specific annotations to each channel's column\n", 48 | " " 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "id": "00bc67ee", 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "import sys\n", 59 | "sys.path.insert(0, '../utils/') \n", 60 | "from readProfiles import rename_affyprobe_to_genename\n", 61 | "from saveAsNewSheetToExistingFile import saveAsNewSheetToExistingFile\n", 62 | "########### 1 ###########\n", 63 | "filename='../results/SingleGenePred_cpCategoryMap/cat_scores_maps.xlsx'\n", 64 | "saved_scores=pd.read_excel(filename, sheet_name=None)\n", 65 | "# which_ds_model='LUAD-9-MLP-ht'\n", 66 | "which_ds_model='LUAD-9-MLP-keras-ht'\n", 67 | "# which_ds_model='LUAD-9-lasso-ht'\n", 68 | "dfcats=saved_scores[which_ds_model].rename(columns={'Unnamed: 0':'ID'})\n", 69 | "dfcats=dfcats[dfcats.columns[~dfcats.isna().any()].tolist()]\n", 70 | "dfcats2,_=rename_affyprobe_to_genename(dfcats.set_index('ID').T,dfcats.ID.tolist())\n", 71 | "dfcats=dfcats2.T.reset_index()\n", 72 | "\n", 73 | "\n", 74 | "########### 2 ###########\n", 75 | "Channelss=['DNA','RNA','AGP','Mito','ER']\n", 76 |
"Channelss_cats=['DNA|Nuclei_AreaShape','RNA','AGP|Cytoplasm_AreaShape|Cells_AreaShape','Mito','ER']\n", 77 | "# Channelss_cats=['DNA|Nuclei_AreaShape','RNA','AGP|Cytoplasm_AreaShape','Mito','ER']\n", 78 | "\n", 79 | "for ci in range(len(Channelss)):\n", 80 | " dfcats['max_'+Channelss[ci]]=dfcats.loc[:,dfcats.columns.str.contains(Channelss_cats[ci])].max(axis=1)\n", 81 | "\n", 82 | " \n", 83 | "dfcats['top_channel']=dfcats[['max_'+Channelss[ci] for ci in range(len(Channelss))]].idxmax(axis=\"columns\")\n", 84 | "########### 3 ###########\n", 85 | "# gene_cats_bpcc=pd.read_csv('./go_BP_CC_MF_DIRECT_921.txt',delimiter='\\t')\n", 86 | "gene_cats_bpcc=pd.read_csv('./source/GO_bp_cc_mf_direct_LUAD_976.txt',delimiter='\\t')\n", 87 | "comps=['mitochondri','golgi','membrane','cytoskeleton','actin','endoplasmic','rna','nucleol',\\\n", 88 | " 'cell division','mitosis','mitotic','cell cycle']\n", 89 | "\n", 90 | "# GOTERM_BP_DIRECT\n", 91 | "# GOTERM_CC_DIRECT\n", 92 | "# GOTERM_MF_DIRECT\n", 93 | "for c in comps:\n", 94 | " gene_cats_bpcc[c]=gene_cats_bpcc['GOTERM_CC_DIRECT'].astype(str).str.lower().apply(lambda x:\\\n", 95 | " ''.join([x[:si].split('~')[-1]+x[si:].split('go')[0] for si in locations_of_substring(x,c)]) if c in x else '')+\\\n", 96 | " gene_cats_bpcc['GOTERM_MF_DIRECT'].astype(str).str.lower().apply(lambda x:\\\n", 97 | " ''.join([x[:si].split('~')[-1]+x[si:].split('go')[0] for si in locations_of_substring(x,c)]) if c in x else '')+\\\n", 98 | " gene_cats_bpcc['GOTERM_BP_DIRECT'].astype(str).str.lower().apply(lambda x:\\\n", 99 | " ''.join([x[:si].split('~')[-1]+x[si:].split('go')[0] for si in locations_of_substring(x,c)]) if c in x else '')#+\\\n", 100 | "# gene_cats_bpcc['UP_KW_BIOLOGICAL_PROCESS'].astype(str).str.lower().apply(lambda x:\\\n", 101 | "# ''.join([x[:si].split('~')[-1]+x[si:].split('kw')[0] for si in locations_of_substring(x,c)]) if c in x else '')+\\\n", 102 | "# gene_cats_bpcc['UP_KW_CELLULAR_COMPONENT'].astype(str).str.lower().apply(lambda 
x:\\\n", 103 | "# ''.join([x[:si].split('~')[-1]+x[si:].split('kw')[0] for si in locations_of_substring(x,c)]) if c in x else '')+\\\n", 104 | "# gene_cats_bpcc['UP_KW_MOLECULAR_FUNCTION'].astype(str).str.lower().apply(lambda x:\\\n", 105 | "# ''.join([x[:si].split('~')[-1]+x[si:].split('kw')[0] for si in locations_of_substring(x,c)]) if c in x else '')#+\\\n", 106 | "# # gene_cats_bpcc['UP_SEQ_FEATURE'].astype(str).str.lower().apply(lambda x:\\\n", 107 | "# ''.join([x[:si].split(':')[-1]+','+x[si:].split(',')[0] for si in locations_of_substring(x,c)]) if c in x else '')\n", 108 | " \n", 109 | "gene_cats_bpcc['RNA_nucleoli']=gene_cats_bpcc['rna']+gene_cats_bpcc['nucleol']\n", 110 | "gene_cats_bpcc['DNA']=gene_cats_bpcc['cell division']+gene_cats_bpcc['mitosis']+\\\n", 111 | "gene_cats_bpcc['mitotic']+gene_cats_bpcc['cell cycle']\n", 112 | "\n", 113 | "gene_cats_bpcc['cytoskeleton-actin']=gene_cats_bpcc['cytoskeleton']+gene_cats_bpcc['actin']+\\\n", 114 | "gene_cats_bpcc['golgi']+gene_cats_bpcc['membrane']\n", 115 | "# gene_cats_bpcc['ER']=gene_cats_bpcc['endoplasmic']\n", 116 | "gene_cats_bpcc['mitochondria']=gene_cats_bpcc['mitochondri']\n", 117 | "\n", 118 | "gene_cats_bpcc=pd.merge(gene_cats_bpcc,dfcats,how='inner',on='ID')\n", 119 | "\n", 120 | "#########################\n", 121 | "Channelss_dict={'DNA':'DNA','RNA_nucleoli':'RNA','cytoskeleton-actin':'AGP','mitochondria':'Mito','endoplasmic':'ER'}\n", 122 | "Chan_rev_dict = dict(zip(Channelss_dict.values(),Channelss_dict.keys()))\n", 123 | "\n", 124 | "comps2=['mitochondri','cytoskeleton-actin','endoplasmic','RNA_nucleoli','DNA']\n", 125 | "gene_cats_bpcc['any_comps']=''\n", 126 | "for co in comps2:\n", 127 | " gene_cats_bpcc['any_comps']=gene_cats_bpcc['any_comps']+gene_cats_bpcc[co]\n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | "# gene_cats_bpcc=gene_cats_bpcc[gene_cats_bpcc['any_comps']!=''].reset_index(drop=True) \n", 132 | "from sklearn.metrics import confusion_matrix\n", 133 | "from scipy.stats import 
fisher_exact\n", 134 | "# top_bool=(gene_cats_bpcc['top58']==True).values\n", 135 | "table2=pd.DataFrame(index=Channelss_dict.keys(),columns=Channelss_dict.values())\n", 136 | "table3=pd.DataFrame(index=Channelss,columns=['odds ratio','med_restComp_oddsratio','restComp_oddsratio','anyComp_oddsratio','top-ratio'])\n", 137 | "# table3=pd.DataFrame(index=Channelss,columns=['Prevalence','anyComp-Prevalence','noComp-Prevalence','top-ratio'])\n", 138 | "\n", 139 | "table=[]\n", 140 | "for c in Channelss:\n", 141 | " print(c)\n", 142 | " c_r=list(set(Channelss)-set([c]))\n", 143 | "# gene_cats_bpcc_highP=gene_cats_bpcc[(gene_cats_bpcc['max_'+c]>0.7)]\n", 144 | "# gene_cats_bpcc_lowP=gene_cats_bpcc[(gene_cats_bpcc['max_'+c]<0.1)]\n", 145 | " \n", 146 | " low_ind=gene_cats_bpcc[['max_'+c]].sort_values(by='max_'+c)[:100].index\n", 147 | " high_ind=gene_cats_bpcc[['max_'+c]].sort_values(by='max_'+c)[-100:].index \n", 148 | " gene_cats_bpcc_highP=gene_cats_bpcc.loc[high_ind].reset_index(drop=True)\n", 149 | " gene_cats_bpcc_lowP=gene_cats_bpcc.loc[low_ind].reset_index(drop=True)\n", 150 | " gene_cats_bpcc2=gene_cats_bpcc.copy()\n", 151 | " \n", 152 | "# top_bool=(gene_cats_bpcc['max_'+c]>.6).values\n", 153 | " top_bool=(gene_cats_bpcc['max_'+c]<0).values\n", 154 | " \n", 155 | " print(np.sum(gene_cats_bpcc.loc[gene_cats_bpcc['max_'+c]<0,['max_'+Channelss[ci] for ci in range(len(Channelss))]].max(axis=1)>0.3))\n", 156 | " n_top=sum(top_bool)\n", 157 | " print(n_top)\n", 158 | " \n", 159 | " comps2=['mitochondria','cytoskeleton-actin','endoplasmic','RNA_nucleoli','DNA']\n", 160 | " table1=pd.DataFrame(index=comps2+['any comp','no comp'],columns=['Prevalence','p-value','odds ratio'])\n", 161 | " for co in comps2:\n", 162 | " \n", 163 | " enr_ratio=gene_cats_bpcc2[top_bool & (gene_cats_bpcc2[co]!='')].shape[0]/\\\n", 164 | " gene_cats_bpcc2[(gene_cats_bpcc2[co]!='')].shape[0]\n", 165 | " comp_bool=(gene_cats_bpcc2[co]!='').values\n", 166 | " oddsratio, pvalue = 
fisher_exact(confusion_matrix(top_bool, comp_bool))\n", 167 | " \n", 168 | " table1.loc[co,['Prevalence','p-value','odds ratio']]=enr_ratio*100,pvalue,oddsratio\n", 169 | "# print(co,\": \",np.round(enr_ratio*100,2),'% ','pvalue:', np.round(pvalue,2),' oddsratio:',np.round(oddsratio,2))\n", 170 | " table2.loc[co,c]=oddsratio\n", 171 | " \n", 172 | " \n", 173 | " enr_ratio=gene_cats_bpcc[top_bool & (gene_cats_bpcc['any_comps']!='')].shape[0]/\\\n", 174 | " gene_cats_bpcc[(gene_cats_bpcc['any_comps']!='')].shape[0]\n", 175 | " any_oddsratio, pvalue = fisher_exact(confusion_matrix(top_bool, (gene_cats_bpcc['any_comps']!='').values))\n", 176 | "# print(\"any_comps: \",np.round(enr_ratio*100,2),'% ','pvalue:', np.round(pvalue,2),' oddsratio:',np.round(oddsratio,2))\n", 177 | " table1.loc['any comp',['Prevalence','p-value','odds ratio']]=enr_ratio*100,pvalue,any_oddsratio\n", 178 | " print(any_oddsratio)\n", 179 | "\n", 180 | " nocomp_enr_ratio=gene_cats_bpcc[(top_bool) & (gene_cats_bpcc['any_comps']=='')].shape[0]/\\\n", 181 | "gene_cats_bpcc[(gene_cats_bpcc['any_comps']=='')].shape[0]\n", 182 | "\n", 183 | " comps2=['mitochondria','cytoskeleton-actin','endoplasmic','RNA_nucleoli','DNA']\n", 184 | " comps2.remove(Chan_rev_dict[c])\n", 185 | " \n", 186 | " med_restComp_oddsratio=table2.loc[comps2,c].median()\n", 187 | " gene_cats_bpcc['rest_comps']=''\n", 188 | " for co in comps2:\n", 189 | " gene_cats_bpcc['rest_comps']=gene_cats_bpcc['rest_comps']+gene_cats_bpcc[co] \n", 190 | " \n", 191 | " rest_enr_ratio=gene_cats_bpcc[(top_bool) & (gene_cats_bpcc['rest_comps']=='')].shape[0]/\\\n", 192 | "gene_cats_bpcc[(gene_cats_bpcc['rest_comps']=='')].shape[0] \n", 193 | " rest_oddsratio, pvalue = fisher_exact(confusion_matrix(top_bool, (gene_cats_bpcc['rest_comps']!='').values))\n", 194 | " \n", 195 | " table.append(table1)\n", 196 | "# print('num top ('+ str(n_top)+')/total genes (912): ', np.round((n_top/912)*100,2),'%')\n", 197 | " table3.loc[c,['odds 
ratio','med_restComp_oddsratio','restComp_oddsratio','anyComp_oddsratio','top-ratio']]=\\\n", 198 | " table1.loc[Chan_rev_dict[c],'odds ratio'],med_restComp_oddsratio,rest_oddsratio,\\\n", 199 | " any_oddsratio,np.round((n_top/912)*100,2)\n", 200 | " \n", 201 | "# table3.loc[c,['Prevalence','anyComp-Prevalence','noComp-Prevalence','top-ratio']]=table1.loc[Chan_rev_dict[c],'Prevalence'],\\\n", 202 | "# enr_ratio*100,nocomp_enr_ratio*100,np.round((n_top/912)*100,2) \n", 203 | " \n", 204 | "# print(Chan_rev_dict[c],': ',table1.loc[Chan_rev_dict[c],['Prevalence']].values)\n", 205 | " \n", 206 | "# # table3['dif']=table3['Prevalence']-table3['anyComp-Prevalence']\n", 207 | "table3['dif']=table3['odds ratio']-table3['restComp_oddsratio']\n", 208 | "table3['dif2']=table3['odds ratio']-table3['med_restComp_oddsratio']\n", 209 | "table3['dif3']=table3['odds ratio']-table3['anyComp_oddsratio']\n", 210 | "# print(table3['dif'].min(),table3['dif'].sum())\n", 211 | "# table3\n", 212 | "\n", 213 | "source_data_add='../results/Figs_Source_Data.xlsx'\n", 214 | "if 0:\n", 215 | " saveAsNewSheetToExistingFile(source_data_add,pd.concat([table3.astype(float).round(3)[['odds ratio','restComp_oddsratio']],x1],axis=1),'ExtendedData5')" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "id": "2a98dcc6", 221 | "metadata": {}, 222 | "source": [ 223 | "## GO terms search for overlap of highly predictable genes (top 58)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "id": "87d28528", 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "# top58=pd.read_csv('./top_58_common.txt',header=None)[0].tolist()\n", 234 | "# gene_cats_bpcc=pd.read_csv('./go_bp_cc_D2021_each_gene_cat.txt',delimiter='\\t')\n", 235 | "import sys\n", 236 | "sys.path.insert(0, '../utils/') \n", 237 | "from readProfiles import rename_affyprobe_to_genename\n", 238 | "from saveAsNewSheetToExistingFile import saveAsNewSheetToExistingFile\n", 239 | 
"top58=pd.read_csv('./source/top_59_atleast_topIn3.txt',header=None)[0].tolist()\n", 240 | "gene_cats_bpcc=pd.read_csv('./source/GO_bp_cc_mf_direct_intersection_782.txt',delimiter='\\t')\n", 241 | "\n", 242 | "# comps=['mitochondri','Golgi','membrane','cytoskeleton','actin','endoplasmic','RNA','nucleol','cell division','mitosis','mitotic','cell cycle']\n", 243 | "\n", 244 | "# for c in comps:\n", 245 | "# gene_cats_bpcc[c]=gene_cats_bpcc['GOTERM_BP_DIRECT'].astype(str).apply(lambda x: x if c in x else '')+\\\n", 246 | "# gene_cats_bpcc['GOTERM_CC_DIRECT'].astype(str).apply(lambda x: x[:x.find(c)].split('~')[-1]+x[x.find(c):].split('GO')[0] if c in x else '')\n", 247 | "comps=['mitochondri','golgi','membrane','cytoskeleton','actin','endoplasmic','rna','nucleol',\\\n", 248 | " 'cell division','mitosis','mitotic','cell cycle','cytokine','hormone']\n", 249 | "\n", 250 | "for c in comps:\n", 251 | " gene_cats_bpcc[c]=gene_cats_bpcc['GOTERM_BP_DIRECT'].astype(str).str.lower().apply(lambda x:\\\n", 252 | " ''.join([x[:si].split('~')[-1]+x[si:].split('GO')[0] for si in locations_of_substring(x,c)]) if c in x else '')+\\\n", 253 | " gene_cats_bpcc['GOTERM_CC_DIRECT'].astype(str).str.lower().apply(lambda x:\\\n", 254 | " ''.join([x[:si].split('~')[-1]+x[si:].split('GO')[0] for si in locations_of_substring(x,c)]) if c in x else '')+\\\n", 255 | " gene_cats_bpcc['GOTERM_MF_DIRECT'].astype(str).str.lower().apply(lambda x:\\\n", 256 | " ''.join([x[:si].split('~')[-1]+x[si:].split('GO')[0] for si in locations_of_substring(x,c)]) if c in x else '')\n", 257 | " \n", 258 | "gene_cats_bpcc['RNA_nucleoli']=gene_cats_bpcc['rna']+gene_cats_bpcc['nucleol']\n", 259 | "gene_cats_bpcc['DNA']=gene_cats_bpcc['cell division']+gene_cats_bpcc['mitosis']+gene_cats_bpcc['mitotic']+gene_cats_bpcc['cell cycle']\n", 260 | "\n", 261 | "gene_cats_bpcc['cytoskeleton-actin']=gene_cats_bpcc['cytoskeleton']+gene_cats_bpcc['actin']\n", 262 | "\n", 263 | 
"gene_cats_bpcc.loc[gene_cats_bpcc['ID'].isin(top58),'top58']=True\n", 264 | "\n", 265 | "\n", 266 | "\n", 267 | "#####################################\n", 268 | "comps2=['mitochondri','golgi','membrane','cytoskeleton-actin','endoplasmic','RNA_nucleoli','DNA']\n", 269 | "from sklearn.metrics import confusion_matrix\n", 270 | "from scipy.stats import fisher_exact\n", 271 | "top_bool=(gene_cats_bpcc['top58']==True).values\n", 272 | "\n", 273 | "table1=pd.DataFrame(index=comps2+['any comp','no comp'],columns=['Prevalence','p-value','odds ratio'])\n", 274 | "for co in comps2:\n", 275 | " enr_ratio=gene_cats_bpcc[(gene_cats_bpcc['top58']==True) & (gene_cats_bpcc[co]!='')].shape[0]/\\\n", 276 | " gene_cats_bpcc[(gene_cats_bpcc[co]!='')].shape[0]\n", 277 | " comp_bool=(gene_cats_bpcc[co]!='').values\n", 278 | " oddsratio, pvalue = fisher_exact(confusion_matrix(top_bool, comp_bool))\n", 279 | "# print(co, ':',gene_cats_bpcc[(gene_cats_bpcc['top58']==True) & (gene_cats_bpcc[co]!='')].shape[0],\\\n", 280 | "# ', ',gene_cats_bpcc[(gene_cats_bpcc[co]!='')].shape[0])\n", 281 | "\n", 282 | " table1.loc[co,['Prevalence','p-value','odds ratio']]=enr_ratio*100,pvalue,oddsratio\n", 283 | " print(co,\": \",np.round(enr_ratio*100,2),'% ','pvalue:', np.round(pvalue,2),' oddsratio:',np.round(oddsratio,2))\n", 284 | " \n", 285 | "gene_cats_bpcc['any_comps']=''\n", 286 | "for co in comps2:\n", 287 | " gene_cats_bpcc['any_comps']=gene_cats_bpcc['any_comps']+gene_cats_bpcc[co]\n", 288 | " \n", 289 | " \n", 290 | "enr_ratio=gene_cats_bpcc[(gene_cats_bpcc['top58']==True) & (gene_cats_bpcc['any_comps']!='')].shape[0]/\\\n", 291 | "gene_cats_bpcc[(gene_cats_bpcc['any_comps']!='')].shape[0]\n", 292 | "oddsratio, pvalue = fisher_exact(confusion_matrix(top_bool, (gene_cats_bpcc['any_comps']!='').values))\n", 293 | "print(\"any_comps: \",np.round(enr_ratio*100,2),'% ','pvalue:', np.round(pvalue,2),' oddsratio:',np.round(oddsratio,2))\n", 294 | "table1.loc['any comp',['Prevalence','p-value','odds 
ratio']]=enr_ratio*100,pvalue,oddsratio\n", 295 | "\n", 296 | "\n", 297 | "enr_ratio=gene_cats_bpcc[(gene_cats_bpcc['top58']==True) & (gene_cats_bpcc['any_comps']=='')].shape[0]/\\\n", 298 | "gene_cats_bpcc[(gene_cats_bpcc['any_comps']=='')].shape[0]\n", 299 | "oddsratio, pvalue = fisher_exact(confusion_matrix(top_bool, (gene_cats_bpcc['any_comps']=='').values))\n", 300 | "print(\"no comps: \",np.round(enr_ratio*100,2),'% ','pvalue:', np.round(pvalue,2),' oddsratio:',np.round(oddsratio,2))\n", 301 | "table1.loc['no comp',['Prevalence','p-value','odds ratio']]=enr_ratio*100,pvalue,oddsratio\n", 302 | "\n", 303 | "# print('num top (52)/total genes (782): ', np.round((52/782)*100,2),'%')\n", 304 | "\n", 305 | "source_data_add='../results/Figs_Source_Data.xlsx'\n", 306 | "if 0:\n", 307 | " saveAsNewSheetToExistingFile(source_data_add,table1,'ExtendedData6')\n", 308 | "\n", 309 | "# gene_cats_bpcc.to_csv('./GO/go_bp_cc_D2021_each_gene_cat_completed.csv',index=False)\n", 310 | "# gene_cats_bpcc.to_csv('./source/GO_bp_cc_mf_direct_intersection_782_completed.csv',index=False)\n" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "id": "83ea53e3", 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "table1['odds ratio'].astype(float).round(2)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "id": "4d4a04f6", 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "table1['odds ratio'].astype(float).round(2).values" 331 | ] 332 | } 333 | ], 334 | "metadata": { 335 | "kernelspec": { 336 | "display_name": "Python 3 (ipykernel)", 337 | "language": "python", 338 | "name": "python3" 339 | }, 340 | "language_info": { 341 | "codemirror_mode": { 342 | "name": "ipython", 343 | "version": 3 344 | }, 345 | "file_extension": ".py", 346 | "mimetype": "text/x-python", 347 | "name": "python", 348 | "nbconvert_exporter": "python", 349 | "pygments_lexer": "ipython3", 350 | "version": 
"3.8.12" 351 | } 352 | }, 353 | "nbformat": 4, 354 | "nbformat_minor": 5 355 | } 356 | -------------------------------------------------------------------------------- /GO_terms_search/source/LUAD_geneSymbols_978.txt: -------------------------------------------------------------------------------- 1 | AARS1 2 | ABCB6 3 | ABCC5 4 | ABCF1 5 | ABCF3 6 | ABHD4 7 | ABHD6 8 | ABL1 9 | ACAA1 10 | ACAT2 11 | ACBD3 12 | ACD 13 | ACLY 14 | ACOT9 15 | ADAM10 16 | ADAT1 17 | ADGRE2 18 | ADGRG1 19 | ADH5 20 | ADI1 21 | ADO 22 | ADRB2 23 | AGER 24 | AGL 25 | AKAP8 26 | AKAP8L 27 | AKR7A2 28 | AKT1 29 | ALAS1 30 | ALDH7A1 31 | ALDOA 32 | ALDOC 33 | AMDHD2 34 | ANKRD10 35 | ANO10 36 | ANXA7 37 | APBB2 38 | APOE 39 | APP 40 | APPBP2 41 | ARFIP2 42 | ARHGAP1 43 | ARHGEF12 44 | ARHGEF2 45 | ARID4B 46 | ARID5B 47 | ARL4C 48 | ARNT2 49 | ARPP19 50 | ASAH1 51 | ASCC3 52 | ATF1 53 | ATF5 54 | ATF6 55 | ATG3 56 | ATMIN 57 | ATP11B 58 | ATP1B1 59 | ATP2C1 60 | ATP6V0B 61 | ATP6V1D 62 | AURKA 63 | AURKB 64 | AXIN1 65 | B3GNT2 66 | BACE2 67 | BAD 68 | BAG3 69 | BAMBI 70 | BAX 71 | BCL2 72 | BCL7B 73 | BDH1 74 | BECN1 75 | BHLHE40 76 | BID 77 | BIRC2 78 | BIRC5 79 | BLCAP 80 | BLMH 81 | BLTP2 82 | BLVRA 83 | BMP4 84 | BNIP3 85 | BNIP3L 86 | BPHL 87 | BRCA1 88 | BTK 89 | BUB1B 90 | BZW2 91 | C2CD2 92 | C2CD2L 93 | C2CD5 94 | C5 95 | CAB39 96 | CALM1 97 | CALU 98 | CAMSAP2 99 | CANT1 100 | CAPN1 101 | CARMIL1 102 | CASC3 103 | CASK 104 | CASP10 105 | CASP2 106 | CASP3 107 | CASP7 108 | CAST 109 | CAT 110 | CBLB 111 | CBR1 112 | CBR3 113 | CCDC85B 114 | CCDC86 115 | CCDC92 116 | CCL2 117 | CCNA1 118 | CCNA2 119 | CCNB1 120 | CCNB2 121 | CCND1 122 | CCND3 123 | CCNE2 124 | CCNF 125 | CCNH 126 | CCP110 127 | CD320 128 | CD40 129 | CD44 130 | CD58 131 | CDC20 132 | CDC25A 133 | CDC25B 134 | CDC42 135 | CDC45 136 | CDCA4 137 | CDH3 138 | CDK19 139 | CDK2 140 | CDK4 141 | CDK5R1 142 | CDK6 143 | CDK7 144 | CDKN1A 145 | CDKN1B 146 | CDKN2A 147 | CEBPA 148 | CEBPD 149 | CEBPZ 150 | CEMIP2 151 | CENPE 
152 | CEP57 153 | CERK 154 | CETN3 155 | CFLAR 156 | CGRRF1 157 | CHAC1 158 | CHEK1 159 | CHEK2 160 | CHERP 161 | CHIC2 162 | CHMP4A 163 | CHMP6 164 | CHN1 165 | CIAO3 166 | CIAPIN1 167 | CIRBP 168 | CISD1 169 | CLIC4 170 | CLPX 171 | CLSTN1 172 | CLTB 173 | CLTC 174 | CNDP2 175 | CNOT4 176 | CNPY3 177 | COASY 178 | COG2 179 | COG4 180 | COG7 181 | COL1A1 182 | COL4A1 183 | COPB2 184 | COPS7A 185 | COQ8A 186 | CORO1A 187 | CPNE3 188 | CPSF4 189 | CRAMP1 190 | CREB1 191 | CREG1 192 | CRELD2 193 | CRK 194 | CRKL 195 | CRTAP 196 | CRYZ 197 | CSK 198 | CSNK1A1 199 | CSNK1E 200 | CSNK2A2 201 | CSRP1 202 | CTNNAL1 203 | CTNND1 204 | CTSD 205 | CTSL 206 | CTTN 207 | CXCL2 208 | CXCR4 209 | CYB561 210 | CYCS 211 | CYTH1 212 | DAG1 213 | DAXX 214 | DCK 215 | DCTD 216 | DCUN1D4 217 | DDB2 218 | DDIT4 219 | DDR1 220 | DDX10 221 | DDX42 222 | DECR1 223 | DENND2D 224 | DERA 225 | DFFA 226 | DFFB 227 | DHDDS 228 | DHRS7 229 | DHX29 230 | DIPK1A 231 | DLD 232 | DMAC2L 233 | DMTF1 234 | DNAJA3 235 | DNAJB1 236 | DNAJB2 237 | DNAJB6 238 | DNAJC15 239 | DNM1 240 | DNM1L 241 | DNMT1 242 | DNMT3A 243 | DNTTIP2 244 | DPH2 245 | DRAP1 246 | DSG2 247 | DUSP11 248 | DUSP14 249 | DUSP22 250 | DUSP3 251 | DUSP4 252 | DUSP6 253 | DYNLT3 254 | DYRK3 255 | E2F2 256 | EAPP 257 | EBNA1BP2 258 | EBP 259 | ECD 260 | ECH1 261 | EDEM1 262 | EDN1 263 | EED 264 | EFCAB14 265 | EGF 266 | EGFR 267 | EGR1 268 | EIF4EBP1 269 | EIF4G1 270 | EIF5 271 | ELAC2 272 | ELAVL1 273 | ELOVL6 274 | ELP1 275 | EML3 276 | ENOPH1 277 | ENOSF1 278 | EPB41L2 279 | EPHA3 280 | EPHB2 281 | EPN2 282 | EPRS1 283 | ERBB2 284 | ERBB3 285 | ERO1A 286 | ETFB 287 | ETS1 288 | ETV1 289 | EVL 290 | EXOSC4 291 | EXT1 292 | EZH2 293 | FAH 294 | FAIM 295 | FAM20B 296 | FAS 297 | FASTKD5 298 | FAT1 299 | FBXL12 300 | FBXO11 301 | FBXO21 302 | FBXO7 303 | FCHO1 304 | FDFT1 305 | FEZ2 306 | FGFR2 307 | FGFR4 308 | FHL2 309 | FIS1 310 | FKBP14 311 | FKBP4 312 | FOS 313 | FOSL1 314 | FOXJ3 315 | FOXO3 316 | FOXO4 317 | FPGS 318 | FRS2 319 
| FSD1 320 | FUT1 321 | FYN 322 | FZD1 323 | FZD7 324 | G3BP1 325 | GAA 326 | GABPB1 327 | GADD45A 328 | GADD45B 329 | GALE 330 | GAPDH 331 | GARRE1 332 | GATA2 333 | GATA3 334 | GDPD5 335 | GET1 336 | GFOD1 337 | GFPT1 338 | GFUS 339 | GHR 340 | GLI2 341 | GLOD4 342 | GLRX 343 | GMNN 344 | GNA11 345 | GNA15 346 | GNAI1 347 | GNAI2 348 | GNAS 349 | GNB5 350 | GNPDA1 351 | GOLT1B 352 | GPATCH8 353 | GPC1 354 | GPER1 355 | GRB10 356 | GRB7 357 | GRN 358 | GRWD1 359 | GSTM2 360 | GSTZ1 361 | GTF2A2 362 | GTF2E2 363 | GTPBP8 364 | H2AZ2 365 | H2BC12 366 | H2BC21 367 | HACD3 368 | HADH 369 | HAT1 370 | HDAC2 371 | HDAC6 372 | HDGFL3 373 | HEATR1 374 | HEBP1 375 | HERC6 376 | HERPUD1 377 | HES1 378 | HIF1A 379 | HK1 380 | HLA-DMA 381 | HLA-DRA 382 | HMG20B 383 | HMGA2 384 | HMGCR 385 | HMGCS1 386 | HMOX1 387 | HOMER2 388 | HOOK2 389 | HOXA10 390 | HOXA5 391 | HPRT1 392 | HS2ST1 393 | HSD17B10 394 | HSD17B11 395 | HSPA1A 396 | HSPA4 397 | HSPA8 398 | HSPB1 399 | HSPD1 400 | HTATSF1 401 | HTRA1 402 | HYOU1 403 | IARS2 404 | ICAM1 405 | ICAM3 406 | ICMT 407 | ID2 408 | IDE 409 | IER3 410 | IFNAR1 411 | IFRD2 412 | IGF1R 413 | IGF2BP2 414 | IGF2R 415 | IGFBP3 416 | IGHMBP2 417 | IKBKB 418 | IKBKE 419 | IKZF1 420 | IL13RA1 421 | IL1B 422 | IL4R 423 | ILK 424 | INPP1 425 | INPP4B 426 | INSIG1 427 | INTS3 428 | IPO13 429 | IQGAP1 430 | ISOC1 431 | ITFG1 432 | ITGAE 433 | ITGB1BP1 434 | ITGB5 435 | JADE2 436 | JMJD6 437 | JUN 438 | KAT6A 439 | KAT6B 440 | KCNK1 441 | KCTD5 442 | KDELR2 443 | KDM3A 444 | KDM5A 445 | KDM5B 446 | KEAP1 447 | KHDC4 448 | KIAA0753 449 | KIF14 450 | KIF20A 451 | KIF2C 452 | KIF5C 453 | KIFBP 454 | KIT 455 | KLHDC2 456 | KLHL21 457 | KLHL9 458 | KLK8 459 | KTN1 460 | LAGE3 461 | LAMA3 462 | LAP3 463 | LBR 464 | LGALS8 465 | LGMN 466 | LIG1 467 | LIPA 468 | LOXL1 469 | LPAR2 470 | LPGAT1 471 | LRP10 472 | LRPAP1 473 | LRRC41 474 | LSM5 475 | LSM6 476 | LSR 477 | LYN 478 | LYPLA1 479 | LYRM1 480 | MACF1 481 | MALT1 482 | MAMLD1 483 | MAN2B1 484 | MAP2K5 
485 | MAP3K4 486 | MAP4K4 487 | MAP7 488 | MAPK13 489 | MAPK1IP1L 490 | MAPK9 491 | MAPKAPK2 492 | MAPKAPK3 493 | MAPKAPK5 494 | MAST2 495 | MAT2A 496 | MBNL1 497 | MBNL2 498 | MBOAT7 499 | MBTPS1 500 | MCM3 501 | MCOLN1 502 | MCUR1 503 | ME2 504 | MEF2C 505 | MELK 506 | MEST 507 | METRN 508 | MFSD10 509 | MICALL1 510 | MIF 511 | MINDY1 512 | MKNK1 513 | MLEC 514 | MLLT11 515 | MMP1 516 | MMP2 517 | MNAT1 518 | MPC2 519 | MPZL1 520 | MRPL12 521 | MRPL19 522 | MRPS16 523 | MRPS2 524 | MSH6 525 | MSRA 526 | MTA1 527 | MTERF3 528 | MTF2 529 | MTFR1 530 | MTHFD2 531 | MUC1 532 | MVP 533 | MYBL2 534 | MYC 535 | MYCBP 536 | MYCBP2 537 | MYL9 538 | MYLK 539 | MYO10 540 | NCAPD2 541 | NCK1 542 | NCK2 543 | NCOA3 544 | NENF 545 | NET1 546 | NFATC3 547 | NFATC4 548 | NFE2L2 549 | NFIL3 550 | NFKB2 551 | NFKBIA 552 | NFKBIB 553 | NFKBIE 554 | NGRN 555 | NIPSNAP1 556 | NISCH 557 | NIT1 558 | NMT1 559 | NNT 560 | NOL3 561 | NOLC1 562 | NOS3 563 | NOSIP 564 | NOTCH1 565 | NPC1 566 | NPDC1 567 | NPEPL1 568 | NPRL2 569 | NR1H2 570 | NR2F6 571 | NR3C1 572 | NRAS 573 | NRIP1 574 | NSDHL 575 | NT5DC2 576 | NUCB2 577 | NUDCD3 578 | NUDT9 579 | NUP133 580 | NUP62 581 | NUP85 582 | NUP88 583 | NUP93 584 | NUSAP1 585 | NVL 586 | ORC1 587 | OXA1L 588 | OXCT1 589 | OXSR1 590 | P4HA2 591 | P4HTM 592 | PACSIN3 593 | PAF1 594 | PAFAH1B1 595 | PAFAH1B3 596 | PAICS 597 | PAK1 598 | PAK4 599 | PAK6 600 | PAN2 601 | PARP1 602 | PARP2 603 | PAX8 604 | PCBD1 605 | PCCB 606 | PCK2 607 | PCM1 608 | PCMT1 609 | PCNA 610 | PDGFA 611 | PDHX 612 | PDIA5 613 | PDLIM1 614 | PDS5A 615 | PECR 616 | PEX11A 617 | PFKL 618 | PGAM1 619 | PGM1 620 | PGRMC1 621 | PHGDH 622 | PHKA1 623 | PHKB 624 | PHKG2 625 | PIGB 626 | PIH1D1 627 | PIK3C2B 628 | PIK3C3 629 | PIK3CA 630 | PIK3R3 631 | PIK3R4 632 | PIN1 633 | PIP4K2B 634 | PKIG 635 | PLA2G15 636 | PLA2G4A 637 | PLCB3 638 | PLEKHJ1 639 | PLEKHM1 640 | PLK1 641 | PLOD3 642 | PLP2 643 | PLS1 644 | PLSCR1 645 | PLSCR3 646 | PMAIP1 647 | PMM2 648 | PNKP 649 | POLB 650 | 
POLD1 651 | POLD4 652 | POLE2 653 | POLG2 654 | POLR1C 655 | POLR2I 656 | POLR2K 657 | POP4 658 | PPARD 659 | PPARG 660 | PPIC 661 | PPIE 662 | PPOX 663 | PPP1R13B 664 | PPP2R3C 665 | PPP2R5A 666 | PPP2R5E 667 | PRAF2 668 | PRCP 669 | PRKACA 670 | PRKAG2 671 | PRKCD 672 | PRKCH 673 | PRKCQ 674 | PRKX 675 | PROS1 676 | PRPF4 677 | PRR15L 678 | PRR7 679 | PRSS23 680 | PRUNE1 681 | PSIP1 682 | PSMB10 683 | PSMB8 684 | PSMD10 685 | PSMD2 686 | PSMD4 687 | PSMD9 688 | PSME1 689 | PSME2 690 | PSMF1 691 | PSMG1 692 | PSRC1 693 | PTGS2 694 | PTK2 695 | PTK2B 696 | PTPN1 697 | PTPN12 698 | PTPN6 699 | PTPRC 700 | PTPRF 701 | PTPRK 702 | PUF60 703 | PWP1 704 | PXMP2 705 | PXN 706 | PYCR1 707 | PYGL 708 | RAB11FIP2 709 | RAB21 710 | RAB27A 711 | RAB31 712 | RAB4A 713 | RAC2 714 | RAD51C 715 | RAD9A 716 | RAE1 717 | RAI14 718 | RALA 719 | RALB 720 | RALGDS 721 | RAP1GAP 722 | RASA1 723 | RB1 724 | RBKS 725 | RBM15B 726 | RBM34 727 | RBM6 728 | REEP5 729 | RELB 730 | RFC2 731 | RFC5 732 | RFNG 733 | RFX5 734 | RGS2 735 | RHEB 736 | RHOA 737 | RHOV 738 | RNF167 739 | RNH1 740 | RNMT 741 | RNPS1 742 | RPA1 743 | RPA2 744 | RPA3 745 | RPIA 746 | RPL39L 747 | RPN1 748 | RPP38 749 | RPS5 750 | RPS6 751 | RPS6KA1 752 | RRAGA 753 | RRP12 754 | RRP1B 755 | RRP8 756 | RRS1 757 | RSU1 758 | RTN2 759 | RUVBL1 760 | RXYLT1 761 | S100A13 762 | S100A4 763 | SACM1L 764 | SATB1 765 | SCAND1 766 | SCARB1 767 | SCCPDH 768 | SCP2 769 | SCRN1 770 | SCYL3 771 | SDHB 772 | SENP6 773 | SERPINE1 774 | SESN1 775 | SFN 776 | SGCB 777 | SH3BP5 778 | SHB 779 | SHC1 780 | SIRT3 781 | SKIC2 782 | SKIC8 783 | SKP1 784 | SLC11A2 785 | SLC1A4 786 | SLC25A13 787 | SLC25A14 788 | SLC25A4 789 | SLC25A46 790 | SLC27A3 791 | SLC2A6 792 | SLC35A1 793 | SLC35A3 794 | SLC35B1 795 | SLC35F2 796 | SLC37A4 797 | SLC5A6 798 | SMAD3 799 | SMARCA4 800 | SMARCC1 801 | SMARCD2 802 | SMC1A 803 | SMC3 804 | SMC4 805 | SMNDC1 806 | SNAP25 807 | SNCA 808 | SNX11 809 | SNX13 810 | SNX6 811 | SNX7 812 | SOCS2 813 | SORBS3 814 | 
SOX2 815 | SOX4 816 | SPAG4 817 | SPAG7 818 | SPDEF 819 | SPEN 820 | SPP1 821 | SPR 822 | SPRED2 823 | SPTAN1 824 | SPTLC2 825 | SQOR 826 | SQSTM1 827 | SRC 828 | SSBP2 829 | ST3GAL5 830 | ST6GALNAC2 831 | ST7 832 | STAMBP 833 | STAP2 834 | STAT1 835 | STAT3 836 | STAT5B 837 | STIMATE 838 | STK10 839 | STK25 840 | STMN1 841 | STUB1 842 | STX1A 843 | STX4 844 | STXBP1 845 | STXBP2 846 | SUPV3L1 847 | SUV39H1 848 | SUZ12 849 | SYK 850 | SYNE2 851 | SYNGR3 852 | SYPL1 853 | TARBP1 854 | TATDN2 855 | TBC1D31 856 | TBC1D9B 857 | TBP 858 | TBPL1 859 | TBX2 860 | TBXA2R 861 | TCEA2 862 | TCEAL4 863 | TCERG1 864 | TCFL5 865 | TCTA 866 | TCTN1 867 | TENT4A 868 | TERF2IP 869 | TERT 870 | TES 871 | TESK1 872 | TEX10 873 | TFAP2A 874 | TFDP1 875 | TGFB3 876 | TGFBR2 877 | THAP11 878 | TIAM1 879 | TICAM1 880 | TIMELESS 881 | TIMM17B 882 | TIMM22 883 | TIMM9 884 | TIMP2 885 | TIPARP 886 | TJP1 887 | TLCD3A 888 | TLE1 889 | TLK2 890 | TLR4 891 | TM9SF2 892 | TM9SF3 893 | TMCO1 894 | TMED10 895 | TMEM109 896 | TMEM50A 897 | TMEM97 898 | TNFRSF21 899 | TNIP1 900 | TOMM34 901 | TOMM70 902 | TOP2A 903 | TOPBP1 904 | TOR1A 905 | TP53 906 | TP53BP1 907 | TP53BP2 908 | TPD52L2 909 | TPM1 910 | TRAK2 911 | TRAM2 912 | TRAP1 913 | TRAPPC3 914 | TRAPPC6A 915 | TRIB1 916 | TRIB3 917 | TRIM13 918 | TRIM2 919 | TSC22D3 920 | TSEN2 921 | TSKU 922 | TSPAN3 923 | TSPAN4 924 | TSPAN6 925 | TUBB6 926 | TWF2 927 | TXLNA 928 | TXNDC9 929 | TXNL4B 930 | TXNRD1 931 | UBE2A 932 | UBE2C 933 | UBE2J1 934 | UBE2L6 935 | UBE3B 936 | UBE3C 937 | UBQLN2 938 | UBR7 939 | UFM1 940 | UGDH 941 | USP1 942 | USP14 943 | USP22 944 | USP6NL 945 | USP7 946 | UTP14A 947 | VAPB 948 | VAT1 949 | VAV3 950 | VDAC1 951 | VGLL4 952 | VPS28 953 | VPS72 954 | WASF3 955 | WASHC4 956 | WASHC5 957 | WDR7 958 | WDTC1 959 | WFS1 960 | WIPF2 961 | XBP1 962 | XPNPEP1 963 | XPO7 964 | YKT6 965 | YME1L1 966 | YTHDF1 967 | ZDHHC6 968 | ZFP36 969 | ZMIZ1 970 | ZMYM2 971 | ZNF131 972 | ZNF274 973 | ZNF318 974 | ZNF395 975 | ZNF451 976 | 
ZNF586 977 | ZNF589 978 | ZW10 979 | -------------------------------------------------------------------------------- /GO_terms_search/source/intersection_geneSymbols_785.txt: -------------------------------------------------------------------------------- 1 | AARS1 2 | ABCB6 3 | ABCC5 4 | ABCF1 5 | ABCF3 6 | ABHD4 7 | ABHD6 8 | ABL1 9 | ACAA1 10 | ACAT2 11 | ACBD3 12 | ACD 13 | ACLY 14 | ACOT9 15 | ADAM10 16 | ADAT1 17 | ADGRE2 18 | ADGRG1 19 | ADH5 20 | ADI1 21 | ADO 22 | AGER 23 | AGL 24 | AKAP8 25 | AKAP8L 26 | AKR7A2 27 | AKT1 28 | ALAS1 29 | ALDH7A1 30 | ALDOC 31 | AMDHD2 32 | ANKRD10 33 | ANO10 34 | ANXA7 35 | APBB2 36 | APPBP2 37 | ARFIP2 38 | ARHGAP1 39 | ARHGEF2 40 | ARID4B 41 | ARID5B 42 | ARL4C 43 | ARNT2 44 | ARPP19 45 | ASCC3 46 | ATF5 47 | ATG3 48 | ATMIN 49 | ATP11B 50 | ATP1B1 51 | ATP2C1 52 | ATP6V0B 53 | ATP6V1D 54 | B3GNT2 55 | BACE2 56 | BAD 57 | BAG3 58 | BAMBI 59 | BCL2 60 | BCL7B 61 | BDH1 62 | BECN1 63 | BHLHE40 64 | BID 65 | BIRC5 66 | BLCAP 67 | BLMH 68 | BLTP2 69 | BLVRA 70 | BNIP3 71 | BPHL 72 | BUB1B 73 | BZW2 74 | C2CD2 75 | C2CD2L 76 | C2CD5 77 | CAB39 78 | CALU 79 | CAMSAP2 80 | CANT1 81 | CAPN1 82 | CARMIL1 83 | CASC3 84 | CASK 85 | CASP3 86 | CAST 87 | CAT 88 | CBR1 89 | CBR3 90 | CCDC85B 91 | CCDC86 92 | CCDC92 93 | CCNA2 94 | CCNB2 95 | CCNF 96 | CCP110 97 | CD320 98 | CD44 99 | CD58 100 | CDC45 101 | CDCA4 102 | CDH3 103 | CDK19 104 | CDK2 105 | CDKN1A 106 | CEBPD 107 | CEBPZ 108 | CEMIP2 109 | CENPE 110 | CEP57 111 | CERK 112 | CETN3 113 | CFLAR 114 | CGRRF1 115 | CHAC1 116 | CHEK2 117 | CHERP 118 | CHIC2 119 | CHMP4A 120 | CHMP6 121 | CHN1 122 | CIAO3 123 | CIAPIN1 124 | CIRBP 125 | CISD1 126 | CLIC4 127 | CLPX 128 | CLSTN1 129 | CLTB 130 | CNDP2 131 | CNOT4 132 | CNPY3 133 | COASY 134 | COG2 135 | COG4 136 | COG7 137 | COPB2 138 | COPS7A 139 | COQ8A 140 | CORO1A 141 | CPNE3 142 | CPSF4 143 | CRAMP1 144 | CREB1 145 | CREG1 146 | CRELD2 147 | CRKL 148 | CRTAP 149 | CRYZ 150 | CSK 151 | CSNK2A2 152 | CSRP1 153 | CTNNAL1 154 | 
CTNND1 155 | CTSD 156 | CTSL 157 | CTTN 158 | CXCR4 159 | CYB561 160 | CYTH1 161 | DCK 162 | DCTD 163 | DCUN1D4 164 | DDB2 165 | DDIT4 166 | DDR1 167 | DDX10 168 | DDX42 169 | DECR1 170 | DENND2D 171 | DERA 172 | DHDDS 173 | DHRS7 174 | DHX29 175 | DIPK1A 176 | DLD 177 | DMAC2L 178 | DMTF1 179 | DNAJA3 180 | DNAJB1 181 | DNAJB2 182 | DNAJB6 183 | DNAJC15 184 | DNM1 185 | DNM1L 186 | DNMT1 187 | DNTTIP2 188 | DPH2 189 | DRAP1 190 | DSG2 191 | DUSP11 192 | DUSP14 193 | DUSP22 194 | DYNLT3 195 | DYRK3 196 | EAPP 197 | EBNA1BP2 198 | EBP 199 | ECD 200 | ECH1 201 | EDEM1 202 | EFCAB14 203 | EGFR 204 | EIF5 205 | ELAC2 206 | ELAVL1 207 | ELOVL6 208 | ELP1 209 | EML3 210 | ENOPH1 211 | ENOSF1 212 | EPB41L2 213 | EPN2 214 | EPRS1 215 | ERBB2 216 | ETFB 217 | EVL 218 | EXOSC4 219 | EXT1 220 | EZH2 221 | FAH 222 | FAIM 223 | FAM20B 224 | FAS 225 | FASTKD5 226 | FAT1 227 | FBXL12 228 | FBXO21 229 | FBXO7 230 | FCHO1 231 | FDFT1 232 | FEZ2 233 | FHL2 234 | FIS1 235 | FKBP14 236 | FKBP4 237 | FOS 238 | FOXJ3 239 | FOXO4 240 | FPGS 241 | FSD1 242 | FUT1 243 | G3BP1 244 | GAA 245 | GABPB1 246 | GADD45A 247 | GADD45B 248 | GALE 249 | GARRE1 250 | GATA2 251 | GATA3 252 | GDPD5 253 | GET1 254 | GFOD1 255 | GFPT1 256 | GFUS 257 | GLOD4 258 | GLRX 259 | GMNN 260 | GNA11 261 | GNAI2 262 | GNAS 263 | GNB5 264 | GNPDA1 265 | GOLT1B 266 | GPATCH8 267 | GPC1 268 | GPER1 269 | GRB10 270 | GRN 271 | GRWD1 272 | GSTM2 273 | GSTZ1 274 | GTF2A2 275 | GTF2E2 276 | GTPBP8 277 | H2AZ2 278 | H2BC12 279 | H2BC21 280 | HACD3 281 | HADH 282 | HAT1 283 | HDAC2 284 | HDAC6 285 | HDGFL3 286 | HEATR1 287 | HEBP1 288 | HERC6 289 | HERPUD1 290 | HES1 291 | HK1 292 | HMG20B 293 | HMGCR 294 | HMGCS1 295 | HOMER2 296 | HOOK2 297 | HOXA10 298 | HOXA5 299 | HPRT1 300 | HS2ST1 301 | HSD17B10 302 | HSD17B11 303 | HSPA4 304 | HTATSF1 305 | HTRA1 306 | HYOU1 307 | IARS2 308 | ICAM3 309 | ICMT 310 | ID2 311 | IDE 312 | IER3 313 | IFRD2 314 | IGF1R 315 | IGF2R 316 | IGHMBP2 317 | IKBKB 318 | IKBKE 319 | IL13RA1 320 | 
IL4R 321 | ILK 322 | INPP1 323 | INPP4B 324 | INSIG1 325 | INTS3 326 | IPO13 327 | IQGAP1 328 | ISOC1 329 | ITFG1 330 | ITGAE 331 | ITGB1BP1 332 | ITGB5 333 | JADE2 334 | JMJD6 335 | JUN 336 | KAT6A 337 | KAT6B 338 | KCNK1 339 | KCTD5 340 | KDELR2 341 | KDM3A 342 | KDM5A 343 | KDM5B 344 | KEAP1 345 | KHDC4 346 | KIAA0753 347 | KIF14 348 | KIF20A 349 | KIF2C 350 | KIF5C 351 | KIFBP 352 | KIT 353 | KLHDC2 354 | KLHL21 355 | KLHL9 356 | KLK8 357 | KTN1 358 | LAGE3 359 | LAMA3 360 | LAP3 361 | LBR 362 | LGALS8 363 | LGMN 364 | LIG1 365 | LIPA 366 | LOXL1 367 | LPAR2 368 | LPGAT1 369 | LRP10 370 | LRPAP1 371 | LRRC41 372 | LSM5 373 | LSM6 374 | LSR 375 | LYN 376 | LYPLA1 377 | LYRM1 378 | MACF1 379 | MALT1 380 | MAMLD1 381 | MAN2B1 382 | MAP2K5 383 | MAP3K4 384 | MAP4K4 385 | MAP7 386 | MAPK13 387 | MAPK1IP1L 388 | MAPK9 389 | MAPKAPK3 390 | MAPKAPK5 391 | MAST2 392 | MBNL1 393 | MBNL2 394 | MBOAT7 395 | MBTPS1 396 | MCM3 397 | MCOLN1 398 | MCUR1 399 | ME2 400 | MELK 401 | MEST 402 | METRN 403 | MFSD10 404 | MICALL1 405 | MINDY1 406 | MLEC 407 | MLLT11 408 | MPC2 409 | MPZL1 410 | MRPL12 411 | MRPL19 412 | MRPS16 413 | MRPS2 414 | MSH6 415 | MSRA 416 | MTA1 417 | MTERF3 418 | MTF2 419 | MTFR1 420 | MTHFD2 421 | MVP 422 | MYBL2 423 | MYC 424 | MYCBP 425 | MYCBP2 426 | MYO10 427 | NCAPD2 428 | NCOA3 429 | NENF 430 | NET1 431 | NFE2L2 432 | NFIL3 433 | NGRN 434 | NIPSNAP1 435 | NISCH 436 | NIT1 437 | NMT1 438 | NNT 439 | NOL3 440 | NOLC1 441 | NOSIP 442 | NPC1 443 | NPDC1 444 | NPEPL1 445 | NPRL2 446 | NR1H2 447 | NR2F6 448 | NR3C1 449 | NRIP1 450 | NSDHL 451 | NT5DC2 452 | NUCB2 453 | NUDCD3 454 | NUDT9 455 | NUP133 456 | NUP62 457 | NUP85 458 | NUP88 459 | NUP93 460 | NUSAP1 461 | NVL 462 | ORC1 463 | OXA1L 464 | OXCT1 465 | OXSR1 466 | P4HA2 467 | P4HTM 468 | PACSIN3 469 | PAF1 470 | PAFAH1B1 471 | PAFAH1B3 472 | PAICS 473 | PAK4 474 | PAN2 475 | PARP2 476 | PAX8 477 | PCBD1 478 | PCCB 479 | PCK2 480 | PCM1 481 | PCMT1 482 | PDHX 483 | PDIA5 484 | PDLIM1 485 | PDS5A 486 
| PECR 487 | PEX11A 488 | PGM1 489 | PGRMC1 490 | PHGDH 491 | PHKA1 492 | PHKB 493 | PHKG2 494 | PIGB 495 | PIH1D1 496 | PIK3C2B 497 | PIN1 498 | PIP4K2B 499 | PKIG 500 | PLA2G15 501 | PLEKHJ1 502 | PLEKHM1 503 | PLOD3 504 | PLP2 505 | PLS1 506 | PLSCR1 507 | PLSCR3 508 | PMAIP1 509 | PMM2 510 | PNKP 511 | POLB 512 | POLD1 513 | POLD4 514 | POLE2 515 | POLG2 516 | POLR1C 517 | POLR2I 518 | POLR2K 519 | POP4 520 | PPARG 521 | PPIC 522 | PPIE 523 | PPOX 524 | PPP2R3C 525 | PPP2R5A 526 | PPP2R5E 527 | PRAF2 528 | PRCP 529 | PRKACA 530 | PRKCD 531 | PRPF4 532 | PRR15L 533 | PRR7 534 | PRSS23 535 | PRUNE1 536 | PSIP1 537 | PSMD10 538 | PSMG1 539 | PSRC1 540 | PTK2 541 | PTPN1 542 | PTPN12 543 | PTPRF 544 | PTPRK 545 | PUF60 546 | PWP1 547 | PXMP2 548 | PXN 549 | PYCR1 550 | PYGL 551 | RAB11FIP2 552 | RAB21 553 | RAB27A 554 | RAB31 555 | RAB4A 556 | RAD51C 557 | RAD9A 558 | RAE1 559 | RAI14 560 | RAP1GAP 561 | RBKS 562 | RBM15B 563 | RBM34 564 | RBM6 565 | REEP5 566 | RELB 567 | RFC2 568 | RFC5 569 | RFNG 570 | RFX5 571 | RGS2 572 | RNF167 573 | RNH1 574 | RNMT 575 | RNPS1 576 | RPA1 577 | RPA2 578 | RPA3 579 | RPIA 580 | RPL39L 581 | RPN1 582 | RPP38 583 | RPS6KA1 584 | RRAGA 585 | RRP12 586 | RRP1B 587 | RRP8 588 | RRS1 589 | RSU1 590 | RTN2 591 | RUVBL1 592 | RXYLT1 593 | S100A13 594 | S100A4 595 | SACM1L 596 | SCAND1 597 | SCARB1 598 | SCCPDH 599 | SCP2 600 | SCRN1 601 | SCYL3 602 | SDHB 603 | SENP6 604 | SESN1 605 | SFN 606 | SGCB 607 | SH3BP5 608 | SHB 609 | SKIC2 610 | SKIC8 611 | SLC11A2 612 | SLC1A4 613 | SLC25A13 614 | SLC25A14 615 | SLC25A4 616 | SLC25A46 617 | SLC27A3 618 | SLC2A6 619 | SLC35A1 620 | SLC35A3 621 | SLC35B1 622 | SLC35F2 623 | SLC37A4 624 | SLC5A6 625 | SMAD3 626 | SMARCA4 627 | SMARCC1 628 | SMARCD2 629 | SMC1A 630 | SMC3 631 | SMC4 632 | SMNDC1 633 | SNX11 634 | SNX13 635 | SNX6 636 | SNX7 637 | SOCS2 638 | SORBS3 639 | SOX4 640 | SPAG4 641 | SPAG7 642 | SPDEF 643 | SPEN 644 | SPR 645 | SPRED2 646 | SPTLC2 647 | SQOR 648 | SSBP2 649 | ST3GAL5 
650 | ST6GALNAC2 651 | ST7 652 | STAMBP 653 | STAP2 654 | STAT1 655 | STIMATE 656 | STK10 657 | STK25 658 | STMN1 659 | STUB1 660 | STX1A 661 | STX4 662 | STXBP1 663 | STXBP2 664 | SUPV3L1 665 | SYNE2 666 | SYNGR3 667 | SYPL1 668 | TARBP1 669 | TATDN2 670 | TBC1D31 671 | TBC1D9B 672 | TBPL1 673 | TBX2 674 | TBXA2R 675 | TCEA2 676 | TCEAL4 677 | TCERG1 678 | TCFL5 679 | TCTA 680 | TCTN1 681 | TENT4A 682 | TERF2IP 683 | TES 684 | TESK1 685 | TEX10 686 | TFAP2A 687 | THAP11 688 | TIAM1 689 | TIMELESS 690 | TIMM17B 691 | TIMM22 692 | TIMM9 693 | TIMP2 694 | TIPARP 695 | TJP1 696 | TLCD3A 697 | TLE1 698 | TLK2 699 | TM9SF2 700 | TM9SF3 701 | TMCO1 702 | TMED10 703 | TMEM109 704 | TMEM50A 705 | TMEM97 706 | TNFRSF21 707 | TNIP1 708 | TOMM34 709 | TOMM70 710 | TOP2A 711 | TOPBP1 712 | TOR1A 713 | TP53BP1 714 | TP53BP2 715 | TPD52L2 716 | TPM1 717 | TRAK2 718 | TRAM2 719 | TRAP1 720 | TRAPPC3 721 | TRAPPC6A 722 | TRIB1 723 | TRIB3 724 | TRIM13 725 | TRIM2 726 | TSC22D3 727 | TSEN2 728 | TSKU 729 | TSPAN3 730 | TSPAN4 731 | TSPAN6 732 | TUBB6 733 | TWF2 734 | TXLNA 735 | TXNDC9 736 | TXNL4B 737 | TXNRD1 738 | UBE2A 739 | UBE2C 740 | UBE2J1 741 | UBE2L6 742 | UBE3B 743 | UBE3C 744 | UBQLN2 745 | UBR7 746 | UFM1 747 | UGDH 748 | USP1 749 | USP14 750 | USP22 751 | USP6NL 752 | USP7 753 | UTP14A 754 | VAPB 755 | VAT1 756 | VAV3 757 | VDAC1 758 | VGLL4 759 | VPS28 760 | VPS72 761 | WASF3 762 | WASHC4 763 | WASHC5 764 | WDR7 765 | WDTC1 766 | WFS1 767 | WIPF2 768 | XBP1 769 | XPNPEP1 770 | XPO7 771 | YKT6 772 | YME1L1 773 | YTHDF1 774 | ZDHHC6 775 | ZFP36 776 | ZMIZ1 777 | ZMYM2 778 | ZNF131 779 | ZNF274 780 | ZNF318 781 | ZNF395 782 | ZNF451 783 | ZNF586 784 | ZNF589 785 | ZW10 786 | -------------------------------------------------------------------------------- /GO_terms_search/source/top_100_luad.txt: -------------------------------------------------------------------------------- 1 | TPM1 2 | CDKN1A 3 | SERPINE1 4 | COL4A1 5 | RPIA 6 | BIRC5 7 | EDN1 8 | GADD45A 9 | CCNA2 10 
| POLD1 11 | FHL2 12 | SLC2A6 13 | CTSL 14 | AURKA 15 | ATF1 16 | YKT6 17 | JUN 18 | DNAJB2 19 | ABHD4 20 | MTHFD2 21 | AURKB 22 | MMP1 23 | TOP2A 24 | UBE2C 25 | PAFAH1B3 26 | MRPL12 27 | HDAC2 28 | CTSD 29 | TSEN2 30 | SCARB1 31 | LBR 32 | POLE2 33 | PAICS 34 | PRSS23 35 | RGS2 36 | IER3 37 | HSPB1 38 | PTPN12 39 | CHEK2 40 | ARHGAP1 41 | ADGRG1 42 | MCM3 43 | POP4 44 | PXN 45 | HMOX1 46 | USP1 47 | RUVBL1 48 | DDX10 49 | DUSP6 50 | CCL2 51 | NUP88 52 | CDC25A 53 | TXNRD1 54 | HMGA2 55 | MYL9 56 | DUSP4 57 | CAT 58 | MVP 59 | SQSTM1 60 | TIMELESS 61 | DCK 62 | GPC1 63 | NIPSNAP1 64 | COL1A1 65 | C5 66 | NET1 67 | MPC2 68 | TIMP2 69 | TMEM97 70 | RAE1 71 | RPL39L 72 | EFCAB14 73 | MAN2B1 74 | RAI14 75 | ILK 76 | ABCB6 77 | TIPARP 78 | RNPS1 79 | PPIC 80 | CEBPD 81 | CCND3 82 | EZH2 83 | SOX4 84 | MYBL2 85 | SLC35A1 86 | TMEM109 87 | RSU1 88 | DAG1 89 | GRB10 90 | INPP1 91 | STAT1 92 | RRP12 93 | CREG1 94 | TES 95 | PDGFA 96 | SMC4 97 | ERBB2 98 | EIF4EBP1 99 | DPH2 100 | UBE2L6 101 | -------------------------------------------------------------------------------- /GO_terms_search/source/top_59_atleast_topIn3.txt: -------------------------------------------------------------------------------- 1 | ATP1B1 2 | BCL7B 3 | BIRC5 4 | BUB1B 5 | CCNA2 6 | CDK4 7 | CISD1 8 | CLIC4 9 | COASY 10 | CPNE3 11 | DAG1 12 | DCK 13 | EBP 14 | EPRS1 15 | ERBB2 16 | FHL2 17 | GLRX 18 | GNPDA1 19 | HMOX1 20 | IER3 21 | IGF2R 22 | LBR 23 | LIG1 24 | MCM3 25 | MPZL1 26 | MRPL19 27 | MTHFD2 28 | MYC 29 | NFKBIB 30 | NIPSNAP1 31 | NPC1 32 | PAFAH1B3 33 | PAICS 34 | PAK4 35 | PSME1 36 | PSRC1 37 | RELB 38 | RPA1 39 | SACM1L 40 | SCARB1 41 | SERPINE1 42 | SESN1 43 | SLC25A4 44 | SMAD3 45 | SMC4 46 | SPP1 47 | STMN1 48 | STX1A 49 | TERF2IP 50 | TMEM50A 51 | TOP2A 52 | TPM1 53 | TRIB1 54 | TSC22D3 55 | TSKU 56 | TXNRD1 57 | UBE2C 58 | XBP1 59 | YKT6 60 | -------------------------------------------------------------------------------- /GO_terms_search/source/union_geneSymbols_1170.txt: 
-------------------------------------------------------------------------------- 1 | AARS1 2 | ABCB6 3 | ABCC5 4 | ABCF1 5 | ABCF3 6 | ABHD4 7 | ABHD6 8 | ABL1 9 | ACAA1 10 | ACAT2 11 | ACBD3 12 | ACD 13 | ACLY 14 | ACOT9 15 | ADAM10 16 | ADAT1 17 | ADGRE2 18 | ADGRG1 19 | ADH5 20 | ADI1 21 | ADO 22 | ADRB2 23 | AGER 24 | AGL 25 | AKAP8 26 | AKAP8L 27 | AKR7A2 28 | AKT1 29 | ALAS1 30 | ALDH7A1 31 | ALDOA 32 | ALDOC 33 | AMDHD2 34 | ANKRD10 35 | ANO10 36 | ANXA7 37 | APBB2 38 | APOE 39 | APP 40 | APPBP2 41 | ARFIP2 42 | ARHGAP1 43 | ARHGEF12 44 | ARHGEF2 45 | ARID4B 46 | ARID5B 47 | ARL4C 48 | ARNT2 49 | ARPP19 50 | ASAH1 51 | ASCC3 52 | ATF1 53 | ATF5 54 | ATF6 55 | ATG3 56 | ATMIN 57 | ATP11B 58 | ATP1B1 59 | ATP2C1 60 | ATP6V0B 61 | ATP6V1D 62 | AURKA 63 | AURKB 64 | AXIN1 65 | B3GNT2 66 | BACE2 67 | BAD 68 | BAG3 69 | BAMBI 70 | BAX 71 | BCL2 72 | BCL7B 73 | BDH1 74 | BECN1 75 | BHLHE40 76 | BID 77 | BIRC2 78 | BIRC5 79 | BLCAP 80 | BLMH 81 | BLTP2 82 | BLVRA 83 | BMP4 84 | BNIP3 85 | BNIP3L 86 | BPHL 87 | BRCA1 88 | BTK 89 | BUB1B 90 | BZW2 91 | C2CD2 92 | C2CD2L 93 | C2CD5 94 | C5 95 | CAB39 96 | CALM1 97 | CALU 98 | CAMSAP2 99 | CANT1 100 | CAPN1 101 | CARMIL1 102 | CASC3 103 | CASK 104 | CASP10 105 | CASP2 106 | CASP3 107 | CASP7 108 | CAST 109 | CAT 110 | CBLB 111 | CBR1 112 | CBR3 113 | CCDC85B 114 | CCDC86 115 | CCDC92 116 | CCL2 117 | CCNA1 118 | CCNA2 119 | CCNB1 120 | CCNB2 121 | CCND1 122 | CCND3 123 | CCNE2 124 | CCNF 125 | CCNH 126 | CCP110 127 | CD320 128 | CD40 129 | CD44 130 | CD58 131 | CDC20 132 | CDC25A 133 | CDC25B 134 | CDC42 135 | CDC45 136 | CDCA4 137 | CDH3 138 | CDK19 139 | CDK2 140 | CDK4 141 | CDK5R1 142 | CDK6 143 | CDK7 144 | CDKN1A 145 | CDKN1B 146 | CDKN2A 147 | CEBPA 148 | CEBPD 149 | CEBPZ 150 | CEMIP2 151 | CENPE 152 | CEP57 153 | CERK 154 | CETN3 155 | CFLAR 156 | CGRRF1 157 | CHAC1 158 | CHEK1 159 | CHEK2 160 | CHERP 161 | CHIC2 162 | CHMP4A 163 | CHMP6 164 | CHN1 165 | CIAO3 166 | CIAPIN1 167 | CIRBP 168 | CISD1 169 | CLIC4 
170 | CLPX 171 | CLSTN1 172 | CLTB 173 | CLTC 174 | CNDP2 175 | CNOT4 176 | CNPY3 177 | COASY 178 | COG2 179 | COG4 180 | COG7 181 | COL1A1 182 | COL4A1 183 | COPB2 184 | COPS7A 185 | COQ8A 186 | CORO1A 187 | CPNE3 188 | CPSF4 189 | CRAMP1 190 | CREB1 191 | CREG1 192 | CRELD2 193 | CRK 194 | CRKL 195 | CRTAP 196 | CRYZ 197 | CSK 198 | CSNK1A1 199 | CSNK1E 200 | CSNK2A2 201 | CSRP1 202 | CTNNAL1 203 | CTNND1 204 | CTSD 205 | CTSL 206 | CTTN 207 | CXCL2 208 | CXCR4 209 | CYB561 210 | CYCS 211 | CYTH1 212 | DAG1 213 | DAXX 214 | DCK 215 | DCTD 216 | DCUN1D4 217 | DDB2 218 | DDIT4 219 | DDR1 220 | DDX10 221 | DDX42 222 | DECR1 223 | DENND2D 224 | DERA 225 | DFFA 226 | DFFB 227 | DHDDS 228 | DHRS7 229 | DHX29 230 | DIPK1A 231 | DLD 232 | DMAC2L 233 | DMTF1 234 | DNAJA3 235 | DNAJB1 236 | DNAJB2 237 | DNAJB6 238 | DNAJC15 239 | DNM1 240 | DNM1L 241 | DNMT1 242 | DNMT3A 243 | DNTTIP2 244 | DPH2 245 | DRAP1 246 | DSG2 247 | DUSP11 248 | DUSP14 249 | DUSP22 250 | DUSP3 251 | DUSP4 252 | DUSP6 253 | DYNLT3 254 | DYRK3 255 | E2F2 256 | EAPP 257 | EBNA1BP2 258 | EBP 259 | ECD 260 | ECH1 261 | EDEM1 262 | EDN1 263 | EED 264 | EFCAB14 265 | EGF 266 | EGFR 267 | EGR1 268 | EIF4EBP1 269 | EIF4G1 270 | EIF5 271 | ELAC2 272 | ELAVL1 273 | ELOVL6 274 | ELP1 275 | EML3 276 | ENOPH1 277 | ENOSF1 278 | EPB41L2 279 | EPHA3 280 | EPHB2 281 | EPN2 282 | EPRS1 283 | ERBB2 284 | ERBB3 285 | ERO1A 286 | ETFB 287 | ETS1 288 | ETV1 289 | EVL 290 | EXOSC4 291 | EXT1 292 | EZH2 293 | FAH 294 | FAIM 295 | FAM20B 296 | FAS 297 | FASTKD5 298 | FAT1 299 | FBXL12 300 | FBXO11 301 | FBXO21 302 | FBXO7 303 | FCHO1 304 | FDFT1 305 | FEZ2 306 | FGFR2 307 | FGFR4 308 | FHL2 309 | FIS1 310 | FKBP14 311 | FKBP4 312 | FOS 313 | FOSL1 314 | FOXJ3 315 | FOXO3 316 | FOXO4 317 | FPGS 318 | FRS2 319 | FSD1 320 | FUT1 321 | FYN 322 | FZD1 323 | FZD7 324 | G3BP1 325 | GAA 326 | GABPB1 327 | GADD45A 328 | GADD45B 329 | GALE 330 | GAPDH 331 | GARRE1 332 | GATA2 333 | GATA3 334 | GDPD5 335 | GET1 336 | GFOD1 337 | 
GFPT1 338 | GFUS 339 | GHR 340 | GLI2 341 | GLOD4 342 | GLRX 343 | GMNN 344 | GNA11 345 | GNA15 346 | GNAI1 347 | GNAI2 348 | GNAS 349 | GNB5 350 | GNPDA1 351 | GOLT1B 352 | GPATCH8 353 | GPC1 354 | GPER1 355 | GRB10 356 | GRB7 357 | GRN 358 | GRWD1 359 | GSTM2 360 | GSTZ1 361 | GTF2A2 362 | GTF2E2 363 | GTPBP8 364 | H2AZ2 365 | H2BC12 366 | H2BC21 367 | HACD3 368 | HADH 369 | HAT1 370 | HDAC2 371 | HDAC6 372 | HDGFL3 373 | HEATR1 374 | HEBP1 375 | HERC6 376 | HERPUD1 377 | HES1 378 | HIF1A 379 | HK1 380 | HLA-DMA 381 | HLA-DRA 382 | HMG20B 383 | HMGA2 384 | HMGCR 385 | HMGCS1 386 | HMOX1 387 | HOMER2 388 | HOOK2 389 | HOXA10 390 | HOXA5 391 | HPRT1 392 | HS2ST1 393 | HSD17B10 394 | HSD17B11 395 | HSPA1A 396 | HSPA4 397 | HSPA8 398 | HSPB1 399 | HSPD1 400 | HTATSF1 401 | HTRA1 402 | HYOU1 403 | IARS2 404 | ICAM1 405 | ICAM3 406 | ICMT 407 | ID2 408 | IDE 409 | IER3 410 | IFNAR1 411 | IFRD2 412 | IGF1R 413 | IGF2BP2 414 | IGF2R 415 | IGFBP3 416 | IGHMBP2 417 | IKBKB 418 | IKBKE 419 | IKZF1 420 | IL13RA1 421 | IL1B 422 | IL4R 423 | ILK 424 | INPP1 425 | INPP4B 426 | INSIG1 427 | INTS3 428 | IPO13 429 | IQGAP1 430 | ISOC1 431 | ITFG1 432 | ITGAE 433 | ITGB1BP1 434 | ITGB5 435 | JADE2 436 | JMJD6 437 | JUN 438 | KAT6A 439 | KAT6B 440 | KCNK1 441 | KCTD5 442 | KDELR2 443 | KDM3A 444 | KDM5A 445 | KDM5B 446 | KEAP1 447 | KHDC4 448 | KIAA0753 449 | KIF14 450 | KIF20A 451 | KIF2C 452 | KIF5C 453 | KIFBP 454 | KIT 455 | KLHDC2 456 | KLHL21 457 | KLHL9 458 | KLK8 459 | KTN1 460 | LAGE3 461 | LAMA3 462 | LAP3 463 | LBR 464 | LGALS8 465 | LGMN 466 | LIG1 467 | LIPA 468 | LOXL1 469 | LPAR2 470 | LPGAT1 471 | LRP10 472 | LRPAP1 473 | LRRC41 474 | LSM5 475 | LSM6 476 | LSR 477 | LYN 478 | LYPLA1 479 | LYRM1 480 | MACF1 481 | MALT1 482 | MAMLD1 483 | MAN2B1 484 | MAP2K5 485 | MAP3K4 486 | MAP4K4 487 | MAP7 488 | MAPK13 489 | MAPK1IP1L 490 | MAPK9 491 | MAPKAPK2 492 | MAPKAPK3 493 | MAPKAPK5 494 | MAST2 495 | MAT2A 496 | MBNL1 497 | MBNL2 498 | MBOAT7 499 | MBTPS1 500 | MCM3 501 | 
MCOLN1 502 | MCUR1 503 | ME2 504 | MEF2C 505 | MELK 506 | MEST 507 | METRN 508 | MFSD10 509 | MICALL1 510 | MIF 511 | MINDY1 512 | MKNK1 513 | MLEC 514 | MLLT11 515 | MMP1 516 | MMP2 517 | MNAT1 518 | MPC2 519 | MPZL1 520 | MRPL12 521 | MRPL19 522 | MRPS16 523 | MRPS2 524 | MSH6 525 | MSRA 526 | MTA1 527 | MTERF3 528 | MTF2 529 | MTFR1 530 | MTHFD2 531 | MUC1 532 | MVP 533 | MYBL2 534 | MYC 535 | MYCBP 536 | MYCBP2 537 | MYL9 538 | MYLK 539 | MYO10 540 | NCAPD2 541 | NCK1 542 | NCK2 543 | NCOA3 544 | NENF 545 | NET1 546 | NFATC3 547 | NFATC4 548 | NFE2L2 549 | NFIL3 550 | NFKB2 551 | NFKBIA 552 | NFKBIB 553 | NFKBIE 554 | NGRN 555 | NIPSNAP1 556 | NISCH 557 | NIT1 558 | NMT1 559 | NNT 560 | NOL3 561 | NOLC1 562 | NOS3 563 | NOSIP 564 | NOTCH1 565 | NPC1 566 | NPDC1 567 | NPEPL1 568 | NPRL2 569 | NR1H2 570 | NR2F6 571 | NR3C1 572 | NRAS 573 | NRIP1 574 | NSDHL 575 | NT5DC2 576 | NUCB2 577 | NUDCD3 578 | NUDT9 579 | NUP133 580 | NUP62 581 | NUP85 582 | NUP88 583 | NUP93 584 | NUSAP1 585 | NVL 586 | ORC1 587 | OXA1L 588 | OXCT1 589 | OXSR1 590 | P4HA2 591 | P4HTM 592 | PACSIN3 593 | PAF1 594 | PAFAH1B1 595 | PAFAH1B3 596 | PAICS 597 | PAK1 598 | PAK4 599 | PAK6 600 | PAN2 601 | PARP1 602 | PARP2 603 | PAX8 604 | PCBD1 605 | PCCB 606 | PCK2 607 | PCM1 608 | PCMT1 609 | PCNA 610 | PDGFA 611 | PDHX 612 | PDIA5 613 | PDLIM1 614 | PDS5A 615 | PECR 616 | PEX11A 617 | PFKL 618 | PGAM1 619 | PGM1 620 | PGRMC1 621 | PHGDH 622 | PHKA1 623 | PHKB 624 | PHKG2 625 | PIGB 626 | PIH1D1 627 | PIK3C2B 628 | PIK3C3 629 | PIK3CA 630 | PIK3R3 631 | PIK3R4 632 | PIN1 633 | PIP4K2B 634 | PKIG 635 | PLA2G15 636 | PLA2G4A 637 | PLCB3 638 | PLEKHJ1 639 | PLEKHM1 640 | PLK1 641 | PLOD3 642 | PLP2 643 | PLS1 644 | PLSCR1 645 | PLSCR3 646 | PMAIP1 647 | PMM2 648 | PNKP 649 | POLB 650 | POLD1 651 | POLD4 652 | POLE2 653 | POLG2 654 | POLR1C 655 | POLR2I 656 | POLR2K 657 | POP4 658 | PPARD 659 | PPARG 660 | PPIC 661 | PPIE 662 | PPOX 663 | PPP1R13B 664 | PPP2R3C 665 | PPP2R5A 666 | PPP2R5E 667 | 
PRAF2 668 | PRCP 669 | PRKACA 670 | PRKAG2 671 | PRKCD 672 | PRKCH 673 | PRKCQ 674 | PRKX 675 | PROS1 676 | PRPF4 677 | PRR15L 678 | PRR7 679 | PRSS23 680 | PRUNE1 681 | PSIP1 682 | PSMB10 683 | PSMB8 684 | PSMD10 685 | PSMD2 686 | PSMD4 687 | PSMD9 688 | PSME1 689 | PSME2 690 | PSMF1 691 | PSMG1 692 | PSRC1 693 | PTGS2 694 | PTK2 695 | PTK2B 696 | PTPN1 697 | PTPN12 698 | PTPN6 699 | PTPRC 700 | PTPRF 701 | PTPRK 702 | PUF60 703 | PWP1 704 | PXMP2 705 | PXN 706 | PYCR1 707 | PYGL 708 | RAB11FIP2 709 | RAB21 710 | RAB27A 711 | RAB31 712 | RAB4A 713 | RAC2 714 | RAD51C 715 | RAD9A 716 | RAE1 717 | RAI14 718 | RALA 719 | RALB 720 | RALGDS 721 | RAP1GAP 722 | RASA1 723 | RB1 724 | RBKS 725 | RBM15B 726 | RBM34 727 | RBM6 728 | REEP5 729 | RELB 730 | RFC2 731 | RFC5 732 | RFNG 733 | RFX5 734 | RGS2 735 | RHEB 736 | RHOA 737 | RHOV 738 | RNF167 739 | RNH1 740 | RNMT 741 | RNPS1 742 | RPA1 743 | RPA2 744 | RPA3 745 | RPIA 746 | RPL39L 747 | RPN1 748 | RPP38 749 | RPS5 750 | RPS6 751 | RPS6KA1 752 | RRAGA 753 | RRP12 754 | RRP1B 755 | RRP8 756 | RRS1 757 | RSU1 758 | RTN2 759 | RUVBL1 760 | RXYLT1 761 | S100A13 762 | S100A4 763 | SACM1L 764 | SATB1 765 | SCAND1 766 | SCARB1 767 | SCCPDH 768 | SCP2 769 | SCRN1 770 | SCYL3 771 | SDHB 772 | SENP6 773 | SERPINE1 774 | SESN1 775 | SFN 776 | SGCB 777 | SH3BP5 778 | SHB 779 | SHC1 780 | SIRT3 781 | SKIC2 782 | SKIC8 783 | SKP1 784 | SLC11A2 785 | SLC1A4 786 | SLC25A13 787 | SLC25A14 788 | SLC25A4 789 | SLC25A46 790 | SLC27A3 791 | SLC2A6 792 | SLC35A1 793 | SLC35A3 794 | SLC35B1 795 | SLC35F2 796 | SLC37A4 797 | SLC5A6 798 | SMAD3 799 | SMARCA4 800 | SMARCC1 801 | SMARCD2 802 | SMC1A 803 | SMC3 804 | SMC4 805 | SMNDC1 806 | SNAP25 807 | SNCA 808 | SNX11 809 | SNX13 810 | SNX6 811 | SNX7 812 | SOCS2 813 | SORBS3 814 | SOX2 815 | SOX4 816 | SPAG4 817 | SPAG7 818 | SPDEF 819 | SPEN 820 | SPP1 821 | SPR 822 | SPRED2 823 | SPTAN1 824 | SPTLC2 825 | SQOR 826 | SQSTM1 827 | SRC 828 | SSBP2 829 | ST3GAL5 830 | ST6GALNAC2 831 | ST7 832 | 
STAMBP 833 | STAP2 834 | STAT1 835 | STAT3 836 | STAT5B 837 | STIMATE 838 | STK10 839 | STK25 840 | STMN1 841 | STUB1 842 | STX1A 843 | STX4 844 | STXBP1 845 | STXBP2 846 | SUPV3L1 847 | SUV39H1 848 | SUZ12 849 | SYK 850 | SYNE2 851 | SYNGR3 852 | SYPL1 853 | TARBP1 854 | TATDN2 855 | TBC1D31 856 | TBC1D9B 857 | TBP 858 | TBPL1 859 | TBX2 860 | TBXA2R 861 | TCEA2 862 | TCEAL4 863 | TCERG1 864 | TCFL5 865 | TCTA 866 | TCTN1 867 | TENT4A 868 | TERF2IP 869 | TERT 870 | TES 871 | TESK1 872 | TEX10 873 | TFAP2A 874 | TFDP1 875 | TGFB3 876 | TGFBR2 877 | THAP11 878 | TIAM1 879 | TICAM1 880 | TIMELESS 881 | TIMM17B 882 | TIMM22 883 | TIMM9 884 | TIMP2 885 | TIPARP 886 | TJP1 887 | TLCD3A 888 | TLE1 889 | TLK2 890 | TLR4 891 | TM9SF2 892 | TM9SF3 893 | TMCO1 894 | TMED10 895 | TMEM109 896 | TMEM50A 897 | TMEM97 898 | TNFRSF21 899 | TNIP1 900 | TOMM34 901 | TOMM70 902 | TOP2A 903 | TOPBP1 904 | TOR1A 905 | TP53 906 | TP53BP1 907 | TP53BP2 908 | TPD52L2 909 | TPM1 910 | TRAK2 911 | TRAM2 912 | TRAP1 913 | TRAPPC3 914 | TRAPPC6A 915 | TRIB1 916 | TRIB3 917 | TRIM13 918 | TRIM2 919 | TSC22D3 920 | TSEN2 921 | TSKU 922 | TSPAN3 923 | TSPAN4 924 | TSPAN6 925 | TUBB6 926 | TWF2 927 | TXLNA 928 | TXNDC9 929 | TXNL4B 930 | TXNRD1 931 | UBE2A 932 | UBE2C 933 | UBE2J1 934 | UBE2L6 935 | UBE3B 936 | UBE3C 937 | UBQLN2 938 | UBR7 939 | UFM1 940 | UGDH 941 | USP1 942 | USP14 943 | USP22 944 | USP6NL 945 | USP7 946 | UTP14A 947 | VAPB 948 | VAT1 949 | VAV3 950 | VDAC1 951 | VGLL4 952 | VPS28 953 | VPS72 954 | WASF3 955 | WASHC4 956 | WASHC5 957 | WDR7 958 | WDTC1 959 | WFS1 960 | WIPF2 961 | XBP1 962 | XPNPEP1 963 | XPO7 964 | YKT6 965 | YME1L1 966 | YTHDF1 967 | ZDHHC6 968 | ZFP36 969 | ZMIZ1 970 | ZMYM2 971 | ZNF131 972 | ZNF274 973 | ZNF318 974 | ZNF395 975 | ZNF451 976 | ZNF586 977 | ZNF589 978 | ZW10 979 | ABAT 980 | ACSL1 981 | ADCY9 982 | ADGRA3 983 | AFF1 984 | AKAP1 985 | ALMS1 986 | ANKRD49 987 | AREL1 988 | ARHGEF5 989 | ASAP2 990 | ASL 991 | ASMTL 992 | BAZ1B 993 | BAZ2B 994 
| BCKDHA 995 | BTBD3 996 | BTG2 997 | BTG3 998 | CA12 999 | CABIN1 1000 | CD14 1001 | CD19 1002 | CEP55 1003 | CHCHD7 1004 | CISH 1005 | CKB 1006 | CLASRP 1007 | CLCN3 1008 | CMPK1 1009 | COL4A5 1010 | COQ3 1011 | CREB3L2 1012 | CTCF 1013 | CTNS 1014 | CTSV 1015 | DALRD3 1016 | DDX49 1017 | DHPS 1018 | DHX8 1019 | DLGAP5 1020 | DNAJB12 1021 | DOK4 1022 | DSE 1023 | DTNA 1024 | ECHDC1 1025 | EFNB3 1026 | EIF1B 1027 | ENTPD6 1028 | EPB41L4B 1029 | EPHA2 1030 | EPHB4 1031 | ERCC1 1032 | ERCC5 1033 | ERCC6L 1034 | ERLIN1 1035 | ETFA 1036 | F12 1037 | FADD 1038 | FAM162A 1039 | FAM3C 1040 | FANCA 1041 | FANCL 1042 | FBRS 1043 | FDX1 1044 | FEZ1 1045 | FTSJ1 1046 | FUZ 1047 | FZD5 1048 | GGA2 1049 | GM2A 1050 | GMDS 1051 | GOLIM4 1052 | GPRC5C 1053 | GRHPR 1054 | GYS1 1055 | H2BC5 1056 | HACD1 1057 | HADHB 1058 | HBE1 1059 | HDAC4 1060 | HLF 1061 | HMGCL 1062 | IFIT5 1063 | IL1RAP 1064 | IMPA2 1065 | KDELR3 1066 | KHDC1 1067 | LAMTOR3 1068 | LASP1 1069 | LHPP 1070 | LIMK2 1071 | LRRC40 1072 | LYPD3 1073 | MAD2L1BP 1074 | MAN2A2 1075 | MARK4 1076 | MEGF9 1077 | MGST2 1078 | MPDZ 1079 | NAA50 1080 | NIPBL 1081 | NSFL1C 1082 | PANK2 1083 | PARL 1084 | PARN 1085 | PDS5B 1086 | PEPD 1087 | PER2 1088 | PEX19 1089 | PHF2 1090 | PJA1 1091 | PKD1 1092 | PLPP1 1093 | POLR3B 1094 | PPIF 1095 | PPP2R3A 1096 | PPP2R5B 1097 | PPP3CA 1098 | PPP4R1 1099 | PSMD5 1100 | PTEN 1101 | QRSL1 1102 | RAB1B 1103 | RAB3GAP1 1104 | RABGGTA 1105 | RAD23B 1106 | RAP1A 1107 | RARA 1108 | RHOBTB1 1109 | RPF1 1110 | RTCA 1111 | SAP18 1112 | SCAF8 1113 | SEC14L1 1114 | SEC24B 1115 | SEC24C 1116 | SEC61A2 1117 | SENP5 1118 | SERTAD3 1119 | SETD1B 1120 | SETDB1 1121 | SFMBT1 1122 | SGK1 1123 | SIK1 1124 | SLC16A6 1125 | SLC2A1 1126 | SLC36A1 1127 | SMG7 1128 | SMYD3 1129 | SNRPD1 1130 | SNRPF 1131 | SNX2 1132 | SRF 1133 | SRPRB 1134 | SRRM1 1135 | SRSF8 1136 | STAT6 1137 | SWAP70 1138 | TARS1 1139 | TBCB 1140 | TCIRG1 1141 | TGDS 1142 | TLE3 1143 | TMEM127 1144 | TMEM131L 1145 | TNFAIP1 1146 | TOMM22 1147 
| TP53TG1 1148 | TRIP10 1149 | TSN 1150 | TTF1 1151 | UBE4A 1152 | UGP2 1153 | USP20 1154 | UVRAG 1155 | VAMP7 1156 | VPS26A 1157 | WHRN 1158 | WRN 1159 | WWOX 1160 | YBX3 1161 | ZBTB14 1162 | ZBTB24 1163 | ZBTB5 1164 | ZC3H4 1165 | ZER1 1166 | ZGPAT 1167 | ZNF629 1168 | ZNF672 1169 | ZNF692 1170 | ZNF768 1171 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2021, carpenterlab 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # High-Dimensional Gene Expression and Morphology Profiles of Cells across 28,000 Genetic and Chemical Perturbations 2 | Populations of cells can be perturbed by various chemical and genetic treatments and the impact on the cells’ gene expression (transcription, i.e. mRNA levels) and morphology (in an image-based assay) can be measured in high dimensions. 3 | The patterns observed in this data can be used for more than a dozen applications in drug discovery and basic biology research. 4 | We provide a collection of four datasets where both gene expression and morphological data are available; roughly a thousand features are measured for each data type, across more than 28,000 chemical and genetic perturbations. 5 | We have defined a set of biological problems that can be investigated using these two data modalities and provided baseline analysis and evaluation metrics for addressing each. 6 | 7 | [Link to Paper](https://www.nature.com/articles/s41592-022-01667-0) 8 | 9 | 10 | # Data Modalities 11 |
12 | Click to expand 13 | 14 | ### Gene expression (GE) profiles 15 | Each cell has DNA in the nucleus which is transcribed into various mRNA molecules which are then translated into proteins that carry out functions in the cell. 16 | The levels of mRNA in the cell are often biologically meaningful - collectively, mRNA levels for a cell are known as its transcriptional state; each individual mRNA level is referred to as the corresponding gene's "expression". 17 | The L1000 assay was used to measure the transcriptional state of cells in the datasets here. 18 | The assay reports a sample's mRNA levels for 978 genes at high-throughput, from the bulk population of cells treated with a given perturbation. 19 | These 978 "landmark" genes capture approximately $80\%$ of the transcriptional variance for the entire genome. 20 | The data processing tools and workflows to produce these profiles are available at https://clue.io/. 21 | 22 | 23 | ### Cell Painting morphological (CP) profiles 24 | We used the Cell Painting assay to measure the morphological state of cells treated with a given perturbation. 25 | The assay captures fluorescence images of cells colored by six well-characterized fluorescent dyes to stain the nucleus, nucleoli, cytoplasmic RNA, endoplasmic reticulum, actin cytoskeleton, Golgi apparatus, plasma membrane and mitochondria. 26 | These eight labeled cell compartments are captured through five channels of high-resolution microscopy images (_DNA, RNA, ER, AGP_, and _Mito_). 27 | Images are then processed using [CellProfiler software](https://cellprofiler.org/) to extract thousands of features of each cell’s morphology and form a high-dimensional profile for each single cell. 28 | These features are based on various shape, intensity and texture statistics and are then aggregated over all the single cells in a "well" (a miniature test tube) to form what are called replicate-level profiles of perturbations.
29 | Aggregation of replicate-level profiles across all the wells or replicates of a perturbation is called a treatment-level profile. 30 | In our study, we used treatment-level profiles in all experiments but have provided replicate-level profiles for researchers interested in further data exploration. 31 | 32 |
33 | 34 | # Datasets 35 | 36 | - We have gathered the following five available data sets that had both Cell Painting morphological (CP) and L1000 gene expression (GE) profiles, preprocessed the data from different sources and in different formats in a unified .csv format. 37 | 38 | - CDRP-BBBC047-Bray-CP-GE (Cell line: U2OS) 39 | - CDRPBIO-BBBC036-Bray-CP-GE (Cell line: U2OS) 40 | - LUAD-BBBC041-Caicedo-CP-GE (Cell line: A549) 41 | - TA-ORF-BBBC037-Rohban-CP-GE (Cell line: U2OS) 42 | - LINCS-Pilot1-CP-GE (Cell line: A549) 43 | 44 | ## References to raw profiles and images 45 |
46 | Click to expand 47 | 48 | - CDRP-BBBC047-Bray-[CP](https://pubmed.ncbi.nlm.nih.gov/28327978/) - [GE](https://pubmed.ncbi.nlm.nih.gov/29195078/) 49 | - CDRP-bio-BBBC036-Bray-[CP](https://pubmed.ncbi.nlm.nih.gov/28327978/) - [GE](https://pubmed.ncbi.nlm.nih.gov/29195078/) 50 | - LUAD-BBBC041-Caicedo-[CP](https://registry.opendata.aws/cell-painting-image-collection/) - [GE](https://pubmed.ncbi.nlm.nih.gov/27478040/) 51 | - TA-ORF-BBBC037-Rohban-[CP](https://elifesciences.org/articles/24060) - [GE](https://github.com/carpenterlab/2017_rohban_elife/tree/master/input/TA-OE-L1000-B1) 52 | - LINCS-Pilot1-[CP](https://zenodo.org/record/3928744#.YNu3WzZKheV) - [GE](https://figshare.com/articles/dataset/L1000_data_for_profiling_comparison/13181966) 53 | 54 |
55 | 56 | 57 | ## Preprocessed publicly available profiles 58 | Preprocessed profiles (~9.5GB) are available on an S3 bucket. 59 | They can be downloaded at no cost and without registration of any kind, using the command: 60 | 61 | ```bash 62 | aws s3 sync \ 63 | --no-sign-request \ 64 | s3://cellpainting-gallery/cpg0003-rosetta/broad/workspace/preprocessed_data . 65 | ``` 66 | 67 | See this [wiki](https://github.com/carpenterlab/2016_bray_natprot/wiki/What-do-Cell-Painting-features-mean%3F) for sample Cell Painting images and the meaning of ([CellProfiler](https://cellprofiler.org/)-derived) Cell Painting features. 68 | 69 | - AWS CLI installation instructions can be found [here](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html). 70 | 71 | ### Data version 72 | 73 | The [Etags](https://docs.aws.amazon.com/AmazonS3/latest/API/API_Object.html) of these files are listed [here](etag.json). 74 | 75 | They were generated using: 76 | 77 | ```sh 78 | aws s3api list-objects --bucket cellpainting-gallery --prefix rosetta/broad/workspace/preprocessed_data/ 79 | ``` 80 | ### CP-L1000 Profile descriptions 81 | 82 | We gathered four available data sets that had both Cell Painting morphological (CP) and L1000 gene expression (GE) profiles, preprocessed the data from different sources and in different formats into a unified .csv format, and made the data publicly available. Single cell morphological (CP) profiles were created using CellProfiler software and processed to form aggregated replicate and treatment levels using the R cytominer package [cytominer](https://github.com/cytomining/cytominer/blob/master/vignettes/cytominer-pipeline.Rmd).
83 | We made the following three types of profiles available for cell-painting modality of each of four datasets: 84 | 85 | 86 | | Folder | File name | Description | 87 | | ------------ | -------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ | 88 | | CellPainting | `replicate_level_cp_augmented.csv.gz` | Aggregated and Metadata annotated profiles which are the average of single cell profiles in each well. | 89 | | CellPainting | `replicate_level_cp_normalized.csv.gz` | Normalized profiles which are the z-scored aggregated profiles, where the scores are computed using the distribution of negative controls as the reference. | 90 | | CellPainting | `replicate_level_cp_normalized_variable_selected.csv.gz` | Normalized variable selected which are normalized profiles with feature selection applied | 91 | | L1000 | `replicate_level_l1k.csv.gz` | Aggregated and Metadata annotated profiles which are the average of single cell profiles in each well. | 92 | 93 | 94 | 95 | ### Metadata information 96 | 97 | This [spreadsheet](https://docs.google.com/spreadsheets/d/1EpqBLJqio8ptGlZe9Ywq1OUJahKSpYNb6S4lJ9yFc0o/edit#gid=174183831) contains a description of all the metadata fields across all 8 datasets. 98 | 99 | #### Keywords to match tables across modalities for each dataset 100 | 101 | 102 | | Dataset | perturbation match column
CP | perturbation match column
GE | Control perturbation value in each of columns
CP and GE | 103 | | :-------------------- | :------------------------------- | :------------------------------- | :---------------------------- | 104 | | CDRP-BBBC047-Bray | Metadata_Sample_Dose | pert_sample_dose | negcon | 105 | | CDRPBIO-BBBC036-Bray | Metadata_Sample_Dose | pert_sample_dose | negcon | 106 | | TA-ORF-BBBC037-Rohban | Metadata_broad_sample | pert_id | negcon | 107 | | LUAD-BBBC041-Caicedo | x_mutation_status | allele | negcon | 108 | | LINCS-Pilot1 | Metadata_pert_id_dose | pert_id_dose | negcon | 109 | 110 | * Two additional columns can also be used to filter for the "Control perturbation" in each data table: 111 | - **pert_type** which can take 'trt' or 'control' values, and column control_type indicates negcon (otherwise empty). 112 | - **control_type** which can take 'negcon' (for control) or NaN (for treatments) values 113 | 114 | #### Number of features for each dataset 115 | 116 | | Dataset | GE | CP
`normalized` | CP
`normalized_variable_selected` | 117 | | -------- | --- | ------------------- | ------------------------------------- | 118 | | CDRP | 977 | 1565 | 727 | 119 | | CDRP-BIO | 977 | 1570 | 601 | 120 | | LUAD | 978 | 1569 | 291 | 121 | | TA-ORF | 978 | 1677 | 63 | 122 | | LINCS | 978 | 1670 | 119 | 123 | 124 | 125 | # Lookup table for L1000 genes predictability 126 | 127 | [Table](results/SingleGenePred/supplementary_D.csv) 128 | 129 | 130 | # License 131 | 132 | We license the data, results, and figures as [CC0 1.0](LICENSE_CC0.md) and the source code as BSD 3-Clause. 133 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: rosetta 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - pip=22.0.4 6 | - conda-forge::pandas=1.4.1 7 | - conda-forge::scikit-learn=1.0.2 8 | - conda-forge::umap-learn=0.5.2 9 | - conda-forge::jupyter=1.0.0 10 | - conda-forge::matplotlib=3.3.3 11 | - conda-forge::seaborn=0.11.2 12 | - conda-forge::openpyxl=3.0.9 13 | -------------------------------------------------------------------------------- /etag.json: -------------------------------------------------------------------------------- 1 | { 2 | "Contents": [ 3 | { 4 | "Key": "rosetta/broad/workspace/preprocessed_data/CDRP-BBBC047-Bray/CellPainting/replicate_level_cp_augmented.csv.gz", 5 | "LastModified": "2022-02-25T20:24:06.000Z", 6 | "ETag": "\"8367b77b245035279d21e083fb57564e-261\"", 7 | "Size": 2183033139, 8 | "StorageClass": "STANDARD", 9 | "Owner": { 10 | "DisplayName": "cellpainting", 11 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 12 | } 13 | }, 14 | { 15 | "Key": "rosetta/broad/workspace/preprocessed_data/CDRP-BBBC047-Bray/CellPainting/replicate_level_cp_normalized.csv.gz", 16 | "LastModified": "2022-02-25T20:24:06.000Z", 17 | "ETag": "\"572869293e0cfacdd8882c2b758fac00-272\"", 18 | "Size": 2277911750, 19 | 
"StorageClass": "STANDARD", 20 | "Owner": { 21 | "DisplayName": "cellpainting", 22 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 23 | } 24 | }, 25 | { 26 | "Key": "rosetta/broad/workspace/preprocessed_data/CDRP-BBBC047-Bray/CellPainting/replicate_level_cp_normalized_variable_selected.csv.gz", 27 | "LastModified": "2022-02-25T20:24:06.000Z", 28 | "ETag": "\"510f9c5a93436c8af2f36f0308c78be0-131\"", 29 | "Size": 1098352960, 30 | "StorageClass": "STANDARD", 31 | "Owner": { 32 | "DisplayName": "cellpainting", 33 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 34 | } 35 | }, 36 | { 37 | "Key": "rosetta/broad/workspace/preprocessed_data/CDRP-BBBC047-Bray/L1000/replicate_level_l1k.csv.gz", 38 | "LastModified": "2022-02-25T20:24:06.000Z", 39 | "ETag": "\"40e1f7285238c5381b9d9fdeebb5a026-32\"", 40 | "Size": 262406281, 41 | "StorageClass": "STANDARD", 42 | "Owner": { 43 | "DisplayName": "cellpainting", 44 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 45 | } 46 | }, 47 | { 48 | "Key": "rosetta/broad/workspace/preprocessed_data/CDRP-BBBC047-Bray/L1000/replicate_level_l1k_pclfc.csv.gz", 49 | "LastModified": "2022-02-25T20:24:06.000Z", 50 | "ETag": "\"630b98d69d185f530acfb0c272e82031-31\"", 51 | "Size": 258651159, 52 | "StorageClass": "STANDARD", 53 | "Owner": { 54 | "DisplayName": "cellpainting", 55 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 56 | } 57 | }, 58 | { 59 | "Key": "rosetta/broad/workspace/preprocessed_data/CDRP-BBBC047-Bray/L1000/replicate_level_l1k_pczscore.csv.gz", 60 | "LastModified": "2022-02-25T20:24:13.000Z", 61 | "ETag": "\"5ad1f4b412c8ea9b9abb55a254a7ebbe-72\"", 62 | "Size": 603440498, 63 | "StorageClass": "STANDARD", 64 | "Owner": { 65 | "DisplayName": "cellpainting", 66 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 67 | } 68 | }, 69 | { 70 | "Key": 
"rosetta/broad/workspace/preprocessed_data/CDRP-BBBC047-Bray/L1000/replicate_level_l1k_vczscore.csv.gz", 71 | "LastModified": "2022-02-25T20:24:13.000Z", 72 | "ETag": "\"b58b4d31e96964f28165f048bdfd60c8-73\"", 73 | "Size": 605293966, 74 | "StorageClass": "STANDARD", 75 | "Owner": { 76 | "DisplayName": "cellpainting", 77 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 78 | } 79 | }, 80 | { 81 | "Key": "rosetta/broad/workspace/preprocessed_data/CDRP-BBBC047-Bray/L1000/treatment_level_l1k.csv.gz", 82 | "LastModified": "2022-02-25T20:24:27.000Z", 83 | "ETag": "\"e695e3d5f520553f516516ab8719719f-13\"", 84 | "Size": 107934871, 85 | "StorageClass": "STANDARD", 86 | "Owner": { 87 | "DisplayName": "cellpainting", 88 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 89 | } 90 | }, 91 | { 92 | "Key": "rosetta/broad/workspace/preprocessed_data/CDRPBIO-BBBC036-Bray/CellPainting/replicate_level_cp_augmented.csv.gz", 93 | "LastModified": "2022-02-25T20:24:27.000Z", 94 | "ETag": "\"3e199aeba5209250e0d2c5948f5bd522-36\"", 95 | "Size": 298941736, 96 | "StorageClass": "STANDARD", 97 | "Owner": { 98 | "DisplayName": "cellpainting", 99 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 100 | } 101 | }, 102 | { 103 | "Key": "rosetta/broad/workspace/preprocessed_data/CDRPBIO-BBBC036-Bray/CellPainting/replicate_level_cp_normalized.csv.gz", 104 | "LastModified": "2022-02-25T20:24:30.000Z", 105 | "ETag": "\"0b86065f8840aff626d64c6f52a8caf4-38\"", 106 | "Size": 311539701, 107 | "StorageClass": "STANDARD", 108 | "Owner": { 109 | "DisplayName": "cellpainting", 110 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 111 | } 112 | }, 113 | { 114 | "Key": "rosetta/broad/workspace/preprocessed_data/CDRPBIO-BBBC036-Bray/CellPainting/replicate_level_cp_normalized_variable_selected.csv.gz", 115 | "LastModified": "2022-02-25T20:24:32.000Z", 116 | "ETag": "\"bffd9db9578fcc70bbd7d72e0dfff773-14\"", 117 | 
"Size": 117242590, 118 | "StorageClass": "STANDARD", 119 | "Owner": { 120 | "DisplayName": "cellpainting", 121 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 122 | } 123 | }, 124 | { 125 | "Key": "rosetta/broad/workspace/preprocessed_data/CDRPBIO-BBBC036-Bray/L1000/replicate_level_l1k.csv.gz", 126 | "LastModified": "2022-02-25T20:24:35.000Z", 127 | "ETag": "\"5b45e5cb94f0466a2abb11fbac8a655e-4\"", 128 | "Size": 26842289, 129 | "StorageClass": "STANDARD", 130 | "Owner": { 131 | "DisplayName": "cellpainting", 132 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 133 | } 134 | }, 135 | { 136 | "Key": "rosetta/broad/workspace/preprocessed_data/LINCS-Pilot1/CellPainting/replicate_level_cp_augmented.csv.gz", 137 | "LastModified": "2022-02-25T20:24:35.000Z", 138 | "ETag": "\"9bde4d7112c06ffa1849fbfa4efa22f1-36\"", 139 | "Size": 296762474, 140 | "StorageClass": "STANDARD", 141 | "Owner": { 142 | "DisplayName": "cellpainting", 143 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 144 | } 145 | }, 146 | { 147 | "Key": "rosetta/broad/workspace/preprocessed_data/LINCS-Pilot1/CellPainting/replicate_level_cp_normalized.csv.gz", 148 | "LastModified": "2022-02-25T20:24:36.000Z", 149 | "ETag": "\"f42af6b4109ef9ed110004def49f6c2c-36\"", 150 | "Size": 299683743, 151 | "StorageClass": "STANDARD", 152 | "Owner": { 153 | "DisplayName": "cellpainting", 154 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 155 | } 156 | }, 157 | { 158 | "Key": "rosetta/broad/workspace/preprocessed_data/LINCS-Pilot1/CellPainting/replicate_level_cp_normalized_variable_selected.csv.gz", 159 | "LastModified": "2022-02-25T20:24:38.000Z", 160 | "ETag": "\"33783625dc59b0de2bf16c299f5380dd-12\"", 161 | "Size": 94527797, 162 | "StorageClass": "STANDARD", 163 | "Owner": { 164 | "DisplayName": "cellpainting", 165 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 166 | } 167 | }, 168 | { 169 
| "Key": "rosetta/broad/workspace/preprocessed_data/LINCS-Pilot1/L1000/level_3.csv.gz", 170 | "LastModified": "2022-02-25T20:24:41.000Z", 171 | "ETag": "\"8491fe32e9b0b040f10c7d51225d6111-11\"", 172 | "Size": 89725093, 173 | "StorageClass": "STANDARD", 174 | "Owner": { 175 | "DisplayName": "cellpainting", 176 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 177 | } 178 | }, 179 | { 180 | "Key": "rosetta/broad/workspace/preprocessed_data/LINCS-Pilot1/L1000/level_4.csv.gz", 181 | "LastModified": "2022-02-25T20:24:42.000Z", 182 | "ETag": "\"14679d4b4cae5e12a4e7be8255bd22ff-10\"", 183 | "Size": 78596325, 184 | "StorageClass": "STANDARD", 185 | "Owner": { 186 | "DisplayName": "cellpainting", 187 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 188 | } 189 | }, 190 | { 191 | "Key": "rosetta/broad/workspace/preprocessed_data/LINCS-Pilot1/L1000/level_4W.csv.gz", 192 | "LastModified": "2022-02-25T20:24:43.000Z", 193 | "ETag": "\"370607c1f148942263037a7e26018303-17\"", 194 | "Size": 140912507, 195 | "StorageClass": "STANDARD", 196 | "Owner": { 197 | "DisplayName": "cellpainting", 198 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 199 | } 200 | }, 201 | { 202 | "Key": "rosetta/broad/workspace/preprocessed_data/LINCS-Pilot1/L1000/level_5_modz.csv.gz", 203 | "LastModified": "2022-02-25T20:24:43.000Z", 204 | "ETag": "\"5967bd8a92d2c57242436330950f1cd2\"", 205 | "Size": 3631, 206 | "StorageClass": "STANDARD", 207 | "Owner": { 208 | "DisplayName": "cellpainting", 209 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 210 | } 211 | }, 212 | { 213 | "Key": "rosetta/broad/workspace/preprocessed_data/LINCS-Pilot1/L1000/level_5_rank.csv.gz", 214 | "LastModified": "2022-02-25T20:24:43.000Z", 215 | "ETag": "\"83c8146ea2f8a2a6392643b3c4472727\"", 216 | "Size": 3631, 217 | "StorageClass": "STANDARD", 218 | "Owner": { 219 | "DisplayName": "cellpainting", 220 | "ID": 
"b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 221 | } 222 | }, 223 | { 224 | "Key": "rosetta/broad/workspace/preprocessed_data/LINCS-Pilot1/L1000/replicate_level_l1k.csv.gz", 225 | "LastModified": "2022-02-25T20:24:44.000Z", 226 | "ETag": "\"872c318560ba21c9d36e805fb97992a4-10\"", 227 | "Size": 78596337, 228 | "StorageClass": "STANDARD", 229 | "Owner": { 230 | "DisplayName": "cellpainting", 231 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 232 | } 233 | }, 234 | { 235 | "Key": "rosetta/broad/workspace/preprocessed_data/LUAD-BBBC041-Caicedo/CellPainting/replicate_level_cp_augmented.csv.gz", 236 | "LastModified": "2022-02-25T20:24:44.000Z", 237 | "ETag": "\"11a0a26d299f09452455e0c7e44c571c-11\"", 238 | "Size": 85105940, 239 | "StorageClass": "STANDARD", 240 | "Owner": { 241 | "DisplayName": "cellpainting", 242 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 243 | } 244 | }, 245 | { 246 | "Key": "rosetta/broad/workspace/preprocessed_data/LUAD-BBBC041-Caicedo/CellPainting/replicate_level_cp_normalized.csv.gz", 247 | "LastModified": "2022-02-25T20:24:46.000Z", 248 | "ETag": "\"f91d40a978c96834973f24b96b8a3b02-11\"", 249 | "Size": 88273100, 250 | "StorageClass": "STANDARD", 251 | "Owner": { 252 | "DisplayName": "cellpainting", 253 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 254 | } 255 | }, 256 | { 257 | "Key": "rosetta/broad/workspace/preprocessed_data/LUAD-BBBC041-Caicedo/CellPainting/replicate_level_cp_normalized_variable_selected.csv.gz", 258 | "LastModified": "2022-02-25T20:24:47.000Z", 259 | "ETag": "\"1ba6936ab1188268850a798e30c4823f-2\"", 260 | "Size": 16570136, 261 | "StorageClass": "STANDARD", 262 | "Owner": { 263 | "DisplayName": "cellpainting", 264 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 265 | } 266 | }, 267 | { 268 | "Key": 
"rosetta/broad/workspace/preprocessed_data/LUAD-BBBC041-Caicedo/L1000/replicate_level_l1k.csv.gz", 269 | "LastModified": "2022-02-25T20:24:47.000Z", 270 | "ETag": "\"c1b8cabef1934d213baf797b80c4c32c-2\"", 271 | "Size": 11448027, 272 | "StorageClass": "STANDARD", 273 | "Owner": { 274 | "DisplayName": "cellpainting", 275 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 276 | } 277 | }, 278 | { 279 | "Key": "rosetta/broad/workspace/preprocessed_data/LUAD-BBBC041-Caicedo/L1000/replicate_level_l1k_Juan.csv.gz", 280 | "LastModified": "2022-02-25T20:24:47.000Z", 281 | "ETag": "\"587d00f75c5fa6164929e3592bf96080-4\"", 282 | "Size": 25582111, 283 | "StorageClass": "STANDARD", 284 | "Owner": { 285 | "DisplayName": "cellpainting", 286 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 287 | } 288 | }, 289 | { 290 | "Key": "rosetta/broad/workspace/preprocessed_data/LUAD-BBBC041-Caicedo/L1000/treatment_level_l1k.csv.gz", 291 | "LastModified": "2022-02-25T20:24:48.000Z", 292 | "ETag": "\"c7f285af2a39efc64a4c8d57854d6a0e\"", 293 | "Size": 4575373, 294 | "StorageClass": "STANDARD", 295 | "Owner": { 296 | "DisplayName": "cellpainting", 297 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 298 | } 299 | }, 300 | { 301 | "Key": "rosetta/broad/workspace/preprocessed_data/TA-ORF-BBBC037-Rohban/CellPainting/replicate_level_cp_augmented.csv.gz", 302 | "LastModified": "2022-02-25T20:24:48.000Z", 303 | "ETag": "\"9707bd02924cda850ed6f1e7eba33d9a-4\"", 304 | "Size": 27548449, 305 | "StorageClass": "STANDARD", 306 | "Owner": { 307 | "DisplayName": "cellpainting", 308 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 309 | } 310 | }, 311 | { 312 | "Key": "rosetta/broad/workspace/preprocessed_data/TA-ORF-BBBC037-Rohban/CellPainting/replicate_level_cp_normalized.csv.gz", 313 | "LastModified": "2022-02-25T20:24:48.000Z", 314 | "ETag": "\"736ef2b85bf5406f27239153f3772218-4\"", 315 | "Size": 
27482072, 316 | "StorageClass": "STANDARD", 317 | "Owner": { 318 | "DisplayName": "cellpainting", 319 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 320 | } 321 | }, 322 | { 323 | "Key": "rosetta/broad/workspace/preprocessed_data/TA-ORF-BBBC037-Rohban/CellPainting/replicate_level_cp_normalized_variable_selected.csv.gz", 324 | "LastModified": "2022-02-25T20:24:48.000Z", 325 | "ETag": "\"1315c2fd175b265d10e929e51d9dfef0\"", 326 | "Size": 1106334, 327 | "StorageClass": "STANDARD", 328 | "Owner": { 329 | "DisplayName": "cellpainting", 330 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 331 | } 332 | }, 333 | { 334 | "Key": "rosetta/broad/workspace/preprocessed_data/TA-ORF-BBBC037-Rohban/L1000/replicate_level_l1k.csv.gz", 335 | "LastModified": "2022-02-25T20:24:49.000Z", 336 | "ETag": "\"1e643bb1182555a8e7699230a0ea98d1\"", 337 | "Size": 2022367, 338 | "StorageClass": "STANDARD", 339 | "Owner": { 340 | "DisplayName": "cellpainting", 341 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 342 | } 343 | }, 344 | { 345 | "Key": "rosetta/broad/workspace/preprocessed_data/TA-ORF-BBBC037-Rohban/L1000/replicate_level_l1k_QNORM.csv.gz", 346 | "LastModified": "2022-02-25T20:24:49.000Z", 347 | "ETag": "\"8ffb9c82772442cbbd138a6ab05a9a97\"", 348 | "Size": 1782302, 349 | "StorageClass": "STANDARD", 350 | "Owner": { 351 | "DisplayName": "cellpainting", 352 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 353 | } 354 | }, 355 | { 356 | "Key": "rosetta/broad/workspace/preprocessed_data/TA-ORF-BBBC037-Rohban/L1000/replicate_level_l1k_ZSPCQNORM.csv.gz", 357 | "LastModified": "2022-02-25T20:24:49.000Z", 358 | "ETag": "\"36783d73bb48bec466aeda707384c7e5\"", 359 | "Size": 1997953, 360 | "StorageClass": "STANDARD", 361 | "Owner": { 362 | "DisplayName": "cellpainting", 363 | "ID": "b2ff2dec476b541160cb5edae0ba12ffb6f3cd979ce9352e9ca765d92ac2170c" 364 | } 365 | } 366 | ] 367 | } 
-------------------------------------------------------------------------------- /idmap.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/idmap.xlsx -------------------------------------------------------------------------------- /read_and_match_profiles.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 35, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "The autoreload extension is already loaded. To reload it, use:\n", 13 | " %reload_ext autoreload\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "%matplotlib inline\n", 19 | "%load_ext autoreload\n", 20 | "%autoreload 2\n", 21 | "import numpy as np\n", 22 | "import pandas as pd\n", 23 | "import matplotlib.pyplot as plt\n", 24 | "import seaborn as sns\n", 25 | "from utils.readProfiles import *" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 4, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# ls" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "### Metadata column in each dataset to match perturbations across modalities\n", 42 | "\n", 43 | "Table 1.\n", 44 | "\n", 45 | "| Dataset | perturbation match column<br>CP | perturbation match column<br>GE | Control perturbation value<br>CP/GE|\n", 46 | "|:----------------------|:-----------------|:-----------------------------|:--------------|\n", 47 | "| CDRP-BBBC047-Bray | Metadata_Sample_Dose | pert_sample_dose | negcon |\n", 48 | "| CDRPBIO-BBBC036-Bray | Metadata_Sample_Dose | pert_sample_dose | negcon |\n", 49 | "| TA-ORF-BBBC037-Rohban | Metadata_broad_sample | pert_id | negcon |\n", 50 | "| LUAD-BBBC041-Caicedo | x_mutation_status | allele | negcon|\n", 51 | "| LINCS-Pilot1 | Metadata_pert_id_dose | pert_id_dose | negcon |\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 36, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "ds_info_dict = {\n", 61 | " \"CDRP\": [\"CDRP-BBBC047-Bray\", [\"Metadata_Sample_Dose\", \"pert_sample_dose\"]],\n", 62 | " \"CDRP-bio\": [\"CDRPBIO-BBBC036-Bray\", [\"Metadata_Sample_Dose\", \"pert_sample_dose\"]],\n", 63 | " \"TAORF\": [\"TA-ORF-BBBC037-Rohban\", [\"Metadata_broad_sample\", \"pert_id\"]],\n", 64 | " \"LUAD\": [\"LUAD-BBBC041-Caicedo\", [\"x_mutation_status\", \"allele\"]],\n", 65 | " \"LINCS\": [\"LINCS-Pilot1\", [\"Metadata_pert_id_dose\", \"pert_id_dose\"]],\n", 66 | "}\n", 67 | "# pd.DataFrame(ds_info_dict.values(), index=ds_info_dict.keys()).to_markdown(index=False)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "### In this notebook you can find examples of how to:\n", 82 | "- read replicate or treatment level profiles \n", 83 | "- match profiles across modalities\n", 84 | "\n", 85 | "\n", 86 | "\n", 87 | "* Functions used in this notebook:\n", 88 | "\n", 89 | " - Read **treatment** level data\n", 90 | " - read_treatment_level_profiles\n", 91 | " \n", 92 | " - Read and match **treatment** level data\n", 93 | " - read_paired_treatment_level_profiles\n", 94 | " \n", 95 | " - Read **Replicate** level data\n", 96 | " -
read_replicate_level_profiles\n", 97 | " \n", 98 | " - Read and match **Replicate** level data\n", 99 | " - read_paired_replicate_level_profiles\n" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "### User input parameters" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 37, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "####################### Root directories ###############################################\n", 116 | "procProf_dir = \"/home/ubuntu/gallery/cpg0003-rosetta/broad/workspace/\"\n", 117 | "# procProf_dir = \"/home/ubuntu/bucket/projects/2018_04_20_Rosetta/workspace/\"\n", 118 | "\n", 119 | "############################# Dataset ##################################################\n", 120 | "# dataset options: 'LUAD', 'TAORF', 'LINCS', 'CDRP-bio', 'CDRP'\n", 121 | "dataset = \"LUAD\"\n", 122 | "\n", 123 | "####################### Type of cell painting profile to read ##########################\n", 124 | "# CP Profile Type options: 'augmented' , 'normalized', 'normalized_variable_selected'\n", 125 | "profileType = \"normalized_variable_selected\"\n", 126 | "\n", 127 | "############################ Filtering low quality samples option #######################\n", 128 | "# filtering to compounds which have high replicates for both GE and CP datasets\n", 129 | "# highRepOverlapEnabled=0\n", 130 | "# 'highRepUnion','highRepOverlap'\n", 131 | "filter_perts = \"highRepUnion\"\n", 132 | "repCorrFilePath = \"./results/RepCor/RepCorrDF.xlsx\"\n", 133 | "\n", 134 | "filter_repCorr_params = [filter_perts, repCorrFilePath]" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "### Read Replicate level profiles" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 14, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "name": "stderr", 151 | "output_type": "stream", 152 | "text": [ 153 | 
"/home/ubuntu/workspace_rosetta/workspace/software/2022_Haghighi_NatureMethods/utils/readProfiles.py:54: DtypeWarning: Columns (1023,1028,1032) have mixed types. Specify dtype option on import or set low_memory=False.\n", 154 | " l1k_data_repLevel = pd.read_csv(dataDir + \"/L1000/replicate_level_l1k.csv.gz\")\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "# dataset = \"LINCS\"\n", 160 | "per_plate_normalized_flag = 0\n", 161 | "[cp_data_repLevel, cp_features], [l1k_data_repLevel, l1k_features] = read_replicate_level_profiles(\n", 162 | " procProf_dir, dataset, profileType, per_plate_normalized_flag\n", 163 | ")" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "### Read and pair Replicate level profiles" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 6, 176 | "metadata": {}, 177 | "outputs": [ 178 | { 179 | "name": "stderr", 180 | "output_type": "stream", 181 | "text": [ 182 | "/home/ubuntu/workspace_rosetta/workspace/software/2022_Haghighi_NatureMethods/utils/readProfiles.py:51: DtypeWarning: Columns (18,19,1249,1250) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", 183 | " cp_data_repLevel = pd.read_csv(\n" 184 | ] 185 | }, 186 | { 187 | "name": "stdout", 188 | "output_type": "stream", 189 | "text": [ 190 | "LINCS: Replicate Level Shapes (nSamples x nFeatures): cp: 52223 , 119 , l1k: 27837 , 978\n", 191 | "l1k n of rep: 3.0\n", 192 | "cp n of rep: 5.0\n", 193 | "CP: from 9394 to 4647\n", 194 | "l1k: from 8369 to 2338\n", 195 | "CP and l1k high rep union: 5845\n" 196 | ] 197 | }, 198 | { 199 | "name": "stderr", 200 | "output_type": "stream", 201 | "text": [ 202 | "/home/ubuntu/workspace_rosetta/workspace/software/2022_Haghighi_NatureMethods/utils/readProfiles.py:376: FutureWarning: Passing 'suffixes' which cause duplicate columns {'pert_type_y'} in the result is deprecated and will raise a MergeError in a future version.\n", 203 | " mergedProfiles_repLevel = pd.merge(\n" 204 | ] 205 | } 206 | ], 207 | "source": [ 208 | "nRep = 2\n", 209 | "per_plate_normalized_flag = 1\n", 210 | "mergedProfiles_repLevel, cp_features, l1k_features = read_paired_replicate_level_profiles(\n", 211 | " procProf_dir, dataset, profileType, nRep, filter_repCorr_params, per_plate_normalized_flag\n", 212 | ")" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "### Read treatment level profiles" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 7, 225 | "metadata": {}, 226 | "outputs": [ 227 | { 228 | "name": "stderr", 229 | "output_type": "stream", 230 | "text": [ 231 | "/home/ubuntu/workspace_rosetta/workspace/software/2022_Haghighi_NatureMethods/utils/readProfiles.py:51: DtypeWarning: Columns (18,19,1249,1250) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", 232 | " cp_data_repLevel = pd.read_csv(\n" 233 | ] 234 | }, 235 | { 236 | "name": "stdout", 237 | "output_type": "stream", 238 | "text": [ 239 | "LINCS: Replicate Level Shapes (nSamples x nFeatures): cp: 52223 , 119 , l1k: 27837 , 978\n", 240 | "l1k n of rep: 3.0\n", 241 | "cp n of rep: 5.0\n", 242 | "CP: from 9394 to 4647\n", 243 | "l1k: from 8369 to 2338\n", 244 | "CP and l1k high rep union: 5845\n" 245 | ] 246 | } 247 | ], 248 | "source": [ 249 | "[cp_data_treatLevel, cp_features], [\n", 250 | " l1k_data_treatLevel,\n", 251 | " l1k_features,\n", 252 | "] = read_treatment_level_profiles(\n", 253 | " procProf_dir, dataset, profileType, filter_repCorr_params, per_plate_normalized_flag\n", 254 | ")" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": {}, 260 | "source": [ 261 | "### Read and pair treatment level profiles" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 9, 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "name": "stderr", 271 | "output_type": "stream", 272 | "text": [ 273 | "/home/ubuntu/workspace_rosetta/workspace/software/2022_Haghighi_NatureMethods/utils/readProfiles.py:51: DtypeWarning: Columns (18,19,1249,1250) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", 274 | " cp_data_repLevel = pd.read_csv(\n" 275 | ] 276 | }, 277 | { 278 | "name": "stdout", 279 | "output_type": "stream", 280 | "text": [ 281 | "LINCS: Replicate Level Shapes (nSamples x nFeatures): cp: 52223 , 119 , l1k: 27837 , 978\n", 282 | "l1k n of rep: 3.0\n", 283 | "cp n of rep: 5.0\n", 284 | "CP: from 9394 to 4647\n", 285 | "l1k: from 8369 to 2338\n", 286 | "CP and l1k high rep union: 5845\n", 287 | "Treatment Level Shapes (nSamples x nFeatures+metadata): (5243, 122) (4431, 980) Merged Profiles Shape: (3828, 1101)\n" 288 | ] 289 | } 290 | ], 291 | "source": [ 292 | "mergedProfiles_treatLevel, cp_features, l1k_features = read_paired_treatment_level_profiles(\n", 293 | " procProf_dir, dataset, profileType, filter_repCorr_params, per_plate_normalized_flag\n", 294 | ")" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 40, 307 | "metadata": { 308 | "scrolled": false 309 | }, 310 | "outputs": [], 311 | "source": [ 312 | "# l1k_data_repLevel[ds_info_dict[dataset][1][1]].unique()\n", 313 | "# cp_data_repLevel[ds_info_dict[dataset][1][0]].unique()" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 41, 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "# per_plate_normalized_flag" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [] 338 | } 339 | ], 340 | "metadata": { 341 | "kernelspec": { 342 | "display_name": "Python 3 (ipykernel)", 343 | "language": "python", 344 | "name": "python3" 345 | }, 346 | "language_info": { 347 | "codemirror_mode": { 348 | "name": "ipython", 349 | 
"version": 3 350 | }, 351 | "file_extension": ".py", 352 | "mimetype": "text/x-python", 353 | "name": "python", 354 | "nbconvert_exporter": "python", 355 | "pygments_lexer": "ipython3", 356 | "version": "3.9.0" 357 | }, 358 | "latex_envs": { 359 | "LaTeX_envs_menu_present": true, 360 | "autoclose": false, 361 | "autocomplete": true, 362 | "bibliofile": "biblio.bib", 363 | "cite_by": "apalike", 364 | "current_citInitial": 1, 365 | "eqLabelWithNumbers": true, 366 | "eqNumInitial": 1, 367 | "hotkeys": { 368 | "equation": "Ctrl-E", 369 | "itemize": "Ctrl-I" 370 | }, 371 | "labels_anchors": false, 372 | "latex_user_defs": false, 373 | "report_style_numbering": false, 374 | "user_envs_cfg": false 375 | }, 376 | "varInspector": { 377 | "cols": { 378 | "lenName": 16, 379 | "lenType": 16, 380 | "lenVar": 40 381 | }, 382 | "kernels_config": { 383 | "python": { 384 | "delete_cmd_postfix": "", 385 | "delete_cmd_prefix": "del ", 386 | "library": "var_list.py", 387 | "varRefreshCmd": "print(var_dic_list())" 388 | }, 389 | "r": { 390 | "delete_cmd_postfix": ") ", 391 | "delete_cmd_prefix": "rm(", 392 | "library": "var_list.r", 393 | "varRefreshCmd": "cat(var_dic_list()) " 394 | } 395 | }, 396 | "position": { 397 | "height": "438.212px", 398 | "left": "1507.78px", 399 | "right": "20px", 400 | "top": "120px", 401 | "width": "350px" 402 | }, 403 | "types_to_exclude": [ 404 | "module", 405 | "function", 406 | "builtin_function_or_method", 407 | "instance", 408 | "_Feature" 409 | ], 410 | "window_display": false 411 | } 412 | }, 413 | "nbformat": 4, 414 | "nbformat_minor": 2 415 | } 416 | -------------------------------------------------------------------------------- /results/DAVIDoutput_CytoScapeInput_Figure2d/chart_UP_KEYWORDS_FunctionalAnot_top.txt: -------------------------------------------------------------------------------- 1 | Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR 2 | UP_KEYWORDS Acetylation 30 
54.54545454545454 5.124648987027806E-10 TOP2A, CLIC4, SLC35F2, NOLC1, GLRX, EBP, NNT, STMN1, ANXA7, PCBD1, HADH, LBR, LIG1, TXNRD1, USP22, TPM1, RPA1, PYCR1, DDX10, PAICS, HIST2H2BE, CCNA2, GNPDA1, MTHFD2, BIRC5, PSMG1, KIF2C, KIF20A, ARHGEF2, PAFAH1B3 55 3424 20581 3.2786214953271027 8.609409085647002E-8 8.609410298206715E-8 8.45567082859588E-8 3 | UP_KEYWORDS Phosphoprotein 36 65.45454545454545 2.0388780008061567E-4 TOP2A, CLIC4, SLC35F2, INPP1, MRPL19, FHL2, NOLC1, PSIP1, RELB, PGRMC1, OXCT1, STMN1, STX4, CPNE3, LBR, MPZL1, IER3, LIG1, TXNRD1, TPM1, RPA1, PYCR1, DDX10, PAICS, HIST2H2BE, CCNA2, TXLNA, GNPDA1, TCEA2, BIRC5, PSMG1, KIF2C, NCAPD2, KIF20A, ARHGEF2, PAFAH1B3 55 8246 20581 1.6336648071792386 0.03367652713909708 0.017126575206771716 0.016820743506650793 4 | UP_KEYWORDS Cell division 6 10.909090909090908 0.003440617247463365 CCNA2, LIG1, BIRC5, KIF2C, NCAPD2, ARHGEF2 55 388 20581 5.78659793814433 0.4395528677953666 0.19267456585794843 0.18923394861048506 5 | UP_KEYWORDS Mitosis 5 9.090909090909092 0.004928799319810609 CCNA2, BIRC5, KIF2C, NCAPD2, ARHGEF2 55 262 20581 7.141221374045802 0.5639880523221983 0.19369482099750268 0.19023598490826157 6 | UP_KEYWORDS Microtubule 5 9.090909090909092 0.006220048085963932 STMN1, BIRC5, KIF2C, KIF20A, ARHGEF2 55 280 20581 6.682142857142857 0.649442836226727 0.19369482099750268 0.19023598490826157 7 | UP_KEYWORDS Cell cycle 7 12.727272727272727 0.006917672178482239 CCNA2, LIG1, USP22, BIRC5, KIF2C, NCAPD2, ARHGEF2 55 650 20581 4.029846153846154 0.6884536989331027 0.19369482099750268 0.19023598490826157 8 | UP_KEYWORDS Transit peptide 6 10.909090909090908 0.013033979603529661 ALAS1, NNT, OXCT1, MTHFD2, MRPL19, HADH 55 536 20581 4.188805970149254 0.8896506576574663 0.3128155104847119 0.30722951922605635 9 | UP_KEYWORDS Cytoplasm 21 38.18181818181819 0.01679969816231558 TOP2A, CLIC4, ZNF274, TXNRD1, TPM1, FHL2, NOLC1, GLRX, RELB, CCNA2, GNPDA1, STMN1, BIRC5, PCBD1, PSMG1, KIF2C, NCAPD2, CPNE3, KIF20A, ARHGEF2, PAFAH1B3 
55 4816 20581 1.631686046511628 0.9419427396195738 0.33607778138786243 0.3300763924345077 10 | UP_KEYWORDS Oxidoreductase 6 10.909090909090908 0.018004166860064057 NNT, MTHFD2, P4HA2, TXNRD1, PYCR1, HADH 55 582 20581 3.857731958762886 0.9527479826636567 0.33607778138786243 0.3300763924345077 11 | UP_KEYWORDS Mitochondrion 8 14.545454545454545 0.026249398106128065 CLIC4, ALAS1, NNT, OXCT1, MTHFD2, MRPL19, PYCR1, HADH 55 1119 20581 2.6752457551385165 0.9885390617335139 0.39814595131174274 0.3910362021811759 12 | UP_KEYWORDS Isopeptide bond 8 14.545454545454545 0.027734439083459617 TOP2A, LIG1, TPM1, FHL2, RPA1, TCEA2, NOLC1, HIST2H2BE 55 1132 20581 2.64452296819788 0.9911312247584608 0.39814595131174274 0.3910362021811759 13 | UP_KEYWORDS Cytoskeleton 8 14.545454545454545 0.028438996522267338 CLIC4, TPM1, STMN1, BIRC5, KIF2C, KIF20A, ARHGEF2, RELB 55 1138 20581 2.630579964850615 0.9921481397546121 0.39814595131174274 0.3910362021811759 14 | UP_KEYWORDS Ubl conjugation 10 18.181818181818183 0.03212534559864598 TOP2A, CCNA2, TXNRD1, FHL2, RPA1, TCEA2, NOLC1, BIRC5, KIF2C, HIST2H2BE 55 1705 20581 2.1947214076246335 0.9958541680959994 0.4151583123517327 0.4077447710597374 15 | UP_KEYWORDS Nucleus 21 38.18181818181819 0.04001475328204553 TOP2A, CLIC4, ZNF274, LIG1, TXNRD1, USP22, FHL2, RPA1, PSIP1, NOLC1, HIST2H2BE, RELB, CCNA2, EBP, TCEA2, BIRC5, PCBD1, KIF2C, NCAPD2, CPNE3, LBR 55 5244 20581 1.498512585812357 0.998951795861702 0.48017703938454637 0.4716024493955367 16 | UP_KEYWORDS Magnesium 5 9.090909090909092 0.05656549310361622 TOP2A, LIG1, INPP1, MTHFD2, ATP2C1 55 552 20581 3.3894927536231885 0.9999435615398904 0.6335335227605017 0.6222204241397784 17 | UP_KEYWORDS Disease mutation 12 21.818181818181817 0.06441555750669184 EBP, NPC1, NNT, OXCT1, P4HA2, TPM1, PYCR1, PCBD1, ATP2C1, HADH, SLC37A4, LBR 55 2550 20581 1.7609411764705882 0.9999861342755192 0.6763633538202644 0.6642854367877596 18 | UP_KEYWORDS ATP-binding 8 14.545454545454545 0.07015335858609392 TOP2A, 
LIG1, NOLC1, DDX10, KIF2C, KIF20A, ATP2C1, PAICS 55 1391 20581 2.152120776419842 0.9999950670933881 0.6932802495566929 0.6809002451003233 19 | UP_KEYWORDS NAD 3 5.454545454545454 0.07719857350347654 NNT, MTHFD2, HADH 55 175 20581 6.414857142857143 0.9999986253655966 0.7205200193657811 0.7076535904485349 20 | UP_KEYWORDS NADP 3 5.454545454545454 0.08568717461020739 NNT, TXNRD1, PYCR1 55 186 20581 6.035483870967742 0.9999997089762821 0.7356053928073722 0.7224695822215264 21 | UP_KEYWORDS Chromosome 4 7.2727272727272725 0.08757207057230622 BIRC5, KIF2C, NCAPD2, HIST2H2BE 55 400 20581 3.7419999999999995 0.9999997942405193 0.7356053928073722 0.7224695822215264 22 | UP_KEYWORDS Alternative splicing 34 61.81818181818181 0.09860418033718393 TOP2A, ZNF274, ALAS1, SLC35F2, FHL2, NOLC1, PSIP1, ATP2C1, PGRMC1, OXCT1, GPC1, STMN1, ANXA7, STX4, HADH, SLC37A4, MPZL1, LIG1, TXNRD1, USP22, TPM1, PYCR1, PAICS, GNPDA1, NPC1, P4HA2, MTHFD2, TCEA2, BIRC5, PSMG1, KIF2C, KIF20A, ARHGEF2, TRIB1 55 10587 20581 1.2017379805421742 0.9999999733430376 0.7888334426974715 0.7747471312207309 23 | -------------------------------------------------------------------------------- /results/Figs_Source_Data.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/Figs_Source_Data.xlsx -------------------------------------------------------------------------------- /results/MoAprediction/JI_cdrpbio.txt: -------------------------------------------------------------------------------- 1 | "CP" "GE" "Early Fusion" "RGCCA" "MCIA" "MOFA" "iCluster" "intNMF" "JIVE" "scikit-fusion" 2 | 0.278851699 0.21674894 0.259032794 0.31199107 0.244958406 0.245157412 0.166666667 0.273648354 0.26111079 0.202512456 3 | 0.281781202 0.217353574 0.272947479 0.299993438 0.237120438 0.24766634 0.166666667 0.284937711 0.271319067 0.204582565 4 | 0.2718746 0.219221298 
0.288572557 0.300109206 0.245544432 0.239201534 0.176470588 0.284523564 0.264215061 0.20097943 5 | 0.274209543 0.214641334 0.266898688 0.298896298 0.248330213 0.247923298 0.166666667 0.292745201 0.256330169 0.192807023 6 | 0.265629351 0.217429769 0.279090052 0.307711408 0.239454065 0.239942443 1 0.285721498 0.26371381 0.199625524 7 | 0.275029708 0.2239464 0.28495489 0.301235572 0.2386484 0.254481673 0.153846154 0.282939774 0.264471233 0.1976511 8 | 0.288765182 0.217004773 0.282173851 0.306246664 0.246732016 0.254969989 0.4 0.294275281 0.263112267 0.200504611 9 | 0.278531041 0.223395083 0.275002364 0.293082233 0.24396531 0.242451334 0.333333333 0.291354281 0.258611036 0.209453967 10 | 0.272455248 0.231600555 0.262360363 0.295268069 0.241331141 0.255809239 0.5 0.296717155 0.261306837 0.207150644 11 | 0.275023829 0.225685487 0.271300656 0.295524507 0.246870483 0.238823953 0.16 0.274089946 0.265535744 0.20338892 12 | 0.28143867 0.224871523 0.260911678 0.303672023 0.236066059 0.239288014 0.171428571 0.294494354 0.261902235 0.209384306 13 | 0.282288852 0.223132721 0.282475537 0.290829442 0.244147491 0.256209037 0.2 0.282647138 0.259500099 0.19759714 14 | 0.287532178 0.222311311 0.26242836 0.295981315 0.243569808 0.243439708 0.157894737 0.275122015 0.260815605 0.202213235 15 | 0.301480301 0.215302881 0.270409742 0.300910495 0.239649912 0.247746601 0.1 0.28834381 0.251075084 0.204514447 16 | 0.277106418 0.217944084 0.281592592 0.309385645 0.246099862 0.247184454 0.083333333 0.286270538 0.25763249 0.207762257 17 | 0.260563304 0.229904461 0.266062905 0.301805175 0.238773842 0.237490211 0.117647059 0.285515992 0.252312053 0.20449294 -------------------------------------------------------------------------------- /results/MoAprediction/JI_lincs.txt: -------------------------------------------------------------------------------- 1 | "CP" "GE" "Early Fusion" "RGCCA" "MCIA" "MOFA" "iCluster" "intNMF" "JIVE" "scikit-fusion" 2 | 0.188998329 0.152892632 0.161685084 0.201019074 
0.174152986 0.186524756 0.177777778 0.159674769 0.177569702 0.119184686 3 | 0.182455547 0.155460808 0.1663579 0.198310907 0.179608683 0.173331065 0.226415094 0.16362849 0.167412958 0.116937471 4 | 0.193899409 0.152630737 0.174071877 0.190282351 0.181639662 0.172031122 0.070588235 0.166882412 0.169969459 0.120875801 5 | 0.182039125 0.155030388 0.165013806 0.196283122 0.182571093 0.172776906 0.071005917 0.168034196 0.1757652 0.124514593 6 | 0.184550744 0.147509477 0.169153875 0.187533527 0.177913267 0.17130581 0.043478261 0.159508666 0.167613402 0.113748412 7 | 0.182496631 0.159455754 0.165133202 0.18379198 0.179867253 0.183528183 0.089430894 0.169158962 0.159833977 0.12420253 8 | 0.171946117 0.154463604 0.178991604 0.197523283 0.19030103 0.171250903 0.2 0.172767563 0.164891016 0.123406295 9 | 0.186694719 0.155486143 0.164093562 0.187112227 0.186310238 0.183258565 0.208333333 0.165169404 0.171638208 0.118322396 10 | 0.180338058 0.143601898 0.15861667 0.200142072 0.184137339 0.17660344 0.072164948 0.16547984 0.163073586 0.126149742 11 | 0.183605321 0.15741292 0.171381991 0.188784002 0.173252761 0.177570921 0.01369863 0.163314014 0.173093975 0.11873174 12 | 0.193222559 0.156995026 0.173813632 0.196182388 0.174394311 0.171367636 0.083333333 0.165245188 0.164822426 0.11878228 13 | 0.192671978 0.154919056 0.170624979 0.193073079 0.181786851 0.171575995 0.291139241 0.171691008 0.165878195 0.119526703 14 | 0.187967039 0.155867196 0.162878291 0.19683875 0.192637938 0.18045576 0.461538462 0.157649099 0.163766256 0.117880675 15 | 0.18544228 0.157804763 0.179560591 0.185849847 0.187052994 0.177218298 0.049382716 0.163521993 0.161072242 0.124717351 16 | 0.190641757 0.155245631 0.15904326 0.192819574 0.18380893 0.170782525 0.058823529 0.164331798 0.174827621 0.120163039 17 | 0.185672437 0.164612477 0.173447376 0.196032956 0.174881143 0.17674655 0.25 0.162468251 0.162594174 0.124472168 18 | 0.190136704 0.153092828 0.170318814 0.199112922 0.181481014 0.181176604 0.15625 0.169043792 
0.158751472 0.118636228 19 | 0.184765443 0.157565332 0.173143963 0.198382252 0.179289922 0.173976218 0.45 0.172475028 0.164535448 0.11733305 20 | 0.192641148 0.154739976 0.171832001 0.197707877 0.179048991 0.182646244 0.138888889 0.169634439 0.170512163 0.120909347 21 | 0.183218664 0.156702842 0.17199648 0.189603926 0.180477667 0.179052585 0.057971014 0.171736775 0.152669412 0.116083564 22 | 0.187256874 0.147593792 0.170884185 0.198366585 0.180195479 0.175878718 0.058139535 0.169146685 0.159364501 0.118681457 23 | 0.187069918 0.149542804 0.16561384 0.187212941 0.183953164 0.182117759 0.046511628 0.166623749 0.170293955 0.120635555 24 | 0.183073028 0.153191771 0.156706199 0.198252086 0.178945469 0.183918728 0.01369863 0.164690939 0.168051618 0.12046373 25 | 0.188407875 0.158852641 0.155445963 0.199950627 0.178443793 0.179828374 0.079365079 0.167779642 0.166961766 0.119892686 26 | 0.175651634 0.171142449 0.158127203 0.191347613 0.195579897 0.178017005 0.2 0.170611727 0.166003424 0.11948773 27 | 0.17865041 0.155742474 0.18308929 0.187846662 0.185773521 0.183502339 0.06 0.167828001 0.164108882 0.120033978 28 | 0.194311319 0.158786831 0.175152329 0.188190067 0.18457021 0.17641168 0.0625 0.164302617 0.158739428 0.119227709 29 | 0.193254176 0.15960622 0.175409666 0.187384662 0.179096717 0.18390748 0.105263158 0.163219297 0.167798949 0.121511195 30 | 0.18617729 0.155678692 0.167753437 0.194289268 0.174803478 0.175536734 0.083333333 0.164659934 0.159521227 0.123949342 31 | 0.179601909 0.156356456 0.161356004 0.193791932 0.183172202 0.175497751 0.051724138 0.162390982 0.170494378 0.120299648 32 | 0.18963734 0.156625601 0.166336797 0.193811041 0.179803533 0.17315947 0.183333333 0.156586373 0.170302999 0.11827188 33 | 0.184352173 0.152024514 0.16540649 0.19131148 0.172935734 0.178541939 0.223529412 0.176281185 0.157959047 0.116664658 34 | 0.189699393 0.148682331 0.164026646 0.201137787 0.182479125 0.168811084 0.620689655 0.175702153 0.162975292 0.120901982 35 | 0.180999142 
0.161272099 0.160666305 0.20022499 0.185258429 0.176733567 0.016666667 0.165612072 0.160725882 0.116746776 36 | 0.180157675 0.157840116 0.167903519 0.200828676 0.182595129 0.175124784 0.046875 0.170652949 0.16434307 0.121763081 37 | 0.175230255 0.155045636 0.166836702 0.198913695 0.177755714 0.181654699 0.051282051 0.161360514 0.162028228 0.116845951 38 | 0.184173371 0.155724161 0.16486378 0.186722298 0.172474892 0.176250337 0.058823529 0.167288187 0.161137409 0.118986834 39 | 0.187505371 0.162342476 0.170702512 0.18814401 0.180784286 0.175958655 0.043478261 0.168911293 0.158109144 0.11885624 40 | 0.184911138 0.152442416 0.170156448 0.194377939 0.174680795 0.184703302 0.111111111 0.1634296 0.157688838 0.117673344 41 | 0.200188868 0.167102791 0.175260347 0.207820825 0.180506812 0.181896395 0.452380952 0.177000251 0.159222812 0.118347626 42 | 0.185687782 0.145595456 0.166612367 0.194777853 0.180121812 0.18308245 0.06122449 0.163437656 0.167444436 0.116668697 43 | 0.184420505 0.164602208 0.162737406 0.195980252 0.179240673 0.182344867 0.117647059 0.170539941 0.166948171 0.124213738 44 | 0.189296239 0.15369093 0.175729401 0.197048519 0.181657677 0.172635318 0.060869565 0.166161194 0.160457602 0.122806357 45 | 0.185131847 0.153377751 0.157125437 0.198765646 0.176936046 0.177792184 0.1 0.159993478 0.166106644 0.123517094 46 | 0.191163865 0.153658909 0.173349824 0.196983778 0.175305658 0.184276093 0.063829787 0.165410291 0.164367546 0.122196122 47 | 0.182868061 0.152204389 0.164102408 0.193891247 0.179592803 0.171609376 0.051724138 0.162577 0.168739518 0.11732252 48 | 0.182594781 0.158816683 0.172826354 0.203075052 0.179938037 0.169869442 0.28 0.160315945 0.158795732 0.121053386 49 | 0.1857397 0.1627972 0.164690579 0.193372016 0.178605952 0.180772956 0.681818182 0.172442065 0.166283947 0.12059789 50 | 0.185656684 0.154339706 0.164199974 0.191689141 0.175449618 0.178893609 0.041666667 0.170628469 0.165995994 0.114831952 51 | 0.183158086 0.152878986 0.163597095 0.194775553 
0.176895727 0.175380238 0.048 0.161713348 0.176740485 0.124128933 52 | 0.184734257 0.149737635 0.170376461 0.194929765 0.179171628 0.185745275 0.14 0.166058303 0.167017443 0.115490732 53 | 0.179232878 0.156775481 0.161159229 0.202649183 0.187129908 0.172128717 0.065420561 0.180383253 0.157901665 0.121413283 54 | 0.178903625 0.157511588 0.16991642 0.197078156 0.191806662 0.181379942 0.131578947 0.162768724 0.160685666 0.119430025 55 | 0.177152292 0.163682115 0.167529903 0.20160861 0.185839945 0.175390124 0.417721519 0.171189056 0.163199997 0.122366042 56 | 0.186724616 0.148527899 0.179844588 0.192022594 0.174337312 0.179788153 0.08 0.170737665 0.165948529 0.120087864 57 | 0.185444784 0.159524103 0.166411955 0.195164402 0.180475026 0.176058824 0.607142857 0.164282465 0.160104594 0.122952826 58 | 0.189335143 0.146887219 0.16845284 0.192139051 0.180729516 0.186868239 0.056603774 0.168942511 0.160584253 0.12571239 -------------------------------------------------------------------------------- /results/MoAprediction/pred_moa.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/MoAprediction/pred_moa.xlsx -------------------------------------------------------------------------------- /results/MoAprediction/pred_moa_2.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/MoAprediction/pred_moa_2.xlsx -------------------------------------------------------------------------------- /results/MoAprediction/pred_moa_CDRP.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/MoAprediction/pred_moa_CDRP.xlsx -------------------------------------------------------------------------------- /results/MoAprediction/pred_moa_LINCS.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/MoAprediction/pred_moa_LINCS.xlsx -------------------------------------------------------------------------------- /results/RepCor/RepCorrDF.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/RepCor/RepCorrDF.xlsx -------------------------------------------------------------------------------- /results/SingleCPfeatPred/scores_corrected.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/SingleCPfeatPred/scores_corrected.xlsx -------------------------------------------------------------------------------- /results/SingleGenePred/scores_corrected.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/SingleGenePred/scores_corrected.xlsx -------------------------------------------------------------------------------- /results/SingleGenePred/scores_cross_dts_LU_LI.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/SingleGenePred/scores_cross_dts_LU_LI.xlsx -------------------------------------------------------------------------------- /results/SingleGenePred_cpCategoryMap/CatMap-LINCS-25-lasso-ht.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/SingleGenePred_cpCategoryMap/CatMap-LINCS-25-lasso-ht.png -------------------------------------------------------------------------------- /results/SingleGenePred_cpCategoryMap/CatMap-LUAD-9-MLP-keras-ht.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/SingleGenePred_cpCategoryMap/CatMap-LUAD-9-MLP-keras-ht.pdf -------------------------------------------------------------------------------- /results/SingleGenePred_cpCategoryMap/CatMap-LUAD-9-MLP-keras-ht.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/SingleGenePred_cpCategoryMap/CatMap-LUAD-9-MLP-keras-ht.png -------------------------------------------------------------------------------- /results/SingleGenePred_cpCategoryMap/CatMap-LUAD-9-lasso-ht.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/SingleGenePred_cpCategoryMap/CatMap-LUAD-9-lasso-ht.png -------------------------------------------------------------------------------- /results/SingleGenePred_cpCategoryMap/cat_scores_maps.xlsx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/carpenter-singh-lab/2022_Haghighi_NatureMethods/f23205944e17f47d7e8959be71f4b7d25075b191/results/SingleGenePred_cpCategoryMap/cat_scores_maps.xlsx -------------------------------------------------------------------------------- /utils/pred_models.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import ( 2 | cross_val_score, 3 | cross_val_predict, 4 | GroupKFold, 5 | LeaveOneGroupOut, 6 | ) 7 | from sklearn.model_selection import GridSearchCV 8 | from sklearn import metrics 9 | import numpy as np 10 | from sklearn import preprocessing 11 | from warnings import simplefilter 12 | from sklearn.neural_network import MLPRegressor 13 | from sklearn.exceptions import ConvergenceWarning 14 | from sklearn.model_selection import train_test_split 15 | from sklearn.svm import SVR 16 | 17 | # simplefilter("ignore", category=ConvergenceWarning) 18 | # from sklearn.exceptions import ConvergenceWarning 19 | # ConvergenceWarning('ignore') 20 | 21 | 22 | ########################## Lasso models 23 | def lasso_cv(X, y, k, group_labels): 24 | """ 25 | X: CP data [perts/samples, features] 26 | y: lm gene expression value [perts/samples, 1 (feature value)] 27 | 28 | Returns: 29 | prediction scores, y permutated scores 30 | """ 31 | from sklearn import linear_model 32 | 33 | n_j = 3 34 | # build sklearn model 35 | clf = linear_model.Lasso(alpha=0.1, max_iter=10000) 36 | 37 | # k=np.unique(group_labels).shape[0] 38 | split_obj = GroupKFold(n_splits=k) 39 | # split_obj = LeaveOneGroupOut() 40 | # Perform k-fold cross validation 41 | scores = cross_val_score(clf, X, y, groups=group_labels, cv=split_obj, n_jobs=n_j) 42 | 43 | # Perform k-fold cross validation on the shuffled vector of lm GE across samples 44 | # y.sample(frac = 1) this just shuffles the vector 45 | scores_rand = cross_val_score( 46 | clf, X, 
y.sample(frac=1), groups=group_labels, cv=split_obj, n_jobs=n_j 47 | ) 48 | return scores, scores_rand 49 | 50 | 51 | def lasso_cv_plus_model_selection(X0, y0, k, group_labels, rand_added_flag): 52 | """ 53 | X: CP data [perts/samples, features] 54 | y: lm gene expression value [perts/samples, 1 (feature value)] 55 | 56 | Returns: 57 | prediction scores, y permutated scores 58 | """ 59 | from sklearn import linear_model 60 | 61 | n_j = 3 62 | # build sklearn model 63 | clf = linear_model.Lasso(alpha=0.1, max_iter=1000) 64 | 65 | # k=np.unique(group_labels).shape[0] 66 | split_obj = GroupKFold(n_splits=k) 67 | # split_obj = LeaveOneGroupOut() 68 | # Perform k-fold cross validation 69 | 70 | # alphas = np.linspace(0, 0.02, 11) 71 | alphas1 = np.linspace(0, 0.2, 20) 72 | alphas2 = np.linspace(0.2, 0.5, 10)[1:] 73 | alphas = np.concatenate((alphas1, alphas2)) 74 | # alphas = np.logspace(-4, -0.5, 30) 75 | lasso_cv = linear_model.LassoCV( 76 | alphas=alphas, random_state=0, max_iter=1000, selection="random", n_jobs=k 77 | ) 78 | # lasso_cv = linear_model.LassoLarsCV(cv=5) 79 | X, y = X0.values, y0.values 80 | 81 | # scores=np.zeros(k,) 82 | scores = [] 83 | for train_index, test_index in split_obj.split(X, y, group_labels): 84 | # print("TRAIN:", train_index, "TEST:", test_index) 85 | X_train, X_test = X[train_index], X[test_index] 86 | y_train, y_test = y[train_index], y[test_index] 87 | 88 | lasso_cv.fit(X_train, y_train) 89 | scores.append(lasso_cv.score(X_test, y_test)) 90 | # print(lasso_cv.alpha_) 91 | 92 | # Perform k-fold cross validation on the shuffled vector of lm GE across samples 93 | # y.sample(frac = 1) this just shuffles the vector 94 | if rand_added_flag: 95 | scores_rand = cross_val_score( 96 | clf, X0, y0.sample(frac=1), groups=group_labels, cv=split_obj, n_jobs=n_j 97 | ) 98 | else: 99 | scores_rand = 0 100 | return np.array(scores), scores_rand 101 | 102 | 103 | def ridge_cv_plus_model_selection(X0, y0, k, group_labels, rand_added_flag): 104 | 105 | 
""" 106 | X: CP data [perts/samples, features] 107 | y: lm gene expression value [perts/samples, 1 (feature value)] 108 | 109 | Returns: 110 | prediction scores, y permutated scores 111 | """ 112 | 113 | from sklearn import linear_model 114 | 115 | n_j = 3 116 | # build sklearn model 117 | clf = linear_model.Ridge(alpha=0.1, max_iter=10000) 118 | 119 | # k=np.unique(group_labels).shape[0] 120 | split_obj = GroupKFold(n_splits=k) 121 | # split_obj = LeaveOneGroupOut() 122 | # Perform k-fold cross validation 123 | 124 | # alphas = np.linspace(0, 0.02, 11) 125 | alphas1 = np.linspace(0.1, 0.2, 10) 126 | alphas2 = np.linspace(0.2, 0.5, 10)[1:] 127 | alphas = np.concatenate((alphas1, alphas2)) 128 | # alphas = np.logspace(-4, -0.5, 30) 129 | lasso_cv = linear_model.RidgeCV(alphas) 130 | 131 | # X,y=X0,y0 132 | X, y = X0.values, y0.values 133 | 134 | # scores=np.zeros(k,) 135 | scores = [] 136 | for train_index, test_index in split_obj.split(X, y, group_labels): 137 | # print("TRAIN:", train_index, "TEST:", test_index) 138 | X_train, X_test = X[train_index], X[test_index] 139 | y_train, y_test = y[train_index], y[test_index] 140 | 141 | lasso_cv.fit(X_train, y_train) 142 | scores.append(lasso_cv.score(X_test, y_test)) 143 | # print(lasso_cv.alpha_) 144 | 145 | # Perform k-fold cross validation on the shuffled vector of lm GE across samples 146 | # y.sample(frac = 1) this just shuffles the vector 147 | if rand_added_flag: 148 | scores_rand = cross_val_score( 149 | clf, X0, y0.sample(frac=1), groups=group_labels, cv=split_obj, n_jobs=n_j 150 | ) 151 | else: 152 | scores_rand = 0 153 | return np.array(scores), scores_rand 154 | 155 | 156 | ########################## MLP 157 | # def MLP_cv(X,y,k,group_labels): 158 | # from sklearn.neural_network import MLPRegressor 159 | 160 | # n_j=-1 161 | # # hidden_layer_sizes=100, 162 | # # hidden_layer_sizes = (50, 20, 10) 163 | # regr = MLPRegressor(random_state=1,hidden_layer_sizes = (100), 
max_iter=10000,activation='tanh',early_stopping=True) 164 | 165 | # split_obj=GroupKFold(n_splits=k) 166 | # # Perform k-fold cross validation 167 | # scores = cross_val_score(regr, X, y, groups=group_labels,cv=split_obj,n_jobs=n_j) 168 | 169 | # # Perform k-fold cross validation on the shuffled vector of lm GE across samples 170 | # # y.sample(frac = 1) this just shuffles the vector 171 | # scores_rand = cross_val_score(regr, X, y.sample(frac = 1) ,groups=group_labels,cv=split_obj,n_jobs=n_j) 172 | # return scores, scores_rand 173 | # X is train samples and y is the corresponding labels 174 | 175 | 176 | def MLP_cv(X, y, k, group_labels, rand_added_flag): 177 | from sklearn.neural_network import MLPRegressor 178 | 179 | n_j = -1 180 | # hidden_layer_sizes=100, 181 | # hidden_layer_sizes = (50, 20, 10) 182 | regr = MLPRegressor( 183 | hidden_layer_sizes=(50, 10), 184 | activation="logistic", 185 | alpha=0.01, 186 | early_stopping=True, 187 | ) 188 | 189 | split_obj = GroupKFold(n_splits=k) 190 | # Perform k-fold cross validation 191 | scores = cross_val_score(regr, X, y, groups=group_labels, cv=split_obj, n_jobs=n_j) 192 | 193 | # Perform k-fold cross validation on the shuffled vector of lm GE across samples 194 | # y.sample(frac = 1) this just shuffles the vector 195 | 196 | if rand_added_flag: 197 | scores_rand = cross_val_score( 198 | regr, X, y.sample(frac=1), groups=group_labels, cv=split_obj, n_jobs=n_j 199 | ) 200 | else: 201 | scores_rand = 0 202 | 203 | return scores, scores_rand 204 | 205 | 206 | def MLP_cv_plus_model_selection(X0, y0, k, group_labels, rand_added_flag): 207 | n_j = -1 208 | # hidden_layer_sizes=100, 209 | # hidden_layer_sizes = (50, 20, 10) 210 | # regr = MLPRegressor(hidden_layer_sizes = (50,10),activation='logistic',\ 211 | # alpha=0.01,early_stopping=True) 212 | 213 | mlp_gs = MLPRegressor( 214 | random_state=0, 215 | early_stopping=True, 216 | n_iter_no_change=4, 217 | learning_rate="adaptive", 218 | ) 219 | 220 | split_obj = 
GroupKFold(n_splits=k) 221 | # Perform k-fold cross validation 222 | # scores = cross_val_score(regr, X, y, groups=group_labels,cv=split_obj,n_jobs=n_j) 223 | 224 | # mlp_gs = MLPClassifier(max_iter=100) 225 | # parameter_space = { 226 | # 'hidden_layer_sizes': [(50,),(200,),(500,),(10,30,10),(50,10),(50,10,10)], 227 | # 'activation': ['tanh', 'relu','logistic'], 228 | # 'alpha': [0.0001, 0.05,0.01,0.1,0.2], 229 | # 'early_stopping':[True,False] 230 | # } 231 | 232 | parameter_space = { 233 | "max_iter": [10, 100, 300, 500], 234 | "hidden_layer_sizes": [ 235 | (32, 64), 236 | (64, 32), 237 | (50, 10), 238 | (50, 10, 10), 239 | (20, 10), 240 | (), 241 | ], # (50,5),(50,),(10,) 242 | "activation": ["logistic", "tanh"], 243 | "alpha": [0.0005, 0.01, 0.3, 1, 2, 3, 4, 5, 6, 7], 244 | # 'learning_rate': ['constant','adaptive'] 245 | # 'early_stopping':[True,False] 246 | } 247 | 248 | clf = GridSearchCV(mlp_gs, parameter_space, n_jobs=k, cv=4) 249 | 250 | X, y = X0, y0.values 251 | 252 | scores = [] 253 | for train_index, test_index in split_obj.split(X, y, group_labels): 254 | # print("TRAIN:", train_index, "TEST:", test_index) 255 | X_train, X_test = X[train_index], X[test_index] 256 | y_train, y_test = y[train_index], y[test_index] 257 | 258 | clf.fit(X_train, y_train) 259 | # clf.fit(X, y) 260 | scores.append(clf.best_estimator_.score(X_test, y_test)) 261 | print(clf.best_params_) 262 | 263 | # Perform k-fold cross validation on the shuffled vector of lm GE across samples 264 | # y.sample(frac = 1) this just shuffles the vector 265 | # scores_rand=0 266 | 267 | if rand_added_flag: 268 | scores_rand = cross_val_score( 269 | mlp_gs, X, y0.sample(frac=1), groups=group_labels, cv=split_obj, n_jobs=n_j 270 | ) 271 | else: 272 | scores_rand = 0 273 | return scores, scores_rand 274 | 275 | 276 | def MLP_cv_plus_model_selection_keras(X0, y0, k, group_labels, rand_added_flag): 277 | from keras.models import Sequential 278 | from keras.layers import Dense, Conv1D, Flatten, 
Dropout 279 | from sklearn.metrics import mean_squared_error, r2_score 280 | from keras.callbacks import EarlyStopping 281 | from keras import backend as K 282 | 283 | X = X0.reshape(X0.shape[0], X0.shape[1], 1) 284 | y = y0.values 285 | # model.summary() 286 | model = Sequential() 287 | model.add(Dense(16, activation="relu", input_shape=(X0.shape[1], 1))) 288 | # model.add(Conv1D(32, 2, activation="relu", input_shape=(X0.shape[1],1))) 289 | model.add(Flatten()) 290 | model.add(Dropout(0.6)) 291 | model.add(Dense(64, activation="relu")) 292 | model.add(Dropout(0.2)) 293 | model.add(Dense(1)) 294 | model.compile(loss="mse", optimizer="adam") # ,metrics=[coeff_determination]) 295 | # model.compile(loss=coeff_determination, optimizer="adam")#,metrics=[coeff_determination]) 296 | 297 | es = EarlyStopping(monitor="val_loss", mode="min", verbose=0, patience=10) 298 | 299 | Wsave = model.get_weights() 300 | 301 | split_obj = GroupKFold(n_splits=k) 302 | 303 | scores = [] 304 | for train_index, test_index in split_obj.split(X, y, group_labels): 305 | # print("TRAIN:", train_index, "TEST:", test_index) 306 | X_train, X_test = X[train_index], X[test_index] 307 | y_train, y_test = y[train_index], y[test_index] 308 | 309 | XTraining, XValidation, YTraining, YValidation = train_test_split( 310 | X_train, y_train, test_size=0.1 311 | ) 312 | 313 | model.set_weights(Wsave) 314 | model.fit( 315 | XTraining, 316 | YTraining, 317 | batch_size=XTraining.shape[0], 318 | epochs=1000, 319 | validation_data=(XValidation, YValidation), 320 | callbacks=[es], 321 | verbose=0, 322 | ) 323 | ypred = model.predict(X_test) 324 | 325 | scores.append(r2_score(y_test, ypred)) 326 | 327 | return scores, 0 328 | 329 | 330 | def SVR_cv_plus_model_selection(X0, y0, k, group_labels, rand_added_flag): 331 | n_j = -1 332 | # hidden_layer_sizes=100, 333 | # hidden_layer_sizes = (50, 20, 10) 334 | # regr = MLPRegressor(hidden_layer_sizes = (50,10),activation='logistic',\ 335 | # 
alpha=0.01,early_stopping=True) 336 | 337 | svr_gs = SVR(epsilon=0.2) 338 | 339 | split_obj = GroupKFold(n_splits=k) 340 | # Perform k-fold cross validation 341 | # scores = cross_val_score(regr, X, y, groups=group_labels,cv=split_obj,n_jobs=n_j) 342 | 343 | # mlp_gs = MLPClassifier(max_iter=100) 344 | # parameter_space = { 345 | # 'hidden_layer_sizes': [(50,),(200,),(500,),(10,30,10),(50,10),(50,10,10)], 346 | # 'activation': ['tanh', 'relu','logistic'], 347 | # 'alpha': [0.0001, 0.05,0.01,0.1,0.2], 348 | # 'early_stopping':[True,False] 349 | # } 350 | 351 | parameter_space = { 352 | "kernel": ("poly", "rbf", "sigmoid"), 353 | "C": [1, 2, 3, 5, 20, 100, 500, 1000], # (50,5),(50,),(10,) 354 | "degree": [1, 2, 3, 4], 355 | "coef0": [0.01, 0.5, 1, 10], 356 | "gamma": ("auto", "scale"), 357 | # 'epsilon':[0.1,0.2,0.5,0.3] 358 | # 'early_stopping':[True,False] 359 | } 360 | 361 | clf = GridSearchCV(svr_gs, parameter_space, n_jobs=k, cv=4) 362 | 363 | X, y = X0, y0.values 364 | 365 | scores = [] 366 | for train_index, test_index in split_obj.split(X, y, group_labels): 367 | # print("TRAIN:", train_index, "TEST:", test_index) 368 | X_train, X_test = X[train_index], X[test_index] 369 | y_train, y_test = y[train_index], y[test_index] 370 | 371 | clf.fit(X_train, y_train) 372 | # clf.fit(X, y) 373 | scores.append(clf.best_estimator_.score(X_test, y_test)) 374 | print(clf.best_params_) 375 | 376 | # Perform k-fold cross validation on the shuffled vector of lm GE across samples 377 | # y.sample(frac = 1) this just shuffles the vector 378 | # scores_rand=0 379 | 380 | if rand_added_flag: 381 | scores_rand = cross_val_score( 382 | svr_gs, X, y0.sample(frac=1), groups=group_labels, cv=split_obj, n_jobs=n_j 383 | ) 384 | else: 385 | scores_rand = 0 386 | return scores, scores_rand 387 | 388 | 389 | def MLP_cv_plus_model_selection_rand_test(X0, y0, k, group_labels, rand_added_flag): 390 | n_j = -1 391 | # hidden_layer_sizes=100, 392 | # hidden_layer_sizes = (50, 20, 10) 393 | # 
regr = MLPRegressor(hidden_layer_sizes = (50,10),activation='logistic',\ 394 | # alpha=0.01,early_stopping=True) 395 | 396 | mlp_gs = MLPRegressor(random_state=0, early_stopping=True, n_iter_no_change=20) 397 | 398 | split_obj = GroupKFold(n_splits=k) 399 | # Perform k-fold cross validation 400 | # scores = cross_val_score(regr, X, y, groups=group_labels,cv=split_obj,n_jobs=n_j) 401 | 402 | # mlp_gs = MLPClassifier(max_iter=100) 403 | # parameter_space = { 404 | # 'hidden_layer_sizes': [(50,),(200,),(500,),(10,30,10),(50,10),(50,10,10)], 405 | # 'activation': ['tanh', 'relu','logistic'], 406 | # 'alpha': [0.0001, 0.05,0.01,0.1,0.2], 407 | # 'early_stopping':[True,False] 408 | # } 409 | 410 | parameter_space = { 411 | "max_iter": [10, 100, 300, 500], 412 | "hidden_layer_sizes": [ 413 | (32, 64), 414 | (64, 32), 415 | (50, 10), 416 | (50, 10, 10), 417 | (20, 10), 418 | ], # (50,5),(50,),(10,) 419 | "activation": ["logistic", "tanh"], 420 | "alpha": [0.0005, 0.01, 0.3, 1, 2], 421 | # 'learning_rate': ['constant','adaptive'] 422 | # 'early_stopping':[True,False] 423 | } 424 | 425 | clf = GridSearchCV(mlp_gs, parameter_space, n_jobs=k, cv=4) 426 | 427 | X, y = X0, y0.values 428 | 429 | scores = [] 430 | for train_index, test_index in split_obj.split(X, y, group_labels): 431 | # print("TRAIN:", train_index, "TEST:", test_index) 432 | X_train, X_test = X[train_index], X[test_index] 433 | y_train, y_test = y[train_index], y[test_index] 434 | 435 | clf.fit(X_train, y_train) 436 | # clf.fit(X, y) 437 | scores.append(clf.best_estimator_.score(X_test, y_test)) 438 | print(clf.best_params_) 439 | 440 | # Perform k-fold cross validation on the shuffled vector of lm GE across samples 441 | # y.sample(frac = 1) this just shuffles the vector 442 | # scores_rand=0 443 | 444 | if rand_added_flag: 445 | scores_rand = cross_val_score( 446 | mlp_gs, X, y0.sample(frac=1), groups=group_labels, cv=split_obj, n_jobs=n_j 447 | ) 448 | else: 449 | scores_rand = 0 450 | return scores, 
scores_rand 451 | 452 | 453 | def MLP_cv_plus_model_selection_taorf(X0, y0, k, group_labels, rand_added_flag): 454 | from sklearn.neural_network import MLPRegressor 455 | 456 | n_j = -1 457 | # hidden_layer_sizes=100, 458 | # hidden_layer_sizes = (50, 20, 10) 459 | # regr = MLPRegressor(hidden_layer_sizes = (50,10),activation='logistic',\ 460 | # alpha=0.01,early_stopping=True) 461 | 462 | mlp_gs = MLPRegressor(random_state=0, max_iter=1000) 463 | 464 | split_obj = GroupKFold(n_splits=k) 465 | # Perform k-fold cross validation 466 | # scores = cross_val_score(regr, X, y, groups=group_labels,cv=split_obj,n_jobs=n_j) 467 | 468 | # mlp_gs = MLPClassifier(max_iter=100) 469 | # parameter_space = { 470 | # 'hidden_layer_sizes': [(50,),(200,),(500,),(10,30,10),(50,10),(50,10,10)], 471 | # 'activation': ['tanh', 'relu','logistic'], 472 | # 'alpha': [0.0001, 0.05,0.01,0.1,0.2], 473 | # 'early_stopping':[True,False] 474 | # } 475 | 476 | parameter_space = { 477 | "hidden_layer_sizes": [(50,), (10, 30, 10), (50, 10), (50, 10, 10)], 478 | "activation": ["tanh", "relu", "logistic"], 479 | "alpha": [0.0001, 0.05, 0.01, 0.2, 0.5, 0.7], 480 | "learning_rate": ["constant", "adaptive"] 481 | # 'early_stopping':[True,False] 482 | } 483 | 484 | # parameter_space = { 485 | # 'hidden_layer_sizes': [(50,),(10,),(50,10),(50,10,10)], 486 | # 'activation': ['tanh', 'relu','logistic'], 487 | # 'alpha': [0.05,0.01,0.2,0.5], 488 | # # 'early_stopping':[True,False] 489 | # } 490 | 491 | clf = GridSearchCV(mlp_gs, parameter_space, n_jobs=6, cv=2) 492 | 493 | X, y = X0.values, y0.values 494 | 495 | scores = [] 496 | for train_index, test_index in split_obj.split(X, y, group_labels): 497 | # print("TRAIN:", train_index, "TEST:", test_index) 498 | X_train, X_test = X[train_index], X[test_index] 499 | y_train, y_test = y[train_index], y[test_index] 500 | 501 | clf.fit(X_train, y_train) 502 | # clf.fit(X, y) 503 | scores.append(clf.score(X_test, y_test)) 504 | print(clf.best_params_) 505 | 506 | # 
Perform k-fold cross validation on the shuffled vector of lm GE across samples 507 | # y.sample(frac = 1) this just shuffles the vector 508 | 509 | # scores_rand=0 510 | 511 | if rand_added_flag: 512 | scores_rand = cross_val_score( 513 | mlp_gs, X, y0.sample(frac=1), groups=group_labels, cv=split_obj, n_jobs=n_j 514 | ) 515 | else: 516 | scores_rand = 0 517 | return scores, scores_rand 518 | 519 | 520 | # from sklearn.model_selection import RandomizedSearchCV 521 | # # Number of trees in random forest 522 | # n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)] 523 | # # Number of features to consider at every split 524 | # max_features = ['auto', 'sqrt'] 525 | # # Maximum number of levels in tree 526 | # max_depth = [int(x) for x in np.linspace(10, 110, num = 11)] 527 | # max_depth.append(None) 528 | # # Minimum number of samples required to split a node 529 | # min_samples_split = [2, 5, 10] 530 | # # Minimum number of samples required at each leaf node 531 | # min_samples_leaf = [1, 2, 4] 532 | # # Method of selecting samples for training each tree 533 | # bootstrap = [True, False] 534 | # # Create the random grid 535 | # random_grid = {'n_estimators': n_estimators, 536 | # 'max_features': max_features, 537 | # 'max_depth': max_depth, 538 | # 'min_samples_split': min_samples_split, 539 | # 'min_samples_leaf': min_samples_leaf, 540 | # 'bootstrap': bootstrap} 541 | # pprint(random_grid) 542 | 543 | 544 | ########################## Random Forest 545 | def RFR_cv_plus_model_selection(X0, y0, k, group_labels, rand_added_flag): 546 | from sklearn.ensemble import RandomForestRegressor 547 | from sklearn.model_selection import GridSearchCV 548 | 549 | n_j = -1 550 | 551 | # parameter_space ={'bootstrap': [True, False],\ 552 | # 'max_depth': [10, 20, 40, 50, 100, None],\ 553 | # 'max_features': ['auto', 'sqrt'],\ 554 | # 'min_samples_leaf': [1, 2, 4],\ 555 | # 'min_samples_split': [2, 5, 10],\ 556 | # 'n_estimators': [200, 400, 600, 800, 
1000]} 557 | 558 | parameter_space = { 559 | "max_depth": [10, 20, None], 560 | "min_samples_leaf": [1, 4], 561 | "min_samples_split": [2, 5, 10], 562 | } 563 | 564 | rfr_gs = RandomForestRegressor(bootstrap=True, max_features="auto") 565 | 566 | split_obj = GroupKFold(n_splits=k) 567 | # Perform k-fold cross validation 568 | # scores = cross_val_score(regr, X, y, groups=group_labels,cv=split_obj,n_jobs=n_j) 569 | 570 | # mlp_gs = MLPClassifier(max_iter=100) 571 | 572 | clf = GridSearchCV(rfr_gs, parameter_space, n_jobs=-1, cv=2) 573 | 574 | X, y = X0.values, y0.values 575 | 576 | scores = [] 577 | for train_index, test_index in split_obj.split(X, y, group_labels): 578 | # print("TRAIN:", train_index, "TEST:", test_index) 579 | X_train, X_test = X[train_index], X[test_index] 580 | y_train, y_test = y[train_index], y[test_index] 581 | 582 | # lasso_cv.fit(X_train, y_train) 583 | clf.fit(X, y) 584 | scores.append(clf.score(X_test, y_test)) 585 | print(clf.best_params_) 586 | 587 | # Perform k-fold cross validation on the shuffled vector of lm GE across samples 588 | # y.sample(frac = 1) this just shuffles the vector 589 | scores_rand = cross_val_score( 590 | rfr_gs, X0, y0.sample(frac=1), groups=group_labels, cv=split_obj, n_jobs=n_j 591 | ) 592 | # scores_rand=0 593 | return scores, scores_rand 594 | 595 | 596 | ############################## Feature Ranking ######################### 597 | def linear_model_feature_ranking(X0, y0, k, group_labels, l1k_features_gn): 598 | """ 599 | X: CP data [perts/samples, features] 600 | y: lm gene expression value [perts/samples, 1 (feature value)] 601 | 602 | Returns: 603 | prediction scores, y permutated scores 604 | """ 605 | from sklearn import linear_model 606 | from sklearn.feature_selection import SelectKBest 607 | from sklearn.feature_selection import mutual_info_regression 608 | 609 | n_j = 3 610 | # build sklearn model 611 | # clf = linear_model.Lasso(alpha=0.1,max_iter=10000) 612 | clf = linear_model.LinearRegression() 
613 | 614 | # k=np.unique(group_labels).shape[0] 615 | 616 | split_obj = GroupKFold(n_splits=k) 617 | # split_obj = LeaveOneGroupOut() 618 | # Perform k-fold cross validation 619 | 620 | # alphas = np.linspace(0, 0.02, 11) 621 | alphas1 = np.linspace(0, 0.2, 20) 622 | alphas2 = np.linspace(0.2, 0.5, 10)[1:] 623 | alphas = np.concatenate((alphas1, alphas2)) 624 | # alphas = np.logspace(-4, -0.5, 30) 625 | # lasso_cv = linear_model.LassoCV(alphas=alphas, random_state=0, max_iter=1000,selection='random') 626 | 627 | X, y = X0.values, y0.values 628 | 629 | fs = SelectKBest(score_func=mutual_info_regression, k="all") 630 | fs.fit(X, y) 631 | 632 | clf.fit(X, y) 633 | return clf.coef_, fs.scores_ 634 | 635 | 636 | # return ranking(np.abs(lasso_cv.coef_), l1k_features_gn) 637 | 638 | 639 | ranks = {} 640 | 641 | 642 | # Create our function which stores the feature rankings to the ranks dictionary 643 | def ranking(ranks, names, order=1): 644 | minmax = preprocessing.MinMaxScaler() 645 | ranks = minmax.fit_transform(order * np.array([ranks]).T).T[0] 646 | ranks = map(lambda x: round(x, 2), ranks) 647 | return dict(zip(names, ranks)) 648 | -------------------------------------------------------------------------------- /utils/readProfiles.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.spatial 3 | import pandas as pd 4 | import sklearn.decomposition 5 | from sklearn import preprocessing 6 | from sklearn.metrics import pairwise_distances 7 | 8 | # from utils.normalize_funcs import standardize_per_catX 9 | # from normalize_funcs import standardize_per_catX 10 | 11 | #'dataset_name',['folder_name',[cp_pert_col_name,l1k_pert_col_name],[cp_control_val,l1k_control_val]] 12 | ds_info_dict = { 13 | "CDRP": ["CDRP-BBBC047-Bray", ["Metadata_Sample_Dose", "pert_sample_dose"]], 14 | "CDRP-bio": ["CDRPBIO-BBBC036-Bray", ["Metadata_Sample_Dose", "pert_sample_dose"]], 15 | "TAORF": [ 16 | "TA-ORF-BBBC037-Rohban", 
17 | [ 18 | "Metadata_broad_sample", 19 | "pert_id", 20 | ], 21 | ], 22 | "LUAD": ["LUAD-BBBC041-Caicedo", ["x_mutation_status", "allele"]], 23 | "LINCS": ["LINCS-Pilot1", ["Metadata_pert_id_dose", "pert_id_dose"]], 24 | } 25 | 26 | labelCol = "PERT" 27 | 28 | 29 | ################################################################################ 30 | def read_replicate_level_profiles( 31 | dataset_rootDir, dataset, profileType, per_plate_normalized_flag 32 | ): 33 | """ 34 | Reads replicate level CSV files in the form of a dataframe 35 | Extract measurments column names for each modalities 36 | Remove columns with low variance (null_vals_ratio) 38 | 39 | Inputs: 40 | dataset_rootDir: datasets root dir 41 | dataset: any from the available list of ['LUAD', 'TAORF', 'LINCS', 'CDRP-bio', 'CDRP'] 42 | profileType: Cell Painting profile type that can be 'augmented' , 'normalized', 'normalized_variable_selected' 43 | per_plate_normalized_flag: if True it will standardize data per plate 44 | 45 | Output: 46 | cp_data_repLevel, l1k_data_repLevel: dataframes with all the annotations available in the raw data 47 | """ 48 | 49 | dataDir = dataset_rootDir + "/preprocessed_data/" + ds_info_dict[dataset][0] + "/" 50 | 51 | cp_data_repLevel = pd.read_csv( 52 | dataDir + "/CellPainting/replicate_level_cp_" + profileType + ".csv.gz" 53 | ) 54 | l1k_data_repLevel = pd.read_csv(dataDir + "/L1000/replicate_level_l1k.csv.gz") 55 | 56 | cp_features, l1k_features = extract_feature_names( 57 | cp_data_repLevel, l1k_data_repLevel 58 | ) 59 | 60 | ########## removes nan and inf values 61 | l1k_data_repLevel = l1k_data_repLevel.replace([np.inf, -np.inf], np.nan) 62 | cp_data_repLevel = cp_data_repLevel.replace([np.inf, -np.inf], np.nan) 63 | 64 | # 65 | null_vals_ratio = 0.05 66 | thrsh_std = 0.0001 67 | cols2remove_manyNulls = [ 68 | i 69 | for i in cp_features 70 | if (cp_data_repLevel[i].isnull().sum(axis=0) / cp_data_repLevel.shape[0]) 71 | > null_vals_ratio 72 | ] 73 | cols2remove_lowVars 
def extract_feature_names(cp_data_repLevel, l1k_data_repLevel):
    """
    Pick the measurement columns of each modality out of the column names.

    Cell Painting features are the columns whose name contains 'Cells_',
    'Cytoplasm_' or 'Nuclei_'; L1000 features are the columns whose name
    contains '_at'.

    Inputs:
        cp_data_repLevel, l1k_data_repLevel: dataframes with all the
        annotations available in the raw data

    Outputs: (cp_features, l1k_features), two lists of column names in the
        order in which they appear in the dataframes
    """
    cp_columns = cp_data_repLevel.columns
    l1k_columns = l1k_data_repLevel.columns

    is_cp_feature = cp_columns.str.contains("Cells_|Cytoplasm_|Nuclei_")
    is_l1k_feature = l1k_columns.str.contains("_at")

    return cp_columns[is_cp_feature].tolist(), l1k_columns[is_l1k_feature].tolist()
def extract_metadata_column_names(cp_data, l1k_data):
    """
    Pick the metadata (non measurement) columns of each modality.

    This is the complement of extract_feature_names: every Cell Painting
    column whose name does not contain 'Cells_', 'Cytoplasm_' or 'Nuclei_',
    and every L1000 column whose name does not contain '_at'.

    Inputs:
        cp_data, l1k_data: dataframes (any level of data)

    Outputs: (cp_meta_col_names, l1k_meta_col_names), two lists of column names
    """
    cp_columns = cp_data.columns
    l1k_columns = l1k_data.columns

    cp_is_feature = cp_columns.str.contains("Cells_|Cytoplasm_|Nuclei_")
    l1k_is_feature = l1k_columns.str.contains("_at")

    return cp_columns[~cp_is_feature].tolist(), l1k_columns[~l1k_is_feature].tolist()


################################################################################
def read_treatment_level_profiles(
    dataset_rootDir,
    dataset,
    profileType,
    filter_repCorr_params,
    per_plate_normalized_flag,
):
    """
    Build treatment level profiles for both modalities of a dataset.

    Steps:
      - read the replicate level profiles (read_replicate_level_profiles)
      - rename the perturbation id column of each modality to 'PERT'
      - optionally keep only perturbations with high replicate correlation
        (plus 'negcon'), using highRepFinder
      - average the replicates of every perturbation
      - merge the per dataset metadata columns back onto the averaged profiles

    Inputs:
        dataset_rootDir: datasets root dir
        dataset: key of ds_info_dict, e.g. 'LUAD', 'TAORF', 'LINCS', 'CDRP-bio', 'CDRP'
        profileType: Cell Painting profile type, e.g. 'augmented', 'normalized',
            'normalized_variable_selected'
        filter_repCorr_params: [filter_perts, repCorrFilePath]; filter_perts
            'highRepOverlap' keeps the intersection and 'highRepUnion' the
            union of high replicate correlation perturbations, any other
            value applies no filter
        per_plate_normalized_flag: True for scaling per plate

    Output:
        [cp_data_treatLevel, cp_features], [l1k_data_treatLevel, l1k_features]
        each is a list of dataframe and feature names for one modality
    """
    pert_col = "PERT"

    filter_perts = filter_repCorr_params[0]
    repCorrFilePath = filter_repCorr_params[1]

    cp_part, l1k_part = read_replicate_level_profiles(
        dataset_rootDir, dataset, profileType, per_plate_normalized_flag
    )
    cp_rep, cp_features = cp_part
    l1k_rep, l1k_features = l1k_part

    # rename the dataset specific perturbation columns to the shared name
    pert_col_names = ds_info_dict[dataset][1]
    cp_rep = cp_rep.rename(columns={pert_col_names[0]: pert_col})
    l1k_rep = l1k_rep.rename(columns={pert_col_names[1]: pert_col})

    # print some data statistics
    print(
        dataset + ": Replicate Level Shapes (nSamples x nFeatures): cp: ",
        cp_rep.shape[0],
        ",",
        len(cp_features),
        ", l1k: ",
        l1k_rep.shape[0],
        ",",
        len(l1k_features),
    )
    print("l1k n of rep: ", l1k_rep.groupby([pert_col]).size().median())
    print("cp n of rep: ", cp_rep.groupby([pert_col]).size().median())

    # remove perts with low rep corr
    if filter_perts == "highRepOverlap":
        rep_filter = "intersection"
    elif filter_perts == "highRepUnion":
        rep_filter = "union"
    else:
        rep_filter = None

    if rep_filter is not None:
        highRepPerts = highRepFinder(dataset, rep_filter, repCorrFilePath) + ["negcon"]
        cp_rep = cp_rep[cp_rep["PERT"].isin(highRepPerts)].reset_index()
        l1k_rep = l1k_rep[l1k_rep["PERT"].isin(highRepPerts)].reset_index()

    # metadata columns kept per dataset: dataset -> [[cp columns], [l1k columns]]
    meta_dict = {
        "CDRP": [["Metadata_moa", "Metadata_target"], []],
        "CDRP-bio": [["Metadata_moa", "Metadata_target"], []],
        "TAORF": [[], []],
        "LUAD": [[], []],
        "LINCS": [["Metadata_moa", "Metadata_alternative_moa"], ["moa"]],
    }
    cp_meta_cols, l1k_meta_cols = meta_dict[dataset]

    def _treatment_level(rep_df, features, meta_cols):
        # average the replicates, then attach the de-duplicated metadata rows
        averaged = rep_df.groupby(pert_col)[features].mean().reset_index()
        meta = rep_df[[pert_col] + meta_cols].drop_duplicates().reset_index(drop=True)
        return pd.merge(averaged, meta, how="inner", on=[pert_col])

    l1k_data_treatLevel = _treatment_level(l1k_rep, l1k_features, l1k_meta_cols)
    cp_data_treatLevel = _treatment_level(cp_rep, cp_features, cp_meta_cols)

    return [cp_data_treatLevel, cp_features], [l1k_data_treatLevel, l1k_features]
def read_paired_treatment_level_profiles(
    dataset_rootDir,
    dataset,
    profileType,
    filter_repCorr_params,
    per_plate_normalized_flag,
):
    """
    Read the treatment level profiles of both modalities and pair them.

    The two treatment level dataframes returned by
    read_treatment_level_profiles are inner-merged on the module level
    perturbation column (labelCol).

    Inputs:
        dataset_rootDir: datasets root dir
        dataset: e.g. 'LUAD', 'TAORF', 'LINCS', 'CDRP-bio', 'CDRP'
        profileType: Cell Painting profile type, e.g. 'augmented', 'normalized',
            'normalized_variable_selected'
        filter_repCorr_params: [filter_perts, repCorrFilePath], forwarded as is
        per_plate_normalized_flag: True for scaling per plate

    Output:
        mergedProfiles_treatLevel: paired treatment level profiles
        cp_features, l1k_features: list of feature names for each modality
    """
    cp_part, l1k_part = read_treatment_level_profiles(
        dataset_rootDir,
        dataset,
        profileType,
        filter_repCorr_params,
        per_plate_normalized_flag,
    )
    cp_data_treatLevel, cp_features = cp_part
    l1k_data_treatLevel, l1k_features = l1k_part

    mergedProfiles_treatLevel = pd.merge(
        cp_data_treatLevel, l1k_data_treatLevel, how="inner", on=[labelCol]
    )

    print(
        "Treatment Level Shapes (nSamples x nFeatures+metadata):",
        cp_data_treatLevel.shape,
        l1k_data_treatLevel.shape,
        "Merged Profiles Shape:",
        mergedProfiles_treatLevel.shape,
    )

    return mergedProfiles_treatLevel, cp_features, l1k_features


################################################################################
def generate_random_match_of_replicate_pairs(cp_data_repLevel, l1k_data_repLevel, nRep):
    """
    Pair replicate level profiles of the two modalities by perturbation.

    There is no one-to-one match at the replicate level, so we either:
        - form ALL the possible pairs per perturbation (nRep='all' - string)
        - randomly sample at most nRep replicates per perturbation in each
          modality (without replacement) and form all pairs of the sampled
          rows (nRep -> int)

    Inputs:
        cp_data_repLevel, l1k_data_repLevel: replicate level dataframes, both
            holding a 'PERT' column
        nRep: 'all' or the maximum number of replicates sampled per perturbation

    Outputs:
        inner merge (on 'PERT') of the selected replicate level profiles
    """
    labelCol = "PERT"

    def _sample_up_to(rep_df, n_max):
        # Iterating over the groups (instead of GroupBy.apply) keeps the
        # grouping column in every sub-frame on all pandas versions; apply
        # stops passing the grouping column in recent pandas releases.
        sampled = [
            group.sample(n=min(n_max, group.shape[0]))
            for _, group in rep_df.groupby(labelCol)
        ]
        if not sampled:
            return rep_df.iloc[0:0].reset_index(drop=True)
        return pd.concat(sampled, ignore_index=True)

    if nRep == "all":
        cp_data_n_repLevel = cp_data_repLevel.copy()
        l1k_data_n_repLevel = l1k_data_repLevel.copy()
    else:
        cp_data_n_repLevel = _sample_up_to(cp_data_repLevel, nRep)
        l1k_data_n_repLevel = _sample_up_to(l1k_data_repLevel, nRep)

    mergedProfiles_repLevel = pd.merge(
        cp_data_n_repLevel, l1k_data_n_repLevel, how="inner", on=[labelCol]
    )

    return mergedProfiles_repLevel
def highRepFinder(dataset, how, repCorrFilePath):
    """
    Select the perturbations with high replicate correlation.

    Reads the pre calculated Replicate Correlation Excel file (one sheet per
    modality and dataset, named 'cp-<dataset>' and 'l1k-<dataset>' with the
    dataset name lower-cased) and combines the high quality perturbations of
    both modalities with one of the following filters:
        - intersection: high quality in both modalities
        - union: high quality in at least one modality

    * A perturbation is high quality when its 'RepCor' value is larger than
      the 'Rand90Perc' value of the same row. The perturbation names are read
      from the 'Unnamed: 0' column.

    Inputs:
        dataset (str): dataset name
        how (str): 'intersection' or 'union'
        repCorrFilePath (str): path of the Excel file

    Output: list of high quality perturbations

    Raises:
        ValueError: if `how` is not one of the two supported filters
    """
    # Validate before the (slow) Excel read so a typo fails fast and clearly.
    if how not in ("intersection", "union"):
        raise ValueError(
            "how must be 'intersection' or 'union', got {!r}".format(how)
        )

    repCorDF = pd.read_excel(repCorrFilePath, sheet_name=None)
    sheet_suffix = dataset.lower()
    cpRepDF = repCorDF["cp-" + sheet_suffix]
    l1kRepDF = repCorDF["l1k-" + sheet_suffix]

    cpHighList = cpRepDF.loc[
        cpRepDF["RepCor"] > cpRepDF["Rand90Perc"], "Unnamed: 0"
    ].tolist()
    l1kHighList = l1kRepDF.loc[
        l1kRepDF["RepCor"] > l1kRepDF["Rand90Perc"], "Unnamed: 0"
    ].tolist()

    print("CP: from ", cpRepDF.shape[0], " to ", len(cpHighList))
    print("l1k: from ", l1kRepDF.shape[0], " to ", len(l1kHighList))

    if how == "intersection":
        highRepPerts = list(set(l1kHighList) & set(cpHighList))
        print("CP and l1k high rep overlap: ", len(highRepPerts))
    else:
        highRepPerts = list(set(l1kHighList) | set(cpHighList))
        print("CP and l1k high rep union: ", len(highRepPerts))

    return highRepPerts
def read_paired_replicate_level_profiles(
    dataset_rootDir,
    dataset,
    profileType,
    nRep,
    filter_repCorr_params,
    per_plate_normalized_flag,
):
    """
    Read the replicate level profiles of both modalities and pair them.

    Steps:
      - read the replicate level profiles (read_replicate_level_profiles)
      - rename the perturbation id column of each modality to the module
        level labelCol
      - optionally keep only perturbations with high replicate correlation
        (plus 'negcon'), using highRepFinder
      - pair the replicates of the two modalities with
        generate_random_match_of_replicate_pairs

    Inputs:
        dataset_rootDir: datasets root dir
        dataset: key of ds_info_dict, e.g. 'LUAD', 'TAORF', 'LINCS', 'CDRP-bio', 'CDRP'
        profileType: Cell Painting profile type, e.g. 'augmented', 'normalized',
            'normalized_variable_selected'
        nRep: 'all' or the number of replicates sampled per perturbation
        filter_repCorr_params: [filter_perts, repCorrFilePath]; filter_perts
            'highRepOverlap' keeps the intersection and 'highRepUnion' the
            union of high replicate correlation perturbations, any other
            value applies no filter
        per_plate_normalized_flag: True for scaling per plate

    Output:
        mergedProfiles_repLevel: paired replicate level profiles
        cp_features, l1k_features: list of feature names for each modality
    """
    filter_perts = filter_repCorr_params[0]
    repCorrFilePath = filter_repCorr_params[1]

    cp_part, l1k_part = read_replicate_level_profiles(
        dataset_rootDir, dataset, profileType, per_plate_normalized_flag
    )
    cp_rep, cp_features = cp_part
    l1k_rep, l1k_features = l1k_part

    # rename the dataset specific perturbation columns to the shared name
    pert_col_names = ds_info_dict[dataset][1]
    cp_rep = cp_rep.rename(columns={pert_col_names[0]: labelCol})
    l1k_rep = l1k_rep.rename(columns={pert_col_names[1]: labelCol})

    # print some data statistics
    print(
        dataset + ": Replicate Level Shapes (nSamples x nFeatures): cp: ",
        cp_rep.shape[0],
        ",",
        len(cp_features),
        ", l1k: ",
        l1k_rep.shape[0],
        ",",
        len(l1k_features),
    )
    print("l1k n of rep: ", l1k_rep.groupby([labelCol]).size().median())
    print("cp n of rep: ", cp_rep.groupby([labelCol]).size().median())

    # remove perts with low rep corr
    if filter_perts == "highRepOverlap":
        rep_filter = "intersection"
    elif filter_perts == "highRepUnion":
        rep_filter = "union"
    else:
        rep_filter = None

    if rep_filter is not None:
        highRepPerts = highRepFinder(dataset, rep_filter, repCorrFilePath) + ["negcon"]
        # NOTE(review): reset_index() without drop=True keeps the previous row
        # index as an extra column in both frames, so it ends up in the merged
        # output as well - confirm whether callers rely on it before changing.
        cp_rep = cp_rep[cp_rep["PERT"].isin(highRepPerts)].reset_index()
        l1k_rep = l1k_rep[l1k_rep["PERT"].isin(highRepPerts)].reset_index()

    mergedProfiles_repLevel = generate_random_match_of_replicate_pairs(
        cp_rep, l1k_rep, nRep
    )

    return mergedProfiles_repLevel, cp_features, l1k_features
def rename_affyprobe_to_genename(l1k_data_df, l1k_features, map_source_address):
    """
    Map dataframe column names from affy probe ids to gene names.

    Inputs:
        l1k_data_df: dataframe whose probe id columns should be renamed
        l1k_features: list of probe ids; every id must be in the mapping
        map_source_address: Excel file with a 'probe_id' and a 'symbol' column

    Outputs:
        the renamed dataframe and the list of gene names matching l1k_features

    Raises:
        KeyError: if a probe id of l1k_features is missing from the mapping
    """
    meta = pd.read_excel(map_source_address)
    probe_to_symbol = dict(zip(meta["probe_id"], meta["symbol"]))

    l1k_features_gn = [probe_to_symbol[probe] for probe in l1k_features]
    l1k_data_df = l1k_data_df.rename(columns=probe_to_symbol)

    return l1k_data_df, l1k_features_gn


def rename_to_genename_list_to_affyprobe(
    l1k_features_gn, our_l1k_prob_list, map_source_address
):
    """
    Map a list of gene names to a list of affy probe ids.

    Inputs:
        l1k_features_gn: list of gene names; every name must be in the mapping
        our_l1k_prob_list: currently unused, kept for backward compatibility
        map_source_address: Excel file with a 'probe_id' and a 'symbol' column

    Outputs:
        list of probe ids matching l1k_features_gn

    Raises:
        KeyError: if a gene name is missing from the mapping
    """
    meta = pd.read_excel(map_source_address)
    symbol_to_probe = dict(zip(meta["symbol"], meta["probe_id"]))

    return [symbol_to_probe[symbol] for symbol in l1k_features_gn]


def standardize_per_catX(df, column_name, cp_features):
    """
    Standardize the given features separately within each category.

    Every feature is transformed to (x - mean) / std, where mean and std are
    computed over the rows sharing the same value of `column_name`
    (e.g. the plate). The input dataframe is not modified.

    Inputs:
        df: dataframe holding the features and the category column
        column_name: name of the category column, e.g. 'Metadata_Plate'
        cp_features: feature column names (list, tuple or pandas Index)

    Outputs:
        copy of df with the feature columns replaced by their scaled values;
        a feature that is constant within a category becomes NaN there
        (division by a zero std)
    """
    # Coerce to a list: `Index + [column_name]` would concatenate strings
    # element-wise instead of appending the category column.
    feature_cols = list(cp_features)

    df_scaled_perPlate = df.copy()
    df_scaled_perPlate[feature_cols] = (
        df[feature_cols + [column_name]]
        .groupby(column_name)
        .transform(lambda x: (x - x.mean()) / x.std())
        .values
    )
    return df_scaled_perPlate
def replicateCorrs(inDf, pertColName, featColNames, plotEnabled):
    """
    Calculates replicate correlation versus across perturbation correlations

    For every perturbation with more than one replicate, the pairwise Pearson
    correlations between its (de-duplicated) replicate profiles are computed.
    The null distribution is built from the pairwise correlations between one
    randomly sampled replicate of every perturbation.

    Parameters:
    inDf (pandas df): input dataframe contains metadata and features
    pertColName (str): The column based on which we define replicates of a perturbation
    featColNames(list): The list of all columns corresponding to features
    plotEnabled (bool): If True or 1, plots the two distributions

    Returns:
    list: [randC_v2, repC, repCorrDf]
        randC_v2 (list): non-NaN correlations between random perturbation pairs
        repC (list): non-NaN median replicate correlation per perturbation
        repCorrDf (pandas df): indexed by perturbation, with columns
            'RepCor' (mean replicate correlation, NaN for perturbations with a
            single replicate), 'Rand90Perc' (90th percentile of randC_v2) and
            'Rep10Perc' (10th percentile of repC); both percentiles are
            constant columns and NaN when their list is empty
    """
    df = inDf.copy()
    df[featColNames] = inDf[featColNames].interpolate()

    uniqPert = df[pertColName].unique().tolist()
    repCorrDf = pd.DataFrame(index=uniqPert, columns=["RepCor"])

    # only perturbations with at least two replicates have a replicate correlation
    repSizes = df.groupby(pertColName).size()
    highRepComp = repSizes[repSizes > 1].index.tolist()

    repC = []
    for u in highRepComp:
        df1 = df[df[pertColName] == u].drop_duplicates().reset_index(drop=True)

        # correlation between rows (replicates); keep the upper triangle only
        repCorrPurtbs = df1.loc[:, featColNames].T.corr()
        repCorr = repCorrPurtbs.values[np.triu_indices(repCorrPurtbs.shape[0], k=1)]

        repCorrDf.loc[u, "RepCor"] = np.nanmean(repCorr)
        repC.append(np.nanmedian(repCorr))

    # null distribution: one random replicate per perturbation, all pairs
    # NOTE(review): this samples from inDf (not the interpolated copy used
    # above) - kept as in the original, confirm whether that is intended.
    uniqeSamplesFromEachPurt = inDf.groupby(pertColName)[featColNames].apply(
        lambda s: s.sample(1)
    )
    corrMatAcrossPurtbs = uniqeSamplesFromEachPurt.loc[:, featColNames].T.corr()
    randC_v2 = list(
        corrMatAcrossPurtbs.values[np.triu_indices(corrMatAcrossPurtbs.shape[0], k=1)]
    )

    repC = [corr for corr in repC if not np.isnan(corr)]
    randC_v2 = [corr for corr in randC_v2 if not np.isnan(corr)]

    # np.percentile is not defined for empty input
    perc95 = np.percentile(randC_v2, 90) if randC_v2 else np.nan
    rep10 = np.percentile(repC, 10) if repC else np.nan

    if plotEnabled:
        fig, axes = plt.subplots(figsize=(5, 4))
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
        # migrate to sns.histplot once the pinned seaborn version is confirmed.
        sns.distplot(
            randC_v2,
            kde=True,
            hist=True,
            bins=100,
            label="random pairs",
            ax=axes,
            norm_hist=True,
        )
        sns.distplot(
            repC,
            kde=True,
            hist=True,
            bins=100,
            label="replicate pairs",
            ax=axes,
            norm_hist=True,
            color="r",
        )

        axes.axvline(x=perc95, linestyle=":")
        axes.axvline(x=0, linestyle=":")
        axes.legend(loc=2)
        axes.set_xlim(-1, 1)
        plt.tight_layout()

    repCorrDf["Rand90Perc"] = perc95
    repCorrDf["Rep10Perc"] = rep10

    return [randC_v2, repC, repCorrDf]
# input is a list of [dataframe, feature names] pairs --> e.g. [cp, l1k, cp_cca, l1k_cca]
#######
def plotRepCorrs(allData, pertName):
    """
    Collect replicate and random-pair correlations for several datasets.

    Despite its name this function does not plot; it only returns the values.

    For every perturbation of every dataset:
      - replicate correlations: the sorted unique values of the correlation
        matrix between its replicates, without the largest value (which is
        how the original code discards the diagonal of ones)
      - random correlations: each replicate is correlated with the first row
        of a randomly drawn (with replacement) other perturbation

    Parameters:
    allData (list): entries of the form [dataframe, feature column names]
    pertName (str): the column that defines the perturbations

    Returns:
    corrAll (list): one [randC, repC] pair of lists per entry of allData
    """
    corrAll = []
    for entry in allData:
        df = entry[0]
        features = entry[1]

        repC = []
        randC = []
        for u in df[pertName].unique().tolist():
            df1 = df[df[pertName] == u].drop_duplicates().reset_index(drop=True)
            df2 = df[df[pertName] != u].drop_duplicates().reset_index(drop=True)

            repCorr = np.sort(np.unique(df1.loc[:, features].T.corr().values))[
                :-1
            ].tolist()
            repC.extend(repCorr)

            # one random other perturbation per replicate of the current one
            randAllels = (
                df2[pertName]
                .drop_duplicates()
                .sample(df1.shape[0], replace=True)
                .tolist()
            )
            df3 = pd.concat(
                [
                    df2[df2[pertName] == i].reset_index(drop=True).iloc[0:1, :]
                    for i in randAllels
                ],
                ignore_index=True,
            )

            # Correlate the feature columns only: the full frames also hold
            # metadata (e.g. the string perturbation column), which is not
            # part of the profile.
            randCorr = (
                df1[features]
                .corrwith(df3[features], axis=1, method="pearson")
                .values.tolist()
            )
            randC.extend(randCorr)

        corrAll.append([randC, repC])
    return corrAll
import os

import pandas as pd

# ------------------------------------------------------


# Save the input dataframe to the specified sheet name of filename file
def saveAsNewSheetToExistingFile(filename, newDF, newSheetName):
    """
    Write a dataframe to one sheet of an Excel file, keeping the other sheets.

    If the file exists, the sheet is added to it (an existing sheet with the
    same name is replaced); otherwise a new file holding only this sheet is
    created.

    Parameters:
    filename (str): path of the Excel file
    newDF (pandas df): dataframe to save
    newSheetName (str): name of the target sheet
    """
    if os.path.exists(filename):
        # Append mode lets pandas load the existing workbook itself;
        # assigning writer.book / calling writer.save() is not supported by
        # current pandas versions.
        with pd.ExcelWriter(
            filename, engine="openpyxl", mode="a", if_sheet_exists="replace"
        ) as writer:
            newDF.to_excel(writer, sheet_name=newSheetName)
    else:
        newDF.to_excel(filename, sheet_name=newSheetName)

    print(newSheetName, " saved!")
    return


# ------------------------------------------------------


# saveDF_to_CSV_GZ_no_timestamp
def saveDF_to_CSV_GZ_no_timestamp(df, filename):
    """
    Save a dataframe as gzipped CSV (no index) without a timestamp.

    The gzip header modification time is fixed to 0, so writing the same
    dataframe to the same file name always produces the same bytes.

    Parameters:
    df (pandas df): dataframe to save
    filename (str): path of the output .csv.gz file
    """
    from gzip import GzipFile
    from io import TextIOWrapper

    # The handle already compresses; to_csv must not be asked to compress
    # again (pandas ignores `compression` for text handles anyway).
    with TextIOWrapper(GzipFile(filename, "w", mtime=0), encoding="utf-8") as fd:
        df.to_csv(fd, index=False)

    return