├── Data ├── SampleData │ ├── OmicsCNGene.hdf5 │ ├── KYReadcounts.hdf5 │ ├── AvanaReadcounts.hdf5 │ ├── DeWeirdtReadcounts.hdf5 │ ├── OmicsExpressionProteinCodingGenesTPMLogp1.hdf5 │ ├── DeWeirdtConditionMap.csv │ ├── RNAiExpressionAddictions.csv │ ├── KYSequenceMap.csv │ ├── OmicsSomaticMutations.csv │ ├── AchillesCommonEssentialControls.csv │ ├── AchillesNonessentialControls.csv │ └── AvanaSequenceMap.csv └── DepMapDataURLs.json ├── chronos ├── __init__.py ├── fetch_parameters.py ├── copy_correction.py ├── figshare.py ├── plotting.py ├── reports.py └── evaluations.py ├── .gitignore ├── setup.py ├── project.toml ├── LICENSE └── README.md /Data/SampleData/OmicsCNGene.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/chronos/HEAD/Data/SampleData/OmicsCNGene.hdf5 -------------------------------------------------------------------------------- /Data/SampleData/KYReadcounts.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/chronos/HEAD/Data/SampleData/KYReadcounts.hdf5 -------------------------------------------------------------------------------- /Data/SampleData/AvanaReadcounts.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/chronos/HEAD/Data/SampleData/AvanaReadcounts.hdf5 -------------------------------------------------------------------------------- /Data/SampleData/DeWeirdtReadcounts.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/chronos/HEAD/Data/SampleData/DeWeirdtReadcounts.hdf5 -------------------------------------------------------------------------------- /chronos/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import * 2 | from .copy_correction import 
* 3 | from .figshare import * 4 | from .fetch_parameters import fetch_parameters -------------------------------------------------------------------------------- /Data/SampleData/OmicsExpressionProteinCodingGenesTPMLogp1.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/chronos/HEAD/Data/SampleData/OmicsExpressionProteinCodingGenesTPMLogp1.hdf5 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.DS_store 2 | Data/Achilles_run/ 3 | Data/Achilles_run_compare/ 4 | Data/logs/ 5 | Data/reports 6 | Data/DepMapParameters/ 7 | *.egg-info 8 | *.pyc 9 | .ipynb_checkpoints/ 10 | build/ 11 | dist/ 12 | chronos/__pycache__ 13 | chronos/unimodal_density_estimate.py 14 | 15 | -------------------------------------------------------------------------------- /Data/DepMapDataURLs.json: -------------------------------------------------------------------------------- 1 | { 2 | "gene_effect.csv": "https://plus.figshare.com/ndownloader/files/43346616", 3 | "guide_efficacy.csv": "https://plus.figshare.com/ndownloader/files/43346709", 4 | "cell_line_efficacy.csv": "https://plus.figshare.com/ndownloader/files/43346718", 5 | "library_effect.csv": "https://plus.figshare.com/ndownloader/files/43346715", 6 | "t0_offset.csv": "https://plus.figshare.com/ndownloader/files/43346733" 7 | } -------------------------------------------------------------------------------- /Data/SampleData/DeWeirdtConditionMap.csv: -------------------------------------------------------------------------------- 1 | sequence_ID,replicate,cell_line_name,days,pDNA_batch,condition 2 | pDNA,pDNA,pDNA,21,batch1,pDNA 3 | Meljuso,RepA,Meljuso,21,batch1,Control 4 | Meljuso.1,RepB,Meljuso,21,batch1,Control 5 | Meljuso.2,RepA,Meljuso,21,batch1,A-1331852 6 | Meljuso.3,RepB,Meljuso,21,batch1,A-1331852 7 | 
from setuptools import setup, find_packages

# Optional dependency groups. Each key is a pip "extra",
# e.g. `pip install crispr_chronos[reports]`.
extras_require = {
    "copy_correction": ["patsy>=0.5.2"],
    "evaluations": ["matplotlib>=3.6", "seaborn>=0.12", "scikit-learn>=1.1", "statsmodels>=0.13", "scipy>=1.9"],
    "adjust_text": ["adjustText"],
    "embedding": ["umap-learn>=0.5.3"],
    "reports": ["reportlab>=3.6"],
    "model": ["numpy>=1.2", "pandas>=1.3", "tensorflow>2", "h5py>=3.7"],
    "hit_calling": ["scipy>=1.9", "sympy>=1.0", "statsmodels>=0.13"]
}
# "all" is the deduplicated, sorted union of every group.
extras_require['all'] = sorted(set.union(*[set(v) for v in extras_require.values()]))

setup(
    name='crispr_chronos',
    version='2.3.10',
    author="BroadInstitute CDS",
    description="Time series modeling of CRISPR perturbation readcounts in biological data",
    packages=find_packages(),
    package_data={'': ['*.r']},
    # Everything is installed by default so a plain `pip install` works
    # out of the box...
    install_requires=extras_require['all'],
    # ...but the groups are also exposed as extras for selective installs.
    # (Previously built above yet never passed to setup(), so extras were
    # silently unavailable.)
    extras_require=extras_require
)
package for processing readcount data from CRISPR knockout viability experiments" 18 | readme = "README.md" 19 | requires-python = ">=3.8" 20 | classifiers = [ 21 | "Programming Language :: Python :: 3", 22 | "License :: OSI Approved :: BSD License", 23 | "Operating System :: OS Independent", 24 | "Development Status :: 5 - Production/Stable", 25 | "Intended Audience :: Science/Research", 26 | "Natural Language :: English", 27 | "Topic :: Scientific/Engineering :: Bio-Informatics", 28 | ] 29 | 30 | [project.urls] 31 | "Homepage" = "https://github.com/broadinstitute/chronos" 32 | "Bug Tracker" = "https://github.com/broadinstitute/chronos/issues" 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2021, 2023 Joshua M. Dempster 2 | 3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 4 | 5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 6 | 7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | 9 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /Data/SampleData/RNAiExpressionAddictions.csv: -------------------------------------------------------------------------------- 1 | Gene 2 | AADAC (13) 3 | ACTL8 (81569) 4 | AKT1 (207) 5 | ANG (283) 6 | APOBR (55911) 7 | ARHGAP29 (9411) 8 | ARHGEF6 (9459) 9 | ATP8B2 (57198) 10 | AXL (558) 11 | BCL2 (596) 12 | BTK (695) 13 | C11orf71 (54494) 14 | C8orf37 (157657) 15 | CACFD1 (11094) 16 | CBFA2T3 (863) 17 | CCND1 (595) 18 | CCND2 (894) 19 | CDX2 (1045) 20 | CEBPA (1050) 21 | CSF2RB (1439) 22 | DTX2 (113878) 23 | EBF1 (1879) 24 | EGFR (1956) 25 | ERBB3 (2065) 26 | ESR1 (2099) 27 | FBRSL1 (57666) 28 | FERMT1 (55612) 29 | FGFR1 (2260) 30 | FLI1 (2313) 31 | FLT3 (2322) 32 | FOXA1 (3169) 33 | FOXR2 (139628) 34 | FRAT1 (10023) 35 | FUNDC2 (65991) 36 | FZD8 (8325) 37 | GATA1 (2623) 38 | GATA2 (2624) 39 | GATA3 (2625) 40 | GNAI2 (2771) 41 | GNS (2799) 42 | GRHL2 (79977) 43 | HERC1 (8925) 44 | HNF1B (6928) 45 | HNF4A (3172) 46 | HNRNPH1 (3187) 47 | HOXB13 (10481) 48 | INHBC (3626) 49 | IRF2BP2 (359948) 50 | IRF4 (3662) 51 | IRF8 (3394) 52 | JUN (3725) 53 | JUP (3728) 54 | KLF5 (688) 55 | KRAS (3845) 56 | LMX1B (4010) 57 | LYL1 (4066) 58 | MCL1 (4170) 59 | MDM2 (4193) 60 | MECOM (2122) 61 | MEF2D (4209) 62 | MET (4233) 63 | MPRIP (23164) 64 | MSI2 (124540) 65 | MYB (4602) 66 | MYBL1 (4603) 67 | NAV3 (89795) 68 | NFKBIE (4794) 69 | NKX2-1 (7080) 70 | PARD3 (56288) 71 | PARD6B (84612) 72 | 
import os
import json
import requests
import pandas as pd
from .model import write_hdf5


def _resolve_path(path, chronos_dir, relative_to_chronos, label):
    """Resolve a user-supplied path for `fetch_parameters`.

    Absolute paths are returned unchanged. Relative paths are resolved
    against the chronos package directory when `relative_to_chronos` is
    True (with a notice telling the user how to override this), and
    against the current working directory otherwise.
    """
    if path.startswith("/"):
        return path
    if relative_to_chronos:
        print("`%s` will be found relative to the chronos package directory\n'%s'\n\
Pass `relative_to_chronos=False` to make the path relative to your current working directory\n'%s'\n\
instead.\n" % (label, chronos_dir, os.getcwd()))
        return os.path.join(chronos_dir, '..', path)
    return path


def fetch_parameters(url_loc="Data/DepMapDataURLs.json",
        output_dir="Data/DepMapParameters/", overwrite=False,
        relative_to_chronos=True
    ):
    '''
    Fetch a set of trained Chronos parameters located at the urls in
    the json file at `url_loc` (see default file for an example)
    and write them to the local directory in `output_dir`.
    Files present will be skipped unless `overwrite` is `True`.
    Both `url_loc` and `output_dir` are relative to the chronos package
    unless `relative_to_chronos` is `False`.

    Raises:
        `requests.HTTPError`: if any parameter file fails to download.
    '''
    chronos_dir = os.path.dirname(__file__)
    url_loc = _resolve_path(url_loc, chronos_dir, relative_to_chronos, "url_loc")
    output_dir = _resolve_path(output_dir, chronos_dir, relative_to_chronos, "output_dir")

    # makedirs creates missing intermediate directories; the previous
    # os.mkdir failed if the parent did not exist.
    os.makedirs(output_dir, exist_ok=True)

    print("downloading files to %s" % output_dir)

    # Context manager closes the handle promptly (was a bare open().read()).
    with open(url_loc) as f:
        url_dict = json.load(f)

    for filename, url in url_dict.items():
        path = os.path.join(output_dir, filename)
        # os.path.exists avoids re-listing the directory on every iteration.
        if os.path.exists(path) and not overwrite:
            print("Skipping %s as it already exists, pass `overwrite=True` to overwrite" % filename)
        else:
            print("fetching %s from %s" % (filename, url))
            response = requests.get(url, allow_redirects=True)
            # Fail loudly on a bad status instead of silently writing an
            # HTML error page to disk as if it were the parameter file.
            response.raise_for_status()
            with open(path, 'wb') as out:
                out.write(response.content)
    print("all files fetched, transforming format")
    reformat_directory(output_dir)
    print('done')


def reformat_directory(directory):
    '''Transforms file formats in `directory` from DepMap release format to Chronos' expected format for `import_model`.'''
    # gene_effect is converted to hdf5 only once; the csv is large.
    if not "gene_effect.hdf5" in os.listdir(directory):
        print("transforming gene_effect.csv, this may take a minute")
        ge = pd.read_csv(os.path.join(directory, "gene_effect.csv"), index_col=0)
        write_hdf5(ge, os.path.join(directory, "gene_effect.hdf5"))

    print("transforming guide efficacy")
    guide_eff = pd.read_csv(os.path.join(directory, "guide_efficacy.csv"))
    # DepMap releases capitalize these columns; Chronos expects lowercase.
    # errors="ignore" makes this a no-op for already-lowercase files.
    guide_eff.rename(columns={"sgRNA": "sgrna", "Efficacy": "efficacy"}, errors="ignore", inplace=True)
    guide_eff.to_csv(os.path.join(directory, "guide_efficacy.csv"), index=None)
HEC1_c907R1_KY-1,SC-001517.KY01,14,KY-1,A,2DS,ACH-001517,MC-001517-TVRG,KY,True 19 | HEC1_c907R2_KY-1,SC-001517.KY01,14,KY-1,B,2DS,ACH-001517,MC-001517-TVRG,KY,True 20 | HEC1_c907R3_KY-1,SC-001517.KY01,14,KY-1,C,2DS,ACH-001517,MC-001517-TVRG,KY,True 21 | KM12_c908R1_100_KY-1,SC-000969.KY01,14,KY-1,A,2DS,ACH-000969,MC-000969-8Uk9,KY,True 22 | KM12_c908R2_100_KY-1,SC-000969.KY01,14,KY-1,B,2DS,ACH-000969,MC-000969-8Uk9,KY,True 23 | KM12_c908R3_100_KY-1,SC-000969.KY01,14,KY-1,C,2DS,ACH-000969,MC-000969-8Uk9,KY,True 24 | KMS11_c907R1_KY-1,SC-000714.KY01,14,KY-1,A,2DS,ACH-000714,MC-000714-Ep4x,KY,True 25 | KMS11_c907R2_KY-1,SC-000714.KY01,14,KY-1,B,2DS,ACH-000714,MC-000714-Ep4x,KY,True 26 | KMS11_c907R3_KY-1,SC-000714.KY01,14,KY-1,C,2DS,ACH-000714,MC-000714-Ep4x,KY,True 27 | KYSE15_c907R1_KY-1,SC-000855.KY01,14,KY-1,A,2DS,ACH-000855,MC-000855-P9gt,KY,True 28 | KYSE15_c907R2_KY-1,SC-000855.KY01,14,KY-1,B,2DS,ACH-000855,MC-000855-P9gt,KY,True 29 | KYSE15_c907R3_KY-1,SC-000855.KY01,14,KY-1,C,2DS,ACH-000855,MC-000855-P9gt,KY,True 30 | KYSE70_c907R1_KY-1,SC-000784.KY01,14,KY-1,A,2DS,ACH-000784,MC-000784-ayo7,KY,True 31 | KYSE70_c907R2_KY-1,SC-000784.KY01,14,KY-1,B,2DS,ACH-000784,MC-000784-ayo7,KY,True 32 | KYSE70_c907R3_KY-1,SC-000784.KY01,14,KY-1,C,2DS,ACH-000784,MC-000784-ayo7,KY,True 33 | L363_c907R4_KY-1,SC-000183.KY01,14,KY-1,A,2DS,ACH-000183,MC-000183-QKEz,KY,True 34 | L363_c907R5_KY-1,SC-000183.KY01,14,KY-1,B,2DS,ACH-000183,MC-000183-QKEz,KY,True 35 | L363_c907R6_KY-1,SC-000183.KY01,14,KY-1,C,2DS,ACH-000183,MC-000183-QKEz,KY,True 36 | LB771H_c908R1_KY-1,SC-002265.KY01,14,KY-1,A,2DS,ACH-002265,MC-002265-Hq0I,KY,True 37 | LB771H_c908R2_KY-1,SC-002265.KY01,14,KY-1,B,2DS,ACH-002265,MC-002265-Hq0I,KY,True 38 | LB771H_c908R3_KY-1,SC-002265.KY01,14,KY-1,C,2DS,ACH-002265,MC-002265-Hq0I,KY,True 39 | LXF289_c906R1_KY-1,SC-000787.KY01,14,KY-1,A,2DS,ACH-000787,MC-000787-eQaU,KY,True 40 | LXF289_c906R2_KY-1,SC-000787.KY01,14,KY-1,B,2DS,ACH-000787,MC-000787-eQaU,KY,True 41 | 
LXF289_c906R3_KY-1,SC-000787.KY01,14,KY-1,C,2DS,ACH-000787,MC-000787-eQaU,KY,True 42 | MDST8_c903R3_KY-2,SC-000935.KY01,14,KY-2,A,2DS,ACH-000935,MC-000935-4xuf,KY,True 43 | MFE280_C908R1_KY-1,SC-000192.KY01,14,KY-1,A,2DS,ACH-000192,MC-000192-cA5R,KY,True 44 | MFE280_C908R2_KY-1,SC-000192.KY01,14,KY-1,B,2DS,ACH-000192,MC-000192-cA5R,KY,True 45 | MFE280_C908R3_KY-1,SC-000192.KY01,14,KY-1,C,2DS,ACH-000192,MC-000192-cA5R,KY,True 46 | OVISE_c905R2_KY-1,SC-000527.KY01,14,KY-1,A,2DS,ACH-000527,MC-000527-sDUu,KY,True 47 | OVISE_c905R3_KY-1,SC-000527.KY01,14,KY-1,B,2DS,ACH-000527,MC-000527-sDUu,KY,True 48 | RCCFG2_C908R1_KY-1,SC-002189.KY01,14,KY-1,A,2DS,ACH-002189,MC-002189-EopU,KY,True 49 | RCCFG2_C908R2_KY-1,SC-002189.KY01,14,KY-1,B,2DS,ACH-002189,MC-002189-EopU,KY,True 50 | RCCFG2_C908R3_KY-1,SC-002189.KY01,14,KY-1,C,2DS,ACH-002189,MC-002189-EopU,KY,True 51 | SAS_c907R1_KY-1,SC-002029.KY01,14,KY-1,A,2DS,ACH-002029,MC-002029-eTYU,KY,True 52 | SAS_c907R2_KY-1,SC-002029.KY01,14,KY-1,B,2DS,ACH-002029,MC-002029-eTYU,KY,True 53 | SAS_c907R3_KY-1,SC-002029.KY01,14,KY-1,C,2DS,ACH-002029,MC-002029-eTYU,KY,True 54 | SKNSH_c906R1_KY-1,SC-000149.KY01,14,KY-1,A,2DS,ACH-000149,MC-000149-T5h1,KY,True 55 | SKNSH_c906R2_KY-1,SC-000149.KY01,14,KY-1,B,2DS,ACH-000149,MC-000149-T5h1,KY,True 56 | SKNSH_c906R3_KY-1,SC-000149.KY01,14,KY-1,C,2DS,ACH-000149,MC-000149-T5h1,KY,True 57 | SNU81_c903R1_KY-2,SC-000991.KY01,14,KY-2,A,2DS,ACH-000991,MC-000991-vB9l,KY,True 58 | SNU81_c903R2_KY-2,SC-000991.KY01,14,KY-2,B,2DS,ACH-000991,MC-000991-vB9l,KY,True 59 | SNU81_c903R3_KY-2,SC-000991.KY01,14,KY-2,C,2DS,ACH-000991,MC-000991-vB9l,KY,True 60 | SW48_C902R1_P1D14_KY-2,SC-000958.KY01,14,KY-2,A,2DS,ACH-000958,MC-000958-mNiZ,KY,True 61 | SW48_C902R2_P1D14_KY-2,SC-000958.KY01,14,KY-2,B,2DS,ACH-000958,MC-000958-mNiZ,KY,True 62 | SW48_C902R3_P1D14_KY-2,SC-000958.KY01,14,KY-2,C,2DS,ACH-000958,MC-000958-mNiZ,KY,True 63 | T47d_c903R2_KY-2,SC-000147.KY01,14,KY-2,A,2DS,ACH-000147,MC-000147-L6rc,KY,True 64 | 
try:
    from patsy import dmatrix
except ModuleNotFoundError:
    raise ModuleNotFoundError("patsy required for copy_correction submodule. \
Try `pip install patsy`")
import numpy as np
import pandas as pd
import tensorflow as tf


def get_shifts(gene_effect, copy_number):
    '''
    Build a long-format table pairing each (cell line, gene) gene effect
    deviation from the gene's median with its copy number.
    Entries with missing gene effect are dropped; missing copy number is
    treated as neutral (1).
    '''
    ge = gene_effect.copy()
    # Center each gene on its median so "shift" measures deviation from
    # typical effect rather than overall essentiality.
    ge -= ge.median()
    cn = copy_number.loc[ge.index, ge.columns].fillna(1)[ge.notnull()].stack()
    return pd.DataFrame({
        "gene_effect_shift": ge.stack().values,
        "cn": cn.values,
        "cell_line_name": cn.index.get_level_values(0),
        "gene": cn.index.get_level_values(1)
    })

def logspace(low, high, n):
    '''Return `n` points from `low` to `high`, spaced uniformly in log(x - low + 1).'''
    start = 0
    end = np.log(high - low + 1)
    steps = np.linspace(start, end, n)
    converted = np.exp(steps) + low - 1
    return converted


def add_global_shift(cn, y, means, dtype, nknots_cn=10, nknots_ge=5, alpha=.2):
    '''
    Fit a weight-regularized tensor-product spline of the gene effect shift
    `y` on copy number `cn` and per-gene mean gene effect `means`.

    Parameters:
        cn (`pandas.Series`): copy number per observation
        y (`pandas.Series`): gene effect shifts per observation
        means: mean gene effect expanded to one value per observation
        dtype: tf.double or tf.float32
        nknots_cn, nknots_ge (`int`): spline knot counts per axis
        alpha (`float`): penalty strength on the per-observation weights
    Returns:
        (weights, fitted): numpy arrays aligned with `cn`/`y` holding the
        per-observation weights and the fitted copy number effect.
    '''
    # Also validates that `dtype` is one of the supported types (KeyError otherwise).
    np_dtype = {tf.double: np.double, tf.float32: np.float32}[dtype]
    knots_cn = list(cn.quantile(np.linspace(0, 1, nknots_cn)))
    # Nudge the extreme knots inward so boundary observations remain within
    # the basis support.
    knots_cn[0] += 1e-1
    knots_cn[-1] -= 1e-1

    knots_ge = list(logspace(np.quantile(means, 0.01), np.quantile(means, .99), nknots_ge))

    spline_gc = np.array(dmatrix(
        "te( \
        bs(cn, knots=%r, degree=3, include_intercept=False), \
        bs(means, knots=%r, degree=3, include_intercept=False) \
        )" % (knots_cn, knots_ge),
        {"cn": cn.values, 'means': means}, return_type='matrix'
    ))
    print('constructed spline matrix of shape %i, %i' % spline_gc.shape)
    _spline = tf.constant(spline_gc, dtype=dtype)
    _y = tf.constant(y.values, dtype=dtype)
    init = np.random.uniform(-.001, -.0001, size=(spline_gc.shape[1]))
    v_coeffs = tf.Variable(init.reshape((-1, 1)), dtype=dtype)
    v_weights = tf.Variable(1e-6 * np.ones(len(spline_gc)), dtype=dtype)
    # Weights are bounded in (0, 1] via exp(-|w|); the cost below shrinks
    # them toward 1 (i.e. v_weights toward 0).
    _weights = tf.exp(-tf.abs(v_weights))
    _weight_cost = tf.reduce_mean(input_tensor=tf.square(v_weights))

    _out = _weights * tf.matmul(_spline, v_coeffs)[:, 0]
    _cost = tf.reduce_mean(input_tensor=tf.square(_out - _y))
    optimizer = tf.compat.v1.train.AdamOptimizer(.005)
    _step = optimizer.minimize(_cost + alpha * _weight_cost, var_list=[v_coeffs, v_weights])

    # Context manager releases the session's graph resources even if an
    # iteration raises; the previous version never closed the session.
    with tf.compat.v1.Session() as sess:
        sess.run(tf.compat.v1.global_variables_initializer())

        for i in range(501):
            sess.run(_step)
            if not i % 100:
                print('\tcost:', sess.run(_cost))
        out = sess.run(_out)
        weights = sess.run(_weights)

    # Return the already-fetched result; previously _out was run a second
    # time after fetching it into `out`.
    return weights, out



def get_adjusted_matrix(shifts, gene_effect,):
    '''
    Pivot the long-format `shifts` table (with its "adjusted" column) back
    into a cell line by gene matrix, restoring each gene's mean effect.
    NOTE(review): assumes `shifts` rows are aligned with
    `gene_effect.stack()` order, as produced by `get_shifts` — confirm if
    called with externally-built tables.
    '''
    ge = gene_effect.stack()
    means = gene_effect.mean()

    adjusted = pd.Series(
        shifts['adjusted'].values + means.loc[shifts.gene].values,
        index=ge.index
    ).reset_index()

    adjusted = pd.pivot(adjusted, index=adjusted.columns[0], columns=adjusted.columns[1])[0]
    adjusted.index.name = "cell_line_name"
    adjusted.columns.name = "gene"
    return adjusted



def alternate_CN(gene_effect, copy_number, nknots_cn=10, nknots_ge=5, dtype=tf.double,
        max_lines=150):
    '''
    Removes biases due to copy number by fitting and subtracting a smooth
    copy number effect. Returns a corrected copy of the gene_effect matrix.
    Parameters:
        gene_effect (`pandas.DataFrame`): a cell-line by gene matrix of gene effect estimates
        copy_number (`pandas.DataFrame`): a cell-line by gene matrix of relative (floating point) copy number
        nknots_cn (`int`): number of spline knots along the copy number axis
        nknots_ge (`int`): number of spline knots along the mean gene effect axis
        dtype: tensorflow float type used for the fit (tf.double or tf.float32)
        max_lines (`int`): cell lines are fit in randomized groups of at most this size
    Returns:
        (corrected gene effect `pandas.DataFrame`, long-format `pandas.DataFrame` of per-observation shifts)
    '''

    if len(gene_effect) < 3:
        raise RuntimeError("Correct for CN should not be used with fewer than 3 cell lines. Consider preprocessing with CRISPRCleanR")
    missing_lines = sorted(set(gene_effect.index) - set(copy_number.index))
    if len(missing_lines) > 0:
        print("Warning: missing lines from gene_effect in copy_number, which won't be corrected.\nExamples: %r" % missing_lines[:5])
    missing_genes = sorted(set(gene_effect.columns) - set(copy_number.columns))
    if len(missing_genes) > 0:
        raise ValueError("Missing %i genes from gene_effect in copy_number.\nExamples: %r" % (
            len(missing_genes), missing_genes[:5]))

    # Shuffle so groups are not biased by the input's row ordering.
    # NOTE: uses the global numpy RNG.
    lines = list(gene_effect.index)
    np.random.shuffle(lines)
    ngroups = int(len(gene_effect) / max_lines) + 1
    groups = np.array_split(lines, ngroups)
    shift_list = []
    new_list = []
    for i, group in enumerate(groups):
        print("\nFitting cell line group %i of %i" % (i+1, len(groups)))

        print('finding low CN gene effect shifts')
        shifts = get_shifts(gene_effect.loc[group], copy_number.loc[group])

        print('smoothing and interpolating cutting toxicity for all genes')
        means = gene_effect.loc[group].mean().sort_values()
        means_expanded = means.loc[shifts.gene].values
        weights, cn_effect = add_global_shift(shifts.cn, shifts.gene_effect_shift, means_expanded, dtype, nknots_cn, nknots_ge)
        shifts['weights'] = weights
        shifts['cn_effect'] = cn_effect
        # The corrected shift is the observed shift minus the fitted CN effect.
        shifts['adjusted'] = shifts['gene_effect_shift'].values - cn_effect


        print("generating matrix")
        new = get_adjusted_matrix(shifts, gene_effect.loc[group])
        new_list.append(new)
        shift_list.append(shifts)
    shifts = pd.concat(shift_list, ignore_index=True)
    new = pd.concat(new_list)
    return new, shifts
-------------------------------------------------------------------------------- /Data/SampleData/OmicsSomaticMutations.csv: -------------------------------------------------------------------------------- 1 | ModelID,Gene,DNAChange,ProteinChange,LikelyGoF,Driver,LikelyDriver 2 | ACH-000784,GATA3 (2625),c.1183G>A,p.A395T,True,False,True 3 | ACH-000913,XPO1 (7514),c.1711G>A,p.E571K,True,False,True 4 | ACH-000913,SF3B1 (23451),c.1998G>T,p.K666N,True,False,True 5 | ACH-000913,PIK3CA (5290),c.1633G>A,p.E545K,True,True,True 6 | ACH-000479,EGFR (1956),c.1658G>T,p.G553V,True,False,True 7 | ACH-000955,CTNNB1 (1499),c.121A>G,p.T41A,True,True,True 8 | ACH-000955,PIK3CA (5290),c.3140A>G,p.H1047R,True,True,True 9 | ACH-000955,KRAS (3845),c.35G>A,p.G12D,True,True,True 10 | ACH-000955,ERBB3 (2065),c.310G>A,p.V104M,True,True,True 11 | ACH-000955,ERBB3 (2065),c.785C>A,p.P262H,True,False,True 12 | ACH-000958,CTNNB1 (1499),c.98C>A,p.S33Y,True,False,True 13 | ACH-000958,EGFR (1956),c.2020G>A,p.G674S,True,True,True 14 | ACH-000958,MAP2K1 (5604),c.167A>C,p.Q56P,True,False,True 15 | ACH-001533,MYD88 (4615),c.439T>C,p.*147R,True,True,True 16 | ACH-000750,BRAF (673),c.1799T>A,p.V600E,True,True,True 17 | ACH-000750,KSR2 (283455),c.2175C>T,p.F725F,True,False,True 18 | ACH-000912,DDR2 (4921),c.1912A>T,p.I638F,True,False,True 19 | ACH-001636,PGR (5241),c.2219G>A,p.R740Q,True,False,True 20 | ACH-001129,U2AF1 (7307),c.470A>C,p.Q157P,True,True,True 21 | ACH-002926,GNA11 (2767),c.626A>T,p.Q209L,True,False,True 22 | ACH-000988,FGFR2 (2263),c.71C>G,p.S24W,True,False,True 23 | ACH-000788,TP63 (8626),c.1135C>T,p.R379C,True,False,True 24 | ACH-000788,BRAF (673),c.1799T>A,p.V600E,True,True,True 25 | ACH-000788,MAP2K1 (5604),c.370C>T,p.P124S,True,False,True 26 | ACH-000969,GNAS (2778),c.602G>A,p.R201H,True,False,True 27 | ACH-000714,FGFR3 (2261),c.1118A>G,p.Y373C,True,True,True 28 | ACH-000714,FGFR3 (2261),c.1118A>G,p.Y373C,True,False,True 29 | ACH-000192,PIK3CA (5290),c.3139C>T,p.H1047Y,True,False,True 
30 | ACH-000192,MLLT10 (8028),c.2111G>T,p.R704L,True,False,True 31 | ACH-000192,FGFR2 (2263),c.71C>G,p.S24W,True,False,True 32 | ACH-000657,PIK3CA (5290),c.1093G>A,p.E365K,True,False,True 33 | ACH-000657,RRAS2 (22800),c.215A>T,p.Q72L,True,False,True 34 | ACH-000657,MED12 (9968),c.67G>T,p.D23Y,True,False,True 35 | ACH-000614,BRAF (673),c.1799T>A,p.V600E,True,True,True 36 | ACH-000147,PIK3CA (5290),c.3140A>G,p.H1047R,True,True,True 37 | ACH-000183,NRAS (4893),c.183A>C,p.Q61H,True,False,True 38 | ACH-000183,PIK3CA (5290),c.1633G>A,p.E545K,True,True,True 39 | ACH-000787,SOS1 (6654),c.697A>T,p.N233Y,True,False,True 40 | ACH-000787,CTNNB1 (1499),c.121A>G,p.T41A,True,True,True 41 | ACH-001736,FLT3 (2322),c.2503G>C,p.D835H,True,True,True 42 | ACH-001843,CTNNB1 (1499),c.97T>C,p.S33P,True,False,True 43 | ACH-001843,KRAS (3845),c.35G>A,p.G12D,True,True,True 44 | ACH-001843,GNAS (2778),c.602G>A,p.R201H,True,False,True 45 | ACH-000396,FGFR3 (2261),c.1951A>G,p.K651E,True,False,True 46 | ACH-000396,ERBB2 (2064),c.2033G>A,p.R678Q,True,False,True 47 | ACH-000458,RHOA (387),c.118G>C,p.E40Q,True,False,True 48 | ACH-000458,HRAS (3265),c.182A>T,p.Q61L,True,False,True 49 | ACH-000004,JAK2 (3717),c.1849G>T,p.V617F,True,True,True 50 | ACH-002189,PIK3CB (5291),c.3200A>T,p.D1067V,True,False,True 51 | ACH-000935,KDR (3791),c.3095G>A,p.R1032Q,True,False,True 52 | ACH-000935,BRAF (673),c.1798_1799GT>AA,p.V600K,True,True,True 53 | ACH-000263,KIT (3815),c.2466T>A,p.N822K,True,True,True 54 | ACH-001411,PIK3CA (5290),c.1633G>A,p.E545K,True,True,True 55 | ACH-001842,SF3B1 (23451),c.2098A>G,p.K700E,True,False,True 56 | ACH-001842,AKT1 (207),c.49G>A,p.E17K,True,True,True 57 | ACH-000841,NRAS (4893),c.181C>A,p.Q61K,True,True,True 58 | ACH-000841,BRAF (673),c.1789C>G,p.L597V,True,False,True 59 | ACH-000149,ALK (238),c.3522C>A,p.F1174L,True,True,True 60 | ACH-000993,MTOR (2475),c.6644C>A,p.S2215Y,True,False,True 61 | ACH-000993,XPO1 (7514),c.2246G>A,p.R749Q,True,False,True 62 | ACH-000993,NT5C2 
(22978),c.1168G>A,p.E390K,True,False,True 63 | ACH-000168,KRAS (3845),c.38G>A,p.G13D,True,True,True 64 | ACH-001303,DDR2 (4921),c.187C>G,p.L63V,True,False,True 65 | ACH-001303,ALK (238),c.3824G>A,p.R1275Q,True,True,True 66 | ACH-002029,ERBB4 (2066),c.3325G>T,p.G1109C,True,False,True 67 | ACH-001081,MTOR (2475),c.7500T>G,p.I2500M,True,False,True 68 | ACH-001081,XPO1 (7514),c.1711G>A,p.E571K,True,False,True 69 | ACH-001081,RAF1 (5894),c.770C>T,p.S257L,True,False,True 70 | ACH-001081,KRAS (3845),c.436G>A,p.A146T,True,True,True 71 | ACH-000996,PIK3CA (5290),c.3129G>T,p.M1043I,True,False,True 72 | ACH-000996,ARHGAP35 (2909),c.2989C>T,p.R997*,True,False,True 73 | ACH-000946,NFE2L2 (4780),c.53G>C,p.R18P,True,False,True 74 | ACH-000946,CTNNB1 (1499),c.95A>T,p.D32V,True,False,True 75 | ACH-000946,CCND1 (595),c.859C>A,p.P287T,True,False,True 76 | ACH-000937,NRAS (4893),c.34G>A,p.G12S,True,False,True 77 | ACH-000937,NOTCH1 (4851),c.4721T>C,p.L1574P,True,False,True 78 | ACH-001563,NRAS (4893),c.37G>C,p.G13R,True,False,True 79 | ACH-001563,IDH1 (3417),c.394C>T,p.R132C,True,True,True 80 | ACH-000774,KRAS (3845),c.35G>T,p.G12V,True,True,True 81 | ACH-001554,GNAQ (2776),c.626A>T,p.Q209L,True,False,True 82 | ACH-002265,NFE2L2 (4780),c.52C>G,p.R18G,True,False,True 83 | ACH-001674,ALK (238),c.3824G>A,p.R1275Q,True,True,True 84 | ACH-001674,RAC1 (5879),c.85C>T,p.P29S,True,False,True 85 | ACH-000406,JAK3 (3718),c.1533G>A,p.M511I,True,False,True 86 | ACH-001517,PIK3CA (5290),c.3145G>C,p.G1049R,True,False,True 87 | ACH-001517,KRAS (3845),c.35G>A,p.G12D,True,True,True 88 | ACH-001517,ERBB2 (2064),c.2393C>T,p.T798I,True,False,True 89 | ACH-001517,ERBB2 (2064),c.2524G>A,p.V842I,True,True,True 90 | ACH-000434,ABL1 (25),c.1051C>T,p.R351W,True,True,True 91 | ACH-000434,HRAS (3265),c.182A>T,p.Q61L,True,False,True 92 | ACH-000855,RAC1 (5879),c.85C>T,p.P29S,True,False,True 93 | ACH-000855,ERBB3 (2065),c.889G>T,p.D297Y,True,False,True 94 | ACH-000404,BRAF (673),c.1799T>A,p.V600E,True,True,True 95 
| ACH-000991,XPO1 (7514),c.2246G>A,p.R749Q,True,False,True 96 | ACH-000991,XPO1 (7514),c.1711G>A,p.E571K,True,False,True 97 | ACH-000991,BCL6 (604),c.1613G>A,p.R538Q,True,False,True 98 | ACH-000991,KRAS (3845),c.436G>A,p.A146T,True,True,True 99 | ACH-000527,PIK3CA (5290),c.1258T>C,p.C420R,True,True,True 100 | ACH-000605,GATA3 (2625),c.1183G>A,p.A395T,True,False,True 101 | -------------------------------------------------------------------------------- /Data/SampleData/AchillesCommonEssentialControls.csv: -------------------------------------------------------------------------------- 1 | Gene 2 | AAMP (14) 3 | ABCE1 (6059) 4 | ACTL6A (86) 5 | ACTR8 (93973) 6 | AHCY (191) 7 | ALG1 (56052) 8 | ALG2 (85365) 9 | ANAPC10 (10393) 10 | ANAPC5 (51433) 11 | AQR (9716) 12 | ARMC7 (79637) 13 | ARPC4 (10093) 14 | ATL2 (64225) 15 | ATP6AP2 (10159) 16 | ATP6V1A (523) 17 | ATR (545) 18 | BARD1 (580) 19 | BCAS2 (10286) 20 | BDP1 (55814) 21 | BMS1 (9790) 22 | BPTF (2186) 23 | BRD8 (10902) 24 | BUB1B (701) 25 | BYSL (705) 26 | CAD (790) 27 | CCNA2 (890) 28 | CCT2 (10576) 29 | CCT6A (908) 30 | CDC20 (991) 31 | CDC45 (8318) 32 | CDC73 (79577) 33 | CDK1 (983) 34 | CEBPZ (10153) 35 | CENPJ (55835) 36 | CENPW (387103) 37 | CEP57 (9702) 38 | CHAF1A (10036) 39 | CHMP2A (27243) 40 | CHORDC1 (26973) 41 | CLASRP (11129) 42 | CMPK1 (51727) 43 | COA5 (493753) 44 | COG4 (25839) 45 | COPB2 (9276) 46 | COPS3 (8533) 47 | COQ4 (51117) 48 | COX5B (1329) 49 | CPSF6 (11052) 50 | CSE1L (1434) 51 | CSTF3 (1479) 52 | CTNNBL1 (56259) 53 | CTU2 (348180) 54 | CWF19L2 (143884) 55 | DARS2 (55157) 56 | DCTN2 (10540) 57 | DDB1 (1642) 58 | DDX21 (9188) 59 | DDX39B (7919) 60 | DDX49 (54555) 61 | DDX56 (54606) 62 | DGCR8 (54487) 63 | DHX33 (56919) 64 | DHX8 (1659) 65 | DIS3 (22894) 66 | DMAP1 (55929) 67 | DNAJC2 (27000) 68 | DNMT1 (1786) 69 | DR1 (1810) 70 | DYNC1H1 (1778) 71 | ECD (11319) 72 | EEFSEC (60678) 73 | EIF1AD (84285) 74 | EIF2B4 (8890) 75 | EIF3A (8661) 76 | EIF3J (8669) 77 | EIF4E (1977) 78 | EIF6 
(3692) 79 | ELP4 (26610) 80 | ENO1 (2023) 81 | ERCC4 (2072) 82 | EXOC1 (55763) 83 | EXOSC1 (51013) 84 | EXOSC5 (56915) 85 | FAM210A (125228) 86 | FARSB (10056) 87 | FEN1 (2237) 88 | FNTB (2342) 89 | GAPDH (2597) 90 | GATC (283459) 91 | GEMIN8 (54960) 92 | GFM1 (85476) 93 | GINS3 (64785) 94 | GNB1L (54584) 95 | GON4L (54856) 96 | GPN1 (11321) 97 | GRPEL1 (80273) 98 | GTF2A2 (2958) 99 | GTF3C1 (2975) 100 | GTPBP4 (23560) 101 | HAUS1 (115106) 102 | HAUS7 (55559) 103 | HEATR1 (55127) 104 | HMGCS1 (3157) 105 | HNRNPH1 (3187) 106 | HNRNPL (3191) 107 | HSD17B10 (3028) 108 | HSPA9 (3313) 109 | IARS1 (3376) 110 | IGBP1 (3476) 111 | IMP4 (92856) 112 | INTS3 (65123) 113 | IPO11 (51194) 114 | ISG20L2 (81875) 115 | KANSL2 (54934) 116 | KCMF1 (56888) 117 | KIN (22944) 118 | LAGE3 (8270) 119 | LAS1L (81887) 120 | LIAS (11019) 121 | LRPPRC (10128) 122 | LSM4 (25804) 123 | LYRM4 (57128) 124 | MASTL (84930) 125 | MCM10 (55388) 126 | MCM6 (4175) 127 | MDC1 (9656) 128 | MED12 (9968) 129 | MED14 (9282) 130 | MED20 (9477) 131 | MED31 (51003) 132 | MED8 (112950) 133 | METTL1 (4234) 134 | MFAP1 (4236) 135 | MIS18BP1 (55320) 136 | MOB4 (25843) 137 | MRPL16 (54948) 138 | MRPL20 (55052) 139 | MRPL27 (51264) 140 | MRPL37 (51253) 141 | MRPL40 (64976) 142 | MRPL48 (51642) 143 | MRPL9 (65005) 144 | MRPS15 (64960) 145 | MRPS23 (51649) 146 | MRPS30 (10884) 147 | MRPS5 (64969) 148 | MRTO4 (51154) 149 | MTOR (2475) 150 | MTPAP (55149) 151 | N6AMT1 (29104) 152 | NAA25 (80018) 153 | NAF1 (92345) 154 | NARS2 (79731) 155 | NCAPG (64151) 156 | NCL (4691) 157 | NEDD1 (121441) 158 | NFS1 (9054) 159 | NIP7 (51388) 160 | NOB1 (28987) 161 | NOL11 (25926) 162 | NOL9 (79707) 163 | NOP10 (55505) 164 | NOP56 (10528) 165 | NPAT (4863) 166 | NSA2 (10412) 167 | NSMCE4A (54780) 168 | NUDC (10726) 169 | NUFIP1 (26747) 170 | NUP155 (9631) 171 | NUP43 (348995) 172 | NUP93 (9688) 173 | OGT (8473) 174 | ORC2 (4999) 175 | ORC6 (23594) 176 | PABPN1 (8106) 177 | PALB2 (79728) 178 | PCF11 (51585) 179 | PCYT1A (5130) 180 | 
PDCL (5082) 181 | PES1 (23481) 182 | PFDN2 (5202) 183 | PGK1 (5230) 184 | PIK3C3 (5289) 185 | PLRG1 (5356) 186 | PMVK (10654) 187 | PNPT1 (87178) 188 | POLD2 (5425) 189 | POLR1A (25885) 190 | POLR1F (221830) 191 | POLR2D (5433) 192 | POLR2L (5441) 193 | POLR3E (55718) 194 | POP4 (10775) 195 | PPIE (10450) 196 | PPP1R15B (84919) 197 | PPP2CA (5515) 198 | PPRC1 (23082) 199 | PREB (10113) 200 | PRMT5 (10419) 201 | PRPF3 (9129) 202 | PRPF4 (9128) 203 | PRPF8 (10594) 204 | PSMA5 (5686) 205 | PSMB4 (5692) 206 | PSMC1 (5700) 207 | PSMC6 (5706) 208 | PSMD13 (5719) 209 | PSMD4 (5710) 210 | PSMG1 (8624) 211 | PTCD3 (55037) 212 | QARS1 (5859) 213 | RAC1 (5879) 214 | RACGAP1 (29127) 215 | RAD51C (5889) 216 | RANGAP1 (5905) 217 | RBBP5 (5929) 218 | RBM14 (10432) 219 | RBM28 (55131) 220 | RBM48 (84060) 221 | RCC1 (1104) 222 | RFC2 (5982) 223 | RFT1 (91869) 224 | RHOA (387) 225 | RIOK1 (83732) 226 | RNF168 (165918) 227 | RNGTT (8732) 228 | RPA1 (6117) 229 | RPF1 (80135) 230 | RPL11 (6135) 231 | RPL18 (6141) 232 | RPL23A (6147) 233 | RPL3 (6122) 234 | RPL7L1 (285855) 235 | RPN1 (6184) 236 | RPP38 (10557) 237 | RPS11 (6205) 238 | RPS15A (6210) 239 | RPS19BP1 (91582) 240 | RPS21 (6227) 241 | RPS4X (6191) 242 | RPTOR (57521) 243 | RRP1 (8568) 244 | RRP7A (27341) 245 | RTEL1 (51750) 246 | RUVBL2 (10856) 247 | SAMM50 (25813) 248 | SARS2 (54938) 249 | SCFD1 (23256) 250 | SDHA (6389) 251 | SEC61A1 (29927) 252 | SEH1L (81929) 253 | SF3A1 (10291) 254 | SF3B1 (23451) 255 | SF3B3 (23450) 256 | SHQ1 (55164) 257 | SLC25A26 (115286) 258 | SLU7 (10569) 259 | SMC2 (10592) 260 | SMC6 (79677) 261 | SMNDC1 (10285) 262 | SNIP1 (79753) 263 | SNRNP40 (9410) 264 | SNRPC (6631) 265 | SNRPF (6636) 266 | SON (6651) 267 | SPC25 (57405) 268 | SRBD1 (55133) 269 | SRP19 (6728) 270 | SRPRB (58477) 271 | SRSF11 (9295) 272 | SSU72 (29101) 273 | STX5 (6811) 274 | SUPT16H (11198) 275 | SUPV3L1 (6832) 276 | TACC3 (10460) 277 | TAF1B (9014) 278 | TAF6L (10629) 279 | TBCA (6902) 280 | TCP1 (6950) 281 | TFAM (7019) 282 
| THOC7 (80145) 283 | TIMM22 (29928) 284 | TIPRL (261726) 285 | TMED2 (10959) 286 | TNPO3 (23534) 287 | TOP2A (7153) 288 | TPR (7175) 289 | TRAPPC1 (58485) 290 | TRAPPC8 (22878) 291 | TRMT61A (115708) 292 | TSEN2 (80746) 293 | TSR2 (90121) 294 | TTF2 (8458) 295 | TUBB (203068) 296 | TUBGCP3 (10426) 297 | TUT1 (64852) 298 | U2AF1 (7307) 299 | U2AF2 (11338) 300 | UBA3 (9039) 301 | UBL5 (59286) 302 | UMPS (7372) 303 | UQCRFS1 (7386) 304 | URI1 (8725) 305 | USP36 (57602) 306 | USPL1 (10208) 307 | UTP20 (27340) 308 | VARS1 (7407) 309 | VPS25 (84313) 310 | VPS72 (6944) 311 | WDHD1 (11169) 312 | WDR3 (10885) 313 | WDR43 (23160) 314 | WDR70 (55100) 315 | WDR82 (80335) 316 | XPO1 (7514) 317 | XRCC6 (2547) 318 | YARS2 (51067) 319 | YKT6 (10652) 320 | ZBTB11 (27107) 321 | ZC3H8 (84524) 322 | ZMAT2 (153527) 323 | ZNF207 (7756) 324 | ZNF622 (90441) 325 | -------------------------------------------------------------------------------- /Data/SampleData/AchillesNonessentialControls.csv: -------------------------------------------------------------------------------- 1 | Gene 2 | ABCG8 (64241) 3 | ACTL7A (10881) 4 | ACTL9 (284382) 5 | ADAM18 (8749) 6 | ADAM20 (8748) 7 | ADGRG7 (84873) 8 | AFM (173) 9 | AIPL1 (23746) 10 | ALPI (248) 11 | ANKRD30A (91074) 12 | APOA4 (337) 13 | APOF (319) 14 | ARGFX (503582) 15 | ASB17 (127247) 16 | ASZ1 (136991) 17 | ATP4B (496) 18 | B3GNT6 (192134) 19 | BARHL1 (56751) 20 | BMP10 (27302) 21 | BPIFA3 (128861) 22 | BPIFB6 (128859) 23 | BRDT (676) 24 | C10orf53 (282966) 25 | C12orf40 (283461) 26 | C17orf78 (284099) 27 | C20orf173 (140873) 28 | C8A (731) 29 | CABP5 (56344) 30 | CACNG2 (10369) 31 | CACNG5 (27091) 32 | CBLIF (2694) 33 | CCDC83 (220047) 34 | CCL1 (6346) 35 | CD200R1L (344807) 36 | CDX2 (1045) 37 | CELA2A (63036) 38 | CELA3B (23436) 39 | CETN1 (1068) 40 | CFHR5 (81494) 41 | CHRNA6 (8973) 42 | CLCA1 (1179) 43 | CLEC2A (387836) 44 | CLEC6A (93978) 45 | CNBD1 (168975) 46 | CNPY1 (285888) 47 | COL20A1 (57642) 48 | CRNN (49860) 49 | CRYGB (1419) 
50 | CSHL1 (1444) 51 | CSN3 (1448) 52 | CST4 (1472) 53 | CST8 (10047) 54 | CST9L (128821) 55 | CTCFL (140690) 56 | CYLC2 (1539) 57 | CYP11B2 (1585) 58 | CYP2A13 (1553) 59 | CYP4A22 (284541) 60 | DAZL (1618) 61 | DCANP1 (140947) 62 | DDX4 (54514) 63 | DEFA6 (1671) 64 | DEFB104A (140596) 65 | DEFB118 (117285) 66 | DEFB126 (81623) 67 | DEFB129 (140881) 68 | DMP1 (1758) 69 | DMRTB1 (63948) 70 | DPRX (503834) 71 | DRGX (644168) 72 | DSG4 (147409) 73 | DTX2 (113878) 74 | EFCAB3 (146779) 75 | ELOA2 (51224) 76 | EVX1 (2128) 77 | FABP2 (2169) 78 | FCRL4 (83417) 79 | FEZF2 (55079) 80 | FGF3 (2248) 81 | FGF6 (2251) 82 | FLG2 (388698) 83 | FNDC9 (408263) 84 | FOXB2 (442425) 85 | FOXE3 (2301) 86 | FOXR1 (283150) 87 | FSCB (84075) 88 | FUT9 (10690) 89 | GABRA1 (2554) 90 | GALNTL5 (168391) 91 | GALR3 (8484) 92 | GCG (2641) 93 | GDF2 (2658) 94 | GFRAL (389400) 95 | GHRH (2691) 96 | GJA10 (84694) 97 | GK2 (2712) 98 | GLRA1 (2741) 99 | GML (2765) 100 | GPR139 (124274) 101 | GPR151 (134391) 102 | GPR26 (2849) 103 | GPR32 (2854) 104 | GPR52 (9293) 105 | GPRC6A (222545) 106 | GPX6 (257202) 107 | GRM5 (2915) 108 | GSC2 (2928) 109 | GSX1 (219409) 110 | GUCA2A (2980) 111 | H2AC1 (221613) 112 | H4C7 (8369) 113 | HCRTR2 (3062) 114 | HMX1 (3166) 115 | HOXD12 (3238) 116 | HRH3 (11255) 117 | HTR1A (3350) 118 | HTR3D (200909) 119 | HTR5A (3361) 120 | IAPP (3375) 121 | IFNA10 (3446) 122 | IFNA16 (3449) 123 | IFNA2 (3440) 124 | IFNA4 (3441) 125 | IFNA6 (3443) 126 | IFNA8 (3445) 127 | IFNK (56832) 128 | IL12B (3593) 129 | IL17A (3605) 130 | IL1F10 (84639) 131 | IL22 (50616) 132 | IL26 (55801) 133 | IL31 (386653) 134 | IL36B (27177) 135 | IL9 (3578) 136 | INSL5 (10022) 137 | INSM2 (84684) 138 | IQCF1 (132141) 139 | ISX (91464) 140 | KASH5 (147872) 141 | KCNB2 (9312) 142 | KCNK10 (54207) 143 | KCNK18 (338567) 144 | KHDC3L (154288) 145 | KIR2DL1 (3802) 146 | KLK12 (43849) 147 | KRT2 (3849) 148 | KRT26 (353288) 149 | KRT33A (3883) 150 | KRT36 (8689) 151 | KRT38 (8687) 152 | KRT71 (112802) 153 | KRT74 
(121391) 154 | KRT76 (51350) 155 | KRT78 (196374) 156 | KRT84 (3890) 157 | KRT86 (3892) 158 | KRTAP1-1 (81851) 159 | KRTAP10-10 (353333) 160 | KRTAP10-12 (386685) 161 | KRTAP10-4 (386672) 162 | KRTAP10-6 (386674) 163 | KRTAP10-9 (386676) 164 | KRTAP13-1 (140258) 165 | KRTAP13-3 (337960) 166 | KRTAP15-1 (254950) 167 | KRTAP19-3 (337970) 168 | KRTAP26-1 (388818) 169 | KRTAP4-11 (653240) 170 | KRTAP4-2 (85291) 171 | KRTAP4-7 (100132476) 172 | KRTAP9-2 (83899) 173 | KRTAP9-4 (85280) 174 | LBX1 (10660) 175 | LCT (3938) 176 | LGALS14 (56891) 177 | LHX3 (8022) 178 | LIM2 (3982) 179 | LORICRIN (4014) 180 | LRIT2 (340745) 181 | LYZL1 (84569) 182 | MAS1 (4142) 183 | MBD3L1 (85509) 184 | MBL2 (4153) 185 | MC3R (4159) 186 | MEP1A (4224) 187 | MEPE (56955) 188 | MMD2 (221938) 189 | MMP21 (118856) 190 | MMP27 (64066) 191 | MORC1 (27136) 192 | MRGPRX1 (259249) 193 | MRGPRX4 (117196) 194 | MS4A13 (503497) 195 | MSGN1 (343930) 196 | MTNR1B (4544) 197 | MUC7 (4589) 198 | MYBPC3 (4607) 199 | NANOGNB (360030) 200 | NCR2 (9436) 201 | NEUROD2 (4761) 202 | NEUROD6 (63974) 203 | NKX2-1 (7080) 204 | NLRP4 (147945) 205 | NLRP8 (126205) 206 | NMS (129521) 207 | NOX3 (50508) 208 | NPHS2 (7827) 209 | NPVF (64111) 210 | OC90 (729330) 211 | OLIG3 (167826) 212 | OPN5 (221391) 213 | OR10A4 (283297) 214 | OR10H1 (26539) 215 | OR10H3 (26532) 216 | OR10R2 (343406) 217 | OR11A1 (26531) 218 | OR12D3 (81797) 219 | OR13D1 (286365) 220 | OR1A1 (8383) 221 | OR1E1 (8387) 222 | OR1G1 (8390) 223 | OR1N2 (138882) 224 | OR2AT4 (341152) 225 | OR2C3 (81472) 226 | OR2D3 (120775) 227 | OR2G2 (81470) 228 | OR2H1 (26716) 229 | OR2L3 (391192) 230 | OR2T10 (127069) 231 | OR2T2 (401992) 232 | OR2T33 (391195) 233 | OR2T5 (401993) 234 | OR3A1 (4994) 235 | OR3A3 (8392) 236 | OR4C3 (256144) 237 | OR4D10 (390197) 238 | OR4D9 (390199) 239 | OR51B6 (390058) 240 | OR51F2 (119694) 241 | OR51V1 (283111) 242 | OR52A5 (390054) 243 | OR52B6 (340980) 244 | OR52L1 (338751) 245 | OR56A1 (120796) 246 | OR56B1 (387748) 247 | OR5C1 
(392391) 248 | OR5M1 (390168) 249 | OR5P2 (120065) 250 | OR5T1 (390155) 251 | OR5T3 (390154) 252 | OR5W2 (390148) 253 | OR6V1 (346517) 254 | OR7C2 (26658) 255 | OR7G2 (390882) 256 | OR8B8 (26493) 257 | OR8U1 (219417) 258 | OR9Q2 (219957) 259 | OTOP3 (347741) 260 | OTP (23440) 261 | PANX3 (116337) 262 | PAX4 (5078) 263 | PCARE (388939) 264 | PDE6H (5149) 265 | PDX1 (3651) 266 | PGK2 (5232) 267 | PGLYRP3 (114771) 268 | PIWIL3 (440822) 269 | PLA2G2F (64600) 270 | PNLIP (5406) 271 | PNPLA5 (150379) 272 | POTED (317754) 273 | POTEH (23784) 274 | POU4F3 (5459) 275 | PPP3R2 (5535) 276 | PRAMEF2 (65122) 277 | PRAMEF7 (441871) 278 | PRB4 (5545) 279 | PRDM14 (63978) 280 | PRDM9 (56979) 281 | PRLH (51052) 282 | PROP1 (5626) 283 | PRSS37 (136242) 284 | PRSS55 (203074) 285 | PSKH2 (85481) 286 | RAX (30062) 287 | RBM46 (166863) 288 | RBP3 (5949) 289 | RD3 (343035) 290 | REG3A (5068) 291 | RETNLB (84666) 292 | RFPL4B (442247) 293 | RHO (6010) 294 | RNASE11 (122651) 295 | RNASE13 (440163) 296 | RNASE9 (390443) 297 | RNF113B (140432) 298 | RP1 (6101) 299 | RPE65 (6121) 300 | RTP1 (132112) 301 | RXFP2 (122042) 302 | S100A7A (338324) 303 | SCGB1D1 (10648) 304 | SCRT2 (85508) 305 | SEC14L3 (266629) 306 | SEPTIN14 (346288) 307 | SERPINA9 (327657) 308 | SHCBP1L (81626) 309 | SIGLECL1 (284369) 310 | SLC10A2 (6555) 311 | SLC17A2 (10246) 312 | SLC18A3 (6572) 313 | SLC22A13 (9390) 314 | SLC22A6 (9356) 315 | SLC22A9 (114571) 316 | SLC25A31 (83447) 317 | SLC2A7 (155184) 318 | SLC34A1 (6569) 319 | SLC39A12 (221074) 320 | SLC6A5 (9152) 321 | SLC7A13 (157724) 322 | SLCO6A1 (133482) 323 | SOHLH1 (402381) 324 | SOX14 (8403) 325 | SPACA1 (81833) 326 | SPATA16 (83893) 327 | SPEM1 (374768) 328 | SPINK14 (408187) 329 | SPPL2C (162540) 330 | SSTR4 (6754) 331 | STPG4 (285051) 332 | SUN5 (140732) 333 | TAAR2 (9287) 334 | TAAR6 (319100) 335 | TAS1R2 (80834) 336 | TAS2R13 (50838) 337 | TAS2R39 (259285) 338 | TAS2R41 (259287) 339 | TAS2R43 (259289) 340 | TAS2R50 (259296) 341 | TAS2R7 (50837) 342 | TAS2R9 
(50835) 343 | TBC1D21 (161514) 344 | TBR1 (10716) 345 | TBXT (6862) 346 | TEX101 (83639) 347 | TEX45 (374877) 348 | TFAP2D (83741) 349 | TKTL2 (84076) 350 | TMEM132D (121256) 351 | TMEM174 (134288) 352 | TMEM225 (338661) 353 | TMPRSS11A (339967) 354 | TMPRSS11F (389208) 355 | TMPRSS15 (5651) 356 | TPD52L3 (89882) 357 | TPRX1 (284355) 358 | TREML4 (285852) 359 | TRIM40 (135644) 360 | TRIM43 (129868) 361 | TRIM60 (166655) 362 | TRIML1 (339976) 363 | TRPM1 (4308) 364 | TSBP1 (10665) 365 | TSHB (7252) 366 | TSPO2 (222642) 367 | TSSK1B (83942) 368 | TXNDC8 (255220) 369 | UBQLN3 (50613) 370 | UROC1 (131669) 371 | USP29 (57663) 372 | VAX1 (11023) 373 | VN1R4 (317703) 374 | VRTN (55237) 375 | WFDC10A (140832) 376 | WFDC9 (259240) 377 | ZG16 (653808) 378 | ZNF648 (127665) 379 | ZNF804B (219578) 380 | ZP2 (7783) 381 | ZSWIM2 (151112) 382 | -------------------------------------------------------------------------------- /Data/SampleData/AvanaSequenceMap.csv: -------------------------------------------------------------------------------- 1 | sequence_ID,ScreenID,days,pDNA_batch,Replicate,ScreenType,cell_line_name,ModelConditionID,Library,PassesQC 2 | HEL-311Cas9_RepA_p4_Avana-3,SC-000004.AV01,21,Avana-3,A,2DS,ACH-000004,MC-000004-pA3k,Avana,True 3 | HEL-311Cas9_RepB_p4_Avana-3,SC-000004.AV01,21,Avana-3,B,2DS,ACH-000004,MC-000004-pA3k,Avana,True 4 | KU812-311cas9-RepA-p6_Avana-3,SC-000074.AV01,21,Avana-3,A,2DS,ACH-000074,MC-000074-OKtM,Avana,True 5 | KU812-311cas9-RepB-p6_Avana-3,SC-000074.AV01,21,Avana-3,B,2DS,ACH-000074,MC-000074-OKtM,Avana,True 6 | T47D-311Cas9-RepA-p6_Avana-4,SC-000147.AV01,21,Avana-4,A,2DS,ACH-000147,MC-000147-Uovr,Avana,True 7 | T47D-311Cas9-RepB-p6_Avana-4,SC-000147.AV01,21,Avana-4,B,2DS,ACH-000147,MC-000147-Uovr,Avana,True 8 | NOMO-1-311Cas9_RepA_p4_Avana-2,SC-000168.AV01,21,Avana-2,A,2DS,ACH-000168,MC-000168-L3Ll,Avana,True 9 | NOMO-1-311Cas9_RepB_p4_Avana-2,SC-000168.AV01,21,Avana-2,B,2DS,ACH-000168,MC-000168-L3Ll,Avana,True 10 | 
L363-311Cas9_RepA_p6_Avana-3,SC-000183.AV01,21,Avana-3,A,2DS,ACH-000183,MC-000183-k64I,Avana,True 11 | L363-311Cas9_RepB_p6_Avana-3,SC-000183.AV01,21,Avana-3,B,2DS,ACH-000183,MC-000183-k64I,Avana,True 12 | KASUMI-1-311cas9_RepA_p6_Avana-3,SC-000263.AV01,21,Avana-3,A,2DS,ACH-000263,MC-000263-MxQI,Avana,True 13 | KASUMI-1-311cas9_RepB_p6_Avana-3,SC-000263.AV01,21,Avana-3,B,2DS,ACH-000263,MC-000263-MxQI,Avana,True 14 | NCI-H841-311as9_RepA_p6_Avana-3,SC-000292.AV01,21,Avana-3,A,2DS,ACH-000292,MC-000292-Oy94,Avana,True 15 | NCI-H841-311as9_RepB_p6_Avana-3,SC-000292.AV01,21,Avana-3,B,2DS,ACH-000292,MC-000292-Oy94,Avana,True 16 | DB-311Cas9_RepA_p3_Avana-3,SC-000334.AV01,21,Avana-3,A,2DS,ACH-000334,MC-000334-r0NH,Avana,True 17 | DB-311Cas9_RepB_p3_Avana-3,SC-000334.AV01,21,Avana-3,B,2DS,ACH-000334,MC-000334-r0NH,Avana,True 18 | J82-311Cas9_RepA_p5_Avana-3,SC-000396.AV01,21,Avana-3,A,2DS,ACH-000396,MC-000396-DZtc,Avana,True 19 | J82-311Cas9_RepB_p5_Avana-3,SC-000396.AV01,21,Avana-3,B,2DS,ACH-000396,MC-000396-DZtc,Avana,False 20 | K029AX-311cas9_RepB_p6_Avana-3,SC-000404.AV01,21,Avana-3,B,2DS,ACH-000404,MC-000404-GqDy,Avana,True 21 | U937-101Cas9 Rep A p6_Avana-3,SC-000406.AV01,21,Avana-3,A,2DS,ACH-000406,MC-000406-va7T,Avana,True 22 | U937-101Cas9 Rep C p6_Avana-3,SC-000406.AV01,21,Avana-3,C,2DS,ACH-000406,MC-000406-va7T,Avana,True 23 | U937-101Cas9 Rep D p6_Avana-3,SC-000406.AV01,21,Avana-3,D,2DS,ACH-000406,MC-000406-va7T,Avana,True 24 | NCI-H1915-311Cas9_RepA_p6_Avana-3,SC-000434.AV01,21,Avana-3,A,2DS,ACH-000434,MC-000434-8t7w,Avana,True 25 | NCI-H1915-311Cas9_RepB_p6_Avana-3,SC-000434.AV01,21,Avana-3,B,2DS,ACH-000434,MC-000434-8t7w,Avana,True 26 | CJM-311Cas9_RepA_p7_Avana-3,SC-000458.AV01,21,Avana-3,A,2DS,ACH-000458,MC-000458-xKvR,Avana,True 27 | CJM-311Cas9_RepB_p7_Avana-3,SC-000458.AV01,21,Avana-3,B,2DS,ACH-000458,MC-000458-xKvR,Avana,True 28 | KNS-81-311cas9_RepA_p5_Avana-3,SC-000479.AV01,21,Avana-3,A,2DS,ACH-000479,MC-000479-6qKr,Avana,True 29 | 
KNS-81-311cas9_RepB_p5_Avana-3,SC-000479.AV01,21,Avana-3,B,2DS,ACH-000479,MC-000479-6qKr,Avana,True 30 | OVISE-311cas9 Rep A p6_Avana-2,SC-000527.AV01,21,Avana-2,A,2DS,ACH-000527,MC-000527-f2HC,Avana,True 31 | OVISE-311cas9 Rep B p6_Avana-2,SC-000527.AV01,21,Avana-2,B,2DS,ACH-000527,MC-000527-f2HC,Avana,True 32 | OVISE-311cas9 Rep C p6_Avana-2,SC-000527.AV01,21,Avana-2,C,2DS,ACH-000527,MC-000527-f2HC,Avana,True 33 | TE6-311cas9_RepA_p6_Avana-3,SC-000605.AV01,21,Avana-3,A,2DS,ACH-000605,MC-000605-mA6N,Avana,True 34 | TE6-311cas9_RepB_p6_Avana-3,SC-000605.AV01,21,Avana-3,B,2DS,ACH-000605,MC-000605-mA6N,Avana,True 35 | RVH421-311Cas9_RepA_p5_Avana-3,SC-000614.AV01,21,Avana-3,A,2DS,ACH-000614,MC-000614-7X1Q,Avana,True 36 | RVH421-311Cas9_RepB_p5_Avana-3,SC-000614.AV01,21,Avana-3,B,2DS,ACH-000614,MC-000614-7X1Q,Avana,True 37 | A2780-311cas9 Rep A p6_Avana-2,SC-000657.AV01,21,Avana-2,A,2DS,ACH-000657,MC-000657-vN70,Avana,True 38 | A2780-311cas9 Rep B p6_Avana-2,SC-000657.AV01,21,Avana-2,B,2DS,ACH-000657,MC-000657-vN70,Avana,True 39 | LOXIMVI-311Cas9_RepA_p6_Avana-2,SC-000750.AV01,21,Avana-2,A,2DS,ACH-000750,MC-000750-bafP,Avana,False 40 | LOXIMVI-311Cas9_RepB_p6_Avana-2,SC-000750.AV01,21,Avana-2,B,2DS,ACH-000750,MC-000750-bafP,Avana,False 41 | RERF-LC-Ad2-311cas9_RepA_p6_Avana-3,SC-000774.AV01,21,Avana-3,A,2DS,ACH-000774,MC-000774-GTYc,Avana,True 42 | RERF-LC-Ad2-311cas9_RepB_p6_Avana-3,SC-000774.AV01,21,Avana-3,B,2DS,ACH-000774,MC-000774-GTYc,Avana,True 43 | LXF-289-311cas9 Rep B p6_Avana-3,SC-000787.AV01,21,Avana-3,B,2DS,ACH-000787,MC-000787-yzmP,Avana,True 44 | LXF-289-311cas9 Rep C p6_Avana-3,SC-000787.AV01,21,Avana-3,C,2DS,ACH-000787,MC-000787-yzmP,Avana,True 45 | LXF-289-311cas9 Rep D p6_Avana-3,SC-000787.AV01,21,Avana-3,D,2DS,ACH-000787,MC-000787-yzmP,Avana,True 46 | A2058-311cas9_RepA_p6_Avana-3,SC-000788.AV01,21,Avana-3,A,2DS,ACH-000788,MC-000788-xvTl,Avana,True 47 | 
A2058-311cas9_RepB_p6_Avana-3,SC-000788.AV01,21,Avana-3,B,2DS,ACH-000788,MC-000788-xvTl,Avana,True 48 | NCI-H2286-311caa9_RepA_p6_Avana-3,SC-000912.AV01,21,Avana-3,A,2DS,ACH-000912,MC-000912-cLYP,Avana,True 49 | NCI-H2286-311caa9_RepB_p6_Avana-3,SC-000912.AV01,21,Avana-3,B,2DS,ACH-000912,MC-000912-cLYP,Avana,True 50 | MDST8-311Cas9 Rep A p6_Avana-3,SC-000935.AV01,21,Avana-3,A,2DS,ACH-000935,MC-000935-E55p,Avana,True 51 | MDST8-311Cas9 Rep C p6_Avana-3,SC-000935.AV01,21,Avana-3,C,2DS,ACH-000935,MC-000935-E55p,Avana,True 52 | MDST8-311Cas9 Rep D p6_Avana-3,SC-000935.AV01,21,Avana-3,D,2DS,ACH-000935,MC-000935-E55p,Avana,True 53 | PF382-311CAS9_RepA_p6_Avana-3,SC-000937.AV01,21,Avana-3,A,2DS,ACH-000937,MC-000937-QPMF,Avana,True 54 | PF382-311CAS9_RepB_p6_Avana-3,SC-000937.AV01,21,Avana-3,B,2DS,ACH-000937,MC-000937-QPMF,Avana,True 55 | HEC265-311Cas9_RepA_p6_Avana-3,SC-000946.AV01,21,Avana-3,A,2DS,ACH-000946,MC-000946-Imv7,Avana,True 56 | HEC265-311Cas9_RepB_p6_Avana-3,SC-000946.AV01,21,Avana-3,B,2DS,ACH-000946,MC-000946-Imv7,Avana,True 57 | SNU407-311Cas9_RepB_p6_Avana-4,SC-000955.AV01,21,Avana-4,B,2DS,ACH-000955,MC-000955-q7cl,Avana,False 58 | MFE-319-311Cas9_RepA_p6_Avana-3,SC-000988.AV01,21,Avana-3,A,2DS,ACH-000988,MC-000988-YnfA,Avana,True 59 | MFE-319-311Cas9_RepB_p6_Avana-3,SC-000988.AV01,21,Avana-3,B,2DS,ACH-000988,MC-000988-YnfA,Avana,True 60 | JHUEM7-311Cas9_RepA_p5_Avana-3,SC-000993.AV01,21,Avana-3,A,2DS,ACH-000993,MC-000993-g8KR,Avana,True 61 | JHUEM7-311Cas9_RepB_p5_Avana-3,SC-000993.AV01,21,Avana-3,B,2DS,ACH-000993,MC-000993-g8KR,Avana,True 62 | HEC-251-311Cas9_RepA_p5_Avana-3,SC-000996.AV01,21,Avana-3,A,2DS,ACH-000996,MC-000996-nXxW,Avana,True 63 | HEC-251-311Cas9_RepB_p5_Avana-3,SC-000996.AV01,21,Avana-3,B,2DS,ACH-000996,MC-000996-nXxW,Avana,True 64 | MONO-MAC1-311cas9_RepA_p6_Avana-3,SC-001129.AV01,21,Avana-3,A,2DS,ACH-001129,MC-001129-nxTf,Avana,True 65 | 
MONO-MAC1-311cas9_RepB_p6_Avana-3,SC-001129.AV01,21,Avana-3,B,2DS,ACH-001129,MC-001129-nxTf,Avana,True 66 | NB1643-311Cas9_RepA_p4_Avana-3,SC-001303.AV01,21,Avana-3,A,2DS,ACH-001303,MC-001303-7eBW,Avana,True 67 | NB1643-311Cas9_RepB_p4_Avana-3,SC-001303.AV01,21,Avana-3,B,2DS,ACH-001303,MC-001303-7eBW,Avana,True 68 | UMUC5-311Cas9_RepA_p6_Avana-3,SC-001411.AV01,21,Avana-3,A,2DS,ACH-001411,MC-001411-EIzh,Avana,True 69 | UMUC5-311Cas9_RepB_p6_Avana-3,SC-001411.AV01,21,Avana-3,B,2DS,ACH-001411,MC-001411-EIzh,Avana,True 70 | KARPAS1718-311cas9-RepA-p6_Avana-4,SC-001533.AV01,21,Avana-4,A,2DS,ACH-001533,MC-001533-SowA,Avana,True 71 | KARPAS1718-311cas9-RepB-p6_Avana-4,SC-001533.AV01,21,Avana-4,B,2DS,ACH-001533,MC-001533-SowA,Avana,True 72 | MEL202-311cas9_RepA_p6_Avana-3,SC-001554.AV01,21,Avana-3,A,2DS,ACH-001554,MC-001554-W5Vn,Avana,True 73 | MEL202-311cas9_RepB_p6_Avana-3,SC-001554.AV01,21,Avana-3,B,2DS,ACH-001554,MC-001554-W5Vn,Avana,True 74 | MM127-311cas9_RepA_p5_Avana-3,SC-001563.AV01,21,Avana-3,A,2DS,ACH-001563,MC-001563-W7yA,Avana,True 75 | MM127-311cas9_RepB_p5_Avana-3,SC-001563.AV01,21,Avana-3,B,2DS,ACH-001563,MC-001563-W7yA,Avana,True 76 | RAMOS-311cas9-RepA-p6_Avana-4,SC-001636.AV01,21,Avana-4,A,2DS,ACH-001636,MC-001636-mxpz,Avana,False 77 | RAMOS-311cas9-RepB-p6_Avana-4,SC-001636.AV01,21,Avana-4,B,2DS,ACH-001636,MC-001636-mxpz,Avana,False 78 | TGW-311Cas9-RepA-P6_Avana-4,SC-001674.AV01,21,Avana-4,A,2DS,ACH-001674,MC-001674-vHBi,Avana,True 79 | TGW-311Cas9-RepB-P6_Avana-4,SC-001674.AV01,21,Avana-4,B,2DS,ACH-001674,MC-001674-vHBi,Avana,True 80 | HB11;19-311CAS9_RepA_p6_Avana-3,SC-001736.AV01,21,Avana-3,A,2DS,ACH-001736,MC-001736-pJnl,Avana,True 81 | HB11;19-311CAS9_RepB_p6_Avana-3,SC-001736.AV01,21,Avana-3,B,2DS,ACH-001736,MC-001736-pJnl,Avana,True 82 | ICC108-311cas9-RepA-p4_Avana-3,SC-001836.AV01,21,Avana-3,A,2DS,ACH-001836,MC-001836-jYoz,Avana,True 83 | ICC108-311cas9-RepB-p4_Avana-3,SC-001836.AV01,21,Avana-3,B,2DS,ACH-001836,MC-001836-jYoz,Avana,True 84 | 
ICC2-311cas9_RepA_p6_Avana-3,SC-001842.AV01,21,Avana-3,A,2DS,ACH-001842,MC-001842-XQ3q,Avana,True 85 | ICC2-311cas9_RepB_p6_Avana-3,SC-001842.AV01,21,Avana-3,B,2DS,ACH-001842,MC-001842-XQ3q,Avana,True 86 | ICC3-311cas9_RepA_p6_Avana-3,SC-001843.AV01,21,Avana-3,A,2DS,ACH-001843,MC-001843-hmPS,Avana,True 87 | ICC3-311cas9_RepB_p6_Avana-3,SC-001843.AV01,21,Avana-3,B,2DS,ACH-001843,MC-001843-hmPS,Avana,True 88 | SAS-311cas9-RepA-p6_Avana-4,SC-002029.AV01,21,Avana-4,A,2DS,ACH-002029,MC-002029-jZk3,Avana,True 89 | SAS-311cas9-RepB-p6_Avana-4,SC-002029.AV01,21,Avana-4,B,2DS,ACH-002029,MC-002029-jZk3,Avana,True 90 | RVH421RPMI-311Cas9-RepA-p6_Avana-4,SC-002875.AV01,21,Avana-4,A,2DS,ACH-000614,MC-002875-AKge,Avana,True 91 | RVH421RPMI-311Cas9-RepB-p6_Avana-4,SC-002875.AV01,21,Avana-4,B,2DS,ACH-000614,MC-002875-AKge,Avana,True 92 | UPMD1-311cas9-RepA-p6_Avana-4,SC-002926.AV01,21,Avana-4,A,2DS,ACH-002926,MC-002926-BDYS,Avana,True 93 | UPMD1-311cas9-RepB-p6_Avana-4,SC-002926.AV01,21,Avana-4,B,2DS,ACH-002926,MC-002926-BDYS,Avana,True 94 | pDNA_batch_Avana-4,pDNA,0,Avana-4,,pDNA,pDNA,pDNA,Avana,True 95 | pDNA_batch_Avana-3,pDNA,0,Avana-3,,pDNA,pDNA,pDNA,Avana,True 96 | pDNA_batch_Avana-2,pDNA,0,Avana-2,,pDNA,pDNA,pDNA,Avana,True 97 | -------------------------------------------------------------------------------- /chronos/figshare.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import json 3 | import os 4 | import requests 5 | import zipfile 6 | 7 | from requests.exceptions import HTTPError 8 | 9 | # CONSTANTS 10 | BASE_URL = 'https://api.figshare.com/v2/{endpoint}' 11 | CHUNK_SIZE = 10485760 12 | 13 | # FIGSHARE 14 | FIGSHARE_TOKEN = 'c727b3826353164e0ae4ba35c9325ee339597f2ca66c3ab4349394cc0bcf2662b91f0a3e722f41b0d26152c65ed63692c834a18be6d7df8141d16e3fed834aa4' 15 | FIGSHARE_ID = 21411663 16 | MODEL_NAME = 'mini_model.zip' 17 | # ^ credentials for smaffa and test article id 18 | 19 | 20 | # FIGSHARE_TOKEN = 21 | # 
FIGSHARE_ID = (14067047?) 22 | # MODEL_NAME = 23 | 24 | 25 | ##### GENERAL API UTILS ##### 26 | 27 | def raw_issue_request(method, url, data=None, binary=False): 28 | ''' 29 | Helper for issuing an HTTPS request 30 | ''' 31 | headers = {'Authorization': 'token ' + FIGSHARE_TOKEN} 32 | if data is not None and not binary: 33 | data = json.dumps(data) 34 | response = requests.request(method, url, headers=headers, data=data) 35 | try: 36 | response.raise_for_status() 37 | try: 38 | data = json.loads(response.content) 39 | except ValueError: 40 | data = response.content 41 | except HTTPError as error: 42 | print('Caught an HTTPError: {}'.format(error.message)) 43 | print('Body:\n', response.content) 44 | raise 45 | 46 | return data 47 | 48 | 49 | def issue_request(method, endpoint, *args, **kwargs): 50 | ''' 51 | Formats an HTTPS request 52 | ''' 53 | return raw_issue_request(method, BASE_URL.format(endpoint=endpoint), *args, **kwargs) 54 | 55 | 56 | ##### UPLOADING ##### 57 | 58 | def zip_chronos_model(path, archive_name=None): 59 | ''' 60 | Zip the necessary files for storing a Chronos model 61 | ''' 62 | files_in_path = os.listdir(path) 63 | necessary_files = ['chronos_ge_unscaled.hdf5', 64 | 'guide_efficacy.csv', 65 | 'cell_line_efficacy.csv', 66 | 'screen_delay.csv', 67 | 'library_effect.csv'] 68 | for filename in necessary_files: 69 | assert filename in files_in_path, "Cannot locate file {} in target directory {}".format(filename, path) 70 | 71 | if archive_name is None: 72 | archive_name = path.rstrip('/') 73 | 74 | with zipfile.ZipFile(archive_name + '.zip', mode='w', compression=zipfile.ZIP_DEFLATED) as ziph: 75 | for filename in necessary_files: 76 | ziph.write(os.path.join(path, filename), 77 | os.path.relpath(os.path.join(archive_name, filename), 78 | os.path.join(path, '..'))) 79 | return archive_name + '.zip' 80 | 81 | 82 | def list_files_of_article(article_id, private=True): 83 | ''' 84 | List all the files present in a figshare article 85 | ''' 86 | if 
private: 87 | result = issue_request('GET', 'account/articles/{}/files'.format(article_id)) 88 | else: 89 | result = issue_request('GET', 'articles/{}/files'.format(article_id)) 90 | print('Listing files for article {}:'.format(article_id)) 91 | if result: 92 | for item in result: 93 | print(' {id} - {name}'.format(**item)) 94 | else: 95 | print(' No files.') 96 | 97 | 98 | def create_article(title): 99 | ''' 100 | Make a new figshare article 101 | ''' 102 | data = { 103 | 'title': title 104 | } 105 | result = issue_request('POST', 'account/articles', data=data) 106 | print('Created article:', result['location'], '\n') 107 | 108 | result = raw_issue_request('GET', result['location']) 109 | 110 | return result['id'] 111 | 112 | 113 | def get_file_check_data(file_name): 114 | ''' 115 | Ensure file can be streamed for upload 116 | ''' 117 | with open(file_name, 'rb') as fin: 118 | md5 = hashlib.md5() 119 | size = 0 120 | data = fin.read(CHUNK_SIZE) 121 | while data: 122 | size += len(data) 123 | md5.update(data) 124 | data = fin.read(CHUNK_SIZE) 125 | return md5.hexdigest(), size 126 | 127 | 128 | def initiate_new_upload(article_id, file_name): 129 | ''' 130 | Initiate the upload process for a file 131 | ''' 132 | endpoint = 'account/articles/{}/files' 133 | endpoint = endpoint.format(article_id) 134 | 135 | md5, size = get_file_check_data(file_name) 136 | data = {'name': os.path.basename(file_name), 137 | 'md5': md5, 138 | 'size': size} 139 | 140 | result = issue_request('POST', endpoint, data=data) 141 | print('Initiated file upload:', result['location'], '\n') 142 | 143 | result = raw_issue_request('GET', result['location']) 144 | 145 | return result 146 | 147 | 148 | def complete_upload(article_id, file_id): 149 | ''' 150 | Complete the file upload 151 | ''' 152 | issue_request('POST', 'account/articles/{}/files/{}'.format(article_id, file_id)) 153 | 154 | 155 | def upload_parts(file_info, file_name): 156 | ''' 157 | Uploads an entire file in chunks 158 | ''' 159 
| url = '{upload_url}'.format(**file_info) 160 | result = raw_issue_request('GET', url) 161 | 162 | print('Uploading parts:') 163 | with open(file_name, 'rb') as fin: 164 | for part in result['parts']: 165 | upload_part(file_info, fin, part) 166 | 167 | 168 | def upload_part(file_info, stream, part): 169 | ''' 170 | Uploads a single chunk of a file 171 | ''' 172 | udata = file_info.copy() 173 | udata.update(part) 174 | url = '{upload_url}/{partNo}'.format(**udata) 175 | 176 | stream.seek(part['startOffset']) 177 | data = stream.read(part['endOffset'] - part['startOffset'] + 1) 178 | 179 | raw_issue_request('PUT', url, data=data, binary=True) 180 | print(' Uploaded part {partNo} from {startOffset} to {endOffset}'.format(**part)) 181 | 182 | 183 | def upload(file_path, article_id=None, article_title=None, overwrite=False): 184 | ''' 185 | Uploads a local file to the specified article, or creates a new article with the file 186 | ''' 187 | # create article if not exists 188 | if article_id is None: 189 | assert article_title is not None, 'No article_id supplied, please provide a title for the new dataset or specify the id of an existing one' 190 | article_id = create_article(article_title) 191 | else: 192 | # check if file exists 193 | response = issue_request('GET', 'account/articles/{article_id}'.format(article_id=article_id)) 194 | file_list = response['files'] 195 | 196 | for file_info in file_list: 197 | if file_info['name'] == os.path.basename(file_path): 198 | if overwrite: 199 | # Delete the existing file first 200 | issue_request('DELETE', 'account/articles/{article_id}/files/{file_id}'.format(article_id=article_id, file_id=file_info['id'])) 201 | else: 202 | # Throw an error 203 | raise ValueError('{} exists in figshare article'.format(os.path.basename(file_path))) 204 | 205 | 206 | # Upload the file 207 | file_info = initiate_new_upload(article_id, file_path) 208 | upload_parts(file_info, file_path) 209 | complete_upload(article_id, file_info['id']) 210 | 
211 | list_files_of_article(article_id) 212 | 213 | 214 | ##### DOWNLOADING ##### 215 | 216 | def unzip(archive_path, target_path=None): 217 | ''' 218 | Unzips a zip archive into the target directory 219 | ''' 220 | if target_path is None: 221 | target_path = os.path.dirname(archive_path) 222 | 223 | with zipfile.ZipFile(archive_path, 'r') as ziph: 224 | ziph.extractall(target_path) 225 | 226 | return os.path.join(target_path, os.path.basename(archive_path).rstrip('.zip')) 227 | 228 | 229 | def download_files_from_article(article_id, target_directory=None, fileset=None, private=False): 230 | ''' 231 | Downloads files from a public (or private) Figshare article 232 | Parameters: 233 | article_id (`str` or `int`): identifier for a Figshare dataset 234 | target_directory (`str`): the location to download files into; if None, creates a local directory named by article_id 235 | fileset (iterable): Figshare file ids or names to download 236 | ''' 237 | 238 | if private: # for test purposes 239 | response = issue_request('GET', 'account/articles/{article_id}'.format(article_id=article_id)) 240 | else: 241 | response = issue_request('GET', 'articles/{article_id}'.format(article_id=article_id)) 242 | 243 | headers = {'Authorization': 'token ' + FIGSHARE_TOKEN} 244 | 245 | file_list = response['files'] 246 | 247 | if target_directory is None: # save the downloads by the article id 248 | target_directory = 'figshare_{}'.format(article_id) 249 | if not os.path.exists(target_directory): 250 | os.makedirs(target_directory) 251 | 252 | for file_info in file_list: 253 | if file_info['id'] in fileset or file_info['name'] in fileset: 254 | r = requests.get('https://ndownloader.figshare.com/files/{file_id}'.format(file_id=file_info['id']), 255 | allow_redirects=True, headers=headers) 256 | with open(os.path.join(target_directory, file_info['name']), 'wb') as f: 257 | for chunk in r.iter_content(1024): 258 | f.write(chunk) 259 | print('Downloaded {} from article 
{}'.format(file_info['name'], article_id)) 260 | print('Downloads are located at {}/'.format(target_directory)) 261 | 262 | 263 | # def download_files_from_article(article_id, target_directory=None, fileset=None): 264 | # ''' 265 | # Downloads files from a public Figshare article 266 | # Parameters: 267 | # article_id (`str` or `int`): identifier for a Figshare dataset 268 | # target_directory (`str`): the location to download files into; if None, creates a local directory named by article_id 269 | # fileset (iterable): Figshare file ids or names to download 270 | # ''' 271 | 272 | # response = issue_request('GET', 'articles/{article_id}'.format(article_id=article_id)) 273 | 274 | # headers = {'Authorization': 'token ' + FIGSHARE_TOKEN} 275 | 276 | # file_list = response['files'] 277 | 278 | # if target_directory is None: # save the downloads by the article id 279 | # target_directory = 'figshare_{}'.format(article_id) 280 | # if not os.path.exists(target_directory): 281 | # os.makedirs(target_directory) 282 | 283 | # for file_info in file_list: 284 | # if file_info['id'] in fileset or file_info['name'] in fileset: 285 | # r = requests.get('https://ndownloader.figshare.com/files/{file_id}'.format(file_id=file_info['id']), 286 | # allow_redirects=True, headers=headers) 287 | # with open(os.path.join(target_directory, file_info['name']), 'wb') as f: 288 | # for chunk in r.iter_content(1024): 289 | # f.write(chunk) 290 | # print('Downloaded {} from article {}'.format(file_info['name'], article_id)) 291 | # print('Downloads are located at {}'.format(target_directory)) 292 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Chronos: an algorithm for inferring gene fitness effects from CRISPR knockout experiments. 
2 | 3 | A full description and benchmarking of Chronos 1 are available in a publication: https://doi.org/10.1186/s13059-021-02540-7 4 | 5 | An additional preprint describing the changes made to Chronos 2 will be released when the underlying data is public, expected 2024. 6 | 7 | NEW IN 2.3: The Chronos hit-calling module now allows you to assess significance and control false discovery. See details in the preprint: https://doi.org/10.1101/2025.04.24.650434. Examples are given in Vignette.py. 8 | 9 | # When to use it 10 | Chronos is well suited for any CRISPR KO experiment where: 11 | - You measured initial pDNA sgRNA readcounts and readcounts at one or more later time points. 12 | - You might have one or more cell lines. 13 | - You might have one library, or be combining data from multiple libraries. 14 | - Genome-wide or sub-genome coverage. 15 | - You expect most cells to be proliferating. 16 | - You expect the majority of gene knockouts to have little to no effect on proliferation. 17 | - You might or might not have copy number data for your cell lines. 18 | - You might be using CRISPRko or CRISPRi. Chronos was developed for CRISPRko, but should work for CRISPRi experiments. 19 | - You might want to compare the effects of gene knockouts between screens in two different conditions, for example, treated and untreated. However, please note that to estimate the significance of the difference in effects, Chronos requires two independent biological replicates per condition (ideally infected separately) 20 | 21 | Chronos may not work well for: 22 | - RNAi experiments. Chronos makes biological assumptions that are fundamentally incompatible with RNAi. Try DEMETER 2. 23 | - Rescue experiments. If most cells are dying, we can't offer any guarantees of Chronos' performance. 24 | - A focused essential gene library, for the same reason. 25 | - Multi-condition experiments where your only control is a late time point (such as DMSO). Chronos requires pDNA abundance. 
26 | 27 | We strongly recommend having at least two sgRNAs per gene. This is true regardless of the algorithm you use. 28 | 29 | Chronos is competitive with or superior to the other CRISPR algorithms we tested given readcounts from only one late time point, but it will perform even better with multiple late time points if your experiment has them. 30 | 31 | 32 | # Installation 33 | 34 | ## Note on Mac M1 chips 35 | 36 | As of 09/01/2023, `pip install tensorflow` should work on Macs with arm64. 37 | 38 | ## Installing Chronos 39 | 40 | If you have `pip` installed, you can install Chronos from PyPI with 41 | 42 | ` $ pip install crispr_chronos` 43 | 44 | However, we recommend downloading this repository as well to run the vignette and download the DepMap trained Chronos parameters. 45 | 46 | Chronos `model` requires `python 3` with the packages `tensorflow 2.x`, `numpy`, `pandas`,`h5py`. However, additional modules require additional packages which will be installed by default if missing: `patsy`, `statsmodels`, `scipy`, `matplotlib`, `seaborn`, `adjust_text`, `scikit-learn`, `umap`, `reportlab`. 47 | 48 | # Getting Started 49 | If you have jupyter notebook, you should run through `Vignette.ipynb`. This will both verify that you have a working installation and demonstrate a typical workflow for Chronos. Chronos is meant to be run in a python environment. 50 | 51 | To run Chronos, you need a minimum of three Pandas dataframes: 52 | 53 | 1. _readcounts_: A matrix of raw readcounts, where the columns are targeting sgRNAs, the rows are pDNA sequencing samples or replicate samples, and the entries are the number of reads of the given sgRNA in the given sample. Notice that in Chronos matrices, GUIDES and GENES are always COLUMNS and SAMPLES are always ROWS. Readcounts can have null values as long as no column or row is entirely null. 54 | 55 | 2. 
_sequence_map_: A table with at least four columns, `sequence_ID`, `cell_line_name`, `pDNA_batch`, and `days`, mapping sequencing samples to cell lines and pDNA measurements. `sequence_ID` should match the row names of the raw readcounts. `days` is the number of days between infection and when the sample was collected, should be integer or float. It will be ignored for pDNA samples. `cell_line_name` MUST be "pDNA" for pDNA samples. if, instead of pDNA, you are sequencing your cells at a very early time point to get initial library abundance, treat these as pDNA samples. If you don't have either, Chronos may not be the right algorithm for your experiment. `pDNA_batch` is needed when your experiment combines samples that have different pDNA references (within the same library). This is the case for Achilles because the PCR primer strategy has changed several times during the course of the experiment. pDNA samples belonging to the same batch will be combined into a single reference. If you don't have pDNA batches, just fill this column some value, such as "batch1". 56 | 57 | 3. _guide_gene_map_: A table with at least two columns, `sgrna` and `gene`, mapping the sgRNAs to genes. Chronos will not accept sgRNAs that map to more than one gene. This is intentional. `sgrna` entries should match the columns in raw readcounts. `gene` can be in any format. 58 | 59 | To benefit from improved normalization and allow Chronos to infer the overdispersion of screens, supplying a list or array of `negative_control_sgrnas` is also necessary. These are simply the sgRNAs which you believe should have no viability effect in any of your screens. It is much better to use cutting than noncutting controls, and as many as possible. 60 | 61 | We've found that a small number of clones in CRISPR cell lines will exhibit dramatic outgrowth that seems unrelated to the intended CRISPR perturbation. 
We recommend you remove these in place by running 62 | 63 | import chronos 64 | chronos.nan_outgrowths(readcounts, sequence_map, guide_gene_map) 65 | 66 | You can then initialize the Chronos model 67 | 68 | model = chronos.Chronos( 69 | readcounts={'my_library': readcounts}, 70 | sequence_map={'my_library': sequence_map}, 71 | guide_gene_map={'my_library': guide_gene_map}, 72 | negative_control_sgrnas={'my_library': negative_control_sgrnas} 73 | ) 74 | 75 | 76 | This odd syntax is used because it allows you to process results from different libraries at the same time. If you have libraries 1 and 2, and readcounts, sequence maps, guide maps, and negative control sgRNAs for them, you would initialize Chronos as such: 77 | 78 | model = chronos.Chronos( 79 | readcounts={'my_library1': readcounts1, 'my_library2': readcounts2}, 80 | sequence_map={'my_library': sequence_map, 'my_library2': sequence_map2}, 81 | guide_gene_map={'my_library': guide_gene_map, 'my_library2': guide_gene_map2}, 82 | negative_control_sgrnas={'my_library1': negative_control_sgrnas1, 'my_library2': negative_control_sgrnas2} 83 | ) 84 | 85 | Either way, you can then train Chronos by calling 86 | 87 | model.train() 88 | 89 | Once the model is trained, you can save all the parameters by calling 90 | 91 | model.save("my_save_directory") 92 | 93 | You can also directly access model parameters, for example: 94 | 95 | gene_effect = model.gene_effect 96 | guide_efficacy = model.guide_efficacy 97 | 98 | `gene_effect` is the primary attribute you will be interested in in 99% of use cases. It is a numerical matrix indexed on rows by `cell_line_name` and on columns by `gene`, with values indicating the _relative change in growth rate_ caused by successful knockout of the gene. 0 indicates no change, negative values a loss of viability, and positive values a gain of viability. 
NaNs in this matrix can occur because no sgRNAs targeting the gene 99 | 100 | Note some parameters will be dictionaries or tables, because they are learned separately per library. 101 | 102 | If you have labeled gene_level copy number data, Chronos has an option to correct the gene effect matrix. We recommend first globally normalizing the gene effect matrix so the median of all common essential gene scores is -1 and the median of all nonessential genes is 0. Unlike CERES outputs, we do NOT recommend normalizing per cell line. Chronos includes parameters like `cell_line_growth_rate` and `cell_line_efficacy` along with other regularization terms that help align data between cell lines. 103 | 104 | gene_effect -= gene_effect.reindex(columns=my_nonessential_gene_list).median(axix=1).median() 105 | gene_effect /= gene_effect.reindex(columns=my_essential_gene_list).median(axis=1).abs().median() 106 | gene_effect_corrected, shifts = chronos.alternate_cn(gene_effect, copy_number) 107 | chronos.write_hdf5(gene_effect_corrected, "my_save_directory/gene_effect.hdf5") 108 | 109 | The copy number matrix needs to be aligned to the gene_effect_matrix. Additionally, we assume that it is in the current CCLE format: log2(relative CN + 1), where CN 1 means the relative CN matches the reference. This may still work fine with CN with different units, but has not been tested. 110 | 111 | New functionality in Chronos 2.x includes two types of quality control reports, one you can run on your raw data, the other on the trained Chronos results, and the ability to load DepMap public Chronos runs and use the trained parameters for processing your own screens (if they are in a public DepMap library, currently just Avana and KY). See the vignette for details on how to do this. 112 | 113 | # Calling hits 114 | 115 | New functionality in Chronos 2.3.x includes the `hit_calling` module, which allows you to assess the statistical significance of Chronos results. 
See the preprint for a detailed explanation and benchmarking of the methods: https://doi.org/10.1101/2025.04.24.650434 116 | 117 | ## Identify significantly depleting knockouts 118 | To get empirical p-values that a gene knockout causes a true negative viability phenotype (requires a list of many negative control genes) from a gene effect matrix (which can be supplied by Chronos or any other algorithm, as long as negative = more dependent): 119 | 120 | from chronos.hit_calling import get_pvalue_dependent, get_fdr_from_pvalues 121 | pvalues = get_pvalue_dependent(gene_effect, negative_control_genes) 122 | fdr_from_pvalues = get_fdr_from_pvalues(pvalues) 123 | 124 | `hit_calling` also includes an empirical Bayesian method for controlling false discovery. This method generates posterior probabilities that a given gene effect score was generated from the distribution of positive control genes rather than the negative control genes - i.e., the probability that the cell line is dependent on the gene. 125 | 126 | from chronos.hit_calling import get_probability_dependent, get_fdr_from_probability 127 | probabilities = get_probability_dependent(gene_effect, negative_control_genes, positive_control_genes) 128 | fdr_from_probabilities = get_fdr_from_probabilities(probabilities) 129 | 130 | DepMap published `fdr_from_probabilities` every quarter as CRISPRGeneDependency. This method is generally preferable over the frequentist version since it is better-calibrated and produces good results even with relatively few controls (on the order of 10s), but it does require a good set of positive controls that represent the full range of expected dependent phenotypes. If you only include highly lethal knockouts in your positive control set, you should expect to be limited in detecting less extreme loss of viability phenotypes in other knockouts. 
131 | 132 | ## Comparing gene effect between two screening conditions 133 | 134 | A common experimental design involves running a CRISPR screen with the same library on the same cell line multiple times with some experimental condition changed - such as in the presence or absence of a drug, an isogenic perturbation, or a different growth condition. The `hit_calling.ConditionComparison` will report p-values for differences of viabiliy effects between any two conditions in such an experiment, *provided* you have at least two independent biological replicates for your condition. Initializing `ConditionComparison` is almost exactly the same as initializing a `Chronos` instance, except that instead of a `sequence_map`, you must provide a `condition_map` which has all of the same columns as `sequence_map`, plus a `replicate` column and a `condition` column. The `condition` column tells Chronos which replicates belong to which condition; you can choose any labels you like. The `replicate` column tells Chronos which late time points are different sequencing results of the same biological replicate. If you only have one late timepoint for each biological replicate, you can fill this column with any labels as long as they are unique to each row. `condition` and `replicate` for rows with `cell_line_name == "pDNA"` will be ignored. 135 | 136 | from chronos.hit_calling import ConditionComparison 137 | comparator = ConditionComparison( 138 | readcounts={"my_library": my_readcounts}, 139 | condition_map={"my_library": my_condition_map}, 140 | guide_gene_map={"my_library": my_guide_map}, 141 | negative_control_sgrnas={"my_library": my_negative_controls} 142 | ) 143 | 144 | You can also pass `negative_control_genes` instead of `negative_control_sgrnas`, and in fact this is recommended. 
If you do, you only pass one list rather than a dict of entries per library: 145 | 146 | comparator = ConditionComparison( 147 | readcounts={"my_library": my_readcounts}, 148 | condition_map={"my_library": my_condition_map}, 149 | guide_gene_map={"my_library": my_guide_map}, 150 | negative_control_genes=my_negative_control_genes 151 | ) 152 | 153 | To compare screens in DrugA to screens in Control, you would call 154 | 155 | drugA_vs_control_statistics = comparator.compare_conditions(conditions=("Control", "DrugA")) 156 | 157 | Of course, the two conditions named in `compare_conditions` must be present in `condition_map["condition"]`. 158 | 159 | Running `compare_conditions` requires Chronos to build and train at least four models, so expect this to take longer than a typical Chronos run. It will also be less verbose by default. Problems can arise when the "biological replicates" are not genuinely independent replicates - for example, if a single pool of cells was infected with the CRISPR library, then split into replicates, we've observed that even knockouts with no viability effects will be more correlated with their coinfected partners than with other replicates. Chronos will try to check for this and do its best to report and correct for problems. 160 | 161 | 162 | # Expected run times 163 | The full Achilles dataset takes 3-4 hours to run a gcloud VM with 52 GB of memory. Training the vignette in this package should take around 10 minutes on a typical laptop. 164 | 165 | # Other Chronos Options 166 | The Chronos model has a large number of hyperparameters which are described in the model code. Generally we advise against changing these. We've tested them in a wide variety of experimental settings and found the defaults work well. However, a few may be worth tweaking if you want to try and maximize performance. 
If you do choose to tune the hyperparameters, make sure you evaluate the results with a metric that captures what you really want to get out of the data. We decribe the hyperparameters that might be worth changing here. 167 | 168 | - `gene_effect_hierarchical` and `gene_effect_smoothing`: The first of these is a CERES style penalty that punishes gene effect scores in individual cell lines for deviating from the mean. The second punishes the deviation of a REGION of gene effect scores in a cell line from the mean, where a region is a contiguous block of genes arranged by their mean gene effect. Cranking up the first of these will reduce the variance within genes, potentially losing interesting differences between samples (but improving measures of control separation within samples). Cranking up the second can produce artifacts in the tails of gene effect, especially if `gene_effect_hierarchical` is too low. If you don't care about differences between samples, or have strong reason to believe all your samples should give the same results, you could consider increasing both of these. 169 | 170 | - `kernel_width`: this is the width of the gaussian kernel applied for `gene_effect_smoothing`. The number of genes used to calculation regional deviation from the mean for each gene will be 6x this number, 3x in each direction from the gene in question. Consider reducing this from its default value (50) for subgenome libraries. 171 | 172 | - `cell_efficacy_guide_quantile`: Chronos pre-estimates how efficacious a cell line is (you could think of this as related to Cas9 activity in the cell line). To do this, it looks at the nth percentile guide's log fold change and takes that as the maximum real depletion the cell line can achieve. If screening a small library, especially one highly biased towards essentials, you might consider increasing it from the default value of 0.01. 
173 | 174 | - `library_batch_reg`: this regularizes the mean gene effect within libraries towards the mean effect across libraries. Has no effect unless you have more than one library in the run. Note that this is one of two Chronos properties that removes library batch effects; the other is the internal matrix of `library_batch_effect`, which can't be turned off. If you think there should be real biological differences between your libraries, consider concatenating the input files into a single pseudolibrary. On the other hand, if you have two screen batches in the same library and you want to correct batch effects, you can split your screens into two pseudolibraries with the same sgRNAs in each. 175 | 176 | - `scale_cost`: amplifies or diminishes the cost function. Lowering this value effectively increases the strength of all regularization terms. 177 | 178 | 179 | # Tools that are useful outside of Chronos: 180 | 181 | ## Preprocessing tools: 182 | 183 | - `nan_outgrowths` will remove readcounts suspected to be caused by clonal outgrowth (see Michlits et. al., https://doi.org/10.1038/nmeth.4466 for a description of this phenomenon in CRISPR screens). 184 | 185 | - `normalize_readcounts` will sum pDNA measurements of the same pDNA batch, align the different batches by mode in log space, then align replicates to their pDNA batch by median abundance of the negative controls (if negative controls are supplied) 186 | 187 | - `calculate_fold_change` will convert a readcounts matrix into a fold change. 
Will use RPM normalization by default, which will undo the normalization in `normalize_readcounts` 188 | 189 | - `estimate_alpha` estimates the overdispersion parameter of the NB2 negative binomial counts model on a per-replicate basis using negative controls 190 | 191 | 192 | ## Postprocessing tools: 193 | 194 | - `alternate_CN`, a copy number correction method that accepts any gene effect matrix and a gene-level copy number matrix and returns a corrected gene effect matrix. 195 | 196 | 197 | ## QC reports (requires the matplotlib, seaborn, and reportlab packages): 198 | 199 | - `reports.qc_initial_data` takes in readcounts, a guide map, a sequence map, and optionally postive and negative control sgRNAs and provides a number of plots and metrics to assess the quality of CRISPR screen data. 200 | 201 | - `reports.qc_dataset` evaluates data quality after Chronos processing. You will want to call `.save` on your trained model to create a properly formatted directory to load with this function. Some aspects of the QC require omics data in various forms. See the vignette for a walkthrough. 202 | 203 | 204 | ## Generally useful functions: 205 | 206 | - `read_hdf5` and `write_hdf5` allow you to translate numerical matrices between pandas DataFrames and effiicient binary files. 207 | 208 | - `evaluations.fast_cor` efficiently computes the correlation matrix of one or two matrices (pandas DataFrames) with block null values. `evalautions.fast_cor_core` accepts numpy arrays as inputs instead. 209 | 210 | - `evaluations.nnmd`, `evaluations.auroc`, and `evaluations.pr_auc` compute control separation metrics 211 | 212 | - `plotting.density_scatter` produces a scatter plot with points colored by density, a trendline (much more efficient than seaborn's version), and optionally a diagonal, along with several options for labeling outlier points. 
213 | 214 | - `plotting.binplot` turns scatter data into a boxplot by binning one axis, which can reveal trends that are hard to see with scatter 215 | 216 | - `plotting.dict_plot` takes a dictionary of data and produces a subplot per entry, titled with its key. 217 | 218 | -------------------------------------------------------------------------------- /chronos/plotting.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import numpy as np 3 | import pandas as pd 4 | from warnings import warn 5 | 6 | from matplotlib import pyplot as plt 7 | from matplotlib import cm 8 | from matplotlib import colormaps 9 | from matplotlib.ticker import FormatStrFormatter 10 | from matplotlib.colors import Normalize, LogNorm 11 | from matplotlib.cm import ScalarMappable 12 | import matplotlib.patheffects as pe 13 | 14 | from scipy.interpolate import interpn 15 | from statsmodels.nonparametric.smoothers_lowess import lowess 16 | try: 17 | from adjustText import adjust_text 18 | adjust_text_present = True 19 | except: 20 | 21 | adjust_text_present = False 22 | 23 | 24 | def lowess_trend(x, y, frac=.25, max_points=2000, min_points=50, delta_frac=.01, **kwargs): 25 | ''' 26 | A wrapper for statsmodel's lowess with a somewhat more useful parameterization 27 | Parameters: 28 | `x`, `y`: the points. `y` will be smoothed as a function of `x`. 29 | `frac`: `float` in [0, 1]. The fraction of the points used for each linear regression. 30 | `min_points`: `int`. The maximum number of points to be used for each linear regression. 31 | Overrides `frac` when larger. 32 | `max_points`: `int`. The maximum number of points to be used for each linear regression. 33 | Overrides `frac` when smaller. 34 | `delta_frac`: the fraction of the range of `x` within which to use linear interpolation 35 | ` instead of a new regression. 36 | Other args passed to lowess. 37 | Returns: 38 | The unsorted smoothed y values. 
39 | ''' 40 | frac = min(max_points/len(x), frac) 41 | frac = max(frac, min_points/len(x)) 42 | frac = np.clip(frac, 0, 1) 43 | rng = x.max() - x.min() 44 | delta = min(delta_frac * rng, 50/len(x)*rng) 45 | delta = min(delta, rng) 46 | return lowess(y, x, frac, delta=delta, is_sorted=False, return_sorted=False, **kwargs) 47 | 48 | 49 | def identify_outliers_by_trend(x, y, n_outliers, y_trend=None, min_outlier_std=3, **kwargs): 50 | ''' 51 | Get the `n_outliers` farthest from the smoothed trend. y_trend is not supplied, it will be estimated 52 | using `lowess_trend`. 53 | Parameters: 54 | `x`, `y`: the points. `y` will be smoothed as a function of `x`. 55 | `n_outliers`: how many outliers to return. 56 | `y_trend`: The trend of y(x). If not provided, will be estimated by lowess. 57 | `min_outlier_std` (`float`): `y` must be at least this many standard deviations from `y_trend` 58 | (standard deviation measured as deviation from the trend) to be an outlier. 59 | Other args are passed to `lowess_trend`, if used. 60 | Returns: 61 | the numerical index of the outliers 62 | ''' 63 | if y_trend is None: 64 | y_trend = lowess_trend(x, y, **kwargs) 65 | 66 | diff = y-y_trend 67 | sd = np.std(diff) 68 | normed = np.abs(diff/sd) 69 | index = np.arange(len(normed)).astype(int) 70 | candidates = normed[normed > min_outlier_std] 71 | index = index[normed > min_outlier_std] 72 | order = np.argsort(candidates) 73 | return index[order[-n_outliers:]] 74 | 75 | 76 | def identify_outliers_by_density(x, y, density, n_outliers, candidate_density_quantile=.05, high_density_quantile=.5, 77 | max_candidates=500, max_high=10000): 78 | ''' 79 | Identify outliers in 2D space by point density. This is done by first identifying a set of candidate points of lowest 80 | density, then a set of points with high density, then choosing candidates that have the greatest minimum distance 81 | to any point with high density. 
82 | Parameters: 83 | `x`, `y`, `density`: 1D arrays giving the position of each point and the estimated density of points at that position 84 | `n_outliers`: how many outliers to return. If fewer candidates are found than the number of requested outliers, 85 | all candidates will be returned. 86 | `candidate_density_quantile`: the fraction of points to choose as possible outliers based on density 87 | `high_density_quantile`: the fraction of points to be treated as dense 88 | ` max_candidates`: overrides `candidate_density_quantile` if too many candidates are considered. Useful for very large datasets. 89 | ` max_high`: overrides `high_density_quantile` if too many high density points are considered. Useful for very large datasets. 90 | Returns: 91 | the numerical index of the outliers 92 | ''' 93 | if not (len(x)==len(y)==len(density)): 94 | raise ValueError("`x`, `y`, and `density` must have the same length") 95 | if candidate_density_quantile > high_density_quantile: 96 | raise ValueError("`candidate_density_quantile` must be less than `high_density_quantile`") 97 | candidate_density_quantile = min(candidate_density_quantile, max_candidates/len(x)) 98 | high_density_quantile = max(high_density_quantile, 1-max_high/len(x)) 99 | 100 | candidates = np.arange(len(density)).astype(int)[density < np.quantile(density, candidate_density_quantile)] 101 | high_density = density > np.quantile(density, high_density_quantile) 102 | x_diff = np.subtract.outer(x[candidates], x[high_density]) 103 | y_diff = np.subtract.outer(y[candidates], y[high_density]) 104 | r2 = np.square(x_diff) + np.square(y_diff) 105 | r2_min = r2.min(axis=1) 106 | farthest = np.argsort(r2_min)[-n_outliers:] 107 | return candidates[farthest] 108 | 109 | 110 | def identify_outliers_by_diagonal(x, y, n_outliers): 111 | ''' 112 | Identify points in 2D space as outliers by distance from the diagonal x==y, i.e. the points with the greatest difference 113 | between x and y. 
114 | Parameters: 115 | `x`, `y` : 1D arrays giving the position of each point 116 | `n_outliers`: how many outliers to return. If fewer candidates are found than the number of requested outliers, 117 | all candidates will be returned. 118 | Returns: 119 | the numerical index of the outliers 120 | ''' 121 | diff = np.abs(x - y) 122 | diff[pd.isnull(diff)] = 0 123 | order = np.argsort(diff) 124 | return order[-n_outliers:] 125 | 126 | 127 | def identify_outliers_by_zscore(x, y, n_outliers): 128 | ''' 129 | Identify points in 2D space as outliers by zscore. `x` and `y` are first zscored, then combined into a scaled Euclidian distance 130 | from the mean (`x**2 + y**2`). Those with the greatest distance are returned as outliers. 131 | Parameters: 132 | `x`, `y`: 1D arrays giving the position of each point 133 | `n_outliers`: how many outliers to return. If fewer candidates are found than the number of requested outliers, 134 | all candidates will be returned. 135 | Returns: 136 | the numerical index of the outliers 137 | ''' 138 | zx = np.abs(x - np.mean(x))/np.std(x) 139 | zy = np.abs(y - np.mean(y))/np.std(y) 140 | r = zx**2 + zy**2 141 | r[pd.isnull(r)] = 0 142 | order = np.argsort(r) 143 | return order[-n_outliers:] 144 | 145 | 146 | def identify_outliers_1d(x, n_outliers): 147 | ''' 148 | Identify points in 1D space as outliers by distance from median. 149 | Parameters: 150 | `x`: 1D array 151 | `n_outliers`: how many outliers to return. If fewer candidates are found than the number of requested outliers, 152 | all candidates will be returned. 
153 | Returns: 154 | the numerical index of the outliers 155 | ''' 156 | zx = np.abs(x - np.median(x)) 157 | order = np.argsort(zx) 158 | return order[-n_outliers:] 159 | 160 | 161 | def get_density(x, y, bins=50): 162 | ''' 163 | get the 2D density of the 1D arrays `x` and `y` using a histogram with n `bins` 164 | on each axis 165 | ''' 166 | try: 167 | data , x_e, y_e = np.histogram2d( x, y, bins = bins, density = True ) 168 | except ValueError as e: 169 | print(x) 170 | print(y) 171 | print(bins) 172 | raise e 173 | z = interpn( ( 0.5*(x_e[1:] + x_e[:-1]) , 0.5*(y_e[1:]+y_e[:-1]) ) , 174 | data , np.vstack([x,y]).T , 175 | method = "splinef2d", bounds_error = False 176 | ) 177 | 178 | #NaNs should have zero density 179 | z[np.where(np.isnan(z))] = 0.0 180 | z[z < 0] = 0 181 | return z 182 | 183 | 184 | def dict_plot(dictionary, plot_func, figure_width=7.5, min_subplot_width=3.74, 185 | aspect_ratio=.8, aliases={}, xlabel=None, ylabel=None, *args, **kwargs): 186 | ''' 187 | A utility for generating a figure with a subplot for each entry in `dictionary`. 188 | Parameters: 189 | `dictionary` (`dict`): The data to be plotted. The keys of the dictionary will be used 190 | as subplot titles. 191 | `plot_func` (callable): will be called as `plot_func(value, *args, **kwargs)` for each value in ` 192 | dictionary`. 193 | `figure_width` (`float`: total width of the figure 194 | `min_subplot_width` (`float`): when laying out subplots, how narrow they are allowed to be. 
def density_scatter(x, y, ax=None, sort=True, bins=50, trend_line=True, trend_line_args=dict(color='r'),
        lowess_args={}, diagonal=False, diagonal_kws=dict(color='black', lw=.3, linestyle='--'),
        c="density", cbar_label=None,
        label_specific=[], label_outliers=0, outliers_from='trend',
        label_kws=dict(
            fontsize=8, color=(.3, 0, 0),
            path_effects=[pe.withStroke(linewidth=1.25, foreground=(1, 1, 1))]
        ),
        outlier_scatter_kws=dict(color=(.8, .2, .1), s=10, linewidth=.6, edgecolor=[0, 0, 0]),
        adjust_text_kws={}, **kwargs ):
    """
    Adapted from Guillaume's answer at
    https://stackoverflow.com/questions/20105364/how-can-i-make-a-scatter-plot-colored-by-density-in-matplotlib
    Scatter plot colored by 2d histogram, with optional trend_line, diagonal, and outlier labeling
    Parameters:
        `x`, `y`: `pandas.Series` with overlapping indices or iterables of the same length. Values to plot on each axis.
        `ax` (`matplotlib.Axis`): if provided, draw plot to this
        `sort` (`bool`): if `True` (default), the densest points are plotted last.
        `bins` (`int`): How many bins to use in np.histogram2d for estimating density. Default 50.
        `trend_line` (`bool`): Whether to draw a lowess trend_line line
        `lowess_args` (`dict`): passed to `lowess_trend` for the trend_line line
        `trend_line_args` (`dict`): passed to `pyplot.plot` for the trend_line line
        `c` ("density" or array): if "density", points will be colored by the square root of point density in the plot.
            Otherwise, passed to `pyplot.scatter`.
        `diagonal` (`bool`): If true, draw a line on the diagonal
        `diagonal_kws` (`dict`): Passed to `pyplot.plot`. By default, draws a thin dashed black diagonal line.
        `label_outliers` (`int`): if > 0, the number of outliers to label with their index.
            If `trend_line`, the outliers will be identified by deviation from the trend.
        'outliers_from':
            'trend': outliers identified by distance from trend line
            'diagonal': outliers identified by difference between `x` and `y`
            'density': outliers identified by minimum distance to plot region of high density
            'xy_zscore': outliers identified by euclidian distance from zero in z-score space
            'x': outliers identified from the `x` values alone
            'y': outliers identified from the `y` values alone
        `label_kws` (`dict`): passed to `pyplot.text` for the labels
        'outlier_scatter_kws': passed to `pyplot.scatter` to plot over outliers
        **kwargs: additional arguments passed to `pyplot.scatter`.
    """
    if ax is None :
        fig = plt.gcf()
        ax = plt.gca()
    else:
        fig = ax.figure
    # If both inputs are Series, restrict to their shared index and remember it
    # so outliers can be labeled by index value rather than position.
    index = None
    if isinstance(x, pd.Series) and isinstance(y, pd.Series):
        x, y = x.align(y, join="inner")
        index = x.index
    if len(x) != len(y):
        raise ValueError("If not pd.Series, x and y must be the same length")
    # Drop pairs where either coordinate is missing; keep `index` in sync.
    mask = pd.notnull(x) & pd.notnull(y)
    x = np.array(x[mask]).astype(float)
    y = np.array(y[mask]).astype(float)
    if not index is None:
        index = index[mask]

    c_is_density = False
    if isinstance(c, str):
        if c == "density":
            c_is_density = True
        else:
            raise ValueError(f"if passed, `c` can't be {c}, only 'density' or iterable.")

    # Density (sqrt-compressed) is needed either for coloring or for
    # density-based outlier identification.
    if c_is_density or outliers_from == "density":
        z = get_density(x, y, bins)
        z = np.sqrt(z)

    if c_is_density:
        c = z
        if cbar_label is None:
            cbar_label = "Density (sqrt)"

    # Sort the points by c, so that the strongest points are plotted last
    if sort :
        idx = c.argsort()
        x, y, c = x[idx], y[idx], c[idx]
        if not index is None:
            index = index[idx]
        if c_is_density:
            z = z[idx]

    im = ax.scatter( x, y, c=c, **kwargs )

    # Build a standalone mappable for the colorbar matching the scatter's colormap.
    norm = Normalize(vmin = np.min(c), vmax = np.max(c))
    colormap = cm.ScalarMappable(norm = norm)
    colormap.set_array([])
    colormap.set_cmap(im.get_cmap())
    cbar = fig.colorbar(colormap, ax=ax)
    cbar.ax.set_ylabel(cbar_label)

    smoothed=None
    if trend_line:
        smoothed = lowess_trend(x, y, **lowess_args)
        xsort = np.argsort(x)
        ax.plot(x[xsort], smoothed[xsort], **trend_line_args)

    # Outliers are positional indices into the (masked, possibly sorted) x/y arrays.
    outliers = None
    if label_outliers:
        if outliers_from == 'trend':
            # NOTE(review): `smoothed` is None here when trend_line=False —
            # presumably identify_outliers_by_trend handles/raises on that; confirm.
            outliers = identify_outliers_by_trend(x, y, label_outliers, smoothed)
        elif outliers_from =='density':
            outliers = identify_outliers_by_density(x, y, z, label_outliers)
        elif outliers_from == 'diagonal':
            outliers = identify_outliers_by_diagonal(x, y, label_outliers)
        elif outliers_from == 'xy_zscore':
            outliers = identify_outliers_by_zscore(x, y, label_outliers)
        elif outliers_from == 'x':
            outliers = identify_outliers_1d(x, label_outliers)
        elif outliers_from == 'y':
            outliers = identify_outliers_1d(y, label_outliers)
        else:
            raise ValueError("`outliers_from` must be one of 'trend', 'density', 'diagonal', 'xy_zscore', 'x', or 'y'")

    # Convert caller-requested labels (index values) to positions, then merge
    # with automatically identified outliers.
    if len(label_specific) and not index is None:
        label_specific = [index.get_loc(v) for v in label_specific]

    if not outliers is None:
        label_specific = sorted(set(label_specific) | set(outliers))

    if not index is None:
        labels = index[label_specific]
    else:
        labels = label_specific

    if len(label_specific):

        label_x = np.array([x[label] for label in label_specific])
        label_y = np.array([y[label] for label in label_specific])
        plt.scatter(label_x, label_y, **outlier_scatter_kws)

        #prevent overlapping point text labels from failing to differentiate with adjust_text
        label_x_jittered = label_x + np.random.normal(size=len(label_x), scale=.05*(x.max() - x.min()))
        label_y_jittered = label_y + np.random.normal(size=len(label_y), scale=.05*(y.max() - y.min()))
        texts = [plt.text(s=labels[i], x=label_x_jittered[i], y=label_y_jittered[i], zorder=10, **label_kws)
            for i, val in enumerate(label_specific)]

        if adjust_text_present and len(texts) > 0:

            base_adjust_text_kws = dict(
                lim=500,
                target_x=label_x, target_y=label_y,
                arrowprops=dict(arrowstyle="-", color=[.7, .5, .5]),
                expand=(1.2, 1.4),
                force_explode=(.3, .5),
                avoid_self=True

            )
            # caller-supplied options win over the defaults above
            base_adjust_text_kws.update(adjust_text_kws)
            adjust_text(texts, **base_adjust_text_kws)

        elif len(texts) > 0:
            warn("adjustText not found. Install to have labels moved off points.")

    if not diagonal:
        return ax

    # Draw y=x spanning the union of both axes' current limits.
    minmin = min(ax.get_xlim()[0], ax.get_ylim()[0])
    maxmax = max(ax.get_xlim()[1], ax.get_ylim()[1])
    ax.plot([minmin, maxmax], [minmin, maxmax], **diagonal_kws)

    return ax
def binplot(x, y, binned_axis='x', nbins=10, endpoints=None, right=False, ax=None, colors=None, cbar_label='Number Samples', **kwargs):
    '''
    creates a plot with values binned into boxes along one axis.
    Params:
        x: iterable of numbers indicating position on x axis
        y: iterable of numbers indicating position on y axis.
        binned_axis (str): 'x' or 'y', the axis to bin ('x' default)
        nbins (int): number of discrete bins that will be created
        endpoints (None or tuple of two numbers): The right/top edge of the first bin and the left/bottom edge of the last bin. If provided,
            the first and last bins will include points in [-infinity, endpoints[0]] and [endpoints[1], +infinity] respectively. Other bins
            will be evenly spaced between them. If endpoints is None (default), bins will be evenly spaced between the minimum and maximum
            data points.
        right (bool): whether points falling on an edge are included in the left or right bin.
        ax (None or pyplot.Axis): axis to draw plot on (default or None draws to current axis)
        colors (None or str or iterable of RGBA values): color palette used to color the bins.
            If a str, treated as a matplotlib colormap name and boxes are shaded by bin count;
            if an iterable of colors, it is used as-is and no colorbar is drawn.
        cbar_label (str): label for the colorbar (only drawn when `colors` is a colormap name)
    Additional keyword arguments are passed to pyplot.boxplot.
    Returns:
        the pyplot.Axis drawn to
    Raises:
        ValueError if binned_axis is not 'x' or 'y', or if any point falls beyond the last bin edge.
    '''
    # Validate pairing BEFORE converting to arrays. (Previously this check sat
    # after the conversion, where isinstance(x, pd.Series) was always False.)
    if isinstance(x, pd.Series) and isinstance(y, pd.Series):
        x, y = x.align(y, join="inner")
        assert len(x) > 2, "x and y lack common indices"
    else:
        assert len(x) == len(y), "if x and y are not Series, they must be the same length"
    # Drop pairs with a missing coordinate.
    mask = pd.notnull(x) & pd.notnull(y)
    x = np.array(x)[mask]
    y = np.array(y)[mask]
    if colors is None:
        colors = "viridis"

    if binned_axis == 'x':
        unbinned = 'y'
        vert = True
    elif binned_axis == 'y':
        unbinned = 'x'
        vert = False
    else:
        raise ValueError("binned_axis must be 'x' or 'y'")

    df = pd.DataFrame({'x': x, 'y': y})

    # `bins` are the nbins+1 edges; `medians` are the bin centers used as the
    # boxplot positions. With explicit endpoints the outermost bins are open
    # (+/- inf) and get synthetic centers half a bin-width outside the edges.
    if endpoints is None:
        bins = np.linspace(df[binned_axis].min()-1e-12, df[binned_axis].max()+1e-12, nbins+1)
        space = bins[2] - bins[1]
        medians = .5*(bins[1:] + bins[:-1])
    else:
        bins = [-np.inf] + list(np.linspace(endpoints[0], endpoints[1], nbins-1)) + [np.inf]
        space = bins[2] - bins[1]
        medians = np.array(
            [bins[1] - .5*space] + list(.5*np.array(bins[2:-1]) + .5*np.array(bins[1:-2])) + [bins[-2] + .5*space]
        )

    digits = np.digitize(df[binned_axis], bins, right=right).astype(int)

    if any(digits > nbins):
        # Previously print + `assert False`; raise a real error instead
        # (assertions disappear under `python -O`).
        raise ValueError(
            "points fall beyond the last bin edge:\n%r"
            % df[binned_axis][digits > nbins]
        )
    # Replace each value by its bin center.
    df[binned_axis] = medians[digits-1]

    if ax is None:
        ax=plt.gca()
    else:
        plt.sca(ax)
    # Only draw boxes for non-empty bins.
    vals = sorted([val for val in sorted(medians) if (df[binned_axis] == val).sum() > 0])

    boxes = plt.boxplot(x=[df[df[binned_axis] == val][unbinned]
            for val in vals
        ],
        positions=vals, widths=[.9*space]*len(vals), patch_artist=True, vert=vert,

        **kwargs)

    # Shade boxes by (log-scaled) number of samples per bin.
    counts = df[binned_axis].value_counts().reindex(index=medians).fillna(0)
    normer = LogNorm(vmin=0)
    normer.autoscale(counts.values)
    cvals = normer(counts.values)
    cmap = None  # stays None when the caller supplied explicit colors
    if isinstance(colors, str):
        cmap = colormaps[colors]
        colors = [cmap(v) for v in cvals]
    for box, color in zip(boxes['boxes'], colors):
        box.set_facecolor(color)


    if binned_axis == 'x':
        ax.xaxis.set_major_formatter(FormatStrFormatter('%.2f'))
        if not endpoints is None:
            plt.xticks(bins[1:-1])
        else:
            plt.xticks(bins)
    else:
        ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
        if not endpoints is None:
            plt.yticks(bins[1:-1])
        else:
            plt.yticks(bins)
    if binned_axis == 'x':
        plt.xlim(bins[1] - 1.2*space, bins[-2] + 1.2 * space)
    else:
        plt.ylim(bins[1] - 1.2*space, bins[-2] + 1.2 * space)

    # The colorbar only makes sense when a colormap drove the shading.
    # (Previously `cmap` was a NameError here for list-valued `colors`,
    # silently hidden by a bare `except: pass`.)
    if cmap is not None:
        try:
            mappable=ScalarMappable(norm=normer, cmap=cmap)
            mappable.set_array(colors)
            plt.gcf().colorbar(mappable, ax=plt.gca(), label=cbar_label)
        except Exception:
            # best-effort: a failed colorbar should not kill the plot
            pass

    return ax
def load_chronos_data_for_qc(directory, gene_effect_file="gene_effect.hdf5"):
    '''
    Load a Chronos run saved to `directory` with the `Chronos.save` method into
    a `dict` suitable for passing to the qc report functions in this module.
    Parameters:
        `directory` (`str`): location of the saved run
        `gene_effect_file` (`str`): optionally specify a different file in the directory where gene effect is
            saved. This can be used to load a copy-number corrected version of the data. Must be in Chronos'
            h5 format.
    Returns:
        `dict` containing the results of the run with the keys expected by the qc report functions in this module.
    '''
    # One "<library>_sequence_map.csv" file exists per library in the run.
    libraries = []
    for fname in os.listdir(directory):
        if fname.endswith("sequence_map.csv"):
            libraries.append(fname.split('_')[0])

    def _path(fname):
        # all run artifacts live directly inside `directory`
        return os.path.join(directory, fname)

    # Global (non-per-library) outputs.
    data = {}
    data['gene_effect'] = read_hdf5(_path(gene_effect_file))
    data['library_effect'] = pd.read_csv(_path("library_effect.csv"), index_col=0)
    data['t0_offset'] = pd.read_csv(_path("t0_offset.csv"), index_col=0)
    data['guide_efficacy'] = pd.read_csv(_path("guide_efficacy.csv"), index_col=0)["efficacy"]
    data['replicate_efficacy'] = pd.read_csv(_path("replicate_efficacy.csv"), index_col=0)
    data['growth_rate'] = pd.read_csv(_path("growth_rate.csv"), index_col=0)

    # Per-library outputs, each stored as {library: value}.
    per_library_keys = ['readcounts', 'sequence_map', 'guide_map', 'excess_variance',
        'predicted_readcounts', 'predicted_logfoldchange']
    for key in per_library_keys:
        data[key] = {}
    for library in libraries:
        data['readcounts'][library] = read_hdf5(_path("%s_readcounts.hdf5" % library))
        data['sequence_map'][library] = pd.read_csv(_path("%s_sequence_map.csv" % library))
        data['guide_map'][library] = pd.read_csv(_path("%s_guide_gene_map.csv" % library))
        # excess variance for all libraries lives in one file, one column per library
        data['excess_variance'][library] = pd.read_csv(
            _path("screen_excess_variance.csv"), index_col=0)[library]
        data['predicted_readcounts'][library] = read_hdf5(_path("%s_predicted_readcounts.hdf5" % library))
        data['predicted_logfoldchange'][library] = read_hdf5(_path("%s_predicted_lfc.hdf5" % library))

    # Observed log fold change is derived from readcounts rather than stored.
    data["logfoldchange"] = {}
    for library in libraries:
        fc = calculate_fold_change(
            data["readcounts"][library],
            data["sequence_map"][library],
            rpm_normalize=False
        )
        data['logfoldchange'][library] = pd.DataFrame(
            np.log2(fc.values),
            index=fc.index, columns=fc.columns
        )
    return data
def get_naive(data):
    '''
    Computes naive gene effect per library by taking the median
    over guides/gene and then over replicates/line within each library.
    Parameters:
        `data` (`dict`): must have keys "logfoldchange", "guide_map", and "sequence_map"
    returns:
        `dict`[`pandas.DataFrame`] holding naive gene effect estimates.
    '''
    out = {}
    for library, lfc in data["logfoldchange"].items():
        # sgrna -> gene and sequence_ID -> cell line lookups for this library
        gene_of_guide = data['guide_map'][library].set_index("sgrna").gene
        line_of_sequence = data['sequence_map'][library].set_index("sequence_ID").cell_line_name
        # collapse guides to genes (columns), then replicates to cell lines (rows)
        per_gene = lfc.T.groupby(gene_of_guide).median().T
        out[library] = per_gene.groupby(line_of_sequence).median()
    return out


def mean_collapse_dataframes(dfs):
    '''
    Given an iterable of pandas DataFrames, returns a single dataframe
    where each value is given by the mean value for the same index/column
    across the input DataFrames, ignoring NaNs.
    '''
    # Running sum of values and count of non-null observations per cell.
    total = None
    count = None
    for df in dfs:
        if total is None:
            total = df.fillna(0)
            count = df.notnull().astype(int)
        else:
            # grow both accumulators to the union of indices/columns
            total, df = total.align(df, join='outer')
            total = total.fillna(0)
            count, total = count.align(total, join="right")
            count = count.fillna(0)
            total = total + df.fillna(0).values
            count = count + df.notnull().values
    # cells never observed become NaN instead of 0/0
    return total.mask(count == 0) / count.replace(0, np.nan)


def sum_collapse_dataframes(dfs):
    '''
    Given an iterable of pandas DataFrames, returns a single dataframe
    where each value is given by the sum of values for the same index/column
    across the input DataFrames, filling NaNs with 0.
    '''
    total = None
    for df in dfs:
        if total is None:
            total = df.fillna(0)
        else:
            total, df = total.align(df, join='outer')
            total = total.fillna(0) + df.fillna(0).values
    return total


def qc_compare_plot(plot_func, data, data_key, metrics, plot_width, plot_height, **kwargs):
    '''
    A convenience method for comparing results from two different runs side by side
    Parameters:
        `plot_func` (`function`): a plotting function that accepts an object of the type `data[data_key]`
            and a `metrics` key word argument and plots to the current matplotlib axis
        `data` (`dict`): dict containing data to plot
        `data_key` (`str`): the entry in the `data` that will be plotted
        `metrics` (`dict`): passed to `plot_func`
        `plot_width`, `plot_height`: desired (total) plot size in inches
        Additional kwargs passed to `plot_func`
    Returns:
        `matplotlib.Figure`
    '''
    fig, axes = plt.subplots(1, 2, figsize=(plot_width, plot_height))
    for position, run in enumerate(data):
        plt.sca(axes[position])
        plt.title(run)
        plot_func(data[run][data_key], metrics=metrics[run], **kwargs)
    plt.tight_layout()
    return fig
def qc_initial_data(title, readcounts, sequence_map, guide_map, negative_control_sgrnas=None, positive_control_sgrnas=None,
        report_name=None, directory='./', plot_width=7.5, plot_height=3.25,
        doc_args=dict(
            pagesize=letter, rightMargin=.5*inch, leftMargin=.5*inch,
            topMargin=.5*inch,bottomMargin=.5*inch
        ),
        specific_plot_dimensions={}
    ):
    '''
    QC the data that would be passed to Chronos. This can be helpful to develop a sense of data quality but also to exclude
    bad results.
    Parameters:
        `title` (`str`): the report title, printed on first page
        `readcounts` (`pd.DataFrame`): read numbers for each pDNA and late timepoint as rows with sgRNAs as columns.
            Do not need to be normalized.
        `sequence_map` (`pd.DataFrame`): map of sequences for both pDNA and late replicates to cell lines, timepoints, and pDNA batches.
            See `chronos.Chronos` for format.
        `guide_map` (`pd.DataFrame`): map of sgRNAs to genes. Must include the columns 'sgrna' and 'gene'.
        `negative_control_sgrnas`, `positive_control_sgrnas` (ordered indexable of `str`): optional guides where no effect or
            a strong depleting effect is expected, respectively. If not provided a number of the more useful QC metrics can't
            be calculated.
        `report_name` (`str`): an optional file name for the report. If none is provided, `title` + '.pdf' will be used.
        `directory` (`str`): where the report and figure panels will be generated.
        `plot_width`, `plot_height` (`float`): size of plots that will be put in the report in inches.
        `doc_args` (`dict`): additional arguments will be passed to `SimpleDocTemplate`.
        `specific_plot_dimensions` (`dict` of 2-tuple): if a plot's name is present, will use the value
            to specify dimensions for that plot instead of deriving them from `plot_width` and `plot_height`
    Returns:
        `dict` containing the calculated QC metrics, which will also be in the report.
    '''
    if report_name is None:
        report_name = title + ".pdf"
    doc = SimpleDocTemplate(os.path.join(directory, report_name), **doc_args)
    styles=getSampleStyleSheet()
    story = []
    metrics = {}

    def add_image(filename):
        # Save the current figure to `directory` and append it to the report story.
        fig = plt.gcf()
        label = '.'.join(filename.split('.')[:-1])
        if label in specific_plot_dimensions:
            fig.set_size_inches(specific_plot_dimensions[label])
        width, height = fig.get_size_inches()
        plt.tight_layout()
        fig.savefig(os.path.join(directory, filename))
        plt.close(fig)
        im = Image(os.path.join(directory, filename), width*inch, height*inch)
        story.append(im)
        story.append(Spacer(.125, 12))

    normalized = normalize_readcounts(readcounts, negative_control_sgrnas, sequence_map)
    lfc = np.log2(calculate_fold_change(normalized, sequence_map,rpm_normalize=False))

    print("calculating replicate correlation")
    # For each cell line, correlate every replicate with the others and record the mean.
    mean_corrs = []
    for line in sequence_map.cell_line_name.unique():
        if line == 'pDNA':
            continue
        reps = sequence_map.query("cell_line_name == %r" % line).sequence_ID
        corrs = fast_cor(lfc.loc[reps].T)
        np.fill_diagonal(corrs.values, np.nan)  # exclude self-correlation
        mean_corrs.append(corrs.mean())
    metrics['MeanReplicateCorr'] = pd.concat(mean_corrs)
    metrics["ReplicateCorrWithMean"] = lfc.corrwith(lfc.mean(), axis=1)
    # Ten lines with the worst (lowest) minimum replicate correlation.
    worst = metrics['MeanReplicateCorr']\
        .groupby(sequence_map.set_index("sequence_ID").cell_line_name)\
        .min()\
        .sort_values().dropna().index[:10]

    def get_nnmd(x):
        return nnmd(x[positive_control_sgrnas], x[negative_control_sgrnas])
    def get_roc_auc_score(x):
        return auroc(x[positive_control_sgrnas], x[negative_control_sgrnas])

    if not negative_control_sgrnas is None and not positive_control_sgrnas is None:
        print("generating control separation metrics")
        # Restrict controls to guides actually present in the data.
        negative_control_sgrnas = sorted(set(negative_control_sgrnas) & set(readcounts.columns))
        if not len(negative_control_sgrnas):
            raise ValueError(
                "none of the negative control sgRNAs found in readcounts columns:\n%r"
                % negative_control_sgrnas
            )
        positive_control_sgrnas = sorted(set(positive_control_sgrnas) & set(readcounts.columns))
        if not len(positive_control_sgrnas):
            # Fixed: this message previously (and wrongly) said "negative control sgRNAs".
            raise ValueError(
                "none of the positive control sgRNAs found in readcounts columns:\n%r"
                % positive_control_sgrnas
            )
        metrics['NNMD'] = lfc.apply(get_nnmd, axis=1)
        metrics['AUROC'] = lfc.apply(get_roc_auc_score, axis=1)
        metrics["PosConMedian"] = lfc[positive_control_sgrnas].median(axis=1)
        metrics["NegConMedian"] = lfc[negative_control_sgrnas].median(axis=1)
        metrics["NegConSD"] = lfc[negative_control_sgrnas].std(axis=1)
        worst_sep = metrics['AUROC']\
            .groupby(sequence_map.set_index("sequence_ID").cell_line_name)\
            .min()\
            .sort_values().dropna().index[:10]
        # keep only lines that are bad by BOTH replicate correlation and control separation
        worst = sorted(set(worst) & set(worst_sep))

    else:
        print("One or both control groups not supplied, skipping control separation metrics")
    story.append(Paragraph(title, style=styles["Heading1"]))

    print("Plotting log fold-change distribution")
    story.append(Paragraph("sgRNA Log Fold-Change Distribution", style=styles["Heading2"]))
    story.append(Paragraph(
        "For a traditional genome-wide loss of viability experiment we expect the bulk of log fold change \
scores near 0, with a long left tail of true viability depletion."
    ))

    sns.kdeplot(lfc.stack(), label="All sgRNAs", fill=True, color="gray", bw_adjust=.25)
    if not negative_control_sgrnas is None:
        sns.kdeplot(lfc[negative_control_sgrnas].stack(), label="Negative Controls sgRNAs",
            color=[.3, .1, .9], bw_adjust=.25)
    if not positive_control_sgrnas is None:
        sns.kdeplot(lfc[positive_control_sgrnas].stack(), label="Positive Controls sgRNAs",
            color=[.9, .2, 0], bw_adjust=.25)
    plt.legend()
    plt.xlabel("Log Fold-Change of late timepoints from pDNA")
    plt.gcf().set_size_inches((plot_width, plot_height))
    add_image("lfc_distribution.png")

    if 'NNMD' in metrics:
        print("plotting control separation metrics")
        story.append(Paragraph("Control QC Metrics", style=styles["Heading2"]))
        story.append(Paragraph(
            "Depletion of positive controls is a positive signal for screen quality, while \
high standard deviation in negative controls is a negative signal for screen quality. \
However, these measures tend to be negatively correlated in CRISPR screens: screens that show \
the greatest dropout of essential genes also have the greatest noise in nonessential genes."
        ))

        fig, axes = plt.subplots(1, 2, figsize=(plot_width, plot_height))

        plt.sca(axes[0])
        density_scatter(metrics["PosConMedian"] - metrics["NegConMedian"],
            metrics["NegConSD"],
            label_outliers=4,
            alpha=.5)
        plt.xlabel("Pos. Con. median LFC")
        plt.ylabel("Neg. Con. SD")

        story.append(Paragraph(
            "The null-normalized median difference (NNMD) is"
        ))
        story.append(Paragraph(
            "\t\t((median(positive controls) - median(negative controls)) / mad(negative controls)"
        ))
        story.append(Paragraph(
            "In Project Achilles, we look for NNMD scores below -1.25 to consider a replicate passing \
but this threshold depends strongly on the controls you have chosen. \
We also provide the area under the ROC curve for separating the positive and negative control \
log fold changes. These measures should have a strong negative correlation."
        ))
        plt.sca(axes[1])
        density_scatter(metrics["NNMD"], metrics["AUROC"], label_outliers=4, outliers_from="xy_zscore",
            alpha=.5)
        plt.xlabel("NNMD")
        plt.ylabel("AUROC")


        add_image("control_sep.png")

    if metrics["MeanReplicateCorr"].any():
        story.append(Paragraph("Replicate Correlation", style=styles["Heading2"]))
        story.append(Paragraph(
            "Below is the Pearson correlation of replicate Log Fold-Change with the mean LFC over all replicates (x axis) vs \
the mean correlation with other replicates of the same cell line (y axis). Generally these are closely related \
and correlate with other measures of screen quality."))
        density_scatter(metrics["ReplicateCorrWithMean"], metrics["MeanReplicateCorr"],
            label_outliers=5)
        plt.xlabel("Replicate R with Mean LFC")
        plt.ylabel("Mean Replicate R with same line")
        add_image("replicate_correlations.png")

    story.append(PageBreak())
    story.append(Paragraph("Details for worst performing cell lines", style=styles["Heading2"]))
    story.append(Paragraph(
        "For a dozen or so of the lines with the worst quality metrics, more details are given below. \
It can be useful to look at the replicate-replicate plots carefully for effects such as"
    ))
    story.append(Paragraph("\t- dropouts that aren't shared between replicates"))
    story.append(Paragraph(
        "\t- extreme outgrowths (whether shared or not). \
These are concerning unless there is a sound biological reason \
such as tumor suppressor KO or your experiment is a rescue experiment."
    ))
    story.append(Paragraph(""))
    story.append(Paragraph(
        "We also show reads in the late timepoints compared to the pDNA. If control groups are provided, these are broken \
out separately. We expect negative control sgRNAs to be closely aligned to pDNA abundance, while positive control \
sgRNAs should tend to fall below the diagonal. Note that each axis is the log(normalized counts + 1)."))
    for line in worst:
        story.append(PageBreak())
        story.append(Paragraph(line, style=styles["Heading3"]))
        all_replicate_plot(normalized, sequence_map, line, plot_width)
        add_image("%s_rep_plot.png" % line)
        paired_pDNA_plots(normalized, sequence_map, line, negative_control_sgrnas, positive_control_sgrnas,
            plot_width, plot_height)
        add_image("%s_pdna_plot.png" % line)

    doc.build(story)

    return metrics
400 | `mutation_matrix` (`pandas.DataFrame`): optional boolean matrix of cell line by gene. 401 | Each value indicates that the gene has a gain of function mutation in that cell line. 402 | Genes should be selected such that a gain of function mutation is expected to make the cell line 403 | dependent on that gene. Tbhis is used to evaluate the separation of gene effects for that gene 404 | between mutated and wildtype cell lines. 405 | `addiction_expressions` (`pandas.DataFrame`): optional `float` matrix of cell lines by genes containing 406 | expressions. The genes should be chosen such that cell lines highly expressing the gene are expected 407 | to be dependent on it, while other cell lines are not. 408 | `copy_number` (`pandas.DataFrame`): optional cell line by gene `float` matrix of logged copy number counts. Used to QC the copy 409 | number effect. 410 | `report_name` (`str`): an optional file name for the report. If none is provided, `title` + '.pdf' will be used. 411 | `directory` (`str`): where the report and figure panels will be generated. 412 | `gene_effect_file` (`str`): If `data` is a path to a directory, this arg is passed to `load_chronos_data_for_qc`. 413 | `plot_width`, `plot_height` (`float`): size of plots that will be put in the report in inches. 414 | `doc_args` (`dict`): additional arguments will be passed to `SimpleDocTemplate`. 415 | `specific_plot_dimensions` (`dict` of 2-tuple`): if a plot's name is present, will use the the value 416 | to specify dimensions for that plot instead of deriving them from `plot_width` and `plot_height` 417 | Returns: 418 | `dict` containing the calculated QC metrics, which will also be in the report. 419 | ''' 420 | if isinstance(data, str): 421 | try: 422 | print("Loading data from %s" % data) 423 | data = load_chronos_data_for_qc(data, gene_effect_file) 424 | except IOError: 425 | raise ValueError("If `data` is a string, it must be the path to a directory containing Chronos saved data. 
\ 426 | gene_effect_file must be the name of an hdf5 file in that directory. \ 427 | You passed '%s', %r" % (data, gene_effect_file)) 428 | if not isinstance(data, dict): 429 | raise ValueError("`data` must be a `dict` of data or a string pointing to Chronos saved directory") 430 | required_data_keys = ["gene_effect", "sequence_map", "guide_map", "guide_efficacy", 431 | "predicted_readcounts", "readcounts", 432 | "logfoldchange", 'predicted_logfoldchange', 433 | "excess_variance", "growth_rate", "replicate_efficacy", 434 | "t0_offset", "library_effect" 435 | ] 436 | for key in required_data_keys: 437 | if not key in data: 438 | raise ValueError("`data` missing required entry %s" % (key)) 439 | library_data = { 440 | library: { 441 | key: data[key][library] 442 | for key in ['readcounts', 'predicted_readcounts', 443 | 'logfoldchange', 'predicted_logfoldchange', 444 | "excess_variance" 445 | ] 446 | } 447 | for library in data['readcounts'] 448 | } 449 | orig_working_dir = os.getcwd() 450 | if report_name is None: 451 | report_name = title + ".pdf" 452 | doc = SimpleDocTemplate(os.path.join(directory, report_name), **doc_args) 453 | styles=getSampleStyleSheet() 454 | story = [] 455 | metrics = {} 456 | 457 | 458 | def add_image(filename): 459 | fig = plt.gcf() 460 | label = '.'.join(filename.split('.')[:-1]) 461 | if label in specific_plot_dimensions: 462 | fig.set_size_inches(specific_plot_dimensions[label]) 463 | width, height = fig.get_size_inches() 464 | plt.tight_layout() 465 | fig.savefig(os.path.join(directory, filename)) 466 | plt.close(fig) 467 | im = Image(os.path.join(directory, filename), width*inch, height*inch) 468 | story.append(im) 469 | story.append(Spacer(.125, 12)) 470 | 471 | 472 | story.append(Paragraph(title, style=styles["Heading1"])) 473 | 474 | story.append(Paragraph("Control Separation", style=styles["Heading2"])) 475 | print("plotting global control separation") 476 | story.append(Paragraph("Global Control Separation", 
style=styles["Heading3"])) 477 | story.append(Paragraph( 478 | "Separation of positive/negative control genes both overall and by screen. \ 479 | More negative NNMD is better." 480 | )) 481 | fig, axes = plt.subplots(1, 2, figsize=(plot_width, plot_height)) 482 | plt.sca(axes[0]) 483 | control_histogram(data["gene_effect"], positive_control_genes, 484 | negative_control_genes, metrics=metrics) 485 | plt.sca(axes[1]) 486 | screen_nnmd_auroc_scatter(data["gene_effect"], positive_control_genes, 487 | negative_control_genes, metrics=metrics) 488 | add_image("global_controls.png") 489 | 490 | if (not mutation_matrix is None) or (not addiction_expressions is None): 491 | print("plotting selective dependency separation") 492 | story.append(Paragraph("Selective Control Separation", style=styles["Heading3"])) 493 | story.append(Paragraph( 494 | "Separation of known selective dependencies between indications. \ 495 | On the left, known oncogene gene effects are compared between models where \ 496 | a known oncogenic GoF mutation occurred in that gene vs the rest, if `mutation_matrix` is supplied. \ 497 | On the right, we test expression addictions using a one-tailed test on pearson correlations, \ 498 | if `addiction_expressions` is supplied. \ 499 | The FDRs should be considered optimistic." 
500 | )) 501 | fig, axes = plt.subplots(1, 2, figsize=(plot_width, plot_height)) 502 | plt.sca(axes[0]) 503 | if not mutation_matrix is None: 504 | selective_mutated_vs_not_scatter(data["gene_effect"], mutation_matrix, metrics=metrics) 505 | plt.sca(axes[1]) 506 | if not addiction_expressions is None: 507 | expression_addiction_volcano(data["gene_effect"], addiction_expressions, metrics=metrics) 508 | if (not mutation_matrix is None) or (not addiction_expressions is None): 509 | add_image("selective_dependencies.png") 510 | story.append(PageBreak()) 511 | 512 | 513 | story.append(Paragraph("General Parameter Info", style=styles["Heading2"])) 514 | 515 | story.append(Paragraph("Statistical Properties of Gene Effects", style=styles["Heading3"])) 516 | print("plotting gene effect mean relationships") 517 | story.append(Paragraph( 518 | "Higher overall gene SD is better (if control separation in each cell line is maintained). There is usually a trend \ 519 | towards more variance in more negative genes. There should NOT be a trend in the second plot." 520 | )) 521 | fig, axes = plt.subplots(1, 1, figsize=(plot_width, plot_height)) 522 | mean_vs_sd_scatter(data["gene_effect"], metrics=metrics) 523 | 524 | if not copy_number is None: 525 | print("plotting copy number effect") 526 | story.append(Paragraph("Copy Number Effect", style=styles["Heading3"])) 527 | story.append(Paragraph( 528 | "Relationship of genomic copy number to estimated gene effect both overall (left) and per gene binned \ 529 | by gene mean (right). Ideally there is no systematic relationship." 
530 | )) 531 | fig, axes = plt.subplots(1, 2, figsize=(plot_width, plot_height)) 532 | plt.sca(axes[0]) 533 | copy_number_trend(data['gene_effect'], copy_number, downsample=.01, downsample_lower_quantile_bound=.01, 534 | downsample_upper_quantile_bound=.99, metrics=metrics) 535 | plt.sca(axes[1]) 536 | copy_number_gene_corrs(data['gene_effect'], copy_number, metrics=metrics) 537 | add_image("copy_number_effect.png") 538 | 539 | print("plotting screen efficacy and growth rate") 540 | story.append(Paragraph("Screen Efficacy, Growth Rate, and Guide Efficacy", style=styles["Heading3"])) 541 | story.append(Paragraph( 542 | "These parameters together translate a gene effect into the expected impact on cell proliferation. \ 543 | Often there will be a trend towards lower growth estimates with lower cell efficacy estimates. \ 544 | Guide efficacies have a single global value, but here have been grouped by presence in a library. \ 545 | They should have a high peak near 1.")) 546 | 547 | growth_rate = [] 548 | replicate_efficacy = [] 549 | 550 | for library in library_data: 551 | 552 | gr, cle = data["growth_rate"].query("library == %r" % library)["growth_rate"].dropna().align( 553 | data['replicate_efficacy'].query("library == %r" % library)["replicate_efficacy"].dropna(), 554 | join="inner" 555 | ) 556 | 557 | growth_rate.append(gr) 558 | replicate_efficacy.append(cle) 559 | 560 | growth_rate, replicate_efficacy = pd.concat(growth_rate), pd.concat(replicate_efficacy) 561 | fig, axes = plt.subplots(1, 2, figsize=(plot_width, plot_height)) 562 | plt.sca(axes[0]) 563 | density_scatter(growth_rate, replicate_efficacy, trend_line=False, outliers_from="xy_zscore") 564 | plt.xlabel("Relative Growth Rate") 565 | plt.ylabel("Replicate Screening Efficacy") 566 | metrics["growth_rate_sd"] = growth_rate.std() 567 | metrics["cell_efficacy_mean"] = replicate_efficacy.mean() 568 | plt.sca(axes[1]) 569 | for library, guide_map in data['guide_map'].items(): 570 | guides = 
guide_map.sgrna.unique() 571 | efficacies = data['guide_efficacy'].reindex(guides).dropna() 572 | sns.kdeplot(efficacies, bw_adjust=.5, lw=1, label=library) 573 | metrics["guide_eff_%s_mean" % library] = efficacies.mean() 574 | plt.legend() 575 | plt.xlabel("Guide Efficacy") 576 | add_image("parameter_distributions.png") 577 | story.append(PageBreak()) 578 | 579 | if len(data['guide_map']) > 1: 580 | print("plotting library integration") 581 | story.append(Paragraph("Library Integration", style=styles["Heading2"])) 582 | story.append(Paragraph( 583 | "The UMAP embedding of cell line gene effects colored by library presence (left) and how \ 584 | far a gene's average within a library deviates from the overall average, by library (right). \ 585 | The UMAP embedding uses only the 50% most variable genes. \ 586 | On the right, a lowess trend is fitted per library to the squared difference of the gene's mean within \ 587 | models screened with the library and its mean overall." 588 | )) 589 | fig, axes = plt.subplots(1, 2, figsize=(plot_width, plot_height)) 590 | plt.sca(axes[0]) 591 | check_integration_umap(data['gene_effect'], data['sequence_map'], metrics=metrics) 592 | plt.sca(axes[1]) 593 | check_integration_mean_deviation(data['gene_effect'], data['sequence_map'], metrics=metrics) 594 | story.append(Paragraph("Prediction Accuracy", style=styles["Heading2"])) 595 | add_image("library_integration.png") 596 | story.append(PageBreak()) 597 | 598 | print("plotting readcount predictions") 599 | story.append(Paragraph("Predictions", style=styles["Heading2"])) 600 | story.append(Paragraph("Readcount Predictions", style=styles["Heading3"])) 601 | story.append(Paragraph( 602 | "Chronos' readcount predictions should generally line up well with observation, but it will predict \ 603 | greater than observed readcounts for cases with very few counts." 
604 | )) 605 | 606 | def plot_func(x): 607 | predicted_vs_observed_readcounts( 608 | x["predicted_readcounts"], x['readcounts'], 609 | metrics=metrics) 610 | fig, axes = dict_plot(library_data, plot_func, plot_width) 611 | add_image("readcount_predictions.png") 612 | 613 | print("plotting LFC predictions") 614 | story.append(Paragraph("Log Fold-Change Predictions", style=styles["Heading3"])) 615 | story.append(Spacer(.125, 12)) 616 | story.append(Paragraph( 617 | "Screens with greater excess variance (overdispersion) should have worse correlation between \ 618 | observed LFC and Chronos' predictions." 619 | )) 620 | def plot_func(x): 621 | lfc_corr_vs_excess_variance( 622 | x["predicted_logfoldchange"], x['logfoldchange'], x['excess_variance'], 623 | metrics=metrics) 624 | fig, axes = dict_plot(library_data, plot_func, plot_width) 625 | add_image("lfc_corr_vs_excess_variance.png") 626 | story.append(PageBreak()) 627 | 628 | 629 | print("plotting difference from naive gene score") 630 | naive = get_naive(data) 631 | naive_collapsed = mean_collapse_dataframes(naive.values()) 632 | story.append(Paragraph("Gene Score Difference from Naive", style=styles["Heading2"])) 633 | story.append(Paragraph( 634 | "Comparing the gene effect scores to a naive score estimated as log fold change median per guide/replicate \ 635 | within libraries, then the mean across libraries. The first plots show the correlation of individual genes, both vs mean effect \ 636 | and vs the difference of means between \ 637 | the supplied and naive gene effects. Below is the direct comparison of gene means and a comparison of the most extreme \ 638 | values for each gene's score." 
639 | )) 640 | fig, axes = plt.subplots(1, 2, figsize=(plot_width, plot_height)) 641 | plt.sca(axes[0]) 642 | gene_corr_vs_mean(naive_collapsed, data['gene_effect'], 643 | metrics=metrics) 644 | plt.sca(axes[1]) 645 | gene_corr_vs_mean_diff(naive_collapsed, data['gene_effect'], 646 | metrics=metrics) 647 | plt.xlabel("Naive Mean - Gene Effect Mean") 648 | add_image("gene_corrs.png") 649 | 650 | fig, ax = plt.subplots(1, 1, figsize=(plot_width, plot_width - 2)) 651 | plt.sca(ax) 652 | density_scatter(naive_collapsed.mean(), data['gene_effect'].mean(), diagonal=True, 653 | label_outliers=10, alpha=.5, s=10) 654 | plt.title("Mean Gene Effect") 655 | plt.xlabel("Naive") 656 | plt.ylabel("Gene Effect") 657 | add_image("gene_means.png") 658 | fig, ax = plt.subplots(1, 1, figsize=(plot_width, plot_width - 2)) 659 | plt.sca(ax) 660 | gene_outlier_plot(naive_collapsed, data['gene_effect'], metrics=metrics) 661 | plt.title("Most Extreme Z-Scores by Gene") 662 | plt.xlabel("Gene Effect Extreme ZScore") 663 | plt.ylabel("Naive Extreme ZScore") 664 | add_image("gene_zscore_extremes.png") 665 | story.append(PageBreak()) 666 | 667 | print("summarizing") 668 | ge_mean = data['gene_effect'].mean() 669 | cell_line_mean = data['gene_effect'].mean(axis=1).std()/ge_mean.std() 670 | naive_means = {key: v.mean() for key, v in naive.items()} 671 | 672 | naive_corr_text = '\n'.join([ 673 | '\t%s: %1.3f' % (key, v.corr(ge_mean)) 674 | for key, v in naive_means.items() 675 | ]) 676 | story.insert(1, Paragraph( 677 | ''' 678 | Summary: the standard deviation (SD) of gene means in gene effect is %1.3f.\n 679 | The mean of gene SDs is %1.3f the SD of gene means.\n 680 | The SD of cell line means is %1.3f the SD of gene means\n. 
681 | The correlation of each library's mean LFC per gene with Chronos' mean gene effect is:\n 682 | %s 683 | ''' % (ge_mean.std(), metrics['mean_SD:SD_means'], cell_line_mean, naive_corr_text) 684 | )) 685 | 686 | print("plotting genes with low agreement with naive gene effect") 687 | story.append(Paragraph("Exploring Low Agreement Genes", style=styles['Heading2'])) 688 | story.append(Spacer(.125, 12)) 689 | story.append(Paragraph("In the remaining plots, the genes with lowest agreement are explored further. \ 690 | NA results for guide efficacy are replaced with -.1")) 691 | story.append(Spacer(.125, 12)) 692 | 693 | outliers = set(metrics['worst_agreement']) \ 694 | | set([s.split('_')[0] for s in metrics['low_outliers']]) \ 695 | | set([s.split('_')[0] for s in metrics['high_outliers']]) 696 | for gene in outliers: 697 | print("\t%s" % gene) 698 | header = Paragraph(gene, style=styles["Heading3"]) 699 | story.append(header) 700 | fig = interrogate_gene(data, naive, naive_collapsed, gene, plot_width, plot_height) 701 | add_image(gene + '.png') 702 | story.append(PageBreak()) 703 | 704 | 705 | print("building report") 706 | doc.build(story) 707 | return metrics 708 | 709 | 710 | 711 | 712 | def comparative_qc_report(title, data, 713 | positive_control_genes, negative_control_genes, 714 | mutation_matrix, addiction_expressions, 715 | report_name=None, directory='.', 716 | plot_width=7.5, plot_height=3.25, 717 | doc_args=dict( 718 | pagesize=letter, rightMargin=.5*inch, leftMargin=.5*inch, 719 | topMargin=.5*inch,bottomMargin=.5*inch 720 | ), 721 | specific_plot_dimensions={} 722 | ): 723 | ''' 724 | Compare the output of two Chronos runs, or Chronos with another algorithm (if that algorithm also 725 | estimates gene effect and guide efficacy). 726 | Parameters: 727 | `title` (`str`): the report title, printed on first page 728 | `data` (`dict`): A `dict` with EXACTLY two entries. the keys of the entries will be used as labels 729 | in the plots in the report. 
def comparative_qc_report(title, data,
        positive_control_genes, negative_control_genes,
        mutation_matrix=None, addiction_expressions=None,
        report_name=None, directory='.',
        plot_width=7.5, plot_height=3.25,
        doc_args=dict(
            pagesize=letter, rightMargin=.5*inch, leftMargin=.5*inch,
            topMargin=.5*inch, bottomMargin=.5*inch
        ),
        specific_plot_dimensions={}
    ):
    '''
    Compare the output of two Chronos runs, or Chronos with another algorithm (if that algorithm also
    estimates gene effect and guide efficacy).
    Parameters:
        `title` (`str`): the report title, printed on first page
        `data` (`dict`): A `dict` with EXACTLY two entries. The keys of the entries will be used as labels
            in the plots in the report. Each value is also a `dict` which must contain the keys 'gene_effect',
            'sequence_map', 'guide_map', 'guide_efficacy', and 'logfoldchange'. Gene effect and guide efficacy
            are model outputs, while logfoldchange can be calculated directly from the data.
        `positive_control_genes`, `negative_control_genes` (`list`, `pandas.Index`, or `numpy.array` of `str`):
            Genes whose KO is expected to cause loss of viability or no loss of viability, respectively.
        `mutation_matrix` (`pandas.DataFrame`): optional boolean matrix of cell line by gene.
            Each value indicates that the gene has a gain of function mutation in that cell line.
            Genes should be selected such that a gain of function mutation is expected to make the cell line
            dependent on that gene. This is used to evaluate the separation of gene effects for that gene
            between mutated and wildtype cell lines. If None, the corresponding plot is skipped.
        `addiction_expressions` (`pandas.DataFrame`): optional `float` matrix of cell lines by genes containing
            expressions. The genes should be chosen such that cell lines highly expressing the gene are expected
            to be dependent on it, while other cell lines are not. If None, the corresponding plot is skipped.
        `report_name` (`str`): an optional file name for the report. If none is provided, `title` + '.pdf' will be used.
        `directory` (`str`): where the report and figure panels will be generated.
        `plot_width`, `plot_height` (`float`): size of plots that will be put in the report in inches.
        `doc_args` (`dict`): additional arguments will be passed to `SimpleDocTemplate`.
        `specific_plot_dimensions` (`dict` of 2-`tuple`): if a plot's name is present, will use the value
            to specify dimensions for that plot instead of deriving them from `plot_width` and `plot_height`
    Returns:
        `dict` containing the calculated QC metrics, which will also be in the report.
    '''
    required_data_keys = ["gene_effect", "sequence_map", "guide_map", "guide_efficacy",
        "logfoldchange"]
    if len(data) != 2:
        raise ValueError("`data` must be a dict with two keys")
    for key, val in data.items():
        for key2 in required_data_keys:
            if not key2 in data[key]:
                raise ValueError("`data[%s]` missing required entry %s" % (key, key2))

    if report_name is None:
        report_name = title + ".pdf"

    doc = SimpleDocTemplate(os.path.join(directory, report_name), **doc_args)
    styles = getSampleStyleSheet()
    keys = list(data.keys())
    story = []
    # one metrics sub-dict per run, plus one for metrics comparing the two runs
    metrics = {keys[0]: {}, keys[1]: {}, "joint": {}}

    def add_image(filename):
        # Flush the current matplotlib figure to disk and append it to the story,
        # honoring any per-plot size override in `specific_plot_dimensions`.
        fig = plt.gcf()
        label = '.'.join(filename.split('.')[:-1])
        if label in specific_plot_dimensions:
            fig.set_size_inches(specific_plot_dimensions[label])
        width, height = fig.get_size_inches()
        plt.tight_layout()
        fig.savefig(os.path.join(directory, filename))
        plt.close(fig)
        im = Image(os.path.join(directory, filename), width*inch, height*inch)
        story.append(im)
        story.append(Spacer(.125, 12))

    story.append(Paragraph(title, style=styles["Heading1"]))
    print("plotting global control separation")
    story.append(Paragraph("Control Separation", style=styles["Heading2"]))
    story.append(Paragraph("Control Histogram", style=styles["Heading3"]))
    paragraph = Paragraph(
        "A direct visualization of control separation."
    )
    story.append(paragraph)
    fig = qc_compare_plot(control_histogram, data, "gene_effect", metrics,
        plot_width, plot_height,
        positive_control_genes=positive_control_genes,
        negative_control_genes=negative_control_genes)
    add_image("control_histogram.png")

    story.append(Paragraph("Per Model QC Metrics", style=styles["Heading3"]))
    print("plotting per-screen control separation")
    story.append(Paragraph(
        "Head-to-head comparison of control separation for each model (cell line). \
For NNMD, more negative is better. For AUROC, more positive is better."
    ))
    fig, axes = plt.subplots(1, 2, figsize=(plot_width, plot_height))
    plt.sca(axes[0])
    # per-model NNMD of positive vs negative controls, for each run
    nnmds = {key: v['gene_effect'].apply(lambda x:
                nnmd(x.reindex(positive_control_genes), x.reindex(negative_control_genes)),
                axis=1)
            for key, v in data.items()}
    density_scatter(nnmds[keys[0]], nnmds[keys[1]], diagonal=True, label_outliers=4, s=10, alpha=.5)
    plt.title("NNMD")
    plt.xlabel(keys[0])
    plt.ylabel(keys[1])
    plt.sca(axes[1])
    # per-model ROC AUC of control separation, for each run
    aurocs = {key: v['gene_effect'].apply(lambda x:
                auroc(x.reindex(positive_control_genes), x.reindex(negative_control_genes)),
                axis=1)
            for key, v in data.items()}
    density_scatter(aurocs[keys[0]], aurocs[keys[1]], diagonal=True, label_outliers=4, s=10, alpha=.5)
    plt.title("ROC AUC")
    plt.xlabel(keys[0])
    plt.ylabel(keys[1])
    add_image("model_qc_comparison.png")

    # Selective-dependency sections are optional: skip them cleanly when the
    # corresponding input was not supplied (previously these calls were
    # unconditional although the docstring described the inputs as optional).
    if (mutation_matrix is not None) or (addiction_expressions is not None):
        story.append(Paragraph("Selective Dependency Distinction", style=styles["Heading3"]))
        story.append(Paragraph(
            "For known cancer dependencies, the gene effect score with vs without the known indication. \
Ideally each point would fall in the bottom right corner."
        ))
    if mutation_matrix is not None:
        print("plotting selective dependency separation")
        fig = qc_compare_plot(selective_mutated_vs_not_scatter, data, "gene_effect", metrics,
            plot_width, plot_height,
            mutation_matrix=mutation_matrix)
        add_image("selective_dependencies.png")
    if addiction_expressions is not None:
        print("plotting expression addictions")
        fig = qc_compare_plot(expression_addiction_volcano, data, "gene_effect", metrics,
            plot_width, plot_height,
            addiction_expressions=addiction_expressions)
        add_image("expression_addiction.png")

    print("plotting gene differences between datasets")
    story.append(Paragraph("Key Differences", style=styles["Heading2"]))
    story.append(Paragraph(
        "The correlation of individual genes between datasets, both vs mean effect \
and vs the difference of means between \
the two datasets. Below is the direct comparison of gene means in each dataset \
and a comparison of the most extreme values for each gene's score."
    ))
    fig, axes = plt.subplots(1, 2, figsize=(plot_width, plot_height))
    plt.sca(axes[0])
    gene_corr_vs_mean(data[keys[0]]["gene_effect"], data[keys[1]]['gene_effect'],
        metrics=metrics["joint"])
    plt.sca(axes[1])
    gene_corr_vs_mean_diff(data[keys[0]]["gene_effect"], data[keys[1]]['gene_effect'],
        metrics=metrics["joint"])
    plt.xlabel("%s Mean - %s Mean" % tuple(keys))
    add_image("gene_corrs.png")

    fig, ax = plt.subplots(1, 1, figsize=(plot_width, plot_width - 2))
    plt.sca(ax)
    density_scatter(data[keys[0]]['gene_effect'].mean(), data[keys[1]]['gene_effect'].mean(),
        diagonal=True, label_outliers=10, alpha=.5, s=10)
    plt.title("Mean Gene Effect")
    plt.xlabel(keys[0])
    plt.ylabel(keys[1])
    add_image("gene_means.png")
    fig, ax = plt.subplots(1, 1, figsize=(plot_width, plot_width - 2))
    plt.sca(ax)
    gene_outlier_plot(data[keys[0]]['gene_effect'], data[keys[1]]['gene_effect'], metrics=metrics['joint'])
    plt.title("Most Extreme Z-Scores by Gene")
    plt.xlabel(keys[0] + " Extreme ZScore")
    plt.ylabel(keys[1] + " Extreme ZScore")
    add_image("gene_zscore_extremes.png")
    story.append(PageBreak())

    story.append(Paragraph("Library Integration", style=styles['Heading2']))

    print("plotting library UMAPs")
    story.append(Paragraph("Library Integration UMAP", style=styles["Heading3"]))
    story.append(Paragraph(
        "Embedding of models in gene effect space colored by library coverage."
    ))
    fig, axes = plt.subplots(1, 2, figsize=(plot_width, plot_height))
    for i, key in enumerate(keys):
        plt.sca(axes[i])
        plt.title(key)
        check_integration_umap(data[key]["gene_effect"], data[key]['sequence_map'],
            metrics=metrics[key])
    add_image("integration_umap.png")

    print("plotting library mean deviation")
    story.append(Paragraph("Library Mean Deviation", style=styles["Heading3"]))
    story.append(Paragraph(
        "How far a gene's average within a library deviates from the overall average, by library. \
Here, a lowess trend is fitted per library to the squared difference of the gene's mean within \
models screened with the library and its mean overall. Note that the two plots are not necessarily \
on the same scale."
    ))
    fig, axes = plt.subplots(1, 2, figsize=(plot_width, plot_height))
    for i, key in enumerate(keys):
        plt.sca(axes[i])
        plt.title(key)
        check_integration_mean_deviation(data[key]["gene_effect"], data[key]['sequence_map'],
            metrics=metrics[key])
    add_image("integration_deviation.png")
    story.append(PageBreak())

    print("plotting genes with low agreement")
    story.append(Paragraph("Exploring Low Agreement Genes", style=styles['Heading2']))
    story.append(Spacer(.125, 12))
    story.append(Paragraph("In the remaining plots, the genes with lowest agreement are explored further. \
NA results for guide efficacy are replaced with -.1"))
    story.append(Spacer(.125, 12))
    # Merge each library's LFC matrix and guide map across the two runs so a gene
    # can be interrogated against all available raw data. Where both runs have the
    # same library, missing values in the first are filled from the second.
    lfc = {}
    guide_map = {}
    for key in keys:
        for library in data[key]['logfoldchange']:
            if not library in lfc:
                lfc[library] = data[key]['logfoldchange'][library]
                guide_map[library] = data[key]['guide_map'][library]
            else:
                aligned_left, aligned_right = lfc[library].align(data[key]['logfoldchange'][library],
                    join='outer')
                lfc[library] = aligned_left.mask(aligned_left.isnull(), aligned_right)
                guide_map[library] = pd.concat(
                    [guide_map[library], data[key]['guide_map'][library]],
                    ignore_index=True
                ).drop_duplicates(subset=['sgrna', 'gene'])
    # Genes with the worst cross-run agreement plus the most extreme z-score
    # outliers in either direction ("GENE_suffix" keys are split back to genes).
    outliers = set(metrics['joint']['worst_agreement']) \
        | set([s.split('_')[0] for s in metrics['joint']['low_outliers']]) \
        | set([s.split('_')[0] for s in metrics['joint']['high_outliers']])
    for gene in outliers:
        print("\t%s" % gene)
        header = Paragraph(gene, style=styles["Heading3"])
        story.append(header)
        # NOTE(review): plot_width is passed for both dimensions here, whereas
        # qc_report passes (plot_width, plot_height) — confirm the square
        # figure is intentional.
        fig = interrogate_gene_compare(data, lfc, guide_map, gene, plot_width, plot_width)
        add_image(gene + '.png')
        story.append(PageBreak())

    print("building report")
    doc.build(story)
    return metrics
-------------------------------------------------------------------------------- /chronos/evaluations.py: -------------------------------------------------------------------------------- 1 | 2 | from warnings import warn 3 | import numpy as np 4 | import pandas as pd 5 | from colorsys import hsv_to_rgb, rgb_to_hsv 6 | 7 | try: 8 | from matplotlib import pyplot as plt 9 | from matplotlib.patches import Patch 10 | import seaborn as sns 11 | from scipy.stats import pearsonr 12 | from sklearn.metrics import roc_auc_score, precision_recall_curve, auc 13 | from sklearn.decomposition import PCA 14 | from statsmodels.stats.multitest import fdrcorrection 15 | except ModuleNotFoundError: 16 | raise ModuleNotFoundError("matplotlib, seaborn, statsmodels, scipy, and sklearn are required for the evaluations submodule. Try \ 17 | `pip install matplotlib; pip install seaborn; pip install scikit-learn; pip install statsmodels`") 18 | 19 | from .model import powerset 20 | from .plotting import density_scatter, lowess_trend, identify_outliers_by_zscore, identify_outliers_by_trend 21 | from .plotting import binplot, dict_plot, identify_outliers_by_diagonal 22 | try: 23 | from umap.umap_ import UMAP 24 | umap_present = True 25 | except ModuleNotFoundError: 26 | warn("umap module not found. Some plots can't be made without it. Try `pip install umap-learn") 27 | umap_present = False 28 | except NameError: 29 | warn("UMAP class not found where expected. Your umap module may be out of date. 
\ 30 | Try updating your version with `pip install --upgrade umap-learn") 31 | umap_present = False 32 | 33 | try: 34 | from adjustText import adjust_text 35 | adjustText_present = True 36 | except ModuleNotFoundError: 37 | warn("adjustText not found, which means labels in plots will not be adjusted to avoid overlap.\ 38 | Try `pip install adjustText`") 39 | 40 | 41 | 42 | # UTILITIES 43 | 44 | 45 | def np_cor_no_missing(x, y): 46 | """Full column-wise Pearson correlations of two matrices with no missing values.""" 47 | try: 48 | xv = (x - x.mean(axis=0))/x.std(axis=0) 49 | yv = (y - y.mean(axis=0))/y.std(axis=0) 50 | except TypeError as e: 51 | print("failed to correlate") 52 | print(x) 53 | print(y) 54 | raise e 55 | result = np.dot(xv.T, yv)/len(xv) 56 | return result 57 | 58 | 59 | def group_cols_with_same_mask(x): 60 | """ 61 | Group columns with the same indexes of NAN values. 62 | 63 | Return a sequence of tuples (mask, columns) where columns are the column indices 64 | in x which all have the mask. 65 | """ 66 | per_mask = {} 67 | for i in range(x.shape[1]): 68 | try: 69 | o_mask = pd.notnull(x[:, i]) 70 | except TypeError as e: 71 | print(x.dtype) 72 | raise(e) 73 | o_mask_b = np.packbits(o_mask).tobytes() 74 | if o_mask_b not in per_mask: 75 | per_mask[o_mask_b] = [o_mask, []] 76 | per_mask[o_mask_b][1].append(i) 77 | return per_mask.values() 78 | 79 | 80 | def fast_cor_core(x, y): 81 | ''' 82 | x (`np.array`): 2D array. All columns will be correlated with all columns of y. 83 | y (`np.array`): 2D array. All columns will be correlated with all columns of x. 84 | Must have save length as x. 85 | returns: `np.array` of shape (x.shape[1], y.shape[1]), where the ith, jth element 86 | is the pearson correlation of x[:, i] and y[:, j] with null elements removed. 
87 | ''' 88 | result = np.zeros(shape=(x.shape[1], y.shape[1])) 89 | 90 | x_groups = group_cols_with_same_mask(x) 91 | y_groups = group_cols_with_same_mask(y) 92 | for x_mask, x_columns in x_groups: 93 | for y_mask, y_columns in y_groups: 94 | # print(x_mask, x_columns, y_mask, y_columns) 95 | combined_mask = x_mask & y_mask 96 | 97 | # not sure if this is the fastest way to slice out the relevant subset 98 | x_without_holes = x[:, x_columns][combined_mask, :] 99 | y_without_holes = y[:, y_columns][combined_mask, :] 100 | 101 | try: 102 | c = np_cor_no_missing(x_without_holes, y_without_holes) 103 | except ValueError: 104 | raise ValueError("trying to correlate two groups with shapes %r and %r" %( 105 | x_without_holes.shape, y_without_holes.shape 106 | )) 107 | # update result with these correlations 108 | result[np.ix_(x_columns, y_columns)] = c 109 | return result 110 | 111 | 112 | def fast_cor(x, y=None): 113 | ''' 114 | x (`pd.DataFrame`): Numerical matrix. All columns will be correlated with all columns of y. 115 | y (`pd.DataFrame`): Numerical matrix. All columns will be correlated with all columns of x. 116 | Index must overlap x. 117 | returns: `pd.DataFrame` of shape (x.shape[1], y.shape[1]), where the ith, jth element 118 | is the pearson correlation of x[:, i] and y[:, j] with null elements removed. 
119 | ''' 120 | if y is None: 121 | y = x 122 | if x is y: 123 | shared = x.index 124 | else: 125 | shared = sorted(set(x.index) & set(y.index)) 126 | if len(shared) < 2: 127 | raise ValueError("x and y don't have at least two rows in common") 128 | out = pd.DataFrame(fast_cor_core(x.loc[shared].values, y.loc[shared].values), 129 | index=x.columns, columns=y.columns) 130 | return out 131 | 132 | 133 | def get_aligned_mutation_matrix(base_matrix, gene_effect): 134 | '''Aligning a mutation matrix with gene effect, requiring a minimum number of non-null values in gene effect''' 135 | aligned_matrix = base_matrix.reindex(gene_effect.index).fillna(False) 136 | aligned_matrix = aligned_matrix[sorted(set(aligned_matrix.columns) & set(gene_effect.columns))] 137 | aligned_matrix.fillna(False, inplace=True) 138 | aligned_matrix[gene_effect[aligned_matrix.columns].isnull()] = np.nan 139 | aligned_matrix = aligned_matrix[aligned_matrix.columns[ 140 | (aligned_matrix & gene_effect[aligned_matrix.columns].notnull() ).sum() > 2 141 | ]] 142 | return aligned_matrix 143 | 144 | 145 | def split_color(rgb): 146 | ''' get two colors with the same hue and saturation but different values''' 147 | h, s, v = rgb_to_hsv(*rgb) 148 | return hsv_to_rgb(h, s, .3), hsv_to_rgb(h, s, .6) 149 | 150 | 151 | def generate_powerset_palette(keys, start='random', 152 | base_saturation=1, base_hsv_value=.7): 153 | ''' 154 | Generate a palette for the powerset of `keys`. Colors for the individual keys will be evenly spaced 155 | in hue space. Combinations will have the average of the hues of each key, with identical hues being resolved 156 | by different hsv values (brightness). 
157 | Parameters: 158 | `keys` (iterable): the base keys that will be combined into a powerset 159 | `start` (`float` or "random"): optional hue for the first entry in `keys` 160 | `base_saturation`: saturation of colors for the individual keys 161 | `base_hsv_value`: hsv value parameter for colors for the individual keys 162 | Returns: 163 | `dict` with an entry for each possible unique combination of the keys (excluding the empty set) containing 164 | an RGB color for the combination 165 | ''' 166 | if start == 'random': 167 | start = np.random.uniform() 168 | base_hues = start + np.arange(len(keys))/len(keys) 169 | base_rgb = dict(zip(keys, [hsv_to_rgb(hue, base_saturation, base_hsv_value) for hue in base_hues])) 170 | out = {} 171 | keysets = list(powerset(keys)) 172 | for keyset in keysets: 173 | if not len(keyset): 174 | continue 175 | color = np.mean(np.stack([base_rgb[key] for key in keyset]), axis=0) 176 | out[keyset] = tuple(color) 177 | for i, keyset1 in enumerate(keysets): 178 | if not len(keyset1): 179 | continue 180 | for keyset2 in keysets[i+1:]: 181 | if not len(keyset2): 182 | continue 183 | dist = np.sqrt(((np.array(out[keyset1]) - np.array(out[keyset2]))**2).sum()) 184 | if dist < .1: 185 | out[keyset1], out[keyset2] = split_color(out[keyset1]) 186 | return out 187 | 188 | 189 | def trim_overlapping_lead_and_tail(strings): 190 | ''' 191 | Removes extraneous prefixes/suffixes common to all the strings for more parsimonious labeling 192 | ''' 193 | if len(strings) < 2: 194 | return strings 195 | n = min([len(string) for string in strings]) 196 | for i in range(n): 197 | c = strings[0][i] 198 | if any([string[i] != c for string in strings[1:]]): 199 | break 200 | if i == n: 201 | raise ValueError("Shortest string has no distinct substring:\n%r" % strings) 202 | for j in range(n): 203 | c = strings[0][-j-1] 204 | if any([string[-j-1] != c for string in strings[1:]]): 205 | break 206 | if j == 0: 207 | return [string[i:] for string in strings] 208 | 
return [string[i:-j] for string in strings] 209 | 210 | 211 | def _strip_identical_prefix(s1, s2): 212 | i = 0 213 | while s1[i] == s2[i]: 214 | i += 1 215 | return s1[i:], s2[i:] 216 | 217 | def _make_aliases(keys): 218 | ''' 219 | Tries to create a series with values holding a unique, logical two-letter code 220 | for each key in keys. 221 | ''' 222 | deduplicated = pd.Series([s for s in trim_overlapping_lead_and_tail(keys)], index=[s for s in keys]) 223 | true_unique = deduplicated.copy() 224 | for i in range(len(deduplicated)): 225 | for j in range(i+1, len(deduplicated)): 226 | true_unique.iloc[i], true_unique.iloc[j] = _strip_identical_prefix(true_unique.iloc[i], true_unique.iloc[j]) 227 | out = {} 228 | for s in keys: 229 | if deduplicated[s].startswith(true_unique[s]): 230 | out[s] = deduplicated[s][:2] 231 | else: 232 | out[s] = deduplicated[s][0] + true_unique[s][0] 233 | return pd.Series(out) 234 | 235 | 236 | def append_to_legend_handles(lines, ax): 237 | ''' 238 | Add text to the matplotlib legend to an axis 239 | Parameters: 240 | `lines` (iterable of `str`): lines to add 241 | `ax`: `matplotlib.Axis` 242 | Returns: 243 | legend handles 244 | ''' 245 | handles, labels = ax.get_legend_handles_labels() 246 | for line in lines: 247 | handles.append( 248 | Patch( 249 | color=(0, 0, 0, 0), 250 | label=line 251 | ) 252 | ) 253 | return handles 254 | 255 | 256 | # METRICS 257 | 258 | def mad(x, axis=None): 259 | '''median absolute deviation from the median''' 260 | x = x[pd.notnull(x)] 261 | med = np.median(x, axis) 262 | return np.median( np.abs(x-med), axis) 263 | 264 | 265 | def nnmd(pos, neg): 266 | '''null-normalixed median difference between the `pos` and `neg` arrays''' 267 | return (np.median(pos[pd.notnull(pos)]) - np.median(neg[pd.notnull(neg)]))/mad(neg) 268 | 269 | 270 | def auroc(pos, neg): 271 | '''ROC AUC of separation between `pos` and `neg` arrays''' 272 | pos = pos[pd.notnull(pos)] 273 | neg = neg[pd.notnull(neg)] 274 | true = [0] * 
def pr_auc(pos, neg):
    '''Area under precision-recall curve separating `pos` and `neg` arrays'''
    # drop nulls before scoring
    pos = pos[pd.notnull(pos)]
    neg = neg[pd.notnull(neg)]
    probas = np.concatenate([np.array(pos), np.array(neg)])
    # `neg` entries carry label 1, matching the convention used in `auroc`
    true = [0] * len(pos) + [1] * len(neg)
    precision, recall, thresh = precision_recall_curve(y_true=true, probas_pred=probas)
    return auc(recall, precision)


# PRE RUN PLOTS

def replicate_plot(readcounts, rep1, rep2):
    '''
    Given a `pandas.DataFrame` matrix of `readcounts` with replicates as rows and sgRNAs as columns, plot
    the logged readcounts of `rep1` vs `rep2` and annotate with their Pearson correlation.
    Raises `ValueError` if either replicate label is missing from the index.
    Draws on the current matplotlib axes.
    '''
    for rep in rep1, rep2:
        if not rep in readcounts.index:
            raise ValueError("replicate label %s not found in the index of `readcounts`" % rep)
    # +1 pseudocount before log2 so zero counts are representable
    x = np.log2(readcounts.loc[rep1]+1)
    y = np.log2(readcounts.loc[rep2]+1)
    density_scatter(x, y,
        label_outliers=2)
    plt.xlabel("%s Readcounts (+1Log)" % rep1)
    plt.ylabel("%s Readcounts (+1Log)" % rep2)
    # annotate Pearson correlation in axes coordinates
    r = x.corr(y)
    plt.text(s='R = %1.2f' % r, x=.05, y=.9, transform=plt.gca().transAxes)


def all_replicate_plot(readcounts, sequence_map, cell_line, plot_width):
    '''
    Given a `pandas.DataFrame` matrix of `readcounts` with replicates as rows and sgRNAs as columns, generate a
    `replicate_plot` for all pairs of replicates of `cell_line`. See `chronos.Chronos` for a description
    of `sequence_map`. `plot_width` gives the plot width in inches.
    '''
    reps = sequence_map.query("cell_line_name == %r" % cell_line).sequence_ID.unique()
    # shortened labels for axis annotation
    rep_labels = dict(zip(reps, trim_overlapping_lead_and_tail(reps)))
    n = 0
    titles = {}
    # one subplot per unordered replicate pair
    for i in range(len(reps)-1):
        for j in range(i+1, len(reps)):
            n += 1
            titles["%s %i" % (cell_line, n)] = (reps[i], reps[j])
    def plotfunc(x):
        # x is a (rep_i, rep_j) tuple from `titles`
        replicate_plot(readcounts, *x)
        plt.xlabel("Rep." + rep_labels[x[0]])
        plt.ylabel("Rep." + rep_labels[x[1]])
    dict_plot(titles, plotfunc, plot_width)
    plt.tight_layout()
def pDNA_plot(readcounts, sequence_map, rep, sgrnas=None):
    '''
    Given a `pandas.DataFrame` matrix of `readcounts` with replicates as rows and sgRNAs as columns, plot
    the logged readcounts of `rep` vs median readcounts in pDNA of the same batch. `sgrnas` optionally
    subsets the plot to the specified sgrnas. See `chronos.Chronos` for a description
    of `sequence_map`.
    Raises `ValueError` if `rep` is missing from the index or none of `sgrnas` are present.
    '''
    if not rep in readcounts.index:
        raise ValueError("Rep %r not in readcounts index" %rep)
    if not sgrnas is None:
        # only keep requested sgRNAs that actually exist in the matrix
        controls = sorted(set(sgrnas) & set(readcounts.columns))
        if not controls:
            raise ValueError("None of the specified sgRNAs are in the readcounts columns: \n%r"
                % sgrnas)
    # find the pDNA sequences belonging to the same batch as `rep`
    batch_label = sequence_map.query("sequence_ID == %r" % rep).iloc[0]['pDNA_batch']
    pdna_seq = sequence_map\
        .query("cell_line_name == 'pDNA'")\
        .query("pDNA_batch == %r" % batch_label)\
        .sequence_ID
    # median over the batch's pDNA sequences, log2(+1) scale
    pdna = np.log2(readcounts.loc[pdna_seq]+1).median()
    ltp = np.log2(readcounts.loc[rep]+1)
    if not sgrnas is None:
        pdna = pdna.loc[controls]
        ltp = ltp.loc[controls]
    density_scatter(pdna, ltp,
        trend_line=False, diagonal=True)
    plt.xlabel("%s Readcounts (+1Log)" % "pDNA")
    plt.ylabel("%s Readcounts (+1Log)" % rep)
    r = ltp.corr(pdna)
    plt.text(s='R = %1.2f' % r, x=.05, y=.9,
        transform=plt.gca().transAxes)
def paired_pDNA_plots(readcounts, sequence_map, cell_line,
        negative_control_sgRNAs=None, positive_control_sgRNAs=None,
        plot_width=7.5, plot_height=3, page_height=9):
    '''
    If `negative_control_sgRNAs` and `positive_control_sgRNAs` is none,
    produces one subplot for each replicate of the cell line with a `pDNA_plot`.
    Otherwise, generates pairs of pDNA plots for each replicate. If both control types
    are supplied, one will be plotted on each side. If one is missing,
    it will be replaced with all sgRNAs. `plot_width` specified the figure width in inches,
    but `plot_height` specifies subplot height. This will be adjusted if the total figure
    height would exceed `page_height`. See `pDNA_plot` for other parameters.
    '''
    reps = sequence_map.query("cell_line_name == %r" % cell_line).sequence_ID.unique()
    # shortened replicate labels for annotation
    labels = dict(zip(reps, trim_overlapping_lead_and_tail(reps)))
    left_title = "Negative Controls"
    right_title = "Positive Controls"

    if negative_control_sgRNAs is None and positive_control_sgRNAs is None:
        # no controls given: single pDNA_plot per replicate, laid out by dict_plot
        titles = dict(zip(trim_overlapping_lead_and_tail(reps), reps))
        def plotfunc(x):
            # x is the replicate sequence_ID
            pDNA_plot(readcounts, sequence_map, x)
            plt.ylabel(labels[x])
        dict_plot(titles, plotfunc)
        return
    elif positive_control_sgRNAs is None:
        # fall back to all sgRNAs on the right-hand side
        positive_control_sgRNAs = readcounts.columns
        right_title = "All sgRNAs"
    elif negative_control_sgRNAs is None:
        # fall back to all sgRNAs on the left-hand side
        negative_control_sgRNAs = readcounts.columns
        left_title = "All sgRNAs"

    # cap total figure height at `page_height`
    height = min(page_height, plot_height*len(reps))
    fig, axes = plt.subplots(len(reps), 2, figsize=(plot_width, height))
    for i, rep in enumerate(reps):
        # left column: negative controls (or all sgRNAs)
        plt.sca(axes[i, 0])
        pDNA_plot(readcounts, sequence_map, rep, negative_control_sgRNAs)
        plt.ylabel('Rep. ' + labels[rep])
        plt.title(left_title)

        # right column: positive controls (or all sgRNAs)
        plt.sca(axes[i, 1])
        pDNA_plot(readcounts, sequence_map, rep, positive_control_sgRNAs)
        plt.ylabel("")
        plt.title(right_title)
# POST RUN PLOTS


def gene_outlier_plot(gene_effect1, gene_effect2,
        xlabel="gene_effect1 zscore", ylabel="gene_effect2 zscore",
        ax=None, legend=True,
        density_scatter_args={"label_outliers": 10, "trend_line": True},
        legend_args={}, metrics=None
    ):
    '''
    Compares the most extreme outliers for the matrices `gene_effect1` and `gene_effect2`
    with a density scatter of the maximum and minimum screen gene effect for each gene
    with results from `gene_effect1` on one axis and `gene_effect2` on the other, and returns the
    outliers from the trend line.
    This plot is useful to detect if one method is producing extreme outliers within a gene score
    relative to the other.
    Returns a dict with `low_outliers` and `high_outliers` (gene labels suffixed "_Min"/"_Max"),
    or writes it into `metrics` if that dict is supplied.
    NOTE: `xlabel`/`ylabel` are currently not applied to the axes.
    '''
    gene_effect1, gene_effect2 = gene_effect1.align(gene_effect2, join="inner")
    # per-gene z-scores within each matrix
    zscore1 = (gene_effect1 - gene_effect1.mean())/gene_effect1.std()
    zscore2 = (gene_effect2 - gene_effect2.mean())/gene_effect2.std()
    mins1 = zscore1.min()
    mins1.index = [s + "_Min" for s in mins1.index]
    max1 = zscore1.max()
    # BUGFIX: maxima previously reused the "_Min"-suffixed index, mislabeling
    # high outliers and creating duplicate labels in the concatenated series
    max1.index = [s + "_Max" for s in max1.index]
    mins2 = zscore2.min()
    mins2.index = [s + "_Min" for s in mins2.index]
    max2 = zscore2.max()
    max2.index = [s + "_Max" for s in max2.index]
    x = pd.concat([mins1, max1])
    y = pd.concat([mins2, max2])
    if not ax:
        ax = plt.gca()
    plt.sca(ax)
    density_scatter(x, y, **density_scatter_args)
    plt.title("Most Extreme Values by ZScore")
    out = {
        'low_outliers': mins1.index[identify_outliers_by_trend(mins1, mins2, 5)],
        'high_outliers': max1.index[identify_outliers_by_trend(max1, max2, 5)]
    }
    if metrics is None:
        return out
    else:
        metrics.update(out)
def gene_corr_vs_mean(gene_effect1, gene_effect2, ax=None,
        legend=True,
        density_scatter_args={"label_outliers": 5, "trend_line": False}, legend_args={}, metrics=None
    ):
    '''
    Shows the correlation of a gene's gene effect profile within the two matrices `gene_effect1` and `gene_effect2`
    with the gene's mean effect (averaged between the two matrices) on the x axis, and returns the genes with lowest correlation.
    Returns a dict with the median correlation, the count of genes with correlation < 0.9,
    and the 20 lowest-correlated gene labels; or writes it into `metrics` if supplied.
    '''
    # per-gene correlation between the two matrices; NaN (e.g. constant columns) dropped
    corrs = gene_effect1.corrwith(gene_effect2).dropna()
    means = .5*(gene_effect1[corrs.index].mean() + gene_effect2[corrs.index].mean())
    density_scatter(means, corrs, **density_scatter_args)
    if not ax:
        ax = plt.gca()
    ax.set_xlabel("Gene Mean")
    ax.set_ylabel("Gene Correlation")

    out = {
        "gene_corr_med": corrs.median(),
        "gene_corr_lt_9": (corrs < .9).sum(),
    }
    if legend:
        handles = append_to_legend_handles([
            "%s: %1.3f" % (key.replace("_", ' '), val)
            for key, val in out.items()
        ], ax)
        plt.legend(handles=handles, **legend_args)
    # added after the legend so it is not rendered there
    out["lowest_corr"] = corrs.sort_values().index[0:20]
    if metrics is None:
        return out
    else:
        metrics.update(out)


def gene_corr_vs_mean_diff(gene_effect1, gene_effect2, ax=None,
        legend=True,
        density_scatter_args={"label_outliers": 5, "trend_line": False, "outliers_from": "xy_zscore"},
        legend_args={}, metrics=None
    ):
    '''
    Shows the correlation of a gene's gene effect profile within the two matrices `gene_effect1` and `gene_effect2`
    with the difference in the gene's mean effect between the two matrices on the x axis. This plot is useful for
    seeing which genes have the most disagreement between the matrices either by correlation or by mean effect.
    It returns outliers found by zscore, i.e. genes with lowest agreement taking into account both their means
    and their correlations.
    '''
    corrs = gene_effect1.corrwith(gene_effect2).dropna()
    # positive values mean the gene scores more strongly (less negative) in matrix 1
    mean_diff = gene_effect1[corrs.index].mean() - gene_effect2[corrs.index].mean()
    corrs, mean_diff = corrs.align(mean_diff.dropna(), join="inner")
    density_scatter(mean_diff, corrs, **density_scatter_args)
    if not ax:
        ax = plt.gca()
    ax.set_xlabel("Gene Mean Diff")
    ax.set_ylabel("Gene Correlation")

    out = {
        "gene_corr_med": corrs.median(),
        "gene_corr_lt_9": (corrs < .9).sum(),
    }
    if legend:
        handles = append_to_legend_handles([
            "%s: %1.3f" % (key.replace("_", ' '), val)
            for key, val in out.items()
        ], ax)
        plt.legend(handles=handles, **legend_args)
    # added after the legend so it is not rendered there
    out["worst_agreement"] = corrs.index[identify_outliers_by_zscore(corrs, mean_diff, 10)]
    if metrics is None:
        return out
    else:
        metrics.update(out)
def control_histogram(gene_effect, positive_control_genes, negative_control_genes, ax=None,
        legend=True,
        kde_args={}, legend_args={}, metrics=None):
    '''
    Produces KDE plots of the distribution of positive and negative control gene scores
    in the gene effect matrix. Both the mean gene scores and the raveled gene scores are
    shown. Control separation results measured by NNMD and AUROC are returned
    (or written into `metrics` if that dict is supplied).
    '''
    # restrict to controls present in the matrix; drop all-null genes
    pos = gene_effect.reindex(columns=positive_control_genes).dropna(axis=1, how='all')
    neg = gene_effect.reindex(columns=negative_control_genes).dropna(axis=1, how='all')
    # filled curves: distribution of per-gene means
    sns.kdeplot(pos.mean(), bw_adjust=.5, fill=True, alpha=.3, lw=0, color="red",
        label="Positive Control Means", ax=ax, gridsize=1000, **kde_args)
    sns.kdeplot(neg.mean(), bw_adjust=.5, fill=True, alpha=.3, lw=0, color="blue",
        label="Negative Control Means", ax=ax, gridsize=1000, **kde_args)
    # line curves: distribution of all individual (raveled) scores
    sns.kdeplot(pos.stack(), bw_adjust=.5, lw=2, color="crimson",
        label="Positive Control Scores", ax=ax, gridsize=1000, **kde_args)
    sns.kdeplot(neg.stack(), bw_adjust=.5, lw=2, color="navy",
        label="Negative Control Scores", ax=ax, gridsize=1000, **kde_args)
    if not ax:
        ax = plt.gca()
    ax.set_xlabel("Gene Effect")
    out = {
        "NNMD_of_means": nnmd(pos.mean(), neg.mean()),
        "NNMD_of_scores": nnmd(pos.stack(), neg.stack()),
        "AUROC_of_means": auroc(pos.mean(), neg.mean()),
        "AUROC_of_scores": auroc(pos.stack(), neg.stack())
    }
    if legend:
        handles = append_to_legend_handles([
            "%s: %1.3f" % (key.replace("_", ' '), val)
            for key, val in out.items()
        ], ax)
        plt.legend(handles=handles, **legend_args)
    if metrics is None:
        return out
    else:
        metrics.update(out)
def mean_vs_sd_scatter(gene_effect,
        ax=None,
        metrics=None, legend=True, legend_args={},
        density_scatter_args={"alpha": .6, "s": 10, "label_outliers": 3, "outliers_from": "xy_zscore"}
    ):
    '''
    Scatters each gene's mean effect (x) in `gene_effect` against its standard
    deviation scaled by the SD of the gene means (y).
    Returns {"mean_SD:SD_means": <mean of the scaled SDs>}, or writes it into
    `metrics` when that dict is supplied.
    '''
    gene_means = gene_effect.mean()
    # each gene's SD relative to the spread of gene means
    scaled_sd = gene_effect.std() / gene_means.std()

    target_ax = ax if ax else plt.gca()
    density_scatter(gene_means, scaled_sd, ax=target_ax, **density_scatter_args)
    plt.ylabel("Gene SD / SD of Gene Means")
    plt.xlabel("Gene Mean")

    summary = {
        "mean_SD:SD_means": scaled_sd.mean()
    }
    if legend:
        legend_lines = ["%s: %1.2f" % (key.replace("_", ' '), val)
                        for key, val in summary.items()]
        plt.legend(handles=append_to_legend_handles(legend_lines, target_ax), **legend_args)

    if metrics is not None:
        metrics.update(summary)
        return None
    return summary
def mean_vs_cell_eff_correlation(gene_effect, replicate_efficacy,
        ax=None, metrics=None, legend=True, legend_args={"loc": "upper right"},
        density_scatter_args={"alpha": .6, "s": 10, "label_outliers": 5, "outliers_from": "y"}
    ):
    '''
    NOT IMPLEMENTED: always raises `NotImplementedError`.
    Intended behavior: using the matrix `gene_effect`, plot each gene's gene effect profile's mean
    vs its correlation with the estimated `replicate_efficacy` of all screens.
    This plot is useful for detecting screen quality bias, in that genes with lower means will tend to
    be negatively correlated with cell efficacy. Would return cell efficacy correlation mean,
    standard deviation, and its correlation with gene mean.
    '''
    raise NotImplementedError("Function needs to be updated before using")
    # Dead code below, retained as a starting point for rehabilitation.
    # BUGFIX: it previously referenced an undefined name `cell_efficacy`
    # (NameError once the raise is removed); it now uses the
    # `replicate_efficacy` parameter.
    means = gene_effect.mean()
    corrs = gene_effect.corrwith(replicate_efficacy)
    if not ax:
        ax = plt.gca()
    density_scatter(means, corrs, ax=ax, **density_scatter_args)
    plt.ylabel("Gene Effect R with Cell Efficacy")
    plt.xlabel("Gene Mean")
    out = {
        "cell_efficacy_corr_mean": corrs.mean(),
        "cell_efficacy_corr_sd": corrs.std(),
        "cell_efficacy_corr_gene_mean_trend": means.corr(corrs)
    }
    if legend:
        handles = append_to_legend_handles([
            "%s: %1.2f" % (key.replace("_", ' '), val)
            for key, val in out.items()
        ], ax)
        plt.legend(handles=handles, **legend_args)
    if metrics is None:
        return out
    else:
        metrics.update(out)
def screen_nnmd_auroc_scatter(gene_effect, positive_control_genes, negative_control_genes, ax=None,
        metrics=None, legend=True, legend_args={},
        density_scatter_args={}):
    '''
    For each screen (row) in the matrix `gene_effect`, computes the separation of the iterable `positive_control_genes`
    from `negative_control_genes` by NNMD and AUROC. This is useful for visualizing the distribution of screen quality.
    The median and mean of both measures are returned (or written into `metrics` if that dict is supplied).
    '''
    # restrict controls to genes present in the matrix; sorted for deterministic order
    poscon = sorted(set(positive_control_genes) & set(gene_effect.columns))
    negcon = sorted(set(negative_control_genes) & set(gene_effect.columns))
    # per-screen (row-wise) separation measures
    nnmds = gene_effect.apply(
        lambda x: nnmd(x[poscon], x[negcon]),
        axis=1
    )
    aurocs = gene_effect.apply(
        lambda x: auroc(x[poscon], x[negcon]),
        axis=1
    )

    if not ax:
        ax=plt.gca()
    density_scatter(aurocs, nnmds, ax=ax, **density_scatter_args)
    plt.xlabel("AUROC - Higher is Better")
    plt.ylabel("NNMD - Lower is Better")

    out = {
        "NNMD_median": nnmds.median(),
        "NNMD_mean": nnmds.mean(),
        "AUROC_median": aurocs.median(),
        "AUROC_mean": aurocs.mean()
    }

    if legend:
        handles = append_to_legend_handles([
            "%s: %1.3f" % (key.replace("_", ' '), val)
            for key, val in out.items()
        ], ax)
        plt.legend(handles=handles, loc="lower left", **legend_args)
    if metrics is None:
        return out
    else:
        metrics.update(out)
def expression_addiction_volcano(gene_effect, addiction_expressions,
        max_threshold=-.2,
        ax=None,
        metrics=None, legend=True, legend_args={},
        density_scatter_args={"trend_line": False}
    ):
    '''
    Given the matrix of expression `addiction_expressions`, whose columns should only include
    genes expected to be expression addictions (i.e. cause loss of viability in cell lines
    that overexpress them), computes the Pearson correlation and associated false discovery rate between
    the gene's expression and its gene effect in the matrix `gene_effect` and plots the result
    as a volcano. Note that the p-values informing the FDRs (q values) are optimistic due to
    the assumption of normal errors. The fraction of selective dependencies with FDR < 0.1 or
    R < -0.2 is returned.
    This plot is useful for evaluating the ability to identify
    selective dependencies and their association with the correct biomarker.
    NOTE: the `max_threshold` parameter is currently unused.
    '''
    gene_effect, addiction_expressions = gene_effect.align(addiction_expressions, join="inner")
    corr = {}
    p = {}
    for gene in gene_effect:
        # correlate per gene over the cell lines where both values are present
        mask = gene_effect[gene].notnull() & addiction_expressions[gene].notnull()
        corr[gene], p[gene] = pearsonr(gene_effect[gene][mask], addiction_expressions[gene][mask])
    corr, p = pd.Series(corr), pd.Series(p)
    # convert the two-sided p-value to a one-sided test for NEGATIVE correlation
    p /= 2
    p[corr > 0] = 1 - p[corr > 0]
    q = pd.Series(fdrcorrection(p.values, .05)[1], index=p.index)
    if ax is None:
        ax = plt.gca()
    plt.sca(ax)
    density_scatter(corr, -np.log10(q), **density_scatter_args)
    plt.xlabel("Expression/GE Correlation")
    plt.ylabel("-log10(FDR)")
    out = {
        "expression_addictions_FDR_0.10": (q < .1).mean(),
        "expression_addictions_<_-0.2": (corr < -.2).mean()
    }
    if legend:
        handles = append_to_legend_handles([
            "%s: %1.2f" % (key.replace("_", ' '), val)
            for key, val in out.items()
        ], ax)
        plt.legend(handles=handles, loc="upper right", **legend_args)
    if metrics is None:
        return out
    else:
        metrics.update(out)
def selective_mutated_vs_not_scatter(gene_effect, mutation_matrix,
        ax=None,
        metrics=None, legend=True, legend_args={}, label_outliers=3,
        scatter_args={"alpha": .75, "linewidth": 1, "cmap": 'viridis_r'}
    ):
    '''
    A common pattern of dependency in cancer is "oncogene addiction," in which cells
    selectively require proteins with oncogenic gain of function mutations to maintain viability.
    Canonical examples include BRAF, NRAS, KRAS, CTNNB1, and EGFR. We expect cells with the
    gain of function alteration to show more negative gene effect than those without. This
    function takes in a boolean `mutation_matrix` with gene columns and cell line rows,
    which should only include genes that have known oncogenic gain of function alterations and should
    be `True` for cell lines that have one of the known gain of function alterations.
    For each gene in `mutation_matrix`, its mean in `gene_effect` for cell lines without
    gain of function is plotted on the x axis and its mean in cell lines with gain of function
    is plotted on the y axis. The separation of these two by NNMD and AUROC is returned
    both as a median over genes of results per gene and by combining all GoF scores for all genes as positive controls
    and the same genes' scores in lines without their indicated GoF alterations as the negative
    controls ("total" NNMD/AUROC). This plot is useful for evaluating the ability to detect
    selective dependencies and their biomarkers.
    Raises `ValueError` when the two inputs have no overlapping genes or screens.
    '''
    # drop genes with any missing effect, then align with the mutation calls
    gene_effect = gene_effect.dropna(how='any', axis=1)
    mutation_matrix = get_aligned_mutation_matrix(mutation_matrix, gene_effect)
    gene_effect, mutation_matrix = gene_effect.align(mutation_matrix, join="inner")
    mutation_matrix = mutation_matrix.dropna(how='all', axis=1).astype(bool)
    gene_effect = gene_effect.dropna(how='all', axis=1)
    # realign after the drops so columns/rows still match
    gene_effect, mutation_matrix = gene_effect.align(mutation_matrix, join="inner")
    if gene_effect.shape[0] == 0 or gene_effect.shape[1] == 0:
        raise ValueError("Gene_effect and mutation_matrix have an axis with no overlaps (either genes or screens)")
    # point size/color encodes log2 of the number of mutated lines per gene
    scale = np.log2(mutation_matrix.sum().astype(float))
    # per-gene separation of mutated vs non-mutated lines
    nnmds = mutation_matrix.apply(lambda x: nnmd(gene_effect[x.name][x], gene_effect[x.name][x==False]),
        axis=0)
    aurocs = mutation_matrix.apply(lambda x: auroc(gene_effect[x.name][x], gene_effect[x.name][x==False]),
        axis=0)
    # "total" versions ravel all genes' scores together
    # (fillna is a no-op here since the matrix was cast to bool above)
    total_nnmd = nnmd(
        gene_effect[mutation_matrix==True].stack(),
        gene_effect[mutation_matrix.fillna(False)==False].stack()
    )
    total_auroc = auroc(
        gene_effect[mutation_matrix==True].stack(),
        gene_effect[mutation_matrix==False].stack()
    )
    pos_means = gene_effect[mutation_matrix].mean()
    neg_means = gene_effect[mutation_matrix==False].mean()
    pos_means, neg_means = pos_means.align(neg_means, join="inner")

    if ax:
        plt.sca(ax) #needed because scatter doesn't accept ax arg?
    plt.scatter(neg_means, pos_means, s=10*scale, c=scale, **scatter_args)
    if label_outliers:
        # annotate genes furthest from the diagonal
        outliers = identify_outliers_by_diagonal(neg_means, pos_means, label_outliers)
        texts = [plt.text(s=neg_means.index[i],x=neg_means.iloc[i], y=pos_means.iloc[i], fontsize=6, color=[.8, .3, .05]) for i in outliers]
        if adjustText_present:
            adjust_text(texts, x=neg_means.values, y=pos_means.values, arrowprops=dict(lw=1, arrowstyle="-", color="black"),
                )
    # draw the y = x diagonal across the current view
    xlim = plt.gca().get_xlim()
    ylim = plt.gca().get_ylim()
    plt.plot(
        [min(xlim[0], ylim[0]), max(xlim[1], ylim[1])],
        [min(xlim[0], ylim[0]), max(xlim[1], ylim[1])],
        '--', color='tomato', lw=1
    )
    plt.colorbar(label="Log2(# Mutated Lines)")
    plt.xlabel("Gene Mean Without Mutation")
    plt.ylabel("Gene Mean With Mutation")

    out = {
        "selective_NNMD_gene_median": nnmds.median(),
        "selective_NNMD_raveled": total_nnmd,
        "selective_AUROC_gene_median": aurocs.median(),
        "selective_AUROC_raveled": total_auroc
    }
    if ax is None:
        ax = plt.gca()
    if legend:
        handles = append_to_legend_handles([
            "%s: %1.2f" % (key.replace("_", ' '), val)
            for key, val in out.items()
        ], ax)
        plt.legend(handles=handles, loc="upper left", **legend_args)
    if metrics is None:
        return out
    else:
        metrics.update(out)
def copy_number_trend(gene_effect, copy_number,
        downsample=False,
        downsample_lower_quantile_bound=.05, downsample_upper_quantile_bound=.95,
        ax=None,
        metrics=None, legend=True, legend_args={},
        density_scatter_args={"alpha": .75}
    ):
    '''
    Produces a scatter of the raveled `gene_effect` matrix (y) vs the raveled `copy_number` matrix
    (x). Useful for visualizing how much depletion highly amplified regions can produce.
    If `downsample` is a float between 0 and 1, points with CN between `downsample_lower_quantile_bound` and
    `downsample_upper_quantile_bound` will be randomly reduced to the fraction given by `downsample`.
    This can greatly increase plotting speed by reducing the number of uninformative points
    with euploid CN being plotted.
    The overall correlation of the raveled gene effect and CN matrices is returned
    (computed on the FULL data, regardless of downsampling), or written into `metrics`
    if that dict is supplied.
    '''
    gene_effect, copy_number = gene_effect.align(copy_number, join='inner')
    ge_raveled, cn_raveled = np.ravel(gene_effect), np.ravel(copy_number)
    mask = pd.notnull(cn_raveled) & pd.notnull(ge_raveled)
    # BUGFIX: compute the reported metric once, BEFORE any downsampling. It was
    # previously recomputed on the downsampled points, so the returned
    # "overall" correlation silently depended on `downsample`.
    out = {
        "raveled_CN_corr": pearsonr(ge_raveled[mask], cn_raveled[mask])[0]
    }
    if downsample:
        # keep each mid-range point with probability `downsample`
        selection = np.random.binomial(p=downsample, n=1, size=len(cn_raveled))
        # nanquantile so the bounds are still defined when CN has missing values
        # (np.quantile returns NaN in that case, disabling the tail-keep logic)
        low = np.nanquantile(cn_raveled, downsample_lower_quantile_bound)
        high = np.nanquantile(cn_raveled, downsample_upper_quantile_bound)
        # always keep the informative tails
        selection[cn_raveled < low] = 1
        selection[cn_raveled > high] = 1
        # copy as float before writing NaN so we never mutate a view of the
        # caller's data or fail on an integer CN dtype
        cn_raveled = cn_raveled.astype(float).copy()
        cn_raveled[selection == 0] = np.nan
        mask = pd.notnull(cn_raveled) & pd.notnull(ge_raveled)
    if ax is None:
        ax = plt.gca()
    plt.sca(ax)
    density_scatter(cn_raveled[mask], ge_raveled[mask], **density_scatter_args)
    plt.xlabel("Copy Number")
    plt.ylabel("Gene Effect")
    if legend:
        handles = append_to_legend_handles([
            "%s: %1.2f" % (key.replace("_", ' '), val)
            for key, val in out.items()
        ], ax)
        plt.legend(handles=handles, loc="upper right", **legend_args)
    if metrics is None:
        return out
    else:
        metrics.update(out)
def copy_number_gene_corrs(gene_effect, copy_number,
        ax=None,
        metrics=None, legend=True, legend_args={},
        binplot_args={}
    ):
    '''
    Correlates each gene's column in `gene_effect` with its copy number column in
    `copy_number` (genes as columns, cell lines as rows), then bins the correlations
    by mean gene effect and plots them with `binplot`. Useful for spotting the two
    copy number effects: double strand break toxicity (nonessential genes anticorrelated
    with their own CN) and copy buffering (common essentials positively correlated).
    Returns an empty dict (no metrics are produced).
    '''
    aligned_ge, aligned_cn = gene_effect.align(copy_number, join='inner')
    per_gene_corr = aligned_ge.corrwith(aligned_cn)
    per_gene_mean = aligned_ge.mean()
    target_ax = plt.gca() if ax is None else ax
    plt.sca(target_ax)
    binplot(per_gene_mean, per_gene_corr, **binplot_args)
    plt.xlabel("Gene Effect Mean")
    plt.ylabel("Gene Effect R with CN")
    return {}


def guide_estimate_corr_vs_sd_scatter(predicted_lfc, observed_lfc,
        ax=None,
        metrics=None, legend=True, legend_args={},
        density_scatter_args={}
    ):
    '''
    For each sgRNA (column), correlates the model-predicted log fold-change with the
    observed log fold-change and scatters that correlation (y) against the sgRNA's SD
    in the observed matrix (x). Guides with lower SD carry less signal and may show
    lower correlation. Returns the median correlation over all guides and over the
    20% most variable guides, or writes them into `metrics` if that dict is supplied.
    '''
    guide_corr = predicted_lfc.corrwith(observed_lfc).dropna()
    guide_sd = observed_lfc.std().loc[guide_corr.index]
    density_scatter(guide_sd, guide_corr, ax=ax, **density_scatter_args)
    plt.xlabel("Guide LFC SD")
    plt.ylabel("Guide LFC Estimated/Observed R")

    most_variable = guide_sd > guide_sd.quantile(.8)
    summary = {
        "corrs_median": guide_corr.median(),
        "corrs_median_20ile_most_variable": guide_corr[most_variable].median()
    }
    target_ax = plt.gca() if ax is None else ax
    if legend:
        legend_lines = ["%s: %1.2f" % (key.replace("_", ' '), val)
                        for key, val in summary.items()]
        plt.legend(handles=append_to_legend_handles(legend_lines, target_ax),
            loc="lower right", **legend_args)
    if metrics is not None:
        metrics.update(summary)
        return None
    return summary
def guide_estimate_corr_vs_guide_efficacy_scatter(predicted_lfc, observed_lfc,
        guide_efficacy,
        ax=None,
        metrics=None, legend=False, legend_args={},
        density_scatter_args={}
    ):
    '''
    Given the two matrices of log fold-change, one predicted by the model (`predicted_lfc`)
    with guides as columns and replicates as rows, computes the correlation between
    each sgRNA, then plots that correlation with the `pandas.Series` `guide_efficacy`
    which should be estimated by the model. In general we expect lower fidelity between
    predicted and observed sgRNAs for guides with low efficacy.
    Returns an empty dict (no metrics are produced), or updates `metrics` with nothing.
    '''
    # per-guide (column-wise) correlation of predicted vs observed LFC
    corrs = predicted_lfc.corrwith(observed_lfc)
    guide_efficacy = guide_efficacy.loc[corrs.index]
    density_scatter(guide_efficacy, corrs, ax=ax, **density_scatter_args)
    plt.xlabel("Guide Efficacy")
    plt.ylabel("Guide LFC R")

    # no metrics defined for this plot; kept for interface consistency
    out = {
    }
    if ax is None:
        ax = plt.gca()
    if legend:
        handles = append_to_legend_handles([
            "%s: %1.2f" % (key.replace("_", ' '), val)
            for key, val in out.items()
        ], ax)
        plt.legend(handles=handles, loc="lower right", **legend_args)
    if metrics is None:
        return out
    else:
        metrics.update(out)
def predicted_vs_observed_readcounts(predicted_readcounts, observed_readcounts,
        ax=None, max_points=10000,
        metrics=None, legend=True, legend_args={},
        density_scatter_args={"alpha": .5, "s": 10, "diagonal":True}
    ):
    '''
    Given the two normalized matrices of readcounts, one predicted by the model (`predicted_readcounts`)
    and one observed (`observed_readcounts`),
    with guides as columns and replicates as rows, produces a scatter plot with observations
    on the x axis and predictions on y. Points are subsampled to `max_points`.
    For Chronos, very low observed readcounts will be systematically
    predicted to have more, due to the structure of counts noise (it is more likely to observe few counts
    if the real expectation is high than vice versa). If the total trend of readcounts is above or
    below the diagonal however, that may indicate a normalization problem.
    Returns the correlation (of the subsampled points), mean difference (should be near 0), and median
    difference (should also be near 0); both differences are computed over the full matrices.
    '''
    # log10(+1) so zero counts remain representable
    estimated = pd.DataFrame(np.log10(predicted_readcounts.values+1),
        index=predicted_readcounts.index,
        columns=predicted_readcounts.columns
    )
    observed = pd.DataFrame(np.log10(observed_readcounts.values+1),
        index=observed_readcounts.index,
        columns=observed_readcounts.columns
    )
    estimated, observed = estimated.align(observed, join="inner")
    stacked_est = np.ravel(estimated.values)
    stacked_obs = np.ravel(observed.values)

    if len(stacked_est) > max_points:
        # BUGFIX: sample WITHOUT replacement so no point is plotted or scored twice
        # (the previous call used the default replace=True)
        chosen = np.random.choice(len(stacked_est), size=max_points, replace=False)
    else:
        chosen = np.arange(len(stacked_est))
    stacked_obs = stacked_obs[chosen]
    stacked_est = stacked_est[chosen]
    mask = pd.notnull(stacked_est) & pd.notnull(stacked_obs)
    stacked_est = pd.Series(stacked_est[mask])
    stacked_obs = pd.Series(stacked_obs[mask])
    density_scatter(stacked_obs, stacked_est,
        ax=ax, **density_scatter_args)
    plt.xlabel("Observed Readcounts (Log10)")
    plt.ylabel("Estimated Readcounts (Log10)")
    # displacement metrics use the full (unsampled) matrices
    diff = pd.DataFrame(observed.values-estimated.values,
        index=observed.index, columns=observed.columns)
    out = {
        "readcount_estimate_corr": pearsonr(stacked_est.values, stacked_obs.values)[0],
        "readcount_estimate_mean_displacement": diff.mean().mean(),
        "readcount_estimate_median_displacement": diff.median().median()
    }
    if ax is None:
        ax = plt.gca()
    if legend:
        handles = append_to_legend_handles([
            "%s: %1.2f" % (key.replace("_", ' '), val)
            for key, val in out.items()
        ], ax)
        plt.legend(handles=handles, loc="lower right", fontsize=8, **legend_args)
    if metrics is None:
        return out
    else:
        metrics.update(out)
def lfc_corr_vs_excess_variance(predicted_lfc, observed_lfc, excess_variance,
        ax=None,
        metrics=None, legend=True, legend_args={'loc': 'upper right'},
        density_scatter_args={"alpha": .5, "s": 10, 'trend_line': False, 'label_outliers':5}
    ):
    '''
    Given the two matrices of log fold-change, one predicted by the model (`predicted_lfc`)
    and one observed (`observed_lfc`), with guides as columns and replicates as rows,
    computes the correlation between predicted and observed values for each replicate (row),
    then scatters that correlation (y) against the replicate's `excess_variance`
    (`pandas.Series`, x, log10 scale). Replicates with higher excess variance are noisier,
    so lower fidelity between predicted and observed LFC is expected there.
    Returns the median and minimum per-replicate correlation plus the 10 lowest-correlated
    replicate labels, or writes these into `metrics` if that dict is supplied.
    '''
    # per-replicate (row-wise) correlation of predicted vs observed LFC
    corrs = predicted_lfc.corrwith(observed_lfc, axis=1).dropna()
    corrs, excess_variance = corrs.align(excess_variance.dropna(), join='inner')
    if ax is None:
        ax = plt.gca()
    if excess_variance.dropna().nunique() == 1:
        # all screens share one excess variance value: jitter so points don't overplot
        excess_variance = excess_variance + np.random.uniform(0, excess_variance.dropna().iloc[0]/10, size=len(excess_variance))
        ax.set_xlabel("Screen Excess Variance (Log10) jittered")
    else:
        ax.set_xlabel("Screen Excess Variance (Log10)")
    density_scatter(np.log10(excess_variance), corrs,
        ax=ax, **density_scatter_args)

    ax.set_ylabel("Correlation Predicted/Observed LFC")

    out = {
        "lfc_cell_corrs_median": corrs.median(),
        "lfc_cell_corrs_min": corrs.min(),
    }
    if legend:
        handles = append_to_legend_handles([
            "%s: %1.2f" % (key.replace("_", ' '), val)
            for key, val in out.items()
        ], ax)
        plt.legend(handles=handles, **legend_args)
    # added after the legend so it is not rendered there
    out['lfc_cell_corrs_low'] = corrs.sort_values().index[:10]
    if metrics is None:
        return out
    else:
        metrics.update(out)
lfc_corr_vs_excess_variance(predicted_lfc, observed_lfc, excess_variance, 1023 | ax=None, 1024 | metrics=None, legend=True, legend_args={'loc': 'upper right'}, 1025 | density_scatter_args={"alpha": .5, "s": 10, 'trend_line': False, 'label_outliers':5} 1026 | ): 1027 | ''' 1028 | Given the two matrices of log fold-change, one predicted by the model (`predicted_lfc`) 1029 | with guides as columns and replicates as rows, computes the correlation between 1030 | each replicate, then correlates that correlation with the `pandas.Series` `guide_efficacy` 1031 | which should be estimated by the model. In general we expect lower fidelity between 1032 | predicted and observed sgRNAs for guides with low efficacy. 1033 | ''' 1034 | corrs = predicted_lfc.corrwith(observed_lfc, axis=1).dropna() 1035 | corrs, excess_variance = corrs.align(excess_variance.dropna(), join='inner') 1036 | if ax is None: 1037 | ax = plt.gca() 1038 | if excess_variance.dropna().nunique() == 1: 1039 | excess_variance = excess_variance + np.random.uniform(0, excess_variance.dropna().iloc[0]/10, size=len(excess_variance)) 1040 | ax.set_xlabel("Screen Excess Variance (Log10) jittered") 1041 | else: 1042 | ax.set_xlabel("Screen Excess Variance (Log10)") 1043 | density_scatter(np.log10(excess_variance), corrs, 1044 | ax=ax, **density_scatter_args) 1045 | 1046 | ax.set_ylabel("Correlation Predicted/Observed LFC") 1047 | 1048 | out = { 1049 | "lfc_cell_corrs_median": corrs.median(), 1050 | "lfc_cell_corrs_min": corrs.min(), 1051 | } 1052 | if legend: 1053 | handles = append_to_legend_handles([ 1054 | "%s: %1.2f" % (key.replace("_", ' '), val) 1055 | for key, val in out.items() 1056 | ], ax) 1057 | plt.legend(handles=handles, **legend_args) 1058 | out['lfc_cell_corrs_low'] = corrs.sort_values().index[:10] 1059 | if metrics is None: 1060 | return out 1061 | else: 1062 | metrics.update(out) 1063 | 1064 | 1065 | 1066 | def _mean_ge_deviation_vs_grad( 1067 | predicted_readcounts, observed_readcounts, guide_map, 
ge_mean_grad, 1068 | ax=None, 1069 | metrics=None, legend=True, legend_args={}, 1070 | density_scatter_args={"alpha": .5, "s": 10} 1071 | ): 1072 | ''' 1073 | A Chronos-specific plot. 1074 | Given the two normalized matrices of readcounts, one predicted by the model (`predicted_readcounts`) 1075 | and one observed (`observed_readcounts`), with guides as columns and replicates as rows, 1076 | computes the difference in mean log readcounts predicted from observed for each gene by 1077 | taking the mean of each sgRNA's difference of mean log readcounts. This is plotted vs 1078 | the NB2 cost gradient on the gene's mean value. Genes with systematically higher predicted 1079 | than observed readcounts should have negative cost gradients and vice versa. This is useful 1080 | for Chronos debugging. 1081 | ''' 1082 | estimated = pd.DataFrame(np.log10(predicted_readcounts.values+1), 1083 | index=predicted_readcounts.index, 1084 | columns=predicted_readcounts.columns 1085 | ) 1086 | observed = pd.DataFrame(np.log10(observed_readcounts.values+1), 1087 | index=observed_readcounts.index, 1088 | columns=observed_readcounts.columns 1089 | ) 1090 | estimated, observed = estimated.align(observed) 1091 | diff = estimated.mean() - observed.mean() 1092 | diff_gene = diff.groupby(guide_map.set_index("sgrna").gene).mean() 1093 | density_scatter(diff_gene, ge_mean_grad, 1094 | ax=ax, **density_scatter_args) 1095 | plt.xlabel("Estimated - Observed Readcounts (Log10)") 1096 | plt.ylabel("Mean Gene Effect Cost Gradient") 1097 | 1098 | 1099 | def check_integration_umap(gene_effect, sequence_map, 1100 | variance_quantile=.5, 1101 | ax=None, metrics=None, legend=True, 1102 | legend_args=dict(loc='upper left', bbox_to_anchor=(1, 1.05)), 1103 | scatter_args=dict(alpha=1, s=10) 1104 | ): 1105 | ''' 1106 | Given the matrix of `gene_effect` and a `dict` of `sequence_map`s (see chronos.Chronos doc string 1107 | for format), creates a UMAP embedding of cell lines in gene effect space, colored by the 
    presence of the cell lines in the various batches indicated by the keys of `sequence_map`.
    To make the legend a manageable size, batch names are abbreviated to two letters. Returns the
    max variance explained by batch membership as evaluated by finding the principle components of
    `gene_effect`, correlating them with batch membership indicators, and multiplying that squared
    correlation with the variance explained by the component, summed over all PCs (one result
    per batch), returning the result for the batch that explains the most variance. This is useful
    to evaluate how well different batches are integrated.
    '''
    if not umap_present:
        raise ModuleNotFoundError("umap must be installed to use this plot")
    # short abbreviations for the batch names, used to build compact legend labels
    aliases = _make_aliases(list(sequence_map.keys()))
    # one color per subset of batches: a cell line can be screened in several batches
    palette = generate_powerset_palette(sequence_map.keys(), start=0, base_hsv_value=1)
    keysets = powerset(sequence_map.keys())
    keyset_lines = {}
    # boolean matrix: True where the cell line (row) appears in the batch (column);
    # the pDNA pseudo-line is excluded
    indicators = pd.DataFrame({
        key: pd.Series(True,
            index=sorted(set(sequence_map[key].cell_line_name) - set(['pDNA']))
        )
        for key in sequence_map
    })
    indicators.fillna(False, inplace=True)
    gene_effect, indicators = gene_effect.align(indicators, join="inner", axis=0)
    # keep only the more variable genes (above variance_quantile) for the embedding
    sds = gene_effect.std()
    cutoff = sds.quantile(variance_quantile)
    gene_effect = gene_effect[sds.loc[lambda x: x>cutoff].index]
    # assign each cell line to the exact subset of batches it was screened in
    for keyset in keysets:
        if not len(keyset):
            continue
        # lines present in every batch of this subset...
        lines = set.intersection(*[set(sequence_map[key].cell_line_name) - set(['pDNA'])
            for key in keyset]) & set(gene_effect.index)
        if len(keyset) < len(sequence_map):
            # ...and in none of the remaining batches
            lines -= set.union(*[set(sequence_map[key].cell_line_name) - set(['pDNA'])
                for key in sequence_map.keys()
                if not key in keyset
            ])
        keyset_lines[keyset] = sorted(lines)

    ump = UMAP(n_neighbors=5, min_dist=.02)
    umps = pd.DataFrame(ump.fit_transform(gene_effect.dropna(axis=1).values), index=gene_effect.index)
    for keyset, lines in keyset_lines.items():
        # thicker black outline for lines shared by more batches
        plt.scatter(umps.loc[lines, 0], umps.loc[lines, 1], label=''.join(aliases[list(keyset)]),
            color=palette[keyset],
            linewidth=(len(keyset)-1)/2, edgecolor='black', **scatter_args)
    plt.xlabel("UMAP1")
    plt.ylabel("UMAP2")
    if legend:
        plt.legend(**legend_args)

    out = {}

    # variance explained by batch membership: squared correlation of each PC with
    # each batch indicator, weighted by the PC's explained variance ratio and
    # summed over PCs; report the worst-integrated (max) batch
    pca = PCA()
    pcs = pd.DataFrame(pca.fit_transform(gene_effect.dropna(axis=1).values), index=gene_effect.index)
    corrs_squared = fast_cor(pcs, indicators.astype(float))**2
    out['library_pc_variance_explained_max'] = corrs_squared\
        .multiply(pca.explained_variance_ratio_, axis=0)\
        .sum()\
        .sort_values(ascending=False)\
        .max()
    if metrics is None:
        return out
    else:
        metrics.update(out)



def check_integration_mean_deviation(gene_effect, sequence_map,
        ax=None, metrics=None, legend=True,
        legend_args=dict(fontsize=7),
        plot_args=dict(lw=1)
        ):
    '''
    Given the matrix of `gene_effect` and a `dict` of `sequence_map`s (see chronos.Chronos doc string
    for format), calculates the mean gene effect for each gene within each batch of the sequence map,
    then the squared difference between that mean and the overall mean. This is plotted as a trend line
    per batch vs the overall gene mean. Returns the mean of the square root of this per-batch variance
    from the overall mean, and the genes with the largest variance in each batch.
1184 | ''' 1185 | keyset_lines = {} 1186 | indicators = pd.DataFrame({ 1187 | key: pd.Series(True, 1188 | index=sorted(set(sequence_map[key].cell_line_name) - set(['pDNA'])) 1189 | ) 1190 | for key in sequence_map 1191 | }) 1192 | means1 = gene_effect.mean() 1193 | cutoffs = means1.quantile([min(.1, 100/len(means1)), max(.9, 1-100/len(means1))]) 1194 | keep = means1.loc[lambda x: (x < cutoffs.iloc[1]) & (x > cutoffs.iloc[0])].index 1195 | gene_effect = gene_effect[keep] 1196 | indicators.fillna(False, inplace=True) 1197 | gene_effect, indicators = gene_effect.align(indicators, join="inner", axis=0) 1198 | library_means = pd.DataFrame({ 1199 | library: gene_effect[indicators[library]].mean() 1200 | for library in indicators 1201 | }) 1202 | means = gene_effect.mean() 1203 | if ax is None: 1204 | ax = plt.gca() 1205 | else: 1206 | plt.sca(ax) 1207 | for library in indicators: 1208 | y = (library_means[library]-means)**2 1209 | trend = np.clip(lowess_trend(means, y), 0, np.inf) 1210 | order = np.argsort(means) 1211 | plt.plot(means.iloc[order], trend[order], label=library, **plot_args) 1212 | plt.xlabel("Gene Mean Overall") 1213 | plt.ylabel("Gene Mean Variance Trend") 1214 | 1215 | out = {} 1216 | sd = gene_effect.std() 1217 | normed_library_sd = np.sqrt(indicators.astype(float).sum()) * np.abs(library_means.subtract(means, axis=0)) 1218 | out['normed_library_deviation'] = normed_library_sd.mean().mean() 1219 | if legend: 1220 | handles = append_to_legend_handles([ 1221 | "%s: %1.2f" % (key.replace("_", ' '), val) 1222 | for key, val in out.items() 1223 | ], ax) 1224 | plt.legend(handles=handles, **legend_args) 1225 | 1226 | out['library_outliers'] = {key: normed_library_sd[key].dropna().sort_values()[-5:] 1227 | for key in normed_library_sd} 1228 | 1229 | 1230 | if ax is None: 1231 | ax = plt.gca() 1232 | else: 1233 | plt.sca(ax) 1234 | 1235 | if metrics is None: 1236 | return out 1237 | else: 1238 | metrics.update(out) 1239 | 1240 | 1241 | def guide_lfc_plot(lfc, 
palette): 1242 | '''convenience method for kde plotting a subset of sgRNA's log fold change with fixed color for each sgRNA''' 1243 | for j, key in enumerate(lfc.keys()): 1244 | for guide in palette[key].index: 1245 | sns.kdeplot(lfc[key][guide], label=key + guide[:4], bw_adjust=.5, color=palette[key][guide], 1246 | lw=.5) 1247 | 1248 | 1249 | def guide_palette(guide_map, gene): 1250 | ''' 1251 | Returns a palette with a unique color for each sgRNA in `guide_map` targeting `gene`. 1252 | ''' 1253 | start = np.pi * np.arange(len(guide_map))/len(guide_map) 1254 | palette = {} 1255 | for i, key in enumerate(guide_map): 1256 | guides = guide_map[key].query("gene == %r" % gene).sgrna.unique() 1257 | palette[key] = pd.Series( 1258 | sns.cubehelix_palette(len(guides), start=start[i], rot=.25/len(guide_map), dark=.35, light=.7, hue=1), 1259 | index=guides 1260 | ) 1261 | return palette 1262 | 1263 | 1264 | def interrogate_gene(data, naive, naive_collapsed, gene, plot_width, plot_height): 1265 | ''' 1266 | Creates a set of summary plots for a given gene effect profile. 1267 | Parameters: 1268 | `data` (`dict`): must contain (all of these files can be loaded from a `chronos.Chronos.save` directory) 1269 | "gene_effect": `pandas.DataFrame` with genes as columns, 1270 | "logfoldchange": `pandas.DataFrame` with sgRNAs as columns, 1271 | "guide_efficacy": `pandas.Series` indexed by sgRNA with efficacy estimates, 1272 | "t0_offset": `pandas.Series` indexed by sgRNA with offset estimates, 1273 | "library effect": pandas.DataFrame` with genes as columns, 1274 | `naive` (`dict`): contains a `pandas.DataFrame` matrix per batch with naive estimates of gene effect 1275 | (typically median log fold change over guides per gene and replicates per cell line) 1276 | `naive_collapsed`: a `pandas.DataFrame` matrix holding he consensus naive estimate over all libraries. 1277 | Easily calculcated from `chronos.reports.collapse_dataframes`. 
        `gene` (`str`): the gene of interest
        `plot_width`, `plot_height`: the total width of the figure and the height of individual panels, in inches.
    Returns:
        `matplotlib.Figure`
    '''
    palette = guide_palette(data['guide_map'], gene)
    # 3x2 grid of panels, flattened for easy indexing
    fig, axes = plt.subplots(3, 2, figsize=(plot_width, plot_height*2.5))
    axes = [a for ax in axes for a in ax]

    # panel 0: Chronos gene effect vs the collapsed naive (median LFC) estimate
    plt.sca(axes[0])
    density_scatter(naive_collapsed[gene], data["gene_effect"][gene],
        diagonal=True, label_outliers=5, outliers_from='diagonal')
    plt.xlabel("Naive Gene Effect")
    plt.ylabel("Gene Effect")

    # panel 1: observed LFC distribution per sgRNA, one curve per guide/library
    plt.sca(axes[1])
    for j, key in enumerate(data['logfoldchange'].keys()):
        for guide in palette[key].index:
            sns.kdeplot(data['logfoldchange'][key][guide], label=key + '_' + guide[:4], bw_adjust=.5,
                color=palette[key][guide],
                lw=1)
    plt.legend(fontsize=6)
    plt.xlabel("Guide LFC")

    # panel 2: estimated guide efficacy vs t0 offset, labeled per guide
    plt.sca(axes[2])
    labels = []
    for library in palette:
        # guides without an estimate are placed at -.1 so they remain visible
        x = data['guide_efficacy'].reindex(palette[library].index).fillna(-.1)
        y = data['t0_offset'][library].reindex(palette[library].index).fillna(-.1)
        plt.scatter(
            x, y,
            s=20, alpha=.75, linewidth=1, color=palette[library]
        )
        labels.extend([plt.text(s='%s_%s' % (library, ind[:4]),
            x=x[ind],
            y=y[ind],
            fontsize=6, color=palette[library][ind]
        ) for ind in palette[library].index])
    if adjustText_present:
        adjust_text(labels, arrowprops=dict(arrowstyle='-', color="black", lw=.5))
    plt.xlabel("Guide Efficacy")
    plt.ylabel("T0 Guide Offset")

    # panel 3: per-library naive mean for this gene vs the model's library effect
    plt.sca(axes[3])
    x = pd.Series({library: naive[library][gene].mean()
        for library in naive
        if gene in naive[library]})
    y = data['library_effect'].loc[gene]
    # one representative color per library: its first guide's color
    colors = pd.Series({library: palette[library].iloc[0]
        for library in palette
        if len(palette[library])})
    x, y = x.dropna().align(y.dropna(), join="inner")
    x, colors = x.align(colors.dropna(), join="inner")
    # realign after the colors join may have shrunk x
    x, y = x.align(y, join="inner")
    plt.scatter(x, y, s=20, alpha=.75, linewidth=1, c=colors)
    labels = [plt.text(
        s=ind,
        x=x[ind],
        y=y[ind],
        fontsize=8, color=colors[ind]
    ) for ind in x.index]
    if adjustText_present:
        adjust_text(labels,
            arrowprops=dict(arrowstyle='-', color="black", lw=.5))
    plt.xlabel("Library Naive Gene Average")
    plt.ylabel("Library Effect")

    # panels 4 and 5: predicted vs observed guide LFC in the cell lines where
    # this gene's effect is most and least depleting
    sorted_ge = data['gene_effect'][gene].sort_values().dropna()
    lowest_line = sorted_ge.index[0]
    highest_line = sorted_ge.index[-1]

    plt.sca(axes[4])
    single_line_interrogation(data, gene, lowest_line)
    plt.title('%s in %s (Lowest)' % (gene, lowest_line), fontsize=10)

    plt.sca(axes[5])
    single_line_interrogation(data, gene, highest_line)
    plt.title('%s in %s (Highest)' % (gene, highest_line), fontsize=10)

    return fig


def single_line_interrogation(data, gene, line, ax=None,
        density_scatter_args={'trend_line': False, 'diagonal': True}
        ):
    '''
    A scatterplot of predicted vs observed log fold-change of sgRNAs for the selected gene
    in screened replicates of the selected line.
    Parameters:
        `data` (`dict`): see `interrogate_gene`
        `gene` (`str`): the gene of interest
        `line` (`str`): the cell line of interest.
    '''
    if not ax is None:
        plt.sca(ax)
    # sgRNAs targeting this gene, per library
    guides = {library: data['guide_map'][library].query("gene == %r" % gene).sgrna.unique()
        for library in data['guide_map']}
    # replicate sequence IDs screened for this cell line, per library
    sequences = {library: data['sequence_map'][library].query("cell_line_name == %r" % line).sequence_ID.unique()
        for library in data['sequence_map']
    }

    abbreviated_guide_mapper = {}
    abbreviated_replicate_mapper = {}
    stacked_lfc = []
    stacked_lfc_predicted = []
    aliases = _make_aliases(list(data['logfoldchange'].keys()))

    def consolidate_index(index):
        # collapse a (replicate, guide) MultiIndex of abbreviated labels into
        # single '<lib>Rep<i>Guide<j>' strings, checking both levels agree on
        # the library alias
        out = []
        for v in list(index):
            lib1, rep = v[0].split('Rep')
            lib2, guide = v[1].split('Guide')
            if lib1 != lib2:
                raise ValueError("Something went wrong in abbreviating index labels for log fold change")
            out.append( '%sRep%sGuide%s' % (lib1, rep, guide))
        return out

    for key, lfc in data['logfoldchange'].items():
        subset = lfc.loc[sequences[key], guides[key]]
        # map long sgRNA sequences and sequence IDs to short numbered labels
        abbreviated_guide_mapper.update({guide: '%sGuide%i' %(aliases[key], i+1)
            for i, guide in enumerate(guides[key])})
        abbreviated_replicate_mapper.update({sequence: '%sRep%i' %(aliases[key], i+1)
            for i, sequence in enumerate(sequences[key])})
        subset_predicted = data['predicted_logfoldchange'][key].loc[sequences[key], guides[key]]
        subset.rename(index=abbreviated_replicate_mapper, columns=abbreviated_guide_mapper, inplace=True)
        subset_predicted.rename(index=abbreviated_replicate_mapper, columns=abbreviated_guide_mapper, inplace=True)
        # stack to one value per (replicate, guide) pair, then flatten the index
        stacked = subset.stack()
        stacked_predicted = subset_predicted.stack()
        stacked.index = consolidate_index(stacked.index)
        stacked_predicted.index = consolidate_index(stacked_predicted.index)
        stacked_lfc.append(stacked)
        stacked_lfc_predicted.append(stacked_predicted)
    x = pd.concat(stacked_lfc)
    y = pd.concat(stacked_lfc_predicted)
    x, y = x.align(y, join="inner")
    if not len(x):
        # nothing to plot: the gene or line is absent from all libraries
        return
    density_scatter(x, y, **density_scatter_args)
    texts = [plt.text(s=ind, x=x[ind], y=y[ind], fontsize=7) for ind in x.index]
    if adjustText_present:
        adjust_text(texts, arrowprops=dict(arrowstyle="-", color="black", lw=.5))
    plt.xlabel("Observed LFC")
    plt.ylabel("Predicted LFC")
    plt.title('%s in %s' % (gene, line))
    # print the abbreviation keys so the short point labels can be decoded
    print("Guide and replicate key for %s, %s:\n%r\n%r\n%r" % (gene, line, aliases, pd.Series(abbreviated_guide_mapper),
        pd.Series(abbreviated_replicate_mapper)))


def interrogate_gene_compare(paired_data, lfc, guide_map, gene, plot_width, plot_height):
    '''
    Creates a set of comparison plots for results from two different models for a specific gene.
    This is mostly useful for internal Chronos development.
    Parameters:
        `paired_data` (`dict`): must contain two keys labeling `data` (`dict`) from two different models.
            See `interrogate_gene` for the format of `data`.
        `lfc` (`dict`): one key per batch, with the value being a `pandas.DataFrame` of observed
            log fold change, with sgRNAs as columns.
        `guide_map` (`pandas.DataFrame`): see `chronos.Chronos` for format.
        `gene` (`str`): the gene to examine.
        `plot_width`, `plot_height`: the total width of the figure and the height of individual panels, in inches.
    Returns:
        `matplotlib.Figure`
    '''
    keys = list(paired_data.keys())
    palette = guide_palette(guide_map, gene)
    # 2x2 grid of panels, flattened for easy indexing
    fig, axes = plt.subplots(2, 2, figsize=(plot_width, plot_height))
    axes = [a for ax in axes for a in ax]

    # panel 0: gene effect, model 1 vs model 2
    plt.sca(axes[0])
    density_scatter(paired_data[keys[0]]["gene_effect"][gene], paired_data[keys[1]]["gene_effect"][gene],
        diagonal=True, label_outliers=5, outliers_from='diagonal')
    plt.xlabel(keys[0])
    plt.ylabel(keys[1])
    plt.title('%s Gene Effect' % gene)

    # panel 2 (lower left): observed per-guide LFC distributions (model-independent)
    plt.sca(axes[2])
    for j, key in enumerate(lfc.keys()):
        for guide in palette[key].index:
            sns.kdeplot(lfc[key][guide], label=key + '_' + guide[:4], bw_adjust=.5,
                color=palette[key][guide],
                lw=1)
    plt.legend(fontsize=6)
    plt.xlabel("Guide LFC")

    # panel 3 (lower right): guide efficacy, model 1 vs model 2
    plt.sca(axes[3])
    plt.title("Guide Efficacy")
    labels = []
    for library in lfc:
        # guides without an estimate are placed at -.1 so they remain visible
        x = paired_data[keys[0]]['guide_efficacy'].reindex(palette[library].index).fillna(-.1)
        y = paired_data[keys[1]]['guide_efficacy'].reindex(palette[library].index).fillna(-.1)
        plt.scatter(
            x, y,
            s=20, alpha=.75, linewidth=1, color=palette[library]
        )
        labels.extend([plt.text(s='%s_%s' % (library, ind[:4]),
            x=x[ind],
            y=y[ind],
            fontsize=6, color=palette[library][ind]
        ) for ind in palette[library].index])
    if adjustText_present:
        adjust_text(labels, arrowprops=dict(arrowstyle='-', color="black", lw=.5))
    plt.xlabel(keys[0])
    plt.ylabel(keys[1])

    # panel 1 (upper right): correlation of each model's gene effect with the
    # naive per-line median LFC of this gene's guides, model 1 vs model 2
    plt.sca(axes[1])
    corrs = {}
    for key in keys:
        corrs[key] = {}
        for library in lfc:
            naive = lfc[library][palette[library].index]\
                .groupby(paired_data[key]['sequence_map'][library].set_index("sequence_ID").cell_line_name)\
                .median()
            series = fast_cor(
                paired_data[key]['gene_effect'][[gene]],
                naive
            ).loc[gene]
            corrs[key][library] = series
    labels = []
    for library in lfc:
        # NOTE(review): the two series are passed positionally; this assumes both
        # models' sequence maps yield the same cell-line ordering — confirm
        plt.scatter(corrs[keys[0]][library], corrs[keys[1]][library],
            color=palette[library], s=15, alpha=.75)
        labels.extend([plt.text(s='%s_%s' % (library, ind[:4]),
            x=corrs[keys[0]][library][ind],
            y=corrs[keys[1]][library][ind],
            fontsize=6, color=palette[library][ind]
        ) for ind in corrs[keys[0]][library].index])
    if adjustText_present:
        adjust_text(labels, arrowprops=dict(arrowstyle='-', color="black", lw=.5))
    plt.xlabel(keys[0])
    plt.ylabel(keys[1])
    plt.title("Gene Effect - Guide LFC Corr")

    return fig