├── Data ├── SampleData │ ├── OmicsCNGene.hdf5 │ ├── KYReadcounts.hdf5 │ ├── AvanaReadcounts.hdf5 │ ├── DeWeirdtReadcounts.hdf5 │ ├── OmicsExpressionProteinCodingGenesTPMLogp1.hdf5 │ ├── DeWeirdtConditionMap.csv │ ├── RNAiExpressionAddictions.csv │ ├── KYSequenceMap.csv │ ├── OmicsSomaticMutations.csv │ ├── AchillesCommonEssentialControls.csv │ ├── AchillesNonessentialControls.csv │ └── AvanaSequenceMap.csv └── DepMapDataURLs.json ├── chronos ├── __init__.py ├── fetch_parameters.py ├── copy_correction.py ├── figshare.py ├── plotting.py ├── reports.py └── evaluations.py ├── .gitignore ├── setup.py ├── project.toml ├── LICENSE └── README.md /Data/SampleData/OmicsCNGene.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/chronos/HEAD/Data/SampleData/OmicsCNGene.hdf5 -------------------------------------------------------------------------------- /Data/SampleData/KYReadcounts.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/chronos/HEAD/Data/SampleData/KYReadcounts.hdf5 -------------------------------------------------------------------------------- /Data/SampleData/AvanaReadcounts.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/chronos/HEAD/Data/SampleData/AvanaReadcounts.hdf5 -------------------------------------------------------------------------------- /Data/SampleData/DeWeirdtReadcounts.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/chronos/HEAD/Data/SampleData/DeWeirdtReadcounts.hdf5 -------------------------------------------------------------------------------- /chronos/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import * 2 | from .copy_correction import 
* 3 | from .figshare import * 4 | from .fetch_parameters import fetch_parameters -------------------------------------------------------------------------------- /Data/SampleData/OmicsExpressionProteinCodingGenesTPMLogp1.hdf5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/broadinstitute/chronos/HEAD/Data/SampleData/OmicsExpressionProteinCodingGenesTPMLogp1.hdf5 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.DS_store 2 | Data/Achilles_run/ 3 | Data/Achilles_run_compare/ 4 | Data/logs/ 5 | Data/reports 6 | Data/DepMapParameters/ 7 | *.egg-info 8 | *.pyc 9 | .ipynb_checkpoints/ 10 | build/ 11 | dist/ 12 | chronos/__pycache__ 13 | chronos/unimodal_density_estimate.py 14 | 15 | -------------------------------------------------------------------------------- /Data/DepMapDataURLs.json: -------------------------------------------------------------------------------- 1 | { 2 | "gene_effect.csv": "https://plus.figshare.com/ndownloader/files/43346616", 3 | "guide_efficacy.csv": "https://plus.figshare.com/ndownloader/files/43346709", 4 | "cell_line_efficacy.csv": "https://plus.figshare.com/ndownloader/files/43346718", 5 | "library_effect.csv": "https://plus.figshare.com/ndownloader/files/43346715", 6 | "t0_offset.csv": "https://plus.figshare.com/ndownloader/files/43346733" 7 | } -------------------------------------------------------------------------------- /Data/SampleData/DeWeirdtConditionMap.csv: -------------------------------------------------------------------------------- 1 | sequence_ID,replicate,cell_line_name,days,pDNA_batch,condition 2 | pDNA,pDNA,pDNA,21,batch1,pDNA 3 | Meljuso,RepA,Meljuso,21,batch1,Control 4 | Meljuso.1,RepB,Meljuso,21,batch1,Control 5 | Meljuso.2,RepA,Meljuso,21,batch1,A-1331852 6 | Meljuso.3,RepB,Meljuso,21,batch1,A-1331852 7 | 
from setuptools import setup, find_packages

# Optional dependency groups. Each key is a pip "extra",
# e.g. `pip install crispr_chronos[reports]`.
extras_require = {
    "copy_correction": ["patsy>=0.5.2"],
    "evaluations": ["matplotlib>=3.6", "seaborn>=0.12", "scikit-learn>=1.1", "statsmodels>=0.13", "scipy>=1.9"],
    "adjust_text": ["adjustText"],
    "embedding": ["umap-learn>=0.5.3"],
    "reports": ["reportlab>=3.6"],
    "model": ["numpy>=1.2", "pandas>=1.3", "tensorflow>2", "h5py>=3.7"],
    "hit_calling": ["scipy>=1.9", "sympy>=1.0", "statsmodels>=0.13"]
}
# "all" is the deduplicated, sorted union of every group.
extras_require['all'] = sorted(set.union(*[set(v) for v in extras_require.values()]))

setup(
    name='crispr_chronos',
    version='2.3.10',
    author="BroadInstitute CDS",
    description="Time series modeling of CRISPR perturbation readcounts in biological data",
    packages=find_packages(),
    package_data={'': ['*.r']},
    # Everything is installed by default so a plain `pip install` works
    # out of the box...
    install_requires=extras_require['all'],
    # ...but the groups are also exposed as extras for selective installs.
    # (Previously built above yet never passed to setup(), so extras were
    # silently unavailable.)
    extras_require=extras_require
)
package for processing readcount data from CRISPR knockout viability experiments" 18 | readme = "README.md" 19 | requires-python = ">=3.8" 20 | classifiers = [ 21 | "Programming Language :: Python :: 3", 22 | "License :: OSI Approved :: BSD License", 23 | "Operating System :: OS Independent", 24 | "Development Status :: 5 - Production/Stable", 25 | "Intended Audience :: Science/Research", 26 | "Natural Language :: English", 27 | "Topic :: Scientific/Engineering :: Bio-Informatics", 28 | ] 29 | 30 | [project.urls] 31 | "Homepage" = "https://github.com/broadinstitute/chronos" 32 | "Bug Tracker" = "https://github.com/broadinstitute/chronos/issues" 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2021, 2023 Joshua M. Dempster 2 | 3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 4 | 5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 6 | 7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | 9 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /Data/SampleData/RNAiExpressionAddictions.csv: -------------------------------------------------------------------------------- 1 | Gene 2 | AADAC (13) 3 | ACTL8 (81569) 4 | AKT1 (207) 5 | ANG (283) 6 | APOBR (55911) 7 | ARHGAP29 (9411) 8 | ARHGEF6 (9459) 9 | ATP8B2 (57198) 10 | AXL (558) 11 | BCL2 (596) 12 | BTK (695) 13 | C11orf71 (54494) 14 | C8orf37 (157657) 15 | CACFD1 (11094) 16 | CBFA2T3 (863) 17 | CCND1 (595) 18 | CCND2 (894) 19 | CDX2 (1045) 20 | CEBPA (1050) 21 | CSF2RB (1439) 22 | DTX2 (113878) 23 | EBF1 (1879) 24 | EGFR (1956) 25 | ERBB3 (2065) 26 | ESR1 (2099) 27 | FBRSL1 (57666) 28 | FERMT1 (55612) 29 | FGFR1 (2260) 30 | FLI1 (2313) 31 | FLT3 (2322) 32 | FOXA1 (3169) 33 | FOXR2 (139628) 34 | FRAT1 (10023) 35 | FUNDC2 (65991) 36 | FZD8 (8325) 37 | GATA1 (2623) 38 | GATA2 (2624) 39 | GATA3 (2625) 40 | GNAI2 (2771) 41 | GNS (2799) 42 | GRHL2 (79977) 43 | HERC1 (8925) 44 | HNF1B (6928) 45 | HNF4A (3172) 46 | HNRNPH1 (3187) 47 | HOXB13 (10481) 48 | INHBC (3626) 49 | IRF2BP2 (359948) 50 | IRF4 (3662) 51 | IRF8 (3394) 52 | JUN (3725) 53 | JUP (3728) 54 | KLF5 (688) 55 | KRAS (3845) 56 | LMX1B (4010) 57 | LYL1 (4066) 58 | MCL1 (4170) 59 | MDM2 (4193) 60 | MECOM (2122) 61 | MEF2D (4209) 62 | MET (4233) 63 | MPRIP (23164) 64 | MSI2 (124540) 65 | MYB (4602) 66 | MYBL1 (4603) 67 | NAV3 (89795) 68 | NFKBIE (4794) 69 | NKX2-1 (7080) 70 | PARD3 (56288) 71 | PARD6B (84612) 72 | 
import os
import json
import requests
import pandas as pd
from .model import write_hdf5


def _resolve_path(path, chronos_dir, relative_to_chronos, label):
    """Resolve a user-supplied path for `fetch_parameters`.

    Absolute paths are returned unchanged. Relative paths are resolved
    against the chronos package directory when `relative_to_chronos` is
    True (with a notice telling the user how to override this), and
    against the current working directory otherwise.
    """
    if path.startswith("/"):
        return path
    if relative_to_chronos:
        print("`%s` will be found relative to the chronos package directory\n'%s'\n\
Pass `relative_to_chronos=False` to make the path relative to your current working directory\n'%s'\n\
instead.\n" % (label, chronos_dir, os.getcwd()))
        return os.path.join(chronos_dir, '..', path)
    return path


def fetch_parameters(url_loc="Data/DepMapDataURLs.json",
        output_dir="Data/DepMapParameters/", overwrite=False,
        relative_to_chronos=True
    ):
    '''
    Fetch a set of trained Chronos parameters located at the urls in
    the json file at `url_loc` (see default file for an example)
    and write them to the local directory in `output_dir`.
    Files present will be skipped unless `overwrite` is `True`.
    Both `url_loc` and `output_dir` are relative to the chronos package
    unless `relative_to_chronos` is `False`.

    Raises:
        `requests.HTTPError`: if any parameter file fails to download.
    '''
    chronos_dir = os.path.dirname(__file__)
    url_loc = _resolve_path(url_loc, chronos_dir, relative_to_chronos, "url_loc")
    output_dir = _resolve_path(output_dir, chronos_dir, relative_to_chronos, "output_dir")

    # makedirs creates missing intermediate directories; the previous
    # os.mkdir failed if the parent did not exist.
    os.makedirs(output_dir, exist_ok=True)

    print("downloading files to %s" % output_dir)

    # Context manager closes the handle promptly (was a bare open().read()).
    with open(url_loc) as f:
        url_dict = json.load(f)

    for filename, url in url_dict.items():
        path = os.path.join(output_dir, filename)
        # os.path.exists avoids re-listing the directory on every iteration.
        if os.path.exists(path) and not overwrite:
            print("Skipping %s as it already exists, pass `overwrite=True` to overwrite" % filename)
        else:
            print("fetching %s from %s" % (filename, url))
            response = requests.get(url, allow_redirects=True)
            # Fail loudly on a bad status instead of silently writing an
            # HTML error page to disk as if it were the parameter file.
            response.raise_for_status()
            with open(path, 'wb') as out:
                out.write(response.content)
    print("all files fetched, transforming format")
    reformat_directory(output_dir)
    print('done')


def reformat_directory(directory):
    '''Transforms file formats in `directory` from DepMap release format to Chronos' expected format for `import_model`.'''
    # gene_effect is converted to hdf5 only once; the csv is large.
    if not "gene_effect.hdf5" in os.listdir(directory):
        print("transforming gene_effect.csv, this may take a minute")
        ge = pd.read_csv(os.path.join(directory, "gene_effect.csv"), index_col=0)
        write_hdf5(ge, os.path.join(directory, "gene_effect.hdf5"))

    print("transforming guide efficacy")
    guide_eff = pd.read_csv(os.path.join(directory, "guide_efficacy.csv"))
    # DepMap releases capitalize these columns; Chronos expects lowercase.
    # errors="ignore" makes this a no-op for already-lowercase files.
    guide_eff.rename(columns={"sgRNA": "sgrna", "Efficacy": "efficacy"}, errors="ignore", inplace=True)
    guide_eff.to_csv(os.path.join(directory, "guide_efficacy.csv"), index=None)
HEC1_c907R1_KY-1,SC-001517.KY01,14,KY-1,A,2DS,ACH-001517,MC-001517-TVRG,KY,True 19 | HEC1_c907R2_KY-1,SC-001517.KY01,14,KY-1,B,2DS,ACH-001517,MC-001517-TVRG,KY,True 20 | HEC1_c907R3_KY-1,SC-001517.KY01,14,KY-1,C,2DS,ACH-001517,MC-001517-TVRG,KY,True 21 | KM12_c908R1_100_KY-1,SC-000969.KY01,14,KY-1,A,2DS,ACH-000969,MC-000969-8Uk9,KY,True 22 | KM12_c908R2_100_KY-1,SC-000969.KY01,14,KY-1,B,2DS,ACH-000969,MC-000969-8Uk9,KY,True 23 | KM12_c908R3_100_KY-1,SC-000969.KY01,14,KY-1,C,2DS,ACH-000969,MC-000969-8Uk9,KY,True 24 | KMS11_c907R1_KY-1,SC-000714.KY01,14,KY-1,A,2DS,ACH-000714,MC-000714-Ep4x,KY,True 25 | KMS11_c907R2_KY-1,SC-000714.KY01,14,KY-1,B,2DS,ACH-000714,MC-000714-Ep4x,KY,True 26 | KMS11_c907R3_KY-1,SC-000714.KY01,14,KY-1,C,2DS,ACH-000714,MC-000714-Ep4x,KY,True 27 | KYSE15_c907R1_KY-1,SC-000855.KY01,14,KY-1,A,2DS,ACH-000855,MC-000855-P9gt,KY,True 28 | KYSE15_c907R2_KY-1,SC-000855.KY01,14,KY-1,B,2DS,ACH-000855,MC-000855-P9gt,KY,True 29 | KYSE15_c907R3_KY-1,SC-000855.KY01,14,KY-1,C,2DS,ACH-000855,MC-000855-P9gt,KY,True 30 | KYSE70_c907R1_KY-1,SC-000784.KY01,14,KY-1,A,2DS,ACH-000784,MC-000784-ayo7,KY,True 31 | KYSE70_c907R2_KY-1,SC-000784.KY01,14,KY-1,B,2DS,ACH-000784,MC-000784-ayo7,KY,True 32 | KYSE70_c907R3_KY-1,SC-000784.KY01,14,KY-1,C,2DS,ACH-000784,MC-000784-ayo7,KY,True 33 | L363_c907R4_KY-1,SC-000183.KY01,14,KY-1,A,2DS,ACH-000183,MC-000183-QKEz,KY,True 34 | L363_c907R5_KY-1,SC-000183.KY01,14,KY-1,B,2DS,ACH-000183,MC-000183-QKEz,KY,True 35 | L363_c907R6_KY-1,SC-000183.KY01,14,KY-1,C,2DS,ACH-000183,MC-000183-QKEz,KY,True 36 | LB771H_c908R1_KY-1,SC-002265.KY01,14,KY-1,A,2DS,ACH-002265,MC-002265-Hq0I,KY,True 37 | LB771H_c908R2_KY-1,SC-002265.KY01,14,KY-1,B,2DS,ACH-002265,MC-002265-Hq0I,KY,True 38 | LB771H_c908R3_KY-1,SC-002265.KY01,14,KY-1,C,2DS,ACH-002265,MC-002265-Hq0I,KY,True 39 | LXF289_c906R1_KY-1,SC-000787.KY01,14,KY-1,A,2DS,ACH-000787,MC-000787-eQaU,KY,True 40 | LXF289_c906R2_KY-1,SC-000787.KY01,14,KY-1,B,2DS,ACH-000787,MC-000787-eQaU,KY,True 41 | 
LXF289_c906R3_KY-1,SC-000787.KY01,14,KY-1,C,2DS,ACH-000787,MC-000787-eQaU,KY,True 42 | MDST8_c903R3_KY-2,SC-000935.KY01,14,KY-2,A,2DS,ACH-000935,MC-000935-4xuf,KY,True 43 | MFE280_C908R1_KY-1,SC-000192.KY01,14,KY-1,A,2DS,ACH-000192,MC-000192-cA5R,KY,True 44 | MFE280_C908R2_KY-1,SC-000192.KY01,14,KY-1,B,2DS,ACH-000192,MC-000192-cA5R,KY,True 45 | MFE280_C908R3_KY-1,SC-000192.KY01,14,KY-1,C,2DS,ACH-000192,MC-000192-cA5R,KY,True 46 | OVISE_c905R2_KY-1,SC-000527.KY01,14,KY-1,A,2DS,ACH-000527,MC-000527-sDUu,KY,True 47 | OVISE_c905R3_KY-1,SC-000527.KY01,14,KY-1,B,2DS,ACH-000527,MC-000527-sDUu,KY,True 48 | RCCFG2_C908R1_KY-1,SC-002189.KY01,14,KY-1,A,2DS,ACH-002189,MC-002189-EopU,KY,True 49 | RCCFG2_C908R2_KY-1,SC-002189.KY01,14,KY-1,B,2DS,ACH-002189,MC-002189-EopU,KY,True 50 | RCCFG2_C908R3_KY-1,SC-002189.KY01,14,KY-1,C,2DS,ACH-002189,MC-002189-EopU,KY,True 51 | SAS_c907R1_KY-1,SC-002029.KY01,14,KY-1,A,2DS,ACH-002029,MC-002029-eTYU,KY,True 52 | SAS_c907R2_KY-1,SC-002029.KY01,14,KY-1,B,2DS,ACH-002029,MC-002029-eTYU,KY,True 53 | SAS_c907R3_KY-1,SC-002029.KY01,14,KY-1,C,2DS,ACH-002029,MC-002029-eTYU,KY,True 54 | SKNSH_c906R1_KY-1,SC-000149.KY01,14,KY-1,A,2DS,ACH-000149,MC-000149-T5h1,KY,True 55 | SKNSH_c906R2_KY-1,SC-000149.KY01,14,KY-1,B,2DS,ACH-000149,MC-000149-T5h1,KY,True 56 | SKNSH_c906R3_KY-1,SC-000149.KY01,14,KY-1,C,2DS,ACH-000149,MC-000149-T5h1,KY,True 57 | SNU81_c903R1_KY-2,SC-000991.KY01,14,KY-2,A,2DS,ACH-000991,MC-000991-vB9l,KY,True 58 | SNU81_c903R2_KY-2,SC-000991.KY01,14,KY-2,B,2DS,ACH-000991,MC-000991-vB9l,KY,True 59 | SNU81_c903R3_KY-2,SC-000991.KY01,14,KY-2,C,2DS,ACH-000991,MC-000991-vB9l,KY,True 60 | SW48_C902R1_P1D14_KY-2,SC-000958.KY01,14,KY-2,A,2DS,ACH-000958,MC-000958-mNiZ,KY,True 61 | SW48_C902R2_P1D14_KY-2,SC-000958.KY01,14,KY-2,B,2DS,ACH-000958,MC-000958-mNiZ,KY,True 62 | SW48_C902R3_P1D14_KY-2,SC-000958.KY01,14,KY-2,C,2DS,ACH-000958,MC-000958-mNiZ,KY,True 63 | T47d_c903R2_KY-2,SC-000147.KY01,14,KY-2,A,2DS,ACH-000147,MC-000147-L6rc,KY,True 64 | 
try:
    from patsy import dmatrix
except ModuleNotFoundError:
    raise ModuleNotFoundError("patsy required for copy_correction submodule. \
Try `pip install patsy`")
import numpy as np
import pandas as pd
import tensorflow as tf


def get_shifts(gene_effect, copy_number):
    '''
    Build a long-format table pairing each (cell line, gene) gene effect
    deviation from the gene's median with its copy number.
    Entries with missing gene effect are dropped; missing copy number is
    treated as neutral (1).
    '''
    ge = gene_effect.copy()
    # Center each gene on its median so "shift" measures deviation from
    # typical effect rather than overall essentiality.
    ge -= ge.median()
    cn = copy_number.loc[ge.index, ge.columns].fillna(1)[ge.notnull()].stack()
    return pd.DataFrame({
        "gene_effect_shift": ge.stack().values,
        "cn": cn.values,
        "cell_line_name": cn.index.get_level_values(0),
        "gene": cn.index.get_level_values(1)
    })

def logspace(low, high, n):
    '''Return `n` points from `low` to `high`, spaced uniformly in log(x - low + 1).'''
    start = 0
    end = np.log(high - low + 1)
    steps = np.linspace(start, end, n)
    converted = np.exp(steps) + low - 1
    return converted


def add_global_shift(cn, y, means, dtype, nknots_cn=10, nknots_ge=5, alpha=.2):
    '''
    Fit a weight-regularized tensor-product spline of the gene effect shift
    `y` on copy number `cn` and per-gene mean gene effect `means`.

    Parameters:
        cn (`pandas.Series`): copy number per observation
        y (`pandas.Series`): gene effect shifts per observation
        means: mean gene effect expanded to one value per observation
        dtype: tf.double or tf.float32
        nknots_cn, nknots_ge (`int`): spline knot counts per axis
        alpha (`float`): penalty strength on the per-observation weights
    Returns:
        (weights, fitted): numpy arrays aligned with `cn`/`y` holding the
        per-observation weights and the fitted copy number effect.
    '''
    # Also validates that `dtype` is one of the supported types (KeyError otherwise).
    np_dtype = {tf.double: np.double, tf.float32: np.float32}[dtype]
    knots_cn = list(cn.quantile(np.linspace(0, 1, nknots_cn)))
    # Nudge the extreme knots inward so boundary observations remain within
    # the basis support.
    knots_cn[0] += 1e-1
    knots_cn[-1] -= 1e-1

    knots_ge = list(logspace(np.quantile(means, 0.01), np.quantile(means, .99), nknots_ge))

    spline_gc = np.array(dmatrix(
        "te( \
        bs(cn, knots=%r, degree=3, include_intercept=False), \
        bs(means, knots=%r, degree=3, include_intercept=False) \
        )" % (knots_cn, knots_ge),
        {"cn": cn.values, 'means': means}, return_type='matrix'
    ))
    print('constructed spline matrix of shape %i, %i' % spline_gc.shape)
    _spline = tf.constant(spline_gc, dtype=dtype)
    _y = tf.constant(y.values, dtype=dtype)
    init = np.random.uniform(-.001, -.0001, size=(spline_gc.shape[1]))
    v_coeffs = tf.Variable(init.reshape((-1, 1)), dtype=dtype)
    v_weights = tf.Variable(1e-6 * np.ones(len(spline_gc)), dtype=dtype)
    # Weights are bounded in (0, 1] via exp(-|w|); the cost below shrinks
    # them toward 1 (i.e. v_weights toward 0).
    _weights = tf.exp(-tf.abs(v_weights))
    _weight_cost = tf.reduce_mean(input_tensor=tf.square(v_weights))

    _out = _weights * tf.matmul(_spline, v_coeffs)[:, 0]
    _cost = tf.reduce_mean(input_tensor=tf.square(_out - _y))
    optimizer = tf.compat.v1.train.AdamOptimizer(.005)
    _step = optimizer.minimize(_cost + alpha * _weight_cost, var_list=[v_coeffs, v_weights])

    # Context manager releases the session's graph resources even if an
    # iteration raises; the previous version never closed the session.
    with tf.compat.v1.Session() as sess:
        sess.run(tf.compat.v1.global_variables_initializer())

        for i in range(501):
            sess.run(_step)
            if not i % 100:
                print('\tcost:', sess.run(_cost))
        out = sess.run(_out)
        weights = sess.run(_weights)

    # Return the already-fetched result; previously _out was run a second
    # time after fetching it into `out`.
    return weights, out



def get_adjusted_matrix(shifts, gene_effect,):
    '''
    Pivot the long-format `shifts` table (with its "adjusted" column) back
    into a cell line by gene matrix, restoring each gene's mean effect.
    NOTE(review): assumes `shifts` rows are aligned with
    `gene_effect.stack()` order, as produced by `get_shifts` — confirm if
    called with externally-built tables.
    '''
    ge = gene_effect.stack()
    means = gene_effect.mean()

    adjusted = pd.Series(
        shifts['adjusted'].values + means.loc[shifts.gene].values,
        index=ge.index
    ).reset_index()

    adjusted = pd.pivot(adjusted, index=adjusted.columns[0], columns=adjusted.columns[1])[0]
    adjusted.index.name = "cell_line_name"
    adjusted.columns.name = "gene"
    return adjusted



def alternate_CN(gene_effect, copy_number, nknots_cn=10, nknots_ge=5, dtype=tf.double,
        max_lines=150):
    '''
    Removes biases due to copy number by fitting and subtracting a smooth
    copy number effect. Returns a corrected copy of the gene_effect matrix.
    Parameters:
        gene_effect (`pandas.DataFrame`): a cell-line by gene matrix of gene effect estimates
        copy_number (`pandas.DataFrame`): a cell-line by gene matrix of relative (floating point) copy number
        nknots_cn (`int`): number of spline knots along the copy number axis
        nknots_ge (`int`): number of spline knots along the mean gene effect axis
        dtype: tensorflow float type used for the fit (tf.double or tf.float32)
        max_lines (`int`): cell lines are fit in randomized groups of at most this size
    Returns:
        (corrected gene effect `pandas.DataFrame`, long-format `pandas.DataFrame` of per-observation shifts)
    '''

    if len(gene_effect) < 3:
        raise RuntimeError("Correct for CN should not be used with fewer than 3 cell lines. Consider preprocessing with CRISPRCleanR")
    missing_lines = sorted(set(gene_effect.index) - set(copy_number.index))
    if len(missing_lines) > 0:
        print("Warning: missing lines from gene_effect in copy_number, which won't be corrected.\nExamples: %r" % missing_lines[:5])
    missing_genes = sorted(set(gene_effect.columns) - set(copy_number.columns))
    if len(missing_genes) > 0:
        raise ValueError("Missing %i genes from gene_effect in copy_number.\nExamples: %r" % (
            len(missing_genes), missing_genes[:5]))

    # Shuffle so groups are not biased by the input's row ordering.
    # NOTE: uses the global numpy RNG.
    lines = list(gene_effect.index)
    np.random.shuffle(lines)
    ngroups = int(len(gene_effect) / max_lines) + 1
    groups = np.array_split(lines, ngroups)
    shift_list = []
    new_list = []
    for i, group in enumerate(groups):
        print("\nFitting cell line group %i of %i" % (i+1, len(groups)))

        print('finding low CN gene effect shifts')
        shifts = get_shifts(gene_effect.loc[group], copy_number.loc[group])

        print('smoothing and interpolating cutting toxicity for all genes')
        means = gene_effect.loc[group].mean().sort_values()
        means_expanded = means.loc[shifts.gene].values
        weights, cn_effect = add_global_shift(shifts.cn, shifts.gene_effect_shift, means_expanded, dtype, nknots_cn, nknots_ge)
        shifts['weights'] = weights
        shifts['cn_effect'] = cn_effect
        # The corrected shift is the observed shift minus the fitted CN effect.
        shifts['adjusted'] = shifts['gene_effect_shift'].values - cn_effect


        print("generating matrix")
        new = get_adjusted_matrix(shifts, gene_effect.loc[group])
        new_list.append(new)
        shift_list.append(shifts)
    shifts = pd.concat(shift_list, ignore_index=True)
    new = pd.concat(new_list)
    return new, shifts
-------------------------------------------------------------------------------- /Data/SampleData/OmicsSomaticMutations.csv: -------------------------------------------------------------------------------- 1 | ModelID,Gene,DNAChange,ProteinChange,LikelyGoF,Driver,LikelyDriver 2 | ACH-000784,GATA3 (2625),c.1183G>A,p.A395T,True,False,True 3 | ACH-000913,XPO1 (7514),c.1711G>A,p.E571K,True,False,True 4 | ACH-000913,SF3B1 (23451),c.1998G>T,p.K666N,True,False,True 5 | ACH-000913,PIK3CA (5290),c.1633G>A,p.E545K,True,True,True 6 | ACH-000479,EGFR (1956),c.1658G>T,p.G553V,True,False,True 7 | ACH-000955,CTNNB1 (1499),c.121A>G,p.T41A,True,True,True 8 | ACH-000955,PIK3CA (5290),c.3140A>G,p.H1047R,True,True,True 9 | ACH-000955,KRAS (3845),c.35G>A,p.G12D,True,True,True 10 | ACH-000955,ERBB3 (2065),c.310G>A,p.V104M,True,True,True 11 | ACH-000955,ERBB3 (2065),c.785C>A,p.P262H,True,False,True 12 | ACH-000958,CTNNB1 (1499),c.98C>A,p.S33Y,True,False,True 13 | ACH-000958,EGFR (1956),c.2020G>A,p.G674S,True,True,True 14 | ACH-000958,MAP2K1 (5604),c.167A>C,p.Q56P,True,False,True 15 | ACH-001533,MYD88 (4615),c.439T>C,p.*147R,True,True,True 16 | ACH-000750,BRAF (673),c.1799T>A,p.V600E,True,True,True 17 | ACH-000750,KSR2 (283455),c.2175C>T,p.F725F,True,False,True 18 | ACH-000912,DDR2 (4921),c.1912A>T,p.I638F,True,False,True 19 | ACH-001636,PGR (5241),c.2219G>A,p.R740Q,True,False,True 20 | ACH-001129,U2AF1 (7307),c.470A>C,p.Q157P,True,True,True 21 | ACH-002926,GNA11 (2767),c.626A>T,p.Q209L,True,False,True 22 | ACH-000988,FGFR2 (2263),c.71C>G,p.S24W,True,False,True 23 | ACH-000788,TP63 (8626),c.1135C>T,p.R379C,True,False,True 24 | ACH-000788,BRAF (673),c.1799T>A,p.V600E,True,True,True 25 | ACH-000788,MAP2K1 (5604),c.370C>T,p.P124S,True,False,True 26 | ACH-000969,GNAS (2778),c.602G>A,p.R201H,True,False,True 27 | ACH-000714,FGFR3 (2261),c.1118A>G,p.Y373C,True,True,True 28 | ACH-000714,FGFR3 (2261),c.1118A>G,p.Y373C,True,False,True 29 | ACH-000192,PIK3CA (5290),c.3139C>T,p.H1047Y,True,False,True 
30 | ACH-000192,MLLT10 (8028),c.2111G>T,p.R704L,True,False,True 31 | ACH-000192,FGFR2 (2263),c.71C>G,p.S24W,True,False,True 32 | ACH-000657,PIK3CA (5290),c.1093G>A,p.E365K,True,False,True 33 | ACH-000657,RRAS2 (22800),c.215A>T,p.Q72L,True,False,True 34 | ACH-000657,MED12 (9968),c.67G>T,p.D23Y,True,False,True 35 | ACH-000614,BRAF (673),c.1799T>A,p.V600E,True,True,True 36 | ACH-000147,PIK3CA (5290),c.3140A>G,p.H1047R,True,True,True 37 | ACH-000183,NRAS (4893),c.183A>C,p.Q61H,True,False,True 38 | ACH-000183,PIK3CA (5290),c.1633G>A,p.E545K,True,True,True 39 | ACH-000787,SOS1 (6654),c.697A>T,p.N233Y,True,False,True 40 | ACH-000787,CTNNB1 (1499),c.121A>G,p.T41A,True,True,True 41 | ACH-001736,FLT3 (2322),c.2503G>C,p.D835H,True,True,True 42 | ACH-001843,CTNNB1 (1499),c.97T>C,p.S33P,True,False,True 43 | ACH-001843,KRAS (3845),c.35G>A,p.G12D,True,True,True 44 | ACH-001843,GNAS (2778),c.602G>A,p.R201H,True,False,True 45 | ACH-000396,FGFR3 (2261),c.1951A>G,p.K651E,True,False,True 46 | ACH-000396,ERBB2 (2064),c.2033G>A,p.R678Q,True,False,True 47 | ACH-000458,RHOA (387),c.118G>C,p.E40Q,True,False,True 48 | ACH-000458,HRAS (3265),c.182A>T,p.Q61L,True,False,True 49 | ACH-000004,JAK2 (3717),c.1849G>T,p.V617F,True,True,True 50 | ACH-002189,PIK3CB (5291),c.3200A>T,p.D1067V,True,False,True 51 | ACH-000935,KDR (3791),c.3095G>A,p.R1032Q,True,False,True 52 | ACH-000935,BRAF (673),c.1798_1799GT>AA,p.V600K,True,True,True 53 | ACH-000263,KIT (3815),c.2466T>A,p.N822K,True,True,True 54 | ACH-001411,PIK3CA (5290),c.1633G>A,p.E545K,True,True,True 55 | ACH-001842,SF3B1 (23451),c.2098A>G,p.K700E,True,False,True 56 | ACH-001842,AKT1 (207),c.49G>A,p.E17K,True,True,True 57 | ACH-000841,NRAS (4893),c.181C>A,p.Q61K,True,True,True 58 | ACH-000841,BRAF (673),c.1789C>G,p.L597V,True,False,True 59 | ACH-000149,ALK (238),c.3522C>A,p.F1174L,True,True,True 60 | ACH-000993,MTOR (2475),c.6644C>A,p.S2215Y,True,False,True 61 | ACH-000993,XPO1 (7514),c.2246G>A,p.R749Q,True,False,True 62 | ACH-000993,NT5C2 
(22978),c.1168G>A,p.E390K,True,False,True 63 | ACH-000168,KRAS (3845),c.38G>A,p.G13D,True,True,True 64 | ACH-001303,DDR2 (4921),c.187C>G,p.L63V,True,False,True 65 | ACH-001303,ALK (238),c.3824G>A,p.R1275Q,True,True,True 66 | ACH-002029,ERBB4 (2066),c.3325G>T,p.G1109C,True,False,True 67 | ACH-001081,MTOR (2475),c.7500T>G,p.I2500M,True,False,True 68 | ACH-001081,XPO1 (7514),c.1711G>A,p.E571K,True,False,True 69 | ACH-001081,RAF1 (5894),c.770C>T,p.S257L,True,False,True 70 | ACH-001081,KRAS (3845),c.436G>A,p.A146T,True,True,True 71 | ACH-000996,PIK3CA (5290),c.3129G>T,p.M1043I,True,False,True 72 | ACH-000996,ARHGAP35 (2909),c.2989C>T,p.R997*,True,False,True 73 | ACH-000946,NFE2L2 (4780),c.53G>C,p.R18P,True,False,True 74 | ACH-000946,CTNNB1 (1499),c.95A>T,p.D32V,True,False,True 75 | ACH-000946,CCND1 (595),c.859C>A,p.P287T,True,False,True 76 | ACH-000937,NRAS (4893),c.34G>A,p.G12S,True,False,True 77 | ACH-000937,NOTCH1 (4851),c.4721T>C,p.L1574P,True,False,True 78 | ACH-001563,NRAS (4893),c.37G>C,p.G13R,True,False,True 79 | ACH-001563,IDH1 (3417),c.394C>T,p.R132C,True,True,True 80 | ACH-000774,KRAS (3845),c.35G>T,p.G12V,True,True,True 81 | ACH-001554,GNAQ (2776),c.626A>T,p.Q209L,True,False,True 82 | ACH-002265,NFE2L2 (4780),c.52C>G,p.R18G,True,False,True 83 | ACH-001674,ALK (238),c.3824G>A,p.R1275Q,True,True,True 84 | ACH-001674,RAC1 (5879),c.85C>T,p.P29S,True,False,True 85 | ACH-000406,JAK3 (3718),c.1533G>A,p.M511I,True,False,True 86 | ACH-001517,PIK3CA (5290),c.3145G>C,p.G1049R,True,False,True 87 | ACH-001517,KRAS (3845),c.35G>A,p.G12D,True,True,True 88 | ACH-001517,ERBB2 (2064),c.2393C>T,p.T798I,True,False,True 89 | ACH-001517,ERBB2 (2064),c.2524G>A,p.V842I,True,True,True 90 | ACH-000434,ABL1 (25),c.1051C>T,p.R351W,True,True,True 91 | ACH-000434,HRAS (3265),c.182A>T,p.Q61L,True,False,True 92 | ACH-000855,RAC1 (5879),c.85C>T,p.P29S,True,False,True 93 | ACH-000855,ERBB3 (2065),c.889G>T,p.D297Y,True,False,True 94 | ACH-000404,BRAF (673),c.1799T>A,p.V600E,True,True,True 95 
| ACH-000991,XPO1 (7514),c.2246G>A,p.R749Q,True,False,True 96 | ACH-000991,XPO1 (7514),c.1711G>A,p.E571K,True,False,True 97 | ACH-000991,BCL6 (604),c.1613G>A,p.R538Q,True,False,True 98 | ACH-000991,KRAS (3845),c.436G>A,p.A146T,True,True,True 99 | ACH-000527,PIK3CA (5290),c.1258T>C,p.C420R,True,True,True 100 | ACH-000605,GATA3 (2625),c.1183G>A,p.A395T,True,False,True 101 | -------------------------------------------------------------------------------- /Data/SampleData/AchillesCommonEssentialControls.csv: -------------------------------------------------------------------------------- 1 | Gene 2 | AAMP (14) 3 | ABCE1 (6059) 4 | ACTL6A (86) 5 | ACTR8 (93973) 6 | AHCY (191) 7 | ALG1 (56052) 8 | ALG2 (85365) 9 | ANAPC10 (10393) 10 | ANAPC5 (51433) 11 | AQR (9716) 12 | ARMC7 (79637) 13 | ARPC4 (10093) 14 | ATL2 (64225) 15 | ATP6AP2 (10159) 16 | ATP6V1A (523) 17 | ATR (545) 18 | BARD1 (580) 19 | BCAS2 (10286) 20 | BDP1 (55814) 21 | BMS1 (9790) 22 | BPTF (2186) 23 | BRD8 (10902) 24 | BUB1B (701) 25 | BYSL (705) 26 | CAD (790) 27 | CCNA2 (890) 28 | CCT2 (10576) 29 | CCT6A (908) 30 | CDC20 (991) 31 | CDC45 (8318) 32 | CDC73 (79577) 33 | CDK1 (983) 34 | CEBPZ (10153) 35 | CENPJ (55835) 36 | CENPW (387103) 37 | CEP57 (9702) 38 | CHAF1A (10036) 39 | CHMP2A (27243) 40 | CHORDC1 (26973) 41 | CLASRP (11129) 42 | CMPK1 (51727) 43 | COA5 (493753) 44 | COG4 (25839) 45 | COPB2 (9276) 46 | COPS3 (8533) 47 | COQ4 (51117) 48 | COX5B (1329) 49 | CPSF6 (11052) 50 | CSE1L (1434) 51 | CSTF3 (1479) 52 | CTNNBL1 (56259) 53 | CTU2 (348180) 54 | CWF19L2 (143884) 55 | DARS2 (55157) 56 | DCTN2 (10540) 57 | DDB1 (1642) 58 | DDX21 (9188) 59 | DDX39B (7919) 60 | DDX49 (54555) 61 | DDX56 (54606) 62 | DGCR8 (54487) 63 | DHX33 (56919) 64 | DHX8 (1659) 65 | DIS3 (22894) 66 | DMAP1 (55929) 67 | DNAJC2 (27000) 68 | DNMT1 (1786) 69 | DR1 (1810) 70 | DYNC1H1 (1778) 71 | ECD (11319) 72 | EEFSEC (60678) 73 | EIF1AD (84285) 74 | EIF2B4 (8890) 75 | EIF3A (8661) 76 | EIF3J (8669) 77 | EIF4E (1977) 78 | EIF6 
(3692) 79 | ELP4 (26610) 80 | ENO1 (2023) 81 | ERCC4 (2072) 82 | EXOC1 (55763) 83 | EXOSC1 (51013) 84 | EXOSC5 (56915) 85 | FAM210A (125228) 86 | FARSB (10056) 87 | FEN1 (2237) 88 | FNTB (2342) 89 | GAPDH (2597) 90 | GATC (283459) 91 | GEMIN8 (54960) 92 | GFM1 (85476) 93 | GINS3 (64785) 94 | GNB1L (54584) 95 | GON4L (54856) 96 | GPN1 (11321) 97 | GRPEL1 (80273) 98 | GTF2A2 (2958) 99 | GTF3C1 (2975) 100 | GTPBP4 (23560) 101 | HAUS1 (115106) 102 | HAUS7 (55559) 103 | HEATR1 (55127) 104 | HMGCS1 (3157) 105 | HNRNPH1 (3187) 106 | HNRNPL (3191) 107 | HSD17B10 (3028) 108 | HSPA9 (3313) 109 | IARS1 (3376) 110 | IGBP1 (3476) 111 | IMP4 (92856) 112 | INTS3 (65123) 113 | IPO11 (51194) 114 | ISG20L2 (81875) 115 | KANSL2 (54934) 116 | KCMF1 (56888) 117 | KIN (22944) 118 | LAGE3 (8270) 119 | LAS1L (81887) 120 | LIAS (11019) 121 | LRPPRC (10128) 122 | LSM4 (25804) 123 | LYRM4 (57128) 124 | MASTL (84930) 125 | MCM10 (55388) 126 | MCM6 (4175) 127 | MDC1 (9656) 128 | MED12 (9968) 129 | MED14 (9282) 130 | MED20 (9477) 131 | MED31 (51003) 132 | MED8 (112950) 133 | METTL1 (4234) 134 | MFAP1 (4236) 135 | MIS18BP1 (55320) 136 | MOB4 (25843) 137 | MRPL16 (54948) 138 | MRPL20 (55052) 139 | MRPL27 (51264) 140 | MRPL37 (51253) 141 | MRPL40 (64976) 142 | MRPL48 (51642) 143 | MRPL9 (65005) 144 | MRPS15 (64960) 145 | MRPS23 (51649) 146 | MRPS30 (10884) 147 | MRPS5 (64969) 148 | MRTO4 (51154) 149 | MTOR (2475) 150 | MTPAP (55149) 151 | N6AMT1 (29104) 152 | NAA25 (80018) 153 | NAF1 (92345) 154 | NARS2 (79731) 155 | NCAPG (64151) 156 | NCL (4691) 157 | NEDD1 (121441) 158 | NFS1 (9054) 159 | NIP7 (51388) 160 | NOB1 (28987) 161 | NOL11 (25926) 162 | NOL9 (79707) 163 | NOP10 (55505) 164 | NOP56 (10528) 165 | NPAT (4863) 166 | NSA2 (10412) 167 | NSMCE4A (54780) 168 | NUDC (10726) 169 | NUFIP1 (26747) 170 | NUP155 (9631) 171 | NUP43 (348995) 172 | NUP93 (9688) 173 | OGT (8473) 174 | ORC2 (4999) 175 | ORC6 (23594) 176 | PABPN1 (8106) 177 | PALB2 (79728) 178 | PCF11 (51585) 179 | PCYT1A (5130) 180 | 
PDCL (5082) 181 | PES1 (23481) 182 | PFDN2 (5202) 183 | PGK1 (5230) 184 | PIK3C3 (5289) 185 | PLRG1 (5356) 186 | PMVK (10654) 187 | PNPT1 (87178) 188 | POLD2 (5425) 189 | POLR1A (25885) 190 | POLR1F (221830) 191 | POLR2D (5433) 192 | POLR2L (5441) 193 | POLR3E (55718) 194 | POP4 (10775) 195 | PPIE (10450) 196 | PPP1R15B (84919) 197 | PPP2CA (5515) 198 | PPRC1 (23082) 199 | PREB (10113) 200 | PRMT5 (10419) 201 | PRPF3 (9129) 202 | PRPF4 (9128) 203 | PRPF8 (10594) 204 | PSMA5 (5686) 205 | PSMB4 (5692) 206 | PSMC1 (5700) 207 | PSMC6 (5706) 208 | PSMD13 (5719) 209 | PSMD4 (5710) 210 | PSMG1 (8624) 211 | PTCD3 (55037) 212 | QARS1 (5859) 213 | RAC1 (5879) 214 | RACGAP1 (29127) 215 | RAD51C (5889) 216 | RANGAP1 (5905) 217 | RBBP5 (5929) 218 | RBM14 (10432) 219 | RBM28 (55131) 220 | RBM48 (84060) 221 | RCC1 (1104) 222 | RFC2 (5982) 223 | RFT1 (91869) 224 | RHOA (387) 225 | RIOK1 (83732) 226 | RNF168 (165918) 227 | RNGTT (8732) 228 | RPA1 (6117) 229 | RPF1 (80135) 230 | RPL11 (6135) 231 | RPL18 (6141) 232 | RPL23A (6147) 233 | RPL3 (6122) 234 | RPL7L1 (285855) 235 | RPN1 (6184) 236 | RPP38 (10557) 237 | RPS11 (6205) 238 | RPS15A (6210) 239 | RPS19BP1 (91582) 240 | RPS21 (6227) 241 | RPS4X (6191) 242 | RPTOR (57521) 243 | RRP1 (8568) 244 | RRP7A (27341) 245 | RTEL1 (51750) 246 | RUVBL2 (10856) 247 | SAMM50 (25813) 248 | SARS2 (54938) 249 | SCFD1 (23256) 250 | SDHA (6389) 251 | SEC61A1 (29927) 252 | SEH1L (81929) 253 | SF3A1 (10291) 254 | SF3B1 (23451) 255 | SF3B3 (23450) 256 | SHQ1 (55164) 257 | SLC25A26 (115286) 258 | SLU7 (10569) 259 | SMC2 (10592) 260 | SMC6 (79677) 261 | SMNDC1 (10285) 262 | SNIP1 (79753) 263 | SNRNP40 (9410) 264 | SNRPC (6631) 265 | SNRPF (6636) 266 | SON (6651) 267 | SPC25 (57405) 268 | SRBD1 (55133) 269 | SRP19 (6728) 270 | SRPRB (58477) 271 | SRSF11 (9295) 272 | SSU72 (29101) 273 | STX5 (6811) 274 | SUPT16H (11198) 275 | SUPV3L1 (6832) 276 | TACC3 (10460) 277 | TAF1B (9014) 278 | TAF6L (10629) 279 | TBCA (6902) 280 | TCP1 (6950) 281 | TFAM (7019) 282 
| THOC7 (80145) 283 | TIMM22 (29928) 284 | TIPRL (261726) 285 | TMED2 (10959) 286 | TNPO3 (23534) 287 | TOP2A (7153) 288 | TPR (7175) 289 | TRAPPC1 (58485) 290 | TRAPPC8 (22878) 291 | TRMT61A (115708) 292 | TSEN2 (80746) 293 | TSR2 (90121) 294 | TTF2 (8458) 295 | TUBB (203068) 296 | TUBGCP3 (10426) 297 | TUT1 (64852) 298 | U2AF1 (7307) 299 | U2AF2 (11338) 300 | UBA3 (9039) 301 | UBL5 (59286) 302 | UMPS (7372) 303 | UQCRFS1 (7386) 304 | URI1 (8725) 305 | USP36 (57602) 306 | USPL1 (10208) 307 | UTP20 (27340) 308 | VARS1 (7407) 309 | VPS25 (84313) 310 | VPS72 (6944) 311 | WDHD1 (11169) 312 | WDR3 (10885) 313 | WDR43 (23160) 314 | WDR70 (55100) 315 | WDR82 (80335) 316 | XPO1 (7514) 317 | XRCC6 (2547) 318 | YARS2 (51067) 319 | YKT6 (10652) 320 | ZBTB11 (27107) 321 | ZC3H8 (84524) 322 | ZMAT2 (153527) 323 | ZNF207 (7756) 324 | ZNF622 (90441) 325 | -------------------------------------------------------------------------------- /Data/SampleData/AchillesNonessentialControls.csv: -------------------------------------------------------------------------------- 1 | Gene 2 | ABCG8 (64241) 3 | ACTL7A (10881) 4 | ACTL9 (284382) 5 | ADAM18 (8749) 6 | ADAM20 (8748) 7 | ADGRG7 (84873) 8 | AFM (173) 9 | AIPL1 (23746) 10 | ALPI (248) 11 | ANKRD30A (91074) 12 | APOA4 (337) 13 | APOF (319) 14 | ARGFX (503582) 15 | ASB17 (127247) 16 | ASZ1 (136991) 17 | ATP4B (496) 18 | B3GNT6 (192134) 19 | BARHL1 (56751) 20 | BMP10 (27302) 21 | BPIFA3 (128861) 22 | BPIFB6 (128859) 23 | BRDT (676) 24 | C10orf53 (282966) 25 | C12orf40 (283461) 26 | C17orf78 (284099) 27 | C20orf173 (140873) 28 | C8A (731) 29 | CABP5 (56344) 30 | CACNG2 (10369) 31 | CACNG5 (27091) 32 | CBLIF (2694) 33 | CCDC83 (220047) 34 | CCL1 (6346) 35 | CD200R1L (344807) 36 | CDX2 (1045) 37 | CELA2A (63036) 38 | CELA3B (23436) 39 | CETN1 (1068) 40 | CFHR5 (81494) 41 | CHRNA6 (8973) 42 | CLCA1 (1179) 43 | CLEC2A (387836) 44 | CLEC6A (93978) 45 | CNBD1 (168975) 46 | CNPY1 (285888) 47 | COL20A1 (57642) 48 | CRNN (49860) 49 | CRYGB (1419) 
50 | CSHL1 (1444) 51 | CSN3 (1448) 52 | CST4 (1472) 53 | CST8 (10047) 54 | CST9L (128821) 55 | CTCFL (140690) 56 | CYLC2 (1539) 57 | CYP11B2 (1585) 58 | CYP2A13 (1553) 59 | CYP4A22 (284541) 60 | DAZL (1618) 61 | DCANP1 (140947) 62 | DDX4 (54514) 63 | DEFA6 (1671) 64 | DEFB104A (140596) 65 | DEFB118 (117285) 66 | DEFB126 (81623) 67 | DEFB129 (140881) 68 | DMP1 (1758) 69 | DMRTB1 (63948) 70 | DPRX (503834) 71 | DRGX (644168) 72 | DSG4 (147409) 73 | DTX2 (113878) 74 | EFCAB3 (146779) 75 | ELOA2 (51224) 76 | EVX1 (2128) 77 | FABP2 (2169) 78 | FCRL4 (83417) 79 | FEZF2 (55079) 80 | FGF3 (2248) 81 | FGF6 (2251) 82 | FLG2 (388698) 83 | FNDC9 (408263) 84 | FOXB2 (442425) 85 | FOXE3 (2301) 86 | FOXR1 (283150) 87 | FSCB (84075) 88 | FUT9 (10690) 89 | GABRA1 (2554) 90 | GALNTL5 (168391) 91 | GALR3 (8484) 92 | GCG (2641) 93 | GDF2 (2658) 94 | GFRAL (389400) 95 | GHRH (2691) 96 | GJA10 (84694) 97 | GK2 (2712) 98 | GLRA1 (2741) 99 | GML (2765) 100 | GPR139 (124274) 101 | GPR151 (134391) 102 | GPR26 (2849) 103 | GPR32 (2854) 104 | GPR52 (9293) 105 | GPRC6A (222545) 106 | GPX6 (257202) 107 | GRM5 (2915) 108 | GSC2 (2928) 109 | GSX1 (219409) 110 | GUCA2A (2980) 111 | H2AC1 (221613) 112 | H4C7 (8369) 113 | HCRTR2 (3062) 114 | HMX1 (3166) 115 | HOXD12 (3238) 116 | HRH3 (11255) 117 | HTR1A (3350) 118 | HTR3D (200909) 119 | HTR5A (3361) 120 | IAPP (3375) 121 | IFNA10 (3446) 122 | IFNA16 (3449) 123 | IFNA2 (3440) 124 | IFNA4 (3441) 125 | IFNA6 (3443) 126 | IFNA8 (3445) 127 | IFNK (56832) 128 | IL12B (3593) 129 | IL17A (3605) 130 | IL1F10 (84639) 131 | IL22 (50616) 132 | IL26 (55801) 133 | IL31 (386653) 134 | IL36B (27177) 135 | IL9 (3578) 136 | INSL5 (10022) 137 | INSM2 (84684) 138 | IQCF1 (132141) 139 | ISX (91464) 140 | KASH5 (147872) 141 | KCNB2 (9312) 142 | KCNK10 (54207) 143 | KCNK18 (338567) 144 | KHDC3L (154288) 145 | KIR2DL1 (3802) 146 | KLK12 (43849) 147 | KRT2 (3849) 148 | KRT26 (353288) 149 | KRT33A (3883) 150 | KRT36 (8689) 151 | KRT38 (8687) 152 | KRT71 (112802) 153 | KRT74 
(121391) 154 | KRT76 (51350) 155 | KRT78 (196374) 156 | KRT84 (3890) 157 | KRT86 (3892) 158 | KRTAP1-1 (81851) 159 | KRTAP10-10 (353333) 160 | KRTAP10-12 (386685) 161 | KRTAP10-4 (386672) 162 | KRTAP10-6 (386674) 163 | KRTAP10-9 (386676) 164 | KRTAP13-1 (140258) 165 | KRTAP13-3 (337960) 166 | KRTAP15-1 (254950) 167 | KRTAP19-3 (337970) 168 | KRTAP26-1 (388818) 169 | KRTAP4-11 (653240) 170 | KRTAP4-2 (85291) 171 | KRTAP4-7 (100132476) 172 | KRTAP9-2 (83899) 173 | KRTAP9-4 (85280) 174 | LBX1 (10660) 175 | LCT (3938) 176 | LGALS14 (56891) 177 | LHX3 (8022) 178 | LIM2 (3982) 179 | LORICRIN (4014) 180 | LRIT2 (340745) 181 | LYZL1 (84569) 182 | MAS1 (4142) 183 | MBD3L1 (85509) 184 | MBL2 (4153) 185 | MC3R (4159) 186 | MEP1A (4224) 187 | MEPE (56955) 188 | MMD2 (221938) 189 | MMP21 (118856) 190 | MMP27 (64066) 191 | MORC1 (27136) 192 | MRGPRX1 (259249) 193 | MRGPRX4 (117196) 194 | MS4A13 (503497) 195 | MSGN1 (343930) 196 | MTNR1B (4544) 197 | MUC7 (4589) 198 | MYBPC3 (4607) 199 | NANOGNB (360030) 200 | NCR2 (9436) 201 | NEUROD2 (4761) 202 | NEUROD6 (63974) 203 | NKX2-1 (7080) 204 | NLRP4 (147945) 205 | NLRP8 (126205) 206 | NMS (129521) 207 | NOX3 (50508) 208 | NPHS2 (7827) 209 | NPVF (64111) 210 | OC90 (729330) 211 | OLIG3 (167826) 212 | OPN5 (221391) 213 | OR10A4 (283297) 214 | OR10H1 (26539) 215 | OR10H3 (26532) 216 | OR10R2 (343406) 217 | OR11A1 (26531) 218 | OR12D3 (81797) 219 | OR13D1 (286365) 220 | OR1A1 (8383) 221 | OR1E1 (8387) 222 | OR1G1 (8390) 223 | OR1N2 (138882) 224 | OR2AT4 (341152) 225 | OR2C3 (81472) 226 | OR2D3 (120775) 227 | OR2G2 (81470) 228 | OR2H1 (26716) 229 | OR2L3 (391192) 230 | OR2T10 (127069) 231 | OR2T2 (401992) 232 | OR2T33 (391195) 233 | OR2T5 (401993) 234 | OR3A1 (4994) 235 | OR3A3 (8392) 236 | OR4C3 (256144) 237 | OR4D10 (390197) 238 | OR4D9 (390199) 239 | OR51B6 (390058) 240 | OR51F2 (119694) 241 | OR51V1 (283111) 242 | OR52A5 (390054) 243 | OR52B6 (340980) 244 | OR52L1 (338751) 245 | OR56A1 (120796) 246 | OR56B1 (387748) 247 | OR5C1 
(392391) 248 | OR5M1 (390168) 249 | OR5P2 (120065) 250 | OR5T1 (390155) 251 | OR5T3 (390154) 252 | OR5W2 (390148) 253 | OR6V1 (346517) 254 | OR7C2 (26658) 255 | OR7G2 (390882) 256 | OR8B8 (26493) 257 | OR8U1 (219417) 258 | OR9Q2 (219957) 259 | OTOP3 (347741) 260 | OTP (23440) 261 | PANX3 (116337) 262 | PAX4 (5078) 263 | PCARE (388939) 264 | PDE6H (5149) 265 | PDX1 (3651) 266 | PGK2 (5232) 267 | PGLYRP3 (114771) 268 | PIWIL3 (440822) 269 | PLA2G2F (64600) 270 | PNLIP (5406) 271 | PNPLA5 (150379) 272 | POTED (317754) 273 | POTEH (23784) 274 | POU4F3 (5459) 275 | PPP3R2 (5535) 276 | PRAMEF2 (65122) 277 | PRAMEF7 (441871) 278 | PRB4 (5545) 279 | PRDM14 (63978) 280 | PRDM9 (56979) 281 | PRLH (51052) 282 | PROP1 (5626) 283 | PRSS37 (136242) 284 | PRSS55 (203074) 285 | PSKH2 (85481) 286 | RAX (30062) 287 | RBM46 (166863) 288 | RBP3 (5949) 289 | RD3 (343035) 290 | REG3A (5068) 291 | RETNLB (84666) 292 | RFPL4B (442247) 293 | RHO (6010) 294 | RNASE11 (122651) 295 | RNASE13 (440163) 296 | RNASE9 (390443) 297 | RNF113B (140432) 298 | RP1 (6101) 299 | RPE65 (6121) 300 | RTP1 (132112) 301 | RXFP2 (122042) 302 | S100A7A (338324) 303 | SCGB1D1 (10648) 304 | SCRT2 (85508) 305 | SEC14L3 (266629) 306 | SEPTIN14 (346288) 307 | SERPINA9 (327657) 308 | SHCBP1L (81626) 309 | SIGLECL1 (284369) 310 | SLC10A2 (6555) 311 | SLC17A2 (10246) 312 | SLC18A3 (6572) 313 | SLC22A13 (9390) 314 | SLC22A6 (9356) 315 | SLC22A9 (114571) 316 | SLC25A31 (83447) 317 | SLC2A7 (155184) 318 | SLC34A1 (6569) 319 | SLC39A12 (221074) 320 | SLC6A5 (9152) 321 | SLC7A13 (157724) 322 | SLCO6A1 (133482) 323 | SOHLH1 (402381) 324 | SOX14 (8403) 325 | SPACA1 (81833) 326 | SPATA16 (83893) 327 | SPEM1 (374768) 328 | SPINK14 (408187) 329 | SPPL2C (162540) 330 | SSTR4 (6754) 331 | STPG4 (285051) 332 | SUN5 (140732) 333 | TAAR2 (9287) 334 | TAAR6 (319100) 335 | TAS1R2 (80834) 336 | TAS2R13 (50838) 337 | TAS2R39 (259285) 338 | TAS2R41 (259287) 339 | TAS2R43 (259289) 340 | TAS2R50 (259296) 341 | TAS2R7 (50837) 342 | TAS2R9 
(50835) 343 | TBC1D21 (161514) 344 | TBR1 (10716) 345 | TBXT (6862) 346 | TEX101 (83639) 347 | TEX45 (374877) 348 | TFAP2D (83741) 349 | TKTL2 (84076) 350 | TMEM132D (121256) 351 | TMEM174 (134288) 352 | TMEM225 (338661) 353 | TMPRSS11A (339967) 354 | TMPRSS11F (389208) 355 | TMPRSS15 (5651) 356 | TPD52L3 (89882) 357 | TPRX1 (284355) 358 | TREML4 (285852) 359 | TRIM40 (135644) 360 | TRIM43 (129868) 361 | TRIM60 (166655) 362 | TRIML1 (339976) 363 | TRPM1 (4308) 364 | TSBP1 (10665) 365 | TSHB (7252) 366 | TSPO2 (222642) 367 | TSSK1B (83942) 368 | TXNDC8 (255220) 369 | UBQLN3 (50613) 370 | UROC1 (131669) 371 | USP29 (57663) 372 | VAX1 (11023) 373 | VN1R4 (317703) 374 | VRTN (55237) 375 | WFDC10A (140832) 376 | WFDC9 (259240) 377 | ZG16 (653808) 378 | ZNF648 (127665) 379 | ZNF804B (219578) 380 | ZP2 (7783) 381 | ZSWIM2 (151112) 382 | -------------------------------------------------------------------------------- /Data/SampleData/AvanaSequenceMap.csv: -------------------------------------------------------------------------------- 1 | sequence_ID,ScreenID,days,pDNA_batch,Replicate,ScreenType,cell_line_name,ModelConditionID,Library,PassesQC 2 | HEL-311Cas9_RepA_p4_Avana-3,SC-000004.AV01,21,Avana-3,A,2DS,ACH-000004,MC-000004-pA3k,Avana,True 3 | HEL-311Cas9_RepB_p4_Avana-3,SC-000004.AV01,21,Avana-3,B,2DS,ACH-000004,MC-000004-pA3k,Avana,True 4 | KU812-311cas9-RepA-p6_Avana-3,SC-000074.AV01,21,Avana-3,A,2DS,ACH-000074,MC-000074-OKtM,Avana,True 5 | KU812-311cas9-RepB-p6_Avana-3,SC-000074.AV01,21,Avana-3,B,2DS,ACH-000074,MC-000074-OKtM,Avana,True 6 | T47D-311Cas9-RepA-p6_Avana-4,SC-000147.AV01,21,Avana-4,A,2DS,ACH-000147,MC-000147-Uovr,Avana,True 7 | T47D-311Cas9-RepB-p6_Avana-4,SC-000147.AV01,21,Avana-4,B,2DS,ACH-000147,MC-000147-Uovr,Avana,True 8 | NOMO-1-311Cas9_RepA_p4_Avana-2,SC-000168.AV01,21,Avana-2,A,2DS,ACH-000168,MC-000168-L3Ll,Avana,True 9 | NOMO-1-311Cas9_RepB_p4_Avana-2,SC-000168.AV01,21,Avana-2,B,2DS,ACH-000168,MC-000168-L3Ll,Avana,True 10 | 
L363-311Cas9_RepA_p6_Avana-3,SC-000183.AV01,21,Avana-3,A,2DS,ACH-000183,MC-000183-k64I,Avana,True 11 | L363-311Cas9_RepB_p6_Avana-3,SC-000183.AV01,21,Avana-3,B,2DS,ACH-000183,MC-000183-k64I,Avana,True 12 | KASUMI-1-311cas9_RepA_p6_Avana-3,SC-000263.AV01,21,Avana-3,A,2DS,ACH-000263,MC-000263-MxQI,Avana,True 13 | KASUMI-1-311cas9_RepB_p6_Avana-3,SC-000263.AV01,21,Avana-3,B,2DS,ACH-000263,MC-000263-MxQI,Avana,True 14 | NCI-H841-311as9_RepA_p6_Avana-3,SC-000292.AV01,21,Avana-3,A,2DS,ACH-000292,MC-000292-Oy94,Avana,True 15 | NCI-H841-311as9_RepB_p6_Avana-3,SC-000292.AV01,21,Avana-3,B,2DS,ACH-000292,MC-000292-Oy94,Avana,True 16 | DB-311Cas9_RepA_p3_Avana-3,SC-000334.AV01,21,Avana-3,A,2DS,ACH-000334,MC-000334-r0NH,Avana,True 17 | DB-311Cas9_RepB_p3_Avana-3,SC-000334.AV01,21,Avana-3,B,2DS,ACH-000334,MC-000334-r0NH,Avana,True 18 | J82-311Cas9_RepA_p5_Avana-3,SC-000396.AV01,21,Avana-3,A,2DS,ACH-000396,MC-000396-DZtc,Avana,True 19 | J82-311Cas9_RepB_p5_Avana-3,SC-000396.AV01,21,Avana-3,B,2DS,ACH-000396,MC-000396-DZtc,Avana,False 20 | K029AX-311cas9_RepB_p6_Avana-3,SC-000404.AV01,21,Avana-3,B,2DS,ACH-000404,MC-000404-GqDy,Avana,True 21 | U937-101Cas9 Rep A p6_Avana-3,SC-000406.AV01,21,Avana-3,A,2DS,ACH-000406,MC-000406-va7T,Avana,True 22 | U937-101Cas9 Rep C p6_Avana-3,SC-000406.AV01,21,Avana-3,C,2DS,ACH-000406,MC-000406-va7T,Avana,True 23 | U937-101Cas9 Rep D p6_Avana-3,SC-000406.AV01,21,Avana-3,D,2DS,ACH-000406,MC-000406-va7T,Avana,True 24 | NCI-H1915-311Cas9_RepA_p6_Avana-3,SC-000434.AV01,21,Avana-3,A,2DS,ACH-000434,MC-000434-8t7w,Avana,True 25 | NCI-H1915-311Cas9_RepB_p6_Avana-3,SC-000434.AV01,21,Avana-3,B,2DS,ACH-000434,MC-000434-8t7w,Avana,True 26 | CJM-311Cas9_RepA_p7_Avana-3,SC-000458.AV01,21,Avana-3,A,2DS,ACH-000458,MC-000458-xKvR,Avana,True 27 | CJM-311Cas9_RepB_p7_Avana-3,SC-000458.AV01,21,Avana-3,B,2DS,ACH-000458,MC-000458-xKvR,Avana,True 28 | KNS-81-311cas9_RepA_p5_Avana-3,SC-000479.AV01,21,Avana-3,A,2DS,ACH-000479,MC-000479-6qKr,Avana,True 29 | 
KNS-81-311cas9_RepB_p5_Avana-3,SC-000479.AV01,21,Avana-3,B,2DS,ACH-000479,MC-000479-6qKr,Avana,True 30 | OVISE-311cas9 Rep A p6_Avana-2,SC-000527.AV01,21,Avana-2,A,2DS,ACH-000527,MC-000527-f2HC,Avana,True 31 | OVISE-311cas9 Rep B p6_Avana-2,SC-000527.AV01,21,Avana-2,B,2DS,ACH-000527,MC-000527-f2HC,Avana,True 32 | OVISE-311cas9 Rep C p6_Avana-2,SC-000527.AV01,21,Avana-2,C,2DS,ACH-000527,MC-000527-f2HC,Avana,True 33 | TE6-311cas9_RepA_p6_Avana-3,SC-000605.AV01,21,Avana-3,A,2DS,ACH-000605,MC-000605-mA6N,Avana,True 34 | TE6-311cas9_RepB_p6_Avana-3,SC-000605.AV01,21,Avana-3,B,2DS,ACH-000605,MC-000605-mA6N,Avana,True 35 | RVH421-311Cas9_RepA_p5_Avana-3,SC-000614.AV01,21,Avana-3,A,2DS,ACH-000614,MC-000614-7X1Q,Avana,True 36 | RVH421-311Cas9_RepB_p5_Avana-3,SC-000614.AV01,21,Avana-3,B,2DS,ACH-000614,MC-000614-7X1Q,Avana,True 37 | A2780-311cas9 Rep A p6_Avana-2,SC-000657.AV01,21,Avana-2,A,2DS,ACH-000657,MC-000657-vN70,Avana,True 38 | A2780-311cas9 Rep B p6_Avana-2,SC-000657.AV01,21,Avana-2,B,2DS,ACH-000657,MC-000657-vN70,Avana,True 39 | LOXIMVI-311Cas9_RepA_p6_Avana-2,SC-000750.AV01,21,Avana-2,A,2DS,ACH-000750,MC-000750-bafP,Avana,False 40 | LOXIMVI-311Cas9_RepB_p6_Avana-2,SC-000750.AV01,21,Avana-2,B,2DS,ACH-000750,MC-000750-bafP,Avana,False 41 | RERF-LC-Ad2-311cas9_RepA_p6_Avana-3,SC-000774.AV01,21,Avana-3,A,2DS,ACH-000774,MC-000774-GTYc,Avana,True 42 | RERF-LC-Ad2-311cas9_RepB_p6_Avana-3,SC-000774.AV01,21,Avana-3,B,2DS,ACH-000774,MC-000774-GTYc,Avana,True 43 | LXF-289-311cas9 Rep B p6_Avana-3,SC-000787.AV01,21,Avana-3,B,2DS,ACH-000787,MC-000787-yzmP,Avana,True 44 | LXF-289-311cas9 Rep C p6_Avana-3,SC-000787.AV01,21,Avana-3,C,2DS,ACH-000787,MC-000787-yzmP,Avana,True 45 | LXF-289-311cas9 Rep D p6_Avana-3,SC-000787.AV01,21,Avana-3,D,2DS,ACH-000787,MC-000787-yzmP,Avana,True 46 | A2058-311cas9_RepA_p6_Avana-3,SC-000788.AV01,21,Avana-3,A,2DS,ACH-000788,MC-000788-xvTl,Avana,True 47 | 
A2058-311cas9_RepB_p6_Avana-3,SC-000788.AV01,21,Avana-3,B,2DS,ACH-000788,MC-000788-xvTl,Avana,True 48 | NCI-H2286-311caa9_RepA_p6_Avana-3,SC-000912.AV01,21,Avana-3,A,2DS,ACH-000912,MC-000912-cLYP,Avana,True 49 | NCI-H2286-311caa9_RepB_p6_Avana-3,SC-000912.AV01,21,Avana-3,B,2DS,ACH-000912,MC-000912-cLYP,Avana,True 50 | MDST8-311Cas9 Rep A p6_Avana-3,SC-000935.AV01,21,Avana-3,A,2DS,ACH-000935,MC-000935-E55p,Avana,True 51 | MDST8-311Cas9 Rep C p6_Avana-3,SC-000935.AV01,21,Avana-3,C,2DS,ACH-000935,MC-000935-E55p,Avana,True 52 | MDST8-311Cas9 Rep D p6_Avana-3,SC-000935.AV01,21,Avana-3,D,2DS,ACH-000935,MC-000935-E55p,Avana,True 53 | PF382-311CAS9_RepA_p6_Avana-3,SC-000937.AV01,21,Avana-3,A,2DS,ACH-000937,MC-000937-QPMF,Avana,True 54 | PF382-311CAS9_RepB_p6_Avana-3,SC-000937.AV01,21,Avana-3,B,2DS,ACH-000937,MC-000937-QPMF,Avana,True 55 | HEC265-311Cas9_RepA_p6_Avana-3,SC-000946.AV01,21,Avana-3,A,2DS,ACH-000946,MC-000946-Imv7,Avana,True 56 | HEC265-311Cas9_RepB_p6_Avana-3,SC-000946.AV01,21,Avana-3,B,2DS,ACH-000946,MC-000946-Imv7,Avana,True 57 | SNU407-311Cas9_RepB_p6_Avana-4,SC-000955.AV01,21,Avana-4,B,2DS,ACH-000955,MC-000955-q7cl,Avana,False 58 | MFE-319-311Cas9_RepA_p6_Avana-3,SC-000988.AV01,21,Avana-3,A,2DS,ACH-000988,MC-000988-YnfA,Avana,True 59 | MFE-319-311Cas9_RepB_p6_Avana-3,SC-000988.AV01,21,Avana-3,B,2DS,ACH-000988,MC-000988-YnfA,Avana,True 60 | JHUEM7-311Cas9_RepA_p5_Avana-3,SC-000993.AV01,21,Avana-3,A,2DS,ACH-000993,MC-000993-g8KR,Avana,True 61 | JHUEM7-311Cas9_RepB_p5_Avana-3,SC-000993.AV01,21,Avana-3,B,2DS,ACH-000993,MC-000993-g8KR,Avana,True 62 | HEC-251-311Cas9_RepA_p5_Avana-3,SC-000996.AV01,21,Avana-3,A,2DS,ACH-000996,MC-000996-nXxW,Avana,True 63 | HEC-251-311Cas9_RepB_p5_Avana-3,SC-000996.AV01,21,Avana-3,B,2DS,ACH-000996,MC-000996-nXxW,Avana,True 64 | MONO-MAC1-311cas9_RepA_p6_Avana-3,SC-001129.AV01,21,Avana-3,A,2DS,ACH-001129,MC-001129-nxTf,Avana,True 65 | 
MONO-MAC1-311cas9_RepB_p6_Avana-3,SC-001129.AV01,21,Avana-3,B,2DS,ACH-001129,MC-001129-nxTf,Avana,True 66 | NB1643-311Cas9_RepA_p4_Avana-3,SC-001303.AV01,21,Avana-3,A,2DS,ACH-001303,MC-001303-7eBW,Avana,True 67 | NB1643-311Cas9_RepB_p4_Avana-3,SC-001303.AV01,21,Avana-3,B,2DS,ACH-001303,MC-001303-7eBW,Avana,True 68 | UMUC5-311Cas9_RepA_p6_Avana-3,SC-001411.AV01,21,Avana-3,A,2DS,ACH-001411,MC-001411-EIzh,Avana,True 69 | UMUC5-311Cas9_RepB_p6_Avana-3,SC-001411.AV01,21,Avana-3,B,2DS,ACH-001411,MC-001411-EIzh,Avana,True 70 | KARPAS1718-311cas9-RepA-p6_Avana-4,SC-001533.AV01,21,Avana-4,A,2DS,ACH-001533,MC-001533-SowA,Avana,True 71 | KARPAS1718-311cas9-RepB-p6_Avana-4,SC-001533.AV01,21,Avana-4,B,2DS,ACH-001533,MC-001533-SowA,Avana,True 72 | MEL202-311cas9_RepA_p6_Avana-3,SC-001554.AV01,21,Avana-3,A,2DS,ACH-001554,MC-001554-W5Vn,Avana,True 73 | MEL202-311cas9_RepB_p6_Avana-3,SC-001554.AV01,21,Avana-3,B,2DS,ACH-001554,MC-001554-W5Vn,Avana,True 74 | MM127-311cas9_RepA_p5_Avana-3,SC-001563.AV01,21,Avana-3,A,2DS,ACH-001563,MC-001563-W7yA,Avana,True 75 | MM127-311cas9_RepB_p5_Avana-3,SC-001563.AV01,21,Avana-3,B,2DS,ACH-001563,MC-001563-W7yA,Avana,True 76 | RAMOS-311cas9-RepA-p6_Avana-4,SC-001636.AV01,21,Avana-4,A,2DS,ACH-001636,MC-001636-mxpz,Avana,False 77 | RAMOS-311cas9-RepB-p6_Avana-4,SC-001636.AV01,21,Avana-4,B,2DS,ACH-001636,MC-001636-mxpz,Avana,False 78 | TGW-311Cas9-RepA-P6_Avana-4,SC-001674.AV01,21,Avana-4,A,2DS,ACH-001674,MC-001674-vHBi,Avana,True 79 | TGW-311Cas9-RepB-P6_Avana-4,SC-001674.AV01,21,Avana-4,B,2DS,ACH-001674,MC-001674-vHBi,Avana,True 80 | HB11;19-311CAS9_RepA_p6_Avana-3,SC-001736.AV01,21,Avana-3,A,2DS,ACH-001736,MC-001736-pJnl,Avana,True 81 | HB11;19-311CAS9_RepB_p6_Avana-3,SC-001736.AV01,21,Avana-3,B,2DS,ACH-001736,MC-001736-pJnl,Avana,True 82 | ICC108-311cas9-RepA-p4_Avana-3,SC-001836.AV01,21,Avana-3,A,2DS,ACH-001836,MC-001836-jYoz,Avana,True 83 | ICC108-311cas9-RepB-p4_Avana-3,SC-001836.AV01,21,Avana-3,B,2DS,ACH-001836,MC-001836-jYoz,Avana,True 84 | 
ICC2-311cas9_RepA_p6_Avana-3,SC-001842.AV01,21,Avana-3,A,2DS,ACH-001842,MC-001842-XQ3q,Avana,True 85 | ICC2-311cas9_RepB_p6_Avana-3,SC-001842.AV01,21,Avana-3,B,2DS,ACH-001842,MC-001842-XQ3q,Avana,True 86 | ICC3-311cas9_RepA_p6_Avana-3,SC-001843.AV01,21,Avana-3,A,2DS,ACH-001843,MC-001843-hmPS,Avana,True 87 | ICC3-311cas9_RepB_p6_Avana-3,SC-001843.AV01,21,Avana-3,B,2DS,ACH-001843,MC-001843-hmPS,Avana,True 88 | SAS-311cas9-RepA-p6_Avana-4,SC-002029.AV01,21,Avana-4,A,2DS,ACH-002029,MC-002029-jZk3,Avana,True 89 | SAS-311cas9-RepB-p6_Avana-4,SC-002029.AV01,21,Avana-4,B,2DS,ACH-002029,MC-002029-jZk3,Avana,True 90 | RVH421RPMI-311Cas9-RepA-p6_Avana-4,SC-002875.AV01,21,Avana-4,A,2DS,ACH-000614,MC-002875-AKge,Avana,True 91 | RVH421RPMI-311Cas9-RepB-p6_Avana-4,SC-002875.AV01,21,Avana-4,B,2DS,ACH-000614,MC-002875-AKge,Avana,True 92 | UPMD1-311cas9-RepA-p6_Avana-4,SC-002926.AV01,21,Avana-4,A,2DS,ACH-002926,MC-002926-BDYS,Avana,True 93 | UPMD1-311cas9-RepB-p6_Avana-4,SC-002926.AV01,21,Avana-4,B,2DS,ACH-002926,MC-002926-BDYS,Avana,True 94 | pDNA_batch_Avana-4,pDNA,0,Avana-4,,pDNA,pDNA,pDNA,Avana,True 95 | pDNA_batch_Avana-3,pDNA,0,Avana-3,,pDNA,pDNA,pDNA,Avana,True 96 | pDNA_batch_Avana-2,pDNA,0,Avana-2,,pDNA,pDNA,pDNA,Avana,True 97 | -------------------------------------------------------------------------------- /chronos/figshare.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import json 3 | import os 4 | import requests 5 | import zipfile 6 | 7 | from requests.exceptions import HTTPError 8 | 9 | # CONSTANTS 10 | BASE_URL = 'https://api.figshare.com/v2/{endpoint}' 11 | CHUNK_SIZE = 10485760 12 | 13 | # FIGSHARE 14 | FIGSHARE_TOKEN = 'c727b3826353164e0ae4ba35c9325ee339597f2ca66c3ab4349394cc0bcf2662b91f0a3e722f41b0d26152c65ed63692c834a18be6d7df8141d16e3fed834aa4' 15 | FIGSHARE_ID = 21411663 16 | MODEL_NAME = 'mini_model.zip' 17 | # ^ credentials for smaffa and test article id 18 | 19 | 20 | # FIGSHARE_TOKEN = 21 | # 
FIGSHARE_ID = (14067047?) 22 | # MODEL_NAME = 23 | 24 | 25 | ##### GENERAL API UTILS ##### 26 | 27 | def raw_issue_request(method, url, data=None, binary=False): 28 | ''' 29 | Helper for issuing an HTTPS request 30 | ''' 31 | headers = {'Authorization': 'token ' + FIGSHARE_TOKEN} 32 | if data is not None and not binary: 33 | data = json.dumps(data) 34 | response = requests.request(method, url, headers=headers, data=data) 35 | try: 36 | response.raise_for_status() 37 | try: 38 | data = json.loads(response.content) 39 | except ValueError: 40 | data = response.content 41 | except HTTPError as error: 42 | print('Caught an HTTPError: {}'.format(error.message)) 43 | print('Body:\n', response.content) 44 | raise 45 | 46 | return data 47 | 48 | 49 | def issue_request(method, endpoint, *args, **kwargs): 50 | ''' 51 | Formats an HTTPS request 52 | ''' 53 | return raw_issue_request(method, BASE_URL.format(endpoint=endpoint), *args, **kwargs) 54 | 55 | 56 | ##### UPLOADING ##### 57 | 58 | def zip_chronos_model(path, archive_name=None): 59 | ''' 60 | Zip the necessary files for storing a Chronos model 61 | ''' 62 | files_in_path = os.listdir(path) 63 | necessary_files = ['chronos_ge_unscaled.hdf5', 64 | 'guide_efficacy.csv', 65 | 'cell_line_efficacy.csv', 66 | 'screen_delay.csv', 67 | 'library_effect.csv'] 68 | for filename in necessary_files: 69 | assert filename in files_in_path, "Cannot locate file {} in target directory {}".format(filename, path) 70 | 71 | if archive_name is None: 72 | archive_name = path.rstrip('/') 73 | 74 | with zipfile.ZipFile(archive_name + '.zip', mode='w', compression=zipfile.ZIP_DEFLATED) as ziph: 75 | for filename in necessary_files: 76 | ziph.write(os.path.join(path, filename), 77 | os.path.relpath(os.path.join(archive_name, filename), 78 | os.path.join(path, '..'))) 79 | return archive_name + '.zip' 80 | 81 | 82 | def list_files_of_article(article_id, private=True): 83 | ''' 84 | List all the files present in a figshare article 85 | ''' 86 | if 
private: 87 | result = issue_request('GET', 'account/articles/{}/files'.format(article_id)) 88 | else: 89 | result = issue_request('GET', 'articles/{}/files'.format(article_id)) 90 | print('Listing files for article {}:'.format(article_id)) 91 | if result: 92 | for item in result: 93 | print(' {id} - {name}'.format(**item)) 94 | else: 95 | print(' No files.') 96 | 97 | 98 | def create_article(title): 99 | ''' 100 | Make a new figshare article 101 | ''' 102 | data = { 103 | 'title': title 104 | } 105 | result = issue_request('POST', 'account/articles', data=data) 106 | print('Created article:', result['location'], '\n') 107 | 108 | result = raw_issue_request('GET', result['location']) 109 | 110 | return result['id'] 111 | 112 | 113 | def get_file_check_data(file_name): 114 | ''' 115 | Ensure file can be streamed for upload 116 | ''' 117 | with open(file_name, 'rb') as fin: 118 | md5 = hashlib.md5() 119 | size = 0 120 | data = fin.read(CHUNK_SIZE) 121 | while data: 122 | size += len(data) 123 | md5.update(data) 124 | data = fin.read(CHUNK_SIZE) 125 | return md5.hexdigest(), size 126 | 127 | 128 | def initiate_new_upload(article_id, file_name): 129 | ''' 130 | Initiate the upload process for a file 131 | ''' 132 | endpoint = 'account/articles/{}/files' 133 | endpoint = endpoint.format(article_id) 134 | 135 | md5, size = get_file_check_data(file_name) 136 | data = {'name': os.path.basename(file_name), 137 | 'md5': md5, 138 | 'size': size} 139 | 140 | result = issue_request('POST', endpoint, data=data) 141 | print('Initiated file upload:', result['location'], '\n') 142 | 143 | result = raw_issue_request('GET', result['location']) 144 | 145 | return result 146 | 147 | 148 | def complete_upload(article_id, file_id): 149 | ''' 150 | Complete the file upload 151 | ''' 152 | issue_request('POST', 'account/articles/{}/files/{}'.format(article_id, file_id)) 153 | 154 | 155 | def upload_parts(file_info, file_name): 156 | ''' 157 | Uploads an entire file in chunks 158 | ''' 159 
| url = '{upload_url}'.format(**file_info) 160 | result = raw_issue_request('GET', url) 161 | 162 | print('Uploading parts:') 163 | with open(file_name, 'rb') as fin: 164 | for part in result['parts']: 165 | upload_part(file_info, fin, part) 166 | 167 | 168 | def upload_part(file_info, stream, part): 169 | ''' 170 | Uploads a single chunk of a file 171 | ''' 172 | udata = file_info.copy() 173 | udata.update(part) 174 | url = '{upload_url}/{partNo}'.format(**udata) 175 | 176 | stream.seek(part['startOffset']) 177 | data = stream.read(part['endOffset'] - part['startOffset'] + 1) 178 | 179 | raw_issue_request('PUT', url, data=data, binary=True) 180 | print(' Uploaded part {partNo} from {startOffset} to {endOffset}'.format(**part)) 181 | 182 | 183 | def upload(file_path, article_id=None, article_title=None, overwrite=False): 184 | ''' 185 | Uploads a local file to the specified article, or creates a new article with the file 186 | ''' 187 | # create article if not exists 188 | if article_id is None: 189 | assert article_title is not None, 'No article_id supplied, please provide a title for the new dataset or specify the id of an existing one' 190 | article_id = create_article(article_title) 191 | else: 192 | # check if file exists 193 | response = issue_request('GET', 'account/articles/{article_id}'.format(article_id=article_id)) 194 | file_list = response['files'] 195 | 196 | for file_info in file_list: 197 | if file_info['name'] == os.path.basename(file_path): 198 | if overwrite: 199 | # Delete the existing file first 200 | issue_request('DELETE', 'account/articles/{article_id}/files/{file_id}'.format(article_id=article_id, file_id=file_info['id'])) 201 | else: 202 | # Throw an error 203 | raise ValueError('{} exists in figshare article'.format(os.path.basename(file_path))) 204 | 205 | 206 | # Upload the file 207 | file_info = initiate_new_upload(article_id, file_path) 208 | upload_parts(file_info, file_path) 209 | complete_upload(article_id, file_info['id']) 210 | 
211 | list_files_of_article(article_id) 212 | 213 | 214 | ##### DOWNLOADING ##### 215 | 216 | def unzip(archive_path, target_path=None): 217 | ''' 218 | Unzips a zip archive into the target directory 219 | ''' 220 | if target_path is None: 221 | target_path = os.path.dirname(archive_path) 222 | 223 | with zipfile.ZipFile(archive_path, 'r') as ziph: 224 | ziph.extractall(target_path) 225 | 226 | return os.path.join(target_path, os.path.basename(archive_path).rstrip('.zip')) 227 | 228 | 229 | def download_files_from_article(article_id, target_directory=None, fileset=None, private=False): 230 | ''' 231 | Downloads files from a public (or private) Figshare article 232 | Parameters: 233 | article_id (`str` or `int`): identifier for a Figshare dataset 234 | target_directory (`str`): the location to download files into; if None, creates a local directory named by article_id 235 | fileset (iterable): Figshare file ids or names to download 236 | ''' 237 | 238 | if private: # for test purposes 239 | response = issue_request('GET', 'account/articles/{article_id}'.format(article_id=article_id)) 240 | else: 241 | response = issue_request('GET', 'articles/{article_id}'.format(article_id=article_id)) 242 | 243 | headers = {'Authorization': 'token ' + FIGSHARE_TOKEN} 244 | 245 | file_list = response['files'] 246 | 247 | if target_directory is None: # save the downloads by the article id 248 | target_directory = 'figshare_{}'.format(article_id) 249 | if not os.path.exists(target_directory): 250 | os.makedirs(target_directory) 251 | 252 | for file_info in file_list: 253 | if file_info['id'] in fileset or file_info['name'] in fileset: 254 | r = requests.get('https://ndownloader.figshare.com/files/{file_id}'.format(file_id=file_info['id']), 255 | allow_redirects=True, headers=headers) 256 | with open(os.path.join(target_directory, file_info['name']), 'wb') as f: 257 | for chunk in r.iter_content(1024): 258 | f.write(chunk) 259 | print('Downloaded {} from article 
{}'.format(file_info['name'], article_id)) 260 | print('Downloads are located at {}/'.format(target_directory)) 261 | 262 | 263 | # def download_files_from_article(article_id, target_directory=None, fileset=None): 264 | # ''' 265 | # Downloads files from a public Figshare article 266 | # Parameters: 267 | # article_id (`str` or `int`): identifier for a Figshare dataset 268 | # target_directory (`str`): the location to download files into; if None, creates a local directory named by article_id 269 | # fileset (iterable): Figshare file ids or names to download 270 | # ''' 271 | 272 | # response = issue_request('GET', 'articles/{article_id}'.format(article_id=article_id)) 273 | 274 | # headers = {'Authorization': 'token ' + FIGSHARE_TOKEN} 275 | 276 | # file_list = response['files'] 277 | 278 | # if target_directory is None: # save the downloads by the article id 279 | # target_directory = 'figshare_{}'.format(article_id) 280 | # if not os.path.exists(target_directory): 281 | # os.makedirs(target_directory) 282 | 283 | # for file_info in file_list: 284 | # if file_info['id'] in fileset or file_info['name'] in fileset: 285 | # r = requests.get('https://ndownloader.figshare.com/files/{file_id}'.format(file_id=file_info['id']), 286 | # allow_redirects=True, headers=headers) 287 | # with open(os.path.join(target_directory, file_info['name']), 'wb') as f: 288 | # for chunk in r.iter_content(1024): 289 | # f.write(chunk) 290 | # print('Downloaded {} from article {}'.format(file_info['name'], article_id)) 291 | # print('Downloads are located at {}'.format(target_directory)) 292 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Chronos: an algorithm for inferring gene fitness effects from CRISPR knockout experiments. 
2 | 3 | A full description and benchmarking of Chronos 1 are available in a publication: https://doi.org/10.1186/s13059-021-02540-7 4 | 5 | An additional preprint describing the changes made to Chronos 2 will be released when the underlying data is public, expected 2024. 6 | 7 | NEW IN 2.3: The Chronos hit-calling module now allows you to assess significance and control false discovery. See details in the preprint: https://doi.org/10.1101/2025.04.24.650434. Examples are given in Vignette.py. 8 | 9 | # When to use it 10 | Chronos is well suited for any CRISPR KO experiment where: 11 | - You measured initial pDNA sgRNA readcounts and readcounts at one or more later time points. 12 | - You might have one or more cell lines. 13 | - You might have one library, or be combining data from multiple libraries. 14 | - Genome-wide or sub-genome coverage. 15 | - You expect most cells to be proliferating. 16 | - You expect the majority of gene knockouts to have little to no effect on proliferation. 17 | - You might or might not have copy number data for your cell lines. 18 | - You might be using CRISPRko or CRISPRi. Chronos was developed for CRISPRko, but should work for CRISPRi experiments. 19 | - You might want to compare the effects of gene knockouts between screens in two different conditions, for example, treated and untreated. However, please note that to estimate the significance of the difference in effects, Chronos requires two independent biological replicates per condition (ideally infected separately) 20 | 21 | Chronos may not work well for: 22 | - RNAi experiments. Chronos makes biological assumptions that are fundamentally incompatible with RNAi. Try DEMETER 2. 23 | - Rescue experiments. If most cells are dying, we can't offer any guarantees of Chronos' performance. 24 | - A focused essential gene library, for the same reason. 25 | - Multi-condition experiments where your only control is a late time point (such as DMSO). Chronos requires pDNA abundance. 
26 | 27 | We strongly recommend having at least two sgRNAs per gene. This is true regardless of the algorithm you use. 28 | 29 | Chronos is competitive with or superior to the other CRISPR algorithms we tested given readcounts from only one late time point, but it will perform even better with multiple late time points if your experiment has them. 30 | 31 | 32 | # Installation 33 | 34 | ## Note on Mac M1 chips 35 | 36 | As of 09/01/2023, `pip install tensorflow` should work on Macs with arm64. 37 | 38 | ## Installing Chronos 39 | 40 | If you have `pip` installed, you can install Chronos from PyPI with 41 | 42 | ` $ pip install crispr_chronos` 43 | 44 | However, we recommend downloading this repository as well to run the vignette and download the DepMap trained Chronos parameters. 45 | 46 | Chronos `model` requires `python 3` with the packages `tensorflow 2.x`, `numpy`, `pandas`,`h5py`. However, additional modules require additional packages which will be installed by default if missing: `patsy`, `statsmodels`, `scipy`, `matplotlib`, `seaborn`, `adjust_text`, `scikit-learn`, `umap`, `reportlab`. 47 | 48 | # Getting Started 49 | If you have jupyter notebook, you should run through `Vignette.ipynb`. This will both verify that you have a working installation and demonstrate a typical workflow for Chronos. Chronos is meant to be run in a python environment. 50 | 51 | To run Chronos, you need a minimum of three Pandas dataframes: 52 | 53 | 1. _readcounts_: A matrix of raw readcounts, where the columns are targeting sgRNAs, the rows are pDNA sequencing samples or replicate samples, and the entries are the number of reads of the given sgRNA in the given sample. Notice that in Chronos matrices, GUIDES and GENES are always COLUMNS and SAMPLES are always ROWS. Readcounts can have null values as long as no column or row is entirely null. 54 | 55 | 2. 
_sequence_map_: A table with at least four columns, `sequence_ID`, `cell_line_name`, `pDNA_batch`, and `days`, mapping sequencing samples to cell lines and pDNA measurements. `sequence_ID` should match the row names of the raw readcounts. `days` is the number of days between infection and when the sample was collected, should be integer or float. It will be ignored for pDNA samples. `cell_line_name` MUST be "pDNA" for pDNA samples. if, instead of pDNA, you are sequencing your cells at a very early time point to get initial library abundance, treat these as pDNA samples. If you don't have either, Chronos may not be the right algorithm for your experiment. `pDNA_batch` is needed when your experiment combines samples that have different pDNA references (within the same library). This is the case for Achilles because the PCR primer strategy has changed several times during the course of the experiment. pDNA samples belonging to the same batch will be combined into a single reference. If you don't have pDNA batches, just fill this column some value, such as "batch1". 56 | 57 | 3. _guide_gene_map_: A table with at least two columns, `sgrna` and `gene`, mapping the sgRNAs to genes. Chronos will not accept sgRNAs that map to more than one gene. This is intentional. `sgrna` entries should match the columns in raw readcounts. `gene` can be in any format. 58 | 59 | To benefit from improved normalization and allow Chronos to infer the overdispersion of screens, supplying a list or array of `negative_control_sgrnas` is also necessary. These are simply the sgRNAs which you believe should have no viability effect in any of your screens. It is much better to use cutting than noncutting controls, and as many as possible. 60 | 61 | We've found that a small number of clones in CRISPR cell lines will exhibit dramatic outgrowth that seems unrelated to the intended CRISPR perturbation. 
We recommend you remove these in place by running 62 | 63 | import chronos 64 | chronos.nan_outgrowths(readcounts, sequence_map, guide_gene_map) 65 | 66 | You can then initialize the Chronos model 67 | 68 | model = chronos.Chronos( 69 | readcounts={'my_library': readcounts}, 70 | sequence_map={'my_library': sequence_map}, 71 | guide_gene_map={'my_library': guide_gene_map}, 72 | negative_control_sgrnas={'my_library': negative_control_sgrnas} 73 | ) 74 | 75 | 76 | This odd syntax is used because it allows you to process results from different libraries at the same time. If you have libraries 1 and 2, and readcounts, sequence maps, guide maps, and negative control sgRNAs for them, you would initialize Chronos as such: 77 | 78 | model = chronos.Chronos( 79 | readcounts={'my_library1': readcounts1, 'my_library2': readcounts2}, 80 | sequence_map={'my_library': sequence_map, 'my_library2': sequence_map2}, 81 | guide_gene_map={'my_library': guide_gene_map, 'my_library2': guide_gene_map2}, 82 | negative_control_sgrnas={'my_library1': negative_control_sgrnas1, 'my_library2': negative_control_sgrnas2} 83 | ) 84 | 85 | Either way, you can then train Chronos by calling 86 | 87 | model.train() 88 | 89 | Once the model is trained, you can save all the parameters by calling 90 | 91 | model.save("my_save_directory") 92 | 93 | You can also directly access model parameters, for example: 94 | 95 | gene_effect = model.gene_effect 96 | guide_efficacy = model.guide_efficacy 97 | 98 | `gene_effect` is the primary attribute you will be interested in in 99% of use cases. It is a numerical matrix indexed on rows by `cell_line_name` and on columns by `gene`, with values indicating the _relative change in growth rate_ caused by successful knockout of the gene. 0 indicates no change, negative values a loss of viability, and positive values a gain of viability. 
NaNs in this matrix can occur because no sgRNAs targeting the gene 99 | 100 | Note some parameters will be dictionaries or tables, because they are learned separately per library. 101 | 102 | If you have labeled gene_level copy number data, Chronos has an option to correct the gene effect matrix. We recommend first globally normalizing the gene effect matrix so the median of all common essential gene scores is -1 and the median of all nonessential genes is 0. Unlike CERES outputs, we do NOT recommend normalizing per cell line. Chronos includes parameters like `cell_line_growth_rate` and `cell_line_efficacy` along with other regularization terms that help align data between cell lines. 103 | 104 | gene_effect -= gene_effect.reindex(columns=my_nonessential_gene_list).median(axix=1).median() 105 | gene_effect /= gene_effect.reindex(columns=my_essential_gene_list).median(axis=1).abs().median() 106 | gene_effect_corrected, shifts = chronos.alternate_cn(gene_effect, copy_number) 107 | chronos.write_hdf5(gene_effect_corrected, "my_save_directory/gene_effect.hdf5") 108 | 109 | The copy number matrix needs to be aligned to the gene_effect_matrix. Additionally, we assume that it is in the current CCLE format: log2(relative CN + 1), where CN 1 means the relative CN matches the reference. This may still work fine with CN with different units, but has not been tested. 110 | 111 | New functionality in Chronos 2.x includes two types of quality control reports, one you can run on your raw data, the other on the trained Chronos results, and the ability to load DepMap public Chronos runs and use the trained parameters for processing your own screens (if they are in a public DepMap library, currently just Avana and KY). See the vignette for details on how to do this. 112 | 113 | # Calling hits 114 | 115 | New functionality in Chronos 2.3.x includes the `hit_calling` module, which allows you to assess the statistical significance of Chronos results. 
See the preprint for a detailed explanation and benchmarking of the methods: https://doi.org/10.1101/2025.04.24.650434 116 | 117 | ## Identify significantly depleting knockouts 118 | To get empirical p-values that a gene knockout causes a true negative viability phenotype (requires a list of many negative control genes) from a gene effect matrix (which can be supplied by Chronos or any other algorithm, as long as negative = more dependent): 119 | 120 | from chronos.hit_calling import get_pvalue_dependent, get_fdr_from_pvalues 121 | pvalues = get_pvalue_dependent(gene_effect, negative_control_genes) 122 | fdr_from_pvalues = get_fdr_from_pvalues(pvalues) 123 | 124 | `hit_calling` also includes an empirical Bayesian method for controlling false discovery. This method generates posterior probabilities that a given gene effect score was generated from the distribution of positive control genes rather than the negative control genes - i.e., the probability that the cell line is dependent on the gene. 125 | 126 | from chronos.hit_calling import get_probability_dependent, get_fdr_from_probability 127 | probabilities = get_probability_dependent(gene_effect, negative_control_genes, positive_control_genes) 128 | fdr_from_probabilities = get_fdr_from_probabilities(probabilities) 129 | 130 | DepMap published `fdr_from_probabilities` every quarter as CRISPRGeneDependency. This method is generally preferable over the frequentist version since it is better-calibrated and produces good results even with relatively few controls (on the order of 10s), but it does require a good set of positive controls that represent the full range of expected dependent phenotypes. If you only include highly lethal knockouts in your positive control set, you should expect to be limited in detecting less extreme loss of viability phenotypes in other knockouts. 
131 | 132 | ## Comparing gene effect between two screening conditions 133 | 134 | A common experimental design involves running a CRISPR screen with the same library on the same cell line multiple times with some experimental condition changed - such as in the presence or absence of a drug, an isogenic perturbation, or a different growth condition. The `hit_calling.ConditionComparison` will report p-values for differences of viabiliy effects between any two conditions in such an experiment, *provided* you have at least two independent biological replicates for your condition. Initializing `ConditionComparison` is almost exactly the same as initializing a `Chronos` instance, except that instead of a `sequence_map`, you must provide a `condition_map` which has all of the same columns as `sequence_map`, plus a `replicate` column and a `condition` column. The `condition` column tells Chronos which replicates belong to which condition; you can choose any labels you like. The `replicate` column tells Chronos which late time points are different sequencing results of the same biological replicate. If you only have one late timepoint for each biological replicate, you can fill this column with any labels as long as they are unique to each row. `condition` and `replicate` for rows with `cell_line_name == "pDNA"` will be ignored. 135 | 136 | from chronos.hit_calling import ConditionComparison 137 | comparator = ConditionComparison( 138 | readcounts={"my_library": my_readcounts}, 139 | condition_map={"my_library": my_condition_map}, 140 | guide_gene_map={"my_library": my_guide_map}, 141 | negative_control_sgrnas={"my_library": my_negative_controls} 142 | ) 143 | 144 | You can also pass `negative_control_genes` instead of `negative_control_sgrnas`, and in fact this is recommended. 
If you do, you only pass one list rather than a dict of entries per library: 145 | 146 | comparator = ConditionComparison( 147 | readcounts={"my_library": my_readcounts}, 148 | condition_map={"my_library": my_condition_map}, 149 | guide_gene_map={"my_library": my_guide_map}, 150 | negative_control_genes=my_negative_control_genes 151 | ) 152 | 153 | To compare screens in DrugA to screens in Control, you would call 154 | 155 | drugA_vs_control_statistics = comparator.compare_conditions(conditions=("Control", "DrugA")) 156 | 157 | Of course, the two conditions named in `compare_conditions` must be present in `condition_map["condition"]`. 158 | 159 | Running `compare_conditions` requires Chronos to build and train at least four models, so expect this to take longer than a typical Chronos run. It will also be less verbose by default. Problems can arise when the "biological replicates" are not genuinely independent replicates - for example, if a single pool of cells was infected with the CRISPR library, then split into replicates, we've observed that even knockouts with no viability effects will be more correlated with their coinfected partners than with other replicates. Chronos will try to check for this and do its best to report and correct for problems. 160 | 161 | 162 | # Expected run times 163 | The full Achilles dataset takes 3-4 hours to run a gcloud VM with 52 GB of memory. Training the vignette in this package should take around 10 minutes on a typical laptop. 164 | 165 | # Other Chronos Options 166 | The Chronos model has a large number of hyperparameters which are described in the model code. Generally we advise against changing these. We've tested them in a wide variety of experimental settings and found the defaults work well. However, a few may be worth tweaking if you want to try and maximize performance. 
If you do choose to tune the hyperparameters, make sure you evaluate the results with a metric that captures what you really want to get out of the data. We decribe the hyperparameters that might be worth changing here. 167 | 168 | - `gene_effect_hierarchical` and `gene_effect_smoothing`: The first of these is a CERES style penalty that punishes gene effect scores in individual cell lines for deviating from the mean. The second punishes the deviation of a REGION of gene effect scores in a cell line from the mean, where a region is a contiguous block of genes arranged by their mean gene effect. Cranking up the first of these will reduce the variance within genes, potentially losing interesting differences between samples (but improving measures of control separation within samples). Cranking up the second can produce artifacts in the tails of gene effect, especially if `gene_effect_hierarchical` is too low. If you don't care about differences between samples, or have strong reason to believe all your samples should give the same results, you could consider increasing both of these. 169 | 170 | - `kernel_width`: this is the width of the gaussian kernel applied for `gene_effect_smoothing`. The number of genes used to calculation regional deviation from the mean for each gene will be 6x this number, 3x in each direction from the gene in question. Consider reducing this from its default value (50) for subgenome libraries. 171 | 172 | - `cell_efficacy_guide_quantile`: Chronos pre-estimates how efficacious a cell line is (you could think of this as related to Cas9 activity in the cell line). To do this, it looks at the nth percentile guide's log fold change and takes that as the maximum real depletion the cell line can achieve. If screening a small library, especially one highly biased towards essentials, you might consider increasing it from the default value of 0.01. 
173 | 174 | - `library_batch_reg`: this regularizes the mean gene effect within libraries towards the mean effect across libraries. Has no effect unless you have more than one library in the run. Note that this is one of two Chronos properties that removes library batch effects; the other is the internal matrix of `library_batch_effect`, which can't be turned off. If you think there should be real biological differences between your libraries, consider concatenating the input files into a single pseudolibrary. On the other hand, if you have two screen batches in the same library and you want to correct batch effects, you can split your screens into two pseudolibraries with the same sgRNAs in each. 175 | 176 | - `scale_cost`: amplifies or diminishes the cost function. Lowering this value effectively increases the strength of all regularization terms. 177 | 178 | 179 | # Tools that are useful outside of Chronos: 180 | 181 | ## Preprocessing tools: 182 | 183 | - `nan_outgrowths` will remove readcounts suspected to be caused by clonal outgrowth (see Michlits et. al., https://doi.org/10.1038/nmeth.4466 for a description of this phenomenon in CRISPR screens). 184 | 185 | - `normalize_readcounts` will sum pDNA measurements of the same pDNA batch, align the different batches by mode in log space, then align replicates to their pDNA batch by median abundance of the negative controls (if negative controls are supplied) 186 | 187 | - `calculate_fold_change` will convert a readcounts matrix into a fold change. 
Will use RPM normalization by default, which will undo the normalization in `normalize_readcounts` 188 | 189 | - `estimate_alpha` estimates the overdispersion parameter of the NB2 negative binomial counts model on a per-replicate basis using negative controls 190 | 191 | 192 | ## Postprocessing tools: 193 | 194 | - `alternate_CN`, a copy number correction method that accepts any gene effect matrix and a gene-level copy number matrix and returns a corrected gene effect matrix. 195 | 196 | 197 | ## QC reports (requires the matplotlib, seaborn, and reportlab packages): 198 | 199 | - `reports.qc_initial_data` takes in readcounts, a guide map, a sequence map, and optionally postive and negative control sgRNAs and provides a number of plots and metrics to assess the quality of CRISPR screen data. 200 | 201 | - `reports.qc_dataset` evaluates data quality after Chronos processing. You will want to call `.save` on your trained model to create a properly formatted directory to load with this function. Some aspects of the QC require omics data in various forms. See the vignette for a walkthrough. 202 | 203 | 204 | ## Generally useful functions: 205 | 206 | - `read_hdf5` and `write_hdf5` allow you to translate numerical matrices between pandas DataFrames and effiicient binary files. 207 | 208 | - `evaluations.fast_cor` efficiently computes the correlation matrix of one or two matrices (pandas DataFrames) with block null values. `evalautions.fast_cor_core` accepts numpy arrays as inputs instead. 209 | 210 | - `evaluations.nnmd`, `evaluations.auroc`, and `evaluations.pr_auc` compute control separation metrics 211 | 212 | - `plotting.density_scatter` produces a scatter plot with points colored by density, a trendline (much more efficient than seaborn's version), and optionally a diagonal, along with several options for labeling outlier points. 
213 | 214 | - `plotting.binplot` turns scatter data into a boxplot by binning one axis, which can reveal trends that are hard to see with scatter 215 | 216 | - `plotting.dict_plot` takes a dictionary of data and produces a subplot per entry, titled with its key. 217 | 218 | -------------------------------------------------------------------------------- /chronos/plotting.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import numpy as np 3 | import pandas as pd 4 | from warnings import warn 5 | 6 | from matplotlib import pyplot as plt 7 | from matplotlib import cm 8 | from matplotlib import colormaps 9 | from matplotlib.ticker import FormatStrFormatter 10 | from matplotlib.colors import Normalize, LogNorm 11 | from matplotlib.cm import ScalarMappable 12 | import matplotlib.patheffects as pe 13 | 14 | from scipy.interpolate import interpn 15 | from statsmodels.nonparametric.smoothers_lowess import lowess 16 | try: 17 | from adjustText import adjust_text 18 | adjust_text_present = True 19 | except: 20 | 21 | adjust_text_present = False 22 | 23 | 24 | def lowess_trend(x, y, frac=.25, max_points=2000, min_points=50, delta_frac=.01, **kwargs): 25 | ''' 26 | A wrapper for statsmodel's lowess with a somewhat more useful parameterization 27 | Parameters: 28 | `x`, `y`: the points. `y` will be smoothed as a function of `x`. 29 | `frac`: `float` in [0, 1]. The fraction of the points used for each linear regression. 30 | `min_points`: `int`. The maximum number of points to be used for each linear regression. 31 | Overrides `frac` when larger. 32 | `max_points`: `int`. The maximum number of points to be used for each linear regression. 33 | Overrides `frac` when smaller. 34 | `delta_frac`: the fraction of the range of `x` within which to use linear interpolation 35 | ` instead of a new regression. 36 | Other args passed to lowess. 37 | Returns: 38 | The unsorted smoothed y values. 
39 | ''' 40 | frac = min(max_points/len(x), frac) 41 | frac = max(frac, min_points/len(x)) 42 | frac = np.clip(frac, 0, 1) 43 | rng = x.max() - x.min() 44 | delta = min(delta_frac * rng, 50/len(x)*rng) 45 | delta = min(delta, rng) 46 | return lowess(y, x, frac, delta=delta, is_sorted=False, return_sorted=False, **kwargs) 47 | 48 | 49 | def identify_outliers_by_trend(x, y, n_outliers, y_trend=None, min_outlier_std=3, **kwargs): 50 | ''' 51 | Get the `n_outliers` farthest from the smoothed trend. y_trend is not supplied, it will be estimated 52 | using `lowess_trend`. 53 | Parameters: 54 | `x`, `y`: the points. `y` will be smoothed as a function of `x`. 55 | `n_outliers`: how many outliers to return. 56 | `y_trend`: The trend of y(x). If not provided, will be estimated by lowess. 57 | `min_outlier_std` (`float`): `y` must be at least this many standard deviations from `y_trend` 58 | (standard deviation measured as deviation from the trend) to be an outlier. 59 | Other args are passed to `lowess_trend`, if used. 60 | Returns: 61 | the numerical index of the outliers 62 | ''' 63 | if y_trend is None: 64 | y_trend = lowess_trend(x, y, **kwargs) 65 | 66 | diff = y-y_trend 67 | sd = np.std(diff) 68 | normed = np.abs(diff/sd) 69 | index = np.arange(len(normed)).astype(int) 70 | candidates = normed[normed > min_outlier_std] 71 | index = index[normed > min_outlier_std] 72 | order = np.argsort(candidates) 73 | return index[order[-n_outliers:]] 74 | 75 | 76 | def identify_outliers_by_density(x, y, density, n_outliers, candidate_density_quantile=.05, high_density_quantile=.5, 77 | max_candidates=500, max_high=10000): 78 | ''' 79 | Identify outliers in 2D space by point density. This is done by first identifying a set of candidate points of lowest 80 | density, then a set of points with high density, then choosing candidates that have the greatest minimum distance 81 | to any point with high density. 
82 | Parameters: 83 | `x`, `y`, `density`: 1D arrays giving the position of each point and the estimated density of points at that position 84 | `n_outliers`: how many outliers to return. If fewer candidates are found than the number of requested outliers, 85 | all candidates will be returned. 86 | `candidate_density_quantile`: the fraction of points to choose as possible outliers based on density 87 | `high_density_quantile`: the fraction of points to be treated as dense 88 | ` max_candidates`: overrides `candidate_density_quantile` if too many candidates are considered. Useful for very large datasets. 89 | ` max_high`: overrides `high_density_quantile` if too many high density points are considered. Useful for very large datasets. 90 | Returns: 91 | the numerical index of the outliers 92 | ''' 93 | if not (len(x)==len(y)==len(density)): 94 | raise ValueError("`x`, `y`, and `density` must have the same length") 95 | if candidate_density_quantile > high_density_quantile: 96 | raise ValueError("`candidate_density_quantile` must be less than `high_density_quantile`") 97 | candidate_density_quantile = min(candidate_density_quantile, max_candidates/len(x)) 98 | high_density_quantile = max(high_density_quantile, 1-max_high/len(x)) 99 | 100 | candidates = np.arange(len(density)).astype(int)[density < np.quantile(density, candidate_density_quantile)] 101 | high_density = density > np.quantile(density, high_density_quantile) 102 | x_diff = np.subtract.outer(x[candidates], x[high_density]) 103 | y_diff = np.subtract.outer(y[candidates], y[high_density]) 104 | r2 = np.square(x_diff) + np.square(y_diff) 105 | r2_min = r2.min(axis=1) 106 | farthest = np.argsort(r2_min)[-n_outliers:] 107 | return candidates[farthest] 108 | 109 | 110 | def identify_outliers_by_diagonal(x, y, n_outliers): 111 | ''' 112 | Identify points in 2D space as outliers by distance from the diagonal x==y, i.e. the points with the greatest difference 113 | between x and y. 
114 | Parameters: 115 | `x`, `y` : 1D arrays giving the position of each point 116 | `n_outliers`: how many outliers to return. If fewer candidates are found than the number of requested outliers, 117 | all candidates will be returned. 118 | Returns: 119 | the numerical index of the outliers 120 | ''' 121 | diff = np.abs(x - y) 122 | diff[pd.isnull(diff)] = 0 123 | order = np.argsort(diff) 124 | return order[-n_outliers:] 125 | 126 | 127 | def identify_outliers_by_zscore(x, y, n_outliers): 128 | ''' 129 | Identify points in 2D space as outliers by zscore. `x` and `y` are first zscored, then combined into a scaled Euclidian distance 130 | from the mean (`x**2 + y**2`). Those with the greatest distance are returned as outliers. 131 | Parameters: 132 | `x`, `y`: 1D arrays giving the position of each point 133 | `n_outliers`: how many outliers to return. If fewer candidates are found than the number of requested outliers, 134 | all candidates will be returned. 135 | Returns: 136 | the numerical index of the outliers 137 | ''' 138 | zx = np.abs(x - np.mean(x))/np.std(x) 139 | zy = np.abs(y - np.mean(y))/np.std(y) 140 | r = zx**2 + zy**2 141 | r[pd.isnull(r)] = 0 142 | order = np.argsort(r) 143 | return order[-n_outliers:] 144 | 145 | 146 | def identify_outliers_1d(x, n_outliers): 147 | ''' 148 | Identify points in 1D space as outliers by distance from median. 149 | Parameters: 150 | `x`: 1D array 151 | `n_outliers`: how many outliers to return. If fewer candidates are found than the number of requested outliers, 152 | all candidates will be returned. 
153 | Returns: 154 | the numerical index of the outliers 155 | ''' 156 | zx = np.abs(x - np.median(x)) 157 | order = np.argsort(zx) 158 | return order[-n_outliers:] 159 | 160 | 161 | def get_density(x, y, bins=50): 162 | ''' 163 | get the 2D density of the 1D arrays `x` and `y` using a histogram with n `bins` 164 | on each axis 165 | ''' 166 | try: 167 | data , x_e, y_e = np.histogram2d( x, y, bins = bins, density = True ) 168 | except ValueError as e: 169 | print(x) 170 | print(y) 171 | print(bins) 172 | raise e 173 | z = interpn( ( 0.5*(x_e[1:] + x_e[:-1]) , 0.5*(y_e[1:]+y_e[:-1]) ) , 174 | data , np.vstack([x,y]).T , 175 | method = "splinef2d", bounds_error = False 176 | ) 177 | 178 | #NaNs should have zero density 179 | z[np.where(np.isnan(z))] = 0.0 180 | z[z < 0] = 0 181 | return z 182 | 183 | 184 | def dict_plot(dictionary, plot_func, figure_width=7.5, min_subplot_width=3.74, 185 | aspect_ratio=.8, aliases={}, xlabel=None, ylabel=None, *args, **kwargs): 186 | ''' 187 | A utility for generating a figure with a subplot for each entry in `dictionary`. 188 | Parameters: 189 | `dictionary` (`dict`): The data to be plotted. The keys of the dictionary will be used 190 | as subplot titles. 191 | `plot_func` (callable): will be called as `plot_func(value, *args, **kwargs)` for each value in ` 192 | dictionary`. 193 | `figure_width` (`float`: total width of the figure 194 | `min_subplot_width` (`float`): when laying out subplots, how narrow they are allowed to be. 
def density_scatter(x, y, ax=None, sort=True, bins=50, trend_line=True, trend_line_args=dict(color='r'),
        lowess_args={}, diagonal=False, diagonal_kws=dict(color='black', lw=.3, linestyle='--'),
        c="density", cbar_label=None,
        label_specific=[], label_outliers=0, outliers_from='trend',
        label_kws=dict(
            fontsize=8, color=(.3, 0, 0),
            path_effects=[pe.withStroke(linewidth=1.25, foreground=(1, 1, 1))]
        ),
        outlier_scatter_kws=dict(color=(.8, .2, .1), s=10, linewidth=.6, edgecolor=[0, 0, 0]),
        adjust_text_kws={}, **kwargs ):
    """
    Adapted from Guillaume's answer at
    https://stackoverflow.com/questions/20105364/how-can-i-make-a-scatter-plot-colored-by-density-in-matplotlib
    Scatter plot colored by 2d histogram, with optional trend_line, diagonal, and outlier labeling
    Parameters:
        `x`, `y`: `pandas.Series` with overlapping indices or iterables of the same length. Values to plot on each axis.
        `ax` (`matplotlib.Axis`): if provided, draw plot to this
        `sort` (`bool`): if `True` (default), the densest points are plotted last.
        `bins` (`int`): How many bins to use in np.histogram2d for estimating density. Default 50.
        `trend_line` (`bool`): Whether to draw a lowess trend_line line
        `lowess_args` (`dict`): passed to `lowess_trend` for the trend_line line
        `trend_line_args` (`dict`): passed to `pyplot.plot` for the trend_line line
        `c` ("density" or array): if "density", points will be colored by the square root of point density in the plot.
            Otherwise, passed to `pyplot.scatter`.
        `diagonal` (`bool`): If true, draw a line on the diagonal
        `diagonal_kws` (`dict`): Passed to `pyplot.plot`. By default, draws a thin dashed black diagonal line.
        `label_outliers` (`int`): if > 0, the number of outliers to label with their index.
            If `trend_line`, the outliers will be identified by deviation from the trend.
        'outliers_from':
            'trend': outliers identified by distance from trend line
            'diagonal': outliers identified by difference between `x` and `y`
            'density': outliers identified by minimum distance to plot region of high density
            'xy_zscore': outliers identified by euclidian distance from zero in z-score space
            'x': outliers identified from the `x` values alone
            'y': outliers identified from the `y` values alone
        `label_kws` (`dict`): passed to `pyplot.text` for the labels
        'outlier_scatter_kws': passed to `pyplot.scatter` to plot over outliers
        **kwargs: additional arguments passed to `pyplot.scatter`.
    """
    if ax is None :
        fig = plt.gcf()
        ax = plt.gca()
    else:
        fig = ax.figure
    # If both inputs are Series, restrict to their shared index and remember it
    # so outliers can be labeled by index value rather than position.
    index = None
    if isinstance(x, pd.Series) and isinstance(y, pd.Series):
        x, y = x.align(y, join="inner")
        index = x.index
    if len(x) != len(y):
        raise ValueError("If not pd.Series, x and y must be the same length")
    # Drop pairs where either coordinate is missing; keep `index` in sync.
    mask = pd.notnull(x) & pd.notnull(y)
    x = np.array(x[mask]).astype(float)
    y = np.array(y[mask]).astype(float)
    if not index is None:
        index = index[mask]

    c_is_density = False
    if isinstance(c, str):
        if c == "density":
            c_is_density = True
        else:
            raise ValueError(f"if passed, `c` can't be {c}, only 'density' or iterable.")

    # Density (sqrt-compressed) is needed either for coloring or for
    # density-based outlier identification.
    if c_is_density or outliers_from == "density":
        z = get_density(x, y, bins)
        z = np.sqrt(z)

    if c_is_density:
        c = z
        if cbar_label is None:
            cbar_label = "Density (sqrt)"

    # Sort the points by c, so that the strongest points are plotted last
    if sort :
        idx = c.argsort()
        x, y, c = x[idx], y[idx], c[idx]
        if not index is None:
            index = index[idx]
        if c_is_density:
            z = z[idx]

    im = ax.scatter( x, y, c=c, **kwargs )

    # Build a standalone mappable for the colorbar matching the scatter's colormap.
    norm = Normalize(vmin = np.min(c), vmax = np.max(c))
    colormap = cm.ScalarMappable(norm = norm)
    colormap.set_array([])
    colormap.set_cmap(im.get_cmap())
    cbar = fig.colorbar(colormap, ax=ax)
    cbar.ax.set_ylabel(cbar_label)

    smoothed=None
    if trend_line:
        smoothed = lowess_trend(x, y, **lowess_args)
        xsort = np.argsort(x)
        ax.plot(x[xsort], smoothed[xsort], **trend_line_args)

    # Outliers are positional indices into the (masked, possibly sorted) x/y arrays.
    outliers = None
    if label_outliers:
        if outliers_from == 'trend':
            # NOTE(review): `smoothed` is None here when trend_line=False —
            # presumably identify_outliers_by_trend handles/raises on that; confirm.
            outliers = identify_outliers_by_trend(x, y, label_outliers, smoothed)
        elif outliers_from =='density':
            outliers = identify_outliers_by_density(x, y, z, label_outliers)
        elif outliers_from == 'diagonal':
            outliers = identify_outliers_by_diagonal(x, y, label_outliers)
        elif outliers_from == 'xy_zscore':
            outliers = identify_outliers_by_zscore(x, y, label_outliers)
        elif outliers_from == 'x':
            outliers = identify_outliers_1d(x, label_outliers)
        elif outliers_from == 'y':
            outliers = identify_outliers_1d(y, label_outliers)
        else:
            raise ValueError("`outliers_from` must be one of 'trend', 'density', 'diagonal', 'xy_zscore', 'x', or 'y'")

    # Convert caller-requested labels (index values) to positions, then merge
    # with automatically identified outliers.
    if len(label_specific) and not index is None:
        label_specific = [index.get_loc(v) for v in label_specific]

    if not outliers is None:
        label_specific = sorted(set(label_specific) | set(outliers))

    if not index is None:
        labels = index[label_specific]
    else:
        labels = label_specific

    if len(label_specific):

        label_x = np.array([x[label] for label in label_specific])
        label_y = np.array([y[label] for label in label_specific])
        plt.scatter(label_x, label_y, **outlier_scatter_kws)

        #prevent overlapping point text labels from failing to differentiate with adjust_text
        label_x_jittered = label_x + np.random.normal(size=len(label_x), scale=.05*(x.max() - x.min()))
        label_y_jittered = label_y + np.random.normal(size=len(label_y), scale=.05*(y.max() - y.min()))
        texts = [plt.text(s=labels[i], x=label_x_jittered[i], y=label_y_jittered[i], zorder=10, **label_kws)
            for i, val in enumerate(label_specific)]

        if adjust_text_present and len(texts) > 0:

            base_adjust_text_kws = dict(
                lim=500,
                target_x=label_x, target_y=label_y,
                arrowprops=dict(arrowstyle="-", color=[.7, .5, .5]),
                expand=(1.2, 1.4),
                force_explode=(.3, .5),
                avoid_self=True

            )
            # caller-supplied options win over the defaults above
            base_adjust_text_kws.update(adjust_text_kws)
            adjust_text(texts, **base_adjust_text_kws)

        elif len(texts) > 0:
            warn("adjustText not found. Install to have labels moved off points.")

    if not diagonal:
        return ax

    # Draw y=x spanning the union of both axes' current limits.
    minmin = min(ax.get_xlim()[0], ax.get_ylim()[0])
    maxmax = max(ax.get_xlim()[1], ax.get_ylim()[1])
    ax.plot([minmin, maxmax], [minmin, maxmax], **diagonal_kws)

    return ax
def binplot(x, y, binned_axis='x', nbins=10, endpoints=None, right=False, ax=None, colors=None, cbar_label='Number Samples', **kwargs):
    '''
    creates a plot with values binned into boxes along one axis.
    Params:
        x: iterable of numbers indicating position on x axis
        y: iterable of numbers indicating position on y axis.
        binned_axis (str): 'x' or 'y', the axis to bin ('x' default)
        nbins (int): number of discrete bins that will be created
        endpoints (None or tuple of two numbers): The right/top edge of the first bin and the left/bottom edge of the last bin. If provided,
            the first and last bins will include points in [-infinity, endpoints[0]] and [endpoints[1], +infinity] respectively. Other bins
            will be evenly spaced between them. If endpoints is None (default), bins will be evenly spaced between the minimum and maximum
            data points.
        right (bool): whether points falling on an edge are included in the left or right bin.
        ax (None or pyplot.Axis): axis to draw plot on (default or None draws to current axis)
        colors (None or str or iterable of RGBA values): color palette used to color the bins.
            If a str, treated as a matplotlib colormap name and boxes are shaded by bin count;
            if an iterable of colors, it is used as-is and no colorbar is drawn.
        cbar_label (str): label for the colorbar (only drawn when `colors` is a colormap name)
    Additional keyword arguments are passed to pyplot.boxplot.
    Returns:
        the pyplot.Axis drawn to
    Raises:
        ValueError if binned_axis is not 'x' or 'y', or if any point falls beyond the last bin edge.
    '''
    # Validate pairing BEFORE converting to arrays. (Previously this check sat
    # after the conversion, where isinstance(x, pd.Series) was always False.)
    if isinstance(x, pd.Series) and isinstance(y, pd.Series):
        x, y = x.align(y, join="inner")
        assert len(x) > 2, "x and y lack common indices"
    else:
        assert len(x) == len(y), "if x and y are not Series, they must be the same length"
    # Drop pairs with a missing coordinate.
    mask = pd.notnull(x) & pd.notnull(y)
    x = np.array(x)[mask]
    y = np.array(y)[mask]
    if colors is None:
        colors = "viridis"

    if binned_axis == 'x':
        unbinned = 'y'
        vert = True
    elif binned_axis == 'y':
        unbinned = 'x'
        vert = False
    else:
        raise ValueError("binned_axis must be 'x' or 'y'")

    df = pd.DataFrame({'x': x, 'y': y})

    # `bins` are the nbins+1 edges; `medians` are the bin centers used as the
    # boxplot positions. With explicit endpoints the outermost bins are open
    # (+/- inf) and get synthetic centers half a bin-width outside the edges.
    if endpoints is None:
        bins = np.linspace(df[binned_axis].min()-1e-12, df[binned_axis].max()+1e-12, nbins+1)
        space = bins[2] - bins[1]
        medians = .5*(bins[1:] + bins[:-1])
    else:
        bins = [-np.inf] + list(np.linspace(endpoints[0], endpoints[1], nbins-1)) + [np.inf]
        space = bins[2] - bins[1]
        medians = np.array(
            [bins[1] - .5*space] + list(.5*np.array(bins[2:-1]) + .5*np.array(bins[1:-2])) + [bins[-2] + .5*space]
        )

    digits = np.digitize(df[binned_axis], bins, right=right).astype(int)

    if any(digits > nbins):
        # Previously print + `assert False`; raise a real error instead
        # (assertions disappear under `python -O`).
        raise ValueError(
            "points fall beyond the last bin edge:\n%r"
            % df[binned_axis][digits > nbins]
        )
    # Replace each value by its bin center.
    df[binned_axis] = medians[digits-1]

    if ax is None:
        ax=plt.gca()
    else:
        plt.sca(ax)
    # Only draw boxes for non-empty bins.
    vals = sorted([val for val in sorted(medians) if (df[binned_axis] == val).sum() > 0])

    boxes = plt.boxplot(x=[df[df[binned_axis] == val][unbinned]
            for val in vals
        ],
        positions=vals, widths=[.9*space]*len(vals), patch_artist=True, vert=vert,

        **kwargs)

    # Shade boxes by (log-scaled) number of samples per bin.
    counts = df[binned_axis].value_counts().reindex(index=medians).fillna(0)
    normer = LogNorm(vmin=0)
    normer.autoscale(counts.values)
    cvals = normer(counts.values)
    cmap = None  # stays None when the caller supplied explicit colors
    if isinstance(colors, str):
        cmap = colormaps[colors]
        colors = [cmap(v) for v in cvals]
    for box, color in zip(boxes['boxes'], colors):
        box.set_facecolor(color)


    if binned_axis == 'x':
        ax.xaxis.set_major_formatter(FormatStrFormatter('%.2f'))
        if not endpoints is None:
            plt.xticks(bins[1:-1])
        else:
            plt.xticks(bins)
    else:
        ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
        if not endpoints is None:
            plt.yticks(bins[1:-1])
        else:
            plt.yticks(bins)
    if binned_axis == 'x':
        plt.xlim(bins[1] - 1.2*space, bins[-2] + 1.2 * space)
    else:
        plt.ylim(bins[1] - 1.2*space, bins[-2] + 1.2 * space)

    # The colorbar only makes sense when a colormap drove the shading.
    # (Previously `cmap` was a NameError here for list-valued `colors`,
    # silently hidden by a bare `except: pass`.)
    if cmap is not None:
        try:
            mappable=ScalarMappable(norm=normer, cmap=cmap)
            mappable.set_array(colors)
            plt.gcf().colorbar(mappable, ax=plt.gca(), label=cbar_label)
        except Exception:
            # best-effort: a failed colorbar should not kill the plot
            pass

    return ax
def load_chronos_data_for_qc(directory, gene_effect_file="gene_effect.hdf5"):
    '''
    Load a Chronos run saved to `directory` with the `Chronos.save` method into
    a `dict` suitable for passing to the qc report functions in this module.
    Parameters:
        `directory` (`str`): location of the saved run
        `gene_effect_file` (`str`): optionally specify a different file in the directory where gene effect is
            saved. This can be used to load a copy-number corrected version of the data. Must be in Chronos'
            h5 format.
    Returns:
        `dict` containing the results of the run with the keys expected by the qc report functions in this module.
    '''
    # One "<library>_sequence_map.csv" file exists per library in the run.
    libraries = []
    for fname in os.listdir(directory):
        if fname.endswith("sequence_map.csv"):
            libraries.append(fname.split('_')[0])

    def _path(fname):
        # all run artifacts live directly inside `directory`
        return os.path.join(directory, fname)

    # Global (non-per-library) outputs.
    data = {}
    data['gene_effect'] = read_hdf5(_path(gene_effect_file))
    data['library_effect'] = pd.read_csv(_path("library_effect.csv"), index_col=0)
    data['t0_offset'] = pd.read_csv(_path("t0_offset.csv"), index_col=0)
    data['guide_efficacy'] = pd.read_csv(_path("guide_efficacy.csv"), index_col=0)["efficacy"]
    data['replicate_efficacy'] = pd.read_csv(_path("replicate_efficacy.csv"), index_col=0)
    data['growth_rate'] = pd.read_csv(_path("growth_rate.csv"), index_col=0)

    # Per-library outputs, each stored as {library: value}.
    per_library_keys = ['readcounts', 'sequence_map', 'guide_map', 'excess_variance',
        'predicted_readcounts', 'predicted_logfoldchange']
    for key in per_library_keys:
        data[key] = {}
    for library in libraries:
        data['readcounts'][library] = read_hdf5(_path("%s_readcounts.hdf5" % library))
        data['sequence_map'][library] = pd.read_csv(_path("%s_sequence_map.csv" % library))
        data['guide_map'][library] = pd.read_csv(_path("%s_guide_gene_map.csv" % library))
        # excess variance for all libraries lives in one file, one column per library
        data['excess_variance'][library] = pd.read_csv(
            _path("screen_excess_variance.csv"), index_col=0)[library]
        data['predicted_readcounts'][library] = read_hdf5(_path("%s_predicted_readcounts.hdf5" % library))
        data['predicted_logfoldchange'][library] = read_hdf5(_path("%s_predicted_lfc.hdf5" % library))

    # Observed log fold change is derived from readcounts rather than stored.
    data["logfoldchange"] = {}
    for library in libraries:
        fc = calculate_fold_change(
            data["readcounts"][library],
            data["sequence_map"][library],
            rpm_normalize=False
        )
        data['logfoldchange'][library] = pd.DataFrame(
            np.log2(fc.values),
            index=fc.index, columns=fc.columns
        )
    return data
def get_naive(data):
    '''
    Computes naive gene effect per library by taking the median
    over guides/gene and then over replicates/line within each library.
    Parameters:
        `data` (`dict`): must have keys "logfoldchange", "guide_map", and "sequence_map"
    returns:
        `dict`[`pandas.DataFrame`] holding naive gene effect estimates.
    '''
    out = {}
    for library, lfc in data["logfoldchange"].items():
        # sgrna -> gene and sequence_ID -> cell line lookups for this library
        gene_of_guide = data['guide_map'][library].set_index("sgrna").gene
        line_of_sequence = data['sequence_map'][library].set_index("sequence_ID").cell_line_name
        # collapse guides to genes (columns), then replicates to cell lines (rows)
        per_gene = lfc.T.groupby(gene_of_guide).median().T
        out[library] = per_gene.groupby(line_of_sequence).median()
    return out


def mean_collapse_dataframes(dfs):
    '''
    Given an iterable of pandas DataFrames, returns a single dataframe
    where each value is given by the mean value for the same index/column
    across the input DataFrames, ignoring NaNs.
    '''
    # Running sum of values and count of non-null observations per cell.
    total = None
    count = None
    for df in dfs:
        if total is None:
            total = df.fillna(0)
            count = df.notnull().astype(int)
        else:
            # grow both accumulators to the union of indices/columns
            total, df = total.align(df, join='outer')
            total = total.fillna(0)
            count, total = count.align(total, join="right")
            count = count.fillna(0)
            total = total + df.fillna(0).values
            count = count + df.notnull().values
    # cells never observed become NaN instead of 0/0
    return total.mask(count == 0) / count.replace(0, np.nan)


def sum_collapse_dataframes(dfs):
    '''
    Given an iterable of pandas DataFrames, returns a single dataframe
    where each value is given by the sum of values for the same index/column
    across the input DataFrames, filling NaNs with 0.
    '''
    total = None
    for df in dfs:
        if total is None:
            total = df.fillna(0)
        else:
            total, df = total.align(df, join='outer')
            total = total.fillna(0) + df.fillna(0).values
    return total


def qc_compare_plot(plot_func, data, data_key, metrics, plot_width, plot_height, **kwargs):
    '''
    A convenience method for comparing results from two different runs side by side
    Parameters:
        `plot_func` (`function`): a plotting function that accepts an object of the type `data[data_key]`
            and a `metrics` key word argument and plots to the current matplotlib axis
        `data` (`dict`): dict containing data to plot
        `data_key` (`str`): the entry in the `data` that will be plotted
        `metrics` (`dict`): passed to `plot_func`
        `plot_width`, `plot_height`: desired (total) plot size in inches
        Additional kwargs passed to `plot_func`
    Returns:
        `matplotlib.Figure`
    '''
    fig, axes = plt.subplots(1, 2, figsize=(plot_width, plot_height))
    for position, run in enumerate(data):
        plt.sca(axes[position])
        plt.title(run)
        plot_func(data[run][data_key], metrics=metrics[run], **kwargs)
    plt.tight_layout()
    return fig
def qc_initial_data(title, readcounts, sequence_map, guide_map, negative_control_sgrnas=None, positive_control_sgrnas=None,
        report_name=None, directory='./', plot_width=7.5, plot_height=3.25,
        doc_args=dict(
            pagesize=letter, rightMargin=.5*inch, leftMargin=.5*inch,
            topMargin=.5*inch,bottomMargin=.5*inch
        ),
        specific_plot_dimensions={}
    ):
    '''
    QC the data that would be passed to Chronos. This can be helpful to develop a sense of data quality but also to exclude
    bad results.
    Parameters:
        `title` (`str`): the report title, printed on first page
        `readcounts` (`pd.DataFrame`): read numbers for each pDNA and late timepoint as rows with sgRNAs as columns.
            Do not need to be normalized.
        `sequence_map` (`pd.DataFrame`): map of sequences for both pDNA and late replicates to cell lines, timepoints, and pDNA batches.
            See `chronos.Chronos` for format.
        `guide_map` (`pd.DataFrame`): map of sgRNAs to genes. Must include the columns 'sgrna' and 'gene'.
        `negative_control_sgrnas`, `positive_control_sgrnas` (ordered indexable of `str`): optional guides where no effect or
            a strong depleting effect is expected, respectively. If not provided a number of the more useful QC metrics can't
            be calculated.
        `report_name` (`str`): an optional file name for the report. If none is provided, `title` + '.pdf' will be used.
        `directory` (`str`): where the report and figure panels will be generated.
        `plot_width`, `plot_height` (`float`): size of plots that will be put in the report in inches.
        `doc_args` (`dict`): additional arguments will be passed to `SimpleDocTemplate`.
        `specific_plot_dimensions` (`dict` of 2-tuple): if a plot's name is present, will use the value
            to specify dimensions for that plot instead of deriving them from `plot_width` and `plot_height`
    Returns:
        `dict` containing the calculated QC metrics, which will also be in the report.
    '''
    if report_name is None:
        report_name = title + ".pdf"
    doc = SimpleDocTemplate(os.path.join(directory, report_name), **doc_args)
    styles=getSampleStyleSheet()
    story = []
    metrics = {}

    def add_image(filename):
        # Save the current figure to `directory` and append it to the report story.
        fig = plt.gcf()
        label = '.'.join(filename.split('.')[:-1])
        if label in specific_plot_dimensions:
            fig.set_size_inches(specific_plot_dimensions[label])
        width, height = fig.get_size_inches()
        plt.tight_layout()
        fig.savefig(os.path.join(directory, filename))
        plt.close(fig)
        im = Image(os.path.join(directory, filename), width*inch, height*inch)
        story.append(im)
        story.append(Spacer(.125, 12))

    normalized = normalize_readcounts(readcounts, negative_control_sgrnas, sequence_map)
    lfc = np.log2(calculate_fold_change(normalized, sequence_map,rpm_normalize=False))

    print("calculating replicate correlation")
    # For each cell line, correlate every replicate with the others and record the mean.
    mean_corrs = []
    for line in sequence_map.cell_line_name.unique():
        if line == 'pDNA':
            continue
        reps = sequence_map.query("cell_line_name == %r" % line).sequence_ID
        corrs = fast_cor(lfc.loc[reps].T)
        np.fill_diagonal(corrs.values, np.nan)  # exclude self-correlation
        mean_corrs.append(corrs.mean())
    metrics['MeanReplicateCorr'] = pd.concat(mean_corrs)
    metrics["ReplicateCorrWithMean"] = lfc.corrwith(lfc.mean(), axis=1)
    # Ten lines with the worst (lowest) minimum replicate correlation.
    worst = metrics['MeanReplicateCorr']\
        .groupby(sequence_map.set_index("sequence_ID").cell_line_name)\
        .min()\
        .sort_values().dropna().index[:10]

    def get_nnmd(x):
        return nnmd(x[positive_control_sgrnas], x[negative_control_sgrnas])
    def get_roc_auc_score(x):
        return auroc(x[positive_control_sgrnas], x[negative_control_sgrnas])

    if not negative_control_sgrnas is None and not positive_control_sgrnas is None:
        print("generating control separation metrics")
        # Restrict controls to guides actually present in the data.
        negative_control_sgrnas = sorted(set(negative_control_sgrnas) & set(readcounts.columns))
        if not len(negative_control_sgrnas):
            raise ValueError(
                "none of the negative control sgRNAs found in readcounts columns:\n%r"
                % negative_control_sgrnas
            )
        positive_control_sgrnas = sorted(set(positive_control_sgrnas) & set(readcounts.columns))
        if not len(positive_control_sgrnas):
            # Fixed: this message previously (and wrongly) said "negative control sgRNAs".
            raise ValueError(
                "none of the positive control sgRNAs found in readcounts columns:\n%r"
                % positive_control_sgrnas
            )
        metrics['NNMD'] = lfc.apply(get_nnmd, axis=1)
        metrics['AUROC'] = lfc.apply(get_roc_auc_score, axis=1)
        metrics["PosConMedian"] = lfc[positive_control_sgrnas].median(axis=1)
        metrics["NegConMedian"] = lfc[negative_control_sgrnas].median(axis=1)
        metrics["NegConSD"] = lfc[negative_control_sgrnas].std(axis=1)
        worst_sep = metrics['AUROC']\
            .groupby(sequence_map.set_index("sequence_ID").cell_line_name)\
            .min()\
            .sort_values().dropna().index[:10]
        # keep only lines that are bad by BOTH replicate correlation and control separation
        worst = sorted(set(worst) & set(worst_sep))

    else:
        print("One or both control groups not supplied, skipping control separation metrics")
    story.append(Paragraph(title, style=styles["Heading1"]))

    print("Plotting log fold-change distribution")
    story.append(Paragraph("sgRNA Log Fold-Change Distribution", style=styles["Heading2"]))
    story.append(Paragraph(
        "For a traditional genome-wide loss of viability experiment we expect the bulk of log fold change \
scores near 0, with a long left tail of true viability depletion."
    ))

    sns.kdeplot(lfc.stack(), label="All sgRNAs", fill=True, color="gray", bw_adjust=.25)
    if not negative_control_sgrnas is None:
        sns.kdeplot(lfc[negative_control_sgrnas].stack(), label="Negative Controls sgRNAs",
            color=[.3, .1, .9], bw_adjust=.25)
    if not positive_control_sgrnas is None:
        sns.kdeplot(lfc[positive_control_sgrnas].stack(), label="Positive Controls sgRNAs",
            color=[.9, .2, 0], bw_adjust=.25)
    plt.legend()
    plt.xlabel("Log Fold-Change of late timepoints from pDNA")
    plt.gcf().set_size_inches((plot_width, plot_height))
    add_image("lfc_distribution.png")

    if 'NNMD' in metrics:
        print("plotting control separation metrics")
        story.append(Paragraph("Control QC Metrics", style=styles["Heading2"]))
        story.append(Paragraph(
            "Depletion of positive controls is a positive signal for screen quality, while \
high standard deviation in negative controls is a negative signal for screen quality. \
However, these measures tend to be negatively correlated in CRISPR screens: screens that show \
the greatest dropout of essential genes also have the greatest noise in nonessential genes."
        ))

        fig, axes = plt.subplots(1, 2, figsize=(plot_width, plot_height))

        plt.sca(axes[0])
        density_scatter(metrics["PosConMedian"] - metrics["NegConMedian"],
            metrics["NegConSD"],
            label_outliers=4,
            alpha=.5)
        plt.xlabel("Pos. Con. median LFC")
        plt.ylabel("Neg. Con. SD")

        story.append(Paragraph(
            "The null-normalized median difference (NNMD) is"
        ))
        story.append(Paragraph(
            "\t\t((median(positive controls) - median(negative controls)) / mad(negative controls)"
        ))
        story.append(Paragraph(
            "In Project Achilles, we look for NNMD scores below -1.25 to consider a replicate passing \
but this threshold depends strongly on the controls you have chosen. \
We also provide the area under the ROC curve for separating the positive and negative control \
log fold changes. These measures should have a strong negative correlation."
        ))
        plt.sca(axes[1])
        density_scatter(metrics["NNMD"], metrics["AUROC"], label_outliers=4, outliers_from="xy_zscore",
            alpha=.5)
        plt.xlabel("NNMD")
        plt.ylabel("AUROC")


        add_image("control_sep.png")

    if metrics["MeanReplicateCorr"].any():
        story.append(Paragraph("Replicate Correlation", style=styles["Heading2"]))
        story.append(Paragraph(
            "Below is the Pearson correlation of replicate Log Fold-Change with the mean LFC over all replicates (x axis) vs \
the mean correlation with other replicates of the same cell line (y axis). Generally these are closely related \
and correlate with other measures of screen quality."))
        density_scatter(metrics["ReplicateCorrWithMean"], metrics["MeanReplicateCorr"],
            label_outliers=5)
        plt.xlabel("Replicate R with Mean LFC")
        plt.ylabel("Mean Replicate R with same line")
        add_image("replicate_correlations.png")

    story.append(PageBreak())
    story.append(Paragraph("Details for worst performing cell lines", style=styles["Heading2"]))
    story.append(Paragraph(
        "For a dozen or so of the lines with the worst quality metrics, more details are given below. \
It can be useful to look at the replicate-replicate plots carefully for effects such as"
    ))
    story.append(Paragraph("\t- dropouts that aren't shared between replicates"))
    story.append(Paragraph(
        "\t- extreme outgrowths (whether shared or not). \
These are concerning unless there is a sound biological reason \
such as tumor suppressor KO or your experiment is a rescue experiment."
    ))
    story.append(Paragraph(""))
    story.append(Paragraph(
        "We also show reads in the late timepoints compared to the pDNA. If control groups are provided, these are broken \
out separately. We expect negative control sgRNAs to be closely aligned to pDNA abundance, while positive control \
sgRNAs should tend to fall below the diagonal. Note that each axis is the log(normalized counts + 1)."))
    for line in worst:
        story.append(PageBreak())
        story.append(Paragraph(line, style=styles["Heading3"]))
        all_replicate_plot(normalized, sequence_map, line, plot_width)
        add_image("%s_rep_plot.png" % line)
        paired_pDNA_plots(normalized, sequence_map, line, negative_control_sgrnas, positive_control_sgrnas,
            plot_width, plot_height)
        add_image("%s_pdna_plot.png" % line)

    doc.build(story)

    return metrics
400 | `mutation_matrix` (`pandas.DataFrame`): optional boolean matrix of cell line by gene. 401 | Each value indicates that the gene has a gain of function mutation in that cell line. 402 | Genes should be selected such that a gain of function mutation is expected to make the cell line 403 | dependent on that gene. Tbhis is used to evaluate the separation of gene effects for that gene 404 | between mutated and wildtype cell lines. 405 | `addiction_expressions` (`pandas.DataFrame`): optional `float` matrix of cell lines by genes containing 406 | expressions. The genes should be chosen such that cell lines highly expressing the gene are expected 407 | to be dependent on it, while other cell lines are not. 408 | `copy_number` (`pandas.DataFrame`): optional cell line by gene `float` matrix of logged copy number counts. Used to QC the copy 409 | number effect. 410 | `report_name` (`str`): an optional file name for the report. If none is provided, `title` + '.pdf' will be used. 411 | `directory` (`str`): where the report and figure panels will be generated. 412 | `gene_effect_file` (`str`): If `data` is a path to a directory, this arg is passed to `load_chronos_data_for_qc`. 413 | `plot_width`, `plot_height` (`float`): size of plots that will be put in the report in inches. 414 | `doc_args` (`dict`): additional arguments will be passed to `SimpleDocTemplate`. 415 | `specific_plot_dimensions` (`dict` of 2-tuple`): if a plot's name is present, will use the the value 416 | to specify dimensions for that plot instead of deriving them from `plot_width` and `plot_height` 417 | Returns: 418 | `dict` containing the calculated QC metrics, which will also be in the report. 419 | ''' 420 | if isinstance(data, str): 421 | try: 422 | print("Loading data from %s" % data) 423 | data = load_chronos_data_for_qc(data, gene_effect_file) 424 | except IOError: 425 | raise ValueError("If `data` is a string, it must be the path to a directory containing Chronos saved data. 
\ 426 | gene_effect_file must be the name of an hdf5 file in that directory. \ 427 | You passed '%s', %r" % (data, gene_effect_file)) 428 | if not isinstance(data, dict): 429 | raise ValueError("`data` must be a `dict` of data or a string pointing to Chronos saved directory") 430 | required_data_keys = ["gene_effect", "sequence_map", "guide_map", "guide_efficacy", 431 | "predicted_readcounts", "readcounts", 432 | "logfoldchange", 'predicted_logfoldchange', 433 | "excess_variance", "growth_rate", "replicate_efficacy", 434 | "t0_offset", "library_effect" 435 | ] 436 | for key in required_data_keys: 437 | if not key in data: 438 | raise ValueError("`data` missing required entry %s" % (key)) 439 | library_data = { 440 | library: { 441 | key: data[key][library] 442 | for key in ['readcounts', 'predicted_readcounts', 443 | 'logfoldchange', 'predicted_logfoldchange', 444 | "excess_variance" 445 | ] 446 | } 447 | for library in data['readcounts'] 448 | } 449 | orig_working_dir = os.getcwd() 450 | if report_name is None: 451 | report_name = title + ".pdf" 452 | doc = SimpleDocTemplate(os.path.join(directory, report_name), **doc_args) 453 | styles=getSampleStyleSheet() 454 | story = [] 455 | metrics = {} 456 | 457 | 458 | def add_image(filename): 459 | fig = plt.gcf() 460 | label = '.'.join(filename.split('.')[:-1]) 461 | if label in specific_plot_dimensions: 462 | fig.set_size_inches(specific_plot_dimensions[label]) 463 | width, height = fig.get_size_inches() 464 | plt.tight_layout() 465 | fig.savefig(os.path.join(directory, filename)) 466 | plt.close(fig) 467 | im = Image(os.path.join(directory, filename), width*inch, height*inch) 468 | story.append(im) 469 | story.append(Spacer(.125, 12)) 470 | 471 | 472 | story.append(Paragraph(title, style=styles["Heading1"])) 473 | 474 | story.append(Paragraph("Control Separation", style=styles["Heading2"])) 475 | print("plotting global control separation") 476 | story.append(Paragraph("Global Control Separation", 
style=styles["Heading3"])) 477 | story.append(Paragraph( 478 | "Separation of positive/negative control genes both overall and by screen. \ 479 | More negative NNMD is better." 480 | )) 481 | fig, axes = plt.subplots(1, 2, figsize=(plot_width, plot_height)) 482 | plt.sca(axes[0]) 483 | control_histogram(data["gene_effect"], positive_control_genes, 484 | negative_control_genes, metrics=metrics) 485 | plt.sca(axes[1]) 486 | screen_nnmd_auroc_scatter(data["gene_effect"], positive_control_genes, 487 | negative_control_genes, metrics=metrics) 488 | add_image("global_controls.png") 489 | 490 | if (not mutation_matrix is None) or (not addiction_expressions is None): 491 | print("plotting selective dependency separation") 492 | story.append(Paragraph("Selective Control Separation", style=styles["Heading3"])) 493 | story.append(Paragraph( 494 | "Separation of known selective dependencies between indications. \ 495 | On the left, known oncogene gene effects are compared between models where \ 496 | a known oncogenic GoF mutation occurred in that gene vs the rest, if `mutation_matrix` is supplied. \ 497 | On the right, we test expression addictions using a one-tailed test on pearson correlations, \ 498 | if `addiction_expressions` is supplied. \ 499 | The FDRs should be considered optimistic." 
500 | )) 501 | fig, axes = plt.subplots(1, 2, figsize=(plot_width, plot_height)) 502 | plt.sca(axes[0]) 503 | if not mutation_matrix is None: 504 | selective_mutated_vs_not_scatter(data["gene_effect"], mutation_matrix, metrics=metrics) 505 | plt.sca(axes[1]) 506 | if not addiction_expressions is None: 507 | expression_addiction_volcano(data["gene_effect"], addiction_expressions, metrics=metrics) 508 | if (not mutation_matrix is None) or (not addiction_expressions is None): 509 | add_image("selective_dependencies.png") 510 | story.append(PageBreak()) 511 | 512 | 513 | story.append(Paragraph("General Parameter Info", style=styles["Heading2"])) 514 | 515 | story.append(Paragraph("Statistical Properties of Gene Effects", style=styles["Heading3"])) 516 | print("plotting gene effect mean relationships") 517 | story.append(Paragraph( 518 | "Higher overall gene SD is better (if control separation in each cell line is maintained). There is usually a trend \ 519 | towards more variance in more negative genes. There should NOT be a trend in the second plot." 520 | )) 521 | fig, axes = plt.subplots(1, 1, figsize=(plot_width, plot_height)) 522 | mean_vs_sd_scatter(data["gene_effect"], metrics=metrics) 523 | 524 | if not copy_number is None: 525 | print("plotting copy number effect") 526 | story.append(Paragraph("Copy Number Effect", style=styles["Heading3"])) 527 | story.append(Paragraph( 528 | "Relationship of genomic copy number to estimated gene effect both overall (left) and per gene binned \ 529 | by gene mean (right). Ideally there is no systematic relationship." 
530 | )) 531 | fig, axes = plt.subplots(1, 2, figsize=(plot_width, plot_height)) 532 | plt.sca(axes[0]) 533 | copy_number_trend(data['gene_effect'], copy_number, downsample=.01, downsample_lower_quantile_bound=.01, 534 | downsample_upper_quantile_bound=.99, metrics=metrics) 535 | plt.sca(axes[1]) 536 | copy_number_gene_corrs(data['gene_effect'], copy_number, metrics=metrics) 537 | add_image("copy_number_effect.png") 538 | 539 | print("plotting screen efficacy and growth rate") 540 | story.append(Paragraph("Screen Efficacy, Growth Rate, and Guide Efficacy", style=styles["Heading3"])) 541 | story.append(Paragraph( 542 | "These parameters together translate a gene effect into the expected impact on cell proliferation. \ 543 | Often there will be a trend towards lower growth estimates with lower cell efficacy estimates. \ 544 | Guide efficacies have a single global value, but here have been grouped by presence in a library. \ 545 | They should have a high peak near 1.")) 546 | 547 | growth_rate = [] 548 | replicate_efficacy = [] 549 | 550 | for library in library_data: 551 | 552 | gr, cle = data["growth_rate"].query("library == %r" % library)["growth_rate"].dropna().align( 553 | data['replicate_efficacy'].query("library == %r" % library)["replicate_efficacy"].dropna(), 554 | join="inner" 555 | ) 556 | 557 | growth_rate.append(gr) 558 | replicate_efficacy.append(cle) 559 | 560 | growth_rate, replicate_efficacy = pd.concat(growth_rate), pd.concat(replicate_efficacy) 561 | fig, axes = plt.subplots(1, 2, figsize=(plot_width, plot_height)) 562 | plt.sca(axes[0]) 563 | density_scatter(growth_rate, replicate_efficacy, trend_line=False, outliers_from="xy_zscore") 564 | plt.xlabel("Relative Growth Rate") 565 | plt.ylabel("Replicate Screening Efficacy") 566 | metrics["growth_rate_sd"] = growth_rate.std() 567 | metrics["cell_efficacy_mean"] = replicate_efficacy.mean() 568 | plt.sca(axes[1]) 569 | for library, guide_map in data['guide_map'].items(): 570 | guides = 
guide_map.sgrna.unique() 571 | efficacies = data['guide_efficacy'].reindex(guides).dropna() 572 | sns.kdeplot(efficacies, bw_adjust=.5, lw=1, label=library) 573 | metrics["guide_eff_%s_mean" % library] = efficacies.mean() 574 | plt.legend() 575 | plt.xlabel("Guide Efficacy") 576 | add_image("parameter_distributions.png") 577 | story.append(PageBreak()) 578 | 579 | if len(data['guide_map']) > 1: 580 | print("plotting library integration") 581 | story.append(Paragraph("Library Integration", style=styles["Heading2"])) 582 | story.append(Paragraph( 583 | "The UMAP embedding of cell line gene effects colored by library presence (left) and how \ 584 | far a gene's average within a library deviates from the overall average, by library (right). \ 585 | The UMAP embedding uses only the 50% most variable genes. \ 586 | On the right, a lowess trend is fitted per library to the squared difference of the gene's mean within \ 587 | models screened with the library and its mean overall." 588 | )) 589 | fig, axes = plt.subplots(1, 2, figsize=(plot_width, plot_height)) 590 | plt.sca(axes[0]) 591 | check_integration_umap(data['gene_effect'], data['sequence_map'], metrics=metrics) 592 | plt.sca(axes[1]) 593 | check_integration_mean_deviation(data['gene_effect'], data['sequence_map'], metrics=metrics) 594 | story.append(Paragraph("Prediction Accuracy", style=styles["Heading2"])) 595 | add_image("library_integration.png") 596 | story.append(PageBreak()) 597 | 598 | print("plotting readcount predictions") 599 | story.append(Paragraph("Predictions", style=styles["Heading2"])) 600 | story.append(Paragraph("Readcount Predictions", style=styles["Heading3"])) 601 | story.append(Paragraph( 602 | "Chronos' readcount predictions should generally line up well with observation, but it will predict \ 603 | greater than observed readcounts for cases with very few counts." 
604 | )) 605 | 606 | def plot_func(x): 607 | predicted_vs_observed_readcounts( 608 | x["predicted_readcounts"], x['readcounts'], 609 | metrics=metrics) 610 | fig, axes = dict_plot(library_data, plot_func, plot_width) 611 | add_image("readcount_predictions.png") 612 | 613 | print("plotting LFC predictions") 614 | story.append(Paragraph("Log Fold-Change Predictions", style=styles["Heading3"])) 615 | story.append(Spacer(.125, 12)) 616 | story.append(Paragraph( 617 | "Screens with greater excess variance (overdispersion) should have worse correlation between \ 618 | observed LFC and Chronos' predictions." 619 | )) 620 | def plot_func(x): 621 | lfc_corr_vs_excess_variance( 622 | x["predicted_logfoldchange"], x['logfoldchange'], x['excess_variance'], 623 | metrics=metrics) 624 | fig, axes = dict_plot(library_data, plot_func, plot_width) 625 | add_image("lfc_corr_vs_excess_variance.png") 626 | story.append(PageBreak()) 627 | 628 | 629 | print("plotting difference from naive gene score") 630 | naive = get_naive(data) 631 | naive_collapsed = mean_collapse_dataframes(naive.values()) 632 | story.append(Paragraph("Gene Score Difference from Naive", style=styles["Heading2"])) 633 | story.append(Paragraph( 634 | "Comparing the gene effect scores to a naive score estimated as log fold change median per guide/replicate \ 635 | within libraries, then the mean across libraries. The first plots show the correlation of individual genes, both vs mean effect \ 636 | and vs the difference of means between \ 637 | the supplied and naive gene effects. Below is the direct comparison of gene means and a comparison of the most extreme \ 638 | values for each gene's score." 
639 | )) 640 | fig, axes = plt.subplots(1, 2, figsize=(plot_width, plot_height)) 641 | plt.sca(axes[0]) 642 | gene_corr_vs_mean(naive_collapsed, data['gene_effect'], 643 | metrics=metrics) 644 | plt.sca(axes[1]) 645 | gene_corr_vs_mean_diff(naive_collapsed, data['gene_effect'], 646 | metrics=metrics) 647 | plt.xlabel("Naive Mean - Gene Effect Mean") 648 | add_image("gene_corrs.png") 649 | 650 | fig, ax = plt.subplots(1, 1, figsize=(plot_width, plot_width - 2)) 651 | plt.sca(ax) 652 | density_scatter(naive_collapsed.mean(), data['gene_effect'].mean(), diagonal=True, 653 | label_outliers=10, alpha=.5, s=10) 654 | plt.title("Mean Gene Effect") 655 | plt.xlabel("Naive") 656 | plt.ylabel("Gene Effect") 657 | add_image("gene_means.png") 658 | fig, ax = plt.subplots(1, 1, figsize=(plot_width, plot_width - 2)) 659 | plt.sca(ax) 660 | gene_outlier_plot(naive_collapsed, data['gene_effect'], metrics=metrics) 661 | plt.title("Most Extreme Z-Scores by Gene") 662 | plt.xlabel("Gene Effect Extreme ZScore") 663 | plt.ylabel("Naive Extreme ZScore") 664 | add_image("gene_zscore_extremes.png") 665 | story.append(PageBreak()) 666 | 667 | print("summarizing") 668 | ge_mean = data['gene_effect'].mean() 669 | cell_line_mean = data['gene_effect'].mean(axis=1).std()/ge_mean.std() 670 | naive_means = {key: v.mean() for key, v in naive.items()} 671 | 672 | naive_corr_text = '\n'.join([ 673 | '\t%s: %1.3f' % (key, v.corr(ge_mean)) 674 | for key, v in naive_means.items() 675 | ]) 676 | story.insert(1, Paragraph( 677 | ''' 678 | Summary: the standard deviation (SD) of gene means in gene effect is %1.3f.\n 679 | The mean of gene SDs is %1.3f the SD of gene means.\n 680 | The SD of cell line means is %1.3f the SD of gene means\n. 
681 | The correlation of each library's mean LFC per gene with Chronos' mean gene effect is:\n 682 | %s 683 | ''' % (ge_mean.std(), metrics['mean_SD:SD_means'], cell_line_mean, naive_corr_text) 684 | )) 685 | 686 | print("plotting genes with low agreement with naive gene effect") 687 | story.append(Paragraph("Exploring Low Agreement Genes", style=styles['Heading2'])) 688 | story.append(Spacer(.125, 12)) 689 | story.append(Paragraph("In the remaining plots, the genes with lowest agreement are explored further. \ 690 | NA results for guide efficacy are replaced with -.1")) 691 | story.append(Spacer(.125, 12)) 692 | 693 | outliers = set(metrics['worst_agreement']) \ 694 | | set([s.split('_')[0] for s in metrics['low_outliers']]) \ 695 | | set([s.split('_')[0] for s in metrics['high_outliers']]) 696 | for gene in outliers: 697 | print("\t%s" % gene) 698 | header = Paragraph(gene, style=styles["Heading3"]) 699 | story.append(header) 700 | fig = interrogate_gene(data, naive, naive_collapsed, gene, plot_width, plot_height) 701 | add_image(gene + '.png') 702 | story.append(PageBreak()) 703 | 704 | 705 | print("building report") 706 | doc.build(story) 707 | return metrics 708 | 709 | 710 | 711 | 712 | def comparative_qc_report(title, data, 713 | positive_control_genes, negative_control_genes, 714 | mutation_matrix, addiction_expressions, 715 | report_name=None, directory='.', 716 | plot_width=7.5, plot_height=3.25, 717 | doc_args=dict( 718 | pagesize=letter, rightMargin=.5*inch, leftMargin=.5*inch, 719 | topMargin=.5*inch,bottomMargin=.5*inch 720 | ), 721 | specific_plot_dimensions={} 722 | ): 723 | ''' 724 | Compare the output of two Chronos runs, or Chronos with another algorithm (if that algorithm also 725 | estimates gene effect and guide efficacy). 726 | Parameters: 727 | `title` (`str`): the report title, printed on first page 728 | `data` (`dict`): A `dict` with EXACTLY two entries. the keys of the entries will be used as labels 729 | in the plots in the report. 
def comparative_qc_report(title, data,
        positive_control_genes, negative_control_genes,
        mutation_matrix=None, addiction_expressions=None,
        report_name=None, directory='.',
        plot_width=7.5, plot_height=3.25,
        doc_args=dict(
            pagesize=letter, rightMargin=.5*inch, leftMargin=.5*inch,
            topMargin=.5*inch, bottomMargin=.5*inch
        ),
        specific_plot_dimensions={}
    ):
    '''
    Compare the output of two Chronos runs, or Chronos with another algorithm (if that algorithm also
    estimates gene effect and guide efficacy).
    Parameters:
        `title` (`str`): the report title, printed on first page
        `data` (`dict`): A `dict` with EXACTLY two entries. The keys of the entries will be used as labels
            in the plots in the report. Each value is also a `dict` which must contain the keys 'gene_effect',
            'sequence_map', 'guide_map', 'guide_efficacy', and 'logfoldchange'. Gene effect and guide efficacy
            are model outputs, while logfoldchange can be calculated directly from the data.
        `positive_control_genes`, `negative_control_genes` (`list`, `pandas.Index`, or `numpy.array` of `str`):
            Genes whose KO is expected to cause loss of viability or no loss of viability, respectively.
        `mutation_matrix` (`pandas.DataFrame`): optional boolean matrix of cell line by gene.
            Each value indicates that the gene has a gain of function mutation in that cell line.
            Genes should be selected such that a gain of function mutation is expected to make the cell line
            dependent on that gene. This is used to evaluate the separation of gene effects for that gene
            between mutated and wildtype cell lines. If None, the corresponding plot is skipped.
        `addiction_expressions` (`pandas.DataFrame`): optional `float` matrix of cell lines by genes containing
            expressions. The genes should be chosen such that cell lines highly expressing the gene are expected
            to be dependent on it, while other cell lines are not. If None, the corresponding plot is skipped.
        `report_name` (`str`): an optional file name for the report. If none is provided, `title` + '.pdf' will be used.
        `directory` (`str`): where the report and figure panels will be generated.
        `plot_width`, `plot_height` (`float`): size of plots that will be put in the report in inches.
        `doc_args` (`dict`): additional arguments will be passed to `SimpleDocTemplate`.
        `specific_plot_dimensions` (`dict` of 2-`tuple`): if a plot's name is present, will use the value
            to specify dimensions for that plot instead of deriving them from `plot_width` and `plot_height`
    Returns:
        `dict` containing the calculated QC metrics, which will also be in the report.
    '''
    required_data_keys = ["gene_effect", "sequence_map", "guide_map", "guide_efficacy",
        "logfoldchange"]
    if len(data) != 2:
        raise ValueError("`data` must be a dict with two keys")
    for key, val in data.items():
        for key2 in required_data_keys:
            if not key2 in data[key]:
                raise ValueError("`data[%s]` missing required entry %s" % (key, key2))

    if report_name is None:
        report_name = title + ".pdf"

    doc = SimpleDocTemplate(os.path.join(directory, report_name), **doc_args)
    styles = getSampleStyleSheet()
    keys = list(data.keys())
    story = []
    # one metrics sub-dict per run, plus one for metrics comparing the two runs
    metrics = {keys[0]: {}, keys[1]: {}, "joint": {}}

    def add_image(filename):
        # Flush the current matplotlib figure to disk and append it to the story,
        # honoring any per-plot size override in `specific_plot_dimensions`.
        fig = plt.gcf()
        label = '.'.join(filename.split('.')[:-1])
        if label in specific_plot_dimensions:
            fig.set_size_inches(specific_plot_dimensions[label])
        width, height = fig.get_size_inches()
        plt.tight_layout()
        fig.savefig(os.path.join(directory, filename))
        plt.close(fig)
        im = Image(os.path.join(directory, filename), width*inch, height*inch)
        story.append(im)
        story.append(Spacer(.125, 12))

    story.append(Paragraph(title, style=styles["Heading1"]))
    print("plotting global control separation")
    story.append(Paragraph("Control Separation", style=styles["Heading2"]))
    story.append(Paragraph("Control Histogram", style=styles["Heading3"]))
    paragraph = Paragraph(
        "A direct visualization of control separation."
    )
    story.append(paragraph)
    fig = qc_compare_plot(control_histogram, data, "gene_effect", metrics,
        plot_width, plot_height,
        positive_control_genes=positive_control_genes,
        negative_control_genes=negative_control_genes)
    add_image("control_histogram.png")

    story.append(Paragraph("Per Model QC Metrics", style=styles["Heading3"]))
    print("plotting per-screen control separation")
    story.append(Paragraph(
        "Head-to-head comparison of control separation for each model (cell line). \
For NNMD, more negative is better. For AUROC, more positive is better."
    ))
    fig, axes = plt.subplots(1, 2, figsize=(plot_width, plot_height))
    plt.sca(axes[0])
    # per-model NNMD of positive vs negative controls, for each run
    nnmds = {key: v['gene_effect'].apply(lambda x:
                nnmd(x.reindex(positive_control_genes), x.reindex(negative_control_genes)),
                axis=1)
            for key, v in data.items()}
    density_scatter(nnmds[keys[0]], nnmds[keys[1]], diagonal=True, label_outliers=4, s=10, alpha=.5)
    plt.title("NNMD")
    plt.xlabel(keys[0])
    plt.ylabel(keys[1])
    plt.sca(axes[1])
    # per-model ROC AUC of control separation, for each run
    aurocs = {key: v['gene_effect'].apply(lambda x:
                auroc(x.reindex(positive_control_genes), x.reindex(negative_control_genes)),
                axis=1)
            for key, v in data.items()}
    density_scatter(aurocs[keys[0]], aurocs[keys[1]], diagonal=True, label_outliers=4, s=10, alpha=.5)
    plt.title("ROC AUC")
    plt.xlabel(keys[0])
    plt.ylabel(keys[1])
    add_image("model_qc_comparison.png")

    # Selective-dependency sections are optional: skip them cleanly when the
    # corresponding input was not supplied (previously these calls were
    # unconditional although the docstring described the inputs as optional).
    if (mutation_matrix is not None) or (addiction_expressions is not None):
        story.append(Paragraph("Selective Dependency Distinction", style=styles["Heading3"]))
        story.append(Paragraph(
            "For known cancer dependencies, the gene effect score with vs without the known indication. \
Ideally each point would fall in the bottom right corner."
        ))
    if mutation_matrix is not None:
        print("plotting selective dependency separation")
        fig = qc_compare_plot(selective_mutated_vs_not_scatter, data, "gene_effect", metrics,
            plot_width, plot_height,
            mutation_matrix=mutation_matrix)
        add_image("selective_dependencies.png")
    if addiction_expressions is not None:
        print("plotting expression addictions")
        fig = qc_compare_plot(expression_addiction_volcano, data, "gene_effect", metrics,
            plot_width, plot_height,
            addiction_expressions=addiction_expressions)
        add_image("expression_addiction.png")

    print("plotting gene differences between datasets")
    story.append(Paragraph("Key Differences", style=styles["Heading2"]))
    story.append(Paragraph(
        "The correlation of individual genes between datasets, both vs mean effect \
and vs the difference of means between \
the two datasets. Below is the direct comparison of gene means in each dataset \
and a comparison of the most extreme values for each gene's score."
    ))
    fig, axes = plt.subplots(1, 2, figsize=(plot_width, plot_height))
    plt.sca(axes[0])
    gene_corr_vs_mean(data[keys[0]]["gene_effect"], data[keys[1]]['gene_effect'],
        metrics=metrics["joint"])
    plt.sca(axes[1])
    gene_corr_vs_mean_diff(data[keys[0]]["gene_effect"], data[keys[1]]['gene_effect'],
        metrics=metrics["joint"])
    plt.xlabel("%s Mean - %s Mean" % tuple(keys))
    add_image("gene_corrs.png")

    fig, ax = plt.subplots(1, 1, figsize=(plot_width, plot_width - 2))
    plt.sca(ax)
    density_scatter(data[keys[0]]['gene_effect'].mean(), data[keys[1]]['gene_effect'].mean(),
        diagonal=True, label_outliers=10, alpha=.5, s=10)
    plt.title("Mean Gene Effect")
    plt.xlabel(keys[0])
    plt.ylabel(keys[1])
    add_image("gene_means.png")
    fig, ax = plt.subplots(1, 1, figsize=(plot_width, plot_width - 2))
    plt.sca(ax)
    gene_outlier_plot(data[keys[0]]['gene_effect'], data[keys[1]]['gene_effect'], metrics=metrics['joint'])
    plt.title("Most Extreme Z-Scores by Gene")
    plt.xlabel(keys[0] + " Extreme ZScore")
    plt.ylabel(keys[1] + " Extreme ZScore")
    add_image("gene_zscore_extremes.png")
    story.append(PageBreak())

    story.append(Paragraph("Library Integration", style=styles['Heading2']))

    print("plotting library UMAPs")
    story.append(Paragraph("Library Integration UMAP", style=styles["Heading3"]))
    story.append(Paragraph(
        "Embedding of models in gene effect space colored by library coverage."
    ))
    fig, axes = plt.subplots(1, 2, figsize=(plot_width, plot_height))
    for i, key in enumerate(keys):
        plt.sca(axes[i])
        plt.title(key)
        check_integration_umap(data[key]["gene_effect"], data[key]['sequence_map'],
            metrics=metrics[key])
    add_image("integration_umap.png")

    print("plotting library mean deviation")
    story.append(Paragraph("Library Mean Deviation", style=styles["Heading3"]))
    story.append(Paragraph(
        "How far a gene's average within a library deviates from the overall average, by library. \
Here, a lowess trend is fitted per library to the squared difference of the gene's mean within \
models screened with the library and its mean overall. Note that the two plots are not necessarily \
on the same scale."
    ))
    fig, axes = plt.subplots(1, 2, figsize=(plot_width, plot_height))
    for i, key in enumerate(keys):
        plt.sca(axes[i])
        plt.title(key)
        check_integration_mean_deviation(data[key]["gene_effect"], data[key]['sequence_map'],
            metrics=metrics[key])
    add_image("integration_deviation.png")
    story.append(PageBreak())

    print("plotting genes with low agreement")
    story.append(Paragraph("Exploring Low Agreement Genes", style=styles['Heading2']))
    story.append(Spacer(.125, 12))
    story.append(Paragraph("In the remaining plots, the genes with lowest agreement are explored further. \
NA results for guide efficacy are replaced with -.1"))
    story.append(Spacer(.125, 12))
    # Merge each library's LFC matrix and guide map across the two runs so a gene
    # can be interrogated against all available raw data. Where both runs have the
    # same library, missing values in the first are filled from the second.
    lfc = {}
    guide_map = {}
    for key in keys:
        for library in data[key]['logfoldchange']:
            if not library in lfc:
                lfc[library] = data[key]['logfoldchange'][library]
                guide_map[library] = data[key]['guide_map'][library]
            else:
                aligned_left, aligned_right = lfc[library].align(data[key]['logfoldchange'][library],
                    join='outer')
                lfc[library] = aligned_left.mask(aligned_left.isnull(), aligned_right)
                guide_map[library] = pd.concat(
                    [guide_map[library], data[key]['guide_map'][library]],
                    ignore_index=True
                ).drop_duplicates(subset=['sgrna', 'gene'])
    # Genes with the worst cross-run agreement plus the most extreme z-score
    # outliers in either direction ("GENE_suffix" keys are split back to genes).
    outliers = set(metrics['joint']['worst_agreement']) \
        | set([s.split('_')[0] for s in metrics['joint']['low_outliers']]) \
        | set([s.split('_')[0] for s in metrics['joint']['high_outliers']])
    for gene in outliers:
        print("\t%s" % gene)
        header = Paragraph(gene, style=styles["Heading3"])
        story.append(header)
        # NOTE(review): plot_width is passed for both dimensions here, whereas
        # qc_report passes (plot_width, plot_height) — confirm the square
        # figure is intentional.
        fig = interrogate_gene_compare(data, lfc, guide_map, gene, plot_width, plot_width)
        add_image(gene + '.png')
        story.append(PageBreak())

    print("building report")
    doc.build(story)
    return metrics
-------------------------------------------------------------------------------- /chronos/evaluations.py: -------------------------------------------------------------------------------- 1 | 2 | from warnings import warn 3 | import numpy as np 4 | import pandas as pd 5 | from colorsys import hsv_to_rgb, rgb_to_hsv 6 | 7 | try: 8 | from matplotlib import pyplot as plt 9 | from matplotlib.patches import Patch 10 | import seaborn as sns 11 | from scipy.stats import pearsonr 12 | from sklearn.metrics import roc_auc_score, precision_recall_curve, auc 13 | from sklearn.decomposition import PCA 14 | from statsmodels.stats.multitest import fdrcorrection 15 | except ModuleNotFoundError: 16 | raise ModuleNotFoundError("matplotlib, seaborn, statsmodels, scipy, and sklearn are required for the evaluations submodule. Try \ 17 | `pip install matplotlib; pip install seaborn; pip install scikit-learn; pip install statsmodels`") 18 | 19 | from .model import powerset 20 | from .plotting import density_scatter, lowess_trend, identify_outliers_by_zscore, identify_outliers_by_trend 21 | from .plotting import binplot, dict_plot, identify_outliers_by_diagonal 22 | try: 23 | from umap.umap_ import UMAP 24 | umap_present = True 25 | except ModuleNotFoundError: 26 | warn("umap module not found. Some plots can't be made without it. Try `pip install umap-learn") 27 | umap_present = False 28 | except NameError: 29 | warn("UMAP class not found where expected. Your umap module may be out of date. 
\ 30 | Try updating your version with `pip install --upgrade umap-learn") 31 | umap_present = False 32 | 33 | try: 34 | from adjustText import adjust_text 35 | adjustText_present = True 36 | except ModuleNotFoundError: 37 | warn("adjustText not found, which means labels in plots will not be adjusted to avoid overlap.\ 38 | Try `pip install adjustText`") 39 | 40 | 41 | 42 | # UTILITIES 43 | 44 | 45 | def np_cor_no_missing(x, y): 46 | """Full column-wise Pearson correlations of two matrices with no missing values.""" 47 | try: 48 | xv = (x - x.mean(axis=0))/x.std(axis=0) 49 | yv = (y - y.mean(axis=0))/y.std(axis=0) 50 | except TypeError as e: 51 | print("failed to correlate") 52 | print(x) 53 | print(y) 54 | raise e 55 | result = np.dot(xv.T, yv)/len(xv) 56 | return result 57 | 58 | 59 | def group_cols_with_same_mask(x): 60 | """ 61 | Group columns with the same indexes of NAN values. 62 | 63 | Return a sequence of tuples (mask, columns) where columns are the column indices 64 | in x which all have the mask. 65 | """ 66 | per_mask = {} 67 | for i in range(x.shape[1]): 68 | try: 69 | o_mask = pd.notnull(x[:, i]) 70 | except TypeError as e: 71 | print(x.dtype) 72 | raise(e) 73 | o_mask_b = np.packbits(o_mask).tobytes() 74 | if o_mask_b not in per_mask: 75 | per_mask[o_mask_b] = [o_mask, []] 76 | per_mask[o_mask_b][1].append(i) 77 | return per_mask.values() 78 | 79 | 80 | def fast_cor_core(x, y): 81 | ''' 82 | x (`np.array`): 2D array. All columns will be correlated with all columns of y. 83 | y (`np.array`): 2D array. All columns will be correlated with all columns of x. 84 | Must have save length as x. 85 | returns: `np.array` of shape (x.shape[1], y.shape[1]), where the ith, jth element 86 | is the pearson correlation of x[:, i] and y[:, j] with null elements removed. 
87 | ''' 88 | result = np.zeros(shape=(x.shape[1], y.shape[1])) 89 | 90 | x_groups = group_cols_with_same_mask(x) 91 | y_groups = group_cols_with_same_mask(y) 92 | for x_mask, x_columns in x_groups: 93 | for y_mask, y_columns in y_groups: 94 | # print(x_mask, x_columns, y_mask, y_columns) 95 | combined_mask = x_mask & y_mask 96 | 97 | # not sure if this is the fastest way to slice out the relevant subset 98 | x_without_holes = x[:, x_columns][combined_mask, :] 99 | y_without_holes = y[:, y_columns][combined_mask, :] 100 | 101 | try: 102 | c = np_cor_no_missing(x_without_holes, y_without_holes) 103 | except ValueError: 104 | raise ValueError("trying to correlate two groups with shapes %r and %r" %( 105 | x_without_holes.shape, y_without_holes.shape 106 | )) 107 | # update result with these correlations 108 | result[np.ix_(x_columns, y_columns)] = c 109 | return result 110 | 111 | 112 | def fast_cor(x, y=None): 113 | ''' 114 | x (`pd.DataFrame`): Numerical matrix. All columns will be correlated with all columns of y. 115 | y (`pd.DataFrame`): Numerical matrix. All columns will be correlated with all columns of x. 116 | Index must overlap x. 117 | returns: `pd.DataFrame` of shape (x.shape[1], y.shape[1]), where the ith, jth element 118 | is the pearson correlation of x[:, i] and y[:, j] with null elements removed. 
119 | ''' 120 | if y is None: 121 | y = x 122 | if x is y: 123 | shared = x.index 124 | else: 125 | shared = sorted(set(x.index) & set(y.index)) 126 | if len(shared) < 2: 127 | raise ValueError("x and y don't have at least two rows in common") 128 | out = pd.DataFrame(fast_cor_core(x.loc[shared].values, y.loc[shared].values), 129 | index=x.columns, columns=y.columns) 130 | return out 131 | 132 | 133 | def get_aligned_mutation_matrix(base_matrix, gene_effect): 134 | '''Aligning a mutation matrix with gene effect, requiring a minimum number of non-null values in gene effect''' 135 | aligned_matrix = base_matrix.reindex(gene_effect.index).fillna(False) 136 | aligned_matrix = aligned_matrix[sorted(set(aligned_matrix.columns) & set(gene_effect.columns))] 137 | aligned_matrix.fillna(False, inplace=True) 138 | aligned_matrix[gene_effect[aligned_matrix.columns].isnull()] = np.nan 139 | aligned_matrix = aligned_matrix[aligned_matrix.columns[ 140 | (aligned_matrix & gene_effect[aligned_matrix.columns].notnull() ).sum() > 2 141 | ]] 142 | return aligned_matrix 143 | 144 | 145 | def split_color(rgb): 146 | ''' get two colors with the same hue and saturation but different values''' 147 | h, s, v = rgb_to_hsv(*rgb) 148 | return hsv_to_rgb(h, s, .3), hsv_to_rgb(h, s, .6) 149 | 150 | 151 | def generate_powerset_palette(keys, start='random', 152 | base_saturation=1, base_hsv_value=.7): 153 | ''' 154 | Generate a palette for the powerset of `keys`. Colors for the individual keys will be evenly spaced 155 | in hue space. Combinations will have the average of the hues of each key, with identical hues being resolved 156 | by different hsv values (brightness). 
157 | Parameters: 158 | `keys` (iterable): the base keys that will be combined into a powerset 159 | `start` (`float` or "random"): optional hue for the first entry in `keys` 160 | `base_saturation`: saturation of colors for the individual keys 161 | `base_hsv_value`: hsv value parameter for colors for the individual keys 162 | Returns: 163 | `dict` with an entry for each possible unique combination of the keys (excluding the empty set) containing 164 | an RGB color for the combination 165 | ''' 166 | if start == 'random': 167 | start = np.random.uniform() 168 | base_hues = start + np.arange(len(keys))/len(keys) 169 | base_rgb = dict(zip(keys, [hsv_to_rgb(hue, base_saturation, base_hsv_value) for hue in base_hues])) 170 | out = {} 171 | keysets = list(powerset(keys)) 172 | for keyset in keysets: 173 | if not len(keyset): 174 | continue 175 | color = np.mean(np.stack([base_rgb[key] for key in keyset]), axis=0) 176 | out[keyset] = tuple(color) 177 | for i, keyset1 in enumerate(keysets): 178 | if not len(keyset1): 179 | continue 180 | for keyset2 in keysets[i+1:]: 181 | if not len(keyset2): 182 | continue 183 | dist = np.sqrt(((np.array(out[keyset1]) - np.array(out[keyset2]))**2).sum()) 184 | if dist < .1: 185 | out[keyset1], out[keyset2] = split_color(out[keyset1]) 186 | return out 187 | 188 | 189 | def trim_overlapping_lead_and_tail(strings): 190 | ''' 191 | Removes extraneous prefixes/suffixes common to all the strings for more parsimonious labeling 192 | ''' 193 | if len(strings) < 2: 194 | return strings 195 | n = min([len(string) for string in strings]) 196 | for i in range(n): 197 | c = strings[0][i] 198 | if any([string[i] != c for string in strings[1:]]): 199 | break 200 | if i == n: 201 | raise ValueError("Shortest string has no distinct substring:\n%r" % strings) 202 | for j in range(n): 203 | c = strings[0][-j-1] 204 | if any([string[-j-1] != c for string in strings[1:]]): 205 | break 206 | if j == 0: 207 | return [string[i:] for string in strings] 208 | 
return [string[i:-j] for string in strings] 209 | 210 | 211 | def _strip_identical_prefix(s1, s2): 212 | i = 0 213 | while s1[i] == s2[i]: 214 | i += 1 215 | return s1[i:], s2[i:] 216 | 217 | def _make_aliases(keys): 218 | ''' 219 | Tries to create a series with values holding a unique, logical two-letter code 220 | for each key in keys. 221 | ''' 222 | deduplicated = pd.Series([s for s in trim_overlapping_lead_and_tail(keys)], index=[s for s in keys]) 223 | true_unique = deduplicated.copy() 224 | for i in range(len(deduplicated)): 225 | for j in range(i+1, len(deduplicated)): 226 | true_unique.iloc[i], true_unique.iloc[j] = _strip_identical_prefix(true_unique.iloc[i], true_unique.iloc[j]) 227 | out = {} 228 | for s in keys: 229 | if deduplicated[s].startswith(true_unique[s]): 230 | out[s] = deduplicated[s][:2] 231 | else: 232 | out[s] = deduplicated[s][0] + true_unique[s][0] 233 | return pd.Series(out) 234 | 235 | 236 | def append_to_legend_handles(lines, ax): 237 | ''' 238 | Add text to the matplotlib legend to an axis 239 | Parameters: 240 | `lines` (iterable of `str`): lines to add 241 | `ax`: `matplotlib.Axis` 242 | Returns: 243 | legend handles 244 | ''' 245 | handles, labels = ax.get_legend_handles_labels() 246 | for line in lines: 247 | handles.append( 248 | Patch( 249 | color=(0, 0, 0, 0), 250 | label=line 251 | ) 252 | ) 253 | return handles 254 | 255 | 256 | # METRICS 257 | 258 | def mad(x, axis=None): 259 | '''median absolute deviation from the median''' 260 | x = x[pd.notnull(x)] 261 | med = np.median(x, axis) 262 | return np.median( np.abs(x-med), axis) 263 | 264 | 265 | def nnmd(pos, neg): 266 | '''null-normalixed median difference between the `pos` and `neg` arrays''' 267 | return (np.median(pos[pd.notnull(pos)]) - np.median(neg[pd.notnull(neg)]))/mad(neg) 268 | 269 | 270 | def auroc(pos, neg): 271 | '''ROC AUC of separation between `pos` and `neg` arrays''' 272 | pos = pos[pd.notnull(pos)] 273 | neg = neg[pd.notnull(neg)] 274 | true = [0] * 
def pr_auc(pos, neg):
    '''Area under precision-recall curve separating `pos` and `neg` arrays'''
    # drop nulls before scoring
    pos = pos[pd.notnull(pos)]
    neg = neg[pd.notnull(neg)]
    probas = np.concatenate([np.array(pos), np.array(neg)])
    # `neg` entries carry label 1, matching the convention used in `auroc`
    true = [0] * len(pos) + [1] * len(neg)
    precision, recall, thresh = precision_recall_curve(y_true=true, probas_pred=probas)
    return auc(recall, precision)


# PRE RUN PLOTS

def replicate_plot(readcounts, rep1, rep2):
    '''
    Given a `pandas.DataFrame` matrix of `readcounts` with replicates as rows and sgRNAs as columns, plot
    the logged readcounts of `rep1` vs `rep2` and annotate with their Pearson correlation.
    Raises `ValueError` if either replicate label is missing from the index.
    Draws on the current matplotlib axes.
    '''
    for rep in rep1, rep2:
        if not rep in readcounts.index:
            raise ValueError("replicate label %s not found in the index of `readcounts`" % rep)
    # +1 pseudocount before log2 so zero counts are representable
    x = np.log2(readcounts.loc[rep1]+1)
    y = np.log2(readcounts.loc[rep2]+1)
    density_scatter(x, y,
        label_outliers=2)
    plt.xlabel("%s Readcounts (+1Log)" % rep1)
    plt.ylabel("%s Readcounts (+1Log)" % rep2)
    # annotate Pearson correlation in axes coordinates
    r = x.corr(y)
    plt.text(s='R = %1.2f' % r, x=.05, y=.9, transform=plt.gca().transAxes)


def all_replicate_plot(readcounts, sequence_map, cell_line, plot_width):
    '''
    Given a `pandas.DataFrame` matrix of `readcounts` with replicates as rows and sgRNAs as columns, generate a
    `replicate_plot` for all pairs of replicates of `cell_line`. See `chronos.Chronos` for a description
    of `sequence_map`. `plot_width` gives the plot width in inches.
    '''
    reps = sequence_map.query("cell_line_name == %r" % cell_line).sequence_ID.unique()
    # shortened labels for axis annotation
    rep_labels = dict(zip(reps, trim_overlapping_lead_and_tail(reps)))
    n = 0
    titles = {}
    # one subplot per unordered replicate pair
    for i in range(len(reps)-1):
        for j in range(i+1, len(reps)):
            n += 1
            titles["%s %i" % (cell_line, n)] = (reps[i], reps[j])
    def plotfunc(x):
        # x is a (rep_i, rep_j) tuple from `titles`
        replicate_plot(readcounts, *x)
        plt.xlabel("Rep." + rep_labels[x[0]])
        plt.ylabel("Rep." + rep_labels[x[1]])
    dict_plot(titles, plotfunc, plot_width)
    plt.tight_layout()
def pDNA_plot(readcounts, sequence_map, rep, sgrnas=None):
    '''
    Given a `pandas.DataFrame` matrix of `readcounts` with replicates as rows and sgRNAs as columns, plot
    the logged readcounts of `rep` vs median readcounts in pDNA of the same batch. `sgrnas` optionally
    subsets the plot to the specified sgrnas. See `chronos.Chronos` for a description
    of `sequence_map`.
    Raises `ValueError` if `rep` is missing from the index or none of `sgrnas` are present.
    '''
    if not rep in readcounts.index:
        raise ValueError("Rep %r not in readcounts index" %rep)
    if not sgrnas is None:
        # only keep requested sgRNAs that actually exist in the matrix
        controls = sorted(set(sgrnas) & set(readcounts.columns))
        if not controls:
            raise ValueError("None of the specified sgRNAs are in the readcounts columns: \n%r"
                % sgrnas)
    # find the pDNA sequences belonging to the same batch as `rep`
    batch_label = sequence_map.query("sequence_ID == %r" % rep).iloc[0]['pDNA_batch']
    pdna_seq = sequence_map\
        .query("cell_line_name == 'pDNA'")\
        .query("pDNA_batch == %r" % batch_label)\
        .sequence_ID
    # median over the batch's pDNA sequences, log2(+1) scale
    pdna = np.log2(readcounts.loc[pdna_seq]+1).median()
    ltp = np.log2(readcounts.loc[rep]+1)
    if not sgrnas is None:
        pdna = pdna.loc[controls]
        ltp = ltp.loc[controls]
    density_scatter(pdna, ltp,
        trend_line=False, diagonal=True)
    plt.xlabel("%s Readcounts (+1Log)" % "pDNA")
    plt.ylabel("%s Readcounts (+1Log)" % rep)
    r = ltp.corr(pdna)
    plt.text(s='R = %1.2f' % r, x=.05, y=.9,
        transform=plt.gca().transAxes)
def paired_pDNA_plots(readcounts, sequence_map, cell_line,
        negative_control_sgRNAs=None, positive_control_sgRNAs=None,
        plot_width=7.5, plot_height=3, page_height=9):
    '''
    If `negative_control_sgRNAs` and `positive_control_sgRNAs` is none,
    produces one subplot for each replicate of the cell line with a `pDNA_plot`.
    Otherwise, generates pairs of pDNA plots for each replicate. If both control types
    are supplied, one will be plotted on each side. If one is missing,
    it will be replaced with all sgRNAs. `plot_width` specified the figure width in inches,
    but `plot_height` specifies subplot height. This will be adjusted if the total figure
    height would exceed `page_height`. See `pDNA_plot` for other parameters.
    '''
    reps = sequence_map.query("cell_line_name == %r" % cell_line).sequence_ID.unique()
    # shortened replicate labels for annotation
    labels = dict(zip(reps, trim_overlapping_lead_and_tail(reps)))
    left_title = "Negative Controls"
    right_title = "Positive Controls"

    if negative_control_sgRNAs is None and positive_control_sgRNAs is None:
        # no controls given: single pDNA_plot per replicate, laid out by dict_plot
        titles = dict(zip(trim_overlapping_lead_and_tail(reps), reps))
        def plotfunc(x):
            # x is the replicate sequence_ID
            pDNA_plot(readcounts, sequence_map, x)
            plt.ylabel(labels[x])
        dict_plot(titles, plotfunc)
        return
    elif positive_control_sgRNAs is None:
        # fall back to all sgRNAs on the right-hand side
        positive_control_sgRNAs = readcounts.columns
        right_title = "All sgRNAs"
    elif negative_control_sgRNAs is None:
        # fall back to all sgRNAs on the left-hand side
        negative_control_sgRNAs = readcounts.columns
        left_title = "All sgRNAs"

    # cap total figure height at `page_height`
    height = min(page_height, plot_height*len(reps))
    fig, axes = plt.subplots(len(reps), 2, figsize=(plot_width, height))
    for i, rep in enumerate(reps):
        # left column: negative controls (or all sgRNAs)
        plt.sca(axes[i, 0])
        pDNA_plot(readcounts, sequence_map, rep, negative_control_sgRNAs)
        plt.ylabel('Rep. ' + labels[rep])
        plt.title(left_title)

        # right column: positive controls (or all sgRNAs)
        plt.sca(axes[i, 1])
        pDNA_plot(readcounts, sequence_map, rep, positive_control_sgRNAs)
        plt.ylabel("")
        plt.title(right_title)
# POST RUN PLOTS


def gene_outlier_plot(gene_effect1, gene_effect2,
        xlabel="gene_effect1 zscore", ylabel="gene_effect2 zscore",
        ax=None, legend=True,
        density_scatter_args={"label_outliers": 10, "trend_line": True},
        legend_args={}, metrics=None
    ):
    '''
    Compares the most extreme outliers for the matrices `gene_effect1` and `gene_effect2`
    with a density scatter of the maximum and minimum screen gene effect for each gene
    with results from `gene_effect1` on one axis and `gene_effect2` on the other, and returns the
    outliers from the trend line.
    This plot is useful to detect if one method is producing extreme outliers within a gene score
    relative to the other.
    Returns a dict with `low_outliers` and `high_outliers` (gene labels suffixed "_Min"/"_Max"),
    or writes it into `metrics` if that dict is supplied.
    NOTE: `xlabel`/`ylabel` are currently not applied to the axes.
    '''
    gene_effect1, gene_effect2 = gene_effect1.align(gene_effect2, join="inner")
    # per-gene z-scores within each matrix
    zscore1 = (gene_effect1 - gene_effect1.mean())/gene_effect1.std()
    zscore2 = (gene_effect2 - gene_effect2.mean())/gene_effect2.std()
    mins1 = zscore1.min()
    mins1.index = [s + "_Min" for s in mins1.index]
    max1 = zscore1.max()
    # BUGFIX: maxima previously reused the "_Min"-suffixed index, mislabeling
    # high outliers and creating duplicate labels in the concatenated series
    max1.index = [s + "_Max" for s in max1.index]
    mins2 = zscore2.min()
    mins2.index = [s + "_Min" for s in mins2.index]
    max2 = zscore2.max()
    max2.index = [s + "_Max" for s in max2.index]
    x = pd.concat([mins1, max1])
    y = pd.concat([mins2, max2])
    if not ax:
        ax = plt.gca()
    plt.sca(ax)
    density_scatter(x, y, **density_scatter_args)
    plt.title("Most Extreme Values by ZScore")
    out = {
        'low_outliers': mins1.index[identify_outliers_by_trend(mins1, mins2, 5)],
        'high_outliers': max1.index[identify_outliers_by_trend(max1, max2, 5)]
    }
    if metrics is None:
        return out
    else:
        metrics.update(out)
def gene_corr_vs_mean(gene_effect1, gene_effect2, ax=None,
        legend=True,
        density_scatter_args={"label_outliers": 5, "trend_line": False}, legend_args={}, metrics=None
    ):
    '''
    Shows the correlation of a gene's gene effect profile within the two matrices `gene_effect1` and `gene_effect2`
    with the gene's mean effect (averaged between the two matrices) on the x axis, and returns the genes with lowest correlation.
    Returns a dict with the median correlation, the count of genes with correlation < 0.9,
    and the 20 lowest-correlated gene labels; or writes it into `metrics` if supplied.
    '''
    # per-gene correlation between the two matrices; NaN (e.g. constant columns) dropped
    corrs = gene_effect1.corrwith(gene_effect2).dropna()
    means = .5*(gene_effect1[corrs.index].mean() + gene_effect2[corrs.index].mean())
    density_scatter(means, corrs, **density_scatter_args)
    if not ax:
        ax = plt.gca()
    ax.set_xlabel("Gene Mean")
    ax.set_ylabel("Gene Correlation")

    out = {
        "gene_corr_med": corrs.median(),
        "gene_corr_lt_9": (corrs < .9).sum(),
    }
    if legend:
        handles = append_to_legend_handles([
            "%s: %1.3f" % (key.replace("_", ' '), val)
            for key, val in out.items()
        ], ax)
        plt.legend(handles=handles, **legend_args)
    # added after the legend so it is not rendered there
    out["lowest_corr"] = corrs.sort_values().index[0:20]
    if metrics is None:
        return out
    else:
        metrics.update(out)


def gene_corr_vs_mean_diff(gene_effect1, gene_effect2, ax=None,
        legend=True,
        density_scatter_args={"label_outliers": 5, "trend_line": False, "outliers_from": "xy_zscore"},
        legend_args={}, metrics=None
    ):
    '''
    Shows the correlation of a gene's gene effect profile within the two matrices `gene_effect1` and `gene_effect2`
    with the difference in the gene's mean effect between the two matrices on the x axis. This plot is useful for
    seeing which genes have the most disagreement between the matrices either by correlation or by mean effect.
    It returns outliers found by zscore, i.e. genes with lowest agreement taking into account both their means
    and their correlations.
    '''
    corrs = gene_effect1.corrwith(gene_effect2).dropna()
    # positive values mean the gene scores more strongly (less negative) in matrix 1
    mean_diff = gene_effect1[corrs.index].mean() - gene_effect2[corrs.index].mean()
    corrs, mean_diff = corrs.align(mean_diff.dropna(), join="inner")
    density_scatter(mean_diff, corrs, **density_scatter_args)
    if not ax:
        ax = plt.gca()
    ax.set_xlabel("Gene Mean Diff")
    ax.set_ylabel("Gene Correlation")

    out = {
        "gene_corr_med": corrs.median(),
        "gene_corr_lt_9": (corrs < .9).sum(),
    }
    if legend:
        handles = append_to_legend_handles([
            "%s: %1.3f" % (key.replace("_", ' '), val)
            for key, val in out.items()
        ], ax)
        plt.legend(handles=handles, **legend_args)
    # added after the legend so it is not rendered there
    out["worst_agreement"] = corrs.index[identify_outliers_by_zscore(corrs, mean_diff, 10)]
    if metrics is None:
        return out
    else:
        metrics.update(out)
def control_histogram(gene_effect, positive_control_genes, negative_control_genes, ax=None,
        legend=True,
        kde_args={}, legend_args={}, metrics=None):
    '''
    Produces KDE plots of the distribution of positive and negative control gene scores
    in the gene effect matrix. Both the mean gene scores and the raveled gene scores are
    shown. Control separation results measured by NNMD and AUROC are returned
    (or written into `metrics` if that dict is supplied).
    '''
    # restrict to controls present in the matrix; drop all-null genes
    pos = gene_effect.reindex(columns=positive_control_genes).dropna(axis=1, how='all')
    neg = gene_effect.reindex(columns=negative_control_genes).dropna(axis=1, how='all')
    # filled curves: distribution of per-gene means
    sns.kdeplot(pos.mean(), bw_adjust=.5, fill=True, alpha=.3, lw=0, color="red",
        label="Positive Control Means", ax=ax, gridsize=1000, **kde_args)
    sns.kdeplot(neg.mean(), bw_adjust=.5, fill=True, alpha=.3, lw=0, color="blue",
        label="Negative Control Means", ax=ax, gridsize=1000, **kde_args)
    # line curves: distribution of all individual (raveled) scores
    sns.kdeplot(pos.stack(), bw_adjust=.5, lw=2, color="crimson",
        label="Positive Control Scores", ax=ax, gridsize=1000, **kde_args)
    sns.kdeplot(neg.stack(), bw_adjust=.5, lw=2, color="navy",
        label="Negative Control Scores", ax=ax, gridsize=1000, **kde_args)
    if not ax:
        ax = plt.gca()
    ax.set_xlabel("Gene Effect")
    out = {
        "NNMD_of_means": nnmd(pos.mean(), neg.mean()),
        "NNMD_of_scores": nnmd(pos.stack(), neg.stack()),
        "AUROC_of_means": auroc(pos.mean(), neg.mean()),
        "AUROC_of_scores": auroc(pos.stack(), neg.stack())
    }
    if legend:
        handles = append_to_legend_handles([
            "%s: %1.3f" % (key.replace("_", ' '), val)
            for key, val in out.items()
        ], ax)
        plt.legend(handles=handles, **legend_args)
    if metrics is None:
        return out
    else:
        metrics.update(out)
def mean_vs_sd_scatter(gene_effect,
        ax=None,
        metrics=None, legend=True, legend_args={},
        density_scatter_args={"alpha": .6, "s": 10, "label_outliers": 3, "outliers_from": "xy_zscore"}
    ):
    '''
    Scatters each gene's mean effect (x) in `gene_effect` against its standard
    deviation scaled by the SD of the gene means (y).
    Returns {"mean_SD:SD_means": <mean of the scaled SDs>}, or writes it into
    `metrics` when that dict is supplied.
    '''
    gene_means = gene_effect.mean()
    # each gene's SD relative to the spread of gene means
    scaled_sd = gene_effect.std() / gene_means.std()

    target_ax = ax if ax else plt.gca()
    density_scatter(gene_means, scaled_sd, ax=target_ax, **density_scatter_args)
    plt.ylabel("Gene SD / SD of Gene Means")
    plt.xlabel("Gene Mean")

    summary = {
        "mean_SD:SD_means": scaled_sd.mean()
    }
    if legend:
        legend_lines = ["%s: %1.2f" % (key.replace("_", ' '), val)
                        for key, val in summary.items()]
        plt.legend(handles=append_to_legend_handles(legend_lines, target_ax), **legend_args)

    if metrics is not None:
        metrics.update(summary)
        return None
    return summary
def mean_vs_cell_eff_correlation(gene_effect, replicate_efficacy,
        ax=None, metrics=None, legend=True, legend_args={"loc": "upper right"},
        density_scatter_args={"alpha": .6, "s": 10, "label_outliers": 5, "outliers_from": "y"}
    ):
    '''
    NOT IMPLEMENTED: always raises `NotImplementedError`.
    Intended behavior: using the matrix `gene_effect`, plot each gene's gene effect profile's mean
    vs its correlation with the estimated `replicate_efficacy` of all screens.
    This plot is useful for detecting screen quality bias, in that genes with lower means will tend to
    be negatively correlated with cell efficacy. Would return cell efficacy correlation mean,
    standard deviation, and its correlation with gene mean.
    '''
    raise NotImplementedError("Function needs to be updated before using")
    # Dead code below, retained as a starting point for rehabilitation.
    # BUGFIX: it previously referenced an undefined name `cell_efficacy`
    # (NameError once the raise is removed); it now uses the
    # `replicate_efficacy` parameter.
    means = gene_effect.mean()
    corrs = gene_effect.corrwith(replicate_efficacy)
    if not ax:
        ax = plt.gca()
    density_scatter(means, corrs, ax=ax, **density_scatter_args)
    plt.ylabel("Gene Effect R with Cell Efficacy")
    plt.xlabel("Gene Mean")
    out = {
        "cell_efficacy_corr_mean": corrs.mean(),
        "cell_efficacy_corr_sd": corrs.std(),
        "cell_efficacy_corr_gene_mean_trend": means.corr(corrs)
    }
    if legend:
        handles = append_to_legend_handles([
            "%s: %1.2f" % (key.replace("_", ' '), val)
            for key, val in out.items()
        ], ax)
        plt.legend(handles=handles, **legend_args)
    if metrics is None:
        return out
    else:
        metrics.update(out)
def screen_nnmd_auroc_scatter(gene_effect, positive_control_genes, negative_control_genes, ax=None,
        metrics=None, legend=True, legend_args={},
        density_scatter_args={}):
    '''
    For each screen (row) in the matrix `gene_effect`, computes the separation of the iterable `positive_control_genes`
    from `negative_control_genes` by NNMD and AUROC. This is useful for visualizing the distribution of screen quality.
    The median and mean of both measures are returned (or written into `metrics` if that dict is supplied).
    '''
    # restrict controls to genes present in the matrix; sorted for deterministic order
    poscon = sorted(set(positive_control_genes) & set(gene_effect.columns))
    negcon = sorted(set(negative_control_genes) & set(gene_effect.columns))
    # per-screen (row-wise) separation measures
    nnmds = gene_effect.apply(
        lambda x: nnmd(x[poscon], x[negcon]),
        axis=1
    )
    aurocs = gene_effect.apply(
        lambda x: auroc(x[poscon], x[negcon]),
        axis=1
    )

    if not ax:
        ax=plt.gca()
    density_scatter(aurocs, nnmds, ax=ax, **density_scatter_args)
    plt.xlabel("AUROC - Higher is Better")
    plt.ylabel("NNMD - Lower is Better")

    out = {
        "NNMD_median": nnmds.median(),
        "NNMD_mean": nnmds.mean(),
        "AUROC_median": aurocs.median(),
        "AUROC_mean": aurocs.mean()
    }

    if legend:
        handles = append_to_legend_handles([
            "%s: %1.3f" % (key.replace("_", ' '), val)
            for key, val in out.items()
        ], ax)
        plt.legend(handles=handles, loc="lower left", **legend_args)
    if metrics is None:
        return out
    else:
        metrics.update(out)
def expression_addiction_volcano(gene_effect, addiction_expressions,
        max_threshold=-.2,
        ax=None,
        metrics=None, legend=True, legend_args={},
        density_scatter_args={"trend_line": False}
    ):
    '''
    Given the matrix of expression `addiction_expressions`, whose columns should only include
    genes expected to be expression addictions (i.e. cause loss of viability in cell lines
    that overexpress them), computes the Pearson correlation and associated false discovery rate between
    the gene's expression and its gene effect in the matrix `gene_effect` and plots the result
    as a volcano. Note that the p-values informing the FDRs (q values) are optimistic due to
    the assumption of normal errors. The fraction of selective dependencies with FDR < 0.1 or
    R < -0.2 is returned.
    This plot is useful for evaluating the ability to identify
    selective dependencies and their association with the correct biomarker.
    NOTE: the `max_threshold` parameter is currently unused.
    '''
    gene_effect, addiction_expressions = gene_effect.align(addiction_expressions, join="inner")
    corr = {}
    p = {}
    for gene in gene_effect:
        # correlate per gene over the cell lines where both values are present
        mask = gene_effect[gene].notnull() & addiction_expressions[gene].notnull()
        corr[gene], p[gene] = pearsonr(gene_effect[gene][mask], addiction_expressions[gene][mask])
    corr, p = pd.Series(corr), pd.Series(p)
    # convert the two-sided p-value to a one-sided test for NEGATIVE correlation
    p /= 2
    p[corr > 0] = 1 - p[corr > 0]
    q = pd.Series(fdrcorrection(p.values, .05)[1], index=p.index)
    if ax is None:
        ax = plt.gca()
    plt.sca(ax)
    density_scatter(corr, -np.log10(q), **density_scatter_args)
    plt.xlabel("Expression/GE Correlation")
    plt.ylabel("-log10(FDR)")
    out = {
        "expression_addictions_FDR_0.10": (q < .1).mean(),
        "expression_addictions_<_-0.2": (corr < -.2).mean()
    }
    if legend:
        handles = append_to_legend_handles([
            "%s: %1.2f" % (key.replace("_", ' '), val)
            for key, val in out.items()
        ], ax)
        plt.legend(handles=handles, loc="upper right", **legend_args)
    if metrics is None:
        return out
    else:
        metrics.update(out)
def selective_mutated_vs_not_scatter(gene_effect, mutation_matrix,
        ax=None,
        metrics=None, legend=True, legend_args={}, label_outliers=3,
        scatter_args={"alpha": .75, "linewidth": 1, "cmap": 'viridis_r'}
    ):
    '''
    A common pattern of dependency in cancer is "oncogene addiction," in which cells
    selectively require proteins with oncogenic gain of function mutations to maintain viability.
    Canonical examples include BRAF, NRAS, KRAS, CTNNB1, and EGFR. We expect cells with the
    gain of function alteration to show more negative gene effect than those without. This
    function takes in a boolean `mutation_matrix` with gene columns and cell line rows,
    which should only include genes that have known oncogenic gain of function alterations and should
    be `True` for cell lines that have one of the known gain of function alterations.
    For each gene in `mutation_matrix`, its mean in `gene_effect` for cell lines without
    gain of function is plotted on the x axis and its mean in cell lines with gain of function
    is plotted on the y axis. The separation of these two by NNMD and AUROC is returned
    both as a median over genes of results per gene and by combining all GoF scores for all genes as positive controls
    and the same genes' scores in lines without their indicated GoF alterations as the negative
    controls ("total" NNMD/AUROC). This plot is useful for evaluating the ability to detect
    selective dependencies and their biomarkers.
    Raises `ValueError` when the two inputs have no overlapping genes or screens.
    '''
    # drop genes with any missing effect, then align with the mutation calls
    gene_effect = gene_effect.dropna(how='any', axis=1)
    mutation_matrix = get_aligned_mutation_matrix(mutation_matrix, gene_effect)
    gene_effect, mutation_matrix = gene_effect.align(mutation_matrix, join="inner")
    mutation_matrix = mutation_matrix.dropna(how='all', axis=1).astype(bool)
    gene_effect = gene_effect.dropna(how='all', axis=1)
    # realign after the drops so columns/rows still match
    gene_effect, mutation_matrix = gene_effect.align(mutation_matrix, join="inner")
    if gene_effect.shape[0] == 0 or gene_effect.shape[1] == 0:
        raise ValueError("Gene_effect and mutation_matrix have an axis with no overlaps (either genes or screens)")
    # point size/color encodes log2 of the number of mutated lines per gene
    scale = np.log2(mutation_matrix.sum().astype(float))
    # per-gene separation of mutated vs non-mutated lines
    nnmds = mutation_matrix.apply(lambda x: nnmd(gene_effect[x.name][x], gene_effect[x.name][x==False]),
        axis=0)
    aurocs = mutation_matrix.apply(lambda x: auroc(gene_effect[x.name][x], gene_effect[x.name][x==False]),
        axis=0)
    # "total" versions ravel all genes' scores together
    # (fillna is a no-op here since the matrix was cast to bool above)
    total_nnmd = nnmd(
        gene_effect[mutation_matrix==True].stack(),
        gene_effect[mutation_matrix.fillna(False)==False].stack()
    )
    total_auroc = auroc(
        gene_effect[mutation_matrix==True].stack(),
        gene_effect[mutation_matrix==False].stack()
    )
    pos_means = gene_effect[mutation_matrix].mean()
    neg_means = gene_effect[mutation_matrix==False].mean()
    pos_means, neg_means = pos_means.align(neg_means, join="inner")

    if ax:
        plt.sca(ax) #needed because scatter doesn't accept ax arg?
    plt.scatter(neg_means, pos_means, s=10*scale, c=scale, **scatter_args)
    if label_outliers:
        # annotate genes furthest from the diagonal
        outliers = identify_outliers_by_diagonal(neg_means, pos_means, label_outliers)
        texts = [plt.text(s=neg_means.index[i],x=neg_means.iloc[i], y=pos_means.iloc[i], fontsize=6, color=[.8, .3, .05]) for i in outliers]
        if adjustText_present:
            adjust_text(texts, x=neg_means.values, y=pos_means.values, arrowprops=dict(lw=1, arrowstyle="-", color="black"),
                )
    # draw the y = x diagonal across the current view
    xlim = plt.gca().get_xlim()
    ylim = plt.gca().get_ylim()
    plt.plot(
        [min(xlim[0], ylim[0]), max(xlim[1], ylim[1])],
        [min(xlim[0], ylim[0]), max(xlim[1], ylim[1])],
        '--', color='tomato', lw=1
    )
    plt.colorbar(label="Log2(# Mutated Lines)")
    plt.xlabel("Gene Mean Without Mutation")
    plt.ylabel("Gene Mean With Mutation")

    out = {
        "selective_NNMD_gene_median": nnmds.median(),
        "selective_NNMD_raveled": total_nnmd,
        "selective_AUROC_gene_median": aurocs.median(),
        "selective_AUROC_raveled": total_auroc
    }
    if ax is None:
        ax = plt.gca()
    if legend:
        handles = append_to_legend_handles([
            "%s: %1.2f" % (key.replace("_", ' '), val)
            for key, val in out.items()
        ], ax)
        plt.legend(handles=handles, loc="upper left", **legend_args)
    if metrics is None:
        return out
    else:
        metrics.update(out)
def copy_number_trend(gene_effect, copy_number,
        downsample=False,
        downsample_lower_quantile_bound=.05, downsample_upper_quantile_bound=.95,
        ax=None,
        metrics=None, legend=True, legend_args={},
        density_scatter_args={"alpha": .75}
    ):
    '''
    Produces a scatter of the raveled `gene_effect` matrix (y) vs the raveled `copy_number` matrix
    (x). Useful for visualizing how much depletion highly amplified regions can produce.
    If `downsample` is a float between 0 and 1, points with CN between `downsample_lower_quantile_bound` and
    `downsample_upper_quantile_bound` will be randomly reduced to the fraction given by `downsample`.
    This can greatly increase plotting speed by reducing the number of uninformative points
    with euploid CN being plotted.
    The overall correlation of the raveled gene effect and CN matrices is returned
    (computed on the FULL data, regardless of downsampling), or written into `metrics`
    if that dict is supplied.
    '''
    gene_effect, copy_number = gene_effect.align(copy_number, join='inner')
    ge_raveled, cn_raveled = np.ravel(gene_effect), np.ravel(copy_number)
    mask = pd.notnull(cn_raveled) & pd.notnull(ge_raveled)
    # BUGFIX: compute the reported metric once, BEFORE any downsampling. It was
    # previously recomputed on the downsampled points, so the returned
    # "overall" correlation silently depended on `downsample`.
    out = {
        "raveled_CN_corr": pearsonr(ge_raveled[mask], cn_raveled[mask])[0]
    }
    if downsample:
        # keep each mid-range point with probability `downsample`
        selection = np.random.binomial(p=downsample, n=1, size=len(cn_raveled))
        # nanquantile so the bounds are still defined when CN has missing values
        # (np.quantile returns NaN in that case, disabling the tail-keep logic)
        low = np.nanquantile(cn_raveled, downsample_lower_quantile_bound)
        high = np.nanquantile(cn_raveled, downsample_upper_quantile_bound)
        # always keep the informative tails
        selection[cn_raveled < low] = 1
        selection[cn_raveled > high] = 1
        # copy as float before writing NaN so we never mutate a view of the
        # caller's data or fail on an integer CN dtype
        cn_raveled = cn_raveled.astype(float).copy()
        cn_raveled[selection == 0] = np.nan
        mask = pd.notnull(cn_raveled) & pd.notnull(ge_raveled)
    if ax is None:
        ax = plt.gca()
    plt.sca(ax)
    density_scatter(cn_raveled[mask], ge_raveled[mask], **density_scatter_args)
    plt.xlabel("Copy Number")
    plt.ylabel("Gene Effect")
    if legend:
        handles = append_to_legend_handles([
            "%s: %1.2f" % (key.replace("_", ' '), val)
            for key, val in out.items()
        ], ax)
        plt.legend(handles=handles, loc="upper right", **legend_args)
    if metrics is None:
        return out
    else:
        metrics.update(out)
def copy_number_gene_corrs(gene_effect, copy_number,
        ax=None,
        metrics=None, legend=True, legend_args={},
        binplot_args={}
    ):
    '''
    Correlates each gene's column in `gene_effect` with its copy number column in
    `copy_number` (genes as columns, cell lines as rows), then bins the correlations
    by mean gene effect and plots them with `binplot`. Useful for spotting the two
    copy number effects: double strand break toxicity (nonessential genes anticorrelated
    with their own CN) and copy buffering (common essentials positively correlated).
    Returns an empty dict (no metrics are produced).
    '''
    aligned_ge, aligned_cn = gene_effect.align(copy_number, join='inner')
    per_gene_corr = aligned_ge.corrwith(aligned_cn)
    per_gene_mean = aligned_ge.mean()
    target_ax = plt.gca() if ax is None else ax
    plt.sca(target_ax)
    binplot(per_gene_mean, per_gene_corr, **binplot_args)
    plt.xlabel("Gene Effect Mean")
    plt.ylabel("Gene Effect R with CN")
    return {}


def guide_estimate_corr_vs_sd_scatter(predicted_lfc, observed_lfc,
        ax=None,
        metrics=None, legend=True, legend_args={},
        density_scatter_args={}
    ):
    '''
    For each sgRNA (column), correlates the model-predicted log fold-change with the
    observed log fold-change and scatters that correlation (y) against the sgRNA's SD
    in the observed matrix (x). Guides with lower SD carry less signal and may show
    lower correlation. Returns the median correlation over all guides and over the
    20% most variable guides, or writes them into `metrics` if that dict is supplied.
    '''
    guide_corr = predicted_lfc.corrwith(observed_lfc).dropna()
    guide_sd = observed_lfc.std().loc[guide_corr.index]
    density_scatter(guide_sd, guide_corr, ax=ax, **density_scatter_args)
    plt.xlabel("Guide LFC SD")
    plt.ylabel("Guide LFC Estimated/Observed R")

    most_variable = guide_sd > guide_sd.quantile(.8)
    summary = {
        "corrs_median": guide_corr.median(),
        "corrs_median_20ile_most_variable": guide_corr[most_variable].median()
    }
    target_ax = plt.gca() if ax is None else ax
    if legend:
        legend_lines = ["%s: %1.2f" % (key.replace("_", ' '), val)
                        for key, val in summary.items()]
        plt.legend(handles=append_to_legend_handles(legend_lines, target_ax),
            loc="lower right", **legend_args)
    if metrics is not None:
        metrics.update(summary)
        return None
    return summary
def guide_estimate_corr_vs_guide_efficacy_scatter(predicted_lfc, observed_lfc,
        guide_efficacy,
        ax=None,
        metrics=None, legend=False, legend_args={},
        density_scatter_args={}
    ):
    '''
    Given the two matrices of log fold-change, one predicted by the model (`predicted_lfc`)
    with guides as columns and replicates as rows, computes the correlation between
    each sgRNA, then plots that correlation with the `pandas.Series` `guide_efficacy`
    which should be estimated by the model. In general we expect lower fidelity between
    predicted and observed sgRNAs for guides with low efficacy.
    Returns an empty dict (no metrics are produced), or updates `metrics` with nothing.
    '''
    # per-guide (column-wise) correlation of predicted vs observed LFC
    corrs = predicted_lfc.corrwith(observed_lfc)
    guide_efficacy = guide_efficacy.loc[corrs.index]
    density_scatter(guide_efficacy, corrs, ax=ax, **density_scatter_args)
    plt.xlabel("Guide Efficacy")
    plt.ylabel("Guide LFC R")

    # no metrics defined for this plot; kept for interface consistency
    out = {
    }
    if ax is None:
        ax = plt.gca()
    if legend:
        handles = append_to_legend_handles([
            "%s: %1.2f" % (key.replace("_", ' '), val)
            for key, val in out.items()
        ], ax)
        plt.legend(handles=handles, loc="lower right", **legend_args)
    if metrics is None:
        return out
    else:
        metrics.update(out)
def predicted_vs_observed_readcounts(predicted_readcounts, observed_readcounts,
        ax=None, max_points=10000,
        metrics=None, legend=True, legend_args={},
        density_scatter_args={"alpha": .5, "s": 10, "diagonal":True}
    ):
    '''
    Given the two normalized matrices of readcounts, one predicted by the model (`predicted_readcounts`)
    and one observed (`observed_readcounts`),
    with guides as columns and replicates as rows, produces a scatter plot with observations
    on the x axis and predictions on y. Points are subsampled to `max_points`.
    For Chronos, very low observed readcounts will be systematically
    predicted to have more, due to the structure of counts noise (it is more likely to observe few counts
    if the real expectation is high than vice versa). If the total trend of readcounts is above or
    below the diagonal however, that may indicate a normalization problem.
    Returns the correlation (of the subsampled points), mean difference (should be near 0), and median
    difference (should also be near 0); both differences are computed over the full matrices.
    '''
    # log10(+1) so zero counts remain representable
    estimated = pd.DataFrame(np.log10(predicted_readcounts.values+1),
        index=predicted_readcounts.index,
        columns=predicted_readcounts.columns
    )
    observed = pd.DataFrame(np.log10(observed_readcounts.values+1),
        index=observed_readcounts.index,
        columns=observed_readcounts.columns
    )
    estimated, observed = estimated.align(observed, join="inner")
    stacked_est = np.ravel(estimated.values)
    stacked_obs = np.ravel(observed.values)

    if len(stacked_est) > max_points:
        # BUGFIX: sample WITHOUT replacement so no point is plotted or scored twice
        # (the previous call used the default replace=True)
        chosen = np.random.choice(len(stacked_est), size=max_points, replace=False)
    else:
        chosen = np.arange(len(stacked_est))
    stacked_obs = stacked_obs[chosen]
    stacked_est = stacked_est[chosen]
    mask = pd.notnull(stacked_est) & pd.notnull(stacked_obs)
    stacked_est = pd.Series(stacked_est[mask])
    stacked_obs = pd.Series(stacked_obs[mask])
    density_scatter(stacked_obs, stacked_est,
        ax=ax, **density_scatter_args)
    plt.xlabel("Observed Readcounts (Log10)")
    plt.ylabel("Estimated Readcounts (Log10)")
    # displacement metrics use the full (unsampled) matrices
    diff = pd.DataFrame(observed.values-estimated.values,
        index=observed.index, columns=observed.columns)
    out = {
        "readcount_estimate_corr": pearsonr(stacked_est.values, stacked_obs.values)[0],
        "readcount_estimate_mean_displacement": diff.mean().mean(),
        "readcount_estimate_median_displacement": diff.median().median()
    }
    if ax is None:
        ax = plt.gca()
    if legend:
        handles = append_to_legend_handles([
            "%s: %1.2f" % (key.replace("_", ' '), val)
            for key, val in out.items()
        ], ax)
        plt.legend(handles=handles, loc="lower right", fontsize=8, **legend_args)
    if metrics is None:
        return out
    else:
        metrics.update(out)
def lfc_corr_vs_excess_variance(predicted_lfc, observed_lfc, excess_variance,
        ax=None,
        metrics=None, legend=True, legend_args={'loc': 'upper right'},
        density_scatter_args={"alpha": .5, "s": 10, 'trend_line': False, 'label_outliers':5}
    ):
    '''
    Given the two matrices of log fold-change, one predicted by the model (`predicted_lfc`)
    and one observed (`observed_lfc`), with guides as columns and replicates as rows,
    computes the correlation between predicted and observed values for each replicate (row),
    then scatters that correlation (y) against the replicate's `excess_variance`
    (`pandas.Series`, x, log10 scale). Replicates with higher excess variance are noisier,
    so lower fidelity between predicted and observed LFC is expected there.
    Returns the median and minimum per-replicate correlation plus the 10 lowest-correlated
    replicate labels, or writes these into `metrics` if that dict is supplied.
    '''
    # per-replicate (row-wise) correlation of predicted vs observed LFC
    corrs = predicted_lfc.corrwith(observed_lfc, axis=1).dropna()
    corrs, excess_variance = corrs.align(excess_variance.dropna(), join='inner')
    if ax is None:
        ax = plt.gca()
    if excess_variance.dropna().nunique() == 1:
        # all screens share one excess variance value: jitter so points don't overplot
        excess_variance = excess_variance + np.random.uniform(0, excess_variance.dropna().iloc[0]/10, size=len(excess_variance))
        ax.set_xlabel("Screen Excess Variance (Log10) jittered")
    else:
        ax.set_xlabel("Screen Excess Variance (Log10)")
    density_scatter(np.log10(excess_variance), corrs,
        ax=ax, **density_scatter_args)

    ax.set_ylabel("Correlation Predicted/Observed LFC")

    out = {
        "lfc_cell_corrs_median": corrs.median(),
        "lfc_cell_corrs_min": corrs.min(),
    }
    if legend:
        handles = append_to_legend_handles([
            "%s: %1.2f" % (key.replace("_", ' '), val)
            for key, val in out.items()
        ], ax)
        plt.legend(handles=handles, **legend_args)
    # added after the legend so it is not rendered there
    out['lfc_cell_corrs_low'] = corrs.sort_values().index[:10]
    if metrics is None:
        return out
    else:
        metrics.update(out)
lfc_corr_vs_excess_variance(predicted_lfc, observed_lfc, excess_variance, 1023 | ax=None, 1024 | metrics=None, legend=True, legend_args={'loc': 'upper right'}, 1025 | density_scatter_args={"alpha": .5, "s": 10, 'trend_line': False, 'label_outliers':5} 1026 | ): 1027 | ''' 1028 | Given the two matrices of log fold-change, one predicted by the model (`predicted_lfc`) 1029 | with guides as columns and replicates as rows, computes the correlation between 1030 | each replicate, then correlates that correlation with the `pandas.Series` `guide_efficacy` 1031 | which should be estimated by the model. In general we expect lower fidelity between 1032 | predicted and observed sgRNAs for guides with low efficacy. 1033 | ''' 1034 | corrs = predicted_lfc.corrwith(observed_lfc, axis=1).dropna() 1035 | corrs, excess_variance = corrs.align(excess_variance.dropna(), join='inner') 1036 | if ax is None: 1037 | ax = plt.gca() 1038 | if excess_variance.dropna().nunique() == 1: 1039 | excess_variance = excess_variance + np.random.uniform(0, excess_variance.dropna().iloc[0]/10, size=len(excess_variance)) 1040 | ax.set_xlabel("Screen Excess Variance (Log10) jittered") 1041 | else: 1042 | ax.set_xlabel("Screen Excess Variance (Log10)") 1043 | density_scatter(np.log10(excess_variance), corrs, 1044 | ax=ax, **density_scatter_args) 1045 | 1046 | ax.set_ylabel("Correlation Predicted/Observed LFC") 1047 | 1048 | out = { 1049 | "lfc_cell_corrs_median": corrs.median(), 1050 | "lfc_cell_corrs_min": corrs.min(), 1051 | } 1052 | if legend: 1053 | handles = append_to_legend_handles([ 1054 | "%s: %1.2f" % (key.replace("_", ' '), val) 1055 | for key, val in out.items() 1056 | ], ax) 1057 | plt.legend(handles=handles, **legend_args) 1058 | out['lfc_cell_corrs_low'] = corrs.sort_values().index[:10] 1059 | if metrics is None: 1060 | return out 1061 | else: 1062 | metrics.update(out) 1063 | 1064 | 1065 | 1066 | def _mean_ge_deviation_vs_grad( 1067 | predicted_readcounts, observed_readcounts, guide_map, 
ge_mean_grad, 1068 | ax=None, 1069 | metrics=None, legend=True, legend_args={}, 1070 | density_scatter_args={"alpha": .5, "s": 10} 1071 | ): 1072 | ''' 1073 | A Chronos-specific plot. 1074 | Given the two normalized matrices of readcounts, one predicted by the model (`predicted_readcounts`) 1075 | and one observed (`observed_readcounts`), with guides as columns and replicates as rows, 1076 | computes the difference in mean log readcounts predicted from observed for each gene by 1077 | taking the mean of each sgRNA's difference of mean log readcounts. This is plotted vs 1078 | the NB2 cost gradient on the gene's mean value. Genes with systematically higher predicted 1079 | than observed readcounts should have negative cost gradients and vice versa. This is useful 1080 | for Chronos debugging. 1081 | ''' 1082 | estimated = pd.DataFrame(np.log10(predicted_readcounts.values+1), 1083 | index=predicted_readcounts.index, 1084 | columns=predicted_readcounts.columns 1085 | ) 1086 | observed = pd.DataFrame(np.log10(observed_readcounts.values+1), 1087 | index=observed_readcounts.index, 1088 | columns=observed_readcounts.columns 1089 | ) 1090 | estimated, observed = estimated.align(observed) 1091 | diff = estimated.mean() - observed.mean() 1092 | diff_gene = diff.groupby(guide_map.set_index("sgrna").gene).mean() 1093 | density_scatter(diff_gene, ge_mean_grad, 1094 | ax=ax, **density_scatter_args) 1095 | plt.xlabel("Estimated - Observed Readcounts (Log10)") 1096 | plt.ylabel("Mean Gene Effect Cost Gradient") 1097 | 1098 | 1099 | def check_integration_umap(gene_effect, sequence_map, 1100 | variance_quantile=.5, 1101 | ax=None, metrics=None, legend=True, 1102 | legend_args=dict(loc='upper left', bbox_to_anchor=(1, 1.05)), 1103 | scatter_args=dict(alpha=1, s=10) 1104 | ): 1105 | ''' 1106 | Given the matrix of `gene_effect` and a `dict` of `sequence_map`s (see chronos.Chronos doc string 1107 | for format), creates a UMAP embedding of cell lines in gene effect space, colored by the 
    presence of the cell lines in the various batches indicated by the keys of `sequence_map`.
    To make the legend a manageable size, batch names are abbreviated to two letters. Returns the
    max variance explained by batch membership as evaluated by finding the principle components of
    `gene_effect`, correlating them with batch membership indicators, and multiplying that squared
    correlation with the variance explained by the component, summed over all PCs (one result
    per batch), returning the result for the batch that explains the most variance. This is useful
    to evaluate how well different batches are integrated.
    '''
    if not umap_present:
        raise ModuleNotFoundError("umap must be installed to use this plot")
    # short abbreviations for the batch names, used to build compact legend labels
    aliases = _make_aliases(list(sequence_map.keys()))
    # one color per subset of batches: a cell line can be screened in several batches
    palette = generate_powerset_palette(sequence_map.keys(), start=0, base_hsv_value=1)
    keysets = powerset(sequence_map.keys())
    keyset_lines = {}
    # boolean matrix: True where the cell line (row) appears in the batch (column);
    # the pDNA pseudo-line is excluded
    indicators = pd.DataFrame({
        key: pd.Series(True,
            index=sorted(set(sequence_map[key].cell_line_name) - set(['pDNA']))
        )
        for key in sequence_map
    })
    indicators.fillna(False, inplace=True)
    gene_effect, indicators = gene_effect.align(indicators, join="inner", axis=0)
    # keep only the more variable genes (above variance_quantile) for the embedding
    sds = gene_effect.std()
    cutoff = sds.quantile(variance_quantile)
    gene_effect = gene_effect[sds.loc[lambda x: x>cutoff].index]
    # assign each cell line to the exact subset of batches it was screened in
    for keyset in keysets:
        if not len(keyset):
            continue
        # lines present in every batch of this subset...
        lines = set.intersection(*[set(sequence_map[key].cell_line_name) - set(['pDNA'])
            for key in keyset]) & set(gene_effect.index)
        if len(keyset) < len(sequence_map):
            # ...and in none of the remaining batches
            lines -= set.union(*[set(sequence_map[key].cell_line_name) - set(['pDNA'])
                for key in sequence_map.keys()
                if not key in keyset
            ])
        keyset_lines[keyset] = sorted(lines)

    ump = UMAP(n_neighbors=5, min_dist=.02)
    umps = pd.DataFrame(ump.fit_transform(gene_effect.dropna(axis=1).values), index=gene_effect.index)
    for keyset, lines in keyset_lines.items():
        # thicker black outline for lines shared by more batches
        plt.scatter(umps.loc[lines, 0], umps.loc[lines, 1], label=''.join(aliases[list(keyset)]),
            color=palette[keyset],
            linewidth=(len(keyset)-1)/2, edgecolor='black', **scatter_args)
    plt.xlabel("UMAP1")
    plt.ylabel("UMAP2")
    if legend:
        plt.legend(**legend_args)

    out = {}

    # variance explained by batch membership: squared correlation of each PC with
    # each batch indicator, weighted by the PC's explained variance ratio and
    # summed over PCs; report the worst-integrated (max) batch
    pca = PCA()
    pcs = pd.DataFrame(pca.fit_transform(gene_effect.dropna(axis=1).values), index=gene_effect.index)
    corrs_squared = fast_cor(pcs, indicators.astype(float))**2
    out['library_pc_variance_explained_max'] = corrs_squared\
        .multiply(pca.explained_variance_ratio_, axis=0)\
        .sum()\
        .sort_values(ascending=False)\
        .max()
    if metrics is None:
        return out
    else:
        metrics.update(out)



def check_integration_mean_deviation(gene_effect, sequence_map,
        ax=None, metrics=None, legend=True,
        legend_args=dict(fontsize=7),
        plot_args=dict(lw=1)
        ):
    '''
    Given the matrix of `gene_effect` and a `dict` of `sequence_map`s (see chronos.Chronos doc string
    for format), calculates the mean gene effect for each gene within each batch of the sequence map,
    then the squared difference between that mean and the overall mean. This is plotted as a trend line
    per batch vs the overall gene mean. Returns the mean of the square root of this per-batch variance
    from the overall mean, and the genes with the largest variance in each batch.
1184 | ''' 1185 | keyset_lines = {} 1186 | indicators = pd.DataFrame({ 1187 | key: pd.Series(True, 1188 | index=sorted(set(sequence_map[key].cell_line_name) - set(['pDNA'])) 1189 | ) 1190 | for key in sequence_map 1191 | }) 1192 | means1 = gene_effect.mean() 1193 | cutoffs = means1.quantile([min(.1, 100/len(means1)), max(.9, 1-100/len(means1))]) 1194 | keep = means1.loc[lambda x: (x < cutoffs.iloc[1]) & (x > cutoffs.iloc[0])].index 1195 | gene_effect = gene_effect[keep] 1196 | indicators.fillna(False, inplace=True) 1197 | gene_effect, indicators = gene_effect.align(indicators, join="inner", axis=0) 1198 | library_means = pd.DataFrame({ 1199 | library: gene_effect[indicators[library]].mean() 1200 | for library in indicators 1201 | }) 1202 | means = gene_effect.mean() 1203 | if ax is None: 1204 | ax = plt.gca() 1205 | else: 1206 | plt.sca(ax) 1207 | for library in indicators: 1208 | y = (library_means[library]-means)**2 1209 | trend = np.clip(lowess_trend(means, y), 0, np.inf) 1210 | order = np.argsort(means) 1211 | plt.plot(means.iloc[order], trend[order], label=library, **plot_args) 1212 | plt.xlabel("Gene Mean Overall") 1213 | plt.ylabel("Gene Mean Variance Trend") 1214 | 1215 | out = {} 1216 | sd = gene_effect.std() 1217 | normed_library_sd = np.sqrt(indicators.astype(float).sum()) * np.abs(library_means.subtract(means, axis=0)) 1218 | out['normed_library_deviation'] = normed_library_sd.mean().mean() 1219 | if legend: 1220 | handles = append_to_legend_handles([ 1221 | "%s: %1.2f" % (key.replace("_", ' '), val) 1222 | for key, val in out.items() 1223 | ], ax) 1224 | plt.legend(handles=handles, **legend_args) 1225 | 1226 | out['library_outliers'] = {key: normed_library_sd[key].dropna().sort_values()[-5:] 1227 | for key in normed_library_sd} 1228 | 1229 | 1230 | if ax is None: 1231 | ax = plt.gca() 1232 | else: 1233 | plt.sca(ax) 1234 | 1235 | if metrics is None: 1236 | return out 1237 | else: 1238 | metrics.update(out) 1239 | 1240 | 1241 | def guide_lfc_plot(lfc, 
palette): 1242 | '''convenience method for kde plotting a subset of sgRNA's log fold change with fixed color for each sgRNA''' 1243 | for j, key in enumerate(lfc.keys()): 1244 | for guide in palette[key].index: 1245 | sns.kdeplot(lfc[key][guide], label=key + guide[:4], bw_adjust=.5, color=palette[key][guide], 1246 | lw=.5) 1247 | 1248 | 1249 | def guide_palette(guide_map, gene): 1250 | ''' 1251 | Returns a palette with a unique color for each sgRNA in `guide_map` targeting `gene`. 1252 | ''' 1253 | start = np.pi * np.arange(len(guide_map))/len(guide_map) 1254 | palette = {} 1255 | for i, key in enumerate(guide_map): 1256 | guides = guide_map[key].query("gene == %r" % gene).sgrna.unique() 1257 | palette[key] = pd.Series( 1258 | sns.cubehelix_palette(len(guides), start=start[i], rot=.25/len(guide_map), dark=.35, light=.7, hue=1), 1259 | index=guides 1260 | ) 1261 | return palette 1262 | 1263 | 1264 | def interrogate_gene(data, naive, naive_collapsed, gene, plot_width, plot_height): 1265 | ''' 1266 | Creates a set of summary plots for a given gene effect profile. 1267 | Parameters: 1268 | `data` (`dict`): must contain (all of these files can be loaded from a `chronos.Chronos.save` directory) 1269 | "gene_effect": `pandas.DataFrame` with genes as columns, 1270 | "logfoldchange": `pandas.DataFrame` with sgRNAs as columns, 1271 | "guide_efficacy": `pandas.Series` indexed by sgRNA with efficacy estimates, 1272 | "t0_offset": `pandas.Series` indexed by sgRNA with offset estimates, 1273 | "library effect": pandas.DataFrame` with genes as columns, 1274 | `naive` (`dict`): contains a `pandas.DataFrame` matrix per batch with naive estimates of gene effect 1275 | (typically median log fold change over guides per gene and replicates per cell line) 1276 | `naive_collapsed`: a `pandas.DataFrame` matrix holding he consensus naive estimate over all libraries. 1277 | Easily calculcated from `chronos.reports.collapse_dataframes`. 
        `gene` (`str`): the gene of interest
        `plot_width`, `plot_height`: the total width of the figure and the height of individual panels, in inches.
    Returns:
        `matplotlib.Figure`
    '''
    palette = guide_palette(data['guide_map'], gene)
    # 3x2 grid of panels, flattened for easy indexing
    fig, axes = plt.subplots(3, 2, figsize=(plot_width, plot_height*2.5))
    axes = [a for ax in axes for a in ax]

    # panel 0: Chronos gene effect vs the collapsed naive (median LFC) estimate
    plt.sca(axes[0])
    density_scatter(naive_collapsed[gene], data["gene_effect"][gene],
        diagonal=True, label_outliers=5, outliers_from='diagonal')
    plt.xlabel("Naive Gene Effect")
    plt.ylabel("Gene Effect")

    # panel 1: observed LFC distribution per sgRNA, one curve per guide/library
    plt.sca(axes[1])
    for j, key in enumerate(data['logfoldchange'].keys()):
        for guide in palette[key].index:
            sns.kdeplot(data['logfoldchange'][key][guide], label=key + '_' + guide[:4], bw_adjust=.5,
                color=palette[key][guide],
                lw=1)
    plt.legend(fontsize=6)
    plt.xlabel("Guide LFC")

    # panel 2: estimated guide efficacy vs t0 offset, labeled per guide
    plt.sca(axes[2])
    labels = []
    for library in palette:
        # guides without an estimate are placed at -.1 so they remain visible
        x = data['guide_efficacy'].reindex(palette[library].index).fillna(-.1)
        y = data['t0_offset'][library].reindex(palette[library].index).fillna(-.1)
        plt.scatter(
            x, y,
            s=20, alpha=.75, linewidth=1, color=palette[library]
        )
        labels.extend([plt.text(s='%s_%s' % (library, ind[:4]),
            x=x[ind],
            y=y[ind],
            fontsize=6, color=palette[library][ind]
        ) for ind in palette[library].index])
    if adjustText_present:
        adjust_text(labels, arrowprops=dict(arrowstyle='-', color="black", lw=.5))
    plt.xlabel("Guide Efficacy")
    plt.ylabel("T0 Guide Offset")

    # panel 3: per-library naive mean for this gene vs the model's library effect
    plt.sca(axes[3])
    x = pd.Series({library: naive[library][gene].mean()
        for library in naive
        if gene in naive[library]})
    y = data['library_effect'].loc[gene]
    # one representative color per library: its first guide's color
    colors = pd.Series({library: palette[library].iloc[0]
        for library in palette
        if len(palette[library])})
    x, y = x.dropna().align(y.dropna(), join="inner")
    x, colors = x.align(colors.dropna(), join="inner")
    # realign after the colors join may have shrunk x
    x, y = x.align(y, join="inner")
    plt.scatter(x, y, s=20, alpha=.75, linewidth=1, c=colors)
    labels = [plt.text(
        s=ind,
        x=x[ind],
        y=y[ind],
        fontsize=8, color=colors[ind]
    ) for ind in x.index]
    if adjustText_present:
        adjust_text(labels,
            arrowprops=dict(arrowstyle='-', color="black", lw=.5))
    plt.xlabel("Library Naive Gene Average")
    plt.ylabel("Library Effect")

    # panels 4 and 5: predicted vs observed guide LFC in the cell lines where
    # this gene's effect is most and least depleting
    sorted_ge = data['gene_effect'][gene].sort_values().dropna()
    lowest_line = sorted_ge.index[0]
    highest_line = sorted_ge.index[-1]

    plt.sca(axes[4])
    single_line_interrogation(data, gene, lowest_line)
    plt.title('%s in %s (Lowest)' % (gene, lowest_line), fontsize=10)

    plt.sca(axes[5])
    single_line_interrogation(data, gene, highest_line)
    plt.title('%s in %s (Highest)' % (gene, highest_line), fontsize=10)

    return fig


def single_line_interrogation(data, gene, line, ax=None,
        density_scatter_args={'trend_line': False, 'diagonal': True}
        ):
    '''
    A scatterplot of predicted vs observed log fold-change of sgRNAs for the selected gene
    in screened replicates of the selected line.
    Parameters:
        `data` (`dict`): see `interrogate_gene`
        `gene` (`str`): the gene of interest
        `line` (`str`): the cell line of interest.
    '''
    if not ax is None:
        plt.sca(ax)
    # sgRNAs targeting this gene, per library
    guides = {library: data['guide_map'][library].query("gene == %r" % gene).sgrna.unique()
        for library in data['guide_map']}
    # replicate sequence IDs screened for this cell line, per library
    sequences = {library: data['sequence_map'][library].query("cell_line_name == %r" % line).sequence_ID.unique()
        for library in data['sequence_map']
    }

    abbreviated_guide_mapper = {}
    abbreviated_replicate_mapper = {}
    stacked_lfc = []
    stacked_lfc_predicted = []
    aliases = _make_aliases(list(data['logfoldchange'].keys()))

    def consolidate_index(index):
        # collapse a (replicate, guide) MultiIndex of abbreviated labels into
        # single '<lib>Rep<i>Guide<j>' strings, checking both levels agree on
        # the library alias
        out = []
        for v in list(index):
            lib1, rep = v[0].split('Rep')
            lib2, guide = v[1].split('Guide')
            if lib1 != lib2:
                raise ValueError("Something went wrong in abbreviating index labels for log fold change")
            out.append( '%sRep%sGuide%s' % (lib1, rep, guide))
        return out

    for key, lfc in data['logfoldchange'].items():
        subset = lfc.loc[sequences[key], guides[key]]
        # map long sgRNA sequences and sequence IDs to short numbered labels
        abbreviated_guide_mapper.update({guide: '%sGuide%i' %(aliases[key], i+1)
            for i, guide in enumerate(guides[key])})
        abbreviated_replicate_mapper.update({sequence: '%sRep%i' %(aliases[key], i+1)
            for i, sequence in enumerate(sequences[key])})
        subset_predicted = data['predicted_logfoldchange'][key].loc[sequences[key], guides[key]]
        subset.rename(index=abbreviated_replicate_mapper, columns=abbreviated_guide_mapper, inplace=True)
        subset_predicted.rename(index=abbreviated_replicate_mapper, columns=abbreviated_guide_mapper, inplace=True)
        # stack to one value per (replicate, guide) pair, then flatten the index
        stacked = subset.stack()
        stacked_predicted = subset_predicted.stack()
        stacked.index = consolidate_index(stacked.index)
        stacked_predicted.index = consolidate_index(stacked_predicted.index)
        stacked_lfc.append(stacked)
        stacked_lfc_predicted.append(stacked_predicted)
    x = pd.concat(stacked_lfc)
    y = pd.concat(stacked_lfc_predicted)
    x, y = x.align(y, join="inner")
    if not len(x):
        # nothing to plot: the gene or line is absent from all libraries
        return
    density_scatter(x, y, **density_scatter_args)
    texts = [plt.text(s=ind, x=x[ind], y=y[ind], fontsize=7) for ind in x.index]
    if adjustText_present:
        adjust_text(texts, arrowprops=dict(arrowstyle="-", color="black", lw=.5))
    plt.xlabel("Observed LFC")
    plt.ylabel("Predicted LFC")
    plt.title('%s in %s' % (gene, line))
    # print the abbreviation keys so the short point labels can be decoded
    print("Guide and replicate key for %s, %s:\n%r\n%r\n%r" % (gene, line, aliases, pd.Series(abbreviated_guide_mapper),
        pd.Series(abbreviated_replicate_mapper)))


def interrogate_gene_compare(paired_data, lfc, guide_map, gene, plot_width, plot_height):
    '''
    Creates a set of comparison plots for results from two different models for a specific gene.
    This is mostly useful for internal Chronos development.
    Parameters:
        `paired_data` (`dict`): must contain two keys labeling `data` (`dict`) from two different models.
            See `interrogate_gene` for the format of `data`.
        `lfc` (`dict`): one key per batch, with the value being a `pandas.DataFrame` of observed
            log fold change, with sgRNAs as columns.
        `guide_map` (`pandas.DataFrame`): see `chronos.Chronos` for format.
        `gene` (`str`): the gene to examine.
        `plot_width`, `plot_height`: the total width of the figure and the height of individual panels, in inches.
    Returns:
        `matplotlib.Figure`
    '''
    keys = list(paired_data.keys())
    palette = guide_palette(guide_map, gene)
    # 2x2 grid of panels, flattened for easy indexing
    fig, axes = plt.subplots(2, 2, figsize=(plot_width, plot_height))
    axes = [a for ax in axes for a in ax]

    # panel 0: gene effect, model 1 vs model 2
    plt.sca(axes[0])
    density_scatter(paired_data[keys[0]]["gene_effect"][gene], paired_data[keys[1]]["gene_effect"][gene],
        diagonal=True, label_outliers=5, outliers_from='diagonal')
    plt.xlabel(keys[0])
    plt.ylabel(keys[1])
    plt.title('%s Gene Effect' % gene)

    # panel 2 (lower left): observed per-guide LFC distributions (model-independent)
    plt.sca(axes[2])
    for j, key in enumerate(lfc.keys()):
        for guide in palette[key].index:
            sns.kdeplot(lfc[key][guide], label=key + '_' + guide[:4], bw_adjust=.5,
                color=palette[key][guide],
                lw=1)
    plt.legend(fontsize=6)
    plt.xlabel("Guide LFC")

    # panel 3 (lower right): guide efficacy, model 1 vs model 2
    plt.sca(axes[3])
    plt.title("Guide Efficacy")
    labels = []
    for library in lfc:
        # guides without an estimate are placed at -.1 so they remain visible
        x = paired_data[keys[0]]['guide_efficacy'].reindex(palette[library].index).fillna(-.1)
        y = paired_data[keys[1]]['guide_efficacy'].reindex(palette[library].index).fillna(-.1)
        plt.scatter(
            x, y,
            s=20, alpha=.75, linewidth=1, color=palette[library]
        )
        labels.extend([plt.text(s='%s_%s' % (library, ind[:4]),
            x=x[ind],
            y=y[ind],
            fontsize=6, color=palette[library][ind]
        ) for ind in palette[library].index])
    if adjustText_present:
        adjust_text(labels, arrowprops=dict(arrowstyle='-', color="black", lw=.5))
    plt.xlabel(keys[0])
    plt.ylabel(keys[1])

    # panel 1 (upper right): correlation of each model's gene effect with the
    # naive per-line median LFC of this gene's guides, model 1 vs model 2
    plt.sca(axes[1])
    corrs = {}
    for key in keys:
        corrs[key] = {}
        for library in lfc:
            naive = lfc[library][palette[library].index]\
                .groupby(paired_data[key]['sequence_map'][library].set_index("sequence_ID").cell_line_name)\
                .median()
            series = fast_cor(
                paired_data[key]['gene_effect'][[gene]],
                naive
            ).loc[gene]
            corrs[key][library] = series
    labels = []
    for library in lfc:
        # NOTE(review): the two series are passed positionally; this assumes both
        # models' sequence maps yield the same cell-line ordering — confirm
        plt.scatter(corrs[keys[0]][library], corrs[keys[1]][library],
            color=palette[library], s=15, alpha=.75)
        labels.extend([plt.text(s='%s_%s' % (library, ind[:4]),
            x=corrs[keys[0]][library][ind],
            y=corrs[keys[1]][library][ind],
            fontsize=6, color=palette[library][ind]
        ) for ind in corrs[keys[0]][library].index])
    if adjustText_present:
        adjust_text(labels, arrowprops=dict(arrowstyle='-', color="black", lw=.5))
    plt.xlabel(keys[0])
    plt.ylabel(keys[1])
    plt.title("Gene Effect - Guide LFC Corr")

    return fig