├── .gitignore
├── Data
│   ├── Database_Citations.xlsx
│   ├── DisGeNET_genesets.txt
│   ├── DisGeNET_genesets_AUPRCs.csv
│   ├── DisGeNET_genesets_Effect_Size.csv
│   ├── GWAS_Catalog_genesets.txt
│   ├── Oncogenic_Components_genesets.txt
│   ├── Oncogenic_genesets_AUPRCs.csv
│   └── Oncogenic_genesets_Effect_Size.csv
├── LICENSE.txt
├── Network Evaluation Examples
│   ├── Network Evaluation Example.ipynb
│   └── run_network_evaluation.py
├── Network Processing Notebooks
│   ├── BIND Processing.ipynb
│   ├── BioGRID Processing.ipynb
│   ├── BioPlex Processing.ipynb
│   ├── ConsensusPathDB Processing.ipynb
│   ├── DIP Processing.ipynb
│   ├── Degree-Preserved Network Shufflings.ipynb
│   ├── GIANT Processing.ipynb
│   ├── GeneMANIA Processing.ipynb
│   ├── HINT Processing.ipynb
│   ├── HPRD Processing.ipynb
│   ├── HumanInteractome Processing.ipynb
│   ├── HumanNet Processing.ipynb
│   ├── InBioMap Processing.ipynb
│   ├── IntAct Processing.ipynb
│   ├── Mentha Processing.ipynb
│   ├── MultiNet Processing.ipynb
│   ├── PID Processing.ipynb
│   ├── Pathway Commons Processing.ipynb
│   ├── Reactome Processing.ipynb
│   ├── Reactome-FIs Processing.ipynb
│   ├── STRING Processing.ipynb
│   └── iRefIndex Processing.ipynb
├── README.md
├── network_evaluation_tools
│   ├── .ipynb_checkpoints
│   │   ├── PSN Construction-checkpoint.ipynb
│   │   └── SBNE Method-checkpoint.ipynb
│   ├── __init__.py
│   ├── data_import_tools.py
│   ├── gene_conversion_tools.py
│   ├── miscellaneous_functions.py
│   ├── network_evaluation_functions.py
│   └── network_propagation.py
└── setup.py
/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.pyc 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | .ipynb_checkpoints/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # Environments 84 | .env 85 | .venv 86 | env/ 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /Data/Database_Citations.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idekerlab/Network_Evaluation_Tools/4c0017e3cc3fa7767f5172cea76b4f3f7d8d0b0b/Data/Database_Citations.xlsx -------------------------------------------------------------------------------- /Data/DisGeNET_genesets_AUPRCs.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idekerlab/Network_Evaluation_Tools/4c0017e3cc3fa7767f5172cea76b4f3f7d8d0b0b/Data/DisGeNET_genesets_AUPRCs.csv -------------------------------------------------------------------------------- /Data/DisGeNET_genesets_Effect_Size.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idekerlab/Network_Evaluation_Tools/4c0017e3cc3fa7767f5172cea76b4f3f7d8d0b0b/Data/DisGeNET_genesets_Effect_Size.csv -------------------------------------------------------------------------------- /Data/Oncogenic_Components_genesets.txt: -------------------------------------------------------------------------------- 1 | C1: ERBB3 / PI3K ESRP1 PRSS8 TMEM125 GRHL2 RP11-388M20.2 RP11-354M1.2 CDH1 C1orf210 CRB3 ESRP2 GALNT3 ERBB3 TC2N CCDC64B RP11-429J17.6 TMC4 CDH3 MARVELD3 OVOL2 EPS8L1 CDS1 CDC42BPG PVRL4 ATP2C2 LSR LLGL2 MAP7 SPINT2 DSP GRB7 C19orf21 EPS8L2 C6orf132 F11R SH2D3A RP11-615I2.2 GRHL1 GPR56 CHMP4C SLC44A2 RHOD PRRG2 RP11-22C11.2 ARHGEF16 RGL3 SIGIRR TMEM184A RNF223 AIF1L MYO6 HOOK2 MYO5B ARHGEF35 CNKSR1 MARVELD2 SMPDL3B HOOK1 TTC9 ARHGEF5 CXCL16 ATP8B1 CST6 SYT7 RP4-798C17.5 COBL TPD52L1 RGL2 PRRG4 LITAF 2 | C2: MYC / E2F TEAD2 SMO FBXO2 PACSIN3 AC008738.2 CEBPA PFAS CENPV CTSL2 FKBP10 IL27RA CTSL1 TRAP1 FBLN1 VIM RP11-124N14.3 ETV4 GEMIN5 TRIM65 RP11-40H20.2 TGFB1 NOB1 CTD-2033A16.3 EXOSC2 LIX1L PPAT RPIA AC006111.1 LYAR AMPD2 ERCC1 CDKN2AIPNL CTD-2165H16.1 C20orf20 CCDC85B RP4-765A10.1 RCOR2 RNMTL1 RSAD1 PLOD1 CDCA5 LEPREL2 GNL3 CACYBP NOP56 EIF3E FAM216A CD320 EEF1A1P19 CAP2 3 | C3: RAS / WNT / PI3K PRSS3 RP11-133O22.6 SLC17A9 ALDH2 CDX2 ETV4 CENPV SH3BGRL2 DUSP6 HOXA10 ABLIM1 STEAP1 HNF1B PHLDA1 RP11-867G2.8 DSG2 C1orf106 SMAGP C19orf21 SLC27A2 SPRY2 CD320 COL17A1 TIMP1 ERBB3 DSP SGPP2 EFHD2 ANXA3 SYK LSR NOB1 HMGA1 IL18 LLGL2 GLYCTK CHDH EEF1A1P5 SLC25A6 EPB49 EEF1A1 EEF1A1P6 RPL4 YBX1 RPL6 CRYL1 RPS24 RPL5 CTD-2033A16.3 EIF3E 4 | 
C4: EMT COL5A1 SPARC CDH11 CDH13 CCDC80 LTBP1 PCOLCE DKK3 TBX3 C1S KCNMA1 NEXN LEPREL2 ANPEP HEG1 RP11-443A13.3 COL16A1 ENG CNRIP1 GAS6 RP13-530H6.2 ADAMTSL1 EFEMP1 SRPX CD99 PALLD IGFBP4 IFFO1 ITGA5 SLC44A2 GNG11 VIM RP11-124N14.3 GPC1 TSPAN5 DPYSL3 FKBP10 C1orf198 MAN1A1 ATP8B1 CAP2 RCAN1 NDST1 PLOD1 EEF1A1P5 RAB13 RP11-342D11.2 EEF1A1 TIMP1 DDAH1 5 | C5: HNF1 / PAX8 CLDN1 EPS8L2 PAX8 LEPREL1 HNF1B ANXA3 DSG2 IL18 GNG11 WWC1 F11R LIMCH1 ELOVL7 CHMP4C ARHGEF5 TMEM56 GNAI1 PTPRJ BCAM RP11-124N14.3 STXBP2 VIM RP4-798C17.5 GPX8 ARHGEF35 LITAF SPINT2 HSPG2 LSR RIPK4 RHPN2 DSP PHLDB2 EPB49 PDGFB NXN LEPROT BAIAP2L1 PLCB4 RP11-54F2.1 RP11-342D11.2 CCDC80 ABLIM1 CELSR1 CTSL1 TPD52L1 PALLD KIAA1598 NDST1 UBE2H 6 | C6: BRAF / MAPK SRPX PLAT TNFRSF19 SPARC MITF ERBB3 SPRY2 DUSP6 GPR56 RENBP GNG11 VIM RP11-124N14.3 ETV4 PHLDA1 ST6GALNAC2 ENG NES SPRY4 AGPAT9 PHLDA3 TIMP1 CTSL1 RCAN1 PYGB FKBP10 NFATC2 IFFO1 PLOD1 RIPK4 EEF1A1P5 EEF1A1 UBL3 YBX1 EEF1A1P6 RPL4 SLC20A1 RPL6 CHST11 SLC6A15 VAT1 SLC25A6 ENTPD6 RPL5 CD320 HMGB1 GLT25D1 SPRED1 SSH1 HMGA1 7 | C7: TNF / NF-kB NT5E CDCP1 PHLDA1 CALB2 STEAP1 NRP1 RP11-342D11.2 PLAT MT1E ELK3 ANTXR2 AGPAT9 IRAK2 LINC00460 TM4SF19 RP11-394J1.2 HPCAL1 TGFB1 PRDM8 STX1A HMGA2 TIMP1 FMNL1 RAB31 ITGA5 PDP1 HRH1 CHST11 IL31RA TMEM158 RP11-124N14.3 C11orf68 VIM IGFBP4 ETV4 EFHD2 DUSP6 AC138150.4 TSPAN5 SLC20A1 MAP4K4 CCDC85B WDR54 FUT8 ADAM19 DST GEM DPYSL3 IL18 PHLDB2 DNER DSG2 CMTM3 AC005035.1 ARSJ GFPT2 CTSL1 EFEMP1 TPBG HMGA1 CAPRIN2 LYAR STMN3 FOXL1 GPX8 STAMBPL1 STK10 ARAP3 SMAGP HJURP RP11-221N13.3 ANXA3 CTHRC1 ITPRIP FKBP10 CLDN1 TOMM34 SERPINA1 TRBC2 8 | C8: MYC KIF1A CHGB DPYSL3 SYP SYT1 STMN3 PKIA VANGL2 AP3B2 UNC13A CENPV MAPRE2 CTA-221G9.10 DLL1 CNTNAP2 JPH1 ELOVL2 TMEM145 STXBP1 RP11-122A3.2 RCOR2 DNAJC6 ZNF512 STX1A VGF RIC3 SLC6A15 RIMS2 AGPAT5 RAP2A SSBP3 CD320 RIMS3 RP11-158I13.2 CXXC1 TPD52 CCDC64 HOOK1 SYT7 WDR54 IVNS1ABP NOP56 EEF1A1P5 AC005035.1 YBX1 EEF1A1 RAB3B EEF1A1P6 AC012379.1 PRPF19 RPL6 MAP4K4 HMGB1 CDCA5 RPL4 TTL RPL5 ATP2A2 U2AF2 SLC25A6 TTLL7 IPO5 YBX1P1 GNAI1 CACYBP MCL1 RPS24 SLC44A5 TSPYL2 PIH1D1 TSHZ1 HMGA1 EIF3E RHBDD2 GSK3B GOLIM4 GNL3 CBLN1 TMTC4 KHDRBS3 NEURL1B SH3BGRL2 KATNB1 GART PEX5L EIF3H ALDH2 SCN3B HJURP PSD PPAT CTB-79E8.2 SLC20A1 POLR3GL METTL9 GAB2 AMOTL1 ARL2 DDRGK1 COPS8 PYGB MEST NELF AGPAT6 MFSD6 EXOSC2 KIAA1324 RTN2 DAP TOMM34 ID1 GLT25D1 VAT1 FAM216A SRD5A1 ACN9 E2F4 TRAP1 CDKN2AIPNL DBF4 RPIA CXADR 9 | C9: RAS / AP1 KRT17 KRT5 GPR87 DSC3 DSG3 FBLN1 COL17A1 CDH3 FAT2 RP11-615I2.2 AL391137.1 NXN LEPREL1 IL18 CLDN1 PPP1R14C EFEMP1 GPC1 RHOD CDH1 CTSL2 CCDC80 DSP VANGL2 ST6GALNAC2 PHLDA3 TMEM40 LY6D C10orf54 CTSH ANXA3 BCAM RP11-354M1.2 CXCL16 FGFR2 DSG2 CREG1 RIPK4 LIMK2 MMP28 ID1 LSR F11R LITAF CELSR2 DAB2IP PHLDB2 C1orf106 TPD52L1 GNAI1 10 | -------------------------------------------------------------------------------- /Data/Oncogenic_genesets_AUPRCs.csv: -------------------------------------------------------------------------------- 1 | Oncogenic Component Gene Set,GeneMANIA,GIANT,STRING,ReactomeFI,Reactome,MultiNet,PathwayCommons,HumanNet,BioPlex,DIP,InBioMap,BioGRID,BIND,Mentha,IRefIndex,PID,HPRD,IntAct,ConsensusPathDB,HINT,HumanInteractome 2 | C1: ERBB3 / PI3K,79.504,88.463,45.321,9.31,9.144,13.522,8.321,25.382,29.247,-0.007,8.148,16.023,3.102,4.689,4.741,7.379,13.017,3.638,6.904,1.435,-0.254 3 | C2: MYC / E2F,12.144,4.593,10.196,1.583,32.015,2.582,1.592,-0.428,2.878,-1.108,0.699,0.985,-3.08,0.356,-0.487,-1.122,-0.851,0.172,-0.771,0.758,0.237 4 | C3: RAS / WNT / 
PI3K,13.709,24.005,3.724,4.536,6.139,1.996,6.138,1.109,0.347,8.047,0.94,0.942,9.901,0.587,0.301,0.252,2.324,1.861,-0.288,0.736,-0.622 5 | C4: EMT,46.876,56.378,49.475,8.366,12.124,6.63,14.062,16.828,6.057,7.183,7.538,6.861,3.157,5.922,3.089,0.478,2.729,3.528,6.462,1.059,-1.832 6 | C5: HNF1 / PAX8,18.51,12.137,3.211,6.21,5.811,14.762,3.296,4.714,17.87,2.503,6.311,2.412,33.288,3.325,3.186,44.088,1.666,0.784,5.496,3.214,0.698 7 | C6: BRAF / MAPK,33.463,37.398,8.971,7.753,0.6,5.81,11.364,2.604,9.481,8.875,0.752,2.887,-1.686,1.98,1.22,0.66,1.176,2.421,1.623,0.883,-0.98 8 | C7: TNF / NF-kB,25.602,75.85,3.472,5.288,5.415,0.043,0.039,4.025,0.666,4.193,1.091,-0.009,-0.242,0.091,2.078,-0.565,-0.538,0.483,-0.265,-0.332,2.302 9 | C8: MYC,19.938,12.769,14.662,7.637,7.606,5.71,7.618,0.515,0.809,0.992,4.678,1.226,-1.186,1.449,2.012,3.4,2.329,0.815,-0.345,2.297,1.027 10 | C9: RAS / AP1,63.402,75.232,6.942,15.589,13.236,18.125,6.703,19.884,3.203,29.165,6.068,7.148,294.849,4.349,6.704,10.111,6.48,1.973,5.423,0.892,3.703 -------------------------------------------------------------------------------- /Data/Oncogenic_genesets_Effect_Size.csv: -------------------------------------------------------------------------------- 1 | Oncogenic Component Gene Set,STRING,ConsensusPathDB,HumanNet,Reactome,ReactomeFI,GIANT,InBioMap,GeneMANIA,DIP,MultiNet,HINT,IRefIndex,PathwayCommons,HPRD,BioGRID,Mentha,IntAct,PID,BioPlex,BIND,HumanInteractome 2 | C1: ERBB3 / PI3K,0.5135262,0.058763726,0.692236671,0.122222216,0.15754273,6.684008471,0.082859901,4.067461582,-4.31E-05,0.189440939,0.016354416,0.064830058,0.068458882,0.273601605,0.545721195,0.051626723,0.031076848,0.118967686,0.994315007,0.005668391,-0.001785174 3 | C2: MYC / E2F,0.092846659,-0.024804717,-0.011586723,0.141646337,0.021769631,0.126131444,0.011438366,0.229788537,-0.007466792,0.042600429,0.024478392,-0.013013427,0.01547359,-0.016721966,0.021910887,0.011695219,0.004778901,-0.002736735,0.056399241,-0.01325828,0.000747868 4 | C3: RAS / WNT / PI3K,0.058378867,-0.015405315,0.050083469,0.116639188,0.130570541,0.423772198,0.024225851,0.353977705,0.163708126,0.030888897,0.017779446,0.009526628,0.302163126,0.058046682,0.019992612,0.017686172,0.041009049,0.003238783,0.005259451,0.094973291,-0.003334094 5 | C4: EMT,0.502137765,0.078045291,0.539404955,0.171974632,0.143326712,1.56712006,0.108628694,1.578435016,0.034923587,0.045392111,0.0082312,0.080228263,0.377918883,0.048341957,0.107700053,0.074111893,0.082371847,0.00104432,0.147561307,0.014931652,-0.001871211 6 | C5: HNF1 / PAX8,0.067260554,0.051132155,0.048942587,0.086221776,0.124037539,0.359179111,0.034975803,0.493826433,0.025733051,0.12134834,0.038769791,0.060915915,0.039247966,0.039383136,0.070949222,0.045455993,0.009638796,0.584454442,0.4182192,0.049113775,0.00191155 7 | C6: BRAF / MAPK,0.16734618,0.040467197,0.144514141,0.008246205,0.124472595,0.660173496,0.029069971,0.636674143,0.171066126,0.066039347,0.022348522,0.039525486,0.347191285,0.045675844,0.078249904,0.056334395,0.048530168,0.007730904,0.242063957,-0.033614766,-0.007001517 8 | C7: TNF / NF-kB,0.070352307,-0.003042142,0.078385588,0.113450311,0.064574197,1.614945003,0.014547852,0.874980732,0.150039047,0.000784241,-0.005916828,0.024739026,0.00103285,-0.026220378,-0.00014158,0.001885841,0.013316024,-0.004948463,0.014466807,-0.002359939,0.019704809 9 | C8: 
MYC,0.334557167,-0.016903927,0.02853045,0.282954697,0.306728046,0.350495141,0.138018638,0.407845466,0.06911551,0.226512037,0.059835179,0.090457074,0.298779819,0.11275682,0.058599939,0.050242058,0.029014753,0.12451458,0.034957168,-0.079153753,0.007960667 10 | C9: RAS / AP1,0.227817837,0.058713082,0.425458334,0.451898009,0.508070925,2.110367453,0.047765235,1.711314568,0.376062304,0.171777257,0.005529503,0.103466426,0.120951911,0.172031028,0.233638383,0.048789225,0.018321645,0.235847681,0.061694523,0.305124847,0.00876537 -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) Idekerlab 2017 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /Network Evaluation Examples/run_network_evaluation.py: -------------------------------------------------------------------------------- 1 | ################################################################### 2 | # Command line script to analyze network on node sets of interest # 3 | ################################################################### 4 | 5 | from network_evaluation_tools import network_evaluation_functions as nef 6 | from network_evaluation_tools import data_import_tools as dit 7 | from network_evaluation_tools import gene_conversion_tools as gct 8 | import argparse 9 | import os 10 | import pandas as pd 11 | 12 | # Checking valid alpha and p values (Range is 0.0-1.0 exclusive) 13 | # Value can also be None. 
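# Usage sketch (file names here are hypothetical; the flags are defined in the parser further down):
#   python run_network_evaluation.py network.txt genesets.txt AUPRCs.csv -a 0.5 -n 30 -c 4
# The validators below are wired into argparse through its `type=` hook, so bad values fail
# at parse time: restricted_float('0.5') returns 0.5, restricted_float('1.5') raises
# argparse.ArgumentTypeError, and positive_int('0') raises argparse.ArgumentTypeError.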
14 | def restricted_float(x): 15 | if x is not None: 16 | x = float(x) 17 | if x <= 0.0 or x >= 1.0: 18 | raise argparse.ArgumentTypeError("%r not in range (0.0, 1.0) exclusive"%(x,)) 19 | return x 20 | 21 | # Checking valid integer values (for all values that must be >0) 22 | def positive_int(x): 23 | x = int(x) 24 | if x <= 0: 25 | raise argparse.ArgumentTypeError("%s must be a positive integer" % x) 26 | return x 27 | 28 | # Valid file path check (Does not check file formatting, but checks if given path exists and is readable) 29 | def valid_infile(in_file): 30 | if not os.path.isfile(in_file): 31 | raise argparse.ArgumentTypeError("{0} is not a valid input file path".format(in_file)) 32 | if os.access(in_file, os.R_OK): 33 | return in_file 34 | else: 35 | raise argparse.ArgumentTypeError("{0} is not a readable input file".format(in_file)) 36 | 37 | # Valid output directory path check (Checks if the output directory path can be found and written to by removing the given filename from the full path) 38 | # Note: os.path.dirname is used here so that path splitting works the same on Linux, Mac OSX and Windows 39 | def valid_outfile(out_file): 40 | outdir = os.path.dirname(out_file) 41 | if not os.path.isdir(outdir): 42 | raise argparse.ArgumentTypeError("{0} is not a valid output directory".format(outdir)) 43 | if os.access(outdir, os.W_OK): 44 | return out_file 45 | else: 46 | raise argparse.ArgumentTypeError("{0} is not a writable output directory".format(outdir)) 47 | 48 | if __name__ == "__main__": 49 | # Network Evaluation Setup Variables 50 | parser = argparse.ArgumentParser(description='Analyze network performance on ability to aggregate sets of nodes in network space.') 51 | parser.add_argument("network_path", type=valid_infile, 52 | help='Path to file of network to be evaluated. File must be a 2-column edge list where each line is a gene interaction separated by a common delimiter.') 53 | parser.add_argument("node_sets_file", type=valid_infile, 54 | help='Path to file of node sets. Each line is a list, separated by a common delimiter. The first item in each line will be the name of the node set.') 55 | parser.add_argument("actual_AUPRCs_save_path", type=valid_outfile, 56 | help='CSV file path of network evaluation result scores (AUPRCs). At minimum, this script computes and saves these values. Must be in a writable directory.') 57 | parser.add_argument('-v', '--verbose', default=False, action="store_true", required=False, 58 | help='Verbosity flag for reporting on network evaluation steps.') 59 | parser.add_argument('-netd', '--net_file_delim', type=str, default='\t', required=False, 60 | help='Delimiter used in network file between columns. Default is tab white space.') 61 | parser.add_argument('-setd', '--set_file_delim', type=str, default='\t', required=False, 62 | help='Delimiter used in node set file to delimit lists. Default is tab white space.') 63 | parser.add_argument("-p", "--sample_p", type=restricted_float, default=None, required=False, 64 | help='Sub-sampling percentage for node sets of interest. Default is None. Each gene set\'s p is automatically determined by the network in this case.') 65 | parser.add_argument("-a", "--alpha", type=restricted_float, default=None, required=False, 66 | help='Propagation constant to use in the propagation of node sub-samples over given network.
Overrides alpha calculation model if given.') 67 | parser.add_argument("-n", "--sub_sample_iter", type=positive_int, default=30, required=False, 68 | help='Number of times to perform sub-sampling during performance recovery (AUPRC) calculation for each node set. Default is 30.') 69 | parser.add_argument('-c', '--cores', type=positive_int, default=1, required=False, 70 | help='Number of cores to be utilized for the performance calculation step. NOTE: Each core must have enough memory to store at least a network-sized square matrix and the given node sets to perform calculations.') 71 | parser.add_argument('-bg', '--background', type=str, default='network', choices=['genesets', 'network'], required=False, 72 | help='Establishes the background gene set to calculate AUPRC over. Default is to use all genes in the network; can be changed to use only genes from the union of all gene sets tested (i.e. disease genes only).') 73 | 74 | # Network performance score calculations (with null networks) 75 | parser.add_argument("-i", "--null_iter", type=positive_int, default=30, required=False, 76 | help='Number of times to perform degree-preserved shuffling of network to construct performance value null distribution. Default is 30. If this value is >0, either --null_network_outdir or --null_AUPRCs_save_path will be required.') 77 | parser.add_argument('-nno', '--null_network_outdir', type=valid_outfile, default=None, required=False, 78 | help='File directory to save null networks after generation.') 79 | parser.add_argument('-nsp', '--null_AUPRCs_save_path', type=valid_outfile, default=None, required=False, 80 | help='CSV file path of where to save null network evaluation results. Used in the calculation of network performance scores and performance gain scores.') 81 | parser.add_argument('-psp', '--performance_save_path', type=valid_outfile, default=None, required=False, 82 | help='CSV file path of where to save network evaluation results as z-scores.') 83 | parser.add_argument('-gsp', '--performance_gain_save_path', type=valid_outfile, default=None, required=False, 84 | help='CSV file path of where to save network evaluation results as gain in AUPRC over median null AUPRCs.') 85 | 86 | args = parser.parse_args() 87 | # If null networks need to be constructed 88 | if args.null_iter > 0: 89 | # A file path must be given to either save the null networks or the null network performance 90 | if (args.null_AUPRCs_save_path is None) and (args.null_network_outdir is None): 91 | parser.error('Save path required for null network edge lists or null network evaluation results.') 92 | 93 | #################################### 94 | ##### Network Evaluation Setup ##### 95 | #################################### 96 | 97 | # Limit core usage (if defined) 98 | import mkl 99 | mkl.set_num_threads(args.cores) 100 | 101 | # Load Network 102 | network = dit.load_network_file(args.network_path, verbose=args.verbose) 103 | network_size = len(network.nodes()) 104 | 105 | # Load Gene sets 106 | genesets = dit.load_node_sets(args.node_sets_file, verbose=args.verbose) 107 | 108 | # Calculate gene set sub-sample rate with network (if not set) 109 | if args.sample_p is None: 110 | genesets_p = nef.calculate_p(network, genesets) 111 | else: 112 | genesets_p = {geneset:args.sample_p for geneset in genesets} 113 | if args.verbose: 114 | print 'Gene set sub-sample rates set' 115 | 116 | # Calculate network kernel (also determine propagation constant if not set) 117 | kernel = nef.construct_prop_kernel(network, alpha=args.alpha, verbose=True) 118 | 119 | # Change background
gene list if needed 120 | if args.background == 'genesets': 121 | background_node_set = set() 122 | for geneset in genesets: 123 | background_node_set = background_node_set.union(genesets[geneset]) 124 | background_nodes = list(background_node_set.intersection(set(kernel.index))) 125 | else: 126 | background_nodes = list(kernel.index) 127 | 128 | 129 | ############################################ 130 | ##### Network Performance Calculations ##### 131 | ############################################ 132 | 133 | # Calculate AUPRC for each gene set on actual network (large networks are >=10k nodes) 134 | if network_size < 10000: 135 | actual_AUPRC_values = nef.small_network_AUPRC_wrapper(kernel, genesets, genesets_p, n=args.sub_sample_iter, cores=args.cores, bg=background_nodes, verbose=True) 136 | else: 137 | actual_AUPRC_values = nef.large_network_AUPRC_wrapper(kernel, genesets, genesets_p, n=args.sub_sample_iter, cores=args.cores, bg=background_nodes, verbose=True) 138 | 139 | # Save the actual network's AUPRC values 140 | actual_AUPRC_values.to_csv(args.actual_AUPRCs_save_path) 141 | 142 | 143 | ################################################# 144 | ##### Null Network Performance Calculations ##### 145 | ################################################# 146 | 147 | # If number of null networks > 0: 148 | if args.null_iter > 0: 149 | null_AUPRCs = [] 150 | for i in range(args.null_iter): 151 | # Construct null networks and calculate AUPRCs for each gene set on each null network 152 | shuffNet = nef.shuffle_network(network, max_tries_n=10, verbose=True) 153 | # Save null network if null network output directory is given 154 | if args.null_network_outdir is not None: 155 | shuffNet_edges = shuffNet.edges() 156 | gct.write_edgelist(shuffNet_edges, args.null_network_outdir+'shuffNet_'+repr(i+1)+'.txt', 157 | delimiter='\t', binary=True) 158 | if args.verbose: 159 | print 'Shuffled Network', i+1, 'written to file' 160 | # Construct null network kernel 161 | shuffNet_kernel = nef.construct_prop_kernel(shuffNet, alpha=args.alpha, verbose=False) 162 | # Calculate null network AUPRCs 163 | if network_size < 10000: 164 | shuffNet_AUPRCs = nef.small_network_AUPRC_wrapper(shuffNet_kernel, genesets, genesets_p, n=args.sub_sample_iter, cores=args.cores, bg=background_nodes, verbose=True) 165 | else: 166 | shuffNet_AUPRCs = nef.large_network_AUPRC_wrapper(shuffNet_kernel, genesets, genesets_p, n=args.sub_sample_iter, cores=args.cores, bg=background_nodes, verbose=True) 167 | null_AUPRCs.append(shuffNet_AUPRCs) 168 | # Construct table of null AUPRCs 169 | null_AUPRCs_table = pd.concat(null_AUPRCs, axis=1) 170 | null_AUPRCs_table.columns = ['shuffNet'+repr(i+1) for i in range(len(null_AUPRCs))] 171 | if args.verbose: 172 | print 'All null network gene set AUPRCs calculated' 173 | # Save null network AUPRCs if save path is given 174 | if args.null_AUPRCs_save_path is not None: 175 | null_AUPRCs_table.to_csv(args.null_AUPRCs_save_path) 176 | # Calculate performance score for each gene set's AUPRC if performance score save path is given 177 | if args.performance_save_path is not None: 178 | network_performance = nef.calculate_network_performance_score(actual_AUPRC_values, null_AUPRCs_table, verbose=args.verbose) 179 | network_performance.to_csv(args.performance_save_path) 180 | # Calculate network performance gain over median null AUPRC if AUPRC performance gain save path is given 181 | if args.performance_gain_save_path is not None: 182 | network_perf_gain =
nef.calculate_network_performance_gain(actual_AUPRC_values, null_AUPRCs_table, verbose=args.verbose) 183 | network_perf_gain.to_csv(args.performance_gain_save_path) 184 | -------------------------------------------------------------------------------- /Network Processing Notebooks/BIND Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "from network_evaluation_tools import gene_conversion_tools as gct" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "## Load BIND Raw Data\n", 20 | "#### Source: http://www.pathwaycommons.org/archives/PC2/v8/PathwayCommons.8.bind.BINARY_SIF.hgnc.txt.sif.gz\n", 21 | "Downloaded: June 15, 2017 \n", 22 | "Last Updated (via Pathway Commons v9 datasources.txt file): December 15, 2010 \n", 23 | "Note: For this processing, we used the data file provided in the PathwayCommons v8 distribution. The SIF file provided by Pathway Commons v9 at the given time only yields 13078 interactions, significantly fewer than the file provided by the v8 distribution. It is unclear where those interactions have gone, so at this time we will be using the Pathway Commons v8 distribution of BIND. \n", 24 | "Also note: The text file has more lines than the sif file in Pathway Commons. However, the text file has some interactions that are unclear how to resolve, so in this case we will use the sif file provided by Pathway Commons" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 30, 30 | "metadata": { 31 | "collapsed": false 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 36 | "BIND_Raw = pd.read_csv(wd+'Network_Data_Raw/PathwayCommons.8.bind.BINARY_SIF.hgnc.txt.sif',sep='\\t', header=-1)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 32, 42 | "metadata": { 43 | "collapsed": false, 44 | "scrolled": true 45 | }, 46 | "outputs": [ 47 | { 48 | "name": "stdout", 49 | "output_type": "stream", 50 | "text": [ 51 | "Edges in BIND: 72780\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "# Convert table of interactions to edgelist (no scores given)\n", 57 | "# Also no gene symbol conversion necessary because network is given in symbol format already\n", 58 | "BIND_edgelist = BIND_Raw[[0, 2]].values.tolist()\n", 59 | "print 'Edges in BIND:', len(BIND_edgelist)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 33, 65 | "metadata": { 66 | "collapsed": false 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "# Sort each edge representation for filtering\n", 71 | "BIND_edgelist_sorted = [sorted(edge) for edge in BIND_edgelist]" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 34, 77 | "metadata": { 78 | "collapsed": false 79 | }, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "72780 input edges\n", 86 | "0 self-edges removed\n", 87 | "0 edges with un-mapped genes removed\n", 88 | "0 duplicate edges removed\n", 89 | "Edge list filtered: 0.19 seconds\n", 90 | "72780 Edges remaining\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "# Filter edgelist for duplicate nodes and for self-edges\n", 96 | "BIND_edgelist_filt = gct.filter_converted_edgelist(BIND_edgelist_sorted)" 97 | ] 98 | }, 99 | { 100 |
"cell_type": "code", 101 | "execution_count": 35, 102 | "metadata": { 103 | "collapsed": false 104 | }, 105 | "outputs": [ 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "Edge list saved: 0.09 seconds\n" 111 | ] 112 | } 113 | ], 114 | "source": [ 115 | "# Save genelist to file\n", 116 | "outdir = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/Network_SIFs_Symbol/'\n", 117 | "gct.write_edgelist(BIND_edgelist_filt, outdir+'BIND_Symbol.sif')" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": { 124 | "collapsed": true 125 | }, 126 | "outputs": [], 127 | "source": [] 128 | } 129 | ], 130 | "metadata": { 131 | "kernelspec": { 132 | "display_name": "Python 2", 133 | "language": "python", 134 | "name": "python2" 135 | }, 136 | "language_info": { 137 | "codemirror_mode": { 138 | "name": "ipython", 139 | "version": 2 140 | }, 141 | "file_extension": ".py", 142 | "mimetype": "text/x-python", 143 | "name": "python", 144 | "nbconvert_exporter": "python", 145 | "pygments_lexer": "ipython2", 146 | "version": "2.7.11" 147 | } 148 | }, 149 | "nbformat": 4, 150 | "nbformat_minor": 0 151 | } 152 | -------------------------------------------------------------------------------- /Network Processing Notebooks/BioGRID Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "import pandas as pd\n", 13 | "import itertools" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "## Load BioGRID Raw Data\n", 21 | "#### Source (MITAB): http://thebiogrid.org/downloads/archives/Release%20Archive/BIOGRID-3.4.149/BIOGRID-ORGANISM-3.4.149.tab2.zip\n", 22 | "Downloaded: June 15, 2017 \n", 23 | "Last Updated: June 01, 2017 \n", 24 | "Notes for download: There is a new verision of BioGRID released on the first of every month. Download the organism specific files to extract only human interactions from the database. \n", 25 | "Notes for processing: This is the file for human protein interactions, however, not all interactions may be human-human interactions. These need to be filtered. 
There is a \"Score\" column that could be used for filtering, but most of these values appear to be missing, so they will be ignored when processing BioGRID" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "Raw edge count in BioGRID: 394749\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 45 | "BioGRID_Raw = pd.read_csv(wd+'Network_Data_Raw/BioGRID/BIOGRID-ORGANISM-3.4.149.tab2/BIOGRID-ORGANISM-Homo_sapiens-3.4.149.tab2.txt',sep='\\t', low_memory=False)\n", 46 | "print 'Raw edge count in BioGRID:', len(BioGRID_Raw)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 4, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/plain": [ 59 | "physical 392779\n", 60 | "genetic 1970\n", 61 | "Name: Experimental System Type, dtype: int64" 62 | ] 63 | }, 64 | "execution_count": 4, 65 | "metadata": {}, 66 | "output_type": "execute_result" 67 | } 68 | ], 69 | "source": [ 70 | "# Show not all interactions in BioGRID are physical PPI, though the overwhelming majority are\n", 71 | "BioGRID_Raw['Experimental System Type'].value_counts()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 27, 77 | "metadata": { 78 | "collapsed": false, 79 | "scrolled": true 80 | }, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/plain": [ 85 | "9606 372979\n", 86 | "10090 17963\n", 87 | "11676 1591\n", 88 | "10116 570\n", 89 | "559292 355\n", 90 | "Name: Organism Interactor A, dtype: int64" 91 | ] 92 | }, 93 | "execution_count": 27, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "# Not all interactions are from Human\n", 100 | "BioGRID_Raw['Organism Interactor A'].value_counts().head()" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 28, 106 | "metadata": { 107 | "collapsed": false, 108 | "scrolled": true 109 | }, 110 | "outputs": [ 111 | { 112 | "data": { 113 | "text/plain": [ 114 | "9606 389334\n", 115 | "10090 2543\n", 116 | "559292 1045\n", 117 | "10116 708\n", 118 | "11676 318\n", 119 | "Name: Organism Interactor B, dtype: int64" 120 | ] 121 | }, 122 | "execution_count": 28, 123 | "metadata": {}, 124 | "output_type": "execute_result" 125 | } 126 | ], 127 | "source": [ 128 | "# Not all interactions are from Human\n", 129 | "BioGRID_Raw['Organism Interactor B'].value_counts().head()" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "#### Since there are so few genetic interactions relative to physical interactions, we will not filter these edges.
However, we will filter all interactions that are not labelled as human-human interactions" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "#### Keep only human-human interactions" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 9, 149 | "metadata": { 150 | "collapsed": false 151 | }, 152 | "outputs": [ 153 | { 154 | "name": "stdout", 155 | "output_type": "stream", 156 | "text": [ 157 | "Human-Human only interactions in BioGRID 3.4.149: 367564\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "BioGRID_Human_Only = BioGRID_Raw[(BioGRID_Raw['Organism Interactor A']==9606) & (BioGRID_Raw['Organism Interactor B']==9606)]\n", 163 | "print 'Human-Human only interactions in BioGRID 3.4.149:', len(BioGRID_Human_Only)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 29, 169 | "metadata": { 170 | "collapsed": false 171 | }, 172 | "outputs": [ 173 | { 174 | "data": { 175 | "text/plain": [ 176 | "Series([], Name: Official Symbol Interactor A, dtype: object)" 177 | ] 178 | }, 179 | "execution_count": 29, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "# Any missing symbol names in column A?\n", 186 | "BioGRID_Human_Only['Official Symbol Interactor A'][BioGRID_Human_Only['Official Symbol Interactor A']=='-']" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 30, 192 | "metadata": { 193 | "collapsed": false 194 | }, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "text/plain": [ 199 | "Series([], Name: Official Symbol Interactor B, dtype: object)" 200 | ] 201 | }, 202 | "execution_count": 30, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "# Any missing symbol names in column B?\n", 209 | "BioGRID_Human_Only['Official Symbol Interactor B'][BioGRID_Human_Only['Official Symbol Interactor B']=='-']" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 32, 215 | "metadata": { 216 | "collapsed": false 217 | }, 218 | "outputs": [ 219 | { 220 | "name": "stdout", 221 | "output_type": "stream", 222 | "text": [ 223 | "Edges in BioGRID: 367564\n" 224 | ] 225 | } 226 | ], 227 | "source": [ 228 | "# Convert table of interactions to edgelist (no scores given)\n", 229 | "# Also no gene symbol conversion necessary because network is given in symbol format already\n", 230 | "BioGRID_edgelist = BioGRID_Human_Only[['Official Symbol Interactor A', 'Official Symbol Interactor B']].values.tolist()\n", 231 | "print 'Edges in BioGRID:', len(BioGRID_edgelist)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 33, 237 | "metadata": { 238 | "collapsed": true 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "# Sort each edge representation for filtering\n", 243 | "BioGRID_edgelist_sorted = [sorted(edge) for edge in BioGRID_edgelist]" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 34, 249 | "metadata": { 250 | "collapsed": false 251 | }, 252 | "outputs": [ 253 | { 254 | "name": "stdout", 255 | "output_type": "stream", 256 | "text": [ 257 | "367564 input edges\n", 258 | "4598 self-edges removed\n", 259 | "0 edges with un-mapped genes removed\n", 260 | "104709 duplicate edges removed\n", 261 | "Edge list filtered: 0.29 seconds\n", 262 | "258257 Edges remaining\n" 263 | ] 264 | } 265 | ], 266 | "source": [ 267 | "# Filter edgelist for duplicate nodes and for self-edges\n", 268 | "BioGRID_edgelist_filt = 
gct.filter_converted_edgelist(BioGRID_edgelist_sorted)" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 37, 274 | "metadata": { 275 | "collapsed": false 276 | }, 277 | "outputs": [ 278 | { 279 | "name": "stdout", 280 | "output_type": "stream", 281 | "text": [ 282 | "Edge list saved: 0.21 seconds\n" 283 | ] 284 | } 285 | ], 286 | "source": [ 287 | "# Save edge list to file\n", 288 | "outdir = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/Network_SIFs_Symbol/'\n", 289 | "gct.write_edgelist(BioGRID_edgelist_filt, outdir+'BioGRID_Symbol.sif')" 290 | ] 291 | } 292 | ], 293 | "metadata": { 294 | "kernelspec": { 295 | "display_name": "Python 2", 296 | "language": "python", 297 | "name": "python2" 298 | }, 299 | "language_info": { 300 | "codemirror_mode": { 301 | "name": "ipython", 302 | "version": 2 303 | }, 304 | "file_extension": ".py", 305 | "mimetype": "text/x-python", 306 | "name": "python", 307 | "nbconvert_exporter": "python", 308 | "pygments_lexer": "ipython2", 309 | "version": "2.7.11" 310 | } 311 | }, 312 | "nbformat": 4, 313 | "nbformat_minor": 0 314 | } 315 | -------------------------------------------------------------------------------- /Network Processing Notebooks/BioPlex Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "import pandas as pd\n", 13 | "import itertools" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "## Load BioPlex Raw Data\n", 21 | "#### Source: http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv\n", 22 | "Downloaded: June 20, 2017 \n", 23 | "Last Updated: December 01, 2016 \n", 24 | "This latest update of BioPlex (2.0 v4) is associated with the recent paper: Huttlin et al. (2017) Nature doi: 10.1038/nature22366 \n", 25 | "Note: We could use the 'p(Interaction)' column as a scoring metric to filter the network further; however, a top 10% filtering of this network would yield a network with <6000 interactions, so we did not feel it was necessary to filter the network further for analysis." 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 13, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "Raw edge count in BioPlex: 56553\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 45 | "BioPlex_Raw = pd.read_csv(wd+'Network_Data_Raw/BioPlex_interactionList_v4a.tsv',sep='\\t')\n", 46 | "print 'Raw edge count in BioPlex:', len(BioPlex_Raw)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 14, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/html": [ 59 | "
\n", 60 | "\n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | "
GeneAGeneBUniprotAUniprotBSymbolASymbolBp(Wrong)p(No Interaction)p(Interaction)
0100728378P00813A5A3E0ADAPOTEF2.380858e-090.0003320.999668
1100345651P00813Q562R1ADAACTBL29.786437e-180.2119140.788086
2222389708Q8N7W2Q07021BEND7C1QBP2.962215e-170.0056450.994355
32223894038Q8N7W2O75096BEND7LRP43.302994e-100.0002800.999720
46451213312Q6ZMN8P11142CCNI2HSPA82.060285e-160.0362350.963765
\n", 138 | "
" 139 | ], 140 | "text/plain": [ 141 | " GeneA GeneB UniprotA UniprotB SymbolA SymbolB p(Wrong) \\\n", 142 | "0 100 728378 P00813 A5A3E0 ADA POTEF 2.380858e-09 \n", 143 | "1 100 345651 P00813 Q562R1 ADA ACTBL2 9.786437e-18 \n", 144 | "2 222389 708 Q8N7W2 Q07021 BEND7 C1QBP 2.962215e-17 \n", 145 | "3 222389 4038 Q8N7W2 O75096 BEND7 LRP4 3.302994e-10 \n", 146 | "4 645121 3312 Q6ZMN8 P11142 CCNI2 HSPA8 2.060285e-16 \n", 147 | "\n", 148 | " p(No Interaction) p(Interaction) \n", 149 | "0 0.000332 0.999668 \n", 150 | "1 0.211914 0.788086 \n", 151 | "2 0.005645 0.994355 \n", 152 | "3 0.000280 0.999720 \n", 153 | "4 0.036235 0.963765 " 154 | ] 155 | }, 156 | "execution_count": 14, 157 | "metadata": {}, 158 | "output_type": "execute_result" 159 | } 160 | ], 161 | "source": [ 162 | "BioPlex_Raw.head()" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 15, 168 | "metadata": { 169 | "collapsed": false 170 | }, 171 | "outputs": [ 172 | { 173 | "name": "stdout", 174 | "output_type": "stream", 175 | "text": [ 176 | "Edges in BIND: 56553\n" 177 | ] 178 | } 179 | ], 180 | "source": [ 181 | "# Convert table of interactions to edgelist (no scores given)\n", 182 | "# Also no gene symbol conversion necessary because network is given in symbol format already\n", 183 | "BioPlex_edgelist = BioPlex_Raw[['SymbolA', 'SymbolB']].values.tolist()\n", 184 | "print 'Edges in BIND:', len(BioPlex_edgelist)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 16, 190 | "metadata": { 191 | "collapsed": true 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "# Sort each edge representation for filtering\n", 196 | "BioPlex_edgelist_sorted = [sorted(edge) for edge in BioPlex_edgelist]" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 17, 202 | "metadata": { 203 | "collapsed": false 204 | }, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "56553 input edges\n", 211 | "0 self-edges removed\n", 212 | "0 edges with un-mapped genes removed\n", 213 | "0 duplicate edges removed\n", 214 | "Edge list filtered: 0.21 seconds\n", 215 | "56553 Edges remaining\n" 216 | ] 217 | } 218 | ], 219 | "source": [ 220 | "# Filter edgelist for duplicate nodes and for self-edges\n", 221 | "BioPlex_edgelist_filt = gct.filter_converted_edgelist(BioPlex_edgelist)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 18, 227 | "metadata": { 228 | "collapsed": false 229 | }, 230 | "outputs": [ 231 | { 232 | "name": "stdout", 233 | "output_type": "stream", 234 | "text": [ 235 | "Edge list saved: 0.1 seconds\n" 236 | ] 237 | } 238 | ], 239 | "source": [ 240 | "# Write network to file\n", 241 | "gct.write_edgelist(BioPlex_edgelist_filt, wd+'Network_SIFs_Symbol/BioPlex_Symbol.sif', binary=True)" 242 | ] 243 | } 244 | ], 245 | "metadata": { 246 | "kernelspec": { 247 | "display_name": "Python 2", 248 | "language": "python", 249 | "name": "python2" 250 | }, 251 | "language_info": { 252 | "codemirror_mode": { 253 | "name": "ipython", 254 | "version": 2 255 | }, 256 | "file_extension": ".py", 257 | "mimetype": "text/x-python", 258 | "name": "python", 259 | "nbconvert_exporter": "python", 260 | "pygments_lexer": "ipython2", 261 | "version": "2.7.11" 262 | } 263 | }, 264 | "nbformat": 4, 265 | "nbformat_minor": 0 266 | } 267 | -------------------------------------------------------------------------------- /Network Processing Notebooks/DIP Processing.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "import pandas as pd\n", 13 | "import itertools" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "## Load DIP Raw Data\n", 21 | "#### Source (MITAB): http://dip.doe-mbi.ucla.edu/dip/File.cgi?FN=2016/tab25/Hsapi20170205.txt\n", 22 | "Downloaded: June 15, 2017 \n", 23 | "Last Updated: February 05, 2017 \n", 24 | "Notes for download: Website requires registration. Register for the site to download the file from the link. \n", 25 | "Notes for processing: This is the file for human protein interactions; however, not all interactions are human-human interactions. These need to be filtered. Also, all IDs without a RefSeq or UniProt ID are excluded. Custom processing for this network is described below" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 5, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "Raw edge count in DIP: 7794\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 45 | "DIP_Raw = pd.read_csv(wd+'Network_Data_Raw/DIP/Hsapi20170205.txt', index_col=0, sep='\\t')\n", 46 | "print 'Raw edge count in DIP:', len(DIP_Raw)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 12, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "# Fix the column offset in the interaction data table\n", 58 | "DIP_Raw_offset = DIP_Raw.reset_index(drop=False)[DIP_Raw.reset_index(drop=False).columns[:-2]]\n", 59 | "DIP_Raw_offset.columns = DIP_Raw.columns[:-1]" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 16, 65 | "metadata": { 66 | "collapsed": false 67 | }, 68 | "outputs": [ 69 | { 70 | "name": "stdout", 71 | "output_type": "stream", 72 | "text": [ 73 | "Human-Human only interactions in DIP: 5569\n" 74 | ] 75 | } 76 | ], 77 | "source": [ 78 | "# Keep only human-human interactions\n", 79 | "DIP_Human_only = DIP_Raw_offset[(DIP_Raw_offset['Taxid interactor A']=='taxid:9606(Homo sapiens)') & (DIP_Raw_offset['Taxid interactor B']=='taxid:9606(Homo sapiens)')]\n", 80 | "print 'Human-Human only interactions in DIP:', len(DIP_Human_only)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "#### Parse all genes in filtered DIP and keep only RefSeq/UniProtKB labelled interactions" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 18, 93 | "metadata": { 94 | "collapsed": true 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "# Extract gene list\n", 99 | "Human_DIP_Genes = list(set(DIP_Human_only['ID interactor A']).union(set(DIP_Human_only['ID interactor B'])))" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 25, 105 | "metadata": { 106 | "collapsed": false 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "# Split all gene names into list of genes and concatenate\n", 111 | "Human_DIP_Genes_split = [name.split('|') for name in Human_DIP_Genes]\n", 112 | "Human_DIP_Genes_full_list = list(itertools.chain.from_iterable(Human_DIP_Genes_split))\n", 113 | "\n", 114 | "# Note about this
line: This is to fix the one example where one of the Uniprot genes gets labelled as \"uniprotkb:Q13936,159'\n", 115 | "Human_DIP_Genes_full_list = [name.split(',')[0] for name in Human_DIP_Genes_full_list] " 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "## Convert Genes" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 26, 128 | "metadata": { 129 | "collapsed": false 130 | }, 131 | "outputs": [ 132 | { 133 | "name": "stdout", 134 | "output_type": "stream", 135 | "text": [ 136 | "5017 Valid Query Genes\n", 137 | "3281 Invalid Query Genes\n" 138 | ] 139 | } 140 | ], 141 | "source": [ 142 | "# Construct list of genes to be submitted to MyGene.Info API (remove all genes with 'DIP' prefix)\n", 143 | "query_string, valid_genes, invalid_genes = gct.query_constructor(Human_DIP_Genes_full_list, exclude_prefixes=['DIP'])" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 31, 149 | "metadata": { 150 | "collapsed": false 151 | }, 152 | "outputs": [ 153 | { 154 | "name": "stdout", 155 | "output_type": "stream", 156 | "text": [ 157 | "Batch query complete: 7.97 seconds\n", 158 | "5074 Matched query results\n" 159 | ] 160 | } 161 | ], 162 | "source": [ 163 | "# Set scopes (gene naming systems to search)\n", 164 | "scopes = \"uniprot, refseq\"\n", 165 | "# Set fields (systems from which to return gene names from)\n", 166 | "fields = \"symbol, entrezgene\"\n", 167 | "# Query MyGene.Info\n", 168 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n", 169 | "print len(match_list), 'Matched query results'" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 32, 175 | "metadata": { 176 | "collapsed": false, 177 | "scrolled": true 178 | }, 179 | "outputs": [ 180 | { 181 | "name": "stdout", 182 | "output_type": "stream", 183 | "text": [ 184 | "Queries without full matching results found: 106\n", 185 | "\n", 186 | "74 Queries with mutliple matches found\n", 187 | "\n", 188 | "Query mapping table/dictionary construction complete: 6.82 seconds\n" 189 | ] 190 | } 191 | ], 192 | "source": [ 193 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "## Construct Converted Network" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 36, 206 | "metadata": { 207 | "collapsed": true 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "# This is a custom gene conversion function written due to the parsing required for gene interactor labels\n", 212 | "# Returns best matched symbol and/or entrez id from each DIP interactor string (if applicable)\n", 213 | "def convert_DIP_string(string, field):\n", 214 | " names = [gct.get_identifier_without_prefix(name) for name in string.split('|')]\n", 215 | " # Keep only mappings defined for field of interest\n", 216 | " if field=='symbol':\n", 217 | " # Return match table values that have matched symbol\n", 218 | " conversion = match_table_trim.ix[names][~(match_table_trim.ix[names]['Symbol'].isnull())]\n", 219 | " # Return conversion with max score or None if no conversion\n", 220 | " if conversion.shape[0]==0:\n", 221 | " return None\n", 222 | " else:\n", 223 | " max_score = conversion['Score'].max()\n", 224 | " return conversion[conversion['Score']==max_score].ix[0]['Symbol']\n", 225 | " elif field=='entrez':\n", 226 | " # Return match 
table values that have matched symbol\n", 227 | " conversion = match_table_trim.ix[names][~(match_table_trim.ix[names]['EntrezID'].isnull())]\n", 228 | " if conversion.shape[0]==0:\n", 229 | " return None\n", 230 | " else:\n", 231 | " # Return conversion with max score or None if no conversion\n", 232 | " max_score = conversion['Score'].max()\n", 233 | " return conversion[conversion['Score']==max_score].ix[0]['EntrezID']" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 37, 239 | "metadata": { 240 | "collapsed": false 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "DIP_Human_only_edges = DIP_Human_only[['ID interactor A', 'ID interactor B']].values.tolist()\n", 245 | "DIP_edgelist_symbol = [sorted([convert_DIP_string(edge[0],'symbol'),convert_DIP_string(edge[1],'symbol')]) for edge in DIP_Human_only_edges]" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 39, 251 | "metadata": { 252 | "collapsed": false 253 | }, 254 | "outputs": [ 255 | { 256 | "name": "stdout", 257 | "output_type": "stream", 258 | "text": [ 259 | "5569 input edges\n", 260 | "512 self-edges removed\n", 261 | "309 edges with un-mapped genes removed\n", 262 | "26 duplicate edges removed\n", 263 | "Edge list filtered: 0.02 seconds\n", 264 | "4722 Edges remaining\n" 265 | ] 266 | } 267 | ], 268 | "source": [ 269 | "# Filter converted edge list\n", 270 | "DIP_edgelist_symbol_filt = gct.filter_converted_edgelist(DIP_edgelist_symbol)" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 40, 276 | "metadata": { 277 | "collapsed": false 278 | }, 279 | "outputs": [ 280 | { 281 | "name": "stdout", 282 | "output_type": "stream", 283 | "text": [ 284 | "Edge list saved: 0.02 seconds\n" 285 | ] 286 | } 287 | ], 288 | "source": [ 289 | "# Save converted edge list\n", 290 | "gct.write_edgelist(DIP_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/DIP_Symbol.sif')" 291 | ] 292 | } 293 | ], 294 | "metadata": { 295 | "kernelspec": { 296 | "display_name": "Python 2", 297 | "language": "python", 298 | "name": "python2" 299 | }, 300 | "language_info": { 301 | "codemirror_mode": { 302 | "name": "ipython", 303 | "version": 2 304 | }, 305 | "file_extension": ".py", 306 | "mimetype": "text/x-python", 307 | "name": "python", 308 | "nbconvert_exporter": "python", 309 | "pygments_lexer": "ipython2", 310 | "version": "2.7.11" 311 | } 312 | }, 313 | "nbformat": 4, 314 | "nbformat_minor": 0 315 | } 316 | -------------------------------------------------------------------------------- /Network Processing Notebooks/Degree-Preserved Network Shufflings.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from pyNBS import data_import_tools as dit\n", 12 | "from pyNBS import network_propagation as prop\n", 13 | "import os\n", 14 | "import numpy as np\n", 15 | "import pandas as pd\n", 16 | "import networkx as nx\n", 17 | "import time" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 4, 23 | "metadata": { 24 | "collapsed": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "def shuffle_network(network, verbose=False):\n", 29 | "\t# Shuffle Network\n", 30 | "\tshuff_time = time.time()\n", 31 | "\tedge_len=len(network.edges())\n", 32 | "\tshuff_net=network.copy()\n", 33 | "\ttry:\n", 34 | "\t\tnx.double_edge_swap(shuff_net, nswap=edge_len, 
max_tries=edge_len*10)\n", 35 | "\texcept:\n", 36 | "\t\tif verbose:\n", 37 | "\t\t\tprint 'Note: Maximum number of swap attempts ('+repr(edge_len*10)+') exceeded before desired swaps achieved ('+repr(edge_len)+').'\n", 38 | "\tif verbose:\n", 39 | "\t\t# Evaluate Network Similarity\n", 40 | "\t\tshared_edges = len(set(network.edges()).intersection(set(shuff_net.edges())))\n", 41 | "\t\tprint 'Network shuffled:', time.time()-shuff_time, 'seconds. Edge similarity:', shared_edges/float(edge_len)\n", 42 | "\treturn shuff_net" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 5, 48 | "metadata": { 49 | "collapsed": true 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "wd = '/cellar/users/jkhuang/Data/Projects/pyNBS/Data/Network_Data/Network_Files/'\n", 54 | "randNet_outdir = '/cellar/users/jkhuang/Data/Projects/pyNBS/Data/Network_Data/Shuffled_Network_Files/'" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 6, 60 | "metadata": { 61 | "collapsed": false 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "network_files = [wd+fn for fn in os.listdir(wd) if fn.endswith('.txt')]" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 15, 71 | "metadata": { 72 | "collapsed": false 73 | }, 74 | "outputs": [ 75 | { 76 | "name": "stdout", 77 | "output_type": "stream", 78 | "text": [ 79 | "PathwayCommons\n", 80 | "Network shuffled: 88.9572019577 seconds. Edge similarity: 0.14217722133\n", 81 | "Shuffled PathwayCommons saved.\n", 82 | "STRING90\n", 83 | "Network shuffled: 31.8355379105 seconds. Edge similarity: 0.135569697974\n", 84 | "Shuffled STRING90 saved.\n", 85 | "HumanNet90\n", 86 | "Network shuffled: 1.94090199471 seconds. Edge similarity: 0.157831705011\n", 87 | "Shuffled HumanNet90 saved.\n", 88 | "PID\n", 89 | "Network shuffled: 0.650630950928 seconds. Edge similarity: 0.172511892547\n", 90 | "Shuffled PID saved.\n", 91 | "Mentha\n", 92 | "Network shuffled: 12.5241580009 seconds. Edge similarity: 0.136090780444\n", 93 | "Shuffled Mentha saved.\n", 94 | "ConsensusPathDB\n", 95 | "Network shuffled: 472.858560085 seconds. Edge similarity: 0.266427489011\n", 96 | "Shuffled ConsensusPathDB saved.\n", 97 | "MultiNet\n", 98 | "Network shuffled: 11.9793038368 seconds. Edge similarity: 0.139956933521\n", 99 | "Shuffled MultiNet saved.\n", 100 | "HPRD\n", 101 | "Network shuffled: 2.19464206696 seconds. Edge similarity: 0.132373984179\n", 102 | "Shuffled HPRD saved.\n", 103 | "GIANT\n", 104 | "Network shuffled: 953.094853163 seconds. Edge similarity: 0.181710364213\n", 105 | "Shuffled GIANT saved.\n", 106 | "HINT\n", 107 | "Network shuffled: 10.6648330688 seconds. Edge similarity: 0.132703799716\n", 108 | "Shuffled HINT saved.\n", 109 | "GeneMANIA\n", 110 | "Network shuffled: 1266.22839403 seconds. Edge similarity: 0.146754353915\n", 111 | "Shuffled GeneMANIA saved.\n", 112 | "Reactome\n", 113 | "Network shuffled: 10.7709050179 seconds. Edge similarity: 0.157268305724\n", 114 | "Shuffled Reactome saved.\n", 115 | "STRING\n", 116 | "Network shuffled: 1679.15529799 seconds. Edge similarity: 0.209015282622\n", 117 | "Shuffled STRING saved.\n", 118 | "IntAct\n", 119 | "Network shuffled: 8.56541705132 seconds. Edge similarity: 0.130773661977\n", 120 | "Shuffled IntAct saved.\n", 121 | "Mentha90\n", 122 | "Network shuffled: 0.904587030411 seconds. Edge similarity: 0.134449008127\n", 123 | "Shuffled Mentha90 saved.\n", 124 | "ReactomeFI\n", 125 | "Network shuffled: 10.2852549553 seconds. 
Edge similarity: 0.146912035846\n", 126 | "Shuffled ReactomeFI saved.\n", 127 | "BIND\n", 128 | "Network shuffled: 9.11399793625 seconds. Edge similarity: 0.322492442979\n", 129 | "Shuffled BIND saved.\n", 130 | "DIP\n", 131 | "Network shuffled: 0.137312889099 seconds. Edge similarity: 0.120499788225\n", 132 | "Shuffled DIP saved.\n", 133 | "InBioMap75\n", 134 | "Network shuffled: 6.4067800045 seconds. Edge similarity: 0.167107140969\n", 135 | "Shuffled InBioMap75 saved.\n", 136 | "HumanInteractome\n", 137 | "Network shuffled: 0.723779201508 seconds. Edge similarity: 0.136739405675\n", 138 | "Shuffled HumanInteractome saved.\n", 139 | "BioPlex\n", 140 | "Network shuffled: 1.60635495186 seconds. Edge similarity: 0.123919155482\n", 141 | "Shuffled BioPlex saved.\n", 142 | "GeneMANIA90\n", 143 | "Network shuffled: 25.3215258121 seconds. Edge similarity: 0.118961241363\n", 144 | "Shuffled GeneMANIA90 saved.\n", 145 | "BioGRID\n", 146 | "Network shuffled: 11.8226139545 seconds. Edge similarity: 0.131481431287\n", 147 | "Shuffled BioGRID saved.\n", 148 | "GIANT90\n", 149 | "Network shuffled: 22.5300149918 seconds. Edge similarity: 0.188063162301\n", 150 | "Shuffled GIANT90 saved.\n", 151 | "HumanNet\n", 152 | "Network shuffled: 25.1538288593 seconds. Edge similarity: 0.137587481275\n", 153 | "Shuffled HumanNet saved.\n", 154 | "IRefIndex\n", 155 | "Network shuffled: 7.51319789886 seconds. Edge similarity: 0.160039835864\n", 156 | "Shuffled IRefIndex saved.\n", 157 | "InBioMap\n", 158 | "Network shuffled: 46.8094351292 seconds. Edge similarity: 0.167921346275\n", 159 | "Shuffled InBioMap saved.\n" 160 | ] 161 | } 162 | ], 163 | "source": [ 164 | "for network_file in network_files:\n", 165 | " network_name = network_file.split('/')[-1].split('_')[0]\n", 166 | " print network_name\n", 167 | " network = dit.load_network_file(network_file)\n", 168 | " shuffNet = shuffle_network(network, verbose=True)\n", 169 | " shuffNet_edges = shuffNet.edges()\n", 170 | " f = open(randNet_outdir+network_name+'-shuffled_Symbol.txt', 'w')\n", 171 | " for edge in shuffNet_edges:\n", 172 | " f.write(str(edge[0])+'\\t'+str(edge[1])+'\\n')\n", 173 | " f.close()\n", 174 | " print 'Shuffled', network_name, 'saved.'" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": { 181 | "collapsed": true 182 | }, 183 | "outputs": [], 184 | "source": [] 185 | } 186 | ], 187 | "metadata": { 188 | "kernelspec": { 189 | "display_name": "Python 2", 190 | "language": "python", 191 | "name": "python2" 192 | }, 193 | "language_info": { 194 | "codemirror_mode": { 195 | "name": "ipython", 196 | "version": 2 197 | }, 198 | "file_extension": ".py", 199 | "mimetype": "text/x-python", 200 | "name": "python", 201 | "nbconvert_exporter": "python", 202 | "pygments_lexer": "ipython2", 203 | "version": "2.7.11" 204 | } 205 | }, 206 | "nbformat": 4, 207 | "nbformat_minor": 0 208 | } 209 | -------------------------------------------------------------------------------- /Network Processing Notebooks/GIANT Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from network_evaluation_tools import data_import_tools as dit\n", 13 | "import pandas as pd\n", 14 | "import time" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 
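The shuffle_network routine and loop above rely on networkx's double_edge_swap to randomize edges while preserving every node's degree. A minimal standalone sketch follows, assuming only networkx; it catches the specific exception behind the bare `except:` above (the printed message matches networkx's NetworkXAlgorithmError), and the function name is illustrative, not part of this repository.

```python
import networkx as nx

def shuffle_network_sketch(network, verbose=False):
    """Degree-preserved shuffle: swap endpoint pairs so every node keeps
    its degree while the identities of the edges are randomized."""
    n_edges = network.number_of_edges()
    shuff_net = network.copy()
    try:
        nx.double_edge_swap(shuff_net, nswap=n_edges, max_tries=n_edges * 10)
    except nx.NetworkXAlgorithmError:
        # Raised when max_tries is exhausted before nswap successful swaps
        if verbose:
            print('Swap budget exhausted before all requested swaps completed.')
    if verbose:
        # Fraction of original edges surviving the shuffle (lower = better shuffled)
        shared = len(set(network.edges()) & set(shuff_net.edges()))
        print('Edge similarity: {:.3f}'.format(shared / float(n_edges)))
    return shuff_net
```

The edge-similarity fractions printed above (roughly 0.12 to 0.32) are this overlap statistic; networks such as BIND and ConsensusPathDB retain more of their original edges, presumably because in densely clustered graphs many proposed swaps recreate edges that already exist.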
| "metadata": {}, 20 | "source": [ 21 | "## Load GIANT Raw Data\n", 22 | "#### Source: http://giant.princeton.edu/static//networks/all_tissues_top.gz\n", 23 | "Downloaded: June 15, 2017 \n", 24 | "Last Updated: N/A, but paper published in 2015 \n", 25 | "Note about processing: This network (even if it is already the top 10% of all edges) is extremely large. Therefore, we will further filter this 'top' functional network further to the top 10% which should yield about 4 million edges. We will then take the top 10% of this filtered network (about 400k edges) to use as the 'filtered' version of this network." 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 4, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "GIANT All Tissues (Top) Interactions: 38903547\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 45 | "GIANT_Raw = pd.read_csv(wd+'/Network_Data_Raw/GIANT_All_Tissues_Top', sep='\\t', header=-1, low_memory=False)\n", 46 | "GIANT_Raw.columns = ['NodeA', 'NodeB', 'Prob']\n", 47 | "print 'GIANT All Tissues (Top) Interactions:', GIANT_Raw.shape[0]" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 5, 53 | "metadata": { 54 | "collapsed": false, 55 | "scrolled": true 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "# Get all genes to convert from GeneMANIA\n", 60 | "GIANT_Raw_Genes = list(set(GIANT_Raw['NodeA']).union(GIANT_Raw['NodeB']))\n", 61 | "# Convert all entrezIDs to string forst\n", 62 | "GIANT_Raw_Genes = [str(entrezID) for entrezID in GIANT_Raw_Genes]" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "## Convert genes from Entrez ID to HUGO Symbol" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 6, 75 | "metadata": { 76 | "collapsed": false 77 | }, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | "output_type": "stream", 82 | "text": [ 83 | "25689 Valid Query Genes\n", 84 | "0 Invalid Query Genes\n" 85 | ] 86 | } 87 | ], 88 | "source": [ 89 | "query_string, valid_genes, invalid_genes = gct.query_constructor(GIANT_Raw_Genes)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 7, 95 | "metadata": { 96 | "collapsed": false 97 | }, 98 | "outputs": [ 99 | { 100 | "name": "stdout", 101 | "output_type": "stream", 102 | "text": [ 103 | "Batch query complete: 30.55 seconds\n", 104 | "25690 Matched query results\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "# Set scopes (gene naming systems to search)\n", 110 | "scopes = \"entrezgene, retired, alias\"\n", 111 | "\n", 112 | "# Set fields (systems from which to return gene names from)\n", 113 | "fields = \"symbol, entrezgene\"\n", 114 | "\n", 115 | "# Query MyGene.Info\n", 116 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n", 117 | "print len(match_list), 'Matched query results'" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 8, 123 | "metadata": { 124 | "collapsed": false 125 | }, 126 | "outputs": [ 127 | { 128 | "name": "stdout", 129 | "output_type": "stream", 130 | "text": [ 131 | "Queries without full matching results found: 806\n", 132 | "\n", 133 | "1 Queries with mutliple matches found\n", 134 | "\n", 135 | "Query mapping table/dictionary construction complete: 140.47 seconds\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "match_table_trim, query_to_symbol, 
query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "## Construct converted network and filter edges" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 9, 153 | "metadata": { 154 | "collapsed": true 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "GIANT_Raw_edgelist = GIANT_Raw.values.tolist()" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 13, 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "outputs": [], 168 | "source": [ 169 | "# Convert GIANT network edgelist\n", 170 | "GIANT_Raw_edgelist_symbol = [sorted([query_to_symbol[str(int(edge[0]))], query_to_symbol[str(int(edge[1]))]])+[edge[2]] for edge in GIANT_Raw_edgelist]" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 14, 176 | "metadata": { 177 | "collapsed": false 178 | }, 179 | "outputs": [ 180 | { 181 | "name": "stdout", 182 | "output_type": "stream", 183 | "text": [ 184 | "38903547 input edges\n", 185 | "19204 self-edges removed\n", 186 | "2417020 edges with un-mapped genes removed\n", 187 | "151720 duplicate edges removed\n", 188 | "Edge list filtered: 225.47 seconds\n", 189 | "36315603 Edges remaining\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "# Filter GIANT network edgelist\n", 195 | "GIANT_edgelist_symbol_filt = gct.filter_converted_edgelist(GIANT_Raw_edgelist_symbol, remove_self_edges=True, weighted=True)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "## Filter to top 10% of edges by weight/score" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 16, 208 | "metadata": { 209 | "collapsed": false 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "GIANT_edgelist_symbol_filt_table = pd.DataFrame(GIANT_edgelist_symbol_filt, columns=['NodeA', 'NodeB', 'Score'])" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 20, 219 | "metadata": { 220 | "collapsed": false 221 | }, 222 | "outputs": [ 223 | { 224 | "name": "stdout", 225 | "output_type": "stream", 226 | "text": [ 227 | "90% score: 0.207416\n" 228 | ] 229 | } 230 | ], 231 | "source": [ 232 | "# Filter edges by score quantile\n", 233 | "q_score = GIANT_edgelist_symbol_filt_table['Score'].quantile(0.9)\n", 234 | "print '90% score:', q_score\n", 235 | "GIANTtop_edgelist = GIANT_edgelist_symbol_filt_table[GIANT_edgelist_symbol_filt_table['Score']>q_score]" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 23, 241 | "metadata": { 242 | "collapsed": true 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "# Save weighted network for GIANT filtered to top 10% of downloaded edges to file\n", 247 | "GIANTtop_edgelist.to_csv('/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/Network_SIFs_Symbol/GIANT_Symbol.sif', sep='\\t', header=False, index=False)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 24, 253 | "metadata": { 254 | "collapsed": false 255 | }, 256 | "outputs": [ 257 | { 258 | "name": "stdout", 259 | "output_type": "stream", 260 | "text": [ 261 | "90.0% score: 0.574097\n", 262 | "363128 / 3631554 edges retained\n" 263 | ] 264 | } 265 | ], 266 | "source": [ 267 | "# Create filtered network for GIANT\n", 268 | "GIANT90_edgelist = dit.filter_weighted_network_sif(wd+'Network_SIFs_Symbol/GIANT_Symbol.sif', nodeA_col=0, nodeB_col=1, score_col=2, \n", 269 | " q=0.9, 
delimiter='\\t', verbose=True, save_path=wd+'Network_SIFs_Symbol/GIANT90_Symbol.sif')" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": { 276 | "collapsed": true 277 | }, 278 | "outputs": [], 279 | "source": [] 280 | } 281 | ], 282 | "metadata": { 283 | "kernelspec": { 284 | "display_name": "Python 2", 285 | "language": "python", 286 | "name": "python2" 287 | }, 288 | "language_info": { 289 | "codemirror_mode": { 290 | "name": "ipython", 291 | "version": 2 292 | }, 293 | "file_extension": ".py", 294 | "mimetype": "text/x-python", 295 | "name": "python", 296 | "nbconvert_exporter": "python", 297 | "pygments_lexer": "ipython2", 298 | "version": "2.7.11" 299 | } 300 | }, 301 | "nbformat": 4, 302 | "nbformat_minor": 0 303 | } 304 | -------------------------------------------------------------------------------- /Network Processing Notebooks/GeneMANIA Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from network_evaluation_tools import data_import_tools as dit\n", 13 | "import pandas as pd\n", 14 | "import time" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Load GeneMANIA Raw Data\n", 22 | "#### Source: http://genemania.org/data/current/Homo_sapiens.COMBINED/COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt\n", 23 | "Downloaded: July 28, 2016 \n", 24 | "Last Updated: October 15, 2014\t" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": { 31 | "collapsed": false 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 36 | "GeneMANIA_Raw = pd.read_csv(wd+'/Network_Data_Raw/GeneMANIA/GeneMANIA_2014_10_15.txt',sep='\\t')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "GeneMANIA_Raw_Genes = list(set(GeneMANIA_Raw['Gene_A']).union(set(GeneMANIA_Raw['Gene_B'])))" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "metadata": { 54 | "collapsed": false 55 | }, 56 | "outputs": [ 57 | { 58 | "name": "stdout", 59 | "output_type": "stream", 60 | "text": [ 61 | "7290094 Total GeneMANIA Edges\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "# Get Edgelist of network\n", 67 | "query_edgelist = GeneMANIA_Raw[['Gene_A','Gene_B', 'Weight']].values.tolist()\n", 68 | "print len(query_edgelist), \"Total GeneMANIA Edges\"" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## Convert Genes (from ensembl gene to gene symbol)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 5, 81 | "metadata": { 82 | "collapsed": false 83 | }, 84 | "outputs": [ 85 | { 86 | "name": "stdout", 87 | "output_type": "stream", 88 | "text": [ 89 | "19264 Valid Query Genes\n", 90 | "0 Invalid Query Genes\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "query_string, valid_genes, invalid_genes = gct.query_constructor(GeneMANIA_Raw_Genes)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 6, 101 | "metadata": { 102 | "collapsed": true 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "# Set scopes (gene naming 
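The `dit.filter_weighted_network_sif` calls that produce GIANT90 (above) and GeneMANIA90 (below) boil down to a score-quantile cut. A minimal pandas sketch follows, with an illustrative file path; note the caveat that a strict '>' cut can retain nothing when the score distribution saturates at its maximum, as the InBioMap notebook later in this collection shows.

```python
import pandas as pd

# Hypothetical three-column SIF: nodeA <tab> nodeB <tab> score
edges = pd.read_csv('network_Symbol.sif', sep='\t', names=['NodeA', 'NodeB', 'Score'])

threshold = edges['Score'].quantile(0.9)       # 90th percentile of edge scores
top_edges = edges[edges['Score'] > threshold]  # keep the top ~10% of edges
top_edges.to_csv('network90_Symbol.sif', sep='\t', header=False, index=False)
print('{} / {} edges retained'.format(len(top_edges), len(edges)))
```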
systems to search)\n", 107 | "scopes = \"ensemblgene\"\n", 108 | "\n", 109 | "# Set fields (systems from which to return gene names from)\n", 110 | "fields = \"symbol, entrezgene\"" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 7, 116 | "metadata": { 117 | "collapsed": false 118 | }, 119 | "outputs": [ 120 | { 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | "Batch query complete: 35.43 seconds\n", 125 | "19266 Matched query results\n" 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "# Query MyGene.Info\n", 131 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n", 132 | "print len(match_list), 'Matched query results'" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 8, 138 | "metadata": { 139 | "collapsed": false 140 | }, 141 | "outputs": [ 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "Queries without full matching results found: 1547\n", 147 | "\n", 148 | "1 Queries with mutliple matches found\n", 149 | "\n", 150 | "Query mapping table/dictionary construction complete: 111.04 seconds\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "## Construct Converted Network" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 9, 168 | "metadata": { 169 | "collapsed": false 170 | }, 171 | "outputs": [ 172 | { 173 | "name": "stdout", 174 | "output_type": "stream", 175 | "text": [ 176 | "CPU times: user 18.5 s, sys: 1.36 s, total: 19.9 s\n", 177 | "Wall time: 19.5 s\n" 178 | ] 179 | } 180 | ], 181 | "source": [ 182 | "%%time\n", 183 | "# Convert weighted edge list\n", 184 | "GeneMANIA_edgelist_symbol = gct.convert_edgelist(query_edgelist, query_to_symbol, weighted=True)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 10, 190 | "metadata": { 191 | "collapsed": false 192 | }, 193 | "outputs": [ 194 | { 195 | "name": "stdout", 196 | "output_type": "stream", 197 | "text": [ 198 | "7290094 input edges\n", 199 | "22144 self-edges removed\n", 200 | "665798 edges with un-mapped genes removed\n", 201 | "508 duplicate edges removed\n", 202 | "Edge list filtered: 39.33 seconds\n", 203 | "6601644 Edges remaining\n" 204 | ] 205 | } 206 | ], 207 | "source": [ 208 | "# Filter converted edge list\n", 209 | "GeneMANIA_edgelist_symbol_filt = gct.filter_converted_edgelist(GeneMANIA_edgelist_symbol, weighted=True)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 11, 215 | "metadata": { 216 | "collapsed": false 217 | }, 218 | "outputs": [ 219 | { 220 | "name": "stdout", 221 | "output_type": "stream", 222 | "text": [ 223 | "Edge list saved: 13.39 seconds\n" 224 | ] 225 | } 226 | ], 227 | "source": [ 228 | "# Write network to file\n", 229 | "gct.write_edgelist(GeneMANIA_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/GeneMANIA_Symbol.sif', binary=False)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 12, 235 | "metadata": { 236 | "collapsed": false 237 | }, 238 | "outputs": [ 239 | { 240 | "name": "stdout", 241 | "output_type": "stream", 242 | "text": [ 243 | "90.0% score: 0.00023\n", 244 | "618546 / 6601644 edges retained\n" 245 | ] 246 | } 247 | ], 248 | "source": [ 249 | "# Create filtered network\n", 250 | "GeneMANIA90_edgelist = 
dit.filter_weighted_network_sif(wd+'Network_SIFs_Symbol/GeneMANIA_Symbol.sif', nodeA_col=0, nodeB_col=1, score_col=2, \n", 251 | " q=0.9, delimiter='\\t', verbose=True, save_path=wd+'Network_SIFs_Symbol/GeneMANIA90_Symbol.sif')" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": { 258 | "collapsed": true 259 | }, 260 | "outputs": [], 261 | "source": [] 262 | } 263 | ], 264 | "metadata": { 265 | "kernelspec": { 266 | "display_name": "Python 2", 267 | "language": "python", 268 | "name": "python2" 269 | }, 270 | "language_info": { 271 | "codemirror_mode": { 272 | "name": "ipython", 273 | "version": 2 274 | }, 275 | "file_extension": ".py", 276 | "mimetype": "text/x-python", 277 | "name": "python", 278 | "nbconvert_exporter": "python", 279 | "pygments_lexer": "ipython2", 280 | "version": "2.7.11" 281 | } 282 | }, 283 | "nbformat": 4, 284 | "nbformat_minor": 0 285 | } 286 | -------------------------------------------------------------------------------- /Network Processing Notebooks/HINT Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from network_evaluation_tools import data_import_tools as dit\n", 13 | "import pandas as pd\n", 14 | "import numpy as np" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Load HINT Raw Data\n", 22 | "#### Source: http://hint.yulab.org/batch.html\n", 23 | "Downloaded: June 15, 2017 \n", 24 | "Last update not listed, but currently on version 4 (updated early 2017). The two binary interactomes for High-Quality (HQ) and Co-Complex (CC) interactions were downloaded and merged into a single interactome for HINT. \n", 25 | "Citation: Das J and Yu H. HINT: High-quality protein interactomes and their applications in understanding human disease. BMC Systems Biology, 2012 Jul 30;6(1):92." 
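Throughout these notebooks, `gct.query_constructor` and `gct.query_batch` wrap the MyGene.info batch annotation service. A rough standalone equivalent using the public `mygene` client is sketched below; it is an approximation of what those helpers do, not their actual implementation, and the example accessions are illustrative.

```python
import mygene

mg = mygene.MyGeneInfo()
ids = ['P04637', 'P38398']  # illustrative UniProt accessions

# One batched request; returns one result dict per query term
matches = mg.querymany(ids, scopes='uniprot', fields='symbol,entrezgene',
                       species='human')
id_to_symbol = {m['query']: m.get('symbol') for m in matches
                if not m.get('notfound')}
print(id_to_symbol)  # e.g. {'P04637': 'TP53', 'P38398': 'BRCA1'}
```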
26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 37 | "HINT_Bin_Raw = pd.read_csv(wd+'Network_Data_Raw/HINT_v4_binary_HomoSapiens.txt',sep='\\t')\n", 38 | "HINT_Com_Raw = pd.read_csv(wd+'Network_Data_Raw/HINT_v4_complex_HomoSapiens.txt',sep='\\t')" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 5, 44 | "metadata": { 45 | "collapsed": false, 46 | "scrolled": true 47 | }, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "Concatenated list of edges: (181699, 9)\n", 54 | "After duplicate edges removed: (181375, 9)\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "HINT_Raw = pd.concat([HINT_Bin_Raw, HINT_Com_Raw])\n", 60 | "print 'Concatenated list of edges:', HINT_Raw.shape\n", 61 | "HINT_Raw = HINT_Raw.drop_duplicates()\n", 62 | "print 'After duplicate edges removed:', HINT_Raw.shape" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 7, 68 | "metadata": { 69 | "collapsed": false, 70 | "scrolled": true 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "# Use UniProtID labels to annotate interactions\n", 75 | "HINT_Raw_Genes_Uniprot = set(HINT_Raw['Uniprot_A']).union(set(HINT_Raw['Uniprot_B']))" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "## Convert Genes from UniProt Accession ID to gene symbols" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 9, 88 | "metadata": { 89 | "collapsed": false 90 | }, 91 | "outputs": [ 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "15784 Valid Query Genes\n", 97 | "0 Invalid Query Genes\n" 98 | ] 99 | } 100 | ], 101 | "source": [ 102 | "query_string, valid_genes, invalid_genes = gct.query_constructor(HINT_Raw_Genes_Uniprot)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 10, 108 | "metadata": { 109 | "collapsed": false 110 | }, 111 | "outputs": [ 112 | { 113 | "name": "stdout", 114 | "output_type": "stream", 115 | "text": [ 116 | "Batch query complete: 19.17 seconds\n", 117 | "16001 Matched query results\n" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "# Set scopes (gene naming systems to search)\n", 123 | "scopes = \"uniprot\"\n", 124 | "\n", 125 | "# Set fields (systems from which to return gene names from)\n", 126 | "fields = \"symbol, entrezgene\"\n", 127 | "\n", 128 | "# Query MyGene.Info\n", 129 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n", 130 | "print len(match_list), 'Matched query results'" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 11, 136 | "metadata": { 137 | "collapsed": false, 138 | "scrolled": true 139 | }, 140 | "outputs": [ 141 | { 142 | "name": "stdout", 143 | "output_type": "stream", 144 | "text": [ 145 | "Queries without full matching results found: 670\n", 146 | "\n", 147 | "163 Queries with mutliple matches found\n", 148 | "\n", 149 | "Query mapping table/dictionary construction complete: 59.26 seconds\n" 150 | ] 151 | } 152 | ], 153 | "source": [ 154 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "## Construct Converted Network" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | 
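The `gct.filter_converted_edgelist` step that follows removes self-edges and treats A-B and B-A as the same undirected edge; after symbol conversion, distinct UniProt pairs can collapse onto one symbol pair, which is why thousands of duplicates are dropped. A minimal sketch of that canonicalization (the function name is illustrative, not the library's implementation):

```python
def filter_edgelist_sketch(edgelist):
    """Drop self-edges and A-B/B-A duplicates from an undirected edge list."""
    seen, filtered = set(), []
    for node_a, node_b in edgelist:
        if node_a is None or node_b is None:  # un-mapped gene
            continue
        if node_a == node_b:                  # self-edge
            continue
        key = tuple(sorted((node_a, node_b)))  # canonical orientation
        if key not in seen:
            seen.add(key)
            filtered.append(list(key))
    return filtered

print(filter_edgelist_sketch([['TP53', 'MDM2'], ['MDM2', 'TP53'], ['EGFR', 'EGFR']]))
# -> [['MDM2', 'TP53']]
```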
"execution_count": 13, 167 | "metadata": { 168 | "collapsed": true 169 | }, 170 | "outputs": [], 171 | "source": [ 172 | "HINT_edgelist = HINT_Raw[['Uniprot_A', 'Uniprot_B']].values.tolist()" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 16, 178 | "metadata": { 179 | "collapsed": false 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "# Convert edge list\n", 184 | "HINT_edgelist_symbol = gct.convert_edgelist(HINT_edgelist, query_to_symbol, weighted=False)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 19, 190 | "metadata": { 191 | "collapsed": false 192 | }, 193 | "outputs": [ 194 | { 195 | "name": "stdout", 196 | "output_type": "stream", 197 | "text": [ 198 | "181375 input edges\n", 199 | "4730 self-edges removed\n", 200 | "2861 edges with un-mapped genes removed\n", 201 | "18325 duplicate edges removed\n", 202 | "Edge list filtered: 0.33 seconds\n", 203 | "155459 Edges remaining\n" 204 | ] 205 | } 206 | ], 207 | "source": [ 208 | "# Filter edge list\n", 209 | "HINT_edgelist_symbol_filt = gct.filter_converted_edgelist(HINT_edgelist_symbol)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 20, 215 | "metadata": { 216 | "collapsed": false 217 | }, 218 | "outputs": [ 219 | { 220 | "name": "stdout", 221 | "output_type": "stream", 222 | "text": [ 223 | "Edge list saved: 0.26 seconds\n" 224 | ] 225 | } 226 | ], 227 | "source": [ 228 | "# Save edge list\n", 229 | "gct.write_edgelist(HINT_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/HINT_Symbol.sif')" 230 | ] 231 | } 232 | ], 233 | "metadata": { 234 | "kernelspec": { 235 | "display_name": "Python 2", 236 | "language": "python", 237 | "name": "python2" 238 | }, 239 | "language_info": { 240 | "codemirror_mode": { 241 | "name": "ipython", 242 | "version": 2 243 | }, 244 | "file_extension": ".py", 245 | "mimetype": "text/x-python", 246 | "name": "python", 247 | "nbconvert_exporter": "python", 248 | "pygments_lexer": "ipython2", 249 | "version": "2.7.11" 250 | } 251 | }, 252 | "nbformat": 4, 253 | "nbformat_minor": 0 254 | } 255 | -------------------------------------------------------------------------------- /Network Processing Notebooks/HPRD Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from network_evaluation_tools import data_import_tools as dit\n", 13 | "import pandas as pd" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "## Load HPRD Raw Data\n", 21 | "#### Source: http://www.hprd.org/download\n", 22 | "#### The file requires registration with the database. 
Download the file: HPRD_Release9_041310.tar.gz\n", 23 | "Downloaded: August 12, 2016 \n", 24 | "Last Updated: June 29, 2010 \n", 25 | "The following files are manipulated after unzipping the .tar.gz file" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 37 | "HPRD_Raw = pd.read_csv(wd+'Network_Data_Raw/HPRD_Release9_062910/BINARY_PROTEIN_PROTEIN_INTERACTIONS.txt',sep='\\t',header=-1)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 5, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "# Assign column names from README file from archive\n", 49 | "HPRD_Raw.columns = ['Interactor 1 Gene Symbol', 'Interactor 1 HPRD ID', 'Interactor 1 RefSeq ID',\n", 50 | " 'Interactor 2 Gene Symbol', 'Interactor 2 HPRD ID', 'Interactor 2 RefSeq ID',\n", 51 | " 'Experiment Type', 'PubMed ID']" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 7, 57 | "metadata": { 58 | "collapsed": false 59 | }, 60 | "outputs": [ 61 | { 62 | "name": "stdout", 63 | "output_type": "stream", 64 | "text": [ 65 | "Edges in HPRD: 39240\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "# Convert table of interactions to edgelist (no scores given)\n", 71 | "# Also no gene symbol conversion necessary because network is given in symbol format already\n", 72 | "HPRD_edgelist = HPRD_Raw[['Interactor 1 Gene Symbol', 'Interactor 2 Gene Symbol']].values.tolist()\n", 73 | "print 'Edges in HPRD:', len(HPRD_edgelist)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 9, 79 | "metadata": { 80 | "collapsed": true 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "# Sort each edge representation for filtering\n", 85 | "HPRD_edgelist_sorted = [sorted(edge) for edge in HPRD_edgelist]" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 10, 91 | "metadata": { 92 | "collapsed": false 93 | }, 94 | "outputs": [ 95 | { 96 | "name": "stdout", 97 | "output_type": "stream", 98 | "text": [ 99 | "39240 input edges\n", 100 | "2160 self-edges removed\n", 101 | "0 edges with un-mapped genes removed\n", 102 | "41 duplicate edges removed\n", 103 | "Edge list filtered: 0.05 seconds\n", 104 | "37039 Edges remaining\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "# Filter edgelist for duplicate nodes and for self-edges\n", 110 | "HPRD_edgelist_filt = gct.filter_converted_edgelist(HPRD_edgelist_sorted)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 12, 116 | "metadata": { 117 | "collapsed": false 118 | }, 119 | "outputs": [ 120 | { 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | "Edge list saved: 0.04 seconds\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "# Save genelist to file\n", 130 | "outdir = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/Network_SIFs_Symbol/'\n", 131 | "gct.write_edgelist(HPRD_edgelist_filt, outdir+'HPRD_Symbol.sif')" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": { 138 | "collapsed": true 139 | }, 140 | "outputs": [], 141 | "source": [] 142 | } 143 | ], 144 | "metadata": { 145 | "kernelspec": { 146 | "display_name": "Python 2", 147 | "language": "python", 148 | "name": "python2" 149 | }, 150 | "language_info": { 151 | "codemirror_mode": { 152 | "name": "ipython", 153 | "version": 2 154 | }, 155 | 
"file_extension": ".py", 156 | "mimetype": "text/x-python", 157 | "name": "python", 158 | "nbconvert_exporter": "python", 159 | "pygments_lexer": "ipython2", 160 | "version": "2.7.11" 161 | } 162 | }, 163 | "nbformat": 4, 164 | "nbformat_minor": 0 165 | } 166 | -------------------------------------------------------------------------------- /Network Processing Notebooks/HumanInteractome Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "from network_evaluation_tools import gene_conversion_tools as gct" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "The following data was downloaded from CCSB and converted to edge list sifs for both symbol and entrez from the simple sifs given in both cases. No additional gene conversions were performed for these networks." 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## Load HI-II-14 (Human Interactome) Raw Data\n", 27 | "#### Source: http://interactome.dfci.harvard.edu/H_sapiens/download/HI-II-14.tsv\n", 28 | "#### File: 'HI-II-14'\n", 29 | "Downloaded: June 20, 2017 \n", 30 | "Last Updated: Not Listed\n", 31 | "Proteome-scale map of the human binary interactome network generated by systematically screening Space-II associated with Rolland et al Cell 2014" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "Raw Interactions in HI-II-14: 13944\n" 46 | ] 47 | } 48 | ], 49 | "source": [ 50 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 51 | "HumanInteractome_Raw = pd.read_csv(wd+'Network_Data_Raw/HI-II-14.tsv',sep='\\t')\n", 52 | "print 'Raw Interactions in HI-II-14:', len(HumanInteractome_Raw)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 5, 58 | "metadata": { 59 | "collapsed": false, 60 | "scrolled": true 61 | }, 62 | "outputs": [ 63 | { 64 | "name": "stdout", 65 | "output_type": "stream", 66 | "text": [ 67 | "Edges in HI-II-14: 13944\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "# Convert table of interactions to edgelist (no scores given)\n", 73 | "# Also no gene symbol conversion necessary because network is given in symbol format already\n", 74 | "HumanInteractome_edgelist = HumanInteractome_Raw[['Symbol A', 'Symbol B']].values.tolist()\n", 75 | "print 'Edges in HI-II-14:', len(HumanInteractome_edgelist)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 7, 81 | "metadata": { 82 | "collapsed": true 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "# Sort each edge representation for filtering\n", 87 | "HumanInteractome_edgelist_sorted = [sorted(edge) for edge in HumanInteractome_edgelist]" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 8, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "13944 input edges\n", 102 | "517 self-edges removed\n", 103 | "0 edges with un-mapped genes removed\n", 104 | "0 duplicate edges removed\n", 105 | "Edge list filtered: 0.02 seconds\n", 106 | "13427 Edges remaining\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 
111 | "# Filter edgelist for duplicate nodes and for self-edges\n", 112 | "HumanInteractome_edgelist_filt = gct.filter_converted_edgelist(HumanInteractome_edgelist_sorted)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 10, 118 | "metadata": { 119 | "collapsed": false 120 | }, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "Edge list saved: 0.02 seconds\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "# Save genelist to file\n", 132 | "outdir = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/Network_SIFs_Symbol/'\n", 133 | "gct.write_edgelist(HumanInteractome_edgelist_filt, outdir+'HumanInteractome_Symbol.sif')" 134 | ] 135 | } 136 | ], 137 | "metadata": { 138 | "kernelspec": { 139 | "display_name": "Python 2", 140 | "language": "python", 141 | "name": "python2" 142 | }, 143 | "language_info": { 144 | "codemirror_mode": { 145 | "name": "ipython", 146 | "version": 2 147 | }, 148 | "file_extension": ".py", 149 | "mimetype": "text/x-python", 150 | "name": "python", 151 | "nbconvert_exporter": "python", 152 | "pygments_lexer": "ipython2", 153 | "version": "2.7.11" 154 | } 155 | }, 156 | "nbformat": 4, 157 | "nbformat_minor": 0 158 | } 159 | -------------------------------------------------------------------------------- /Network Processing Notebooks/HumanNet Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from network_evaluation_tools import data_import_tools as dit\n", 13 | "import pandas as pd\n", 14 | "import time" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Load HumanNet Raw Data\n", 22 | "#### Source: http://www.functionalnet.org/humannet/HumanNet.v1.benchmark.txt\n", 23 | "Downloaded: August 12, 2016 \n", 24 | "No latest version date posted (last updated likely around 2011). \n", 25 | "Citation: Insuk Lee, U. Martin Blom, Peggy I. Wang, Jung Eun Shin, and Edward M. 
Marcotte\n", 26 | "Genome Research 21(7):1109-21 (2011)" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 38 | "HumanNet_Raw = pd.read_csv(wd+'Network_Data_Raw/HumanNet.v1.join.txt',sep='\\t',header=-1)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "metadata": { 45 | "collapsed": false 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "f = open(wd+'Network_Data_Raw/HumanNet.v1.evidence_code.txt')\n", 50 | "HumanNet_headers = ['Gene 1', 'Gene 2']+[name.split(' = ')[0] for name in f.read().splitlines()[1:-1]]\n", 51 | "HumanNet_Raw.columns = HumanNet_headers" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 4, 57 | "metadata": { 58 | "collapsed": true 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "# Extract gene list\n", 63 | "HumanNet_Raw_Genes = list(set(HumanNet_Raw['Gene 1']).union(set(HumanNet_Raw['Gene 2'])))\n", 64 | "HumanNet_Raw_Genes = [str(gene) for gene in HumanNet_Raw_Genes]" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 5, 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "476399 HumanNet Edges\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "# Get edge list of network\n", 84 | "query_edgelist = HumanNet_Raw[['Gene 1','Gene 2']].astype(str)\n", 85 | "query_edgelist = pd.concat([query_edgelist, HumanNet_Raw['IntNet']], axis=1).values.tolist()\n", 86 | "print len(query_edgelist), \"HumanNet Edges\"" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "## Convert genes from Entrez ID to HUGO Symbol" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 6, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "16243 Valid Query Genes\n", 108 | "0 Invalid Query Genes\n" 109 | ] 110 | } 111 | ], 112 | "source": [ 113 | "query_string, valid_genes, invalid_genes = gct.query_constructor(HumanNet_Raw_Genes)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 7, 119 | "metadata": { 120 | "collapsed": false 121 | }, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | "Batch query complete: 19.6 seconds\n", 128 | "16243 Matched query results\n" 129 | ] 130 | } 131 | ], 132 | "source": [ 133 | "# Set scopes (gene naming systems to search)\n", 134 | "scopes = \"entrezgene, retired\"\n", 135 | "\n", 136 | "# Set fields (systems from which to return gene names from)\n", 137 | "fields = \"symbol, entrezgene\"\n", 138 | "\n", 139 | "# Query MyGene.Info\n", 140 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n", 141 | "print len(match_list), 'Matched query results'" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 8, 147 | "metadata": { 148 | "collapsed": false 149 | }, 150 | "outputs": [ 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "Queries without full matching results found: 10\n", 156 | "\n", 157 | "0 Queries with mutliple matches found\n", 158 | "\n", 159 | "Query mapping table/dictionary construction complete: 19.62 seconds\n" 160 | ] 161 | } 162 | ], 163 | "source": [ 164 | 
"match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "## Construct Converted Network" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 9, 177 | "metadata": { 178 | "collapsed": false 179 | }, 180 | "outputs": [ 181 | { 182 | "name": "stdout", 183 | "output_type": "stream", 184 | "text": [ 185 | "CPU times: user 1.54 s, sys: 260 ms, total: 1.8 s\n", 186 | "Wall time: 1.69 s\n" 187 | ] 188 | } 189 | ], 190 | "source": [ 191 | "%%time\n", 192 | "# Convert weighted edge list\n", 193 | "HumanNet_edgelist_symbol = gct.convert_edgelist(query_edgelist, query_to_symbol, weighted=True)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 10, 199 | "metadata": { 200 | "collapsed": false 201 | }, 202 | "outputs": [ 203 | { 204 | "name": "stdout", 205 | "output_type": "stream", 206 | "text": [ 207 | "476399 input edges\n", 208 | "7 self-edges removed\n", 209 | "225 edges with un-mapped genes removed\n", 210 | "208 duplicate edges removed\n", 211 | "Edge list filtered: 4.15 seconds\n", 212 | "475959 Edges remaining\n" 213 | ] 214 | } 215 | ], 216 | "source": [ 217 | "# Filter converted edge list\n", 218 | "HumanNet_edgelist_symbol_filt = gct.filter_converted_edgelist(HumanNet_edgelist_symbol, weighted=True)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 11, 224 | "metadata": { 225 | "collapsed": false 226 | }, 227 | "outputs": [ 228 | { 229 | "name": "stdout", 230 | "output_type": "stream", 231 | "text": [ 232 | "Edge list saved: 1.24 seconds\n" 233 | ] 234 | } 235 | ], 236 | "source": [ 237 | "# Write network to file\n", 238 | "gct.write_edgelist(HumanNet_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/HumanNet_Symbol.sif', binary=False)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 15, 244 | "metadata": { 245 | "collapsed": false 246 | }, 247 | "outputs": [ 248 | { 249 | "name": "stdout", 250 | "output_type": "stream", 251 | "text": [ 252 | "90.0% score: 2.17047289928\n", 253 | "47595 / 475959 edges retained\n" 254 | ] 255 | } 256 | ], 257 | "source": [ 258 | "# Create filtered network\n", 259 | "HumanNet90_edgelist = dit.filter_weighted_network_sif(wd+'Network_SIFs_Symbol/HumanNet_Symbol.sif', nodeA_col=0, nodeB_col=1, score_col=2, \n", 260 | " q=0.9, delimiter='\\t', verbose=True, save_path=wd+'Network_SIFs_Symbol/HumanNet90_Symbol.sif')" 261 | ] 262 | } 263 | ], 264 | "metadata": { 265 | "kernelspec": { 266 | "display_name": "Python 2", 267 | "language": "python", 268 | "name": "python2" 269 | }, 270 | "language_info": { 271 | "codemirror_mode": { 272 | "name": "ipython", 273 | "version": 2 274 | }, 275 | "file_extension": ".py", 276 | "mimetype": "text/x-python", 277 | "name": "python", 278 | "nbconvert_exporter": "python", 279 | "pygments_lexer": "ipython2", 280 | "version": "2.7.11" 281 | } 282 | }, 283 | "nbformat": 4, 284 | "nbformat_minor": 0 285 | } 286 | -------------------------------------------------------------------------------- /Network Processing Notebooks/InBioMap Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from 
network_evaluation_tools import data_import_tools as dit\n", 13 | "import pandas as pd\n", 14 | "import itertools\n", 15 | "import time" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## Load InBio_Map Raw Data\n", 23 | "#### Source: https://www.intomics.com/inbio/map/#downloads\n", 24 | "Downloaded: November 30, 2016 \n", 25 | "Last Updated: September 12, 2016 \n", 26 | "Note about scoring: According to the supplement of the associated paper (Li T, et al. A scored human protein–protein interaction network to catalyze genomic interpretation. Nature Methods 14, 61–64 (2017) doi:10.1038/nmeth.4083), column 15 (index=14) should correspond to the confidence score of the edge. This column has 2 values, the confidence score and initial score. We will use the confidence score as it is a corrected version of the initial score calculated, indicating confidence that a particular interaction is real." 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "Raw edge count in InBio_Map: 625641\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 46 | "InBio_Map_Raw = pd.read_csv(wd+'Network_Data_Raw/InBio_Map_core_2016_09_12/core.psimitab',sep='\\t', header=-1)\n", 47 | "print 'Raw edge count in InBio_Map:', len(InBio_Map_Raw)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": { 54 | "collapsed": false 55 | }, 56 | "outputs": [ 57 | { 58 | "name": "stdout", 59 | "output_type": "stream", 60 | "text": [ 61 | "Human-Human only interactions in InBioMap: 625641\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "InBio_Map_Human_Only = InBio_Map_Raw[(InBio_Map_Raw[9]=='taxid:9606(Homo sapiens)') & (InBio_Map_Raw[10]=='taxid:9606(Homo sapiens)')]\n", 67 | "print 'Human-Human only interactions in InBioMap:', len(InBio_Map_Human_Only)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 4, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "# Extract gene list\n", 79 | "InBio_Map_Human_Genes = list(set(InBio_Map_Human_Only[0]).union(set(InBio_Map_Human_Only[1])))\n", 80 | "InBio_Map_Human_Genes = [str(gene) for gene in InBio_Map_Human_Genes]" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "## Convert Genes" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 5, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "17653 Valid Query Genes\n", 102 | "0 Invalid Query Genes\n" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "# Construct list of genes to be submitted to MyGene.Info API\n", 108 | "query_string, valid_genes, invalid_genes = gct.query_constructor(InBio_Map_Human_Genes)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 6, 114 | "metadata": { 115 | "collapsed": true 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "# Set scopes (gene naming systems to search)\n", 120 | "scopes = \"uniprot\"\n", 121 | "\n", 122 | "# Set fields (systems from which to return gene names from)\n", 123 | "fields = \"symbol, entrezgene\"" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 7, 129 | "metadata": { 130 | "collapsed": 
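The human-only restriction above keys on the two PSI-MITAB taxid columns. A generic sketch of that filter follows; the column positions and the exact taxid label are assumptions that vary by export (IntAct, for example, uses 'taxid:9606(human)|taxid:9606(Homo sapiens)').

```python
import pandas as pd

def human_only(mitab_df, taxid_col_a=9, taxid_col_b=10,
               human_label='taxid:9606(Homo sapiens)'):
    """Keep rows where both interactors are annotated as human."""
    mask = ((mitab_df[taxid_col_a] == human_label) &
            (mitab_df[taxid_col_b] == human_label))
    return mitab_df[mask]
```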
false 131 | }, 132 | "outputs": [ 133 | { 134 | "name": "stdout", 135 | "output_type": "stream", 136 | "text": [ 137 | "Batch query complete: 39.84 seconds\n", 138 | "17984 Matched query results\n" 139 | ] 140 | } 141 | ], 142 | "source": [ 143 | "# Query MyGene.Info\n", 144 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n", 145 | "print len(match_list), 'Matched query results'" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 8, 151 | "metadata": { 152 | "collapsed": false 153 | }, 154 | "outputs": [ 155 | { 156 | "name": "stdout", 157 | "output_type": "stream", 158 | "text": [ 159 | "Queries without full matching results found: 419\n", 160 | "\n", 161 | "233 Queries with mutliple matches found\n", 162 | "\n", 163 | "Query mapping table/dictionary construction complete: 76.78 seconds\n" 164 | ] 165 | } 166 | ], 167 | "source": [ 168 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "## Construct Converted Network" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 9, 181 | "metadata": { 182 | "collapsed": true 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "query_edgelist = InBio_Map_Human_Only[[0, 1, 14]].values.tolist()\n", 187 | "query_edgelist_fmt = [[edge[0].split(':')[1], edge[1].split(':')[1], float(edge[2].split('|')[0])] for edge in query_edgelist]" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 10, 193 | "metadata": { 194 | "collapsed": false 195 | }, 196 | "outputs": [ 197 | { 198 | "name": "stdout", 199 | "output_type": "stream", 200 | "text": [ 201 | "CPU times: user 1.89 s, sys: 197 ms, total: 2.09 s\n", 202 | "Wall time: 1.87 s\n" 203 | ] 204 | } 205 | ], 206 | "source": [ 207 | "%%time\n", 208 | "# Convert weighted edge list\n", 209 | "InBioMap_edgelist_symbol = gct.convert_edgelist(query_edgelist_fmt, query_to_symbol, weighted=True)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 11, 215 | "metadata": { 216 | "collapsed": false 217 | }, 218 | "outputs": [ 219 | { 220 | "name": "stdout", 221 | "output_type": "stream", 222 | "text": [ 223 | "625641 input edges\n", 224 | "2498 self-edges removed\n", 225 | "12249 edges with un-mapped genes removed\n", 226 | "4896 duplicate edges removed\n", 227 | "Edge list filtered: 3.15 seconds\n", 228 | "605998 Edges remaining\n" 229 | ] 230 | } 231 | ], 232 | "source": [ 233 | "# Filter converted edge list\n", 234 | "InBioMap_edgelist_symbol_filt = gct.filter_converted_edgelist(InBioMap_edgelist_symbol, weighted=True)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 12, 240 | "metadata": { 241 | "collapsed": false 242 | }, 243 | "outputs": [ 244 | { 245 | "name": "stdout", 246 | "output_type": "stream", 247 | "text": [ 248 | "Edge list saved: 1.77 seconds\n" 249 | ] 250 | } 251 | ], 252 | "source": [ 253 | "# Write network to file\n", 254 | "gct.write_edgelist(InBioMap_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/InBioMap_Symbol.sif', binary=False)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 13, 260 | "metadata": { 261 | "collapsed": false 262 | }, 263 | "outputs": [ 264 | { 265 | "name": "stdout", 266 | "output_type": "stream", 267 | "text": [ 268 | "90.0% score: 1.0\n", 269 | "0 / 605998 edges retained\n" 270 | ] 271 | } 272 | ], 273 | "source": [ 274 | "# Create 
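The formatting step above unpacks InBioMap's PSI-MITAB fields: identifiers carry a database prefix before a colon, and the confidence column packs two '|'-separated numbers with the corrected confidence first. A sketch of that parsing on a hypothetical row:

```python
row = ['uniprotkb:P04637', 'uniprotkb:Q00987', '0.987|0.826']  # hypothetical MITAB fragment

node_a = row[0].split(':')[1]             # 'P04637' - strip the database prefix
node_b = row[1].split(':')[1]             # 'Q00987'
confidence = float(row[2].split('|')[0])  # 0.987 - first value is the corrected score
print('{}\t{}\t{}'.format(node_a, node_b, confidence))
```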
filtered network\n", 275 | "InBioMap90_edgelist = dit.filter_weighted_network_sif(wd+'Network_SIFs_Symbol/InBioMap_Symbol.sif', nodeA_col=0, nodeB_col=1, score_col=2, \n", 276 | " q=0.9, delimiter='\\t', verbose=True, save_path=wd+'Network_SIFs_Symbol/InBioMap90_Symbol.sif')" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 14, 282 | "metadata": { 283 | "collapsed": false 284 | }, 285 | "outputs": [ 286 | { 287 | "name": "stdout", 288 | "output_type": "stream", 289 | "text": [ 290 | "151352 / 605998 edges kept, 0.249756599857\n" 291 | ] 292 | } 293 | ], 294 | "source": [ 295 | "# The filter function didn't work here because the max value makes up >90% of the edges. \n", 296 | "# We need to filter but keep all max edges instead\n", 297 | "InBioMap_edgelist = pd.DataFrame(InBioMap_edgelist_symbol_filt, columns=['NodeA', 'NodeB', 'edgeScore'])\n", 298 | "q_score = InBioMap_edgelist['edgeScore'].quantile(0.9)\n", 299 | "InBioMap_edgelist_filt = InBioMap_edgelist[InBioMap_edgelist['edgeScore']>=q_score]\n", 300 | "print InBioMap_edgelist_filt.shape[0], '/', InBioMap_edgelist.shape[0], 'edges kept, ', float(InBioMap_edgelist_filt.shape[0])/InBioMap_edgelist.shape[0]" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 15, 306 | "metadata": { 307 | "collapsed": false 308 | }, 309 | "outputs": [], 310 | "source": [ 311 | "# Keeping all edges where the score == 1, it's a top 75% network, we will save this\n", 312 | "InBioMap_edgelist_filt[['NodeA', 'NodeB']].to_csv(wd+'Network_SIFs_Symbol/InBioMap75_Symbol.sif', sep='\\t', index=False, header=False)" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": { 319 | "collapsed": true 320 | }, 321 | "outputs": [], 322 | "source": [] 323 | } 324 | ], 325 | "metadata": { 326 | "kernelspec": { 327 | "display_name": "Python 2", 328 | "language": "python", 329 | "name": "python2" 330 | }, 331 | "language_info": { 332 | "codemirror_mode": { 333 | "name": "ipython", 334 | "version": 2 335 | }, 336 | "file_extension": ".py", 337 | "mimetype": "text/x-python", 338 | "name": "python", 339 | "nbconvert_exporter": "python", 340 | "pygments_lexer": "ipython2", 341 | "version": "2.7.11" 342 | } 343 | }, 344 | "nbformat": 4, 345 | "nbformat_minor": 0 346 | } 347 | -------------------------------------------------------------------------------- /Network Processing Notebooks/IntAct Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from network_evaluation_tools import data_import_tools as dit\n", 13 | "import pandas as pd\n", 14 | "import time" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Load IntAct Raw Data\n", 22 | "#### Source (PSI-MITAB): ftp://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.txt\n", 23 | "Downloaded: June 15, 2017 \n", 24 | "Last Updated: June 05, 2017 \n", 25 | "Notes for processing: All interactions listed here need to be filtered for human-human interactions. Given the size of the file, we will filter the interactions and save the human-only interactions to a separate file to be loaded to save memory." 
26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 4, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stderr", 37 | "output_type": "stream", 38 | "text": [ 39 | "/cellar/users/jkhuang/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (38,39) have mixed types. Specify dtype option on import or set low_memory=False.\n", 40 | " interactivity=interactivity, compiler=compiler, result=result)\n" 41 | ] 42 | }, 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "Raw edge count in IntAct: 653104\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 53 | "IntAct_Raw = pd.read_csv(wd+'Network_Data_Raw/IntAct/2016-09-08_intact.txt', sep='\\t')\n", 54 | "print 'Raw edge count in IntAct:', len(IntAct_Raw)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "## Custom Processing of Raw IntAct Data" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 5, 67 | "metadata": { 68 | "collapsed": false 69 | }, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/plain": [ 74 | "Index([u'#ID(s) interactor A', u'ID(s) interactor B',\n", 75 | " u'Alt. ID(s) interactor A', u'Alt. ID(s) interactor B',\n", 76 | " u'Alias(es) interactor A', u'Alias(es) interactor B',\n", 77 | " u'Interaction detection method(s)', u'Publication 1st author(s)',\n", 78 | " u'Publication Identifier(s)', u'Taxid interactor A',\n", 79 | " u'Taxid interactor B', u'Interaction type(s)', u'Source database(s)',\n", 80 | " u'Interaction identifier(s)', u'Confidence value(s)',\n", 81 | " u'Expansion method(s)', u'Biological role(s) interactor A',\n", 82 | " u'Biological role(s) interactor B',\n", 83 | " u'Experimental role(s) interactor A',\n", 84 | " u'Experimental role(s) interactor B', u'Type(s) interactor A',\n", 85 | " u'Type(s) interactor B', u'Xref(s) interactor A',\n", 86 | " u'Xref(s) interactor B', u'Interaction Xref(s)',\n", 87 | " u'Annotation(s) interactor A', u'Annotation(s) interactor B',\n", 88 | " u'Interaction annotation(s)', u'Host organism(s)',\n", 89 | " u'Interaction parameter(s)', u'Creation date', u'Update date',\n", 90 | " u'Checksum(s) interactor A', u'Checksum(s) interactor B',\n", 91 | " u'Interaction Checksum(s)', u'Negative', u'Feature(s) interactor A',\n", 92 | " u'Feature(s) interactor B', u'Stoichiometry(s) interactor A',\n", 93 | " u'Stoichiometry(s) interactor B',\n", 94 | " u'Identification method participant A',\n", 95 | " u'Identification method participant B'],\n", 96 | " dtype='object')" 97 | ] 98 | }, 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "IntAct_Raw.columns" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "#### Keep only human-human interactions" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 7, 118 | "metadata": { 119 | "collapsed": false, 120 | "scrolled": false 121 | }, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | "Human-Human only edge count in IntAct: 247565\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "# Filter for only human-human interactions in IntAct\n", 133 | "IntAct_Human_Only = IntAct_Raw[(IntAct_Raw['Taxid interactor A']=='taxid:9606(human)|taxid:9606(Homo sapiens)') & (IntAct_Raw['Taxid interactor 
B']=='taxid:9606(human)|taxid:9606(Homo sapiens)')]\n", 134 | "IntAct_Human_Only = IntAct_Human_Only.drop_duplicates()\n", 135 | "print 'Human-Human only edge count in IntAct:', IntAct_Human_Only.shape[0]" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 9, 141 | "metadata": { 142 | "collapsed": false 143 | }, 144 | "outputs": [], 145 | "source": [ 146 | "Human_IntAct_Genes = list(set(IntAct_Human_Only['#ID(s) interactor A']).union(set(IntAct_Human_Only['ID(s) interactor B'])))" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "## Convert Genes" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 14, 159 | "metadata": { 160 | "collapsed": false 161 | }, 162 | "outputs": [ 163 | { 164 | "name": "stdout", 165 | "output_type": "stream", 166 | "text": [ 167 | "19143 Valid Query Genes\n", 168 | "1162 Invalid Query Genes\n" 169 | ] 170 | } 171 | ], 172 | "source": [ 173 | "# Construct list of genes to be submitted to MyGene.Info API (remove all genes with 'intact' prefix)\n", 174 | "query_string, valid_genes, invalid_genes = gct.query_constructor(Human_IntAct_Genes, exclude_prefixes=['intact'])" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 17, 180 | "metadata": { 181 | "collapsed": false 182 | }, 183 | "outputs": [ 184 | { 185 | "name": "stdout", 186 | "output_type": "stream", 187 | "text": [ 188 | "Batch query complete: 29.14 seconds\n", 189 | "19368 Matched query results\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "# Set scopes (gene naming systems to search)\n", 195 | "scopes = \"uniprot\"\n", 196 | "\n", 197 | "# Set fields (systems from which to return gene names from)\n", 198 | "fields = \"symbol, entrezgene\"\n", 199 | "\n", 200 | "# Query MyGene.Info\n", 201 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n", 202 | "print len(match_list), 'Matched query results'" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 18, 208 | "metadata": { 209 | "collapsed": false, 210 | "scrolled": true 211 | }, 212 | "outputs": [ 213 | { 214 | "name": "stdout", 215 | "output_type": "stream", 216 | "text": [ 217 | "Queries without full matching results found: 4329\n", 218 | "\n", 219 | "157 Queries with mutliple matches found\n", 220 | "\n", 221 | "Query mapping table/dictionary construction complete: 94.21 seconds\n" 222 | ] 223 | } 224 | ], 225 | "source": [ 226 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "## Construct Converted Network" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 19, 239 | "metadata": { 240 | "collapsed": false 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "query_edgelist = IntAct_Human_Only[['#ID(s) interactor A', 'ID(s) interactor B']].drop_duplicates().values.tolist()" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 21, 250 | "metadata": { 251 | "collapsed": false 252 | }, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "5864 / 161035 edges with invalid nodes removed\n" 259 | ] 260 | } 261 | ], 262 | "source": [ 263 | "# Filter query edgelist of interactions with invalid genes\n", 264 | "query_edgelist_filt = gct.filter_query_edgelist(query_edgelist, invalid_genes)" 265 | ] 266 | }, 267 | { 268 | 
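IntAct identifier fields can hold internal accessions such as 'intact:EBI-…' that MyGene.info cannot map, which is why `query_constructor` above was given `exclude_prefixes=['intact']`; the remaining identifiers are later stripped of their prefix with `gct.get_identifier_without_prefix`. A sketch of both steps on hypothetical values:

```python
ids = ['uniprotkb:P04637', 'intact:EBI-123456', 'uniprotkb:Q00987']

invalid = [i for i in ids if i.lower().startswith('intact:')]  # un-mappable accessions
valid = [i for i in ids if i not in invalid]

# Equivalent of stripping the prefix: drop everything up to the first ':'
stripped = [i.split(':', 1)[1] for i in valid]
print(stripped)  # ['P04637', 'Q00987']
```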
"cell_type": "code", 269 | "execution_count": 23, 270 | "metadata": { 271 | "collapsed": true 272 | }, 273 | "outputs": [], 274 | "source": [ 275 | "# Format edge list by removing 'uniprot:' prefix from all interactors\n", 276 | "query_edgelist_filt_fmt = [[gct.get_identifier_without_prefix(edge[0]), gct.get_identifier_without_prefix(edge[1])] for edge in query_edgelist_filt]" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 24, 282 | "metadata": { 283 | "collapsed": false 284 | }, 285 | "outputs": [], 286 | "source": [ 287 | "# Convert network edge list to symbol\n", 288 | "IntAct_edgelist_symbol = gct.convert_edgelist(query_edgelist_filt_fmt, query_to_symbol)" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 25, 294 | "metadata": { 295 | "collapsed": false 296 | }, 297 | "outputs": [ 298 | { 299 | "name": "stdout", 300 | "output_type": "stream", 301 | "text": [ 302 | "155171 input edges\n", 303 | "3236 self-edges removed\n", 304 | "20662 edges with un-mapped genes removed\n", 305 | "16701 duplicate edges removed\n", 306 | "Edge list filtered: 0.43 seconds\n", 307 | "114572 Edges remaining\n" 308 | ] 309 | } 310 | ], 311 | "source": [ 312 | "# Filter converted edge list\n", 313 | "IntAct_edgelist_symbol_filt = gct.filter_converted_edgelist(IntAct_edgelist_symbol)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 26, 319 | "metadata": { 320 | "collapsed": false 321 | }, 322 | "outputs": [ 323 | { 324 | "name": "stdout", 325 | "output_type": "stream", 326 | "text": [ 327 | "Edge list saved: 0.24 seconds\n" 328 | ] 329 | } 330 | ], 331 | "source": [ 332 | "# Save filtered, converted edge list to file\n", 333 | "gct.write_edgelist(IntAct_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/IntAct_Symbol.sif')" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": { 340 | "collapsed": true 341 | }, 342 | "outputs": [], 343 | "source": [] 344 | } 345 | ], 346 | "metadata": { 347 | "kernelspec": { 348 | "display_name": "Python 2", 349 | "language": "python", 350 | "name": "python2" 351 | }, 352 | "language_info": { 353 | "codemirror_mode": { 354 | "name": "ipython", 355 | "version": 2 356 | }, 357 | "file_extension": ".py", 358 | "mimetype": "text/x-python", 359 | "name": "python", 360 | "nbconvert_exporter": "python", 361 | "pygments_lexer": "ipython2", 362 | "version": "2.7.11" 363 | } 364 | }, 365 | "nbformat": 4, 366 | "nbformat_minor": 0 367 | } 368 | -------------------------------------------------------------------------------- /Network Processing Notebooks/Mentha Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from network_evaluation_tools import data_import_tools as dit\n", 13 | "import pandas as pd\n", 14 | "import itertools\n", 15 | "import time" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## Load Mentha Raw Data\n", 23 | "#### Source (MITAB): http://mentha.uniroma2.it/doDownload.php?file=2017-06-12_MITAB-2.5.zip\n", 24 | "Downloaded: June 15, 2017 \n", 25 | "Last Updated: June 12, 2017 \n", 26 | "Notes for processing: This is the file should contain only human-human protein interactions but this should be checked and 
filtered if needed. \n", 27 | "A Note about scoring: Mentha does have a score assigned for each interaction called the 'mentha-score', this will be the score we use to filter the network." 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "Raw edge count in Mentha: 1114184\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 47 | "Mentha_Raw = pd.read_csv(wd+'Network_Data_Raw/mentha_2017_06_12', sep='\\t', header=-1)\n", 48 | "print 'Raw edge count in Mentha:', len(Mentha_Raw)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "Human-Human only interactions in Mentha: 531726\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "# Keep only human-human interactions\n", 68 | "Mentha_Human_only = Mentha_Raw[(Mentha_Raw[9]=='taxid:9606(Homo sapiens)') & (Mentha_Raw[10]=='taxid:9606(Homo sapiens)')]\n", 69 | "print 'Human-Human only interactions in Mentha:', len(Mentha_Human_only)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 4, 75 | "metadata": { 76 | "collapsed": true 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "# Extract gene list\n", 81 | "Human_Mentha_Genes = list(set(Mentha_Human_only[0]).union(set(Mentha_Human_only[1])))" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "## Convert Network Genes to symbol from UniProt Accession ID" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 5, 94 | "metadata": { 95 | "collapsed": false 96 | }, 97 | "outputs": [ 98 | { 99 | "name": "stdout", 100 | "output_type": "stream", 101 | "text": [ 102 | "18626 Valid Query Genes\n", 103 | "0 Invalid Query Genes\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "# Construct list of genes to be submitted to MyGene.Info API (remove all genes with 'intact' prefix)\n", 109 | "query_string, valid_genes, invalid_genes = gct.query_constructor(Human_Mentha_Genes)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 6, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "Batch query complete: 62.69 seconds\n", 124 | "18932 Matched query results\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "# Set scopes (gene naming systems to search)\n", 130 | "scopes = \"uniprot\"\n", 131 | "\n", 132 | "# Set fields (systems from which to return gene names from)\n", 133 | "fields = \"symbol, entrezgene\"\n", 134 | "\n", 135 | "# Query MyGene.Info\n", 136 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n", 137 | "print len(match_list), 'Matched query results'" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 7, 143 | "metadata": { 144 | "collapsed": false 145 | }, 146 | "outputs": [ 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "Queries without full matching results found: 1198\n", 152 | "The history saving thread hit an unexpected error (OperationalError('database is locked',)).History will not be written to the database.\n", 153 | "\n", 154 | "207 Queries with mutliple matches found\n", 155 | "\n", 156 | 
"Query mapping table/dictionary construction complete: 83.92 seconds\n" 157 | ] 158 | } 159 | ], 160 | "source": [ 161 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "## Construct Converted Network" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 8, 174 | "metadata": { 175 | "collapsed": true 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "query_edgelist = Mentha_Human_only[[0, 1, 14]].drop_duplicates().values.tolist()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 9, 185 | "metadata": { 186 | "collapsed": true 187 | }, 188 | "outputs": [], 189 | "source": [ 190 | "# Format edge list by removing 'uniprot:' prefix from all interactors\n", 191 | "query_edgelist_fmt = [[gct.get_identifier_without_prefix(edge[0]), gct.get_identifier_without_prefix(edge[1]), float(edge[2].split(':')[-1])] for edge in query_edgelist]" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 10, 197 | "metadata": { 198 | "collapsed": true 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "# Convert network edge list to symbol\n", 203 | "Mentha_edgelist_symbol = gct.convert_edgelist(query_edgelist_fmt, query_to_symbol, weighted=True)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 11, 209 | "metadata": { 210 | "collapsed": false 211 | }, 212 | "outputs": [ 213 | { 214 | "name": "stdout", 215 | "output_type": "stream", 216 | "text": [ 217 | "327857 input edges\n", 218 | "3247 self-edges removed\n", 219 | "8219 edges with un-mapped genes removed\n", 220 | "53515 duplicate edges removed\n", 221 | "Edge list filtered: 1.61 seconds\n", 222 | "262876 Edges remaining\n" 223 | ] 224 | } 225 | ], 226 | "source": [ 227 | "# Filter converted edge list\n", 228 | "Mentha_edgelist_symbol_filt = gct.filter_converted_edgelist(Mentha_edgelist_symbol, weighted=True)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 12, 234 | "metadata": { 235 | "collapsed": false 236 | }, 237 | "outputs": [ 238 | { 239 | "name": "stdout", 240 | "output_type": "stream", 241 | "text": [ 242 | "Edge list saved: 0.79 seconds\n" 243 | ] 244 | } 245 | ], 246 | "source": [ 247 | "# Save filtered, converted edge list to file\n", 248 | "gct.write_edgelist(Mentha_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/Mentha_Symbol.sif', binary=False)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 13, 254 | "metadata": { 255 | "collapsed": false 256 | }, 257 | "outputs": [ 258 | { 259 | "name": "stdout", 260 | "output_type": "stream", 261 | "text": [ 262 | "90.0% score: 0.454\n", 263 | "22886 / 262876 edges retained\n" 264 | ] 265 | } 266 | ], 267 | "source": [ 268 | "# Create filtered network\n", 269 | "Mentha90_edgelist = dit.filter_weighted_network_sif(wd+'Network_SIFs_Symbol/Mentha_Symbol.sif', nodeA_col=0, nodeB_col=1, score_col=2, \n", 270 | " q=0.9, delimiter='\\t', verbose=True, save_path=wd+'Network_SIFs_Symbol/Mentha90_Symbol.sif')" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": { 277 | "collapsed": true 278 | }, 279 | "outputs": [], 280 | "source": [] 281 | } 282 | ], 283 | "metadata": { 284 | "kernelspec": { 285 | "display_name": "Python 2", 286 | "language": "python", 287 | "name": "python2" 288 | }, 289 | "language_info": { 290 | "codemirror_mode": { 291 | 
"name": "ipython", 292 | "version": 2 293 | }, 294 | "file_extension": ".py", 295 | "mimetype": "text/x-python", 296 | "name": "python", 297 | "nbconvert_exporter": "python", 298 | "pygments_lexer": "ipython2", 299 | "version": "2.7.11" 300 | } 301 | }, 302 | "nbformat": 4, 303 | "nbformat_minor": 0 304 | } 305 | -------------------------------------------------------------------------------- /Network Processing Notebooks/MultiNet Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from network_evaluation_tools import data_import_tools as dit\n", 13 | "import pandas as pd\n", 14 | "import itertools" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Load MultiNet Raw Data\n", 22 | "#### Source: http://homes.gersteinlab.org/Khurana-PLoSCompBio-2013/\n", 23 | "Downloaded: August 12, 2016 \n", 24 | "Last Updated: March 17, 2013 \n", 25 | "Processing Notes: MultiNet has labels which interactions are noted as PPI and which are not. In the initial case, we will be examining all interaction information for MultiNet. However, in this case it is simple enough to parse the PPI only information from the data, and can be done in future work if necessary." 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 4, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "Raw edge count in MultiNet: 109598\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 45 | "MultiNet_Raw = pd.read_csv(wd+'Network_Data_Raw/Multinet.interactions.network_presence_2013_03_17.txt',sep='\\t')\n", 46 | "print 'Raw edge count in MultiNet:', MultiNet_Raw.shape[0]" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 8, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "# Build edge list from interaction column. 
The two parts of the interaction name on either side of '_' are gene symbols\n", 58 | "MultiNet_edgelist = [interaction.split('_') for interaction in MultiNet_Raw['INTERACTION_NAME']]" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 9, 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "# Sort each edge representation for filtering\n", 70 | "MultiNet_edgelist_sorted = [sorted(edge) for edge in MultiNet_edgelist]" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 10, 76 | "metadata": { 77 | "collapsed": false 78 | }, 79 | "outputs": [ 80 | { 81 | "name": "stdout", 82 | "output_type": "stream", 83 | "text": [ 84 | "109598 input edges\n", 85 | "0 self-edges removed\n", 86 | "0 edges with un-mapped genes removed\n", 87 | "0 duplicate edges removed\n", 88 | "Edge list filtered: 0.31 seconds\n", 89 | "109598 Edges remaining\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "# Filter edgelist for duplicate nodes and for self-edges\n", 95 | "MultiNet_edgelist_filt = gct.filter_converted_edgelist(MultiNet_edgelist_sorted)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 13, 101 | "metadata": { 102 | "collapsed": false 103 | }, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "Edge list saved: 0.11 seconds\n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "# Save genelist to file\n", 115 | "outdir = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/Network_SIFs_Symbol/'\n", 116 | "gct.write_edgelist(MultiNet_edgelist_filt, outdir+'MultiNet_Symbol.sif')" 117 | ] 118 | } 119 | ], 120 | "metadata": { 121 | "kernelspec": { 122 | "display_name": "Python 2", 123 | "language": "python", 124 | "name": "python2" 125 | }, 126 | "language_info": { 127 | "codemirror_mode": { 128 | "name": "ipython", 129 | "version": 2 130 | }, 131 | "file_extension": ".py", 132 | "mimetype": "text/x-python", 133 | "name": "python", 134 | "nbconvert_exporter": "python", 135 | "pygments_lexer": "ipython2", 136 | "version": "2.7.11" 137 | } 138 | }, 139 | "nbformat": 4, 140 | "nbformat_minor": 0 141 | } 142 | -------------------------------------------------------------------------------- /Network Processing Notebooks/PID Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from network_evaluation_tools import data_import_tools as dit\n", 13 | "import pandas as pd\n", 14 | "import time" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Load PID Raw Data\n", 22 | "#### Source: http://www.pathwaycommons.org/archives/PC2/v9/PathwayCommons9.pid.hgnc.sif.gz\n", 23 | "Downloaded: June 19, 2017 \n", 24 | "Last (via Pathway Commons v8 datasources.txt file): July 27, 2010 \n", 25 | "Note: The text file has more lines than the sif file in Pathway Commons. However, the text file has some interactions that are unclear how to resolve so for this case we will use the sif file provided by Pathway Commons \n", 26 | "Also note: This network contains some interacions with CHEBI small molecules. 
These interactions will be removed" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 38 | "PID_Raw = pd.read_csv(wd+'Network_Data_Raw/PathwayCommons9.pid.hgnc.sif',sep='\\t', header=-1)\n", 39 | "print 'Raw interactions in NCI PID:', PID_Raw.shape[0]" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 10, 45 | "metadata": { 46 | "collapsed": false 47 | }, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "Protein-Protein interactions in NCI PID: 27489\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "# Filter all interactions that contain a CHEBI: item\n", 59 | "PID_filt = PID_Raw[(~PID_Raw[0].str.contains(':')) & (~PID_Raw[2].str.contains(':'))]\n", 60 | "PID_edgelist = PID_filt[[0, 2]].values.tolist()\n", 61 | "print 'Protein-Protein interactions in NCI PID:', len(PID_edgelist)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 11, 67 | "metadata": { 68 | "collapsed": true 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "# Sort each edge representation for filtering\n", 73 | "PID_edgelist_sorted = [sorted(edge) for edge in PID_edgelist]" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 12, 79 | "metadata": { 80 | "collapsed": false 81 | }, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "27489 input edges\n", 88 | "0 self-edges removed\n", 89 | "0 edges with un-mapped genes removed\n", 90 | "6047 duplicate edges removed\n", 91 | "Edge list filtered: 0.11 seconds\n", 92 | "21442 Edges remaining\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "# Filter edgelist for duplicate nodes and for self-edges\n", 98 | "PID_edgelist_filt = gct.filter_converted_edgelist(PID_edgelist_sorted)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 14, 104 | "metadata": { 105 | "collapsed": false 106 | }, 107 | "outputs": [ 108 | { 109 | "name": "stdout", 110 | "output_type": "stream", 111 | "text": [ 112 | "Edge list saved: 0.06 seconds\n" 113 | ] 114 | } 115 | ], 116 | "source": [ 117 | "# Save genelist to file\n", 118 | "outdir = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/Network_SIFs_Symbol/'\n", 119 | "gct.write_edgelist(PID_edgelist_filt, outdir+'PID_Symbol.sif')" 120 | ] 121 | } 122 | ], 123 | "metadata": { 124 | "kernelspec": { 125 | "display_name": "Python 2", 126 | "language": "python", 127 | "name": "python2" 128 | }, 129 | "language_info": { 130 | "codemirror_mode": { 131 | "name": "ipython", 132 | "version": 2 133 | }, 134 | "file_extension": ".py", 135 | "mimetype": "text/x-python", 136 | "name": "python", 137 | "nbconvert_exporter": "python", 138 | "pygments_lexer": "ipython2", 139 | "version": "2.7.11" 140 | } 141 | }, 142 | "nbformat": 4, 143 | "nbformat_minor": 0 144 | } 145 | -------------------------------------------------------------------------------- /Network Processing Notebooks/Pathway Commons Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from network_evaluation_tools import data_import_tools as dit\n", 
13 | "import pandas as pd\n", 14 | "import time" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Load Pathway Commons Raw Data (All interactions)\n", 22 | "#### Source: http://www.pathwaycommons.org/archives/PC2/v9/PathwayCommons9.All.hgnc.txt.gz\n", 23 | "Downloaded: June 15, 2017 \n", 24 | "Last Updated: May 25, 2017 \n", 25 | "Citation: Pathway Commons, a web resource for biological pathway data. Cerami E et al. Nucleic Acids Research (2011). \n", 26 | "A Note about filtering interactions: Pathway Commons also contains interactions between proteins and small molecules from the CHEBI database. These interactions will need to be filtered out as they are not protein-protein interactions. \n", 27 | "Also note: The text file has more lines than the sif file in Pathway Commons. However, the text file has some interactions that are unclear how to resolve so for this case we will use the sif file provided by Pathway Commons" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 9, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "Raw interactions in Pathway Commons v9: 1503144\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 47 | "PC_Raw = pd.read_csv(wd+'Network_Data_Raw/PathwayCommons9.All.hgnc.sif', sep='\\t', header=-1)\n", 48 | "print 'Raw interactions in Pathway Commons v9:', PC_Raw.shape[0]" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 25, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "Protein-Protein interactions in Pathway Commons v9: 968186\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "# Filter all interactions that contain a CHEBI: item\n", 68 | "PC_filt = PC_Raw[(~PC_Raw[0].str.contains(':')) & (~PC_Raw[2].str.contains(':'))]\n", 69 | "PC_edgelist = PC_filt[[0, 2]].values.tolist()\n", 70 | "print 'Protein-Protein interactions in Pathway Commons v9:', len(PC_edgelist)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 26, 76 | "metadata": { 77 | "collapsed": true 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "# Sort each edge representation for filtering\n", 82 | "PC_edgelist_sorted = [sorted(edge) for edge in PC_edgelist]" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 27, 88 | "metadata": { 89 | "collapsed": false 90 | }, 91 | "outputs": [ 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "968186 input edges\n", 97 | "0 self-edges removed\n", 98 | "0 edges with un-mapped genes removed\n", 99 | "143511 duplicate edges removed\n", 100 | "Edge list filtered: 1.92 seconds\n", 101 | "824675 Edges remaining\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "# Filter edgelist for duplicate nodes and for self-edges\n", 107 | "PC_edgelist_filt = gct.filter_converted_edgelist(PC_edgelist_sorted)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 28, 113 | "metadata": { 114 | "collapsed": false 115 | }, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | "Edge list saved: 0.55 seconds\n" 122 | ] 123 | } 124 | ], 125 | "source": [ 126 | "# Save genelist to file\n", 127 | "outdir = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/Network_SIFs_Symbol/'\n", 128 | 
"gct.write_edgelist(PC_edgelist_filt, outdir+'PathwayCommons_Symbol.sif')" 129 | ] 130 | } 131 | ], 132 | "metadata": { 133 | "kernelspec": { 134 | "display_name": "Python 2", 135 | "language": "python", 136 | "name": "python2" 137 | }, 138 | "language_info": { 139 | "codemirror_mode": { 140 | "name": "ipython", 141 | "version": 2 142 | }, 143 | "file_extension": ".py", 144 | "mimetype": "text/x-python", 145 | "name": "python", 146 | "nbconvert_exporter": "python", 147 | "pygments_lexer": "ipython2", 148 | "version": "2.7.11" 149 | } 150 | }, 151 | "nbformat": 4, 152 | "nbformat_minor": 0 153 | } 154 | -------------------------------------------------------------------------------- /Network Processing Notebooks/Reactome Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from network_evaluation_tools import data_import_tools as dit\n", 13 | "import pandas as pd\n", 14 | "import itertools\n", 15 | "import time" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## Load Reactome Raw Data\n", 23 | "#### Source: http://www.reactome.org/download/current/homo_sapiens.interactions.txt.gz\n", 24 | "#### File to download: The link labelled \"Human protein-protein interaction pairs in tab-delimited format\" seems to have many more interactions than the MITAB file format. This is the file that we will use for this network.\n", 25 | "Downloaded: June 15, 2017 \n", 26 | "Last Updated: April 20, 2017 " 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "Raw Edges in Reactome v60: 2523567\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 46 | "Reactome_Raw = pd.read_csv(wd+'Network_Data_Raw/Reactome_v60.interactions.txt',sep='\\t',skiprows=1, header=-1, low_memory=False)\n", 47 | "print 'Raw Edges in Reactome v60:', len(Reactome_Raw)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": { 54 | "collapsed": false 55 | }, 56 | "outputs": [ 57 | { 58 | "name": "stdout", 59 | "output_type": "stream", 60 | "text": [ 61 | "214432 Raw Reactome Edges after removing duplicate edges\n", 62 | "210066 Raw Reactome Edges after removing duplicate and self-edges\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "# Get edge list of network (filter for duplicate edges and self-edges)\n", 68 | "query_edgelist_filt = Reactome_Raw[[0,3]].drop_duplicates()\n", 69 | "print len(query_edgelist_filt), \"Raw Reactome Edges after removing duplicate edges\"\n", 70 | "query_edgelist_filt2 = query_edgelist_filt[query_edgelist_filt[0]!=query_edgelist_filt[3]]\n", 71 | "print len(query_edgelist_filt2), \"Raw Reactome Edges after removing duplicate and self-edges\"\n", 72 | "query_edgelist = query_edgelist_filt2.values.tolist()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 4, 78 | "metadata": { 79 | "collapsed": true 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "# Extract gene list\n", 84 | "Reactome_Raw_Genes = list(set(query_edgelist_filt2[0]).union(set(query_edgelist_filt2[3])))" 85 | ] 86 | }, 87 | { 
88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "## Convert Genes from UniProtKB to Symbol" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 5, 97 | "metadata": { 98 | "collapsed": false 99 | }, 100 | "outputs": [ 101 | { 102 | "name": "stdout", 103 | "output_type": "stream", 104 | "text": [ 105 | "8387 Valid Query Genes\n", 106 | "0 Invalid Query Genes\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "query_string, valid_genes, invalid_genes = gct.query_constructor(Reactome_Raw_Genes)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 6, 117 | "metadata": { 118 | "collapsed": false 119 | }, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "Batch query complete: 13.56 seconds\n", 126 | "8518 Matched query results\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "# Set scopes (gene naming systems to search)\n", 132 | "scopes = \"uniprot\"\n", 133 | "\n", 134 | "# Set fields (systems from which to return gene names from)\n", 135 | "fields = \"symbol, entrezgene\"\n", 136 | "\n", 137 | "# Query MyGene.Info\n", 138 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n", 139 | "print len(match_list), 'Matched query results'" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 7, 145 | "metadata": { 146 | "collapsed": false 147 | }, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "Queries without full matching results found: 511\n", 154 | "\n", 155 | "102 Queries with mutliple matches found\n", 156 | "\n", 157 | "Query mapping table/dictionary construction complete: 17.83 seconds\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "## Construct Converted Network" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 8, 175 | "metadata": { 176 | "collapsed": false 177 | }, 178 | "outputs": [], 179 | "source": [ 180 | "# Format edge list by removing prefixes from all interactors\n", 181 | "query_edgelist_fmt = [[gct.get_identifier_without_prefix(edge[0]), gct.get_identifier_without_prefix(edge[1])] for edge in query_edgelist]" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 9, 187 | "metadata": { 188 | "collapsed": true 189 | }, 190 | "outputs": [], 191 | "source": [ 192 | "# Convert network edge list to symbol\n", 193 | "Reactome_edgelist_symbol = gct.convert_edgelist(query_edgelist_fmt, query_to_symbol, weighted=False)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 11, 199 | "metadata": { 200 | "collapsed": false 201 | }, 202 | "outputs": [ 203 | { 204 | "name": "stdout", 205 | "output_type": "stream", 206 | "text": [ 207 | "210066 input edges\n", 208 | "2708 self-edges removed\n", 209 | "10886 edges with un-mapped genes removed\n", 210 | "1970 duplicate edges removed\n", 211 | "Edge list filtered: 0.51 seconds\n", 212 | "194502 Edges remaining\n" 213 | ] 214 | } 215 | ], 216 | "source": [ 217 | "# Filter converted edge list\n", 218 | "Reactome_edgelist_symbol_filt = gct.filter_converted_edgelist(Reactome_edgelist_symbol, weighted=False)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 12, 224 | "metadata": { 225 | "collapsed": false 226 | 
}, 227 | "outputs": [ 228 | { 229 | "name": "stdout", 230 | "output_type": "stream", 231 | "text": [ 232 | "Edge list saved: 0.59 seconds\n" 233 | ] 234 | } 235 | ], 236 | "source": [ 237 | "# Save filtered, converted edge list to file\n", 238 | "gct.write_edgelist(Reactome_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/Reactome_Symbol.sif', binary=True)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": { 245 | "collapsed": true 246 | }, 247 | "outputs": [], 248 | "source": [] 249 | } 250 | ], 251 | "metadata": { 252 | "kernelspec": { 253 | "display_name": "Python 2", 254 | "language": "python", 255 | "name": "python2" 256 | }, 257 | "language_info": { 258 | "codemirror_mode": { 259 | "name": "ipython", 260 | "version": 2 261 | }, 262 | "file_extension": ".py", 263 | "mimetype": "text/x-python", 264 | "name": "python", 265 | "nbconvert_exporter": "python", 266 | "pygments_lexer": "ipython2", 267 | "version": "2.7.11" 268 | } 269 | }, 270 | "nbformat": 4, 271 | "nbformat_minor": 0 272 | } 273 | -------------------------------------------------------------------------------- /Network Processing Notebooks/Reactome-FIs Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from network_evaluation_tools import data_import_tools as dit\n", 13 | "import pandas as pd\n", 14 | "import itertools\n", 15 | "import time" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## Load Reactome-Functional Interactions Raw Data\n", 23 | "#### Source: http://reactomews.oicr.on.ca:8080/caBigR3WebApp2016/FIsInGene_022717_with_annotations.txt.zip\n", 24 | "Downloaded: June 15, 2017 \n", 25 | "Last Updated: February 27, 2017 \n", 26 | "Note about processing: It looks like most of the edges are given as gene symbols but many of them seem to be invalid names, so we will use some of the gene conversion tools to filter these results as best we can." 
27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "Raw edges in ReactomeFI: 230243\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 46 | "Reactome_FIs_Raw = pd.read_csv(wd+'Network_Data_Raw/FIsInGene_022717_with_annotations.txt',sep='\\t')\n", 47 | "print 'Raw edges in ReactomeFI:', Reactome_FIs_Raw.shape[0]" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": { 54 | "collapsed": true 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "# Extract gene list\n", 59 | "Reactome_FIs_Raw_Genes = list(set(Reactome_FIs_Raw['Gene1']).union(set(Reactome_FIs_Raw['Gene2'])))" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 4, 65 | "metadata": { 66 | "collapsed": false 67 | }, 68 | "outputs": [ 69 | { 70 | "name": "stdout", 71 | "output_type": "stream", 72 | "text": [ 73 | "12254 Valid Query Genes\n", 74 | "23 Invalid Query Genes:\n", 75 | "['YWHAE/FAM22B FUSION', 'RUNX1/C20ORF112 FUSION', 'IGKV A18', 'APC VARIANT PROTEIN', 'STAG1 VARIANT PROTEIN', 'MIR CL-10', 'BETA 2-MICROGLOBULIN', 'BCR/ABL FUSION', 'ATP2B2 VARIANT PROTEIN', 'ITGA7 VARIANT PROTEIN', 'CREB-1', 'CD40 LIGAND', 'NUMA1 VARIANT PROTEIN', 'PIK4CA VARIANT PROTEIN', 'EPHB2 VARIANT PROTEIN', 'RUNX1/CBFA2T2 FUSION', 'TNC VARIANT PROTEIN', 'PIK3C2B VARIANT PROTEIN', 'PLCG1 VARIANT PROTEIN', 'WUGSC:H_GS165O14.2', 'PIK3CA VARIANT PROTEIN', 'YWHAE/FAM22A FUSION', 'PDHA1/LOC79064']\n" 76 | ] 77 | } 78 | ], 79 | "source": [ 80 | "# Find \"invalid genes\" by text format\n", 81 | "query_string, valid_genes, invalid_genes = gct.query_constructor(Reactome_FIs_Raw_Genes, exclude_prefixes=['CHEBI'], print_invalid_genes=True)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 5, 87 | "metadata": { 88 | "collapsed": false 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "# Get Edgelist of network\n", 93 | "query_edgelist = Reactome_FIs_Raw[['Gene1','Gene2', 'Score']].values.tolist()" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 6, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "820 / 230243 edges with invalid nodes removed\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "# Filter query edges\n", 113 | "query_edgelist_filt = gct.filter_query_edgelist(query_edgelist,invalid_genes)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 7, 119 | "metadata": { 120 | "collapsed": false 121 | }, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | "229423 input edges\n", 128 | "0 self-edges removed\n", 129 | "0 edges with un-mapped genes removed\n", 130 | "0 duplicate edges removed\n", 131 | "Edge list filtered: 1.95 seconds\n", 132 | "229423 Edges remaining\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "# Filter edge list\n", 138 | "ReactomeFI_edgelist_filt = gct.filter_converted_edgelist(query_edgelist_filt, weighted=True)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 8, 144 | "metadata": { 145 | "collapsed": false 146 | }, 147 | "outputs": [ 148 | { 149 | "name": "stdout", 150 | "output_type": "stream", 151 | "text": [ 152 | "Edge list saved: 0.68 seconds\n" 153 | ] 154 | } 155 | ], 156 | 
"source": [ 157 | "# Save filtered, converted edge list to file\n", 158 | "gct.write_edgelist(ReactomeFI_edgelist_filt, wd+'Network_SIFs_Symbol/ReactomeFI_Symbol.sif', binary=False)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 36, 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "90.0% score: 1.0\n", 173 | "0 / 229423 edges retained\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "# Create filtered network\n", 179 | "ReactomeFI90_edgelist = dit.filter_weighted_network_sif(wd+'Network_SIFs_Symbol/ReactomeFI_Symbol.sif', nodeA_col=0, nodeB_col=1, score_col=2, \n", 180 | " q=0.9, delimiter='\\t', verbose=True, save_path=wd+'Network_SIFs_Symbol/ReactomeFI90_edgelist_Symbol.sif')" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 37, 186 | "metadata": { 187 | "collapsed": false 188 | }, 189 | "outputs": [ 190 | { 191 | "name": "stdout", 192 | "output_type": "stream", 193 | "text": [ 194 | "198541 / 229423 edges kept, 0.86539274615\n" 195 | ] 196 | } 197 | ], 198 | "source": [ 199 | "# The filter function didn't work here because the max value makes up >90% of the edges. \n", 200 | "# We need to filter but keep all max edges instead\n", 201 | "ReactomeFI_edgelist = pd.DataFrame(ReactomeFI_edgelist_filt, columns=['NodeA', 'NodeB', 'Score'])\n", 202 | "q_score = ReactomeFI_edgelist['Score'].quantile(0.9)\n", 203 | "ReactomeFI_edgelist_filt2 = ReactomeFI_edgelist[ReactomeFI_edgelist['Score']>=q_score]\n", 204 | "print ReactomeFI_edgelist_filt2.shape[0], '/', ReactomeFI_edgelist.shape[0], 'edges kept, ', float(ReactomeFI_edgelist_filt2.shape[0])/ReactomeFI_edgelist.shape[0]" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": { 211 | "collapsed": true 212 | }, 213 | "outputs": [], 214 | "source": [ 215 | "# Essentially >85% of the edges have the 'maximum score' which makes almost no sense for filtering further" 216 | ] 217 | } 218 | ], 219 | "metadata": { 220 | "kernelspec": { 221 | "display_name": "Python 2", 222 | "language": "python", 223 | "name": "python2" 224 | }, 225 | "language_info": { 226 | "codemirror_mode": { 227 | "name": "ipython", 228 | "version": 2 229 | }, 230 | "file_extension": ".py", 231 | "mimetype": "text/x-python", 232 | "name": "python", 233 | "nbconvert_exporter": "python", 234 | "pygments_lexer": "ipython2", 235 | "version": "2.7.11" 236 | } 237 | }, 238 | "nbformat": 4, 239 | "nbformat_minor": 0 240 | } 241 | -------------------------------------------------------------------------------- /Network Processing Notebooks/STRING Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from network_evaluation_tools import data_import_tools as dit\n", 13 | "import pandas as pd\n", 14 | "import time" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Load STRING Raw Data\n", 22 | "#### Source: http://string-db.org/download/protein.links.v10.5.txt.gz\n", 23 | "#### Source (detailed): http://string-db.org/download/protein.links.detailed.v10.5.txt.gz\n", 24 | "#### File to download: The link labelled 'protein.links.v10.5.txt.gz' is 
simply the binary file version of the 'detailed' file. The detailed file documents the types of interactions and support for each interaction. It can be used for filtering in the future if desired, but will not be filtered on those categories currently.\n", 25 | "Downloaded: June 15, 2017 \n", 26 | "Last Updated: May 14, 2017 \n", 27 | "Processing note: This data needs to be filtered for human-only interactions. This is a very long and large file, so we will parse the edges that are human-human interactions only by streaming the file. Then the resulting human-human interaction file will be read to be processed." 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 4, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "Filtered human-human STRING interactions only: 1793.17046094 seconds\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "# Load and filter STRING for only human-human protein interactions\n", 47 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 48 | "starttime=time.time()\n", 49 | "g=open(wd+'Network_Data_Raw/STRING/STRING_human_v10.5.txt','w')\n", 50 | "with open(wd+'Network_Data_Raw/STRING/protein.links.v10.5.txt') as f:\n", 51 | "    for line in f:\n", 52 | "        edge = line.split(' ')\n", 53 | "        if edge[0].startswith('9606') and edge[1].startswith('9606'):\n", 54 | "            g.write(edge[0].split('.')[1]+'\\t'+edge[1].split('.')[1]+'\\t'+edge[2]+'\\n')\n", 55 | "print 'Filtered human-human STRING interactions only:', time.time()-starttime, 'seconds'\n", 56 | "g.close()" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "## Load human-filtered STRING edges" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 2, 69 | "metadata": { 70 | "collapsed": false 71 | }, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "Raw Edges in STRING v10.5: 11353056\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 83 | "STRING_Raw = pd.read_csv(wd+'Network_Data_Raw/STRING/STRING_human_v10.5.txt',sep='\\t',header=-1)\n", 84 | "STRING_Raw.columns = ['NodeA', 'NodeB', 'Score']\n", 85 | "print 'Raw Edges in STRING v10.5:', len(STRING_Raw)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 3, 91 | "metadata": { 92 | "collapsed": false 93 | }, 94 | "outputs": [ 95 | { 96 | "name": "stdout", 97 | "output_type": "stream", 98 | "text": [ 99 | "Edges in STRING v10.5 after dropping duplicates: 11353056\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "STRING_Raw_filt = STRING_Raw.drop_duplicates()\n", 105 | "print 'Edges in STRING v10.5 after dropping duplicates:', len(STRING_Raw_filt)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 4, 111 | "metadata": { 112 | "collapsed": false 113 | }, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "The history saving thread hit an unexpected error (OperationalError('database is locked',)).History will not be written to the database.\n" 120 | ] 121 | } 122 | ], 123 | "source": [ 124 | "STRING_Genes = list(set(STRING_Raw_filt['NodeA']).union(set(STRING_Raw_filt['NodeB'])))" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 5, 130 | "metadata": { 131 | "collapsed": false 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | 
"query_edgelist = STRING_Raw_filt[['NodeA', 'NodeB', 'Score']].values.tolist()" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "## Convert Genes from Ensembl Protein to Hugo Symbol" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 6, 148 | "metadata": { 149 | "collapsed": false 150 | }, 151 | "outputs": [ 152 | { 153 | "name": "stdout", 154 | "output_type": "stream", 155 | "text": [ 156 | "19576 Valid Query Genes\n", 157 | "0 Invalid Query Genes\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "query_string, valid_genes, invalid_genes = gct.query_constructor(STRING_Genes)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 7, 168 | "metadata": { 169 | "collapsed": false 170 | }, 171 | "outputs": [ 172 | { 173 | "name": "stdout", 174 | "output_type": "stream", 175 | "text": [ 176 | "Batch query complete: 23.11 seconds\n", 177 | "19578 Matched query results\n" 178 | ] 179 | } 180 | ], 181 | "source": [ 182 | "# Set scopes (gene naming systems to search)\n", 183 | "scopes = \"ensemblprotein\"\n", 184 | "\n", 185 | "# Set fields (systems from which to return gene names from)\n", 186 | "fields = \"symbol, entrezgene\"\n", 187 | "\n", 188 | "# Query MyGene.Info\n", 189 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n", 190 | "print len(match_list), 'Matched query results'" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 8, 196 | "metadata": { 197 | "collapsed": false 198 | }, 199 | "outputs": [ 200 | { 201 | "name": "stdout", 202 | "output_type": "stream", 203 | "text": [ 204 | "Queries without full matching results found: 1584\n", 205 | "\n", 206 | "1 Queries with mutliple matches found\n", 207 | "\n", 208 | "Query mapping table/dictionary construction complete: 115.61 seconds\n" 209 | ] 210 | } 211 | ], 212 | "source": [ 213 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "## Construct Converted Network" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 9, 226 | "metadata": { 227 | "collapsed": false 228 | }, 229 | "outputs": [ 230 | { 231 | "name": "stdout", 232 | "output_type": "stream", 233 | "text": [ 234 | "CPU times: user 26.7 s, sys: 2.74 s, total: 29.5 s\n", 235 | "Wall time: 29.2 s\n" 236 | ] 237 | } 238 | ], 239 | "source": [ 240 | "%%time\n", 241 | "# Convert weighted edge list\n", 242 | "STRING_edgelist_symbol = gct.convert_edgelist(query_edgelist, query_to_symbol, weighted=True)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 10, 248 | "metadata": { 249 | "collapsed": false 250 | }, 251 | "outputs": [ 252 | { 253 | "name": "stdout", 254 | "output_type": "stream", 255 | "text": [ 256 | "11353056 input edges\n", 257 | "30268 self-edges removed\n", 258 | "1043874 edges with un-mapped genes removed\n", 259 | "5143146 duplicate edges removed\n", 260 | "Edge list filtered: 77.42 seconds\n", 261 | "5135768 Edges remaining\n" 262 | ] 263 | } 264 | ], 265 | "source": [ 266 | "# Filter converted edge list\n", 267 | "STRING_edgelist_symbol_filt = gct.filter_converted_edgelist(STRING_edgelist_symbol, weighted=True)" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 11, 273 | "metadata": { 274 | "collapsed": false 275 | }, 276 | "outputs": [ 277 | { 278 | "name": "stdout", 279 | 
"output_type": "stream", 280 | "text": [ 281 | "Edge list saved: 8.28 seconds\n" 282 | ] 283 | } 284 | ], 285 | "source": [ 286 | "# Write network to file\n", 287 | "gct.write_edgelist(STRING_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/STRING_Symbol.sif', binary=False)" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 12, 293 | "metadata": { 294 | "collapsed": false 295 | }, 296 | "outputs": [ 297 | { 298 | "name": "stdout", 299 | "output_type": "stream", 300 | "text": [ 301 | "90.0% score: 497.0\n", 302 | "513035 / 5135768 edges retained\n" 303 | ] 304 | } 305 | ], 306 | "source": [ 307 | "# Create filtered network\n", 308 | "STRING90_edgelist = dit.filter_weighted_network_sif(wd+'Network_SIFs_Symbol/STRING_Symbol.sif', nodeA_col=0, nodeB_col=1, score_col=2, \n", 309 | " q=0.9, delimiter='\\t', verbose=True, save_path=wd+'Network_SIFs_Symbol/STRING90_Symbol.sif')" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": { 316 | "collapsed": true 317 | }, 318 | "outputs": [], 319 | "source": [] 320 | } 321 | ], 322 | "metadata": { 323 | "kernelspec": { 324 | "display_name": "Python 2", 325 | "language": "python", 326 | "name": "python2" 327 | }, 328 | "language_info": { 329 | "codemirror_mode": { 330 | "name": "ipython", 331 | "version": 2 332 | }, 333 | "file_extension": ".py", 334 | "mimetype": "text/x-python", 335 | "name": "python", 336 | "nbconvert_exporter": "python", 337 | "pygments_lexer": "ipython2", 338 | "version": "2.7.11" 339 | } 340 | }, 341 | "nbformat": 4, 342 | "nbformat_minor": 0 343 | } 344 | -------------------------------------------------------------------------------- /Network Processing Notebooks/iRefIndex Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from network_evaluation_tools import data_import_tools as dit\n", 13 | "import pandas as pd\n", 14 | "import itertools\n", 15 | "import time" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## Load iRefIndex Raw Data\n", 23 | "#### Source (MITAB): http://irefindex.org/download/irefindex/data/archive/release_14.0/psi_mitab/MITAB2.6/9606.mitab.07042015.txt.zip\n", 24 | "Downloaded: July 28, 2016 \n", 25 | "Last Updated: April 20, 2015 \n", 26 | "Notes for processing: This is the file for human protein interactions, however, not all interactions are human-human interactions. These need to be filtered. Also all ID's not without RefSeq or UniProt ID are excluded. Custom processing for this network is described below\n", 27 | "### From iRefIndex Mapping Documentation Page:\n", 28 | "\"We have made a file which provides mappings between iRefIndex identifiers and popular external identifiers. The current files contain all UniProt and RefSeq identifiers known to the current version of iRefIndex as documented on the sources page. For specific source documentation, see the sources for each released version. 
\n", 29 | " \n", 30 | "Other database identifiers are provided as database/accession pairs only when the iRefIndex identifier (ROGID) does not have a corresponding UniProt or RefSeq record with an identical sequence.\" \n", 31 | " \n", 32 | "Therefore: Interactions containing an ROGID identifier will be removed" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": { 39 | "collapsed": false 40 | }, 41 | "outputs": [ 42 | { 43 | "name": "stdout", 44 | "output_type": "stream", 45 | "text": [ 46 | "Raw edge count in iRefIndex: 673100\n" 47 | ] 48 | } 49 | ], 50 | "source": [ 51 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 52 | "iRefIndex_Raw = pd.read_csv(wd+'Network_Data_Raw/iRefIndex/9606.mitab.04072015.txt',sep='\\t')\n", 53 | "print 'Raw edge count in iRefIndex:', len(iRefIndex_Raw)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 3, 59 | "metadata": { 60 | "collapsed": false 61 | }, 62 | "outputs": [ 63 | { 64 | "name": "stdout", 65 | "output_type": "stream", 66 | "text": [ 67 | "Human-Human only interactions in iRefIndex: 485030\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "# Keep only human-human interactions\n", 73 | "iRef_Human_only = iRefIndex_Raw[(iRefIndex_Raw['taxa']=='taxid:9606(Homo sapiens)') & (iRefIndex_Raw['taxb']=='taxid:9606(Homo sapiens)')]\n", 74 | "print 'Human-Human only interactions in iRefIndex:', len(iRef_Human_only)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 5, 80 | "metadata": { 81 | "collapsed": true 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "# Extract gene list\n", 86 | "Human_iRef_Genes = list(set(iRef_Human_only['#uidA']).union(set(iRef_Human_only['uidB'])))" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 6, 92 | "metadata": { 93 | "collapsed": false 94 | }, 95 | "outputs": [ 96 | { 97 | "name": "stdout", 98 | "output_type": "stream", 99 | "text": [ 100 | "['uniprotkb', 'refseq', 'rogid']\n" 101 | ] 102 | } 103 | ], 104 | "source": [ 105 | "# Get all iRef prefixes\n", 106 | "prefixes=[]\n", 107 | "for gene in Human_iRef_Genes:\n", 108 | " prefix=gene.split(':')[0]\n", 109 | " if prefix not in prefixes:\n", 110 | " prefixes.append(prefix)\n", 111 | "print prefixes" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 7, 117 | "metadata": { 118 | "collapsed": false 119 | }, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "485030 Human iRefIndex Edges\n" 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "# Get edge list of network\n", 131 | "query_edgelist = iRef_Human_only[['#uidA','uidB']].values.tolist()\n", 132 | "print len(query_edgelist), \"Human iRefIndex Edges\"" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "## Convert Genes" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 9, 145 | "metadata": { 146 | "collapsed": false 147 | }, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "23906 Valid Query Genes\n", 154 | "945 Invalid Query Genes\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "# Construct list of genes to be submitted to MyGene.Info API (remove all genes with 'rogid' prefix)\n", 160 | "# This should only keep uniprotkb and refseq as queries\n", 161 | "query_string, valid_genes, invalid_genes = gct.query_constructor(Human_iRef_Genes, exclude_prefixes=['rogid'])" 162 | ] 163 | 
}, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 10, 167 | "metadata": { 168 | "collapsed": false 169 | }, 170 | "outputs": [ 171 | { 172 | "name": "stdout", 173 | "output_type": "stream", 174 | "text": [ 175 | "6305 / 485030 edges with invalid nodes removed\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "# filter edgelist because len(invalid_genes) > 0\n", 181 | "query_edgelist_filt = gct.filter_query_edgelist(query_edgelist, invalid_genes)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 11, 187 | "metadata": { 188 | "collapsed": false 189 | }, 190 | "outputs": [ 191 | { 192 | "name": "stdout", 193 | "output_type": "stream", 194 | "text": [ 195 | "Batch query complete: 48.3 seconds\n", 196 | "24127 Matched query results\n" 197 | ] 198 | } 199 | ], 200 | "source": [ 201 | "# Set scopes (gene naming systems to search)\n", 202 | "scopes = \"uniprot, refseq\"\n", 203 | "\n", 204 | "# Set fields (systems from which to return gene names from)\n", 205 | "fields = \"symbol, entrezgene\"\n", 206 | "\n", 207 | "# Query MyGene.Info\n", 208 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n", 209 | "print len(match_list), 'Matched query results'" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 12, 215 | "metadata": { 216 | "collapsed": false, 217 | "scrolled": true 218 | }, 219 | "outputs": [ 220 | { 221 | "name": "stdout", 222 | "output_type": "stream", 223 | "text": [ 224 | "Queries without full matching results found: 6147\n", 225 | "\n", 226 | "162 Queries with mutliple matches found\n", 227 | "\n", 228 | "Query mapping table/dictionary construction complete: 149.88 seconds\n" 229 | ] 230 | } 231 | ], 232 | "source": [ 233 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "## Construct Converted Network" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 13, 246 | "metadata": { 247 | "collapsed": false 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "# Format edge list by removing prefix indicators from all interactors\n", 252 | "query_edgelist_filt_fmt = [[gct.get_identifier_without_prefix(edge[0]),gct.get_identifier_without_prefix(edge[1])] for edge in query_edgelist_filt]" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 15, 258 | "metadata": { 259 | "collapsed": true 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "# Convert network edge list to symbol\n", 264 | "iRefIndex_edgelist_symbol = gct.convert_edgelist(query_edgelist_filt_fmt, query_to_symbol)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 16, 270 | "metadata": { 271 | "collapsed": false 272 | }, 273 | "outputs": [ 274 | { 275 | "name": "stdout", 276 | "output_type": "stream", 277 | "text": [ 278 | "478725 input edges\n", 279 | "34326 self-edges removed\n", 280 | "132730 edges with un-mapped genes removed\n", 281 | "178121 duplicate edges removed\n", 282 | "Edge list filtered: 0.78 seconds\n", 283 | "133548 Edges remaining\n" 284 | ] 285 | } 286 | ], 287 | "source": [ 288 | "# Filter converted edge list\n", 289 | "iRefIndex_edgelist_symbol_filt = gct.filter_converted_edgelist(iRefIndex_edgelist_symbol)" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 17, 295 | "metadata": { 296 | "collapsed": false 297 | }, 298 | "outputs": [ 299 | { 300 | 
"name": "stdout", 301 | "output_type": "stream", 302 | "text": [ 303 | "Edge list saved: 0.22 seconds\n" 304 | ] 305 | } 306 | ], 307 | "source": [ 308 | "# Save filtered, converted edge list to file\n", 309 | "gct.write_edgelist(iRefIndex_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/iRefIndex_Symbol.sif')" 310 | ] 311 | } 312 | ], 313 | "metadata": { 314 | "kernelspec": { 315 | "display_name": "Python 2", 316 | "language": "python", 317 | "name": "python2" 318 | }, 319 | "language_info": { 320 | "codemirror_mode": { 321 | "name": "ipython", 322 | "version": 2 323 | }, 324 | "file_extension": ".py", 325 | "mimetype": "text/x-python", 326 | "name": "python", 327 | "nbconvert_exporter": "python", 328 | "pygments_lexer": "ipython2", 329 | "version": "2.7.11" 330 | } 331 | }, 332 | "nbformat": 4, 333 | "nbformat_minor": 0 334 | } 335 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Network Evaluation Tools 2 | 3 | Network Evaluation Tools is a Python 2.7 package with corresponding examples for evaluating a network's ability to group a given node set in network proximity. This package was developed as a part of the work done in [Huang and Carlin et al. 2018](http://www.cell.com/cell-systems/fulltext/S2405-4712(18)30095-4). 4 | 5 | ## Modules in this package 6 | - _data_import_tools_ - This module contains functions for helping import network files and gene set files for analysis. 7 | - _gene_conversion_tools_ - This module contains functions for helping convert, filter, and save networks from their raw database form. Used in the Network Processing Jupyter Notebooks. 8 | - _miscellaneous_functions_ - This module contains various functions developed to help with analysis along the way. These functions are not well tested and may contain bugs. These functions were generally used to determine other network performance metrics on network recovery of gene sets. 9 | - _network_evaluation_functions_ - This module contains many of the core functions of the set-based network evaluation algorithm. 10 | - _network_propagation_ - This module contains functions to help with network propagation steps used in the set-based network evaluation algorithm. 11 | 12 | ## Version and Dendencies 13 | Currently, the network_evaluation_tools package requires Python 2.7 - Python 2.7.13. Note that some functions in this package may not work with Python 3.0+. 14 | network_evaluation_tools requires: 15 | - Argparse >= 1.1 16 | - NetworkX >= 2.1 17 | - Numpy >= 1.11.0 18 | - Matplotlib >= 1.5.1 19 | - Pandas >= 0.19.0 20 | - Requests >= 2.13.0 21 | - Scipy >= 0.17.0 22 | - Scikit-learn >= 0.17.1 23 | 24 | Note: 25 | - In Pandas v0.20.0+, the ```.ix```indexer has been deprecated. There may be warning regarding this issue, yet the function still works. 26 | 27 | ## Installation 28 | 1. Clone the repository 29 | 2. cd to new respository 30 | 3. Execute following command: 31 | ```python setup.py install``` 32 | 33 | ## Network analysis 34 | 1. If the network needs to be normalized to a particular naming scheme:
35 | A Jupyter Notebook describing how each network used in the original [paper](http://www.cell.com/cell-systems/fulltext/S2405-4712(18)30095-4) was processed from its raw download file can be found in the ```Network Processing Notebooks``` folder. A minimal sketch of this conversion workflow is also shown after this list.<br>
36 | 2. There are two ways to perform the network evaluation on a gene set:
37 | The following network analyses can be performed either from a Jupyter Notebook or from the command line (see the ```Network Evaluation Examples``` folder). Each Jupyter notebook is documented within the notebook itself, and the documentation for the Python scripts can be seen using the command ```python [script_name].py -h```. A minimal sketch of the propagation step underlying the evaluation is also shown after this list.<br>
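
For orientation, the fragment below sketches the conversion workflow that the processing notebooks (e.g. the iRefIndex notebook) follow: build a MyGene.info batch query, map the identifiers, then convert, filter, and save the edge list. It assumes the package is installed; the two-edge UniProt edge list and the output filename ```Example_Symbol.sif``` are made-up placeholders (not files shipped with this repository), and the mapping results depend on what the live MyGene.info service returns.

```python
from network_evaluation_tools import gene_conversion_tools as gct

# Hypothetical raw edge list with naming-system prefixes (placeholder input for illustration only)
query_edgelist = [['uniprotkb:P04637', 'uniprotkb:P38398'],
                  ['uniprotkb:P04637', 'uniprotkb:Q09472']]
query_genes = list(set([node for edge in query_edgelist for node in edge]))

# Build and post the MyGene.info batch query, then construct the query-to-symbol map
query_string, valid_genes, invalid_genes = gct.query_constructor(query_genes)
match_list = gct.query_batch(query_string, scopes="uniprot", fields="symbol, entrezgene")
match_table, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)

# Strip the naming-system prefixes, convert the edge list to gene symbols, filter it, and save a .sif file
edgelist_fmt = [[gct.get_identifier_without_prefix(a), gct.get_identifier_without_prefix(b)] for a, b in query_edgelist]
edgelist_symbol = gct.convert_edgelist(edgelist_fmt, query_to_symbol)
edgelist_symbol_filt = gct.filter_converted_edgelist(edgelist_symbol)
gct.write_edgelist(edgelist_symbol_filt, 'Example_Symbol.sif')
```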
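
The full AUPRC evaluation is what ```run_network_evaluation.py``` and the example notebook perform; the fragment below is only a minimal sketch of the load-and-propagate step underneath it, written for the Python 2.7 environment this package targets. It assumes a converted network file (the placeholder ```Example_Symbol.sif``` from the conversion sketch; in practice, one of the processed networks) and the DisGeNET gene sets in the ```Data``` folder, with paths relative to the repository root.

```python
from network_evaluation_tools import data_import_tools as dit
from network_evaluation_tools import network_propagation as prop
import pandas as pd

# Load a converted network and the gene set collection
network = dit.load_network_file('Example_Symbol.sif', verbose=True)
node_sets = dit.load_node_sets('Data/DisGeNET_genesets.txt')

# Build a binary (gene set x network node) indicator matrix for one gene set
set_name = node_sets.keys()[0]  # Python 2 dict access; pick any gene set name here
set_genes = [gene for gene in node_sets[set_name] if gene in network.nodes()]
binary_matrix = pd.DataFrame(0, index=[set_name], columns=list(network.nodes()))
binary_matrix.loc[set_name, set_genes] = 1

# Propagate the gene set over the network with the size-tuned propagation coefficient
alpha = prop.calculate_alpha(network)
prop_scores = prop.closed_form_network_propagation(network, binary_matrix, alpha, verbose=True)
```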
38 | 39 | ## Data provided in this repository (see ```Data``` Folder) 40 | - _Database Citations_ - An Excel file containing details about all of the networks used in the original paper's analysis and affiliated citations for all of the databases used. 41 | - _DisGeNET / Oncogenic Component Gene Sets_ - Two tab-separated files, each line containing a gene set from either DisGeNET or the Oncogenic Component collection. The first column of each file is the name of the gene set, followed by the list of genes associated with that gene set on the same line. 42 | - _Network performance (AUPRCs) on DisGeNET / Oncogenic Component Gene Sets_ - Two csv files containing the raw Z-normalized AUPRC scores (network performance scores) of each network analyzed on each gene set analyzed from DisGeNET or the Oncogenic Component gene set collection. 43 | - _Network performance effect sizes on DisGeNET / Oncogenic Component Gene Sets_ - Two csv files containing the relative performance gain of each network's AUPRC score over the median null AUPRC score for each gene set analyzed from DisGeNET or the Oncogenic Component gene set collection. 44 | 45 | ## Issues 46 | Please feel free to post issues/bug reports. Questions can be sent to jkh013@ucsd.edu. 47 | 48 | ## License 49 | See the [LICENSE](https://github.com/huangger/Network_Evaluation_Tools/blob/master/LICENSE.txt) file for license rights and limitations (MIT). 50 | 51 | 52 | -------------------------------------------------------------------------------- /network_evaluation_tools/.ipynb_checkpoints/PSN Construction-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 1 6 | } 7 | -------------------------------------------------------------------------------- /network_evaluation_tools/.ipynb_checkpoints/SBNE Method-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 1 6 | } 7 | -------------------------------------------------------------------------------- /network_evaluation_tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idekerlab/Network_Evaluation_Tools/4c0017e3cc3fa7767f5172cea76b4f3f7d8d0b0b/network_evaluation_tools/__init__.py -------------------------------------------------------------------------------- /network_evaluation_tools/data_import_tools.py: -------------------------------------------------------------------------------- 1 | ############################################### 2 | # ---------- Data Import Functions ---------- # 3 | ############################################### 4 | 5 | import pandas as pd 6 | import networkx as nx 7 | import time 8 | import os 9 | 10 | # Filter an extended sif file, where all edges are weighted, by a specific score quantile 11 | # Return the filtered network edge list and save it to a file if desired (for import by load_network_file) 12 | def filter_weighted_network_sif(network_file_path, nodeA_col=0, nodeB_col=1, score_col=2, q=0.9, delimiter='\t', verbose=False, save_path=None): 13 | data = pd.read_csv(network_file_path, sep=delimiter, header=None, low_memory=False) 14 | # Filter edges by score quantile 15 | q_score = data[score_col].quantile(q) 16 | if verbose: 17 | print str(round(q*100,2))+'%', 'score:', q_score 18 | data_filt = 
data[data[score_col]>q_score][data.columns[[nodeA_col, nodeB_col, score_col]]] 19 | data_filt.columns = ['nodeA', 'nodeB', 'edgeScore'] 20 | if verbose: 21 | print data_filt.shape[0], '/', data.shape[0], 'edges retained' 22 | if save_path is not None: 23 | data_filt.to_csv(save_path, sep='\t', header=False, index=False) 24 | return data_filt 25 | 26 | # Load network from file as unweighted network 27 | # Can set delimiter, but default delimiter is tab 28 | # Will only read edges from the first two columns; all other columns will be ignored 29 | def load_network_file(network_file_path, delimiter='\t', verbose=False): 30 | network = nx.read_edgelist(network_file_path, delimiter=delimiter, data=False) 31 | if verbose: 32 | print 'Network File Loaded:', network_file_path 33 | return network 34 | 35 | # Get full paths to all networks in directory with a given file name structure: 36 | # e.g. If filename = 'BIND_Symbol.sif', then network_name='BIND', suffix='_Symbol', ext='.sif' 37 | def get_networks(wd, suffix=None, file_ext='.sif'): 38 | network_files = {} 39 | for fn in os.listdir(wd): 40 | if suffix==None: 41 | if fn.endswith(file_ext): 42 | network_files[fn.split(file_ext)[0]]=wd+fn 43 | else: 44 | if fn.endswith(file_ext) and fn.split(file_ext)[0].endswith(suffix): 45 | network_files[fn.split(suffix)[0]]=wd+fn 46 | return network_files 47 | 48 | # Companion function with get_networks(), loads all of the network files found in a directory 49 | # Uses the load_network_file() function to load each network, also only imports first two columns, no edge data 50 | # Constructs a dictionary of useful network items for each network in the directory: 51 | # - Actual networkx object representation of network 52 | # - List of nodes by name for each network 53 | # - List of edges by node name for each network 54 | def load_networks(network_file_map, delimiter='\t', verbose=False): 55 | # Initialize dictionaries 56 | networks, network_edges, network_nodes = {}, {}, {} 57 | # Loading network and network properties 58 | for network_name in network_file_map: 59 | loadtime = time.time() 60 | # Load network 61 | network = load_network_file(network_file_map[network_name], verbose=verbose) 62 | networks[network_name]=network 63 | # Construct network node list 64 | network_nodes[network_name] = network.nodes() 65 | # Construct network edge list 66 | network_edges[network_name] = network.edges() 67 | if verbose: 68 | print 'All given network files loaded' 69 | # Return data structure 70 | return networks, network_edges, network_nodes 71 | 72 | # Convert and save MAF from Broad Firehose 73 | # Can produce 2 types of filetypes: 'matrix' or 'list', matrix is a full samples-by-genes binary csv, 'list' is a sparse representation of 'matrix' 74 | # This is a conversion tool, so the result must be saved (most tools will require a path to a processed MAF file and load it separately) 75 | # Gene naming can be 'Symbol' or 'Entrez' 76 | def process_TCGA_MAF(maf_file, save_path, filetype='matrix', gene_naming='Symbol', verbose=False): 77 | loadtime = time.time() 78 | # Load MAF File 79 | TCGA_MAF = pd.read_csv(maf_file,sep='\t',low_memory=False) 80 | # Get all patient somatic mutation (sm) pairs from MAF file 81 | if gene_naming=='Entrez': 82 | TCGA_sm = TCGA_MAF.groupby(['Tumor_Sample_Barcode', 'Entrez_Gene_Id']).size() 83 | else: 84 | TCGA_sm = TCGA_MAF.groupby(['Tumor_Sample_Barcode', 'Hugo_Symbol']).size() 85 | # Turn somatic mutation data into binary matrix 86 | TCGA_sm_mat = TCGA_sm.unstack().fillna(0) 87 | TCGA_sm_mat = 
(TCGA_sm_mat>0).astype(int) 88 | # Trim TCGA barcodes 89 | TCGA_sm_mat.index = [pat[:12] for pat in TCGA_sm_mat.index] 90 | # Filter samples with duplicate IDs 91 | non_dup_IDs = list(TCGA_sm_mat.index.value_counts().index[TCGA_sm_mat.index.value_counts()==1]) 92 | dup_IDs = list(TCGA_sm_mat.index.value_counts().index[TCGA_sm_mat.index.value_counts()>1]) 93 | # Save file as binary matrix or sparse list 94 | if filetype=='list': 95 | # Now try to construct two-column/sparse representation of binary sm data 96 | # Get list of all patient somatic mutations 97 | index_list = list(TCGA_sm.index) 98 | # Filter list of patient somatic mutations of duplicate patient barcodes 99 | index_list_filt = [i for i in index_list if not any([True if barcode in i[0] else False for barcode in dup_IDs])] 100 | # Save patient somatic mutations list to file 101 | f = open(save_path, 'w') 102 | for sm in index_list_filt: 103 | f.write(sm[0][:12]+'\t'+sm[1]+'\n') 104 | f.close() 105 | if verbose: 106 | print 'Binary somatic mutations list saved' 107 | else: 108 | # Save non-duplicate patients' binary TCGA somatic mutation matrix to csv 109 | TCGA_sm_mat_filt = TCGA_sm_mat.ix[non_dup_IDs] 110 | # Remove all genes that have no more mutations after patient filtering 111 | nonempty_cols = [col for col in TCGA_sm_mat_filt.columns if not all(TCGA_sm_mat_filt[col]==0)] 112 | TCGA_sm_mat_filt2 = TCGA_sm_mat_filt[nonempty_cols] 113 | # Remove columns with bad names like '0' 114 | named_cols = [col for col in TCGA_sm_mat_filt2.columns if col!='0'] 115 | TCGA_sm_mat_filt3 = TCGA_sm_mat_filt2[named_cols] 116 | TCGA_sm_mat_filt3.to_csv(save_path) 117 | if verbose: 118 | print 'Binary somatic mutation matrix saved' 119 | if verbose: 120 | print 'MAF file processed:', maf_file, round(time.time()-loadtime, 2), 'seconds.' 
121 | return 122 | 123 | # Load binary mutation data with 2 file types (filetype= 'matrix' or 'list') 124 | # filetype=='matrix' is a csv or tsv style matrix with row and column headers, rows are samples/patients, columns are genes 125 | # filetype=='list' is a 2-column text file separated by the delimiter, where the 1st column is the sample/patient and the 2nd column is one gene mutated in that patient 126 | # Line example in 'list' file: 'Patient ID','Gene Mutated' 127 | def load_binary_mutation_data(filename, filetype='matrix', delimiter=',', verbose=False): 128 | if filetype=='list': 129 | f = open(filename) 130 | binary_mat_lines = f.read().splitlines() 131 | binary_mat_data = [(line.split('\t')[0], line.split('\t')[1]) for line in binary_mat_lines] 132 | binary_mat_index = pd.MultiIndex.from_tuples(binary_mat_data, names=['Tumor_Sample_Barcode', 'Hugo_Symbol']) 133 | binary_mat_2col = pd.DataFrame(1, index=binary_mat_index, columns=[0])[0] 134 | binary_mat = binary_mat_2col.unstack().fillna(0) 135 | else: 136 | binary_mat = pd.read_csv(filename, delimiter=delimiter, index_col=0).astype(int) 137 | if verbose: 138 | print 'Binary Mutation Matrix Loaded:', filename 139 | return binary_mat 140 | 141 | # Concatenate multiple mutation matrices together 142 | # All file type structures and delimiters must be the same (see load_binary_mutation_data()) across all files 143 | def concat_binary_mutation_matrices(filename_list, filetype='matrix', delimiter=',', verbose=False, save_path=None): 144 | binary_mat_list = [load_binary_mutation_data(fn, filetype=filetype, delimiter=delimiter, verbose=verbose) for fn in filename_list] 145 | binary_mat_concat = pd.concat(binary_mat_list).fillna(0) 146 | if verbose: 147 | print 'All binary mutation matrices loaded and concatenated' 148 | if save_path==None: 149 | return binary_mat_concat 150 | else: 151 | binary_mat_concat.to_csv(save_path) 152 | return binary_mat_concat 153 | 154 | # Construct dictionary of node sets from input text file to perform AUPRC analysis on for network of interest 155 | # File format: Each line is a delimited list where the first item in the list is the name of the node set 156 | # All other nodes in the list follow the node set name 157 | def load_node_sets(node_set_file, delimiter='\t', verbose=False): 158 | f = open(node_set_file) 159 | node_set_lines = f.read().splitlines() 160 | node_set_lines_split = [line.split(delimiter) for line in node_set_lines] 161 | f.close() 162 | node_sets = {node_set[0]:set(node_set[1:]) for node_set in node_set_lines_split} 163 | if verbose: 164 | print 'Node cohorts loaded:', node_set_file 165 | return node_sets -------------------------------------------------------------------------------- /network_evaluation_tools/gene_conversion_tools.py: -------------------------------------------------------------------------------- 1 | ################################################################ 2 | # ---------- Network Gene Name Conversion Functions ---------- # 3 | ################################################################ 4 | import requests 5 | import re 6 | import time 7 | import pandas as pd 8 | 9 | # Determine whether an id should be excluded because it is not a valid gene name (contains parentheses, quotations, or whitespace) 10 | def exclude_id(name, bad_prefixes=None): 11 | excluded_id_regex = re.compile('[(),\'\"\s\/\|\.<>]+') 12 | # Remove genes that may also have prefixes that we do not want (e.g. 
CHEBI) 13 | if bad_prefixes: 14 | for prefix in bad_prefixes: 15 | if name.startswith(prefix): 16 | return True 17 | return excluded_id_regex.search(name) 18 | 19 | # Remove the naming system prefix, if there is one 20 | def get_identifier_without_prefix(string): 21 | elements = string.split(':') 22 | length = len(elements) 23 | if length == 2: 24 | return str(elements[1]) 25 | elif length > 2: 26 | return None 27 | else: 28 | return string 29 | 30 | # Construct string for batch query to MyGene.Info v3.0.0 API 31 | def query_constructor(gene_list, exclude_prefixes=None, print_invalid_genes=False): 32 | # Find genes that are valid and return only gene identifiers 33 | valid_query_genes = [get_identifier_without_prefix(gene) for gene in gene_list if exclude_id(gene, exclude_prefixes)==None] 34 | # Find all genes that have invalid names 35 | invalid_query_genes = [gene for gene in gene_list if exclude_id(gene, exclude_prefixes)!=None] 36 | print len(valid_query_genes), "Valid Query Genes" 37 | if print_invalid_genes: 38 | print len(invalid_query_genes), "Invalid Query Genes:" 39 | print invalid_query_genes 40 | else: 41 | print len(invalid_query_genes), "Invalid Query Genes" 42 | query_string = ' '.join(valid_query_genes) # Build string of names to input into MyGene.Info 43 | return query_string, valid_query_genes, invalid_query_genes 44 | 45 | # Function for posting batch query to MyGene.info v3.0.0 API 46 | def query_batch(query_string, tax_id='9606', scopes="symbol, entrezgene, alias, uniprot", fields="symbol, entrezgene"): 47 | query_split = query_string.split(' ') 48 | query_n = len(query_split) 49 | query_time = time.time() 50 | if query_n <=1000: 51 | data = {'species': tax_id, # Human Only 52 | 'scopes': scopes, # Default symbol, entrez, alias, uniprot. Alias often returns more genes than needed, return only highest scoring genes 53 | 'fields': fields, # Which gene name spaces to convert to 54 | 'q': query_string} 55 | res = requests.post('http://mygene.info/v3/query', data) 56 | json = res.json() 57 | else: 58 | # If the query is too long, we will need to break it up into chunks of 1000 query genes (MyGene.info cap) 59 | if query_n % 1000 == 0: 60 | chunks = query_n / 1000 61 | else: 62 | chunks = (query_n / 1000) + 1 63 | query_chunks = [] 64 | for i in range(chunks): 65 | start_i, end_i = i*1000, (i+1)*1000 66 | query_chunks.append(' '.join(query_split[start_i:end_i])) 67 | json = [] 68 | for chunk in query_chunks: 69 | data = {'species': tax_id, # Human Only 70 | 'scopes': scopes, # Default symbol, entrez, alias, uniprot. 
Alias often returns more genes than needed, return only highest scoring genes 71 | 'fields': fields, # Which gene name spaces to convert to 72 | 'q': chunk} 73 | res = requests.post('http://mygene.info/v3/query', data) 74 | json = json+res.json() 75 | print len(json), 'Matched query results' 76 | print 'Batch query complete:', round(time.time()-query_time,2), 'seconds' 77 | return json 78 | 79 | # Construct matched queries maps 80 | def construct_query_map_table(query_result, query_genes, display_unmatched_queries=False): 81 | construction_time = time.time() 82 | # Construct DataFrame of matched queries (only keep the results for each query where both symbol and entrez id were mapped) 83 | matched_data, matched_genes=[], [] 84 | for match in query_result: 85 | if match.get('entrezgene') and match.get('symbol'): 86 | matched_data.append([match.get('query'), match.get('_score'), match.get('symbol'), str(match.get('entrezgene'))]) 87 | matched_genes.append(match.get('query')) 88 | # Add all other partial mappings or non-mappings to the list 89 | partial_match_genes = [gene for gene in query_genes if gene not in matched_genes] 90 | partial_match_results = [] 91 | for match in query_result: 92 | if match.get('query') in partial_match_genes: 93 | partial_match_results.append(match) 94 | if match.get('entrezgene'): # If there is an entrez gene, we want it in string form; otherwise we want None 95 | matched_data.append([match.get('query'), match.get('_score'), match.get('symbol'), str(match.get('entrezgene'))]) 96 | else: 97 | matched_data.append([match.get('query'), match.get('_score'), match.get('symbol'), match.get('entrezgene')]) 98 | print 'Queries without full matching results found:', len(partial_match_results) 99 | if display_unmatched_queries: 100 | for entry in partial_match_results: 101 | print entry 102 | # Convert matched data list into data frame table 103 | match_table = pd.DataFrame(data=matched_data, columns=['Query','Score','Symbol','EntrezID']) 104 | match_table = match_table.set_index('Query') 105 | # Some genes will be matched in duplicates (due to alias mapping, generally the highest scoring matches will be correct) 106 | # Therefore we remove duplicate mappings to create 1-to-1 mappings for query to genes. 
107 | duplicate_matched_genes = [] 108 | for gene in query_genes: 109 | if type(match_table.ix[gene])==pd.DataFrame: 110 | duplicate_matched_genes.append(gene) 111 | print 112 | print len(duplicate_matched_genes), "Queries with multiple matches found" 113 | # Construct mapping table of genes with only one full result 114 | single_match_genes = [gene for gene in query_genes if gene not in duplicate_matched_genes] 115 | match_table_single = match_table.ix[single_match_genes] 116 | # Keep matches of queries matched only once if there are duplicate matches for genes 117 | if len(duplicate_matched_genes) > 0: 118 | # Keep maximum scored matches of queries matched more than once 119 | max_score_matches=[] 120 | for gene in duplicate_matched_genes: 121 | matched_duplicates = match_table.ix[gene] 122 | max_score = max(matched_duplicates['Score']) 123 | max_score_matches.append(matched_duplicates[matched_duplicates['Score']==max_score]) 124 | match_table_duplicate_max = pd.concat(max_score_matches) 125 | # Construct Query maps for symbol and entrez 126 | match_table_trim = pd.concat([match_table_single, match_table_duplicate_max]) 127 | else: 128 | match_table_trim = match_table_single.copy(deep=True) 129 | # Construct query map dictionaries 130 | query_to_symbol = match_table_trim['Symbol'].to_dict() 131 | query_to_entrez = match_table_trim['EntrezID'].to_dict() 132 | print 133 | print 'Query mapping table/dictionary construction complete:', round(time.time()-construction_time,2), 'seconds' 134 | return match_table_trim, query_to_symbol, query_to_entrez 135 | 136 | # Filter edgelist to remove all genes that contain invalid query names 137 | # This function is only required if there are any invalid genes found by query_constructor() 138 | def filter_query_edgelist(query_edgelist, invalid_genes): 139 | edgelist_filt = [] 140 | count=0 141 | for edge in query_edgelist: 142 | if edge[0] in invalid_genes or edge[1] in invalid_genes: 143 | count+=1 144 | else: 145 | edgelist_filt.append(edge) 146 | print count, '/', len(query_edgelist), 'edges with invalid nodes removed' 147 | return edgelist_filt 148 | 149 | # Convert network edge lists 150 | # Third column is for weights if desired to pass weights forward 151 | def convert_edgelist(query_edgelist, gene_map, weighted=False): 152 | if weighted: 153 | return [sorted([gene_map[edge[0]],gene_map[edge[1]]])+[edge[2]] for edge in query_edgelist] 154 | else: 155 | return [sorted([gene_map[edge[0]],gene_map[edge[1]]]) for edge in query_edgelist] 156 | 157 | # Sometimes each node needs to be converted by its best match if there are multiple names per node 158 | # This function uses the match_table constructed earlier to convert genes to either symbol or entrez format only 159 | def convert_custom_namelist(names, field, match_table): 160 | # Keep only mappings defined for field of interest 161 | if field=='symbol': 162 | # Return match table values that have matched symbol 163 | conversion = match_table.ix[names][~(match_table.ix[names]['Symbol'].isnull())] 164 | if conversion.shape[0]==0: 165 | return None 166 | else: 167 | # Return conversion with max score or None if no conversion 168 | max_score = conversion['Score'].max() 169 | return conversion[conversion['Score']==max_score].ix[0]['Symbol'] 170 | elif field=='entrez': 171 | # Return match table values that have matched entrez ID 172 | conversion = match_table.ix[names][~(match_table.ix[names]['EntrezID'].isnull())] 173 | if conversion.shape[0]==0: 174 | return None 175 | else: 176 | # Return conversion with 
max score or None if no conversion 177 | max_score = conversion['Score'].max() 178 | return conversion[conversion['Score']==max_score].ix[0]['EntrezID'] 179 | 180 | # Filter converted edge lists 181 | def filter_converted_edgelist(edgelist, remove_self_edges=True, weighted=False): 182 | filter_time = time.time() 183 | print len(edgelist),'input edges' 184 | # Remove self-edges 185 | if remove_self_edges: 186 | edgelist_filt1 = [edge for edge in edgelist if edge[0]!=edge[1]] 187 | print len(edgelist)-len(edgelist_filt1), 'self-edges removed' 188 | else: 189 | edgelist_filt1 = edgelist 190 | print 'Self-edges not removed' 191 | if weighted: 192 | # Remove edges where one or both nodes are "None" 193 | edgelist_filt2 = pd.DataFrame(data=edgelist_filt1).dropna().values.tolist() 194 | print len(edgelist_filt1)-len(edgelist_filt2), 'edges with un-mapped genes removed' 195 | # Remove duplicates by keeping the max score 196 | edgelist_filt3_scoremap = {} 197 | for edge in edgelist_filt2: 198 | if edge[0]+'+'+edge[1] not in edgelist_filt3_scoremap: 199 | edgelist_filt3_scoremap[edge[0]+'+'+edge[1]] = edge[2] 200 | else: 201 | edgelist_filt3_scoremap[edge[0]+'+'+edge[1]] = max(edgelist_filt3_scoremap[edge[0]+'+'+edge[1]], edge[2]) 202 | # Convert dictionary of scores to list 203 | edgelist_filt3 = [] 204 | for edge in edgelist_filt3_scoremap: 205 | edgelist_filt3.append(edge.split('+')+[edgelist_filt3_scoremap[edge]]) 206 | print len(edgelist_filt2)-len(edgelist_filt3), 'duplicate edges removed' 207 | else: 208 | # Remove edges where one or both nodes are "None" 209 | edgelist_filt2 = pd.DataFrame(data=edgelist_filt1).dropna() 210 | print len(edgelist_filt1)-edgelist_filt2.shape[0], 'edges with un-mapped genes removed' 211 | # Remove duplicate edges 212 | edgelist_filt3 = edgelist_filt2.drop_duplicates().values.tolist() 213 | print edgelist_filt2.shape[0]-len(edgelist_filt3), 'duplicate edges removed' 214 | print 'Edge list filtered:',round(time.time()-filter_time,2),'seconds' 215 | print len(edgelist_filt3), 'Edges remaining' 216 | return edgelist_filt3 217 | 218 | # Write edgelist to file 219 | def write_edgelist(edgelist, output_file, delimiter='\t', binary=True): 220 | write_time=time.time() 221 | f = open(output_file,'w') 222 | for edge in edgelist: 223 | if binary: 224 | f.write(delimiter.join([edge[0], edge[1]])+'\n') 225 | else: 226 | f.write(delimiter.join([str(val) for val in edge])+'\n') 227 | f.close() 228 | print 'Edge list saved:', round(time.time()-write_time,2),'seconds' 229 | -------------------------------------------------------------------------------- /network_evaluation_tools/miscellaneous_functions.py: -------------------------------------------------------------------------------- 1 | import time 2 | import pandas as pd 3 | import numpy as np 4 | import data_import_tools as dit 5 | import network_propagation as prop 6 | import network_evaluation_functions as nef 7 | from multiprocessing import Pool 8 | import pickle as p 9 | import random 10 | ################################################################################ 11 | # ---------- Additional Node Set-Based Network Evaluation Functions ---------- # 12 | ################################################################################ 13 | 14 | # Calculate confusion matrix (true positives, false negatives, false positives, true negatives) of node set recovery for given node set 15 | # The confusion matrix for every position on every AUPRC curve is returned/stored 16 | def calculate_confusion_matrix_serial(prop_geno, p, n, 
node_set_name, node_set, verbose=False): 17 | runtime = time.time() 18 | intersect = [nodes for nodes in node_set if nodes in prop_geno.index] 19 | confusion_matrices = {} 20 | sample_size = int(round(p*len(intersect))) 21 | for i in range(n): # Number of times to run the sampling 22 | sample = random.sample(intersect, sample_size) # get node set sample 23 | intersect_non_sample = [node for node in intersect if node not in sample] # nodes in intersect not in sample 24 | prop_geno_non_sample = list(prop_geno.index[~prop_geno.index.isin(sample)]) # nodes in network not in sample 25 | prop_geno_sample_sum = prop_geno.ix[sample][prop_geno_non_sample].sum().sort_values(ascending=False) # summed prop value for all nodes 26 | y_actual = pd.Series(0, index=prop_geno_sample_sum.index, dtype=int) # nodes sorted by mean prop value 27 | y_actual.ix[intersect_non_sample]+=1 # which nodes in sorted list are in intersect_non_sample 28 | intersect_non_sample_sorted = y_actual[y_actual==1].index # intersect_non_sample sorted 29 | confusion_matrix = {'TP':[], 'FN':[], 'FP':[], 'TN':[]} # initialize true positive, false negative, false positive, true negative lists 30 | for node in intersect_non_sample_sorted: # Slide down sorted nodes by summed prop value by nodes that are in intersect_non_sample 31 | TP, FN = sum(y_actual.ix[:node]), sum(y_actual.ix[node:])-1 # Calculate true positives and false negatives found at this point in list 32 | FP, TN = len(y_actual.ix[:node])-TP, len(y_actual.ix[node:])-1-FN # Calculate false positives and true negatives found at this point in list 33 | confusion_matrix['TP'].append(TP) 34 | confusion_matrix['FN'].append(FN) 35 | confusion_matrix['FP'].append(FP) 36 | confusion_matrix['TN'].append(TN) 37 | confusion_matrices[i]=confusion_matrix 38 | if verbose: 39 | print 'Confusion matrices calculated for node set', node_set_name, 'complete.', repr(len(intersect))+' nodes in network,', round(time.time()-runtime, 2), 'seconds.' 
40 | return confusion_matrices 41 | 42 | # Calculate confusion matrix (true positives, false negatives, false positives, true negatives) of node set recovery for given node set 43 | # This parallel version mirrors the serial implementation; the only differences are that the node set name is passed in with the other parameters and that prop_geno is set as a global variable 44 | # The confusion matrix for every position on every AUPRC curve is returned/stored 45 | def calculate_confusion_matrix_parallel(node_set_params): 46 | node_set_name, node_set, p, n, verbose = node_set_params[0], node_set_params[1], node_set_params[2], node_set_params[3], node_set_params[4] 47 | runtime = time.time() 48 | intersect = [nodes for nodes in node_set if nodes in prop_geno.index] 49 | confusion_matrices = {} 50 | sample_size = int(round(p*len(intersect))) 51 | for i in range(n): # Number of times to run the sampling 52 | sample = random.sample(intersect, sample_size) # get node set sample 53 | intersect_non_sample = [node for node in intersect if node not in sample] # nodes in intersect not in sample 54 | prop_geno_non_sample = list(prop_geno.index[~prop_geno.index.isin(sample)]) # nodes in network not in sample 55 | prop_geno_sample_sum = prop_geno.ix[sample][prop_geno_non_sample].sum().sort_values(ascending=False) # summed prop value for all nodes 56 | y_actual = pd.Series(0, index=prop_geno_sample_sum.index, dtype=int) # nodes sorted by mean prop value 57 | y_actual.ix[intersect_non_sample]+=1 # which nodes in sorted list are in intersect_non_sample 58 | intersect_non_sample_sorted = y_actual[y_actual==1].index # intersect_non_sample sorted 59 | confusion_matrix = {'TP':[], 'FN':[], 'FP':[], 'TN':[]} # initialize true positive, false negative, false positive, true negative lists 60 | for node in intersect_non_sample_sorted: # Slide down sorted nodes by summed prop value by nodes that are in intersect_non_sample 61 | TP, FN = sum(y_actual.ix[:node]), sum(y_actual.ix[node:])-1 # Calculate true positives and false negatives found at this point in list 62 | FP, TN = len(y_actual.ix[:node])-TP, len(y_actual.ix[node:])-1-FN # Calculate false positives and true negatives found at this point in list 63 | confusion_matrix['TP'].append(TP) 64 | confusion_matrix['FN'].append(FN) 65 | confusion_matrix['FP'].append(FP) 66 | confusion_matrix['TN'].append(TN) 67 | confusion_matrices[i]=confusion_matrix 68 | if verbose: 69 | print 'Confusion matrices calculated for node set', node_set_name, 'complete.', repr(len(intersect))+' nodes in network,', round(time.time()-runtime, 2), 'seconds.' 
70 | return [node_set_name, confusion_matrices] 71 | 72 | # Wrapper for calculating the confusion matrices for input node set file and network (has parallel option) 73 | # Not run for null network shuffles 74 | def confusion_matrix_construction_wrapper(network_file, node_set_file, sample_p, sub_sample_iterations, 75 | alpha=None, m=-0.17190024, b=0.7674828, net_delim='\t', set_delim='\t', cores=1, verbose=False, save_path=None): 76 | starttime = time.time() 77 | # Load network 78 | network = dit.load_network_file(network_file, delimiter=net_delim, verbose=verbose) 79 | # Load node set 80 | node_sets = dit.load_node_sets(node_set_file, delimiter=set_delim, verbose=verbose) 81 | # Calculate network influence matrix 82 | prop_net = nef.construct_prop_kernel(network, alpha=alpha, m=m, b=b) 83 | # Calculate confusion matrix values for each node set 84 | if cores == 1: 85 | # Calculate confusion matrix values for node sets one at a time 86 | node_set_conf_mat = {node_set:nef.calculate_confusion_matrix_serial(prop_net, sample_p, sub_sample_iterations, node_set, node_sets[node_set], verbose=verbose) for node_set in node_sets} 87 | else: 88 | # Initialize multiple threads for confusion matrix analysis of multiple node sets 89 | initializer_args = [prop_net] 90 | pool = Pool(cores, nef.parallel_analysis_initializer, initializer_args) 91 | # Construct parameter list to be passed 92 | conf_mat_Analysis_params = [[node_set, node_sets[node_set], sample_p, sub_sample_iterations, verbose] for node_set in node_sets] 93 | # Run the confusion matrix analysis for each geneset 94 | conf_mat_results = pool.map(nef.calculate_confusion_matrix_parallel, conf_mat_Analysis_params) 95 | # Construct confusion matrix results dictionary 96 | node_set_conf_mat = {result[0]:result[1] for result in conf_mat_results} 97 | if save_path is None: 98 | if verbose: 99 | print 'Network confusion matrix values calculated:', round(time.time()-starttime, 2), 'seconds' 100 | return node_set_conf_mat 101 | else: 102 | p.dump(node_set_conf_mat, open(save_path, 'wb')) 103 | if verbose: 104 | print 'Network confusion matrix values calculated:', round(time.time()-starttime, 2), 'seconds' 105 | return node_set_conf_mat 106 | 107 | # Use confusion matrix results to calculate odds ratio, risk ratio, accuracy or precision at a given recall threshold 108 | def confusion_matrix_analysis(confusion_matrix_input, calculation, recall_threshold=0.9, verbose=False, save_path=None): 109 | runtime = time.time() 110 | # Load confusion matrix data 111 | if type(confusion_matrix_input)!=dict: 112 | confusion_matrix = p.load(open(confusion_matrix_input, 'rb')) 113 | else: 114 | confusion_matrix = confusion_matrix_input 115 | 116 | # Calculate average and variance of specified calculation 117 | cohort_calculated_values_mean, cohort_calculated_values_var = {}, {} 118 | # For each cohort tested 119 | for cohort in confusion_matrix: 120 | print cohort 121 | n = len(confusion_matrix[cohort]) 122 | calculation_values = [] 123 | # For all sub-sample iterations 124 | for i in range(n): 125 | # Find where recall >= recall threshold 126 | for j in range(len(confusion_matrix[cohort][i]['TP'])): 127 | TP = confusion_matrix[cohort][i]['TP'][j] 128 | FN = confusion_matrix[cohort][i]['FN'][j] 129 | recall = TP / float((TP+FN)) 130 | if recall >= recall_threshold: 131 | FP = confusion_matrix[cohort][i]['FP'][j] 132 | TN = confusion_matrix[cohort][i]['TN'][j] 133 | if calculation=='OR': # Odds Ratio: OR = (TP/FP) / (FN/TN) 134 | calculation_values.append((float(TP)/FP) / 
(float(FN)/TN)) 135 | elif calculation=='RR': # Risk Ratio / Relative Risk: RR = (TP/(TP+FN)) / (FP/(FP+TN)) 136 | calculation_values.append((float(TP)/(TP+FN)) / (float(FP)/(FP+TN))) 137 | elif calculation=='accuracy': # accuracy = (TP + TN) / (TP + TN + FP + FN) 138 | calculation_values.append(float(TP+TN) / (TP+FN+FP+TN)) 139 | else: # precision = (TP) / (TP+FP) 140 | calculation_values.append(float(TP) / (TP+FP)) 141 | break 142 | # Calculate average and variance of value of interest across all iterations for given cohort 143 | cohort_calculated_values_mean[cohort] = np.mean(calculation_values) 144 | cohort_calculated_values_var[cohort] = np.var(calculation_values) 145 | # Return table of average/variance values for performance on all cohorts at given threshold 146 | cohort_calculated_values_table = pd.concat([pd.Series(cohort_calculated_values_mean, name='Average '+calculation), 147 | pd.Series(cohort_calculated_values_var, name=calculation+' Var')], axis=1) 148 | if save_path is None: 149 | if verbose: 150 | print calculation, 'calculation completed for all cohorts', round(time.time()-runtime, 2), 'seconds.' 151 | return cohort_calculated_values_table 152 | else: 153 | cohort_calculated_values_table.to_csv(save_path) 154 | if verbose: 155 | print calculation, 'calculation completed for all cohorts', round(time.time()-runtime, 2), 'seconds.' 156 | return cohort_calculated_values_table 157 | 158 | 159 | -------------------------------------------------------------------------------- /network_evaluation_tools/network_propagation.py: -------------------------------------------------------------------------------- 1 | ####################################################### 2 | # ---------- Network Propagation Functions ---------- # 3 | ####################################################### 4 | import networkx as nx 5 | import time 6 | import numpy as np 7 | import scipy 8 | import pandas as pd 9 | import copy 10 | 11 | # Normalize network (or network subgraph) for random walk propagation 12 | def normalize_network(network, symmetric_norm=False): 13 | adj_mat = nx.adjacency_matrix(network) 14 | adj_array = np.array(adj_mat.todense()) 15 | if symmetric_norm: 16 | D = np.diag(1/np.sqrt(sum(adj_array))) 17 | adj_array_norm = np.dot(np.dot(D, adj_array), D) 18 | else: 19 | degree_norm_array = np.diag(1/sum(adj_array).astype(float)) 20 | sparse_degree_norm_array = scipy.sparse.csr_matrix(degree_norm_array) 21 | adj_array_norm = sparse_degree_norm_array.dot(adj_mat).toarray() 22 | return adj_array_norm 23 | # Note about normalizing by degree: if we multiply by degree_norm_array first (D^-1 * A), then we do not need to return the 24 | # transposed adjacency array; it is already in the correct orientation 25 | 26 | # Calculate optimal propagation coefficient (updated model) 27 | def calculate_alpha(network, m=-0.02935302, b=0.74842057): 28 | log_edge_count = np.log10(len(network.edges())) 29 | alpha_val = round(m*log_edge_count+b,3) 30 | if alpha_val <=0: 31 | raise ValueError('Alpha <= 0 - Network Edge Count is too high') 32 | # There should never be a case where Alpha >= 1, as that would require a negative log edge count (i.e. a network with fewer than one edge) 33 | else: 34 | return alpha_val 35 | 36 | # Closed form random-walk propagation (as seen in HotNet2) for each subgraph: Ft = (1-alpha)*Fo * (I-alpha*norm_adj_mat)^-1 37 | # Concatenate to previous set of subgraphs 38 | def fast_random_walk(alpha, binary_mat, subgraph_norm, prop_data): 39 | term1=(1-alpha)*binary_mat 40 | term2=np.identity(binary_mat.shape[1])-alpha*subgraph_norm 41 | 
term2_inv = np.linalg.inv(term2) 42 | subgraph_prop = np.dot(term1, term2_inv) 43 | return np.concatenate((prop_data, subgraph_prop), axis=1) 44 | 45 | # Wrapper for random walk propagation of full network by subgraphs 46 | def closed_form_network_propagation(network, binary_matrix, network_alpha, symmetric_norm=False, verbose=False, save_path=None): 47 | starttime=time.time() 48 | if verbose: 49 | print 'Alpha:', network_alpha 50 | # Separate network into connected components and calculate propagation values of each sub-sample on each connected component 51 | subgraphs = list(nx.connected_component_subgraphs(network)) 52 | # Initialize propagation results by propagating first subgraph 53 | subgraph = subgraphs[0] 54 | subgraph_nodes = list(subgraph.nodes) 55 | prop_data_node_order = list(subgraph_nodes) 56 | binary_matrix_filt = np.array(binary_matrix.T.ix[subgraph_nodes].fillna(0).T) 57 | subgraph_norm = normalize_network(subgraph, symmetric_norm=symmetric_norm) 58 | prop_data_empty = np.zeros((binary_matrix_filt.shape[0], 1)) 59 | prop_data = fast_random_walk(network_alpha, binary_matrix_filt, subgraph_norm, prop_data_empty) 60 | # Get propagated results for remaining subgraphs 61 | for subgraph in subgraphs[1:]: 62 | subgraph_nodes = list(subgraph.nodes) 63 | prop_data_node_order = prop_data_node_order + subgraph_nodes 64 | binary_matrix_filt = np.array(binary_matrix.T.ix[subgraph_nodes].fillna(0).T) 65 | subgraph_norm = normalize_network(subgraph, symmetric_norm=symmetric_norm) 66 | prop_data = fast_random_walk(network_alpha, binary_matrix_filt, subgraph_norm, prop_data) 67 | # Return propagated result as dataframe 68 | prop_data_df = pd.DataFrame(data=prop_data[:,1:], index = binary_matrix.index, columns=prop_data_node_order) 69 | if save_path is None: 70 | if verbose: 71 | print 'Network Propagation Complete:', time.time()-starttime, 'seconds' 72 | return prop_data_df 73 | else: 74 | prop_data_df.to_csv(save_path) 75 | if verbose: 76 | print 'Network Propagation Complete:', time.time()-starttime, 'seconds' 77 | return prop_data_df 78 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Setup module adapted from setuptools code. See: 3 | https://packaging.python.org/en/latest/distributing.html 4 | https://github.com/pypa/sampleproject 5 | """ 6 | 7 | # Always prefer setuptools over distutils 8 | from setuptools import setup, find_packages 9 | 10 | setup( 11 | name='network_evaluation_tools', 12 | version='1.0.2', 13 | description='Module to perform patient and molecular network evaluation as described in Huang and Carlin, et al. 2018', 14 | url='https://github.com/idekerlab/Network_Evaluation_Tools', 15 | author='Justin Huang', 16 | author_email='jkh013@ucsd.edu', 17 | license='MIT', 18 | classifiers=[ 19 | 'Development Status :: 5 - Production/Stable', 20 | 'Intended Audience :: Science/Research', 21 | 'Topic :: Software Development :: Build Tools', 22 | 'License :: OSI Approved :: MIT License', 23 | 'Programming Language :: Python :: 2.7' 24 | ], 25 | packages=find_packages(exclude=['copy', 'itertools', 'os', 're', 'time']), 26 | install_requires=[ 27 | 'argparse>=1.1', 28 | 'networkx>=2.1', 29 | 'numpy>=1.11.0', 30 | 'matplotlib>=1.5.1', 31 | 'pandas>=0.19.0', 32 | 'requests>=2.13.0', 33 | 'scipy>=0.17.0', 34 | 'scikit-learn>=0.17.1', 35 | 'seaborn>=0.7.1'] 36 | ) --------------------------------------------------------------------------------