├── .gitignore
├── Data
├── Database_Citations.xlsx
├── DisGeNET_genesets.txt
├── DisGeNET_genesets_AUPRCs.csv
├── DisGeNET_genesets_Effect_Size.csv
├── GWAS_Catalog_genesets.txt
├── Oncogenic_Components_genesets.txt
├── Oncogenic_genesets_AUPRCs.csv
└── Oncogenic_genesets_Effect_Size.csv
├── LICENSE.txt
├── Network Evaluation Examples
├── Network Evaluation Example.ipynb
└── run_network_evaluation.py
├── Network Processing Notebooks
├── BIND Processing.ipynb
├── BioGRID Processing.ipynb
├── BioPlex Processing.ipynb
├── ConsensusPathDB Processing.ipynb
├── DIP Processing.ipynb
├── Degree-Preserved Network Shufflings.ipynb
├── GIANT Processing.ipynb
├── GeneMANIA Processing.ipynb
├── HINT Processing.ipynb
├── HPRD Processing.ipynb
├── HumanInteractome Processing.ipynb
├── HumanNet Processing.ipynb
├── InBioMap Processing.ipynb
├── IntAct Processing.ipynb
├── Mentha Processing.ipynb
├── MultiNet Processing.ipynb
├── PID Processing.ipynb
├── Pathway Commons Processing.ipynb
├── Reactome Processing.ipynb
├── Reactome-FIs Processing.ipynb
├── STRING Processing.ipynb
└── iRefIndex Processing.ipynb
├── README.md
├── network_evaluation_tools
├── .ipynb_checkpoints
│ ├── PSN Construction-checkpoint.ipynb
│ └── SBNE Method-checkpoint.ipynb
├── __init__.py
├── data_import_tools.py
├── gene_conversion_tools.py
├── miscellaneous_functions.py
├── network_evaluation_functions.py
└── network_propagation.py
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | *.pyc
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | .ipynb_checkpoints/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .coverage
43 | .coverage.*
44 | .cache
45 | nosetests.xml
46 | coverage.xml
47 | *.cover
48 | .hypothesis/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 |
58 | # Flask stuff:
59 | instance/
60 | .webassets-cache
61 |
62 | # Scrapy stuff:
63 | .scrapy
64 |
65 | # Sphinx documentation
66 | docs/_build/
67 |
68 | # PyBuilder
69 | target/
70 |
71 | # Jupyter Notebook
72 | .ipynb_checkpoints
73 |
74 | # pyenv
75 | .python-version
76 |
77 | # celery beat schedule file
78 | celerybeat-schedule
79 |
80 | # SageMath parsed files
81 | *.sage.py
82 |
83 | # Environments
84 | .env
85 | .venv
86 | env/
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
--------------------------------------------------------------------------------
/Data/Database_Citations.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idekerlab/Network_Evaluation_Tools/4c0017e3cc3fa7767f5172cea76b4f3f7d8d0b0b/Data/Database_Citations.xlsx
--------------------------------------------------------------------------------
/Data/DisGeNET_genesets_AUPRCs.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idekerlab/Network_Evaluation_Tools/4c0017e3cc3fa7767f5172cea76b4f3f7d8d0b0b/Data/DisGeNET_genesets_AUPRCs.csv
--------------------------------------------------------------------------------
/Data/DisGeNET_genesets_Effect_Size.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idekerlab/Network_Evaluation_Tools/4c0017e3cc3fa7767f5172cea76b4f3f7d8d0b0b/Data/DisGeNET_genesets_Effect_Size.csv
--------------------------------------------------------------------------------
/Data/Oncogenic_Components_genesets.txt:
--------------------------------------------------------------------------------
1 | C1: ERBB3 / PI3K ESRP1 PRSS8 TMEM125 GRHL2 RP11-388M20.2 RP11-354M1.2 CDH1 C1orf210 CRB3 ESRP2 GALNT3 ERBB3 TC2N CCDC64B RP11-429J17.6 TMC4 CDH3 MARVELD3 OVOL2 EPS8L1 CDS1 CDC42BPG PVRL4 ATP2C2 LSR LLGL2 MAP7 SPINT2 DSP GRB7 C19orf21 EPS8L2 C6orf132 F11R SH2D3A RP11-615I2.2 GRHL1 GPR56 CHMP4C SLC44A2 RHOD PRRG2 RP11-22C11.2 ARHGEF16 RGL3 SIGIRR TMEM184A RNF223 AIF1L MYO6 HOOK2 MYO5B ARHGEF35 CNKSR1 MARVELD2 SMPDL3B HOOK1 TTC9 ARHGEF5 CXCL16 ATP8B1 CST6 SYT7 RP4-798C17.5 COBL TPD52L1 RGL2 PRRG4 LITAF
2 | C2: MYC / E2F TEAD2 SMO FBXO2 PACSIN3 AC008738.2 CEBPA PFAS CENPV CTSL2 FKBP10 IL27RA CTSL1 TRAP1 FBLN1 VIM RP11-124N14.3 ETV4 GEMIN5 TRIM65 RP11-40H20.2 TGFB1 NOB1 CTD-2033A16.3 EXOSC2 LIX1L PPAT RPIA AC006111.1 LYAR AMPD2 ERCC1 CDKN2AIPNL CTD-2165H16.1 C20orf20 CCDC85B RP4-765A10.1 RCOR2 RNMTL1 RSAD1 PLOD1 CDCA5 LEPREL2 GNL3 CACYBP NOP56 EIF3E FAM216A CD320 EEF1A1P19 CAP2
3 | C3: RAS / WNT / PI3K PRSS3 RP11-133O22.6 SLC17A9 ALDH2 CDX2 ETV4 CENPV SH3BGRL2 DUSP6 HOXA10 ABLIM1 STEAP1 HNF1B PHLDA1 RP11-867G2.8 DSG2 C1orf106 SMAGP C19orf21 SLC27A2 SPRY2 CD320 COL17A1 TIMP1 ERBB3 DSP SGPP2 EFHD2 ANXA3 SYK LSR NOB1 HMGA1 IL18 LLGL2 GLYCTK CHDH EEF1A1P5 SLC25A6 EPB49 EEF1A1 EEF1A1P6 RPL4 YBX1 RPL6 CRYL1 RPS24 RPL5 CTD-2033A16.3 EIF3E
4 | C4: EMT COL5A1 SPARC CDH11 CDH13 CCDC80 LTBP1 PCOLCE DKK3 TBX3 C1S KCNMA1 NEXN LEPREL2 ANPEP HEG1 RP11-443A13.3 COL16A1 ENG CNRIP1 GAS6 RP13-530H6.2 ADAMTSL1 EFEMP1 SRPX CD99 PALLD IGFBP4 IFFO1 ITGA5 SLC44A2 GNG11 VIM RP11-124N14.3 GPC1 TSPAN5 DPYSL3 FKBP10 C1orf198 MAN1A1 ATP8B1 CAP2 RCAN1 NDST1 PLOD1 EEF1A1P5 RAB13 RP11-342D11.2 EEF1A1 TIMP1 DDAH1
5 | C5: HNF1 / PAX8 CLDN1 EPS8L2 PAX8 LEPREL1 HNF1B ANXA3 DSG2 IL18 GNG11 WWC1 F11R LIMCH1 ELOVL7 CHMP4C ARHGEF5 TMEM56 GNAI1 PTPRJ BCAM RP11-124N14.3 STXBP2 VIM RP4-798C17.5 GPX8 ARHGEF35 LITAF SPINT2 HSPG2 LSR RIPK4 RHPN2 DSP PHLDB2 EPB49 PDGFB NXN LEPROT BAIAP2L1 PLCB4 RP11-54F2.1 RP11-342D11.2 CCDC80 ABLIM1 CELSR1 CTSL1 TPD52L1 PALLD KIAA1598 NDST1 UBE2H
6 | C6: BRAF / MAPK SRPX PLAT TNFRSF19 SPARC MITF ERBB3 SPRY2 DUSP6 GPR56 RENBP GNG11 VIM RP11-124N14.3 ETV4 PHLDA1 ST6GALNAC2 ENG NES SPRY4 AGPAT9 PHLDA3 TIMP1 CTSL1 RCAN1 PYGB FKBP10 NFATC2 IFFO1 PLOD1 RIPK4 EEF1A1P5 EEF1A1 UBL3 YBX1 EEF1A1P6 RPL4 SLC20A1 RPL6 CHST11 SLC6A15 VAT1 SLC25A6 ENTPD6 RPL5 CD320 HMGB1 GLT25D1 SPRED1 SSH1 HMGA1
7 | C7: TNF / NF-kB NT5E CDCP1 PHLDA1 CALB2 STEAP1 NRP1 RP11-342D11.2 PLAT MT1E ELK3 ANTXR2 AGPAT9 IRAK2 LINC00460 TM4SF19 RP11-394J1.2 HPCAL1 TGFB1 PRDM8 STX1A HMGA2 TIMP1 FMNL1 RAB31 ITGA5 PDP1 HRH1 CHST11 IL31RA TMEM158 RP11-124N14.3 C11orf68 VIM IGFBP4 ETV4 EFHD2 DUSP6 AC138150.4 TSPAN5 SLC20A1 MAP4K4 CCDC85B WDR54 FUT8 ADAM19 DST GEM DPYSL3 IL18 PHLDB2 DNER DSG2 CMTM3 AC005035.1 ARSJ GFPT2 CTSL1 EFEMP1 TPBG HMGA1 CAPRIN2 LYAR STMN3 FOXL1 GPX8 STAMBPL1 STK10 ARAP3 SMAGP HJURP RP11-221N13.3 ANXA3 CTHRC1 ITPRIP FKBP10 CLDN1 TOMM34 SERPINA1 TRBC2
8 | C8: MYC KIF1A CHGB DPYSL3 SYP SYT1 STMN3 PKIA VANGL2 AP3B2 UNC13A CENPV MAPRE2 CTA-221G9.10 DLL1 CNTNAP2 JPH1 ELOVL2 TMEM145 STXBP1 RP11-122A3.2 RCOR2 DNAJC6 ZNF512 STX1A VGF RIC3 SLC6A15 RIMS2 AGPAT5 RAP2A SSBP3 CD320 RIMS3 RP11-158I13.2 CXXC1 TPD52 CCDC64 HOOK1 SYT7 WDR54 IVNS1ABP NOP56 EEF1A1P5 AC005035.1 YBX1 EEF1A1 RAB3B EEF1A1P6 AC012379.1 PRPF19 RPL6 MAP4K4 HMGB1 CDCA5 RPL4 TTL RPL5 ATP2A2 U2AF2 SLC25A6 TTLL7 IPO5 YBX1P1 GNAI1 CACYBP MCL1 RPS24 SLC44A5 TSPYL2 PIH1D1 TSHZ1 HMGA1 EIF3E RHBDD2 GSK3B GOLIM4 GNL3 CBLN1 TMTC4 KHDRBS3 NEURL1B SH3BGRL2 KATNB1 GART PEX5L EIF3H ALDH2 SCN3B HJURP PSD PPAT CTB-79E8.2 SLC20A1 POLR3GL METTL9 GAB2 AMOTL1 ARL2 DDRGK1 COPS8 PYGB MEST NELF AGPAT6 MFSD6 EXOSC2 KIAA1324 RTN2 DAP TOMM34 ID1 GLT25D1 VAT1 FAM216A SRD5A1 ACN9 E2F4 TRAP1 CDKN2AIPNL DBF4 RPIA CXADR
9 | C9: RAS / AP1 KRT17 KRT5 GPR87 DSC3 DSG3 FBLN1 COL17A1 CDH3 FAT2 RP11-615I2.2 AL391137.1 NXN LEPREL1 IL18 CLDN1 PPP1R14C EFEMP1 GPC1 RHOD CDH1 CTSL2 CCDC80 DSP VANGL2 ST6GALNAC2 PHLDA3 TMEM40 LY6D C10orf54 CTSH ANXA3 BCAM RP11-354M1.2 CXCL16 FGFR2 DSG2 CREG1 RIPK4 LIMK2 MMP28 ID1 LSR F11R LITAF CELSR2 DAB2IP PHLDB2 C1orf106 TPD52L1 GNAI1
10 |
--------------------------------------------------------------------------------
/Data/Oncogenic_genesets_AUPRCs.csv:
--------------------------------------------------------------------------------
1 | Oncogenic Component Gene Set,GeneMANIA,GIANT,STRING,ReactomeFI,Reactome,MultiNet,PathwayCommons,HumanNet,BioPlex,DIP,InBioMap,BioGRID,BIND,Mentha,IRefIndex,PID,HPRD,IntAct,ConsensusPathDB,HINT,HumanInteractome
2 | C1: ERBB3 / PI3K,79.504,88.463,45.321,9.31,9.144,13.522,8.321,25.382,29.247,-0.007,8.148,16.023,3.102,4.689,4.741,7.379,13.017,3.638,6.904,1.435,-0.254
3 | C2: MYC / E2F,12.144,4.593,10.196,1.583,32.015,2.582,1.592,-0.428,2.878,-1.108,0.699,0.985,-3.08,0.356,-0.487,-1.122,-0.851,0.172,-0.771,0.758,0.237
4 | C3: RAS / WNT / PI3K,13.709,24.005,3.724,4.536,6.139,1.996,6.138,1.109,0.347,8.047,0.94,0.942,9.901,0.587,0.301,0.252,2.324,1.861,-0.288,0.736,-0.622
5 | C4: EMT,46.876,56.378,49.475,8.366,12.124,6.63,14.062,16.828,6.057,7.183,7.538,6.861,3.157,5.922,3.089,0.478,2.729,3.528,6.462,1.059,-1.832
6 | C5: HNF1 / PAX8,18.51,12.137,3.211,6.21,5.811,14.762,3.296,4.714,17.87,2.503,6.311,2.412,33.288,3.325,3.186,44.088,1.666,0.784,5.496,3.214,0.698
7 | C6: BRAF / MAPK,33.463,37.398,8.971,7.753,0.6,5.81,11.364,2.604,9.481,8.875,0.752,2.887,-1.686,1.98,1.22,0.66,1.176,2.421,1.623,0.883,-0.98
8 | C7: TNF / NF-kB,25.602,75.85,3.472,5.288,5.415,0.043,0.039,4.025,0.666,4.193,1.091,-0.009,-0.242,0.091,2.078,-0.565,-0.538,0.483,-0.265,-0.332,2.302
9 | C8: MYC,19.938,12.769,14.662,7.637,7.606,5.71,7.618,0.515,0.809,0.992,4.678,1.226,-1.186,1.449,2.012,3.4,2.329,0.815,-0.345,2.297,1.027
10 | C9: RAS / AP1,63.402,75.232,6.942,15.589,13.236,18.125,6.703,19.884,3.203,29.165,6.068,7.148,294.849,4.349,6.704,10.111,6.48,1.973,5.423,0.892,3.703
--------------------------------------------------------------------------------
/Data/Oncogenic_genesets_Effect_Size.csv:
--------------------------------------------------------------------------------
1 | Oncogenic Component Gene Set,STRING,ConsensusPathDB,HumanNet,Reactome,ReactomeFI,GIANT,InBioMap,GeneMANIA,DIP,MultiNet,HINT,IRefIndex,PathwayCommons,HPRD,BioGRID,Mentha,IntAct,PID,BioPlex,BIND,HumanInteractome
2 | C1: ERBB3 / PI3K,0.5135262,0.058763726,0.692236671,0.122222216,0.15754273,6.684008471,0.082859901,4.067461582,-4.31E-05,0.189440939,0.016354416,0.064830058,0.068458882,0.273601605,0.545721195,0.051626723,0.031076848,0.118967686,0.994315007,0.005668391,-0.001785174
3 | C2: MYC / E2F,0.092846659,-0.024804717,-0.011586723,0.141646337,0.021769631,0.126131444,0.011438366,0.229788537,-0.007466792,0.042600429,0.024478392,-0.013013427,0.01547359,-0.016721966,0.021910887,0.011695219,0.004778901,-0.002736735,0.056399241,-0.01325828,0.000747868
4 | C3: RAS / WNT / PI3K,0.058378867,-0.015405315,0.050083469,0.116639188,0.130570541,0.423772198,0.024225851,0.353977705,0.163708126,0.030888897,0.017779446,0.009526628,0.302163126,0.058046682,0.019992612,0.017686172,0.041009049,0.003238783,0.005259451,0.094973291,-0.003334094
5 | C4: EMT,0.502137765,0.078045291,0.539404955,0.171974632,0.143326712,1.56712006,0.108628694,1.578435016,0.034923587,0.045392111,0.0082312,0.080228263,0.377918883,0.048341957,0.107700053,0.074111893,0.082371847,0.00104432,0.147561307,0.014931652,-0.001871211
6 | C5: HNF1 / PAX8,0.067260554,0.051132155,0.048942587,0.086221776,0.124037539,0.359179111,0.034975803,0.493826433,0.025733051,0.12134834,0.038769791,0.060915915,0.039247966,0.039383136,0.070949222,0.045455993,0.009638796,0.584454442,0.4182192,0.049113775,0.00191155
7 | C6: BRAF / MAPK,0.16734618,0.040467197,0.144514141,0.008246205,0.124472595,0.660173496,0.029069971,0.636674143,0.171066126,0.066039347,0.022348522,0.039525486,0.347191285,0.045675844,0.078249904,0.056334395,0.048530168,0.007730904,0.242063957,-0.033614766,-0.007001517
8 | C7: TNF / NF-kB,0.070352307,-0.003042142,0.078385588,0.113450311,0.064574197,1.614945003,0.014547852,0.874980732,0.150039047,0.000784241,-0.005916828,0.024739026,0.00103285,-0.026220378,-0.00014158,0.001885841,0.013316024,-0.004948463,0.014466807,-0.002359939,0.019704809
9 | C8: MYC,0.334557167,-0.016903927,0.02853045,0.282954697,0.306728046,0.350495141,0.138018638,0.407845466,0.06911551,0.226512037,0.059835179,0.090457074,0.298779819,0.11275682,0.058599939,0.050242058,0.029014753,0.12451458,0.034957168,-0.079153753,0.007960667
10 | C9: RAS / AP1,0.227817837,0.058713082,0.425458334,0.451898009,0.508070925,2.110367453,0.047765235,1.711314568,0.376062304,0.171777257,0.005529503,0.103466426,0.120951911,0.172031028,0.233638383,0.048789225,0.018321645,0.235847681,0.061694523,0.305124847,0.00876537
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) Idekerlab 2017
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 |
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 |
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/Network Evaluation Examples/run_network_evaluation.py:
--------------------------------------------------------------------------------
1 | ###################################################################
2 | # Command line script to analyze network on node sets of interest #
3 | ###################################################################
4 |
5 | from network_evaluation_tools import network_evaluation_functions as nef
6 | from network_evaluation_tools import data_import_tools as dit
7 | from network_evaluation_tools import gene_conversion_tools as gct
8 | import argparse
9 | import os
10 | import pandas as pd
11 |
12 | # Checking valid alpha and p values (Range is 0.0-1.0 exclusive)
13 | # Value can also be None.
def restricted_float(x):
    """Argparse type-checker: float strictly inside (0.0, 1.0), or None.

    None is passed through unchanged so optional arguments (e.g. --alpha,
    --sample_p) may default to None and be computed later from the network.
    Raises argparse.ArgumentTypeError when the value is out of range.
    """
    if x is None:
        return None
    value = float(x)
    if value <= 0.0 or value >= 1.0:
        raise argparse.ArgumentTypeError("%r not in range (0.0, 1.0) exclusive"%(value,))
    return value
20 |
21 | # Checking valid integer values (for all values that must be >0)
def positive_int(x):
    """Argparse type-checker: value must parse as an integer > 0.

    Raises argparse.ArgumentTypeError for zero or negative values.
    """
    value = int(x)
    if value > 0:
        return value
    raise argparse.ArgumentTypeError("%s must be a positive integer" % value)
27 |
28 | # Valid file path check (Does not check file formatting, but checks if given path exists and is readable)
def valid_infile(in_file):
    """Argparse type-checker: path must name an existing, readable file.

    Only existence and read permission are verified; the file's format
    is not inspected here.
    """
    if not os.path.isfile(in_file):
        raise argparse.ArgumentTypeError("{0} is not a valid input file path".format(in_file))
    if not os.access(in_file, os.R_OK):
        raise argparse.ArgumentTypeError("{0} is not a readable input file".format(in_file))
    return in_file
36 |
37 | # Valid output directory path check (Checks if the output directory path can be found and written to by removing given filename from full path)
38 | # Note: This uses '/' character for splitting pathnames on Linux and Mac OSX. The character may need to be changed to '\' for Windows executions
def valid_outfile(out_file):
    """Argparse type-checker: the directory part of out_file must exist and be writable.

    Only the directory is validated; the output file itself is created later.
    Fixes vs. the original '/'-split implementation:
      * os.path.dirname makes the check portable across path separators
        (the old code broke on Windows, as its own comment noted).
      * A bare filename (no directory component) is now validated against
        the current working directory instead of being rejected outright
        (dirname == '' previously failed the isdir check).
    """
    outdir = os.path.dirname(out_file) or '.'
    if not os.path.isdir(outdir):
        raise argparse.ArgumentTypeError("{0} is not a valid output directory".format(outdir))
    if not os.access(outdir, os.W_OK):
        raise argparse.ArgumentTypeError("{0} is not a writable output directory".format(outdir))
    return out_file
47 |
if __name__ == "__main__":
    # ------------------------------------------------------------------
    # Command-line interface
    # ------------------------------------------------------------------
    parser = argparse.ArgumentParser(description='Analyze network performance on ability to aggregate sets of nodes in network space.')
    # Required positional arguments
    parser.add_argument("network_path", type=valid_infile,
        help='Path to file of network to be evaluated. File must be 2-column edge list where each line is a gene interaction separated by a common delimiter.')
    parser.add_argument("node_sets_file", type=valid_infile,
        help='Path to file of node sets. Each line is a list, separated by a common delimiter. The first item in each line will be the name of the node set.')
    parser.add_argument("actual_AUPRCs_save_path", type=valid_outfile,
        help='CSV file path of network evaluation result scores (AUPRCs). This script minimally returns these values to save. Must have a writable directory.')
    # Optional evaluation parameters
    parser.add_argument('-v', '--verbose', default=False, action="store_true", required=False,
        help='Verbosity flag for reporting on patient similarity network construction steps.')
    parser.add_argument('-netd', '--net_file_delim', type=str, default='\t', required=False,
        help='Delimiter used in network file between columns. Default is tab white space.')
    parser.add_argument('-setd', '--set_file_delim', type=str, default='\t', required=False,
        help='Delimiter used in node set file to delimit lists. Default is tab white space.')
    # NOTE: help text fixed — the original adjacent-string literal ("set''s")
    # silently concatenated and displayed "sets p" with no apostrophe.
    parser.add_argument("-p", "--sample_p", type=restricted_float, default=None, required=False,
        help="Sub-sampling percentage for node sets of interest. Default is None. Each gene set's p is automatically determined by the network in this case.")
    parser.add_argument("-a", "--alpha", type=restricted_float, default=None, required=False,
        help='Propagation constant to use in the propagation of node sub-samples over given network. Overrides alpha calculation model if given.')
    parser.add_argument("-n", "--sub_sample_iter", type=positive_int, default=30, required=False,
        help='Number of times to perform sub-sampling during performance recovery (AUPRC) calculation for each node set. Default is 30.')
    parser.add_argument('-c', '--cores', type=positive_int, default=1, required=False,
        help='Number of cores to be utilized by machine for performance calculation step. NOTE: Each core must have enough memory to store at least network-sized square matrix and given node sets to perform calculations.')
    parser.add_argument('-bg', '--background', type=str, default='network', choices=['genesets', 'network'], required=False,
        help='Establishes the background gene set to calculate AUPRC over. Default is to use all genes in the network, can change to use only genes from the union of all gene sets tested (i.e. disease genes only).')

    # Network performance score calculations (with null networks)
    parser.add_argument("-i", "--null_iter", type=positive_int, default=30, required=False,
        help='Number of times to perform degree-preserved shuffling of network to construct performance value null distribution. Default is 30. If this value is >0, --null_AUPRCs_save_path will be required')
    parser.add_argument('-nno', '--null_network_outdir', type=valid_outfile, default=None, required=False,
        help='File directory to save null networks after generation.')
    parser.add_argument('-nsp', '--null_AUPRCs_save_path', type=valid_outfile, default=None, required=False,
        help='CSV file path of where to save null network evaluation results. Used in the calculation of network performance score and performance gain scores')
    parser.add_argument('-psp', '--performance_save_path', type=valid_outfile, default=None, required=False,
        help='CSV file path of where to save network evaluation results as z-scores.')
    parser.add_argument('-gsp', '--performance_gain_save_path', type=valid_outfile, default=None, required=False,
        help='CSV file path of where to save network evaluation results as gain in AUPRC over median null AUPRCs.')

    args = parser.parse_args()
    # If null networks need to be constructed, a file path must be given to
    # save either the null networks themselves or their performance results.
    if args.null_iter > 0:
        if (args.null_AUPRCs_save_path is None) and (args.null_network_outdir is None):
            parser.error('Save path required for null network edge lists or null network evaluation results.')

    ####################################
    ##### Network Evaluation Setup #####
    ####################################

    # Limit core usage (if defined)
    import mkl
    mkl.set_num_threads(args.cores)

    # Load Network
    network = dit.load_network_file(args.network_path, verbose=args.verbose)
    network_size = len(network.nodes())

    # Load Gene sets
    genesets = dit.load_node_sets(args.node_sets_file, verbose=args.verbose)

    # Calculate gene set sub-sample rate with network (if not set)
    if args.sample_p is None:
        genesets_p = nef.calculate_p(network, genesets)
    else:
        genesets_p = {geneset: args.sample_p for geneset in genesets}
    if args.verbose:
        # print() call form works identically on Python 2 and 3 for a single string
        print('Gene set sub-sample rates set')

    # Calculate network kernel (also determine propagation constant if not set)
    kernel = nef.construct_prop_kernel(network, alpha=args.alpha, verbose=True)

    # Change background gene list if needed
    if args.background == 'genesets':
        background_node_set = set()
        for geneset in genesets:
            background_node_set = background_node_set.union(genesets[geneset])
        background_nodes = list(background_node_set.intersection(set(kernel.index)))
    else:
        background_nodes = list(kernel.index)


    ############################################
    ##### Network Performance Calculations #####
    ############################################

    # Calculate AUPRC for each gene set on actual network (large networks are >=10k nodes)
    if network_size < 10000:
        actual_AUPRC_values = nef.small_network_AUPRC_wrapper(kernel, genesets, genesets_p, n=args.sub_sample_iter, cores=args.cores, bg=background_nodes, verbose=True)
    else:
        actual_AUPRC_values = nef.large_network_AUPRC_wrapper(kernel, genesets, genesets_p, n=args.sub_sample_iter, cores=args.cores, bg=background_nodes, verbose=True)

    # Save the actual network's AUPRC values
    actual_AUPRC_values.to_csv(args.actual_AUPRCs_save_path)


    #################################################
    ##### Null Network Performance Calculations #####
    #################################################

    # If number of null networks > 0:
    if args.null_iter > 0:
        null_AUPRCs = []
        for i in range(args.null_iter):
            # Construct null networks and calculate AUPRCs for each gene set on each null network
            shuffNet = nef.shuffle_network(network, max_tries_n=10, verbose=True)
            # Save null network if null network output directory is given
            if args.null_network_outdir is not None:
                shuffNet_edges = shuffNet.edges()
                gct.write_edgelist(shuffNet_edges, args.null_network_outdir+'shuffNet_'+repr(i+1)+'.txt',
                    delimiter='\t', binary=True)
            if args.verbose:
                # Single formatted string: the old multi-arg print() rendered
                # as a tuple under the Python 2 kernel this repo targets
                print('Shuffled Network %i written to file' % (i+1))
            # Construct null network kernel
            shuffNet_kernel = nef.construct_prop_kernel(shuffNet, alpha=args.alpha, verbose=False)
            # Calculate null network AUPRCs
            if network_size < 10000:
                shuffNet_AUPRCs = nef.small_network_AUPRC_wrapper(shuffNet_kernel, genesets, genesets_p, n=args.sub_sample_iter, cores=args.cores, bg=background_nodes, verbose=True)
            else:
                shuffNet_AUPRCs = nef.large_network_AUPRC_wrapper(shuffNet_kernel, genesets, genesets_p, n=args.sub_sample_iter, cores=args.cores, bg=background_nodes, verbose=True)
            null_AUPRCs.append(shuffNet_AUPRCs)
        # Construct table of null AUPRCs
        null_AUPRCs_table = pd.concat(null_AUPRCs, axis=1)
        null_AUPRCs_table.columns = ['shuffNet'+repr(i+1) for i in range(len(null_AUPRCs))]
        if args.verbose:
            print('All null network gene set AUPRCs calculated')
        # Save null network AUPRCs if save path is given
        if args.null_AUPRCs_save_path is not None:
            null_AUPRCs_table.to_csv(args.null_AUPRCs_save_path)
        # Calculate performance score for each gene set's AUPRC if performance score save path is given
        if args.performance_save_path is not None:
            network_performance = nef.calculate_network_performance_score(actual_AUPRC_values, null_AUPRCs_table, verbose=args.verbose)
            network_performance.to_csv(args.performance_save_path)
        # Calculate network performance gain over median null AUPRC if AUPRC performance gain save path is given
        if args.performance_gain_save_path is not None:
            network_perf_gain = nef.calculate_network_performance_gain(actual_AUPRC_values, null_AUPRCs_table, verbose=args.verbose)
            # BUG FIX: this previously wrote to args.performance_save_path,
            # silently overwriting the z-score results file
            network_perf_gain.to_csv(args.performance_gain_save_path)
184 |
--------------------------------------------------------------------------------
/Network Processing Notebooks/BIND Processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "from network_evaluation_tools import gene_conversion_tools as gct"
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 | "## Load BIND Raw Data\n",
20 | "#### Source: http://www.pathwaycommons.org/archives/PC2/v8/PathwayCommons.8.bind.BINARY_SIF.hgnc.txt.sif.gz\n",
21 | "Downloaded: June 15, 2017 \n",
22 | "Last Updated (via Pathway Commons v9 datasources.txt file): December 15, 2010 \n",
 23 |     "Note: For this processing, we used the data file provided in the PathwayCommons v8 distribution. The SIF file provided by Pathway Commons v9 at the given time only yields 13078 interactions, significantly fewer than the file provided by the v8 distribution. It is unclear where all of those interactions have gone for now, but at this time, we will be using the Pathway Commons v8 distribution of BIND.  \n",
24 | "Also note: The text file has more lines than the sif file in Pathway Commons. However, the text file has some interactions that are unclear how to resolve so for this case we will use the sif file provided by Pathway Commons"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 30,
30 | "metadata": {
31 | "collapsed": false
32 | },
33 | "outputs": [],
34 | "source": [
35 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n",
36 | "BIND_Raw = pd.read_csv(wd+'Network_Data_Raw/PathwayCommons.8.bind.BINARY_SIF.hgnc.txt.sif',sep='\\t', header=-1)"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 32,
42 | "metadata": {
43 | "collapsed": false,
44 | "scrolled": true
45 | },
46 | "outputs": [
47 | {
48 | "name": "stdout",
49 | "output_type": "stream",
50 | "text": [
51 | "Edges in BIND: 72780\n"
52 | ]
53 | }
54 | ],
55 | "source": [
56 | "# Convert table of interactions to edgelist (no scores given)\n",
57 | "# Also no gene symbol conversion necessary because network is given in symbol format already\n",
58 | "BIND_edgelist = BIND_Raw[[0, 2]].values.tolist()\n",
59 | "print 'Edges in BIND:', len(BIND_edgelist)"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 33,
65 | "metadata": {
66 | "collapsed": false
67 | },
68 | "outputs": [],
69 | "source": [
70 | "# Sort each edge representation for filtering\n",
71 | "BIND_edgelist_sorted = [sorted(edge) for edge in BIND_edgelist]"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 34,
77 | "metadata": {
78 | "collapsed": false
79 | },
80 | "outputs": [
81 | {
82 | "name": "stdout",
83 | "output_type": "stream",
84 | "text": [
85 | "72780 input edges\n",
86 | "0 self-edges removed\n",
87 | "0 edges with un-mapped genes removed\n",
88 | "0 duplicate edges removed\n",
89 | "Edge list filtered: 0.19 seconds\n",
90 | "72780 Edges remaining\n"
91 | ]
92 | }
93 | ],
94 | "source": [
95 | "# Filter edgelist for duplicate nodes and for self-edges\n",
96 | "BIND_edgelist_filt = gct.filter_converted_edgelist(BIND_edgelist_sorted)"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 35,
102 | "metadata": {
103 | "collapsed": false
104 | },
105 | "outputs": [
106 | {
107 | "name": "stdout",
108 | "output_type": "stream",
109 | "text": [
110 | "Edge list saved: 0.09 seconds\n"
111 | ]
112 | }
113 | ],
114 | "source": [
115 | "# Save genelist to file\n",
116 | "outdir = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/Network_SIFs_Symbol/'\n",
117 | "gct.write_edgelist(BIND_edgelist_filt, outdir+'BIND_Symbol.sif')"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "metadata": {
124 | "collapsed": true
125 | },
126 | "outputs": [],
127 | "source": []
128 | }
129 | ],
130 | "metadata": {
131 | "kernelspec": {
132 | "display_name": "Python 2",
133 | "language": "python",
134 | "name": "python2"
135 | },
136 | "language_info": {
137 | "codemirror_mode": {
138 | "name": "ipython",
139 | "version": 2
140 | },
141 | "file_extension": ".py",
142 | "mimetype": "text/x-python",
143 | "name": "python",
144 | "nbconvert_exporter": "python",
145 | "pygments_lexer": "ipython2",
146 | "version": "2.7.11"
147 | }
148 | },
149 | "nbformat": 4,
150 | "nbformat_minor": 0
151 | }
152 |
--------------------------------------------------------------------------------
/Network Processing Notebooks/BioGRID Processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from network_evaluation_tools import gene_conversion_tools as gct\n",
12 | "import pandas as pd\n",
13 | "import itertools"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "## Load BioGRID Raw Data\n",
21 | "#### Source (MITAB): http://thebiogrid.org/downloads/archives/Release%20Archive/BIOGRID-3.4.149/BIOGRID-ORGANISM-3.4.149.tab2.zip\n",
22 | "Downloaded: June 15, 2017 \n",
23 | "Last Updated: June 01, 2017 \n",
 24 |     "Notes for download: There is a new version of BioGRID released on the first of every month. Download the organism-specific files to extract only human interactions from the database.  \n",
25 | "Notes for processing: This is the file for human protein interactions, however, not all interactions may be human-human interactions. These need to be filtered. There is a column for \"Score\" filtering, but it seems that most of these values are missing so they will be ignored for processing BioGRID"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 3,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [
35 | {
36 | "name": "stdout",
37 | "output_type": "stream",
38 | "text": [
39 | "Raw edge count in BioGRID: 394749\n"
40 | ]
41 | }
42 | ],
43 | "source": [
44 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n",
45 | "BioGRID_Raw = pd.read_csv(wd+'Network_Data_Raw/BioGRID/BIOGRID-ORGANISM-3.4.149.tab2/BIOGRID-ORGANISM-Homo_sapiens-3.4.149.tab2.txt',sep='\\t', low_memory=False)\n",
46 | "print 'Raw edge count in BioGRID:', len(BioGRID_Raw)"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 4,
52 | "metadata": {
53 | "collapsed": false
54 | },
55 | "outputs": [
56 | {
57 | "data": {
58 | "text/plain": [
59 | "physical 392779\n",
60 | "genetic 1970\n",
61 | "Name: Experimental System Type, dtype: int64"
62 | ]
63 | },
64 | "execution_count": 4,
65 | "metadata": {},
66 | "output_type": "execute_result"
67 | }
68 | ],
69 | "source": [
70 | "# Show not all interactions in BioGRID are physical PPI, though the overwhelming majority are\n",
71 | "BioGRID_Raw['Experimental System Type'].value_counts()"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 27,
77 | "metadata": {
78 | "collapsed": false,
79 | "scrolled": true
80 | },
81 | "outputs": [
82 | {
83 | "data": {
84 | "text/plain": [
85 | "9606 372979\n",
86 | "10090 17963\n",
87 | "11676 1591\n",
88 | "10116 570\n",
89 | "559292 355\n",
90 | "Name: Organism Interactor A, dtype: int64"
91 | ]
92 | },
93 | "execution_count": 27,
94 | "metadata": {},
95 | "output_type": "execute_result"
96 | }
97 | ],
98 | "source": [
99 | "# Not all interactions are from Human\n",
100 | "BioGRID_Raw['Organism Interactor A'].value_counts().head()"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": 28,
106 | "metadata": {
107 | "collapsed": false,
108 | "scrolled": true
109 | },
110 | "outputs": [
111 | {
112 | "data": {
113 | "text/plain": [
114 | "9606 389334\n",
115 | "10090 2543\n",
116 | "559292 1045\n",
117 | "10116 708\n",
118 | "11676 318\n",
119 | "Name: Organism Interactor B, dtype: int64"
120 | ]
121 | },
122 | "execution_count": 28,
123 | "metadata": {},
124 | "output_type": "execute_result"
125 | }
126 | ],
127 | "source": [
128 | "# Not all interactions are from Human\n",
129 | "BioGRID_Raw['Organism Interactor B'].value_counts().head()"
130 | ]
131 | },
132 | {
133 | "cell_type": "markdown",
134 | "metadata": {},
135 | "source": [
136 | "#### Since there are so few genetic interactions relative to physical interactions, we will not filter these edges. However, we will filter all interactions that are not labelled as human-human interactions"
137 | ]
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "metadata": {},
142 | "source": [
143 | "#### Keep only human-human interactions"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 9,
149 | "metadata": {
150 | "collapsed": false
151 | },
152 | "outputs": [
153 | {
154 | "name": "stdout",
155 | "output_type": "stream",
156 | "text": [
157 | "Human-Human only interactions in BioGRID 3.4.149: 367564\n"
158 | ]
159 | }
160 | ],
161 | "source": [
162 | "BioGRID_Human_Only = BioGRID_Raw[(BioGRID_Raw['Organism Interactor A']==9606) & (BioGRID_Raw['Organism Interactor B']==9606)]\n",
163 | "print 'Human-Human only interactions in BioGRID 3.4.149:', len(BioGRID_Human_Only)"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 29,
169 | "metadata": {
170 | "collapsed": false
171 | },
172 | "outputs": [
173 | {
174 | "data": {
175 | "text/plain": [
176 | "Series([], Name: Official Symbol Interactor A, dtype: object)"
177 | ]
178 | },
179 | "execution_count": 29,
180 | "metadata": {},
181 | "output_type": "execute_result"
182 | }
183 | ],
184 | "source": [
185 | "# Any missing symbol names in column A?\n",
186 | "BioGRID_Human_Only['Official Symbol Interactor A'][BioGRID_Human_Only['Official Symbol Interactor A']=='-']"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": 30,
192 | "metadata": {
193 | "collapsed": false
194 | },
195 | "outputs": [
196 | {
197 | "data": {
198 | "text/plain": [
199 | "Series([], Name: Official Symbol Interactor B, dtype: object)"
200 | ]
201 | },
202 | "execution_count": 30,
203 | "metadata": {},
204 | "output_type": "execute_result"
205 | }
206 | ],
207 | "source": [
208 | "# Any missing symbol names in column B?\n",
209 | "BioGRID_Human_Only['Official Symbol Interactor B'][BioGRID_Human_Only['Official Symbol Interactor B']=='-']"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": 32,
215 | "metadata": {
216 | "collapsed": false
217 | },
218 | "outputs": [
219 | {
220 | "name": "stdout",
221 | "output_type": "stream",
222 | "text": [
223 | "Edges in BioGRID: 367564\n"
224 | ]
225 | }
226 | ],
227 | "source": [
228 | "# Convert table of interactions to edgelist (no scores given)\n",
229 | "# Also no gene symbol conversion necessary because network is given in symbol format already\n",
230 | "BioGRID_edgelist = BioGRID_Human_Only[['Official Symbol Interactor A', 'Official Symbol Interactor B']].values.tolist()\n",
231 | "print 'Edges in BioGRID:', len(BioGRID_edgelist)"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": 33,
237 | "metadata": {
238 | "collapsed": true
239 | },
240 | "outputs": [],
241 | "source": [
242 | "# Sort each edge representation for filtering\n",
243 | "BioGRID_edgelist_sorted = [sorted(edge) for edge in BioGRID_edgelist]"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": 34,
249 | "metadata": {
250 | "collapsed": false
251 | },
252 | "outputs": [
253 | {
254 | "name": "stdout",
255 | "output_type": "stream",
256 | "text": [
257 | "367564 input edges\n",
258 | "4598 self-edges removed\n",
259 | "0 edges with un-mapped genes removed\n",
260 | "104709 duplicate edges removed\n",
261 | "Edge list filtered: 0.29 seconds\n",
262 | "258257 Edges remaining\n"
263 | ]
264 | }
265 | ],
266 | "source": [
267 | "# Filter edgelist for duplicate nodes and for self-edges\n",
268 | "BioGRID_edgelist_filt = gct.filter_converted_edgelist(BioGRID_edgelist_sorted)"
269 | ]
270 | },
271 | {
272 | "cell_type": "code",
273 | "execution_count": 37,
274 | "metadata": {
275 | "collapsed": false
276 | },
277 | "outputs": [
278 | {
279 | "name": "stdout",
280 | "output_type": "stream",
281 | "text": [
282 | "Edge list saved: 0.21 seconds\n"
283 | ]
284 | }
285 | ],
286 | "source": [
287 | "# Save genelist to file\n",
288 | "outdir = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/Network_SIFs_Symbol/'\n",
289 | "gct.write_edgelist(BioGRID_edgelist_filt, outdir+'BioGRID_Symbol.sif')"
290 | ]
291 | }
292 | ],
293 | "metadata": {
294 | "kernelspec": {
295 | "display_name": "Python 2",
296 | "language": "python",
297 | "name": "python2"
298 | },
299 | "language_info": {
300 | "codemirror_mode": {
301 | "name": "ipython",
302 | "version": 2
303 | },
304 | "file_extension": ".py",
305 | "mimetype": "text/x-python",
306 | "name": "python",
307 | "nbconvert_exporter": "python",
308 | "pygments_lexer": "ipython2",
309 | "version": "2.7.11"
310 | }
311 | },
312 | "nbformat": 4,
313 | "nbformat_minor": 0
314 | }
315 |
--------------------------------------------------------------------------------
/Network Processing Notebooks/BioPlex Processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from network_evaluation_tools import gene_conversion_tools as gct\n",
12 | "import pandas as pd\n",
13 | "import itertools"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "## Load BioPlex Raw Data\n",
21 | "#### Source: http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv\n",
22 | "Downloaded: June 20, 2017 \n",
23 | "Last Updated: December 01, 2016 \n",
24 | "This latest update of BioPlex (2.0 v4) is associated with the recent paper: Huttlin et al. (2017) Nature doi: 10.1038/nature22366 \n",
25 | "Note: We could use the 'p(Interaction)' column as a scoring metric to filter the network further, however, a top 10% filtering of this network would yield a network with <6000 interactions, so we did not feel like it was necessary to filter the network further for analysis."
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 13,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [
35 | {
36 | "name": "stdout",
37 | "output_type": "stream",
38 | "text": [
39 | "Raw edge count in BioPlex: 56553\n"
40 | ]
41 | }
42 | ],
43 | "source": [
44 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n",
45 | "BioPlex_Raw = pd.read_csv(wd+'Network_Data_Raw/BioPlex_interactionList_v4a.tsv',sep='\\t')\n",
46 | "print 'Raw edge count in BioPlex:', len(BioPlex_Raw)"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 14,
52 | "metadata": {
53 | "collapsed": false
54 | },
55 | "outputs": [
56 | {
57 | "data": {
58 | "text/html": [
59 | "
\n",
60 | "
\n",
61 | " \n",
62 | " \n",
63 | " | \n",
64 | " GeneA | \n",
65 | " GeneB | \n",
66 | " UniprotA | \n",
67 | " UniprotB | \n",
68 | " SymbolA | \n",
69 | " SymbolB | \n",
70 | " p(Wrong) | \n",
71 | " p(No Interaction) | \n",
72 | " p(Interaction) | \n",
73 | "
\n",
74 | " \n",
75 | " \n",
76 | " \n",
77 | " 0 | \n",
78 | " 100 | \n",
79 | " 728378 | \n",
80 | " P00813 | \n",
81 | " A5A3E0 | \n",
82 | " ADA | \n",
83 | " POTEF | \n",
84 | " 2.380858e-09 | \n",
85 | " 0.000332 | \n",
86 | " 0.999668 | \n",
87 | "
\n",
88 | " \n",
89 | " 1 | \n",
90 | " 100 | \n",
91 | " 345651 | \n",
92 | " P00813 | \n",
93 | " Q562R1 | \n",
94 | " ADA | \n",
95 | " ACTBL2 | \n",
96 | " 9.786437e-18 | \n",
97 | " 0.211914 | \n",
98 | " 0.788086 | \n",
99 | "
\n",
100 | " \n",
101 | " 2 | \n",
102 | " 222389 | \n",
103 | " 708 | \n",
104 | " Q8N7W2 | \n",
105 | " Q07021 | \n",
106 | " BEND7 | \n",
107 | " C1QBP | \n",
108 | " 2.962215e-17 | \n",
109 | " 0.005645 | \n",
110 | " 0.994355 | \n",
111 | "
\n",
112 | " \n",
113 | " 3 | \n",
114 | " 222389 | \n",
115 | " 4038 | \n",
116 | " Q8N7W2 | \n",
117 | " O75096 | \n",
118 | " BEND7 | \n",
119 | " LRP4 | \n",
120 | " 3.302994e-10 | \n",
121 | " 0.000280 | \n",
122 | " 0.999720 | \n",
123 | "
\n",
124 | " \n",
125 | " 4 | \n",
126 | " 645121 | \n",
127 | " 3312 | \n",
128 | " Q6ZMN8 | \n",
129 | " P11142 | \n",
130 | " CCNI2 | \n",
131 | " HSPA8 | \n",
132 | " 2.060285e-16 | \n",
133 | " 0.036235 | \n",
134 | " 0.963765 | \n",
135 | "
\n",
136 | " \n",
137 | "
\n",
138 | "
"
139 | ],
140 | "text/plain": [
141 | " GeneA GeneB UniprotA UniprotB SymbolA SymbolB p(Wrong) \\\n",
142 | "0 100 728378 P00813 A5A3E0 ADA POTEF 2.380858e-09 \n",
143 | "1 100 345651 P00813 Q562R1 ADA ACTBL2 9.786437e-18 \n",
144 | "2 222389 708 Q8N7W2 Q07021 BEND7 C1QBP 2.962215e-17 \n",
145 | "3 222389 4038 Q8N7W2 O75096 BEND7 LRP4 3.302994e-10 \n",
146 | "4 645121 3312 Q6ZMN8 P11142 CCNI2 HSPA8 2.060285e-16 \n",
147 | "\n",
148 | " p(No Interaction) p(Interaction) \n",
149 | "0 0.000332 0.999668 \n",
150 | "1 0.211914 0.788086 \n",
151 | "2 0.005645 0.994355 \n",
152 | "3 0.000280 0.999720 \n",
153 | "4 0.036235 0.963765 "
154 | ]
155 | },
156 | "execution_count": 14,
157 | "metadata": {},
158 | "output_type": "execute_result"
159 | }
160 | ],
161 | "source": [
162 | "BioPlex_Raw.head()"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": 15,
168 | "metadata": {
169 | "collapsed": false
170 | },
171 | "outputs": [
172 | {
173 | "name": "stdout",
174 | "output_type": "stream",
175 | "text": [
176 | "Edges in BioPlex: 56553\n"
177 | ]
178 | }
179 | ],
180 | "source": [
181 | "# Convert table of interactions to edgelist (no scores given)\n",
182 | "# Also no gene symbol conversion necessary because network is given in symbol format already\n",
183 | "BioPlex_edgelist = BioPlex_Raw[['SymbolA', 'SymbolB']].values.tolist()\n",
184 | "print 'Edges in BioPlex:', len(BioPlex_edgelist)"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 16,
190 | "metadata": {
191 | "collapsed": true
192 | },
193 | "outputs": [],
194 | "source": [
195 | "# Sort each edge representation for filtering\n",
196 | "BioPlex_edgelist_sorted = [sorted(edge) for edge in BioPlex_edgelist]"
197 | ]
198 | },
199 | {
200 | "cell_type": "code",
201 | "execution_count": 17,
202 | "metadata": {
203 | "collapsed": false
204 | },
205 | "outputs": [
206 | {
207 | "name": "stdout",
208 | "output_type": "stream",
209 | "text": [
210 | "56553 input edges\n",
211 | "0 self-edges removed\n",
212 | "0 edges with un-mapped genes removed\n",
213 | "0 duplicate edges removed\n",
214 | "Edge list filtered: 0.21 seconds\n",
215 | "56553 Edges remaining\n"
216 | ]
217 | }
218 | ],
219 | "source": [
220 | "# Filter edgelist for duplicate nodes and for self-edges\n",
221 | "BioPlex_edgelist_filt = gct.filter_converted_edgelist(BioPlex_edgelist)"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": 18,
227 | "metadata": {
228 | "collapsed": false
229 | },
230 | "outputs": [
231 | {
232 | "name": "stdout",
233 | "output_type": "stream",
234 | "text": [
235 | "Edge list saved: 0.1 seconds\n"
236 | ]
237 | }
238 | ],
239 | "source": [
240 | "# Write network to file\n",
241 | "gct.write_edgelist(BioPlex_edgelist_filt, wd+'Network_SIFs_Symbol/BioPlex_Symbol.sif', binary=True)"
242 | ]
243 | }
244 | ],
245 | "metadata": {
246 | "kernelspec": {
247 | "display_name": "Python 2",
248 | "language": "python",
249 | "name": "python2"
250 | },
251 | "language_info": {
252 | "codemirror_mode": {
253 | "name": "ipython",
254 | "version": 2
255 | },
256 | "file_extension": ".py",
257 | "mimetype": "text/x-python",
258 | "name": "python",
259 | "nbconvert_exporter": "python",
260 | "pygments_lexer": "ipython2",
261 | "version": "2.7.11"
262 | }
263 | },
264 | "nbformat": 4,
265 | "nbformat_minor": 0
266 | }
267 |
--------------------------------------------------------------------------------
/Network Processing Notebooks/DIP Processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from network_evaluation_tools import gene_conversion_tools as gct\n",
12 | "import pandas as pd\n",
13 | "import itertools"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "## Load DIP Raw Data\n",
21 | "#### Source (MITAB): http://dip.doe-mbi.ucla.edu/dip/File.cgi?FN=2016/tab25/Hsapi20170205.txt\n",
22 | "Downloaded: June 15, 2017 \n",
23 | "Last Updated: February 05, 2017 \n",
24 | "Notes for download: Website requires registration. Register for the site to download the file from the link. \n",
25 | "Notes for processing: This is the file for human protein interactions, however, not all interactions are human-human interactions. These need to be filtered. Also, all IDs without a RefSeq or UniProt ID are excluded. Custom processing for this network is described below"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 5,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [
35 | {
36 | "name": "stdout",
37 | "output_type": "stream",
38 | "text": [
39 | "Raw edge count in DIP: 7794\n"
40 | ]
41 | }
42 | ],
43 | "source": [
44 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n",
45 | "DIP_Raw = pd.read_csv(wd+'Network_Data_Raw/DIP/Hsapi20170205.txt', index_col=0, sep='\\t')\n",
46 | "print 'Raw edge count in DIP:', len(DIP_Raw)"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 12,
52 | "metadata": {
53 | "collapsed": false
54 | },
55 | "outputs": [],
56 | "source": [
57 | "# Fix the column offset in the interaction data table\n",
58 | "DIP_Raw_offset = DIP_Raw.reset_index(drop=False)[DIP_Raw.reset_index(drop=False).columns[:-2]]\n",
59 | "DIP_Raw_offset.columns = DIP_Raw.columns[:-1]"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 16,
65 | "metadata": {
66 | "collapsed": false
67 | },
68 | "outputs": [
69 | {
70 | "name": "stdout",
71 | "output_type": "stream",
72 | "text": [
73 | "Human-Human only interactions in DIP: 5569\n"
74 | ]
75 | }
76 | ],
77 | "source": [
78 | "# Keep only human-human interactions\n",
79 | "DIP_Human_only = DIP_Raw_offset[(DIP_Raw_offset['Taxid interactor A']=='taxid:9606(Homo sapiens)') & (DIP_Raw_offset['Taxid interactor B']=='taxid:9606(Homo sapiens)')]\n",
80 | "print 'Human-Human only interactions in DIP:', len(DIP_Human_only)"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "metadata": {},
86 | "source": [
87 | "#### Parse all genes in filtered DIP and keep only RefSeq/UniProtKB labelled interactions"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 18,
93 | "metadata": {
94 | "collapsed": true
95 | },
96 | "outputs": [],
97 | "source": [
98 | "# Extract gene list\n",
99 | "Human_DIP_Genes = list(set(DIP_Human_only['ID interactor A']).union(set(DIP_Human_only['ID interactor B'])))"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": 25,
105 | "metadata": {
106 | "collapsed": false
107 | },
108 | "outputs": [],
109 | "source": [
110 | "# Split all gene names into list of genes and concatenate\n",
111 | "Human_DIP_Genes_split = [name.split('|') for name in Human_DIP_Genes]\n",
112 | "Human_DIP_Genes_full_list = list(itertools.chain.from_iterable(Human_DIP_Genes_split))\n",
113 | "\n",
114 | "# Note about this line: This is to fix the one example where one of the Uniprot genes gets labelled as \"uniprotkb:Q13936,159'\n",
115 | "Human_DIP_Genes_full_list = [name.split(',')[0] for name in Human_DIP_Genes_full_list] "
116 | ]
117 | },
118 | {
119 | "cell_type": "markdown",
120 | "metadata": {},
121 | "source": [
122 | "## Convert Genes"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 26,
128 | "metadata": {
129 | "collapsed": false
130 | },
131 | "outputs": [
132 | {
133 | "name": "stdout",
134 | "output_type": "stream",
135 | "text": [
136 | "5017 Valid Query Genes\n",
137 | "3281 Invalid Query Genes\n"
138 | ]
139 | }
140 | ],
141 | "source": [
142 | "# Construct list of genes to be submitted to MyGene.Info API (remove all genes with 'DIP' prefix)\n",
143 | "query_string, valid_genes, invalid_genes = gct.query_constructor(Human_DIP_Genes_full_list, exclude_prefixes=['DIP'])"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 31,
149 | "metadata": {
150 | "collapsed": false
151 | },
152 | "outputs": [
153 | {
154 | "name": "stdout",
155 | "output_type": "stream",
156 | "text": [
157 | "Batch query complete: 7.97 seconds\n",
158 | "5074 Matched query results\n"
159 | ]
160 | }
161 | ],
162 | "source": [
163 | "# Set scopes (gene naming systems to search)\n",
164 | "scopes = \"uniprot, refseq\"\n",
165 | "# Set fields (systems from which to return gene names from)\n",
166 | "fields = \"symbol, entrezgene\"\n",
167 | "# Query MyGene.Info\n",
168 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n",
169 | "print len(match_list), 'Matched query results'"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": 32,
175 | "metadata": {
176 | "collapsed": false,
177 | "scrolled": true
178 | },
179 | "outputs": [
180 | {
181 | "name": "stdout",
182 | "output_type": "stream",
183 | "text": [
184 | "Queries without full matching results found: 106\n",
185 | "\n",
186 | "74 Queries with mutliple matches found\n",
187 | "\n",
188 | "Query mapping table/dictionary construction complete: 6.82 seconds\n"
189 | ]
190 | }
191 | ],
192 | "source": [
193 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)"
194 | ]
195 | },
196 | {
197 | "cell_type": "markdown",
198 | "metadata": {},
199 | "source": [
200 | "## Construct Converted Network"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": 36,
206 | "metadata": {
207 | "collapsed": true
208 | },
209 | "outputs": [],
210 | "source": [
211 | "# This is a custom gene conversion function written due to the parsing required for gene interactor labels\n",
212 | "# Returns best matched symbol and/or entrez id from each DIP interactor string (if applicable)\n",
213 | "def convert_DIP_string(string, field):\n",
214 | " names = [gct.get_identifier_without_prefix(name) for name in string.split('|')]\n",
215 | " # Keep only mappings defined for field of interest\n",
216 | " if field=='symbol':\n",
217 | " # Return match table values that have matched symbol\n",
218 | " conversion = match_table_trim.ix[names][~(match_table_trim.ix[names]['Symbol'].isnull())]\n",
219 | " # Return conversion with max score or None if no conversion\n",
220 | " if conversion.shape[0]==0:\n",
221 | " return None\n",
222 | " else:\n",
223 | " max_score = conversion['Score'].max()\n",
224 | " return conversion[conversion['Score']==max_score].ix[0]['Symbol']\n",
225 | " elif field=='entrez':\n",
226 | " # Return match table values that have matched symbol\n",
227 | " conversion = match_table_trim.ix[names][~(match_table_trim.ix[names]['EntrezID'].isnull())]\n",
228 | " if conversion.shape[0]==0:\n",
229 | " return None\n",
230 | " else:\n",
231 | " # Return conversion with max score or None if no conversion\n",
232 | " max_score = conversion['Score'].max()\n",
233 | " return conversion[conversion['Score']==max_score].ix[0]['EntrezID']"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": 37,
239 | "metadata": {
240 | "collapsed": false
241 | },
242 | "outputs": [],
243 | "source": [
244 | "DIP_Human_only_edges = DIP_Human_only[['ID interactor A', 'ID interactor B']].values.tolist()\n",
245 | "DIP_edgelist_symbol = [sorted([convert_DIP_string(edge[0],'symbol'),convert_DIP_string(edge[1],'symbol')]) for edge in DIP_Human_only_edges]"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": 39,
251 | "metadata": {
252 | "collapsed": false
253 | },
254 | "outputs": [
255 | {
256 | "name": "stdout",
257 | "output_type": "stream",
258 | "text": [
259 | "5569 input edges\n",
260 | "512 self-edges removed\n",
261 | "309 edges with un-mapped genes removed\n",
262 | "26 duplicate edges removed\n",
263 | "Edge list filtered: 0.02 seconds\n",
264 | "4722 Edges remaining\n"
265 | ]
266 | }
267 | ],
268 | "source": [
269 | "# Filter converted edge list\n",
270 | "DIP_edgelist_symbol_filt = gct.filter_converted_edgelist(DIP_edgelist_symbol)"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": 40,
276 | "metadata": {
277 | "collapsed": false
278 | },
279 | "outputs": [
280 | {
281 | "name": "stdout",
282 | "output_type": "stream",
283 | "text": [
284 | "Edge list saved: 0.02 seconds\n"
285 | ]
286 | }
287 | ],
288 | "source": [
289 | "# Save converted edge list\n",
290 | "gct.write_edgelist(DIP_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/DIP_Symbol.sif')"
291 | ]
292 | }
293 | ],
294 | "metadata": {
295 | "kernelspec": {
296 | "display_name": "Python 2",
297 | "language": "python",
298 | "name": "python2"
299 | },
300 | "language_info": {
301 | "codemirror_mode": {
302 | "name": "ipython",
303 | "version": 2
304 | },
305 | "file_extension": ".py",
306 | "mimetype": "text/x-python",
307 | "name": "python",
308 | "nbconvert_exporter": "python",
309 | "pygments_lexer": "ipython2",
310 | "version": "2.7.11"
311 | }
312 | },
313 | "nbformat": 4,
314 | "nbformat_minor": 0
315 | }
316 |
--------------------------------------------------------------------------------
/Network Processing Notebooks/Degree-Preserved Network Shufflings.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from pyNBS import data_import_tools as dit\n",
12 | "from pyNBS import network_propagation as prop\n",
13 | "import os\n",
14 | "import numpy as np\n",
15 | "import pandas as pd\n",
16 | "import networkx as nx\n",
17 | "import time"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 4,
23 | "metadata": {
24 | "collapsed": true
25 | },
26 | "outputs": [],
27 | "source": [
28 | "def shuffle_network(network, verbose=False):\n",
29 | "\t# Shuffle Network\n",
30 | "\tshuff_time = time.time()\n",
31 | "\tedge_len=len(network.edges())\n",
32 | "\tshuff_net=network.copy()\n",
33 | "\ttry:\n",
34 | "\t\tnx.double_edge_swap(shuff_net, nswap=edge_len, max_tries=edge_len*10)\n",
35 | "\texcept:\n",
36 | "\t\tif verbose:\n",
37 | "\t\t\tprint 'Note: Maximum number of swap attempts ('+repr(edge_len*10)+') exceeded before desired swaps achieved ('+repr(edge_len)+').'\n",
38 | "\tif verbose:\n",
39 | "\t\t# Evaluate Network Similarity\n",
40 | "\t\tshared_edges = len(set(network.edges()).intersection(set(shuff_net.edges())))\n",
41 | "\t\tprint 'Network shuffled:', time.time()-shuff_time, 'seconds. Edge similarity:', shared_edges/float(edge_len)\n",
42 | "\treturn shuff_net"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 5,
48 | "metadata": {
49 | "collapsed": true
50 | },
51 | "outputs": [],
52 | "source": [
53 | "wd = '/cellar/users/jkhuang/Data/Projects/pyNBS/Data/Network_Data/Network_Files/'\n",
54 | "randNet_outdir = '/cellar/users/jkhuang/Data/Projects/pyNBS/Data/Network_Data/Shuffled_Network_Files/'"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 6,
60 | "metadata": {
61 | "collapsed": false
62 | },
63 | "outputs": [],
64 | "source": [
65 | "network_files = [wd+fn for fn in os.listdir(wd) if fn.endswith('.txt')]"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 15,
71 | "metadata": {
72 | "collapsed": false
73 | },
74 | "outputs": [
75 | {
76 | "name": "stdout",
77 | "output_type": "stream",
78 | "text": [
79 | "PathwayCommons\n",
80 | "Network shuffled: 88.9572019577 seconds. Edge similarity: 0.14217722133\n",
81 | "Shuffled PathwayCommons saved.\n",
82 | "STRING90\n",
83 | "Network shuffled: 31.8355379105 seconds. Edge similarity: 0.135569697974\n",
84 | "Shuffled STRING90 saved.\n",
85 | "HumanNet90\n",
86 | "Network shuffled: 1.94090199471 seconds. Edge similarity: 0.157831705011\n",
87 | "Shuffled HumanNet90 saved.\n",
88 | "PID\n",
89 | "Network shuffled: 0.650630950928 seconds. Edge similarity: 0.172511892547\n",
90 | "Shuffled PID saved.\n",
91 | "Mentha\n",
92 | "Network shuffled: 12.5241580009 seconds. Edge similarity: 0.136090780444\n",
93 | "Shuffled Mentha saved.\n",
94 | "ConsensusPathDB\n",
95 | "Network shuffled: 472.858560085 seconds. Edge similarity: 0.266427489011\n",
96 | "Shuffled ConsensusPathDB saved.\n",
97 | "MultiNet\n",
98 | "Network shuffled: 11.9793038368 seconds. Edge similarity: 0.139956933521\n",
99 | "Shuffled MultiNet saved.\n",
100 | "HPRD\n",
101 | "Network shuffled: 2.19464206696 seconds. Edge similarity: 0.132373984179\n",
102 | "Shuffled HPRD saved.\n",
103 | "GIANT\n",
104 | "Network shuffled: 953.094853163 seconds. Edge similarity: 0.181710364213\n",
105 | "Shuffled GIANT saved.\n",
106 | "HINT\n",
107 | "Network shuffled: 10.6648330688 seconds. Edge similarity: 0.132703799716\n",
108 | "Shuffled HINT saved.\n",
109 | "GeneMANIA\n",
110 | "Network shuffled: 1266.22839403 seconds. Edge similarity: 0.146754353915\n",
111 | "Shuffled GeneMANIA saved.\n",
112 | "Reactome\n",
113 | "Network shuffled: 10.7709050179 seconds. Edge similarity: 0.157268305724\n",
114 | "Shuffled Reactome saved.\n",
115 | "STRING\n",
116 | "Network shuffled: 1679.15529799 seconds. Edge similarity: 0.209015282622\n",
117 | "Shuffled STRING saved.\n",
118 | "IntAct\n",
119 | "Network shuffled: 8.56541705132 seconds. Edge similarity: 0.130773661977\n",
120 | "Shuffled IntAct saved.\n",
121 | "Mentha90\n",
122 | "Network shuffled: 0.904587030411 seconds. Edge similarity: 0.134449008127\n",
123 | "Shuffled Mentha90 saved.\n",
124 | "ReactomeFI\n",
125 | "Network shuffled: 10.2852549553 seconds. Edge similarity: 0.146912035846\n",
126 | "Shuffled ReactomeFI saved.\n",
127 | "BIND\n",
128 | "Network shuffled: 9.11399793625 seconds. Edge similarity: 0.322492442979\n",
129 | "Shuffled BIND saved.\n",
130 | "DIP\n",
131 | "Network shuffled: 0.137312889099 seconds. Edge similarity: 0.120499788225\n",
132 | "Shuffled DIP saved.\n",
133 | "InBioMap75\n",
134 | "Network shuffled: 6.4067800045 seconds. Edge similarity: 0.167107140969\n",
135 | "Shuffled InBioMap75 saved.\n",
136 | "HumanInteractome\n",
137 | "Network shuffled: 0.723779201508 seconds. Edge similarity: 0.136739405675\n",
138 | "Shuffled HumanInteractome saved.\n",
139 | "BioPlex\n",
140 | "Network shuffled: 1.60635495186 seconds. Edge similarity: 0.123919155482\n",
141 | "Shuffled BioPlex saved.\n",
142 | "GeneMANIA90\n",
143 | "Network shuffled: 25.3215258121 seconds. Edge similarity: 0.118961241363\n",
144 | "Shuffled GeneMANIA90 saved.\n",
145 | "BioGRID\n",
146 | "Network shuffled: 11.8226139545 seconds. Edge similarity: 0.131481431287\n",
147 | "Shuffled BioGRID saved.\n",
148 | "GIANT90\n",
149 | "Network shuffled: 22.5300149918 seconds. Edge similarity: 0.188063162301\n",
150 | "Shuffled GIANT90 saved.\n",
151 | "HumanNet\n",
152 | "Network shuffled: 25.1538288593 seconds. Edge similarity: 0.137587481275\n",
153 | "Shuffled HumanNet saved.\n",
154 | "IRefIndex\n",
155 | "Network shuffled: 7.51319789886 seconds. Edge similarity: 0.160039835864\n",
156 | "Shuffled IRefIndex saved.\n",
157 | "InBioMap\n",
158 | "Network shuffled: 46.8094351292 seconds. Edge similarity: 0.167921346275\n",
159 | "Shuffled InBioMap saved.\n"
160 | ]
161 | }
162 | ],
163 | "source": [
164 | "for network_file in network_files:\n",
165 | " network_name = network_file.split('/')[-1].split('_')[0]\n",
166 | " print network_name\n",
167 | " network = dit.load_network_file(network_file)\n",
168 | " shuffNet = shuffle_network(network, verbose=True)\n",
169 | " shuffNet_edges = shuffNet.edges()\n",
170 | " f = open(randNet_outdir+network_name+'-shuffled_Symbol.txt', 'w')\n",
171 | " for edge in shuffNet_edges:\n",
172 | " f.write(str(edge[0])+'\\t'+str(edge[1])+'\\n')\n",
173 | " f.close()\n",
174 | " print 'Shuffled', network_name, 'saved.'"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": null,
180 | "metadata": {
181 | "collapsed": true
182 | },
183 | "outputs": [],
184 | "source": []
185 | }
186 | ],
187 | "metadata": {
188 | "kernelspec": {
189 | "display_name": "Python 2",
190 | "language": "python",
191 | "name": "python2"
192 | },
193 | "language_info": {
194 | "codemirror_mode": {
195 | "name": "ipython",
196 | "version": 2
197 | },
198 | "file_extension": ".py",
199 | "mimetype": "text/x-python",
200 | "name": "python",
201 | "nbconvert_exporter": "python",
202 | "pygments_lexer": "ipython2",
203 | "version": "2.7.11"
204 | }
205 | },
206 | "nbformat": 4,
207 | "nbformat_minor": 0
208 | }
209 |
--------------------------------------------------------------------------------
/Network Processing Notebooks/GIANT Processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from network_evaluation_tools import gene_conversion_tools as gct\n",
12 | "from network_evaluation_tools import data_import_tools as dit\n",
13 | "import pandas as pd\n",
14 | "import time"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Load GIANT Raw Data\n",
22 | "#### Source: http://giant.princeton.edu/static//networks/all_tissues_top.gz\n",
23 | "Downloaded: June 15, 2017 \n",
24 | "Last Updated: N/A, but paper published in 2015 \n",
25 | "Note about processing: This network (even if it is already the top 10% of all edges) is extremely large. Therefore, we will further filter this 'top' functional network further to the top 10% which should yield about 4 million edges. We will then take the top 10% of this filtered network (about 400k edges) to use as the 'filtered' version of this network."
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 4,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [
35 | {
36 | "name": "stdout",
37 | "output_type": "stream",
38 | "text": [
39 | "GIANT All Tissues (Top) Interactions: 38903547\n"
40 | ]
41 | }
42 | ],
43 | "source": [
44 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n",
45 | "GIANT_Raw = pd.read_csv(wd+'/Network_Data_Raw/GIANT_All_Tissues_Top', sep='\\t', header=-1, low_memory=False)\n",
46 | "GIANT_Raw.columns = ['NodeA', 'NodeB', 'Prob']\n",
47 | "print 'GIANT All Tissues (Top) Interactions:', GIANT_Raw.shape[0]"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 5,
53 | "metadata": {
54 | "collapsed": false,
55 | "scrolled": true
56 | },
57 | "outputs": [],
58 | "source": [
 59 |     "# Get all genes to convert from GIANT\n",
60 | "GIANT_Raw_Genes = list(set(GIANT_Raw['NodeA']).union(GIANT_Raw['NodeB']))\n",
 61 |     "# Convert all entrezIDs to string first\n",
62 | "GIANT_Raw_Genes = [str(entrezID) for entrezID in GIANT_Raw_Genes]"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
69 | "## Convert genes from Entrez ID to HUGO Symbol"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 6,
75 | "metadata": {
76 | "collapsed": false
77 | },
78 | "outputs": [
79 | {
80 | "name": "stdout",
81 | "output_type": "stream",
82 | "text": [
83 | "25689 Valid Query Genes\n",
84 | "0 Invalid Query Genes\n"
85 | ]
86 | }
87 | ],
88 | "source": [
89 | "query_string, valid_genes, invalid_genes = gct.query_constructor(GIANT_Raw_Genes)"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 7,
95 | "metadata": {
96 | "collapsed": false
97 | },
98 | "outputs": [
99 | {
100 | "name": "stdout",
101 | "output_type": "stream",
102 | "text": [
103 | "Batch query complete: 30.55 seconds\n",
104 | "25690 Matched query results\n"
105 | ]
106 | }
107 | ],
108 | "source": [
109 | "# Set scopes (gene naming systems to search)\n",
110 | "scopes = \"entrezgene, retired, alias\"\n",
111 | "\n",
112 |     "# Set fields (systems from which to return gene names)\n",
113 | "fields = \"symbol, entrezgene\"\n",
114 | "\n",
115 | "# Query MyGene.Info\n",
116 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n",
117 | "print len(match_list), 'Matched query results'"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 8,
123 | "metadata": {
124 | "collapsed": false
125 | },
126 | "outputs": [
127 | {
128 | "name": "stdout",
129 | "output_type": "stream",
130 | "text": [
131 | "Queries without full matching results found: 806\n",
132 | "\n",
133 | "1 Queries with mutliple matches found\n",
134 | "\n",
135 | "Query mapping table/dictionary construction complete: 140.47 seconds\n"
136 | ]
137 | }
138 | ],
139 | "source": [
140 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)"
141 | ]
142 | },
143 | {
144 | "cell_type": "markdown",
145 | "metadata": {},
146 | "source": [
147 | "## Construct converted network and filter edges"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": 9,
153 | "metadata": {
154 | "collapsed": true
155 | },
156 | "outputs": [],
157 | "source": [
158 | "GIANT_Raw_edgelist = GIANT_Raw.values.tolist()"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": 13,
164 | "metadata": {
165 | "collapsed": false
166 | },
167 | "outputs": [],
168 | "source": [
169 | "# Convert GIANT network edgelist\n",
170 | "GIANT_Raw_edgelist_symbol = [sorted([query_to_symbol[str(int(edge[0]))], query_to_symbol[str(int(edge[1]))]])+[edge[2]] for edge in GIANT_Raw_edgelist]"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": 14,
176 | "metadata": {
177 | "collapsed": false
178 | },
179 | "outputs": [
180 | {
181 | "name": "stdout",
182 | "output_type": "stream",
183 | "text": [
184 | "38903547 input edges\n",
185 | "19204 self-edges removed\n",
186 | "2417020 edges with un-mapped genes removed\n",
187 | "151720 duplicate edges removed\n",
188 | "Edge list filtered: 225.47 seconds\n",
189 | "36315603 Edges remaining\n"
190 | ]
191 | }
192 | ],
193 | "source": [
194 | "# Filter GIANT network edgelist\n",
195 | "GIANT_edgelist_symbol_filt = gct.filter_converted_edgelist(GIANT_Raw_edgelist_symbol, remove_self_edges=True, weighted=True)"
196 | ]
197 | },
198 | {
199 | "cell_type": "markdown",
200 | "metadata": {},
201 | "source": [
202 | "## Filter to top 10% of edges by weight/score"
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": 16,
208 | "metadata": {
209 | "collapsed": false
210 | },
211 | "outputs": [],
212 | "source": [
213 | "GIANT_edgelist_symbol_filt_table = pd.DataFrame(GIANT_edgelist_symbol_filt, columns=['NodeA', 'NodeB', 'Score'])"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": 20,
219 | "metadata": {
220 | "collapsed": false
221 | },
222 | "outputs": [
223 | {
224 | "name": "stdout",
225 | "output_type": "stream",
226 | "text": [
227 | "90% score: 0.207416\n"
228 | ]
229 | }
230 | ],
231 | "source": [
232 | "# Filter edges by score quantile\n",
233 | "q_score = GIANT_edgelist_symbol_filt_table['Score'].quantile(0.9)\n",
234 | "print '90% score:', q_score\n",
235 | "GIANTtop_edgelist = GIANT_edgelist_symbol_filt_table[GIANT_edgelist_symbol_filt_table['Score']>q_score]"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": 23,
241 | "metadata": {
242 | "collapsed": true
243 | },
244 | "outputs": [],
245 | "source": [
246 | "# Save weighted network for GIANT filtered to top 10% of downloaded edges to file\n",
247 | "GIANTtop_edgelist.to_csv('/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/Network_SIFs_Symbol/GIANT_Symbol.sif', sep='\\t', header=False, index=False)"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": 24,
253 | "metadata": {
254 | "collapsed": false
255 | },
256 | "outputs": [
257 | {
258 | "name": "stdout",
259 | "output_type": "stream",
260 | "text": [
261 | "90.0% score: 0.574097\n",
262 | "363128 / 3631554 edges retained\n"
263 | ]
264 | }
265 | ],
266 | "source": [
267 | "# Create filtered network for GIANT\n",
268 | "GIANT90_edgelist = dit.filter_weighted_network_sif(wd+'Network_SIFs_Symbol/GIANT_Symbol.sif', nodeA_col=0, nodeB_col=1, score_col=2, \n",
269 | " q=0.9, delimiter='\\t', verbose=True, save_path=wd+'Network_SIFs_Symbol/GIANT90_Symbol.sif')"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": null,
275 | "metadata": {
276 | "collapsed": true
277 | },
278 | "outputs": [],
279 | "source": []
280 | }
281 | ],
282 | "metadata": {
283 | "kernelspec": {
284 | "display_name": "Python 2",
285 | "language": "python",
286 | "name": "python2"
287 | },
288 | "language_info": {
289 | "codemirror_mode": {
290 | "name": "ipython",
291 | "version": 2
292 | },
293 | "file_extension": ".py",
294 | "mimetype": "text/x-python",
295 | "name": "python",
296 | "nbconvert_exporter": "python",
297 | "pygments_lexer": "ipython2",
298 | "version": "2.7.11"
299 | }
300 | },
301 | "nbformat": 4,
302 | "nbformat_minor": 0
303 | }
304 |
--------------------------------------------------------------------------------
/Network Processing Notebooks/GeneMANIA Processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from network_evaluation_tools import gene_conversion_tools as gct\n",
12 | "from network_evaluation_tools import data_import_tools as dit\n",
13 | "import pandas as pd\n",
14 | "import time"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Load GeneMANIA Raw Data\n",
22 | "#### Source: http://genemania.org/data/current/Homo_sapiens.COMBINED/COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt\n",
23 | "Downloaded: July 28, 2016 \n",
24 | "Last Updated: October 15, 2014\t"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 2,
30 | "metadata": {
31 | "collapsed": false
32 | },
33 | "outputs": [],
34 | "source": [
35 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n",
36 | "GeneMANIA_Raw = pd.read_csv(wd+'/Network_Data_Raw/GeneMANIA/GeneMANIA_2014_10_15.txt',sep='\\t')"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 3,
42 | "metadata": {
43 | "collapsed": false
44 | },
45 | "outputs": [],
46 | "source": [
47 | "GeneMANIA_Raw_Genes = list(set(GeneMANIA_Raw['Gene_A']).union(set(GeneMANIA_Raw['Gene_B'])))"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 4,
53 | "metadata": {
54 | "collapsed": false
55 | },
56 | "outputs": [
57 | {
58 | "name": "stdout",
59 | "output_type": "stream",
60 | "text": [
61 | "7290094 Total GeneMANIA Edges\n"
62 | ]
63 | }
64 | ],
65 | "source": [
66 | "# Get Edgelist of network\n",
67 | "query_edgelist = GeneMANIA_Raw[['Gene_A','Gene_B', 'Weight']].values.tolist()\n",
68 | "print len(query_edgelist), \"Total GeneMANIA Edges\""
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "## Convert Genes (from ensembl gene to gene symbol)"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 5,
81 | "metadata": {
82 | "collapsed": false
83 | },
84 | "outputs": [
85 | {
86 | "name": "stdout",
87 | "output_type": "stream",
88 | "text": [
89 | "19264 Valid Query Genes\n",
90 | "0 Invalid Query Genes\n"
91 | ]
92 | }
93 | ],
94 | "source": [
95 | "query_string, valid_genes, invalid_genes = gct.query_constructor(GeneMANIA_Raw_Genes)"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 6,
101 | "metadata": {
102 | "collapsed": true
103 | },
104 | "outputs": [],
105 | "source": [
106 | "# Set scopes (gene naming systems to search)\n",
107 | "scopes = \"ensemblgene\"\n",
108 | "\n",
109 |     "# Set fields (systems from which to return gene names)\n",
110 | "fields = \"symbol, entrezgene\""
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": 7,
116 | "metadata": {
117 | "collapsed": false
118 | },
119 | "outputs": [
120 | {
121 | "name": "stdout",
122 | "output_type": "stream",
123 | "text": [
124 | "Batch query complete: 35.43 seconds\n",
125 | "19266 Matched query results\n"
126 | ]
127 | }
128 | ],
129 | "source": [
130 | "# Query MyGene.Info\n",
131 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n",
132 | "print len(match_list), 'Matched query results'"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 8,
138 | "metadata": {
139 | "collapsed": false
140 | },
141 | "outputs": [
142 | {
143 | "name": "stdout",
144 | "output_type": "stream",
145 | "text": [
146 | "Queries without full matching results found: 1547\n",
147 | "\n",
148 | "1 Queries with mutliple matches found\n",
149 | "\n",
150 | "Query mapping table/dictionary construction complete: 111.04 seconds\n"
151 | ]
152 | }
153 | ],
154 | "source": [
155 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)"
156 | ]
157 | },
158 | {
159 | "cell_type": "markdown",
160 | "metadata": {},
161 | "source": [
162 | "## Construct Converted Network"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": 9,
168 | "metadata": {
169 | "collapsed": false
170 | },
171 | "outputs": [
172 | {
173 | "name": "stdout",
174 | "output_type": "stream",
175 | "text": [
176 | "CPU times: user 18.5 s, sys: 1.36 s, total: 19.9 s\n",
177 | "Wall time: 19.5 s\n"
178 | ]
179 | }
180 | ],
181 | "source": [
182 | "%%time\n",
183 | "# Convert weighted edge list\n",
184 | "GeneMANIA_edgelist_symbol = gct.convert_edgelist(query_edgelist, query_to_symbol, weighted=True)"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 10,
190 | "metadata": {
191 | "collapsed": false
192 | },
193 | "outputs": [
194 | {
195 | "name": "stdout",
196 | "output_type": "stream",
197 | "text": [
198 | "7290094 input edges\n",
199 | "22144 self-edges removed\n",
200 | "665798 edges with un-mapped genes removed\n",
201 | "508 duplicate edges removed\n",
202 | "Edge list filtered: 39.33 seconds\n",
203 | "6601644 Edges remaining\n"
204 | ]
205 | }
206 | ],
207 | "source": [
208 | "# Filter converted edge list\n",
209 | "GeneMANIA_edgelist_symbol_filt = gct.filter_converted_edgelist(GeneMANIA_edgelist_symbol, weighted=True)"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": 11,
215 | "metadata": {
216 | "collapsed": false
217 | },
218 | "outputs": [
219 | {
220 | "name": "stdout",
221 | "output_type": "stream",
222 | "text": [
223 | "Edge list saved: 13.39 seconds\n"
224 | ]
225 | }
226 | ],
227 | "source": [
228 | "# Write network to file\n",
229 | "gct.write_edgelist(GeneMANIA_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/GeneMANIA_Symbol.sif', binary=False)"
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "execution_count": 12,
235 | "metadata": {
236 | "collapsed": false
237 | },
238 | "outputs": [
239 | {
240 | "name": "stdout",
241 | "output_type": "stream",
242 | "text": [
243 | "90.0% score: 0.00023\n",
244 | "618546 / 6601644 edges retained\n"
245 | ]
246 | }
247 | ],
248 | "source": [
249 | "# Create filtered network\n",
250 | "GeneMANIA90_edgelist = dit.filter_weighted_network_sif(wd+'Network_SIFs_Symbol/GeneMANIA_Symbol.sif', nodeA_col=0, nodeB_col=1, score_col=2, \n",
251 | " q=0.9, delimiter='\\t', verbose=True, save_path=wd+'Network_SIFs_Symbol/GeneMANIA90_Symbol.sif')"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": null,
257 | "metadata": {
258 | "collapsed": true
259 | },
260 | "outputs": [],
261 | "source": []
262 | }
263 | ],
264 | "metadata": {
265 | "kernelspec": {
266 | "display_name": "Python 2",
267 | "language": "python",
268 | "name": "python2"
269 | },
270 | "language_info": {
271 | "codemirror_mode": {
272 | "name": "ipython",
273 | "version": 2
274 | },
275 | "file_extension": ".py",
276 | "mimetype": "text/x-python",
277 | "name": "python",
278 | "nbconvert_exporter": "python",
279 | "pygments_lexer": "ipython2",
280 | "version": "2.7.11"
281 | }
282 | },
283 | "nbformat": 4,
284 | "nbformat_minor": 0
285 | }
286 |
--------------------------------------------------------------------------------
/Network Processing Notebooks/HINT Processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from network_evaluation_tools import gene_conversion_tools as gct\n",
12 | "from network_evaluation_tools import data_import_tools as dit\n",
13 | "import pandas as pd\n",
14 | "import numpy as np"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Load HINT Raw Data\n",
22 | "#### Source: http://hint.yulab.org/batch.html\n",
23 | "Downloaded: June 15, 2017 \n",
24 | "Last update not listed, but currently on version 4 (updated early 2017). The two binary interactomes for High-Quality (HQ) and Co-Complex (CC) interactions were downloaded and merged into a single interactome for HINT. \n",
25 | "Citation: Das J and Yu H. HINT: High-quality protein interactomes and their applications in understanding human disease. BMC Systems Biology, 2012 Jul 30;6(1):92."
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 2,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [],
35 | "source": [
36 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n",
37 | "HINT_Bin_Raw = pd.read_csv(wd+'Network_Data_Raw/HINT_v4_binary_HomoSapiens.txt',sep='\\t')\n",
38 | "HINT_Com_Raw = pd.read_csv(wd+'Network_Data_Raw/HINT_v4_complex_HomoSapiens.txt',sep='\\t')"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 5,
44 | "metadata": {
45 | "collapsed": false,
46 | "scrolled": true
47 | },
48 | "outputs": [
49 | {
50 | "name": "stdout",
51 | "output_type": "stream",
52 | "text": [
53 | "Concatenated list of edges: (181699, 9)\n",
54 | "After duplicate edges removed: (181375, 9)\n"
55 | ]
56 | }
57 | ],
58 | "source": [
59 | "HINT_Raw = pd.concat([HINT_Bin_Raw, HINT_Com_Raw])\n",
60 | "print 'Concatenated list of edges:', HINT_Raw.shape\n",
61 | "HINT_Raw = HINT_Raw.drop_duplicates()\n",
62 | "print 'After duplicate edges removed:', HINT_Raw.shape"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 7,
68 | "metadata": {
69 | "collapsed": false,
70 | "scrolled": true
71 | },
72 | "outputs": [],
73 | "source": [
74 | "# Use UniProtID labels to annotate interactions\n",
75 | "HPRD_Raw_Genes_Uniprot = set(HINT_Raw['Uniprot_A']).union(set(HINT_Raw['Uniprot_B']))"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {},
81 | "source": [
82 | "## Convert Genes from UniProt Accession ID to gene symbols"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 9,
88 | "metadata": {
89 | "collapsed": false
90 | },
91 | "outputs": [
92 | {
93 | "name": "stdout",
94 | "output_type": "stream",
95 | "text": [
96 | "15784 Valid Query Genes\n",
97 | "0 Invalid Query Genes\n"
98 | ]
99 | }
100 | ],
101 | "source": [
102 | "query_string, valid_genes, invalid_genes = gct.query_constructor(HPRD_Raw_Genes_Uniprot)"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 10,
108 | "metadata": {
109 | "collapsed": false
110 | },
111 | "outputs": [
112 | {
113 | "name": "stdout",
114 | "output_type": "stream",
115 | "text": [
116 | "Batch query complete: 19.17 seconds\n",
117 | "16001 Matched query results\n"
118 | ]
119 | }
120 | ],
121 | "source": [
122 | "# Set scopes (gene naming systems to search)\n",
123 | "scopes = \"uniprot\"\n",
124 | "\n",
125 |     "# Set fields (systems from which to return gene names)\n",
126 | "fields = \"symbol, entrezgene\"\n",
127 | "\n",
128 | "# Query MyGene.Info\n",
129 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n",
130 | "print len(match_list), 'Matched query results'"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 11,
136 | "metadata": {
137 | "collapsed": false,
138 | "scrolled": true
139 | },
140 | "outputs": [
141 | {
142 | "name": "stdout",
143 | "output_type": "stream",
144 | "text": [
145 | "Queries without full matching results found: 670\n",
146 | "\n",
147 | "163 Queries with mutliple matches found\n",
148 | "\n",
149 | "Query mapping table/dictionary construction complete: 59.26 seconds\n"
150 | ]
151 | }
152 | ],
153 | "source": [
154 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)"
155 | ]
156 | },
157 | {
158 | "cell_type": "markdown",
159 | "metadata": {},
160 | "source": [
161 | "## Construct Converted Network"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": 13,
167 | "metadata": {
168 | "collapsed": true
169 | },
170 | "outputs": [],
171 | "source": [
172 | "HINT_edgelist = HINT_Raw[['Uniprot_A', 'Uniprot_B']].values.tolist()"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": 16,
178 | "metadata": {
179 | "collapsed": false
180 | },
181 | "outputs": [],
182 | "source": [
183 | "# Convert edge list\n",
184 | "HINT_edgelist_symbol = gct.convert_edgelist(HINT_edgelist, query_to_symbol, weighted=False)"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 19,
190 | "metadata": {
191 | "collapsed": false
192 | },
193 | "outputs": [
194 | {
195 | "name": "stdout",
196 | "output_type": "stream",
197 | "text": [
198 | "181375 input edges\n",
199 | "4730 self-edges removed\n",
200 | "2861 edges with un-mapped genes removed\n",
201 | "18325 duplicate edges removed\n",
202 | "Edge list filtered: 0.33 seconds\n",
203 | "155459 Edges remaining\n"
204 | ]
205 | }
206 | ],
207 | "source": [
208 | "# Filter edge list\n",
209 | "HINT_edgelist_symbol_filt = gct.filter_converted_edgelist(HINT_edgelist_symbol)"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": 20,
215 | "metadata": {
216 | "collapsed": false
217 | },
218 | "outputs": [
219 | {
220 | "name": "stdout",
221 | "output_type": "stream",
222 | "text": [
223 | "Edge list saved: 0.26 seconds\n"
224 | ]
225 | }
226 | ],
227 | "source": [
228 | "# Save edge list\n",
229 | "gct.write_edgelist(HINT_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/HINT_Symbol.sif')"
230 | ]
231 | }
232 | ],
233 | "metadata": {
234 | "kernelspec": {
235 | "display_name": "Python 2",
236 | "language": "python",
237 | "name": "python2"
238 | },
239 | "language_info": {
240 | "codemirror_mode": {
241 | "name": "ipython",
242 | "version": 2
243 | },
244 | "file_extension": ".py",
245 | "mimetype": "text/x-python",
246 | "name": "python",
247 | "nbconvert_exporter": "python",
248 | "pygments_lexer": "ipython2",
249 | "version": "2.7.11"
250 | }
251 | },
252 | "nbformat": 4,
253 | "nbformat_minor": 0
254 | }
255 |
--------------------------------------------------------------------------------
/Network Processing Notebooks/HPRD Processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from network_evaluation_tools import gene_conversion_tools as gct\n",
12 | "from network_evaluation_tools import data_import_tools as dit\n",
13 | "import pandas as pd"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "## Load HPRD Raw Data\n",
21 | "#### Source: http://www.hprd.org/download\n",
22 | "#### The file requires registration with the database. Download the file: HPRD_Release9_041310.tar.gz\n",
23 | "Downloaded: August 12, 2016 \n",
24 | "Last Updated: June 29, 2010 \n",
25 | "The following files are manipulated after unzipping the .tar.gz file"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 3,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [],
35 | "source": [
36 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n",
37 | "HPRD_Raw = pd.read_csv(wd+'Network_Data_Raw/HPRD_Release9_062910/BINARY_PROTEIN_PROTEIN_INTERACTIONS.txt',sep='\\t',header=-1)"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 5,
43 | "metadata": {
44 | "collapsed": false
45 | },
46 | "outputs": [],
47 | "source": [
48 | "# Assign column names from README file from archive\n",
49 | "HPRD_Raw.columns = ['Interactor 1 Gene Symbol', 'Interactor 1 HPRD ID', 'Interactor 1 RefSeq ID',\n",
50 | " 'Interactor 2 Gene Symbol', 'Interactor 2 HPRD ID', 'Interactor 2 RefSeq ID',\n",
51 | " 'Experiment Type', 'PubMed ID']"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 7,
57 | "metadata": {
58 | "collapsed": false
59 | },
60 | "outputs": [
61 | {
62 | "name": "stdout",
63 | "output_type": "stream",
64 | "text": [
65 | "Edges in HPRD: 39240\n"
66 | ]
67 | }
68 | ],
69 | "source": [
70 | "# Convert table of interactions to edgelist (no scores given)\n",
71 | "# Also no gene symbol conversion necessary because network is given in symbol format already\n",
72 | "HPRD_edgelist = HPRD_Raw[['Interactor 1 Gene Symbol', 'Interactor 2 Gene Symbol']].values.tolist()\n",
73 | "print 'Edges in HPRD:', len(HPRD_edgelist)"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 9,
79 | "metadata": {
80 | "collapsed": true
81 | },
82 | "outputs": [],
83 | "source": [
84 | "# Sort each edge representation for filtering\n",
85 | "HPRD_edgelist_sorted = [sorted(edge) for edge in HPRD_edgelist]"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 10,
91 | "metadata": {
92 | "collapsed": false
93 | },
94 | "outputs": [
95 | {
96 | "name": "stdout",
97 | "output_type": "stream",
98 | "text": [
99 | "39240 input edges\n",
100 | "2160 self-edges removed\n",
101 | "0 edges with un-mapped genes removed\n",
102 | "41 duplicate edges removed\n",
103 | "Edge list filtered: 0.05 seconds\n",
104 | "37039 Edges remaining\n"
105 | ]
106 | }
107 | ],
108 | "source": [
109 | "# Filter edgelist for duplicate nodes and for self-edges\n",
110 | "HPRD_edgelist_filt = gct.filter_converted_edgelist(HPRD_edgelist_sorted)"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": 12,
116 | "metadata": {
117 | "collapsed": false
118 | },
119 | "outputs": [
120 | {
121 | "name": "stdout",
122 | "output_type": "stream",
123 | "text": [
124 | "Edge list saved: 0.04 seconds\n"
125 | ]
126 | }
127 | ],
128 | "source": [
129 | "# Save genelist to file\n",
130 | "outdir = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/Network_SIFs_Symbol/'\n",
131 | "gct.write_edgelist(HPRD_edgelist_filt, outdir+'HPRD_Symbol.sif')"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": null,
137 | "metadata": {
138 | "collapsed": true
139 | },
140 | "outputs": [],
141 | "source": []
142 | }
143 | ],
144 | "metadata": {
145 | "kernelspec": {
146 | "display_name": "Python 2",
147 | "language": "python",
148 | "name": "python2"
149 | },
150 | "language_info": {
151 | "codemirror_mode": {
152 | "name": "ipython",
153 | "version": 2
154 | },
155 | "file_extension": ".py",
156 | "mimetype": "text/x-python",
157 | "name": "python",
158 | "nbconvert_exporter": "python",
159 | "pygments_lexer": "ipython2",
160 | "version": "2.7.11"
161 | }
162 | },
163 | "nbformat": 4,
164 | "nbformat_minor": 0
165 | }
166 |
--------------------------------------------------------------------------------
/Network Processing Notebooks/HumanInteractome Processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "from network_evaluation_tools import gene_conversion_tools as gct"
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 | "The following data was downloaded from CCSB and converted to edge list sifs for both symbol and entrez from the simple sifs given in both cases. No additional gene conversions were performed for these networks."
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "## Load HI-II-14 (Human Interactome) Raw Data\n",
27 | "#### Source: http://interactome.dfci.harvard.edu/H_sapiens/download/HI-II-14.tsv\n",
28 | "#### File: 'HI-II-14'\n",
29 | "Downloaded: June 20, 2017 \n",
30 | "Last Updated: Not Listed\n",
31 | "Proteome-scale map of the human binary interactome network generated by systematically screening Space-II associated with Rolland et al Cell 2014"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 3,
37 | "metadata": {
38 | "collapsed": false
39 | },
40 | "outputs": [
41 | {
42 | "name": "stdout",
43 | "output_type": "stream",
44 | "text": [
45 | "Raw Interactions in HI-II-14: 13944\n"
46 | ]
47 | }
48 | ],
49 | "source": [
50 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n",
51 | "HumanInteractome_Raw = pd.read_csv(wd+'Network_Data_Raw/HI-II-14.tsv',sep='\\t')\n",
52 | "print 'Raw Interactions in HI-II-14:', len(HumanInteractome_Raw)"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 5,
58 | "metadata": {
59 | "collapsed": false,
60 | "scrolled": true
61 | },
62 | "outputs": [
63 | {
64 | "name": "stdout",
65 | "output_type": "stream",
66 | "text": [
67 | "Edges in HI-II-14: 13944\n"
68 | ]
69 | }
70 | ],
71 | "source": [
72 | "# Convert table of interactions to edgelist (no scores given)\n",
73 | "# Also no gene symbol conversion necessary because network is given in symbol format already\n",
74 | "HumanInteractome_edgelist = HumanInteractome_Raw[['Symbol A', 'Symbol B']].values.tolist()\n",
75 | "print 'Edges in HI-II-14:', len(HumanInteractome_edgelist)"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 7,
81 | "metadata": {
82 | "collapsed": true
83 | },
84 | "outputs": [],
85 | "source": [
86 | "# Sort each edge representation for filtering\n",
87 | "HumanInteractome_edgelist_sorted = [sorted(edge) for edge in HumanInteractome_edgelist]"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 8,
93 | "metadata": {
94 | "collapsed": false
95 | },
96 | "outputs": [
97 | {
98 | "name": "stdout",
99 | "output_type": "stream",
100 | "text": [
101 | "13944 input edges\n",
102 | "517 self-edges removed\n",
103 | "0 edges with un-mapped genes removed\n",
104 | "0 duplicate edges removed\n",
105 | "Edge list filtered: 0.02 seconds\n",
106 | "13427 Edges remaining\n"
107 | ]
108 | }
109 | ],
110 | "source": [
111 | "# Filter edgelist for duplicate nodes and for self-edges\n",
112 | "HumanInteractome_edgelist_filt = gct.filter_converted_edgelist(HumanInteractome_edgelist_sorted)"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 10,
118 | "metadata": {
119 | "collapsed": false
120 | },
121 | "outputs": [
122 | {
123 | "name": "stdout",
124 | "output_type": "stream",
125 | "text": [
126 | "Edge list saved: 0.02 seconds\n"
127 | ]
128 | }
129 | ],
130 | "source": [
131 | "# Save genelist to file\n",
132 | "outdir = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/Network_SIFs_Symbol/'\n",
133 | "gct.write_edgelist(HumanInteractome_edgelist_filt, outdir+'HumanInteractome_Symbol.sif')"
134 | ]
135 | }
136 | ],
137 | "metadata": {
138 | "kernelspec": {
139 | "display_name": "Python 2",
140 | "language": "python",
141 | "name": "python2"
142 | },
143 | "language_info": {
144 | "codemirror_mode": {
145 | "name": "ipython",
146 | "version": 2
147 | },
148 | "file_extension": ".py",
149 | "mimetype": "text/x-python",
150 | "name": "python",
151 | "nbconvert_exporter": "python",
152 | "pygments_lexer": "ipython2",
153 | "version": "2.7.11"
154 | }
155 | },
156 | "nbformat": 4,
157 | "nbformat_minor": 0
158 | }
159 |
--------------------------------------------------------------------------------
/Network Processing Notebooks/HumanNet Processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from network_evaluation_tools import gene_conversion_tools as gct\n",
12 | "from network_evaluation_tools import data_import_tools as dit\n",
13 | "import pandas as pd\n",
14 | "import time"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Load HumanNet Raw Data\n",
22 | "#### Source: http://www.functionalnet.org/humannet/HumanNet.v1.benchmark.txt\n",
23 | "Downloaded: August 12, 2016 \n",
24 | "No latest version date posted (last updated likely around 2011). \n",
25 | "Citation: Insuk Lee, U. Martin Blom, Peggy I. Wang, Jung Eun Shin, and Edward M. Marcotte\n",
26 | "Genome Research 21(7):1109-21 (2011)"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {
33 | "collapsed": false
34 | },
35 | "outputs": [],
36 | "source": [
37 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n",
38 | "HumanNet_Raw = pd.read_csv(wd+'Network_Data_Raw/HumanNet.v1.join.txt',sep='\\t',header=-1)"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 3,
44 | "metadata": {
45 | "collapsed": false
46 | },
47 | "outputs": [],
48 | "source": [
49 | "f = open(wd+'Network_Data_Raw/HumanNet.v1.evidence_code.txt')\n",
50 | "HumanNet_headers = ['Gene 1', 'Gene 2']+[name.split(' = ')[0] for name in f.read().splitlines()[1:-1]]\n",
51 | "HumanNet_Raw.columns = HumanNet_headers"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 4,
57 | "metadata": {
58 | "collapsed": true
59 | },
60 | "outputs": [],
61 | "source": [
62 | "# Extract gene list\n",
63 | "HumanNet_Raw_Genes = list(set(HumanNet_Raw['Gene 1']).union(set(HumanNet_Raw['Gene 2'])))\n",
64 | "HumanNet_Raw_Genes = [str(gene) for gene in HumanNet_Raw_Genes]"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 5,
70 | "metadata": {
71 | "collapsed": false
72 | },
73 | "outputs": [
74 | {
75 | "name": "stdout",
76 | "output_type": "stream",
77 | "text": [
78 | "476399 HumanNet Edges\n"
79 | ]
80 | }
81 | ],
82 | "source": [
83 | "# Get edge list of network\n",
84 | "query_edgelist = HumanNet_Raw[['Gene 1','Gene 2']].astype(str)\n",
85 | "query_edgelist = pd.concat([query_edgelist, HumanNet_Raw['IntNet']], axis=1).values.tolist()\n",
86 | "print len(query_edgelist), \"HumanNet Edges\""
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "## Convert genes from Entrez ID to HUGO Symbol"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": 6,
99 | "metadata": {
100 | "collapsed": false
101 | },
102 | "outputs": [
103 | {
104 | "name": "stdout",
105 | "output_type": "stream",
106 | "text": [
107 | "16243 Valid Query Genes\n",
108 | "0 Invalid Query Genes\n"
109 | ]
110 | }
111 | ],
112 | "source": [
113 | "query_string, valid_genes, invalid_genes = gct.query_constructor(HumanNet_Raw_Genes)"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": 7,
119 | "metadata": {
120 | "collapsed": false
121 | },
122 | "outputs": [
123 | {
124 | "name": "stdout",
125 | "output_type": "stream",
126 | "text": [
127 | "Batch query complete: 19.6 seconds\n",
128 | "16243 Matched query results\n"
129 | ]
130 | }
131 | ],
132 | "source": [
133 | "# Set scopes (gene naming systems to search)\n",
134 | "scopes = \"entrezgene, retired\"\n",
135 | "\n",
136 | "# Set fields (systems from which to return gene names from)\n",
137 | "fields = \"symbol, entrezgene\"\n",
138 | "\n",
139 | "# Query MyGene.Info\n",
140 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n",
141 | "print len(match_list), 'Matched query results'"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": 8,
147 | "metadata": {
148 | "collapsed": false
149 | },
150 | "outputs": [
151 | {
152 | "name": "stdout",
153 | "output_type": "stream",
154 | "text": [
155 | "Queries without full matching results found: 10\n",
156 | "\n",
157 | "0 Queries with mutliple matches found\n",
158 | "\n",
159 | "Query mapping table/dictionary construction complete: 19.62 seconds\n"
160 | ]
161 | }
162 | ],
163 | "source": [
164 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)"
165 | ]
166 | },
167 | {
168 | "cell_type": "markdown",
169 | "metadata": {},
170 | "source": [
171 | "## Construct Converted Network"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": 9,
177 | "metadata": {
178 | "collapsed": false
179 | },
180 | "outputs": [
181 | {
182 | "name": "stdout",
183 | "output_type": "stream",
184 | "text": [
185 | "CPU times: user 1.54 s, sys: 260 ms, total: 1.8 s\n",
186 | "Wall time: 1.69 s\n"
187 | ]
188 | }
189 | ],
190 | "source": [
191 | "%%time\n",
192 | "# Convert weighted edge list\n",
193 | "HumanNet_edgelist_symbol = gct.convert_edgelist(query_edgelist, query_to_symbol, weighted=True)"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": 10,
199 | "metadata": {
200 | "collapsed": false
201 | },
202 | "outputs": [
203 | {
204 | "name": "stdout",
205 | "output_type": "stream",
206 | "text": [
207 | "476399 input edges\n",
208 | "7 self-edges removed\n",
209 | "225 edges with un-mapped genes removed\n",
210 | "208 duplicate edges removed\n",
211 | "Edge list filtered: 4.15 seconds\n",
212 | "475959 Edges remaining\n"
213 | ]
214 | }
215 | ],
216 | "source": [
217 | "# Filter converted edge list\n",
218 | "HumanNet_edgelist_symbol_filt = gct.filter_converted_edgelist(HumanNet_edgelist_symbol, weighted=True)"
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": 11,
224 | "metadata": {
225 | "collapsed": false
226 | },
227 | "outputs": [
228 | {
229 | "name": "stdout",
230 | "output_type": "stream",
231 | "text": [
232 | "Edge list saved: 1.24 seconds\n"
233 | ]
234 | }
235 | ],
236 | "source": [
237 | "# Write network to file\n",
238 | "gct.write_edgelist(HumanNet_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/HumanNet_Symbol.sif', binary=False)"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": 15,
244 | "metadata": {
245 | "collapsed": false
246 | },
247 | "outputs": [
248 | {
249 | "name": "stdout",
250 | "output_type": "stream",
251 | "text": [
252 | "90.0% score: 2.17047289928\n",
253 | "47595 / 475959 edges retained\n"
254 | ]
255 | }
256 | ],
257 | "source": [
258 | "# Create filtered network\n",
259 | "HumanNet90_edgelist = dit.filter_weighted_network_sif(wd+'Network_SIFs_Symbol/HumanNet_Symbol.sif', nodeA_col=0, nodeB_col=1, score_col=2, \n",
260 | " q=0.9, delimiter='\\t', verbose=True, save_path=wd+'Network_SIFs_Symbol/HumanNet90_Symbol.sif')"
261 | ]
262 | }
263 | ],
264 | "metadata": {
265 | "kernelspec": {
266 | "display_name": "Python 2",
267 | "language": "python",
268 | "name": "python2"
269 | },
270 | "language_info": {
271 | "codemirror_mode": {
272 | "name": "ipython",
273 | "version": 2
274 | },
275 | "file_extension": ".py",
276 | "mimetype": "text/x-python",
277 | "name": "python",
278 | "nbconvert_exporter": "python",
279 | "pygments_lexer": "ipython2",
280 | "version": "2.7.11"
281 | }
282 | },
283 | "nbformat": 4,
284 | "nbformat_minor": 0
285 | }
286 |
--------------------------------------------------------------------------------
/Network Processing Notebooks/InBioMap Processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from network_evaluation_tools import gene_conversion_tools as gct\n",
12 | "from network_evaluation_tools import data_import_tools as dit\n",
13 | "import pandas as pd\n",
14 | "import itertools\n",
15 | "import time"
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {},
21 | "source": [
22 | "## Load InBio_Map Raw Data\n",
23 | "#### Source: https://www.intomics.com/inbio/map/#downloads\n",
24 | "Downloaded: November 30, 2016 \n",
25 | "Last Updated: September 12, 2016 \n",
26 | "Note about scoring: According to the supplement of the associated paper (Li T, et al. A scored human protein–protein interaction network to catalyze genomic interpretation. Nature Methods 14, 61–64 (2017) doi:10.1038/nmeth.4083), column 15 (index=14) should correspond to the confidence score of the edge. This column has 2 values, the confidence score and initial score. We will use the confidence score as it is a corrected version of the initial score calculated, indicating confidence that a particular interaction is real."
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {
33 | "collapsed": false
34 | },
35 | "outputs": [
36 | {
37 | "name": "stdout",
38 | "output_type": "stream",
39 | "text": [
40 | "Raw edge count in InBio_Map: 625641\n"
41 | ]
42 | }
43 | ],
44 | "source": [
45 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n",
46 | "InBio_Map_Raw = pd.read_csv(wd+'Network_Data_Raw/InBio_Map_core_2016_09_12/core.psimitab',sep='\\t', header=-1)\n",
47 | "print 'Raw edge count in InBio_Map:', len(InBio_Map_Raw)"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 3,
53 | "metadata": {
54 | "collapsed": false
55 | },
56 | "outputs": [
57 | {
58 | "name": "stdout",
59 | "output_type": "stream",
60 | "text": [
61 | "Human-Human only interactions in InBioMap: 625641\n"
62 | ]
63 | }
64 | ],
65 | "source": [
66 | "InBio_Map_Human_Only = InBio_Map_Raw[(InBio_Map_Raw[9]=='taxid:9606(Homo sapiens)') & (InBio_Map_Raw[10]=='taxid:9606(Homo sapiens)')]\n",
67 | "print 'Human-Human only interactions in InBioMap:', len(InBio_Map_Human_Only)"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 4,
73 | "metadata": {
74 | "collapsed": false
75 | },
76 | "outputs": [],
77 | "source": [
78 | "# Extract gene list\n",
79 | "InBio_Map_Human_Genes = list(set(InBio_Map_Human_Only[0]).union(set(InBio_Map_Human_Only[1])))\n",
80 | "InBio_Map_Human_Genes = [str(gene) for gene in InBio_Map_Human_Genes]"
81 | ]
82 | },
83 | {
84 | "cell_type": "markdown",
85 | "metadata": {},
86 | "source": [
87 | "## Convert Genes"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 5,
93 | "metadata": {
94 | "collapsed": false
95 | },
96 | "outputs": [
97 | {
98 | "name": "stdout",
99 | "output_type": "stream",
100 | "text": [
101 | "17653 Valid Query Genes\n",
102 | "0 Invalid Query Genes\n"
103 | ]
104 | }
105 | ],
106 | "source": [
107 | "# Construct list of genes to be submitted to MyGene.Info API\n",
108 | "query_string, valid_genes, invalid_genes = gct.query_constructor(InBio_Map_Human_Genes)"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 6,
114 | "metadata": {
115 | "collapsed": true
116 | },
117 | "outputs": [],
118 | "source": [
119 | "# Set scopes (gene naming systems to search)\n",
120 | "scopes = \"uniprot\"\n",
121 | "\n",
122 | "# Set fields (systems from which to return gene names from)\n",
123 | "fields = \"symbol, entrezgene\""
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": 7,
129 | "metadata": {
130 | "collapsed": false
131 | },
132 | "outputs": [
133 | {
134 | "name": "stdout",
135 | "output_type": "stream",
136 | "text": [
137 | "Batch query complete: 39.84 seconds\n",
138 | "17984 Matched query results\n"
139 | ]
140 | }
141 | ],
142 | "source": [
143 | "# Query MyGene.Info\n",
144 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n",
145 | "print len(match_list), 'Matched query results'"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": 8,
151 | "metadata": {
152 | "collapsed": false
153 | },
154 | "outputs": [
155 | {
156 | "name": "stdout",
157 | "output_type": "stream",
158 | "text": [
159 | "Queries without full matching results found: 419\n",
160 | "\n",
161 | "233 Queries with mutliple matches found\n",
162 | "\n",
163 | "Query mapping table/dictionary construction complete: 76.78 seconds\n"
164 | ]
165 | }
166 | ],
167 | "source": [
168 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)"
169 | ]
170 | },
171 | {
172 | "cell_type": "markdown",
173 | "metadata": {},
174 | "source": [
175 | "## Construct Converted Network"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": 9,
181 | "metadata": {
182 | "collapsed": true
183 | },
184 | "outputs": [],
185 | "source": [
186 | "query_edgelist = InBio_Map_Human_Only[[0, 1, 14]].values.tolist()\n",
187 | "query_edgelist_fmt = [[edge[0].split(':')[1], edge[1].split(':')[1], float(edge[2].split('|')[0])] for edge in query_edgelist]"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": 10,
193 | "metadata": {
194 | "collapsed": false
195 | },
196 | "outputs": [
197 | {
198 | "name": "stdout",
199 | "output_type": "stream",
200 | "text": [
201 | "CPU times: user 1.89 s, sys: 197 ms, total: 2.09 s\n",
202 | "Wall time: 1.87 s\n"
203 | ]
204 | }
205 | ],
206 | "source": [
207 | "%%time\n",
208 | "# Convert weighted edge list\n",
209 | "InBioMap_edgelist_symbol = gct.convert_edgelist(query_edgelist_fmt, query_to_symbol, weighted=True)"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": 11,
215 | "metadata": {
216 | "collapsed": false
217 | },
218 | "outputs": [
219 | {
220 | "name": "stdout",
221 | "output_type": "stream",
222 | "text": [
223 | "625641 input edges\n",
224 | "2498 self-edges removed\n",
225 | "12249 edges with un-mapped genes removed\n",
226 | "4896 duplicate edges removed\n",
227 | "Edge list filtered: 3.15 seconds\n",
228 | "605998 Edges remaining\n"
229 | ]
230 | }
231 | ],
232 | "source": [
233 | "# Filter converted edge list\n",
234 | "InBioMap_edgelist_symbol_filt = gct.filter_converted_edgelist(InBioMap_edgelist_symbol, weighted=True)"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": 12,
240 | "metadata": {
241 | "collapsed": false
242 | },
243 | "outputs": [
244 | {
245 | "name": "stdout",
246 | "output_type": "stream",
247 | "text": [
248 | "Edge list saved: 1.77 seconds\n"
249 | ]
250 | }
251 | ],
252 | "source": [
253 | "# Write network to file\n",
254 | "gct.write_edgelist(InBioMap_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/InBioMap_Symbol.sif', binary=False)"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": 13,
260 | "metadata": {
261 | "collapsed": false
262 | },
263 | "outputs": [
264 | {
265 | "name": "stdout",
266 | "output_type": "stream",
267 | "text": [
268 | "90.0% score: 1.0\n",
269 | "0 / 605998 edges retained\n"
270 | ]
271 | }
272 | ],
273 | "source": [
274 | "# Create filtered network\n",
275 | "InBioMap90_edgelist = dit.filter_weighted_network_sif(wd+'Network_SIFs_Symbol/InBioMap_Symbol.sif', nodeA_col=0, nodeB_col=1, score_col=2, \n",
276 | " q=0.9, delimiter='\\t', verbose=True, save_path=wd+'Network_SIFs_Symbol/InBioMap90_Symbol.sif')"
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": 14,
282 | "metadata": {
283 | "collapsed": false
284 | },
285 | "outputs": [
286 | {
287 | "name": "stdout",
288 | "output_type": "stream",
289 | "text": [
290 | "151352 / 605998 edges kept, 0.249756599857\n"
291 | ]
292 | }
293 | ],
294 | "source": [
295 | "# The filter function didn't work here because the max value makes up >90% of the edges. \n",
296 | "# We need to filter but keep all max edges instead\n",
297 | "InBioMap_edgelist = pd.DataFrame(InBioMap_edgelist_symbol_filt, columns=['NodeA', 'NodeB', 'edgeScore'])\n",
298 | "q_score = InBioMap_edgelist['edgeScore'].quantile(0.9)\n",
299 | "InBioMap_edgelist_filt = InBioMap_edgelist[InBioMap_edgelist['edgeScore']>=q_score]\n",
300 | "print InBioMap_edgelist_filt.shape[0], '/', InBioMap_edgelist.shape[0], 'edges kept, ', float(InBioMap_edgelist_filt.shape[0])/InBioMap_edgelist.shape[0]"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": 15,
306 | "metadata": {
307 | "collapsed": false
308 | },
309 | "outputs": [],
310 | "source": [
311 | "# Keeping all edges where the score == 1, it's a top 75% network, we will save this\n",
312 | "InBioMap_edgelist_filt[['NodeA', 'NodeB']].to_csv(wd+'Network_SIFs_Symbol/InBioMap75_Symbol.sif', sep='\\t', index=False, header=False)"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": null,
318 | "metadata": {
319 | "collapsed": true
320 | },
321 | "outputs": [],
322 | "source": []
323 | }
324 | ],
325 | "metadata": {
326 | "kernelspec": {
327 | "display_name": "Python 2",
328 | "language": "python",
329 | "name": "python2"
330 | },
331 | "language_info": {
332 | "codemirror_mode": {
333 | "name": "ipython",
334 | "version": 2
335 | },
336 | "file_extension": ".py",
337 | "mimetype": "text/x-python",
338 | "name": "python",
339 | "nbconvert_exporter": "python",
340 | "pygments_lexer": "ipython2",
341 | "version": "2.7.11"
342 | }
343 | },
344 | "nbformat": 4,
345 | "nbformat_minor": 0
346 | }
347 |
--------------------------------------------------------------------------------
/Network Processing Notebooks/IntAct Processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from network_evaluation_tools import gene_conversion_tools as gct\n",
12 | "from network_evaluation_tools import data_import_tools as dit\n",
13 | "import pandas as pd\n",
14 | "import time"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Load IntAct Raw Data\n",
22 | "#### Source (PSI-MITAB): ftp://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.txt\n",
23 | "Downloaded: June 15, 2017 \n",
24 | "Last Updated: June 05, 2017 \n",
25 | "Notes for processing: All interactions listed here need to be filtered for human-human interactions. Given the size of the file, we will filter the interactions and save the human-only interactions to a separate file to be loaded to save memory."
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 4,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [
35 | {
36 | "name": "stderr",
37 | "output_type": "stream",
38 | "text": [
39 | "/cellar/users/jkhuang/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (38,39) have mixed types. Specify dtype option on import or set low_memory=False.\n",
40 | " interactivity=interactivity, compiler=compiler, result=result)\n"
41 | ]
42 | },
43 | {
44 | "name": "stdout",
45 | "output_type": "stream",
46 | "text": [
47 | "Raw edge count in IntAct: 653104\n"
48 | ]
49 | }
50 | ],
51 | "source": [
52 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n",
53 | "IntAct_Raw = pd.read_csv(wd+'Network_Data_Raw/IntAct/2016-09-08_intact.txt', sep='\\t')\n",
54 | "print 'Raw edge count in IntAct:', len(IntAct_Raw)"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {},
60 | "source": [
 61 |     "## Custom Processing of Raw IntAct Data"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 5,
67 | "metadata": {
68 | "collapsed": false
69 | },
70 | "outputs": [
71 | {
72 | "data": {
73 | "text/plain": [
74 | "Index([u'#ID(s) interactor A', u'ID(s) interactor B',\n",
75 | " u'Alt. ID(s) interactor A', u'Alt. ID(s) interactor B',\n",
76 | " u'Alias(es) interactor A', u'Alias(es) interactor B',\n",
77 | " u'Interaction detection method(s)', u'Publication 1st author(s)',\n",
78 | " u'Publication Identifier(s)', u'Taxid interactor A',\n",
79 | " u'Taxid interactor B', u'Interaction type(s)', u'Source database(s)',\n",
80 | " u'Interaction identifier(s)', u'Confidence value(s)',\n",
81 | " u'Expansion method(s)', u'Biological role(s) interactor A',\n",
82 | " u'Biological role(s) interactor B',\n",
83 | " u'Experimental role(s) interactor A',\n",
84 | " u'Experimental role(s) interactor B', u'Type(s) interactor A',\n",
85 | " u'Type(s) interactor B', u'Xref(s) interactor A',\n",
86 | " u'Xref(s) interactor B', u'Interaction Xref(s)',\n",
87 | " u'Annotation(s) interactor A', u'Annotation(s) interactor B',\n",
88 | " u'Interaction annotation(s)', u'Host organism(s)',\n",
89 | " u'Interaction parameter(s)', u'Creation date', u'Update date',\n",
90 | " u'Checksum(s) interactor A', u'Checksum(s) interactor B',\n",
91 | " u'Interaction Checksum(s)', u'Negative', u'Feature(s) interactor A',\n",
92 | " u'Feature(s) interactor B', u'Stoichiometry(s) interactor A',\n",
93 | " u'Stoichiometry(s) interactor B',\n",
94 | " u'Identification method participant A',\n",
95 | " u'Identification method participant B'],\n",
96 | " dtype='object')"
97 | ]
98 | },
99 | "execution_count": 5,
100 | "metadata": {},
101 | "output_type": "execute_result"
102 | }
103 | ],
104 | "source": [
105 | "IntAct_Raw.columns"
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "metadata": {},
111 | "source": [
112 | "#### Keep only human-human interactions"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 7,
118 | "metadata": {
119 | "collapsed": false,
120 | "scrolled": false
121 | },
122 | "outputs": [
123 | {
124 | "name": "stdout",
125 | "output_type": "stream",
126 | "text": [
127 | "Human-Human only edge count in IntAct: 247565\n"
128 | ]
129 | }
130 | ],
131 | "source": [
132 | "# Filter for only human-human interactions in IntAct\n",
133 | "IntAct_Human_Only = IntAct_Raw[(IntAct_Raw['Taxid interactor A']=='taxid:9606(human)|taxid:9606(Homo sapiens)') & (IntAct_Raw['Taxid interactor B']=='taxid:9606(human)|taxid:9606(Homo sapiens)')]\n",
134 | "IntAct_Human_Only = IntAct_Human_Only.drop_duplicates()\n",
135 | "print 'Human-Human only edge count in IntAct:', IntAct_Human_Only.shape[0]"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": 9,
141 | "metadata": {
142 | "collapsed": false
143 | },
144 | "outputs": [],
145 | "source": [
146 | "Human_IntAct_Genes = list(set(IntAct_Human_Only['#ID(s) interactor A']).union(set(IntAct_Human_Only['ID(s) interactor B'])))"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "metadata": {},
152 | "source": [
153 | "## Convert Genes"
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": 14,
159 | "metadata": {
160 | "collapsed": false
161 | },
162 | "outputs": [
163 | {
164 | "name": "stdout",
165 | "output_type": "stream",
166 | "text": [
167 | "19143 Valid Query Genes\n",
168 | "1162 Invalid Query Genes\n"
169 | ]
170 | }
171 | ],
172 | "source": [
173 | "# Construct list of genes to be submitted to MyGene.Info API (remove all genes with 'intact' prefix)\n",
174 | "query_string, valid_genes, invalid_genes = gct.query_constructor(Human_IntAct_Genes, exclude_prefixes=['intact'])"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": 17,
180 | "metadata": {
181 | "collapsed": false
182 | },
183 | "outputs": [
184 | {
185 | "name": "stdout",
186 | "output_type": "stream",
187 | "text": [
188 | "Batch query complete: 29.14 seconds\n",
189 | "19368 Matched query results\n"
190 | ]
191 | }
192 | ],
193 | "source": [
194 | "# Set scopes (gene naming systems to search)\n",
195 | "scopes = \"uniprot\"\n",
196 | "\n",
197 | "# Set fields (systems from which to return gene names from)\n",
198 | "fields = \"symbol, entrezgene\"\n",
199 | "\n",
200 | "# Query MyGene.Info\n",
201 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n",
202 | "print len(match_list), 'Matched query results'"
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": 18,
208 | "metadata": {
209 | "collapsed": false,
210 | "scrolled": true
211 | },
212 | "outputs": [
213 | {
214 | "name": "stdout",
215 | "output_type": "stream",
216 | "text": [
217 | "Queries without full matching results found: 4329\n",
218 | "\n",
219 | "157 Queries with mutliple matches found\n",
220 | "\n",
221 | "Query mapping table/dictionary construction complete: 94.21 seconds\n"
222 | ]
223 | }
224 | ],
225 | "source": [
226 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)"
227 | ]
228 | },
229 | {
230 | "cell_type": "markdown",
231 | "metadata": {},
232 | "source": [
233 | "## Construct Converted Network"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": 19,
239 | "metadata": {
240 | "collapsed": false
241 | },
242 | "outputs": [],
243 | "source": [
244 | "query_edgelist = IntAct_Human_Only[['#ID(s) interactor A', 'ID(s) interactor B']].drop_duplicates().values.tolist()"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": 21,
250 | "metadata": {
251 | "collapsed": false
252 | },
253 | "outputs": [
254 | {
255 | "name": "stdout",
256 | "output_type": "stream",
257 | "text": [
258 | "5864 / 161035 edges with invalid nodes removed\n"
259 | ]
260 | }
261 | ],
262 | "source": [
263 | "# Filter query edgelist of interactions with invalid genes\n",
264 | "query_edgelist_filt = gct.filter_query_edgelist(query_edgelist, invalid_genes)"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 23,
270 | "metadata": {
271 | "collapsed": true
272 | },
273 | "outputs": [],
274 | "source": [
275 | "# Format edge list by removing 'uniprot:' prefix from all interactors\n",
276 | "query_edgelist_filt_fmt = [[gct.get_identifier_without_prefix(edge[0]), gct.get_identifier_without_prefix(edge[1])] for edge in query_edgelist_filt]"
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": 24,
282 | "metadata": {
283 | "collapsed": false
284 | },
285 | "outputs": [],
286 | "source": [
287 | "# Convert network edge list to symbol\n",
288 | "IntAct_edgelist_symbol = gct.convert_edgelist(query_edgelist_filt_fmt, query_to_symbol)"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": 25,
294 | "metadata": {
295 | "collapsed": false
296 | },
297 | "outputs": [
298 | {
299 | "name": "stdout",
300 | "output_type": "stream",
301 | "text": [
302 | "155171 input edges\n",
303 | "3236 self-edges removed\n",
304 | "20662 edges with un-mapped genes removed\n",
305 | "16701 duplicate edges removed\n",
306 | "Edge list filtered: 0.43 seconds\n",
307 | "114572 Edges remaining\n"
308 | ]
309 | }
310 | ],
311 | "source": [
312 | "# Filter converted edge list\n",
313 | "IntAct_edgelist_symbol_filt = gct.filter_converted_edgelist(IntAct_edgelist_symbol)"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": 26,
319 | "metadata": {
320 | "collapsed": false
321 | },
322 | "outputs": [
323 | {
324 | "name": "stdout",
325 | "output_type": "stream",
326 | "text": [
327 | "Edge list saved: 0.24 seconds\n"
328 | ]
329 | }
330 | ],
331 | "source": [
332 | "# Save filtered, converted edge list to file\n",
333 | "gct.write_edgelist(IntAct_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/IntAct_Symbol.sif')"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": null,
339 | "metadata": {
340 | "collapsed": true
341 | },
342 | "outputs": [],
343 | "source": []
344 | }
345 | ],
346 | "metadata": {
347 | "kernelspec": {
348 | "display_name": "Python 2",
349 | "language": "python",
350 | "name": "python2"
351 | },
352 | "language_info": {
353 | "codemirror_mode": {
354 | "name": "ipython",
355 | "version": 2
356 | },
357 | "file_extension": ".py",
358 | "mimetype": "text/x-python",
359 | "name": "python",
360 | "nbconvert_exporter": "python",
361 | "pygments_lexer": "ipython2",
362 | "version": "2.7.11"
363 | }
364 | },
365 | "nbformat": 4,
366 | "nbformat_minor": 0
367 | }
368 |
--------------------------------------------------------------------------------
/Network Processing Notebooks/Mentha Processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from network_evaluation_tools import gene_conversion_tools as gct\n",
12 | "from network_evaluation_tools import data_import_tools as dit\n",
13 | "import pandas as pd\n",
14 | "import itertools\n",
15 | "import time"
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {},
21 | "source": [
22 | "## Load Mentha Raw Data\n",
23 | "#### Source (MITAB): http://mentha.uniroma2.it/doDownload.php?file=2017-06-12_MITAB-2.5.zip\n",
24 | "Downloaded: June 15, 2017 \n",
25 | "Last Updated: June 12, 2017 \n",
 26 |     "Notes for processing: This file should contain only human-human protein interactions, but this should be checked and filtered if needed. \n",
27 | "A Note about scoring: Mentha does have a score assigned for each interaction called the 'mentha-score', this will be the score we use to filter the network."
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 2,
33 | "metadata": {
34 | "collapsed": false
35 | },
36 | "outputs": [
37 | {
38 | "name": "stdout",
39 | "output_type": "stream",
40 | "text": [
41 | "Raw edge count in Mentha: 1114184\n"
42 | ]
43 | }
44 | ],
45 | "source": [
46 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n",
47 | "Mentha_Raw = pd.read_csv(wd+'Network_Data_Raw/mentha_2017_06_12', sep='\\t', header=-1)\n",
48 | "print 'Raw edge count in Mentha:', len(Mentha_Raw)"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 3,
54 | "metadata": {
55 | "collapsed": false
56 | },
57 | "outputs": [
58 | {
59 | "name": "stdout",
60 | "output_type": "stream",
61 | "text": [
62 | "Human-Human only interactions in Mentha: 531726\n"
63 | ]
64 | }
65 | ],
66 | "source": [
67 | "# Keep only human-human interactions\n",
68 | "Mentha_Human_only = Mentha_Raw[(Mentha_Raw[9]=='taxid:9606(Homo sapiens)') & (Mentha_Raw[10]=='taxid:9606(Homo sapiens)')]\n",
69 | "print 'Human-Human only interactions in Mentha:', len(Mentha_Human_only)"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 4,
75 | "metadata": {
76 | "collapsed": true
77 | },
78 | "outputs": [],
79 | "source": [
80 | "# Extract gene list\n",
81 | "Human_Mentha_Genes = list(set(Mentha_Human_only[0]).union(set(Mentha_Human_only[1])))"
82 | ]
83 | },
84 | {
85 | "cell_type": "markdown",
86 | "metadata": {},
87 | "source": [
88 | "## Convert Network Genes to symbol from UniProt Accession ID"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 5,
94 | "metadata": {
95 | "collapsed": false
96 | },
97 | "outputs": [
98 | {
99 | "name": "stdout",
100 | "output_type": "stream",
101 | "text": [
102 | "18626 Valid Query Genes\n",
103 | "0 Invalid Query Genes\n"
104 | ]
105 | }
106 | ],
107 | "source": [
108 | "# Construct list of genes to be submitted to MyGene.Info API (remove all genes with 'intact' prefix)\n",
109 | "query_string, valid_genes, invalid_genes = gct.query_constructor(Human_Mentha_Genes)"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 6,
115 | "metadata": {
116 | "collapsed": false
117 | },
118 | "outputs": [
119 | {
120 | "name": "stdout",
121 | "output_type": "stream",
122 | "text": [
123 | "Batch query complete: 62.69 seconds\n",
124 | "18932 Matched query results\n"
125 | ]
126 | }
127 | ],
128 | "source": [
129 | "# Set scopes (gene naming systems to search)\n",
130 | "scopes = \"uniprot\"\n",
131 | "\n",
132 | "# Set fields (systems from which to return gene names from)\n",
133 | "fields = \"symbol, entrezgene\"\n",
134 | "\n",
135 | "# Query MyGene.Info\n",
136 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n",
137 | "print len(match_list), 'Matched query results'"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": 7,
143 | "metadata": {
144 | "collapsed": false
145 | },
146 | "outputs": [
147 | {
148 | "name": "stdout",
149 | "output_type": "stream",
150 | "text": [
151 | "Queries without full matching results found: 1198\n",
152 | "The history saving thread hit an unexpected error (OperationalError('database is locked',)).History will not be written to the database.\n",
153 | "\n",
154 | "207 Queries with mutliple matches found\n",
155 | "\n",
156 | "Query mapping table/dictionary construction complete: 83.92 seconds\n"
157 | ]
158 | }
159 | ],
160 | "source": [
161 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)"
162 | ]
163 | },
164 | {
165 | "cell_type": "markdown",
166 | "metadata": {},
167 | "source": [
168 | "## Construct Converted Network"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": 8,
174 | "metadata": {
175 | "collapsed": true
176 | },
177 | "outputs": [],
178 | "source": [
179 | "query_edgelist = Mentha_Human_only[[0, 1, 14]].drop_duplicates().values.tolist()"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 9,
185 | "metadata": {
186 | "collapsed": true
187 | },
188 | "outputs": [],
189 | "source": [
190 | "# Format edge list by removing 'uniprot:' prefix from all interactors\n",
191 | "query_edgelist_fmt = [[gct.get_identifier_without_prefix(edge[0]), gct.get_identifier_without_prefix(edge[1]), float(edge[2].split(':')[-1])] for edge in query_edgelist]"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": 10,
197 | "metadata": {
198 | "collapsed": true
199 | },
200 | "outputs": [],
201 | "source": [
202 | "# Convert network edge list to symbol\n",
203 | "Mentha_edgelist_symbol = gct.convert_edgelist(query_edgelist_fmt, query_to_symbol, weighted=True)"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 11,
209 | "metadata": {
210 | "collapsed": false
211 | },
212 | "outputs": [
213 | {
214 | "name": "stdout",
215 | "output_type": "stream",
216 | "text": [
217 | "327857 input edges\n",
218 | "3247 self-edges removed\n",
219 | "8219 edges with un-mapped genes removed\n",
220 | "53515 duplicate edges removed\n",
221 | "Edge list filtered: 1.61 seconds\n",
222 | "262876 Edges remaining\n"
223 | ]
224 | }
225 | ],
226 | "source": [
227 | "# Filter converted edge list\n",
228 | "Mentha_edgelist_symbol_filt = gct.filter_converted_edgelist(Mentha_edgelist_symbol, weighted=True)"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": 12,
234 | "metadata": {
235 | "collapsed": false
236 | },
237 | "outputs": [
238 | {
239 | "name": "stdout",
240 | "output_type": "stream",
241 | "text": [
242 | "Edge list saved: 0.79 seconds\n"
243 | ]
244 | }
245 | ],
246 | "source": [
247 | "# Save filtered, converted edge list to file\n",
248 | "gct.write_edgelist(Mentha_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/Mentha_Symbol.sif', binary=False)"
249 | ]
250 | },
251 | {
252 | "cell_type": "code",
253 | "execution_count": 13,
254 | "metadata": {
255 | "collapsed": false
256 | },
257 | "outputs": [
258 | {
259 | "name": "stdout",
260 | "output_type": "stream",
261 | "text": [
262 | "90.0% score: 0.454\n",
263 | "22886 / 262876 edges retained\n"
264 | ]
265 | }
266 | ],
267 | "source": [
268 | "# Create filtered network\n",
269 | "Mentha90_edgelist = dit.filter_weighted_network_sif(wd+'Network_SIFs_Symbol/Mentha_Symbol.sif', nodeA_col=0, nodeB_col=1, score_col=2, \n",
270 | " q=0.9, delimiter='\\t', verbose=True, save_path=wd+'Network_SIFs_Symbol/Mentha90_Symbol.sif')"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": null,
276 | "metadata": {
277 | "collapsed": true
278 | },
279 | "outputs": [],
280 | "source": []
281 | }
282 | ],
283 | "metadata": {
284 | "kernelspec": {
285 | "display_name": "Python 2",
286 | "language": "python",
287 | "name": "python2"
288 | },
289 | "language_info": {
290 | "codemirror_mode": {
291 | "name": "ipython",
292 | "version": 2
293 | },
294 | "file_extension": ".py",
295 | "mimetype": "text/x-python",
296 | "name": "python",
297 | "nbconvert_exporter": "python",
298 | "pygments_lexer": "ipython2",
299 | "version": "2.7.11"
300 | }
301 | },
302 | "nbformat": 4,
303 | "nbformat_minor": 0
304 | }
305 |
--------------------------------------------------------------------------------
/Network Processing Notebooks/MultiNet Processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from network_evaluation_tools import gene_conversion_tools as gct\n",
12 | "from network_evaluation_tools import data_import_tools as dit\n",
13 | "import pandas as pd\n",
14 | "import itertools"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Load MultiNet Raw Data\n",
22 | "#### Source: http://homes.gersteinlab.org/Khurana-PLoSCompBio-2013/\n",
23 | "Downloaded: August 12, 2016 \n",
24 | "Last Updated: March 17, 2013 \n",
25 | "Processing Notes: MultiNet has labels indicating which interactions are noted as PPI and which are not. In the initial case, we will be examining all interaction information for MultiNet. However, in this case it is simple enough to parse the PPI-only information from the data, and this can be done in future work if necessary."
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 4,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [
35 | {
36 | "name": "stdout",
37 | "output_type": "stream",
38 | "text": [
39 | "Raw edge count in MultiNet: 109598\n"
40 | ]
41 | }
42 | ],
43 | "source": [
44 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n",
45 | "MultiNet_Raw = pd.read_csv(wd+'Network_Data_Raw/Multinet.interactions.network_presence_2013_03_17.txt',sep='\\t')\n",
46 | "print 'Raw edge count in MultiNet:', MultiNet_Raw.shape[0]"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 8,
52 | "metadata": {
53 | "collapsed": false
54 | },
55 | "outputs": [],
56 | "source": [
57 | "# Build edge list from interaction column. The two parts of the interaction name on either side of '_' are gene symbols\n",
58 | "MultiNet_edgelist = [interaction.split('_') for interaction in MultiNet_Raw['INTERACTION_NAME']]"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 9,
64 | "metadata": {
65 | "collapsed": false
66 | },
67 | "outputs": [],
68 | "source": [
69 | "# Sort each edge representation for filtering\n",
70 | "MultiNet_edgelist_sorted = [sorted(edge) for edge in MultiNet_edgelist]"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 10,
76 | "metadata": {
77 | "collapsed": false
78 | },
79 | "outputs": [
80 | {
81 | "name": "stdout",
82 | "output_type": "stream",
83 | "text": [
84 | "109598 input edges\n",
85 | "0 self-edges removed\n",
86 | "0 edges with un-mapped genes removed\n",
87 | "0 duplicate edges removed\n",
88 | "Edge list filtered: 0.31 seconds\n",
89 | "109598 Edges remaining\n"
90 | ]
91 | }
92 | ],
93 | "source": [
94 | "# Filter edgelist for duplicate nodes and for self-edges\n",
95 | "MultiNet_edgelist_filt = gct.filter_converted_edgelist(MultiNet_edgelist_sorted)"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 13,
101 | "metadata": {
102 | "collapsed": false
103 | },
104 | "outputs": [
105 | {
106 | "name": "stdout",
107 | "output_type": "stream",
108 | "text": [
109 | "Edge list saved: 0.11 seconds\n"
110 | ]
111 | }
112 | ],
113 | "source": [
114 | "# Save genelist to file\n",
115 | "outdir = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/Network_SIFs_Symbol/'\n",
116 | "gct.write_edgelist(MultiNet_edgelist_filt, outdir+'MultiNet_Symbol.sif')"
117 | ]
118 | }
119 | ],
120 | "metadata": {
121 | "kernelspec": {
122 | "display_name": "Python 2",
123 | "language": "python",
124 | "name": "python2"
125 | },
126 | "language_info": {
127 | "codemirror_mode": {
128 | "name": "ipython",
129 | "version": 2
130 | },
131 | "file_extension": ".py",
132 | "mimetype": "text/x-python",
133 | "name": "python",
134 | "nbconvert_exporter": "python",
135 | "pygments_lexer": "ipython2",
136 | "version": "2.7.11"
137 | }
138 | },
139 | "nbformat": 4,
140 | "nbformat_minor": 0
141 | }
142 |
--------------------------------------------------------------------------------
/Network Processing Notebooks/PID Processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from network_evaluation_tools import gene_conversion_tools as gct\n",
12 | "from network_evaluation_tools import data_import_tools as dit\n",
13 | "import pandas as pd\n",
14 | "import time"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Load PID Raw Data\n",
22 | "#### Source: http://www.pathwaycommons.org/archives/PC2/v9/PathwayCommons9.pid.hgnc.sif.gz\n",
23 | "Downloaded: June 19, 2017 \n",
24 | "Last Updated (via Pathway Commons v8 datasources.txt file): July 27, 2010 \n",
25 | "Note: The text file has more lines than the sif file in Pathway Commons. However, the text file has some interactions that are unclear how to resolve so for this case we will use the sif file provided by Pathway Commons \n",
26 | "Also note: This network contains some interactions with CHEBI small molecules. These interactions will be removed"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 3,
32 | "metadata": {
33 | "collapsed": false
34 | },
35 | "outputs": [],
36 | "source": [
37 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n",
38 | "PID_Raw = pd.read_csv(wd+'Network_Data_Raw/PathwayCommons9.pid.hgnc.sif',sep='\\t', header=-1)\n",
39 | "print 'Raw interactions in NCI PID:', PID_Raw.shape[0]"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 10,
45 | "metadata": {
46 | "collapsed": false
47 | },
48 | "outputs": [
49 | {
50 | "name": "stdout",
51 | "output_type": "stream",
52 | "text": [
53 | "Protein-Protein interactions in NCI PID: 27489\n"
54 | ]
55 | }
56 | ],
57 | "source": [
58 | "# Filter all interactions that contain a CHEBI: item\n",
59 | "PID_filt = PID_Raw[(~PID_Raw[0].str.contains(':')) & (~PID_Raw[2].str.contains(':'))]\n",
60 | "PID_edgelist = PID_filt[[0, 2]].values.tolist()\n",
61 | "print 'Protein-Protein interactions in NCI PID:', len(PID_edgelist)"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 11,
67 | "metadata": {
68 | "collapsed": true
69 | },
70 | "outputs": [],
71 | "source": [
72 | "# Sort each edge representation for filtering\n",
73 | "PID_edgelist_sorted = [sorted(edge) for edge in PID_edgelist]"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 12,
79 | "metadata": {
80 | "collapsed": false
81 | },
82 | "outputs": [
83 | {
84 | "name": "stdout",
85 | "output_type": "stream",
86 | "text": [
87 | "27489 input edges\n",
88 | "0 self-edges removed\n",
89 | "0 edges with un-mapped genes removed\n",
90 | "6047 duplicate edges removed\n",
91 | "Edge list filtered: 0.11 seconds\n",
92 | "21442 Edges remaining\n"
93 | ]
94 | }
95 | ],
96 | "source": [
97 | "# Filter edgelist for duplicate nodes and for self-edges\n",
98 | "PID_edgelist_filt = gct.filter_converted_edgelist(PID_edgelist_sorted)"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 14,
104 | "metadata": {
105 | "collapsed": false
106 | },
107 | "outputs": [
108 | {
109 | "name": "stdout",
110 | "output_type": "stream",
111 | "text": [
112 | "Edge list saved: 0.06 seconds\n"
113 | ]
114 | }
115 | ],
116 | "source": [
117 | "# Save genelist to file\n",
118 | "outdir = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/Network_SIFs_Symbol/'\n",
119 | "gct.write_edgelist(PID_edgelist_filt, outdir+'PID_Symbol.sif')"
120 | ]
121 | }
122 | ],
123 | "metadata": {
124 | "kernelspec": {
125 | "display_name": "Python 2",
126 | "language": "python",
127 | "name": "python2"
128 | },
129 | "language_info": {
130 | "codemirror_mode": {
131 | "name": "ipython",
132 | "version": 2
133 | },
134 | "file_extension": ".py",
135 | "mimetype": "text/x-python",
136 | "name": "python",
137 | "nbconvert_exporter": "python",
138 | "pygments_lexer": "ipython2",
139 | "version": "2.7.11"
140 | }
141 | },
142 | "nbformat": 4,
143 | "nbformat_minor": 0
144 | }
145 |
--------------------------------------------------------------------------------
/Network Processing Notebooks/Pathway Commons Processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from network_evaluation_tools import gene_conversion_tools as gct\n",
12 | "from network_evaluation_tools import data_import_tools as dit\n",
13 | "import pandas as pd\n",
14 | "import time"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Load Pathway Commons Raw Data (All interactions)\n",
22 | "#### Source: http://www.pathwaycommons.org/archives/PC2/v9/PathwayCommons9.All.hgnc.txt.gz\n",
23 | "Downloaded: June 15, 2017 \n",
24 | "Last Updated: May 25, 2017 \n",
25 | "Citation: Pathway Commons, a web resource for biological pathway data. Cerami E et al. Nucleic Acids Research (2011). \n",
26 | "A Note about filtering interactions: Pathway Commons also contains interactions between proteins and small molecules from the CHEBI database. These interactions will need to be filtered out as they are not protein-protein interactions. \n",
27 | "Also note: The text file has more lines than the sif file in Pathway Commons. However, the text file has some interactions that are unclear how to resolve so for this case we will use the sif file provided by Pathway Commons"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 9,
33 | "metadata": {
34 | "collapsed": false
35 | },
36 | "outputs": [
37 | {
38 | "name": "stdout",
39 | "output_type": "stream",
40 | "text": [
41 | "Raw interactions in Pathway Commons v9: 1503144\n"
42 | ]
43 | }
44 | ],
45 | "source": [
46 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n",
47 | "PC_Raw = pd.read_csv(wd+'Network_Data_Raw/PathwayCommons9.All.hgnc.sif', sep='\\t', header=-1)\n",
48 | "print 'Raw interactions in Pathway Commons v9:', PC_Raw.shape[0]"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 25,
54 | "metadata": {
55 | "collapsed": false
56 | },
57 | "outputs": [
58 | {
59 | "name": "stdout",
60 | "output_type": "stream",
61 | "text": [
62 | "Protein-Protein interactions in Pathway Commons v9: 968186\n"
63 | ]
64 | }
65 | ],
66 | "source": [
67 | "# Filter all interactions that contain a CHEBI: item\n",
68 | "PC_filt = PC_Raw[(~PC_Raw[0].str.contains(':')) & (~PC_Raw[2].str.contains(':'))]\n",
69 | "PC_edgelist = PC_filt[[0, 2]].values.tolist()\n",
70 | "print 'Protein-Protein interactions in Pathway Commons v9:', len(PC_edgelist)"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 26,
76 | "metadata": {
77 | "collapsed": true
78 | },
79 | "outputs": [],
80 | "source": [
81 | "# Sort each edge representation for filtering\n",
82 | "PC_edgelist_sorted = [sorted(edge) for edge in PC_edgelist]"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 27,
88 | "metadata": {
89 | "collapsed": false
90 | },
91 | "outputs": [
92 | {
93 | "name": "stdout",
94 | "output_type": "stream",
95 | "text": [
96 | "968186 input edges\n",
97 | "0 self-edges removed\n",
98 | "0 edges with un-mapped genes removed\n",
99 | "143511 duplicate edges removed\n",
100 | "Edge list filtered: 1.92 seconds\n",
101 | "824675 Edges remaining\n"
102 | ]
103 | }
104 | ],
105 | "source": [
106 | "# Filter edgelist for duplicate nodes and for self-edges\n",
107 | "PC_edgelist_filt = gct.filter_converted_edgelist(PC_edgelist_sorted)"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 28,
113 | "metadata": {
114 | "collapsed": false
115 | },
116 | "outputs": [
117 | {
118 | "name": "stdout",
119 | "output_type": "stream",
120 | "text": [
121 | "Edge list saved: 0.55 seconds\n"
122 | ]
123 | }
124 | ],
125 | "source": [
126 | "# Save genelist to file\n",
127 | "outdir = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/Network_SIFs_Symbol/'\n",
128 | "gct.write_edgelist(PC_edgelist_filt, outdir+'PathwayCommons_Symbol.sif')"
129 | ]
130 | }
131 | ],
132 | "metadata": {
133 | "kernelspec": {
134 | "display_name": "Python 2",
135 | "language": "python",
136 | "name": "python2"
137 | },
138 | "language_info": {
139 | "codemirror_mode": {
140 | "name": "ipython",
141 | "version": 2
142 | },
143 | "file_extension": ".py",
144 | "mimetype": "text/x-python",
145 | "name": "python",
146 | "nbconvert_exporter": "python",
147 | "pygments_lexer": "ipython2",
148 | "version": "2.7.11"
149 | }
150 | },
151 | "nbformat": 4,
152 | "nbformat_minor": 0
153 | }
154 |
--------------------------------------------------------------------------------
/Network Processing Notebooks/Reactome Processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from network_evaluation_tools import gene_conversion_tools as gct\n",
12 | "from network_evaluation_tools import data_import_tools as dit\n",
13 | "import pandas as pd\n",
14 | "import itertools\n",
15 | "import time"
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {},
21 | "source": [
22 | "## Load Reactome Raw Data\n",
23 | "#### Source: http://www.reactome.org/download/current/homo_sapiens.interactions.txt.gz\n",
24 | "#### File to download: The link labelled \"Human protein-protein interaction pairs in tab-delimited format\" seems to have many more interactions than the MITAB file format. This is the file that we will use for this network.\n",
25 | "Downloaded: June 15, 2017 \n",
26 | "Last Updated: April 20, 2017 "
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {
33 | "collapsed": false
34 | },
35 | "outputs": [
36 | {
37 | "name": "stdout",
38 | "output_type": "stream",
39 | "text": [
40 | "Raw Edges in Reactome v60: 2523567\n"
41 | ]
42 | }
43 | ],
44 | "source": [
45 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n",
46 | "Reactome_Raw = pd.read_csv(wd+'Network_Data_Raw/Reactome_v60.interactions.txt',sep='\\t',skiprows=1, header=-1, low_memory=False)\n",
47 | "print 'Raw Edges in Reactome v60:', len(Reactome_Raw)"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 3,
53 | "metadata": {
54 | "collapsed": false
55 | },
56 | "outputs": [
57 | {
58 | "name": "stdout",
59 | "output_type": "stream",
60 | "text": [
61 | "214432 Raw Reactome Edges after removing duplicate edges\n",
62 | "210066 Raw Reactome Edges after removing duplicate and self-edges\n"
63 | ]
64 | }
65 | ],
66 | "source": [
67 | "# Get edge list of network (filter for duplicate edges and self-edges)\n",
68 | "query_edgelist_filt = Reactome_Raw[[0,3]].drop_duplicates()\n",
69 | "print len(query_edgelist_filt), \"Raw Reactome Edges after removing duplicate edges\"\n",
70 | "query_edgelist_filt2 = query_edgelist_filt[query_edgelist_filt[0]!=query_edgelist_filt[3]]\n",
71 | "print len(query_edgelist_filt2), \"Raw Reactome Edges after removing duplicate and self-edges\"\n",
72 | "query_edgelist = query_edgelist_filt2.values.tolist()"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 4,
78 | "metadata": {
79 | "collapsed": true
80 | },
81 | "outputs": [],
82 | "source": [
83 | "# Extract gene list\n",
84 | "Reactome_Raw_Genes = list(set(query_edgelist_filt2[0]).union(set(query_edgelist_filt2[3])))"
85 | ]
86 | },
87 | {
88 | "cell_type": "markdown",
89 | "metadata": {},
90 | "source": [
91 | "## Convert Genes from UniProtKB to Symbol"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": 5,
97 | "metadata": {
98 | "collapsed": false
99 | },
100 | "outputs": [
101 | {
102 | "name": "stdout",
103 | "output_type": "stream",
104 | "text": [
105 | "8387 Valid Query Genes\n",
106 | "0 Invalid Query Genes\n"
107 | ]
108 | }
109 | ],
110 | "source": [
111 | "query_string, valid_genes, invalid_genes = gct.query_constructor(Reactome_Raw_Genes)"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 6,
117 | "metadata": {
118 | "collapsed": false
119 | },
120 | "outputs": [
121 | {
122 | "name": "stdout",
123 | "output_type": "stream",
124 | "text": [
125 | "Batch query complete: 13.56 seconds\n",
126 | "8518 Matched query results\n"
127 | ]
128 | }
129 | ],
130 | "source": [
131 | "# Set scopes (gene naming systems to search)\n",
132 | "scopes = \"uniprot\"\n",
133 | "\n",
134 | "# Set fields (systems from which to return gene names from)\n",
135 | "fields = \"symbol, entrezgene\"\n",
136 | "\n",
137 | "# Query MyGene.Info\n",
138 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n",
139 | "print len(match_list), 'Matched query results'"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 7,
145 | "metadata": {
146 | "collapsed": false
147 | },
148 | "outputs": [
149 | {
150 | "name": "stdout",
151 | "output_type": "stream",
152 | "text": [
153 | "Queries without full matching results found: 511\n",
154 | "\n",
155 | "102 Queries with mutliple matches found\n",
156 | "\n",
157 | "Query mapping table/dictionary construction complete: 17.83 seconds\n"
158 | ]
159 | }
160 | ],
161 | "source": [
162 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)"
163 | ]
164 | },
165 | {
166 | "cell_type": "markdown",
167 | "metadata": {},
168 | "source": [
169 | "## Construct Converted Network"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": 8,
175 | "metadata": {
176 | "collapsed": false
177 | },
178 | "outputs": [],
179 | "source": [
180 | "# Format edge list by removing prefixes from all interactors\n",
181 | "query_edgelist_fmt = [[gct.get_identifier_without_prefix(edge[0]), gct.get_identifier_without_prefix(edge[1])] for edge in query_edgelist]"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 9,
187 | "metadata": {
188 | "collapsed": true
189 | },
190 | "outputs": [],
191 | "source": [
192 | "# Convert network edge list to symbol\n",
193 | "Reactome_edgelist_symbol = gct.convert_edgelist(query_edgelist_fmt, query_to_symbol, weighted=False)"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": 11,
199 | "metadata": {
200 | "collapsed": false
201 | },
202 | "outputs": [
203 | {
204 | "name": "stdout",
205 | "output_type": "stream",
206 | "text": [
207 | "210066 input edges\n",
208 | "2708 self-edges removed\n",
209 | "10886 edges with un-mapped genes removed\n",
210 | "1970 duplicate edges removed\n",
211 | "Edge list filtered: 0.51 seconds\n",
212 | "194502 Edges remaining\n"
213 | ]
214 | }
215 | ],
216 | "source": [
217 | "# Filter converted edge list\n",
218 | "Reactome_edgelist_symbol_filt = gct.filter_converted_edgelist(Reactome_edgelist_symbol, weighted=False)"
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": 12,
224 | "metadata": {
225 | "collapsed": false
226 | },
227 | "outputs": [
228 | {
229 | "name": "stdout",
230 | "output_type": "stream",
231 | "text": [
232 | "Edge list saved: 0.59 seconds\n"
233 | ]
234 | }
235 | ],
236 | "source": [
237 | "# Save filtered, converted edge list to file\n",
238 | "gct.write_edgelist(Reactome_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/Reactome_Symbol.sif', binary=True)"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": null,
244 | "metadata": {
245 | "collapsed": true
246 | },
247 | "outputs": [],
248 | "source": []
249 | }
250 | ],
251 | "metadata": {
252 | "kernelspec": {
253 | "display_name": "Python 2",
254 | "language": "python",
255 | "name": "python2"
256 | },
257 | "language_info": {
258 | "codemirror_mode": {
259 | "name": "ipython",
260 | "version": 2
261 | },
262 | "file_extension": ".py",
263 | "mimetype": "text/x-python",
264 | "name": "python",
265 | "nbconvert_exporter": "python",
266 | "pygments_lexer": "ipython2",
267 | "version": "2.7.11"
268 | }
269 | },
270 | "nbformat": 4,
271 | "nbformat_minor": 0
272 | }
273 |
--------------------------------------------------------------------------------
/Network Processing Notebooks/Reactome-FIs Processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from network_evaluation_tools import gene_conversion_tools as gct\n",
12 | "from network_evaluation_tools import data_import_tools as dit\n",
13 | "import pandas as pd\n",
14 | "import itertools\n",
15 | "import time"
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {},
21 | "source": [
22 | "## Load Reactome-Functional Interactions Raw Data\n",
23 | "#### Source: http://reactomews.oicr.on.ca:8080/caBigR3WebApp2016/FIsInGene_022717_with_annotations.txt.zip\n",
24 | "Downloaded: June 15, 2017 \n",
25 | "Last Updated: February 27, 2017 \n",
26 | "Note about processing: It looks like most of the edges are given as gene symbols but many of them seem to be invalid names, so we will use some of the gene conversion tools to filter these results as best we can."
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {
33 | "collapsed": false
34 | },
35 | "outputs": [
36 | {
37 | "name": "stdout",
38 | "output_type": "stream",
39 | "text": [
40 | "Raw edges in ReactomeFI: 230243\n"
41 | ]
42 | }
43 | ],
44 | "source": [
45 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n",
46 | "Reactome_FIs_Raw = pd.read_csv(wd+'Network_Data_Raw/FIsInGene_022717_with_annotations.txt',sep='\\t')\n",
47 | "print 'Raw edges in ReactomeFI:', Reactome_FIs_Raw.shape[0]"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 3,
53 | "metadata": {
54 | "collapsed": true
55 | },
56 | "outputs": [],
57 | "source": [
58 | "# Extract gene list\n",
59 | "Reactome_FIs_Raw_Genes = list(set(Reactome_FIs_Raw['Gene1']).union(set(Reactome_FIs_Raw['Gene2'])))"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 4,
65 | "metadata": {
66 | "collapsed": false
67 | },
68 | "outputs": [
69 | {
70 | "name": "stdout",
71 | "output_type": "stream",
72 | "text": [
73 | "12254 Valid Query Genes\n",
74 | "23 Invalid Query Genes:\n",
75 | "['YWHAE/FAM22B FUSION', 'RUNX1/C20ORF112 FUSION', 'IGKV A18', 'APC VARIANT PROTEIN', 'STAG1 VARIANT PROTEIN', 'MIR CL-10', 'BETA 2-MICROGLOBULIN', 'BCR/ABL FUSION', 'ATP2B2 VARIANT PROTEIN', 'ITGA7 VARIANT PROTEIN', 'CREB-1', 'CD40 LIGAND', 'NUMA1 VARIANT PROTEIN', 'PIK4CA VARIANT PROTEIN', 'EPHB2 VARIANT PROTEIN', 'RUNX1/CBFA2T2 FUSION', 'TNC VARIANT PROTEIN', 'PIK3C2B VARIANT PROTEIN', 'PLCG1 VARIANT PROTEIN', 'WUGSC:H_GS165O14.2', 'PIK3CA VARIANT PROTEIN', 'YWHAE/FAM22A FUSION', 'PDHA1/LOC79064']\n"
76 | ]
77 | }
78 | ],
79 | "source": [
80 | "# Find \"invalid genes\" by text format\n",
81 | "query_string, valid_genes, invalid_genes = gct.query_constructor(Reactome_FIs_Raw_Genes, exclude_prefixes=['CHEBI'], print_invalid_genes=True)"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": 5,
87 | "metadata": {
88 | "collapsed": false
89 | },
90 | "outputs": [],
91 | "source": [
92 | "# Get Edgelist of network\n",
93 | "query_edgelist = Reactome_FIs_Raw[['Gene1','Gene2', 'Score']].values.tolist()"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": 6,
99 | "metadata": {
100 | "collapsed": false
101 | },
102 | "outputs": [
103 | {
104 | "name": "stdout",
105 | "output_type": "stream",
106 | "text": [
107 | "820 / 230243 edges with invalid nodes removed\n"
108 | ]
109 | }
110 | ],
111 | "source": [
112 | "# Filter query edges\n",
113 | "query_edgelist_filt = gct.filter_query_edgelist(query_edgelist,invalid_genes)"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": 7,
119 | "metadata": {
120 | "collapsed": false
121 | },
122 | "outputs": [
123 | {
124 | "name": "stdout",
125 | "output_type": "stream",
126 | "text": [
127 | "229423 input edges\n",
128 | "0 self-edges removed\n",
129 | "0 edges with un-mapped genes removed\n",
130 | "0 duplicate edges removed\n",
131 | "Edge list filtered: 1.95 seconds\n",
132 | "229423 Edges remaining\n"
133 | ]
134 | }
135 | ],
136 | "source": [
137 | "# Filter edge list\n",
138 | "ReactomeFI_edgelist_filt = gct.filter_converted_edgelist(query_edgelist_filt, weighted=True)"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 8,
144 | "metadata": {
145 | "collapsed": false
146 | },
147 | "outputs": [
148 | {
149 | "name": "stdout",
150 | "output_type": "stream",
151 | "text": [
152 | "Edge list saved: 0.68 seconds\n"
153 | ]
154 | }
155 | ],
156 | "source": [
157 | "# Save filtered, converted edge list to file\n",
158 | "gct.write_edgelist(ReactomeFI_edgelist_filt, wd+'Network_SIFs_Symbol/ReactomeFI_Symbol.sif', binary=False)"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": 36,
164 | "metadata": {
165 | "collapsed": false
166 | },
167 | "outputs": [
168 | {
169 | "name": "stdout",
170 | "output_type": "stream",
171 | "text": [
172 | "90.0% score: 1.0\n",
173 | "0 / 229423 edges retained\n"
174 | ]
175 | }
176 | ],
177 | "source": [
178 | "# Create filtered network\n",
179 | "ReactomeFI90_edgelist = dit.filter_weighted_network_sif(wd+'Network_SIFs_Symbol/ReactomeFI_Symbol.sif', nodeA_col=0, nodeB_col=1, score_col=2, \n",
180 | " q=0.9, delimiter='\\t', verbose=True, save_path=wd+'Network_SIFs_Symbol/ReactomeFI90_edgelist_Symbol.sif')"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": 37,
186 | "metadata": {
187 | "collapsed": false
188 | },
189 | "outputs": [
190 | {
191 | "name": "stdout",
192 | "output_type": "stream",
193 | "text": [
194 | "198541 / 229423 edges kept, 0.86539274615\n"
195 | ]
196 | }
197 | ],
198 | "source": [
199 | "# The filter function didn't work here because the max value makes up >90% of the edges. \n",
200 | "# We need to filter but keep all max edges instead\n",
201 | "ReactomeFI_edgelist = pd.DataFrame(ReactomeFI_edgelist_filt, columns=['NodeA', 'NodeB', 'Score'])\n",
202 | "q_score = ReactomeFI_edgelist['Score'].quantile(0.9)\n",
203 | "ReactomeFI_edgelist_filt2 = ReactomeFI_edgelist[ReactomeFI_edgelist['Score']>=q_score]\n",
204 | "print ReactomeFI_edgelist_filt2.shape[0], '/', ReactomeFI_edgelist.shape[0], 'edges kept, ', float(ReactomeFI_edgelist_filt2.shape[0])/ReactomeFI_edgelist.shape[0]"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {
211 | "collapsed": true
212 | },
213 | "outputs": [],
214 | "source": [
215 | "# Essentially >85% of the edges have the 'maximum score' which makes almost no sense for filtering further"
216 | ]
217 | }
218 | ],
219 | "metadata": {
220 | "kernelspec": {
221 | "display_name": "Python 2",
222 | "language": "python",
223 | "name": "python2"
224 | },
225 | "language_info": {
226 | "codemirror_mode": {
227 | "name": "ipython",
228 | "version": 2
229 | },
230 | "file_extension": ".py",
231 | "mimetype": "text/x-python",
232 | "name": "python",
233 | "nbconvert_exporter": "python",
234 | "pygments_lexer": "ipython2",
235 | "version": "2.7.11"
236 | }
237 | },
238 | "nbformat": 4,
239 | "nbformat_minor": 0
240 | }
241 |
--------------------------------------------------------------------------------
/Network Processing Notebooks/STRING Processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from network_evaluation_tools import gene_conversion_tools as gct\n",
12 | "from network_evaluation_tools import data_import_tools as dit\n",
13 | "import pandas as pd\n",
14 | "import time"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "## Load STRING Raw Data\n",
22 | "#### Source: http://string-db.org/download/protein.links.v10.5.txt.gz\n",
23 | "#### Source (detailed): http://string-db.org/download/protein.links.detailed.v10.5.txt.gz\n",
24 | "#### File to download: The link labelled 'protein.links.v10.5.txt.gz' is simply the binary file version of the 'detailed' file. The detailed file documents the types of interactions and support for each interaction. It can be used for filtering in the future if desired, but will not be filtered on those categories currently.\n",
25 | "Downloaded: June 15, 2016 \n",
26 | "Last Updated: May 14, 2017\t\n",
27 | "Processing note: This data needs to be filtered for human-only interactions. This is a very long and large file, so we will parse the edges that are human-human interactions only by streaming the file. Then the resulting human-human interaction file will be read to be processed."
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 4,
33 | "metadata": {
34 | "collapsed": false
35 | },
36 | "outputs": [
37 | {
38 | "name": "stdout",
39 | "output_type": "stream",
40 | "text": [
41 | "Filtered human-human STRING interactions only: 1793.17046094 seconds\n"
42 | ]
43 | }
44 | ],
45 | "source": [
46 | "# Load and filter STRING for only human-human protein interactions\n",
47 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n",
48 | "starttime=time.time()\n",
49 | "g=open(wd+'Network_Data_Raw/STRING/STRING_human_v10.5.txt','w')\n",
50 | "with open(wd+'Network_Data_Raw/STRING/protein.links.v10.5.txt') as f:\n",
51 | " for line in f:\n",
52 | " edge = line.split(' ')\n",
53 | " if edge[0].startswith('9606') and edge[1].startswith('9606'):\n",
54 | " g.write(edge[0].split('.')[1]+'\\t'+edge[1].split('.')[1]+'\\t'+edge[2]+'\\n')\n",
55 | "print 'Filtered human-human STRING interactions only:', time.time()-starttime, 'seconds'\n",
56 | "g.close()"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | "## Load human-filtered STRING edges"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 2,
69 | "metadata": {
70 | "collapsed": false
71 | },
72 | "outputs": [
73 | {
74 | "name": "stdout",
75 | "output_type": "stream",
76 | "text": [
77 | "Raw Edges in STRING v10.5: 11353056\n"
78 | ]
79 | }
80 | ],
81 | "source": [
82 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n",
83 | "STRING_Raw = pd.read_csv(wd+'Network_Data_Raw/STRING/STRING_human_v10.5.txt',sep='\\t',header=-1)\n",
84 | "STRING_Raw.columns = ['NodeA', 'NodeB', 'Score']\n",
85 | "print 'Raw Edges in STRING v10.5:', len(STRING_Raw)"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 3,
91 | "metadata": {
92 | "collapsed": false
93 | },
94 | "outputs": [
95 | {
96 | "name": "stdout",
97 | "output_type": "stream",
98 | "text": [
99 | "Edges in STRING v10.5 after dropping duplicates: 11353056\n"
100 | ]
101 | }
102 | ],
103 | "source": [
104 | "STRING_Raw_filt = STRING_Raw.drop_duplicates()\n",
105 | "print 'Edges in STRING v10.5 after dropping duplicates:', len(STRING_Raw_filt)"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 4,
111 | "metadata": {
112 | "collapsed": false
113 | },
114 | "outputs": [
115 | {
116 | "name": "stdout",
117 | "output_type": "stream",
118 | "text": [
119 | "The history saving thread hit an unexpected error (OperationalError('database is locked',)).History will not be written to the database.\n"
120 | ]
121 | }
122 | ],
123 | "source": [
124 | "STRING_Genes = list(set(STRING_Raw_filt['NodeA']).union(set(STRING_Raw_filt['NodeB'])))"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": 5,
130 | "metadata": {
131 | "collapsed": false
132 | },
133 | "outputs": [],
134 | "source": [
135 | "query_edgelist = STRING_Raw_filt[['NodeA', 'NodeB', 'Score']].values.tolist()"
136 | ]
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "metadata": {},
141 | "source": [
142 | "## Convert Genes from Ensembl Protein to Hugo Symbol"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": 6,
148 | "metadata": {
149 | "collapsed": false
150 | },
151 | "outputs": [
152 | {
153 | "name": "stdout",
154 | "output_type": "stream",
155 | "text": [
156 | "19576 Valid Query Genes\n",
157 | "0 Invalid Query Genes\n"
158 | ]
159 | }
160 | ],
161 | "source": [
162 | "query_string, valid_genes, invalid_genes = gct.query_constructor(STRING_Genes)"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": 7,
168 | "metadata": {
169 | "collapsed": false
170 | },
171 | "outputs": [
172 | {
173 | "name": "stdout",
174 | "output_type": "stream",
175 | "text": [
176 | "Batch query complete: 23.11 seconds\n",
177 | "19578 Matched query results\n"
178 | ]
179 | }
180 | ],
181 | "source": [
182 | "# Set scopes (gene naming systems to search)\n",
183 | "scopes = \"ensemblprotein\"\n",
184 | "\n",
185 | "# Set fields (systems from which to return gene names from)\n",
186 | "fields = \"symbol, entrezgene\"\n",
187 | "\n",
188 | "# Query MyGene.Info\n",
189 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n",
190 | "print len(match_list), 'Matched query results'"
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": 8,
196 | "metadata": {
197 | "collapsed": false
198 | },
199 | "outputs": [
200 | {
201 | "name": "stdout",
202 | "output_type": "stream",
203 | "text": [
204 | "Queries without full matching results found: 1584\n",
205 | "\n",
206 | "1 Queries with mutliple matches found\n",
207 | "\n",
208 | "Query mapping table/dictionary construction complete: 115.61 seconds\n"
209 | ]
210 | }
211 | ],
212 | "source": [
213 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)"
214 | ]
215 | },
216 | {
217 | "cell_type": "markdown",
218 | "metadata": {},
219 | "source": [
220 | "## Construct Converted Network"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": 9,
226 | "metadata": {
227 | "collapsed": false
228 | },
229 | "outputs": [
230 | {
231 | "name": "stdout",
232 | "output_type": "stream",
233 | "text": [
234 | "CPU times: user 26.7 s, sys: 2.74 s, total: 29.5 s\n",
235 | "Wall time: 29.2 s\n"
236 | ]
237 | }
238 | ],
239 | "source": [
240 | "%%time\n",
241 | "# Convert weighted edge list\n",
242 | "STRING_edgelist_symbol = gct.convert_edgelist(query_edgelist, query_to_symbol, weighted=True)"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": 10,
248 | "metadata": {
249 | "collapsed": false
250 | },
251 | "outputs": [
252 | {
253 | "name": "stdout",
254 | "output_type": "stream",
255 | "text": [
256 | "11353056 input edges\n",
257 | "30268 self-edges removed\n",
258 | "1043874 edges with un-mapped genes removed\n",
259 | "5143146 duplicate edges removed\n",
260 | "Edge list filtered: 77.42 seconds\n",
261 | "5135768 Edges remaining\n"
262 | ]
263 | }
264 | ],
265 | "source": [
266 | "# Filter converted edge list\n",
267 | "STRING_edgelist_symbol_filt = gct.filter_converted_edgelist(STRING_edgelist_symbol, weighted=True)"
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": 11,
273 | "metadata": {
274 | "collapsed": false
275 | },
276 | "outputs": [
277 | {
278 | "name": "stdout",
279 | "output_type": "stream",
280 | "text": [
281 | "Edge list saved: 8.28 seconds\n"
282 | ]
283 | }
284 | ],
285 | "source": [
286 | "# Write network to file\n",
287 | "gct.write_edgelist(STRING_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/STRING_Symbol.sif', binary=False)"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": 12,
293 | "metadata": {
294 | "collapsed": false
295 | },
296 | "outputs": [
297 | {
298 | "name": "stdout",
299 | "output_type": "stream",
300 | "text": [
301 | "90.0% score: 497.0\n",
302 | "513035 / 5135768 edges retained\n"
303 | ]
304 | }
305 | ],
306 | "source": [
307 | "# Create filtered network\n",
308 | "STRING90_edgelist = dit.filter_weighted_network_sif(wd+'Network_SIFs_Symbol/STRING_Symbol.sif', nodeA_col=0, nodeB_col=1, score_col=2, \n",
309 | " q=0.9, delimiter='\\t', verbose=True, save_path=wd+'Network_SIFs_Symbol/STRING90_Symbol.sif')"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": null,
315 | "metadata": {
316 | "collapsed": true
317 | },
318 | "outputs": [],
319 | "source": []
320 | }
321 | ],
322 | "metadata": {
323 | "kernelspec": {
324 | "display_name": "Python 2",
325 | "language": "python",
326 | "name": "python2"
327 | },
328 | "language_info": {
329 | "codemirror_mode": {
330 | "name": "ipython",
331 | "version": 2
332 | },
333 | "file_extension": ".py",
334 | "mimetype": "text/x-python",
335 | "name": "python",
336 | "nbconvert_exporter": "python",
337 | "pygments_lexer": "ipython2",
338 | "version": "2.7.11"
339 | }
340 | },
341 | "nbformat": 4,
342 | "nbformat_minor": 0
343 | }
344 |
--------------------------------------------------------------------------------
/Network Processing Notebooks/iRefIndex Processing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from network_evaluation_tools import gene_conversion_tools as gct\n",
12 | "from network_evaluation_tools import data_import_tools as dit\n",
13 | "import pandas as pd\n",
14 | "import itertools\n",
15 | "import time"
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {},
21 | "source": [
22 | "## Load iRefIndex Raw Data\n",
23 | "#### Source (MITAB): http://irefindex.org/download/irefindex/data/archive/release_14.0/psi_mitab/MITAB2.6/9606.mitab.07042015.txt.zip\n",
24 | "Downloaded: July 28, 2016 \n",
25 | "Last Updated: April 20, 2015 \n",
26 |     "Notes for processing: This is the file for human protein interactions, however, not all interactions are human-human interactions. These need to be filtered. Also, all IDs without a RefSeq or UniProt ID are excluded. Custom processing for this network is described below\n",
27 | "### From iRefIndex Mapping Documentation Page:\n",
28 | "\"We have made a file which provides mappings between iRefIndex identifiers and popular external identifiers. The current files contain all UniProt and RefSeq identifiers known to the current version of iRefIndex as documented on the sources page. For specific source documentation, see the sources for each released version. \n",
29 | " \n",
30 | "Other database identifiers are provided as database/accession pairs only when the iRefIndex identifier (ROGID) does not have a corresponding UniProt or RefSeq record with an identical sequence.\" \n",
31 | " \n",
32 | "Therefore: Interactions containing an ROGID identifier will be removed"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 2,
38 | "metadata": {
39 | "collapsed": false
40 | },
41 | "outputs": [
42 | {
43 | "name": "stdout",
44 | "output_type": "stream",
45 | "text": [
46 | "Raw edge count in iRefIndex: 673100\n"
47 | ]
48 | }
49 | ],
50 | "source": [
51 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n",
52 | "iRefIndex_Raw = pd.read_csv(wd+'Network_Data_Raw/iRefIndex/9606.mitab.04072015.txt',sep='\\t')\n",
53 | "print 'Raw edge count in iRefIndex:', len(iRefIndex_Raw)"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 3,
59 | "metadata": {
60 | "collapsed": false
61 | },
62 | "outputs": [
63 | {
64 | "name": "stdout",
65 | "output_type": "stream",
66 | "text": [
67 | "Human-Human only interactions in iRefIndex: 485030\n"
68 | ]
69 | }
70 | ],
71 | "source": [
72 | "# Keep only human-human interactions\n",
73 | "iRef_Human_only = iRefIndex_Raw[(iRefIndex_Raw['taxa']=='taxid:9606(Homo sapiens)') & (iRefIndex_Raw['taxb']=='taxid:9606(Homo sapiens)')]\n",
74 | "print 'Human-Human only interactions in iRefIndex:', len(iRef_Human_only)"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 5,
80 | "metadata": {
81 | "collapsed": true
82 | },
83 | "outputs": [],
84 | "source": [
85 | "# Extract gene list\n",
86 | "Human_iRef_Genes = list(set(iRef_Human_only['#uidA']).union(set(iRef_Human_only['uidB'])))"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 6,
92 | "metadata": {
93 | "collapsed": false
94 | },
95 | "outputs": [
96 | {
97 | "name": "stdout",
98 | "output_type": "stream",
99 | "text": [
100 | "['uniprotkb', 'refseq', 'rogid']\n"
101 | ]
102 | }
103 | ],
104 | "source": [
105 | "# Get all iRef prefixes\n",
106 | "prefixes=[]\n",
107 | "for gene in Human_iRef_Genes:\n",
108 | " prefix=gene.split(':')[0]\n",
109 | " if prefix not in prefixes:\n",
110 | " prefixes.append(prefix)\n",
111 | "print prefixes"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 7,
117 | "metadata": {
118 | "collapsed": false
119 | },
120 | "outputs": [
121 | {
122 | "name": "stdout",
123 | "output_type": "stream",
124 | "text": [
125 | "485030 Human iRefIndex Edges\n"
126 | ]
127 | }
128 | ],
129 | "source": [
130 | "# Get edge list of network\n",
131 | "query_edgelist = iRef_Human_only[['#uidA','uidB']].values.tolist()\n",
132 | "print len(query_edgelist), \"Human iRefIndex Edges\""
133 | ]
134 | },
135 | {
136 | "cell_type": "markdown",
137 | "metadata": {},
138 | "source": [
139 | "## Convert Genes"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 9,
145 | "metadata": {
146 | "collapsed": false
147 | },
148 | "outputs": [
149 | {
150 | "name": "stdout",
151 | "output_type": "stream",
152 | "text": [
153 | "23906 Valid Query Genes\n",
154 | "945 Invalid Query Genes\n"
155 | ]
156 | }
157 | ],
158 | "source": [
159 | "# Construct list of genes to be submitted to MyGene.Info API (remove all genes with 'rogid' prefix)\n",
160 | "# This should only keep uniprotkb and refseq as queries\n",
161 | "query_string, valid_genes, invalid_genes = gct.query_constructor(Human_iRef_Genes, exclude_prefixes=['rogid'])"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": 10,
167 | "metadata": {
168 | "collapsed": false
169 | },
170 | "outputs": [
171 | {
172 | "name": "stdout",
173 | "output_type": "stream",
174 | "text": [
175 | "6305 / 485030 edges with invalid nodes removed\n"
176 | ]
177 | }
178 | ],
179 | "source": [
180 | "# filter edgelist because len(invalid_genes) > 0\n",
181 | "query_edgelist_filt = gct.filter_query_edgelist(query_edgelist, invalid_genes)"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 11,
187 | "metadata": {
188 | "collapsed": false
189 | },
190 | "outputs": [
191 | {
192 | "name": "stdout",
193 | "output_type": "stream",
194 | "text": [
195 | "Batch query complete: 48.3 seconds\n",
196 | "24127 Matched query results\n"
197 | ]
198 | }
199 | ],
200 | "source": [
201 | "# Set scopes (gene naming systems to search)\n",
202 | "scopes = \"uniprot, refseq\"\n",
203 | "\n",
204 | "# Set fields (systems from which to return gene names from)\n",
205 | "fields = \"symbol, entrezgene\"\n",
206 | "\n",
207 | "# Query MyGene.Info\n",
208 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n",
209 | "print len(match_list), 'Matched query results'"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": 12,
215 | "metadata": {
216 | "collapsed": false,
217 | "scrolled": true
218 | },
219 | "outputs": [
220 | {
221 | "name": "stdout",
222 | "output_type": "stream",
223 | "text": [
224 | "Queries without full matching results found: 6147\n",
225 | "\n",
226 | "162 Queries with mutliple matches found\n",
227 | "\n",
228 | "Query mapping table/dictionary construction complete: 149.88 seconds\n"
229 | ]
230 | }
231 | ],
232 | "source": [
233 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)"
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "metadata": {},
239 | "source": [
240 | "## Construct Converted Network"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": 13,
246 | "metadata": {
247 | "collapsed": false
248 | },
249 | "outputs": [],
250 | "source": [
251 | "# Format edge list by removing prefix indicators from all interactors\n",
252 | "query_edgelist_filt_fmt = [[gct.get_identifier_without_prefix(edge[0]),gct.get_identifier_without_prefix(edge[1])] for edge in query_edgelist_filt]"
253 | ]
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": 15,
258 | "metadata": {
259 | "collapsed": true
260 | },
261 | "outputs": [],
262 | "source": [
263 | "# Convert network edge list to symbol\n",
264 | "iRefIndex_edgelist_symbol = gct.convert_edgelist(query_edgelist_filt_fmt, query_to_symbol)"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 16,
270 | "metadata": {
271 | "collapsed": false
272 | },
273 | "outputs": [
274 | {
275 | "name": "stdout",
276 | "output_type": "stream",
277 | "text": [
278 | "478725 input edges\n",
279 | "34326 self-edges removed\n",
280 | "132730 edges with un-mapped genes removed\n",
281 | "178121 duplicate edges removed\n",
282 | "Edge list filtered: 0.78 seconds\n",
283 | "133548 Edges remaining\n"
284 | ]
285 | }
286 | ],
287 | "source": [
288 | "# Filter converted edge list\n",
289 | "iRefIndex_edgelist_symbol_filt = gct.filter_converted_edgelist(iRefIndex_edgelist_symbol)"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": 17,
295 | "metadata": {
296 | "collapsed": false
297 | },
298 | "outputs": [
299 | {
300 | "name": "stdout",
301 | "output_type": "stream",
302 | "text": [
303 | "Edge list saved: 0.22 seconds\n"
304 | ]
305 | }
306 | ],
307 | "source": [
308 | "# Save filtered, converted edge list to file\n",
309 | "gct.write_edgelist(iRefIndex_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/iRefIndex_Symbol.sif')"
310 | ]
311 | }
312 | ],
313 | "metadata": {
314 | "kernelspec": {
315 | "display_name": "Python 2",
316 | "language": "python",
317 | "name": "python2"
318 | },
319 | "language_info": {
320 | "codemirror_mode": {
321 | "name": "ipython",
322 | "version": 2
323 | },
324 | "file_extension": ".py",
325 | "mimetype": "text/x-python",
326 | "name": "python",
327 | "nbconvert_exporter": "python",
328 | "pygments_lexer": "ipython2",
329 | "version": "2.7.11"
330 | }
331 | },
332 | "nbformat": 4,
333 | "nbformat_minor": 0
334 | }
335 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Network Evaluation Tools
2 |
3 | Network Evaluation Tools is a Python 2.7 package with corresponding examples for evaluating a network's ability to group a given node set in network proximity. This package was developed as a part of the work done in [Huang and Carlin et al. 2018](http://www.cell.com/cell-systems/fulltext/S2405-4712(18)30095-4).
4 |
5 | ## Modules in this package
6 | - _data_import_tools_ - This module contains functions for helping import network files and gene set files for analysis.
7 | - _gene_conversion_tools_ - This module contains functions for helping convert, filter, and save networks from their raw database form. Used in the Network Processing Jupyter Notebooks.
8 | - _miscellaneous_functions_ - This module contains various functions developed to help with analysis along the way. These functions are not well tested and may contain bugs. These functions were generally used to determine other network performance metrics on network recovery of gene sets.
9 | - _network_evaluation_functions_ - This module contains many of the core functions of the set-based network evaluation algorithm.
10 | - _network_propagation_ - This module contains functions to help with network propagation steps used in the set-based network evaluation algorithm.
11 |
12 | ## Version and Dependencies
13 | Currently, the network_evaluation_tools package requires Python 2.7 - Python 2.7.13. Note that some functions in this package may not work with Python 3.0+.
14 | network_evaluation_tools requires:
15 | - Argparse >= 1.1
16 | - NetworkX >= 2.1
17 | - Numpy >= 1.11.0
18 | - Matplotlib >= 1.5.1
19 | - Pandas >= 0.19.0
20 | - Requests >= 2.13.0
21 | - Scipy >= 0.17.0
22 | - Scikit-learn >= 0.17.1
23 |
24 | Note:
25 | - In Pandas v0.20.0+, the ```.ix``` indexer has been deprecated. There may be warnings regarding this issue, yet the functions still work.
26 |
27 | ## Installation
28 | 1. Clone the repository
29 | 2. cd to the new repository
30 | 3. Execute following command:
31 | ```python setup.py install```
32 |
33 | ## Network analysis
34 | 1. If the network needs to be normalized to a particular naming scheme:
35 | A Jupyter Notebook describing how each network was processed from the raw download file in the original [paper](http://www.cell.com/cell-systems/fulltext/S2405-4712%2818%2930095-4) can be found in the ```Network Processing Notebooks``` folder.
36 | 2. There are two ways to perform the network evaluation on a gene set:
37 | The following network analyses can be performed either from a Jupyter Notebook or from the command line (see ```Network Evaluation Examples``` folder). Jupyter notebooks are documented within the notebook and the documentation for the python scripts can be seen using the command ```python [script_name].py -h```.
38 |
39 | ## Data provided in this repository (see ```Data``` Folder)
40 | - Database Citations - An Excel file containing details about all of the networks used in the original paper's analysis and affiliated citations for all of the databases used.
41 | - _DisGeNET / Oncogenic Component Gene Sets_ - Two tab separated files, each line containing a gene set from either DisGeNET or the Oncogenic Component collection. The first column of each file is the name of the gene set followed by the list of genes associated with that given gene set on the same line.
42 | - _Network performance (AUPRCs) on DisGeNET / Oncogenic Component Gene Sets_ - Two csv files containing the raw Z-normalized AUPRC scores (network performance scores) of each network analyzed on each gene set analyzed from DisGeNET or the Oncogenic Component gene set collection.
43 | - _Network performance effect sizes on DisGeNET / Oncogenic Component Gene Sets_ - Two csv files containing the relative performance gain of each network's AUPRC score over the median null AUPRC score for each gene set analyzed from DisGeNET or the Oncogenic Component gene set collection.
44 |
45 | ## Issues
46 | Please feel free to post issues/bug reports. Questions can be sent to jkh013@ucsd.edu
47 |
48 | ## License
49 | See the [LICENSE](https://github.com/huangger/Network_Evaluation_Tools/blob/master/LICENSE.txt) file for license rights and limitations (MIT).
50 |
51 |
52 |
--------------------------------------------------------------------------------
/network_evaluation_tools/.ipynb_checkpoints/PSN Construction-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [],
3 | "metadata": {},
4 | "nbformat": 4,
5 | "nbformat_minor": 1
6 | }
7 |
--------------------------------------------------------------------------------
/network_evaluation_tools/.ipynb_checkpoints/SBNE Method-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [],
3 | "metadata": {},
4 | "nbformat": 4,
5 | "nbformat_minor": 1
6 | }
7 |
--------------------------------------------------------------------------------
/network_evaluation_tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/idekerlab/Network_Evaluation_Tools/4c0017e3cc3fa7767f5172cea76b4f3f7d8d0b0b/network_evaluation_tools/__init__.py
--------------------------------------------------------------------------------
/network_evaluation_tools/data_import_tools.py:
--------------------------------------------------------------------------------
1 | ###############################################
2 | # ---------- Data Import Functions ---------- #
3 | ###############################################
4 |
5 | import pandas as pd
6 | import networkx as nx
7 | import time
8 | import os
9 |
# Filter extended sif file where all edges are weighted by a specific quantile
# Return the filtered network edge list and save it to a file if desired (for import by load_network_file)
def filter_weighted_network_sif(network_file_path, nodeA_col=0, nodeB_col=1, score_col=2, q=0.9, delimiter='\t', verbose=False, save_path=None):
    """Load a weighted edge list and keep only edges scoring above the q-th quantile.

    Parameters
    ----------
    network_file_path : str
        Path to a headerless delimited edge-list file.
    nodeA_col, nodeB_col, score_col : int
        Column positions of the two interactors and of the edge score.
    q : float
        Quantile cutoff in [0, 1]; edges with score strictly greater than the
        q-th quantile of all scores are retained.
    delimiter : str
        Field delimiter of the input file.
    verbose : bool
        If True, print the score cutoff and the edge counts before/after filtering.
    save_path : str or None
        If given, write the filtered 3-column edge list (tab-separated,
        no header/index) to this path for later import by load_network_file().

    Returns
    -------
    pandas.DataFrame
        Filtered edges with columns ['nodeA', 'nodeB', 'edgeScore'].
    """
    # header=None marks the file as headerless; the original header=-1 form was
    # deprecated and later removed from pandas
    data = pd.read_csv(network_file_path, sep=delimiter, header=None, low_memory=False)
    # Score cutoff at the requested quantile of all edge scores
    q_score = data[score_col].quantile(q)
    if verbose:
        # single-argument print() is valid in both Python 2 and 3
        print(str(round(q*100, 2)) + '% score: ' + str(q_score))
    # Keep only edges scoring strictly above the cutoff, in (A, B, score) column order
    data_filt = data[data[score_col] > q_score][data.columns[[nodeA_col, nodeB_col, score_col]]]
    data_filt.columns = ['nodeA', 'nodeB', 'edgeScore']
    if verbose:
        print(str(data_filt.shape[0]) + ' / ' + str(data.shape[0]) + ' edges retained')
    if save_path is not None:
        data_filt.to_csv(save_path, sep='\t', header=False, index=False)
    return data_filt
25 |
# Load network from file as unweighted network
# Can set delimiter, but default delimiter is tab
# Only will read edges as first two columns, all other columns will be ignored
def load_network_file(network_file_path, delimiter='\t', verbose=False):
    """Read an edge-list file into an unweighted networkx graph.

    Only the first two columns of each line are used as an edge; any extra
    columns (e.g. scores) are dropped because data=False.
    """
    graph = nx.read_edgelist(network_file_path, delimiter=delimiter, data=False)
    if verbose:
        print('Network File Loaded: ' + network_file_path)
    return graph
34 |
# Get full paths to all networks in directory with a given file name structure:
# e.g. If filename = 'BIND_Symbol.sif', then network_name='BIND', suffix='_Symbol', ext='.sif
def get_networks(wd, suffix=None, file_ext='.sif'):
    """Return {network_name: full_path} for every matching network file in wd.

    Parameters
    ----------
    wd : str
        Directory to scan (trailing separator no longer required).
    suffix : str or None
        If given, only files whose basename (minus extension) ends with this
        suffix match, and the suffix is stripped from the network name.
    file_ext : str
        File extension a file must end with to be considered a network.

    Returns
    -------
    dict mapping network name -> full path of the network file.
    """
    network_files = {}
    for fn in os.listdir(wd):
        # Guard clause: skip anything that is not a network file
        if not fn.endswith(file_ext):
            continue
        # os.path.join replaces the original wd+fn concatenation, which
        # produced a broken path when wd lacked a trailing separator
        if suffix is None:
            network_files[fn.split(file_ext)[0]] = os.path.join(wd, fn)
        elif fn.split(file_ext)[0].endswith(suffix):
            network_files[fn.split(suffix)[0]] = os.path.join(wd, fn)
    return network_files
47 |
# Companion function with get_networks(), loads all of the network files found in a directory
# Uses the load_network_file() function to load each network, also only imports first two columns, no edge data
# Constructs a dictionary of useful network items for each network in the directory:
# - Actual networkx object representation of network
# - List of nodes by name for each network
# - List of edges by node name for each network
def load_networks(network_file_map, delimiter='\t', verbose=False):
    """Load every network listed in network_file_map.

    Parameters
    ----------
    network_file_map : dict
        {network_name: file_path}, e.g. the output of get_networks().
    delimiter : str
        Field delimiter forwarded to load_network_file().
    verbose : bool
        If True, report loading progress.

    Returns
    -------
    (networks, network_edges, network_nodes)
        Three dicts keyed by network name: the networkx graph, its edge
        list, and its node list.
    """
    # Initialize dictionaries
    networks, network_edges, network_nodes = {}, {}, {}
    # Loading network and network properties
    for network_name in network_file_map:
        # BUG FIX: delimiter was previously accepted but never forwarded
        network = load_network_file(network_file_map[network_name], delimiter=delimiter, verbose=verbose)
        networks[network_name] = network
        # Construct network node list
        network_nodes[network_name] = network.nodes()
        # Construct network edge list
        network_edges[network_name] = network.edges()
    if verbose:
        print('All given network files loaded')
    # Return data structure
    return networks, network_edges, network_nodes
71 |
# Convert and save MAF from Broad Firehose
# Can produce 2 types of filetypes: 'matrix' or 'list', matrix is a full samples-by-genes binary csv, 'list' is a sparse representaiton of 'matrix'
# This is a conversion tool, so the result must be saved (most tools will require a path to a processed MAF file and load it separately)
# Gene naming can be 'Symbol' or 'Entrez'
def process_TCGA_MAF(maf_file, save_path, filetype='matrix', gene_naming='Symbol', verbose=False):
    """Convert a TCGA MAF file into binary somatic-mutation data and save it.

    Parameters
    ----------
    maf_file : str
        Path to the tab-separated MAF file (must have 'Tumor_Sample_Barcode'
        and 'Hugo_Symbol'/'Entrez_Gene_Id' columns).
    save_path : str
        Output path; csv matrix for filetype='matrix', two-column
        tab-separated (sample, gene) list for filetype='list'.
    filetype : str
        'matrix' or 'list' (sparse representation of the matrix).
    gene_naming : str
        'Symbol' (Hugo_Symbol) or 'Entrez' (Entrez_Gene_Id).
    verbose : bool
        If True, report progress.

    Samples whose 12-character trimmed TCGA barcode is duplicated are dropped.
    Returns None; the result is written to save_path.
    """
    loadtime = time.time()
    # Load MAF File
    TCGA_MAF = pd.read_csv(maf_file, sep='\t', low_memory=False)
    # Get all patient somatic mutation (sm) pairs from MAF file
    if gene_naming == 'Entrez':
        TCGA_sm = TCGA_MAF.groupby(['Tumor_Sample_Barcode', 'Entrez_Gene_Id']).size()
    else:
        TCGA_sm = TCGA_MAF.groupby(['Tumor_Sample_Barcode', 'Hugo_Symbol']).size()
    # Turn somatic mutation data into binary matrix
    TCGA_sm_mat = TCGA_sm.unstack().fillna(0)
    TCGA_sm_mat = (TCGA_sm_mat > 0).astype(int)
    # Trim TCGA barcodes to the 12-character patient identifier
    TCGA_sm_mat.index = [pat[:12] for pat in TCGA_sm_mat.index]
    # Identify samples with duplicate trimmed IDs (computed once, not four times)
    barcode_counts = TCGA_sm_mat.index.value_counts()
    non_dup_IDs = list(barcode_counts.index[barcode_counts == 1])
    dup_IDs = list(barcode_counts.index[barcode_counts > 1])
    # Save file as binary matrix or sparse list
    if filetype == 'list':
        # Two-column/sparse representation of the binary sm data
        index_list = list(TCGA_sm.index)
        # Drop somatic mutations belonging to duplicated patient barcodes
        index_list_filt = [i for i in index_list if not any([barcode in i[0] for barcode in dup_IDs])]
        # 'with' guarantees the handle is closed even if a write fails
        with open(save_path, 'w') as f:
            for sm in index_list_filt:
                f.write(sm[0][:12] + '\t' + sm[1] + '\n')
        if verbose:
            print('Binary somatic mutations list saved')
    else:
        # Keep only non-duplicate patients; .loc replaces the removed .ix indexer
        TCGA_sm_mat_filt = TCGA_sm_mat.loc[non_dup_IDs]
        # Remove all genes that have no more mutations after patient filtering
        nonempty_cols = [col for col in TCGA_sm_mat_filt.columns if not all(TCGA_sm_mat_filt[col] == 0)]
        TCGA_sm_mat_filt2 = TCGA_sm_mat_filt[nonempty_cols]
        # BUG FIX: the original computed named_cols but then re-indexed with
        # nonempty_cols, so the bad '0' gene column was never actually dropped
        named_cols = [col for col in TCGA_sm_mat_filt2.columns if col != '0']
        TCGA_sm_mat_filt3 = TCGA_sm_mat_filt2[named_cols]
        TCGA_sm_mat_filt3.to_csv(save_path)
        if verbose:
            print('Binary somatic mutation matrix saved')
    if verbose:
        print('MAF file processed: ' + maf_file + ' ' + str(round(time.time() - loadtime, 2)) + ' seconds.')
    return
122 |
# Load binary mutation data with 2 file types (filetype= 'matrix' or 'list')
# filetype=='matrix' is a csv or tsv style matrix with row and column headers, rows are samples/patients, columns are genes
# filetype=='list' is a 2 column text file where the 1st column is sample/patient, 2nd column is one gene mutated in that patient
# Line example in 'list' file: 'Patient ID','Gene Mutated'
def load_binary_mutation_data(filename, filetype='matrix', delimiter=',', verbose=False):
    """Load binary (samples x genes) mutation data from a 'matrix' or 'list' file.

    Parameters
    ----------
    filename : str
        Path to the mutation data file.
    filetype : str
        'matrix': delimited matrix with sample rows / gene columns
        ('delimiter' applies). 'list': two tab-separated columns
        (sample, gene) — NOTE the tab is fixed for this format; the
        'delimiter' argument is only used for 'matrix'.
    verbose : bool
        If True, report when loading completes.

    Returns
    -------
    pandas.DataFrame of 0/1 values indexed by sample, one column per gene.
    """
    if filetype == 'list':
        # 'with' guarantees the handle is closed (the original leaked it)
        with open(filename) as f:
            binary_mat_lines = f.read().splitlines()
        binary_mat_data = [(line.split('\t')[0], line.split('\t')[1]) for line in binary_mat_lines]
        binary_mat_index = pd.MultiIndex.from_tuples(binary_mat_data, names=['Tumor_Sample_Barcode', 'Hugo_Symbol'])
        # Sparse (sample, gene) pairs -> dense binary matrix; absent pairs become 0
        binary_mat_2col = pd.DataFrame(1, index=binary_mat_index, columns=[0])[0]
        binary_mat = binary_mat_2col.unstack().fillna(0)
    else:
        binary_mat = pd.read_csv(filename, delimiter=delimiter, index_col=0).astype(int)
    if verbose:
        print('Binary Mutation Matrix Loaded: ' + filename)
    return binary_mat
140 |
# Concatenate multiple mutation matrices together
# All file type structures and delimiters must be the same (see load_binary_mutation_matrix()) across all files
def concat_binary_mutation_matrices(filename_list, filetype='matrix', delimiter=',', verbose=False, save_path=None):
    """Load several binary mutation files and stack them into one matrix.

    Genes missing from any individual matrix are filled with 0 after
    concatenation. When save_path is given, the combined matrix is also
    written to csv before being returned.
    """
    matrices = [
        load_binary_mutation_data(fn, filetype=filetype, delimiter=delimiter, verbose=verbose)
        for fn in filename_list
    ]
    combined = pd.concat(matrices).fillna(0)
    if verbose:
        print('All binary mutation matrices loaded and concatenated')
    if save_path is not None:
        combined.to_csv(save_path)
    return combined
153 |
# Construct dictionary of node sets from input text file to perform AUPRC analysis on for network of interest
# File format: Each line is a delimited list where the first item is the name of the node set
# All other nodes in the list follow the node set name
def load_node_sets(node_set_file, delimiter='\t', verbose=False):
    """Parse a node-set file into {set_name: set(node, ...)}.

    node_set_file : path to the text file (one node set per line).
    delimiter : separator between the set name and its member nodes.
    verbose : print a confirmation line when loading completes.
    """
    # 'with' guarantees the file is closed even if a line fails to parse
    with open(node_set_file) as f:
        node_set_lines = f.read().splitlines()
    node_set_lines_split = [line.split(delimiter) for line in node_set_lines]
    # First field is the set name; remaining fields are its member nodes
    node_sets = {node_set[0]: set(node_set[1:]) for node_set in node_set_lines_split}
    if verbose:
        print('Node cohorts loaded: ' + node_set_file)
    return node_sets
--------------------------------------------------------------------------------
/network_evaluation_tools/gene_conversion_tools.py:
--------------------------------------------------------------------------------
1 | ################################################################
2 | # ---------- Network Gene Name Conversion Functions ---------- #
3 | ################################################################
4 | import requests
5 | import re
6 | import time
7 | import pandas as pd
8 |
# Determine if id to be input is a valid gene name (does not contain parentheses or quotations or whitespace)
def exclude_id(name, bad_prefixes=None):
    """Return a truthy value when 'name' should be excluded from the query.

    Returns True when the name starts with one of bad_prefixes (e.g. CHEBI),
    a regex match object when it contains forbidden characters, and None when
    the name is a usable gene identifier.
    """
    forbidden_chars = re.compile('[(),\'\"\s\/\|\.<>]+')
    # Drop identifiers carrying unwanted naming-system prefixes (e.g. CHEBI)
    if bad_prefixes and any(name.startswith(prefix) for prefix in bad_prefixes):
        return True
    return forbidden_chars.search(name)
18 |
# Remove the naming system prefix, if there is one
def get_identifier_without_prefix(string):
    """Strip a single 'prefix:' from an identifier.

    'uniprot:P04637' -> 'P04637'; 'P04637' -> 'P04637' unchanged;
    more than one ':' (ambiguous identifier) -> None.
    """
    elements = string.split(':')
    length = len(elements)
    # Fix: compare ints with '==', not 'is' (identity only works for
    # CPython's cached small ints and is not a guaranteed behavior)
    if length == 2:
        return str(elements[1])
    elif length > 2:
        return None
    else:
        return string
29 |
# Construct string for batch query to MyGene.Info v3.0.0 API
def query_constructor(gene_list, exclude_prefixes=None, print_invalid_genes=False):
    """Partition gene_list into valid/invalid query names and build the query string.

    Valid genes (per exclude_id) are stripped of any naming-system prefix and
    joined with spaces for a MyGene.info batch POST.
    Returns (query_string, valid_query_genes, invalid_query_genes).
    """
    # Genes that pass the exclusion filter, reduced to their bare identifiers
    valid_query_genes = [get_identifier_without_prefix(gene) for gene in gene_list if exclude_id(gene, exclude_prefixes) is None]
    # Genes rejected by the exclusion filter
    invalid_query_genes = [gene for gene in gene_list if exclude_id(gene, exclude_prefixes) is not None]
    print(str(len(valid_query_genes)) + " Valid Query Genes")
    if print_invalid_genes:
        print(str(len(invalid_query_genes)) + " Invalid Query Genes:")
        print(invalid_query_genes)
    else:
        print(str(len(invalid_query_genes)) + " Invalid Query Genes")
    query_string = ' '.join(valid_query_genes)  # Space-separated batch input for MyGene.info
    return query_string, valid_query_genes, invalid_query_genes
44 |
# Function for posting batch query to MyGene.info v3.0.0 API
def query_batch(query_string, tax_id='9606', scopes="symbol, entrezgene, alias, uniprot", fields="symbol, entrezgene"):
    """POST a space-separated gene query string to MyGene.info /v3/query.

    tax_id : species filter (default human).
    scopes : name spaces to search; alias often returns more genes than needed,
        only the highest-scoring matches are kept downstream.
    fields : name spaces to convert to.
    Queries longer than 1000 genes are split into chunks of 1000
    (the MyGene.info batch cap). Returns the combined list of match dicts.
    """
    query_split = query_string.split(' ')
    query_n = len(query_split)
    query_time = time.time()
    if query_n <= 1000:
        data = {'species': tax_id,
                'scopes': scopes,
                'fields': fields,
                'q': query_string}
        res = requests.post('http://mygene.info/v3/query', data)
        query_results = res.json()
    else:
        # Ceiling division; '//' keeps this correct on both Python 2 and 3
        # (the old 'query_n / 1000' silently became float division under Py3)
        chunks = (query_n + 999) // 1000
        query_chunks = [' '.join(query_split[i * 1000:(i + 1) * 1000]) for i in range(chunks)]
        query_results = []
        for chunk in query_chunks:
            # Fix: chunked requests previously hard-coded species/scopes/fields
            # instead of honoring the function parameters
            data = {'species': tax_id,
                    'scopes': scopes,
                    'fields': fields,
                    'q': chunk}
            res = requests.post('http://mygene.info/v3/query', data)
            query_results = query_results + res.json()
    print(str(len(query_results)) + ' Matched query results')
    print('Batch query complete: ' + str(round(time.time() - query_time, 2)) + ' seconds')
    return query_results
78 |
# Construct matched queries maps
def construct_query_map_table(query_result, query_genes, display_unmatched_queries=False):
    """Build a 1-to-1 query -> gene mapping table from MyGene.info batch results.

    query_result : list of match dicts from query_batch (keys: 'query',
        '_score', 'symbol', 'entrezgene').
    query_genes : the list of gene names that were queried; every gene must
        appear in at least one result entry.
    display_unmatched_queries : print the raw entries of partial matches.
    Returns (match_table_trim, query_to_symbol, query_to_entrez) where the
    dictionaries map each query to its best-scoring symbol / entrez id
    (entrez ids are strings; unmapped fields are None).
    """
    construction_time = time.time()
    # Keep only results where both symbol and entrez id were mapped
    matched_data, matched_genes = [], []
    for match in query_result:
        if match.get('entrezgene') and match.get('symbol'):
            matched_data.append([match.get('query'), match.get('_score'), match.get('symbol'), str(match.get('entrezgene'))])
            matched_genes.append(match.get('query'))
    # Add all other partial mappings or non-mappings (set gives O(1) membership)
    matched_gene_set = set(matched_genes)
    partial_match_genes = [gene for gene in query_genes if gene not in matched_gene_set]
    partial_gene_set = set(partial_match_genes)
    partial_match_results = []
    for match in query_result:
        if match.get('query') in partial_gene_set:
            partial_match_results.append(match)
            if match.get('entrezgene'):  # Keep entrez id in string form, otherwise None
                matched_data.append([match.get('query'), match.get('_score'), match.get('symbol'), str(match.get('entrezgene'))])
            else:
                matched_data.append([match.get('query'), match.get('_score'), match.get('symbol'), match.get('entrezgene')])
    print('Queries without full matching results found: ' + str(len(partial_match_results)))
    if display_unmatched_queries:
        for entry in partial_match_results:
            print(entry)
    # Convert matched data list into a table indexed by query name
    match_table = pd.DataFrame(data=matched_data, columns=['Query', 'Score', 'Symbol', 'EntrezID']).set_index('Query')
    # Some genes are matched multiple times (alias mapping); generally the
    # highest-scoring match is correct, so reduce to 1-to-1 query mappings.
    # (.loc replaces the long-removed .ix; a duplicated label returns a DataFrame)
    duplicate_matched_genes = [gene for gene in query_genes if isinstance(match_table.loc[gene], pd.DataFrame)]
    print('')
    print(str(len(duplicate_matched_genes)) + " Queries with multiple matches found")
    # Mapping table of genes with exactly one result
    duplicate_gene_set = set(duplicate_matched_genes)
    single_match_genes = [gene for gene in query_genes if gene not in duplicate_gene_set]
    match_table_single = match_table.loc[single_match_genes]
    if len(duplicate_matched_genes) > 0:
        # Keep the maximum-scored match of each query matched more than once
        max_score_matches = []
        for gene in duplicate_matched_genes:
            matched_duplicates = match_table.loc[gene]
            max_score_matches.append(matched_duplicates[matched_duplicates['Score'] == matched_duplicates['Score'].max()])
        match_table_trim = pd.concat([match_table_single, pd.concat(max_score_matches)])
    else:
        match_table_trim = match_table_single.copy(deep=True)
    # Construct query map dictionaries
    query_to_symbol = match_table_trim['Symbol'].to_dict()
    query_to_entrez = match_table_trim['EntrezID'].to_dict()
    print('')
    print('Query mapping table/dictionary construction complete: ' + str(round(time.time() - construction_time, 2)) + ' seconds')
    return match_table_trim, query_to_symbol, query_to_entrez
135 |
# Filter edgelist to remove all genes that contain invalid query names
# This function is only required if there are any invalid genes found by query_constructor()
def filter_query_edgelist(query_edgelist, invalid_genes):
    """Drop edges where either endpoint is an invalid gene name.

    query_edgelist : iterable of edges (indexable, first two items are nodes).
    invalid_genes : gene names to exclude (from query_constructor).
    Returns the filtered edge list; prints how many edges were removed.
    """
    invalid = set(invalid_genes)  # set membership is O(1) vs O(n) list scans
    edgelist_filt = []
    count = 0
    for edge in query_edgelist:
        if edge[0] in invalid or edge[1] in invalid:
            count += 1
        else:
            edgelist_filt.append(edge)
    print(str(count) + ' / ' + str(len(query_edgelist)) + ' edges with invalid nodes removed')
    return edgelist_filt
148 |
# Convert network edge lists
# Third column is for weights if desired to pass weights forward
def convert_edgelist(query_edgelist, gene_map, weighted=False):
    """Map both endpoints of every edge through gene_map.

    Endpoints of each converted edge are sorted; when weighted is True the
    third element of each input edge is carried through unchanged.
    Returns a list of [node_a, node_b] or [node_a, node_b, weight] lists.
    """
    converted = []
    for edge in query_edgelist:
        nodes = sorted([gene_map[edge[0]], gene_map[edge[1]]])
        if weighted:
            nodes = nodes + [edge[2]]
        converted.append(nodes)
    return converted
156 |
# Sometimes each node needs to be converted by its best match if there are multiple names per node
# This function uses the match_table constructed earlier to convert genes to either symbol or entrez format only
def convert_custom_namelist(names, field, match_table):
    """Return the best-scoring conversion of 'names' in the requested name space.

    names : list of query names (index labels of match_table).
    field : 'symbol' or 'entrez'; any other value returns None.
    match_table : DataFrame indexed by Query with Score/Symbol/EntrezID columns.
    Returns the highest-scoring non-null value, or None if no row maps.
    """
    # Both branches are identical except for the column -- collapse them
    if field == 'symbol':
        column = 'Symbol'
    elif field == 'entrez':
        column = 'EntrezID'
    else:
        return None
    # Keep only rows where the field of interest actually mapped
    # (.loc/.iloc replace the long-removed .ix indexer)
    rows = match_table.loc[names]
    conversion = rows[~rows[column].isnull()]
    if conversion.shape[0] == 0:
        return None
    # Return the conversion with the maximum match score
    max_score = conversion['Score'].max()
    return conversion[conversion['Score'] == max_score].iloc[0][column]
179 |
# Filter converted edge lists
def filter_converted_edgelist(edgelist, remove_self_edges=True, weighted=False):
    """Remove self-edges, edges with unmapped (None) nodes, and duplicates.

    edgelist : list of [node_a, node_b] or [node_a, node_b, score] edges.
    remove_self_edges : drop edges whose endpoints are equal.
    weighted : treat the third element as a score; duplicate node pairs keep
        the maximum score.
    Returns the filtered edge list (order of weighted output follows first
    appearance of each node pair).
    """
    filter_time = time.time()
    print(str(len(edgelist)) + ' input edges')
    # Remove self-edges
    if remove_self_edges:
        edgelist_filt1 = [edge for edge in edgelist if edge[0] != edge[1]]
        print(str(len(edgelist) - len(edgelist_filt1)) + ' self-edges removed')
    else:
        edgelist_filt1 = edgelist
        print('Self-edges not removed')
    if weighted:
        # Remove edges where one or both nodes are None
        edgelist_filt2 = pd.DataFrame(data=edgelist_filt1).dropna().values.tolist()
        print(str(len(edgelist_filt1) - len(edgelist_filt2)) + ' edges with un-mapped genes removed')
        # Keep the max score per node pair. Fix: tuple keys instead of the old
        # "'node+node'" string keys, which corrupted gene names containing '+'.
        score_map = {}
        for edge in edgelist_filt2:
            key = (edge[0], edge[1])
            if key not in score_map:
                score_map[key] = edge[2]
            else:
                score_map[key] = max(score_map[key], edge[2])
        edgelist_filt3 = [list(key) + [score_map[key]] for key in score_map]
        print(str(len(edgelist_filt2) - len(edgelist_filt3)) + ' duplicate edges removed')
    else:
        # Remove edges where one or both nodes are None
        edgelist_filt2 = pd.DataFrame(data=edgelist_filt1).dropna()
        print(str(len(edgelist_filt1) - edgelist_filt2.shape[0]) + ' edges with un-mapped genes removed')
        # Remove duplicate edges
        edgelist_filt3 = edgelist_filt2.drop_duplicates().values.tolist()
        print(str(edgelist_filt2.shape[0] - len(edgelist_filt3)) + ' duplicate edges removed')
    print('Edge list filtered: ' + str(round(time.time() - filter_time, 2)) + ' seconds')
    print(str(len(edgelist_filt3)) + ' Edges remaining')
    return edgelist_filt3
217 |
# Write edgelist to file
def write_edgelist(edgelist, output_file, delimiter='\t', binary=True):
    """Write an edge list to a delimited text file, one edge per line.

    binary=True writes only the two node columns; binary=False writes every
    element of each edge (e.g. node, node, weight) stringified.
    """
    write_time = time.time()
    # 'with' guarantees the file is flushed and closed even on write errors
    with open(output_file, 'w') as f:
        for edge in edgelist:
            if binary:
                f.write(delimiter.join([edge[0], edge[1]]) + '\n')
            else:
                f.write(delimiter.join([str(val) for val in edge]) + '\n')
    print('Edge list saved: ' + str(round(time.time() - write_time, 2)) + ' seconds')
229 |
--------------------------------------------------------------------------------
/network_evaluation_tools/miscellaneous_functions.py:
--------------------------------------------------------------------------------
import pickle as p
import random
import time
from multiprocessing import Pool

import numpy as np
import pandas as pd

import data_import_tools as dit
import network_propagation as prop
import network_evaluation_functions as nef
9 |
10 | ################################################################################
11 | # ---------- Additional Node Set-Based Network Evaluation Functions ---------- #
12 | ################################################################################
13 |
# Calculate confusion matrix (true positives, false negatives, false positives, true negatives) of node set recovery for given node set
# The confusion matrix for every position on every AUPRC curve is returned/stored
def calculate_confusion_matrix_serial(prop_geno, p, n, node_set_name, node_set, verbose=False):
    """Sub-sample a node set n times and record full confusion-matrix curves.

    prop_geno : square pandas DataFrame of propagation values (genes x genes).
    p : fraction of the in-network node set to sample each iteration.
    n : number of sub-sampling iterations.
    node_set_name : label used only for the verbose report.
    node_set : iterable of node names to evaluate.
    Returns {iteration: {'TP': [...], 'FN': [...], 'FP': [...], 'TN': [...]}}
    with one entry per recovered node along the ranked recovery curve.
    Requires the module-level 'random' import.
    """
    runtime = time.time()
    intersect = [node for node in node_set if node in prop_geno.index]
    confusion_matrices = {}
    sample_size = int(round(p * len(intersect)))
    for i in range(n):  # Number of times to run the sampling
        sample = random.sample(intersect, sample_size)  # node set sample used as seed
        intersect_non_sample = [node for node in intersect if node not in sample]  # held-out set nodes
        prop_geno_non_sample = list(prop_geno.index[~prop_geno.index.isin(sample)])  # network nodes not sampled
        # Summed propagation value from the sampled rows for every non-sampled
        # node, ranked high to low (.loc replaces the long-removed .ix)
        prop_geno_sample_sum = prop_geno.loc[sample][prop_geno_non_sample].sum().sort_values(ascending=False)
        y_actual = pd.Series(0, index=prop_geno_sample_sum.index, dtype=int)
        y_actual.loc[intersect_non_sample] += 1  # mark which ranked nodes are held-out set members
        intersect_non_sample_sorted = y_actual[y_actual == 1].index  # held-out nodes in rank order
        confusion_matrix = {'TP': [], 'FN': [], 'FP': [], 'TN': []}
        # Slide down the ranked list, stopping at each held-out set member;
        # label-based .loc slices are inclusive of 'node'
        for node in intersect_non_sample_sorted:
            TP, FN = sum(y_actual.loc[:node]), sum(y_actual.loc[node:]) - 1
            FP, TN = len(y_actual.loc[:node]) - TP, len(y_actual.loc[node:]) - 1 - FN
            confusion_matrix['TP'].append(TP)
            confusion_matrix['FN'].append(FN)
            confusion_matrix['FP'].append(FP)
            confusion_matrix['TN'].append(TN)
        confusion_matrices[i] = confusion_matrix
    if verbose:
        print('Confusion matrices calculated for node set ' + str(node_set_name) + ' complete. ' + repr(len(intersect)) + ' nodes in network, ' + str(round(time.time() - runtime, 2)) + ' seconds.')
    return confusion_matrices
41 |
# Calculate confusion matrix (true positives, false negatives, false positives, true negatives) of node set recovery for given node set
# The parameter setup is written for running in parallel: all inputs arrive as one list and
# prop_geno must be a module global (set by the pool initializer)
# The confusion matrix for every position on every AUPRC curve is returned/stored
def calculate_confusion_matrix_parallel(node_set_params):
    """Parallel-pool variant of calculate_confusion_matrix_serial.

    node_set_params : [node_set_name, node_set, p, n, verbose] (see the serial
        function for meanings). Reads the module-global 'prop_geno' DataFrame,
        which the pool initializer must have assigned.
    Returns [node_set_name, confusion_matrices] so results can be re-keyed.
    """
    node_set_name, node_set, p, n, verbose = node_set_params[0], node_set_params[1], node_set_params[2], node_set_params[3], node_set_params[4]
    runtime = time.time()
    intersect = [node for node in node_set if node in prop_geno.index]
    confusion_matrices = {}
    sample_size = int(round(p * len(intersect)))
    for i in range(n):  # Number of times to run the sampling
        sample = random.sample(intersect, sample_size)  # node set sample used as seed
        intersect_non_sample = [node for node in intersect if node not in sample]  # held-out set nodes
        prop_geno_non_sample = list(prop_geno.index[~prop_geno.index.isin(sample)])  # network nodes not sampled
        # Summed propagation value from the sampled rows for every non-sampled
        # node, ranked high to low (.loc replaces the long-removed .ix)
        prop_geno_sample_sum = prop_geno.loc[sample][prop_geno_non_sample].sum().sort_values(ascending=False)
        y_actual = pd.Series(0, index=prop_geno_sample_sum.index, dtype=int)
        y_actual.loc[intersect_non_sample] += 1  # mark which ranked nodes are held-out set members
        intersect_non_sample_sorted = y_actual[y_actual == 1].index  # held-out nodes in rank order
        confusion_matrix = {'TP': [], 'FN': [], 'FP': [], 'TN': []}
        # Slide down the ranked list, stopping at each held-out set member;
        # label-based .loc slices are inclusive of 'node'
        for node in intersect_non_sample_sorted:
            TP, FN = sum(y_actual.loc[:node]), sum(y_actual.loc[node:]) - 1
            FP, TN = len(y_actual.loc[:node]) - TP, len(y_actual.loc[node:]) - 1 - FN
            confusion_matrix['TP'].append(TP)
            confusion_matrix['FN'].append(FN)
            confusion_matrix['FP'].append(FP)
            confusion_matrix['TN'].append(TN)
        confusion_matrices[i] = confusion_matrix
    if verbose:
        print('Confusion matrices calculated for node set ' + str(node_set_name) + ' complete. ' + repr(len(intersect)) + ' nodes in network, ' + str(round(time.time() - runtime, 2)) + ' seconds.')
    return [node_set_name, confusion_matrices]
71 |
# Wrapper for calculating the confusion matrices for input node set file and network (has parallel option)
# Not run for null network shuffles
def confusion_matrix_construction_wrapper(network_file, node_set_file, sample_p, sub_sample_iterations,
                                          alpha=None, m=-0.17190024, b=0.7674828, net_delim='\t', set_delim='\t', cores=1, verbose=False, save_path=None):
    """Load a network and node sets, propagate, and compute confusion matrices.

    network_file / node_set_file : paths passed to the data_import_tools loaders.
    sample_p : fraction of each node set sampled per iteration.
    sub_sample_iterations : number of sampling iterations per node set.
    alpha, m, b : propagation coefficient (or linear model thereof) forwarded
        to nef.construct_prop_kernel.
    cores : 1 runs node sets serially; >1 uses a multiprocessing Pool.
    save_path : if given, the result dict is also pickled there.
    Returns {node_set_name: confusion_matrices dict}.

    NOTE(review): the confusion-matrix helpers are referenced via nef
    (network_evaluation_functions) although same-named functions are defined in
    this module -- confirm they exist in nef.
    """
    starttime = time.time()
    # Load network
    network = dit.load_network_file(network_file, delimiter=net_delim, verbose=verbose)
    # Load node set
    node_sets = dit.load_node_sets(node_set_file, delimiter=set_delim, verbose=verbose)
    # Calculate network influence matrix
    prop_net = nef.construct_prop_kernel(network, alpha=alpha, m=m, b=b)
    # Calculate confusion matrix values for each node set
    if cores == 1:
        # Calculate confusion matrix values for node sets one at a time
        node_set_conf_mat = {node_set:nef.calculate_confusion_matrix_serial(prop_net, sample_p, sub_sample_iterations, node_set, node_sets[node_set], verbose=verbose) for node_set in node_sets}
    else:
        # Initialize multiple threads for confusion matrix analysis of multiple node sets
        # (the initializer publishes prop_net as a global in each worker)
        initializer_args = [prop_net]
        pool = Pool(cores, nef.parallel_analysis_initializer, initializer_args)
        # Construct parameter list to be passed
        conf_mat_Analysis_params = [[node_set, node_sets[node_set], sample_p, sub_sample_iterations, verbose] for node_set in node_sets]
        # Run the confusion matrix analysis for each geneset
        conf_mat_results = pool.map(nef.calculate_confusion_matrix_parallel, conf_mat_Analysis_params)
        # Construct confusion matrix results dictionary
        node_set_conf_mat = {result[0]:result[1] for result in conf_mat_results}
    if save_path is None:
        if verbose:
            print 'Network confusion matrix values calcualted:', round(time.time()-starttime, 2), 'seconds'
        return node_set_conf_mat
    else:
        # p is this module's pickle alias
        p.dump(node_set_conf_mat, open(save_path, 'wb'))
        if verbose:
            print 'Network confusion matrix values calcualted:', round(time.time()-starttime, 2), 'seconds'
        return node_set_conf_mat
106 |
# Use confusion matrix results to calculate odds ratio, risk ratio, accuracy or precision at a given recall threshold
def confusion_matrix_analysis(confusion_matrix_input, calculation, recall_threshold=0.9, verbose=False, save_path=None):
    """Summarize confusion-matrix curves as a single metric at a recall cutoff.

    confusion_matrix_input : dict (cohort -> iteration -> {'TP','FN','FP','TN'}
        lists), or a path to a pickle of that dict.
    calculation : 'OR' (odds ratio), 'RR' (risk ratio), 'accuracy'; any other
        value computes precision.
    recall_threshold : the first curve position whose recall reaches this value
        is the one scored for each iteration.
    save_path : optional CSV destination for the result table.
    Returns a DataFrame with the mean and variance of the metric per cohort.
    NOTE(review): 'OR'/'RR' divide by TN/FP and will raise ZeroDivisionError
    when those counts are 0 at the chosen position.
    """
    runtime = time.time()
    # Load confusion matrix data (p is the module-level pickle alias)
    if type(confusion_matrix_input) != dict:
        confusion_matrix = p.load(open(confusion_matrix_input, 'rb'))
    else:
        confusion_matrix = confusion_matrix_input
    # Calculate average and variance of the specified metric
    cohort_calculated_values_mean, cohort_calculated_values_var = {}, {}
    for cohort in confusion_matrix:
        print(cohort)
        n = len(confusion_matrix[cohort])
        calculation_values = []
        # For all sub-sample iterations
        for i in range(n):
            # Find the first curve position where recall >= recall_threshold
            for j in range(len(confusion_matrix[cohort][i]['TP'])):
                TP = confusion_matrix[cohort][i]['TP'][j]
                FN = confusion_matrix[cohort][i]['FN'][j]
                recall = TP / float(TP + FN)
                if recall >= recall_threshold:
                    FP = confusion_matrix[cohort][i]['FP'][j]
                    TN = confusion_matrix[cohort][i]['TN'][j]
                    if calculation == 'OR':  # Odds Ratio: (TP/FP) / (FN/TN)
                        calculation_values.append((float(TP) / FP) / (float(FN) / TN))
                    elif calculation == 'RR':  # Risk Ratio: (TP/(TP+FN)) / (FP/(FP+TN))
                        calculation_values.append((float(TP) / (TP + FN)) / (float(FP) / (FP + TN)))
                    elif calculation == 'accuracy':  # (TP+TN) / all
                        calculation_values.append(float(TP + TN) / (TP + FN + FP + TN))
                    else:  # precision = TP / (TP+FP)
                        calculation_values.append(float(TP) / (TP + FP))
                    break
        # Average and variance across all iterations for this cohort
        cohort_calculated_values_mean[cohort] = np.mean(calculation_values)
        cohort_calculated_values_var[cohort] = np.var(calculation_values)
    # Table of average/variance values for all cohorts at the given threshold
    cohort_calculated_values_table = pd.concat([pd.Series(cohort_calculated_values_mean, name='Average ' + calculation),
                                                pd.Series(cohort_calculated_values_var, name=calculation + ' Var')], axis=1)
    if save_path is not None:
        cohort_calculated_values_table.to_csv(save_path)
    if verbose:
        print(calculation + ' calculation completed for all cohorts ' + str(round(time.time() - runtime, 2)) + ' seconds.')
    return cohort_calculated_values_table
157 |
158 |
159 |
--------------------------------------------------------------------------------
/network_evaluation_tools/network_propagation.py:
--------------------------------------------------------------------------------
1 | #######################################################
2 | # ---------- Network Propagation Functions ---------- #
3 | #######################################################
4 | import networkx as nx
5 | import time
6 | import numpy as np
7 | import scipy
8 | import pandas as pd
9 | import copy
10 |
# Normalize network (or network subgraph) for random walk propagation
def normalize_network(network, symmetric_norm=False):
    """Degree-normalize a network's adjacency matrix for propagation.

    symmetric_norm=False row-normalizes by degree (D^-1 * A); because D^-1 is
    applied on the left, the result is already in the correct orientation and
    no transpose is needed. symmetric_norm=True applies the symmetric form
    (D^-1/2 * A * D^-1/2). Returns a dense numpy array.
    """
    adj_mat = nx.adjacency_matrix(network)
    adj_array = np.array(adj_mat.todense())
    degrees = sum(adj_array)  # column sums of the adjacency matrix
    if symmetric_norm:
        inv_sqrt_degree = np.diag(1 / np.sqrt(degrees))
        return np.dot(np.dot(inv_sqrt_degree, adj_array), inv_sqrt_degree)
    inv_degree = np.diag(1 / degrees.astype(float))
    return scipy.sparse.csr_matrix(inv_degree).dot(adj_mat).toarray()
25 |
# Calculate optimal propagation coefficient (updated model)
def calculate_alpha(network, m=-0.02935302, b=0.74842057):
    """Linear model of the optimal propagation alpha vs log10 edge count.

    Raises ValueError if the modeled alpha is <= 0 (edge count too high).
    There should never be a case where alpha >= 1, as average node degree
    will never be negative.
    """
    edge_count_log10 = np.log10(len(network.edges()))
    alpha = round(m * edge_count_log10 + b, 3)
    if alpha > 0:
        return alpha
    raise ValueError('Alpha <= 0 - Network Edge Count is too high')
35 |
# Closed form random-walk propagation (as seen in HotNet2) for each subgraph: Ft = (1-alpha)*Fo * (I-alpha*norm_adj_mat)^-1
# Concatenate to previous set of subgraphs
def fast_random_walk(alpha, binary_mat, subgraph_norm, prop_data):
    """Solve the restart-walk steady state for one subgraph in closed form.

    alpha : propagation (restart) coefficient.
    binary_mat : seed matrix restricted to this subgraph's nodes (columns).
    subgraph_norm : degree-normalized adjacency matrix of the subgraph.
    prop_data : previously accumulated results; the new columns are appended.
    """
    restart = (1 - alpha) * binary_mat
    kernel = np.linalg.inv(np.identity(binary_mat.shape[1]) - alpha * subgraph_norm)
    return np.concatenate((prop_data, np.dot(restart, kernel)), axis=1)
44 |
# Wrapper for random walk propagation of full network by subgraphs
def closed_form_network_propagation(network, binary_matrix, network_alpha, symmetric_norm=False, verbose=False, save_path=None):
    """Propagate binary_matrix over each connected component of network.

    network : networkx graph whose node names match binary_matrix's columns.
    binary_matrix : pandas DataFrame of seed values (samples x genes).
    network_alpha : propagation coefficient (see calculate_alpha).
    symmetric_norm : normalization mode forwarded to normalize_network.
    save_path : optional CSV destination for the propagated matrix.
    Returns a DataFrame of propagated values (samples x network genes).
    """
    starttime = time.time()
    if verbose:
        print('Alpha: ' + str(network_alpha))
    # Separate the network into connected components; fix:
    # nx.connected_component_subgraphs was removed in networkx 2.4
    subgraphs = [network.subgraph(c) for c in nx.connected_components(network)]
    # Initialize propagation results by propagating the first subgraph
    subgraph = subgraphs[0]
    subgraph_nodes = list(subgraph.nodes)
    prop_data_node_order = list(subgraph_nodes)
    # .reindex (replacing the removed .ix) keeps only this subgraph's genes;
    # genes absent from binary_matrix become all-zero seed rows
    binary_matrix_filt = np.array(binary_matrix.T.reindex(subgraph_nodes).fillna(0).T)
    subgraph_norm = normalize_network(subgraph, symmetric_norm=symmetric_norm)
    prop_data_empty = np.zeros((binary_matrix_filt.shape[0], 1))
    prop_data = fast_random_walk(network_alpha, binary_matrix_filt, subgraph_norm, prop_data_empty)
    # Propagate the remaining subgraphs, appending their columns
    for subgraph in subgraphs[1:]:
        subgraph_nodes = list(subgraph.nodes)
        prop_data_node_order = prop_data_node_order + subgraph_nodes
        binary_matrix_filt = np.array(binary_matrix.T.reindex(subgraph_nodes).fillna(0).T)
        subgraph_norm = normalize_network(subgraph, symmetric_norm=symmetric_norm)
        prop_data = fast_random_walk(network_alpha, binary_matrix_filt, subgraph_norm, prop_data)
    # Drop the initial all-zero seed column and restore labels
    prop_data_df = pd.DataFrame(data=prop_data[:, 1:], index=binary_matrix.index, columns=prop_data_node_order)
    if save_path is not None:
        prop_data_df.to_csv(save_path)
    if verbose:
        print('Network Propagation Complete: ' + str(time.time() - starttime) + ' seconds')
    return prop_data_df
78 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | """
2 | Setup module adapted from setuptools code. See:
3 | https://packaging.python.org/en/latest/distributing.html
4 | https://github.com/pypa/sampleproject
5 | """
6 |
7 | # Always prefer setuptools over distutils
8 | from setuptools import setup, find_packages
9 |
10 | setup(
11 | name='network_evaluation_tools',
12 | version='1.0.2',
13 | description='Module to perform patient and molecular network evaluation as described in Huang and Carlin, et al. 2018',
14 | url='https://github.com/idekerlab/Network_Evaluation_Tools',
15 | author='Justin Huang',
16 | author_email='jkh013@ucsd.edu',
17 | license='MIT',
18 | classifiers=[
19 | 'Development Status :: 5 - Production/Stable',
20 | 'Intended Audience :: Science/Research',
21 | 'Topic :: Software Development :: Build Tools',
22 | 'License :: OSI Approved :: MIT License',
23 | 'Programming Language :: Python :: 2.7'
24 | ],
25 | packages=find_packages(exclude=['copy', 'itertools', 'os', 're', 'time']),
26 | install_requires=[
27 | 'argparse>=1.1',
28 | 'networkx>=2.1',
29 | 'numpy>=1.11.0',
30 | 'matplotlib>=1.5.1',
31 | 'pandas>=0.19.0',
32 | 'requests>=2.13.0',
33 | 'scipy>=0.17.0',
34 | 'scikit-learn>=0.17.1',
35 | 'seaborn>=0.7.1']
36 | )
--------------------------------------------------------------------------------