├── .gitignore
├── Data
│   ├── Database_Citations.xlsx
│   ├── DisGeNET_genesets.txt
│   ├── DisGeNET_genesets_AUPRCs.csv
│   ├── DisGeNET_genesets_Effect_Size.csv
│   ├── GWAS_Catalog_genesets.txt
│   ├── Oncogenic_Components_genesets.txt
│   ├── Oncogenic_genesets_AUPRCs.csv
│   └── Oncogenic_genesets_Effect_Size.csv
├── LICENSE.txt
├── Network Evaluation Examples
│   ├── Network Evaluation Example.ipynb
│   └── run_network_evaluation.py
├── Network Processing Notebooks
│   ├── BIND Processing.ipynb
│   ├── BioGRID Processing.ipynb
│   ├── BioPlex Processing.ipynb
│   ├── ConsensusPathDB Processing.ipynb
│   ├── DIP Processing.ipynb
│   ├── Degree-Preserved Network Shufflings.ipynb
│   ├── GIANT Processing.ipynb
│   ├── GeneMANIA Processing.ipynb
│   ├── HINT Processing.ipynb
│   ├── HPRD Processing.ipynb
│   ├── HumanInteractome Processing.ipynb
│   ├── HumanNet Processing.ipynb
│   ├── InBioMap Processing.ipynb
│   ├── IntAct Processing.ipynb
│   ├── Mentha Processing.ipynb
│   ├── MultiNet Processing.ipynb
│   ├── PID Processing.ipynb
│   ├── Pathway Commons Processing.ipynb
│   ├── Reactome Processing.ipynb
│   ├── Reactome-FIs Processing.ipynb
│   ├── STRING Processing.ipynb
│   └── iRefIndex Processing.ipynb
├── README.md
├── network_evaluation_tools
│   ├── .ipynb_checkpoints
│   │   ├── PSN Construction-checkpoint.ipynb
│   │   └── SBNE Method-checkpoint.ipynb
│   ├── __init__.py
│   ├── data_import_tools.py
│   ├── gene_conversion_tools.py
│   ├── miscellaneous_functions.py
│   ├── network_evaluation_functions.py
│   └── network_propagation.py
└── setup.py
/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.pyc 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | .ipynb_checkpoints/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # Environments 84 | .env 85 | .venv 86 | env/ 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /Data/Database_Citations.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idekerlab/Network_Evaluation_Tools/4c0017e3cc3fa7767f5172cea76b4f3f7d8d0b0b/Data/Database_Citations.xlsx -------------------------------------------------------------------------------- /Data/DisGeNET_genesets_AUPRCs.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idekerlab/Network_Evaluation_Tools/4c0017e3cc3fa7767f5172cea76b4f3f7d8d0b0b/Data/DisGeNET_genesets_AUPRCs.csv -------------------------------------------------------------------------------- /Data/DisGeNET_genesets_Effect_Size.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idekerlab/Network_Evaluation_Tools/4c0017e3cc3fa7767f5172cea76b4f3f7d8d0b0b/Data/DisGeNET_genesets_Effect_Size.csv -------------------------------------------------------------------------------- /Data/Oncogenic_Components_genesets.txt: -------------------------------------------------------------------------------- 1 | C1: ERBB3 / PI3K ESRP1 PRSS8 TMEM125 GRHL2 RP11-388M20.2 RP11-354M1.2 CDH1 C1orf210 CRB3 ESRP2 GALNT3 ERBB3 TC2N CCDC64B RP11-429J17.6 TMC4 CDH3 MARVELD3 OVOL2 EPS8L1 CDS1 CDC42BPG PVRL4 ATP2C2 LSR LLGL2 MAP7 SPINT2 DSP GRB7 C19orf21 EPS8L2 C6orf132 F11R SH2D3A RP11-615I2.2 GRHL1 GPR56 CHMP4C SLC44A2 RHOD PRRG2 RP11-22C11.2 ARHGEF16 RGL3 SIGIRR TMEM184A RNF223 AIF1L MYO6 HOOK2 MYO5B ARHGEF35 CNKSR1 MARVELD2 SMPDL3B HOOK1 TTC9 ARHGEF5 CXCL16 ATP8B1 CST6 SYT7 RP4-798C17.5 COBL TPD52L1 RGL2 PRRG4 LITAF 2 | C2: MYC / E2F TEAD2 SMO FBXO2 PACSIN3 AC008738.2 CEBPA PFAS CENPV CTSL2 FKBP10 IL27RA CTSL1 TRAP1 FBLN1 VIM RP11-124N14.3 ETV4 GEMIN5 TRIM65 RP11-40H20.2 TGFB1 NOB1 CTD-2033A16.3 EXOSC2 LIX1L PPAT RPIA AC006111.1 LYAR AMPD2 ERCC1 CDKN2AIPNL CTD-2165H16.1 C20orf20 CCDC85B RP4-765A10.1 RCOR2 RNMTL1 RSAD1 PLOD1 CDCA5 LEPREL2 GNL3 CACYBP NOP56 EIF3E FAM216A CD320 EEF1A1P19 CAP2 3 | C3: RAS / WNT / PI3K PRSS3 RP11-133O22.6 SLC17A9 ALDH2 CDX2 ETV4 CENPV SH3BGRL2 DUSP6 HOXA10 ABLIM1 STEAP1 HNF1B PHLDA1 RP11-867G2.8 DSG2 C1orf106 SMAGP C19orf21 SLC27A2 SPRY2 CD320 COL17A1 TIMP1 ERBB3 DSP SGPP2 EFHD2 ANXA3 SYK LSR NOB1 HMGA1 IL18 LLGL2 GLYCTK CHDH EEF1A1P5 SLC25A6 EPB49 EEF1A1 EEF1A1P6 RPL4 YBX1 RPL6 CRYL1 RPS24 RPL5 CTD-2033A16.3 EIF3E 4 | 
C4: EMT COL5A1 SPARC CDH11 CDH13 CCDC80 LTBP1 PCOLCE DKK3 TBX3 C1S KCNMA1 NEXN LEPREL2 ANPEP HEG1 RP11-443A13.3 COL16A1 ENG CNRIP1 GAS6 RP13-530H6.2 ADAMTSL1 EFEMP1 SRPX CD99 PALLD IGFBP4 IFFO1 ITGA5 SLC44A2 GNG11 VIM RP11-124N14.3 GPC1 TSPAN5 DPYSL3 FKBP10 C1orf198 MAN1A1 ATP8B1 CAP2 RCAN1 NDST1 PLOD1 EEF1A1P5 RAB13 RP11-342D11.2 EEF1A1 TIMP1 DDAH1 5 | C5: HNF1 / PAX8 CLDN1 EPS8L2 PAX8 LEPREL1 HNF1B ANXA3 DSG2 IL18 GNG11 WWC1 F11R LIMCH1 ELOVL7 CHMP4C ARHGEF5 TMEM56 GNAI1 PTPRJ BCAM RP11-124N14.3 STXBP2 VIM RP4-798C17.5 GPX8 ARHGEF35 LITAF SPINT2 HSPG2 LSR RIPK4 RHPN2 DSP PHLDB2 EPB49 PDGFB NXN LEPROT BAIAP2L1 PLCB4 RP11-54F2.1 RP11-342D11.2 CCDC80 ABLIM1 CELSR1 CTSL1 TPD52L1 PALLD KIAA1598 NDST1 UBE2H 6 | C6: BRAF / MAPK SRPX PLAT TNFRSF19 SPARC MITF ERBB3 SPRY2 DUSP6 GPR56 RENBP GNG11 VIM RP11-124N14.3 ETV4 PHLDA1 ST6GALNAC2 ENG NES SPRY4 AGPAT9 PHLDA3 TIMP1 CTSL1 RCAN1 PYGB FKBP10 NFATC2 IFFO1 PLOD1 RIPK4 EEF1A1P5 EEF1A1 UBL3 YBX1 EEF1A1P6 RPL4 SLC20A1 RPL6 CHST11 SLC6A15 VAT1 SLC25A6 ENTPD6 RPL5 CD320 HMGB1 GLT25D1 SPRED1 SSH1 HMGA1 7 | C7: TNF / NF-kB NT5E CDCP1 PHLDA1 CALB2 STEAP1 NRP1 RP11-342D11.2 PLAT MT1E ELK3 ANTXR2 AGPAT9 IRAK2 LINC00460 TM4SF19 RP11-394J1.2 HPCAL1 TGFB1 PRDM8 STX1A HMGA2 TIMP1 FMNL1 RAB31 ITGA5 PDP1 HRH1 CHST11 IL31RA TMEM158 RP11-124N14.3 C11orf68 VIM IGFBP4 ETV4 EFHD2 DUSP6 AC138150.4 TSPAN5 SLC20A1 MAP4K4 CCDC85B WDR54 FUT8 ADAM19 DST GEM DPYSL3 IL18 PHLDB2 DNER DSG2 CMTM3 AC005035.1 ARSJ GFPT2 CTSL1 EFEMP1 TPBG HMGA1 CAPRIN2 LYAR STMN3 FOXL1 GPX8 STAMBPL1 STK10 ARAP3 SMAGP HJURP RP11-221N13.3 ANXA3 CTHRC1 ITPRIP FKBP10 CLDN1 TOMM34 SERPINA1 TRBC2 8 | C8: MYC KIF1A CHGB DPYSL3 SYP SYT1 STMN3 PKIA VANGL2 AP3B2 UNC13A CENPV MAPRE2 CTA-221G9.10 DLL1 CNTNAP2 JPH1 ELOVL2 TMEM145 STXBP1 RP11-122A3.2 RCOR2 DNAJC6 ZNF512 STX1A VGF RIC3 SLC6A15 RIMS2 AGPAT5 RAP2A SSBP3 CD320 RIMS3 RP11-158I13.2 CXXC1 TPD52 CCDC64 HOOK1 SYT7 WDR54 IVNS1ABP NOP56 EEF1A1P5 AC005035.1 YBX1 EEF1A1 RAB3B EEF1A1P6 AC012379.1 PRPF19 RPL6 MAP4K4 HMGB1 CDCA5 RPL4 TTL RPL5 ATP2A2 U2AF2 SLC25A6 TTLL7 IPO5 YBX1P1 GNAI1 CACYBP MCL1 RPS24 SLC44A5 TSPYL2 PIH1D1 TSHZ1 HMGA1 EIF3E RHBDD2 GSK3B GOLIM4 GNL3 CBLN1 TMTC4 KHDRBS3 NEURL1B SH3BGRL2 KATNB1 GART PEX5L EIF3H ALDH2 SCN3B HJURP PSD PPAT CTB-79E8.2 SLC20A1 POLR3GL METTL9 GAB2 AMOTL1 ARL2 DDRGK1 COPS8 PYGB MEST NELF AGPAT6 MFSD6 EXOSC2 KIAA1324 RTN2 DAP TOMM34 ID1 GLT25D1 VAT1 FAM216A SRD5A1 ACN9 E2F4 TRAP1 CDKN2AIPNL DBF4 RPIA CXADR 9 | C9: RAS / AP1 KRT17 KRT5 GPR87 DSC3 DSG3 FBLN1 COL17A1 CDH3 FAT2 RP11-615I2.2 AL391137.1 NXN LEPREL1 IL18 CLDN1 PPP1R14C EFEMP1 GPC1 RHOD CDH1 CTSL2 CCDC80 DSP VANGL2 ST6GALNAC2 PHLDA3 TMEM40 LY6D C10orf54 CTSH ANXA3 BCAM RP11-354M1.2 CXCL16 FGFR2 DSG2 CREG1 RIPK4 LIMK2 MMP28 ID1 LSR F11R LITAF CELSR2 DAB2IP PHLDB2 C1orf106 TPD52L1 GNAI1 10 | -------------------------------------------------------------------------------- /Data/Oncogenic_genesets_AUPRCs.csv: -------------------------------------------------------------------------------- 1 | Oncogenic Component Gene Set,GeneMANIA,GIANT,STRING,ReactomeFI,Reactome,MultiNet,PathwayCommons,HumanNet,BioPlex,DIP,InBioMap,BioGRID,BIND,Mentha,IRefIndex,PID,HPRD,IntAct,ConsensusPathDB,HINT,HumanInteractome 2 | C1: ERBB3 / PI3K,79.504,88.463,45.321,9.31,9.144,13.522,8.321,25.382,29.247,-0.007,8.148,16.023,3.102,4.689,4.741,7.379,13.017,3.638,6.904,1.435,-0.254 3 | C2: MYC / E2F,12.144,4.593,10.196,1.583,32.015,2.582,1.592,-0.428,2.878,-1.108,0.699,0.985,-3.08,0.356,-0.487,-1.122,-0.851,0.172,-0.771,0.758,0.237 4 | C3: RAS / WNT / 
PI3K,13.709,24.005,3.724,4.536,6.139,1.996,6.138,1.109,0.347,8.047,0.94,0.942,9.901,0.587,0.301,0.252,2.324,1.861,-0.288,0.736,-0.622 5 | C4: EMT,46.876,56.378,49.475,8.366,12.124,6.63,14.062,16.828,6.057,7.183,7.538,6.861,3.157,5.922,3.089,0.478,2.729,3.528,6.462,1.059,-1.832 6 | C5: HNF1 / PAX8,18.51,12.137,3.211,6.21,5.811,14.762,3.296,4.714,17.87,2.503,6.311,2.412,33.288,3.325,3.186,44.088,1.666,0.784,5.496,3.214,0.698 7 | C6: BRAF / MAPK,33.463,37.398,8.971,7.753,0.6,5.81,11.364,2.604,9.481,8.875,0.752,2.887,-1.686,1.98,1.22,0.66,1.176,2.421,1.623,0.883,-0.98 8 | C7: TNF / NF-kB,25.602,75.85,3.472,5.288,5.415,0.043,0.039,4.025,0.666,4.193,1.091,-0.009,-0.242,0.091,2.078,-0.565,-0.538,0.483,-0.265,-0.332,2.302 9 | C8: MYC,19.938,12.769,14.662,7.637,7.606,5.71,7.618,0.515,0.809,0.992,4.678,1.226,-1.186,1.449,2.012,3.4,2.329,0.815,-0.345,2.297,1.027 10 | C9: RAS / AP1,63.402,75.232,6.942,15.589,13.236,18.125,6.703,19.884,3.203,29.165,6.068,7.148,294.849,4.349,6.704,10.111,6.48,1.973,5.423,0.892,3.703 -------------------------------------------------------------------------------- /Data/Oncogenic_genesets_Effect_Size.csv: -------------------------------------------------------------------------------- 1 | Oncogenic Component Gene Set,STRING,ConsensusPathDB,HumanNet,Reactome,ReactomeFI,GIANT,InBioMap,GeneMANIA,DIP,MultiNet,HINT,IRefIndex,PathwayCommons,HPRD,BioGRID,Mentha,IntAct,PID,BioPlex,BIND,HumanInteractome 2 | C1: ERBB3 / PI3K,0.5135262,0.058763726,0.692236671,0.122222216,0.15754273,6.684008471,0.082859901,4.067461582,-4.31E-05,0.189440939,0.016354416,0.064830058,0.068458882,0.273601605,0.545721195,0.051626723,0.031076848,0.118967686,0.994315007,0.005668391,-0.001785174 3 | C2: MYC / E2F,0.092846659,-0.024804717,-0.011586723,0.141646337,0.021769631,0.126131444,0.011438366,0.229788537,-0.007466792,0.042600429,0.024478392,-0.013013427,0.01547359,-0.016721966,0.021910887,0.011695219,0.004778901,-0.002736735,0.056399241,-0.01325828,0.000747868 4 | C3: RAS / WNT / PI3K,0.058378867,-0.015405315,0.050083469,0.116639188,0.130570541,0.423772198,0.024225851,0.353977705,0.163708126,0.030888897,0.017779446,0.009526628,0.302163126,0.058046682,0.019992612,0.017686172,0.041009049,0.003238783,0.005259451,0.094973291,-0.003334094 5 | C4: EMT,0.502137765,0.078045291,0.539404955,0.171974632,0.143326712,1.56712006,0.108628694,1.578435016,0.034923587,0.045392111,0.0082312,0.080228263,0.377918883,0.048341957,0.107700053,0.074111893,0.082371847,0.00104432,0.147561307,0.014931652,-0.001871211 6 | C5: HNF1 / PAX8,0.067260554,0.051132155,0.048942587,0.086221776,0.124037539,0.359179111,0.034975803,0.493826433,0.025733051,0.12134834,0.038769791,0.060915915,0.039247966,0.039383136,0.070949222,0.045455993,0.009638796,0.584454442,0.4182192,0.049113775,0.00191155 7 | C6: BRAF / MAPK,0.16734618,0.040467197,0.144514141,0.008246205,0.124472595,0.660173496,0.029069971,0.636674143,0.171066126,0.066039347,0.022348522,0.039525486,0.347191285,0.045675844,0.078249904,0.056334395,0.048530168,0.007730904,0.242063957,-0.033614766,-0.007001517 8 | C7: TNF / NF-kB,0.070352307,-0.003042142,0.078385588,0.113450311,0.064574197,1.614945003,0.014547852,0.874980732,0.150039047,0.000784241,-0.005916828,0.024739026,0.00103285,-0.026220378,-0.00014158,0.001885841,0.013316024,-0.004948463,0.014466807,-0.002359939,0.019704809 9 | C8: 
MYC,0.334557167,-0.016903927,0.02853045,0.282954697,0.306728046,0.350495141,0.138018638,0.407845466,0.06911551,0.226512037,0.059835179,0.090457074,0.298779819,0.11275682,0.058599939,0.050242058,0.029014753,0.12451458,0.034957168,-0.079153753,0.007960667 10 | C9: RAS / AP1,0.227817837,0.058713082,0.425458334,0.451898009,0.508070925,2.110367453,0.047765235,1.711314568,0.376062304,0.171777257,0.005529503,0.103466426,0.120951911,0.172031028,0.233638383,0.048789225,0.018321645,0.235847681,0.061694523,0.305124847,0.00876537 -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) Idekerlab 2017 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /Network Evaluation Examples/run_network_evaluation.py: -------------------------------------------------------------------------------- 1 | ################################################################### 2 | # Command line script to analyze network on node sets of interest # 3 | ################################################################### 4 | 5 | from network_evaluation_tools import network_evaluation_functions as nef 6 | from network_evaluation_tools import data_import_tools as dit 7 | from network_evaluation_tools import gene_conversion_tools as gct 8 | import argparse 9 | import os 10 | import pandas as pd 11 | 12 | # Checking valid alpha and p values (Range is 0.0-1.0 exclusive) 13 | # Value can also be None. 
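# Usage sketch (file names here are hypothetical; the flags are defined in the parser further down):
#   python run_network_evaluation.py network.txt genesets.txt AUPRCs.csv -a 0.5 -n 30 -c 4
# The validators below are wired into argparse through its `type=` hook, so bad values fail
# at parse time: restricted_float('0.5') returns 0.5, restricted_float('1.5') raises
# argparse.ArgumentTypeError, and positive_int('0') raises argparse.ArgumentTypeError.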
14 | def restricted_float(x): 15 | if x is not None: 16 | x = float(x) 17 | if x <= 0.0 or x >= 1.0: 18 | raise argparse.ArgumentTypeError("%r not in range (0.0, 1.0) exclusive"%(x,)) 19 | return x 20 | 21 | # Checking valid integer values (for all values that must be >0) 22 | def positive_int(x): 23 | x = int(x) 24 | if x <= 0: 25 | raise argparse.ArgumentTypeError("%s must be a positive integer" % x) 26 | return x 27 | 28 | # Valid file path check (Does not check file formatting, but checks if given path exists and is readable) 29 | def valid_infile(in_file): 30 | if not os.path.isfile(in_file): 31 | raise argparse.ArgumentTypeError("{0} is not a valid input file path".format(in_file)) 32 | if os.access(in_file, os.R_OK): 33 | return in_file 34 | else: 35 | raise argparse.ArgumentTypeError("{0} is not a readable input file".format(in_file)) 36 | 37 | # Valid output directory path check (Checks if the output directory path can be found and written to by removing the given filename from the full path) 38 | # Note: os.path.dirname is used here so that path splitting works the same on Linux, Mac OSX and Windows 39 | def valid_outfile(out_file): 40 | outdir = os.path.dirname(out_file) 41 | if not os.path.isdir(outdir): 42 | raise argparse.ArgumentTypeError("{0} is not a valid output directory".format(outdir)) 43 | if os.access(outdir, os.W_OK): 44 | return out_file 45 | else: 46 | raise argparse.ArgumentTypeError("{0} is not a writable output directory".format(outdir)) 47 | 48 | if __name__ == "__main__": 49 | # Network Evaluation Setup Variables 50 | parser = argparse.ArgumentParser(description='Analyze network performance on ability to aggregate sets of nodes in network space.') 51 | parser.add_argument("network_path", type=valid_infile, 52 | help='Path to file of network to be evaluated. File must be a 2-column edge list where each line is a gene interaction separated by a common delimiter.') 53 | parser.add_argument("node_sets_file", type=valid_infile, 54 | help='Path to file of node sets. Each line is a list, separated by a common delimiter. The first item in each line will be the name of the node set.') 55 | parser.add_argument("actual_AUPRCs_save_path", type=valid_outfile, 56 | help='CSV file path of network evaluation result scores (AUPRCs). At minimum, this script computes and saves these values. Must be in a writable directory.') 57 | parser.add_argument('-v', '--verbose', default=False, action="store_true", required=False, 58 | help='Verbosity flag for reporting on network evaluation steps.') 59 | parser.add_argument('-netd', '--net_file_delim', type=str, default='\t', required=False, 60 | help='Delimiter used in network file between columns. Default is tab white space.') 61 | parser.add_argument('-setd', '--set_file_delim', type=str, default='\t', required=False, 62 | help='Delimiter used in node set file to delimit lists. Default is tab white space.') 63 | parser.add_argument("-p", "--sample_p", type=restricted_float, default=None, required=False, 64 | help='Sub-sampling percentage for node sets of interest. Default is None. Each gene set\'s p is automatically determined by the network in this case.') 65 | parser.add_argument("-a", "--alpha", type=restricted_float, default=None, required=False, 66 | help='Propagation constant to use in the propagation of node sub-samples over given network.
Overrides alpha calculation model if given.') 67 | parser.add_argument("-n", "--sub_sample_iter", type=positive_int, default=30, required=False, 68 | help='Number of times to perform sub-sampling during performance recovery (AUPRC) calculation for each node set. Default is 30.') 69 | parser.add_argument('-c', '--cores', type=positive_int, default=1, required=False, 70 | help='Number of cores to be utilized for the performance calculation step. NOTE: Each core must have enough memory to store at least a network-sized square matrix and the given node sets to perform calculations.') 71 | parser.add_argument('-bg', '--background', type=str, default='network', choices=['genesets', 'network'], required=False, 72 | help='Establishes the background gene set to calculate AUPRC over. Default is to use all genes in the network; can be changed to use only genes from the union of all gene sets tested (i.e. disease genes only).') 73 | 74 | # Network performance score calculations (with null networks) 75 | parser.add_argument("-i", "--null_iter", type=positive_int, default=30, required=False, 76 | help='Number of times to perform degree-preserved shuffling of network to construct performance value null distribution. Default is 30. If this value is >0, either --null_network_outdir or --null_AUPRCs_save_path will be required.') 77 | parser.add_argument('-nno', '--null_network_outdir', type=valid_outfile, default=None, required=False, 78 | help='File directory to save null networks after generation.') 79 | parser.add_argument('-nsp', '--null_AUPRCs_save_path', type=valid_outfile, default=None, required=False, 80 | help='CSV file path of where to save null network evaluation results. Used in the calculation of network performance scores and performance gain scores.') 81 | parser.add_argument('-psp', '--performance_save_path', type=valid_outfile, default=None, required=False, 82 | help='CSV file path of where to save network evaluation results as z-scores.') 83 | parser.add_argument('-gsp', '--performance_gain_save_path', type=valid_outfile, default=None, required=False, 84 | help='CSV file path of where to save network evaluation results as gain in AUPRC over median null AUPRCs.') 85 | 86 | args = parser.parse_args() 87 | # If null networks need to be constructed 88 | if args.null_iter > 0: 89 | # A file path must be given to either save the null networks or the null network performance 90 | if (args.null_AUPRCs_save_path is None) and (args.null_network_outdir is None): 91 | parser.error('Save path required for null network edge lists or null network evaluation results.') 92 | 93 | #################################### 94 | ##### Network Evaluation Setup ##### 95 | #################################### 96 | 97 | # Limit core usage (if defined) 98 | import mkl 99 | mkl.set_num_threads(args.cores) 100 | 101 | # Load Network 102 | network = dit.load_network_file(args.network_path, verbose=args.verbose) 103 | network_size = len(network.nodes()) 104 | 105 | # Load Gene sets 106 | genesets = dit.load_node_sets(args.node_sets_file, verbose=args.verbose) 107 | 108 | # Calculate gene set sub-sample rate with network (if not set) 109 | if args.sample_p is None: 110 | genesets_p = nef.calculate_p(network, genesets) 111 | else: 112 | genesets_p = {geneset:args.sample_p for geneset in genesets} 113 | if args.verbose: 114 | print 'Gene set sub-sample rates set' 115 | 116 | # Calculate network kernel (also determine propagation constant if not set) 117 | kernel = nef.construct_prop_kernel(network, alpha=args.alpha, verbose=True) 118 | 119 | # Change background
gene list if needed 120 | if args.background == 'genesets': 121 | background_node_set = set() 122 | for geneset in genesets: 123 | background_node_set = background_node_set.union(genesets[geneset]) 124 | background_nodes = list(background_node_set.intersection(set(kernel.index))) 125 | else: 126 | background_nodes = list(kernel.index) 127 | 128 | 129 | ############################################ 130 | ##### Network Performance Calculations ##### 131 | ############################################ 132 | 133 | # Calculate AUPRC for each gene set on actual network (large networks are >=10k nodes) 134 | if network_size < 10000: 135 | actual_AUPRC_values = nef.small_network_AUPRC_wrapper(kernel, genesets, genesets_p, n=args.sub_sample_iter, cores=args.cores, bg=background_nodes, verbose=True) 136 | else: 137 | actual_AUPRC_values = nef.large_network_AUPRC_wrapper(kernel, genesets, genesets_p, n=args.sub_sample_iter, cores=args.cores, bg=background_nodes, verbose=True) 138 | 139 | # Save the actual network's AUPRC values 140 | actual_AUPRC_values.to_csv(args.actual_AUPRCs_save_path) 141 | 142 | 143 | ################################################# 144 | ##### Null Network Performance Calculations ##### 145 | ################################################# 146 | 147 | # If number of null networks > 0: 148 | if args.null_iter > 0: 149 | null_AUPRCs = [] 150 | for i in range(args.null_iter): 151 | # Construct null networks and calculate AUPRCs for each gene set on each null network 152 | shuffNet = nef.shuffle_network(network, max_tries_n=10, verbose=True) 153 | # Save null network if null network output directory is given 154 | if args.null_network_outdir is not None: 155 | shuffNet_edges = shuffNet.edges() 156 | gct.write_edgelist(shuffNet_edges, args.null_network_outdir+'shuffNet_'+repr(i+1)+'.txt', 157 | delimiter='\t', binary=True) 158 | if args.verbose: 159 | print 'Shuffled Network', i+1, 'written to file' 160 | # Construct null network kernel 161 | shuffNet_kernel = nef.construct_prop_kernel(shuffNet, alpha=args.alpha, verbose=False) 162 | # Calculate null network AUPRCs 163 | if network_size < 10000: 164 | shuffNet_AUPRCs = nef.small_network_AUPRC_wrapper(shuffNet_kernel, genesets, genesets_p, n=args.sub_sample_iter, cores=args.cores, bg=background_nodes, verbose=True) 165 | else: 166 | shuffNet_AUPRCs = nef.large_network_AUPRC_wrapper(shuffNet_kernel, genesets, genesets_p, n=args.sub_sample_iter, cores=args.cores, bg=background_nodes, verbose=True) 167 | null_AUPRCs.append(shuffNet_AUPRCs) 168 | # Construct table of null AUPRCs 169 | null_AUPRCs_table = pd.concat(null_AUPRCs, axis=1) 170 | null_AUPRCs_table.columns = ['shuffNet'+repr(i+1) for i in range(len(null_AUPRCs))] 171 | if args.verbose: 172 | print 'All null network gene set AUPRCs calculated' 173 | # Save null network AUPRCs if save path is given 174 | if args.null_AUPRCs_save_path is not None: 175 | null_AUPRCs_table.to_csv(args.null_AUPRCs_save_path) 176 | # Calculate performance score for each gene set's AUPRC if performance score save path is given 177 | if args.performance_save_path is not None: 178 | network_performance = nef.calculate_network_performance_score(actual_AUPRC_values, null_AUPRCs_table, verbose=args.verbose) 179 | network_performance.to_csv(args.performance_save_path) 180 | # Calculate network performance gain over median null AUPRC if AUPRC performance gain save path is given 181 | if args.performance_gain_save_path is not None: 182 | network_perf_gain =
nef.calculate_network_performance_gain(actual_AUPRC_values, null_AUPRCs_table, verbose=args.verbose) 183 | network_perf_gain.to_csv(args.performance_gain_save_path) 184 | -------------------------------------------------------------------------------- /Network Processing Notebooks/BIND Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "from network_evaluation_tools import gene_conversion_tools as gct" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "## Load BIND Raw Data\n", 20 | "#### Source: http://www.pathwaycommons.org/archives/PC2/v8/PathwayCommons.8.bind.BINARY_SIF.hgnc.txt.sif.gz\n", 21 | "Downloaded: June 15, 2017 \n", 22 | "Last Updated (via Pathway Commons v9 datasources.txt file): December 15, 2010 \n", 23 | "Note: For this processing, we used the data file provided in the PathwayCommons v8 distribution. The SIF file provided by Pathway Commons v9 at the given time only yields 13078 interactions, significantly fewer than the file provided by the v8 distribution. It is unclear where those interactions have gone, so at this time we will be using the Pathway Commons v8 distribution of BIND. \n", 24 | "Also note: The text file has more lines than the sif file in Pathway Commons. However, the text file has some interactions that are unclear how to resolve, so in this case we will use the sif file provided by Pathway Commons" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 30, 30 | "metadata": { 31 | "collapsed": false 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 36 | "BIND_Raw = pd.read_csv(wd+'Network_Data_Raw/PathwayCommons.8.bind.BINARY_SIF.hgnc.txt.sif',sep='\\t', header=-1)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 32, 42 | "metadata": { 43 | "collapsed": false, 44 | "scrolled": true 45 | }, 46 | "outputs": [ 47 | { 48 | "name": "stdout", 49 | "output_type": "stream", 50 | "text": [ 51 | "Edges in BIND: 72780\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "# Convert table of interactions to edgelist (no scores given)\n", 57 | "# Also no gene symbol conversion necessary because network is given in symbol format already\n", 58 | "BIND_edgelist = BIND_Raw[[0, 2]].values.tolist()\n", 59 | "print 'Edges in BIND:', len(BIND_edgelist)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 33, 65 | "metadata": { 66 | "collapsed": false 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "# Sort each edge representation for filtering\n", 71 | "BIND_edgelist_sorted = [sorted(edge) for edge in BIND_edgelist]" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 34, 77 | "metadata": { 78 | "collapsed": false 79 | }, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "72780 input edges\n", 86 | "0 self-edges removed\n", 87 | "0 edges with un-mapped genes removed\n", 88 | "0 duplicate edges removed\n", 89 | "Edge list filtered: 0.19 seconds\n", 90 | "72780 Edges remaining\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "# Filter edgelist for duplicate nodes and for self-edges\n", 96 | "BIND_edgelist_filt = gct.filter_converted_edgelist(BIND_edgelist_sorted)" 97 | ] 98 | }, 99 | { 100 |
"cell_type": "code", 101 | "execution_count": 35, 102 | "metadata": { 103 | "collapsed": false 104 | }, 105 | "outputs": [ 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "Edge list saved: 0.09 seconds\n" 111 | ] 112 | } 113 | ], 114 | "source": [ 115 | "# Save genelist to file\n", 116 | "outdir = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/Network_SIFs_Symbol/'\n", 117 | "gct.write_edgelist(BIND_edgelist_filt, outdir+'BIND_Symbol.sif')" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": { 124 | "collapsed": true 125 | }, 126 | "outputs": [], 127 | "source": [] 128 | } 129 | ], 130 | "metadata": { 131 | "kernelspec": { 132 | "display_name": "Python 2", 133 | "language": "python", 134 | "name": "python2" 135 | }, 136 | "language_info": { 137 | "codemirror_mode": { 138 | "name": "ipython", 139 | "version": 2 140 | }, 141 | "file_extension": ".py", 142 | "mimetype": "text/x-python", 143 | "name": "python", 144 | "nbconvert_exporter": "python", 145 | "pygments_lexer": "ipython2", 146 | "version": "2.7.11" 147 | } 148 | }, 149 | "nbformat": 4, 150 | "nbformat_minor": 0 151 | } 152 | -------------------------------------------------------------------------------- /Network Processing Notebooks/BioGRID Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "import pandas as pd\n", 13 | "import itertools" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "## Load BioGRID Raw Data\n", 21 | "#### Source (MITAB): http://thebiogrid.org/downloads/archives/Release%20Archive/BIOGRID-3.4.149/BIOGRID-ORGANISM-3.4.149.tab2.zip\n", 22 | "Downloaded: June 15, 2017 \n", 23 | "Last Updated: June 01, 2017 \n", 24 | "Notes for download: There is a new verision of BioGRID released on the first of every month. Download the organism specific files to extract only human interactions from the database. \n", 25 | "Notes for processing: This is the file for human protein interactions, however, not all interactions may be human-human interactions. These need to be filtered. 
There is a \"Score\" column that could be used for filtering, but most of these values appear to be missing, so they will be ignored when processing BioGRID" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "Raw edge count in BioGRID: 394749\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 45 | "BioGRID_Raw = pd.read_csv(wd+'Network_Data_Raw/BioGRID/BIOGRID-ORGANISM-3.4.149.tab2/BIOGRID-ORGANISM-Homo_sapiens-3.4.149.tab2.txt',sep='\\t', low_memory=False)\n", 46 | "print 'Raw edge count in BioGRID:', len(BioGRID_Raw)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 4, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/plain": [ 59 | "physical 392779\n", 60 | "genetic 1970\n", 61 | "Name: Experimental System Type, dtype: int64" 62 | ] 63 | }, 64 | "execution_count": 4, 65 | "metadata": {}, 66 | "output_type": "execute_result" 67 | } 68 | ], 69 | "source": [ 70 | "# Show not all interactions in BioGRID are physical PPI, though the overwhelming majority are\n", 71 | "BioGRID_Raw['Experimental System Type'].value_counts()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 27, 77 | "metadata": { 78 | "collapsed": false, 79 | "scrolled": true 80 | }, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/plain": [ 85 | "9606 372979\n", 86 | "10090 17963\n", 87 | "11676 1591\n", 88 | "10116 570\n", 89 | "559292 355\n", 90 | "Name: Organism Interactor A, dtype: int64" 91 | ] 92 | }, 93 | "execution_count": 27, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "# Not all interactions are from Human\n", 100 | "BioGRID_Raw['Organism Interactor A'].value_counts().head()" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 28, 106 | "metadata": { 107 | "collapsed": false, 108 | "scrolled": true 109 | }, 110 | "outputs": [ 111 | { 112 | "data": { 113 | "text/plain": [ 114 | "9606 389334\n", 115 | "10090 2543\n", 116 | "559292 1045\n", 117 | "10116 708\n", 118 | "11676 318\n", 119 | "Name: Organism Interactor B, dtype: int64" 120 | ] 121 | }, 122 | "execution_count": 28, 123 | "metadata": {}, 124 | "output_type": "execute_result" 125 | } 126 | ], 127 | "source": [ 128 | "# Not all interactions are from Human\n", 129 | "BioGRID_Raw['Organism Interactor B'].value_counts().head()" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "#### Since there are so few genetic interactions relative to physical interactions, we will not filter these edges.
However, we will filter all interactions that are not labelled as human-human interactions" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "#### Keep only human-human interactions" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 9, 149 | "metadata": { 150 | "collapsed": false 151 | }, 152 | "outputs": [ 153 | { 154 | "name": "stdout", 155 | "output_type": "stream", 156 | "text": [ 157 | "Human-Human only interactions in BioGRID 3.4.149: 367564\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "BioGRID_Human_Only = BioGRID_Raw[(BioGRID_Raw['Organism Interactor A']==9606) & (BioGRID_Raw['Organism Interactor B']==9606)]\n", 163 | "print 'Human-Human only interactions in BioGRID 3.4.149:', len(BioGRID_Human_Only)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 29, 169 | "metadata": { 170 | "collapsed": false 171 | }, 172 | "outputs": [ 173 | { 174 | "data": { 175 | "text/plain": [ 176 | "Series([], Name: Official Symbol Interactor A, dtype: object)" 177 | ] 178 | }, 179 | "execution_count": 29, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "# Any missing symbol names in column A?\n", 186 | "BioGRID_Human_Only['Official Symbol Interactor A'][BioGRID_Human_Only['Official Symbol Interactor A']=='-']" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 30, 192 | "metadata": { 193 | "collapsed": false 194 | }, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "text/plain": [ 199 | "Series([], Name: Official Symbol Interactor B, dtype: object)" 200 | ] 201 | }, 202 | "execution_count": 30, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "# Any missing symbol names in column B?\n", 209 | "BioGRID_Human_Only['Official Symbol Interactor B'][BioGRID_Human_Only['Official Symbol Interactor B']=='-']" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 32, 215 | "metadata": { 216 | "collapsed": false 217 | }, 218 | "outputs": [ 219 | { 220 | "name": "stdout", 221 | "output_type": "stream", 222 | "text": [ 223 | "Edges in BioGRID: 367564\n" 224 | ] 225 | } 226 | ], 227 | "source": [ 228 | "# Convert table of interactions to edgelist (no scores given)\n", 229 | "# Also no gene symbol conversion necessary because network is given in symbol format already\n", 230 | "BioGRID_edgelist = BioGRID_Human_Only[['Official Symbol Interactor A', 'Official Symbol Interactor B']].values.tolist()\n", 231 | "print 'Edges in BioGRID:', len(BioGRID_edgelist)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 33, 237 | "metadata": { 238 | "collapsed": true 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "# Sort each edge representation for filtering\n", 243 | "BioGRID_edgelist_sorted = [sorted(edge) for edge in BioGRID_edgelist]" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 34, 249 | "metadata": { 250 | "collapsed": false 251 | }, 252 | "outputs": [ 253 | { 254 | "name": "stdout", 255 | "output_type": "stream", 256 | "text": [ 257 | "367564 input edges\n", 258 | "4598 self-edges removed\n", 259 | "0 edges with un-mapped genes removed\n", 260 | "104709 duplicate edges removed\n", 261 | "Edge list filtered: 0.29 seconds\n", 262 | "258257 Edges remaining\n" 263 | ] 264 | } 265 | ], 266 | "source": [ 267 | "# Filter edgelist for duplicate nodes and for self-edges\n", 268 | "BioGRID_edgelist_filt = 
gct.filter_converted_edgelist(BioGRID_edgelist_sorted)" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 37, 274 | "metadata": { 275 | "collapsed": false 276 | }, 277 | "outputs": [ 278 | { 279 | "name": "stdout", 280 | "output_type": "stream", 281 | "text": [ 282 | "Edge list saved: 0.21 seconds\n" 283 | ] 284 | } 285 | ], 286 | "source": [ 287 | "# Save edge list to file\n", 288 | "outdir = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/Network_SIFs_Symbol/'\n", 289 | "gct.write_edgelist(BioGRID_edgelist_filt, outdir+'BioGRID_Symbol.sif')" 290 | ] 291 | } 292 | ], 293 | "metadata": { 294 | "kernelspec": { 295 | "display_name": "Python 2", 296 | "language": "python", 297 | "name": "python2" 298 | }, 299 | "language_info": { 300 | "codemirror_mode": { 301 | "name": "ipython", 302 | "version": 2 303 | }, 304 | "file_extension": ".py", 305 | "mimetype": "text/x-python", 306 | "name": "python", 307 | "nbconvert_exporter": "python", 308 | "pygments_lexer": "ipython2", 309 | "version": "2.7.11" 310 | } 311 | }, 312 | "nbformat": 4, 313 | "nbformat_minor": 0 314 | } 315 | -------------------------------------------------------------------------------- /Network Processing Notebooks/BioPlex Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "import pandas as pd\n", 13 | "import itertools" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "## Load BioPlex Raw Data\n", 21 | "#### Source: http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv\n", 22 | "Downloaded: June 20, 2017 \n", 23 | "Last Updated: December 01, 2016 \n", 24 | "This latest update of BioPlex (2.0 v4) is associated with the recent paper: Huttlin et al. (2017) Nature doi: 10.1038/nature22366 \n", 25 | "Note: We could use the 'p(Interaction)' column as a scoring metric to filter the network further; however, a top 10% filtering of this network would yield a network with <6000 interactions, so we did not feel it was necessary to filter the network further for analysis." 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 13, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "Raw edge count in BioPlex: 56553\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 45 | "BioPlex_Raw = pd.read_csv(wd+'Network_Data_Raw/BioPlex_interactionList_v4a.tsv',sep='\\t')\n", 46 | "print 'Raw edge count in BioPlex:', len(BioPlex_Raw)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 14, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/html": [ 59 | "
\n", 60 | "\n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | "
GeneAGeneBUniprotAUniprotBSymbolASymbolBp(Wrong)p(No Interaction)p(Interaction)
0100728378P00813A5A3E0ADAPOTEF2.380858e-090.0003320.999668
1100345651P00813Q562R1ADAACTBL29.786437e-180.2119140.788086
2222389708Q8N7W2Q07021BEND7C1QBP2.962215e-170.0056450.994355
32223894038Q8N7W2O75096BEND7LRP43.302994e-100.0002800.999720
46451213312Q6ZMN8P11142CCNI2HSPA82.060285e-160.0362350.963765
\n", 138 | "
" 139 | ], 140 | "text/plain": [ 141 | " GeneA GeneB UniprotA UniprotB SymbolA SymbolB p(Wrong) \\\n", 142 | "0 100 728378 P00813 A5A3E0 ADA POTEF 2.380858e-09 \n", 143 | "1 100 345651 P00813 Q562R1 ADA ACTBL2 9.786437e-18 \n", 144 | "2 222389 708 Q8N7W2 Q07021 BEND7 C1QBP 2.962215e-17 \n", 145 | "3 222389 4038 Q8N7W2 O75096 BEND7 LRP4 3.302994e-10 \n", 146 | "4 645121 3312 Q6ZMN8 P11142 CCNI2 HSPA8 2.060285e-16 \n", 147 | "\n", 148 | " p(No Interaction) p(Interaction) \n", 149 | "0 0.000332 0.999668 \n", 150 | "1 0.211914 0.788086 \n", 151 | "2 0.005645 0.994355 \n", 152 | "3 0.000280 0.999720 \n", 153 | "4 0.036235 0.963765 " 154 | ] 155 | }, 156 | "execution_count": 14, 157 | "metadata": {}, 158 | "output_type": "execute_result" 159 | } 160 | ], 161 | "source": [ 162 | "BioPlex_Raw.head()" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 15, 168 | "metadata": { 169 | "collapsed": false 170 | }, 171 | "outputs": [ 172 | { 173 | "name": "stdout", 174 | "output_type": "stream", 175 | "text": [ 176 | "Edges in BIND: 56553\n" 177 | ] 178 | } 179 | ], 180 | "source": [ 181 | "# Convert table of interactions to edgelist (no scores given)\n", 182 | "# Also no gene symbol conversion necessary because network is given in symbol format already\n", 183 | "BioPlex_edgelist = BioPlex_Raw[['SymbolA', 'SymbolB']].values.tolist()\n", 184 | "print 'Edges in BIND:', len(BioPlex_edgelist)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 16, 190 | "metadata": { 191 | "collapsed": true 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "# Sort each edge representation for filtering\n", 196 | "BioPlex_edgelist_sorted = [sorted(edge) for edge in BioPlex_edgelist]" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 17, 202 | "metadata": { 203 | "collapsed": false 204 | }, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "56553 input edges\n", 211 | "0 self-edges removed\n", 212 | "0 edges with un-mapped genes removed\n", 213 | "0 duplicate edges removed\n", 214 | "Edge list filtered: 0.21 seconds\n", 215 | "56553 Edges remaining\n" 216 | ] 217 | } 218 | ], 219 | "source": [ 220 | "# Filter edgelist for duplicate nodes and for self-edges\n", 221 | "BioPlex_edgelist_filt = gct.filter_converted_edgelist(BioPlex_edgelist)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 18, 227 | "metadata": { 228 | "collapsed": false 229 | }, 230 | "outputs": [ 231 | { 232 | "name": "stdout", 233 | "output_type": "stream", 234 | "text": [ 235 | "Edge list saved: 0.1 seconds\n" 236 | ] 237 | } 238 | ], 239 | "source": [ 240 | "# Write network to file\n", 241 | "gct.write_edgelist(BioPlex_edgelist_filt, wd+'Network_SIFs_Symbol/BioPlex_Symbol.sif', binary=True)" 242 | ] 243 | } 244 | ], 245 | "metadata": { 246 | "kernelspec": { 247 | "display_name": "Python 2", 248 | "language": "python", 249 | "name": "python2" 250 | }, 251 | "language_info": { 252 | "codemirror_mode": { 253 | "name": "ipython", 254 | "version": 2 255 | }, 256 | "file_extension": ".py", 257 | "mimetype": "text/x-python", 258 | "name": "python", 259 | "nbconvert_exporter": "python", 260 | "pygments_lexer": "ipython2", 261 | "version": "2.7.11" 262 | } 263 | }, 264 | "nbformat": 4, 265 | "nbformat_minor": 0 266 | } 267 | -------------------------------------------------------------------------------- /Network Processing Notebooks/DIP Processing.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "import pandas as pd\n", 13 | "import itertools" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "## Load DIP Raw Data\n", 21 | "#### Source (MITAB): http://dip.doe-mbi.ucla.edu/dip/File.cgi?FN=2016/tab25/Hsapi20170205.txt\n", 22 | "Downloaded: June 15, 2017 \n", 23 | "Last Updated: February 05, 2017 \n", 24 | "Notes for download: Website requires registration. Register for the site to download the file from the link. \n", 25 | "Notes for processing: This is the file for human protein interactions; however, not all interactions are human-human interactions. These need to be filtered. Also, all IDs without a RefSeq or UniProt ID are excluded. Custom processing for this network is described below" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 5, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "Raw edge count in DIP: 7794\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 45 | "DIP_Raw = pd.read_csv(wd+'Network_Data_Raw/DIP/Hsapi20170205.txt', index_col=0, sep='\\t')\n", 46 | "print 'Raw edge count in DIP:', len(DIP_Raw)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 12, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "# Fix the column offset in the interaction data table\n", 58 | "DIP_Raw_offset = DIP_Raw.reset_index(drop=False)[DIP_Raw.reset_index(drop=False).columns[:-2]]\n", 59 | "DIP_Raw_offset.columns = DIP_Raw.columns[:-1]" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 16, 65 | "metadata": { 66 | "collapsed": false 67 | }, 68 | "outputs": [ 69 | { 70 | "name": "stdout", 71 | "output_type": "stream", 72 | "text": [ 73 | "Human-Human only interactions in DIP: 5569\n" 74 | ] 75 | } 76 | ], 77 | "source": [ 78 | "# Keep only human-human interactions\n", 79 | "DIP_Human_only = DIP_Raw_offset[(DIP_Raw_offset['Taxid interactor A']=='taxid:9606(Homo sapiens)') & (DIP_Raw_offset['Taxid interactor B']=='taxid:9606(Homo sapiens)')]\n", 80 | "print 'Human-Human only interactions in DIP:', len(DIP_Human_only)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "#### Parse all genes in filtered DIP and keep only RefSeq/UniProtKB labelled interactions" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 18, 93 | "metadata": { 94 | "collapsed": true 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "# Extract gene list\n", 99 | "Human_DIP_Genes = list(set(DIP_Human_only['ID interactor A']).union(set(DIP_Human_only['ID interactor B'])))" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 25, 105 | "metadata": { 106 | "collapsed": false 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "# Split all gene names into list of genes and concatenate\n", 111 | "Human_DIP_Genes_split = [name.split('|') for name in Human_DIP_Genes]\n", 112 | "Human_DIP_Genes_full_list = list(itertools.chain.from_iterable(Human_DIP_Genes_split))\n", 113 | "\n", 114 | "# Note about this
line: This is to fix the one example where one of the Uniprot genes gets labelled as \"uniprotkb:Q13936,159'\n", 115 | "Human_DIP_Genes_full_list = [name.split(',')[0] for name in Human_DIP_Genes_full_list] " 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "## Convert Genes" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 26, 128 | "metadata": { 129 | "collapsed": false 130 | }, 131 | "outputs": [ 132 | { 133 | "name": "stdout", 134 | "output_type": "stream", 135 | "text": [ 136 | "5017 Valid Query Genes\n", 137 | "3281 Invalid Query Genes\n" 138 | ] 139 | } 140 | ], 141 | "source": [ 142 | "# Construct list of genes to be submitted to MyGene.Info API (remove all genes with 'DIP' prefix)\n", 143 | "query_string, valid_genes, invalid_genes = gct.query_constructor(Human_DIP_Genes_full_list, exclude_prefixes=['DIP'])" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 31, 149 | "metadata": { 150 | "collapsed": false 151 | }, 152 | "outputs": [ 153 | { 154 | "name": "stdout", 155 | "output_type": "stream", 156 | "text": [ 157 | "Batch query complete: 7.97 seconds\n", 158 | "5074 Matched query results\n" 159 | ] 160 | } 161 | ], 162 | "source": [ 163 | "# Set scopes (gene naming systems to search)\n", 164 | "scopes = \"uniprot, refseq\"\n", 165 | "# Set fields (systems from which to return gene names from)\n", 166 | "fields = \"symbol, entrezgene\"\n", 167 | "# Query MyGene.Info\n", 168 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n", 169 | "print len(match_list), 'Matched query results'" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 32, 175 | "metadata": { 176 | "collapsed": false, 177 | "scrolled": true 178 | }, 179 | "outputs": [ 180 | { 181 | "name": "stdout", 182 | "output_type": "stream", 183 | "text": [ 184 | "Queries without full matching results found: 106\n", 185 | "\n", 186 | "74 Queries with mutliple matches found\n", 187 | "\n", 188 | "Query mapping table/dictionary construction complete: 6.82 seconds\n" 189 | ] 190 | } 191 | ], 192 | "source": [ 193 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "## Construct Converted Network" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 36, 206 | "metadata": { 207 | "collapsed": true 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "# This is a custom gene conversion function written due to the parsing required for gene interactor labels\n", 212 | "# Returns best matched symbol and/or entrez id from each DIP interactor string (if applicable)\n", 213 | "def convert_DIP_string(string, field):\n", 214 | " names = [gct.get_identifier_without_prefix(name) for name in string.split('|')]\n", 215 | " # Keep only mappings defined for field of interest\n", 216 | " if field=='symbol':\n", 217 | " # Return match table values that have matched symbol\n", 218 | " conversion = match_table_trim.ix[names][~(match_table_trim.ix[names]['Symbol'].isnull())]\n", 219 | " # Return conversion with max score or None if no conversion\n", 220 | " if conversion.shape[0]==0:\n", 221 | " return None\n", 222 | " else:\n", 223 | " max_score = conversion['Score'].max()\n", 224 | " return conversion[conversion['Score']==max_score].ix[0]['Symbol']\n", 225 | " elif field=='entrez':\n", 226 | " # Return match 
table values that have matched symbol\n", 227 | " conversion = match_table_trim.ix[names][~(match_table_trim.ix[names]['EntrezID'].isnull())]\n", 228 | " if conversion.shape[0]==0:\n", 229 | " return None\n", 230 | " else:\n", 231 | " # Return conversion with max score or None if no conversion\n", 232 | " max_score = conversion['Score'].max()\n", 233 | " return conversion[conversion['Score']==max_score].ix[0]['EntrezID']" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 37, 239 | "metadata": { 240 | "collapsed": false 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "DIP_Human_only_edges = DIP_Human_only[['ID interactor A', 'ID interactor B']].values.tolist()\n", 245 | "DIP_edgelist_symbol = [sorted([convert_DIP_string(edge[0],'symbol'),convert_DIP_string(edge[1],'symbol')]) for edge in DIP_Human_only_edges]" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 39, 251 | "metadata": { 252 | "collapsed": false 253 | }, 254 | "outputs": [ 255 | { 256 | "name": "stdout", 257 | "output_type": "stream", 258 | "text": [ 259 | "5569 input edges\n", 260 | "512 self-edges removed\n", 261 | "309 edges with un-mapped genes removed\n", 262 | "26 duplicate edges removed\n", 263 | "Edge list filtered: 0.02 seconds\n", 264 | "4722 Edges remaining\n" 265 | ] 266 | } 267 | ], 268 | "source": [ 269 | "# Filter converted edge list\n", 270 | "DIP_edgelist_symbol_filt = gct.filter_converted_edgelist(DIP_edgelist_symbol)" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 40, 276 | "metadata": { 277 | "collapsed": false 278 | }, 279 | "outputs": [ 280 | { 281 | "name": "stdout", 282 | "output_type": "stream", 283 | "text": [ 284 | "Edge list saved: 0.02 seconds\n" 285 | ] 286 | } 287 | ], 288 | "source": [ 289 | "# Save converted edge list\n", 290 | "gct.write_edgelist(DIP_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/DIP_Symbol.sif')" 291 | ] 292 | } 293 | ], 294 | "metadata": { 295 | "kernelspec": { 296 | "display_name": "Python 2", 297 | "language": "python", 298 | "name": "python2" 299 | }, 300 | "language_info": { 301 | "codemirror_mode": { 302 | "name": "ipython", 303 | "version": 2 304 | }, 305 | "file_extension": ".py", 306 | "mimetype": "text/x-python", 307 | "name": "python", 308 | "nbconvert_exporter": "python", 309 | "pygments_lexer": "ipython2", 310 | "version": "2.7.11" 311 | } 312 | }, 313 | "nbformat": 4, 314 | "nbformat_minor": 0 315 | } 316 | -------------------------------------------------------------------------------- /Network Processing Notebooks/Degree-Preserved Network Shufflings.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from pyNBS import data_import_tools as dit\n", 12 | "from pyNBS import network_propagation as prop\n", 13 | "import os\n", 14 | "import numpy as np\n", 15 | "import pandas as pd\n", 16 | "import networkx as nx\n", 17 | "import time" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 4, 23 | "metadata": { 24 | "collapsed": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "def shuffle_network(network, verbose=False):\n", 29 | "\t# Shuffle Network\n", 30 | "\tshuff_time = time.time()\n", 31 | "\tedge_len=len(network.edges())\n", 32 | "\tshuff_net=network.copy()\n", 33 | "\ttry:\n", 34 | "\t\tnx.double_edge_swap(shuff_net, nswap=edge_len, 
max_tries=edge_len*10)\n", 35 | "\texcept:\n", 36 | "\t\tif verbose:\n", 37 | "\t\t\tprint 'Note: Maximum number of swap attempts ('+repr(edge_len*10)+') exceeded before desired swaps achieved ('+repr(edge_len)+').'\n", 38 | "\tif verbose:\n", 39 | "\t\t# Evaluate Network Similarity\n", 40 | "\t\tshared_edges = len(set(network.edges()).intersection(set(shuff_net.edges())))\n", 41 | "\t\tprint 'Network shuffled:', time.time()-shuff_time, 'seconds. Edge similarity:', shared_edges/float(edge_len)\n", 42 | "\treturn shuff_net" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 5, 48 | "metadata": { 49 | "collapsed": true 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "wd = '/cellar/users/jkhuang/Data/Projects/pyNBS/Data/Network_Data/Network_Files/'\n", 54 | "randNet_outdir = '/cellar/users/jkhuang/Data/Projects/pyNBS/Data/Network_Data/Shuffled_Network_Files/'" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 6, 60 | "metadata": { 61 | "collapsed": false 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "network_files = [wd+fn for fn in os.listdir(wd) if fn.endswith('.txt')]" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 15, 71 | "metadata": { 72 | "collapsed": false 73 | }, 74 | "outputs": [ 75 | { 76 | "name": "stdout", 77 | "output_type": "stream", 78 | "text": [ 79 | "PathwayCommons\n", 80 | "Network shuffled: 88.9572019577 seconds. Edge similarity: 0.14217722133\n", 81 | "Shuffled PathwayCommons saved.\n", 82 | "STRING90\n", 83 | "Network shuffled: 31.8355379105 seconds. Edge similarity: 0.135569697974\n", 84 | "Shuffled STRING90 saved.\n", 85 | "HumanNet90\n", 86 | "Network shuffled: 1.94090199471 seconds. Edge similarity: 0.157831705011\n", 87 | "Shuffled HumanNet90 saved.\n", 88 | "PID\n", 89 | "Network shuffled: 0.650630950928 seconds. Edge similarity: 0.172511892547\n", 90 | "Shuffled PID saved.\n", 91 | "Mentha\n", 92 | "Network shuffled: 12.5241580009 seconds. Edge similarity: 0.136090780444\n", 93 | "Shuffled Mentha saved.\n", 94 | "ConsensusPathDB\n", 95 | "Network shuffled: 472.858560085 seconds. Edge similarity: 0.266427489011\n", 96 | "Shuffled ConsensusPathDB saved.\n", 97 | "MultiNet\n", 98 | "Network shuffled: 11.9793038368 seconds. Edge similarity: 0.139956933521\n", 99 | "Shuffled MultiNet saved.\n", 100 | "HPRD\n", 101 | "Network shuffled: 2.19464206696 seconds. Edge similarity: 0.132373984179\n", 102 | "Shuffled HPRD saved.\n", 103 | "GIANT\n", 104 | "Network shuffled: 953.094853163 seconds. Edge similarity: 0.181710364213\n", 105 | "Shuffled GIANT saved.\n", 106 | "HINT\n", 107 | "Network shuffled: 10.6648330688 seconds. Edge similarity: 0.132703799716\n", 108 | "Shuffled HINT saved.\n", 109 | "GeneMANIA\n", 110 | "Network shuffled: 1266.22839403 seconds. Edge similarity: 0.146754353915\n", 111 | "Shuffled GeneMANIA saved.\n", 112 | "Reactome\n", 113 | "Network shuffled: 10.7709050179 seconds. Edge similarity: 0.157268305724\n", 114 | "Shuffled Reactome saved.\n", 115 | "STRING\n", 116 | "Network shuffled: 1679.15529799 seconds. Edge similarity: 0.209015282622\n", 117 | "Shuffled STRING saved.\n", 118 | "IntAct\n", 119 | "Network shuffled: 8.56541705132 seconds. Edge similarity: 0.130773661977\n", 120 | "Shuffled IntAct saved.\n", 121 | "Mentha90\n", 122 | "Network shuffled: 0.904587030411 seconds. Edge similarity: 0.134449008127\n", 123 | "Shuffled Mentha90 saved.\n", 124 | "ReactomeFI\n", 125 | "Network shuffled: 10.2852549553 seconds. 
Edge similarity: 0.146912035846\n", 126 | "Shuffled ReactomeFI saved.\n", 127 | "BIND\n", 128 | "Network shuffled: 9.11399793625 seconds. Edge similarity: 0.322492442979\n", 129 | "Shuffled BIND saved.\n", 130 | "DIP\n", 131 | "Network shuffled: 0.137312889099 seconds. Edge similarity: 0.120499788225\n", 132 | "Shuffled DIP saved.\n", 133 | "InBioMap75\n", 134 | "Network shuffled: 6.4067800045 seconds. Edge similarity: 0.167107140969\n", 135 | "Shuffled InBioMap75 saved.\n", 136 | "HumanInteractome\n", 137 | "Network shuffled: 0.723779201508 seconds. Edge similarity: 0.136739405675\n", 138 | "Shuffled HumanInteractome saved.\n", 139 | "BioPlex\n", 140 | "Network shuffled: 1.60635495186 seconds. Edge similarity: 0.123919155482\n", 141 | "Shuffled BioPlex saved.\n", 142 | "GeneMANIA90\n", 143 | "Network shuffled: 25.3215258121 seconds. Edge similarity: 0.118961241363\n", 144 | "Shuffled GeneMANIA90 saved.\n", 145 | "BioGRID\n", 146 | "Network shuffled: 11.8226139545 seconds. Edge similarity: 0.131481431287\n", 147 | "Shuffled BioGRID saved.\n", 148 | "GIANT90\n", 149 | "Network shuffled: 22.5300149918 seconds. Edge similarity: 0.188063162301\n", 150 | "Shuffled GIANT90 saved.\n", 151 | "HumanNet\n", 152 | "Network shuffled: 25.1538288593 seconds. Edge similarity: 0.137587481275\n", 153 | "Shuffled HumanNet saved.\n", 154 | "IRefIndex\n", 155 | "Network shuffled: 7.51319789886 seconds. Edge similarity: 0.160039835864\n", 156 | "Shuffled IRefIndex saved.\n", 157 | "InBioMap\n", 158 | "Network shuffled: 46.8094351292 seconds. Edge similarity: 0.167921346275\n", 159 | "Shuffled InBioMap saved.\n" 160 | ] 161 | } 162 | ], 163 | "source": [ 164 | "for network_file in network_files:\n", 165 | " network_name = network_file.split('/')[-1].split('_')[0]\n", 166 | " print network_name\n", 167 | " network = dit.load_network_file(network_file)\n", 168 | " shuffNet = shuffle_network(network, verbose=True)\n", 169 | " shuffNet_edges = shuffNet.edges()\n", 170 | " f = open(randNet_outdir+network_name+'-shuffled_Symbol.txt', 'w')\n", 171 | " for edge in shuffNet_edges:\n", 172 | " f.write(str(edge[0])+'\\t'+str(edge[1])+'\\n')\n", 173 | " f.close()\n", 174 | " print 'Shuffled', network_name, 'saved.'" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": { 181 | "collapsed": true 182 | }, 183 | "outputs": [], 184 | "source": [] 185 | } 186 | ], 187 | "metadata": { 188 | "kernelspec": { 189 | "display_name": "Python 2", 190 | "language": "python", 191 | "name": "python2" 192 | }, 193 | "language_info": { 194 | "codemirror_mode": { 195 | "name": "ipython", 196 | "version": 2 197 | }, 198 | "file_extension": ".py", 199 | "mimetype": "text/x-python", 200 | "name": "python", 201 | "nbconvert_exporter": "python", 202 | "pygments_lexer": "ipython2", 203 | "version": "2.7.11" 204 | } 205 | }, 206 | "nbformat": 4, 207 | "nbformat_minor": 0 208 | } 209 | -------------------------------------------------------------------------------- /Network Processing Notebooks/GIANT Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from network_evaluation_tools import data_import_tools as dit\n", 13 | "import pandas as pd\n", 14 | "import time" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 
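The shuffle_network routine and loop above rely on networkx's double_edge_swap to randomize edges while preserving every node's degree. A minimal standalone sketch follows, assuming only networkx; it catches the specific exception behind the bare `except:` above (the printed message matches networkx's NetworkXAlgorithmError), and the function name is illustrative, not part of this repository.

```python
import networkx as nx

def shuffle_network_sketch(network, verbose=False):
    """Degree-preserved shuffle: swap endpoint pairs so every node keeps
    its degree while the identities of the edges are randomized."""
    n_edges = network.number_of_edges()
    shuff_net = network.copy()
    try:
        nx.double_edge_swap(shuff_net, nswap=n_edges, max_tries=n_edges * 10)
    except nx.NetworkXAlgorithmError:
        # Raised when max_tries is exhausted before nswap successful swaps
        if verbose:
            print('Swap budget exhausted before all requested swaps completed.')
    if verbose:
        # Fraction of original edges surviving the shuffle (lower = better shuffled)
        shared = len(set(network.edges()) & set(shuff_net.edges()))
        print('Edge similarity: {:.3f}'.format(shared / float(n_edges)))
    return shuff_net
```

The edge-similarity fractions printed above (roughly 0.12 to 0.32) are this overlap statistic; networks such as BIND and ConsensusPathDB retain more of their original edges, presumably because in densely clustered graphs many proposed swaps recreate edges that already exist.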
| "metadata": {}, 20 | "source": [ 21 | "## Load GIANT Raw Data\n", 22 | "#### Source: http://giant.princeton.edu/static//networks/all_tissues_top.gz\n", 23 | "Downloaded: June 15, 2017 \n", 24 | "Last Updated: N/A, but paper published in 2015 \n", 25 | "Note about processing: This network (even if it is already the top 10% of all edges) is extremely large. Therefore, we will further filter this 'top' functional network further to the top 10% which should yield about 4 million edges. We will then take the top 10% of this filtered network (about 400k edges) to use as the 'filtered' version of this network." 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 4, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "GIANT All Tissues (Top) Interactions: 38903547\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 45 | "GIANT_Raw = pd.read_csv(wd+'/Network_Data_Raw/GIANT_All_Tissues_Top', sep='\\t', header=-1, low_memory=False)\n", 46 | "GIANT_Raw.columns = ['NodeA', 'NodeB', 'Prob']\n", 47 | "print 'GIANT All Tissues (Top) Interactions:', GIANT_Raw.shape[0]" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 5, 53 | "metadata": { 54 | "collapsed": false, 55 | "scrolled": true 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "# Get all genes to convert from GeneMANIA\n", 60 | "GIANT_Raw_Genes = list(set(GIANT_Raw['NodeA']).union(GIANT_Raw['NodeB']))\n", 61 | "# Convert all entrezIDs to string forst\n", 62 | "GIANT_Raw_Genes = [str(entrezID) for entrezID in GIANT_Raw_Genes]" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "## Convert genes from Entrez ID to HUGO Symbol" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 6, 75 | "metadata": { 76 | "collapsed": false 77 | }, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | "output_type": "stream", 82 | "text": [ 83 | "25689 Valid Query Genes\n", 84 | "0 Invalid Query Genes\n" 85 | ] 86 | } 87 | ], 88 | "source": [ 89 | "query_string, valid_genes, invalid_genes = gct.query_constructor(GIANT_Raw_Genes)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 7, 95 | "metadata": { 96 | "collapsed": false 97 | }, 98 | "outputs": [ 99 | { 100 | "name": "stdout", 101 | "output_type": "stream", 102 | "text": [ 103 | "Batch query complete: 30.55 seconds\n", 104 | "25690 Matched query results\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "# Set scopes (gene naming systems to search)\n", 110 | "scopes = \"entrezgene, retired, alias\"\n", 111 | "\n", 112 | "# Set fields (systems from which to return gene names from)\n", 113 | "fields = \"symbol, entrezgene\"\n", 114 | "\n", 115 | "# Query MyGene.Info\n", 116 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n", 117 | "print len(match_list), 'Matched query results'" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 8, 123 | "metadata": { 124 | "collapsed": false 125 | }, 126 | "outputs": [ 127 | { 128 | "name": "stdout", 129 | "output_type": "stream", 130 | "text": [ 131 | "Queries without full matching results found: 806\n", 132 | "\n", 133 | "1 Queries with mutliple matches found\n", 134 | "\n", 135 | "Query mapping table/dictionary construction complete: 140.47 seconds\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "match_table_trim, query_to_symbol, 
query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "## Construct converted network and filter edges" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 9, 153 | "metadata": { 154 | "collapsed": true 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "GIANT_Raw_edgelist = GIANT_Raw.values.tolist()" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 13, 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "outputs": [], 168 | "source": [ 169 | "# Convert GIANT network edgelist\n", 170 | "GIANT_Raw_edgelist_symbol = [sorted([query_to_symbol[str(int(edge[0]))], query_to_symbol[str(int(edge[1]))]])+[edge[2]] for edge in GIANT_Raw_edgelist]" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 14, 176 | "metadata": { 177 | "collapsed": false 178 | }, 179 | "outputs": [ 180 | { 181 | "name": "stdout", 182 | "output_type": "stream", 183 | "text": [ 184 | "38903547 input edges\n", 185 | "19204 self-edges removed\n", 186 | "2417020 edges with un-mapped genes removed\n", 187 | "151720 duplicate edges removed\n", 188 | "Edge list filtered: 225.47 seconds\n", 189 | "36315603 Edges remaining\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "# Filter GIANT network edgelist\n", 195 | "GIANT_edgelist_symbol_filt = gct.filter_converted_edgelist(GIANT_Raw_edgelist_symbol, remove_self_edges=True, weighted=True)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "## Filter to top 10% of edges by weight/score" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 16, 208 | "metadata": { 209 | "collapsed": false 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "GIANT_edgelist_symbol_filt_table = pd.DataFrame(GIANT_edgelist_symbol_filt, columns=['NodeA', 'NodeB', 'Score'])" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 20, 219 | "metadata": { 220 | "collapsed": false 221 | }, 222 | "outputs": [ 223 | { 224 | "name": "stdout", 225 | "output_type": "stream", 226 | "text": [ 227 | "90% score: 0.207416\n" 228 | ] 229 | } 230 | ], 231 | "source": [ 232 | "# Filter edges by score quantile\n", 233 | "q_score = GIANT_edgelist_symbol_filt_table['Score'].quantile(0.9)\n", 234 | "print '90% score:', q_score\n", 235 | "GIANTtop_edgelist = GIANT_edgelist_symbol_filt_table[GIANT_edgelist_symbol_filt_table['Score']>q_score]" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 23, 241 | "metadata": { 242 | "collapsed": true 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "# Save weighted network for GIANT filtered to top 10% of downloaded edges to file\n", 247 | "GIANTtop_edgelist.to_csv('/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/Network_SIFs_Symbol/GIANT_Symbol.sif', sep='\\t', header=False, index=False)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 24, 253 | "metadata": { 254 | "collapsed": false 255 | }, 256 | "outputs": [ 257 | { 258 | "name": "stdout", 259 | "output_type": "stream", 260 | "text": [ 261 | "90.0% score: 0.574097\n", 262 | "363128 / 3631554 edges retained\n" 263 | ] 264 | } 265 | ], 266 | "source": [ 267 | "# Create filtered network for GIANT\n", 268 | "GIANT90_edgelist = dit.filter_weighted_network_sif(wd+'Network_SIFs_Symbol/GIANT_Symbol.sif', nodeA_col=0, nodeB_col=1, score_col=2, \n", 269 | " q=0.9, 
delimiter='\\t', verbose=True, save_path=wd+'Network_SIFs_Symbol/GIANT90_Symbol.sif')" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": { 276 | "collapsed": true 277 | }, 278 | "outputs": [], 279 | "source": [] 280 | } 281 | ], 282 | "metadata": { 283 | "kernelspec": { 284 | "display_name": "Python 2", 285 | "language": "python", 286 | "name": "python2" 287 | }, 288 | "language_info": { 289 | "codemirror_mode": { 290 | "name": "ipython", 291 | "version": 2 292 | }, 293 | "file_extension": ".py", 294 | "mimetype": "text/x-python", 295 | "name": "python", 296 | "nbconvert_exporter": "python", 297 | "pygments_lexer": "ipython2", 298 | "version": "2.7.11" 299 | } 300 | }, 301 | "nbformat": 4, 302 | "nbformat_minor": 0 303 | } 304 | -------------------------------------------------------------------------------- /Network Processing Notebooks/GeneMANIA Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from network_evaluation_tools import data_import_tools as dit\n", 13 | "import pandas as pd\n", 14 | "import time" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Load GeneMANIA Raw Data\n", 22 | "#### Source: http://genemania.org/data/current/Homo_sapiens.COMBINED/COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt\n", 23 | "Downloaded: July 28, 2016 \n", 24 | "Last Updated: October 15, 2014\t" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": { 31 | "collapsed": false 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 36 | "GeneMANIA_Raw = pd.read_csv(wd+'/Network_Data_Raw/GeneMANIA/GeneMANIA_2014_10_15.txt',sep='\\t')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "GeneMANIA_Raw_Genes = list(set(GeneMANIA_Raw['Gene_A']).union(set(GeneMANIA_Raw['Gene_B'])))" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "metadata": { 54 | "collapsed": false 55 | }, 56 | "outputs": [ 57 | { 58 | "name": "stdout", 59 | "output_type": "stream", 60 | "text": [ 61 | "7290094 Total GeneMANIA Edges\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "# Get Edgelist of network\n", 67 | "query_edgelist = GeneMANIA_Raw[['Gene_A','Gene_B', 'Weight']].values.tolist()\n", 68 | "print len(query_edgelist), \"Total GeneMANIA Edges\"" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## Convert Genes (from ensembl gene to gene symbol)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 5, 81 | "metadata": { 82 | "collapsed": false 83 | }, 84 | "outputs": [ 85 | { 86 | "name": "stdout", 87 | "output_type": "stream", 88 | "text": [ 89 | "19264 Valid Query Genes\n", 90 | "0 Invalid Query Genes\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "query_string, valid_genes, invalid_genes = gct.query_constructor(GeneMANIA_Raw_Genes)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 6, 101 | "metadata": { 102 | "collapsed": true 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "# Set scopes (gene naming 
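The `dit.filter_weighted_network_sif` calls that produce GIANT90 (above) and GeneMANIA90 (below) boil down to a score-quantile cut. A minimal pandas sketch follows, with an illustrative file path; note the caveat that a strict '>' cut can retain nothing when the score distribution saturates at its maximum, as the InBioMap notebook later in this collection shows.

```python
import pandas as pd

# Hypothetical three-column SIF: nodeA <tab> nodeB <tab> score
edges = pd.read_csv('network_Symbol.sif', sep='\t', names=['NodeA', 'NodeB', 'Score'])

threshold = edges['Score'].quantile(0.9)       # 90th percentile of edge scores
top_edges = edges[edges['Score'] > threshold]  # keep the top ~10% of edges
top_edges.to_csv('network90_Symbol.sif', sep='\t', header=False, index=False)
print('{} / {} edges retained'.format(len(top_edges), len(edges)))
```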
systems to search)\n", 107 | "scopes = \"ensemblgene\"\n", 108 | "\n", 109 | "# Set fields (systems from which to return gene names from)\n", 110 | "fields = \"symbol, entrezgene\"" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 7, 116 | "metadata": { 117 | "collapsed": false 118 | }, 119 | "outputs": [ 120 | { 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | "Batch query complete: 35.43 seconds\n", 125 | "19266 Matched query results\n" 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "# Query MyGene.Info\n", 131 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n", 132 | "print len(match_list), 'Matched query results'" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 8, 138 | "metadata": { 139 | "collapsed": false 140 | }, 141 | "outputs": [ 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "Queries without full matching results found: 1547\n", 147 | "\n", 148 | "1 Queries with mutliple matches found\n", 149 | "\n", 150 | "Query mapping table/dictionary construction complete: 111.04 seconds\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "## Construct Converted Network" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 9, 168 | "metadata": { 169 | "collapsed": false 170 | }, 171 | "outputs": [ 172 | { 173 | "name": "stdout", 174 | "output_type": "stream", 175 | "text": [ 176 | "CPU times: user 18.5 s, sys: 1.36 s, total: 19.9 s\n", 177 | "Wall time: 19.5 s\n" 178 | ] 179 | } 180 | ], 181 | "source": [ 182 | "%%time\n", 183 | "# Convert weighted edge list\n", 184 | "GeneMANIA_edgelist_symbol = gct.convert_edgelist(query_edgelist, query_to_symbol, weighted=True)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 10, 190 | "metadata": { 191 | "collapsed": false 192 | }, 193 | "outputs": [ 194 | { 195 | "name": "stdout", 196 | "output_type": "stream", 197 | "text": [ 198 | "7290094 input edges\n", 199 | "22144 self-edges removed\n", 200 | "665798 edges with un-mapped genes removed\n", 201 | "508 duplicate edges removed\n", 202 | "Edge list filtered: 39.33 seconds\n", 203 | "6601644 Edges remaining\n" 204 | ] 205 | } 206 | ], 207 | "source": [ 208 | "# Filter converted edge list\n", 209 | "GeneMANIA_edgelist_symbol_filt = gct.filter_converted_edgelist(GeneMANIA_edgelist_symbol, weighted=True)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 11, 215 | "metadata": { 216 | "collapsed": false 217 | }, 218 | "outputs": [ 219 | { 220 | "name": "stdout", 221 | "output_type": "stream", 222 | "text": [ 223 | "Edge list saved: 13.39 seconds\n" 224 | ] 225 | } 226 | ], 227 | "source": [ 228 | "# Write network to file\n", 229 | "gct.write_edgelist(GeneMANIA_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/GeneMANIA_Symbol.sif', binary=False)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 12, 235 | "metadata": { 236 | "collapsed": false 237 | }, 238 | "outputs": [ 239 | { 240 | "name": "stdout", 241 | "output_type": "stream", 242 | "text": [ 243 | "90.0% score: 0.00023\n", 244 | "618546 / 6601644 edges retained\n" 245 | ] 246 | } 247 | ], 248 | "source": [ 249 | "# Create filtered network\n", 250 | "GeneMANIA90_edgelist = 
dit.filter_weighted_network_sif(wd+'Network_SIFs_Symbol/GeneMANIA_Symbol.sif', nodeA_col=0, nodeB_col=1, score_col=2, \n", 251 | " q=0.9, delimiter='\\t', verbose=True, save_path=wd+'Network_SIFs_Symbol/GeneMANIA90_Symbol.sif')" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": { 258 | "collapsed": true 259 | }, 260 | "outputs": [], 261 | "source": [] 262 | } 263 | ], 264 | "metadata": { 265 | "kernelspec": { 266 | "display_name": "Python 2", 267 | "language": "python", 268 | "name": "python2" 269 | }, 270 | "language_info": { 271 | "codemirror_mode": { 272 | "name": "ipython", 273 | "version": 2 274 | }, 275 | "file_extension": ".py", 276 | "mimetype": "text/x-python", 277 | "name": "python", 278 | "nbconvert_exporter": "python", 279 | "pygments_lexer": "ipython2", 280 | "version": "2.7.11" 281 | } 282 | }, 283 | "nbformat": 4, 284 | "nbformat_minor": 0 285 | } 286 | -------------------------------------------------------------------------------- /Network Processing Notebooks/HINT Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from network_evaluation_tools import data_import_tools as dit\n", 13 | "import pandas as pd\n", 14 | "import numpy as np" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Load HINT Raw Data\n", 22 | "#### Source: http://hint.yulab.org/batch.html\n", 23 | "Downloaded: June 15, 2017 \n", 24 | "Last update not listed, but currently on version 4 (updated early 2017). The two binary interactomes for High-Quality (HQ) and Co-Complex (CC) interactions were downloaded and merged into a single interactome for HINT. \n", 25 | "Citation: Das J and Yu H. HINT: High-quality protein interactomes and their applications in understanding human disease. BMC Systems Biology, 2012 Jul 30;6(1):92." 
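Throughout these notebooks, `gct.query_constructor` and `gct.query_batch` wrap the MyGene.info batch annotation service. A rough standalone equivalent using the public `mygene` client is sketched below; it is an approximation of what those helpers do, not their actual implementation, and the example accessions are illustrative.

```python
import mygene

mg = mygene.MyGeneInfo()
ids = ['P04637', 'P38398']  # illustrative UniProt accessions

# One batched request; returns one result dict per query term
matches = mg.querymany(ids, scopes='uniprot', fields='symbol,entrezgene',
                       species='human')
id_to_symbol = {m['query']: m.get('symbol') for m in matches
                if not m.get('notfound')}
print(id_to_symbol)  # e.g. {'P04637': 'TP53', 'P38398': 'BRCA1'}
```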
26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 37 | "HINT_Bin_Raw = pd.read_csv(wd+'Network_Data_Raw/HINT_v4_binary_HomoSapiens.txt',sep='\\t')\n", 38 | "HINT_Com_Raw = pd.read_csv(wd+'Network_Data_Raw/HINT_v4_complex_HomoSapiens.txt',sep='\\t')" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 5, 44 | "metadata": { 45 | "collapsed": false, 46 | "scrolled": true 47 | }, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "Concatenated list of edges: (181699, 9)\n", 54 | "After duplicate edges removed: (181375, 9)\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "HINT_Raw = pd.concat([HINT_Bin_Raw, HINT_Com_Raw])\n", 60 | "print 'Concatenated list of edges:', HINT_Raw.shape\n", 61 | "HINT_Raw = HINT_Raw.drop_duplicates()\n", 62 | "print 'After duplicate edges removed:', HINT_Raw.shape" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 7, 68 | "metadata": { 69 | "collapsed": false, 70 | "scrolled": true 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "# Use UniProtID labels to annotate interactions\n", 75 | "HINT_Raw_Genes_Uniprot = set(HINT_Raw['Uniprot_A']).union(set(HINT_Raw['Uniprot_B']))" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "## Convert Genes from UniProt Accession ID to gene symbols" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 9, 88 | "metadata": { 89 | "collapsed": false 90 | }, 91 | "outputs": [ 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "15784 Valid Query Genes\n", 97 | "0 Invalid Query Genes\n" 98 | ] 99 | } 100 | ], 101 | "source": [ 102 | "query_string, valid_genes, invalid_genes = gct.query_constructor(HINT_Raw_Genes_Uniprot)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 10, 108 | "metadata": { 109 | "collapsed": false 110 | }, 111 | "outputs": [ 112 | { 113 | "name": "stdout", 114 | "output_type": "stream", 115 | "text": [ 116 | "Batch query complete: 19.17 seconds\n", 117 | "16001 Matched query results\n" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "# Set scopes (gene naming systems to search)\n", 123 | "scopes = \"uniprot\"\n", 124 | "\n", 125 | "# Set fields (systems from which to return gene names from)\n", 126 | "fields = \"symbol, entrezgene\"\n", 127 | "\n", 128 | "# Query MyGene.Info\n", 129 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n", 130 | "print len(match_list), 'Matched query results'" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 11, 136 | "metadata": { 137 | "collapsed": false, 138 | "scrolled": true 139 | }, 140 | "outputs": [ 141 | { 142 | "name": "stdout", 143 | "output_type": "stream", 144 | "text": [ 145 | "Queries without full matching results found: 670\n", 146 | "\n", 147 | "163 Queries with mutliple matches found\n", 148 | "\n", 149 | "Query mapping table/dictionary construction complete: 59.26 seconds\n" 150 | ] 151 | } 152 | ], 153 | "source": [ 154 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "## Construct Converted Network" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | 
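The `gct.filter_converted_edgelist` step that follows removes self-edges and treats A-B and B-A as the same undirected edge; after symbol conversion, distinct UniProt pairs can collapse onto one symbol pair, which is why thousands of duplicates are dropped. A minimal sketch of that canonicalization (the function name is illustrative, not the library's implementation):

```python
def filter_edgelist_sketch(edgelist):
    """Drop self-edges and A-B/B-A duplicates from an undirected edge list."""
    seen, filtered = set(), []
    for node_a, node_b in edgelist:
        if node_a is None or node_b is None:  # un-mapped gene
            continue
        if node_a == node_b:                  # self-edge
            continue
        key = tuple(sorted((node_a, node_b)))  # canonical orientation
        if key not in seen:
            seen.add(key)
            filtered.append(list(key))
    return filtered

print(filter_edgelist_sketch([['TP53', 'MDM2'], ['MDM2', 'TP53'], ['EGFR', 'EGFR']]))
# -> [['MDM2', 'TP53']]
```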
"execution_count": 13, 167 | "metadata": { 168 | "collapsed": true 169 | }, 170 | "outputs": [], 171 | "source": [ 172 | "HINT_edgelist = HINT_Raw[['Uniprot_A', 'Uniprot_B']].values.tolist()" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 16, 178 | "metadata": { 179 | "collapsed": false 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "# Convert edge list\n", 184 | "HINT_edgelist_symbol = gct.convert_edgelist(HINT_edgelist, query_to_symbol, weighted=False)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 19, 190 | "metadata": { 191 | "collapsed": false 192 | }, 193 | "outputs": [ 194 | { 195 | "name": "stdout", 196 | "output_type": "stream", 197 | "text": [ 198 | "181375 input edges\n", 199 | "4730 self-edges removed\n", 200 | "2861 edges with un-mapped genes removed\n", 201 | "18325 duplicate edges removed\n", 202 | "Edge list filtered: 0.33 seconds\n", 203 | "155459 Edges remaining\n" 204 | ] 205 | } 206 | ], 207 | "source": [ 208 | "# Filter edge list\n", 209 | "HINT_edgelist_symbol_filt = gct.filter_converted_edgelist(HINT_edgelist_symbol)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 20, 215 | "metadata": { 216 | "collapsed": false 217 | }, 218 | "outputs": [ 219 | { 220 | "name": "stdout", 221 | "output_type": "stream", 222 | "text": [ 223 | "Edge list saved: 0.26 seconds\n" 224 | ] 225 | } 226 | ], 227 | "source": [ 228 | "# Save edge list\n", 229 | "gct.write_edgelist(HINT_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/HINT_Symbol.sif')" 230 | ] 231 | } 232 | ], 233 | "metadata": { 234 | "kernelspec": { 235 | "display_name": "Python 2", 236 | "language": "python", 237 | "name": "python2" 238 | }, 239 | "language_info": { 240 | "codemirror_mode": { 241 | "name": "ipython", 242 | "version": 2 243 | }, 244 | "file_extension": ".py", 245 | "mimetype": "text/x-python", 246 | "name": "python", 247 | "nbconvert_exporter": "python", 248 | "pygments_lexer": "ipython2", 249 | "version": "2.7.11" 250 | } 251 | }, 252 | "nbformat": 4, 253 | "nbformat_minor": 0 254 | } 255 | -------------------------------------------------------------------------------- /Network Processing Notebooks/HPRD Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from network_evaluation_tools import data_import_tools as dit\n", 13 | "import pandas as pd" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "## Load HPRD Raw Data\n", 21 | "#### Source: http://www.hprd.org/download\n", 22 | "#### The file requires registration with the database. 
Download the file: HPRD_Release9_041310.tar.gz\n", 23 | "Downloaded: August 12, 2016 \n", 24 | "Last Updated: June 29, 2010 \n", 25 | "The following files are manipulated after unzipping the .tar.gz file" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 3, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 37 | "HPRD_Raw = pd.read_csv(wd+'Network_Data_Raw/HPRD_Release9_062910/BINARY_PROTEIN_PROTEIN_INTERACTIONS.txt',sep='\\t',header=-1)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 5, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "# Assign column names from README file from archive\n", 49 | "HPRD_Raw.columns = ['Interactor 1 Gene Symbol', 'Interactor 1 HPRD ID', 'Interactor 1 RefSeq ID',\n", 50 | " 'Interactor 2 Gene Symbol', 'Interactor 2 HPRD ID', 'Interactor 2 RefSeq ID',\n", 51 | " 'Experiment Type', 'PubMed ID']" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 7, 57 | "metadata": { 58 | "collapsed": false 59 | }, 60 | "outputs": [ 61 | { 62 | "name": "stdout", 63 | "output_type": "stream", 64 | "text": [ 65 | "Edges in HPRD: 39240\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "# Convert table of interactions to edgelist (no scores given)\n", 71 | "# Also no gene symbol conversion necessary because network is given in symbol format already\n", 72 | "HPRD_edgelist = HPRD_Raw[['Interactor 1 Gene Symbol', 'Interactor 2 Gene Symbol']].values.tolist()\n", 73 | "print 'Edges in HPRD:', len(HPRD_edgelist)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 9, 79 | "metadata": { 80 | "collapsed": true 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "# Sort each edge representation for filtering\n", 85 | "HPRD_edgelist_sorted = [sorted(edge) for edge in HPRD_edgelist]" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 10, 91 | "metadata": { 92 | "collapsed": false 93 | }, 94 | "outputs": [ 95 | { 96 | "name": "stdout", 97 | "output_type": "stream", 98 | "text": [ 99 | "39240 input edges\n", 100 | "2160 self-edges removed\n", 101 | "0 edges with un-mapped genes removed\n", 102 | "41 duplicate edges removed\n", 103 | "Edge list filtered: 0.05 seconds\n", 104 | "37039 Edges remaining\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "# Filter edgelist for duplicate nodes and for self-edges\n", 110 | "HPRD_edgelist_filt = gct.filter_converted_edgelist(HPRD_edgelist_sorted)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 12, 116 | "metadata": { 117 | "collapsed": false 118 | }, 119 | "outputs": [ 120 | { 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | "Edge list saved: 0.04 seconds\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "# Save genelist to file\n", 130 | "outdir = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/Network_SIFs_Symbol/'\n", 131 | "gct.write_edgelist(HPRD_edgelist_filt, outdir+'HPRD_Symbol.sif')" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": { 138 | "collapsed": true 139 | }, 140 | "outputs": [], 141 | "source": [] 142 | } 143 | ], 144 | "metadata": { 145 | "kernelspec": { 146 | "display_name": "Python 2", 147 | "language": "python", 148 | "name": "python2" 149 | }, 150 | "language_info": { 151 | "codemirror_mode": { 152 | "name": "ipython", 153 | "version": 2 154 | }, 155 | 
"file_extension": ".py", 156 | "mimetype": "text/x-python", 157 | "name": "python", 158 | "nbconvert_exporter": "python", 159 | "pygments_lexer": "ipython2", 160 | "version": "2.7.11" 161 | } 162 | }, 163 | "nbformat": 4, 164 | "nbformat_minor": 0 165 | } 166 | -------------------------------------------------------------------------------- /Network Processing Notebooks/HumanInteractome Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "from network_evaluation_tools import gene_conversion_tools as gct" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "The following data was downloaded from CCSB and converted to edge list sifs for both symbol and entrez from the simple sifs given in both cases. No additional gene conversions were performed for these networks." 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## Load HI-II-14 (Human Interactome) Raw Data\n", 27 | "#### Source: http://interactome.dfci.harvard.edu/H_sapiens/download/HI-II-14.tsv\n", 28 | "#### File: 'HI-II-14'\n", 29 | "Downloaded: June 20, 2017 \n", 30 | "Last Updated: Not Listed\n", 31 | "Proteome-scale map of the human binary interactome network generated by systematically screening Space-II associated with Rolland et al Cell 2014" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "Raw Interactions in HI-II-14: 13944\n" 46 | ] 47 | } 48 | ], 49 | "source": [ 50 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 51 | "HumanInteractome_Raw = pd.read_csv(wd+'Network_Data_Raw/HI-II-14.tsv',sep='\\t')\n", 52 | "print 'Raw Interactions in HI-II-14:', len(HumanInteractome_Raw)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 5, 58 | "metadata": { 59 | "collapsed": false, 60 | "scrolled": true 61 | }, 62 | "outputs": [ 63 | { 64 | "name": "stdout", 65 | "output_type": "stream", 66 | "text": [ 67 | "Edges in HI-II-14: 13944\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "# Convert table of interactions to edgelist (no scores given)\n", 73 | "# Also no gene symbol conversion necessary because network is given in symbol format already\n", 74 | "HumanInteractome_edgelist = HumanInteractome_Raw[['Symbol A', 'Symbol B']].values.tolist()\n", 75 | "print 'Edges in HI-II-14:', len(HumanInteractome_edgelist)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 7, 81 | "metadata": { 82 | "collapsed": true 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "# Sort each edge representation for filtering\n", 87 | "HumanInteractome_edgelist_sorted = [sorted(edge) for edge in HumanInteractome_edgelist]" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 8, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "13944 input edges\n", 102 | "517 self-edges removed\n", 103 | "0 edges with un-mapped genes removed\n", 104 | "0 duplicate edges removed\n", 105 | "Edge list filtered: 0.02 seconds\n", 106 | "13427 Edges remaining\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 
111 | "# Filter edgelist for duplicate nodes and for self-edges\n", 112 | "HumanInteractome_edgelist_filt = gct.filter_converted_edgelist(HumanInteractome_edgelist_sorted)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 10, 118 | "metadata": { 119 | "collapsed": false 120 | }, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "Edge list saved: 0.02 seconds\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "# Save genelist to file\n", 132 | "outdir = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/Network_SIFs_Symbol/'\n", 133 | "gct.write_edgelist(HumanInteractome_edgelist_filt, outdir+'HumanInteractome_Symbol.sif')" 134 | ] 135 | } 136 | ], 137 | "metadata": { 138 | "kernelspec": { 139 | "display_name": "Python 2", 140 | "language": "python", 141 | "name": "python2" 142 | }, 143 | "language_info": { 144 | "codemirror_mode": { 145 | "name": "ipython", 146 | "version": 2 147 | }, 148 | "file_extension": ".py", 149 | "mimetype": "text/x-python", 150 | "name": "python", 151 | "nbconvert_exporter": "python", 152 | "pygments_lexer": "ipython2", 153 | "version": "2.7.11" 154 | } 155 | }, 156 | "nbformat": 4, 157 | "nbformat_minor": 0 158 | } 159 | -------------------------------------------------------------------------------- /Network Processing Notebooks/HumanNet Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from network_evaluation_tools import data_import_tools as dit\n", 13 | "import pandas as pd\n", 14 | "import time" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Load HumanNet Raw Data\n", 22 | "#### Source: http://www.functionalnet.org/humannet/HumanNet.v1.benchmark.txt\n", 23 | "Downloaded: August 12, 2016 \n", 24 | "No latest version date posted (last updated likely around 2011). \n", 25 | "Citation: Insuk Lee, U. Martin Blom, Peggy I. Wang, Jung Eun Shin, and Edward M. 
Marcotte\n", 26 | "Genome Research 21(7):1109-21 (2011)" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 38 | "HumanNet_Raw = pd.read_csv(wd+'Network_Data_Raw/HumanNet.v1.join.txt',sep='\\t',header=-1)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "metadata": { 45 | "collapsed": false 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "f = open(wd+'Network_Data_Raw/HumanNet.v1.evidence_code.txt')\n", 50 | "HumanNet_headers = ['Gene 1', 'Gene 2']+[name.split(' = ')[0] for name in f.read().splitlines()[1:-1]]\n", 51 | "HumanNet_Raw.columns = HumanNet_headers" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 4, 57 | "metadata": { 58 | "collapsed": true 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "# Extract gene list\n", 63 | "HumanNet_Raw_Genes = list(set(HumanNet_Raw['Gene 1']).union(set(HumanNet_Raw['Gene 2'])))\n", 64 | "HumanNet_Raw_Genes = [str(gene) for gene in HumanNet_Raw_Genes]" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 5, 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "476399 HumanNet Edges\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "# Get edge list of network\n", 84 | "query_edgelist = HumanNet_Raw[['Gene 1','Gene 2']].astype(str)\n", 85 | "query_edgelist = pd.concat([query_edgelist, HumanNet_Raw['IntNet']], axis=1).values.tolist()\n", 86 | "print len(query_edgelist), \"HumanNet Edges\"" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "## Convert genes from Entrez ID to HUGO Symbol" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 6, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "16243 Valid Query Genes\n", 108 | "0 Invalid Query Genes\n" 109 | ] 110 | } 111 | ], 112 | "source": [ 113 | "query_string, valid_genes, invalid_genes = gct.query_constructor(HumanNet_Raw_Genes)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 7, 119 | "metadata": { 120 | "collapsed": false 121 | }, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | "Batch query complete: 19.6 seconds\n", 128 | "16243 Matched query results\n" 129 | ] 130 | } 131 | ], 132 | "source": [ 133 | "# Set scopes (gene naming systems to search)\n", 134 | "scopes = \"entrezgene, retired\"\n", 135 | "\n", 136 | "# Set fields (systems from which to return gene names from)\n", 137 | "fields = \"symbol, entrezgene\"\n", 138 | "\n", 139 | "# Query MyGene.Info\n", 140 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n", 141 | "print len(match_list), 'Matched query results'" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 8, 147 | "metadata": { 148 | "collapsed": false 149 | }, 150 | "outputs": [ 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "Queries without full matching results found: 10\n", 156 | "\n", 157 | "0 Queries with mutliple matches found\n", 158 | "\n", 159 | "Query mapping table/dictionary construction complete: 19.62 seconds\n" 160 | ] 161 | } 162 | ], 163 | "source": [ 164 | 
"match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "## Construct Converted Network" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 9, 177 | "metadata": { 178 | "collapsed": false 179 | }, 180 | "outputs": [ 181 | { 182 | "name": "stdout", 183 | "output_type": "stream", 184 | "text": [ 185 | "CPU times: user 1.54 s, sys: 260 ms, total: 1.8 s\n", 186 | "Wall time: 1.69 s\n" 187 | ] 188 | } 189 | ], 190 | "source": [ 191 | "%%time\n", 192 | "# Convert weighted edge list\n", 193 | "HumanNet_edgelist_symbol = gct.convert_edgelist(query_edgelist, query_to_symbol, weighted=True)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 10, 199 | "metadata": { 200 | "collapsed": false 201 | }, 202 | "outputs": [ 203 | { 204 | "name": "stdout", 205 | "output_type": "stream", 206 | "text": [ 207 | "476399 input edges\n", 208 | "7 self-edges removed\n", 209 | "225 edges with un-mapped genes removed\n", 210 | "208 duplicate edges removed\n", 211 | "Edge list filtered: 4.15 seconds\n", 212 | "475959 Edges remaining\n" 213 | ] 214 | } 215 | ], 216 | "source": [ 217 | "# Filter converted edge list\n", 218 | "HumanNet_edgelist_symbol_filt = gct.filter_converted_edgelist(HumanNet_edgelist_symbol, weighted=True)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 11, 224 | "metadata": { 225 | "collapsed": false 226 | }, 227 | "outputs": [ 228 | { 229 | "name": "stdout", 230 | "output_type": "stream", 231 | "text": [ 232 | "Edge list saved: 1.24 seconds\n" 233 | ] 234 | } 235 | ], 236 | "source": [ 237 | "# Write network to file\n", 238 | "gct.write_edgelist(HumanNet_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/HumanNet_Symbol.sif', binary=False)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 15, 244 | "metadata": { 245 | "collapsed": false 246 | }, 247 | "outputs": [ 248 | { 249 | "name": "stdout", 250 | "output_type": "stream", 251 | "text": [ 252 | "90.0% score: 2.17047289928\n", 253 | "47595 / 475959 edges retained\n" 254 | ] 255 | } 256 | ], 257 | "source": [ 258 | "# Create filtered network\n", 259 | "HumanNet90_edgelist = dit.filter_weighted_network_sif(wd+'Network_SIFs_Symbol/HumanNet_Symbol.sif', nodeA_col=0, nodeB_col=1, score_col=2, \n", 260 | " q=0.9, delimiter='\\t', verbose=True, save_path=wd+'Network_SIFs_Symbol/HumanNet90_Symbol.sif')" 261 | ] 262 | } 263 | ], 264 | "metadata": { 265 | "kernelspec": { 266 | "display_name": "Python 2", 267 | "language": "python", 268 | "name": "python2" 269 | }, 270 | "language_info": { 271 | "codemirror_mode": { 272 | "name": "ipython", 273 | "version": 2 274 | }, 275 | "file_extension": ".py", 276 | "mimetype": "text/x-python", 277 | "name": "python", 278 | "nbconvert_exporter": "python", 279 | "pygments_lexer": "ipython2", 280 | "version": "2.7.11" 281 | } 282 | }, 283 | "nbformat": 4, 284 | "nbformat_minor": 0 285 | } 286 | -------------------------------------------------------------------------------- /Network Processing Notebooks/InBioMap Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from 
network_evaluation_tools import data_import_tools as dit\n", 13 | "import pandas as pd\n", 14 | "import itertools\n", 15 | "import time" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## Load InBio_Map Raw Data\n", 23 | "#### Source: https://www.intomics.com/inbio/map/#downloads\n", 24 | "Downloaded: November 30, 2016 \n", 25 | "Last Updated: September 12, 2016 \n", 26 | "Note about scoring: According to the supplement of the associated paper (Li T, et al. A scored human protein–protein interaction network to catalyze genomic interpretation. Nature Methods 14, 61–64 (2017) doi:10.1038/nmeth.4083), column 15 (index=14) should correspond to the confidence score of the edge. This column has 2 values, the confidence score and initial score. We will use the confidence score as it is a corrected version of the initial score calculated, indicating confidence that a particular interaction is real." 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "Raw edge count in InBio_Map: 625641\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 46 | "InBio_Map_Raw = pd.read_csv(wd+'Network_Data_Raw/InBio_Map_core_2016_09_12/core.psimitab',sep='\\t', header=-1)\n", 47 | "print 'Raw edge count in InBio_Map:', len(InBio_Map_Raw)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": { 54 | "collapsed": false 55 | }, 56 | "outputs": [ 57 | { 58 | "name": "stdout", 59 | "output_type": "stream", 60 | "text": [ 61 | "Human-Human only interactions in InBioMap: 625641\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "InBio_Map_Human_Only = InBio_Map_Raw[(InBio_Map_Raw[9]=='taxid:9606(Homo sapiens)') & (InBio_Map_Raw[10]=='taxid:9606(Homo sapiens)')]\n", 67 | "print 'Human-Human only interactions in InBioMap:', len(InBio_Map_Human_Only)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 4, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "# Extract gene list\n", 79 | "InBio_Map_Human_Genes = list(set(InBio_Map_Human_Only[0]).union(set(InBio_Map_Human_Only[1])))\n", 80 | "InBio_Map_Human_Genes = [str(gene) for gene in InBio_Map_Human_Genes]" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "## Convert Genes" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 5, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "17653 Valid Query Genes\n", 102 | "0 Invalid Query Genes\n" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "# Construct list of genes to be submitted to MyGene.Info API\n", 108 | "query_string, valid_genes, invalid_genes = gct.query_constructor(InBio_Map_Human_Genes)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 6, 114 | "metadata": { 115 | "collapsed": true 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "# Set scopes (gene naming systems to search)\n", 120 | "scopes = \"uniprot\"\n", 121 | "\n", 122 | "# Set fields (systems from which to return gene names from)\n", 123 | "fields = \"symbol, entrezgene\"" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 7, 129 | "metadata": { 130 | "collapsed": 
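The human-only restriction above keys on the two PSI-MITAB taxid columns. A generic sketch of that filter follows; the column positions and the exact taxid label are assumptions that vary by export (IntAct, for example, uses 'taxid:9606(human)|taxid:9606(Homo sapiens)').

```python
import pandas as pd

def human_only(mitab_df, taxid_col_a=9, taxid_col_b=10,
               human_label='taxid:9606(Homo sapiens)'):
    """Keep rows where both interactors are annotated as human."""
    mask = ((mitab_df[taxid_col_a] == human_label) &
            (mitab_df[taxid_col_b] == human_label))
    return mitab_df[mask]
```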
false 131 | }, 132 | "outputs": [ 133 | { 134 | "name": "stdout", 135 | "output_type": "stream", 136 | "text": [ 137 | "Batch query complete: 39.84 seconds\n", 138 | "17984 Matched query results\n" 139 | ] 140 | } 141 | ], 142 | "source": [ 143 | "# Query MyGene.Info\n", 144 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n", 145 | "print len(match_list), 'Matched query results'" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 8, 151 | "metadata": { 152 | "collapsed": false 153 | }, 154 | "outputs": [ 155 | { 156 | "name": "stdout", 157 | "output_type": "stream", 158 | "text": [ 159 | "Queries without full matching results found: 419\n", 160 | "\n", 161 | "233 Queries with mutliple matches found\n", 162 | "\n", 163 | "Query mapping table/dictionary construction complete: 76.78 seconds\n" 164 | ] 165 | } 166 | ], 167 | "source": [ 168 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": {}, 174 | "source": [ 175 | "## Construct Converted Network" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 9, 181 | "metadata": { 182 | "collapsed": true 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "query_edgelist = InBio_Map_Human_Only[[0, 1, 14]].values.tolist()\n", 187 | "query_edgelist_fmt = [[edge[0].split(':')[1], edge[1].split(':')[1], float(edge[2].split('|')[0])] for edge in query_edgelist]" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 10, 193 | "metadata": { 194 | "collapsed": false 195 | }, 196 | "outputs": [ 197 | { 198 | "name": "stdout", 199 | "output_type": "stream", 200 | "text": [ 201 | "CPU times: user 1.89 s, sys: 197 ms, total: 2.09 s\n", 202 | "Wall time: 1.87 s\n" 203 | ] 204 | } 205 | ], 206 | "source": [ 207 | "%%time\n", 208 | "# Convert weighted edge list\n", 209 | "InBioMap_edgelist_symbol = gct.convert_edgelist(query_edgelist_fmt, query_to_symbol, weighted=True)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 11, 215 | "metadata": { 216 | "collapsed": false 217 | }, 218 | "outputs": [ 219 | { 220 | "name": "stdout", 221 | "output_type": "stream", 222 | "text": [ 223 | "625641 input edges\n", 224 | "2498 self-edges removed\n", 225 | "12249 edges with un-mapped genes removed\n", 226 | "4896 duplicate edges removed\n", 227 | "Edge list filtered: 3.15 seconds\n", 228 | "605998 Edges remaining\n" 229 | ] 230 | } 231 | ], 232 | "source": [ 233 | "# Filter converted edge list\n", 234 | "InBioMap_edgelist_symbol_filt = gct.filter_converted_edgelist(InBioMap_edgelist_symbol, weighted=True)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 12, 240 | "metadata": { 241 | "collapsed": false 242 | }, 243 | "outputs": [ 244 | { 245 | "name": "stdout", 246 | "output_type": "stream", 247 | "text": [ 248 | "Edge list saved: 1.77 seconds\n" 249 | ] 250 | } 251 | ], 252 | "source": [ 253 | "# Write network to file\n", 254 | "gct.write_edgelist(InBioMap_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/InBioMap_Symbol.sif', binary=False)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 13, 260 | "metadata": { 261 | "collapsed": false 262 | }, 263 | "outputs": [ 264 | { 265 | "name": "stdout", 266 | "output_type": "stream", 267 | "text": [ 268 | "90.0% score: 1.0\n", 269 | "0 / 605998 edges retained\n" 270 | ] 271 | } 272 | ], 273 | "source": [ 274 | "# Create 
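The formatting step above unpacks InBioMap's PSI-MITAB fields: identifiers carry a database prefix before a colon, and the confidence column packs two '|'-separated numbers with the corrected confidence first. A sketch of that parsing on a hypothetical row:

```python
row = ['uniprotkb:P04637', 'uniprotkb:Q00987', '0.987|0.826']  # hypothetical MITAB fragment

node_a = row[0].split(':')[1]             # 'P04637' - strip the database prefix
node_b = row[1].split(':')[1]             # 'Q00987'
confidence = float(row[2].split('|')[0])  # 0.987 - first value is the corrected score
print('{}\t{}\t{}'.format(node_a, node_b, confidence))
```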
filtered network\n", 275 | "InBioMap90_edgelist = dit.filter_weighted_network_sif(wd+'Network_SIFs_Symbol/InBioMap_Symbol.sif', nodeA_col=0, nodeB_col=1, score_col=2, \n", 276 | " q=0.9, delimiter='\\t', verbose=True, save_path=wd+'Network_SIFs_Symbol/InBioMap90_Symbol.sif')" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 14, 282 | "metadata": { 283 | "collapsed": false 284 | }, 285 | "outputs": [ 286 | { 287 | "name": "stdout", 288 | "output_type": "stream", 289 | "text": [ 290 | "151352 / 605998 edges kept, 0.249756599857\n" 291 | ] 292 | } 293 | ], 294 | "source": [ 295 | "# The filter function didn't work here because the max value makes up >90% of the edges. \n", 296 | "# We need to filter but keep all max edges instead\n", 297 | "InBioMap_edgelist = pd.DataFrame(InBioMap_edgelist_symbol_filt, columns=['NodeA', 'NodeB', 'edgeScore'])\n", 298 | "q_score = InBioMap_edgelist['edgeScore'].quantile(0.9)\n", 299 | "InBioMap_edgelist_filt = InBioMap_edgelist[InBioMap_edgelist['edgeScore']>=q_score]\n", 300 | "print InBioMap_edgelist_filt.shape[0], '/', InBioMap_edgelist.shape[0], 'edges kept, ', float(InBioMap_edgelist_filt.shape[0])/InBioMap_edgelist.shape[0]" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 15, 306 | "metadata": { 307 | "collapsed": false 308 | }, 309 | "outputs": [], 310 | "source": [ 311 | "# Keeping all edges where the score == 1, it's a top 75% network, we will save this\n", 312 | "InBioMap_edgelist_filt[['NodeA', 'NodeB']].to_csv(wd+'Network_SIFs_Symbol/InBioMap75_Symbol.sif', sep='\\t', index=False, header=False)" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": { 319 | "collapsed": true 320 | }, 321 | "outputs": [], 322 | "source": [] 323 | } 324 | ], 325 | "metadata": { 326 | "kernelspec": { 327 | "display_name": "Python 2", 328 | "language": "python", 329 | "name": "python2" 330 | }, 331 | "language_info": { 332 | "codemirror_mode": { 333 | "name": "ipython", 334 | "version": 2 335 | }, 336 | "file_extension": ".py", 337 | "mimetype": "text/x-python", 338 | "name": "python", 339 | "nbconvert_exporter": "python", 340 | "pygments_lexer": "ipython2", 341 | "version": "2.7.11" 342 | } 343 | }, 344 | "nbformat": 4, 345 | "nbformat_minor": 0 346 | } 347 | -------------------------------------------------------------------------------- /Network Processing Notebooks/IntAct Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from network_evaluation_tools import data_import_tools as dit\n", 13 | "import pandas as pd\n", 14 | "import time" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Load IntAct Raw Data\n", 22 | "#### Source (PSI-MITAB): ftp://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.txt\n", 23 | "Downloaded: June 15, 2017 \n", 24 | "Last Updated: June 05, 2017 \n", 25 | "Notes for processing: All interactions listed here need to be filtered for human-human interactions. Given the size of the file, we will filter the interactions and save the human-only interactions to a separate file to be loaded to save memory." 
26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 4, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stderr", 37 | "output_type": "stream", 38 | "text": [ 39 | "/cellar/users/jkhuang/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (38,39) have mixed types. Specify dtype option on import or set low_memory=False.\n", 40 | " interactivity=interactivity, compiler=compiler, result=result)\n" 41 | ] 42 | }, 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "Raw edge count in IntAct: 653104\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 53 | "IntAct_Raw = pd.read_csv(wd+'Network_Data_Raw/IntAct/2016-09-08_intact.txt', sep='\\t')\n", 54 | "print 'Raw edge count in IntAct:', len(IntAct_Raw)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "## Custom Processing of Raw IntAct Data" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 5, 67 | "metadata": { 68 | "collapsed": false 69 | }, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/plain": [ 74 | "Index([u'#ID(s) interactor A', u'ID(s) interactor B',\n", 75 | " u'Alt. ID(s) interactor A', u'Alt. ID(s) interactor B',\n", 76 | " u'Alias(es) interactor A', u'Alias(es) interactor B',\n", 77 | " u'Interaction detection method(s)', u'Publication 1st author(s)',\n", 78 | " u'Publication Identifier(s)', u'Taxid interactor A',\n", 79 | " u'Taxid interactor B', u'Interaction type(s)', u'Source database(s)',\n", 80 | " u'Interaction identifier(s)', u'Confidence value(s)',\n", 81 | " u'Expansion method(s)', u'Biological role(s) interactor A',\n", 82 | " u'Biological role(s) interactor B',\n", 83 | " u'Experimental role(s) interactor A',\n", 84 | " u'Experimental role(s) interactor B', u'Type(s) interactor A',\n", 85 | " u'Type(s) interactor B', u'Xref(s) interactor A',\n", 86 | " u'Xref(s) interactor B', u'Interaction Xref(s)',\n", 87 | " u'Annotation(s) interactor A', u'Annotation(s) interactor B',\n", 88 | " u'Interaction annotation(s)', u'Host organism(s)',\n", 89 | " u'Interaction parameter(s)', u'Creation date', u'Update date',\n", 90 | " u'Checksum(s) interactor A', u'Checksum(s) interactor B',\n", 91 | " u'Interaction Checksum(s)', u'Negative', u'Feature(s) interactor A',\n", 92 | " u'Feature(s) interactor B', u'Stoichiometry(s) interactor A',\n", 93 | " u'Stoichiometry(s) interactor B',\n", 94 | " u'Identification method participant A',\n", 95 | " u'Identification method participant B'],\n", 96 | " dtype='object')" 97 | ] 98 | }, 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "IntAct_Raw.columns" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "#### Keep only human-human interactions" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 7, 118 | "metadata": { 119 | "collapsed": false, 120 | "scrolled": false 121 | }, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | "Human-Human only edge count in IntAct: 247565\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "# Filter for only human-human interactions in IntAct\n", 133 | "IntAct_Human_Only = IntAct_Raw[(IntAct_Raw['Taxid interactor A']=='taxid:9606(human)|taxid:9606(Homo sapiens)') & (IntAct_Raw['Taxid interactor 
B']=='taxid:9606(human)|taxid:9606(Homo sapiens)')]\n", 134 | "IntAct_Human_Only = IntAct_Human_Only.drop_duplicates()\n", 135 | "print 'Human-Human only edge count in IntAct:', IntAct_Human_Only.shape[0]" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 9, 141 | "metadata": { 142 | "collapsed": false 143 | }, 144 | "outputs": [], 145 | "source": [ 146 | "Human_IntAct_Genes = list(set(IntAct_Human_Only['#ID(s) interactor A']).union(set(IntAct_Human_Only['ID(s) interactor B'])))" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "## Convert Genes" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 14, 159 | "metadata": { 160 | "collapsed": false 161 | }, 162 | "outputs": [ 163 | { 164 | "name": "stdout", 165 | "output_type": "stream", 166 | "text": [ 167 | "19143 Valid Query Genes\n", 168 | "1162 Invalid Query Genes\n" 169 | ] 170 | } 171 | ], 172 | "source": [ 173 | "# Construct list of genes to be submitted to MyGene.Info API (remove all genes with 'intact' prefix)\n", 174 | "query_string, valid_genes, invalid_genes = gct.query_constructor(Human_IntAct_Genes, exclude_prefixes=['intact'])" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 17, 180 | "metadata": { 181 | "collapsed": false 182 | }, 183 | "outputs": [ 184 | { 185 | "name": "stdout", 186 | "output_type": "stream", 187 | "text": [ 188 | "Batch query complete: 29.14 seconds\n", 189 | "19368 Matched query results\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "# Set scopes (gene naming systems to search)\n", 195 | "scopes = \"uniprot\"\n", 196 | "\n", 197 | "# Set fields (systems from which to return gene names from)\n", 198 | "fields = \"symbol, entrezgene\"\n", 199 | "\n", 200 | "# Query MyGene.Info\n", 201 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n", 202 | "print len(match_list), 'Matched query results'" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 18, 208 | "metadata": { 209 | "collapsed": false, 210 | "scrolled": true 211 | }, 212 | "outputs": [ 213 | { 214 | "name": "stdout", 215 | "output_type": "stream", 216 | "text": [ 217 | "Queries without full matching results found: 4329\n", 218 | "\n", 219 | "157 Queries with mutliple matches found\n", 220 | "\n", 221 | "Query mapping table/dictionary construction complete: 94.21 seconds\n" 222 | ] 223 | } 224 | ], 225 | "source": [ 226 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "## Construct Converted Network" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 19, 239 | "metadata": { 240 | "collapsed": false 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "query_edgelist = IntAct_Human_Only[['#ID(s) interactor A', 'ID(s) interactor B']].drop_duplicates().values.tolist()" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 21, 250 | "metadata": { 251 | "collapsed": false 252 | }, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "5864 / 161035 edges with invalid nodes removed\n" 259 | ] 260 | } 261 | ], 262 | "source": [ 263 | "# Filter query edgelist of interactions with invalid genes\n", 264 | "query_edgelist_filt = gct.filter_query_edgelist(query_edgelist, invalid_genes)" 265 | ] 266 | }, 267 | { 268 | 
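IntAct identifier fields can hold internal accessions such as 'intact:EBI-…' that MyGene.info cannot map, which is why `query_constructor` above was given `exclude_prefixes=['intact']`; the remaining identifiers are later stripped of their prefix with `gct.get_identifier_without_prefix`. A sketch of both steps on hypothetical values:

```python
ids = ['uniprotkb:P04637', 'intact:EBI-123456', 'uniprotkb:Q00987']

invalid = [i for i in ids if i.lower().startswith('intact:')]  # un-mappable accessions
valid = [i for i in ids if i not in invalid]

# Equivalent of stripping the prefix: drop everything up to the first ':'
stripped = [i.split(':', 1)[1] for i in valid]
print(stripped)  # ['P04637', 'Q00987']
```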
"cell_type": "code", 269 | "execution_count": 23, 270 | "metadata": { 271 | "collapsed": true 272 | }, 273 | "outputs": [], 274 | "source": [ 275 | "# Format edge list by removing 'uniprot:' prefix from all interactors\n", 276 | "query_edgelist_filt_fmt = [[gct.get_identifier_without_prefix(edge[0]), gct.get_identifier_without_prefix(edge[1])] for edge in query_edgelist_filt]" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 24, 282 | "metadata": { 283 | "collapsed": false 284 | }, 285 | "outputs": [], 286 | "source": [ 287 | "# Convert network edge list to symbol\n", 288 | "IntAct_edgelist_symbol = gct.convert_edgelist(query_edgelist_filt_fmt, query_to_symbol)" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 25, 294 | "metadata": { 295 | "collapsed": false 296 | }, 297 | "outputs": [ 298 | { 299 | "name": "stdout", 300 | "output_type": "stream", 301 | "text": [ 302 | "155171 input edges\n", 303 | "3236 self-edges removed\n", 304 | "20662 edges with un-mapped genes removed\n", 305 | "16701 duplicate edges removed\n", 306 | "Edge list filtered: 0.43 seconds\n", 307 | "114572 Edges remaining\n" 308 | ] 309 | } 310 | ], 311 | "source": [ 312 | "# Filter converted edge list\n", 313 | "IntAct_edgelist_symbol_filt = gct.filter_converted_edgelist(IntAct_edgelist_symbol)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 26, 319 | "metadata": { 320 | "collapsed": false 321 | }, 322 | "outputs": [ 323 | { 324 | "name": "stdout", 325 | "output_type": "stream", 326 | "text": [ 327 | "Edge list saved: 0.24 seconds\n" 328 | ] 329 | } 330 | ], 331 | "source": [ 332 | "# Save filtered, converted edge list to file\n", 333 | "gct.write_edgelist(IntAct_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/IntAct_Symbol.sif')" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": { 340 | "collapsed": true 341 | }, 342 | "outputs": [], 343 | "source": [] 344 | } 345 | ], 346 | "metadata": { 347 | "kernelspec": { 348 | "display_name": "Python 2", 349 | "language": "python", 350 | "name": "python2" 351 | }, 352 | "language_info": { 353 | "codemirror_mode": { 354 | "name": "ipython", 355 | "version": 2 356 | }, 357 | "file_extension": ".py", 358 | "mimetype": "text/x-python", 359 | "name": "python", 360 | "nbconvert_exporter": "python", 361 | "pygments_lexer": "ipython2", 362 | "version": "2.7.11" 363 | } 364 | }, 365 | "nbformat": 4, 366 | "nbformat_minor": 0 367 | } 368 | -------------------------------------------------------------------------------- /Network Processing Notebooks/Mentha Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from network_evaluation_tools import data_import_tools as dit\n", 13 | "import pandas as pd\n", 14 | "import itertools\n", 15 | "import time" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## Load Mentha Raw Data\n", 23 | "#### Source (MITAB): http://mentha.uniroma2.it/doDownload.php?file=2017-06-12_MITAB-2.5.zip\n", 24 | "Downloaded: June 15, 2017 \n", 25 | "Last Updated: June 12, 2017 \n", 26 | "Notes for processing: This is the file should contain only human-human protein interactions but this should be checked and 
filtered if needed. \n", 27 | "A Note about scoring: Mentha does have a score assigned for each interaction called the 'mentha-score', this will be the score we use to filter the network." 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "Raw edge count in Mentha: 1114184\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 47 | "Mentha_Raw = pd.read_csv(wd+'Network_Data_Raw/mentha_2017_06_12', sep='\\t', header=-1)\n", 48 | "print 'Raw edge count in Mentha:', len(Mentha_Raw)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "Human-Human only interactions in Mentha: 531726\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "# Keep only human-human interactions\n", 68 | "Mentha_Human_only = Mentha_Raw[(Mentha_Raw[9]=='taxid:9606(Homo sapiens)') & (Mentha_Raw[10]=='taxid:9606(Homo sapiens)')]\n", 69 | "print 'Human-Human only interactions in Mentha:', len(Mentha_Human_only)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 4, 75 | "metadata": { 76 | "collapsed": true 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "# Extract gene list\n", 81 | "Human_Mentha_Genes = list(set(Mentha_Human_only[0]).union(set(Mentha_Human_only[1])))" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "## Convert Network Genes to symbol from UniProt Accession ID" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 5, 94 | "metadata": { 95 | "collapsed": false 96 | }, 97 | "outputs": [ 98 | { 99 | "name": "stdout", 100 | "output_type": "stream", 101 | "text": [ 102 | "18626 Valid Query Genes\n", 103 | "0 Invalid Query Genes\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "# Construct list of genes to be submitted to MyGene.Info API (remove all genes with 'intact' prefix)\n", 109 | "query_string, valid_genes, invalid_genes = gct.query_constructor(Human_Mentha_Genes)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 6, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "Batch query complete: 62.69 seconds\n", 124 | "18932 Matched query results\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "# Set scopes (gene naming systems to search)\n", 130 | "scopes = \"uniprot\"\n", 131 | "\n", 132 | "# Set fields (systems from which to return gene names from)\n", 133 | "fields = \"symbol, entrezgene\"\n", 134 | "\n", 135 | "# Query MyGene.Info\n", 136 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n", 137 | "print len(match_list), 'Matched query results'" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 7, 143 | "metadata": { 144 | "collapsed": false 145 | }, 146 | "outputs": [ 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "Queries without full matching results found: 1198\n", 152 | "The history saving thread hit an unexpected error (OperationalError('database is locked',)).History will not be written to the database.\n", 153 | "\n", 154 | "207 Queries with mutliple matches found\n", 155 | "\n", 156 | 
"Query mapping table/dictionary construction complete: 83.92 seconds\n" 157 | ] 158 | } 159 | ], 160 | "source": [ 161 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "## Construct Converted Network" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 8, 174 | "metadata": { 175 | "collapsed": true 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "query_edgelist = Mentha_Human_only[[0, 1, 14]].drop_duplicates().values.tolist()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 9, 185 | "metadata": { 186 | "collapsed": true 187 | }, 188 | "outputs": [], 189 | "source": [ 190 | "# Format edge list by removing 'uniprot:' prefix from all interactors\n", 191 | "query_edgelist_fmt = [[gct.get_identifier_without_prefix(edge[0]), gct.get_identifier_without_prefix(edge[1]), float(edge[2].split(':')[-1])] for edge in query_edgelist]" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 10, 197 | "metadata": { 198 | "collapsed": true 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "# Convert network edge list to symbol\n", 203 | "Mentha_edgelist_symbol = gct.convert_edgelist(query_edgelist_fmt, query_to_symbol, weighted=True)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 11, 209 | "metadata": { 210 | "collapsed": false 211 | }, 212 | "outputs": [ 213 | { 214 | "name": "stdout", 215 | "output_type": "stream", 216 | "text": [ 217 | "327857 input edges\n", 218 | "3247 self-edges removed\n", 219 | "8219 edges with un-mapped genes removed\n", 220 | "53515 duplicate edges removed\n", 221 | "Edge list filtered: 1.61 seconds\n", 222 | "262876 Edges remaining\n" 223 | ] 224 | } 225 | ], 226 | "source": [ 227 | "# Filter converted edge list\n", 228 | "Mentha_edgelist_symbol_filt = gct.filter_converted_edgelist(Mentha_edgelist_symbol, weighted=True)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 12, 234 | "metadata": { 235 | "collapsed": false 236 | }, 237 | "outputs": [ 238 | { 239 | "name": "stdout", 240 | "output_type": "stream", 241 | "text": [ 242 | "Edge list saved: 0.79 seconds\n" 243 | ] 244 | } 245 | ], 246 | "source": [ 247 | "# Save filtered, converted edge list to file\n", 248 | "gct.write_edgelist(Mentha_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/Mentha_Symbol.sif', binary=False)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 13, 254 | "metadata": { 255 | "collapsed": false 256 | }, 257 | "outputs": [ 258 | { 259 | "name": "stdout", 260 | "output_type": "stream", 261 | "text": [ 262 | "90.0% score: 0.454\n", 263 | "22886 / 262876 edges retained\n" 264 | ] 265 | } 266 | ], 267 | "source": [ 268 | "# Create filtered network\n", 269 | "Mentha90_edgelist = dit.filter_weighted_network_sif(wd+'Network_SIFs_Symbol/Mentha_Symbol.sif', nodeA_col=0, nodeB_col=1, score_col=2, \n", 270 | " q=0.9, delimiter='\\t', verbose=True, save_path=wd+'Network_SIFs_Symbol/Mentha90_Symbol.sif')" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": { 277 | "collapsed": true 278 | }, 279 | "outputs": [], 280 | "source": [] 281 | } 282 | ], 283 | "metadata": { 284 | "kernelspec": { 285 | "display_name": "Python 2", 286 | "language": "python", 287 | "name": "python2" 288 | }, 289 | "language_info": { 290 | "codemirror_mode": { 291 | 
"name": "ipython", 292 | "version": 2 293 | }, 294 | "file_extension": ".py", 295 | "mimetype": "text/x-python", 296 | "name": "python", 297 | "nbconvert_exporter": "python", 298 | "pygments_lexer": "ipython2", 299 | "version": "2.7.11" 300 | } 301 | }, 302 | "nbformat": 4, 303 | "nbformat_minor": 0 304 | } 305 | -------------------------------------------------------------------------------- /Network Processing Notebooks/MultiNet Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from network_evaluation_tools import data_import_tools as dit\n", 13 | "import pandas as pd\n", 14 | "import itertools" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Load MultiNet Raw Data\n", 22 | "#### Source: http://homes.gersteinlab.org/Khurana-PLoSCompBio-2013/\n", 23 | "Downloaded: August 12, 2016 \n", 24 | "Last Updated: March 17, 2013 \n", 25 | "Processing Notes: MultiNet has labels which interactions are noted as PPI and which are not. In the initial case, we will be examining all interaction information for MultiNet. However, in this case it is simple enough to parse the PPI only information from the data, and can be done in future work if necessary." 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 4, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "Raw edge count in MultiNet: 109598\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 45 | "MultiNet_Raw = pd.read_csv(wd+'Network_Data_Raw/Multinet.interactions.network_presence_2013_03_17.txt',sep='\\t')\n", 46 | "print 'Raw edge count in MultiNet:', MultiNet_Raw.shape[0]" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 8, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "# Build edge list from interaction column. 
The two parts of the interaction name on either side of '_' are gene symbols\n", 58 | "MultiNet_edgelist = [interaction.split('_') for interaction in MultiNet_Raw['INTERACTION_NAME']]" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 9, 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "# Sort each edge representation for filtering\n", 70 | "MultiNet_edgelist_sorted = [sorted(edge) for edge in MultiNet_edgelist]" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 10, 76 | "metadata": { 77 | "collapsed": false 78 | }, 79 | "outputs": [ 80 | { 81 | "name": "stdout", 82 | "output_type": "stream", 83 | "text": [ 84 | "109598 input edges\n", 85 | "0 self-edges removed\n", 86 | "0 edges with un-mapped genes removed\n", 87 | "0 duplicate edges removed\n", 88 | "Edge list filtered: 0.31 seconds\n", 89 | "109598 Edges remaining\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "# Filter edgelist for duplicate nodes and for self-edges\n", 95 | "MultiNet_edgelist_filt = gct.filter_converted_edgelist(MultiNet_edgelist_sorted)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 13, 101 | "metadata": { 102 | "collapsed": false 103 | }, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "Edge list saved: 0.11 seconds\n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "# Save genelist to file\n", 115 | "outdir = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/Network_SIFs_Symbol/'\n", 116 | "gct.write_edgelist(MultiNet_edgelist_filt, outdir+'MultiNet_Symbol.sif')" 117 | ] 118 | } 119 | ], 120 | "metadata": { 121 | "kernelspec": { 122 | "display_name": "Python 2", 123 | "language": "python", 124 | "name": "python2" 125 | }, 126 | "language_info": { 127 | "codemirror_mode": { 128 | "name": "ipython", 129 | "version": 2 130 | }, 131 | "file_extension": ".py", 132 | "mimetype": "text/x-python", 133 | "name": "python", 134 | "nbconvert_exporter": "python", 135 | "pygments_lexer": "ipython2", 136 | "version": "2.7.11" 137 | } 138 | }, 139 | "nbformat": 4, 140 | "nbformat_minor": 0 141 | } 142 | -------------------------------------------------------------------------------- /Network Processing Notebooks/PID Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from network_evaluation_tools import data_import_tools as dit\n", 13 | "import pandas as pd\n", 14 | "import time" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Load PID Raw Data\n", 22 | "#### Source: http://www.pathwaycommons.org/archives/PC2/v9/PathwayCommons9.pid.hgnc.sif.gz\n", 23 | "Downloaded: June 19, 2017 \n", 24 | "Last (via Pathway Commons v8 datasources.txt file): July 27, 2010 \n", 25 | "Note: The text file has more lines than the sif file in Pathway Commons. However, the text file has some interactions that are unclear how to resolve so for this case we will use the sif file provided by Pathway Commons \n", 26 | "Also note: This network contains some interacions with CHEBI small molecules. 
These interactions will be removed" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 38 | "PID_Raw = pd.read_csv(wd+'Network_Data_Raw/PathwayCommons9.pid.hgnc.sif',sep='\\t', header=-1)\n", 39 | "print 'Raw interactions in NCI PID:', PID_Raw.shape[0]" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 10, 45 | "metadata": { 46 | "collapsed": false 47 | }, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "Protein-Protein interactions in NCI PID: 27489\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "# Filter all interactions that contain a CHEBI: item\n", 59 | "PID_filt = PID_Raw[(~PID_Raw[0].str.contains(':')) & (~PID_Raw[2].str.contains(':'))]\n", 60 | "PID_edgelist = PID_filt[[0, 2]].values.tolist()\n", 61 | "print 'Protein-Protein interactions in NCI PID:', len(PID_edgelist)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 11, 67 | "metadata": { 68 | "collapsed": true 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "# Sort each edge representation for filtering\n", 73 | "PID_edgelist_sorted = [sorted(edge) for edge in PID_edgelist]" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 12, 79 | "metadata": { 80 | "collapsed": false 81 | }, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "27489 input edges\n", 88 | "0 self-edges removed\n", 89 | "0 edges with un-mapped genes removed\n", 90 | "6047 duplicate edges removed\n", 91 | "Edge list filtered: 0.11 seconds\n", 92 | "21442 Edges remaining\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "# Filter edgelist for duplicate nodes and for self-edges\n", 98 | "PID_edgelist_filt = gct.filter_converted_edgelist(PID_edgelist_sorted)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 14, 104 | "metadata": { 105 | "collapsed": false 106 | }, 107 | "outputs": [ 108 | { 109 | "name": "stdout", 110 | "output_type": "stream", 111 | "text": [ 112 | "Edge list saved: 0.06 seconds\n" 113 | ] 114 | } 115 | ], 116 | "source": [ 117 | "# Save genelist to file\n", 118 | "outdir = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/Network_SIFs_Symbol/'\n", 119 | "gct.write_edgelist(PID_edgelist_filt, outdir+'PID_Symbol.sif')" 120 | ] 121 | } 122 | ], 123 | "metadata": { 124 | "kernelspec": { 125 | "display_name": "Python 2", 126 | "language": "python", 127 | "name": "python2" 128 | }, 129 | "language_info": { 130 | "codemirror_mode": { 131 | "name": "ipython", 132 | "version": 2 133 | }, 134 | "file_extension": ".py", 135 | "mimetype": "text/x-python", 136 | "name": "python", 137 | "nbconvert_exporter": "python", 138 | "pygments_lexer": "ipython2", 139 | "version": "2.7.11" 140 | } 141 | }, 142 | "nbformat": 4, 143 | "nbformat_minor": 0 144 | } 145 | -------------------------------------------------------------------------------- /Network Processing Notebooks/Pathway Commons Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from network_evaluation_tools import data_import_tools as dit\n", 
13 | "import pandas as pd\n", 14 | "import time" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Load Pathway Commons Raw Data (All interactions)\n", 22 | "#### Source: http://www.pathwaycommons.org/archives/PC2/v9/PathwayCommons9.All.hgnc.txt.gz\n", 23 | "Downloaded: June 15, 2017 \n", 24 | "Last Updated: May 25, 2017 \n", 25 | "Citation: Pathway Commons, a web resource for biological pathway data. Cerami E et al. Nucleic Acids Research (2011). \n", 26 | "A Note about filtering interactions: Pathway Commons also contains interactions between proteins and small molecules from the CHEBI database. These interactions will need to be filtered out as they are not protein-protein interactions. \n", 27 | "Also note: The text file has more lines than the sif file in Pathway Commons. However, the text file has some interactions that are unclear how to resolve so for this case we will use the sif file provided by Pathway Commons" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 9, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "Raw interactions in Pathway Commons v9: 1503144\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 47 | "PC_Raw = pd.read_csv(wd+'Network_Data_Raw/PathwayCommons9.All.hgnc.sif', sep='\\t', header=-1)\n", 48 | "print 'Raw interactions in Pathway Commons v9:', PC_Raw.shape[0]" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 25, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "Protein-Protein interactions in Pathway Commons v9: 968186\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "# Filter all interactions that contain a CHEBI: item\n", 68 | "PC_filt = PC_Raw[(~PC_Raw[0].str.contains(':')) & (~PC_Raw[2].str.contains(':'))]\n", 69 | "PC_edgelist = PC_filt[[0, 2]].values.tolist()\n", 70 | "print 'Protein-Protein interactions in Pathway Commons v9:', len(PC_edgelist)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 26, 76 | "metadata": { 77 | "collapsed": true 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "# Sort each edge representation for filtering\n", 82 | "PC_edgelist_sorted = [sorted(edge) for edge in PC_edgelist]" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 27, 88 | "metadata": { 89 | "collapsed": false 90 | }, 91 | "outputs": [ 92 | { 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "968186 input edges\n", 97 | "0 self-edges removed\n", 98 | "0 edges with un-mapped genes removed\n", 99 | "143511 duplicate edges removed\n", 100 | "Edge list filtered: 1.92 seconds\n", 101 | "824675 Edges remaining\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "# Filter edgelist for duplicate nodes and for self-edges\n", 107 | "PC_edgelist_filt = gct.filter_converted_edgelist(PC_edgelist_sorted)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 28, 113 | "metadata": { 114 | "collapsed": false 115 | }, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | "Edge list saved: 0.55 seconds\n" 122 | ] 123 | } 124 | ], 125 | "source": [ 126 | "# Save genelist to file\n", 127 | "outdir = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/Network_SIFs_Symbol/'\n", 128 | 
"gct.write_edgelist(PC_edgelist_filt, outdir+'PathwayCommons_Symbol.sif')" 129 | ] 130 | } 131 | ], 132 | "metadata": { 133 | "kernelspec": { 134 | "display_name": "Python 2", 135 | "language": "python", 136 | "name": "python2" 137 | }, 138 | "language_info": { 139 | "codemirror_mode": { 140 | "name": "ipython", 141 | "version": 2 142 | }, 143 | "file_extension": ".py", 144 | "mimetype": "text/x-python", 145 | "name": "python", 146 | "nbconvert_exporter": "python", 147 | "pygments_lexer": "ipython2", 148 | "version": "2.7.11" 149 | } 150 | }, 151 | "nbformat": 4, 152 | "nbformat_minor": 0 153 | } 154 | -------------------------------------------------------------------------------- /Network Processing Notebooks/Reactome Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from network_evaluation_tools import data_import_tools as dit\n", 13 | "import pandas as pd\n", 14 | "import itertools\n", 15 | "import time" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## Load Reactome Raw Data\n", 23 | "#### Source: http://www.reactome.org/download/current/homo_sapiens.interactions.txt.gz\n", 24 | "#### File to download: The link labelled \"Human protein-protein interaction pairs in tab-delimited format\" seems to have many more interactions than the MITAB file format. This is the file that we will use for this network.\n", 25 | "Downloaded: June 15, 2017 \n", 26 | "Last Updated: April 20, 2017 " 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "Raw Edges in Reactome v60: 2523567\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 46 | "Reactome_Raw = pd.read_csv(wd+'Network_Data_Raw/Reactome_v60.interactions.txt',sep='\\t',skiprows=1, header=-1, low_memory=False)\n", 47 | "print 'Raw Edges in Reactome v60:', len(Reactome_Raw)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": { 54 | "collapsed": false 55 | }, 56 | "outputs": [ 57 | { 58 | "name": "stdout", 59 | "output_type": "stream", 60 | "text": [ 61 | "214432 Raw Reactome Edges after removing duplicate edges\n", 62 | "210066 Raw Reactome Edges after removing duplicate and self-edges\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "# Get edge list of network (filter for duplicate edges and self-edges)\n", 68 | "query_edgelist_filt = Reactome_Raw[[0,3]].drop_duplicates()\n", 69 | "print len(query_edgelist_filt), \"Raw Reactome Edges after removing duplicate edges\"\n", 70 | "query_edgelist_filt2 = query_edgelist_filt[query_edgelist_filt[0]!=query_edgelist_filt[3]]\n", 71 | "print len(query_edgelist_filt2), \"Raw Reactome Edges after removing duplicate and self-edges\"\n", 72 | "query_edgelist = query_edgelist_filt2.values.tolist()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 4, 78 | "metadata": { 79 | "collapsed": true 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "# Extract gene list\n", 84 | "Reactome_Raw_Genes = list(set(query_edgelist_filt2[0]).union(set(query_edgelist_filt2[3])))" 85 | ] 86 | }, 87 | { 
88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "## Convert Genes from UniProtKB to Symbol" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 5, 97 | "metadata": { 98 | "collapsed": false 99 | }, 100 | "outputs": [ 101 | { 102 | "name": "stdout", 103 | "output_type": "stream", 104 | "text": [ 105 | "8387 Valid Query Genes\n", 106 | "0 Invalid Query Genes\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "query_string, valid_genes, invalid_genes = gct.query_constructor(Reactome_Raw_Genes)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 6, 117 | "metadata": { 118 | "collapsed": false 119 | }, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "Batch query complete: 13.56 seconds\n", 126 | "8518 Matched query results\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "# Set scopes (gene naming systems to search)\n", 132 | "scopes = \"uniprot\"\n", 133 | "\n", 134 | "# Set fields (systems from which to return gene names from)\n", 135 | "fields = \"symbol, entrezgene\"\n", 136 | "\n", 137 | "# Query MyGene.Info\n", 138 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n", 139 | "print len(match_list), 'Matched query results'" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 7, 145 | "metadata": { 146 | "collapsed": false 147 | }, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "Queries without full matching results found: 511\n", 154 | "\n", 155 | "102 Queries with mutliple matches found\n", 156 | "\n", 157 | "Query mapping table/dictionary construction complete: 17.83 seconds\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "## Construct Converted Network" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 8, 175 | "metadata": { 176 | "collapsed": false 177 | }, 178 | "outputs": [], 179 | "source": [ 180 | "# Format edge list by removing prefixes from all interactors\n", 181 | "query_edgelist_fmt = [[gct.get_identifier_without_prefix(edge[0]), gct.get_identifier_without_prefix(edge[1])] for edge in query_edgelist]" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 9, 187 | "metadata": { 188 | "collapsed": true 189 | }, 190 | "outputs": [], 191 | "source": [ 192 | "# Convert network edge list to symbol\n", 193 | "Reactome_edgelist_symbol = gct.convert_edgelist(query_edgelist_fmt, query_to_symbol, weighted=False)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 11, 199 | "metadata": { 200 | "collapsed": false 201 | }, 202 | "outputs": [ 203 | { 204 | "name": "stdout", 205 | "output_type": "stream", 206 | "text": [ 207 | "210066 input edges\n", 208 | "2708 self-edges removed\n", 209 | "10886 edges with un-mapped genes removed\n", 210 | "1970 duplicate edges removed\n", 211 | "Edge list filtered: 0.51 seconds\n", 212 | "194502 Edges remaining\n" 213 | ] 214 | } 215 | ], 216 | "source": [ 217 | "# Filter converted edge list\n", 218 | "Reactome_edgelist_symbol_filt = gct.filter_converted_edgelist(Reactome_edgelist_symbol, weighted=False)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 12, 224 | "metadata": { 225 | "collapsed": false 226 | 
}, 227 | "outputs": [ 228 | { 229 | "name": "stdout", 230 | "output_type": "stream", 231 | "text": [ 232 | "Edge list saved: 0.59 seconds\n" 233 | ] 234 | } 235 | ], 236 | "source": [ 237 | "# Save filtered, converted edge list to file\n", 238 | "gct.write_edgelist(Reactome_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/Reactome_Symbol.sif', binary=True)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": { 245 | "collapsed": true 246 | }, 247 | "outputs": [], 248 | "source": [] 249 | } 250 | ], 251 | "metadata": { 252 | "kernelspec": { 253 | "display_name": "Python 2", 254 | "language": "python", 255 | "name": "python2" 256 | }, 257 | "language_info": { 258 | "codemirror_mode": { 259 | "name": "ipython", 260 | "version": 2 261 | }, 262 | "file_extension": ".py", 263 | "mimetype": "text/x-python", 264 | "name": "python", 265 | "nbconvert_exporter": "python", 266 | "pygments_lexer": "ipython2", 267 | "version": "2.7.11" 268 | } 269 | }, 270 | "nbformat": 4, 271 | "nbformat_minor": 0 272 | } 273 | -------------------------------------------------------------------------------- /Network Processing Notebooks/Reactome-FIs Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from network_evaluation_tools import data_import_tools as dit\n", 13 | "import pandas as pd\n", 14 | "import itertools\n", 15 | "import time" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## Load Reactome-Functional Interactions Raw Data\n", 23 | "#### Source: http://reactomews.oicr.on.ca:8080/caBigR3WebApp2016/FIsInGene_022717_with_annotations.txt.zip\n", 24 | "Downloaded: June 15, 2017 \n", 25 | "Last Updated: February 27, 2017 \n", 26 | "Note about processing: It looks like most of the edges are given as gene symbols but many of them seem to be invalid names, so we will use some of the gene conversion tools to filter these results as best we can." 
27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "Raw edges in ReactomeFI: 230243\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 46 | "Reactome_FIs_Raw = pd.read_csv(wd+'Network_Data_Raw/FIsInGene_022717_with_annotations.txt',sep='\\t')\n", 47 | "print 'Raw edges in ReactomeFI:', Reactome_FIs_Raw.shape[0]" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": { 54 | "collapsed": true 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "# Extract gene list\n", 59 | "Reactome_FIs_Raw_Genes = list(set(Reactome_FIs_Raw['Gene1']).union(set(Reactome_FIs_Raw['Gene2'])))" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 4, 65 | "metadata": { 66 | "collapsed": false 67 | }, 68 | "outputs": [ 69 | { 70 | "name": "stdout", 71 | "output_type": "stream", 72 | "text": [ 73 | "12254 Valid Query Genes\n", 74 | "23 Invalid Query Genes:\n", 75 | "['YWHAE/FAM22B FUSION', 'RUNX1/C20ORF112 FUSION', 'IGKV A18', 'APC VARIANT PROTEIN', 'STAG1 VARIANT PROTEIN', 'MIR CL-10', 'BETA 2-MICROGLOBULIN', 'BCR/ABL FUSION', 'ATP2B2 VARIANT PROTEIN', 'ITGA7 VARIANT PROTEIN', 'CREB-1', 'CD40 LIGAND', 'NUMA1 VARIANT PROTEIN', 'PIK4CA VARIANT PROTEIN', 'EPHB2 VARIANT PROTEIN', 'RUNX1/CBFA2T2 FUSION', 'TNC VARIANT PROTEIN', 'PIK3C2B VARIANT PROTEIN', 'PLCG1 VARIANT PROTEIN', 'WUGSC:H_GS165O14.2', 'PIK3CA VARIANT PROTEIN', 'YWHAE/FAM22A FUSION', 'PDHA1/LOC79064']\n" 76 | ] 77 | } 78 | ], 79 | "source": [ 80 | "# Find \"invalid genes\" by text format\n", 81 | "query_string, valid_genes, invalid_genes = gct.query_constructor(Reactome_FIs_Raw_Genes, exclude_prefixes=['CHEBI'], print_invalid_genes=True)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 5, 87 | "metadata": { 88 | "collapsed": false 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "# Get Edgelist of network\n", 93 | "query_edgelist = Reactome_FIs_Raw[['Gene1','Gene2', 'Score']].values.tolist()" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 6, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "820 / 230243 edges with invalid nodes removed\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "# Filter query edges\n", 113 | "query_edgelist_filt = gct.filter_query_edgelist(query_edgelist,invalid_genes)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 7, 119 | "metadata": { 120 | "collapsed": false 121 | }, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | "229423 input edges\n", 128 | "0 self-edges removed\n", 129 | "0 edges with un-mapped genes removed\n", 130 | "0 duplicate edges removed\n", 131 | "Edge list filtered: 1.95 seconds\n", 132 | "229423 Edges remaining\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "# Filter edge list\n", 138 | "ReactomeFI_edgelist_filt = gct.filter_converted_edgelist(query_edgelist_filt, weighted=True)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 8, 144 | "metadata": { 145 | "collapsed": false 146 | }, 147 | "outputs": [ 148 | { 149 | "name": "stdout", 150 | "output_type": "stream", 151 | "text": [ 152 | "Edge list saved: 0.68 seconds\n" 153 | ] 154 | } 155 | ], 156 | 
"source": [ 157 | "# Save filtered, converted edge list to file\n", 158 | "gct.write_edgelist(ReactomeFI_edgelist_filt, wd+'Network_SIFs_Symbol/ReactomeFI_Symbol.sif', binary=False)" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 36, 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "90.0% score: 1.0\n", 173 | "0 / 229423 edges retained\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "# Create filtered network\n", 179 | "ReactomeFI90_edgelist = dit.filter_weighted_network_sif(wd+'Network_SIFs_Symbol/ReactomeFI_Symbol.sif', nodeA_col=0, nodeB_col=1, score_col=2, \n", 180 | " q=0.9, delimiter='\\t', verbose=True, save_path=wd+'Network_SIFs_Symbol/ReactomeFI90_edgelist_Symbol.sif')" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 37, 186 | "metadata": { 187 | "collapsed": false 188 | }, 189 | "outputs": [ 190 | { 191 | "name": "stdout", 192 | "output_type": "stream", 193 | "text": [ 194 | "198541 / 229423 edges kept, 0.86539274615\n" 195 | ] 196 | } 197 | ], 198 | "source": [ 199 | "# The filter function didn't work here because the max value makes up >90% of the edges. \n", 200 | "# We need to filter but keep all max edges instead\n", 201 | "ReactomeFI_edgelist = pd.DataFrame(ReactomeFI_edgelist_filt, columns=['NodeA', 'NodeB', 'Score'])\n", 202 | "q_score = ReactomeFI_edgelist['Score'].quantile(0.9)\n", 203 | "ReactomeFI_edgelist_filt2 = ReactomeFI_edgelist[ReactomeFI_edgelist['Score']>=q_score]\n", 204 | "print ReactomeFI_edgelist_filt2.shape[0], '/', ReactomeFI_edgelist.shape[0], 'edges kept, ', float(ReactomeFI_edgelist_filt2.shape[0])/ReactomeFI_edgelist.shape[0]" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": { 211 | "collapsed": true 212 | }, 213 | "outputs": [], 214 | "source": [ 215 | "# Essentially >85% of the edges have the 'maximum score' which makes almost no sense for filtering further" 216 | ] 217 | } 218 | ], 219 | "metadata": { 220 | "kernelspec": { 221 | "display_name": "Python 2", 222 | "language": "python", 223 | "name": "python2" 224 | }, 225 | "language_info": { 226 | "codemirror_mode": { 227 | "name": "ipython", 228 | "version": 2 229 | }, 230 | "file_extension": ".py", 231 | "mimetype": "text/x-python", 232 | "name": "python", 233 | "nbconvert_exporter": "python", 234 | "pygments_lexer": "ipython2", 235 | "version": "2.7.11" 236 | } 237 | }, 238 | "nbformat": 4, 239 | "nbformat_minor": 0 240 | } 241 | -------------------------------------------------------------------------------- /Network Processing Notebooks/STRING Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from network_evaluation_tools import data_import_tools as dit\n", 13 | "import pandas as pd\n", 14 | "import time" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Load STRING Raw Data\n", 22 | "#### Source: http://string-db.org/download/protein.links.v10.5.txt.gz\n", 23 | "#### Source (detailed): http://string-db.org/download/protein.links.detailed.v10.5.txt.gz\n", 24 | "#### File to download: The link labelled 'protein.links.v10.5.txt.gz' is 
simply the binary file version of the 'detailed' file. The detailed file documents the types of interactions and support for each interaction. It can be used for filtering in the future if desired, but will not be filtered on those categories currently.\n", 25 | "Downloaded: June 15, 2017 \n", 26 | "Last Updated: May 14, 2017 \n", 27 | "Processing note: This data needs to be filtered for human-only interactions. This is a very long and large file, so we will parse the edges that are human-human interactions only by streaming the file. Then the resulting human-human interaction file will be read to be processed." 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 4, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "Filtered human-human STRING interactions only: 1793.17046094 seconds\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "# Load and filter STRING for only human-human protein interactions\n", 47 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 48 | "starttime=time.time()\n", 49 | "g=open(wd+'Network_Data_Raw/STRING/STRING_human_v10.5.txt','w')\n", 50 | "with open(wd+'Network_Data_Raw/STRING/protein.links.v10.5.txt') as f:\n", 51 | "    for line in f:\n", 52 | "        edge = line.split(' ')\n", 53 | "        if edge[0].startswith('9606') and edge[1].startswith('9606'):\n", 54 | "            g.write(edge[0].split('.')[1]+'\\t'+edge[1].split('.')[1]+'\\t'+edge[2]+'\\n')\n", 55 | "print 'Filtered human-human STRING interactions only:', time.time()-starttime, 'seconds'\n", 56 | "g.close()" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "## Load human-filtered STRING edges" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 2, 69 | "metadata": { 70 | "collapsed": false 71 | }, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "Raw Edges in STRING v10.5: 11353056\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 83 | "STRING_Raw = pd.read_csv(wd+'Network_Data_Raw/STRING/STRING_human_v10.5.txt',sep='\\t',header=-1)\n", 84 | "STRING_Raw.columns = ['NodeA', 'NodeB', 'Score']\n", 85 | "print 'Raw Edges in STRING v10.5:', len(STRING_Raw)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 3, 91 | "metadata": { 92 | "collapsed": false 93 | }, 94 | "outputs": [ 95 | { 96 | "name": "stdout", 97 | "output_type": "stream", 98 | "text": [ 99 | "Edges in STRING v10.5 after dropping duplicates: 11353056\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "STRING_Raw_filt = STRING_Raw.drop_duplicates()\n", 105 | "print 'Edges in STRING v10.5 after dropping duplicates:', len(STRING_Raw_filt)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 4, 111 | "metadata": { 112 | "collapsed": false 113 | }, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "The history saving thread hit an unexpected error (OperationalError('database is locked',)).History will not be written to the database.\n" 120 | ] 121 | } 122 | ], 123 | "source": [ 124 | "STRING_Genes = list(set(STRING_Raw_filt['NodeA']).union(set(STRING_Raw_filt['NodeB'])))" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 5, 130 | "metadata": { 131 | "collapsed": false 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | 
"query_edgelist = STRING_Raw_filt[['NodeA', 'NodeB', 'Score']].values.tolist()" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "## Convert Genes from Ensembl Protein to Hugo Symbol" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 6, 148 | "metadata": { 149 | "collapsed": false 150 | }, 151 | "outputs": [ 152 | { 153 | "name": "stdout", 154 | "output_type": "stream", 155 | "text": [ 156 | "19576 Valid Query Genes\n", 157 | "0 Invalid Query Genes\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "query_string, valid_genes, invalid_genes = gct.query_constructor(STRING_Genes)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 7, 168 | "metadata": { 169 | "collapsed": false 170 | }, 171 | "outputs": [ 172 | { 173 | "name": "stdout", 174 | "output_type": "stream", 175 | "text": [ 176 | "Batch query complete: 23.11 seconds\n", 177 | "19578 Matched query results\n" 178 | ] 179 | } 180 | ], 181 | "source": [ 182 | "# Set scopes (gene naming systems to search)\n", 183 | "scopes = \"ensemblprotein\"\n", 184 | "\n", 185 | "# Set fields (systems from which to return gene names from)\n", 186 | "fields = \"symbol, entrezgene\"\n", 187 | "\n", 188 | "# Query MyGene.Info\n", 189 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n", 190 | "print len(match_list), 'Matched query results'" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 8, 196 | "metadata": { 197 | "collapsed": false 198 | }, 199 | "outputs": [ 200 | { 201 | "name": "stdout", 202 | "output_type": "stream", 203 | "text": [ 204 | "Queries without full matching results found: 1584\n", 205 | "\n", 206 | "1 Queries with mutliple matches found\n", 207 | "\n", 208 | "Query mapping table/dictionary construction complete: 115.61 seconds\n" 209 | ] 210 | } 211 | ], 212 | "source": [ 213 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "## Construct Converted Network" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 9, 226 | "metadata": { 227 | "collapsed": false 228 | }, 229 | "outputs": [ 230 | { 231 | "name": "stdout", 232 | "output_type": "stream", 233 | "text": [ 234 | "CPU times: user 26.7 s, sys: 2.74 s, total: 29.5 s\n", 235 | "Wall time: 29.2 s\n" 236 | ] 237 | } 238 | ], 239 | "source": [ 240 | "%%time\n", 241 | "# Convert weighted edge list\n", 242 | "STRING_edgelist_symbol = gct.convert_edgelist(query_edgelist, query_to_symbol, weighted=True)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 10, 248 | "metadata": { 249 | "collapsed": false 250 | }, 251 | "outputs": [ 252 | { 253 | "name": "stdout", 254 | "output_type": "stream", 255 | "text": [ 256 | "11353056 input edges\n", 257 | "30268 self-edges removed\n", 258 | "1043874 edges with un-mapped genes removed\n", 259 | "5143146 duplicate edges removed\n", 260 | "Edge list filtered: 77.42 seconds\n", 261 | "5135768 Edges remaining\n" 262 | ] 263 | } 264 | ], 265 | "source": [ 266 | "# Filter converted edge list\n", 267 | "STRING_edgelist_symbol_filt = gct.filter_converted_edgelist(STRING_edgelist_symbol, weighted=True)" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 11, 273 | "metadata": { 274 | "collapsed": false 275 | }, 276 | "outputs": [ 277 | { 278 | "name": "stdout", 279 | 
"output_type": "stream", 280 | "text": [ 281 | "Edge list saved: 8.28 seconds\n" 282 | ] 283 | } 284 | ], 285 | "source": [ 286 | "# Write network to file\n", 287 | "gct.write_edgelist(STRING_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/STRING_Symbol.sif', binary=False)" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 12, 293 | "metadata": { 294 | "collapsed": false 295 | }, 296 | "outputs": [ 297 | { 298 | "name": "stdout", 299 | "output_type": "stream", 300 | "text": [ 301 | "90.0% score: 497.0\n", 302 | "513035 / 5135768 edges retained\n" 303 | ] 304 | } 305 | ], 306 | "source": [ 307 | "# Create filtered network\n", 308 | "STRING90_edgelist = dit.filter_weighted_network_sif(wd+'Network_SIFs_Symbol/STRING_Symbol.sif', nodeA_col=0, nodeB_col=1, score_col=2, \n", 309 | " q=0.9, delimiter='\\t', verbose=True, save_path=wd+'Network_SIFs_Symbol/STRING90_Symbol.sif')" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": { 316 | "collapsed": true 317 | }, 318 | "outputs": [], 319 | "source": [] 320 | } 321 | ], 322 | "metadata": { 323 | "kernelspec": { 324 | "display_name": "Python 2", 325 | "language": "python", 326 | "name": "python2" 327 | }, 328 | "language_info": { 329 | "codemirror_mode": { 330 | "name": "ipython", 331 | "version": 2 332 | }, 333 | "file_extension": ".py", 334 | "mimetype": "text/x-python", 335 | "name": "python", 336 | "nbconvert_exporter": "python", 337 | "pygments_lexer": "ipython2", 338 | "version": "2.7.11" 339 | } 340 | }, 341 | "nbformat": 4, 342 | "nbformat_minor": 0 343 | } 344 | -------------------------------------------------------------------------------- /Network Processing Notebooks/iRefIndex Processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from network_evaluation_tools import gene_conversion_tools as gct\n", 12 | "from network_evaluation_tools import data_import_tools as dit\n", 13 | "import pandas as pd\n", 14 | "import itertools\n", 15 | "import time" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## Load iRefIndex Raw Data\n", 23 | "#### Source (MITAB): http://irefindex.org/download/irefindex/data/archive/release_14.0/psi_mitab/MITAB2.6/9606.mitab.07042015.txt.zip\n", 24 | "Downloaded: July 28, 2016 \n", 25 | "Last Updated: April 20, 2015 \n", 26 | "Notes for processing: This is the file for human protein interactions, however, not all interactions are human-human interactions. These need to be filtered. Also all ID's not without RefSeq or UniProt ID are excluded. Custom processing for this network is described below\n", 27 | "### From iRefIndex Mapping Documentation Page:\n", 28 | "\"We have made a file which provides mappings between iRefIndex identifiers and popular external identifiers. The current files contain all UniProt and RefSeq identifiers known to the current version of iRefIndex as documented on the sources page. For specific source documentation, see the sources for each released version. 
\n", 29 | " \n", 30 | "Other database identifiers are provided as database/accession pairs only when the iRefIndex identifier (ROGID) does not have a corresponding UniProt or RefSeq record with an identical sequence.\" \n", 31 | " \n", 32 | "Therefore: Interactions containing an ROGID identifier will be removed" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": { 39 | "collapsed": false 40 | }, 41 | "outputs": [ 42 | { 43 | "name": "stdout", 44 | "output_type": "stream", 45 | "text": [ 46 | "Raw edge count in iRefIndex: 673100\n" 47 | ] 48 | } 49 | ], 50 | "source": [ 51 | "wd = '/cellar/users/jkhuang/Data/Projects/Network_Analysis/Data/'\n", 52 | "iRefIndex_Raw = pd.read_csv(wd+'Network_Data_Raw/iRefIndex/9606.mitab.04072015.txt',sep='\\t')\n", 53 | "print 'Raw edge count in iRefIndex:', len(iRefIndex_Raw)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 3, 59 | "metadata": { 60 | "collapsed": false 61 | }, 62 | "outputs": [ 63 | { 64 | "name": "stdout", 65 | "output_type": "stream", 66 | "text": [ 67 | "Human-Human only interactions in iRefIndex: 485030\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "# Keep only human-human interactions\n", 73 | "iRef_Human_only = iRefIndex_Raw[(iRefIndex_Raw['taxa']=='taxid:9606(Homo sapiens)') & (iRefIndex_Raw['taxb']=='taxid:9606(Homo sapiens)')]\n", 74 | "print 'Human-Human only interactions in iRefIndex:', len(iRef_Human_only)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 5, 80 | "metadata": { 81 | "collapsed": true 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "# Extract gene list\n", 86 | "Human_iRef_Genes = list(set(iRef_Human_only['#uidA']).union(set(iRef_Human_only['uidB'])))" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 6, 92 | "metadata": { 93 | "collapsed": false 94 | }, 95 | "outputs": [ 96 | { 97 | "name": "stdout", 98 | "output_type": "stream", 99 | "text": [ 100 | "['uniprotkb', 'refseq', 'rogid']\n" 101 | ] 102 | } 103 | ], 104 | "source": [ 105 | "# Get all iRef prefixes\n", 106 | "prefixes=[]\n", 107 | "for gene in Human_iRef_Genes:\n", 108 | " prefix=gene.split(':')[0]\n", 109 | " if prefix not in prefixes:\n", 110 | " prefixes.append(prefix)\n", 111 | "print prefixes" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 7, 117 | "metadata": { 118 | "collapsed": false 119 | }, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "485030 Human iRefIndex Edges\n" 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "# Get edge list of network\n", 131 | "query_edgelist = iRef_Human_only[['#uidA','uidB']].values.tolist()\n", 132 | "print len(query_edgelist), \"Human iRefIndex Edges\"" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "## Convert Genes" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 9, 145 | "metadata": { 146 | "collapsed": false 147 | }, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "23906 Valid Query Genes\n", 154 | "945 Invalid Query Genes\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "# Construct list of genes to be submitted to MyGene.Info API (remove all genes with 'rogid' prefix)\n", 160 | "# This should only keep uniprotkb and refseq as queries\n", 161 | "query_string, valid_genes, invalid_genes = gct.query_constructor(Human_iRef_Genes, exclude_prefixes=['rogid'])" 162 | ] 163 | 
}, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 10, 167 | "metadata": { 168 | "collapsed": false 169 | }, 170 | "outputs": [ 171 | { 172 | "name": "stdout", 173 | "output_type": "stream", 174 | "text": [ 175 | "6305 / 485030 edges with invalid nodes removed\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "# filter edgelist because len(invalid_genes) > 0\n", 181 | "query_edgelist_filt = gct.filter_query_edgelist(query_edgelist, invalid_genes)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 11, 187 | "metadata": { 188 | "collapsed": false 189 | }, 190 | "outputs": [ 191 | { 192 | "name": "stdout", 193 | "output_type": "stream", 194 | "text": [ 195 | "Batch query complete: 48.3 seconds\n", 196 | "24127 Matched query results\n" 197 | ] 198 | } 199 | ], 200 | "source": [ 201 | "# Set scopes (gene naming systems to search)\n", 202 | "scopes = \"uniprot, refseq\"\n", 203 | "\n", 204 | "# Set fields (systems from which to return gene names from)\n", 205 | "fields = \"symbol, entrezgene\"\n", 206 | "\n", 207 | "# Query MyGene.Info\n", 208 | "match_list = gct.query_batch(query_string, scopes=scopes, fields=fields)\n", 209 | "print len(match_list), 'Matched query results'" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 12, 215 | "metadata": { 216 | "collapsed": false, 217 | "scrolled": true 218 | }, 219 | "outputs": [ 220 | { 221 | "name": "stdout", 222 | "output_type": "stream", 223 | "text": [ 224 | "Queries without full matching results found: 6147\n", 225 | "\n", 226 | "162 Queries with mutliple matches found\n", 227 | "\n", 228 | "Query mapping table/dictionary construction complete: 149.88 seconds\n" 229 | ] 230 | } 231 | ], 232 | "source": [ 233 | "match_table_trim, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "## Construct Converted Network" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 13, 246 | "metadata": { 247 | "collapsed": false 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "# Format edge list by removing prefix indicators from all interactors\n", 252 | "query_edgelist_filt_fmt = [[gct.get_identifier_without_prefix(edge[0]),gct.get_identifier_without_prefix(edge[1])] for edge in query_edgelist_filt]" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 15, 258 | "metadata": { 259 | "collapsed": true 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "# Convert network edge list to symbol\n", 264 | "iRefIndex_edgelist_symbol = gct.convert_edgelist(query_edgelist_filt_fmt, query_to_symbol)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 16, 270 | "metadata": { 271 | "collapsed": false 272 | }, 273 | "outputs": [ 274 | { 275 | "name": "stdout", 276 | "output_type": "stream", 277 | "text": [ 278 | "478725 input edges\n", 279 | "34326 self-edges removed\n", 280 | "132730 edges with un-mapped genes removed\n", 281 | "178121 duplicate edges removed\n", 282 | "Edge list filtered: 0.78 seconds\n", 283 | "133548 Edges remaining\n" 284 | ] 285 | } 286 | ], 287 | "source": [ 288 | "# Filter converted edge list\n", 289 | "iRefIndex_edgelist_symbol_filt = gct.filter_converted_edgelist(iRefIndex_edgelist_symbol)" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 17, 295 | "metadata": { 296 | "collapsed": false 297 | }, 298 | "outputs": [ 299 | { 300 | 
"name": "stdout", 301 | "output_type": "stream", 302 | "text": [ 303 | "Edge list saved: 0.22 seconds\n" 304 | ] 305 | } 306 | ], 307 | "source": [ 308 | "# Save filtered, converted edge list to file\n", 309 | "gct.write_edgelist(iRefIndex_edgelist_symbol_filt, wd+'Network_SIFs_Symbol/iRefIndex_Symbol.sif')" 310 | ] 311 | } 312 | ], 313 | "metadata": { 314 | "kernelspec": { 315 | "display_name": "Python 2", 316 | "language": "python", 317 | "name": "python2" 318 | }, 319 | "language_info": { 320 | "codemirror_mode": { 321 | "name": "ipython", 322 | "version": 2 323 | }, 324 | "file_extension": ".py", 325 | "mimetype": "text/x-python", 326 | "name": "python", 327 | "nbconvert_exporter": "python", 328 | "pygments_lexer": "ipython2", 329 | "version": "2.7.11" 330 | } 331 | }, 332 | "nbformat": 4, 333 | "nbformat_minor": 0 334 | } 335 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Network Evaluation Tools 2 | 3 | Network Evaluation Tools is a Python 2.7 package with corresponding examples for evaluating a network's ability to group a given node set in network proximity. This package was developed as a part of the work done in [Huang and Carlin et al. 2018](http://www.cell.com/cell-systems/fulltext/S2405-4712(18)30095-4). 4 | 5 | ## Modules in this package 6 | - _data_import_tools_ - This module contains functions for helping import network files and gene set files for analysis. 7 | - _gene_conversion_tools_ - This module contains functions for helping convert, filter, and save networks from their raw database form. Used in the Network Processing Jupyter Notebooks. 8 | - _miscellaneous_functions_ - This module contains various functions developed to help with analysis along the way. These functions are not well tested and may contain bugs. These functions were generally used to determine other network performance metrics on network recovery of gene sets. 9 | - _network_evaluation_functions_ - This module contains many of the core functions of the set-based network evaluation algorithm. 10 | - _network_propagation_ - This module contains functions to help with network propagation steps used in the set-based network evaluation algorithm. 11 | 12 | ## Version and Dendencies 13 | Currently, the network_evaluation_tools package requires Python 2.7 - Python 2.7.13. Note that some functions in this package may not work with Python 3.0+. 14 | network_evaluation_tools requires: 15 | - Argparse >= 1.1 16 | - NetworkX >= 2.1 17 | - Numpy >= 1.11.0 18 | - Matplotlib >= 1.5.1 19 | - Pandas >= 0.19.0 20 | - Requests >= 2.13.0 21 | - Scipy >= 0.17.0 22 | - Scikit-learn >= 0.17.1 23 | 24 | Note: 25 | - In Pandas v0.20.0+, the ```.ix```indexer has been deprecated. There may be warning regarding this issue, yet the function still works. 26 | 27 | ## Installation 28 | 1. Clone the repository 29 | 2. cd to new respository 30 | 3. Execute following command: 31 | ```python setup.py install``` 32 | 33 | ## Network analysis 34 | 1. If the network needs to be normalized to a particular naming scheme:
35 | A Jupyter Notebook describing how each network used in the original [paper](http://www.cell.com/cell-systems/fulltext/S2405-4712(18)30095-4) was processed from its raw download file can be found in the ```Network Processing Notebooks``` folder. A minimal sketch of this conversion workflow is also shown after this list.<br>
36 | 2. There are two ways to perform the network evaluation on a gene set:
37 | The following network analyses can be performed either from a Jupyter Notebook or from the command line (see the ```Network Evaluation Examples``` folder). Each Jupyter notebook is documented within the notebook itself, and the documentation for the Python scripts can be seen using the command ```python [script_name].py -h```. A minimal sketch of the propagation step underlying the evaluation is also shown after this list.<br>
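
For orientation, the fragment below sketches the conversion workflow that the processing notebooks (e.g. the iRefIndex notebook) follow: build a MyGene.info batch query, map the identifiers, then convert, filter, and save the edge list. It assumes the package is installed; the two-edge UniProt edge list and the output filename ```Example_Symbol.sif``` are made-up placeholders (not files shipped with this repository), and the mapping results depend on what the live MyGene.info service returns.

```python
from network_evaluation_tools import gene_conversion_tools as gct

# Hypothetical raw edge list with naming-system prefixes (placeholder input for illustration only)
query_edgelist = [['uniprotkb:P04637', 'uniprotkb:P38398'],
                  ['uniprotkb:P04637', 'uniprotkb:Q09472']]
query_genes = list(set([node for edge in query_edgelist for node in edge]))

# Build and post the MyGene.info batch query, then construct the query-to-symbol map
query_string, valid_genes, invalid_genes = gct.query_constructor(query_genes)
match_list = gct.query_batch(query_string, scopes="uniprot", fields="symbol, entrezgene")
match_table, query_to_symbol, query_to_entrez = gct.construct_query_map_table(match_list, valid_genes)

# Strip the naming-system prefixes, convert the edge list to gene symbols, filter it, and save a .sif file
edgelist_fmt = [[gct.get_identifier_without_prefix(a), gct.get_identifier_without_prefix(b)] for a, b in query_edgelist]
edgelist_symbol = gct.convert_edgelist(edgelist_fmt, query_to_symbol)
edgelist_symbol_filt = gct.filter_converted_edgelist(edgelist_symbol)
gct.write_edgelist(edgelist_symbol_filt, 'Example_Symbol.sif')
```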
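
The full AUPRC evaluation is what ```run_network_evaluation.py``` and the example notebook perform; the fragment below is only a minimal sketch of the load-and-propagate step underneath it, written for the Python 2.7 environment this package targets. It assumes a converted network file (the placeholder ```Example_Symbol.sif``` from the conversion sketch; in practice, one of the processed networks) and the DisGeNET gene sets in the ```Data``` folder, with paths relative to the repository root.

```python
from network_evaluation_tools import data_import_tools as dit
from network_evaluation_tools import network_propagation as prop
import pandas as pd

# Load a converted network and the gene set collection
network = dit.load_network_file('Example_Symbol.sif', verbose=True)
node_sets = dit.load_node_sets('Data/DisGeNET_genesets.txt')

# Build a binary (gene set x network node) indicator matrix for one gene set
set_name = node_sets.keys()[0]  # Python 2 dict access; pick any gene set name here
set_genes = [gene for gene in node_sets[set_name] if gene in network.nodes()]
binary_matrix = pd.DataFrame(0, index=[set_name], columns=list(network.nodes()))
binary_matrix.loc[set_name, set_genes] = 1

# Propagate the gene set over the network with the size-tuned propagation coefficient
alpha = prop.calculate_alpha(network)
prop_scores = prop.closed_form_network_propagation(network, binary_matrix, alpha, verbose=True)
```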
38 | 39 | ## Data provided in this repository (see ```Data``` Folder) 40 | - _Database Citations_ - An Excel file containing details about all of the networks used in the original paper's analysis and affiliated citations for all of the databases used. 41 | - _DisGeNET / Oncogenic Component Gene Sets_ - Two tab-separated files, each line containing a gene set from either DisGeNET or the Oncogenic Component collection. The first column of each file is the name of the gene set, followed by the list of genes associated with that gene set on the same line. 42 | - _Network performance (AUPRCs) on DisGeNET / Oncogenic Component Gene Sets_ - Two csv files containing the raw Z-normalized AUPRC scores (network performance scores) of each network analyzed on each gene set analyzed from DisGeNET or the Oncogenic Component gene set collection. 43 | - _Network performance effect sizes on DisGeNET / Oncogenic Component Gene Sets_ - Two csv files containing the relative performance gain of each network's AUPRC score over the median null AUPRC score for each gene set analyzed from DisGeNET or the Oncogenic Component gene set collection. 44 | 45 | ## Issues 46 | Please feel free to post issues/bug reports. Questions can be sent to jkh013@ucsd.edu. 47 | 48 | ## License 49 | See the [LICENSE](https://github.com/huangger/Network_Evaluation_Tools/blob/master/LICENSE.txt) file for license rights and limitations (MIT). 50 | 51 | 52 | -------------------------------------------------------------------------------- /network_evaluation_tools/.ipynb_checkpoints/PSN Construction-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 1 6 | } 7 | -------------------------------------------------------------------------------- /network_evaluation_tools/.ipynb_checkpoints/SBNE Method-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 1 6 | } 7 | -------------------------------------------------------------------------------- /network_evaluation_tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/idekerlab/Network_Evaluation_Tools/4c0017e3cc3fa7767f5172cea76b4f3f7d8d0b0b/network_evaluation_tools/__init__.py -------------------------------------------------------------------------------- /network_evaluation_tools/data_import_tools.py: -------------------------------------------------------------------------------- 1 | ############################################### 2 | # ---------- Data Import Functions ---------- # 3 | ############################################### 4 | 5 | import pandas as pd 6 | import networkx as nx 7 | import time 8 | import os 9 | 10 | # Filter an extended sif file, where all edges are weighted, by a specific score quantile 11 | # Return the filtered network edge list and save it to a file if desired (for import by load_network_file) 12 | def filter_weighted_network_sif(network_file_path, nodeA_col=0, nodeB_col=1, score_col=2, q=0.9, delimiter='\t', verbose=False, save_path=None): 13 | data = pd.read_csv(network_file_path, sep=delimiter, header=None, low_memory=False) 14 | # Filter edges by score quantile 15 | q_score = data[score_col].quantile(q) 16 | if verbose: 17 | print str(round(q*100,2))+'%', 'score:', q_score 18 | data_filt = 
data[data[score_col]>q_score][data.columns[[nodeA_col, nodeB_col, score_col]]] 19 | data_filt.columns = ['nodeA', 'nodeB', 'edgeScore'] 20 | if verbose: 21 | print data_filt.shape[0], '/', data.shape[0], 'edges retained' 22 | if save_path is not None: 23 | data_filt.to_csv(save_path, sep='\t', header=False, index=False) 24 | return data_filt 25 | 26 | # Load network from file as unweighted network 27 | # Can set delimiter, but default delimiter is tab 28 | # Will only read edges from the first two columns; all other columns will be ignored 29 | def load_network_file(network_file_path, delimiter='\t', verbose=False): 30 | network = nx.read_edgelist(network_file_path, delimiter=delimiter, data=False) 31 | if verbose: 32 | print 'Network File Loaded:', network_file_path 33 | return network 34 | 35 | # Get full paths to all networks in directory with a given file name structure: 36 | # e.g. If filename = 'BIND_Symbol.sif', then network_name='BIND', suffix='_Symbol', ext='.sif' 37 | def get_networks(wd, suffix=None, file_ext='.sif'): 38 | network_files = {} 39 | for fn in os.listdir(wd): 40 | if suffix==None: 41 | if fn.endswith(file_ext): 42 | network_files[fn.split(file_ext)[0]]=wd+fn 43 | else: 44 | if fn.endswith(file_ext) and fn.split(file_ext)[0].endswith(suffix): 45 | network_files[fn.split(suffix)[0]]=wd+fn 46 | return network_files 47 | 48 | # Companion function with get_networks(), loads all of the network files found in a directory 49 | # Uses the load_network_file() function to load each network, also only imports first two columns, no edge data 50 | # Constructs a dictionary of useful network items for each network in the directory: 51 | # - Actual networkx object representation of network 52 | # - List of nodes by name for each network 53 | # - List of edges by node name for each network 54 | def load_networks(network_file_map, delimiter='\t', verbose=False): 55 | # Initialize dictionaries 56 | networks, network_edges, network_nodes = {}, {}, {} 57 | # Loading network and network properties 58 | for network_name in network_file_map: 59 | loadtime = time.time() 60 | # Load network 61 | network = load_network_file(network_file_map[network_name], verbose=verbose) 62 | networks[network_name]=network 63 | # Construct network node list 64 | network_nodes[network_name] = network.nodes() 65 | # Construct network edge list 66 | network_edges[network_name] = network.edges() 67 | if verbose: 68 | print 'All given network files loaded' 69 | # Return data structure 70 | return networks, network_edges, network_nodes 71 | 72 | # Convert and save MAF from Broad Firehose 73 | # Can produce 2 types of filetypes: 'matrix' or 'list', matrix is a full samples-by-genes binary csv, 'list' is a sparse representation of 'matrix' 74 | # This is a conversion tool, so the result must be saved (most tools will require a path to a processed MAF file and load it separately) 75 | # Gene naming can be 'Symbol' or 'Entrez' 76 | def process_TCGA_MAF(maf_file, save_path, filetype='matrix', gene_naming='Symbol', verbose=False): 77 | loadtime = time.time() 78 | # Load MAF File 79 | TCGA_MAF = pd.read_csv(maf_file,sep='\t',low_memory=False) 80 | # Get all patient somatic mutation (sm) pairs from MAF file 81 | if gene_naming=='Entrez': 82 | TCGA_sm = TCGA_MAF.groupby(['Tumor_Sample_Barcode', 'Entrez_Gene_Id']).size() 83 | else: 84 | TCGA_sm = TCGA_MAF.groupby(['Tumor_Sample_Barcode', 'Hugo_Symbol']).size() 85 | # Turn somatic mutation data into binary matrix 86 | TCGA_sm_mat = TCGA_sm.unstack().fillna(0) 87 | TCGA_sm_mat = 
(TCGA_sm_mat>0).astype(int) 88 | # Trim TCGA barcodes 89 | TCGA_sm_mat.index = [pat[:12] for pat in TCGA_sm_mat.index] 90 | # Filter samples with duplicate IDs 91 | non_dup_IDs = list(TCGA_sm_mat.index.value_counts().index[TCGA_sm_mat.index.value_counts()==1]) 92 | dup_IDs = list(TCGA_sm_mat.index.value_counts().index[TCGA_sm_mat.index.value_counts()>1]) 93 | # Save file as binary matrix or sparse list 94 | if filetype=='list': 95 | # Now try to construct two-column/sparse representation of binary sm data 96 | # Get list of all patient somatic mutations 97 | index_list = list(TCGA_sm.index) 98 | # Filter list of patient somatic mutations of duplicate patient barcodes 99 | index_list_filt = [i for i in index_list if not any([True if barcode in i[0] else False for barcode in dup_IDs])] 100 | # Save patient somatic mutations list to file 101 | f = open(save_path, 'w') 102 | for sm in index_list_filt: 103 | f.write(sm[0][:12]+'\t'+sm[1]+'\n') 104 | f.close() 105 | if verbose: 106 | print 'Binary somatic mutations list saved' 107 | else: 108 | # Save non-duplicate patients' binary TCGA somatic mutation matrix to csv 109 | TCGA_sm_mat_filt = TCGA_sm_mat.ix[non_dup_IDs] 110 | # Remove all genes that have no more mutations after patient filtering 111 | nonempty_cols = [col for col in TCGA_sm_mat_filt.columns if not all(TCGA_sm_mat_filt[col]==0)] 112 | TCGA_sm_mat_filt2 = TCGA_sm_mat_filt[nonempty_cols] 113 | # Remove columns with bad names like '0' 114 | named_cols = [col for col in TCGA_sm_mat_filt2.columns if col!='0'] 115 | TCGA_sm_mat_filt3 = TCGA_sm_mat_filt2[named_cols] 116 | TCGA_sm_mat_filt3.to_csv(save_path) 117 | if verbose: 118 | print 'Binary somatic mutation matrix saved' 119 | if verbose: 120 | print 'MAF file processed:', maf_file, round(time.time()-loadtime, 2), 'seconds.' 
121 | return 122 | 123 | # Load binary mutation data with 2 file types (filetype= 'matrix' or 'list') 124 | # filetype=='matrix' is a csv or tsv style matrix with row and column headers, rows are samples/patients, columns are genes 125 | # filetype=='list' is a 2-column text file separated by the delimiter, where the 1st column is the sample/patient and the 2nd column is one gene mutated in that patient 126 | # Line example in 'list' file: 'Patient ID','Gene Mutated' 127 | def load_binary_mutation_data(filename, filetype='matrix', delimiter=',', verbose=False): 128 | if filetype=='list': 129 | f = open(filename) 130 | binary_mat_lines = f.read().splitlines() 131 | binary_mat_data = [(line.split('\t')[0], line.split('\t')[1]) for line in binary_mat_lines] 132 | binary_mat_index = pd.MultiIndex.from_tuples(binary_mat_data, names=['Tumor_Sample_Barcode', 'Hugo_Symbol']) 133 | binary_mat_2col = pd.DataFrame(1, index=binary_mat_index, columns=[0])[0] 134 | binary_mat = binary_mat_2col.unstack().fillna(0) 135 | else: 136 | binary_mat = pd.read_csv(filename, delimiter=delimiter, index_col=0).astype(int) 137 | if verbose: 138 | print 'Binary Mutation Matrix Loaded:', filename 139 | return binary_mat 140 | 141 | # Concatenate multiple mutation matrices together 142 | # All file type structures and delimiters must be the same (see load_binary_mutation_data()) across all files 143 | def concat_binary_mutation_matrices(filename_list, filetype='matrix', delimiter=',', verbose=False, save_path=None): 144 | binary_mat_list = [load_binary_mutation_data(fn, filetype=filetype, delimiter=delimiter, verbose=verbose) for fn in filename_list] 145 | binary_mat_concat = pd.concat(binary_mat_list).fillna(0) 146 | if verbose: 147 | print 'All binary mutation matrices loaded and concatenated' 148 | if save_path==None: 149 | return binary_mat_concat 150 | else: 151 | binary_mat_concat.to_csv(save_path) 152 | return binary_mat_concat 153 | 154 | # Construct dictionary of node sets from input text file to perform AUPRC analysis on for network of interest 155 | # File format: Each line is a delimited list where the first item in the list is the name of the node set 156 | # All other nodes in the list follow the node set name 157 | def load_node_sets(node_set_file, delimiter='\t', verbose=False): 158 | f = open(node_set_file) 159 | node_set_lines = f.read().splitlines() 160 | node_set_lines_split = [line.split(delimiter) for line in node_set_lines] 161 | f.close() 162 | node_sets = {node_set[0]:set(node_set[1:]) for node_set in node_set_lines_split} 163 | if verbose: 164 | print 'Node cohorts loaded:', node_set_file 165 | return node_sets -------------------------------------------------------------------------------- /network_evaluation_tools/gene_conversion_tools.py: -------------------------------------------------------------------------------- 1 | ################################################################ 2 | # ---------- Network Gene Name Conversion Functions ---------- # 3 | ################################################################ 4 | import requests 5 | import re 6 | import time 7 | import pandas as pd 8 | 9 | # Determine whether an id should be excluded because it is not a valid gene name (contains parentheses, quotations, or whitespace) 10 | def exclude_id(name, bad_prefixes=None): 11 | excluded_id_regex = re.compile('[(),\'\"\s\/\|\.<>]+') 12 | # Remove genes that may also have prefixes that we do not want (e.g. 
CHEBI) 13 | if bad_prefixes: 14 | for prefix in bad_prefixes: 15 | if name.startswith(prefix): 16 | return True 17 | return excluded_id_regex.search(name) 18 | 19 | # Remove the naming system prefix, if there is one 20 | def get_identifier_without_prefix(string): 21 | elements = string.split(':') 22 | length = len(elements) 23 | if length == 2: 24 | return str(elements[1]) 25 | elif length > 2: 26 | return None 27 | else: 28 | return string 29 | 30 | # Construct string for batch query to MyGene.Info v3.0.0 API 31 | def query_constructor(gene_list, exclude_prefixes=None, print_invalid_genes=False): 32 | # Find genes that are valid and return only gene identifiers 33 | valid_query_genes = [get_identifier_without_prefix(gene) for gene in gene_list if exclude_id(gene, exclude_prefixes)==None] 34 | # Find all genes that have invalid names 35 | invalid_query_genes = [gene for gene in gene_list if exclude_id(gene, exclude_prefixes)!=None] 36 | print len(valid_query_genes), "Valid Query Genes" 37 | if print_invalid_genes: 38 | print len(invalid_query_genes), "Invalid Query Genes:" 39 | print invalid_query_genes 40 | else: 41 | print len(invalid_query_genes), "Invalid Query Genes" 42 | query_string = ' '.join(valid_query_genes) # Build string of names to input into MyGene.Info 43 | return query_string, valid_query_genes, invalid_query_genes 44 | 45 | # Function for posting batch query to MyGene.info v3.0.0 API 46 | def query_batch(query_string, tax_id='9606', scopes="symbol, entrezgene, alias, uniprot", fields="symbol, entrezgene"): 47 | query_split = query_string.split(' ') 48 | query_n = len(query_split) 49 | query_time = time.time() 50 | if query_n <=1000: 51 | data = {'species': tax_id, # Human Only 52 | 'scopes': scopes, # Default symbol, entrez, alias, uniprot. Alias often returns more genes than needed, return only highest scoring genes 53 | 'fields': fields, # Which gene name spaces to convert to 54 | 'q': query_string} 55 | res = requests.post('http://mygene.info/v3/query', data) 56 | json = res.json() 57 | else: 58 | # If the query is too long, we will need to break it up into chunks of 1000 query genes (MyGene.info cap) 59 | if query_n % 1000 == 0: 60 | chunks = query_n / 1000 61 | else: 62 | chunks = (query_n / 1000) + 1 63 | query_chunks = [] 64 | for i in range(chunks): 65 | start_i, end_i = i*1000, (i+1)*1000 66 | query_chunks.append(' '.join(query_split[start_i:end_i])) 67 | json = [] 68 | for chunk in query_chunks: 69 | data = {'species': tax_id, # Human Only 70 | 'scopes': scopes, # Default symbol, entrez, alias, uniprot. 
Alias often returns more genes than needed, return only highest scoring genes 71 | 'fields': fields, # Which gene name spaces to convert to 72 | 'q': chunk} 73 | res = requests.post('http://mygene.info/v3/query', data) 74 | json = json+res.json() 75 | print len(json), 'Matched query results' 76 | print 'Batch query complete:', round(time.time()-query_time,2), 'seconds' 77 | return json 78 | 79 | # Construct matched queries maps 80 | def construct_query_map_table(query_result, query_genes, display_unmatched_queries=False): 81 | construction_time = time.time() 82 | # Construct DataFrame of matched queries (only keep the results for each query where both symbol and entrez id were mapped) 83 | matched_data, matched_genes=[], [] 84 | for match in query_result: 85 | if match.get('entrezgene') and match.get('symbol'): 86 | matched_data.append([match.get('query'), match.get('_score'), match.get('symbol'), str(match.get('entrezgene'))]) 87 | matched_genes.append(match.get('query')) 88 | # Add all other partial mappings or non-mappings to the list 89 | partial_match_genes = [gene for gene in query_genes if gene not in matched_genes] 90 | partial_match_results = [] 91 | for match in query_result: 92 | if match.get('query') in partial_match_genes: 93 | partial_match_results.append(match) 94 | if match.get('entrezgene'): # If there is an entrez gene, we want it in string form; otherwise we want None 95 | matched_data.append([match.get('query'), match.get('_score'), match.get('symbol'), str(match.get('entrezgene'))]) 96 | else: 97 | matched_data.append([match.get('query'), match.get('_score'), match.get('symbol'), match.get('entrezgene')]) 98 | print 'Queries without full matching results found:', len(partial_match_results) 99 | if display_unmatched_queries: 100 | for entry in partial_match_results: 101 | print entry 102 | # Convert matched data list into data frame table 103 | match_table = pd.DataFrame(data=matched_data, columns=['Query','Score','Symbol','EntrezID']) 104 | match_table = match_table.set_index('Query') 105 | # Some genes will be matched in duplicates (due to alias mapping, generally the highest scoring matches will be correct) 106 | # Therefore we remove duplicate mappings to create 1-to-1 mappings for query to genes. 
107 | duplicate_matched_genes = [] 108 | for gene in query_genes: 109 | if type(match_table.ix[gene])==pd.DataFrame: 110 | duplicate_matched_genes.append(gene) 111 | print 112 | print len(duplicate_matched_genes), "Queries with multiple matches found" 113 | # Construct mapping table of genes with only one full result 114 | single_match_genes = [gene for gene in query_genes if gene not in duplicate_matched_genes] 115 | match_table_single = match_table.ix[single_match_genes] 116 | # Keep matches of queries matched only once if there are duplicate matches for genes 117 | if len(duplicate_matched_genes) > 0: 118 | # Keep maximum scored matches of queries matched more than once 119 | max_score_matches=[] 120 | for gene in duplicate_matched_genes: 121 | matched_duplicates = match_table.ix[gene] 122 | max_score = max(matched_duplicates['Score']) 123 | max_score_matches.append(matched_duplicates[matched_duplicates['Score']==max_score]) 124 | match_table_duplicate_max = pd.concat(max_score_matches) 125 | # Construct Query maps for symbol and entrez 126 | match_table_trim = pd.concat([match_table_single, match_table_duplicate_max]) 127 | else: 128 | match_table_trim = match_table_single.copy(deep=True) 129 | # Construct query map dictionaries 130 | query_to_symbol = match_table_trim['Symbol'].to_dict() 131 | query_to_entrez = match_table_trim['EntrezID'].to_dict() 132 | print 133 | print 'Query mapping table/dictionary construction complete:', round(time.time()-construction_time,2), 'seconds' 134 | return match_table_trim, query_to_symbol, query_to_entrez 135 | 136 | # Filter edgelist to remove all genes that contain invalid query names 137 | # This function is only required if there are any invalid genes found by query_constructor() 138 | def filter_query_edgelist(query_edgelist, invalid_genes): 139 | edgelist_filt = [] 140 | count=0 141 | for edge in query_edgelist: 142 | if edge[0] in invalid_genes or edge[1] in invalid_genes: 143 | count+=1 144 | else: 145 | edgelist_filt.append(edge) 146 | print count, '/', len(query_edgelist), 'edges with invalid nodes removed' 147 | return edgelist_filt 148 | 149 | # Convert network edge lists 150 | # Third column is for weights if desired to pass weights forward 151 | def convert_edgelist(query_edgelist, gene_map, weighted=False): 152 | if weighted: 153 | return [sorted([gene_map[edge[0]],gene_map[edge[1]]])+[edge[2]] for edge in query_edgelist] 154 | else: 155 | return [sorted([gene_map[edge[0]],gene_map[edge[1]]]) for edge in query_edgelist] 156 | 157 | # Sometimes each node needs to be converted by its best match if there are multiple names per node 158 | # This function uses the match_table constructed earlier to convert genes to either symbol or entrez format only 159 | def convert_custom_namelist(names, field, match_table): 160 | # Keep only mappings defined for field of interest 161 | if field=='symbol': 162 | # Return match table values that have matched symbol 163 | conversion = match_table.ix[names][~(match_table.ix[names]['Symbol'].isnull())] 164 | if conversion.shape[0]==0: 165 | return None 166 | else: 167 | # Return conversion with max score or None if no conversion 168 | max_score = conversion['Score'].max() 169 | return conversion[conversion['Score']==max_score].ix[0]['Symbol'] 170 | elif field=='entrez': 171 | # Return match table values that have matched entrez ID 172 | conversion = match_table.ix[names][~(match_table.ix[names]['EntrezID'].isnull())] 173 | if conversion.shape[0]==0: 174 | return None 175 | else: 176 | # Return conversion with 
max score or None if no conversion 177 | max_score = conversion['Score'].max() 178 | return conversion[conversion['Score']==max_score].ix[0]['EntrezID'] 179 | 180 | # Filter converted edge lists 181 | def filter_converted_edgelist(edgelist, remove_self_edges=True, weighted=False): 182 | filter_time = time.time() 183 | print len(edgelist),'input edges' 184 | # Remove self-edges 185 | if remove_self_edges: 186 | edgelist_filt1 = [edge for edge in edgelist if edge[0]!=edge[1]] 187 | print len(edgelist)-len(edgelist_filt1), 'self-edges removed' 188 | else: 189 | edgelist_filt1 = edgelist 190 | print 'Self-edges not removed' 191 | if weighted: 192 | # Remove edges where one or both nodes are "None" 193 | edgelist_filt2 = pd.DataFrame(data=edgelist_filt1).dropna().values.tolist() 194 | print len(edgelist_filt1)-len(edgelist_filt2), 'edges with un-mapped genes removed' 195 | # Remove duplicates by keeping the max score 196 | edgelist_filt3_scoremap = {} 197 | for edge in edgelist_filt2: 198 | if edge[0]+'+'+edge[1] not in edgelist_filt3_scoremap: 199 | edgelist_filt3_scoremap[edge[0]+'+'+edge[1]] = edge[2] 200 | else: 201 | edgelist_filt3_scoremap[edge[0]+'+'+edge[1]] = max(edgelist_filt3_scoremap[edge[0]+'+'+edge[1]], edge[2]) 202 | # Convert dictionary of scores to list 203 | edgelist_filt3 = [] 204 | for edge in edgelist_filt3_scoremap: 205 | edgelist_filt3.append(edge.split('+')+[edgelist_filt3_scoremap[edge]]) 206 | print len(edgelist_filt2)-len(edgelist_filt3), 'duplicate edges removed' 207 | else: 208 | # Remove edges where one or both nodes are "None" 209 | edgelist_filt2 = pd.DataFrame(data=edgelist_filt1).dropna() 210 | print len(edgelist_filt1)-edgelist_filt2.shape[0], 'edges with un-mapped genes removed' 211 | # Remove duplicate edges 212 | edgelist_filt3 = edgelist_filt2.drop_duplicates().values.tolist() 213 | print edgelist_filt2.shape[0]-len(edgelist_filt3), 'duplicate edges removed' 214 | print 'Edge list filtered:',round(time.time()-filter_time,2),'seconds' 215 | print len(edgelist_filt3), 'Edges remaining' 216 | return edgelist_filt3 217 | 218 | # Write edgelist to file 219 | def write_edgelist(edgelist, output_file, delimiter='\t', binary=True): 220 | write_time=time.time() 221 | f = open(output_file,'w') 222 | for edge in edgelist: 223 | if binary: 224 | f.write(delimiter.join([edge[0], edge[1]])+'\n') 225 | else: 226 | f.write(delimiter.join([str(val) for val in edge])+'\n') 227 | f.close() 228 | print 'Edge list saved:', round(time.time()-write_time,2),'seconds' 229 | -------------------------------------------------------------------------------- /network_evaluation_tools/miscellaneous_functions.py: -------------------------------------------------------------------------------- 1 | import time 2 | import pandas as pd 3 | import numpy as np 4 | import data_import_tools as dit 5 | import network_propagation as prop 6 | import network_evaluation_functions as nef 7 | from multiprocessing import Pool 8 | import pickle as p 9 | import random 10 | ################################################################################ 11 | # ---------- Additional Node Set-Based Network Evaluation Functions ---------- # 12 | ################################################################################ 13 | 14 | # Calculate confusion matrix (true positives, false negatives, false positives, true negatives) of node set recovery for given node set 15 | # The confusion matrix for every position on every AUPRC curve is returned/stored 16 | def calculate_confusion_matrix_serial(prop_geno, p, n, 
node_set_name, node_set, verbose=False): 17 | runtime = time.time() 18 | intersect = [nodes for nodes in node_set if nodes in prop_geno.index] 19 | confusion_matrices = {} 20 | sample_size = int(round(p*len(intersect))) 21 | for i in range(n): # Number of times to run the sampling 22 | sample = random.sample(intersect, sample_size) # get node set sample 23 | intersect_non_sample = [node for node in intersect if node not in sample] # nodes in intersect not in sample 24 | prop_geno_non_sample = list(prop_geno.index[~prop_geno.index.isin(sample)]) # nodes in network not in sample 25 | prop_geno_sample_sum = prop_geno.ix[sample][prop_geno_non_sample].sum().sort_values(ascending=False) # summed prop value for all nodes 26 | y_actual = pd.Series(0, index=prop_geno_sample_sum.index, dtype=int) # nodes sorted by mean prop value 27 | y_actual.ix[intersect_non_sample]+=1 # which nodes in sorted list are in intersect_non_sample 28 | intersect_non_sample_sorted = y_actual[y_actual==1].index # intersect_non_sample sorted 29 | confusion_matrix = {'TP':[], 'FN':[], 'FP':[], 'TN':[]} # initialize true positive, false negative, false positive, true negative lists 30 | for node in intersect_non_sample_sorted: # Slide down sorted nodes by summed prop value by nodes that are in intersect_non_sample 31 | TP, FN = sum(y_actual.ix[:node]), sum(y_actual.ix[node:])-1 # Calculate true positives and false negatives found at this point in list 32 | FP, TN = len(y_actual.ix[:node])-TP, len(y_actual.ix[node:])-1-FN # Calculate false positives and true negatives found at this point in list 33 | confusion_matrix['TP'].append(TP) 34 | confusion_matrix['FN'].append(FN) 35 | confusion_matrix['FP'].append(FP) 36 | confusion_matrix['TN'].append(TN) 37 | confusion_matrices[i]=confusion_matrix 38 | if verbose: 39 | print 'Confusion matrices calculated for node set', node_set_name, 'complete.', repr(len(intersect))+' nodes in network,', round(time.time()-runtime, 2), 'seconds.' 
40 | return confusion_matrices 41 | 42 | # Calculate confusion matrix (true positives, false negatives, false positives, true negatives) of node set recovery for given node set 43 | # This parallel version mirrors the serial implementation; the only differences are that the node set name is passed in with the other parameters and that prop_geno is set as a global variable 44 | # The confusion matrix for every position on every AUPRC curve is returned/stored 45 | def calculate_confusion_matrix_parallel(node_set_params): 46 | node_set_name, node_set, p, n, verbose = node_set_params[0], node_set_params[1], node_set_params[2], node_set_params[3], node_set_params[4] 47 | runtime = time.time() 48 | intersect = [nodes for nodes in node_set if nodes in prop_geno.index] 49 | confusion_matrices = {} 50 | sample_size = int(round(p*len(intersect))) 51 | for i in range(n): # Number of times to run the sampling 52 | sample = random.sample(intersect, sample_size) # get node set sample 53 | intersect_non_sample = [node for node in intersect if node not in sample] # nodes in intersect not in sample 54 | prop_geno_non_sample = list(prop_geno.index[~prop_geno.index.isin(sample)]) # nodes in network not in sample 55 | prop_geno_sample_sum = prop_geno.ix[sample][prop_geno_non_sample].sum().sort_values(ascending=False) # summed prop value for all nodes 56 | y_actual = pd.Series(0, index=prop_geno_sample_sum.index, dtype=int) # nodes sorted by mean prop value 57 | y_actual.ix[intersect_non_sample]+=1 # which nodes in sorted list are in intersect_non_sample 58 | intersect_non_sample_sorted = y_actual[y_actual==1].index # intersect_non_sample sorted 59 | confusion_matrix = {'TP':[], 'FN':[], 'FP':[], 'TN':[]} # initialize true positive, false negative, false positive, true negative lists 60 | for node in intersect_non_sample_sorted: # Slide down sorted nodes by summed prop value by nodes that are in intersect_non_sample 61 | TP, FN = sum(y_actual.ix[:node]), sum(y_actual.ix[node:])-1 # Calculate true positives and false negatives found at this point in list 62 | FP, TN = len(y_actual.ix[:node])-TP, len(y_actual.ix[node:])-1-FN # Calculate false positives and true negatives found at this point in list 63 | confusion_matrix['TP'].append(TP) 64 | confusion_matrix['FN'].append(FN) 65 | confusion_matrix['FP'].append(FP) 66 | confusion_matrix['TN'].append(TN) 67 | confusion_matrices[i]=confusion_matrix 68 | if verbose: 69 | print 'Confusion matrices calculated for node set', node_set_name, 'complete.', repr(len(intersect))+' nodes in network,', round(time.time()-runtime, 2), 'seconds.' 
70 | return [node_set_name, confusion_matrices] 71 | 72 | # Wrapper for calculating the confusion matrices for input node set file and network (has parallel option) 73 | # Not run for null network shuffles 74 | def confusion_matrix_construction_wrapper(network_file, node_set_file, sample_p, sub_sample_iterations, 75 | alpha=None, m=-0.17190024, b=0.7674828, net_delim='\t', set_delim='\t', cores=1, verbose=False, save_path=None): 76 | starttime = time.time() 77 | # Load network 78 | network = dit.load_network_file(network_file, delimiter=net_delim, verbose=verbose) 79 | # Load node set 80 | node_sets = dit.load_node_sets(node_set_file, delimiter=set_delim, verbose=verbose) 81 | # Calculate network influence matrix 82 | prop_net = nef.construct_prop_kernel(network, alpha=alpha, m=m, b=b) 83 | # Calculate confusion matrix values for each node set 84 | if cores == 1: 85 | # Calculate confusion matrix values for node sets one at a time 86 | node_set_conf_mat = {node_set:nef.calculate_confusion_matrix_serial(prop_net, sample_p, sub_sample_iterations, node_set, node_sets[node_set], verbose=verbose) for node_set in node_sets} 87 | else: 88 | # Initialize multiple threads for confusion matrix analysis of multiple node sets 89 | initializer_args = [prop_net] 90 | pool = Pool(cores, nef.parallel_analysis_initializer, initializer_args) 91 | # Construct parameter list to be passed 92 | conf_mat_Analysis_params = [[node_set, node_sets[node_set], sample_p, sub_sample_iterations, verbose] for node_set in node_sets] 93 | # Run the confusion matrix analysis for each geneset 94 | conf_mat_results = pool.map(nef.calculate_confusion_matrix_parallel, conf_mat_Analysis_params) 95 | # Construct confusion matrix results dictionary 96 | node_set_conf_mat = {result[0]:result[1] for result in conf_mat_results} 97 | if save_path is None: 98 | if verbose: 99 | print 'Network confusion matrix values calculated:', round(time.time()-starttime, 2), 'seconds' 100 | return node_set_conf_mat 101 | else: 102 | p.dump(node_set_conf_mat, open(save_path, 'wb')) 103 | if verbose: 104 | print 'Network confusion matrix values calculated:', round(time.time()-starttime, 2), 'seconds' 105 | return node_set_conf_mat 106 | 107 | # Use confusion matrix results to calculate odds ratio, risk ratio, accuracy or precision at a given recall threshold 108 | def confusion_matrix_analysis(confusion_matrix_input, calculation, recall_threshold=0.9, verbose=False, save_path=None): 109 | runtime = time.time() 110 | # Load confusion matrix data 111 | if type(confusion_matrix_input)!=dict: 112 | confusion_matrix = p.load(open(confusion_matrix_input, 'rb')) 113 | else: 114 | confusion_matrix = confusion_matrix_input 115 | 116 | # Calculate average and variance of specified calculation 117 | cohort_calculated_values_mean, cohort_calculated_values_var = {}, {} 118 | # For each cohort tested 119 | for cohort in confusion_matrix: 120 | print cohort 121 | n = len(confusion_matrix[cohort]) 122 | calculation_values = [] 123 | # For all sub-sample iterations 124 | for i in range(n): 125 | # Find where recall >= recall threshold 126 | for j in range(len(confusion_matrix[cohort][i]['TP'])): 127 | TP = confusion_matrix[cohort][i]['TP'][j] 128 | FN = confusion_matrix[cohort][i]['FN'][j] 129 | recall = TP / float((TP+FN)) 130 | if recall >= recall_threshold: 131 | FP = confusion_matrix[cohort][i]['FP'][j] 132 | TN = confusion_matrix[cohort][i]['TN'][j] 133 | if calculation=='OR': # Odds Ratio: OR = (TP/FP) / (FN/TN) 134 | calculation_values.append((float(TP)/FP) / 
(float(FN)/TN)) 135 | elif calculation=='RR': # Risk Ratio / Relative Risk: RR = (TP/(TP+FN)) / (FP/(FP+TN)) 136 | calculation_values.append((float(TP)/(TP+FN)) / (float(FP)/(FP+TN))) 137 | elif calculation=='accuracy': # accuracy = (TP + TN) / (TP + TN + FP + FN) 138 | calculation_values.append(float(TP+TN) / (TP+FN+FP+TN)) 139 | else: # precision = (TP) / (TP+FP) 140 | calculation_values.append(float(TP) / (TP+FP)) 141 | break 142 | # Calculate average and variance of value of interest across all iterations for given cohort 143 | cohort_calculated_values_mean[cohort] = np.mean(calculation_values) 144 | cohort_calculated_values_var[cohort] = np.var(calculation_values) 145 | # Return table of average/variance values for performance on all cohorts at given threshold 146 | cohort_calculated_values_table = pd.concat([pd.Series(cohort_calculated_values_mean, name='Average '+calculation), 147 | pd.Series(cohort_calculated_values_var, name=calculation+' Var')], axis=1) 148 | if save_path is None: 149 | if verbose: 150 | print calculation, 'calculation completed for all cohorts', round(time.time()-runtime, 2), 'seconds.' 151 | return cohort_calculated_values_table 152 | else: 153 | cohort_calculated_values_table.to_csv(save_path) 154 | if verbose: 155 | print calculation, 'calculation completed for all cohorts', round(time.time()-runtime, 2), 'seconds.' 156 | return cohort_calculated_values_table 157 | 158 | 159 | -------------------------------------------------------------------------------- /network_evaluation_tools/network_propagation.py: -------------------------------------------------------------------------------- 1 | ####################################################### 2 | # ---------- Network Propagation Functions ---------- # 3 | ####################################################### 4 | import networkx as nx 5 | import time 6 | import numpy as np 7 | import scipy 8 | import pandas as pd 9 | import copy 10 | 11 | # Normalize network (or network subgraph) for random walk propagation 12 | def normalize_network(network, symmetric_norm=False): 13 | adj_mat = nx.adjacency_matrix(network) 14 | adj_array = np.array(adj_mat.todense()) 15 | if symmetric_norm: 16 | D = np.diag(1/np.sqrt(sum(adj_array))) 17 | adj_array_norm = np.dot(np.dot(D, adj_array), D) 18 | else: 19 | degree_norm_array = np.diag(1/sum(adj_array).astype(float)) 20 | sparse_degree_norm_array = scipy.sparse.csr_matrix(degree_norm_array) 21 | adj_array_norm = sparse_degree_norm_array.dot(adj_mat).toarray() 22 | return adj_array_norm 23 | # Note about normalizing by degree: if we multiply by degree_norm_array first (D^-1 * A), then we do not need to return the 24 | # transposed adjacency array; it is already in the correct orientation 25 | 26 | # Calculate optimal propagation coefficient (updated model) 27 | def calculate_alpha(network, m=-0.02935302, b=0.74842057): 28 | log_edge_count = np.log10(len(network.edges())) 29 | alpha_val = round(m*log_edge_count+b,3) 30 | if alpha_val <=0: 31 | raise ValueError('Alpha <= 0 - Network Edge Count is too high') 32 | # There should never be a case where Alpha >= 1, as that would require a negative log edge count (i.e. a network with fewer than one edge) 33 | else: 34 | return alpha_val 35 | 36 | # Closed form random-walk propagation (as seen in HotNet2) for each subgraph: Ft = (1-alpha)*Fo * (I-alpha*norm_adj_mat)^-1 37 | # Concatenate to previous set of subgraphs 38 | def fast_random_walk(alpha, binary_mat, subgraph_norm, prop_data): 39 | term1=(1-alpha)*binary_mat 40 | term2=np.identity(binary_mat.shape[1])-alpha*subgraph_norm 41 | 
term2_inv = np.linalg.inv(term2) 42 | subgraph_prop = np.dot(term1, term2_inv) 43 | return np.concatenate((prop_data, subgraph_prop), axis=1) 44 | 45 | # Wrapper for random walk propagation of full network by subgraphs 46 | def closed_form_network_propagation(network, binary_matrix, network_alpha, symmetric_norm=False, verbose=False, save_path=None): 47 | starttime=time.time() 48 | if verbose: 49 | print 'Alpha:', network_alpha 50 | # Separate network into connected components and calculate propagation values of each sub-sample on each connected component 51 | subgraphs = list(nx.connected_component_subgraphs(network)) 52 | # Initialize propagation results by propagating first subgraph 53 | subgraph = subgraphs[0] 54 | subgraph_nodes = list(subgraph.nodes) 55 | prop_data_node_order = list(subgraph_nodes) 56 | binary_matrix_filt = np.array(binary_matrix.T.ix[subgraph_nodes].fillna(0).T) 57 | subgraph_norm = normalize_network(subgraph, symmetric_norm=symmetric_norm) 58 | prop_data_empty = np.zeros((binary_matrix_filt.shape[0], 1)) 59 | prop_data = fast_random_walk(network_alpha, binary_matrix_filt, subgraph_norm, prop_data_empty) 60 | # Get propagated results for remaining subgraphs 61 | for subgraph in subgraphs[1:]: 62 | subgraph_nodes = list(subgraph.nodes) 63 | prop_data_node_order = prop_data_node_order + subgraph_nodes 64 | binary_matrix_filt = np.array(binary_matrix.T.ix[subgraph_nodes].fillna(0).T) 65 | subgraph_norm = normalize_network(subgraph, symmetric_norm=symmetric_norm) 66 | prop_data = fast_random_walk(network_alpha, binary_matrix_filt, subgraph_norm, prop_data) 67 | # Return propagated result as dataframe 68 | prop_data_df = pd.DataFrame(data=prop_data[:,1:], index = binary_matrix.index, columns=prop_data_node_order) 69 | if save_path is None: 70 | if verbose: 71 | print 'Network Propagation Complete:', time.time()-starttime, 'seconds' 72 | return prop_data_df 73 | else: 74 | prop_data_df.to_csv(save_path) 75 | if verbose: 76 | print 'Network Propagation Complete:', time.time()-starttime, 'seconds' 77 | return prop_data_df 78 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | Setup module adapted from setuptools code. See: 3 | https://packaging.python.org/en/latest/distributing.html 4 | https://github.com/pypa/sampleproject 5 | """ 6 | 7 | # Always prefer setuptools over distutils 8 | from setuptools import setup, find_packages 9 | 10 | setup( 11 | name='network_evaluation_tools', 12 | version='1.0.2', 13 | description='Module to perform patient and molecular network evaluation as described in Huang and Carlin, et al. 2018', 14 | url='https://github.com/idekerlab/Network_Evaluation_Tools', 15 | author='Justin Huang', 16 | author_email='jkh013@ucsd.edu', 17 | license='MIT', 18 | classifiers=[ 19 | 'Development Status :: 5 - Production/Stable', 20 | 'Intended Audience :: Science/Research', 21 | 'Topic :: Software Development :: Build Tools', 22 | 'License :: OSI Approved :: MIT License', 23 | 'Programming Language :: Python :: 2.7' 24 | ], 25 | packages=find_packages(exclude=['copy', 'itertools', 'os', 're', 'time']), 26 | install_requires=[ 27 | 'argparse>=1.1', 28 | 'networkx>=2.1', 29 | 'numpy>=1.11.0', 30 | 'matplotlib>=1.5.1', 31 | 'pandas>=0.19.0', 32 | 'requests>=2.13.0', 33 | 'scipy>=0.17.0', 34 | 'scikit-learn>=0.17.1', 35 | 'seaborn>=0.7.1'] 36 | ) --------------------------------------------------------------------------------