├── .gitignore ├── Chemical-Chemical ├── README.txt ├── make_drugbank_chemical_chemical.py └── parse_drugbank_chemical_chemical.py ├── Chemical-Gene ├── README.txt ├── make_drugbank_chemical_gene.py └── parse_drugbank_chemical_gene.py ├── Chemical ├── README.txt ├── make_snap_chemical_mode_table.py ├── newChemParser.py ├── parse_drugbank_chemicals.py └── test_db_parse.py ├── Disease-Chemical ├── README.txt └── make_disease_chem_ctd.py ├── Disease-Disease ├── README.txt └── parse_do_disease_disease.py ├── Disease-Function ├── README.txt └── make_disease_func_ctd.py ├── Disease-Gene ├── README.txt ├── make_disease_gene_ctd.py └── make_disease_gene_disgenet.py ├── Disease ├── README.txt ├── parse_ctd_diseases.py ├── parse_do_diseases.py └── parse_omim_diseases.py ├── Function-Function ├── README.txt └── parse_obo_for_functions.py ├── Function ├── README.txt └── parse_obo_for_functions.py ├── Gene-Function └── README.txt ├── Gene-Protein ├── README.txt └── fetch_ensembl_id_mapping.py ├── Gene └── README.txt ├── Protein-Protein └── README.txt ├── Protein ├── README.txt └── add_organism.py ├── README.txt ├── Utils ├── README.txt ├── create_snap_crossnet_table.py ├── create_snap_mode_equiv_table.py ├── create_snap_mode_table.py ├── extract_edge_list.py ├── extract_unique_node_ids.py ├── getStats.py └── utils.py ├── drugbank ├── edges │ ├── README.txt │ ├── getDrugInteractions.py │ ├── getGeneInteractions.py │ ├── make-edges.sh │ ├── makeEdgeTableCC.py │ └── makeEdgeTableCG.py └── nodes │ ├── README.txt │ ├── make-nodes.sh │ ├── makeNodeTables.py │ └── parseDrugbank.py └── examples ├── README.txt ├── config.txt ├── miner_get_stats.py └── miner_load_tables.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled source # 2 | ################### 3 | *.pyc 4 | -------------------------------------------------------------------------------- /Chemical-Chemical/README.txt: 
-------------------------------------------------------------------------------- 1 | Current datasets containing chemical-chemical information: 2 | - drugbank 3 | 4 | Workflow: 5 | 6 | Input Files: 7 | /path/to/input/drugbank.xml 8 | /path/to/input/miner-chemical-0-drugbank-20160523.tsv 9 | 10 | Intermediate Files: 11 | /path/to/intermediate/drugbank_parsed_chemical_chemical.tsv 12 | 13 | Output Files: 14 | /path/to/output/miner-chemical-chemical-20160423.tsv 15 | /path/to/output/miner-chemical-chemical-0-drugbank-20160423.tsv 16 | 17 | # Parse data 18 | python parse_drugbank_chemical_chemical.py /path/to/input/drugbank.xml --output_dir /path/to/intermediate/ 19 | 20 | # Create crossnet tables 21 | python make_drugbank_chemical_chemical.py /path/to/intermediate/drugbank_parsed_chemical_chemical.tsv ./../Chemical/miner-chemical-0-drugbank-20160523.tsv --output_dir /path/to/output/ 22 | 23 | Usage of the scripts used: 24 | 25 | ------------------------------------------ 26 | file : parse_drugbank_chemical_chemical.py 27 | ------------------------------------------ 28 | 29 | XML parser to parse the drugbank database for chemical chemical interactions. 30 | Outputs a tab separated .tsv file with the following coloumn headers: 31 | DrugbankId DrugbankId 32 | 33 | Usage: 34 | python parse_drugbank_chemical_chemical.py 35 | 36 | Positional Arguments: 37 | input_file : Path to the durgbank.xml file. 38 | 39 | Optional Arugments: 40 | --output_dir : Directory to create output files. Defaults to the current working directory. 
41 | 42 | Example Usage: 43 | Input File: drugbank.xml 44 | 45 | Output directory : outputs/chemical/ 46 | 47 | Comamnd line: 48 | python parse_drugbank_chemical_chemical.py drugbank.xml --output_dir outputs/chemicals/ 49 | 50 | Output: 51 | drugbank_parsed_chemical_chemical.tsv 52 | 53 | ------------------------------------------ 54 | file : make_drugbank_chemical_chemical.py 55 | ------------------------------------------ 56 | 57 | Script to output chemical chemical interactions. 58 | 59 | Usage: 60 | python make_drugbank_chemical_chemical.py 61 | 62 | Positional Arguments: 63 | input_file : Path to chemical chemical interaction file (drugbank_parsed_chemical_chemical.tsv) 64 | mode_file : Path to chemical mode file (miner-chemical-0-drugbank-20160523.tsv) 65 | 66 | Optional Arugments: 67 | --output_dir : Directory to create output files. Defaults to the current working directory. 68 | 69 | Example Usage: 70 | Input File: drugbank_parsed_chemical_chemical.tsv, miner-chemical-0-drugbank-20160523.tsv 71 | 72 | Output directory : outputs/chemical/ 73 | 74 | Comamnd line: 75 | python make_drugbank_chemical_chemical.py drugbank_parsed_chemical_chemical.tsv ./../nodes/miner-chemical-0-drugbank-20160523.tsv --output_dir outputs/chemicals/ 76 | 77 | Output: 78 | miner-chemical-chemical-20160423.tsv, miner-chemical-chemical-0-drugbank-20160423.tsv 79 | 80 | 81 | -------------------------------------------------------------------------------- /Chemical-Chemical/make_drugbank_chemical_chemical.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file : make_drugbank_chemical_chemical.py 3 | author: Agrim Gupta 4 | 5 | Script to output chemical chemical interactions. 
6 | 7 | Usage: 8 | python make_drugbank_chemical_chemical.py 9 | 10 | Positional Arguments: 11 | input_file : Path to chemical chemical interaction file (drugbank_parsed_chemical_chemical.tsv) 12 | mode_file : Path to chemical mode file (miner-chemical-0-drugbank-20160523.tsv) 13 | 14 | Optional Arugments: 15 | --output_dir : Directory to create output files. Defaults to the current working directory. 16 | 17 | Example Usage: 18 | Input File: drugbank_parsed_chemical_chemical.tsv, miner-chemical-0-drugbank-20160523.tsv 19 | 20 | Output directory : outputs/chemical/ 21 | 22 | Comamnd line: 23 | python make_drugbank_chemical_chemical.py drugbank_parsed_chemical_chemical.tsv ./../nodes/miner-chemical-0-drugbank-20160523.tsv --output_dir outputs/chemicals/ 24 | 25 | Output: 26 | miner-chemical-chemical-20160423.tsv, miner-chemical-chemical-0-drugbank-20160423.tsv 27 | ''' 28 | from collections import defaultdict 29 | import os 30 | import argparse 31 | from datetime import datetime 32 | 33 | parser = argparse.ArgumentParser(description='Output crossnet for chemical chemical interaction') 34 | parser.add_argument('input_file', help='input file path. File should be parsed chemical-chemical interaction') 35 | parser.add_argument('mode_file', help='mode file path. 
File should be the chemical mode file') 36 | parser.add_argument('--output_dir', help='directory to output files', default='.') 37 | args = parser.parse_args() 38 | sep = "\t" 39 | empty = "NULL" 40 | format = '%Y%m%d' 41 | dateStr = datetime.now().strftime(format) 42 | 43 | snapIdPrefix = "" 44 | edgeFile = args.input_file 45 | nodeMap = args.mode_file 46 | masterTable = os.path.join(args.output_dir, "miner-chemical-chemical-" + dateStr + ".tsv") 47 | subTable = os.path.join(args.output_dir, "miner-chemical-chemical-0-drugbank-" + dateStr + ".tsv") 48 | idNum = 0 49 | # Make a dict mapping from drugbankId to snapChemId 50 | drugbankSnap = {} 51 | with open(nodeMap, 'r') as f: 52 | for line in f: 53 | if line.startswith('#'): 54 | continue 55 | line = line.strip().split(sep) 56 | drugbankSnap[line[1]] = line[0] 57 | 58 | drugsDone = defaultdict(list) 59 | with open(edgeFile, 'r') as f, open(masterTable, 'w') as master, open(subTable, 'w') as sub: 60 | master.write('# snap_edge_id\tdataset_id\tsnap_source_id\tsnap_dst_id\n') 61 | sub.write('# snap_edge_id\tdataset_source_id\tdataset_dst_id\n') 62 | for line in f: 63 | if line.startswith('#'): 64 | continue 65 | line = line.strip().split(sep) 66 | if line[1] in drugsDone[line[0]]: 67 | continue 68 | if line[0] not in drugbankSnap or line[1] not in drugbankSnap: 69 | continue 70 | drugsDone[line[0]].append(line[1]) 71 | snapId = snapIdPrefix + str(idNum) 72 | idNum += 1 73 | master.write(snapId + sep + "0" + sep + drugbankSnap[line[0]] + sep + drugbankSnap[line[1]] + sep + line[2] + '\n') 74 | sub.write(snapId + sep + line[0] + sep + line[1] + sep + line[2] + '\n') 75 | 76 | 77 | -------------------------------------------------------------------------------- /Chemical-Chemical/parse_drugbank_chemical_chemical.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file : parse_drugbank_chemical_chemical.py 3 | author: Agrim Gupta 4 | 5 | XML parser to parse the drugbank 
database for chemical chemical interactions. 6 | Outputs a tab separated .tsv file with the following coloumn headers: 7 | DrugbankId DrugbankId 8 | 9 | Usage: 10 | python parse_drugbank_chemical_chemical.py 11 | 12 | Positional Arguments: 13 | input_file : Path to the durgbank.xml file. 14 | 15 | Optional Arugments: 16 | --output_dir : Directory to create output files. Defaults to the current working directory. 17 | 18 | Example Usage: 19 | Input File: drugbank.xml 20 | 21 | Output directory : outputs/chemical/ 22 | 23 | Comamnd line: 24 | python parse_drugbank_chemical_chemical.py drugbank.xml --output_dir outputs/chemicals/ 25 | 26 | Output: 27 | drugbank_parsed_chemical_chemical.tsv 28 | ''' 29 | 30 | from bs4 import BeautifulSoup 31 | import os 32 | import argparse 33 | 34 | parser = argparse.ArgumentParser(description='Parse Durgbank database for drug drug interaction') 35 | parser.add_argument('input_file', help='input file path. File should be the drugbank.xml file.') 36 | parser.add_argument('--output_dir', help='directory to output files', default='.') 37 | args = parser.parse_args() 38 | outputFile = os.path.join(args.output_dir, "drugbank_parsed_chemical_chemical.tsv") 39 | soup = BeautifulSoup(open(args.input_file),"xml") 40 | sep = "\t" 41 | empty = "NULL" 42 | with open(outputFile, 'w') as f: 43 | for drug in soup.findAll("drug"): 44 | drugName = drug.find("drugbank-id").text 45 | interactions = drug.findAll("drug-interaction") 46 | if not interactions: 47 | continue 48 | for i in interactions: 49 | toPrint = drugName + sep + i.find("drugbank-id").text + sep + i.find("description").text 50 | f.write(toPrint.encode('utf-8') + '\n') 51 | -------------------------------------------------------------------------------- /Chemical-Gene/README.txt: -------------------------------------------------------------------------------- 1 | Current datasets containing chemical-gene information: 2 | - drugbank 3 | 4 | Workflow: 5 | 6 | Input Files: 7 | 
/path/to/input/drugbank.xml 8 | /path/to/input/miner-chemical-0-drugbank-20160523.tsv 9 | /path/to/input/miner-gene-0-20160523.tsv 10 | 11 | Intermediate Files: 12 | /path/to/intermediate/drugbank_parsed_chemical_gene.tsv 13 | 14 | Output Files: 15 | /path/to/output/miner-chemical-gene-20160423.tsv 16 | /path/to/output/miner-chemical-gene-0-drugbank-20160423.tsv 17 | 18 | # Parse data 19 | python parse_drugbank_chemical_gene.py /path/to/input/drugbank.xml --output_dir /path/to/intermediate/ 20 | 21 | # Create crossnet tables 22 | python make_drugbank_chemical_gene.py /path/to/intermediate/drugbank_parsed_chemical_gene.tsv ./../nodes/miner-chemical-0-drugbank-20160523.tsv miner-genes-0-go-20160523.tsv --output_dir /path/to/output/ 23 | 24 | 25 | Usage of the scripts used: 26 | 27 | -------------------------------------- 28 | file : parse_drugbank_chemical_gene.py 29 | -------------------------------------- 30 | 31 | XML parser to parse the drugbank database for chemical gene interactions. 32 | Outputs a tab separated .tsv file with the following coloumn headers: 33 | DrugbankId Gene1 Gene2 ... 34 | Currently UniportID is used for genes. 35 | 36 | Usage: 37 | python parse_drugbank_chemical_gene.py 38 | 39 | Positional Arguments: 40 | input_file : Path to the durgbank.xml file. 41 | 42 | Optional Arugments: 43 | --output_dir : Directory to create output files. Defaults to the current working directory. 44 | 45 | Example Usage: 46 | Input File: drugbank.xml 47 | 48 | Output directory : outputs/chemical/ 49 | 50 | Comamnd line: 51 | python parse_drugbank_chemical_gene.py drugbank.xml --output_dir outputs/chemicals/ 52 | 53 | Output: 54 | drugbank_parsed_chemical_gene.tsv 55 | 56 | ------------------------------------- 57 | file : make_drugbank_chemical_gene.py 58 | ------------------------------------- 59 | 60 | Script to output chemical gene interactions. 
61 | 62 | Usage: 63 | python make_drugbank_chemical_gene.py 64 | 65 | Positional Arguments: 66 | input_file : Path to chemical chemical interaction file (drugbank_parsed_chemical_gene.tsv) 67 | chemical_mode : Path to chemical mode file (miner-chemical-0-drugbank-20160523.tsv) 68 | gene_mode : Path to gene mode file (miner-genes-0-go-20160523.tsv) 69 | 70 | Optional Arugments: 71 | --output_dir : Directory to create output files. Defaults to the current working directory. 72 | 73 | Example Usage: 74 | Input File: drugbank_parsed_chemical_gene.tsv, miner-chemical-0-drugbank-20160523.tsv 75 | 76 | Output directory : outputs/chemical/ 77 | 78 | Comamnd line: 79 | python make_drugbank_chemical_gene.py drugbank_parsed_chemical_gene.tsv ./../nodes/miner-chemical-0-drugbank-20160523.tsv miner-genes-0-go-20160523.tsv --output_dir outputs/chemicals/ 80 | 81 | Output: 82 | miner-chemical-gene-20160423.tsv, miner-chemical-gene-0-drugbank-20160423.tsv 83 | 84 | -------------------------------------------------------------------------------- /Chemical-Gene/make_drugbank_chemical_gene.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file : make_drugbank_chemical_gene.py 3 | author: Agrim Gupta 4 | 5 | Script to output chemical gene interactions. 6 | 7 | Usage: 8 | python make_drugbank_chemical_gene.py 9 | 10 | Positional Arguments: 11 | input_file : Path to chemical chemical interaction file (drugbank_parsed_chemical_gene.tsv) 12 | chemical_mode : Path to chemical mode file (miner-chemical-0-drugbank-20160523.tsv) 13 | gene_mode : Path to gene mode file (miner-genes-0-go-20160523.tsv) 14 | 15 | Optional Arugments: 16 | --output_dir : Directory to create output files. Defaults to the current working directory. 
17 | 18 | Example Usage: 19 | Input File: drugbank_parsed_chemical_gene.tsv, miner-chemical-0-drugbank-20160523.tsv 20 | 21 | Output directory : outputs/chemical/ 22 | 23 | Comamnd line: 24 | python make_drugbank_chemical_gene.py drugbank_parsed_chemical_gene.tsv ./../nodes/miner-chemical-0-drugbank-20160523.tsv miner-genes-0-go-20160523.tsv --output_dir outputs/chemicals/ 25 | 26 | Output: 27 | miner-chemical-gene-20160423.tsv, miner-chemical-gene-0-drugbank-20160423.tsv 28 | ''' 29 | from collections import defaultdict 30 | import os 31 | import argparse 32 | from datetime import datetime 33 | 34 | parser = argparse.ArgumentParser(description='Output crossnet for chemical chemical interaction') 35 | parser.add_argument('input_file', help='input file path. File should be parsed chemical-gene interaction') 36 | parser.add_argument('chemical_mode', help='chemical mode file path. File should be the chemical mode file') 37 | parser.add_argument('gene_mode', help='gene mode file path. File should be the gene mode file') 38 | parser.add_argument('--output_dir', help='directory to output files', default='.') 39 | args = parser.parse_args() 40 | sep = "\t" 41 | empty = "NULL" 42 | format = '%Y%m%d' 43 | dateStr = datetime.now().strftime(format) 44 | snapIdPrefix = "" 45 | 46 | edgeFile = args.input_file 47 | nodeMap = args.chemical_mode 48 | geneMap = args.gene_mode 49 | masterTable = os.path.join(args.output_dir, "miner-chemical-gene-" + dateStr + ".tsv") 50 | subTable = os.path.join(args.output_dir, "miner-chemical-gene-0-drugbank-" + dateStr + ".tsv") 51 | idNum = 0 52 | # Make a dict mapping from drugbankId to snapChemId 53 | drugbankSnap = {} 54 | with open(nodeMap, 'r') as f: 55 | for line in f: 56 | if line.startswith('#'): 57 | continue 58 | line = line.strip().split(sep) 59 | drugbankSnap[line[1]] = line[0] 60 | 61 | # Make a dict mapping from UniProtKB to snapGeneId 62 | geneSnap = {} 63 | with open(geneMap, 'r') as f: 64 | for line in f: 65 | if 
line.startswith('#'): 66 | continue 67 | line = line.strip().split('\t') 68 | geneSnap[line[1]] = line[0] 69 | 70 | with open(edgeFile, 'r') as f, open(masterTable, 'w') as master, open(subTable, 'w') as sub: 71 | master.write('# snap_edge_id\tdataset_id\tsnap_source_id\tsnap_dst_id\n') 72 | sub.write('# snap_edge_id\tdataset_source_id\tdataset_dst_id\n') 73 | for line in f: 74 | if line.startswith('#'): 75 | continue 76 | line = line.strip().split(sep) 77 | if line[0] not in drugbankSnap: 78 | continue 79 | geneList = line[1].split(",") 80 | if geneList[0] == "NULL": 81 | continue 82 | for gene in geneList: 83 | snapId = snapIdPrefix + str(idNum) 84 | idNum += 1 85 | if gene not in geneSnap: 86 | print gene 87 | continue 88 | master.write(snapId + sep + "0" + sep + drugbankSnap[line[0]] + sep + geneSnap[gene] + '\n') 89 | sub.write(snapId + sep + line[0] + sep + gene + '\n') 90 | 91 | 92 | -------------------------------------------------------------------------------- /Chemical-Gene/parse_drugbank_chemical_gene.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file : parse_drugbank_chemical_gene.py 3 | author: Agrim Gupta 4 | 5 | XML parser to parse the drugbank database for chemical gene interactions. 6 | Outputs a tab separated .tsv file with the following coloumn headers: 7 | DrugbankId Gene1 Gene2 ... 8 | Currently UniportID is used for genes. 9 | 10 | Usage: 11 | python parse_drugbank_chemical_gene.py 12 | 13 | Positional Arguments: 14 | input_file : Path to the durgbank.xml file. 15 | 16 | Optional Arugments: 17 | --output_dir : Directory to create output files. Defaults to the current working directory. 
18 | 19 | Example Usage: 20 | Input File: drugbank.xml 21 | 22 | Output directory : outputs/chemical/ 23 | 24 | Comamnd line: 25 | python parse_drugbank_chemical_gene.py drugbank.xml --output_dir outputs/chemicals/ 26 | 27 | Output: 28 | drugbank_parsed_chemical_gene.tsv 29 | ''' 30 | 31 | from bs4 import BeautifulSoup 32 | import os 33 | import argparse 34 | 35 | parser = argparse.ArgumentParser(description='Parse Durgbank database for drug gene interaction') 36 | parser.add_argument('input_file', help='input file path. File should be the drugbank.xml file.') 37 | parser.add_argument('--output_dir', help='directory to output files', default='.') 38 | args = parser.parse_args() 39 | outputFile = os.path.join(args.output_dir, "drugbank_parsed_chemical_gene.tsv") 40 | soup = BeautifulSoup(open(args.input_file),"xml") 41 | sep = "\t" 42 | empty = "NULL" 43 | #geneIdentifier = "HUGO Gene Nomenclature Committee (HGNC)" 44 | geneIdentifier = "UniProtKB" 45 | with open(outputFile, 'w') as f: 46 | for drug in soup.findAll("drug"): 47 | toPrint = "" 48 | toPrint += drug.find("drugbank-id").text + sep 49 | # Get target Genes 50 | targets = drug.findAll("target") 51 | targetGene = [] 52 | if targets: 53 | for target in targets: 54 | externIden = target.findAll("external-identifier") 55 | if not externIden: 56 | continue 57 | for iden in externIden: 58 | if iden.find("resource").text == geneIdentifier: 59 | targetGene.append(iden.find("identifier").text) 60 | # Get Enzyme Gene 61 | enzymes = drug.findAll("enzyme") 62 | enzymeGene = [] 63 | if enzymes: 64 | for enzyme in enzymes: 65 | externIden = enzyme.findAll("external-identifier") 66 | if not externIden: 67 | continue 68 | for iden in externIden: 69 | if iden.find("resource").text == geneIdentifier: 70 | enzymeGene.append(iden.find("identifier").text) 71 | allGene = targetGene + enzymeGene 72 | if len(allGene) == 0: 73 | toPrint += empty 74 | else: 75 | toPrint += ','.join(allGene) 76 | f.write(toPrint.encode('utf-8') + 
'\n') 77 | -------------------------------------------------------------------------------- /Chemical/README.txt: -------------------------------------------------------------------------------- 1 | Current datasets containing chemical information: 2 | - Drugbank 3 | 4 | Workflow for creating mode tables for chemicals: 5 | 6 | Input files: 7 | /path/to/input/drugbank.xml 8 | 9 | Intermediate files: 10 | /path/to/intermediate/drugbank_parsed.tsv 11 | Output files: 12 | /path/to/output/miner-chemical-20160523.tsv 13 | /path/to/output/miner-chemical-0-drugbank-20160523.tsv 14 | /path/to/output/miner-chemical-1-PubChemCompound-20160523.tsv 15 | /path/to/output/miner-chemical-2-PubChemSubstance-20160523.tsv 16 | /path/to/output/miner-chemical-equiv-20160523.tsv 17 | 18 | # Parse Data 19 | # Beautiful Soup is required for this, use pipenv to install if you lack permissions 20 | python parse_drugbank_chemicals.py /path/to/input/drugbank.xml --output-dir /path/to/intermediate/ 21 | 22 | # Create Mode tables 23 | python make_snap_chemical_mode.py /path/to/intermediate/drugbank_parsed.tsv --output_dir /path/to/intermediate 24 | 25 | Usage of the scripts used: 26 | 27 | ---------------------------------- 28 | file : parse_drugbank_chemicals.py 29 | ---------------------------------- 30 | 31 | XML parser to parse the drugbank database and output a tsv file 32 | containting the following coloumn headers: 33 | DrugbankID PubChem_Compound PubChem_Substance 34 | 35 | Usage: 36 | python parse_drugbank_chemicals.py 37 | 38 | Positional Arguments: 39 | input_file : Path to the durgbank.xml file. 40 | 41 | Optional Arugments: 42 | --output_dir : Directory to create output files. Defaults to the current working directory. 
43 | 44 | Example Usage: 45 | Input File: drugbank.xml 46 | 47 | Output directory : outputs/chemical/ 48 | 49 | Comamnd line: 50 | python parse_drugbank_chemicals.py drugbank.xml --output_dir outputs/chemicals/ 51 | 52 | Output: 53 | drugbank_parsed.tsv 54 | 55 | --------------------------------------- 56 | file : make_snap_chemical_mode_table.py 57 | --------------------------------------- 58 | 59 | Takes input parsed durgbank.xml with the following coloumn headers: 60 | DrugbankID PubChem_Compound PubChem_Substance. Outputs snap tables for 61 | chemical mode. 62 | 63 | Usage: 64 | python make_snap_chemical_mode.py 65 | 66 | Positional Arguments: 67 | input_file : Path to parsed drugbank.xml. 68 | 69 | Optional Arugments: 70 | --output_dir : Directory to create output files. Defaults to the current working directory. 71 | 72 | Example Usage: 73 | Input File: drugbank_parsed.tsv 74 | 75 | Output directory : outputs/chemical/ 76 | 77 | Comamnd line: 78 | python make_snap_chemical_mode.py drugbank_parsed.tsv --output_dir outputs/chemicals/ 79 | 80 | Output: 81 | miner-chemical-20160523.tsv, miner-chemical-0-drugbank-20160523.tsv, 82 | miner-chemical-1-PubChemCompound-20160523.tsv, miner-chemical-2-PubChemSubstance-20160523.tsv, 83 | miner-chemical-equiv-20160523.tsv 84 | 85 | 86 | -------------------------------------------------------------------------------- /Chemical/make_snap_chemical_mode_table.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file : make_snap_chemical_mode_table.py 3 | author: Agrim Gupta 4 | 5 | Takes input parsed durgbank.xml with the following coloumn headers: 6 | DrugbankID PubChem_Compound PubChem_Substance. Outputs snap tables for 7 | chemical mode. 8 | 9 | Usage: 10 | python make_snap_chemical_mode.py 11 | 12 | Positional Arguments: 13 | input_file : Path to parsed drugbank.xml. 14 | 15 | Optional Arugments: 16 | --output_dir : Directory to create output files. 
Defaults to the current working directory. 17 | 18 | Example Usage: 19 | Input File: drugbank_parsed.tsv 20 | 21 | Output directory : outputs/chemical/ 22 | 23 | Comamnd line: 24 | python make_snap_chemical_mode.py drugbank_parsed.tsv --output_dir outputs/chemicals/ 25 | 26 | Output: 27 | miner-chemical-20160523.tsv, miner-chemical-0-drugbank-20160523.tsv, 28 | miner-chemical-1-PubChemCompound-20160523.tsv, miner-chemical-2-PubChemSubstance-20160523.tsv, 29 | miner-chemical-equiv-20160523.tsv 30 | ''' 31 | import itertools 32 | import os 33 | from datetime import datetime 34 | import argparse 35 | 36 | sep = "\t" 37 | empty = "NULL" 38 | snapIdPrefix = "" 39 | idNum = 0 40 | format = '%Y%m%d' 41 | dateStr = datetime.now().strftime(format) 42 | 43 | parser = argparse.ArgumentParser(description='Make mode tables for chemical') 44 | parser.add_argument('input_file', help='input file path. File should be the parsed drugbank.xml') 45 | parser.add_argument('--output_dir', help='directory to output files', default='.') 46 | args = parser.parse_args() 47 | 48 | #output files 49 | masterTable = os.path.join(args.output_dir,"miner-chemical-" + dateStr + ".tsv") 50 | drugbankTable = os.path.join(args.output_dir, "miner-chemical-0-drugbank-" + dateStr + ".tsv") 51 | pubCompundTable = os.path.join(args.output_dir, "miner-chemical-1-PubChemCompound-" + dateStr + ".tsv") 52 | pubSubTable = os.path.join(args.output_dir, "miner-chemical-2-PubChemSubstance-" + dateStr + ".tsv") 53 | eqTable = os.path.join(args.output_dir, "miner-chemical-equiv-" + dateStr + ".tsv") 54 | 55 | subTable = [drugbankTable, pubCompundTable, pubSubTable] 56 | databases = ["drugbank", "PubChemCompound" , "PubChemSubstance"] 57 | subHandle = [open(subTable[i], 'w') for i in xrange(len(subTable))] 58 | # Add Header 59 | for i in xrange(len((subHandle))): 60 | subHandle[i].write('# snap_id\t%s specific id\n' % databases[i]) 61 | 62 | with open(args.input_file, 'r') as input, open(masterTable, 'w') as 
master,open(eqTable, 'w') as eqTable: 63 | master.write('# snap_id\tdataset_id\n') 64 | eqTable.write('# Equivalence table for mode chemical\n') 65 | eqTable.write('# snap_id_1\tsnap_id_2\n') 66 | for line in input: 67 | if line.startswith('#'): 68 | continue 69 | line = line.strip().split(sep) 70 | currId = [] 71 | # Only first three fields are relavant 72 | for num,id in enumerate(line): 73 | if num > 2: 74 | break 75 | if id == "NULL": 76 | continue 77 | snapId = snapIdPrefix + str(idNum) 78 | idNum += 1 79 | master.write(snapId + sep + str(num) + '\n') 80 | subHandle[num].write(snapId + sep + id + '\n') 81 | currId.append(snapId) 82 | allPerms = list(itertools.permutations(currId,2)) 83 | for perm in allPerms: 84 | toWrite = ' '.join(perm) 85 | eqTable.write(toWrite + '\n') 86 | 87 | [handle.close() for handle in subHandle] 88 | -------------------------------------------------------------------------------- /Chemical/newChemParser.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import os 3 | from datetime import datetime 4 | import argparse 5 | 6 | sep = "\t" 7 | empty = "NULL" 8 | snapIdPrefix = "" 9 | idNum = 0 10 | format = '%Y%m%d' 11 | dateStr = datetime.now().strftime(format) 12 | 13 | parser = argparse.ArgumentParser(description='Make mode tables for chemical') 14 | parser.add_argument('input_file', help='input file path. 
File should be the parsed drugbank.xml') 15 | parser.add_argument('--output_dir', help='directory to output files', default='.') 16 | args = parser.parse_args() 17 | 18 | masterTable = os.path.join(args.output_dir,"miner-chemical-" + dateStr + ".tsv") 19 | drugbankTable = os.path.join(args.output_dir, "miner-chemical-0-drugbank-" + dateStr + ".tsv") 20 | with open(args.input_file, 'r') as input, open(masterTable, 'w') as master, open(drugbankTable,'w') as drugTable: 21 | master.write('# snap_id\tdataset_id\n') 22 | drugTable.write('# snap_id\tdataset_id\tname\n') 23 | #eqTable.write('# Equivalence table for mode chemical\n') 24 | #eqTable.write('# snap_id_1\tsnap_id_2\n') 25 | for line in input: 26 | if line.startswith('#'): 27 | continue 28 | spline =line.strip().split(sep) 29 | if line.startswith('DB') and len(spline)>1: 30 | line = spline 31 | id = line[0] 32 | name = line[1] 33 | if name == "": 34 | name = "NULL" 35 | snapId = snapIdPrefix + str(idNum) 36 | idNum += 1 37 | master.write(snapId + sep + id + '\n') 38 | drugTable.write(snapId + sep + id + sep + name + '\n') 39 | -------------------------------------------------------------------------------- /Chemical/parse_drugbank_chemicals.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file : parse_drugbank_chemicals.py 3 | author: Agrim Gupta 4 | 5 | XML parser to parse the drugbank database and output a tsv file 6 | containting the following coloumn headers: 7 | DrugbankID PubChem_Compound PubChem_Substance 8 | 9 | Usage: 10 | python parse_drugbank_chemicals.py 11 | 12 | Positional Arguments: 13 | input_file : Path to the durgbank.xml file. 14 | 15 | Optional Arugments: 16 | --output_dir : Directory to create output files. Defaults to the current working directory. 
17 | 18 | Example Usage: 19 | Input File: drugbank.xml 20 | 21 | Output directory : outputs/chemical/ 22 | 23 | Comamnd line: 24 | python parse_drugbank_chemicals.py drugbank.xml --output_dir outputs/chemicals/ 25 | 26 | Output: 27 | drugbank_parsed.tsv 28 | ''' 29 | 30 | from bs4 import BeautifulSoup 31 | import os 32 | import argparse 33 | 34 | parser = argparse.ArgumentParser(description='Parse Durgbank database for chemicals') 35 | parser.add_argument('input_file', help='input file path. File should be the drugbank.xml file.') 36 | parser.add_argument('--output_dir', help='directory to output files', default='.') 37 | args = parser.parse_args() 38 | outputFile = os.path.join(args.output_dir, "drugbank_parsed.tsv") 39 | soup = BeautifulSoup(open(args.input_file),"xml") 40 | sep = "\t" 41 | empty = "NULL" 42 | fields = ["name", "description", "general-references", "synthesis-reference", 43 | "protein-binding", "classification", "salts", "synonyms", "products", "international-brands", "mixtures", 44 | "manufacturers", "prices", "categories", "dosages", "atc-codes", "food-interactions", "pathways","reactions", 45 | "snp-effects","snp-adverse-drug-reactions"] 46 | header = ["drugbankID", "pc_Compund", "pc_substance"] + fields; 47 | 48 | seenids = set() 49 | seen = set() 50 | 51 | def recur(elem,l): 52 | for e in elem.findChildren(): 53 | if not e.findChildren(): 54 | l.append(e.text.strip()) 55 | else: 56 | recur(e, l) 57 | 58 | with open(outputFile, 'w') as f: 59 | f.write("# " + sep.join(header) + '\n') 60 | for drug in soup.findAll("drug"): 61 | name = drug.find("name").text 62 | id = drug.find("drugbank-id").text 63 | if name not in seen or id not in seenids: 64 | chemFound = False 65 | toPrint = "" 66 | toPrint += drug.find("drugbank-id").text + sep 67 | seen.add(drug.find("name").text) 68 | seenids.add(id) 69 | identifiers = [i for i in drug.findAll("external-identifier")] 70 | for i in identifiers: 71 | database = i.find("resource").text 72 | if database != 
"PubChem Compound": 73 | continue 74 | value = i.find("identifier").text 75 | chemFound = True 76 | toPrint += value + sep 77 | if not chemFound: 78 | toPrint += empty + sep 79 | chemFound = False 80 | for i in identifiers: 81 | database = i.find("resource").text 82 | if database != "PubChem Substance": 83 | continue 84 | value = i.find("identifier").text 85 | chemFound = True 86 | toPrint += value + sep 87 | if not chemFound: 88 | toPrint += empty + sep 89 | attributes = [] 90 | for field in fields: 91 | l = [] 92 | if not drug.find(field): 93 | attributes.append(empty) 94 | continue 95 | if drug.find(field).findChildren(): 96 | recur(drug.find(field),l) 97 | for i in range(len(l)): 98 | if l[i] == "": 99 | l[i] = empty 100 | attributes.append("|".join(l).encode('utf-8')) 101 | else: 102 | if drug.find(field).text != "": 103 | genRef = drug.find(field).text 104 | genRef = genRef.split("\n") 105 | attributes.append("|".join(genRef).encode('utf-8')) 106 | #attributes.append(drug.find(field).text.encode('utf-8')) 107 | else: 108 | attributes.append(empty) 109 | toPrint = toPrint.encode('utf-8') + sep.join(attributes) 110 | f.write(toPrint + '\n') 111 | 112 | -------------------------------------------------------------------------------- /Chemical/test_db_parse.py: -------------------------------------------------------------------------------- 1 | #author: farzaan kaiyom (farzaank) 2 | #description: basic parser for drugbank, printing each nodeid and name 3 | #updated to work for 2019 dataset 4 | 5 | import os 6 | from bs4 import BeautifulSoup 7 | import argparse 8 | 9 | parser = argparse.ArgumentParser(description='Parse Durgbank database for chemicals') 10 | parser.add_argument('input_file', help='input file path. 
File should be the drugbank.xml file.') 11 | parser.add_argument('--output_dir', help='directory to output files', default='.') 12 | args = parser.parse_args() 13 | outputFile = os.path.join(args.output_dir, "drugbank_parse_test1.tsv") 14 | soup = BeautifulSoup(open(args.input_file),"xml") 15 | sep = "\t" 16 | empty = "NULL" 17 | 18 | seen=set() 19 | 20 | with open(outputFile, 'w') as f: 21 | f.write("# " + sep.join(header) + '\n') 22 | counter = 0 23 | drugs = soup.findAll("drug") 24 | for drug in drugs: 25 | drugline = '' 26 | if not drug.find("name"): 27 | name = empty 28 | else: 29 | name = drug.find("name").text 30 | if name not in seen: 31 | seen.add(name) 32 | counter += 1 33 | drugline += (str(counter)+" ") 34 | drugline += (str(name) + " ") 35 | drugline += drug.find('drugbank-id').text + sep 36 | try: 37 | f.write(drugline+'\n') 38 | except: 39 | drugline = drugline.encode('utf-8') 40 | f.write(drugline+'\n') 41 | -------------------------------------------------------------------------------- /Disease-Chemical/README.txt: -------------------------------------------------------------------------------- 1 | Current datasets containing disease-chemical information: 2 | - CTD 3 | 4 | Workflow for creating crossnet tables for disease-chemical relationships: 5 | 6 | Pre-requisites: 7 | Must have the disease modes table from CTD (MESH and OMIM), and the chemical 8 | mode table from Drugbank. 
9 | /path/to/disease_mode/miner-disease-1-CTD_MESH-20160521.tsv 10 | /path/to/disease_mode/miner-disease-2-CTD_OMIM-20160521.tsv 11 | /path/to/chemical_mode/miner-chemical-0-drugbank-20160521.tsv 12 | 13 | Input files/directories: 14 | /path/to/input/CTD_dir (from CTD) 15 | 16 | Intermediate files: 17 | /path/to/intermediate/ctd_disease_chem_parsed.tsv 18 | 19 | Output files: 20 | /path/to/output/miner-disease-chemical-20160521.tsv 21 | /path/to/output/miner-disease-chemical-0-CTD_MESH-20160521.tsv 22 | /path/to/output/miner-disease-chemical-1-CTD_OMIM-20160521.tsv 23 | 24 | # Create intermediate files 25 | python make_disease_chem_ctd.py /path/to/input/CTD_dir --output_dir /path/to/intermediate/ 26 | 27 | # Create cross net files 28 | python ../Utils/create_snap_crossnet_table.py /path/to/intermediate/ctd_disease_chem_parsed.tsv /path/to/disease_mode/miner-disease-1-CTD_MESH-20160521.tsv /path/to/chemical_mode/miner-chemical-0-drugbank-20160521.tsv CTD_MESH 0 --output_dir /path/to/output/ --skip_missing_ids 29 | python ../Utils/create_snap_crossnet_table.py /path/to/intermediate/ctd_disease_chem_parsed.tsv /path/to/disease_mode/miner-disease-2-CTD_OMIM-20160521.tsv /path/to/chemical_mode/miner-chemical-0-drugbank-20160521.tsv CTD_OMIM 1 --output_dir /path/to/output/ --skip_missing_ids 30 | -------------------------------------------------------------------------------- /Disease-Chemical/make_disease_chem_ctd.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file : make_disease_chem_ctd.py 3 | author: Viswajith Venugopal 4 | 5 | Parses CTD to find disease chemical links. 6 | 7 | Usage: 8 | python make_disease_chem_ctd [--output_dir OUTPUT_DIR] 9 | 10 | Positional Arguments: 11 | input_dir : The directory of the CTD files. 12 | 13 | Optional Arugments: 14 | --output_dir : Directory to create output files. Defaults to the current working directory. 
15 | 16 | Example Usage: 17 | Input File: CTD/0416_CTD 18 | 19 | Output directory : outputs/disease-chemical/ 20 | 21 | Comamnd line: 22 | python parse_do_diseases.py CTD/0416_CTD --output_dir outputs/disease-chemical/ 23 | 24 | Output: 25 | ctd_disease_chemical_parsed.tsv 26 | ''' 27 | 28 | from collections import defaultdict 29 | import os 30 | import argparse 31 | 32 | def get_chem_to_db(ctd_dir): 33 | ctd_chem_node_fname = os.path.join(ctd_dir, 'CTD_chemicals.tsv') 34 | chem_to_db_dict = {} 35 | # First, we load uniprot ids. 36 | with open(ctd_chem_node_fname, 'r') as ctd_gene_node_f: 37 | for line in ctd_gene_node_f: 38 | if line.startswith('#'): 39 | continue 40 | sp_line = line.strip('\n').split('\t') 41 | chem_id = sp_line[1] 42 | db_ids = sp_line[8] 43 | if len(db_ids) > 0: 44 | db_ids = db_ids.split('|') 45 | chem_to_db_dict[chem_id] = db_ids 46 | 47 | return chem_to_db_dict 48 | 49 | def parse_ctd_chem_diseases(ctd_dir): 50 | 51 | chem_to_db_dict = get_chem_to_db(ctd_dir) 52 | disease_chem_list = [] 53 | ctd_chem_dis_fname = os.path.join(ctd_dir, 'CTD_chemicals_diseases.tsv') 54 | with open(ctd_chem_dis_fname) as in_f: 55 | for line in in_f: 56 | if line.startswith('#'): 57 | continue 58 | sp_line = line.strip('\n').split('\t') 59 | chem_id = 'MESH:' + sp_line[1] 60 | if chem_id not in chem_to_db_dict: 61 | continue 62 | db_id = chem_to_db_dict[chem_id][0] 63 | disease_id = sp_line[4] 64 | inference_score = sp_line[7] 65 | if inference_score == "": 66 | inference_score = "0" 67 | disease_chem_list.append((disease_id, db_id,inference_score)) 68 | 69 | return disease_chem_list 70 | 71 | 72 | parser = argparse.ArgumentParser(description='Parse CTD to find disease-chemical links.') 73 | parser.add_argument('input_dir', help='Input files directory. 
This should be the directory with all the CTD TSVs.') 74 | parser.add_argument('--output_dir', help='Directory to output files', default='.') 75 | args = parser.parse_args() 76 | 77 | output_fname = os.path.join(args.output_dir, "ctd_disease_chem_parsed.tsv") 78 | 79 | disease_chem_list = parse_ctd_chem_diseases(args.input_dir) 80 | 81 | with open(output_fname, 'w') as out_f: 82 | out_f.write('#Disease Chemical links from CTD.\n') 83 | for (disease_id, db_id, iscore) in disease_chem_list: 84 | out_f.write('\t'.join([disease_id, db_id,iscore])) 85 | out_f.write('\n') 86 | -------------------------------------------------------------------------------- /Disease-Disease/README.txt: -------------------------------------------------------------------------------- 1 | Current datasets containing disease-disease information: 2 | - DiseaseOntology (has edges for the 'is_a' relationship) 3 | 4 | Workflow for creating crossnet tables for disease-disease relationships: 5 | 6 | Pre-requisites: 7 | Must have the disease mode table from DOID 8 | /path/to/mode/miner-disease-0-DOID-20160521.tsv 9 | 10 | Input files/directories: 11 | /path/to/input/doid.obo (from DOID) 12 | 13 | Intermediate files: 14 | /path/to/intermediate/doid_disease_disease_parsed.tsv 15 | 16 | Output files: 17 | /path/to/output/miner-disease-disease-20160521.tsv 18 | /path/to/output/miner-disease-disease-0-DOID-20160521.tsv 19 | 20 | # Create intermediate files 21 | python parse_do_disease_disease.py /path/to/input/doid.obo --output_dir /path/to/intermediate/ 22 | 23 | # Create cross net files 24 | python ../Utils/create_snap_crossnet_table.py /path/to/intermediate/doid_disease_disease_parsed.tsv /path/to/mode/miner-disease-0-DOID-20160521.tsv /path/to/mode/miner-disease-0-DOID-20160521.tsv DOID 0 --output_dir /path/to/output/ 25 | -------------------------------------------------------------------------------- /Disease-Disease/parse_do_disease_disease.py: 
-------------------------------------------------------------------------------- 1 | ''' 2 | file : parse_do_disease_disease.py 3 | author: Viswajith Venugopal 4 | 5 | Parses the disease ontology OBO to create the 6 | edge table using the is_a relationship. 7 | 8 | Usage: 9 | python parse_do_disease_disease.py [--output_dir OUTPUT_DIR] 10 | 11 | Positional Arguments: 12 | input_file : The doid.obo file which contains the disease ontology. 13 | 14 | Optional Arugments: 15 | --output_dir : Directory to create output files. Defaults to the current working directory. 16 | 17 | Example Usage: 18 | Input File: doid.obo 19 | 20 | Output directory : outputs/disease-disease/ 21 | 22 | Comamnd line: 23 | python parse_do_disease_disease.py doid.obo --output_dir outputs/disease-disease/ 24 | 25 | Output: 26 | doid_disease_disease_parsed.tsv 27 | ''' 28 | 29 | from collections import defaultdict 30 | import os 31 | import argparse 32 | import pickle 33 | 34 | 35 | # In[79]: 36 | 37 | def parse_do_file_to_list(fname): 38 | """ 39 | Reads the disease ontology in obo format from file 40 | given by fname, and returns the ontology as a list 41 | of dictionaries, one dictionary per entry. 
42 | The dictionary for each entry is structured with 43 | the following fields 44 | { 45 | 'id' (The disease ontology id) 46 | 'name' 47 | 'def' 48 | 'synonym' 49 | 'alt_id' (A list of alternate DOID ids) 50 | 'xref' (A list of xrefs to MESH/OMIM ids) 51 | 'is_a' (A DOID of what this disease is) 52 | 53 | } 54 | """ 55 | f = open(fname, 'r') 56 | 57 | preamble = True # If we're in the top part of the file 58 | global_list = [] 59 | curr_node_dict = {} 60 | for line in f: 61 | if preamble: 62 | if line.startswith('[Term]'): 63 | preamble = False 64 | continue 65 | spline = line.strip().split() 66 | if len(spline) == 0: 67 | global_list.append(curr_node_dict) 68 | curr_node_dict = {} 69 | continue 70 | if spline[0] == 'id:': 71 | if not spline[1].startswith('DOID'): # This means we've reached the bottom part of the file. 72 | break 73 | curr_node_dict['id'] = spline[1] 74 | elif spline[0] == 'name:': 75 | curr_node_dict['name'] = ' '.join(spline[1:]) 76 | elif spline[0] == 'def:': 77 | curr_node_dict['def'] = ' '.join(spline[1:]) 78 | elif spline[0] == 'synonym:': 79 | curr_node_dict['synonym'] = ' '.join(spline[1:]) 80 | elif spline[0] == 'alt_id:': 81 | if 'alt_id' in curr_node_dict: 82 | curr_node_dict['alt_id'].append(spline[1]) 83 | else: 84 | curr_node_dict['alt_id'] = [spline[1]] 85 | elif spline[0] == 'is_a:': 86 | curr_node_dict['is_a'] = spline[1] 87 | elif spline[0] == 'xref:': 88 | if 'xref' in curr_node_dict: 89 | curr_node_dict['xref'].append(spline[1]) 90 | else: 91 | curr_node_dict['xref'] = [spline[1]] 92 | 93 | 94 | return global_list 95 | 96 | doid_to_mesh_dict = defaultdict(list) 97 | mesh_to_doid_dict = defaultdict(list) 98 | omim_to_doid_dict = defaultdict(list) 99 | doid_to_omim_dict = defaultdict(list) 100 | doid_equiv_dict = defaultdict(list) 101 | 102 | 103 | parser = argparse.ArgumentParser(description='Parse DOID to find disease-disease is-a edges.') 104 | parser.add_argument('input_file', help='Input file path. 
File should be the doid.obo file.') 105 | parser.add_argument('--output_dir', help='Directory to output files', default='.') 106 | args = parser.parse_args() 107 | 108 | output_fname = os.path.join(args.output_dir, "doid_disease_disease_parsed.tsv") 109 | 110 | # Get the Disease Ontology as a list of one dictionary per entry. 111 | do_list = parse_do_file_to_list(args.input_file) 112 | 113 | with open(output_fname, 'w') as out_f: 114 | out_f.write('# Parsed DOID file.\n# Columns are source id, dest id.\n') 115 | for entry in do_list: 116 | if 'is_a' in entry: 117 | out_f.write('\t'.join([entry['id'], entry['is_a']])) 118 | out_f.write('\n') 119 | -------------------------------------------------------------------------------- /Disease-Function/README.txt: -------------------------------------------------------------------------------- 1 | Current datasets containing disease-function information: 2 | - CTD 3 | 4 | Workflow for creating crossnet tables for disease-function relationships: 5 | 6 | Pre-requisites: 7 | Must have the disease modes table from CTD (MESH), and the function mode table from 8 | GO. 
9 | /path/to/disease_mode/miner-disease-1-CTD_MESH-20160521.tsv 10 | /path/to/function_mode/miner-function-0-GO-20160521.tsv 11 | 12 | Input files/directories: 13 | /path/to/input/CTD_dir (from CTD) 14 | 15 | Intermediate files: 16 | /path/to/intermediate/ctd_disease_func_parsed.tsv 17 | 18 | Output files: 19 | /path/to/output/miner-disease-function-20160521.tsv 20 | /path/to/output/miner-disease-function-0-CTD-20160521.tsv 21 | 22 | # Create intermediate files 23 | python make_disease_func_ctd.py /path/to/input/CTD_dir --output_dir /path/to/intermediate/ 24 | 25 | # Create cross net files 26 | python ../Utils/create_snap_crossnet_table.py /path/to/intermediate/ctd_disease_func_parsed.tsv /path/to/disease_mode/miner-disease-1-CTD_MESH-20160521.tsv /path/to/function_mode/miner-function-0-GO-20160521.tsv CTD 0 --output_dir /path/to/output/ --skip_missing_ids 27 | -------------------------------------------------------------------------------- /Disease-Function/make_disease_func_ctd.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file : make_disease_func_ctd.py 3 | author: Viswajith Venugopal 4 | 5 | Goes over the disease function file in CTD and creates a table with the disease id 6 | and GO id of the function. 7 | 8 | Usage: 9 | python make_disease_func_ctd [--output_dir OUTPUT_DIR] 10 | 11 | Positional Arguments: 12 | input_dir : The directory with all the CTD TSVs 13 | 14 | Optional Arugments: 15 | --output_dir : Directory to create output files. Defaults to the current working directory. 
16 | 17 | Example Usage: 18 | Input Dir: CTD/0416_CTD 19 | 20 | Output directory : outputs/disease_func/ 21 | 22 | Comamnd line: 23 | python make_disease_func_ctd.py CTD/0416_CTD --output_dir outputs/disease_func/ 24 | 25 | Output: 26 | ctd_disease_func_parsed.tsv 27 | ''' 28 | 29 | from collections import defaultdict 30 | import os 31 | import argparse 32 | 33 | def load_disease_functions_ctd(ctd_dir): 34 | f1 = open(os.path.join(ctd_dir, 'CTD_Phenotype-Disease_biological_process_associations.tsv'), 'r') 35 | f2 = open(os.path.join(ctd_dir, 'CTD_Phenotype-Disease_cellular_component_associations.tsv'), 'r') 36 | f3 = open(os.path.join(ctd_dir, 'CTD_Phenotype-Disease_molecular_function_associations.tsv'), 'r') 37 | global_list = [] 38 | linktype = "" 39 | for f in [f1, f2, f3]: 40 | if f==f1: 41 | linktype = "biological" 42 | elif f==f2: 43 | linktype = "cellular" 44 | else: 45 | linktype = "molecular" 46 | for line in f: 47 | if line.startswith('#'): 48 | continue 49 | sp_line = line.strip('\n').split('\t') 50 | disease_id = sp_line[3] 51 | go_id = sp_line[1] 52 | global_list.append((disease_id, go_id, linktype)) 53 | return global_list 54 | 55 | parser = argparse.ArgumentParser(description='Parse CTD to find disease-function links.') 56 | parser.add_argument('input_dir', help='Input files directory. 
This should be the directory with all the CTD TSVs.') 57 | parser.add_argument('--output_dir', help='Directory to output files', default='.') 58 | args = parser.parse_args() 59 | 60 | output_fname = os.path.join(args.output_dir, "ctd_disease_func_parsed.tsv") 61 | 62 | disease_func_list = load_disease_functions_ctd(args.input_dir) 63 | 64 | with open(output_fname, 'w') as out_f: 65 | out_f.write('#Disease Function links from CTD.\n') 66 | for (disease_id, go_id,linktype) in disease_func_list: 67 | out_f.write('\t'.join([disease_id, go_id,linktype])) 68 | out_f.write('\n') 69 | -------------------------------------------------------------------------------- /Disease-Gene/README.txt: -------------------------------------------------------------------------------- 1 | Current datasets containing disease-gene information: 2 | - CTD 3 | 4 | Workflow for creating crossnet tables for disease-gene relationships: 5 | 6 | Pre-requisites: 7 | Must have the disease modes table from CTD (MESH and OMIM), and the gene 8 | mode table from GO. 
9 | /path/to/disease_mode/miner-disease-1-CTD_MESH-20160521.tsv 10 | /path/to/disease_mode/miner-disease-2-CTD_OMIM-20160521.tsv 11 | /path/to/gene_mode/miner-gene-0-GO-20160521.tsv 12 | 13 | Input files/directories: 14 | /path/to/input/CTD_dir (from CTD) 15 | 16 | Intermediate files: 17 | /path/to/intermediate/ctd_disease_gene_parsed.tsv 18 | 19 | Output files: 20 | /path/to/output/miner-disease-gene-20160521.tsv 21 | /path/to/output/miner-disease-gene-0-CTD_MESH-20160521.tsv 22 | /path/to/output/miner-disease-gene-1-CTD_OMIM-20160521.tsv 23 | 24 | # Create intermediate files 25 | python make_disease_gene_ctd.py /path/to/input/CTD_dir --output_dir /path/to/intermediate/ 26 | 27 | # Create cross net files 28 | python ../Utils/create_snap_crossnet_table.py /path/to/intermediate/ctd_disease_gene_parsed.tsv /path/to/disease_mode/miner-disease-1-CTD_MESH-20160521.tsv /path/to/gene_mode/miner-gene-0-GO-20160521.tsv CTD_MESH 0 --output_dir /path/to/output/ --skip_missing_ids 29 | python ../Utils/create_snap_crossnet_table.py /path/to/intermediate/ctd_disease_gene_parsed.tsv /path/to/disease_mode/miner-disease-2-CTD_OMIM-20160521.tsv /path/to/gene_mode/miner-gene-0-GO-20160521.tsv CTD_OMIM 1 --output_dir /path/to/output/ --skip_missing_ids 30 | -------------------------------------------------------------------------------- /Disease-Gene/make_disease_gene_ctd.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file : make_disease_gene_ctd.py 3 | author: Viswajith Venugopal 4 | 5 | Parses CTD to find disease-gene edges. 6 | 7 | Usage: 8 | python make_disease_gene_ctd.py 9 | 10 | Positional Arguments: 11 | input_dir : The directory to the CTD folder. 12 | 13 | Optional Arugments: 14 | --output_dir : Directory to create output files. Defaults to the current working directory. 
15 | 16 | Example Usage: 17 | Input File: CTD/0416_CTD 18 | 19 | Output directory : outputs/disease-gene/ 20 | 21 | Comamnd line: 22 | python make_disease_gene_ctd.py --output_dir outputs/disease-gene/ 23 | 24 | Output: 25 | ctd_disease_gene_parsed.tsv 26 | ''' 27 | 28 | from collections import defaultdict 29 | import os 30 | import argparse 31 | 32 | def get_ncbi_to_uniprot(ctd_dir): 33 | ctd_gene_node_fname = os.path.join(ctd_dir, 'CTD_genes.tsv') 34 | ncbi_to_uniprot_dict = {} 35 | # First, we load uniprot ids. 36 | with open(ctd_gene_node_fname, 'r') as ctd_gene_node_f: 37 | for line in ctd_gene_node_f: 38 | if line.startswith('#'): 39 | continue 40 | sp_line = line.strip('\n').split('\t') 41 | ncbi_id = sp_line[2] 42 | uniprot_ids = sp_line[7] 43 | if len(uniprot_ids) > 0: 44 | uniprot_ids = uniprot_ids.split('|') 45 | ncbi_to_uniprot_dict[ncbi_id] = uniprot_ids 46 | 47 | return ncbi_to_uniprot_dict 48 | 49 | def parse_ctd_gene_diseases(ctd_dir): 50 | 51 | ncbi_to_uniprot_dict = get_ncbi_to_uniprot(ctd_dir) 52 | disease_gene_list = [] 53 | ctd_gene_dis_fname = os.path.join(ctd_dir, 'CTD_genes_diseases.tsv') 54 | with open(ctd_gene_dis_fname) as in_f: 55 | i = 0 56 | for line in in_f: 57 | i += 1 58 | if i % 100000 == 0: 59 | pass 60 | #print i 61 | if line.startswith('#'): 62 | continue 63 | sp_line = line.strip('\n').split('\t') 64 | ncbi_id = sp_line[1] 65 | if ncbi_id not in ncbi_to_uniprot_dict: 66 | continue 67 | disease_id = sp_line[3] 68 | iscore = sp_line[6] 69 | if iscore=="": 70 | iscore = 0 71 | for uniprot_id in ncbi_to_uniprot_dict[ncbi_id]: 72 | yield (disease_id, uniprot_id,iscore) 73 | 74 | 75 | parser = argparse.ArgumentParser(description='Parse CTD to find disease-gene links.') 76 | parser.add_argument('input_dir', help='Input files directory. 
This should be the directory with all the CTD TSVs.') 77 | parser.add_argument('--output_dir', help='Directory to output files', default='.') 78 | args = parser.parse_args() 79 | 80 | output_fname = os.path.join(args.output_dir, "ctd_disease_gene_parsed.tsv") 81 | 82 | 83 | with open(output_fname, 'w') as out_f: 84 | out_f.write('#Disease Gene links from CTD.\n') 85 | for (disease_id, uni_id,iscore) in parse_ctd_gene_diseases(args.input_dir): 86 | out_f.write('\t'.join([disease_id, uni_id,iscore])) 87 | out_f.write('\n') 88 | -------------------------------------------------------------------------------- /Disease-Gene/make_disease_gene_disgenet.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Title : make_disease_gene_disgenet.py 3 | Author: Farzaan Kaiyom 4 | 5 | Parses data from DisGeNET to find disease-gene edges. 6 | 7 | Usage: 8 | python make_disease_gene_disgenet.py 9 | 10 | Positional Arguments: 11 | input_dir : The directory to the CTD folder. 12 | 13 | Optional Arugments: 14 | --output_dir : Directory to create output files. Defaults to the current working directory. 
15 | 16 | Example Usage: 17 | Input File: CTD/0819_CTD 18 | 19 | Output directory : outputs/disease-gene/ 20 | 21 | Comamnd line: 22 | python make_disease_gene_disgenet.py --output_dir outputs/disease-gene/ 23 | 24 | Output: 25 | ctd_disease_gene_parsed2.tsv 26 | ''' 27 | 28 | from collections import defaultdict 29 | import os 30 | import argparse 31 | 32 | def get_DGN_to_MESH(DGN_dir): 33 | map_fname = os.path.join(DGN_dir, 'mapping_files.tsv') 34 | mesh_dict = {} 35 | with open(map_fname, 'r') as dismap: 36 | for line in dismap: 37 | if line.startswith('#'): 38 | continue 39 | trgt = line.strip('\n').split('|') 40 | DGNid = trgt[0] 41 | vocab = trgt[2] 42 | if vocab=='MESH' 43 | code = trgt[3] 44 | mesh_dict[DGNid]=code 45 | return mesh_dict 46 | 47 | def get_ncbi_to_uniprot(ctd_dir): 48 | ctd_gene_node_fname = os.path.join(ctd_dir, 'CTD_genes.tsv') 49 | ncbi_to_uniprot_dict = {} 50 | # First, we load uniprot ids. 51 | with open(ctd_gene_node_fname, 'r') as ctd_gene_node_f: 52 | for line in ctd_gene_node_f: 53 | if line.startswith('#'): 54 | continue 55 | sp_line = line.strip('\n').split('\t') 56 | ncbi_id = sp_line[2] 57 | uniprot_ids = sp_line[7] 58 | if len(uniprot_ids) > 0: 59 | uniprot_ids = uniprot_ids.split('|') 60 | ncbi_to_uniprot_dict[ncbi_id] = uniprot_ids 61 | 62 | return ncbi_to_uniprot_dict 63 | 64 | def parse_ctd_gene_diseases(ctd_dir,DGN_dir): 65 | 66 | ncbi_to_uniprot_dict = get_ncbi_to_uniprot(ctd_dir) 67 | dgn_dict = get_DGN_to_MESH(DGN_dir) 68 | disease_gene_list = [] 69 | ctd_gene_dis_fname = os.path.join(ctd_dir, 'CTD_genes_diseases.tsv') 70 | with open(ctd_gene_dis_fname) as in_f: 71 | i = 0 72 | for line in in_f: 73 | i += 1 74 | if i % 100000 == 0: 75 | pass 76 | #print i 77 | if line.startswith('#'): 78 | continue 79 | sp_line = line.strip('\n').split('\t') 80 | ncbi_id = sp_line[1] 81 | if ncbi_id not in ncbi_to_uniprot_dict: 82 | continue 83 | disease_id = sp_line[3] 84 | for uniprot_id in ncbi_to_uniprot_dict[ncbi_id]: 85 | yield 
(disease_id, uniprot_id) 86 | 87 | 88 | parser = argparse.ArgumentParser(description='Parse CTD to find disease-gene links.') 89 | parser.add_argument('input_dir1', help='Input files directory. This should be the directory with all the CTD TSVs.') 90 | parser.add_argument('input_dir2', help='Input files directory. This should be the directory with the DGN-MESH mapping.') 91 | parser.add_argument('--output_dir', help='Directory to output files', default='.') 92 | args = parser.parse_args() 93 | 94 | output_fname = os.path.join(args.output_dir, "ctd_disease_gene_parsed2.tsv") 95 | 96 | 97 | with open(output_fname, 'w') as out_f: 98 | out_f.write('#Disease Gene links from CTD.\n') 99 | for (disease_id, uni_id) in parse_ctd_gene_diseases(args.input_dir1,args.input_dir2): 100 | out_f.write('\t'.join([disease_id, uni_id])) 101 | out_f.write('\n') 102 | -------------------------------------------------------------------------------- /Disease/README.txt: -------------------------------------------------------------------------------- 1 | Current datasets containing disease information: 2 | - DiseaseOntology (has DOID ids, as well as cross-references to the MESH and 3 | OMIM ids) 4 | - CTD (uses MESH ids for some diseases, and OMIM ids for others) 5 | - OMIM 6 | 7 | Workflow for creating mode tables for diseases: 8 | 9 | Input files/directories: 10 | /path/to/input/doid.obo (from DOID) 11 | /path/to/input/CTD_diseases.tsv (from CTD) 12 | /path/to/input/OMIM/ (from OMIM. 
Note that this is the entire directory, since 13 | multiple files are required to identify diseases) 14 | 15 | Intermediate files: 16 | /path/to/intermediate/doid_parsed.tsv 17 | /path/to/intermediate/ctd_mesh_parsed.tsv 18 | /path/to/intermediate/ctd_omim_parsed.tsv 19 | /path/to/intermediate/omim_parsed.tsv 20 | /path/to/intermediate/doid_mesh_equiv.tsv 21 | /path/to/intermediate/doid_omim_equiv.tsv 22 | 23 | Output files: 24 | /path/to/output/miner-disease-20160521.tsv 25 | /path/to/output/miner-disease-0-DOID-20160521.tsv 26 | /path/to/output/miner-disease-1-CTD_MESH-20160521.tsv 27 | /path/to/output/miner-disease-3-OMIM-20160525.tsv 28 | /path/to/output/miner-disease-equiv-20160525.tsv 29 | 30 | # Create intermediate files 31 | python parse_do_diseases.tsv /path/to/input/doid.obo --output_dir /path/to/intermediate/ 32 | python parse_ctd_diseases.tsv /path/to/input/CTD_diseases.tsv --output_dir /path/to/intermediate/ 33 | python parse_omim_diseases.tsv /path/to/input/OMIM/ --output_dir /path/to/intermediate/ 34 | 35 | 36 | # Create mode files 37 | python ../Utils/create_snap_mode_table.py /path/to/intermediate/doid_parsed.tsv disease DOID 0 --output_dir /path/to/output/ 38 | python ../Utils/create_snap_mode_table.py /path/to/intermediate/ctd_mesh_parsed.tsv disease CTD_MESH 1 --output_dir /path/to/output/ 39 | python ../Utils/create_snap_mode_table.py /path/to/intermediate/ctd_omim_parsed.tsv disease CTD_OMIM 2 --output_dir /path/to/output/ 40 | python ../Utils/create_snap_mode_table.py /path/to/intermediate/omim_parsed.tsv disease OMIM 3 --output_dir /path/to/output/ 41 | 42 | # Create mode equivalence table 43 | python ../Utils/create_snap_mode_equiv_table.py /path/to/output/miner-disease-0-DOID-20160521.tsv /path/to/output/miner-disease-1-CTD_MESH-20160521.tsv --output_dir /path/to/output/ --mapping_file /path/to/intermediate/doid_mesh_equiv.tsv --skip_missing_ids 44 | python ../Utils/create_snap_mode_equiv_table.py 
/path/to/output/miner-disease-0-DOID-20160521.tsv /path/to/output/miner-disease-2-CTD_OMIM-20160521.tsv --output_dir /path/to/output/ --mapping_file /path/to/intermediate/doid_omim_equiv.tsv --skip_missing_ids 45 | python ../Utils/create_snap_mode_equiv_table.py /path/to/output/miner-disease-0-DOID-20160521.tsv /path/to/output/miner-disease-3-OMIM-20160521.tsv --output_dir /path/to/output/ --mapping_file /path/to/intermediate/doid_omim_equiv.tsv --skip_missing_ids 46 | -------------------------------------------------------------------------------- /Disease/parse_ctd_diseases.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file : parse_ctd_diseases.py 3 | author: Viswajith Venugopal 4 | 5 | Parses the CTD disease node table into a TSV we can use to build our SNAP mode table 6 | Creates two separate files, one for CTD diseases with a MESH id, and another for 7 | CTD diseases with an OMIM id. 8 | 9 | Usage: 10 | python parse_ctd_diseases.py [--output_dir OUTPUT_DIR] 11 | 12 | Positional Arguments: 13 | input_file : The CTD_diseases.tsv file which ships with CTD. 14 | 15 | Optional Arugments: 16 | --output_dir : Directory to create output files. Defaults to the current working directory. 
17 | 18 | Example Usage: 19 | Input File: CTD_diseases.tsv 20 | 21 | Output directory : outputs/diseases/ 22 | 23 | Comamnd line: 24 | python parse_ctd_diseases.py CTD_diseases.tsv --output_dir outputs/diseases/ 25 | 26 | Output: 27 | ctd_mesh_parsed.tsv 28 | ctd_omim_parsed.tsv 29 | ''' 30 | 31 | import os 32 | import argparse 33 | 34 | def parse_ctd_file_to_list(fname): 35 | """ 36 | Parses the ctd_diseases.tsv file, and returns it as a list 37 | of entries, each entry represented as a dictionary with structure: 38 | { 39 | 'name' 40 | 'id' 41 | 'alt_ids' (list of alternate disease ids) 42 | 'defs' 43 | 'parents' (list of parent ids) 44 | 'syns' 45 | } 46 | """ 47 | f = open(fname,'r') 48 | ctd_list = [] 49 | for line in f: 50 | if line.startswith('#'): 51 | continue 52 | spline = line.strip('\n').split('\t') 53 | name = spline[0] 54 | disease_id = spline[1] 55 | alt_ids = spline[2] 56 | defs = spline[3] 57 | parents = spline[4] 58 | syns = spline[7] 59 | if len(alt_ids) > 0: 60 | alt_ids = alt_ids.split('|') 61 | else: 62 | alt_ids = [] 63 | if len(parents) > 0: 64 | parents = parents.split('|') 65 | else: 66 | parents = [] 67 | ctd_list.append({ 68 | 'name': name, 69 | 'id': disease_id, 70 | 'alt_ids': alt_ids, 71 | 'defs': defs, 72 | 'parents': parents, 73 | 'syns': syns 74 | }) 75 | 76 | return ctd_list 77 | 78 | parser = argparse.ArgumentParser(description='Parse CTD to find diseases.') 79 | parser.add_argument('input_file', help='Input file path. 
File should be the CTD_diseases.tsv file.') 80 | parser.add_argument('--output_dir', help='Directory to output files', default='.') 81 | args = parser.parse_args() 82 | 83 | ctd_list = parse_ctd_file_to_list(args.input_file) 84 | mesh_output_fname = os.path.join(args.output_dir, "ctd_mesh_parsed.tsv") 85 | omim_output_fname = os.path.join(args.output_dir, "ctd_omim_parsed.tsv") 86 | 87 | 88 | with open(mesh_output_fname, 'w') as mesh_out_f: 89 | with open(omim_output_fname, 'w') as omim_out_f: 90 | mesh_out_f.write('# Parsed CTD diseases file with MESH ids.\n# Columns: id, name, definitions, synonyms.\n') 91 | omim_out_f.write('# Parsed CTD diseases file with OMIM ids.\n# Columns: id, name, definitions, synonyms.\n') 92 | 93 | for entry in ctd_list: 94 | 95 | # The string to write into the output file. 96 | str_to_write = '\t'.join([entry['id'], entry['name'], 97 | entry['defs'], entry['syns']]) 98 | 99 | # Find the id; in CTD, it can be either an OMIM or a MESH id. 100 | if entry['id'].startswith('MESH'): 101 | mesh_out_f.write(str_to_write + '\n') 102 | elif entry['id'].startswith('OMIM'): 103 | omim_out_f.write(str_to_write + '\n') 104 | -------------------------------------------------------------------------------- /Disease/parse_do_diseases.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file : parse_do_diseases.py 3 | author: Viswajith Venugopal 4 | 5 | Parses the disease ontology OBO. Creates the disease ontology node TSV, as well as 6 | files with mappings between DOIDs and the MESH and OMIM ids that there is a cross 7 | reference to in the Disease Ontology. 8 | 9 | Usage: 10 | python parse_do_diseases.py [--output_dir OUTPUT_DIR] 11 | 12 | Positional Arguments: 13 | input_file : The doid.obo file which contains the disease ontology. 14 | 15 | Optional Arugments: 16 | --output_dir : Directory to create output files. Defaults to the current working directory. 
17 | 18 | Example Usage: 19 | Input File: doid.obo 20 | 21 | Output directory : outputs/diseases/ 22 | 23 | Comamnd line: 24 | python parse_do_diseases.py doid.obo --output_dir outputs/diseases/ 25 | 26 | Output: 27 | do_parsed.tsv 28 | doid_mesh_equiv.tsv 29 | doid_omim_equiv.tsv 30 | ''' 31 | 32 | from collections import defaultdict 33 | import os 34 | import argparse 35 | 36 | 37 | # In[79]: 38 | 39 | def parse_do_file_to_list(fname): 40 | """ 41 | Reads the disease ontology in obo format from file 42 | given by fname, and returns the ontology as a list 43 | of dictionaries, one dictionary per entry. 44 | The dictionary for each entry is structured with 45 | the following fields 46 | { 47 | 'id' (The disease ontology id) 48 | 'name' 49 | 'def' 50 | 'synonym' 51 | 'alt_id' (A list of alternate DOID ids) 52 | 'xref' (A list of xrefs to MESH/OMIM ids) 53 | 'is_a' (A DOID of what this disease is) 54 | 55 | } 56 | """ 57 | f = open(fname, 'r') 58 | 59 | preamble = True # If we're in the top part of the file 60 | global_list = [] 61 | curr_node_dict = {} 62 | for line in f: 63 | if preamble: 64 | if line.startswith('[Term]'): 65 | preamble = False 66 | continue 67 | spline = line.strip().split() 68 | if len(spline) == 0: 69 | global_list.append(curr_node_dict) 70 | curr_node_dict = {} 71 | continue 72 | if spline[0] == 'id:': 73 | if not spline[1].startswith('DOID'): # This means we've reached the bottom part of the file. 
74 | break 75 | curr_node_dict['id'] = spline[1] 76 | elif spline[0] == 'name:': 77 | curr_node_dict['name'] = ' '.join(spline[1:]) 78 | elif spline[0] == 'def:': 79 | curr_node_dict['def'] = ' '.join(spline[1:]) 80 | elif spline[0] == 'synonym:': 81 | curr_node_dict['synonym'] = ' '.join(spline[1:]) 82 | elif spline[0] == 'alt_id:': 83 | if 'alt_id' in curr_node_dict: 84 | curr_node_dict['alt_id'].append(spline[1]) 85 | else: 86 | curr_node_dict['alt_id'] = [spline[1]] 87 | elif spline[0] == 'is_a:': 88 | curr_node_dict['is_a'] = spline[1] 89 | elif spline[0] == 'xref:': 90 | if 'xref' in curr_node_dict: 91 | curr_node_dict['xref'].append(spline[1]) 92 | else: 93 | curr_node_dict['xref'] = [spline[1]] 94 | 95 | 96 | return global_list 97 | 98 | doid_to_mesh_dict = defaultdict(list) 99 | mesh_to_doid_dict = defaultdict(list) 100 | omim_to_doid_dict = defaultdict(list) 101 | doid_to_omim_dict = defaultdict(list) 102 | doid_equiv_dict = defaultdict(list) 103 | 104 | 105 | parser = argparse.ArgumentParser(description='Parse DOID to find diseases.') 106 | parser.add_argument('input_file', help='Input file path. File should be the doid.obo file.') 107 | parser.add_argument('--output_dir', help='Directory to output files', default='.') 108 | args = parser.parse_args() 109 | 110 | output_fname = os.path.join(args.output_dir, "doid_parsed.tsv") 111 | doid_mesh_fname = os.path.join(args.output_dir, "doid_mesh_equiv.tsv") 112 | doid_omim_fname = os.path.join(args.output_dir, "doid_omim_equiv.tsv") 113 | #metadict_fname = os.path.join(args.output_dir, "meta_dict.pickle") 114 | 115 | # Get the Disease Ontology as a list of one dictionary per entry. 116 | do_list = parse_do_file_to_list(args.input_file) 117 | 118 | with open(output_fname, 'w') as out_f: 119 | out_f.write('# Parsed DOID file.\n# Columns are id, name, definition, synonym.\n') 120 | for entry in do_list: 121 | 122 | # To the doid table, we write all the info. 
123 | name = entry['name'] if 'name' in entry else '' 124 | definition = entry['def'] if 'def' in entry else '' 125 | synonym = entry['synonym'] if 'synonym' in entry else '' 126 | out_f.write('\t'.join([entry['id'], name, definition, synonym])) 127 | out_f.write('\n') 128 | 129 | # Populate dictionaries 130 | if 'alt_id' in entry: 131 | for alt_id in entry['alt_id']: 132 | doid_equiv_dict[entry['id']].append(alt_id) 133 | doid_equiv_dict[alt_id].append(entry['id']) 134 | 135 | if 'xref' in entry: 136 | for xref in entry['xref']: 137 | if xref.startswith('OMIM'): 138 | omim_to_doid_dict[xref].append(entry['id']) 139 | doid_to_omim_dict[entry['id']].append(xref) 140 | 141 | #updated for 2019 DOID dataset 142 | elif xref.startswith('MSH') or xref.startswith('MESH'): 143 | # For consistency, use MESH:id instead of MSH:id 144 | if xref.startswith('MSH'): 145 | mesh_id = xref[:1] + 'E' + xref[1:] 146 | else: 147 | mesh_id = xref[:1] + xref[1:] 148 | mesh_to_doid_dict[mesh_id].append(entry['id']) 149 | doid_to_mesh_dict[entry['id']].append(mesh_id) 150 | 151 | with open(doid_mesh_fname, 'w') as mesh_f: 152 | with open(doid_omim_fname, 'w') as omim_f: 153 | for doid in doid_to_mesh_dict: 154 | for mesh in doid_to_mesh_dict[doid]: 155 | mesh_f.write(doid + '\t' + mesh + '\n') 156 | for doid in doid_to_omim_dict: 157 | for omim in doid_to_omim_dict[doid]: 158 | omim_f.write(doid + '\t' + omim + '\n') 159 | """ 160 | meta_dict = { 161 | "doid_to_mesh_dict" : doid_to_mesh_dict, 162 | "mesh_to_doid_dict" : mesh_to_doid_dict, 163 | "omim_to_doid_dict" : omim_to_doid_dict, 164 | "doid_to_omim_dict" : doid_to_omim_dict, 165 | "doid_equiv_dict" : doid_equiv_dict, 166 | } 167 | pickle.dump(meta_dict, open(metadict_fname, 'w')) 168 | """ 169 | -------------------------------------------------------------------------------- /Disease/parse_omim_diseases.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file : parse_ctd_diseases.py 3 | 
author: Farzaan Kaiyom 4 | based on scripts by Viswajith Venugopal 5 | 6 | Parses the OMIM latest disease table found in genemap2.txt 7 | ^ *This is the latest formatting for OMIM data* ^ 8 | 9 | Usage: 10 | python parse_ctd_diseases.py [--output_dir OUTPUT_DIR] 11 | 12 | Positional Arguments: 13 | input_file : The directory containing all the OMIM files. 14 | 15 | Optional Arugments: 16 | --output_dir : Directory to create output files. Defaults to the current working directory. 17 | 18 | Example Usage: 19 | Input File: OMIM/08-2019/ 20 | 21 | Output directory : ../../output/diseases/ 22 | 23 | Comamnd line: 24 | python parse_ctd_diseases.py OMIM/08-2019/ --output_dir output/diseases/ 25 | 26 | Output: 27 | omim_parsed.tsv 28 | ''' 29 | 30 | import os 31 | import argparse 32 | 33 | def parse_omim_file_to_list(omim_dir): 34 | """ 35 | Takes the OMIM directory as an argument, and 36 | returns a list of diseases from OMIM. 37 | First, it goes over the mim2gene file, and stores 38 | the OMIM numbers which correspond to diseases (phenotype). 39 | Then, it goes over the genemap and produces a list of entries 40 | which correspond to diseases, one dictionary per entry. 41 | Each dictionary has the following structure: 42 | { 43 | 'id', 44 | 'cyto_loc', 45 | 'gene_symbols', 46 | 'gene_name', 47 | 'comments', 48 | 'phenotypes', 49 | 'mouse_symb' 50 | } 51 | """ 52 | 53 | mim2gene_f = open(os.path.join(omim_dir, 'mim2gene.txt'), 'r') 54 | genemap_f = open(os.path.join(omim_dir, 'genemap2.txt'), 'r') 55 | 56 | # The set of mim numbers corresponding to diseases. 57 | disease_mims = set() 58 | for line in mim2gene_f: 59 | if line.startswith('#'): 60 | continue 61 | sp_line = line.split('\t') 62 | mim_number = sp_line[0] 63 | mim_type = sp_line[1] 64 | if mim_type == 'phenotype': 65 | disease_mims.add(mim_number) 66 | 67 | omim_list = [] 68 | # Now, go over genemap and populate the list. 
69 | for line in genemap_f: 70 | if line.startswith('#'): 71 | continue 72 | 73 | sp_line = line.strip('\n').split('\t') 74 | mim_number = sp_line[5] 75 | if mim_number not in disease_mims: 76 | continue 77 | cyto_loc = sp_line[3] 78 | gene_symbols = sp_line[6] 79 | gene_name = sp_line[7] 80 | 81 | ensembl_id = sp_line[10] 82 | 83 | comments = sp_line[11] 84 | phenotypes = sp_line[12] 85 | mouse_gene_symbol = sp_line[13] 86 | omim_list.append({ 87 | 'id' : 'OMIM:' + mim_number, 88 | 'cyto_loc': cyto_loc, 89 | 'gene_symbols': gene_symbols, 90 | 'gene_name': gene_name, 91 | 'comments': comments, 92 | 'phenotypes': phenotypes, 93 | 'mouse_symb': mouse_gene_symbol 94 | }) 95 | return omim_list 96 | 97 | 98 | parser = argparse.ArgumentParser(description='Parse OMIM to find diseases.') 99 | parser.add_argument('input_dir', help='Input file dir. File should be the directory which contains all downloaded OMIM files.') 100 | parser.add_argument('--output_dir', help='Directory to output files', default='.') 101 | args = parser.parse_args() 102 | 103 | # Get the OMIM node table as a list of one dictionary per entry. 
104 | omim_list = parse_omim_file_to_list(args.input_dir) 105 | output_fname = os.path.join(args.output_dir, "omim_parsed.tsv") 106 | 107 | with open(output_fname, 'w') as out_f: 108 | out_f.write('Parsed OMIM file.\n Columns: id, phenotypes, gene_name, gene_symbols, cyto_loc') 109 | for entry in omim_list: 110 | out_f.write('\t'.join([entry['id'], entry['phenotypes'], 111 | entry['gene_name'], entry['gene_symbols'], entry['cyto_loc'], 112 | entry['mouse_symb']]) + '\n') 113 | -------------------------------------------------------------------------------- /Function-Function/README.txt: -------------------------------------------------------------------------------- 1 | Current datasets containing function-function interaction information: 2 | - GeneOntology (GO ids) 3 | 4 | Note that all the relevant tables can be generated using scripts found in the Utils directory. 5 | 6 | Workflow: 7 | 8 | Input files: 9 | /path/to/input/go.obo 10 | 11 | Output files: 12 | /path/to/output/miner-function-20160521.tsv 13 | /path/to/output/miner-function-0-GO-20160521.tsv 14 | /path/to/output/miner-function-function-20160521.tsv 15 | /path/to/output/miner-function-function-0-GO-20160521.tsv 16 | 17 | Intermediate files: 18 | /path/to/intermediate/go_parsed.tsv 19 | /path/to/intermediate/go_nodes.tsv 20 | 21 | # Create all the function mode files 22 | python ../Function-Function/parse_obo_for_functions.py /path/to/input/go.obo /path/to/intermediate/go_parsed.tsv 23 | 24 | python ../Utils/extract_unique_node_ids.py /path/to/intermediate/go_parsed.tsv /path/to/intermediate/go_nodes.tsv GO 0 1 25 | 26 | python ../Utils/create_snap_mode_table.py /path/to/intermediate/go_nodes.tsv function GO 0 --output_dir /path/to/output/ 27 | 28 | 29 | # Create crossnet files 30 | python ../Utils/create_snap_crossnet_table.py /path/to/input/go_parsed.tsv /path/to/output/miner-function-0-GO-20160521.tsv /path/to/output/miner-function-0-GO-20160521.tsv GO 0 --output_dir /path/to/output/ 31 | 32 | 
Scripts included: 33 | 34 | file: parse_obo_for_functions.py 35 | author: Sheila Ramaswamy(@sramas15) 36 | 37 | Script that parses the gene ontology obo file for the function-function edge list. 38 | 39 | Usage: 40 | python parse_obo_for_functions.py 41 | 42 | Positional Arguments: 43 | input_file_path: Path to the input file; Input file should be the GO obo file. 44 | output_file_path: Path to the output file; Will be a tsv with the following schema: 45 | \\t\\t 46 | 47 | 48 | 49 | Example usage: 50 | Creating files for function-function relationships using GeneOntology: 51 | 52 | Input files: /path/to/input/go.obo 53 | 54 | Output files: /path/to/output/functions.tsv 55 | 56 | Workflow: 57 | 58 | python parse_obo_for_functions.py /path/to/input/go.obo /path/to/output/functions.tsv 59 | 60 | 61 | -------------------------------------------------------------------------------- /Function-Function/parse_obo_for_functions.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file: parse_obo_for_functions.py 3 | author: Sheila Ramaswamy(@sramas15) 4 | 5 | Script that parses the gene ontology obo file for the function-function edge list. 6 | 7 | Usage: 8 | python parse_obo_for_functions.py 9 | 10 | Positional Arguments: 11 | input_file_path: Path to the input file; Input file should be the GO obo file. 
12 | output_file_path: Path to the output file; Will be a tsv with the following schema: 13 | \\t\\t 14 | 15 | 16 | 17 | Example usage: 18 | Creating files for function-function relationships using GeneOntology: 19 | 20 | Input files: /path/to/input/go.obo 21 | 22 | Output files: /path/to/output/functions.tsv 23 | 24 | Workflow: 25 | 26 | python parse_obo_for_functions.py /path/to/input/go.obo /path/to/output/functions.tsv 27 | ''' 28 | import argparse 29 | 30 | parser = argparse.ArgumentParser(description='Parses and create an edge list for go functions of the form \\t\\t') 31 | parser.add_argument('input_file', help='input file name. Should be an obo file') 32 | parser.add_argument('output_file', help='output file name. Will be a tsv') 33 | 34 | args = parser.parse_args() 35 | 36 | 37 | edge_terms = ['disjoint_from', 'consider', 'alt_id', 'id', 'relationship', 'intersection_of', 'is_a', 'replaced_by'] 38 | 39 | with open(args.input_file, 'r') as inF: 40 | with open(args.output_file, 'a') as outF: 41 | outF.write('# Function-function interactions from GO\n') 42 | outF.write('# GO_id1\tGO_id2\n') 43 | inTerm = False 44 | currNode = None 45 | for line in inF: 46 | line = line.strip() 47 | if line == '[Term]': 48 | inTerm = True 49 | continue 50 | if len(line) == 0: 51 | inTerm = False 52 | currNode = None 53 | continue 54 | if inTerm: 55 | if line[0:3] == 'id:': 56 | currNode = line[4:].strip() 57 | else: 58 | for term in edge_terms: 59 | if line.split(':')[0] == term: 60 | assert currNode is not None 61 | new_line = line[len(term)+1:].strip() 62 | if new_line[0:3] == 'GO:': 63 | attr = '-' 64 | dst_id = new_line.split(' ')[0] 65 | else: 66 | (attr, edge_id) = new_line.split('!')[0].split('GO:') 67 | attr = attr.strip() 68 | dst_id = 'GO:' + edge_id.strip() 69 | outF.write('%s\t%s\t%s\n' % (currNode, dst_id, term)) 70 | break 71 | 72 | 73 | -------------------------------------------------------------------------------- /Function/README.txt: 
-------------------------------------------------------------------------------- 1 | Current datasets containing function information: 2 | - GeneOntology (GO ids) 3 | 4 | Note that all the relevant tables can be generated using scripts found in the Utils directory. 5 | 6 | Workflow: 7 | 8 | Input files: 9 | /path/to/input/go.obo 10 | 11 | Output files: 12 | /path/to/output/miner-function-20160521.tsv 13 | /path/to/output/miner-function-0-GO-20160521.tsv 14 | 15 | Intermediate files: 16 | /path/to/intermediate/go_parsed.tsv 17 | /path/to/intermediate/go_nodes.tsv 18 | 19 | # Create all the function mode files 20 | python ../Function-Function/parse_obo_for_functions.py /path/to/input/go.obo /path/to/intermediate/go_parsed.tsv 21 | 22 | python ../Utils/extract_unique_node_ids.py /path/to/intermediate/go_parsed.tsv /path/to/intermediate/go_nodes.tsv GO 0 1 23 | 24 | python ../Utils/create_snap_mode_table.py /path/to/intermediate/go_nodes.tsv function GO 0 --output_dir /path/to/output/ 25 | -------------------------------------------------------------------------------- /Function/parse_obo_for_functions.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file: parse_obo_for_functions.py 3 | author: Sheila Ramaswamy(@sramas15) 4 | 5 | Script that parses the gene ontology obo file for the function-function edge list. 6 | 7 | Usage: 8 | python parse_obo_for_functions.py 9 | 10 | Positional Arguments: 11 | input_file_path: Path to the input file; Input file should be the GO obo file. 
12 | output_file_path: Path to the output file; Will be a tsv with the following schema: 13 | \\t\\t 14 | 15 | 16 | 17 | Example usage: 18 | Creating files for function-function relationships using GeneOntology: 19 | 20 | Input files: /path/to/input/go.obo 21 | 22 | Output files: /path/to/output/functions.tsv 23 | 24 | Workflow: 25 | 26 | python parse_obo_for_functions.py /path/to/input/go.obo /path/to/output/functions.tsv 27 | ''' 28 | import argparse 29 | 30 | parser = argparse.ArgumentParser(description='Parses and create an edge list for go functions of the form \\t\\t') 31 | parser.add_argument('input_file', help='input file name. Should be an obo file') 32 | parser.add_argument('output_file', help='output file name. Will be a tsv') 33 | 34 | args = parser.parse_args() 35 | 36 | 37 | edge_terms = ['disjoint_from', 'consider', 'alt_id', 'id', 'relationship', 'intersection_of', 'is_a', 'replaced_by'] 38 | 39 | lbls = ['name','desc','namespace','synonym'] 40 | 41 | with open(args.input_file, 'r') as inF: 42 | with open(args.output_file, 'a') as outF: 43 | outF.write('# Function-function interactions from GO\n') 44 | outF.write('# GO_id1\tname\tdesc\tnamespace\tsynonym\n') 45 | inTerm = True 46 | currNode = None 47 | passed = False 48 | for line in inF: 49 | line = line.strip() 50 | #if line == '[Term]': 51 | #inTerm = True 52 | #continue 53 | if len(line) == 0: 54 | #inTerm = False 55 | #currNode = None 56 | continue 57 | if inTerm: 58 | 59 | if line[0:3] == 'id:': 60 | if passed: 61 | for lbl in lbls: 62 | if lbl not in linedict.keys(): 63 | linedict[lbl]='N/A' 64 | outF.write('%s\t%s\t%s\t%s\t%s\n' % (linedict['id'], linedict['name'],linedict['desc'],linedict['namespace'],linedict['synonym'])) 65 | linedict = {} 66 | linedict['id']= line[4:].strip() 67 | currNode = linedict['id'] 68 | passed = True 69 | else: 70 | if line.split(':')[0] in lbls: 71 | term = line.split(':')[0] 72 | assert currNode is not None 73 | new_line = line[len(term)+1:].strip() 74 | if 
'EXACT []' in new_line: 75 | new_line = new_line.replace('EXACT []','') 76 | linedict[term] = new_line 77 | #if new_line[0:3] == 'GO:': 78 | # attr = '-' 79 | # dst_id = new_line.split(' ')[0] 80 | #else: 81 | # (attr, edge_id) = new_line.split('!')[0].split('GO:') 82 | # attr = attr.strip() 83 | # dst_id = 'GO:' + edge_id.strip() 84 | #outF.write('%s\t%s\t%s\n' % (currNode, dst_id, term)) 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /Gene-Function/README.txt: -------------------------------------------------------------------------------- 1 | Current datasets containing gene-function information: 2 | - GeneOntology (Uniprot ids to GO ids) 3 | 4 | Note that all the relevant tables can be generated using scripts found in the Utils directory. 5 | 6 | Workflow: 7 | 8 | Input files: 9 | /path/to/input/gene_association.goa_human (from GO) 10 | /path/to/input/go.obo 11 | 12 | Output files: 13 | /path/to/output/miner-gene-20160521.tsv 14 | /path/to/output/miner-gene-0-GO-20160521.tsv 15 | /path/to/output/miner-function-20160521.tsv 16 | /path/to/output/miner-function-0-GO-20160521.tsv 17 | /path/to/output/miner-gene-function-20160521.tsv 18 | /path/to/output/miner-gene-function-0-GO-20160521.tsv 19 | 20 | Intermediate files: 21 | /path/to/intermediate/go_parsed.tsv 22 | /path/to/intermediate/go_nodes.tsv 23 | 24 | # First create all the gene mode files 25 | 26 | python ../Utils/create_snap_mode_table.py /path/to/input/gene_association.goa_human gene GO 0 --output_dir /path/to/output/ --node_index 1 27 | 28 | 29 | # Second create all the function mode files 30 | python ../Function-Function/parse_obo_for_functions.py /path/to/input/go.obo /path/to/intermediate/go_parsed.tsv 31 | 32 | python ../Utils/extract_unique_node_ids.py /path/to/intermediate/go_parsed.tsv /path/to/intermediate/go_nodes.tsv GO 0 1 33 | 34 | python ../Utils/create_snap_mode_table.py /path/to/intermediate/go_nodes.tsv function GO 0 --output_dir 
/path/to/output/ 35 | 36 | # Create crossnet files 37 | python ../Utils/create_snap_crossnet_table.py /path/to/input/gene_association.goa_human /path/to/output/miner-gene-0-GO-20160521.tsv /path/to/output/miner-function-0-GO-20160521.tsv GO 0 --output_dir /path/to/output/ --src_node_index 1 --dst_node_index 4 38 | 39 | -------------------------------------------------------------------------------- /Gene-Protein/README.txt: -------------------------------------------------------------------------------- 1 | Current datasets containing gene-protein interaction information: 2 | - ENSEMBL/BioMart server (data NOT on ilfs2) 3 | 4 | Workflow: 5 | 6 | Input Files: 7 | /path/to/input/hgnc_complete_set.txt 8 | /path/to/input/protein.links.full.v10.txt 9 | 10 | Output Files: 11 | /path/to/output/miner-gene-20160521.tsv 12 | /path/to/output/miner-gene-2-HUGO_ENSEMBL-20160521.tsv 13 | /path/to/output/miner-protein-20160521.tsv 14 | /path/to/output/miner-protein-0-STRING-20160521.tsv 15 | /path/to/output/miner-gene-protein-20160521.tsv 16 | /path/to/output/miner-gene-protein-0-ENSEMBL-20160521.tsv 17 | 18 | Intermediate Files: 19 | /path/to/intermediate/ensembl_mapping.tsv 20 | /path/to/intermediate/protein-STRING-edgelist.tsv 21 | /path/to/intermediate/protein-STRING-nodelist.tsv 22 | 23 | # Get the mapping file from biomart 24 | python fetch_ensembl_id_mapping /path/to/intermediate/ensembl_mapping.tsv 25 | 26 | # Get the gene mode table 27 | python ../Utils/create_snap_mode_table.py /path/to/input/hgnc_complete_set.txt gene HUGO_ENSEMBL 2 --output_dir /path/to/output/ --node_index 19 28 | 29 | # Extract the edge list from the full protein interactions file; potentially may have to change 30 | # the divider default value in the script (assume src and dst columns are 1 and 5) 31 | python ../Utils/extract_edge_list.py /path/to/input/protein.links.full.v10.txt /path/to/intermediate/protein-STRING-edgelist.tsv STRING 1 5 32 | 33 | # Extract the unique protein ids from the edge 
list (columns 0 and 1) 34 | python ../Utils/extract_unique_node_ids.py /path/to/intermediate/protein-STRING-edgelist.tsv /path/to/intermediate/protein-STRING-nodelist.tsv STRING 0 1 35 | 36 | # Create the protein mode files 37 | python ../Utils/create_snap_mode_table.py /path/to/intermediate/protein-STRING-nodelist.tsv protein STRING 0 --output_dir /path/to/output/ 38 | 39 | # Create the CrossNet tables 40 | python ../Utils/create_snap_crossnet_table.py /path/to/intermediate/ensembl_mapping.tsv /path/to/output/miner-gene-2-HUGO_ENSEMBL-20160521.tsv /path/to/output/miner-protein-0-STRING-20160521.tsv ENSEMBL 0 --output_dir /path/to/output/ --skip_missing_ids --dst_mode_filter add_species_id 41 | 42 | 43 | Scripts Included: 44 | 45 | file: fetch_ensembl_id_mapping.py 46 | author: Sheila Ramaswamy(@sramas15) 47 | 48 | Connects to ENSEMBL server to fetch id mapping, using the biomart python library. 49 | 50 | Dependencies: 51 | biomart python library (https://pypi.python.org/pypi/biomart/0.9.0) 52 | 53 | Usage: 54 | python parse_obo_for_functions.py 55 | 56 | Positional Arguments: 57 | output_file_path: Path to the output file; Will be a tsv with the following schema: 58 | \\t 59 | 60 | 61 | Example usage: 62 | 63 | Input files: None 64 | 65 | Output files: /path/to/output/gene-protein-mapping.tsv 66 | 67 | Workflow: 68 | 69 | python fetch_ensembl_id_mapping.py /path/to/output/gene-protein-mapping.tsv 70 | 71 | 72 | -------------------------------------------------------------------------------- /Gene-Protein/fetch_ensembl_id_mapping.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file: fetch_ensembl_id_mapping.py 3 | author: Sheila Ramaswamy(@sramas15) 4 | 5 | Connects to ENSEMBL server to fetch id mapping, using the biomart python library. 
6 | 7 | Dependencies: 8 | biomart python library (https://pypi.python.org/pypi/biomart/0.9.0) 9 | 10 | Usage: 11 | python parse_obo_for_functions.py 12 | 13 | Positional Arguments: 14 | output_file_path: Path to the output file; Will be a tsv with the following schema: 15 | \\t 16 | 17 | 18 | Example usage: 19 | 20 | Input files: None 21 | 22 | Output files: /path/to/output/gene-protein-mapping.tsv 23 | 24 | Workflow: 25 | 26 | python fetch_ensembl_id_mapping.py /path/to/output/gene-protein-mapping.tsv 27 | ''' 28 | import argparse 29 | from biomart import BiomartServer 30 | 31 | parser = argparse.ArgumentParser(description='Get ensembl gene and peptide mapping from biomart') 32 | parser.add_argument('output_file', help='output file name. Will be a tsv') 33 | args = parser.parse_args() 34 | 35 | def main(newfile): 36 | atts = ['ensembl_gene_id', 'ensembl_peptide_id'] 37 | url = 'http://www.ensembl.org/biomart' 38 | server = BiomartServer(url) 39 | hge = server.datasets['hsapiens_gene_ensembl'] 40 | with open(newfile, 'w') as outF: 41 | s = hge.search({'attributes': atts}, header=0) 42 | for l in s.iter_lines(): 43 | (gene_id, peptide_id) = l.split('\t') 44 | if len(peptide_id) > 0: 45 | outF.write('%s\t%s\n' % (gene_id.strip(), peptide_id.strip())) 46 | 47 | main(args.output_file) 48 | -------------------------------------------------------------------------------- /Gene/README.txt: -------------------------------------------------------------------------------- 1 | Current datasets containing gene information: 2 | - GeneOntology (Uniprot ids *) 3 | - HUGO (contains ENSEMBL gene ids and Uniprot ids *) 4 | 5 | * Uniprot ids technically protein ids, but they seem to be used interchangeably in these datasets. 6 | 7 | Note that all the relevant tables can be generated using scripts found in the Utils directory. 
8 | 9 | Workflow for creating mode tables for genes 10 | 11 | Input files: 12 | /path/to/input/hgnc_complete_set.txt (from HUGO) 13 | /path/to/input/goa_human.gaf (from GO) 14 | /path/to/input/goa_human_complex.gaf (from GO) 15 | /path/to/input/goa_human_isoform.gaf (from GO) 16 | /path/to/input/goa_human_rna.gaf (from GO) 17 | 18 | Output files: 19 | /path/to/output/miner-gene-20160521.tsv 20 | /path/to/output/miner-gene-0-GO-20160521.tsv 21 | /path/to/output/miner-gene-1-HUGO_Uniprot-20160521.tsv 22 | /path/to/output/miner-gene-2-HUGO_ENSEMBL-20160521.tsv 23 | 24 | 25 | # Create mode files 26 | python ../Utils/create_snap_mode_table.py /path/to/input/goa_human.gaf gene GO 0 --output_dir /path/to/output/ --node_index 1 27 | python ../Utils/create_snap_mode_table.py /path/to/input/goa_human_complex.gaf gene GOcomplex 1 --output_dir /path/to/output/ --node_index 1 28 | python ../Utils/create_snap_mode_table.py /path/to/input/goa_human_isoform.gaf gene GOisoform 2 --output_dir /path/to/output/ --node_index 1 29 | python ../Utils/create_snap_mode_table.py /path/to/input/goa_human_rna.gaf gene GOrna 3 --output_dir /path/to/output/ --node_index 1 30 | python ../Utils/create_snap_mode_table.py /path/to/input/hgnc_complete_set.txt gene HUGO_Uniprot 4 --output_dir /path/to/output/ --node_index 25 31 | python ../Utils/create_snap_mode_table.py /path/to/input/hgnc_complete_set.txt gene HUGO_ENSEMBL 5 --output_dir /path/to/output/ --node_index 19 32 | 33 | # Create mode equivalence table 34 | python../Utils/create_snap_mode_equiv_table.py /path/to/output/miner-gene-0-GO-20160521.tsv /path/to/output/miner-gene-1-HUGO_Uniprot-20160521.tsv --output_dir /path/to/output/ 35 | 36 | python../Utils/create_snap_mode_equiv_table.py /path/to/output/miner-gene-2-HUGO_ENSEMBL-20160521.tsv /path/to/output/miner-gene-1-HUGO_Uniprot-20160521.tsv --output_dir /path/to/output/ --mapping_file /path/to/input/hgnc_complete_set.txt --ds1_node_index 19 --ds2_node_index 25 37 | 
-------------------------------------------------------------------------------- /Protein-Protein/README.txt: -------------------------------------------------------------------------------- 1 | Current datasets relevant for getting protein-protein interactions: 2 | 3 | - STRING v10; uses ENSEMBL peptide ids 4 | 5 | Note that all the relevant tables can be generated using scripts found in the Utils directory. 6 | 7 | ***** IMPORTANT: STRING v10 contains 1,847,117,370 edges and therefore takes a long time to process. 8 | when adding interactions from another dataset (to the same miner-protein-protein-20160521.tsv 9 | file), please use the snap_id_counter_start argument to set the starting snap id to 10 | 1,847,117,370; otherwise, the entire file is read to get a starting id. 11 | 12 | How to create crossnet tables from the STRING database (assumes mode tables not already created): 13 | 14 | Input Files (from STRING): 15 | /path/to/input/protein.links.full.v10.txt 16 | 17 | Output Files: 18 | /path/to/output/miner-protein-20160521.tsv 19 | /path/to/output/miner-protein-0-STRING-20160521.tsv 20 | /path/to/output/miner-protein-protein-20160521.tsv 21 | /path/to/output/miner-protein-protein-0-STRING-20160521.tsv 22 | 23 | Intermediate Files: 24 | /path/to/intermediate/protein-STRING-edgelist.tsv 25 | /path/to/intermediate/protein-STRING-nodelist.tsv 26 | 27 | # Extract the edge list from the full protein interactions file; potentially may have to change 28 | # the divider default value in the script (assume src and dst columns are 1 and 5) 29 | python ../Utils/extract_edge_list.py /path/to/input/protein.links.full.v10.txt /path/to/intermediate/protein-STRING-edgelist.tsv STRING 1 5 30 | 31 | # Extract the unique protein ids from the edge list (columns 0 and 1) 32 | python ../Utils/extract_unique_node_ids.py /path/to/intermediate/protein-STRING-edgelist.tsv /path/to/intermediate/protein-STRING-nodelist.tsv STRING 0 1 33 | 34 | # Create the mode files 35 | python 
../Utils/create_snap_mode_table.py /path/to/intermediate/protein-STRING-nodelist.tsv protein STRING 0 --output_dir /path/to/output/ 36 | 37 | # Create the crossnet files 38 | python ../Utils/create_snap_crossnet_table.py /path/to/intermediate/protein-STRING-edgelist.tsv /path/to/output/miner-protein-0-STRING-20160521.tsv /path/to/output/miner-protein-0-STRING-20160521.tsv STRING 0 --output_dir /path/to/output/ 39 | 40 | -------------------------------------------------------------------------------- /Protein/README.txt: -------------------------------------------------------------------------------- 1 | Current datasets relevant for getting protein ids: 2 | 3 | - STRING v10; uses ENSEMBL peptide ids 4 | 5 | Note that all the relevant tables can be generated using scripts found in the Utils directory. 6 | 7 | How to create mode tables from the STRING database: 8 | 9 | Input Files (from STRING): 10 | /path/to/input/protein.links.full.v10.txt 11 | 12 | Output Files: 13 | /path/to/output/miner-protein-20160521.tsv 14 | /path/to/output/miner-protein-0-STRING-20160521.tsv 15 | 16 | Intermediate Files: 17 | /path/to/intermediate/protein-STRING-edgelist.tsv 18 | /path/to/intermediate/protein-STRING-nodelist.tsv 19 | 20 | # Extract the edge list from the full protein interactions file; potentially may have to change 21 | # the divider default value in the script (assume src and dst columns are 0 and 1) 22 | python ../Utils/extract_edge_list.py /path/to/input/protein.links.full.v10.txt /path/to/intermediate/protein-STRING-edgelist.tsv STRING 0 1 --divider " " 23 | 24 | # Extract the unique protein ids from the edge list (columns 0 and 1) 25 | python ../Utils/extract_unique_node_ids.py /path/to/intermediate/protein-STRING-edgelist.tsv /path/to/intermediate/protein-STRING-nodelist.tsv STRING 0 1 26 | 27 | # Create the mode files 28 | python ../Utils/create_snap_mode_table.py /path/to/intermediate/protein-STRING-nodelist.tsv protein STRING 0 --output_dir /path/to/output/ 29 | 
30 | # If multiple datasets used, and there is a mapping between the ids, please use the 31 | # create_snap_mode_equiv_table.py script. 32 | 33 | -------------------------------------------------------------------------------- /Protein/add_organism.py: -------------------------------------------------------------------------------- 1 | inFNm = 'STRING_protein_nodelist.tsv' 2 | outFNm = 'STRING_nodelist.tsv' 3 | with open(inFNm, 'r') as inF: 4 | with open(outFNm, 'a') as outF: 5 | for line in inF: 6 | cur = str(line).replace('\n','') 7 | organism = cur.split('.')[0] 8 | cur = cur + '\t' + organism 9 | outF.write(cur+'\n') 10 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | Scripts used to extract nodes and edges from MINER data in 2016 and later in 2019 2 | 3 | 4 | The Utils directory consists of general scripts that can be used to process multiple datasets. 
5 | 6 | In addition, there are directories (with readmes on how to generate the respective snap tables) for each of the following Modes and CrossNets: 7 | 8 | Modes 9 | - Genes 10 | - HUGO (http://www.genenames.org/cgi-bin/statistics) 11 | - GeneOntology (http://geneontology.org/page/download-go-annotations) 12 | - Proteins 13 | - STRING (http://string-db.org/cgi/download.pl) 14 | - Functions 15 | - GeneOntology (http://geneontology.org/page/download-ontology) 16 | - Chemicals 17 | - Drugbank (http://www.drugbank.ca/) 18 | - Diseases 19 | - DiseaseOntology (http://disease-ontology.org/) 20 | - CTD (http://ctdbase.org) 21 | - OMIM (http://www.omim.org/) 22 | 23 | CrossNets 24 | - Gene-Protein 25 | - ENSEMBL Genes, Human genes (http://www.ensembl.org/biomart/martview) 26 | - Protein-Protein 27 | - STRING (http://string-db.org/cgi/download.pl) 28 | - Gene-Function 29 | - GeneOntology (http://geneontology.org/docs/download-go-annotations/) 30 | - Function-Function 31 | - GeneOntology (http://geneontology.org/page/download-ontology) 32 | - Chemical-Chemical 33 | - Drugbank (http://www.drugbank.ca/) 34 | - Chemical-Gene 35 | - Drugbank (http://www.drugbank.ca/) 36 | - Disease-Disease 37 | - DiseaseOntology (http://disease-ontology.org/) 38 | - Disease-Gene 39 | - CTD (http://ctdbase.org) 40 | - Disease-Chemical 41 | - CTD (http://ctdbase.org) 42 | - Disease-Function 43 | - CTD (http://ctdbase.org) 44 | 45 | New Datasets can be found at /dfs/scratch2/MINER-BIO/data-miner-201908. 46 | Old Datasets can be found at /dfs/scratch2/MINER-BIO/data-miner. 
47 | 48 | The latest graph can be found at /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/snap 49 | 50 | Here's a quick look at the new miner dataset: 51 | ------------------------------- 52 | Modes | Nodes 53 | ------------------------------- 54 | Chemical | 13,339 55 | Protein | 22,406,877 56 | Gene | 106,536 57 | Function | 48,969 58 | Disease | 25,969 59 | 60 | -------------------------------------- 61 | Cross-Nets | Edges 62 | -------------------------------------- 63 | Chemical-Chemical | 2,712,183 64 | Chemical-Gene | 20,644 65 | Function-Function | 249,828 66 | Gene-Function | 481,543 67 | Gene-Protein | 18,650 68 | Disease-Disease | 9,383 69 | Disease-Gene | 64,109,210 70 | Disease-Function | 2,138,340 71 | Disease-Chemical | 2,643,750 72 | Protein-Protein | 2,147,483,643 73 | 74 | The old miner-dataset at a glance: 75 | ------------------------------- 76 | Modes | Nodes 77 | ------------------------------- 78 | Chemical | 11,367 79 | Protein | 8,254,694 80 | Gene | 104,004 81 | Function | 46,564 82 | Disease | 22,299 83 | 84 | -------------------------------------- 85 | Cross-Nets | Edges 86 | -------------------------------------- 87 | Chemical-Chemical | 95,246 88 | Chemical-Gene | 15,424 89 | Function-Function | 119,464 90 | Gene-Function | 481,733 91 | Gene-Protein | 17,930 92 | Disease-Disease | 6,877 93 | Disease-Gene | 42,475,361 94 | Disease-Function | 784,457 95 | Disease-Chemical | 1,334,088 96 | Protein-Protein | 1,847,117,370 97 | -------------------------------------------------------------------------------- /Utils/README.txt: -------------------------------------------------------------------------------- 1 | This directory creates generic scripts than can be used to create snap-formatted tsvs for modes 2 | and crossnets. This directory currently consists of three files: 3 | 4 | 1. 
create_snap_mode_table.py 5 | Input: - original dataset, in tsv form 6 | Output: - Snap mode table tsv (snap_nid\tdataset_id) 7 | - dataset specific mode snap table tsv (snap_nid\tdataset_entity_id) 8 | 2. create_snap_crossnet_table.py 9 | Input: - Source dataset specific snap mode table tsv 10 | - Destination dataset specific snap mode table tsv 11 | - the dataset (in tsv form) specifying edges. 12 | Output: - Snap crossnet table tsv (snap_eid\tdataset_id\tsnap_src_nid\tsnap_dst_nid) 13 | - he dataset specific snap table tsv (snap_eid\tsrc_dataset_id\tdst_dataset_id) 14 | 3. create_snap_mode_equiv_table.py 15 | Input: - First dataset specific snap mode table tsv 16 | - Second dataset specific snap mode table tsv 17 | - (Optional) the dataset (in tsv form) specifying id equivalences 18 | Output: - Snap mode equivlance table (snap_nid\tsnap_nid) 19 | 20 | 21 | It also contains scripts to pull out unique node ids and create an edge list (i.e. remove 22 | extraneous fields, but can also handle input lines that model many-to-many, many-to-1, and 23 | 1-to-many relationships): 24 | 25 | 4. extract_unique_node_ids.py 26 | Description: Extracts node ids from a tsv and writes all the unique ids to a tsv. 27 | Input: - dataset, in tsv form *** 28 | Output: - tsv, where each line contains a single node id 29 | 5. extract_edge_list.py 30 | Description: Extracts src and dst node ids from a file, creates a tsv edge list. Can process 31 | 1-to-1, 1-to-many, many-to-1, and many-to-many relationships in input file. 32 | Input: - dataset, in tsv form *** 33 | Output: - tsv, where each line contains the source node id and the destination node id. 34 | 35 | *** If file doesn't use tabs to separate the fields in a line, you can change the divider default 36 | argument value in the script. 37 | 38 | 39 | Below are details on the arguments and usage for each script (taken from the header of each file): 40 | 41 | ** Note that these descriptions may be out-of-date. 
Please read the header of the respective script to get the most up-to-date information about its command line arguments, etc. 42 | 43 | ######################################## 44 | ### create_snap_mode_table.py ### 45 | ######################################## 46 | 47 | Script that creates snap tables for a given mode. 48 | 49 | Usage: 50 | python create_snap_mode_table.py 51 | 52 | Positional Arguments: 53 | input_file: Path to the input file; Input file should be a tsv. 54 | mode_name: Name of the mode being created e.g. genes 55 | dataset_name: Name of dataset being used to create the snap mode tables i.e. the 56 | dataset the input file comes from. e.g. STRING 57 | dataset_id: unique integer id for this dataset. 58 | 59 | 60 | Optional arguments: 61 | --node_index: If there are multiple columns in the input tsv, the index of the column with the node id. 62 | Defaults to 0. 63 | --output_dir: Directory to create output files. Defaults to the current working directory. 64 | --full_mode_file: Name of output file tsv containing a list of \t. 65 | Defaults to output_dir/miner--.tsv 66 | --db_node_file: Name of output file tsv for a specific dataset; contains a list of \t 67 | Defaults to output_dir/miner----.tsv 68 | --snap_id_counter_start Start assigning snap ids from this integer value; this number MUST be greater 69 | than any id found in the full mode file. If not specified, finds the max id in the 70 | full_mode_file. 
71 | 72 | Example usage: 73 | Creating files for genes using two datasets, GeneOntology and HUGO: 74 | 75 | Input files: hugo.tsv and go.tsv 76 | 77 | Output directory: outputs/genes/ 78 | 79 | Output files: miner-gene-20160520.tsv, miner-gene-0-GO-20160520.tsv, miner-gene-1-HUGO-20160520.tsv 80 | 81 | Workflow: 82 | 83 | python create_snap_mode_table.py go.tsv gene GO 0 --output_dir outputs/genes/ 84 | python create_snap_mode_table.py hugo.tsv gene HUGO 1 --output_dir outputs/genes/ 85 | 86 | 87 | ############################################ 88 | ### create_snap_crossnet_table.py ### 89 | ############################################ 90 | 91 | Script that creates snap tables for a given crossnet. 92 | 93 | Usage: 94 | python create_snap_crossnet_table.py 95 | 96 | Positional Arguments: 97 | input_file Path to the input file; Input file should be a tsv. 98 | src_file Path to a dataset specific file, as outputted by create_snap_mode_table.py, 99 | corresponding to the source mode. File name MUST MATCH FORMAT: 100 | miner----.tsv 101 | dst_file Path to a dataset specific file, as outputted by create_snap_mode_table.py, 102 | corresponding to the destination mode. File name MUST MATCH FORMAT: 103 | miner----.tsv 104 | dataset_name: Name of dataset being used to create the snap crossnet tables i.e. the 105 | dataset the input file comes from. e.g. STRING 106 | dataset_id: unique integer id for this dataset. 107 | 108 | 109 | Optional arguments: 110 | --src_node_index: If there are multiple columns in the input tsv, the index of the column with the src node id. 111 | Defaults to 0. 112 | --dst_node_index: If there are multiple columns in the input tsv, the index of the column with the dst node id. 113 | Defaults to 1. 114 | --output_dir: Directory to create output files. Defaults to the current working directory. 115 | --full_crossnet_file: Name of output file tsv containing a list of \t. 
116 | Defaults to output_dir/miner---.tsv 117 | --db_edge_file: Name of output file tsv for a specific dataset; contains a list of \t 118 | Defaults to output_dir/miner-----.tsv 119 | --snap_id_counter_start Start assigning snap ids from this integer value; this number MUST be greater 120 | than any id found in the full crossnet file. If not specified, finds the max id in the 121 | full_crossnet_file. 122 | --skip_missing_ids Flag; If any of the ids in the input tsv do not have snap ids (which are fetched from 123 | the src and dst files), skip the line and continue parsing the data. 124 | --src_mode_filter The name of a function in utils.py that should be applied to the source node id in 125 | in the input file before using it to look up the snap id in the src_file. Defaults to None. 126 | --dst_mode_filter The name of a function in utils.py that should be applied to the destination node id in 127 | in the input file before using it to look up the snap id in the dst_file. Defaults to None. 128 | 129 | Example usage: 130 | Creating files for genes-function relationships using GeneOntology: 131 | 132 | Input files: go.tsv, miner-gene-0-GO-20160520.tsv, miner-function-0-GO-20160520.tsv 133 | 134 | Output directory: outputs/genes-functions/ 135 | 136 | Output files: miner-gene-function-20160520.tsv, miner-gene-function-0-GO-20160520.tsv 137 | 138 | Workflow: 139 | 140 | python create_snap_crossnet_table.py go.tsv miner-gene-0-GO-20160520.tsv miner-function-0-GO-20160520.tsv GO 0 --output_dir outputs/genes-functions/ 141 | 142 | 143 | ############################################## 144 | ### create_snap_mode_equiv_table.py ### 145 | ############################################## 146 | 147 | Script that creates snap equivalence table between two datasets for a given mode. 
148 | 149 | Usage: 150 | python create_snap_mode_equiv_table.py 151 | 152 | Positional Arguments: 153 | dataset1_file Path to a dataset specific file, as outputted by create_snap_mode_table.py, 154 | corresponding to the source mode. File name MUST MATCH FORMAT: 155 | miner----.tsv 156 | dataset2_file Path to a dataset specific file, as outputted by create_snap_mode_table.py, 157 | corresponding to the destination mode. File name MUST MATCH FORMAT: 158 | miner----.tsv 159 | 160 | 161 | Optional arguments: 162 | --mapping_file Path to a tsv file containing the mapping between the two datasets. 163 | --ds1_node_index: If there are multiple columns in the input tsv, the index of the column with the dataset1 entity id. 164 | Defaults to 0. 165 | --ds2_node_index: If there are multiple columns in the input tsv, the index of the column with the dataset2 entity id. 166 | Defaults to 1. 167 | --output_dir: Directory to create output files. Defaults to the current working directory. 168 | --equiv_file: Name of output file tsv containing a list of \t. 169 | Defaults to output_dir/miner--equiv-.tsv 170 | --skip_missing_ids Flag; If any of the ids in the input tsv do not have snap ids (which are fetched from 171 | the src and dst files), skip the line and continue parsing the data. 
172 | 173 | Example usage: 174 | Creating equivalence file for genes using GeneOntology and HUGO 175 | 176 | Input files: hugo.tsv, miner-gene-0-GO-20160520.tsv, miner-gene-1-HUGO-20160520.tsv 177 | 178 | Output directory: outputs/genes/ 179 | 180 | Output files: miner-gene-equiv-20160520.tsv 181 | 182 | Workflow: 183 | 184 | python create_snap_mode_equiv_table.py miner-gene-0-GO-20160520.tsv miner-gene-1-HUGO-20160520.tsv --mapping_file hugo.tsv --output_dir outputs/genes/ 185 | 186 | 187 | ######################################### 188 | ### extract_unique_node_ids.py ### 189 | ######################################### 190 | 191 | Script that creates a tsv containing all the unique node ids from a given input file. 192 | 193 | Usage: 194 | python extract_unique_node_ids.py ... 195 | 196 | Positional Arguments: 197 | input_file Path to the input file; Input file should be a tsv. 198 | output_file Path to the output file; Output file will be a tsv. 199 | dataset_name: Name of dataset nodes are being extracted from e.g. STRING 200 | columns: Columns containing node ids. Can specify many. 201 | 202 | 203 | Optional arguments: 204 | --node_name: String indicating how to refer to the node ids in the file scheme. Defaults to node_id. 205 | --has_title: If provided, skips over the first line of the file. 206 | --verbose: If provided, prints to the console for every million lines of the input file processed. 207 | 208 | Example usage: 209 | Extracting node ids from a STRING edgelist file, consisting of \t 210 | 211 | Input files: STRING.tsv 212 | 213 | Output file: STRING-nodes.tsv 214 | 215 | Workflow: 216 | 217 | python extract_unique_node_ids.py STRING.tsv STRING-nodes.tsv STRING 0 1 --node_name ENSEMBL_peptide_id --verbose 218 | 219 | 220 | ################################### 221 | ### extract_edge_list.py ### 222 | ################################### 223 | 224 | Script that creates an edge list given the input file.
225 | 226 | Usage: 227 | python extract_edge_list.py 228 | 229 | Positional Arguments: 230 | input_file Path to the input file; Input file should be a tsv. 231 | output_file Path to the output file; Output file will be a tsv. 232 | dataset_name: Name of dataset nodes are being extracted from e.g. STRING 233 | src_node_column: Column containing source node(s) 234 | dst_node_column: Column containing destination node(s) 235 | 236 | Optional arguments: 237 | --src_node_name: String indicating how to refer to the src node ids in the file scheme. Defaults to node_id1. 238 | --dst_node_name: String indicating how to refer to the dst node ids in the file scheme. Defaults to node_id2. 239 | --has_title: If provided, skips over the first line of the file. 240 | --verbose: If provided, prints to the console for every million lines of the input file processed. 241 | --src_node_sep: If the column containing the src node actually contains a list of nodes, the character separator 242 | used to split the text into the different node ids. Relevant for many-to-one relationships. 243 | By default assumes only one node id specified. 244 | --dst_node_sep: If the column containing the dst node actually contains a list of nodes, the character separator 245 | used to split the text into the different node ids. Relevant for one-to-many relationships. 246 | By default assumes only one node id specified. 247 | 248 | Example usage: 249 | Extracting edge list from a STRING protein-protein interactions file, which contains many other fields. 250 | 251 | Input files: STRING.tsv; assume protein 1 at index 1 and protein 2 at index 5.
252 | 253 | Output file: STRING-edges.tsv 254 | 255 | Workflow: 256 | 257 | python extract_edge_list.py STRING.tsv STRING-edges.tsv STRING 1 5 --src_node_name protein_1 --dst_node_name protein_2 258 | 259 | -------------------------------------------------------------------------------- /Utils/create_snap_crossnet_table.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file: create_snap_crossnet_table.py 3 | author: Sheila Ramaswamy(@sramas15) 4 | 5 | Script that creates snap tables for a given crossnet. 6 | 7 | Usage: 8 | python create_snap_crossnet_table.py 9 | 10 | Positional Arguments: 11 | input_file: Path to the input file; Input file should be a tsv. 12 | src_file: Path to a dataset specific file, as outputted by create_snap_mode_table.py, 13 | corresponding to the source mode. File name MUST MATCH FORMAT: 14 | miner----.tsv 15 | dst_file: Path to a dataset specific file, as outputted by create_snap_mode_table.py, 16 | corresponding to the destination mode. File name MUST MATCH FORMAT: 17 | miner----.tsv 18 | dataset_name: Name of dataset being used to create the snap crossnet tables i.e. the 19 | dataset the input file comes from. e.g. STRING 20 | dataset_id: unique integer id for this dataset. 21 | 22 | 23 | Optional arguments: 24 | --src_node_index: If there are multiple columns in the input tsv, the index of the column with the src node id. 25 | Defaults to 0. 26 | --dst_node_index: If there are multiple columns in the input tsv, the index of the column with the dst node id. 27 | Defaults to 1. 28 | --output_dir: Directory to create output files. Defaults to the current working directory. 29 | --full_crossnet_file: Name of output file tsv containing a list of \t. 
30 | Defaults to output_dir/miner---.tsv 31 | --db_edge_file: Name of output file tsv for a specific dataset; contains a list of \t 32 | Defaults to output_dir/miner-----.tsv 33 | --snap_id_counter_start Start assigning snap ids from this integer value; this number MUST be greater 34 | than any id found in the full crossnet file. If not specified, finds the max id in the 35 | full_crossnet_file. 36 | --skip_missing_ids Flag; If any of the ids in the input tsv do not have snap ids (which are fetched from 37 | the src and dst files), skip the line and continue parsing the data. 38 | --src_mode_filter The name of a function in utils.py that should be applied to the source node id in 39 | in the input file before using it to look up the snap id in the src_file. Defaults to None. 40 | --dst_mode_filter The name of a function in utils.py that should be applied to the destination node id in 41 | in the input file before using it to look up the snap id in the dst_file. Defaults to None. 42 | 43 | Example usage: 44 | Creating files for genes-function relationships using GeneOntology: 45 | 46 | Input files: go.tsv, miner-gene-0-GO-20160520.tsv, miner-function-0-GO-20160520.tsv 47 | 48 | Output directory: outputs/genes-functions/ 49 | 50 | Output files: miner-gene-function-20160520.tsv, miner-gene-function-0-GO-20160520.tsv 51 | 52 | Workflow: 53 | 54 | python create_snap_crossnet_table.py go.tsv miner-gene-0-GO-20160520.tsv miner-function-0-GO-20160520.tsv GO 0 --output_dir outputs/genes-functions/ 55 | ''' 56 | 57 | import argparse 58 | import utils 59 | import os 60 | 61 | parser = argparse.ArgumentParser(description='Create snap edge tables') 62 | parser.add_argument('input_file', help='input file name. File should be a tsv, containing interactions between ids found in src_file_name and ids found in dst_file_name') 63 | parser.add_argument('src_file', help='input file name. 
Should be a file outputted by create_snap_mode_table (with properly formatted name).') 64 | parser.add_argument('dst_file', help='input file name. Should be a file outputted by create_snap_mode_table (with properly formatted name).') 65 | parser.add_argument('dataset_name', type=str, help='name of dataset') 66 | parser.add_argument('db_id', type=int, help='int id for this dataset') 67 | parser.add_argument('--src_node_index', type=int, help='column index that contains src node ids (NOT snap ids, from src_input_file)', default=0) 68 | parser.add_argument('--dst_node_index', type=int, help='column index that contains dst node ids (NOT snap ids, from dst_input_file)', default=1) 69 | parser.add_argument('--output_dir', help='directory to output files; either this argument or full_crossnet_file and db_edge_file MUST be specified', default='.') 70 | parser.add_argument('--full_crossnet_file', help='output file name; outputs a list of snap ids, the db ids (db the snap id was derived from), and source and destination snap node ids;' \ 71 | + 'note that this file is appended to; OVERRIDES output_dir argument', default=None) 72 | parser.add_argument('--db_edge_file', help='output file name; output contains mapping of snap ids to dataset ids; OVERRIDES output dir argument', default=None) 73 | parser.add_argument('--skip_missing_ids', action='store_true', help='don\'t throw an error if ids in input_file not found in src or dst file.') 74 | parser.add_argument('--snap_id_counter_start', type=int, help='where to start assigning snap ids', default=-1) 75 | parser.add_argument('--src_mode_filter', type=str, default=None) 76 | parser.add_argument('--dst_mode_filter', type=str, default=None) 77 | args = parser.parse_args() 78 | 79 | 80 | inFNm = args.input_file 81 | srcFile = args.src_file 82 | dstFile = args.dst_file 83 | dataset = args.dataset_name 84 | db_id = args.db_id 85 | 86 | srcIdx = args.src_node_index 87 | dstIdx = args.dst_node_index 88 | 89 | src_db_id = 
utils.parse_dataset_id_from_name(os.path.basename(srcFile)) 90 | dst_db_id = utils.parse_dataset_id_from_name(os.path.basename(dstFile)) 91 | 92 | mode_name1 = utils.parse_mode_name_from_name(os.path.basename(srcFile)) 93 | mode_name2 = utils.parse_mode_name_from_name(os.path.basename(dstFile)) 94 | 95 | output_dir = args.output_dir 96 | outFNm = args.full_crossnet_file 97 | if outFNm is None: 98 | outFNm = os.path.join(args.output_dir, utils.get_full_cross_file_name(mode_name1, mode_name2)) 99 | outFNm2 = args.db_edge_file 100 | if outFNm2 is None: 101 | outFNm2 = os.path.join(args.output_dir, utils.get_cross_file_name(mode_name1, mode_name2, db_id, dataset)) 102 | 103 | 104 | src_mapping = utils.read_mode_file(srcFile) 105 | if os.path.samefile(srcFile, dstFile): 106 | dst_mapping = src_mapping 107 | else: 108 | dst_mapping = utils.read_mode_file(dstFile) 109 | 110 | src_filter = utils.get_filter(args.src_mode_filter) 111 | dst_filter = utils.get_filter(args.dst_mode_filter) 112 | 113 | add_schema = True 114 | counter = args.snap_id_counter_start 115 | if counter == -1: 116 | counter = utils.get_max_id(outFNm) 117 | print 'Starting at snap id: %d' % counter 118 | with open(inFNm, 'r') as inF: 119 | with open(outFNm, 'a') as fullF: 120 | with open(outFNm2, 'w') as dbF: 121 | # Add schema/metadata 122 | if counter == 0: 123 | fullF.write('# Full crossnet file for %s to %s\n' % (mode_name1, mode_name2)) 124 | fullF.write('# File generated on: %s\n' % utils.get_current_date()) 125 | fullF.write('# snap_eid\tdataset_id\tsrc_snap_nid\tdst_snap_nid\n') 126 | dbF.write('# Crossnet table for dataset: %s\n' % dataset) 127 | dbF.write('# File generated on: %s\n' % utils.get_current_date()) 128 | # Process file 129 | for line in inF: 130 | if line[0] == '#' or line[0] == '!' 
or line[0] == '\n': 131 | continue 132 | vals = utils.split_then_strip(line, '\t') 133 | if add_schema: 134 | attrs_schema = '# snap_eid\tsrc_dataset_id\tdst_dataset_id' 135 | for i in range(len(vals)): 136 | if i != srcIdx and i != dstIdx: 137 | attrs_schema += '\tC%d' % i 138 | dbF.write('%s\n' % attrs_schema) 139 | add_schema = False 140 | id1 = vals[srcIdx] 141 | id2 = vals[dstIdx] 142 | if src_filter: 143 | id1 = src_filter(id1) 144 | if dst_filter: 145 | id2 = dst_filter(id2) 146 | if id1 == '' or id2 == '': 147 | continue 148 | if args.skip_missing_ids and (id1 not in src_mapping or id2 not in dst_mapping): 149 | continue 150 | attr_strs = '' 151 | for i in range(len(vals)): 152 | if i != srcIdx and i != dstIdx: 153 | attr_strs += '\t' + vals[i] 154 | fullF.write('%d\t%d\t%d\t%d\n' % (counter, db_id, src_mapping[id1], dst_mapping[id2])) 155 | dbF.write('%d\t%d\t%d%s\n' % (counter, src_db_id, dst_db_id, attr_strs)) 156 | counter += 1 157 | print 'Ending at snap id: %d' % counter 158 | -------------------------------------------------------------------------------- /Utils/create_snap_mode_equiv_table.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file: create_snap_mode_equiv_table.py 3 | author: Sheila Ramaswamy(@sramas15) 4 | 5 | Script that creates snap equivalence table between two datasets for a given mode. 6 | 7 | Usage: 8 | python create_snap_mode_equiv_table.py 9 | 10 | Positional Arguments: 11 | dataset1_file: Path to a dataset specific file, as outputted by create_snap_mode_table.py, 12 | corresponding to the source mode. File name MUST MATCH FORMAT: 13 | miner----.tsv 14 | dataset2_file: Path to a dataset specific file, as outputted by create_snap_mode_table.py, 15 | corresponding to the destination mode. File name MUST MATCH FORMAT: 16 | miner----.tsv 17 | 18 | 19 | Optional arguments: 20 | --mapping_file: Path to a tsv file containing the mapping between the two datasets. 
21 | --ds1_node_index: If there are multiple columns in the input tsv, the index of the column with the dataset1 entity id. 22 | Defaults to 0. 23 | --ds2_node_index: If there are multiple columns in the input tsv, the index of the column with the dataset2 entity id. 24 | Defaults to 1. 25 | --output_dir: Directory to create output files. Defaults to the current working directory. 26 | --equiv_file: Name of output file tsv containing a list of \t. 27 | Defaults to output_dir/miner--equiv-.tsv 28 | --skip_missing_ids Flag; If any of the ids in the input tsv do not have snap ids (which are fetched from 29 | the src and dst files), skip the line and continue parsing the data. 30 | 31 | Example usage: 32 | Creating equivalence file for genes using GeneOntology and HUGO 33 | 34 | Input files: hugo.tsv, miner-gene-0-GO-20160520.tsv, miner-gene-1-HUGO-20160520.tsv 35 | 36 | Output directory: outputs/genes/ 37 | 38 | Output files: miner-gene-equiv-20160520.tsv 39 | 40 | Workflow: 41 | 42 | python create_snap_mode_equiv_table.py miner-gene-0-GO-20160520.tsv miner-gene-1-HUGO-20160520.tsv --mapping_file hugo.tsv --output_dir outputs/genes/ 43 | ''' 44 | 45 | import argparse 46 | import utils 47 | import os 48 | 49 | parser = argparse.ArgumentParser(description='Create snap edge tables') 50 | parser.add_argument('dataset1_file', help='input file name. Should be a file outputted by create_snap_mode_table (with properly formatted name).') 51 | parser.add_argument('dataset2_file', help='input file name.
Should be a file outputted by create_snap_mode_table (with properly formatted name).') 52 | parser.add_argument('--mapping_file', help='path to a tsv file giving a mapping between the dataset specific ids in dataset1 and dataset2 files.', default=None) 53 | parser.add_argument('--ds1_node_index', type=int, help='column index that contains ds1 node ids (NOT snap ids, from src_input_file)', default=0) 54 | parser.add_argument('--ds2_node_index', type=int, help='column index that contains ds2 node ids (NOT snap ids, from dst_input_file)', default=1) 55 | parser.add_argument('--output_dir', help='directory to output files; either this argument or full_crossnet_file and db_edge_file MUST be specified', default='.') 56 | parser.add_argument('--equiv_file', help='output file name; outputs a equivalence table of snap ids' \ 57 | + 'note that this file is appended to; OVERRIDES output_dir argument', default=None) 58 | parser.add_argument('--skip_missing_ids', action='store_true', help='don\'t throw an error if ids in input_file not found in src or dst file.') 59 | args = parser.parse_args() 60 | 61 | 62 | inFNm = args.mapping_file 63 | dsFile1 = args.dataset1_file 64 | dsFile2 = args.dataset2_file 65 | 66 | ds1Idx = args.ds1_node_index 67 | ds2Idx = args.ds2_node_index 68 | 69 | output_dir = args.output_dir 70 | outFNm = args.equiv_file 71 | mode_name = 'Unknown' 72 | if outFNm is None: 73 | mode_name1 = utils.parse_mode_name_from_name(os.path.basename(dsFile1)) 74 | mode_name2 = utils.parse_mode_name_from_name(os.path.basename(dsFile2)) 75 | mode_name = mode_name1 76 | assert mode_name1 == mode_name2 77 | outFNm = os.path.join(args.output_dir, utils.get_equiv_mode_file_name(mode_name1)) 78 | 79 | ds1_mapping = utils.read_mode_file(dsFile1) 80 | if os.path.samefile(dsFile1, dsFile2): 81 | ds2_mapping = ds1_mapping 82 | else: 83 | ds2_mapping = utils.read_mode_file(dsFile2) 84 | 85 | add_header = True 86 | if os.path.isfile(outFNm): 87 | add_header = False 88 | 89 | 90 | 
with open(outFNm, 'a') as equivF: 91 | if add_header: 92 | equivF.write('# Equivalence table for mode %s\n' % mode_name) 93 | equivF.write('# File generated on: %s\n' % utils.get_current_date()) 94 | equivF.write('# snap_nid_1\tsnap_nid_2\n') 95 | if inFNm is not None: 96 | with open(inFNm, 'r') as inF: 97 | for line in inF: 98 | if line[0] == '#' or line[0] == '\n': 99 | continue 100 | vals = utils.split_then_strip(line, '\t') 101 | id1 = vals[ds1Idx] 102 | id2 = vals[ds2Idx] 103 | if id1 == '' or id2 == '': 104 | continue 105 | if args.skip_missing_ids and (id1 not in ds1_mapping or id2 not in ds2_mapping): 106 | continue 107 | equivF.write('%d\t%d\n' % (ds1_mapping[id1], ds2_mapping[id2])) 108 | else: 109 | for id1 in ds1_mapping: 110 | if id1 in ds2_mapping: 111 | equivF.write('%d\t%d\n' % (ds1_mapping[id1], ds2_mapping[id1])) 112 | -------------------------------------------------------------------------------- /Utils/create_snap_mode_table.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file: create_snap_mode_table.py 3 | author: Sheila Ramaswamy(@sramas15) 4 | 5 | Script that creates snap tables for a given mode. 6 | 7 | Usage: 8 | python create_snap_mode_table.py 9 | 10 | Positional Arguments: 11 | input_file: Path to the input file; Input file should be a tsv. 12 | mode_name: Name of the mode being created e.g. genes 13 | dataset_name: Name of dataset being used to create the snap mode tables i.e. the 14 | dataset the input file comes from. e.g. STRING 15 | dataset_id: unique integer id for this dataset. 16 | 17 | 18 | Optional arguments: 19 | --node_index: If there are multiple columns in the input tsv, the index of the column with the node id. 20 | Defaults to 0. 21 | --output_dir: Directory to create output files. Defaults to the current working directory. 22 | --full_mode_file: Name of output file tsv containing a list of \t. 
23 | Defaults to output_dir/miner--.tsv 24 | --db_node_file: Name of output file tsv for a specific dataset; contains a list of \t 25 | Defaults to output_dir/miner----.tsv 26 | --snap_id_counter_start Start assigning snap ids from this integer value; this number MUST be greater 27 | than any id found in the full mode file. If not specified, finds the max id in the 28 | full_mode_file. 29 | 30 | Example usage: 31 | Creating files for genes using two datasets, GeneOntology and HUGO: 32 | 33 | Input files: hugo.tsv and go.tsv 34 | 35 | Output directory: outputs/genes/ 36 | 37 | Output files: miner-gene-20160520.tsv, miner-gene-0-GO-20160520.tsv, miner-gene-1-HUGO-20160520.tsv 38 | 39 | Workflow: 40 | 41 | python create_snap_mode_table.py go.tsv gene GO 0 --output_dir outputs/genes/ 42 | python create_snap_mode_table.py hugo.tsv gene HUGO 1 --output_dir outputs/genes/ 43 | ''' 44 | 45 | import argparse 46 | import utils 47 | import os 48 | 49 | 50 | # Create command line arguments 51 | parser = argparse.ArgumentParser(description='Create snap node tables; for more detailed description, please see file header.') 52 | parser.add_argument('input_file', help='input file name. 
File should be a tsv, with one mode-specific id per line (unless --node_index specified)') 53 | parser.add_argument('mode_name', type=str, help='mode name') 54 | parser.add_argument('dataset_name', type=str, help='name of dataset') 55 | parser.add_argument('db_id', type=int, help='int id for this dataset') 56 | parser.add_argument('--node_index', type=int, help='column index that contains node ids', default=0) 57 | parser.add_argument('--output_dir', help='directory to output files; either this argument or full_mode_file and db_node_file MUST be specified', default='.') 58 | parser.add_argument('--full_mode_file', help='output file name; outputs a list of snap ids and the db ids (db the snap id was derived from);' \ 59 | + 'note that this file is appended to; OVERRIDES output_dir argument', default=None) 60 | parser.add_argument('--db_node_file', help='output file name; output contains mapping of snap ids to db protein ids; OVERRIDES output dir argument', default=None) 61 | parser.add_argument('--snap_id_counter_start', type=int, help='where to start assigning snap ids', default=-1) 62 | args = parser.parse_args() 63 | 64 | 65 | # Process command line arguments, get default path names 66 | inFNm = args.input_file 67 | db_id = args.db_id 68 | mode_name = args.mode_name 69 | dataset = args.dataset_name 70 | outFNm = args.full_mode_file 71 | if outFNm is None: 72 | outFNm = os.path.join(args.output_dir, utils.get_full_mode_file_name(mode_name)) 73 | dbFNm = args.db_node_file 74 | if dbFNm is None: 75 | dbFNm = os.path.join(args.output_dir, utils.get_mode_file_name(mode_name, db_id, dataset)) 76 | 77 | counter = args.snap_id_counter_start 78 | if counter == -1: 79 | counter = utils.get_max_id(outFNm) 80 | node_index = args.node_index 81 | 82 | 83 | # Read input file, create output files. 
84 | seen = set() 85 | print 'Starting at snap id: %d' % counter 86 | with open(inFNm, 'r') as inF: 87 | with open(outFNm, 'a') as outF: 88 | with open(dbFNm, 'w') as dbF: 89 | if counter == 0: 90 | outF.write('# Full mode table for %s\n' % mode_name) 91 | outF.write('# File generated on: %s\n' % utils.get_current_date()) 92 | outF.write('# snap_nid\tdataset id\n') 93 | dbF.write('# Mode table for dataset: %s\n' % dataset) 94 | dbF.write('# File generated on: %s\n' % utils.get_current_date()) 95 | add_schema = True 96 | for line in inF: 97 | if line[0] == '#' or line[0] == '!' or line[0] == '\n': # skip comments 98 | continue 99 | vals = utils.split_then_strip(line, '\t') 100 | if add_schema: 101 | attrs_schema = '# snap_nid\tdataset_nid' 102 | for i in range(len(vals)): 103 | if i != node_index: 104 | attrs_schema += '\tC%d' % i 105 | dbF.write('%s\n' % attrs_schema) 106 | add_schema = False 107 | node_id = vals[node_index] 108 | if node_id in seen or len(node_id) == 0: 109 | continue 110 | attrs_str = '' 111 | for i in range(len(vals)): 112 | if i != node_index: 113 | attrs_str += '\t' + vals[i] 114 | outF.write('%d\t%d\n' % (counter, db_id)) 115 | dbF.write('%d\t%s%s\n' % (counter, node_id, attrs_str)) 116 | seen.add(node_id) 117 | counter += 1 118 | 119 | print 'Ending at snap id: %d' % counter 120 | 121 | -------------------------------------------------------------------------------- /Utils/extract_edge_list.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file: extract_edge_list.py 3 | author: Sheila Ramaswamy(@sramas15) 4 | 5 | Script that creates an edge list given the input file. 6 | 7 | Usage: 8 | python extract_edge_list.py 9 | 10 | Positional Arguments: 11 | input_file: Path to the input file; Input file should be a tsv. 12 | output_file: Path to the output file; Output file will be a tsv. 13 | dataset_name: Name of dataset nodes are being extracted from e.g. 
STRING 14 | src_node_column: Column containing source node(s) 15 | dst_node_column: Column containing destination node(s) 16 | 17 | Optional arguments: 18 | --src_node_name: String indicating how to refer to the src node ids in the file scheme. Defaults to node_id1. 19 | --dst_node_name: String indicating how to refer to the dst node ids in the file scheme. Defaults to node_id2. 20 | --has_title: If provided, skips over the first line of the file. 21 | --verbose: If provided, prints to the console for every million lines of the input file processed. 22 | --src_node_sep: If the column containing the src node actually contains a list of nodes, the character separator 23 | used to split the text into the different node ids. Relevant for many-to-one relationships. 24 | By default assumes only one node id specified. 25 | --dst_node_sep: If the column containing the dst node actually contains a list of nodes, the character separator 26 | used to split the text into the different node ids. Relevant for one-to-many relationships. 27 | By default assumes only one node id specified. 28 | 29 | Example usage: 30 | Extracting edge list from a STRING protein-protein interactions file, which contains many other fields. 31 | 32 | Input files: STRING.tsv; assume protein 1 at index 1 and protein 2 at index 5.
33 | 34 | Output file: STRING-edges.tsv 35 | 36 | Workflow: 37 | 38 | python extract_edge_list.py STRING.tsv STRING-edges.tsv STRING 1 5 --src_node_name protein_1 --dst_node_name protein_2 39 | ''' 40 | import argparse 41 | import utils 42 | 43 | parser = argparse.ArgumentParser(description='Extract edges and additional data from a file') 44 | parser.add_argument('input_file', help='input file name.') 45 | parser.add_argument('output_file', help='output file name.') 46 | parser.add_argument('dataset_name', help='Name of the dataset') 47 | parser.add_argument('src_node_col', help='column index containing source nodes') 48 | parser.add_argument('dst_node_col', help='column index containing destination nodes') 49 | parser.add_argument('--src_node_sep', help='if multiple ids are specified in this column,' \ 50 | + ' character used to split them', default=None) 51 | parser.add_argument('--dst_node_sep', help='if multiple ids are specified in this column,' \ 52 | + ' character used to split them', default=None) 53 | parser.add_argument('--has_title', action='store_true', 54 | help='has a title line') 55 | parser.add_argument('--verbose', action='store_true', 56 | help='Print every 1,000,000 lines processed') 57 | parser.add_argument('--divider', default='\t', type=str, help='separator') 58 | parser.add_argument('--src_node_name', default='node_id1', type=str, help='how to identify the src nodes in the header for tsv') 59 | parser.add_argument('--dst_node_name', default='node_id2', type=str, help='how to identify the dst nodes in the header for tsv') 60 | 61 | if __name__ == '__main__': 62 | args = parser.parse_args() 63 | #print(args) 64 | with open(args.input_file, 'r') as inF: 65 | with open(args.output_file, 'w') as outF: 66 | outF.write('# Dataset: %s\n' % args.dataset_name) 67 | outF.write('# %s\t%s\n' % (args.src_node_name, args.dst_node_name)) 68 | for i, line in enumerate(inF): 69 | if args.verbose and i%1000000 == 0: 70 | print 'Finished processing line %d in 
the original input file' % i 71 | if line[0] == '#' or line[0] == '!' or line[0] == '\n' or (i==0 and args.has_title): 72 | continue 73 | vals = utils.split_then_strip(line, args.divider) 74 | #print(vals) 75 | src_nodes = [vals[int(args.src_node_col)]] 76 | dst_nodes = [vals[int(args.dst_node_col)]] 77 | if args.src_node_sep is not None: 78 | src_nodes = src_nodes[0].split(args.src_node_sep) 79 | if args.dst_node_sep is not None: 80 | dst_nodes = dst_nodes[0].split(args.dst_node_sep) 81 | for src_node in src_nodes: 82 | if src_node == '': 83 | continue 84 | for dst_node in dst_nodes: 85 | if dst_node != '': 86 | outF.write('%s\t%s\n' % (src_node, dst_node)) 87 | -------------------------------------------------------------------------------- /Utils/extract_unique_node_ids.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file: extract_unique_node_ids.py 3 | author: Sheila Ramaswamy(@sramas15) 4 | 5 | Script that creates a tsv containing all the unique node ids from a given input file. 6 | 7 | Usage: 8 | python extract_unique_node_ids.py ... 9 | 10 | Positional Arguments: 11 | input_file: Path to the input file; Input file should be a tsv. 12 | output_file: Path to the output file; Output file will be a tsv. 13 | dataset_name: Name of dataset nodes are being extracted from e.g. STRING 14 | columns: Columns containing node ids. Can specify many. 15 | 16 | 17 | Optional arguments: 18 | --node_name: String indicating how to refer to the node ids in the file scheme. Defaults to node_id. 19 | --has_title: If provided, skips over the first line of the file. 20 | --verbose: If provided, prints to the console for every million lines of the input file processed. 
21 | 22 | Example usage: 23 | Extracting node ids from a STRING edgelist file, consisting of \t 24 | 25 | Input files: STRING.tsv 26 | 27 | Output file: STRING-nodes.tsv 28 | 29 | Workflow: 30 | 31 | python extract_unique_node_ids.py STRING.tsv STRING-nodes.tsv STRING 0 1 --node_name ENSEMBL_peptide_id --verbose 32 | ''' 33 | 34 | 35 | import argparse 36 | import utils 37 | 38 | parser = argparse.ArgumentParser(description='Extract unique node ids from file.') 39 | parser.add_argument('input_file', help='input file name.') 40 | parser.add_argument('output_file', help='output file name.') 41 | parser.add_argument('dataset_name', help='Name of the dataset') 42 | parser.add_argument('columns', metavar='N', type=int, nargs='+', 43 | help='columnswith node ids') 44 | parser.add_argument('--has_title', action='store_true', 45 | help='has a title line that is not prefixed with a #') 46 | parser.add_argument('--verbose', action='store_true', 47 | help='Print every 1,000,000 lines processed') 48 | parser.add_argument('--divider', default='\t', type=str, help='column separator, by default a tab') 49 | parser.add_argument('--node_name', default='node_id', type=str, help='how to identify the nodes in the header for tsv') 50 | 51 | if __name__ == '__main__': 52 | args = parser.parse_args() 53 | with open(args.input_file, 'r') as inF: 54 | unique_ids = set() 55 | with open(args.output_file, 'w') as outF: 56 | outF.write('# Dataset: %s\n' % args.dataset_name) 57 | outF.write('# %s\n' % args.node_name) 58 | for i, line in enumerate(inF): 59 | if args.verbose and i%1000000 == 0: 60 | print 'Finished processing line %d in the original input file' % i 61 | if line[0] == '#' or line[0] == '!' 
or line[0] == '\n' or (i==0 and args.has_title): 62 | continue 63 | vals = utils.split_then_strip(line, args.divider) 64 | for column in args.columns: 65 | if vals[column] not in unique_ids and len(vals[column]) > 0: 66 | unique_ids.add(vals[column]) 67 | new_line = '%s\n' % vals[column] 68 | outF.write(new_line) 69 | -------------------------------------------------------------------------------- /Utils/getStats.py: -------------------------------------------------------------------------------- 1 | import snap 2 | import time 3 | 4 | #from utils.network_utils import get_num_elem_per_mode 5 | 6 | filename = "Graphs/oldMinerNewSNAP.graph" 7 | FIn = snap.TFIn(filename) 8 | Graph = snap.TMMNet.Load(FIn) 9 | 10 | print('Modes: %d' % Graph.GetModeNets()) 11 | print('Link types: %d' % Graph.GetCrossNets()) 12 | 13 | crossnetids = snap.TInt64V() 14 | crossneti = Graph.BegCrossNetI() 15 | while crossneti < Graph.EndCrossNetI(): 16 | crossnetids.Add(crossneti.GetCrossId()) 17 | crossneti.Next() 18 | 19 | nodeattrmapping = snap.TIntStrStrTr64V() 20 | edgeattrmapping = snap.TIntStrStrTr64V() 21 | start_time = time.time() 22 | DirectedNetwork = Graph.ToNetwork(crossnetids, nodeattrmapping, edgeattrmapping) 23 | end_time = time.time() 24 | print("Converting to TNEANet takes %s seconds" % (end_time - start_time)) 25 | 26 | snap.PrintInfo(DirectedNetwork, "Python type PNEANet", "output.txt", False) 27 | map(lambda x: x.replace("\n", ""), open("output.txt").readlines()) 28 | -------------------------------------------------------------------------------- /Utils/utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file: utils.py 3 | author: Sheila Ramaswamy (@sramas15) 4 | 5 | File containing util functions useful for other scripts. 
6 | ''' 7 | import os 8 | from datetime import datetime 9 | 10 | HUMAN_SPECIES_ID = '9606' 11 | 12 | def get_filter(method_name): 13 | '''Given a function name(string), returns the corresponding function in this file. 14 | 15 | Input: 16 | method_name: string, function name. 17 | Output: 18 | a function in this file or None, if the function doesn't exist. 19 | ''' 20 | if method_name is None: 21 | return None 22 | possibles = globals().copy() 23 | possibles.update(locals()) 24 | method = possibles.get(method_name) 25 | return method 26 | 27 | def remove_species_id(name): 28 | '''Filter that removes the species id from a string id. Currently only works 29 | with the human species id. 30 | 31 | Input: 32 | name: an ENSEMBL human protein id, prefixed with the species id. 33 | Output: 34 | the ENSEMBL human protein id, without the species prefix. 35 | ''' 36 | vals = split_then_strip(name, '.') 37 | if len(vals) != 2 or vals[0] != HUMAN_SPECIES_ID: 38 | return name 39 | return vals[1] 40 | 41 | def add_species_id(name): 42 | '''Adds the human species id as a prefix to the given ENSEMBL id. 43 | 44 | Input: 45 | name: the ENSEMBL protein id (string). 46 | Output: 47 | a string consisting of the human species id, '.' and name. 48 | ''' 49 | return '%s.%s' % (HUMAN_SPECIES_ID, name) 50 | 51 | def split_then_strip(string, split_char): 52 | '''Splits the string using the given character and removes whitespace from all 53 | resulting substrings. 54 | 55 | Input: 56 | string: string being split 57 | split_char: character (or multiple characters) used to split string 58 | Output: 59 | a list, consisting of the stripped substrings. 60 | ''' 61 | return [s.strip() for s in string.split(split_char)] 62 | 63 | 64 | def get_file_len(input_file): 65 | '''Returns the length of the input_file; Returns 0 if the file does not exist. 66 | 67 | Input: 68 | input_file: path to the input file. 69 | Output: 70 | number of lines in the file. 
71 | ''' 72 | if os.path.isfile(input_file): 73 | return sum(1 for line in open(input_file, 'r')) 74 | return 0 75 | 76 | def get_max_id(input_file): 77 | '''Returns the max snap id of the input_file; Returns 0 if the file does not exist. 78 | Assumes file in format of snap mode or crossnet full table tsv file. 79 | 80 | Input: 81 | input_file: path to the input file. 82 | Output: 83 | max snap id in input file. 84 | ''' 85 | max_id = -1 86 | if os.path.isfile(input_file): 87 | with open(input_file, 'r') as inF: 88 | for line in inF: 89 | if line[0]=='#': 90 | continue 91 | new_id = int(line.strip().split('\t')[0]) 92 | if new_id > max_id: 93 | max_id = new_id 94 | max_id += 1 95 | return max_id 96 | 97 | def get_current_date(): 98 | '''Returns the current date, formatted as YYYYMMDD 99 | 100 | Input: 101 | None 102 | Output: 103 | Current date, as a string. 104 | ''' 105 | format = '%Y%m%d' 106 | return datetime.now().strftime(format) 107 | 108 | def get_full_mode_file_name(mode_name): 109 | '''Returns the formatted file name that should contain the full list of snap ids for the mode. 110 | 111 | Input: 112 | mode_name: the name of the mode 113 | Output: 114 | the formatted file name. 115 | ''' 116 | return 'miner-%s-%s.tsv' % (mode_name, get_current_date()) 117 | 118 | def get_equiv_mode_file_name(mode_name): 119 | '''Returns the formatted file name that should contain the equivalence table of snap ids for the mode. 120 | 121 | Input: 122 | mode_name: the name of the mode 123 | Output: 124 | the formatted file name. 125 | ''' 126 | return 'miner-%s-equiv-%s.tsv' % (mode_name, get_current_date()) 127 | 128 | def get_mode_file_name(mode_name, db_id, dataset): 129 | '''Returns the formatted file name that should contain the snap id to dataset 130 | specific id mapping. 131 | 132 | Input: 133 | mode_name: the name of the mode 134 | db_id: dataset id for the given dataset. 135 | dataset: the name of the dataset e.g. STRING 136 | Output: 137 | the formatted file name. 
138 | ''' 139 | return 'miner-%s-%d-%s-%s.tsv' % (mode_name, int(db_id), dataset, get_current_date()) 140 | 141 | def get_full_cross_file_name(mode_name1, mode_name2): 142 | '''Returns the formatted file name that should contain the full list of snap ids for the cross net. 143 | 144 | Input: 145 | mode_name1: the name of the src mode 146 | mode_name2: the name of the dst mode 147 | Output: 148 | the formatted file name. 149 | ''' 150 | return 'miner-%s-%s-%s.tsv' % (mode_name1, mode_name2, get_current_date()) 151 | 152 | def get_cross_file_name(mode_name1, mode_name2, db_id, dataset): 153 | '''Returns the formatted file name that should contain the snap id to the dataset ids 154 | for the source and destination nodes. 155 | 156 | Input: 157 | mode_name1: the name of the src mode 158 | mode_name2: the name of the dst mode 159 | db_id: dataset id for the given dataset. 160 | dataset: the name of the dataset e.g. STRING 161 | Output: 162 | the formatted file name. 163 | ''' 164 | return 'miner-%s-%s-%d-%s-%s.tsv' % (mode_name1, mode_name2, int(db_id), dataset, get_current_date()) 165 | 166 | def parse_dataset_id_from_name(file_name): 167 | '''Extracts the dataset id from the formatted mode file name. 168 | 169 | Input: 170 | file_name: mode file name, as returned by get_mode_file_name. 171 | Output: 172 | the integer dataset id. 173 | ''' 174 | return int(file_name.split('-')[2]) 175 | 176 | def parse_dataset_name_from_name(file_name): 177 | '''Extracts the dataset name from the formatted mode file name. 178 | 179 | Input: 180 | file_name: mode file name, as returned by get_mode_file_name. 181 | Output: 182 | the string dataset name. 183 | ''' 184 | return file_name.split('-')[3] 185 | 186 | def parse_mode_name_from_name(file_name): 187 | '''Extracts the mode name from the formatted mode file name. 188 | 189 | Input: 190 | file_name: mode file name, as returned by get_mode_file_name. 191 | Output: 192 | the string dataset mode name. 
193 | ''' 194 | return file_name.split('-')[1] 195 | 196 | def read_mode_file(map_file): 197 | '''Reads the mapping between dataset specific ids to snap ids into a dictionary. 198 | 199 | Input: 200 | map_file: file containing the mapping. 201 | Output: 202 | dictionary from the dataset specific ids to snap ids. 203 | ''' 204 | mapping = {} 205 | with open(map_file, 'r') as inF: 206 | for line in inF: 207 | if len(line) == 0 or line[0] == '#': 208 | continue 209 | vals = split_then_strip(line, '\t') 210 | snap_id = vals[0] 211 | dataset_id = vals[1] 212 | mapping[dataset_id] = int(snap_id) 213 | return mapping 214 | -------------------------------------------------------------------------------- /drugbank/edges/README.txt: -------------------------------------------------------------------------------- 1 | Generate Edges from Drugbank Database 2 | ------------------------------------- 3 | 4 | 1. Models the following interactions from the drugbank database: 5 | 1. Drug-Drug interaction 6 | 2. Drug-Gene interacion 7 | 2. External Dependencies: 8 | 1. Requires mapping from SnapGeneID<->UniportKB 9 | 3. Steps: 10 | I. Download drugbank.xml 11 | II. Ensure that gene mapping from 2 is in the current folder. 12 | III. Run make-edges.sh 13 | 3. If BeautifulSoup is not installed or you don't have permissions to install: 14 | I. Follow steps mentioned on http://docs.python-guide.org/en/latest/dev/virtualenvs/ 15 | II. Next launch the env as : source /env/bin/activate 16 | III. 
Run make-edges.sh 17 | -------------------------------------------------------------------------------- /drugbank/edges/getDrugInteractions.py: -------------------------------------------------------------------------------- 1 | ############################################# 2 | # XML parser to parse the drugbank database 3 | # Will output a space separated .txt file 4 | # with the following column headers: 5 | # DrugbankId DrugbankId 6 | # Each line represents a drug-drug interaction 7 | ############################################## 8 | 9 | from bs4 import BeautifulSoup 10 | soup = BeautifulSoup(open("./drugbank.xml"),"xml") 11 | sep = " " 12 | with open('edgesD.txt', 'w') as f: 13 | for drug in soup.findAll("drug"): 14 | drugName = drug.find("drugbank-id").text 15 | interactions = drug.findAll("drug-interaction") 16 | if not interactions: 17 | continue 18 | for i in interactions: 19 | toPrint = drugName + sep + i.find("drugbank-id").text 20 | f.write(toPrint.encode('utf-8') + '\n') 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /drugbank/edges/getGeneInteractions.py: -------------------------------------------------------------------------------- 1 | ########################################### 2 | # XML parser to parse the drugbank database 3 | # Will output a space separated .txt file 4 | # with the following column headers: 5 | # DrugbankId Gene1 Gene2 ... 6 | # Currently the UniProt ID is used for genes.
7 | ########################################### 8 | 9 | from bs4 import BeautifulSoup 10 | soup = BeautifulSoup(open("./../drugbank.xml"),"xml") 11 | sep = " " 12 | empty = "NULL" 13 | #geneIdentifier = "HUGO Gene Nomenclature Committee (HGNC)" 14 | geneIdentifier = "UniProtKB" 15 | with open('drugGene.txt', 'w') as f: 16 | for drug in soup.findAll("drug"): 17 | toPrint = "" 18 | toPrint += drug.find("drugbank-id").text + sep 19 | # Get target Genes 20 | targets = drug.findAll("target") 21 | targetGene = [] 22 | if targets: 23 | for target in targets: 24 | externIden = target.findAll("external-identifier") 25 | if not externIden: 26 | continue 27 | for iden in externIden: 28 | if iden.find("resource").text == geneIdentifier: 29 | targetGene.append(iden.find("identifier").text) 30 | # Get Enzyme Gene 31 | enzymes = drug.findAll("enzyme") 32 | enzymeGene = [] 33 | if enzymes: 34 | for enzyme in enzymes: 35 | externIden = enzyme.findAll("external-identifier") 36 | if not externIden: 37 | continue 38 | for iden in externIden: 39 | if iden.find("resource").text == geneIdentifier: 40 | enzymeGene.append(iden.find("identifier").text) 41 | allGene = targetGene + enzymeGene 42 | if len(allGene) == 0: 43 | toPrint += empty 44 | else: 45 | toPrint += ','.join(allGene) 46 | f.write(toPrint.encode('utf-8') + '\n') 47 | 48 | 49 | -------------------------------------------------------------------------------- /drugbank/edges/make-edges.sh: -------------------------------------------------------------------------------- 1 | python getDrugInteractions.py 2 | python getGeneInteractions.py 3 | python makeEdgeTableCC.py 4 | python makeEdgeTableCG.py 5 | -------------------------------------------------------------------------------- /drugbank/edges/makeEdgeTableCC.py: -------------------------------------------------------------------------------- 1 | ######################################################## 2 | # Takes as input a .txt file of drug-drug interactions. 
3 | # and subSnapDrugbank.txt which contains mapping from 4 | # snapChemicalId to DrugbankId. 5 | # Outputs : 6 | # 1. Master Edge Table: SnapCCId EdgeTableNo SrcId DstId 7 | # 2. Sub Tables: 8 | # 1. SnapCCId SrcId DstId (DrugbankId) 9 | # All edges are undirected. Hence A-B is reported only 10 | # once. NOTE(review): the duplicate check below only skips an exact repeat of "A B"; the reverse pair "B A" would still be written — this relies on DrugBank listing each pair in one direction only; confirm against the input. 11 | ######################################################### 12 | from collections import defaultdict 13 | 14 | sep = " " 15 | snapIdPrefix = "SCC" 16 | edgeFile = "edgesD.txt" 17 | nodeMap = "./../nodes/subSnapDrugbank.txt" 18 | masterTable = "snapChemicalCC.txt" 19 | subTable = "subSnapDrugbankCC.txt" 20 | idNum = 0 21 | # Make a dict mapping from drugbankId to snapChemId 22 | drugbankSnap = {} 23 | with open(nodeMap, 'r') as f: 24 | for line in f: 25 | line = line.strip().split(sep) 26 | drugbankSnap[line[1]] = line[0] 27 | # drugsDone maps a source DrugBank id to the list of partner ids already written (duplicate filter for the loop below) 28 | drugsDone = defaultdict(list) 29 | with open(edgeFile, 'r') as f, open(masterTable, 'w') as master, open(subTable, 'w') as sub: 30 | for line in f: 31 | line = line.strip().split(sep) 32 | if line[1] in drugsDone[line[0]]: 33 | continue 34 | if line[0] not in drugbankSnap or line[1] not in drugbankSnap: 35 | continue 36 | drugsDone[line[0]].append(line[1]) 37 | snapId = snapIdPrefix + str(idNum) 38 | idNum += 1 39 | master.write(snapId + sep + "0 " + drugbankSnap[line[0]] + sep + drugbankSnap[line[1]] + '\n') 40 | sub.write(snapId + sep + line[0] + sep + line[1] + '\n') 41 | 42 | 43 | -------------------------------------------------------------------------------- /drugbank/edges/makeEdgeTableCG.py: -------------------------------------------------------------------------------- 1 | ######################################################## 2 | # Takes as input a .txt file of drug-gene interactions. 3 | # and subSnapDrugbank.txt which contains mapping from 4 | # snapChemicalId to DrugbankId. 5 | # Depends on the mapping of Genes to SnapGeneID. 6 | # Outputs : 7 | # 1. Master Edge Table: SnapCGId EdgeTableNo SrcId DstId 8 | # 2. 
Sub Tables: 9 | # 1. SnapCGId SrcId (DrugbankID) DstId (UniportId) 10 | # All edges are undirected. Hence A-B is reported only 11 | # once. 12 | ######################################################### 13 | from collections import defaultdict 14 | 15 | sep = " " 16 | snapIdPrefix = "SCG" 17 | edgeFile = "drugGene.txt" 18 | nodeMap = "./../nodes/subSnapDrugbank.txt" 19 | masterTable = "snapChemicalCG.txt" 20 | subTable = "subSnapDrugbankCG.txt" 21 | geneMap = "./snap.genes.0.go" 22 | idNum = 0 23 | # Make a dict mapping from drugbankId to snapChemId 24 | drugbankSnap = {} 25 | with open(nodeMap, 'r') as f: 26 | for line in f: 27 | line = line.strip().split(sep) 28 | drugbankSnap[line[1]] = line[0] 29 | 30 | # Make a dict mapping from UniProtKB to snapGeneId 31 | geneSnap = {} 32 | with open(geneMap, 'r') as f: 33 | for line in f: 34 | line = line.strip().split('\t') 35 | geneSnap[line[1]] = line[0] 36 | 37 | with open(edgeFile, 'r') as f, open(masterTable, 'w') as master, open(subTable, 'w') as sub: 38 | for line in f: 39 | line = line.strip().split(sep) 40 | if line[0] not in drugbankSnap: 41 | continue 42 | geneList = line[1].split(",") 43 | if geneList[0] == "NULL": 44 | continue 45 | for gene in geneList: 46 | snapId = snapIdPrefix + str(idNum) 47 | idNum += 1 48 | if gene not in geneSnap: 49 | print gene 50 | continue 51 | master.write(snapId + sep + "0 " + drugbankSnap[line[0]] + sep + geneSnap[gene] + '\n') 52 | sub.write(snapId + sep + line[0] + sep + gene + '\n') 53 | 54 | 55 | -------------------------------------------------------------------------------- /drugbank/nodes/README.txt: -------------------------------------------------------------------------------- 1 | Generate Nodes from Drugbank Database 2 | ------------------------------------- 3 | 4 | 1. Takes as input a single .xml file from the drugbank database to 5 | generate node mapping. 6 | 2. Steps: 7 | I. Download .xml in the current directory. 8 | II. Run make-nodes.sh 9 | 3. 
If BeautifulSoup is not installed or you cannot install: 10 | I. Follow steps mentioned on http://docs.python-guide.org/en/latest/dev/virtualenvs/ 11 | II. Next launch the env as : source /env/bin/activate 12 | III. Run make-nodes.sh 13 | -------------------------------------------------------------------------------- /drugbank/nodes/make-nodes.sh: -------------------------------------------------------------------------------- 1 | python parseDrugbank.py 2 | python makeNodeTables.py 3 | -------------------------------------------------------------------------------- /drugbank/nodes/makeNodeTables.py: -------------------------------------------------------------------------------- 1 | ####################################################### 2 | # Takes as input a .txt file with the following col headers 3 | # DrugbankID PubChem_Compound PubChem_Substance 4 | # Outputs the following files: 5 | # 1. Master Node table : SnapChemID SubTableID 6 | # 2. Sub tables : 7 | # 1. SnapChemID DrugbankID 8 | # 2. SnapChemID PubChem_Compound 9 | # 3. SnapChemID PubChem_Substance 10 | # 3. 
Equvialence Table : SnapChemID SnapChemID 11 | ####################################################### 12 | import itertools 13 | sep = " " 14 | empty = "NULL" 15 | snapIdPrefix = "SC" 16 | # Names/handles for the output tables 17 | idNum = 0 18 | masterTable = "snapChemical.txt" 19 | subTable = ["subSnapDrugbank.txt", "subSnapPubCompound.txt", "subSnapPubSubstance.txt"] 20 | subHandle = [open(subTable[i], 'w') for i in xrange(len(subTable))] 21 | eqTable = "snapEqChem.txt" 22 | with open('did_pubC_pubS.txt', 'r') as input, open(masterTable, 'w') as master,open(eqTable, 'w') as eqTable: 23 | for line in input: 24 | line = line.strip().split(" ") 25 | currId = [] 26 | for num,id in enumerate(line): 27 | if id == "NULL": 28 | continue 29 | snapId = snapIdPrefix + str(idNum) 30 | idNum += 1 31 | master.write(snapId + sep + str(num) + '\n') 32 | subHandle[num].write(snapId + sep + id + '\n') 33 | currId.append(snapId) 34 | allPerms = list(itertools.permutations(currId,2)) 35 | for perm in allPerms: 36 | toWrite = ' '.join(perm) 37 | eqTable.write(toWrite + '\n') 38 | 39 | [handle.close() for handle in subHandle] 40 | 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /drugbank/nodes/parseDrugbank.py: -------------------------------------------------------------------------------- 1 | ########################################### 2 | # XML parser to parse the drugbank database 3 | # Will output a space separated .txt file 4 | # with the following coloumn headers: 5 | # DrugbankID PubChem_Compound PubChem_Substance 6 | # Requirements : Assumes that durgbank.xml is in the 7 | # same folder. 
8 | ########################################### 9 | 10 | from bs4 import BeautifulSoup 11 | soup = BeautifulSoup(open("./drugbank.xml"),"xml") 12 | sep = " " 13 | empty = "NULL" 14 | with open('did_pubC_pubS.txt', 'w') as f: 15 | for drug in soup.findAll("drug"): 16 | flag = False 17 | toPrint = "" 18 | toPrint += drug.find("drugbank-id").text + sep 19 | #toPrint += drug.find("name").text + sep 20 | identifiers = [i for i in drug.findAll("external-identifier")] 21 | for i in identifiers: 22 | database = i.find("resource").text 23 | if database != "PubChem Compound": 24 | continue 25 | value = i.find("identifier").text 26 | flag = True 27 | toPrint += value + sep 28 | if not flag: 29 | toPrint += empty + sep 30 | 31 | for i in identifiers: 32 | database = i.find("resource").text 33 | if database != "PubChem Substance": 34 | continue 35 | value = i.find("identifier").text 36 | flag = True 37 | toPrint += value + sep 38 | if not flag: 39 | toPrint += empty + sep 40 | f.write(toPrint.encode('utf-8') + '\n') 41 | 42 | 43 | -------------------------------------------------------------------------------- /examples/README.txt: -------------------------------------------------------------------------------- 1 | ------------------------------ 2 | Example : Load Miner Dataset 3 | ------------------------------ 4 | 5 | This folder contains two simple scripts to 6 | 1. Load the miner data set into a multi-modal network which is then saved to disk. 7 | 2. Read the saved graph from disk and print basic statistics about the miner dataset. 8 | 9 | Sample workflow: 10 | 1. Generate the network and save to disk 11 | python miner_load_tables.py config.txt --loglevel info 12 | 13 | 2. 
Read the saved network and print statistics 14 | python miner_get_stats.py ./miner.graph 15 | 16 | Usage of the scripts used: 17 | 18 | ---------------------------- 19 | file : miner_load_tables.py 20 | ---------------------------- 21 | 22 | Example to illustrate how to load the miner dataset into a multi-modal network. 23 | 24 | Usage: miner_load_tables.py 25 | 26 | Config File : 27 | The config file contains the paths to all the tsv files used to load modes and cross-nets. 28 | 29 | Positional Arguments: 30 | config_file : Path to the config file. The config file contains the paths to all the modes and cross-net tsv files. 31 | 32 | Optional Arguments: 33 | --output_dir : Directory to create output files. Defaults to the current working directory. 34 | --loglevel : Enable logging. Defaults to warning level. 35 | 36 | Example Usage: 37 | Input File : Config.txt 38 | 39 | Command Line : 40 | python miner_load_tables.py config.txt --loglevel info 41 | 42 | Output: 43 | miner.graph 44 | 45 | Note: 46 | Due to 32 bit limitation we can't fully load the Protein-Protein edges. 47 | Only edges with confidence level greater than 200 are used. When SNAP supports 48 | 64 bit make the following change in config file: 49 | Protein-Protein = /dfs/ilfs2/0/MINER-BIO/types/Protein-Protein/20160418/snap-tables/miner-protein-protein-20160607.tsv 50 | 51 | Error after using 64 bit: RuntimeError: Message: TVec::Resize: std::exception, Length:536870912, Capacity:1073741824, New capacity:-1, Type:4TVecI11THashKeyDatI6TInt64S1_iExE [Program failed to allocate more memory. Solution-1: Get a bigger machine and a 64-bit compiler.] 52 | 53 | 54 | --------------------------- 55 | file : miner_get_stats.py 56 | --------------------------- 57 | 58 | Script to print basic statistics of the miner dataset. 
59 | 60 | Usage: 61 | python miner_get_stats.py 62 | 63 | Positional Arguments: 64 | input_file : Path to the multi-modal network 65 | 66 | Example Usage: 67 | Input file : miner.graph 68 | 69 | Command line: 70 | python miner_get_stats.py ./miner.graph 71 | 72 | -------------------------------------------------------------------------------- /examples/config.txt: -------------------------------------------------------------------------------- 1 | [Modes] 2 | Chemical = /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Chemical/miner-chemical-20190828.tsv 3 | Protein = /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Protein/miner-proteins-20190816.tsv 4 | Function = /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Function/miner-function-20190807.tsv 5 | Gene = /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Gene/miner-gene-20190826.tsv 6 | Disease = /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Disease/miner-disease-20190813.tsv 7 | 8 | [Cross-Net] 9 | Chemical-Chemical = /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Chemical-Chemical/miner-chemical-chemical-20190828.tsv 10 | Chemical-Gene = /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Chemical-Gene/miner-chemical-gene-20190828.tsv 11 | Function-Function = /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Function-Function/miner-function-function-20190807.tsv 12 | Gene-Protein = /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Gene-Protein/miner-gene-protein-20190819.tsv 13 | Gene-Function = /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Gene-Function/miner-gene-function-20190815.tsv 14 | Protein-Protein = /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Protein-Protein/miner-protein-protein-20190819.tsv 15 | Disease-Disease = /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Disease-Disease/miner-disease-disease-20190813.tsv 16 | Disease-Gene = 
/dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Disease-Gene/miner-disease-gene-20190816.tsv 17 | Disease-Function = /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Disease-Function/miner-disease-function-20190814.tsv 18 | Disease-Chemical = /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Disease-Chemical/miner-disease-chemical-20190828.tsv 19 | -------------------------------------------------------------------------------- /examples/miner_get_stats.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file : miner_get_stats.py 3 | authors : Farzaan Kaiyom, Agrim Gupta 4 | 5 | Script to print basic statistics of the miner dataset. 6 | 7 | Usage: 8 | python miner_get_stats.py 9 | 10 | Positional Arguments: 11 | input_file : Path to the multi-modal network 12 | 13 | Example Usage: 14 | Input file : miner.graph 15 | 16 | Command line: 17 | python miner_get_stats.py ./miner.graph 18 | ''' 19 | 20 | import sys 21 | sys.path.insert(0, './../../swig/') 22 | import snap 23 | import argparse 24 | 25 | parser = argparse.ArgumentParser(description='Print basic statistics of the miner dataset') 26 | parser.add_argument('input_file', help='path to the multi-modal network') 27 | args = parser.parse_args() 28 | 29 | #methods to test modes 30 | def modeStats(Graph,name): 31 | try: 32 | gp = Graph.GetModeNetByName(name) 33 | print(name,": ",gp.GetNodes()) 34 | except: 35 | print(name," skipped") 36 | 37 | def crossStats(Graph,name): 38 | try: 39 | gp = Graph.GetCrossNetByName(name) 40 | print(name,": ",gp.GetEdges()) # use getEdges for older versions of SNAP.py 41 | except: 42 | print(name," skipped") 43 | 44 | print("Printing Modes") 45 | FIn = snap.TFIn(args.input_file) 46 | Graph = snap.TMMNet.Load(FIn) 47 | 48 | modeStats(Graph,"Chemical") 49 | modeStats(Graph,"Protein") 50 | modeStats(Graph,"Gene") 51 | modeStats(Graph,"Function") 52 | modeStats(Graph,"Disease") 53 | 54 | print("Printing CrossNets") 55 | 
56 | crossStats(Graph,"Chemical-Chemical") 57 | crossStats(Graph,"Chemical-Gene") 58 | crossStats(Graph,"Function-Function") 59 | crossStats(Graph,"Gene-Function") 60 | crossStats(Graph,"Gene-Protein") 61 | crossStats(Graph,"Disease-Disease") 62 | crossStats(Graph,"Disease-Gene") 63 | crossStats(Graph,"Disease-Function") 64 | crossStats(Graph,"Disease-Chemical") 65 | crossStats(Graph,"Protein-Protein") 66 | -------------------------------------------------------------------------------- /examples/miner_load_tables.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file : miner_load_tables.py 3 | author : Agrim Gupta 4 | edited by : Farzaan Kaiyom 5 | 6 | Example to illustrate how to load the miner dataset into a multi-modal network. 7 | 8 | Usage: miner_load_tables.py 9 | 10 | Positional Arguments: 11 | config_file : Path to the config file. The config file contatins the path to all the modes and cross-net tsv files. 12 | 13 | Optional Arguments: 14 | --output_dir : Directory to create output files. Defaults to the current working directory. 15 | --loglevel : Enable logging. Defaults to warning level. 
16 | 17 | Example Usage: 18 | Input File : Config.txt 19 | 20 | Command Line : 21 | python miner_load_tables.py config.txt --loglevel info 22 | 23 | Output: 24 | miner.graph 25 | ''' 26 | 27 | import sys 28 | sys.path.insert(0, './../../swig/') 29 | import snap 30 | import ConfigParser 31 | import argparse 32 | import logging 33 | import os 34 | 35 | parser = argparse.ArgumentParser(description='Generate a Multi-Modal Network') 36 | parser.add_argument('config_file', help='path of a config file.') 37 | parser.add_argument('--output_dir', help='output path to save the Multi-Modal Network', default='.') 38 | parser.add_argument('--loglevel', help='info for debug print.') 39 | parser.add_argument('--outputf', help='output file name.', default='miner.graph') 40 | args = parser.parse_args() 41 | config = ConfigParser.ConfigParser() 42 | config.readfp(open(args.config_file)) 43 | if args.loglevel: 44 | numeric_level = getattr(logging, args.loglevel.upper(), None) 45 | logging.basicConfig(level=numeric_level) 46 | 47 | context = snap.TTableContext() 48 | # Construct the graph 49 | logging.info('Building Multi-Modal Network') 50 | Graph = snap.TMMNet.New() 51 | 52 | # Loading Modes 53 | try: 54 | chemical_mode_file = config.get('Modes', 'Chemical') 55 | cmschema = snap.Schema() 56 | cmschema.Add(snap.TStrTAttrPr("ChemicalId", snap.atStr)) 57 | cmschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 58 | chemical_mode = snap.TTable.LoadSS(cmschema, chemical_mode_file, context, "\t", snap.TBool(False)) 59 | logging.info('Done loading Chemical Mode') 60 | snap.LoadModeNetToNet(Graph, "Chemical", chemical_mode, "ChemicalId", snap.TStr64V()) 61 | except ConfigParser.NoOptionError: 62 | logging.info('Skipping Chemical Mode') 63 | 64 | try: 65 | function_mode_file = config.get('Modes', 'Function') 66 | fmschema = snap.Schema() 67 | fmschema.Add(snap.TStrTAttrPr("FunctionId", snap.atStr)) 68 | fmschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 69 | function_mode = 
snap.TTable.LoadSS(fmschema, function_mode_file, context, "\t", snap.TBool(False)) 70 | logging.info('Done loading Function Mode') 71 | snap.LoadModeNetToNet(Graph, "Function", function_mode, "FunctionId", snap.TStr64V()) 72 | except ConfigParser.NoOptionError: 73 | logging.info('Skipping Function Mode') 74 | 75 | try: 76 | gene_mode_file = config.get('Modes', 'Gene') 77 | gmschema = snap.Schema() 78 | gmschema.Add(snap.TStrTAttrPr("GeneId", snap.atStr)) 79 | gmschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 80 | gene_mode = snap.TTable.LoadSS(gmschema, gene_mode_file, context, "\t", snap.TBool(False)) 81 | logging.info('Done loading Gene Mode') 82 | snap.LoadModeNetToNet(Graph, "Gene", gene_mode, "GeneId", snap.T64StrV()) 83 | except ConfigParser.NoOptionError: 84 | logging.info('Skipping Gene Mode') 85 | 86 | try: 87 | protein_mode_file = config.get('Modes', 'Protein') 88 | pmschema = snap.Schema() 89 | pmschema.Add(snap.TStrTAttrPr("ProteinId", snap.atStr)) 90 | pmschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 91 | protein_mode = snap.TTable.LoadSS(pmschema, protein_mode_file, context, "\t", snap.TBool(False)) 92 | logging.info('Done loading Protein Mode') 93 | snap.LoadModeNetToNet(Graph, "Protein", protein_mode, "ProteinId", snap.TStr64V()) 94 | except ConfigParser.NoOptionError: 95 | logging.info('Skipping Protein Mode') 96 | 97 | try: 98 | disease_mode_file = config.get('Modes', 'Disease') 99 | dmschema = snap.Schema() 100 | dmschema.Add(snap.TStrTAttrPr("DiseaseId", snap.atStr)) 101 | dmschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 102 | disease_mode = snap.TTable.LoadSS(dmschema, disease_mode_file, context, "\t", snap.TBool(False)) 103 | logging.info('Done loading Disease Mode') 104 | snap.LoadModeNetToNet(Graph, "Disease", disease_mode, "DiseaseId", snap.TStr64V()) 105 | except ConfigParser.NoOptionError: 106 | logging.info('Skipping Disease Mode') 107 | 108 | # Loading Cross-Nets 109 | try: 110 | chemical_chemical_crossnet_file = 
config.get('Cross-Net', 'Chemical-Chemical') 111 | cccschema = snap.Schema() 112 | cccschema.Add(snap.TStrTAttrPr("CCEdgeId", snap.atStr)) 113 | cccschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 114 | cccschema.Add(snap.TStrTAttrPr("CSrcId", snap.atStr)) 115 | cccschema.Add(snap.TStrTAttrPr("CDstId", snap.atStr)) 116 | cccschema.Add(snap.TStrTAttrPr("desc", snap.atStr)) 117 | chemical_chemical_crossnet = snap.TTable.LoadSS(cccschema, chemical_chemical_crossnet_file, context, "\t", snap.TBool(False)) 118 | logging.info('Done loading Chemical-Chemical Cross-Net') 119 | snap.LoadCrossNetToNet(Graph, "Chemical", "Chemical", "Chemical-Chemical", chemical_chemical_crossnet, "CSrcId", "CDstId", snap.TStr64V()) 120 | except ConfigParser.NoOptionError: 121 | logging.info('Skipping Chemical-Chemical Cross-Net') 122 | 123 | try: 124 | chemical_gene_crossnet_file = config.get('Cross-Net', 'Chemical-Gene') 125 | cgcschema = snap.Schema() 126 | cgcschema.Add(snap.TStrTAttrPr("CGEdgeId", snap.atStr)) 127 | cgcschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 128 | cgcschema.Add(snap.TStrTAttrPr("CSrcId", snap.atStr)) 129 | cgcschema.Add(snap.TStrTAttrPr("GDstId", snap.atStr)) 130 | chemical_gene_crossnet = snap.TTable.LoadSS(cgcschema, chemical_gene_crossnet_file, context, "\t", snap.TBool(False)) 131 | logging.info('Done loading Chemical-Gene Cross-Net') 132 | snap.LoadCrossNetToNet(Graph, "Chemical", "Gene", "Chemical-Gene", chemical_gene_crossnet, "CSrcId", "GDstId", snap.TStr64V()) 133 | except ConfigParser.NoOptionError: 134 | logging.info('Skipping Chemical-Gene Cross-Net') 135 | 136 | try: 137 | function_function_crossnet_file = config.get('Cross-Net', 'Function-Function') 138 | ffcschema = snap.Schema() 139 | ffcschema.Add(snap.TStrTAttrPr("FFEdgeId", snap.atStr)) 140 | ffcschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 141 | ffcschema.Add(snap.TStrTAttrPr("FSrcId", snap.atStr)) 142 | ffcschema.Add(snap.TStrTAttrPr("FDstId", snap.atStr)) 143 | 
function_function_crossnet = snap.TTable.LoadSS(ffcschema, function_function_crossnet_file, context, "\t", snap.TBool(False)) 144 | logging.info('Done loading Function-Function Cross-Net') 145 | snap.LoadCrossNetToNet(Graph, "Function", "Function", "Function-Function", function_function_crossnet, "FSrcId", "FDstId", snap.TStr64V()) 146 | except ConfigParser.NoOptionError: 147 | logging.info('Skipping Function-Function Cross-Net') 148 | 149 | try: 150 | gene_function_crossnet_file = config.get('Cross-Net', 'Gene-Function') 151 | gfcschema = snap.Schema() 152 | gfcschema.Add(snap.TStrTAttrPr("GFEdgeId", snap.atStr)) 153 | gfcschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 154 | gfcschema.Add(snap.TStrTAttrPr("GSrcId", snap.atStr)) 155 | gfcschema.Add(snap.TStrTAttrPr("FDstId", snap.atStr)) 156 | gene_function_crossnet = snap.TTable.LoadSS(gfcschema, gene_function_crossnet_file, context, "\t", snap.TBool(False)) 157 | logging.info('Done loading Gene-Function Cross-Net') 158 | snap.LoadCrossNetToNet(Graph, "Gene", "Function", "Gene-Function", gene_function_crossnet, "GSrcId", "FDstId", snap.TStr64V()) 159 | except ConfigParser.NoOptionError: 160 | logging.info('Skipping Gene-Function Cross-Net') 161 | 162 | try: 163 | gene_protein_crossnet_file = config.get('Cross-Net', 'Gene-Protein') 164 | gpcschema = snap.Schema() 165 | gpcschema.Add(snap.TStrTAttrPr("GPEdgeId", snap.atStr)) 166 | gpcschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 167 | gpcschema.Add(snap.TStrTAttrPr("GSrcId", snap.atStr)) 168 | gpcschema.Add(snap.TStrTAttrPr("PDstId", snap.atStr)) 169 | gene_protein_crossnet = snap.TTable.LoadSS(gpcschema, gene_protein_crossnet_file, context, "\t", snap.TBool(False)) 170 | logging.info('Done loading Gene-Protein Cross-Net') 171 | snap.LoadCrossNetToNet(Graph, "Gene", "Protein", "Gene-Protein", gene_protein_crossnet, "GSrcId", "PDstId", snap.TStr64V()) 172 | except ConfigParser.NoOptionError: 173 | logging.info('Skipping Gene-Protein Cross-Net') 174 | 
try: 175 | protein_protein_crossnet_file = config.get('Cross-Net', 'Protein-Protein') 176 | ppcschema = snap.Schema() 177 | ppcschema.Add(snap.TStrTAttrPr("PPEdgeId", snap.atStr)) 178 | ppcschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 179 | ppcschema.Add(snap.TStrTAttrPr("PSrcId", snap.atStr)) 180 | ppcschema.Add(snap.TStrTAttrPr("PDstId", snap.atStr)) 181 | protein_protein_crossnet = snap.TTable.LoadSS(ppcschema, protein_protein_crossnet_file, context, "\t", snap.TBool(False)) 182 | logging.info('Done loading Protein-Protein Cross-Net') 183 | snap.LoadCrossNetToNet(Graph, "Protein", "Protein", "Protein-Protein", protein_protein_crossnet, "PSrcId", "PDstId", snap.TStr64V()) 184 | except ConfigParser.NoOptionError: 185 | logging.info('Skipping Protein-Protein Cross-Net') 186 | try: 187 | disease_disease_crossnet_file = config.get('Cross-Net', 'Disease-Disease') 188 | ddcschema = snap.Schema() 189 | ddcschema.Add(snap.TStrTAttrPr("DDEdgeId", snap.atStr)) 190 | ddcschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 191 | ddcschema.Add(snap.TStrTAttrPr("DSrcId", snap.atStr)) 192 | ddcschema.Add(snap.TStrTAttrPr("DDstId", snap.atStr)) 193 | disease_disease_crossnet = snap.TTable.LoadSS(ddcschema, disease_disease_crossnet_file, context, "\t", snap.TBool(False)) 194 | logging.info('Done loading Disease-Disease Cross-Net') 195 | snap.LoadCrossNetToNet(Graph, "Disease", "Disease", "Disease-Disease", disease_disease_crossnet, "DSrcId", "DDstId", snap.TStr64V()) 196 | except ConfigParser.NoOptionError: 197 | logging.info('Skipping Disease-Disease Cross-Net') 198 | 199 | try: 200 | disease_gene_crossnet_file = config.get('Cross-Net', 'Disease-Gene') 201 | dgcschema = snap.Schema() 202 | dgcschema.Add(snap.TStrTAttrPr("DGEdgeId", snap.atStr)) 203 | dgcschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 204 | dgcschema.Add(snap.TStrTAttrPr("DSrcId", snap.atStr)) 205 | dgcschema.Add(snap.TStrTAttrPr("GDstId", snap.atStr)) 206 | disease_gene_crossnet = 
snap.TTable.LoadSS(dgcschema, disease_gene_crossnet_file, context, "\t", snap.TBool(False)) 207 | logging.info('Done loading Disease-Gene Cross-Net') 208 | snap.LoadCrossNetToNet(Graph, "Disease", "Gene", "Disease-Gene", disease_gene_crossnet, "DSrcId", "GDstId", snap.TStr64V()) 209 | except ConfigParser.NoOptionError: 210 | logging.info('Skipping Disease-Gene Cross-Net') 211 | 212 | try: 213 | disease_function_crossnet_file = config.get('Cross-Net', 'Disease-Function') 214 | dfcschema = snap.Schema() 215 | dfcschema.Add(snap.TStrTAttrPr("DFEdgeId", snap.atStr)) 216 | dfcschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 217 | dfcschema.Add(snap.TStrTAttrPr("DSrcId", snap.atStr)) 218 | dfcschema.Add(snap.TStrTAttrPr("FDstId", snap.atStr)) 219 | disease_function_crossnet = snap.TTable.LoadSS(dfcschema, disease_function_crossnet_file, context, "\t", snap.TBool(False)) 220 | logging.info('Done loading Disease-Function Cross-Net') 221 | snap.LoadCrossNetToNet(Graph, "Disease", "Function", "Disease-Function", disease_function_crossnet, "DSrcId", "FDstId", snap.TStr64V()) 222 | except ConfigParser.NoOptionError: 223 | logging.info('Skipping Disease-Function Cross-Net') 224 | 225 | try: 226 | disease_chemical_crossnet_file = config.get('Cross-Net', 'Disease-Chemical') 227 | dccschema = snap.Schema() 228 | dccschema.Add(snap.TStrTAttrPr("DCEdgeId", snap.atStr)) 229 | dccschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 230 | dccschema.Add(snap.TStrTAttrPr("DSrcId", snap.atStr)) 231 | dccschema.Add(snap.TStrTAttrPr("CDstId", snap.atStr)) 232 | disease_chemical_crossnet = snap.TTable.LoadSS(dccschema, disease_chemical_crossnet_file, context, "\t", snap.TBool(False)) 233 | logging.info('Done loading Disease-Chemical Cross-Net') 234 | snap.LoadCrossNetToNet(Graph, "Disease", "Chemical", "Disease-Chemical", disease_chemical_crossnet, "DSrcId", "CDstId", snap.TStr64V()) 235 | except ConfigParser.NoOptionError: 236 | logging.info('Skipping Disease-Chemical Cross-Net') 237 
| 238 | # Save the graph 239 | logging.info('Saving Multi-Modal Network to disk') 240 | outputPath = os.path.join(args.output_dir, args.outputf) 241 | FOut = snap.TFOut(outputPath) 242 | Graph.Save(FOut) 243 | FOut.Flush() 244 | --------------------------------------------------------------------------------