├── .gitignore ├── Chemical-Chemical ├── README.txt ├── make_drugbank_chemical_chemical.py └── parse_drugbank_chemical_chemical.py ├── Chemical-Gene ├── README.txt ├── make_drugbank_chemical_gene.py └── parse_drugbank_chemical_gene.py ├── Chemical ├── README.txt ├── make_snap_chemical_mode_table.py ├── newChemParser.py ├── parse_drugbank_chemicals.py └── test_db_parse.py ├── Disease-Chemical ├── README.txt └── make_disease_chem_ctd.py ├── Disease-Disease ├── README.txt └── parse_do_disease_disease.py ├── Disease-Function ├── README.txt └── make_disease_func_ctd.py ├── Disease-Gene ├── README.txt ├── make_disease_gene_ctd.py └── make_disease_gene_disgenet.py ├── Disease ├── README.txt ├── parse_ctd_diseases.py ├── parse_do_diseases.py └── parse_omim_diseases.py ├── Function-Function ├── README.txt └── parse_obo_for_functions.py ├── Function ├── README.txt └── parse_obo_for_functions.py ├── Gene-Function └── README.txt ├── Gene-Protein ├── README.txt └── fetch_ensembl_id_mapping.py ├── Gene └── README.txt ├── Protein-Protein └── README.txt ├── Protein ├── README.txt └── add_organism.py ├── README.txt ├── Utils ├── README.txt ├── create_snap_crossnet_table.py ├── create_snap_mode_equiv_table.py ├── create_snap_mode_table.py ├── extract_edge_list.py ├── extract_unique_node_ids.py ├── getStats.py └── utils.py ├── drugbank ├── edges │ ├── README.txt │ ├── getDrugInteractions.py │ ├── getGeneInteractions.py │ ├── make-edges.sh │ ├── makeEdgeTableCC.py │ └── makeEdgeTableCG.py └── nodes │ ├── README.txt │ ├── make-nodes.sh │ ├── makeNodeTables.py │ └── parseDrugbank.py └── examples ├── README.txt ├── config.txt ├── miner_get_stats.py └── miner_load_tables.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled source # 2 | ################### 3 | *.pyc 4 | -------------------------------------------------------------------------------- /Chemical-Chemical/README.txt: 
-------------------------------------------------------------------------------- 1 | Current datasets containing chemical-chemical information: 2 | - drugbank 3 | 4 | Workflow: 5 | 6 | Input Files: 7 | /path/to/input/drugbank.xml 8 | /path/to/input/miner-chemical-0-drugbank-20160523.tsv 9 | 10 | Intermediate Files: 11 | /path/to/intermediate/drugbank_parsed_chemical_chemical.tsv 12 | 13 | Output Files: 14 | /path/to/output/miner-chemical-chemical-20160423.tsv 15 | /path/to/output/miner-chemical-chemical-0-drugbank-20160423.tsv 16 | 17 | # Parse data 18 | python parse_drugbank_chemical_chemical.py /path/to/input/drugbank.xml --output_dir /path/to/intermediate/ 19 | 20 | # Create crossnet tables 21 | python make_drugbank_chemical_chemical.py /path/to/intermediate/drugbank_parsed_chemical_chemical.tsv ./../Chemical/miner-chemical-0-drugbank-20160523.tsv --output_dir /path/to/output/ 22 | 23 | Usage of the scripts used: 24 | 25 | ------------------------------------------ 26 | file : parse_drugbank_chemical_chemical.py 27 | ------------------------------------------ 28 | 29 | XML parser to parse the drugbank database for chemical chemical interactions. 30 | Outputs a tab separated .tsv file with the following coloumn headers: 31 | DrugbankId DrugbankId 32 | 33 | Usage: 34 | python parse_drugbank_chemical_chemical.py 35 | 36 | Positional Arguments: 37 | input_file : Path to the durgbank.xml file. 38 | 39 | Optional Arugments: 40 | --output_dir : Directory to create output files. Defaults to the current working directory. 
41 | 42 | Example Usage: 43 | Input File: drugbank.xml 44 | 45 | Output directory : outputs/chemical/ 46 | 47 | Comamnd line: 48 | python parse_drugbank_chemical_chemical.py drugbank.xml --output_dir outputs/chemicals/ 49 | 50 | Output: 51 | drugbank_parsed_chemical_chemical.tsv 52 | 53 | ------------------------------------------ 54 | file : make_drugbank_chemical_chemical.py 55 | ------------------------------------------ 56 | 57 | Script to output chemical chemical interactions. 58 | 59 | Usage: 60 | python make_drugbank_chemical_chemical.py 61 | 62 | Positional Arguments: 63 | input_file : Path to chemical chemical interaction file (drugbank_parsed_chemical_chemical.tsv) 64 | mode_file : Path to chemical mode file (miner-chemical-0-drugbank-20160523.tsv) 65 | 66 | Optional Arugments: 67 | --output_dir : Directory to create output files. Defaults to the current working directory. 68 | 69 | Example Usage: 70 | Input File: drugbank_parsed_chemical_chemical.tsv, miner-chemical-0-drugbank-20160523.tsv 71 | 72 | Output directory : outputs/chemical/ 73 | 74 | Comamnd line: 75 | python make_drugbank_chemical_chemical.py drugbank_parsed_chemical_chemical.tsv ./../nodes/miner-chemical-0-drugbank-20160523.tsv --output_dir outputs/chemicals/ 76 | 77 | Output: 78 | miner-chemical-chemical-20160423.tsv, miner-chemical-chemical-0-drugbank-20160423.tsv 79 | 80 | 81 | -------------------------------------------------------------------------------- /Chemical-Chemical/make_drugbank_chemical_chemical.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file : make_drugbank_chemical_chemical.py 3 | author: Agrim Gupta 4 | 5 | Script to output chemical chemical interactions. 
6 | 7 | Usage: 8 | python make_drugbank_chemical_chemical.py 9 | 10 | Positional Arguments: 11 | input_file : Path to chemical chemical interaction file (drugbank_parsed_chemical_chemical.tsv) 12 | mode_file : Path to chemical mode file (miner-chemical-0-drugbank-20160523.tsv) 13 | 14 | Optional Arugments: 15 | --output_dir : Directory to create output files. Defaults to the current working directory. 16 | 17 | Example Usage: 18 | Input File: drugbank_parsed_chemical_chemical.tsv, miner-chemical-0-drugbank-20160523.tsv 19 | 20 | Output directory : outputs/chemical/ 21 | 22 | Comamnd line: 23 | python make_drugbank_chemical_chemical.py drugbank_parsed_chemical_chemical.tsv ./../nodes/miner-chemical-0-drugbank-20160523.tsv --output_dir outputs/chemicals/ 24 | 25 | Output: 26 | miner-chemical-chemical-20160423.tsv, miner-chemical-chemical-0-drugbank-20160423.tsv 27 | ''' 28 | from collections import defaultdict 29 | import os 30 | import argparse 31 | from datetime import datetime 32 | 33 | parser = argparse.ArgumentParser(description='Output crossnet for chemical chemical interaction') 34 | parser.add_argument('input_file', help='input file path. File should be parsed chemical-chemical interaction') 35 | parser.add_argument('mode_file', help='mode file path. 
File should be the chemical mode file') 36 | parser.add_argument('--output_dir', help='directory to output files', default='.') 37 | args = parser.parse_args() 38 | sep = "\t" 39 | empty = "NULL" 40 | format = '%Y%m%d' 41 | dateStr = datetime.now().strftime(format) 42 | 43 | snapIdPrefix = "" 44 | edgeFile = args.input_file 45 | nodeMap = args.mode_file 46 | masterTable = os.path.join(args.output_dir, "miner-chemical-chemical-" + dateStr + ".tsv") 47 | subTable = os.path.join(args.output_dir, "miner-chemical-chemical-0-drugbank-" + dateStr + ".tsv") 48 | idNum = 0 49 | # Make a dict mapping from drugbankId to snapChemId 50 | drugbankSnap = {} 51 | with open(nodeMap, 'r') as f: 52 | for line in f: 53 | if line.startswith('#'): 54 | continue 55 | line = line.strip().split(sep) 56 | drugbankSnap[line[1]] = line[0] 57 | 58 | drugsDone = defaultdict(list) 59 | with open(edgeFile, 'r') as f, open(masterTable, 'w') as master, open(subTable, 'w') as sub: 60 | master.write('# snap_edge_id\tdataset_id\tsnap_source_id\tsnap_dst_id\n') 61 | sub.write('# snap_edge_id\tdataset_source_id\tdataset_dst_id\n') 62 | for line in f: 63 | if line.startswith('#'): 64 | continue 65 | line = line.strip().split(sep) 66 | if line[1] in drugsDone[line[0]]: 67 | continue 68 | if line[0] not in drugbankSnap or line[1] not in drugbankSnap: 69 | continue 70 | drugsDone[line[0]].append(line[1]) 71 | snapId = snapIdPrefix + str(idNum) 72 | idNum += 1 73 | master.write(snapId + sep + "0" + sep + drugbankSnap[line[0]] + sep + drugbankSnap[line[1]] + sep + line[2] + '\n') 74 | sub.write(snapId + sep + line[0] + sep + line[1] + sep + line[2] + '\n') 75 | 76 | 77 | -------------------------------------------------------------------------------- /Chemical-Chemical/parse_drugbank_chemical_chemical.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file : parse_drugbank_chemical_chemical.py 3 | author: Agrim Gupta 4 | 5 | XML parser to parse the drugbank 
database for chemical chemical interactions. 6 | Outputs a tab separated .tsv file with the following coloumn headers: 7 | DrugbankId DrugbankId 8 | 9 | Usage: 10 | python parse_drugbank_chemical_chemical.py 11 | 12 | Positional Arguments: 13 | input_file : Path to the durgbank.xml file. 14 | 15 | Optional Arugments: 16 | --output_dir : Directory to create output files. Defaults to the current working directory. 17 | 18 | Example Usage: 19 | Input File: drugbank.xml 20 | 21 | Output directory : outputs/chemical/ 22 | 23 | Comamnd line: 24 | python parse_drugbank_chemical_chemical.py drugbank.xml --output_dir outputs/chemicals/ 25 | 26 | Output: 27 | drugbank_parsed_chemical_chemical.tsv 28 | ''' 29 | 30 | from bs4 import BeautifulSoup 31 | import os 32 | import argparse 33 | 34 | parser = argparse.ArgumentParser(description='Parse Durgbank database for drug drug interaction') 35 | parser.add_argument('input_file', help='input file path. File should be the drugbank.xml file.') 36 | parser.add_argument('--output_dir', help='directory to output files', default='.') 37 | args = parser.parse_args() 38 | outputFile = os.path.join(args.output_dir, "drugbank_parsed_chemical_chemical.tsv") 39 | soup = BeautifulSoup(open(args.input_file),"xml") 40 | sep = "\t" 41 | empty = "NULL" 42 | with open(outputFile, 'w') as f: 43 | for drug in soup.findAll("drug"): 44 | drugName = drug.find("drugbank-id").text 45 | interactions = drug.findAll("drug-interaction") 46 | if not interactions: 47 | continue 48 | for i in interactions: 49 | toPrint = drugName + sep + i.find("drugbank-id").text + sep + i.find("description").text 50 | f.write(toPrint.encode('utf-8') + '\n') 51 | -------------------------------------------------------------------------------- /Chemical-Gene/README.txt: -------------------------------------------------------------------------------- 1 | Current datasets containing chemical-gene information: 2 | - drugbank 3 | 4 | Workflow: 5 | 6 | Input Files: 7 | 
/path/to/input/drugbank.xml 8 | /path/to/input/miner-chemical-0-drugbank-20160523.tsv 9 | /path/to/input/miner-gene-0-20160523.tsv 10 | 11 | Intermediate Files: 12 | /path/to/intermediate/drugbank_parsed_chemical_gene.tsv 13 | 14 | Output Files: 15 | /path/to/output/miner-chemical-gene-20160423.tsv 16 | /path/to/output/miner-chemical-gene-0-drugbank-20160423.tsv 17 | 18 | # Parse data 19 | python parse_drugbank_chemical_gene.py /path/to/input/drugbank.xml --output_dir /path/to/intermediate/ 20 | 21 | # Create crossnet tables 22 | python make_drugbank_chemical_gene.py /path/to/intermediate/drugbank_parsed_chemical_gene.tsv ./../nodes/miner-chemical-0-drugbank-20160523.tsv miner-genes-0-go-20160523.tsv --output_dir /path/to/output/ 23 | 24 | 25 | Usage of the scripts used: 26 | 27 | -------------------------------------- 28 | file : parse_drugbank_chemical_gene.py 29 | -------------------------------------- 30 | 31 | XML parser to parse the drugbank database for chemical gene interactions. 32 | Outputs a tab separated .tsv file with the following coloumn headers: 33 | DrugbankId Gene1 Gene2 ... 34 | Currently UniportID is used for genes. 35 | 36 | Usage: 37 | python parse_drugbank_chemical_gene.py 38 | 39 | Positional Arguments: 40 | input_file : Path to the durgbank.xml file. 41 | 42 | Optional Arugments: 43 | --output_dir : Directory to create output files. Defaults to the current working directory. 44 | 45 | Example Usage: 46 | Input File: drugbank.xml 47 | 48 | Output directory : outputs/chemical/ 49 | 50 | Comamnd line: 51 | python parse_drugbank_chemical_gene.py drugbank.xml --output_dir outputs/chemicals/ 52 | 53 | Output: 54 | drugbank_parsed_chemical_gene.tsv 55 | 56 | ------------------------------------- 57 | file : make_drugbank_chemical_gene.py 58 | ------------------------------------- 59 | 60 | Script to output chemical gene interactions. 
61 | 62 | Usage: 63 | python make_drugbank_chemical_gene.py 64 | 65 | Positional Arguments: 66 | input_file : Path to chemical chemical interaction file (drugbank_parsed_chemical_gene.tsv) 67 | chemical_mode : Path to chemical mode file (miner-chemical-0-drugbank-20160523.tsv) 68 | gene_mode : Path to gene mode file (miner-genes-0-go-20160523.tsv) 69 | 70 | Optional Arugments: 71 | --output_dir : Directory to create output files. Defaults to the current working directory. 72 | 73 | Example Usage: 74 | Input File: drugbank_parsed_chemical_gene.tsv, miner-chemical-0-drugbank-20160523.tsv 75 | 76 | Output directory : outputs/chemical/ 77 | 78 | Comamnd line: 79 | python make_drugbank_chemical_gene.py drugbank_parsed_chemical_gene.tsv ./../nodes/miner-chemical-0-drugbank-20160523.tsv miner-genes-0-go-20160523.tsv --output_dir outputs/chemicals/ 80 | 81 | Output: 82 | miner-chemical-gene-20160423.tsv, miner-chemical-gene-0-drugbank-20160423.tsv 83 | 84 | -------------------------------------------------------------------------------- /Chemical-Gene/make_drugbank_chemical_gene.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file : make_drugbank_chemical_gene.py 3 | author: Agrim Gupta 4 | 5 | Script to output chemical gene interactions. 6 | 7 | Usage: 8 | python make_drugbank_chemical_gene.py 9 | 10 | Positional Arguments: 11 | input_file : Path to chemical chemical interaction file (drugbank_parsed_chemical_gene.tsv) 12 | chemical_mode : Path to chemical mode file (miner-chemical-0-drugbank-20160523.tsv) 13 | gene_mode : Path to gene mode file (miner-genes-0-go-20160523.tsv) 14 | 15 | Optional Arugments: 16 | --output_dir : Directory to create output files. Defaults to the current working directory. 
17 | 18 | Example Usage: 19 | Input File: drugbank_parsed_chemical_gene.tsv, miner-chemical-0-drugbank-20160523.tsv 20 | 21 | Output directory : outputs/chemical/ 22 | 23 | Comamnd line: 24 | python make_drugbank_chemical_gene.py drugbank_parsed_chemical_gene.tsv ./../nodes/miner-chemical-0-drugbank-20160523.tsv miner-genes-0-go-20160523.tsv --output_dir outputs/chemicals/ 25 | 26 | Output: 27 | miner-chemical-gene-20160423.tsv, miner-chemical-gene-0-drugbank-20160423.tsv 28 | ''' 29 | from collections import defaultdict 30 | import os 31 | import argparse 32 | from datetime import datetime 33 | 34 | parser = argparse.ArgumentParser(description='Output crossnet for chemical chemical interaction') 35 | parser.add_argument('input_file', help='input file path. File should be parsed chemical-gene interaction') 36 | parser.add_argument('chemical_mode', help='chemical mode file path. File should be the chemical mode file') 37 | parser.add_argument('gene_mode', help='gene mode file path. File should be the gene mode file') 38 | parser.add_argument('--output_dir', help='directory to output files', default='.') 39 | args = parser.parse_args() 40 | sep = "\t" 41 | empty = "NULL" 42 | format = '%Y%m%d' 43 | dateStr = datetime.now().strftime(format) 44 | snapIdPrefix = "" 45 | 46 | edgeFile = args.input_file 47 | nodeMap = args.chemical_mode 48 | geneMap = args.gene_mode 49 | masterTable = os.path.join(args.output_dir, "miner-chemical-gene-" + dateStr + ".tsv") 50 | subTable = os.path.join(args.output_dir, "miner-chemical-gene-0-drugbank-" + dateStr + ".tsv") 51 | idNum = 0 52 | # Make a dict mapping from drugbankId to snapChemId 53 | drugbankSnap = {} 54 | with open(nodeMap, 'r') as f: 55 | for line in f: 56 | if line.startswith('#'): 57 | continue 58 | line = line.strip().split(sep) 59 | drugbankSnap[line[1]] = line[0] 60 | 61 | # Make a dict mapping from UniProtKB to snapGeneId 62 | geneSnap = {} 63 | with open(geneMap, 'r') as f: 64 | for line in f: 65 | if 
line.startswith('#'): 66 | continue 67 | line = line.strip().split('\t') 68 | geneSnap[line[1]] = line[0] 69 | 70 | with open(edgeFile, 'r') as f, open(masterTable, 'w') as master, open(subTable, 'w') as sub: 71 | master.write('# snap_edge_id\tdataset_id\tsnap_source_id\tsnap_dst_id\n') 72 | sub.write('# snap_edge_id\tdataset_source_id\tdataset_dst_id\n') 73 | for line in f: 74 | if line.startswith('#'): 75 | continue 76 | line = line.strip().split(sep) 77 | if line[0] not in drugbankSnap: 78 | continue 79 | geneList = line[1].split(",") 80 | if geneList[0] == "NULL": 81 | continue 82 | for gene in geneList: 83 | snapId = snapIdPrefix + str(idNum) 84 | idNum += 1 85 | if gene not in geneSnap: 86 | print gene 87 | continue 88 | master.write(snapId + sep + "0" + sep + drugbankSnap[line[0]] + sep + geneSnap[gene] + '\n') 89 | sub.write(snapId + sep + line[0] + sep + gene + '\n') 90 | 91 | 92 | -------------------------------------------------------------------------------- /Chemical-Gene/parse_drugbank_chemical_gene.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file : parse_drugbank_chemical_gene.py 3 | author: Agrim Gupta 4 | 5 | XML parser to parse the drugbank database for chemical gene interactions. 6 | Outputs a tab separated .tsv file with the following coloumn headers: 7 | DrugbankId Gene1 Gene2 ... 8 | Currently UniportID is used for genes. 9 | 10 | Usage: 11 | python parse_drugbank_chemical_gene.py 12 | 13 | Positional Arguments: 14 | input_file : Path to the durgbank.xml file. 15 | 16 | Optional Arugments: 17 | --output_dir : Directory to create output files. Defaults to the current working directory. 
18 | 19 | Example Usage: 20 | Input File: drugbank.xml 21 | 22 | Output directory : outputs/chemical/ 23 | 24 | Comamnd line: 25 | python parse_drugbank_chemical_gene.py drugbank.xml --output_dir outputs/chemicals/ 26 | 27 | Output: 28 | drugbank_parsed_chemical_gene.tsv 29 | ''' 30 | 31 | from bs4 import BeautifulSoup 32 | import os 33 | import argparse 34 | 35 | parser = argparse.ArgumentParser(description='Parse Durgbank database for drug gene interaction') 36 | parser.add_argument('input_file', help='input file path. File should be the drugbank.xml file.') 37 | parser.add_argument('--output_dir', help='directory to output files', default='.') 38 | args = parser.parse_args() 39 | outputFile = os.path.join(args.output_dir, "drugbank_parsed_chemical_gene.tsv") 40 | soup = BeautifulSoup(open(args.input_file),"xml") 41 | sep = "\t" 42 | empty = "NULL" 43 | #geneIdentifier = "HUGO Gene Nomenclature Committee (HGNC)" 44 | geneIdentifier = "UniProtKB" 45 | with open(outputFile, 'w') as f: 46 | for drug in soup.findAll("drug"): 47 | toPrint = "" 48 | toPrint += drug.find("drugbank-id").text + sep 49 | # Get target Genes 50 | targets = drug.findAll("target") 51 | targetGene = [] 52 | if targets: 53 | for target in targets: 54 | externIden = target.findAll("external-identifier") 55 | if not externIden: 56 | continue 57 | for iden in externIden: 58 | if iden.find("resource").text == geneIdentifier: 59 | targetGene.append(iden.find("identifier").text) 60 | # Get Enzyme Gene 61 | enzymes = drug.findAll("enzyme") 62 | enzymeGene = [] 63 | if enzymes: 64 | for enzyme in enzymes: 65 | externIden = enzyme.findAll("external-identifier") 66 | if not externIden: 67 | continue 68 | for iden in externIden: 69 | if iden.find("resource").text == geneIdentifier: 70 | enzymeGene.append(iden.find("identifier").text) 71 | allGene = targetGene + enzymeGene 72 | if len(allGene) == 0: 73 | toPrint += empty 74 | else: 75 | toPrint += ','.join(allGene) 76 | f.write(toPrint.encode('utf-8') + 
'\n') 77 | -------------------------------------------------------------------------------- /Chemical/README.txt: -------------------------------------------------------------------------------- 1 | Current datasets containing chemical information: 2 | - Drugbank 3 | 4 | Workflow for creating mode tables for chemicals: 5 | 6 | Input files: 7 | /path/to/input/drugbank.xml 8 | 9 | Intermediate files: 10 | /path/to/intermediate/drugbank_parsed.tsv 11 | Output files: 12 | /path/to/output/miner-chemical-20160523.tsv 13 | /path/to/output/miner-chemical-0-drugbank-20160523.tsv 14 | /path/to/output/miner-chemical-1-PubChemCompound-20160523.tsv 15 | /path/to/output/miner-chemical-2-PubChemSubstance-20160523.tsv 16 | /path/to/output/miner-chemical-equiv-20160523.tsv 17 | 18 | # Parse Data 19 | # Beautiful Soup is required for this, use pipenv to install if you lack permissions 20 | python parse_drugbank_chemicals.py /path/to/input/drugbank.xml --output-dir /path/to/intermediate/ 21 | 22 | # Create Mode tables 23 | python make_snap_chemical_mode.py /path/to/intermediate/drugbank_parsed.tsv --output_dir /path/to/intermediate 24 | 25 | Usage of the scripts used: 26 | 27 | ---------------------------------- 28 | file : parse_drugbank_chemicals.py 29 | ---------------------------------- 30 | 31 | XML parser to parse the drugbank database and output a tsv file 32 | containting the following coloumn headers: 33 | DrugbankID PubChem_Compound PubChem_Substance 34 | 35 | Usage: 36 | python parse_drugbank_chemicals.py 37 | 38 | Positional Arguments: 39 | input_file : Path to the durgbank.xml file. 40 | 41 | Optional Arugments: 42 | --output_dir : Directory to create output files. Defaults to the current working directory. 
43 | 44 | Example Usage: 45 | Input File: drugbank.xml 46 | 47 | Output directory : outputs/chemical/ 48 | 49 | Comamnd line: 50 | python parse_drugbank_chemicals.py drugbank.xml --output_dir outputs/chemicals/ 51 | 52 | Output: 53 | drugbank_parsed.tsv 54 | 55 | --------------------------------------- 56 | file : make_snap_chemical_mode_table.py 57 | --------------------------------------- 58 | 59 | Takes input parsed durgbank.xml with the following coloumn headers: 60 | DrugbankID PubChem_Compound PubChem_Substance. Outputs snap tables for 61 | chemical mode. 62 | 63 | Usage: 64 | python make_snap_chemical_mode.py 65 | 66 | Positional Arguments: 67 | input_file : Path to parsed drugbank.xml. 68 | 69 | Optional Arugments: 70 | --output_dir : Directory to create output files. Defaults to the current working directory. 71 | 72 | Example Usage: 73 | Input File: drugbank_parsed.tsv 74 | 75 | Output directory : outputs/chemical/ 76 | 77 | Comamnd line: 78 | python make_snap_chemical_mode.py drugbank_parsed.tsv --output_dir outputs/chemicals/ 79 | 80 | Output: 81 | miner-chemical-20160523.tsv, miner-chemical-0-drugbank-20160523.tsv, 82 | miner-chemical-1-PubChemCompound-20160523.tsv, miner-chemical-2-PubChemSubstance-20160523.tsv, 83 | miner-chemical-equiv-20160523.tsv 84 | 85 | 86 | -------------------------------------------------------------------------------- /Chemical/make_snap_chemical_mode_table.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file : make_snap_chemical_mode_table.py 3 | author: Agrim Gupta 4 | 5 | Takes input parsed durgbank.xml with the following coloumn headers: 6 | DrugbankID PubChem_Compound PubChem_Substance. Outputs snap tables for 7 | chemical mode. 8 | 9 | Usage: 10 | python make_snap_chemical_mode.py 11 | 12 | Positional Arguments: 13 | input_file : Path to parsed drugbank.xml. 14 | 15 | Optional Arugments: 16 | --output_dir : Directory to create output files. 
Defaults to the current working directory. 17 | 18 | Example Usage: 19 | Input File: drugbank_parsed.tsv 20 | 21 | Output directory : outputs/chemical/ 22 | 23 | Comamnd line: 24 | python make_snap_chemical_mode.py drugbank_parsed.tsv --output_dir outputs/chemicals/ 25 | 26 | Output: 27 | miner-chemical-20160523.tsv, miner-chemical-0-drugbank-20160523.tsv, 28 | miner-chemical-1-PubChemCompound-20160523.tsv, miner-chemical-2-PubChemSubstance-20160523.tsv, 29 | miner-chemical-equiv-20160523.tsv 30 | ''' 31 | import itertools 32 | import os 33 | from datetime import datetime 34 | import argparse 35 | 36 | sep = "\t" 37 | empty = "NULL" 38 | snapIdPrefix = "" 39 | idNum = 0 40 | format = '%Y%m%d' 41 | dateStr = datetime.now().strftime(format) 42 | 43 | parser = argparse.ArgumentParser(description='Make mode tables for chemical') 44 | parser.add_argument('input_file', help='input file path. File should be the parsed drugbank.xml') 45 | parser.add_argument('--output_dir', help='directory to output files', default='.') 46 | args = parser.parse_args() 47 | 48 | #output files 49 | masterTable = os.path.join(args.output_dir,"miner-chemical-" + dateStr + ".tsv") 50 | drugbankTable = os.path.join(args.output_dir, "miner-chemical-0-drugbank-" + dateStr + ".tsv") 51 | pubCompundTable = os.path.join(args.output_dir, "miner-chemical-1-PubChemCompound-" + dateStr + ".tsv") 52 | pubSubTable = os.path.join(args.output_dir, "miner-chemical-2-PubChemSubstance-" + dateStr + ".tsv") 53 | eqTable = os.path.join(args.output_dir, "miner-chemical-equiv-" + dateStr + ".tsv") 54 | 55 | subTable = [drugbankTable, pubCompundTable, pubSubTable] 56 | databases = ["drugbank", "PubChemCompound" , "PubChemSubstance"] 57 | subHandle = [open(subTable[i], 'w') for i in xrange(len(subTable))] 58 | # Add Header 59 | for i in xrange(len((subHandle))): 60 | subHandle[i].write('# snap_id\t%s specific id\n' % databases[i]) 61 | 62 | with open(args.input_file, 'r') as input, open(masterTable, 'w') as 
master,open(eqTable, 'w') as eqTable: 63 | master.write('# snap_id\tdataset_id\n') 64 | eqTable.write('# Equivalence table for mode chemical\n') 65 | eqTable.write('# snap_id_1\tsnap_id_2\n') 66 | for line in input: 67 | if line.startswith('#'): 68 | continue 69 | line = line.strip().split(sep) 70 | currId = [] 71 | # Only first three fields are relavant 72 | for num,id in enumerate(line): 73 | if num > 2: 74 | break 75 | if id == "NULL": 76 | continue 77 | snapId = snapIdPrefix + str(idNum) 78 | idNum += 1 79 | master.write(snapId + sep + str(num) + '\n') 80 | subHandle[num].write(snapId + sep + id + '\n') 81 | currId.append(snapId) 82 | allPerms = list(itertools.permutations(currId,2)) 83 | for perm in allPerms: 84 | toWrite = ' '.join(perm) 85 | eqTable.write(toWrite + '\n') 86 | 87 | [handle.close() for handle in subHandle] 88 | -------------------------------------------------------------------------------- /Chemical/newChemParser.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import os 3 | from datetime import datetime 4 | import argparse 5 | 6 | sep = "\t" 7 | empty = "NULL" 8 | snapIdPrefix = "" 9 | idNum = 0 10 | format = '%Y%m%d' 11 | dateStr = datetime.now().strftime(format) 12 | 13 | parser = argparse.ArgumentParser(description='Make mode tables for chemical') 14 | parser.add_argument('input_file', help='input file path. 
File should be the parsed drugbank.xml') 15 | parser.add_argument('--output_dir', help='directory to output files', default='.') 16 | args = parser.parse_args() 17 | 18 | masterTable = os.path.join(args.output_dir,"miner-chemical-" + dateStr + ".tsv") 19 | drugbankTable = os.path.join(args.output_dir, "miner-chemical-0-drugbank-" + dateStr + ".tsv") 20 | with open(args.input_file, 'r') as input, open(masterTable, 'w') as master, open(drugbankTable,'w') as drugTable: 21 | master.write('# snap_id\tdataset_id\n') 22 | drugTable.write('# snap_id\tdataset_id\tname\n') 23 | #eqTable.write('# Equivalence table for mode chemical\n') 24 | #eqTable.write('# snap_id_1\tsnap_id_2\n') 25 | for line in input: 26 | if line.startswith('#'): 27 | continue 28 | spline =line.strip().split(sep) 29 | if line.startswith('DB') and len(spline)>1: 30 | line = spline 31 | id = line[0] 32 | name = line[1] 33 | if name == "": 34 | name = "NULL" 35 | snapId = snapIdPrefix + str(idNum) 36 | idNum += 1 37 | master.write(snapId + sep + id + '\n') 38 | drugTable.write(snapId + sep + id + sep + name + '\n') 39 | -------------------------------------------------------------------------------- /Chemical/parse_drugbank_chemicals.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file : parse_drugbank_chemicals.py 3 | author: Agrim Gupta 4 | 5 | XML parser to parse the drugbank database and output a tsv file 6 | containting the following coloumn headers: 7 | DrugbankID PubChem_Compound PubChem_Substance 8 | 9 | Usage: 10 | python parse_drugbank_chemicals.py 11 | 12 | Positional Arguments: 13 | input_file : Path to the durgbank.xml file. 14 | 15 | Optional Arugments: 16 | --output_dir : Directory to create output files. Defaults to the current working directory. 
17 | 18 | Example Usage: 19 | Input File: drugbank.xml 20 | 21 | Output directory : outputs/chemical/ 22 | 23 | Comamnd line: 24 | python parse_drugbank_chemicals.py drugbank.xml --output_dir outputs/chemicals/ 25 | 26 | Output: 27 | drugbank_parsed.tsv 28 | ''' 29 | 30 | from bs4 import BeautifulSoup 31 | import os 32 | import argparse 33 | 34 | parser = argparse.ArgumentParser(description='Parse Durgbank database for chemicals') 35 | parser.add_argument('input_file', help='input file path. File should be the drugbank.xml file.') 36 | parser.add_argument('--output_dir', help='directory to output files', default='.') 37 | args = parser.parse_args() 38 | outputFile = os.path.join(args.output_dir, "drugbank_parsed.tsv") 39 | soup = BeautifulSoup(open(args.input_file),"xml") 40 | sep = "\t" 41 | empty = "NULL" 42 | fields = ["name", "description", "general-references", "synthesis-reference", 43 | "protein-binding", "classification", "salts", "synonyms", "products", "international-brands", "mixtures", 44 | "manufacturers", "prices", "categories", "dosages", "atc-codes", "food-interactions", "pathways","reactions", 45 | "snp-effects","snp-adverse-drug-reactions"] 46 | header = ["drugbankID", "pc_Compund", "pc_substance"] + fields; 47 | 48 | seenids = set() 49 | seen = set() 50 | 51 | def recur(elem,l): 52 | for e in elem.findChildren(): 53 | if not e.findChildren(): 54 | l.append(e.text.strip()) 55 | else: 56 | recur(e, l) 57 | 58 | with open(outputFile, 'w') as f: 59 | f.write("# " + sep.join(header) + '\n') 60 | for drug in soup.findAll("drug"): 61 | name = drug.find("name").text 62 | id = drug.find("drugbank-id").text 63 | if name not in seen or id not in seenids: 64 | chemFound = False 65 | toPrint = "" 66 | toPrint += drug.find("drugbank-id").text + sep 67 | seen.add(drug.find("name").text) 68 | seenids.add(id) 69 | identifiers = [i for i in drug.findAll("external-identifier")] 70 | for i in identifiers: 71 | database = i.find("resource").text 72 | if database != 
"PubChem Compound": 73 | continue 74 | value = i.find("identifier").text 75 | chemFound = True 76 | toPrint += value + sep 77 | if not chemFound: 78 | toPrint += empty + sep 79 | chemFound = False 80 | for i in identifiers: 81 | database = i.find("resource").text 82 | if database != "PubChem Substance": 83 | continue 84 | value = i.find("identifier").text 85 | chemFound = True 86 | toPrint += value + sep 87 | if not chemFound: 88 | toPrint += empty + sep 89 | attributes = [] 90 | for field in fields: 91 | l = [] 92 | if not drug.find(field): 93 | attributes.append(empty) 94 | continue 95 | if drug.find(field).findChildren(): 96 | recur(drug.find(field),l) 97 | for i in range(len(l)): 98 | if l[i] == "": 99 | l[i] = empty 100 | attributes.append("|".join(l).encode('utf-8')) 101 | else: 102 | if drug.find(field).text != "": 103 | genRef = drug.find(field).text 104 | genRef = genRef.split("\n") 105 | attributes.append("|".join(genRef).encode('utf-8')) 106 | #attributes.append(drug.find(field).text.encode('utf-8')) 107 | else: 108 | attributes.append(empty) 109 | toPrint = toPrint.encode('utf-8') + sep.join(attributes) 110 | f.write(toPrint + '\n') 111 | 112 | -------------------------------------------------------------------------------- /Chemical/test_db_parse.py: -------------------------------------------------------------------------------- 1 | #author: farzaan kaiyom (farzaank) 2 | #description: basic parser for drugbank, printing each nodeid and name 3 | #updated to work for 2019 dataset 4 | 5 | import os 6 | from bs4 import BeautifulSoup 7 | import argparse 8 | 9 | parser = argparse.ArgumentParser(description='Parse Durgbank database for chemicals') 10 | parser.add_argument('input_file', help='input file path. 
File should be the drugbank.xml file.') 11 | parser.add_argument('--output_dir', help='directory to output files', default='.') 12 | args = parser.parse_args() 13 | outputFile = os.path.join(args.output_dir, "drugbank_parse_test1.tsv") 14 | soup = BeautifulSoup(open(args.input_file),"xml") 15 | sep = "\t" 16 | empty = "NULL" 17 | 18 | seen=set() 19 | 20 | with open(outputFile, 'w') as f: 21 | f.write("# " + sep.join(header) + '\n') 22 | counter = 0 23 | drugs = soup.findAll("drug") 24 | for drug in drugs: 25 | drugline = '' 26 | if not drug.find("name"): 27 | name = empty 28 | else: 29 | name = drug.find("name").text 30 | if name not in seen: 31 | seen.add(name) 32 | counter += 1 33 | drugline += (str(counter)+" ") 34 | drugline += (str(name) + " ") 35 | drugline += drug.find('drugbank-id').text + sep 36 | try: 37 | f.write(drugline+'\n') 38 | except: 39 | drugline = drugline.encode('utf-8') 40 | f.write(drugline+'\n') 41 | -------------------------------------------------------------------------------- /Disease-Chemical/README.txt: -------------------------------------------------------------------------------- 1 | Current datasets containing disease-chemical information: 2 | - CTD 3 | 4 | Workflow for creating crossnet tables for disease-chemical relationships: 5 | 6 | Pre-requisites: 7 | Must have the disease modes table from CTD (MESH and OMIM), and the chemical 8 | mode table from Drugbank. 
9 | /path/to/disease_mode/miner-disease-1-CTD_MESH-20160521.tsv 10 | /path/to/disease_mode/miner-disease-2-CTD_OMIM-20160521.tsv 11 | /path/to/chemical_mode/miner-chemical-0-drugbank-20160521.tsv 12 | 13 | Input files/directories: 14 | /path/to/input/CTD_dir (from CTD) 15 | 16 | Intermediate files: 17 | /path/to/intermediate/ctd_disease_chem_parsed.tsv 18 | 19 | Output files: 20 | /path/to/output/miner-disease-chemical-20160521.tsv 21 | /path/to/output/miner-disease-chemical-0-CTD_MESH-20160521.tsv 22 | /path/to/output/miner-disease-chemical-1-CTD_OMIM-20160521.tsv 23 | 24 | # Create intermediate files 25 | python make_disease_chem_ctd.py /path/to/input/CTD_dir --output_dir /path/to/intermediate/ 26 | 27 | # Create cross net files 28 | python ../Utils/create_snap_crossnet_table.py /path/to/intermediate/ctd_disease_chem_parsed.tsv /path/to/disease_mode/miner-disease-1-CTD_MESH-20160521.tsv /path/to/chemical_mode/miner-chemical-0-drugbank-20160521.tsv CTD_MESH 0 --output_dir /path/to/output/ --skip_missing_ids 29 | python ../Utils/create_snap_crossnet_table.py /path/to/intermediate/ctd_disease_chem_parsed.tsv /path/to/disease_mode/miner-disease-2-CTD_OMIM-20160521.tsv /path/to/chemical_mode/miner-chemical-0-drugbank-20160521.tsv CTD_OMIM 1 --output_dir /path/to/output/ --skip_missing_ids 30 | -------------------------------------------------------------------------------- /Disease-Chemical/make_disease_chem_ctd.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file : make_disease_chem_ctd.py 3 | author: Viswajith Venugopal 4 | 5 | Parses CTD to find disease chemical links. 6 | 7 | Usage: 8 | python make_disease_chem_ctd [--output_dir OUTPUT_DIR] 9 | 10 | Positional Arguments: 11 | input_dir : The directory of the CTD files. 12 | 13 | Optional Arugments: 14 | --output_dir : Directory to create output files. Defaults to the current working directory. 
15 | 16 | Example Usage: 17 | Input File: CTD/0416_CTD 18 | 19 | Output directory : outputs/disease-chemical/ 20 | 21 | Comamnd line: 22 | python parse_do_diseases.py CTD/0416_CTD --output_dir outputs/disease-chemical/ 23 | 24 | Output: 25 | ctd_disease_chemical_parsed.tsv 26 | ''' 27 | 28 | from collections import defaultdict 29 | import os 30 | import argparse 31 | 32 | def get_chem_to_db(ctd_dir): 33 | ctd_chem_node_fname = os.path.join(ctd_dir, 'CTD_chemicals.tsv') 34 | chem_to_db_dict = {} 35 | # First, we load uniprot ids. 36 | with open(ctd_chem_node_fname, 'r') as ctd_gene_node_f: 37 | for line in ctd_gene_node_f: 38 | if line.startswith('#'): 39 | continue 40 | sp_line = line.strip('\n').split('\t') 41 | chem_id = sp_line[1] 42 | db_ids = sp_line[8] 43 | if len(db_ids) > 0: 44 | db_ids = db_ids.split('|') 45 | chem_to_db_dict[chem_id] = db_ids 46 | 47 | return chem_to_db_dict 48 | 49 | def parse_ctd_chem_diseases(ctd_dir): 50 | 51 | chem_to_db_dict = get_chem_to_db(ctd_dir) 52 | disease_chem_list = [] 53 | ctd_chem_dis_fname = os.path.join(ctd_dir, 'CTD_chemicals_diseases.tsv') 54 | with open(ctd_chem_dis_fname) as in_f: 55 | for line in in_f: 56 | if line.startswith('#'): 57 | continue 58 | sp_line = line.strip('\n').split('\t') 59 | chem_id = 'MESH:' + sp_line[1] 60 | if chem_id not in chem_to_db_dict: 61 | continue 62 | db_id = chem_to_db_dict[chem_id][0] 63 | disease_id = sp_line[4] 64 | inference_score = sp_line[7] 65 | if inference_score == "": 66 | inference_score = "0" 67 | disease_chem_list.append((disease_id, db_id,inference_score)) 68 | 69 | return disease_chem_list 70 | 71 | 72 | parser = argparse.ArgumentParser(description='Parse CTD to find disease-chemical links.') 73 | parser.add_argument('input_dir', help='Input files directory. 
This should be the directory with all the CTD TSVs.') 74 | parser.add_argument('--output_dir', help='Directory to output files', default='.') 75 | args = parser.parse_args() 76 | 77 | output_fname = os.path.join(args.output_dir, "ctd_disease_chem_parsed.tsv") 78 | 79 | disease_chem_list = parse_ctd_chem_diseases(args.input_dir) 80 | 81 | with open(output_fname, 'w') as out_f: 82 | out_f.write('#Disease Chemical links from CTD.\n') 83 | for (disease_id, db_id, iscore) in disease_chem_list: 84 | out_f.write('\t'.join([disease_id, db_id,iscore])) 85 | out_f.write('\n') 86 | -------------------------------------------------------------------------------- /Disease-Disease/README.txt: -------------------------------------------------------------------------------- 1 | Current datasets containing disease-disease information: 2 | - DiseaseOntology (has edges for the 'is_a' relationship) 3 | 4 | Workflow for creating crossnet tables for disease-disease relationships: 5 | 6 | Pre-requisites: 7 | Must have the disease mode table from DOID 8 | /path/to/mode/miner-disease-0-DOID-20160521.tsv 9 | 10 | Input files/directories: 11 | /path/to/input/doid.obo (from DOID) 12 | 13 | Intermediate files: 14 | /path/to/intermediate/doid_disease_disease_parsed.tsv 15 | 16 | Output files: 17 | /path/to/output/miner-disease-disease-20160521.tsv 18 | /path/to/output/miner-disease-disease-0-DOID-20160521.tsv 19 | 20 | # Create intermediate files 21 | python parse_do_disease_disease.py /path/to/input/doid.obo --output_dir /path/to/intermediate/ 22 | 23 | # Create cross net files 24 | python ../Utils/create_snap_crossnet_table.py /path/to/intermediate/doid_disease_disease_parsed.tsv /path/to/mode/miner-disease-0-DOID-20160521.tsv /path/to/mode/miner-disease-0-DOID-20160521.tsv DOID 0 --output_dir /path/to/output/ 25 | -------------------------------------------------------------------------------- /Disease-Disease/parse_do_disease_disease.py: 
-------------------------------------------------------------------------------- 1 | ''' 2 | file : parse_do_disease_disease.py 3 | author: Viswajith Venugopal 4 | 5 | Parses the disease ontology OBO to create the 6 | edge table using the is_a relationship. 7 | 8 | Usage: 9 | python parse_do_disease_disease.py [--output_dir OUTPUT_DIR] 10 | 11 | Positional Arguments: 12 | input_file : The doid.obo file which contains the disease ontology. 13 | 14 | Optional Arugments: 15 | --output_dir : Directory to create output files. Defaults to the current working directory. 16 | 17 | Example Usage: 18 | Input File: doid.obo 19 | 20 | Output directory : outputs/disease-disease/ 21 | 22 | Comamnd line: 23 | python parse_do_disease_disease.py doid.obo --output_dir outputs/disease-disease/ 24 | 25 | Output: 26 | doid_disease_disease_parsed.tsv 27 | ''' 28 | 29 | from collections import defaultdict 30 | import os 31 | import argparse 32 | import pickle 33 | 34 | 35 | # In[79]: 36 | 37 | def parse_do_file_to_list(fname): 38 | """ 39 | Reads the disease ontology in obo format from file 40 | given by fname, and returns the ontology as a list 41 | of dictionaries, one dictionary per entry. 
42 | The dictionary for each entry is structured with 43 | the following fields 44 | { 45 | 'id' (The disease ontology id) 46 | 'name' 47 | 'def' 48 | 'synonym' 49 | 'alt_id' (A list of alternate DOID ids) 50 | 'xref' (A list of xrefs to MESH/OMIM ids) 51 | 'is_a' (A DOID of what this disease is) 52 | 53 | } 54 | """ 55 | f = open(fname, 'r') 56 | 57 | preamble = True # If we're in the top part of the file 58 | global_list = [] 59 | curr_node_dict = {} 60 | for line in f: 61 | if preamble: 62 | if line.startswith('[Term]'): 63 | preamble = False 64 | continue 65 | spline = line.strip().split() 66 | if len(spline) == 0: 67 | global_list.append(curr_node_dict) 68 | curr_node_dict = {} 69 | continue 70 | if spline[0] == 'id:': 71 | if not spline[1].startswith('DOID'): # This means we've reached the bottom part of the file. 72 | break 73 | curr_node_dict['id'] = spline[1] 74 | elif spline[0] == 'name:': 75 | curr_node_dict['name'] = ' '.join(spline[1:]) 76 | elif spline[0] == 'def:': 77 | curr_node_dict['def'] = ' '.join(spline[1:]) 78 | elif spline[0] == 'synonym:': 79 | curr_node_dict['synonym'] = ' '.join(spline[1:]) 80 | elif spline[0] == 'alt_id:': 81 | if 'alt_id' in curr_node_dict: 82 | curr_node_dict['alt_id'].append(spline[1]) 83 | else: 84 | curr_node_dict['alt_id'] = [spline[1]] 85 | elif spline[0] == 'is_a:': 86 | curr_node_dict['is_a'] = spline[1] 87 | elif spline[0] == 'xref:': 88 | if 'xref' in curr_node_dict: 89 | curr_node_dict['xref'].append(spline[1]) 90 | else: 91 | curr_node_dict['xref'] = [spline[1]] 92 | 93 | 94 | return global_list 95 | 96 | doid_to_mesh_dict = defaultdict(list) 97 | mesh_to_doid_dict = defaultdict(list) 98 | omim_to_doid_dict = defaultdict(list) 99 | doid_to_omim_dict = defaultdict(list) 100 | doid_equiv_dict = defaultdict(list) 101 | 102 | 103 | parser = argparse.ArgumentParser(description='Parse DOID to find disease-disease is-a edges.') 104 | parser.add_argument('input_file', help='Input file path. 
File should be the doid.obo file.') 105 | parser.add_argument('--output_dir', help='Directory to output files', default='.') 106 | args = parser.parse_args() 107 | 108 | output_fname = os.path.join(args.output_dir, "doid_disease_disease_parsed.tsv") 109 | 110 | # Get the Disease Ontology as a list of one dictionary per entry. 111 | do_list = parse_do_file_to_list(args.input_file) 112 | 113 | with open(output_fname, 'w') as out_f: 114 | out_f.write('# Parsed DOID file.\n# Columns are source id, dest id.\n') 115 | for entry in do_list: 116 | if 'is_a' in entry: 117 | out_f.write('\t'.join([entry['id'], entry['is_a']])) 118 | out_f.write('\n') 119 | -------------------------------------------------------------------------------- /Disease-Function/README.txt: -------------------------------------------------------------------------------- 1 | Current datasets containing disease-function information: 2 | - CTD 3 | 4 | Workflow for creating crossnet tables for disease-function relationships: 5 | 6 | Pre-requisites: 7 | Must have the disease modes table from CTD (MESH), and the function mode table from 8 | GO. 
9 | /path/to/disease_mode/miner-disease-1-CTD_MESH-20160521.tsv 10 | /path/to/function_mode/miner-function-0-GO-20160521.tsv 11 | 12 | Input files/directories: 13 | /path/to/input/CTD_dir (from CTD) 14 | 15 | Intermediate files: 16 | /path/to/intermediate/ctd_disease_func_parsed.tsv 17 | 18 | Output files: 19 | /path/to/output/miner-disease-function-20160521.tsv 20 | /path/to/output/miner-disease-function-0-CTD-20160521.tsv 21 | 22 | # Create intermediate files 23 | python make_disease_func_ctd.py /path/to/input/CTD_dir --output_dir /path/to/intermediate/ 24 | 25 | # Create cross net files 26 | python ../Utils/create_snap_crossnet_table.py /path/to/intermediate/ctd_disease_func_parsed.tsv /path/to/disease_mode/miner-disease-1-CTD_MESH-20160521.tsv /path/to/function_mode/miner-function-0-GO-20160521.tsv CTD 0 --output_dir /path/to/output/ --skip_missing_ids 27 | -------------------------------------------------------------------------------- /Disease-Function/make_disease_func_ctd.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file : make_disease_func_ctd.py 3 | author: Viswajith Venugopal 4 | 5 | Goes over the disease function file in CTD and creates a table with the disease id 6 | and GO id of the function. 7 | 8 | Usage: 9 | python make_disease_func_ctd [--output_dir OUTPUT_DIR] 10 | 11 | Positional Arguments: 12 | input_dir : The directory with all the CTD TSVs 13 | 14 | Optional Arugments: 15 | --output_dir : Directory to create output files. Defaults to the current working directory. 
16 | 17 | Example Usage: 18 | Input Dir: CTD/0416_CTD 19 | 20 | Output directory : outputs/disease_func/ 21 | 22 | Comamnd line: 23 | python make_disease_func_ctd.py CTD/0416_CTD --output_dir outputs/disease_func/ 24 | 25 | Output: 26 | ctd_disease_func_parsed.tsv 27 | ''' 28 | 29 | from collections import defaultdict 30 | import os 31 | import argparse 32 | 33 | def load_disease_functions_ctd(ctd_dir): 34 | f1 = open(os.path.join(ctd_dir, 'CTD_Phenotype-Disease_biological_process_associations.tsv'), 'r') 35 | f2 = open(os.path.join(ctd_dir, 'CTD_Phenotype-Disease_cellular_component_associations.tsv'), 'r') 36 | f3 = open(os.path.join(ctd_dir, 'CTD_Phenotype-Disease_molecular_function_associations.tsv'), 'r') 37 | global_list = [] 38 | linktype = "" 39 | for f in [f1, f2, f3]: 40 | if f==f1: 41 | linktype = "biological" 42 | elif f==f2: 43 | linktype = "cellular" 44 | else: 45 | linktype = "molecular" 46 | for line in f: 47 | if line.startswith('#'): 48 | continue 49 | sp_line = line.strip('\n').split('\t') 50 | disease_id = sp_line[3] 51 | go_id = sp_line[1] 52 | global_list.append((disease_id, go_id, linktype)) 53 | return global_list 54 | 55 | parser = argparse.ArgumentParser(description='Parse CTD to find disease-function links.') 56 | parser.add_argument('input_dir', help='Input files directory. 
This should be the directory with all the CTD TSVs.') 57 | parser.add_argument('--output_dir', help='Directory to output files', default='.') 58 | args = parser.parse_args() 59 | 60 | output_fname = os.path.join(args.output_dir, "ctd_disease_func_parsed.tsv") 61 | 62 | disease_func_list = load_disease_functions_ctd(args.input_dir) 63 | 64 | with open(output_fname, 'w') as out_f: 65 | out_f.write('#Disease Function links from CTD.\n') 66 | for (disease_id, go_id,linktype) in disease_func_list: 67 | out_f.write('\t'.join([disease_id, go_id,linktype])) 68 | out_f.write('\n') 69 | -------------------------------------------------------------------------------- /Disease-Gene/README.txt: -------------------------------------------------------------------------------- 1 | Current datasets containing disease-gene information: 2 | - CTD 3 | 4 | Workflow for creating crossnet tables for disease-gene relationships: 5 | 6 | Pre-requisites: 7 | Must have the disease modes table from CTD (MESH and OMIM), and the gene 8 | mode table from GO. 
9 | /path/to/disease_mode/miner-disease-1-CTD_MESH-20160521.tsv 10 | /path/to/disease_mode/miner-disease-2-CTD_OMIM-20160521.tsv 11 | /path/to/gene_mode/miner-gene-0-GO-20160521.tsv 12 | 13 | Input files/directories: 14 | /path/to/input/CTD_dir (from CTD) 15 | 16 | Intermediate files: 17 | /path/to/intermediate/ctd_disease_gene_parsed.tsv 18 | 19 | Output files: 20 | /path/to/output/miner-disease-gene-20160521.tsv 21 | /path/to/output/miner-disease-gene-0-CTD_MESH-20160521.tsv 22 | /path/to/output/miner-disease-gene-1-CTD_OMIM-20160521.tsv 23 | 24 | # Create intermediate files 25 | python make_disease_gene_ctd.py /path/to/input/CTD_dir --output_dir /path/to/intermediate/ 26 | 27 | # Create cross net files 28 | python ../Utils/create_snap_crossnet_table.py /path/to/intermediate/ctd_disease_gene_parsed.tsv /path/to/disease_mode/miner-disease-1-CTD_MESH-20160521.tsv /path/to/gene_mode/miner-gene-0-GO-20160521.tsv CTD_MESH 0 --output_dir /path/to/output/ --skip_missing_ids 29 | python ../Utils/create_snap_crossnet_table.py /path/to/intermediate/ctd_disease_gene_parsed.tsv /path/to/disease_mode/miner-disease-2-CTD_OMIM-20160521.tsv /path/to/gene_mode/miner-gene-0-GO-20160521.tsv CTD_OMIM 1 --output_dir /path/to/output/ --skip_missing_ids 30 | -------------------------------------------------------------------------------- /Disease-Gene/make_disease_gene_ctd.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file : make_disease_gene_ctd.py 3 | author: Viswajith Venugopal 4 | 5 | Parses CTD to find disease-gene edges. 6 | 7 | Usage: 8 | python make_disease_gene_ctd.py 9 | 10 | Positional Arguments: 11 | input_dir : The directory to the CTD folder. 12 | 13 | Optional Arugments: 14 | --output_dir : Directory to create output files. Defaults to the current working directory. 
15 | 16 | Example Usage: 17 | Input File: CTD/0416_CTD 18 | 19 | Output directory : outputs/disease-gene/ 20 | 21 | Comamnd line: 22 | python make_disease_gene_ctd.py --output_dir outputs/disease-gene/ 23 | 24 | Output: 25 | ctd_disease_gene_parsed.tsv 26 | ''' 27 | 28 | from collections import defaultdict 29 | import os 30 | import argparse 31 | 32 | def get_ncbi_to_uniprot(ctd_dir): 33 | ctd_gene_node_fname = os.path.join(ctd_dir, 'CTD_genes.tsv') 34 | ncbi_to_uniprot_dict = {} 35 | # First, we load uniprot ids. 36 | with open(ctd_gene_node_fname, 'r') as ctd_gene_node_f: 37 | for line in ctd_gene_node_f: 38 | if line.startswith('#'): 39 | continue 40 | sp_line = line.strip('\n').split('\t') 41 | ncbi_id = sp_line[2] 42 | uniprot_ids = sp_line[7] 43 | if len(uniprot_ids) > 0: 44 | uniprot_ids = uniprot_ids.split('|') 45 | ncbi_to_uniprot_dict[ncbi_id] = uniprot_ids 46 | 47 | return ncbi_to_uniprot_dict 48 | 49 | def parse_ctd_gene_diseases(ctd_dir): 50 | 51 | ncbi_to_uniprot_dict = get_ncbi_to_uniprot(ctd_dir) 52 | disease_gene_list = [] 53 | ctd_gene_dis_fname = os.path.join(ctd_dir, 'CTD_genes_diseases.tsv') 54 | with open(ctd_gene_dis_fname) as in_f: 55 | i = 0 56 | for line in in_f: 57 | i += 1 58 | if i % 100000 == 0: 59 | pass 60 | #print i 61 | if line.startswith('#'): 62 | continue 63 | sp_line = line.strip('\n').split('\t') 64 | ncbi_id = sp_line[1] 65 | if ncbi_id not in ncbi_to_uniprot_dict: 66 | continue 67 | disease_id = sp_line[3] 68 | iscore = sp_line[6] 69 | if iscore=="": 70 | iscore = 0 71 | for uniprot_id in ncbi_to_uniprot_dict[ncbi_id]: 72 | yield (disease_id, uniprot_id,iscore) 73 | 74 | 75 | parser = argparse.ArgumentParser(description='Parse CTD to find disease-gene links.') 76 | parser.add_argument('input_dir', help='Input files directory. 
This should be the directory with all the CTD TSVs.') 77 | parser.add_argument('--output_dir', help='Directory to output files', default='.') 78 | args = parser.parse_args() 79 | 80 | output_fname = os.path.join(args.output_dir, "ctd_disease_gene_parsed.tsv") 81 | 82 | 83 | with open(output_fname, 'w') as out_f: 84 | out_f.write('#Disease Gene links from CTD.\n') 85 | for (disease_id, uni_id,iscore) in parse_ctd_gene_diseases(args.input_dir): 86 | out_f.write('\t'.join([disease_id, uni_id,iscore])) 87 | out_f.write('\n') 88 | -------------------------------------------------------------------------------- /Disease-Gene/make_disease_gene_disgenet.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Title : make_disease_gene_disgenet.py 3 | Author: Farzaan Kaiyom 4 | 5 | Parses data from DisGeNET to find disease-gene edges. 6 | 7 | Usage: 8 | python make_disease_gene_disgenet.py 9 | 10 | Positional Arguments: 11 | input_dir : The directory to the CTD folder. 12 | 13 | Optional Arugments: 14 | --output_dir : Directory to create output files. Defaults to the current working directory. 
15 | 16 | Example Usage: 17 | Input File: CTD/0819_CTD 18 | 19 | Output directory : outputs/disease-gene/ 20 | 21 | Comamnd line: 22 | python make_disease_gene_disgenet.py --output_dir outputs/disease-gene/ 23 | 24 | Output: 25 | ctd_disease_gene_parsed2.tsv 26 | ''' 27 | 28 | from collections import defaultdict 29 | import os 30 | import argparse 31 | 32 | def get_DGN_to_MESH(DGN_dir): 33 | map_fname = os.path.join(DGN_dir, 'mapping_files.tsv') 34 | mesh_dict = {} 35 | with open(map_fname, 'r') as dismap: 36 | for line in dismap: 37 | if line.startswith('#'): 38 | continue 39 | trgt = line.strip('\n').split('|') 40 | DGNid = trgt[0] 41 | vocab = trgt[2] 42 | if vocab=='MESH' 43 | code = trgt[3] 44 | mesh_dict[DGNid]=code 45 | return mesh_dict 46 | 47 | def get_ncbi_to_uniprot(ctd_dir): 48 | ctd_gene_node_fname = os.path.join(ctd_dir, 'CTD_genes.tsv') 49 | ncbi_to_uniprot_dict = {} 50 | # First, we load uniprot ids. 51 | with open(ctd_gene_node_fname, 'r') as ctd_gene_node_f: 52 | for line in ctd_gene_node_f: 53 | if line.startswith('#'): 54 | continue 55 | sp_line = line.strip('\n').split('\t') 56 | ncbi_id = sp_line[2] 57 | uniprot_ids = sp_line[7] 58 | if len(uniprot_ids) > 0: 59 | uniprot_ids = uniprot_ids.split('|') 60 | ncbi_to_uniprot_dict[ncbi_id] = uniprot_ids 61 | 62 | return ncbi_to_uniprot_dict 63 | 64 | def parse_ctd_gene_diseases(ctd_dir,DGN_dir): 65 | 66 | ncbi_to_uniprot_dict = get_ncbi_to_uniprot(ctd_dir) 67 | dgn_dict = get_DGN_to_MESH(DGN_dir) 68 | disease_gene_list = [] 69 | ctd_gene_dis_fname = os.path.join(ctd_dir, 'CTD_genes_diseases.tsv') 70 | with open(ctd_gene_dis_fname) as in_f: 71 | i = 0 72 | for line in in_f: 73 | i += 1 74 | if i % 100000 == 0: 75 | pass 76 | #print i 77 | if line.startswith('#'): 78 | continue 79 | sp_line = line.strip('\n').split('\t') 80 | ncbi_id = sp_line[1] 81 | if ncbi_id not in ncbi_to_uniprot_dict: 82 | continue 83 | disease_id = sp_line[3] 84 | for uniprot_id in ncbi_to_uniprot_dict[ncbi_id]: 85 | yield 
(disease_id, uniprot_id) 86 | 87 | 88 | parser = argparse.ArgumentParser(description='Parse CTD to find disease-gene links.') 89 | parser.add_argument('input_dir1', help='Input files directory. This should be the directory with all the CTD TSVs.') 90 | parser.add_argument('input_dir2', help='Input files directory. This should be the directory with the DGN-MESH mapping.') 91 | parser.add_argument('--output_dir', help='Directory to output files', default='.') 92 | args = parser.parse_args() 93 | 94 | output_fname = os.path.join(args.output_dir, "ctd_disease_gene_parsed2.tsv") 95 | 96 | 97 | with open(output_fname, 'w') as out_f: 98 | out_f.write('#Disease Gene links from CTD.\n') 99 | for (disease_id, uni_id) in parse_ctd_gene_diseases(args.input_dir1,args.input_dir2): 100 | out_f.write('\t'.join([disease_id, uni_id])) 101 | out_f.write('\n') 102 | -------------------------------------------------------------------------------- /Disease/README.txt: -------------------------------------------------------------------------------- 1 | Current datasets containing disease information: 2 | - DiseaseOntology (has DOID ids, as well as cross-references to the MESH and 3 | OMIM ids) 4 | - CTD (uses MESH ids for some diseases, and OMIM ids for others) 5 | - OMIM 6 | 7 | Workflow for creating mode tables for diseases: 8 | 9 | Input files/directories: 10 | /path/to/input/doid.obo (from DOID) 11 | /path/to/input/CTD_diseases.tsv (from CTD) 12 | /path/to/input/OMIM/ (from OMIM. 
Note that this is the entire directory, since 13 | multiple files are required to identify diseases) 14 | 15 | Intermediate files: 16 | /path/to/intermediate/doid_parsed.tsv 17 | /path/to/intermediate/ctd_mesh_parsed.tsv 18 | /path/to/intermediate/ctd_omim_parsed.tsv 19 | /path/to/intermediate/omim_parsed.tsv 20 | /path/to/intermediate/doid_mesh_equiv.tsv 21 | /path/to/intermediate/doid_omim_equiv.tsv 22 | 23 | Output files: 24 | /path/to/output/miner-disease-20160521.tsv 25 | /path/to/output/miner-disease-0-DOID-20160521.tsv 26 | /path/to/output/miner-disease-1-CTD_MESH-20160521.tsv 27 | /path/to/output/miner-disease-3-OMIM-20160525.tsv 28 | /path/to/output/miner-disease-equiv-20160525.tsv 29 | 30 | # Create intermediate files 31 | python parse_do_diseases.tsv /path/to/input/doid.obo --output_dir /path/to/intermediate/ 32 | python parse_ctd_diseases.tsv /path/to/input/CTD_diseases.tsv --output_dir /path/to/intermediate/ 33 | python parse_omim_diseases.tsv /path/to/input/OMIM/ --output_dir /path/to/intermediate/ 34 | 35 | 36 | # Create mode files 37 | python ../Utils/create_snap_mode_table.py /path/to/intermediate/doid_parsed.tsv disease DOID 0 --output_dir /path/to/output/ 38 | python ../Utils/create_snap_mode_table.py /path/to/intermediate/ctd_mesh_parsed.tsv disease CTD_MESH 1 --output_dir /path/to/output/ 39 | python ../Utils/create_snap_mode_table.py /path/to/intermediate/ctd_omim_parsed.tsv disease CTD_OMIM 2 --output_dir /path/to/output/ 40 | python ../Utils/create_snap_mode_table.py /path/to/intermediate/omim_parsed.tsv disease OMIM 3 --output_dir /path/to/output/ 41 | 42 | # Create mode equivalence table 43 | python ../Utils/create_snap_mode_equiv_table.py /path/to/output/miner-disease-0-DOID-20160521.tsv /path/to/output/miner-disease-1-CTD_MESH-20160521.tsv --output_dir /path/to/output/ --mapping_file /path/to/intermediate/doid_mesh_equiv.tsv --skip_missing_ids 44 | python ../Utils/create_snap_mode_equiv_table.py 
/path/to/output/miner-disease-0-DOID-20160521.tsv /path/to/output/miner-disease-2-CTD_OMIM-20160521.tsv --output_dir /path/to/output/ --mapping_file /path/to/intermediate/doid_omim_equiv.tsv --skip_missing_ids 45 | python ../Utils/create_snap_mode_equiv_table.py /path/to/output/miner-disease-0-DOID-20160521.tsv /path/to/output/miner-disease-3-OMIM-20160521.tsv --output_dir /path/to/output/ --mapping_file /path/to/intermediate/doid_omim_equiv.tsv --skip_missing_ids 46 | -------------------------------------------------------------------------------- /Disease/parse_ctd_diseases.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file : parse_ctd_diseases.py 3 | author: Viswajith Venugopal 4 | 5 | Parses the CTD disease node table into a TSV we can use to build our SNAP mode table 6 | Creates two separate files, one for CTD diseases with a MESH id, and another for 7 | CTD diseases with an OMIM id. 8 | 9 | Usage: 10 | python parse_ctd_diseases.py [--output_dir OUTPUT_DIR] 11 | 12 | Positional Arguments: 13 | input_file : The CTD_diseases.tsv file which ships with CTD. 14 | 15 | Optional Arugments: 16 | --output_dir : Directory to create output files. Defaults to the current working directory. 
17 | 18 | Example Usage: 19 | Input File: CTD_diseases.tsv 20 | 21 | Output directory : outputs/diseases/ 22 | 23 | Comamnd line: 24 | python parse_ctd_diseases.py CTD_diseases.tsv --output_dir outputs/diseases/ 25 | 26 | Output: 27 | ctd_mesh_parsed.tsv 28 | ctd_omim_parsed.tsv 29 | ''' 30 | 31 | import os 32 | import argparse 33 | 34 | def parse_ctd_file_to_list(fname): 35 | """ 36 | Parses the ctd_diseases.tsv file, and returns it as a list 37 | of entries, each entry represented as a dictionary with structure: 38 | { 39 | 'name' 40 | 'id' 41 | 'alt_ids' (list of alternate disease ids) 42 | 'defs' 43 | 'parents' (list of parent ids) 44 | 'syns' 45 | } 46 | """ 47 | f = open(fname,'r') 48 | ctd_list = [] 49 | for line in f: 50 | if line.startswith('#'): 51 | continue 52 | spline = line.strip('\n').split('\t') 53 | name = spline[0] 54 | disease_id = spline[1] 55 | alt_ids = spline[2] 56 | defs = spline[3] 57 | parents = spline[4] 58 | syns = spline[7] 59 | if len(alt_ids) > 0: 60 | alt_ids = alt_ids.split('|') 61 | else: 62 | alt_ids = [] 63 | if len(parents) > 0: 64 | parents = parents.split('|') 65 | else: 66 | parents = [] 67 | ctd_list.append({ 68 | 'name': name, 69 | 'id': disease_id, 70 | 'alt_ids': alt_ids, 71 | 'defs': defs, 72 | 'parents': parents, 73 | 'syns': syns 74 | }) 75 | 76 | return ctd_list 77 | 78 | parser = argparse.ArgumentParser(description='Parse CTD to find diseases.') 79 | parser.add_argument('input_file', help='Input file path. 
File should be the CTD_diseases.tsv file.') 80 | parser.add_argument('--output_dir', help='Directory to output files', default='.') 81 | args = parser.parse_args() 82 | 83 | ctd_list = parse_ctd_file_to_list(args.input_file) 84 | mesh_output_fname = os.path.join(args.output_dir, "ctd_mesh_parsed.tsv") 85 | omim_output_fname = os.path.join(args.output_dir, "ctd_omim_parsed.tsv") 86 | 87 | 88 | with open(mesh_output_fname, 'w') as mesh_out_f: 89 | with open(omim_output_fname, 'w') as omim_out_f: 90 | mesh_out_f.write('# Parsed CTD diseases file with MESH ids.\n# Columns: id, name, definitions, synonyms.\n') 91 | omim_out_f.write('# Parsed CTD diseases file with OMIM ids.\n# Columns: id, name, definitions, synonyms.\n') 92 | 93 | for entry in ctd_list: 94 | 95 | # The string to write into the output file. 96 | str_to_write = '\t'.join([entry['id'], entry['name'], 97 | entry['defs'], entry['syns']]) 98 | 99 | # Find the id; in CTD, it can be either an OMIM or a MESH id. 100 | if entry['id'].startswith('MESH'): 101 | mesh_out_f.write(str_to_write + '\n') 102 | elif entry['id'].startswith('OMIM'): 103 | omim_out_f.write(str_to_write + '\n') 104 | -------------------------------------------------------------------------------- /Disease/parse_do_diseases.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file : parse_do_diseases.py 3 | author: Viswajith Venugopal 4 | 5 | Parses the disease ontology OBO. Creates the disease ontology node TSV, as well as 6 | files with mappings between DOIDs and the MESH and OMIM ids that there is a cross 7 | reference to in the Disease Ontology. 8 | 9 | Usage: 10 | python parse_do_diseases.py [--output_dir OUTPUT_DIR] 11 | 12 | Positional Arguments: 13 | input_file : The doid.obo file which contains the disease ontology. 14 | 15 | Optional Arugments: 16 | --output_dir : Directory to create output files. Defaults to the current working directory. 
17 | 18 | Example Usage: 19 | Input File: doid.obo 20 | 21 | Output directory : outputs/diseases/ 22 | 23 | Comamnd line: 24 | python parse_do_diseases.py doid.obo --output_dir outputs/diseases/ 25 | 26 | Output: 27 | do_parsed.tsv 28 | doid_mesh_equiv.tsv 29 | doid_omim_equiv.tsv 30 | ''' 31 | 32 | from collections import defaultdict 33 | import os 34 | import argparse 35 | 36 | 37 | # In[79]: 38 | 39 | def parse_do_file_to_list(fname): 40 | """ 41 | Reads the disease ontology in obo format from file 42 | given by fname, and returns the ontology as a list 43 | of dictionaries, one dictionary per entry. 44 | The dictionary for each entry is structured with 45 | the following fields 46 | { 47 | 'id' (The disease ontology id) 48 | 'name' 49 | 'def' 50 | 'synonym' 51 | 'alt_id' (A list of alternate DOID ids) 52 | 'xref' (A list of xrefs to MESH/OMIM ids) 53 | 'is_a' (A DOID of what this disease is) 54 | 55 | } 56 | """ 57 | f = open(fname, 'r') 58 | 59 | preamble = True # If we're in the top part of the file 60 | global_list = [] 61 | curr_node_dict = {} 62 | for line in f: 63 | if preamble: 64 | if line.startswith('[Term]'): 65 | preamble = False 66 | continue 67 | spline = line.strip().split() 68 | if len(spline) == 0: 69 | global_list.append(curr_node_dict) 70 | curr_node_dict = {} 71 | continue 72 | if spline[0] == 'id:': 73 | if not spline[1].startswith('DOID'): # This means we've reached the bottom part of the file. 
74 | break 75 | curr_node_dict['id'] = spline[1] 76 | elif spline[0] == 'name:': 77 | curr_node_dict['name'] = ' '.join(spline[1:]) 78 | elif spline[0] == 'def:': 79 | curr_node_dict['def'] = ' '.join(spline[1:]) 80 | elif spline[0] == 'synonym:': 81 | curr_node_dict['synonym'] = ' '.join(spline[1:]) 82 | elif spline[0] == 'alt_id:': 83 | if 'alt_id' in curr_node_dict: 84 | curr_node_dict['alt_id'].append(spline[1]) 85 | else: 86 | curr_node_dict['alt_id'] = [spline[1]] 87 | elif spline[0] == 'is_a:': 88 | curr_node_dict['is_a'] = spline[1] 89 | elif spline[0] == 'xref:': 90 | if 'xref' in curr_node_dict: 91 | curr_node_dict['xref'].append(spline[1]) 92 | else: 93 | curr_node_dict['xref'] = [spline[1]] 94 | 95 | 96 | return global_list 97 | 98 | doid_to_mesh_dict = defaultdict(list) 99 | mesh_to_doid_dict = defaultdict(list) 100 | omim_to_doid_dict = defaultdict(list) 101 | doid_to_omim_dict = defaultdict(list) 102 | doid_equiv_dict = defaultdict(list) 103 | 104 | 105 | parser = argparse.ArgumentParser(description='Parse DOID to find diseases.') 106 | parser.add_argument('input_file', help='Input file path. File should be the doid.obo file.') 107 | parser.add_argument('--output_dir', help='Directory to output files', default='.') 108 | args = parser.parse_args() 109 | 110 | output_fname = os.path.join(args.output_dir, "doid_parsed.tsv") 111 | doid_mesh_fname = os.path.join(args.output_dir, "doid_mesh_equiv.tsv") 112 | doid_omim_fname = os.path.join(args.output_dir, "doid_omim_equiv.tsv") 113 | #metadict_fname = os.path.join(args.output_dir, "meta_dict.pickle") 114 | 115 | # Get the Disease Ontology as a list of one dictionary per entry. 116 | do_list = parse_do_file_to_list(args.input_file) 117 | 118 | with open(output_fname, 'w') as out_f: 119 | out_f.write('# Parsed DOID file.\n# Columns are id, name, definition, synonym.\n') 120 | for entry in do_list: 121 | 122 | # To the doid table, we write all the info. 
123 | name = entry['name'] if 'name' in entry else '' 124 | definition = entry['def'] if 'def' in entry else '' 125 | synonym = entry['synonym'] if 'synonym' in entry else '' 126 | out_f.write('\t'.join([entry['id'], name, definition, synonym])) 127 | out_f.write('\n') 128 | 129 | # Populate dictionaries 130 | if 'alt_id' in entry: 131 | for alt_id in entry['alt_id']: 132 | doid_equiv_dict[entry['id']].append(alt_id) 133 | doid_equiv_dict[alt_id].append(entry['id']) 134 | 135 | if 'xref' in entry: 136 | for xref in entry['xref']: 137 | if xref.startswith('OMIM'): 138 | omim_to_doid_dict[xref].append(entry['id']) 139 | doid_to_omim_dict[entry['id']].append(xref) 140 | 141 | #updated for 2019 DOID dataset 142 | elif xref.startswith('MSH') or xref.startswith('MESH'): 143 | # For consistency, use MESH:id instead of MSH:id 144 | if xref.startswith('MSH'): 145 | mesh_id = xref[:1] + 'E' + xref[1:] 146 | else: 147 | mesh_id = xref[:1] + xref[1:] 148 | mesh_to_doid_dict[mesh_id].append(entry['id']) 149 | doid_to_mesh_dict[entry['id']].append(mesh_id) 150 | 151 | with open(doid_mesh_fname, 'w') as mesh_f: 152 | with open(doid_omim_fname, 'w') as omim_f: 153 | for doid in doid_to_mesh_dict: 154 | for mesh in doid_to_mesh_dict[doid]: 155 | mesh_f.write(doid + '\t' + mesh + '\n') 156 | for doid in doid_to_omim_dict: 157 | for omim in doid_to_omim_dict[doid]: 158 | omim_f.write(doid + '\t' + omim + '\n') 159 | """ 160 | meta_dict = { 161 | "doid_to_mesh_dict" : doid_to_mesh_dict, 162 | "mesh_to_doid_dict" : mesh_to_doid_dict, 163 | "omim_to_doid_dict" : omim_to_doid_dict, 164 | "doid_to_omim_dict" : doid_to_omim_dict, 165 | "doid_equiv_dict" : doid_equiv_dict, 166 | } 167 | pickle.dump(meta_dict, open(metadict_fname, 'w')) 168 | """ 169 | -------------------------------------------------------------------------------- /Disease/parse_omim_diseases.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file : parse_ctd_diseases.py 3 | 
author: Farzaan Kaiyom 4 | based on scripts by Viswajith Venugopal 5 | 6 | Parses the OMIM latest disease table found in genemap2.txt 7 | ^ *This is the latest formatting for OMIM data* ^ 8 | 9 | Usage: 10 | python parse_ctd_diseases.py [--output_dir OUTPUT_DIR] 11 | 12 | Positional Arguments: 13 | input_file : The directory containing all the OMIM files. 14 | 15 | Optional Arugments: 16 | --output_dir : Directory to create output files. Defaults to the current working directory. 17 | 18 | Example Usage: 19 | Input File: OMIM/08-2019/ 20 | 21 | Output directory : ../../output/diseases/ 22 | 23 | Comamnd line: 24 | python parse_ctd_diseases.py OMIM/08-2019/ --output_dir output/diseases/ 25 | 26 | Output: 27 | omim_parsed.tsv 28 | ''' 29 | 30 | import os 31 | import argparse 32 | 33 | def parse_omim_file_to_list(omim_dir): 34 | """ 35 | Takes the OMIM directory as an argument, and 36 | returns a list of diseases from OMIM. 37 | First, it goes over the mim2gene file, and stores 38 | the OMIM numbers which correspond to diseases (phenotype). 39 | Then, it goes over the genemap and produces a list of entries 40 | which correspond to diseases, one dictionary per entry. 41 | Each dictionary has the following structure: 42 | { 43 | 'id', 44 | 'cyto_loc', 45 | 'gene_symbols', 46 | 'gene_name', 47 | 'comments', 48 | 'phenotypes', 49 | 'mouse_symb' 50 | } 51 | """ 52 | 53 | mim2gene_f = open(os.path.join(omim_dir, 'mim2gene.txt'), 'r') 54 | genemap_f = open(os.path.join(omim_dir, 'genemap2.txt'), 'r') 55 | 56 | # The set of mim numbers corresponding to diseases. 57 | disease_mims = set() 58 | for line in mim2gene_f: 59 | if line.startswith('#'): 60 | continue 61 | sp_line = line.split('\t') 62 | mim_number = sp_line[0] 63 | mim_type = sp_line[1] 64 | if mim_type == 'phenotype': 65 | disease_mims.add(mim_number) 66 | 67 | omim_list = [] 68 | # Now, go over genemap and populate the list. 
69 | for line in genemap_f: 70 | if line.startswith('#'): 71 | continue 72 | 73 | sp_line = line.strip('\n').split('\t') 74 | mim_number = sp_line[5] 75 | if mim_number not in disease_mims: 76 | continue 77 | cyto_loc = sp_line[3] 78 | gene_symbols = sp_line[6] 79 | gene_name = sp_line[7] 80 | 81 | ensembl_id = sp_line[10] 82 | 83 | comments = sp_line[11] 84 | phenotypes = sp_line[12] 85 | mouse_gene_symbol = sp_line[13] 86 | omim_list.append({ 87 | 'id' : 'OMIM:' + mim_number, 88 | 'cyto_loc': cyto_loc, 89 | 'gene_symbols': gene_symbols, 90 | 'gene_name': gene_name, 91 | 'comments': comments, 92 | 'phenotypes': phenotypes, 93 | 'mouse_symb': mouse_gene_symbol 94 | }) 95 | return omim_list 96 | 97 | 98 | parser = argparse.ArgumentParser(description='Parse OMIM to find diseases.') 99 | parser.add_argument('input_dir', help='Input file dir. File should be the directory which contains all downloaded OMIM files.') 100 | parser.add_argument('--output_dir', help='Directory to output files', default='.') 101 | args = parser.parse_args() 102 | 103 | # Get the OMIM node table as a list of one dictionary per entry. 
104 | omim_list = parse_omim_file_to_list(args.input_dir) 105 | output_fname = os.path.join(args.output_dir, "omim_parsed.tsv") 106 | 107 | with open(output_fname, 'w') as out_f: 108 | out_f.write('Parsed OMIM file.\n Columns: id, phenotypes, gene_name, gene_symbols, cyto_loc') 109 | for entry in omim_list: 110 | out_f.write('\t'.join([entry['id'], entry['phenotypes'], 111 | entry['gene_name'], entry['gene_symbols'], entry['cyto_loc'], 112 | entry['mouse_symb']]) + '\n') 113 | -------------------------------------------------------------------------------- /Function-Function/README.txt: -------------------------------------------------------------------------------- 1 | Current datasets containing function-function interaction information: 2 | - GeneOntology (GO ids) 3 | 4 | Note that all the relevant tables can be generated using scripts found in the Utils directory. 5 | 6 | Workflow: 7 | 8 | Input files: 9 | /path/to/input/go.obo 10 | 11 | Output files: 12 | /path/to/output/miner-function-20160521.tsv 13 | /path/to/output/miner-function-0-GO-20160521.tsv 14 | /path/to/output/miner-function-function-20160521.tsv 15 | /path/to/output/miner-function-function-0-GO-20160521.tsv 16 | 17 | Intermediate files: 18 | /path/to/intermediate/go_parsed.tsv 19 | /path/to/intermediate/go_nodes.tsv 20 | 21 | # Create all the function mode files 22 | python ../Function-Function/parse_obo_for_functions.py /path/to/input/go.obo /path/to/intermediate/go_parsed.tsv 23 | 24 | python ../Utils/extract_unique_node_ids.py /path/to/intermediate/go_parsed.tsv /path/to/intermediate/go_nodes.tsv GO 0 1 25 | 26 | python ../Utils/create_snap_mode_table.py /path/to/intermediate/go_nodes.tsv function GO 0 --output_dir /path/to/output/ 27 | 28 | 29 | # Create crossnet files 30 | python ../Utils/create_snap_crossnet_table.py /path/to/input/go_parsed.tsv /path/to/output/miner-function-0-GO-20160521.tsv /path/to/output/miner-function-0-GO-20160521.tsv GO 0 --output_dir /path/to/output/ 31 | 32 | 
Scripts included: 33 | 34 | file: parse_obo_for_functions.py 35 | author: Sheila Ramaswamy(@sramas15) 36 | 37 | Script that parses the gene ontology obo file for the function-function edge list. 38 | 39 | Usage: 40 | python parse_obo_for_functions.py 41 | 42 | Positional Arguments: 43 | input_file_path: Path to the input file; Input file should be the GO obo file. 44 | output_file_path: Path to the output file; Will be a tsv with the following schema: 45 | \\t\\t 46 | 47 | 48 | 49 | Example usage: 50 | Creating files for function-function relationships using GeneOntology: 51 | 52 | Input files: /path/to/input/go.obo 53 | 54 | Output files: /path/to/output/functions.tsv 55 | 56 | Workflow: 57 | 58 | python parse_obo_for_functions.py /path/to/input/go.obo /path/to/output/functions.tsv 59 | 60 | 61 | -------------------------------------------------------------------------------- /Function-Function/parse_obo_for_functions.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file: parse_obo_for_functions.py 3 | author: Sheila Ramaswamy(@sramas15) 4 | 5 | Script that parses the gene ontology obo file for the function-function edge list. 6 | 7 | Usage: 8 | python parse_obo_for_functions.py 9 | 10 | Positional Arguments: 11 | input_file_path: Path to the input file; Input file should be the GO obo file. 
12 | output_file_path: Path to the output file; Will be a tsv with the following schema: 13 | \\t\\t 14 | 15 | 16 | 17 | Example usage: 18 | Creating files for function-function relationships using GeneOntology: 19 | 20 | Input files: /path/to/input/go.obo 21 | 22 | Output files: /path/to/output/functions.tsv 23 | 24 | Workflow: 25 | 26 | python parse_obo_for_functions.py /path/to/input/go.obo /path/to/output/functions.tsv 27 | ''' 28 | import argparse 29 | 30 | parser = argparse.ArgumentParser(description='Parses and create an edge list for go functions of the form \\t\\t') 31 | parser.add_argument('input_file', help='input file name. Should be an obo file') 32 | parser.add_argument('output_file', help='output file name. Will be a tsv') 33 | 34 | args = parser.parse_args() 35 | 36 | 37 | edge_terms = ['disjoint_from', 'consider', 'alt_id', 'id', 'relationship', 'intersection_of', 'is_a', 'replaced_by'] 38 | 39 | with open(args.input_file, 'r') as inF: 40 | with open(args.output_file, 'a') as outF: 41 | outF.write('# Function-function interactions from GO\n') 42 | outF.write('# GO_id1\tGO_id2\n') 43 | inTerm = False 44 | currNode = None 45 | for line in inF: 46 | line = line.strip() 47 | if line == '[Term]': 48 | inTerm = True 49 | continue 50 | if len(line) == 0: 51 | inTerm = False 52 | currNode = None 53 | continue 54 | if inTerm: 55 | if line[0:3] == 'id:': 56 | currNode = line[4:].strip() 57 | else: 58 | for term in edge_terms: 59 | if line.split(':')[0] == term: 60 | assert currNode is not None 61 | new_line = line[len(term)+1:].strip() 62 | if new_line[0:3] == 'GO:': 63 | attr = '-' 64 | dst_id = new_line.split(' ')[0] 65 | else: 66 | (attr, edge_id) = new_line.split('!')[0].split('GO:') 67 | attr = attr.strip() 68 | dst_id = 'GO:' + edge_id.strip() 69 | outF.write('%s\t%s\t%s\n' % (currNode, dst_id, term)) 70 | break 71 | 72 | 73 | -------------------------------------------------------------------------------- /Function/README.txt: 
-------------------------------------------------------------------------------- 1 | Current datasets containing function information: 2 | - GeneOntology (GO ids) 3 | 4 | Note that all the relevant tables can be generated using scripts found in the Utils directory. 5 | 6 | Workflow: 7 | 8 | Input files: 9 | /path/to/input/go.obo 10 | 11 | Output files: 12 | /path/to/output/miner-function-20160521.tsv 13 | /path/to/output/miner-function-0-GO-20160521.tsv 14 | 15 | Intermediate files: 16 | /path/to/intermediate/go_parsed.tsv 17 | /path/to/intermediate/go_nodes.tsv 18 | 19 | # Create all the function mode files 20 | python ../Function-Function/parse_obo_for_functions.py /path/to/input/go.obo /path/to/intermediate/go_parsed.tsv 21 | 22 | python ../Utils/extract_unique_node_ids.py /path/to/intermediate/go_parsed.tsv /path/to/intermediate/go_nodes.tsv GO 0 1 23 | 24 | python ../Utils/create_snap_mode_table.py /path/to/intermediate/go_nodes.tsv function GO 0 --output_dir /path/to/output/ 25 | -------------------------------------------------------------------------------- /Function/parse_obo_for_functions.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file: parse_obo_for_functions.py 3 | author: Sheila Ramaswamy(@sramas15) 4 | 5 | Script that parses the gene ontology obo file for the function-function edge list. 6 | 7 | Usage: 8 | python parse_obo_for_functions.py 9 | 10 | Positional Arguments: 11 | input_file_path: Path to the input file; Input file should be the GO obo file. 
12 | output_file_path: Path to the output file; Will be a tsv with the following schema: 13 | \\t\\t 14 | 15 | 16 | 17 | Example usage: 18 | Creating files for function-function relationships using GeneOntology: 19 | 20 | Input files: /path/to/input/go.obo 21 | 22 | Output files: /path/to/output/functions.tsv 23 | 24 | Workflow: 25 | 26 | python parse_obo_for_functions.py /path/to/input/go.obo /path/to/output/functions.tsv 27 | ''' 28 | import argparse 29 | 30 | parser = argparse.ArgumentParser(description='Parses and create an edge list for go functions of the form \\t\\t') 31 | parser.add_argument('input_file', help='input file name. Should be an obo file') 32 | parser.add_argument('output_file', help='output file name. Will be a tsv') 33 | 34 | args = parser.parse_args() 35 | 36 | 37 | edge_terms = ['disjoint_from', 'consider', 'alt_id', 'id', 'relationship', 'intersection_of', 'is_a', 'replaced_by'] 38 | 39 | lbls = ['name','desc','namespace','synonym'] 40 | 41 | with open(args.input_file, 'r') as inF: 42 | with open(args.output_file, 'a') as outF: 43 | outF.write('# Function-function interactions from GO\n') 44 | outF.write('# GO_id1\tname\tdesc\tnamespace\tsynonym\n') 45 | inTerm = True 46 | currNode = None 47 | passed = False 48 | for line in inF: 49 | line = line.strip() 50 | #if line == '[Term]': 51 | #inTerm = True 52 | #continue 53 | if len(line) == 0: 54 | #inTerm = False 55 | #currNode = None 56 | continue 57 | if inTerm: 58 | 59 | if line[0:3] == 'id:': 60 | if passed: 61 | for lbl in lbls: 62 | if lbl not in linedict.keys(): 63 | linedict[lbl]='N/A' 64 | outF.write('%s\t%s\t%s\t%s\t%s\n' % (linedict['id'], linedict['name'],linedict['desc'],linedict['namespace'],linedict['synonym'])) 65 | linedict = {} 66 | linedict['id']= line[4:].strip() 67 | currNode = linedict['id'] 68 | passed = True 69 | else: 70 | if line.split(':')[0] in lbls: 71 | term = line.split(':')[0] 72 | assert currNode is not None 73 | new_line = line[len(term)+1:].strip() 74 | if 
'EXACT []' in new_line: 75 | new_line = new_line.replace('EXACT []','') 76 | linedict[term] = new_line 77 | #if new_line[0:3] == 'GO:': 78 | # attr = '-' 79 | # dst_id = new_line.split(' ')[0] 80 | #else: 81 | # (attr, edge_id) = new_line.split('!')[0].split('GO:') 82 | # attr = attr.strip() 83 | # dst_id = 'GO:' + edge_id.strip() 84 | #outF.write('%s\t%s\t%s\n' % (currNode, dst_id, term)) 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /Gene-Function/README.txt: -------------------------------------------------------------------------------- 1 | Current datasets containing gene-function information: 2 | - GeneOntology (Uniprot ids to GO ids) 3 | 4 | Note that all the relevant tables can be generated using scripts found in the Utils directory. 5 | 6 | Workflow: 7 | 8 | Input files: 9 | /path/to/input/gene_association.goa_human (from GO) 10 | /path/to/input/go.obo 11 | 12 | Output files: 13 | /path/to/output/miner-gene-20160521.tsv 14 | /path/to/output/miner-gene-0-GO-20160521.tsv 15 | /path/to/output/miner-function-20160521.tsv 16 | /path/to/output/miner-function-0-GO-20160521.tsv 17 | /path/to/output/miner-gene-function-20160521.tsv 18 | /path/to/output/miner-gene-function-0-GO-20160521.tsv 19 | 20 | Intermediate files: 21 | /path/to/intermediate/go_parsed.tsv 22 | /path/to/intermediate/go_nodes.tsv 23 | 24 | # First create all the gene mode files 25 | 26 | python ../Utils/create_snap_mode_table.py /path/to/input/gene_association.goa_human gene GO 0 --output_dir /path/to/output/ --node_index 1 27 | 28 | 29 | # Second create all the function mode files 30 | python ../Function-Function/parse_obo_for_functions.py /path/to/input/go.obo /path/to/intermediate/go_parsed.tsv 31 | 32 | python ../Utils/extract_unique_node_ids.py /path/to/intermediate/go_parsed.tsv /path/to/intermediate/go_nodes.tsv GO 0 1 33 | 34 | python ../Utils/create_snap_mode_table.py /path/to/intermediate/go_nodes.tsv function GO 0 --output_dir 
/path/to/output/ 35 | 36 | # Create crossnet files 37 | python ../Utils/create_snap_crossnet_table.py /path/to/input/gene_association.goa_human /path/to/output/miner-gene-0-GO-20160521.tsv /path/to/output/miner-function-0-GO-20160521.tsv GO 0 --output_dir /path/to/output/ --src_node_index 1 --dst_node_index 4 38 | 39 | -------------------------------------------------------------------------------- /Gene-Protein/README.txt: -------------------------------------------------------------------------------- 1 | Current datasets containing gene-protein interaction information: 2 | - ENSEMBL/BioMart server (data NOT on ilfs2) 3 | 4 | Workflow: 5 | 6 | Input Files: 7 | /path/to/input/hgnc_complete_set.txt 8 | /path/to/input/protein.links.full.v10.txt 9 | 10 | Output Files: 11 | /path/to/output/miner-gene-20160521.tsv 12 | /path/to/output/miner-gene-2-HUGO_ENSEMBL-20160521.tsv 13 | /path/to/output/miner-protein-20160521.tsv 14 | /path/to/output/miner-protein-0-STRING-20160521.tsv 15 | /path/to/output/miner-gene-protein-20160521.tsv 16 | /path/to/output/miner-gene-protein-0-ENSEMBL-20160521.tsv 17 | 18 | Intermediate Files: 19 | /path/to/intermediate/ensembl_mapping.tsv 20 | /path/to/intermediate/protein-STRING-edgelist.tsv 21 | /path/to/intermediate/protein-STRING-nodelist.tsv 22 | 23 | # Get the mapping file from biomart 24 | python fetch_ensembl_id_mapping /path/to/intermediate/ensembl_mapping.tsv 25 | 26 | # Get the gene mode table 27 | python ../Utils/create_snap_mode_table.py /path/to/input/hgnc_complete_set.txt gene HUGO_ENSEMBL 2 --output_dir /path/to/output/ --node_index 19 28 | 29 | # Extract the edge list from the full protein interactions file; potentially may have to change 30 | # the divider default value in the script (assume src and dst columns are 1 and 5) 31 | python ../Utils/extract_edge_list.py /path/to/input/protein.links.full.v10.txt /path/to/intermediate/protein-STRING-edgelist.tsv STRING 1 5 32 | 33 | # Extract the unique protein ids from the edge 
list (columns 0 and 1) 34 | python ../Utils/extract_unique_node_ids.py /path/to/intermediate/protein-STRING-edgelist.tsv /path/to/intermediate/protein-STRING-nodelist.tsv STRING 0 1 35 | 36 | # Create the protein mode files 37 | python ../Utils/create_snap_mode_table.py /path/to/intermediate/protein-STRING-nodelist.tsv protein STRING 0 --output_dir /path/to/output/ 38 | 39 | # Create the CrossNet tables 40 | python ../Utils/create_snap_crossnet_table.py /path/to/intermediate/ensembl_mapping.tsv /path/to/output/miner-gene-2-HUGO_ENSEMBL-20160521.tsv /path/to/output/miner-protein-0-STRING-20160521.tsv ENSEMBL 0 --output_dir /path/to/output/ --skip_missing_ids --dst_mode_filter add_species_id 41 | 42 | 43 | Scripts Included: 44 | 45 | file: fetch_ensembl_id_mapping.py 46 | author: Sheila Ramaswamy(@sramas15) 47 | 48 | Connects to ENSEMBL server to fetch id mapping, using the biomart python library. 49 | 50 | Dependencies: 51 | biomart python library (https://pypi.python.org/pypi/biomart/0.9.0) 52 | 53 | Usage: 54 | python parse_obo_for_functions.py 55 | 56 | Positional Arguments: 57 | output_file_path: Path to the output file; Will be a tsv with the following schema: 58 | \\t 59 | 60 | 61 | Example usage: 62 | 63 | Input files: None 64 | 65 | Output files: /path/to/output/gene-protein-mapping.tsv 66 | 67 | Workflow: 68 | 69 | python fetch_ensembl_id_mapping.py /path/to/output/gene-protein-mapping.tsv 70 | 71 | 72 | -------------------------------------------------------------------------------- /Gene-Protein/fetch_ensembl_id_mapping.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file: fetch_ensembl_id_mapping.py 3 | author: Sheila Ramaswamy(@sramas15) 4 | 5 | Connects to ENSEMBL server to fetch id mapping, using the biomart python library. 
6 | 7 | Dependencies: 8 | biomart python library (https://pypi.python.org/pypi/biomart/0.9.0) 9 | 10 | Usage: 11 | python parse_obo_for_functions.py 12 | 13 | Positional Arguments: 14 | output_file_path: Path to the output file; Will be a tsv with the following schema: 15 | \\t 16 | 17 | 18 | Example usage: 19 | 20 | Input files: None 21 | 22 | Output files: /path/to/output/gene-protein-mapping.tsv 23 | 24 | Workflow: 25 | 26 | python fetch_ensembl_id_mapping.py /path/to/output/gene-protein-mapping.tsv 27 | ''' 28 | import argparse 29 | from biomart import BiomartServer 30 | 31 | parser = argparse.ArgumentParser(description='Get ensembl gene and peptide mapping from biomart') 32 | parser.add_argument('output_file', help='output file name. Will be a tsv') 33 | args = parser.parse_args() 34 | 35 | def main(newfile): 36 | atts = ['ensembl_gene_id', 'ensembl_peptide_id'] 37 | url = 'http://www.ensembl.org/biomart' 38 | server = BiomartServer(url) 39 | hge = server.datasets['hsapiens_gene_ensembl'] 40 | with open(newfile, 'w') as outF: 41 | s = hge.search({'attributes': atts}, header=0) 42 | for l in s.iter_lines(): 43 | (gene_id, peptide_id) = l.split('\t') 44 | if len(peptide_id) > 0: 45 | outF.write('%s\t%s\n' % (gene_id.strip(), peptide_id.strip())) 46 | 47 | main(args.output_file) 48 | -------------------------------------------------------------------------------- /Gene/README.txt: -------------------------------------------------------------------------------- 1 | Current datasets containing gene information: 2 | - GeneOntology (Uniprot ids *) 3 | - HUGO (contains ENSEMBL gene ids and Uniprot ids *) 4 | 5 | * Uniprot ids technically protein ids, but they seem to be used interchangeably in these datasets. 6 | 7 | Note that all the relevant tables can be generated using scripts found in the Utils directory. 
8 | 9 | Workflow for creating mode tables for genes 10 | 11 | Input files: 12 | /path/to/input/hgnc_complete_set.txt (from HUGO) 13 | /path/to/input/goa_human.gaf (from GO) 14 | /path/to/input/goa_human_complex.gaf (from GO) 15 | /path/to/input/goa_human_isoform.gaf (from GO) 16 | /path/to/input/goa_human_rna.gaf (from GO) 17 | 18 | Output files: 19 | /path/to/output/miner-gene-20160521.tsv 20 | /path/to/output/miner-gene-0-GO-20160521.tsv 21 | /path/to/output/miner-gene-1-HUGO_Uniprot-20160521.tsv 22 | /path/to/output/miner-gene-2-HUGO_ENSEMBL-20160521.tsv 23 | 24 | 25 | # Create mode files 26 | python ../Utils/create_snap_mode_table.py /path/to/input/goa_human.gaf gene GO 0 --output_dir /path/to/output/ --node_index 1 27 | python ../Utils/create_snap_mode_table.py /path/to/input/goa_human_complex.gaf gene GOcomplex 1 --output_dir /path/to/output/ --node_index 1 28 | python ../Utils/create_snap_mode_table.py /path/to/input/goa_human_isoform.gaf gene GOisoform 2 --output_dir /path/to/output/ --node_index 1 29 | python ../Utils/create_snap_mode_table.py /path/to/input/goa_human_rna.gaf gene GOrna 3 --output_dir /path/to/output/ --node_index 1 30 | python ../Utils/create_snap_mode_table.py /path/to/input/hgnc_complete_set.txt gene HUGO_Uniprot 4 --output_dir /path/to/output/ --node_index 25 31 | python ../Utils/create_snap_mode_table.py /path/to/input/hgnc_complete_set.txt gene HUGO_ENSEMBL 5 --output_dir /path/to/output/ --node_index 19 32 | 33 | # Create mode equivalence table 34 | python../Utils/create_snap_mode_equiv_table.py /path/to/output/miner-gene-0-GO-20160521.tsv /path/to/output/miner-gene-1-HUGO_Uniprot-20160521.tsv --output_dir /path/to/output/ 35 | 36 | python../Utils/create_snap_mode_equiv_table.py /path/to/output/miner-gene-2-HUGO_ENSEMBL-20160521.tsv /path/to/output/miner-gene-1-HUGO_Uniprot-20160521.tsv --output_dir /path/to/output/ --mapping_file /path/to/input/hgnc_complete_set.txt --ds1_node_index 19 --ds2_node_index 25 37 | 
-------------------------------------------------------------------------------- /Protein-Protein/README.txt: -------------------------------------------------------------------------------- 1 | Current datasets relevant for getting protein-protein interactions: 2 | 3 | - STRING v10; uses ENSEMBL peptide ids 4 | 5 | Note that all the relevant tables can be generated using scripts found in the Utils directory. 6 | 7 | ***** IMPORTANT: STRING v10 contains 1,847,117,370 edges and therefore takes a long time to process. 8 | when adding interactions from another dataset (to the same miner-protein-protein-20160521.tsv 9 | file), please use the snap_id_counter_start argument to set the starting snap id to 10 | 1,847,117,370; otherwise, the entire file is read to get a starting id. 11 | 12 | How to create crossnet tables from the STRING database (assumes mode tables not already created): 13 | 14 | Input Files (from STRING): 15 | /path/to/input/protein.links.full.v10.txt 16 | 17 | Output Files: 18 | /path/to/output/miner-protein-20160521.tsv 19 | /path/to/output/miner-protein-0-STRING-20160521.tsv 20 | /path/to/output/miner-protein-protein-20160521.tsv 21 | /path/to/output/miner-protein-protein-0-STRING-20160521.tsv 22 | 23 | Intermediate Files: 24 | /path/to/intermediate/protein-STRING-edgelist.tsv 25 | /path/to/intermediate/protein-STRING-nodelist.tsv 26 | 27 | # Extract the edge list from the full protein interactions file; potentially may have to change 28 | # the divider default value in the script (assume src and dst columns are 1 and 5) 29 | python ../Utils/extract_edge_list.py /path/to/input/protein.links.full.v10.txt /path/to/intermediate/protein-STRING-edgelist.tsv STRING 1 5 30 | 31 | # Extract the unique protein ids from the edge list (columns 0 and 1) 32 | python ../Utils/extract_unique_node_ids.py /path/to/intermediate/protein-STRING-edgelist.tsv /path/to/intermediate/protein-STRING-nodelist.tsv STRING 0 1 33 | 34 | # Create the mode files 35 | python 
../Utils/create_snap_mode_table.py /path/to/intermediate/protein-STRING-nodelist.tsv protein STRING 0 --output_dir /path/to/output/ 36 | 37 | # Create the crossnet files 38 | python ../Utils/create_snap_crossnet_table.py /path/to/intermediate/protein-STRING-edgelist.tsv /path/to/output/miner-protein-0-STRING-20160521.tsv /path/to/output/miner-protein-0-STRING-20160521.tsv STRING 0 --output_dir /path/to/output/ 39 | 40 | -------------------------------------------------------------------------------- /Protein/README.txt: -------------------------------------------------------------------------------- 1 | Current datasets relevant for getting protein ids: 2 | 3 | - STRING v10; uses ENSEMBL peptide ids 4 | 5 | Note that all the relevant tables can be generated using scripts found in the Utils directory. 6 | 7 | How to create mode tables from the STRING database: 8 | 9 | Input Files (from STRING): 10 | /path/to/input/protein.links.full.v10.txt 11 | 12 | Output Files: 13 | /path/to/output/miner-protein-20160521.tsv 14 | /path/to/output/miner-protein-0-STRING-20160521.tsv 15 | 16 | Intermediate Files: 17 | /path/to/intermediate/protein-STRING-edgelist.tsv 18 | /path/to/intermediate/protein-STRING-nodelist.tsv 19 | 20 | # Extract the edge list from the full protein interactions file; potentially may have to change 21 | # the divider default value in the script (assume src and dst columns are 0 and 1) 22 | python ../Utils/extract_edge_list.py /path/to/input/protein.links.full.v10.txt /path/to/intermediate/protein-STRING-edgelist.tsv STRING 0 1 --divider " " 23 | 24 | # Extract the unique protein ids from the edge list (columns 0 and 1) 25 | python ../Utils/extract_unique_node_ids.py /path/to/intermediate/protein-STRING-edgelist.tsv /path/to/intermediate/protein-STRING-nodelist.tsv STRING 0 1 26 | 27 | # Create the mode files 28 | python ../Utils/create_snap_mode_table.py /path/to/intermediate/protein-STRING-nodelist.tsv protein STRING 0 --output_dir /path/to/output/ 29 | 
30 | # If multiple datasets used, and there is a mapping between the ids, please use the 31 | # create_snap_mode_equiv_table.py script. 32 | 33 | -------------------------------------------------------------------------------- /Protein/add_organism.py: -------------------------------------------------------------------------------- 1 | inFNm = 'STRING_protein_nodelist.tsv' 2 | outFNm = 'STRING_nodelist.tsv' 3 | with open(inFNm, 'r') as inF: 4 | with open(outFNm, 'a') as outF: 5 | for line in inF: 6 | cur = str(line).replace('\n','') 7 | organism = cur.split('.')[0] 8 | cur = cur + '\t' + organism 9 | outF.write(cur+'\n') 10 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | Scripts used to extract nodes and edges from MINER data in 2016 and later in 2019 2 | 3 | 4 | The Utils directory consists of general scripts that can be used to process multiple datasets. 
5 | 6 | In addition, there are directories (with readmes on how to generate the respective snap tables) for each of the following Modes and CrossNets: 7 | 8 | Modes 9 | - Genes 10 | - HUGO (http://www.genenames.org/cgi-bin/statistics) 11 | - GeneOntology (http://geneontology.org/page/download-go-annotations) 12 | - Proteins 13 | - STRING (http://string-db.org/cgi/download.pl) 14 | - Functions 15 | - GeneOntology (http://geneontology.org/page/download-ontology) 16 | - Chemicals 17 | - Drugbank (http://www.drugbank.ca/) 18 | - Diseases 19 | - DiseaseOntology (http://disease-ontology.org/) 20 | - CTD (http://ctdbase.org) 21 | - OMIM (http://www.omim.org/) 22 | 23 | CrossNets 24 | - Gene-Protein 25 | - ENSEMBL Genes, Human genes (http://www.ensembl.org/biomart/martview) 26 | - Protein-Protein 27 | - STRING (http://string-db.org/cgi/download.pl) 28 | - Gene-Function 29 | - GeneOntology (http://geneontology.org/docs/download-go-annotations/) 30 | - Function-Function 31 | - GeneOntology (http://geneontology.org/page/download-ontology) 32 | - Chemical-Chemical 33 | - Drugbank (http://www.drugbank.ca/) 34 | - Chemical-Gene 35 | - Drugbank (http://www.drugbank.ca/) 36 | - Disease-Disease 37 | - DiseaseOntology (http://disease-ontology.org/) 38 | - Disease-Gene 39 | - CTD (http://ctdbase.org) 40 | - Disease-Chemical 41 | - CTD (http://ctdbase.org) 42 | - Disease-Function 43 | - CTD (http://ctdbase.org) 44 | 45 | New Datasets can be found at /dfs/scratch2/MINER-BIO/data-miner-201908. 46 | Old Datasets can be found at /dfs/scratch2/MINER-BIO/data-miner. 
47 | 48 | The latest graph can be found at /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/snap 49 | 50 | Here's a quick look at the new miner dataset: 51 | ------------------------------- 52 | Modes | Nodes 53 | ------------------------------- 54 | Chemical | 13,339 55 | Protein | 22,406,877 56 | Gene | 106,536 57 | Function | 48,969 58 | Disease | 25,969 59 | 60 | -------------------------------------- 61 | Cross-Nets | Edges 62 | -------------------------------------- 63 | Chemical-Chemical | 2,712,183 64 | Chemical-Gene | 20,644 65 | Function-Function | 249,828 66 | Gene-Function | 481,543 67 | Gene-Protein | 18,650 68 | Disease-Disease | 9,383 69 | Disease-Gene | 64,109,210 70 | Disease-Function | 2,138,340 71 | Disease-Chemical | 2,643,750 72 | Protein-Protein | 2,147,483,643 73 | 74 | The old miner-dataset at a glance: 75 | ------------------------------- 76 | Modes | Nodes 77 | ------------------------------- 78 | Chemical | 11,367 79 | Protein | 8,254,694 80 | Gene | 104,004 81 | Function | 46,564 82 | Disease | 22,299 83 | 84 | -------------------------------------- 85 | Cross-Nets | Edges 86 | -------------------------------------- 87 | Chemical-Chemical | 95,246 88 | Chemical-Gene | 15,424 89 | Function-Function | 119,464 90 | Gene-Function | 481,733 91 | Gene-Protein | 17,930 92 | Disease-Disease | 6,877 93 | Disease-Gene | 42,475,361 94 | Disease-Function | 784,457 95 | Disease-Chemical | 1,334,088 96 | Protein-Protein | 1,847,117,370 97 | -------------------------------------------------------------------------------- /Utils/README.txt: -------------------------------------------------------------------------------- 1 | This directory creates generic scripts than can be used to create snap-formatted tsvs for modes 2 | and crossnets. This directory currently consists of three files: 3 | 4 | 1. 
create_snap_mode_table.py 5 | Input: - original dataset, in tsv form 6 | Output: - Snap mode table tsv (snap_nid\tdataset_id) 7 | - dataset specific mode snap table tsv (snap_nid\tdataset_entity_id) 8 | 2. create_snap_crossnet_table.py 9 | Input: - Source dataset specific snap mode table tsv 10 | - Destination dataset specific snap mode table tsv 11 | - the dataset (in tsv form) specifying edges. 12 | Output: - Snap crossnet table tsv (snap_eid\tdataset_id\tsnap_src_nid\tsnap_dst_nid) 13 | - he dataset specific snap table tsv (snap_eid\tsrc_dataset_id\tdst_dataset_id) 14 | 3. create_snap_mode_equiv_table.py 15 | Input: - First dataset specific snap mode table tsv 16 | - Second dataset specific snap mode table tsv 17 | - (Optional) the dataset (in tsv form) specifying id equivalences 18 | Output: - Snap mode equivlance table (snap_nid\tsnap_nid) 19 | 20 | 21 | It also contains scripts to pull out unique node ids and create an edge list (i.e. remove 22 | extraneous fields, but can also handle input lines that model many-to-many, many-to-1, and 23 | 1-to-many relationships): 24 | 25 | 4. extract_unique_node_ids.py 26 | Description: Extracts node ids from a tsv and writes all the unique ids to a tsv. 27 | Input: - dataset, in tsv form *** 28 | Output: - tsv, where each line contains a single node id 29 | 5. extract_edge_list.py 30 | Description: Extracts src and dst node ids from a file, creates a tsv edge list. Can process 31 | 1-to-1, 1-to-many, many-to-1, and many-to-many relationships in input file. 32 | Input: - dataset, in tsv form *** 33 | Output: - tsv, where each line contains the source node id and the destination node id. 34 | 35 | *** If file doesn't use tabs to separate the fields in a line, you can change the divider default 36 | argument value in the script. 37 | 38 | 39 | Below are details on the arguments and usage for each script (taken from the header of each file): 40 | 41 | ** Note that these descriptions may be out-of-date. 
Please read the header of the respective script to get the most up-to-date information about its command line arguments, etc. 42 | 43 | ######################################## 44 | ### create_snap_mode_table.py ### 45 | ######################################## 46 | 47 | Script that creates snap tables for a given mode. 48 | 49 | Usage: 50 | python create_snap_mode_table.py 51 | 52 | Positional Arguments: 53 | input_file: Path to the input file; Input file should be a tsv. 54 | mode_name: Name of the mode being created e.g. genes 55 | dataset_name: Name of dataset being used to create the snap mode tables i.e. the 56 | dataset the input file comes from. e.g. STRING 57 | dataset_id: unique integer id for this dataset. 58 | 59 | 60 | Optional arguments: 61 | --node_index: If there are multiple columns in the input tsv, the index of the column with the node id. 62 | Defaults to 0. 63 | --output_dir: Directory to create output files. Defaults to the current working directory. 64 | --full_mode_file: Name of output file tsv containing a list of \t. 65 | Defaults to output_dir/miner--.tsv 66 | --db_node_file: Name of output file tsv for a specific dataset; contains a list of \t 67 | Defaults to output_dir/miner----.tsv 68 | --snap_id_counter_start Start assigning snap ids from this integer value; this number MUST be greater 69 | than any id found in the full mode file. If not specified, finds the max id in the 70 | full_mode_file. 
71 | 72 | Example usage: 73 | Creating files for genes using two datasets, GeneOntology and HUGO: 74 | 75 | Input files: hugo.tsv and go.tsv 76 | 77 | Output directory: outputs/genes/ 78 | 79 | Output files: miner-gene-20160520.tsv, miner-gene-0-GO-20160520.tsv, miner-gene-1-HUGO-20160520.tsv 80 | 81 | Workflow: 82 | 83 | python create_snap_mode_table.py go.tsv gene GO 0 --output_dir outputs/genes/ 84 | python create_snap_mode_table.py hugo.tsv gene HUGO 1 --output_dir outputs/genes/ 85 | 86 | 87 | ############################################ 88 | ### create_snap_crossnet_table.py ### 89 | ############################################ 90 | 91 | Script that creates snap tables for a given crossnet. 92 | 93 | Usage: 94 | python create_snap_crossnet_table.py 95 | 96 | Positional Arguments: 97 | input_file Path to the input file; Input file should be a tsv. 98 | src_file Path to a dataset specific file, as outputted by create_snap_mode_table.py, 99 | corresponding to the source mode. File name MUST MATCH FORMAT: 100 | miner----.tsv 101 | dst_file Path to a dataset specific file, as outputted by create_snap_mode_table.py, 102 | corresponding to the destination mode. File name MUST MATCH FORMAT: 103 | miner----.tsv 104 | dataset_name: Name of dataset being used to create the snap crossnet tables i.e. the 105 | dataset the input file comes from. e.g. STRING 106 | dataset_id: unique integer id for this dataset. 107 | 108 | 109 | Optional arguments: 110 | --src_node_index: If there are multiple columns in the input tsv, the index of the column with the src node id. 111 | Defaults to 0. 112 | --dst_node_index: If there are multiple columns in the input tsv, the index of the column with the dst node id. 113 | Defaults to 1. 114 | --output_dir: Directory to create output files. Defaults to the current working directory. 115 | --full_crossnet_file: Name of output file tsv containing a list of \t. 
116 | Defaults to output_dir/miner---.tsv 117 | --db_edge_file: Name of output file tsv for a specific dataset; contains a list of \t 118 | Defaults to output_dir/miner-----.tsv 119 | --snap_id_counter_start Start assigning snap ids from this integer value; this number MUST be greater 120 | than any id found in the full crossnet file. If not specified, finds the max id in the 121 | full_crossnet_file. 122 | --skip_missing_ids Flag; If any of the ids in the input tsv do not have snap ids (which are fetched from 123 | the src and dst files), skip the line and continue parsing the data. 124 | --src_mode_filter The name of a function in utils.py that should be applied to the source node id in 125 | in the input file before using it to look up the snap id in the src_file. Defaults to None. 126 | --dst_mode_filter The name of a function in utils.py that should be applied to the destination node id in 127 | in the input file before using it to look up the snap id in the dst_file. Defaults to None. 128 | 129 | Example usage: 130 | Creating files for genes-function relationships using GeneOntology: 131 | 132 | Input files: go.tsv, miner-gene-0-GO-20160520.tsv, miner-function-0-GO-20160520.tsv 133 | 134 | Output directory: outputs/genes-functions/ 135 | 136 | Output files: miner-gene-function-20160520.tsv, miner-gene-function-0-GO-20160520.tsv 137 | 138 | Workflow: 139 | 140 | python create_snap_crossnet_table.py go.tsv miner-gene-0-GO-20160520.tsv miner-function-0-GO-20160520.tsv GO 0 --output_dir outputs/genes-functions/ 141 | 142 | 143 | ############################################## 144 | ### create_snap_mode_equiv_table.py ### 145 | ############################################## 146 | 147 | Script that creates snap equivalence table between two datasets for a given mode. 
148 | 149 | Usage: 150 | python create_snap_mode_equiv_table.py 151 | 152 | Positional Arguments: 153 | dataset1_file Path to a dataset specific file, as outputted by create_snap_mode_table.py, 154 | corresponding to the source mode. File name MUST MATCH FORMAT: 155 | miner----.tsv 156 | dataset2_file Path to a dataset specific file, as outputted by create_snap_mode_table.py, 157 | corresponding to the destination mode. File name MUST MATCH FORMAT: 158 | miner----.tsv 159 | 160 | 161 | Optional arguments: 162 | --mapping_file Path to a tsv file containing the mapping between the two datasets. 163 | --ds1_node_index: If there are multiple columns in the input tsv, the index of the column with the dataset1 entity id. 164 | Defaults to 0. 165 | --ds2_node_index: If there are multiple columns in the input tsv, the index of the column with the dataset2 entity id. 166 | Defaults to 1. 167 | --output_dir: Directory to create output files. Defaults to the current working directory. 168 | --equiv_file: Name of output file tsv containing a list of \t. 169 | Defaults to output_dir/miner--equiv-.tsv 170 | --skip_missing_ids Flag; If any of the ids in the input tsv do not have snap ids (which are fetched from 171 | the src and dst files), skip the line and continue parsing the data. 
172 | 173 | Example usage: 174 | Creating equivalence file for genes using GeneOntology and HUGO 175 | 176 | Input files: hugo.tsv, miner-gene-0-GO-20160520.tsv, miner-gene-1-HUGO-20160520.tsv 177 | 178 | Output directory: outputs/genes/ 179 | 180 | Output files: miner-gene-equiv-20160520.tsv 181 | 182 | Workflow: 183 | 184 | python create_snap_mode_equiv_table.py miner-gene-0-GO-20160520.tsv miner-gene-1-HUGO-20160520.tsv --mapping_file hugo.tsv --output_dir outputs/genes/ 185 | 186 | 187 | ######################################### 188 | ### extract_unique_node_ids.py ### 189 | ######################################### 190 | 191 | Script that creates a tsv containing all the unique node ids from a given input file. 192 | 193 | Usage: 194 | python extract_unique_node_ids.py ... 195 | 196 | Positional Arguments: 197 | input_file Path to the input file; Input file should be a tsv. 198 | output_file Path to the output file; Output file will be a tsv. 199 | dataset_name: Name of dataset nodes are being extracted from e.g. STRING 200 | columns: Columns containing node ids. Can specify many. 201 | 202 | 203 | Optional arguments: 204 | --node_name: String indicating how to refer to the node ids in the file scheme. Defaults to node_id. 205 | --has_title: If provided, skips over the first line of the file. 206 | --verbose: If provided, prints to the console for every million lines of the input file processed. 207 | 208 | Example usage: 209 | Extracting node ids from a STRING edgelist file, consisting of \t 210 | 211 | Input files: STRING.tsv 212 | 213 | Output file: STRING-nodes.tsv 214 | 215 | Workflow: 216 | 217 | python extract_unique_node_ids.py STRING.tsv STRING-nodes.tsv STRING 0 1 --node_name ENSEMBL_peptide_id --verbose 218 | 219 | 220 | ################################### 221 | ### extract_edge_list.py ### 222 | ################################### 223 | 224 | Script that creates an edge list given the input file.
225 | 226 | Usage: 227 | python extract_edge_list.py 228 | 229 | Positional Arguments: 230 | input_file Path to the input file; Input file should be a tsv. 231 | output_file Path to the output file; Output file will be a tsv. 232 | dataset_name: Name of dataset nodes are being extracted from e.g. STRING 233 | src_node_column: Column containing source node(s) 234 | dst_node_column: Column containing destination node(s) 235 | 236 | Optional arguments: 237 | --src_node_name: String indicating how to refer to the src node ids in the file scheme. Defaults to node_id1. 238 | --dst_node_name: String indicating how to refer to the dst node ids in the file scheme. Defaults to node_id2. 239 | --has_title: If provided, skips over the first line of the file. 240 | --verbose: If provided, prints to the console for every million lines of the input file processed. 241 | --src_node_sep: If the column containing the src node actually contains a list of nodes, the character separator 242 | used to split the text into the different node ids. Relevant for many-to-one relationships. 243 | By default assumes only one node id specified. 244 | --dst_node_sep: If the column containing the dst node actually contains a list of nodes, the character separator 245 | used to split the text into the different node ids. Relevant for one-to-many relationships. 246 | By default assumes only one node id specified. 247 | 248 | Example usage: 249 | Extracting edge list from a STRING protein-protein interactions file, which contains many other fields. 250 | 251 | Input files: STRING.tsv; assume protein 1 at index 1 and protein 2 at index 5.
252 | 253 | Output file: STRING-edges.tsv 254 | 255 | Workflow: 256 | 257 | python extract_edge_list.py STRING.tsv STRING-edges.tsv STRING 1 5 --src_node_name protein_1 --dst_node_name protein_2 258 | 259 | -------------------------------------------------------------------------------- /Utils/create_snap_crossnet_table.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file: create_snap_crossnet_table.py 3 | author: Sheila Ramaswamy(@sramas15) 4 | 5 | Script that creates snap tables for a given crossnet. 6 | 7 | Usage: 8 | python create_snap_crossnet_table.py 9 | 10 | Positional Arguments: 11 | input_file: Path to the input file; Input file should be a tsv. 12 | src_file: Path to a dataset specific file, as outputted by create_snap_mode_table.py, 13 | corresponding to the source mode. File name MUST MATCH FORMAT: 14 | miner----.tsv 15 | dst_file: Path to a dataset specific file, as outputted by create_snap_mode_table.py, 16 | corresponding to the destination mode. File name MUST MATCH FORMAT: 17 | miner----.tsv 18 | dataset_name: Name of dataset being used to create the snap crossnet tables i.e. the 19 | dataset the input file comes from. e.g. STRING 20 | dataset_id: unique integer id for this dataset. 21 | 22 | 23 | Optional arguments: 24 | --src_node_index: If there are multiple columns in the input tsv, the index of the column with the src node id. 25 | Defaults to 0. 26 | --dst_node_index: If there are multiple columns in the input tsv, the index of the column with the dst node id. 27 | Defaults to 1. 28 | --output_dir: Directory to create output files. Defaults to the current working directory. 29 | --full_crossnet_file: Name of output file tsv containing a list of \t. 
30 | Defaults to output_dir/miner---.tsv 31 | --db_edge_file: Name of output file tsv for a specific dataset; contains a list of \t 32 | Defaults to output_dir/miner-----.tsv 33 | --snap_id_counter_start Start assigning snap ids from this integer value; this number MUST be greater 34 | than any id found in the full crossnet file. If not specified, finds the max id in the 35 | full_crossnet_file. 36 | --skip_missing_ids Flag; If any of the ids in the input tsv do not have snap ids (which are fetched from 37 | the src and dst files), skip the line and continue parsing the data. 38 | --src_mode_filter The name of a function in utils.py that should be applied to the source node id in 39 | in the input file before using it to look up the snap id in the src_file. Defaults to None. 40 | --dst_mode_filter The name of a function in utils.py that should be applied to the destination node id in 41 | in the input file before using it to look up the snap id in the dst_file. Defaults to None. 42 | 43 | Example usage: 44 | Creating files for genes-function relationships using GeneOntology: 45 | 46 | Input files: go.tsv, miner-gene-0-GO-20160520.tsv, miner-function-0-GO-20160520.tsv 47 | 48 | Output directory: outputs/genes-functions/ 49 | 50 | Output files: miner-gene-function-20160520.tsv, miner-gene-function-0-GO-20160520.tsv 51 | 52 | Workflow: 53 | 54 | python create_snap_crossnet_table.py go.tsv miner-gene-0-GO-20160520.tsv miner-function-0-GO-20160520.tsv GO 0 --output_dir outputs/genes-functions/ 55 | ''' 56 | 57 | import argparse 58 | import utils 59 | import os 60 | 61 | parser = argparse.ArgumentParser(description='Create snap edge tables') 62 | parser.add_argument('input_file', help='input file name. File should be a tsv, containing interactions between ids found in src_file_name and ids found in dst_file_name') 63 | parser.add_argument('src_file', help='input file name. 
Should be a file outputted by create_snap_mode_table (with properly formatted name).') 64 | parser.add_argument('dst_file', help='input file name. Should be a file outputted by create_snap_mode_table (with properly formatted name).') 65 | parser.add_argument('dataset_name', type=str, help='name of dataset') 66 | parser.add_argument('db_id', type=int, help='int id for this dataset') 67 | parser.add_argument('--src_node_index', type=int, help='column index that contains src node ids (NOT snap ids, from src_input_file)', default=0) 68 | parser.add_argument('--dst_node_index', type=int, help='column index that contains dst node ids (NOT snap ids, from dst_input_file)', default=1) 69 | parser.add_argument('--output_dir', help='directory to output files; either this argument or full_crossnet_file and db_edge_file MUST be specified', default='.') 70 | parser.add_argument('--full_crossnet_file', help='output file name; outputs a list of snap ids, the db ids (db the snap id was derived from), and source and destination snap node ids;' \ 71 | + 'note that this file is appended to; OVERRIDES output_dir argument', default=None) 72 | parser.add_argument('--db_edge_file', help='output file name; output contains mapping of snap ids to dataset ids; OVERRIDES output dir argument', default=None) 73 | parser.add_argument('--skip_missing_ids', action='store_true', help='don\'t throw an error if ids in input_file not found in src or dst file.') 74 | parser.add_argument('--snap_id_counter_start', type=int, help='where to start assigning snap ids', default=-1) 75 | parser.add_argument('--src_mode_filter', type=str, default=None) 76 | parser.add_argument('--dst_mode_filter', type=str, default=None) 77 | args = parser.parse_args() 78 | 79 | 80 | inFNm = args.input_file 81 | srcFile = args.src_file 82 | dstFile = args.dst_file 83 | dataset = args.dataset_name 84 | db_id = args.db_id 85 | 86 | srcIdx = args.src_node_index 87 | dstIdx = args.dst_node_index 88 | 89 | src_db_id = 
utils.parse_dataset_id_from_name(os.path.basename(srcFile)) 90 | dst_db_id = utils.parse_dataset_id_from_name(os.path.basename(dstFile)) 91 | 92 | mode_name1 = utils.parse_mode_name_from_name(os.path.basename(srcFile)) 93 | mode_name2 = utils.parse_mode_name_from_name(os.path.basename(dstFile)) 94 | 95 | output_dir = args.output_dir 96 | outFNm = args.full_crossnet_file 97 | if outFNm is None: 98 | outFNm = os.path.join(args.output_dir, utils.get_full_cross_file_name(mode_name1, mode_name2)) 99 | outFNm2 = args.db_edge_file 100 | if outFNm2 is None: 101 | outFNm2 = os.path.join(args.output_dir, utils.get_cross_file_name(mode_name1, mode_name2, db_id, dataset)) 102 | 103 | 104 | src_mapping = utils.read_mode_file(srcFile) 105 | if os.path.samefile(srcFile, dstFile): 106 | dst_mapping = src_mapping 107 | else: 108 | dst_mapping = utils.read_mode_file(dstFile) 109 | 110 | src_filter = utils.get_filter(args.src_mode_filter) 111 | dst_filter = utils.get_filter(args.dst_mode_filter) 112 | 113 | add_schema = True 114 | counter = args.snap_id_counter_start 115 | if counter == -1: 116 | counter = utils.get_max_id(outFNm) 117 | print 'Starting at snap id: %d' % counter 118 | with open(inFNm, 'r') as inF: 119 | with open(outFNm, 'a') as fullF: 120 | with open(outFNm2, 'w') as dbF: 121 | # Add schema/metadata 122 | if counter == 0: 123 | fullF.write('# Full crossnet file for %s to %s\n' % (mode_name1, mode_name2)) 124 | fullF.write('# File generated on: %s\n' % utils.get_current_date()) 125 | fullF.write('# snap_eid\tdataset_id\tsrc_snap_nid\tdst_snap_nid\n') 126 | dbF.write('# Crossnet table for dataset: %s\n' % dataset) 127 | dbF.write('# File generated on: %s\n' % utils.get_current_date()) 128 | # Process file 129 | for line in inF: 130 | if line[0] == '#' or line[0] == '!' 
or line[0] == '\n': 131 | continue 132 | vals = utils.split_then_strip(line, '\t') 133 | if add_schema: 134 | attrs_schema = '# snap_eid\tsrc_dataset_id\tdst_dataset_id' 135 | for i in range(len(vals)): 136 | if i != srcIdx and i != dstIdx: 137 | attrs_schema += '\tC%d' % i 138 | dbF.write('%s\n' % attrs_schema) 139 | add_schema = False 140 | id1 = vals[srcIdx] 141 | id2 = vals[dstIdx] 142 | if src_filter: 143 | id1 = src_filter(id1) 144 | if dst_filter: 145 | id2 = dst_filter(id2) 146 | if id1 == '' or id2 == '': 147 | continue 148 | if args.skip_missing_ids and (id1 not in src_mapping or id2 not in dst_mapping): 149 | continue 150 | attr_strs = '' 151 | for i in range(len(vals)): 152 | if i != srcIdx and i != dstIdx: 153 | attr_strs += '\t' + vals[i] 154 | fullF.write('%d\t%d\t%d\t%d\n' % (counter, db_id, src_mapping[id1], dst_mapping[id2])) 155 | dbF.write('%d\t%d\t%d%s\n' % (counter, src_db_id, dst_db_id, attr_strs)) 156 | counter += 1 157 | print 'Ending at snap id: %d' % counter 158 | -------------------------------------------------------------------------------- /Utils/create_snap_mode_equiv_table.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file: create_snap_mode_equiv_table.py 3 | author: Sheila Ramaswamy(@sramas15) 4 | 5 | Script that creates snap equivalence table between two datasets for a given mode. 6 | 7 | Usage: 8 | python create_snap_mode_equiv_table.py 9 | 10 | Positional Arguments: 11 | dataset1_file: Path to a dataset specific file, as outputted by create_snap_mode_table.py, 12 | corresponding to the source mode. File name MUST MATCH FORMAT: 13 | miner----.tsv 14 | dataset2_file: Path to a dataset specific file, as outputted by create_snap_mode_table.py, 15 | corresponding to the destination mode. File name MUST MATCH FORMAT: 16 | miner----.tsv 17 | 18 | 19 | Optional arguments: 20 | --mapping_file: Path to a tsv file containing the mapping between the two datasets. 
21 | --ds1_node_index: If there are multiple columns in the input tsv, the index of the column with the dataset1 entity id. 22 | Defaults to 0. 23 | --ds2_node_index: If there are multiple columns in the input tsv, the index of the column with the dataset2 entity id. 24 | Defaults to 1. 25 | --output_dir: Directory to create output files. Defaults to the current working directory. 26 | --equiv_file: Name of output file tsv containing a list of \t. 27 | Defaults to output_dir/miner--equiv-.tsv 28 | --skip_missing_ids Flag; If any of the ids in the input tsv do not have snap ids (which are fetched from 29 | the src and dst files), skip the line and continue parsing the data. 30 | 31 | Example usage: 32 | Creating equivalence file for genes using GeneOntology and HUGO 33 | 34 | Input files: hugo.tsv, miner-gene-0-GO-20160520.tsv, miner-gene-1-HUGO-20160520.tsv 35 | 36 | Output directory: outputs/genes/ 37 | 38 | Output files: miner-gene-equiv-20160520.tsv 39 | 40 | Workflow: 41 | 42 | python create_snap_mode_equiv_table.py miner-gene-0-GO-20160520.tsv miner-gene-1-HUGO-20160520.tsv --mapping_file hugo.tsv --output_dir outputs/genes/ 43 | ''' 44 | 45 | import argparse 46 | import utils 47 | import os 48 | 49 | parser = argparse.ArgumentParser(description='Create snap edge tables') 50 | parser.add_argument('dataset1_file', help='input file name. Should be a file outputted by create_snap_mode_table (with properly formatted name).') 51 | parser.add_argument('dataset2_file', help='input file name.
Should be a file outputted by create_snap_mode_table (with properly formatted name).') 52 | parser.add_argument('--mapping_file', help='path to a tsv file giving a mapping between the dataset specific ids in dataset1 and dataset2 files.', default=None) 53 | parser.add_argument('--ds1_node_index', type=int, help='column index that contains ds1 node ids (NOT snap ids, from src_input_file)', default=0) 54 | parser.add_argument('--ds2_node_index', type=int, help='column index that contains ds2 node ids (NOT snap ids, from dst_input_file)', default=1) 55 | parser.add_argument('--output_dir', help='directory to output files; either this argument or full_crossnet_file and db_edge_file MUST be specified', default='.') 56 | parser.add_argument('--equiv_file', help='output file name; outputs a equivalence table of snap ids' \ 57 | + 'note that this file is appended to; OVERRIDES output_dir argument', default=None) 58 | parser.add_argument('--skip_missing_ids', action='store_true', help='don\'t throw an error if ids in input_file not found in src or dst file.') 59 | args = parser.parse_args() 60 | 61 | 62 | inFNm = args.mapping_file 63 | dsFile1 = args.dataset1_file 64 | dsFile2 = args.dataset2_file 65 | 66 | ds1Idx = args.ds1_node_index 67 | ds2Idx = args.ds2_node_index 68 | 69 | output_dir = args.output_dir 70 | outFNm = args.equiv_file 71 | mode_name = 'Unknown' 72 | if outFNm is None: 73 | mode_name1 = utils.parse_mode_name_from_name(os.path.basename(dsFile1)) 74 | mode_name2 = utils.parse_mode_name_from_name(os.path.basename(dsFile2)) 75 | mode_name = mode_name1 76 | assert mode_name1 == mode_name2 77 | outFNm = os.path.join(args.output_dir, utils.get_equiv_mode_file_name(mode_name1)) 78 | 79 | ds1_mapping = utils.read_mode_file(dsFile1) 80 | if os.path.samefile(dsFile1, dsFile2): 81 | ds2_mapping = ds1_mapping 82 | else: 83 | ds2_mapping = utils.read_mode_file(dsFile2) 84 | 85 | add_header = True 86 | if os.path.isfile(outFNm): 87 | add_header = False 88 | 89 | 90 | 
with open(outFNm, 'a') as equivF: 91 | if add_header: 92 | equivF.write('# Equivalence table for mode %s\n' % mode_name) 93 | equivF.write('# File generated on: %s\n' % utils.get_current_date()) 94 | equivF.write('# snap_nid_1\tsnap_nid_2\n') 95 | if inFNm is not None: 96 | with open(inFNm, 'r') as inF: 97 | for line in inF: 98 | if line[0] == '#' or line[0] == '\n': 99 | continue 100 | vals = utils.split_then_strip(line, '\t') 101 | id1 = vals[ds1Idx] 102 | id2 = vals[ds2Idx] 103 | if id1 == '' or id2 == '': 104 | continue 105 | if args.skip_missing_ids and (id1 not in ds1_mapping or id2 not in ds2_mapping): 106 | continue 107 | equivF.write('%d\t%d\n' % (ds1_mapping[id1], ds2_mapping[id2])) 108 | else: 109 | for id1 in ds1_mapping: 110 | if id1 in ds2_mapping: 111 | equivF.write('%d\t%d\n' % (ds1_mapping[id1], ds2_mapping[id1])) 112 | -------------------------------------------------------------------------------- /Utils/create_snap_mode_table.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file: create_snap_mode_table.py 3 | author: Sheila Ramaswamy(@sramas15) 4 | 5 | Script that creates snap tables for a given mode. 6 | 7 | Usage: 8 | python create_snap_mode_table.py 9 | 10 | Positional Arguments: 11 | input_file: Path to the input file; Input file should be a tsv. 12 | mode_name: Name of the mode being created e.g. genes 13 | dataset_name: Name of dataset being used to create the snap mode tables i.e. the 14 | dataset the input file comes from. e.g. STRING 15 | dataset_id: unique integer id for this dataset. 16 | 17 | 18 | Optional arguments: 19 | --node_index: If there are multiple columns in the input tsv, the index of the column with the node id. 20 | Defaults to 0. 21 | --output_dir: Directory to create output files. Defaults to the current working directory. 22 | --full_mode_file: Name of output file tsv containing a list of \t. 
23 | Defaults to output_dir/miner--.tsv 24 | --db_node_file: Name of output file tsv for a specific dataset; contains a list of \t 25 | Defaults to output_dir/miner----.tsv 26 | --snap_id_counter_start Start assigning snap ids from this integer value; this number MUST be greater 27 | than any id found in the full mode file. If not specified, finds the max id in the 28 | full_mode_file. 29 | 30 | Example usage: 31 | Creating files for genes using two datasets, GeneOntology and HUGO: 32 | 33 | Input files: hugo.tsv and go.tsv 34 | 35 | Output directory: outputs/genes/ 36 | 37 | Output files: miner-gene-20160520.tsv, miner-gene-0-GO-20160520.tsv, miner-gene-1-HUGO-20160520.tsv 38 | 39 | Workflow: 40 | 41 | python create_snap_mode_table.py go.tsv gene GO 0 --output_dir outputs/genes/ 42 | python create_snap_mode_table.py hugo.tsv gene HUGO 1 --output_dir outputs/genes/ 43 | ''' 44 | 45 | import argparse 46 | import utils 47 | import os 48 | 49 | 50 | # Create command line arguments 51 | parser = argparse.ArgumentParser(description='Create snap node tables; for more detailed description, please see file header.') 52 | parser.add_argument('input_file', help='input file name. 
File should be a tsv, with one mode-specific id per line (unless --node_index specified)') 53 | parser.add_argument('mode_name', type=str, help='mode name') 54 | parser.add_argument('dataset_name', type=str, help='name of dataset') 55 | parser.add_argument('db_id', type=int, help='int id for this dataset') 56 | parser.add_argument('--node_index', type=int, help='column index that contains node ids', default=0) 57 | parser.add_argument('--output_dir', help='directory to output files; either this argument or full_mode_file and db_node_file MUST be specified', default='.') 58 | parser.add_argument('--full_mode_file', help='output file name; outputs a list of snap ids and the db ids (db the snap id was derived from);' \ 59 | + 'note that this file is appended to; OVERRIDES output_dir argument', default=None) 60 | parser.add_argument('--db_node_file', help='output file name; output contains mapping of snap ids to db protein ids; OVERRIDES output dir argument', default=None) 61 | parser.add_argument('--snap_id_counter_start', type=int, help='where to start assigning snap ids', default=-1) 62 | args = parser.parse_args() 63 | 64 | 65 | # Process command line arguments, get default path names 66 | inFNm = args.input_file 67 | db_id = args.db_id 68 | mode_name = args.mode_name 69 | dataset = args.dataset_name 70 | outFNm = args.full_mode_file 71 | if outFNm is None: 72 | outFNm = os.path.join(args.output_dir, utils.get_full_mode_file_name(mode_name)) 73 | dbFNm = args.db_node_file 74 | if dbFNm is None: 75 | dbFNm = os.path.join(args.output_dir, utils.get_mode_file_name(mode_name, db_id, dataset)) 76 | 77 | counter = args.snap_id_counter_start 78 | if counter == -1: 79 | counter = utils.get_max_id(outFNm) 80 | node_index = args.node_index 81 | 82 | 83 | # Read input file, create output files. 
84 | seen = set() 85 | print 'Starting at snap id: %d' % counter 86 | with open(inFNm, 'r') as inF: 87 | with open(outFNm, 'a') as outF: 88 | with open(dbFNm, 'w') as dbF: 89 | if counter == 0: 90 | outF.write('# Full mode table for %s\n' % mode_name) 91 | outF.write('# File generated on: %s\n' % utils.get_current_date()) 92 | outF.write('# snap_nid\tdataset id\n') 93 | dbF.write('# Mode table for dataset: %s\n' % dataset) 94 | dbF.write('# File generated on: %s\n' % utils.get_current_date()) 95 | add_schema = True 96 | for line in inF: 97 | if line[0] == '#' or line[0] == '!' or line[0] == '\n': # skip comments 98 | continue 99 | vals = utils.split_then_strip(line, '\t') 100 | if add_schema: 101 | attrs_schema = '# snap_nid\tdataset_nid' 102 | for i in range(len(vals)): 103 | if i != node_index: 104 | attrs_schema += '\tC%d' % i 105 | dbF.write('%s\n' % attrs_schema) 106 | add_schema = False 107 | node_id = vals[node_index] 108 | if node_id in seen or len(node_id) == 0: 109 | continue 110 | attrs_str = '' 111 | for i in range(len(vals)): 112 | if i != node_index: 113 | attrs_str += '\t' + vals[i] 114 | outF.write('%d\t%d\n' % (counter, db_id)) 115 | dbF.write('%d\t%s%s\n' % (counter, node_id, attrs_str)) 116 | seen.add(node_id) 117 | counter += 1 118 | 119 | print 'Ending at snap id: %d' % counter 120 | 121 | -------------------------------------------------------------------------------- /Utils/extract_edge_list.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file: extract_edge_list.py 3 | author: Sheila Ramaswamy(@sramas15) 4 | 5 | Script that creates an edge list given the input file. 6 | 7 | Usage: 8 | python extract_edge_list.py 9 | 10 | Positional Arguments: 11 | input_file: Path to the input file; Input file should be a tsv. 12 | output_file: Path to the output file; Output file will be a tsv. 13 | dataset_name: Name of dataset nodes are being extracted from e.g. 
STRING 14 | src_node_column: Column containing source node(s) 15 | dst_node_column: Column containing destination node(s) 16 | 17 | Optional arguments: 18 | --src_node_name: String indicating how to refer to the src node ids in the file scheme. Defaults to node_id1. 19 | --dst_node_name: String indicating how to refer to the dst node ids in the file scheme. Defaults to node_id2. 20 | --has_title: If provided, skips over the first line of the file. 21 | --verbose: If provided, prints to the console for every million lines of the input file processed. 22 | --src_node_sep: If the column containing the src node actually contains a list of nodes, the character separator 23 | used to split the text into the different node ids. Relevant for many-to-one relationships. 24 | By default assumes only one node id specified. 25 | --dst_node_sep: If the column containing the dst node actually contains a list of nodes, the character separator 26 | used to split the text into the different node ids. Relevant for one-to-many relationships. 27 | By default assumes only one node id specified. 28 | 29 | Example usage: 30 | Extracting edge list from a STRING protein-protein interactions file, which contains many other fields. 31 | 32 | Input files: STRING.tsv; assume protein 1 at index 1 and protein 2 at index 5.
33 | 34 | Output file: STRING-edges.tsv 35 | 36 | Workflow: 37 | 38 | python extract_edge_list.py STRING.tsv STRING-edges.tsv STRING 1 5 --src_node_name protein_1 --dst_node_name protein_2 39 | ''' 40 | import argparse 41 | import utils 42 | 43 | parser = argparse.ArgumentParser(description='Extract edges and additional data from a file') 44 | parser.add_argument('input_file', help='input file name.') 45 | parser.add_argument('output_file', help='output file name.') 46 | parser.add_argument('dataset_name', help='Name of the dataset') 47 | parser.add_argument('src_node_col', help='column index containing source nodes') 48 | parser.add_argument('dst_node_col', help='column index containing destination nodes') 49 | parser.add_argument('--src_node_sep', help='if multiple ids are specified in this column,' \ 50 | + ' character used to split them', default=None) 51 | parser.add_argument('--dst_node_sep', help='if multiple ids are specified in this column,' \ 52 | + ' character used to split them', default=None) 53 | parser.add_argument('--has_title', action='store_true', 54 | help='has a title line') 55 | parser.add_argument('--verbose', action='store_true', 56 | help='Print every 1,000,000 lines processed') 57 | parser.add_argument('--divider', default='\t', type=str, help='separator') 58 | parser.add_argument('--src_node_name', default='node_id1', type=str, help='how to identify the src nodes in the header for tsv') 59 | parser.add_argument('--dst_node_name', default='node_id2', type=str, help='how to identify the dst nodes in the header for tsv') 60 | 61 | if __name__ == '__main__': 62 | args = parser.parse_args() 63 | #print(args) 64 | with open(args.input_file, 'r') as inF: 65 | with open(args.output_file, 'w') as outF: 66 | outF.write('# Dataset: %s\n' % args.dataset_name) 67 | outF.write('# %s\t%s\n' % (args.src_node_name, args.dst_node_name)) 68 | for i, line in enumerate(inF): 69 | if args.verbose and i%1000000 == 0: 70 | print 'Finished processing line %d in 
the original input file' % i 71 | if line[0] == '#' or line[0] == '!' or line[0] == '\n' or (i==0 and args.has_title): 72 | continue 73 | vals = utils.split_then_strip(line, args.divider) 74 | #print(vals) 75 | src_nodes = [vals[int(args.src_node_col)]] 76 | dst_nodes = [vals[int(args.dst_node_col)]] 77 | if args.src_node_sep is not None: 78 | src_nodes = src_nodes[0].split(args.src_node_sep) 79 | if args.dst_node_sep is not None: 80 | dst_nodes = dst_nodes[0].split(args.dst_node_sep) 81 | for src_node in src_nodes: 82 | if src_node == '': 83 | continue 84 | for dst_node in dst_nodes: 85 | if dst_node != '': 86 | outF.write('%s\t%s\n' % (src_node, dst_node)) 87 | -------------------------------------------------------------------------------- /Utils/extract_unique_node_ids.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file: extract_unique_node_ids.py 3 | author: Sheila Ramaswamy(@sramas15) 4 | 5 | Script that creates a tsv containing all the unique node ids from a given input file. 6 | 7 | Usage: 8 | python extract_unique_node_ids.py ... 9 | 10 | Positional Arguments: 11 | input_file: Path to the input file; Input file should be a tsv. 12 | output_file: Path to the output file; Output file will be a tsv. 13 | dataset_name: Name of dataset nodes are being extracted from e.g. STRING 14 | columns: Columns containing node ids. Can specify many. 15 | 16 | 17 | Optional arguments: 18 | --node_name: String indicating how to refer to the node ids in the file scheme. Defaults to node_id. 19 | --has_title: If provided, skips over the first line of the file. 20 | --verbose: If provided, prints to the console for every million lines of the input file processed. 
21 | 22 | Example usage: 23 | Extracting node ids from a STRING edgelist file, consisting of \t 24 | 25 | Input files: STRING.tsv 26 | 27 | Output file: STRING-nodes.tsv 28 | 29 | Workflow: 30 | 31 | python extract_unique_node_ids.py STRING.tsv STRING-nodes.tsv STRING 0 1 --node_name ENSEMBL_peptide_id --verbose 32 | ''' 33 | 34 | 35 | import argparse 36 | import utils 37 | 38 | parser = argparse.ArgumentParser(description='Extract unique node ids from file.') 39 | parser.add_argument('input_file', help='input file name.') 40 | parser.add_argument('output_file', help='output file name.') 41 | parser.add_argument('dataset_name', help='Name of the dataset') 42 | parser.add_argument('columns', metavar='N', type=int, nargs='+', 43 | help='columnswith node ids') 44 | parser.add_argument('--has_title', action='store_true', 45 | help='has a title line that is not prefixed with a #') 46 | parser.add_argument('--verbose', action='store_true', 47 | help='Print every 1,000,000 lines processed') 48 | parser.add_argument('--divider', default='\t', type=str, help='column separator, by default a tab') 49 | parser.add_argument('--node_name', default='node_id', type=str, help='how to identify the nodes in the header for tsv') 50 | 51 | if __name__ == '__main__': 52 | args = parser.parse_args() 53 | with open(args.input_file, 'r') as inF: 54 | unique_ids = set() 55 | with open(args.output_file, 'w') as outF: 56 | outF.write('# Dataset: %s\n' % args.dataset_name) 57 | outF.write('# %s\n' % args.node_name) 58 | for i, line in enumerate(inF): 59 | if args.verbose and i%1000000 == 0: 60 | print 'Finished processing line %d in the original input file' % i 61 | if line[0] == '#' or line[0] == '!' 
or line[0] == '\n' or (i==0 and args.has_title): 62 | continue 63 | vals = utils.split_then_strip(line, args.divider) 64 | for column in args.columns: 65 | if vals[column] not in unique_ids and len(vals[column]) > 0: 66 | unique_ids.add(vals[column]) 67 | new_line = '%s\n' % vals[column] 68 | outF.write(new_line) 69 | -------------------------------------------------------------------------------- /Utils/getStats.py: -------------------------------------------------------------------------------- 1 | import snap 2 | import time 3 | 4 | #from utils.network_utils import get_num_elem_per_mode 5 | 6 | filename = "Graphs/oldMinerNewSNAP.graph" 7 | FIn = snap.TFIn(filename) 8 | Graph = snap.TMMNet.Load(FIn) 9 | 10 | print('Modes: %d' % Graph.GetModeNets()) 11 | print('Link types: %d' % Graph.GetCrossNets()) 12 | 13 | crossnetids = snap.TInt64V() 14 | crossneti = Graph.BegCrossNetI() 15 | while crossneti < Graph.EndCrossNetI(): 16 | crossnetids.Add(crossneti.GetCrossId()) 17 | crossneti.Next() 18 | 19 | nodeattrmapping = snap.TIntStrStrTr64V() 20 | edgeattrmapping = snap.TIntStrStrTr64V() 21 | start_time = time.time() 22 | DirectedNetwork = Graph.ToNetwork(crossnetids, nodeattrmapping, edgeattrmapping) 23 | end_time = time.time() 24 | print("Converting to TNEANet takes %s seconds" % (end_time - start_time)) 25 | 26 | snap.PrintInfo(DirectedNetwork, "Python type PNEANet", "output.txt", False) 27 | map(lambda x: x.replace("\n", ""), open("output.txt").readlines()) 28 | -------------------------------------------------------------------------------- /Utils/utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file: utils.py 3 | author: Sheila Ramaswamy (@sramas15) 4 | 5 | File containing util functions useful for other scripts. 
6 | ''' 7 | import os 8 | from datetime import datetime 9 | 10 | HUMAN_SPECIES_ID = '9606' 11 | 12 | def get_filter(method_name): 13 | '''Given a function name(string), returns the corresponding function in this file. 14 | 15 | Input: 16 | method_name: string, function name. 17 | Output: 18 | a function in this file or None, if the function doesn't exist. 19 | ''' 20 | if method_name is None: 21 | return None 22 | possibles = globals().copy() 23 | possibles.update(locals()) 24 | method = possibles.get(method_name) 25 | return method 26 | 27 | def remove_species_id(name): 28 | '''Filter that removes the species id from a string id. Currently only works 29 | with the human species id. 30 | 31 | Input: 32 | name: an ENSEMBL human protein id, prefixed with the species id. 33 | Output: 34 | the ENSEMBL human protein id, without the species prefix. 35 | ''' 36 | vals = split_then_strip(name, '.') 37 | if len(vals) != 2 or vals[0] != HUMAN_SPECIES_ID: 38 | return name 39 | return vals[1] 40 | 41 | def add_species_id(name): 42 | '''Adds the human species id as a prefix to the given ENSEMBL id. 43 | 44 | Input: 45 | name: the ENSEMBL protein id (string). 46 | Output: 47 | a string consisting of the human species id, '.' and name. 48 | ''' 49 | return '%s.%s' % (HUMAN_SPECIES_ID, name) 50 | 51 | def split_then_strip(string, split_char): 52 | '''Splits the string using the given character and removes whitespace from all 53 | resulting substrings. 54 | 55 | Input: 56 | string: string being split 57 | split_char: character (or multiple characters) used to split string 58 | Output: 59 | a list, consisting of the stripped substrings. 60 | ''' 61 | return [s.strip() for s in string.split(split_char)] 62 | 63 | 64 | def get_file_len(input_file): 65 | '''Returns the length of the input_file; Returns 0 if the file does not exist. 66 | 67 | Input: 68 | input_file: path to the input file. 69 | Output: 70 | number of lines in the file. 
71 | ''' 72 | if os.path.isfile(input_file): 73 | return sum(1 for line in open(input_file, 'r')) 74 | return 0 75 | 76 | def get_max_id(input_file): 77 | '''Returns the max snap id of the input_file; Returns 0 if the file does not exist. 78 | Assumes file in format of snap mode or crossnet full table tsv file. 79 | 80 | Input: 81 | input_file: path to the input file. 82 | Output: 83 | max snap id in input file. 84 | ''' 85 | max_id = -1 86 | if os.path.isfile(input_file): 87 | with open(input_file, 'r') as inF: 88 | for line in inF: 89 | if line[0]=='#': 90 | continue 91 | new_id = int(line.strip().split('\t')[0]) 92 | if new_id > max_id: 93 | max_id = new_id 94 | max_id += 1 95 | return max_id 96 | 97 | def get_current_date(): 98 | '''Returns the current date, formatted as YYYYMMDD 99 | 100 | Input: 101 | None 102 | Output: 103 | Current date, as a string. 104 | ''' 105 | format = '%Y%m%d' 106 | return datetime.now().strftime(format) 107 | 108 | def get_full_mode_file_name(mode_name): 109 | '''Returns the formatted file name that should contain the full list of snap ids for the mode. 110 | 111 | Input: 112 | mode_name: the name of the mode 113 | Output: 114 | the formatted file name. 115 | ''' 116 | return 'miner-%s-%s.tsv' % (mode_name, get_current_date()) 117 | 118 | def get_equiv_mode_file_name(mode_name): 119 | '''Returns the formatted file name that should contain the equivalence table of snap ids for the mode. 120 | 121 | Input: 122 | mode_name: the name of the mode 123 | Output: 124 | the formatted file name. 125 | ''' 126 | return 'miner-%s-equiv-%s.tsv' % (mode_name, get_current_date()) 127 | 128 | def get_mode_file_name(mode_name, db_id, dataset): 129 | '''Returns the formatted file name that should contain the snap id to dataset 130 | specific id mapping. 131 | 132 | Input: 133 | mode_name: the name of the mode 134 | db_id: dataset id for the given dataset. 135 | dataset: the name of the dataset e.g. STRING 136 | Output: 137 | the formatted file name. 
138 | ''' 139 | return 'miner-%s-%d-%s-%s.tsv' % (mode_name, int(db_id), dataset, get_current_date()) 140 | 141 | def get_full_cross_file_name(mode_name1, mode_name2): 142 | '''Returns the formatted file name that should contain the full list of snap ids for the cross net. 143 | 144 | Input: 145 | mode_name1: the name of the src mode 146 | mode_name2: the name of the dst mode 147 | Output: 148 | the formatted file name. 149 | ''' 150 | return 'miner-%s-%s-%s.tsv' % (mode_name1, mode_name2, get_current_date()) 151 | 152 | def get_cross_file_name(mode_name1, mode_name2, db_id, dataset): 153 | '''Returns the formatted file name that should contain the snap id to the dataset ids 154 | for the source and destination nodes. 155 | 156 | Input: 157 | mode_name1: the name of the src mode 158 | mode_name2: the name of the dst mode 159 | db_id: dataset id for the given dataset. 160 | dataset: the name of the dataset e.g. STRING 161 | Output: 162 | the formatted file name. 163 | ''' 164 | return 'miner-%s-%s-%d-%s-%s.tsv' % (mode_name1, mode_name2, int(db_id), dataset, get_current_date()) 165 | 166 | def parse_dataset_id_from_name(file_name): 167 | '''Extracts the dataset id from the formatted mode file name. 168 | 169 | Input: 170 | file_name: mode file name, as returned by get_mode_file_name. 171 | Output: 172 | the integer dataset id. 173 | ''' 174 | return int(file_name.split('-')[2]) 175 | 176 | def parse_dataset_name_from_name(file_name): 177 | '''Extracts the dataset name from the formatted mode file name. 178 | 179 | Input: 180 | file_name: mode file name, as returned by get_mode_file_name. 181 | Output: 182 | the string dataset name. 183 | ''' 184 | return file_name.split('-')[3] 185 | 186 | def parse_mode_name_from_name(file_name): 187 | '''Extracts the mode name from the formatted mode file name. 188 | 189 | Input: 190 | file_name: mode file name, as returned by get_mode_file_name. 191 | Output: 192 | the string dataset mode name. 
193 | ''' 194 | return file_name.split('-')[1] 195 | 196 | def read_mode_file(map_file): 197 | '''Reads the mapping between dataset specific ids to snap ids into a dictionary. 198 | 199 | Input: 200 | map_file: file containing the mapping. 201 | Output: 202 | dictionary from the dataset specific ids to snap ids. 203 | ''' 204 | mapping = {} 205 | with open(map_file, 'r') as inF: 206 | for line in inF: 207 | if len(line) == 0 or line[0] == '#': 208 | continue 209 | vals = split_then_strip(line, '\t') 210 | snap_id = vals[0] 211 | dataset_id = vals[1] 212 | mapping[dataset_id] = int(snap_id) 213 | return mapping 214 | -------------------------------------------------------------------------------- /drugbank/edges/README.txt: -------------------------------------------------------------------------------- 1 | Generate Edges from Drugbank Database 2 | ------------------------------------- 3 | 4 | 1. Models the following interactions from the drugbank database: 5 | 1. Drug-Drug interaction 6 | 2. Drug-Gene interacion 7 | 2. External Dependencies: 8 | 1. Requires mapping from SnapGeneID<->UniportKB 9 | 3. Steps: 10 | I. Download drugbank.xml 11 | II. Ensure that gene mapping from 2 is in the current folder. 12 | III. Run make-edges.sh 13 | 3. If BeautifulSoup is not installed or you don't have permissions to install: 14 | I. Follow steps mentioned on http://docs.python-guide.org/en/latest/dev/virtualenvs/ 15 | II. Next launch the env as : source /env/bin/activate 16 | III. 
Run make-edges.sh 17 | -------------------------------------------------------------------------------- /drugbank/edges/getDrugInteractions.py: -------------------------------------------------------------------------------- 1 | ############################################# 2 | # XML parser to parse the drugbank database 3 | # Will output a space separated .txt file 4 | # with the following column headers: 5 | # DrugbankId DrugbankId 6 | # Each line represents a drug-drug interaction 7 | ############################################## 8 | 9 | from bs4 import BeautifulSoup 10 | soup = BeautifulSoup(open("./drugbank.xml"),"xml") 11 | sep = " " 12 | with open('edgesD.txt', 'w') as f: 13 | for drug in soup.findAll("drug"): 14 | drugName = drug.find("drugbank-id").text 15 | interactions = drug.findAll("drug-interaction") 16 | if not interactions: 17 | continue 18 | for i in interactions: 19 | toPrint = drugName + sep + i.find("drugbank-id").text 20 | f.write(toPrint.encode('utf-8') + '\n') 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /drugbank/edges/getGeneInteractions.py: -------------------------------------------------------------------------------- 1 | ########################################### 2 | # XML parser to parse the drugbank database 3 | # Will output a space separated .txt file 4 | # with the following column headers: 5 | # DrugbankId Gene1 Gene2 ... 6 | # Currently the UniProt ID is used for genes.
7 | ########################################### 8 | 9 | from bs4 import BeautifulSoup 10 | soup = BeautifulSoup(open("./../drugbank.xml"),"xml") 11 | sep = " " 12 | empty = "NULL" 13 | #geneIdentifier = "HUGO Gene Nomenclature Committee (HGNC)" 14 | geneIdentifier = "UniProtKB" 15 | with open('drugGene.txt', 'w') as f: 16 | for drug in soup.findAll("drug"): 17 | toPrint = "" 18 | toPrint += drug.find("drugbank-id").text + sep 19 | # Get target Genes 20 | targets = drug.findAll("target") 21 | targetGene = [] 22 | if targets: 23 | for target in targets: 24 | externIden = target.findAll("external-identifier") 25 | if not externIden: 26 | continue 27 | for iden in externIden: 28 | if iden.find("resource").text == geneIdentifier: 29 | targetGene.append(iden.find("identifier").text) 30 | # Get Enzyme Gene 31 | enzymes = drug.findAll("enzyme") 32 | enzymeGene = [] 33 | if enzymes: 34 | for enzyme in enzymes: 35 | externIden = enzyme.findAll("external-identifier") 36 | if not externIden: 37 | continue 38 | for iden in externIden: 39 | if iden.find("resource").text == geneIdentifier: 40 | enzymeGene.append(iden.find("identifier").text) 41 | allGene = targetGene + enzymeGene 42 | if len(allGene) == 0: 43 | toPrint += empty 44 | else: 45 | toPrint += ','.join(allGene) 46 | f.write(toPrint.encode('utf-8') + '\n') 47 | 48 | 49 | -------------------------------------------------------------------------------- /drugbank/edges/make-edges.sh: -------------------------------------------------------------------------------- 1 | python getDrugInteractions.py 2 | python getGeneInteractions.py 3 | python makeEdgeTableCC.py 4 | python makeEdgeTableCG.py 5 | -------------------------------------------------------------------------------- /drugbank/edges/makeEdgeTableCC.py: -------------------------------------------------------------------------------- 1 | ######################################################## 2 | # Takes as input a .txt file of drug-drug interactions. 
3 | # and subSnapDrugbank.txt which contains mapping from 4 | # snapChemicalId to DrugbankId. 5 | # Outputs : 6 | # 1. Master Edge Table: SnapCCId EdgeTableNo SrcId DstId 7 | # 2. Sub Tables: 8 | # 1. SnapCCId SrcId DstId (DrugbankId) 9 | # All edges are undirected. Hence A-B is reported only 10 | # once. NOTE(review): the duplicate check below only skips an exact repeat of "A B"; the reverse pair "B A" would still be written — this relies on DrugBank listing each pair in one direction only; confirm against the input. 11 | ######################################################### 12 | from collections import defaultdict 13 | 14 | sep = " " 15 | snapIdPrefix = "SCC" 16 | edgeFile = "edgesD.txt" 17 | nodeMap = "./../nodes/subSnapDrugbank.txt" 18 | masterTable = "snapChemicalCC.txt" 19 | subTable = "subSnapDrugbankCC.txt" 20 | idNum = 0 21 | # Make a dict mapping from drugbankId to snapChemId 22 | drugbankSnap = {} 23 | with open(nodeMap, 'r') as f: 24 | for line in f: 25 | line = line.strip().split(sep) 26 | drugbankSnap[line[1]] = line[0] 27 | # drugsDone maps a source DrugBank id to the list of partner ids already written (duplicate filter for the loop below) 28 | drugsDone = defaultdict(list) 29 | with open(edgeFile, 'r') as f, open(masterTable, 'w') as master, open(subTable, 'w') as sub: 30 | for line in f: 31 | line = line.strip().split(sep) 32 | if line[1] in drugsDone[line[0]]: 33 | continue 34 | if line[0] not in drugbankSnap or line[1] not in drugbankSnap: 35 | continue 36 | drugsDone[line[0]].append(line[1]) 37 | snapId = snapIdPrefix + str(idNum) 38 | idNum += 1 39 | master.write(snapId + sep + "0 " + drugbankSnap[line[0]] + sep + drugbankSnap[line[1]] + '\n') 40 | sub.write(snapId + sep + line[0] + sep + line[1] + '\n') 41 | 42 | 43 | -------------------------------------------------------------------------------- /drugbank/edges/makeEdgeTableCG.py: -------------------------------------------------------------------------------- 1 | ######################################################## 2 | # Takes as input a .txt file of drug-gene interactions. 3 | # and subSnapDrugbank.txt which contains mapping from 4 | # snapChemicalId to DrugbankId. 5 | # Depends on the mapping of Genes to SnapGeneID. 6 | # Outputs : 7 | # 1. Master Edge Table: SnapCGId EdgeTableNo SrcId DstId 8 | # 2. 
Sub Tables: 9 | # 1. SnapCGId SrcId (DrugbankID) DstId (UniportId) 10 | # All edges are undirected. Hence A-B is reported only 11 | # once. 12 | ######################################################### 13 | from collections import defaultdict 14 | 15 | sep = " " 16 | snapIdPrefix = "SCG" 17 | edgeFile = "drugGene.txt" 18 | nodeMap = "./../nodes/subSnapDrugbank.txt" 19 | masterTable = "snapChemicalCG.txt" 20 | subTable = "subSnapDrugbankCG.txt" 21 | geneMap = "./snap.genes.0.go" 22 | idNum = 0 23 | # Make a dict mapping from drugbankId to snapChemId 24 | drugbankSnap = {} 25 | with open(nodeMap, 'r') as f: 26 | for line in f: 27 | line = line.strip().split(sep) 28 | drugbankSnap[line[1]] = line[0] 29 | 30 | # Make a dict mapping from UniProtKB to snapGeneId 31 | geneSnap = {} 32 | with open(geneMap, 'r') as f: 33 | for line in f: 34 | line = line.strip().split('\t') 35 | geneSnap[line[1]] = line[0] 36 | 37 | with open(edgeFile, 'r') as f, open(masterTable, 'w') as master, open(subTable, 'w') as sub: 38 | for line in f: 39 | line = line.strip().split(sep) 40 | if line[0] not in drugbankSnap: 41 | continue 42 | geneList = line[1].split(",") 43 | if geneList[0] == "NULL": 44 | continue 45 | for gene in geneList: 46 | snapId = snapIdPrefix + str(idNum) 47 | idNum += 1 48 | if gene not in geneSnap: 49 | print gene 50 | continue 51 | master.write(snapId + sep + "0 " + drugbankSnap[line[0]] + sep + geneSnap[gene] + '\n') 52 | sub.write(snapId + sep + line[0] + sep + gene + '\n') 53 | 54 | 55 | -------------------------------------------------------------------------------- /drugbank/nodes/README.txt: -------------------------------------------------------------------------------- 1 | Generate Nodes from Drugbank Database 2 | ------------------------------------- 3 | 4 | 1. Takes as input a single .xml file from the drugbank database to 5 | generate node mapping. 6 | 2. Steps: 7 | I. Download .xml in the current directory. 8 | II. Run make-nodes.sh 9 | 3. 
If BeautifulSoup is not installed or you cannot install: 10 | I. Follow steps mentioned on http://docs.python-guide.org/en/latest/dev/virtualenvs/ 11 | II. Next launch the env as : source /env/bin/activate 12 | III. Run make-nodes.sh 13 | -------------------------------------------------------------------------------- /drugbank/nodes/make-nodes.sh: -------------------------------------------------------------------------------- 1 | python parseDrugbank.py 2 | python makeNodeTables.py 3 | -------------------------------------------------------------------------------- /drugbank/nodes/makeNodeTables.py: -------------------------------------------------------------------------------- 1 | ####################################################### 2 | # Takes as input a .txt file with the following col headers 3 | # DrugbankID PubChem_Compound PubChem_Substance 4 | # Outputs the following files: 5 | # 1. Master Node table : SnapChemID SubTableID 6 | # 2. Sub tables : 7 | # 1. SnapChemID DrugbankID 8 | # 2. SnapChemID PubChem_Compound 9 | # 3. SnapChemID PubChem_Substance 10 | # 3. 
Equvialence Table : SnapChemID SnapChemID 11 | ####################################################### 12 | import itertools 13 | sep = " " 14 | empty = "NULL" 15 | snapIdPrefix = "SC" 16 | # Names/handles for the output tables 17 | idNum = 0 18 | masterTable = "snapChemical.txt" 19 | subTable = ["subSnapDrugbank.txt", "subSnapPubCompound.txt", "subSnapPubSubstance.txt"] 20 | subHandle = [open(subTable[i], 'w') for i in xrange(len(subTable))] 21 | eqTable = "snapEqChem.txt" 22 | with open('did_pubC_pubS.txt', 'r') as input, open(masterTable, 'w') as master,open(eqTable, 'w') as eqTable: 23 | for line in input: 24 | line = line.strip().split(" ") 25 | currId = [] 26 | for num,id in enumerate(line): 27 | if id == "NULL": 28 | continue 29 | snapId = snapIdPrefix + str(idNum) 30 | idNum += 1 31 | master.write(snapId + sep + str(num) + '\n') 32 | subHandle[num].write(snapId + sep + id + '\n') 33 | currId.append(snapId) 34 | allPerms = list(itertools.permutations(currId,2)) 35 | for perm in allPerms: 36 | toWrite = ' '.join(perm) 37 | eqTable.write(toWrite + '\n') 38 | 39 | [handle.close() for handle in subHandle] 40 | 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /drugbank/nodes/parseDrugbank.py: -------------------------------------------------------------------------------- 1 | ########################################### 2 | # XML parser to parse the drugbank database 3 | # Will output a space separated .txt file 4 | # with the following coloumn headers: 5 | # DrugbankID PubChem_Compound PubChem_Substance 6 | # Requirements : Assumes that durgbank.xml is in the 7 | # same folder. 
8 | ########################################### 9 | 10 | from bs4 import BeautifulSoup 11 | soup = BeautifulSoup(open("./drugbank.xml"),"xml") 12 | sep = " " 13 | empty = "NULL" 14 | with open('did_pubC_pubS.txt', 'w') as f: 15 | for drug in soup.findAll("drug"): 16 | flag = False 17 | toPrint = "" 18 | toPrint += drug.find("drugbank-id").text + sep 19 | #toPrint += drug.find("name").text + sep 20 | identifiers = [i for i in drug.findAll("external-identifier")] 21 | for i in identifiers: 22 | database = i.find("resource").text 23 | if database != "PubChem Compound": 24 | continue 25 | value = i.find("identifier").text 26 | flag = True 27 | toPrint += value + sep 28 | if not flag: 29 | toPrint += empty + sep 30 | 31 | for i in identifiers: 32 | database = i.find("resource").text 33 | if database != "PubChem Substance": 34 | continue 35 | value = i.find("identifier").text 36 | flag = True 37 | toPrint += value + sep 38 | if not flag: 39 | toPrint += empty + sep 40 | f.write(toPrint.encode('utf-8') + '\n') 41 | 42 | 43 | -------------------------------------------------------------------------------- /examples/README.txt: -------------------------------------------------------------------------------- 1 | ------------------------------ 2 | Example : Load Miner Dataset 3 | ------------------------------ 4 | 5 | This folder contains two simple scripts to 6 | 1. Load the miner data set into a multi-modal network which is then saved to disk. 7 | 2. Read the saved graph from disk and print basic statistics about the miner dataset. 8 | 9 | Sample workflow: 10 | 1. Generate the network and save to disk 11 | python miner_load_tables.py config.txt --loglevel info 12 | 13 | 2. 
Read the saved network and print statistics 14 | python miner_get_stats.py ./miner.graph 15 | 16 | Usage of the scripts used: 17 | 18 | ---------------------------- 19 | file : miner_load_tables.py 20 | ---------------------------- 21 | 22 | Example to illustrate how to load the miner dataset into a multi-modal network. 23 | 24 | Usage: miner_load_tables.py 25 | 26 | Config File : 27 | The config file contains the paths to all the tsv files used to load modes and cross-nets. 28 | 29 | Positional Arguments: 30 | config_file : Path to the config file. The config file contains the paths to all the modes and cross-net tsv files. 31 | 32 | Optional Arguments: 33 | --output_dir : Directory to create output files. Defaults to the current working directory. 34 | --loglevel : Enable logging. Defaults to warning level. 35 | 36 | Example Usage: 37 | Input File : Config.txt 38 | 39 | Command Line : 40 | python miner_load_tables.py config.txt --loglevel info 41 | 42 | Output: 43 | miner.graph 44 | 45 | Note: 46 | Due to 32 bit limitation we can't fully load the Protein-Protein edges. 47 | Only edges with confidence level greater than 200 are used. When SNAP supports 48 | 64 bit make the following change in config file: 49 | Protein-Protein = /dfs/ilfs2/0/MINER-BIO/types/Protein-Protein/20160418/snap-tables/miner-protein-protein-20160607.tsv 50 | 51 | Error after using 64 bit: RuntimeError: Message: TVec::Resize: std::exception, Length:536870912, Capacity:1073741824, New capacity:-1, Type:4TVecI11THashKeyDatI6TInt64S1_iExE [Program failed to allocate more memory. Solution-1: Get a bigger machine and a 64-bit compiler.] 52 | 53 | 54 | --------------------------- 55 | file : miner_get_stats.py 56 | --------------------------- 57 | 58 | Script to print basic statistics of the miner dataset. 
59 | 60 | Usage: 61 | python miner_get_stats.py 62 | 63 | Positional Arguments: 64 | input_file : Path to the multi-modal network 65 | 66 | Example Usage: 67 | Input file : miner.graph 68 | 69 | Command line: 70 | python miner_get_stats.py ./miner.graph 71 | 72 | -------------------------------------------------------------------------------- /examples/config.txt: -------------------------------------------------------------------------------- 1 | [Modes] 2 | Chemical = /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Chemical/miner-chemical-20190828.tsv 3 | Protein = /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Protein/miner-proteins-20190816.tsv 4 | Function = /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Function/miner-function-20190807.tsv 5 | Gene = /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Gene/miner-gene-20190826.tsv 6 | Disease = /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Disease/miner-disease-20190813.tsv 7 | 8 | [Cross-Net] 9 | Chemical-Chemical = /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Chemical-Chemical/miner-chemical-chemical-20190828.tsv 10 | Chemical-Gene = /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Chemical-Gene/miner-chemical-gene-20190828.tsv 11 | Function-Function = /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Function-Function/miner-function-function-20190807.tsv 12 | Gene-Protein = /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Gene-Protein/miner-gene-protein-20190819.tsv 13 | Gene-Function = /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Gene-Function/miner-gene-function-20190815.tsv 14 | Protein-Protein = /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Protein-Protein/miner-protein-protein-20190819.tsv 15 | Disease-Disease = /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Disease-Disease/miner-disease-disease-20190813.tsv 16 | Disease-Gene = 
/dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Disease-Gene/miner-disease-gene-20190816.tsv 17 | Disease-Function = /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Disease-Function/miner-disease-function-20190814.tsv 18 | Disease-Chemical = /dfs/scratch2/MINER-BIO/work-data-miner-v2/farzaan/output/Disease-Chemical/miner-disease-chemical-20190828.tsv 19 | -------------------------------------------------------------------------------- /examples/miner_get_stats.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file : miner_get_stats.py 3 | authors : Farzaan Kaiyom, Agrim Gupta 4 | 5 | Script to print basic statistics of the miner dataset. 6 | 7 | Usage: 8 | python miner_get_stats.py 9 | 10 | Positional Arguments: 11 | input_file : Path to the multi-modal network 12 | 13 | Example Usage: 14 | Input file : miner.graph 15 | 16 | Command line: 17 | python miner_get_stats.py ./miner.graph 18 | ''' 19 | 20 | import sys 21 | sys.path.insert(0, './../../swig/') 22 | import snap 23 | import argparse 24 | 25 | parser = argparse.ArgumentParser(description='Print basic statistics of the miner dataset') 26 | parser.add_argument('input_file', help='path to the multi-modal network') 27 | args = parser.parse_args() 28 | 29 | #methods to test modes 30 | def modeStats(Graph,name): 31 | try: 32 | gp = Graph.GetModeNetByName(name) 33 | print(name,": ",gp.GetNodes()) 34 | except: 35 | print(name," skipped") 36 | 37 | def crossStats(Graph,name): 38 | try: 39 | gp = Graph.GetCrossNetByName(name) 40 | print(name,": ",gp.GetEdges()) # use getEdges for older versions of SNAP.py 41 | except: 42 | print(name," skipped") 43 | 44 | print("Printing Modes") 45 | FIn = snap.TFIn(args.input_file) 46 | Graph = snap.TMMNet.Load(FIn) 47 | 48 | modeStats(Graph,"Chemical") 49 | modeStats(Graph,"Protein") 50 | modeStats(Graph,"Gene") 51 | modeStats(Graph,"Function") 52 | modeStats(Graph,"Disease") 53 | 54 | print("Printing CrossNets") 55 | 
56 | crossStats(Graph,"Chemical-Chemical") 57 | crossStats(Graph,"Chemical-Gene") 58 | crossStats(Graph,"Function-Function") 59 | crossStats(Graph,"Gene-Function") 60 | crossStats(Graph,"Gene-Protein") 61 | crossStats(Graph,"Disease-Disease") 62 | crossStats(Graph,"Disease-Gene") 63 | crossStats(Graph,"Disease-Function") 64 | crossStats(Graph,"Disease-Chemical") 65 | crossStats(Graph,"Protein-Protein") 66 | -------------------------------------------------------------------------------- /examples/miner_load_tables.py: -------------------------------------------------------------------------------- 1 | ''' 2 | file : miner_load_tables.py 3 | author : Agrim Gupta 4 | edited by : Farzaan Kaiyom 5 | 6 | Example to illustrate how to load the miner dataset into a multi-modal network. 7 | 8 | Usage: miner_load_tables.py 9 | 10 | Positional Arguments: 11 | config_file : Path to the config file. The config file contatins the path to all the modes and cross-net tsv files. 12 | 13 | Optional Arguments: 14 | --output_dir : Directory to create output files. Defaults to the current working directory. 15 | --loglevel : Enable logging. Defaults to warning level. 
16 | 17 | Example Usage: 18 | Input File : Config.txt 19 | 20 | Command Line : 21 | python miner_load_tables.py config.txt --loglevel info 22 | 23 | Output: 24 | miner.graph 25 | ''' 26 | 27 | import sys 28 | sys.path.insert(0, './../../swig/') 29 | import snap 30 | import ConfigParser 31 | import argparse 32 | import logging 33 | import os 34 | 35 | parser = argparse.ArgumentParser(description='Generate a Multi-Modal Network') 36 | parser.add_argument('config_file', help='path of a config file.') 37 | parser.add_argument('--output_dir', help='output path to save the Multi-Modal Network', default='.') 38 | parser.add_argument('--loglevel', help='info for debug print.') 39 | parser.add_argument('--outputf', help='output file name.', default='miner.graph') 40 | args = parser.parse_args() 41 | config = ConfigParser.ConfigParser() 42 | config.readfp(open(args.config_file)) 43 | if args.loglevel: 44 | numeric_level = getattr(logging, args.loglevel.upper(), None) 45 | logging.basicConfig(level=numeric_level) 46 | 47 | context = snap.TTableContext() 48 | # Construct the graph 49 | logging.info('Building Multi-Modal Network') 50 | Graph = snap.TMMNet.New() 51 | 52 | # Loading Modes 53 | try: 54 | chemical_mode_file = config.get('Modes', 'Chemical') 55 | cmschema = snap.Schema() 56 | cmschema.Add(snap.TStrTAttrPr("ChemicalId", snap.atStr)) 57 | cmschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 58 | chemical_mode = snap.TTable.LoadSS(cmschema, chemical_mode_file, context, "\t", snap.TBool(False)) 59 | logging.info('Done loading Chemical Mode') 60 | snap.LoadModeNetToNet(Graph, "Chemical", chemical_mode, "ChemicalId", snap.TStr64V()) 61 | except ConfigParser.NoOptionError: 62 | logging.info('Skipping Chemical Mode') 63 | 64 | try: 65 | function_mode_file = config.get('Modes', 'Function') 66 | fmschema = snap.Schema() 67 | fmschema.Add(snap.TStrTAttrPr("FunctionId", snap.atStr)) 68 | fmschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 69 | function_mode = 
snap.TTable.LoadSS(fmschema, function_mode_file, context, "\t", snap.TBool(False)) 70 | logging.info('Done loading Function Mode') 71 | snap.LoadModeNetToNet(Graph, "Function", function_mode, "FunctionId", snap.TStr64V()) 72 | except ConfigParser.NoOptionError: 73 | logging.info('Skipping Function Mode') 74 | 75 | try: 76 | gene_mode_file = config.get('Modes', 'Gene') 77 | gmschema = snap.Schema() 78 | gmschema.Add(snap.TStrTAttrPr("GeneId", snap.atStr)) 79 | gmschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 80 | gene_mode = snap.TTable.LoadSS(gmschema, gene_mode_file, context, "\t", snap.TBool(False)) 81 | logging.info('Done loading Gene Mode') 82 | snap.LoadModeNetToNet(Graph, "Gene", gene_mode, "GeneId", snap.T64StrV()) 83 | except ConfigParser.NoOptionError: 84 | logging.info('Skipping Gene Mode') 85 | 86 | try: 87 | protein_mode_file = config.get('Modes', 'Protein') 88 | pmschema = snap.Schema() 89 | pmschema.Add(snap.TStrTAttrPr("ProteinId", snap.atStr)) 90 | pmschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 91 | protein_mode = snap.TTable.LoadSS(pmschema, protein_mode_file, context, "\t", snap.TBool(False)) 92 | logging.info('Done loading Protein Mode') 93 | snap.LoadModeNetToNet(Graph, "Protein", protein_mode, "ProteinId", snap.TStr64V()) 94 | except ConfigParser.NoOptionError: 95 | logging.info('Skipping Protein Mode') 96 | 97 | try: 98 | disease_mode_file = config.get('Modes', 'Disease') 99 | dmschema = snap.Schema() 100 | dmschema.Add(snap.TStrTAttrPr("DiseaseId", snap.atStr)) 101 | dmschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 102 | disease_mode = snap.TTable.LoadSS(dmschema, disease_mode_file, context, "\t", snap.TBool(False)) 103 | logging.info('Done loading Disease Mode') 104 | snap.LoadModeNetToNet(Graph, "Disease", disease_mode, "DiseaseId", snap.TStr64V()) 105 | except ConfigParser.NoOptionError: 106 | logging.info('Skipping Disease Mode') 107 | 108 | # Loading Cross-Nets 109 | try: 110 | chemical_chemical_crossnet_file = 
config.get('Cross-Net', 'Chemical-Chemical') 111 | cccschema = snap.Schema() 112 | cccschema.Add(snap.TStrTAttrPr("CCEdgeId", snap.atStr)) 113 | cccschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 114 | cccschema.Add(snap.TStrTAttrPr("CSrcId", snap.atStr)) 115 | cccschema.Add(snap.TStrTAttrPr("CDstId", snap.atStr)) 116 | cccschema.Add(snap.TStrTAttrPr("desc", snap.atStr)) 117 | chemical_chemical_crossnet = snap.TTable.LoadSS(cccschema, chemical_chemical_crossnet_file, context, "\t", snap.TBool(False)) 118 | logging.info('Done loading Chemical-Chemical Cross-Net') 119 | snap.LoadCrossNetToNet(Graph, "Chemical", "Chemical", "Chemical-Chemical", chemical_chemical_crossnet, "CSrcId", "CDstId", snap.TStr64V()) 120 | except ConfigParser.NoOptionError: 121 | logging.info('Skipping Chemical-Chemical Cross-Net') 122 | 123 | try: 124 | chemical_gene_crossnet_file = config.get('Cross-Net', 'Chemical-Gene') 125 | cgcschema = snap.Schema() 126 | cgcschema.Add(snap.TStrTAttrPr("CGEdgeId", snap.atStr)) 127 | cgcschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 128 | cgcschema.Add(snap.TStrTAttrPr("CSrcId", snap.atStr)) 129 | cgcschema.Add(snap.TStrTAttrPr("GDstId", snap.atStr)) 130 | chemical_gene_crossnet = snap.TTable.LoadSS(cgcschema, chemical_gene_crossnet_file, context, "\t", snap.TBool(False)) 131 | logging.info('Done loading Chemical-Gene Cross-Net') 132 | snap.LoadCrossNetToNet(Graph, "Chemical", "Gene", "Chemical-Gene", chemical_gene_crossnet, "CSrcId", "GDstId", snap.TStr64V()) 133 | except ConfigParser.NoOptionError: 134 | logging.info('Skipping Chemical-Gene Cross-Net') 135 | 136 | try: 137 | function_function_crossnet_file = config.get('Cross-Net', 'Function-Function') 138 | ffcschema = snap.Schema() 139 | ffcschema.Add(snap.TStrTAttrPr("FFEdgeId", snap.atStr)) 140 | ffcschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 141 | ffcschema.Add(snap.TStrTAttrPr("FSrcId", snap.atStr)) 142 | ffcschema.Add(snap.TStrTAttrPr("FDstId", snap.atStr)) 143 | 
function_function_crossnet = snap.TTable.LoadSS(ffcschema, function_function_crossnet_file, context, "\t", snap.TBool(False)) 144 | logging.info('Done loading Function-Function Cross-Net') 145 | snap.LoadCrossNetToNet(Graph, "Function", "Function", "Function-Function", function_function_crossnet, "FSrcId", "FDstId", snap.TStr64V()) 146 | except ConfigParser.NoOptionError: 147 | logging.info('Skipping Function-Function Cross-Net') 148 | 149 | try: 150 | gene_function_crossnet_file = config.get('Cross-Net', 'Gene-Function') 151 | gfcschema = snap.Schema() 152 | gfcschema.Add(snap.TStrTAttrPr("GFEdgeId", snap.atStr)) 153 | gfcschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 154 | gfcschema.Add(snap.TStrTAttrPr("GSrcId", snap.atStr)) 155 | gfcschema.Add(snap.TStrTAttrPr("FDstId", snap.atStr)) 156 | gene_function_crossnet = snap.TTable.LoadSS(gfcschema, gene_function_crossnet_file, context, "\t", snap.TBool(False)) 157 | logging.info('Done loading Gene-Function Cross-Net') 158 | snap.LoadCrossNetToNet(Graph, "Gene", "Function", "Gene-Function", gene_function_crossnet, "GSrcId", "FDstId", snap.TStr64V()) 159 | except ConfigParser.NoOptionError: 160 | logging.info('Skipping Gene-Function Cross-Net') 161 | 162 | try: 163 | gene_protein_crossnet_file = config.get('Cross-Net', 'Gene-Protein') 164 | gpcschema = snap.Schema() 165 | gpcschema.Add(snap.TStrTAttrPr("GPEdgeId", snap.atStr)) 166 | gpcschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 167 | gpcschema.Add(snap.TStrTAttrPr("GSrcId", snap.atStr)) 168 | gpcschema.Add(snap.TStrTAttrPr("PDstId", snap.atStr)) 169 | gene_protein_crossnet = snap.TTable.LoadSS(gpcschema, gene_protein_crossnet_file, context, "\t", snap.TBool(False)) 170 | logging.info('Done loading Gene-Protein Cross-Net') 171 | snap.LoadCrossNetToNet(Graph, "Gene", "Protein", "Gene-Protein", gene_protein_crossnet, "GSrcId", "PDstId", snap.TStr64V()) 172 | except ConfigParser.NoOptionError: 173 | logging.info('Skipping Gene-Protein Cross-Net') 174 | 
try: 175 | protein_protein_crossnet_file = config.get('Cross-Net', 'Protein-Protein') 176 | ppcschema = snap.Schema() 177 | ppcschema.Add(snap.TStrTAttrPr("PPEdgeId", snap.atStr)) 178 | ppcschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 179 | ppcschema.Add(snap.TStrTAttrPr("PSrcId", snap.atStr)) 180 | ppcschema.Add(snap.TStrTAttrPr("PDstId", snap.atStr)) 181 | protein_protein_crossnet = snap.TTable.LoadSS(ppcschema, protein_protein_crossnet_file, context, "\t", snap.TBool(False)) 182 | logging.info('Done loading Protein-Protein Cross-Net') 183 | snap.LoadCrossNetToNet(Graph, "Protein", "Protein", "Protein-Protein", protein_protein_crossnet, "PSrcId", "PDstId", snap.TStr64V()) 184 | except ConfigParser.NoOptionError: 185 | logging.info('Skipping Protein-Protein Cross-Net') 186 | try: 187 | disease_disease_crossnet_file = config.get('Cross-Net', 'Disease-Disease') 188 | ddcschema = snap.Schema() 189 | ddcschema.Add(snap.TStrTAttrPr("DDEdgeId", snap.atStr)) 190 | ddcschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 191 | ddcschema.Add(snap.TStrTAttrPr("DSrcId", snap.atStr)) 192 | ddcschema.Add(snap.TStrTAttrPr("DDstId", snap.atStr)) 193 | disease_disease_crossnet = snap.TTable.LoadSS(ddcschema, disease_disease_crossnet_file, context, "\t", snap.TBool(False)) 194 | logging.info('Done loading Disease-Disease Cross-Net') 195 | snap.LoadCrossNetToNet(Graph, "Disease", "Disease", "Disease-Disease", disease_disease_crossnet, "DSrcId", "DDstId", snap.TStr64V()) 196 | except ConfigParser.NoOptionError: 197 | logging.info('Skipping Disease-Disease Cross-Net') 198 | 199 | try: 200 | disease_gene_crossnet_file = config.get('Cross-Net', 'Disease-Gene') 201 | dgcschema = snap.Schema() 202 | dgcschema.Add(snap.TStrTAttrPr("DGEdgeId", snap.atStr)) 203 | dgcschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 204 | dgcschema.Add(snap.TStrTAttrPr("DSrcId", snap.atStr)) 205 | dgcschema.Add(snap.TStrTAttrPr("GDstId", snap.atStr)) 206 | disease_gene_crossnet = 
snap.TTable.LoadSS(dgcschema, disease_gene_crossnet_file, context, "\t", snap.TBool(False)) 207 | logging.info('Done loading Disease-Gene Cross-Net') 208 | snap.LoadCrossNetToNet(Graph, "Disease", "Gene", "Disease-Gene", disease_gene_crossnet, "DSrcId", "GDstId", snap.TStr64V()) 209 | except ConfigParser.NoOptionError: 210 | logging.info('Skipping Disease-Gene Cross-Net') 211 | 212 | try: 213 | disease_function_crossnet_file = config.get('Cross-Net', 'Disease-Function') 214 | dfcschema = snap.Schema() 215 | dfcschema.Add(snap.TStrTAttrPr("DFEdgeId", snap.atStr)) 216 | dfcschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 217 | dfcschema.Add(snap.TStrTAttrPr("DSrcId", snap.atStr)) 218 | dfcschema.Add(snap.TStrTAttrPr("FDstId", snap.atStr)) 219 | disease_function_crossnet = snap.TTable.LoadSS(dfcschema, disease_function_crossnet_file, context, "\t", snap.TBool(False)) 220 | logging.info('Done loading Disease-Function Cross-Net') 221 | snap.LoadCrossNetToNet(Graph, "Disease", "Function", "Disease-Function", disease_function_crossnet, "DSrcId", "FDstId", snap.TStr64V()) 222 | except ConfigParser.NoOptionError: 223 | logging.info('Skipping Disease-Function Cross-Net') 224 | 225 | try: 226 | disease_chemical_crossnet_file = config.get('Cross-Net', 'Disease-Chemical') 227 | dccschema = snap.Schema() 228 | dccschema.Add(snap.TStrTAttrPr("DCEdgeId", snap.atStr)) 229 | dccschema.Add(snap.TStrTAttrPr("datasetId", snap.atStr)) 230 | dccschema.Add(snap.TStrTAttrPr("DSrcId", snap.atStr)) 231 | dccschema.Add(snap.TStrTAttrPr("CDstId", snap.atStr)) 232 | disease_chemical_crossnet = snap.TTable.LoadSS(dccschema, disease_chemical_crossnet_file, context, "\t", snap.TBool(False)) 233 | logging.info('Done loading Disease-Chemical Cross-Net') 234 | snap.LoadCrossNetToNet(Graph, "Disease", "Chemical", "Disease-Chemical", disease_chemical_crossnet, "DSrcId", "CDstId", snap.TStr64V()) 235 | except ConfigParser.NoOptionError: 236 | logging.info('Skipping Disease-Chemical Cross-Net') 237 
| 238 | # Save the graph 239 | logging.info('Saving Multi-Modal Network to disk') 240 | outputPath = os.path.join(args.output_dir, args.outputf) 241 | FOut = snap.TFOut(outputPath) 242 | Graph.Save(FOut) 243 | FOut.Flush() 244 | --------------------------------------------------------------------------------