├── README.md ├── load_drugbank_csv.sql └── parse_drugbank_xml.py /README.md: -------------------------------------------------------------------------------- 1 | # drugbank_xml2csv_python 2 | Python script for converting DrugBank XML to relational CSV files. If loading CSVs to a SQL database, a sample query is: 3 | `select distinct drugbank_id, drugname, inhibitor, antagonist, agonist` 4 | `from drugbank05_drug2target` 5 | `join drugbank05_partner_protein using (partner_id)` 6 | `join drugbank05_drugs using (drugbank_id)` 7 | `where gene_name = 'KCNH2';` 8 | 9 | Tested on DrugBank 5.0.5 (3/4/17) 10 | 11 | Dependencies: `lxml` 12 | 13 | Outputs: 14 | - `drugbank05_drugs.csv` 15 | - `drugbank05_drug2target.csv` 16 | - `drugbank05_drug2target_human.csv` 17 | - `drugbank05_drug2enzyme.csv` 18 | - `drugbank05_drug2enzyme_human.csv` 19 | - `drugbank05_drug2transporter.csv` 20 | - `drugbank05_drug2transporter_human.csv` 21 | - `drugbank05_partner_protein.csv` 22 | - `drugbank05_partner_protein_human.csv` 23 | -------------------------------------------------------------------------------- /load_drugbank_csv.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE `drugbank05_drugs` ( 2 | `drugbank_id` varchar(255) NOT NULL DEFAULT '', 3 | `drugname` varchar(255) NOT NULL DEFAULT '', 4 | `drug_type` varchar(255) NOT NULL DEFAULT '', 5 | `approved` int(1) NOT NULL, 6 | `experimental` int(1) NOT NULL, 7 | `illicit` int(1) NOT NULL, 8 | `investigational` int(1) NOT NULL, 9 | `nutraceutical` int(1) NOT NULL, 10 | `withdrawn` int(1) NOT NULL, 11 | PRIMARY KEY (`drugbank_id`), 12 | KEY `drug_type` (`drug_type`) 13 | ) ENGINE=MyISAM DEFAULT CHARSET=latin1; 14 | 15 | load data local infile 'drugbank05_drugs.csv' 16 | into table drugbank05_drugs 17 | fields terminated by ',' 18 | enclosed by '"' 19 | lines terminated by '\n' 20 | ignore 1 lines; 21 | 22 | 23 | 24 | CREATE TABLE `drugbank05_drug2target` ( 25 | `drugbank_id` varchar(255) 
NOT NULL DEFAULT '', 26 | `partner_id` varchar(255) NOT NULL DEFAULT '', 27 | `inhibitor` int(1) NOT NULL, 28 | `antagonist` int(1) NOT NULL, 29 | `agonist` int(1) NOT NULL 30 | ) ENGINE=MyISAM DEFAULT CHARSET=latin1; 31 | 32 | load data local infile 'drugbank05_drug2target.csv' 33 | into table drugbank05_drug2target 34 | fields terminated by ',' 35 | enclosed by '"' 36 | lines terminated by '\n' 37 | ignore 1 lines; 38 | 39 | 40 | 41 | CREATE TABLE `drugbank05_drug2enzyme` ( 42 | `drugbank_id` varchar(255) NOT NULL DEFAULT '', 43 | `partner_id` varchar(255) NOT NULL DEFAULT '', 44 | `substrate` int(1) NOT NULL, 45 | `inducer` int(1) NOT NULL, 46 | `inhibitor` int(1) NOT NULL 47 | ) ENGINE=MyISAM DEFAULT CHARSET=latin1; 48 | 49 | load data local infile 'drugbank05_drug2enzyme.csv' 50 | into table drugbank05_drug2enzyme 51 | fields terminated by ',' 52 | enclosed by '"' 53 | lines terminated by '\n' 54 | ignore 1 lines; 55 | 56 | 57 | 58 | CREATE TABLE `drugbank05_drug2transporter` ( 59 | `drugbank_id` varchar(255) NOT NULL DEFAULT '', 60 | `partner_id` varchar(255) NOT NULL DEFAULT '', 61 | `substrate` int(1) NOT NULL, 62 | `inducer` int(1) NOT NULL, 63 | `inhibitor` int(1) NOT NULL 64 | ) ENGINE=MyISAM DEFAULT CHARSET=latin1; 65 | 66 | load data local infile 'drugbank05_drug2transporter.csv' 67 | into table drugbank05_drug2transporter 68 | fields terminated by ',' 69 | enclosed by '"' 70 | lines terminated by '\n' 71 | ignore 1 lines; 72 | 73 | 74 | 75 | CREATE TABLE `drugbank05_partner_protein` ( 76 | `partner_id` varchar(255) NOT NULL DEFAULT '', 77 | `partner_name` varchar(255) NOT NULL DEFAULT '', 78 | `gene_name` varchar(255) NOT NULL DEFAULT '', 79 | `uniprot_id` varchar(255) NOT NULL DEFAULT '', 80 | `genbank_gene_id` varchar(255) NOT NULL DEFAULT '', 81 | `genbank_protein_id` varchar(255) NOT NULL DEFAULT '', 82 | `hgnc_id` varchar(255) NOT NULL DEFAULT '', 83 | `organism` varchar(255) NOT NULL DEFAULT '', 84 | `taxonomy_id` int(11) NOT NULL 85 | ) 
"""Convert a DrugBank XML dump into relational CSV files.

Reads ``drugbank.xml`` (elements in the ``http://www.drugbank.ca`` namespace)
and writes these files into the output directory:

* ``drugbank05_drugs.csv``
* ``drugbank05_partner_protein.csv`` and ``..._human.csv``
* ``drugbank05_drug2target.csv``, ``drugbank05_drug2enzyme.csv``,
  ``drugbank05_drug2transporter.csv`` and their ``..._human.csv`` variants

Only drugs with at least one usable target, enzyme or transporter are
written.  A partner is "usable" when it has an id, a polypeptide, a gene
name, an organism and a taxonomy id.

Usage: ``python parse_drugbank_xml.py [drugbank.xml [output_dir]]``

The module runs on Python 2.7 and Python 3.  ``lxml`` is imported lazily in
``main`` so the parsing/writing functions can be imported (and tested)
without it; they only rely on the ElementTree-style ``find`` / ``findall`` /
``get`` / ``text`` API.
"""
import csv
import os
import sys

TAG_PREFIX = '{http://www.drugbank.ca}'

PY2 = sys.version_info[0] < 3
_TEXT_TYPE = type(u'')  # ``unicode`` on Python 2, ``str`` on Python 3

# (key in the drug record / XML container tag, XML item tag, action columns)
PARTNER_KINDS = (
    ('targets', 'target', ('inhibitor', 'antagonist', 'agonist')),
    ('enzymes', 'enzyme', ('substrate', 'inducer', 'inhibitor')),
    ('transporters', 'transporter', ('substrate', 'inducer', 'inhibitor')),
)

GROUP_COLUMNS = ['approved', 'experimental', 'illicit', 'investigational',
                 'nutraceutical', 'withdrawn']

PARTNER_COLUMNS = ['partner_id', 'partner_name', 'gene_name', 'uniprot_id',
                   'genbank_gene_id', 'genbank_protein_id', 'hgnc_id',
                   'organism', 'taxonomy_id']

# <resource> text of an external identifier -> key in the partner attributes
EXTERNAL_ID_KEYS = {
    'UniProtKB': 'uniprot_id',
    'GenBank Gene Database': 'genbank_gene_id',
    'GenBank Protein Database': 'genbank_protein_id',
    'HUGO Gene Nomenclature Committee (HGNC)': 'hgnc_id',
}


def _path(*names):
    """Return a namespace-qualified ElementPath for the given tag names."""
    return '/'.join(TAG_PREFIX + name for name in names)


def _text(element, *names):
    """Return the text at ``names`` below ``element``; None if absent/empty."""
    found = element.find(_path(*names))
    return None if found is None else found.text


def _display(value):
    """Render one value for console output on both Python 2 and 3."""
    if PY2 and isinstance(value, _TEXT_TYPE):
        return value.encode('utf-8')
    return str(value)


def _say(*parts):
    """Write the space-separated parts and a newline to stdout."""
    sys.stdout.write(' '.join(_display(part) for part in parts) + '\n')


def parse_partner(element):
    """Parse one ``<target>``, ``<enzyme>`` or ``<transporter>`` element.

    Returns ``(partner_id, attributes, actions)`` or ``None`` when the entry
    must be skipped (no id, no polypeptide, or no gene name / organism /
    taxonomy id).  ``attributes`` has the keys ``gene``, ``name``,
    ``organism``, ``taxonomy_id``, ``uniprot_id``, ``genbank_gene_id``,
    ``genbank_protein_id`` and ``hgnc_id``; ``actions`` is the list of
    lower-cased action names.
    """
    polypeptide = element.find(_path('polypeptide'))
    if polypeptide is None:
        return None

    partner_id = _text(element, 'id')
    gene = _text(polypeptide, 'gene-name')
    organism = _text(element, 'organism')
    poly_organism = polypeptide.find(_path('organism'))
    # .get() yields None for a missing attribute, so the None check below is
    # reachable instead of dying on a KeyError.
    taxonomy_id = (None if poly_organism is None
                   else poly_organism.get('ncbi-taxonomy-id'))

    # Normalise the name first so "Homo sapiens" rows also get their empty
    # taxonomy id back-filled and end up in the *_human files.
    if organism == 'Homo sapiens':
        organism = 'Human'
    if organism is None and taxonomy_id == '9606':
        organism = 'Human'
    if organism == 'Human' and taxonomy_id == '':
        taxonomy_id = '9606'
    if (partner_id is None or gene is None or organism is None
            or taxonomy_id is None):
        return None

    attributes = {
        'gene': gene,
        'name': _text(element, 'name'),
        'organism': organism,
        'taxonomy_id': taxonomy_id,
        'uniprot_id': '',
        'genbank_gene_id': '',
        'genbank_protein_id': '',
        'hgnc_id': '',
    }
    external_ids = polypeptide.findall(
        _path('external-identifiers', 'external-identifier'))
    for external_id in external_ids:
        key = EXTERNAL_ID_KEYS.get(_text(external_id, 'resource'))
        if key is not None:
            attributes[key] = _text(external_id, 'identifier') or ''

    actions = [action.text.lower()
               for action in element.findall(_path('actions', 'action'))
               if action.text]
    return partner_id, attributes, actions


def parse_drugs(root, verbose=True):
    """Collect drug and partner information from the ``<drugbank>`` root.

    Returns ``(drug2attrib, partner2attrib)``:

    * ``drug2attrib``: drugbank_id -> ``{'drugname', 'drug_type', 'groups',
      'targets', 'enzymes', 'transporters'}`` where the last three are lists
      of ``(partner_id, actions)`` tuples.
    * ``partner2attrib``: ``{'targets': {...}, 'enzymes': {...},
      'transporters': {...}}``, each mapping partner_id -> attributes as
      returned by ``parse_partner`` (first occurrence wins).

    With ``verbose`` a one-line summary is printed for every drug.
    """
    drug2attrib = {}
    partner2attrib = dict((kind, {}) for kind, _, _ in PARTNER_KINDS)
    drug_tag = TAG_PREFIX + 'drug'

    for child in root:
        if child.tag != drug_tag:  # e.g. XML comments
            continue

        # Reset per drug: a stale id from the previous iteration would
        # silently overwrite that drug's record.
        drugbank_id = None
        for id_element in child.findall(_path('drugbank-id')):
            if 'primary' in id_element.attrib:
                drugbank_id = id_element.text
        drugname = _text(child, 'name')
        if drugbank_id is None:
            sys.stderr.write('skipping drug without primary drugbank-id: '
                             + _display(drugname) + '\n')
            continue

        record = {
            'drugname': drugname,
            'drug_type': child.get('type', ''),
            'groups': [group.text
                       for group in child.findall(_path('groups', 'group'))],
        }
        for kind, item_tag, _ in PARTNER_KINDS:
            links = record[kind] = []
            for element in child.findall(_path(kind, item_tag)):
                parsed = parse_partner(element)
                if parsed is None:
                    continue
                partner_id, attributes, actions = parsed
                links.append((partner_id, actions))
                partner2attrib[kind].setdefault(partner_id, attributes)
        drug2attrib[drugbank_id] = record

        if verbose:
            _say(drugbank_id, drugname, record['drug_type'], record['groups'],
                 'targets:', len(record['targets']),
                 'enzymes:', len(record['enzymes']),
                 'transporters:', len(record['transporters']))

    if verbose:
        _say('\n')
    return drug2attrib, partner2attrib


def _open_csv(path):
    """Open ``path`` for csv.writer output on the running Python version."""
    if PY2:
        return open(path, 'wb')
    return open(path, 'w', newline='', encoding='utf-8')


def _csv_writer(handle):
    """Return a csv writer emitting ``\\n`` line ends.

    The bundled load_drugbank_csv.sql loads the files with
    ``lines terminated by '\\n'``; the csv default of ``\\r\\n`` would leave a
    stray carriage return in the last column.
    """
    return csv.writer(handle, lineterminator='\n')


def _row(values):
    """Return ``values`` ready for csv.writer (UTF-8 bytes on Python 2)."""
    if not PY2:
        return list(values)
    return [value.encode('utf-8') if isinstance(value, _TEXT_TYPE) else value
            for value in values]


def _ascii_greek(name):
    """Spell out Greek beta/alpha in a drug name; None passes through."""
    if name is None:
        return name
    return name.replace(u'\u03b2', u'beta').replace(u'\u03b1', u'alpha')


def _is_human(attributes):
    """Return True for partners recorded as Human with taxonomy id 9606."""
    return (attributes['organism'] == 'Human'
            and attributes['taxonomy_id'] == '9606')


def write_outputs(drug2attrib, partner2attrib, out_dir='.', verbose=True):
    """Write all CSV files for the parsed data into ``out_dir``.

    Returns the sorted list of drugbank ids that were written, i.e. the drugs
    with at least one target, enzyme or transporter.
    """
    drugs = [drugbank_id for drugbank_id in sorted(drug2attrib)
             if any(drug2attrib[drugbank_id][kind]
                    for kind, _, _ in PARTNER_KINDS)]
    if verbose:
        _say(len(drug2attrib), "drugs parsed from XML")
        _say(len(drugs), "drugs with at least 1 target/ enzyme/ transporter")

    def target(name):
        return os.path.join(out_dir, name)

    # Drug attributes
    with _open_csv(target('drugbank05_drugs.csv')) as outf:
        writer = _csv_writer(outf)
        writer.writerow(['drugbank_id', 'drugname', 'drug_type']
                        + GROUP_COLUMNS)
        for drugbank_id in drugs:
            record = drug2attrib[drugbank_id]
            flags = [1 if group in record['groups'] else 0
                     for group in GROUP_COLUMNS]
            writer.writerow(_row([drugbank_id,
                                  _ascii_greek(record['drugname']),
                                  record['drug_type']] + flags))

    # Partner proteins: targets first, then enzymes, then transporters; a
    # partner id already written for an earlier kind is not repeated.
    partners_written = set()
    with _open_csv(target('drugbank05_partner_protein.csv')) as outf, \
            _open_csv(target('drugbank05_partner_protein_human.csv')) as outfh:
        writer = _csv_writer(outf)
        writerh = _csv_writer(outfh)
        writer.writerow(PARTNER_COLUMNS)
        writerh.writerow(PARTNER_COLUMNS)
        for kind, _, _ in PARTNER_KINDS:
            table = partner2attrib[kind]
            for partner_id in sorted(table):
                if partner_id in partners_written:
                    continue
                partners_written.add(partner_id)
                attributes = table[partner_id]
                row = _row([partner_id, attributes['name'],
                            attributes['gene'], attributes['uniprot_id'],
                            attributes['genbank_gene_id'],
                            attributes['genbank_protein_id'],
                            attributes['hgnc_id'], attributes['organism'],
                            attributes['taxonomy_id']])
                writer.writerow(row)
                if _is_human(attributes):
                    writerh.writerow(row)
                if (verbose and attributes['taxonomy_id'] == '9606'
                        and attributes['organism'].lower() != 'human'):
                    _say(partner_id, attributes['gene'],
                         attributes['organism'], attributes['taxonomy_id'],
                         'organism mismatch')

    # Drug-partner pairs, one file (plus a human-only file) per partner kind
    for kind, item_tag, action_columns in PARTNER_KINDS:
        stem = 'drugbank05_drug2' + item_tag
        header = ['drugbank_id', 'partner_id'] + list(action_columns)
        with _open_csv(target(stem + '.csv')) as outf, \
                _open_csv(target(stem + '_human.csv')) as outfh:
            writer = _csv_writer(outf)
            writerh = _csv_writer(outfh)
            writer.writerow(header)
            writerh.writerow(header)
            for drugbank_id in drugs:
                for partner_id, actions in drug2attrib[drugbank_id][kind]:
                    flags = [1 if action in actions else 0
                             for action in action_columns]
                    row = _row([drugbank_id, partner_id] + flags)
                    writer.writerow(row)
                    if _is_human(partner2attrib[kind][partner_id]):
                        writerh.writerow(row)

    return drugs


def main(xml_path='drugbank.xml', out_dir='.'):
    """Parse ``xml_path`` with lxml and write the CSV files to ``out_dir``."""
    # Imported here so the rest of the module is usable without lxml.
    from lxml import etree

    # Parse straight from the file; no need to hold the raw text as well.
    root = etree.parse(xml_path).getroot()
    drug2attrib, partner2attrib = parse_drugs(root)
    write_outputs(drug2attrib, partner2attrib, out_dir)


if __name__ == '__main__':
    main(*sys.argv[1:3])