├── neo4j ├── .gitignore └── input │ └── extension_script.sh ├── requirements.txt ├── README.md ├── .gitignore ├── sbcdb ├── __init__.py ├── test │ ├── __init__.py │ ├── test_mnxref_utils.py │ └── test_enzyme_utils.py ├── index.py ├── chebi_utils.py ├── rhea_utils.py ├── namespace_utils.py ├── build.py ├── init.cql ├── utils.py ├── enzyme_utils.py ├── ncbi_taxonomy_utils.py ├── kegg_utils.py ├── reaction_utils.py ├── spectra_utils.py ├── chemical_utils.py └── mnxref_utils.py ├── start_db.sh ├── Dockerfile └── LICENSE /neo4j/.gitignore: -------------------------------------------------------------------------------- 1 | /csv/ 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ijson 2 | glpk 3 | libchebipy 4 | subliminal-py -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # biochem4j 2 | biochem4j: integrated and extensible biochemical knowledge through graph databases -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.project 2 | /.pydevproject 3 | /neo4j/input/nodes 4 | /neo4j/input/rels 5 | **/*.log 6 | **/*.pyc 7 | **/.DS_Store -------------------------------------------------------------------------------- /sbcdb/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | SYNBIOCHEM-DB (c) University of Manchester 2015 3 | 4 | SYNBIOCHEM-DB is licensed under the MIT License. 5 | 6 | To view a copy of this license, visit . 
7 | 8 | @author: neilswainston 9 | ''' 10 | -------------------------------------------------------------------------------- /sbcdb/test/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | synbiochem (c) University of Manchester 2015 3 | 4 | synbiochem is licensed under the MIT License. 5 | 6 | To view a copy of this license, visit . 7 | 8 | @author: neilswainston 9 | ''' 10 | -------------------------------------------------------------------------------- /start_db.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DIR=$(cd "$(dirname "$0")"; pwd) 4 | 5 | docker run \ 6 | --detach \ 7 | --user neo4j \ 8 | --publish=80:7474 \ 9 | --publish=443:7473 \ 10 | --publish=7687:7687 \ 11 | --volume=$DIR/neo4j/input:/input \ 12 | --env=NEO4J_AUTH=none \ 13 | --env=NEO4J_dbms_read__only=true \ 14 | --env=EXTENSION_SCRIPT=/input/extension_script.sh \ 15 | neo4j -------------------------------------------------------------------------------- /neo4j/input/extension_script.sh: -------------------------------------------------------------------------------- 1 | rm -rf /var/lib/neo4j/data/databases/graph.db 2 | 3 | nodes=`ls -d /input/nodes/*` 4 | rels=`ls -d /input/rels/*` 5 | nodes_str=`echo $nodes | sed "s/ / --nodes /g"` 6 | rels_str=`echo $rels| sed "s/ / --relationships /g"` 7 | 8 | /var/lib/neo4j/bin/neo4j-admin \ 9 | import \ 10 | --nodes $nodes_str \ 11 | --relationships $rels_str \ 12 | --delimiter ";" \ 13 | --array-delimiter "|" \ 14 | --multiline-fields true -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:2.7 2 | 3 | # Make current directory visible inside Docker container: 4 | COPY . 
/biochem4j 5 | WORKDIR /biochem4j 6 | 7 | # Install / update relevant ubuntu packages: 8 | RUN apt-get update \ 9 | && apt-get install -y --no-install-recommends libgmp3-dev 10 | 11 | # Download and install glpk: 12 | RUN mkdir /usr/local/glpk \ 13 | && curl http://ftp.gnu.org/gnu/glpk/glpk-4.39.tar.gz \ 14 | | tar xvzC /usr/local/glpk --strip-components=1 \ 15 | && cd /usr/local/glpk \ 16 | && ./configure \ 17 | && make \ 18 | && make install 19 | 20 | # Install requirements: 21 | RUN pip install --upgrade pip \ 22 | && pip install -r requirements.txt 23 | 24 | # Update paths: 25 | ENV LD_LIBRARY_PATH /usr/local/lib:${LD_LIBRARY_PATH} 26 | ENV PYTHONPATH $PYTHONPATH:. 27 | 28 | # Run: 29 | ENTRYPOINT ["python", "-u", "sbcdb/build.py"] -------------------------------------------------------------------------------- /sbcdb/index.py: -------------------------------------------------------------------------------- 1 | ''' 2 | SYNBIOCHEM-DB (c) University of Manchester 2015 3 | 4 | SYNBIOCHEM-DB is licensed under the MIT License. 5 | 6 | To view a copy of this license, visit . 
def index_db(db_loc):
    '''Apply every statement in init.cql to the database at db_loc.

    init.cql is read from the directory containing this module and each
    non-blank line is passed to ``neo4j-shell -path <db_loc> -c <line>``.

    Args:
        db_loc: path of the neo4j database directory.
    '''
    directory = os.path.dirname(os.path.realpath(__file__))
    filename = os.path.join(directory, 'init.cql')

    # Plain 'r' rather than 'rU': the 'U' flag was removed in Python 3.11,
    # and strip() below already discards any stray '\r'.
    with open(filename, 'r') as init_file:
        for line in init_file:
            statement = line.strip()

            # A blank line would otherwise be run as an empty command.
            if not statement:
                continue

            subprocess.call(['neo4j-shell', '-path', db_loc,
                             '-c', statement])


def main(argv):
    '''Command-line entry point; argv[0] is the database location.'''
    index_db(argv[0])


if __name__ == '__main__':
    main(sys.argv[1:])
class TestMnxRefReader(unittest.TestCase):
    '''Test class for MnxRefReader.'''

    def setUp(self):
        '''Reads chemical and reaction data once per test.'''
        unittest.TestCase.setUp(self)
        reader = MnxRefReader()
        self.__chem_data = reader.get_chem_data()
        self.__reac_data = reader.get_reac_data()

    def test_get_chem_data(self):
        '''Tests get_chem_data method.'''
        # assertEqual, not the deprecated assertEquals alias.
        self.assertEqual(self.__chem_data['MNXM1354']['chebi'], 'CHEBI:58282')

    def test_get_reac_data(self):
        '''Tests get_reac_data method.'''
        eqn = '1 MNXM1 + 1 MNXM6 + 1 MNXM97401 = 1 MNXM5 + 1 MNXM97393'
        self.assertEqual(self.__reac_data['MNXR62989']['equation'], eqn)


if __name__ == "__main__":
    unittest.main()
7 | 8 | @author: neilswainston 9 | ''' 10 | from libchebipy._chebi_entity import ChebiEntity 11 | 12 | 13 | def load(chem_manager, writer): 14 | '''Loads ChEBI data from libChEBIpy.''' 15 | chebi_ids = [] 16 | rels = [] 17 | 18 | _add_node('CHEBI:24431', chebi_ids, rels, chem_manager) 19 | 20 | writer.write_rels(rels, 'Chemical', 'Chemical') 21 | 22 | 23 | def _add_node(chebi_id, chebi_ids, rels, chem_manager): 24 | '''Constructs a node from libChEBI.''' 25 | if chebi_id not in chebi_ids: 26 | chebi_ids.append(chebi_id) 27 | 28 | chem_id, entity = chem_manager.add_chemical({'chebi': chebi_id}) 29 | 30 | for incoming in entity.get_incomings(): 31 | target_id = incoming.get_target_chebi_id() 32 | 33 | chebi_ent = ChebiEntity(target_id) 34 | 35 | if chebi_ent.get_parent_id(): 36 | target_id = chebi_ent.get_parent_id() 37 | 38 | _add_node(target_id, chebi_ids, rels, chem_manager) 39 | rels.append([target_id, incoming.get_type(), chem_id]) 40 | -------------------------------------------------------------------------------- /sbcdb/test/test_enzyme_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | synbiochem (c) University of Manchester 2015 3 | 4 | synbiochem is licensed under the MIT License. 5 | 6 | To view a copy of this license, visit . 
class TestEnzymeManager(unittest.TestCase):
    '''Unit tests for EnzymeManager.'''

    def setUp(self):
        '''Creates a fresh EnzymeManager for each test.'''
        unittest.TestCase.setUp(self)
        self.__manager = EnzymeManager()

    def test_add_uniprot_data(self):
        '''Tests add_uniprot_data method.'''
        uniprot_ids = ['P19367', 'Q2KNB7']
        expected = len(uniprot_ids)

        # Unthreaded run:
        self.__manager.add_uniprot_data(uniprot_ids, source='test')
        self.assertEqual(expected, len(self.__manager.get_nodes()))

        # Threaded run over the same ids; the node count must not change:
        self.__manager.add_uniprot_data(uniprot_ids, source='test',
                                        num_threads=24)
        self.assertEqual(expected, len(self.__manager.get_nodes()))


if __name__ == "__main__":
    unittest.main()
7 | 8 | @author: neilswainston 9 | ''' 10 | import tempfile 11 | import urllib 12 | 13 | 14 | __RHEA_URL = 'ftp://ftp.ebi.ac.uk/pub/databases/rhea/tsv/rhea2uniprot.tsv' 15 | 16 | 17 | def load(reaction_manager, source=__RHEA_URL, num_threads=0): 18 | '''Loads Rhea data.''' 19 | # Parse data: 20 | temp_file = tempfile.NamedTemporaryFile() 21 | urllib.urlretrieve(source, temp_file.name) 22 | data = _parse(temp_file.name) 23 | reaction_manager.add_react_to_enz(data, 'rhea', num_threads) 24 | 25 | 26 | def _parse(filename): 27 | '''Parses file.''' 28 | data = {} 29 | 30 | with open(filename, 'r') as textfile: 31 | next(textfile) 32 | 33 | for line in textfile: 34 | tokens = line.split('\t') 35 | 36 | if len(tokens) == 4: 37 | uniprot_id = tokens[3].strip() 38 | 39 | if not tokens[0] or not tokens[2]: 40 | print ','.join(tokens) 41 | 42 | _add(data, tokens[0], uniprot_id) 43 | _add(data, tokens[2], uniprot_id) 44 | 45 | return data 46 | 47 | 48 | def _add(data, rhea_id, uniprot_id): 49 | '''Adds Rhea id and Uniprot id to data.''' 50 | if rhea_id in data: 51 | data[rhea_id].append(uniprot_id) 52 | else: 53 | data[rhea_id] = [uniprot_id] 54 | -------------------------------------------------------------------------------- /sbcdb/namespace_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | synbiochem (c) University of Manchester 2015 3 | 4 | synbiochem is licensed under the MIT License. 5 | 6 | To view a copy of this license, visit . 
# Source-specific chemical key -> namespace.
__CHEMICAL_NAMESPACE = {
    # value (namespace) corresponds to identifiers.org:
    'bigg': 'bigg.metabolite',
    'CAS Registry Number': 'cas',
    'chebi': 'chebi',
    'ChemIDplus accession': 'chemidplus',
    'Chemspider accession': 'chemspider',
    'DrugBank accession': 'drugbank',
    'hmdb': 'hmdb',
    'HMDB accession': 'hmdb',
    'kegg': 'kegg.compound',
    'KEGG COMPOUND accession': 'kegg.compound',
    'KEGG DRUG accession': 'kegg.drug',
    'KEGG GLYCAN accession': 'kegg.glycan',
    'KNApSAcK accession': 'knapsack',
    'lipidmaps': 'lipidmaps',
    'LIPID MAPS instance accession': 'lipidmaps',
    'MolBase accession': 'molbase',
    'PDB accession': 'pdb',
    'PubMed citation': 'pubmed',
    'reactome': 'reactome',
    'RESID accession': 'resid',
    'seed': 'seed.compound',
    'umbbd': 'umbbd.compound',
    'UM-BBD compID': 'umbbd.compound',
    'upa': 'unipathway',
    'Wikipedia accession': 'wikipedia.en',

    # Not in identifiers.org:
    'metacyc': 'metacyc',
    'MetaCyc accession': 'metacyc',
    'mnx': 'mnx'
}

# Source-specific reaction key -> namespace.
__REACTION_NAMESPACE = {
    # value (namespace) corresponds to identifiers.org:
    'bigg': 'bigg.reaction',
    'kegg': 'kegg.reaction',
    'reactome': 'reactome',
    'rhea': 'rhea',
    'seed': 'seed',

    # Not in identifiers.org:
    'metacyc': 'metacyc',
    'mnx': 'mnx',
}


def resolve_namespace(name, chemical):
    '''Maps name to its namespace, or None if name is unknown.

    The chemical table is consulted when chemical is truthy, otherwise the
    reaction table.
    '''
    if chemical:
        lookup = __CHEMICAL_NAMESPACE
    else:
        lookup = __REACTION_NAMESPACE

    return lookup.get(name)
def build_csv(dest_dir, array_delimiter, num_threads):
    '''Builds the database CSV files under dest_dir.

    Loads, in order: NCBI Taxonomy, MNXref, ChEBI, spectrum data and Rhea,
    writing chemical files after the spectra step and reaction files last.
    '''
    writer = utils.Writer(dest_dir)

    # Organisms:
    print('Parsing NCBI Taxonomy')
    ncbi_taxonomy_utils.load(writer, array_delimiter)

    # Chemicals and reactions share these two managers:
    chem_man = chemical_utils.ChemicalManager(array_delimiter=array_delimiter)
    reac_man = reaction_utils.ReactionManager()

    print('Parsing MNXref')
    mnxref_utils.MnxRefLoader(chem_man, reac_man, writer).load()

    print('Parsing ChEBI')
    chebi_utils.load(chem_man, writer)

    # Spectra:
    print('Parsing spectrum data')
    spectra_utils.load(writer, chem_man, array_delimiter=array_delimiter)

    chem_man.write_files(writer)

    # Reaction / Enzyme / Organism data (the KEGG step is disabled in the
    # original and stays disabled here):
    print('Parsing Rhea')
    rhea_utils.load(reac_man, num_threads=num_threads)

    reac_man.write_files(writer)


def main(args):
    '''main method

    args[0] is the destination directory and args[1] the array delimiter.
    The optional args[2] is a thread count; the literal 'True' selects
    multiprocessing.cpu_count(), and any other non-integer leaves it at 0.
    '''
    num_threads = 0

    if len(args) > 2:
        threads_arg = args[2]

        try:
            num_threads = int(threads_arg)
        except ValueError:
            if threads_arg == 'True':
                num_threads = multiprocessing.cpu_count()

    print('Running build with %d threads' % num_threads)

    build_csv(args[0], args[1], num_threads)


if __name__ == '__main__':
    main(sys.argv[1:])
-------------------------------------------------------------------------------- 1 | CREATE CONSTRAINT ON (n:Organism) ASSERT n.taxonomy IS UNIQUE; 2 | CREATE CONSTRAINT ON (n:Enzyme) ASSERT n.entry IS UNIQUE; 3 | CREATE CONSTRAINT ON (n:Enzyme) ASSERT n.uniprot IS UNIQUE; 4 | CREATE CONSTRAINT ON (n:Reaction) ASSERT n.`bigg.reaction` IS UNIQUE; 5 | CREATE CONSTRAINT ON (n:Reaction) ASSERT n.id IS UNIQUE; 6 | CREATE CONSTRAINT ON (n:Reaction) ASSERT n.`kegg.reaction` IS UNIQUE; 7 | CREATE CONSTRAINT ON (n:Reaction) ASSERT n.metacyc IS UNIQUE; 8 | CREATE CONSTRAINT ON (n:Reaction) ASSERT n.mnx IS UNIQUE; 9 | CREATE CONSTRAINT ON (n:Reaction) ASSERT n.reactome IS UNIQUE; 10 | CREATE CONSTRAINT ON (n:Reaction) ASSERT n.rhea IS UNIQUE; 11 | CREATE CONSTRAINT ON (n:Reaction) ASSERT n.seed IS UNIQUE; 12 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`bigg.metabolite` IS UNIQUE; 13 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.cas IS UNIQUE; 14 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.chebi IS UNIQUE; 15 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.chemidplus IS UNIQUE; 16 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.chemspider IS UNIQUE; 17 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.drugbank IS UNIQUE; 18 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.hmdb IS UNIQUE; 19 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.id IS UNIQUE; 20 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`kegg.compound` IS UNIQUE; 21 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`kegg.drug` IS UNIQUE; 22 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`kegg.glycan` IS UNIQUE; 23 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.knapsack IS UNIQUE; 24 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.lipidmaps IS UNIQUE; 25 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.metacyc IS UNIQUE; 26 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.mnx IS UNIQUE; 27 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.molbase IS UNIQUE; 28 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.pdb IS UNIQUE; 29 | CREATE CONSTRAINT ON (n:Chemical) 
class Writer(object):
    '''Writes biochem4j node and relationship CSV files.'''

    def __init__(self, dest_dir):
        '''Creates empty 'nodes' and 'rels' directories under dest_dir,
        deleting any existing ones first.'''
        root = os.path.abspath(dest_dir)
        self.__nodes_dir = os.path.join(root, 'nodes')
        self.__rels_dir = os.path.join(root, 'rels')

        for directory in (self.__nodes_dir, self.__rels_dir):
            if os.path.exists(directory):
                rmtree(directory)

            os.makedirs(directory)

    def write_nodes(self, nodes, group, separator=';'):
        '''Writes nodes to <nodes dir>/<group>.csv.

        Columns that are empty for every node are dropped. Returns the
        filename written, or None when nodes is empty.
        '''
        if not nodes:
            return None

        frame = pd.DataFrame(nodes)
        frame.dropna(axis=1, how='all', inplace=True)

        target = os.path.join(self.__nodes_dir, group + '.csv')
        frame.to_csv(target, index=False, encoding='utf-8', sep=separator)

        return target

    def write_rels(self, rels, group_start, group_end, separator=';'):
        '''Writes relationships to <rels dir>/<start>_<end>.csv.

        Each relationship is [start id, type, end id] with an optional
        fourth element, a dict of properties, which is expanded into one
        column per key. Whether properties are present is decided from the
        first relationship only. Returns the filename written, or None when
        rels is empty.
        '''
        if not rels:
            return None

        has_properties = len(rels[0]) > 3

        header = [':START_ID(' + group_start + ')',
                  ':TYPE',
                  ':END_ID(' + group_end + ')']

        if has_properties:
            header.append('PROPERTIES')

        frame = pd.DataFrame(rels, columns=header)

        if has_properties:
            properties = pd.DataFrame(list(frame['PROPERTIES']))
            frame.drop('PROPERTIES', axis=1, inplace=True)
            frame = frame.join(properties)

        target = os.path.join(self.__rels_dir,
                              group_start + '_' + group_end + '.csv')
        frame.to_csv(target, index=False, encoding='utf-8', sep=separator)

        return target
class EnzymeManager(object):
    '''Collects Enzyme nodes and Organism-to-Enzyme relationships.'''

    def __init__(self):
        '''Constructor.'''
        self.__nodes = {}
        self.__org_enz_rels = []

    def get_nodes(self):
        '''Returns the enzyme node dicts.'''
        return self.__nodes.values()

    def get_org_enz_rels(self):
        '''Returns the organism-to-enzyme relationships.'''
        return self.__org_enz_rels

    def add_uniprot_data(self, enzyme_ids, source, num_threads=0):
        '''Fetches Uniprot data for ids not yet held and stores the result.

        One Enzyme node is created per Uniprot id returned. When an
        organism id is present, an 'expresses' relationship tagged with
        source is added.
        '''
        fields = ['entry name', 'protein names', 'organism-id', 'ec']

        # Only query ids that do not already have a node:
        new_ids = [enzyme_id for enzyme_id in enzyme_ids
                   if enzyme_id not in self.__nodes]

        uniprot_values = seq_utils.get_uniprot_values(new_ids, fields,
                                                      batch_size=512,
                                                      verbose=True,
                                                      num_threads=num_threads)

        for uniprot_id, values in uniprot_values.iteritems():
            node = {':LABEL': 'Enzyme',
                    'uniprot:ID(Enzyme)': uniprot_id}
            self.__nodes[uniprot_id] = node

            # The organism id is removed from the returned values.
            organism_id = None

            if 'Organism ID' in values:
                organism_id = values.pop('Organism ID')

            if 'Entry name' in values:
                node['entry'] = values['Entry name']

            if 'Protein names' in values:
                names = values['Protein names']
                node['names'] = names

                # The first protein name doubles as the display name.
                if names:
                    node['name'] = names[0]

            if 'EC number' in values:
                node['ec-code'] = values['EC number']

            if organism_id:
                self.__org_enz_rels.append([organism_id, 'expresses',
                                            uniprot_id, {'source': source}])
7 | 8 | @author: neilswainston 9 | ''' 10 | import os 11 | import sys 12 | import tarfile 13 | import tempfile 14 | import urllib 15 | 16 | 17 | __NCBITAXONOMY_URL = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz' 18 | 19 | 20 | def load(writer, array_delimiter, source=__NCBITAXONOMY_URL): 21 | '''Loads NCBI Taxonomy data.''' 22 | nodes_filename, names_filename = _get_ncbi_taxonomy_files(source) 23 | nodes, rels = _parse_nodes(nodes_filename, array_delimiter) 24 | _parse_names(nodes, names_filename, array_delimiter) 25 | 26 | writer.write_nodes(nodes.values(), 'Organism') 27 | writer.write_rels(rels, 'Organism', 'Organism') 28 | 29 | 30 | def _get_ncbi_taxonomy_files(source): 31 | '''Downloads and extracts NCBI Taxonomy files.''' 32 | temp_dir = tempfile.gettempdir() 33 | temp_gzipfile = tempfile.NamedTemporaryFile() 34 | urllib.urlretrieve(source, temp_gzipfile.name) 35 | 36 | temp_tarfile = tarfile.open(temp_gzipfile.name, 'r:gz') 37 | temp_tarfile.extractall(temp_dir) 38 | 39 | temp_gzipfile.close() 40 | temp_tarfile.close() 41 | 42 | return os.path.join(temp_dir, 'nodes.dmp'), \ 43 | os.path.join(temp_dir, 'names.dmp') 44 | 45 | 46 | def _parse_nodes(filename, array_delimiter): 47 | '''Parses nodes file.''' 48 | nodes = {} 49 | rels = [] 50 | 51 | with open(filename, 'r') as textfile: 52 | for line in textfile: 53 | tokens = [x.strip() for x in line.split('|')] 54 | tax_id = tokens[0] 55 | 56 | if tax_id != '1': 57 | rels.append([tax_id, 'is_a', tokens[1]]) 58 | 59 | nodes[tax_id] = {'taxonomy:ID(Organism)': tax_id, 60 | ':LABEL': 61 | 'Organism' + array_delimiter + tokens[2]} 62 | 63 | return nodes, rels 64 | 65 | 66 | def _parse_names(nodes, filename, array_delimiter): 67 | '''Parses names file.''' 68 | 69 | with open(filename, 'r') as textfile: 70 | for line in textfile: 71 | tokens = [x.strip() for x in line.split('|')] 72 | node = nodes[tokens[0]] 73 | 74 | if 'name' not in node: 75 | node['name'] = tokens[1] 76 | node['names:string[]'] = 
def load(reaction_manager, organisms=None, num_threads=0):
    '''Loads KEGG data.

    Links KEGG reactions to Uniprot ids via EC terms and genes, then
    passes the mapping to reaction_manager.add_react_to_enz under the
    'kegg.reaction' source.

    Args:
        reaction_manager: object exposing add_react_to_enz.
        organisms: KEGG organism codes; fetched from the KEGG REST service
            when None.
        num_threads: worker threads for per-organism downloads (0 runs
            them sequentially).
    '''
    if organisms is None:
        organisms = \
            sorted([line.split()[1] for line in
                    urllib2.urlopen('http://rest.kegg.jp/list/organism')])

    # EC to gene, gene to Uniprot:
    ec_genes, gene_uniprots = _get_gene_data(organisms, num_threads)

    data = defaultdict(list)

    # KEGG Reaction to EC:
    kegg_reac_ec = _parse_url('http://rest.kegg.jp/link/ec/reaction')

    for kegg_reac, ec_terms in kegg_reac_ec.items():
        for ec_term in ec_terms:
            # Membership tests, not indexing, so the defaultdicts do not
            # grow with empty entries.
            if ec_term not in ec_genes:
                continue

            for gene in ec_genes[ec_term]:
                if gene in gene_uniprots:
                    # [3:] drops the three-character prefix from each id.
                    uniprots = [val[3:] for val in gene_uniprots[gene]]
                    data[kegg_reac[3:]].extend(uniprots)

    reaction_manager.add_react_to_enz(data, 'kegg.reaction', num_threads)


def _get_gene_data(organisms, num_threads):
    '''Gets EC-to-gene and gene-to-Uniprot mappings for all organisms.

    Returns:
        (ec_genes, gene_uniprots), both defaultdict(list).
    '''
    ec_genes = defaultdict(list)
    gene_uniprots = defaultdict(list)

    if num_threads:
        thread_pool = thread_utils.ThreadPool(num_threads)

        for org in organisms:
            thread_pool.add_task(_parse_organism, org, ec_genes, gene_uniprots)

        thread_pool.wait_completion()
    else:
        for org in organisms:
            _parse_organism(org, ec_genes, gene_uniprots)

    return ec_genes, gene_uniprots


def _parse_organism(org, ec_genes, gene_uniprots):
    '''Downloads one organism's links and merges them into the mappings.'''
    print('KEGG: loading ' + org)

    for key, value in _parse_url('http://rest.kegg.jp/link/' + org.lower() +
                                 '/enzyme').items():
        ec_genes[key].extend(value)

    for key, value in _parse_url('http://rest.kegg.jp/conv/uniprot/' +
                                 org.lower()).items():
        gene_uniprots[key].extend(value)


def _parse_url(url, attempts=16):
    '''Parses url to form key to list of values dictionary.

    Each response line is split on whitespace; the first token is the key
    and the second is appended to that key's list. On urllib2.URLError the
    request is retried, up to attempts times.

    Returns:
        defaultdict(list); empty if every attempt failed.
    '''
    for _ in range(attempts):
        # Start each attempt from scratch so that a failed attempt cannot
        # leave duplicate values behind.
        data = defaultdict(list)

        try:
            for line in urllib2.urlopen(url):
                tokens = line.split()

                if len(tokens) > 1:
                    data[tokens[0]].append(tokens[1])

            return data
        except urllib2.URLError as err:
            # Report the failure and try again.
            print('\t'.join([url, str(err)]))

    return defaultdict(list)
class ReactionManager(object):
    '''Manages Reaction nodes and their links to Enzymes.'''

    def __init__(self):
        '''Constructor.'''
        self.__nodes = {}
        self.__reac_ids = {}
        self.__reac_enz_rels = []
        self.__org_enz_rels = []
        self.__enz_man = EnzymeManager()

    def write_files(self, writer):
        '''Writes neo4j import files.

        Returns:
            ([Reaction nodes file, Enzyme nodes file],
             [Reaction-Enzyme rels file, Organism-Enzyme rels file]).
        '''
        reaction_nodes = writer.write_nodes(self.__nodes.values(),
                                            'Reaction')
        enzyme_nodes = writer.write_nodes(self.__enz_man.get_nodes(),
                                          'Enzyme')
        reac_enz = writer.write_rels(self.__reac_enz_rels,
                                     'Reaction', 'Enzyme')
        org_enz = writer.write_rels(self.__enz_man.get_org_enz_rels(),
                                    'Organism', 'Enzyme')

        return ([reaction_nodes, enzyme_nodes], [reac_enz, org_enz])

    def add_reaction(self, source, reac_id, properties):
        '''Adds a reaction to the collection of nodes, ensuring uniqueness.

        If source + reac_id is a registered cross-reference, the reaction
        it points to is used instead. A new reaction stores properties
        (mutated in place) as its node and registers its 'mnx',
        'kegg.reaction' and 'rhea' values as cross-references. An existing
        reaction is updated with properties.

        Returns:
            the id of the node that was created or updated.
        '''
        reac_id = self.__reac_ids.get(source + reac_id, reac_id)

        if reac_id in self.__nodes:
            self.__nodes[reac_id].update(properties)
            return reac_id

        properties[':LABEL'] = 'Reaction'
        properties['id:ID(Reaction)'] = reac_id
        properties['source'] = source
        properties[source] = reac_id
        self.__nodes[reac_id] = properties

        for namespace in ('mnx', 'kegg.reaction', 'rhea'):
            if namespace in properties:
                self.__reac_ids[namespace + properties[namespace]] = reac_id

        return reac_id

    def add_react_to_enz(self, data, source, num_threads=0):
        '''Adds reactions, their enzymes and the links between them.

        Args:
            data: mapping of reaction id to a list of Uniprot ids.
            source: name of the data source.
            num_threads: passed through to the enzyme lookup.
        '''
        # Reaction nodes and Reaction-Enzyme relationships:
        enzyme_ids = self.__create_react_enz(data, source)

        # Enzyme nodes:
        self.__enz_man.add_uniprot_data(enzyme_ids, source, num_threads)

    def __create_react_enz(self, data, source):
        '''Creates Reaction nodes and 'catalysed_by' relationships.

        Returns:
            list of the distinct Uniprot ids seen.
        '''
        seen = []

        for raw_id, uniprot_ids in data.iteritems():
            node_id = self.add_reaction(source, raw_id, {})

            for uniprot_id in uniprot_ids:
                seen.append(uniprot_id)
                self.__reac_enz_rels.append([node_id, 'catalysed_by',
                                             uniprot_id,
                                             {'source': source}])

        return list(set(seen))
data, source): 70 | '''Creates Reaction and Enzyme nodes and their Relationships.''' 71 | enzyme_ids = [] 72 | 73 | for reac_id, uniprot_ids in data.iteritems(): 74 | reac_id = self.add_reaction(source, reac_id, {}) 75 | 76 | for uniprot_id in uniprot_ids: 77 | enzyme_ids.append(uniprot_id) 78 | self.__reac_enz_rels.append([reac_id, 'catalysed_by', 79 | uniprot_id, 80 | {'source': source}]) 81 | 82 | return list(set(enzyme_ids)) 83 | -------------------------------------------------------------------------------- /sbcdb/spectra_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | SYNBIOCHEM-DB (c) University of Manchester 2015 3 | 4 | SYNBIOCHEM-DB is licensed under the MIT License. 5 | 6 | To view a copy of this license, visit . 7 | 8 | @author: neilswainston 9 | ''' 10 | import os 11 | import tempfile 12 | from urllib import urlretrieve 13 | import zipfile 14 | 15 | import ijson 16 | 17 | 18 | __MONA_URL = 'http://mona.fiehnlab.ucdavis.edu/rest/downloads/retrieve/' + \ 19 | 'd2eb33f0-b22e-49a7-bc31-eb951f8347b2' 20 | 21 | __MONA_FILENAME = 'MoNA-export-All_Spectra.json' 22 | 23 | _NAME_MAP = {'kegg': 'kegg.compound', 24 | 'molecular formula': 'formula', 25 | 'total exact mass': 'monoisotopic_mass:float'} 26 | 27 | 28 | def load(writer, chem_manager, 29 | array_delimiter='|', url=__MONA_URL, filename=__MONA_FILENAME): 30 | '''Build Spectrum nodes and relationships.''' 31 | nodes = [] 32 | rels = [] 33 | 34 | records = _parse(_get_file(url, filename), array_delimiter) 35 | 36 | for record in records: 37 | chem_id, _ = chem_manager.add_chemical(record['chemical']) 38 | nodes.append(record['spectrum']) 39 | rels.append([chem_id, 'has', record['spectrum']['id:ID(Spectrum)']]) 40 | 41 | return [writer.write_nodes(nodes, 'Spectrum')], \ 42 | [writer.write_rels(rels, 'Chemical', 'Spectrum')] 43 | 44 | 45 | def _parse(filename, array_delimiter): 46 | '''Parses MoNA json file.''' 47 | records = [] 48 | record = {'chemical': 
{'names:string[]': []}, 49 | 'spectrum': {':LABEL': 'Spectrum', 'tags:string[]': []}} 50 | name = None 51 | 52 | for prefix, typ, value in ijson.parse(open(filename)): 53 | if prefix == 'item' and typ == 'start_map': 54 | record = {'chemical': {'names:string[]': []}, 55 | 'spectrum': {':LABEL': 'Spectrum', 56 | 'tags:string[]': []}} 57 | elif prefix == 'item.compound.item.inchi': 58 | record['chemical']['inchi'] = value 59 | elif prefix == 'item.compound.item.names.item.name': 60 | if 'name' not in record['chemical']: 61 | record['chemical']['name'] = value 62 | record['chemical']['names:string[]'].append(value) 63 | elif prefix == 'item.compound.item.metaData.item.name' or \ 64 | prefix == 'item.metaData.item.name': 65 | name = _normalise_name(value.lower()) 66 | elif prefix == 'item.compound.item.metaData.item.value': 67 | _parse_compound_metadata(name, value, record) 68 | name = None 69 | elif prefix == 'item.id': 70 | record['spectrum']['id:ID(Spectrum)'] = value 71 | elif prefix == 'item.metaData.item.value': 72 | record['spectrum'][name] = value 73 | name = None 74 | elif prefix == 'item.spectrum': 75 | values = [float(val) for term in value.split() 76 | for val in term.split(':')] 77 | record['spectrum']['m/z:float[]'] = \ 78 | array_delimiter.join(map(str, values[0::2])) 79 | record['spectrum']['I:float[]'] = \ 80 | array_delimiter.join(map(str, values[1::2])) 81 | elif prefix == 'item.tags.item.text': 82 | record['spectrum']['tags:string[]'].append(value) 83 | elif prefix == 'item' and typ == 'end_map': 84 | records.append(record) 85 | 86 | return records 87 | 88 | 89 | def _get_file(url, filename): 90 | '''Gets file from url.''' 91 | destination = os.path.join(os.path.expanduser('~'), 'MoNA') 92 | 93 | if not os.path.exists(destination): 94 | os.makedirs(destination) 95 | 96 | filepath = os.path.join(destination, filename) 97 | 98 | if not os.path.exists(filepath): 99 | tmp_file = tempfile.NamedTemporaryFile(delete=False) 100 | urlretrieve(url, 
tmp_file.name) 101 | zfile = zipfile.ZipFile(tmp_file.name, 'r') 102 | filepath = os.path.join(destination, zfile.namelist()[0]) 103 | zfile.extractall(destination) 104 | 105 | return filepath 106 | 107 | 108 | def _parse_compound_metadata(name, value, record): 109 | '''Parses compound metadata.''' 110 | if name == 'chebi' and isinstance(value, unicode): 111 | value = value.replace('CHEBI:', '').split()[0] 112 | 113 | record['chemical'][_normalise_name(name)] = value 114 | 115 | 116 | def _normalise_name(name): 117 | '''Normalises name in name:value pairs.''' 118 | if name in _NAME_MAP: 119 | return _NAME_MAP[name] 120 | 121 | return name.replace(':', '_') 122 | -------------------------------------------------------------------------------- /sbcdb/chemical_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | SYNBIOCHEM-DB (c) University of Manchester 2015 3 | 4 | SYNBIOCHEM-DB is licensed under the MIT License. 5 | 6 | To view a copy of this license, visit . 
class ChemicalManager(object):
    '''Class to implement a manager of Chemical data.

    Keeps one property dict per chemical id, plus a map from every known
    identifier (ChEBI id, MNX id, InChI, generated uuid) to that chemical id.
    '''

    def __init__(self, array_delimiter):
        '''Constructor.

        array_delimiter: separator used when joining ChEBI names.
        '''
        self.__array_delimiter = array_delimiter
        self.__nodes = {}
        self.__chem_ids = {}

    def write_files(self, writer):
        '''Write neo4j import files.

        Passes all chemical nodes to writer.write_nodes and returns its
        result.
        '''
        return writer.write_nodes(self.__nodes.values(), 'Chemical')

    def add_chemical(self, properties):
        '''Adds a chemical to the collection of nodes, ensuring uniqueness.

        properties: dict of chemical properties; it is modified in place and
            either stored as the node or merged into an existing node.

        Returns a (chem_id, chebi_ent) pair; chebi_ent is None unless the
        chemical was resolved through ChEBI.
        '''
        chem_id, chebi_ent = self.__get_chem_id(properties)

        if 'charge:float' in properties:
            charge = properties.pop('charge:float')

            # NaN charges are dropped; others are stored as whole numbers.
            if not math.isnan(charge):
                properties['charge:float'] = int(charge)

        if chem_id not in self.__nodes:
            properties[':LABEL'] = 'Chemical'
            properties['id:ID(Chemical)'] = chem_id
            properties['source'] = 'chebi' if 'chebi' in properties else 'mnx'

            _normalise_mass(properties)
            self.__nodes[chem_id] = properties
        else:
            self.__nodes[chem_id].update(properties)

        return chem_id, chebi_ent

    def get_props(self, prop, default=None):
        '''Gets all chem_ids to property as a dict.

        The keys are every registered identifier, not only chemical ids.
        '''
        return {key: self.__nodes[chem_id].get(prop, default)
                for key, chem_id in self.__chem_ids.items()}

    def get_prop(self, chem_id, prop, default=None):
        '''Gets a property.

        chem_id may be any registered identifier. Raises KeyError if it is
        unknown.
        '''
        return self.__nodes[self.__chem_ids[chem_id]].get(prop, default)

    def __get_chem_id(self, properties):
        '''Manages chemical id mapping.

        Resolution order: ChEBI id, then a previously seen InChI, then a
        previously seen MNX id, then the supplied MNX id, and finally a newly
        generated uuid. Returns a (chem_id, chebi_ent) pair.
        '''
        chebi_id = properties.get('chebi', None)
        chebi_ent = None

        if chebi_id:
            try:
                chebi_id, chebi_ent = _get_chebi_data(chebi_id, properties,
                                                      self.__array_delimiter)
            except (ChebiException, ValueError) as err:
                # ChEBI lookup failed: fall back to the other identifiers.
                properties.pop('chebi')
                chebi_id = None
                print(err)

        mnx_id = properties.get('mnx', None)
        inchi_id = properties.get('inchi', None)

        if chebi_id:
            self.__chem_ids[chebi_id] = chebi_id

            if inchi_id:
                self.__chem_ids[inchi_id] = chebi_id

            if mnx_id:
                self.__chem_ids[mnx_id] = chebi_id

            return chebi_id, chebi_ent

        # Reuse an existing chemical if either identifier is already known;
        # InChI takes precedence over the MNX id.
        for known_id in (inchi_id, mnx_id):
            if known_id:
                chem_id = self.__chem_ids.get(known_id, None)

                if chem_id:
                    return chem_id, None

        if mnx_id:
            if inchi_id:
                self.__chem_ids[inchi_id] = mnx_id

            self.__chem_ids[mnx_id] = mnx_id
            return mnx_id, None

        new_id = str(uuid.uuid4())

        # Register the generated id under itself, as the ChEBI and MNX
        # branches do, so that get_prop(new_id, ...) can resolve it.
        self.__chem_ids[new_id] = new_id

        # Only index by InChI when there is one; otherwise a None key would
        # be written into the id map.
        if inchi_id:
            self.__chem_ids[inchi_id] = new_id

        return new_id, None


def _get_chebi_data(chebi_id, properties, array_delimiter):
    '''Gets ChEBI data.

    Looks up the ChEBI entity and copies its id (the parent id if it has
    one), formula, charge, InChI, SMILES, names and resolvable database
    accessions into `properties`.

    Returns a (chebi_id, chebi_ent) pair.
    '''
    chebi_ent = ChebiEntity(str(chebi_id))

    # Prefer the parent entry's id when the entity has one.
    parent_id = chebi_ent.get_parent_id()
    chebi_id = parent_id if parent_id else chebi_ent.get_id()
    properties['chebi'] = chebi_id

    formula = chebi_ent.get_formula()
    charge = chebi_ent.get_charge()
    inchi = chebi_ent.get_inchi()
    smiles = chebi_ent.get_smiles()

    if formula:
        properties['formula'] = formula

    if not math.isnan(charge):
        properties['charge:float'] = charge

    if inchi:
        properties['inchi'] = inchi

    if smiles:
        properties['smiles'] = smiles

    primary_name = chebi_ent.get_name()
    all_names = [name.get_name() for name in chebi_ent.get_names()]
    all_names.append(primary_name)

    properties['name'] = primary_name
    properties['names:string[]'] = array_delimiter.join(all_names)

    for db_acc in chebi_ent.get_database_accessions():
        namespace = ns_utils.resolve_namespace(db_acc.get_type(), True)

        if namespace is not None:
            properties[namespace] = db_acc.get_accession_number()

    return chebi_id, chebi_ent


def _normalise_mass(properties):
    '''Removes ambiguity in mass values by recalculating according to chemical
    formula.

    Drops any 'mass:float' entry and, when a formula is present, stores the
    mass computed from it under 'monoisotopic_mass:float' unless it is NaN.
    '''
    properties.pop('mass:float', None)

    if properties.get('formula') is not None:
        mono_mass = chem_utils.get_molecular_mass(properties['formula'])

        if not math.isnan(mono_mass):
            properties['monoisotopic_mass:float'] = mono_mass
7 | 8 | @author: neilswainston 9 | ''' 10 | # pylint: disable=no-member 11 | # pylint: disable=too-few-public-methods 12 | # pylint: disable=too-many-locals 13 | from collections import Counter 14 | import csv 15 | import itertools 16 | import math 17 | import re 18 | import urllib2 19 | 20 | import numpy 21 | from subliminal import balance 22 | 23 | from sbcdb import namespace_utils 24 | from synbiochem.utils import chem_utils 25 | 26 | 27 | _METANETX_URL = 'http://metanetx.org/cgi-bin/mnxget/mnxref/' 28 | 29 | 30 | class MnxRefReader(object): 31 | '''Class to read MnxRef data from the chem_prop.tsv, the chem_xref.tsv and 32 | reac_prop.tsv files.''' 33 | 34 | def __init__(self, source=_METANETX_URL): 35 | self.__source = source 36 | self.__mnx_id_patt = re.compile(r'(MNX[MR])(\d+)') 37 | self.__chem_data = {} 38 | self.__reac_data = {} 39 | 40 | def get_chem_data(self): 41 | '''Gets chemical data.''' 42 | if not self.__chem_data: 43 | self.__read_chem_prop() 44 | self.__read_xref('chem_xref.tsv', self.__chem_data, True) 45 | 46 | return self.__chem_data 47 | 48 | def get_reac_data(self): 49 | '''Gets reaction data.''' 50 | if not self.__reac_data: 51 | self.__read_reac_prop() 52 | self.__read_xref('reac_xref.tsv', self.__reac_data, False) 53 | 54 | return self.__reac_data 55 | 56 | def __read_chem_prop(self): 57 | '''Read chemical properties and create Nodes.''' 58 | chem_prop_keys = ['id', 'name', 'formula', 'charge:float', 59 | 'mass:float', 'inchi', 'smiles', 'source'] 60 | 61 | for values in self.__read_data('chem_prop.tsv'): 62 | if not values[0].startswith('#'): 63 | values[0] = self.__parse_id(values[0]) 64 | values[7] = self.__parse_id(values[7]) 65 | props = dict(zip(chem_prop_keys, values)) 66 | props.pop('source') 67 | _convert_to_float(props, 'charge:float') 68 | _convert_to_float(props, 'mass:float') 69 | props = {key: value for key, value in props.iteritems() 70 | if value != ''} 71 | self.__chem_data[values[0]] = props 72 | 73 | def 
__read_xref(self, filename, data, chemical): 74 | '''Read xrefs and update Nodes.''' 75 | xref_keys = ['XREF', 'MNX_ID', 'Evidence', 'Description'] 76 | 77 | for values in self.__read_data(filename): 78 | if not values[0].startswith('#'): 79 | xrefs = dict(zip(xref_keys[:len(values)], values)) 80 | evidence = xrefs.get('Evidence', 'identity') 81 | 82 | if evidence == 'identity' or evidence == 'structural': 83 | xrefs['MNX_ID'] = self.__parse_id(xrefs['MNX_ID']) 84 | xref = xrefs['XREF'].split(':') 85 | 86 | if xrefs['MNX_ID'] in data: 87 | entry = data[xrefs['MNX_ID']] 88 | self.__add_xref(xref, entry, chemical) 89 | 90 | def __add_xref(self, xref, entry, chemical): 91 | '''Adds an xref.''' 92 | namespace = namespace_utils.resolve_namespace(xref[0], 93 | chemical) 94 | 95 | if namespace is not None: 96 | xref[1] = self.__parse_id(xref[1]) 97 | 98 | entry[namespace] = xref[1] \ 99 | if namespace != 'chebi' \ 100 | else 'CHEBI:' + xref[1] 101 | 102 | def __read_reac_prop(self): 103 | '''Read reaction properties and create Nodes.''' 104 | reac_prop_keys = ['id', 'equation', 'description', 'balance', 'ec', 105 | 'Source'] 106 | 107 | for values in self.__read_data('reac_prop.tsv'): 108 | if not values[0].startswith('#'): 109 | values[0] = self.__parse_id(values[0]) 110 | values[5] = self.__parse_id(values[5]) 111 | 112 | props = dict(zip(reac_prop_keys, values)) 113 | props.pop('Source') 114 | 115 | try: 116 | participants = chem_utils.parse_equation( 117 | props.pop('equation')) 118 | 119 | for participant in participants: 120 | participant[0] = self.__parse_id(participant[0]) 121 | 122 | if participant[0] not in self.__chem_data: 123 | self.__add_chem(participant[0]) 124 | 125 | props['reac_defn'] = participants 126 | self.__reac_data[values[0]] = props 127 | except ValueError: 128 | print 'WARNING: Suspected polymerisation reaction: ' + \ 129 | values[0] + '\t' + str(props) 130 | 131 | def __add_chem(self, chem_id): 132 | '''Adds a chemical with given id.''' 133 | 
props = {'id': chem_id} 134 | self.__chem_data[chem_id] = props 135 | return props 136 | 137 | def __read_data(self, filename): 138 | '''Downloads and reads tab-limited files into lists of lists of 139 | strings.''' 140 | return list(csv.reader(urllib2.urlopen(self.__source + filename), 141 | delimiter='\t')) 142 | 143 | def __parse_id(self, item_id): 144 | '''Parses mnx ids.''' 145 | matches = self.__mnx_id_patt.findall(item_id) 146 | 147 | for mat in matches: 148 | return mat[0] + str(int(mat[1])) 149 | 150 | return item_id 151 | 152 | 153 | class MnxRefLoader(object): 154 | '''Loads MNXref data into neo4j format.''' 155 | 156 | def __init__(self, chem_man, reac_man, writer): 157 | self.__chem_man = chem_man 158 | self.__reac_man = reac_man 159 | self.__writer = writer 160 | 161 | def load(self): 162 | '''Loads MnxRef data from chem_prop.tsv, chem_xref.tsv, 163 | reac_prop.tsv and reac_xref.tsv files.''' 164 | reader = MnxRefReader() 165 | 166 | for properties in reader.get_chem_data().values(): 167 | properties['mnx'] = properties.pop('id') 168 | self.__chem_man.add_chemical(properties) 169 | 170 | rels = self.__add_reac_nodes(reader.get_reac_data()) 171 | 172 | return [], [self.__writer.write_rels(rels, 'Reaction', 'Chemical')] 173 | 174 | def __add_reac_nodes(self, reac_data): 175 | '''Get reaction nodes from data.''' 176 | reac_id_def = {} 177 | 178 | for properties in reac_data.values(): 179 | reac_def = [] 180 | mnx_id = properties.pop('id') 181 | 182 | # Remove equation and description (may be inconsistent with 183 | # balanced reaction): 184 | if 'description' in properties: 185 | properties.pop('description') 186 | 187 | for prt in properties.pop('reac_defn'): 188 | chem_id, _ = self.__chem_man.add_chemical({'mnx': prt[0]}) 189 | 190 | reac_def.append([self.__chem_man.get_prop(prt[0], 'formula'), 191 | self.__chem_man.get_prop(prt[0], 192 | 'charge:float', 0), 193 | prt[1], 194 | chem_id]) 195 | 196 | if all([values[0] is not None for values in 
reac_def]): 197 | balanced, _, balanced_def = balance.balance_reac(reac_def) 198 | properties['balance'] = balanced 199 | else: 200 | properties['balance'] = 'unknown' 201 | balanced_def = reac_def 202 | 203 | reac_id = self.__reac_man.add_reaction('mnx', mnx_id, 204 | properties) 205 | reac_id_def[reac_id] = balanced_def 206 | 207 | chem_id_mass = self.__chem_man.get_props('monoisotopic_mass:float', 208 | float('NaN')) 209 | cofactors = [chem_id 210 | for chem_id, mass in chem_id_mass.iteritems() 211 | if mass > 0 and mass < 44] # Assume mass < CO2 = cofactor 212 | 213 | cofactor_pairs = _calc_cofactors(reac_id_def.values(), cofactors) 214 | rels = [] 215 | 216 | for reac_id, defn in reac_id_def.iteritems(): 217 | reactants = [term[3] for term in defn if term[2] < 0] 218 | products = [term[3] for term in defn if term[2] > 0] 219 | reac_cofactors = [] 220 | 221 | # Set metabolites as cofactors: 222 | for met in [term[3] for term in defn]: 223 | if met in cofactors: 224 | reac_cofactors.append(met) 225 | 226 | # Set pairs as cofactors: 227 | for pair in itertools.product(reactants, products): 228 | if tuple(sorted(pair)) in cofactor_pairs: 229 | reac_cofactors.extend(pair) 230 | 231 | for term in defn: 232 | rels.append([reac_id, 233 | 'has_cofactor' if term[3] in reac_cofactors 234 | else 'has_reactant', 235 | term[3], 236 | {'stoichiometry:float': term[2]}]) 237 | 238 | return rels 239 | 240 | 241 | def _calc_cofactors(reaction_defs, cofactors, cutoff=0.8): 242 | '''Calculates cofactors.''' 243 | pairs = Counter() 244 | 245 | # Calculate all reactant / product pairs... 
246 | for reaction_def in reaction_defs: 247 | reactants = [term[3] for term in reaction_def if term[2] < 0 and 248 | term[3] not in cofactors] 249 | products = [term[3] for term in reaction_def if term[2] > 0 and 250 | term[3] not in cofactors] 251 | 252 | pairs.update([tuple(sorted(pair)) 253 | for pair in itertools.product(reactants, products)]) 254 | 255 | return _filter(pairs, cutoff) 256 | 257 | 258 | def _filter(counter, cutoff): 259 | '''Filter counter items according to cutoff.''' 260 | # Count occurences of pairs, then bin into a histogram... 261 | hist_counter = Counter(counter.values()) 262 | 263 | # Fit straight-line to histogram log-log plot and filter... 264 | x_val, y_val = zip(*list(hist_counter.items())) 265 | m_val, b_val = numpy.polyfit(numpy.log(x_val), numpy.log(y_val), 1) 266 | 267 | return [item[0] for item in counter.items() 268 | if item[1] > math.exp(cutoff * -b_val / m_val)] 269 | 270 | 271 | def _convert_to_float(dictionary, key): 272 | '''Converts a key value in a dictionary to a float.''' 273 | if dictionary.get(key, None): 274 | dictionary[key] = float(dictionary[key] 275 | if dictionary[key] != 'NA' 276 | else 'NaN') 277 | else: 278 | # Remove key: 279 | dictionary.pop(key, None) 280 | --------------------------------------------------------------------------------