├── neo4j
    ├── .gitignore
    └── input
    │   └── extension_script.sh
├── requirements.txt
├── README.md
├── .gitignore
├── sbcdb
    ├── __init__.py
    ├── test
    │   ├── __init__.py
    │   ├── test_mnxref_utils.py
    │   └── test_enzyme_utils.py
    ├── index.py
    ├── chebi_utils.py
    ├── rhea_utils.py
    ├── namespace_utils.py
    ├── build.py
    ├── init.cql
    ├── utils.py
    ├── enzyme_utils.py
    ├── ncbi_taxonomy_utils.py
    ├── kegg_utils.py
    ├── reaction_utils.py
    ├── spectra_utils.py
    ├── chemical_utils.py
    └── mnxref_utils.py
├── start_db.sh
├── Dockerfile
└── LICENSE


/neo4j/.gitignore:
--------------------------------------------------------------------------------
1 | /csv/
2 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | ijson
2 | glpk
3 | libchebipy
4 | synbiochem-py


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # biochem4j
2 | biochem4j: integrated and extensible biochemical knowledge through graph databases


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /.project
2 | /.pydevproject
3 | /neo4j/input/nodes
4 | /neo4j/input/rels
5 | **/*.log
6 | **/*.pyc
7 | **/.DS_Store


--------------------------------------------------------------------------------
/sbcdb/__init__.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | SYNBIOCHEM-DB (c) University of Manchester 2015
 3 | 
 4 | SYNBIOCHEM-DB is licensed under the MIT License.
 5 | 
 6 | To view a copy of this license, visit <http://opensource.org/licenses/MIT/>.
 7 | 
 8 | @author:  neilswainston
 9 | '''
10 | 


--------------------------------------------------------------------------------
/sbcdb/test/__init__.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | synbiochem (c) University of Manchester 2015
 3 | 
 4 | synbiochem is licensed under the MIT License.
 5 | 
 6 | To view a copy of this license, visit <http://opensource.org/licenses/MIT/>.
 7 | 
 8 | @author:  neilswainston
 9 | '''
10 | 


--------------------------------------------------------------------------------
/start_db.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | DIR=$(cd "$(dirname "$0")"; pwd)
 4 | 
 5 | docker run \
 6 | --detach \
 7 | --user neo4j \
 8 | --publish=80:7474 \
 9 | --publish=443:7473 \
10 | --publish=7687:7687 \
11 | --volume=$DIR/neo4j/input:/input \
12 | --env=NEO4J_AUTH=none \
13 | --env=NEO4J_dbms_read__only=true \
14 | --env=EXTENSION_SCRIPT=/input/extension_script.sh \
15 | neo4j


--------------------------------------------------------------------------------
/neo4j/input/extension_script.sh:
--------------------------------------------------------------------------------
 1 | rm -rf /var/lib/neo4j/data/databases/graph.db
 2 | 
 3 | nodes=`ls -d /input/nodes/*`
 4 | rels=`ls -d /input/rels/*`
 5 | nodes_str=`echo $nodes | sed "s/ / --nodes /g"`
 6 | rels_str=`echo $rels| sed "s/ / --relationships /g"`
 7 | 
 8 | /var/lib/neo4j/bin/neo4j-admin \
 9 | 	import \
10 | 	--nodes $nodes_str \
11 | 	--relationships $rels_str \
12 | 	--delimiter ";" \
13 | 	--array-delimiter "|" \
14 | 	--multiline-fields true


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:2.7
 2 | 
 3 | # Make current directory visible inside Docker container:
 4 | COPY . /biochem4j
 5 | WORKDIR /biochem4j
 6 | 
 7 | # Install / update relevant Debian packages:
 8 | RUN apt-get update \
 9 | 	&& apt-get install -y --no-install-recommends libgmp3-dev
10 | 
11 | # Download and install glpk:
12 | RUN mkdir /usr/local/glpk \
13 | 	&& curl http://ftp.gnu.org/gnu/glpk/glpk-4.39.tar.gz \
14 | 	| tar xvzC /usr/local/glpk --strip-components=1 \
15 | 	&& cd /usr/local/glpk \
16 | 	&& ./configure \
17 | 	&& make \
18 | 	&& make install
19 | 
20 | # Install requirements:
21 | RUN pip install --upgrade pip \
22 | 	&& pip install -r requirements.txt
23 | 
24 | # Update paths:
25 | ENV LD_LIBRARY_PATH /usr/local/lib:${LD_LIBRARY_PATH}
26 | ENV PYTHONPATH $PYTHONPATH:.
27 | 
28 | # Run:
29 | ENTRYPOINT ["python", "-u", "sbcdb/build.py"]


--------------------------------------------------------------------------------
/sbcdb/index.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | SYNBIOCHEM-DB (c) University of Manchester 2015
 3 | 
 4 | SYNBIOCHEM-DB is licensed under the MIT License.
 5 | 
 6 | To view a copy of this license, visit <http://opensource.org/licenses/MIT/>.
 7 | 
 8 | @author:  neilswainston
 9 | '''
10 | import os
11 | import subprocess
12 | import sys
13 | 
14 | 
def index_db(db_loc):
    '''Index database.

    Reads the init.cql file located in the same directory as this module and
    passes each non-blank line, as a separate command, to neo4j-shell for the
    database at db_loc.
    '''
    directory = os.path.dirname(os.path.realpath(__file__))
    filename = os.path.join(directory, 'init.cql')

    # Plain 'r' is sufficient: every line is stripped below, so universal
    # newline mode ('rU', removed from Python 3.11) adds nothing.
    with open(filename, 'r') as init_file:
        for line in init_file:
            statement = line.strip()

            # Do not spawn neo4j-shell for blank lines (e.g. a trailing
            # newline at the end of the file).
            if not statement:
                continue

            params = ['neo4j-shell', '-path', db_loc, '-c', statement]
            subprocess.call(params)
24 | 
25 | 
def main(argv):
    '''Indexes the database whose location is the first argument.'''
    db_loc = argv[0]
    index_db(db_loc)
29 | 
30 | 
31 | if __name__ == '__main__':
32 |     main(sys.argv[1:])
33 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Neil Swainston
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/sbcdb/test/test_mnxref_utils.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | synbiochem (c) University of Manchester 2015
 3 | 
 4 | synbiochem is licensed under the MIT License.
 5 | 
 6 | To view a copy of this license, visit <http://opensource.org/licenses/MIT/>.
 7 | 
 8 | @author:  neilswainston
 9 | '''
10 | # pylint: disable=too-many-public-methods
11 | import unittest
12 | 
13 | from sbcdb.mnxref_utils import MnxRefReader
14 | 
15 | 
class TestMnxRefReader(unittest.TestCase):
    '''Test class for MnxRefReader.'''

    def setUp(self):
        '''Reads chemical and reaction data through a fresh MnxRefReader.'''
        unittest.TestCase.setUp(self)
        reader = MnxRefReader()
        self.__chem_data = reader.get_chem_data()
        self.__reac_data = reader.get_reac_data()

    def test_get_chem_data(self):
        '''Tests get_chem_data method.'''
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(self.__chem_data['MNXM1354']['chebi'], 'CHEBI:58282')

    def test_get_reac_data(self):
        '''Tests get_reac_data method.'''
        eqn = '1 MNXM1 + 1 MNXM6 + 1 MNXM97401 = 1 MNXM5 + 1 MNXM97393'
        self.assertEqual(self.__reac_data['MNXR62989']['equation'], eqn)
33 | 
34 | 
35 | if __name__ == "__main__":
36 |     # import sys;sys.argv = ['', 'Test.testName']
37 |     unittest.main()
38 | 


--------------------------------------------------------------------------------
/sbcdb/chebi_utils.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | SYNBIOCHEM-DB (c) University of Manchester 2015
 3 | 
 4 | SYNBIOCHEM-DB is licensed under the MIT License.
 5 | 
 6 | To view a copy of this license, visit <http://opensource.org/licenses/MIT/>.
 7 | 
 8 | @author:  neilswainston
 9 | '''
10 | from libchebipy._chebi_entity import ChebiEntity
11 | 
12 | 
def load(chem_manager, writer):
    '''Loads ChEBI data from libChEBIpy.

    Starting from CHEBI:24431, registers every reachable entity with
    chem_manager and writes the collected Chemical-to-Chemical relationships
    through writer.write_rels.
    '''
    # A set gives constant-time "already visited" checks; with a list every
    # lookup scans all ids seen so far, making the traversal quadratic.
    chebi_ids = set()
    rels = []

    _add_node('CHEBI:24431', chebi_ids, rels, chem_manager)

    writer.write_rels(rels, 'Chemical', 'Chemical')


def _add_node(chebi_id, chebi_ids, rels, chem_manager):
    '''Constructs a node from libChEBI.

    chebi_ids is the set of ids already visited and rels the list to which
    [target_id, relationship type, chem_id] entries are appended; both are
    updated in place. Entities behind incoming relationships are visited
    recursively.
    '''
    if chebi_id in chebi_ids:
        return

    chebi_ids.add(chebi_id)

    chem_id, entity = chem_manager.add_chemical({'chebi': chebi_id})

    for incoming in entity.get_incomings():
        target_id = incoming.get_target_chebi_id()

        # Use the parent id in place of the target id when one is reported.
        parent_id = ChebiEntity(target_id).get_parent_id()

        if parent_id:
            target_id = parent_id

        _add_node(target_id, chebi_ids, rels, chem_manager)
        rels.append([target_id, incoming.get_type(), chem_id])
40 | 


--------------------------------------------------------------------------------
/sbcdb/test/test_enzyme_utils.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | synbiochem (c) University of Manchester 2015
 3 | 
 4 | synbiochem is licensed under the MIT License.
 5 | 
 6 | To view a copy of this license, visit <http://opensource.org/licenses/MIT/>.
 7 | 
 8 | @author:  neilswainston
 9 | '''
10 | # pylint: disable=too-many-public-methods
11 | import unittest
12 | 
13 | from sbcdb.enzyme_utils import EnzymeManager
14 | 
15 | 
class TestEnzymeManager(unittest.TestCase):
    '''Unit tests for EnzymeManager.'''

    def setUp(self):
        '''Creates a new EnzymeManager for each test.'''
        unittest.TestCase.setUp(self)
        self.__manager = EnzymeManager()

    def test_add_uniprot_data(self):
        '''Checks add_uniprot_data, first unthreaded and then threaded.'''
        enzyme_ids = ['P19367', 'Q2KNB7']
        manager = self.__manager

        # Unthreaded call: one node expected per supplied id.
        manager.add_uniprot_data(enzyme_ids, source='test')
        self.assertEqual(len(enzyme_ids), len(manager.get_nodes()))

        # Threaded call with the same ids: the node count must not change.
        manager.add_uniprot_data(enzyme_ids, source='test', num_threads=24)
        self.assertEqual(len(enzyme_ids), len(manager.get_nodes()))
35 | 
36 | 
37 | if __name__ == "__main__":
38 |     # import sys;sys.argv = ['', 'Test.testName']
39 |     unittest.main()
40 | 


--------------------------------------------------------------------------------
/sbcdb/rhea_utils.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | SYNBIOCHEM-DB (c) University of Manchester 2015
 3 | 
 4 | SYNBIOCHEM-DB is licensed under the MIT License.
 5 | 
 6 | To view a copy of this license, visit <http://opensource.org/licenses/MIT/>.
 7 | 
 8 | @author:  neilswainston
 9 | '''
10 | import tempfile
11 | import urllib
12 | 
13 | 
14 | __RHEA_URL = 'ftp://ftp.ebi.ac.uk/pub/databases/rhea/tsv/rhea2uniprot.tsv'
15 | 
16 | 
def load(reaction_manager, source=__RHEA_URL, num_threads=0):
    '''Loads Rhea data.

    Downloads source to a temporary file, parses it into a dictionary of
    Rhea id to Uniprot ids and passes that to
    reaction_manager.add_react_to_enz under the 'rhea' namespace.
    '''
    # The context manager closes the temporary file even if the download or
    # the parse raises; previously the handle was never closed.
    with tempfile.NamedTemporaryFile() as temp_file:
        urllib.urlretrieve(source, temp_file.name)
        data = _parse(temp_file.name)

    reaction_manager.add_react_to_enz(data, 'rhea', num_threads)
24 | 
25 | 
def _parse(filename):
    '''Parses file.

    The first line is treated as a header and skipped. Only lines with
    exactly four tab-separated tokens are used: the fourth token is the
    Uniprot id, which is recorded against both the first and the third
    token (Rhea ids). Returns a dictionary of Rhea id to list of Uniprot ids.
    '''
    data = {}

    with open(filename, 'r') as textfile:
        # The default stops an empty file from raising StopIteration.
        next(textfile, None)

        for line in textfile:
            tokens = line.split('\t')

            if len(tokens) != 4:
                continue

            uniprot_id = tokens[3].strip()
            rhea_ids = (tokens[0], tokens[2])

            # Report rows with a missing Rhea id (single-argument form
            # prints identically under Python 2 and 3).
            if not all(rhea_ids):
                print(','.join(tokens))

            for rhea_id in rhea_ids:
                # An empty id is reported above but never stored, so no
                # reaction is registered under the key ''.
                if rhea_id:
                    _add(data, rhea_id, uniprot_id)

    return data


def _add(data, rhea_id, uniprot_id):
    '''Adds Rhea id and Uniprot id to data.'''
    data.setdefault(rhea_id, []).append(uniprot_id)
54 | 


--------------------------------------------------------------------------------
/sbcdb/namespace_utils.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | synbiochem (c) University of Manchester 2015
 3 | 
 4 | synbiochem is licensed under the MIT License.
 5 | 
 6 | To view a copy of this license, visit <http://opensource.org/licenses/MIT/>.
 7 | 
 8 | @author:  neilswainston
 9 | '''
# Source name -> namespace lookup for chemicals.
__CHEMICAL_NAMESPACE = {
    # Values in this group are identifiers.org namespaces:
    'bigg': 'bigg.metabolite',
    'CAS Registry Number': 'cas',
    'chebi': 'chebi',
    'ChemIDplus accession': 'chemidplus',
    'Chemspider accession': 'chemspider',
    'DrugBank accession': 'drugbank',
    'hmdb': 'hmdb',
    'HMDB accession': 'hmdb',
    'kegg': 'kegg.compound',
    'KEGG COMPOUND accession': 'kegg.compound',
    'KEGG DRUG accession': 'kegg.drug',
    'KEGG GLYCAN accession': 'kegg.glycan',
    'KNApSAcK accession': 'knapsack',
    'lipidmaps': 'lipidmaps',
    'LIPID MAPS instance accession': 'lipidmaps',
    'MolBase accession': 'molbase',
    'PDB accession': 'pdb',
    'PubMed citation': 'pubmed',
    'reactome': 'reactome',
    'RESID accession': 'resid',
    'seed': 'seed.compound',
    'umbbd': 'umbbd.compound',
    'UM-BBD compID': 'umbbd.compound',
    'upa': 'unipathway',
    'Wikipedia accession': 'wikipedia.en',

    # Values in this group are not identifiers.org namespaces:
    'metacyc': 'metacyc',
    'MetaCyc accession': 'metacyc',
    'mnx': 'mnx'
}

# Source name -> namespace lookup for reactions.
__REACTION_NAMESPACE = {
    # Values in this group are identifiers.org namespaces:
    'bigg': 'bigg.reaction',
    'kegg': 'kegg.reaction',
    'reactome': 'reactome',
    'rhea': 'rhea',
    'seed': 'seed',

    # Values in this group are not identifiers.org namespaces:
    'metacyc': 'metacyc',
    'mnx': 'mnx',
}


def resolve_namespace(name, chemical):
    '''Returns the namespace registered for name, or None if there is none.

    The chemical table is consulted when chemical is truthy, otherwise the
    reaction table.
    '''
    if chemical:
        return __CHEMICAL_NAMESPACE.get(name)

    return __REACTION_NAMESPACE.get(name)
62 | 


--------------------------------------------------------------------------------
/sbcdb/build.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | SYNBIOCHEM-DB (c) University of Manchester 2015
 3 | 
 4 | SYNBIOCHEM-DB is licensed under the MIT License.
 5 | 
 6 | To view a copy of this license, visit <http://opensource.org/licenses/MIT/>.
 7 | 
 8 | @author:  neilswainston
 9 | '''
10 | import multiprocessing
11 | import sys
12 | 
13 | from sbcdb import chebi_utils, chemical_utils, mnxref_utils, \
14 |     ncbi_taxonomy_utils, reaction_utils, rhea_utils, spectra_utils, utils
15 | 
16 | 
def build_csv(dest_dir, array_delimiter, num_threads):
    '''Builds the database CSV files beneath dest_dir.

    Loads, in order, NCBI Taxonomy, MNXref, ChEBI, spectrum and Rhea data,
    writing the chemical files after the spectrum step and the reaction
    files after the Rhea step.
    '''
    writer = utils.Writer(dest_dir)

    # Organisms:
    print('Parsing NCBI Taxonomy')
    ncbi_taxonomy_utils.load(writer, array_delimiter)

    # Managers shared by the chemical and reaction loaders below:
    chem_mgr = chemical_utils.ChemicalManager(array_delimiter=array_delimiter)
    reac_mgr = reaction_utils.ReactionManager()

    print('Parsing MNXref')
    mnxref_utils.MnxRefLoader(chem_mgr, reac_mgr, writer).load()

    print('Parsing ChEBI')
    chebi_utils.load(chem_mgr, writer)

    # Spectra:
    print('Parsing spectrum data')
    spectra_utils.load(writer, chem_mgr, array_delimiter=array_delimiter)

    chem_mgr.write_files(writer)

    # Reactions / Enzymes / Organisms.
    # KEGG loading is currently disabled:
    # print 'Parsing KEGG'
    # kegg_utils.load(reac_man, num_threads=num_threads)

    print('Parsing Rhea')
    rhea_utils.load(reac_mgr, num_threads=num_threads)

    reac_mgr.write_files(writer)
51 | 
52 | 
def main(args):
    '''Runs build_csv from command-line style arguments.

    args[0] is the destination directory and args[1] the array delimiter.
    The optional args[2] is either an integer thread count or the string
    'True', which selects multiprocessing.cpu_count() threads; anything
    else leaves the count at 0.
    '''
    thread_arg = args[2] if len(args) > 2 else None
    num_threads = 0

    if thread_arg is not None:
        try:
            num_threads = int(thread_arg)
        except ValueError:
            if thread_arg == 'True':
                num_threads = multiprocessing.cpu_count()

    print('Running build with %d threads' % num_threads)

    build_csv(args[0], args[1], num_threads)
67 | 
68 | 
69 | if __name__ == '__main__':
70 |     main(sys.argv[1:])
71 | 


--------------------------------------------------------------------------------
/sbcdb/init.cql:
--------------------------------------------------------------------------------
 1 | CREATE CONSTRAINT ON (n:Organism) ASSERT n.taxonomy IS UNIQUE;
 2 | CREATE CONSTRAINT ON (n:Enzyme) ASSERT n.entry IS UNIQUE;
 3 | CREATE CONSTRAINT ON (n:Enzyme) ASSERT n.uniprot IS UNIQUE;
 4 | CREATE CONSTRAINT ON (n:Reaction) ASSERT n.`bigg.reaction` IS UNIQUE;
 5 | CREATE CONSTRAINT ON (n:Reaction) ASSERT n.id IS UNIQUE;
 6 | CREATE CONSTRAINT ON (n:Reaction) ASSERT n.`kegg.reaction` IS UNIQUE;
 7 | CREATE CONSTRAINT ON (n:Reaction) ASSERT n.metacyc IS UNIQUE;
 8 | CREATE CONSTRAINT ON (n:Reaction) ASSERT n.mnx IS UNIQUE;
 9 | CREATE CONSTRAINT ON (n:Reaction) ASSERT n.reactome IS UNIQUE;
10 | CREATE CONSTRAINT ON (n:Reaction) ASSERT n.rhea IS UNIQUE;
11 | CREATE CONSTRAINT ON (n:Reaction) ASSERT n.seed IS UNIQUE;
12 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`bigg.metabolite` IS UNIQUE;
13 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.cas IS UNIQUE;
14 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.chebi IS UNIQUE;
15 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.chemidplus IS UNIQUE;
16 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.chemspider IS UNIQUE;
17 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.drugbank IS UNIQUE;
18 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.hmdb IS UNIQUE;
19 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.id IS UNIQUE;
20 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`kegg.compound` IS UNIQUE;
21 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`kegg.drug` IS UNIQUE;
22 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`kegg.glycan` IS UNIQUE;
23 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.knapsack IS UNIQUE;
24 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.lipidmaps IS UNIQUE;
25 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.metacyc IS UNIQUE;
26 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.mnx IS UNIQUE;
27 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.molbase IS UNIQUE;
28 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.pdb IS UNIQUE;
29 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.pubmed IS UNIQUE;
30 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.reactome IS UNIQUE;
31 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.resid IS UNIQUE;
32 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`seed.compound` IS UNIQUE;
33 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`umbbd.compound` IS UNIQUE;
34 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.unipathway IS UNIQUE;
35 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`wikipedia.en` IS UNIQUE;


--------------------------------------------------------------------------------
/sbcdb/utils.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | synbiochem (c) University of Manchester 2016
 3 | 
 4 | synbiochem is licensed under the MIT License.
 5 | 
 6 | To view a copy of this license, visit <http://opensource.org/licenses/MIT/>.
 7 | 
 8 | @author:  neilswainston
 9 | '''
10 | # pylint: disable=invalid-name
11 | # pylint: disable=too-many-arguments
12 | import os
13 | from shutil import rmtree
14 | 
15 | import pandas as pd
16 | 
17 | 
class Writer(object):
    '''CSV file writer class for biochem4j files.

    Node files are written to <dest_dir>/nodes and relationship files to
    <dest_dir>/rels. Both directories are deleted and recreated when the
    writer is constructed.
    '''

    def __init__(self, dest_dir):
        '''Creates empty nodes and rels directories beneath dest_dir.'''
        base_dir = os.path.abspath(dest_dir)
        self.__nodes_dir = self._reset_dir(os.path.join(base_dir, 'nodes'))
        self.__rels_dir = self._reset_dir(os.path.join(base_dir, 'rels'))

    @staticmethod
    def _reset_dir(path):
        '''Removes path if it exists, recreates it empty and returns it.'''
        if os.path.exists(path):
            rmtree(path)

        os.makedirs(path)
        return path

    def write_nodes(self, nodes, group, separator=';'):
        '''Writes Nodes to csv file.

        nodes is an iterable of dictionaries. Columns that are empty for
        every node are dropped. Returns the name of the file written
        (<group>.csv in the nodes directory), or None if nodes is empty.
        '''
        # Materialise first: dictionary views and generators are then
        # handled like lists, and the emptiness test below is reliable.
        nodes = list(nodes)

        if not nodes:
            return None

        df = pd.DataFrame(nodes)
        df.dropna(axis=1, how='all', inplace=True)

        filename = os.path.join(self.__nodes_dir, group + '.csv')
        df.to_csv(filename, index=False, encoding='utf-8', sep=separator)

        return filename

    def write_rels(self, rels, group_start, group_end, separator=';'):
        '''Writes Relationships to csv file.

        rels is an iterable of [start_id, type, end_id] entries with an
        optional fourth element, a dictionary of properties, which is
        expanded into one column per key. Whether properties are present is
        decided from the first entry. Returns the name of the file written
        (<group_start>_<group_end>.csv in the rels directory), or None if
        rels is empty.
        '''
        # Materialise first so that rels[0] works for any iterable.
        rels = list(rels)

        if not rels:
            return None

        has_props = len(rels[0]) > 3

        columns = [':START_ID(' + group_start + ')',
                   ':TYPE',
                   ':END_ID(' + group_end + ')']

        if has_props:
            columns.append('PROPERTIES')

        df = pd.DataFrame(rels, columns=columns)

        if has_props:
            # Replace the single dictionary column by one column per key.
            props_df = pd.DataFrame(list(df['PROPERTIES']))
            df.drop('PROPERTIES', axis=1, inplace=True)
            df = df.join(props_df)

        filename = os.path.join(self.__rels_dir,
                                group_start + '_' + group_end + '.csv')
        df.to_csv(filename, index=False, encoding='utf-8', sep=separator)

        return filename
72 | 


--------------------------------------------------------------------------------
/sbcdb/enzyme_utils.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | SYNBIOCHEM-DB (c) University of Manchester 2015
 3 | 
 4 | SYNBIOCHEM-DB is licensed under the MIT License.
 5 | 
 6 | To view a copy of this license, visit <http://opensource.org/licenses/MIT/>.
 7 | 
 8 | @author:  neilswainston
 9 | '''
10 | from synbiochem.utils import seq_utils
11 | 
12 | 
class EnzymeManager(object):
    '''Collects Enzyme nodes and Organism-to-Enzyme relationships.'''

    def __init__(self):
        '''Creates a manager holding no nodes and no relationships.'''
        self.__nodes = {}
        self.__org_enz_rels = []

    def get_nodes(self):
        '''Returns the enzyme nodes collected so far.'''
        return self.__nodes.values()

    def get_org_enz_rels(self):
        '''Returns the organism-to-enzyme relationships collected so far.'''
        return self.__org_enz_rels

    def add_uniprot_data(self, enzyme_ids, source, num_threads=0):
        '''Adds Enzyme nodes for those enzyme_ids not already held.

        Values are fetched with seq_utils.get_uniprot_values. For each entry
        returned, an Enzyme node is stored under its Uniprot id and, if an
        organism id is present, an 'expresses' relationship tagged with
        source is recorded.
        '''
        unseen_ids = [enz_id for enz_id in enzyme_ids
                      if enz_id not in self.__nodes]

        uniprot_values = seq_utils.get_uniprot_values(
            unseen_ids,
            ['entry name', 'protein names', 'organism-id', 'ec'],
            batch_size=512,
            verbose=True,
            num_threads=num_threads)

        for uniprot_id, values in uniprot_values.items():
            node = {':LABEL': 'Enzyme',
                    'uniprot:ID(Enzyme)': uniprot_id}
            self.__nodes[uniprot_id] = node

            # The organism id is removed from the returned values.
            organism_id = values.pop('Organism ID', None)

            if 'Entry name' in values:
                node['entry'] = values['Entry name']

            if 'Protein names' in values:
                names = values['Protein names']
                node['names'] = names

                # The first listed name doubles as the display name.
                if names:
                    node['name'] = names[0]

            if 'EC number' in values:
                node['ec-code'] = values['EC number']

            if organism_id:
                self.__org_enz_rels.append([organism_id, 'expresses',
                                            uniprot_id, {'source': source}])
63 | 


--------------------------------------------------------------------------------
/sbcdb/ncbi_taxonomy_utils.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | SYNBIOCHEM-DB (c) University of Manchester 2015
 3 | 
 4 | SYNBIOCHEM-DB is licensed under the MIT License.
 5 | 
 6 | To view a copy of this license, visit <http://opensource.org/licenses/MIT/>.
 7 | 
 8 | @author:  neilswainston
 9 | '''
10 | import os
11 | import sys
12 | import tarfile
13 | import tempfile
14 | import urllib
15 | 
16 | 
17 | __NCBITAXONOMY_URL = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'
18 | 
19 | 
def load(writer, array_delimiter, source=__NCBITAXONOMY_URL):
    '''Downloads and parses NCBI Taxonomy data, then writes it via writer.

    Organism nodes go to writer.write_nodes and the Organism-to-Organism
    relationships to writer.write_rels.
    '''
    filenames = _get_ncbi_taxonomy_files(source)

    nodes, rels = _parse_nodes(filenames[0], array_delimiter)
    _parse_names(nodes, filenames[1], array_delimiter)

    organisms = nodes.values()
    writer.write_nodes(organisms, 'Organism')
    writer.write_rels(rels, 'Organism', 'Organism')
28 | 
29 | 
def _get_ncbi_taxonomy_files(source):
    '''Downloads and extracts NCBI Taxonomy files.

    Retrieves the gzipped tar archive at source and extracts nodes.dmp and
    names.dmp into a newly created temporary directory. Returns the paths of
    the nodes file and the names file, in that order. The directory is not
    removed afterwards.
    '''
    wanted = ('nodes.dmp', 'names.dmp')

    # A private directory avoids clashing with other files in the shared
    # temporary directory.
    temp_dir = tempfile.mkdtemp()

    # Both handles are closed even if the download or extraction fails.
    with tempfile.NamedTemporaryFile() as temp_gzipfile:
        urllib.urlretrieve(source, temp_gzipfile.name)

        with tarfile.open(temp_gzipfile.name, 'r:gz') as temp_tarfile:
            # Extract only the two regular files that are needed, matched by
            # normalised name, so that nothing in a downloaded archive can
            # be written outside temp_dir.
            members = [member for member in temp_tarfile.getmembers()
                       if member.isfile() and
                       os.path.normpath(member.name) in wanted]
            temp_tarfile.extractall(temp_dir, members=members)

    return os.path.join(temp_dir, 'nodes.dmp'), \
        os.path.join(temp_dir, 'names.dmp')
44 | 
45 | 
def _parse_nodes(filename, array_delimiter):
    '''Reads the nodes file into Organism nodes and is_a relationships.

    Each line is split on '|' and the fields stripped: the first field is
    the taxonomy id, the second the id used as the is_a target and the third
    is appended to the 'Organism' label. No relationship is created for
    taxonomy id '1'. Returns (nodes keyed by taxonomy id, relationships).
    '''
    nodes = {}
    rels = []

    with open(filename, 'r') as textfile:
        for line in textfile:
            fields = [field.strip() for field in line.split('|')]
            tax_id, parent_id, label = fields[0], fields[1], fields[2]

            if tax_id != '1':
                rels.append([tax_id, 'is_a', parent_id])

            nodes[tax_id] = {
                'taxonomy:ID(Organism)': tax_id,
                ':LABEL': 'Organism' + array_delimiter + label,
            }

    return nodes, rels
64 | 
65 | 
def _parse_names(nodes, filename, array_delimiter):
    '''Parses names file.

    Each line is split on '|' and the fields stripped: the first field is
    the key into nodes and the second a name. The first name met for a node
    becomes its 'name'; all of its names are collected, de-duplicated and
    finally stored under 'names:string[]' as a single string joined by
    array_delimiter. nodes is updated in place.

    Raises KeyError if the file refers to an id that is absent from nodes.
    '''
    with open(filename, 'r') as textfile:
        for line in textfile:
            tokens = [x.strip() for x in line.split('|')]
            node = nodes[tokens[0]]

            if 'name' not in node:
                node['name'] = tokens[1]
                node['names:string[]'] = set([node['name']])
            else:
                node['names:string[]'].add(tokens[1])

    # values() works under both Python 2 and 3 (iteritems is Python 2 only).
    for node in nodes.values():
        if 'names:string[]' in node:
            # Sorting makes the written value reproducible; joining the set
            # directly gives an arbitrary order.
            node['names:string[]'] = \
                array_delimiter.join(sorted(node['names:string[]']))
84 | 
85 | 
def main(argv):
    '''Forwards the command-line arguments, positionally, to load.

    NOTE(review): load calls write_nodes / write_rels on its first argument,
    so forwarding raw command-line strings looks unworkable - confirm the
    intended usage.
    '''
    args = list(argv)
    load(*args)
89 | 
90 | 
91 | if __name__ == "__main__":
92 |     main(sys.argv[1:])
93 | 


--------------------------------------------------------------------------------
/sbcdb/kegg_utils.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | SYNBIOCHEM-DB (c) University of Manchester 2015
 3 | 
 4 | SYNBIOCHEM-DB is licensed under the MIT License.
 5 | 
 6 | To view a copy of this license, visit <http://opensource.org/licenses/MIT/>.
 7 | 
 8 | @author:  neilswainston
 9 | '''
10 | from collections import defaultdict
11 | import urllib2
12 | 
13 | from synbiochem.utils import thread_utils
14 | 
15 | 
def load(reaction_manager, organisms=None, num_threads=0):
    '''Links KEGG reactions to Uniprot ids and hands them to reaction_manager.

    If organisms is None, the sorted list of organism codes is read from the
    KEGG REST service. Reactions are mapped to EC terms, EC terms to genes
    and genes to Uniprot ids; the result is passed to
    reaction_manager.add_react_to_enz under the 'kegg.reaction' namespace.
    '''
    if organisms is None:
        response = urllib2.urlopen('http://rest.kegg.jp/list/organism')
        organisms = sorted(line.split()[1] for line in response)

    # EC term -> genes, and gene -> Uniprot ids:
    ec_genes, gene_uniprots = _get_gene_data(organisms, num_threads)

    # KEGG reaction -> EC terms:
    kegg_reac_ec = _parse_url('http://rest.kegg.jp/link/ec/reaction')

    data = defaultdict(list)

    for kegg_reac, ec_terms in kegg_reac_ec.items():
        # The first three characters of reaction and Uniprot values are
        # dropped before they are stored.
        reac_id = kegg_reac[3:]

        for ec_term in ec_terms:
            if ec_term not in ec_genes:
                continue

            for gene in ec_genes[ec_term]:
                if gene not in gene_uniprots:
                    continue

                data[reac_id].extend(val[3:] for val in gene_uniprots[gene])

    reaction_manager.add_react_to_enz(data, 'kegg.reaction', num_threads)
41 | 
42 | 
def _get_gene_data(organisms, num_threads):
    '''Collects EC-to-gene and gene-to-Uniprot mappings for organisms.

    Organisms are parsed one after another when num_threads is falsy,
    otherwise as tasks on a thread pool of num_threads threads. Returns the
    two mappings as (ec_genes, gene_uniprots).
    '''
    ec_genes = defaultdict(list)
    gene_uniprots = defaultdict(list)

    if not num_threads:
        for org in organisms:
            _parse_organism(org, ec_genes, gene_uniprots)

        return ec_genes, gene_uniprots

    # Threaded: every task fills the same two dictionaries.
    thread_pool = thread_utils.ThreadPool(num_threads)

    for org in organisms:
        thread_pool.add_task(_parse_organism, org, ec_genes, gene_uniprots)

    thread_pool.wait_completion()

    return ec_genes, gene_uniprots
60 | 
61 | 
def _parse_organism(org, ec_genes, gene_uniprots):
    '''Adds the KEGG links of a single organism to the shared mappings.

    The enzyme links of the lower-cased organism code extend ec_genes and
    its Uniprot conversions extend gene_uniprots, both in place.
    '''
    print('KEGG: loading ' + org)

    org_code = org.lower()

    enzyme_url = 'http://rest.kegg.jp/link/' + org_code + '/enzyme'

    for key, value in _parse_url(enzyme_url).items():
        ec_genes[key].extend(value)

    uniprot_url = 'http://rest.kegg.jp/conv/uniprot/' + org_code

    for key, value in _parse_url(uniprot_url).items():
        gene_uniprots[key].extend(value)
73 | 
74 | 
def _parse_url(url, attempts=16):
    '''Parses url to form key to list of values dictionary.

    Each line of the response is split on whitespace; for lines with at
    least two tokens, the second token is appended to the list held under
    the first. A urllib2.URLError is printed and the request retried, up to
    attempts times in total. If no attempt succeeds an empty dictionary is
    returned.
    '''
    for _ in range(attempts):
        # Start afresh on every attempt, so that entries gathered by a
        # failed attempt are never kept or duplicated by a retry.
        data = defaultdict(list)

        try:
            for line in urllib2.urlopen(url):
                tokens = line.split()

                if len(tokens) > 1:
                    data[tokens[0]].append(tokens[1])

            return data
        except urllib2.URLError as err:
            # Take no action, but try again...
            print('\t'.join([url, str(err)]))

    return defaultdict(list)
93 | 


--------------------------------------------------------------------------------
/sbcdb/reaction_utils.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | SYNBIOCHEM-DB (c) University of Manchester 2015
 3 | 
 4 | SYNBIOCHEM-DB is licensed under the MIT License.
 5 | 
 6 | To view a copy of this license, visit <http://opensource.org/licenses/MIT/>.
 7 | 
 8 | @author:  neilswainston
 9 | '''
10 | from sbcdb.enzyme_utils import EnzymeManager
11 | 
12 | 
class ReactionManager(object):
    '''Manager of Reaction nodes and of their relationships to Enzymes.'''

    def __init__(self):
        '''Creates an empty manager.'''
        # Reaction id -> dictionary of Reaction node properties:
        self.__nodes = {}
        # (namespace + id within that namespace) -> Reaction id:
        self.__reac_ids = {}
        self.__reac_enz_rels = []
        self.__org_enz_rels = []
        self.__enz_man = EnzymeManager()

    def write_files(self, writer):
        '''Writes the neo4j import files through the supplied writer.

        Returns a tuple of two lists: the results of writing the Reaction and
        Enzyme nodes, and the results of writing the Reaction-Enzyme and
        Organism-Enzyme relationships.'''
        reaction_nodes = writer.write_nodes(self.__nodes.values(),
                                            'Reaction')
        enzyme_nodes = writer.write_nodes(self.__enz_man.get_nodes(),
                                          'Enzyme')
        reac_enz_rels = writer.write_rels(self.__reac_enz_rels,
                                          'Reaction', 'Enzyme')
        org_enz_rels = writer.write_rels(self.__enz_man.get_org_enz_rels(),
                                         'Organism', 'Enzyme')

        return ([reaction_nodes, enzyme_nodes],
                [reac_enz_rels, org_enz_rels])

    def add_reaction(self, source, reac_id, properties):
        '''Adds a reaction, or merges it into an existing one.

        If source + reac_id has been registered as a cross-reference of an
        existing reaction, that reaction's id is used in place of reac_id.
        The properties of an existing node are updated with those supplied.
        Otherwise properties (which is modified in place) becomes the new
        node, and its mnx, kegg.reaction and rhea values are registered as
        cross-references.

        Returns the id of the Reaction node.'''
        reac_id = self.__reac_ids.get(source + reac_id, reac_id)

        if reac_id in self.__nodes:
            self.__nodes[reac_id].update(properties)
            return reac_id

        properties[':LABEL'] = 'Reaction'
        properties['id:ID(Reaction)'] = reac_id
        properties['source'] = source
        properties[source] = reac_id
        self.__nodes[reac_id] = properties

        # Register the cross-references, such that subsequent additions from
        # these namespaces resolve to this node:
        for namespace in ('mnx', 'kegg.reaction', 'rhea'):
            if namespace in properties:
                self.__reac_ids[namespace + properties[namespace]] = reac_id

        return reac_id

    def add_react_to_enz(self, data, source, num_threads=0):
        '''Submits reaction id to uniprot ids data to the graph.'''
        # Reaction nodes and Reaction-Enzyme relationships:
        enzyme_ids = self.__create_react_enz(data, source)

        # Enzyme nodes:
        self.__enz_man.add_uniprot_data(enzyme_ids, source, num_threads)

    def __create_react_enz(self, data, source):
        '''Adds a Reaction per key of data and a 'catalysed_by' relationship
        per uniprot id in its value.

        Returns the list of distinct uniprot ids encountered.'''
        all_uniprot_ids = []

        for data_reac_id, uniprot_ids in data.items():
            node_id = self.add_reaction(source, data_reac_id, {})

            for uniprot_id in uniprot_ids:
                all_uniprot_ids.append(uniprot_id)
                relationship = [node_id, 'catalysed_by', uniprot_id,
                                {'source': source}]
                self.__reac_enz_rels.append(relationship)

        return list(set(all_uniprot_ids))
83 | 


--------------------------------------------------------------------------------
/sbcdb/spectra_utils.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | SYNBIOCHEM-DB (c) University of Manchester 2015
  3 | 
  4 | SYNBIOCHEM-DB is licensed under the MIT License.
  5 | 
  6 | To view a copy of this license, visit <http://opensource.org/licenses/MIT/>.
  7 | 
  8 | @author:  neilswainston
  9 | '''
 10 | import os
 11 | import tempfile
 12 | from urllib import urlretrieve
 13 | import zipfile
 14 | 
 15 | import ijson
 16 | 
 17 | 
 18 | __MONA_URL = 'http://mona.fiehnlab.ucdavis.edu/rest/downloads/retrieve/' + \
 19 |     'd2eb33f0-b22e-49a7-bc31-eb951f8347b2'
 20 | 
 21 | __MONA_FILENAME = 'MoNA-export-All_Spectra.json'
 22 | 
 23 | _NAME_MAP = {'kegg': 'kegg.compound',
 24 |              'molecular formula': 'formula',
 25 |              'total exact mass': 'monoisotopic_mass:float'}
 26 | 
 27 | 
 28 | def load(writer, chem_manager,
 29 |          array_delimiter='|', url=__MONA_URL, filename=__MONA_FILENAME):
 30 |     '''Build Spectrum nodes and relationships.'''
 31 |     nodes = []
 32 |     rels = []
 33 | 
 34 |     records = _parse(_get_file(url, filename), array_delimiter)
 35 | 
 36 |     for record in records:
 37 |         chem_id, _ = chem_manager.add_chemical(record['chemical'])
 38 |         nodes.append(record['spectrum'])
 39 |         rels.append([chem_id, 'has', record['spectrum']['id:ID(Spectrum)']])
 40 | 
 41 |     return [writer.write_nodes(nodes, 'Spectrum')], \
 42 |         [writer.write_rels(rels, 'Chemical', 'Spectrum')]
 43 | 
 44 | 
 45 | def _parse(filename, array_delimiter):
 46 |     '''Parses MoNA json file.'''
 47 |     records = []
 48 |     record = {'chemical': {'names:string[]': []},
 49 |               'spectrum': {':LABEL': 'Spectrum', 'tags:string[]': []}}
 50 |     name = None
 51 | 
 52 |     for prefix, typ, value in ijson.parse(open(filename)):
 53 |         if prefix == 'item' and typ == 'start_map':
 54 |             record = {'chemical': {'names:string[]': []},
 55 |                       'spectrum': {':LABEL': 'Spectrum',
 56 |                                    'tags:string[]': []}}
 57 |         elif prefix == 'item.compound.item.inchi':
 58 |             record['chemical']['inchi'] = value
 59 |         elif prefix == 'item.compound.item.names.item.name':
 60 |             if 'name' not in record['chemical']:
 61 |                 record['chemical']['name'] = value
 62 |             record['chemical']['names:string[]'].append(value)
 63 |         elif prefix == 'item.compound.item.metaData.item.name' or \
 64 |                 prefix == 'item.metaData.item.name':
 65 |             name = _normalise_name(value.lower())
 66 |         elif prefix == 'item.compound.item.metaData.item.value':
 67 |             _parse_compound_metadata(name, value, record)
 68 |             name = None
 69 |         elif prefix == 'item.id':
 70 |             record['spectrum']['id:ID(Spectrum)'] = value
 71 |         elif prefix == 'item.metaData.item.value':
 72 |             record['spectrum'][name] = value
 73 |             name = None
 74 |         elif prefix == 'item.spectrum':
 75 |             values = [float(val) for term in value.split()
 76 |                       for val in term.split(':')]
 77 |             record['spectrum']['m/z:float[]'] = \
 78 |                 array_delimiter.join(map(str, values[0::2]))
 79 |             record['spectrum']['I:float[]'] = \
 80 |                 array_delimiter.join(map(str, values[1::2]))
 81 |         elif prefix == 'item.tags.item.text':
 82 |             record['spectrum']['tags:string[]'].append(value)
 83 |         elif prefix == 'item' and typ == 'end_map':
 84 |             records.append(record)
 85 | 
 86 |     return records
 87 | 
 88 | 
 89 | def _get_file(url, filename):
 90 |     '''Gets file from url.'''
 91 |     destination = os.path.join(os.path.expanduser('~'), 'MoNA')
 92 | 
 93 |     if not os.path.exists(destination):
 94 |         os.makedirs(destination)
 95 | 
 96 |     filepath = os.path.join(destination, filename)
 97 | 
 98 |     if not os.path.exists(filepath):
 99 |         tmp_file = tempfile.NamedTemporaryFile(delete=False)
100 |         urlretrieve(url, tmp_file.name)
101 |         zfile = zipfile.ZipFile(tmp_file.name, 'r')
102 |         filepath = os.path.join(destination, zfile.namelist()[0])
103 |         zfile.extractall(destination)
104 | 
105 |     return filepath
106 | 
107 | 
def _parse_compound_metadata(name, value, record):
    '''Stores a compound metadata value in the chemical of record.

    A unicode 'chebi' value has each 'CHEBI:' removed and is cut down to its
    first whitespace-separated token. The value is stored in
    record['chemical'] under the normalised name.'''
    is_chebi_text = name == 'chebi' and isinstance(value, unicode)

    if is_chebi_text:
        value = value.replace('CHEBI:', '').split()[0]

    # NOTE(review): _parse passes a name that has already been through
    # _normalise_name, so a mapped name containing ':' (such as
    # 'monoisotopic_mass:float') has its ':' replaced here - confirm that
    # this second normalisation is intended.
    key = _normalise_name(name)
    record['chemical'][key] = value
114 | 
115 | 
def _normalise_name(name):
    '''Normalises the name of a name:value pair.

    Returns the _NAME_MAP entry of name if there is one, else name with each
    ':' replaced by '_'.'''
    return _NAME_MAP[name] if name in _NAME_MAP else name.replace(':', '_')
122 | 


--------------------------------------------------------------------------------
/sbcdb/chemical_utils.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | SYNBIOCHEM-DB (c) University of Manchester 2015
  3 | 
  4 | SYNBIOCHEM-DB is licensed under the MIT License.
  5 | 
  6 | To view a copy of this license, visit <http://opensource.org/licenses/MIT/>.
  7 | 
  8 | @author:  neilswainston
  9 | '''
 10 | import math
 11 | import uuid
 12 | 
 13 | from libchebipy._chebi_entity import ChebiEntity, ChebiException
 14 | 
 15 | from sbcdb import namespace_utils as ns_utils
 16 | from synbiochem.utils import chem_utils
 17 | 
 18 | 
 19 | class ChemicalManager(object):
 20 |     '''Class to implement a manager of Chemical data.'''
 21 | 
 22 |     def __init__(self, array_delimiter):
 23 |         '''Constructor.'''
 24 |         self.__array_delimiter = array_delimiter
 25 |         self.__nodes = {}
 26 |         self.__chem_ids = {}
 27 | 
 28 |     def write_files(self, writer):
 29 |         '''Write neo4j import files.'''
 30 |         return writer.write_nodes(self.__nodes.values(), 'Chemical')
 31 | 
 32 |     def add_chemical(self, properties):
 33 |         '''Adds a chemical to the collection of nodes, ensuring uniqueness.'''
 34 |         chem_id, chebi_ent = self.__get_chem_id(properties)
 35 | 
 36 |         if 'charge:float' in properties:
 37 |             charge = properties.pop('charge:float')
 38 | 
 39 |             if not math.isnan(charge):
 40 |                 properties['charge:float'] = int(charge)
 41 | 
 42 |         if chem_id not in self.__nodes:
 43 |             properties[':LABEL'] = 'Chemical'
 44 |             properties['id:ID(Chemical)'] = chem_id
 45 |             properties['source'] = 'chebi' if 'chebi' in properties else 'mnx'
 46 | 
 47 |             _normalise_mass(properties)
 48 |             self.__nodes[chem_id] = properties
 49 |         else:
 50 |             self.__nodes[chem_id].update(properties)
 51 | 
 52 |         return chem_id, chebi_ent
 53 | 
 54 |     def get_props(self, prop, default=None):
 55 |         '''Gets all chem_ids to property as a dict.'''
 56 |         return {key: self.__nodes[chem_id].get(prop, default)
 57 |                 for key, chem_id in self.__chem_ids.iteritems()}
 58 | 
 59 |     def get_prop(self, chem_id, prop, default=None):
 60 |         '''Gets a property.'''
 61 |         return self.__nodes[self.__chem_ids[chem_id]].get(prop, default)
 62 | 
 63 |     def __get_chem_id(self, properties):
 64 |         '''Manages chemical id mapping.'''
 65 |         chebi_id = properties.get('chebi', None)
 66 |         chebi_ent = None
 67 | 
 68 |         if chebi_id:
 69 |             try:
 70 |                 chebi_id, chebi_ent = _get_chebi_data(chebi_id, properties,
 71 |                                                       self.__array_delimiter)
 72 |             except ChebiException, err:
 73 |                 properties.pop('chebi')
 74 |                 chebi_id = None
 75 |                 print err
 76 |             except ValueError, err:
 77 |                 properties.pop('chebi')
 78 |                 chebi_id = None
 79 |                 print err
 80 | 
 81 |         mnx_id = properties.get('mnx', None)
 82 |         inchi_id = properties.get('inchi', None)
 83 | 
 84 |         if chebi_id:
 85 |             self.__chem_ids[chebi_id] = chebi_id
 86 | 
 87 |             if inchi_id:
 88 |                 self.__chem_ids[inchi_id] = chebi_id
 89 | 
 90 |             if mnx_id:
 91 |                 self.__chem_ids[mnx_id] = chebi_id
 92 | 
 93 |             return chebi_id, chebi_ent
 94 | 
 95 |         if inchi_id:
 96 |             chem_id = self.__chem_ids.get(inchi_id, None)
 97 | 
 98 |             if chem_id:
 99 |                 return chem_id, None
100 | 
101 |         if mnx_id:
102 |             chem_id = self.__chem_ids.get(mnx_id, None)
103 | 
104 |             if chem_id:
105 |                 return chem_id, None
106 | 
107 |             if inchi_id:
108 |                 self.__chem_ids[inchi_id] = mnx_id
109 | 
110 |             self.__chem_ids[mnx_id] = mnx_id
111 |             return mnx_id, None
112 | 
113 |         new_id = str(uuid.uuid4())
114 |         self.__chem_ids[inchi_id] = new_id
115 | 
116 |         return new_id, None
117 | 
118 | 
def _get_chebi_data(chebi_id, properties, array_delimiter):
    '''Looks up a ChEBI entity and copies its data into properties.

    properties is updated in place with the ChEBI id (that of the parent if
    the entity has one), formula, charge, InChI and SMILES (where present),
    the name, all names joined by array_delimiter, and the database
    accessions whose type resolves to a namespace.

    Returns a tuple of the ChEBI id and the ChebiEntity.'''
    chebi_ent = ChebiEntity(str(chebi_id))

    # Prefer the id of the parent, where the entity has one:
    chebi_id = chebi_ent.get_parent_id() or chebi_ent.get_id()
    properties['chebi'] = chebi_id

    fetched = [('formula', chebi_ent.get_formula()),
               ('charge:float', chebi_ent.get_charge()),
               ('inchi', chebi_ent.get_inchi()),
               ('smiles', chebi_ent.get_smiles())]

    for key, value in fetched:
        if key == 'charge:float':
            # The charge is kept unless it is NaN:
            keep = not math.isnan(value)
        else:
            keep = bool(value)

        if keep:
            properties[key] = value

    properties['name'] = chebi_ent.get_name()

    all_names = [name.get_name() for name in chebi_ent.get_names()]
    all_names.append(chebi_ent.get_name())
    properties['names:string[]'] = array_delimiter.join(all_names)

    for db_acc in chebi_ent.get_database_accessions():
        namespace = ns_utils.resolve_namespace(db_acc.get_type(), True)

        if namespace is None:
            continue

        properties[namespace] = db_acc.get_accession_number()

    return chebi_id, chebi_ent
161 | 
162 | 
def _normalise_mass(properties):
    '''Replaces any supplied mass by one derived from the chemical formula.

    'mass:float' is always removed from properties. If a formula is present
    (and not None), the value of chem_utils.get_molecular_mass for that
    formula is stored as 'monoisotopic_mass:float', unless it is NaN.'''
    properties.pop('mass:float', None)

    formula = properties.get('formula')

    if formula is None:
        return

    mono_mass = chem_utils.get_molecular_mass(formula)

    if math.isnan(mono_mass):
        return

    properties['monoisotopic_mass:float'] = mono_mass
173 | 


--------------------------------------------------------------------------------
/sbcdb/mnxref_utils.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | SYNBIOCHEM-DB (c) University of Manchester 2015
  3 | 
  4 | SYNBIOCHEM-DB is licensed under the MIT License.
  5 | 
  6 | To view a copy of this license, visit <http://opensource.org/licenses/MIT/>.
  7 | 
  8 | @author:  neilswainston
  9 | '''
 10 | # pylint: disable=no-member
 11 | # pylint: disable=too-few-public-methods
 12 | # pylint: disable=too-many-locals
 13 | from collections import Counter
 14 | import csv
 15 | import itertools
 16 | import math
 17 | import re
 18 | import urllib2
 19 | 
 20 | import numpy
 21 | from subliminal import balance
 22 | 
 23 | from sbcdb import namespace_utils
 24 | from synbiochem.utils import chem_utils
 25 | 
 26 | 
 27 | _METANETX_URL = 'http://metanetx.org/cgi-bin/mnxget/mnxref/'
 28 | 
 29 | 
 30 | class MnxRefReader(object):
 31 |     '''Class to read MnxRef data from the chem_prop.tsv, the chem_xref.tsv and
 32 |     reac_prop.tsv files.'''
 33 | 
 34 |     def __init__(self, source=_METANETX_URL):
 35 |         self.__source = source
 36 |         self.__mnx_id_patt = re.compile(r'(MNX[MR])(\d+)')
 37 |         self.__chem_data = {}
 38 |         self.__reac_data = {}
 39 | 
 40 |     def get_chem_data(self):
 41 |         '''Gets chemical data.'''
 42 |         if not self.__chem_data:
 43 |             self.__read_chem_prop()
 44 |             self.__read_xref('chem_xref.tsv', self.__chem_data, True)
 45 | 
 46 |         return self.__chem_data
 47 | 
 48 |     def get_reac_data(self):
 49 |         '''Gets reaction data.'''
 50 |         if not self.__reac_data:
 51 |             self.__read_reac_prop()
 52 |             self.__read_xref('reac_xref.tsv', self.__reac_data, False)
 53 | 
 54 |         return self.__reac_data
 55 | 
 56 |     def __read_chem_prop(self):
 57 |         '''Read chemical properties and create Nodes.'''
 58 |         chem_prop_keys = ['id', 'name', 'formula', 'charge:float',
 59 |                           'mass:float', 'inchi', 'smiles', 'source']
 60 | 
 61 |         for values in self.__read_data('chem_prop.tsv'):
 62 |             if not values[0].startswith('#'):
 63 |                 values[0] = self.__parse_id(values[0])
 64 |                 values[7] = self.__parse_id(values[7])
 65 |                 props = dict(zip(chem_prop_keys, values))
 66 |                 props.pop('source')
 67 |                 _convert_to_float(props, 'charge:float')
 68 |                 _convert_to_float(props, 'mass:float')
 69 |                 props = {key: value for key, value in props.iteritems()
 70 |                          if value != ''}
 71 |                 self.__chem_data[values[0]] = props
 72 | 
 73 |     def __read_xref(self, filename, data, chemical):
 74 |         '''Read xrefs and update Nodes.'''
 75 |         xref_keys = ['XREF', 'MNX_ID', 'Evidence', 'Description']
 76 | 
 77 |         for values in self.__read_data(filename):
 78 |             if not values[0].startswith('#'):
 79 |                 xrefs = dict(zip(xref_keys[:len(values)], values))
 80 |                 evidence = xrefs.get('Evidence', 'identity')
 81 | 
 82 |                 if evidence == 'identity' or evidence == 'structural':
 83 |                     xrefs['MNX_ID'] = self.__parse_id(xrefs['MNX_ID'])
 84 |                     xref = xrefs['XREF'].split(':')
 85 | 
 86 |                     if xrefs['MNX_ID'] in data:
 87 |                         entry = data[xrefs['MNX_ID']]
 88 |                         self.__add_xref(xref, entry, chemical)
 89 | 
 90 |     def __add_xref(self, xref, entry, chemical):
 91 |         '''Adds an xref.'''
 92 |         namespace = namespace_utils.resolve_namespace(xref[0],
 93 |                                                       chemical)
 94 | 
 95 |         if namespace is not None:
 96 |             xref[1] = self.__parse_id(xref[1])
 97 | 
 98 |             entry[namespace] = xref[1] \
 99 |                 if namespace != 'chebi' \
100 |                 else 'CHEBI:' + xref[1]
101 | 
102 |     def __read_reac_prop(self):
103 |         '''Read reaction properties and create Nodes.'''
104 |         reac_prop_keys = ['id', 'equation', 'description', 'balance', 'ec',
105 |                           'Source']
106 | 
107 |         for values in self.__read_data('reac_prop.tsv'):
108 |             if not values[0].startswith('#'):
109 |                 values[0] = self.__parse_id(values[0])
110 |                 values[5] = self.__parse_id(values[5])
111 | 
112 |                 props = dict(zip(reac_prop_keys, values))
113 |                 props.pop('Source')
114 | 
115 |                 try:
116 |                     participants = chem_utils.parse_equation(
117 |                         props.pop('equation'))
118 | 
119 |                     for participant in participants:
120 |                         participant[0] = self.__parse_id(participant[0])
121 | 
122 |                         if participant[0] not in self.__chem_data:
123 |                             self.__add_chem(participant[0])
124 | 
125 |                     props['reac_defn'] = participants
126 |                     self.__reac_data[values[0]] = props
127 |                 except ValueError:
128 |                     print 'WARNING: Suspected polymerisation reaction: ' + \
129 |                         values[0] + '\t' + str(props)
130 | 
131 |     def __add_chem(self, chem_id):
132 |         '''Adds a chemical with given id.'''
133 |         props = {'id': chem_id}
134 |         self.__chem_data[chem_id] = props
135 |         return props
136 | 
137 |     def __read_data(self, filename):
138 |         '''Downloads and reads tab-limited files into lists of lists of
139 |         strings.'''
140 |         return list(csv.reader(urllib2.urlopen(self.__source + filename),
141 |                                delimiter='\t'))
142 | 
143 |     def __parse_id(self, item_id):
144 |         '''Parses mnx ids.'''
145 |         matches = self.__mnx_id_patt.findall(item_id)
146 | 
147 |         for mat in matches:
148 |             return mat[0] + str(int(mat[1]))
149 | 
150 |         return item_id
151 | 
152 | 
class MnxRefLoader(object):
    '''Loads MNXref data into neo4j format.'''

    def __init__(self, chem_man, reac_man, writer):
        '''Constructor.

        chem_man and reac_man are the managers to which chemicals and
        reactions are added; writer writes the relationships.'''
        self.__chem_man = chem_man
        self.__reac_man = reac_man
        self.__writer = writer

    def load(self):
        '''Loads MnxRef data from chem_prop.tsv, chem_xref.tsv,
        reac_prop.tsv and reac_xref.tsv files.

        Returns a tuple of an empty list and a single-element list holding
        the result of writing the Reaction to Chemical relationships.'''
        reader = MnxRefReader()

        for properties in reader.get_chem_data().values():
            properties['mnx'] = properties.pop('id')
            self.__chem_man.add_chemical(properties)

        rels = self.__add_reac_nodes(reader.get_reac_data())

        return [], [self.__writer.write_rels(rels, 'Reaction', 'Chemical')]

    def __add_reac_nodes(self, reac_data):
        '''Adds the reactions to the reaction manager.

        Returns the list of Reaction to Chemical relationships, each being
        'has_cofactor' or 'has_reactant' and carrying the stoichiometry.'''
        reac_id_def = {}

        for properties in reac_data.values():
            reac_def = []
            mnx_id = properties.pop('id')

            # Remove description (may be inconsistent with balanced
            # reaction):
            properties.pop('description', None)

            # Each term: [formula, charge, stoichiometry, chemical id]
            for prt in properties.pop('reac_defn'):
                chem_id, _ = self.__chem_man.add_chemical({'mnx': prt[0]})

                reac_def.append([self.__chem_man.get_prop(prt[0], 'formula'),
                                 self.__chem_man.get_prop(prt[0],
                                                          'charge:float', 0),
                                 prt[1],
                                 chem_id])

            # Balancing is only attempted if every formula is known:
            if all(values[0] is not None for values in reac_def):
                # NOTE(review): balanced_def is presumably laid out as
                # reac_def - confirm against subliminal's balance_reac.
                balanced, _, balanced_def = balance.balance_reac(reac_def)
                properties['balance'] = balanced
            else:
                properties['balance'] = 'unknown'
                balanced_def = reac_def

            reac_id = self.__reac_man.add_reaction('mnx', mnx_id,
                                                   properties)
            reac_id_def[reac_id] = balanced_def

        chem_id_mass = self.__chem_man.get_props('monoisotopic_mass:float',
                                                 float('NaN'))

        # Sets are used, as membership is tested for every participant (and
        # every reactant / product pair) of every reaction.
        # Assume mass < CO2 = cofactor:
        cofactors = set(chem_id
                        for chem_id, mass in chem_id_mass.items()
                        if mass > 0 and mass < 44)

        cofactor_pairs = set(_calc_cofactors(reac_id_def.values(),
                                             cofactors))
        rels = []

        for reac_id, defn in reac_id_def.items():
            reactants = [term[3] for term in defn if term[2] < 0]
            products = [term[3] for term in defn if term[2] > 0]

            # Set metabolites as cofactors:
            reac_cofactors = set(term[3] for term in defn
                                 if term[3] in cofactors)

            # Set pairs as cofactors:
            for pair in itertools.product(reactants, products):
                if tuple(sorted(pair)) in cofactor_pairs:
                    reac_cofactors.update(pair)

            for term in defn:
                rels.append([reac_id,
                             'has_cofactor' if term[3] in reac_cofactors
                             else 'has_reactant',
                             term[3],
                             {'stoichiometry:float': term[2]}])

        return rels
239 | 
240 | 
def _calc_cofactors(reaction_defs, cofactors, cutoff=0.8):
    '''Calculates cofactors.

    Counts, over all reaction definitions, how often each (sorted) pair of a
    reactant (negative third term) and a product (positive third term)
    occurs, ignoring the chemicals in cofactors, and returns the pairs that
    pass _filter at the given cutoff.'''
    # Membership is tested for every participant of every reaction, so a
    # constant-time lookup is used rather than searching a list each time:
    known_cofactors = frozenset(cofactors)
    pairs = Counter()

    # Calculate all reactant / product pairs...
    for reaction_def in reaction_defs:
        reactants = [term[3] for term in reaction_def
                     if term[2] < 0 and term[3] not in known_cofactors]
        products = [term[3] for term in reaction_def
                    if term[2] > 0 and term[3] not in known_cofactors]

        pairs.update(tuple(sorted(pair))
                     for pair in itertools.product(reactants, products))

    return _filter(pairs, cutoff)
256 | 
257 | 
def _filter(counter, cutoff):
    '''Filter counter items according to cutoff.

    The counts are binned into a histogram (count -> number of items having
    that count), to which a straight line is fitted on a log-log scale. With
    m and b being the slope and intercept of that line, the items returned
    are those whose count exceeds exp(cutoff * -b / m).

    Returns the list of keys passing the filter, which is empty for an empty
    counter.'''
    # Nothing to fit (and nothing to return) without any items:
    if not counter:
        return []

    # Count occurences of pairs, then bin into a histogram...
    hist_counter = Counter(counter.values())

    # Fit straight-line to histogram log-log plot and filter...
    x_val, y_val = zip(*hist_counter.items())
    m_val, b_val = numpy.polyfit(numpy.log(x_val), numpy.log(y_val), 1)

    # The threshold is the same for every item, so is calculated once:
    threshold = math.exp(cutoff * -b_val / m_val)

    return [key for key, count in counter.items() if count > threshold]
269 | 
270 | 
def _convert_to_float(dictionary, key):
    '''Converts the value of key in dictionary to a float, in place.

    A value of 'NA' becomes NaN. If the key is absent, or its value is falsy
    (such as an empty string), the key is removed instead.'''
    value = dictionary.get(key, None)

    if not value:
        # Remove key:
        dictionary.pop(key, None)
        return

    dictionary[key] = float('NaN' if value == 'NA' else value)
280 | 


--------------------------------------------------------------------------------