├── neo4j
├── .gitignore
└── input
│ └── extension_script.sh
├── requirements.txt
├── README.md
├── .gitignore
├── sbcdb
├── __init__.py
├── test
│ ├── __init__.py
│ ├── test_mnxref_utils.py
│ └── test_enzyme_utils.py
├── index.py
├── chebi_utils.py
├── rhea_utils.py
├── namespace_utils.py
├── build.py
├── init.cql
├── utils.py
├── enzyme_utils.py
├── ncbi_taxonomy_utils.py
├── kegg_utils.py
├── reaction_utils.py
├── spectra_utils.py
├── chemical_utils.py
└── mnxref_utils.py
├── start_db.sh
├── Dockerfile
└── LICENSE
/neo4j/.gitignore:
--------------------------------------------------------------------------------
1 | /csv/
2 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | ijson
2 | glpk
3 | libchebipy
4 | subliminal-py
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # biochem4j
2 | biochem4j: integrated and extensible biochemical knowledge through graph databases
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /.project
2 | /.pydevproject
3 | /neo4j/input/nodes
4 | /neo4j/input/rels
5 | **/*.log
6 | **/*.pyc
7 | **/.DS_Store
--------------------------------------------------------------------------------
/sbcdb/__init__.py:
--------------------------------------------------------------------------------
1 | '''
2 | SYNBIOCHEM-DB (c) University of Manchester 2015
3 |
4 | SYNBIOCHEM-DB is licensed under the MIT License.
5 |
6 | To view a copy of this license, visit <https://opensource.org/licenses/MIT>.
7 |
8 | @author: neilswainston
9 | '''
10 |
--------------------------------------------------------------------------------
/sbcdb/test/__init__.py:
--------------------------------------------------------------------------------
1 | '''
2 | synbiochem (c) University of Manchester 2015
3 |
4 | synbiochem is licensed under the MIT License.
5 |
6 | To view a copy of this license, visit <https://opensource.org/licenses/MIT>.
7 |
8 | @author: neilswainston
9 | '''
10 |
--------------------------------------------------------------------------------
/start_db.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
#
# Starts a detached Neo4j container for biochem4j.
#
# Host ports 80, 443 and 7687 are mapped to container ports 7474, 7473 and
# 7687 respectively. The neo4j/input directory that sits alongside this
# script is mounted at /input, authentication is disabled, the database is
# set read-only, and /input/extension_script.sh is registered as the
# container's extension script.

# Resolve the directory containing this script. Using '&&' (rather than ';')
# and exiting on failure prevents a failed 'cd' from silently substituting
# the caller's working directory.
DIR=$(cd "$(dirname "$0")" && pwd) || exit 1

# "$DIR/..." is quoted so that a checkout path containing spaces is passed
# to docker as a single --volume argument.
docker run \
    --detach \
    --user neo4j \
    --publish=80:7474 \
    --publish=443:7473 \
    --publish=7687:7687 \
    --volume="$DIR/neo4j/input:/input" \
    --env=NEO4J_AUTH=none \
    --env=NEO4J_dbms_read__only=true \
    --env=EXTENSION_SCRIPT=/input/extension_script.sh \
    neo4j
--------------------------------------------------------------------------------
/neo4j/input/extension_script.sh:
--------------------------------------------------------------------------------
1 | rm -rf /var/lib/neo4j/data/databases/graph.db
2 |
3 | nodes=`ls -d /input/nodes/*`
4 | rels=`ls -d /input/rels/*`
5 | nodes_str=`echo $nodes | sed "s/ / --nodes /g"`
6 | rels_str=`echo $rels| sed "s/ / --relationships /g"`
7 |
8 | /var/lib/neo4j/bin/neo4j-admin \
9 | import \
10 | --nodes $nodes_str \
11 | --relationships $rels_str \
12 | --delimiter ";" \
13 | --array-delimiter "|" \
14 | --multiline-fields true
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM python:2.7

# All subsequent relative paths resolve inside the project directory:
WORKDIR /biochem4j

# Install the required system packages, removing the apt package lists
# afterwards so that they are not baked into the image layer:
RUN apt-get update \
    && apt-get install -y --no-install-recommends libgmp3-dev \
    && rm -rf /var/lib/apt/lists/*

# Download, build and install glpk. --fail makes curl emit nothing and exit
# non-zero on an HTTP error instead of piping an error page into tar:
RUN mkdir /usr/local/glpk \
    && curl --fail --silent --show-error --location \
        https://ftp.gnu.org/gnu/glpk/glpk-4.39.tar.gz \
        | tar xvzC /usr/local/glpk --strip-components=1 \
    && cd /usr/local/glpk \
    && ./configure \
    && make \
    && make install

# Install the Python requirements. Only requirements.txt is copied at this
# point, so editing project sources does not invalidate the layers above:
COPY requirements.txt ./
RUN pip install --upgrade pip \
    && pip install -r requirements.txt

# Make the project sources visible inside the container:
COPY . .

# Update paths:
ENV LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH}
ENV PYTHONPATH=${PYTHONPATH}:.

# Run the build script, unbuffered; container arguments are passed to it:
ENTRYPOINT ["python", "-u", "sbcdb/build.py"]
--------------------------------------------------------------------------------
/sbcdb/index.py:
--------------------------------------------------------------------------------
1 | '''
2 | SYNBIOCHEM-DB (c) University of Manchester 2015
3 |
4 | SYNBIOCHEM-DB is licensed under the MIT License.
5 |
6 | To view a copy of this license, visit <https://opensource.org/licenses/MIT>.
7 |
8 | @author: neilswainston
9 | '''
10 | import os
11 | import subprocess
12 | import sys
13 |
14 |
def index_db(db_loc):
    '''Index database.

    Runs each non-blank line of init.cql (the file located alongside this
    module) as a separate neo4j-shell command against the database.

    Args:
        db_loc: database location, passed to neo4j-shell via -path.
    '''
    directory = os.path.dirname(os.path.realpath(__file__))
    filename = os.path.join(directory, 'init.cql')

    # Plain 'r' rather than 'rU': the 'U' flag is deprecated (and rejected
    # by Python 3.11+), and strip() below discards any line ending anyway.
    with open(filename, 'r') as init_file:
        for line in init_file:
            statement = line.strip()

            # Skip blank lines rather than launching neo4j-shell with an
            # empty command:
            if not statement:
                continue

            params = ['neo4j-shell', '-path', db_loc, '-c', statement]
            subprocess.call(params)
24 |
25 |
def main(argv):
    '''Indexes the database whose location is the first item of argv.'''
    db_loc = argv[0]
    index_db(db_loc)


if __name__ == '__main__':
    # Pass only the command-line arguments, not the script name:
    main(sys.argv[1:])
33 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Neil Swainston
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/sbcdb/test/test_mnxref_utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | synbiochem (c) University of Manchester 2015
3 |
4 | synbiochem is licensed under the MIT License.
5 |
6 | To view a copy of this license, visit <https://opensource.org/licenses/MIT>.
7 |
8 | @author: neilswainston
9 | '''
10 | # pylint: disable=too-many-public-methods
11 | import unittest
12 |
13 | from sbcdb.mnxref_utils import MnxRefReader
14 |
15 |
class TestMnxRefReader(unittest.TestCase):
    '''Test class for MnxRefReader.'''

    def setUp(self):
        '''Reads the chemical and reaction data once for each test.'''
        unittest.TestCase.setUp(self)
        reader = MnxRefReader()
        self.__chem_data = reader.get_chem_data()
        self.__reac_data = reader.get_reac_data()

    def test_get_chem_data(self):
        '''Tests get_chem_data method.'''
        # assertEqual replaces the deprecated assertEquals alias:
        self.assertEqual(self.__chem_data['MNXM1354']['chebi'], 'CHEBI:58282')

    def test_get_reac_data(self):
        '''Tests get_reac_data method.'''
        eqn = '1 MNXM1 + 1 MNXM6 + 1 MNXM97401 = 1 MNXM5 + 1 MNXM97393'
        self.assertEqual(self.__reac_data['MNXR62989']['equation'], eqn)


if __name__ == "__main__":
    unittest.main()
38 |
--------------------------------------------------------------------------------
/sbcdb/chebi_utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | SYNBIOCHEM-DB (c) University of Manchester 2015
3 |
4 | SYNBIOCHEM-DB is licensed under the MIT License.
5 |
6 | To view a copy of this license, visit <https://opensource.org/licenses/MIT>.
7 |
8 | @author: neilswainston
9 | '''
10 | from libchebipy._chebi_entity import ChebiEntity
11 |
12 |
13 | def load(chem_manager, writer):
14 | '''Loads ChEBI data from libChEBIpy.'''
15 | chebi_ids = []
16 | rels = []
17 |
18 | _add_node('CHEBI:24431', chebi_ids, rels, chem_manager)
19 |
20 | writer.write_rels(rels, 'Chemical', 'Chemical')
21 |
22 |
23 | def _add_node(chebi_id, chebi_ids, rels, chem_manager):
24 | '''Constructs a node from libChEBI.'''
25 | if chebi_id not in chebi_ids:
26 | chebi_ids.append(chebi_id)
27 |
28 | chem_id, entity = chem_manager.add_chemical({'chebi': chebi_id})
29 |
30 | for incoming in entity.get_incomings():
31 | target_id = incoming.get_target_chebi_id()
32 |
33 | chebi_ent = ChebiEntity(target_id)
34 |
35 | if chebi_ent.get_parent_id():
36 | target_id = chebi_ent.get_parent_id()
37 |
38 | _add_node(target_id, chebi_ids, rels, chem_manager)
39 | rels.append([target_id, incoming.get_type(), chem_id])
40 |
--------------------------------------------------------------------------------
/sbcdb/test/test_enzyme_utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | synbiochem (c) University of Manchester 2015
3 |
4 | synbiochem is licensed under the MIT License.
5 |
6 | To view a copy of this license, visit <https://opensource.org/licenses/MIT>.
7 |
8 | @author: neilswainston
9 | '''
10 | # pylint: disable=too-many-public-methods
11 | import unittest
12 |
13 | from sbcdb.enzyme_utils import EnzymeManager
14 |
15 |
class TestEnzymeManager(unittest.TestCase):
    '''Test class for EnzymeManager.'''

    def setUp(self):
        '''Creates the manager used by the unthreaded test.'''
        unittest.TestCase.setUp(self)
        self.__manager = EnzymeManager()

    def test_add_uniprot_data(self):
        '''Tests add_uniprot_data method.'''
        enzyme_ids = ['P19367', 'Q2KNB7']

        # Test unthreaded:
        self.__manager.add_uniprot_data(enzyme_ids, source='test')
        self.assertEqual(len(enzyme_ids), len(self.__manager.get_nodes()))

        # Test threaded, on a fresh manager: add_uniprot_data ignores ids
        # that the manager already holds, so reusing the manager above would
        # leave the threaded path with nothing to fetch.
        threaded_manager = EnzymeManager()
        threaded_manager.add_uniprot_data(enzyme_ids, source='test',
                                          num_threads=24)
        self.assertEqual(len(enzyme_ids), len(threaded_manager.get_nodes()))


if __name__ == "__main__":
    unittest.main()
40 |
--------------------------------------------------------------------------------
/sbcdb/rhea_utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | SYNBIOCHEM-DB (c) University of Manchester 2015
3 |
4 | SYNBIOCHEM-DB is licensed under the MIT License.
5 |
6 | To view a copy of this license, visit <https://opensource.org/licenses/MIT>.
7 |
8 | @author: neilswainston
9 | '''
10 | import tempfile
11 | import urllib
12 |
13 |
__RHEA_URL = 'ftp://ftp.ebi.ac.uk/pub/databases/rhea/tsv/rhea2uniprot.tsv'


def load(reaction_manager, source=__RHEA_URL, num_threads=0):
    '''Loads Rhea data.

    Downloads the file at source, parses it into a mapping of Rhea id to
    Uniprot ids, and submits that mapping to reaction_manager.

    Args:
        reaction_manager: object providing
            add_react_to_enz(data, source, num_threads).
        source: URL of the file to download.
        num_threads: passed through to reaction_manager.
    '''
    # The context manager closes (and so removes) the temporary file once it
    # has been parsed, even if the download or the parse raises:
    with tempfile.NamedTemporaryFile() as temp_file:
        urllib.urlretrieve(source, temp_file.name)
        data = _parse(temp_file.name)

    reaction_manager.add_react_to_enz(data, 'rhea', num_threads)
24 |
25 |
26 | def _parse(filename):
27 | '''Parses file.'''
28 | data = {}
29 |
30 | with open(filename, 'r') as textfile:
31 | next(textfile)
32 |
33 | for line in textfile:
34 | tokens = line.split('\t')
35 |
36 | if len(tokens) == 4:
37 | uniprot_id = tokens[3].strip()
38 |
39 | if not tokens[0] or not tokens[2]:
40 | print ','.join(tokens)
41 |
42 | _add(data, tokens[0], uniprot_id)
43 | _add(data, tokens[2], uniprot_id)
44 |
45 | return data
46 |
47 |
48 | def _add(data, rhea_id, uniprot_id):
49 | '''Adds Rhea id and Uniprot id to data.'''
50 | if rhea_id in data:
51 | data[rhea_id].append(uniprot_id)
52 | else:
53 | data[rhea_id] = [uniprot_id]
54 |
--------------------------------------------------------------------------------
/sbcdb/namespace_utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | synbiochem (c) University of Manchester 2015
3 |
4 | synbiochem is licensed under the MIT License.
5 |
6 | To view a copy of this license, visit <https://opensource.org/licenses/MIT>.
7 |
8 | @author: neilswainston
9 | '''
# Maps the names under which chemical identifiers are supplied to the
# namespace used for them.
__CHEMICAL_NAMESPACE = {
    # Namespaces taken from identifiers.org:
    'bigg': 'bigg.metabolite',
    'CAS Registry Number': 'cas',
    'chebi': 'chebi',
    'ChemIDplus accession': 'chemidplus',
    'Chemspider accession': 'chemspider',
    'DrugBank accession': 'drugbank',
    'hmdb': 'hmdb',
    'HMDB accession': 'hmdb',
    'kegg': 'kegg.compound',
    'KEGG COMPOUND accession': 'kegg.compound',
    'KEGG DRUG accession': 'kegg.drug',
    'KEGG GLYCAN accession': 'kegg.glycan',
    'KNApSAcK accession': 'knapsack',
    'lipidmaps': 'lipidmaps',
    'LIPID MAPS instance accession': 'lipidmaps',
    'MolBase accession': 'molbase',
    'PDB accession': 'pdb',
    'PubMed citation': 'pubmed',
    'reactome': 'reactome',
    'RESID accession': 'resid',
    'seed': 'seed.compound',
    'umbbd': 'umbbd.compound',
    'UM-BBD compID': 'umbbd.compound',
    'upa': 'unipathway',
    'Wikipedia accession': 'wikipedia.en',

    # Namespaces absent from identifiers.org:
    'metacyc': 'metacyc',
    'MetaCyc accession': 'metacyc',
    'mnx': 'mnx'
}

# Maps the names under which reaction identifiers are supplied to the
# namespace used for them.
__REACTION_NAMESPACE = {
    # Namespaces taken from identifiers.org:
    'bigg': 'bigg.reaction',
    'kegg': 'kegg.reaction',
    'reactome': 'reactome',
    'rhea': 'rhea',
    'seed': 'seed',

    # Namespaces absent from identifiers.org:
    'metacyc': 'metacyc',
    'mnx': 'mnx',
}


def resolve_namespace(name, chemical):
    '''Returns the namespace registered for name, or None if there is none.

    The chemical table is consulted when chemical is truthy, otherwise the
    reaction table.
    '''
    if chemical:
        lookup = __CHEMICAL_NAMESPACE
    else:
        lookup = __REACTION_NAMESPACE

    return lookup.get(name)
62 |
--------------------------------------------------------------------------------
/sbcdb/build.py:
--------------------------------------------------------------------------------
1 | '''
2 | SYNBIOCHEM-DB (c) University of Manchester 2015
3 |
4 | SYNBIOCHEM-DB is licensed under the MIT License.
5 |
6 | To view a copy of this license, visit <https://opensource.org/licenses/MIT>.
7 |
8 | @author: neilswainston
9 | '''
10 | import multiprocessing
11 | import sys
12 |
13 | from sbcdb import chebi_utils, chemical_utils, mnxref_utils, \
14 | ncbi_taxonomy_utils, reaction_utils, rhea_utils, spectra_utils, utils
15 |
16 |
def build_csv(dest_dir, array_delimiter, num_threads):
    '''Builds the database CSV files beneath dest_dir.

    Loads, in order, NCBI Taxonomy, MNXref, ChEBI, spectrum and Rhea data,
    writing the chemical files once the spectrum data has been loaded and
    the reaction files once the Rhea data has been loaded.
    '''
    csv_writer = utils.Writer(dest_dir)

    # Organisms:
    print('Parsing NCBI Taxonomy')
    ncbi_taxonomy_utils.load(csv_writer, array_delimiter)

    # Managers shared by the chemical and reaction loaders below:
    chemicals = chemical_utils.ChemicalManager(
        array_delimiter=array_delimiter)
    reactions = reaction_utils.ReactionManager()

    print('Parsing MNXref')
    mnxref_utils.MnxRefLoader(chemicals, reactions, csv_writer).load()

    print('Parsing ChEBI')
    chebi_utils.load(chemicals, csv_writer)

    # Spectra:
    print('Parsing spectrum data')
    spectra_utils.load(csv_writer, chemicals,
                       array_delimiter=array_delimiter)

    chemicals.write_files(csv_writer)

    # Reactions / Enzymes / Organisms. Only Rhea is loaded here; KEGG
    # loading is not invoked.
    print('Parsing Rhea')
    rhea_utils.load(reactions, num_threads=num_threads)

    reactions.write_files(csv_writer)
51 |
52 |
def main(args):
    '''Runs the build.

    args[0] is the destination directory and args[1] the array delimiter.
    The optional args[2] is either an integer thread count or 'True' (use
    one thread per CPU); anything else leaves the thread count at 0.
    '''
    threads_arg = args[2] if len(args) > 2 else '0'

    try:
        num_threads = int(threads_arg)
    except ValueError:
        if threads_arg == 'True':
            num_threads = multiprocessing.cpu_count()
        else:
            num_threads = 0

    print('Running build with %d threads' % num_threads)

    build_csv(args[0], args[1], num_threads)


if __name__ == '__main__':
    # Pass only the command-line arguments, not the script name:
    main(sys.argv[1:])
71 |
--------------------------------------------------------------------------------
/sbcdb/init.cql:
--------------------------------------------------------------------------------
1 | CREATE CONSTRAINT ON (n:Organism) ASSERT n.taxonomy IS UNIQUE;
2 | CREATE CONSTRAINT ON (n:Enzyme) ASSERT n.entry IS UNIQUE;
3 | CREATE CONSTRAINT ON (n:Enzyme) ASSERT n.uniprot IS UNIQUE;
4 | CREATE CONSTRAINT ON (n:Reaction) ASSERT n.`bigg.reaction` IS UNIQUE;
5 | CREATE CONSTRAINT ON (n:Reaction) ASSERT n.id IS UNIQUE;
6 | CREATE CONSTRAINT ON (n:Reaction) ASSERT n.`kegg.reaction` IS UNIQUE;
7 | CREATE CONSTRAINT ON (n:Reaction) ASSERT n.metacyc IS UNIQUE;
8 | CREATE CONSTRAINT ON (n:Reaction) ASSERT n.mnx IS UNIQUE;
9 | CREATE CONSTRAINT ON (n:Reaction) ASSERT n.reactome IS UNIQUE;
10 | CREATE CONSTRAINT ON (n:Reaction) ASSERT n.rhea IS UNIQUE;
11 | CREATE CONSTRAINT ON (n:Reaction) ASSERT n.seed IS UNIQUE;
12 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`bigg.metabolite` IS UNIQUE;
13 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.cas IS UNIQUE;
14 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.chebi IS UNIQUE;
15 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.chemidplus IS UNIQUE;
16 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.chemspider IS UNIQUE;
17 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.drugbank IS UNIQUE;
18 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.hmdb IS UNIQUE;
19 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.id IS UNIQUE;
20 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`kegg.compound` IS UNIQUE;
21 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`kegg.drug` IS UNIQUE;
22 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`kegg.glycan` IS UNIQUE;
23 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.knapsack IS UNIQUE;
24 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.lipidmaps IS UNIQUE;
25 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.metacyc IS UNIQUE;
26 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.mnx IS UNIQUE;
27 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.molbase IS UNIQUE;
28 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.pdb IS UNIQUE;
29 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.pubmed IS UNIQUE;
30 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.reactome IS UNIQUE;
31 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.resid IS UNIQUE;
32 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`seed.compound` IS UNIQUE;
33 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`umbbd.compound` IS UNIQUE;
34 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.unipathway IS UNIQUE;
35 | CREATE CONSTRAINT ON (n:Chemical) ASSERT n.`wikipedia.en` IS UNIQUE;
--------------------------------------------------------------------------------
/sbcdb/utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | synbiochem (c) University of Manchester 2016
3 |
4 | synbiochem is licensed under the MIT License.
5 |
6 | To view a copy of this license, visit <https://opensource.org/licenses/MIT>.
7 |
8 | @author: neilswainston
9 | '''
10 | # pylint: disable=invalid-name
11 | # pylint: disable=too-many-arguments
12 | import os
13 | from shutil import rmtree
14 |
15 | import pandas as pd
16 |
17 |
class Writer(object):
    '''Writes node and relationship CSV files for biochem4j.'''

    def __init__(self, dest_dir):
        '''Creates empty 'nodes' and 'rels' directories beneath dest_dir,
        deleting any that already exist.'''
        root = os.path.abspath(dest_dir)
        self.__nodes_dir = os.path.join(root, 'nodes')
        self.__rels_dir = os.path.join(root, 'rels')

        for directory in (self.__nodes_dir, self.__rels_dir):
            if os.path.exists(directory):
                rmtree(directory)

            os.makedirs(directory)

    def write_nodes(self, nodes, group, separator=';'):
        '''Writes nodes to <group>.csv in the nodes directory.

        Columns that are empty for every node are dropped. Returns the name
        of the file written, or None if nodes is empty.
        '''
        if not nodes:
            return None

        frame = pd.DataFrame(nodes).dropna(axis=1, how='all')

        path = os.path.join(self.__nodes_dir, group + '.csv')
        frame.to_csv(path, index=False, encoding='utf-8', sep=separator)

        return path

    def write_rels(self, rels, group_start, group_end, separator=';'):
        '''Writes relationships to <group_start>_<group_end>.csv in the rels
        directory.

        Each relationship is [start id, type, end id], optionally followed
        by a dict of properties; whether properties are present is decided
        from the first relationship alone. Returns the name of the file
        written, or None if rels is empty.
        '''
        if not rels:
            return None

        has_properties = len(rels[0]) > 3

        header = [':START_ID(' + group_start + ')',
                  ':TYPE',
                  ':END_ID(' + group_end + ')']

        if has_properties:
            header.append('PROPERTIES')

        frame = pd.DataFrame(rels, columns=header)

        if has_properties:
            # Expand the property dicts into one column per property:
            properties = pd.DataFrame(list(frame['PROPERTIES']))
            frame = frame.drop('PROPERTIES', axis=1).join(properties)

        path = os.path.join(self.__rels_dir,
                            group_start + '_' + group_end + '.csv')
        frame.to_csv(path, index=False, encoding='utf-8', sep=separator)

        return path
72 |
--------------------------------------------------------------------------------
/sbcdb/enzyme_utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | SYNBIOCHEM-DB (c) University of Manchester 2015
3 |
4 | SYNBIOCHEM-DB is licensed under the MIT License.
5 |
6 | To view a copy of this license, visit <https://opensource.org/licenses/MIT>.
7 |
8 | @author: neilswainston
9 | '''
10 | from synbiochem.utils import seq_utils
11 |
12 |
class EnzymeManager(object):
    '''Collects Enzyme nodes and Organism-to-Enzyme relationships.'''

    def __init__(self):
        '''Constructor.'''
        self.__nodes = {}
        self.__org_enz_rels = []

    def get_nodes(self):
        '''Returns the enzyme nodes collected so far.'''
        return self.__nodes.values()

    def get_org_enz_rels(self):
        '''Returns the organism-to-enzyme relationships collected so far.'''
        return self.__org_enz_rels

    def add_uniprot_data(self, enzyme_ids, source, num_threads=0):
        '''Fetches Uniprot data for those enzyme_ids not yet held, adding an
        Enzyme node for each result and, where an organism id is returned,
        an 'expresses' relationship tagged with source.'''
        fields = ['entry name', 'protein names', 'organism-id', 'ec']

        # Only query ids for which no node exists yet:
        new_ids = [enzyme_id for enzyme_id in enzyme_ids
                   if enzyme_id not in self.__nodes]

        uniprot_values = seq_utils.get_uniprot_values(new_ids, fields,
                                                      batch_size=512,
                                                      verbose=True,
                                                      num_threads=num_threads)

        for uniprot_id, values in uniprot_values.items():
            node = {':LABEL': 'Enzyme',
                    'uniprot:ID(Enzyme)': uniprot_id}
            self.__nodes[uniprot_id] = node

            # The organism id is removed from the returned values:
            organism_id = values.pop('Organism ID', None)

            if 'Entry name' in values:
                node['entry'] = values['Entry name']

            if 'Protein names' in values:
                names = values['Protein names']
                node['names'] = names

                # The first of the names doubles as the node's name:
                if names:
                    node['name'] = names[0]

            if 'EC number' in values:
                node['ec-code'] = values['EC number']

            if organism_id:
                self.__org_enz_rels.append([organism_id, 'expresses',
                                            uniprot_id, {'source': source}])
63 |
--------------------------------------------------------------------------------
/sbcdb/ncbi_taxonomy_utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | SYNBIOCHEM-DB (c) University of Manchester 2015
3 |
4 | SYNBIOCHEM-DB is licensed under the MIT License.
5 |
6 | To view a copy of this license, visit <https://opensource.org/licenses/MIT>.
7 |
8 | @author: neilswainston
9 | '''
10 | import os
11 | import sys
12 | import tarfile
13 | import tempfile
14 | import urllib
15 |
16 |
__NCBITAXONOMY_URL = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'


def load(writer, array_delimiter, source=__NCBITAXONOMY_URL):
    '''Downloads the NCBI Taxonomy from source, then writes its Organism
    nodes and Organism-to-Organism relationships through writer.'''
    nodes_file, names_file = _get_ncbi_taxonomy_files(source)

    organisms, parent_rels = _parse_nodes(nodes_file, array_delimiter)

    # Adds names to the organism nodes in place:
    _parse_names(organisms, names_file, array_delimiter)

    writer.write_nodes(organisms.values(), 'Organism')
    writer.write_rels(parent_rels, 'Organism', 'Organism')
28 |
29 |
def _get_ncbi_taxonomy_files(source):
    '''Downloads and extracts NCBI Taxonomy files.

    Args:
        source: URL of the gzipped tar archive to download.

    Returns:
        tuple of the paths of the extracted nodes.dmp and names.dmp files.
        They live in a newly created temporary directory, which is left in
        place for the caller to read from.
    '''
    wanted = ('nodes.dmp', 'names.dmp')

    # Extract into a private directory rather than the shared system temp
    # directory, so that nothing already there is overwritten:
    temp_dir = tempfile.mkdtemp()

    # Both context managers release their handles even if the download or
    # the extraction raises:
    with tempfile.NamedTemporaryFile() as temp_gzipfile:
        urllib.urlretrieve(source, temp_gzipfile.name)

        with tarfile.open(temp_gzipfile.name, 'r:gz') as temp_tarfile:
            # Unpack only the two files that are used; this also ignores any
            # member whose path points outside temp_dir:
            members = [member for member in temp_tarfile.getmembers()
                       if os.path.normpath(member.name) in wanted]
            temp_tarfile.extractall(temp_dir, members=members)

    return os.path.join(temp_dir, 'nodes.dmp'), \
        os.path.join(temp_dir, 'names.dmp')
44 |
45 |
46 | def _parse_nodes(filename, array_delimiter):
47 | '''Parses nodes file.'''
48 | nodes = {}
49 | rels = []
50 |
51 | with open(filename, 'r') as textfile:
52 | for line in textfile:
53 | tokens = [x.strip() for x in line.split('|')]
54 | tax_id = tokens[0]
55 |
56 | if tax_id != '1':
57 | rels.append([tax_id, 'is_a', tokens[1]])
58 |
59 | nodes[tax_id] = {'taxonomy:ID(Organism)': tax_id,
60 | ':LABEL':
61 | 'Organism' + array_delimiter + tokens[2]}
62 |
63 | return nodes, rels
64 |
65 |
66 | def _parse_names(nodes, filename, array_delimiter):
67 | '''Parses names file.'''
68 |
69 | with open(filename, 'r') as textfile:
70 | for line in textfile:
71 | tokens = [x.strip() for x in line.split('|')]
72 | node = nodes[tokens[0]]
73 |
74 | if 'name' not in node:
75 | node['name'] = tokens[1]
76 | node['names:string[]'] = set([node['name']])
77 | else:
78 | node['names:string[]'].add(tokens[1])
79 |
80 | for _, node in nodes.iteritems():
81 | if 'names:string[]' in node:
82 | node['names:string[]'] = \
83 | array_delimiter.join(node['names:string[]'])
84 |
85 |
def main(argv):
    '''main method

    Passes the items of argv to load as positional arguments.
    '''
    # NOTE(review): load calls writer.write_nodes / writer.write_rels on its
    # first argument, so the plain strings received from the command line
    # presumably cannot serve as the writer -- confirm how this entry point
    # is meant to be invoked.
    load(*argv)


if __name__ == "__main__":
    # Pass only the command-line arguments, not the script name:
    main(sys.argv[1:])
93 |
--------------------------------------------------------------------------------
/sbcdb/kegg_utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | SYNBIOCHEM-DB (c) University of Manchester 2015
3 |
4 | SYNBIOCHEM-DB is licensed under the MIT License.
5 |
6 | To view a copy of this license, visit <https://opensource.org/licenses/MIT>.
7 |
8 | @author: neilswainston
9 | '''
10 | from collections import defaultdict
11 | import urllib2
12 |
13 | from synbiochem.utils import thread_utils
14 |
15 |
def load(reaction_manager, organisms=None, num_threads=0):
    '''Builds a KEGG reaction to Uniprot mapping and submits it to
    reaction_manager.

    Where organisms is None, the organism list is fetched from KEGG and
    sorted. Reactions are linked to Uniprot ids by way of EC terms and
    genes.
    '''
    if organisms is None:
        listing = urllib2.urlopen('http://rest.kegg.jp/list/organism')
        organisms = sorted([line.split()[1] for line in listing])

    # EC term -> genes, and gene -> Uniprot ids:
    ec_genes, gene_uniprots = _get_gene_data(organisms, num_threads)

    # KEGG reaction -> EC terms:
    reac_to_ecs = _parse_url('http://rest.kegg.jp/link/ec/reaction')

    data = defaultdict(list)

    for kegg_reac, ec_terms in reac_to_ecs.items():
        for ec_term in ec_terms:
            # get() leaves the defaultdict untouched for unknown EC terms:
            for gene in ec_genes.get(ec_term, ()):
                if gene not in gene_uniprots:
                    continue

                # The first three characters of each id are dropped
                # (presumably a database prefix -- confirm):
                uniprots = [val[3:] for val in gene_uniprots[gene]]
                data[kegg_reac[3:]].extend(uniprots)

    reaction_manager.add_react_to_enz(data, 'kegg.reaction', num_threads)
41 |
42 |
43 | def _get_gene_data(organisms, num_threads):
44 | '''Gets gene data.'''
45 | ec_genes = defaultdict(list)
46 | gene_uniprots = defaultdict(list)
47 |
48 | if num_threads:
49 | thread_pool = thread_utils.ThreadPool(num_threads)
50 |
51 | for org in organisms:
52 | thread_pool.add_task(_parse_organism, org, ec_genes, gene_uniprots)
53 |
54 | thread_pool.wait_completion()
55 | else:
56 | for org in organisms:
57 | _parse_organism(org, ec_genes, gene_uniprots)
58 |
59 | return ec_genes, gene_uniprots
60 |
61 |
def _parse_organism(org, ec_genes, gene_uniprots):
    '''Fetches the KEGG enzyme links and Uniprot conversions of a single
    organism, extending ec_genes and gene_uniprots in place.'''
    print('KEGG: loading ' + org)

    org_code = org.lower()

    # Each URL is paired with the mapping that its entries are merged into:
    sources = (
        ('http://rest.kegg.jp/link/' + org_code + '/enzyme', ec_genes),
        ('http://rest.kegg.jp/conv/uniprot/' + org_code, gene_uniprots))

    for url, target in sources:
        for key, values in _parse_url(url).items():
            target[key].extend(values)
73 |
74 |
def _parse_url(url, attempts=16):
    '''Parses url to form key to list of values dictionary.

    Each line of the response with at least two whitespace-separated tokens
    adds its second token to the list held against its first token.

    Args:
        url: the URL to fetch.
        attempts: maximum number of times to try fetching url.

    Returns:
        defaultdict of lists; empty if every attempt raised URLError.
    '''
    for _ in range(attempts):
        # Start every attempt from scratch, so that an attempt which fails
        # part-way through leaves no entries for the retry to duplicate:
        data = defaultdict(list)

        try:
            for line in urllib2.urlopen(url):
                tokens = line.split()

                if len(tokens) > 1:
                    data[tokens[0]].append(tokens[1])

            return data
        except urllib2.URLError as err:
            # Report the failure, then try again:
            print('\t'.join([url, str(err)]))

    # Every attempt failed (or attempts was not positive):
    return defaultdict(list)
93 |
--------------------------------------------------------------------------------
/sbcdb/reaction_utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | SYNBIOCHEM-DB (c) University of Manchester 2015
3 |
4 | SYNBIOCHEM-DB is licensed under the MIT License.
5 |
6 | To view a copy of this license, visit <https://opensource.org/licenses/MIT>.
7 |
8 | @author: neilswainston
9 | '''
10 | from sbcdb.enzyme_utils import EnzymeManager
11 |
12 |
class ReactionManager(object):
    '''Collects Reaction nodes and their relationships to Enzymes.'''

    def __init__(self):
        '''Constructor.'''
        self.__nodes = {}
        self.__reac_ids = {}
        self.__reac_enz_rels = []
        self.__org_enz_rels = []
        self.__enz_man = EnzymeManager()

    def write_files(self, writer):
        '''Writes the Reaction and Enzyme nodes, then the Reaction-Enzyme
        and Organism-Enzyme relationships, returning a pair of lists: the
        results of the node writes and of the relationship writes.'''
        node_files = [
            writer.write_nodes(self.__nodes.values(), 'Reaction'),
            writer.write_nodes(self.__enz_man.get_nodes(), 'Enzyme')]

        rel_files = [
            writer.write_rels(self.__reac_enz_rels, 'Reaction', 'Enzyme'),
            writer.write_rels(self.__enz_man.get_org_enz_rels(),
                              'Organism', 'Enzyme')]

        return node_files, rel_files

    def add_reaction(self, source, reac_id, properties):
        '''Adds a reaction, or merges properties into the existing node if
        the reaction is already known, and returns the id of its node.

        A reaction counts as known if its source and id match an 'mnx',
        'kegg.reaction' or 'rhea' value recorded when a node was created.
        '''
        # Map a cross-reference already seen onto the id of its node:
        reac_id = self.__reac_ids.get(source + reac_id, reac_id)

        if reac_id in self.__nodes:
            self.__nodes[reac_id].update(properties)
            return reac_id

        properties[':LABEL'] = 'Reaction'
        properties['id:ID(Reaction)'] = reac_id
        properties['source'] = source
        properties[source] = reac_id
        self.__nodes[reac_id] = properties

        # Record the cross-references through which this node may be found:
        for namespace in ('mnx', 'kegg.reaction', 'rhea'):
            if namespace in properties:
                self.__reac_ids[namespace + properties[namespace]] = reac_id

        return reac_id

    def add_react_to_enz(self, data, source, num_threads=0):
        '''Adds the reactions of data (reaction id to Uniprot ids) together
        with their enzyme relationships, then has the enzyme manager add
        Uniprot data for the enzymes.'''
        enzyme_ids = self.__create_react_enz(data, source)
        self.__enz_man.add_uniprot_data(enzyme_ids, source, num_threads)

    def __create_react_enz(self, data, source):
        '''Adds a Reaction node and 'catalysed_by' relationships for every
        entry of data, returning the distinct Uniprot ids encountered.'''
        enzyme_ids = []

        for reac_id, uniprot_ids in data.items():
            node_id = self.add_reaction(source, reac_id, {})

            for uniprot_id in uniprot_ids:
                enzyme_ids.append(uniprot_id)
                self.__reac_enz_rels.append([node_id, 'catalysed_by',
                                             uniprot_id,
                                             {'source': source}])

        return list(set(enzyme_ids))
83 |
--------------------------------------------------------------------------------
/sbcdb/spectra_utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | SYNBIOCHEM-DB (c) University of Manchester 2015
3 |
4 | SYNBIOCHEM-DB is licensed under the MIT License.
5 |
6 | To view a copy of this license, visit <https://opensource.org/licenses/MIT>.
7 |
8 | @author: neilswainston
9 | '''
10 | import os
11 | import tempfile
12 | from urllib import urlretrieve
13 | import zipfile
14 |
15 | import ijson
16 |
17 |
# Default download location of the MoNA export (fetched and unzipped by
# _get_file):
__MONA_URL = ('http://mona.fiehnlab.ucdavis.edu/rest/downloads/retrieve/'
              'd2eb33f0-b22e-49a7-bc31-eb951f8347b2')

# Default name of the json file expected in the local MoNA directory:
__MONA_FILENAME = 'MoNA-export-All_Spectra.json'

# Lower-cased MoNA metadata names that are renamed to the property names
# used in the graph:
_NAME_MAP = {
    'kegg': 'kegg.compound',
    'molecular formula': 'formula',
    'total exact mass': 'monoisotopic_mass:float',
}
26 |
27 |
def load(writer, chem_manager,
         array_delimiter='|', url=__MONA_URL, filename=__MONA_FILENAME):
    '''Build Spectrum nodes and Chemical-to-Spectrum relationships.

    Every parsed MoNA record contributes its chemical to chem_manager, its
    spectrum to the Spectrum nodes and one 'has' relationship linking the
    two.

    Returns a pair of single-element lists: the result of
    writer.write_nodes and the result of writer.write_rels.
    '''
    spectrum_nodes = []
    chem_spectrum_rels = []

    for record in _parse(_get_file(url, filename), array_delimiter):
        # The chemical manager decides which id the chemical is known by:
        chem_id, _ = chem_manager.add_chemical(record['chemical'])

        spectrum = record['spectrum']
        spectrum_nodes.append(spectrum)
        chem_spectrum_rels.append(
            [chem_id, 'has', spectrum['id:ID(Spectrum)']])

    node_files = [writer.write_nodes(spectrum_nodes, 'Spectrum')]
    rel_files = [writer.write_rels(chem_spectrum_rels,
                                   'Chemical', 'Spectrum')]

    return node_files, rel_files
43 |
44 |
def _parse(filename, array_delimiter):
    '''Parses MoNA json file.

    The file is streamed event-by-event with ijson, so the whole document
    is never held in memory. One record is produced per top-level array
    item.

    filename: path of the MoNA json export.
    array_delimiter: string used to join the m/z and intensity arrays.

    Returns a list of records, each a dict holding a 'chemical' and a
    'spectrum' properties dict.
    '''
    def _new_record():
        '''Returns a fresh, empty record.'''
        return {'chemical': {'names:string[]': []},
                'spectrum': {':LABEL': 'Spectrum', 'tags:string[]': []}}

    records = []
    record = _new_record()

    # Most recently seen metadata name; it is paired with the metadata
    # value event that follows it:
    name = None

    # The with-block guarantees that the file handle is released, even if
    # ijson raises part-way through the stream:
    with open(filename) as json_file:
        for prefix, typ, value in ijson.parse(json_file):
            if prefix == 'item' and typ == 'start_map':
                record = _new_record()
            elif prefix == 'item.compound.item.inchi':
                record['chemical']['inchi'] = value
            elif prefix == 'item.compound.item.names.item.name':
                # The first name seen doubles as the primary name:
                if 'name' not in record['chemical']:
                    record['chemical']['name'] = value
                record['chemical']['names:string[]'].append(value)
            elif prefix in ('item.compound.item.metaData.item.name',
                            'item.metaData.item.name'):
                name = _normalise_name(value.lower())
            elif prefix == 'item.compound.item.metaData.item.value':
                _parse_compound_metadata(name, value, record)
                name = None
            elif prefix == 'item.id':
                record['spectrum']['id:ID(Spectrum)'] = value
            elif prefix == 'item.metaData.item.value':
                record['spectrum'][name] = value
                name = None
            elif prefix == 'item.spectrum':
                # The spectrum is a whitespace-separated list of
                # 'm/z:intensity' terms; flatten it, then de-interleave:
                values = [float(val) for term in value.split()
                          for val in term.split(':')]
                record['spectrum']['m/z:float[]'] = \
                    array_delimiter.join(map(str, values[0::2]))
                record['spectrum']['I:float[]'] = \
                    array_delimiter.join(map(str, values[1::2]))
            elif prefix == 'item.tags.item.text':
                record['spectrum']['tags:string[]'].append(value)
            elif prefix == 'item' and typ == 'end_map':
                records.append(record)

    return records
87 |
88 |
def _get_file(url, filename):
    '''Gets file from url.

    The file is looked up as ~/MoNA/<filename>. If it is not there, the zip
    archive at url is downloaded and extracted into ~/MoNA, and the path of
    the first archive member is returned instead.

    Returns the path of the (possibly freshly extracted) file.
    '''
    destination = os.path.join(os.path.expanduser('~'), 'MoNA')

    if not os.path.exists(destination):
        os.makedirs(destination)

    filepath = os.path.join(destination, filename)

    if not os.path.exists(filepath):
        # Only the temporary *name* is needed: close the handle straight
        # away so that urlretrieve can write to the path, and so that the
        # descriptor does not leak.
        tmp_file = tempfile.NamedTemporaryFile(delete=False)
        tmp_file.close()

        try:
            urlretrieve(url, tmp_file.name)
            zfile = zipfile.ZipFile(tmp_file.name, 'r')

            try:
                filepath = os.path.join(destination, zfile.namelist()[0])
                zfile.extractall(destination)
            finally:
                zfile.close()
        finally:
            # The downloaded archive is no longer needed once extracted
            # (or once the download / extraction has failed):
            os.remove(tmp_file.name)

    return filepath
106 |
107 |
def _parse_compound_metadata(name, value, record):
    '''Parses compound metadata.

    name: metadata name, already normalised by the caller (_parse passes
        the result of _normalise_name).
    value: metadata value.
    record: record whose 'chemical' properties dict receives the value.
    '''
    if name is None:
        # A value that was not preceded by a name cannot be stored under
        # any meaningful key.
        return

    if name == 'chebi' and isinstance(value, unicode):
        # Strip the 'CHEBI:' prefix and keep the first token only:
        value = value.replace('CHEBI:', '').split()[0]

    # The name must NOT be normalised a second time: doing so rewrites the
    # ':' of typed names, turning 'monoisotopic_mass:float' into
    # 'monoisotopic_mass_float'.
    record['chemical'][name] = value
114 |
115 |
def _normalise_name(name):
    '''Normalises name in name:value pairs.

    Names listed in _NAME_MAP are translated to their mapped property
    name; any other name has each ':' replaced by '_'.
    '''
    try:
        return _NAME_MAP[name]
    except KeyError:
        return name.replace(':', '_')
122 |
--------------------------------------------------------------------------------
/sbcdb/chemical_utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | SYNBIOCHEM-DB (c) University of Manchester 2015
3 |
4 | SYNBIOCHEM-DB is licensed under the MIT License.
5 |
6 | To view a copy of this license, visit <http://opensource.org/licenses/MIT/>.
7 |
8 | @author: neilswainston
9 | '''
10 | import math
11 | import uuid
12 |
13 | from libchebipy._chebi_entity import ChebiEntity, ChebiException
14 |
15 | from sbcdb import namespace_utils as ns_utils
16 | from synbiochem.utils import chem_utils
17 |
18 |
class ChemicalManager(object):
    '''Class to implement a manager of Chemical data.

    Chemicals are stored as property dicts keyed by a canonical chemical
    id. A second index maps every known identifier (ChEBI id, InChI, MNX
    id) onto that canonical id, so that the same chemical arriving from
    different sources is merged into a single node.
    '''

    def __init__(self, array_delimiter):
        '''Constructor.

        array_delimiter: string used to join multi-valued properties.
        '''
        self.__array_delimiter = array_delimiter

        # Canonical chemical id -> properties dict:
        self.__nodes = {}

        # Any known identifier -> canonical chemical id:
        self.__chem_ids = {}

    def write_files(self, writer):
        '''Write neo4j import files.

        Returns whatever writer.write_nodes returns.
        '''
        return writer.write_nodes(self.__nodes.values(), 'Chemical')

    def add_chemical(self, properties):
        '''Adds a chemical to the collection of nodes, ensuring uniqueness.

        properties is modified in place. If the chemical is already known,
        its stored properties are updated with the supplied ones.

        Returns a (chemical id, ChEBI entity or None) pair.
        '''
        chem_id, chebi_ent = self.__get_chem_id(properties)

        if 'charge:float' in properties:
            charge = properties.pop('charge:float')

            # NaN charges are dropped; all others are stored as whole
            # numbers:
            if not math.isnan(charge):
                properties['charge:float'] = int(charge)

        if chem_id not in self.__nodes:
            properties[':LABEL'] = 'Chemical'
            properties['id:ID(Chemical)'] = chem_id
            properties['source'] = 'chebi' if 'chebi' in properties else 'mnx'

            _normalise_mass(properties)
            self.__nodes[chem_id] = properties
        else:
            self.__nodes[chem_id].update(properties)

        return chem_id, chebi_ent

    def get_props(self, prop, default=None):
        '''Gets a dict of every known identifier to the value of the given
        property of its chemical (default if the property is absent).'''
        return {key: self.__nodes[chem_id].get(prop, default)
                for key, chem_id in self.__chem_ids.items()}

    def get_prop(self, chem_id, prop, default=None):
        '''Gets a property of the chemical known by the given identifier.

        Raises KeyError if the identifier is unknown.
        '''
        return self.__nodes[self.__chem_ids[chem_id]].get(prop, default)

    def __get_chem_id(self, properties):
        '''Manages chemical id mapping.

        Precedence is ChEBI id, then a previously seen InChI, then MNX id.
        A chemical with none of these receives a new uuid.

        Returns a (chemical id, ChEBI entity or None) pair.
        '''
        chebi_id = properties.get('chebi', None)
        chebi_ent = None

        if chebi_id:
            try:
                chebi_id, chebi_ent = _get_chebi_data(chebi_id, properties,
                                                      self.__array_delimiter)
            except (ChebiException, ValueError) as err:
                # The ChEBI id could not be resolved: forget it and fall
                # back to the remaining identifiers.
                properties.pop('chebi')
                chebi_id = None
                print(err)

        mnx_id = properties.get('mnx', None)
        inchi_id = properties.get('inchi', None)

        if chebi_id:
            self.__chem_ids[chebi_id] = chebi_id

            if inchi_id:
                self.__chem_ids[inchi_id] = chebi_id

            if mnx_id:
                self.__chem_ids[mnx_id] = chebi_id

            return chebi_id, chebi_ent

        if inchi_id:
            chem_id = self.__chem_ids.get(inchi_id, None)

            if chem_id:
                return chem_id, None

        if mnx_id:
            chem_id = self.__chem_ids.get(mnx_id, None)

            if chem_id:
                return chem_id, None

            if inchi_id:
                self.__chem_ids[inchi_id] = mnx_id

            self.__chem_ids[mnx_id] = mnx_id
            return mnx_id, None

        new_id = str(uuid.uuid4())

        # Only index the new id when there is an InChI to index it by:
        # registering it under None would add a meaningless entry that every
        # further identifier-less chemical overwrites.
        if inchi_id:
            self.__chem_ids[inchi_id] = new_id

        return new_id, None
117 |
118 |
def _get_chebi_data(chebi_id, properties, array_delimiter):
    '''Gets ChEBI data.

    Looks the entity up in ChEBI and copies its id, formula, charge, InChI,
    SMILES, name(s) and resolvable database accessions into properties.
    When the entity has a parent id, that id is used instead of its own.

    Returns a (ChEBI id, ChEBI entity) pair.
    '''
    entity = ChebiEntity(str(chebi_id))

    # Prefer the parent id whenever the entity has one:
    parent_id = entity.get_parent_id()
    chebi_id = parent_id if parent_id else entity.get_id()
    properties['chebi'] = chebi_id

    formula = entity.get_formula()
    charge = entity.get_charge()
    inchi = entity.get_inchi()
    smiles = entity.get_smiles()

    # Optional values are only stored when ChEBI actually supplies them:
    if formula:
        properties['formula'] = formula

    if not math.isnan(charge):
        properties['charge:float'] = charge

    if inchi:
        properties['inchi'] = inchi

    if smiles:
        properties['smiles'] = smiles

    # The primary name is stored on its own and appended to the list of
    # all names:
    primary_name = entity.get_name()
    all_names = [name.get_name() for name in entity.get_names()]
    all_names.append(primary_name)

    properties['name'] = primary_name
    properties['names:string[]'] = array_delimiter.join(all_names)

    # Cross-references, restricted to namespaces that can be resolved:
    for db_acc in entity.get_database_accessions():
        namespace = ns_utils.resolve_namespace(db_acc.get_type(), True)

        if namespace is None:
            continue

        properties[namespace] = db_acc.get_accession_number()

    return chebi_id, entity
161 |
162 |
def _normalise_mass(properties):
    '''Removes ambiguity in mass values by recalculating according to chemical
    formula.

    Any 'mass:float' entry is discarded. If a formula is present, the mass
    calculated from it is stored as 'monoisotopic_mass:float', unless that
    calculation yields NaN.
    '''
    properties.pop('mass:float', None)

    formula = properties.get('formula')

    if formula is None:
        # Nothing to calculate a mass from.
        return

    mono_mass = chem_utils.get_molecular_mass(formula)

    if not math.isnan(mono_mass):
        properties['monoisotopic_mass:float'] = mono_mass
173 |
--------------------------------------------------------------------------------
/sbcdb/mnxref_utils.py:
--------------------------------------------------------------------------------
1 | '''
2 | SYNBIOCHEM-DB (c) University of Manchester 2015
3 |
4 | SYNBIOCHEM-DB is licensed under the MIT License.
5 |
6 | To view a copy of this license, visit <http://opensource.org/licenses/MIT/>.
7 |
8 | @author: neilswainston
9 | '''
10 | # pylint: disable=no-member
11 | # pylint: disable=too-few-public-methods
12 | # pylint: disable=too-many-locals
13 | from collections import Counter
14 | import csv
15 | import itertools
16 | import math
17 | import re
18 | import urllib2
19 |
20 | import numpy
21 | from subliminal import balance
22 |
23 | from sbcdb import namespace_utils
24 | from synbiochem.utils import chem_utils
25 |
26 |
# Base url that the MNXref tsv filenames are appended to:
_METANETX_URL = 'http://metanetx.org/cgi-bin/mnxget/mnxref/'


class MnxRefReader(object):
    '''Class to read MnxRef data from the chem_prop.tsv, the chem_xref.tsv and
    reac_prop.tsv files.

    Data is downloaded on first request and cached on the instance.
    '''

    def __init__(self, source=_METANETX_URL):
        '''Constructor.

        source: base url; the tsv filenames are appended to it.
        '''
        self.__source = source
        self.__mnx_id_patt = re.compile(r'(MNX[MR])(\d+)')
        self.__chem_data = {}
        self.__reac_data = {}

        # Explicit flags rather than testing the dicts for emptiness:
        # reading reactions adds stub chemicals to __chem_data, which would
        # otherwise make the chemical data look as if it were already
        # loaded.
        self.__chem_loaded = False
        self.__reac_loaded = False

    def get_chem_data(self):
        '''Gets chemical data, as a dict of MNX id to properties dict.'''
        if not self.__chem_loaded:
            self.__read_chem_prop()
            self.__read_xref('chem_xref.tsv', self.__chem_data, True)
            self.__chem_loaded = True

        return self.__chem_data

    def get_reac_data(self):
        '''Gets reaction data, as a dict of MNX id to properties dict.'''
        if not self.__reac_loaded:
            self.__read_reac_prop()
            self.__read_xref('reac_xref.tsv', self.__reac_data, False)
            self.__reac_loaded = True

        return self.__reac_data

    def __read_chem_prop(self):
        '''Read chemical properties and create Nodes.'''
        chem_prop_keys = ['id', 'name', 'formula', 'charge:float',
                          'mass:float', 'inchi', 'smiles', 'source']

        for values in self.__read_data('chem_prop.tsv'):
            # Skip blank rows (csv yields them as empty lists) and comments:
            if not values or values[0].startswith('#'):
                continue

            values[0] = self.__parse_id(values[0])
            values[7] = self.__parse_id(values[7])
            props = dict(zip(chem_prop_keys, values))
            props.pop('source')
            _convert_to_float(props, 'charge:float')
            _convert_to_float(props, 'mass:float')

            # Empty columns carry no information:
            props = {key: value for key, value in props.items()
                     if value != ''}
            self.__chem_data[values[0]] = props

    def __read_xref(self, filename, data, chemical):
        '''Read xrefs and update Nodes.

        Only xrefs whose evidence is 'identity' or 'structural' (or that
        have no evidence column at all) are applied, and only to entries
        already present in data.
        '''
        xref_keys = ['XREF', 'MNX_ID', 'Evidence', 'Description']

        for values in self.__read_data(filename):
            # Skip blank rows (csv yields them as empty lists) and comments:
            if not values or values[0].startswith('#'):
                continue

            xrefs = dict(zip(xref_keys, values))

            if xrefs.get('Evidence', 'identity') not in ('identity',
                                                         'structural'):
                continue

            mnx_id = self.__parse_id(xrefs['MNX_ID'])

            if mnx_id in data:
                self.__add_xref(xrefs['XREF'].split(':'), data[mnx_id],
                                chemical)

    def __add_xref(self, xref, entry, chemical):
        '''Adds an xref, given as a [namespace, id, ...] list, to entry.'''
        namespace = namespace_utils.resolve_namespace(xref[0], chemical)

        # Ignore unknown namespaces, and xrefs that carry no id part:
        if namespace is None or len(xref) < 2:
            return

        xref_id = self.__parse_id(xref[1])

        entry[namespace] = 'CHEBI:' + xref_id \
            if namespace == 'chebi' \
            else xref_id

    def __read_reac_prop(self):
        '''Read reaction properties and create Nodes.

        Reactions whose equation cannot be parsed are reported and skipped.
        Participants not yet present in the chemical data are added to it
        as stubs.
        '''
        reac_prop_keys = ['id', 'equation', 'description', 'balance', 'ec',
                          'Source']

        for values in self.__read_data('reac_prop.tsv'):
            # Skip blank rows (csv yields them as empty lists) and comments:
            if not values or values[0].startswith('#'):
                continue

            values[0] = self.__parse_id(values[0])
            values[5] = self.__parse_id(values[5])

            props = dict(zip(reac_prop_keys, values))
            props.pop('Source')

            try:
                participants = chem_utils.parse_equation(
                    props.pop('equation'))

                for participant in participants:
                    participant[0] = self.__parse_id(participant[0])

                    if participant[0] not in self.__chem_data:
                        self.__add_chem(participant[0])

                props['reac_defn'] = participants
                self.__reac_data[values[0]] = props
            except ValueError:
                print('WARNING: Suspected polymerisation reaction: ' +
                      values[0] + '\t' + str(props))

    def __add_chem(self, chem_id):
        '''Adds a stub chemical with given id, and returns its properties.'''
        props = {'id': chem_id}
        self.__chem_data[chem_id] = props
        return props

    def __read_data(self, filename):
        '''Downloads and reads tab-limited files into lists of lists of
        strings.'''
        response = urllib2.urlopen(self.__source + filename)

        try:
            return list(csv.reader(response, delimiter='\t'))
        finally:
            # Release the connection even if reading fails:
            response.close()

    def __parse_id(self, item_id):
        '''Parses mnx ids.

        If item_id contains an MNX id, the first one is returned with the
        leading zeros of its numeric part removed (MNXM0001 -> MNXM1).
        Otherwise item_id is returned unchanged.
        '''
        match = self.__mnx_id_patt.search(item_id)

        if match is None:
            return item_id

        return match.group(1) + str(int(match.group(2)))
151 |
152 |
class MnxRefLoader(object):
    '''Loads MNXref data into neo4j format.'''

    def __init__(self, chem_man, reac_man, writer):
        '''Constructor.

        chem_man: manager that chemicals are added to.
        reac_man: manager that reactions are added to.
        writer: writer used for the Reaction-to-Chemical relationships.
        '''
        self.__chem_man = chem_man
        self.__reac_man = reac_man
        self.__writer = writer

    def load(self):
        '''Loads MnxRef data from chem_prop.tsv, chem_xref.tsv,
        reac_prop.tsv and reac_xref.tsv files.

        Returns a pair of lists: an empty one, and one holding the result
        of writing the Reaction-to-Chemical relationships.
        '''
        reader = MnxRefReader()

        for properties in reader.get_chem_data().values():
            properties['mnx'] = properties.pop('id')
            self.__chem_man.add_chemical(properties)

        rels = self.__add_reac_nodes(reader.get_reac_data())

        return [], [self.__writer.write_rels(rels, 'Reaction', 'Chemical')]

    def __add_reac_nodes(self, reac_data):
        '''Get reaction nodes from data.

        Adds every reaction to the reaction manager and returns the list of
        Reaction-to-Chemical relationships, each of the form
        [reaction id, relationship type, chemical id, properties].
        '''
        reac_id_def = {}

        for properties in reac_data.values():
            reac_def = []
            mnx_id = properties.pop('id')

            # Remove equation and description (may be inconsistent with
            # balanced reaction):
            properties.pop('description', None)

            # Each term is [formula, charge, stoichiometry, chemical id]:
            for prt in properties.pop('reac_defn'):
                chem_id, _ = self.__chem_man.add_chemical({'mnx': prt[0]})

                reac_def.append([self.__chem_man.get_prop(prt[0], 'formula'),
                                 self.__chem_man.get_prop(prt[0],
                                                          'charge:float', 0),
                                 prt[1],
                                 chem_id])

            # Balancing is only attempted when every formula is known:
            if all(values[0] is not None for values in reac_def):
                balanced, _, balanced_def = balance.balance_reac(reac_def)
                properties['balance'] = balanced
            else:
                properties['balance'] = 'unknown'
                balanced_def = reac_def

            reac_id = self.__reac_man.add_reaction('mnx', mnx_id,
                                                   properties)
            reac_id_def[reac_id] = balanced_def

        chem_id_mass = self.__chem_man.get_props('monoisotopic_mass:float',
                                                 float('NaN'))

        # Assume mass < CO2 = cofactor. (NaN, i.e. unknown mass, fails both
        # comparisons.) Sets are used because membership is tested for
        # every term of every reaction below.
        cofactors = {chem_id
                     for chem_id, mass in chem_id_mass.items()
                     if 0 < mass < 44}

        cofactor_pairs = set(_calc_cofactors(reac_id_def.values(),
                                             cofactors))
        rels = []

        for reac_id, defn in reac_id_def.items():
            reactants = [term[3] for term in defn if term[2] < 0]
            products = [term[3] for term in defn if term[2] > 0]

            # Set metabolites as cofactors:
            reac_cofactors = {term[3] for term in defn
                              if term[3] in cofactors}

            # Set pairs as cofactors:
            for pair in itertools.product(reactants, products):
                if tuple(sorted(pair)) in cofactor_pairs:
                    reac_cofactors.update(pair)

            for term in defn:
                rels.append([reac_id,
                             'has_cofactor' if term[3] in reac_cofactors
                             else 'has_reactant',
                             term[3],
                             {'stoichiometry:float': term[2]}])

        return rels
239 |
240 |
def _calc_cofactors(reaction_defs, cofactors, cutoff=0.8):
    '''Calculates cofactors.

    Counts how often each (reactant, product) pair co-occurs across all
    reactions, ignoring chemicals already listed in cofactors, and returns
    the pairs that _filter keeps for the given cutoff.

    reaction_defs: iterable of reaction definitions, each a list of terms
        in which term[2] is the stoichiometry and term[3] the chemical id.
    cofactors: chemical ids to leave out of the pairing.
    cutoff: passed to _filter.
    '''
    # Membership is tested once per term of every reaction, so take a
    # constant-time snapshot of the (possibly list-typed) argument:
    excluded = frozenset(cofactors)
    pairs = Counter()

    # Calculate all reactant / product pairs...
    for reaction_def in reaction_defs:
        reactants = [term[3] for term in reaction_def
                     if term[2] < 0 and term[3] not in excluded]
        products = [term[3] for term in reaction_def
                    if term[2] > 0 and term[3] not in excluded]

        # Pairs are sorted so that (a, b) and (b, a) are counted together:
        pairs.update(tuple(sorted(pair))
                     for pair in itertools.product(reactants, products))

    return _filter(pairs, cutoff)
256 |
257 |
def _filter(counter, cutoff):
    '''Filter counter items according to cutoff.

    A straight line is fitted to the log-log histogram of the counts. Items
    are kept when their count exceeds exp(cutoff * x0), where x0 is the
    point at which the fitted line crosses zero.

    counter: Counter of item to number of occurrences.
    cutoff: scaling applied to the zero crossing of the fitted line.

    Returns the list of items kept; an empty counter gives an empty list.
    '''
    if not counter:
        # There is nothing to fit a line to (and nothing to return).
        return []

    # Count occurences of pairs, then bin into a histogram...
    hist_counter = Counter(counter.values())

    # Fit straight-line to histogram log-log plot and filter...
    x_val, y_val = zip(*hist_counter.items())
    m_val, b_val = numpy.polyfit(numpy.log(x_val), numpy.log(y_val), 1)

    # The threshold does not depend on the item, so calculate it once:
    threshold = math.exp(cutoff * -b_val / m_val)

    return [item for item, count in counter.items() if count > threshold]
269 |
270 |
def _convert_to_float(dictionary, key):
    '''Converts a key value in a dictionary to a float.

    The value 'NA' becomes NaN. A missing or falsy (e.g. empty string)
    value results in the key being removed from the dictionary.
    '''
    raw_value = dictionary.get(key, None)

    if not raw_value:
        # Remove key:
        dictionary.pop(key, None)
        return

    dictionary[key] = float('NaN' if raw_value == 'NA' else raw_value)
280 |
--------------------------------------------------------------------------------