├── tests ├── __init__.py ├── util │ ├── __init__.py │ ├── pytest.ini │ ├── test_warning_util.py │ ├── test_chem_util.py │ ├── test_index_collection.py │ ├── test_mongo_util.py │ ├── test_reaction_util.py │ ├── test_rna_seq_util.py │ ├── test_calc_tanimoto.py │ └── test_rna_halflife_util.py ├── data_source │ ├── __init__.py │ ├── pytest.ini │ ├── metabolite_concentration │ │ ├── test_query_demo.py │ │ └── test_metabolite_concentration.py │ ├── test_uniprot_nosql.py │ ├── rna_halflife │ │ ├── test_doi_10_1038_srep01318.py │ │ ├── test_doi_10_1371_journal_pone_0059059.py │ │ ├── test_order_by_ko.py │ │ ├── test_doi_10_1093_nar_gks1019.py │ │ ├── test_doi_10_1186_gb_2012_13_4_r30.py │ │ ├── test_doi_10_1093_nar_gkt1150.py │ │ ├── test_doi_10_1101_gr_131037_111.py │ │ └── test_doi_10_1186_s12864_016_3219_8.py │ ├── test_pax_nosql.py │ ├── test_kegg_org_code.py │ ├── test_gene_ortholog.py │ ├── test_metabolites_meta_collection.py │ ├── test_ec.py │ ├── brenda │ │ └── test_reaction.py │ ├── test_intact_nosql.py │ ├── test_sabio_reaction.py │ ├── test_taxon_tree.py │ ├── test_metabolite_nosql.py │ └── test_protein_aggregate.py ├── core │ └── pytest.ini ├── requirements.txt ├── fixtures │ ├── ump_kinase.xlsx │ ├── five_reactions.xlsx │ ├── twenty_reactions.xlsx │ └── Mycoplasma_pneumoniae.xlsx └── elasticsearch_kl │ └── test_batch_load.py ├── docs ├── references.bib ├── brenda │ └── Reactions_BKMS.tar.gz ├── metabolite_concentration │ ├── mmc2.xlsx │ ├── mmc3.xlsx │ ├── compounds.tsv.gz │ ├── aaf2786-Hackett-SM-table-S9.xls │ ├── 41589_2016_BFnchembio2077_MOESM585_ESM.xlsx │ └── 41589_2016_BFnchembio2077_MOESM586_ESM.xlsx ├── references.rst ├── apm-server.yml ├── requirements.txt ├── filebeat.docker.yml ├── requirements.rtd.txt ├── metabolites │ ├── F6P.json │ ├── FDP.json │ ├── PYR.json │ ├── PEP.json │ ├── ACCOATAXNEW.json │ ├── DHAP.json │ ├── G6P.json │ ├── R5P.json │ ├── S7P.json │ ├── 6PG.json │ └── GAP.json ├── metricbeat.docker.yml ├── deployment.rst ├── 
protein_localization │ ├── Experimental_v4.00_PSortdb │ │ ├── P50307.json │ │ ├── P01553.json │ │ ├── P34071.json │ │ ├── P06886.json │ │ ├── P10335.json │ │ ├── P01552.json │ │ ├── P09978.json │ │ ├── P81177.json │ │ ├── P00644.json │ │ └── P45723.json │ ├── NP_219511.1_ D.json │ ├── NP_219504.1_ a.json │ └── Gram_Negative_WO_Outer_Membrane │ │ ├── WP_012241978.1.json │ │ ├── WP_012242024.1.json │ │ ├── WP_041633705.1.json │ │ ├── WP_041633707.1.json │ │ ├── WP_081423625.1.json │ │ ├── WP_012242018.1.json │ │ ├── WP_012242006.1.json │ │ ├── WP_012242014.1.json │ │ ├── WP_012242027.1.json │ │ └── WP_012242037.1.json ├── about.rst ├── index.rst └── installation.rst ├── datanator ├── core │ └── __init__.py ├── data_source │ ├── __init__.py │ ├── builds │ │ ├── __init__.py │ │ ├── full.py │ │ ├── test.py │ │ ├── test_log.py │ │ └── med.py │ ├── array_express_tools │ │ ├── __init__.py │ │ └── taxon_exceptions.txt │ ├── process_rna_seq │ │ ├── __init__.py │ │ └── download_cdna.py │ ├── user_data │ │ ├── InputTemplate.xlsx │ │ └── RNA-Seq_Experiment_Template │ │ │ ├── RNA-SeqMetadataTemplate.xlsx │ │ │ └── samples │ │ │ └── ProcessedRNA-SeqTemplate.xlsx │ ├── brenda │ │ └── kinetic_constants.py │ ├── rna_halflife │ │ └── back_fill_gene_name.py │ ├── protein_localization │ │ ├── database_demo.py │ │ ├── justin_parseGramPositiveJSONSchema.py │ │ ├── experimental.py │ │ ├── parse_psortdb_negative_wo_outer_membrane.py │ │ └── parse_psortdb_experimental.py │ ├── sqlite_to_json.py │ ├── metabolite_concentration │ │ └── query_demo.py │ ├── sabio_compound.py │ └── protein_modification │ │ └── 10_1093_nar_gkw1075.py ├── schema_2 │ ├── __init__.py │ ├── migrate_ec.py │ ├── migrate_corum.py │ ├── migrate_metabolite_concentration.py │ └── migrate_metabolites_meta.py ├── _version.py ├── __init__.py ├── config │ ├── __init__.py │ ├── core.default.cfg │ ├── core.schema.cfg │ └── core.py ├── util │ ├── __init__.py │ ├── warning_util.py │ ├── constants.py │ ├── base26.py │ ├── 
build_util.py │ ├── reaction_util.py │ └── mongo_util.py └── parse_metabolite_concentration.py ├── pytest.ini ├── scripts ├── docker-machine ├── mongorestore.sh ├── mongorestore_aws.sh ├── mongodump.sh └── quilt_backup.py ├── setup.cfg ├── docker_builds ├── Flaskoffline ├── Karrlabdatanator └── Mongocurl ├── .karr_lab_build_utils.yml ├── requirements.txt ├── MANIFEST.in ├── .circleci └── requirements.txt ├── LICENSE ├── docker-compose.yml ├── CONTRIBUTING.md ├── setup.py ├── .gitignore ├── LICENSE-THIRD-PARTY-DATA └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/references.bib: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/util/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datanator/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datanator/data_source/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datanator/schema_2/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/data_source/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datanator/data_source/builds/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datanator/_version.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.2' 2 | -------------------------------------------------------------------------------- /tests/core/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = -p no:warnings 3 | -------------------------------------------------------------------------------- /tests/util/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = -p no:warnings 3 | -------------------------------------------------------------------------------- /tests/data_source/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = -p no:warnings 3 | -------------------------------------------------------------------------------- /datanator/__init__.py: -------------------------------------------------------------------------------- 1 | from ._version import __version__ 2 | # :obj:`str`: version 3 | -------------------------------------------------------------------------------- /datanator/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import get_config, get_debug_logs_config 2 | -------------------------------------------------------------------------------- /datanator/data_source/array_express_tools/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import ensembl_tools 2 | -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | attrdict 2 | capturer 3 | flask_testing 4 | ftputil 5 | mock 6 | scipy 7 | -------------------------------------------------------------------------------- /datanator/config/core.default.cfg: -------------------------------------------------------------------------------- 1 | [datanator] 2 | [[bioportal]] 3 | [[quilt]] 4 | [[mongodb]] -------------------------------------------------------------------------------- /tests/fixtures/ump_kinase.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KarrLab/datanator/HEAD/tests/fixtures/ump_kinase.xlsx -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | ignore:.*inspect.getargspec.* is deprecated:DeprecationWarning 4 | -------------------------------------------------------------------------------- /docs/brenda/Reactions_BKMS.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KarrLab/datanator/HEAD/docs/brenda/Reactions_BKMS.tar.gz -------------------------------------------------------------------------------- /tests/fixtures/five_reactions.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KarrLab/datanator/HEAD/tests/fixtures/five_reactions.xlsx -------------------------------------------------------------------------------- /tests/fixtures/twenty_reactions.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KarrLab/datanator/HEAD/tests/fixtures/twenty_reactions.xlsx 
-------------------------------------------------------------------------------- /datanator/data_source/process_rna_seq/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import get_processed_data_samples 2 | from . import download_cdna 3 | -------------------------------------------------------------------------------- /docs/metabolite_concentration/mmc2.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KarrLab/datanator/HEAD/docs/metabolite_concentration/mmc2.xlsx -------------------------------------------------------------------------------- /docs/metabolite_concentration/mmc3.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KarrLab/datanator/HEAD/docs/metabolite_concentration/mmc3.xlsx -------------------------------------------------------------------------------- /docs/references.rst: -------------------------------------------------------------------------------- 1 | References 2 | ========== 3 | 4 | .. 
bibliography:: references.bib 5 | :encoding: latin 6 | :style: unsrt -------------------------------------------------------------------------------- /tests/fixtures/Mycoplasma_pneumoniae.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KarrLab/datanator/HEAD/tests/fixtures/Mycoplasma_pneumoniae.xlsx -------------------------------------------------------------------------------- /docs/metabolite_concentration/compounds.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KarrLab/datanator/HEAD/docs/metabolite_concentration/compounds.tsv.gz -------------------------------------------------------------------------------- /datanator/data_source/user_data/InputTemplate.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KarrLab/datanator/HEAD/datanator/data_source/user_data/InputTemplate.xlsx -------------------------------------------------------------------------------- /scripts/docker-machine: -------------------------------------------------------------------------------- 1 | docker-machine create --driver amazonec2 --amazonec2-instance-type m5d.large --amazonec2-open-port 27017 --amazonec2-monitoring datanator-ec2 -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal = 1 3 | 4 | [coverage:run] 5 | source = 6 | datanator 7 | 8 | [sphinx-apidocs] 9 | packages = 10 | datanator 11 | -------------------------------------------------------------------------------- /docs/metabolite_concentration/aaf2786-Hackett-SM-table-S9.xls: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/KarrLab/datanator/HEAD/docs/metabolite_concentration/aaf2786-Hackett-SM-table-S9.xls -------------------------------------------------------------------------------- /docker_builds/Flaskoffline: -------------------------------------------------------------------------------- 1 | FROM lzy7071/karrlabdatanator:latest 2 | 3 | COPY . /home 4 | WORKDIR /home 5 | 6 | ENTRYPOINT ["python3"] 7 | CMD ["/home/datanator/datanator/rest/__init__.py"] 8 | -------------------------------------------------------------------------------- /docker_builds/Karrlabdatanator: -------------------------------------------------------------------------------- 1 | FROM lzy7071/karrlabdatanator_dependencies:latest 2 | 3 | RUN python3 -m pip install git+https://github.com/KarrLab/datanator.git 4 | 5 | WORKDIR /root 6 | CMD bash 7 | -------------------------------------------------------------------------------- /docs/metabolite_concentration/41589_2016_BFnchembio2077_MOESM585_ESM.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KarrLab/datanator/HEAD/docs/metabolite_concentration/41589_2016_BFnchembio2077_MOESM585_ESM.xlsx -------------------------------------------------------------------------------- /docs/metabolite_concentration/41589_2016_BFnchembio2077_MOESM586_ESM.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KarrLab/datanator/HEAD/docs/metabolite_concentration/41589_2016_BFnchembio2077_MOESM586_ESM.xlsx -------------------------------------------------------------------------------- /datanator/data_source/user_data/RNA-Seq_Experiment_Template/RNA-SeqMetadataTemplate.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KarrLab/datanator/HEAD/datanator/data_source/user_data/RNA-Seq_Experiment_Template/RNA-SeqMetadataTemplate.xlsx 
-------------------------------------------------------------------------------- /datanator/util/__init__.py: -------------------------------------------------------------------------------- 1 | from . import molecule_util 2 | from . import rna_seq_util 3 | from . import taxonomy_util 4 | from . import warning_util 5 | from . import mongo_util 6 | from . import file_util 7 | from . import chem_util 8 | -------------------------------------------------------------------------------- /docker_builds/Mongocurl: -------------------------------------------------------------------------------- 1 | FROM mongo:4.0.10 2 | 3 | RUN apt-get update -y \ 4 | && apt-get install -y --no-install-recommends \ 5 | curl \ 6 | wget \ 7 | && rm -rf /var/lib/apt/lists/* 8 | 9 | WORKDIR /root 10 | CMD bash 11 | -------------------------------------------------------------------------------- /datanator/data_source/user_data/RNA-Seq_Experiment_Template/samples/ProcessedRNA-SeqTemplate.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KarrLab/datanator/HEAD/datanator/data_source/user_data/RNA-Seq_Experiment_Template/samples/ProcessedRNA-SeqTemplate.xlsx -------------------------------------------------------------------------------- /docs/apm-server.yml: -------------------------------------------------------------------------------- 1 | apm-server: 2 | # Defines the host and port the server is listening on. use "unix:/path/to.sock" to listen on a unix domain socket. 
3 | host: "apm-server:8200" 4 | 5 | output.elasticsearch: 6 | hosts: ["elasticsearch:9200"] -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx >= 1.8 2 | sphinx_fontawesome 3 | sphinx_rtd_theme >= 0.4.2 4 | sphinxcontrib_addmetahtml >= 0.1.1 5 | sphinxcontrib_bibtex 6 | sphinxcontrib_googleanalytics >= 0.1.1 7 | sphinxcontrib_spelling 8 | sphinxprettysearchresults 9 | -------------------------------------------------------------------------------- /scripts/mongorestore.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | FILE=/root/karr_lab/datanator.archive 3 | if [ ! -f "$FILE" ]; then 4 | curl -o $FILE https://mongo-dbdump.s3.amazonaws.com/datanator.20190701.archive 5 | fi 6 | mongorestore -d datanator --host mongo:27017 --archive=$FILE -------------------------------------------------------------------------------- /.karr_lab_build_utils.yml: -------------------------------------------------------------------------------- 1 | downstream_dependencies: 2 | - h1_hesc 3 | - wc_cli 4 | email_notifications: 5 | - jonrkarr@gmail.com 6 | - yosefdroth@gmail.com 7 | static_analyses: 8 | ignore_unused_requirements: 9 | - python_libsbml 10 | ignore_missing_requirements: 11 | - python-libsbml-experimental 12 | - pymongo 13 | -------------------------------------------------------------------------------- /docs/filebeat.docker.yml: -------------------------------------------------------------------------------- 1 | filebeat.config: 2 | modules: 3 | path: ${path.config}/modules.d/*.yml 4 | reload.enabled: false 5 | filebeat.autodiscover: 6 | providers: 7 | - type: docker 8 | hints.enabled: true 9 | 10 | output.elasticsearch: 11 | hosts: ["elasticsearch:9200"] 12 | setup.kibana: 13 | host: "kibana:5601" 14 | 
-------------------------------------------------------------------------------- /docs/requirements.rtd.txt: -------------------------------------------------------------------------------- 1 | sphinx >= 1.8 2 | sphinx_fontawesome 3 | sphinx_rtd_theme >= 0.4.2 4 | sphinxcontrib_addmetahtml >= 0.1.1 5 | sphinxcontrib_bibtex 6 | sphinxcontrib_googleanalytics @ git+https://github.com/karrlab/sphinxcontrib-googleanalytics.git#egg=sphinxcontrib_googleanalytics-0.1.1 7 | sphinxcontrib_spelling 8 | sphinxprettysearchresults 9 | -------------------------------------------------------------------------------- /scripts/mongorestore_aws.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | FILE=~/karr_lab/datanator-dump 3 | source <(grep = ~/.wc/datanator.cfg | tr -d ' ') 4 | if [ ! -d "$FILE" ]; then 5 | mkdir $FILE 6 | aws s3 cp https://mongo-dbdump.s3.amazonaws.com/datanator $FILE --recursive 7 | fi 8 | mongorestore -d datanator -u $user -p $password --authenticationDatabase admin "$FILE/datanator" -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4 2 | biopython 3 | bioservices >= 1.5.0 4 | bpforms 5 | cement >= 3.0.0 6 | configobj 7 | datanator_query_python >= 0.6.31 8 | ete3 9 | flask_migrate 10 | genson 11 | karr_lab_aws_manager >= 0.0.21 12 | numpy 13 | obj_tables[bio] 14 | openbabel 15 | pandas >= 1.0.1 16 | pint >= 0.10 17 | pubchempy 18 | python_libsbml 19 | requests 20 | requests_cache 21 | setuptools 22 | simplejson 23 | sqlalchemy 24 | sqlalchemy_utils 25 | tabula_py 26 | wc_utils 27 | xmltodict 28 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # description 2 | include README.rst 3 | 4 | # license 5 | include LICENSE 6 | 7 | # 
database license 8 | include LICNESE-DATA 9 | include LICENSE-DATABASE-STRUCTURE 10 | include LICENSE-THIRD-PARTRY-DATA 11 | 12 | # requirements 13 | include requirements.txt 14 | 15 | # configuration 16 | recursive-include datanator/config *.cfg 17 | 18 | # data source configurations 19 | recursive-include datanator/data_source *.txt 20 | 21 | # data files 22 | recursive-include datanator/data *.txt *.xlsx 23 | -------------------------------------------------------------------------------- /.circleci/requirements.txt: -------------------------------------------------------------------------------- 1 | # for quilt3 compatibility 2 | urllib3 < 1.25 3 | 4 | # Karr Lab 5 | git+https://github.com/KarrLab/datanator_swagger_ui_bundle.git#egg=swagger_ui_bundle 6 | git+https://github.com/KarrLab/wc_utils.git#egg=wc_utils 7 | git+https://github.com/KarrLab/bpforms.git#egg=bpforms 8 | git+https://github.com/KarrLab/bcforms.git#egg=bcforms 9 | git+https://github.com/KarrLab/obj_tables.git#egg=obj_tables[all] 10 | git+https://github.com/KarrLab/datanator_query_python.git#egg=datanator_query_python 11 | -------------------------------------------------------------------------------- /scripts/mongodump.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MONGO_DATABASE="datanator" 4 | APP_NAME="datanator" 5 | source <(grep = ~/.wc/datanator.cfg | tr -d ' ') 6 | 7 | MONGO_HOST="localhost" 8 | TIMESTAMP=`date +%F-%H%M` 9 | MONGODUMP_PATH="/usr/bin/mongodump" 10 | BACKUPS_DIR="/data/mongodump" 11 | mkdir -p $BACKUPS_DIR 12 | BACKUP_NAME="$BACKUPS_DIR/$APP_NAME" 13 | 14 | $MONGODUMP_PATH -d $MONGO_DATABASE -u $user -p $password --authenticationDatabase admin -o $BACKUP_NAME 15 | aws s3 cp $BACKUP_NAME s3://mongo-dbdump/ --recursive 16 | rm -rf $BACKUPS_DIR -------------------------------------------------------------------------------- /docs/metabolites/F6P.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "type":"metabolite", 3 | "name":"Frutose 6-Phosphate", 4 | "synonyms":[ 5 | "F6P", 6 | "6-O-Phosphono-D-fructose", 7 | "D-Fructose 6-phosphoric acid" 8 | ], 9 | "identifiers":[ 10 | {"namespace":"doi"}, 11 | {"value":"10.1074/jbc.M109.095570"} 12 | ], 13 | "taxon":{ 14 | "ncbi_taxonomy_id":562, 15 | "name":"Escherichia coli str. K-12 substr. MG1655", 16 | "canon_ancestors":[ 17 | {"ncbi_taxonomy_id": "", 18 | "name": ""} 19 | ] 20 | } 21 | } -------------------------------------------------------------------------------- /docs/metabolites/FDP.json: -------------------------------------------------------------------------------- 1 | { 2 | "type":"metabolite", 3 | "name":"Fluorescein diphosphate", 4 | "synonyms":[ 5 | "FDP", 6 | "3-Oxo-3H-spiro[2-benzofuran-1,9'-xanthene]-3',6'-diyl bis(phosphate)" 7 | ], 8 | "identifiers":[ 9 | {"namespace":"doi"}, 10 | {"value":"10.1074/jbc.M109.095570"} 11 | ], 12 | "taxon":{ 13 | "ncbi_taxonomy_id":562, 14 | "name":"Escherichia coli str. K-12 substr. MG1655", 15 | "canon_ancestors":[ 16 | {"ncbi_taxonomy_id": "", 17 | "name": ""} 18 | ] 19 | } 20 | } -------------------------------------------------------------------------------- /docs/metabolites/PYR.json: -------------------------------------------------------------------------------- 1 | { 2 | "type":"metabolite", 3 | "name":"Pyruvate", 4 | "synonyms":[ 5 | "PYR", 6 | "2-Oxopropanoate", 7 | "Methylglyoxylate", 8 | "57-60-3", 9 | "UNII-HO43T60JMG" 10 | ], 11 | "identifiers":[ 12 | {"namespace":"doi"}, 13 | {"value":"10.1074/jbc.M109.095570"} 14 | ], 15 | "taxon":{ 16 | "ncbi_taxonomy_id":562, 17 | "name":"Escherichia coli str. K-12 substr. 
MG1655", 18 | "canon_ancestors":[ 19 | {"ncbi_taxonomy_id": "", 20 | "name": ""} 21 | ] 22 | } 23 | } -------------------------------------------------------------------------------- /docs/metabolites/PEP.json: -------------------------------------------------------------------------------- 1 | { 2 | "type":"metabolite", 3 | "name":"Phosphoenolpyruvate", 4 | "synonyms":[ 5 | "PEP", 6 | "Phosphoenolpyruvic acid", 7 | "2-(phosphonooxy)prop-2-enoic acid", 8 | "138-08-9" 9 | ], 10 | "identifiers":[ 11 | {"namespace":"doi"}, 12 | {"value":"10.1074/jbc.M109.095570"} 13 | ], 14 | "taxon":{ 15 | "ncbi_taxonomy_id":562, 16 | "name":"Escherichia coli str. K-12 substr. MG1655", 17 | "canon_ancestors":[ 18 | {"ncbi_taxonomy_id": "", 19 | "name": ""} 20 | ] 21 | } 22 | } -------------------------------------------------------------------------------- /datanator/data_source/builds/full.py: -------------------------------------------------------------------------------- 1 | from datanator.core import common_schema 2 | import datetime 3 | import pkg_resources 4 | import sys 5 | 6 | 7 | def build(): 8 | old_stdout = sys.stdout 9 | log_filename = pkg_resources.resource_filename( 10 | 'datanator', "builds/logs/{}.txt".format(str(datetime.datetime.now()))) 11 | with open(log_filename, "w") as log_file: 12 | sys.stdout = log_file 13 | cs = common_schema.CommonSchema(load_content=True, verbose=True, 14 | load_entire_small_dbs=True) 15 | cs.upload_backup() 16 | sys.stdout = old_stdout 17 | -------------------------------------------------------------------------------- /docs/metabolites/ACCOATAXNEW.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | 4 | "type": "metabolite", 5 | 6 | "name": "acetyl-CoA", 7 | 8 | "identifiers": [ 9 | 10 | {"namespace": "inchikey", 11 | "value": "SLZBFCDCINBPY-ZSJPKINUSA-N"} 12 | 13 | ] 14 | 15 | }, 16 | 17 | "value": [ 18 | 19 | {"type": "metabolite_concentration", "value": "0.26", "units": 
"μmol/g DCW", 20 | 21 | "substrate": "NOX01"}, 22 | 23 | {"type": "metabolite_concentration", "value": "0.29", "units": "μmol/g DCW", 24 | 25 | "substrate": "NOX02"} 26 | 27 | ] 28 | } 29 | -------------------------------------------------------------------------------- /docs/metabolites/DHAP.json: -------------------------------------------------------------------------------- 1 | { 2 | "type":"metabolite", 3 | "name":"Dihydroxyacetone Phosphate", 4 | "synonyms":[ 5 | "DHAP", 6 | "1-hydroxy-3-(phosphonooxy)-2-propanone", 7 | "3-Hydroxy-2-oxopropyl dihydrogen phosphate" 8 | ], 9 | "identifiers":[ 10 | {"namespace":"doi"}, 11 | {"value":"10.1074/jbc.M109.095570"} 12 | ], 13 | "taxon":{ 14 | "ncbi_taxonomy_id":562, 15 | "name":"Escherichia coli str. K-12 substr. MG1655", 16 | "canon_ancestors":[ 17 | {"ncbi_taxonomy_id": "", 18 | "name": ""} 19 | ] 20 | } 21 | } -------------------------------------------------------------------------------- /scripts/quilt_backup.py: -------------------------------------------------------------------------------- 1 | import wc_utils.quilt 2 | 3 | 4 | def main(): 5 | '''Backup or download data from/to Quilt 6 | ''' 7 | path = input("BSON file location:\n") 8 | package = 'datanator' 9 | manager = wc_utils.quilt.QuiltManager(path=path, package=package) 10 | backup = input("Backup or Download (choose 'backup' or 'download')?\n") 11 | if backup.lower() == 'backup': 12 | message = input("Optionally, enter a commit message:\n") 13 | manager.upload_package(message=message or None) 14 | else: 15 | manager.download_package() 16 | 17 | 18 | if __name__ == '__main__': 19 | main() 20 | -------------------------------------------------------------------------------- /datanator/data_source/builds/test.py: -------------------------------------------------------------------------------- 1 | from datanator.core import common_schema 2 | import datetime 3 | import pkg_resources 4 | import sys 5 | 6 | 7 | def build(): 8 | old_stdout = sys.stdout 9 | 
log_filename = pkg_resources.resource_filename( 10 | 'datanator', "builds/logs/{}.txt".format(str(datetime.datetime.now()))) 11 | with open(log_filename, "w") as log_file: 12 | sys.stdout = log_file 13 | cs = common_schema.CommonSchema(load_content=True, clear_content=True, 14 | verbose=True, test=True, max_entries=10) 15 | # cs.upload_backup() 16 | sys.stdout = old_stdout 17 | -------------------------------------------------------------------------------- /docs/metabolites/G6P.json: -------------------------------------------------------------------------------- 1 | { 2 | "type":"metabolite", 3 | "name":"Glucose 6-Phosphate", 4 | "synonyms":[ 5 | "G6P", 6 | "6-O-Phosphono-α-D-glucopyranose", 7 | "D-Glucopyranose 6-phosphate", 8 | "D-glucose 6-(dihydrogen phosphate)" 9 | ], 10 | "identifiers":[ 11 | {"namespace":"doi"}, 12 | {"value":"10.1074/jbc.M109.095570"} 13 | ], 14 | "taxon":{ 15 | "ncbi_taxonomy_id":562, 16 | "name":"Escherichia coli str. K-12 substr. MG1655", 17 | "canon_ancestors":[ 18 | {"ncbi_taxonomy_id": "", 19 | "name": ""} 20 | ] 21 | } 22 | } -------------------------------------------------------------------------------- /datanator/data_source/builds/test_log.py: -------------------------------------------------------------------------------- 1 | from datanator.core import common_schema 2 | import datetime 3 | import pkg_resources 4 | import sys 5 | 6 | 7 | def build(): 8 | old_stdout = sys.stdout 9 | log_filename = pkg_resources.resource_filename( 10 | 'datanator', "builds/logs/{}.txt".format(str(datetime.datetime.now()))) 11 | with open(log_filename, "w") as log_file: 12 | sys.stdout = log_file 13 | # cs = common_schema.CommonSchema(load_content=True, clear_content=True, 14 | # verbose=True, test=True, max_entries=10) 15 | # cs.upload_backup() 16 | sys.stdout = old_stdout 17 | -------------------------------------------------------------------------------- /datanator/data_source/builds/med.py: 
-------------------------------------------------------------------------------- 1 | from datanator.core import common_schema 2 | import datetime 3 | import pkg_resources 4 | import sys 5 | 6 | 7 | def build(): 8 | old_stdout = sys.stdout 9 | log_filename = pkg_resources.resource_filename( 10 | 'datanator', "builds/logs/{}.txt".format(str(datetime.datetime.now()))) 11 | with open(log_filename, "w") as log_file: 12 | sys.stdout = log_file 13 | cs = common_schema.CommonSchema(load_content=True, clear_content=True, 14 | max_entries=20, load_entire_small_dbs=True, verbose=True) 15 | # cs.upload_backup() 16 | sys.stdout = old_stdout 17 | -------------------------------------------------------------------------------- /docs/metabolites/R5P.json: -------------------------------------------------------------------------------- 1 | { 2 | "type":"metabolite", 3 | "name":"Ribose 5-Phosphate", 4 | "synonyms":[ 5 | "R5P", 6 | "Ribose phosphate", 7 | "Ribose 5-monophosphate", 8 | "5-O-phosphono-D-ribose", 9 | "D-Ribose 5-(dihydrogen phosphate)" 10 | ], 11 | "identifiers":[ 12 | {"namespace":"doi"}, 13 | {"value":"10.1074/jbc.M109.095570"} 14 | ], 15 | "taxon":{ 16 | "ncbi_taxonomy_id":562, 17 | "name":"Escherichia coli str. K-12 substr. MG1655", 18 | "canon_ancestors":[ 19 | {"ncbi_taxonomy_id": "", 20 | "name": ""} 21 | ] 22 | } 23 | } -------------------------------------------------------------------------------- /docs/metabolites/S7P.json: -------------------------------------------------------------------------------- 1 | { 2 | "type":"metabolite", 3 | "name":"sedoheptulose-7-phosphate", 4 | "synonyms":[ 5 | "S7P", 6 | "7-(dihydrogen phosphate) sedoheptulose", 7 | "{[(2R,3S,4R,5S)-3,4,5,6-tetrahydroxy-6-(hydroxymethyl)oxan-2-yl]methoxy}phosphonic acid" 8 | ], 9 | "identifiers":[ 10 | {"namespace":"doi"}, 11 | {"value":"10.1074/jbc.M109.095570"} 12 | ], 13 | "taxon":{ 14 | "ncbi_taxonomy_id":562, 15 | "name":"Escherichia coli str. K-12 substr. 
MG1655", 16 | "canon_ancestors":[ 17 | {"ncbi_taxonomy_id": "", 18 | "name": ""} 19 | ] 20 | } 21 | } -------------------------------------------------------------------------------- /docs/metabolites/6PG.json: -------------------------------------------------------------------------------- 1 | { 2 | "type":"metabolite", 3 | "name":"6-Phosphogluconic acid", 4 | "synonyms":[ 5 | "6PG", 6 | "6-O-Phosphono-D-gluconic acid", 7 | "6-phosphogluconate", 8 | "6-phospho-D-gluconate", 9 | "D-gluconic acid 6-phosphate" 10 | ], 11 | "identifiers":[ 12 | {"namespace":"inchikey"}, 13 | {"value":"xxxxx"} 14 | ], 15 | "taxon":{ 16 | "ncbi_taxonomy_id":562, 17 | "name":"Escherichia coli str. K-12 substr. MG1655", 18 | "canon_ancestors":[ 19 | {"ncbi_taxonomy_id": "", 20 | "name": ""} 21 | ] 22 | } 23 | } -------------------------------------------------------------------------------- /docs/metricbeat.docker.yml: -------------------------------------------------------------------------------- 1 | metricbeat.config: 2 | modules: 3 | path: ${path.config}/modules.d/*.yml 4 | # Reload module configs as they change: 5 | reload.enabled: false 6 | 7 | metricbeat.autodiscover: 8 | providers: 9 | - type: docker 10 | hints.enabled: true 11 | 12 | metricbeat.modules: 13 | - module: docker 14 | metricsets: 15 | - "container" 16 | - "cpu" 17 | - "diskio" 18 | - "healthcheck" 19 | - "info" 20 | - "memory" 21 | - "network" 22 | hosts: ["unix:///var/run/docker.sock"] 23 | period: 10s 24 | enabled: true 25 | 26 | output.elasticsearch: 27 | hosts: ['elasticsearch:9200'] 28 | setup.kibana: 29 | host: "kibana:5601" 30 | -------------------------------------------------------------------------------- /docs/metabolites/GAP.json: -------------------------------------------------------------------------------- 1 | { 2 | "type":"metabolite", 3 | "name":"3-Phosphoglyceraldehyde", 4 | "synonyms":[ 5 | "GAP", 6 | "glyceraldehyde 3-phosphate", 7 | "2-hydroxy-3-(phosphonooxy)-Propanal", 8 | 
"2-Hydroxy-3-oxopropyl dihydrogen phosphate", 9 | "2-Hydroxy-3-oxopropyldihydrogenphosphat", 10 | "3-phosphoglyceraldehyde" 11 | ], 12 | "identifiers":[ 13 | {"namespace":"doi"}, 14 | {"value":"10.1074/jbc.M109.095570"} 15 | ], 16 | "taxon":{ 17 | "ncbi_taxonomy_id":562, 18 | "name":"Escherichia coli str. K-12 substr. MG1655", 19 | "canon_ancestors":[ 20 | {"ncbi_taxonomy_id": "", 21 | "name": ""} 22 | ] 23 | } 24 | } -------------------------------------------------------------------------------- /docs/deployment.rst: -------------------------------------------------------------------------------- 1 | Deployment 2 | ============ 3 | The following instructions describe how to deploy ``datanator`` to the heroku server 4 | 5 | We are deploying the backend API server via a container using the karrlab/wc_env_dependencies:latest image. 6 | 7 | The commands for deploying the container are the following:: 8 | 9 | heroku login 10 | heroku container:login 11 | heroku container:push web -a datanator 12 | heroku container:release web -a datanator 13 | 14 | In order to change the configuration of the container, look at the Dockerfile for datanator. The gunicorn production server can be 15 | adjusted accordingly in order to accommodate the number of users. 
def disable_warnings():
    """ Disable warning messages from openbabel and urllib """
    # Raise openbabel's log threshold to errors only, silencing warnings.
    openbabel.obErrorLog.SetOutputLevel(openbabel.obError)
    # Suppress urllib3's InsecureRequestWarning (emitted for unverified
    # HTTPS requests made through requests' vendored urllib3).
    requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)


def enable_warnings():
    """ Enable warning messages from openbabel and urllib """
    # Lower openbabel's log threshold back down so warnings are emitted.
    openbabel.obErrorLog.SetOutputLevel(openbabel.obWarning)
    # NOTE(review): ``requests.packages.urllib3.warnings`` resolves to the
    # stdlib ``warnings`` module that urllib3 imports, and resetwarnings()
    # clears *all* warning filters process-wide, not just the one disabled
    # above -- confirm this broad reset is intended.
    requests.packages.urllib3.warnings.resetwarnings()
def main():
    """Load the pickled BRENDA constants dump and store it in MongoDB.

    Reads ``~/karr_lab/datanator/docs/brenda/brenda.pkl`` as raw bytes and
    inserts them as a single BSON ``Binary`` document into the
    ``brenda_constants`` collection of the ``test`` database.
    """
    db = 'test'
    collection_str = 'brenda_constants'
    # Read the mongodb config section once instead of once per credential.
    mongo_conf = datanator.config.core.get_config()['datanator']['mongodb']
    username = mongo_conf['user']
    password = mongo_conf['password']
    MongoDB = mongo_conf['server']
    manager = mongo_util.MongoUtil(MongoDB=MongoDB, db=db, username=username,
                                   password=password, collection_str=collection_str)

    # Store the raw pickle bytes directly; unpickling with pickle.load()
    # was unnecessary since the document stores the serialized blob.
    with open(str(Path('~/karr_lab/datanator/docs/brenda/brenda.pkl').expanduser()), 'rb') as f:
        raw_bytes = f.read()
    # The original referenced undefined names ``coll`` and ``thebytes``
    # (NameError at runtime) and used the deprecated ``insert``. Use the
    # collection via the manager's database handle and insert_one instead.
    # NOTE(review): assumes MongoUtil exposes ``db_obj`` (as used by other
    # datanator sources, e.g. UniprotNoSQL tests) -- confirm.
    manager.db_obj[collection_str].insert_one({'bin-data': Binary(raw_bytes)})
class TestQueryDemo(unittest.TestCase):
    """Integration tests for ``query_demo.QueryDemo`` against the test DB."""

    @classmethod
    def setUpClass(cls):
        # One shared query object for every test in this class.
        test_conf = config.TestConfig()
        cls.src = query_demo.QueryDemo(MongoDB=test_conf.SERVER,
                                       db='datanator-test',
                                       collection_str="taxon_tree",
                                       username=test_conf.USERNAME,
                                       password=test_conf.PASSWORD)

    @classmethod
    def tearDownClass(cls):
        # Release the MongoDB connection opened in setUpClass.
        cls.src.client.close()

    def test_get_canon_ancestors(self):
        # A known taxon (S. aureus, 1280) lists 'cellular organisms' first.
        ancestors = self.src.get_canon_ancestors(1280)
        expected_root = {'ncbi_taxonomy_id': 131567, 'name': 'cellular organisms'}
        self.assertEqual(ancestors[0], expected_root)
        # An unknown taxon id yields an empty ancestor list.
        self.assertEqual(self.src.get_canon_ancestors(0), [])
and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/protein_localization/Experimental_v4.00_PSortdb/P50307.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "Cytoplasmic protein", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "name_space": "uniprot_id", 9 | "value": "P50307" 10 | }, 11 | { 12 | "name_space": "Refseq_Accession", 13 | "value": null 14 | }, 15 | { 16 | "name_space": "Other_Accession", 17 | "value": null 18 | } 19 | ] 20 | }, 21 | "value": { 22 | "experimental_localization": [ 23 | "Cytoplasmic" 24 | ], 25 | "secondary_localizaton": [] 26 | }, 27 | "genotype": { 28 | "taxon": { 29 | "ncbi_taxonomy_id": 1280, 30 | "name": "Staphylococcus aureus" 31 | } 32 | }, 33 | "environment": { 34 | "GramStain": "Gram positive" 35 | }, 36 | "source": { 37 | "namespace": "ePSORTdb", 38 | "value": "Version 3" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Experimental_v4.00_PSortdb/P01553.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "Enterotoxin type C-1 precursor (SEC1)", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "name_space": "uniprot_id", 9 | "value": 
"P01553" 10 | }, 11 | { 12 | "name_space": "Refseq_Accession", 13 | "value": null 14 | }, 15 | { 16 | "name_space": "Other_Accession", 17 | "value": null 18 | } 19 | ] 20 | }, 21 | "value": { 22 | "experimental_localization": [ 23 | "Extracellular" 24 | ], 25 | "secondary_localizaton": [] 26 | }, 27 | "genotype": { 28 | "taxon": { 29 | "ncbi_taxonomy_id": 1280, 30 | "name": "Staphylococcus aureus" 31 | } 32 | }, 33 | "environment": { 34 | "GramStain": "Gram positive" 35 | }, 36 | "source": { 37 | "namespace": "ePSORTdb", 38 | "value": "Version 3" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Experimental_v4.00_PSortdb/P34071.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "Enterotoxin type C-2 precursor (SEC2)", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "name_space": "uniprot_id", 9 | "value": "P34071" 10 | }, 11 | { 12 | "name_space": "Refseq_Accession", 13 | "value": null 14 | }, 15 | { 16 | "name_space": "Other_Accession", 17 | "value": null 18 | } 19 | ] 20 | }, 21 | "value": { 22 | "experimental_localization": [ 23 | "Extracellular" 24 | ], 25 | "secondary_localizaton": [] 26 | }, 27 | "genotype": { 28 | "taxon": { 29 | "ncbi_taxonomy_id": 1280, 30 | "name": "Staphylococcus aureus" 31 | } 32 | }, 33 | "environment": { 34 | "GramStain": "Gram positive" 35 | }, 36 | "source": { 37 | "namespace": "ePSORTdb", 38 | "value": "Version 3" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Experimental_v4.00_PSortdb/P06886.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "Toxic shock syndrome toxin-1 precursor (TSST-1)", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "name_space": "uniprot_id", 9 | "value": "P06886" 10 | }, 
11 | { 12 | "name_space": "Refseq_Accession", 13 | "value": null 14 | }, 15 | { 16 | "name_space": "Other_Accession", 17 | "value": null 18 | } 19 | ] 20 | }, 21 | "value": { 22 | "experimental_localization": [ 23 | "Extracellular" 24 | ], 25 | "secondary_localizaton": [] 26 | }, 27 | "genotype": { 28 | "taxon": { 29 | "ncbi_taxonomy_id": 1280, 30 | "name": "Staphylococcus aureus" 31 | } 32 | }, 33 | "environment": { 34 | "GramStain": "Gram positive" 35 | }, 36 | "source": { 37 | "namespace": "ePSORTdb", 38 | "value": "Version 3" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Experimental_v4.00_PSortdb/P10335.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "Lipase precursor (Glycerol ester hydrolase) ", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "name_space": "uniprot_id", 9 | "value": "P10335" 10 | }, 11 | { 12 | "name_space": "Refseq_Accession", 13 | "value": null 14 | }, 15 | { 16 | "name_space": "Other_Accession", 17 | "value": null 18 | } 19 | ] 20 | }, 21 | "value": { 22 | "experimental_localization": [ 23 | "Extracellular" 24 | ], 25 | "secondary_localizaton": [] 26 | }, 27 | "genotype": { 28 | "taxon": { 29 | "ncbi_taxonomy_id": 1280, 30 | "name": "Staphylococcus aureus" 31 | } 32 | }, 33 | "environment": { 34 | "GramStain": "Gram positive" 35 | }, 36 | "source": { 37 | "namespace": "ePSORTdb", 38 | "value": "Version 3" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/about.rst: -------------------------------------------------------------------------------- 1 | About 2 | ===== 3 | 4 | ---------------------- 5 | License 6 | ---------------------- 7 | 8 | The software is released under the MIT license: 9 | 10 | .. 
literalinclude:: ../LICENSE 11 | :language: text 12 | 13 | ---------------------- 14 | Development team 15 | ---------------------- 16 | 17 | This package was developed by the following researchers in the `Karr Lab `_ at the Icahn School of Medicine at Mount Sinai in New York, USA: 18 | 19 | * `Yosef Roth `_ 20 | * `Saahith Pochiraju `_ 21 | * Balazs Szigeti 22 | * `Jonathan Karr `_ 23 | 24 | ---------------------- 25 | Acknowledgements 26 | ---------------------- 27 | 28 | This work was supported by a National Institute of Health MIRA award [grant number 1 R35 GM 119771-01]; a National Science Foundation INSPIRE award [grant number 1649014]; and the National Science Foundation / ERASynBio [grant numbers 1548123, 335672]. 29 | 30 | ---------------------- 31 | Questions and comments 32 | ---------------------- 33 | 34 | Please contact the `Karr Lab `_ with any questions or comments. 35 | -------------------------------------------------------------------------------- /docs/protein_localization/Experimental_v4.00_PSortdb/P01552.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "Enterotoxin type B precursor (SEB) ", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "name_space": "uniprot_id", 9 | "value": "P01552" 10 | }, 11 | { 12 | "name_space": "Refseq_Accession", 13 | "value": null 14 | }, 15 | { 16 | "name_space": "Other_Accession", 17 | "value": null 18 | } 19 | ] 20 | }, 21 | "value": { 22 | "experimental_localization": [ 23 | "Extracellular" 24 | ], 25 | "secondary_localizaton": [] 26 | }, 27 | "genotype": { 28 | "taxon": { 29 | "ncbi_taxonomy_id": 1280, 30 | "name": "Staphylococcus aureus", 31 | "canon_anctors": [] 32 | } 33 | }, 34 | "environment": { 35 | "GramStain": "Gram positive" 36 | }, 37 | "source": { 38 | "namespace": "ePSORTdb", 39 | "value": "Version 3" 40 | } 41 | } -------------------------------------------------------------------------------- 
/docs/protein_localization/Experimental_v4.00_PSortdb/P09978.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "Phospholipase C precursor (Beta-hemolysin) (Beta-toxin) (Sphingomyelinase) (SMase) ", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "name_space": "uniprot_id", 9 | "value": "P09978" 10 | }, 11 | { 12 | "name_space": "Refseq_Accession", 13 | "value": null 14 | }, 15 | { 16 | "name_space": "Other_Accession", 17 | "value": null 18 | } 19 | ] 20 | }, 21 | "value": { 22 | "experimental_localization": [ 23 | "Extracellular" 24 | ], 25 | "secondary_localizaton": [] 26 | }, 27 | "genotype": { 28 | "taxon": { 29 | "ncbi_taxonomy_id": 1280, 30 | "name": "Staphylococcus aureus" 31 | } 32 | }, 33 | "environment": { 34 | "GramStain": "Gram positive" 35 | }, 36 | "source": { 37 | "namespace": "ePSORTdb", 38 | "value": "Version 3" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Experimental_v4.00_PSortdb/P81177.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "Zinc metalloproteinase aureolysin precursor (Staphylococcus aureus neutral proteinase) ", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "name_space": "uniprot_id", 9 | "value": "P81177" 10 | }, 11 | { 12 | "name_space": "Refseq_Accession", 13 | "value": null 14 | }, 15 | { 16 | "name_space": "Other_Accession", 17 | "value": null 18 | } 19 | ] 20 | }, 21 | "value": { 22 | "experimental_localization": [ 23 | "Extracellular" 24 | ], 25 | "secondary_localizaton": [] 26 | }, 27 | "genotype": { 28 | "taxon": { 29 | "ncbi_taxonomy_id": 1280, 30 | "name": "Staphylococcus aureus" 31 | } 32 | }, 33 | "environment": { 34 | "GramStain": "Gram positive" 35 | }, 36 | "source": { 37 | "namespace": "ePSORTdb", 38 | "value": "Version 3" 39 | } 40 | } 
-------------------------------------------------------------------------------- /docs/protein_localization/Experimental_v4.00_PSortdb/P00644.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "Thermonuclease precursor (EC 31311) (TNase) (Micrococcal nuclease) (Staphylococcal nuclease)", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "name_space": "uniprot_id", 9 | "value": "P00644" 10 | }, 11 | { 12 | "name_space": "Refseq_Accession", 13 | "value": null 14 | }, 15 | { 16 | "name_space": "Other_Accession", 17 | "value": null 18 | } 19 | ] 20 | }, 21 | "value": { 22 | "experimental_localization": [ 23 | "Extracellular" 24 | ], 25 | "secondary_localizaton": [] 26 | }, 27 | "genotype": { 28 | "taxon": { 29 | "ncbi_taxonomy_id": 1280, 30 | "name": "Staphylococcus aureus" 31 | } 32 | }, 33 | "environment": { 34 | "GramStain": "Gram positive" 35 | }, 36 | "source": { 37 | "namespace": "ePSORTdb", 38 | "value": "Version 3" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Experimental_v4.00_PSortdb/P45723.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "1-phosphatidylinositol phosphodiesterase precursor (EC 31410) (Phosphatidylinositol-specific phospholipase C) (PI-PLC)", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "name_space": "uniprot_id", 9 | "value": "P45723" 10 | }, 11 | { 12 | "name_space": "Refseq_Accession", 13 | "value": null 14 | }, 15 | { 16 | "name_space": "Other_Accession", 17 | "value": null 18 | } 19 | ] 20 | }, 21 | "value": { 22 | "experimental_localization": [ 23 | "Extracellular" 24 | ], 25 | "secondary_localizaton": [] 26 | }, 27 | "genotype": { 28 | "taxon": { 29 | "ncbi_taxonomy_id": 1280, 30 | "name": "Staphylococcus aureus" 31 | } 32 | }, 33 | "environment": { 34 | "GramStain": 
'''
Fork from git@github.com:mnowotka/chembl_ikey.git
'''

from itertools import product
from string import ascii_uppercase

# Three-letter alphabet for 14-bit chunks: every AAA-ZZZ triplet except
# those starting with 'E' and the reserved range TAA-TTV (16384 = 2**14
# codes remain). d26 is the full two-letter alphabet for 9-bit chunks.
t26 = [''.join(c) for c in product(ascii_uppercase, repeat=3)
       if c[0] != 'E' and not ('TAA' <= ''.join(c) <= 'TTV')]
d26 = [''.join(c) for c in product(ascii_uppercase, repeat=2)]


def base26_triplet_1(a):
    """Return the three-letter code for the first 14-bit chunk of *a* (bytes 0-1)."""
    index = a[0] | ((a[1] & 0x3f) << 8)
    return t26[index]


def base26_triplet_2(a):
    """Return the three-letter code for the second 14-bit chunk of *a* (bytes 1-3)."""
    index = ((a[1] & 0xc0) | (a[2] << 8) | ((a[3] & 0x0f) << 16)) >> 6
    return t26[index]


def base26_triplet_3(a):
    """Return the three-letter code for the third 14-bit chunk of *a* (bytes 3-5)."""
    index = ((a[3] & 0xf0) | (a[4] << 8) | ((a[5] & 0x03) << 16)) >> 4
    return t26[index]


def base26_triplet_4(a):
    """Return the three-letter code for the fourth 14-bit chunk of *a* (bytes 5-6)."""
    index = ((a[5] & 0xfc) | (a[6] << 8)) >> 2
    return t26[index]


def base26_dublet_for_bits_28_to_36(a):
    """Return the two-letter code for bits 28-36 of *a* (bytes 3-4)."""
    index = ((a[3] & 0xf0) | ((a[4] & 0x1f) << 8)) >> 4
    return d26[index]


def base26_dublet_for_bits_56_to_64(a):
    """Return the two-letter code for bits 56-64 of *a* (bytes 7-8)."""
    index = a[7] | ((a[8] & 0x01) << 8)
    return d26[index]
class TestUniprotNoSQL(unittest.TestCase):
    """Smoke tests for ``uniprot_nosql.UniprotNoSQL``.

    These hit the real MongoDB server from the datanator config and download
    UniProt data, so they are integration tests rather than unit tests.
    """

    @classmethod
    def setUpClass(cls):
        # Scratch directory for any downloaded/cached files.
        cls.cache_dirname = tempfile.mkdtemp()
        db = 'test'
        username = datanator.config.core.get_config()['datanator']['mongodb']['user']
        password = datanator.config.core.get_config()['datanator']['mongodb']['password']
        MongoDB = datanator.config.core.get_config()['datanator']['mongodb']['server']
        # max_entries=20 keeps the load test small.
        cls.src = uniprot_nosql.UniprotNoSQL(MongoDB=MongoDB, db=db, max_entries=20,
                                             username=username, password=password, collection_str='test_uniprot')

    @classmethod
    def tearDownClass(cls):
        shutil.rmtree(cls.cache_dirname)
        # Drop the scratch collection so reruns start from a clean state.
        cls.src.db_obj.drop_collection(cls.src.collection_str)

    # @unittest.skip('large single file download')
    def test_proper_loading(self):
        # Only checks that the load completes without raising; the count
        # assertions below were disabled, presumably because document
        # counts vary between runs -- confirm before re-enabling.
        self.src.load_uniprot()
        # count = uni.count()
        # self.assertEqual(count, 10)
        # self.assertNotEqual(uni.find_one()['gene_name'], None)

    def test_fill_species_name(self):
        # Back-fills species names on existing documents; asserts nothing,
        # so this only verifies the call completes without raising.
        self.src.fill_species_name()
class TestWarningUtil(unittest.TestCase):
    """Tests that warning_util can silence and restore openbabel output."""

    # ADP SMILES string; converting it to InChI makes openbabel emit warnings.
    adp = 'NC1=C2N=CN(C3OC(COP([O-])(=O)OP([O-])([O-])=O)C(O)C3O)C2=NC=N1'

    def test_enable_warnings_openbabel(self):
        warning_util.enable_warnings()
        with CaptureOutput(termination_delay=0.1) as capturer:
            molecule_util.Molecule(structure=self.adp).to_inchi()
        # With warnings enabled, the conversion should print something.
        self.assertNotEqual(capturer.get_text(), '')

    def test_disable_warnings_openbabel(self):
        warning_util.disable_warnings()
        with CaptureOutput(termination_delay=0.1) as capturer:
            molecule_util.Molecule(structure=self.adp).to_inchi()
        # With warnings disabled, the conversion should be silent.
        self.assertEqual(capturer.get_text(), '')

    @unittest.skip('todo: implement')
    def test_disable_warnings_urllib3(self):
        # Import locally: ``requests`` is not imported at module level, so
        # the original body would raise NameError once this test is
        # unskipped and implemented.
        import requests
        warning_util.disable_warnings()
        with CaptureOutput(termination_delay=0.1) as capturer:
            response = requests.get('http://www.karrlab.org')
        self.assertEqual(capturer.get_text(), '')
self.assertEqual('InChI=1S/H2O', self.src.simplify_inchi(inchi)) 21 | inchi = None 22 | self.assertEqual('InChI = None', self.src.simplify_inchi(inchi)) 23 | 24 | def test_hash_inchi(self): 25 | inchi = 'InChI=1S/C6H12N2O4S2/c7-3(5(9)10)1-13-14-2-4(8)6(11)12' 26 | hashed = 'e0a402c94a0ecd52ec426756854592f76eece8fd3ffef2e7347fb6c5' 27 | self.assertEqual(hashed, self.src.hash_inchi(inchi)) 28 | self.assertEqual('InChI = None', self.src.hash_inchi(None)) 29 | 30 | def test_morphineInChIKey(self): 31 | key = self.src.inchi_to_inchikey("InChI=1S/C17H19NO3/c1-18-7-6-17-10-3-5-13(20)16(17)21-15-12(19)4-2-9(14(15)17)8-11(10)18/h2-5,10-11,13,16,19-20H,6-8H2,1H3/t10-,11+,13-,16-,17-/m0/s1") 32 | self.assertEqual(key,'BQJCRHHNABKAKU-KBQPJGBKSA-N') 33 | key_1 = self.src.inchi_to_inchikey('InChI=1S/H2O/h1H2') 34 | self.assertEqual(key_1, 'XLYOFNOQVPJJNP-UHFFFAOYSA-N') -------------------------------------------------------------------------------- /docs/protein_localization/Gram_Negative_WO_Outer_Membrane/WP_012241978.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "hypothetical protein", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "namespace": "Seq_ID", 9 | "value": "WP_012241978.1" 10 | } 11 | ] 12 | }, 13 | "value": { 14 | "PPSVM_Localization": null, 15 | "Profile_Localization": "Unknown", 16 | "Signal_Localization": "Unknown", 17 | "SCL-BLASTe_Localization": "Unknown", 18 | "CMSVM_Localization": "Unknown", 19 | "SCL-BLAST_Localization": "Unknown", 20 | "OMPMotif_Localization": null, 21 | "OMSVM_Localization": null, 22 | "Motif_Localization": "Unknown", 23 | "CytoSVM_Localization": "Unknown", 24 | "CWSVM_Localization": "Unknown", 25 | "ModHMM_Localization": "Unknown", 26 | "ECSVM_Localization": "Unknown", 27 | "Cytoplasmic Membrane_Score": 2.5, 28 | "Cellwall_Score": 2.5, 29 | "Extracellular_Score": 2.5, 30 | "Cytoplasmic_Score": 2.5, 31 | "Final_Localization": "Unknown", 
32 | "Final_Localization_2": null, 33 | "Secondary_Localization": null, 34 | "Final_Score": 2.5 35 | }, 36 | "source": { 37 | "namespace": "PSORT", 38 | "value": "Version 3.0" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Gram_Negative_WO_Outer_Membrane/WP_012242024.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "hypothetical protein", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "namespace": "Seq_ID", 9 | "value": "WP_012242024.1" 10 | } 11 | ] 12 | }, 13 | "value": { 14 | "PPSVM_Localization": null, 15 | "Profile_Localization": "Unknown", 16 | "Signal_Localization": "Unknown", 17 | "SCL-BLASTe_Localization": "Unknown", 18 | "CMSVM_Localization": "Unknown", 19 | "SCL-BLAST_Localization": "Unknown", 20 | "OMPMotif_Localization": null, 21 | "OMSVM_Localization": null, 22 | "Motif_Localization": "Unknown", 23 | "CytoSVM_Localization": "Unknown", 24 | "CWSVM_Localization": "Unknown", 25 | "ModHMM_Localization": "Unknown", 26 | "ECSVM_Localization": "Unknown", 27 | "Cytoplasmic Membrane_Score": 2.5, 28 | "Cellwall_Score": 2.5, 29 | "Extracellular_Score": 2.5, 30 | "Cytoplasmic_Score": 2.5, 31 | "Final_Localization": "Unknown", 32 | "Final_Localization_2": null, 33 | "Secondary_Localization": null, 34 | "Final_Score": 2.5 35 | }, 36 | "source": { 37 | "namespace": "PSORT", 38 | "value": "Version 3.0" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Gram_Negative_WO_Outer_Membrane/WP_041633705.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "hypothetical protein", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "namespace": "Seq_ID", 9 | "value": "WP_041633705.1" 10 | } 11 | ] 12 | }, 13 | "value": { 14 | 
"PPSVM_Localization": null, 15 | "Profile_Localization": "Unknown", 16 | "Signal_Localization": "Unknown", 17 | "SCL-BLASTe_Localization": "Unknown", 18 | "CMSVM_Localization": "Unknown", 19 | "SCL-BLAST_Localization": "Unknown", 20 | "OMPMotif_Localization": null, 21 | "OMSVM_Localization": null, 22 | "Motif_Localization": "Unknown", 23 | "CytoSVM_Localization": "Unknown", 24 | "CWSVM_Localization": "Unknown", 25 | "ModHMM_Localization": "Unknown", 26 | "ECSVM_Localization": "Unknown", 27 | "Cytoplasmic Membrane_Score": 2.5, 28 | "Cellwall_Score": 2.5, 29 | "Extracellular_Score": 2.5, 30 | "Cytoplasmic_Score": 2.5, 31 | "Final_Localization": "Unknown", 32 | "Final_Localization_2": null, 33 | "Secondary_Localization": null, 34 | "Final_Score": 2.5 35 | }, 36 | "source": { 37 | "namespace": "PSORT", 38 | "value": "Version 3.0" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Gram_Negative_WO_Outer_Membrane/WP_041633707.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "hypothetical protein", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "namespace": "Seq_ID", 9 | "value": "WP_041633707.1" 10 | } 11 | ] 12 | }, 13 | "value": { 14 | "PPSVM_Localization": null, 15 | "Profile_Localization": "Unknown", 16 | "Signal_Localization": "Unknown", 17 | "SCL-BLASTe_Localization": "Unknown", 18 | "CMSVM_Localization": "Unknown", 19 | "SCL-BLAST_Localization": "Unknown", 20 | "OMPMotif_Localization": null, 21 | "OMSVM_Localization": null, 22 | "Motif_Localization": "Unknown", 23 | "CytoSVM_Localization": "Unknown", 24 | "CWSVM_Localization": "Unknown", 25 | "ModHMM_Localization": "Unknown", 26 | "ECSVM_Localization": "Unknown", 27 | "Cytoplasmic Membrane_Score": 2.5, 28 | "Cellwall_Score": 2.5, 29 | "Extracellular_Score": 2.5, 30 | "Cytoplasmic_Score": 2.5, 31 | "Final_Localization": "Unknown", 32 | 
"Final_Localization_2": null, 33 | "Secondary_Localization": null, 34 | "Final_Score": 2.5 35 | }, 36 | "source": { 37 | "namespace": "PSORT", 38 | "value": "Version 3.0" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Gram_Negative_WO_Outer_Membrane/WP_081423625.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "hypothetical protein", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "namespace": "Seq_ID", 9 | "value": "WP_081423625.1" 10 | } 11 | ] 12 | }, 13 | "value": { 14 | "PPSVM_Localization": null, 15 | "Profile_Localization": "Unknown", 16 | "Signal_Localization": "Unknown", 17 | "SCL-BLASTe_Localization": "Unknown", 18 | "CMSVM_Localization": "Unknown", 19 | "SCL-BLAST_Localization": "Unknown", 20 | "OMPMotif_Localization": null, 21 | "OMSVM_Localization": null, 22 | "Motif_Localization": "Unknown", 23 | "CytoSVM_Localization": "Unknown", 24 | "CWSVM_Localization": "Unknown", 25 | "ModHMM_Localization": "Unknown", 26 | "ECSVM_Localization": "Unknown", 27 | "Cytoplasmic Membrane_Score": 2.5, 28 | "Cellwall_Score": 2.5, 29 | "Extracellular_Score": 2.5, 30 | "Cytoplasmic_Score": 2.5, 31 | "Final_Localization": "Unknown", 32 | "Final_Localization_2": null, 33 | "Secondary_Localization": null, 34 | "Final_Score": 2.5 35 | }, 36 | "source": { 37 | "namespace": "PSORT", 38 | "value": "Version 3.0" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Gram_Negative_WO_Outer_Membrane/WP_012242018.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "transcriptional regulator", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "namespace": "Seq_ID", 9 | "value": "WP_012242018.1" 10 | } 11 | ] 12 | }, 13 | "value": { 14 | 
"PPSVM_Localization": null, 15 | "Profile_Localization": "Unknown", 16 | "Signal_Localization": "Unknown", 17 | "SCL-BLASTe_Localization": "Unknown", 18 | "CMSVM_Localization": "Unknown", 19 | "SCL-BLAST_Localization": "Unknown", 20 | "OMPMotif_Localization": null, 21 | "OMSVM_Localization": null, 22 | "Motif_Localization": "Unknown", 23 | "CytoSVM_Localization": "Unknown", 24 | "CWSVM_Localization": "Unknown", 25 | "ModHMM_Localization": "Unknown", 26 | "ECSVM_Localization": "Unknown", 27 | "Cytoplasmic Membrane_Score": 2.5, 28 | "Cellwall_Score": 2.5, 29 | "Extracellular_Score": 2.5, 30 | "Cytoplasmic_Score": 2.5, 31 | "Final_Localization": "Unknown", 32 | "Final_Localization_2": null, 33 | "Secondary_Localization": null, 34 | "Final_Score": 2.5 35 | }, 36 | "source": { 37 | "namespace": "PSORT", 38 | "value": "Version 3.0" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Gram_Negative_WO_Outer_Membrane/WP_012242006.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "formate--tetrahydrofolate ligase", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "namespace": "Seq_ID", 9 | "value": "WP_012242006.1" 10 | } 11 | ] 12 | }, 13 | "value": { 14 | "PPSVM_Localization": null, 15 | "Profile_Localization": "Unknown", 16 | "Signal_Localization": "Unknown", 17 | "SCL-BLASTe_Localization": "Unknown", 18 | "CMSVM_Localization": "Unknown", 19 | "SCL-BLAST_Localization": "Unknown", 20 | "OMPMotif_Localization": null, 21 | "OMSVM_Localization": null, 22 | "Motif_Localization": "Unknown", 23 | "CytoSVM_Localization": "Unknown", 24 | "CWSVM_Localization": "Unknown", 25 | "ModHMM_Localization": "Unknown", 26 | "ECSVM_Localization": "Unknown", 27 | "Cytoplasmic Membrane_Score": 2.5, 28 | "Cellwall_Score": 2.5, 29 | "Extracellular_Score": 2.5, 30 | "Cytoplasmic_Score": 2.5, 31 | "Final_Localization": "Unknown", 32 
| "Final_Localization_2": null, 33 | "Secondary_Localization": null, 34 | "Final_Score": 2.5 35 | }, 36 | "source": { 37 | "namespace": "PSORT", 38 | "value": "Version 3.0" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Gram_Negative_WO_Outer_Membrane/WP_012242014.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "hypothetical protein", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "namespace": "Seq_ID", 9 | "value": "WP_012242014.1" 10 | } 11 | ] 12 | }, 13 | "value": { 14 | "PPSVM_Localization": null, 15 | "Profile_Localization": "Unknown", 16 | "Signal_Localization": "Non-Cytoplasmic", 17 | "SCL-BLASTe_Localization": "Unknown", 18 | "CMSVM_Localization": "Unknown", 19 | "SCL-BLAST_Localization": "Unknown", 20 | "OMPMotif_Localization": null, 21 | "OMSVM_Localization": null, 22 | "Motif_Localization": "Unknown", 23 | "CytoSVM_Localization": "Unknown", 24 | "CWSVM_Localization": "Unknown", 25 | "ModHMM_Localization": "Unknown", 26 | "ECSVM_Localization": "Unknown", 27 | "Cytoplasmic Membrane_Score": 3.33, 28 | "Cellwall_Score": 3.33, 29 | "Extracellular_Score": 3.33, 30 | "Cytoplasmic_Score": 0.0, 31 | "Final_Localization": "Unknown", 32 | "Final_Localization_2": null, 33 | "Secondary_Localization": null, 34 | "Final_Score": 3.33 35 | }, 36 | "source": { 37 | "namespace": "PSORT", 38 | "value": "Version 3.0" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Gram_Negative_WO_Outer_Membrane/WP_012242027.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "thiamine biosynthesis lipoprotein", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "namespace": "Seq_ID", 9 | "value": "WP_012242027.1" 10 | } 11 | ] 12 | }, 13 | "value": { 
14 | "PPSVM_Localization": null, 15 | "Profile_Localization": "Unknown", 16 | "Signal_Localization": "Unknown", 17 | "SCL-BLASTe_Localization": "Unknown", 18 | "CMSVM_Localization": "Unknown", 19 | "SCL-BLAST_Localization": "Unknown", 20 | "OMPMotif_Localization": null, 21 | "OMSVM_Localization": null, 22 | "Motif_Localization": "Unknown", 23 | "CytoSVM_Localization": "Unknown", 24 | "CWSVM_Localization": "Unknown", 25 | "ModHMM_Localization": "Unknown", 26 | "ECSVM_Localization": "Unknown", 27 | "Cytoplasmic Membrane_Score": 2.5, 28 | "Cellwall_Score": 2.5, 29 | "Extracellular_Score": 2.5, 30 | "Cytoplasmic_Score": 2.5, 31 | "Final_Localization": "Unknown", 32 | "Final_Localization_2": null, 33 | "Secondary_Localization": null, 34 | "Final_Score": 2.5 35 | }, 36 | "source": { 37 | "namespace": "PSORT", 38 | "value": "Version 3.0" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Gram_Negative_WO_Outer_Membrane/WP_012242037.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "DUF951 domain-containing protein", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "namespace": "Seq_ID", 9 | "value": "WP_012242037.1" 10 | } 11 | ] 12 | }, 13 | "value": { 14 | "PPSVM_Localization": null, 15 | "Profile_Localization": "Unknown", 16 | "Signal_Localization": "Unknown", 17 | "SCL-BLASTe_Localization": "Unknown", 18 | "CMSVM_Localization": "Unknown", 19 | "SCL-BLAST_Localization": "Unknown", 20 | "OMPMotif_Localization": null, 21 | "OMSVM_Localization": null, 22 | "Motif_Localization": "Unknown", 23 | "CytoSVM_Localization": "Unknown", 24 | "CWSVM_Localization": "Unknown", 25 | "ModHMM_Localization": "Unknown", 26 | "ECSVM_Localization": "Unknown", 27 | "Cytoplasmic Membrane_Score": 2.5, 28 | "Cellwall_Score": 2.5, 29 | "Extracellular_Score": 2.5, 30 | "Cytoplasmic_Score": 2.5, 31 | "Final_Localization": 
"Unknown", 32 | "Final_Localization_2": null, 33 | "Secondary_Localization": null, 34 | "Final_Score": 2.5 35 | }, 36 | "source": { 37 | "namespace": "PSORT", 38 | "value": "Version 3.0" 39 | } 40 | } -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | datanator: 5 | image: karrlab/wc_env 6 | restart: always 7 | stdin_open: true 8 | tty: true 9 | ports: 10 | - "10001:5002" 11 | volumes: 12 | - "../:/root/karr_lab" 13 | - "/home/zl/.wc:/root/.wc" 14 | entrypoint: > 15 | bash -c "python3 -m pip install -e /root/karr_lab/pkg_utils/ 16 | && python3 -m pip install -e /root/karr_lab/wc_utils/ 17 | && python3 -m pip install -e /root/karr_lab/karr_lab_aws_manager/ 18 | && python3 -m pip install -e /root/karr_lab/datanator_query_python/ 19 | && python3 -m pip install -e /root/karr_lab/datanator/ 20 | && tail -f /dev/null" 21 | 22 | # mongo: 23 | # image: mongo:4.0.10 24 | # restart: always 25 | # volumes: 26 | # - ./datanator/data_source/cache/mongo:/data/db 27 | # - ../:/root/karr_lab 28 | # ports: 29 | # - "27017:27017" 30 | # depends_on: 31 | # - datanator 32 | 33 | # mongosetup: 34 | # image: lzy7071/mongo-curl:latest 35 | # volumes: 36 | # - ../:/root/karr_lab 37 | # entrypoint: [ "bash", "/root/karr_lab/datanator/scripts/mongorestore.sh" ] 38 | # restart: on-failure 39 | # depends_on: 40 | # - mongo 41 | 42 | # mongoexpress: 43 | # image: mongo-express:0.49.0 44 | # restart: always 45 | # ports: 46 | # - "8081:8081" 47 | # depends_on: 48 | # - mongo 49 | # restart: always 50 | # command: sh -c 'sleep 10 && tini -- node app' -------------------------------------------------------------------------------- /datanator/data_source/rna_halflife/back_fill_gene_name.py: -------------------------------------------------------------------------------- 1 | from datanator_query_python.query import 
class FillGeneName(mongo_util.MongoUtil):
    """Back-fill the ``gene_name`` field of documents in the rna_halflife collection.

    Holds three handles: a direct handle on the target collection, a query
    manager for rna_halflife, and a query manager for uniprot.
    """

    def __init__(self, server=None, db='datanator', collection_str='rna_halflife', username=None,
                 password=None, authSource='admin', readPreference='nearest', verbose=False, max_entries=float('inf')):
        """Connect to MongoDB and construct the query managers.

        Args:
            server (:obj:`str`, optional): MongoDB server address.
            db (:obj:`str`, optional): database name.
            collection_str (:obj:`str`, optional): collection to read and write.
            username (:obj:`str`, optional): MongoDB username.
            password (:obj:`str`, optional): MongoDB password.
            authSource (:obj:`str`, optional): authentication database.
            readPreference (:obj:`str`, optional): MongoDB read preference.
            verbose (:obj:`bool`, optional): whether to print progress.
            max_entries (:obj:`float`, optional): cap on the number of documents processed.
        """
        super().__init__(MongoDB=server, db=db, verbose=verbose, max_entries=max_entries,
                         username=username, password=password, authSource=authSource, readPreference=readPreference)
        # Direct (client, db, collection) handles for writes.
        self.client, self.db, self.collection = self.con_db(collection_str)
        # Read-side query managers; both point at the same db/collection.
        self.rna_query = query_rna_halflife.QueryRNA(server=server, username=username, password=password, verbose=verbose,
                                                     db=db, collection_str=collection_str, authDB=authSource, readPreference=readPreference)
        self.uniprot_query = query_uniprot.QueryUniprot(username=username, password=password, server=server, authSource=authSource,
                                                        database=db, collection_str=collection_str, readPreference=readPreference)

    def fill_with_oln(self):
        """Fill gene_name with 'ordered_locus_name' field.

        NOTE(review): unimplemented stub -- ``con_0``/``con_1`` define the
        intended filters (documents with no gene_name that do carry an
        ordered_locus_name) but no query is issued yet.
        """
        # Filter: documents whose gene_name is null.
        con_0 = {'gene_name': None}
        # Filter: documents that have halflives.ordered_locus_name.
        con_1 = {'halflives.ordered_locus_name': {'$exists': True}}
        pass
class TestCorumNoSQL(unittest.TestCase):
    """Integration test for ``pax_nosql.PaxNoSQL.load_content`` against a live
    MongoDB 'test' database (credentials come from the local datanator config).
    """

    def setUp(self):
        """Create a scratch cache directory and read MongoDB credentials."""
        self.cache_dirname = tempfile.mkdtemp()
        self.db = 'test'
        self.username = datanator.config.core.get_config()['datanator']['mongodb']['user']
        self.password = datanator.config.core.get_config()['datanator']['mongodb']['password']
        self.MongoDB = datanator.config.core.get_config()['datanator']['mongodb']['server']

    def tearDown(self):
        """Remove the scratch cache directory."""
        shutil.rmtree(self.cache_dirname)

    # only loads partial content because it takes too long to load everything
    def test_load_content(self):
        src = pax_nosql.PaxNoSQL(
            self.cache_dirname, self.MongoDB, self.db, verbose=True, max_entries=5,
            password=self.password, username=self.username)
        collection = src.load_content()
        # Collection.count() and Cursor.count() were deprecated in PyMongo 3.7
        # and removed in 4.0; use count_documents() / find_one() instead.
        self.assertEqual(collection.count_documents({}), 5)
        filter_0 = {'file_name': '882/882-WHOLE_ORGANISM-integrated.txt'}
        self.assertEqual(collection.count_documents(filter_0), 1)
        doc = collection.find_one(filter_0)
        self.assertEqual(doc['species_name'], 'D.vulgaris')
        self.assertEqual(doc['observation'][0]['string_id'], '882.DVU0949')
        filter_1 = {'file_name': '882/882-Desulfo_Lac_Exp_SC_zhang_2006.txt'}
        self.assertEqual(collection.count_documents(filter_1), 1)
        doc = collection.find_one(filter_1)
        self.assertEqual(doc['weight'], 20)
        self.assertEqual(doc['observation'][1]['string_id'], '882.DVU0142')
class Demo(mongo_util.MongoUtil):
    """Minimal example of upserting a document through ``MongoUtil``."""

    def __init__(self,
                 server_demo="someaddress",
                 db_demo="datanator-demo",
                 username_demo="username",
                 password_demo="password",
                 collection_str="demo-collection"):
        """Connect to MongoDB and keep a handle on the target collection."""
        super().__init__(MongoDB=server_demo,
                         db=db_demo,
                         username=username_demo,
                         password=password_demo)
        self.collection = self.db_obj[collection_str]

    def update_collection(self):
        """Update collection in db.

        Upserts a hard-coded demo document keyed on ``uniprot_id``:
        ``locale`` is overwritten and ``array_obj`` is unioned with any
        existing array via ``$addToSet``/``$each``.
        """
        record = {"uniprot_id": "P01234",
                  "locale": "cell membrane",
                  "array_obj": ["a", "c", "d"]}
        selector = {"uniprot_id": record["uniprot_id"]}
        changes = {"$set": {"locale": record["locale"]},
                   "$addToSet": {"array_obj": {"$each": record["array_obj"]}}}
        self.collection.update_one(selector, changes, upsert=True)


def main():
    # Credentials come from the schema-migration config section.
    conf = config.SchemaMigration()
    src = Demo(server_demo=conf.SERVER,
               username_demo=conf.USERNAME,
               password_demo=conf.PASSWORD,
               db_demo="test",
               collection_str="taxon-schema")
    src.update_collection()


if __name__ == "__main__":
    main()
from urllib.request import urlretrieve
import os
import shutil

def run(ensembl_info, top_dir):
    """Downloads the CDNA for a given sample, and creates a kallisto index file.
    The CDNA file is stored in a "CDNA_FILES" subdirectory within the top directory.
    The kallisto index files are stored within a "kallisto_index_files" subdirectory
    within the top directory.

    Args:
        ensembl_info (:obj:`object`): object with ``organism_strain`` and ``url``
            attributes identifying the Ensembl cDNA file to fetch.
        top_dir (:obj:`str`): the name of the directory where the overall data is being stored.

    NOTE(review): this also changes the process working directory to ``top_dir``
    (see ``download_cdna``); ``process_cdna`` relies on that because ``kallisto
    index`` writes its .idx into the CWD before it is moved.
    """
    download_cdna(ensembl_info, top_dir)
    process_cdna(ensembl_info, top_dir)


def download_cdna(ensembl_info, top_dir):
    # Fetch the cDNA archive into top_dir, then move it into CDNA_FILES.
    # Skipped entirely when the target file already exists.

    DIRNAME = "{}/CDNA_FILES".format(top_dir)
    if not os.path.isdir(DIRNAME):
        os.makedirs(DIRNAME)
    spec_name = ensembl_info.organism_strain
    file_name = "{}/{}.cdna.all.fa.gz".format(DIRNAME, spec_name)
    url = ensembl_info.url
    if not os.path.isfile(file_name):
        # Download lands in top_dir first, then is moved into CDNA_FILES.
        file = urlretrieve(url, '{}/{}.cdna.all.fa.gz'.format(top_dir, spec_name))
        shutil.move('{}/{}.cdna.all.fa.gz'.format(top_dir, spec_name), DIRNAME)
    # Side effect: the whole process now runs from top_dir (needed by process_cdna).
    os.chdir(top_dir)

def process_cdna(ensembl_info, top_dir):
    # Build a kallisto index for the downloaded cDNA, unless it already exists.
    DIRNAME = "{}/CDNA_FILES".format(top_dir)
    file_name = "{}/{}.cdna.all.fa.gz".format(DIRNAME, ensembl_info.organism_strain)
    KALLISTO_DIR = "{}/kallisto_index_files".format(top_dir)
    if not os.path.isdir(KALLISTO_DIR):
        os.makedirs(KALLISTO_DIR)
    if not os.path.isfile("{}/{}.idx".format(KALLISTO_DIR, ensembl_info.organism_strain)):
        # kallisto writes the .idx into the CWD (set to top_dir above),
        # then it is moved into the index directory.
        os.system("kallisto index -i {}.idx {}".format(ensembl_info.organism_strain, file_name))
        shutil.move("{}/{}.idx".format(top_dir, ensembl_info.organism_strain), KALLISTO_DIR)
4 | 5 | ## Coordinating contributions 6 | 7 | Before getting started, please contact the lead developers at [info@karrlab.org](mailto:info@karrlab.org) to coordinate your planned contributions with other ongoing efforts. Please also use GitHub issues to announce your plans to the community so that other developers can provide input into your plans and coordinate their own work. As the development community grows, we will institute additional infrastructure as needed such as a leadership committee and regular online meetings. 8 | 9 | ## Repository organization 10 | 11 | Datanator follows standard Python conventions: 12 | 13 | * `datanator/`: source code 14 | * `tests/`: tests 15 | * `docs/`: documentation 16 | * `setup.py`: installation script 17 | 18 | ## Coding convention 19 | 20 | Datanator follows standard Python style conventions: 21 | 22 | * Module names: `lower_snake_case` 23 | * Class names: `UpperCamelCase` 24 | * Function names: `lower_snake_case` 25 | * Variable names: `lower_snake_case` 26 | 27 | ## Testing 28 | 29 | We strive to have complete test coverage of Datanator. As such, all contributions to Datanator should be tested. The tests are located in the `tests` subdirectory. The tests are implemented using the `unittest` module. The tests can be executed by running `pytest tests`. 30 | 31 | Upon each push to GitHub, GitHub will trigger CircleCI to execute all of the tests. 32 | 33 | ## Documentation convention 34 | 35 | Datanator is documented using the napoleon Sphinx plugin. The documentation can be compiled by running `sphinx-build docs docs/_build/html`. 36 | 37 | ## Submitting changes 38 | 39 | Please use GitHub pull requests to submit changes. Each request should include a brief description of the new and/or modified features. 40 | 41 | ## Releasing and deploying new versions 42 | 43 | Contact [info@karrlab.org](mailto:info@karrlab.org) to request release and deployment of new changes. 
'''Converts tables in a SQLite database into JSON files.

Attributes:
    database: path to the SQLite database file
    query: query execution command in string format (a prefix such as
        ``"select * from "`` that each table name is appended to)
'''

import json
import os
import sqlite3
import pprint

class SQLToJSON():
    """Export the tables of a SQLite database as lists of row dicts."""

    def __init__(self, query, cache_dirname=None):
        """
        Args:
            query (:obj:`str`): SQL prefix executed for each table.
            cache_dirname (:obj:`str`, optional): path to the SQLite database file.
        """
        self.query = query
        self.cache_dirname = cache_dirname

    def db(self):
        """Open and return a new connection to the database."""
        return sqlite3.connect(self.cache_dirname)

    # returns all the table names in a sqlite database
    def table(self):
        """Return the names of all tables, closing the connection even on error."""
        conn = self.db()
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
            return [row[0] for row in cursor.fetchall()]
        finally:
            # Close the connection itself (the original only closed it via the
            # cursor back-reference, leaking it if execute() raised).
            conn.close()

    # one : return as one json file or not
    def query_table(self, table, one=True):
        """Run ``self.query + table`` and return the rows as dicts.

        Args:
            table (:obj:`str`): table name appended to the query prefix.
            one (:obj:`bool`, optional): when True (default), an empty result
                is returned as ``None`` instead of ``[]`` (original behavior
                preserved for backward compatibility).

        Returns:
            :obj:`list` of :obj:`dict` or :obj:`None`: one dict per row,
            keyed by column name.
        """
        conn = self.db()
        try:
            cur = conn.cursor()
            cur.execute(self.query + table)
            columns = [description[0] for description in cur.description]
            rows = [dict(zip(columns, row)) for row in cur.fetchall()]
        finally:
            conn.close()
        return (rows if rows else None) if one else rows


def main():
    database = './cache/SabioRk.sqlite'
    query = "select * from "
    collection_dir = './cache/SabioRk/'
    os.makedirs(os.path.dirname(collection_dir), exist_ok=True)

    temp = SQLToJSON(query, cache_dirname=database)

    for table in temp.table():
        # os.path.join with separate components (the original concatenated
        # everything into a single argument, defeating the purpose of join).
        file_name = os.path.join(collection_dir, table + '.json')
        result = temp.query_table(table)
        with open(file_name, "w") as f:
            f.write(json.dumps(result, indent=4))

if __name__ == '__main__':
    main()
class TestKeggOrgCode(unittest.TestCase):
    """Tests for ``gene_ortholog.KeggGeneOrtholog``.

    NOTE(review): these tests appear to depend on live external services
    (``get_html``/``parse_html``/``parse_gene_info``) and a live MongoDB --
    they are not hermetic; confirm before running in CI.
    """

    @classmethod
    def setUpClass(cls):
        # Scratch directory plus MongoDB credentials from the local config;
        # writes target the 'test' database, reads come from 'datanator'.
        cls.cache_dirname = tempfile.mkdtemp()
        db = 'test'
        username = datanator.config.core.get_config()['datanator']['mongodb']['user']
        password = datanator.config.core.get_config()['datanator']['mongodb']['password']
        MongoDB = datanator.config.core.get_config()['datanator']['mongodb']['server']
        cls.src = gene_ortholog.KeggGeneOrtholog(MongoDB, des_db=db, src_db='datanator', max_entries=10, username=username, password=password,
                                                 readPreference='nearest', authSource='admin', verbose=True)
        # KEGG-style organism:gene identifier used by the parse tests.
        cls.query = 'aly:ARALYDRAFT_486312'

    @classmethod
    def tearDownClass(cls):
        # Remove scratch files and drop the collection created during the run.
        shutil.rmtree(cls.cache_dirname)
        cls.src.des_db.drop_collection(cls.src.collection_str)

    @unittest.skip('passed')
    def test_parse_html(self):
        # Smoke test: print up to max_entries parsed results.
        soup = self.src.get_html(self.query)
        results = self.src.parse_html(soup)
        for i, result in enumerate(results):
            if i == self.src.max_entries:
                break
            print(result)

    @unittest.skip('passed')
    def test_uniprot_to_org_gene(self):
        # Maps a UniProt accession to its KEGG organism:gene identifier.
        uniprot_id = 'Q05758'
        result = self.src.uniprot_to_org_gene(uniprot_id)
        self.assertEqual('ath:AT3G58610', result)
        uniprot_id = 'Q8N7E2'
        result = self.src.uniprot_to_org_gene(uniprot_id)
        print(result)

    def test_parse_gene_info(self):
        # Expects the sorted list of sequence accessions for this gene id.
        result = self.src.parse_gene_info('100008727')
        self.assertEqual(['AAD18037.1', 'AAD38154.1', 'AFS49951.1', 'NP_001075529.1', 'Q9XSZ4.1', 'XP_008265676.1', 'XP_017202733.1'], result)
import functools
import time

def timemethod(method):
    """Decorator that reports the wall-clock runtime of a bound method.

    Messages are printed only when the bound instance (``args[0]``) has a
    truthy ``verbose`` attribute. Uses ``functools.wraps`` so the wrapped
    function keeps its name/docstring (the original decorator lost them).
    """

    @functools.wraps(method)
    def timed(*args, **kw):
        if args[0].verbose:
            print('\n------------------------ Initializing %r ------------------------' % (method.__name__))
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()

        if args[0].verbose:
            print('%r took %2.2f sec' % \
                (method.__name__, te-ts))
            print('%r completed' % (method.__name__))
        return result

    return timed

def timeloadcontent(method):
    """Decorator that brackets a whole build with start/finish banners and
    reports total time. Printing is gated on ``args[0].verbose`` like
    :func:`timemethod`.
    """

    @functools.wraps(method)
    def timed(*args, **kw):
        if args[0].verbose:
            print(''' \n
            ===================================
            |                                 |
            |                                 |
            |    Starting Datanator Build     |
            |                                 |
            |                                 |
            ===================================

            ''')

        ts = time.time()
        result = method(*args, **kw)
        te = time.time()

        if args[0].verbose:
            print(''' \n
            =============================================
            |                                           |
            |              Finished Build               |
            Total time taken for build: %2.2f secs
            |                                           |
            =============================================
            ''' % (te - ts))

        return result

    return timed


def continuousload(method):
    """Decorator that swallows any exception from ``method`` (printing it)
    so a long-running load can continue past individual failures.
    Returns ``None`` when an exception was caught.
    """

    @functools.wraps(method)
    def continuous(*args, **kw):
        try:
            result = method(*args, **kw)
            return result
        except Exception as e:
            print(e)

    return continuous
cls.cache_dirname = tempfile.mkdtemp() 13 | cls.db = 'test' 14 | username = datanator.config.core.get_config()['datanator']['mongodb']['user'] 15 | password = datanator.config.core.get_config()['datanator']['mongodb']['password'] 16 | MongoDB = datanator.config.core.get_config()['datanator']['mongodb']['server'] 17 | port = datanator.config.core.get_config()['datanator']['mongodb']['port'] 18 | replSet = datanator.config.core.get_config()['datanator']['mongodb']['replSet'] 19 | cls.src = index_collection.IndexCollection( 20 | cache_dirname = cls.cache_dirname, MongoDB = MongoDB, 21 | replicaSet = replSet, db = cls.db, verbose=True, max_entries=20, 22 | username = username, password = password) 23 | 24 | @classmethod 25 | def tearDownClass(cls): 26 | shutil.rmtree(cls.cache_dirname) 27 | 28 | @unittest.skip('passed') 29 | def test_index_corum(self): 30 | col_str = 'corum' 31 | self.src.index_corum(col_str) 32 | client, _, collection = self.src.con_db(col_str) 33 | self.assertEqual(len(list(collection.list_indexes())), 4) 34 | client.close() 35 | 36 | @unittest.skip('passed') 37 | def test_index_sabio(self): 38 | col_str = 'sabio_rk' 39 | self.src.index_sabio(col_str) 40 | client,_,collection = self.src.con_db(col_str) 41 | self.assertEqual( len(list(collection.list_indexes())), 11) # 10 + 1 42 | client.close() 43 | 44 | @unittest.skip('passed') 45 | def test_index_uniprot(self): 46 | col_str = 'uniprot' 47 | self.src.index_uniprot(col_str) 48 | client,_,collection = self.src.con_db(col_str) 49 | self.assertEqual( len(list(collection.list_indexes())), 3) # 2 + 1 50 | client.close() -------------------------------------------------------------------------------- /tests/data_source/rna_halflife/test_order_by_ko.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from datanator.data_source.rna_halflife import order_by_ko 3 | from datanator_query_python.config import config 4 | 5 | 6 | class 
import re
import setuptools
import subprocess
import sys

# Bootstrap: make sure pkg_utils >= 0.0.5 is importable before we use it to
# read the package metadata below.
try:
    result = subprocess.run(
        [sys.executable, "-m", "pip", "show", "pkg_utils"],
        check=True, capture_output=True)
    match = re.search(r'\nVersion: (.*?)\n', result.stdout.decode(), re.DOTALL)
    # Compare version components numerically: as strings, ('0', '10', '0')
    # would sort *below* ('0', '0', '5'). A non-numeric component (e.g. a
    # pre-release tag) raises ValueError and also triggers the reinstall.
    assert match and tuple(int(part) for part in match.group(1).split('.')) >= (0, 0, 5)
except (subprocess.CalledProcessError, AssertionError, ValueError):
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "-U", "pkg_utils"],
        check=True)
import os
import pkg_utils

name = 'datanator'
dirname = os.path.dirname(__file__)

# get package metadata
md = pkg_utils.get_package_metadata(dirname, name)

# install package
setuptools.setup(
    name=name,
    version=md.version,
    description='Finds relevant kinetic data for biochemical models',
    long_description=md.long_description,

    url='https://github.com/KarrLab/' + name,
    download_url='https://github.com/KarrLab/' + name,
    license='MIT',

    author='Karr Lab',
    author_email='members@karrlab.org',

    keywords=['kinetic data', 'systems biology', 'computational biology', ],
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.6',
    ],

    packages=setuptools.find_packages(exclude=['tests', 'tests.*']),
    package_data={
        name: [
            'config/core.schema.cfg',
            'config/core.default.cfg',
            'data_source/*.txt',
            'data/*.txt',
            'data/*.xlsx',
        ],
    },
    entry_points={
        'console_scripts': [
            'datanator = datanator.__main__:main',
        ],
    },

    install_requires=md.install_requires,
    extras_require=md.extras_require,
    tests_require=md.tests_require,
    dependency_links=md.dependency_links,
)
import unittest
from datanator.util import mongo_util
import datanator.config.core
import tempfile
import shutil


class TestMongoUtil(unittest.TestCase):
    """Integration tests for :class:`datanator.util.mongo_util.MongoUtil`.

    These require a reachable MongoDB instance whose credentials come from
    the datanator package configuration; they are not isolated unit tests.
    """

    @classmethod
    def setUpClass(cls):
        # Scratch directory handed to MongoUtil; removed in tearDownClass.
        cls.cache_dirname = tempfile.mkdtemp()
        cls.db = 'datanator'
        username = datanator.config.core.get_config()['datanator']['mongodb']['user']
        password = datanator.config.core.get_config()['datanator']['mongodb']['password']
        MongoDB = datanator.config.core.get_config()['datanator']['mongodb']['server']
        # NOTE(review): 'port' is read here but never passed to MongoUtil below.
        port = datanator.config.core.get_config()['datanator']['mongodb']['port']
        replSet = datanator.config.core.get_config()['datanator']['mongodb']['replSet']
        cls.src = mongo_util.MongoUtil(
            cache_dirname = cls.cache_dirname, MongoDB = MongoDB,
            replicaSet = replSet, db = cls.db, verbose=True, max_entries=20,
            username = username, password = password)
        cls.collection_str = 'ecmdb'


    @classmethod
    def tearDownClass(cls):
        # Only the scratch directory is cleaned up here; the MongoDB client
        # is left for process teardown to dispose of.
        shutil.rmtree(cls.cache_dirname)


    # @unittest.skip('passed')
    def test_list_all_collections(self):
        """The 'datanator' database should expose an 'ecmdb' collection."""
        self.assertTrue('ecmdb' in self.src.list_all_collections())


    # @unittest.skip('passed')
    def test_con_db(self):
        """con_db should return a connection, not its failure string."""
        self.assertNotEqual(self.src.con_db(self.db), 'Server not available')

    @unittest.skip('passed')
    def test_fill_db(self):
        # NOTE(review): self.collection_obj is never defined anywhere in this
        # class, so re-enabling this skipped test would raise AttributeError.
        # Cursor.count() is also removed in modern PyMongo; use
        # collection.count_documents({}) when reviving it.
        self.collection_obj.drop()
        self.assertEqual(self.collection_obj.find().count(), 0)
        collection_obj = self.src.fill_db(self.collection_str)
        self.assertNotEqual(collection_obj.find().count(), 0)

    # @unittest.skip('passed')
    def test_print_schema(self):
        """Spot-check the inferred JSON schema of the 'ecmdb' collection."""
        a = self.src.print_schema('ecmdb')
        self.assertEqual(a['properties']['creation_date'], {'type': 'string'})
        self.assertEqual(a['properties']['synonyms'], {'type': 'object', 'properties': {'synonym': {'type': 'array',
                         'items': {'type': 'string'}}}, 'required': ['synonym']})
from datanator_query_python.util import mongo_util
from datanator_query_python.config import config


class QueryDemo(mongo_util.MongoUtil):
    """Demo queries over a collection annotated with NCBI taxon lineage."""

    def __init__(self, MongoDB=None,
                 db=None,
                 collection_str=None,
                 password=None,
                 username=None,
                 max_entries=20):
        """
        Args:
            MongoDB (:obj:`str`, optional): MongoDB server address.
            db (:obj:`str`, optional): name of the database.
            collection_str (:obj:`str`, optional): name of the collection to query.
            username (:obj:`str`, optional): MongoDB username.
            password (:obj:`str`, optional): MongoDB password.
            max_entries (:obj:`int`, optional): cap on documents returned by queries.
        """
        super().__init__(MongoDB=MongoDB,
                         db=db,
                         username=username,
                         password=password)
        self.collection = self.db_obj[collection_str]
        self.max_entries = max_entries

    def get_canon_ancestors(self, tax_id):
        """Look up the canonical ancestors of one organism.

        Args:
            tax_id (:obj:`int`): Taxon ID of organism.

        Return:
            (:obj:`list` of :obj:`Obj`): one ``{"ncbi_taxonomy_id", "name"}``
            object per canonical ancestor; empty if the taxon is not found.
        """
        doc = self.collection.find_one(
            filter={"tax_id": tax_id},
            projection={"canon_anc_ids": 1, "canon_anc_names": 1,
                        "_id": 0})
        if doc is None:
            return []
        return [{"ncbi_taxonomy_id": anc_id, "name": anc_name}
                for anc_id, anc_name in zip(doc["canon_anc_ids"],
                                            doc["canon_anc_names"])]

    def demo_find(self, tax_id):
        """Find organisms that have ``tax_id`` among their canonical ancestors.

        Args:
            tax_id (:obj:`int`): Ancestor taxon ID.

        Return:
            (:obj:`list`): up to ``self.max_entries`` matching documents.
        """
        cursor = self.collection.find(
            filter={"canon_anc_ids": tax_id},
            projection={"canon_anc_ids": 1, "canon_anc_names": 1,
                        "_id": 0})
        matches = []
        if cursor is None:  # kept for parity with the original defensive check
            return matches
        for n_seen, doc in enumerate(cursor):
            if n_seen == self.max_entries:
                break
            matches.append(doc)
        return matches
import unittest
from datanator.data_source import ec
import datanator.config.core
import shutil
import tempfile
from pathlib import Path


class TestEC(unittest.TestCase):
    """Tests for the enzyme-classification (EC) flat-file data source.

    The FTP and parsing cases are skipped (network / environment bound);
    only the pure parsing test ``test_make_doc`` runs everywhere.
    """

    @classmethod
    def setUpClass(cls):
        # Temporary download directory for enzyme.dat; removed on teardown.
        cls.cache_dir = tempfile.mkdtemp()
        db = 'test'
        username = datanator.config.core.get_config()['datanator']['mongodb']['user']
        password = datanator.config.core.get_config()['datanator']['mongodb']['password']
        MongoDB = datanator.config.core.get_config()['datanator']['mongodb']['server']
        cls.src = ec.EC(server=MongoDB, db=db, username=username, password=password, authSource='admin',
                        readPreference='nearest', max_entries=20, cache_dir=cls.cache_dir)

    @classmethod
    def tearDownClass(cls):
        shutil.rmtree(cls.cache_dir)
        # Drop the scratch collection before closing the client.
        cls.src.db.drop_collection(cls.src.collection_str)
        cls.src.client.close()

    @unittest.skip('IP')
    def test_establish_ftp(self):
        """The FTP server's listing should contain enzyme.dat."""
        ftp = self.src.establish_ftp()
        self.assertTrue('enzyme.dat' in ftp.nlst())

    @unittest.skip('IP')
    def test_retrieve_content(self):
        """retrieve_content should download enzyme.dat into the cache dir."""
        p = Path(self.cache_dir+'/enzyme.dat')
        self.src.retrieve_content()
        self.assertTrue(p.exists())

    @unittest.skip('circle directory error.')
    def test_parse_content(self):
        # Relies on a developer-local copy of enzyme.dat; not portable to CI.
        location = str(Path('~/karr_lab/datanator/docs/enzyme.dat').expanduser())
        self.src.parse_content(location)

    def test_make_doc(self):
        """make_doc should turn one ENZYME-format record into a document."""
        lines = ["ID 1.1.1.1", "DE Alcohol dehydrogenase.", "AN Aldehyde reductase.",
                 "CA (1) A primary alcohol + NAD(+) = an aldehyde + NADH.", "CA (2) A secondary alcohol + NAD(+) = a ketone + NADH.",
                 "CF Zn(2+) or Fe cation."]
        result = self.src.make_doc(lines)
        # Trailing periods are stripped from names, synonyms, and activities.
        self.assertEqual(result, {'ec_number': '1.1.1.1', 'ec_name': 'Alcohol dehydrogenase',
                                  'ec_synonyms': ['Aldehyde reductase'],
                                  'catalytic_activity': ['(1) A primary alcohol + NAD(+) = an aldehyde + NADH', '(2) A secondary alcohol + NAD(+) = a ketone + NADH'],
                                  'cofactor': 'Zn(2+) or Fe cation'})
def tearDownClass(cls): 30 | cls.src.uniprot_collection_manager.db_obj.drop_collection(cls.protein_col) 31 | cls.src.db_obj.drop_collection(cls.rna_col) 32 | cls.src.uniprot_collection_manager.client.close() 33 | cls.src.client.close() 34 | cls.src.uniprot_query_manager.client.close() 35 | 36 | @unittest.skip('avoid downloading') 37 | def test_fill_uniprot(self): 38 | url_0 = 'https://oup.silverchair-cdn.com/oup/backfile/Content_public/Journal/nar/41/1/10.1093/nar/gks1019/2/gks1019-nar-00676-a-2012-File003.xlsx?Expires=1578425844&Signature=ZRFUxLdn4-vaBt5gQci~0o56KqyR9nJj9i32ig5X6YcfqiJeV3obEq8leHGdDxx6w~KABgewiQ66HTB7gmuG~2GL-YgxPKYSjt17WrYMkc-0ibw6TMlTvWZZfvw-lPe~wvpmVfNEXnTbP7jHyNLu9jeJ6yhoXvgIyQtzA5PbEI1fyXEgeZzOKMltmITqL3g3APsPsagCTC66rwrBT23Aghh6D314uilT2DZHCc68MH2nyV~qAhFqIQiOj-7VTEKqkDPvPYvuE2KNKXdvW23gk100YV~58ozbt8ijRz5Gr5gPtE~f1Ab5l260EIbWHJNabMRleInJQqUIDPFN4C38PQ__&Key-Pair-Id=APKAIE5G5CRDK6RD3PGA' 39 | df_0 = self.src.fill_uniprot(url_0, 'Supplementary Table 1') 40 | self.assertEqual(df_0.iloc[0]['ordered_locus_name'], 'Rv0002') 41 | 42 | def test_fill_rna_halflife(self): 43 | d = {'half_life': [32.3, 12.2, 13.2], 'r_squared': [0.9, 0.7, 0.8], 44 | 'ordered_locus_name': ['Rv0002', 'something', 'this']} 45 | df_0 = pd.DataFrame(d) 46 | self.src.fill_rna_halflife(df_0, ['aaa', 102]) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .pytest_cache/ 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | src/ 27 | *.DS_Store 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # 
# MongoDB stuff:
db_volumes/
/tests/data_source/rna_halflife/test_doi_10_1186_gb_2012_13_4_r30.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from datanator.data_source.rna_halflife import doi_10_1186_gb_2012_13_4_r30 3 | import tempfile 4 | import shutil 5 | import json 6 | import os 7 | from datanator_query_python.config import config 8 | import pandas as pd 9 | 10 | 11 | class TestProteinAggregate(unittest.TestCase): 12 | 13 | @classmethod 14 | def setUpClass(cls): 15 | des_db = 'test' 16 | src_db = 'datanator' 17 | cls.protein_col = 'uniprot' 18 | cls.rna_col = 'rna_halflife' 19 | conf = config.TestConfig() 20 | username = conf.USERNAME 21 | password = conf.PASSWORD 22 | MongoDB = conf.SERVER 23 | cls.src = doi_10_1186_gb_2012_13_4_r30.Halflife(server=MongoDB, src_db=src_db, 24 | protein_col=cls.protein_col, authDB='admin', readPreference='nearest', 25 | username=username, password=password, verbose=True, max_entries=20, 26 | des_db=des_db, rna_col=cls.rna_col) 27 | 28 | @classmethod 29 | def tearDownClass(cls): 30 | cls.src.uniprot_collection_manager.db_obj.drop_collection(cls.protein_col) 31 | cls.src.db_obj.drop_collection(cls.rna_col) 32 | cls.src.uniprot_collection_manager.client.close() 33 | cls.src.client.close() 34 | cls.src.uniprot_query_manager.client.close() 35 | 36 | @unittest.skip('passed') 37 | def test_load_uniprot(self): 38 | self.src.load_uniprot() 39 | 40 | def test_fill_rna_half_life(self): 41 | url = """https://static-content.springer.com/esm/art%3A10.1186%2Fgb-2012-13-4-r30/MediaObjects/13059_2011_2880_MOESM3_ESM.XLSX""" 42 | names = ['ordered_locus_name', 'half_life_ga_2', 'reads_per_kb_per_mb', 43 | 'transcriptional_start_sites', 'transcriptional_end_sites', 'operon', 44 | 'gene_start', 'gene_end', 'strand', 'gene_name', 'protein_annotation', 45 | 'cog', 'kegg', 'half_life_qpcr', 'half_life_454'] 46 | df_10987 = self.src.make_df(url, 'Bc10987', names=names, usecols='A:O', skiprows=[0,1], 
import unittest
from datanator.elasticsearch_kl import batch_load
from datanator_query_python.config import config
import tempfile
import shutil
import requests

class TestMongoToES(unittest.TestCase):
    """Integration tests moving data from MongoDB into AWS Elasticsearch.

    Requires live AWS credentials (~/.wc/third_party/...) and a reachable
    MongoDB; these are not isolated unit tests.
    """

    @classmethod
    def setUpClass(cls):
        cls.cache_dir = tempfile.mkdtemp()
        cls.src = batch_load.MongoToES(profile_name='es-poweruser', credential_path='~/.wc/third_party/aws_credentials',
                                       config_path='~/.wc/third_party/aws_config', elastic_path='~/.wc/third_party/elasticsearch.ini',
                                       cache_dir=cls.cache_dir, service_name='es', index='test', max_entries=float('inf'), verbose=True)
        cls.url = cls.src.es_endpoint + '/' + cls.src.index
        # Start from a clean 'test' index; deleted again in tearDownClass.
        requests.delete(cls.url, auth=cls.src.awsauth)
        conf = config.Config()
        cls.username = conf.USERNAME
        cls.password = conf.PASSWORD
        cls.server = conf.SERVER
        cls.authDB = conf.AUTHDB
        cls.db = 'datanator'

    @classmethod
    def tearDownClass(cls):
        shutil.rmtree(cls.cache_dir)
        requests.delete(cls.url, auth=cls.src.awsauth)

    def test_connection(self):
        """AWS ES domain should be listable and named as expected."""
        result = self.src.client.list_domain_names()
        self.assertEqual(result['ResponseMetadata']['HTTPStatusCode'], 200)
        self.assertTrue('datanator-elasticsearch' in self.src.es_endpoint)

    def test_data_from_mongo(self):
        """Protein export should yield a substantial document count."""
        count, _ = self.src.data_from_mongo_protein(self.server, self.db, self.username,
                                                    self.password, authSource=self.authDB)
        self.assertTrue(count >= 1000)

    def test_data_from_metabolite(self):
        """Both metabolite sources should yield substantial counts."""
        _, count_0, _, count_1 = self.src.data_from_mongo_metabolite(self.server, self.db, self.username,
                                                                     self.password, authSource=self.authDB)
        self.assertTrue(count_0 >= 1000)
        self.assertTrue(count_1 >= 1000)

    def test_data_from_metabolites_meta(self):
        doc = self.src.data_from_mongo_metabolites_meta(self.server, self.db, self.username,
                                                        self.password, authSource=self.authDB)
        # NOTE(review): this appends the same object five times, so the
        # length assertion is trivially true; it likely was meant to iterate
        # the returned documents instead — confirm intent before re-use.
        result = []
        for i in range(5):
            result.append(doc)
        self.assertEqual(len(result), 5)
cls.src.db_obj.drop_collection(cls.rna_col) 34 | cls.src.uniprot_collection_manager.client.close() 35 | cls.src.client.close() 36 | cls.src.uniprot_query_manager.client.close() 37 | 38 | @unittest.skip('downloading of file forbidden from nonacademic IP') 39 | def test_fill_rna_half_life(self): 40 | url = 'https://oup.silverchair-cdn.com/oup/backfile/Content_public/Journal/nar/42/4/10.1093_nar_gkt1150/1/gkt1150_Supplementary_Data.zip?Expires=1578928721&Signature=ADjsCSaceimzGs6aJ~uG7np88TzHNooAoBabdm-6utYVIZOEwRbzTdiBp~76vM4KEHz9Nir8GNrtA3AwHwGFm0bu~aorTG4xrOChS6UgfBQiUtgr8vfbDIUno1y1nxLGCKIfQrb2Gx-SVnigum2gjcveymK995zadSNZqN~w-vz-Ii0a6fH7kvKN8m9vLWf6fdo0NXSmgnkjj9KPCuS-bmK0y4ZH5Ex0Rl4qi5uCroYmDBNOhXY23pcalbpFwB1-07tA3~756gZN4Mo9uMeSVQKl5UsHzx5amB6WvSCXS8z756YoaaMCg0FQbUCcQ46fRGdHxcvPNcrPo5IMEGmi8g__&Key-Pair-Id=APKAIE5G5CRDK6RD3PGA' 41 | df_s1 = self.src.make_df(url, 'TableS1', names=['oln', 'gene_symbol', 'a', 'vc_a', 'b', 'vc_b', 'c', 'vc_c', 'd', 'vc_d'], usecols='A,B,L:S', 42 | skiprows=list(range(0, 7)), file_type='zip', file_name='nar-01935-a-2013-File011.xlsx') 43 | self.src.fill_rna_half_life(df_s1, ['Escherichia coli str. K-12 substr. 
MG1655', 511145]) -------------------------------------------------------------------------------- /tests/data_source/rna_halflife/test_doi_10_1101_gr_131037_111.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from datanator.data_source.rna_halflife import doi_10_1101_gr_131037_111 3 | import tempfile 4 | import shutil 5 | import json 6 | import os 7 | from datanator_query_python.config import config 8 | import pandas as pd 9 | 10 | 11 | class TestProteinAggregate(unittest.TestCase): 12 | 13 | @classmethod 14 | def setUpClass(cls): 15 | des_db = 'test' 16 | src_db = 'datanator' 17 | cls.protein_col = 'uniprot' 18 | cls.rna_col = 'rna_halflife' 19 | conf = config.TestConfig() 20 | username = conf.USERNAME 21 | password = conf.PASSWORD 22 | MongoDB = conf.SERVER 23 | cls.src = doi_10_1101_gr_131037_111.Halflife(server=MongoDB, src_db=src_db, 24 | protein_col=cls.protein_col, authDB='admin', readPreference='nearest', 25 | username=username, password=password, verbose=True, max_entries=20, 26 | des_db=des_db, rna_col=cls.rna_col) 27 | 28 | @classmethod 29 | def tearDownClass(cls): 30 | cls.src.uniprot_collection_manager.db_obj.drop_collection(cls.protein_col) 31 | cls.src.db_obj.drop_collection(cls.rna_col) 32 | cls.src.uniprot_collection_manager.client.close() 33 | cls.src.client.close() 34 | cls.src.uniprot_query_manager.client.close() 35 | 36 | @unittest.skip('avoid downloading') 37 | def test_fill_uniprot(self): 38 | url_0 = 
'https://oup.silverchair-cdn.com/oup/backfile/Content_public/Journal/nar/41/1/10.1093/nar/gks1019/2/gks1019-nar-00676-a-2012-File003.xlsx?Expires=1578425844&Signature=ZRFUxLdn4-vaBt5gQci~0o56KqyR9nJj9i32ig5X6YcfqiJeV3obEq8leHGdDxx6w~KABgewiQ66HTB7gmuG~2GL-YgxPKYSjt17WrYMkc-0ibw6TMlTvWZZfvw-lPe~wvpmVfNEXnTbP7jHyNLu9jeJ6yhoXvgIyQtzA5PbEI1fyXEgeZzOKMltmITqL3g3APsPsagCTC66rwrBT23Aghh6D314uilT2DZHCc68MH2nyV~qAhFqIQiOj-7VTEKqkDPvPYvuE2KNKXdvW23gk100YV~58ozbt8ijRz5Gr5gPtE~f1Ab5l260EIbWHJNabMRleInJQqUIDPFN4C38PQ__&Key-Pair-Id=APKAIE5G5CRDK6RD3PGA' 39 | df_0 = self.src.fill_uniprot(url_0, 'Supplementary Table 1') 40 | self.assertEqual(df_0.iloc[0]['ordered_locus_name'], 'Rv0002') 41 | 42 | def test_fill_rna_halflife(self): 43 | url = """https://genome.cshlp.org/content/suppl/2012/02/06/gr.131037.111.DC1/Supp_Table_2.xlsx""" 44 | usecols = 'B,L,M,N,O,P,AC,AD,AR,AT,AU' 45 | df_0 = self.src.make_df(url, 'V1ncodemouse_probe_annotations_', header=0, usecols=usecols, nrows=34509) 46 | self.src.fill_rna_half_life(df_0, ['Mus musculus', 10090]) -------------------------------------------------------------------------------- /datanator/config/core.py: -------------------------------------------------------------------------------- 1 | """ Configuration 2 | 3 | :Author: Jonathan Karr 4 | :Date: 2017-05-13 5 | :Copyright: 2017, Karr Lab 6 | :License: MIT 7 | """ 8 | 9 | import configobj 10 | import os 11 | import pkg_resources 12 | import wc_utils.config.core 13 | import wc_utils.debug_logs.config 14 | 15 | 16 | def get_config(extra=None): 17 | """ Get configuration 18 | 19 | Args: 20 | extra (:obj:`dict`, optional): additional configuration to override 21 | 22 | Returns: 23 | :obj:`configobj.ConfigObj`: nested dictionary with the configuration settings loaded from the configuration source(s). 
def get_mongo_config():
    """ Get a configuration to pass directly into the MongoDB util constructors.

    Reads the ``datanator.mongodb`` section of the package configuration once
    and repackages it as constructor keyword arguments.

    NOTE(review): the configured ``port`` is read from the same section by
    other callers but is deliberately not included here, matching the keys
    the util constructors accept — confirm if a port override is ever needed.

    Returns:
        :obj:`dict`: ``MongoDB`` (server address), ``username``, ``password``,
        and ``replicaSet`` parameters for the MongoDB util constructor.
    """
    # Parse the configuration once rather than per field.
    mongodb = get_config()['datanator']['mongodb']
    return {
        "MongoDB": mongodb['server'],
        "username": mongodb['user'],
        "password": mongodb['password'],
        "replicaSet": mongodb['replSet'],
    }
62 | """ 63 | paths = wc_utils.debug_logs.config.paths.deepcopy() 64 | paths.user = ( 65 | 'datanator.debug.cfg', 66 | os.path.expanduser('~/.wc/datanator.debug.cfg'), 67 | ) 68 | return wc_utils.config.core.ConfigManager(paths).get_config(extra=extra) 69 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ``datanator`` 2 | ===================== 3 | ``datanator`` is a software tool for finding experimental data for building and calibrating dynamical models of cellular biochemistry such as metabolite, RNA, and protein abundances; protein complex compositions; transcription factor binding motifs; and kinetic parameters. ``datanator`` is particularly useful for building large models, such as whole-cell models, that require large amounts of data to constrain large numbers of parameters. ``datanator`` was motivated by the need for large amounts of data to constrain whole-cell models and the fact that this data is hard to utilize because it is scattered across numerous siloed repositories. 
4 | 5 | ``datanator`` currently supports the following data types and data sources: 6 | 7 | * Metabolite concentrations: `ECMDB `_ and `YMBD `_ 8 | * RNA abundance: `ArrayExpress `_ 9 | * Protein abundance: `PaxDb `_ 10 | * Protein complex composition: `CORUM `_ 11 | * Transcription factor binding motifs: `JASPAR `_ 12 | * Reaction kinetics: `SABIO-RK `_ 13 | * Taxonomy: `NCBI Taxonomy `_ 14 | 15 | ``datanator`` (1) downloads these repositories; (2) normalizes their data to a common ontology and units; (3) stores their data to a local SQLite database; and (4) provides a Python API for (a) finding relevant data to model a specific organism and environmental condition from similar species, reactions, genotypes (taxon, variant), and environments (temperature, pH, media), and (b) reducing multiple relevant observations to a single consensus recommended parameter value, and (c) exporting these consensus recommendations and their provenance to an Excel workbook. To make ``datanator`` easier to use, we plan to develop user-friendly command line and web-based interfaces for finding data for SBML-encoded models. 16 | 17 | ``datanator`` is under active development and is not yet ready for end users. Please check back soon for updates. 18 | 19 | This website contains detailed documentation of the ``datanator`` source code. Going forward, this website will also contain detailed instructions and tutorials on how to use ``datanator``. 20 | 21 | Contents 22 | -------- 23 | 24 | .. 
toctree:: 25 | :maxdepth: 3 26 | :numbered: 27 | 28 | intro 29 | installation 30 | tutorial 31 | API documentation 32 | about 33 | references.rst 34 | -------------------------------------------------------------------------------- /tests/data_source/brenda/test_reaction.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import shutil 3 | import tempfile 4 | from datanator.data_source.brenda import reaction 5 | from datanator_query_python.config import config 6 | import pandas as pd 7 | 8 | 9 | class TestBrendaRxn(unittest.TestCase): 10 | 11 | @classmethod 12 | def setUpClass(cls): 13 | conf = config.TestConfig() 14 | cls.collection_str = 'brenda_reaction' 15 | username = conf.USERNAME 16 | password = conf.PASSWORD 17 | MongoDB = conf.SERVER 18 | cls.src = reaction.BrendaRxn(MongoDB=MongoDB, db='test', collection_str=cls.collection_str, 19 | username=username, password=password, authSource='admin', 20 | max_entries=20, verbose=True) 21 | 22 | @classmethod 23 | def tearDownClass(cls): 24 | cls.src.db_obj.drop_collection(cls.collection_str) 25 | cls.src.client.close() 26 | 27 | # @unittest.skip('passed') 28 | def test_download_and_read(self): 29 | result = self.src.download_and_read() 30 | self.assertEqual(result['ec_number'][1], '6.3.2.1') 31 | 32 | def test_clean_up(self): 33 | result = self.src.download_and_read() 34 | exp = self.src.clean_up(result) 35 | self.assertEqual(exp['reaction_id_brenda'][1], ['BR101']) 36 | self.assertEqual(exp['reaction_id_sabio_rk'][1], 2406) 37 | 38 | # @unittest.skip('passed') 39 | def test_parse_reaction(self): 40 | df = pd.DataFrame({'reaction': ['ATP + (R)-pantoate + beta-alanine <=> AMP + diphosphate + (R)-pantothenate', 41 | 'ATP + Detyrosinated alpha-tubulin + L-Tyrosine = alpha-Tubulin + ADP + Orthophosphate']}) 42 | result = self.src.parse_reaction(df) 43 | self.assertEqual(result['products'][1][1], 'ADP') 44 | self.assertEqual(result['substrates'][0][1], 
class ParseMetaboliteConcentration(mongo_util.MongoUtil):
    """Load curated metabolite-concentration JSON files from GitHub into MongoDB."""

    # metabolites with curated concentration files in the datanator repository
    METABOLITES = ["ATP", "CTP", "GMP", "GTP", "IMP", "NAD", "NADH",
                   "NADP", "NADPH", "TTP", "UTP"]

    def __init__(self,
                 MongoDB=None,
                 db=None,
                 collection=None,
                 max_entries=float('inf'),
                 username=None,
                 password=None,
                 authSource='admin',
                 readPreference='nearest'):
        """
        Args:
            MongoDB (:obj:`str`, optional): MongoDB server address
            db (:obj:`str`, optional): name of the database
            collection (:obj:`str`, optional): name of the destination collection
            max_entries (:obj:`int` or :obj:`float`, optional): maximum number of
                metabolites to load
            username (:obj:`str`, optional): MongoDB username
            password (:obj:`str`, optional): MongoDB password
            authSource (:obj:`str`, optional): MongoDB authentication database
            readPreference (:obj:`str`, optional): MongoDB read preference
        """
        super().__init__(MongoDB=MongoDB, db=db,
                         username=username,
                         password=password,
                         authSource=authSource,
                         readPreference=readPreference)
        self.max_entries = max_entries
        self.collection = collection

    def parse_metabolite(self):
        """Read JSON metabolite concentration files from GitHub and insert a
        separate document for each metabolite into the MongoDB collection.

        Each document is created with the metabolite's InChIKey and its
        concentration records are attached with ``$addToSet``/``$each``, which
        skips duplicates and preserves order (same result as adding the records
        one at a time, but in a single round trip per metabolite).
        """
        collection = self.db_obj[self.collection]
        base_url = ("https://raw.githubusercontent.com/KarrLab/datanator/"
                    "tutorial/docs/metabolites/")
        for i, name in enumerate(self.METABOLITES):
            if i >= self.max_entries:  # honor the (previously ignored) limit
                break
            with urllib.request.urlopen(base_url + name + ".json") as response:
                data = json.loads(response.read().decode())
            collection.insert_one({"inchikey": data['inchikey']})
            collection.update_one(
                {"inchikey": data['inchikey']},
                {"$addToSet": {"concentrations": {"$each": data['concentrations']}}})


def main():
    """Load all curated metabolite concentration files into datanator-demo."""
    conf = config.Victoria()
    conf_main = config.Config()
    username = conf.USERNAME
    password = conf.PASSWORD
    MongoDB = conf_main.SERVER
    src = ParseMetaboliteConcentration(MongoDB=MongoDB,
                                       username=username,
                                       password=password,
                                       collection="metabolite_concentration",
                                       db="datanator-demo")
    src.parse_metabolite()


if __name__ == '__main__':
    main()
class TestProteinAggregate(unittest.TestCase):
    # NOTE(review): despite its name, this suite tests the RNA half-life
    # parser for doi:10.1186/s12864-016-3219-8 -- the class name looks
    # copied from another test module; confirm before renaming.

    @classmethod
    def setUpClass(cls):
        # temporary workspace; the parser's log file is written here
        cls.cache_dirname = tempfile.mkdtemp()
        cache_dir = os.path.join(cls.cache_dirname, 'logs.txt')
        des_db = 'test'
        db = 'datanator'
        cls.collection_str = 'test_rna_halflife'
        # MongoDB credentials come from the datanator configuration files
        username = datanator.config.core.get_config()[
            'datanator']['mongodb']['user']
        password = datanator.config.core.get_config(
        )['datanator']['mongodb']['password']
        server = datanator.config.core.get_config(
        )['datanator']['mongodb']['server']
        cls.src = doi_10_1186_s12864_016_3219_8.Halflife(username=username, password=password, server=server,
                                                         authDB='admin',max_entries=100, uniprot_col_db=des_db,
                                                         verbose=True, collection_str=cls.collection_str, db=db,
                                                         cache_dir=cache_dir)

    @classmethod
    def tearDownClass(cls):
        # remove the temp directory and drop the collection created by the tests
        shutil.rmtree(cls.cache_dirname)
        cls.src.db.drop_collection(cls.collection_str)
        cls.src.client.close()

    def test_download_xlsx(self):
        # 'MeOH' presumably selects the methanol growth-condition worksheet
        # of the paper's supplementary file -- confirm against the parser
        result = self.src.download_xlsx('MeOH')
        self.assertEqual(result['gene_fragment'][0], 'MA0001')

    # @unittest.skip('passed')
    def test_load_halflife(self):
        # load one growth condition, then merge a second into the same docs
        df = self.src.download_xlsx('MeOH')
        self.src.load_halflife(df)
        df = self.src.download_xlsx('TMA')
        self.src.add_to_halflife(df)

    # @unittest.skip('passed')
    def test_fill_gene_protein_name(self):
        # after back-filling, no document should keep the '-' placeholder name
        self.src.fill_gene_protein_name()
        result = self.src.collection.find_one({'gene_name': '-'})
        self.assertIsNone(result)

    # @unittest.skip('passed')
    def test_fill_protein_name(self):
        # every document with a real gene name should also have a protein name
        self.src.fill_protein_name()
        result = self.src.collection.find_one({'$and':[{'gene_name': {'$ne': '-'}},
                                                       {'protein_name': {'$exists': False}},
                                                       {'gene_name': {'$exists': True}}]})
        self.assertIsNone(result)

    # @unittest.skip('passed')
    def test_fill_uniprot_by_oln(self):
        # MA0002 is an ordered locus name; smoke-tests the UniProt back-fill
        self.src.fill_uniprot_by_oln('MA0002')
-------------------------------------------------------------------------------- 1 | The Datanator database is a compilation of data curated from the literature by the Datanator team and data aggregated from third-party databases. The data curated by the Datanator team is available under the Creative Commons 1.0 Universal (CC0) License. The data compiled from third-party sources is available under the licenses summarized below. 2 | 3 | E. coli Metabolome Database (ECMDB) 4 | URL: https://ecmdb.ca 5 | License summary: Offered freely to the public. Use and re-distribution of the data, in whole or in part, for commercial purposes requires permission of the authors and attribution. 6 | License statement URL: https://ecmdb.ca/citations 7 | 8 | MODOMICS 9 | URL: https://iimcb.genesilico.pl/modomics 10 | License summary: Requests attribution. 11 | License statement URL: https://iimcb.genesilico.pl/modomics/download 12 | 13 | NCBI Taxonomy 14 | URL: https://www.ncbi.nlm.nih.gov/taxonomy 15 | License summary: Public domain. Information may be freely distributed and copied. Requests attribution. 
16 | License statement URL: https://www.ncbi.nlm.nih.gov/home/about/policies/#copyright 17 | 18 | OrthoDB 19 | URL: https://www.orthodb.org 20 | License: CC BY 3.0 21 | License statement URL: https://www.orthodb.org/?page=disclaimer 22 | License URL: https://creativecommons.org/licenses/by/3.0/ 23 | 24 | PaxDB 25 | URL: https://pax-db.org 26 | License summary: The creators have granted Datanator permissions for derivations beyond the CC BY-ND 3.0 license stated at https://pax-db.org/license 27 | 28 | Protein Ontology (PRO) 29 | URL: https://proconsortium.org 30 | License: CC BY 4.0 31 | License statement URL: https://proconsortium.org/download/current/pro_nonreasoned.obo 32 | License URL: https://creativecommons.org/licenses/by/4.0/ 33 | 34 | SABIO-RK 35 | URL: http://sabio.h-its.org 36 | License summary: Grants database for non-commercial research and academic purposes only, excluding as part of any product or service which is licensed. This license is not transferable. Requests attribution. 37 | License URL: http://sabio.h-its.org/layouts/content/termscondition.gsp 38 | 39 | UniProt 40 | URL: https://www.uniprot.org 41 | License: CC BY 4.0 for the copyrightable parts of the database 42 | License statement URL: https://www.uniprot.org/help/license 43 | License URL: https://creativecommons.org/licenses/by/4.0/ 44 | 45 | Yeast Metabolome Database (YMDB) 46 | URL: http://www.ymdb.ca 47 | License summary: Offered freely to the public. Use and re-distribution of the data, in whole or in part, for commercial purposes requires permission of the authors and attribution. 
def calc_reactant_product_pairs(reaction):
    """ Get list of pairs of similar reactants and products using a greedy algorithm.

    Args:
        reaction (:obj:`data_model.Reaction`): reaction

    Returns:
        :obj:`list` of :obj:`tuple` of :obj:`data_model.Specie`, :obj:`data_model.Specie`:
            list of pairs of similar reactants and products; species left over after
            the greedy matching are paired with :obj:`None`
    """
    participants = reaction.get_ordered_participants()
    reactants = [p for p in participants if p.coefficient < 0]
    products = [p for p in participants if p.coefficient > 0]

    # sort by structure to ensure result is reproducible
    key = lambda p: (len(p.specie.structure), p.specie.structure)
    reactants.sort(key=key, reverse=True)
    products.sort(key=key, reverse=True)

    # create :obj:`molecule_util.Molecule` objects for each reactant and product
    reactant_mols = [molecule_util.Molecule(structure=r.specie.structure) for r in reactants]
    product_mols = [molecule_util.Molecule(structure=p.specie.structure) for p in products]

    # calculate similarities between each reactant and each product
    similarities = numpy.full((len(reactants), len(products)), numpy.nan)
    for i_reactant, reactant in enumerate(reactant_mols):
        for i_product, product in enumerate(product_mols):
            similarities[i_reactant, i_product] = reactant.get_similarity(product)

    # initialize pairs of similar reactants and products
    pairs = []

    # iteratively pair the most similar remaining reactant and product
    for _ in range(min(len(reactants), len(products))):
        index = numpy.argmax(similarities)
        # pass the shape positionally: the ``dims`` keyword used previously was
        # deprecated in NumPy 1.16 and removed in later releases, so this call
        # raised TypeError on modern NumPy
        i_reactant, i_product = numpy.unravel_index(index, similarities.shape)
        pairs.append((reactants[i_reactant], products[i_product]))

        reactants.pop(i_reactant)
        products.pop(i_product)
        similarities = numpy.delete(similarities, i_reactant, axis=0)
        similarities = numpy.delete(similarities, i_product, axis=1)

    # unpaired reactants/products are paired with None
    for reactant in reactants:
        pairs.append((reactant, None))
    for product in products:
        pairs.append((None, product))

    return pairs
sudo curl -L "https://github.com/docker/compose/releases/download/1.26.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose 36 | 37 | sudo chmod +x /usr/local/bin/docker-compose 38 | 39 | sudo ln -s /usr/local/bin/docker-compose /usr/bin/docker-compose 40 | 41 | Install ``datanator`` 42 | ----------------------------- 43 | Second, please run the following shell commands to clone and install ``datanator`` from GitHub:: 44 | 45 | mkdir karr_lab 46 | mkdir ~/.wc 47 | cd ./karr_lab 48 | git clone git@github.com:KarrLab/pkg_utils.git 49 | git clone git@github.com:KarrLab/wc_utils.git 50 | git clone git@github.com:KarrLab/karr_lab_aws_manager.git 51 | git clone git@github.com:KarrLab/datanator_query_python.git 52 | git clone git@github.com:KarrLab/datanator.git 53 | cd ./datanator 54 | nano docker-compose.yml # change ``zl`` on line 13 to the proper username. Save and exit by pressing ``Ctrl + X`` followed by ``Y`` 55 | docker-compose up -d 56 | 57 | 58 | Run ``datanator`` 59 | ----------------------------- 60 | One needs to find the docker container ID in order to use Datanator package:: 61 | 62 | docker ps 63 | docker exec -it bash 64 | cd karr_lab/datanator 65 | 66 | All python scripts in ``datanator`` dicrectory can be run with python3, for example:: 67 | 68 | python3 datanator/data_source/corum_nosql.py 69 | 70 | Running the command above will parse `Corum ` and 71 | store the parsed data in KarrLab's MongoDB. 72 | 73 | 74 | Contact `Yang `_ for any questions regarding installation and running the package. 
class ParseJSONSchema:
    """Convert a PSORTdb gram-positive TSV dump into one JSON document per protein."""

    def __init__(self, dataset, directory):
        """
        Args:
            dataset (:obj:`str`): path to the tab-delimited PSORTdb export file
            directory (:obj:`str`): output directory for the generated JSON files
        """
        self.dataset = dataset
        self.directory = directory

    def update_directory(self, nrows=10000):
        """Parse the dataset and write one ``<SeqID>.json`` file per row.

        Args:
            nrows (:obj:`int`, optional): maximum number of rows to read from
                the dataset (previously hard-coded to 10000)
        """
        data = pd.read_csv(self.dataset, delimiter='\t', nrows=nrows)
        # replace pandas NaN with None so values serialize to JSON null
        data = data.where(pd.notnull(data), None)

        for i in range(len(data)):
            seq_id = str(data.iloc[i, 0])
            # NOTE(review): the [8:22] slice assumes a fixed-width SeqID field
            # inside the first column -- confirm against the PSORTdb export format
            accession = seq_id[8:22]
            name = seq_id[seq_id.rfind('|') + 2:]

            doc = {
                "entity": {
                    "type": "protein",
                    "name": name,
                    "identifiers": [{"namespace": "SeqID", "value": accession}],
                },
                # one measurement entry per data column, except bookkeeping columns
                "values": [
                    {"type": column, "value": data[column].iloc[i]}
                    for column in data.columns[1:]
                    if column not in ("SeqID", "PSortVersion")
                ],
                "identifier": {"namespace": "SeqID", "value": accession},
                "source": [{"namespace": "PSORTsb Gram Positive", "value": "Version 3"}],
                "environment": {"GramStain": "Gram positive"},
                "schema_version": "2.0",
            }

            # create JSON files and place them in the output directory
            with open(os.path.join(self.directory, "{}.json".format(accession)), "w+") as json_file:
                json.dump(doc, json_file)


def main():
    """Generate JSON schema files for the computed gram-positive PSORTdb data."""
    json_files = ParseJSONSchema(dataset="./datanator/docs/protein_localization/computed_gram_positive/Computed-Gram_positive-PSORTdb-3.00.tab",
                                 directory="./datanator/docs/protein_localization/computed_gram_positive/JSONSchema")

    json_files.update_directory()

if __name__ == "__main__":
    main()
class MigrateEC:
    """Copy ``ec`` documents between databases, stamping schema version 2."""

    def __init__(self, collection="ec", to_database="datanator-test",
                 from_database="datanator", max_entries=float("inf")):
        """
        Args:
            collection (:obj:`str`, optional): name of the collection to migrate
            to_database (:obj:`str`, optional): destination database name
            from_database (:obj:`str`, optional): source database name
            max_entries (:obj:`int` or :obj:`float`, optional): maximum number of documents to process
        """
        self.collection = collection
        self.from_database = from_database
        self.to_database = to_database
        self.from_collection = motor_client_manager.client.get_database(from_database)[collection]
        self.to_collection = motor_client_manager.client.get_database(to_database)[collection]
        self.max_entries = max_entries

    async def index_primary(self, _key, background=True):
        """Index key (single key ascending)

        Args:
            _key(:obj:`str`): Name of key to be indexed
            background(:obj:`bool`, optional): build the index in the background
        """
        await self.to_collection.create_index(_key, background=background)

    async def process_cursor(self, skip=0):
        """Transform documents from the source collection and upsert them into
        the destination collection (keyed by ``ec_number``) in batches of 50.

        Args:
            skip(:obj:`int`, optional): number of source documents to skip
        """
        bulk_write = []
        query = {}
        # motor interprets limit=0 as "no limit"
        limit = 0 if self.max_entries == float('inf') else self.max_entries
        docs = self.from_collection.find(filter=query, projection={'_id': 0},
                                         no_cursor_timeout=True, batch_size=500,
                                         skip=skip, limit=limit)
        i = 0
        async for doc in docs:
            i += 1
            if i == self.max_entries:
                break
            if i != 0 and i % 50 == 0:
                print("Processing file {}".format(i + skip))
                try:
                    # motor collections are asynchronous: bulk_write returns a
                    # coroutine that must be awaited, otherwise nothing is written
                    await self.to_collection.bulk_write(bulk_write)
                except BulkWriteError as bwe:
                    pprint(bwe.details)
                finally:
                    bulk_write = []
            doc["schema_version"] = "2"
            # json round trip strips NaN values that MongoDB cannot store
            bulk_write.append(UpdateOne({'ec_number': doc["ec_number"]},
                                        {'$set': json.loads(json.dumps(doc, ignore_nan=True))},
                                        upsert=True))
        if bulk_write:
            try:
                await self.to_collection.bulk_write(bulk_write)
            except BulkWriteError as bwe:
                pprint(bwe.details)
            finally:
                print("Done.")


async def main(tx, rx):
    # asyncio.run requires a coroutine; previously this was a plain function,
    # so the gathered tasks were never awaited and asyncio.run raised
    await asyncio.gather(tx, rx)

if __name__ == '__main__':
    src = MigrateEC(to_database="test", max_entries=100)
    asyncio.run(main(src.index_primary("ec_number"), src.process_cursor(skip=0)))
class MigrateCorum:
    """Copy ``corum`` documents between databases, renaming legacy fields and
    stamping schema version 2."""

    def __init__(self, collection="corum", to_database="datanator-test",
                 from_database="datanator", max_entries=float("inf")):
        """
        Args:
            collection (:obj:`str`, optional): name of the collection to migrate
            to_database (:obj:`str`, optional): destination database name
            from_database (:obj:`str`, optional): source database name
            max_entries (:obj:`int` or :obj:`float`, optional): maximum number of documents to process
        """
        self.collection = collection
        self.from_database = from_database
        self.to_database = to_database
        self.from_collection = motor_client_manager.client.get_database(from_database)[collection]
        self.to_collection = motor_client_manager.client.get_database(to_database)[collection]
        self.max_entries = max_entries

    async def index_primary(self, _key, background=True):
        """Index key (single key ascending)

        Args:
            _key(:obj:`str`): Name of key to be indexed
            background(:obj:`bool`, optional): build the index in the background
        """
        # ``await`` instead of ``yield``: the old generator form meant calling
        # this method never actually created the index
        await self.to_collection.create_index(_key, background=background)

    async def process_cursor(self, skip=0):
        """Transform documents from the source collection and upsert them into
        the destination collection (keyed by ``ComplexID``) in batches of 50.

        Args:
            skip(:obj:`int`, optional): number of source documents to skip
        """
        bulk_write = []
        query = {}
        # motor interprets limit=0 as "no limit"
        limit = 0 if self.max_entries == float('inf') else self.max_entries
        docs = self.from_collection.find(filter=query, projection={'_id': 0},
                                         no_cursor_timeout=True, batch_size=10,
                                         skip=skip, limit=limit)
        i = 0
        async for doc in docs:
            i += 1
            if i == self.max_entries:
                break
            if i != 0 and i % 50 == 0:
                print("Processing file {}".format(i + skip))
                try:
                    # motor collections are asynchronous: bulk_write returns a
                    # coroutine that must be awaited, otherwise nothing is written
                    await self.to_collection.bulk_write(bulk_write)
                except BulkWriteError as bwe:
                    pprint(bwe.details)
                finally:
                    bulk_write = []
            # drop the legacy id and rename the taxonomy field for schema 2
            doc.pop("complex_id")
            doc["ncbi_taxonomy_id"] = doc.pop("SWISSPROT_organism_NCBI_ID")
            doc["schema_version"] = "2"
            # json round trip strips NaN values that MongoDB cannot store
            bulk_write.append(UpdateOne({'ComplexID': doc.get("ComplexID")},
                                        {'$set': json.loads(json.dumps(doc, ignore_nan=True))},
                                        upsert=True))
        if bulk_write:
            try:
                await self.to_collection.bulk_write(bulk_write)
            except BulkWriteError as bwe:
                pprint(bwe.details)
            finally:
                print("Done.")


def main():
    """Create the primary index, then migrate the collection."""
    loop = asyncio.get_event_loop()
    src = MigrateCorum()
    # index_primary is a coroutine and must be driven by the event loop;
    # previously it was called without being awaited and silently did nothing
    loop.run_until_complete(src.index_primary('ComplexID'))
    loop.run_until_complete(src.process_cursor(skip=0))

if __name__ == '__main__':
    main()
'InChI=1S/C17H21N4O9P/c1-7-3-9-10(4-8(7)2)21(15-13(18-9)16(25)20-17(26)19-15)5-11(22)14(24)12(23)6-30-31(27,28)29' 31 | mol2 = 'InChI=1S/C10H7NO3/c12-9(10(13)14)7-5-11-8-4-2-1-3-6(7)8/h1-5,11H,(H,13,14)' 32 | mol3 = 'InChI=1S/C17H21N4O9P/c1-7-3-9-10(4-8(7)2)21(15-13(18-9)16(25)20-17(26)19-15)5-11(22)14(24)12(23)6-30-31(27,28)29' 33 | coe = self.src.get_tanimoto(mol1, mol2) 34 | coe2 = self.src.get_tanimoto(mol1, mol3) 35 | self.assertEqual(0.121, coe) 36 | self.assertEqual(1., coe2) 37 | 38 | @unittest.skip('out of date') 39 | def test_one_to_many(self): 40 | inchi = 'InChI=1S/C5H8O3/c1-3(2)4(6)5(7)8/h3H,1-2H3,(H,7,8)' 41 | coeff, hashes = self.src.one_to_many(inchi, collection_str='metabolites_meta', 42 | field='inchi', lookup='inchi', num=10) 43 | print(len(hashes)) 44 | client, _, col = mongo_util.MongoUtil(db = self.db, MongoDB = self.server, 45 | username = self.username, password = self.password).con_db('metabolites_meta') 46 | inchi1 = col.find_one({'inchi': hashes[5]})['inchi'] 47 | inchi2 = col.find_one({'inchi': hashes[9]})['inchi'] 48 | self.assertEqual(coeff[5], self.src.get_tanimoto(inchi, inchi1)) 49 | self.assertEqual(coeff[9], self.src.get_tanimoto(inchi, inchi2)) 50 | 51 | @unittest.skip('out of date') 52 | def test_many_to_many(self): 53 | client, _, col = mongo_util.MongoUtil(db = self.db, MongoDB = self.server, 54 | username = self.username, password = self.password).con_db('metabolites_meta') 55 | self.src.many_to_many(collection_str1='metabolites_meta', 56 | collection_str2='metabolites_meta', field1='inchi', 57 | field2='inchi', lookup1='inchi', 58 | lookup2='inchi', num=10) 59 | -------------------------------------------------------------------------------- /datanator/util/mongo_util.py: -------------------------------------------------------------------------------- 1 | import pymongo 2 | import wc_utils.quilt 3 | from bson import decode_all 4 | import hashlib 5 | from genson import SchemaBuilder 6 | 7 | 8 | class MongoUtil: 9 | 10 | def 
class MongoUtil:
    """Convenience wrapper around a :class:`pymongo.MongoClient`/database pair.

    Builds a ``mongodb+srv`` connection URI from the given credentials and
    exposes helpers to list collections, open a collection, seed a collection
    from a Karr Lab quilt package, and inspect document schemas.
    """

    def __init__(self, cache_dirname=None, MongoDB=None, replicaSet=None, db='test',
                 verbose=False, max_entries=float('inf'), username=None,
                 password=None, authSource='admin', readPreference='nearest'):
        # Bug fix: cache_dirname/verbose/max_entries were accepted but never
        # stored, so fill_db() crashed with AttributeError on self.cache_dirname.
        self.cache_dirname = cache_dirname
        self.verbose = verbose
        self.max_entries = max_entries
        string = "mongodb+srv://{}:{}@{}/{}?authSource={}&retryWrites=true&w=majority&readPreference={}".format(
            username, password, MongoDB, db, authSource, readPreference)
        self.client = pymongo.MongoClient(string)
        self.db_obj = self.client.get_database(db)

    def list_all_collections(self):
        '''List all non-system collections within database
        '''
        return self.db_obj.list_collection_names()

    def con_db(self, collection_str):
        '''Return ``(client, database, collection)`` for `collection_str`.

        On connection problems a descriptive string is returned instead of a
        tuple (kept as-is for backward compatibility with existing callers).
        '''
        try:
            collection = self.db_obj[collection_str]
            return (self.client, self.db_obj, collection)
        except pymongo.errors.ConnectionFailure:
            return ('Server not available')
        except pymongo.errors.ServerSelectionTimeoutError:
            # Bug fix: was the unqualified name ServerSelectionTimeoutError,
            # which itself raised NameError whenever this handler was reached.
            return ('Server timeout')

    def fill_db(self, collection_str):
        '''Check if collection is already in MongoDB

        If already in MongoDB:
            Do nothing
        Else:
            Load data into db from quiltdata (karrlab/datanator)

        Args:
            collection_str: name of collection (e.g. 'ecmdb', 'pax', etc)
        '''
        _, _, collection = self.con_db(collection_str)
        # count_documents() replaces the deprecated Cursor.count().
        if collection.count_documents({}) != 0:
            return collection
        manager = wc_utils.quilt.QuiltManager(
            path=self.cache_dirname, package='datanator')
        filename = collection_str + '.bson'
        manager.download_package(filename)
        with open((self.cache_dirname + '/' + filename), 'rb') as f:
            # insert_many() replaces the deprecated Collection.insert().
            collection.insert_many(decode_all(f.read()))
        return collection

    def print_schema(self, collection_str):
        '''Print out schema of a collection
        removed '_id' from collection due to its object type
        and universality
        '''
        _, _, collection = self.con_db(collection_str)
        # NOTE(review): assumes the collection is non-empty; find_one()
        # returns None on an empty collection and the del below would fail.
        doc = collection.find_one({})
        builder = SchemaBuilder()
        del doc['_id']
        builder.add_object(doc)
        return builder.to_schema()

    def flatten_collection(self, collection_str):
        '''Flatten a collection

        c is ommitted because it does not have a non-object
        value associated with it
        '''
        _, _, collection = self.con_db(collection_str)

        pipeline = [
            {"$addFields": {"subdoc.a": "$a"}},
            {"$replaceRoot": {"newRoot": "$subdoc"}}
        ]
        flat_col = collection.aggregate(pipeline)
        return flat_col
cls.cache_dir = tempfile.mkdtemp() 22 | cls.protein_col = 'uniprot' 23 | cls.rna_col = 'rna_halflife' 24 | cls.src = rna_halflife_util.RnaHLUtil(server=MongoDB, username=username, 25 | password=password, src_db=src_db, des_db=des_db, protein_col=cls.protein_col, 26 | rna_col=cls.rna_col, readPreference='nearest', cache_dir=cls.cache_dir) 27 | 28 | @classmethod 29 | def tearDownClass(cls): 30 | shutil.rmtree(cls.cache_dir) 31 | cls.src.uniprot_collection_manager.db_obj.drop_collection(cls.protein_col) 32 | cls.src.uniprot_collection_manager.client.close() 33 | cls.src.uniprot_query_manager.client.close() 34 | 35 | @unittest.skip('avoid r/w db') 36 | def test_fill_uniprot_by_oln(self): 37 | self.src.fill_uniprot_by_oln('MA0002') 38 | 39 | @unittest.skip('links will not work from nonacademic IPs.') 40 | def test_make_df(self): 41 | url_0 = 'https://oup.silverchair-cdn.com/oup/backfile/Content_public/Journal/nar/41/1/10.1093/nar/gks1019/2/gks1019-nar-00676-a-2012-File003.xlsx?Expires=1578425844&Signature=ZRFUxLdn4-vaBt5gQci~0o56KqyR9nJj9i32ig5X6YcfqiJeV3obEq8leHGdDxx6w~KABgewiQ66HTB7gmuG~2GL-YgxPKYSjt17WrYMkc-0ibw6TMlTvWZZfvw-lPe~wvpmVfNEXnTbP7jHyNLu9jeJ6yhoXvgIyQtzA5PbEI1fyXEgeZzOKMltmITqL3g3APsPsagCTC66rwrBT23Aghh6D314uilT2DZHCc68MH2nyV~qAhFqIQiOj-7VTEKqkDPvPYvuE2KNKXdvW23gk100YV~58ozbt8ijRz5Gr5gPtE~f1Ab5l260EIbWHJNabMRleInJQqUIDPFN4C38PQ__&Key-Pair-Id=APKAIE5G5CRDK6RD3PGA' 42 | df_0 = self.src.make_df(url_0, 'Supplementary Table 1', usecols='B:D', skiprows=[0,1,2], 43 | names=['ordered_locus_name', 'half_life', 'r_squared']) 44 | self.assertEqual(df_0.iloc[0]['ordered_locus_name'], 'Rv0002') 45 | url_1 = 
'https://oup.silverchair-cdn.com/oup/backfile/Content_public/Journal/nar/42/4/10.1093_nar_gkt1150/1/gkt1150_Supplementary_Data.zip?Expires=1578928721&Signature=ADjsCSaceimzGs6aJ~uG7np88TzHNooAoBabdm-6utYVIZOEwRbzTdiBp~76vM4KEHz9Nir8GNrtA3AwHwGFm0bu~aorTG4xrOChS6UgfBQiUtgr8vfbDIUno1y1nxLGCKIfQrb2Gx-SVnigum2gjcveymK995zadSNZqN~w-vz-Ii0a6fH7kvKN8m9vLWf6fdo0NXSmgnkjj9KPCuS-bmK0y4ZH5Ex0Rl4qi5uCroYmDBNOhXY23pcalbpFwB1-07tA3~756gZN4Mo9uMeSVQKl5UsHzx5amB6WvSCXS8z756YoaaMCg0FQbUCcQ46fRGdHxcvPNcrPo5IMEGmi8g__&Key-Pair-Id=APKAIE5G5CRDK6RD3PGA' 46 | df_1 = self.src.make_df(url_1, 'TableS1', file_type='zip', file_name='nar-01935-a-2013-File011.xlsx', usecols='L:O', skiprows=list(range(0, 7)), 47 | names=['a', 'b', 'c', 'd']) 48 | self.assertEqual(df_1.iloc[0]['a'], 5.74239011770224) 49 | 50 | def test_fill_uniprot_with_df(self): 51 | pass -------------------------------------------------------------------------------- /datanator/data_source/sabio_compound.py: -------------------------------------------------------------------------------- 1 | from datanator.util import mongo_util, file_util, chem_util 2 | import datanator.config.core 3 | from pymongo.collation import Collation, CollationStrength 4 | from pymongo import ASCENDING 5 | import os 6 | import tempfile 7 | 8 | 9 | class SabioCompound: 10 | 11 | def __init__(self, username=None, password=None, server=None, authSource='admin', 12 | src_database='datanator', dest_database=None, max_entries=float('inf'), verbose=True, 13 | src_collection='sabio_compound', dest_collection=None, cache_dir=None): 14 | ''' 15 | Args: 16 | src_database (:obj: `str`): name of database in which source collections reside 17 | ''' 18 | self.mongo_manager = mongo_util.MongoUtil(MongoDB=server, username=username, 19 | password=password, authSource=authSource, db=src_database) 20 | self.file_manager = file_util.FileUtil() 21 | self.chem_manager = chem_util.ChemUtil() 22 | self.collation = Collation(locale='en', strength=CollationStrength.SECONDARY) 
class SabioCompound:
    """Post-process the ``sabio_compound`` MongoDB collection, deriving an
    InChIKey for every compound that carries structure information.
    """

    def __init__(self, username=None, password=None, server=None, authSource='admin',
                 src_database='datanator', dest_database=None, max_entries=float('inf'), verbose=True,
                 src_collection='sabio_compound', dest_collection=None, cache_dir=None):
        '''
        Args:
            src_database (:obj:`str`): name of database in which source collections reside
        '''
        self.mongo_manager = mongo_util.MongoUtil(MongoDB=server, username=username,
                                                  password=password, authSource=authSource, db=src_database)
        self.file_manager = file_util.FileUtil()
        self.chem_manager = chem_util.ChemUtil()
        self.collation = Collation(locale='en', strength=CollationStrength.SECONDARY)
        self.verbose = verbose
        self.max_entries = max_entries
        self.src_collection = src_collection

    def add_inchi_key(self):
        """Add an ``inchi_key`` field to every sabio_compound document that
        has structure information; documents without structures are skipped.
        """
        query = {}
        projection = {'structures._value_inchi': 1}
        _, _, collection = self.mongo_manager.con_db(self.src_collection)
        docs = collection.find(filter=query, projection=projection)
        count = collection.count_documents(query)
        for i, doc in enumerate(docs):
            if i == self.max_entries:
                break
            if self.verbose and i % 100 == 0:
                print('Processing doc {} out of {}'.format(i, count))
            try:
                inchi = doc['structures'][0]['_value_inchi']
            except IndexError:
                print('Compound with id {} has no structure information'.format(doc['_id']))
                # Bug fix: previously fell through and reused the previous
                # doc's inchi (or raised NameError on the first doc).
                continue
            except KeyError:
                print('Compound with id {} has no structure array'.format(doc['_id']))
                continue
            inchi_key = self.chem_manager.inchi_to_inchikey(inchi)
            collection.update_one({'_id': doc['_id']},
                                  {'$set': {'inchi_key': inchi_key}})


def main():
    cache_dirname = tempfile.mkdtemp()
    cache_dir = os.path.join(cache_dirname, 'logs.txt')
    src_db = 'datanator'
    collection_str = 'sabio_compound'
    username = datanator.config.core.get_config()[
        'datanator']['mongodb']['user']
    password = datanator.config.core.get_config(
    )['datanator']['mongodb']['password']
    server = datanator.config.core.get_config(
    )['datanator']['mongodb']['server']
    src = SabioCompound(username=username, password=password, server=server,
                        authSource='admin', src_database=src_db,
                        verbose=True, src_collection=collection_str,
                        cache_dir=cache_dir)
    src.add_inchi_key()


if __name__ == '__main__':
    main()
import pandas as pd
import json
import numpy as np
from datanator_query_python.config import config
from datanator_query_python.util import mongo_util


class ParsePsortExperimental(mongo_util.MongoUtil):
    """Loader for ePSORTdb ``Experimental-PSORTdb-v4.00.tsv`` records into
    MongoDB.
    """

    def __init__(self, max_entries=float('inf'),
                 MongoDB=None,
                 db=None,
                 collection=None,
                 username=None,
                 password=None,
                 authSource='admin',
                 readPreference='nearest'):
        """
        Args:
            max_entries (:obj:`float`, optional): maximum number of rows to process.
            collection (:obj:`str`, optional): name of the destination collection.
        """
        super(ParsePsortExperimental, self).__init__(MongoDB=MongoDB, db=db,
                                                     username=username,
                                                     password=password,
                                                     authSource=authSource,
                                                     readPreference=readPreference)
        self.max_entries = max_entries
        self.collection = collection

    def parse_psortdb(self):
        """
        To parse database psortdb Experimental-PSORTdb-v4.00.tsv file and
        upsert fields into ``self.collection``.

        Return:
            ()
        """
        collection = self.db_obj[self.collection]
        # TSV-parsing scaffold kept (disabled) from the original author; it
        # referenced an undefined NpEncoder and was never active:
        # data = pd.read_csv('Experimental-PSORTdb-v4.00.tsv', delimiter="\t")
        # data = data.fillna("None")
        # ... one JSON file per row, named by SwissProt_ID or Other_Accession
        #
        # Bug fix: with $addToSet the field name comes first and the $each
        # modifier nests under it ({field: {"$each": [...]}}).  The original
        # nested "$each" directly under "$addToSet", which MongoDB rejects.
        collection.update_one({"uniprot_id": "P01234"},
                              {"$set": {"protein_name": "some_name",
                                        "another_field": "another_value"},
                               "$addToSet": {"add_id": {"$each": [{"namespace": "something",
                                                                   "value": "1"}]}}})
| 62 | 63 | def main(): 64 | conf = config.Justin() 65 | conf_main = config.Config() 66 | username = conf.USERNAME 67 | password = conf.PASSWORD 68 | MongoDB = conf_main.SERVER 69 | src = ParsePsortExperimental(MongoDB=MongoDB, 70 | username=username, 71 | password=password, 72 | collection="protein_localization", 73 | db="datanator-demo") 74 | src.parse_psortdb() 75 | 76 | 77 | if __name__ == '__main__': 78 | main() 79 | -------------------------------------------------------------------------------- /datanator/data_source/protein_localization/parse_psortdb_negative_wo_outer_membrane.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | 4 | class ParsePsort: 5 | def __init__(self, max_entries): 6 | self.max_entries = max_entries 7 | 8 | def parse_psortdb(self): 9 | """ 10 | To parse database psortdb gram negative without outer membrane file 11 | and create JSON files conforming to datanator_pattern/observation_compiled.json 12 | 13 | Args: 14 | max_entries(:obj:'int'): number of rows to parse. 
import pandas as pd
import json
import os


class ParsePsort:
    """Parse PSORTdb "Computed: Gram-negative without outer membrane"
    predictions into JSON documents conforming to
    datanator_pattern/observation_compiled.json (one file per row).
    """

    # (key in the output "value" object, column index in the PSORTdb tab
    # file); order matters — it is preserved in the emitted JSON.
    _VALUE_COLUMNS = [
        ("PPSVM_Localization", 1),
        ("Profile_Localization", 3),
        ("Signal_Localization", 5),
        ("SCL-BLASTe_Localization", 7),
        ("CMSVM_Localization", 9),
        ("SCL-BLAST_Localization", 11),
        ("OMPMotif_Localization", 13),
        ("OMSVM_Localization", 15),
        ("Motif_Localization", 17),
        ("CytoSVM_Localization", 19),
        ("CWSVM_Localization", 21),
        ("ModHMM_Localization", 23),
        ("ECSVM_Localization", 25),
        ("Cytoplasmic Membrane_Score", 27),
        ("Cellwall_Score", 28),
        ("Extracellular_Score", 29),
        ("Cytoplasmic_Score", 30),
        ("Final_Localization", 31),
        ("Final_Localization_2", 32),
        ("Secondary_Localization", 34),
        ("Final_Score", 35),
    ]

    def __init__(self, max_entries):
        """
        Args:
            max_entries (:obj:`int`): number of rows to parse; a JSON file is
                created for each of the file's first ``max_entries`` rows.
        """
        self.max_entries = max_entries

    @staticmethod
    def _seq_id(raw):
        """Extract the RefSeq accession from a ``gi|...|ref|ACC|`` header."""
        s = str(raw)
        return s[s.find("ref") + 4:s.rfind("|")]

    def parse_psortdb(self,
                      filename='Computed-Gram_negative_without_outer_membrane-PSORTdb-3.00.tab',
                      out_dir='Gram_Negative_WO_Outer_Membrane'):
        """Parse the PSORTdb tab file and write one JSON file per row.

        Args:
            filename (:obj:`str`, optional): path of the PSORTdb tab file
                (previously hard-coded; default preserves old behavior).
            out_dir (:obj:`str`, optional): directory JSON files are written
                to (previously hard-coded; default preserves old behavior).

        Return:
            ()
        """
        data = pd.read_csv(filename, delimiter="\t", low_memory=False)
        data = data.where(pd.notnull(data), None)
        for i in range(self.max_entries):
            raw_header = str(data.iloc[i, 0])
            d = {}
            # entity
            d["entity"] = {
                "type": "protein",
                # text after the final '| ' of the FASTA-style header
                "name": raw_header[raw_header.rfind("|") + 2:],
                "synonyms": [],
                "identifiers": [{"namespace": "Seq_ID",
                                 "value": self._seq_id(data.iloc[i, 0])}],
            }
            # per-tool localization calls and scores
            d["value"] = {key: data.iloc[i, col] for key, col in self._VALUE_COLUMNS}
            # source
            d["source"] = {"namespace": "PSORT",
                           "value": "Version " + str(data.iloc[i, 36])}
            with open(os.path.join(out_dir, self._seq_id(data.iloc[i, 0]) + ".json"), "w+") as f:
                json.dump(d, f, indent=4)


if __name__ == '__main__':
    # Previously executed unconditionally at import time; guarded so the
    # module can be imported without touching the filesystem.
    p1 = ParsePsort(10)
    p1.parse_psortdb()
from datanator_query_python.config import motor_client_manager
import asyncio
import simplejson as json
from pymongo import UpdateOne
from pymongo.errors import BulkWriteError
from pprint import pprint


class MigrateMC:
    """Migrate ``metabolite_concentrations`` documents between databases,
    stamping ``schema_version`` "2" and denormalizing canonical taxon
    ancestor ids/names into each concentration entry.
    """

    def __init__(self, collection="metabolite_concentrations", to_database="datanator-test",
                 from_database="datanator", max_entries=float("inf")):
        self.collection = collection
        self.from_database = from_database
        self.to_database = to_database
        self.from_collection = motor_client_manager.client.get_database(from_database)[collection]
        self.to_collection = motor_client_manager.client.get_database(to_database)[collection]
        self.max_entries = max_entries

    async def index_primary(self, _key, background=True):
        """Index key (single key ascending)

        Args:
            _key(:obj:`str`): Name of key to be indexed
            background(:obj:`bool`, optional): build the index in the background.
        """
        await self.to_collection.create_index(_key, background=background)

    async def process_cursor(self, skip=0):
        """Transform source documents and upsert them (keyed on ``inchikey``)
        into the destination collection in batches of 50.

        Args:
            skip(:obj:`int`, optional): number of source documents to skip.
        """
        bulk_write = []
        query = {}
        if self.max_entries == float('inf'):
            limit = 0  # 0 means "no limit" to MongoDB
        else:
            limit = self.max_entries
        docs = self.from_collection.find(filter=query, projection={'_id': 0},
                                         no_cursor_timeout=True, batch_size=100,
                                         skip=skip, limit=limit)
        i = 0
        async for doc in docs:
            # Bug fix: the counter was previously incremented before the
            # max_entries check, stopping one document short of the cap.
            if i >= self.max_entries:
                break
            i += 1
            if i % 50 == 0:
                print("Processing file {}".format(i + skip))
                try:
                    await self.to_collection.bulk_write(bulk_write)
                except BulkWriteError as bwe:
                    pprint(bwe.details)
                bulk_write = []
            doc['schema_version'] = "2"
            for obj in doc["concentrations"]:
                tax_doc = await motor_client_manager.client.get_database(
                    "datanator-test")["taxon_tree"].find_one(
                        filter={"tax_id": obj["ncbi_taxonomy_id"]},
                        projection={'canon_anc_ids': 1, 'canon_anc_names': 1})
                # NOTE(review): assumes every ncbi_taxonomy_id resolves in
                # taxon_tree; a miss returns None and raises TypeError below
                # — confirm upstream guarantees.
                obj["canon_anc_ids"] = tax_doc["canon_anc_ids"]
                obj["canon_anc_names"] = tax_doc["canon_anc_names"]
                obj.pop("last_modified", None)
            bulk_write.append(UpdateOne({'inchikey': doc['inchikey']},
                                        {'$set': json.loads(json.dumps(doc, ignore_nan=True))},
                                        upsert=True))
        if len(bulk_write) != 0:
            try:
                # Bug fix: this final flush was missing ``await``; the motor
                # coroutine was created but never run, silently dropping the
                # last partial (<50-doc) batch.
                await self.to_collection.bulk_write(bulk_write)
            except BulkWriteError as bwe:
                pprint(bwe.details)
            finally:
                print("Done.")


async def main():
    # Instantiated here (not at module import) so importing this script has
    # no database side effects.
    src = MigrateMC()
    await asyncio.gather(src.index_primary('inchikey'),
                         src.process_cursor(skip=0))


if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
batch_size=100, 43 | skip=skip, limit=limit) 44 | i = 0 45 | async for doc in docs: 46 | i += 1 47 | if i == self.max_entries: 48 | break 49 | if i != 0 and i % 50 == 0: 50 | print("Processing file {}".format(i + skip)) 51 | try: 52 | await self.to_collection.bulk_write(bulk_write) 53 | bulk_write = [] 54 | except BulkWriteError as bwe: 55 | pprint(bwe.details) 56 | bulk_write = [] 57 | doc['schema_version'] = "2" 58 | for obj in doc["concentrations"]: 59 | tax_doc = await motor_client_manager.client.get_database( 60 | "datanator-test")["taxon_tree"].find_one(filter={"tax_id": obj["ncbi_taxonomy_id"]}, 61 | projection={'canon_anc_ids': 1, 'canon_anc_names': 1}) 62 | obj["canon_anc_ids"] = tax_doc["canon_anc_ids"] 63 | obj["canon_anc_names"] = tax_doc["canon_anc_names"] 64 | obj.pop("last_modified", None) 65 | bulk_write.append(UpdateOne({'inchikey': doc['inchikey']}, {'$set': json.loads(json.dumps(doc, ignore_nan=True))}, upsert=True)) 66 | if len(bulk_write) != 0: 67 | try: 68 | self.to_collection.bulk_write(bulk_write) 69 | except BulkWriteError as bwe: 70 | pprint(bwe.details) 71 | finally: 72 | print("Done.") 73 | 74 | src = MigrateMC() 75 | async def main(): 76 | await asyncio.gather(src.index_primary('inchikey'), 77 | src.process_cursor(skip=0)) 78 | 79 | if __name__ == '__main__': 80 | loop = asyncio.get_event_loop() 81 | loop.run_until_complete(main()) -------------------------------------------------------------------------------- /datanator/data_source/array_express_tools/taxon_exceptions.txt: -------------------------------------------------------------------------------- 1 | Recorded Taxon -- Correct Taxon 2 | Homo sapiens + Aspergillus fumigatus -- Homo sapiens, Aspergillus fumigatus 3 | Homo sapiens + Candida albicans -- Homo sapiens, Candida albicans 4 | Homo sapiens + Candida parapsilosis -- Homo sapiens 5 | Homo sapiens + Saccharomyces cerevisiae -- Homo sapiens 6 | human -- Homo sapiens 7 | mouse -- mus musculus 8 | Homo Sapien -- Homo sapiens 9 | 
Trypanosma congolense -- Trypanosoma congolense 10 | mixed sample: human cell line AGS and Salmonella Typhimurium SL1344 -- Homo sapiens, Salmonella Typhimurium SL1344 11 | mixed sample: human cell line CaCo-2 and Salmonella Typhimurium SL1344 -- Homo sapiens, Salmonella Typhimurium SL1344 12 | mixed sample: human cell line HEK 293 and Salmonella Typhimurium SL1344 -- Homo sapiens, Salmonella Typhimurium SL1344 13 | mixed sample: human cell line HT29 and Salmonella Typhimurium SL1344 -- Homo sapiens, Salmonella Typhimurium SL1344 14 | mixed sample: human cell line LoVo and Salmonella Typhimurium SL1344 -- Homo sapiens, Salmonella Typhimurium SL1344 15 | mixed sample: human cell line THP-1 (-PMA)and Salmonella Typhimurium SL1344 -- Homo sapiens, Salmonella Typhimurium SL1344 16 | mixed sample: mouse cell line BMDM and Salmonella Typhimurium SL1344 -- mus musculus, Salmonella Typhimurium SL1344 17 | mixed sample: mouse cell line L929 and Salmonella Typhimurium SL1344 -- mus musculus, Salmonella Typhimurium SL1344 18 | mixed sample: mouse cell line MEF and Salmonella Typhimurium SL1344 -- mus musculus, Salmonella Typhimurium SL1344 19 | mixed sample: mouse cell line RAW264.7 and Salmonella Typhimurium SL1344 -- mus musculus, Salmonella Typhimurium SL1344 20 | mixed sample: pig cell line 3D4/31 and Salmonella Typhimurium SL1344 -- sus scrofa, Salmonella Typhimurium SL1344 21 | mixed sample: pig cell line IPEC-J2 and Salmonella Typhimurium SL1344 -- sus scrofa, Salmonella Typhimurium SL1344 22 | human cell line HeLa-S3 -- Homo sapiens 23 | mixed sample (human cell line HeLa-S3 + Salmonella typhimurium SL1344) -- Homo sapiens, Salmonella Typhimurium SL1344 24 | mixed sample: human cell line HeLa-S3 and Salmonella Typhimurium SL1344 -- Homo sapiens, Salmonella Typhimurium SL1344 25 | Xanthomonas oryzae pv. 
oryzae KACC10331 -- Xanthomonas oryzae 26 | Leishmania mexicana and Mus musculus -- Leishmania mexicana, Mus musculus 27 | Acinetobacter baumanii -- Acinetobacter baumannii 28 | Zebrafish -- Danio rerio 29 | Human -- Homo sapiens 30 | Japanese rice fish -- Oryzias latipes 31 | Japanses rice fish -- Oryzias latipes 32 | House mouse -- mus musculus 33 | -- None 34 | Mouse -- mus musculus 35 | Homo sapien -- Homo sapiens 36 | Acinetobacter baumannii 1592897 -- Acinetobacter baumannii 37 | Acinetobacter baumannii 1564232 -- Acinetobacter baumannii 38 | Acinetobacter baumannii 983759 -- Acinetobacter baumannii 39 | Acinetobacter baumannii 478810 -- Acinetobacter baumannii 40 | Strongyloides_stercoralis -- Strongyloides stercoralis 41 | Solanum lycopersicum L. -- Solanum lycopersicum 42 | chimpanzee -- Pan troglodytes 43 | Holcus lanatus L -- Holcus lanatus 44 | Eupolybothrus sp. PS-2013 -- Eupolybothrus 45 | Oryctolagus cuniculus domesticus -- Oryctolagus cuniculus 46 | Reaumuria soongorica -- Reaumuria 47 | Lactobacillus oligofermentans LMG 22743T + Lactococcus piscium MKFS47 -- Lactobacillus oligofermentans DSM 15707 = LMG 22743, Lactococcus piscium MKFS47 48 | Lactococcus lactis subsp. cremoris CNCM I-1631 -- Lactococcus lactis subsp. cremoris 49 | Mycobacterium smegmatis str. MC2 155 -- Mycolicibacterium smegmatis MC2 155 50 | Lactobacillus oligofermentans LMG 22743T -- Lactobacillus oligofermentans DSM 15707 = LMG 22743 51 | Frankia sp. 
CcI3 -- Frankia 52 | -------------------------------------------------------------------------------- /tests/data_source/test_sabio_reaction.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from datanator.data_source import sabio_reaction 3 | import tempfile 4 | import shutil 5 | import json 6 | import os 7 | import datanator.config.core 8 | 9 | 10 | class TestProteinAggregate(unittest.TestCase): 11 | 12 | @classmethod 13 | def setUpClass(cls): 14 | cls.cache_dirname = tempfile.mkdtemp() 15 | cache_dir = os.path.join(cls.cache_dirname, 'logs.txt') 16 | src_db = 'datanator' 17 | des_db = 'test' 18 | cls.collection_str = 'sabio_reaction' 19 | username = datanator.config.core.get_config()[ 20 | 'datanator']['mongodb']['user'] 21 | password = datanator.config.core.get_config( 22 | )['datanator']['mongodb']['password'] 23 | server = datanator.config.core.get_config( 24 | )['datanator']['mongodb']['server'] 25 | port = datanator.config.core.get_config( 26 | )['datanator']['mongodb']['port'] 27 | cls.src = sabio_reaction.RxnAggregate(username=username, password=password, server=server, 28 | authSource='admin', src_database=src_db, max_entries=20, 29 | verbose=True, collection=cls.collection_str, destination_database=des_db, 30 | cache_dir=cache_dir) 31 | 32 | @classmethod 33 | def tearDownClass(cls): 34 | shutil.rmtree(cls.cache_dirname) 35 | cls.src.db_obj.drop_collection(cls.collection_str) 36 | cls.src.client.close() 37 | 38 | def test_get_id(self): 39 | input_0 = {'resource': [{'namespace': 'something'}, {'id': '2'}, {'namespace': 'sabiork.reaction', 'id': '6570'}]} 40 | result_0 = self.src.get_rxn_id(input_0) 41 | self.assertEqual(result_0, 6570) 42 | 43 | def test_create_reactants(self): 44 | input_0 = {'reaction_participant': [{}, {}, {}, {'substrate_aggregate': '123'}, {'product_aggregate': '456'}]} 45 | result_0 = self.src.create_reactants(input_0) 46 | self.assertEqual(result_0, {'substrate_aggregate': 
'123', 'product_aggregate': '456'}) 47 | 48 | def test_fill_collection(self): 49 | self.src.fill_collection() 50 | 51 | def test_extract_reactant_names(self): 52 | substrates_0 = {'substrate_name': 'a', 'substrate_synonym': ['a1', 'a2', 'a3']} 53 | substrates_1 = {'substrate_name': 'b', 'substrate_synonym': ['b1', 'b2', 'b3']} 54 | products_0 = {'product_name': 'c', 'product_synonym': ['c1', 'c2', 'c3']} 55 | products_1 = {'product_name': 'd', 'product_synonym': ['d1', 'd2', 'd3']} 56 | products_2 = {'product_name': 'e', 'product_synonym': []} 57 | input_0 = {'reaction_participant': [{'substrate': [substrates_0, substrates_1]},{'product': [products_0, products_1, products_2]}]} 58 | sub_0, pro_0 = self.src.extract_reactant_names(input_0) 59 | sub_exp_0 = [['a1', 'a2', 'a3', 'a'], ['b1', 'b2', 'b3', 'b']] 60 | pro_exp_0 = [['c1', 'c2', 'c3', 'c'], ['d1', 'd2', 'd3', 'd'], ['e']] 61 | self.assertEqual(sub_0, sub_exp_0) 62 | self.assertEqual(pro_0, pro_exp_0) 63 | 64 | def test_extract_enzyme_names(self): 65 | input_0 = {'enzymes': [{'enzyme':[{'enzyme_name': 'a', 'enzyme_synonym': ['a1', 'a2', 'a3']}]}]} 66 | input_1 = {'enzymes': [{'enzyme':[{'enzyme_name': 'a', 'enzyme_synonym': ['a1', 'a2', 'a3']}, 67 | {'enzyme_name': 'b', 'enzyme_synonym': ['b1', 'b2', 'b3']}]}]} 68 | result_0 = self.src.extract_enzyme_names(input_0) 69 | result_1 = self.src.extract_enzyme_names(input_1) 70 | self.assertEqual(result_0, ['a1', 'a2', 'a3', 'a']) 71 | self.assertEqual(result_1[0], ['a1', 'a2', 'a3', 'a']) -------------------------------------------------------------------------------- /tests/data_source/test_taxon_tree.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from datanator.data_source import taxon_tree 3 | import datanator.config.core 4 | import tempfile 5 | import shutil 6 | import os 7 | import json 8 | 9 | 10 | class TestTaxonTree(unittest.TestCase): 11 | 12 | @classmethod 13 | def setUpClass(cls): 14 | 
cls.cache_dirname = tempfile.mkdtemp() 15 | cls.db = 'test' 16 | username = datanator.config.core.get_config()['datanator']['mongodb']['user'] 17 | password = datanator.config.core.get_config()['datanator']['mongodb']['password'] 18 | MongoDB = datanator.config.core.get_config()['datanator']['mongodb']['server'] 19 | port = datanator.config.core.get_config()['datanator']['mongodb']['port'] 20 | replSet = datanator.config.core.get_config()['datanator']['mongodb']['replSet'] 21 | cls.collection_str = 'taxon_tree' 22 | cls.src = taxon_tree.TaxonTree( 23 | cls.cache_dirname, MongoDB, cls.db, replicaSet=None, 24 | verbose=True, max_entries=10, username = username, password = password) 25 | cls.path = os.path.join(cls.cache_dirname, cls.collection_str) 26 | 27 | @classmethod 28 | def tearDownClass(cls): 29 | shutil.rmtree(cls.cache_dirname) 30 | cls.src.client.close() 31 | 32 | # @unittest.skip('passed') 33 | def test_download_dump(self): 34 | noi = 'division.dmp' 35 | my_file = os.path.join(self.path, noi) 36 | self.src.download_dump() 37 | self.assertTrue(os.path.isfile(my_file)) 38 | 39 | # @unittest.skip('passed') 40 | def test_parse_fullname_line(self): 41 | line1 = '1936272 | Candidatus Heimdallarchaeota | cellular organisms; Archaea; Asgard group; |' 42 | line2 = '2012493 | Candidatus Heimdallarchaeota archaeon B3_Heim | cellular organisms; Archaea; Asgard group; Candidatus Heimdallarchaeota; |' 43 | line3 = '1935183 | Asgard group | cellular organisms; Archaea; |' 44 | 45 | self.assertEqual(self.src.parse_fullname_line(line1), [ 46 | '1936272', 'Candidatus Heimdallarchaeota', ['cellular organisms', 'Archaea', 'Asgard group']]) 47 | self.assertEqual(self.src.parse_fullname_line(line2)[:2], [ 48 | '2012493', 'Candidatus Heimdallarchaeota archaeon B3_Heim']) 49 | self.assertEqual(self.src.parse_fullname_line(line3) 50 | [1], 'Asgard group') 51 | self.assertEqual(self.src.parse_fullname_line(line3)[ 52 | 2], ['cellular organisms', 'Archaea']) 53 | 54 | # 
@unittest.skip('passed') 55 | def test_parse_taxid_line(self): 56 | line1 = '1841596\t|\t131567 2157 1935183 1936272 \t|\n' 57 | self.assertEqual(self.src.parse_taxid_line(line1), [ 58 | '131567', '2157', '1935183', '1936272']) 59 | 60 | # @unittest.skip('passed') 61 | def test_parse_fullname_taxid(self): 62 | self.src.parse_fullname_taxid() 63 | doc = self.src.collection.find_one({'tax_id': 1935183}) 64 | self.assertEqual(doc['anc_id'], [131567, 2157]) 65 | 66 | # @unittest.skip('passed') 67 | def test_parse_nodes(self): 68 | self.src.parse_nodes() 69 | doc = self.src.collection.find_one({'tax_id': 1}) 70 | self.assertEqual(doc['tax_name'], 'root') 71 | self.assertEqual(doc['division_id'], 8) 72 | 73 | # @unittest.skip('passed') 74 | def test_parse_division(self): 75 | self.src.parse_division() 76 | 77 | # @unittest.skip('passed') 78 | def test_parse_names(self): 79 | self.src.parse_names() 80 | 81 | # @unittest.skip('passed') 82 | def test_parse_gencode(self): 83 | self.src.parse_gencode() 84 | 85 | def test_load_content(self): 86 | self.src.load_content() 87 | -------------------------------------------------------------------------------- /tests/data_source/test_metabolite_nosql.py: -------------------------------------------------------------------------------- 1 | '''Tests of metabolite_nosql 2 | 3 | :Author: Zhouyang Lian 4 | :Author: Jonathan 5 | :Date: 2019-04-02 6 | :Copyright: 2019, Karr Lab 7 | :License: MIT 8 | ''' 9 | 10 | import unittest 11 | import shutil 12 | import tempfile 13 | from datanator.data_source import metabolite_nosql 14 | import datanator.config.core 15 | import os 16 | import json 17 | 18 | 19 | class TestMetaboliteNoSQL(unittest.TestCase): 20 | 21 | @classmethod 22 | def setUpClass(cls): 23 | cls.cache_dirname = tempfile.mkdtemp() 24 | cls.source = 'ecmdb' # 'ymdb' or 'ecmdb' 25 | cls.db = 'datanator' 26 | username = datanator.config.core.get_config()['datanator']['mongodb']['user'] 27 | password = 
datanator.config.core.get_config()['datanator']['mongodb']['password'] 28 | MongoDB = datanator.config.core.get_config()['datanator']['mongodb']['server'] 29 | port = datanator.config.core.get_config()['datanator']['mongodb']['port'] 30 | replSet = datanator.config.core.get_config()['datanator']['mongodb']['replSet'] 31 | cls.output_directory = cls.cache_dirname # directory to store JSON files 32 | cls.src = metabolite_nosql.MetaboliteNoSQL(cls.output_directory, 33 | cls.source, MongoDB, cls.db, verbose = True, max_entries=20, 34 | username = username, password = password) 35 | cls.client, cls.db_obj, cls.collection = cls.src.con_db(cls.source) 36 | 37 | @classmethod 38 | def tearDownClass(cls): 39 | shutil.rmtree(cls.cache_dirname) 40 | cls.client.close() 41 | 42 | @unittest.skip('ecmdb.ca and ymdb.ca xml server http 500 error') 43 | def test_write_to_json(self): 44 | session = self.src.write_to_json() 45 | null = None 46 | if self.source == 'ymdb': 47 | ymdb_6 = self.collection.find({"ymdb_id": "YMDB00006"})[0] 48 | self.assertEqual(ymdb_6['ymdb_id'], "YMDB00006") 49 | self.assertEqual(ymdb_6['species'], "Saccharomyces cerevisiae") 50 | self.assertEqual(ymdb_6['name'], "1D-Myo-inositol 1,4,5,6-tetrakisphosphate") 51 | 52 | ymdb_10 = self.collection.find({"ymdb_id": "YMDB00010"})[0] 53 | self.assertEqual(ymdb_10['ymdb_id'], "YMDB00010") 54 | self.assertEqual(ymdb_10['species'], "Saccharomyces cerevisiae") 55 | self.assertEqual(ymdb_10['wikipedia'], None) 56 | 57 | file_name = self.output_directory + '/' + 'YMDB00003.json' 58 | with open (file_name, 'r') as f: 59 | data = json.load(f) 60 | self.assertEqual(data['ymdb_id'], "YMDB00003") 61 | self.assertEqual(data['name'], "Urea") 62 | self.assertEqual(data['state'], "Solid") 63 | 64 | 65 | elif self.source == 'ecmdb': 66 | ecmdb_5 = self.collection.find({"m2m_id": "M2MDB000005"})[0] 67 | self.assertEqual(ecmdb_5['accession'], "ECMDB00023") 68 | self.assertEqual(ecmdb_5['name'], "3-Hydroxyisobutyric acid") 69 | 
self.assertEqual(ecmdb_5['chemical_formula'], "C4H8O3") 70 | 71 | ecmdb_10 = self.collection.find({"m2m_id": "M2MDB000010"})[0] 72 | self.assertEqual(ecmdb_10['accession'], "ECMDB00034") 73 | self.assertEqual(ecmdb_10['name'], "Adenine") 74 | self.assertEqual(ecmdb_10['chemical_formula'], "C5H5N5") 75 | 76 | file_name = self.output_directory + '/' + 'M2MDB000003.json' 77 | with open (file_name, 'r') as f: 78 | data = json.load(f) 79 | self.assertEqual(data['accession'], "ECMDB00014") 80 | self.assertEqual(data['name'], "Deoxycytidine") 81 | self.assertEqual(data['wikipedia'], "Deoxycytidine") 82 | 83 | else: 84 | print("Database source has to be 'ecmdb' or 'ymdb'") 85 | 86 | 87 | -------------------------------------------------------------------------------- /datanator/data_source/protein_modification/10_1093_nar_gkw1075.py: -------------------------------------------------------------------------------- 1 | """Parse tsv file generated by datanator.data_source.protein_modification.pro 2 | """ 3 | import pandas as pd 4 | from datanator_query_python.util import mongo_util 5 | from pymongo.collation import Collation, CollationStrength 6 | import numpy as np 7 | 8 | 9 | class ProteinMod(mongo_util.MongoUtil): 10 | 11 | def __init__(self, file_location, MongoDB=None, db=None, collection_str=None, username=None, 12 | password=None, authSource='admin', readPreference='nearest', verbose=True, 13 | max_entries=float('inf')): 14 | """ 15 | 16 | Args: 17 | file_location(:obj:`str`): location of csv file to be parsed. 18 | collection_str(:obj:`str`): name of collection in MongoDB to be filled. 
19 | """ 20 | super().__init__(MongoDB=MongoDB, db=db, username=username, password=password, 21 | authSource=authSource, readPreference=readPreference) 22 | self.collection = self.db_obj[collection_str] 23 | self.verbose = verbose 24 | self.file_location = file_location 25 | self.max_entries = max_entries 26 | self.collation = Collation(locale='en', strength=CollationStrength.SECONDARY) 27 | 28 | def fill_collection(self, start_row=0): 29 | """ 30 | Fill collection collection_str. 31 | 32 | Args: 33 | start_row (:obj:`int`, optional): Read from csv row. Defaults to 0. 34 | """ 35 | df = pd.read_csv(self.file_location, header=0, error_bad_lines=False, 36 | engine='c', sep='\t', 37 | low_memory=False, skiprows=start_row) 38 | df.columns = [x.lower() for x in ['PRO_id', 'UniProt_id', 'Organism', 'Unmodified_sequence_IUBMB', 39 | 'Processing', 'Deletions', 'Processsed_sequence_IUBMB', 'Processsed_formula', 40 | 'Processsed_molecular_weight', 'Processsed_charge', 'Modifications', 41 | 'Crosslinks', 'Modified_sequence_abbreviated_BpForms', 'Modified_sequence_BpForms', 42 | 'concrete', 'Modified_formula', 'Modified_molecular_weight', 'Modified_charge', 43 | 'Modifications_formula', 'Modifications_molecular_weight', 'Modifications_charge', 44 | 'PRO_issues', 'Monomeric_form_issues']] 45 | df = df.drop(columns=['organism', 'unmodified_sequence_iubmb']) 46 | for i, row in df.iterrows(): 47 | if i == self.max_entries: 48 | break 49 | if row['concrete'] == False or row['pro_issues'] == np.NAN or row['monomeric_form_issues'] == np.NAN: 50 | continue 51 | if i % 50 == 0 and self.verbose: 52 | print('Processing row {}'.format(i)) 53 | uniprot_id = row['uniprot_id'][:6] 54 | row['reference'] = {'doi': '10.1093/nar/gkw1075'} 55 | self.collection.update_many({'uniprot_id': uniprot_id}, 56 | {'$addToSet': {'modifications': row.to_dict()}}, 57 | collation=self.collation, upsert=False) 58 | 59 | 60 | import datanator.config.core 61 | from pathlib import Path 62 | 63 | def main(): 64 | 
db = 'datanator' 65 | collection_str = 'uniprot' 66 | username = datanator.config.core.get_config()[ 67 | 'datanator']['mongodb']['user'] 68 | password = datanator.config.core.get_config( 69 | )['datanator']['mongodb']['password'] 70 | MongoDB = datanator.config.core.get_config( 71 | )['datanator']['mongodb']['server'] 72 | file_location = str(Path('~/karr_lab/datanator/docs/modified_protein_sequences/pro.out.tsv').expanduser()) 73 | manager = ProteinMod(file_location, MongoDB=MongoDB, db=db, collection_str=collection_str, 74 | username=username, password=password) 75 | manager.fill_collection(start_row=None) 76 | 77 | if __name__ == '__main__': 78 | main() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Documentation](https://readthedocs.org/projects/datanator/badge/?version=latest)](http://docs.karrlab.org/datanator) 2 | [![Test results](https://circleci.com/gh/KarrLab/datanator.svg?style=shield)](https://circleci.com/gh/KarrLab/datanator) 3 | [![Test coverage](https://coveralls.io/repos/github/KarrLab/datanator/badge.svg)](https://coveralls.io/github/KarrLab/datanator) 4 | [![Code analysis](https://api.codeclimate.com/v1/badges/e9b796130e29aee4672f/maintainability)](https://codeclimate.com/github/KarrLab/datanator) 5 | [![License](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE) 6 | 7 | # Datanator: Toolkit for discovering and aggregating data for whole-cell modeling 8 | 9 | ## Contents 10 | * [Overview](#overview) 11 | * [Installation instructions and documentation](#installation-instructions-and-documentation) 12 | * [Testing Datanator](#testing-datanator) 13 | * [License](#license) 14 | * [Development team](#development-team) 15 | * [Questions and comments](#questions-and-comments) 16 | 17 | ## Overview 18 | Extensive data is needed to build comprehensive predictive models of cells. 
Although the literature and public repositories contain extensive data about cells, this data is hard to utilize for modeling because it is scattered across a large number of sources; because it is described with inconsistent identifiers, units, and data models; and because there are few tools for finding relevant data for modeling specific species and environmental conditions. 19 | 20 | Datanator is a software tool for discovering, aggregating, and integrating the data needed for modeling cells. This includes metabolite, RNA, and protein abundances; protein complex compositions; transcription factor binding motifs; and kinetic parameters. Datanator is particularly useful for building large models, such as whole-cell models, that require large amounts of data to constrain large numbers of parameters. 21 | 22 | This package contains the source code for Datanator. The data aggregated with Datanator is available at [https://www.datanator.info](https://www.datanator.info). The data is also available for download as a MongoDB snapshot from [Zenodo](https://doi.org/10.5281/zenodo.3971048). 23 | 24 | ## Installation instructions and documentation 25 | Please see the [documentation](http://docs.karrlab.org/datanator) for installation instructions, user instructions, and code documentation. 26 | 27 | Note: Datanator only supports Python 3. 28 | 29 | If one needs to use the datanator database hosted by the Karr Lab, one will need the `karr_lab_build_config` repository saved 30 | as `.wc` in the user home directory. 31 | 32 | 33 | ## Testing Datanator 34 | To ensure Datanator works properly, we have developed extensive unit tests of every aspect of `datanator`.
We recommend using `pytest` to run these tests as follows: 35 | 36 | ``` 37 | python3 -m pytest tests 38 | ``` 39 | 40 | ## License 41 | We aim to provide data and tools for working with this data with no additional restrictions beyond those imposed by the third-party data sources and software libraries used to construct Datanator. 42 | 43 | The content of the Datanator database is a compilation of data curated by the Datanator Team and data aggregated from third-party databases. The copyrightable content curated by the Datanator Team is released under the Creative Commons 1.0 Universal (CC0) [License](LICENSE-DATA). The content from third-party databases is available under the licenses summarized [here](LICENSE-THIRD-PATRY-DATA). 44 | 45 | The structure of the database is released under the CC0 [License](LICENSE-DATABASE-STRUCTURE). This software is released open-source under the MIT [License](LICENSE). 46 | 47 | ## Development team 48 | Datanator was developed by the [Karr Lab](https://www.karrlab.org) at the Icahn School of Medicine at Mount Sinai in New York, US. 49 | 50 | * Yosef Roth 51 | * Zhouyang Lian 52 | * Saahith Pochiraju 53 | * Balazs Szigeti 54 | * Jonathan Karr 55 | 56 | ## Questions and comments 57 | Please contact the [Karr Lab](https://www.karrlab.org) with any questions or comments. 
58 | -------------------------------------------------------------------------------- /datanator/data_source/protein_localization/parse_psortdb_experimental.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | import numpy as np 4 | 5 | class NpEncoder(json.JSONEncoder): 6 | def default(self, obj): 7 | """ 8 | Converts the dictionary's values into a JSON serializable data type 9 | 10 | """ 11 | if isinstance(obj, np.integer): 12 | return int(obj) 13 | elif isinstance(obj, np.floating): 14 | return float(obj) 15 | elif isinstance(obj, np.ndarray): 16 | return obj.tolist() 17 | else: 18 | return super(NpEncoder, self).default(obj) 19 | 20 | class ParsePsortExperimental: 21 | def __init__(self, max_entries): 22 | self.max_entries = max_entries 23 | 24 | def parse_psortdb(self): 25 | """ 26 | To parse database psortdb Experimental-PSORTdb-v4.00.tsv file 27 | and create JSON files conforming to datanator_pattern/observation_compiled.json 28 | 29 | Args: 30 | max_entries: int 31 | number of rows to parse. 
32 | A JSON file will be created for each of the tsv file's first rows 33 | 34 | Return: 35 | () 36 | """ 37 | data=pd.read_csv('Experimental-PSORTdb-v4.00.tsv',delimiter="\t") 38 | data = data.where(pd.notnull(data), None) 39 | for i in range(self.max_entries): 40 | d={} 41 | #entity 42 | d["entity"]={} 43 | d["entity"]["type"]="protein" 44 | d["entity"]["name"]=str(data.iloc[i,6]).replace(".","") 45 | if data.iloc[i,7] != None: 46 | d["entity"]["synonyms"]=str(data.iloc[i,7]).split(",") 47 | else: 48 | d["entity"]["synonyms"]=[] 49 | #identifiers 50 | d["entity"]["identifiers"]=[] 51 | uniprot={} 52 | uniprot["name_space"]="uniprot_id" 53 | uniprot["value"]=data.iloc[i,0] 54 | ref_seq = {} 55 | ref_seq["name_space"]="Refseq_Accession" 56 | ref_seq["value"]=data.iloc[i,1] 57 | other_accession = {} 58 | other_accession["name_space"]="Other_Accession" 59 | other_accession["value"]=data.iloc[i,2] 60 | d["entity"]["identifiers"].append(uniprot) 61 | d["entity"]["identifiers"].append(ref_seq) 62 | d["entity"]["identifiers"].append(other_accession) 63 | 64 | #localizations 65 | d["value"]={} 66 | if data.iloc[i,3] != None: 67 | d["value"]["experimental_localization"] = str(data.iloc[i,3]).split(",") 68 | else: 69 | d["value"]["experimental_localization"] = [] 70 | if data.iloc[i,4] != None: 71 | d["value"]["secondary_localizaton"] = str(data.iloc[i,4]).split(",") 72 | else: 73 | d["value"]["secondary_localizaton"] = [] 74 | 75 | #genotype 76 | d["genotype"]={} 77 | d["genotype"]["taxon"]={} 78 | d["genotype"]["taxon"]["ncbi_taxonomy_id"]=data.iloc[i,9] 79 | d["genotype"]["taxon"]["name"]=data.iloc[i,10] 80 | 81 | #environment 82 | d["environment"]={} 83 | d["environment"]["GramStain"]=data.iloc[i,13] 84 | 85 | #source 86 | d["source"]={} 87 | d["source"]["namespace"]="ePSORTdb" 88 | d["source"]["value"]="Version "+str(data.iloc[i,17]) 89 | 90 | #name is the JSON file's name 91 | if (data.iloc[i,0]!=None): 92 | name = data.iloc[i,0] #SwissProt_ID 93 | else: 94 | name = 
data.iloc[i,2] #Other_Accession 95 | with open("Experimental_PSortdb/"+name+".json","w+") as f: 96 | json.dump(d,f,cls=NpEncoder,indent=4) 97 | 98 | p1=ParsePsortExperimental(10) 99 | p1.parse_psortdb() 100 | -------------------------------------------------------------------------------- /datanator/schema_2/migrate_metabolites_meta.py: -------------------------------------------------------------------------------- 1 | from datanator_query_python.config import motor_client_manager, config 2 | from datanator.util import calc_tanimoto 3 | import simplejson as json 4 | import asyncio 5 | from pymongo import UpdateOne 6 | from pymongo.errors import BulkWriteError 7 | from pprint import pprint 8 | import os 9 | 10 | 11 | class MigrateMM: 12 | 13 | def __init__(self, collection="metabolites_meta", to_database="datanator-test", 14 | from_database="datanator", max_entries=float("inf")): 15 | self.collection = collection 16 | self.from_database = from_database 17 | self.to_database = to_database 18 | self.from_collection = motor_client_manager.client.get_database(from_database)[collection] 19 | self.to_collection = motor_client_manager.client.get_database(to_database)[collection] 20 | self.max_entries = max_entries 21 | self.calc_tanimoto = calc_tanimoto.CalcTanimoto(MongoDB=config.Config.SERVER, 22 | password=os.getenv("{}_PASSWORD".format(motor_client_manager.where)), 23 | username=os.getenv(motor_client_manager.where), 24 | db=from_database) 25 | 26 | async def index_primary(self, _key, background=True): 27 | """Index key (single key ascending) 28 | 29 | Args: 30 | _key(:obj:`str`): Name of key to be indexed 31 | """ 32 | await self.to_collection.create_index(_key, background=background) 33 | 34 | async def process_cursor(self, skip=0): 35 | """Transform data and move to new database 36 | 37 | Args: 38 | docs(:obj:`pymongo.Cursor`): documents to be processed 39 | """ 40 | bulk_write = [] 41 | query = {} 42 | if self.max_entries == float('inf'): 43 | limit = 0 44 | else: 
45 | limit = self.max_entries 46 | docs = self.from_collection.find(filter=query, projection={'_id': 0}, 47 | no_cursor_timeout=True, batch_size=10, 48 | skip=skip, limit=limit) 49 | i = 0 50 | async for doc in docs: 51 | i += 1 52 | if i == self.max_entries: 53 | break 54 | if i != 0 and i % 50 == 0: 55 | print("Processing file {}".format(i + skip)) 56 | try: 57 | self.to_collection.bulk_write(bulk_write) 58 | bulk_write = [] 59 | except BulkWriteError as bwe: 60 | pprint(bwe.details) 61 | bulk_write = [] 62 | similar_compound = list(doc.get("similar_compounds")[0].keys())[0] 63 | if len(similar_compound) > 30: #sha256 string 64 | doc["similar_compounds"] = [] 65 | inchi = doc.get("inchi") 66 | sorted_coeff, sorted_inchi = self.calc_tanimoto.one_to_many(inchi) 67 | for num, inc in zip(sorted_coeff, sorted_inchi): 68 | doc["similar_compounds"].append({"inchikey": inc, "similarity_score": num}) 69 | else: 70 | similar_compounds = doc.get("similar_compounds") 71 | doc["similar_compounds"] = [] 72 | for item in similar_compounds: 73 | doc["similar_compounds"].append({"inchikey": list(item.keys())[0], "similarity_score": list(item.values())[0]}) 74 | doc["schema_version"] = "2" 75 | bulk_write.append(UpdateOne({'InChI_Key': doc.get("InChI_Key")}, {'$set': json.loads(json.dumps(doc, ignore_nan=True))}, upsert=True)) 76 | if len(bulk_write) != 0: 77 | try: 78 | self.to_collection.bulk_write(bulk_write) 79 | except BulkWriteError as bwe: 80 | pprint(bwe.details) 81 | finally: 82 | print("Done.") 83 | 84 | 85 | def main(): 86 | loop = asyncio.get_event_loop() 87 | src = MigrateMM() 88 | src.index_primary('InChI_Key') 89 | loop.run_until_complete(src.process_cursor(skip=4791)) 90 | 91 | if __name__ == '__main__': 92 | main() 93 | 94 | -------------------------------------------------------------------------------- /tests/data_source/test_protein_aggregate.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from 
datanator.data_source import protein_aggregate 3 | import tempfile 4 | import shutil 5 | import json 6 | import os 7 | import datanator.config.core 8 | 9 | 10 | class TestProteinAggregate(unittest.TestCase): 11 | 12 | @classmethod 13 | def setUpClass(cls): 14 | cls.cache_dirname = tempfile.mkdtemp() 15 | cache_dir = os.path.join(cls.cache_dirname, 'logs.txt') 16 | src_db = 'datanator' 17 | des_db = 'test' 18 | cls.collection_str = 'test_protein_aggregate' 19 | username = datanator.config.core.get_config()[ 20 | 'datanator']['mongodb']['user'] 21 | password = datanator.config.core.get_config( 22 | )['datanator']['mongodb']['password'] 23 | server = datanator.config.core.get_config( 24 | )['datanator']['mongodb']['server'] 25 | port = datanator.config.core.get_config( 26 | )['datanator']['mongodb']['port'] 27 | cls.src = protein_aggregate.ProteinAggregate(username=username, password=password, server=server, 28 | authSource='admin', src_database=src_db, max_entries=20, 29 | verbose=True, collection=cls.collection_str, destination_database=des_db, 30 | cache_dir=cache_dir) 31 | 32 | @classmethod 33 | def tearDownClass(cls): 34 | shutil.rmtree(cls.cache_dirname) 35 | # cls.src.db.drop_collection(cls.collection_str) 36 | cls.src.client.close() 37 | 38 | # # @unittest.skip('passed') 39 | # def test_load_abundance_from_pax(self): 40 | # self.src.load_abundance_from_pax() 41 | # doc = self.src.col.find_one(filter={'uniprot_id': 'Q72DI0'}) 42 | # self.assertTrue('abundances' in doc.keys()) 43 | # self.assertTrue('ncbi_taxonomy_id' in doc.keys()) 44 | 45 | # # @unittest.skip('passed') 46 | # def test_load_ko(self): 47 | # self.src.col.insert_one({'uniprot_id': 'a_mock_value', 48 | # 'gene_name': 'gdh'}) #insert a mock document 49 | # self.src.load_ko() 50 | # doc = self.src.col.find_one(filter={'uniprot_id': 'a_mock_value'}) 51 | # self.assertTrue('ko_number' in doc.keys()) 52 | 53 | # # @unittest.skip('passed') 54 | # def test_load_taxon(self): 55 | # 
self.src.col.insert_one({'ncbi_taxonomy_id': 9606, 56 | # 'uniprot_id': 'taxon_mock_value'}) 57 | # self.src.load_taxon() 58 | # doc = self.src.col.find_one(filter={'uniprot_id': 'taxon_mock_value'}) 59 | # self.assertTrue('ancestor_name' in doc.keys()) 60 | 61 | # def test_load_unreviewed_abundance(self): 62 | # dic_0 = {'observation': [{'protein_id': {'string_id': 'string_mock_0', 'uniprot_id': 'id_mock_0'}, 63 | # 'string_id': 'string_mock_0', 'abundance': 0 }], 'ncbi_id': 0, 'species_name': 'name_mock_0', 'organ': 'organ_0'} 64 | # dic_1 = {'observation': [{'protein_id': {'string_id': 'string_mock_1', 'uniprot_id': 'id_mock_1'}, 65 | # 'string_id': 'string_mock_1', 'abundance': 1 }], 'ncbi_id': 1, 'species_name': 'name_mock_1', 'organ': 'organ_1'} 66 | # dic_2 = {'uniprot_id': 'id_mock_0', 'abundances': []} 67 | # dic_3 = {'uniprot_id': 'Q72DIO'} 68 | # self.src.col.insert_many([dic_2, dic_3]) 69 | # self.src.load_unreviewed_abundance() 70 | # doc = self.src.col.find_one(filter={'species_name': 'D.vulgaris'}) 71 | # self.assertEqual(doc['ncbi_taxonomy_id'], 882) 72 | 73 | @unittest.skip('removed the function') 74 | def test_loadload_kinlaw_from_sabio(self): 75 | dic_0 = {'uniprot_id': 'P20932'} 76 | dic_1 = {'uniprot_id': 'id_mock_1', 'protein_name': 'subtilisin'} 77 | dic_2 = {'uniprot_id': 'P16064', 'protein_name': 'subtilisin'} 78 | self.src.col.insert_many([dic_0, dic_1, dic_2]) 79 | self.src.load_kinlaw_from_sabio() 80 | result_0 = self.src.col.find_one({'uniprot_id': 'P20932'}) 81 | self.assertTrue('kinetics' in list(result_0.keys())) 82 | self.assertTrue({'kinlaw_id': 17, 'ncbi_taxonomy_id': 303} in result_0['kinetics']) 83 | result_1 = self.src.col.find_one({'uniprot_id': 'P16064'}) 84 | self.assertTrue({'kinlaw_id': 1, 'ncbi_taxonomy_id': 1467} in result_1['kinetics']) 85 | 86 | --------------------------------------------------------------------------------