├── tests ├── __init__.py ├── util │ ├── __init__.py │ ├── pytest.ini │ ├── test_warning_util.py │ ├── test_chem_util.py │ ├── test_index_collection.py │ ├── test_mongo_util.py │ ├── test_reaction_util.py │ ├── test_rna_seq_util.py │ ├── test_calc_tanimoto.py │ └── test_rna_halflife_util.py ├── data_source │ ├── __init__.py │ ├── pytest.ini │ ├── metabolite_concentration │ │ ├── test_query_demo.py │ │ └── test_metabolite_concentration.py │ ├── test_uniprot_nosql.py │ ├── rna_halflife │ │ ├── test_doi_10_1038_srep01318.py │ │ ├── test_doi_10_1371_journal_pone_0059059.py │ │ ├── test_order_by_ko.py │ │ ├── test_doi_10_1093_nar_gks1019.py │ │ ├── test_doi_10_1186_gb_2012_13_4_r30.py │ │ ├── test_doi_10_1093_nar_gkt1150.py │ │ ├── test_doi_10_1101_gr_131037_111.py │ │ └── test_doi_10_1186_s12864_016_3219_8.py │ ├── test_pax_nosql.py │ ├── test_kegg_org_code.py │ ├── test_gene_ortholog.py │ ├── test_metabolites_meta_collection.py │ ├── test_ec.py │ ├── brenda │ │ └── test_reaction.py │ ├── test_intact_nosql.py │ ├── test_sabio_reaction.py │ ├── test_taxon_tree.py │ ├── test_metabolite_nosql.py │ └── test_protein_aggregate.py ├── core │ └── pytest.ini ├── requirements.txt ├── fixtures │ ├── ump_kinase.xlsx │ ├── five_reactions.xlsx │ ├── twenty_reactions.xlsx │ └── Mycoplasma_pneumoniae.xlsx └── elasticsearch_kl │ └── test_batch_load.py ├── docs ├── references.bib ├── brenda │ └── Reactions_BKMS.tar.gz ├── metabolite_concentration │ ├── mmc2.xlsx │ ├── mmc3.xlsx │ ├── compounds.tsv.gz │ ├── aaf2786-Hackett-SM-table-S9.xls │ ├── 41589_2016_BFnchembio2077_MOESM585_ESM.xlsx │ └── 41589_2016_BFnchembio2077_MOESM586_ESM.xlsx ├── references.rst ├── apm-server.yml ├── requirements.txt ├── filebeat.docker.yml ├── requirements.rtd.txt ├── metabolites │ ├── F6P.json │ ├── FDP.json │ ├── PYR.json │ ├── PEP.json │ ├── ACCOATAXNEW.json │ ├── DHAP.json │ ├── G6P.json │ ├── R5P.json │ ├── S7P.json │ ├── 6PG.json │ └── GAP.json ├── metricbeat.docker.yml ├── deployment.rst ├── 
protein_localization │ ├── Experimental_v4.00_PSortdb │ │ ├── P50307.json │ │ ├── P01553.json │ │ ├── P34071.json │ │ ├── P06886.json │ │ ├── P10335.json │ │ ├── P01552.json │ │ ├── P09978.json │ │ ├── P81177.json │ │ ├── P00644.json │ │ └── P45723.json │ ├── NP_219511.1_ D.json │ ├── NP_219504.1_ a.json │ └── Gram_Negative_WO_Outer_Membrane │ │ ├── WP_012241978.1.json │ │ ├── WP_012242024.1.json │ │ ├── WP_041633705.1.json │ │ ├── WP_041633707.1.json │ │ ├── WP_081423625.1.json │ │ ├── WP_012242018.1.json │ │ ├── WP_012242006.1.json │ │ ├── WP_012242014.1.json │ │ ├── WP_012242027.1.json │ │ └── WP_012242037.1.json ├── about.rst ├── index.rst └── installation.rst ├── datanator ├── core │ └── __init__.py ├── data_source │ ├── __init__.py │ ├── builds │ │ ├── __init__.py │ │ ├── full.py │ │ ├── test.py │ │ ├── test_log.py │ │ └── med.py │ ├── array_express_tools │ │ ├── __init__.py │ │ └── taxon_exceptions.txt │ ├── process_rna_seq │ │ ├── __init__.py │ │ └── download_cdna.py │ ├── user_data │ │ ├── InputTemplate.xlsx │ │ └── RNA-Seq_Experiment_Template │ │ │ ├── RNA-SeqMetadataTemplate.xlsx │ │ │ └── samples │ │ │ └── ProcessedRNA-SeqTemplate.xlsx │ ├── brenda │ │ └── kinetic_constants.py │ ├── rna_halflife │ │ └── back_fill_gene_name.py │ ├── protein_localization │ │ ├── database_demo.py │ │ ├── justin_parseGramPositiveJSONSchema.py │ │ ├── experimental.py │ │ ├── parse_psortdb_negative_wo_outer_membrane.py │ │ └── parse_psortdb_experimental.py │ ├── sqlite_to_json.py │ ├── metabolite_concentration │ │ └── query_demo.py │ ├── sabio_compound.py │ └── protein_modification │ │ └── 10_1093_nar_gkw1075.py ├── schema_2 │ ├── __init__.py │ ├── migrate_ec.py │ ├── migrate_corum.py │ ├── migrate_metabolite_concentration.py │ └── migrate_metabolites_meta.py ├── _version.py ├── __init__.py ├── config │ ├── __init__.py │ ├── core.default.cfg │ ├── core.schema.cfg │ └── core.py ├── util │ ├── __init__.py │ ├── warning_util.py │ ├── constants.py │ ├── base26.py │ ├── 
build_util.py │ ├── reaction_util.py │ └── mongo_util.py └── parse_metabolite_concentration.py ├── pytest.ini ├── scripts ├── docker-machine ├── mongorestore.sh ├── mongorestore_aws.sh ├── mongodump.sh └── quilt_backup.py ├── setup.cfg ├── docker_builds ├── Flaskoffline ├── Karrlabdatanator └── Mongocurl ├── .karr_lab_build_utils.yml ├── requirements.txt ├── MANIFEST.in ├── .circleci └── requirements.txt ├── LICENSE ├── docker-compose.yml ├── CONTRIBUTING.md ├── setup.py ├── .gitignore ├── LICENSE-THIRD-PARTY-DATA └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/references.bib: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/util/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datanator/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datanator/data_source/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datanator/schema_2/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/data_source/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datanator/data_source/builds/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /datanator/_version.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.2' 2 | -------------------------------------------------------------------------------- /tests/core/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = -p no:warnings 3 | -------------------------------------------------------------------------------- /tests/util/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = -p no:warnings 3 | -------------------------------------------------------------------------------- /tests/data_source/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = -p no:warnings 3 | -------------------------------------------------------------------------------- /datanator/__init__.py: -------------------------------------------------------------------------------- 1 | from ._version import __version__ 2 | # :obj:`str`: version 3 | -------------------------------------------------------------------------------- /datanator/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import get_config, get_debug_logs_config 2 | -------------------------------------------------------------------------------- /datanator/data_source/array_express_tools/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import ensembl_tools 2 | -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | attrdict 2 | capturer 3 | flask_testing 4 | ftputil 5 | mock 6 | scipy 7 | -------------------------------------------------------------------------------- /datanator/config/core.default.cfg: -------------------------------------------------------------------------------- 1 | [datanator] 2 | [[bioportal]] 3 | [[quilt]] 4 | [[mongodb]] -------------------------------------------------------------------------------- /tests/fixtures/ump_kinase.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KarrLab/datanator/HEAD/tests/fixtures/ump_kinase.xlsx -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | ignore:.*inspect.getargspec.* is deprecated:DeprecationWarning 4 | -------------------------------------------------------------------------------- /docs/brenda/Reactions_BKMS.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KarrLab/datanator/HEAD/docs/brenda/Reactions_BKMS.tar.gz -------------------------------------------------------------------------------- /tests/fixtures/five_reactions.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KarrLab/datanator/HEAD/tests/fixtures/five_reactions.xlsx -------------------------------------------------------------------------------- /tests/fixtures/twenty_reactions.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KarrLab/datanator/HEAD/tests/fixtures/twenty_reactions.xlsx 
-------------------------------------------------------------------------------- /datanator/data_source/process_rna_seq/__init__.py: -------------------------------------------------------------------------------- 1 | from .core import get_processed_data_samples 2 | from . import download_cdna 3 | -------------------------------------------------------------------------------- /docs/metabolite_concentration/mmc2.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KarrLab/datanator/HEAD/docs/metabolite_concentration/mmc2.xlsx -------------------------------------------------------------------------------- /docs/metabolite_concentration/mmc3.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KarrLab/datanator/HEAD/docs/metabolite_concentration/mmc3.xlsx -------------------------------------------------------------------------------- /docs/references.rst: -------------------------------------------------------------------------------- 1 | References 2 | ========== 3 | 4 | .. 
bibliography:: references.bib 5 | :encoding: latin 6 | :style: unsrt -------------------------------------------------------------------------------- /tests/fixtures/Mycoplasma_pneumoniae.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KarrLab/datanator/HEAD/tests/fixtures/Mycoplasma_pneumoniae.xlsx -------------------------------------------------------------------------------- /docs/metabolite_concentration/compounds.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KarrLab/datanator/HEAD/docs/metabolite_concentration/compounds.tsv.gz -------------------------------------------------------------------------------- /datanator/data_source/user_data/InputTemplate.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KarrLab/datanator/HEAD/datanator/data_source/user_data/InputTemplate.xlsx -------------------------------------------------------------------------------- /scripts/docker-machine: -------------------------------------------------------------------------------- 1 | docker-machine create --driver amazonec2 --amazonec2-instance-type m5d.large --amazonec2-open-port 27017 --amazonec2-monitoring datanator-ec2 -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal = 1 3 | 4 | [coverage:run] 5 | source = 6 | datanator 7 | 8 | [sphinx-apidocs] 9 | packages = 10 | datanator 11 | -------------------------------------------------------------------------------- /docs/metabolite_concentration/aaf2786-Hackett-SM-table-S9.xls: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/KarrLab/datanator/HEAD/docs/metabolite_concentration/aaf2786-Hackett-SM-table-S9.xls -------------------------------------------------------------------------------- /docker_builds/Flaskoffline: -------------------------------------------------------------------------------- 1 | FROM lzy7071/karrlabdatanator:latest 2 | 3 | COPY . /home 4 | WORKDIR /home 5 | 6 | ENTRYPOINT ["python3"] 7 | CMD ["/home/datanator/datanator/rest/__init__.py"] 8 | -------------------------------------------------------------------------------- /docker_builds/Karrlabdatanator: -------------------------------------------------------------------------------- 1 | FROM lzy7071/karrlabdatanator_dependencies:latest 2 | 3 | RUN python3 -m pip install git+https://github.com/KarrLab/datanator.git 4 | 5 | WORKDIR /root 6 | CMD bash 7 | -------------------------------------------------------------------------------- /docs/metabolite_concentration/41589_2016_BFnchembio2077_MOESM585_ESM.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KarrLab/datanator/HEAD/docs/metabolite_concentration/41589_2016_BFnchembio2077_MOESM585_ESM.xlsx -------------------------------------------------------------------------------- /docs/metabolite_concentration/41589_2016_BFnchembio2077_MOESM586_ESM.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KarrLab/datanator/HEAD/docs/metabolite_concentration/41589_2016_BFnchembio2077_MOESM586_ESM.xlsx -------------------------------------------------------------------------------- /datanator/data_source/user_data/RNA-Seq_Experiment_Template/RNA-SeqMetadataTemplate.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KarrLab/datanator/HEAD/datanator/data_source/user_data/RNA-Seq_Experiment_Template/RNA-SeqMetadataTemplate.xlsx 
-------------------------------------------------------------------------------- /datanator/util/__init__.py: -------------------------------------------------------------------------------- 1 | from . import molecule_util 2 | from . import rna_seq_util 3 | from . import taxonomy_util 4 | from . import warning_util 5 | from . import mongo_util 6 | from . import file_util 7 | from . import chem_util 8 | -------------------------------------------------------------------------------- /docker_builds/Mongocurl: -------------------------------------------------------------------------------- 1 | FROM mongo:4.0.10 2 | 3 | RUN apt-get update -y \ 4 | && apt-get install -y --no-install-recommends \ 5 | curl \ 6 | wget \ 7 | && rm -rf /var/lib/apt/lists/* 8 | 9 | WORKDIR /root 10 | CMD bash 11 | -------------------------------------------------------------------------------- /datanator/data_source/user_data/RNA-Seq_Experiment_Template/samples/ProcessedRNA-SeqTemplate.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KarrLab/datanator/HEAD/datanator/data_source/user_data/RNA-Seq_Experiment_Template/samples/ProcessedRNA-SeqTemplate.xlsx -------------------------------------------------------------------------------- /docs/apm-server.yml: -------------------------------------------------------------------------------- 1 | apm-server: 2 | # Defines the host and port the server is listening on. use "unix:/path/to.sock" to listen on a unix domain socket. 
3 | host: "apm-server:8200" 4 | 5 | output.elasticsearch: 6 | hosts: ["elasticsearch:9200"] -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx >= 1.8 2 | sphinx_fontawesome 3 | sphinx_rtd_theme >= 0.4.2 4 | sphinxcontrib_addmetahtml >= 0.1.1 5 | sphinxcontrib_bibtex 6 | sphinxcontrib_googleanalytics >= 0.1.1 7 | sphinxcontrib_spelling 8 | sphinxprettysearchresults 9 | -------------------------------------------------------------------------------- /scripts/mongorestore.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | FILE=/root/karr_lab/datanator.archive 3 | if [ ! -f "$FILE" ]; then 4 | curl -o $FILE https://mongo-dbdump.s3.amazonaws.com/datanator.20190701.archive 5 | fi 6 | mongorestore -d datanator --host mongo:27017 --archive=$FILE -------------------------------------------------------------------------------- /.karr_lab_build_utils.yml: -------------------------------------------------------------------------------- 1 | downstream_dependencies: 2 | - h1_hesc 3 | - wc_cli 4 | email_notifications: 5 | - jonrkarr@gmail.com 6 | - yosefdroth@gmail.com 7 | static_analyses: 8 | ignore_unused_requirements: 9 | - python_libsbml 10 | ignore_missing_requirements: 11 | - python-libsbml-experimental 12 | - pymongo 13 | -------------------------------------------------------------------------------- /docs/filebeat.docker.yml: -------------------------------------------------------------------------------- 1 | filebeat.config: 2 | modules: 3 | path: ${path.config}/modules.d/*.yml 4 | reload.enabled: false 5 | filebeat.autodiscover: 6 | providers: 7 | - type: docker 8 | hints.enabled: true 9 | 10 | output.elasticsearch: 11 | hosts: ["elasticsearch:9200"] 12 | setup.kibana: 13 | host: "kibana:5601" 14 | 
-------------------------------------------------------------------------------- /docs/requirements.rtd.txt: -------------------------------------------------------------------------------- 1 | sphinx >= 1.8 2 | sphinx_fontawesome 3 | sphinx_rtd_theme >= 0.4.2 4 | sphinxcontrib_addmetahtml >= 0.1.1 5 | sphinxcontrib_bibtex 6 | sphinxcontrib_googleanalytics @ git+https://github.com/karrlab/sphinxcontrib-googleanalytics.git#egg=sphinxcontrib_googleanalytics-0.1.1 7 | sphinxcontrib_spelling 8 | sphinxprettysearchresults 9 | -------------------------------------------------------------------------------- /scripts/mongorestore_aws.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | FILE=~/karr_lab/datanator-dump 3 | source <(grep = ~/.wc/datanator.cfg | tr -d ' ') 4 | if [ ! -d "$FILE" ]; then 5 | mkdir $FILE 6 | aws s3 cp https://mongo-dbdump.s3.amazonaws.com/datanator $FILE --recursive 7 | fi 8 | mongorestore -d datanator -u $user -p $password --authenticationDatabase admin "$FILE/datanator" -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4 2 | biopython 3 | bioservices >= 1.5.0 4 | bpforms 5 | cement >= 3.0.0 6 | configobj 7 | datanator_query_python >= 0.6.31 8 | ete3 9 | flask_migrate 10 | genson 11 | karr_lab_aws_manager >= 0.0.21 12 | numpy 13 | obj_tables[bio] 14 | openbabel 15 | pandas >= 1.0.1 16 | pint >= 0.10 17 | pubchempy 18 | python_libsbml 19 | requests 20 | requests_cache 21 | setuptools 22 | simplejson 23 | sqlalchemy 24 | sqlalchemy_utils 25 | tabula_py 26 | wc_utils 27 | xmltodict 28 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # description 2 | include README.rst 3 | 4 | # license 5 | include LICENSE 6 | 7 | # 
database license 8 | include LICNESE-DATA 9 | include LICENSE-DATABASE-STRUCTURE 10 | include LICENSE-THIRD-PARTRY-DATA 11 | 12 | # requirements 13 | include requirements.txt 14 | 15 | # configuration 16 | recursive-include datanator/config *.cfg 17 | 18 | # data source configurations 19 | recursive-include datanator/data_source *.txt 20 | 21 | # data files 22 | recursive-include datanator/data *.txt *.xlsx 23 | -------------------------------------------------------------------------------- /.circleci/requirements.txt: -------------------------------------------------------------------------------- 1 | # for quilt3 compatibility 2 | urllib3 < 1.25 3 | 4 | # Karr Lab 5 | git+https://github.com/KarrLab/datanator_swagger_ui_bundle.git#egg=swagger_ui_bundle 6 | git+https://github.com/KarrLab/wc_utils.git#egg=wc_utils 7 | git+https://github.com/KarrLab/bpforms.git#egg=bpforms 8 | git+https://github.com/KarrLab/bcforms.git#egg=bcforms 9 | git+https://github.com/KarrLab/obj_tables.git#egg=obj_tables[all] 10 | git+https://github.com/KarrLab/datanator_query_python.git#egg=datanator_query_python 11 | -------------------------------------------------------------------------------- /scripts/mongodump.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | MONGO_DATABASE="datanator" 4 | APP_NAME="datanator" 5 | source <(grep = ~/.wc/datanator.cfg | tr -d ' ') 6 | 7 | MONGO_HOST="localhost" 8 | TIMESTAMP=`date +%F-%H%M` 9 | MONGODUMP_PATH="/usr/bin/mongodump" 10 | BACKUPS_DIR="/data/mongodump" 11 | mkdir -p $BACKUPS_DIR 12 | BACKUP_NAME="$BACKUPS_DIR/$APP_NAME" 13 | 14 | $MONGODUMP_PATH -d $MONGO_DATABASE -u $user -p $password --authenticationDatabase admin -o $BACKUP_NAME 15 | aws s3 cp $BACKUP_NAME s3://mongo-dbdump/ --recursive 16 | rm -rf $BACKUPS_DIR -------------------------------------------------------------------------------- /docs/metabolites/F6P.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "type":"metabolite", 3 | "name":"Frutose 6-Phosphate", 4 | "synonyms":[ 5 | "F6P", 6 | "6-O-Phosphono-D-fructose", 7 | "D-Fructose 6-phosphoric acid" 8 | ], 9 | "identifiers":[ 10 | {"namespace":"doi"}, 11 | {"value":"10.1074/jbc.M109.095570"} 12 | ], 13 | "taxon":{ 14 | "ncbi_taxonomy_id":562, 15 | "name":"Escherichia coli str. K-12 substr. MG1655", 16 | "canon_ancestors":[ 17 | {"ncbi_taxonomy_id": "", 18 | "name": ""} 19 | ] 20 | } 21 | } -------------------------------------------------------------------------------- /docs/metabolites/FDP.json: -------------------------------------------------------------------------------- 1 | { 2 | "type":"metabolite", 3 | "name":"Fluorescein diphosphate", 4 | "synonyms":[ 5 | "FDP", 6 | "3-Oxo-3H-spiro[2-benzofuran-1,9'-xanthene]-3',6'-diyl bis(phosphate)" 7 | ], 8 | "identifiers":[ 9 | {"namespace":"doi"}, 10 | {"value":"10.1074/jbc.M109.095570"} 11 | ], 12 | "taxon":{ 13 | "ncbi_taxonomy_id":562, 14 | "name":"Escherichia coli str. K-12 substr. MG1655", 15 | "canon_ancestors":[ 16 | {"ncbi_taxonomy_id": "", 17 | "name": ""} 18 | ] 19 | } 20 | } -------------------------------------------------------------------------------- /docs/metabolites/PYR.json: -------------------------------------------------------------------------------- 1 | { 2 | "type":"metabolite", 3 | "name":"Pyruvate", 4 | "synonyms":[ 5 | "PYR", 6 | "2-Oxopropanoate", 7 | "Methylglyoxylate", 8 | "57-60-3", 9 | "UNII-HO43T60JMG" 10 | ], 11 | "identifiers":[ 12 | {"namespace":"doi"}, 13 | {"value":"10.1074/jbc.M109.095570"} 14 | ], 15 | "taxon":{ 16 | "ncbi_taxonomy_id":562, 17 | "name":"Escherichia coli str. K-12 substr. 
MG1655", 18 | "canon_ancestors":[ 19 | {"ncbi_taxonomy_id": "", 20 | "name": ""} 21 | ] 22 | } 23 | } -------------------------------------------------------------------------------- /docs/metabolites/PEP.json: -------------------------------------------------------------------------------- 1 | { 2 | "type":"metabolite", 3 | "name":"Phosphoenolpyruvate", 4 | "synonyms":[ 5 | "PEP", 6 | "Phosphoenolpyruvic acid", 7 | "2-(phosphonooxy)prop-2-enoic acid", 8 | "138-08-9" 9 | ], 10 | "identifiers":[ 11 | {"namespace":"doi"}, 12 | {"value":"10.1074/jbc.M109.095570"} 13 | ], 14 | "taxon":{ 15 | "ncbi_taxonomy_id":562, 16 | "name":"Escherichia coli str. K-12 substr. MG1655", 17 | "canon_ancestors":[ 18 | {"ncbi_taxonomy_id": "", 19 | "name": ""} 20 | ] 21 | } 22 | } -------------------------------------------------------------------------------- /datanator/data_source/builds/full.py: -------------------------------------------------------------------------------- 1 | from datanator.core import common_schema 2 | import datetime 3 | import pkg_resources 4 | import sys 5 | 6 | 7 | def build(): 8 | old_stdout = sys.stdout 9 | log_filename = pkg_resources.resource_filename( 10 | 'datanator', "builds/logs/{}.txt".format(str(datetime.datetime.now()))) 11 | with open(log_filename, "w") as log_file: 12 | sys.stdout = log_file 13 | cs = common_schema.CommonSchema(load_content=True, verbose=True, 14 | load_entire_small_dbs=True) 15 | cs.upload_backup() 16 | sys.stdout = old_stdout 17 | -------------------------------------------------------------------------------- /docs/metabolites/ACCOATAXNEW.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | 4 | "type": "metabolite", 5 | 6 | "name": "acetyl-CoA", 7 | 8 | "identifiers": [ 9 | 10 | {"namespace": "inchikey", 11 | "value": "SLZBFCDCINBPY-ZSJPKINUSA-N"} 12 | 13 | ] 14 | 15 | }, 16 | 17 | "value": [ 18 | 19 | {"type": "metabolite_concentration", "value": "0.26", "units": 
"μmol/g DCW", 20 | 21 | "substrate": "NOX01"}, 22 | 23 | {"type": "metabolite_concentration", "value": "0.29", "units": "μmol/g DCW", 24 | 25 | "substrate": "NOX02"} 26 | 27 | ] 28 | } 29 | -------------------------------------------------------------------------------- /docs/metabolites/DHAP.json: -------------------------------------------------------------------------------- 1 | { 2 | "type":"metabolite", 3 | "name":"Dihydroxyacetone Phosphate", 4 | "synonyms":[ 5 | "DHAP", 6 | "1-hydroxy-3-(phosphonooxy)-2-propanone", 7 | "3-Hydroxy-2-oxopropyl dihydrogen phosphate" 8 | ], 9 | "identifiers":[ 10 | {"namespace":"doi"}, 11 | {"value":"10.1074/jbc.M109.095570"} 12 | ], 13 | "taxon":{ 14 | "ncbi_taxonomy_id":562, 15 | "name":"Escherichia coli str. K-12 substr. MG1655", 16 | "canon_ancestors":[ 17 | {"ncbi_taxonomy_id": "", 18 | "name": ""} 19 | ] 20 | } 21 | } -------------------------------------------------------------------------------- /scripts/quilt_backup.py: -------------------------------------------------------------------------------- 1 | import wc_utils.quilt 2 | 3 | 4 | def main(): 5 | '''Backup or download data from/to Quilt 6 | ''' 7 | path = input("BSON file location:\n") 8 | package = 'datanator' 9 | manager = wc_utils.quilt.QuiltManager(path=path, package=package) 10 | backup = input("Backup or Download (choose 'backup' or 'download')?\n") 11 | if backup.lower() == 'backup': 12 | message = input("Optionally, enter a commit message:\n") 13 | manager.upload_package(message=message or None) 14 | else: 15 | manager.download_package() 16 | 17 | 18 | if __name__ == '__main__': 19 | main() 20 | -------------------------------------------------------------------------------- /datanator/data_source/builds/test.py: -------------------------------------------------------------------------------- 1 | from datanator.core import common_schema 2 | import datetime 3 | import pkg_resources 4 | import sys 5 | 6 | 7 | def build(): 8 | old_stdout = sys.stdout 9 | 
log_filename = pkg_resources.resource_filename( 10 | 'datanator', "builds/logs/{}.txt".format(str(datetime.datetime.now()))) 11 | with open(log_filename, "w") as log_file: 12 | sys.stdout = log_file 13 | cs = common_schema.CommonSchema(load_content=True, clear_content=True, 14 | verbose=True, test=True, max_entries=10) 15 | # cs.upload_backup() 16 | sys.stdout = old_stdout 17 | -------------------------------------------------------------------------------- /docs/metabolites/G6P.json: -------------------------------------------------------------------------------- 1 | { 2 | "type":"metabolite", 3 | "name":"Glucose 6-Phosphate", 4 | "synonyms":[ 5 | "G6P", 6 | "6-O-Phosphono-α-D-glucopyranose", 7 | "D-Glucopyranose 6-phosphate", 8 | "D-glucose 6-(dihydrogen phosphate)" 9 | ], 10 | "identifiers":[ 11 | {"namespace":"doi"}, 12 | {"value":"10.1074/jbc.M109.095570"} 13 | ], 14 | "taxon":{ 15 | "ncbi_taxonomy_id":562, 16 | "name":"Escherichia coli str. K-12 substr. MG1655", 17 | "canon_ancestors":[ 18 | {"ncbi_taxonomy_id": "", 19 | "name": ""} 20 | ] 21 | } 22 | } -------------------------------------------------------------------------------- /datanator/data_source/builds/test_log.py: -------------------------------------------------------------------------------- 1 | from datanator.core import common_schema 2 | import datetime 3 | import pkg_resources 4 | import sys 5 | 6 | 7 | def build(): 8 | old_stdout = sys.stdout 9 | log_filename = pkg_resources.resource_filename( 10 | 'datanator', "builds/logs/{}.txt".format(str(datetime.datetime.now()))) 11 | with open(log_filename, "w") as log_file: 12 | sys.stdout = log_file 13 | # cs = common_schema.CommonSchema(load_content=True, clear_content=True, 14 | # verbose=True, test=True, max_entries=10) 15 | # cs.upload_backup() 16 | sys.stdout = old_stdout 17 | -------------------------------------------------------------------------------- /datanator/data_source/builds/med.py: 
-------------------------------------------------------------------------------- 1 | from datanator.core import common_schema 2 | import datetime 3 | import pkg_resources 4 | import sys 5 | 6 | 7 | def build(): 8 | old_stdout = sys.stdout 9 | log_filename = pkg_resources.resource_filename( 10 | 'datanator', "builds/logs/{}.txt".format(str(datetime.datetime.now()))) 11 | with open(log_filename, "w") as log_file: 12 | sys.stdout = log_file 13 | cs = common_schema.CommonSchema(load_content=True, clear_content=True, 14 | max_entries=20, load_entire_small_dbs=True, verbose=True) 15 | # cs.upload_backup() 16 | sys.stdout = old_stdout 17 | -------------------------------------------------------------------------------- /docs/metabolites/R5P.json: -------------------------------------------------------------------------------- 1 | { 2 | "type":"metabolite", 3 | "name":"Ribose 5-Phosphate", 4 | "synonyms":[ 5 | "R5P", 6 | "Ribose phosphate", 7 | "Ribose 5-monophosphate", 8 | "5-O-phosphono-D-ribose", 9 | "D-Ribose 5-(dihydrogen phosphate)" 10 | ], 11 | "identifiers":[ 12 | {"namespace":"doi"}, 13 | {"value":"10.1074/jbc.M109.095570"} 14 | ], 15 | "taxon":{ 16 | "ncbi_taxonomy_id":562, 17 | "name":"Escherichia coli str. K-12 substr. MG1655", 18 | "canon_ancestors":[ 19 | {"ncbi_taxonomy_id": "", 20 | "name": ""} 21 | ] 22 | } 23 | } -------------------------------------------------------------------------------- /docs/metabolites/S7P.json: -------------------------------------------------------------------------------- 1 | { 2 | "type":"metabolite", 3 | "name":"sedoheptulose-7-phosphate", 4 | "synonyms":[ 5 | "S7P", 6 | "7-(dihydrogen phosphate) sedoheptulose", 7 | "{[(2R,3S,4R,5S)-3,4,5,6-tetrahydroxy-6-(hydroxymethyl)oxan-2-yl]methoxy}phosphonic acid" 8 | ], 9 | "identifiers":[ 10 | {"namespace":"doi"}, 11 | {"value":"10.1074/jbc.M109.095570"} 12 | ], 13 | "taxon":{ 14 | "ncbi_taxonomy_id":562, 15 | "name":"Escherichia coli str. K-12 substr. 
MG1655", 16 | "canon_ancestors":[ 17 | {"ncbi_taxonomy_id": "", 18 | "name": ""} 19 | ] 20 | } 21 | } -------------------------------------------------------------------------------- /docs/metabolites/6PG.json: -------------------------------------------------------------------------------- 1 | { 2 | "type":"metabolite", 3 | "name":"6-Phosphogluconic acid", 4 | "synonyms":[ 5 | "6PG", 6 | "6-O-Phosphono-D-gluconic acid", 7 | "6-phosphogluconate", 8 | "6-phospho-D-gluconate", 9 | "D-gluconic acid 6-phosphate" 10 | ], 11 | "identifiers":[ 12 | {"namespace":"inchikey"}, 13 | {"value":"xxxxx"} 14 | ], 15 | "taxon":{ 16 | "ncbi_taxonomy_id":562, 17 | "name":"Escherichia coli str. K-12 substr. MG1655", 18 | "canon_ancestors":[ 19 | {"ncbi_taxonomy_id": "", 20 | "name": ""} 21 | ] 22 | } 23 | } -------------------------------------------------------------------------------- /docs/metricbeat.docker.yml: -------------------------------------------------------------------------------- 1 | metricbeat.config: 2 | modules: 3 | path: ${path.config}/modules.d/*.yml 4 | # Reload module configs as they change: 5 | reload.enabled: false 6 | 7 | metricbeat.autodiscover: 8 | providers: 9 | - type: docker 10 | hints.enabled: true 11 | 12 | metricbeat.modules: 13 | - module: docker 14 | metricsets: 15 | - "container" 16 | - "cpu" 17 | - "diskio" 18 | - "healthcheck" 19 | - "info" 20 | - "memory" 21 | - "network" 22 | hosts: ["unix:///var/run/docker.sock"] 23 | period: 10s 24 | enabled: true 25 | 26 | output.elasticsearch: 27 | hosts: ['elasticsearch:9200'] 28 | setup.kibana: 29 | host: "kibana:5601" 30 | -------------------------------------------------------------------------------- /docs/metabolites/GAP.json: -------------------------------------------------------------------------------- 1 | { 2 | "type":"metabolite", 3 | "name":"3-Phosphoglyceraldehyde", 4 | "synonyms":[ 5 | "GAP", 6 | "glyceraldehyde 3-phosphate", 7 | "2-hydroxy-3-(phosphonooxy)-Propanal", 8 | 
"2-Hydroxy-3-oxopropyl dihydrogen phosphate", 9 | "2-Hydroxy-3-oxopropyldihydrogenphosphat", 10 | "3-phosphoglyceraldehyde" 11 | ], 12 | "identifiers":[ 13 | {"namespace":"doi"}, 14 | {"value":"10.1074/jbc.M109.095570"} 15 | ], 16 | "taxon":{ 17 | "ncbi_taxonomy_id":562, 18 | "name":"Escherichia coli str. K-12 substr. MG1655", 19 | "canon_ancestors":[ 20 | {"ncbi_taxonomy_id": "", 21 | "name": ""} 22 | ] 23 | } 24 | } -------------------------------------------------------------------------------- /docs/deployment.rst: -------------------------------------------------------------------------------- 1 | Deployment 2 | ============ 3 | The following instructions describe how to deploy ``datanator`` to the heroku server 4 | 5 | We are deploying the backend API server via a container using the karrlab/wc_env_dependencies:latest image. 6 | 7 | The commands for deploying the container are the following:: 8 | 9 | heroku login 10 | heroku container:login 11 | heroku container:push web -a datanator 12 | heroku container:release web -a datanator 13 | 14 | In order to change the configuration of the container, look at the Dockerfile for datanator. The gunicorn production server can be 15 | adjusted accordingly in order to accommodate the number of users. 
def disable_warnings():
    """ Disable warning messages from openbabel and urllib """
    # Raise openbabel's log threshold to errors only, silencing warnings.
    openbabel.obErrorLog.SetOutputLevel(openbabel.obError)
    # Suppress urllib3's InsecureRequestWarning (emitted for unverified
    # HTTPS requests made through requests' vendored urllib3).
    requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)


def enable_warnings():
    """ Enable warning messages from openbabel and urllib """
    # Lower openbabel's log threshold back down so warnings are emitted.
    openbabel.obErrorLog.SetOutputLevel(openbabel.obWarning)
    # NOTE(review): ``requests.packages.urllib3.warnings`` resolves to the
    # stdlib ``warnings`` module that urllib3 imports, and resetwarnings()
    # clears *all* warning filters process-wide, not just the one disabled
    # above -- confirm this broad reset is intended.
    requests.packages.urllib3.warnings.resetwarnings()
def main():
    """Load the pickled BRENDA constants dump and store it in MongoDB.

    Reads ``~/karr_lab/datanator/docs/brenda/brenda.pkl`` as raw bytes and
    inserts them as a single BSON ``Binary`` document into the
    ``brenda_constants`` collection of the ``test`` database.
    """
    db = 'test'
    collection_str = 'brenda_constants'
    # Read the mongodb config section once instead of once per credential.
    mongo_conf = datanator.config.core.get_config()['datanator']['mongodb']
    username = mongo_conf['user']
    password = mongo_conf['password']
    MongoDB = mongo_conf['server']
    manager = mongo_util.MongoUtil(MongoDB=MongoDB, db=db, username=username,
                                   password=password, collection_str=collection_str)

    # Store the raw pickle bytes directly; unpickling with pickle.load()
    # was unnecessary since the document stores the serialized blob.
    with open(str(Path('~/karr_lab/datanator/docs/brenda/brenda.pkl').expanduser()), 'rb') as f:
        raw_bytes = f.read()
    # The original referenced undefined names ``coll`` and ``thebytes``
    # (NameError at runtime) and used the deprecated ``insert``. Use the
    # collection via the manager's database handle and insert_one instead.
    # NOTE(review): assumes MongoUtil exposes ``db_obj`` (as used by other
    # datanator sources, e.g. UniprotNoSQL tests) -- confirm.
    manager.db_obj[collection_str].insert_one({'bin-data': Binary(raw_bytes)})
class TestQueryDemo(unittest.TestCase):
    """Integration tests for ``query_demo.QueryDemo`` against the test DB."""

    @classmethod
    def setUpClass(cls):
        # One shared query object for every test in this class.
        test_conf = config.TestConfig()
        cls.src = query_demo.QueryDemo(MongoDB=test_conf.SERVER,
                                       db='datanator-test',
                                       collection_str="taxon_tree",
                                       username=test_conf.USERNAME,
                                       password=test_conf.PASSWORD)

    @classmethod
    def tearDownClass(cls):
        # Release the MongoDB connection opened in setUpClass.
        cls.src.client.close()

    def test_get_canon_ancestors(self):
        # A known taxon (S. aureus, 1280) lists 'cellular organisms' first.
        ancestors = self.src.get_canon_ancestors(1280)
        expected_root = {'ncbi_taxonomy_id': 131567, 'name': 'cellular organisms'}
        self.assertEqual(ancestors[0], expected_root)
        # An unknown taxon id yields an empty ancestor list.
        self.assertEqual(self.src.get_canon_ancestors(0), [])
and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/protein_localization/Experimental_v4.00_PSortdb/P50307.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "Cytoplasmic protein", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "name_space": "uniprot_id", 9 | "value": "P50307" 10 | }, 11 | { 12 | "name_space": "Refseq_Accession", 13 | "value": null 14 | }, 15 | { 16 | "name_space": "Other_Accession", 17 | "value": null 18 | } 19 | ] 20 | }, 21 | "value": { 22 | "experimental_localization": [ 23 | "Cytoplasmic" 24 | ], 25 | "secondary_localizaton": [] 26 | }, 27 | "genotype": { 28 | "taxon": { 29 | "ncbi_taxonomy_id": 1280, 30 | "name": "Staphylococcus aureus" 31 | } 32 | }, 33 | "environment": { 34 | "GramStain": "Gram positive" 35 | }, 36 | "source": { 37 | "namespace": "ePSORTdb", 38 | "value": "Version 3" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Experimental_v4.00_PSortdb/P01553.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "Enterotoxin type C-1 precursor (SEC1)", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "name_space": "uniprot_id", 9 | "value": 
"P01553" 10 | }, 11 | { 12 | "name_space": "Refseq_Accession", 13 | "value": null 14 | }, 15 | { 16 | "name_space": "Other_Accession", 17 | "value": null 18 | } 19 | ] 20 | }, 21 | "value": { 22 | "experimental_localization": [ 23 | "Extracellular" 24 | ], 25 | "secondary_localizaton": [] 26 | }, 27 | "genotype": { 28 | "taxon": { 29 | "ncbi_taxonomy_id": 1280, 30 | "name": "Staphylococcus aureus" 31 | } 32 | }, 33 | "environment": { 34 | "GramStain": "Gram positive" 35 | }, 36 | "source": { 37 | "namespace": "ePSORTdb", 38 | "value": "Version 3" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Experimental_v4.00_PSortdb/P34071.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "Enterotoxin type C-2 precursor (SEC2)", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "name_space": "uniprot_id", 9 | "value": "P34071" 10 | }, 11 | { 12 | "name_space": "Refseq_Accession", 13 | "value": null 14 | }, 15 | { 16 | "name_space": "Other_Accession", 17 | "value": null 18 | } 19 | ] 20 | }, 21 | "value": { 22 | "experimental_localization": [ 23 | "Extracellular" 24 | ], 25 | "secondary_localizaton": [] 26 | }, 27 | "genotype": { 28 | "taxon": { 29 | "ncbi_taxonomy_id": 1280, 30 | "name": "Staphylococcus aureus" 31 | } 32 | }, 33 | "environment": { 34 | "GramStain": "Gram positive" 35 | }, 36 | "source": { 37 | "namespace": "ePSORTdb", 38 | "value": "Version 3" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Experimental_v4.00_PSortdb/P06886.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "Toxic shock syndrome toxin-1 precursor (TSST-1)", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "name_space": "uniprot_id", 9 | "value": "P06886" 10 | }, 
11 | { 12 | "name_space": "Refseq_Accession", 13 | "value": null 14 | }, 15 | { 16 | "name_space": "Other_Accession", 17 | "value": null 18 | } 19 | ] 20 | }, 21 | "value": { 22 | "experimental_localization": [ 23 | "Extracellular" 24 | ], 25 | "secondary_localizaton": [] 26 | }, 27 | "genotype": { 28 | "taxon": { 29 | "ncbi_taxonomy_id": 1280, 30 | "name": "Staphylococcus aureus" 31 | } 32 | }, 33 | "environment": { 34 | "GramStain": "Gram positive" 35 | }, 36 | "source": { 37 | "namespace": "ePSORTdb", 38 | "value": "Version 3" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Experimental_v4.00_PSortdb/P10335.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "Lipase precursor (Glycerol ester hydrolase) ", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "name_space": "uniprot_id", 9 | "value": "P10335" 10 | }, 11 | { 12 | "name_space": "Refseq_Accession", 13 | "value": null 14 | }, 15 | { 16 | "name_space": "Other_Accession", 17 | "value": null 18 | } 19 | ] 20 | }, 21 | "value": { 22 | "experimental_localization": [ 23 | "Extracellular" 24 | ], 25 | "secondary_localizaton": [] 26 | }, 27 | "genotype": { 28 | "taxon": { 29 | "ncbi_taxonomy_id": 1280, 30 | "name": "Staphylococcus aureus" 31 | } 32 | }, 33 | "environment": { 34 | "GramStain": "Gram positive" 35 | }, 36 | "source": { 37 | "namespace": "ePSORTdb", 38 | "value": "Version 3" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/about.rst: -------------------------------------------------------------------------------- 1 | About 2 | ===== 3 | 4 | ---------------------- 5 | License 6 | ---------------------- 7 | 8 | The software is released under the MIT license: 9 | 10 | .. 
literalinclude:: ../LICENSE 11 | :language: text 12 | 13 | ---------------------- 14 | Development team 15 | ---------------------- 16 | 17 | This package was developed by the following researchers in the `Karr Lab `_ at the Icahn School of Medicine at Mount Sinai in New York, USA: 18 | 19 | * `Yosef Roth `_ 20 | * `Saahith Pochiraju `_ 21 | * Balazs Szigeti 22 | * `Jonathan Karr `_ 23 | 24 | ---------------------- 25 | Acknowledgements 26 | ---------------------- 27 | 28 | This work was supported by a National Institute of Health MIRA award [grant number 1 R35 GM 119771-01]; a National Science Foundation INSPIRE award [grant number 1649014]; and the National Science Foundation / ERASynBio [grant numbers 1548123, 335672]. 29 | 30 | ---------------------- 31 | Questions and comments 32 | ---------------------- 33 | 34 | Please contact the `Karr Lab `_ with any questions or comments. 35 | -------------------------------------------------------------------------------- /docs/protein_localization/Experimental_v4.00_PSortdb/P01552.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "Enterotoxin type B precursor (SEB) ", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "name_space": "uniprot_id", 9 | "value": "P01552" 10 | }, 11 | { 12 | "name_space": "Refseq_Accession", 13 | "value": null 14 | }, 15 | { 16 | "name_space": "Other_Accession", 17 | "value": null 18 | } 19 | ] 20 | }, 21 | "value": { 22 | "experimental_localization": [ 23 | "Extracellular" 24 | ], 25 | "secondary_localizaton": [] 26 | }, 27 | "genotype": { 28 | "taxon": { 29 | "ncbi_taxonomy_id": 1280, 30 | "name": "Staphylococcus aureus", 31 | "canon_anctors": [] 32 | } 33 | }, 34 | "environment": { 35 | "GramStain": "Gram positive" 36 | }, 37 | "source": { 38 | "namespace": "ePSORTdb", 39 | "value": "Version 3" 40 | } 41 | } -------------------------------------------------------------------------------- 
/docs/protein_localization/Experimental_v4.00_PSortdb/P09978.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "Phospholipase C precursor (Beta-hemolysin) (Beta-toxin) (Sphingomyelinase) (SMase) ", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "name_space": "uniprot_id", 9 | "value": "P09978" 10 | }, 11 | { 12 | "name_space": "Refseq_Accession", 13 | "value": null 14 | }, 15 | { 16 | "name_space": "Other_Accession", 17 | "value": null 18 | } 19 | ] 20 | }, 21 | "value": { 22 | "experimental_localization": [ 23 | "Extracellular" 24 | ], 25 | "secondary_localizaton": [] 26 | }, 27 | "genotype": { 28 | "taxon": { 29 | "ncbi_taxonomy_id": 1280, 30 | "name": "Staphylococcus aureus" 31 | } 32 | }, 33 | "environment": { 34 | "GramStain": "Gram positive" 35 | }, 36 | "source": { 37 | "namespace": "ePSORTdb", 38 | "value": "Version 3" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Experimental_v4.00_PSortdb/P81177.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "Zinc metalloproteinase aureolysin precursor (Staphylococcus aureus neutral proteinase) ", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "name_space": "uniprot_id", 9 | "value": "P81177" 10 | }, 11 | { 12 | "name_space": "Refseq_Accession", 13 | "value": null 14 | }, 15 | { 16 | "name_space": "Other_Accession", 17 | "value": null 18 | } 19 | ] 20 | }, 21 | "value": { 22 | "experimental_localization": [ 23 | "Extracellular" 24 | ], 25 | "secondary_localizaton": [] 26 | }, 27 | "genotype": { 28 | "taxon": { 29 | "ncbi_taxonomy_id": 1280, 30 | "name": "Staphylococcus aureus" 31 | } 32 | }, 33 | "environment": { 34 | "GramStain": "Gram positive" 35 | }, 36 | "source": { 37 | "namespace": "ePSORTdb", 38 | "value": "Version 3" 39 | } 40 | } 
-------------------------------------------------------------------------------- /docs/protein_localization/Experimental_v4.00_PSortdb/P00644.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "Thermonuclease precursor (EC 31311) (TNase) (Micrococcal nuclease) (Staphylococcal nuclease)", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "name_space": "uniprot_id", 9 | "value": "P00644" 10 | }, 11 | { 12 | "name_space": "Refseq_Accession", 13 | "value": null 14 | }, 15 | { 16 | "name_space": "Other_Accession", 17 | "value": null 18 | } 19 | ] 20 | }, 21 | "value": { 22 | "experimental_localization": [ 23 | "Extracellular" 24 | ], 25 | "secondary_localizaton": [] 26 | }, 27 | "genotype": { 28 | "taxon": { 29 | "ncbi_taxonomy_id": 1280, 30 | "name": "Staphylococcus aureus" 31 | } 32 | }, 33 | "environment": { 34 | "GramStain": "Gram positive" 35 | }, 36 | "source": { 37 | "namespace": "ePSORTdb", 38 | "value": "Version 3" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Experimental_v4.00_PSortdb/P45723.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "1-phosphatidylinositol phosphodiesterase precursor (EC 31410) (Phosphatidylinositol-specific phospholipase C) (PI-PLC)", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "name_space": "uniprot_id", 9 | "value": "P45723" 10 | }, 11 | { 12 | "name_space": "Refseq_Accession", 13 | "value": null 14 | }, 15 | { 16 | "name_space": "Other_Accession", 17 | "value": null 18 | } 19 | ] 20 | }, 21 | "value": { 22 | "experimental_localization": [ 23 | "Extracellular" 24 | ], 25 | "secondary_localizaton": [] 26 | }, 27 | "genotype": { 28 | "taxon": { 29 | "ncbi_taxonomy_id": 1280, 30 | "name": "Staphylococcus aureus" 31 | } 32 | }, 33 | "environment": { 34 | "GramStain": 
'''
Fork from git@github.com:mnowotka/chembl_ikey.git
'''

from itertools import product
from string import ascii_uppercase

# Three-letter alphabet for 14-bit chunks: every AAA-ZZZ triplet except
# those starting with 'E' and the reserved range TAA-TTV (16384 = 2**14
# codes remain). d26 is the full two-letter alphabet for 9-bit chunks.
t26 = [''.join(c) for c in product(ascii_uppercase, repeat=3)
       if c[0] != 'E' and not ('TAA' <= ''.join(c) <= 'TTV')]
d26 = [''.join(c) for c in product(ascii_uppercase, repeat=2)]


def base26_triplet_1(a):
    """Return the three-letter code for the first 14-bit chunk of *a* (bytes 0-1)."""
    index = a[0] | ((a[1] & 0x3f) << 8)
    return t26[index]


def base26_triplet_2(a):
    """Return the three-letter code for the second 14-bit chunk of *a* (bytes 1-3)."""
    index = ((a[1] & 0xc0) | (a[2] << 8) | ((a[3] & 0x0f) << 16)) >> 6
    return t26[index]


def base26_triplet_3(a):
    """Return the three-letter code for the third 14-bit chunk of *a* (bytes 3-5)."""
    index = ((a[3] & 0xf0) | (a[4] << 8) | ((a[5] & 0x03) << 16)) >> 4
    return t26[index]


def base26_triplet_4(a):
    """Return the three-letter code for the fourth 14-bit chunk of *a* (bytes 5-6)."""
    index = ((a[5] & 0xfc) | (a[6] << 8)) >> 2
    return t26[index]


def base26_dublet_for_bits_28_to_36(a):
    """Return the two-letter code for bits 28-36 of *a* (bytes 3-4)."""
    index = ((a[3] & 0xf0) | ((a[4] & 0x1f) << 8)) >> 4
    return d26[index]


def base26_dublet_for_bits_56_to_64(a):
    """Return the two-letter code for bits 56-64 of *a* (bytes 7-8)."""
    index = a[7] | ((a[8] & 0x01) << 8)
    return d26[index]
class TestUniprotNoSQL(unittest.TestCase):
    """Smoke tests for ``uniprot_nosql.UniprotNoSQL``.

    These hit the real MongoDB server from the datanator config and download
    UniProt data, so they are integration tests rather than unit tests.
    """

    @classmethod
    def setUpClass(cls):
        # Scratch directory for any downloaded/cached files.
        cls.cache_dirname = tempfile.mkdtemp()
        db = 'test'
        username = datanator.config.core.get_config()['datanator']['mongodb']['user']
        password = datanator.config.core.get_config()['datanator']['mongodb']['password']
        MongoDB = datanator.config.core.get_config()['datanator']['mongodb']['server']
        # max_entries=20 keeps the load test small.
        cls.src = uniprot_nosql.UniprotNoSQL(MongoDB=MongoDB, db=db, max_entries=20,
                                             username=username, password=password, collection_str='test_uniprot')

    @classmethod
    def tearDownClass(cls):
        shutil.rmtree(cls.cache_dirname)
        # Drop the scratch collection so reruns start from a clean state.
        cls.src.db_obj.drop_collection(cls.src.collection_str)

    # @unittest.skip('large single file download')
    def test_proper_loading(self):
        # Only checks that the load completes without raising; the count
        # assertions below were disabled, presumably because document
        # counts vary between runs -- confirm before re-enabling.
        self.src.load_uniprot()
        # count = uni.count()
        # self.assertEqual(count, 10)
        # self.assertNotEqual(uni.find_one()['gene_name'], None)

    def test_fill_species_name(self):
        # Back-fills species names on existing documents; asserts nothing,
        # so this only verifies the call completes without raising.
        self.src.fill_species_name()
class TestWarningUtil(unittest.TestCase):
    """Tests that warning_util can silence and restore openbabel output."""

    # ADP SMILES string; converting it to InChI makes openbabel emit warnings.
    adp = 'NC1=C2N=CN(C3OC(COP([O-])(=O)OP([O-])([O-])=O)C(O)C3O)C2=NC=N1'

    def test_enable_warnings_openbabel(self):
        warning_util.enable_warnings()
        with CaptureOutput(termination_delay=0.1) as capturer:
            molecule_util.Molecule(structure=self.adp).to_inchi()
        # With warnings enabled, the conversion should print something.
        self.assertNotEqual(capturer.get_text(), '')

    def test_disable_warnings_openbabel(self):
        warning_util.disable_warnings()
        with CaptureOutput(termination_delay=0.1) as capturer:
            molecule_util.Molecule(structure=self.adp).to_inchi()
        # With warnings disabled, the conversion should be silent.
        self.assertEqual(capturer.get_text(), '')

    @unittest.skip('todo: implement')
    def test_disable_warnings_urllib3(self):
        # Import locally: ``requests`` is not imported at module level, so
        # the original body would raise NameError once this test is
        # unskipped and implemented.
        import requests
        warning_util.disable_warnings()
        with CaptureOutput(termination_delay=0.1) as capturer:
            response = requests.get('http://www.karrlab.org')
        self.assertEqual(capturer.get_text(), '')
self.assertEqual('InChI=1S/H2O', self.src.simplify_inchi(inchi)) 21 | inchi = None 22 | self.assertEqual('InChI = None', self.src.simplify_inchi(inchi)) 23 | 24 | def test_hash_inchi(self): 25 | inchi = 'InChI=1S/C6H12N2O4S2/c7-3(5(9)10)1-13-14-2-4(8)6(11)12' 26 | hashed = 'e0a402c94a0ecd52ec426756854592f76eece8fd3ffef2e7347fb6c5' 27 | self.assertEqual(hashed, self.src.hash_inchi(inchi)) 28 | self.assertEqual('InChI = None', self.src.hash_inchi(None)) 29 | 30 | def test_morphineInChIKey(self): 31 | key = self.src.inchi_to_inchikey("InChI=1S/C17H19NO3/c1-18-7-6-17-10-3-5-13(20)16(17)21-15-12(19)4-2-9(14(15)17)8-11(10)18/h2-5,10-11,13,16,19-20H,6-8H2,1H3/t10-,11+,13-,16-,17-/m0/s1") 32 | self.assertEqual(key,'BQJCRHHNABKAKU-KBQPJGBKSA-N') 33 | key_1 = self.src.inchi_to_inchikey('InChI=1S/H2O/h1H2') 34 | self.assertEqual(key_1, 'XLYOFNOQVPJJNP-UHFFFAOYSA-N') -------------------------------------------------------------------------------- /docs/protein_localization/Gram_Negative_WO_Outer_Membrane/WP_012241978.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "hypothetical protein", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "namespace": "Seq_ID", 9 | "value": "WP_012241978.1" 10 | } 11 | ] 12 | }, 13 | "value": { 14 | "PPSVM_Localization": null, 15 | "Profile_Localization": "Unknown", 16 | "Signal_Localization": "Unknown", 17 | "SCL-BLASTe_Localization": "Unknown", 18 | "CMSVM_Localization": "Unknown", 19 | "SCL-BLAST_Localization": "Unknown", 20 | "OMPMotif_Localization": null, 21 | "OMSVM_Localization": null, 22 | "Motif_Localization": "Unknown", 23 | "CytoSVM_Localization": "Unknown", 24 | "CWSVM_Localization": "Unknown", 25 | "ModHMM_Localization": "Unknown", 26 | "ECSVM_Localization": "Unknown", 27 | "Cytoplasmic Membrane_Score": 2.5, 28 | "Cellwall_Score": 2.5, 29 | "Extracellular_Score": 2.5, 30 | "Cytoplasmic_Score": 2.5, 31 | "Final_Localization": "Unknown", 
32 | "Final_Localization_2": null, 33 | "Secondary_Localization": null, 34 | "Final_Score": 2.5 35 | }, 36 | "source": { 37 | "namespace": "PSORT", 38 | "value": "Version 3.0" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Gram_Negative_WO_Outer_Membrane/WP_012242024.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "hypothetical protein", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "namespace": "Seq_ID", 9 | "value": "WP_012242024.1" 10 | } 11 | ] 12 | }, 13 | "value": { 14 | "PPSVM_Localization": null, 15 | "Profile_Localization": "Unknown", 16 | "Signal_Localization": "Unknown", 17 | "SCL-BLASTe_Localization": "Unknown", 18 | "CMSVM_Localization": "Unknown", 19 | "SCL-BLAST_Localization": "Unknown", 20 | "OMPMotif_Localization": null, 21 | "OMSVM_Localization": null, 22 | "Motif_Localization": "Unknown", 23 | "CytoSVM_Localization": "Unknown", 24 | "CWSVM_Localization": "Unknown", 25 | "ModHMM_Localization": "Unknown", 26 | "ECSVM_Localization": "Unknown", 27 | "Cytoplasmic Membrane_Score": 2.5, 28 | "Cellwall_Score": 2.5, 29 | "Extracellular_Score": 2.5, 30 | "Cytoplasmic_Score": 2.5, 31 | "Final_Localization": "Unknown", 32 | "Final_Localization_2": null, 33 | "Secondary_Localization": null, 34 | "Final_Score": 2.5 35 | }, 36 | "source": { 37 | "namespace": "PSORT", 38 | "value": "Version 3.0" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Gram_Negative_WO_Outer_Membrane/WP_041633705.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "hypothetical protein", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "namespace": "Seq_ID", 9 | "value": "WP_041633705.1" 10 | } 11 | ] 12 | }, 13 | "value": { 14 | 
"PPSVM_Localization": null, 15 | "Profile_Localization": "Unknown", 16 | "Signal_Localization": "Unknown", 17 | "SCL-BLASTe_Localization": "Unknown", 18 | "CMSVM_Localization": "Unknown", 19 | "SCL-BLAST_Localization": "Unknown", 20 | "OMPMotif_Localization": null, 21 | "OMSVM_Localization": null, 22 | "Motif_Localization": "Unknown", 23 | "CytoSVM_Localization": "Unknown", 24 | "CWSVM_Localization": "Unknown", 25 | "ModHMM_Localization": "Unknown", 26 | "ECSVM_Localization": "Unknown", 27 | "Cytoplasmic Membrane_Score": 2.5, 28 | "Cellwall_Score": 2.5, 29 | "Extracellular_Score": 2.5, 30 | "Cytoplasmic_Score": 2.5, 31 | "Final_Localization": "Unknown", 32 | "Final_Localization_2": null, 33 | "Secondary_Localization": null, 34 | "Final_Score": 2.5 35 | }, 36 | "source": { 37 | "namespace": "PSORT", 38 | "value": "Version 3.0" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Gram_Negative_WO_Outer_Membrane/WP_041633707.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "hypothetical protein", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "namespace": "Seq_ID", 9 | "value": "WP_041633707.1" 10 | } 11 | ] 12 | }, 13 | "value": { 14 | "PPSVM_Localization": null, 15 | "Profile_Localization": "Unknown", 16 | "Signal_Localization": "Unknown", 17 | "SCL-BLASTe_Localization": "Unknown", 18 | "CMSVM_Localization": "Unknown", 19 | "SCL-BLAST_Localization": "Unknown", 20 | "OMPMotif_Localization": null, 21 | "OMSVM_Localization": null, 22 | "Motif_Localization": "Unknown", 23 | "CytoSVM_Localization": "Unknown", 24 | "CWSVM_Localization": "Unknown", 25 | "ModHMM_Localization": "Unknown", 26 | "ECSVM_Localization": "Unknown", 27 | "Cytoplasmic Membrane_Score": 2.5, 28 | "Cellwall_Score": 2.5, 29 | "Extracellular_Score": 2.5, 30 | "Cytoplasmic_Score": 2.5, 31 | "Final_Localization": "Unknown", 32 | 
"Final_Localization_2": null, 33 | "Secondary_Localization": null, 34 | "Final_Score": 2.5 35 | }, 36 | "source": { 37 | "namespace": "PSORT", 38 | "value": "Version 3.0" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Gram_Negative_WO_Outer_Membrane/WP_081423625.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "hypothetical protein", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "namespace": "Seq_ID", 9 | "value": "WP_081423625.1" 10 | } 11 | ] 12 | }, 13 | "value": { 14 | "PPSVM_Localization": null, 15 | "Profile_Localization": "Unknown", 16 | "Signal_Localization": "Unknown", 17 | "SCL-BLASTe_Localization": "Unknown", 18 | "CMSVM_Localization": "Unknown", 19 | "SCL-BLAST_Localization": "Unknown", 20 | "OMPMotif_Localization": null, 21 | "OMSVM_Localization": null, 22 | "Motif_Localization": "Unknown", 23 | "CytoSVM_Localization": "Unknown", 24 | "CWSVM_Localization": "Unknown", 25 | "ModHMM_Localization": "Unknown", 26 | "ECSVM_Localization": "Unknown", 27 | "Cytoplasmic Membrane_Score": 2.5, 28 | "Cellwall_Score": 2.5, 29 | "Extracellular_Score": 2.5, 30 | "Cytoplasmic_Score": 2.5, 31 | "Final_Localization": "Unknown", 32 | "Final_Localization_2": null, 33 | "Secondary_Localization": null, 34 | "Final_Score": 2.5 35 | }, 36 | "source": { 37 | "namespace": "PSORT", 38 | "value": "Version 3.0" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Gram_Negative_WO_Outer_Membrane/WP_012242018.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "transcriptional regulator", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "namespace": "Seq_ID", 9 | "value": "WP_012242018.1" 10 | } 11 | ] 12 | }, 13 | "value": { 14 | 
"PPSVM_Localization": null, 15 | "Profile_Localization": "Unknown", 16 | "Signal_Localization": "Unknown", 17 | "SCL-BLASTe_Localization": "Unknown", 18 | "CMSVM_Localization": "Unknown", 19 | "SCL-BLAST_Localization": "Unknown", 20 | "OMPMotif_Localization": null, 21 | "OMSVM_Localization": null, 22 | "Motif_Localization": "Unknown", 23 | "CytoSVM_Localization": "Unknown", 24 | "CWSVM_Localization": "Unknown", 25 | "ModHMM_Localization": "Unknown", 26 | "ECSVM_Localization": "Unknown", 27 | "Cytoplasmic Membrane_Score": 2.5, 28 | "Cellwall_Score": 2.5, 29 | "Extracellular_Score": 2.5, 30 | "Cytoplasmic_Score": 2.5, 31 | "Final_Localization": "Unknown", 32 | "Final_Localization_2": null, 33 | "Secondary_Localization": null, 34 | "Final_Score": 2.5 35 | }, 36 | "source": { 37 | "namespace": "PSORT", 38 | "value": "Version 3.0" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Gram_Negative_WO_Outer_Membrane/WP_012242006.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "formate--tetrahydrofolate ligase", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "namespace": "Seq_ID", 9 | "value": "WP_012242006.1" 10 | } 11 | ] 12 | }, 13 | "value": { 14 | "PPSVM_Localization": null, 15 | "Profile_Localization": "Unknown", 16 | "Signal_Localization": "Unknown", 17 | "SCL-BLASTe_Localization": "Unknown", 18 | "CMSVM_Localization": "Unknown", 19 | "SCL-BLAST_Localization": "Unknown", 20 | "OMPMotif_Localization": null, 21 | "OMSVM_Localization": null, 22 | "Motif_Localization": "Unknown", 23 | "CytoSVM_Localization": "Unknown", 24 | "CWSVM_Localization": "Unknown", 25 | "ModHMM_Localization": "Unknown", 26 | "ECSVM_Localization": "Unknown", 27 | "Cytoplasmic Membrane_Score": 2.5, 28 | "Cellwall_Score": 2.5, 29 | "Extracellular_Score": 2.5, 30 | "Cytoplasmic_Score": 2.5, 31 | "Final_Localization": "Unknown", 32 
| "Final_Localization_2": null, 33 | "Secondary_Localization": null, 34 | "Final_Score": 2.5 35 | }, 36 | "source": { 37 | "namespace": "PSORT", 38 | "value": "Version 3.0" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Gram_Negative_WO_Outer_Membrane/WP_012242014.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "hypothetical protein", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "namespace": "Seq_ID", 9 | "value": "WP_012242014.1" 10 | } 11 | ] 12 | }, 13 | "value": { 14 | "PPSVM_Localization": null, 15 | "Profile_Localization": "Unknown", 16 | "Signal_Localization": "Non-Cytoplasmic", 17 | "SCL-BLASTe_Localization": "Unknown", 18 | "CMSVM_Localization": "Unknown", 19 | "SCL-BLAST_Localization": "Unknown", 20 | "OMPMotif_Localization": null, 21 | "OMSVM_Localization": null, 22 | "Motif_Localization": "Unknown", 23 | "CytoSVM_Localization": "Unknown", 24 | "CWSVM_Localization": "Unknown", 25 | "ModHMM_Localization": "Unknown", 26 | "ECSVM_Localization": "Unknown", 27 | "Cytoplasmic Membrane_Score": 3.33, 28 | "Cellwall_Score": 3.33, 29 | "Extracellular_Score": 3.33, 30 | "Cytoplasmic_Score": 0.0, 31 | "Final_Localization": "Unknown", 32 | "Final_Localization_2": null, 33 | "Secondary_Localization": null, 34 | "Final_Score": 3.33 35 | }, 36 | "source": { 37 | "namespace": "PSORT", 38 | "value": "Version 3.0" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Gram_Negative_WO_Outer_Membrane/WP_012242027.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "thiamine biosynthesis lipoprotein", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "namespace": "Seq_ID", 9 | "value": "WP_012242027.1" 10 | } 11 | ] 12 | }, 13 | "value": { 
14 | "PPSVM_Localization": null, 15 | "Profile_Localization": "Unknown", 16 | "Signal_Localization": "Unknown", 17 | "SCL-BLASTe_Localization": "Unknown", 18 | "CMSVM_Localization": "Unknown", 19 | "SCL-BLAST_Localization": "Unknown", 20 | "OMPMotif_Localization": null, 21 | "OMSVM_Localization": null, 22 | "Motif_Localization": "Unknown", 23 | "CytoSVM_Localization": "Unknown", 24 | "CWSVM_Localization": "Unknown", 25 | "ModHMM_Localization": "Unknown", 26 | "ECSVM_Localization": "Unknown", 27 | "Cytoplasmic Membrane_Score": 2.5, 28 | "Cellwall_Score": 2.5, 29 | "Extracellular_Score": 2.5, 30 | "Cytoplasmic_Score": 2.5, 31 | "Final_Localization": "Unknown", 32 | "Final_Localization_2": null, 33 | "Secondary_Localization": null, 34 | "Final_Score": 2.5 35 | }, 36 | "source": { 37 | "namespace": "PSORT", 38 | "value": "Version 3.0" 39 | } 40 | } -------------------------------------------------------------------------------- /docs/protein_localization/Gram_Negative_WO_Outer_Membrane/WP_012242037.1.json: -------------------------------------------------------------------------------- 1 | { 2 | "entity": { 3 | "type": "protein", 4 | "name": "DUF951 domain-containing protein", 5 | "synonyms": [], 6 | "identifiers": [ 7 | { 8 | "namespace": "Seq_ID", 9 | "value": "WP_012242037.1" 10 | } 11 | ] 12 | }, 13 | "value": { 14 | "PPSVM_Localization": null, 15 | "Profile_Localization": "Unknown", 16 | "Signal_Localization": "Unknown", 17 | "SCL-BLASTe_Localization": "Unknown", 18 | "CMSVM_Localization": "Unknown", 19 | "SCL-BLAST_Localization": "Unknown", 20 | "OMPMotif_Localization": null, 21 | "OMSVM_Localization": null, 22 | "Motif_Localization": "Unknown", 23 | "CytoSVM_Localization": "Unknown", 24 | "CWSVM_Localization": "Unknown", 25 | "ModHMM_Localization": "Unknown", 26 | "ECSVM_Localization": "Unknown", 27 | "Cytoplasmic Membrane_Score": 2.5, 28 | "Cellwall_Score": 2.5, 29 | "Extracellular_Score": 2.5, 30 | "Cytoplasmic_Score": 2.5, 31 | "Final_Localization": 
"Unknown", 32 | "Final_Localization_2": null, 33 | "Secondary_Localization": null, 34 | "Final_Score": 2.5 35 | }, 36 | "source": { 37 | "namespace": "PSORT", 38 | "value": "Version 3.0" 39 | } 40 | } -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | datanator: 5 | image: karrlab/wc_env 6 | restart: always 7 | stdin_open: true 8 | tty: true 9 | ports: 10 | - "10001:5002" 11 | volumes: 12 | - "../:/root/karr_lab" 13 | - "/home/zl/.wc:/root/.wc" 14 | entrypoint: > 15 | bash -c "python3 -m pip install -e /root/karr_lab/pkg_utils/ 16 | && python3 -m pip install -e /root/karr_lab/wc_utils/ 17 | && python3 -m pip install -e /root/karr_lab/karr_lab_aws_manager/ 18 | && python3 -m pip install -e /root/karr_lab/datanator_query_python/ 19 | && python3 -m pip install -e /root/karr_lab/datanator/ 20 | && tail -f /dev/null" 21 | 22 | # mongo: 23 | # image: mongo:4.0.10 24 | # restart: always 25 | # volumes: 26 | # - ./datanator/data_source/cache/mongo:/data/db 27 | # - ../:/root/karr_lab 28 | # ports: 29 | # - "27017:27017" 30 | # depends_on: 31 | # - datanator 32 | 33 | # mongosetup: 34 | # image: lzy7071/mongo-curl:latest 35 | # volumes: 36 | # - ../:/root/karr_lab 37 | # entrypoint: [ "bash", "/root/karr_lab/datanator/scripts/mongorestore.sh" ] 38 | # restart: on-failure 39 | # depends_on: 40 | # - mongo 41 | 42 | # mongoexpress: 43 | # image: mongo-express:0.49.0 44 | # restart: always 45 | # ports: 46 | # - "8081:8081" 47 | # depends_on: 48 | # - mongo 49 | # restart: always 50 | # command: sh -c 'sleep 10 && tini -- node app' -------------------------------------------------------------------------------- /datanator/data_source/rna_halflife/back_fill_gene_name.py: -------------------------------------------------------------------------------- 1 | from datanator_query_python.query import 
class FillGeneName(mongo_util.MongoUtil):
    """Back-fill the ``gene_name`` field of documents in the rna_halflife collection.

    Holds three handles: a direct handle on the target collection, a query
    manager for rna_halflife, and a query manager for uniprot.
    """

    def __init__(self, server=None, db='datanator', collection_str='rna_halflife', username=None,
                 password=None, authSource='admin', readPreference='nearest', verbose=False, max_entries=float('inf')):
        """Connect to MongoDB and construct the query managers.

        Args:
            server (:obj:`str`, optional): MongoDB server address.
            db (:obj:`str`, optional): database name.
            collection_str (:obj:`str`, optional): collection to read and write.
            username (:obj:`str`, optional): MongoDB username.
            password (:obj:`str`, optional): MongoDB password.
            authSource (:obj:`str`, optional): authentication database.
            readPreference (:obj:`str`, optional): MongoDB read preference.
            verbose (:obj:`bool`, optional): whether to print progress.
            max_entries (:obj:`float`, optional): cap on the number of documents processed.
        """
        super().__init__(MongoDB=server, db=db, verbose=verbose, max_entries=max_entries,
                         username=username, password=password, authSource=authSource, readPreference=readPreference)
        # Direct (client, db, collection) handles for writes.
        self.client, self.db, self.collection = self.con_db(collection_str)
        # Read-side query managers; both point at the same db/collection.
        self.rna_query = query_rna_halflife.QueryRNA(server=server, username=username, password=password, verbose=verbose,
                                                     db=db, collection_str=collection_str, authDB=authSource, readPreference=readPreference)
        self.uniprot_query = query_uniprot.QueryUniprot(username=username, password=password, server=server, authSource=authSource,
                                                        database=db, collection_str=collection_str, readPreference=readPreference)

    def fill_with_oln(self):
        """Fill gene_name with 'ordered_locus_name' field.

        NOTE(review): unimplemented stub -- ``con_0``/``con_1`` define the
        intended filters (documents with no gene_name that do carry an
        ordered_locus_name) but no query is issued yet.
        """
        # Filter: documents whose gene_name is null.
        con_0 = {'gene_name': None}
        # Filter: documents that have halflives.ordered_locus_name.
        con_1 = {'halflives.ordered_locus_name': {'$exists': True}}
        pass
class TestCorumNoSQL(unittest.TestCase):
    """Integration test for ``pax_nosql.PaxNoSQL.load_content`` against a live
    MongoDB 'test' database (credentials come from the local datanator config).
    """

    def setUp(self):
        """Create a scratch cache directory and read MongoDB credentials."""
        self.cache_dirname = tempfile.mkdtemp()
        self.db = 'test'
        self.username = datanator.config.core.get_config()['datanator']['mongodb']['user']
        self.password = datanator.config.core.get_config()['datanator']['mongodb']['password']
        self.MongoDB = datanator.config.core.get_config()['datanator']['mongodb']['server']

    def tearDown(self):
        """Remove the scratch cache directory."""
        shutil.rmtree(self.cache_dirname)

    # only loads partial content because it takes too long to load everything
    def test_load_content(self):
        src = pax_nosql.PaxNoSQL(
            self.cache_dirname, self.MongoDB, self.db, verbose=True, max_entries=5,
            password=self.password, username=self.username)
        collection = src.load_content()
        # Collection.count() and Cursor.count() were deprecated in PyMongo 3.7
        # and removed in 4.0; use count_documents() / find_one() instead.
        self.assertEqual(collection.count_documents({}), 5)
        filter_0 = {'file_name': '882/882-WHOLE_ORGANISM-integrated.txt'}
        self.assertEqual(collection.count_documents(filter_0), 1)
        doc = collection.find_one(filter_0)
        self.assertEqual(doc['species_name'], 'D.vulgaris')
        self.assertEqual(doc['observation'][0]['string_id'], '882.DVU0949')
        filter_1 = {'file_name': '882/882-Desulfo_Lac_Exp_SC_zhang_2006.txt'}
        self.assertEqual(collection.count_documents(filter_1), 1)
        doc = collection.find_one(filter_1)
        self.assertEqual(doc['weight'], 20)
        self.assertEqual(doc['observation'][1]['string_id'], '882.DVU0142')
class Demo(mongo_util.MongoUtil):
    """Minimal example of upserting a document through ``MongoUtil``."""

    def __init__(self,
                 server_demo="someaddress",
                 db_demo="datanator-demo",
                 username_demo="username",
                 password_demo="password",
                 collection_str="demo-collection"):
        """Connect to MongoDB and keep a handle on the target collection."""
        super().__init__(MongoDB=server_demo,
                         db=db_demo,
                         username=username_demo,
                         password=password_demo)
        self.collection = self.db_obj[collection_str]

    def update_collection(self):
        """Update collection in db.

        Upserts a hard-coded demo document keyed on ``uniprot_id``:
        ``locale`` is overwritten and ``array_obj`` is unioned with any
        existing array via ``$addToSet``/``$each``.
        """
        record = {"uniprot_id": "P01234",
                  "locale": "cell membrane",
                  "array_obj": ["a", "c", "d"]}
        selector = {"uniprot_id": record["uniprot_id"]}
        changes = {"$set": {"locale": record["locale"]},
                   "$addToSet": {"array_obj": {"$each": record["array_obj"]}}}
        self.collection.update_one(selector, changes, upsert=True)


def main():
    # Credentials come from the schema-migration config section.
    conf = config.SchemaMigration()
    src = Demo(server_demo=conf.SERVER,
               username_demo=conf.USERNAME,
               password_demo=conf.PASSWORD,
               db_demo="test",
               collection_str="taxon-schema")
    src.update_collection()


if __name__ == "__main__":
    main()
from urllib.request import urlretrieve
import os
import shutil

def run(ensembl_info, top_dir):
    """Downloads the CDNA for a given sample, and creates a kallisto index file.
    The CDNA file is stored in a "CDNA_FILES" subdirectory within the top directory.
    The kallisto index files are stored within a "kallisto_index_files" subdirectory
    within the top directory.

    Args:
        ensembl_info (:obj:`object`): object with ``organism_strain`` and ``url``
            attributes identifying the Ensembl cDNA file to fetch.
        top_dir (:obj:`str`): the name of the directory where the overall data is being stored.

    NOTE(review): this also changes the process working directory to ``top_dir``
    (see ``download_cdna``); ``process_cdna`` relies on that because ``kallisto
    index`` writes its .idx into the CWD before it is moved.
    """
    download_cdna(ensembl_info, top_dir)
    process_cdna(ensembl_info, top_dir)


def download_cdna(ensembl_info, top_dir):
    # Fetch the cDNA archive into top_dir, then move it into CDNA_FILES.
    # Skipped entirely when the target file already exists.

    DIRNAME = "{}/CDNA_FILES".format(top_dir)
    if not os.path.isdir(DIRNAME):
        os.makedirs(DIRNAME)
    spec_name = ensembl_info.organism_strain
    file_name = "{}/{}.cdna.all.fa.gz".format(DIRNAME, spec_name)
    url = ensembl_info.url
    if not os.path.isfile(file_name):
        # Download lands in top_dir first, then is moved into CDNA_FILES.
        file = urlretrieve(url, '{}/{}.cdna.all.fa.gz'.format(top_dir, spec_name))
        shutil.move('{}/{}.cdna.all.fa.gz'.format(top_dir, spec_name), DIRNAME)
    # Side effect: the whole process now runs from top_dir (needed by process_cdna).
    os.chdir(top_dir)

def process_cdna(ensembl_info, top_dir):
    # Build a kallisto index for the downloaded cDNA, unless it already exists.
    DIRNAME = "{}/CDNA_FILES".format(top_dir)
    file_name = "{}/{}.cdna.all.fa.gz".format(DIRNAME, ensembl_info.organism_strain)
    KALLISTO_DIR = "{}/kallisto_index_files".format(top_dir)
    if not os.path.isdir(KALLISTO_DIR):
        os.makedirs(KALLISTO_DIR)
    if not os.path.isfile("{}/{}.idx".format(KALLISTO_DIR, ensembl_info.organism_strain)):
        # kallisto writes the .idx into the CWD (set to top_dir above),
        # then it is moved into the index directory.
        os.system("kallisto index -i {}.idx {}".format(ensembl_info.organism_strain, file_name))
        shutil.move("{}/{}.idx".format(top_dir, ensembl_info.organism_strain), KALLISTO_DIR)
4 | 5 | ## Coordinating contributions 6 | 7 | Before getting started, please contact the lead developers at [info@karrlab.org](mailto:info@karrlab.org) to coordinate your planned contributions with other ongoing efforts. Please also use GitHub issues to announce your plans to the community so that other developers can provide input into your plans and coordinate their own work. As the development community grows, we will institute additional infrastructure as needed such as a leadership committee and regular online meetings. 8 | 9 | ## Repository organization 10 | 11 | Datanator follows standard Python conventions: 12 | 13 | * `datanator/`: source code 14 | * `tests/`: tests 15 | * `docs/`: documentation 16 | * `setup.py`: installation script 17 | 18 | ## Coding convention 19 | 20 | Datanator follows standard Python style conventions: 21 | 22 | * Module names: `lower_snake_case` 23 | * Class names: `UpperCamelCase` 24 | * Function names: `lower_snake_case` 25 | * Variable names: `lower_snake_case` 26 | 27 | ## Testing 28 | 29 | We strive to have complete test coverage of Datanator. As such, all contributions to Datanator should be tested. The tests are located in the `tests` subdirectory. The tests are implemented using the `unittest` module. The tests can be executed by running `pytest tests`. 30 | 31 | Upon each push to GitHub, GitHub will trigger CircleCI to execute all of the tests. 32 | 33 | ## Documentation convention 34 | 35 | Datanator is documented using the napoleon Sphinx plugin. The documentation can be compiled by running `sphinx-build docs docs/_build/html`. 36 | 37 | ## Submitting changes 38 | 39 | Please use GitHub pull requests to submit changes. Each request should include a brief description of the new and/or modified features. 40 | 41 | ## Releasing and deploying new versions 42 | 43 | Contact [info@karrlab.org](mailto:info@karrlab.org) to request release and deployment of new changes. 
'''Converts tables in a SQLite database into JSON files.

Attributes:
    database: path to the SQLite database file
    query: query execution command in string format (a prefix such as
        ``"select * from "`` that each table name is appended to)
'''

import json
import os
import sqlite3
import pprint

class SQLToJSON():
    """Export the tables of a SQLite database as lists of row dicts."""

    def __init__(self, query, cache_dirname=None):
        """
        Args:
            query (:obj:`str`): SQL prefix executed for each table.
            cache_dirname (:obj:`str`, optional): path to the SQLite database file.
        """
        self.query = query
        self.cache_dirname = cache_dirname

    def db(self):
        """Open and return a new connection to the database."""
        return sqlite3.connect(self.cache_dirname)

    # returns all the table names in a sqlite database
    def table(self):
        """Return the names of all tables, closing the connection even on error."""
        conn = self.db()
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
            return [row[0] for row in cursor.fetchall()]
        finally:
            # Close the connection itself (the original only closed it via the
            # cursor back-reference, leaking it if execute() raised).
            conn.close()

    # one : return as one json file or not
    def query_table(self, table, one=True):
        """Run ``self.query + table`` and return the rows as dicts.

        Args:
            table (:obj:`str`): table name appended to the query prefix.
            one (:obj:`bool`, optional): when True (default), an empty result
                is returned as ``None`` instead of ``[]`` (original behavior
                preserved for backward compatibility).

        Returns:
            :obj:`list` of :obj:`dict` or :obj:`None`: one dict per row,
            keyed by column name.
        """
        conn = self.db()
        try:
            cur = conn.cursor()
            cur.execute(self.query + table)
            columns = [description[0] for description in cur.description]
            rows = [dict(zip(columns, row)) for row in cur.fetchall()]
        finally:
            conn.close()
        return (rows if rows else None) if one else rows


def main():
    database = './cache/SabioRk.sqlite'
    query = "select * from "
    collection_dir = './cache/SabioRk/'
    os.makedirs(os.path.dirname(collection_dir), exist_ok=True)

    temp = SQLToJSON(query, cache_dirname=database)

    for table in temp.table():
        # os.path.join with separate components (the original concatenated
        # everything into a single argument, defeating the purpose of join).
        file_name = os.path.join(collection_dir, table + '.json')
        result = temp.query_table(table)
        with open(file_name, "w") as f:
            f.write(json.dumps(result, indent=4))

if __name__ == '__main__':
    main()
class TestKeggOrgCode(unittest.TestCase):
    """Tests for ``gene_ortholog.KeggGeneOrtholog``.

    NOTE(review): these tests appear to depend on live external services
    (``get_html``/``parse_html``/``parse_gene_info``) and a live MongoDB --
    they are not hermetic; confirm before running in CI.
    """

    @classmethod
    def setUpClass(cls):
        # Scratch directory plus MongoDB credentials from the local config;
        # writes target the 'test' database, reads come from 'datanator'.
        cls.cache_dirname = tempfile.mkdtemp()
        db = 'test'
        username = datanator.config.core.get_config()['datanator']['mongodb']['user']
        password = datanator.config.core.get_config()['datanator']['mongodb']['password']
        MongoDB = datanator.config.core.get_config()['datanator']['mongodb']['server']
        cls.src = gene_ortholog.KeggGeneOrtholog(MongoDB, des_db=db, src_db='datanator', max_entries=10, username=username, password=password,
                                                 readPreference='nearest', authSource='admin', verbose=True)
        # KEGG-style organism:gene identifier used by the parse tests.
        cls.query = 'aly:ARALYDRAFT_486312'

    @classmethod
    def tearDownClass(cls):
        # Remove scratch files and drop the collection created during the run.
        shutil.rmtree(cls.cache_dirname)
        cls.src.des_db.drop_collection(cls.src.collection_str)

    @unittest.skip('passed')
    def test_parse_html(self):
        # Smoke test: print up to max_entries parsed results.
        soup = self.src.get_html(self.query)
        results = self.src.parse_html(soup)
        for i, result in enumerate(results):
            if i == self.src.max_entries:
                break
            print(result)

    @unittest.skip('passed')
    def test_uniprot_to_org_gene(self):
        # Maps a UniProt accession to its KEGG organism:gene identifier.
        uniprot_id = 'Q05758'
        result = self.src.uniprot_to_org_gene(uniprot_id)
        self.assertEqual('ath:AT3G58610', result)
        uniprot_id = 'Q8N7E2'
        result = self.src.uniprot_to_org_gene(uniprot_id)
        print(result)

    def test_parse_gene_info(self):
        # Expects the sorted list of sequence accessions for this gene id.
        result = self.src.parse_gene_info('100008727')
        self.assertEqual(['AAD18037.1', 'AAD38154.1', 'AFS49951.1', 'NP_001075529.1', 'Q9XSZ4.1', 'XP_008265676.1', 'XP_017202733.1'], result)
import functools
import time

def timemethod(method):
    """Decorator that reports the wall-clock runtime of a bound method.

    Messages are printed only when the bound instance (``args[0]``) has a
    truthy ``verbose`` attribute. Uses ``functools.wraps`` so the wrapped
    function keeps its name/docstring (the original decorator lost them).
    """

    @functools.wraps(method)
    def timed(*args, **kw):
        if args[0].verbose:
            print('\n------------------------ Initializing %r ------------------------' % (method.__name__))
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()

        if args[0].verbose:
            print('%r took %2.2f sec' % \
                (method.__name__, te-ts))
            print('%r completed' % (method.__name__))
        return result

    return timed

def timeloadcontent(method):
    """Decorator that brackets a whole build with start/finish banners and
    reports total time. Printing is gated on ``args[0].verbose`` like
    :func:`timemethod`.
    """

    @functools.wraps(method)
    def timed(*args, **kw):
        if args[0].verbose:
            print(''' \n
            ===================================
            |                                 |
            |                                 |
            |    Starting Datanator Build     |
            |                                 |
            |                                 |
            ===================================

            ''')

        ts = time.time()
        result = method(*args, **kw)
        te = time.time()

        if args[0].verbose:
            print(''' \n
            =============================================
            |                                           |
            |              Finished Build               |
            Total time taken for build: %2.2f secs
            |                                           |
            =============================================
            ''' % (te - ts))

        return result

    return timed


def continuousload(method):
    """Decorator that swallows any exception from ``method`` (printing it)
    so a long-running load can continue past individual failures.
    Returns ``None`` when an exception was caught.
    """

    @functools.wraps(method)
    def continuous(*args, **kw):
        try:
            result = method(*args, **kw)
            return result
        except Exception as e:
            print(e)

    return continuous
cls.cache_dirname = tempfile.mkdtemp() 13 | cls.db = 'test' 14 | username = datanator.config.core.get_config()['datanator']['mongodb']['user'] 15 | password = datanator.config.core.get_config()['datanator']['mongodb']['password'] 16 | MongoDB = datanator.config.core.get_config()['datanator']['mongodb']['server'] 17 | port = datanator.config.core.get_config()['datanator']['mongodb']['port'] 18 | replSet = datanator.config.core.get_config()['datanator']['mongodb']['replSet'] 19 | cls.src = index_collection.IndexCollection( 20 | cache_dirname = cls.cache_dirname, MongoDB = MongoDB, 21 | replicaSet = replSet, db = cls.db, verbose=True, max_entries=20, 22 | username = username, password = password) 23 | 24 | @classmethod 25 | def tearDownClass(cls): 26 | shutil.rmtree(cls.cache_dirname) 27 | 28 | @unittest.skip('passed') 29 | def test_index_corum(self): 30 | col_str = 'corum' 31 | self.src.index_corum(col_str) 32 | client, _, collection = self.src.con_db(col_str) 33 | self.assertEqual(len(list(collection.list_indexes())), 4) 34 | client.close() 35 | 36 | @unittest.skip('passed') 37 | def test_index_sabio(self): 38 | col_str = 'sabio_rk' 39 | self.src.index_sabio(col_str) 40 | client,_,collection = self.src.con_db(col_str) 41 | self.assertEqual( len(list(collection.list_indexes())), 11) # 10 + 1 42 | client.close() 43 | 44 | @unittest.skip('passed') 45 | def test_index_uniprot(self): 46 | col_str = 'uniprot' 47 | self.src.index_uniprot(col_str) 48 | client,_,collection = self.src.con_db(col_str) 49 | self.assertEqual( len(list(collection.list_indexes())), 3) # 2 + 1 50 | client.close() -------------------------------------------------------------------------------- /tests/data_source/rna_halflife/test_order_by_ko.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from datanator.data_source.rna_halflife import order_by_ko 3 | from datanator_query_python.config import config 4 | 5 | 6 | class 
import re
import setuptools
import subprocess
import sys

# Bootstrap: make sure pkg_utils >= 0.0.5 is importable before we use it to
# read the package metadata below.
try:
    result = subprocess.run(
        [sys.executable, "-m", "pip", "show", "pkg_utils"],
        check=True, capture_output=True)
    match = re.search(r'\nVersion: (.*?)\n', result.stdout.decode(), re.DOTALL)
    # Compare version components numerically: as strings, ('0', '10', '0')
    # would sort *below* ('0', '0', '5'). A non-numeric component (e.g. a
    # pre-release tag) raises ValueError and also triggers the reinstall.
    assert match and tuple(int(part) for part in match.group(1).split('.')) >= (0, 0, 5)
except (subprocess.CalledProcessError, AssertionError, ValueError):
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "-U", "pkg_utils"],
        check=True)
import os
import pkg_utils

name = 'datanator'
dirname = os.path.dirname(__file__)

# get package metadata
md = pkg_utils.get_package_metadata(dirname, name)

# install package
setuptools.setup(
    name=name,
    version=md.version,
    description='Finds relevant kinetic data for biochemical models',
    long_description=md.long_description,

    url='https://github.com/KarrLab/' + name,
    download_url='https://github.com/KarrLab/' + name,
    license='MIT',

    author='Karr Lab',
    author_email='members@karrlab.org',

    keywords=['kinetic data', 'systems biology', 'computational biology', ],
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.6',
    ],

    packages=setuptools.find_packages(exclude=['tests', 'tests.*']),
    package_data={
        name: [
            'config/core.schema.cfg',
            'config/core.default.cfg',
            'data_source/*.txt',
            'data/*.txt',
            'data/*.xlsx',
        ],
    },
    entry_points={
        'console_scripts': [
            'datanator = datanator.__main__:main',
        ],
    },

    install_requires=md.install_requires,
    extras_require=md.extras_require,
    tests_require=md.tests_require,
    dependency_links=md.dependency_links,
)
import unittest
from datanator.util import mongo_util
import datanator.config.core
import tempfile
import shutil


class TestMongoUtil(unittest.TestCase):
    """Integration tests for :class:`datanator.util.mongo_util.MongoUtil`.

    These require a reachable MongoDB instance whose credentials come from
    the datanator package configuration; they are not isolated unit tests.
    """

    @classmethod
    def setUpClass(cls):
        # Scratch directory handed to MongoUtil; removed in tearDownClass.
        cls.cache_dirname = tempfile.mkdtemp()
        cls.db = 'datanator'
        username = datanator.config.core.get_config()['datanator']['mongodb']['user']
        password = datanator.config.core.get_config()['datanator']['mongodb']['password']
        MongoDB = datanator.config.core.get_config()['datanator']['mongodb']['server']
        # NOTE(review): 'port' is read here but never passed to MongoUtil below.
        port = datanator.config.core.get_config()['datanator']['mongodb']['port']
        replSet = datanator.config.core.get_config()['datanator']['mongodb']['replSet']
        cls.src = mongo_util.MongoUtil(
            cache_dirname = cls.cache_dirname, MongoDB = MongoDB,
            replicaSet = replSet, db = cls.db, verbose=True, max_entries=20,
            username = username, password = password)
        cls.collection_str = 'ecmdb'


    @classmethod
    def tearDownClass(cls):
        # Only the scratch directory is cleaned up here; the MongoDB client
        # is left for process teardown to dispose of.
        shutil.rmtree(cls.cache_dirname)


    # @unittest.skip('passed')
    def test_list_all_collections(self):
        """The 'datanator' database should expose an 'ecmdb' collection."""
        self.assertTrue('ecmdb' in self.src.list_all_collections())


    # @unittest.skip('passed')
    def test_con_db(self):
        """con_db should return a connection, not its failure string."""
        self.assertNotEqual(self.src.con_db(self.db), 'Server not available')

    @unittest.skip('passed')
    def test_fill_db(self):
        # NOTE(review): self.collection_obj is never defined anywhere in this
        # class, so re-enabling this skipped test would raise AttributeError.
        # Cursor.count() is also removed in modern PyMongo; use
        # collection.count_documents({}) when reviving it.
        self.collection_obj.drop()
        self.assertEqual(self.collection_obj.find().count(), 0)
        collection_obj = self.src.fill_db(self.collection_str)
        self.assertNotEqual(collection_obj.find().count(), 0)

    # @unittest.skip('passed')
    def test_print_schema(self):
        """Spot-check the inferred JSON schema of the 'ecmdb' collection."""
        a = self.src.print_schema('ecmdb')
        self.assertEqual(a['properties']['creation_date'], {'type': 'string'})
        self.assertEqual(a['properties']['synonyms'], {'type': 'object', 'properties': {'synonym': {'type': 'array',
                         'items': {'type': 'string'}}}, 'required': ['synonym']})
from datanator_query_python.util import mongo_util
from datanator_query_python.config import config


class QueryDemo(mongo_util.MongoUtil):
    """Demo queries over a collection annotated with NCBI taxon lineage."""

    def __init__(self, MongoDB=None,
                 db=None,
                 collection_str=None,
                 password=None,
                 username=None,
                 max_entries=20):
        """
        Args:
            MongoDB (:obj:`str`, optional): MongoDB server address.
            db (:obj:`str`, optional): name of the database.
            collection_str (:obj:`str`, optional): name of the collection to query.
            username (:obj:`str`, optional): MongoDB username.
            password (:obj:`str`, optional): MongoDB password.
            max_entries (:obj:`int`, optional): cap on documents returned by queries.
        """
        super().__init__(MongoDB=MongoDB,
                         db=db,
                         username=username,
                         password=password)
        self.collection = self.db_obj[collection_str]
        self.max_entries = max_entries

    def get_canon_ancestors(self, tax_id):
        """Look up the canonical ancestors of one organism.

        Args:
            tax_id (:obj:`int`): Taxon ID of organism.

        Return:
            (:obj:`list` of :obj:`Obj`): one ``{"ncbi_taxonomy_id", "name"}``
            object per canonical ancestor; empty if the taxon is not found.
        """
        doc = self.collection.find_one(
            filter={"tax_id": tax_id},
            projection={"canon_anc_ids": 1, "canon_anc_names": 1,
                        "_id": 0})
        if doc is None:
            return []
        return [{"ncbi_taxonomy_id": anc_id, "name": anc_name}
                for anc_id, anc_name in zip(doc["canon_anc_ids"],
                                            doc["canon_anc_names"])]

    def demo_find(self, tax_id):
        """Find organisms that have ``tax_id`` among their canonical ancestors.

        Args:
            tax_id (:obj:`int`): Ancestor taxon ID.

        Return:
            (:obj:`list`): up to ``self.max_entries`` matching documents.
        """
        cursor = self.collection.find(
            filter={"canon_anc_ids": tax_id},
            projection={"canon_anc_ids": 1, "canon_anc_names": 1,
                        "_id": 0})
        matches = []
        if cursor is None:  # kept for parity with the original defensive check
            return matches
        for n_seen, doc in enumerate(cursor):
            if n_seen == self.max_entries:
                break
            matches.append(doc)
        return matches
import unittest
from datanator.data_source import ec
import datanator.config.core
import shutil
import tempfile
from pathlib import Path


class TestEC(unittest.TestCase):
    """Tests for the enzyme-classification (EC) flat-file data source.

    The FTP and parsing cases are skipped (network / environment bound);
    only the pure parsing test ``test_make_doc`` runs everywhere.
    """

    @classmethod
    def setUpClass(cls):
        # Temporary download directory for enzyme.dat; removed on teardown.
        cls.cache_dir = tempfile.mkdtemp()
        db = 'test'
        username = datanator.config.core.get_config()['datanator']['mongodb']['user']
        password = datanator.config.core.get_config()['datanator']['mongodb']['password']
        MongoDB = datanator.config.core.get_config()['datanator']['mongodb']['server']
        cls.src = ec.EC(server=MongoDB, db=db, username=username, password=password, authSource='admin',
                        readPreference='nearest', max_entries=20, cache_dir=cls.cache_dir)

    @classmethod
    def tearDownClass(cls):
        shutil.rmtree(cls.cache_dir)
        # Drop the scratch collection before closing the client.
        cls.src.db.drop_collection(cls.src.collection_str)
        cls.src.client.close()

    @unittest.skip('IP')
    def test_establish_ftp(self):
        """The FTP server's listing should contain enzyme.dat."""
        ftp = self.src.establish_ftp()
        self.assertTrue('enzyme.dat' in ftp.nlst())

    @unittest.skip('IP')
    def test_retrieve_content(self):
        """retrieve_content should download enzyme.dat into the cache dir."""
        p = Path(self.cache_dir+'/enzyme.dat')
        self.src.retrieve_content()
        self.assertTrue(p.exists())

    @unittest.skip('circle directory error.')
    def test_parse_content(self):
        # Relies on a developer-local copy of enzyme.dat; not portable to CI.
        location = str(Path('~/karr_lab/datanator/docs/enzyme.dat').expanduser())
        self.src.parse_content(location)

    def test_make_doc(self):
        """make_doc should turn one ENZYME-format record into a document."""
        lines = ["ID 1.1.1.1", "DE Alcohol dehydrogenase.", "AN Aldehyde reductase.",
                 "CA (1) A primary alcohol + NAD(+) = an aldehyde + NADH.", "CA (2) A secondary alcohol + NAD(+) = a ketone + NADH.",
                 "CF Zn(2+) or Fe cation."]
        result = self.src.make_doc(lines)
        # Trailing periods are stripped from names, synonyms, and activities.
        self.assertEqual(result, {'ec_number': '1.1.1.1', 'ec_name': 'Alcohol dehydrogenase',
                                  'ec_synonyms': ['Aldehyde reductase'],
                                  'catalytic_activity': ['(1) A primary alcohol + NAD(+) = an aldehyde + NADH', '(2) A secondary alcohol + NAD(+) = a ketone + NADH'],
                                  'cofactor': 'Zn(2+) or Fe cation'})
def tearDownClass(cls): 30 | cls.src.uniprot_collection_manager.db_obj.drop_collection(cls.protein_col) 31 | cls.src.db_obj.drop_collection(cls.rna_col) 32 | cls.src.uniprot_collection_manager.client.close() 33 | cls.src.client.close() 34 | cls.src.uniprot_query_manager.client.close() 35 | 36 | @unittest.skip('avoid downloading') 37 | def test_fill_uniprot(self): 38 | url_0 = 'https://oup.silverchair-cdn.com/oup/backfile/Content_public/Journal/nar/41/1/10.1093/nar/gks1019/2/gks1019-nar-00676-a-2012-File003.xlsx?Expires=1578425844&Signature=ZRFUxLdn4-vaBt5gQci~0o56KqyR9nJj9i32ig5X6YcfqiJeV3obEq8leHGdDxx6w~KABgewiQ66HTB7gmuG~2GL-YgxPKYSjt17WrYMkc-0ibw6TMlTvWZZfvw-lPe~wvpmVfNEXnTbP7jHyNLu9jeJ6yhoXvgIyQtzA5PbEI1fyXEgeZzOKMltmITqL3g3APsPsagCTC66rwrBT23Aghh6D314uilT2DZHCc68MH2nyV~qAhFqIQiOj-7VTEKqkDPvPYvuE2KNKXdvW23gk100YV~58ozbt8ijRz5Gr5gPtE~f1Ab5l260EIbWHJNabMRleInJQqUIDPFN4C38PQ__&Key-Pair-Id=APKAIE5G5CRDK6RD3PGA' 39 | df_0 = self.src.fill_uniprot(url_0, 'Supplementary Table 1') 40 | self.assertEqual(df_0.iloc[0]['ordered_locus_name'], 'Rv0002') 41 | 42 | def test_fill_rna_halflife(self): 43 | d = {'half_life': [32.3, 12.2, 13.2], 'r_squared': [0.9, 0.7, 0.8], 44 | 'ordered_locus_name': ['Rv0002', 'something', 'this']} 45 | df_0 = pd.DataFrame(d) 46 | self.src.fill_rna_halflife(df_0, ['aaa', 102]) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .pytest_cache/ 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | src/ 27 | *.DS_Store 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # 
# MongoDB stuff:
db_volumes/
/tests/data_source/rna_halflife/test_doi_10_1186_gb_2012_13_4_r30.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from datanator.data_source.rna_halflife import doi_10_1186_gb_2012_13_4_r30 3 | import tempfile 4 | import shutil 5 | import json 6 | import os 7 | from datanator_query_python.config import config 8 | import pandas as pd 9 | 10 | 11 | class TestProteinAggregate(unittest.TestCase): 12 | 13 | @classmethod 14 | def setUpClass(cls): 15 | des_db = 'test' 16 | src_db = 'datanator' 17 | cls.protein_col = 'uniprot' 18 | cls.rna_col = 'rna_halflife' 19 | conf = config.TestConfig() 20 | username = conf.USERNAME 21 | password = conf.PASSWORD 22 | MongoDB = conf.SERVER 23 | cls.src = doi_10_1186_gb_2012_13_4_r30.Halflife(server=MongoDB, src_db=src_db, 24 | protein_col=cls.protein_col, authDB='admin', readPreference='nearest', 25 | username=username, password=password, verbose=True, max_entries=20, 26 | des_db=des_db, rna_col=cls.rna_col) 27 | 28 | @classmethod 29 | def tearDownClass(cls): 30 | cls.src.uniprot_collection_manager.db_obj.drop_collection(cls.protein_col) 31 | cls.src.db_obj.drop_collection(cls.rna_col) 32 | cls.src.uniprot_collection_manager.client.close() 33 | cls.src.client.close() 34 | cls.src.uniprot_query_manager.client.close() 35 | 36 | @unittest.skip('passed') 37 | def test_load_uniprot(self): 38 | self.src.load_uniprot() 39 | 40 | def test_fill_rna_half_life(self): 41 | url = """https://static-content.springer.com/esm/art%3A10.1186%2Fgb-2012-13-4-r30/MediaObjects/13059_2011_2880_MOESM3_ESM.XLSX""" 42 | names = ['ordered_locus_name', 'half_life_ga_2', 'reads_per_kb_per_mb', 43 | 'transcriptional_start_sites', 'transcriptional_end_sites', 'operon', 44 | 'gene_start', 'gene_end', 'strand', 'gene_name', 'protein_annotation', 45 | 'cog', 'kegg', 'half_life_qpcr', 'half_life_454'] 46 | df_10987 = self.src.make_df(url, 'Bc10987', names=names, usecols='A:O', skiprows=[0,1], 
import unittest
from datanator.elasticsearch_kl import batch_load
from datanator_query_python.config import config
import tempfile
import shutil
import requests

class TestMongoToES(unittest.TestCase):
    """Integration tests moving data from MongoDB into AWS Elasticsearch.

    Requires live AWS credentials (~/.wc/third_party/...) and a reachable
    MongoDB; these are not isolated unit tests.
    """

    @classmethod
    def setUpClass(cls):
        cls.cache_dir = tempfile.mkdtemp()
        cls.src = batch_load.MongoToES(profile_name='es-poweruser', credential_path='~/.wc/third_party/aws_credentials',
                                       config_path='~/.wc/third_party/aws_config', elastic_path='~/.wc/third_party/elasticsearch.ini',
                                       cache_dir=cls.cache_dir, service_name='es', index='test', max_entries=float('inf'), verbose=True)
        cls.url = cls.src.es_endpoint + '/' + cls.src.index
        # Start from a clean 'test' index; deleted again in tearDownClass.
        requests.delete(cls.url, auth=cls.src.awsauth)
        conf = config.Config()
        cls.username = conf.USERNAME
        cls.password = conf.PASSWORD
        cls.server = conf.SERVER
        cls.authDB = conf.AUTHDB
        cls.db = 'datanator'

    @classmethod
    def tearDownClass(cls):
        shutil.rmtree(cls.cache_dir)
        requests.delete(cls.url, auth=cls.src.awsauth)

    def test_connection(self):
        """AWS ES domain should be listable and named as expected."""
        result = self.src.client.list_domain_names()
        self.assertEqual(result['ResponseMetadata']['HTTPStatusCode'], 200)
        self.assertTrue('datanator-elasticsearch' in self.src.es_endpoint)

    def test_data_from_mongo(self):
        """Protein export should yield a substantial document count."""
        count, _ = self.src.data_from_mongo_protein(self.server, self.db, self.username,
                                                    self.password, authSource=self.authDB)
        self.assertTrue(count >= 1000)

    def test_data_from_metabolite(self):
        """Both metabolite sources should yield substantial counts."""
        _, count_0, _, count_1 = self.src.data_from_mongo_metabolite(self.server, self.db, self.username,
                                                                     self.password, authSource=self.authDB)
        self.assertTrue(count_0 >= 1000)
        self.assertTrue(count_1 >= 1000)

    def test_data_from_metabolites_meta(self):
        doc = self.src.data_from_mongo_metabolites_meta(self.server, self.db, self.username,
                                                        self.password, authSource=self.authDB)
        # NOTE(review): this appends the same object five times, so the
        # length assertion is trivially true; it likely was meant to iterate
        # the returned documents instead — confirm intent before re-use.
        result = []
        for i in range(5):
            result.append(doc)
        self.assertEqual(len(result), 5)
cls.src.db_obj.drop_collection(cls.rna_col) 34 | cls.src.uniprot_collection_manager.client.close() 35 | cls.src.client.close() 36 | cls.src.uniprot_query_manager.client.close() 37 | 38 | @unittest.skip('downloading of file forbidden from nonacademic IP') 39 | def test_fill_rna_half_life(self): 40 | url = 'https://oup.silverchair-cdn.com/oup/backfile/Content_public/Journal/nar/42/4/10.1093_nar_gkt1150/1/gkt1150_Supplementary_Data.zip?Expires=1578928721&Signature=ADjsCSaceimzGs6aJ~uG7np88TzHNooAoBabdm-6utYVIZOEwRbzTdiBp~76vM4KEHz9Nir8GNrtA3AwHwGFm0bu~aorTG4xrOChS6UgfBQiUtgr8vfbDIUno1y1nxLGCKIfQrb2Gx-SVnigum2gjcveymK995zadSNZqN~w-vz-Ii0a6fH7kvKN8m9vLWf6fdo0NXSmgnkjj9KPCuS-bmK0y4ZH5Ex0Rl4qi5uCroYmDBNOhXY23pcalbpFwB1-07tA3~756gZN4Mo9uMeSVQKl5UsHzx5amB6WvSCXS8z756YoaaMCg0FQbUCcQ46fRGdHxcvPNcrPo5IMEGmi8g__&Key-Pair-Id=APKAIE5G5CRDK6RD3PGA' 41 | df_s1 = self.src.make_df(url, 'TableS1', names=['oln', 'gene_symbol', 'a', 'vc_a', 'b', 'vc_b', 'c', 'vc_c', 'd', 'vc_d'], usecols='A,B,L:S', 42 | skiprows=list(range(0, 7)), file_type='zip', file_name='nar-01935-a-2013-File011.xlsx') 43 | self.src.fill_rna_half_life(df_s1, ['Escherichia coli str. K-12 substr. 
MG1655', 511145]) -------------------------------------------------------------------------------- /tests/data_source/rna_halflife/test_doi_10_1101_gr_131037_111.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from datanator.data_source.rna_halflife import doi_10_1101_gr_131037_111 3 | import tempfile 4 | import shutil 5 | import json 6 | import os 7 | from datanator_query_python.config import config 8 | import pandas as pd 9 | 10 | 11 | class TestProteinAggregate(unittest.TestCase): 12 | 13 | @classmethod 14 | def setUpClass(cls): 15 | des_db = 'test' 16 | src_db = 'datanator' 17 | cls.protein_col = 'uniprot' 18 | cls.rna_col = 'rna_halflife' 19 | conf = config.TestConfig() 20 | username = conf.USERNAME 21 | password = conf.PASSWORD 22 | MongoDB = conf.SERVER 23 | cls.src = doi_10_1101_gr_131037_111.Halflife(server=MongoDB, src_db=src_db, 24 | protein_col=cls.protein_col, authDB='admin', readPreference='nearest', 25 | username=username, password=password, verbose=True, max_entries=20, 26 | des_db=des_db, rna_col=cls.rna_col) 27 | 28 | @classmethod 29 | def tearDownClass(cls): 30 | cls.src.uniprot_collection_manager.db_obj.drop_collection(cls.protein_col) 31 | cls.src.db_obj.drop_collection(cls.rna_col) 32 | cls.src.uniprot_collection_manager.client.close() 33 | cls.src.client.close() 34 | cls.src.uniprot_query_manager.client.close() 35 | 36 | @unittest.skip('avoid downloading') 37 | def test_fill_uniprot(self): 38 | url_0 = 
'https://oup.silverchair-cdn.com/oup/backfile/Content_public/Journal/nar/41/1/10.1093/nar/gks1019/2/gks1019-nar-00676-a-2012-File003.xlsx?Expires=1578425844&Signature=ZRFUxLdn4-vaBt5gQci~0o56KqyR9nJj9i32ig5X6YcfqiJeV3obEq8leHGdDxx6w~KABgewiQ66HTB7gmuG~2GL-YgxPKYSjt17WrYMkc-0ibw6TMlTvWZZfvw-lPe~wvpmVfNEXnTbP7jHyNLu9jeJ6yhoXvgIyQtzA5PbEI1fyXEgeZzOKMltmITqL3g3APsPsagCTC66rwrBT23Aghh6D314uilT2DZHCc68MH2nyV~qAhFqIQiOj-7VTEKqkDPvPYvuE2KNKXdvW23gk100YV~58ozbt8ijRz5Gr5gPtE~f1Ab5l260EIbWHJNabMRleInJQqUIDPFN4C38PQ__&Key-Pair-Id=APKAIE5G5CRDK6RD3PGA' 39 | df_0 = self.src.fill_uniprot(url_0, 'Supplementary Table 1') 40 | self.assertEqual(df_0.iloc[0]['ordered_locus_name'], 'Rv0002') 41 | 42 | def test_fill_rna_halflife(self): 43 | url = """https://genome.cshlp.org/content/suppl/2012/02/06/gr.131037.111.DC1/Supp_Table_2.xlsx""" 44 | usecols = 'B,L,M,N,O,P,AC,AD,AR,AT,AU' 45 | df_0 = self.src.make_df(url, 'V1ncodemouse_probe_annotations_', header=0, usecols=usecols, nrows=34509) 46 | self.src.fill_rna_half_life(df_0, ['Mus musculus', 10090]) -------------------------------------------------------------------------------- /datanator/config/core.py: -------------------------------------------------------------------------------- 1 | """ Configuration 2 | 3 | :Author: Jonathan Karr 4 | :Date: 2017-05-13 5 | :Copyright: 2017, Karr Lab 6 | :License: MIT 7 | """ 8 | 9 | import configobj 10 | import os 11 | import pkg_resources 12 | import wc_utils.config.core 13 | import wc_utils.debug_logs.config 14 | 15 | 16 | def get_config(extra=None): 17 | """ Get configuration 18 | 19 | Args: 20 | extra (:obj:`dict`, optional): additional configuration to override 21 | 22 | Returns: 23 | :obj:`configobj.ConfigObj`: nested dictionary with the configuration settings loaded from the configuration source(s). 
def get_mongo_config():
    """ Get a configuration to pass directly into the MongoDB util constructors.

    Reads the ``datanator.mongodb`` section of the package configuration once
    and repackages it as constructor keyword arguments.

    NOTE(review): the configured ``port`` is read from the same section by
    other callers but is deliberately not included here, matching the keys
    the util constructors accept — confirm if a port override is ever needed.

    Returns:
        :obj:`dict`: ``MongoDB`` (server address), ``username``, ``password``,
        and ``replicaSet`` parameters for the MongoDB util constructor.
    """
    # Parse the configuration once rather than per field.
    mongodb = get_config()['datanator']['mongodb']
    return {
        "MongoDB": mongodb['server'],
        "username": mongodb['user'],
        "password": mongodb['password'],
        "replicaSet": mongodb['replSet'],
    }
62 | """ 63 | paths = wc_utils.debug_logs.config.paths.deepcopy() 64 | paths.user = ( 65 | 'datanator.debug.cfg', 66 | os.path.expanduser('~/.wc/datanator.debug.cfg'), 67 | ) 68 | return wc_utils.config.core.ConfigManager(paths).get_config(extra=extra) 69 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ``datanator`` 2 | ===================== 3 | ``datanator`` is a software tool for finding experimental data for building and calibrating dynamical models of cellular biochemistry such as metabolite, RNA, and protein abundances; protein complex compositions; transcription factor binding motifs; and kinetic parameters. ``datanator`` is particularly useful for building large models, such as whole-cell models, that require large amounts of data to constrain large numbers of parameters. ``datanator`` was motivated by the need for large amounts of data to constrain whole-cell models and the fact that this data is hard to utilize because it is scattered across numerous siloed repositories. 
4 | 5 | ``datanator`` currently supports the following data types and data sources: 6 | 7 | * Metabolite concentrations: `ECMDB `_ and `YMBD `_ 8 | * RNA abundance: `ArrayExpress `_ 9 | * Protein abundance: `PaxDb `_ 10 | * Protein complex composition: `CORUM `_ 11 | * Transcription factor binding motifs: `JASPAR `_ 12 | * Reaction kinetics: `SABIO-RK `_ 13 | * Taxonomy: `NCBI Taxonomy `_ 14 | 15 | ``datanator`` (1) downloads these repositories; (2) normalizes their data to a common ontology and units; (3) stores their data to a local SQLite database; and (4) provides a Python API for (a) finding relevant data to model a specific organism and environmental condition from similar species, reactions, genotypes (taxon, variant), and environments (temperature, pH, media), and (b) reducing multiple relevant observations to a single consensus recommended parameter value, and (c) exporting these consensus recommendations and their provenance to an Excel workbook. To make ``datanator`` easier to use, we plan to develop user-friendly command line and web-based interfaces for finding data for SBML-encoded models. 16 | 17 | ``datanator`` is under active development and is not yet ready for end users. Please check back soon for updates. 18 | 19 | This website contains detailed documentation of the ``datanator`` source code. Going forward, this website will also contain detailed instructions and tutorials on how to use ``datanator``. 20 | 21 | Contents 22 | -------- 23 | 24 | .. 
toctree:: 25 | :maxdepth: 3 26 | :numbered: 27 | 28 | intro 29 | installation 30 | tutorial 31 | API documentation 32 | about 33 | references.rst 34 | -------------------------------------------------------------------------------- /tests/data_source/brenda/test_reaction.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import shutil 3 | import tempfile 4 | from datanator.data_source.brenda import reaction 5 | from datanator_query_python.config import config 6 | import pandas as pd 7 | 8 | 9 | class TestBrendaRxn(unittest.TestCase): 10 | 11 | @classmethod 12 | def setUpClass(cls): 13 | conf = config.TestConfig() 14 | cls.collection_str = 'brenda_reaction' 15 | username = conf.USERNAME 16 | password = conf.PASSWORD 17 | MongoDB = conf.SERVER 18 | cls.src = reaction.BrendaRxn(MongoDB=MongoDB, db='test', collection_str=cls.collection_str, 19 | username=username, password=password, authSource='admin', 20 | max_entries=20, verbose=True) 21 | 22 | @classmethod 23 | def tearDownClass(cls): 24 | cls.src.db_obj.drop_collection(cls.collection_str) 25 | cls.src.client.close() 26 | 27 | # @unittest.skip('passed') 28 | def test_download_and_read(self): 29 | result = self.src.download_and_read() 30 | self.assertEqual(result['ec_number'][1], '6.3.2.1') 31 | 32 | def test_clean_up(self): 33 | result = self.src.download_and_read() 34 | exp = self.src.clean_up(result) 35 | self.assertEqual(exp['reaction_id_brenda'][1], ['BR101']) 36 | self.assertEqual(exp['reaction_id_sabio_rk'][1], 2406) 37 | 38 | # @unittest.skip('passed') 39 | def test_parse_reaction(self): 40 | df = pd.DataFrame({'reaction': ['ATP + (R)-pantoate + beta-alanine <=> AMP + diphosphate + (R)-pantothenate', 41 | 'ATP + Detyrosinated alpha-tubulin + L-Tyrosine = alpha-Tubulin + ADP + Orthophosphate']}) 42 | result = self.src.parse_reaction(df) 43 | self.assertEqual(result['products'][1][1], 'ADP') 44 | self.assertEqual(result['substrates'][0][1], 
class ParseMetaboliteConcentration(mongo_util.MongoUtil):
    """Load curated metabolite-concentration JSON files from GitHub into MongoDB."""

    # metabolites with curated concentration files in the datanator repository
    METABOLITES = ["ATP", "CTP", "GMP", "GTP", "IMP", "NAD", "NADH",
                   "NADP", "NADPH", "TTP", "UTP"]

    def __init__(self,
                 MongoDB=None,
                 db=None,
                 collection=None,
                 max_entries=float('inf'),
                 username=None,
                 password=None,
                 authSource='admin',
                 readPreference='nearest'):
        """
        Args:
            MongoDB (:obj:`str`, optional): MongoDB server address
            db (:obj:`str`, optional): name of the database
            collection (:obj:`str`, optional): name of the destination collection
            max_entries (:obj:`int` or :obj:`float`, optional): maximum number of
                metabolites to load
            username (:obj:`str`, optional): MongoDB username
            password (:obj:`str`, optional): MongoDB password
            authSource (:obj:`str`, optional): MongoDB authentication database
            readPreference (:obj:`str`, optional): MongoDB read preference
        """
        super().__init__(MongoDB=MongoDB, db=db,
                         username=username,
                         password=password,
                         authSource=authSource,
                         readPreference=readPreference)
        self.max_entries = max_entries
        self.collection = collection

    def parse_metabolite(self):
        """Read JSON metabolite concentration files from GitHub and insert a
        separate document for each metabolite into the MongoDB collection.

        Each document is created with the metabolite's InChIKey and its
        concentration records are attached with ``$addToSet``/``$each``, which
        skips duplicates and preserves order (same result as adding the records
        one at a time, but in a single round trip per metabolite).
        """
        collection = self.db_obj[self.collection]
        base_url = ("https://raw.githubusercontent.com/KarrLab/datanator/"
                    "tutorial/docs/metabolites/")
        for i, name in enumerate(self.METABOLITES):
            if i >= self.max_entries:  # honor the (previously ignored) limit
                break
            with urllib.request.urlopen(base_url + name + ".json") as response:
                data = json.loads(response.read().decode())
            collection.insert_one({"inchikey": data['inchikey']})
            collection.update_one(
                {"inchikey": data['inchikey']},
                {"$addToSet": {"concentrations": {"$each": data['concentrations']}}})


def main():
    """Load all curated metabolite concentration files into datanator-demo."""
    conf = config.Victoria()
    conf_main = config.Config()
    username = conf.USERNAME
    password = conf.PASSWORD
    MongoDB = conf_main.SERVER
    src = ParseMetaboliteConcentration(MongoDB=MongoDB,
                                       username=username,
                                       password=password,
                                       collection="metabolite_concentration",
                                       db="datanator-demo")
    src.parse_metabolite()


if __name__ == '__main__':
    main()
class TestProteinAggregate(unittest.TestCase):
    # NOTE(review): despite its name, this suite tests the RNA half-life
    # parser for doi:10.1186/s12864-016-3219-8 -- the class name looks
    # copied from another test module; confirm before renaming.

    @classmethod
    def setUpClass(cls):
        # temporary workspace; the parser's log file is written here
        cls.cache_dirname = tempfile.mkdtemp()
        cache_dir = os.path.join(cls.cache_dirname, 'logs.txt')
        des_db = 'test'
        db = 'datanator'
        cls.collection_str = 'test_rna_halflife'
        # MongoDB credentials come from the datanator configuration files
        username = datanator.config.core.get_config()[
            'datanator']['mongodb']['user']
        password = datanator.config.core.get_config(
        )['datanator']['mongodb']['password']
        server = datanator.config.core.get_config(
        )['datanator']['mongodb']['server']
        cls.src = doi_10_1186_s12864_016_3219_8.Halflife(username=username, password=password, server=server,
                                                         authDB='admin',max_entries=100, uniprot_col_db=des_db,
                                                         verbose=True, collection_str=cls.collection_str, db=db,
                                                         cache_dir=cache_dir)

    @classmethod
    def tearDownClass(cls):
        # remove the temp directory and drop the collection created by the tests
        shutil.rmtree(cls.cache_dirname)
        cls.src.db.drop_collection(cls.collection_str)
        cls.src.client.close()

    def test_download_xlsx(self):
        # 'MeOH' presumably selects the methanol growth-condition worksheet
        # of the paper's supplementary file -- confirm against the parser
        result = self.src.download_xlsx('MeOH')
        self.assertEqual(result['gene_fragment'][0], 'MA0001')

    # @unittest.skip('passed')
    def test_load_halflife(self):
        # load one growth condition, then merge a second into the same docs
        df = self.src.download_xlsx('MeOH')
        self.src.load_halflife(df)
        df = self.src.download_xlsx('TMA')
        self.src.add_to_halflife(df)

    # @unittest.skip('passed')
    def test_fill_gene_protein_name(self):
        # after back-filling, no document should keep the '-' placeholder name
        self.src.fill_gene_protein_name()
        result = self.src.collection.find_one({'gene_name': '-'})
        self.assertIsNone(result)

    # @unittest.skip('passed')
    def test_fill_protein_name(self):
        # every document with a real gene name should also have a protein name
        self.src.fill_protein_name()
        result = self.src.collection.find_one({'$and':[{'gene_name': {'$ne': '-'}},
                                                       {'protein_name': {'$exists': False}},
                                                       {'gene_name': {'$exists': True}}]})
        self.assertIsNone(result)

    # @unittest.skip('passed')
    def test_fill_uniprot_by_oln(self):
        # MA0002 is an ordered locus name; smoke-tests the UniProt back-fill
        self.src.fill_uniprot_by_oln('MA0002')
-------------------------------------------------------------------------------- 1 | The Datanator database is a compilation of data curated from the literature by the Datanator team and data aggregated from third-party databases. The data curated by the Datanator team is available under the Creative Commons 1.0 Universal (CC0) License. The data compiled from third-party sources is available under the licenses summarized below. 2 | 3 | E. coli Metabolome Database (ECMDB) 4 | URL: https://ecmdb.ca 5 | License summary: Offered freely to the public. Use and re-distribution of the data, in whole or in part, for commercial purposes requires permission of the authors and attribution. 6 | License statement URL: https://ecmdb.ca/citations 7 | 8 | MODOMICS 9 | URL: https://iimcb.genesilico.pl/modomics 10 | License summary: Requests attribution. 11 | License statement URL: https://iimcb.genesilico.pl/modomics/download 12 | 13 | NCBI Taxonomy 14 | URL: https://www.ncbi.nlm.nih.gov/taxonomy 15 | License summary: Public domain. Information may be freely distributed and copied. Requests attribution. 
16 | License statement URL: https://www.ncbi.nlm.nih.gov/home/about/policies/#copyright 17 | 18 | OrthoDB 19 | URL: https://www.orthodb.org 20 | License: CC BY 3.0 21 | License statement URL: https://www.orthodb.org/?page=disclaimer 22 | License URL: https://creativecommons.org/licenses/by/3.0/ 23 | 24 | PaxDB 25 | URL: https://pax-db.org 26 | License summary: The creators have granted Datanator permissions for derivations beyond the CC BY-ND 3.0 license stated at https://pax-db.org/license 27 | 28 | Protein Ontology (PRO) 29 | URL: https://proconsortium.org 30 | License: CC BY 4.0 31 | License statement URL: https://proconsortium.org/download/current/pro_nonreasoned.obo 32 | License URL: https://creativecommons.org/licenses/by/4.0/ 33 | 34 | SABIO-RK 35 | URL: http://sabio.h-its.org 36 | License summary: Grants database for non-commercial research and academic purposes only, excluding as part of any product or service which is licensed. This license is not transferable. Requests attribution. 37 | License URL: http://sabio.h-its.org/layouts/content/termscondition.gsp 38 | 39 | UniProt 40 | URL: https://www.uniprot.org 41 | License: CC BY 4.0 for the copyrightable parts of the database 42 | License statement URL: https://www.uniprot.org/help/license 43 | License URL: https://creativecommons.org/licenses/by/4.0/ 44 | 45 | Yeast Metabolome Database (YMDB) 46 | URL: http://www.ymdb.ca 47 | License summary: Offered freely to the public. Use and re-distribution of the data, in whole or in part, for commercial purposes requires permission of the authors and attribution. 
def calc_reactant_product_pairs(reaction):
    """ Get list of pairs of similar reactants and products using a greedy algorithm.

    Args:
        reaction (:obj:`data_model.Reaction`): reaction

    Returns:
        :obj:`list` of :obj:`tuple` of :obj:`data_model.Specie`, :obj:`data_model.Specie`:
            list of pairs of similar reactants and products; species left over after
            the greedy matching are paired with :obj:`None`
    """
    participants = reaction.get_ordered_participants()
    reactants = [p for p in participants if p.coefficient < 0]
    products = [p for p in participants if p.coefficient > 0]

    # sort by structure to ensure result is reproducible
    key = lambda p: (len(p.specie.structure), p.specie.structure)
    reactants.sort(key=key, reverse=True)
    products.sort(key=key, reverse=True)

    # create :obj:`molecule_util.Molecule` objects for each reactant and product
    reactant_mols = [molecule_util.Molecule(structure=r.specie.structure) for r in reactants]
    product_mols = [molecule_util.Molecule(structure=p.specie.structure) for p in products]

    # calculate similarities between each reactant and each product
    similarities = numpy.full((len(reactants), len(products)), numpy.nan)
    for i_reactant, reactant in enumerate(reactant_mols):
        for i_product, product in enumerate(product_mols):
            similarities[i_reactant, i_product] = reactant.get_similarity(product)

    # initialize pairs of similar reactants and products
    pairs = []

    # iteratively pair the most similar remaining reactant and product
    for _ in range(min(len(reactants), len(products))):
        index = numpy.argmax(similarities)
        # pass the shape positionally: the ``dims`` keyword used previously was
        # deprecated in NumPy 1.16 and removed in later releases, so this call
        # raised TypeError on modern NumPy
        i_reactant, i_product = numpy.unravel_index(index, similarities.shape)
        pairs.append((reactants[i_reactant], products[i_product]))

        reactants.pop(i_reactant)
        products.pop(i_product)
        similarities = numpy.delete(similarities, i_reactant, axis=0)
        similarities = numpy.delete(similarities, i_product, axis=1)

    # unpaired reactants/products are paired with None
    for reactant in reactants:
        pairs.append((reactant, None))
    for product in products:
        pairs.append((None, product))

    return pairs
sudo curl -L "https://github.com/docker/compose/releases/download/1.26.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose 36 | 37 | sudo chmod +x /usr/local/bin/docker-compose 38 | 39 | sudo ln -s /usr/local/bin/docker-compose /usr/bin/docker-compose 40 | 41 | Install ``datanator`` 42 | ----------------------------- 43 | Second, please run the following shell commands to clone and install ``datanator`` from GitHub:: 44 | 45 | mkdir karr_lab 46 | mkdir ~/.wc 47 | cd ./karr_lab 48 | git clone git@github.com:KarrLab/pkg_utils.git 49 | git clone git@github.com:KarrLab/wc_utils.git 50 | git clone git@github.com:KarrLab/karr_lab_aws_manager.git 51 | git clone git@github.com:KarrLab/datanator_query_python.git 52 | git clone git@github.com:KarrLab/datanator.git 53 | cd ./datanator 54 | nano docker-compose.yml # change ``zl`` on line 13 to the proper username. Save and exit by pressing ``Ctrl + X`` followed by ``Y`` 55 | docker-compose up -d 56 | 57 | 58 | Run ``datanator`` 59 | ----------------------------- 60 | One needs to find the docker container ID in order to use Datanator package:: 61 | 62 | docker ps 63 | docker exec -it bash 64 | cd karr_lab/datanator 65 | 66 | All python scripts in ``datanator`` dicrectory can be run with python3, for example:: 67 | 68 | python3 datanator/data_source/corum_nosql.py 69 | 70 | Running the command above will parse `Corum ` and 71 | store the parsed data in KarrLab's MongoDB. 72 | 73 | 74 | Contact `Yang `_ for any questions regarding installation and running the package. 
class ParseJSONSchema:
    """Convert a PSORTdb gram-positive TSV dump into one JSON document per protein."""

    def __init__(self, dataset, directory):
        """
        Args:
            dataset (:obj:`str`): path to the tab-delimited PSORTdb export file
            directory (:obj:`str`): output directory for the generated JSON files
        """
        self.dataset = dataset
        self.directory = directory

    def update_directory(self, nrows=10000):
        """Parse the dataset and write one ``<SeqID>.json`` file per row.

        Args:
            nrows (:obj:`int`, optional): maximum number of rows to read from
                the dataset (previously hard-coded to 10000)
        """
        data = pd.read_csv(self.dataset, delimiter='\t', nrows=nrows)
        # replace pandas NaN with None so values serialize to JSON null
        data = data.where(pd.notnull(data), None)

        for i in range(len(data)):
            seq_id = str(data.iloc[i, 0])
            # NOTE(review): the [8:22] slice assumes a fixed-width SeqID field
            # inside the first column -- confirm against the PSORTdb export format
            accession = seq_id[8:22]
            name = seq_id[seq_id.rfind('|') + 2:]

            doc = {
                "entity": {
                    "type": "protein",
                    "name": name,
                    "identifiers": [{"namespace": "SeqID", "value": accession}],
                },
                # one measurement entry per data column, except bookkeeping columns
                "values": [
                    {"type": column, "value": data[column].iloc[i]}
                    for column in data.columns[1:]
                    if column not in ("SeqID", "PSortVersion")
                ],
                "identifier": {"namespace": "SeqID", "value": accession},
                "source": [{"namespace": "PSORTsb Gram Positive", "value": "Version 3"}],
                "environment": {"GramStain": "Gram positive"},
                "schema_version": "2.0",
            }

            # create JSON files and place them in the output directory
            with open(os.path.join(self.directory, "{}.json".format(accession)), "w+") as json_file:
                json.dump(doc, json_file)


def main():
    """Generate JSON schema files for the computed gram-positive PSORTdb data."""
    json_files = ParseJSONSchema(dataset="./datanator/docs/protein_localization/computed_gram_positive/Computed-Gram_positive-PSORTdb-3.00.tab",
                                 directory="./datanator/docs/protein_localization/computed_gram_positive/JSONSchema")

    json_files.update_directory()

if __name__ == "__main__":
    main()
class MigrateEC:
    """Copy ``ec`` documents between databases, stamping schema version 2."""

    def __init__(self, collection="ec", to_database="datanator-test",
                 from_database="datanator", max_entries=float("inf")):
        """
        Args:
            collection (:obj:`str`, optional): name of the collection to migrate
            to_database (:obj:`str`, optional): destination database name
            from_database (:obj:`str`, optional): source database name
            max_entries (:obj:`int` or :obj:`float`, optional): maximum number of documents to process
        """
        self.collection = collection
        self.from_database = from_database
        self.to_database = to_database
        self.from_collection = motor_client_manager.client.get_database(from_database)[collection]
        self.to_collection = motor_client_manager.client.get_database(to_database)[collection]
        self.max_entries = max_entries

    async def index_primary(self, _key, background=True):
        """Index key (single key ascending)

        Args:
            _key(:obj:`str`): Name of key to be indexed
            background(:obj:`bool`, optional): build the index in the background
        """
        await self.to_collection.create_index(_key, background=background)

    async def process_cursor(self, skip=0):
        """Transform documents from the source collection and upsert them into
        the destination collection (keyed by ``ec_number``) in batches of 50.

        Args:
            skip(:obj:`int`, optional): number of source documents to skip
        """
        bulk_write = []
        query = {}
        # motor interprets limit=0 as "no limit"
        limit = 0 if self.max_entries == float('inf') else self.max_entries
        docs = self.from_collection.find(filter=query, projection={'_id': 0},
                                         no_cursor_timeout=True, batch_size=500,
                                         skip=skip, limit=limit)
        i = 0
        async for doc in docs:
            i += 1
            if i == self.max_entries:
                break
            if i != 0 and i % 50 == 0:
                print("Processing file {}".format(i + skip))
                try:
                    # motor collections are asynchronous: bulk_write returns a
                    # coroutine that must be awaited, otherwise nothing is written
                    await self.to_collection.bulk_write(bulk_write)
                except BulkWriteError as bwe:
                    pprint(bwe.details)
                finally:
                    bulk_write = []
            doc["schema_version"] = "2"
            # json round trip strips NaN values that MongoDB cannot store
            bulk_write.append(UpdateOne({'ec_number': doc["ec_number"]},
                                        {'$set': json.loads(json.dumps(doc, ignore_nan=True))},
                                        upsert=True))
        if bulk_write:
            try:
                await self.to_collection.bulk_write(bulk_write)
            except BulkWriteError as bwe:
                pprint(bwe.details)
            finally:
                print("Done.")


async def main(tx, rx):
    # asyncio.run requires a coroutine; previously this was a plain function,
    # so the gathered tasks were never awaited and asyncio.run raised
    await asyncio.gather(tx, rx)

if __name__ == '__main__':
    src = MigrateEC(to_database="test", max_entries=100)
    asyncio.run(main(src.index_primary("ec_number"), src.process_cursor(skip=0)))
class MigrateCorum:
    """Copy ``corum`` documents between databases, renaming legacy fields and
    stamping schema version 2."""

    def __init__(self, collection="corum", to_database="datanator-test",
                 from_database="datanator", max_entries=float("inf")):
        """
        Args:
            collection (:obj:`str`, optional): name of the collection to migrate
            to_database (:obj:`str`, optional): destination database name
            from_database (:obj:`str`, optional): source database name
            max_entries (:obj:`int` or :obj:`float`, optional): maximum number of documents to process
        """
        self.collection = collection
        self.from_database = from_database
        self.to_database = to_database
        self.from_collection = motor_client_manager.client.get_database(from_database)[collection]
        self.to_collection = motor_client_manager.client.get_database(to_database)[collection]
        self.max_entries = max_entries

    async def index_primary(self, _key, background=True):
        """Index key (single key ascending)

        Args:
            _key(:obj:`str`): Name of key to be indexed
            background(:obj:`bool`, optional): build the index in the background
        """
        # ``await`` instead of ``yield``: the old generator form meant calling
        # this method never actually created the index
        await self.to_collection.create_index(_key, background=background)

    async def process_cursor(self, skip=0):
        """Transform documents from the source collection and upsert them into
        the destination collection (keyed by ``ComplexID``) in batches of 50.

        Args:
            skip(:obj:`int`, optional): number of source documents to skip
        """
        bulk_write = []
        query = {}
        # motor interprets limit=0 as "no limit"
        limit = 0 if self.max_entries == float('inf') else self.max_entries
        docs = self.from_collection.find(filter=query, projection={'_id': 0},
                                         no_cursor_timeout=True, batch_size=10,
                                         skip=skip, limit=limit)
        i = 0
        async for doc in docs:
            i += 1
            if i == self.max_entries:
                break
            if i != 0 and i % 50 == 0:
                print("Processing file {}".format(i + skip))
                try:
                    # motor collections are asynchronous: bulk_write returns a
                    # coroutine that must be awaited, otherwise nothing is written
                    await self.to_collection.bulk_write(bulk_write)
                except BulkWriteError as bwe:
                    pprint(bwe.details)
                finally:
                    bulk_write = []
            # drop the legacy id and rename the taxonomy field for schema 2
            doc.pop("complex_id")
            doc["ncbi_taxonomy_id"] = doc.pop("SWISSPROT_organism_NCBI_ID")
            doc["schema_version"] = "2"
            # json round trip strips NaN values that MongoDB cannot store
            bulk_write.append(UpdateOne({'ComplexID': doc.get("ComplexID")},
                                        {'$set': json.loads(json.dumps(doc, ignore_nan=True))},
                                        upsert=True))
        if bulk_write:
            try:
                await self.to_collection.bulk_write(bulk_write)
            except BulkWriteError as bwe:
                pprint(bwe.details)
            finally:
                print("Done.")


def main():
    """Create the primary index, then migrate the collection."""
    loop = asyncio.get_event_loop()
    src = MigrateCorum()
    # index_primary is a coroutine and must be driven by the event loop;
    # previously it was called without being awaited and silently did nothing
    loop.run_until_complete(src.index_primary('ComplexID'))
    loop.run_until_complete(src.process_cursor(skip=0))

if __name__ == '__main__':
    main()
'InChI=1S/C17H21N4O9P/c1-7-3-9-10(4-8(7)2)21(15-13(18-9)16(25)20-17(26)19-15)5-11(22)14(24)12(23)6-30-31(27,28)29' 31 | mol2 = 'InChI=1S/C10H7NO3/c12-9(10(13)14)7-5-11-8-4-2-1-3-6(7)8/h1-5,11H,(H,13,14)' 32 | mol3 = 'InChI=1S/C17H21N4O9P/c1-7-3-9-10(4-8(7)2)21(15-13(18-9)16(25)20-17(26)19-15)5-11(22)14(24)12(23)6-30-31(27,28)29' 33 | coe = self.src.get_tanimoto(mol1, mol2) 34 | coe2 = self.src.get_tanimoto(mol1, mol3) 35 | self.assertEqual(0.121, coe) 36 | self.assertEqual(1., coe2) 37 | 38 | @unittest.skip('out of date') 39 | def test_one_to_many(self): 40 | inchi = 'InChI=1S/C5H8O3/c1-3(2)4(6)5(7)8/h3H,1-2H3,(H,7,8)' 41 | coeff, hashes = self.src.one_to_many(inchi, collection_str='metabolites_meta', 42 | field='inchi', lookup='inchi', num=10) 43 | print(len(hashes)) 44 | client, _, col = mongo_util.MongoUtil(db = self.db, MongoDB = self.server, 45 | username = self.username, password = self.password).con_db('metabolites_meta') 46 | inchi1 = col.find_one({'inchi': hashes[5]})['inchi'] 47 | inchi2 = col.find_one({'inchi': hashes[9]})['inchi'] 48 | self.assertEqual(coeff[5], self.src.get_tanimoto(inchi, inchi1)) 49 | self.assertEqual(coeff[9], self.src.get_tanimoto(inchi, inchi2)) 50 | 51 | @unittest.skip('out of date') 52 | def test_many_to_many(self): 53 | client, _, col = mongo_util.MongoUtil(db = self.db, MongoDB = self.server, 54 | username = self.username, password = self.password).con_db('metabolites_meta') 55 | self.src.many_to_many(collection_str1='metabolites_meta', 56 | collection_str2='metabolites_meta', field1='inchi', 57 | field2='inchi', lookup1='inchi', 58 | lookup2='inchi', num=10) 59 | -------------------------------------------------------------------------------- /datanator/util/mongo_util.py: -------------------------------------------------------------------------------- 1 | import pymongo 2 | import wc_utils.quilt 3 | from bson import decode_all 4 | import hashlib 5 | from genson import SchemaBuilder 6 | 7 | 8 | class MongoUtil: 9 | 10 | def 
class MongoUtil:
    """Convenience wrapper around a :class:`pymongo.MongoClient`/database pair.

    Builds a ``mongodb+srv`` connection URI from the given credentials and
    exposes helpers to list collections, open a collection, seed a collection
    from a Karr Lab quilt package, and inspect document schemas.
    """

    def __init__(self, cache_dirname=None, MongoDB=None, replicaSet=None, db='test',
                 verbose=False, max_entries=float('inf'), username=None,
                 password=None, authSource='admin', readPreference='nearest'):
        # Bug fix: cache_dirname/verbose/max_entries were accepted but never
        # stored, so fill_db() crashed with AttributeError on self.cache_dirname.
        self.cache_dirname = cache_dirname
        self.verbose = verbose
        self.max_entries = max_entries
        string = "mongodb+srv://{}:{}@{}/{}?authSource={}&retryWrites=true&w=majority&readPreference={}".format(
            username, password, MongoDB, db, authSource, readPreference)
        self.client = pymongo.MongoClient(string)
        self.db_obj = self.client.get_database(db)

    def list_all_collections(self):
        '''List all non-system collections within database
        '''
        return self.db_obj.list_collection_names()

    def con_db(self, collection_str):
        '''Return ``(client, database, collection)`` for `collection_str`.

        On connection problems a descriptive string is returned instead of a
        tuple (kept as-is for backward compatibility with existing callers).
        '''
        try:
            collection = self.db_obj[collection_str]
            return (self.client, self.db_obj, collection)
        except pymongo.errors.ConnectionFailure:
            return ('Server not available')
        except pymongo.errors.ServerSelectionTimeoutError:
            # Bug fix: was the unqualified name ServerSelectionTimeoutError,
            # which itself raised NameError whenever this handler was reached.
            return ('Server timeout')

    def fill_db(self, collection_str):
        '''Check if collection is already in MongoDB

        If already in MongoDB:
            Do nothing
        Else:
            Load data into db from quiltdata (karrlab/datanator)

        Args:
            collection_str: name of collection (e.g. 'ecmdb', 'pax', etc)
        '''
        _, _, collection = self.con_db(collection_str)
        # count_documents() replaces the deprecated Cursor.count().
        if collection.count_documents({}) != 0:
            return collection
        manager = wc_utils.quilt.QuiltManager(
            path=self.cache_dirname, package='datanator')
        filename = collection_str + '.bson'
        manager.download_package(filename)
        with open((self.cache_dirname + '/' + filename), 'rb') as f:
            # insert_many() replaces the deprecated Collection.insert().
            collection.insert_many(decode_all(f.read()))
        return collection

    def print_schema(self, collection_str):
        '''Print out schema of a collection
        removed '_id' from collection due to its object type
        and universality
        '''
        _, _, collection = self.con_db(collection_str)
        # NOTE(review): assumes the collection is non-empty; find_one()
        # returns None on an empty collection and the del below would fail.
        doc = collection.find_one({})
        builder = SchemaBuilder()
        del doc['_id']
        builder.add_object(doc)
        return builder.to_schema()

    def flatten_collection(self, collection_str):
        '''Flatten a collection

        c is ommitted because it does not have a non-object
        value associated with it
        '''
        _, _, collection = self.con_db(collection_str)

        pipeline = [
            {"$addFields": {"subdoc.a": "$a"}},
            {"$replaceRoot": {"newRoot": "$subdoc"}}
        ]
        flat_col = collection.aggregate(pipeline)
        return flat_col
cls.cache_dir = tempfile.mkdtemp() 22 | cls.protein_col = 'uniprot' 23 | cls.rna_col = 'rna_halflife' 24 | cls.src = rna_halflife_util.RnaHLUtil(server=MongoDB, username=username, 25 | password=password, src_db=src_db, des_db=des_db, protein_col=cls.protein_col, 26 | rna_col=cls.rna_col, readPreference='nearest', cache_dir=cls.cache_dir) 27 | 28 | @classmethod 29 | def tearDownClass(cls): 30 | shutil.rmtree(cls.cache_dir) 31 | cls.src.uniprot_collection_manager.db_obj.drop_collection(cls.protein_col) 32 | cls.src.uniprot_collection_manager.client.close() 33 | cls.src.uniprot_query_manager.client.close() 34 | 35 | @unittest.skip('avoid r/w db') 36 | def test_fill_uniprot_by_oln(self): 37 | self.src.fill_uniprot_by_oln('MA0002') 38 | 39 | @unittest.skip('links will not work from nonacademic IPs.') 40 | def test_make_df(self): 41 | url_0 = 'https://oup.silverchair-cdn.com/oup/backfile/Content_public/Journal/nar/41/1/10.1093/nar/gks1019/2/gks1019-nar-00676-a-2012-File003.xlsx?Expires=1578425844&Signature=ZRFUxLdn4-vaBt5gQci~0o56KqyR9nJj9i32ig5X6YcfqiJeV3obEq8leHGdDxx6w~KABgewiQ66HTB7gmuG~2GL-YgxPKYSjt17WrYMkc-0ibw6TMlTvWZZfvw-lPe~wvpmVfNEXnTbP7jHyNLu9jeJ6yhoXvgIyQtzA5PbEI1fyXEgeZzOKMltmITqL3g3APsPsagCTC66rwrBT23Aghh6D314uilT2DZHCc68MH2nyV~qAhFqIQiOj-7VTEKqkDPvPYvuE2KNKXdvW23gk100YV~58ozbt8ijRz5Gr5gPtE~f1Ab5l260EIbWHJNabMRleInJQqUIDPFN4C38PQ__&Key-Pair-Id=APKAIE5G5CRDK6RD3PGA' 42 | df_0 = self.src.make_df(url_0, 'Supplementary Table 1', usecols='B:D', skiprows=[0,1,2], 43 | names=['ordered_locus_name', 'half_life', 'r_squared']) 44 | self.assertEqual(df_0.iloc[0]['ordered_locus_name'], 'Rv0002') 45 | url_1 = 
'https://oup.silverchair-cdn.com/oup/backfile/Content_public/Journal/nar/42/4/10.1093_nar_gkt1150/1/gkt1150_Supplementary_Data.zip?Expires=1578928721&Signature=ADjsCSaceimzGs6aJ~uG7np88TzHNooAoBabdm-6utYVIZOEwRbzTdiBp~76vM4KEHz9Nir8GNrtA3AwHwGFm0bu~aorTG4xrOChS6UgfBQiUtgr8vfbDIUno1y1nxLGCKIfQrb2Gx-SVnigum2gjcveymK995zadSNZqN~w-vz-Ii0a6fH7kvKN8m9vLWf6fdo0NXSmgnkjj9KPCuS-bmK0y4ZH5Ex0Rl4qi5uCroYmDBNOhXY23pcalbpFwB1-07tA3~756gZN4Mo9uMeSVQKl5UsHzx5amB6WvSCXS8z756YoaaMCg0FQbUCcQ46fRGdHxcvPNcrPo5IMEGmi8g__&Key-Pair-Id=APKAIE5G5CRDK6RD3PGA' 46 | df_1 = self.src.make_df(url_1, 'TableS1', file_type='zip', file_name='nar-01935-a-2013-File011.xlsx', usecols='L:O', skiprows=list(range(0, 7)), 47 | names=['a', 'b', 'c', 'd']) 48 | self.assertEqual(df_1.iloc[0]['a'], 5.74239011770224) 49 | 50 | def test_fill_uniprot_with_df(self): 51 | pass -------------------------------------------------------------------------------- /datanator/data_source/sabio_compound.py: -------------------------------------------------------------------------------- 1 | from datanator.util import mongo_util, file_util, chem_util 2 | import datanator.config.core 3 | from pymongo.collation import Collation, CollationStrength 4 | from pymongo import ASCENDING 5 | import os 6 | import tempfile 7 | 8 | 9 | class SabioCompound: 10 | 11 | def __init__(self, username=None, password=None, server=None, authSource='admin', 12 | src_database='datanator', dest_database=None, max_entries=float('inf'), verbose=True, 13 | src_collection='sabio_compound', dest_collection=None, cache_dir=None): 14 | ''' 15 | Args: 16 | src_database (:obj: `str`): name of database in which source collections reside 17 | ''' 18 | self.mongo_manager = mongo_util.MongoUtil(MongoDB=server, username=username, 19 | password=password, authSource=authSource, db=src_database) 20 | self.file_manager = file_util.FileUtil() 21 | self.chem_manager = chem_util.ChemUtil() 22 | self.collation = Collation(locale='en', strength=CollationStrength.SECONDARY) 
class SabioCompound:
    """Post-process the ``sabio_compound`` MongoDB collection, deriving an
    InChIKey for every compound that carries structure information.
    """

    def __init__(self, username=None, password=None, server=None, authSource='admin',
                 src_database='datanator', dest_database=None, max_entries=float('inf'), verbose=True,
                 src_collection='sabio_compound', dest_collection=None, cache_dir=None):
        '''
        Args:
            src_database (:obj:`str`): name of database in which source collections reside
        '''
        self.mongo_manager = mongo_util.MongoUtil(MongoDB=server, username=username,
                                                  password=password, authSource=authSource, db=src_database)
        self.file_manager = file_util.FileUtil()
        self.chem_manager = chem_util.ChemUtil()
        self.collation = Collation(locale='en', strength=CollationStrength.SECONDARY)
        self.verbose = verbose
        self.max_entries = max_entries
        self.src_collection = src_collection

    def add_inchi_key(self):
        """Add an ``inchi_key`` field to every sabio_compound document that
        has structure information; documents without structures are skipped.
        """
        query = {}
        projection = {'structures._value_inchi': 1}
        _, _, collection = self.mongo_manager.con_db(self.src_collection)
        docs = collection.find(filter=query, projection=projection)
        count = collection.count_documents(query)
        for i, doc in enumerate(docs):
            if i == self.max_entries:
                break
            if self.verbose and i % 100 == 0:
                print('Processing doc {} out of {}'.format(i, count))
            try:
                inchi = doc['structures'][0]['_value_inchi']
            except IndexError:
                print('Compound with id {} has no structure information'.format(doc['_id']))
                # Bug fix: previously fell through and reused the previous
                # doc's inchi (or raised NameError on the first doc).
                continue
            except KeyError:
                print('Compound with id {} has no structure array'.format(doc['_id']))
                continue
            inchi_key = self.chem_manager.inchi_to_inchikey(inchi)
            collection.update_one({'_id': doc['_id']},
                                  {'$set': {'inchi_key': inchi_key}})


def main():
    cache_dirname = tempfile.mkdtemp()
    cache_dir = os.path.join(cache_dirname, 'logs.txt')
    src_db = 'datanator'
    collection_str = 'sabio_compound'
    username = datanator.config.core.get_config()[
        'datanator']['mongodb']['user']
    password = datanator.config.core.get_config(
    )['datanator']['mongodb']['password']
    server = datanator.config.core.get_config(
    )['datanator']['mongodb']['server']
    src = SabioCompound(username=username, password=password, server=server,
                        authSource='admin', src_database=src_db,
                        verbose=True, src_collection=collection_str,
                        cache_dir=cache_dir)
    src.add_inchi_key()


if __name__ == '__main__':
    main()
import pandas as pd
import json
import numpy as np
from datanator_query_python.config import config
from datanator_query_python.util import mongo_util


class ParsePsortExperimental(mongo_util.MongoUtil):
    """Loader for ePSORTdb ``Experimental-PSORTdb-v4.00.tsv`` records into
    MongoDB.
    """

    def __init__(self, max_entries=float('inf'),
                 MongoDB=None,
                 db=None,
                 collection=None,
                 username=None,
                 password=None,
                 authSource='admin',
                 readPreference='nearest'):
        """
        Args:
            max_entries (:obj:`float`, optional): maximum number of rows to process.
            collection (:obj:`str`, optional): name of the destination collection.
        """
        super(ParsePsortExperimental, self).__init__(MongoDB=MongoDB, db=db,
                                                     username=username,
                                                     password=password,
                                                     authSource=authSource,
                                                     readPreference=readPreference)
        self.max_entries = max_entries
        self.collection = collection

    def parse_psortdb(self):
        """
        To parse database psortdb Experimental-PSORTdb-v4.00.tsv file and
        upsert fields into ``self.collection``.

        Return:
            ()
        """
        collection = self.db_obj[self.collection]
        # TSV-parsing scaffold kept (disabled) from the original author; it
        # referenced an undefined NpEncoder and was never active:
        # data = pd.read_csv('Experimental-PSORTdb-v4.00.tsv', delimiter="\t")
        # data = data.fillna("None")
        # ... one JSON file per row, named by SwissProt_ID or Other_Accession
        #
        # Bug fix: with $addToSet the field name comes first and the $each
        # modifier nests under it ({field: {"$each": [...]}}).  The original
        # nested "$each" directly under "$addToSet", which MongoDB rejects.
        collection.update_one({"uniprot_id": "P01234"},
                              {"$set": {"protein_name": "some_name",
                                        "another_field": "another_value"},
                               "$addToSet": {"add_id": {"$each": [{"namespace": "something",
                                                                   "value": "1"}]}}})
| 62 | 63 | def main(): 64 | conf = config.Justin() 65 | conf_main = config.Config() 66 | username = conf.USERNAME 67 | password = conf.PASSWORD 68 | MongoDB = conf_main.SERVER 69 | src = ParsePsortExperimental(MongoDB=MongoDB, 70 | username=username, 71 | password=password, 72 | collection="protein_localization", 73 | db="datanator-demo") 74 | src.parse_psortdb() 75 | 76 | 77 | if __name__ == '__main__': 78 | main() 79 | -------------------------------------------------------------------------------- /datanator/data_source/protein_localization/parse_psortdb_negative_wo_outer_membrane.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | 4 | class ParsePsort: 5 | def __init__(self, max_entries): 6 | self.max_entries = max_entries 7 | 8 | def parse_psortdb(self): 9 | """ 10 | To parse database psortdb gram negative without outer membrane file 11 | and create JSON files conforming to datanator_pattern/observation_compiled.json 12 | 13 | Args: 14 | max_entries(:obj:'int'): number of rows to parse. 
import pandas as pd
import json
import os


class ParsePsort:
    """Parse PSORTdb "Computed: Gram-negative without outer membrane"
    predictions into JSON documents conforming to
    datanator_pattern/observation_compiled.json (one file per row).
    """

    # (key in the output "value" object, column index in the PSORTdb tab
    # file); order matters — it is preserved in the emitted JSON.
    _VALUE_COLUMNS = [
        ("PPSVM_Localization", 1),
        ("Profile_Localization", 3),
        ("Signal_Localization", 5),
        ("SCL-BLASTe_Localization", 7),
        ("CMSVM_Localization", 9),
        ("SCL-BLAST_Localization", 11),
        ("OMPMotif_Localization", 13),
        ("OMSVM_Localization", 15),
        ("Motif_Localization", 17),
        ("CytoSVM_Localization", 19),
        ("CWSVM_Localization", 21),
        ("ModHMM_Localization", 23),
        ("ECSVM_Localization", 25),
        ("Cytoplasmic Membrane_Score", 27),
        ("Cellwall_Score", 28),
        ("Extracellular_Score", 29),
        ("Cytoplasmic_Score", 30),
        ("Final_Localization", 31),
        ("Final_Localization_2", 32),
        ("Secondary_Localization", 34),
        ("Final_Score", 35),
    ]

    def __init__(self, max_entries):
        """
        Args:
            max_entries (:obj:`int`): number of rows to parse; a JSON file is
                created for each of the file's first ``max_entries`` rows.
        """
        self.max_entries = max_entries

    @staticmethod
    def _seq_id(raw):
        """Extract the RefSeq accession from a ``gi|...|ref|ACC|`` header."""
        s = str(raw)
        return s[s.find("ref") + 4:s.rfind("|")]

    def parse_psortdb(self,
                      filename='Computed-Gram_negative_without_outer_membrane-PSORTdb-3.00.tab',
                      out_dir='Gram_Negative_WO_Outer_Membrane'):
        """Parse the PSORTdb tab file and write one JSON file per row.

        Args:
            filename (:obj:`str`, optional): path of the PSORTdb tab file
                (previously hard-coded; default preserves old behavior).
            out_dir (:obj:`str`, optional): directory JSON files are written
                to (previously hard-coded; default preserves old behavior).

        Return:
            ()
        """
        data = pd.read_csv(filename, delimiter="\t", low_memory=False)
        data = data.where(pd.notnull(data), None)
        for i in range(self.max_entries):
            raw_header = str(data.iloc[i, 0])
            d = {}
            # entity
            d["entity"] = {
                "type": "protein",
                # text after the final '| ' of the FASTA-style header
                "name": raw_header[raw_header.rfind("|") + 2:],
                "synonyms": [],
                "identifiers": [{"namespace": "Seq_ID",
                                 "value": self._seq_id(data.iloc[i, 0])}],
            }
            # per-tool localization calls and scores
            d["value"] = {key: data.iloc[i, col] for key, col in self._VALUE_COLUMNS}
            # source
            d["source"] = {"namespace": "PSORT",
                           "value": "Version " + str(data.iloc[i, 36])}
            with open(os.path.join(out_dir, self._seq_id(data.iloc[i, 0]) + ".json"), "w+") as f:
                json.dump(d, f, indent=4)


if __name__ == '__main__':
    # Previously executed unconditionally at import time; guarded so the
    # module can be imported without touching the filesystem.
    p1 = ParsePsort(10)
    p1.parse_psortdb()
from datanator_query_python.config import motor_client_manager
import asyncio
import simplejson as json
from pymongo import UpdateOne
from pymongo.errors import BulkWriteError
from pprint import pprint


class MigrateMC:
    """Migrate ``metabolite_concentrations`` documents between databases,
    stamping ``schema_version`` "2" and denormalizing canonical taxon
    ancestor ids/names into each concentration entry.
    """

    def __init__(self, collection="metabolite_concentrations", to_database="datanator-test",
                 from_database="datanator", max_entries=float("inf")):
        self.collection = collection
        self.from_database = from_database
        self.to_database = to_database
        self.from_collection = motor_client_manager.client.get_database(from_database)[collection]
        self.to_collection = motor_client_manager.client.get_database(to_database)[collection]
        self.max_entries = max_entries

    async def index_primary(self, _key, background=True):
        """Index key (single key ascending)

        Args:
            _key(:obj:`str`): Name of key to be indexed
            background(:obj:`bool`, optional): build the index in the background.
        """
        await self.to_collection.create_index(_key, background=background)

    async def process_cursor(self, skip=0):
        """Transform source documents and upsert them (keyed on ``inchikey``)
        into the destination collection in batches of 50.

        Args:
            skip(:obj:`int`, optional): number of source documents to skip.
        """
        bulk_write = []
        query = {}
        if self.max_entries == float('inf'):
            limit = 0  # 0 means "no limit" to MongoDB
        else:
            limit = self.max_entries
        docs = self.from_collection.find(filter=query, projection={'_id': 0},
                                         no_cursor_timeout=True, batch_size=100,
                                         skip=skip, limit=limit)
        i = 0
        async for doc in docs:
            # Bug fix: the counter was previously incremented before the
            # max_entries check, stopping one document short of the cap.
            if i >= self.max_entries:
                break
            i += 1
            if i % 50 == 0:
                print("Processing file {}".format(i + skip))
                try:
                    await self.to_collection.bulk_write(bulk_write)
                except BulkWriteError as bwe:
                    pprint(bwe.details)
                bulk_write = []
            doc['schema_version'] = "2"
            for obj in doc["concentrations"]:
                tax_doc = await motor_client_manager.client.get_database(
                    "datanator-test")["taxon_tree"].find_one(
                        filter={"tax_id": obj["ncbi_taxonomy_id"]},
                        projection={'canon_anc_ids': 1, 'canon_anc_names': 1})
                # NOTE(review): assumes every ncbi_taxonomy_id resolves in
                # taxon_tree; a miss returns None and raises TypeError below
                # — confirm upstream guarantees.
                obj["canon_anc_ids"] = tax_doc["canon_anc_ids"]
                obj["canon_anc_names"] = tax_doc["canon_anc_names"]
                obj.pop("last_modified", None)
            bulk_write.append(UpdateOne({'inchikey': doc['inchikey']},
                                        {'$set': json.loads(json.dumps(doc, ignore_nan=True))},
                                        upsert=True))
        if len(bulk_write) != 0:
            try:
                # Bug fix: this final flush was missing ``await``; the motor
                # coroutine was created but never run, silently dropping the
                # last partial (<50-doc) batch.
                await self.to_collection.bulk_write(bulk_write)
            except BulkWriteError as bwe:
                pprint(bwe.details)
            finally:
                print("Done.")


async def main():
    # Instantiated here (not at module import) so importing this script has
    # no database side effects.
    src = MigrateMC()
    await asyncio.gather(src.index_primary('inchikey'),
                         src.process_cursor(skip=0))


if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
batch_size=100, 43 | skip=skip, limit=limit) 44 | i = 0 45 | async for doc in docs: 46 | i += 1 47 | if i == self.max_entries: 48 | break 49 | if i != 0 and i % 50 == 0: 50 | print("Processing file {}".format(i + skip)) 51 | try: 52 | await self.to_collection.bulk_write(bulk_write) 53 | bulk_write = [] 54 | except BulkWriteError as bwe: 55 | pprint(bwe.details) 56 | bulk_write = [] 57 | doc['schema_version'] = "2" 58 | for obj in doc["concentrations"]: 59 | tax_doc = await motor_client_manager.client.get_database( 60 | "datanator-test")["taxon_tree"].find_one(filter={"tax_id": obj["ncbi_taxonomy_id"]}, 61 | projection={'canon_anc_ids': 1, 'canon_anc_names': 1}) 62 | obj["canon_anc_ids"] = tax_doc["canon_anc_ids"] 63 | obj["canon_anc_names"] = tax_doc["canon_anc_names"] 64 | obj.pop("last_modified", None) 65 | bulk_write.append(UpdateOne({'inchikey': doc['inchikey']}, {'$set': json.loads(json.dumps(doc, ignore_nan=True))}, upsert=True)) 66 | if len(bulk_write) != 0: 67 | try: 68 | self.to_collection.bulk_write(bulk_write) 69 | except BulkWriteError as bwe: 70 | pprint(bwe.details) 71 | finally: 72 | print("Done.") 73 | 74 | src = MigrateMC() 75 | async def main(): 76 | await asyncio.gather(src.index_primary('inchikey'), 77 | src.process_cursor(skip=0)) 78 | 79 | if __name__ == '__main__': 80 | loop = asyncio.get_event_loop() 81 | loop.run_until_complete(main()) -------------------------------------------------------------------------------- /datanator/data_source/array_express_tools/taxon_exceptions.txt: -------------------------------------------------------------------------------- 1 | Recorded Taxon -- Correct Taxon 2 | Homo sapiens + Aspergillus fumigatus -- Homo sapiens, Aspergillus fumigatus 3 | Homo sapiens + Candida albicans -- Homo sapiens, Candida albicans 4 | Homo sapiens + Candida parapsilosis -- Homo sapiens 5 | Homo sapiens + Saccharomyces cerevisiae -- Homo sapiens 6 | human -- Homo sapiens 7 | mouse -- mus musculus 8 | Homo Sapien -- Homo sapiens 9 | 
Trypanosma congolense -- Trypanosoma congolense 10 | mixed sample: human cell line AGS and Salmonella Typhimurium SL1344 -- Homo sapiens, Salmonella Typhimurium SL1344 11 | mixed sample: human cell line CaCo-2 and Salmonella Typhimurium SL1344 -- Homo sapiens, Salmonella Typhimurium SL1344 12 | mixed sample: human cell line HEK 293 and Salmonella Typhimurium SL1344 -- Homo sapiens, Salmonella Typhimurium SL1344 13 | mixed sample: human cell line HT29 and Salmonella Typhimurium SL1344 -- Homo sapiens, Salmonella Typhimurium SL1344 14 | mixed sample: human cell line LoVo and Salmonella Typhimurium SL1344 -- Homo sapiens, Salmonella Typhimurium SL1344 15 | mixed sample: human cell line THP-1 (-PMA)and Salmonella Typhimurium SL1344 -- Homo sapiens, Salmonella Typhimurium SL1344 16 | mixed sample: mouse cell line BMDM and Salmonella Typhimurium SL1344 -- mus musculus, Salmonella Typhimurium SL1344 17 | mixed sample: mouse cell line L929 and Salmonella Typhimurium SL1344 -- mus musculus, Salmonella Typhimurium SL1344 18 | mixed sample: mouse cell line MEF and Salmonella Typhimurium SL1344 -- mus musculus, Salmonella Typhimurium SL1344 19 | mixed sample: mouse cell line RAW264.7 and Salmonella Typhimurium SL1344 -- mus musculus, Salmonella Typhimurium SL1344 20 | mixed sample: pig cell line 3D4/31 and Salmonella Typhimurium SL1344 -- sus scrofa, Salmonella Typhimurium SL1344 21 | mixed sample: pig cell line IPEC-J2 and Salmonella Typhimurium SL1344 -- sus scrofa, Salmonella Typhimurium SL1344 22 | human cell line HeLa-S3 -- Homo sapiens 23 | mixed sample (human cell line HeLa-S3 + Salmonella typhimurium SL1344) -- Homo sapiens, Salmonella Typhimurium SL1344 24 | mixed sample: human cell line HeLa-S3 and Salmonella Typhimurium SL1344 -- Homo sapiens, Salmonella Typhimurium SL1344 25 | Xanthomonas oryzae pv. 
oryzae KACC10331 -- Xanthomonas oryzae 26 | Leishmania mexicana and Mus musculus -- Leishmania mexicana, Mus musculus 27 | Acinetobacter baumanii -- Acinetobacter baumannii 28 | Zebrafish -- Danio rerio 29 | Human -- Homo sapiens 30 | Japanese rice fish -- Oryzias latipes 31 | Japanses rice fish -- Oryzias latipes 32 | House mouse -- mus musculus 33 | -- None 34 | Mouse -- mus musculus 35 | Homo sapien -- Homo sapiens 36 | Acinetobacter baumannii 1592897 -- Acinetobacter baumannii 37 | Acinetobacter baumannii 1564232 -- Acinetobacter baumannii 38 | Acinetobacter baumannii 983759 -- Acinetobacter baumannii 39 | Acinetobacter baumannii 478810 -- Acinetobacter baumannii 40 | Strongyloides_stercoralis -- Strongyloides stercoralis 41 | Solanum lycopersicum L. -- Solanum lycopersicum 42 | chimpanzee -- Pan troglodytes 43 | Holcus lanatus L -- Holcus lanatus 44 | Eupolybothrus sp. PS-2013 -- Eupolybothrus 45 | Oryctolagus cuniculus domesticus -- Oryctolagus cuniculus 46 | Reaumuria soongorica -- Reaumuria 47 | Lactobacillus oligofermentans LMG 22743T + Lactococcus piscium MKFS47 -- Lactobacillus oligofermentans DSM 15707 = LMG 22743, Lactococcus piscium MKFS47 48 | Lactococcus lactis subsp. cremoris CNCM I-1631 -- Lactococcus lactis subsp. cremoris 49 | Mycobacterium smegmatis str. MC2 155 -- Mycolicibacterium smegmatis MC2 155 50 | Lactobacillus oligofermentans LMG 22743T -- Lactobacillus oligofermentans DSM 15707 = LMG 22743 51 | Frankia sp. 
CcI3 -- Frankia 52 | -------------------------------------------------------------------------------- /tests/data_source/test_sabio_reaction.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from datanator.data_source import sabio_reaction 3 | import tempfile 4 | import shutil 5 | import json 6 | import os 7 | import datanator.config.core 8 | 9 | 10 | class TestProteinAggregate(unittest.TestCase): 11 | 12 | @classmethod 13 | def setUpClass(cls): 14 | cls.cache_dirname = tempfile.mkdtemp() 15 | cache_dir = os.path.join(cls.cache_dirname, 'logs.txt') 16 | src_db = 'datanator' 17 | des_db = 'test' 18 | cls.collection_str = 'sabio_reaction' 19 | username = datanator.config.core.get_config()[ 20 | 'datanator']['mongodb']['user'] 21 | password = datanator.config.core.get_config( 22 | )['datanator']['mongodb']['password'] 23 | server = datanator.config.core.get_config( 24 | )['datanator']['mongodb']['server'] 25 | port = datanator.config.core.get_config( 26 | )['datanator']['mongodb']['port'] 27 | cls.src = sabio_reaction.RxnAggregate(username=username, password=password, server=server, 28 | authSource='admin', src_database=src_db, max_entries=20, 29 | verbose=True, collection=cls.collection_str, destination_database=des_db, 30 | cache_dir=cache_dir) 31 | 32 | @classmethod 33 | def tearDownClass(cls): 34 | shutil.rmtree(cls.cache_dirname) 35 | cls.src.db_obj.drop_collection(cls.collection_str) 36 | cls.src.client.close() 37 | 38 | def test_get_id(self): 39 | input_0 = {'resource': [{'namespace': 'something'}, {'id': '2'}, {'namespace': 'sabiork.reaction', 'id': '6570'}]} 40 | result_0 = self.src.get_rxn_id(input_0) 41 | self.assertEqual(result_0, 6570) 42 | 43 | def test_create_reactants(self): 44 | input_0 = {'reaction_participant': [{}, {}, {}, {'substrate_aggregate': '123'}, {'product_aggregate': '456'}]} 45 | result_0 = self.src.create_reactants(input_0) 46 | self.assertEqual(result_0, {'substrate_aggregate': 
'123', 'product_aggregate': '456'}) 47 | 48 | def test_fill_collection(self): 49 | self.src.fill_collection() 50 | 51 | def test_extract_reactant_names(self): 52 | substrates_0 = {'substrate_name': 'a', 'substrate_synonym': ['a1', 'a2', 'a3']} 53 | substrates_1 = {'substrate_name': 'b', 'substrate_synonym': ['b1', 'b2', 'b3']} 54 | products_0 = {'product_name': 'c', 'product_synonym': ['c1', 'c2', 'c3']} 55 | products_1 = {'product_name': 'd', 'product_synonym': ['d1', 'd2', 'd3']} 56 | products_2 = {'product_name': 'e', 'product_synonym': []} 57 | input_0 = {'reaction_participant': [{'substrate': [substrates_0, substrates_1]},{'product': [products_0, products_1, products_2]}]} 58 | sub_0, pro_0 = self.src.extract_reactant_names(input_0) 59 | sub_exp_0 = [['a1', 'a2', 'a3', 'a'], ['b1', 'b2', 'b3', 'b']] 60 | pro_exp_0 = [['c1', 'c2', 'c3', 'c'], ['d1', 'd2', 'd3', 'd'], ['e']] 61 | self.assertEqual(sub_0, sub_exp_0) 62 | self.assertEqual(pro_0, pro_exp_0) 63 | 64 | def test_extract_enzyme_names(self): 65 | input_0 = {'enzymes': [{'enzyme':[{'enzyme_name': 'a', 'enzyme_synonym': ['a1', 'a2', 'a3']}]}]} 66 | input_1 = {'enzymes': [{'enzyme':[{'enzyme_name': 'a', 'enzyme_synonym': ['a1', 'a2', 'a3']}, 67 | {'enzyme_name': 'b', 'enzyme_synonym': ['b1', 'b2', 'b3']}]}]} 68 | result_0 = self.src.extract_enzyme_names(input_0) 69 | result_1 = self.src.extract_enzyme_names(input_1) 70 | self.assertEqual(result_0, ['a1', 'a2', 'a3', 'a']) 71 | self.assertEqual(result_1[0], ['a1', 'a2', 'a3', 'a']) -------------------------------------------------------------------------------- /tests/data_source/test_taxon_tree.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from datanator.data_source import taxon_tree 3 | import datanator.config.core 4 | import tempfile 5 | import shutil 6 | import os 7 | import json 8 | 9 | 10 | class TestTaxonTree(unittest.TestCase): 11 | 12 | @classmethod 13 | def setUpClass(cls): 14 | 
cls.cache_dirname = tempfile.mkdtemp() 15 | cls.db = 'test' 16 | username = datanator.config.core.get_config()['datanator']['mongodb']['user'] 17 | password = datanator.config.core.get_config()['datanator']['mongodb']['password'] 18 | MongoDB = datanator.config.core.get_config()['datanator']['mongodb']['server'] 19 | port = datanator.config.core.get_config()['datanator']['mongodb']['port'] 20 | replSet = datanator.config.core.get_config()['datanator']['mongodb']['replSet'] 21 | cls.collection_str = 'taxon_tree' 22 | cls.src = taxon_tree.TaxonTree( 23 | cls.cache_dirname, MongoDB, cls.db, replicaSet=None, 24 | verbose=True, max_entries=10, username = username, password = password) 25 | cls.path = os.path.join(cls.cache_dirname, cls.collection_str) 26 | 27 | @classmethod 28 | def tearDownClass(cls): 29 | shutil.rmtree(cls.cache_dirname) 30 | cls.src.client.close() 31 | 32 | # @unittest.skip('passed') 33 | def test_download_dump(self): 34 | noi = 'division.dmp' 35 | my_file = os.path.join(self.path, noi) 36 | self.src.download_dump() 37 | self.assertTrue(os.path.isfile(my_file)) 38 | 39 | # @unittest.skip('passed') 40 | def test_parse_fullname_line(self): 41 | line1 = '1936272 | Candidatus Heimdallarchaeota | cellular organisms; Archaea; Asgard group; |' 42 | line2 = '2012493 | Candidatus Heimdallarchaeota archaeon B3_Heim | cellular organisms; Archaea; Asgard group; Candidatus Heimdallarchaeota; |' 43 | line3 = '1935183 | Asgard group | cellular organisms; Archaea; |' 44 | 45 | self.assertEqual(self.src.parse_fullname_line(line1), [ 46 | '1936272', 'Candidatus Heimdallarchaeota', ['cellular organisms', 'Archaea', 'Asgard group']]) 47 | self.assertEqual(self.src.parse_fullname_line(line2)[:2], [ 48 | '2012493', 'Candidatus Heimdallarchaeota archaeon B3_Heim']) 49 | self.assertEqual(self.src.parse_fullname_line(line3) 50 | [1], 'Asgard group') 51 | self.assertEqual(self.src.parse_fullname_line(line3)[ 52 | 2], ['cellular organisms', 'Archaea']) 53 | 54 | # 
@unittest.skip('passed') 55 | def test_parse_taxid_line(self): 56 | line1 = '1841596\t|\t131567 2157 1935183 1936272 \t|\n' 57 | self.assertEqual(self.src.parse_taxid_line(line1), [ 58 | '131567', '2157', '1935183', '1936272']) 59 | 60 | # @unittest.skip('passed') 61 | def test_parse_fullname_taxid(self): 62 | self.src.parse_fullname_taxid() 63 | doc = self.src.collection.find_one({'tax_id': 1935183}) 64 | self.assertEqual(doc['anc_id'], [131567, 2157]) 65 | 66 | # @unittest.skip('passed') 67 | def test_parse_nodes(self): 68 | self.src.parse_nodes() 69 | doc = self.src.collection.find_one({'tax_id': 1}) 70 | self.assertEqual(doc['tax_name'], 'root') 71 | self.assertEqual(doc['division_id'], 8) 72 | 73 | # @unittest.skip('passed') 74 | def test_parse_division(self): 75 | self.src.parse_division() 76 | 77 | # @unittest.skip('passed') 78 | def test_parse_names(self): 79 | self.src.parse_names() 80 | 81 | # @unittest.skip('passed') 82 | def test_parse_gencode(self): 83 | self.src.parse_gencode() 84 | 85 | def test_load_content(self): 86 | self.src.load_content() 87 | -------------------------------------------------------------------------------- /tests/data_source/test_metabolite_nosql.py: -------------------------------------------------------------------------------- 1 | '''Tests of metabolite_nosql 2 | 3 | :Author: Zhouyang Lian 4 | :Author: Jonathan 5 | :Date: 2019-04-02 6 | :Copyright: 2019, Karr Lab 7 | :License: MIT 8 | ''' 9 | 10 | import unittest 11 | import shutil 12 | import tempfile 13 | from datanator.data_source import metabolite_nosql 14 | import datanator.config.core 15 | import os 16 | import json 17 | 18 | 19 | class TestMetaboliteNoSQL(unittest.TestCase): 20 | 21 | @classmethod 22 | def setUpClass(cls): 23 | cls.cache_dirname = tempfile.mkdtemp() 24 | cls.source = 'ecmdb' # 'ymdb' or 'ecmdb' 25 | cls.db = 'datanator' 26 | username = datanator.config.core.get_config()['datanator']['mongodb']['user'] 27 | password = 
datanator.config.core.get_config()['datanator']['mongodb']['password'] 28 | MongoDB = datanator.config.core.get_config()['datanator']['mongodb']['server'] 29 | port = datanator.config.core.get_config()['datanator']['mongodb']['port'] 30 | replSet = datanator.config.core.get_config()['datanator']['mongodb']['replSet'] 31 | cls.output_directory = cls.cache_dirname # directory to store JSON files 32 | cls.src = metabolite_nosql.MetaboliteNoSQL(cls.output_directory, 33 | cls.source, MongoDB, cls.db, verbose = True, max_entries=20, 34 | username = username, password = password) 35 | cls.client, cls.db_obj, cls.collection = cls.src.con_db(cls.source) 36 | 37 | @classmethod 38 | def tearDownClass(cls): 39 | shutil.rmtree(cls.cache_dirname) 40 | cls.client.close() 41 | 42 | @unittest.skip('ecmdb.ca and ymdb.ca xml server http 500 error') 43 | def test_write_to_json(self): 44 | session = self.src.write_to_json() 45 | null = None 46 | if self.source == 'ymdb': 47 | ymdb_6 = self.collection.find({"ymdb_id": "YMDB00006"})[0] 48 | self.assertEqual(ymdb_6['ymdb_id'], "YMDB00006") 49 | self.assertEqual(ymdb_6['species'], "Saccharomyces cerevisiae") 50 | self.assertEqual(ymdb_6['name'], "1D-Myo-inositol 1,4,5,6-tetrakisphosphate") 51 | 52 | ymdb_10 = self.collection.find({"ymdb_id": "YMDB00010"})[0] 53 | self.assertEqual(ymdb_10['ymdb_id'], "YMDB00010") 54 | self.assertEqual(ymdb_10['species'], "Saccharomyces cerevisiae") 55 | self.assertEqual(ymdb_10['wikipedia'], None) 56 | 57 | file_name = self.output_directory + '/' + 'YMDB00003.json' 58 | with open (file_name, 'r') as f: 59 | data = json.load(f) 60 | self.assertEqual(data['ymdb_id'], "YMDB00003") 61 | self.assertEqual(data['name'], "Urea") 62 | self.assertEqual(data['state'], "Solid") 63 | 64 | 65 | elif self.source == 'ecmdb': 66 | ecmdb_5 = self.collection.find({"m2m_id": "M2MDB000005"})[0] 67 | self.assertEqual(ecmdb_5['accession'], "ECMDB00023") 68 | self.assertEqual(ecmdb_5['name'], "3-Hydroxyisobutyric acid") 69 | 
self.assertEqual(ecmdb_5['chemical_formula'], "C4H8O3") 70 | 71 | ecmdb_10 = self.collection.find({"m2m_id": "M2MDB000010"})[0] 72 | self.assertEqual(ecmdb_10['accession'], "ECMDB00034") 73 | self.assertEqual(ecmdb_10['name'], "Adenine") 74 | self.assertEqual(ecmdb_10['chemical_formula'], "C5H5N5") 75 | 76 | file_name = self.output_directory + '/' + 'M2MDB000003.json' 77 | with open (file_name, 'r') as f: 78 | data = json.load(f) 79 | self.assertEqual(data['accession'], "ECMDB00014") 80 | self.assertEqual(data['name'], "Deoxycytidine") 81 | self.assertEqual(data['wikipedia'], "Deoxycytidine") 82 | 83 | else: 84 | print("Database source has to be 'ecmdb' or 'ymdb'") 85 | 86 | 87 | -------------------------------------------------------------------------------- /datanator/data_source/protein_modification/10_1093_nar_gkw1075.py: -------------------------------------------------------------------------------- 1 | """Parse tsv file generated by datanator.data_source.protein_modification.pro 2 | """ 3 | import pandas as pd 4 | from datanator_query_python.util import mongo_util 5 | from pymongo.collation import Collation, CollationStrength 6 | import numpy as np 7 | 8 | 9 | class ProteinMod(mongo_util.MongoUtil): 10 | 11 | def __init__(self, file_location, MongoDB=None, db=None, collection_str=None, username=None, 12 | password=None, authSource='admin', readPreference='nearest', verbose=True, 13 | max_entries=float('inf')): 14 | """ 15 | 16 | Args: 17 | file_location(:obj:`str`): location of csv file to be parsed. 18 | collection_str(:obj:`str`): name of collection in MongoDB to be filled. 
19 | """ 20 | super().__init__(MongoDB=MongoDB, db=db, username=username, password=password, 21 | authSource=authSource, readPreference=readPreference) 22 | self.collection = self.db_obj[collection_str] 23 | self.verbose = verbose 24 | self.file_location = file_location 25 | self.max_entries = max_entries 26 | self.collation = Collation(locale='en', strength=CollationStrength.SECONDARY) 27 | 28 | def fill_collection(self, start_row=0): 29 | """ 30 | Fill collection collection_str. 31 | 32 | Args: 33 | start_row (:obj:`int`, optional): Read from csv row. Defaults to 0. 34 | """ 35 | df = pd.read_csv(self.file_location, header=0, error_bad_lines=False, 36 | engine='c', sep='\t', 37 | low_memory=False, skiprows=start_row) 38 | df.columns = [x.lower() for x in ['PRO_id', 'UniProt_id', 'Organism', 'Unmodified_sequence_IUBMB', 39 | 'Processing', 'Deletions', 'Processsed_sequence_IUBMB', 'Processsed_formula', 40 | 'Processsed_molecular_weight', 'Processsed_charge', 'Modifications', 41 | 'Crosslinks', 'Modified_sequence_abbreviated_BpForms', 'Modified_sequence_BpForms', 42 | 'concrete', 'Modified_formula', 'Modified_molecular_weight', 'Modified_charge', 43 | 'Modifications_formula', 'Modifications_molecular_weight', 'Modifications_charge', 44 | 'PRO_issues', 'Monomeric_form_issues']] 45 | df = df.drop(columns=['organism', 'unmodified_sequence_iubmb']) 46 | for i, row in df.iterrows(): 47 | if i == self.max_entries: 48 | break 49 | if row['concrete'] == False or row['pro_issues'] == np.NAN or row['monomeric_form_issues'] == np.NAN: 50 | continue 51 | if i % 50 == 0 and self.verbose: 52 | print('Processing row {}'.format(i)) 53 | uniprot_id = row['uniprot_id'][:6] 54 | row['reference'] = {'doi': '10.1093/nar/gkw1075'} 55 | self.collection.update_many({'uniprot_id': uniprot_id}, 56 | {'$addToSet': {'modifications': row.to_dict()}}, 57 | collation=self.collation, upsert=False) 58 | 59 | 60 | import datanator.config.core 61 | from pathlib import Path 62 | 63 | def main(): 64 | 
db = 'datanator' 65 | collection_str = 'uniprot' 66 | username = datanator.config.core.get_config()[ 67 | 'datanator']['mongodb']['user'] 68 | password = datanator.config.core.get_config( 69 | )['datanator']['mongodb']['password'] 70 | MongoDB = datanator.config.core.get_config( 71 | )['datanator']['mongodb']['server'] 72 | file_location = str(Path('~/karr_lab/datanator/docs/modified_protein_sequences/pro.out.tsv').expanduser()) 73 | manager = ProteinMod(file_location, MongoDB=MongoDB, db=db, collection_str=collection_str, 74 | username=username, password=password) 75 | manager.fill_collection(start_row=None) 76 | 77 | if __name__ == '__main__': 78 | main() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Documentation](https://readthedocs.org/projects/datanator/badge/?version=latest)](http://docs.karrlab.org/datanator) 2 | [![Test results](https://circleci.com/gh/KarrLab/datanator.svg?style=shield)](https://circleci.com/gh/KarrLab/datanator) 3 | [![Test coverage](https://coveralls.io/repos/github/KarrLab/datanator/badge.svg)](https://coveralls.io/github/KarrLab/datanator) 4 | [![Code analysis](https://api.codeclimate.com/v1/badges/e9b796130e29aee4672f/maintainability)](https://codeclimate.com/github/KarrLab/datanator) 5 | [![License](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE) 6 | 7 | # Datanator: Toolkit for discovering and aggregating data for whole-cell modeling 8 | 9 | ## Contents 10 | * [Overview](#overview) 11 | * [Installation instructions and documentation](#installation-instructions-and-documentation) 12 | * [Testing Datanator](#testing-datanator) 13 | * [License](#license) 14 | * [Development team](#development-team) 15 | * [Questions and comments](#questions-and-comments) 16 | 17 | ## Overview 18 | Extensive data is needed to build comprehensive predictive models of cells. 
Although the literature and public repositories contain extensive data about cells, this data is hard to utilize for modeling because it is scattered across a large number of sources; because it is described with inconsistent identifiers, units, and data models; and because there are few tools for finding relevant data for modeling specific species and environmental conditions. 19 | 20 | Datanator is a software tool for discovering, aggregating, and integrating the data needed for modeling cells. This includes metabolite, RNA, and protein abundances; protein complex compositions; transcription factor binding motifs; and kinetic parameters. Datanator is particularly useful for building large models, such as whole-cell models, that require large amounts of data to constrain large numbers of parameters. 21 | 22 | This package contains the source code for Datanator. The data aggregated with Datanator is available at [https://www.datanator.info](https://www.datanator.info). The data is also available for download as a MongoDB snapshot from [Zenodo](https://doi.org/10.5281/zenodo.3971048). 23 | 24 | ## Installation instructions and documentation 25 | Please see the [documentation](http://docs.karrlab.org/datanator) for installation instructions, user instructions, and code documentation. 26 | 27 | Note: Datanator only supports Python 3. 28 | 29 | If one needs to use the datanator database hosted by the Karr Lab, one will need the `karr_lab_build_config` repository saved 30 | as `.wc` in the user home directory. 31 | 32 | 33 | ## Testing Datanator 34 | To ensure Datanator works properly, we have developed extensive unit tests of every aspect of `datanator`.
We recommend using `pytest` to run these tests as follows: 35 | 36 | ``` 37 | python3 -m pytest tests 38 | ``` 39 | 40 | ## License 41 | We aim to provide data and tools for working with this data with no additional restrictions beyond those imposed by the third-party data sources and software libraries used to construct Datanator. 42 | 43 | The content of the Datanator database is a compilation of data curated by the Datanator Team and data aggregated from third-party databases. The copyrightable content curated by the Datanator Team is released under the Creative Commons 1.0 Universal (CC0) [License](LICENSE-DATA). The content from third-party databases is available under the licenses summarized [here](LICENSE-THIRD-PATRY-DATA). 44 | 45 | The structure of the database is released under the CC0 [License](LICENSE-DATABASE-STRUCTURE). This software is released open-source under the MIT [License](LICENSE). 46 | 47 | ## Development team 48 | Datanator was developed by the [Karr Lab](https://www.karrlab.org) at the Icahn School of Medicine at Mount Sinai in New York, US. 49 | 50 | * Yosef Roth 51 | * Zhouyang Lian 52 | * Saahith Pochiraju 53 | * Balazs Szigeti 54 | * Jonathan Karr 55 | 56 | ## Questions and comments 57 | Please contact the [Karr Lab](https://www.karrlab.org) with any questions or comments. 
58 | -------------------------------------------------------------------------------- /datanator/data_source/protein_localization/parse_psortdb_experimental.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | import numpy as np 4 | 5 | class NpEncoder(json.JSONEncoder): 6 | def default(self, obj): 7 | """ 8 | Converts the dictionary's values into a JSON serializable data type 9 | 10 | """ 11 | if isinstance(obj, np.integer): 12 | return int(obj) 13 | elif isinstance(obj, np.floating): 14 | return float(obj) 15 | elif isinstance(obj, np.ndarray): 16 | return obj.tolist() 17 | else: 18 | return super(NpEncoder, self).default(obj) 19 | 20 | class ParsePsortExperimental: 21 | def __init__(self, max_entries): 22 | self.max_entries = max_entries 23 | 24 | def parse_psortdb(self): 25 | """ 26 | To parse database psortdb Experimental-PSORTdb-v4.00.tsv file 27 | and create JSON files conforming to datanator_pattern/observation_compiled.json 28 | 29 | Args: 30 | max_entries: int 31 | number of rows to parse. 
32 | A JSON file will be created for each of the tsv file's first rows 33 | 34 | Return: 35 | () 36 | """ 37 | data=pd.read_csv('Experimental-PSORTdb-v4.00.tsv',delimiter="\t") 38 | data = data.where(pd.notnull(data), None) 39 | for i in range(self.max_entries): 40 | d={} 41 | #entity 42 | d["entity"]={} 43 | d["entity"]["type"]="protein" 44 | d["entity"]["name"]=str(data.iloc[i,6]).replace(".","") 45 | if data.iloc[i,7] != None: 46 | d["entity"]["synonyms"]=str(data.iloc[i,7]).split(",") 47 | else: 48 | d["entity"]["synonyms"]=[] 49 | #identifiers 50 | d["entity"]["identifiers"]=[] 51 | uniprot={} 52 | uniprot["name_space"]="uniprot_id" 53 | uniprot["value"]=data.iloc[i,0] 54 | ref_seq = {} 55 | ref_seq["name_space"]="Refseq_Accession" 56 | ref_seq["value"]=data.iloc[i,1] 57 | other_accession = {} 58 | other_accession["name_space"]="Other_Accession" 59 | other_accession["value"]=data.iloc[i,2] 60 | d["entity"]["identifiers"].append(uniprot) 61 | d["entity"]["identifiers"].append(ref_seq) 62 | d["entity"]["identifiers"].append(other_accession) 63 | 64 | #localizations 65 | d["value"]={} 66 | if data.iloc[i,3] != None: 67 | d["value"]["experimental_localization"] = str(data.iloc[i,3]).split(",") 68 | else: 69 | d["value"]["experimental_localization"] = [] 70 | if data.iloc[i,4] != None: 71 | d["value"]["secondary_localizaton"] = str(data.iloc[i,4]).split(",") 72 | else: 73 | d["value"]["secondary_localizaton"] = [] 74 | 75 | #genotype 76 | d["genotype"]={} 77 | d["genotype"]["taxon"]={} 78 | d["genotype"]["taxon"]["ncbi_taxonomy_id"]=data.iloc[i,9] 79 | d["genotype"]["taxon"]["name"]=data.iloc[i,10] 80 | 81 | #environment 82 | d["environment"]={} 83 | d["environment"]["GramStain"]=data.iloc[i,13] 84 | 85 | #source 86 | d["source"]={} 87 | d["source"]["namespace"]="ePSORTdb" 88 | d["source"]["value"]="Version "+str(data.iloc[i,17]) 89 | 90 | #name is the JSON file's name 91 | if (data.iloc[i,0]!=None): 92 | name = data.iloc[i,0] #SwissProt_ID 93 | else: 94 | name = 
data.iloc[i,2] #Other_Accession 95 | with open("Experimental_PSortdb/"+name+".json","w+") as f: 96 | json.dump(d,f,cls=NpEncoder,indent=4) 97 | 98 | p1=ParsePsortExperimental(10) 99 | p1.parse_psortdb() 100 | -------------------------------------------------------------------------------- /datanator/schema_2/migrate_metabolites_meta.py: -------------------------------------------------------------------------------- 1 | from datanator_query_python.config import motor_client_manager, config 2 | from datanator.util import calc_tanimoto 3 | import simplejson as json 4 | import asyncio 5 | from pymongo import UpdateOne 6 | from pymongo.errors import BulkWriteError 7 | from pprint import pprint 8 | import os 9 | 10 | 11 | class MigrateMM: 12 | 13 | def __init__(self, collection="metabolites_meta", to_database="datanator-test", 14 | from_database="datanator", max_entries=float("inf")): 15 | self.collection = collection 16 | self.from_database = from_database 17 | self.to_database = to_database 18 | self.from_collection = motor_client_manager.client.get_database(from_database)[collection] 19 | self.to_collection = motor_client_manager.client.get_database(to_database)[collection] 20 | self.max_entries = max_entries 21 | self.calc_tanimoto = calc_tanimoto.CalcTanimoto(MongoDB=config.Config.SERVER, 22 | password=os.getenv("{}_PASSWORD".format(motor_client_manager.where)), 23 | username=os.getenv(motor_client_manager.where), 24 | db=from_database) 25 | 26 | async def index_primary(self, _key, background=True): 27 | """Index key (single key ascending) 28 | 29 | Args: 30 | _key(:obj:`str`): Name of key to be indexed 31 | """ 32 | await self.to_collection.create_index(_key, background=background) 33 | 34 | async def process_cursor(self, skip=0): 35 | """Transform data and move to new database 36 | 37 | Args: 38 | docs(:obj:`pymongo.Cursor`): documents to be processed 39 | """ 40 | bulk_write = [] 41 | query = {} 42 | if self.max_entries == float('inf'): 43 | limit = 0 44 | else: 
45 | limit = self.max_entries 46 | docs = self.from_collection.find(filter=query, projection={'_id': 0}, 47 | no_cursor_timeout=True, batch_size=10, 48 | skip=skip, limit=limit) 49 | i = 0 50 | async for doc in docs: 51 | i += 1 52 | if i == self.max_entries: 53 | break 54 | if i != 0 and i % 50 == 0: 55 | print("Processing file {}".format(i + skip)) 56 | try: 57 | self.to_collection.bulk_write(bulk_write) 58 | bulk_write = [] 59 | except BulkWriteError as bwe: 60 | pprint(bwe.details) 61 | bulk_write = [] 62 | similar_compound = list(doc.get("similar_compounds")[0].keys())[0] 63 | if len(similar_compound) > 30: #sha256 string 64 | doc["similar_compounds"] = [] 65 | inchi = doc.get("inchi") 66 | sorted_coeff, sorted_inchi = self.calc_tanimoto.one_to_many(inchi) 67 | for num, inc in zip(sorted_coeff, sorted_inchi): 68 | doc["similar_compounds"].append({"inchikey": inc, "similarity_score": num}) 69 | else: 70 | similar_compounds = doc.get("similar_compounds") 71 | doc["similar_compounds"] = [] 72 | for item in similar_compounds: 73 | doc["similar_compounds"].append({"inchikey": list(item.keys())[0], "similarity_score": list(item.values())[0]}) 74 | doc["schema_version"] = "2" 75 | bulk_write.append(UpdateOne({'InChI_Key': doc.get("InChI_Key")}, {'$set': json.loads(json.dumps(doc, ignore_nan=True))}, upsert=True)) 76 | if len(bulk_write) != 0: 77 | try: 78 | self.to_collection.bulk_write(bulk_write) 79 | except BulkWriteError as bwe: 80 | pprint(bwe.details) 81 | finally: 82 | print("Done.") 83 | 84 | 85 | def main(): 86 | loop = asyncio.get_event_loop() 87 | src = MigrateMM() 88 | src.index_primary('InChI_Key') 89 | loop.run_until_complete(src.process_cursor(skip=4791)) 90 | 91 | if __name__ == '__main__': 92 | main() 93 | 94 | -------------------------------------------------------------------------------- /tests/data_source/test_protein_aggregate.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from 
datanator.data_source import protein_aggregate 3 | import tempfile 4 | import shutil 5 | import json 6 | import os 7 | import datanator.config.core 8 | 9 | 10 | class TestProteinAggregate(unittest.TestCase): 11 | 12 | @classmethod 13 | def setUpClass(cls): 14 | cls.cache_dirname = tempfile.mkdtemp() 15 | cache_dir = os.path.join(cls.cache_dirname, 'logs.txt') 16 | src_db = 'datanator' 17 | des_db = 'test' 18 | cls.collection_str = 'test_protein_aggregate' 19 | username = datanator.config.core.get_config()[ 20 | 'datanator']['mongodb']['user'] 21 | password = datanator.config.core.get_config( 22 | )['datanator']['mongodb']['password'] 23 | server = datanator.config.core.get_config( 24 | )['datanator']['mongodb']['server'] 25 | port = datanator.config.core.get_config( 26 | )['datanator']['mongodb']['port'] 27 | cls.src = protein_aggregate.ProteinAggregate(username=username, password=password, server=server, 28 | authSource='admin', src_database=src_db, max_entries=20, 29 | verbose=True, collection=cls.collection_str, destination_database=des_db, 30 | cache_dir=cache_dir) 31 | 32 | @classmethod 33 | def tearDownClass(cls): 34 | shutil.rmtree(cls.cache_dirname) 35 | # cls.src.db.drop_collection(cls.collection_str) 36 | cls.src.client.close() 37 | 38 | # # @unittest.skip('passed') 39 | # def test_load_abundance_from_pax(self): 40 | # self.src.load_abundance_from_pax() 41 | # doc = self.src.col.find_one(filter={'uniprot_id': 'Q72DI0'}) 42 | # self.assertTrue('abundances' in doc.keys()) 43 | # self.assertTrue('ncbi_taxonomy_id' in doc.keys()) 44 | 45 | # # @unittest.skip('passed') 46 | # def test_load_ko(self): 47 | # self.src.col.insert_one({'uniprot_id': 'a_mock_value', 48 | # 'gene_name': 'gdh'}) #insert a mock document 49 | # self.src.load_ko() 50 | # doc = self.src.col.find_one(filter={'uniprot_id': 'a_mock_value'}) 51 | # self.assertTrue('ko_number' in doc.keys()) 52 | 53 | # # @unittest.skip('passed') 54 | # def test_load_taxon(self): 55 | # 
self.src.col.insert_one({'ncbi_taxonomy_id': 9606, 56 | # 'uniprot_id': 'taxon_mock_value'}) 57 | # self.src.load_taxon() 58 | # doc = self.src.col.find_one(filter={'uniprot_id': 'taxon_mock_value'}) 59 | # self.assertTrue('ancestor_name' in doc.keys()) 60 | 61 | # def test_load_unreviewed_abundance(self): 62 | # dic_0 = {'observation': [{'protein_id': {'string_id': 'string_mock_0', 'uniprot_id': 'id_mock_0'}, 63 | # 'string_id': 'string_mock_0', 'abundance': 0 }], 'ncbi_id': 0, 'species_name': 'name_mock_0', 'organ': 'organ_0'} 64 | # dic_1 = {'observation': [{'protein_id': {'string_id': 'string_mock_1', 'uniprot_id': 'id_mock_1'}, 65 | # 'string_id': 'string_mock_1', 'abundance': 1 }], 'ncbi_id': 1, 'species_name': 'name_mock_1', 'organ': 'organ_1'} 66 | # dic_2 = {'uniprot_id': 'id_mock_0', 'abundances': []} 67 | # dic_3 = {'uniprot_id': 'Q72DIO'} 68 | # self.src.col.insert_many([dic_2, dic_3]) 69 | # self.src.load_unreviewed_abundance() 70 | # doc = self.src.col.find_one(filter={'species_name': 'D.vulgaris'}) 71 | # self.assertEqual(doc['ncbi_taxonomy_id'], 882) 72 | 73 | @unittest.skip('removed the function') 74 | def test_loadload_kinlaw_from_sabio(self): 75 | dic_0 = {'uniprot_id': 'P20932'} 76 | dic_1 = {'uniprot_id': 'id_mock_1', 'protein_name': 'subtilisin'} 77 | dic_2 = {'uniprot_id': 'P16064', 'protein_name': 'subtilisin'} 78 | self.src.col.insert_many([dic_0, dic_1, dic_2]) 79 | self.src.load_kinlaw_from_sabio() 80 | result_0 = self.src.col.find_one({'uniprot_id': 'P20932'}) 81 | self.assertTrue('kinetics' in list(result_0.keys())) 82 | self.assertTrue({'kinlaw_id': 17, 'ncbi_taxonomy_id': 303} in result_0['kinetics']) 83 | result_1 = self.src.col.find_one({'uniprot_id': 'P16064'}) 84 | self.assertTrue({'kinlaw_id': 1, 'ncbi_taxonomy_id': 1467} in result_1['kinetics']) 85 | 86 | --------------------------------------------------------------------------------