├── .github └── workflows │ └── tests.yml ├── .gitignore ├── .readthedocs.yaml ├── LICENSE ├── MANIFEST.in ├── README.md ├── benchmarker ├── __init__.py ├── cli.py ├── util.py └── viewer_app │ ├── app.py │ └── benchmark.html ├── demos ├── api_structure.png ├── api_structure_future.png ├── db_basic_structure.png ├── indra_db.png └── indra_db_description_and_demo.ipynb ├── doc ├── Makefile ├── conf.py ├── ext │ └── citations.py ├── index.rst ├── indra_db_logo.png ├── license.rst ├── modules │ ├── cli │ │ └── index.rst │ ├── client │ │ ├── index.rst │ │ ├── misc.rst │ │ ├── principal │ │ │ └── index.rst │ │ └── readonly │ │ │ └── index.rst │ ├── index.rst │ ├── misc.rst │ ├── preassembly │ │ └── index.rst │ ├── reading │ │ └── index.rst │ ├── schemas │ │ └── index.rst │ └── util │ │ └── index.rst ├── requirements.txt ├── rest_api_doc │ └── readme_link.rst ├── web_ui_doc │ └── index.rst └── web_ui_results_expanded.png ├── docker ├── Dockerfile └── buildspec.yml ├── indra_db ├── __init__.py ├── belief.py ├── cli │ ├── __init__.py │ ├── content.py │ ├── dump.py │ ├── elsevier_titles.txt │ ├── knowledgebase.py │ ├── preassembly.py │ ├── reading.py │ ├── util.py │ └── xdd.py ├── client │ ├── __init__.py │ ├── datasets.py │ ├── principal │ │ ├── __init__.py │ │ ├── content.py │ │ ├── curation.py │ │ ├── pa_statements.py │ │ └── raw_statements.py │ ├── readonly │ │ ├── __init__.py │ │ ├── mesh_ref_counts.py │ │ ├── query.py │ │ └── util.py │ └── statements.py ├── config.py ├── copy_utils.py ├── databases.py ├── exceptions.py ├── preassembly │ ├── preassemble_db.py │ └── submitter.py ├── reading │ ├── __init__.py │ ├── read_db.py │ ├── read_db_aws.py │ └── submitter.py ├── readonly_dumping │ ├── README.md │ ├── __init__.py │ ├── export_assembly.py │ ├── export_assembly_refinement.py │ ├── locations.py │ ├── rds_restore.sh │ ├── readonly_dumping.py │ ├── readonly_dumping_bash.sh │ └── util.py ├── resources │ ├── __init__.py │ ├── build_sample_set.py │ └── default_db_config.ini ├── schemas │ ├── __init__.py │ ├── indexes.py │ ├── mixins.py │ ├── principal_schema.py │ └── readonly_schema.py ├── tests │ ├── README.md │ ├── db_building_util.py │ ├── test_belief.py │ ├── test_config.py │ ├── test_content_manager.py │ ├── test_content_scripts.py │ ├── test_copy.py │ ├── test_dump_manager.py │ ├── test_kbs.py │ ├── test_preassembly.py │ ├── test_principal_client.py │ ├── test_query.py │ ├── test_reading.py │ ├── test_readonly_pipeline.py │ ├── test_setup.py │ ├── test_sif_dumper.py │ ├── test_xdd_manager.py │ └── util.py └── util │ ├── __init__.py │ ├── aws.py │ ├── build_corpus.py │ ├── constructors.py │ ├── content_scripts.py │ ├── data_gatherer.py │ ├── distill_statements.py │ ├── dump_sif.py │ ├── helpers.py │ ├── insert.py │ └── s3_path.py ├── indra_db_service ├── README.md ├── __init__.py ├── api.py ├── call_handlers.py ├── cli │ ├── __init__.py │ ├── __main__.py │ └── zappa_tools.py ├── config.py ├── data-vis │ ├── .gitignore │ ├── README.md │ ├── babel.config.js │ ├── package.json │ ├── public │ │ └── index.html │ ├── src │ │ ├── App.vue │ │ ├── components │ │ │ ├── AmountView │ │ │ │ ├── AmountView.vue │ │ │ │ ├── LineChart.vue │ │ │ │ └── index.js │ │ │ ├── TimeView │ │ │ │ ├── TimeView.vue │ │ │ │ └── index.js │ │ │ └── index.js │ │ ├── index.js │ │ └── main.js │ └── vue.config.js ├── errors.py ├── gunicorn.conf.py ├── sample_hashes.pkl ├── search_introduction.md ├── static │ └── curationFunctions.js ├── templates │ ├── daily_data.html │ ├── idbr_description.html │ ├── idbr_statements_view.html │ 
├── idbr_template.html │ ├── search.html │ ├── search_statements.html │ └── welcome.html ├── test_api.py └── util.py └── setup.py /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v2 10 | - uses: actions/cache@v2 11 | with: 12 | path: ~/.cache/pip 13 | key: ${{ runner.os }}-pip-${{ hashFiles('**/setup.py') }} 14 | restore-keys: | 15 | ${{ runner.os }}-pip- 16 | - name: Set up Python 3.6 17 | uses: actions/setup-python@v2 18 | with: 19 | python-version: 3.6 20 | - name: Install dependencies 21 | run: | 22 | echo $GITHUB_EVENT_NAME 23 | export WRKDIR=`pwd` 24 | echo "home dir:" $WRKDIR 25 | sudo apt-get update 26 | sudo apt-get install libstdc++6 graphviz python3-dev libgraphviz-dev pkg-config 27 | # Install test/github-workflows-specific dependencies not covered elsewhere 28 | pip install jsonschema coverage nose-timer doctest-ignore-unicode awscli pycodestyle 29 | pip install cython psycopg2 30 | # Now install INDRA DB with all its extras 31 | pip install git+https://github.com/pagreene/indra.git@api-update 32 | cd .. 33 | git clone https://github.com/indralab/ui_util 34 | cd ui_util/indralab_auth_tools 35 | echo "indralab_auth_tools dir:" pwd 36 | pip install . 37 | cd $WRKDIR 38 | echo "indra_db dir:" pwd 39 | pip install .[all] 40 | - name: Run API tests 41 | env: 42 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 43 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 44 | INDRADBPRIMARY: ${{ secrets.INDRADBPRIMARY }} 45 | INDRAROPRIMARY: ${{ secrets.INDRAROPRIMARY }} 46 | SUPERSECRETSECREST: ${{ secrets.SUPERSECRETSECRET }} 47 | run: | 48 | # Set nose attributes based on the context in which we are running 49 | export NOSEATTR="!notravis,!slow,!cron"; 50 | export NOSEATTR=$(if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then echo $NOSEATTR,!nonpublic; else echo $NOSEATTR; fi) 51 | echo $NOSEATTR 52 | # These are files that are ignored so that doctests don't fail 53 | export NOSE_IGNORE_FILES="find_full_text_sentence.py"; 54 | 55 | echo $NOSEATTR 56 | #- cd $TRAVIS_BUILD_DIR 57 | # Now run all INDRA DB REST API tests 58 | cd rest_api 59 | nosetests -v -a $NOSEATTR --with-coverage --cover-inclusive --cover-package=indra --with-doctest --with-doctest-ignore-unicode --with-timer --timer-top-n 10 --processes=0 60 | #- name: Run all other tests 61 | # env: 62 | # AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 63 | # AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 64 | # INDRADBPRIMARY: ${{ secrets.INDRADBPRIMARY }} 65 | # INDRAROPRIMARY: ${{ secrets.INDRAROPRIMARY }} 66 | # run: | 67 | # # Set nose attributes based on the context in which we are running 68 | # export NOSEATTR="!notravis,!slow,!cron"; 69 | # export NOSEATTR=$(if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then echo $NOSEATTR,!nonpublic; else echo $NOSEATTR; fi) 70 | # echo $NOSEATTR 71 | # # These are files that are ignored so that doctests don't fail 72 | # export NOSE_IGNORE_FILES="find_full_text_sentence.py"; 73 | # echo $NOSEATTR 74 | # #- cd $TRAVIS_BUILD_DIR 75 | # # Now run all INDRA DB REST API tests 76 | # cd indra_db 77 | # nosetests -v -a $NOSEATTR --with-coverage --cover-inclusive --cover-package=indra --with-doctest --with-doctest-ignore-unicode --with-timer --timer-top-n 10 --processes=0 78 | 79 | -------------------------------------------------------------------------------- 
/.gitignore: --------------------------------------------------------------------------------
1 | # INDRA DB specific ignores
2 | junk*
3 |
4 | # Some generic ignores
5 | __pycache__/
6 | *.py[cod]
7 | *.so
8 | env/
9 | bin/
10 | build/
11 | _*
12 |
13 | # Other
14 | *.txt
15 | *.cx
16 | *.zip
17 | *.csv
18 | *.java
19 | *.xbel
20 | *.tsv
21 | *.ai
22 | *.png
23 | *.eps
24 | *.gz
25 | *.swp
26 | *.pkl
27 |
28 | # For the cool cats using PyCharm
29 | .idea
30 | .idea/*
31 |
32 | .pytest_cache
33 |
34 | # Mr Developer
35 | .mr.developer.cfg
36 | .project
37 | .pydevproject
38 |
39 | # Django stuff:
40 | *.log
41 | *.pot
42 |
43 | # Data files
44 | *.rdf
45 | *.owl
46 | *.xml
47 | *.nxml
48 | *.bel
49 | *.json
50 | *.bson
51 | *.dat
52 |
53 | # Documents, graphs, images
54 | *.pdf
55 | *.dot
56 | *.bngl
57 | *.jpg
58 |
59 | -------------------------------------------------------------------------------- /.readthedocs.yaml: --------------------------------------------------------------------------------
1 | # .readthedocs.yaml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | version: 2
6 |
7 | # Set the version of Python and other tools you might need
8 | build:
9 | os: ubuntu-20.04
10 | tools:
11 | python: "3.9"
12 |
13 | python:
14 | install:
15 | - requirements: doc/requirements.txt
16 | -------------------------------------------------------------------------------- /MANIFEST.in: --------------------------------------------------------------------------------
1 | include LICENSE
2 | include README.md
3 | include indra_db/resources/default_db_config.ini
4 | -------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # INDRA DB
2 |
3 |
4 |
5 | The INDRA (Integrated Network and Dynamical Reasoning Assembler) Database is a
6 | framework for creating, maintaining, and accessing a database of content,
7 | readings, and statements. This implementation is currently designed to work
8 | primarily with Amazon Web Services RDS running Postgres 9+. Used as a backend
9 | to INDRA, the INDRA Database provides a systematic way of scaling the knowledge
10 | acquired from other databases, reading, and manual input, and puts that
11 | knowledge at your fingertips through a direct Python client and a REST API.
12 |
13 | ### REST API
14 |
15 | The INDRA DB is available via a web UI at: https://db.indra.bio
16 |
17 | At the same URL, a REST service is also available, which allows for programmatic usage
18 | as documented here: https://github.com/gyorilab/indra_db/blob/master/indra_db_service/README.md
19 |
20 | A convenient way to query the INDRA DB is via INDRA's built-in client for the INDRA DB,
21 | which is documented here: https://indra.readthedocs.io/en/latest/modules/sources/indra_db_rest/index.html.
22 |
23 | ### Knowledge sources
24 |
25 | The INDRA Database currently integrates and distills knowledge from several
26 | different sources, both biology-focused natural language processing systems and
27 | other pre-existing databases.
28 |
29 | #### Daily Readers
30 | We have read all available content, and every day we run the following readers:
31 | - [REACH](https://github.com/clulab/reach)
32 | - [Sparser](https://github.com/ddmcdonald/sparser)
33 |
34 | We read all new content with the following readers:
35 | - [Eidos](https://github.com/clulab/eidos)
36 | - [ISI](https://github.com/sgarg87/big_mech_isi_gg)
37 | - [MTI](https://ii.nlm.nih.gov/MTI/index.shtml) - used specifically to tag
38 | content with topic terms.
39 |
40 | We read a limited subset of new content with the following readers:
41 | - [TRIPS](http://trips.ihmc.us/parser/cgi/drum)
42 |
43 | All of this reading is done on the latest content drawn from:
44 | - [PubMed](https://www.ncbi.nlm.nih.gov/pubmed/) - ~19 million abstracts and ~29 million titles
45 | - [PubMed Central](https://www.ncbi.nlm.nih.gov/pmc/) - ~2.7 million full-text articles
46 | - [Elsevier](https://www.elsevier.com/) - ~0.7 million full-text articles
47 | (requires special access)
48 |
49 | #### Other Readers
50 | We also include largely static content extracted by the following readers:
51 | - [RLIMS-P](https://research.bioinformatics.udel.edu/rlimsp/)
52 |
53 | #### Other Databases
54 | We include information from these pre-existing databases:
55 | - [Pathway Commons database](http://pathwaycommons.org/)
56 | - [BEL Large Corpus](https://github.com/OpenBEL/)
57 | - [SIGNOR](https://signor.uniroma2.it/)
58 | - [BioGRID](https://thebiogrid.org/)
59 | - [TAS](https://www.biorxiv.org/content/10.1101/358978v1)
60 | - [TRRUST](https://omictools.com/trrust-tool)
61 | - [PhosphoSitePlus](https://www.phosphosite.org/)
62 | - [Causal Biological Networks Database](http://www.causalbionet.com/)
63 | - [VirHostNet](http://virhostnet.prabi.fr/)
64 | - [CTD](http://ctdbase.org/)
65 | - [Phospho.ELM](http://phospho.elm.eu.org/)
66 | - [DrugBank](https://www.drugbank.ca/)
67 | - [CONIB](https://pharmacome.github.io/conib/)
68 | - [CRoG](https://github.com/chemical-roles/chemical-roles)
69 | - [DGI](https://www.dgidb.org/)
70 |
71 | These databases are retrieved primarily using the tools in `indra.sources`. The
72 | statements extracted from all of these sources are stored and updated in the
73 | database.
74 |
75 | ### Knowledge Assembly
76 |
77 | The INDRA Database uses the powerful internal assembly tools available in INDRA,
78 | implemented here for large-scale incremental assembly. The resulting corpus of
79 | cleaned and de-duplicated statements, each with fully maintained provenance, is
80 | the primary product of the database.
81 |
82 | For more details on the internal assembly process of INDRA, see the
83 | [INDRA documentation](http://indra.readthedocs.io/en/latest/modules/preassembler).
84 |
85 | ### Access
86 |
87 | The content in the database can be accessed by those who created it using the
88 | `indra_db.client` submodule. This repo also implements a REST API which can be
89 | used by those without direct access to the database. For access to our REST
90 | API, please contact the authors.
91 |
92 | ## Installation
93 |
94 | The INDRA Database requires Python 3.6+, though some parts are still compatible with 3.5.
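Once the installation steps below are complete, a quick way to confirm that the package is importable and can see a configured database is a short Python session. This is only a minimal sketch: the `'primary'` label is an example name and must correspond to a database defined in your configuration (see `indra_db/resources/default_db_config.ini` or the corresponding environment variables).

```python
# Minimal sanity check (sketch only). The 'primary' label is an assumption;
# it must match a database defined in your indra_db configuration.
from indra_db import get_db

db = get_db('primary')   # returns a database manager for the named instance
print(db.session)        # a live SQLAlchemy session if the connection works
```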
95 | 96 | First, [install INDRA](http://indra.readthedocs.io/en/latest/installation.html), 97 | then simply clone this repo, and make sure that it is visible in your 98 | `PYTHONPATH`. 99 | 100 | ## Funding 101 | The development of INDRA DB is funded under the DARPA Communicating with Computers program (ARO grant W911NF-15-1-0544). 102 | -------------------------------------------------------------------------------- /benchmarker/__init__.py: -------------------------------------------------------------------------------- 1 | from .util import * 2 | -------------------------------------------------------------------------------- /benchmarker/cli.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import webbrowser 4 | from time import sleep 5 | 6 | from numpy import array 7 | from datetime import datetime 8 | from collections import defaultdict 9 | from typing import Iterable 10 | 11 | import click 12 | 13 | from benchmarker.util import benchmark, list_apis, list_stacks, save_results 14 | 15 | 16 | HERE = os.path.dirname(os.path.abspath(__file__)) 17 | 18 | 19 | @click.group() 20 | def main(): 21 | """The benchmarker CLI. 22 | 23 | The benchmarker tool allows stack deployments to be 24 | compared based on the time taken to run existing test corpora that utilize 25 | the web service. 26 | """ 27 | 28 | 29 | @main.command('list') 30 | @click.argument("list_scope", type=click.Choice(["apis", "stacks"]), 31 | required=False) 32 | def print_list(list_scope): 33 | """List the apis or stacks that have already been used.""" 34 | def print_apis(): 35 | print() 36 | print("Existing API Test Corpora") 37 | print("-------------------------") 38 | for api in list_apis(): 39 | print(api) 40 | 41 | def print_stacks(): 42 | print() 43 | print("Existing Tested Stacks") 44 | print("----------------------") 45 | for stack_name in list_stacks(): 46 | print(stack_name) 47 | 48 | if list_scope == 'apis': 49 | print_apis() 50 | elif list_scope == 'stacks': 51 | print_stacks() 52 | else: 53 | print_apis() 54 | print_stacks() 55 | 56 | 57 | @main.command() 58 | @click.argument("test_corpus") 59 | @click.argument("stack_name") 60 | @click.argument("api_name") 61 | @click.option("-r", "--inner-runs", default=1, 62 | type=click.IntRange(1, 100), 63 | help="Select the number of times to repeat the test in a row.") 64 | @click.option("-R", "--outer-runs", default=1, 65 | type=click.IntRange(1, 100), 66 | help=("Select the number of times to repeat the entire suite of " 67 | "tests.")) 68 | def run(test_corpus, stack_name, api_name, inner_runs, outer_runs): 69 | """Run the benchmarker and save the aggregate the results. 70 | 71 | \b 72 | The TEST_CORPUS should be a path to a python test file that tests the INDRA 73 | Database REST service, using the standard convention: 74 | 75 | "path/to/test_file.py:test_function" 76 | 77 | The STACK_NAME should name a readonly-build stack (database and service 78 | deployment) that are being tested. You can get a list of existing 79 | (previously tested) stacks using `indra_db_benchmarker list`. 80 | 81 | The API_NAME should give a name for the test corpus that is being used. You 82 | can get a list of existing (previously used) corpora using the `list` 83 | feature. 84 | """ 85 | import tabulate 86 | start_time = datetime.utcnow() 87 | 88 | # Run the benchmarker. Run it `outer_run` times, and we will aggregate 89 | # the results below. 
90 | result_list = [] 91 | test_names = [] 92 | for i in range(outer_runs): 93 | run_result = benchmark(test_corpus, num_runs=inner_runs) 94 | if not test_names: 95 | test_names = list(run_result.keys()) 96 | result_list.append(run_result) 97 | 98 | # Aggregate the results from above, either adding values to the list 99 | # or extending a list. 100 | results = {} 101 | for test_name in test_names: 102 | test_results = defaultdict(list) 103 | for this_result in result_list: 104 | test_data = this_result[test_name] 105 | for data_name, data_val in test_data.items(): 106 | if isinstance(data_val, Iterable): 107 | test_results[data_name].extend(data_val) 108 | else: 109 | test_results[data_name].append(data_val) 110 | 111 | # Convert the default dict into a real dict. 112 | test_results = dict(test_results) 113 | 114 | # Turn the time data into an array, and calculate mean and std dev. 115 | time_data = array(test_results['times']) 116 | test_results['duration'] = time_data.mean() 117 | test_results['deviation'] = time_data.std() 118 | 119 | # Calculate the overall pass rate. 120 | test_results['passed'] = sum(test_results['passed'])/outer_runs 121 | 122 | # Add this test's aggregated results to the results object. 123 | results[test_name] = test_results 124 | 125 | rows = [(test, st['passed'], st['duration'], st['deviation']) 126 | for test, st in results.items()] 127 | headers = ('Test', 'Fraction Passed', 'Ave. Duration', 'Std. Deviation') 128 | print(tabulate.tabulate(rows, headers)) 129 | save_results(start_time, api_name, stack_name, results) 130 | 131 | 132 | @main.command() 133 | def view(): 134 | """Run the web service to view results.""" 135 | basic_env = os.environ.copy() 136 | basic_env['FLASK_APP'] = os.path.join(HERE, "viewer_app/app.py:app") 137 | print("Starting web server...") 138 | p = subprocess.Popen(['flask', 'run', '--port', '5280'], 139 | env=basic_env, stdout=subprocess.PIPE, 140 | stderr=subprocess.PIPE) 141 | sleep(2) 142 | print("Opening browser...") 143 | webbrowser.open("http://localhost:5280") 144 | print("Press Ctrl-C to exit.") 145 | p.wait() 146 | 147 | 148 | if __name__ == "__main__": 149 | main() 150 | -------------------------------------------------------------------------------- /benchmarker/util.py: -------------------------------------------------------------------------------- 1 | __all__ = ['benchmark', 'list_apis', 'list_stacks', 'save_results'] 2 | 3 | import os 4 | import json 5 | 6 | import boto3 7 | import logging 8 | from datetime import datetime 9 | from inspect import getmembers, isfunction, isclass, ismethod 10 | from importlib.util import spec_from_file_location, module_from_spec 11 | 12 | from numpy import array 13 | 14 | 15 | logger = logging.getLogger('benchmark_tools') 16 | 17 | BUCKET = 'bigmech' 18 | PREFIX = 'indra-db/benchmarks/' 19 | 20 | 21 | def run_test(test_name, test_func, num_runs): 22 | test_results = dict.fromkeys(['passed', 'error_type', 'error_str', 23 | 'duration', 'deviation', 'times']) 24 | test_results['passed'] = False 25 | test_results['error_type'] = [None]*num_runs 26 | test_results['error_str'] = [None]*num_runs 27 | print(test_name) 28 | print('-' * len(test_name)) 29 | durations = [] 30 | for i in range(num_runs): 31 | print("LOGS:") 32 | start = datetime.now() 33 | try: 34 | test_func() 35 | print('-' * len(test_name)) 36 | print("PASSED!") 37 | test_results['passed'] += True 38 | except Exception as e: 39 | print('-' * len(test_name)) 40 | print("FAILED!", type(e), e) 41 | logger.exception(e) 42 | 
test_results['passed'] += False 43 | test_results['error_type'][i] = str(type(e)) 44 | test_results['error_str'][i] = str(e) 45 | finally: 46 | end = datetime.now() 47 | durations.append((end - start).total_seconds()) 48 | print() 49 | dur_array = array(durations) 50 | test_results['times'] = durations 51 | test_results['duration'] = dur_array.mean() 52 | test_results['deviation'] = dur_array.std() 53 | test_results['passed'] = test_results['passed'] / num_runs 54 | return test_results 55 | 56 | 57 | def benchmark(test_selection=None, base_name=None, num_runs=1): 58 | """Run a benchmark of the REST service using a given test corpus. 59 | 60 | Parameters 61 | ---------- 62 | test_selection : Optional[str] 63 | Specify the location of the test or tests you wish to run, using the 64 | standard formalism: "path/to/test.py:specific_test", where any less 65 | specification will result in a search for things that start with "test_" 66 | recursively, as usual. 67 | base_name : Optional[str] 68 | Give this benchmark a base name. 69 | num_runs : Optional[int] 70 | Specify how many times the tests should be run. 71 | """ 72 | # By default, just run in this directory 73 | if test_selection is None: 74 | test_selection = os.path.abspath('.') 75 | 76 | # Extract a function name, if it was included. 77 | if test_selection.count(':') == 0: 78 | func_name = None 79 | elif test_selection.count(':') == 1: 80 | test_selection, func_name = test_selection.split(':') 81 | else: 82 | raise ValueError(f"Invalid loc: {test_selection}") 83 | mod_name = os.path.basename(test_selection).replace('.py', '') 84 | if base_name: 85 | mod_name = base_name + '.' + mod_name 86 | 87 | # Check if the location exists, and whether it is a directory or file. 88 | # Handle the file case by recursively calling this function for each file. 89 | results = {} 90 | if not os.path.exists(test_selection): 91 | raise ValueError(f"No such file or directory: {test_selection}") 92 | elif os.path.isdir(test_selection): 93 | if func_name is not None: 94 | raise ValueError("To specify function, location must be a file.") 95 | for file in os.listdir(test_selection): 96 | new_path = os.path.join(test_selection, file) 97 | if ('test' in file and os.path.isfile(new_path) 98 | and new_path.endswith('.py')): 99 | results.update(benchmark(new_path, base_name=mod_name, 100 | num_runs=num_runs)) 101 | return results 102 | 103 | # Handle the case a file is specified. 
104 | if not test_selection.endswith('.py'): 105 | raise ValueError(f"Location {test_selection} is not a python file.") 106 | print("=" * len(test_selection)) 107 | print(test_selection) 108 | print('-' * len(test_selection)) 109 | spec = spec_from_file_location(mod_name, test_selection) 110 | test_module = module_from_spec(spec) 111 | try: 112 | spec.loader.exec_module(test_module) 113 | except KeyboardInterrupt: 114 | raise 115 | except Exception as err: 116 | logger.error(f"Failed to load {test_selection}, skipping...") 117 | logger.exception(err) 118 | return results 119 | 120 | # Run test functions 121 | tests = [f for f, _ in getmembers(test_module, isfunction) if 'test' in f] 122 | for test_name in tests: 123 | test = getattr(test_module, test_name) 124 | results[f'{mod_name}.{test_name}'] = run_test(test_name, test, num_runs) 125 | 126 | # Run test classes 127 | test_classes = [c for c, _ in getmembers(test_module, isclass) 128 | if c.lower().startswith('test')] 129 | for class_name in test_classes: 130 | cls = getattr(test_module, class_name) 131 | obj = cls() 132 | test_methods = [m for m, _ in getmembers(obj, ismethod) 133 | if m.lower().startswith('test') 134 | or m.lower() == 'run_test'] 135 | for method_name in test_methods: 136 | obj.setUp() 137 | test = getattr(obj, method_name) 138 | if method_name == 'run_test' and len(test_methods) == 1: 139 | results[f'{mod_name}.{class_name}'] = \ 140 | run_test(class_name, test, num_runs) 141 | else: 142 | results[f'{mod_name}.{class_name}.{method_name}'] = \ 143 | run_test(method_name, test, num_runs) 144 | obj.tearDown() 145 | 146 | return results 147 | 148 | 149 | def list_apis(): 150 | """List the current API names on s3.""" 151 | s3 = boto3.client('s3') 152 | res = s3.list_objects_v2(Bucket=BUCKET, Prefix=PREFIX, Delimiter='/') 153 | return [e['Prefix'][len(PREFIX):-1] for e in res['CommonPrefixes']] 154 | 155 | 156 | def list_stacks(): 157 | """List the stacks represented on s3.""" 158 | s3 = boto3.client('s3') 159 | stack_names = set() 160 | for api_name in list_apis(): 161 | try: 162 | api_prefix = f'{PREFIX}{api_name}/' 163 | res = s3.list_objects_v2(Bucket=BUCKET, Prefix=api_prefix, 164 | Delimiter='/') 165 | stack_names |= {e['Prefix'][len(api_prefix):-1] 166 | for e in res['CommonPrefixes']} 167 | except KeyError: 168 | logger.error(f"Failed to inspect {api_prefix}: likely malformed " 169 | f"content was added to s3.") 170 | continue 171 | return list(stack_names) 172 | 173 | 174 | def save_results(start_time, api_name, stack_name, results): 175 | """Save the result of a test on s3.""" 176 | s3 = boto3.client('s3') 177 | data_key = f'{PREFIX}{api_name}/{stack_name}/{start_time}.json' 178 | s3.put_object(Bucket=BUCKET, Key=data_key, Body=json.dumps(results)) 179 | return 180 | -------------------------------------------------------------------------------- /benchmarker/viewer_app/app.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import boto3 4 | import logging 5 | from os import path 6 | from flask import Flask, jsonify 7 | 8 | from benchmarker.util import list_stacks, list_apis 9 | 10 | logger = logging.getLogger('benchmark_viewer') 11 | 12 | HERE = path.dirname(__file__) 13 | 14 | app = Flask('benchmark_viewer') 15 | BUCKET = 'bigmech' 16 | PREFIX = 'indra-db/benchmarks/' 17 | 18 | 19 | def load(**kwargs): 20 | with open(path.join(HERE, 'benchmark.html'), 'r') as f: 21 | s = f.read() 22 | for key, value in kwargs.items(): 23 | s = s.replace(f'{{{{ {key} }}}}', 
json.dumps(value))
24 | return s
25 |
26 |
27 | @app.route('/', methods=['GET'])
28 | def serve_page():
29 | return load(stacks=list_stacks(), apis=list_apis())
30 |
31 |
32 | @app.route('/fetch/<corpus_name>/<stack_name>/<test_file>', methods=['GET'])
33 | def get_stack_data(corpus_name, stack_name, test_file):
34 | try:
35 | s3 = boto3.client('s3')
36 | file = s3.get_object(
37 | Bucket=BUCKET,
38 | Key=f'{PREFIX}{corpus_name}/{stack_name}/{test_file}'
39 | )
40 | data = json.loads(file['Body'].read())
41 | except Exception as e:
42 | logger.exception(e)
43 | return jsonify({'message': f'Error: {e}'}), 500
44 | return jsonify({'message': 'success', 'tests': data}), 200
45 |
46 |
47 | @app.route('/list/<corpus_name>', methods=['GET'])
48 | def list_corpus_options(corpus_name):
49 | option_dict = {}
50 | try:
51 | s3 = boto3.client('s3')
52 | prefix = f'{PREFIX}{corpus_name}/'
53 | res = s3.list_objects_v2(Bucket=BUCKET, Prefix=prefix)
54 | keys = [e['Key'][len(prefix):] for e in res['Contents']]
55 | for key in keys:
56 | stack, test = key.split('/')
57 | test_time = test.split('.')[0]
58 | label = f'{test_time} ({stack})'
59 | option_dict[label] = {'stack': stack, 'test': test}
60 | except Exception as e:
61 | logger.exception(e)
62 | return jsonify({'message': f'Error: {e}'}), 500
63 | return jsonify({'message': 'success', 'options': option_dict})
64 | -------------------------------------------------------------------------------- /demos/api_structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gyorilab/indra_db/77785ce0d1badd271b120db747abfff4d6f35832/demos/api_structure.png -------------------------------------------------------------------------------- /demos/api_structure_future.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gyorilab/indra_db/77785ce0d1badd271b120db747abfff4d6f35832/demos/api_structure_future.png -------------------------------------------------------------------------------- /demos/db_basic_structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gyorilab/indra_db/77785ce0d1badd271b120db747abfff4d6f35832/demos/db_basic_structure.png -------------------------------------------------------------------------------- /demos/indra_db.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gyorilab/indra_db/77785ce0d1badd271b120db747abfff4d6f35832/demos/indra_db.png -------------------------------------------------------------------------------- /doc/ext/citations.py: --------------------------------------------------------------------------------
1 | import re
2 |
3 | from docutils import nodes, utils
4 | from docutils.parsers.rst import roles
5 |
6 | pubmed_uri_pattern = "http://www.ncbi.nlm.nih.gov/pubmed/%i"
7 | doi_uri_pattern = "http://dx.doi.org/%s"
8 |
9 | def pmid_reference_role(role, rawtext, text, lineno, inliner,
10 | options={}, content=[]):
11 | try:
12 | pmid = int(text)
13 | if pmid <= 0:
14 | raise ValueError
15 | except ValueError:
16 | msg = inliner.reporter.error(
17 | 'pmid number must be a number greater than or equal to 1; '
18 | '"%s" is invalid.' % text, line=lineno)
19 | prb = inliner.problematic(rawtext, rawtext, msg)
20 | return [prb], [msg]
21 | ref = pubmed_uri_pattern % pmid
22 | nodelist = []
23 | nodelist.append(nodes.inline(text='PMID:'))
24 | nodelist.append(nodes.reference(rawtext, utils.unescape(text), refuri=ref,
25 | **options))
26 | return nodelist, []
27 |
28 | def doi_reference_role(role, rawtext, text, lineno, inliner,
29 | options={}, content=[]):
30 | ref = doi_uri_pattern % text
31 | nodelist = []
32 | nodelist.append(nodes.inline(text='doi:'))
33 | nodelist.append(nodes.reference(rawtext, utils.unescape(text), refuri=ref,
34 | **options))
35 | return nodelist, []
36 |
37 | def setup(app):
38 | app.add_role('pmid', pmid_reference_role)
39 | app.add_role('doi', doi_reference_role)
40 | -------------------------------------------------------------------------------- /doc/index.rst: --------------------------------------------------------------------------------
1 | .. mdinclude:: ../README.md
2 |
3 | Further INDRA Database documentation
4 | ====================================
5 | .. toctree::
6 | :maxdepth: 3
7 |
8 | license.rst
9 | modules/index.rst
10 |
11 |
12 | INDRA Database REST Service
13 | ===========================
14 |
15 | .. toctree::
16 | :maxdepth: 3
17 |
18 | rest_api_doc/readme_link.rst
19 |
20 | INDRA Database Web UI
21 | =====================
22 |
23 | .. toctree::
24 | :maxdepth: 3
25 |
26 | web_ui_doc/index.rst
27 |
28 | Indices and tables
29 | ==================
30 |
31 | * :ref:`genindex`
32 | * :ref:`search`
33 |
34 | -------------------------------------------------------------------------------- /doc/indra_db_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gyorilab/indra_db/77785ce0d1badd271b120db747abfff4d6f35832/doc/indra_db_logo.png -------------------------------------------------------------------------------- /doc/license.rst: --------------------------------------------------------------------------------
1 | License and funding
2 | -------------------
3 |
4 | Copyright (C) 2018, Indra Labs
5 |
6 | This code is free software: you can redistribute it and/or modify
7 | it under the terms of the GNU General Public License as published by
8 | the Free Software Foundation, either version 3 of the License, or
9 | (at your option) any later version.
10 |
11 | This code is distributed in the hope that it will be useful,
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | GNU General Public License for more details.
15 |
16 | You may find a copy of the GNU General Public License
17 | `here`_.
18 |
19 | INDRA was developed with funding from ARO grant W911NF-14-1-0397,
20 | "Programmatic modelling for reasoning across complex mechanisms" under
21 | the DARPA Big Mechanism program, and the INDRA Database was developed
22 | as an extension of that core project. Work has continued under
23 | W911NF-14-1-0391, "Active context" under the DARPA Communicating with
24 | Computers program, and the DARPA Automated Scientific Discovery Framework
25 | project.
26 | -------------------------------------------------------------------------------- /doc/modules/cli/index.rst: --------------------------------------------------------------------------------
1 | Pipeline Management CLI
2 | =======================
3 |
4 | This module creates a CLI for managing the pipelines used to update
5 | content and knowledge in the database, and move or transform that
6 | knowledge on a regular basis.
7 |
8 | .. click:: indra_db.cli:main
9 | :prog: indra-db
10 | :nested: full
11 |
12 |
13 | Pipeline CLI Implementations
14 | ============================
15 |
16 | Content (:py:mod:`indra_db.cli.content`)
17 | ----------------------------------------
18 |
19 | The Content CLI manages the text content that is
20 | stored in the database. A parent class is defined, and managers for different
21 | sources (e.g. PubMed) can be defined by inheriting from this parent. This
22 | module also provides the shell command used to run updates of the content.
23 |
24 | .. automodule:: indra_db.cli.content
25 | :members:
26 | :member-order: bysource
27 |
28 |
29 | Reading (:py:mod:`indra_db.cli.reading`)
30 | ----------------------------------------
31 |
32 | The Reading CLI handles the reading of the text content and the processing
33 | of those readings into statements. As with the Content CLI, different reading
34 | pipelines can be handled by defining children of a parent class.
35 |
36 | .. automodule:: indra_db.cli.reading
37 | :members:
38 | :member-order: bysource
39 |
40 |
41 | PreAssembly (:py:mod:`indra_db.cli.preassembly`)
42 | ------------------------------------------------
43 |
44 | The Preassembly CLI manages the preassembly pipeline, deploying
45 | preassembly jobs to Batch.
46 |
47 | .. automodule:: indra_db.cli.preassembly
48 | :members:
49 | :member-order: bysource
50 |
51 |
52 | Knowledge Bases (:py:mod:`indra_db.cli.knowledgebase`)
53 | ------------------------------------------------------
54 |
55 | The INDRA Database also derives much of its knowledge from external databases
56 | and other resources not extracted from plain text, referred to in this repo as
57 | "knowledge bases", so as to avoid the ambiguity of "database". This CLI
58 | handles the updates of those knowledge bases, each of which requires different
59 | handling.
60 |
61 | .. automodule:: indra_db.cli.knowledgebase
62 | :members:
63 | :member-order: bysource
64 |
65 |
66 | Static Dumps (:py:mod:`indra_db.cli.dump`)
67 | ------------------------------------------
68 |
69 | This handles the generation of static dumps, including the readonly database
70 | from the principal database.
71 |
72 | .. automodule:: indra_db.cli.dump
73 | :members:
74 | :member-order: bysource
75 | -------------------------------------------------------------------------------- /doc/modules/client/index.rst: --------------------------------------------------------------------------------
1 | The Client
2 | ==========
3 | The purpose of the client is to be the gateway for external access to the
4 | content of the databases. Here we define high-level access functions for
5 | getting data out of the database in a natural way. This is where the queries
6 | used by the REST API are defined, and most users looking to access knowledge on
7 | the database should use the client if they can, as it is heavily optimized.
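As a rough illustration of the kind of access the client provides, the sketch
below asks the readonly database for preassembled Statements about a single
agent. It is only a sketch: ``get_ro`` is exported from ``indra_db``, but the
``HasAgent`` query class, the ``get_statements`` method, and the ``'primary'``
label are assumptions based on this module's description and your deployment,
and may differ from the actual API.

.. code-block:: python

    # Sketch only: HasAgent, get_statements, and statements() are assumed
    # names, not verified against indra_db.client.readonly.query.
    from indra_db import get_ro
    from indra_db.client.readonly.query import HasAgent

    ro = get_ro('primary')                # readonly database handle
    query = HasAgent('MEK')               # statements with MEK as an agent
    result = query.get_statements(ro, limit=10)
    print(result.statements())            # assembled INDRA Statements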
8 | 9 | Our system utilizes 2 databases, one which represents the "ground truth", as 10 | we know it, and is structured naturally for performing updates on our 11 | knowledge; it will always be the most up to date. We also have a "readonly" 12 | database that we used for our outward facing services. This database is 13 | optimized for fast queries and the content in it is updated weekly. Each 14 | database has its own set of access tools. 15 | 16 | 17 | .. toctree:: 18 | :maxdepth: 3 19 | 20 | principal/index.rst 21 | readonly/index.rst 22 | misc.rst 23 | 24 | 25 | -------------------------------------------------------------------------------- /doc/modules/client/misc.rst: -------------------------------------------------------------------------------- 1 | Miscellaneous Client APIs (Mostly Deprecated) 2 | ============================================= 3 | 4 | There are some, generally archaic, client functions which use both readonly 5 | and principal resources. I make no guarantee that these will work. 6 | 7 | Get Datasets (:py:mod:`indra_db.client.datasets`) 8 | ------------------------------------------------- 9 | 10 | An early attempt at something very like the :py:mod:`indra_db.client.readonly.interactions` 11 | approach to getting superficial data out of the database. 12 | 13 | .. automodule:: indra_db.client.datasets 14 | :members: 15 | 16 | 17 | Get Statements (:py:mod:`indra_db.client.statements`) 18 | ----------------------------------------------------- 19 | 20 | The first round of tools written to get Statements out of the database, 21 | utilizing far too many queries and taking absurdly long to complete. Most of 22 | their functions have been outmoded, with the exception of getting PA Statements 23 | from the principal database, which (as of this writing) has yet to be 24 | implemented. 25 | 26 | .. automodule:: indra_db.client.statements 27 | :members: 28 | -------------------------------------------------------------------------------- /doc/modules/client/principal/index.rst: -------------------------------------------------------------------------------- 1 | The Principal Database Client 2 | ============================= 3 | 4 | This is the set of client tools to access the most-nearly ground truth 5 | knowledge stored on the principal database. 6 | 7 | 8 | Access Readings and Text Content (:py:mod:`indra_db.client.principal.content`) 9 | ------------------------------------------------------------------------------ 10 | 11 | This defines a simple API to access the content that we store on the database 12 | for external purposes. 13 | 14 | .. automodule:: indra_db.client.principal.content 15 | :members: 16 | 17 | 18 | Submit and Retrieve Curations (:py:mod:`indra_db.client.principal.curation`) 19 | ---------------------------------------------------------------------------- 20 | 21 | On our services, users have the ability to curate the results we present, 22 | indicating whether they are correct or not, and how they may be incorrect. The 23 | API for adding and retrieving that input is defined here. 24 | 25 | .. automodule:: indra_db.client.principal.curation 26 | :members: 27 | 28 | 29 | Get Raw Statements (:py:mod:`indra_db.client.principal.raw_statements`) 30 | ----------------------------------------------------------------------- 31 | 32 | Get the raw, uncleaned and un-merged Statements based on agent and type or by 33 | paper(s) of origin. 34 | 35 | .. 
automodule:: indra_db.client.principal.raw_statements 36 | :members: 37 | -------------------------------------------------------------------------------- /doc/modules/client/readonly/index.rst: -------------------------------------------------------------------------------- 1 | The Readonly Client 2 | =================== 3 | 4 | Here are our primary tools intended for retrieving Statements, in particular 5 | Pre-Assembled (PA) Statements, from the readonly database. This is some of the 6 | most heavily optimized access code in the repo, and is the backbone of most 7 | external or outward facing applications. 8 | 9 | The readonly database, as the name suggests, is designed to take only read 10 | requests, and is updated via dump only once a week. This allows users of 11 | our database to access it even as we perform daily updates on the principal 12 | database, without worrying about queries interfering. 13 | 14 | 15 | Construct composable queries (:py:mod:`indra_db.client.readonly.query`) 16 | ------------------------------------------------------------------------------- 17 | 18 | This is a sophisticated system of classes that can be used to form queires 19 | for preassembled statements from the readonly database. 20 | 21 | .. automodule:: indra_db.client.readonly.query 22 | :members: 23 | :member-order: bysource 24 | 25 | -------------------------------------------------------------------------------- /doc/modules/index.rst: -------------------------------------------------------------------------------- 1 | INDRA Database modules 2 | ====================== 3 | 4 | .. toctree:: 5 | :maxdepth: 3 6 | 7 | client/index.rst 8 | cli/index.rst 9 | reading/index.rst 10 | preassembly/index.rst 11 | schemas/index.rst 12 | util/index.rst 13 | misc.rst 14 | 15 | -------------------------------------------------------------------------------- /doc/modules/misc.rst: -------------------------------------------------------------------------------- 1 | Some Miscellaneous Modules 2 | ========================== 3 | 4 | Here are some modules and files that live on their own, and don't fit neatly 5 | into other categories. 6 | 7 | 8 | Low Level Database Interface (:py:mod:`indra_db.databases`) 9 | ----------------------------------------------------------- 10 | 11 | The Database Manager classes are the lowest level interface with the database, 12 | implemented with SQLAlchemy, providing useful short-cuts but also allowing full 13 | access to SQLAlchemy's API. 14 | 15 | .. automodule:: indra_db.databases 16 | :members: 17 | :member-order: bysource 18 | 19 | 20 | Belief Calculator (:py:mod:`indra_db.belief`) 21 | --------------------------------------------- 22 | 23 | The belief in the knowledge of a Statement is a measure of our confidence that 24 | the Statement is an accurate representation of the text, _NOT_ our confidence 25 | in the validity of what was in that text. Given the size of the content in the 26 | database, some special care is needed when calculating this value, which 27 | depends heavily on the support relations between pre-assembled Statements. 28 | 29 | .. 
automodule:: indra_db.belief 30 | :members: 31 | -------------------------------------------------------------------------------- /doc/modules/preassembly/index.rst: -------------------------------------------------------------------------------- 1 | Database Integrated Preassembly Tools 2 | ===================================== 3 | 4 | The database runs incremental preassembly on the raw statements to generate 5 | the preassembled (PA) Statements. The code to accomplish this task is defined 6 | here, principally in :class:`DbPreassembler 7 | `. This module also 8 | defines proceedures for running these jobs on AWS. 9 | 10 | Database Preassembly (:py:mod:`indra_db.preassembly.preassemble_db`) 11 | -------------------------------------------------------------------- 12 | 13 | This module defines a class that manages preassembly for a given list of 14 | statement types on the local machine. 15 | 16 | .. automodule:: indra_db.preassembly.preassemble_db 17 | :members: 18 | :member-order: bysource 19 | 20 | 21 | A Class to Manage and Monitor AWS Batch Jobs (:py:mod:`indra_db.preassembly.submitter`) 22 | --------------------------------------------------------------------------------------- 23 | 24 | Allow a manager to monitor the Batch jobs to prevent runaway jobs, and smooth 25 | out job runs and submissions. 26 | 27 | .. automodule:: indra_db.preassembly.submitter 28 | :members: 29 | :member-order: bysource 30 | 31 | -------------------------------------------------------------------------------- /doc/modules/reading/index.rst: -------------------------------------------------------------------------------- 1 | Database Integrated Reading Tools 2 | ================================= 3 | 4 | Here are defined the procedures for reading content on the database, stashing 5 | the reading outputs, and producing statements from the readings, and inserting 6 | those raw statements into the database. 7 | 8 | The Database Readers (:py:mod:`indra_db.reading.read_db`) 9 | --------------------------------------------------------- 10 | 11 | A reader is defined as a python class which implements the machinery needed to 12 | process the text content we store, read it, and extract Statements from the 13 | reading results, storing the readings along the way. The reader must conform 14 | to a standard interface, which then allows readers to be run in a plug-and-play 15 | manner. 16 | 17 | .. automodule:: indra_db.reading.read_db 18 | :members: 19 | :member-order: bysource 20 | 21 | 22 | The Database Script for Running on AWS (:py:mod:`indra_db.reading.read_db_aws`) 23 | ------------------------------------------------------------------------------- 24 | 25 | This is the script used to run reading on AWS Batch, generally run from an 26 | AWS Lambda function. 27 | 28 | .. automodule:: indra_db.reading.read_db_aws 29 | :members: 30 | :member-order: bysource 31 | 32 | 33 | A Class to Manage and Monitor AWS Batch Jobs (:py:mod:`indra_db.reading.submitter`) 34 | ----------------------------------------------------------------------------------- 35 | 36 | Allow a manager to monitor the Batch jobs to prevent runaway jobs, and smooth 37 | out job runs and submissions. 38 | 39 | .. 
automodule:: indra_db.reading.submitter 40 | :members: 41 | :member-order: bysource 42 | 43 | -------------------------------------------------------------------------------- /doc/modules/schemas/index.rst: -------------------------------------------------------------------------------- 1 | Database Schemas 2 | ================ 3 | 4 | Here are defined the schemas for the principal and readonly databases, as well 5 | as some useful mixin classes. 6 | 7 | Principal Database Schema (:py:mod:`indra_db.schemas.principal_schema`) 8 | ----------------------------------------------------------------------- 9 | 10 | .. automodule:: indra_db.schemas.principal_schema 11 | :members: 12 | :member-order: bysource 13 | 14 | Readonly Database Schema (:py:mod:`indra_db.schemas.readonly_schema`) 15 | --------------------------------------------------------------------- 16 | 17 | Defines the `get_schema` function for the readonly database, which is used by 18 | external services to access the Statement knowledge we acquire. 19 | 20 | .. automodule:: indra_db.schemas.readonly_schema 21 | :members: 22 | :member-order: bysource 23 | 24 | Class Mix-ins (:py:mod:`indra_db.schemas.mixins`) 25 | ------------------------------------------------- 26 | 27 | This defines class mixins that are used to add general features to SQLAlchemy 28 | table objects via multiple inheritance. 29 | 30 | .. automodule:: indra_db.schemas.mixins 31 | :members: 32 | :member-order: bysource 33 | 34 | Indexes (:py:mod:`indra_db.schemas.indexes`) 35 | -------------------------------------------- 36 | 37 | This defines the classes needed to create and maintain indices in the database, 38 | the other part of the infrastructure of which is included in the `IndraDBTable` 39 | class mixin definition. 40 | 41 | .. automodule:: indra_db.schemas.indexes 42 | :members: 43 | :member-order: bysource 44 | -------------------------------------------------------------------------------- /doc/modules/util/index.rst: -------------------------------------------------------------------------------- 1 | Utilities 2 | ========= 3 | 4 | Here live the more mundane and backend utilities used throughout other modules 5 | of the codebase, and potentially elsewhere, although they are not intended for 6 | external use in general. Several more-or-less bespoke scripts are also stored 7 | here. 8 | 9 | 10 | Database Session Constructors (:py:mod:`indra_db.util.constructors`) 11 | -------------------------------------------------------------------- 12 | 13 | Constructors to get interfaces to the different databases, selecting among 14 | the various physical instances defined in the config file. 15 | 16 | .. automodule:: indra_db.util.constructors 17 | :members: 18 | 19 | 20 | Scripts to Get Content (:py:mod:`indra_db.util.content_scripts`) 21 | ---------------------------------------------------------------- 22 | 23 | General scripts for getting content by various IDs. 24 | 25 | .. automodule:: indra_db.util.content_scripts 26 | :members: 27 | 28 | 29 | Distilling Raw Statements (:py:mod:`indra_db.util.distill_statements`) 30 | ---------------------------------------------------------------------- 31 | 32 | Do some pre-pre-assembly cleansing of the raw Statements to account for various 33 | kinds of duplicity that are artifacts of our content collection and reading 34 | pipelines rather than representing actually duplicated knowledge in the 35 | literature. 36 | 37 | .. 
automodule:: indra_db.util.distill_statements 38 | :members: 39 | 40 | 41 | Script to Create a SIF Dump (:py:mod:`indra_db.util.dump_sif`) 42 | -------------------------------------------------------------- 43 | 44 | Create an interactome from metadata in the database and dump the results as a 45 | sif file. 46 | 47 | .. automodule:: indra_db.util.dump_sif 48 | :members: 49 | 50 | 51 | General Helper Functions (:py:mod:`indra_db.util.helpers`) 52 | ---------------------------------------------------------- 53 | 54 | Functions with broad utility throughout the repository, but otherwise 55 | miscellaneous. 56 | 57 | .. automodule:: indra_db.util.helpers 58 | :members: 59 | 60 | 61 | Routines for Inserting Statements and Content (:py:mod:`indra_db.util.insert`) 62 | ------------------------------------------------------------------------------ 63 | 64 | Inserting content into the database can be a rather involved process, but here 65 | are defined high-level utilities to uniformly accomplish the task. 66 | 67 | .. automodule:: indra_db.util.insert 68 | :members: 69 | -------------------------------------------------------------------------------- /doc/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx_rtd_theme 2 | sphinx 3 | mock 4 | ipython 5 | matplotlib 6 | future 7 | psycopg2 8 | sphinx-click 9 | requests 10 | m2r2 11 | termcolor 12 | cachetools 13 | git+https://github.com/sorgerlab/indra.git 14 | 15 | -------------------------------------------------------------------------------- /doc/rest_api_doc/readme_link.rst: -------------------------------------------------------------------------------- 1 | .. mdinclude:: ../../indra_db_service/README.md 2 | 3 | -------------------------------------------------------------------------------- /doc/web_ui_doc/index.rst: -------------------------------------------------------------------------------- 1 | .. mdinclude:: ../../indra_db_service/search_introduction.md 2 | -------------------------------------------------------------------------------- /doc/web_ui_results_expanded.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gyorilab/indra_db/77785ce0d1badd271b120db747abfff4d6f35832/doc/web_ui_results_expanded.png -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG AWS_ACCOUNT_ID 2 | FROM ${AWS_ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/indra:latest 3 | 4 | ARG BUILD_BRANCH 5 | ARG INDRA_BRANCH 6 | 7 | ENV DIRPATH /sw 8 | ENV PYTHONPATH "$PYTHONPATH:${DIRPATH}/covid-19" 9 | WORKDIR $DIRPATH 10 | 11 | RUN cd indra && \ 12 | git fetch --all && \ 13 | git checkout $INDRA_BRANCH && \ 14 | echo "INDRA_BRANCH=" $INDRA_BRANCH && \ 15 | pip install -e . -U 16 | 17 | # Install libpq5 and some other necessities. 
18 | RUN wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - && \ 19 | apt-get update && \ 20 | apt-get install -y lsb-release && \ 21 | echo "deb http://apt.postgresql.org/pub/repos/apt/ `lsb_release -cs`-pgdg main" | tee /etc/apt/sources.list.d/pgdg.list && \ 22 | apt-get update && \ 23 | apt-get install -y libpq5 libpq-dev postgresql-client-13 postgresql-client-common && \ 24 | pip install awscli 25 | 26 | # Install psycopg2 27 | RUN git clone https://github.com/psycopg/psycopg2.git && \ 28 | cd psycopg2 && \ 29 | python setup.py build && \ 30 | python setup.py install 31 | 32 | # Install pgcopy 33 | RUN git clone https://github.com/pagreene/pgcopy.git && \ 34 | cd pgcopy && \ 35 | python setup.py install 36 | 37 | # Install covid-19 38 | RUN git clone https://github.com/indralab/covid-19.git 39 | 40 | # Install sqlalchemy < 1.4 (due to indirect dependencies, it may be a later 41 | # version in the indra:db image) 42 | RUN pip install "sqlalchemy<1.4" 43 | 44 | #install bs4 45 | RUN pip install bs4 46 | 47 | # Install indra_db 48 | RUN git clone https://github.com/gyorilab/indra_db.git && \ 49 | cd indra_db && \ 50 | pip install -e .[all] && \ 51 | pip list && \ 52 | echo "PYTHONPATH =" $PYTHONPATH && \ 53 | git checkout $BUILD_BRANCH && \ 54 | echo "BUILD_BRANCH =" $BUILD_BRANCH && \ 55 | git branch && \ 56 | echo "[indra]" > /root/.config/indra/config.ini 57 | 58 | -------------------------------------------------------------------------------- /docker/buildspec.yml: -------------------------------------------------------------------------------- 1 | version: 0.1 2 | 3 | phases: 4 | pre_build: 5 | commands: 6 | - echo Logging in to Amazon ECR... 7 | - aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin $AWS_ACCOUNT_ID.dkr.ecr.us-east-1.amazonaws.com 8 | build: 9 | commands: 10 | - echo Build started on `date` 11 | - echo Building the Docker image... 12 | - docker build --build-arg BUILD_BRANCH=$BUILD_BRANCH --build-arg INDRA_BRANCH=$INDRA_BRANCH --build-arg AWS_ACCOUNT_ID=$AWS_ACCOUNT_ID -t $IMAGE_REPO_NAME:$IMAGE_TAG -f docker/Dockerfile . 13 | - docker tag $IMAGE_REPO_NAME:$IMAGE_TAG $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$IMAGE_REPO_NAME:$IMAGE_TAG 14 | post_build: 15 | commands: 16 | - echo Build completed on `date` 17 | - echo Pushing the Docker image... 18 | - docker push $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$IMAGE_REPO_NAME:$IMAGE_TAG 19 | -------------------------------------------------------------------------------- /indra_db/__init__.py: -------------------------------------------------------------------------------- 1 | from .util import get_primary_db, get_db, get_ro 2 | from .databases import texttypes, formats, sql_expressions 3 | -------------------------------------------------------------------------------- /indra_db/cli/__init__.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from .knowledgebase import kb 4 | from .content import content 5 | from .dump import dump_cli 6 | from .preassembly import pa 7 | from .reading import reading 8 | from .xdd import xdd 9 | 10 | 11 | @click.group() 12 | def main(): 13 | """INDRA Database Infrastructure CLI 14 | 15 | The INDRA Database is both a physical database and an infrastructure for 16 | managing and updating the content of that physical database. This CLI 17 | is used for executing these management commands. 
18 | """ 19 | 20 | 21 | @main.command() 22 | @click.argument("task", type=click.Choice(["gather"])) 23 | def pipeline_stats(task): 24 | """Manage the pipeline stats gathered on s3. 25 | 26 | All major upload and update pipelines have basic timeing and success-failure 27 | stats gather on them using the 28 | :class:`DataGatherer ` class 29 | wrapper. 30 | 31 | These stats are displayed on the ``/monitor`` endpoint of the database 32 | service. 33 | 34 | \b 35 | Tasks are: 36 | - gather: gather the individual job JSONs into an aggregated file. 37 | """ 38 | if task == "gather": 39 | from indra_db.util.data_gatherer import digest_s3_files 40 | digest_s3_files() 41 | 42 | 43 | main.add_command(kb) 44 | main.add_command(content) 45 | main.add_command(dump_cli) 46 | main.add_command(pa) 47 | main.add_command(reading) 48 | main.add_command(xdd) 49 | -------------------------------------------------------------------------------- /indra_db/cli/preassembly.py: -------------------------------------------------------------------------------- 1 | import click 2 | from datetime import datetime 3 | 4 | from indra_db import get_db 5 | from indra_db.exceptions import IndraDbException 6 | 7 | from .util import format_date 8 | 9 | def filter_updates(stmt_type, pa_updates): 10 | return {u.run_datetime for u in pa_updates if u.stmt_type == stmt_type} 11 | 12 | 13 | def list_last_updates(db): 14 | """Return a dict of the most recent updates for each statement type.""" 15 | from indra_db.preassembly.submitter import VALID_STATEMENTS 16 | pa_updates = db.select_all(db.PreassemblyUpdates) 17 | last_full_update = max(filter_updates(None, pa_updates)) 18 | last_updates = {st: max(filter_updates(st, pa_updates) 19 | | {last_full_update}) 20 | for st in VALID_STATEMENTS} 21 | return last_updates 22 | 23 | 24 | def list_latest_raw_stmts(db): 25 | """Return a dict of the most recent new raw statement for each type.""" 26 | from sqlalchemy import func 27 | res = (db.session.query(db.RawStatements.type, 28 | func.max(db.RawStatements.create_date)) 29 | .group_by(db.RawStatements.type) 30 | .all()) 31 | return {k: v for k, v in res} 32 | 33 | 34 | def run_preassembly(mode, project_name): 35 | """Construct a submitter and begin submitting jobs to Batch for preassembly. 36 | 37 | This function will determine which statement types need to be updated and 38 | how far back they go, and will create the appropriate 39 | :class:`PreassemblySubmitter 40 | ` 41 | instance, and run the jobs with pre-set parameters on statement types that 42 | need updating. 43 | 44 | Parameters 45 | ---------- 46 | project_name : str 47 | This name is used to gag the various AWS resources used for accounting 48 | purposes. 49 | """ 50 | from indra_db.preassembly.submitter import VALID_STATEMENTS, \ 51 | PreassemblySubmitter 52 | db = get_db('primary') 53 | if mode == 'update': 54 | # Find the latest update for each statement type. 55 | last_updates = list_last_updates(db) 56 | 57 | # Get the most recent raw statement datetimes 58 | latest_raw_stmts = list_latest_raw_stmts(db) 59 | 60 | # Only include statements types that have new raw statements. 61 | need_to_update = [s_type for s_type, last_upd in last_updates.items() 62 | if s_type in latest_raw_stmts.keys() 63 | and latest_raw_stmts[s_type] > last_upd] 64 | else: 65 | # Make sure the pa_statements table is truly empty. 66 | if db.select_one(db.PAStatements): 67 | raise IndraDbException("Please clear the pa_statements table " 68 | "before running create. 
If you want to run " 69 | "an incremental update, please run with " 70 | "mode 'update'.") 71 | 72 | # Just run them all. 73 | need_to_update = VALID_STATEMENTS[:] 74 | 75 | # Create the submitter, and run it. 76 | basename = datetime.utcnow().strftime('%Y%m%d_%H%M%S') 77 | ps = PreassemblySubmitter(basename, mode, project_name=project_name) 78 | ps.set_max_jobs(4) 79 | ps.run(need_to_update, 100000, True, stagger=600, poll_interval=120) 80 | 81 | 82 | @click.group() 83 | def pa(): 84 | """Manage the preassembly pipeline.""" 85 | 86 | 87 | @pa.command() 88 | @click.argument('task', type=click.Choice(['create', 'update']), 89 | required=True) 90 | @click.argument('project-name', required=False) 91 | def run(task, project_name): 92 | """Manage the indra_db preassembly. 93 | 94 | \b 95 | Tasks: 96 | - "create": populate the pa_statements table for the first time (this 97 | requires that the table be empty). 98 | - "update": update the existing content in pa_statements with the latest 99 | from raw statements. 100 | 101 | A project name is required to tag the AWS instances with a "project" tag. 102 | """ 103 | run_preassembly(task, project_name) 104 | 105 | 106 | @pa.command('list') 107 | @click.option('-r', '--with-raw', is_flag=True, 108 | help="Include the latest datetimes for raw statements of each " 109 | "type. This will take much longer.") 110 | def show_list(with_raw): 111 | """List the latest updates for each type of Statement.""" 112 | import tabulate 113 | 114 | db = get_db('primary') 115 | rows = [(st, lu) for st, lu in list_last_updates(db).items()] 116 | header = ('Statement Type', 'Last Update') 117 | if with_raw: 118 | print("This may take a while...", end='', flush=True) 119 | raw_stmt_dates = list_latest_raw_stmts(db) 120 | print("\r", end='') 121 | new_rows = [] 122 | for st, lu in rows: 123 | raw_date = raw_stmt_dates.get(st) 124 | if raw_date is None: 125 | new_rows.append((st, format_date(lu), "[None]", "No")) 126 | else: 127 | new_rows.append((st, format_date(lu), format_date(raw_date), 128 | "Yes" if raw_date > lu else "No")) 129 | rows = new_rows 130 | header += ('Latest Raw Stmt', 'Needs Update?') 131 | else: 132 | rows = [(st, format_date(lu)) for st, lu in rows] 133 | rows.sort() 134 | print(tabulate.tabulate(rows, header)) 135 | 136 | 137 | -------------------------------------------------------------------------------- /indra_db/cli/util.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | 4 | def format_date(dt): 5 | if not isinstance(dt, datetime): 6 | return dt 7 | return dt.strftime("%Y %b %d %I:%M%p") 8 | -------------------------------------------------------------------------------- /indra_db/client/__init__.py: -------------------------------------------------------------------------------- 1 | """This module contains tools designed to access content in the db. 2 | 3 | Specifically, this is for direct access to the database, not through the web 4 | api. 5 | 6 | All the functions defined require direct access to the database, which is in 7 | general restricted. For broad access, see the indra_db_rest api client in 8 | INDRA. 9 | 10 | There are two key ways of accessing statements from the INDRA Database: 11 | directly and using the materialize views. Only the `get_statement_jsons` 12 | functions are limited to using the views. Most other functions access the 13 | primary tables of the database and are generally slower. 
The 14 | `get_statement_jsons` functions are the most heavily optimized for fast 15 | recall, as they are the back-end to the REST API. 16 | """ 17 | 18 | from .datasets import * 19 | from .readonly import * 20 | from .principal import * 21 | -------------------------------------------------------------------------------- /indra_db/client/principal/__init__.py: -------------------------------------------------------------------------------- 1 | from .raw_statements import * 2 | from .pa_statements import * 3 | from .curation import * 4 | from .content import * 5 | -------------------------------------------------------------------------------- /indra_db/client/principal/content.py: -------------------------------------------------------------------------------- 1 | __all__ = ['get_reader_output', 'get_content_by_refs', 'get_text'] 2 | 3 | import logging 4 | from collections import defaultdict 5 | 6 | from indra_db.util import unpack, _get_trids 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | def get_reader_output(db, ref_id, ref_type='tcid', reader=None, 12 | reader_version=None): 13 | """Return reader output for a given text content. 14 | 15 | Parameters 16 | ---------- 17 | db : :py:class:`DatabaseManager` 18 | Reference to the DB to query 19 | ref_id : int or str 20 | The text reference ID whose reader output should be returned 21 | ref_type : Optional[str] 22 | The type of ID to look for, options include 23 | 'tcid' for the database's internal unique text content ID, 24 | or 'pmid', 'pmcid', 'doi, 'pii', 'manuscript_id' 25 | Default: 'tcid' 26 | reader : Optional[str] 27 | The name of the reader whose output is of interest 28 | reader_version : Optional[str] 29 | The specific version of the reader 30 | 31 | Returns 32 | ------- 33 | reading_results : dict{dict{list[str]}} 34 | A dict of reader outputs that match the query criteria, indexed first 35 | by text content id, then by reader. 36 | """ 37 | if ref_type == 'tcid': 38 | clauses = [db.Reading.text_content_id == ref_id] 39 | else: 40 | trids = _get_trids(db, ref_id, ref_type) 41 | if not trids: 42 | return [] 43 | logger.debug("Found %d text ref ids." % len(trids)) 44 | clauses = [db.TextContent.text_ref_id.in_(trids), 45 | db.Reading.text_content_id == db.TextContent.id] 46 | if reader: 47 | clauses.append(db.Reading.reader == reader.upper()) 48 | if reader_version: 49 | clauses.append(db.Reading.reader_version == reader_version) 50 | 51 | res = db.select_all([db.Reading.text_content_id, db.Reading.reader, 52 | db.Reading.bytes], *clauses) 53 | reading_dict = defaultdict(lambda: defaultdict(lambda: [])) 54 | for tcid, reader, result in res: 55 | unpacked_result = None 56 | if not result: 57 | logger.warning("Got reading result with zero content.") 58 | else: 59 | unpacked_result = unpack(result) 60 | reading_dict[tcid][reader].append(unpacked_result) 61 | return reading_dict 62 | 63 | 64 | def get_content_by_refs(db, pmid_list=None, trid_list=None, sources=None, 65 | formats=None, content_type='abstract', unzip=True): 66 | """Return content from the database given a list of PMIDs or text ref ids. 67 | 68 | Note that either pmid_list OR trid_list must be set, and only one can be 69 | set at a time. 70 | 71 | Parameters 72 | ---------- 73 | db : :py:class:`DatabaseManager` 74 | Reference to the DB to query 75 | pmid_list : list[str] or None 76 | A list of pmids. Default is None, in which case trid_list must be 77 | given. 78 | trid_list : list[int] or None 79 | A list of text ref ids. 
Default is None, in which case pmid list must 80 | be given. 81 | sources : list[str] or None 82 | A list of sources to include (e.g. 'pmc_oa', or 'pubmed'). Default is 83 | None, indicating that all sources will be included. 84 | formats : list[str] 85 | A list of the formats to be included ('xml', 'text'). Default is None, 86 | indicating that all formats will be included. 87 | content_type : str 88 | Select the type of content to load ('abstract' or 'fulltext'). Note 89 | that not all refs will have any, or both, types of content. 90 | unzip : Optional[bool] 91 | If True, the compressed output is decompressed into clear text. 92 | Default: True 93 | 94 | Returns 95 | ------- 96 | content_dict : dict 97 | A dictionary whose keys are text ref ids, with each value being the 98 | the corresponding content. 99 | """ 100 | # Make sure we only get one type of list. 101 | if not (pmid_list or trid_list): 102 | raise ValueError("One of `pmid_list` or `trid_list` must be defined.") 103 | if pmid_list and trid_list: 104 | raise ValueError("Only one of `pmid_list` or `trid_list` may be used.") 105 | 106 | # Put together the clauses for the general constraints. 107 | clauses = [] 108 | if sources is not None: 109 | clauses.append(db.TextContent.source.in_(sources)) 110 | if formats is not None: 111 | clauses.append(db.TextContent.format.in_(formats)) 112 | if content_type not in ['abstract', 'fulltext']: 113 | raise ValueError("Unrecognized content type: %s" % content_type) 114 | else: 115 | clauses.append(db.TextContent.text_type == content_type) 116 | 117 | # Do the query to get the content. 118 | if pmid_list is not None: 119 | content_list = db.select_all( 120 | [db.TextRef.pmid, db.TextContent.content], 121 | db.TextRef.id == db.TextContent.text_ref_id, 122 | db.TextRef.pmid.in_(pmid_list), 123 | *clauses 124 | ) 125 | else: 126 | content_list = db.select_all([db.TextRef.id, db.TextContent.content], 127 | db.TextContent.text_ref_id.in_(trid_list), 128 | *clauses) 129 | if unzip: 130 | content_dict = {id_val: unpack(content) 131 | for id_val, content in content_list} 132 | else: 133 | content_dict = {id_val: content for id_val, content in content_list} 134 | return content_dict 135 | 136 | 137 | def get_text(db, pmids, text_type): 138 | """Return text content of a given type for a list of PMIDs.""" 139 | # Run a query for text content of the desired type 140 | res = (db.session.query(db.TextRef.pmid, db.TextContent.text_type, 141 | db.TextContent.content) 142 | .filter(db.TextRef.pmid_in(pmids)) 143 | .join(db.TextContent) 144 | .filter(db.TextContent.text_type == text_type) 145 | .all()) 146 | # Unpack the content, clean it up, and return it as a dictionary keyed 147 | # by pmid 148 | text_by_pmid = { 149 | row.pmid: unpack(row.content).replace("\t", " ").replace("\n", "\t") 150 | for row in res 151 | } 152 | return text_by_pmid 153 | -------------------------------------------------------------------------------- /indra_db/client/principal/pa_statements.py: -------------------------------------------------------------------------------- 1 | __all__ = ["get_pa_stmt_jsons"] 2 | 3 | import json 4 | from collections import defaultdict 5 | 6 | from sqlalchemy import func, cast, String, null 7 | from sqlalchemy.dialects.postgresql import array 8 | from sqlalchemy.orm import aliased 9 | 10 | from indra_db.util.constructors import get_db 11 | from indra_db.client.principal.raw_statements import _fix_evidence 12 | 13 | 14 | def get_pa_stmt_jsons(clauses=None, with_evidence=True, db=None, limit=1000): 
15 | """Load preassembled Statements from the principal database.""" 16 | if db is None: 17 | db = get_db('primary') 18 | 19 | if clauses is None: 20 | clauses = [] 21 | 22 | # Construct the core query. 23 | if with_evidence: 24 | text_ref_cols = [db.Reading.id, db.TextContent.id, db.TextRef.pmid, 25 | db.TextRef.pmcid, db.TextRef.doi, db.TextRef.url, 26 | db.TextRef.pii] 27 | text_ref_types = tuple([str if isinstance(col.type, String) else int 28 | for col in text_ref_cols]) 29 | text_ref_cols = tuple([cast(col, String) 30 | if not isinstance(col.type, String) else col 31 | for col in text_ref_cols]) 32 | text_ref_labels = ('rid', 'tcid', 'pmid', 'pmcid', 'doi', 'url', 'pii') 33 | core_q = db.session.query( 34 | db.PAStatements.mk_hash.label('mk_hash'), 35 | db.PAStatements.json.label('json'), 36 | func.array_agg(db.RawStatements.json).label("raw_jsons"), 37 | func.array_agg(array(text_ref_cols)).label("text_refs") 38 | ).outerjoin( 39 | db.RawUniqueLinks, 40 | db.RawUniqueLinks.pa_stmt_mk_hash == db.PAStatements.mk_hash 41 | ).join( 42 | db.RawStatements, 43 | db.RawStatements.id == db.RawUniqueLinks.raw_stmt_id 44 | ).outerjoin( 45 | db.Reading, 46 | db.Reading.id == db.RawStatements.reading_id 47 | ).outerjoin( 48 | db.TextContent, 49 | db.TextContent.id == db.Reading.text_content_id 50 | ).outerjoin( 51 | db.TextRef, 52 | db.TextRef.id == db.TextContent.text_ref_id 53 | ) 54 | else: 55 | text_ref_types = None 56 | text_ref_labels = None 57 | core_q = db.session.query( 58 | db.PAStatements.mk_hash.label('mk_hash'), 59 | db.PAStatements.json.label('json'), 60 | null().label('raw_jsons'), 61 | null().label('text_refs') 62 | ) 63 | core_q = core_q.filter( 64 | *clauses 65 | ).group_by( 66 | db.PAStatements.mk_hash, 67 | db.PAStatements.json 68 | ) 69 | if limit: 70 | core_q = core_q.limit(limit) 71 | core_sq = core_q.subquery().alias('core') 72 | 73 | # Construct the layer of the query that gathers agent info. 74 | agent_tuple = (cast(db.PAAgents.ag_num, String), 75 | db.PAAgents.db_name, 76 | db.PAAgents.db_id) 77 | at_sq = db.session.query( 78 | core_sq.c.mk_hash, 79 | core_sq.c.json, 80 | core_sq.c.raw_jsons, 81 | core_sq.c.text_refs, 82 | func.array_agg(array(agent_tuple)).label('db_refs') 83 | ).filter( 84 | db.PAAgents.stmt_mk_hash == core_sq.c.mk_hash 85 | ).group_by( 86 | core_sq.c.mk_hash, 87 | core_sq.c.json, 88 | core_sq.c.raw_jsons, 89 | core_sq.c.text_refs 90 | ).subquery().alias('agent_tuples') 91 | 92 | # Construct the layer of the query that gathers supports/supported by. 93 | sup_from = aliased(db.PASupportLinks, name='sup_from') 94 | sup_to = aliased(db.PASupportLinks, name='sup_to') 95 | q = db.session.query( 96 | at_sq.c.mk_hash, 97 | at_sq.c.json, 98 | at_sq.c.raw_jsons, 99 | at_sq.c.text_refs, 100 | at_sq.c.db_refs, 101 | func.array_agg(sup_from.supporting_mk_hash).label('supporting_hashes'), 102 | func.array_agg(sup_to.supported_mk_hash).label('supported_hashes') 103 | ).outerjoin( 104 | sup_from, 105 | sup_from.supported_mk_hash == at_sq.c.mk_hash 106 | ).outerjoin( 107 | sup_to, 108 | sup_to.supporting_mk_hash == at_sq.c.mk_hash 109 | ).group_by( 110 | at_sq.c.mk_hash, 111 | at_sq.c.json, 112 | at_sq.c.raw_jsons, 113 | at_sq.c.text_refs, 114 | at_sq.c.db_refs 115 | ) 116 | 117 | # Run and parse the query. 118 | stmt_jsons = {} 119 | stmts_by_hash = {} 120 | for h, sj, rjs, text_refs, db_refs, supping, supped in q.all(): 121 | # Gather the agent refs. 
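# db_refs comes back from the query above as an array of (ag_num, db_name,
# db_id) string triples, one per agent grounding (see `agent_tuple`); the
# loop below regroups them into {ag_num: {db_name: [db_id, ...]}}.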
122 | db_ref_dicts = defaultdict(lambda: defaultdict(list)) 123 | for ag_num, db_name, db_id in db_refs: 124 | db_ref_dicts[int(ag_num)][db_name].append(db_id) 125 | db_ref_dicts = {k: dict(v) for k, v in db_ref_dicts.items()} 126 | 127 | # Clean supping and supped. 128 | supping = [h for h in set(supping) if h is not None] 129 | supped = [h for h in set(supped) if h is not None] 130 | 131 | # Parse the JSON bytes into JSON. 132 | stmt_json = json.loads(sj) 133 | if 'supports' not in stmt_json: 134 | stmt_json['supports'] = [] 135 | if 'supported_by' not in stmt_json: 136 | stmt_json['supported_by'] = [] 137 | 138 | # Load the evidence. 139 | if rjs is not None: 140 | for rj, text_ref_values in zip(rjs, text_refs): 141 | raw_json = json.loads(rj) 142 | ev = raw_json['evidence'][0] 143 | if any(v is not None for v in text_ref_values): 144 | tr_dict = {lbl.upper(): None if val == "None" else typ(val) 145 | for lbl, typ, val 146 | in zip(text_ref_labels, text_ref_types, 147 | text_ref_values)} 148 | _fix_evidence(ev, tr_dict.pop('RID'), tr_dict.pop('TCID'), 149 | tr_dict) 150 | if 'evidence' not in stmt_json: 151 | stmt_json['evidence'] = [] 152 | stmt_json['evidence'].append(ev) 153 | 154 | # Resolve supports supported-by, as much as possible. 155 | stmts_by_hash[h] = stmt_json 156 | for supped_h in (h for h in supped if h in stmts_by_hash): 157 | stmt_json['supports'].append(stmts_by_hash[supped_h]['id']) 158 | stmts_by_hash[supped_h]['supported_by'].append(stmt_json['id']) 159 | for supping_h in (h for h in supping if h in stmts_by_hash): 160 | stmt_json['supported_by'].append(stmts_by_hash[supping_h]['id']) 161 | stmts_by_hash[supping_h]['supports'].append(stmt_json['id']) 162 | 163 | # Put it together in a dictionary. 164 | result_dict = { 165 | "mk_hash": h, 166 | "stmt": stmt_json, 167 | "db_refs": db_ref_dicts, 168 | "supports_hashes": supping, 169 | "supported_by_hashes": supped 170 | } 171 | stmt_jsons[h] = result_dict 172 | return stmt_jsons 173 | -------------------------------------------------------------------------------- /indra_db/client/principal/raw_statements.py: -------------------------------------------------------------------------------- 1 | __all__ = ['get_raw_stmt_jsons_from_agents', 'get_raw_stmt_jsons_from_papers', 2 | 'get_raw_stmt_jsons'] 3 | 4 | import json 5 | from collections import defaultdict 6 | 7 | from sqlalchemy import intersect_all 8 | 9 | from indra.util import clockit 10 | 11 | from indra_db import get_db 12 | from indra_db.util import regularize_agent_id 13 | 14 | # ==== 15 | # API 16 | # ==== 17 | 18 | 19 | @clockit 20 | def get_raw_stmt_jsons_from_papers(id_list, id_type='pmid', db=None, 21 | max_stmts=None, offset=None): 22 | """Get raw statement jsons for a given list of papers. 23 | 24 | Parameters 25 | ---------- 26 | id_list : list 27 | A list of ints or strs that are ids of papers of type `id_type`. 28 | id_type : str 29 | Default is 'pmid'. The type of ids given in id_list, e.g. 'pmid', 30 | 'pmcid', 'trid'. 31 | db : :py:class:`DatabaseManager` 32 | Optionally specify a database manager that attaches to something 33 | besides the primary database, for example a local database instance. 34 | 35 | Returns 36 | ------- 37 | result_dict : dict 38 | A dictionary keyed by id (of `id_type`) with a list of raw statement 39 | json objects as each value. Ids for which no statements are found will 40 | not be included in the dict. 41 | """ 42 | if db is None: 43 | db = get_db('primary') 44 | 45 | # Get the attribute for this id type. 
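# pmid/pmcid/doi go through the dedicated TextRef helpers (pmid_in, pmcid_in,
# doi_in) with filter_ids=True; any other id_type falls back to a plain SQL
# IN constraint on the corresponding TextRef column via _get_id_col.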
46 | if id_type == 'pmid': 47 | id_constraint = db.TextRef.pmid_in(id_list, filter_ids=True) 48 | elif id_type == 'pmcid': 49 | id_constraint = db.TextRef.pmcid_in(id_list, filter_ids=True) 50 | elif id_type == 'doi': 51 | id_constraint = db.TextRef.doi_in(id_list, filter_ids=True) 52 | else: 53 | id_constraint = _get_id_col(db.TextRef, id_type).in_(id_list) 54 | 55 | # Get the results. 56 | res = db.select_all([db.TextRef, db.RawStatements.json], id_constraint, 57 | *db.link(db.RawStatements, db.TextRef)) 58 | 59 | # Organized the results into a dict of lists keyed by id value. 60 | # Fix pmids along the way. 61 | result_dict = defaultdict(list) 62 | for tr, rjson_bytes in res: 63 | id_val = _get_id_col(tr, id_type) 64 | 65 | # Decode and unpack the json 66 | rjson = json.loads(rjson_bytes.decode('utf-8')) 67 | 68 | # Fix the pmids in this json. 69 | rjson['evidence'][0]['pmid'] = tr.pmid 70 | 71 | # Set the text_refs in this json 72 | ev = rjson['evidence'][0] 73 | if 'text_refs' not in ev.keys(): 74 | ev['text_refs'] = {} 75 | for idt in ['trid', 'pmid', 'pmcid', 'doi']: 76 | ev['text_refs'][idt.upper()] = _get_id_col(tr, idt) 77 | 78 | # Add this to the results. 79 | result_dict[id_val].append(rjson) 80 | 81 | return result_dict 82 | 83 | 84 | @clockit 85 | def get_raw_stmt_jsons_from_agents(agents=None, stmt_type=None, db=None, 86 | max_stmts=None, offset=None): 87 | """Get Raw statement jsons from a list of agent refs and Statement type.""" 88 | if db is None: 89 | db = get_db('primary') 90 | 91 | if agents is None: 92 | agents = [] 93 | 94 | # Turn the agents parameters into an intersection of queries for stmt ids. 95 | entity_queries = [] 96 | for role, ag_dbid, ns in agents: 97 | # Make the id match paradigms for the database. 98 | ag_dbid = regularize_agent_id(ag_dbid, ns) 99 | 100 | # Sanitize wildcards. 101 | for char in ['%', '_']: 102 | ag_dbid = ag_dbid.replace(char, '\%s' % char) 103 | 104 | # Generate the query 105 | q = db.session.query( 106 | db.RawAgents.stmt_id.label('stmt_id') 107 | ).filter( 108 | db.RawAgents.db_id.like(ag_dbid) 109 | ) 110 | 111 | if ns is not None: 112 | q = q.filter(db.RawAgents.db_name.like(ns)) 113 | 114 | if role is not None: 115 | q = q.filter(db.RawAgents.role == role.upper()) 116 | 117 | entity_queries.append(q) 118 | 119 | # Add a constraint for the statement type. 120 | if stmt_type is not None: 121 | q = db.session.query( 122 | db.RawStatements.id.label('stmt_id') 123 | ).filter( 124 | db.RawStatements.type == stmt_type 125 | ) 126 | entity_queries.append(q) 127 | 128 | # Generate the sub-query. 129 | ag_query_al = intersect_all(*entity_queries).alias('intersection') 130 | ag_query = db.session.query(ag_query_al).distinct().subquery('ag_stmt_ids') 131 | 132 | # Get the raw statement JSONs from the database. 133 | res = get_raw_stmt_jsons([db.RawStatements.id == ag_query.c.stmt_id], db=db, 134 | max_stmts=max_stmts, offset=offset) 135 | return res 136 | 137 | 138 | def get_raw_stmt_jsons(clauses=None, db=None, max_stmts=None, offset=None): 139 | """Get Raw Statements from the principle database, given arbitrary clauses. 
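
Parameters
----------
clauses : list or None
    SQLAlchemy filter clauses to apply to the query; if None, no extra
    constraints are added beyond the joins below.
db : :py:class:`DatabaseManager` or None
    The database to query. Defaults to the 'primary' database.
max_stmts : int or None
    If given, limit the number of statements returned.
offset : int or None
    If given, skip this many rows before returning results.

Returns
-------
raw_stmt_jsons : dict
    A dictionary of raw statement JSONs keyed by raw statement ID.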
140 | """ 141 | if db is None: 142 | db = get_db('primary') 143 | 144 | if clauses is None: 145 | clauses = [] 146 | 147 | q = db.session.query( 148 | db.RawStatements.id, 149 | db.RawStatements.json, 150 | db.Reading.id, 151 | db.TextContent.id, 152 | db.TextRef 153 | ).filter( 154 | *clauses 155 | ).outerjoin( 156 | db.Reading, 157 | db.Reading.id == db.RawStatements.reading_id 158 | ).outerjoin( 159 | db.TextContent, 160 | db.TextContent.id == db.Reading.text_content_id 161 | ).outerjoin( 162 | db.TextRef, 163 | db.TextRef.id == db.TextContent.text_ref_id 164 | ) 165 | 166 | if max_stmts is not None: 167 | q = q.limit(max_stmts) 168 | 169 | if offset is not None: 170 | q = q.offset(offset) 171 | 172 | raw_stmt_jsons = {} 173 | for sid, json_bytes, rid, tcid, tr in q.all(): 174 | raw_j = json.loads(json_bytes) 175 | if rid is not None: 176 | _fix_evidence(raw_j['evidence'][0], rid, tcid, tr.get_ref_dict()) 177 | raw_stmt_jsons[sid] = raw_j 178 | 179 | return raw_stmt_jsons 180 | 181 | 182 | # ====== 183 | # Tools 184 | # ====== 185 | 186 | 187 | def _get_id_col(tr, id_type): 188 | if id_type == 'trid': 189 | id_attr = tr.id 190 | else: 191 | try: 192 | id_attr = getattr(tr, id_type) 193 | except AttributeError: 194 | raise ValueError("Invalid id_type: %s" % id_type) 195 | return id_attr 196 | 197 | 198 | def _fix_evidence(ev, rid, tcid, tr_dict): 199 | ev['text_refs'] = tr_dict 200 | ev['text_refs']['TCID'] = tcid 201 | ev['text_refs']['READING_ID'] = rid 202 | if 'PMID' in tr_dict: 203 | ev['pmid'] = tr_dict['PMID'] 204 | return 205 | 206 | -------------------------------------------------------------------------------- /indra_db/client/readonly/__init__.py: -------------------------------------------------------------------------------- 1 | from .util import * 2 | from .query import * 3 | -------------------------------------------------------------------------------- /indra_db/client/readonly/mesh_ref_counts.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import func 2 | 3 | from indra_db import get_ro 4 | 5 | 6 | def get_mesh_ref_counts(mesh_terms, require_all=False, ro=None): 7 | """Get the number of distinct pmids by mesh term for each hash. 8 | 9 | This function directly queries a table in the readonly database that counts 10 | the number of distinct PMIDs for each mesh term/hash pair. Given a list of 11 | mesh terms, this will return a dictionary keyed by hash containing 12 | dictionaries indicating how much support the hash has from each of the given 13 | mesh IDs in terms of distinct PMIDs (thus distinct publications). 14 | 15 | Parameters 16 | ---------- 17 | mesh_terms : list 18 | A list of mesh term strings of the form "D000#####". 19 | require_all : Optional[bool] 20 | If True, require that each entry in the result includes both mesh terms. 21 | In other words, only return results where, for each hash, articles exist 22 | with support from all MeSH IDs given, not just one or the other. Default 23 | is False 24 | ro : Optional[DatabaseManager] 25 | A database manager handle. The default is the primary readonly, as 26 | indicated by environment variables or the config file. 27 | """ 28 | # Get the default readonly database, if needed.. 29 | if ro is None: 30 | ro = get_ro('primary') 31 | 32 | # Make sure the mesh IDs are of the correct kind. 
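# MeSH descriptor ("term") IDs begin with D and supplementary concept IDs
# begin with C; the prefix determines below whether MeshTermRefCounts or
# MeshConceptRefCounts is queried.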
33 | if not all(m.startswith('D') or m.startswith('C') for m in mesh_terms): 34 | raise ValueError("All mesh terms must begin with C or D.") 35 | 36 | # Convert the IDs to numbers for faster lookup. 37 | result = {} 38 | for prefix, table in [('C', ro.MeshConceptRefCounts), 39 | ('D', ro.MeshTermRefCounts)]: 40 | mesh_num_map = {int(m[1:]): m for m in mesh_terms 41 | if m.startswith(prefix)} 42 | if not mesh_num_map: 43 | continue 44 | 45 | # Build the query. 46 | nums = func.array_agg(table.mesh_num) 47 | counts = func.array_agg(table.ref_count) 48 | q = ro.session.query(table.mk_hash, nums.label('nums'), 49 | counts.label('ref_counts'), table.pmid_count) 50 | if len(mesh_num_map.keys()) == 1: 51 | q = q.filter(table.mesh_num == list(mesh_num_map.keys())[0]) 52 | elif len(mesh_num_map.keys()) > 1: 53 | q = q.filter(table.mesh_num.in_(mesh_num_map.keys())) 54 | q = q.group_by(table.mk_hash, table.pmid_count) 55 | 56 | # Apply the require all option by comparing the length of the nums array 57 | # to the number of inputs. 58 | if require_all: 59 | q = q.having(func.cardinality(nums) == len(mesh_num_map.keys())) 60 | 61 | # Parse the results. 62 | for mk_hash, nums, counts, pmid_count in q.all(): 63 | count_dict = {mesh_num_map[mesh_num]: ref_count 64 | for mesh_num, ref_count in zip(nums, counts)} 65 | if mk_hash not in result: 66 | result[mk_hash] = count_dict 67 | result[mk_hash]['total'] = pmid_count 68 | else: 69 | result[mk_hash].update(count_dict) 70 | result[mk_hash]['total'] += sum(counts) 71 | 72 | # Little sloppy, but delete any that don't meet the require_all constraint. 73 | if require_all: 74 | num_terms = len(set(mesh_terms)) 75 | for mk_hash in result.copy().keys(): 76 | if len(result[mk_hash]) != num_terms + 1: 77 | result.pop(mk_hash) 78 | return result 79 | -------------------------------------------------------------------------------- /indra_db/client/readonly/util.py: -------------------------------------------------------------------------------- 1 | __all__ = ['stmt_from_interaction'] 2 | 3 | import logging 4 | 5 | from indra.statements import get_statement_by_name, Agent, ActiveForm 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | snowflakes = ['Complex', 'Translocation', 'ActiveForm', 'Conversion', 11 | 'Autophosphorylation'] 12 | 13 | 14 | def stmt_from_interaction(interaction): 15 | """Get a shell statement from an interaction.""" 16 | StmtClass = get_statement_by_name(interaction['type']) 17 | if interaction['type'] == 'Complex': 18 | agents = [Agent(name) for name in interaction['agents'].values()] 19 | stmt = StmtClass(agents) 20 | elif interaction['type'] == 'ActiveForm': 21 | name = interaction['agents'][0] 22 | agent = Agent(name) 23 | stmt = StmtClass(agent, interaction['activity'], 24 | interaction['is_active']) 25 | else: 26 | agents = [Agent(interaction['agents'][i]) 27 | if interaction['agents'].get(i) 28 | else None 29 | for i in range(len(StmtClass._agent_order))] 30 | stmt = StmtClass(*agents) 31 | return stmt 32 | 33 | 34 | def _iter_agents(stmt_json, agent_order): 35 | for i, ag_key in enumerate(agent_order): 36 | ag = stmt_json.get(ag_key) 37 | if ag is None: 38 | continue 39 | if isinstance(ag, list): 40 | # Like a complex 41 | for ag_obj in ag: 42 | if stmt_json['type'] in snowflakes: 43 | yield None, ag_obj 44 | else: 45 | yield ['subject', 'object'][i], ag_obj 46 | else: 47 | if stmt_json['type'] in snowflakes: 48 | yield None, ag 49 | else: 50 | yield ['subject', 'object'][i], ag 51 | 
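# A minimal usage sketch for stmt_from_interaction (the interaction dict here
# is illustrative only; real ones are produced by the readonly query code):
#
#     >>> interaction = {'type': 'Phosphorylation',
#     ...                'agents': {0: 'MAP2K1', 1: 'MAPK1'}}
#     >>> stmt = stmt_from_interaction(interaction)
#     >>> type(stmt).__name__
#     'Phosphorylation'
#
# For 'Complex' every value in the agents dict becomes a member; 'ActiveForm'
# additionally expects 'activity' and 'is_active' keys on the interaction.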
-------------------------------------------------------------------------------- /indra_db/exceptions.py: -------------------------------------------------------------------------------- 1 | 2 | class IndraDbException(Exception): 3 | pass 4 | 5 | 6 | class NoAuthError(IndraDbException): 7 | def __init__(self, api_key, access): 8 | msg = "The api key %s does not grand access to %s." % (api_key, access) 9 | super(NoAuthError, self).__init__(msg) 10 | 11 | 12 | class BadHashError(IndraDbException): 13 | def __init__(self, mk_hash): 14 | self.bad_hash = mk_hash 15 | msg = 'The matches-key hash %s is not valid.' % mk_hash 16 | super(BadHashError, self).__init__(msg) 17 | -------------------------------------------------------------------------------- /indra_db/preassembly/submitter.py: -------------------------------------------------------------------------------- 1 | from indra.statements import get_all_descendants, Statement 2 | from indra_reading.batch.submitters.submitter import Submitter 3 | from indra_reading.batch.util import bucket_name 4 | 5 | DEFAULT_AVOID_STATEMENTS = ['Event', 'Influence', 'Unresolved'] 6 | VALID_STATEMENTS = [st.__name__ for st in get_all_descendants(Statement) 7 | if st.__name__ not in DEFAULT_AVOID_STATEMENTS] 8 | 9 | 10 | class PreassemblySubmitter(Submitter): 11 | job_class = 'preassembly' 12 | _purpose = 'db_preassembly' 13 | _job_queue_dict = {'run_db_reading_queue': ['create', 'update']} 14 | _job_def_dict = {'run_db_reading_jobdef': ['create', 'update']} 15 | 16 | def __init__(self, basename, task, *args, **kwargs): 17 | if task not in ['create', 'update']: 18 | raise ValueError(f"Invalid task '{task}': expected 'create' or " 19 | f"'update'.") 20 | self.task = task 21 | super(PreassemblySubmitter, self).__init__(basename, *args, **kwargs) 22 | 23 | def _iter_over_select_queues(self): 24 | for jq, tasks in self._job_queue_dict.items(): 25 | if self.task not in tasks: 26 | continue 27 | yield jq 28 | 29 | def _get_command(self, job_type_set, *args): 30 | if len(args) == 2: 31 | stmt_type, batch_size = args 32 | continuing = False 33 | else: 34 | stmt_type, batch_size, continuing = args 35 | if self.task not in job_type_set: 36 | return None, None 37 | job_name = f'{self.job_base}_{self.task}_{stmt_type}' 38 | s3_cache = f's3://{bucket_name}/{self.s3_base}/{job_name}' 39 | cmd = ['python3', '-m', 'indra_db.preassembly.preassemble_db', 40 | self.task, '-C', s3_cache, '-T', stmt_type, '-Y', 41 | '-b', str(batch_size)] 42 | if continuing: 43 | cmd += ['-c'] 44 | return job_name, cmd 45 | 46 | def _iter_job_args(self, *args): 47 | type_list = args[0] 48 | if type_list is None: 49 | type_list = VALID_STATEMENTS 50 | 51 | invalid_types = set(type_list) - set(VALID_STATEMENTS) 52 | if invalid_types: 53 | raise ValueError(f"Found invalid statement types: {invalid_types}") 54 | 55 | for stmt_type in type_list: 56 | yield (stmt_type,) + tuple(args[1:]) 57 | -------------------------------------------------------------------------------- /indra_db/reading/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gyorilab/indra_db/77785ce0d1badd271b120db747abfff4d6f35832/indra_db/reading/__init__.py -------------------------------------------------------------------------------- /indra_db/readonly_dumping/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gyorilab/indra_db/77785ce0d1badd271b120db747abfff4d6f35832/indra_db/readonly_dumping/__init__.py -------------------------------------------------------------------------------- /indra_db/readonly_dumping/export_assembly_refinement.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import time 4 | from datetime import datetime, timezone 5 | 6 | import boto3 7 | 8 | from indra_db import get_db 9 | from indra_db.readonly_dumping.export_assembly import split_tsv_gz_file, \ 10 | batch_size, count_rows_in_tsv_gz, get_refinement_graph, \ 11 | refinement_cycles_fpath, calculate_belief 12 | import multiprocessing as mp 13 | 14 | from indra_db.util import S3Path 15 | from .locations import * 16 | from indra_db.readonly_dumping.util import record_time 17 | import logging 18 | 19 | logger = logging.getLogger("indra_db.readonly_dumping.export_assembly") 20 | logger.setLevel(logging.DEBUG) 21 | logger.propagate = False 22 | 23 | file_handler = logging.FileHandler(pipeline_log_fpath.absolute().as_posix(), mode='a') 24 | file_handler.setLevel(logging.DEBUG) 25 | 26 | formatter = logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s', datefmt='%m-%d %H:%M') 27 | file_handler.setFormatter(formatter) 28 | 29 | logger.addHandler(file_handler) 30 | 31 | #put the rest of export_assembly in a seperate file to ensure memory is released in EC2 32 | if __name__ == '__main__': 33 | if not refinements_fpath.exists() or not belief_scores_pkl_fpath.exists(): 34 | db = get_db("primary") 35 | res = db.select_all(db.DBInfo) 36 | db_name_api_mapping = {r.db_name: r.source_api for r in res} 37 | 38 | time_benchmark = {} 39 | start_time = time.time() 40 | mp.set_start_method('spawn') 41 | logger.info("6. Running setup for refinement calculation") 42 | 43 | # 6. Calculate refinement graph: 44 | 45 | if not split_unique_statements_folder_fpath.exists(): 46 | logger.info("Splitting unique statements") 47 | # time: 30 min 48 | split_tsv_gz_file(unique_stmts_fpath.as_posix(), 49 | split_unique_statements_folder_fpath.as_posix(), 50 | batch_size=batch_size) 51 | logger.info( 52 | "Finished splitting unique statement" 53 | ) 54 | else: 55 | logger.info( 56 | "split_unique_statements_folder exist" 57 | ) 58 | split_unique_files = [os.path.join(split_unique_statements_folder_fpath, f) 59 | for f in 60 | os.listdir(split_unique_statements_folder_fpath) 61 | if f.endswith(".gz")] 62 | split_unique_files = sorted( 63 | split_unique_files, 64 | key=lambda x: int(re.findall(r'\d+', x)[0]) 65 | ) 66 | batch_count = len(split_unique_files) 67 | # get the n_rows in the last uncompleted batch 68 | last_count = count_rows_in_tsv_gz(split_unique_files[-1]) 69 | num_rows = (batch_count - 1) * batch_size + last_count 70 | logger.info(f"{num_rows} rows in unique statements with " 71 | f"{batch_count} batches") 72 | cycles_found = False 73 | 74 | ref_graph = get_refinement_graph(n_rows=num_rows, 75 | split_files=split_unique_files) 76 | end_time = time.time() 77 | record_time(export_benchmark.absolute().as_posix(), 78 | (end_time - start_time)/3600, 79 | 'Refinement step', 'a') 80 | 81 | # 7. Get belief scores, if there were no refinement cycles 82 | start_time = time.time() 83 | if cycles_found: 84 | logger.info( 85 | f"Refinement graph stored in variable 'ref_graph', " 86 | f"edges saved to {refinements_fpath.as_posix()}" 87 | f"and cycles saved to {refinement_cycles_fpath.as_posix()}" 88 | ) 89 | 90 | else: 91 | logger.info("7. 
Calculating belief") 92 | calculate_belief( 93 | refinements_graph=ref_graph, 94 | num_batches=batch_count, 95 | batch_size=batch_size, 96 | source_mapping=db_name_api_mapping, 97 | ) 98 | 99 | # upload source_count, belief_score 100 | # and processed_statement to S3 for cogex usage 101 | timestamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S") 102 | 103 | s3 = boto3.client("s3") 104 | base_s3_path = S3Path("bigmech", 105 | f"indra-db/dumps/cogex_files/{timestamp}") 106 | 107 | for local_file in [source_counts_fpath, processed_stmts_fpath, 108 | belief_scores_pkl_fpath]: 109 | s3_path = base_s3_path.get_element_path(local_file.name) 110 | s3_path.upload(s3, body=local_file.read_bytes()) 111 | logger.info(f"Uploaded {local_file} → {s3_path}") 112 | 113 | if refinements_fpath.exists() or refinement_cycles_fpath.exists(): 114 | for local_file in [refinements_fpath, refinement_cycles_fpath]: 115 | s3_path = base_s3_path.get_element_path(local_file.name) 116 | s3_path.upload(s3, body=local_file.read_bytes()) 117 | logger.info(f"Uploaded {local_file} → {s3_path}") 118 | 119 | end_time = time.time() 120 | record_time(export_benchmark.absolute().as_posix(), 121 | (end_time - start_time) / 3600, 122 | 'Belief score step', 'a') 123 | else: 124 | logger.info("Final output already exists, stopping script") -------------------------------------------------------------------------------- /indra_db/readonly_dumping/locations.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pystow 3 | 4 | TEMP_DIR = pystow.module("readonly_pipeline") 5 | 6 | 7 | __all__ = [ 8 | "TEMP_DIR", 9 | "PUBMED_MESH_DIR", 10 | "pubmed_xml_gz_dir", 11 | "raw_statements_fpath", 12 | "reading_text_content_fpath", 13 | "text_refs_fpath", 14 | "drop_readings_fpath", 15 | "reading_to_text_ref_map_fpath", 16 | "processed_stmts_reading_fpath", 17 | "processed_stmts_fpath", 18 | "source_counts_reading_fpath", 19 | "source_counts_knowledgebases_fpath", 20 | "source_counts_fpath", 21 | "stmt_hash_to_raw_stmt_ids_fpath", 22 | "stmt_hash_to_raw_stmt_ids_reading_fpath", 23 | "stmt_hash_to_raw_stmt_ids_knowledgebases_fpath", 24 | "raw_id_info_map_fpath", 25 | "raw_id_info_map_reading_fpath", 26 | "raw_id_info_map_knowledgebases_fpath", 27 | "grounded_stmts_fpath", 28 | "unique_stmts_fpath", 29 | "refinements_fpath", 30 | "belief_scores_pkl_fpath", 31 | "pa_hash_act_type_ag_count_cache", 32 | "belief_scores_tsv_fpath", 33 | "reading_ref_link_tsv_fpath", 34 | "raw_stmt_source_tsv_fpath", 35 | "PUBMED_MESH_DIR", 36 | "pubmed_xml_gz_dir", 37 | "pmid_mesh_map_fpath", 38 | "pmid_mesh_mti_fpath", 39 | "pmid_stmt_hash_fpath", 40 | "pmid_mesh_concept_counts_fpath", 41 | "pmid_mesh_term_counts_fpath", 42 | "mk_hash_pmid_sets_fpath", 43 | "mesh_concept_ref_counts_fpath", 44 | "mesh_term_ref_counts_fpath", 45 | "mesh_concepts_meta_fpath", 46 | "mesh_terms_meta_fpath", 47 | "raw_stmt_mesh_concepts_fpath", 48 | "raw_stmt_mesh_terms_fpath", 49 | "pa_meta_fpath", 50 | "name_meta_tsv", 51 | "text_meta_tsv", 52 | "other_meta_tsv", 53 | "source_meta_parquet", 54 | "evidence_counts_tsv", 55 | "pa_agents_counts_tsv", 56 | 'split_raw_statements_folder_fpath', 57 | 'split_unique_statements_folder_fpath', 58 | "sql_ontology_db_fpath", 59 | "postgresql_jar", 60 | "split_pa_link_folder_fpath", 61 | "standard_readonly_snapshot", 62 | "new_readonly_snapshot", 63 | "export_benchmark", 64 | "table_benchmark", 65 | "pipeline_log_fpath", 66 | "knowledgebase_source_data_fpath" 67 | ] 68 | 
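# All paths below are resolved through the pystow module created above, so the
# concrete location depends on local pystow settings (by default roughly
# ~/.data/readonly_pipeline/). A rough sketch of how a name maps to a path:
#
#     >>> import pystow
#     >>> pystow.module("readonly_pipeline").join(name="raw_statements.tsv.gz")
#     PosixPath('.../readonly_pipeline/raw_statements.tsv.gz')  # exact root varies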
pipeline_log_fpath = TEMP_DIR.join(name="Pipeline.log") 69 | 70 | # knowledgebase source files 71 | knowledgebase_source_data_fpath = TEMP_DIR.join(name="kb_source_data") 72 | knowledgebase_version_record = TEMP_DIR.join(name='knowledgebase_version_record.tsv') 73 | 74 | # Dump files and their derivatives 75 | split_raw_statements_folder_fpath = TEMP_DIR.join(name="split_raw_statements") 76 | raw_statements_fpath = TEMP_DIR.join(name="raw_statements.tsv.gz") 77 | reading_text_content_fpath = TEMP_DIR.join(name="reading_text_content_meta.tsv.gz") 78 | text_refs_fpath = TEMP_DIR.join(name="text_refs_principal.tsv.gz") 79 | drop_readings_fpath = TEMP_DIR.join(name="drop_readings.pkl") 80 | reading_to_text_ref_map_fpath = TEMP_DIR.join(name="reading_to_text_ref_map.pkl") 81 | processed_stmts_reading_fpath = TEMP_DIR.join( 82 | name="processed_statements_reading.tsv.gz" 83 | ) 84 | processed_stmts_fpath = TEMP_DIR.join(name="processed_statements.tsv.gz") 85 | source_counts_reading_fpath = TEMP_DIR.join(name="source_counts_reading.pkl") 86 | source_counts_knowledgebases_fpath = TEMP_DIR.join( 87 | name="source_counts_knowledgebases.pkl" 88 | ) 89 | source_counts_fpath = TEMP_DIR.join(name="source_counts.pkl") 90 | stmt_hash_to_raw_stmt_ids_fpath = TEMP_DIR.join(name="stmt_hash_to_raw_stmt_ids.pkl") 91 | stmt_hash_to_raw_stmt_ids_reading_fpath = TEMP_DIR.join( 92 | name="stmt_hash_to_raw_stmt_ids_reading.pkl" 93 | ) 94 | stmt_hash_to_raw_stmt_ids_knowledgebases_fpath = TEMP_DIR.join( 95 | name="stmt_hash_to_raw_stmt_ids_knowledgebases.pkl" 96 | ) 97 | raw_id_info_map_fpath = TEMP_DIR.join(name="raw_stmt_id_to_info_map.tsv.gz") 98 | raw_id_info_map_reading_fpath = TEMP_DIR.join( 99 | name="raw_stmt_id_to_info_map_reading.tsv.gz" 100 | ) 101 | raw_id_info_map_knowledgebases_fpath = TEMP_DIR.join( 102 | name="raw_stmt_id_to_info_map_knowledgebases.tsv.gz" 103 | ) 104 | grounded_stmts_fpath = TEMP_DIR.join(name="grounded_statements.tsv.gz") 105 | unique_stmts_fpath = TEMP_DIR.join(name="unique_statements.tsv.gz") 106 | refinements_fpath = TEMP_DIR.join(name="refinements.tsv.gz") 107 | sql_ontology_db_fpath = TEMP_DIR.join(name='bio_ontology.db') 108 | split_unique_statements_folder_fpath = TEMP_DIR.join(name="split_unique_statements_folder") 109 | belief_scores_pkl_fpath = TEMP_DIR.join(name="belief_scores.pkl") 110 | pa_hash_act_type_ag_count_cache = TEMP_DIR.join( 111 | name="pa_hash_act_type_ag_count_cache.pkl" 112 | ) 113 | 114 | # Temporary tsv files used for load into readonly db 115 | belief_scores_tsv_fpath = TEMP_DIR.join(name="belief_scores.tsv") 116 | reading_ref_link_tsv_fpath = TEMP_DIR.join(name="reading_ref_link.tsv") 117 | raw_stmt_source_tsv_fpath = TEMP_DIR.join(name="raw_stmt_source.tsv") 118 | 119 | # Pubmed XML files 120 | PUBMED_MESH_DIR = TEMP_DIR.module("pubmed_mesh") 121 | pubmed_xml_gz_dir = PUBMED_MESH_DIR.join(name="pubmed_xml_gz") 122 | 123 | # stmt hash-pmid-MeSH map 124 | pmid_mesh_map_fpath = PUBMED_MESH_DIR.join(name="pmid_mesh_map.pkl") 125 | pmid_mesh_mti_fpath = PUBMED_MESH_DIR.join(name="pmid_mesh_mti.tsv") 126 | pmid_stmt_hash_fpath = PUBMED_MESH_DIR.join(name="pmid_stmt_hash.pkl") 127 | 128 | # MeshConcept/TermRefCounts 129 | pmid_mesh_concept_counts_fpath = TEMP_DIR.join(name="pmid_mesh_concept_counts.pkl") 130 | pmid_mesh_term_counts_fpath = TEMP_DIR.join(name="pmid_mesh_term_counts.pkl") 131 | mk_hash_pmid_sets_fpath = TEMP_DIR.join(name="mk_hash_pmid_sets.pkl") 132 | mesh_concept_ref_counts_fpath = TEMP_DIR.join(name="mesh_concept_ref_counts.tsv") 133 | 
mesh_term_ref_counts_fpath = TEMP_DIR.join(name="mesh_term_ref_counts.tsv") 134 | 135 | # MeshConceptMeta and MeshTermMeta 136 | mesh_concepts_meta_fpath = PUBMED_MESH_DIR.join(name="mesh_concepts_meta.tsv") 137 | mesh_terms_meta_fpath = PUBMED_MESH_DIR.join(name="mesh_terms_meta.tsv") 138 | 139 | # RawStmtMeshConcepts and RawStmtMeshTerms 140 | raw_stmt_mesh_concepts_fpath = PUBMED_MESH_DIR.join(name="raw_stmt_mesh_concepts.tsv") 141 | raw_stmt_mesh_terms_fpath = PUBMED_MESH_DIR.join(name="raw_stmt_mesh_terms.tsv") 142 | 143 | # PaMeta and derived files 144 | pa_meta_fpath = TEMP_DIR.join(name="pa_meta.tsv") 145 | name_meta_tsv = TEMP_DIR.join(name="name_meta.tsv") 146 | text_meta_tsv = TEMP_DIR.join(name="text_meta.tsv") 147 | other_meta_tsv = TEMP_DIR.join(name="other_meta.tsv") 148 | 149 | # SourceMeta 150 | source_meta_parquet = TEMP_DIR.join(name="source_meta.parquet") 151 | 152 | # EvidenceCounts 153 | evidence_counts_tsv = TEMP_DIR.join(name="evidence_counts.tsv") 154 | 155 | # PaAgentCounts 156 | pa_agents_counts_tsv = TEMP_DIR.join(name="pa_agents_counts.tsv") 157 | 158 | #table construction 159 | postgresql_jar = TEMP_DIR.join(name='postgresql-42.7.3.jar') 160 | split_pa_link_folder_fpath = TEMP_DIR.join(name='split_parquet') 161 | standard_readonly_snapshot = TEMP_DIR.join(name='schema_snapshot.sql') 162 | new_readonly_snapshot=TEMP_DIR.join(name='new_readonly_snapshot.sql') 163 | export_benchmark = TEMP_DIR.join(name='export_benchmark_times.txt') 164 | table_benchmark = TEMP_DIR.join(name='table_benchmark_times.txt') 165 | 166 | 167 | if __name__ == "__main__": 168 | # Print the requested path to stdout if there is a match 169 | import sys 170 | 171 | file_name = sys.argv[1] 172 | for file_var in __all__: 173 | if file_var.startswith(file_name): 174 | if hasattr(sys.modules[__name__], file_var): 175 | path = getattr(sys.modules[__name__], file_var) 176 | assert isinstance(path, Path) 177 | print(path.absolute().as_posix()) 178 | break 179 | else: 180 | raise ValueError(f"Could not find file {file_name}") 181 | -------------------------------------------------------------------------------- /indra_db/readonly_dumping/rds_restore.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Log start time 4 | echo "[$(date)] Starting RDS instance creation..." 5 | 6 | # Step 1: Create the RDS instance 7 | aws rds create-db-instance \ 8 | --db-instance-identifier readonly-test \ 9 | --db-instance-class db.m5.xlarge \ 10 | --engine postgres \ 11 | --allocated-storage 500 \ 12 | --master-username masteruser \ 13 | --master-user-password testpassword \ 14 | --vpc-security-group-ids sg-0c49d0d42c8ae49c1 \ 15 | --availability-zone us-east-1a \ 16 | --backup-retention-period 7 \ 17 | --db-name postgres \ 18 | --publicly-accessible 19 | 20 | # Log progress 21 | echo "[$(date)] RDS instance creation initiated. Waiting for it to be available..." 22 | 23 | # Step 2: Wait for the RDS instance to become available 24 | aws rds wait db-instance-available --db-instance-identifier readonly-test 25 | echo "[$(date)] RDS instance is now available." 26 | 27 | # Step 3: Get the RDS endpoint 28 | RDS_ENDPOINT=$(aws rds describe-db-instances \ 29 | --db-instance-identifier readonly-test \ 30 | --query "DBInstances[0].Endpoint.Address" \ 31 | --output text) 32 | 33 | if [[ -z "$RDS_ENDPOINT" ]]; then 34 | echo "[$(date)] Failed to retrieve RDS endpoint." 
35 | exit 1 36 | fi 37 | 38 | echo "[$(date)] RDS Endpoint: $RDS_ENDPOINT" 39 | 40 | # Step 4: Connect to the RDS instance and create a database 41 | 42 | echo "[$(date)] Connecting to RDS to create the database..." 43 | PGPASSWORD=testpassword psql -h $RDS_ENDPOINT -U masteruser -d postgres -p 5432 -c "DROP DATABASE IF EXISTS indradb_readonly_test;" 44 | PGPASSWORD=testpassword psql -h $RDS_ENDPOINT -U masteruser -d postgres -p 5432 -c "CREATE DATABASE indradb_readonly_test;" 45 | 46 | if [[ $? -ne 0 ]]; then 47 | echo "[$(date)] Failed to create the database." 48 | exit 1 49 | fi 50 | 51 | echo "[$(date)] Database 'indradb_readonly_test' created successfully." 52 | 53 | # Step 5: Restore the dump file from S3 into the new database 54 | echo "[$(date)] Restoring dump file into the database..." 55 | 56 | aws s3 cp s3://bigmech/indra-db/dumps/indradb_readonly_local_test.dump - | \ 57 | PGPASSWORD=testpassword psql -h $RDS_ENDPOINT -U masteruser -d indradb_readonly_test 58 | 59 | if [[ $? -ne 0 ]]; then 60 | echo "[$(date)] Failed to restore the dump file." 61 | exit 1 62 | fi 63 | 64 | echo "[$(date)] Dump file restored successfully into 'indradb_readonly_test'." -------------------------------------------------------------------------------- /indra_db/readonly_dumping/readonly_dumping_bash.sh: -------------------------------------------------------------------------------- 1 | # shellcheck disable=SC1090 2 | # SETUP 3 | set -e 4 | # Get password to the principal database from the user 5 | echo "Enter password for the principal database:" 6 | read -s PGPASSWORD # -s flag hides the password 7 | # Use the PGPASSWORD environment variable to set the password, see: 8 | # https://www.postgresql.org/docs/13/libpq-envars.html 9 | export PGPASSWORD 10 | 11 | # If the password is empty, exit 12 | if [ -z "$PGPASSWORD" ] 13 | then 14 | echo "Password is empty. Exiting." 
15 | exit 1 16 | fi 17 | 18 | 19 | 20 | #Set the user for the local db 21 | LOCAL_RO_USER="postgres" 22 | export LOCAL_RO_USER 23 | 24 | # Set the password for the local db 25 | echo "Provide password for the local database:" 26 | read -s LOCAL_RO_PASSWORD 27 | export LOCAL_RO_PASSWORD 28 | 29 | # Set the name of the local db 30 | LOCAL_RO_DB_NAME="indradb_readonly_local_test" 31 | export LOCAL_RO_DB_NAME 32 | echo "Local db name: $LOCAL_RO_DB_NAME" 33 | 34 | # Get the current date and time 35 | START_DATE_TIME=`date '+%Y-%m-%d %H:%M:%S'` 36 | START_DATE=`date '+%Y-%m-%d'` 37 | echo "{\"datetime\": \"$START_DATE_TIME\", \"date_stamp\": \"$START_DATE\"}" > start.json 38 | S3_PATH="s3://bigmech/indra-db/dumps/$START_DATE" 39 | aws s3 cp start.json "$S3_PATH/start.json" 40 | echo "Start date marked as: $START_DATE" 41 | # INITIAL DUMPING 42 | 43 | # Get file paths for initial dump files 44 | RAW_STMTS_FPATH=`python3 -m indra_db.readonly_dumping.locations raw_statements` 45 | export RAW_STMTS_FPATH 46 | READING_TEXT_CONTENT_META_FPATH=`python3 -m indra_db.readonly_dumping.locations reading_text_content` 47 | export READING_TEXT_CONTENT_META_FPATH 48 | TEXT_REFS_PRINCIPAL_FPATH=`python3 -m indra_db.readonly_dumping.locations text_refs` 49 | export TEXT_REFS_PRINCIPAL_FPATH 50 | 51 | 52 | if [ -z "$RAW_STMTS_FPATH" ] || [ -z "$READING_TEXT_CONTENT_META_FPATH" ] || [ -z "$TEXT_REFS_PRINCIPAL_FPATH" ] 53 | then 54 | if [ -z "$RAW_STMTS_FPATH" ] 55 | then 56 | echo "Raw statements file path is empty" 57 | fi 58 | if [ -z "$READING_TEXT_CONTENT_META_FPATH" ] 59 | then 60 | echo "Reading text content meta file path is empty" 61 | fi 62 | if [ -z "$TEXT_REFS_PRINCIPAL_FPATH" ] 63 | then 64 | echo "Text refs principal file path is empty" 65 | fi 66 | exit 1 67 | else 68 | echo "Raw statements file path: $RAW_STMTS_FPATH" 69 | echo "Reading text content meta file path: $READING_TEXT_CONTENT_META_FPATH" 70 | echo "Text refs principal file path: $TEXT_REFS_PRINCIPAL_FPATH" 71 | fi 72 | 73 | # Exit if any of the file names are empty 74 | if [ ! -f "$RAW_STMTS_FPATH" ] 75 | then 76 | echo "Dumping raw statements" 77 | start=$(date +%s) 78 | psql -d indradb_test \ 79 | -h indradb-refresh.cvyak4iikv71.us-east-1.rds.amazonaws.com \ 80 | -U tester \ 81 | -w \ 82 | -c "COPY (SELECT id, db_info_id, reading_id, 83 | convert_from (json::bytea, 'utf-8') 84 | FROM public.raw_statements) 85 | TO STDOUT" \ 86 | | gzip > "$RAW_STMTS_FPATH" 87 | end=$(date +%s) 88 | runtime=$((end-start)) 89 | echo "Dumped raw statements in $runtime seconds" 90 | else 91 | echo "Raw statements file already exists, skipping dump" 92 | fi 93 | 94 | if [ ! -f "$READING_TEXT_CONTENT_META_FPATH" ] 95 | then 96 | echo "Dumping reading text content meta" 97 | start=$(date +%s) 98 | psql -d indradb_test \ 99 | -h indradb-refresh.cvyak4iikv71.us-east-1.rds.amazonaws.com \ 100 | -U tester \ 101 | -w \ 102 | -c "COPY (SELECT rd.id, rd.reader_version, tc.id, tc.text_ref_id, 103 | tc.source, tc.text_type 104 | FROM public.text_content as tc, public.reading as rd 105 | WHERE tc.id = rd.text_content_id) 106 | TO STDOUT" \ 107 | | gzip > "$READING_TEXT_CONTENT_META_FPATH" 108 | end=$(date +%s) 109 | runtime=$((end-start)) 110 | echo "Dumped reading text content meta in $runtime seconds" 111 | else 112 | echo "Reading text content meta file already exists, skipping dump" 113 | fi 114 | 115 | if [ ! 
-f "$TEXT_REFS_PRINCIPAL_FPATH" ] 116 | then 117 | echo "Dumping text refs principal" 118 | start=$(date +%s) 119 | psql -d indradb_test \ 120 | -h indradb-refresh.cvyak4iikv71.us-east-1.rds.amazonaws.com \ 121 | -U tester \ 122 | -w \ 123 | -c "COPY (SELECT id, pmid, pmcid, doi, pii, url, manuscript_id 124 | FROM public.text_ref) 125 | TO STDOUT" \ 126 | | gzip > "$TEXT_REFS_PRINCIPAL_FPATH" 127 | end=$(date +%s) 128 | runtime=$((end-start)) 129 | echo "Dumped text refs in $runtime seconds" 130 | else 131 | echo "Text refs principal file already exists, skipping dump" 132 | fi 133 | # LOCAL DB CREATION AND DUMPING 134 | 135 | python -m indra_db.readonly_dumping.export_assembly 136 | python -m indra_db.readonly_dumping.export_assembly_refinement 137 | 138 | # Create db; 139 | PGPASSWORD=$LOCAL_RO_PASSWORD 140 | export PGPASSWORD 141 | 142 | psql -h localhost -U postgres -c "DROP DATABASE IF EXISTS $LOCAL_RO_DB_NAME" 143 | psql -h localhost -U postgres -c "CREATE DATABASE $LOCAL_RO_DB_NAME" 144 | ## Run import script 145 | python3 -m indra_db.readonly_dumping.readonly_dumping \ 146 | --db-name $LOCAL_RO_DB_NAME \ 147 | --user $LOCAL_RO_USER \ 148 | --password "$LOCAL_RO_PASSWORD" 149 | # --force # Use if you want to overwrite an existing db, if it exists 150 | 151 | # Dump the db, once done importing 152 | pg_dump -h localhost \ 153 | -U postgres \ 154 | -w \ 155 | -f "${LOCAL_RO_DB_NAME}.dump" $LOCAL_RO_DB_NAME 156 | 157 | ## copy to s3 158 | aws s3 cp "${LOCAL_RO_DB_NAME}.dump" "s3://bigmech/indra-db/dumps/" 159 | 160 | # Remove dump file only after it has been copied to s3 successfully 161 | #rm "${LOCAL_RO_DB_NAME}.dump" 162 | 163 | # Upload an end date file to S3 164 | # This is used to keep track of the end date of the dump 165 | # The file is uploaded to the indra-db/dumps/ directory 166 | # The file name is the current date and time 167 | 168 | # Get the current date and time 169 | END_DATE_TIME=`date '+%Y-%m-%d %H:%M:%S'` 170 | END_DATE=`date '+%Y-%m-%d'` 171 | echo "{\"datetime\": \"$END_DATE_TIME\", \"date_stamp\": \"$END_DATE\"}" > end.json 172 | aws s3 cp end.json "$S3_PATH/end.json" 173 | 174 | # At this point, if a new readonly instance is already created, we could run 175 | # the following command to update the instance (assuming the password is set 176 | # in PGPASSWORD, which will be read if -w is set): 177 | # pg_restore -h .us-east-1.rds.amazonaws.com \ 178 | # -U \ 179 | # -f \ 180 | # -w \ 181 | # -d indradb_readonly \ 182 | # --no-owner -------------------------------------------------------------------------------- /indra_db/resources/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gyorilab/indra_db/77785ce0d1badd271b120db747abfff4d6f35832/indra_db/resources/__init__.py -------------------------------------------------------------------------------- /indra_db/resources/default_db_config.ini: -------------------------------------------------------------------------------- 1 | # Here, you may enter addresses to INDRA database instances, headed by section 2 | # titles []. This name is used to refer to access the database within 3 | # the code. Note that databases may also be defined in the environment using 4 | # the format defined in `indra_db.config.DB_STR_FMT`, with a name starting with 5 | # INDRADB 6 | 7 | # Test Databases: 8 | # ---------------- 9 | # Any name with 'test' in it (ex: 'test', 'test1', 'local_test', etc.) may be 10 | # used in testing. 
Each test database will be tried in order, from top to 11 | # bottom, and the first that can successfully establish a session will be used. 12 | # 13 | # You should also make the names sortable by preference, with "earlier" names 14 | # preferred to later names. 15 | 16 | [test] 17 | role = principal 18 | dialect = postgresql 19 | driver = 20 | username = 21 | password = 22 | host = 23 | port = 24 | name = indradb_test 25 | 26 | [readonly-test] 27 | role = readonly 28 | dialect = postgresql 29 | driver = 30 | username = 31 | password = 32 | host = 33 | port = 34 | name = indradb_readonly_test 35 | 36 | # The Primary Databases: 37 | # --------------------- 38 | # When using the low-level database access classes, it is assumed that there is 39 | # a 'primary' database (eg. [primary]). 40 | 41 | 42 | 43 | # AWS S3 dump site: 44 | # ----------------- 45 | [aws-s3_dump] 46 | bucket = 47 | prefix = 48 | 49 | 50 | # AWS Lambda Config: 51 | # ------------------ 52 | [aws-lambda] 53 | role = 54 | function = 55 | 56 | 57 | # AWS RDS Config: 58 | # --------------- 59 | [aws-rds-settings] 60 | master_user = 61 | security_group = 62 | availability_zone = 63 | 64 | [general] 65 | testing = false -------------------------------------------------------------------------------- /indra_db/schemas/__init__.py: -------------------------------------------------------------------------------- 1 | from .principal_schema import * 2 | from .readonly_schema import * 3 | -------------------------------------------------------------------------------- /indra_db/schemas/indexes.py: -------------------------------------------------------------------------------- 1 | __all__ = ['BtreeIndex', 'StringIndex'] 2 | 3 | 4 | class BtreeIndex(object): 5 | def __init__(self, name, colname, opts=None, cluster=False): 6 | self.name = name 7 | self.colname = colname 8 | contents = colname 9 | if opts is not None: 10 | contents += ' ' + opts 11 | self.definition = ('btree (%s)' % contents) 12 | self.cluster = cluster 13 | 14 | 15 | class StringIndex(BtreeIndex): 16 | def __init__(self, name, colname): 17 | opts = 'COLLATE "en_US.utf8" varchar_ops ASC NULLS LAST' 18 | super().__init__(name, colname, opts) 19 | 20 | -------------------------------------------------------------------------------- /indra_db/tests/README.md: -------------------------------------------------------------------------------- 1 | # Testing the INDRA Database 2 | 3 | In `indra_db`, we use the `nosetests` framework to run tests. Tests are 4 | automatically detected in the usual ways, such as by the prefix `test_` on 5 | files and functions. 6 | 7 | ## Setting up the Test Database 8 | Most tests require access to a test database, which is, and should remain, 9 | separate from the database generally used. This repository requires a database 10 | of at least postgers version 9.6, which for most systems will require some 11 | extra work, as 9.6 is not (or at least was not for me) natively available 12 | through `apt-get`. 
13 | 14 | To get access to the latest versions of postgres, you must first execute the 15 | following (a la [this site](https://r00t4bl3.com/post/how-to-install-postgresql-9-6-on-linux-mint-18-1-serena)): 16 | ```bash 17 | sudo sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt/ xenial-pgdg main" > /etc/apt/sources.list.d/pgdg.list' 18 | wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add - 19 | sudo apt-get update 20 | ``` 21 | And optionally 22 | ```bash 23 | sudo apt-get upgrade 24 | ``` 25 | You should now see there are several versions of postgres available for 26 | installation. You should be able to install any version >= 9.6, but for the 27 | sake of simplicity, I will from here assume 9.6 is being installed. 28 | ```bash 29 | sudo apt-get install postgresql-9.6 postgresql-common 30 | ``` 31 | 32 | Also, note that this is all much more complicated if you have or have ever had 33 | a different version of postgres installed. One way to check this is to inspect 34 | the `/etc/postgresql` directory for other versions. This will indicate current 35 | active versions, but also versions that were uninstalled without `--purge`, 36 | which could still interfere with the running database. 37 | 38 | You can also run the `pg_lsclusters` command to see what clusters are currently 39 | running. You should see only one, with the correct version, running on port 40 | 5432, like so: 41 | ``` 42 | Ver Cluster Port Status Owner Data directory Log file 43 | 9.6 main 5432 online postgres /var/lib/postgresql/9.6/main /var/log/postgresql/postgresql-9.6-main.log 44 | ``` 45 | 46 | Lastly, you should check and make sure that when you run `la /var/run/postgresql/` 47 | (note the `la` for list all, not `ls`) you see the following: 48 | ``` 49 | 9.6-main.pg_stat_tmp 9.6-main.pid .s.PGSQL.5432 .s.PGSQL.5432.lock 50 | ``` 51 | If you don't see this, you may need to reboot or take other actions. 52 | 53 | Once all the above is confirmed, you will need to make access to the database 54 | more permissive. *You should **not** do this when the database could be 55 | exposed to the outside or multiple users may be using the same machine*. 56 | 57 | Edit the host-based authentication (HBA) config file: `pg_hba.conf`, which 58 | will likely require `sudo`. For me, this file is located at 59 | `/etc/postgresql/9.6/main/pg_hba.conf`. For the sake of this test setup you 60 | should go to the bottom where you see several lines of the form: 61 | ``` 62 | # TYPE DATABASE USER ADDRESS METHOD 63 | local all postgres peer 64 | ``` 65 | **Change `peer` or `md5` in the `METHOD` column to `trust`**, then save the file. 66 | For the changes to take effect, first attempt to run: 67 | ``` 68 | sudo service postgresql restart 69 | ``` 70 | and check to see if you can enter a postgres session by running 71 | `psql -U postgres`. If this fails to work, you will need to reboot your 72 | computer for the changes to take effect. 73 | 74 | Once that is done, you can create the test database that INDRA DB uses: 75 | `indradb_test` by entering the following command: 76 | ```bash 77 | sudo -u postgres createdb indradb_test 78 | ``` 79 | You should not be prompted to enter a password. If so, revisit the changes made 80 | to the `pg_hba.conf` file, and again make sure you rebooted after making the 81 | changes.
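For reference, after this edit the relevant portion of `pg_hba.conf` might look like the following minimal sketch (your file will almost certainly contain additional entries, and the exact lines vary by system):
```
# TYPE  DATABASE        USER            ADDRESS                 METHOD
local   all             postgres                                trust
local   all             all                                     trust
host    all             all             127.0.0.1/32            trust
```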
You can then test that the database works as expected by entering 82 | ```bash 83 | psql -U postgres 84 | ``` 85 | At which point you should see a prompt like this: 86 | ``` 87 | psql (10.9 (Ubuntu 10.9-1.pgdg16.04+1), server 9.6.14) 88 | Type "help" for help. 89 | 90 | postgres=# 91 | 92 | ``` 93 | Enter `\q` to exit the prompt, and you should be all set to run the tests. 94 | 95 | 96 | You should also create a test readonly database: 97 | ```bash 98 | sudo -u postgres createdb indradb_ro_test 99 | ``` 100 | 101 | ## Other Test Resources 102 | 103 | To test preassembly, you will also need a test ontology (called `test_ontology.pkl`) 104 | in a directory called `test_resources` within `indra_db/tests`. 105 | -------------------------------------------------------------------------------- /indra_db/tests/test_belief.py: -------------------------------------------------------------------------------- 1 | from nose.plugins.attrib import attr 2 | 3 | from indra.belief import BeliefEngine 4 | from indra_db.belief import MockStatement, MockEvidence, populate_support, \ 5 | load_mock_statements, calculate_belief 6 | from indra_db.tests.util import get_prepped_db 7 | 8 | 9 | def test_belief_calc_up_to_prior(): 10 | be = BeliefEngine() 11 | test_stmts = [ 12 | MockStatement(1, [MockEvidence('sparser'), MockEvidence('reach')]), 13 | MockStatement(2, MockEvidence('biopax')), 14 | MockStatement(3, MockEvidence('signor')), 15 | MockStatement(4, MockEvidence('biogrid')), 16 | MockStatement(5, MockEvidence('bel')), 17 | MockStatement(6, [MockEvidence('phosphosite'), MockEvidence('trips')]), 18 | ] 19 | be.set_prior_probs(test_stmts) 20 | results = {s.matches_key(): s.belief for s in test_stmts} 21 | print(results) 22 | assert len(results) == len(test_stmts), (len(results), len(test_stmts)) 23 | assert all([0 < b < 1 for b in results.values()]), 'Beliefs out of range.' 24 | 25 | 26 | def test_belief_calc_up_to_hierarchy(): 27 | be = BeliefEngine() 28 | test_stmts = [ 29 | MockStatement(1, [MockEvidence('sparser'), MockEvidence('reach')]), 30 | MockStatement(2, MockEvidence('biopax')), 31 | MockStatement(3, MockEvidence('signor')), 32 | MockStatement(4, MockEvidence('biogrid')), 33 | MockStatement(5, MockEvidence('bel')), 34 | MockStatement(6, [MockEvidence('phosphosite'), MockEvidence('trips')]), 35 | ] 36 | be.set_prior_probs(test_stmts) 37 | init_results = {s.matches_key(): s.belief for s in test_stmts} 38 | print(init_results) 39 | supp_links = [(1,2), (1,3), (2,3), (1,5), (4,3)] 40 | populate_support(test_stmts, supp_links) 41 | be.set_hierarchy_probs(test_stmts) 42 | results = {s.matches_key(): s.belief for s in test_stmts} 43 | print(results) 44 | 45 | # Test a couple very simple properties. 46 | assert len(results) == len(test_stmts), (len(results), len(test_stmts)) 47 | assert all([0 < b < 1 for b in results.values()]), 'Beliefs out of range.' 48 | 49 | # Test the change from the initial. 50 | all_deltas_correct = True 51 | deltas_dict = {} 52 | for s in test_stmts: 53 | h = s.matches_key() 54 | b = s.belief 55 | 56 | # Get results 57 | res = {'actual': b - init_results[h]} 58 | 59 | # Define expectations. 
60 | if s.supports: 61 | res['expected'] = 'increase' 62 | if res['actual'] <= 0: 63 | all_deltas_correct = False 64 | else: 65 | res['expected'] = 'no change' 66 | if res['actual'] != 0: 67 | all_deltas_correct = False 68 | 69 | deltas_dict[h] = res 70 | assert all_deltas_correct, deltas_dict 71 | 72 | 73 | @attr('nonpublic') 74 | def test_mock_stmt_load_and_belief_calc(): 75 | db = get_prepped_db(1000, with_pa=True) 76 | stmts = load_mock_statements(db) 77 | assert 500 <= len(stmts) <= 1000, len(stmts) 78 | assert all([len(s.evidence) >= 1 for s in stmts]) 79 | sid_list = [ev.annotations['raw_sid'] for s in stmts for ev in s.evidence] 80 | sid_set = set(sid_list) 81 | assert len(sid_list) == len(sid_set), (len(sid_list), len(sid_set)) 82 | assert len([sup for s in stmts for sup in s.supports]) \ 83 | == db.count(db.PASupportLinks), "Support is missing." 84 | belief_dict = calculate_belief(stmts) 85 | assert len(belief_dict) == len(stmts), (len(belief_dict), len(stmts)) 86 | assert all([0 < b < 1 for b in belief_dict.values()]),\ 87 | 'Belief values out of range.' 88 | -------------------------------------------------------------------------------- /indra_db/tests/test_config.py: -------------------------------------------------------------------------------- 1 | from indra_db.config import build_db_url 2 | 3 | 4 | def test_build_db_url(): 5 | """Test the build of a database URL from typical inputs.""" 6 | res_url = build_db_url(host="host", password="password", dialect="postgres", 7 | username="user", port=10, name="db") 8 | assert res_url == "postgres://user:password@host:10/db", res_url 9 | -------------------------------------------------------------------------------- /indra_db/tests/test_copy.py: -------------------------------------------------------------------------------- 1 | from indra_db.tests.util import get_temp_db 2 | 3 | 4 | COLS = ('pmid', 'pmcid') 5 | 6 | 7 | def _ref_set(db): 8 | return set(db.select_all([db.TextRef.pmid, db.TextRef.pmcid])) 9 | 10 | 11 | def _assert_set_equal(s1, s2): 12 | assert s1 == s2, '%s != %s' % (s1, s2) 13 | 14 | 15 | def test_vanilla_copy(): 16 | db = get_temp_db(True) 17 | inps = {('a', '1'), ('b', '1')} 18 | db.copy('text_ref', inps, COLS) 19 | assert inps == _ref_set(db) 20 | 21 | try: 22 | db.copy('text_ref', inps, COLS) 23 | except: 24 | return 25 | assert False, "Copy of duplicate data succeeded." 
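# A note on the copy variants exercised by the tests below: `copy` is a plain
# insert that fails on rows duplicating existing entries (as checked above);
# `copy_lazy` skips conflicting rows, while `copy_push` overwrites them; the
# `*_report_*` variants additionally return the rows that were skipped or
# updated, and `copy_detailed_report_lazy` also reports the IDs of existing
# and newly inserted rows.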
26 | 27 | 28 | def _do_init_copy(db): 29 | inps_1 = {('a', '1'), ('b', '2')} 30 | db.copy('text_ref', inps_1, COLS) 31 | _assert_set_equal(inps_1, _ref_set(db)) 32 | return inps_1 33 | 34 | 35 | def test_lazy_copy(): 36 | db = get_temp_db(True) 37 | inps_1 = _do_init_copy(db) 38 | inps_2 = {('b', '2'), ('c', '1'), ('d', '3')} 39 | db.copy_lazy('text_ref', inps_2, COLS) 40 | _assert_set_equal(inps_1 | inps_2, _ref_set(db)) 41 | 42 | 43 | def test_lazy_report_copy(): 44 | db = get_temp_db(True) 45 | inps_1 = _do_init_copy(db) 46 | inps_2 = {('b', '2'), ('c', '1'), ('d', '3')} 47 | 48 | left_out = db.copy_report_lazy('text_ref', inps_2, COLS) 49 | _assert_set_equal(inps_1 | inps_2, _ref_set(db)) 50 | _assert_set_equal(inps_1 & inps_2, {t[:2] for t in left_out}) 51 | 52 | 53 | def test_push_copy(): 54 | db = get_temp_db(True) 55 | inps_1 = _do_init_copy(db) 56 | inps_2 = {('b', '2'), ('c', '1'), ('d', '3')} 57 | 58 | original_date = db.select_one(db.TextRef.create_date, 59 | db.TextRef.pmid == 'b') 60 | 61 | db.copy_push('text_ref', inps_2, COLS) 62 | _assert_set_equal(inps_1 | inps_2, _ref_set(db)) 63 | new_date = db.select_one(db.TextRef.create_date, 64 | db.TextRef.pmid == 'b') 65 | assert new_date != original_date, "PMID b was not updated." 66 | 67 | 68 | def test_push_report_copy(): 69 | db = get_temp_db(True) 70 | inps_1 = _do_init_copy(db) 71 | inps_2 = {('b', '2'), ('c', '1'), ('d', '3')} 72 | 73 | original_date = db.select_one(db.TextRef.create_date, 74 | db.TextRef.pmid == 'b') 75 | 76 | updated = db.copy_report_push('text_ref', inps_2, COLS) 77 | _assert_set_equal(inps_1 | inps_2, _ref_set(db)) 78 | _assert_set_equal(inps_1 & inps_2, {t[:2] for t in updated}) 79 | new_date = db.select_one(db.TextRef.create_date, 80 | db.TextRef.pmid == 'b') 81 | assert new_date != original_date, 'PMID b was not updated.' 
82 | 83 | 84 | def test_detailed_copy_report(): 85 | db = get_temp_db(True) 86 | inps_1 = _do_init_copy(db) 87 | inps_2 = {('b', '2'), ('c', '1'), ('d', '3')} 88 | 89 | exiting_ids = {trid for trid, in db.select_all(db.TextRef.id)} 90 | 91 | existing_ids, new_ids, skipped_rows = \ 92 | db.copy_detailed_report_lazy('text_ref', inps_2, COLS) 93 | _assert_set_equal(inps_1 | inps_2, _ref_set(db)) 94 | _assert_set_equal(inps_1 & inps_2, {t[:2] for t in skipped_rows}) 95 | assert {trid for trid, in new_ids} != exiting_ids 96 | 97 | 98 | def test_detailed_copy_report_pmid_and_id(): 99 | db = get_temp_db(True) 100 | inps_1 = _do_init_copy(db) 101 | inps_2 = {('b', '2'), ('c', '1'), ('d', '3')} 102 | 103 | existing_id_dict = {pmid: trid for trid, pmid 104 | in db.select_all([db.TextRef.id, db.TextRef.pmid])} 105 | 106 | existing_ids, new_ids, skipped_rows = \ 107 | db.copy_detailed_report_lazy('text_ref', inps_2, COLS, 108 | ('pmid', 'pmcid', 'id')) 109 | new_id_dict = {pmid: trid for pmid, trid in new_ids} 110 | returned_existing_id_dict = {pmid: trid for pmid, _, trid, in existing_ids} 111 | assert returned_existing_id_dict == {'b': 1} 112 | _assert_set_equal(inps_1 | inps_2, _ref_set(db)) 113 | _assert_set_equal(inps_1 & inps_2, {t[:2] for t in skipped_rows}) 114 | assert set(existing_id_dict.keys()) != set(new_id_dict.keys()) 115 | 116 | 117 | def test_detailed_copy_report_repeated_pmid_no_conflict(): 118 | db = get_temp_db(True) 119 | 120 | inps_1 = {('1', 'PMC1', '10.1/a'), ('2', 'PMC2', '10.2/b')} 121 | inps_2 = {('1', 'PMC3', '10.3/c')} 122 | 123 | cols = ('pmid', 'pmcid', 'doi') 124 | db.copy('text_ref', inps_1, cols) 125 | 126 | existing_ids, new_ids, skipped_rows = \ 127 | db.copy_detailed_report_lazy('text_ref', inps_2, cols, ('pmid', 'id')) 128 | assert not existing_ids 129 | assert not skipped_rows 130 | assert len(new_ids) == 1 131 | 132 | 133 | def test_detailed_copy_report_repeated_pmid_with_conflict(): 134 | db = get_temp_db(True) 135 | 136 | inps_1 = {('1', 'PMC1', '10.1/a'), ('2', 'PMC2', '10.2/b')} 137 | inps_2 = {('1', 'PMC3', '10.1/a')} 138 | 139 | cols = ('pmid', 'pmcid', 'doi') 140 | db.copy('text_ref', inps_1, cols) 141 | 142 | existing_ids, new_ids, skipped_rows = \ 143 | db.copy_detailed_report_lazy('text_ref', inps_2, cols, ('pmid', 'id')) 144 | assert existing_ids == [('1', 1)] 145 | assert len(skipped_rows) == 1 146 | assert not new_ids 147 | -------------------------------------------------------------------------------- /indra_db/tests/test_kbs.py: -------------------------------------------------------------------------------- 1 | from nose.plugins.attrib import attr 2 | 3 | from indra.statements.statements import Agent, Phosphorylation, Complex, \ 4 | Evidence 5 | 6 | from indra_db.managers.knowledgebase_manager import * 7 | from indra_db.util import insert_db_stmts 8 | from indra_db.tests.util import get_temp_db 9 | 10 | 11 | def _check_kbm(Kb, *args, **kwargs): 12 | db = get_temp_db(clear=True) 13 | dbid = db.select_one(db.DBInfo.id, db.DBInfo.db_name == Kb.name) 14 | assert dbid is None 15 | kbm = Kb(*args, **kwargs) 16 | kbm.upload(db) 17 | dbid = db.select_one(db.DBInfo.id, db.DBInfo.db_name == Kb.name)[0] 18 | assert dbid is not None 19 | db_stmts = db.select_all(db.RawStatements) 20 | print(len(db_stmts)) 21 | assert len(db_stmts) 22 | assert all(s.db_info_id == dbid for s in db_stmts) 23 | db.session.close() 24 | 25 | 26 | @attr("nonpublic") 27 | def test_tas(): 28 | _check_kbm(TasManager) 29 | 30 | 31 | @attr('nonpublic') 32 | def test_cbn(): 33 | s3_url 
= 'https://s3.amazonaws.com/bigmech/travis/Hox-2.0-Hs.jgf.zip' 34 | _check_kbm(CBNManager, archive_url=s3_url) 35 | 36 | 37 | @attr('nonpublic', 'slow') 38 | def test_hprd(): 39 | _check_kbm(HPRDManager) 40 | 41 | 42 | @attr('nonpublic') 43 | def test_signor(): 44 | _check_kbm(SignorManager) 45 | 46 | 47 | @attr('nonpublic', 'slow') 48 | def test_biogrid(): 49 | _check_kbm(BiogridManager) 50 | 51 | 52 | @attr('nonpublic', 'slow') 53 | def test_bel_lc(): 54 | _check_kbm(BelLcManager) 55 | 56 | 57 | @attr('nonpublic', 'slow') 58 | def test_pathway_commons(): 59 | _check_kbm(PathwayCommonsManager) 60 | 61 | 62 | @attr('nonpublic', 'slow') 63 | def test_rlimsp(): 64 | _check_kbm(RlimspManager) 65 | 66 | 67 | @attr('nonpublic') 68 | def test_trrust(): 69 | _check_kbm(TrrustManager) 70 | 71 | 72 | @attr('nonpublic', 'slow') 73 | def test_phosphosite(): 74 | _check_kbm(PhosphositeManager) 75 | 76 | 77 | @attr('nonpublic') 78 | def test_simple_db_insert(): 79 | db = get_temp_db() 80 | db._clear(force=True) 81 | stmts = [Phosphorylation(Agent('MEK', db_refs={'FPLX': 'MEK'}), 82 | Agent('ERK', db_refs={'FPLX': 'ERK'}), 83 | evidence=Evidence(source_api='test')), 84 | Complex([Agent(n, db_refs={'FPLX': n}) for n in ('MEK', 'ERK')], 85 | evidence=Evidence(source_api='test'))] 86 | dbid = db.insert(db.DBInfo, db_name='test', source_api='tester') 87 | insert_db_stmts(db, stmts, dbid) 88 | db_stmts = db.select_all(db.RawStatements) 89 | db_agents = db.select_all(db.RawAgents) 90 | assert len(db_stmts) == 2, len(db_stmts) 91 | assert len(db_agents) == 8, len(db_agents) 92 | db.session.close() 93 | -------------------------------------------------------------------------------- /indra_db/tests/test_principal_client.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from indra_db.client.principal import * 4 | from indra.statements import Agent, Phosphorylation, Complex, Activation 5 | 6 | from indra_db.tests.util import get_temp_db 7 | from indra_db.tests.db_building_util import DbBuilder 8 | 9 | 10 | def _construct_database(): 11 | db = get_temp_db(clear=True) 12 | db_builder = DbBuilder(db) 13 | db_builder.add_text_refs([ 14 | ('12345', 'PMC54321'), 15 | ('24680', 'PMC08642') 16 | ]) 17 | db_builder.add_text_content([ 18 | ['pubmed-abs', 'pmc_oa'], 19 | ['pubmed-abs'] 20 | ]) 21 | db_builder.add_readings([ 22 | ['REACH'], 23 | ['REACH'], 24 | ['REACH', 'SPARSER'] 25 | ]) 26 | 27 | mek = Agent('MEK', db_refs={'FPLX': 'MEK'}) 28 | erk = Agent('ERK', db_refs={'FPLX': 'ERK'}) 29 | raf = Agent('RAF', db_refs={'FPLX': 'RAF'}) 30 | 31 | db_builder.add_raw_reading_statements([ 32 | [Phosphorylation(mek, erk), Complex([mek, erk])], 33 | [Phosphorylation(mek, erk)], 34 | [Activation(mek, erk)], 35 | [Complex([mek, erk]), Complex([raf, erk])] 36 | ]) 37 | 38 | db_builder.add_databases(['signor']) 39 | db_builder.add_raw_database_statements([ 40 | [Complex([raf, erk])] 41 | ]) 42 | db_builder.add_pa_statements([ 43 | (Phosphorylation(mek, erk), [0, 2]), 44 | (Complex([mek, erk]), [1, 4]), 45 | (Activation(mek, erk), [3]), 46 | (Complex([raf, erk]), [5, 6]) 47 | ]) 48 | return db 49 | 50 | 51 | def test_get_raw_statements_all(): 52 | db = _construct_database() 53 | res = get_raw_stmt_jsons(db=db) 54 | assert len(res) == 7, len(res) 55 | 56 | 57 | def test_raw_statement_retrieval_from_agents_type_only(): 58 | db = _construct_database() 59 | res = get_raw_stmt_jsons_from_agents(stmt_type='Complex', db=db) 60 | assert len(res) > 0 61 | assert len(res) < 7 62 | 
assert all(sj['type'] == 'Complex' for sj in res.values()) 63 | 64 | 65 | def test_raw_statement_retrieval_from_agents_mek(): 66 | db = _construct_database() 67 | res = get_raw_stmt_jsons_from_agents(agents=[(None, 'MEK', 'FPLX')], db=db) 68 | assert len(res) > 0 69 | assert len(res) < 7 70 | assert all('MEK' in json.dumps(sj) for sj in res.values()) 71 | 72 | 73 | def test_raw_statement_retrieval_generic(): 74 | db = _construct_database() 75 | res = get_raw_stmt_jsons([db.Reading.reader == 'REACH', 76 | db.Reading.id == db.RawStatements.reading_id], 77 | db=db) 78 | assert len(res) > 0 79 | assert len(res) < 7 80 | assert all(sj['evidence'][0]['source_api'] == 'reach' 81 | for sj in res.values()) 82 | 83 | 84 | def test_raw_statements_get_database_only(): 85 | db = _construct_database() 86 | res = get_raw_stmt_jsons([db.RawStatements.reading_id.is_(None)], db=db) 87 | assert len(res) == 1, len(res) 88 | assert all(sj['evidence'][0]['source_api'] == 'signor' 89 | for sj in res.values()) 90 | 91 | 92 | def test_pa_statement_retrieval_generic(): 93 | db = _construct_database() 94 | res = get_pa_stmt_jsons(db=db) 95 | assert len(res) == 4 96 | 97 | 98 | def test_pa_statement_retrieval_by_type(): 99 | db = _construct_database() 100 | res = get_pa_stmt_jsons([db.PAStatements.type == 'Complex'], db=db) 101 | assert len(res) == 2 102 | assert all(j['stmt']['type'] == 'Complex' for j in res.values()) 103 | -------------------------------------------------------------------------------- /indra_db/tests/test_readonly_pipeline.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import gzip 3 | import json 4 | import pickle 5 | from pathlib import Path 6 | from collections import Counter 7 | 8 | import networkx as nx 9 | 10 | from indra.belief import BeliefEngine 11 | from indra.statements import Agent, Evidence, Activation 12 | from indra_db import get_db 13 | from indra_db.readonly_dumping.export_assembly import calculate_belief 14 | 15 | 16 | def test_unit_belief_calc(): 17 | activation = Activation( 18 | Agent("A"), 19 | Agent("B"), 20 | evidence=[Evidence(source_api="reach") for _ in range(3)], 21 | ) 22 | 23 | # Test that the belief score is calculated correctly 24 | assert activation.belief == 1 25 | 26 | # Set up default Belief Engine 27 | belief_engine = BeliefEngine() 28 | 29 | belief_engine.set_prior_probs([activation]) 30 | 31 | assert activation.belief != 1 32 | assert activation.belief == 0.923 33 | 34 | 35 | def test_calculate_belief(): 36 | activation1 = Activation( 37 | Agent("A", location="nucleus"), 38 | Agent("B", location="cytoplasm"), 39 | evidence=[ 40 | Evidence( 41 | source_api="reach", 42 | text="A activates B in vitro in a dose-dependent manner.") 43 | ], 44 | ) 45 | hash1 = activation1.get_hash() 46 | activation2 = Activation( 47 | Agent("A", location="nucleus"), 48 | Agent("B"), 49 | evidence=[ 50 | Evidence(source_api="reach", text="A activates B in vitro.") 51 | ], 52 | ) 53 | hash2 = activation2.get_hash() 54 | activation3 = Activation( 55 | Agent("A"), 56 | Agent("B"), 57 | evidence=[Evidence(source_api="reach", text="A activates B.")], 58 | ) 59 | hash3 = activation3.get_hash() 60 | 61 | # Sanity check 62 | assert hash1 != hash2 != hash3 63 | 64 | stmt_list = [(hash1, activation1), (hash2, activation2), (hash3, activation3)] 65 | 66 | # Dump the statements to a file 67 | test_statements_tsv_gz = Path(__file__).parent / "test_statements.tsv.gz" 68 | with gzip.open(test_statements_tsv_gz, "wt") as f: 69 | csv_writer = 
csv.writer(f, delimiter="\t") 70 | csv_writer.writerows( 71 | (sh, json.dumps(st.to_json())) for sh, st in stmt_list 72 | ) 73 | 74 | source_counts = { 75 | hash1: {"reach": 1}, 76 | hash2: {"reach": 1}, 77 | hash3: {"reach": 1}, 78 | } 79 | test_source_counts_pkl = Path(__file__).parent / "test_source_counts.pkl" 80 | with open(test_source_counts_pkl, "wb") as f: 81 | pickle.dump(source_counts, f) 82 | 83 | # Create support: activation1 -> activation2 -> activation3 in a 84 | # refinement graph 85 | refinements = {(hash1, hash2), (hash2, hash3)} 86 | refinement_graph = nx.DiGraph() 87 | refinement_graph.add_edges_from(refinements) 88 | assert nx.ancestors(refinement_graph, hash1) == set() 89 | assert nx.ancestors(refinement_graph, hash2) == {hash1} 90 | assert nx.ancestors(refinement_graph, hash3) == {hash1, hash2} 91 | 92 | # Run the belief calculation function 93 | db = get_db("primary") 94 | res = db.select_all(db.DBInfo) 95 | db_name_api_mapping = {r.db_name: r.source_api for r in res} 96 | test_belief_path = Path(__file__).parent / "test_belief_path.pkl" 97 | calculate_belief( 98 | refinements_graph=refinement_graph, 99 | num_batches=1, 100 | batch_size=len(stmt_list), 101 | source_mapping=db_name_api_mapping, 102 | unique_stmts_path=test_statements_tsv_gz, 103 | belief_scores_pkl_path=test_belief_path, 104 | source_counts_path=test_source_counts_pkl, 105 | ) 106 | 107 | # Calculate the belief scores: Add evidence of supporting statements to the 108 | # evidence of the supported statement then calculate the prior belief 109 | belief_engine = BeliefEngine(refinements_graph=refinement_graph) 110 | to_calc_list = [] 111 | local_beliefs = {} 112 | for st_hash, stmt in stmt_list: 113 | # Sum belief score of ancestors 114 | summed_src_count = Counter(source_counts[st_hash]) 115 | 116 | if st_hash in refinement_graph.nodes: 117 | for anc_hash in nx.ancestors(refinement_graph, st_hash): 118 | summed_src_count += Counter(source_counts[anc_hash]) 119 | 120 | ev_list_this_stmt = [] 121 | for source, count in summed_src_count.items(): 122 | for _ in range(count): 123 | ev_list_this_stmt.append(Evidence(source_api=source)) 124 | 125 | stmt.evidence = ev_list_this_stmt 126 | to_calc_list.append((st_hash, stmt)) 127 | 128 | hashes, stmts = zip(*to_calc_list) 129 | belief_engine.set_prior_probs(stmts) 130 | for st_hash2, stmt2 in zip(hashes, stmts): 131 | local_beliefs[st_hash2] = stmt2.belief 132 | 133 | # Load the belief scores 134 | with open(test_belief_path, "rb") as f: 135 | belief_dict = pickle.load(f) 136 | 137 | # Check that the belief scores are correct 138 | assert all( 139 | local_beliefs[st_hash] == belief_dict[st_hash] 140 | for st_hash in belief_dict 141 | ) 142 | 143 | assert len(stmts[2].evidence) == 3 144 | assert all(ev.source_api == 'reach' for ev in stmts[2].evidence) 145 | assert belief_dict[hash3] == 0.923 146 | -------------------------------------------------------------------------------- /indra_db/tests/test_setup.py: -------------------------------------------------------------------------------- 1 | from indra_db.tests.util import get_temp_db 2 | 3 | 4 | def test_db_presence(): 5 | db = get_temp_db(clear=True) 6 | db.insert(db.TextRef, pmid='12345') 7 | -------------------------------------------------------------------------------- /indra_db/tests/test_sif_dumper.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from os import path, remove 4 | import pandas as pd 5 | 6 | import 
indra_db.tests.util as tu 7 | from indra_db.util.dump_sif import load_db_content, get_source_counts, \ 8 | make_dataframe, NS_LIST, normalize_sif_names 9 | 10 | 11 | class SifDumperTester(unittest.TestCase): 12 | def get_db(self, count=1000): 13 | # Get db 14 | return tu.get_filled_ro(count) 15 | 16 | def setUp(self): 17 | self.db = self.get_db() 18 | self.db_content = load_db_content(True, NS_LIST, None, self.db) 19 | self.df = make_dataframe(True, self.db_content, None) 20 | 21 | # Tests 22 | def test_get_content(self): 23 | """Checks content loading and its structure""" 24 | 25 | # Get first item 26 | r = list(self.db_content)[0] 27 | assert isinstance(r, tuple) 28 | assert len(r) == 6 29 | assert isinstance(r[0], int) # mk_hash 30 | assert isinstance(r[1], str) # db_name 31 | assert r[1] in NS_LIST 32 | assert isinstance(r[2], str) # db_id 33 | assert isinstance(r[3], int) # ag_num 34 | assert r[3] > -1 35 | assert isinstance(r[4], int) # ev_count 36 | assert r[4] > 0 37 | assert isinstance(r[5], str) # type 38 | 39 | def test_dataframe(self): 40 | """Checks a dataframe produced by make_dataframe""" 41 | 42 | # Check column names 43 | assert {'agA_id', 'agA_name', 'agA_ns', 'agB_id', 'agB_name', 'agB_ns', 44 | 'evidence_count', 'stmt_hash', 'stmt_type'} == set( 45 | self.df.columns) 46 | 47 | # Check for None's 48 | assert sum(self.df['agA_name'] == None) == 0 49 | assert sum(self.df['agB_name'] == None) == 0 50 | 51 | # Check df types 52 | assert isinstance(self.df.head(1)['agA_ns'][0], str) 53 | assert isinstance(self.df.head(1)['agB_ns'][0], str) 54 | assert isinstance(self.df.head(1)['agA_id'][0], str) 55 | assert isinstance(self.df.head(1)['agB_id'][0], str) 56 | assert isinstance(self.df.head(1)['agA_name'][0], str) 57 | assert isinstance(self.df.head(1)['agB_name'][0], str) 58 | assert isinstance(self.df.head(1)['stmt_type'][0], str) 59 | assert isinstance(self.df.head(1)['evidence_count'][0], np.int64) 60 | assert isinstance(self.df.head(1)['stmt_hash'][0], np.int64) 61 | 62 | # Check that we don't have significant keyerrors from creating the df 63 | key_error_file = path.join(path.dirname(__file__), 'key_errors.csv') 64 | if path.exists(key_error_file): 65 | key_errors = pd.read_csv(key_error_file, sep=',', 66 | names=['stmt_hash', 'ag_num'], header=None) 67 | remove(key_error_file) 68 | missing_hashes = set(key_errors['stmt_hash'].values) 69 | df_hashes = set(self.df['stmt_hash'].values) 70 | 71 | assert len(missing_hashes.intersection(df_hashes)) / \ 72 | len(df_hashes) < 0.5 73 | 74 | def test_stratified_evidence(self): 75 | """Check the stratified evidence dumper""" 76 | 77 | ev_dict = get_source_counts(ro=self.db) 78 | 79 | # Check if nested dict 80 | for k in ev_dict: 81 | assert isinstance(ev_dict[k], dict) 82 | break 83 | 84 | # Check that some keys exist in the df 85 | df_hashes = set(self.df['stmt_hash'].values) 86 | sd_hashes = set(ev_dict.keys()) 87 | assert len(sd_hashes.intersection(df_hashes)) / len(sd_hashes) > 0.25 88 | 89 | 90 | def test_normalize_names(): 91 | sif_dict = { 92 | 'agA_ns': ['HGNC', 'HGNC'], 93 | 'agA_id': ['26128', '26128'], 94 | 'agA_name': ['SPRING1', 'C12orf49'], 95 | 'agB_ns': ['HGNC', 'HGNC'], 96 | 'agB_id': ['11892', '3236'], 97 | 'agB_name': ['TNF', 'EGFR'], 98 | 'stmt_type': ['Activation', 'Phosphorylation'], 99 | 'evidence_count': [10, 12], 100 | 'stmt_hash': [1234567890, -9876543210], 101 | 'residue': [None, None], 102 | 'position': [None, None], 103 | 'source_counts': [{'sparser': 6, 'reach': 4}, {'pc': 6, 'sparser': 6}], 104 | 
'belief': [0.998, 0.9999] 105 | } 106 | 107 | sif_df = pd.DataFrame(sif_dict) 108 | normalize_sif_names(sif_df) 109 | # Both names should now be SPRING1 110 | assert set(sif_df.agA_name.values) == {'SPRING1'} 111 | -------------------------------------------------------------------------------- /indra_db/tests/test_xdd_manager.py: -------------------------------------------------------------------------------- 1 | import json 2 | import boto3 3 | import random 4 | 5 | from indra_db.tests.util import get_temp_db 6 | from indra_db.managers.xdd_manager import XddManager 7 | 8 | 9 | def test_dump(): 10 | db = get_temp_db(clear=True) 11 | m = XddManager() 12 | 13 | # Enter "old" DOIs 14 | s3 = boto3.client('s3') 15 | res = s3.list_objects_v2(**m.bucket.kw()) 16 | dois = set() 17 | for ref in res['Contents']: 18 | key = ref['Key'] 19 | if 'bib' not in key: 20 | continue 21 | try: 22 | obj = s3.get_object(Key=key, **m.bucket.kw()) 23 | except Exception: 24 | print('ack') 25 | continue 26 | bibs = json.loads(obj['Body'].read()) 27 | dois |= {bib['identifier'][0]['id'] for bib in bibs 28 | if 'identifier' in bib} 29 | sample_dois = random.sample(dois, len(dois)//2) 30 | new_trs = [db.TextRef.new(doi=doi) for doi in sample_dois] 31 | print(f"Adding {len(new_trs)} 'old' text refs.") 32 | db.session.add_all(new_trs) 33 | db.session.commit() 34 | 35 | # Run the update. 36 | m.run(db) 37 | 38 | # Check the result. 39 | assert db.select_all(db.TextRef) 40 | assert db.select_all(db.TextContent) 41 | assert db.select_all(db.Reading) 42 | assert db.select_all(db.RawStatements) 43 | assert db.select_all(db.RawAgents) 44 | -------------------------------------------------------------------------------- /indra_db/util/__init__.py: -------------------------------------------------------------------------------- 1 | """This file contains low level functions used by other indra_db tools/services. 2 | 3 | Some key functions' capabilities include: 4 | - getting access to/constructing DatabaseManager instances. 5 | - inserting statements, which are stored in multiple tables, into the database. 6 | - distilling and deleting statements 7 | """ 8 | 9 | __all__ = ['get_primary_db', 'get_db', 'insert_raw_agents', 'insert_pa_stmts', 10 | 'insert_pa_agents', 'insert_db_stmts', 'get_raw_stmts_frm_db_list', 11 | 'distill_stmts', 'regularize_agent_id', 'get_statement_object', 12 | 'extract_agent_data', 'get_ro', 'S3Path', 'hash_pa_agents'] 13 | 14 | from .insert import * 15 | from .s3_path import * 16 | from .helpers import * 17 | from .constructors import * 18 | from .content_scripts import * 19 | from .distill_statements import * 20 | -------------------------------------------------------------------------------- /indra_db/util/aws.py: -------------------------------------------------------------------------------- 1 | import re 2 | import boto3 3 | 4 | 5 | def uncamel(word): 6 | return re.sub(r'([a-z])([A-Z])', r'\g<1>_\g<2>', word).lower() 7 | 8 | 9 | def get_role_kwargs(role): 10 | sts = boto3.client('sts') 11 | 12 | # Check the current role 13 | kwargs = {} 14 | ident = sts.get_caller_identity() 15 | if role and not ident['Arn'].endswith(role): 16 | # If the role is not the default, assume that role. 
17 | new_role_arn = "arn:aws:iam::%s:role/%s" % (ident['Account'], role) 18 | res = sts.assume_role(RoleArn=new_role_arn, 19 | RoleSessionName="AssumeRoleReadonlyDBUpdate") 20 | kwargs = {'aws_' + uncamel(k): v for k, v in res['Credentials'].items() 21 | if 'expiration' not in k.lower()} 22 | 23 | return kwargs, ident 24 | -------------------------------------------------------------------------------- /indra_db/util/build_corpus.py: -------------------------------------------------------------------------------- 1 | """Dump test corpora of content covering all REACH rules 2 | 3 | This script is designed to select content from the database based on the REACH 4 | rules that have been triggered within that content. Three slightly different 5 | methods are used, and three corpora are produced, each as a directory. 6 | """ 7 | 8 | import os 9 | import json 10 | from indra_db.util import unpack 11 | from indra_db.util import get_ro, get_db 12 | 13 | db = get_db('primary') 14 | 15 | rs = db.select_all(db.RawStatements, db.Reading.reader == 'REACH', 16 | db.RawStatements.reading_id == db.Reading.id, yield_per=10000) 17 | found_by = {} 18 | for r in rs: 19 | found_by[r.id] = json.loads(r.json)['evidence'][0]['annotations']['found_by'] 20 | 21 | fb_set = set(found_by.values()) 22 | print(f"Found {len(fb_set)} distinct found-by rules.") 23 | 24 | fb_counts = {} 25 | for sid, word in found_by.items(): 26 | fb_counts[word] = fb_counts.get(word, 0) + 1 27 | 28 | fb_sids = {} 29 | for sid, word in found_by.items(): 30 | if word not in fb_sids: 31 | fb_sids[word] = [] 32 | fb_sids[word].append(sid) 33 | 34 | tc_data = db.select_all([db.TextContent.id, db.TextContent.source, db.TextContent.text_type, db.RawStatements.id], 35 | db.Reading.reader == 'REACH', *db.link(db.TextContent, db.RawStatements)) 36 | tc_lookup = {sid: (tcid, src, tt) for tcid, src, tt, sid in tc_data} 37 | 38 | fb_tc_dict = {} 39 | tc_fb_dict = {} 40 | for fb, sids in sorted(fb_sids.items(), key=lambda t: len(t[1])): 41 | print(fb, len(sids)) 42 | this_dict = {} 43 | for sid in sids: 44 | tcid, src, tt = tc_lookup[sid] 45 | 46 | # Add fb to lookup by tcid 47 | if tcid not in tc_fb_dict: 48 | tc_fb_dict[tcid] = set() 49 | tc_fb_dict[tcid].add(fb) 50 | 51 | # Add tcid sid data to list of content with this fb.
52 | key = (src, tt) 53 | if key not in this_dict: 54 | this_dict[key] = [] 55 | this_dict[key].append({'tcid': tcid, 'sid': sid}) 56 | fb_tc_dict[fb] = this_dict 57 | 58 | 59 | num_with = 0 60 | for fb, cont_meta in fb_tc_dict.items(): 61 | if ('pubmed', 'abstract') not in cont_meta and ('pubmed', 'title') not in cont_meta: 62 | print(f"{fb:70} {fb_counts[fb]} {cont_meta.keys()}") 63 | else: 64 | num_with += 1 65 | 66 | ranking = [('pubmed', 'abstract'), ('pmc_oa', 'fulltext'), ('manuscripts', 'fulltext'), ('pubmed', 'title')] 67 | 68 | 69 | def dump_tcs(tcids, dirname): 70 | tcs = db.select_all([db.TextRef.id, db.TextRef.pmid, db.TextRef.pmcid, db.TextContent.id, 71 | db.TextContent.source, db.TextContent.text_type, db.TextContent.content], 72 | db.TextContent.id.in_(tcids), *db.link(db.TextRef, db.TextContent)) 73 | tt_counts = {} 74 | for row in tcs: 75 | tt = row[-1] 76 | tt_counts[tt] = tt_counts.get(tt, 0) + 1 77 | 78 | print(dirname, tt_counts) 79 | 80 | if not os.path.exists(dirname): 81 | os.mkdir(dirname) 82 | else: 83 | raise ValueError(f"Directory {dirname} already exists.") 84 | 85 | metadata = {} 86 | for trid, pmid, pmcid, tcid, src, tt, cont_bytes in tcs: 87 | metadata[tcid] = {'trid': trid, 'pmid': pmid, 'tcid': tcid, 'pmcid': pmcid, 'source': src, 'text_type': tt} 88 | if src == 'pubmed': 89 | fmt = 'txt' 90 | else: 91 | fmt = 'nxml' 92 | with open(f'{dirname}/{tcid}.{fmt}', 'w') as f: 93 | f.write(unpack(cont_bytes)) 94 | with open(f'{dirname}/metadata.json', 'w') as f: 95 | json.dump(metadata, f, indent=2) 96 | 97 | 98 | # Select strictly the content with the most rules represented. No preference 99 | # based on type. 100 | corpus_ids = [] 101 | rep_fbs = set() 102 | for fb, cont_meta in sorted(fb_tc_dict.items(), key=lambda t: fb_counts[t[0]]): 103 | print("--------------------------------------------") 104 | print("Examining rule:", fb, fb_counts[fb]) 105 | if fb in rep_fbs: 106 | print("Already represented...skipping") 107 | continue 108 | 109 | best_ref = None 110 | for text_cat, text_list in cont_meta.items(): 111 | print(text_cat, len(text_list)) 112 | counted_refs = [(len(tc_fb_dict[d['tcid']] - rep_fbs), d['tcid']) for d in text_list] 113 | print(f"best ref for {text_cat}:", max(counted_refs)) 114 | if best_ref is None: 115 | best_ref = max(counted_refs) 116 | else: 117 | this_ref = max(counted_refs) 118 | if this_ref > best_ref: 119 | best_ref = this_ref 120 | print(f"Overall best ref for {fb}:", best_ref) 121 | corpus_ids.append(best_ref[1]) 122 | rep_fbs |= tc_fb_dict[best_ref[1]] 123 | print(len(rep_fbs)) 124 | if len(rep_fbs) == len(fb_counts): 125 | print("DONE!") 126 | break 127 | dump_tcs(corpus_ids, 'corpus_1') 128 | 129 | 130 | # Select the content with most rules, with the preference for abstract as a tie-breaker. 
131 | corpus_ids_2 = [] 132 | rep_fbs = set() 133 | for fb, cont_meta in sorted(fb_tc_dict.items(), key=lambda t: fb_counts[t[0]]): 134 | print("--------------------------------------------") 135 | print("Examining rule:", fb, fb_counts[fb]) 136 | if fb in rep_fbs: 137 | print("Already represented...skipping") 138 | continue 139 | 140 | all_counted_refs = [] 141 | for text_cat, text_list in cont_meta.items(): 142 | print(text_cat, len(text_list)) 143 | all_counted_refs += [(len(tc_fb_dict[d['tcid']] - rep_fbs), -ranking.index(text_cat), d['tcid']) for d in text_list] 144 | best_ref = max(all_counted_refs) 145 | print(f"Overall best ref for {fb}:", best_ref) 146 | corpus_ids_2.append(best_ref[-1]) 147 | rep_fbs |= tc_fb_dict[best_ref[-1]] 148 | print(len(rep_fbs)) 149 | if len(rep_fbs) == len(fb_counts): 150 | print("DONE!") 151 | break 152 | dump_tcs(corpus_ids_2, 'corpus_2') 153 | 154 | 155 | # Select abstracts whenever possible, fulltext only when necessary. 156 | corpus_ids_3 = [] 157 | rep_fbs = set() 158 | for fb, cont_meta in sorted(fb_tc_dict.items(), key=lambda t: fb_counts[t[0]]): 159 | print("--------------------------------------------") 160 | print("Examining rule:", fb, fb_counts[fb]) 161 | if fb in rep_fbs: 162 | print("Already represented...skipping") 163 | continue 164 | 165 | all_counted_refs = [] 166 | for text_cat, text_list in cont_meta.items(): 167 | print(text_cat, len(text_list)) 168 | all_counted_refs += [(-ranking.index(text_cat), len(tc_fb_dict[d['tcid']] - rep_fbs), d['tcid']) for d in text_list] 169 | best_ref = max(all_counted_refs) 170 | print(f"Overall best ref for {fb}:", best_ref) 171 | corpus_ids_3.append(best_ref[-1]) 172 | rep_fbs |= tc_fb_dict[best_ref[-1]] 173 | print(len(rep_fbs)) 174 | if len(rep_fbs) == len(fb_counts): 175 | print("DONE!") 176 | break 177 | dump_tcs(corpus_ids_3, 'corpus_3') 178 | 179 | -------------------------------------------------------------------------------- /indra_db/util/constructors.py: -------------------------------------------------------------------------------- 1 | __all__ = ['get_primary_db', 'get_db', 'get_ro', 'get_ro_host'] 2 | 3 | import logging 4 | 5 | from indra_db.databases import PrincipalDatabaseManager, \ 6 | ReadonlyDatabaseManager 7 | from indra_db.exceptions import IndraDbException 8 | from indra_db.config import get_databases, get_readonly_databases, nope_in_test 9 | 10 | logger = logging.getLogger('util-constructors') 11 | 12 | 13 | __PRIMARY_DB = None 14 | 15 | 16 | @nope_in_test 17 | def get_primary_db(force_new=False): 18 | """Get a DatabaseManager instance for the primary database host. 19 | 20 | The primary database host is defined in the defaults.txt file, or in a file 21 | given by the environment variable DEFAULTS_FILE. Alternatively, it may be 22 | defined by the INDRADBPRIMARY environment variable. If none of the above 23 | are specified, this function will raise an exception. 24 | 25 | Note: by default, calling this function twice will return the same 26 | `DatabaseManager` instance. In other words:: 27 | 28 | db1 = get_primary_db() 29 | db2 = get_primary_db() 30 | db1 is db2 31 | 32 | This means also that, for example `db1.select_one(db2.TextRef)` will work, 33 | in the above context. 34 | 35 | It is still recommended that when creating a script or function, or other 36 | general application, you should not rely on this feature to get your access 37 | to the database, as it can make substituting a different database host both 38 | complicated and messy. 
Rather, a database instance should be explicitly 39 | passed between different users as is done in the `get_statements_by_gene_role_type` 40 | function's call to `get_statements` in `indra.db.query_db_stmts`. 41 | 42 | Parameters 43 | ---------- 44 | force_new : bool 45 | If true, a new instance will be created and returned, regardless of 46 | whether there is an existing instance or not. Default is False, so that 47 | if this function has been called before within the global scope, the 48 | instance that was first created will be returned. 49 | 50 | Returns 51 | ------- 52 | primary_db : :py:class:`DatabaseManager` 53 | An instance of the database manager that is attached to the primary 54 | database. 55 | """ 56 | logger.warning("DEPRECATION WARNING: This function is being deprecated.") 57 | defaults = get_databases() 58 | if 'primary' in defaults.keys(): 59 | primary_host = defaults['primary'] 60 | else: 61 | raise IndraDbException("No primary host available in defaults file.") 62 | 63 | global __PRIMARY_DB 64 | if __PRIMARY_DB is None or force_new: 65 | __PRIMARY_DB = PrincipalDatabaseManager(primary_host, label='primary') 66 | __PRIMARY_DB.grab_session() 67 | return __PRIMARY_DB 68 | 69 | 70 | @nope_in_test 71 | def get_db(db_label, protected=False): 72 | """Get a db instance based on its name in the config or env. 73 | 74 | If the label does not exist or the database labeled can't be reached, None 75 | is returned. 76 | """ 77 | # Instantiate a database handle 78 | defaults = get_databases() 79 | if db_label not in defaults: 80 | logger.error(f"No such database available: {db_label}. Check config " 81 | f"file or environment variables.") 82 | return 83 | db_url = defaults[db_label] 84 | db = PrincipalDatabaseManager(db_url, label=db_label, protected=protected) 85 | if not db.available: 86 | return 87 | db.grab_session() 88 | return db 89 | 90 | 91 | @nope_in_test 92 | def get_ro(ro_label, protected=True): 93 | """Get a readonly database instance, based on its name. 94 | 95 | If the label does not exist or the database labeled can't be reached, None 96 | is returned. 97 | """ 98 | # Instantiate a readonly database. 99 | defaults = get_readonly_databases() 100 | if ro_label == 'primary' and 'override' in defaults: 101 | logger.info("Found an override database: using in place of primary.") 102 | ro_label = 'override' 103 | if ro_label not in defaults: 104 | logger.error(f"No such readonly database available: {ro_label}.
Check " 105 | f"config file or environment variables.") 106 | return 107 | db_url = defaults[ro_label] 108 | ro = ReadonlyDatabaseManager(db_url, label=ro_label, protected=protected) 109 | if not ro.available: 110 | return 111 | ro.grab_session() 112 | return ro 113 | 114 | 115 | def get_ro_host(ro_label): 116 | """Get the host of the current readonly database.""" 117 | ro = get_ro(ro_label) 118 | if not ro: 119 | return None 120 | return ro.url.host 121 | -------------------------------------------------------------------------------- /indra_db/util/helpers.py: -------------------------------------------------------------------------------- 1 | __all__ = ['unpack', '_get_trids', '_fix_evidence_refs', 2 | 'get_raw_stmts_frm_db_list', '_set_evidence_text_ref', 3 | 'get_statement_object'] 4 | 5 | import json 6 | import zlib 7 | import logging 8 | 9 | from indra.util import clockit 10 | from indra.statements import Statement 11 | 12 | logger = logging.getLogger('util-helpers') 13 | 14 | 15 | def get_statement_object(db_stmt): 16 | """Get an INDRA Statement object from a db_stmt.""" 17 | if isinstance(db_stmt, bytes): 18 | jb = db_stmt 19 | else: 20 | jb = db_stmt.json 21 | return Statement._from_json(json.loads(jb.decode('utf-8'))) 22 | 23 | 24 | def _set_evidence_text_ref(stmt, tr): 25 | # This is a separate function because it is likely to change, and this is a 26 | # critical process that is executed in multiple places. 27 | for ev in stmt.evidence: 28 | ev.pmid = tr.pmid 29 | ev.text_refs = tr.get_ref_dict() 30 | 31 | 32 | @clockit 33 | def _fix_evidence_refs(db, rid_stmt_trios): 34 | """Get proper id data for a raw statement from the database. 35 | 36 | Alterations are made to the Statement objects "in-place", so this function 37 | itself returns None. 38 | """ 39 | rid_set = {rid for rid, _, _ in rid_stmt_trios if rid is not None} 40 | logger.info("Getting text refs for %d readings." % len(rid_set)) 41 | if rid_set: 42 | rid_tr_pairs = db.select_all( 43 | [db.Reading.id, db.TextRef], 44 | db.Reading.id.in_(rid_set), 45 | db.Reading.text_content_id == db.TextContent.id, 46 | db.TextContent.text_ref_id == db.TextRef.id 47 | ) 48 | rid_tr_dict = {rid: tr for rid, tr in rid_tr_pairs} 49 | for rid, sid, stmt in rid_stmt_trios: 50 | if rid is None: 51 | # This means this statement came from a database, not reading. 52 | continue 53 | assert len(stmt.evidence) == 1, \ 54 | "Only raw statements can have their refs fixed." 55 | _set_evidence_text_ref(stmt, rid_tr_dict[rid]) 56 | return 57 | 58 | 59 | @clockit 60 | def get_raw_stmts_frm_db_list(db, db_stmt_objs, fix_refs=True, with_sids=True): 61 | """Convert table objects of raw statements into INDRA Statement objects.""" 62 | rid_stmt_sid_trios = [(db_stmt.reading_id, db_stmt.id, 63 | get_statement_object(db_stmt)) 64 | for db_stmt in db_stmt_objs] 65 | if fix_refs: 66 | _fix_evidence_refs(db, rid_stmt_sid_trios) 67 | # Note: it is important that order is maintained here (hence not a set or 68 | # dict). 
69 | if with_sids: 70 | return [(sid, stmt) for _, sid, stmt in rid_stmt_sid_trios] 71 | else: 72 | return [stmt for _, _, stmt in rid_stmt_sid_trios] 73 | 74 | 75 | def unpack(bts, decode=True): 76 | ret = zlib.decompress(bts, zlib.MAX_WBITS+16) 77 | if decode: 78 | ret = ret.decode('utf-8') 79 | return ret 80 | 81 | 82 | def _get_trids(db, id_val, id_type): 83 | """Return text ref IDs corresponding to any ID type and value.""" 84 | # Get the text ref id(s) 85 | if id_type in ['trid']: 86 | trids = [int(id_val)] 87 | else: 88 | id_types = ['pmid', 'pmcid', 'doi', 'pii', 'url', 'manuscript_id'] 89 | if id_type not in id_types: 90 | raise ValueError('id_type must be one of: %s' % str(id_types)) 91 | constraint = (getattr(db.TextRef, id_type) == id_val) 92 | trids = [trid for trid, in db.select_all(db.TextRef.id, constraint)] 93 | return trids 94 | -------------------------------------------------------------------------------- /indra_db/util/s3_path.py: -------------------------------------------------------------------------------- 1 | import re 2 | from os import path 3 | from io import BytesIO 4 | 5 | 6 | class S3Path(object): 7 | """A simple object to make it easier to manage s3 locations.""" 8 | def __init__(self, bucket, key=None): 9 | if not isinstance(bucket, str): 10 | raise ValueError("Bucket must be a string, not %s." % type(bucket)) 11 | self.bucket = bucket 12 | if key is not None: 13 | if not isinstance(key, str): 14 | raise ValueError("Key must be a string, not %s." % type(key)) 15 | elif key.startswith('/'): 16 | key = key[1:] 17 | self.key = key 18 | 19 | def __lt__(self, other): 20 | if not isinstance(other, S3Path): 21 | raise ValueError(f"Cannot compare with type \"{type(other)}\".") 22 | return self.to_string() < other.to_string() 23 | 24 | def __eq__(self, other): 25 | if not isinstance(other, S3Path): 26 | raise ValueError(f"Cannot compare with type \"{type(other)}\".") 27 | return self.to_string() == other.to_string() 28 | 29 | def __le__(self, other): 30 | if not isinstance(other, S3Path): 31 | raise ValueError(f"Cannot compare with type \"{type(other)}\".") 32 | return self.to_string() <= other.to_string() 33 | 34 | def kw(self, prefix=False): 35 | ret = {'Bucket': self.bucket} 36 | if self.key: 37 | if prefix: 38 | ret['Prefix'] = self.key 39 | else: 40 | ret['Key'] = self.key 41 | return ret 42 | 43 | def get(self, s3): 44 | if not self.key: 45 | raise ValueError("Cannot get key-less s3 path.") 46 | return s3.get_object(**self.kw()) 47 | 48 | def upload(self, s3, body): 49 | if not self.key: 50 | raise ValueError("Cannot 'upload' to a key-less s3 path.") 51 | bytes_io = BytesIO(body) 52 | return s3.upload_fileobj(bytes_io, **self.kw()) 53 | 54 | def put(self, s3, body): 55 | if not self.key: 56 | raise ValueError("Cannot 'put' to a key-less s3 path.") 57 | return s3.put_object(Body=body, **self.kw()) 58 | 59 | def list_objects(self, s3): 60 | raw_res = s3.list_objects_v2(**self.kw(prefix=True)) 61 | return [self.__class__(self.bucket, e['Key']) 62 | for e in raw_res['Contents']] 63 | 64 | def list_prefixes(self, s3): 65 | raw_res = s3.list_objects_v2(Delimiter='/', **self.kw(prefix=True)) 66 | return [self.__class__(self.bucket, e['Prefix']) 67 | for e in raw_res['CommonPrefixes']] 68 | 69 | def exists(self, s3): 70 | return 'Contents' in s3.list_objects_v2(**self.kw(prefix=True)) 71 | 72 | def delete(self, s3): 73 | return s3.delete_object(**self.kw()) 74 | 75 | def get_element_path(self, *subkeys): 76 | args = [] 77 | if self.key is not None: 78 | 
args.append(self.key) 79 | args += subkeys 80 | return self.from_key_parts(self.bucket, *args) 81 | 82 | @classmethod 83 | def from_key_parts(cls, bucket, *key_elements): 84 | key = path.join(*key_elements) 85 | return cls(bucket, key) 86 | 87 | @classmethod 88 | def from_string(cls, s3_key_str): 89 | patt = re.compile(r's3://([a-z0-9\-.]+)/(.*)') 90 | m = patt.match(s3_key_str) 91 | if m is None: 92 | raise ValueError("Invalid format for s3 path: %s" % s3_key_str) 93 | bucket, key = m.groups() 94 | if not key: 95 | key = None 96 | return cls(bucket, key) 97 | 98 | def to_string(self): 99 | return 's3://{bucket}/{key}'.format(bucket=self.bucket, key=self.key) 100 | 101 | def __str__(self): 102 | return self.to_string() 103 | 104 | def __repr__(self): 105 | return 'S3Path({bucket}, {key})'.format(bucket=self.bucket, 106 | key=self.key) 107 | -------------------------------------------------------------------------------- /indra_db_service/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gyorilab/indra_db/77785ce0d1badd271b120db747abfff4d6f35832/indra_db_service/__init__.py -------------------------------------------------------------------------------- /indra_db_service/cli/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import click 4 | 5 | 6 | @click.group() 7 | def main(): 8 | """Run the indra db rest service CLI.""" 9 | 10 | 11 | @main.command() 12 | @click.argument('deployment', nargs=1) 13 | @click.option('-s', '--settings', 'zappa_settings_file', 14 | default='zappa_settings.json', 15 | help="Specify the zappa settings file to use. Default is " 16 | "'zappa_settings.json'.") 17 | def push(deployment, zappa_settings_file): 18 | """Push a new deployment to the remote lambdas using zappa.""" 19 | import json 20 | from pathlib import Path 21 | from indra_db_service.cli.zappa_tools import fix_permissions 22 | click.echo(f"Updating {deployment} deployment.") 23 | if not Path(zappa_settings_file).exists(): 24 | click.echo(f"Zappa settings file not found: {zappa_settings_file}") 25 | return 26 | zappa_settings = json.load(open(zappa_settings_file, 'r')) 27 | os.system(f'zappa update {deployment}') 28 | fix_permissions(deployment, zappa_settings=zappa_settings) 29 | 30 | 31 | @main.command() 32 | @click.option('-p', '--port', type=click.INT, 33 | help="Override the default port number.") 34 | @click.option('-h', '--host', default='0.0.0.0', 35 | help="Override the default host.") 36 | @click.option('-vd', '--vue-deployment', 37 | type=click.Choice(['stable', 'dev', 'latest', 'test']), 38 | help="Load the vue package from this S3 deployment instead of " 39 | "a local directory.", 40 | required=False) 41 | def test_service(port, host, vue_deployment): 42 | """Run the service in test mode locally.""" 43 | from indra_db_service.config import TESTING 44 | TESTING['status'] = True 45 | if vue_deployment is not None: 46 | TESTING['deployment'] = vue_deployment 47 | TESTING['vue-root'] = ( 48 | f'https://bigmech.s3.amazonaws.com/indra-db/indralabvue-' 49 | f'{vue_deployment}' 50 | ) 51 | click.echo(f'Using deployment {vue_deployment} from S3 at {TESTING["vue-root"]}') 52 | 53 | from indra_db_service.api import app 54 | app.run(host=host, port=port, debug=True) 55 | 56 | 57 | if __name__ == '__main__': 58 | main() 59 | -------------------------------------------------------------------------------- /indra_db_service/cli/__main__.py: 
-------------------------------------------------------------------------------- 1 | from . import main 2 | 3 | 4 | if __name__ == '__main__': 5 | main() 6 | -------------------------------------------------------------------------------- /indra_db_service/cli/zappa_tools.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | 3 | from indra_db.config import CONFIG 4 | from indra_db.util.aws import get_role_kwargs 5 | 6 | 7 | # Lambda CONFIG parameters 8 | aws_role = CONFIG['lambda']['role'] 9 | aws_primary_function = 'indra-db-api-ROOT' 10 | 11 | 12 | def fix_permissions(deployment, zappa_settings) -> None: 13 | """Add permissions to the lambda function to allow access from API Gateway. 14 | 15 | When Zappa runs, it removes permission for the primary endpoint to call 16 | the lambda functions it creates. This function goes in and fixes those 17 | permissions, and is intended to be run after a zappa update. 18 | """ 19 | # Get relevant settings from the zappa config. 20 | project_name = zappa_settings[deployment]['project_name'] 21 | region = zappa_settings[deployment]['aws_region'] 22 | if zappa_settings[deployment]['profile_name'].lower() != aws_role.lower(): 23 | raise Exception("Required roles do not match!") 24 | 25 | # Get the ID for the API on API Gateway 26 | kwargs, identity = get_role_kwargs(aws_role) 27 | if 'region_name' not in kwargs: 28 | kwargs['region_name'] = region 29 | api_gateway = boto3.client('apigateway', **kwargs) 30 | api_data = api_gateway.get_rest_apis() 31 | for item in api_data['items']: 32 | if item['name'] == aws_primary_function: 33 | break 34 | else: 35 | raise Exception(f"Could not find api matching name: " 36 | f"{aws_primary_function}") 37 | 38 | # Give the API Gateway access to the lambda functions. 
39 | account_id = identity['Account'] 40 | lambda_client = boto3.client('lambda', **kwargs) 41 | for label, endpoint in [('root', ''), ('leafs', '/*')]: 42 | source_arn = (f"arn:aws:execute-api:{region}:{account_id}:{item['id']}" 43 | f"/*/*/{deployment}{endpoint}") 44 | statement_id = f'{aws_primary_function}-access-to-{deployment}-{label}' 45 | lambda_client.add_permission(FunctionName=f'{project_name}-{deployment}', 46 | Action='lambda:InvokeFunction', 47 | Principal='apigateway.amazonaws.com', 48 | SourceArn=source_arn, 49 | StatementId=statement_id) 50 | return 51 | -------------------------------------------------------------------------------- /indra_db_service/config.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "TITLE", 3 | "DEPLOYMENT", 4 | "BASE_URL", 5 | "VUE_ROOT", 6 | "MAX_STMTS", 7 | "MAX_LIST_LEN", 8 | "REDACT_MESSAGE", 9 | "TESTING", 10 | "jwt_nontest_optional", 11 | "CURATOR_SALT", 12 | ] 13 | 14 | from os import environ 15 | from pathlib import Path 16 | from flask_jwt_extended import jwt_required 17 | 18 | TITLE = "The INDRA Database" 19 | DEPLOYMENT = environ.get("INDRA_DB_API_DEPLOYMENT") 20 | BASE_URL = environ.get("INDRA_DB_API_BASE_URL") 21 | CURATOR_SALT = environ.get("INDRA_DB_API_CURATOR_SALT") 22 | VUE_ROOT = environ.get("INDRA_DB_API_VUE_ROOT") 23 | if VUE_ROOT is not None and not VUE_ROOT.startswith("http"): 24 | VUE_ROOT = Path(VUE_ROOT).expanduser() 25 | if not VUE_ROOT.is_absolute(): 26 | VUE_ROOT = Path(__file__).parent.absolute() / VUE_ROOT 27 | MAX_STMTS = 500 28 | MAX_LIST_LEN = 2000 29 | REDACT_MESSAGE = "[MISSING/INVALID CREDENTIALS: limited to 200 char for Elsevier]" 30 | 31 | TESTING = {} 32 | if environ.get("TESTING_DB_APP") == "1": 33 | TESTING["status"] = True 34 | else: 35 | TESTING["status"] = False 36 | TESTING["deployment"] = "" 37 | TESTING["vue-root"] = "" 38 | 39 | 40 | def jwt_nontest_optional(func): 41 | if TESTING["status"]: 42 | return func 43 | else: 44 | return jwt_required(optional=True)(func) 45 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | node_modules 3 | /dist 4 | 5 | # local env files 6 | .env.local 7 | .env.*.local 8 | 9 | # Log files 10 | npm-debug.log* 11 | yarn-debug.log* 12 | yarn-error.log* 13 | 14 | # Editor directories and files 15 | .idea 16 | .vscode 17 | *.suo 18 | *.ntvs* 19 | *.njsproj 20 | *.sln 21 | *.sw? 22 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/README.md: -------------------------------------------------------------------------------- 1 | # data-vis 2 | 3 | ## Project setup 4 | ``` 5 | npm install 6 | ``` 7 | 8 | ### Compiles and hot-reloads for development 9 | ``` 10 | npm run test 11 | ``` 12 | 13 | ### Compiles and minifies for production 14 | ``` 15 | npm run build 16 | ``` 17 | 18 | ### Lints and fixes files 19 | ``` 20 | npm run lint 21 | ``` 22 | 23 | ### Customize configuration 24 | See [Configuration Reference](https://cli.vuejs.org/config/). 
25 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/babel.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | presets: [ 3 | '@vue/cli-plugin-babel/preset' 4 | ] 5 | } 6 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "data-vis", 3 | "version": "0.1.0", 4 | "private": true, 5 | "scripts": { 6 | "serve": "vue-cli-service serve", 7 | "build": "vue-cli-service build --target lib --name DataVis src/index.js", 8 | "watch": "npm run build -- --watch", 9 | "test": "npm run watch -- --mode development", 10 | "lint": "vue-cli-service lint" 11 | }, 12 | "dependencies": { 13 | "@vueform/multiselect": "^1.5.0", 14 | "apexcharts": "^3.26.3", 15 | "core-js": "^3.4.4", 16 | "vue": "^3.0.11", 17 | "vue-router": "^4.0.8", 18 | "vue3-apexcharts": "^1.4.0" 19 | }, 20 | "devDependencies": { 21 | "@vue/cli-plugin-babel": "~4.5.0", 22 | "@vue/cli-plugin-eslint": "~4.5.0", 23 | "@vue/cli-service": "~4.5.0", 24 | "@vue/compiler-sfc": "^3.0.0", 25 | "babel-eslint": "^10.0.3", 26 | "eslint": "^5.16.0", 27 | "eslint-plugin-vue": "^5.0.0", 28 | "vue-template-compiler": "^2.6.10" 29 | }, 30 | "eslintConfig": { 31 | "root": true, 32 | "env": { 33 | "node": true 34 | }, 35 | "extends": [ 36 | "plugin:vue/essential", 37 | "eslint:recommended" 38 | ], 39 | "rules": {}, 40 | "parserOptions": { 41 | "parser": "babel-eslint" 42 | } 43 | }, 44 | "browserslist": [ 45 | "> 1%", 46 | "last 2 versions" 47 | ] 48 | } 49 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | data-vis 9 | 10 | 11 | 14 |
15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/src/App.vue: -------------------------------------------------------------------------------- 1 | 7 | 8 | 19 | 20 | 30 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/src/components/AmountView/AmountView.vue: -------------------------------------------------------------------------------- 1 | 25 | 26 | 147 | 148 | 149 | 150 | 158 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/src/components/AmountView/LineChart.vue: -------------------------------------------------------------------------------- 1 | 17 | 18 | 101 | 102 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/src/components/AmountView/index.js: -------------------------------------------------------------------------------- 1 | import datavisComp from './AmountView' 2 | 3 | export default Vue => { 4 | Vue.component(datavisComp.name, datavisComp); 5 | } 6 | 7 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/src/components/TimeView/TimeView.vue: -------------------------------------------------------------------------------- 1 | 69 | 70 | 204 | 205 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/src/components/TimeView/index.js: -------------------------------------------------------------------------------- 1 | import datavisComp from './TimeView' 2 | 3 | export default Vue => { 4 | Vue.component(datavisComp.name, datavisComp); 5 | } 6 | 7 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/src/components/index.js: -------------------------------------------------------------------------------- 1 | export {default as TimeView} from './TimeView' 2 | export {default as AmountView} from './AmountView' 3 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/src/index.js: -------------------------------------------------------------------------------- 1 | export {default as TimeView} from './components/TimeView' 2 | export {default as AmountView} from './components/AmountView' 3 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/src/main.js: -------------------------------------------------------------------------------- 1 | import { createApp } from 'vue' 2 | import App from './App.vue' 3 | import {TimeView, AmountView} from './index' 4 | 5 | const app = createApp(App) 6 | app.use(TimeView) 7 | app.use(AmountView) 8 | app.mount('#app') 9 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/vue.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | publicPath: '/data-vis/', 3 | } 4 | -------------------------------------------------------------------------------- /indra_db_service/errors.py: -------------------------------------------------------------------------------- 1 | from flask import Response, jsonify 2 | 3 | 4 | class HttpUserError(ValueError): 5 | def __init__(self, msg, err_code=400): 6 | self.err_code = err_code 7 | self.msg = msg 8 | super(HttpUserError, self).__init__(msg) 9 | 10 | def to_json(self): 11 | 
return {"result": "failure", "reason": self.msg} 12 | 13 | def response(self): 14 | return jsonify(self.to_json()), self.err_code 15 | 16 | 17 | class ResultTypeError(HttpUserError): 18 | def __init__(self, result_type): 19 | self.result_type = result_type 20 | msg = f"Invalid result type: {result_type}" 21 | super(ResultTypeError, self).__init__(msg) 22 | 23 | 24 | class InvalidCredentials(HttpUserError): 25 | def __init__(self, cred_type): 26 | super(InvalidCredentials, self).\ 27 | __init__(f"Invalid credentials: {cred_type}", 401) 28 | 29 | 30 | class InsufficientPermission(HttpUserError): 31 | def __init__(self, resource): 32 | super(InsufficientPermission, self).\ 33 | __init__(f"Insufficient permissions for: {resource}", 403) 34 | -------------------------------------------------------------------------------- /indra_db_service/gunicorn.conf.py: -------------------------------------------------------------------------------- 1 | """Gunicorn configuration file for the INDRA DB service 2 | 3 | https://docs.gunicorn.org/en/stable/settings.html#config-file 4 | """ 5 | 6 | import threading 7 | from indralab_auth_tools.src.database import monitor_database_connection 8 | 9 | 10 | def post_fork(server, worker): 11 | """Function to run after forking a worker 12 | 13 | See: https://docs.gunicorn.org/en/stable/settings.html#post-fork 14 | 15 | This function is called after a worker is forked. It starts a thread to monitor 16 | the database connection and reset the connection if it is lost. 17 | """ 18 | 19 | # Setting check interval to 2x gunicorn timeout, which is 300 s. 20 | thread = threading.Thread( 21 | target=monitor_database_connection, args=(600,), daemon=True 22 | ) 23 | thread.start() 24 | print(f"Started database connection monitor thread in worker {worker.pid}.") 25 | -------------------------------------------------------------------------------- /indra_db_service/sample_hashes.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gyorilab/indra_db/77785ce0d1badd271b120db747abfff4d6f35832/indra_db_service/sample_hashes.pkl -------------------------------------------------------------------------------- /indra_db_service/search_introduction.md: -------------------------------------------------------------------------------- 1 | # INDRA DB Search Interface 2 | 3 | On the landing page of the INDRA DB web interface, you can search for statements by 4 | agent name, statement type, MeSH term, or paper. By default, an unfilled agent search 5 | option is displayed. You can add additional search options by selecting one from the 6 | dropdown below the current list of search options. You can also remove a search option 7 | by clicking the "X" button next to it. Once you have entered your search criteria, click 8 | the "Search" button to retrieve the statements that match your search criteria. 9 | 10 | ## Search Options 11 | 12 | The search options are as follows: 13 | 14 | - Agent: search by a specific entity (gene, small molecule, biological process, etc.) 15 | The options are: 16 | - role: 17 | 18 | - subject: the agent is an upstream/controller in retrieved statements 19 | - object: the agent is downstream/controlled in retrieved statements 20 | - any: any role is allowed 21 | - text: Enter the name of the agent 22 | - namespace and Gilda grounding option: Typically it's more reliable to query the DB 23 | using identifiers rather than informal names. 
If you know the identifier for e.g., a 24 | gene, say "HGNC:1234", you enter '1234' in the text box and then chose 'hgnc' in the 25 | namespace dropdown. However, if you only know the name, the Gilda grounding option 26 | is useful. If you enter e.g., "K-ras" and click "Ground with GILDA", it will 27 | automatically find an identifier for it. If there is ambiguity, you can select the 28 | correct entity from a dropdown. Once you click "Search", the DB will be queried 29 | using the ID selected from the dropdown. 30 | - Type: the type of statement e.g. Activation, Phosphorylation, DecreaseAmount, Complex, 31 | etc. Read more about the types of statements in the 32 | [INDRA documentation](https://indra.readthedocs.io/en/latest/modules/statements.html). 33 | - MeSH: a Medical Subject Headings term that the papers retrieved as evidence are 34 | annotated with. This option also has the option to ground with Gilda if you only know 35 | the name of the MeSH term. 36 | - Paper: Limit the search to a specific publication that evidence comes from. To include 37 | multiple papers, select another paper search option from the dropdown. In the paper 38 | search option, you can search by these publication identifiers: 39 | - PMID: PubMed ID 40 | - PMCID: PubMed Central ID 41 | - DOI: Digital Object Identifier 42 | - TRID: Internal INDRA DB ID signifying a specific publication regardless of the 43 | external identifier (PMID, PMCID, DOI). 44 | - TCID: Internal INDRA DB ID signifying a piece of a text retrieved from 45 | a particular source. 46 | 47 | ## Search Results 48 | 49 | The search results are displayed in hierarchical list format. At the top level, the 50 | most generic form of interaction matching the search criteria are displayed. Clicking 51 | on one of the rows expands the next level of detail, showing the specific forms of 52 | interactions that match the search criteria. Clicking on one of these rows expands the 53 | next level of detail, showing the specific statements that match the search criteria. 54 | The nesting is at most three levels deep, but can also be less if e.g., there is only one 55 | statement type for one interaction type. 56 | 57 | ![Web UI screenshot](../doc/web_ui_results_expanded.png) 58 | Search results view with three levels of nesting expanded for USP15 affecting BARD1 59 | 60 | The search results allows you to curate evidence for each statement. To do this, click 61 | on the pencil icon next to the piece of evidence you want to curate. This will open a 62 | curation area where different options for curating the evidence are available. To read 63 | more about curation, see the 64 | [curation tutorial](https://indra.readthedocs.io/en/latest/tutorials/html_curation.html) 65 | in the INDRA documentation. 66 | -------------------------------------------------------------------------------- /indra_db_service/templates/daily_data.html: -------------------------------------------------------------------------------- 1 | {% extends "idbr_template.html" %} 2 | 3 | {% block scripts %} 4 | {{ super() }} 5 | 6 | {% endblock %} 7 | 8 | {% block body %} 9 | {{ super() }} 10 |
11 |

Batch Job Runtimes

12 | 13 | 14 |

Output Measurements

15 | 16 |
17 | 18 | 20 | 25 | {% endblock %} 26 | -------------------------------------------------------------------------------- /indra_db_service/templates/idbr_description.html: -------------------------------------------------------------------------------- 1 | This project is developed by the 2 | Gyori Lab for Computational Biomedicine at Northeastern University. 3 | This work was funded by DARPA grants W911NF‐15‐1‐0544 and HR00112220036 4 | under the DARPA CwC, DARPA ASKEM and ARPA-H BDF programs. 5 | Source code for the INDRA DB is available here. 6 | Contact: Benjamin M. Gyori. -------------------------------------------------------------------------------- /indra_db_service/templates/idbr_statements_view.html: -------------------------------------------------------------------------------- 1 | {% extends "indra/statements_view.html" %} 2 | {% from "auth_macros.html" import login_overlay %} 3 | {% from "idbr_template.html" import nav_header %} 4 | 5 | {% block scripts %} 6 | {{ super() }} 7 | 8 | 9 | 12 | 13 | {% endblock %} 14 | 15 | {% block navbar %} 16 | {{ nav_header(identity) }} 17 | {% endblock %} 18 | 19 | {% block footer_desc %}{% include "idbr_description.html" %}{{ super() }}{% endblock %} 20 | 21 | {% block body %} 22 | {{ login_overlay() }} 23 | {{ super() }} 24 | {% endblock %} 25 | 26 | {% block additional_footer %} 27 | {{ super() }}{% include "idbr_footer.html" %} 28 | {% endblock %} 29 | -------------------------------------------------------------------------------- /indra_db_service/templates/idbr_template.html: -------------------------------------------------------------------------------- 1 | {% extends "indra/template.html" %} 2 | {% from "auth_macros.html" import login_overlay %} 3 | 4 | {% macro nav_header(identity) -%} 5 | 39 | 50 | 55 | 93 | {%- endmacro %} 94 | 95 | {% block title %} 96 | INDRA Database 97 | {% endblock %} 98 | 99 | {% block navbar %} 100 | {{ nav_header(identity) }} 101 | {% endblock %} 102 | 103 | 104 | {% block body %} 105 | {{ login_overlay() }} 106 | {{ super() }} 107 | {% endblock %} 108 | 109 | {% block footer_desc %} 110 |

{% include "idbr_description.html" %}

111 | {% endblock %} 112 | -------------------------------------------------------------------------------- /indra_db_service/templates/search.html: -------------------------------------------------------------------------------- 1 | {% extends "idbr_template.html" %} 2 | 3 | {% block scripts %} 4 | {{ super() }} 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 63 | 64 | {% endblock %} 65 | 66 | {% block body %} 67 | {{ super() }} 68 |
69 | 70 |
71 | 92 | {% endblock %} 93 | -------------------------------------------------------------------------------- /indra_db_service/templates/search_statements.html: -------------------------------------------------------------------------------- 1 | {% extends "idbr_template.html" %} 2 | 3 | {% block scripts %} 4 | {{ super() }} 5 | 6 | 77 | {% endblock %} 78 | 79 | {% block body %} 80 | {{ super() }} 81 |

{{ message }}

82 |
83 | Mandatory Parameters
Enter subject and object 84 |
85 |
86 | 88 |
89 |
90 | 92 |
93 |
94 | or enter agents separated by space 95 |
96 |
97 | 100 |
101 |
102 |
Optional parameters
Statement type (see the INDRA documentation for more info on statement 105 | types.) 106 |
107 |
108 | 110 |
111 |
112 | Number of statements returned (max 1000) 113 |
114 |
115 |
117 | 119 |
120 |
121 |
122 | Evidence count per statement (max 10000) 123 |
124 |
125 |
127 | 129 |
130 |
131 |
132 |
Submit 133 |
134 |
135 | 138 |
139 |
140 | 141 |
142 | {% endblock %} 143 | -------------------------------------------------------------------------------- /indra_db_service/templates/welcome.html: -------------------------------------------------------------------------------- 1 | {% extends 'idbr_template.html' %} 2 | 3 | {% block body %} 4 | Click the button to the right to sign in and get access to the DB 5 | search page 6 | 10 | {% endblock %} -------------------------------------------------------------------------------- /indra_db_service/util.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from io import StringIO 4 | from datetime import datetime 5 | 6 | from indra.assemblers.html.assembler import _format_stmt_text 7 | from indra_db.client import stmt_from_interaction 8 | 9 | from indra_db.client.readonly.query import gilda_ground 10 | 11 | logger = logging.getLogger('db rest api - util') 12 | 13 | 14 | class DbAPIError(Exception): 15 | pass 16 | 17 | 18 | class NoGroundingFound(DbAPIError): 19 | pass 20 | 21 | 22 | def get_s3_client(): 23 | import boto3 24 | from botocore import config 25 | return boto3.client('s3', boto3.session.Session().region_name, 26 | config=config.Config(s3={'addressing_style': 'path'})) 27 | 28 | # ============================================== 29 | # Define some utilities used to resolve queries. 30 | # ============================================== 31 | 32 | 33 | def process_agent(agent_param): 34 | """Get the agent id and namespace from an input param.""" 35 | 36 | if not agent_param.endswith('@TEXT'): 37 | param_parts = agent_param.split('@') 38 | if len(param_parts) == 2: 39 | ag, ns = param_parts 40 | elif len(param_parts) == 1: 41 | ns = 'NAME' 42 | ag = param_parts[0] 43 | else: 44 | raise DbAPIError('Unrecognized agent spec: \"%s\"' % agent_param) 45 | else: 46 | ag = agent_param[:-5] 47 | ns = 'TEXT' 48 | 49 | if ns == 'HGNC-SYMBOL': 50 | ns = 'NAME' 51 | 52 | logger.info("Resolved %s to ag=%s, ns=%s" % (agent_param, ag, ns)) 53 | return ag, ns 54 | 55 | 56 | def process_mesh_term(mesh_term): 57 | """Use gilda to translate a mesh term into a MESH ID if possible.""" 58 | if mesh_term is None: 59 | return mesh_term 60 | 61 | # Check to see if this is a mesh ID. 62 | if any(mesh_term.startswith(c) for c in ['D', 'C']) \ 63 | and mesh_term[1:].isdigit(): 64 | return mesh_term 65 | 66 | # Try to ground the term. 
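# gilda_ground returns a list of scored matches for the term; the first match
# grounded to the MESH namespace is returned as the ID below, and
# NoGroundingFound is raised if no MESH grounding is among the results.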
67 | results = gilda_ground(mesh_term) 68 | for res in results: 69 | if res['term']['db'] == 'MESH': 70 | logger.info(f"Auto-mapped {mesh_term} to {res['term']['id']} " 71 | f"({res['term']['entry_name']}) using Gilda.") 72 | return res['term']['id'] 73 | raise NoGroundingFound(f"Could not find MESH id for {mesh_term} among " 74 | f"gilda results:\n{json.dumps(results, indent=2)}") 75 | 76 | 77 | def get_source(ev_json): 78 | notes = ev_json.get('annotations') 79 | if notes is None: 80 | return 81 | src = notes.get('content_source') 82 | if src is None: 83 | return 84 | return src.lower() 85 | 86 | 87 | def sec_since(t): 88 | return (datetime.now() - t).total_seconds() 89 | 90 | 91 | class LogTracker(object): 92 | log_path = '.rest_api_tracker.log' 93 | 94 | def __init__(self): 95 | root_logger = logging.getLogger() 96 | self.stream = StringIO() 97 | sh = logging.StreamHandler(self.stream) 98 | formatter = logging.Formatter('%(levelname)s: %(name)s %(message)s') 99 | sh.setFormatter(formatter) 100 | sh.setLevel(logging.WARNING) 101 | root_logger.addHandler(sh) 102 | self.root_logger = root_logger 103 | return 104 | 105 | def get_messages(self): 106 | conts = self.stream.getvalue() 107 | print(conts) 108 | ret = conts.splitlines() 109 | return ret 110 | 111 | def get_level_stats(self): 112 | msg_list = self.get_messages() 113 | ret = {} 114 | for msg in msg_list: 115 | level = msg.split(':')[0] 116 | if level not in ret.keys(): 117 | ret[level] = 0 118 | ret[level] += 1 119 | return ret 120 | 121 | 122 | def iter_free_agents(query_dict): 123 | agent_keys = {k for k in query_dict.keys() if k.startswith('agent')} 124 | for k in agent_keys: 125 | entry = query_dict.pop(k) 126 | if isinstance(entry, list): 127 | for agent in entry: 128 | yield agent 129 | else: 130 | yield entry 131 | 132 | 133 | def _make_english_from_meta(interaction): 134 | stmt_type = interaction.get('type') 135 | agent_json = interaction['agents'] 136 | if stmt_type is None: 137 | if len(agent_json) == 0: 138 | eng = '' 139 | else: 140 | ag_list = list(agent_json.values()) 141 | eng = f'{ag_list[0]}' 142 | if len(agent_json) > 1: 143 | eng += ' affects ' + f'{ag_list[1]}' 144 | if len(agent_json) > 3: 145 | eng += ', ' \ 146 | + ', '.join(f'{ag}' 147 | for ag in ag_list[2:-1]) 148 | if len(agent_json) > 2: 149 | eng += ', and ' + f'{ag_list[-1]}' 150 | else: 151 | eng += ' is modified' 152 | else: 153 | eng = _format_stmt_text(stmt_from_interaction(interaction)) 154 | return eng -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | def main(): 5 | packages = find_packages() 6 | print("Installing `indra_db` Packages:\n", '\n'.join(packages)) 7 | extras_require = {'test': ['nose', 'coverage', 'python-coveralls', 8 | 'nose-timer'], 9 | 'service': ['flask', 'flask-jwt-extended', 'flask-cors', 10 | 'flask-compress', 'numpy'], 11 | 'cli': ['click', 'boto3'], 12 | 'copy': ['pgcopy'], 13 | 'misc': ['matplotlib', 'numpy']} 14 | extras_require['all'] = list({dep for deps in extras_require.values() 15 | for dep in deps}) 16 | setup(name='indra_db', 17 | version='0.0.1', 18 | description='INDRA Database', 19 | long_description='INDRA Database', 20 | url='https://github.com/indralab/indra_db', 21 | author='Patrick Greene', 22 | author_email='patrick_greene@hms.harvard.edu', 23 | packages=packages, 24 | include_package_data=True, 25 | 
install_requires=['sqlalchemy<1.4', 'psycopg2', 'cachetools', 26 | 'termcolor', 'bs4', 'pyyaml'], 27 | extras_require=extras_require, 28 | entry_points=""" 29 | [console_scripts] 30 | indra-db=indra_db.cli:main 31 | indra-db-service=indra_db_service.cli:main 32 | indra-db-benchmarker=benchmarker.cli:main 33 | """) 34 | 35 | 36 | if __name__ == '__main__': 37 | main() 38 | --------------------------------------------------------------------------------
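Given the extras and console-script entry points declared in `setup.py` above, a local development install might look like the following sketch (assuming a source checkout of the repository and that the CLIs expose the usual `--help`):

```
pip install -e ".[service,cli]"

# The console scripts declared under entry_points are then available:
indra-db --help
indra-db-service --help
indra-db-benchmarker --help
```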