├── .github └── workflows │ └── tests.yml ├── .gitignore ├── .readthedocs.yaml ├── LICENSE ├── MANIFEST.in ├── README.md ├── benchmarker ├── __init__.py ├── cli.py ├── util.py └── viewer_app │ ├── app.py │ └── benchmark.html ├── demos ├── api_structure.png ├── api_structure_future.png ├── db_basic_structure.png ├── indra_db.png └── indra_db_description_and_demo.ipynb ├── doc ├── Makefile ├── conf.py ├── ext │ └── citations.py ├── index.rst ├── indra_db_logo.png ├── license.rst ├── modules │ ├── cli │ │ └── index.rst │ ├── client │ │ ├── index.rst │ │ ├── misc.rst │ │ ├── principal │ │ │ └── index.rst │ │ └── readonly │ │ │ └── index.rst │ ├── index.rst │ ├── misc.rst │ ├── preassembly │ │ └── index.rst │ ├── reading │ │ └── index.rst │ ├── schemas │ │ └── index.rst │ └── util │ │ └── index.rst ├── requirements.txt ├── rest_api_doc │ └── readme_link.rst ├── web_ui_doc │ └── index.rst └── web_ui_results_expanded.png ├── docker ├── Dockerfile └── buildspec.yml ├── indra_db ├── __init__.py ├── belief.py ├── cli │ ├── __init__.py │ ├── content.py │ ├── dump.py │ ├── elsevier_titles.txt │ ├── knowledgebase.py │ ├── preassembly.py │ ├── reading.py │ ├── util.py │ └── xdd.py ├── client │ ├── __init__.py │ ├── datasets.py │ ├── principal │ │ ├── __init__.py │ │ ├── content.py │ │ ├── curation.py │ │ ├── pa_statements.py │ │ └── raw_statements.py │ ├── readonly │ │ ├── __init__.py │ │ ├── mesh_ref_counts.py │ │ ├── query.py │ │ └── util.py │ └── statements.py ├── config.py ├── copy_utils.py ├── databases.py ├── exceptions.py ├── preassembly │ ├── preassemble_db.py │ └── submitter.py ├── reading │ ├── __init__.py │ ├── read_db.py │ ├── read_db_aws.py │ └── submitter.py ├── readonly_dumping │ ├── README.md │ ├── __init__.py │ ├── export_assembly.py │ ├── export_assembly_refinement.py │ ├── locations.py │ ├── rds_restore.sh │ ├── readonly_dumping.py │ ├── readonly_dumping_bash.sh │ └── util.py ├── resources │ ├── __init__.py │ ├── build_sample_set.py │ └── default_db_config.ini ├── schemas │ ├── __init__.py │ ├── indexes.py │ ├── mixins.py │ ├── principal_schema.py │ └── readonly_schema.py ├── tests │ ├── README.md │ ├── db_building_util.py │ ├── test_belief.py │ ├── test_config.py │ ├── test_content_manager.py │ ├── test_content_scripts.py │ ├── test_copy.py │ ├── test_dump_manager.py │ ├── test_kbs.py │ ├── test_preassembly.py │ ├── test_principal_client.py │ ├── test_query.py │ ├── test_reading.py │ ├── test_readonly_pipeline.py │ ├── test_setup.py │ ├── test_sif_dumper.py │ ├── test_xdd_manager.py │ └── util.py └── util │ ├── __init__.py │ ├── aws.py │ ├── build_corpus.py │ ├── constructors.py │ ├── content_scripts.py │ ├── data_gatherer.py │ ├── distill_statements.py │ ├── dump_sif.py │ ├── helpers.py │ ├── insert.py │ └── s3_path.py ├── indra_db_service ├── README.md ├── __init__.py ├── api.py ├── call_handlers.py ├── cli │ ├── __init__.py │ ├── __main__.py │ └── zappa_tools.py ├── config.py ├── data-vis │ ├── .gitignore │ ├── README.md │ ├── babel.config.js │ ├── package.json │ ├── public │ │ └── index.html │ ├── src │ │ ├── App.vue │ │ ├── components │ │ │ ├── AmountView │ │ │ │ ├── AmountView.vue │ │ │ │ ├── LineChart.vue │ │ │ │ └── index.js │ │ │ ├── TimeView │ │ │ │ ├── TimeView.vue │ │ │ │ └── index.js │ │ │ └── index.js │ │ ├── index.js │ │ └── main.js │ └── vue.config.js ├── errors.py ├── gunicorn.conf.py ├── sample_hashes.pkl ├── search_introduction.md ├── static │ └── curationFunctions.js ├── templates │ ├── daily_data.html │ ├── idbr_description.html │ ├── idbr_statements_view.html │ 
├── idbr_template.html │ ├── search.html │ ├── search_statements.html │ └── welcome.html ├── test_api.py └── util.py └── setup.py /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v2 10 | - uses: actions/cache@v2 11 | with: 12 | path: ~/.cache/pip 13 | key: ${{ runner.os }}-pip-${{ hashFiles('**/setup.py') }} 14 | restore-keys: | 15 | ${{ runner.os }}-pip- 16 | - name: Set up Python 3.6 17 | uses: actions/setup-python@v2 18 | with: 19 | python-version: 3.6 20 | - name: Install dependencies 21 | run: | 22 | echo $GITHUB_EVENT_NAME 23 | export WRKDIR=`pwd` 24 | echo "home dir:" $WRKDIR 25 | sudo apt-get update 26 | sudo apt-get install libstdc++6 graphviz python3-dev libgraphviz-dev pkg-config 27 | # Install test/github-workflows-specific dependencies not covered elsewhere 28 | pip install jsonschema coverage nose-timer doctest-ignore-unicode awscli pycodestyle 29 | pip install cython psycopg2 30 | # Now install INDRA DB with all its extras 31 | pip install git+https://github.com/pagreene/indra.git@api-update 32 | cd .. 33 | git clone https://github.com/indralab/ui_util 34 | cd ui_util/indralab_auth_tools 35 | echo "indralab_auth_tools dir:" pwd 36 | pip install . 37 | cd $WRKDIR 38 | echo "indra_db dir:" pwd 39 | pip install .[all] 40 | - name: Run API tests 41 | env: 42 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 43 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 44 | INDRADBPRIMARY: ${{ secrets.INDRADBPRIMARY }} 45 | INDRAROPRIMARY: ${{ secrets.INDRAROPRIMARY }} 46 | SUPERSECRETSECREST: ${{ secrets.SUPERSECRETSECRET }} 47 | run: | 48 | # Set nose attributes based on the context in which we are running 49 | export NOSEATTR="!notravis,!slow,!cron"; 50 | export NOSEATTR=$(if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then echo $NOSEATTR,!nonpublic; else echo $NOSEATTR; fi) 51 | echo $NOSEATTR 52 | # These are files that are ignored so that doctests don't fail 53 | export NOSE_IGNORE_FILES="find_full_text_sentence.py"; 54 | 55 | echo $NOSEATTR 56 | #- cd $TRAVIS_BUILD_DIR 57 | # Now run all INDRA DB REST API tests 58 | cd rest_api 59 | nosetests -v -a $NOSEATTR --with-coverage --cover-inclusive --cover-package=indra --with-doctest --with-doctest-ignore-unicode --with-timer --timer-top-n 10 --processes=0 60 | #- name: Run all other tests 61 | # env: 62 | # AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 63 | # AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 64 | # INDRADBPRIMARY: ${{ secrets.INDRADBPRIMARY }} 65 | # INDRAROPRIMARY: ${{ secrets.INDRAROPRIMARY }} 66 | # run: | 67 | # # Set nose attributes based on the context in which we are running 68 | # export NOSEATTR="!notravis,!slow,!cron"; 69 | # export NOSEATTR=$(if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then echo $NOSEATTR,!nonpublic; else echo $NOSEATTR; fi) 70 | # echo $NOSEATTR 71 | # # These are files that are ignored so that doctests don't fail 72 | # export NOSE_IGNORE_FILES="find_full_text_sentence.py"; 73 | # echo $NOSEATTR 74 | # #- cd $TRAVIS_BUILD_DIR 75 | # # Now run all INDRA DB REST API tests 76 | # cd indra_db 77 | # nosetests -v -a $NOSEATTR --with-coverage --cover-inclusive --cover-package=indra --with-doctest --with-doctest-ignore-unicode --with-timer --timer-top-n 10 --processes=0 78 | 79 | -------------------------------------------------------------------------------- 
/.gitignore: --------------------------------------------------------------------------------
1 | # INDRA DB specific ignores
2 | junk*
3 |
4 | # Some generic ignores
5 | __pycache__/
6 | *.py[cod]
7 | *.so
8 | env/
9 | bin/
10 | build/
11 | _*
12 |
13 | # Other
14 | *.txt
15 | *.cx
16 | *.zip
17 | *.csv
18 | *.java
19 | *.xbel
20 | *.tsv
21 | *.ai
22 | *.png
23 | *.eps
24 | *.gz
25 | *.swp
26 | *.pkl
27 |
28 | # For the cool cats using PyCharm
29 | .idea
30 | .idea/*
31 |
32 | .pytest_cache
33 |
34 | # Mr Developer
35 | .mr.developer.cfg
36 | .project
37 | .pydevproject
38 |
39 | # Django stuff:
40 | *.log
41 | *.pot
42 |
43 | # Data files
44 | *.rdf
45 | *.owl
46 | *.xml
47 | *.nxml
48 | *.bel
49 | *.json
50 | *.bson
51 | *.dat
52 |
53 | # Documents, graphs, images
54 | *.pdf
55 | *.dot
56 | *.bngl
57 | *.jpg
58 |
59 | -------------------------------------------------------------------------------- /.readthedocs.yaml: --------------------------------------------------------------------------------
1 | # .readthedocs.yaml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | version: 2
6 |
7 | # Set the version of Python and other tools you might need
8 | build:
9 | os: ubuntu-20.04
10 | tools:
11 | python: "3.9"
12 |
13 | python:
14 | install:
15 | - requirements: doc/requirements.txt
16 | -------------------------------------------------------------------------------- /MANIFEST.in: --------------------------------------------------------------------------------
1 | include LICENSE
2 | include README.md
3 | include indra_db/resources/default_db_config.ini
4 | -------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # INDRA DB
2 |
3 |
4 |
5 | The INDRA (Integrated Network and Dynamical Reasoning Assembler) Database is a
6 | framework for creating, maintaining, and accessing a database of content,
7 | readings, and statements. This implementation is currently designed to work
8 | primarily with Amazon Web Services RDS running Postgres 9+. Used as a backend
9 | to INDRA, the INDRA Database provides a systematic way of scaling the knowledge
10 | acquired from other databases, reading, and manual input, and puts that
11 | knowledge at your fingertips through a direct Python client and a REST API.
12 |
13 | ### REST API
14 |
15 | The INDRA DB is available via a web UI at: https://db.indra.bio
16 |
17 | At the same URL, a REST service is also available, which allows for programmatic usage
18 | as documented here: https://github.com/gyorilab/indra_db/blob/master/indra_db_service/README.md
19 |
20 | A convenient way to query the INDRA DB is via INDRA's built-in client for the INDRA DB,
21 | which is documented here: https://indra.readthedocs.io/en/latest/modules/sources/indra_db_rest/index.html.
22 |
23 | ### Knowledge sources
24 |
25 | The INDRA Database currently integrates and distills knowledge from several
26 | different sources, both biology-focused natural language processing systems and
27 | other pre-existing databases.
28 |
29 | #### Daily Readers
30 | We have read all available content, and every day we run the following readers:
31 | - [REACH](https://github.com/clulab/reach)
32 | - [Sparser](https://github.com/ddmcdonald/sparser)
33 |
34 | We read all new content with the following readers:
35 | - [Eidos](https://github.com/clulab/eidos)
36 | - [ISI](https://github.com/sgarg87/big_mech_isi_gg)
37 | - [MTI](https://ii.nlm.nih.gov/MTI/index.shtml) - used specifically to tag
38 | content with topic terms.
39 |
40 | We read a limited subset of new content with the following readers:
41 | - [TRIPS](http://trips.ihmc.us/parser/cgi/drum)
42 |
43 | All of this reading is done on the latest content drawn from:
44 | - [PubMed](https://www.ncbi.nlm.nih.gov/pubmed/) - ~19 million abstracts and ~29 million titles
45 | - [PubMed Central](https://www.ncbi.nlm.nih.gov/pmc/) - ~2.7 million full-text articles
46 | - [Elsevier](https://www.elsevier.com/) - ~0.7 million full-text articles
47 | (requires special access)
48 |
49 | #### Other Readers
50 | We also include largely static content extracted by the following readers:
51 | - [RLIMS-P](https://research.bioinformatics.udel.edu/rlimsp/)
52 |
53 | #### Other Databases
54 | We include information from these pre-existing databases:
55 | - [Pathway Commons database](http://pathwaycommons.org/)
56 | - [BEL Large Corpus](https://github.com/OpenBEL/)
57 | - [SIGNOR](https://signor.uniroma2.it/)
58 | - [BioGRID](https://thebiogrid.org/)
59 | - [TAS](https://www.biorxiv.org/content/10.1101/358978v1)
60 | - [TRRUST](https://omictools.com/trrust-tool)
61 | - [PhosphoSitePlus](https://www.phosphosite.org/)
62 | - [Causal Biological Networks Database](http://www.causalbionet.com/)
63 | - [VirHostNet](http://virhostnet.prabi.fr/)
64 | - [CTD](http://ctdbase.org/)
65 | - [Phospho.ELM](http://phospho.elm.eu.org/)
66 | - [DrugBank](https://www.drugbank.ca/)
67 | - [CONIB](https://pharmacome.github.io/conib/)
68 | - [CRoG](https://github.com/chemical-roles/chemical-roles)
69 | - [DGI](https://www.dgidb.org/)
70 |
71 | These databases are retrieved primarily using the tools in `indra.sources`. The
72 | statements extracted from all of these sources are stored and updated in the
73 | database.
74 |
75 | ### Knowledge Assembly
76 |
77 | The INDRA Database uses the powerful internal assembly tools available in INDRA,
78 | implemented here for large-scale incremental assembly. The resulting corpus of
79 | cleaned and de-duplicated statements, each with fully maintained provenance, is
80 | the primary product of the database.
81 |
82 | For more details on the internal assembly process of INDRA, see the
83 | [INDRA documentation](http://indra.readthedocs.io/en/latest/modules/preassembler).
84 |
85 | ### Access
86 |
87 | The content in the database can be accessed by those who created it using the
88 | `indra_db.client` submodule. This repo also implements a REST API which can be
89 | used by those without direct access to the database. For access to our REST
90 | API, please contact the authors.
91 |
92 | ## Installation
93 |
94 | The INDRA Database requires Python 3.6+, though some parts are still compatible with 3.5.
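Once the installation steps below are complete, a quick way to confirm that the package is importable and can see a configured database is a short Python session. This is only a minimal sketch: the `'primary'` label is an example name and must correspond to a database defined in your configuration (see `indra_db/resources/default_db_config.ini` or the corresponding environment variables).

```python
# Minimal sanity check (sketch only). The 'primary' label is an assumption;
# it must match a database defined in your indra_db configuration.
from indra_db import get_db

db = get_db('primary')   # returns a database manager for the named instance
print(db.session)        # a live SQLAlchemy session if the connection works
```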
95 | 96 | First, [install INDRA](http://indra.readthedocs.io/en/latest/installation.html), 97 | then simply clone this repo, and make sure that it is visible in your 98 | `PYTHONPATH`. 99 | 100 | ## Funding 101 | The development of INDRA DB is funded under the DARPA Communicating with Computers program (ARO grant W911NF-15-1-0544). 102 | -------------------------------------------------------------------------------- /benchmarker/__init__.py: -------------------------------------------------------------------------------- 1 | from .util import * 2 | -------------------------------------------------------------------------------- /benchmarker/cli.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import webbrowser 4 | from time import sleep 5 | 6 | from numpy import array 7 | from datetime import datetime 8 | from collections import defaultdict 9 | from typing import Iterable 10 | 11 | import click 12 | 13 | from benchmarker.util import benchmark, list_apis, list_stacks, save_results 14 | 15 | 16 | HERE = os.path.dirname(os.path.abspath(__file__)) 17 | 18 | 19 | @click.group() 20 | def main(): 21 | """The benchmarker CLI. 22 | 23 | The benchmarker tool allows stack deployments to be 24 | compared based on the time taken to run existing test corpora that utilize 25 | the web service. 26 | """ 27 | 28 | 29 | @main.command('list') 30 | @click.argument("list_scope", type=click.Choice(["apis", "stacks"]), 31 | required=False) 32 | def print_list(list_scope): 33 | """List the apis or stacks that have already been used.""" 34 | def print_apis(): 35 | print() 36 | print("Existing API Test Corpora") 37 | print("-------------------------") 38 | for api in list_apis(): 39 | print(api) 40 | 41 | def print_stacks(): 42 | print() 43 | print("Existing Tested Stacks") 44 | print("----------------------") 45 | for stack_name in list_stacks(): 46 | print(stack_name) 47 | 48 | if list_scope == 'apis': 49 | print_apis() 50 | elif list_scope == 'stacks': 51 | print_stacks() 52 | else: 53 | print_apis() 54 | print_stacks() 55 | 56 | 57 | @main.command() 58 | @click.argument("test_corpus") 59 | @click.argument("stack_name") 60 | @click.argument("api_name") 61 | @click.option("-r", "--inner-runs", default=1, 62 | type=click.IntRange(1, 100), 63 | help="Select the number of times to repeat the test in a row.") 64 | @click.option("-R", "--outer-runs", default=1, 65 | type=click.IntRange(1, 100), 66 | help=("Select the number of times to repeat the entire suite of " 67 | "tests.")) 68 | def run(test_corpus, stack_name, api_name, inner_runs, outer_runs): 69 | """Run the benchmarker and save the aggregate the results. 70 | 71 | \b 72 | The TEST_CORPUS should be a path to a python test file that tests the INDRA 73 | Database REST service, using the standard convention: 74 | 75 | "path/to/test_file.py:test_function" 76 | 77 | The STACK_NAME should name a readonly-build stack (database and service 78 | deployment) that are being tested. You can get a list of existing 79 | (previously tested) stacks using `indra_db_benchmarker list`. 80 | 81 | The API_NAME should give a name for the test corpus that is being used. You 82 | can get a list of existing (previously used) corpora using the `list` 83 | feature. 84 | """ 85 | import tabulate 86 | start_time = datetime.utcnow() 87 | 88 | # Run the benchmarker. Run it `outer_run` times, and we will aggregate 89 | # the results below. 
90 | result_list = [] 91 | test_names = [] 92 | for i in range(outer_runs): 93 | run_result = benchmark(test_corpus, num_runs=inner_runs) 94 | if not test_names: 95 | test_names = list(run_result.keys()) 96 | result_list.append(run_result) 97 | 98 | # Aggregate the results from above, either adding values to the list 99 | # or extending a list. 100 | results = {} 101 | for test_name in test_names: 102 | test_results = defaultdict(list) 103 | for this_result in result_list: 104 | test_data = this_result[test_name] 105 | for data_name, data_val in test_data.items(): 106 | if isinstance(data_val, Iterable): 107 | test_results[data_name].extend(data_val) 108 | else: 109 | test_results[data_name].append(data_val) 110 | 111 | # Convert the default dict into a real dict. 112 | test_results = dict(test_results) 113 | 114 | # Turn the time data into an array, and calculate mean and std dev. 115 | time_data = array(test_results['times']) 116 | test_results['duration'] = time_data.mean() 117 | test_results['deviation'] = time_data.std() 118 | 119 | # Calculate the overall pass rate. 120 | test_results['passed'] = sum(test_results['passed'])/outer_runs 121 | 122 | # Add this test's aggregated results to the results object. 123 | results[test_name] = test_results 124 | 125 | rows = [(test, st['passed'], st['duration'], st['deviation']) 126 | for test, st in results.items()] 127 | headers = ('Test', 'Fraction Passed', 'Ave. Duration', 'Std. Deviation') 128 | print(tabulate.tabulate(rows, headers)) 129 | save_results(start_time, api_name, stack_name, results) 130 | 131 | 132 | @main.command() 133 | def view(): 134 | """Run the web service to view results.""" 135 | basic_env = os.environ.copy() 136 | basic_env['FLASK_APP'] = os.path.join(HERE, "viewer_app/app.py:app") 137 | print("Starting web server...") 138 | p = subprocess.Popen(['flask', 'run', '--port', '5280'], 139 | env=basic_env, stdout=subprocess.PIPE, 140 | stderr=subprocess.PIPE) 141 | sleep(2) 142 | print("Opening browser...") 143 | webbrowser.open("http://localhost:5280") 144 | print("Press Ctrl-C to exit.") 145 | p.wait() 146 | 147 | 148 | if __name__ == "__main__": 149 | main() 150 | -------------------------------------------------------------------------------- /benchmarker/util.py: -------------------------------------------------------------------------------- 1 | __all__ = ['benchmark', 'list_apis', 'list_stacks', 'save_results'] 2 | 3 | import os 4 | import json 5 | 6 | import boto3 7 | import logging 8 | from datetime import datetime 9 | from inspect import getmembers, isfunction, isclass, ismethod 10 | from importlib.util import spec_from_file_location, module_from_spec 11 | 12 | from numpy import array 13 | 14 | 15 | logger = logging.getLogger('benchmark_tools') 16 | 17 | BUCKET = 'bigmech' 18 | PREFIX = 'indra-db/benchmarks/' 19 | 20 | 21 | def run_test(test_name, test_func, num_runs): 22 | test_results = dict.fromkeys(['passed', 'error_type', 'error_str', 23 | 'duration', 'deviation', 'times']) 24 | test_results['passed'] = False 25 | test_results['error_type'] = [None]*num_runs 26 | test_results['error_str'] = [None]*num_runs 27 | print(test_name) 28 | print('-' * len(test_name)) 29 | durations = [] 30 | for i in range(num_runs): 31 | print("LOGS:") 32 | start = datetime.now() 33 | try: 34 | test_func() 35 | print('-' * len(test_name)) 36 | print("PASSED!") 37 | test_results['passed'] += True 38 | except Exception as e: 39 | print('-' * len(test_name)) 40 | print("FAILED!", type(e), e) 41 | logger.exception(e) 42 | 
test_results['passed'] += False 43 | test_results['error_type'][i] = str(type(e)) 44 | test_results['error_str'][i] = str(e) 45 | finally: 46 | end = datetime.now() 47 | durations.append((end - start).total_seconds()) 48 | print() 49 | dur_array = array(durations) 50 | test_results['times'] = durations 51 | test_results['duration'] = dur_array.mean() 52 | test_results['deviation'] = dur_array.std() 53 | test_results['passed'] = test_results['passed'] / num_runs 54 | return test_results 55 | 56 | 57 | def benchmark(test_selection=None, base_name=None, num_runs=1): 58 | """Run a benchmark of the REST service using a given test corpus. 59 | 60 | Parameters 61 | ---------- 62 | test_selection : Optional[str] 63 | Specify the location of the test or tests you wish to run, using the 64 | standard formalism: "path/to/test.py:specific_test", where any less 65 | specification will result in a search for things that start with "test_" 66 | recursively, as usual. 67 | base_name : Optional[str] 68 | Give this benchmark a base name. 69 | num_runs : Optional[int] 70 | Specify how many times the tests should be run. 71 | """ 72 | # By default, just run in this directory 73 | if test_selection is None: 74 | test_selection = os.path.abspath('.') 75 | 76 | # Extract a function name, if it was included. 77 | if test_selection.count(':') == 0: 78 | func_name = None 79 | elif test_selection.count(':') == 1: 80 | test_selection, func_name = test_selection.split(':') 81 | else: 82 | raise ValueError(f"Invalid loc: {test_selection}") 83 | mod_name = os.path.basename(test_selection).replace('.py', '') 84 | if base_name: 85 | mod_name = base_name + '.' + mod_name 86 | 87 | # Check if the location exists, and whether it is a directory or file. 88 | # Handle the file case by recursively calling this function for each file. 89 | results = {} 90 | if not os.path.exists(test_selection): 91 | raise ValueError(f"No such file or directory: {test_selection}") 92 | elif os.path.isdir(test_selection): 93 | if func_name is not None: 94 | raise ValueError("To specify function, location must be a file.") 95 | for file in os.listdir(test_selection): 96 | new_path = os.path.join(test_selection, file) 97 | if ('test' in file and os.path.isfile(new_path) 98 | and new_path.endswith('.py')): 99 | results.update(benchmark(new_path, base_name=mod_name, 100 | num_runs=num_runs)) 101 | return results 102 | 103 | # Handle the case a file is specified. 
104 | if not test_selection.endswith('.py'): 105 | raise ValueError(f"Location {test_selection} is not a python file.") 106 | print("=" * len(test_selection)) 107 | print(test_selection) 108 | print('-' * len(test_selection)) 109 | spec = spec_from_file_location(mod_name, test_selection) 110 | test_module = module_from_spec(spec) 111 | try: 112 | spec.loader.exec_module(test_module) 113 | except KeyboardInterrupt: 114 | raise 115 | except Exception as err: 116 | logger.error(f"Failed to load {test_selection}, skipping...") 117 | logger.exception(err) 118 | return results 119 | 120 | # Run test functions 121 | tests = [f for f, _ in getmembers(test_module, isfunction) if 'test' in f] 122 | for test_name in tests: 123 | test = getattr(test_module, test_name) 124 | results[f'{mod_name}.{test_name}'] = run_test(test_name, test, num_runs) 125 | 126 | # Run test classes 127 | test_classes = [c for c, _ in getmembers(test_module, isclass) 128 | if c.lower().startswith('test')] 129 | for class_name in test_classes: 130 | cls = getattr(test_module, class_name) 131 | obj = cls() 132 | test_methods = [m for m, _ in getmembers(obj, ismethod) 133 | if m.lower().startswith('test') 134 | or m.lower() == 'run_test'] 135 | for method_name in test_methods: 136 | obj.setUp() 137 | test = getattr(obj, method_name) 138 | if method_name == 'run_test' and len(test_methods) == 1: 139 | results[f'{mod_name}.{class_name}'] = \ 140 | run_test(class_name, test, num_runs) 141 | else: 142 | results[f'{mod_name}.{class_name}.{method_name}'] = \ 143 | run_test(method_name, test, num_runs) 144 | obj.tearDown() 145 | 146 | return results 147 | 148 | 149 | def list_apis(): 150 | """List the current API names on s3.""" 151 | s3 = boto3.client('s3') 152 | res = s3.list_objects_v2(Bucket=BUCKET, Prefix=PREFIX, Delimiter='/') 153 | return [e['Prefix'][len(PREFIX):-1] for e in res['CommonPrefixes']] 154 | 155 | 156 | def list_stacks(): 157 | """List the stacks represented on s3.""" 158 | s3 = boto3.client('s3') 159 | stack_names = set() 160 | for api_name in list_apis(): 161 | try: 162 | api_prefix = f'{PREFIX}{api_name}/' 163 | res = s3.list_objects_v2(Bucket=BUCKET, Prefix=api_prefix, 164 | Delimiter='/') 165 | stack_names |= {e['Prefix'][len(api_prefix):-1] 166 | for e in res['CommonPrefixes']} 167 | except KeyError: 168 | logger.error(f"Failed to inspect {api_prefix}: likely malformed " 169 | f"content was added to s3.") 170 | continue 171 | return list(stack_names) 172 | 173 | 174 | def save_results(start_time, api_name, stack_name, results): 175 | """Save the result of a test on s3.""" 176 | s3 = boto3.client('s3') 177 | data_key = f'{PREFIX}{api_name}/{stack_name}/{start_time}.json' 178 | s3.put_object(Bucket=BUCKET, Key=data_key, Body=json.dumps(results)) 179 | return 180 | -------------------------------------------------------------------------------- /benchmarker/viewer_app/app.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import boto3 4 | import logging 5 | from os import path 6 | from flask import Flask, jsonify 7 | 8 | from benchmarker.util import list_stacks, list_apis 9 | 10 | logger = logging.getLogger('benchmark_viewer') 11 | 12 | HERE = path.dirname(__file__) 13 | 14 | app = Flask('benchmark_viewer') 15 | BUCKET = 'bigmech' 16 | PREFIX = 'indra-db/benchmarks/' 17 | 18 | 19 | def load(**kwargs): 20 | with open(path.join(HERE, 'benchmark.html'), 'r') as f: 21 | s = f.read() 22 | for key, value in kwargs.items(): 23 | s = s.replace(f'{{{{ {key} }}}}', 
json.dumps(value))
24 | return s
25 |
26 |
27 | @app.route('/', methods=['GET'])
28 | def serve_page():
29 | return load(stacks=list_stacks(), apis=list_apis())
30 |
31 |
32 | @app.route('/fetch/<corpus_name>/<stack_name>/<test_file>', methods=['GET'])
33 | def get_stack_data(corpus_name, stack_name, test_file):
34 | try:
35 | s3 = boto3.client('s3')
36 | file = s3.get_object(
37 | Bucket=BUCKET,
38 | Key=f'{PREFIX}{corpus_name}/{stack_name}/{test_file}'
39 | )
40 | data = json.loads(file['Body'].read())
41 | except Exception as e:
42 | logger.exception(e)
43 | return jsonify({'message': f'Error: {e}'}), 500
44 | return jsonify({'message': 'success', 'tests': data}), 200
45 |
46 |
47 | @app.route('/list/<corpus_name>', methods=['GET'])
48 | def list_corpus_options(corpus_name):
49 | option_dict = {}
50 | try:
51 | s3 = boto3.client('s3')
52 | prefix = f'{PREFIX}{corpus_name}/'
53 | res = s3.list_objects_v2(Bucket=BUCKET, Prefix=prefix)
54 | keys = [e['Key'][len(prefix):] for e in res['Contents']]
55 | for key in keys:
56 | stack, test = key.split('/')
57 | test_time = test.split('.')[0]
58 | label = f'{test_time} ({stack})'
59 | option_dict[label] = {'stack': stack, 'test': test}
60 | except Exception as e:
61 | logger.exception(e)
62 | return jsonify({'message': f'Error: {e}'}), 500
63 | return jsonify({'message': 'success', 'options': option_dict})
64 | -------------------------------------------------------------------------------- /demos/api_structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gyorilab/indra_db/77785ce0d1badd271b120db747abfff4d6f35832/demos/api_structure.png -------------------------------------------------------------------------------- /demos/api_structure_future.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gyorilab/indra_db/77785ce0d1badd271b120db747abfff4d6f35832/demos/api_structure_future.png -------------------------------------------------------------------------------- /demos/db_basic_structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gyorilab/indra_db/77785ce0d1badd271b120db747abfff4d6f35832/demos/db_basic_structure.png -------------------------------------------------------------------------------- /demos/indra_db.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gyorilab/indra_db/77785ce0d1badd271b120db747abfff4d6f35832/demos/indra_db.png -------------------------------------------------------------------------------- /doc/ext/citations.py: --------------------------------------------------------------------------------
1 | import re
2 |
3 | from docutils import nodes, utils
4 | from docutils.parsers.rst import roles
5 |
6 | pubmed_uri_pattern = "http://www.ncbi.nlm.nih.gov/pubmed/%i"
7 | doi_uri_pattern = "http://dx.doi.org/%s"
8 |
9 | def pmid_reference_role(role, rawtext, text, lineno, inliner,
10 | options={}, content=[]):
11 | try:
12 | pmid = int(text)
13 | if pmid <= 0:
14 | raise ValueError
15 | except ValueError:
16 | msg = inliner.reporter.error(
17 | 'pmid number must be a number greater than or equal to 1; '
18 | '"%s" is invalid.' % text, line=lineno)
19 | prb = inliner.problematic(rawtext, rawtext, msg)
20 | return [prb], [msg]
21 | ref = pubmed_uri_pattern % pmid
22 | nodelist = []
23 | nodelist.append(nodes.inline(text='PMID:'))
24 | nodelist.append(nodes.reference(rawtext, utils.unescape(text), refuri=ref,
25 | **options))
26 | return nodelist, []
27 |
28 | def doi_reference_role(role, rawtext, text, lineno, inliner,
29 | options={}, content=[]):
30 | ref = doi_uri_pattern % text
31 | nodelist = []
32 | nodelist.append(nodes.inline(text='doi:'))
33 | nodelist.append(nodes.reference(rawtext, utils.unescape(text), refuri=ref,
34 | **options))
35 | return nodelist, []
36 |
37 | def setup(app):
38 | app.add_role('pmid', pmid_reference_role)
39 | app.add_role('doi', doi_reference_role)
40 | -------------------------------------------------------------------------------- /doc/index.rst: --------------------------------------------------------------------------------
1 | .. mdinclude:: ../README.md
2 |
3 | Further INDRA Database documentation
4 | ====================================
5 | .. toctree::
6 | :maxdepth: 3
7 |
8 | license.rst
9 | modules/index.rst
10 |
11 |
12 | INDRA Database REST Service
13 | ===========================
14 |
15 | .. toctree::
16 | :maxdepth: 3
17 |
18 | rest_api_doc/readme_link.rst
19 |
20 | INDRA Database Web UI
21 | =====================
22 |
23 | .. toctree::
24 | :maxdepth: 3
25 |
26 | web_ui_doc/index.rst
27 |
28 | Indices and tables
29 | ==================
30 |
31 | * :ref:`genindex`
32 | * :ref:`search`
33 |
34 | -------------------------------------------------------------------------------- /doc/indra_db_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gyorilab/indra_db/77785ce0d1badd271b120db747abfff4d6f35832/doc/indra_db_logo.png -------------------------------------------------------------------------------- /doc/license.rst: --------------------------------------------------------------------------------
1 | License and funding
2 | -------------------
3 |
4 | Copyright (C) 2018, Indra Labs
5 |
6 | This code is free software: you can redistribute it and/or modify
7 | it under the terms of the GNU General Public License as published by
8 | the Free Software Foundation, either version 3 of the License, or
9 | (at your option) any later version.
10 |
11 | This code is distributed in the hope that it will be useful,
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | GNU General Public License for more details.
15 |
16 | You may find a copy of the GNU General Public License
17 | `here`_.
18 |
19 | INDRA was developed with funding from ARO grant W911NF-14-1-0397,
20 | "Programmatic modelling for reasoning across complex mechanisms" under
21 | the DARPA Big Mechanism program, and the INDRA Database was developed
22 | as an extension of that core project. Work has continued under
23 | W911NF-14-1-0391, "Active context" under the DARPA Communicating with
24 | Computers program, and the DARPA Automated Scientific Discovery Framework
25 | project.
26 | -------------------------------------------------------------------------------- /doc/modules/cli/index.rst: --------------------------------------------------------------------------------
1 | Pipeline Management CLI
2 | =======================
3 |
4 | This module creates a CLI for managing the pipelines used to update
5 | content and knowledge in the database, and move or transform that
6 | knowledge on a regular basis.
7 |
8 | .. click:: indra_db.cli:main
9 | :prog: indra-db
10 | :nested: full
11 |
12 |
13 | Pipeline CLI Implementations
14 | ============================
15 |
16 | Content (:py:mod:`indra_db.cli.content`)
17 | ----------------------------------------
18 |
19 | The Content CLI manages the text content that is
20 | stored in the database. A parent class is defined, and managers for different
21 | sources (e.g. PubMed) can be defined by inheriting from this parent. This
22 | module also provides the shell command used to run updates of the content.
23 |
24 | .. automodule:: indra_db.cli.content
25 | :members:
26 | :member-order: bysource
27 |
28 |
29 | Reading (:py:mod:`indra_db.cli.reading`)
30 | ----------------------------------------
31 |
32 | The Reading CLI handles the reading of the text content and the processing
33 | of those readings into statements. As with the Content CLI, different reading
34 | pipelines can be handled by defining children of a parent class.
35 |
36 | .. automodule:: indra_db.cli.reading
37 | :members:
38 | :member-order: bysource
39 |
40 |
41 | PreAssembly (:py:mod:`indra_db.cli.preassembly`)
42 | ------------------------------------------------
43 |
44 | The Preassembly CLI manages the preassembly pipeline, deploying
45 | preassembly jobs to Batch.
46 |
47 | .. automodule:: indra_db.cli.preassembly
48 | :members:
49 | :member-order: bysource
50 |
51 |
52 | Knowledge Bases (:py:mod:`indra_db.cli.knowledgebase`)
53 | ------------------------------------------------------
54 |
55 | The INDRA Database also derives much of its knowledge from external databases
56 | and other resources not extracted from plain text, referred to in this repo as
57 | "knowledge bases", so as to avoid the ambiguity of "database". This CLI
58 | handles the updates of those knowledge bases, each of which requires different
59 | handling.
60 |
61 | .. automodule:: indra_db.cli.knowledgebase
62 | :members:
63 | :member-order: bysource
64 |
65 |
66 | Static Dumps (:py:mod:`indra_db.cli.dump`)
67 | ------------------------------------------
68 |
69 | This handles the generation of static dumps, including the readonly database
70 | from the principal database.
71 |
72 | .. automodule:: indra_db.cli.dump
73 | :members:
74 | :member-order: bysource
75 | -------------------------------------------------------------------------------- /doc/modules/client/index.rst: --------------------------------------------------------------------------------
1 | The Client
2 | ==========
3 | The purpose of the client is to be the gateway for external access to the
4 | content of the databases. Here we define high-level access functions for
5 | getting data out of the database in a natural way. This is where the queries
6 | used by the REST API are defined, and most users looking to access knowledge on
7 | the database should use the client if they can, as it is heavily optimized.
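As a rough illustration of the kind of access the client provides, the sketch
below asks the readonly database for preassembled Statements about a single
agent. It is only a sketch: ``get_ro`` is exported from ``indra_db``, but the
``HasAgent`` query class, the ``get_statements`` method, and the ``'primary'``
label are assumptions based on this module's description and your deployment,
and may differ from the actual API.

.. code-block:: python

    # Sketch only: HasAgent, get_statements, and statements() are assumed
    # names, not verified against indra_db.client.readonly.query.
    from indra_db import get_ro
    from indra_db.client.readonly.query import HasAgent

    ro = get_ro('primary')                # readonly database handle
    query = HasAgent('MEK')               # statements with MEK as an agent
    result = query.get_statements(ro, limit=10)
    print(result.statements())            # assembled INDRA Statements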
8 | 9 | Our system utilizes 2 databases, one which represents the "ground truth", as 10 | we know it, and is structured naturally for performing updates on our 11 | knowledge; it will always be the most up to date. We also have a "readonly" 12 | database that we used for our outward facing services. This database is 13 | optimized for fast queries and the content in it is updated weekly. Each 14 | database has its own set of access tools. 15 | 16 | 17 | .. toctree:: 18 | :maxdepth: 3 19 | 20 | principal/index.rst 21 | readonly/index.rst 22 | misc.rst 23 | 24 | 25 | -------------------------------------------------------------------------------- /doc/modules/client/misc.rst: -------------------------------------------------------------------------------- 1 | Miscellaneous Client APIs (Mostly Deprecated) 2 | ============================================= 3 | 4 | There are some, generally archaic, client functions which use both readonly 5 | and principal resources. I make no guarantee that these will work. 6 | 7 | Get Datasets (:py:mod:`indra_db.client.datasets`) 8 | ------------------------------------------------- 9 | 10 | An early attempt at something very like the :py:mod:`indra_db.client.readonly.interactions` 11 | approach to getting superficial data out of the database. 12 | 13 | .. automodule:: indra_db.client.datasets 14 | :members: 15 | 16 | 17 | Get Statements (:py:mod:`indra_db.client.statements`) 18 | ----------------------------------------------------- 19 | 20 | The first round of tools written to get Statements out of the database, 21 | utilizing far too many queries and taking absurdly long to complete. Most of 22 | their functions have been outmoded, with the exception of getting PA Statements 23 | from the principal database, which (as of this writing) has yet to be 24 | implemented. 25 | 26 | .. automodule:: indra_db.client.statements 27 | :members: 28 | -------------------------------------------------------------------------------- /doc/modules/client/principal/index.rst: -------------------------------------------------------------------------------- 1 | The Principal Database Client 2 | ============================= 3 | 4 | This is the set of client tools to access the most-nearly ground truth 5 | knowledge stored on the principal database. 6 | 7 | 8 | Access Readings and Text Content (:py:mod:`indra_db.client.principal.content`) 9 | ------------------------------------------------------------------------------ 10 | 11 | This defines a simple API to access the content that we store on the database 12 | for external purposes. 13 | 14 | .. automodule:: indra_db.client.principal.content 15 | :members: 16 | 17 | 18 | Submit and Retrieve Curations (:py:mod:`indra_db.client.principal.curation`) 19 | ---------------------------------------------------------------------------- 20 | 21 | On our services, users have the ability to curate the results we present, 22 | indicating whether they are correct or not, and how they may be incorrect. The 23 | API for adding and retrieving that input is defined here. 24 | 25 | .. automodule:: indra_db.client.principal.curation 26 | :members: 27 | 28 | 29 | Get Raw Statements (:py:mod:`indra_db.client.principal.raw_statements`) 30 | ----------------------------------------------------------------------- 31 | 32 | Get the raw, uncleaned and un-merged Statements based on agent and type or by 33 | paper(s) of origin. 34 | 35 | .. 
automodule:: indra_db.client.principal.raw_statements 36 | :members: 37 | -------------------------------------------------------------------------------- /doc/modules/client/readonly/index.rst: -------------------------------------------------------------------------------- 1 | The Readonly Client 2 | =================== 3 | 4 | Here are our primary tools intended for retrieving Statements, in particular 5 | Pre-Assembled (PA) Statements, from the readonly database. This is some of the 6 | most heavily optimized access code in the repo, and is the backbone of most 7 | external or outward facing applications. 8 | 9 | The readonly database, as the name suggests, is designed to take only read 10 | requests, and is updated via dump only once a week. This allows users of 11 | our database to access it even as we perform daily updates on the principal 12 | database, without worrying about queries interfering. 13 | 14 | 15 | Construct composable queries (:py:mod:`indra_db.client.readonly.query`) 16 | ------------------------------------------------------------------------------- 17 | 18 | This is a sophisticated system of classes that can be used to form queires 19 | for preassembled statements from the readonly database. 20 | 21 | .. automodule:: indra_db.client.readonly.query 22 | :members: 23 | :member-order: bysource 24 | 25 | -------------------------------------------------------------------------------- /doc/modules/index.rst: -------------------------------------------------------------------------------- 1 | INDRA Database modules 2 | ====================== 3 | 4 | .. toctree:: 5 | :maxdepth: 3 6 | 7 | client/index.rst 8 | cli/index.rst 9 | reading/index.rst 10 | preassembly/index.rst 11 | schemas/index.rst 12 | util/index.rst 13 | misc.rst 14 | 15 | -------------------------------------------------------------------------------- /doc/modules/misc.rst: -------------------------------------------------------------------------------- 1 | Some Miscellaneous Modules 2 | ========================== 3 | 4 | Here are some modules and files that live on their own, and don't fit neatly 5 | into other categories. 6 | 7 | 8 | Low Level Database Interface (:py:mod:`indra_db.databases`) 9 | ----------------------------------------------------------- 10 | 11 | The Database Manager classes are the lowest level interface with the database, 12 | implemented with SQLAlchemy, providing useful short-cuts but also allowing full 13 | access to SQLAlchemy's API. 14 | 15 | .. automodule:: indra_db.databases 16 | :members: 17 | :member-order: bysource 18 | 19 | 20 | Belief Calculator (:py:mod:`indra_db.belief`) 21 | --------------------------------------------- 22 | 23 | The belief in the knowledge of a Statement is a measure of our confidence that 24 | the Statement is an accurate representation of the text, _NOT_ our confidence 25 | in the validity of what was in that text. Given the size of the content in the 26 | database, some special care is needed when calculating this value, which 27 | depends heavily on the support relations between pre-assembled Statements. 28 | 29 | .. 
automodule:: indra_db.belief 30 | :members: 31 | -------------------------------------------------------------------------------- /doc/modules/preassembly/index.rst: -------------------------------------------------------------------------------- 1 | Database Integrated Preassembly Tools 2 | ===================================== 3 | 4 | The database runs incremental preassembly on the raw statements to generate 5 | the preassembled (PA) Statements. The code to accomplish this task is defined 6 | here, principally in :class:`DbPreassembler 7 | `. This module also 8 | defines proceedures for running these jobs on AWS. 9 | 10 | Database Preassembly (:py:mod:`indra_db.preassembly.preassemble_db`) 11 | -------------------------------------------------------------------- 12 | 13 | This module defines a class that manages preassembly for a given list of 14 | statement types on the local machine. 15 | 16 | .. automodule:: indra_db.preassembly.preassemble_db 17 | :members: 18 | :member-order: bysource 19 | 20 | 21 | A Class to Manage and Monitor AWS Batch Jobs (:py:mod:`indra_db.preassembly.submitter`) 22 | --------------------------------------------------------------------------------------- 23 | 24 | Allow a manager to monitor the Batch jobs to prevent runaway jobs, and smooth 25 | out job runs and submissions. 26 | 27 | .. automodule:: indra_db.preassembly.submitter 28 | :members: 29 | :member-order: bysource 30 | 31 | -------------------------------------------------------------------------------- /doc/modules/reading/index.rst: -------------------------------------------------------------------------------- 1 | Database Integrated Reading Tools 2 | ================================= 3 | 4 | Here are defined the procedures for reading content on the database, stashing 5 | the reading outputs, and producing statements from the readings, and inserting 6 | those raw statements into the database. 7 | 8 | The Database Readers (:py:mod:`indra_db.reading.read_db`) 9 | --------------------------------------------------------- 10 | 11 | A reader is defined as a python class which implements the machinery needed to 12 | process the text content we store, read it, and extract Statements from the 13 | reading results, storing the readings along the way. The reader must conform 14 | to a standard interface, which then allows readers to be run in a plug-and-play 15 | manner. 16 | 17 | .. automodule:: indra_db.reading.read_db 18 | :members: 19 | :member-order: bysource 20 | 21 | 22 | The Database Script for Running on AWS (:py:mod:`indra_db.reading.read_db_aws`) 23 | ------------------------------------------------------------------------------- 24 | 25 | This is the script used to run reading on AWS Batch, generally run from an 26 | AWS Lambda function. 27 | 28 | .. automodule:: indra_db.reading.read_db_aws 29 | :members: 30 | :member-order: bysource 31 | 32 | 33 | A Class to Manage and Monitor AWS Batch Jobs (:py:mod:`indra_db.reading.submitter`) 34 | ----------------------------------------------------------------------------------- 35 | 36 | Allow a manager to monitor the Batch jobs to prevent runaway jobs, and smooth 37 | out job runs and submissions. 38 | 39 | .. 
automodule:: indra_db.reading.submitter 40 | :members: 41 | :member-order: bysource 42 | 43 | -------------------------------------------------------------------------------- /doc/modules/schemas/index.rst: -------------------------------------------------------------------------------- 1 | Database Schemas 2 | ================ 3 | 4 | Here are defined the schemas for the principal and readonly databases, as well 5 | as some useful mixin classes. 6 | 7 | Principal Database Schema (:py:mod:`indra_db.schemas.principal_schema`) 8 | ----------------------------------------------------------------------- 9 | 10 | .. automodule:: indra_db.schemas.principal_schema 11 | :members: 12 | :member-order: bysource 13 | 14 | Readonly Database Schema (:py:mod:`indra_db.schemas.readonly_schema`) 15 | --------------------------------------------------------------------- 16 | 17 | Defines the `get_schema` function for the readonly database, which is used by 18 | external services to access the Statement knowledge we acquire. 19 | 20 | .. automodule:: indra_db.schemas.readonly_schema 21 | :members: 22 | :member-order: bysource 23 | 24 | Class Mix-ins (:py:mod:`indra_db.schemas.mixins`) 25 | ------------------------------------------------- 26 | 27 | This defines class mixins that are used to add general features to SQLAlchemy 28 | table objects via multiple inheritance. 29 | 30 | .. automodule:: indra_db.schemas.mixins 31 | :members: 32 | :member-order: bysource 33 | 34 | Indexes (:py:mod:`indra_db.schemas.indexes`) 35 | -------------------------------------------- 36 | 37 | This defines the classes needed to create and maintain indices in the database, 38 | the other part of the infrastructure of which is included in the `IndraDBTable` 39 | class mixin definition. 40 | 41 | .. automodule:: indra_db.schemas.indexes 42 | :members: 43 | :member-order: bysource 44 | -------------------------------------------------------------------------------- /doc/modules/util/index.rst: -------------------------------------------------------------------------------- 1 | Utilities 2 | ========= 3 | 4 | Here live the more mundane and backend utilities used throughout other modules 5 | of the codebase, and potentially elsewhere, although they are not intended for 6 | external use in general. Several more-or-less bespoke scripts are also stored 7 | here. 8 | 9 | 10 | Database Session Constructors (:py:mod:`indra_db.util.constructors`) 11 | -------------------------------------------------------------------- 12 | 13 | Constructors to get interfaces to the different databases, selecting among 14 | the various physical instances defined in the config file. 15 | 16 | .. automodule:: indra_db.util.constructors 17 | :members: 18 | 19 | 20 | Scripts to Get Content (:py:mod:`indra_db.util.content_scripts`) 21 | ---------------------------------------------------------------- 22 | 23 | General scripts for getting content by various IDs. 24 | 25 | .. automodule:: indra_db.util.content_scripts 26 | :members: 27 | 28 | 29 | Distilling Raw Statements (:py:mod:`indra_db.util.distill_statements`) 30 | ---------------------------------------------------------------------- 31 | 32 | Do some pre-pre-assembly cleansing of the raw Statements to account for various 33 | kinds of duplicity that are artifacts of our content collection and reading 34 | pipelines rather than representing actually duplicated knowledge in the 35 | literature. 36 | 37 | .. 
automodule:: indra_db.util.distill_statements 38 | :members: 39 | 40 | 41 | Script to Create a SIF Dump (:py:mod:`indra_db.util.dump_sif`) 42 | -------------------------------------------------------------- 43 | 44 | Create an interactome from metadata in the database and dump the results as a 45 | sif file. 46 | 47 | .. automodule:: indra_db.util.dump_sif 48 | :members: 49 | 50 | 51 | General Helper Functions (:py:mod:`indra_db.util.helpers`) 52 | ---------------------------------------------------------- 53 | 54 | Functions with broad utility throughout the repository, but otherwise 55 | miscellaneous. 56 | 57 | .. automodule:: indra_db.util.helpers 58 | :members: 59 | 60 | 61 | Routines for Inserting Statements and Content (:py:mod:`indra_db.util.insert`) 62 | ------------------------------------------------------------------------------ 63 | 64 | Inserting content into the database can be a rather involved process, but here 65 | are defined high-level utilities to uniformly accomplish the task. 66 | 67 | .. automodule:: indra_db.util.insert 68 | :members: 69 | -------------------------------------------------------------------------------- /doc/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx_rtd_theme 2 | sphinx 3 | mock 4 | ipython 5 | matplotlib 6 | future 7 | psycopg2 8 | sphinx-click 9 | requests 10 | m2r2 11 | termcolor 12 | cachetools 13 | git+https://github.com/sorgerlab/indra.git 14 | 15 | -------------------------------------------------------------------------------- /doc/rest_api_doc/readme_link.rst: -------------------------------------------------------------------------------- 1 | .. mdinclude:: ../../indra_db_service/README.md 2 | 3 | -------------------------------------------------------------------------------- /doc/web_ui_doc/index.rst: -------------------------------------------------------------------------------- 1 | .. mdinclude:: ../../indra_db_service/search_introduction.md 2 | -------------------------------------------------------------------------------- /doc/web_ui_results_expanded.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gyorilab/indra_db/77785ce0d1badd271b120db747abfff4d6f35832/doc/web_ui_results_expanded.png -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG AWS_ACCOUNT_ID 2 | FROM ${AWS_ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/indra:latest 3 | 4 | ARG BUILD_BRANCH 5 | ARG INDRA_BRANCH 6 | 7 | ENV DIRPATH /sw 8 | ENV PYTHONPATH "$PYTHONPATH:${DIRPATH}/covid-19" 9 | WORKDIR $DIRPATH 10 | 11 | RUN cd indra && \ 12 | git fetch --all && \ 13 | git checkout $INDRA_BRANCH && \ 14 | echo "INDRA_BRANCH=" $INDRA_BRANCH && \ 15 | pip install -e . -U 16 | 17 | # Install libpq5 and some other necessities. 
18 | RUN wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | apt-key add - && \ 19 | apt-get update && \ 20 | apt-get install -y lsb-release && \ 21 | echo "deb http://apt.postgresql.org/pub/repos/apt/ `lsb_release -cs`-pgdg main" | tee /etc/apt/sources.list.d/pgdg.list && \ 22 | apt-get update && \ 23 | apt-get install -y libpq5 libpq-dev postgresql-client-13 postgresql-client-common && \ 24 | pip install awscli 25 | 26 | # Install psycopg2 27 | RUN git clone https://github.com/psycopg/psycopg2.git && \ 28 | cd psycopg2 && \ 29 | python setup.py build && \ 30 | python setup.py install 31 | 32 | # Install pgcopy 33 | RUN git clone https://github.com/pagreene/pgcopy.git && \ 34 | cd pgcopy && \ 35 | python setup.py install 36 | 37 | # Install covid-19 38 | RUN git clone https://github.com/indralab/covid-19.git 39 | 40 | # Install sqlalchemy < 1.4 (due to indirect dependencies, it may be a later 41 | # version in the indra:db image) 42 | RUN pip install "sqlalchemy<1.4" 43 | 44 | #install bs4 45 | RUN pip install bs4 46 | 47 | # Install indra_db 48 | RUN git clone https://github.com/gyorilab/indra_db.git && \ 49 | cd indra_db && \ 50 | pip install -e .[all] && \ 51 | pip list && \ 52 | echo "PYTHONPATH =" $PYTHONPATH && \ 53 | git checkout $BUILD_BRANCH && \ 54 | echo "BUILD_BRANCH =" $BUILD_BRANCH && \ 55 | git branch && \ 56 | echo "[indra]" > /root/.config/indra/config.ini 57 | 58 | -------------------------------------------------------------------------------- /docker/buildspec.yml: -------------------------------------------------------------------------------- 1 | version: 0.1 2 | 3 | phases: 4 | pre_build: 5 | commands: 6 | - echo Logging in to Amazon ECR... 7 | - aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin $AWS_ACCOUNT_ID.dkr.ecr.us-east-1.amazonaws.com 8 | build: 9 | commands: 10 | - echo Build started on `date` 11 | - echo Building the Docker image... 12 | - docker build --build-arg BUILD_BRANCH=$BUILD_BRANCH --build-arg INDRA_BRANCH=$INDRA_BRANCH --build-arg AWS_ACCOUNT_ID=$AWS_ACCOUNT_ID -t $IMAGE_REPO_NAME:$IMAGE_TAG -f docker/Dockerfile . 13 | - docker tag $IMAGE_REPO_NAME:$IMAGE_TAG $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$IMAGE_REPO_NAME:$IMAGE_TAG 14 | post_build: 15 | commands: 16 | - echo Build completed on `date` 17 | - echo Pushing the Docker image... 18 | - docker push $AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$IMAGE_REPO_NAME:$IMAGE_TAG 19 | -------------------------------------------------------------------------------- /indra_db/__init__.py: -------------------------------------------------------------------------------- 1 | from .util import get_primary_db, get_db, get_ro 2 | from .databases import texttypes, formats, sql_expressions 3 | -------------------------------------------------------------------------------- /indra_db/cli/__init__.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from .knowledgebase import kb 4 | from .content import content 5 | from .dump import dump_cli 6 | from .preassembly import pa 7 | from .reading import reading 8 | from .xdd import xdd 9 | 10 | 11 | @click.group() 12 | def main(): 13 | """INDRA Database Infrastructure CLI 14 | 15 | The INDRA Database is both a physical database and an infrastructure for 16 | managing and updating the content of that physical database. This CLI 17 | is used for executing these management commands. 
18 | """ 19 | 20 | 21 | @main.command() 22 | @click.argument("task", type=click.Choice(["gather"])) 23 | def pipeline_stats(task): 24 | """Manage the pipeline stats gathered on s3. 25 | 26 | All major upload and update pipelines have basic timeing and success-failure 27 | stats gather on them using the 28 | :class:`DataGatherer ` class 29 | wrapper. 30 | 31 | These stats are displayed on the ``/monitor`` endpoint of the database 32 | service. 33 | 34 | \b 35 | Tasks are: 36 | - gather: gather the individual job JSONs into an aggregated file. 37 | """ 38 | if task == "gather": 39 | from indra_db.util.data_gatherer import digest_s3_files 40 | digest_s3_files() 41 | 42 | 43 | main.add_command(kb) 44 | main.add_command(content) 45 | main.add_command(dump_cli) 46 | main.add_command(pa) 47 | main.add_command(reading) 48 | main.add_command(xdd) 49 | -------------------------------------------------------------------------------- /indra_db/cli/preassembly.py: -------------------------------------------------------------------------------- 1 | import click 2 | from datetime import datetime 3 | 4 | from indra_db import get_db 5 | from indra_db.exceptions import IndraDbException 6 | 7 | from .util import format_date 8 | 9 | def filter_updates(stmt_type, pa_updates): 10 | return {u.run_datetime for u in pa_updates if u.stmt_type == stmt_type} 11 | 12 | 13 | def list_last_updates(db): 14 | """Return a dict of the most recent updates for each statement type.""" 15 | from indra_db.preassembly.submitter import VALID_STATEMENTS 16 | pa_updates = db.select_all(db.PreassemblyUpdates) 17 | last_full_update = max(filter_updates(None, pa_updates)) 18 | last_updates = {st: max(filter_updates(st, pa_updates) 19 | | {last_full_update}) 20 | for st in VALID_STATEMENTS} 21 | return last_updates 22 | 23 | 24 | def list_latest_raw_stmts(db): 25 | """Return a dict of the most recent new raw statement for each type.""" 26 | from sqlalchemy import func 27 | res = (db.session.query(db.RawStatements.type, 28 | func.max(db.RawStatements.create_date)) 29 | .group_by(db.RawStatements.type) 30 | .all()) 31 | return {k: v for k, v in res} 32 | 33 | 34 | def run_preassembly(mode, project_name): 35 | """Construct a submitter and begin submitting jobs to Batch for preassembly. 36 | 37 | This function will determine which statement types need to be updated and 38 | how far back they go, and will create the appropriate 39 | :class:`PreassemblySubmitter 40 | ` 41 | instance, and run the jobs with pre-set parameters on statement types that 42 | need updating. 43 | 44 | Parameters 45 | ---------- 46 | project_name : str 47 | This name is used to gag the various AWS resources used for accounting 48 | purposes. 49 | """ 50 | from indra_db.preassembly.submitter import VALID_STATEMENTS, \ 51 | PreassemblySubmitter 52 | db = get_db('primary') 53 | if mode == 'update': 54 | # Find the latest update for each statement type. 55 | last_updates = list_last_updates(db) 56 | 57 | # Get the most recent raw statement datetimes 58 | latest_raw_stmts = list_latest_raw_stmts(db) 59 | 60 | # Only include statements types that have new raw statements. 61 | need_to_update = [s_type for s_type, last_upd in last_updates.items() 62 | if s_type in latest_raw_stmts.keys() 63 | and latest_raw_stmts[s_type] > last_upd] 64 | else: 65 | # Make sure the pa_statements table is truly empty. 66 | if db.select_one(db.PAStatements): 67 | raise IndraDbException("Please clear the pa_statements table " 68 | "before running create. 
If you want to run " 69 | "an incremental update, please run with " 70 | "mode 'update'.") 71 | 72 | # Just run them all. 73 | need_to_update = VALID_STATEMENTS[:] 74 | 75 | # Create the submitter, and run it. 76 | basename = datetime.utcnow().strftime('%Y%m%d_%H%M%S') 77 | ps = PreassemblySubmitter(basename, mode, project_name=project_name) 78 | ps.set_max_jobs(4) 79 | ps.run(need_to_update, 100000, True, stagger=600, poll_interval=120) 80 | 81 | 82 | @click.group() 83 | def pa(): 84 | """Manage the preassembly pipeline.""" 85 | 86 | 87 | @pa.command() 88 | @click.argument('task', type=click.Choice(['create', 'update']), 89 | required=True) 90 | @click.argument('project-name', required=False) 91 | def run(task, project_name): 92 | """Manage the indra_db preassembly. 93 | 94 | \b 95 | Tasks: 96 | - "create": populate the pa_statements table for the first time (this 97 | requires that the table be empty). 98 | - "update": update the existing content in pa_statements with the latest 99 | from raw statements. 100 | 101 | A project name is required to tag the AWS instances with a "project" tag. 102 | """ 103 | run_preassembly(task, project_name) 104 | 105 | 106 | @pa.command('list') 107 | @click.option('-r', '--with-raw', is_flag=True, 108 | help="Include the latest datetimes for raw statements of each " 109 | "type. This will take much longer.") 110 | def show_list(with_raw): 111 | """List the latest updates for each type of Statement.""" 112 | import tabulate 113 | 114 | db = get_db('primary') 115 | rows = [(st, lu) for st, lu in list_last_updates(db).items()] 116 | header = ('Statement Type', 'Last Update') 117 | if with_raw: 118 | print("This may take a while...", end='', flush=True) 119 | raw_stmt_dates = list_latest_raw_stmts(db) 120 | print("\r", end='') 121 | new_rows = [] 122 | for st, lu in rows: 123 | raw_date = raw_stmt_dates.get(st) 124 | if raw_date is None: 125 | new_rows.append((st, format_date(lu), "[None]", "No")) 126 | else: 127 | new_rows.append((st, format_date(lu), format_date(raw_date), 128 | "Yes" if raw_date > lu else "No")) 129 | rows = new_rows 130 | header += ('Latest Raw Stmt', 'Needs Update?') 131 | else: 132 | rows = [(st, format_date(lu)) for st, lu in rows] 133 | rows.sort() 134 | print(tabulate.tabulate(rows, header)) 135 | 136 | 137 | -------------------------------------------------------------------------------- /indra_db/cli/util.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | 4 | def format_date(dt): 5 | if not isinstance(dt, datetime): 6 | return dt 7 | return dt.strftime("%Y %b %d %I:%M%p") 8 | -------------------------------------------------------------------------------- /indra_db/client/__init__.py: -------------------------------------------------------------------------------- 1 | """This module contains tools designed to access content in the db. 2 | 3 | Specifically, this is for direct access to the database, not through the web 4 | api. 5 | 6 | All the functions defined require direct access to the database, which is in 7 | general restricted. For broad access, see the indra_db_rest api client in 8 | INDRA. 9 | 10 | There are two key ways of accessing statements from the INDRA Database: 11 | directly and using the materialize views. Only the `get_statement_jsons` 12 | functions are limited to using the views. Most other functions access the 13 | primary tables of the database and are generally slower. 
The 14 | `get_statement_jsons` functions are the most heavily optimized for fast 15 | recall, as they are the back-end to the REST API. 16 | """ 17 | 18 | from .datasets import * 19 | from .readonly import * 20 | from .principal import * 21 | -------------------------------------------------------------------------------- /indra_db/client/principal/__init__.py: -------------------------------------------------------------------------------- 1 | from .raw_statements import * 2 | from .pa_statements import * 3 | from .curation import * 4 | from .content import * 5 | -------------------------------------------------------------------------------- /indra_db/client/principal/content.py: -------------------------------------------------------------------------------- 1 | __all__ = ['get_reader_output', 'get_content_by_refs', 'get_text'] 2 | 3 | import logging 4 | from collections import defaultdict 5 | 6 | from indra_db.util import unpack, _get_trids 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | def get_reader_output(db, ref_id, ref_type='tcid', reader=None, 12 | reader_version=None): 13 | """Return reader output for a given text content. 14 | 15 | Parameters 16 | ---------- 17 | db : :py:class:`DatabaseManager` 18 | Reference to the DB to query 19 | ref_id : int or str 20 | The text reference ID whose reader output should be returned 21 | ref_type : Optional[str] 22 | The type of ID to look for, options include 23 | 'tcid' for the database's internal unique text content ID, 24 | or 'pmid', 'pmcid', 'doi, 'pii', 'manuscript_id' 25 | Default: 'tcid' 26 | reader : Optional[str] 27 | The name of the reader whose output is of interest 28 | reader_version : Optional[str] 29 | The specific version of the reader 30 | 31 | Returns 32 | ------- 33 | reading_results : dict{dict{list[str]}} 34 | A dict of reader outputs that match the query criteria, indexed first 35 | by text content id, then by reader. 36 | """ 37 | if ref_type == 'tcid': 38 | clauses = [db.Reading.text_content_id == ref_id] 39 | else: 40 | trids = _get_trids(db, ref_id, ref_type) 41 | if not trids: 42 | return [] 43 | logger.debug("Found %d text ref ids." % len(trids)) 44 | clauses = [db.TextContent.text_ref_id.in_(trids), 45 | db.Reading.text_content_id == db.TextContent.id] 46 | if reader: 47 | clauses.append(db.Reading.reader == reader.upper()) 48 | if reader_version: 49 | clauses.append(db.Reading.reader_version == reader_version) 50 | 51 | res = db.select_all([db.Reading.text_content_id, db.Reading.reader, 52 | db.Reading.bytes], *clauses) 53 | reading_dict = defaultdict(lambda: defaultdict(lambda: [])) 54 | for tcid, reader, result in res: 55 | unpacked_result = None 56 | if not result: 57 | logger.warning("Got reading result with zero content.") 58 | else: 59 | unpacked_result = unpack(result) 60 | reading_dict[tcid][reader].append(unpacked_result) 61 | return reading_dict 62 | 63 | 64 | def get_content_by_refs(db, pmid_list=None, trid_list=None, sources=None, 65 | formats=None, content_type='abstract', unzip=True): 66 | """Return content from the database given a list of PMIDs or text ref ids. 67 | 68 | Note that either pmid_list OR trid_list must be set, and only one can be 69 | set at a time. 70 | 71 | Parameters 72 | ---------- 73 | db : :py:class:`DatabaseManager` 74 | Reference to the DB to query 75 | pmid_list : list[str] or None 76 | A list of pmids. Default is None, in which case trid_list must be 77 | given. 78 | trid_list : list[int] or None 79 | A list of text ref ids. 
Default is None, in which case pmid list must 80 | be given. 81 | sources : list[str] or None 82 | A list of sources to include (e.g. 'pmc_oa', or 'pubmed'). Default is 83 | None, indicating that all sources will be included. 84 | formats : list[str] 85 | A list of the formats to be included ('xml', 'text'). Default is None, 86 | indicating that all formats will be included. 87 | content_type : str 88 | Select the type of content to load ('abstract' or 'fulltext'). Note 89 | that not all refs will have any, or both, types of content. 90 | unzip : Optional[bool] 91 | If True, the compressed output is decompressed into clear text. 92 | Default: True 93 | 94 | Returns 95 | ------- 96 | content_dict : dict 97 | A dictionary whose keys are text ref ids, with each value being the 98 | the corresponding content. 99 | """ 100 | # Make sure we only get one type of list. 101 | if not (pmid_list or trid_list): 102 | raise ValueError("One of `pmid_list` or `trid_list` must be defined.") 103 | if pmid_list and trid_list: 104 | raise ValueError("Only one of `pmid_list` or `trid_list` may be used.") 105 | 106 | # Put together the clauses for the general constraints. 107 | clauses = [] 108 | if sources is not None: 109 | clauses.append(db.TextContent.source.in_(sources)) 110 | if formats is not None: 111 | clauses.append(db.TextContent.format.in_(formats)) 112 | if content_type not in ['abstract', 'fulltext']: 113 | raise ValueError("Unrecognized content type: %s" % content_type) 114 | else: 115 | clauses.append(db.TextContent.text_type == content_type) 116 | 117 | # Do the query to get the content. 118 | if pmid_list is not None: 119 | content_list = db.select_all( 120 | [db.TextRef.pmid, db.TextContent.content], 121 | db.TextRef.id == db.TextContent.text_ref_id, 122 | db.TextRef.pmid.in_(pmid_list), 123 | *clauses 124 | ) 125 | else: 126 | content_list = db.select_all([db.TextRef.id, db.TextContent.content], 127 | db.TextContent.text_ref_id.in_(trid_list), 128 | *clauses) 129 | if unzip: 130 | content_dict = {id_val: unpack(content) 131 | for id_val, content in content_list} 132 | else: 133 | content_dict = {id_val: content for id_val, content in content_list} 134 | return content_dict 135 | 136 | 137 | def get_text(db, pmids, text_type): 138 | """Return text content of a given type for a list of PMIDs.""" 139 | # Run a query for text content of the desired type 140 | res = (db.session.query(db.TextRef.pmid, db.TextContent.text_type, 141 | db.TextContent.content) 142 | .filter(db.TextRef.pmid_in(pmids)) 143 | .join(db.TextContent) 144 | .filter(db.TextContent.text_type == text_type) 145 | .all()) 146 | # Unpack the content, clean it up, and return it as a dictionary keyed 147 | # by pmid 148 | text_by_pmid = { 149 | row.pmid: unpack(row.content).replace("\t", " ").replace("\n", "\t") 150 | for row in res 151 | } 152 | return text_by_pmid 153 | -------------------------------------------------------------------------------- /indra_db/client/principal/pa_statements.py: -------------------------------------------------------------------------------- 1 | __all__ = ["get_pa_stmt_jsons"] 2 | 3 | import json 4 | from collections import defaultdict 5 | 6 | from sqlalchemy import func, cast, String, null 7 | from sqlalchemy.dialects.postgresql import array 8 | from sqlalchemy.orm import aliased 9 | 10 | from indra_db.util.constructors import get_db 11 | from indra_db.client.principal.raw_statements import _fix_evidence 12 | 13 | 14 | def get_pa_stmt_jsons(clauses=None, with_evidence=True, db=None, limit=1000): 
15 | """Load preassembled Statements from the principal database.""" 16 | if db is None: 17 | db = get_db('primary') 18 | 19 | if clauses is None: 20 | clauses = [] 21 | 22 | # Construct the core query. 23 | if with_evidence: 24 | text_ref_cols = [db.Reading.id, db.TextContent.id, db.TextRef.pmid, 25 | db.TextRef.pmcid, db.TextRef.doi, db.TextRef.url, 26 | db.TextRef.pii] 27 | text_ref_types = tuple([str if isinstance(col.type, String) else int 28 | for col in text_ref_cols]) 29 | text_ref_cols = tuple([cast(col, String) 30 | if not isinstance(col.type, String) else col 31 | for col in text_ref_cols]) 32 | text_ref_labels = ('rid', 'tcid', 'pmid', 'pmcid', 'doi', 'url', 'pii') 33 | core_q = db.session.query( 34 | db.PAStatements.mk_hash.label('mk_hash'), 35 | db.PAStatements.json.label('json'), 36 | func.array_agg(db.RawStatements.json).label("raw_jsons"), 37 | func.array_agg(array(text_ref_cols)).label("text_refs") 38 | ).outerjoin( 39 | db.RawUniqueLinks, 40 | db.RawUniqueLinks.pa_stmt_mk_hash == db.PAStatements.mk_hash 41 | ).join( 42 | db.RawStatements, 43 | db.RawStatements.id == db.RawUniqueLinks.raw_stmt_id 44 | ).outerjoin( 45 | db.Reading, 46 | db.Reading.id == db.RawStatements.reading_id 47 | ).outerjoin( 48 | db.TextContent, 49 | db.TextContent.id == db.Reading.text_content_id 50 | ).outerjoin( 51 | db.TextRef, 52 | db.TextRef.id == db.TextContent.text_ref_id 53 | ) 54 | else: 55 | text_ref_types = None 56 | text_ref_labels = None 57 | core_q = db.session.query( 58 | db.PAStatements.mk_hash.label('mk_hash'), 59 | db.PAStatements.json.label('json'), 60 | null().label('raw_jsons'), 61 | null().label('text_refs') 62 | ) 63 | core_q = core_q.filter( 64 | *clauses 65 | ).group_by( 66 | db.PAStatements.mk_hash, 67 | db.PAStatements.json 68 | ) 69 | if limit: 70 | core_q = core_q.limit(limit) 71 | core_sq = core_q.subquery().alias('core') 72 | 73 | # Construct the layer of the query that gathers agent info. 74 | agent_tuple = (cast(db.PAAgents.ag_num, String), 75 | db.PAAgents.db_name, 76 | db.PAAgents.db_id) 77 | at_sq = db.session.query( 78 | core_sq.c.mk_hash, 79 | core_sq.c.json, 80 | core_sq.c.raw_jsons, 81 | core_sq.c.text_refs, 82 | func.array_agg(array(agent_tuple)).label('db_refs') 83 | ).filter( 84 | db.PAAgents.stmt_mk_hash == core_sq.c.mk_hash 85 | ).group_by( 86 | core_sq.c.mk_hash, 87 | core_sq.c.json, 88 | core_sq.c.raw_jsons, 89 | core_sq.c.text_refs 90 | ).subquery().alias('agent_tuples') 91 | 92 | # Construct the layer of the query that gathers supports/supported by. 93 | sup_from = aliased(db.PASupportLinks, name='sup_from') 94 | sup_to = aliased(db.PASupportLinks, name='sup_to') 95 | q = db.session.query( 96 | at_sq.c.mk_hash, 97 | at_sq.c.json, 98 | at_sq.c.raw_jsons, 99 | at_sq.c.text_refs, 100 | at_sq.c.db_refs, 101 | func.array_agg(sup_from.supporting_mk_hash).label('supporting_hashes'), 102 | func.array_agg(sup_to.supported_mk_hash).label('supported_hashes') 103 | ).outerjoin( 104 | sup_from, 105 | sup_from.supported_mk_hash == at_sq.c.mk_hash 106 | ).outerjoin( 107 | sup_to, 108 | sup_to.supporting_mk_hash == at_sq.c.mk_hash 109 | ).group_by( 110 | at_sq.c.mk_hash, 111 | at_sq.c.json, 112 | at_sq.c.raw_jsons, 113 | at_sq.c.text_refs, 114 | at_sq.c.db_refs 115 | ) 116 | 117 | # Run and parse the query. 118 | stmt_jsons = {} 119 | stmts_by_hash = {} 120 | for h, sj, rjs, text_refs, db_refs, supping, supped in q.all(): 121 | # Gather the agent refs. 
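# db_refs comes back from the query above as an array of (ag_num, db_name,
# db_id) string triples, one per agent grounding (see `agent_tuple`); the
# loop below regroups them into {ag_num: {db_name: [db_id, ...]}}.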
122 | db_ref_dicts = defaultdict(lambda: defaultdict(list)) 123 | for ag_num, db_name, db_id in db_refs: 124 | db_ref_dicts[int(ag_num)][db_name].append(db_id) 125 | db_ref_dicts = {k: dict(v) for k, v in db_ref_dicts.items()} 126 | 127 | # Clean supping and supped. 128 | supping = [h for h in set(supping) if h is not None] 129 | supped = [h for h in set(supped) if h is not None] 130 | 131 | # Parse the JSON bytes into JSON. 132 | stmt_json = json.loads(sj) 133 | if 'supports' not in stmt_json: 134 | stmt_json['supports'] = [] 135 | if 'supported_by' not in stmt_json: 136 | stmt_json['supported_by'] = [] 137 | 138 | # Load the evidence. 139 | if rjs is not None: 140 | for rj, text_ref_values in zip(rjs, text_refs): 141 | raw_json = json.loads(rj) 142 | ev = raw_json['evidence'][0] 143 | if any(v is not None for v in text_ref_values): 144 | tr_dict = {lbl.upper(): None if val == "None" else typ(val) 145 | for lbl, typ, val 146 | in zip(text_ref_labels, text_ref_types, 147 | text_ref_values)} 148 | _fix_evidence(ev, tr_dict.pop('RID'), tr_dict.pop('TCID'), 149 | tr_dict) 150 | if 'evidence' not in stmt_json: 151 | stmt_json['evidence'] = [] 152 | stmt_json['evidence'].append(ev) 153 | 154 | # Resolve supports supported-by, as much as possible. 155 | stmts_by_hash[h] = stmt_json 156 | for supped_h in (h for h in supped if h in stmts_by_hash): 157 | stmt_json['supports'].append(stmts_by_hash[supped_h]['id']) 158 | stmts_by_hash[supped_h]['supported_by'].append(stmt_json['id']) 159 | for supping_h in (h for h in supping if h in stmts_by_hash): 160 | stmt_json['supported_by'].append(stmts_by_hash[supping_h]['id']) 161 | stmts_by_hash[supping_h]['supports'].append(stmt_json['id']) 162 | 163 | # Put it together in a dictionary. 164 | result_dict = { 165 | "mk_hash": h, 166 | "stmt": stmt_json, 167 | "db_refs": db_ref_dicts, 168 | "supports_hashes": supping, 169 | "supported_by_hashes": supped 170 | } 171 | stmt_jsons[h] = result_dict 172 | return stmt_jsons 173 | -------------------------------------------------------------------------------- /indra_db/client/principal/raw_statements.py: -------------------------------------------------------------------------------- 1 | __all__ = ['get_raw_stmt_jsons_from_agents', 'get_raw_stmt_jsons_from_papers', 2 | 'get_raw_stmt_jsons'] 3 | 4 | import json 5 | from collections import defaultdict 6 | 7 | from sqlalchemy import intersect_all 8 | 9 | from indra.util import clockit 10 | 11 | from indra_db import get_db 12 | from indra_db.util import regularize_agent_id 13 | 14 | # ==== 15 | # API 16 | # ==== 17 | 18 | 19 | @clockit 20 | def get_raw_stmt_jsons_from_papers(id_list, id_type='pmid', db=None, 21 | max_stmts=None, offset=None): 22 | """Get raw statement jsons for a given list of papers. 23 | 24 | Parameters 25 | ---------- 26 | id_list : list 27 | A list of ints or strs that are ids of papers of type `id_type`. 28 | id_type : str 29 | Default is 'pmid'. The type of ids given in id_list, e.g. 'pmid', 30 | 'pmcid', 'trid'. 31 | db : :py:class:`DatabaseManager` 32 | Optionally specify a database manager that attaches to something 33 | besides the primary database, for example a local database instance. 34 | 35 | Returns 36 | ------- 37 | result_dict : dict 38 | A dictionary keyed by id (of `id_type`) with a list of raw statement 39 | json objects as each value. Ids for which no statements are found will 40 | not be included in the dict. 41 | """ 42 | if db is None: 43 | db = get_db('primary') 44 | 45 | # Get the attribute for this id type. 
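# pmid/pmcid/doi go through the dedicated TextRef helpers (pmid_in, pmcid_in,
# doi_in) with filter_ids=True; any other id_type falls back to a plain SQL
# IN constraint on the corresponding TextRef column via _get_id_col.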
46 | if id_type == 'pmid': 47 | id_constraint = db.TextRef.pmid_in(id_list, filter_ids=True) 48 | elif id_type == 'pmcid': 49 | id_constraint = db.TextRef.pmcid_in(id_list, filter_ids=True) 50 | elif id_type == 'doi': 51 | id_constraint = db.TextRef.doi_in(id_list, filter_ids=True) 52 | else: 53 | id_constraint = _get_id_col(db.TextRef, id_type).in_(id_list) 54 | 55 | # Get the results. 56 | res = db.select_all([db.TextRef, db.RawStatements.json], id_constraint, 57 | *db.link(db.RawStatements, db.TextRef)) 58 | 59 | # Organized the results into a dict of lists keyed by id value. 60 | # Fix pmids along the way. 61 | result_dict = defaultdict(list) 62 | for tr, rjson_bytes in res: 63 | id_val = _get_id_col(tr, id_type) 64 | 65 | # Decode and unpack the json 66 | rjson = json.loads(rjson_bytes.decode('utf-8')) 67 | 68 | # Fix the pmids in this json. 69 | rjson['evidence'][0]['pmid'] = tr.pmid 70 | 71 | # Set the text_refs in this json 72 | ev = rjson['evidence'][0] 73 | if 'text_refs' not in ev.keys(): 74 | ev['text_refs'] = {} 75 | for idt in ['trid', 'pmid', 'pmcid', 'doi']: 76 | ev['text_refs'][idt.upper()] = _get_id_col(tr, idt) 77 | 78 | # Add this to the results. 79 | result_dict[id_val].append(rjson) 80 | 81 | return result_dict 82 | 83 | 84 | @clockit 85 | def get_raw_stmt_jsons_from_agents(agents=None, stmt_type=None, db=None, 86 | max_stmts=None, offset=None): 87 | """Get Raw statement jsons from a list of agent refs and Statement type.""" 88 | if db is None: 89 | db = get_db('primary') 90 | 91 | if agents is None: 92 | agents = [] 93 | 94 | # Turn the agents parameters into an intersection of queries for stmt ids. 95 | entity_queries = [] 96 | for role, ag_dbid, ns in agents: 97 | # Make the id match paradigms for the database. 98 | ag_dbid = regularize_agent_id(ag_dbid, ns) 99 | 100 | # Sanitize wildcards. 101 | for char in ['%', '_']: 102 | ag_dbid = ag_dbid.replace(char, '\%s' % char) 103 | 104 | # Generate the query 105 | q = db.session.query( 106 | db.RawAgents.stmt_id.label('stmt_id') 107 | ).filter( 108 | db.RawAgents.db_id.like(ag_dbid) 109 | ) 110 | 111 | if ns is not None: 112 | q = q.filter(db.RawAgents.db_name.like(ns)) 113 | 114 | if role is not None: 115 | q = q.filter(db.RawAgents.role == role.upper()) 116 | 117 | entity_queries.append(q) 118 | 119 | # Add a constraint for the statement type. 120 | if stmt_type is not None: 121 | q = db.session.query( 122 | db.RawStatements.id.label('stmt_id') 123 | ).filter( 124 | db.RawStatements.type == stmt_type 125 | ) 126 | entity_queries.append(q) 127 | 128 | # Generate the sub-query. 129 | ag_query_al = intersect_all(*entity_queries).alias('intersection') 130 | ag_query = db.session.query(ag_query_al).distinct().subquery('ag_stmt_ids') 131 | 132 | # Get the raw statement JSONs from the database. 133 | res = get_raw_stmt_jsons([db.RawStatements.id == ag_query.c.stmt_id], db=db, 134 | max_stmts=max_stmts, offset=offset) 135 | return res 136 | 137 | 138 | def get_raw_stmt_jsons(clauses=None, db=None, max_stmts=None, offset=None): 139 | """Get Raw Statements from the principle database, given arbitrary clauses. 
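
Parameters
----------
clauses : list or None
    SQLAlchemy filter clauses to apply to the query; if None, no extra
    constraints are added beyond the joins below.
db : :py:class:`DatabaseManager` or None
    The database to query. Defaults to the 'primary' database.
max_stmts : int or None
    If given, limit the number of statements returned.
offset : int or None
    If given, skip this many rows before returning results.

Returns
-------
raw_stmt_jsons : dict
    A dictionary of raw statement JSONs keyed by raw statement ID.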
140 | """ 141 | if db is None: 142 | db = get_db('primary') 143 | 144 | if clauses is None: 145 | clauses = [] 146 | 147 | q = db.session.query( 148 | db.RawStatements.id, 149 | db.RawStatements.json, 150 | db.Reading.id, 151 | db.TextContent.id, 152 | db.TextRef 153 | ).filter( 154 | *clauses 155 | ).outerjoin( 156 | db.Reading, 157 | db.Reading.id == db.RawStatements.reading_id 158 | ).outerjoin( 159 | db.TextContent, 160 | db.TextContent.id == db.Reading.text_content_id 161 | ).outerjoin( 162 | db.TextRef, 163 | db.TextRef.id == db.TextContent.text_ref_id 164 | ) 165 | 166 | if max_stmts is not None: 167 | q = q.limit(max_stmts) 168 | 169 | if offset is not None: 170 | q = q.offset(offset) 171 | 172 | raw_stmt_jsons = {} 173 | for sid, json_bytes, rid, tcid, tr in q.all(): 174 | raw_j = json.loads(json_bytes) 175 | if rid is not None: 176 | _fix_evidence(raw_j['evidence'][0], rid, tcid, tr.get_ref_dict()) 177 | raw_stmt_jsons[sid] = raw_j 178 | 179 | return raw_stmt_jsons 180 | 181 | 182 | # ====== 183 | # Tools 184 | # ====== 185 | 186 | 187 | def _get_id_col(tr, id_type): 188 | if id_type == 'trid': 189 | id_attr = tr.id 190 | else: 191 | try: 192 | id_attr = getattr(tr, id_type) 193 | except AttributeError: 194 | raise ValueError("Invalid id_type: %s" % id_type) 195 | return id_attr 196 | 197 | 198 | def _fix_evidence(ev, rid, tcid, tr_dict): 199 | ev['text_refs'] = tr_dict 200 | ev['text_refs']['TCID'] = tcid 201 | ev['text_refs']['READING_ID'] = rid 202 | if 'PMID' in tr_dict: 203 | ev['pmid'] = tr_dict['PMID'] 204 | return 205 | 206 | -------------------------------------------------------------------------------- /indra_db/client/readonly/__init__.py: -------------------------------------------------------------------------------- 1 | from .util import * 2 | from .query import * 3 | -------------------------------------------------------------------------------- /indra_db/client/readonly/mesh_ref_counts.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import func 2 | 3 | from indra_db import get_ro 4 | 5 | 6 | def get_mesh_ref_counts(mesh_terms, require_all=False, ro=None): 7 | """Get the number of distinct pmids by mesh term for each hash. 8 | 9 | This function directly queries a table in the readonly database that counts 10 | the number of distinct PMIDs for each mesh term/hash pair. Given a list of 11 | mesh terms, this will return a dictionary keyed by hash containing 12 | dictionaries indicating how much support the hash has from each of the given 13 | mesh IDs in terms of distinct PMIDs (thus distinct publications). 14 | 15 | Parameters 16 | ---------- 17 | mesh_terms : list 18 | A list of mesh term strings of the form "D000#####". 19 | require_all : Optional[bool] 20 | If True, require that each entry in the result includes both mesh terms. 21 | In other words, only return results where, for each hash, articles exist 22 | with support from all MeSH IDs given, not just one or the other. Default 23 | is False 24 | ro : Optional[DatabaseManager] 25 | A database manager handle. The default is the primary readonly, as 26 | indicated by environment variables or the config file. 27 | """ 28 | # Get the default readonly database, if needed.. 29 | if ro is None: 30 | ro = get_ro('primary') 31 | 32 | # Make sure the mesh IDs are of the correct kind. 
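# MeSH descriptor ("term") IDs begin with D and supplementary concept IDs
# begin with C; the prefix determines below whether MeshTermRefCounts or
# MeshConceptRefCounts is queried.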
33 | if not all(m.startswith('D') or m.startswith('C') for m in mesh_terms): 34 | raise ValueError("All mesh terms must begin with C or D.") 35 | 36 | # Convert the IDs to numbers for faster lookup. 37 | result = {} 38 | for prefix, table in [('C', ro.MeshConceptRefCounts), 39 | ('D', ro.MeshTermRefCounts)]: 40 | mesh_num_map = {int(m[1:]): m for m in mesh_terms 41 | if m.startswith(prefix)} 42 | if not mesh_num_map: 43 | continue 44 | 45 | # Build the query. 46 | nums = func.array_agg(table.mesh_num) 47 | counts = func.array_agg(table.ref_count) 48 | q = ro.session.query(table.mk_hash, nums.label('nums'), 49 | counts.label('ref_counts'), table.pmid_count) 50 | if len(mesh_num_map.keys()) == 1: 51 | q = q.filter(table.mesh_num == list(mesh_num_map.keys())[0]) 52 | elif len(mesh_num_map.keys()) > 1: 53 | q = q.filter(table.mesh_num.in_(mesh_num_map.keys())) 54 | q = q.group_by(table.mk_hash, table.pmid_count) 55 | 56 | # Apply the require all option by comparing the length of the nums array 57 | # to the number of inputs. 58 | if require_all: 59 | q = q.having(func.cardinality(nums) == len(mesh_num_map.keys())) 60 | 61 | # Parse the results. 62 | for mk_hash, nums, counts, pmid_count in q.all(): 63 | count_dict = {mesh_num_map[mesh_num]: ref_count 64 | for mesh_num, ref_count in zip(nums, counts)} 65 | if mk_hash not in result: 66 | result[mk_hash] = count_dict 67 | result[mk_hash]['total'] = pmid_count 68 | else: 69 | result[mk_hash].update(count_dict) 70 | result[mk_hash]['total'] += sum(counts) 71 | 72 | # Little sloppy, but delete any that don't meet the require_all constraint. 73 | if require_all: 74 | num_terms = len(set(mesh_terms)) 75 | for mk_hash in result.copy().keys(): 76 | if len(result[mk_hash]) != num_terms + 1: 77 | result.pop(mk_hash) 78 | return result 79 | -------------------------------------------------------------------------------- /indra_db/client/readonly/util.py: -------------------------------------------------------------------------------- 1 | __all__ = ['stmt_from_interaction'] 2 | 3 | import logging 4 | 5 | from indra.statements import get_statement_by_name, Agent, ActiveForm 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | snowflakes = ['Complex', 'Translocation', 'ActiveForm', 'Conversion', 11 | 'Autophosphorylation'] 12 | 13 | 14 | def stmt_from_interaction(interaction): 15 | """Get a shell statement from an interaction.""" 16 | StmtClass = get_statement_by_name(interaction['type']) 17 | if interaction['type'] == 'Complex': 18 | agents = [Agent(name) for name in interaction['agents'].values()] 19 | stmt = StmtClass(agents) 20 | elif interaction['type'] == 'ActiveForm': 21 | name = interaction['agents'][0] 22 | agent = Agent(name) 23 | stmt = StmtClass(agent, interaction['activity'], 24 | interaction['is_active']) 25 | else: 26 | agents = [Agent(interaction['agents'][i]) 27 | if interaction['agents'].get(i) 28 | else None 29 | for i in range(len(StmtClass._agent_order))] 30 | stmt = StmtClass(*agents) 31 | return stmt 32 | 33 | 34 | def _iter_agents(stmt_json, agent_order): 35 | for i, ag_key in enumerate(agent_order): 36 | ag = stmt_json.get(ag_key) 37 | if ag is None: 38 | continue 39 | if isinstance(ag, list): 40 | # Like a complex 41 | for ag_obj in ag: 42 | if stmt_json['type'] in snowflakes: 43 | yield None, ag_obj 44 | else: 45 | yield ['subject', 'object'][i], ag_obj 46 | else: 47 | if stmt_json['type'] in snowflakes: 48 | yield None, ag 49 | else: 50 | yield ['subject', 'object'][i], ag 51 | 
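# A minimal usage sketch for stmt_from_interaction (the interaction dict here
# is illustrative only; real ones are produced by the readonly query code):
#
#     >>> interaction = {'type': 'Phosphorylation',
#     ...                'agents': {0: 'MAP2K1', 1: 'MAPK1'}}
#     >>> stmt = stmt_from_interaction(interaction)
#     >>> type(stmt).__name__
#     'Phosphorylation'
#
# For 'Complex' every value in the agents dict becomes a member; 'ActiveForm'
# additionally expects 'activity' and 'is_active' keys on the interaction.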
-------------------------------------------------------------------------------- /indra_db/exceptions.py: -------------------------------------------------------------------------------- 1 | 2 | class IndraDbException(Exception): 3 | pass 4 | 5 | 6 | class NoAuthError(IndraDbException): 7 | def __init__(self, api_key, access): 8 | msg = "The api key %s does not grand access to %s." % (api_key, access) 9 | super(NoAuthError, self).__init__(msg) 10 | 11 | 12 | class BadHashError(IndraDbException): 13 | def __init__(self, mk_hash): 14 | self.bad_hash = mk_hash 15 | msg = 'The matches-key hash %s is not valid.' % mk_hash 16 | super(BadHashError, self).__init__(msg) 17 | -------------------------------------------------------------------------------- /indra_db/preassembly/submitter.py: -------------------------------------------------------------------------------- 1 | from indra.statements import get_all_descendants, Statement 2 | from indra_reading.batch.submitters.submitter import Submitter 3 | from indra_reading.batch.util import bucket_name 4 | 5 | DEFAULT_AVOID_STATEMENTS = ['Event', 'Influence', 'Unresolved'] 6 | VALID_STATEMENTS = [st.__name__ for st in get_all_descendants(Statement) 7 | if st.__name__ not in DEFAULT_AVOID_STATEMENTS] 8 | 9 | 10 | class PreassemblySubmitter(Submitter): 11 | job_class = 'preassembly' 12 | _purpose = 'db_preassembly' 13 | _job_queue_dict = {'run_db_reading_queue': ['create', 'update']} 14 | _job_def_dict = {'run_db_reading_jobdef': ['create', 'update']} 15 | 16 | def __init__(self, basename, task, *args, **kwargs): 17 | if task not in ['create', 'update']: 18 | raise ValueError(f"Invalid task '{task}': expected 'create' or " 19 | f"'update'.") 20 | self.task = task 21 | super(PreassemblySubmitter, self).__init__(basename, *args, **kwargs) 22 | 23 | def _iter_over_select_queues(self): 24 | for jq, tasks in self._job_queue_dict.items(): 25 | if self.task not in tasks: 26 | continue 27 | yield jq 28 | 29 | def _get_command(self, job_type_set, *args): 30 | if len(args) == 2: 31 | stmt_type, batch_size = args 32 | continuing = False 33 | else: 34 | stmt_type, batch_size, continuing = args 35 | if self.task not in job_type_set: 36 | return None, None 37 | job_name = f'{self.job_base}_{self.task}_{stmt_type}' 38 | s3_cache = f's3://{bucket_name}/{self.s3_base}/{job_name}' 39 | cmd = ['python3', '-m', 'indra_db.preassembly.preassemble_db', 40 | self.task, '-C', s3_cache, '-T', stmt_type, '-Y', 41 | '-b', str(batch_size)] 42 | if continuing: 43 | cmd += ['-c'] 44 | return job_name, cmd 45 | 46 | def _iter_job_args(self, *args): 47 | type_list = args[0] 48 | if type_list is None: 49 | type_list = VALID_STATEMENTS 50 | 51 | invalid_types = set(type_list) - set(VALID_STATEMENTS) 52 | if invalid_types: 53 | raise ValueError(f"Found invalid statement types: {invalid_types}") 54 | 55 | for stmt_type in type_list: 56 | yield (stmt_type,) + tuple(args[1:]) 57 | -------------------------------------------------------------------------------- /indra_db/reading/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gyorilab/indra_db/77785ce0d1badd271b120db747abfff4d6f35832/indra_db/reading/__init__.py -------------------------------------------------------------------------------- /indra_db/readonly_dumping/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gyorilab/indra_db/77785ce0d1badd271b120db747abfff4d6f35832/indra_db/readonly_dumping/__init__.py -------------------------------------------------------------------------------- /indra_db/readonly_dumping/export_assembly_refinement.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import time 4 | from datetime import datetime, timezone 5 | 6 | import boto3 7 | 8 | from indra_db import get_db 9 | from indra_db.readonly_dumping.export_assembly import split_tsv_gz_file, \ 10 | batch_size, count_rows_in_tsv_gz, get_refinement_graph, \ 11 | refinement_cycles_fpath, calculate_belief 12 | import multiprocessing as mp 13 | 14 | from indra_db.util import S3Path 15 | from .locations import * 16 | from indra_db.readonly_dumping.util import record_time 17 | import logging 18 | 19 | logger = logging.getLogger("indra_db.readonly_dumping.export_assembly") 20 | logger.setLevel(logging.DEBUG) 21 | logger.propagate = False 22 | 23 | file_handler = logging.FileHandler(pipeline_log_fpath.absolute().as_posix(), mode='a') 24 | file_handler.setLevel(logging.DEBUG) 25 | 26 | formatter = logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s', datefmt='%m-%d %H:%M') 27 | file_handler.setFormatter(formatter) 28 | 29 | logger.addHandler(file_handler) 30 | 31 | #put the rest of export_assembly in a seperate file to ensure memory is released in EC2 32 | if __name__ == '__main__': 33 | if not refinements_fpath.exists() or not belief_scores_pkl_fpath.exists(): 34 | db = get_db("primary") 35 | res = db.select_all(db.DBInfo) 36 | db_name_api_mapping = {r.db_name: r.source_api for r in res} 37 | 38 | time_benchmark = {} 39 | start_time = time.time() 40 | mp.set_start_method('spawn') 41 | logger.info("6. Running setup for refinement calculation") 42 | 43 | # 6. Calculate refinement graph: 44 | 45 | if not split_unique_statements_folder_fpath.exists(): 46 | logger.info("Splitting unique statements") 47 | # time: 30 min 48 | split_tsv_gz_file(unique_stmts_fpath.as_posix(), 49 | split_unique_statements_folder_fpath.as_posix(), 50 | batch_size=batch_size) 51 | logger.info( 52 | "Finished splitting unique statement" 53 | ) 54 | else: 55 | logger.info( 56 | "split_unique_statements_folder exist" 57 | ) 58 | split_unique_files = [os.path.join(split_unique_statements_folder_fpath, f) 59 | for f in 60 | os.listdir(split_unique_statements_folder_fpath) 61 | if f.endswith(".gz")] 62 | split_unique_files = sorted( 63 | split_unique_files, 64 | key=lambda x: int(re.findall(r'\d+', x)[0]) 65 | ) 66 | batch_count = len(split_unique_files) 67 | # get the n_rows in the last uncompleted batch 68 | last_count = count_rows_in_tsv_gz(split_unique_files[-1]) 69 | num_rows = (batch_count - 1) * batch_size + last_count 70 | logger.info(f"{num_rows} rows in unique statements with " 71 | f"{batch_count} batches") 72 | cycles_found = False 73 | 74 | ref_graph = get_refinement_graph(n_rows=num_rows, 75 | split_files=split_unique_files) 76 | end_time = time.time() 77 | record_time(export_benchmark.absolute().as_posix(), 78 | (end_time - start_time)/3600, 79 | 'Refinement step', 'a') 80 | 81 | # 7. Get belief scores, if there were no refinement cycles 82 | start_time = time.time() 83 | if cycles_found: 84 | logger.info( 85 | f"Refinement graph stored in variable 'ref_graph', " 86 | f"edges saved to {refinements_fpath.as_posix()}" 87 | f"and cycles saved to {refinement_cycles_fpath.as_posix()}" 88 | ) 89 | 90 | else: 91 | logger.info("7. 
Calculating belief") 92 | calculate_belief( 93 | refinements_graph=ref_graph, 94 | num_batches=batch_count, 95 | batch_size=batch_size, 96 | source_mapping=db_name_api_mapping, 97 | ) 98 | 99 | # upload source_count, belief_score 100 | # and processed_statement to S3 for cogex usage 101 | timestamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S") 102 | 103 | s3 = boto3.client("s3") 104 | base_s3_path = S3Path("bigmech", 105 | f"indra-db/dumps/cogex_files/{timestamp}") 106 | 107 | for local_file in [source_counts_fpath, processed_stmts_fpath, 108 | belief_scores_pkl_fpath]: 109 | s3_path = base_s3_path.get_element_path(local_file.name) 110 | s3_path.upload(s3, body=local_file.read_bytes()) 111 | logger.info(f"Uploaded {local_file} → {s3_path}") 112 | 113 | if refinements_fpath.exists() or refinement_cycles_fpath.exists(): 114 | for local_file in [refinements_fpath, refinement_cycles_fpath]: 115 | s3_path = base_s3_path.get_element_path(local_file.name) 116 | s3_path.upload(s3, body=local_file.read_bytes()) 117 | logger.info(f"Uploaded {local_file} → {s3_path}") 118 | 119 | end_time = time.time() 120 | record_time(export_benchmark.absolute().as_posix(), 121 | (end_time - start_time) / 3600, 122 | 'Belief score step', 'a') 123 | else: 124 | logger.info("Final output already exists, stopping script") -------------------------------------------------------------------------------- /indra_db/readonly_dumping/locations.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import pystow 3 | 4 | TEMP_DIR = pystow.module("readonly_pipeline") 5 | 6 | 7 | __all__ = [ 8 | "TEMP_DIR", 9 | "PUBMED_MESH_DIR", 10 | "pubmed_xml_gz_dir", 11 | "raw_statements_fpath", 12 | "reading_text_content_fpath", 13 | "text_refs_fpath", 14 | "drop_readings_fpath", 15 | "reading_to_text_ref_map_fpath", 16 | "processed_stmts_reading_fpath", 17 | "processed_stmts_fpath", 18 | "source_counts_reading_fpath", 19 | "source_counts_knowledgebases_fpath", 20 | "source_counts_fpath", 21 | "stmt_hash_to_raw_stmt_ids_fpath", 22 | "stmt_hash_to_raw_stmt_ids_reading_fpath", 23 | "stmt_hash_to_raw_stmt_ids_knowledgebases_fpath", 24 | "raw_id_info_map_fpath", 25 | "raw_id_info_map_reading_fpath", 26 | "raw_id_info_map_knowledgebases_fpath", 27 | "grounded_stmts_fpath", 28 | "unique_stmts_fpath", 29 | "refinements_fpath", 30 | "belief_scores_pkl_fpath", 31 | "pa_hash_act_type_ag_count_cache", 32 | "belief_scores_tsv_fpath", 33 | "reading_ref_link_tsv_fpath", 34 | "raw_stmt_source_tsv_fpath", 35 | "PUBMED_MESH_DIR", 36 | "pubmed_xml_gz_dir", 37 | "pmid_mesh_map_fpath", 38 | "pmid_mesh_mti_fpath", 39 | "pmid_stmt_hash_fpath", 40 | "pmid_mesh_concept_counts_fpath", 41 | "pmid_mesh_term_counts_fpath", 42 | "mk_hash_pmid_sets_fpath", 43 | "mesh_concept_ref_counts_fpath", 44 | "mesh_term_ref_counts_fpath", 45 | "mesh_concepts_meta_fpath", 46 | "mesh_terms_meta_fpath", 47 | "raw_stmt_mesh_concepts_fpath", 48 | "raw_stmt_mesh_terms_fpath", 49 | "pa_meta_fpath", 50 | "name_meta_tsv", 51 | "text_meta_tsv", 52 | "other_meta_tsv", 53 | "source_meta_parquet", 54 | "evidence_counts_tsv", 55 | "pa_agents_counts_tsv", 56 | 'split_raw_statements_folder_fpath', 57 | 'split_unique_statements_folder_fpath', 58 | "sql_ontology_db_fpath", 59 | "postgresql_jar", 60 | "split_pa_link_folder_fpath", 61 | "standard_readonly_snapshot", 62 | "new_readonly_snapshot", 63 | "export_benchmark", 64 | "table_benchmark", 65 | "pipeline_log_fpath", 66 | "knowledgebase_source_data_fpath" 67 | ] 68 | 
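# All paths below are resolved through the pystow module created above, so the
# concrete location depends on local pystow settings (by default roughly
# ~/.data/readonly_pipeline/). A rough sketch of how a name maps to a path:
#
#     >>> import pystow
#     >>> pystow.module("readonly_pipeline").join(name="raw_statements.tsv.gz")
#     PosixPath('.../readonly_pipeline/raw_statements.tsv.gz')  # exact root varies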
pipeline_log_fpath = TEMP_DIR.join(name="Pipeline.log") 69 | 70 | # knowledgebase source files 71 | knowledgebase_source_data_fpath = TEMP_DIR.join(name="kb_source_data") 72 | knowledgebase_version_record = TEMP_DIR.join(name='knowledgebase_version_record.tsv') 73 | 74 | # Dump files and their derivatives 75 | split_raw_statements_folder_fpath = TEMP_DIR.join(name="split_raw_statements") 76 | raw_statements_fpath = TEMP_DIR.join(name="raw_statements.tsv.gz") 77 | reading_text_content_fpath = TEMP_DIR.join(name="reading_text_content_meta.tsv.gz") 78 | text_refs_fpath = TEMP_DIR.join(name="text_refs_principal.tsv.gz") 79 | drop_readings_fpath = TEMP_DIR.join(name="drop_readings.pkl") 80 | reading_to_text_ref_map_fpath = TEMP_DIR.join(name="reading_to_text_ref_map.pkl") 81 | processed_stmts_reading_fpath = TEMP_DIR.join( 82 | name="processed_statements_reading.tsv.gz" 83 | ) 84 | processed_stmts_fpath = TEMP_DIR.join(name="processed_statements.tsv.gz") 85 | source_counts_reading_fpath = TEMP_DIR.join(name="source_counts_reading.pkl") 86 | source_counts_knowledgebases_fpath = TEMP_DIR.join( 87 | name="source_counts_knowledgebases.pkl" 88 | ) 89 | source_counts_fpath = TEMP_DIR.join(name="source_counts.pkl") 90 | stmt_hash_to_raw_stmt_ids_fpath = TEMP_DIR.join(name="stmt_hash_to_raw_stmt_ids.pkl") 91 | stmt_hash_to_raw_stmt_ids_reading_fpath = TEMP_DIR.join( 92 | name="stmt_hash_to_raw_stmt_ids_reading.pkl" 93 | ) 94 | stmt_hash_to_raw_stmt_ids_knowledgebases_fpath = TEMP_DIR.join( 95 | name="stmt_hash_to_raw_stmt_ids_knowledgebases.pkl" 96 | ) 97 | raw_id_info_map_fpath = TEMP_DIR.join(name="raw_stmt_id_to_info_map.tsv.gz") 98 | raw_id_info_map_reading_fpath = TEMP_DIR.join( 99 | name="raw_stmt_id_to_info_map_reading.tsv.gz" 100 | ) 101 | raw_id_info_map_knowledgebases_fpath = TEMP_DIR.join( 102 | name="raw_stmt_id_to_info_map_knowledgebases.tsv.gz" 103 | ) 104 | grounded_stmts_fpath = TEMP_DIR.join(name="grounded_statements.tsv.gz") 105 | unique_stmts_fpath = TEMP_DIR.join(name="unique_statements.tsv.gz") 106 | refinements_fpath = TEMP_DIR.join(name="refinements.tsv.gz") 107 | sql_ontology_db_fpath = TEMP_DIR.join(name='bio_ontology.db') 108 | split_unique_statements_folder_fpath = TEMP_DIR.join(name="split_unique_statements_folder") 109 | belief_scores_pkl_fpath = TEMP_DIR.join(name="belief_scores.pkl") 110 | pa_hash_act_type_ag_count_cache = TEMP_DIR.join( 111 | name="pa_hash_act_type_ag_count_cache.pkl" 112 | ) 113 | 114 | # Temporary tsv files used for load into readonly db 115 | belief_scores_tsv_fpath = TEMP_DIR.join(name="belief_scores.tsv") 116 | reading_ref_link_tsv_fpath = TEMP_DIR.join(name="reading_ref_link.tsv") 117 | raw_stmt_source_tsv_fpath = TEMP_DIR.join(name="raw_stmt_source.tsv") 118 | 119 | # Pubmed XML files 120 | PUBMED_MESH_DIR = TEMP_DIR.module("pubmed_mesh") 121 | pubmed_xml_gz_dir = PUBMED_MESH_DIR.join(name="pubmed_xml_gz") 122 | 123 | # stmt hash-pmid-MeSH map 124 | pmid_mesh_map_fpath = PUBMED_MESH_DIR.join(name="pmid_mesh_map.pkl") 125 | pmid_mesh_mti_fpath = PUBMED_MESH_DIR.join(name="pmid_mesh_mti.tsv") 126 | pmid_stmt_hash_fpath = PUBMED_MESH_DIR.join(name="pmid_stmt_hash.pkl") 127 | 128 | # MeshConcept/TermRefCounts 129 | pmid_mesh_concept_counts_fpath = TEMP_DIR.join(name="pmid_mesh_concept_counts.pkl") 130 | pmid_mesh_term_counts_fpath = TEMP_DIR.join(name="pmid_mesh_term_counts.pkl") 131 | mk_hash_pmid_sets_fpath = TEMP_DIR.join(name="mk_hash_pmid_sets.pkl") 132 | mesh_concept_ref_counts_fpath = TEMP_DIR.join(name="mesh_concept_ref_counts.tsv") 133 | 
mesh_term_ref_counts_fpath = TEMP_DIR.join(name="mesh_term_ref_counts.tsv") 134 | 135 | # MeshConceptMeta and MeshTermMeta 136 | mesh_concepts_meta_fpath = PUBMED_MESH_DIR.join(name="mesh_concepts_meta.tsv") 137 | mesh_terms_meta_fpath = PUBMED_MESH_DIR.join(name="mesh_terms_meta.tsv") 138 | 139 | # RawStmtMeshConcepts and RawStmtMeshTerms 140 | raw_stmt_mesh_concepts_fpath = PUBMED_MESH_DIR.join(name="raw_stmt_mesh_concepts.tsv") 141 | raw_stmt_mesh_terms_fpath = PUBMED_MESH_DIR.join(name="raw_stmt_mesh_terms.tsv") 142 | 143 | # PaMeta and derived files 144 | pa_meta_fpath = TEMP_DIR.join(name="pa_meta.tsv") 145 | name_meta_tsv = TEMP_DIR.join(name="name_meta.tsv") 146 | text_meta_tsv = TEMP_DIR.join(name="text_meta.tsv") 147 | other_meta_tsv = TEMP_DIR.join(name="other_meta.tsv") 148 | 149 | # SourceMeta 150 | source_meta_parquet = TEMP_DIR.join(name="source_meta.parquet") 151 | 152 | # EvidenceCounts 153 | evidence_counts_tsv = TEMP_DIR.join(name="evidence_counts.tsv") 154 | 155 | # PaAgentCounts 156 | pa_agents_counts_tsv = TEMP_DIR.join(name="pa_agents_counts.tsv") 157 | 158 | #table construction 159 | postgresql_jar = TEMP_DIR.join(name='postgresql-42.7.3.jar') 160 | split_pa_link_folder_fpath = TEMP_DIR.join(name='split_parquet') 161 | standard_readonly_snapshot = TEMP_DIR.join(name='schema_snapshot.sql') 162 | new_readonly_snapshot=TEMP_DIR.join(name='new_readonly_snapshot.sql') 163 | export_benchmark = TEMP_DIR.join(name='export_benchmark_times.txt') 164 | table_benchmark = TEMP_DIR.join(name='table_benchmark_times.txt') 165 | 166 | 167 | if __name__ == "__main__": 168 | # Print the requested path to stdout if there is a match 169 | import sys 170 | 171 | file_name = sys.argv[1] 172 | for file_var in __all__: 173 | if file_var.startswith(file_name): 174 | if hasattr(sys.modules[__name__], file_var): 175 | path = getattr(sys.modules[__name__], file_var) 176 | assert isinstance(path, Path) 177 | print(path.absolute().as_posix()) 178 | break 179 | else: 180 | raise ValueError(f"Could not find file {file_name}") 181 | -------------------------------------------------------------------------------- /indra_db/readonly_dumping/rds_restore.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Log start time 4 | echo "[$(date)] Starting RDS instance creation..." 5 | 6 | # Step 1: Create the RDS instance 7 | aws rds create-db-instance \ 8 | --db-instance-identifier readonly-test \ 9 | --db-instance-class db.m5.xlarge \ 10 | --engine postgres \ 11 | --allocated-storage 500 \ 12 | --master-username masteruser \ 13 | --master-user-password testpassword \ 14 | --vpc-security-group-ids sg-0c49d0d42c8ae49c1 \ 15 | --availability-zone us-east-1a \ 16 | --backup-retention-period 7 \ 17 | --db-name postgres \ 18 | --publicly-accessible 19 | 20 | # Log progress 21 | echo "[$(date)] RDS instance creation initiated. Waiting for it to be available..." 22 | 23 | # Step 2: Wait for the RDS instance to become available 24 | aws rds wait db-instance-available --db-instance-identifier readonly-test 25 | echo "[$(date)] RDS instance is now available." 26 | 27 | # Step 3: Get the RDS endpoint 28 | RDS_ENDPOINT=$(aws rds describe-db-instances \ 29 | --db-instance-identifier readonly-test \ 30 | --query "DBInstances[0].Endpoint.Address" \ 31 | --output text) 32 | 33 | if [[ -z "$RDS_ENDPOINT" ]]; then 34 | echo "[$(date)] Failed to retrieve RDS endpoint." 
35 | exit 1 36 | fi 37 | 38 | echo "[$(date)] RDS Endpoint: $RDS_ENDPOINT" 39 | 40 | # Step 4: Connect to the RDS instance and create a database 41 | 42 | echo "[$(date)] Connecting to RDS to create the database..." 43 | PGPASSWORD=testpassword psql -h $RDS_ENDPOINT -U masteruser -d postgres -p 5432 -c "DROP DATABASE IF EXISTS indradb_readonly_test;" 44 | PGPASSWORD=testpassword psql -h $RDS_ENDPOINT -U masteruser -d postgres -p 5432 -c "CREATE DATABASE indradb_readonly_test;" 45 | 46 | if [[ $? -ne 0 ]]; then 47 | echo "[$(date)] Failed to create the database." 48 | exit 1 49 | fi 50 | 51 | echo "[$(date)] Database 'indradb_readonly_test' created successfully." 52 | 53 | # Step 5: Restore the dump file from S3 into the new database 54 | echo "[$(date)] Restoring dump file into the database..." 55 | 56 | aws s3 cp s3://bigmech/indra-db/dumps/indradb_readonly_local_test.dump - | \ 57 | PGPASSWORD=testpassword psql -h $RDS_ENDPOINT -U masteruser -d indradb_readonly_test 58 | 59 | if [[ $? -ne 0 ]]; then 60 | echo "[$(date)] Failed to restore the dump file." 61 | exit 1 62 | fi 63 | 64 | echo "[$(date)] Dump file restored successfully into 'indradb_readonly_test'." -------------------------------------------------------------------------------- /indra_db/readonly_dumping/readonly_dumping_bash.sh: -------------------------------------------------------------------------------- 1 | # shellcheck disable=SC1090 2 | # SETUP 3 | set -e 4 | # Get password to the principal database from the user 5 | echo "Enter password for the principal database:" 6 | read -s PGPASSWORD # -s flag hides the password 7 | # Use the PGPASSWORD environment variable to set the password, see: 8 | # https://www.postgresql.org/docs/13/libpq-envars.html 9 | export PGPASSWORD 10 | 11 | # If the password is empty, exit 12 | if [ -z "$PGPASSWORD" ] 13 | then 14 | echo "Password is empty. Exiting." 
15 | exit 1 16 | fi 17 | 18 | 19 | 20 | #Set the user for the local db 21 | LOCAL_RO_USER="postgres" 22 | export LOCAL_RO_USER 23 | 24 | # Set the password for the local db 25 | echo "Provide password for the local database:" 26 | read -s LOCAL_RO_PASSWORD 27 | export LOCAL_RO_PASSWORD 28 | 29 | # Set the name of the local db 30 | LOCAL_RO_DB_NAME="indradb_readonly_local_test" 31 | export LOCAL_RO_DB_NAME 32 | echo "Local db name: $LOCAL_RO_DB_NAME" 33 | 34 | # Get the current date and time 35 | START_DATE_TIME=`date '+%Y-%m-%d %H:%M:%S'` 36 | START_DATE=`date '+%Y-%m-%d'` 37 | echo "{\"datetime\": \"$START_DATE_TIME\", \"date_stamp\": \"$START_DATE\"}" > start.json 38 | S3_PATH="s3://bigmech/indra-db/dumps/$START_DATE" 39 | aws s3 cp start.json "$S3_PATH/start.json" 40 | echo "Start date marked as: $START_DATE" 41 | # INITIAL DUMPING 42 | 43 | # Get file paths for initial dump files 44 | RAW_STMTS_FPATH=`python3 -m indra_db.readonly_dumping.locations raw_statements` 45 | export RAW_STMTS_FPATH 46 | READING_TEXT_CONTENT_META_FPATH=`python3 -m indra_db.readonly_dumping.locations reading_text_content` 47 | export READING_TEXT_CONTENT_META_FPATH 48 | TEXT_REFS_PRINCIPAL_FPATH=`python3 -m indra_db.readonly_dumping.locations text_refs` 49 | export TEXT_REFS_PRINCIPAL_FPATH 50 | 51 | 52 | if [ -z "$RAW_STMTS_FPATH" ] || [ -z "$READING_TEXT_CONTENT_META_FPATH" ] || [ -z "$TEXT_REFS_PRINCIPAL_FPATH" ] 53 | then 54 | if [ -z "$RAW_STMTS_FPATH" ] 55 | then 56 | echo "Raw statements file path is empty" 57 | fi 58 | if [ -z "$READING_TEXT_CONTENT_META_FPATH" ] 59 | then 60 | echo "Reading text content meta file path is empty" 61 | fi 62 | if [ -z "$TEXT_REFS_PRINCIPAL_FPATH" ] 63 | then 64 | echo "Text refs principal file path is empty" 65 | fi 66 | exit 1 67 | else 68 | echo "Raw statements file path: $RAW_STMTS_FPATH" 69 | echo "Reading text content meta file path: $READING_TEXT_CONTENT_META_FPATH" 70 | echo "Text refs principal file path: $TEXT_REFS_PRINCIPAL_FPATH" 71 | fi 72 | 73 | # Exit if any of the file names are empty 74 | if [ ! -f "$RAW_STMTS_FPATH" ] 75 | then 76 | echo "Dumping raw statements" 77 | start=$(date +%s) 78 | psql -d indradb_test \ 79 | -h indradb-refresh.cvyak4iikv71.us-east-1.rds.amazonaws.com \ 80 | -U tester \ 81 | -w \ 82 | -c "COPY (SELECT id, db_info_id, reading_id, 83 | convert_from (json::bytea, 'utf-8') 84 | FROM public.raw_statements) 85 | TO STDOUT" \ 86 | | gzip > "$RAW_STMTS_FPATH" 87 | end=$(date +%s) 88 | runtime=$((end-start)) 89 | echo "Dumped raw statements in $runtime seconds" 90 | else 91 | echo "Raw statements file already exists, skipping dump" 92 | fi 93 | 94 | if [ ! -f "$READING_TEXT_CONTENT_META_FPATH" ] 95 | then 96 | echo "Dumping reading text content meta" 97 | start=$(date +%s) 98 | psql -d indradb_test \ 99 | -h indradb-refresh.cvyak4iikv71.us-east-1.rds.amazonaws.com \ 100 | -U tester \ 101 | -w \ 102 | -c "COPY (SELECT rd.id, rd.reader_version, tc.id, tc.text_ref_id, 103 | tc.source, tc.text_type 104 | FROM public.text_content as tc, public.reading as rd 105 | WHERE tc.id = rd.text_content_id) 106 | TO STDOUT" \ 107 | | gzip > "$READING_TEXT_CONTENT_META_FPATH" 108 | end=$(date +%s) 109 | runtime=$((end-start)) 110 | echo "Dumped reading text content meta in $runtime seconds" 111 | else 112 | echo "Reading text content meta file already exists, skipping dump" 113 | fi 114 | 115 | if [ ! 
-f "$TEXT_REFS_PRINCIPAL_FPATH" ] 116 | then 117 | echo "Dumping text refs principal" 118 | start=$(date +%s) 119 | psql -d indradb_test \ 120 | -h indradb-refresh.cvyak4iikv71.us-east-1.rds.amazonaws.com \ 121 | -U tester \ 122 | -w \ 123 | -c "COPY (SELECT id, pmid, pmcid, doi, pii, url, manuscript_id 124 | FROM public.text_ref) 125 | TO STDOUT" \ 126 | | gzip > "$TEXT_REFS_PRINCIPAL_FPATH" 127 | end=$(date +%s) 128 | runtime=$((end-start)) 129 | echo "Dumped text refs in $runtime seconds" 130 | else 131 | echo "Text refs principal file already exists, skipping dump" 132 | fi 133 | # LOCAL DB CREATION AND DUMPING 134 | 135 | python -m indra_db.readonly_dumping.export_assembly 136 | python -m indra_db.readonly_dumping.export_assembly_refinement 137 | 138 | # Create db; 139 | PGPASSWORD=$LOCAL_RO_PASSWORD 140 | export PGPASSWORD 141 | 142 | psql -h localhost -U postgres -c "DROP DATABASE IF EXISTS $LOCAL_RO_DB_NAME" 143 | psql -h localhost -U postgres -c "CREATE DATABASE $LOCAL_RO_DB_NAME" 144 | ## Run import script 145 | python3 -m indra_db.readonly_dumping.readonly_dumping \ 146 | --db-name $LOCAL_RO_DB_NAME \ 147 | --user $LOCAL_RO_USER \ 148 | --password "$LOCAL_RO_PASSWORD" 149 | # --force # Use if you want to overwrite an existing db, if it exists 150 | 151 | # Dump the db, once done importing 152 | pg_dump -h localhost \ 153 | -U postgres \ 154 | -w \ 155 | -f "${LOCAL_RO_DB_NAME}.dump" $LOCAL_RO_DB_NAME 156 | 157 | ## copy to s3 158 | aws s3 cp "${LOCAL_RO_DB_NAME}.dump" "s3://bigmech/indra-db/dumps/" 159 | 160 | # Remove dump file only after it has been copied to s3 successfully 161 | #rm "${LOCAL_RO_DB_NAME}.dump" 162 | 163 | # Upload an end date file to S3 164 | # This is used to keep track of the end date of the dump 165 | # The file is uploaded to the indra-db/dumps/ directory 166 | # The file name is the current date and time 167 | 168 | # Get the current date and time 169 | END_DATE_TIME=`date '+%Y-%m-%d %H:%M:%S'` 170 | END_DATE=`date '+%Y-%m-%d'` 171 | echo "{\"datetime\": \"$END_DATE_TIME\", \"date_stamp\": \"$END_DATE\"}" > end.json 172 | aws s3 cp end.json "$S3_PATH/end.json" 173 | 174 | # At this point, if a new readonly instance is already created, we could run 175 | # the following command to update the instance (assuming the password is set 176 | # in PGPASSWORD, which will be read if -w is set): 177 | # pg_restore -h .us-east-1.rds.amazonaws.com \ 178 | # -U \ 179 | # -f \ 180 | # -w \ 181 | # -d indradb_readonly \ 182 | # --no-owner -------------------------------------------------------------------------------- /indra_db/resources/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gyorilab/indra_db/77785ce0d1badd271b120db747abfff4d6f35832/indra_db/resources/__init__.py -------------------------------------------------------------------------------- /indra_db/resources/default_db_config.ini: -------------------------------------------------------------------------------- 1 | # Here, you may enter addresses to INDRA database instances, headed by section 2 | # titles []. This name is used to refer to access the database within 3 | # the code. Note that databases may also be defined in the environment using 4 | # the format defined in `indra_db.config.DB_STR_FMT`, with a name starting with 5 | # INDRADB 6 | 7 | # Test Databases: 8 | # ---------------- 9 | # Any name with 'test' in it (ex: 'test', 'test1', 'local_test', etc.) may be 10 | # used in testing. 
Each test database will be tried in order, from top to 11 | # bottom, and the first that can successfully establish a session will be used. 12 | # 13 | # You should also make the names sortable by preference, with "earlier" names 14 | # preferred to later names. 15 | 16 | [test] 17 | role = principal 18 | dialect = postgresql 19 | driver = 20 | username = 21 | password = 22 | host = 23 | port = 24 | name = indradb_test 25 | 26 | [readonly-test] 27 | role = readonly 28 | dialect = postgresql 29 | driver = 30 | username = 31 | password = 32 | host = 33 | port = 34 | name = indradb_readonly_test 35 | 36 | # The Primary Databases: 37 | # --------------------- 38 | # When using the low-level database access classes, it is assumed that there is 39 | # a 'primary' database (eg. [primary]). 40 | 41 | 42 | 43 | # AWS S3 dump site: 44 | # ----------------- 45 | [aws-s3_dump] 46 | bucket = 47 | prefix = 48 | 49 | 50 | # AWS Lambda Config: 51 | # ------------------ 52 | [aws-lambda] 53 | role = 54 | function = 55 | 56 | 57 | # AWS RDS Config: 58 | # --------------- 59 | [aws-rds-settings] 60 | master_user = 61 | security_group = 62 | availability_zone = 63 | 64 | [general] 65 | testing = false -------------------------------------------------------------------------------- /indra_db/schemas/__init__.py: -------------------------------------------------------------------------------- 1 | from .principal_schema import * 2 | from .readonly_schema import * 3 | -------------------------------------------------------------------------------- /indra_db/schemas/indexes.py: -------------------------------------------------------------------------------- 1 | __all__ = ['BtreeIndex', 'StringIndex'] 2 | 3 | 4 | class BtreeIndex(object): 5 | def __init__(self, name, colname, opts=None, cluster=False): 6 | self.name = name 7 | self.colname = colname 8 | contents = colname 9 | if opts is not None: 10 | contents += ' ' + opts 11 | self.definition = ('btree (%s)' % contents) 12 | self.cluster = cluster 13 | 14 | 15 | class StringIndex(BtreeIndex): 16 | def __init__(self, name, colname): 17 | opts = 'COLLATE "en_US.utf8" varchar_ops ASC NULLS LAST' 18 | super().__init__(name, colname, opts) 19 | 20 | -------------------------------------------------------------------------------- /indra_db/tests/README.md: -------------------------------------------------------------------------------- 1 | # Testing the INDRA Database 2 | 3 | In `indra_db`, we use the `nosetests` framework to run tests. Tests are 4 | automatically detected in the usual ways, such as by the prefix `test_` on 5 | files and functions. 6 | 7 | ## Setting up the Test Database 8 | Most tests require access to a test database, which is, and should remain, 9 | separate from the database generally used. This repository requires a database 10 | of at least postgers version 9.6, which for most systems will require some 11 | extra work, as 9.6 is not (or at least was not for me) natively available 12 | through `apt-get`. 
13 | 14 | To get access to the latest versions of postgres, you must first execute the 15 | following (a la [this site](https://r00t4bl3.com/post/how-to-install-postgresql-9-6-on-linux-mint-18-1-serena)): 16 | ```bash 17 | sudo sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt/ xenial-pgdg main" > /etc/apt/sources.list.d/pgdg.list' 18 | wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add - 19 | sudo apt-get update 20 | ``` 21 | And optionally 22 | ```bash 23 | sudo apt-get upgrade 24 | ``` 25 | You should now see there are several versions of postgres available for 26 | installation. You should be able to install any version >= 9.6, but for the 27 | sake of simplicity, I will from here assume 9.6 is being installed. 28 | ```bash 29 | sudo apt-get install postgresql-9.6 postgresql-common 30 | ``` 31 | 32 | Also, note that this is all much more complicated if you have or have ever had 33 | a different version of postgres installed. One way to check this is to inspect 34 | the `/etc/postgresql` directory for other versions. This will indicate current 35 | active versions, but also versions that were uninstalled without `--purge`, 36 | which could still interfere with the running database. 37 | 38 | You can also run the `pg_lsclusters` command to see what clusters are currently 39 | running. You should see only one, with the correct version, running on port 40 | 5432, like so: 41 | ``` 42 | Ver Cluster Port Status Owner Data directory Log file 43 | 9.6 main 5432 online postgres /var/lib/postgresql/9.6/main /var/log/postgresql/postgresql-9.6-main.log 44 | ``` 45 | 46 | Lastly, you should check and make sure that when you run `la /var/run/postgresql/` 47 | (note the `la` for list all, not `ls`) you see the following: 48 | ``` 49 | 9.6-main.pg_stat_tmp 9.6-main.pid .s.PGSQL.5432 .s.PGSQL.5432.lock 50 | ``` 51 | If you don't see this, you may need to reboot or take other actions. 52 | 53 | Once all the above is confirmed, you will need to make access to the database 54 | more permissive. *You should **not** do this when the database could be 55 | exposed to the outside or multiple users may be using the same machine*. 56 | 57 | Edit the host-based authentication (HBA) config file: `pg_hba.conf`, which 58 | will likely require `sudo`. For me, this file is located at 59 | `/etc/postgresql/9.6/main/pg_hba.conf`. For the sake of this test setup you 60 | should go to the bottom where you see several lines of the form: 61 | ``` 62 | # TYPE DATABASE USER ADDRESS METHOD 63 | local all postgres peer 64 | ``` 65 | **Change `peer` or `md5` in the `METHOD` column to `trust`**, then save the file. 66 | For the changes to take effect, first attempt to run: 67 | ``` 68 | sudo service postgresql restart 69 | ``` 70 | and check to see if you can enter a postgres session by running 71 | `psql -U postgres`. If this fails to work, you will need to reboot your 72 | computer for the changes to take effect. 73 | 74 | Once that is done, you can create the test database that INDRA DB uses: 75 | `indradb_test` by entering the following command: 76 | ```bash 77 | sudo -u postgres createdb indradb_test 78 | ``` 79 | You should not be prompted to enter a password. If so, revisit the changes made 80 | to the `pg_hba.conf` file, and again make sure you rebooted after making the 81 | changes.
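For reference, after this edit the relevant portion of `pg_hba.conf` might look like the following minimal sketch (your file will almost certainly contain additional entries, and the exact lines vary by system):
```
# TYPE  DATABASE        USER            ADDRESS                 METHOD
local   all             postgres                                trust
local   all             all                                     trust
host    all             all             127.0.0.1/32            trust
```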
You can then test that the database works as expected by entering 82 | ```bash 83 | psql -U postgres 84 | ``` 85 | At which point you should see a prompt like this: 86 | ``` 87 | psql (10.9 (Ubuntu 10.9-1.pgdg16.04+1), server 9.6.14) 88 | Type "help" for help. 89 | 90 | postgres=# 91 | 92 | ``` 93 | Enter `\q` to exit the prompt, and you should be all set to run the tests. 94 | 95 | 96 | You should also create a test readonly database: 97 | ```bash 98 | sudo -u postgres createdb indradb_ro_test 99 | ``` 100 | 101 | ## Other Test Resources 102 | 103 | To test preassembly, you will also need a test ontology (called `test_ontology.pkl`) 104 | in a directory called `test_resources` within `indra_db/tests`. 105 | -------------------------------------------------------------------------------- /indra_db/tests/test_belief.py: -------------------------------------------------------------------------------- 1 | from nose.plugins.attrib import attr 2 | 3 | from indra.belief import BeliefEngine 4 | from indra_db.belief import MockStatement, MockEvidence, populate_support, \ 5 | load_mock_statements, calculate_belief 6 | from indra_db.tests.util import get_prepped_db 7 | 8 | 9 | def test_belief_calc_up_to_prior(): 10 | be = BeliefEngine() 11 | test_stmts = [ 12 | MockStatement(1, [MockEvidence('sparser'), MockEvidence('reach')]), 13 | MockStatement(2, MockEvidence('biopax')), 14 | MockStatement(3, MockEvidence('signor')), 15 | MockStatement(4, MockEvidence('biogrid')), 16 | MockStatement(5, MockEvidence('bel')), 17 | MockStatement(6, [MockEvidence('phosphosite'), MockEvidence('trips')]), 18 | ] 19 | be.set_prior_probs(test_stmts) 20 | results = {s.matches_key(): s.belief for s in test_stmts} 21 | print(results) 22 | assert len(results) == len(test_stmts), (len(results), len(test_stmts)) 23 | assert all([0 < b < 1 for b in results.values()]), 'Beliefs out of range.' 24 | 25 | 26 | def test_belief_calc_up_to_hierarchy(): 27 | be = BeliefEngine() 28 | test_stmts = [ 29 | MockStatement(1, [MockEvidence('sparser'), MockEvidence('reach')]), 30 | MockStatement(2, MockEvidence('biopax')), 31 | MockStatement(3, MockEvidence('signor')), 32 | MockStatement(4, MockEvidence('biogrid')), 33 | MockStatement(5, MockEvidence('bel')), 34 | MockStatement(6, [MockEvidence('phosphosite'), MockEvidence('trips')]), 35 | ] 36 | be.set_prior_probs(test_stmts) 37 | init_results = {s.matches_key(): s.belief for s in test_stmts} 38 | print(init_results) 39 | supp_links = [(1,2), (1,3), (2,3), (1,5), (4,3)] 40 | populate_support(test_stmts, supp_links) 41 | be.set_hierarchy_probs(test_stmts) 42 | results = {s.matches_key(): s.belief for s in test_stmts} 43 | print(results) 44 | 45 | # Test a couple very simple properties. 46 | assert len(results) == len(test_stmts), (len(results), len(test_stmts)) 47 | assert all([0 < b < 1 for b in results.values()]), 'Beliefs out of range.' 48 | 49 | # Test the change from the initial. 50 | all_deltas_correct = True 51 | deltas_dict = {} 52 | for s in test_stmts: 53 | h = s.matches_key() 54 | b = s.belief 55 | 56 | # Get results 57 | res = {'actual': b - init_results[h]} 58 | 59 | # Define expectations. 
60 | if s.supports: 61 | res['expected'] = 'increase' 62 | if res['actual'] <= 0: 63 | all_deltas_correct = False 64 | else: 65 | res['expected'] = 'no change' 66 | if res['actual'] != 0: 67 | all_deltas_correct = False 68 | 69 | deltas_dict[h] = res 70 | assert all_deltas_correct, deltas_dict 71 | 72 | 73 | @attr('nonpublic') 74 | def test_mock_stmt_load_and_belief_calc(): 75 | db = get_prepped_db(1000, with_pa=True) 76 | stmts = load_mock_statements(db) 77 | assert 500 <= len(stmts) <= 1000, len(stmts) 78 | assert all([len(s.evidence) >= 1 for s in stmts]) 79 | sid_list = [ev.annotations['raw_sid'] for s in stmts for ev in s.evidence] 80 | sid_set = set(sid_list) 81 | assert len(sid_list) == len(sid_set), (len(sid_list), len(sid_set)) 82 | assert len([sup for s in stmts for sup in s.supports]) \ 83 | == db.count(db.PASupportLinks), "Support is missing." 84 | belief_dict = calculate_belief(stmts) 85 | assert len(belief_dict) == len(stmts), (len(belief_dict), len(stmts)) 86 | assert all([0 < b < 1 for b in belief_dict.values()]),\ 87 | 'Belief values out of range.' 88 | -------------------------------------------------------------------------------- /indra_db/tests/test_config.py: -------------------------------------------------------------------------------- 1 | from indra_db.config import build_db_url 2 | 3 | 4 | def test_build_db_url(): 5 | """Test the build of a database URL from typical inputs.""" 6 | res_url = build_db_url(host="host", password="password", dialect="postgres", 7 | username="user", port=10, name="db") 8 | assert res_url == "postgres://user:password@host:10/db", res_url 9 | -------------------------------------------------------------------------------- /indra_db/tests/test_copy.py: -------------------------------------------------------------------------------- 1 | from indra_db.tests.util import get_temp_db 2 | 3 | 4 | COLS = ('pmid', 'pmcid') 5 | 6 | 7 | def _ref_set(db): 8 | return set(db.select_all([db.TextRef.pmid, db.TextRef.pmcid])) 9 | 10 | 11 | def _assert_set_equal(s1, s2): 12 | assert s1 == s2, '%s != %s' % (s1, s2) 13 | 14 | 15 | def test_vanilla_copy(): 16 | db = get_temp_db(True) 17 | inps = {('a', '1'), ('b', '1')} 18 | db.copy('text_ref', inps, COLS) 19 | assert inps == _ref_set(db) 20 | 21 | try: 22 | db.copy('text_ref', inps, COLS) 23 | except: 24 | return 25 | assert False, "Copy of duplicate data succeeded." 
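# A note on the copy variants exercised by the tests below: `copy` is a plain
# insert that fails on rows duplicating existing entries (as checked above);
# `copy_lazy` skips conflicting rows, while `copy_push` overwrites them; the
# `*_report_*` variants additionally return the rows that were skipped or
# updated, and `copy_detailed_report_lazy` also reports the IDs of existing
# and newly inserted rows.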
26 | 27 | 28 | def _do_init_copy(db): 29 | inps_1 = {('a', '1'), ('b', '2')} 30 | db.copy('text_ref', inps_1, COLS) 31 | _assert_set_equal(inps_1, _ref_set(db)) 32 | return inps_1 33 | 34 | 35 | def test_lazy_copy(): 36 | db = get_temp_db(True) 37 | inps_1 = _do_init_copy(db) 38 | inps_2 = {('b', '2'), ('c', '1'), ('d', '3')} 39 | db.copy_lazy('text_ref', inps_2, COLS) 40 | _assert_set_equal(inps_1 | inps_2, _ref_set(db)) 41 | 42 | 43 | def test_lazy_report_copy(): 44 | db = get_temp_db(True) 45 | inps_1 = _do_init_copy(db) 46 | inps_2 = {('b', '2'), ('c', '1'), ('d', '3')} 47 | 48 | left_out = db.copy_report_lazy('text_ref', inps_2, COLS) 49 | _assert_set_equal(inps_1 | inps_2, _ref_set(db)) 50 | _assert_set_equal(inps_1 & inps_2, {t[:2] for t in left_out}) 51 | 52 | 53 | def test_push_copy(): 54 | db = get_temp_db(True) 55 | inps_1 = _do_init_copy(db) 56 | inps_2 = {('b', '2'), ('c', '1'), ('d', '3')} 57 | 58 | original_date = db.select_one(db.TextRef.create_date, 59 | db.TextRef.pmid == 'b') 60 | 61 | db.copy_push('text_ref', inps_2, COLS) 62 | _assert_set_equal(inps_1 | inps_2, _ref_set(db)) 63 | new_date = db.select_one(db.TextRef.create_date, 64 | db.TextRef.pmid == 'b') 65 | assert new_date != original_date, "PMID b was not updated." 66 | 67 | 68 | def test_push_report_copy(): 69 | db = get_temp_db(True) 70 | inps_1 = _do_init_copy(db) 71 | inps_2 = {('b', '2'), ('c', '1'), ('d', '3')} 72 | 73 | original_date = db.select_one(db.TextRef.create_date, 74 | db.TextRef.pmid == 'b') 75 | 76 | updated = db.copy_report_push('text_ref', inps_2, COLS) 77 | _assert_set_equal(inps_1 | inps_2, _ref_set(db)) 78 | _assert_set_equal(inps_1 & inps_2, {t[:2] for t in updated}) 79 | new_date = db.select_one(db.TextRef.create_date, 80 | db.TextRef.pmid == 'b') 81 | assert new_date != original_date, 'PMID b was not updated.' 
82 | 83 | 84 | def test_detailed_copy_report(): 85 | db = get_temp_db(True) 86 | inps_1 = _do_init_copy(db) 87 | inps_2 = {('b', '2'), ('c', '1'), ('d', '3')} 88 | 89 | exiting_ids = {trid for trid, in db.select_all(db.TextRef.id)} 90 | 91 | existing_ids, new_ids, skipped_rows = \ 92 | db.copy_detailed_report_lazy('text_ref', inps_2, COLS) 93 | _assert_set_equal(inps_1 | inps_2, _ref_set(db)) 94 | _assert_set_equal(inps_1 & inps_2, {t[:2] for t in skipped_rows}) 95 | assert {trid for trid, in new_ids} != exiting_ids 96 | 97 | 98 | def test_detailed_copy_report_pmid_and_id(): 99 | db = get_temp_db(True) 100 | inps_1 = _do_init_copy(db) 101 | inps_2 = {('b', '2'), ('c', '1'), ('d', '3')} 102 | 103 | existing_id_dict = {pmid: trid for trid, pmid 104 | in db.select_all([db.TextRef.id, db.TextRef.pmid])} 105 | 106 | existing_ids, new_ids, skipped_rows = \ 107 | db.copy_detailed_report_lazy('text_ref', inps_2, COLS, 108 | ('pmid', 'pmcid', 'id')) 109 | new_id_dict = {pmid: trid for pmid, trid in new_ids} 110 | returned_existing_id_dict = {pmid: trid for pmid, _, trid, in existing_ids} 111 | assert returned_existing_id_dict == {'b': 1} 112 | _assert_set_equal(inps_1 | inps_2, _ref_set(db)) 113 | _assert_set_equal(inps_1 & inps_2, {t[:2] for t in skipped_rows}) 114 | assert set(existing_id_dict.keys()) != set(new_id_dict.keys()) 115 | 116 | 117 | def test_detailed_copy_report_repeated_pmid_no_conflict(): 118 | db = get_temp_db(True) 119 | 120 | inps_1 = {('1', 'PMC1', '10.1/a'), ('2', 'PMC2', '10.2/b')} 121 | inps_2 = {('1', 'PMC3', '10.3/c')} 122 | 123 | cols = ('pmid', 'pmcid', 'doi') 124 | db.copy('text_ref', inps_1, cols) 125 | 126 | existing_ids, new_ids, skipped_rows = \ 127 | db.copy_detailed_report_lazy('text_ref', inps_2, cols, ('pmid', 'id')) 128 | assert not existing_ids 129 | assert not skipped_rows 130 | assert len(new_ids) == 1 131 | 132 | 133 | def test_detailed_copy_report_repeated_pmid_with_conflict(): 134 | db = get_temp_db(True) 135 | 136 | inps_1 = {('1', 'PMC1', '10.1/a'), ('2', 'PMC2', '10.2/b')} 137 | inps_2 = {('1', 'PMC3', '10.1/a')} 138 | 139 | cols = ('pmid', 'pmcid', 'doi') 140 | db.copy('text_ref', inps_1, cols) 141 | 142 | existing_ids, new_ids, skipped_rows = \ 143 | db.copy_detailed_report_lazy('text_ref', inps_2, cols, ('pmid', 'id')) 144 | assert existing_ids == [('1', 1)] 145 | assert len(skipped_rows) == 1 146 | assert not new_ids 147 | -------------------------------------------------------------------------------- /indra_db/tests/test_kbs.py: -------------------------------------------------------------------------------- 1 | from nose.plugins.attrib import attr 2 | 3 | from indra.statements.statements import Agent, Phosphorylation, Complex, \ 4 | Evidence 5 | 6 | from indra_db.managers.knowledgebase_manager import * 7 | from indra_db.util import insert_db_stmts 8 | from indra_db.tests.util import get_temp_db 9 | 10 | 11 | def _check_kbm(Kb, *args, **kwargs): 12 | db = get_temp_db(clear=True) 13 | dbid = db.select_one(db.DBInfo.id, db.DBInfo.db_name == Kb.name) 14 | assert dbid is None 15 | kbm = Kb(*args, **kwargs) 16 | kbm.upload(db) 17 | dbid = db.select_one(db.DBInfo.id, db.DBInfo.db_name == Kb.name)[0] 18 | assert dbid is not None 19 | db_stmts = db.select_all(db.RawStatements) 20 | print(len(db_stmts)) 21 | assert len(db_stmts) 22 | assert all(s.db_info_id == dbid for s in db_stmts) 23 | db.session.close() 24 | 25 | 26 | @attr("nonpublic") 27 | def test_tas(): 28 | _check_kbm(TasManager) 29 | 30 | 31 | @attr('nonpublic') 32 | def test_cbn(): 33 | s3_url 
= 'https://s3.amazonaws.com/bigmech/travis/Hox-2.0-Hs.jgf.zip' 34 | _check_kbm(CBNManager, archive_url=s3_url) 35 | 36 | 37 | @attr('nonpublic', 'slow') 38 | def test_hprd(): 39 | _check_kbm(HPRDManager) 40 | 41 | 42 | @attr('nonpublic') 43 | def test_signor(): 44 | _check_kbm(SignorManager) 45 | 46 | 47 | @attr('nonpublic', 'slow') 48 | def test_biogrid(): 49 | _check_kbm(BiogridManager) 50 | 51 | 52 | @attr('nonpublic', 'slow') 53 | def test_bel_lc(): 54 | _check_kbm(BelLcManager) 55 | 56 | 57 | @attr('nonpublic', 'slow') 58 | def test_pathway_commons(): 59 | _check_kbm(PathwayCommonsManager) 60 | 61 | 62 | @attr('nonpublic', 'slow') 63 | def test_rlimsp(): 64 | _check_kbm(RlimspManager) 65 | 66 | 67 | @attr('nonpublic') 68 | def test_trrust(): 69 | _check_kbm(TrrustManager) 70 | 71 | 72 | @attr('nonpublic', 'slow') 73 | def test_phosphosite(): 74 | _check_kbm(PhosphositeManager) 75 | 76 | 77 | @attr('nonpublic') 78 | def test_simple_db_insert(): 79 | db = get_temp_db() 80 | db._clear(force=True) 81 | stmts = [Phosphorylation(Agent('MEK', db_refs={'FPLX': 'MEK'}), 82 | Agent('ERK', db_refs={'FPLX': 'ERK'}), 83 | evidence=Evidence(source_api='test')), 84 | Complex([Agent(n, db_refs={'FPLX': n}) for n in ('MEK', 'ERK')], 85 | evidence=Evidence(source_api='test'))] 86 | dbid = db.insert(db.DBInfo, db_name='test', source_api='tester') 87 | insert_db_stmts(db, stmts, dbid) 88 | db_stmts = db.select_all(db.RawStatements) 89 | db_agents = db.select_all(db.RawAgents) 90 | assert len(db_stmts) == 2, len(db_stmts) 91 | assert len(db_agents) == 8, len(db_agents) 92 | db.session.close() 93 | -------------------------------------------------------------------------------- /indra_db/tests/test_principal_client.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from indra_db.client.principal import * 4 | from indra.statements import Agent, Phosphorylation, Complex, Activation 5 | 6 | from indra_db.tests.util import get_temp_db 7 | from indra_db.tests.db_building_util import DbBuilder 8 | 9 | 10 | def _construct_database(): 11 | db = get_temp_db(clear=True) 12 | db_builder = DbBuilder(db) 13 | db_builder.add_text_refs([ 14 | ('12345', 'PMC54321'), 15 | ('24680', 'PMC08642') 16 | ]) 17 | db_builder.add_text_content([ 18 | ['pubmed-abs', 'pmc_oa'], 19 | ['pubmed-abs'] 20 | ]) 21 | db_builder.add_readings([ 22 | ['REACH'], 23 | ['REACH'], 24 | ['REACH', 'SPARSER'] 25 | ]) 26 | 27 | mek = Agent('MEK', db_refs={'FPLX': 'MEK'}) 28 | erk = Agent('ERK', db_refs={'FPLX': 'ERK'}) 29 | raf = Agent('RAF', db_refs={'FPLX': 'RAF'}) 30 | 31 | db_builder.add_raw_reading_statements([ 32 | [Phosphorylation(mek, erk), Complex([mek, erk])], 33 | [Phosphorylation(mek, erk)], 34 | [Activation(mek, erk)], 35 | [Complex([mek, erk]), Complex([raf, erk])] 36 | ]) 37 | 38 | db_builder.add_databases(['signor']) 39 | db_builder.add_raw_database_statements([ 40 | [Complex([raf, erk])] 41 | ]) 42 | db_builder.add_pa_statements([ 43 | (Phosphorylation(mek, erk), [0, 2]), 44 | (Complex([mek, erk]), [1, 4]), 45 | (Activation(mek, erk), [3]), 46 | (Complex([raf, erk]), [5, 6]) 47 | ]) 48 | return db 49 | 50 | 51 | def test_get_raw_statements_all(): 52 | db = _construct_database() 53 | res = get_raw_stmt_jsons(db=db) 54 | assert len(res) == 7, len(res) 55 | 56 | 57 | def test_raw_statement_retrieval_from_agents_type_only(): 58 | db = _construct_database() 59 | res = get_raw_stmt_jsons_from_agents(stmt_type='Complex', db=db) 60 | assert len(res) > 0 61 | assert len(res) < 7 62 | 
assert all(sj['type'] == 'Complex' for sj in res.values()) 63 | 64 | 65 | def test_raw_statement_retrieval_from_agents_mek(): 66 | db = _construct_database() 67 | res = get_raw_stmt_jsons_from_agents(agents=[(None, 'MEK', 'FPLX')], db=db) 68 | assert len(res) > 0 69 | assert len(res) < 7 70 | assert all('MEK' in json.dumps(sj) for sj in res.values()) 71 | 72 | 73 | def test_raw_statement_retrieval_generic(): 74 | db = _construct_database() 75 | res = get_raw_stmt_jsons([db.Reading.reader == 'REACH', 76 | db.Reading.id == db.RawStatements.reading_id], 77 | db=db) 78 | assert len(res) > 0 79 | assert len(res) < 7 80 | assert all(sj['evidence'][0]['source_api'] == 'reach' 81 | for sj in res.values()) 82 | 83 | 84 | def test_raw_statements_get_database_only(): 85 | db = _construct_database() 86 | res = get_raw_stmt_jsons([db.RawStatements.reading_id.is_(None)], db=db) 87 | assert len(res) == 1, len(res) 88 | assert all(sj['evidence'][0]['source_api'] == 'signor' 89 | for sj in res.values()) 90 | 91 | 92 | def test_pa_statement_retrieval_generic(): 93 | db = _construct_database() 94 | res = get_pa_stmt_jsons(db=db) 95 | assert len(res) == 4 96 | 97 | 98 | def test_pa_statement_retrieval_by_type(): 99 | db = _construct_database() 100 | res = get_pa_stmt_jsons([db.PAStatements.type == 'Complex'], db=db) 101 | assert len(res) == 2 102 | assert all(j['stmt']['type'] == 'Complex' for j in res.values()) 103 | -------------------------------------------------------------------------------- /indra_db/tests/test_readonly_pipeline.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import gzip 3 | import json 4 | import pickle 5 | from pathlib import Path 6 | from collections import Counter 7 | 8 | import networkx as nx 9 | 10 | from indra.belief import BeliefEngine 11 | from indra.statements import Agent, Evidence, Activation 12 | from indra_db import get_db 13 | from indra_db.readonly_dumping.export_assembly import calculate_belief 14 | 15 | 16 | def test_unit_belief_calc(): 17 | activation = Activation( 18 | Agent("A"), 19 | Agent("B"), 20 | evidence=[Evidence(source_api="reach") for _ in range(3)], 21 | ) 22 | 23 | # Test that the belief score is calculated correctly 24 | assert activation.belief == 1 25 | 26 | # Set up default Belief Engine 27 | belief_engine = BeliefEngine() 28 | 29 | belief_engine.set_prior_probs([activation]) 30 | 31 | assert activation.belief != 1 32 | assert activation.belief == 0.923 33 | 34 | 35 | def test_calculate_belief(): 36 | activation1 = Activation( 37 | Agent("A", location="nucleus"), 38 | Agent("B", location="cytoplasm"), 39 | evidence=[ 40 | Evidence( 41 | source_api="reach", 42 | text="A activates B in vitro in a dose-dependent manner.") 43 | ], 44 | ) 45 | hash1 = activation1.get_hash() 46 | activation2 = Activation( 47 | Agent("A", location="nucleus"), 48 | Agent("B"), 49 | evidence=[ 50 | Evidence(source_api="reach", text="A activates B in vitro.") 51 | ], 52 | ) 53 | hash2 = activation2.get_hash() 54 | activation3 = Activation( 55 | Agent("A"), 56 | Agent("B"), 57 | evidence=[Evidence(source_api="reach", text="A activates B.")], 58 | ) 59 | hash3 = activation3.get_hash() 60 | 61 | # Sanity check 62 | assert hash1 != hash2 != hash3 63 | 64 | stmt_list = [(hash1, activation1), (hash2, activation2), (hash3, activation3)] 65 | 66 | # Dump the statements to a file 67 | test_statements_tsv_gz = Path(__file__).parent / "test_statements.tsv.gz" 68 | with gzip.open(test_statements_tsv_gz, "wt") as f: 69 | csv_writer = 
csv.writer(f, delimiter="\t") 70 | csv_writer.writerows( 71 | (sh, json.dumps(st.to_json())) for sh, st in stmt_list 72 | ) 73 | 74 | source_counts = { 75 | hash1: {"reach": 1}, 76 | hash2: {"reach": 1}, 77 | hash3: {"reach": 1}, 78 | } 79 | test_source_counts_pkl = Path(__file__).parent / "test_source_counts.pkl" 80 | with open(test_source_counts_pkl, "wb") as f: 81 | pickle.dump(source_counts, f) 82 | 83 | # Create support: activation1 -> activation2 -> activation3 in a 84 | # refinement graph 85 | refinements = {(hash1, hash2), (hash2, hash3)} 86 | refinement_graph = nx.DiGraph() 87 | refinement_graph.add_edges_from(refinements) 88 | assert nx.ancestors(refinement_graph, hash1) == set() 89 | assert nx.ancestors(refinement_graph, hash2) == {hash1} 90 | assert nx.ancestors(refinement_graph, hash3) == {hash1, hash2} 91 | 92 | # Run the belief calculation function 93 | db = get_db("primary") 94 | res = db.select_all(db.DBInfo) 95 | db_name_api_mapping = {r.db_name: r.source_api for r in res} 96 | test_belief_path = Path(__file__).parent / "test_belief_path.pkl" 97 | calculate_belief( 98 | refinements_graph=refinement_graph, 99 | num_batches=1, 100 | batch_size=len(stmt_list), 101 | source_mapping=db_name_api_mapping, 102 | unique_stmts_path=test_statements_tsv_gz, 103 | belief_scores_pkl_path=test_belief_path, 104 | source_counts_path=test_source_counts_pkl, 105 | ) 106 | 107 | # Calculate the belief scores: Add evidence of supporting statements to the 108 | # evidence of the supported statement then calculate the prior belief 109 | belief_engine = BeliefEngine(refinements_graph=refinement_graph) 110 | to_calc_list = [] 111 | local_beliefs = {} 112 | for st_hash, stmt in stmt_list: 113 | # Sum belief score of ancestors 114 | summed_src_count = Counter(source_counts[st_hash]) 115 | 116 | if st_hash in refinement_graph.nodes: 117 | for anc_hash in nx.ancestors(refinement_graph, st_hash): 118 | summed_src_count += Counter(source_counts[anc_hash]) 119 | 120 | ev_list_this_stmt = [] 121 | for source, count in summed_src_count.items(): 122 | for _ in range(count): 123 | ev_list_this_stmt.append(Evidence(source_api=source)) 124 | 125 | stmt.evidence = ev_list_this_stmt 126 | to_calc_list.append((st_hash, stmt)) 127 | 128 | hashes, stmts = zip(*to_calc_list) 129 | belief_engine.set_prior_probs(stmts) 130 | for st_hash2, stmt2 in zip(hashes, stmts): 131 | local_beliefs[st_hash2] = stmt2.belief 132 | 133 | # Load the belief scores 134 | with open(test_belief_path, "rb") as f: 135 | belief_dict = pickle.load(f) 136 | 137 | # Check that the belief scores are correct 138 | assert all( 139 | local_beliefs[st_hash] == belief_dict[st_hash] 140 | for st_hash in belief_dict 141 | ) 142 | 143 | assert len(stmts[2].evidence) == 3 144 | assert all(ev.source_api == 'reach' for ev in stmts[2].evidence) 145 | assert belief_dict[hash3] == 0.923 146 | -------------------------------------------------------------------------------- /indra_db/tests/test_setup.py: -------------------------------------------------------------------------------- 1 | from indra_db.tests.util import get_temp_db 2 | 3 | 4 | def test_db_presence(): 5 | db = get_temp_db(clear=True) 6 | db.insert(db.TextRef, pmid='12345') 7 | -------------------------------------------------------------------------------- /indra_db/tests/test_sif_dumper.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from os import path, remove 4 | import pandas as pd 5 | 6 | import 
indra_db.tests.util as tu 7 | from indra_db.util.dump_sif import load_db_content, get_source_counts, \ 8 | make_dataframe, NS_LIST, normalize_sif_names 9 | 10 | 11 | class SifDumperTester(unittest.TestCase): 12 | def get_db(self, count=1000): 13 | # Get db 14 | return tu.get_filled_ro(count) 15 | 16 | def setUp(self): 17 | self.db = self.get_db() 18 | self.db_content = load_db_content(True, NS_LIST, None, self.db) 19 | self.df = make_dataframe(True, self.db_content, None) 20 | 21 | # Tests 22 | def test_get_content(self): 23 | """Checks content loading and its structure""" 24 | 25 | # Get first item 26 | r = list(self.db_content)[0] 27 | assert isinstance(r, tuple) 28 | assert len(r) == 6 29 | assert isinstance(r[0], int) # mk_hash 30 | assert isinstance(r[1], str) # db_name 31 | assert r[1] in NS_LIST 32 | assert isinstance(r[2], str) # db_id 33 | assert isinstance(r[3], int) # ag_num 34 | assert r[3] > -1 35 | assert isinstance(r[4], int) # ev_count 36 | assert r[4] > 0 37 | assert isinstance(r[5], str) # type 38 | 39 | def test_dataframe(self): 40 | """Checks a dataframe produced by make_dataframe""" 41 | 42 | # Check column names 43 | assert {'agA_id', 'agA_name', 'agA_ns', 'agB_id', 'agB_name', 'agB_ns', 44 | 'evidence_count', 'stmt_hash', 'stmt_type'} == set( 45 | self.df.columns) 46 | 47 | # Check for None's 48 | assert sum(self.df['agA_name'] == None) == 0 49 | assert sum(self.df['agB_name'] == None) == 0 50 | 51 | # Check df types 52 | assert isinstance(self.df.head(1)['agA_ns'][0], str) 53 | assert isinstance(self.df.head(1)['agB_ns'][0], str) 54 | assert isinstance(self.df.head(1)['agA_id'][0], str) 55 | assert isinstance(self.df.head(1)['agB_id'][0], str) 56 | assert isinstance(self.df.head(1)['agA_name'][0], str) 57 | assert isinstance(self.df.head(1)['agB_name'][0], str) 58 | assert isinstance(self.df.head(1)['stmt_type'][0], str) 59 | assert isinstance(self.df.head(1)['evidence_count'][0], np.int64) 60 | assert isinstance(self.df.head(1)['stmt_hash'][0], np.int64) 61 | 62 | # Check that we don't have significant keyerrors from creating the df 63 | key_error_file = path.join(path.dirname(__file__), 'key_errors.csv') 64 | if path.exists(key_error_file): 65 | key_errors = pd.read_csv(key_error_file, sep=',', 66 | names=['stmt_hash', 'ag_num'], header=None) 67 | remove(key_error_file) 68 | missing_hashes = set(key_errors['stmt_hash'].values) 69 | df_hashes = set(self.df['stmt_hash'].values) 70 | 71 | assert len(missing_hashes.intersection(df_hashes)) / \ 72 | len(df_hashes) < 0.5 73 | 74 | def test_stratified_evidence(self): 75 | """Check the stratified evidence dumper""" 76 | 77 | ev_dict = get_source_counts(ro=self.db) 78 | 79 | # Check if nested dict 80 | for k in ev_dict: 81 | assert isinstance(ev_dict[k], dict) 82 | break 83 | 84 | # Check that some keys exist in the df 85 | df_hashes = set(self.df['stmt_hash'].values) 86 | sd_hashes = set(ev_dict.keys()) 87 | assert len(sd_hashes.intersection(df_hashes)) / len(sd_hashes) > 0.25 88 | 89 | 90 | def test_normalize_names(): 91 | sif_dict = { 92 | 'agA_ns': ['HGNC', 'HGNC'], 93 | 'agA_id': ['26128', '26128'], 94 | 'agA_name': ['SPRING1', 'C12orf49'], 95 | 'agB_ns': ['HGNC', 'HGNC'], 96 | 'agB_id': ['11892', '3236'], 97 | 'agB_name': ['TNF', 'EGFR'], 98 | 'stmt_type': ['Activation', 'Phosphorylation'], 99 | 'evidence_count': [10, 12], 100 | 'stmt_hash': [1234567890, -9876543210], 101 | 'residue': [None, None], 102 | 'position': [None, None], 103 | 'source_counts': [{'sparser': 6, 'reach': 4}, {'pc': 6, 'sparser': 6}], 104 | 
'belief': [0.998, 0.9999] 105 | } 106 | 107 | sif_df = pd.DataFrame(sif_dict) 108 | normalize_sif_names(sif_df) 109 | # Both names should now be SPRING1 110 | assert set(sif_df.agA_name.values) == {'SPRING1'} 111 | -------------------------------------------------------------------------------- /indra_db/tests/test_xdd_manager.py: -------------------------------------------------------------------------------- 1 | import json 2 | import boto3 3 | import random 4 | 5 | from indra_db.tests.util import get_temp_db 6 | from indra_db.managers.xdd_manager import XddManager 7 | 8 | 9 | def test_dump(): 10 | db = get_temp_db(clear=True) 11 | m = XddManager() 12 | 13 | # Enter "old" DOIs 14 | s3 = boto3.client('s3') 15 | res = s3.list_objects_v2(**m.bucket.kw()) 16 | dois = set() 17 | for ref in res['Contents']: 18 | key = ref['Key'] 19 | if 'bib' not in key: 20 | continue 21 | try: 22 | obj = s3.get_object(Key=key, **m.bucket.kw()) 23 | except Exception: 24 | print('ack') 25 | continue 26 | bibs = json.loads(obj['Body'].read()) 27 | dois |= {bib['identifier'][0]['id'] for bib in bibs 28 | if 'identifier' in bib} 29 | sample_dois = random.sample(dois, len(dois)//2) 30 | new_trs = [db.TextRef.new(doi=doi) for doi in sample_dois] 31 | print(f"Adding {len(new_trs)} 'old' text refs.") 32 | db.session.add_all(new_trs) 33 | db.session.commit() 34 | 35 | # Run the update. 36 | m.run(db) 37 | 38 | # Check the result. 39 | assert db.select_all(db.TextRef) 40 | assert db.select_all(db.TextContent) 41 | assert db.select_all(db.Reading) 42 | assert db.select_all(db.RawStatements) 43 | assert db.select_all(db.RawAgents) 44 | -------------------------------------------------------------------------------- /indra_db/util/__init__.py: -------------------------------------------------------------------------------- 1 | """This file contains low level functions used by other indra_db tools/services. 2 | 3 | Some key functions' capabilities include: 4 | - getting access to/constructing DatabaseManager instances. 5 | - inserting statements, which are stored in multiple tables, into the database. 6 | - distilling and deleting statements 7 | """ 8 | 9 | __all__ = ['get_primary_db', 'get_db', 'insert_raw_agents', 'insert_pa_stmts', 10 | 'insert_pa_agents', 'insert_db_stmts', 'get_raw_stmts_frm_db_list', 11 | 'distill_stmts', 'regularize_agent_id', 'get_statement_object', 12 | 'extract_agent_data', 'get_ro', 'S3Path', 'hash_pa_agents'] 13 | 14 | from .insert import * 15 | from .s3_path import * 16 | from .helpers import * 17 | from .constructors import * 18 | from .content_scripts import * 19 | from .distill_statements import * 20 | -------------------------------------------------------------------------------- /indra_db/util/aws.py: -------------------------------------------------------------------------------- 1 | import re 2 | import boto3 3 | 4 | 5 | def uncamel(word): 6 | return re.sub(r'([a-z])([A-Z])', r'\g<1>_\g<2>', word).lower() 7 | 8 | 9 | def get_role_kwargs(role): 10 | sts = boto3.client('sts') 11 | 12 | # Check the current role 13 | kwargs = {} 14 | ident = sts.get_caller_identity() 15 | if role and not ident['Arn'].endswith(role): 16 | # If the role is not the default, assume that role. 
17 | new_role_arn = "arn:aws:iam::%s:role/%s" % (ident['Account'], role) 18 | res = sts.assume_role(RoleArn=new_role_arn, 19 | RoleSessionName="AssumeRoleReadonlyDBUpdate") 20 | kwargs = {'aws_' + uncamel(k): v for k, v in res['Credentials'].items() 21 | if 'expiration' not in k.lower()} 22 | 23 | return kwargs, ident 24 | -------------------------------------------------------------------------------- /indra_db/util/build_corpus.py: -------------------------------------------------------------------------------- 1 | """Dump test corpora of content covering all REACH rules 2 | 3 | This script is designed to select content from the database based on the REACH 4 | rules that have been triggered within that content. Three slightly different 5 | methods are used, and three corpora are produced, each as a directory. 6 | """ 7 | 8 | import os 9 | import json 10 | from indra_db.util import unpack 11 | from indra_db.util import get_ro, get_db 12 | 13 | db = get_db('primary') 14 | 15 | rs = db.select_all(db.RawStatements, db.Reading.reader == 'REACH', 16 | db.RawStatements.reading_id == db.Reading.id, yield_per=10000) 17 | found_by = {} 18 | for r in rs: 19 | found_by[r.id] = json.loads(r.json)['evidence'][0]['annotations']['found_by'] 20 | 21 | fb_set = set(found_by.values()) 22 | print(f"Found {len(fb_set)} distinct found-by rules.") 23 | 24 | fb_counts = {} 25 | for sid, word in found_by.items(): 26 | fb_counts[word] = fb_counts.get(word, 0) + 1 27 | 28 | fb_sids = {} 29 | for sid, word in found_by.items(): 30 | if word not in fb_sids: 31 | fb_sids[word] = [] 32 | fb_sids[word].append(sid) 33 | 34 | tc_data = db.select_all([db.TextContent.id, db.TextContent.source, db.TextContent.text_type, db.RawStatements.id], 35 | db.Reading.reader == 'REACH', *db.link(db.TextContent, db.RawStatements)) 36 | tc_lookup = {sid: (tcid, src, tt) for tcid, src, tt, sid in tc_data} 37 | 38 | fb_tc_dict = {} 39 | tc_fb_dict = {} 40 | for fb, sids in sorted(fb_sids.items(), key=lambda t: len(t[1])): 41 | print(fb, len(sids)) 42 | this_dict = {} 43 | for sid in sids: 44 | tcid, src, tt = tc_lookup[sid] 45 | 46 | # Add fb to lookup by tcid 47 | if tcid not in tc_fb_dict: 48 | tc_fb_dict[tcid] = set() 49 | tc_fb_dict[tcid].add(fb) 50 | 51 | # Add tcid sid data to list of content with this fb.
52 | key = (src, tt) 53 | if key not in this_dict: 54 | this_dict[key] = [] 55 | this_dict[key].append({'tcid': tcid, 'sid': sid}) 56 | fb_tc_dict[fb] = this_dict 57 | 58 | 59 | num_with = 0 60 | for fb, cont_meta in fb_tc_dict.items(): 61 | if ('pubmed', 'abstract') not in cont_meta and ('pubmed', 'title') not in cont_meta: 62 | print(f"{fb:70} {fb_counts[fb]} {cont_meta.keys()}") 63 | else: 64 | num_with += 1 65 | 66 | ranking = [('pubmed', 'abstract'), ('pmc_oa', 'fulltext'), ('manuscripts', 'fulltext'), ('pubmed', 'title')] 67 | 68 | 69 | def dump_tcs(tcids, dirname): 70 | tcs = db.select_all([db.TextRef.id, db.TextRef.pmid, db.TextRef.pmcid, db.TextContent.id, 71 | db.TextContent.source, db.TextContent.text_type, db.TextContent.content], 72 | db.TextContent.id.in_(tcids), *db.link(db.TextRef, db.TextContent)) 73 | tt_counts = {} 74 | for row in tcs: 75 | tt = row[-1] 76 | tt_counts[tt] = tt_counts.get(tt, 0) + 1 77 | 78 | print(dirname, tt_counts) 79 | 80 | if not os.path.exists(dirname): 81 | os.mkdir(dirname) 82 | else: 83 | raise ValueError(f"Directory {dirname} already exists.") 84 | 85 | metadata = {} 86 | for trid, pmid, pmcid, tcid, src, tt, cont_bytes in tcs: 87 | metadata[tcid] = {'trid': trid, 'pmid': pmid, 'tcid': tcid, 'pmcid': pmcid, 'source': src, 'text_type': tt} 88 | if src == 'pubmed': 89 | fmt = 'txt' 90 | else: 91 | fmt = 'nxml' 92 | with open(f'{dirname}/{tcid}.{fmt}', 'w') as f: 93 | f.write(unpack(cont_bytes)) 94 | with open(f'{dirname}/metadata.json', 'w') as f: 95 | json.dump(metadata, f, indent=2) 96 | 97 | 98 | # Select strictly the content with the most rules represented. No preference 99 | # based on type. 100 | corpus_ids = [] 101 | rep_fbs = set() 102 | for fb, cont_meta in sorted(fb_tc_dict.items(), key=lambda t: fb_counts[t[0]]): 103 | print("--------------------------------------------") 104 | print("Examining rule:", fb, fb_counts[fb]) 105 | if fb in rep_fbs: 106 | print("Already represented...skipping") 107 | continue 108 | 109 | best_ref = None 110 | for text_cat, text_list in cont_meta.items(): 111 | print(text_cat, len(text_list)) 112 | counted_refs = [(len(tc_fb_dict[d['tcid']] - rep_fbs), d['tcid']) for d in text_list] 113 | print(f"best ref for {text_cat}:", max(counted_refs)) 114 | if best_ref is None: 115 | best_ref = max(counted_refs) 116 | else: 117 | this_ref = max(counted_refs) 118 | if this_ref > best_ref: 119 | best_ref = this_ref 120 | print(f"Overall best ref for {fb}:", best_ref) 121 | corpus_ids.append(best_ref[1]) 122 | rep_fbs |= tc_fb_dict[best_ref[1]] 123 | print(len(rep_fbs)) 124 | if len(rep_fbs) == len(fb_counts): 125 | print("DONE!") 126 | break 127 | dump_tcs(corpus_ids, 'corpus_1') 128 | 129 | 130 | # Select the content with most rules, with the preference for abstract as a tie-breaker. 
131 | corpus_ids_2 = [] 132 | rep_fbs = set() 133 | for fb, cont_meta in sorted(fb_tc_dict.items(), key=lambda t: fb_counts[t[0]]): 134 | print("--------------------------------------------") 135 | print("Examining rule:", fb, fb_counts[fb]) 136 | if fb in rep_fbs: 137 | print("Already represented...skipping") 138 | continue 139 | 140 | all_counted_refs = [] 141 | for text_cat, text_list in cont_meta.items(): 142 | print(text_cat, len(text_list)) 143 | all_counted_refs += [(len(tc_fb_dict[d['tcid']] - rep_fbs), -ranking.index(text_cat), d['tcid']) for d in text_list] 144 | best_ref = max(all_counted_refs) 145 | print(f"Overall best ref for {fb}:", best_ref) 146 | corpus_ids_2.append(best_ref[-1]) 147 | rep_fbs |= tc_fb_dict[best_ref[-1]] 148 | print(len(rep_fbs)) 149 | if len(rep_fbs) == len(fb_counts): 150 | print("DONE!") 151 | break 152 | dump_tcs(corpus_ids_2, 'corpus_2') 153 | 154 | 155 | # Select abstracts whenever possible, fulltext only when necessary. 156 | corpus_ids_3 = [] 157 | rep_fbs = set() 158 | for fb, cont_meta in sorted(fb_tc_dict.items(), key=lambda t: fb_counts[t[0]]): 159 | print("--------------------------------------------") 160 | print("Examining rule:", fb, fb_counts[fb]) 161 | if fb in rep_fbs: 162 | print("Already represented...skipping") 163 | continue 164 | 165 | all_counted_refs = [] 166 | for text_cat, text_list in cont_meta.items(): 167 | print(text_cat, len(text_list)) 168 | all_counted_refs += [(-ranking.index(text_cat), len(tc_fb_dict[d['tcid']] - rep_fbs), d['tcid']) for d in text_list] 169 | best_ref = max(all_counted_refs) 170 | print(f"Overall best ref for {fb}:", best_ref) 171 | corpus_ids_3.append(best_ref[-1]) 172 | rep_fbs |= tc_fb_dict[best_ref[-1]] 173 | print(len(rep_fbs)) 174 | if len(rep_fbs) == len(fb_counts): 175 | print("DONE!") 176 | break 177 | dump_tcs(corpus_ids_3, 'corpus_3') 178 | 179 | -------------------------------------------------------------------------------- /indra_db/util/constructors.py: -------------------------------------------------------------------------------- 1 | __all__ = ['get_primary_db', 'get_db', 'get_ro', 'get_ro_host'] 2 | 3 | import logging 4 | 5 | from indra_db.databases import PrincipalDatabaseManager, \ 6 | ReadonlyDatabaseManager 7 | from indra_db.exceptions import IndraDbException 8 | from indra_db.config import get_databases, get_readonly_databases, nope_in_test 9 | 10 | logger = logging.getLogger('util-constructors') 11 | 12 | 13 | __PRIMARY_DB = None 14 | 15 | 16 | @nope_in_test 17 | def get_primary_db(force_new=False): 18 | """Get a DatabaseManager instance for the primary database host. 19 | 20 | The primary database host is defined in the defaults.txt file, or in a file 21 | given by the environment variable DEFAULTS_FILE. Alternatively, it may be 22 | defined by the INDRADBPRIMARY environment variable. If none of the above 23 | are specified, this function will raise an exception. 24 | 25 | Note: by default, calling this function twice will return the same 26 | `DatabaseManager` instance. In other words:: 27 | 28 | db1 = get_primary_db() 29 | db2 = get_primary_db() 30 | db1 is db2 31 | 32 | This means also that, for example `db1.select_one(db2.TextRef)` will work, 33 | in the above context. 34 | 35 | It is still recommended that when creating a script or function, or other 36 | general application, you should not rely on this feature to get your access 37 | to the database, as it can make substituting a different database host both 38 | complicated and messy. 
Rather, a database instance should be explicitly 39 | passed between different users as is done in the `get_statements_by_gene_role_type` 40 | function's call to `get_statements` in `indra.db.query_db_stmts`. 41 | 42 | Parameters 43 | ---------- 44 | force_new : bool 45 | If true, a new instance will be created and returned, regardless of 46 | whether there is an existing instance or not. Default is False, so that 47 | if this function has been called before within the global scope, the 48 | instance that was first created will be returned. 49 | 50 | Returns 51 | ------- 52 | primary_db : :py:class:`DatabaseManager` 53 | An instance of the database manager that is attached to the primary 54 | database. 55 | """ 56 | logger.warning("DEPRECATION WARNING: This function is being deprecated.") 57 | defaults = get_databases() 58 | if 'primary' in defaults.keys(): 59 | primary_host = defaults['primary'] 60 | else: 61 | raise IndraDbException("No primary host available in defaults file.") 62 | 63 | global __PRIMARY_DB 64 | if __PRIMARY_DB is None or force_new: 65 | __PRIMARY_DB = PrincipalDatabaseManager(primary_host, label='primary') 66 | __PRIMARY_DB.grab_session() 67 | return __PRIMARY_DB 68 | 69 | 70 | @nope_in_test 71 | def get_db(db_label, protected=False): 72 | """Get a db instance based on its name in the config or env. 73 | 74 | If the label does not exist or the database labeled can't be reached, None 75 | is returned. 76 | """ 77 | # Instantiate a database handle 78 | defaults = get_databases() 79 | if db_label not in defaults: 80 | logger.error(f"No such database available: {db_label}. Check config " 81 | f"file or environment variables.") 82 | return 83 | db_url = defaults[db_label] 84 | db = PrincipalDatabaseManager(db_url, label=db_label, protected=protected) 85 | if not db.available: 86 | return 87 | db.grab_session() 88 | return db 89 | 90 | 91 | @nope_in_test 92 | def get_ro(ro_label, protected=True): 93 | """Get a readonly database instance, based on its name. 94 | 95 | If the label does not exist or the database labeled can't be reached, None 96 | is returned. 97 | """ 98 | # Instantiate a readonly database. 99 | defaults = get_readonly_databases() 100 | if ro_label == 'primary' and 'override' in defaults: 101 | logger.info("Found an override database: using in place of primary.") 102 | ro_label = 'override' 103 | if ro_label not in defaults: 104 | logger.error(f"No such readonly database available: {ro_label}.
Check " 105 | f"config file or environment variables.") 106 | return 107 | db_url = defaults[ro_label] 108 | ro = ReadonlyDatabaseManager(db_url, label=ro_label, protected=protected) 109 | if not ro.available: 110 | return 111 | ro.grab_session() 112 | return ro 113 | 114 | 115 | def get_ro_host(ro_label): 116 | """Get the host of the current readonly database.""" 117 | ro = get_ro(ro_label) 118 | if not ro: 119 | return None 120 | return ro.url.host 121 | -------------------------------------------------------------------------------- /indra_db/util/helpers.py: -------------------------------------------------------------------------------- 1 | __all__ = ['unpack', '_get_trids', '_fix_evidence_refs', 2 | 'get_raw_stmts_frm_db_list', '_set_evidence_text_ref', 3 | 'get_statement_object'] 4 | 5 | import json 6 | import zlib 7 | import logging 8 | 9 | from indra.util import clockit 10 | from indra.statements import Statement 11 | 12 | logger = logging.getLogger('util-helpers') 13 | 14 | 15 | def get_statement_object(db_stmt): 16 | """Get an INDRA Statement object from a db_stmt.""" 17 | if isinstance(db_stmt, bytes): 18 | jb = db_stmt 19 | else: 20 | jb = db_stmt.json 21 | return Statement._from_json(json.loads(jb.decode('utf-8'))) 22 | 23 | 24 | def _set_evidence_text_ref(stmt, tr): 25 | # This is a separate function because it is likely to change, and this is a 26 | # critical process that is executed in multiple places. 27 | for ev in stmt.evidence: 28 | ev.pmid = tr.pmid 29 | ev.text_refs = tr.get_ref_dict() 30 | 31 | 32 | @clockit 33 | def _fix_evidence_refs(db, rid_stmt_trios): 34 | """Get proper id data for a raw statement from the database. 35 | 36 | Alterations are made to the Statement objects "in-place", so this function 37 | itself returns None. 38 | """ 39 | rid_set = {rid for rid, _, _ in rid_stmt_trios if rid is not None} 40 | logger.info("Getting text refs for %d readings." % len(rid_set)) 41 | if rid_set: 42 | rid_tr_pairs = db.select_all( 43 | [db.Reading.id, db.TextRef], 44 | db.Reading.id.in_(rid_set), 45 | db.Reading.text_content_id == db.TextContent.id, 46 | db.TextContent.text_ref_id == db.TextRef.id 47 | ) 48 | rid_tr_dict = {rid: tr for rid, tr in rid_tr_pairs} 49 | for rid, sid, stmt in rid_stmt_trios: 50 | if rid is None: 51 | # This means this statement came from a database, not reading. 52 | continue 53 | assert len(stmt.evidence) == 1, \ 54 | "Only raw statements can have their refs fixed." 55 | _set_evidence_text_ref(stmt, rid_tr_dict[rid]) 56 | return 57 | 58 | 59 | @clockit 60 | def get_raw_stmts_frm_db_list(db, db_stmt_objs, fix_refs=True, with_sids=True): 61 | """Convert table objects of raw statements into INDRA Statement objects.""" 62 | rid_stmt_sid_trios = [(db_stmt.reading_id, db_stmt.id, 63 | get_statement_object(db_stmt)) 64 | for db_stmt in db_stmt_objs] 65 | if fix_refs: 66 | _fix_evidence_refs(db, rid_stmt_sid_trios) 67 | # Note: it is important that order is maintained here (hence not a set or 68 | # dict). 
69 | if with_sids: 70 | return [(sid, stmt) for _, sid, stmt in rid_stmt_sid_trios] 71 | else: 72 | return [stmt for _, _, stmt in rid_stmt_sid_trios] 73 | 74 | 75 | def unpack(bts, decode=True): 76 | ret = zlib.decompress(bts, zlib.MAX_WBITS+16) 77 | if decode: 78 | ret = ret.decode('utf-8') 79 | return ret 80 | 81 | 82 | def _get_trids(db, id_val, id_type): 83 | """Return text ref IDs corresponding to any ID type and value.""" 84 | # Get the text ref id(s) 85 | if id_type in ['trid']: 86 | trids = [int(id_val)] 87 | else: 88 | id_types = ['pmid', 'pmcid', 'doi', 'pii', 'url', 'manuscript_id'] 89 | if id_type not in id_types: 90 | raise ValueError('id_type must be one of: %s' % str(id_types)) 91 | constraint = (getattr(db.TextRef, id_type) == id_val) 92 | trids = [trid for trid, in db.select_all(db.TextRef.id, constraint)] 93 | return trids 94 | -------------------------------------------------------------------------------- /indra_db/util/s3_path.py: -------------------------------------------------------------------------------- 1 | import re 2 | from os import path 3 | from io import BytesIO 4 | 5 | 6 | class S3Path(object): 7 | """A simple object to make it easier to manage s3 locations.""" 8 | def __init__(self, bucket, key=None): 9 | if not isinstance(bucket, str): 10 | raise ValueError("Bucket must be a string, not %s." % type(bucket)) 11 | self.bucket = bucket 12 | if key is not None: 13 | if not isinstance(key, str): 14 | raise ValueError("Key must be a string, not %s." % type(key)) 15 | elif key.startswith('/'): 16 | key = key[1:] 17 | self.key = key 18 | 19 | def __lt__(self, other): 20 | if not isinstance(other, S3Path): 21 | raise ValueError(f"Cannot compare with type \"{type(other)}\".") 22 | return self.to_string() < other.to_string() 23 | 24 | def __eq__(self, other): 25 | if not isinstance(other, S3Path): 26 | raise ValueError(f"Cannot compare with type \"{type(other)}\".") 27 | return self.to_string() == other.to_string() 28 | 29 | def __le__(self, other): 30 | if not isinstance(other, S3Path): 31 | raise ValueError(f"Cannot compare with type \"{type(other)}\".") 32 | return self.to_string() <= other.to_string() 33 | 34 | def kw(self, prefix=False): 35 | ret = {'Bucket': self.bucket} 36 | if self.key: 37 | if prefix: 38 | ret['Prefix'] = self.key 39 | else: 40 | ret['Key'] = self.key 41 | return ret 42 | 43 | def get(self, s3): 44 | if not self.key: 45 | raise ValueError("Cannot get key-less s3 path.") 46 | return s3.get_object(**self.kw()) 47 | 48 | def upload(self, s3, body): 49 | if not self.key: 50 | raise ValueError("Cannot 'upload' to a key-less s3 path.") 51 | bytes_io = BytesIO(body) 52 | return s3.upload_fileobj(bytes_io, **self.kw()) 53 | 54 | def put(self, s3, body): 55 | if not self.key: 56 | raise ValueError("Cannot 'put' to a key-less s3 path.") 57 | return s3.put_object(Body=body, **self.kw()) 58 | 59 | def list_objects(self, s3): 60 | raw_res = s3.list_objects_v2(**self.kw(prefix=True)) 61 | return [self.__class__(self.bucket, e['Key']) 62 | for e in raw_res['Contents']] 63 | 64 | def list_prefixes(self, s3): 65 | raw_res = s3.list_objects_v2(Delimiter='/', **self.kw(prefix=True)) 66 | return [self.__class__(self.bucket, e['Prefix']) 67 | for e in raw_res['CommonPrefixes']] 68 | 69 | def exists(self, s3): 70 | return 'Contents' in s3.list_objects_v2(**self.kw(prefix=True)) 71 | 72 | def delete(self, s3): 73 | return s3.delete_object(**self.kw()) 74 | 75 | def get_element_path(self, *subkeys): 76 | args = [] 77 | if self.key is not None: 78 | 
args.append(self.key) 79 | args += subkeys 80 | return self.from_key_parts(self.bucket, *args) 81 | 82 | @classmethod 83 | def from_key_parts(cls, bucket, *key_elements): 84 | key = path.join(*key_elements) 85 | return cls(bucket, key) 86 | 87 | @classmethod 88 | def from_string(cls, s3_key_str): 89 | patt = re.compile(r's3://([a-z0-9\-.]+)/(.*)') 90 | m = patt.match(s3_key_str) 91 | if m is None: 92 | raise ValueError("Invalid format for s3 path: %s" % s3_key_str) 93 | bucket, key = m.groups() 94 | if not key: 95 | key = None 96 | return cls(bucket, key) 97 | 98 | def to_string(self): 99 | return 's3://{bucket}/{key}'.format(bucket=self.bucket, key=self.key) 100 | 101 | def __str__(self): 102 | return self.to_string() 103 | 104 | def __repr__(self): 105 | return 'S3Path({bucket}, {key})'.format(bucket=self.bucket, 106 | key=self.key) 107 | -------------------------------------------------------------------------------- /indra_db_service/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gyorilab/indra_db/77785ce0d1badd271b120db747abfff4d6f35832/indra_db_service/__init__.py -------------------------------------------------------------------------------- /indra_db_service/cli/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import click 4 | 5 | 6 | @click.group() 7 | def main(): 8 | """Run the indra db rest service CLI.""" 9 | 10 | 11 | @main.command() 12 | @click.argument('deployment', nargs=1) 13 | @click.option('-s', '--settings', 'zappa_settings_file', 14 | default='zappa_settings.json', 15 | help="Specify the zappa settings file to use. Default is " 16 | "'zappa_settings.json'.") 17 | def push(deployment, zappa_settings_file): 18 | """Push a new deployment to the remote lambdas using zappa.""" 19 | import json 20 | from pathlib import Path 21 | from indra_db_service.cli.zappa_tools import fix_permissions 22 | click.echo(f"Updating {deployment} deployment.") 23 | if not Path(zappa_settings_file).exists(): 24 | click.echo(f"Zappa settings file not found: {zappa_settings_file}") 25 | return 26 | zappa_settings = json.load(open(zappa_settings_file, 'r')) 27 | os.system(f'zappa update {deployment}') 28 | fix_permissions(deployment, zappa_settings=zappa_settings) 29 | 30 | 31 | @main.command() 32 | @click.option('-p', '--port', type=click.INT, 33 | help="Override the default port number.") 34 | @click.option('-h', '--host', default='0.0.0.0', 35 | help="Override the default host.") 36 | @click.option('-vd', '--vue-deployment', 37 | type=click.Choice(['stable', 'dev', 'latest', 'test']), 38 | help="Load the vue package from this S3 deployment instead of " 39 | "a local directory.", 40 | required=False) 41 | def test_service(port, host, vue_deployment): 42 | """Run the service in test mode locally.""" 43 | from indra_db_service.config import TESTING 44 | TESTING['status'] = True 45 | if vue_deployment is not None: 46 | TESTING['deployment'] = vue_deployment 47 | TESTING['vue-root'] = ( 48 | f'https://bigmech.s3.amazonaws.com/indra-db/indralabvue-' 49 | f'{vue_deployment}' 50 | ) 51 | click.echo(f'Using deployment {vue_deployment} from S3 at {TESTING["vue-root"]}') 52 | 53 | from indra_db_service.api import app 54 | app.run(host=host, port=port, debug=True) 55 | 56 | 57 | if __name__ == '__main__': 58 | main() 59 | -------------------------------------------------------------------------------- /indra_db_service/cli/__main__.py: 
-------------------------------------------------------------------------------- 1 | from . import main 2 | 3 | 4 | if __name__ == '__main__': 5 | main() 6 | -------------------------------------------------------------------------------- /indra_db_service/cli/zappa_tools.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | 3 | from indra_db.config import CONFIG 4 | from indra_db.util.aws import get_role_kwargs 5 | 6 | 7 | # Lambda CONFIG parameters 8 | aws_role = CONFIG['lambda']['role'] 9 | aws_primary_function = 'indra-db-api-ROOT' 10 | 11 | 12 | def fix_permissions(deployment, zappa_settings) -> None: 13 | """Add permissions to the lambda function to allow access from API Gateway. 14 | 15 | When Zappa runs, it removes permission for the primary endpoint to call 16 | the lambda functions it creates. This function goes in and fixes those 17 | permissions, and is intended to be run after a zappa update. 18 | """ 19 | # Get relevant settings from the zappa config. 20 | project_name = zappa_settings[deployment]['project_name'] 21 | region = zappa_settings[deployment]['aws_region'] 22 | if zappa_settings[deployment]['profile_name'].lower() != aws_role.lower(): 23 | raise Exception("Required roles do not match!") 24 | 25 | # Get the ID for the API on API Gateway 26 | kwargs, identity = get_role_kwargs(aws_role) 27 | if 'region_name' not in kwargs: 28 | kwargs['region_name'] = region 29 | api_gateway = boto3.client('apigateway', **kwargs) 30 | api_data = api_gateway.get_rest_apis() 31 | for item in api_data['items']: 32 | if item['name'] == aws_primary_function: 33 | break 34 | else: 35 | raise Exception(f"Could not find api matching name: " 36 | f"{aws_primary_function}") 37 | 38 | # Give the API Gateway access to the lambda functions. 
39 | account_id = identity['Account'] 40 | lambda_client = boto3.client('lambda', **kwargs) 41 | for label, endpoint in [('root', ''), ('leafs', '/*')]: 42 | source_arn = (f"arn:aws:execute-api:{region}:{account_id}:{item['id']}" 43 | f"/*/*/{deployment}{endpoint}") 44 | statement_id = f'{aws_primary_function}-access-to-{deployment}-{label}' 45 | lambda_client.add_permission(FunctionName=f'{project_name}-{deployment}', 46 | Action='lambda:InvokeFunction', 47 | Principal='apigateway.amazonaws.com', 48 | SourceArn=source_arn, 49 | StatementId=statement_id) 50 | return 51 | -------------------------------------------------------------------------------- /indra_db_service/config.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "TITLE", 3 | "DEPLOYMENT", 4 | "BASE_URL", 5 | "VUE_ROOT", 6 | "MAX_STMTS", 7 | "MAX_LIST_LEN", 8 | "REDACT_MESSAGE", 9 | "TESTING", 10 | "jwt_nontest_optional", 11 | "CURATOR_SALT", 12 | ] 13 | 14 | from os import environ 15 | from pathlib import Path 16 | from flask_jwt_extended import jwt_required 17 | 18 | TITLE = "The INDRA Database" 19 | DEPLOYMENT = environ.get("INDRA_DB_API_DEPLOYMENT") 20 | BASE_URL = environ.get("INDRA_DB_API_BASE_URL") 21 | CURATOR_SALT = environ.get("INDRA_DB_API_CURATOR_SALT") 22 | VUE_ROOT = environ.get("INDRA_DB_API_VUE_ROOT") 23 | if VUE_ROOT is not None and not VUE_ROOT.startswith("http"): 24 | VUE_ROOT = Path(VUE_ROOT).expanduser() 25 | if not VUE_ROOT.is_absolute(): 26 | VUE_ROOT = Path(__file__).parent.absolute() / VUE_ROOT 27 | MAX_STMTS = 500 28 | MAX_LIST_LEN = 2000 29 | REDACT_MESSAGE = "[MISSING/INVALID CREDENTIALS: limited to 200 char for Elsevier]" 30 | 31 | TESTING = {} 32 | if environ.get("TESTING_DB_APP") == "1": 33 | TESTING["status"] = True 34 | else: 35 | TESTING["status"] = False 36 | TESTING["deployment"] = "" 37 | TESTING["vue-root"] = "" 38 | 39 | 40 | def jwt_nontest_optional(func): 41 | if TESTING["status"]: 42 | return func 43 | else: 44 | return jwt_required(optional=True)(func) 45 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | node_modules 3 | /dist 4 | 5 | # local env files 6 | .env.local 7 | .env.*.local 8 | 9 | # Log files 10 | npm-debug.log* 11 | yarn-debug.log* 12 | yarn-error.log* 13 | 14 | # Editor directories and files 15 | .idea 16 | .vscode 17 | *.suo 18 | *.ntvs* 19 | *.njsproj 20 | *.sln 21 | *.sw? 22 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/README.md: -------------------------------------------------------------------------------- 1 | # data-vis 2 | 3 | ## Project setup 4 | ``` 5 | npm install 6 | ``` 7 | 8 | ### Compiles and hot-reloads for development 9 | ``` 10 | npm run test 11 | ``` 12 | 13 | ### Compiles and minifies for production 14 | ``` 15 | npm run build 16 | ``` 17 | 18 | ### Lints and fixes files 19 | ``` 20 | npm run lint 21 | ``` 22 | 23 | ### Customize configuration 24 | See [Configuration Reference](https://cli.vuejs.org/config/). 
25 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/babel.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | presets: [ 3 | '@vue/cli-plugin-babel/preset' 4 | ] 5 | } 6 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "data-vis", 3 | "version": "0.1.0", 4 | "private": true, 5 | "scripts": { 6 | "serve": "vue-cli-service serve", 7 | "build": "vue-cli-service build --target lib --name DataVis src/index.js", 8 | "watch": "npm run build -- --watch", 9 | "test": "npm run watch -- --mode development", 10 | "lint": "vue-cli-service lint" 11 | }, 12 | "dependencies": { 13 | "@vueform/multiselect": "^1.5.0", 14 | "apexcharts": "^3.26.3", 15 | "core-js": "^3.4.4", 16 | "vue": "^3.0.11", 17 | "vue-router": "^4.0.8", 18 | "vue3-apexcharts": "^1.4.0" 19 | }, 20 | "devDependencies": { 21 | "@vue/cli-plugin-babel": "~4.5.0", 22 | "@vue/cli-plugin-eslint": "~4.5.0", 23 | "@vue/cli-service": "~4.5.0", 24 | "@vue/compiler-sfc": "^3.0.0", 25 | "babel-eslint": "^10.0.3", 26 | "eslint": "^5.16.0", 27 | "eslint-plugin-vue": "^5.0.0", 28 | "vue-template-compiler": "^2.6.10" 29 | }, 30 | "eslintConfig": { 31 | "root": true, 32 | "env": { 33 | "node": true 34 | }, 35 | "extends": [ 36 | "plugin:vue/essential", 37 | "eslint:recommended" 38 | ], 39 | "rules": {}, 40 | "parserOptions": { 41 | "parser": "babel-eslint" 42 | } 43 | }, 44 | "browserslist": [ 45 | "> 1%", 46 | "last 2 versions" 47 | ] 48 | } 49 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | data-vis 9 | 10 | 11 | 14 |
15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/src/App.vue: -------------------------------------------------------------------------------- 1 | 7 | 8 | 19 | 20 | 30 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/src/components/AmountView/AmountView.vue: -------------------------------------------------------------------------------- 1 | 25 | 26 | 147 | 148 | 149 | 150 | 158 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/src/components/AmountView/LineChart.vue: -------------------------------------------------------------------------------- 1 | 17 | 18 | 101 | 102 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/src/components/AmountView/index.js: -------------------------------------------------------------------------------- 1 | import datavisComp from './AmountView' 2 | 3 | export default Vue => { 4 | Vue.component(datavisComp.name, datavisComp); 5 | } 6 | 7 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/src/components/TimeView/TimeView.vue: -------------------------------------------------------------------------------- 1 | 69 | 70 | 204 | 205 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/src/components/TimeView/index.js: -------------------------------------------------------------------------------- 1 | import datavisComp from './TimeView' 2 | 3 | export default Vue => { 4 | Vue.component(datavisComp.name, datavisComp); 5 | } 6 | 7 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/src/components/index.js: -------------------------------------------------------------------------------- 1 | export {default as TimeView} from './TimeView' 2 | export {default as AmountView} from './AmountView' 3 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/src/index.js: -------------------------------------------------------------------------------- 1 | export {default as TimeView} from './components/TimeView' 2 | export {default as AmountView} from './components/AmountView' 3 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/src/main.js: -------------------------------------------------------------------------------- 1 | import { createApp } from 'vue' 2 | import App from './App.vue' 3 | import {TimeView, AmountView} from './index' 4 | 5 | const app = createApp(App) 6 | app.use(TimeView) 7 | app.use(AmountView) 8 | app.mount('#app') 9 | -------------------------------------------------------------------------------- /indra_db_service/data-vis/vue.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | publicPath: '/data-vis/', 3 | } 4 | -------------------------------------------------------------------------------- /indra_db_service/errors.py: -------------------------------------------------------------------------------- 1 | from flask import Response, jsonify 2 | 3 | 4 | class HttpUserError(ValueError): 5 | def __init__(self, msg, err_code=400): 6 | self.err_code = err_code 7 | self.msg = msg 8 | super(HttpUserError, self).__init__(msg) 9 | 10 | def to_json(self): 11 | 
return {"result": "failure", "reason": self.msg} 12 | 13 | def response(self): 14 | return jsonify(self.to_json()), self.err_code 15 | 16 | 17 | class ResultTypeError(HttpUserError): 18 | def __init__(self, result_type): 19 | self.result_type = result_type 20 | msg = f"Invalid result type: {result_type}" 21 | super(ResultTypeError, self).__init__(msg) 22 | 23 | 24 | class InvalidCredentials(HttpUserError): 25 | def __init__(self, cred_type): 26 | super(InvalidCredentials, self).\ 27 | __init__(f"Invalid credentials: {cred_type}", 401) 28 | 29 | 30 | class InsufficientPermission(HttpUserError): 31 | def __init__(self, resource): 32 | super(InsufficientPermission, self).\ 33 | __init__(f"Insufficient permissions for: {resource}", 403) 34 | -------------------------------------------------------------------------------- /indra_db_service/gunicorn.conf.py: -------------------------------------------------------------------------------- 1 | """Gunicorn configuration file for the INDRA DB service 2 | 3 | https://docs.gunicorn.org/en/stable/settings.html#config-file 4 | """ 5 | 6 | import threading 7 | from indralab_auth_tools.src.database import monitor_database_connection 8 | 9 | 10 | def post_fork(server, worker): 11 | """Function to run after forking a worker 12 | 13 | See: https://docs.gunicorn.org/en/stable/settings.html#post-fork 14 | 15 | This function is called after a worker is forked. It starts a thread to monitor 16 | the database connection and reset the connection if it is lost. 17 | """ 18 | 19 | # Setting check interval to 2x gunicorn timeout, which is 300 s. 20 | thread = threading.Thread( 21 | target=monitor_database_connection, args=(600,), daemon=True 22 | ) 23 | thread.start() 24 | print(f"Started database connection monitor thread in worker {worker.pid}.") 25 | -------------------------------------------------------------------------------- /indra_db_service/sample_hashes.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gyorilab/indra_db/77785ce0d1badd271b120db747abfff4d6f35832/indra_db_service/sample_hashes.pkl -------------------------------------------------------------------------------- /indra_db_service/search_introduction.md: -------------------------------------------------------------------------------- 1 | # INDRA DB Search Interface 2 | 3 | On the landing page of the INDRA DB web interface, you can search for statements by 4 | agent name, statement type, MeSH term, or paper. By default, an unfilled agent search 5 | option is displayed. You can add additional search options by selecting one from the 6 | dropdown below the current list of search options. You can also remove a search option 7 | by clicking the "X" button next to it. Once you have entered your search criteria, click 8 | the "Search" button to retrieve the statements that match your search criteria. 9 | 10 | ## Search Options 11 | 12 | The search options are as follows: 13 | 14 | - Agent: search by a specific entity (gene, small molecule, biological process, etc.) 15 | The options are: 16 | - role: 17 | 18 | - subject: the agent is an upstream/controller in retrieved statements 19 | - object: the agent is downstream/controlled in retrieved statements 20 | - any: any role is allowed 21 | - text: Enter the name of the agent 22 | - namespace and Gilda grounding option: Typically it's more reliable to query the DB 23 | using identifiers rather than informal names. 
If you know the identifier for e.g., a 24 | gene, say "HGNC:1234", you enter '1234' in the text box and then chose 'hgnc' in the 25 | namespace dropdown. However, if you only know the name, the Gilda grounding option 26 | is useful. If you enter e.g., "K-ras" and click "Ground with GILDA", it will 27 | automatically find an identifier for it. If there is ambiguity, you can select the 28 | correct entity from a dropdown. Once you click "Search", the DB will be queried 29 | using the ID selected from the dropdown. 30 | - Type: the type of statement e.g. Activation, Phosphorylation, DecreaseAmount, Complex, 31 | etc. Read more about the types of statements in the 32 | [INDRA documentation](https://indra.readthedocs.io/en/latest/modules/statements.html). 33 | - MeSH: a Medical Subject Headings term that the papers retrieved as evidence are 34 | annotated with. This option also has the option to ground with Gilda if you only know 35 | the name of the MeSH term. 36 | - Paper: Limit the search to a specific publication that evidence comes from. To include 37 | multiple papers, select another paper search option from the dropdown. In the paper 38 | search option, you can search by these publication identifiers: 39 | - PMID: PubMed ID 40 | - PMCID: PubMed Central ID 41 | - DOI: Digital Object Identifier 42 | - TRID: Internal INDRA DB ID signifying a specific publication regardless of the 43 | external identifier (PMID, PMCID, DOI). 44 | - TCID: Internal INDRA DB ID signifying a piece of a text retrieved from 45 | a particular source. 46 | 47 | ## Search Results 48 | 49 | The search results are displayed in hierarchical list format. At the top level, the 50 | most generic form of interaction matching the search criteria are displayed. Clicking 51 | on one of the rows expands the next level of detail, showing the specific forms of 52 | interactions that match the search criteria. Clicking on one of these rows expands the 53 | next level of detail, showing the specific statements that match the search criteria. 54 | The nesting is at most three levels deep, but can also be less if e.g., there is only one 55 | statement type for one interaction type. 56 | 57 | ![Web UI screenshot](../doc/web_ui_results_expanded.png) 58 | Search results view with three levels of nesting expanded for USP15 affecting BARD1 59 | 60 | The search results allows you to curate evidence for each statement. To do this, click 61 | on the pencil icon next to the piece of evidence you want to curate. This will open a 62 | curation area where different options for curating the evidence are available. To read 63 | more about curation, see the 64 | [curation tutorial](https://indra.readthedocs.io/en/latest/tutorials/html_curation.html) 65 | in the INDRA documentation. 66 | -------------------------------------------------------------------------------- /indra_db_service/templates/daily_data.html: -------------------------------------------------------------------------------- 1 | {% extends "idbr_template.html" %} 2 | 3 | {% block scripts %} 4 | {{ super() }} 5 | 6 | {% endblock %} 7 | 8 | {% block body %} 9 | {{ super() }} 10 |
11 |

Batch Job Runtimes

12 | 13 | 14 |

Output Measurements

15 | 16 |
17 | 18 | 20 | 25 | {% endblock %} 26 | -------------------------------------------------------------------------------- /indra_db_service/templates/idbr_description.html: -------------------------------------------------------------------------------- 1 | This project is developed by the 2 | Gyori Lab for Computational Biomedicine at Northeastern University. 3 | This work was funded by DARPA grants W911NF‐15‐1‐0544 and HR00112220036 4 | under the DARPA CwC, DARPA ASKEM and ARPA-H BDF programs. 5 | Source code for the INDRA DB is available here. 6 | Contact: Benjamin M. Gyori. -------------------------------------------------------------------------------- /indra_db_service/templates/idbr_statements_view.html: -------------------------------------------------------------------------------- 1 | {% extends "indra/statements_view.html" %} 2 | {% from "auth_macros.html" import login_overlay %} 3 | {% from "idbr_template.html" import nav_header %} 4 | 5 | {% block scripts %} 6 | {{ super() }} 7 | 8 | 9 | 12 | 13 | {% endblock %} 14 | 15 | {% block navbar %} 16 | {{ nav_header(identity) }} 17 | {% endblock %} 18 | 19 | {% block footer_desc %}{% include "idbr_description.html" %}{{ super() }}{% endblock %} 20 | 21 | {% block body %} 22 | {{ login_overlay() }} 23 | {{ super() }} 24 | {% endblock %} 25 | 26 | {% block additional_footer %} 27 | {{ super() }}{% include "idbr_footer.html" %} 28 | {% endblock %} 29 | -------------------------------------------------------------------------------- /indra_db_service/templates/idbr_template.html: -------------------------------------------------------------------------------- 1 | {% extends "indra/template.html" %} 2 | {% from "auth_macros.html" import login_overlay %} 3 | 4 | {% macro nav_header(identity) -%} 5 | 39 | 50 | 55 | 93 | {%- endmacro %} 94 | 95 | {% block title %} 96 | INDRA Database 97 | {% endblock %} 98 | 99 | {% block navbar %} 100 | {{ nav_header(identity) }} 101 | {% endblock %} 102 | 103 | 104 | {% block body %} 105 | {{ login_overlay() }} 106 | {{ super() }} 107 | {% endblock %} 108 | 109 | {% block footer_desc %} 110 |

{% include "idbr_description.html" %}

111 | {% endblock %} 112 | -------------------------------------------------------------------------------- /indra_db_service/templates/search.html: -------------------------------------------------------------------------------- 1 | {% extends "idbr_template.html" %} 2 | 3 | {% block scripts %} 4 | {{ super() }} 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 63 | 64 | {% endblock %} 65 | 66 | {% block body %} 67 | {{ super() }} 68 |
69 | 70 |
71 | 92 | {% endblock %} 93 | -------------------------------------------------------------------------------- /indra_db_service/templates/search_statements.html: -------------------------------------------------------------------------------- 1 | {% extends "idbr_template.html" %} 2 | 3 | {% block scripts %} 4 | {{ super() }} 5 | 6 | 77 | {% endblock %} 78 | 79 | {% block body %} 80 | {{ super() }} 81 |

{{ message }}

82 |
83 | Mandatory Parameters
Enter subject and object 84 |
85 |
86 | 88 |
89 |
90 | 92 |
93 |
94 | or enter agents separated by space 95 |
96 |
97 | 100 |
101 |
102 |
Optional parameters
Statement type (see the INDRA documentation for more info on statement 105 | types.) 106 |
107 |
108 | 110 |
111 |
112 | Number of statements returned (max 1000) 113 |
114 |
115 |
117 | 119 |
120 |
121 |
122 | Evidence count per statement (max 10000) 123 |
124 |
125 |
127 | 129 |
130 |
131 |
132 |
Submit 133 |
134 |
135 | 138 |
139 |
140 | 141 |
142 | {% endblock %} 143 | -------------------------------------------------------------------------------- /indra_db_service/templates/welcome.html: -------------------------------------------------------------------------------- 1 | {% extends 'idbr_template.html' %} 2 | 3 | {% block body %} 4 | Click the button to the right to sign in and get access to the DB 5 | search page 6 | 10 | {% endblock %} -------------------------------------------------------------------------------- /indra_db_service/util.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from io import StringIO 4 | from datetime import datetime 5 | 6 | from indra.assemblers.html.assembler import _format_stmt_text 7 | from indra_db.client import stmt_from_interaction 8 | 9 | from indra_db.client.readonly.query import gilda_ground 10 | 11 | logger = logging.getLogger('db rest api - util') 12 | 13 | 14 | class DbAPIError(Exception): 15 | pass 16 | 17 | 18 | class NoGroundingFound(DbAPIError): 19 | pass 20 | 21 | 22 | def get_s3_client(): 23 | import boto3 24 | from botocore import config 25 | return boto3.client('s3', boto3.session.Session().region_name, 26 | config=config.Config(s3={'addressing_style': 'path'})) 27 | 28 | # ============================================== 29 | # Define some utilities used to resolve queries. 30 | # ============================================== 31 | 32 | 33 | def process_agent(agent_param): 34 | """Get the agent id and namespace from an input param.""" 35 | 36 | if not agent_param.endswith('@TEXT'): 37 | param_parts = agent_param.split('@') 38 | if len(param_parts) == 2: 39 | ag, ns = param_parts 40 | elif len(param_parts) == 1: 41 | ns = 'NAME' 42 | ag = param_parts[0] 43 | else: 44 | raise DbAPIError('Unrecognized agent spec: \"%s\"' % agent_param) 45 | else: 46 | ag = agent_param[:-5] 47 | ns = 'TEXT' 48 | 49 | if ns == 'HGNC-SYMBOL': 50 | ns = 'NAME' 51 | 52 | logger.info("Resolved %s to ag=%s, ns=%s" % (agent_param, ag, ns)) 53 | return ag, ns 54 | 55 | 56 | def process_mesh_term(mesh_term): 57 | """Use gilda to translate a mesh term into a MESH ID if possible.""" 58 | if mesh_term is None: 59 | return mesh_term 60 | 61 | # Check to see if this is a mesh ID. 62 | if any(mesh_term.startswith(c) for c in ['D', 'C']) \ 63 | and mesh_term[1:].isdigit(): 64 | return mesh_term 65 | 66 | # Try to ground the term. 
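# gilda_ground returns a list of scored matches for the term; the first match
# grounded to the MESH namespace is returned as the ID below, and
# NoGroundingFound is raised if no MESH grounding is among the results.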
67 | results = gilda_ground(mesh_term) 68 | for res in results: 69 | if res['term']['db'] == 'MESH': 70 | logger.info(f"Auto-mapped {mesh_term} to {res['term']['id']} " 71 | f"({res['term']['entry_name']}) using Gilda.") 72 | return res['term']['id'] 73 | raise NoGroundingFound(f"Could not find MESH id for {mesh_term} among " 74 | f"gilda results:\n{json.dumps(results, indent=2)}") 75 | 76 | 77 | def get_source(ev_json): 78 | notes = ev_json.get('annotations') 79 | if notes is None: 80 | return 81 | src = notes.get('content_source') 82 | if src is None: 83 | return 84 | return src.lower() 85 | 86 | 87 | def sec_since(t): 88 | return (datetime.now() - t).total_seconds() 89 | 90 | 91 | class LogTracker(object): 92 | log_path = '.rest_api_tracker.log' 93 | 94 | def __init__(self): 95 | root_logger = logging.getLogger() 96 | self.stream = StringIO() 97 | sh = logging.StreamHandler(self.stream) 98 | formatter = logging.Formatter('%(levelname)s: %(name)s %(message)s') 99 | sh.setFormatter(formatter) 100 | sh.setLevel(logging.WARNING) 101 | root_logger.addHandler(sh) 102 | self.root_logger = root_logger 103 | return 104 | 105 | def get_messages(self): 106 | conts = self.stream.getvalue() 107 | print(conts) 108 | ret = conts.splitlines() 109 | return ret 110 | 111 | def get_level_stats(self): 112 | msg_list = self.get_messages() 113 | ret = {} 114 | for msg in msg_list: 115 | level = msg.split(':')[0] 116 | if level not in ret.keys(): 117 | ret[level] = 0 118 | ret[level] += 1 119 | return ret 120 | 121 | 122 | def iter_free_agents(query_dict): 123 | agent_keys = {k for k in query_dict.keys() if k.startswith('agent')} 124 | for k in agent_keys: 125 | entry = query_dict.pop(k) 126 | if isinstance(entry, list): 127 | for agent in entry: 128 | yield agent 129 | else: 130 | yield entry 131 | 132 | 133 | def _make_english_from_meta(interaction): 134 | stmt_type = interaction.get('type') 135 | agent_json = interaction['agents'] 136 | if stmt_type is None: 137 | if len(agent_json) == 0: 138 | eng = '' 139 | else: 140 | ag_list = list(agent_json.values()) 141 | eng = f'{ag_list[0]}' 142 | if len(agent_json) > 1: 143 | eng += ' affects ' + f'{ag_list[1]}' 144 | if len(agent_json) > 3: 145 | eng += ', ' \ 146 | + ', '.join(f'{ag}' 147 | for ag in ag_list[2:-1]) 148 | if len(agent_json) > 2: 149 | eng += ', and ' + f'{ag_list[-1]}' 150 | else: 151 | eng += ' is modified' 152 | else: 153 | eng = _format_stmt_text(stmt_from_interaction(interaction)) 154 | return eng -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | def main(): 5 | packages = find_packages() 6 | print("Installing `indra_db` Packages:\n", '\n'.join(packages)) 7 | extras_require = {'test': ['nose', 'coverage', 'python-coveralls', 8 | 'nose-timer'], 9 | 'service': ['flask', 'flask-jwt-extended', 'flask-cors', 10 | 'flask-compress', 'numpy'], 11 | 'cli': ['click', 'boto3'], 12 | 'copy': ['pgcopy'], 13 | 'misc': ['matplotlib', 'numpy']} 14 | extras_require['all'] = list({dep for deps in extras_require.values() 15 | for dep in deps}) 16 | setup(name='indra_db', 17 | version='0.0.1', 18 | description='INDRA Database', 19 | long_description='INDRA Database', 20 | url='https://github.com/indralab/indra_db', 21 | author='Patrick Greene', 22 | author_email='patrick_greene@hms.harvard.edu', 23 | packages=packages, 24 | include_package_data=True, 25 | 
install_requires=['sqlalchemy<1.4', 'psycopg2', 'cachetools', 26 | 'termcolor', 'bs4', 'pyyaml'], 27 | extras_require=extras_require, 28 | entry_points=""" 29 | [console_scripts] 30 | indra-db=indra_db.cli:main 31 | indra-db-service=indra_db_service.cli:main 32 | indra-db-benchmarker=benchmarker.cli:main 33 | """) 34 | 35 | 36 | if __name__ == '__main__': 37 | main() 38 | --------------------------------------------------------------------------------
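Given the extras and console-script entry points declared in `setup.py` above, a local development install might look like the following sketch (assuming a source checkout of the repository and that the CLIs expose the usual `--help`):

```
pip install -e ".[service,cli]"

# The console scripts declared under entry_points are then available:
indra-db --help
indra-db-service --help
indra-db-benchmarker --help
```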