├── .github └── workflows │ └── docs.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── docker-compose.yml ├── docs ├── .nojekyll ├── Makefile ├── conf.py ├── configuration.rst ├── faq.rst ├── images │ ├── architecture.png │ ├── expand.jpg │ ├── icondark.png │ ├── iconlogodark.png │ └── manual.jpg ├── index.rst ├── make.bat ├── requirements.txt └── usage.rst ├── neuralqa ├── __init__.py ├── cli.py ├── config_default.yaml ├── expander │ ├── __init__.py │ ├── expander.py │ ├── expanderpool.py │ └── mlmexpander.py ├── reader │ ├── __init__.py │ ├── bertreader.py │ ├── reader.py │ └── readerpool.py ├── retriever │ ├── __init__.py │ ├── elasticsearchretriever.py │ ├── retriever.py │ ├── retrieverpool.py │ └── solrretriever.py ├── server │ ├── __init__.py │ ├── routehandlers.py │ ├── routemodels.py │ ├── serve.py │ ├── server_app.py │ └── ui │ │ ├── .gitignore │ │ ├── README.md │ │ ├── build │ │ ├── android-chrome-192x192.png │ │ ├── android-chrome-512x512.png │ │ ├── apple-touch-icon.png │ │ ├── asset-manifest.json │ │ ├── favicon-16x16.png │ │ ├── favicon-32x32.png │ │ ├── favicon.ico │ │ ├── images │ │ │ └── icon.png │ │ ├── index.html │ │ ├── libs │ │ │ └── leader-line.min.js │ │ ├── logo152.png │ │ ├── manifest.json │ │ ├── precache-manifest.f2ddb522e87f24d57699361b5d062612.js │ │ ├── robots.txt │ │ ├── service-worker.js │ │ └── static │ │ │ ├── css │ │ │ ├── main.0d7f6602.chunk.css │ │ │ └── main.0d7f6602.chunk.css.map │ │ │ └── js │ │ │ ├── 2.17f05cd8.chunk.js │ │ │ ├── 2.17f05cd8.chunk.js.LICENSE.txt │ │ │ ├── 2.17f05cd8.chunk.js.map │ │ │ ├── main.32abfeaf.chunk.js │ │ │ ├── main.32abfeaf.chunk.js.map │ │ │ ├── runtime-main.985d1449.js │ │ │ └── runtime-main.985d1449.js.map │ │ ├── package.json │ │ ├── public │ │ ├── android-chrome-192x192.png │ │ ├── android-chrome-512x512.png │ │ ├── apple-touch-icon.png │ │ ├── favicon-16x16.png │ │ ├── favicon-32x32.png │ │ ├── favicon.ico │ │ ├── images │ │ │ └── icon.png │ │ ├── index.html │ │ ├── libs │ │ │ └── leader-line.min.js │ │ ├── logo152.png │ │ ├── manifest.json │ │ └── robots.txt │ │ ├── src │ │ ├── components │ │ │ ├── Main.js │ │ │ ├── barviz │ │ │ │ ├── BarViz.jsx │ │ │ │ └── barviz.css │ │ │ ├── expandview │ │ │ │ ├── ExpandView.jsx │ │ │ │ ├── ex.json │ │ │ │ └── expandview.css │ │ │ ├── explainview │ │ │ │ ├── ExplainView.jsx │ │ │ │ ├── ex.json │ │ │ │ └── explainview.css │ │ │ ├── footer │ │ │ │ ├── Footer.jsx │ │ │ │ └── footer.css │ │ │ ├── header │ │ │ │ ├── Header.jsx │ │ │ │ └── header.css │ │ │ ├── helperfunctions │ │ │ │ └── HelperFunctions.jsx │ │ │ ├── queryview │ │ │ │ ├── QueryView.jsx │ │ │ │ └── queryview.css │ │ │ ├── template.css │ │ │ ├── template.scss │ │ │ └── testview │ │ │ │ ├── TestView.jsx │ │ │ │ ├── aapl.csv │ │ │ │ ├── ex.json │ │ │ │ └── testview.css │ │ ├── index.js │ │ ├── serviceWorker.js │ │ └── setupTests.js │ │ └── yarn.lock ├── utils │ ├── __init__.py │ ├── cli_args.py │ ├── config_utils.py │ ├── data_utils.py │ └── file_utils.py └── version.py ├── notes.md ├── requirements.txt ├── setup.cfg ├── setup.py └── tests ├── expander └── test_expander.py ├── reader └── test_reader.py └── retriever └── test_retriever.py /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: docs 2 | 3 | # Controls when the action will run. 
Triggers the workflow on push or pull request 4 | # events but only for the master branch 5 | on: 6 | push: 7 | branches: [master] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | 16 | # Example of using a custom build-command. 17 | - uses: ammaraskar/sphinx-action@master 18 | with: 19 | docs-folder: "docs/" 20 | 21 | # Create an artifact of the html output. 22 | - uses: actions/upload-artifact@v1 23 | with: 24 | name: DocumentationHTML 25 | path: docs/_build/html/ 26 | 27 | # Publish built docs to gh-pages branch. 28 | # =============================== 29 | - name: Commit documentation changes 30 | run: | 31 | git clone https://github.com/victordibia/neuralqa.git --branch gh-pages --single-branch gh-pages 32 | cp -r docs/_build/html/* gh-pages/ 33 | cd gh-pages 34 | git config --local user.email "action@github.com" 35 | git config --local user.name "GitHub Action" 36 | git add . 37 | git commit -m "Update documentation" -a || true 38 | # The above command will fail if no changes were present, so we ignore 39 | # that. 40 | - name: Push changes 41 | uses: ad-m/github-push-action@master 42 | with: 43 | branch: gh-pages 44 | directory: gh-pages 45 | github_token: ${{ secrets.GITHUB_TOKEN }} 46 | # =============================== 47 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | data 3 | neuralqa/data 4 | docs/images/extra 5 | config.yaml 6 | .vscode 7 | neuralqa/server/test_query.py 8 | 9 | # Byte-compiled / optimized / DLL files 10 | __pycache__/ 11 | *.py[cod] 12 | *$py.class 13 | 14 | # C extensions 15 | *.so 16 | 17 | # Distribution / packaging 18 | .Python 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | wheels/ 31 | pip-wheel-metadata/ 32 | share/python-wheels/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | MANIFEST 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .nox/ 52 | .coverage 53 | .coverage.* 54 | .cache 55 | nosetests.xml 56 | coverage.xml 57 | *.cover 58 | *.py,cover 59 | .hypothesis/ 60 | .pytest_cache/ 61 | 62 | # Translations 63 | *.mo 64 | *.pot 65 | 66 | # Django stuff: 67 | *.log 68 | local_settings.py 69 | db.sqlite3 70 | db.sqlite3-journal 71 | 72 | # Flask stuff: 73 | instance/ 74 | .webassets-cache 75 | 76 | # Scrapy stuff: 77 | .scrapy 78 | 79 | # Sphinx documentation 80 | docs/_build/ 81 | 82 | # PyBuilder 83 | target/ 84 | 85 | # Jupyter Notebook 86 | .ipynb_checkpoints 87 | 88 | # IPython 89 | profile_default/ 90 | ipython_config.py 91 | 92 | # pyenv 93 | .python-version 94 | 95 | # pipenv 96 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 97 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 98 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 99 | # install all needed dependencies. 100 | #Pipfile.lock 101 | 102 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 103 | __pypackages__/ 104 | 105 | # Celery stuff 106 | celerybeat-schedule 107 | celerybeat.pid 108 | 109 | # SageMath parsed files 110 | *.sage.py 111 | 112 | # Environments 113 | .env 114 | .venv 115 | env/ 116 | venv/ 117 | ENV/ 118 | env.bak/ 119 | venv.bak/ 120 | 121 | # Spyder project settings 122 | .spyderproject 123 | .spyproject 124 | 125 | # Rope project settings 126 | .ropeproject 127 | 128 | # mkdocs documentation 129 | /site 130 | 131 | # mypy 132 | .mypy_cache/ 133 | .dmypy.json 134 | dmypy.json 135 | 136 | # Pyre type checker 137 | .pyre/ 138 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:20.04 2 | 3 | COPY . . 4 | 5 | RUN apt-get update && \ 6 | apt-get -y upgrade && \ 7 | apt-get -y install python3 && \ 8 | apt-get -y install python3-pip && \ 9 | pip3 install neuralqa && \ 10 | apt-get -y install wget && \ 11 | wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.8.0-amd64.deb && \ 12 | wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.8.0-amd64.deb.sha512 && \ 13 | shasum -a 512 -c elasticsearch-7.8.0-amd64.deb.sha512 && \ 14 | dpkg -i elasticsearch-7.8.0-amd64.deb && \ 15 | service elasticsearch start && \ 16 | sleep 30 && \ 17 | 18 | EXPOSE 80 19 | 20 | CMD ["neuralqa", "--host", "0.0.0.0", "--port", "80"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Victor Dibia 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## NeuralQA: A Usable Library for (Extractive) Question Answering on Large Datasets with BERT 2 | 3 | [![License: MIT](https://img.shields.io/github/license/victordibia/neuralqa)](https://opensource.org/licenses/MIT) 4 | ![docs](https://github.com/victordibia/neuralqa/workflows/docs/badge.svg?style=flat-square) 5 | 6 | > Still in **alpha**, lots of changes anticipated. View demo on [neuralqa.fastforwardlabs.com](https://neuralqa.fastforwardlabs.com/#/). 
7 | 8 | 9 | 10 | `NeuralQA` provides an easy to use api and visual interface for Extractive Question Answering (QA), 11 | on large datasets. The QA process is comprised of two main stages - **Passage retrieval (Retriever)** is implemented using [ElasticSearch](https://www.elastic.co/downloads/elasticsearch) 12 | and **Document Reading (Reader)** is implemented using pretrained BERT models via the 13 | Huggingface [Transformers](https://github.com/huggingface/transformers) api. 14 | 15 | ## Usage 16 | 17 | ```shell 18 | pip3 install neuralqa 19 | ``` 20 | 21 | Create (or navigate to) a folder you would like to use with NeuralQA. Run the following command line instruction within that folder. 22 | 23 | ```shell 24 | neuralqa ui --port 4000 25 | ``` 26 | 27 | navigate to [http://localhost:4000/#/](http://localhost:4000/#/) to view the NeuralQA interface. Learn about other command line options in the documentation [here](https://victordibia.github.io/neuralqa/usage.html#command-line-options) or how to [configure](https://victordibia.github.io/neuralqa/configuration.html) NeuralQA to use your own reader models or retriever instances. 28 | 29 | > Note: To use NeuralQA with a retriever such as ElasticSearch, follow the [instructions here](https://www.elastic.co/downloads/elasticsearch) to download, install, and launch a local elasticsearch instance and add it to your config.yaml file. 30 | 31 | ### How Does it Work? 32 | 33 | 34 | 35 | NeuralQA is comprised of several high level modules: 36 | 37 | - **Retriever**: For each search query (question), scan an index (elasticsearch), and retrieve a list of candidate matched passages. 38 | 39 | - **Reader**: For each retrieved passage, a BERT based model predicts a span that contains the answer to the question. In practice, retrieved passages may be lengthy and BERT based models can process a maximum of 512 tokens at a time. NeuralQA handles this in two ways. Lengthy passages are chunked into smaller sections with a configurable stride. Secondly, NeuralQA offers the option of extracting a subset of relevant snippets (RelSnip) which a BERT reader can then scan to find answers. Relevant snippets are portions of the retrieved document that contain exact match results for the search query. 40 | 41 | - **Expander**: Methods for generating additional (relevant) query terms to improve recall. Currently, we implement Contextual Query Expansion using finetuned Masked Language Models. This is implemented via a user in the loop flow where the user can choose to include any suggested expansion terms. 42 | 43 | 44 | 45 | - **User Interface**: NeuralQA provides a visual user interface for performing queries (manual queries where question and context are provided as well as queries over a search index), viewing results and also sensemaking of results (reranking of passages based on answer scores, highlighting keyword match, model explanations). 46 | 47 | ## Configuration 48 | 49 | Properties of modules within NeuralQA (ui, retriever, reader, expander) can be specified via a [yaml configuration](neuralqa/config_default.yaml) file. When you launch the ui, you can specify the path to your config file `--config-path`. If this is not provided, NeuralQA will search for a config.yaml in the current folder or create a [default copy](neuralqa/config_default.yaml)) in the current folder. 
Sample configuration shown below: 50 | 51 | ```yaml 52 | ui: 53 | queryview: 54 | intro: 55 | title: "NeuralQA: Question Answering on Large Datasets" 56 | subtitle: "Subtitle of your choice" 57 | views: # select sections of the ui to hide or show 58 | intro: True 59 | advanced: True 60 | samples: False 61 | passages: True 62 | explanations: True 63 | allanswers: True 64 | options: # values for advanced options 65 | stride: .. 66 | maxpassages: .. 67 | highlightspan: .. 68 | 69 | header: # header tile for ui 70 | appname: NeuralQA 71 | appdescription: Question Answering on Large Datasets 72 | 73 | reader: 74 | title: Reader 75 | selected: twmkn9/distilbert-base-uncased-squad2 76 | options: 77 | - name: DistilBERT SQUAD2 78 | value: twmkn9/distilbert-base-uncased-squad2 79 | type: distilbert 80 | - name: BERT SQUAD2 81 | value: deepset/bert-base-cased-squad2 82 | type: bert 83 | ``` 84 | 85 | ## Documentation 86 | 87 | An attempt is being made to better document NeuralQA here - [https://victordibia.github.io/neuralqa/](https://victordibia.github.io/neuralqa/). 88 | 89 | ## Citation 90 | 91 | A paper introducing NeuralQA and its components can be [found here](https://arxiv.org/abs/2007.15211). 92 | 93 | ``` 94 | @article{dibia2020neuralqa, 95 | title={NeuralQA: A Usable Library for Question Answering (Contextual Query Expansion + BERT) on Large Datasets}, 96 | author={Victor Dibia}, 97 | year={2020}, 98 | journal={Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP): System Demonstrations} 99 | } 100 | ``` 101 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | neuralqa_docker: 4 | build: . 5 | expose: 6 | - 80 7 | -------------------------------------------------------------------------------- /docs/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/victordibia/neuralqa/fb48f4d45d5856195baef25b4707e7b282cc364d/docs/.nojekyll -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. 
For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | 16 | sys.path.insert(0, os.path.abspath('../../neuralqa/')) 17 | 18 | 19 | # -- Project information ----------------------------------------------------- 20 | 21 | project = 'NeuralQA' 22 | copyright = '2020, Victor Dibia' 23 | author = 'Victor Dibia' 24 | 25 | # The full version, including alpha/beta/rc tags 26 | release = '0.0.16a' 27 | 28 | # set master doc 29 | master_doc = 'index' 30 | 31 | # -- General configuration --------------------------------------------------- 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 35 | # ones. 36 | extensions = ['sphinx.ext.autodoc'] 37 | 38 | # Add any paths that contain templates here, relative to this directory. 39 | templates_path = ['_templates'] 40 | 41 | # List of patterns, relative to source directory, that match files and 42 | # directories to ignore when looking for source files. 43 | # This pattern also affects html_static_path and html_extra_path. 44 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 45 | 46 | 47 | # -- Options for HTML output ------------------------------------------------- 48 | 49 | # The theme to use for HTML and HTML Help pages. See the documentation for 50 | # a list of builtin themes. 51 | # 52 | html_theme = 'sphinx_rtd_theme' 53 | 54 | # Add any paths that contain custom static files (such as style sheets) here, 55 | # relative to this directory. They are copied after the builtin static files, 56 | # so a file named "default.css" will overwrite the builtin "default.css". 57 | html_static_path = [] 58 | -------------------------------------------------------------------------------- /docs/configuration.rst: -------------------------------------------------------------------------------- 1 | Configuration 2 | ================ 3 | 4 | 5 | ``NeuralQA`` provides an interface to specify properties of each module (ui, retriever, reader, expander) via a `yaml configuration `_ file. When you launch the ui, you can specify the path to your config file `--config-path`. If this is not provided, NeuralQA will search for a config.yaml in the current folder or create a [default copy](neuralqa/config_default.yaml)) in the current folder. Sample configuration for the UI is shown below: 6 | 7 | 8 | UI Configuration 9 | ************************** 10 | 11 | The code snippet below shows how you can configure parts of the main user interface for ``NeuralQA``. 12 | 13 | .. note:: 14 | You will need to restart ``NeuralQA`` each time you make a change to config.yaml. 15 | You can show/hide sections of the UI e.g. show/hide retrieved passages, show only top answer or all answers, show or hide the advanced options view etc. You can also change the default title and description of the page. 16 | 17 | 18 | .. 
code-block:: yaml 19 | 20 | ui: 21 | header: 22 | appname: NeuralQA 23 | appdescription: Question Answering on Large Datasets 24 | queryview: 25 | intro: 26 | title: "NeuralQA: Question Answering on Large Datasets" 27 | subtitle: "NeuralQA is an interactive tool for question answering (passage retrieval + document reading). You can manually provide a passage or select a search index from (e.g. case.law ) dataset under the QA configuration settings below. To begin, type in a question query below." 28 | disclaimer: " .. " 29 | views: 30 | intro: True 31 | advanced: True # if false, default retriever/reader settings will be used. 32 | samples: True # show/hide sample question answer pairs 33 | passages: True # show/hide passages which are retrieved 34 | explanations: True # show/hide explanations button 35 | allanswers: True # show all answers or just the best answer (based on probability score) 36 | expander: False # show or hide the expander dropdown. 37 | options: 38 | stride: 39 | title: Token Stride 40 | selected: 0 41 | options: 42 | - name: 0 43 | value: 0 44 | - name: 50 45 | value: 50 46 | - name: 100 47 | value: 100 48 | - name: 200 49 | value: 200 50 | maxdocuments: 51 | title: Max Documents 52 | selected: 5 53 | options: 54 | - name: 5 55 | value: 5 56 | - name: 10 57 | value: 10 58 | - name: 15 59 | value: 15 60 | highlightspan: 61 | title: Highlight Span 62 | selected: 250 63 | options: 64 | - name: 150 65 | value: 150 66 | - name: 250 67 | value: 250 68 | - name: 350 69 | value: 350 70 | - name: 450 71 | value: 450 72 | - name: 650 73 | value: 650 74 | samples: 75 | 76 | 77 | 78 | Reader Configuration 79 | ************************** 80 | 81 | You can configure the reader models that are available for use with the ``NeuralQA`` api and web interface. Because ``NeuralQA`` uses the HuggingFace api, reader models can be specified using either the path to a hosted HuggingFace model or the path to a local folder on disk contained a trained HuggingFace model. 82 | 83 | .. code-block:: yaml 84 | 85 | reader: 86 | title: Reader 87 | selected: twmkn9/distilbert-base-uncased-squad2 #default selected reader on startup. Should correspond to the reader value 88 | options: 89 | - name: DistilBERT SQUAD2 90 | value: twmkn9/distilbert-base-uncased-squad2 91 | type: distilbert 92 | - name: BERT SQUAD2 93 | value: deepset/bert-base-cased-squad2 94 | type: bert 95 | - name: Medical BERT SQUAD2 96 | value: /Users/user/Downloads/meddistilbert # example path to a local model on disk 97 | type: bert 98 | 99 | 100 | 101 | 102 | 103 | Memory Requirements 104 | ************************** 105 | 106 | To enable fast user interaction, ``NeuralQA`` loads models weights that are specified in `config.yaml` into memory when the application is launched. For example, if 3 Bert base reader models (~425MB each) specified in `config.yaml`, each of these will be loaded into memory. Similarly all specified query expansion models will also be loaded into memory. Plan to account for these when using ``NeuralQA``. -------------------------------------------------------------------------------- /docs/faq.rst: -------------------------------------------------------------------------------- 1 | .. NeuralQA documentation master file, created by 2 | sphinx-quickstart on Fri Jul 3 22:14:37 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 
5 | 6 | FAQ 7 | ========================= 8 | 9 | - In general versions of neuralqa may introduce changes config.yaml structure. A good first debug step is to delete your current config.yaml and rebuild it. -------------------------------------------------------------------------------- /docs/images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/victordibia/neuralqa/fb48f4d45d5856195baef25b4707e7b282cc364d/docs/images/architecture.png -------------------------------------------------------------------------------- /docs/images/expand.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/victordibia/neuralqa/fb48f4d45d5856195baef25b4707e7b282cc364d/docs/images/expand.jpg -------------------------------------------------------------------------------- /docs/images/icondark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/victordibia/neuralqa/fb48f4d45d5856195baef25b4707e7b282cc364d/docs/images/icondark.png -------------------------------------------------------------------------------- /docs/images/iconlogodark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/victordibia/neuralqa/fb48f4d45d5856195baef25b4707e7b282cc364d/docs/images/iconlogodark.png -------------------------------------------------------------------------------- /docs/images/manual.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/victordibia/neuralqa/fb48f4d45d5856195baef25b4707e7b282cc364d/docs/images/manual.jpg -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. NeuralQA documentation master file, created by 2 | sphinx-quickstart on Fri Jul 3 22:14:37 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | NeuralQA Documentation 7 | ========================= 8 | 9 | 10 | ``NeuralQA`` provides an easy to use api and visual interface for Question Answering (QA), 11 | on large datasets. The QA process is comprised of two main stages - **Passage retrieval (Retriever)** which is implemented using `ElasticSearch `_ 12 | and **Document Reading (Reader)** which is implemented using pretrained BERT models via the 13 | Huggingface `Transformers `_ api. 14 | 15 | You can install it via pip: 16 | 17 | .. code-block:: shell 18 | 19 | pip3 install neuralqa 20 | 21 | Launch the web interface via command line: 22 | 23 | .. code-block:: shell 24 | 25 | neuralqa ui --port 5000 26 | 27 | You can also clone this repository, make changes and launch the application directly from repository: 28 | 29 | .. code-block:: shell 30 | 31 | python neuralqa/cli.py ui 32 | 33 | 34 | 35 | .. image:: https://raw.githubusercontent.com/victordibia/neuralqa/master/docs/images/manual.jpg 36 | :width: 100% 37 | :alt: NeuralQA User Interface Screenshot 38 | 39 | 40 | 41 | Why NeuralQA? 42 | ********************* 43 | 44 | The goal of NeuralQA is to provide the quickest path to exploring QA with as little changes as possible to your current infrastructure. 45 | 46 | NeuralQA is helpful in a few ways: 47 | 48 | - A visual interface for sensemaking of model results. 
49 | - A rest api for QA related operations (retrieval, document reading, model explanation). 50 | - Helpful implementations that *can* improve the QA process 51 | - RelSnip (Relevant Snippets): The content of retrieved documents can be lengthy, incurring high latency costs for a docuement reader to process the entire document. RelSnip constructs a smaller passage by concatenating subsections of the original documents that contain exact keyword matches for the query. This set of relevant snippets is then processed by the document reader. 52 | - Query expansion: Sparse representation retrievers like BM25 and TFIDF (implemented in ElasticSearch) rely on exact query keyword matching. This can be problematic if a different vocabulary is used in the documents to express the same content. To help address this, NeuralQA can rewrite the query to integrate additional keywords to increase the set of relevant retrieved documents. 53 | - Configurable via a yaml configuration file. 54 | 55 | - Bring your own QA reader. You can select from the gallery of QA models provided by HuggingFace or provide your own finetuned HuggingFace model. 56 | - Bring your own retriever. You can attach NeuralQA to an existing retriever instance (elasticsearch) and configure retriever queries. 57 | - Configure the visual interface. 58 | 59 | - Show/hide views: e.g. show/hide retrieved passages, show only top answer or all answers, show sample questions etc. 60 | - Show/hide controls: e.g. show/hide controls for selected retriever, reader etc. 61 | - Content: You can rename the title and descriptions as needed. 62 | 63 | NeuralQA is created to be helpful for two groups of users: 64 | 65 | - **Hobbyists**: Try out QA models on your own data or retriever setup and visually inspect the results. 66 | - **Teams**: Provide a front facing QA interface for your end users on your retriever instances. You can create docker containers that run NeuralQA for scale and configure them with your retriever instance clusters. 67 | 68 | How It Works 69 | ********************** 70 | 71 | .. image:: https://raw.githubusercontent.com/victordibia/neuralqa/master/docs/images/architecture.png 72 | :width: 100% 73 | :alt: NeuralQA Architecture 74 | 75 | NeuralQA is comprised of several high level modules: 76 | 77 | 78 | - **Retriever**: For each search query (question), scan an index (elasticsearch), and retrieve a list of candidate matched passages. 79 | 80 | - **Reader**: For each retrieved passage, a BERT based model predicts a span that contains the answer to the question. In practice, retrieved passages may be lengthy and BERT based models can process a maximum of 512 tokens at a time. NeuralQA handles this in two ways. Lengthy passages are chunked into smaller sections with a configurable stride. Secondly, NeuralQA offers the option of extracting a subset of relevant snippets (RelSnip) which a BERT reader can then scan to find answers. Relevant snippets are portions of the retrieved document that contain exact match results for the search query. 81 | 82 | - **Expander**: Methods for generating additional (relevant) query terms to improve recall. Currently, we implement Contextual Query Expansion using finetuned Masked Language Models. 
83 | 84 | - **User Interface**: NeuralQA provides a visual user interface for performing queries (manual queries where question and context are provided as well as queries over a search index), viewing results and also sensemaking of results (reranking of passages based on answer scores, highlighting keyword match, model explanations). 85 | 86 | 87 | Citation 88 | ********************** 89 | A paper introducing NeuralQA and its components can be `found here `_. 90 | 91 | .. code-block:: 92 | 93 | @article{dibia2020neuralqa, 94 | title={NeuralQA: A Usable Library for Question Answering (Contextual Query Expansion + BERT) on Large Datasets}, 95 | author={Victor Dibia}, 96 | year={2020}, 97 | journal={arXiv preprint arXiv:2007.15211} 98 | } 99 | 100 | 101 | .. toctree:: 102 | :maxdepth: 3 103 | :caption: Contents: 104 | 105 | self 106 | usage 107 | configuration 108 | faq 109 | 110 | 111 | .. Indices and tables 112 | .. ================== 113 | 114 | .. * :ref:`genindex` 115 | .. * :ref:`modindex` 116 | .. * :ref:`search` 117 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx_rtd_theme -------------------------------------------------------------------------------- /docs/usage.rst: -------------------------------------------------------------------------------- 1 | Usage 2 | ============= 3 | 4 | Installation 5 | ******************* 6 | 7 | ``NeuralQA`` can be installed via `pip` using the following command: 8 | 9 | .. code-block:: shell 10 | 11 | pip3 install neuralqa 12 | 13 | 14 | Viewing the UI 15 | ************************************************** 16 | 17 | .. code-block:: shell 18 | 19 | neuralqa ui --port 5000 --workers 1 20 | 21 | 22 | .. note:: 23 | ``NeuralQA`` uses the `uvicorn `_ asgi webserver with support for multiple workers (use the worker flag to set the number of worker processes). Note that model weights used by ``NeuralQA`` are loaded into memory on status *for each thread*. 24 | 25 | .. image:: https://raw.githubusercontent.com/victordibia/neuralqa/master/docs/images/manual.jpg 26 | :width: 100% 27 | :alt: NeuralQA User Interface Screenshot 28 | 29 | 30 | 31 | Command Line Options 32 | ********************************* 33 | 34 | The primary command for ``NeuralQA`` is `neuralqa ui`, used to launch the web interface. 
Use the following command to view the available options. 35 | 36 | .. code-block:: shell 37 | 38 | neuralqa ui --help 39 | 40 | .. code-block:: shell 41 | 42 | Usage: neuralqa ui [OPTIONS] 43 | 44 | This command launches the web interface for NeuralQA. 45 | 46 | Options: 47 | -h, --host TEXT The network address to listen on (default: 48 | 127.0.0.1). Use 0.0.0.0 to bind to all addresses if 49 | you want to access the tracking server from other 50 | machines. 51 | 52 | -p, --port INTEGER The port to listen on (default: 5000). 53 | -w, --workers INTEGER Number of uviicorn worker processes to handle 54 | requests (default: 1). 55 | 56 | -cp, --config-path TEXT Path to a yaml file containing config for neuralqa. 57 | If none is provided, the default config.yaml is 58 | copied to the current directory. 59 | 60 | --help Show this message and exit. 61 | 62 | 63 | 64 | 65 | 66 | Rest API Docs 67 | ***************************************** 68 | 69 | The rest api for ``NeuralQA`` is implemented using `FastAPI `_. This means you do get excellent documentation for free. In your browser, type the following: 70 | 71 | 72 | .. code-block:: shell 73 | 74 | localhost:port/api/docs 75 | 76 | 77 | 78 | 79 | Loading Sample Data 80 | ***************************************** 81 | 82 | ``NeuralQA`` provides a method to download and import sample data (court case documents) into a local elasticsearch index. The command below will import the first 10,000 documents from the `case law dataset `_ for new mexico. 83 | 84 | 85 | .. code-block:: shell 86 | 87 | neuralqa load --max-docs 10000 -------------------------------------------------------------------------------- /neuralqa/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from neuralqa.version import VERSION as __version__ 3 | from neuralqa.reader import BERTReader 4 | from neuralqa.utils import import_sample_data 5 | 6 | 7 | logging.getLogger("transformers").setLevel(logging.ERROR) 8 | logging.getLogger("tensorflow").setLevel(logging.ERROR) 9 | logging.getLogger("elasticsearch").setLevel(logging.CRITICAL) 10 | 11 | __all__ = ["BERTReader", "import_sample_data"] 12 | -------------------------------------------------------------------------------- /neuralqa/cli.py: -------------------------------------------------------------------------------- 1 | import click 2 | from neuralqa.server import launch_server 3 | from neuralqa.utils import cli_args 4 | from neuralqa.utils import import_sample_data, ConfigParser 5 | import os 6 | from neuralqa.retriever import RetrieverPool 7 | import logging 8 | 9 | 10 | @click.group() 11 | @click.version_option() 12 | def cli(): 13 | pass 14 | 15 | 16 | # @cli.command() 17 | # @cli_args.HOST 18 | # @cli_args.PORT 19 | # @cli_args.WORKERS 20 | # @cli_args.CONFIG_PATH 21 | # def test(host, port, workers, config_path): 22 | # import_sample_data() 23 | 24 | 25 | @cli.command() 26 | @cli_args.MAX_DOCS 27 | def load(max_docs): 28 | """This command loads sample data into a local elastic search index.""" 29 | 30 | logging.basicConfig() 31 | logging.getLogger().setLevel(logging.INFO) 32 | logging.getLogger(__name__).setLevel(logging.INFO) 33 | import_sample_data(max_docs=max_docs) 34 | 35 | 36 | @cli.command() 37 | @cli_args.HOST 38 | @cli_args.PORT 39 | @cli_args.WORKERS 40 | @cli_args.CONFIG_PATH 41 | def ui(host, port, workers, config_path): 42 | """This command launches the web interface for NeuralQA.""" 43 | logging.basicConfig() 44 | 
logging.getLogger().setLevel(logging.INFO) 45 | logging.getLogger(__name__).setLevel(logging.INFO) 46 | if (config_path): 47 | os.environ["NEURALQA_CONFIG_PATH"] = config_path 48 | launch_server(host, port, workers) 49 | 50 | 51 | if __name__ == '__main__': 52 | cli() 53 | -------------------------------------------------------------------------------- /neuralqa/config_default.yaml: -------------------------------------------------------------------------------- 1 | appname: NeuralQA 2 | 3 | ui: 4 | header: 5 | appname: NeuralQA 6 | appdescription: Question Answering on Large Datasets 7 | queryview: 8 | intro: 9 | title: "NeuralQA: Question Answering on Large Datasets" 10 | subtitle: "NeuralQA is an interactive tool for question answering (passage retrieval + document reading). You can manually provide a passage or select a search index from (e.g. case.law ) dataset under the QA configuration settings below. To begin, type in a question query below." 11 | disclaimer: " .. " 12 | views: 13 | intro: True 14 | advanced: True # if false, default retriever/reader settings will be used. 15 | samples: True # show/hide sample question answer pairs 16 | passages: True # show/hide passages which are retrieved 17 | explanations: True # show/hide explanations button 18 | allanswers: False # show all answers or just the best answer (based on probability score) 19 | expander: False 20 | options: 21 | stride: 22 | title: Token Stride 23 | selected: 0 24 | options: 25 | - name: 0 26 | value: 0 27 | - name: 50 28 | value: 50 29 | - name: 100 30 | value: 100 31 | - name: 200 32 | value: 200 33 | maxdocuments: 34 | title: Max Documents 35 | selected: 5 36 | options: 37 | - name: 5 38 | value: 5 39 | - name: 10 40 | value: 10 41 | - name: 15 42 | value: 15 43 | fragmentsize: 44 | title: Fragment Size 45 | selected: 350 46 | options: 47 | - name: 350 48 | value: 350 49 | - name: 450 50 | value: 450 51 | - name: 650 52 | value: 650 53 | - name: 850 54 | value: 850 55 | 56 | retriever: 57 | title: Retriever 58 | selected: "none" 59 | 60 | options: 61 | - name: None 62 | value: "none" 63 | type: "none" 64 | 65 | # - name: Case Law 66 | # value: cases 67 | # type: elasticsearch 68 | # connection: 69 | # host: localhost 70 | # port: 9200 71 | # username: "" 72 | # password: "" 73 | # body_field: "casebody.data.opinions.text" 74 | # - name: Medical 75 | # value: medical 76 | # host: localhost 77 | # port: 9200 78 | # username: None 79 | # password: None 80 | # type: elasticsearch 81 | # fields: 82 | # body_field: context 83 | # - name: Supreme Court 84 | # value: supremecourt 85 | # host: localhost 86 | # port: 9200 87 | # username: None 88 | # password: None 89 | # type: elasticsearch 90 | # fields: 91 | # body_field: casebody 92 | readtopn: 0 93 | 94 | relsnip: 95 | title: Relsnip 96 | selected: True 97 | options: 98 | - name: Yes 99 | value: True 100 | - name: No 101 | value: False 102 | 103 | server: # webserver host and port defaults 104 | host: localhost 105 | port: 5000 106 | 107 | reader: 108 | title: Reader 109 | selected: twmkn9/distilbert-base-uncased-squad2 110 | options: 111 | - name: DistilBERT SQUAD2 112 | value: twmkn9/distilbert-base-uncased-squad2 113 | type: distilbert 114 | - name: BERT SQUAD2 115 | value: deepset/bert-base-cased-squad2 116 | type: bert 117 | # - name: Medical BERT SQUAD2 118 | # value: /Users/victordibia/Downloads/meddistilbert 119 | # type: bert 120 | 121 | expander: 122 | title: Expander 123 | selected: "none" 124 | options: 125 | - name: "None" 126 | value: "none" 127 | type: 
"none" 128 | # - name: BERT MaskedLM 129 | # type: maskedlm 130 | # value: bert-base-uncased 131 | 132 | # optional selection of samples to show in the UI 133 | samples: 134 | - question: "what is the goal of the fourth amendment?" 135 | context: 136 | The Fourth Amendment of the U.S. Constitution provides that the right of 137 | the people to be secure in their persons, houses, papers, and effects, against 138 | unreasonable searches and seizures, shall not be violated, and no Warrants shall 139 | issue, but upon probable cause, supported by Oath or affirmation, and particularly 140 | describing the place to be searched, and the persons or things to be seized.'The 141 | ultimate goal of this provision is to protect people’s right to privacy and freedom 142 | from unreasonable intrusions by the government. However, the Fourth Amendment 143 | does not guarantee protection from all searches and seizures, but only those done 144 | by the government and deemed unreasonable under the law. 145 | - question: Who was the first woman to serve on the supreme court in America 146 | context: 147 | Sandra Day O’Connor, née Sandra Day, (born March 26, 1930, El Paso, Texas, 148 | U.S.), associate justice of the Supreme Court of the United States from 1981 to 149 | 2006. She was the first woman to serve on the Supreme Court. A moderate conservative, 150 | she was known for her dispassionate and meticulously researched opinions. Sandra 151 | Day grew up on a large family ranch near Duncan, Arizona. She received undergraduate 152 | (1950) and law (1952) degrees from Stanford University, where she met the future 153 | chief justice of the United States William Rehnquist. 154 | - question: Where did Sandra Day grow up? 155 | context: 156 | Sandra Day O’Connor, née Sandra Day, (born March 26, 1930, El Paso, Texas, 157 | U.S.), associate justice of the Supreme Court of the United States from 1981 to 158 | 2006. She was the first woman to serve on the Supreme Court. A moderate conservative, 159 | she was known for her dispassionate and meticulously researched opinions. Sandra 160 | Day grew up on a large family ranch near Duncan, Arizona. She received undergraduate 161 | (1950) and law (1952) degrees from Stanford University, where she met the future 162 | chief justice of the United States William Rehnquist. 
163 | -------------------------------------------------------------------------------- /neuralqa/expander/__init__.py: -------------------------------------------------------------------------------- 1 | from .expander import * 2 | from .mlmexpander import * 3 | from .expanderpool import * 4 | -------------------------------------------------------------------------------- /neuralqa/expander/expander.py: -------------------------------------------------------------------------------- 1 | 2 | class Expander: 3 | def __init__(self, expander_type, **kwargs): 4 | self.expander_type = expander_type 5 | -------------------------------------------------------------------------------- /neuralqa/expander/expanderpool.py: -------------------------------------------------------------------------------- 1 | 2 | from neuralqa.expander import MLMExpander 3 | import logging 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class ExpanderPool(): 9 | def __init__(self, expanders): 10 | self._selected_expander = expanders["selected"] 11 | self.expander_pool = {} 12 | for expander in expanders["options"]: 13 | if (expander["type"] == "maskedlm"): 14 | self.expander_pool[expander["value"]] = MLMExpander( 15 | model_path=expander["value"]) 16 | 17 | @property 18 | def expander(self): 19 | return self.expander_pool[self.selected_expander] 20 | 21 | @property 22 | def selected_expander(self): 23 | return self._selected_expander 24 | 25 | @selected_expander.setter 26 | def selected_expander(self, selected_expander): 27 | if (selected_expander in self.expander_pool): 28 | self._selected_expander = selected_expander 29 | else: 30 | if (len(self.expander_pool) > 0): 31 | default_expander = next(iter(self.expander_pool)) 32 | logger.info( 33 | ">> Expander you are attempting to use %s does not exist in expander pool. 
Using the following default expander instead %s ", selected_expander, default_expander) 34 | self._selected_expander = default_expander 35 | else: 36 | logger.info( 37 | ">> No expander has been specified in config.yaml.") 38 | self._selected_expander = None 39 | -------------------------------------------------------------------------------- /neuralqa/expander/mlmexpander.py: -------------------------------------------------------------------------------- 1 | from neuralqa.expander import Expander 2 | import logging 3 | from transformers import AutoTokenizer, TFBertForMaskedLM 4 | import tensorflow as tf 5 | import time 6 | import spacy 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class MLMExpander(Expander): 11 | def __init__(self, index_type="mlm", model_path="bert-base-uncased", **kwargs): 12 | Expander.__init__(self, index_type) 13 | 14 | self.candidate_pos = ["NOUN", "ADJ", "ADV"] 15 | self.model_path = model_path 16 | 17 | allowed_keys = list(self.__dict__.keys()) 18 | self.__dict__.update((k, v) 19 | for k, v in kwargs.items() if k in allowed_keys) 20 | rejected_keys = set(kwargs.keys()) - set(allowed_keys) 21 | if rejected_keys: 22 | raise ValueError( 23 | "Invalid arguments in ElasticSearchRetriever constructor:{}".format(rejected_keys)) 24 | 25 | logger.info( 26 | ">> loading HF model for Query Expansion from " + model_path) 27 | self.tokenizer = AutoTokenizer.from_pretrained( 28 | self.model_path, use_fast=True) 29 | self.model = TFBertForMaskedLM.from_pretrained( 30 | self.model_path, from_pt=True) 31 | logger.info(">> Loading Spacy NLP model ") 32 | 33 | try: 34 | self.nlp = spacy.load('en_core_web_md') 35 | except OSError: 36 | logger.info( 37 | "Downloading language model for the spaCy POS tagger (don't worry, this will only happen once)") 38 | from spacy.cli import download 39 | download('en_core_web_md') 40 | self.nlp = spacy.load('en_core_web_md') 41 | # self.nlp = en_core_web_md.load() 42 | # logger.info(">> Spacy nlp model loaded ") 43 | 44 | def predict_mask(self, sequence, model, tokenizer, top_n=2): 45 | input = tokenizer.encode(sequence, return_tensors="tf") 46 | mask_token_index = tf.where(input == tokenizer.mask_token_id)[0, 1] 47 | token_logits = model(input)[0] 48 | mask_token_logits = token_logits[0, mask_token_index, :] 49 | 50 | probabilities = tf.nn.softmax(mask_token_logits) 51 | topk = tf.math.top_k(probabilities, top_n) 52 | top_n_probs, top_n_tokens = topk.values.numpy(), topk.indices.numpy() 53 | results = [{"token": tokenizer.decode([top_n_tokens[i]]), "probability": float(top_n_probs[i])} 54 | for i in range(len(top_n_probs))] 55 | # print(results) 56 | return results 57 | 58 | def expand_query(self, query, top_n=3, threshold=0): 59 | start_time = time.time() 60 | 61 | doc = self.nlp(query) 62 | query_tokens = [str(token) for token in doc] 63 | new_terms = [] 64 | candidate_expansions = [] 65 | # print([chunk.text for chunk in doc.noun_chunks], "\n =========") 66 | # print([ent.text for ent in doc.ents], "\n =========") 67 | # for token in doc: 68 | # print(token, "=>", token.ent_type_) 69 | 70 | for i, token in enumerate(doc): 71 | # only expand if pos is not in our candidate list and it is not a named entity type 72 | pred_tokens = None 73 | if (token.pos_ in self.candidate_pos and not token.ent_type_): 74 | temp_doc = query_tokens.copy() 75 | temp_doc[i] = self.tokenizer.mask_token 76 | temp_doc = " ".join(temp_doc) 77 | pred_tokens = self.predict_mask( 78 | temp_doc, self.model, self.tokenizer, top_n=top_n) 79 | new_terms = new_terms + 
pred_tokens 80 | candidate_expansions.append( 81 | {"token": str(token), "expansion": pred_tokens, "token_index": i, "pos": token.pos_, "pos_desc": spacy.explain(token.pos_), "named_entity": token.ent_type_, "ent_desc": spacy.explain(token.ent_type_)}) 82 | 83 | elapsed_time = time.time() - start_time 84 | 85 | terms_list = [] 86 | seen_terms = [] 87 | # remove punctuation, low probability, words subwords, duplicates 88 | for token in new_terms: 89 | if token["token"].isalnum() and token["probability"] > threshold and "#" not in token["token"] and token["token"] not in query and token["token"] not in seen_terms: 90 | terms_list.append(token) 91 | seen_terms.append(token["token"]) 92 | 93 | result = { 94 | "terms": terms_list, 95 | "query": query_tokens, 96 | "expansions": candidate_expansions, 97 | "took": elapsed_time 98 | } 99 | return result 100 | -------------------------------------------------------------------------------- /neuralqa/reader/__init__.py: -------------------------------------------------------------------------------- 1 | from .reader import * 2 | from .bertreader import * 3 | from .readerpool import * 4 | -------------------------------------------------------------------------------- /neuralqa/reader/bertreader.py: -------------------------------------------------------------------------------- 1 | from neuralqa.reader import Reader 2 | 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | import time 7 | import logging 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class BERTReader(Reader): 13 | def __init__(self, model_name, model_path, model_type="bert", **kwargs): 14 | Reader.__init__(self, model_name, model_path, model_type) 15 | # self.load_model(model_name, model_path, model_type) 16 | 17 | def get_best_start_end_position(self, start_scores, end_scores): 18 | answer_start = tf.argmax(start_scores, axis=1).numpy()[0] 19 | answer_end = (tf.argmax(end_scores, axis=1) + 1).numpy()[0] 20 | return answer_start, answer_end 21 | 22 | def get_chunk_answer_span(self, inputs): 23 | start_time = time.time() 24 | answer_start_scores, answer_end_scores = self.model(inputs) 25 | 26 | answer_start, answer_end = self.get_best_start_end_position( 27 | answer_start_scores, answer_end_scores) 28 | 29 | answer_end = answer_end - \ 30 | 1 if answer_end == answer_end_scores.shape[1] else answer_end 31 | 32 | answer_start_softmax_probability = tf.nn.softmax( 33 | answer_start_scores, axis=1).numpy()[0][answer_start] 34 | answer_end_softmax_probability = tf.nn.softmax( 35 | answer_end_scores, axis=1).numpy()[0][answer_end] 36 | 37 | answer = self.tokenizer.decode( 38 | inputs["input_ids"][0][answer_start:answer_end], skip_special_tokens=True) 39 | 40 | # if model predict first token 0 which is in the question as part of the answer, return nothing 41 | if answer_start == 0: 42 | answer = "" 43 | 44 | elapsed_time = time.time() - start_time 45 | return {"answer": answer, "took": elapsed_time, 46 | "start_probability": str(answer_start_softmax_probability), 47 | "end_probability": str(answer_end_softmax_probability), 48 | "probability": str(answer_end_softmax_probability + answer_start_softmax_probability / 2) 49 | } 50 | 51 | def token_chunker(self, question, context, max_chunk_size=512, stride=2, max_num_chunks=5): 52 | # we tokenize question and context once. 
53 | # if question + context > max chunksize, we break it down into multiple chunks of question + 54 | # subsets of context with some stride overlap 55 | 56 | question_tokens = self.tokenizer.encode(question) 57 | context_tokens = self.tokenizer.encode( 58 | context, add_special_tokens=False) 59 | 60 | chunk_holder = [] 61 | chunk_size = max_chunk_size - len(question_tokens) - 1 62 | # -1 for the 102 end token we append later 63 | current_pos = 0 64 | chunk_count = 0 65 | while current_pos < len(context_tokens) and current_pos >= 0: 66 | 67 | # we want to cap the number of chunks we create 68 | if max_num_chunks and chunk_count >= max_num_chunks: 69 | break 70 | 71 | end_point = current_pos + \ 72 | chunk_size if (current_pos + chunk_size) < len(context_tokens) - \ 73 | 1 else len(context_tokens) - 1 74 | token_chunk = question_tokens + \ 75 | context_tokens[current_pos: end_point] + [102] 76 | 77 | # question type is 0, context type is 1, convert to tf 78 | token_type_ids = [0]*len(question_tokens) + \ 79 | [1] * (len(token_chunk) - len(question_tokens)) 80 | token_type_ids = tf.constant( 81 | token_type_ids, dtype='int32', shape=(1, len(token_type_ids))) 82 | 83 | # attend to every token 84 | attention_mask = tf.ones( 85 | (1, len(token_chunk)), dtype=tf.dtypes.int32) 86 | 87 | # convert token chunk to tf 88 | token_chunk = tf.constant( 89 | token_chunk, dtype='int32', shape=(1, len(token_chunk))) 90 | 91 | chunk_holder.append( 92 | {"token_ids": token_chunk, 93 | "context": self.tokenizer.decode(context_tokens[current_pos: end_point], skip_special_tokens=True), 94 | "attention_mask": attention_mask, 95 | "token_type_ids": token_type_ids 96 | }) 97 | current_pos = current_pos + chunk_size - stride + 1 98 | chunk_count += 1 99 | 100 | return chunk_holder 101 | 102 | def answer_question(self, question, context, max_chunk_size=512, stride=70): 103 | 104 | # chunk tokens 105 | chunked_tokens = self.token_chunker( 106 | question, context, max_chunk_size, stride) 107 | answer_holder = [] 108 | for chunk in chunked_tokens: 109 | model_input = {"input_ids": chunk["token_ids"], "attention_mask": 110 | chunk["attention_mask"], "token_type_ids": chunk["token_type_ids"]} 111 | answer = self.get_chunk_answer_span(model_input) 112 | if len(answer["answer"]) > 2: 113 | answer["question"] = question 114 | answer["context"] = chunk["context"].replace("##", "").replace( 115 | answer["answer"], " " + answer["answer"] + " ") 116 | answer_holder.append(answer) 117 | return answer_holder 118 | 119 | def get_correct_span_mask(self, correct_index, token_size): 120 | span_mask = np.zeros((1, token_size)) 121 | span_mask[0, correct_index] = 1 122 | span_mask = tf.constant(span_mask, dtype='float32') 123 | return span_mask 124 | 125 | def get_embedding_matrix(self): 126 | if "DistilBert" in type(self.model).__name__: 127 | return self.model.distilbert.embeddings.word_embeddings 128 | else: 129 | return self.model.bert.embeddings.word_embeddings 130 | 131 | # move this to some utils file 132 | def clean_tokens(self, gradients, tokens, token_types): 133 | """ 134 | Clean the tokens and gradients 135 | Remove "[CLS]","[CLR]", "[SEP]" tokens 136 | Reduce (mean) gradients values for tokens that are split ## 137 | """ 138 | token_holder = [] 139 | token_type_holder = [] 140 | gradient_holder = [] 141 | i = 0 142 | while i < len(tokens): 143 | if (tokens[i] not in ["[CLS]", "[CLR]", "[SEP]"]): 144 | token = tokens[i] 145 | conn = gradients[i] 146 | token_type = token_types[i] 147 | if i < len(tokens)-1: 148 | if 
tokens[i+1][0:2] == "##": 149 | token = tokens[i] 150 | conn = gradients[i] 151 | j = 1 152 | while i < len(tokens)-1 and tokens[i+1][0:2] == "##": 153 | i += 1 154 | token += tokens[i][2:] 155 | conn += gradients[i] 156 | j += 1 157 | conn = conn / j 158 | token_holder.append(token) 159 | token_type_holder.append(token_type) 160 | # gradient_holder.append(conn) 161 | gradient_holder.append( 162 | {"gradient": conn, "token": token, "token_type": token_type}) 163 | i += 1 164 | return gradient_holder 165 | 166 | def get_gradient(self, question, context): 167 | """Return gradient of input (question) wrt to model output span prediction 168 | 169 | Args: 170 | question (str): text of input question 171 | context (str): text of question context/passage 172 | model (QA model): Hugging Face BERT model for QA transformers.modeling_tf_distilbert.TFDistilBertForQuestionAnswering, transformers.modeling_tf_bert.TFBertForQuestionAnswering 173 | tokenizer (tokenizer): transformers.tokenization_bert.BertTokenizerFast 174 | 175 | Returns: 176 | (tuple): (gradients, token_words, token_types, answer_text) 177 | """ 178 | 179 | embedding_matrix = self.get_embedding_matrix() 180 | 181 | encoded_tokens = self.tokenizer.encode_plus( 182 | question, context, add_special_tokens=True, return_token_type_ids=True, return_tensors="tf") 183 | token_ids = list(encoded_tokens["input_ids"].numpy()[0]) 184 | vocab_size = embedding_matrix.get_shape()[0] 185 | 186 | # convert token ids to one hot. We can't differentiate wrt to int token ids hence the need for one hot representation 187 | token_ids_tensor = tf.constant([token_ids], dtype='int32') 188 | token_ids_tensor_one_hot = tf.one_hot(token_ids_tensor, vocab_size) 189 | 190 | with tf.GradientTape(watch_accessed_variables=False) as tape: 191 | # (i) watch input variable 192 | tape.watch(token_ids_tensor_one_hot) 193 | 194 | # multiply input model embedding matrix; allows us do backprop wrt one hot input 195 | inputs_embeds = tf.matmul( 196 | token_ids_tensor_one_hot, embedding_matrix) 197 | 198 | # (ii) get prediction 199 | start_scores, end_scores = self.model( 200 | {"inputs_embeds": inputs_embeds, "token_type_ids": encoded_tokens["token_type_ids"], "attention_mask": encoded_tokens["attention_mask"]}) 201 | answer_start, answer_end = self.get_best_start_end_position( 202 | start_scores, end_scores) 203 | 204 | start_output_mask = self.get_correct_span_mask( 205 | answer_start, len(token_ids)) 206 | end_output_mask = self.get_correct_span_mask( 207 | answer_end, len(token_ids)) 208 | 209 | # zero out all predictions outside of the correct span positions; we want to get gradients wrt to just these positions 210 | predict_correct_start_token = tf.reduce_sum( 211 | start_scores * start_output_mask) 212 | predict_correct_end_token = tf.reduce_sum( 213 | end_scores * end_output_mask) 214 | 215 | # (iii) get gradient of input with respect to both start and end output 216 | gradient_non_normalized = tf.norm( 217 | tape.gradient([predict_correct_start_token, predict_correct_end_token], token_ids_tensor_one_hot), axis=2) 218 | 219 | # (iv) normalize gradient scores and return them as "explanations" 220 | gradient_tensor = ( 221 | gradient_non_normalized / 222 | tf.reduce_max(gradient_non_normalized) 223 | ) 224 | gradients = gradient_tensor[0].numpy().tolist() 225 | 226 | token_words = self.tokenizer.convert_ids_to_tokens(token_ids) 227 | token_types = list( 228 | encoded_tokens["token_type_ids"].numpy()[0].tolist()) 229 | answer_text = self.tokenizer.decode( 230 | 
token_ids[answer_start:answer_end], skip_special_tokens=True) 231 | 232 | # clean up gradients and words 233 | gradients = self.clean_tokens( 234 | gradients, token_words, token_types) 235 | return gradients, answer_text, question 236 | 237 | def explain_model(self, question, context, explain_method="gradient"): 238 | if explain_method == "gradient": 239 | return self.get_gradient(question, context) 240 | -------------------------------------------------------------------------------- /neuralqa/reader/reader.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering 6 | import time 7 | import logging 8 | 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class Reader: 14 | def __init__(self, model_name, model_path, model_type, **kwargs): 15 | self.load_model(model_name, model_path, model_type) 16 | 17 | def load_model(self, model_name, model_path, model_type): 18 | logger.info(">> Loading HF model " + 19 | model_name + " from " + model_path) 20 | self.type = model_type 21 | self.name = model_name 22 | self.tokenizer = AutoTokenizer.from_pretrained( 23 | model_path, use_fast=True) 24 | self.model = TFAutoModelForQuestionAnswering.from_pretrained( 25 | model_path, from_pt=True) 26 | -------------------------------------------------------------------------------- /neuralqa/reader/readerpool.py: -------------------------------------------------------------------------------- 1 | 2 | from neuralqa.reader import BERTReader 3 | import logging 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class ReaderPool(): 9 | def __init__(self, models): 10 | self._selected_model = models["selected"] 11 | self.reader_pool = {} 12 | for model in models["options"]: 13 | if (model["type"] == "bert" or model["type"] == "distilbert"): 14 | self.reader_pool[model["value"]] = BERTReader( 15 | model["name"], model["value"]) 16 | 17 | @property 18 | def model(self): 19 | return self.reader_pool[self.selected_model] 20 | 21 | @property 22 | def selected_model(self): 23 | return self._selected_model 24 | 25 | @selected_model.setter 26 | def selected_model(self, selected_model): 27 | 28 | if (selected_model in self.reader_pool): 29 | self._selected_model = selected_model 30 | else: 31 | if (len(self.reader_pool) > 0): 32 | default_model = next(iter(self.reader_pool)) 33 | logger.info( 34 | ">> Model you are attempting to use %s does not exist in model pool. 
Using the following default model instead %s ", selected_model, default_model) 35 | self._selected_model = default_model 36 | else: 37 | logger.info( 38 | ">> No reader has been specified in config.yaml.") 39 | self._selected_model = None 40 | -------------------------------------------------------------------------------- /neuralqa/retriever/__init__.py: -------------------------------------------------------------------------------- 1 | from .retriever import * 2 | from .elasticsearchretriever import * 3 | from .solrretriever import * 4 | from .retrieverpool import * 5 | -------------------------------------------------------------------------------- /neuralqa/retriever/elasticsearchretriever.py: -------------------------------------------------------------------------------- 1 | from neuralqa.retriever import Retriever 2 | from neuralqa.utils import parse_field_content 3 | from elasticsearch import Elasticsearch, ConnectionError, NotFoundError 4 | import logging 5 | 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class ElasticSearchRetriever(Retriever): 11 | def __init__(self, index_type="elasticsearch", host="localhost", port=9200, username="", password="", **kwargs): 12 | Retriever.__init__(self, index_type) 13 | 14 | self.username = username 15 | self.password = password 16 | self.body_field = "" 17 | self.host = host 18 | self.port = port 19 | allowed_keys = list(self.__dict__.keys()) 20 | self.__dict__.update((k, v) 21 | for k, v in kwargs.items() if k in allowed_keys) 22 | 23 | print(self.__dict__) 24 | # self.es = Elasticsearch( 25 | # [{'host': self.host, 'port': self.port, 26 | # "username": self.username, "password": self.password}]) 27 | self.es = Elasticsearch(hosts=[{"host": self.host, "port": self.port}], 28 | http_auth=(self.username, self.password)) 29 | self.isAvailable = self.es.ping() 30 | 31 | rejected_keys = set(kwargs.keys()) - set(allowed_keys) 32 | 33 | if rejected_keys: 34 | raise ValueError( 35 | "Invalid arguments in ElasticSearchRetriever constructor:{}".format(rejected_keys)) 36 | 37 | def run_query(self, index_name, search_query, max_documents=5, fragment_size=100, relsnip=True, num_fragments=5, highlight_tags=True): 38 | 39 | tags = {"pre_tags": [""], "post_tags": [ 40 | ""]} if not highlight_tags else {} 41 | highlight_params = { 42 | "fragment_size": fragment_size, 43 | "fields": { 44 | self.body_field: tags 45 | }, 46 | "number_of_fragments": num_fragments 47 | } 48 | 49 | search_query = { 50 | "_source": {"includes": [self.body_field]}, 51 | "query": { 52 | "multi_match": { 53 | "query": search_query, 54 | "fields": [self.body_field] 55 | } 56 | }, 57 | "size": max_documents 58 | } 59 | 60 | status = True 61 | results = {} 62 | 63 | if (relsnip): 64 | # search_query["_source"] = {"includes": [""]} 65 | search_query["highlight"] = highlight_params 66 | # else: 67 | # search_query["_source"] = {"includes": [self.body_field]} 68 | 69 | try: 70 | query_result = self.es.search( 71 | index=index_name, body=search_query) 72 | 73 | # RelSnip: for each document, we concatenate all 74 | # fragments in each document and return as the document. 
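# A minimal sketch of the concatenation below (the field name and fragment strings are hypothetical, not from a real index):
#   hit["highlight"]["casebody"] == ["fourth <em>amendment</em> right", "unreasonable searches"]
#   => concatenated document: "fourth <em>amendment</em> right unreasonable searches"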
75 | highlights = [" ".join(hit["highlight"][self.body_field]) 76 | for hit in query_result["hits"]["hits"] if "highlight" in hit] 77 | docs = [parse_field_content(self.body_field, hit["_source"]) 78 | for hit in query_result["hits"]["hits"] if "_source" in hit] 79 | took = query_result["took"] 80 | results = {"took": took, "highlights": highlights, "docs": docs} 81 | 82 | except Exception as e:  # Exception already covers ConnectionRefusedError and NotFoundError 83 | status = False 84 | results["errormsg"] = str(e) 85 | 86 | results["status"] = status 87 | return results 88 | 89 | def test_connection(self): 90 | try: 91 | self.es.cluster.health() 92 | return True 93 | except ConnectionError: 94 | return False 95 | except Exception as e: 96 | logger.info( 97 | 'An unknown error occurred connecting to ElasticSearch: %s' % e) 98 | return False 99 | -------------------------------------------------------------------------------- /neuralqa/retriever/retriever.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Retriever: 4 | def __init__(self, index_type): 5 | self.index_type = index_type 6 | -------------------------------------------------------------------------------- /neuralqa/retriever/retrieverpool.py: -------------------------------------------------------------------------------- 1 | 2 | from neuralqa.retriever import ElasticSearchRetriever 3 | import logging 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class RetrieverPool(): 9 | def __init__(self, retrievers): 10 | 11 | self.retriever_pool = {} 12 | for retriever in retrievers["options"]: 13 | if (retriever["value"] in self.retriever_pool): 14 | raise ValueError( 15 | "Duplicate retriever value : {} ".format(retriever["value"])) 16 | 17 | if (retriever["type"] == "elasticsearch"): 18 | self.retriever_pool[retriever["value"]] = ElasticSearchRetriever( 19 | **retriever["connection"]) 20 | if (retriever["type"] == "solr"): 21 | logger.info("We do not yet support Solr retrievers") 22 | self.selected_retriever = retrievers["selected"] 23 | 24 | @property 25 | def retriever(self): 26 | return self.retriever_pool[self.selected_retriever] 27 | 28 | @property 29 | def selected_retriever(self): 30 | return self._selected_retriever 31 | 32 | @selected_retriever.setter 33 | def selected_retriever(self, selected_retriever): 34 | if (selected_retriever in self.retriever_pool): 35 | self._selected_retriever = selected_retriever 36 | else: 37 | if (len(self.retriever_pool) > 0): 38 | default_retriever = next(iter(self.retriever_pool)) 39 | logger.info( 40 | ">> The retriever you attempted to use (%s) does not exist in the retriever pool. 
Using the default retriever %s instead.", selected_retriever, default_retriever) 41 | self._selected_retriever = default_retriever 42 | else: 43 | logger.info( 44 | ">> No retriever has been specified in config.yaml.") 45 | self._selected_retriever = None 46 | -------------------------------------------------------------------------------- /neuralqa/retriever/solrretriever.py: -------------------------------------------------------------------------------- 1 | from neuralqa.retriever import Retriever 2 | from neuralqa.utils import parse_field_content 3 | import requests 4 | import logging 5 | 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class SolrRetriever(Retriever): 11 | def __init__(self, index_type="solr", host="localhost", port=8983, protocol="http", **kwargs): 12 | Retriever.__init__(self, index_type) 13 | 14 | self.username = "" 15 | self.password = "" 16 | self.body_field = "" 17 | self.host = host 18 | self.port = port 19 | self.protocol = protocol 20 | 21 | allowed_keys = list(self.__dict__.keys()) 22 | self.__dict__.update((k, v) 23 | for k, v in kwargs.items() if k in allowed_keys) 24 | 25 | self.base_solr_url = protocol + "://" + \ 26 | host + ":" + str(port) + "/solr" 27 | 28 | # self.es = Elasticsearch([{'host': self.host, 'port': self.port}]) 29 | # self.isAvailable = self.es.ping() 30 | 31 | rejected_keys = set(kwargs.keys()) - set(allowed_keys) 32 | 33 | if rejected_keys: 34 | raise ValueError( 35 | "Invalid arguments in SolrRetriever constructor: {}".format(rejected_keys)) 36 | 37 | def run_query(self, index_name, search_query, max_documents=5, fragment_size=100, relsnip=True, num_fragments=5, highlight_tags=True): 38 | query_url = self.base_solr_url + "/" + index_name + "/select" 39 | 40 | params = {"df": self.body_field, "fl": self.body_field, 41 | "wt": "json", "q": search_query, "rows": max_documents} 42 | 43 | hl_params = {"hl": "true", "hl.method": "unified", "hl.snippets": num_fragments, 44 | "hl.fragsize": fragment_size, "hl.usePhraseHighlighter": "true"} 45 | if not highlight_tags: 46 | hl_params["hl.tags.pre"] = "" 47 | hl_params["hl.tags.post"] = "" 48 | 49 | if relsnip: 50 | params = {**params, **hl_params} 51 | else: 52 | params["fl"] = "null" 53 | 54 | response = requests.get(query_url, params=params) 55 | highlights = [] 56 | docs = [] 57 | results = {} 58 | status = False 59 | 60 | if (response.status_code == 200): 61 | status = True 62 | logger.debug("%s %s", response.url, response.status_code) 63 | response = response.json() 64 | logger.debug(response.keys()) 65 | highlights = [" ".join(response["highlighting"][key][self.body_field]) 66 | for key in response["highlighting"].keys()] if "highlighting" in response else highlights 67 | docs = [" ".join(doc[self.body_field]) 68 | for doc in response["response"]["docs"]] 69 | results = {"took": response["responseHeader"] 70 | ["QTime"], "highlights": highlights, "docs": docs} 71 | else: 72 | logger.error("An error has occurred: %s %s", 73 | response.status_code, response.__dict__) 74 | status = False 75 | results["errormsg"] = str(response.status_code) 76 | results["status"] = status 77 | return results 78 | -------------------------------------------------------------------------------- /neuralqa/server/__init__.py: -------------------------------------------------------------------------------- 1 | from .server_app import launch_server 2 | -------------------------------------------------------------------------------- /neuralqa/server/routehandlers.py: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | from neuralqa.utils import ConfigParser 4 | import time 5 | from fastapi import APIRouter 6 | from typing import Optional 7 | from neuralqa.server.routemodels import Document, Answer, Explanation, Expansion 8 | import logging 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class Handler: 14 | def __init__(self, reader_pool, retriever_pool, expander_pool): 15 | router = APIRouter() 16 | self.router = router 17 | 18 | self.reader_pool = reader_pool 19 | self.retriever_pool = retriever_pool 20 | self.expander_pool = expander_pool 21 | 22 | @router.post("/answers") 23 | async def get_answers(params: Answer): 24 | 25 | """Generate an answer for the given search query. 26 | Performed as a two-stage process: 27 | 1.) Retrieve candidate passages that match the query (e.g. via Elasticsearch) 28 | 2.) Use a BERT reader model to identify exact answer spans in each passage 29 | 30 | Returns: 31 | dictionary -- answer spans sorted by probability, and the time taken 32 | """ 33 | 34 | answer_holder = [] 35 | response = {} 36 | start_time = time.time() 37 | 38 | # switch to the selected model and retriever 39 | self.reader_pool.selected_model = params.reader 40 | self.retriever_pool.selected_retriever = params.retriever 41 | 42 | # print(params.query + " ".join(params.expansionterms)) 43 | # answer question based on provided context 44 | if (params.retriever == "none" or self.retriever_pool.selected_retriever is None): 45 | answers = self.reader_pool.model.answer_question( 46 | params.query, params.context, stride=params.tokenstride) 47 | for answer in answers: 48 | answer["index"] = 0 49 | answer_holder.append(answer) 50 | # answer question based on retrieved passages from elastic search 51 | 52 | else: 53 | # add query expansion terms to the query, if any 54 | retriever_query = params.query + " " + \ 55 | " ".join(params.expansionterms or []) 56 | num_fragments = 5 57 | query_results = self.retriever_pool.retriever.run_query(params.retriever, retriever_query, 58 | max_documents=params.max_documents, fragment_size=params.fragment_size, 59 | relsnip=params.relsnip, num_fragments=num_fragments, highlight_tags=False) 60 | # print(query_results) 61 | if (query_results["status"]): 62 | # if relsnip is not enabled, read the entire document ... this is super slow 63 | docs = query_results["highlights"] if params.relsnip else query_results["docs"] 64 | 65 | for i, doc in enumerate(docs): 66 | doc = doc.replace("\n", " ") 67 | answers = self.reader_pool.model.answer_question( 68 | params.query, doc, stride=params.tokenstride) 69 | for answer in answers: 70 | answer["index"] = i 71 | answer_holder.append(answer) 72 | 73 | # sort answers by probability 74 | answer_holder = sorted( 75 | answer_holder, key=lambda k: k['probability'], reverse=True) 76 | elapsed_time = time.time() - start_time 77 | response = {"answers": answer_holder, 78 | "took": elapsed_time} 79 | return response 80 | 81 | @router.post("/documents") 82 | async def get_documents(params: Document): 83 | """Get a list of documents and highlights that match the given search query 84 | 85 | Returns: 86 | dictionary -- contains details on elastic search results. 
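Example of the returned dictionary (a minimal sketch with made-up values; the fields mirror what the selected retriever's run_query returns):

    {
        "status": True,
        "took": 5,
        "highlights": ["the fourth <em>amendment</em> provides ..."],
        "docs": ["the fourth amendment of the u.s. constitution provides ..."]
    }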
87 | """ 88 | 89 | num_fragments = 5 90 | query_results = {"docs": [], "highlights": []} 91 | 92 | self.retriever_pool.selected_retriever = params.retriever 93 | if self.retriever_pool.selected_retriever: 94 | query_results = self.retriever_pool.retriever.run_query( 95 | params.retriever, params.query, max_documents=params.max_documents, fragment_size=params.fragment_size, relsnip=params.relsnip, num_fragments=num_fragments) 96 | # print(query_results) 97 | max_doc_size = 1200 98 | if not params.relsnip: 99 | query_results["highlights"] = [ 100 | doc[:max_doc_size] + " .." for doc in query_results["docs"]] 101 | return query_results 102 | 103 | @router.post("/explain") 104 | async def get_explanation(params: Explanation): 105 | """Return an explanation for a given model 106 | 107 | Returns: 108 | [dictionary]: [explanation , query, question, ] 109 | """ 110 | 111 | # TODO: Do we need to switch readers here? 112 | 113 | context = params.context.replace( 114 | "", "").replace("", "") 115 | 116 | gradients, answer_text, question = self.reader_pool.model.explain_model( 117 | params.query, context) 118 | 119 | explanation_result = {"gradients": gradients, 120 | "answer": answer_text, 121 | "question": question 122 | } 123 | return explanation_result 124 | 125 | @router.post("/expand") 126 | async def get_expansion(params: Expansion): 127 | """Return an expansion for a given query 128 | 129 | Returns: 130 | [dictionary]: [expansion] 131 | """ 132 | 133 | expanded_query = {"query": None} 134 | # switch to selected expander, perform expansion 135 | if params.expander != "none": 136 | self.expander_pool.selected_expander = params.expander 137 | if self.expander_pool.selected_expander: 138 | expanded_query = self.expander_pool.expander.expand_query( 139 | params.query) 140 | 141 | return expanded_query 142 | -------------------------------------------------------------------------------- /neuralqa/server/routemodels.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from pydantic import BaseModel 4 | from typing import Optional 5 | 6 | 7 | class Document(BaseModel): 8 | 9 | max_documents: Optional[int] = 5 10 | query: str = "what is a fourth amendment right violation?" 11 | fragment_size: int = 250 12 | retriever: Optional[str] = None 13 | relsnip: Optional[bool] = True 14 | 15 | 16 | class Answer(BaseModel): 17 | 18 | max_documents: Optional[int] = 5 19 | query: str = "what is a fourth amendment right violation?" 20 | fragment_size: int = 250 21 | tokenstride: int = 50 22 | context: Optional[str] = "The fourth amendment kind of protects the rights of citizens .. such that they dont get searched" 23 | reader: str = None 24 | relsnip: bool = True 25 | expander: Optional[str] = None 26 | expansionterms: Optional[list] = None 27 | retriever: Optional[str] = "manual" 28 | 29 | 30 | class Explanation(BaseModel): 31 | query: str = "what is a fourth amendment right violation?" 32 | context: str = "The fourth amendment kind of protects the rights of citizens .. such that they dont get searched" 33 | 34 | 35 | class Expansion(BaseModel): 36 | query: str = "what is a fourth amendment right violation?" 
37 | expander: Optional[str] = None 38 | -------------------------------------------------------------------------------- /neuralqa/server/serve.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from neuralqa.reader import BERTReader, ReaderPool 4 | from neuralqa.server.routehandlers import Handler 5 | from neuralqa.retriever import ElasticSearchRetriever, RetrieverPool 6 | from neuralqa.utils import ConfigParser 7 | from neuralqa.expander import ExpanderPool 8 | 9 | import os 10 | import logging 11 | import time 12 | import uvicorn 13 | from fastapi import FastAPI 14 | from fastapi.staticfiles import StaticFiles 15 | # from fastapi.middleware.cors import CORSMiddleware 16 | 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | config_path = os.environ.get("NEURALQA_CONFIG_PATH") 21 | app_config = ConfigParser(config_path) 22 | 23 | app = FastAPI() 24 | api = FastAPI(root_path="/api") 25 | 26 | 27 | # origins = [ 28 | # "http://localhost", 29 | # "http://localhost:3000", 30 | # ] 31 | 32 | # app.add_middleware( 33 | # CORSMiddleware, 34 | # allow_origins=origins, 35 | # allow_credentials=True, 36 | # allow_methods=["*"], 37 | # allow_headers=["*"], 38 | # ) 39 | root_file_path = os.path.dirname(os.path.abspath(__file__)) 40 | static_folder_root = os.path.join(root_file_path, "ui/build") 41 | 42 | app.mount("/api", api) 43 | app.mount("/", StaticFiles(directory=static_folder_root, html=True), 44 | name="ui") 45 | 46 | 47 | @api.get('/config') 48 | async def get_config(): 49 | config = app_config.config["ui"] 50 | # show only listed models to ui 51 | config["queryview"]["options"]["relsnip"] = app_config.config["relsnip"] 52 | config["queryview"]["options"]["samples"] = app_config.config["samples"] 53 | config["queryview"]["options"]["expander"] = app_config.config["expander"] 54 | config["queryview"]["options"]["reader"] = app_config.config["reader"] 55 | config["queryview"]["options"]["retriever"] = app_config.config["retriever"] 56 | return config 57 | 58 | # # Define a Reader Pool, load into memory 59 | reader_pool = ReaderPool(app_config.config["reader"]) 60 | 61 | # # define the search index, load into memory 62 | retriever_pool = RetrieverPool(app_config.config["retriever"]) 63 | 64 | # define the expander, load into memory 65 | expander_pool = ExpanderPool(app_config.config["expander"]) 66 | 67 | handlers = Handler(reader_pool, retriever_pool, expander_pool) 68 | # handlers = Handler(None, None) 69 | api.include_router(handlers.router) 70 | -------------------------------------------------------------------------------- /neuralqa/server/server_app.py: -------------------------------------------------------------------------------- 1 | 2 | import uvicorn 3 | import os 4 | 5 | 6 | def launch_server(host="127.0.0.1", port=5000, workers=1, reload=False): 7 | uvicorn.run("neuralqa.server.serve:app", host=host, port=port, workers=workers, 8 | log_level="info", reload=reload) 9 | 10 | 11 | if __name__ == "__main__": 12 | launch_server() 13 | -------------------------------------------------------------------------------- /neuralqa/server/ui/.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 
2 | public/images/extra 3 | # dependencies 4 | /node_modules 5 | /.pnp 6 | .pnp.js 7 | 8 | # testing 9 | /coverage 10 | 11 | # production 12 | !build 13 | 14 | # misc 15 | .DS_Store 16 | .env.local 17 | .env.development.local 18 | .env.test.local 19 | .env.production.local 20 | 21 | npm-debug.log* 22 | yarn-debug.log* 23 | yarn-error.log* 24 | -------------------------------------------------------------------------------- /neuralqa/server/ui/README.md: -------------------------------------------------------------------------------- 1 | ## NeuralQA UI 2 | 3 | This project was bootstrapped with [Create React App](https://github.com/facebook/create-react-app). -------------------------------------------------------------------------------- /neuralqa/server/ui/build/android-chrome-192x192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/victordibia/neuralqa/fb48f4d45d5856195baef25b4707e7b282cc364d/neuralqa/server/ui/build/android-chrome-192x192.png -------------------------------------------------------------------------------- /neuralqa/server/ui/build/android-chrome-512x512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/victordibia/neuralqa/fb48f4d45d5856195baef25b4707e7b282cc364d/neuralqa/server/ui/build/android-chrome-512x512.png -------------------------------------------------------------------------------- /neuralqa/server/ui/build/apple-touch-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/victordibia/neuralqa/fb48f4d45d5856195baef25b4707e7b282cc364d/neuralqa/server/ui/build/apple-touch-icon.png -------------------------------------------------------------------------------- /neuralqa/server/ui/build/asset-manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "files": { 3 | "main.css": "/static/css/main.0d7f6602.chunk.css", 4 | "main.js": "/static/js/main.32abfeaf.chunk.js", 5 | "main.js.map": "/static/js/main.32abfeaf.chunk.js.map", 6 | "runtime-main.js": "/static/js/runtime-main.985d1449.js", 7 | "runtime-main.js.map": "/static/js/runtime-main.985d1449.js.map", 8 | "static/js/2.17f05cd8.chunk.js": "/static/js/2.17f05cd8.chunk.js", 9 | "static/js/2.17f05cd8.chunk.js.map": "/static/js/2.17f05cd8.chunk.js.map", 10 | "index.html": "/index.html", 11 | "precache-manifest.f2ddb522e87f24d57699361b5d062612.js": "/precache-manifest.f2ddb522e87f24d57699361b5d062612.js", 12 | "service-worker.js": "/service-worker.js", 13 | "static/css/main.0d7f6602.chunk.css.map": "/static/css/main.0d7f6602.chunk.css.map", 14 | "static/js/2.17f05cd8.chunk.js.LICENSE.txt": "/static/js/2.17f05cd8.chunk.js.LICENSE.txt" 15 | }, 16 | "entrypoints": [ 17 | "static/js/runtime-main.985d1449.js", 18 | "static/js/2.17f05cd8.chunk.js", 19 | "static/css/main.0d7f6602.chunk.css", 20 | "static/js/main.32abfeaf.chunk.js" 21 | ] 22 | } -------------------------------------------------------------------------------- /neuralqa/server/ui/build/favicon-16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/victordibia/neuralqa/fb48f4d45d5856195baef25b4707e7b282cc364d/neuralqa/server/ui/build/favicon-16x16.png -------------------------------------------------------------------------------- /neuralqa/server/ui/build/favicon-32x32.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/victordibia/neuralqa/fb48f4d45d5856195baef25b4707e7b282cc364d/neuralqa/server/ui/build/favicon-32x32.png -------------------------------------------------------------------------------- /neuralqa/server/ui/build/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/victordibia/neuralqa/fb48f4d45d5856195baef25b4707e7b282cc364d/neuralqa/server/ui/build/favicon.ico -------------------------------------------------------------------------------- /neuralqa/server/ui/build/images/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/victordibia/neuralqa/fb48f4d45d5856195baef25b4707e7b282cc364d/neuralqa/server/ui/build/images/icon.png -------------------------------------------------------------------------------- /neuralqa/server/ui/build/index.html: -------------------------------------------------------------------------------- 1 | NeuralQA: Question Answering on Large Documents
-------------------------------------------------------------------------------- /neuralqa/server/ui/build/logo152.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/victordibia/neuralqa/fb48f4d45d5856195baef25b4707e7b282cc364d/neuralqa/server/ui/build/logo152.png -------------------------------------------------------------------------------- /neuralqa/server/ui/build/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "short_name": "NeuralQA", 3 | "name": "Question Answering on Large Document Datasets", 4 | "icons": [ 5 | { 6 | "src": "favicon.ico", 7 | "sizes": "64x64 32x32 24x24 16x16", 8 | "type": "image/x-icon" 9 | }, 10 | { 11 | "src": "android-chrome-192x192.png", 12 | "type": "image/png", 13 | "sizes": "192x192" 14 | }, 15 | { 16 | "src": "android-chrome-512x512.png", 17 | "type": "image/png", 18 | "sizes": "512x512" 19 | } 20 | ], 21 | "start_url": ".", 22 | "display": "standalone", 23 | "theme_color": "#000000", 24 | "background_color": "#ffffff" 25 | } -------------------------------------------------------------------------------- /neuralqa/server/ui/build/precache-manifest.f2ddb522e87f24d57699361b5d062612.js: -------------------------------------------------------------------------------- 1 | self.__precacheManifest = (self.__precacheManifest || []).concat([ 2 | { 3 | "revision": "ab09b5abfa289e05b196fbecd4c7b9b1", 4 | "url": "/index.html" 5 | }, 6 | { 7 | "revision": "24ab984c723263199c47", 8 | "url": "/static/css/main.0d7f6602.chunk.css" 9 | }, 10 | { 11 | "revision": "b07275a309ceb994bb56", 12 | "url": "/static/js/2.17f05cd8.chunk.js" 13 | }, 14 | { 15 | "revision": "3453b8997016469371284a28c0e873e2", 16 | "url": "/static/js/2.17f05cd8.chunk.js.LICENSE.txt" 17 | }, 18 | { 19 | "revision": "24ab984c723263199c47", 20 | "url": "/static/js/main.32abfeaf.chunk.js" 21 | }, 22 | { 23 | "revision": "b7d5e4d288fb48c3a4d5", 24 | "url": "/static/js/runtime-main.985d1449.js" 25 | } 26 | ]); -------------------------------------------------------------------------------- /neuralqa/server/ui/build/robots.txt: -------------------------------------------------------------------------------- 1 | # https://www.robotstxt.org/robotstxt.html 2 | User-agent: * 3 | Disallow: 4 | -------------------------------------------------------------------------------- /neuralqa/server/ui/build/service-worker.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Welcome to your Workbox-powered service worker! 3 | * 4 | * You'll need to register this file in your web app and you should 5 | * disable HTTP caching for this file too. 6 | * See https://goo.gl/nhQhGp 7 | * 8 | * The rest of the code is auto-generated. Please don't update this file 9 | * directly; instead, make changes to your Workbox build configuration 10 | * and re-run your build process. 11 | * See https://goo.gl/2aRDsh 12 | */ 13 | 14 | importScripts("https://storage.googleapis.com/workbox-cdn/releases/4.3.1/workbox-sw.js"); 15 | 16 | importScripts( 17 | "/precache-manifest.f2ddb522e87f24d57699361b5d062612.js" 18 | ); 19 | 20 | self.addEventListener('message', (event) => { 21 | if (event.data && event.data.type === 'SKIP_WAITING') { 22 | self.skipWaiting(); 23 | } 24 | }); 25 | 26 | workbox.core.clientsClaim(); 27 | 28 | /** 29 | * The workboxSW.precacheAndRoute() method efficiently caches and responds to 30 | * requests for URLs in the manifest. 
31 | * See https://goo.gl/S9QRab 32 | */ 33 | self.__precacheManifest = [].concat(self.__precacheManifest || []); 34 | workbox.precaching.precacheAndRoute(self.__precacheManifest, {}); 35 | 36 | workbox.routing.registerNavigationRoute(workbox.precaching.getCacheKeyForURL("/index.html"), { 37 | 38 | blacklist: [/^\/_/,/\/[^/?]+\.[^/]+$/], 39 | }); 40 | -------------------------------------------------------------------------------- /neuralqa/server/ui/build/static/js/2.17f05cd8.chunk.js.LICENSE.txt: -------------------------------------------------------------------------------- 1 | /* 2 | object-assign 3 | (c) Sindre Sorhus 4 | @license MIT 5 | */ 6 | 7 | /*! 8 | Copyright (c) 2017 Jed Watson. 9 | Licensed under the MIT License (MIT), see 10 | http://jedwatson.github.io/classnames 11 | */ 12 | 13 | /** @license React v0.19.1 14 | * scheduler.production.min.js 15 | * 16 | * Copyright (c) Facebook, Inc. and its affiliates. 17 | * 18 | * This source code is licensed under the MIT license found in the 19 | * LICENSE file in the root directory of this source tree. 20 | */ 21 | 22 | /** @license React v16.13.1 23 | * react-dom.production.min.js 24 | * 25 | * Copyright (c) Facebook, Inc. and its affiliates. 26 | * 27 | * This source code is licensed under the MIT license found in the 28 | * LICENSE file in the root directory of this source tree. 29 | */ 30 | 31 | /** @license React v16.13.1 32 | * react-is.production.min.js 33 | * 34 | * Copyright (c) Facebook, Inc. and its affiliates. 35 | * 36 | * This source code is licensed under the MIT license found in the 37 | * LICENSE file in the root directory of this source tree. 38 | */ 39 | 40 | /** @license React v16.13.1 41 | * react.production.min.js 42 | * 43 | * Copyright (c) Facebook, Inc. and its affiliates. 44 | * 45 | * This source code is licensed under the MIT license found in the 46 | * LICENSE file in the root directory of this source tree. 
47 | */ 48 | -------------------------------------------------------------------------------- /neuralqa/server/ui/build/static/js/runtime-main.985d1449.js: -------------------------------------------------------------------------------- 1 | !function(e){function r(r){for(var n,l,a=r[0],f=r[1],i=r[2],c=0,s=[];c0.2%", 34 | "not dead", 35 | "not op_mini all" 36 | ], 37 | "development": [ 38 | "last 1 chrome version", 39 | "last 1 firefox version", 40 | "last 1 safari version" 41 | ] 42 | }, 43 | "devDependencies": { 44 | "gh-pages": "^3.0.0" 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /neuralqa/server/ui/public/android-chrome-192x192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/victordibia/neuralqa/fb48f4d45d5856195baef25b4707e7b282cc364d/neuralqa/server/ui/public/android-chrome-192x192.png -------------------------------------------------------------------------------- /neuralqa/server/ui/public/android-chrome-512x512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/victordibia/neuralqa/fb48f4d45d5856195baef25b4707e7b282cc364d/neuralqa/server/ui/public/android-chrome-512x512.png -------------------------------------------------------------------------------- /neuralqa/server/ui/public/apple-touch-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/victordibia/neuralqa/fb48f4d45d5856195baef25b4707e7b282cc364d/neuralqa/server/ui/public/apple-touch-icon.png -------------------------------------------------------------------------------- /neuralqa/server/ui/public/favicon-16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/victordibia/neuralqa/fb48f4d45d5856195baef25b4707e7b282cc364d/neuralqa/server/ui/public/favicon-16x16.png -------------------------------------------------------------------------------- /neuralqa/server/ui/public/favicon-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/victordibia/neuralqa/fb48f4d45d5856195baef25b4707e7b282cc364d/neuralqa/server/ui/public/favicon-32x32.png -------------------------------------------------------------------------------- /neuralqa/server/ui/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/victordibia/neuralqa/fb48f4d45d5856195baef25b4707e7b282cc364d/neuralqa/server/ui/public/favicon.ico -------------------------------------------------------------------------------- /neuralqa/server/ui/public/images/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/victordibia/neuralqa/fb48f4d45d5856195baef25b4707e7b282cc364d/neuralqa/server/ui/public/images/icon.png -------------------------------------------------------------------------------- /neuralqa/server/ui/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 12 | 13 | 17 | 18 | 19 | 20 | 24 | 28 | 32 | 36 | 37 | 38 | 42 | 43 | 52 | NeuralQA: Question Answering on Large Documents 53 | 54 | 55 | 56 | 57 |
58 | 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /neuralqa/server/ui/public/logo152.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/victordibia/neuralqa/fb48f4d45d5856195baef25b4707e7b282cc364d/neuralqa/server/ui/public/logo152.png -------------------------------------------------------------------------------- /neuralqa/server/ui/public/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "short_name": "NeuralQA", 3 | "name": "Question Answering on Large Document Datasets", 4 | "icons": [ 5 | { 6 | "src": "favicon.ico", 7 | "sizes": "64x64 32x32 24x24 16x16", 8 | "type": "image/x-icon" 9 | }, 10 | { 11 | "src": "android-chrome-192x192.png", 12 | "type": "image/png", 13 | "sizes": "192x192" 14 | }, 15 | { 16 | "src": "android-chrome-512x512.png", 17 | "type": "image/png", 18 | "sizes": "512x512" 19 | } 20 | ], 21 | "start_url": ".", 22 | "display": "standalone", 23 | "theme_color": "#000000", 24 | "background_color": "#ffffff" 25 | } -------------------------------------------------------------------------------- /neuralqa/server/ui/public/robots.txt: -------------------------------------------------------------------------------- 1 | # https://www.robotstxt.org/robotstxt.html 2 | User-agent: * 3 | Disallow: 4 | -------------------------------------------------------------------------------- /neuralqa/server/ui/src/components/Main.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2019 Fast Forward Labs. 4 | * Written by Victor Dibia / Contact : https://github.com/victordibia 5 | * CaseQA - CaseQA: Question Answering on Large Datasets with BERT. 
6 | * Licensed under the MIT License (the "License"); 7 | * ============================================================================= 8 | */ 9 | 10 | import React, { Component } from "react"; 11 | import { getJSONData, sampleConfig } from "./helperfunctions/HelperFunctions"; 12 | import { Route, HashRouter } from "react-router-dom"; 13 | 14 | import QueryView from "./queryview/QueryView"; 15 | import Header from "./header/Header"; 16 | import Footer from "./footer/Footer"; 17 | import { createBrowserHistory } from "history"; 18 | import TestView from "./testview/TestView"; 19 | // import TestView from "./testview/TestView"; 20 | 21 | const history = createBrowserHistory({ 22 | basename: "", // The base URL of the app (see below) 23 | forceRefresh: false, // Set true to force full page refreshes 24 | keyLength: 6, // The length of location.key 25 | // A function to use to confirm navigation with the user (see below) 26 | getUserConfirmation: (message, callback) => callback(window.confirm(message)), 27 | }); 28 | 29 | history.listen((location) => { 30 | // console.log(location.pathname, location.hash) 31 | }); 32 | 33 | let linkHolder = {}; 34 | 35 | function updateLh(location) { 36 | if (location.hash in linkHolder) { 37 | linkHolder[location.hash] = linkHolder[location.hash] + 1; 38 | } else { 39 | linkHolder[location.hash] = 0; 40 | } 41 | } 42 | 43 | history.listen((location) => { 44 | updateLh(location); 45 | }); 46 | 47 | class Main extends Component { 48 | constructor(props) { 49 | super(props); 50 | 51 | this.state = { 52 | config: null, 53 | }; 54 | updateLh(window.location); 55 | 56 | this.serverBasePath = 57 | window.location.protocol + "//" + window.location.host; 58 | // this.serverBasePath = "http://localhost:5000"; 59 | this.configEndpoint = "/api/config"; 60 | } 61 | 62 | componentDidMount() { 63 | let configUrl = this.serverBasePath + this.configEndpoint; 64 | let config = getJSONData(configUrl); 65 | let self = this; 66 | config 67 | .then((data) => { 68 | if (data) { 69 | this.setState({ config: data }); 70 | } 71 | }) 72 | .catch(function (err) { 73 | console.log("Failed to fetch config, using default config.", err); 74 | self.setState({ config: sampleConfig() }); 75 | }); 76 | } 77 | render() { 78 | const mQueryView = (props) => { 79 | return ( 80 | 84 | ); 85 | }; 86 | return ( 87 | 88 | {this.state.config && ( 89 |
90 |
91 |
92 | 93 | 94 |
95 |
96 | )} 97 | 98 | 101 |
102 | ); 103 | } 104 | } 105 | 106 | export default Main; 107 | -------------------------------------------------------------------------------- /neuralqa/server/ui/src/components/barviz/BarViz.jsx: -------------------------------------------------------------------------------- 1 | import React, { Component } from "react"; 2 | import * as d3 from "d3"; 3 | import "./barviz.css"; 4 | 5 | class BarViz extends Component { 6 | constructor(props) { 7 | super(props); 8 | 9 | this.grads = props.data.gradients; 10 | 11 | // this.minChartWidth = 900; 12 | this.minChartHeight = 250; 13 | this.minChartWidth = this.props.minChartWidth || 800; 14 | 15 | this.brushHeight = 60; 16 | this.barColor = "#0062ff"; 17 | this.inactiveColor = "rgba(85, 85, 85, 0.586)"; 18 | this.initialBrushPercentage = 35 / this.grads.length; 19 | 20 | // window.addEventListener("resize", handleResize); 21 | } 22 | 23 | getLabel(d, i) { 24 | return i + "*.*" + d.token + " *.* (" + d.gradient.toFixed(2) + ")"; 25 | } 26 | 27 | componentWillUnmount() {} 28 | 29 | componentDidUpdate(prevProps, prevState) {} 30 | 31 | setupScalesAxes(data) { 32 | let self = this; 33 | this.chartMargin = { top: 5, right: 0, bottom: 0, left: 0 }; 34 | this.chartWidth = 35 | this.minChartWidth - this.chartMargin.left - this.chartMargin.right; 36 | this.chartHeight = 37 | this.minChartHeight - this.chartMargin.top - this.chartMargin.bottom; 38 | this.xScale = d3 39 | .scaleBand() 40 | .domain(data.map((d, i) => self.getLabel(d, i))) 41 | .range([this.chartMargin.left, this.chartWidth - this.chartMargin.right]); 42 | 43 | this.yScale = d3 44 | .scaleLinear() 45 | .domain([0, d3.max(data, (d) => d.gradient)]) 46 | .nice() 47 | .range([this.chartHeight, 0]); 48 | } 49 | 50 | createSVGBox = (selector, height) => { 51 | return d3 52 | .select(selector) 53 | .append("svg") 54 | .attr( 55 | "width", 56 | this.chartWidth + this.chartMargin.left + this.chartMargin.right 57 | ) 58 | .attr("height", height + this.chartMargin.top + this.chartMargin.bottom) 59 | .append("g") 60 | .attr( 61 | "transform", 62 | "translate(" + this.chartMargin.left + "," + this.chartMargin.top + ")" 63 | ); 64 | }; 65 | createBarRects = (svg, x, y, data, chartclass, transparency) => { 66 | svg 67 | .append("g") 68 | .attr("class", chartclass) 69 | .selectAll("rect") 70 | .data(data) 71 | .join("rect") 72 | .attr("x", (d, i) => x(this.getLabel(d, i))) 73 | .attr("y", (d) => y(d.gradient)) 74 | .attr("height", (d) => y(0) - y(d.gradient)) 75 | .attr("width", x.bandwidth()) 76 | .attr("class", transparency ? "strokedbarrect" : "") 77 | .attr( 78 | "fill", 79 | (d) => "rgba(0, 98, 255, " + (transparency ? 
d.gradient : 1) + ")" 80 | ); 81 | }; 82 | 83 | drawBrushGraph(data) { 84 | let self = this; 85 | this.brushXScale = this.xScale.copy(); 86 | this.brushYScale = this.yScale.copy().range([this.brushHeight, 0]); 87 | const x = this.brushXScale; 88 | const y = this.brushYScale; 89 | const mainXZoom = d3 90 | .scaleLinear() 91 | .range([this.chartMargin.left, this.chartWidth - this.chartMargin.right]) 92 | .domain([ 93 | this.chartMargin.left, 94 | this.chartWidth - this.chartMargin.right, 95 | ]); 96 | 97 | const svg = this.createSVGBox("div.d3brush", this.brushHeight); 98 | 99 | this.createBarRects(svg, x, y, data, "minibars", false); 100 | const brush = d3 101 | .brushX() 102 | .extent([ 103 | [this.chartMargin.left, 0.5], 104 | [this.chartWidth - this.chartMargin.right, this.brushHeight], 105 | ]) 106 | .on("brush", brushed) 107 | .on("start", brushStarted) 108 | .on("end", brushEnded); 109 | 110 | const defaultSelection = [ 111 | x.range()[0], 112 | (x.range()[1] - x.range()[0]) * self.initialBrushPercentage, 113 | ]; 114 | 115 | svg.append("g").call(brush).call(brush.move, defaultSelection); 116 | 117 | function brushStarted() { 118 | // console.log("brush started"); 119 | d3.select("div.barviz") 120 | .selectAll("text.textlabel") 121 | .attr("class", "textinvisible textlabel"); 122 | } 123 | function brushEnded() { 124 | d3.select("div.barviz") 125 | .selectAll("text.textlabel") 126 | .attr("class", "textlabel"); 127 | const extentX = d3.event.selection; 128 | // console.log("brush ended", extentX); 129 | if (extentX) { 130 | // const selected = x 131 | // .domain() 132 | // .filter( 133 | // (d) => 134 | // extentX[0] - x.bandwidth() + 1e-2 <= x(d) && 135 | // x(d) <= extentX[1] - 1e-2 136 | // ); 137 | 138 | updateScalePostBrush(extentX); 139 | 140 | const svg = d3.select("div.barviz"); 141 | svg 142 | .selectAll("text.textlabel") 143 | .data(data) 144 | .attr("x", (d, i) => { 145 | return ( 146 | self.xScale(self.getLabel(d, i)) + self.xScale.bandwidth() / 2 147 | ); 148 | }) 149 | .attr("y", (d) => { 150 | return self.yScale.range()[0]; 151 | }); 152 | } 153 | } 154 | 155 | function brushed() { 156 | const extentX = d3.event.selection; 157 | const selected = x 158 | .domain() 159 | .filter( 160 | (d) => 161 | extentX[0] - x.bandwidth() + 1e-2 <= x(d) && 162 | x(d) <= extentX[1] - 1e-2 163 | ); 164 | 165 | d3.select("div.d3brush") 166 | .select(".minibars") 167 | .selectAll("rect") 168 | .style("fill", (d, i) => { 169 | return selected.indexOf(self.getLabel(d, i)) > -1 170 | ? 
self.barColor 171 | : self.inactiveColor; 172 | }); 173 | 174 | updateScalePostBrush(extentX); 175 | update(self.grads); 176 | } 177 | 178 | function updateScalePostBrush(extentX) { 179 | let originalRange = mainXZoom.range(); 180 | mainXZoom.domain(extentX); 181 | 182 | self.xScale.domain(data.map((d, i) => self.getLabel(d, i))); 183 | self.xScale 184 | .range([mainXZoom(originalRange[0]), mainXZoom(originalRange[1])]) 185 | .paddingInner(0.1); 186 | } 187 | 188 | function update(data) { 189 | const x = self.xScale; 190 | const y = self.yScale; 191 | const svg = d3.select("div.barviz"); 192 | svg 193 | .selectAll("rect.mainbars") 194 | .data(data) 195 | .join("rect") 196 | .attr("x", (d, i) => x(self.getLabel(d, i))) 197 | .attr("y", (d) => y(d.gradient)) 198 | .attr("height", (d) => y(0) - y(d.gradient)) 199 | .attr("width", x.bandwidth()); 200 | } 201 | } 202 | 203 | createToolTip(svg) { 204 | // create tooltip 205 | let tooltip = svg 206 | .append("g") 207 | .attr("class", "tooltiptext") 208 | .style("display", "none"); 209 | 210 | tooltip.append("rect").attr("class", "tooltiprect"); 211 | 212 | tooltip.append("text").attr("x", 10).attr("dy", "1.2em"); 213 | // .style("text-anchor", "middle"); 214 | 215 | return tooltip; 216 | } 217 | 218 | drawGraph(data) { 219 | let self = this; 220 | this.setupScalesAxes(data); 221 | const x = this.xScale; 222 | const y = this.yScale; 223 | 224 | const svg = this.createSVGBox("div.barviz", this.chartHeight); 225 | const bar = svg.selectAll("g").data(data).join("g"); 226 | 227 | bar 228 | .append("rect") 229 | .attr("class", "strokedbarrect mainbars") 230 | .attr( 231 | "fill", 232 | (d) => 233 | "rgba(0, 98, 255, " + 234 | (d.gradient > 0.5 ? 1 : 0.5 + 0.5 * d.gradient) + 235 | ")" 236 | ) 237 | .attr("width", x.bandwidth()) 238 | .attr("height", (d) => y(0) - y(d.gradient)) 239 | .on("mouseover", function () { 240 | tooltip.style("display", null); 241 | d3.select(this).attr("fill", "lightgrey"); 242 | }) 243 | .on("mouseout", function (d) { 244 | tooltip.style("display", "none"); 245 | d3.select(this).attr( 246 | "fill", 247 | "rgba(0, 98, 255, " + 248 | (d.gradient > 0.5 ? 1 : 0.5 + 0.5 * d.gradient) + 249 | ")" 250 | ); 251 | }) 252 | .on("mousemove", function (d) { 253 | var xPosition = d3.mouse(this)[0] + 10; 254 | var yPosition = d3.mouse(this)[1] - 20; 255 | tooltip.attr( 256 | "transform", 257 | "translate(" + xPosition + "," + yPosition + ")" 258 | ); 259 | tooltip.select("text").text(d.token); 260 | tooltip 261 | .select("rect") 262 | .attr( 263 | "width", 264 | tooltip.select("text").node().getComputedTextLength() + 20 265 | ); 266 | }); 267 | 268 | bar 269 | .append("text") 270 | // .attr("fill", "white") 271 | .attr("x", (d, i) => { 272 | return x(self.getLabel(d, i)); 273 | }) 274 | .attr("y", (d) => y(d.gradient)) 275 | .attr("class", "textlabel") 276 | .text((d) => d.token); 277 | 278 | let tooltip = this.createToolTip(svg); 279 | } 280 | 281 | componentDidMount() { 282 | let barvizElement = document.getElementById("barviz"); 283 | barvizElement.style.width = this.minChartWidth + "px"; 284 | 285 | this.drawGraph(this.grads); 286 | this.drawBrushGraph(this.grads); 287 | } 288 | 289 | render() { 290 | return ( 291 |
292 |
293 |
294 |
295 |
296 |
297 | ); 298 | } 299 | } 300 | 301 | export default BarViz; 302 | -------------------------------------------------------------------------------- /neuralqa/server/ui/src/components/barviz/barviz.css: -------------------------------------------------------------------------------- 1 | .d3brush { 2 | /* border: 1px solid black; */ 3 | } 4 | 5 | .yticktext { 6 | border: 1px solid green; 7 | } 8 | 9 | .barviz { 10 | /* border: 1px solid pink; */ 11 | background-image: repeating-linear-gradient( 12 | 180deg, 13 | #ccc 29.5px, 14 | #ccc 30px, 15 | transparent 30px, 16 | transparent 55px 17 | ); 18 | } 19 | 20 | .barvizcontent { 21 | background-color: rgb(235, 235, 235); 22 | padding: 10px 10px 8px 10px; 23 | } 24 | 25 | .strokedbarrect { 26 | /* stroke-width: 1px; 27 | stroke: rgba(44, 44, 44, 0.199); */ 28 | } 29 | 30 | .textlabel { 31 | text-anchor: end; 32 | /* transform: rotate(20deg); */ 33 | writing-mode: vertical-rl; 34 | cursor: default; 35 | text-shadow: 0.5px 0.5px 1px white; 36 | /* stroke: grey; */ 37 | fill: black; 38 | stroke-width: 1px; 39 | pointer-events: none; 40 | } 41 | 42 | .tooltiptext { 43 | font-weight: bold; 44 | border: 1px solid black; 45 | pointer-events: none; 46 | font-size: 1.2em; 47 | text-anchor: start; 48 | } 49 | 50 | .tooltiprect { 51 | height: 1.7em; 52 | fill: white; 53 | opacity: 0.65; 54 | } 55 | 56 | .textinvisible { 57 | visibility: hidden; 58 | } 59 | -------------------------------------------------------------------------------- /neuralqa/server/ui/src/components/expandview/ExpandView.jsx: -------------------------------------------------------------------------------- 1 | import React, { Component } from "react"; 2 | import { Modal } from "carbon-components-react"; 3 | import "./expandview.css"; 4 | import { LeaderLine, animOptions } from "../helperfunctions/HelperFunctions"; 5 | 6 | class ExpandView extends Component { 7 | constructor(props) { 8 | super(props); 9 | this.data = props.data; 10 | 11 | // console.log(this.props); 12 | 13 | this.state = { 14 | data: this.data, 15 | showInfoModal: false, 16 | }; 17 | this.blueColor = "#0062ff"; 18 | this.greyColor = "#c4c3c3"; 19 | } 20 | 21 | componentDidUpdate(prevProps, prevState) { 22 | if (this.props.data !== prevProps.data) { 23 | this.removeAllLines(); 24 | this.drawLines(); 25 | // console.log(this.lineHolder.length, " num lines"); 26 | } 27 | if (this.props.viewChanged !== prevProps.viewChanged) { 28 | this.redrawAllLines(); 29 | } 30 | } 31 | 32 | clickTerm(e) { 33 | // console.log(e.target.innerHTML); 34 | this.props.addQueryTerm(e.target.innerHTML); 35 | } 36 | 37 | updateGraph(data) {} 38 | 39 | drawLeaderLine(startElement, endElement, startAnchor, endAnchor) { 40 | let lineColor = this.blueColor; 41 | let lineWidth = 1.5; 42 | let plugType = "square"; 43 | let endPlugType = "arrow2"; 44 | 45 | let line = new LeaderLine( 46 | LeaderLine.pointAnchor(startElement, startAnchor), 47 | LeaderLine.pointAnchor(endElement, endAnchor), 48 | { 49 | color: lineColor, 50 | startPlug: plugType, 51 | endPlug: endPlugType, 52 | startPlugColor: lineColor, 53 | endSocketGravity: 400, 54 | path: "arc", 55 | size: lineWidth, 56 | hide: true, 57 | } 58 | ); 59 | // document.querySelector('.leader-line').style.zIndex = -100 60 | animOptions.duration = this.state.animationDuration; 61 | line.show("draw", animOptions); 62 | this.lineHolder.push({ 63 | line: line, 64 | }); 65 | } 66 | 67 | removeAllLines(line) { 68 | this.lineHolder.forEach(function (each) { 69 | each.line.remove(); 70 | }); 71 | this.lineHolder = 
[]; 72 | } 73 | 74 | redrawAllLines() { 75 | this.lineHolder.forEach(function (each) { 76 | each.line.position(); 77 | }); 78 | } 79 | getElement(attributeName, attributeValue) { 80 | return document 81 | .querySelector("div") 82 | .querySelector("[" + attributeName + "=" + attributeValue + "]"); 83 | } 84 | componentDidMount() { 85 | this.lineHolder = []; 86 | this.topAnchor = { x: "50%", y: 0 }; 87 | this.bottomAnchor = { x: "50%", y: "100%" }; 88 | this.leftAnchor = { x: "0%", y: "50%" }; 89 | this.rightAnchor = { x: "100%", y: "50%" }; 90 | 91 | this.drawLines(); 92 | } 93 | 94 | drawLines() { 95 | for (const ex of this.props.data.expansions) { 96 | if (ex.expansion) { 97 | for (let i = 0; i < ex.expansion.length; i++) { 98 | const startId = "term" + ex.token_index; 99 | const endId = "subterm" + ex.token_index + i; 100 | const startEl = this.getElement("id", startId); 101 | const endEl = this.getElement("id", endId); 102 | this.drawLeaderLine(startEl, endEl, this.leftAnchor, this.leftAnchor); 103 | } 104 | } 105 | } 106 | } 107 | 108 | componentWillUnmount() { 109 | this.removeAllLines(); 110 | } 111 | 112 | clickInfo(e) { 113 | this.setState({ showInfoModal: !this.state.showInfoModal }); 114 | } 115 | 116 | render() { 117 | let suggestedTermList = []; 118 | const data = this.props.data; 119 | if (data.expansions && data.terms) { 120 | suggestedTermList = data.terms.map((data, index) => { 121 | return ( 122 |
126 | {data.token} 127 |
128 | ); 129 | }); 130 | } 131 | const expansionTermsList = data.expansions.map((expansionData, index) => { 132 | const terms = (expansionData.expansion || []).map((data, index) => { 133 | return ( 134 |
140 | {data.token} 141 |
142 | ); 143 | }); 144 | const boxColor = terms.length > 0 ? this.blueColor : this.greyColor; 145 | 146 | return ( 147 |
148 |
149 |
150 | {expansionData.pos} 151 | 152 |
153 | PART OF SPEECH 154 |
155 |
{(expansionData.pos_desc || "").toUpperCase()}
156 |
157 |
158 |
159 | {expansionData.named_entity !== "" 160 | ? "| " + expansionData.named_entity 161 | : ""} 162 | 163 |
164 | NAMED ENTITY 165 |
166 |
{(expansionData.ent_desc || "").toUpperCase()}
167 |
168 |
169 |
170 | 171 |
0 ? "mb5" : "") 175 | } 176 | style={{ 177 | color: terms.length > 0 ? "white" : "", 178 | backgroundColor: boxColor, 179 | }} 180 | > 181 | {expansionData.token} 182 |
183 |
{terms}
184 |
185 | ); 186 | }); 187 | 188 | // const subTermsList = this.data.expansions 189 | // .filter((data) => { 190 | // if (data.expansion) { 191 | // return true; 192 | // } 193 | // return false; 194 | // }) 195 | // .map((expansionData, termIndex) => { 196 | // const terms = expansionData.expansion.map((data, index) => { 197 | // return ( 198 | //
203 | // {data.token} 204 | //
205 | // ); 206 | // }); 207 | // return ( 208 | //
209 | //
{terms}
210 | //
211 | // ); 212 | // }); 213 | 214 | return ( 215 |
216 | 227 |
228 |
229 | {" "} 230 | What is Contextual Query Expansion?{" "} 231 |
232 | Query expansion works as follows. First, a set of rules is used to 233 | determine which token in the query to expand. These rules are chosen 234 | to improve recall (surface relevant documents) without altering the 235 | semantics of the original query. Example rules include only 236 | expanding ADJECTIVES AND ADVERBS; other parts of speech such as 237 | nouns, proper nouns or even named entities are not expanded. Once 238 | expansion candidates are selected, they are then iteratively masked 239 | and a masked language model is used to predict tokens that best 240 | complete the sentence given the surrounding tokens. Additional 241 | details are provided in the{" "} 242 | 243 | {" "} 244 | NeuralQA paper. 245 | {" "} 246 |
247 | {" "} 248 | How is this Implemented?{" "} 249 |
250 | Part of speech detection is implemented using spaCy. A 251 | BERT-based masked language model is used to predict expansion terms 252 | (it can be selected under advanced options). 253 |

254 | *Note: contextual query expansion works best when the model is 255 | trained on the target (open-domain) dataset. 256 |
257 |
258 |
259 | 260 | {" "} 261 | {suggestedTermList.length} Suggested Expansion Terms{" "} 262 | 263 | 264 | {" "} 265 | {this.props.data.took.toFixed(3)} seconds 266 | 267 | . 268 |
269 |
270 | {/* suggested terms: 271 | {suggestedTermList} */} 272 |
273 |
274 | Click any of the expansion candidate terms below to append it to 275 | your query 276 |
277 |
281 | ? info 282 |
283 |
284 | 285 |
{expansionTermsList}
286 |
287 | {" "} 288 | The visualization above indicates how the expansion terms were 289 | generated.

290 |
291 |
292 |
293 | ); 294 | } 295 | } 296 | 297 | export default ExpandView; 298 | -------------------------------------------------------------------------------- /neuralqa/server/ui/src/components/expandview/ex.json: -------------------------------------------------------------------------------- 1 | { 2 | "terms": [ 3 | { "token": "personal", "probability": 0.2398897111415863 }, 4 | { "token": "word", "probability": 0.04981723055243492 }, 5 | { "token": "ii", "probability": 0.11301881819963455 }, 6 | { "token": "macintosh", "probability": 0.09222493320703506 } 7 | ], 8 | "query": [ 9 | "Steve", 10 | "jobs", 11 | "created", 12 | "the", 13 | "apple", 14 | "computer", 15 | "in", 16 | "which", 17 | "year" 18 | ], 19 | "expansions": [ 20 | { 21 | "token": "Steve", 22 | "expansion": null, 23 | "token_index": 0, 24 | "pos": "PROPN", 25 | "pos_desc": "proper noun", 26 | "named_entity": "PERSON", 27 | "ent_desc": "People, including fictional" 28 | }, 29 | { 30 | "token": "jobs", 31 | "expansion": null, 32 | "token_index": 1, 33 | "pos": "NOUN", 34 | "pos_desc": "noun", 35 | "named_entity": "PERSON", 36 | "ent_desc": "People, including fictional" 37 | }, 38 | { 39 | "token": "created", 40 | "expansion": null, 41 | "token_index": 2, 42 | "pos": "VERB", 43 | "pos_desc": "verb", 44 | "named_entity": "", 45 | "ent_desc": null 46 | }, 47 | { 48 | "token": "the", 49 | "expansion": null, 50 | "token_index": 3, 51 | "pos": "DET", 52 | "pos_desc": "determiner", 53 | "named_entity": "", 54 | "ent_desc": null 55 | }, 56 | { 57 | "token": "apple", 58 | "expansion": [ 59 | { "token": "apple", "probability": 0.29380887746810913 }, 60 | { "token": "personal", "probability": 0.2398897111415863 }, 61 | { "token": "word", "probability": 0.04981723055243492 } 62 | ], 63 | "token_index": 4, 64 | "pos": "NOUN", 65 | "pos_desc": "noun", 66 | "named_entity": "", 67 | "ent_desc": null 68 | }, 69 | { 70 | "token": "computer", 71 | "expansion": [ 72 | { "token": ",", "probability": 0.4731844961643219 }, 73 | { "token": "ii", "probability": 0.11301881819963455 }, 74 | { "token": "macintosh", "probability": 0.09222493320703506 } 75 | ], 76 | "token_index": 5, 77 | "pos": "NOUN", 78 | "pos_desc": "noun", 79 | "named_entity": "", 80 | "ent_desc": null 81 | }, 82 | { 83 | "token": "in", 84 | "expansion": null, 85 | "token_index": 6, 86 | "pos": "ADP", 87 | "pos_desc": "adposition", 88 | "named_entity": "", 89 | "ent_desc": null 90 | }, 91 | { 92 | "token": "which", 93 | "expansion": null, 94 | "token_index": 7, 95 | "pos": "DET", 96 | "pos_desc": "determiner", 97 | "named_entity": "", 98 | "ent_desc": null 99 | }, 100 | { 101 | "token": "year", 102 | "expansion": [ 103 | { "token": "?", "probability": 0.7166592478752136 }, 104 | { "token": ".", "probability": 0.18741711974143982 }, 105 | { "token": ";", "probability": 0.06785053759813309 } 106 | ], 107 | "token_index": 8, 108 | "pos": "NOUN", 109 | "pos_desc": "noun", 110 | "named_entity": "", 111 | "ent_desc": null 112 | } 113 | ], 114 | "took": 0.5484719276428223 115 | } 116 | -------------------------------------------------------------------------------- /neuralqa/server/ui/src/components/expandview/expandview.css: -------------------------------------------------------------------------------- 1 | .termcontainer { 2 | background-color: rgb(238, 238, 238); 3 | padding: 7px; 4 | margin: 5px 5px 0px 0px; 5 | vertical-align: top; 6 | } 7 | .termbox { 8 | padding: 6px; 9 | text-align: center; 10 | background-color: rgb(211, 211, 211); 11 | } 12 | .subtermbox { 13 | background-color: 
rgb(233, 233, 233); 14 | margin: 3px 0px 0px 0px; 15 | border: 1px solid #0062ff; 16 | } 17 | .subtermbox:hover { 18 | background-color: rgb(199, 199, 199); 19 | /* color: white; */ 20 | } 21 | .subtermgroupbox { 22 | padding-right: 3px; 23 | border: 1px solid rgb(243, 243, 243); 24 | margin-right: 5px; 25 | } 26 | .expandview { 27 | border: 1px solid #c4c3c3; 28 | /* background-color: #f0efef; */ 29 | padding-left: 10px; 30 | /* padding-right: 10px; */ 31 | padding-bottom: 10px; 32 | } 33 | 34 | .tooltip { 35 | position: relative; 36 | display: inline-block; 37 | cursor: default; 38 | } 39 | 40 | .tooltip .expandtooltiptext { 41 | visibility: hidden; 42 | padding: 5px; 43 | /* width: 120px; */ 44 | background-color: rgb(114, 114, 114); 45 | color: #fff; 46 | text-align: center; 47 | /* border-radius: 6px; */ 48 | /* padding: 5px 0; */ 49 | /* border: 1px solid black; */ 50 | 51 | /* Position the tooltip */ 52 | position: absolute; 53 | z-index: 1; 54 | bottom: 140%; 55 | left: 0%; 56 | /* margin-left: -60px; */ 57 | } 58 | 59 | .tooltip:hover .expandtooltiptext { 60 | visibility: visible; 61 | } 62 | -------------------------------------------------------------------------------- /neuralqa/server/ui/src/components/explainview/ExplainView.jsx: -------------------------------------------------------------------------------- 1 | import React, { Component } from "react"; 2 | import { Tabs, Tab } from "carbon-components-react"; 3 | import "./explainview.css"; 4 | import BarViz from "../barviz/BarViz"; 5 | 6 | class ExplainView extends Component { 7 | constructor(props) { 8 | super(props); 9 | 10 | // console.log(props); 11 | this.state = { 12 | minCharWidth: null, 13 | }; 14 | } 15 | 16 | getLabel(d, i) { 17 | return i + "*.*" + d.token + " *.* (" + d.gradient.toFixed(2) + ")"; 18 | } 19 | 20 | componentDidUpdate(prevProps, prevState) { 21 | // this.data = prevProps.explanationData; 22 | // if ( 23 | // prevProps.explanationData && 24 | // prevProps.explanationData.answer !== this.state.data.answer 25 | // ) { 26 | // console.log("updating .. ", this.data); 27 | // this.updateGraph(this.data.gradients); 28 | // this.setState({ 29 | // data: prevProps.explanationData, 30 | // }); 31 | // } 32 | // // this.setState({ 33 | // // data: prevProps.explanationData[prevProps.selectedExplanation], 34 | // // }); 35 | } 36 | 37 | componentDidMount() { 38 | this.setState({ 39 | minCharWidth: document.getElementById("barvizcontainer").offsetWidth - 40, 40 | }); 41 | } 42 | render() { 43 | const denseViz = this.props.data.gradients.map((xdata, xindex) => { 44 | return ( 45 | 52 | {xdata.token}   53 | 54 | ); 55 | }); 56 | 57 | return ( 58 |
59 | 60 | 61 | {/* {answerText} */} 62 |
63 |
64 | * Darker words indicate larger impact on answer span selection. 65 |
66 |
67 |
{denseViz}
68 |
69 |
70 |
71 | 72 | {/* {answerText} */} 73 |
74 |
75 | * Darker bars indicate larger impact on answer span selection. 76 |
77 | {this.state.minCharWidth && ( 78 | 82 | )} 83 |
84 |
85 |
86 |
87 | The visualizations above indicate how each word in the query and 88 | context contributes to the model's selection of an answer span. 89 |
90 |
91 | ); 92 | } 93 | } 94 | 95 | export default ExplainView; 96 | -------------------------------------------------------------------------------- /neuralqa/server/ui/src/components/explainview/ex.json: -------------------------------------------------------------------------------- 1 | { 2 | "gradients": [ 3 | { 4 | "gradient": 0.7776244878768921, 5 | "token": "what", 6 | "token_type": 0 7 | }, 8 | { 9 | "gradient": 0.3813329041004181, 10 | "token": "is", 11 | "token_type": 0 12 | }, 13 | { 14 | "gradient": 0.4833911657333374, 15 | "token": "the", 16 | "token_type": 0 17 | }, 18 | { 19 | "gradient": 1, 20 | "token": "goal", 21 | "token_type": 0 22 | }, 23 | { 24 | "gradient": 0.4539879858493805, 25 | "token": "of", 26 | "token_type": 0 27 | }, 28 | { 29 | "gradient": 0.2254873514175415, 30 | "token": "the", 31 | "token_type": 0 32 | }, 33 | { 34 | "gradient": 0.5164986252784729, 35 | "token": "fourth", 36 | "token_type": 0 37 | }, 38 | { 39 | "gradient": 0.577403724193573, 40 | "token": "amendment", 41 | "token_type": 0 42 | }, 43 | { 44 | "gradient": 0.4014101028442383, 45 | "token": "?", 46 | "token_type": 0 47 | }, 48 | { 49 | "gradient": 0.08653245866298676, 50 | "token": "the", 51 | "token_type": 1 52 | }, 53 | { 54 | "gradient": 0.4406398832798004, 55 | "token": "fourth", 56 | "token_type": 1 57 | }, 58 | { 59 | "gradient": 0.4501516819000244, 60 | "token": "amendment", 61 | "token_type": 1 62 | }, 63 | { 64 | "gradient": 0.09730638563632965, 65 | "token": "of", 66 | "token_type": 1 67 | }, 68 | { 69 | "gradient": 0.05219580605626106, 70 | "token": "the", 71 | "token_type": 1 72 | }, 73 | { 74 | "gradient": 0.10632430016994476, 75 | "token": "u", 76 | "token_type": 1 77 | }, 78 | { 79 | "gradient": 0.08209715783596039, 80 | "token": ".", 81 | "token_type": 1 82 | }, 83 | { 84 | "gradient": 0.11832378804683685, 85 | "token": "s", 86 | "token_type": 1 87 | }, 88 | { 89 | "gradient": 0.12593649327754974, 90 | "token": ".", 91 | "token_type": 1 92 | }, 93 | { 94 | "gradient": 0.18220987915992737, 95 | "token": "constitution", 96 | "token_type": 1 97 | }, 98 | { 99 | "gradient": 0.2233753353357315, 100 | "token": "provides", 101 | "token_type": 1 102 | }, 103 | { 104 | "gradient": 0.09926070272922516, 105 | "token": "that", 106 | "token_type": 1 107 | }, 108 | { 109 | "gradient": 0.04957512393593788, 110 | "token": "the", 111 | "token_type": 1 112 | }, 113 | { 114 | "gradient": 0.06091616675257683, 115 | "token": "right", 116 | "token_type": 1 117 | }, 118 | { 119 | "gradient": 0.0487910620868206, 120 | "token": "of", 121 | "token_type": 1 122 | }, 123 | { 124 | "gradient": 0.03942923620343208, 125 | "token": "the", 126 | "token_type": 1 127 | }, 128 | { 129 | "gradient": 0.07061353325843811, 130 | "token": "people", 131 | "token_type": 1 132 | }, 133 | { 134 | "gradient": 0.035968974232673645, 135 | "token": "to", 136 | "token_type": 1 137 | }, 138 | { 139 | "gradient": 0.05321967601776123, 140 | "token": "be", 141 | "token_type": 1 142 | }, 143 | { 144 | "gradient": 0.08256877958774567, 145 | "token": "secure", 146 | "token_type": 1 147 | }, 148 | { 149 | "gradient": 0.06096402183175087, 150 | "token": "in", 151 | "token_type": 1 152 | }, 153 | { 154 | "gradient": 0.0372462272644043, 155 | "token": "their", 156 | "token_type": 1 157 | }, 158 | { 159 | "gradient": 0.06295276433229446, 160 | "token": "persons", 161 | "token_type": 1 162 | }, 163 | { 164 | "gradient": 0.04519972205162048, 165 | "token": ",", 166 | "token_type": 1 167 | }, 168 | { 169 | "gradient": 0.06783878803253174, 170 
| "token": "houses", 171 | "token_type": 1 172 | }, 173 | { 174 | "gradient": 0.040804315358400345, 175 | "token": ",", 176 | "token_type": 1 177 | }, 178 | { 179 | "gradient": 0.06289402395486832, 180 | "token": "papers", 181 | "token_type": 1 182 | }, 183 | { 184 | "gradient": 0.03958067297935486, 185 | "token": ",", 186 | "token_type": 1 187 | }, 188 | { 189 | "gradient": 0.05112042650580406, 190 | "token": "and", 191 | "token_type": 1 192 | }, 193 | { 194 | "gradient": 0.07284298539161682, 195 | "token": "effects", 196 | "token_type": 1 197 | }, 198 | { 199 | "gradient": 0.05123045668005943, 200 | "token": ",", 201 | "token_type": 1 202 | }, 203 | { 204 | "gradient": 0.05624000355601311, 205 | "token": "against", 206 | "token_type": 1 207 | }, 208 | { 209 | "gradient": 0.1851975917816162, 210 | "token": "unreasonable", 211 | "token_type": 1 212 | }, 213 | { 214 | "gradient": 0.06078457459807396, 215 | "token": "searches", 216 | "token_type": 1 217 | }, 218 | { 219 | "gradient": 0.07405952364206314, 220 | "token": "and", 221 | "token_type": 1 222 | }, 223 | { 224 | "gradient": 0.07777296006679535, 225 | "token": "seizures", 226 | "token_type": 1 227 | }, 228 | { 229 | "gradient": 0.0655774474143982, 230 | "token": ",", 231 | "token_type": 1 232 | }, 233 | { 234 | "gradient": 0.09869317710399628, 235 | "token": "shall", 236 | "token_type": 1 237 | }, 238 | { 239 | "gradient": 0.07527285069227219, 240 | "token": "not", 241 | "token_type": 1 242 | }, 243 | { 244 | "gradient": 0.0456937812268734, 245 | "token": "be", 246 | "token_type": 1 247 | }, 248 | { 249 | "gradient": 0.10462962836027145, 250 | "token": "violated", 251 | "token_type": 1 252 | }, 253 | { 254 | "gradient": 0.06425818055868149, 255 | "token": ",", 256 | "token_type": 1 257 | }, 258 | { 259 | "gradient": 0.05537235736846924, 260 | "token": "and", 261 | "token_type": 1 262 | }, 263 | { 264 | "gradient": 0.0633930191397667, 265 | "token": "no", 266 | "token_type": 1 267 | }, 268 | { 269 | "gradient": 0.04549432918429375, 270 | "token": "warrants", 271 | "token_type": 1 272 | }, 273 | { 274 | "gradient": 0.0779174342751503, 275 | "token": "shall", 276 | "token_type": 1 277 | }, 278 | { 279 | "gradient": 0.047900501638650894, 280 | "token": "issue", 281 | "token_type": 1 282 | }, 283 | { 284 | "gradient": 0.05423515662550926, 285 | "token": ",", 286 | "token_type": 1 287 | }, 288 | { 289 | "gradient": 0.054544754326343536, 290 | "token": "but", 291 | "token_type": 1 292 | }, 293 | { 294 | "gradient": 0.04602174833416939, 295 | "token": "upon", 296 | "token_type": 1 297 | }, 298 | { 299 | "gradient": 0.07888579368591309, 300 | "token": "probable", 301 | "token_type": 1 302 | }, 303 | { 304 | "gradient": 0.07856228202581406, 305 | "token": "cause", 306 | "token_type": 1 307 | }, 308 | { 309 | "gradient": 0.11361091583967209, 310 | "token": ",", 311 | "token_type": 1 312 | }, 313 | { 314 | "gradient": 0.062142688781023026, 315 | "token": "supported", 316 | "token_type": 1 317 | }, 318 | { 319 | "gradient": 0.05307861790060997, 320 | "token": "by", 321 | "token_type": 1 322 | }, 323 | { 324 | "gradient": 0.09184946864843369, 325 | "token": "oath", 326 | "token_type": 1 327 | }, 328 | { 329 | "gradient": 0.060711491852998734, 330 | "token": "or", 331 | "token_type": 1 332 | }, 333 | { 334 | "gradient": 0.047539242853720985, 335 | "token": "affirmation", 336 | "token_type": 1 337 | }, 338 | { 339 | "gradient": 0.07517780363559723, 340 | "token": ",", 341 | "token_type": 1 342 | }, 343 | { 344 | "gradient": 0.06382681429386139, 345 | 
"token": "and", 346 | "token_type": 1 347 | }, 348 | { 349 | "gradient": 0.10078483819961548, 350 | "token": "particularly", 351 | "token_type": 1 352 | }, 353 | { 354 | "gradient": 0.07376561313867569, 355 | "token": "describing", 356 | "token_type": 1 357 | }, 358 | { 359 | "gradient": 0.03218426927924156, 360 | "token": "the", 361 | "token_type": 1 362 | }, 363 | { 364 | "gradient": 0.04783613234758377, 365 | "token": "place", 366 | "token_type": 1 367 | }, 368 | { 369 | "gradient": 0.051581379026174545, 370 | "token": "to", 371 | "token_type": 1 372 | }, 373 | { 374 | "gradient": 0.03697739914059639, 375 | "token": "be", 376 | "token_type": 1 377 | }, 378 | { 379 | "gradient": 0.08481930196285248, 380 | "token": "searched", 381 | "token_type": 1 382 | }, 383 | { 384 | "gradient": 0.0867309644818306, 385 | "token": ",", 386 | "token_type": 1 387 | }, 388 | { 389 | "gradient": 0.06060314550995827, 390 | "token": "and", 391 | "token_type": 1 392 | }, 393 | { 394 | "gradient": 0.042602699249982834, 395 | "token": "the", 396 | "token_type": 1 397 | }, 398 | { 399 | "gradient": 0.056840281933546066, 400 | "token": "persons", 401 | "token_type": 1 402 | }, 403 | { 404 | "gradient": 0.061877764761447906, 405 | "token": "or", 406 | "token_type": 1 407 | }, 408 | { 409 | "gradient": 0.04903039708733559, 410 | "token": "things", 411 | "token_type": 1 412 | }, 413 | { 414 | "gradient": 0.04995288327336311, 415 | "token": "to", 416 | "token_type": 1 417 | }, 418 | { 419 | "gradient": 0.06450371444225311, 420 | "token": "be", 421 | "token_type": 1 422 | }, 423 | { 424 | "gradient": 0.1317097693681717, 425 | "token": "seized", 426 | "token_type": 1 427 | }, 428 | { 429 | "gradient": 0.27072301506996155, 430 | "token": ".", 431 | "token_type": 1 432 | }, 433 | { 434 | "gradient": 0.3593496084213257, 435 | "token": "'", 436 | "token_type": 1 437 | }, 438 | { 439 | "gradient": 0.20046266913414001, 440 | "token": "the", 441 | "token_type": 1 442 | }, 443 | { 444 | "gradient": 0.5591774582862854, 445 | "token": "ultimate", 446 | "token_type": 1 447 | }, 448 | { 449 | "gradient": 0.6059514284133911, 450 | "token": "goal", 451 | "token_type": 1 452 | }, 453 | { 454 | "gradient": 0.22130441665649414, 455 | "token": "of", 456 | "token_type": 1 457 | }, 458 | { 459 | "gradient": 0.21139739453792572, 460 | "token": "this", 461 | "token_type": 1 462 | }, 463 | { 464 | "gradient": 0.22913874685764313, 465 | "token": "provision", 466 | "token_type": 1 467 | }, 468 | { 469 | "gradient": 0.3722269535064697, 470 | "token": "is", 471 | "token_type": 1 472 | }, 473 | { 474 | "gradient": 0.37132930755615234, 475 | "token": "to", 476 | "token_type": 1 477 | }, 478 | { 479 | "gradient": 0.3717334270477295, 480 | "token": "protect", 481 | "token_type": 1 482 | }, 483 | { 484 | "gradient": 0.16041798889636993, 485 | "token": "people", 486 | "token_type": 1 487 | }, 488 | { 489 | "gradient": 0.1518508344888687, 490 | "token": "’", 491 | "token_type": 1 492 | }, 493 | { 494 | "gradient": 0.0807218924164772, 495 | "token": "s", 496 | "token_type": 1 497 | }, 498 | { 499 | "gradient": 0.15385906398296356, 500 | "token": "right", 501 | "token_type": 1 502 | }, 503 | { 504 | "gradient": 0.050815142691135406, 505 | "token": "to", 506 | "token_type": 1 507 | }, 508 | { 509 | "gradient": 0.16720698773860931, 510 | "token": "privacy", 511 | "token_type": 1 512 | }, 513 | { 514 | "gradient": 0.14042364060878754, 515 | "token": "and", 516 | "token_type": 1 517 | }, 518 | { 519 | "gradient": 0.15495635569095612, 520 | "token": 
"freedom", 521 | "token_type": 1 522 | }, 523 | { 524 | "gradient": 0.16938212513923645, 525 | "token": "from", 526 | "token_type": 1 527 | }, 528 | { 529 | "gradient": 0.11007372289896011, 530 | "token": "unreasonable", 531 | "token_type": 1 532 | }, 533 | { 534 | "gradient": 0.11692224815487862, 535 | "token": "intrusions", 536 | "token_type": 1 537 | }, 538 | { 539 | "gradient": 0.10463482141494751, 540 | "token": "by", 541 | "token_type": 1 542 | }, 543 | { 544 | "gradient": 0.10229557752609253, 545 | "token": "the", 546 | "token_type": 1 547 | }, 548 | { 549 | "gradient": 0.4751121699810028, 550 | "token": "government", 551 | "token_type": 1 552 | }, 553 | { 554 | "gradient": 0.480056494474411, 555 | "token": ".", 556 | "token_type": 1 557 | }, 558 | { 559 | "gradient": 0.37309250235557556, 560 | "token": "however", 561 | "token_type": 1 562 | }, 563 | { 564 | "gradient": 0.2522304356098175, 565 | "token": ",", 566 | "token_type": 1 567 | }, 568 | { 569 | "gradient": 0.1848718523979187, 570 | "token": "the", 571 | "token_type": 1 572 | }, 573 | { 574 | "gradient": 0.3077230453491211, 575 | "token": "fourth", 576 | "token_type": 1 577 | }, 578 | { 579 | "gradient": 0.2492513805627823, 580 | "token": "amendment", 581 | "token_type": 1 582 | }, 583 | { 584 | "gradient": 0.11481970548629761, 585 | "token": "does", 586 | "token_type": 1 587 | }, 588 | { 589 | "gradient": 0.12900979816913605, 590 | "token": "not", 591 | "token_type": 1 592 | }, 593 | { 594 | "gradient": 0.16723977029323578, 595 | "token": "guarantee", 596 | "token_type": 1 597 | }, 598 | { 599 | "gradient": 0.08052156865596771, 600 | "token": "protection", 601 | "token_type": 1 602 | }, 603 | { 604 | "gradient": 0.0599604956805706, 605 | "token": "from", 606 | "token_type": 1 607 | }, 608 | { 609 | "gradient": 0.0466151125729084, 610 | "token": "all", 611 | "token_type": 1 612 | }, 613 | { 614 | "gradient": 0.0796627625823021, 615 | "token": "searches", 616 | "token_type": 1 617 | }, 618 | { 619 | "gradient": 0.09778217226266861, 620 | "token": "and", 621 | "token_type": 1 622 | }, 623 | { 624 | "gradient": 0.13690105080604553, 625 | "token": "seizures", 626 | "token_type": 1 627 | }, 628 | { 629 | "gradient": 0.1097775250673294, 630 | "token": ",", 631 | "token_type": 1 632 | }, 633 | { 634 | "gradient": 0.14515969157218933, 635 | "token": "but", 636 | "token_type": 1 637 | }, 638 | { 639 | "gradient": 0.07196320593357086, 640 | "token": "only", 641 | "token_type": 1 642 | }, 643 | { 644 | "gradient": 0.09799671918153763, 645 | "token": "those", 646 | "token_type": 1 647 | }, 648 | { 649 | "gradient": 0.05541221424937248, 650 | "token": "done", 651 | "token_type": 1 652 | }, 653 | { 654 | "gradient": 0.054436687380075455, 655 | "token": "by", 656 | "token_type": 1 657 | }, 658 | { 659 | "gradient": 0.06119629368185997, 660 | "token": "the", 661 | "token_type": 1 662 | }, 663 | { 664 | "gradient": 0.11526178568601608, 665 | "token": "government", 666 | "token_type": 1 667 | }, 668 | { 669 | "gradient": 0.07573369145393372, 670 | "token": "and", 671 | "token_type": 1 672 | }, 673 | { 674 | "gradient": 0.09001221507787704, 675 | "token": "deemed", 676 | "token_type": 1 677 | }, 678 | { 679 | "gradient": 0.0890301913022995, 680 | "token": "unreasonable", 681 | "token_type": 1 682 | }, 683 | { 684 | "gradient": 0.07160922139883041, 685 | "token": "under", 686 | "token_type": 1 687 | }, 688 | { 689 | "gradient": 0.05608946084976196, 690 | "token": "the", 691 | "token_type": 1 692 | }, 693 | { 694 | "gradient": 
0.05668415129184723, 695 | "token": "law", 696 | "token_type": 1 697 | }, 698 | { 699 | "gradient": 0.13379965722560883, 700 | "token": ".", 701 | "token_type": 1 702 | } 703 | ], 704 | "answer": "to protect people ’ s right to privacy and freedom from unreasonable intrusions by the government", 705 | "question": "what is the goal of the fourth amendment? " 706 | } 707 | -------------------------------------------------------------------------------- /neuralqa/server/ui/src/components/explainview/explainview.css: -------------------------------------------------------------------------------- 1 | .explanationspan { 2 | /* border: 1px solid green; */ 3 | padding: 2px; 4 | display: inline; 5 | /* line-height: 1.5em; */ 6 | } 7 | 8 | .barviz { 9 | /* border: 2px solid green; */ 10 | /* background: green; */ 11 | overflow: scroll; 12 | } 13 | 14 | .viztabcontent { 15 | background-color: rgb(235, 235, 235); 16 | } 17 | 18 | .brushbox { 19 | /* border: 1px solid green; */ 20 | width: 100%; 21 | } 22 | 23 | .graphsvgpath { 24 | /* transform: rotate(90deg); */ 25 | fill: #686868; 26 | stroke: #070707; 27 | } 28 | .brushhandle { 29 | position: absolute; 30 | height: 100%; 31 | background-color: #b4b4b48e; 32 | border: 1px solid white; 33 | border-radius: 2px; 34 | cursor: grab; 35 | } 36 | -------------------------------------------------------------------------------- /neuralqa/server/ui/src/components/footer/Footer.jsx: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2019 Fast Forward Labs. 4 | * Written by Victor Dibia / Contact : https://github.com/victordibia 5 | * CaseQA - CaseQA: Question Answering on Large Datasets with BERT. 6 | * Licensed under the MIT License (the "License"); 7 | * ============================================================================= 8 | */ 9 | 10 | 11 | import React, { Component } from "react"; 12 | import "./footer.css" 13 | 14 | class Footer extends Component { 15 | render() { 16 | return ( 17 |
18 | © NeuralQA 2020. Learn more on GitHub. 19 |
20 | ); 21 | } 22 | } 23 | 24 | export default Footer; -------------------------------------------------------------------------------- /neuralqa/server/ui/src/components/footer/footer.css: -------------------------------------------------------------------------------- 1 | 2 | 3 | #footer { 4 | position: fixed; 5 | bottom: 0; 6 | width: 100%; 7 | border-top: 1px solid #f0f3f6; 8 | padding: 14px; 9 | background: #fff; 10 | z-index: 10; 11 | } 12 | 13 | #footer a { 14 | text-decoration: none; 15 | } 16 | -------------------------------------------------------------------------------- /neuralqa/server/ui/src/components/header/Header.jsx: -------------------------------------------------------------------------------- 1 | /** 2 | * @license 3 | * Copyright 2019 Fast Forward Labs. 4 | * Written by / Contact : https://github.com/victordibia 5 | * NeuralQA - NeuralQA: Question Answering on Large Datasets with BERT. 6 | * Licensed under the MIT License (the "License"); 7 | * ============================================================================= 8 | */ 9 | 10 | 11 | import React, { Component } from "react"; 12 | import { 13 | // NavLink 14 | } from "react-router-dom"; 15 | // import { LogoGithub16 } from '@carbon/icons-react'; 16 | 17 | import "./header.css" 18 | 19 | class Header extends Component { 20 | constructor(props) { 21 | super(props) 22 | this.appName = props.data.appname || "NeuralQA" 23 | this.appDescription = props.data.appdescription || " Question Answering on Large Datasets." 24 | } 25 | render() { 26 | return ( 27 |
28 |
29 |
30 |
31 | 37 |
38 |
{this.appName}
39 |
40 | {/*
41 | NeuralQA 42 |
*/} 43 | {/* */} 49 | {/* */} 55 |
56 | 57 |
58 |
59 |
60 |
61 |
62 | 63 | ); 64 | } 65 | } 66 | 67 | export default Header; -------------------------------------------------------------------------------- /neuralqa/server/ui/src/components/header/header.css: -------------------------------------------------------------------------------- 1 | 2 | .navbarlinks a.active:hover, .navbarlinks a.active, .navbarlinks.selected { 3 | border-bottom: 4px #0062FF solid; 4 | } 5 | .navbarlinks a{ 6 | text-decoration: none; 7 | color: #fff; 8 | width: 100%; 9 | /* height: 100%; */ 10 | /* border: 1px solid pink; */ 11 | line-height: 3.1em; 12 | border-bottom: 4px solid rgb(36, 36, 36) ; 13 | padding: 0px 12px 0px 12px; 14 | } 15 | 16 | .navbarlinks a:hover{ 17 | background-color: #3D3D3D; 18 | border-bottom: 4px solid #3D3D3D ; 19 | } 20 | 21 | .headerboost{ 22 | height: 38px; 23 | } 24 | 25 | .headerrow{ 26 | height: 48px; 27 | } 28 | .headericonbox{ 29 | padding-top: 8px; 30 | } 31 | .headericon{ 32 | height: 2.0em; 33 | } 34 | /* .bx--header{ 35 | background-color: rgb(199, 29, 29); 36 | border: 10px solid green; 37 | } */ 38 | 39 | .headermain{ 40 | background: #161616 ; 41 | position: fixed; 42 | top: 0; 43 | width: 100%; 44 | z-index: 5000; 45 | } 46 | 47 | .gitlogo{ 48 | position: absolute; 49 | top: 14px; 50 | } 51 | .gitlogotext{ 52 | margin: 0px 0px 0px 20px; 53 | } -------------------------------------------------------------------------------- /neuralqa/server/ui/src/components/helperfunctions/HelperFunctions.jsx: -------------------------------------------------------------------------------- 1 | export function abbreviateString(value, maxLength) { 2 | if (value.length <= maxLength) { 3 | return value; 4 | } else { 5 | let retval = value.substring(0, maxLength) + ".."; 6 | return retval; 7 | } 8 | } 9 | 10 | function intlFormat(num) { 11 | return new Intl.NumberFormat().format(Math.round(num * 10) / 10); 12 | } 13 | export function makeFriendly(num) { 14 | if (num < 1 && num > 0) { 15 | return num; 16 | } 17 | if (Math.abs(num) >= 1000000) return intlFormat(num / 1000000) + "M"; 18 | if (Math.abs(num) >= 1000) return intlFormat(num / 1000) + "k"; 19 | return intlFormat(num); 20 | } 21 | 22 | export function getJSONData(url) { 23 | return fetch(url) 24 | .then(function (response) { 25 | if (response.status !== 200) { 26 | console.log( 27 | "Looks like there was a problem. Status Code: " + response.status 28 | ); 29 | return Promise.reject(response.status); 30 | } 31 | return response.json().then(function (data) { 32 | return data; 33 | }); 34 | }) 35 | .catch(function (err) { 36 | return Promise.reject(err); 37 | }); 38 | } 39 | 40 | export function postJSONData(url, postData) { 41 | return fetch(url, { 42 | method: "post", 43 | body: JSON.stringify(postData), 44 | headers: { 45 | "Content-Type": "application/json", 46 | }, 47 | }) 48 | .then(function (response) { 49 | if (response.status !== 200) { 50 | console.log( 51 | "Looks like there was a problem. 
Status Code: " + response.status 52 | ); 53 | return Promise.reject(response.status); 54 | } 55 | return response.json().then(function (data) { 56 | return data; 57 | }); 58 | }) 59 | .catch(function (err) { 60 | return Promise.reject(err); 61 | }); 62 | } 63 | 64 | export function sampleConfig() { 65 | return { 66 | header: { 67 | appname: "NeuralQA", 68 | appdescription: "Question Answering on Large Datasets", 69 | }, 70 | queryview: { 71 | intro: { 72 | title: "NeuralQA: Question Answering on Large Datasets", 73 | subtitle: 74 | "NeuralQA is an interactive tool for question answering (passage retrieval + document reading). You can manually provide a passage or select a search index from (e.g. case.law ) dataset under the QA configuration settings below. To begin, type in a question query below.", 75 | disclaimer: " .. ", 76 | }, 77 | views: { 78 | intro: true, 79 | advanced: true, 80 | samples: true, 81 | passages: true, 82 | explanations: true, 83 | allanswers: false, 84 | expander: false, 85 | }, 86 | options: { 87 | stride: { 88 | title: "Token Stride", 89 | selected: 0, 90 | options: [ 91 | { name: 0, value: 0 }, 92 | { name: 50, value: 50 }, 93 | { name: 100, value: 100 }, 94 | { name: 200, value: 200 }, 95 | ], 96 | }, 97 | maxdocuments: { 98 | title: "Max Documents", 99 | selected: 5, 100 | options: [ 101 | { name: 5, value: 5 }, 102 | { name: 10, value: 10 }, 103 | { name: 15, value: 15 }, 104 | ], 105 | }, 106 | fragmentsize: { 107 | title: "Fragment Size", 108 | selected: 350, 109 | options: [ 110 | { name: 350, value: 350 }, 111 | { name: 450, value: 450 }, 112 | { name: 650, value: 650 }, 113 | { name: 850, value: 850 }, 114 | ], 115 | }, 116 | relsnip: { 117 | title: "Relsnip", 118 | selected: true, 119 | options: [ 120 | { name: true, value: true }, 121 | { name: false, value: false }, 122 | ], 123 | }, 124 | samples: [ 125 | { 126 | question: "what is the goal of the fourth amendment? ", 127 | context: 128 | "The Fourth Amendment of the U.S. Constitution provides that the right of the people to be secure in their persons, houses, papers, and effects, against unreasonable searches and seizures, shall not be violated, and no Warrants shall issue, but upon probable cause, supported by Oath or affirmation, and particularly describing the place to be searched, and the persons or things to be seized.'The ultimate goal of this provision is to protect people’s right to privacy and freedom from unreasonable intrusions by the government. However, the Fourth Amendment does not guarantee protection from all searches and seizures, but only those done by the government and deemed unreasonable under the law.", 129 | }, 130 | { 131 | question: 132 | "Who was the first woman to serve on the supreme court in America", 133 | context: 134 | "Sandra Day O’Connor, née Sandra Day, (born March 26, 1930, El Paso, Texas, U.S.), associate justice of the Supreme Court of the United States from 1981 to 2006. She was the first woman to serve on the Supreme Court. A moderate conservative, she was known for her dispassionate and meticulously researched opinions. Sandra Day grew up on a large family ranch near Duncan, Arizona. 
She received undergraduate (1950) and law (1952) degrees from Stanford University, where she met the future chief justice of the United States William Rehnquist.", 135 | }, 136 | { 137 | question: "Where did Sandra Day grow up?", 138 | context: 139 | "Sandra Day O’Connor, née Sandra Day, (born March 26, 1930, El Paso, Texas, U.S.), associate justice of the Supreme Court of the United States from 1981 to 2006. She was the first woman to serve on the Supreme Court. A moderate conservative, she was known for her dispassionate and meticulously researched opinions. Sandra Day grew up on a large family ranch near Duncan, Arizona. She received undergraduate (1950) and law (1952) degrees from Stanford University, where she met the future chief justice of the United States William Rehnquist.", 140 | }, 141 | ], 142 | expander: { 143 | title: "Expander", 144 | selected: "none", 145 | options: [{ name: "None", value: "none", type: "none" }], 146 | }, 147 | reader: { 148 | title: "Reader", 149 | selected: "twmkn9/distilbert-base-uncased-squad2", 150 | options: [ 151 | { 152 | name: "DistilBERT SQUAD2", 153 | value: "twmkn9/distilbert-base-uncased-squad2", 154 | type: "distilbert", 155 | }, 156 | { 157 | name: "BERT SQUAD2", 158 | value: "deepset/bert-base-cased-squad2", 159 | type: "bert", 160 | }, 161 | ], 162 | }, 163 | retriever: { 164 | title: "Retriever", 165 | selected: "none", 166 | options: [ 167 | { name: "None", value: "none", type: "none" }, 168 | { 169 | name: "Case Law", 170 | value: "cases", 171 | host: "localhost", 172 | port: 9200, 173 | username: "None", 174 | password: "None", 175 | type: "elasticsearch", 176 | fields: { body_field: "casebody.data.opinions.text" }, 177 | }, 178 | ], 179 | readtopn: 0, 180 | }, 181 | }, 182 | }, 183 | }; 184 | } 185 | 186 | export const LeaderLine = window.LeaderLine; 187 | export const animOptions = { duration: 800, timing: "ease" }; 188 | -------------------------------------------------------------------------------- /neuralqa/server/ui/src/components/queryview/queryview.css: -------------------------------------------------------------------------------- 1 | .passagetitle { 2 | font-weight: bold; 3 | padding: 0px 0px 5px 0px; 4 | } 5 | .passagexcerpt { 6 | } 7 | .passagerow { 8 | background: rgb(243, 244, 245); 9 | padding: 10px; 10 | margin-bottom: 5px; 11 | } 12 | 13 | .rightbox { 14 | position: absolute; 15 | right: 10px; 16 | top: 10px; 17 | } 18 | 19 | .highlightsection > span > em, 20 | .contextrow > em { 21 | background-color: #fcfcaa; 22 | padding: 0px 5px 0px 5px; 23 | border: 1px solid rgb(197, 197, 197); 24 | } 25 | .underline { 26 | border-bottom: 1px dashed rgb(196, 193, 193); 27 | } 28 | .highlightsection { 29 | padding: 0px 0px 7px 0px; 30 | } 31 | .answerspan { 32 | padding: 8px; 33 | background-color: rgb(227, 227, 227); 34 | margin: 0px 0px 5px 0px; 35 | } 36 | .loaderbox { 37 | /* border: 1px solid grey; */ 38 | transition: width 0.7s, opacity 0.7s; 39 | -webkit-transition: width 0.7s, opacity 0.7s; 40 | /* width: 34px; */ 41 | } 42 | 43 | .errormessage { 44 | padding: 5px; 45 | color: white; 46 | background-color: red; 47 | } 48 | 49 | .excerpttitle { 50 | padding: 2px 2px 2px 0px; 51 | color: rgb(97, 97, 97); 52 | font-weight: bold; 53 | } 54 | 55 | .answersubrow { 56 | padding: 5px 0px 5px 0px; 57 | line-height: 1.2em; 58 | } 59 | .topanswer { 60 | /* border: 2px solid rgb(253, 162, 42); */ 61 | border: 2px solid #999898; 62 | } 63 | .answerrow { 64 | margin: 0px 0px 3px 0px; 65 | background-color: rgb(244, 244, 244); 66 | } 
67 | 68 | .contextinputarea { 69 | min-height: 120px; 70 | } 71 | .samplequestionrow { 72 | background: rgb(210, 210, 210); 73 | padding: 5px; 74 | margin: 5px 5px 0px 0px; 75 | border-left: 1.5px solid white; 76 | } 77 | .samplequestionrow.selected { 78 | background-color: rgb(190, 213, 250); 79 | border-left: 1.5px solid rgb(157, 157, 157); 80 | } 81 | 82 | .lh2m { 83 | line-height: 2em; 84 | } 85 | 86 | .answerrowtitletag { 87 | font-size: 2.4em; 88 | color: grey; 89 | } 90 | 91 | .answerquote { 92 | font-size: 1.5em; 93 | color: rgb(14, 14, 68); 94 | } 95 | 96 | .sectionheading { 97 | line-height: 2em; 98 | } 99 | 100 | .explanationspan { 101 | /* border: 1px solid green; */ 102 | padding: 2px; 103 | display: inline; 104 | /* line-height: 1.5em; */ 105 | } 106 | 107 | .whatsthis { 108 | position: absolute; 109 | right: 0px; 110 | top: 0px; 111 | background-color: #807e7e; 112 | padding: 10px; 113 | color: white; 114 | } 115 | .whatsthis:hover { 116 | background-color: #0062ff; 117 | } 118 | 119 | .infodesctitle { 120 | /* font-weight: bold; */ 121 | } 122 | 123 | .infodescrow { 124 | } 125 | 126 | .infocircle { 127 | border-radius: 50%; 128 | border: 1px solid white; 129 | padding: 0px 5px 0px 5px; 130 | } 131 | 132 | .exptermbox { 133 | margin-top: 5px; 134 | } 135 | 136 | .exptermboxdata { 137 | padding: 10px; 138 | border-left: 1px solid lightgrey; 139 | border-top: 1px solid lightgrey; 140 | border-bottom: 1px solid lightgrey; 141 | } 142 | .termboxclose { 143 | border: 1px solid lightgrey; 144 | padding: 10px; 145 | cursor: pointer; 146 | } 147 | .termboxclose:hover { 148 | background-color: lightgrey; 149 | } 150 | .selectedtermslabel { 151 | background-color: lightgrey; 152 | padding: 9px; 153 | } 154 | -------------------------------------------------------------------------------- /neuralqa/server/ui/src/components/template.css: -------------------------------------------------------------------------------- 1 | /* Text Highlights */ 2 | .textalignright { 3 | text-align: right; 4 | } 5 | .textaligncenter { 6 | text-align: center; 7 | } 8 | .textvalignmiddle { 9 | vertical-align: middle; 10 | } 11 | .bluehightlight { 12 | padding: 10px; 13 | background: rgb(190, 213, 250); 14 | margin: 0px 0px 0px 0px; 15 | } 16 | .greyhighlight { 17 | background: rgba(209, 209, 209, 0.9); 18 | } 19 | .borderleftdash { 20 | border-left: 1px dashed grey; 21 | } 22 | .lightgreyhighlight { 23 | background: rgba(231, 231, 231, 0.9); 24 | } 25 | .justifycenter { 26 | justify-content: center; 27 | } 28 | .mynotif { 29 | border-left: 4px #054ada solid; 30 | } 31 | .unselectable { 32 | -webkit-touch-callout: none; 33 | -webkit-user-select: none; 34 | -khtml-user-select: none; 35 | -moz-user-select: none; 36 | -ms-user-select: none; 37 | user-select: none; 38 | } 39 | .mediumdesc { 40 | font-size: 0.9em; 41 | } 42 | .greymoreinfo { 43 | background: rgba(209, 209, 209, 0.9); 44 | } 45 | .greymoreinfo:hover { 46 | background: rgb(187, 187, 187); 47 | } 48 | .greymoreinfo:active { 49 | background: rgb(155, 155, 155); 50 | } 51 | .modelconfigdiv { 52 | background: rgba(209, 209, 209, 0.3); 53 | } 54 | .lightbluehightlight { 55 | padding: 10px; 56 | background: #edf4ff; 57 | margin: 0px 0px 0px 0px; 58 | } 59 | 60 | .orangehighlight { 61 | padding: 10px; 62 | background: rgb(240, 134, 3); 63 | margin: 0px 0px 0px 0px; 64 | color: white; 65 | } 66 | 67 | .topblueborder { 68 | border-top: 3px solid #0062ff; 69 | } 70 | 71 | .bottomblueborder { 72 | border-bottom: 3px solid #0062ff; 73 | } 74 | 75 | .lh10 { 76 
| line-height: 1.25rem; 77 | } 78 | 79 | .lhmedium { 80 | line-height: 1rem; 81 | } 82 | .lhsmall { 83 | line-height: 0.8rem; 84 | } 85 | .smalldesc { 86 | font-size: 0.72em; 87 | } 88 | 89 | .sectiontitle { 90 | font-size: 1.2em; 91 | font-weight: bold; 92 | } 93 | 94 | .horrule { 95 | height: 1px; 96 | border-bottom: 1px solid rgba(145, 142, 142, 0.4); 97 | } 98 | 99 | .boldtext { 100 | font-weight: bold; 101 | } 102 | 103 | .boldmediumtext { 104 | font-weight: 600; 105 | } 106 | .whitetext { 107 | color: white; 108 | } 109 | .whitefill { 110 | fill: white; 111 | } 112 | .greentext { 113 | color: green; 114 | } 115 | /* Layout Spacing margin, etc */ 116 | .iblock { 117 | display: inline-block; 118 | } 119 | .redcolor { 120 | color: red; 121 | } 122 | .border { 123 | border: 1px solid green; 124 | } 125 | .greyborder { 126 | border: 1px solid rgba(105, 105, 105, 0.8); 127 | } 128 | 129 | .floatleft { 130 | float: left; 131 | } 132 | 133 | .floatright { 134 | float: right; 135 | } 136 | .clearfix { 137 | overflow: auto; 138 | } 139 | 140 | .p10 { 141 | padding: 10px; 142 | } 143 | .m10 { 144 | margin: 10px; 145 | } 146 | 147 | .p20 { 148 | padding: 20px; 149 | } 150 | .m20 { 151 | margin: 20px; 152 | } 153 | 154 | .pb10 { 155 | padding-bottom: 10px; 156 | } 157 | 158 | .pb20 { 159 | padding-bottom: 20px; 160 | } 161 | 162 | .pb3 { 163 | padding-bottom: 3px; 164 | } 165 | 166 | .p5 { 167 | padding: 5px; 168 | } 169 | .p4 { 170 | padding: 4px; 171 | } 172 | .p3 { 173 | padding: 3px; 174 | } 175 | .pt10 { 176 | padding-top: 10px; 177 | } 178 | .pt2 { 179 | padding-top: 2px; 180 | } 181 | 182 | .pt3 { 183 | padding-top: 3px; 184 | } 185 | .pt4 { 186 | padding-top: 4px; 187 | } 188 | .pt5 { 189 | padding-top: 5px; 190 | } 191 | .pt7 { 192 | padding-top: 7px; 193 | } 194 | 195 | .pb5 { 196 | padding-bottom: 5px; 197 | } 198 | .pb7 { 199 | padding-bottom: 7px; 200 | } 201 | 202 | .pb2 { 203 | padding-bottom: 2px; 204 | } 205 | 206 | .pt20 { 207 | padding-top: 20px; 208 | } 209 | 210 | .pr5 { 211 | padding-right: 5px; 212 | } 213 | .pr10 { 214 | padding-right: 10px; 215 | } 216 | .pr20 { 217 | padding-right: 20px; 218 | } 219 | 220 | .pl4 { 221 | padding-left: 4px; 222 | } 223 | 224 | .pl5 { 225 | padding-left: 5px; 226 | } 227 | 228 | .pl10 { 229 | padding-left: 10px; 230 | } 231 | .pl20 { 232 | padding-left: 20px; 233 | } 234 | 235 | .mt10 { 236 | margin-top: 10px; 237 | } 238 | .mt5 { 239 | margin-top: 5px; 240 | } 241 | 242 | .mt20 { 243 | margin-top: 20px; 244 | } 245 | 246 | .mb10 { 247 | margin-bottom: 10px; 248 | } 249 | 250 | .mb20 { 251 | margin-bottom: 20px; 252 | } 253 | 254 | .mb3 { 255 | margin-bottom: 3px; 256 | } 257 | 258 | .mb5 { 259 | margin-bottom: 5px; 260 | } 261 | 262 | .mb7 { 263 | margin-bottom: 7px; 264 | } 265 | 266 | .mr10 { 267 | margin-right: 10px; 268 | } 269 | .mr5 { 270 | margin-right: 5px; 271 | } 272 | .mr3 { 273 | margin-right: 3px; 274 | } 275 | .mr2 { 276 | margin-right: 2px; 277 | } 278 | 279 | .ml5 { 280 | margin-left: 5px; 281 | } 282 | 283 | .mr20 { 284 | margin-right: 20px; 285 | } 286 | 287 | .ml10 { 288 | margin-left: 10px; 289 | } 290 | 291 | .rad5 { 292 | border-radius: 5px; 293 | } 294 | 295 | .rad4 { 296 | border-radius: 4px; 297 | } 298 | 299 | .rad3 { 300 | border-radius: 3px; 301 | } 302 | 303 | .rad2 { 304 | border-radius: 2px; 305 | } 306 | 307 | .opacity100 { 308 | opacity: 1; 309 | } 310 | .opacity50 { 311 | opacity: 0.5; 312 | } 313 | .opacity0 { 314 | opacity: 0; 315 | } 316 | .transitiono3s { 317 | transition: opacity 0.3s ease-in-out; 318 
| -moz-transition: opacity 0.3s ease-in-out; 319 | -webkit-transition: opacity 0.3s ease-in-out; 320 | } 321 | 322 | .transitionoh4s { 323 | border: 1px solid pink; 324 | transition: opacity 0.4s ease-in-out; 325 | -webkit-transition: opacity 0.4s ease-in-out; 326 | 327 | /* transition: height .4s ease-in-out; */ 328 | -webkit-transition: height 1.4s ease-in-out; 329 | } 330 | 331 | .transitionw6s { 332 | transition: width 0.4s ease-in-out; 333 | -moz-transition: width 0.4s ease-in-out; 334 | -webkit-transition: width 0.4s ease-in-out; 335 | } 336 | .notransition { 337 | -webkit-transition: none !important; 338 | -moz-transition: none !important; 339 | -o-transition: none !important; 340 | transition: none !important; 341 | } 342 | 343 | .clickable { 344 | cursor: pointer; 345 | } 346 | .decornone > a { 347 | text-decoration: none; 348 | color: white; 349 | } 350 | .w0 { 351 | width: 0%; 352 | } 353 | .w100 { 354 | width: 100%; 355 | } 356 | 357 | .h100 { 358 | height: 100%; 359 | } 360 | .h100v { 361 | height: 100vh; 362 | } 363 | .unclickable { 364 | pointer-events: none; 365 | } 366 | 367 | .flexcolumn { 368 | flex-direction: column; 369 | } 370 | 371 | .flexstretch { 372 | align-self: stretch; 373 | } 374 | 375 | .flexjustifycenter { 376 | justify-content: center; 377 | align-items: center; 378 | } 379 | 380 | .flexjustifyright { 381 | justify-content: right; 382 | align-items: right; 383 | } 384 | 385 | .flexjustifyleft { 386 | justify-content: left; 387 | align-items: left; 388 | } 389 | 390 | .flex { 391 | display: flex; 392 | } 393 | 394 | .displaynone { 395 | display: none; 396 | } 397 | .displayblock { 398 | display: block; 399 | } 400 | 401 | .flexwrap { 402 | flex-wrap: wrap; 403 | justify-content: space-around; 404 | } 405 | 406 | .flexpushout { 407 | display: none; 408 | } 409 | 410 | @media screen and (max-width: 600px) { 411 | .flexwrap { 412 | flex-wrap: wrap; 413 | } 414 | .flexwrapitem:first-child { 415 | flex-basis: 100%; 416 | margin-bottom: 5px; 417 | } 418 | .flexwrapitem:nth-child(n + 2) { 419 | flex-basis: 100%; 420 | margin-bottom: 5px; 421 | } 422 | .flexpushout { 423 | display: block; 424 | } 425 | } 426 | 427 | @media screen and (max-width: 400px) { 428 | .flexwrapitem:nth-child(n + 2) { 429 | flex-basis: 100%; 430 | margin-bottom: 5px; 431 | } 432 | } 433 | 434 | @media screen and (max-width: 500px) { 435 | .apptitle { 436 | display: none; 437 | } 438 | } 439 | 440 | @media screen and (max-width: 800px) { 441 | .flexwrap8 { 442 | flex-wrap: wrap; 443 | } 444 | .flexwrapitem8:first-child { 445 | flex-basis: 100%; 446 | margin-bottom: 5px; 447 | } 448 | } 449 | 450 | @media screen and (max-width: 1070px) { 451 | .smallhide { 452 | display: none; 453 | } 454 | } 455 | 456 | @media screen and (min-width: 1000px) { 457 | .smallshow { 458 | display: none; 459 | } 460 | } 461 | 462 | .errordiv { 463 | border: 1px solid red; 464 | border-left: 4px solid red; 465 | } 466 | 467 | .flexfull { 468 | flex: 1; 469 | } 470 | 471 | .flex1 { 472 | flex: 0.1; 473 | } 474 | 475 | .flex2 { 476 | flex: 0.2; 477 | } 478 | 479 | .flex20 { 480 | flex: 2; 481 | } 482 | 483 | .flex30 { 484 | flex: 3; 485 | } 486 | .flex40 { 487 | flex: 4; 488 | } 489 | .flex80 { 490 | flex: 8; 491 | } 492 | 493 | .flex3 { 494 | flex: 0.3; 495 | } 496 | .flex35 { 497 | flex: 0.35; 498 | } 499 | 500 | .flex4 { 501 | flex: 0.4; 502 | } 503 | 504 | .flex5 { 505 | flex: 0.5; 506 | } 507 | 508 | .flex6 { 509 | flex: 0.6; 510 | } 511 | 512 | .flex7 { 513 | flex: 0.7; 514 | } 515 | 516 | .flex8 { 517 | flex: 
0.8; 518 | } 519 | 520 | .flex9 { 521 | flex: 0.9; 522 | } 523 | 524 | .positionrelative { 525 | position: relative; 526 | } 527 | 528 | .positionabsolute { 529 | position: absolute; 530 | } 531 | .bottomright { 532 | bottom: 0px; 533 | right: 0px; 534 | } 535 | .container-fluid { 536 | max-width: 1220px; 537 | margin: auto; 538 | } 539 | 540 | .centerpage { 541 | max-width: 1220px; 542 | margin: auto; 543 | } 544 | 545 | .scrollwindow { 546 | overflow: scroll; 547 | } 548 | 549 | .scrollwindow::-webkit-scrollbar { 550 | -webkit-appearance: none; 551 | } 552 | 553 | .scrollwindow::-webkit-scrollbar:vertical { 554 | width: 8px; 555 | } 556 | 557 | .scrollwindow::-webkit-scrollbar:horizontal { 558 | height: 8px; 559 | } 560 | 561 | .scrollwindow::-webkit-scrollbar-thumb { 562 | border-radius: 8px; 563 | border: 2px solid white; /* should match background, can't be transparent */ 564 | background-color: rgba(0, 0, 0, 0.5); 565 | } 566 | ::-webkit-scrollbar-corner { 567 | background-color: rgba(0, 0, 0, 0); 568 | } 569 | -------------------------------------------------------------------------------- /neuralqa/server/ui/src/components/template.scss: -------------------------------------------------------------------------------- 1 | // @import 'carbon-components/scss/globals/scss/_styles.scss'; 2 | 3 | //------------------------- 4 | // 🌍 Global 5 | //------------------------- 6 | 7 | $css--font-face: true; 8 | $css--helpers: true; 9 | $css--body: true; 10 | $css--use-layer: true; 11 | $css--reset: true; 12 | $css--typography: true; 13 | $css--plex: true; 14 | 15 | @import "carbon-components/scss/globals/scss/_typography"; 16 | @import "carbon-components/scss/globals/scss/_css--font-face"; 17 | @import "carbon-components/scss/globals/scss/_css--body"; 18 | 19 | //------------------------- 20 | // 🍕 Components 21 | //------------------------- 22 | 23 | @import "carbon-components/scss/components/select/_select"; 24 | @import "carbon-components/scss/components/tabs/_tabs"; 25 | // @import "carbon-components/scss/components/tabs/_tab"; 26 | // @import 'carbon-components/scss/components/data-table/_data-table'; 27 | @import "carbon-components/scss/components/loading/_loading"; 28 | @import "carbon-components/scss/components/modal/_modal"; 29 | @import "carbon-components/scss/components/button/_button"; 30 | @import "carbon-components/scss/components/checkbox/_checkbox"; 31 | // @import 'carbon-components/scss/components/radio-button/_radio-button'; 32 | @import "carbon-components/scss/components/toggle/_toggle"; 33 | // @import 'carbon-components/scss/components/search/_search'; 34 | @import "carbon-components/scss/components/tooltip/_tooltip"; 35 | @import "carbon-components/scss/components/slider/_slider"; 36 | @import "carbon-components/scss/components/text-area/_text-area"; 37 | 38 | // @import 'carbon-components/scss/components/combo-box/_combo-box'; 39 | 40 | // @import 'carbon-components/scss/components/list-box/_list-box'; 41 | 42 | // .bx--toggle-input__label, .bx--toggle-input{ 43 | // background-color: green; 44 | // margin: 0px; 45 | // padding: 0px; 46 | // height: 24px ; 47 | // } 48 | 49 | .bx--tab-content { 50 | padding: 10px 0px 0px 0px; 51 | } 52 | -------------------------------------------------------------------------------- /neuralqa/server/ui/src/components/testview/TestView.jsx: -------------------------------------------------------------------------------- 1 | import React, { Component } from "react"; 2 | import "./testview.css"; 3 | import BarViz from "../barviz/BarViz"; 
4 | 5 | class TestView extends Component { 6 | constructor(props) { 7 | super(props); 8 | 9 | this.data = require("./ex.json"); 10 | // this.data.gradients = this.data.gradients.concat(this.data.gradients); 11 | // console.log(this.data); 12 | } 13 | componentDidMount() { 14 | this.barVizWidth = document.getElementById("barvizcontainer").offsetWidth; 15 | // console.log(this.barVizWidth); 16 | } 17 | 18 | render() { 19 | return ( 20 |
21 | 22 |
23 | ); 24 | } 25 | } 26 | 27 | export default TestView; 28 | -------------------------------------------------------------------------------- /neuralqa/server/ui/src/components/testview/testview.css: -------------------------------------------------------------------------------- 1 | .d3brush { 2 | /* border: 1px solid black; */ 3 | } 4 | 5 | .yticktext { 6 | border: 1px solid green; 7 | background-color: rgba(85, 85, 85, 0.986); 8 | } 9 | 10 | .barviz { 11 | overflow: hidden; 12 | } 13 | -------------------------------------------------------------------------------- /neuralqa/server/ui/src/index.js: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ReactDOM from 'react-dom'; 3 | import './components/template.scss'; 4 | import './components/template.css'; 5 | import App from './components/Main'; 6 | import * as serviceWorker from './serviceWorker'; 7 | 8 | ReactDOM.render(, document.getElementById('root')); 9 | 10 | // If you want your app to work offline and load faster, you can change 11 | // unregister() to register() below. Note this comes with some pitfalls. 12 | // Learn more about service workers: https://bit.ly/CRA-PWA 13 | serviceWorker.unregister(); 14 | -------------------------------------------------------------------------------- /neuralqa/server/ui/src/serviceWorker.js: -------------------------------------------------------------------------------- 1 | // This optional code is used to register a service worker. 2 | // register() is not called by default. 3 | 4 | // This lets the app load faster on subsequent visits in production, and gives 5 | // it offline capabilities. However, it also means that developers (and users) 6 | // will only see deployed updates on subsequent visits to a page, after all the 7 | // existing tabs open on the page have been closed, since previously cached 8 | // resources are updated in the background. 9 | 10 | // To learn more about the benefits of this model and instructions on how to 11 | // opt-in, read https://bit.ly/CRA-PWA 12 | 13 | const isLocalhost = Boolean( 14 | window.location.hostname === 'localhost' || 15 | // [::1] is the IPv6 localhost address. 16 | window.location.hostname === '[::1]' || 17 | // 127.0.0.0/8 are considered localhost for IPv4. 18 | window.location.hostname.match( 19 | /^127(?:\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}$/ 20 | ) 21 | ); 22 | 23 | export function register(config) { 24 | if (process.env.NODE_ENV === 'production' && 'serviceWorker' in navigator) { 25 | // The URL constructor is available in all browsers that support SW. 26 | const publicUrl = new URL(process.env.PUBLIC_URL, window.location.href); 27 | if (publicUrl.origin !== window.location.origin) { 28 | // Our service worker won't work if PUBLIC_URL is on a different origin 29 | // from what our page is served on. This might happen if a CDN is used to 30 | // serve assets; see https://github.com/facebook/create-react-app/issues/2374 31 | return; 32 | } 33 | 34 | window.addEventListener('load', () => { 35 | const swUrl = `${process.env.PUBLIC_URL}/service-worker.js`; 36 | 37 | if (isLocalhost) { 38 | // This is running on localhost. Let's check if a service worker still exists or not. 39 | checkValidServiceWorker(swUrl, config); 40 | 41 | // Add some additional logging to localhost, pointing developers to the 42 | // service worker/PWA documentation. 
43 | navigator.serviceWorker.ready.then(() => { 44 | console.log( 45 | 'This web app is being served cache-first by a service ' + 46 | 'worker. To learn more, visit https://bit.ly/CRA-PWA' 47 | ); 48 | }); 49 | } else { 50 | // Is not localhost. Just register service worker 51 | registerValidSW(swUrl, config); 52 | } 53 | }); 54 | } 55 | } 56 | 57 | function registerValidSW(swUrl, config) { 58 | navigator.serviceWorker 59 | .register(swUrl) 60 | .then(registration => { 61 | registration.onupdatefound = () => { 62 | const installingWorker = registration.installing; 63 | if (installingWorker == null) { 64 | return; 65 | } 66 | installingWorker.onstatechange = () => { 67 | if (installingWorker.state === 'installed') { 68 | if (navigator.serviceWorker.controller) { 69 | // At this point, the updated precached content has been fetched, 70 | // but the previous service worker will still serve the older 71 | // content until all client tabs are closed. 72 | console.log( 73 | 'New content is available and will be used when all ' + 74 | 'tabs for this page are closed. See https://bit.ly/CRA-PWA.' 75 | ); 76 | 77 | // Execute callback 78 | if (config && config.onUpdate) { 79 | config.onUpdate(registration); 80 | } 81 | } else { 82 | // At this point, everything has been precached. 83 | // It's the perfect time to display a 84 | // "Content is cached for offline use." message. 85 | console.log('Content is cached for offline use.'); 86 | 87 | // Execute callback 88 | if (config && config.onSuccess) { 89 | config.onSuccess(registration); 90 | } 91 | } 92 | } 93 | }; 94 | }; 95 | }) 96 | .catch(error => { 97 | console.error('Error during service worker registration:', error); 98 | }); 99 | } 100 | 101 | function checkValidServiceWorker(swUrl, config) { 102 | // Check if the service worker can be found. If it can't reload the page. 103 | fetch(swUrl, { 104 | headers: { 'Service-Worker': 'script' }, 105 | }) 106 | .then(response => { 107 | // Ensure service worker exists, and that we really are getting a JS file. 108 | const contentType = response.headers.get('content-type'); 109 | if ( 110 | response.status === 404 || 111 | (contentType != null && contentType.indexOf('javascript') === -1) 112 | ) { 113 | // No service worker found. Probably a different app. Reload the page. 114 | navigator.serviceWorker.ready.then(registration => { 115 | registration.unregister().then(() => { 116 | window.location.reload(); 117 | }); 118 | }); 119 | } else { 120 | // Service worker found. Proceed as normal. 121 | registerValidSW(swUrl, config); 122 | } 123 | }) 124 | .catch(() => { 125 | console.log( 126 | 'No internet connection found. App is running in offline mode.' 127 | ); 128 | }); 129 | } 130 | 131 | export function unregister() { 132 | if ('serviceWorker' in navigator) { 133 | navigator.serviceWorker.ready 134 | .then(registration => { 135 | registration.unregister(); 136 | }) 137 | .catch(error => { 138 | console.error(error.message); 139 | }); 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /neuralqa/server/ui/src/setupTests.js: -------------------------------------------------------------------------------- 1 | // jest-dom adds custom jest matchers for asserting on DOM nodes. 
2 | // allows you to do things like: 3 | // expect(element).toHaveTextContent(/react/i) 4 | // learn more: https://github.com/testing-library/jest-dom 5 | import '@testing-library/jest-dom/extend-expect'; 6 | -------------------------------------------------------------------------------- /neuralqa/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .config_utils import ConfigParser 2 | from .file_utils import * 3 | from .data_utils import import_sample_data, parse_field_content 4 | -------------------------------------------------------------------------------- /neuralqa/utils/cli_args.py: -------------------------------------------------------------------------------- 1 | """ 2 | Definitions of click options shared by several CLI commands. 3 | """ 4 | import click 5 | 6 | 7 | HOST = click.option("--host", "-h", default="127.0.0.1", 8 | help="The network address to listen on (default: 127.0.0.1). " 9 | "Use 0.0.0.0 to bind to all addresses if you want to access the " 10 | "server from other machines.") 11 | 12 | PORT = click.option("--port", "-p", default=5000, 13 | help="The port to listen on (default: 5000).") 14 | 15 | 16 | WORKERS = click.option("--workers", "-w", default=1, 17 | help="Number of uvicorn worker processes to handle requests (default: 1).") 18 | 19 | MAX_DOCS = click.option("--max-docs", "-md", default=2000, 20 | help="Maximum number of sample documents to import when loading sample data into the local index (default: 2000).") 21 | 22 | CONFIG_PATH = click.option("--config-path", "-cp", default=None, 23 | help="Path to a yaml file containing config for neuralqa. " 24 | "If none is provided, the default config.yaml is copied to the current directory.") 25 | -------------------------------------------------------------------------------- /neuralqa/utils/config_utils.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os 3 | import logging 4 | import shutil 5 | 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class ConfigParser: 11 | def __init__(self, config_path): 12 | 13 | module_file_path = os.path.dirname(os.path.abspath(__file__)) 14 | self.default_config_path = os.path.join( 15 | module_file_path, "../config_default.yaml") 16 | self.current_config_path = os.path.join(os.getcwd(), "config.yaml") 17 | 18 | if config_path and os.path.exists(config_path): 19 | self.config = self.load_config(config_path) 20 | # else: 21 | # logger.info("Supplied config file does not exist. " + 22 | # os.path.join(os.getcwd(), config_path)) 23 | # logger.info("Creating new config file at " + 24 | # self.current_config_path) 25 | # self.config = self.load_default_config() 26 | else: 27 | 28 | if (config_path and not os.path.exists(config_path)): 29 | logger.info(">> Supplied config file does not exist. 
" + 30 | os.path.join(os.getcwd(), config_path)) 31 | 32 | if os.path.exists(self.current_config_path): 33 | logger.info(">> Found config.yaml file found in current directory " + 34 | self.current_config_path) 35 | self.config = self.load_config(self.current_config_path) 36 | else: 37 | logger.info(">> Creating new config file at " + 38 | self.current_config_path) 39 | shutil.copyfile(self.default_config_path, 40 | self.current_config_path) 41 | self.config = self.load_default_config() 42 | 43 | def load_default_config(self): 44 | with open(self.default_config_path) as f: 45 | default_config = yaml.safe_load(f) 46 | return default_config 47 | 48 | def load_config(self, config_path): 49 | """Specially load a config file path. 50 | Will first load the default config file, and update its values with 51 | the content of the file in config_path. 52 | 53 | Args: 54 | config_path ([type]): [description] 55 | 56 | Returns: 57 | [type]: [description] 58 | """ 59 | default_config = self.load_default_config() 60 | 61 | with open(config_path) as f: 62 | config = yaml.safe_load(f) 63 | 64 | default_config.update(config) 65 | return default_config 66 | -------------------------------------------------------------------------------- /neuralqa/utils/data_utils.py: -------------------------------------------------------------------------------- 1 | from elasticsearch import Elasticsearch 2 | import os 3 | import zipfile 4 | import shutil 5 | import urllib.request 6 | import logging 7 | import lzma 8 | import json 9 | import tarfile 10 | import hashlib 11 | 12 | 13 | logger = logging.getLogger(__name__) 14 | # index settings with analyzer to automatically remove stop words 15 | index_settings = { 16 | "settings": { 17 | "analysis": { 18 | "analyzer": { 19 | "stop_analyzer": { 20 | "type": "standard", 21 | "stopwords": "_english_" 22 | } 23 | } 24 | } 25 | }, 26 | "mappings": { 27 | "properties": { 28 | "casebody.data.opinions.text": { 29 | "type": "text", 30 | "analyzer": "stop_analyzer" 31 | }, 32 | "name": { 33 | "type": "text", 34 | "analyzer": "stop_analyzer" 35 | } 36 | } 37 | } 38 | } 39 | 40 | 41 | def create_index_from_json(index_name, file_path, max_docs=None): 42 | """Create an index from json file formats. 43 | Read each file line by line, parse each line as json 44 | jsonl.xz 45 | json : must be a json file containing a list 46 | 47 | Arguments: 48 | file_path {str} -- path to case.law bulk file 49 | Keyword Arguments: 50 | max_docs {int} -- maximum size of records to use in creating index. 51 | small default can be used to enable quick testing (e.g: {2000}). 52 | set this to None to use the entire data file. 53 | """ 54 | # print("*** maxdocs", max_docs) 55 | es = Elasticsearch([{'host': 'localhost', 'port': 9200}]) 56 | 57 | es.indices.create( 58 | index=index_name, body=index_settings, ignore=400) 59 | 60 | extension = os.path.splitext(file_path)[1] 61 | logger.info(">> Creating index using file " + file_path) 62 | i = 0 63 | if extension == ".xz": 64 | with lzma.open(file_path) as f: 65 | for line in f: 66 | i += 1 67 | line = json.loads(str(line, 'utf8')) 68 | try: 69 | index_status = es.index( 70 | index=index_name, id=i, body=line) 71 | # print(index_status) 72 | except Exception as e: 73 | logger.info( 74 | "An error has occurred while creating index " + str(e)) 75 | break 76 | # logger.info(index_status) 77 | if (i > max_docs): 78 | break 79 | logger.info(">> Creating index complete, delete data file .. 
") 80 | os.remove(file_path) 81 | 82 | 83 | def import_scotus_files(max_docs=2000): 84 | scotus_url = "https://www.courtlistener.com/api/bulk-data/opinions/scotus.tar.gz" 85 | scotus_dir = "scotusdata" 86 | index_name = "supremecourt" 87 | 88 | if (not os.path.exists(scotus_dir)): 89 | os.makedirs(scotus_dir, exist_ok=True) 90 | logger.info(">>> Downloading supreme court case data") 91 | ftpstream = urllib.request.urlopen(scotus_url) 92 | thetarfile = tarfile.open(fileobj=ftpstream, mode="r|gz") 93 | thetarfile.extractall(path=scotus_dir) 94 | logger.info(">>> Download completed ") 95 | 96 | logger.info(">> Creating %s index using %s documents", 97 | index_name, str(max_docs)) 98 | scotus_files = os.listdir(scotus_dir) 99 | 100 | es = Elasticsearch([{'host': 'localhost', 'port': 9200}]) 101 | 102 | es.indices.create( 103 | index=index_name, body=index_settings, ignore=400) 104 | 105 | i = 0 106 | for file_path in (scotus_files): 107 | with open("scotusdata/" + file_path) as json_file: 108 | scotus_case = json.load(json_file) 109 | case = {"author": scotus_case["author"], 110 | "casebody": scotus_case["plain_text"]} 111 | if (scotus_case["plain_text"] != ""): 112 | try: 113 | index_status = es.index( 114 | index=index_name, id=scotus_case["id"], body=case) 115 | except Exception as e: 116 | logger.info( 117 | "An error has occurred while creating index " + str(e)) 118 | break 119 | i += 1 120 | if (i > max_docs): 121 | break 122 | 123 | logger.info(">> Index creation complete.") 124 | 125 | 126 | def download_data(data_url, source_name): 127 | """Download Zip datafile from case.law 128 | Arguments: 129 | data_url {str} -- url path dataset 130 | source_name {str} -- name for dataset 131 | """ 132 | # create data directory 133 | os.makedirs("data", exist_ok=True) 134 | # download data from caselaw 135 | zip_file_path = source_name + ".zip" 136 | logger.info(">> Downloading data file for " + source_name) 137 | urllib.request.urlretrieve(data_url, zip_file_path) 138 | logger.info(">> Downloaded data file " + zip_file_path) 139 | 140 | extract_dir = "temp" + source_name 141 | with zipfile.ZipFile(zip_file_path, 'r') as zip_ref: 142 | zip_ref.extractall(extract_dir) 143 | data_file = os.path.join(extract_dir, os.listdir( 144 | extract_dir)[0], "data", "data.jsonl.xz") 145 | final_file_path = os.path.join("data", source_name + "jsonl.xz") 146 | shutil.copyfile(data_file, final_file_path) 147 | logger.info(">> Extracted and moved jsonl file to data folder") 148 | shutil.rmtree(extract_dir) 149 | os.remove(zip_file_path) 150 | return final_file_path 151 | 152 | 153 | def import_sample_data(max_docs=2000): 154 | """This method downloads several datasets and builds an 155 | elasticsearch index using the downloaded data. 156 | Caselaw 157 | 158 | Args: 159 | max_docs (int, optional): [description]. Defaults to 2000. 160 | """ 161 | caselaw_data_paths = [ 162 | ["https://api.case.law/v1/bulk/22411/download/", "newmexico"] 163 | ] 164 | for data_path in caselaw_data_paths: 165 | file_path = download_data(data_path[0], data_path[1]) 166 | create_index_from_json("cases", file_path, max_docs=max_docs) 167 | 168 | # import_scotus_files(max_docs=max_docs) 169 | # import_medical_data(max_docs=max_docs) 170 | 171 | 172 | def parse_field_content(field_name, content): 173 | """Parse content fields if nested using dot notation, else return content as is. 174 | e.g. for acrray content and field_name casebody.data.opinions.text, we return 175 | content[casebody][data][opinions][text]. 
--------------------------------------------------------------------------------
/neuralqa/utils/file_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/victordibia/neuralqa/fb48f4d45d5856195baef25b4707e7b282cc364d/neuralqa/utils/file_utils.py
--------------------------------------------------------------------------------
/neuralqa/version.py:
--------------------------------------------------------------------------------

VERSION = "0.0.31-alpha"
--------------------------------------------------------------------------------
/notes.md:
--------------------------------------------------------------------------------
## General Notes on Designing CaseQA

In this note, we discuss some implementation decisions made while designing CaseQA.

### Information Retrieval with Elasticsearch

- User experience

  - Query highlighting.
    We use the query [highlighting](https://www.elastic.co/guide/en/elasticsearch/reference/6.8/search-request-highlighting.html) feature offered by Elasticsearch.
    - Pros: Showing highlights of how matching is performed helps the user make sense of decisions made by the IR module.
    - Cons: Query highlighting can increase query time (in some cases we saw up to **5x** increased time for first-time queries).

  - Stop word removal.
    We apply an [elastic analyzer](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-stop-analyzer.html) to prevent Elasticsearch from matching based on stop words.
    Search queries that include stop words (the, of, a) can make IR results noisy; e.g. longer passages containing these words get better matching scores based on the frequency of stop words, which hold little value for actual relevance. Given that we rely heavily on the document retrieval ranking produced by Elasticsearch for downstream QA, we want to be more conservative here.

### Passage Selection Strategy

- BERT models process input with a max sequence length of 512 tokens. This introduces latency challenges when attempting to read a typical court document (> 10,000 tokens). In general, we have a few options:
  - Read all passages
    - Look at all opinions within each case
    - Pro
      - Exhaustive search on all content
    - Con
      - Court documents are long and sometimes repetitive.
  - Passage fragmentation
    - At index creation time, we can break up large passages into smaller paragraphs and store them as individual documents in the index.
    - Feed each of these passages to BERT to find answers
    - Pro
      - Some research suggests this approach yields good results
    - Con
      - It can be challenging to identify the right segmentation strategy (e.g. size of each paragraph), and this has to be done at index creation time.
  - [The approach we use] Curated passages based on highlights
    - Use highlights from Elasticsearch (`n` snippets that contain the search query) as passage candidates
    - This allows us to reduce a passage of 10k tokens to ~1,000 tokens!
    - Merge highlights from each passage into a single combined passage that can be read by BERT
    - Depending on the size of the snippets used, the combined passage may still exceed the number of tokens BERT can process. Here we use a chunking approach (sketched after this list):
      - Encode the question and the long passage once
      - Construct question + passage_chunk such that len(question + passage_chunk) < max_model_length
      - Use a stride to keep some context across chunks (this can result in more tokens overall)
    - Rank extracted answer spans based on the softmax probability of the answer start position.
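A minimal sketch of this chunking step, operating on already-tokenized inputs (function name, token budget, and stride value are illustrative assumptions, not NeuralQA's actual implementation):

```python
def chunk_passage(question_tokens, passage_tokens, max_len=512, stride=50):
    """Yield question + passage chunks that each fit within max_len tokens."""
    # Reserve room for the question and for [CLS]/[SEP]-style special tokens.
    budget = max_len - len(question_tokens) - 3
    assert budget > stride, "question too long for the given max_len/stride"
    start = 0
    while start < len(passage_tokens):
        yield question_tokens + passage_tokens[start:start + budget]
        if start + budget >= len(passage_tokens):
            break
        # The stride re-reads some tokens so that answers straddling a chunk
        # boundary are not lost (hence "can result in more tokens").
        start += budget - stride
```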
### Passage Tokenization

- Using the default tokenization for distilbert (or any other neural model) reveals a couple of issues. First, our case dataset contains a set of gnarly sequences (citations), e.g. `Baldasar v. Illinois, 446 U.S. 222, 100 S.Ct. 1585, 64 L.Ed.2d 169 (1980)`, which can get tokenized into the strangest (long) tokens.
- This makes a case for some form of "data cleaning" to manage the number of tokens generated for each passage, e.g. removing certain alphanumeric words/sequences that are unlikely to contribute to the meaning of queries.
- Transformers fast tokenizer
  - Using the fast tokenizer library by HuggingFace resulted in ~6.8x speedups for input tokenization.

## Thoughts

- Leveraging signals from highlights provided by IR methods goes a long way in making BERT practical for use today.
- On a commercial CPU laptop it takes about 0.3 seconds for BERT to read a relatively short passage (200 words).
- While IR will frequently return snippets that are relevant, there is still additional human effort required to parse each of these snippets and examine the surrounding area for clues towards the answer. This is where BERT QA can serve to reduce effort significantly. By surfacing answer snippets, the BERT model either addresses the user's requirement immediately or serves as an index into the larger document for further exploration.

Other implementation notes:

## Serving UI/API

- Serve both the UI and the API over the same backend server. This simplifies the build in that we can think of /ui as just another API endpoint. Users can still run the backend without the UI as long as they conform to the API standards.
  Caveat: some users want varied exposure, e.g.:
  - expose the front end but not the API; only the front end should be able to call the API
  - expose the API to specific internal applications
  - expose the API to any application

## Retriever Interface on yaml

- search fields []: a list of the fields in the index we want to search on
- title field (optional): used as a title if available; else the first n characters of the body are used
- body field: a subset of this field is shown in the UI; this is also the field that is passed to the reader
- body offset: size of the body snippet shown in the UI
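A hypothetical `config.yaml` fragment matching the fields above (only the `retriever` → `options` → `connection` nesting is confirmed by the tests in this repo; the remaining key names are illustrative):

```yaml
retriever:
  options:
    - name: cases
      connection:
        host: localhost
        port: 9200
      search_fields:                 # fields in the index to search on
        - casebody.data.opinions.text
      title_field: name              # optional; falls back to a body prefix
      body_field: casebody.data.opinions.text
      body_offset: 300               # size of body snippet shown in the UI
```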
## TODO

- expose helpful functions that enable neural QA
  - `create_index` neuralqa.elasticsearch: import data into Elasticsearch
  - `expand_query` neuralqa.expander:
  - `BertModel` ... create model

* specify interface for search queries

  - configuration file to define how search queries should be processed
  - list of fields to search over
  - method for constructing highlights
  - allow renaming of fields to create a unified interface for the UI to visualize
    - excerpt
    - title
    - highlight

* Config flow
  - command line specifies where the config file is
  - if not specified
    - copy the default yaml file to the current directory
    - print a message for the user to modify this file to enable additional config

### Release Checklist

- verify latest build of ui without debug flags, e.g. setting port to local port
- rebuild ui
- copy latest config.yaml structure to config-default.yaml
- remove CORS testing harness for UI

- remove manual UI pointers to port 5000
- remove CORS allow rules on backend api

- verify version bump

## FAQs

- How does NeuralQA handle really long passages? BERT can process input of max size 512 tokens (question + context + special tokens); how does NeuralQA handle longer passages?
  We divide the context into multiple chunks (with some optional striding) and find answers within each chunk. For very long passages, you can also limit how much text reaches the reader, e.g. by reducing the number or size of the highlight snippets returned by the retriever.

## Notes on Running Locally

- `pip install neuralqa`
- `neuralqa load` - optionally load sample data into an index running on localhost:9200 with no credentials
- `neuralqa ui` - launch the web interface
  - this will create a default config.yaml in the current folder. You can modify this file and then rerun `neuralqa ui`.
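Putting these commands together with the options defined in `cli_args.py`, a local session might look like the following (whether `load` accepts `--max-docs` is inferred from that option's help text, not verified):

```bash
pip install neuralqa
# Optional: import sample data into a local Elasticsearch index (localhost:9200).
neuralqa load --max-docs 2000
# Launch the UI; creates ./config.yaml from the packaged defaults on first run.
neuralqa ui --host 0.0.0.0 --port 5000 --workers 1
```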
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
torch==1.5.0
tensorflow==2.4.0
transformers==3.0.2
uvicorn
aiofiles
fastapi
elasticsearch==7.7.1
pyyaml==3.13
spacy
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/victordibia/neuralqa/fb48f4d45d5856195baef25b4707e7b282cc364d/setup.cfg
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import os
from importlib.machinery import SourceFileLoader
from setuptools import setup, find_packages


version = SourceFileLoader('neuralqa.version', os.path.join(
    'neuralqa', 'version.py')).load_module().VERSION


def package_files(directory):
    """Collect all file paths under directory for inclusion as package data."""
    paths = []
    for (path, _, filenames) in os.walk(directory):
        for filename in filenames:
            paths.append(os.path.join('..', path, filename))
    return paths


ui_files = package_files("neuralqa/server/ui/build")
yaml_file = ["config_default.yaml"]
setup(
    name='neuralqa',
    packages=find_packages(exclude=['tests', 'tests.*']),
    package_data={"neuralqa": ui_files + yaml_file},
    version=version,
    license='MIT',
    description='NeuralQA: Question Answering on Large Datasets',
    long_description=open('README.md').read(),
    long_description_content_type="text/markdown",
    author='Victor Dibia',
    url='https://github.com/victordibia/neuralqa',
    python_requires='>=3.5',
    # download_url='https://github.com/victordibia/neuralqa/archive/v0.0.2.tar.gz',
    keywords=['NLP', 'Question Answering', 'Machine Learning'],
    install_requires=[
        'fastapi',
        'aiofiles',
        'uvicorn',
        'numpy',
        'tensorflow>=2.1.0',
        'torch',
        'torchvision',
        'transformers',
        'elasticsearch>=7.7.1',
        'pyyaml>=3.13',
        'spacy'
    ],
    extras_require={
        'test': ['pytest']
    },
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3.6',
    ],
    entry_points={
        "console_scripts": [
            "neuralqa=neuralqa.cli:cli",
        ]
    }
)
--------------------------------------------------------------------------------
/tests/expander/test_expander.py:
--------------------------------------------------------------------------------
from neuralqa.expander import MLMExpander


def test_mlm_expander():
    # model_path is optional; left unset here so the expander uses its default model.
    expander_kwargs = {
        # "model_path": "distilbert-base-uncased"
    }
    test_string = "Steve jobs created the apple computer in which year"
    expander = MLMExpander(**expander_kwargs)
    expansion = expander.expand_query(test_string)
    assert len(expansion["terms"]) > 0
    print(expansion)


test_mlm_expander()
--------------------------------------------------------------------------------
/tests/reader/test_reader.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/victordibia/neuralqa/fb48f4d45d5856195baef25b4707e7b282cc364d/tests/reader/test_reader.py
--------------------------------------------------------------------------------
/tests/retriever/test_retriever.py:
--------------------------------------------------------------------------------
from neuralqa.retriever import ElasticSearchRetriever
from neuralqa.utils import ConfigParser


def test_elasticsearch_retriever():
    # Requires a config.yaml in the working directory and a reachable index.
    app_config = ConfigParser("config.yaml")
    rkwargs = app_config.config["retriever"]["options"][1]["connection"]
    retriever = ElasticSearchRetriever(**rkwargs)
    results = retriever.run_query(
        "cases", "what is the punishment for arson crime")
    assert results is not None


test_elasticsearch_retriever()
--------------------------------------------------------------------------------
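Since `setup.py` declares a `test` extra, the suite can be run via pytest; the module-level calls in each test file also allow running them as plain scripts (both assume the required models and a local Elasticsearch instance are available):

```bash
pip install neuralqa[test]
pytest tests/          # or: python tests/expander/test_expander.py
```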