├── .dockerignore
├── .gitignore
├── Dockerfile
├── README.md
├── __init__.py
├── app.py
├── calculate_network_change.py
├── conductor.py
├── doc
│ ├── README.md
│ ├── aske_context.md
│ ├── innovations.md
│ ├── installation.md
│ ├── lessons_learned.md
│ ├── notebooks
│ │ ├── Domain_Adapted_Glove.ipynb
│ │ ├── Parsing.ipynb
│ │ ├── alpha
│ │ │ ├── Get.ipynb
│ │ │ ├── Process.ipynb
│ │ │ └── README.md
│ │ ├── directed_query_gen_walkthrough.ipynb
│ │ ├── gan_training_illustration.ipynb
│ │ ├── key_triples_walkthrough.ipynb
│ │ ├── kg_predict_walkthrough.ipynb
│ │ ├── kg_query_walkthrough.ipynb
│ │ ├── precooked_replication.ipynb
│ │ ├── prepared_output.ipynb
│ │ └── pure_generation_walkthrough.ipynb
│ └── phase_two_developments.md
├── docker-compose.yml
├── get_kg_query_params.py
├── images
│ ├── KCCA_equation.png
│ ├── MULTIVAC_schematic.png
│ ├── aske_schematic_v1.5.png
│ ├── aske_schematic_v1.png
│ ├── emulated.png
│ ├── emulated_kg.png
│ ├── formula.png
│ ├── formula_dependencies.png
│ ├── gan.png
│ ├── gan_design.png
│ ├── key_triples.png
│ ├── krongen.png
│ ├── latex_parse_1.png
│ ├── latex_parse_2.png
│ ├── multivac_concept.png
│ ├── phase_one_system.png
│ ├── qgnet.png
│ ├── simple_kg.png
│ └── stanford_dependecies.png
├── multivac.cfg
├── predict_kg.py
├── pymln
│ ├── LICENSE
│ ├── README.md
│ ├── __init__.py
│ ├── eval
│ │ ├── Answer.py
│ │ ├── Question.py
│ │ ├── USP.py
│ │ └── __init__.py
│ ├── pymln.py
│ ├── semantic
│ │ ├── Agenda.py
│ │ ├── Argument.py
│ │ ├── Clust.py
│ │ ├── Executor.py
│ │ ├── MLN.py
│ │ ├── Parse.py
│ │ ├── ParseParams.py
│ │ ├── Part.py
│ │ ├── Scorer.py
│ │ ├── SearchOp.py
│ │ ├── __init__.py
│ │ └── argclust.py
│ ├── syntax
│ │ ├── Nodes
│ │ │ ├── Article.py
│ │ │ ├── Sentence.py
│ │ │ ├── Token.py
│ │ │ ├── TreeNode.py
│ │ │ └── __init__.py
│ │ ├── Relations
│ │ │ ├── ArgType.py
│ │ │ ├── Path.py
│ │ │ ├── RelType.py
│ │ │ └── __init__.py
│ │ ├── StanfordParseReader.py
│ │ └── __init__.py
│ └── utils
│ │ ├── Utils.py
│ │ └── __init__.py
├── requirements.txt
├── settings.py
├── src
│ ├── .gitkeep
│ ├── __init__.py
│ ├── data
│ │ ├── .gitkeep
│ │ ├── clean_documents.py
│ │ ├── clean_questions.py
│ │ ├── clean_text.py
│ │ ├── equationparsing.py
│ │ ├── extract_text.py
│ │ ├── get.py
│ │ ├── glove.py
│ │ ├── make.py
│ │ ├── parsing.py
│ │ ├── process.py
│ │ ├── qgnet.py
│ │ ├── textparsing.py
│ │ ├── trainEmbeddings.R
│ │ └── write_mln_to_graph_db.py
│ ├── gan
│ │ ├── __init__.py
│ │ ├── config.cfg
│ │ ├── discriminator
│ │ │ ├── __init__.py
│ │ │ ├── dataset.py
│ │ │ ├── model.py
│ │ │ ├── scripts
│ │ │ │ └── preprocess-multivac.py
│ │ │ ├── trainer.py
│ │ │ └── tree.py
│ │ ├── gen_pyt
│ │ │ ├── __init__.py
│ │ │ ├── asdl
│ │ │ │ ├── __init__.py
│ │ │ │ ├── asdl.py
│ │ │ │ ├── asdl_ast.py
│ │ │ │ ├── hypothesis.py
│ │ │ │ ├── lang
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── eng
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── eng_asdl_helper.py
│ │ │ │ │ │ ├── eng_transition_system.py
│ │ │ │ │ │ └── grammar.py
│ │ │ │ │ └── grammar.py
│ │ │ │ └── transition_system.py
│ │ │ ├── astnode.py
│ │ │ ├── components
│ │ │ │ ├── __init__.py
│ │ │ │ ├── action_info.py
│ │ │ │ ├── dataset.py
│ │ │ │ ├── decode_hypothesis.py
│ │ │ │ └── vocab.py
│ │ │ ├── datasets
│ │ │ │ ├── __init__.py
│ │ │ │ └── english
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── dataset.py
│ │ │ ├── model
│ │ │ │ ├── __init__.py
│ │ │ │ ├── attention_util.py
│ │ │ │ ├── lstm.py
│ │ │ │ ├── nn_utils.py
│ │ │ │ ├── parser.py
│ │ │ │ └── pointer_net.py
│ │ │ └── query_treebank.py
│ │ ├── gen_test.py
│ │ ├── querygan_pyt.py
│ │ └── utilities
│ │ │ ├── __init__.py
│ │ │ ├── rollout.py
│ │ │ ├── shuffled_queries.py
│ │ │ ├── tree_rollout.py
│ │ │ ├── utils.py
│ │ │ └── vocab.py
│ ├── link_prediction
│ │ └── MULTIVAC_link_prediction.py
│ ├── rdf_graph
│ │ ├── build_graph.py
│ │ ├── environment.yml
│ │ ├── map_queries.py
│ │ ├── rdf_extract.py
│ │ ├── rdf_graph.py
│ │ └── rdf_parse.py
│ └── utilities.py
├── stanford-corenlp-full
│ └── rdf_graph.properties
├── sys
│ └── .gitkeep
└── templates
  ├── base.html
  └── query.html
/.dockerignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | __pycache__
3 | .git
4 | .gitignore
5 | venv
6 | env
7 |
8 | docker-compose.yml
9 | Dockerfile
10 | .dockerignore
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # directories
2 | /data/
3 | /models/
4 |
5 | # pycharm files
6 | .idea/
7 |
8 | # scratch notebook directories
9 | scratch_notebooks/peter/
10 |
11 | # downloaded stanford nlp models
12 | stanford_nlp_models/
13 |
14 | # environment
15 | multivac/
16 | .env
17 | src/pubmed-parser
18 | src/slate
19 |
20 | # jupyter notebook
21 | .ipynb_checkpoints
22 |
23 | # Byte-compiled / optimized / DLL files
24 | __pycache__/
25 | *.py[cod]
26 | *$py.class
27 |
28 | # workspaces
29 | .code-workspace
30 |
31 | # system files
32 | .DS_Store
33 |
34 | # logs
35 | *.log
36 |
37 | # flat-files
38 | *.json
39 | *.xml
40 | *.csv
41 |
42 |
43 | # envs
44 | venv/
45 | env/
46 | virtualenv/
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # use ubuntu as the base image; install R and Python on top
2 | FROM ubuntu:latest
3 |
4 | # avoid interactive prompts (e.g., timezone/geography configuration) during package installation
5 | ENV DEBIAN_FRONTEND=noninteractive
6 |
7 | # install R and python
8 | RUN apt-get update && apt-get install -y --no-install-recommends build-essential r-base python3.7 python3-pip python3-setuptools python3-dev git
9 |
10 | # copy requirements over to application
11 | COPY requirements.txt /multivac/requirements.txt
12 |
13 | WORKDIR /multivac
14 |
15 | # set up bdist_wheel
16 | RUN pip3 install wheel --no-cache-dir
17 |
18 | RUN pip3 install setuptools --no-cache-dir
19 |
20 | # env setup
21 | RUN pip3 install torch==1.2.0 --no-cache-dir
22 | RUN pip3 install -r requirements.txt --no-cache-dir
23 |
24 | RUN git clone https://github.com/thunlp/OpenKE && cd OpenKE && git checkout master && sh make.sh
25 |
26 | COPY . /multivac
27 |
28 | ENV PYTHONPATH "${PYTHONPATH}:/"
29 |
30 | EXPOSE 5000
31 |
32 | CMD python3 app.py
33 |
34 |
35 | ### See the link below if issues arise with the OpenKE make.sh step (production image)
36 | # https://forums.docker.com/t/best-practices-for-git-clone-make-etc-via-dockerfile-run/79152/3
37 |
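38 | ### Manual build/run example. The supported workflow is docker-compose (see
39 | ### doc/installation.md); these commands are only an alternative for local debugging:
40 | #   docker build -t multivac .
41 | #   docker run -p 5000:5000 multivac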
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MULTIVAC
2 | DARPA’s Information Innovation Office’s Automating Scientific Knowledge Extraction (ASKE) program seeks to develop approaches to make it easier for scientists to build, maintain and reason over rich models of complex systems — which could include physical, biological, social, engineered or hybrid systems. By interpreting and exposing scientific knowledge and assumptions in existing model code and documentation, researchers can automatically identify new data and information resources, extract useful information from these sources, and integrate it into machine-curated expert models for robust modeling.
3 |
4 |
5 |
6 |
7 | Gallup’s Meta-model Unification Learned Through Inquiry Vectorization and Automated Comprehension (MULTIVAC) effort supports these goals by developing a system that absorbs scientific knowledge — in the form of facts, relationships, models and equations — from a particular domain corpus into a semantic knowledge graph and learns to query that knowledge graph in order to accelerate scientific exploration within the target domain. MULTIVAC consists of an expert query generator trained on a corpus of historical expert queries and tuned dialectically with the use of a Generative Adversarial Network (GAN) architecture. As a prototype system, MULTIVAC focuses on the domain of epidemiological research, and specifically the realm of SIR/SEIR (Susceptible-Infected-Recovered, often with an additional “Exposed” element) compartmental model approaches. It is Gallup’s intent that this system includes a “human-in-the-loop” element, especially during training, to ensure that the system is properly tuned and responsive to the needs and interests of the human researchers it is intended to augment.
8 |
9 | ## System Setup and Operation
10 | - MULTIVAC Installation
11 |
12 | ## System Documentation
13 | - Phase I Development
14 | - Phase II Developments
15 | - Key Innovations
16 |
17 | For more information please contact Principal Investigator, Benjamin Ryan (ben_ryan@gallup.com).
18 |
19 | ---
20 |
21 | ## Acknowledgements
22 | This work is supported by the Defense Advanced Research Projects Agency (DARPA) under Agreement No. HR00111990008.
23 |
24 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/__init__.py
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from flask import Flask, redirect, render_template, request, url_for
4 |
5 | from multivac.src.rdf_graph import map_queries
6 |
7 | app = Flask(__name__)
8 | app.debug = True
9 | app.config['STATIC_FOLDER'] = f'{os.getcwd()}/sys'
10 |
11 |
12 | @app.route('/')
13 | def query():
14 |
15 |     return render_template(
16 |         'query.html'
17 |     )
18 |
19 |
20 | @app.route('/results')
21 | def results():
22 |
23 |     if request.method == 'GET':
24 | 
25 |         in_dir = os.path.abspath(request.values.get('dir-input'))
26 |         out_dir = os.path.abspath(request.values.get('out-input'))
27 | 
28 |         # make sure these folders exist
29 |         assert os.path.exists(out_dir)
30 |         assert os.path.exists(in_dir)
31 | 
32 |         args_dict = {
33 |             'docker_folder_structure': [x for x in os.walk(os.getcwd())],
34 |             'dir': in_dir,
35 |             'model': request.values.get('model-type-input'),
36 |             'out': out_dir,
37 |             'run': request.values.get('run-input'),
38 |             'threshold': request.values.get('threshold-input'),
39 |             'verbose': request.values.get('verbosity-input'),
40 |             'num_top_rel': request.values.get('num-top-input'),
41 |             'search': request.values.get('search-input'),
42 |         }
43 | 
44 |         results = map_queries.run(args_dict)
45 | 
46 |         return args_dict
47 | 
48 |     else:
49 |         return redirect(url_for('query'))
50 |
51 |
52 | if __name__ == "__main__":
53 |     app.run(host="0.0.0.0", port=5000)
54 |
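55 | # Example request (hypothetical values): with the app running, a GET request such as
56 | #   /results?dir-input=data&out-input=models&model-type-input=transe&run-input=model&threshold-input=0.1&num-top-input=10
57 | # populates args_dict from the form fields above and passes it to map_queries.run().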
--------------------------------------------------------------------------------
/calculate_network_change.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | This script identifies relevant nodes based on differences in centrality
5 | measures between the real and estimated networks.
6 | """
7 | import argparse
8 | import json
9 | import networkx as nx
10 | import numpy as np
11 | import os
12 |
13 | from datetime import datetime
14 |
15 | from multivac.get_kg_query_params import build_network, read_txt
16 |
17 | def build_comparison_metrics(n1, n2, mtype):
18 |     if 'degree' in mtype:
19 |         n1x = nx.degree_centrality(n1)
20 |         n2x = nx.degree_centrality(n2)
21 |     else:
22 |         tol = 1.0e-6
23 | 
24 |         while True:
25 |             try:
26 |                 n1x = nx.eigenvector_centrality(n1, tol=tol)
27 |                 n2x = nx.eigenvector_centrality(n2, tol=tol)
28 |                 break
29 |             except:
30 |                 tol *= 10
31 |                 print("Increasing tolerance to {}".format(tol))
32 |                 continue
33 | 
34 |     net = {**n1x, **n2x}
35 |     for k, v in net.items():
36 |         if k in n1x and k in n2x:
37 |             net[k] = [n1x[k], v]
38 |         elif k in n1x and k not in n2x:
39 |             net[k] = [v, np.nan]
40 |         else:
41 |             net[k] = [np.nan, v]
42 | 
43 |     return net
44 |
45 |
46 | def generate_node_changes(net):
47 |     res = {}
48 |     for k, v in net.items():
49 |         pct_change = (net[k][1] - net[k][0]) / (net[k][0] + 1)
50 | 
51 |         if not np.isnan(pct_change):
52 |             res.update({k: pct_change})
53 | 
54 |     return res
55 |
56 |
57 | def generate_result_lists(net, num, ctype=['top', 'bottom']):
58 |     res = {}
59 |     if 'top' in ctype:
60 |         keys = list(net.keys())[-num:]
61 |     else:
62 |         keys = list(net.keys())[:num]
63 |     for key in keys:
64 |         res.update({key: net[key]})
65 | 
66 |     return res
67 |
68 | def get_items(fpath):
69 |     items = {}
70 | 
71 |     for item, idx in read_txt(fpath):
72 |         items[int(idx)] = item
73 | 
74 |     return items
75 |
76 | def triple_to_labels(triple, ents, rels):
77 |     head, tail, rel = triple
78 |     return " ".join([ents[head], rels[rel], ents[tail]])
79 |
80 |
81 | def get_top_triples(ofile, nfile, kg_dir, measure='eigenvector', num_results=100, out=None):
82 |     ents = get_items(os.path.join(kg_dir, 'entity2id.txt'))
83 |     rels = get_items(os.path.join(kg_dir, 'relation2id.txt'))
84 |     triples = read_txt(os.path.join(kg_dir, 'train2id.txt'))
85 |     triples = np.array(triples).astype(int)
86 | 
87 |     # read in new file for comparison
88 |     new = read_txt(nfile)
89 | 
90 |     # create networks
91 |     neto = build_network(triples)
92 |     netn = build_network(triples + new)
93 |     net = build_comparison_metrics(neto, netn, measure)
94 | 
95 |     # calculate node changes
96 |     result = generate_node_changes(net)
97 |     result = {k: v for k, v in sorted(result.items(),
98 |                                       key=lambda item: item[1])}
99 | 
100 |     # generate results of interest
101 |     gains = generate_result_lists(result, len(result), 'top')
102 | 
103 |     trip_scores = np.zeros(triples.shape[0])
104 | 
105 |     for i, trip in enumerate(triples):
106 |         headgain = tailgain = 0
107 |         head, tail, _ = trip
108 |         trip_scores[i] = gains.get(str(head), 0) + gains.get(str(tail), 0)
109 | 
110 |     idxs = trip_scores.argsort()[::-1]
111 |     top = triples[idxs,][:num_results,:]
112 | 
113 |     results = {}
114 | 
115 |     for i, t in enumerate(top):
116 |         triple_id = idxs[i]
117 |         h, t, r = t
118 |         score = trip_scores[triple_id]
119 | 
120 |         try:
121 |             label = " ".join([ents[h], rels[r], ents[t]])
122 |         except:
123 |             label = "missing RDF-triple"
124 | 
125 |         results[triple_id] = {'label': label, 'score': score}
126 | 
127 |     if out:
128 |         with open('{}/key_triples.json'.format(out), 'w') as f:
129 |             json.dump(results, f)
130 | 
131 |         return True
132 |     else:
133 |         return results
134 |
135 |
136 | def run(args_dict):
137 |     # read in files for comparison
138 |     orig = read_txt(args_dict['files'][0])
139 |     new = read_txt(args_dict['files'][1])
140 | 
141 |     # create networks
142 |     neto = build_network(orig)
143 |     netn = build_network(orig + new)
144 |     net = build_comparison_metrics(neto, netn, args_dict['measure'])
145 | 
146 |     # calculate node changes
147 |     result = generate_node_changes(net)
148 |     result = {k: v for k, v in sorted(result.items(),
149 |                                       key=lambda item: item[1])}
150 | 
151 |     # generate results of interest
152 |     top_gain = generate_result_lists(result, args_dict['num_results'], 'top')
153 |     top_loss = generate_result_lists(result, args_dict['num_results'], 'bottom')
154 | 
155 |     # dump results to disk
156 |     time = datetime.now().strftime('%d%b%Y-%H:%M:%S')
157 |     with open('{}/top_gains_{}.json'.format(args_dict['output'], time), 'w') as f:
158 |         json.dump(top_gain, f)
159 |     with open('{}/top_losses_{}.json'.format(args_dict['output'], time), 'w') as f:
160 |         json.dump(top_loss, f)
161 |
162 |
163 | if __name__ == '__main__':
164 |     parser = argparse.ArgumentParser(description='Calculate differences '
165 |                                      'between networks.')
166 |     parser.add_argument('-f', '--files', nargs=2, required=True, help='Two '
167 |                         'files -- the real network then estimated network -- '
168 |                         'over which to calculate differences.')
169 |     parser.add_argument('-m', '--measure', required=False,
170 |                         default='eigenvector', choices=['degree',
171 |                         'eigenvector'], help='Select which network centrality '
172 |                         'measure is required.')
173 |     parser.add_argument('-n', '--num_results', required=False, default=10,
174 |                         type=int, help='Number of results to return from '
175 |                         'centrality calculation.')
176 |     parser.add_argument('-o', '--output', required=True, help='Path to '
177 |                         'directory to write results to disk.')
178 |     args_dict = vars(parser.parse_args())
179 | 
180 |     run(args_dict)
181 |
182 |
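183 | # Example invocation (hypothetical file and directory names): compare the real
184 | # network against the estimated network and write the top/bottom 10 nodes by
185 | # eigenvector-centrality change to ./results as JSON:
186 | #   python3 calculate_network_change.py -f real_network.txt estimated_network.txt \
187 | #       -m eigenvector -n 10 -o ./results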
--------------------------------------------------------------------------------
/conductor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | """
4 | this script conducts the entire flow of the multivac system to date. it has the
5 | following flow:
6 | 1. collect data
7 | a. these data come from arxiv, springer, and pubmed in this instance, but
8 | could be modified to include more
9 | b. it saves the downloaded pdf's to a directory and creates a json object
10 | for further use
11 | 2. parse data
12 | a. the json objects that are saved from the collection step are processed
13 | for dependencies, input (word position), and morphology (lemma) [dim]
14 | b. it also identifies and notates equations throughout articles
15 | 3. run glove models
16 | a. take article collection that is parsed and create glove word embeddings
17 | b. develops both domain-general and domain-specific models
18 | 4. build the query generation (qg) network
19 | a. uses context/answers as inputs to create questions as output
20 | b. builds off of the domain-adapted glove models to produce robust
21 | questions around a topic of interest (in this case, epidemiology)
22 | 5. build markov logic network (mln)
23 | a. compile parsed dim files into trees and semantically cluster
24 | b. produce a graphical model based on first-order logic
25 | """
26 | import argparse
27 |
28 | from multivac.src.data.glove import glove_main
29 | from multivac.src.data.make import collect_main
30 | from multivac.src.data.parsing import nlp_parse_main
31 | from multivac.src.data.qgnet import qgnet_main
32 | from multivac.pymln.pymln import mln_main
33 |
34 |
35 | def conduct(args_dict):
36 |     # step 1: collect data
37 |     collect_main()
38 | 
39 |     # step 2: parse data
40 |     nlp_parse_main(args_dict)
41 | 
42 |     # step 3: run glove models
43 |     glove_main()
44 | 
45 |     # step 4: build qg network
46 |     qgnet_main(args_dict)
47 | 
48 |     # step 5: build mln
49 |     mln_main(args_dict)
50 |
51 |
52 | if __name__ == '__main__':
53 |     parser = argparse.ArgumentParser(description='Orchestrate pipeline for '
54 |                                      'MULTIVAC processing and modeling.')
55 |     parser.add_argument('-bp', '--nlp_bp', required=False, type=int,
56 |                         help='Which document to start parsing with.')
57 |     parser.add_argument('-js', '--nlp_newjson', action='store_true',
58 |                         help='Boolean; indicates whether to create new JSON '
59 |                         'file for glove embedding.')
60 |     parser.add_argument('-an', '--subset', type=int, help='Number of articles '
61 |                         'for MLN run.')
62 |     parser.add_argument('-pc', '--prior_num_conj', default=10, type=int,
63 |                         help='Prior on number of conjunctive parts assigned to '
64 |                         'same cluster in MLN.')
65 |     parser.add_argument('-pp', '--prior_num_param', default=5, type=int,
66 |                         help='Prior on number of parameters for cluster '
67 |                         'merges.')
68 |     parser.add_argument('-qp', '--qgnet_path', required=True, help='The '
69 |                         'top-level qgnet directory to create folders for '
70 |                         'models and data.')
71 |     parser.add_argument('-v', "--verbose", action='store_true', help='Give '
72 |                         'verbose output during MLN modeling.')
73 |     args_dict = vars(parser.parse_args())
74 | 
75 |     conduct(args_dict)
76 |
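77 | # Example invocation (hypothetical path): run the full pipeline with a fresh JSON
78 | # for the GloVe step, a 500-article subset for the MLN step, and verbose output:
79 | #   python3 conductor.py -qp /models/qgnet -js -an 500 -v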
--------------------------------------------------------------------------------
/doc/README.md:
--------------------------------------------------------------------------------
1 | # MULTIVAC Documentation and References
2 | This page serves as an index of system design, theory, and walk through documentation for Gallup’s Meta-model Unification Learned Through Inquiry Vectorization and Automated Comprehension (MULTIVAC). DARPA’s Information Innovation Office’s Automating Scientific Knowledge Extraction (ASKE) program seeks to develop approaches to make it easier for scientists to build, maintain and reason over rich models of complex systems — which could include physical, biological, social, engineered or hybrid systems. MULTIVAC supports these goals by developing a system that absorbs scientific knowledge — in the form of facts, relationships, models and equations — from a particular domain corpus into a Markov Logic Network (MLN) ontology and learns to query that ontology in order to accelerate scientific exploration within the target domain.
3 |
4 | ## Key Innovations
5 | - Key Innovations
6 |
7 | ## Phase II Developments
8 | - Phase II Developments
9 |
10 | ## Phase I Development - System Overview
11 | - System Walk-Through (Jupyter Notebook): Piece-by-Piece Execution
12 | - Markov Logic Network Induction: Construction of the knowledge graph representation in the form of a Markov Logic Network.
13 | - Query Mapping: Query Mapping Execution
14 | - Phase I Lessons Learned: Review of lessons learned from implementing Phase I systems.
15 |
16 | ## ASKE Community
17 | - MULTIVAC in the ASKE Context
18 | - Other ASKE repositories
19 | - ASKE official homepage
20 |
21 | ## Related Research and Resources
22 | - GANs for Text Generation: paperswithcode.com
23 | - Markov Logic Networks paperswithcode.com
24 |
25 | For more information please contact Principal Investigator, Benjamin Ryan (ben_ryan@gallup.com).
26 |
27 | ---
28 | ## Acknowledgements
29 | This work is supported by the Defense Advanced Research Projects Agency (DARPA) under Agreement No. HR00111990008.
30 |
31 |
--------------------------------------------------------------------------------
/doc/aske_context.md:
--------------------------------------------------------------------------------
1 | # MULTIVAC in the ASKE Context
2 | 
3 | 
4 |
5 | Gallup’s MULTIVAC system extracts scientific knowledge — in the form of facts, relationships, equations — from a given domain corpus consisting of natural language text and formal mathematical equations. The system then compiles this knowledge into a curated probabilistic graphical model (specifically, a Markov Logic Network) knowledgebase. Finally, the system learns to query that knowledge base in order to accelerate scientific exploration within the target domain.
6 |
7 | With reference to the first ASKE program schematic on the previous slide, MULTIVAC is more or less vertically integrated across the discovery/extraction, curation, and inference layers.
8 |
9 | The end objective, however, is hypothesis generation. This feature situates the most novel contribution of MULTIVAC essentially outside these levels, at the top of the more process-oriented second schematic on the previous slide. In effect, MULTIVAC’s “inference” component inverts the standard intention and, instead of using the work done in the extraction and curation layers to arrive at new inferences, learns through observation and experimentation how to ask its own novel questions that then require more standard inference solutions to answer. Other projects in the program have presented innovative ways of automating or enhancing execution of human inquiries. Our system seeks to automate the production and evolution of those queries in the first place.
10 |
11 | The final goal of a MULTIVAC system for any given domain is to generate new scientific queries relevant to that domain that have not been asked before by humans. These inquiries, properly formatted, could in theory even act as inputs to many of the other TA2 systems.
12 |
13 | ### Wait, but Why?
14 | - The glacial pace of evolution in paradigms and modes of inquiry within domains.
15 | - Stove-pipes within and between domains of scientific inquiry
16 |
17 |
18 | ## ASKE Potential Use Cases
19 | ### Modernizing and consolidating old research:
20 | - While much research is available in digital form today, vast archives exist in hard copy in various forms that are far less searchable. Using an ASKE system to ingest and compile/curate these types of repositories could help revitalize forgotten areas of research.
21 |
22 | ### Breaking stovepipes:
23 | - Sometimes research fields become balkanized between different communities based on approaches, terminologies, or simply favored publication venues. An ASKE system that can comprehend a field at scale across these artificial segmentations could help break irrational logjams and cross-pollinate discoveries.
24 |
25 | ### Revitalizing stagnant areas of research:
26 | - Occasionally research fields lose momentum or interest, as consensus emerges on “big questions” or as unknowns become more apparently “unknowable.” Paradigm shifts can happen that help break this stagnation and revolutionize fields, but this can take a great deal of time and is never guaranteed. A system that can analyze a field of research and produce novel questions or avenues of inquiry can help inject new creativity and perspectives and revitalize research.
27 |
--------------------------------------------------------------------------------
/doc/installation.md:
--------------------------------------------------------------------------------
1 | # MULTIVAC Installation Guide
2 |
3 | ### Installation Requirements
4 | MULTIVAC can be most easily and cleanly installed using `docker`. To use this method, Docker Desktop is required for launching the system on your local machine. Docker Desktop can be set up easily for either Mac or Windows machines with resources found at the following links:
5 | * For Mac users: https://docs.docker.com/docker-for-mac/install/
6 | * For Windows users: https://docs.docker.com/docker-for-windows/install/
7 |
8 | MULTIVAC makes use of multiple linked docker containers, so along with Docker Desktop, users will need to have set up `docker-compose`. Mac, Windows, and Linux instructions for installation can be found here:
9 | * Docker Compose: https://docs.docker.com/compose/install/
10 |
11 | ### Downloading and Deploying MULTIVAC
12 | The first step is to clone this MULTIVAC repository from GitHub. With Git also locally installed:
13 | * Run the following command in your preferred directory: `git clone https://github.com/GallupGovt/multivac.git`
14 | * Next, change into the MULTIVAC directory you just created and run: `docker-compose up`
15 |
16 | This command will download and build the resources MULTIVAC depends on: Stanford CoreNLP, Grobid Publication Parsing, and Jupyter Notebook Viewer, as well as the core MULTIVAC system itself. This process will take some time on first use, and require well over 10 GB of hard drive space, so please plan accordingly.
17 |
18 | ### Basic Operations
19 | In order to see the running processes under Docker, you can use the `docker ps` command. You should see a running container named *multivac_multivac:latest*. This is the root source of our project. To interact with our code and system, you may use `docker exec -it {container-of-multivac-id} {command}` (e.g. `docker exec -it abd35789sbd2 python3 querygan_pyt.py --cuda`). You can also access our web application through port 5000 of your machine, i.e. http://0.0.0.0:5000 or http://your.ip.add:5000 if on a VM.
20 |
21 | To run any docker commands in the background, add the flag `-d` to your command. Once the system is built, you can always start and stop it with the commands `docker-compose start` and `docker-compose stop`.
22 |
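23 | In summary, a typical first-time setup looks like the following (the container ID is a placeholder; substitute the ID reported by `docker ps`):
24 | 
25 | ```bash
26 | git clone https://github.com/GallupGovt/multivac.git
27 | cd multivac
28 | docker-compose up -d        # build and start the linked containers in the background
29 | docker ps                   # note the ID of the multivac container
30 | docker exec -it <container-id> python3 querygan_pyt.py --cuda
31 | ```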
--------------------------------------------------------------------------------
/doc/notebooks/alpha/README.md:
--------------------------------------------------------------------------------
1 | # Working Files
2 | NOTE: The files and code in this directory and sub-directories are deprecated, work-in-progress, or both. This code is not intended to work.
--------------------------------------------------------------------------------
/doc/notebooks/directed_query_gen_walkthrough.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Walk Through for Directed Query Generation\n",
8 | "This notebook outlines the process of generating novel questions based on a user's seed topic using MULTIVAC's semantic knowledge graph and trained query generator. \n",
9 | "First, we set up the required imports and arguments for the test. "
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": null,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "from multivac.src.rdf_graph.map_queries import *\n",
19 | "tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)\n",
20 | "from multivac.src.gan.gen_test import run\n",
21 | "os.chdir('src/gan')"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "args_dict = {'dir': os.path.abspath('../../data'),\n",
31 | " 'out': os.path.abspath('../../models'),\n",
32 | " 'glove': '../../models/glove.42B.300d',\n",
33 | " 'run': 'model',\n",
34 | " 'model': 'transe',\n",
35 | " 'threshold': 0.1,\n",
36 | " 'num_top_rel': 10}\n"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "Next, we load up the knowledge graph embedding model previously calculated. This embedding model allows us to assign probabilities to missing nodes or relationships in the knowledge graph proposed via submitted queries. Here we are using TransE, an approach which models relationships by interpreting them as translations operating on the low-dimensional embeddings of entities."
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "con = config.Config()\n",
53 | "con.set_in_path(args_dict['dir']+os.path.sep)\n",
54 | "con.set_work_threads(8)\n",
55 | "con.set_dimension(100)\n",
56 | "con.set_test_link_prediction(True)\n",
57 | "con.set_test_triple_classification(True)\n",
58 | "\n",
59 | "files = glob.glob(os.path.join(args_dict['out'],'*tf*'))\n",
60 | "times = list(set([file.split('.')[2] for file in files]))\n",
61 | "ifile = max([datetime.strptime(x, '%d%b%Y-%H:%M:%S') for x in times]).strftime('%d%b%Y-%H:%M:%S')\n",
62 | "con.set_import_files(os.path.join(args_dict['out'], 'model.vec.{}.tf'.format(ifile)))\n",
63 | "\n",
64 | "con.init()\n",
65 | "kem = set_model_choice(args_dict['model'])\n",
66 | "con.set_model(kem)\n",
67 | "\n",
68 | "\n",
69 | "files = [x for x in os.listdir(con.in_path) if '2id' in x]\n",
70 | "rel_file = get_newest_file(con.in_path, files, 'relation')\n",
71 | "ent_file = get_newest_file(con.in_path, files, 'entity')\n",
72 | "trn_file = get_newest_file(con.in_path, files, 'train')\n",
73 | "\n",
74 | "entities = pd.read_csv(ent_file, sep='\\t', \n",
75 | " names=[\"Ent\",\"Id\"], skiprows=1)\n",
76 | "relations = pd.read_csv(rel_file, sep='\\t', \n",
77 | " names=[\"Rel\",\"Id\"], skiprows=1)\n",
78 | "train = pd.read_csv(trn_file, sep='\\t', \n",
79 | " names=[\"Head\",\"Tail\",\"Relation\"], skiprows=1)"
80 | ]
81 | },
82 | {
83 | "cell_type": "markdown",
84 | "metadata": {},
85 | "source": [
86 | "We then set up a GloVe embedding model. Here we use the large scale, pre-trained GloVe embedding model given the open domain nature of potential submitted questions."
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "glove_vocab, glove_emb = load_word_vectors(args_dict['glove'])\n"
96 | ]
97 | },
98 | {
99 | "cell_type": "markdown",
100 | "metadata": {},
101 | "source": [
102 | "Finally, we input our seed topic and extract the knowledge graph elements and predicted elements most related to that topic. The system identifies all triples containing the topic or closely semantically related to it, and returns the top `num_top_rel` results (by default, 10)."
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": null,
108 | "metadata": {},
109 | "outputs": [],
110 | "source": [
111 | "sample_topic = 'avian flu'"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "results = predict_object(con, sample_topic, relations, entities, train, glove_vocab, glove_emb, exact=False)"
121 | ]
122 | },
123 | {
124 | "cell_type": "markdown",
125 | "metadata": {},
126 | "source": [
127 | "These results are then fed to the query generator, which produces questions in response to each topic. The `run()` function called below does two main things: 1) submit the \"query\" triples to the Generator system to be parsed into a tree object representing the constituency parse of an English language question, and 2) translate that parse into the surface text for presentation:\n",
128 | "```python\n",
129 | " results = netG.parse(query, beam_size=netG.args['beam_size'])\n",
130 | " texts = [asdl_ast_to_english(x.tree) for x in results]\n",
131 | "\n",
132 | " return texts\n",
133 | "```"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "metadata": {},
140 | "outputs": [],
141 | "source": [
142 | "questions = results.Text.apply(lambda x: run({'query': list(x), \n",
143 | " 'model': os.path.join(args_dict['out'], 'gen_checkpoint.pth')}))"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": null,
149 | "metadata": {},
150 | "outputs": [],
151 | "source": [
152 | "questions.values"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": null,
158 | "metadata": {},
159 | "outputs": [],
160 | "source": []
161 | }
162 | ],
163 | "metadata": {
164 | "kernelspec": {
165 | "display_name": "Python 3",
166 | "language": "python",
167 | "name": "python3"
168 | },
169 | "language_info": {
170 | "codemirror_mode": {
171 | "name": "ipython",
172 | "version": 3
173 | },
174 | "file_extension": ".py",
175 | "mimetype": "text/x-python",
176 | "name": "python",
177 | "nbconvert_exporter": "python",
178 | "pygments_lexer": "ipython3",
179 | "version": "3.7.5"
180 | }
181 | },
182 | "nbformat": 4,
183 | "nbformat_minor": 2
184 | }
185 |
--------------------------------------------------------------------------------
/doc/notebooks/gan_training_illustration.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Walk Through for GAN Training\n",
8 | "This notebook illustrates the training of MULTIVAC's generative adversarial network system for query generation.\n",
9 | "First, we set up the required imports and arguments for the test. This process can be performed all at once from the commandline as well: