├── .dockerignore ├── .gitignore ├── Dockerfile ├── README.md ├── __init__.py ├── app.py ├── calculate_network_change.py ├── conductor.py ├── doc ├── README.md ├── aske_context.md ├── innovations.md ├── installation.md ├── lessons_learned.md ├── notebooks │ ├── Domain_Adapted_Glove.ipynb │ ├── Parsing.ipynb │ ├── alpha │ │ ├── Get.ipynb │ │ ├── Process.ipynb │ │ └── README.md │ ├── directed_query_gen_walkthrough.ipynb │ ├── gan_training_illustration.ipynb │ ├── key_triples_walkthrough.ipynb │ ├── kg_predict_walkthrough.ipynb │ ├── kg_query_walkthrough.ipynb │ ├── precooked_replication.ipynb │ ├── prepared_output.ipynb │ └── pure_generation_walkthrough.ipynb └── phase_two_developments.md ├── docker-compose.yml ├── get_kg_query_params.py ├── images ├── KCCA_equation.png ├── MULTIVAC_schematic.png ├── aske_schematic_v1.5.png ├── aske_schematic_v1.png ├── emulated.png ├── emulated_kg.png ├── formula.png ├── formula_dependencies.png ├── gan.png ├── gan_design.png ├── key_triples.png ├── krongen.png ├── latex_parse_1.png ├── latex_parse_2.png ├── multivac_concept.png ├── phase_one_system.png ├── qgnet.png ├── simple_kg.png └── stanford_dependecies.png ├── multivac.cfg ├── predict_kg.py ├── pymln ├── LICENSE ├── README.md ├── __init__.py ├── eval │ ├── Answer.py │ ├── Question.py │ ├── USP.py │ └── __init__.py ├── pymln.py ├── semantic │ ├── Agenda.py │ ├── Argument.py │ ├── Clust.py │ ├── Executor.py │ ├── MLN.py │ ├── Parse.py │ ├── ParseParams.py │ ├── Part.py │ ├── Scorer.py │ ├── SearchOp.py │ ├── __init__.py │ └── argclust.py ├── syntax │ ├── Nodes │ │ ├── Article.py │ │ ├── Sentence.py │ │ ├── Token.py │ │ ├── TreeNode.py │ │ └── __init__.py │ ├── Relations │ │ ├── ArgType.py │ │ ├── Path.py │ │ ├── RelType.py │ │ └── __init__.py │ ├── StanfordParseReader.py │ └── __init__.py └── utils │ ├── Utils.py │ └── __init__.py ├── requirements.txt ├── settings.py ├── src ├── .gitkeep ├── __init__.py ├── data │ ├── .gitkeep │ ├── clean_documents.py │ ├── clean_questions.py │ ├── clean_text.py │ ├── equationparsing.py │ ├── extract_text.py │ ├── get.py │ ├── glove.py │ ├── make.py │ ├── parsing.py │ ├── process.py │ ├── qgnet.py │ ├── textparsing.py │ ├── trainEmbeddings.R │ └── write_mln_to_graph_db.py ├── gan │ ├── __init__.py │ ├── config.cfg │ ├── discriminator │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── model.py │ │ ├── scripts │ │ │ └── preprocess-multivac.py │ │ ├── trainer.py │ │ └── tree.py │ ├── gen_pyt │ │ ├── __init__.py │ │ ├── asdl │ │ │ ├── __init__.py │ │ │ ├── asdl.py │ │ │ ├── asdl_ast.py │ │ │ ├── hypothesis.py │ │ │ ├── lang │ │ │ │ ├── __init__.py │ │ │ │ ├── eng │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── eng_asdl_helper.py │ │ │ │ │ ├── eng_transition_system.py │ │ │ │ │ └── grammar.py │ │ │ │ └── grammar.py │ │ │ └── transition_system.py │ │ ├── astnode.py │ │ ├── components │ │ │ ├── __init__.py │ │ │ ├── action_info.py │ │ │ ├── dataset.py │ │ │ ├── decode_hypothesis.py │ │ │ └── vocab.py │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ └── english │ │ │ │ ├── __init__.py │ │ │ │ └── dataset.py │ │ ├── model │ │ │ ├── __init__.py │ │ │ ├── attention_util.py │ │ │ ├── lstm.py │ │ │ ├── nn_utils.py │ │ │ ├── parser.py │ │ │ └── pointer_net.py │ │ └── query_treebank.py │ ├── gen_test.py │ ├── querygan_pyt.py │ └── utilities │ │ ├── __init__.py │ │ ├── rollout.py │ │ ├── shuffled_queries.py │ │ ├── tree_rollout.py │ │ ├── utils.py │ │ └── vocab.py ├── link_prediction │ └── MULTIVAC_link_prediction.py ├── rdf_graph │ ├── build_graph.py │ ├── environment.yml │ ├── map_queries.py │ ├── 
rdf_extract.py │ ├── rdf_graph.py │ └── rdf_parse.py └── utilities.py ├── stanford-corenlp-full └── rdf_graph.properties ├── sys └── .gitkeep └── templates ├── base.html └── query.html /.dockerignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | __pycache__ 3 | .git 4 | .gitignore 5 | venv 6 | env 7 | 8 | docker-compose.yml 9 | Dockerfile 10 | .dockerignore -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # directories 2 | /data/ 3 | /models/ 4 | 5 | # pycharm files 6 | .idea/ 7 | 8 | # scratch notebook directories 9 | scratch_notebooks/peter/ 10 | 11 | # downloaded stanford nlp models 12 | stanford_nlp_models/ 13 | 14 | # environment 15 | multivac/ 16 | .env 17 | src/pubmed-parser 18 | src/slate 19 | 20 | # juypter notebook 21 | .ipynb_checkpoints 22 | 23 | # Byte-compiled / optimized / DLL files 24 | __pycache__/ 25 | *.py[cod] 26 | *$py.class 27 | 28 | # workspaces 29 | .code-workspace 30 | 31 | # system files 32 | .DS_Store 33 | 34 | # logs 35 | *.log 36 | 37 | # flat-files 38 | *.json 39 | *.xml 40 | *.csv 41 | 42 | 43 | # envs 44 | venv/ 45 | env/ 46 | virtualenv/ -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # use ubuntu as the base image; install R and Python on top 2 | FROM ubuntu:latest 3 | 4 | # avoid humna input for geography and stuff 5 | ENV DEBIAN_FRONTEND=noninteractive 6 | 7 | # install R and python 8 | RUN apt-get update && apt-get install -y --no-install-recommends build-essential r-base python3.7 python3-pip python3-setuptools python3-dev git 9 | 10 | # copy requirements over to application 11 | COPY requirements.txt /multivac/requirements.txt 12 | 13 | WORKDIR /multivac 14 | 15 | # set up bdist_wheel 16 | RUN pip3 install wheel --no-cache-dir 17 | 18 | RUN pip3 install setuptools --no-cache-dir 19 | 20 | # env setup 21 | RUN pip3 install torch==1.2.0 --no-cache-dir 22 | RUN pip3 install -r requirements.txt --no-cache-dir 23 | 24 | RUN git clone https://github.com/thunlp/OpenKE && cd OpenKE && git checkout master && sh make.sh 25 | 26 | COPY . /multivac 27 | 28 | ENV PYTHONPATH "${PYTHONPATH}:/" 29 | 30 | EXPOSE 5000 31 | 32 | CMD python3 app.py 33 | 34 | 35 | ### Look into this if issues with OpenKE sh (production image) 36 | # https://forums.docker.com/t/best-practices-for-git-clone-make-etc-via-dockerfile-run/79152/3 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MULTIVAC 2 | DARPA’s Information Innovation Office’s Automating Scientific Knowledge Extraction (ASKE) program seeks to develop approaches to make it easier for scientists to build, maintain and reason over rich models of complex systems — which could include physical, biological, social, engineered or hybrid systems. By interpreting and exposing scientific knowledge and assumptions in existing model code and documentation, researchers can identify new data and information resources automatically, extracting useful information from these sources, and integrating this useful information into machine-curated expert models for robust modeling. 3 | 4 | MULTIVAC Schematic 5 | GAN Schematic 6 | 7 |
Gallup’s Meta-model Unification Learned Through Inquiry Vectorization and Automated Comprehension (MULTIVAC) effort supports these goals by developing a system that absorbs scientific knowledge — in the form of facts, relationships, models and equations — from a particular domain corpus into a semantic knowledge graph and learns to query that knowledge graph in order to accelerate scientific exploration within the target domain. MULTIVAC consists of an expert query generator trained on a corpus of historical expert queries and tuned dialectically with the use of a Generative Adversarial Network (GAN) architecture. As a prototype system, MULTIVAC focuses on the domain of epidemiological research, and specifically the realm of SIR/SEIR (Susceptible-Infected-Recovered, often with an additional “Exposed” element) compartmental model approaches. It is Gallup’s intent that this system includes a “human-in-the-loop” element, especially during training, to ensure that the system is properly tuned and responsive to the needs and interests of the human researchers it is intended to augment. 8 | 9 | ## System Setup and Operation 10 | - MULTIVAC Installation 11 | 12 | ## System Documentation 13 | - Phase I Development 14 | - Phase II Developments 15 | - Key Innovations 16 | 17 | For more information please contact Principal Investigator, Benjamin Ryan (ben_ryan@gallup.com). 18 | 19 | --- 20 | 21 | ## Acknowledgements 22 | This work is supported by the Defense Advanced Research Projects Agency (DARPA) under Agreement No. HR00111990008. 23 | 24 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/__init__.py -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from flask import Flask, redirect, render_template, request, url_for 4 | 5 | from multivac.src.rdf_graph import map_queries 6 | 7 | app = Flask(__name__) 8 | app.debug = True 9 | app.config['STATIC_FOLDER'] = f'{os.getcwd()}/sys' 10 | 11 | 12 | @app.route('/') 13 | def query(): 14 | 15 | return render_template( 16 | 'query.html' 17 | ) 18 | 19 | 20 | @app.route('/results') 21 | def results(): 22 | 23 | if request.method == 'GET': 24 | 25 | in_dir = os.path.abspath(request.values.get('dir-input')) 26 | out_dir = os.path.abspath(request.values.get('out-input')) 27 | 28 | # make sure these folders exist 29 | assert os.path.exists(out_dir) 30 | assert os.path.exists(in_dir) 31 | 32 | args_dict = { 33 | 'docker_folder_structure': [x for x in os.walk(os.getcwd())], 34 | 'dir': in_dir, 35 | 'model': request.values.get('model-type-input'), 36 | 'out': out_dir, 37 | 'run': request.values.get('run-input'), 38 | 'threshold': request.values.get('threshold-input'), 39 | 'verbose': request.values.get('verbosity-input'), 40 | 'num_top_rel': request.values.get('num-top-input'), 41 | 'search': request.values.get('search-input'), 42 | } 43 | 44 | results = map_queries.run(args_dict) 45 | 46 | return args_dict 47 | 48 | else: 49 | return redirect(url_for('query')) 50 | 51 | 52 | if __name__ == "__main__": 53 | app.run(host="0.0.0.0", port=5000) 54 | -------------------------------------------------------------------------------- /calculate_network_change.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This script is meant to identify relevant nodes based on differences of 5 | centrality measure of real and estimated networks. 6 | """ 7 | import argparse 8 | import json 9 | import networkx as nx 10 | import numpy as np 11 | import os 12 | 13 | from datetime import datetime 14 | 15 | from multivac.get_kg_query_params import build_network, read_txt 16 | 17 | def build_comparison_metrics(n1, n2, mtype): 18 | if 'degree' in mtype: 19 | n1x = nx.degree_centrality(n1) 20 | n2x = nx.degree_centrality(n2) 21 | else: 22 | tol = 1.0e-6 23 | 24 | while True: 25 | try: 26 | n1x = nx.eigenvector_centrality(n1, tol=tol) 27 | n2x = nx.eigenvector_centrality(n2, tol=tol) 28 | break 29 | except: 30 | tol *= 10 31 | print("Increasing tolerance to {}".format(tol)) 32 | continue 33 | 34 | net = {**n1x, **n2x} 35 | for k, v in net.items(): 36 | if k in n1x and k in n2x: 37 | net[k] = [n1x[k], v] 38 | elif k in n1x and k not in n2x: 39 | net[k] = [v, np.nan] 40 | else: 41 | net[k] = [np.nan, v] 42 | 43 | return net 44 | 45 | 46 | def generate_node_changes(net): 47 | res = {} 48 | for k, v in net.items(): 49 | pct_change = (net[k][1] - net[k][0]) / (net[k][0] + 1) 50 | 51 | if not np.isnan(pct_change): 52 | res.update({k: pct_change}) 53 | 54 | return res 55 | 56 | 57 | def generate_result_lists(net, num, ctype=['top', 'bottom']): 58 | res = {} 59 | if 'top' in ctype: 60 | keys = list(net.keys())[-num:] 61 | else: 62 | keys = list(net.keys())[:num] 63 | for key in keys: 64 | res.update({key: net[key]}) 65 | 66 | return res 67 | 68 | def get_items(fpath): 69 | items = {} 70 | 71 | for item, idx in read_txt(fpath): 72 | items[int(idx)] = item 73 | 74 | return items 75 | 76 | def triple_to_labels(triple, ents, rels): 77 | head, tail, rel = trip 78 | return " ".join([ents[head], rels[rel], ents[tail]]) 79 | 80 | 81 | def get_top_triples(ofile, nfile, kg_dir, measure='eigenvector', num_results=100, out=None): 82 | ents = get_items(os.path.join(kg_dir, 'entity2id.txt')) 83 | rels = get_items(os.path.join(kg_dir, 'relation2id.txt')) 84 | triples = read_txt(os.path.join(kg_dir, 'train2id.txt')) 85 | triples = np.array(triples).astype(int) 86 | 87 | # read in new file for comparison 88 | new = read_txt(nfile) 89 | 90 | # create networks 91 | neto = build_network(triples) 92 | netn = build_network(triples + new) 93 | net = build_comparison_metrics(neto, netn, measure) 94 | 95 | # calculate node changes 96 | result = generate_node_changes(net) 97 | result = {k: v for k, v in sorted(result.items(), 98 | key=lambda item: item[1])} 99 | 100 | # generate results of interest 101 | gains = generate_result_lists(result, len(result), 'top') 102 | 103 | trip_scores = np.zeros(triples.shape[0]) 104 | 105 | for i, trip in enumerate(triples): 106 | headgain = tailgain = 0 107 | head, tail, _ = trip 108 | trip_scores[i] = gains.get(str(head), 0) + gains.get(str(tail), 0) 109 | 110 | idxs = trip_scores.argsort()[::-1] 111 | top = triples[idxs,][:num_results,:] 112 | 113 | results = {} 114 | 115 | for i, t in enumerate(top): 116 | triple_id = idxs[i] 117 | h, t, r = t 118 | score = trip_scores[triple_id] 119 | 120 | try: 121 | label = " ".join([ents[h], rels[r], ents[t]]) 122 | except: 123 | label = "missing RDF-triple" 124 | 125 | results[triple_id] = {'label': label, 'score': score} 126 | 127 | if out: 128 | with open('{}/key_triples.json'.format(out), 'w') as f: 129 | json.dump(results, 
f) 130 | 131 | return True 132 | else: 133 | return results 134 | 135 | 136 | def run(args_dict): 137 | # read in files for comparison 138 | orig = read_txt(args_dict['files'][0]) 139 | new = read_txt(args_dict['files'][1]) 140 | 141 | # create networks 142 | neto = build_network(orig) 143 | netn = build_network(orig + new) 144 | net = build_comparison_metrics(neto, netn, args_dict['measure']) 145 | 146 | # calculate node changes 147 | result = generate_node_changes(net) 148 | result = {k: v for k, v in sorted(result.items(), 149 | key=lambda item: item[1])} 150 | 151 | # generate results of interest 152 | top_gain = generate_result_lists(result, args_dict['num_results'], 'top') 153 | top_loss = generate_result_lists(result, args_dict['num_results'], 'bottom') 154 | 155 | # dump results to disk 156 | time = datetime.now().strftime('%d%b%Y-%H:%M:%S') 157 | with open('{}/top_gains_{}.json'.format(args_dict['output'], time), 'w') as f: 158 | json.dump(top_gain, f) 159 | with open('{}/top_losses_{}.json'.format(args_dict['output'], time), 'w') as f: 160 | json.dump(top_loss, f) 161 | 162 | 163 | if __name__ == '__main__': 164 | parser = argparse.ArgumentParser(description='Calculate differences ' 165 | 'between networks.') 166 | parser.add_argument('-f', '--files', nargs=2, required=True, help='Two ' 167 | 'files -- the real network then estimated network -- ' 168 | 'over which to calculate differences.') 169 | parser.add_argument('-m', '--measure', required=False, 170 | default='eigenvector', choices=['degree', 171 | 'eigenvector'], help='Select which network centrality ' 172 | 'measure is required.') 173 | parser.add_argument('-n', '--num_results', required=False, default=10, 174 | type=int, help='Number of results to return from ' 175 | 'centrality calculation.') 176 | parser.add_argument('-o', '--output', required=True, help='Path to ' 177 | 'directory to write results to disk.') 178 | args_dict = vars(parser.parse_args()) 179 | 180 | run(args_dict) 181 | 182 | -------------------------------------------------------------------------------- /conductor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | this script conducts the entire flow of the multivac system to date. it has the 5 | following flow: 6 | 1. collect data 7 | a. these data come from arxiv, springer, and pubmed in this instance, but 8 | could be modified to include more 9 | b. it saves the downloaded pdf's to a directory and creates a json object 10 | for further use 11 | 2. parse data 12 | a. the json objects that are saved from the collection step are processed 13 | for dependencies, input (word position), and morphology (lemma) [dim] 14 | b. it also identifies and notates equations throughout articles 15 | 3. run glove models 16 | a. take article collection that is parsed and create glove word embeddings 17 | b. develops both domain-general and domain-specific models 18 | 4. build the query generation (qg) network 19 | a. uses context/answers as inputs to create questions as output 20 | b. builds off of the domain-adapted glove models to produces robust 21 | questions around a topic of interest (in this case, epidemiology) 22 | 5. build markov logic network (mln) 23 | a. compile parsed dim files into trees and semantically cluster 24 | b. 
produce a graphical model based on first-order logic for 25 | """ 26 | import argparse 27 | 28 | from multivac.src.data.glove import glove_main 29 | from multivac.src.data.make import collect_main 30 | from multivac.src.data.parsing import nlp_parse_main 31 | from multivac.src.data.qgnet import qgnet_main 32 | from multivac.pymln.pymln import mln_main 33 | 34 | 35 | def conduct(args_dict): 36 | # step 1: collect data 37 | collect_main() 38 | 39 | # step 2: 40 | nlp_parse_main(args_dict) 41 | 42 | # step 3: run glove models 43 | glove_main() 44 | 45 | # step 4: build qg network 46 | qgnet_main(args_dict) 47 | 48 | # step 5: build mln 49 | mln_main(args_dict) 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser(description='Orchestrate pipeline for ' 54 | 'MULTIVAC processing and modeling.') 55 | parser.add_argument('-bp', '--nlp_bp', required=False, type=int, 56 | help='Which document to start parsing with.') 57 | parser.add_argument('-js', '--nlp_newjson', action='store_true', 58 | help='Boolean; indicates whether to create new JSON ' 59 | 'file for glove embedding.') 60 | parser.add_argument('-an', '--subset', type=int, help='Number of articles ' 61 | 'for MLN run.') 62 | parser.add_argument('-pc', '--prior_num_conj', default=10, type=int, 63 | help='Prior on number of conjunctive parts assigned to ' 64 | 'same cluster in MLN.') 65 | parser.add_argument('-pp', '--prior_num_param', default=5, type=int, 66 | help='Prior on number of parameters for cluster ' 67 | 'merges.') 68 | parser.add_argument('-qp', '--qgnet_path', required=True, help='The ' 69 | 'top-level qgnet directory to create folders for ' 70 | 'models and data.') 71 | parser.add_argument('-v', "--verbose", action='store_true', help='Give ' 72 | 'verbose output during MLN modeling.') 73 | args_dict = vars(parser.parse_args()) 74 | 75 | conduct(args_dict) 76 | -------------------------------------------------------------------------------- /doc/README.md: -------------------------------------------------------------------------------- 1 | # MULTIVAC Documentation and References 2 | This page serves as an index of system design, theory, and walk through documentation for Gallup’s Meta-model Unification Learned Through Inquiry Vectorization and Automated Comprehension (MULTIVAC). DARPA’s Information Innovation Office’s Automating Scientific Knowledge Extraction (ASKE) program seeks to develop approaches to make it easier for scientists to build, maintain and reason over rich models of complex systems — which could include physical, biological, social, engineered or hybrid systems. MULTIVAC supports these goals by developing a system that absorbs scientific knowledge — in the form of facts, relationships, models and equations — from a particular domain corpus into a Markov Logic Network (MLN) ontology and learns to query that ontology in order to accelerate scientific exploration within the target domain. 3 | 4 | ## Key Innovations 5 | - Key Innovations 6 | 7 | ## Phase II Developments 8 | - Phase II Developments 9 | 10 | ## Phase I Development - System Overview 11 | - System Walk-Through (Jupyter Notebook): Piece-by-Piece Execution 12 | - Markov Logic Network Induction: Construction of the knowledge graph representation in the form of a Markov Logic Network. 13 | - Query Mapping: Query Mapping Execution 14 | - Phase I Lessons Learned: Review of lessons learned from implementing Phase I systems. 
15 | 16 | ## ASKE Community 17 | - MULTIVAC in the ASKE Context 18 | - Other ASKE repositories 19 | - ASKE official homepage 20 | 21 | ## Related Research and Resources 22 | - GANs for Text Generation: paperswithcode.com 23 | - Markov Logic Networks paperswithcode.com 24 | 25 | For more information please contact Principal Investigator, Benjamin Ryan (ben_ryan@gallup.com). 26 | 27 | --- 28 | ## Acknowledgements 29 | This work is supported by the Defense Advanced Research Projects Agency (DARPA) under Agreement No. HR00111990008. 30 | 31 | -------------------------------------------------------------------------------- /doc/aske_context.md: -------------------------------------------------------------------------------- 1 | # MULTIVAC in the ASKE Context 2 | ![ASKE Schematic v1.0](https://github.com/GallupGovt/multivac/blob/master/images/aske_schematic_v1.png) 3 | ![ASKE Schematic v1.5](https://github.com/GallupGovt/multivac/blob/master/images/aske_schematic_v1.5.png) 4 | 5 | Gallup’s MULTIVAC system extracts scientific knowledge — in the form of facts, relationships, equations — from a given domain corpus consisting of natural language text and formal mathematical equations. The system then compiles this knowledge into a curated probabilistic graphical model (specifically, a Markov Logic Network) knowledgebase. Finally, the system learns to query that knowledge base in order to accelerate scientific exploration within the target domain. 6 | 7 | With reference to the first ASKE program schematic on the previous slide, MULTIVAC is more or less vertically integrated across the discovery/extraction, curation, and inference. 8 | 9 | The end objective, however, is hypothesis generation. This feature situates the most novel contribution of MULTIVAC essentially outside these levels, at the top of the more process-oriented second schematic on the previous slide. In effect, MULTIVAC’s “inference” component inverts the standard intention and, instead of using the work done in the extraction and curation layers to arrive at new inferences, learns through observation and experimentation how to ask it’s own novel questions that then require more standard inference solutions to answer. Other projects in the program have presented innovative ways of automating or enhancing execution of human inquiries. Our system seeks to automate the production and evolution of those queries in the first place. 10 | 11 | The final goal of a MULTIVAC system for any given domain is to generate new scientific queries relevant to that domain that have not been asked before by humans. These inquiries, properly formatted, could in theory even act as inputs to many of the other TA2 systems. 12 | 13 | ### Wait, but Why? 14 | - The glacial pace of evolution in paradigms and modes of inquiry within domains. 15 | - Stove-pipes within and between domains of scientific inquiry 16 | 17 | 18 | ## ASKE Potential Use Cases 19 | ### Modernizing and consolidating old research: 20 | - While much research is available in digital form today, vast archives exist in hard copy in various forms that are far less searchable. Using an ASKE system to ingest and compile/curate these types of repositories could help revitalize forgotten areas of research. 21 | 22 | ### Breaking stovepipes: 23 | - Sometimes research fields become balkanized between different communities based on approaches, terminologies, or simply favored publication venues. 
An ASKE system that can comprehend a field at scale across these artificial segmentations could help break irrational logjams and cross-pollinate discoveries. 24 | 25 | ### Revitalizing stagnant areas of research: 26 | - Occasionally research fields lose momentum or interest, as consensus emerges on “big questions” or as unknowns become more apparently “unknowable.” Paradigm shifts can happen that help break this stagnation and revolutionize fields, but this can take a great deal of time and is never guaranteed. A system that can analyze a field of research and produce novel questions or avenues of inquiry can help inject new creativity and perspectives and revitalize research. 27 | -------------------------------------------------------------------------------- /doc/installation.md: -------------------------------------------------------------------------------- 1 | # MULTIVAC Installation Guide 2 | 3 | ### Installation Requirements 4 | MULTIVAC can be most easily and cleanly installed using `docker`. To use this method, Docker Desktop is required for launching the system on your local machine. Docker Desktop can be set up easily for either Mac or Windows machines with resources found at the following links: 5 | * For Mac users: https://docs.docker.com/docker-for-mac/install/ 6 | * For Windows users: https://docs.docker.com/docker-for-windows/install/ 7 | 8 | MULTIVAC makes use of multiple linked docker containers, so along with Docker Desktop, users will need to have set up `docker-compose`. Mac, Windows, and Linux instructions for installation can be found here: 9 | * Docker Compose: https://docs.docker.com/compose/install/ 10 | 11 | ### Downloading and Deploying MULTIVAC 12 | The first step is to clone this MULTIVAC repository from GitHub. With Git also locally installed: 13 | * Run the following command in your preferred directory: `git clone https://github.com/GallupGovt/multivac.git` 14 | * Next, change into the MULTIVAC directory you just created and run: `docker-compose up` 15 | 16 | This command will download and build the resources MULTIVAC depends on: Stanford CoreNLP, Grobid Publication Parsing, and Jupyter Notebook Viewer, as well as the core MULTIVAC system itself. This process will take some time on first use, and require well over 10 GB of hard drive space, so please plan accordingly. 17 | 18 | ### Basic Operations 19 | In order to see the running processes under Docker, you can use the `docker ps` command. You should see a running container named *multivac_multivac:latest*. This is the root source of our project. To interact with our code and system, you may use `docker exec -it {container-of-multivac-id} {command}`(i.e. `docker exec -it abd35789sbd2 python3 querygan_pyt.py --cuda`). You can also access our web application through port 5000 of your machine, i.e. http://0.0.0.0:5000 or http://your.ip.add:5000 if on a VM. 20 | 21 | To run any docker commands in the background, add the flag `-d` to your command. Once the system is built, you can always start and stop it with the commands `docker-compose start` and `docker-compose stop`. 22 | -------------------------------------------------------------------------------- /doc/notebooks/alpha/README.md: -------------------------------------------------------------------------------- 1 | # Working Files 2 | NOTE: The files and code in this directory and sub-directories are deprecated, work-in-progress, or both. This code is not intended to work. 
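Pulling the installation steps above together, a typical first session looks roughly like the sketch below. This is illustrative only: the container ID reported by `docker ps` will differ on your machine, and `querygan_pyt.py --cuda` is simply the example command from the installation guide; substitute whichever MULTIVAC script you need to run.

```bash
# clone the repository and build/launch the linked containers
# (MULTIVAC core, Stanford CoreNLP, Grobid, Jupyter nbviewer)
git clone https://github.com/GallupGovt/multivac.git
cd multivac
docker-compose up -d                 # -d runs the stack in the background

# list running containers and note the MULTIVAC container ID
docker ps

# run a command inside the MULTIVAC container
docker exec -it <container-id> python3 querygan_pyt.py --cuda

# the web application is exposed on port 5000:
#   http://0.0.0.0:5000 (or http://your.ip.add:5000 on a VM)

# once built, stop and restart the system without rebuilding
docker-compose stop
docker-compose start
```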
-------------------------------------------------------------------------------- /doc/notebooks/directed_query_gen_walkthrough.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Walk Through for Directed Query Generation\n", 8 | "This notebook outlines the process of generating novel questions based on a user's seed topic using MULTIVAC's semantic knowledge graph and trained query generator. \n", 9 | "First, we set up the required imports and arguments for the test. " 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from multivac.src.rdf_graph.map_queries import *\n", 19 | "tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)\n", 20 | "from multivac.src.gan.gen_test import run\n", 21 | "os.chdir('src/gan')" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "args_dict = {'dir': os.path.abspath('../../data'),\n", 31 | " 'out': os.path.abspath('../../models'),\n", 32 | " 'glove': '../../models/glove.42B.300d',\n", 33 | " 'run': 'model',\n", 34 | " 'model': 'transe',\n", 35 | " 'threshold': 0.1,\n", 36 | " 'num_top_rel': 10}\n" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "Next, we load up the knowledge graph embedding model previously calculated. This embedding model allows us to assign probabilities to missing nodes or relationships in the knowledge graph proposed via submitted queries. Here we are using TransE, an approach which models relationships by interpreting them as translations operating on the low-dimensional embeddings of entities." 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "con = config.Config()\n", 53 | "con.set_in_path(args_dict['dir']+os.path.sep)\n", 54 | "con.set_work_threads(8)\n", 55 | "con.set_dimension(100)\n", 56 | "con.set_test_link_prediction(True)\n", 57 | "con.set_test_triple_classification(True)\n", 58 | "\n", 59 | "files = glob.glob(os.path.join(args_dict['out'],'*tf*'))\n", 60 | "times = list(set([file.split('.')[2] for file in files]))\n", 61 | "ifile = max([datetime.strptime(x, '%d%b%Y-%H:%M:%S') for x in times]).strftime('%d%b%Y-%H:%M:%S')\n", 62 | "con.set_import_files(os.path.join(args_dict['out'], 'model.vec.{}.tf'.format(ifile)))\n", 63 | "\n", 64 | "con.init()\n", 65 | "kem = set_model_choice(args_dict['model'])\n", 66 | "con.set_model(kem)\n", 67 | "\n", 68 | "\n", 69 | "files = [x for x in os.listdir(con.in_path) if '2id' in x]\n", 70 | "rel_file = get_newest_file(con.in_path, files, 'relation')\n", 71 | "ent_file = get_newest_file(con.in_path, files, 'entity')\n", 72 | "trn_file = get_newest_file(con.in_path, files, 'train')\n", 73 | "\n", 74 | "entities = pd.read_csv(ent_file, sep='\\t', \n", 75 | " names=[\"Ent\",\"Id\"], skiprows=1)\n", 76 | "relations = pd.read_csv(rel_file, sep='\\t', \n", 77 | " names=[\"Rel\",\"Id\"], skiprows=1)\n", 78 | "train = pd.read_csv(trn_file, sep='\\t', \n", 79 | " names=[\"Head\",\"Tail\",\"Relation\"], skiprows=1)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "We then set up a GloVe embedding model. 
Here we use the large scale, pre-trained GloVe embedding model given the open domain nature of potential submitted questions." 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "glove_vocab, glove_emb = load_word_vectors(args_dict['glove'])\n" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "Finally, we input our seed topic and extract the knowledge graph elements and predicted elements most related to that topic. The system identifies all triples containing the topic or closely semantically related to it, and returns the top `num_top_rel` results (by default, 10)." 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "sample_topic = 'avian flu'" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "results = predict_object(con, sample_topic, relations, entities, train, glove_vocab, glove_emb, exact=False)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "These results are then fed to the query generator, which produces questions in response to each topic. The `run()` function called below does two main things: 1) submit the \"query\" triples to the Generator system to be parsed into a tree object representing the consituency parse of an English language question, and 2) translate that parse into the surface text for presentation:\n", 128 | "```python\n", 129 | " results = netG.parse(query, beam_size=netG.args['beam_size'])\n", 130 | " texts = [asdl_ast_to_english(x.tree) for x in results]\n", 131 | "\n", 132 | " return texts\n", 133 | "```" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "questions = results.Text.apply(lambda x: run({'query': list(x), \n", 143 | " 'model': os.path.join(args_dict['out'], 'gen_checkpoint.pth')}))" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "questions.values" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [] 161 | } 162 | ], 163 | "metadata": { 164 | "kernelspec": { 165 | "display_name": "Python 3", 166 | "language": "python", 167 | "name": "python3" 168 | }, 169 | "language_info": { 170 | "codemirror_mode": { 171 | "name": "ipython", 172 | "version": 3 173 | }, 174 | "file_extension": ".py", 175 | "mimetype": "text/x-python", 176 | "name": "python", 177 | "nbconvert_exporter": "python", 178 | "pygments_lexer": "ipython3", 179 | "version": "3.7.5" 180 | } 181 | }, 182 | "nbformat": 4, 183 | "nbformat_minor": 2 184 | } 185 | -------------------------------------------------------------------------------- /doc/notebooks/gan_training_illustration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Walk Through for GAN Training\n", 8 | "This notebook illustrates the training of MULTIVAC's generative adversarial network system for query generation.\n", 9 | "First, we set up the required imports and arguments for the test. 
This process can also be performed all at once from the command line:

\n", 10 | "`python3 querygan_pyt.py --gan_D_STEPS 1 --gan_K_STEPS 2 --gan_ROLLOUT_NUM 3 --gan_GENERATED_NUM 100`

\n", 11 | "(training and model parameters are read from a `config.cfg` file, but any of them may be overriden at run time with the appropriate arguments. Here, we reduce the number of steps and the generated samples batch size to better illustrate the entire training cycle in a more timely fashion. " 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import os\n", 21 | "os.chdir('src/gan')\n", 22 | "from multivac.src.gan.querygan_pyt import *" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "other_args = ['--gan_D_STEPS', '1', \n", 32 | " '--gan_K_STEPS', '2', \n", 33 | " '--gan_ROLLOUT_NUM', '3', \n", 34 | " '--gan_GENERATED_NUM', '100']\n", 35 | "\n", 36 | "args = {'config': 'config.cfg',\n", 37 | " 'cuda': False,\n", 38 | " 'continue': True,\n", 39 | " 'gen_chk': '../../models/gen_checkpoint.pth',\n", 40 | " 'disc_chk': '../../models/disc_checkpoint.pth'}\n", 41 | "\n", 42 | "overrides = {}\n", 43 | "\n", 44 | "i = 0\n", 45 | "\n", 46 | "while i < len(other_args):\n", 47 | " if other_args[i].startswith('--'):\n", 48 | " key = other_args[i][2:]\n", 49 | " value = other_args[i+1]\n", 50 | "\n", 51 | " if value.startswith('--'):\n", 52 | " overrides[key] = True\n", 53 | " i += 1\n", 54 | " continue\n", 55 | " else:\n", 56 | " overrides[key] = value\n", 57 | " i += 2\n", 58 | " else:\n", 59 | " i += 1\n", 60 | "\n", 61 | "cfg = configparser.ConfigParser()\n", 62 | "cfgDIR = os.path.dirname(os.getcwd())\n", 63 | "\n", 64 | "if args['config'] is not None:\n", 65 | " cfg.read(args['config'])\n", 66 | "else:\n", 67 | " cfg.read(os.path.join(cfgDIR, 'config.cfg'))\n", 68 | "\n", 69 | "cfg_dict = cfg._sections\n", 70 | "cfg_dict['ARGS'] = args\n", 71 | "\n", 72 | "for arg in overrides:\n", 73 | " section, param = arg.split(\"_\", 1)\n", 74 | " try:\n", 75 | " cfg[section.upper()][param] = overrides[arg]\n", 76 | " except KeyError:\n", 77 | " print(\"Section \" + section.upper() + \"not found in \"\n", 78 | " \"\" + args['config'] + \", skipping.\")\n", 79 | " continue\n", 80 | "\n", 81 | "for name, section in cfg_dict.items():\n", 82 | " for carg in section:\n", 83 | " # Cast all arguments to proper types\n", 84 | " if section[carg] == 'None':\n", 85 | " section[carg] = None\n", 86 | " continue\n", 87 | "\n", 88 | " try:\n", 89 | " section[carg] = int(section[carg])\n", 90 | " except:\n", 91 | " try:\n", 92 | " section[carg] = float(section[carg])\n", 93 | " except:\n", 94 | " if section[carg] in ['True','False']:\n", 95 | " section[carg] = eval(section[carg])\n" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "Next, we load up the knowledge graph embedding model previously calculated. This embedding model allows us to assign probabilities to missing nodes or relationships in the knowledge graph proposed via submitted queries. Here we are using TransE, an approach which models relationships by interpreting them as translations operating on the low-dimensional embeddings of entities." 
103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "continue_training(cfg_dict, args['gen_chk'], args['disc_chk'])" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [] 120 | } 121 | ], 122 | "metadata": { 123 | "kernelspec": { 124 | "display_name": "Python 3", 125 | "language": "python", 126 | "name": "python3" 127 | }, 128 | "language_info": { 129 | "codemirror_mode": { 130 | "name": "ipython", 131 | "version": 3 132 | }, 133 | "file_extension": ".py", 134 | "mimetype": "text/x-python", 135 | "name": "python", 136 | "nbconvert_exporter": "python", 137 | "pygments_lexer": "ipython3", 138 | "version": "3.7.5" 139 | } 140 | }, 141 | "nbformat": 4, 142 | "nbformat_minor": 2 143 | } 144 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | multivac: 4 | build: 5 | context: . 6 | dockerfile: Dockerfile 7 | ports: 8 | - "5000:5000" 9 | depends_on: 10 | - stanfordnlp 11 | - grobid 12 | - jupyter 13 | links: 14 | - "jupyter" 15 | volumes: 16 | - "./:/app" 17 | stanfordnlp: 18 | image: "graham3333/corenlp-complete" 19 | ports: 20 | - "9000:9000" 21 | grobid: 22 | image: "lfoppiano/grobid:0.5.5" 23 | ports: 24 | - "8070:8070" 25 | - "8071:8071" 26 | jupyter: 27 | image: "jupyter/nbviewer" 28 | ports: 29 | - "8080:8080" 30 | -------------------------------------------------------------------------------- /get_kg_query_params.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This script is meant to use network centrality measures to identify and select 5 | nodes and edges (entities and relations) that would make good prediction 6 | starting points for the MULTIVAC system. It does this by using eigenvector 7 | centrality but can be extended to include additional network centrality 8 | measures. 
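An illustrative invocation (a sketch only; the file names below are
placeholders). The two inputs, entities file first and then the train/triples
file, are expected in the same tab-separated format, with a header line that
is skipped, as the entity2id.txt and train2id.txt files used elsewhere in
MULTIVAC:

    python3 get_kg_query_params.py \
        -f data/entity2id.txt data/train2id.txt \
        -m eigenvector -n 10

The highest-centrality entity names are written to
search_terms_<measure>.txt in the current working directory.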
9 | """ 10 | import argparse 11 | import sys 12 | 13 | import networkx as nx 14 | 15 | 16 | def analyze_network(net, args_dict): 17 | if 'degree' in args_dict['measure']: 18 | ans = nx.degree_centrality(net) 19 | elif 'eigenvector' in args_dict['measure']: 20 | ans = nx.eigenvector_centrality(net) 21 | else: 22 | sys.exit('Whoops; you must provide a valid network centrality measure.') 23 | ans = sorted(ans.items(), key=lambda x: x[1], reverse=True) 24 | 25 | return ans[:args_dict['num_results']] 26 | 27 | 28 | def build_network(data): 29 | tmp = [tuple(x[:2]) for x in data] 30 | g = nx.Graph() 31 | g.add_edges_from(tmp) 32 | 33 | return g 34 | 35 | 36 | def read_txt(file): 37 | with open(file) as f: 38 | tmp = f.readlines()[1:] 39 | 40 | return [x.rstrip(' \n').split('\t') for x in tmp] 41 | 42 | 43 | def run(args_dict): 44 | # read in data 45 | entities = read_txt(args_dict['files'][0]) 46 | network = read_txt(args_dict['files'][1]) 47 | 48 | # construct/analyze network 49 | net = build_network(network) 50 | results = analyze_network(net, args_dict) 51 | 52 | # return results 53 | named_entities = ['{}\n'.format(entity[0]) for entity in entities if 54 | entity[1] in [res[0] for res in results]] 55 | 56 | with open('search_terms_{}.txt'.format(args_dict['measure']), 'w') as f: 57 | f.writelines(named_entities) 58 | 59 | 60 | if __name__ == '__main__': 61 | parser = argparse.ArgumentParser(description='Run network centrality ' 62 | 'measures on data.') 63 | parser.add_argument('-f', '--files', nargs=2, required=True, help='Two ' 64 | 'files -- entities then train -- that are parsed to ' 65 | 'create a network.') 66 | parser.add_argument('-m', '--measure', required=False, 67 | default='eigenvector', choices=['degree', 'eigenvector'], 68 | help='Select which network centrality ' 69 | 'measure is required.') 70 | parser.add_argument('-n', '--num_results', required=False, default=10, 71 | type=int, help='Number of results to return from ' 72 | 'centrality calculation.') 73 | args_dict = vars(parser.parse_args()) 74 | 75 | run(args_dict) 76 | -------------------------------------------------------------------------------- /images/KCCA_equation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/KCCA_equation.png -------------------------------------------------------------------------------- /images/MULTIVAC_schematic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/MULTIVAC_schematic.png -------------------------------------------------------------------------------- /images/aske_schematic_v1.5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/aske_schematic_v1.5.png -------------------------------------------------------------------------------- /images/aske_schematic_v1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/aske_schematic_v1.png -------------------------------------------------------------------------------- /images/emulated.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/emulated.png -------------------------------------------------------------------------------- /images/emulated_kg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/emulated_kg.png -------------------------------------------------------------------------------- /images/formula.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/formula.png -------------------------------------------------------------------------------- /images/formula_dependencies.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/formula_dependencies.png -------------------------------------------------------------------------------- /images/gan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/gan.png -------------------------------------------------------------------------------- /images/gan_design.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/gan_design.png -------------------------------------------------------------------------------- /images/key_triples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/key_triples.png -------------------------------------------------------------------------------- /images/krongen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/krongen.png -------------------------------------------------------------------------------- /images/latex_parse_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/latex_parse_1.png -------------------------------------------------------------------------------- /images/latex_parse_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/latex_parse_2.png -------------------------------------------------------------------------------- /images/multivac_concept.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/multivac_concept.png -------------------------------------------------------------------------------- /images/phase_one_system.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/phase_one_system.png 
-------------------------------------------------------------------------------- /images/qgnet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/qgnet.png -------------------------------------------------------------------------------- /images/simple_kg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/simple_kg.png -------------------------------------------------------------------------------- /images/stanford_dependecies.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/stanford_dependecies.png -------------------------------------------------------------------------------- /multivac.cfg: -------------------------------------------------------------------------------- 1 | [PATHS] 2 | #root_dir = 3 | #data_dir = 4 | #raw_dir = 5 | #interim_dir = 6 | #processed_dir = 7 | #metadata_dir = 8 | #models_dir = 9 | #stanf_nlp_dir = 10 | 11 | [SEARCH] 12 | # define search terms 13 | terms = ['sir model', 'susceptible-infected-recovered', 'irSIR model', 14 | 'susceptible-infected', 'seir model', 15 | 'susceptible-exposed-infected-recovered'] 16 | 17 | # specify sources 18 | sources = ['arxiv', 'pubmed', 'springer'] 19 | 20 | # filter terms for selected apis 21 | 22 | [FILTER] 23 | # arxiv: filter out selected topics given default query that targets concepts related to 24 | # susceptible-infected-recovered 25 | drops = ['astro-ph Astrophysics', 26 | 'astro-ph.CO Cosmology and Nongalactic Astrophysics', 27 | 'astro-ph.EP Earth and Planetary Astrophysics', 28 | 'astro-ph.GA Astrophysics of Galaxies', 29 | 'astro-ph.HE High Energy Astrophysical Phenomena', 30 | 'astro-ph.IM Instrumentation and Methods for Astrophysics', 31 | 'astro-ph.SR Solar and Stellar Astrophysics', 32 | 'cond-mat.mes-hall Mesoscale and Nanoscale Physics', 33 | 'cond-mat.mtrl-sci Materials Science', 34 | 'cond-mat.other Other Condensed Matter', 35 | 'cond-mat.quant-gas Quantum Gases', 36 | 'cond-mat.soft Soft Condensed Matter', 37 | 'cond-mat.stat-mech Statistical Mechanics', 38 | 'cond-mat.str-el Strongly Correlated Electrons', 39 | 'cond-mat.supr-con Superconductivity', 40 | 'eess.AS Audio and Speech Processing', 41 | 'eess.IV Image and Video Processing', 42 | 'eess.SP Signal Processing', 43 | 'gr-qc General Relativity and Quantum Cosmology', 44 | 'hep-ex High Energy Physics - Experiment', 45 | 'hep-lat High Energy Physics - Lattice', 46 | 'hep-ph High Energy Physics - Phenomenology', 47 | 'hep-th High Energy Physics - Theory', 48 | 'math.AC Commutative Algebra', 49 | 'math.AG Algebraic Geometry', 50 | 'nucl-ex Nuclear Experiment', 51 | 'nucl-th Nuclear Theory', 52 | 'physics.acc-ph Accelerator Physics', 53 | 'physics.ao-ph Atmospheric and Oceanic Physics', 54 | 'physics.app-ph Applied Physics', 55 | 'physics.atm-clus Atomic and Molecular Clusters', 56 | 'physics.atom-ph Atomic Physics', 57 | 'physics.chem-ph Chemical Physics', 58 | 'physics.class-ph Classical Physics', 59 | 'physics.comp-ph Computational Physics', 60 | 'physics.ed-ph Physics Education', 61 | 'physics.flu-dyn Fluid Dynamics', 62 | 'physics.gen-ph General Physics', 63 | 'physics.geo-ph Geophysics', 64 | 'physics.hist-ph History and Philosophy of Physics', 
65 | 'physics.ins-det Instrumentation and Detectors', 66 | 'physics.med-ph Medical Physics', 67 | 'physics.optics Optics', 68 | 'physics.plasm-ph Plasma Physics', 69 | 'physics.space-ph Space Physics', 70 | 'q-fin.CP Computational Finance', 71 | 'q-fin.EC Economics', 72 | 'q-fin.GN General Finance', 73 | 'q-fin.MF Mathematical Finance', 74 | 'q-fin.PM Portfolio Management', 75 | 'q-fin.PR Pricing of Securities', 76 | 'q-fin.RM Risk Management', 77 | 'q-fin.ST Statistical Finance', 78 | 'q-fin.TR Trading and Market Microstructure', 79 | 'quant-ph Quantum Physics'] -------------------------------------------------------------------------------- /pymln/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Gallup Government, Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /pymln/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 4 | # Python implementation of Unsupervised Semantic Parsing system, from: 5 | # 6 | # Hoifung Poon and Pedro Domingos (2009). "Unsupervised Semantic Parsing", 7 | # in Proceedings of the Conference on Empirical Methods in Natural Language 8 | # Processing (EMNLP), 2009. http://alchemy.cs.washington.edu/usp. 
9 | # 10 | 11 | from multivac.pymln.semantic import * 12 | from multivac.pymln.syntax import * 13 | from multivac.pymln.utils import * 14 | from multivac.pymln.eval import * 15 | 16 | -------------------------------------------------------------------------------- /pymln/eval/Answer.py: -------------------------------------------------------------------------------- 1 | 2 | class Answer(object): 3 | def __init__(self, sid, rst): 4 | self._sid = sid 5 | self._rst = rst 6 | 7 | def __hash__(self): 8 | return hash(self.toString()) 9 | 10 | def __eq__(self, other): 11 | return self.compareTo(other) == 0 12 | 13 | def __lt__(self, other): 14 | return self.compareTo(other) < 0 15 | 16 | def __str__(self): 17 | return self.toString() 18 | 19 | def getSentId(self): 20 | return self._sid 21 | 22 | def getRst(self): 23 | return self._rst 24 | 25 | def compareTo(self, a): 26 | result = 0 27 | 28 | if self._rst != a.getRst(): 29 | if self._rst < a.getRst(): 30 | result -= 1 31 | else: 32 | result += 1 33 | elif self._sid != a.getSentId(): 34 | if self._sid < a.getSentId(): 35 | result -= 1 36 | else: 37 | result += 1 38 | 39 | return result 40 | 41 | def toString(self): 42 | return ' '.join([self._sid, self._rst]) 43 | 44 | -------------------------------------------------------------------------------- /pymln/eval/Question.py: -------------------------------------------------------------------------------- 1 | 2 | class Question(object): 3 | def __init__(self, rel, arg, dep): 4 | self._rel = rel 5 | self._dep = dep 6 | self._arg = arg 7 | self._argClustIdxSeq = None 8 | 9 | def __hash__(self): 10 | return hash(self.toString()) 11 | 12 | def __eq__(self, other): 13 | return self.compareTo(other) == 0 14 | 15 | def __lt__(self): 16 | return self.compareTo(other) < 0 17 | 18 | def __str__(self): 19 | return self.toString() 20 | 21 | def getRel(self): 22 | return self._rel 23 | 24 | def getArg(self): 25 | return self._arg 26 | 27 | def getDep(self): 28 | return self._dep 29 | 30 | def compareTo(self, q): 31 | result = 0 32 | 33 | if self._dep != q.getDep(): 34 | if self._dep < q.getDep(): 35 | result -= 1 36 | else: 37 | result += 1 38 | elif self._rel != q.getRel(): 39 | if self._rel < q.getRel(): 40 | result -= 1 41 | else: 42 | result += 1 43 | elif self._arg != q.getArg(): 44 | if self._arg < q.getArg(): 45 | result -= 1 46 | else: 47 | result += 1 48 | 49 | return result 50 | 51 | def getPattern(self): 52 | if self._dep == 'nsubj': 53 | return ' '.join([self._arg, self._rel]) 54 | elif self._dep == 'dobj': 55 | return ' '.join([self._rel, self._arg]) 56 | else: 57 | return None 58 | 59 | def toString(self): 60 | if self._dep == 'nsubj': 61 | return "What does {} {}?".format(self._arg, self._rel) 62 | elif self._dep == 'dobj': 63 | return "What {}s {}?".format(self._rel, self._arg) 64 | else: 65 | return "{} ::: {} ::: {}".format(self._rel, self._dep, self._arg) 66 | 67 | -------------------------------------------------------------------------------- /pymln/eval/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from . Answer import Answer 4 | from . Question import Question 5 | from . 
USP import USP 6 | 7 | -------------------------------------------------------------------------------- /pymln/pymln.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Python implementation of Unsupervised Semantic Parsing system, from: 4 | # 5 | # Hoifung Poon and Pedro Domingos (2009). "Unsupervised Semantic Parsing", 6 | # in Proceedings of the Conference on Empirical Methods in Natural Language 7 | # Processing (EMNLP), 2009. http://alchemy.cs.washington.edu/usp. 8 | import os 9 | 10 | from datetime import datetime 11 | 12 | from multivac import settings 13 | from multivac.pymln.semantic import Parse, MLN, Clust 14 | from multivac.pymln.syntax.StanfordParseReader import StanfordParseReader 15 | 16 | 17 | def read_input_files(DIR): 18 | files = [] 19 | for file in os.listdir(DIR): 20 | if file.endswith(".dep"): 21 | files.append(file) 22 | 23 | return files 24 | 25 | 26 | def mln_main(args_dict): 27 | # set variables 28 | verbose = args_dict['verbose'] 29 | data_dir = settings.data_dir 30 | results_dir = settings.mln_dir 31 | parser = Parse(args_dict['prior_num_param'], args_dict['prior_num_conj']) 32 | 33 | # read in inputs 34 | input_files = read_input_files(data_dir) 35 | input_files.sort() 36 | 37 | # set final parameter 38 | if 'subset' in args_dict: 39 | subset = args_dict['subset'] 40 | else: 41 | subset = len(input_files) 42 | 43 | articles = [] 44 | for i, fileName in enumerate(input_files): 45 | try: 46 | a = StanfordParseReader.readParse(fileName, data_dir) 47 | except: 48 | print("Error on {}, {}".format(i, fileName)) 49 | raise Exception 50 | 51 | if i%100 == 0: 52 | print("{} articles parsed.".format(i)) 53 | 54 | if i >= subset: 55 | break 56 | 57 | articles.append(a) 58 | 59 | 60 | if verbose: 61 | print("{} Initializing...".format(datetime.now())) 62 | parser.initialize(articles, verbose) 63 | 64 | if verbose: 65 | print("{}: {} articles parsed, of {} sentences and {} total tokens." 66 | .format(datetime.now(), 67 | len(articles), 68 | parser.numSents, 69 | parser.numTkns)) 70 | num_arg_clusts = sum([len(x._argClusts) for x in Clust.clusts.values()]) 71 | 72 | if verbose: 73 | print("{}: {} initial clusters, with {} argument clusters." 74 | .format(datetime.now(), len(Clust.clusts), num_arg_clusts)) 75 | print("{} Merging arguments...".format(datetime.now())) 76 | parser.mergeArgs() 77 | num_arg_clusts = sum([len(x._argClusts) for x in Clust.clusts.values()]) 78 | 79 | if verbose: 80 | print("Now with {} initial clusters, {} argument clusters." 81 | .format(len(Clust.clusts), num_arg_clusts)) 82 | print("{} Creating agenda...".format(datetime.now())) 83 | parser.agenda.createAgenda(verbose) 84 | 85 | if verbose: 86 | print("{}: {} possible operations in queue, {} merges and {} composes." 87 | .format(datetime.now(), 88 | len(parser.agenda._agendaToScore), 89 | len(parser.agenda._mc_neighs), 90 | len(parser.agenda._compose_cnt))) 91 | print("{} Processing agenda...".format(datetime.now())) 92 | parser.agenda.procAgenda(verbose) 93 | 94 | num_arg_clusts = sum([len(x._argClusts) for x in Clust.clusts.values()]) 95 | 96 | if verbose: 97 | print("{}: {} final clusters, with {} argument clusters." 
98 | .format(datetime.now(), len(Clust.clusts), num_arg_clusts)) 99 | 100 | MLN.save_mln(results_dir / "mln.pkl") 101 | MLN.printModel(results_dir) 102 | 103 | if verbose: 104 | print("{} Induced MLN saved.".format(datetime.now())) 105 | 106 | 107 | if __name__ == '__main__': 108 | mln_main(args_dict) 109 | -------------------------------------------------------------------------------- /pymln/semantic/Argument.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Argument(object): 4 | def __init__(self, argNode, path, argPart): 5 | self._argNode = argNode 6 | self._path = path 7 | self._argPart = argPart 8 | 9 | return None 10 | 11 | def getPath(self): 12 | return self._path 13 | 14 | def getPart(self): 15 | return self._argPart 16 | 17 | def getNode(self): 18 | return self._argNode 19 | 20 | 21 | -------------------------------------------------------------------------------- /pymln/semantic/MLN.py: -------------------------------------------------------------------------------- 1 | 2 | from multivac.pymln.semantic import Clust, ArgClust, Part 3 | from multivac.pymln.syntax.Relations import ArgType, RelType 4 | 5 | import json 6 | import pickle 7 | import os 8 | 9 | class MLN(object): 10 | ''' 11 | Class for simply outputting the MLN structure parsed from source 12 | documents. 13 | 14 | ''' 15 | def __init__(self): 16 | return None 17 | 18 | def printModel(path): 19 | clustering = MLN.printClustering(path) 20 | mln = MLN.printMLN(path) 21 | prse = MLN.printParse(path) 22 | 23 | if path is None: 24 | return clustering, mln, prse 25 | else: 26 | return None 27 | 28 | def printClustering(path=None): 29 | out_str = "=== Clustering ===\n" 30 | 31 | for ci, clust in Clust.clusts.items(): 32 | # if len(clust._relTypeIdx_cnt) > 1: 33 | out_str += str(ci) + " " + clust.toString() + "\n" 34 | for aci, ac in clust._argClusts.items(): 35 | out_str += "\t{}\t{}\t{}\n".format(aci, ac.toString(), ac._ttlArgCnt) 36 | 37 | if path is not None: 38 | dst = "{}/{}.clustering".format(path, 39 | os.path.basename(os.path.dirname(path))) 40 | with open(dst, 'w') as f: 41 | f.write(out_str) 42 | 43 | return None 44 | else: 45 | return out_str 46 | 47 | def save_mln(path): 48 | ''' 49 | Save all objects necessary to recreate the MLN knowledgebase 50 | ''' 51 | with open(path, 'wb') as f: 52 | pickle.dump({'clusts': Clust.clusts, 53 | 'relTypeIdx_clustIdx': Clust.relTypeIdx_clustIdx, 54 | 'relTypes': RelType.relTypes, 55 | 'relTypeStr_idx': RelType.relTypeStr_idx, 56 | 'argTypes': ArgType.argTypes, 57 | 'argTypeStr_idx': ArgType.argTypeStr_idx, 58 | 'rootNodeId_part': Part.rootNodeId_part, 59 | 'clustIdx_partRootNodeIds': Part.clustIdx_partRootNodeIds, 60 | 'pairClustIdxs_pairPartRootNodeIds': Part.pairClustIdxs_pairPartRootNodeIds}, 61 | f) 62 | 63 | return None 64 | 65 | def load_mln(path, ret=False): 66 | with open(path, 'rb') as f: 67 | mln = pickle.load(f) 68 | 69 | try: 70 | _ = len(Clust.clusts) 71 | _ = len(ArgType.argTypes) 72 | _ = len(RelType.relTypes) 73 | _ = len(Part.rootNodeId_part) 74 | except NameError: 75 | from multivac.pymln.semantic import Clust, Part 76 | from multivac.pymln.syntax.Relations import ArgType, RelType 77 | 78 | Clust.clusts = mln['clusts'] 79 | Clust.relTypeIdx_clustIdx = mln['relTypeIdx_clustIdx'] 80 | RelType.relTypes = mln['relTypes'] 81 | RelType.relTypeStr_idx = mln['relTypeStr_idx'] 82 | ArgType.argTypes = mln['argTypes'] 83 | ArgType.argTypeStr_idx = mln['argTypeStr_idx'] 84 | Part.rootNodeId_part = mln['rootNodeId_part'] 
85 | Part.clustIdx_partRootNodeIds = mln['clustIdx_partRootNodeIds'] 86 | Part.pairClustIdxs_pairPartRootNodeIds = mln['pairClustIdxs_pairPartRootNodeIds'] 87 | 88 | if ret: 89 | return mln 90 | else: 91 | return None 92 | 93 | def printMLN(path=None): 94 | out_str = "" 95 | 96 | for ci in Clust.clusts: 97 | cl = Clust.getClust(ci) 98 | out_str += "{}\t{}\n".format(cl._clustIdx,cl) 99 | 100 | for aci in cl._argClusts: 101 | ac = cl._argClusts[aci] 102 | out_str += "\t{}: ".format(aci) 103 | 104 | out_str += "\t".join(["{}: {}".format(k, v) 105 | for k, v in ac._argNum_cnt.items()]) 106 | out_str += "\n\t" 107 | out_str += "\t".join(["{}: {}: {}".format(k, 108 | ArgType.getArgType(k).toString(), 109 | v) 110 | for k, v in ac._argTypeIdx_cnt.items()]) 111 | out_str += "\n\t" 112 | out_str += "\t".join(["{}: {}: {}".format(k, 113 | Clust.getClust(k), 114 | v) 115 | for k, v in ac._chdClustIdx_cnt.items()]) 116 | out_str += "\n" 117 | 118 | if path is not None: 119 | dst = "{}/{}.mln".format(path, 120 | os.path.basename(os.path.dirname(path))) 121 | 122 | with open(dst, 'w') as f: 123 | f.write(out_str) 124 | 125 | return None 126 | else: 127 | return out_str 128 | 129 | 130 | def printParse(path=None): 131 | out_str = "" 132 | 133 | for rnid, pt in Part.rootNodeId_part.items(): 134 | out_str += "{}\t{}\n".format(rnid, pt._relTreeRoot.getTreeStr()) 135 | out_str += "\t{}: {}\n".format(pt._clustIdx, 136 | Clust.getClust(pt._clustIdx).toString()) 137 | 138 | if pt._parPart is None: 139 | out_str += "\t\n\t\n" 140 | else: 141 | arg = pt._parPart.getArgument(pt._parArgIdx) 142 | out_str += "\t{}\t{}\t{}\n".format(pt._parPart._relTreeRoot.getId(), 143 | pt._parPart._clustIdx, 144 | Clust.getClust(pt._parPart._clustIdx)) 145 | out_str += "\t{}: {}: {}\n".format(pt._parPart.getArgClust(pt._parArgIdx), 146 | arg._path.getArgType(), 147 | ArgType.getArgType(arg._path.getArgType())) 148 | 149 | if path is not None: 150 | dst = "{}/{}.parse".format(path, 151 | os.path.basename(os.path.dirname(path))) 152 | with open(dst, 'w') as f: 153 | f.write(out_str) 154 | 155 | return None 156 | else: 157 | return out_str 158 | 159 | -------------------------------------------------------------------------------- /pymln/semantic/ParseParams.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class ParseParams(object): 4 | minMCCnt = 10 5 | minAbsCnt = 50 6 | priorCutOff = 10 7 | priorNumArgComb = 1 8 | priorMerge = 0 9 | priorNumParam = 5 10 | priorNumConj = 10 11 | 12 | -------------------------------------------------------------------------------- /pymln/semantic/SearchOp.py: -------------------------------------------------------------------------------- 1 | 2 | from multivac.pymln.semantic import Clust 3 | 4 | class SearchOp(object): 5 | # Why are these strings instead of just integers? 
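# The three search operations used by the USP learner: OP_MERGE_CLUST merges
# two clusters (_clustIdx1/_clustIdx2); OP_MERGE_ROLE merges two argument
# clusters within a single cluster (_clustIdx, _argIdx1/_argIdx2); and
# OP_COMPOSE composes a parent cluster with a child cluster
# (_parClustIdx/_chdClustIdx). genString() below shows how each is rendered.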
6 | OP_MERGE_CLUST = '0' 7 | OP_MERGE_ROLE = '1' 8 | OP_COMPOSE = '2' 9 | 10 | def __init__(self): 11 | self._op = '' 12 | self._clustIdx1 = None 13 | self._clustIdx2 = None 14 | self._clustIdx = None 15 | self._argIdx1 = None 16 | self._argIdx2 = None 17 | self._parClustIdx = None 18 | self._chdClustIdx = None 19 | self._str = None 20 | 21 | def __hash__(self): 22 | return hash(self.toString()) 23 | 24 | def __eq__(self, other): 25 | return self.compareTo(other) == 0 26 | 27 | def __lt__(self, other): 28 | return self.compareTo(other) < 0 29 | 30 | def __repr__(self): 31 | return self.toString() 32 | 33 | def compareTo(self, z): 34 | this = sum([ord(x) for x in self.toString()]) 35 | that = sum([ord(x) for x in z.toString()]) 36 | result = this - that 37 | 38 | return result 39 | 40 | def toString(self): 41 | if self._str is None: 42 | self.genString() 43 | 44 | return self._str 45 | 46 | def genString(self): 47 | self._str = "OP_{}:".format(self._op) 48 | 49 | if self._op == SearchOp.OP_MERGE_CLUST: 50 | c1 = Clust.getClust(self._clustIdx1) 51 | c2 = Clust.getClust(self._clustIdx2) 52 | self._str += "{} == {}".format(c1.toString(), c2.toString()) 53 | elif self._op == SearchOp.OP_MERGE_ROLE: 54 | self._str += "{}:{}:{}".format(self._clustIdx, 55 | self._argIdx1, 56 | self._argIdx2) 57 | elif self._op == SearchOp.OP_COMPOSE: 58 | rc = Clust.getClust(self._parClustIdx) 59 | ac = Clust.getClust(self._chdClustIdx) 60 | self._str += "{} ++ {}".format(rc.toString(), ac.toString()) 61 | 62 | 63 | -------------------------------------------------------------------------------- /pymln/semantic/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from . ParseParams import ParseParams 3 | from . Argument import Argument 4 | from . argclust import ArgClust 5 | 6 | from . Clust import Clust 7 | from . SearchOp import SearchOp 8 | from . Part import Part 9 | from . Agenda import Agenda 10 | from . Scorer import Scorer as Scorer 11 | from . Executor import Executor 12 | from . MLN import MLN 13 | 14 | from . Parse import Parse 15 | -------------------------------------------------------------------------------- /pymln/semantic/argclust.py: -------------------------------------------------------------------------------- 1 | 2 | # from collections import OrderedDict 3 | from sortedcontainers import SortedSet 4 | from multivac.pymln.syntax.Relations import ArgType 5 | 6 | class ArgClust(object): 7 | def __init__(self): 8 | # Dictionary mapping {int: int} 9 | self._argTypeIdx_cnt = {} 10 | # Dictionary mapping {int: int} 11 | self._chdClustIdx_cnt = {} 12 | # Dictionary mapping {int: int} 13 | self._argNum_cnt = {} 14 | self._ttlArgCnt = 0 15 | self._partRootTreeNodeIds = SortedSet() 16 | 17 | def toString(self): 18 | s = '' 19 | for k, v in self._argTypeIdx_cnt.items(): 20 | if len(s) > 0: 21 | s += ' ' 22 | s += '{}:{}'.format(ArgType.getArgType(k), v) 23 | 24 | return s 25 | 26 | 27 | -------------------------------------------------------------------------------- /pymln/syntax/Nodes/Article.py: -------------------------------------------------------------------------------- 1 | 2 | from multivac.pymln.syntax.Nodes import Sentence 3 | 4 | class Article(object): 5 | ''' 6 | An Article() is merely a collection of Sentences() (represented as a list) 7 | and an article id, which can be of any particular type but should be unique 8 | in a collection of Articles. 
9 | ''' 10 | def __init__(self, fn=None): 11 | self.uid = fn 12 | self.sentences = [] 13 | 14 | def __repr__(self): 15 | return str(self.__dict__) 16 | 17 | 18 | -------------------------------------------------------------------------------- /pymln/syntax/Nodes/Sentence.py: -------------------------------------------------------------------------------- 1 | from sortedcontainers import SortedSet 2 | from multivac.pymln.syntax.Nodes.Token import Token 3 | 4 | class Sentence(object): 5 | def __init__(self): 6 | ''' 7 | Each sentence consists of: 8 | _tokens: A list of individual tokens in the sentence, containing POS, 9 | lemma, and actual form of the word/item. 10 | _tkn_children: A dictionary mapping parents (denoted by the integer 11 | keys) to children (sets of integer, string tuples). 12 | _tkn_par: A dictionary mapping children (denoted by integer keys) to 13 | parents (tuples of string, integer values) 14 | ''' 15 | self._tokens = [] 16 | 17 | # Dictionary mapping {int: set((int, str))} 18 | self._tkn_children = {0: SortedSet()} 19 | # Dictionary mapping {int: (str, int)} 20 | self._tkn_par = {} 21 | 22 | return None 23 | 24 | 25 | def __repr__(self): 26 | return ('Tokens: ' + str([str(x) for x in self._tokens])) 27 | 28 | def get_tokens(self, idx=None): 29 | ''' 30 | Return Tokens at the specified indices. 31 | ''' 32 | if idx is None: 33 | return self._tokens 34 | elif isinstance(idx, list): 35 | return [self._tokens[i] for i in idx] 36 | elif isinstance(idx, int): 37 | return self.get_token(idx) 38 | else: 39 | raise ValueError 40 | 41 | 42 | def get_token(self, idx): 43 | ''' 44 | Return the Token() at the specified index. 45 | ''' 46 | return self._tokens[idx] 47 | 48 | def add_token(self, tok): 49 | ''' 50 | Append the Token() to the list of _tokens. 51 | ''' 52 | assert isinstance(tok, Token) 53 | self._tokens.append(tok) 54 | 55 | return None 56 | 57 | def get_children(self, parent=None): 58 | ''' 59 | Return the child/children of the parent specified by the given key. If 60 | no key specified, return them all. 61 | ''' 62 | if parent is not None: 63 | if parent in self._tkn_children: 64 | c = self._tkn_children[parent] 65 | else: 66 | c = None 67 | else: 68 | c = self._tkn_children 69 | 70 | return c 71 | 72 | def set_children(self, parent, kids): 73 | ''' 74 | Add the child/children specified by the key/kids key/value pair. 75 | ''' 76 | assert isinstance(kids, SortedSet) 77 | self._tkn_children[parent] = kids 78 | 79 | return None 80 | 81 | def add_child(self, parent, kid): 82 | ''' 83 | Add/update the child/children specified by the key/kids key/value pair. 84 | ''' 85 | assert parent in self._tkn_children 86 | self._tkn_children[parent].add(kid) 87 | 88 | return None 89 | 90 | def get_parent(self, kid): 91 | ''' 92 | Return the parent of the child specified by the given key. 93 | ''' 94 | if kid in self._tkn_par: 95 | return self._tkn_par[kid] 96 | else: 97 | return None 98 | 99 | def set_parent(self, kid, parent): 100 | ''' 101 | Add/update the parent specified by the given key/parent value pair. 
102 | ''' 103 | assert isinstance(parent, tuple) 104 | self._tkn_par[kid] = parent 105 | 106 | return None 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /pymln/syntax/Nodes/Token.py: -------------------------------------------------------------------------------- 1 | 2 | class Token(object): 3 | 4 | contentPOS = set(['J','R','V','N']) 5 | 6 | def isContent(t): 7 | return t._pos[0] in Token.contentPOS 8 | 9 | def isVerb(t): 10 | return t._pos[0] == 'V' 11 | 12 | def isNoun(t): 13 | return (t._pos[0] == 'N') | (self._pos.startswith('PRP')) 14 | 15 | 16 | def __init__(self, pos, lemma, form=None): 17 | self._pos = pos 18 | 19 | if Token.isContent(self): 20 | self._pos = pos[0] 21 | 22 | self._lemma = lemma 23 | 24 | if form is None: 25 | self._form = lemma 26 | else: 27 | self._form = form 28 | 29 | def __hash__(self): 30 | return hash(self.toString()) 31 | 32 | def __lt__(self, other): 33 | return self.compareTo(other) < 0 34 | 35 | def __eq__(self, other): 36 | return self.compareTo(other) == 0 37 | 38 | def __str__(self): 39 | return self.toString() 40 | 41 | def getForm(self): 42 | return self._form 43 | 44 | def getPOS(self): 45 | return self._pos 46 | 47 | def getLemma(self): 48 | return self._lemma 49 | 50 | def compareTo(self, t): 51 | this = sum([ord(x) for x in self._lemma]) 52 | that = sum([ord(x) for x in t._lemma]) 53 | result = this - that 54 | 55 | if result == 0: 56 | this = sum([ord(x) for x in self._pos]) 57 | that = sum([ord(x) for x in t._pos]) 58 | result = this - that 59 | return result 60 | 61 | def equals(self, t): 62 | return (self._pos == t._pos) & (self._lemma == t._lemma) 63 | 64 | def hashCode(self): 65 | return hash(self) 66 | 67 | def toString(self): 68 | return (self._pos + ":" + self._lemma) 69 | -------------------------------------------------------------------------------- /pymln/syntax/Nodes/TreeNode.py: -------------------------------------------------------------------------------- 1 | 2 | # from collections import OrderedDict 3 | from sortedcontainers import SortedDict, SortedSet 4 | from multivac.pymln.syntax.Nodes import Token 5 | 6 | class TreeNode(object): 7 | # map {str: TreeNode} 8 | id_treeNodes = {} 9 | 10 | def __init__(self, tree_node_id, token): 11 | self._id = tree_node_id 12 | self._tkn = token 13 | # map {str: set(TreeNodes)} 14 | self._children = SortedDict() 15 | TreeNode.id_treeNodes[tree_node_id] = self 16 | 17 | def __hash__(self): 18 | return hash(self.toString()) 19 | 20 | def __eq__(self, other): 21 | return self.compareTo(other) == 0 22 | 23 | def __lt__(self, other): 24 | return self.compareTo(other) < 0 25 | 26 | def __str__(self): 27 | return self.toString() 28 | 29 | def __repr__(self): 30 | return self.toString() 31 | 32 | def addChild(self, dep, child): 33 | if dep not in self._children: 34 | self._children[dep] = SortedSet() 35 | 36 | self._children[dep].add(child) 37 | 38 | return None 39 | 40 | def getId(self): 41 | return self._id 42 | 43 | def getToken(self): 44 | return self._tkn 45 | 46 | def getChildren(self): 47 | return self._children 48 | 49 | def compareTo(self, z): 50 | if not isinstance(z, TreeNode): 51 | raise ValueError 52 | 53 | return self._tkn.compareTo(z._tkn) 54 | 55 | def toString(self): 56 | return self._tkn.toString() 57 | 58 | def getTreeNode(tree_node_id): 59 | return TreeNode.id_treeNodes[tree_node_id] 60 | 61 | def getTreeStr(self): 62 | id_str = SortedDict() 63 | 64 | if (len(self._children) > 0): 65 | for dep, nodes in 
self._children.items(): 66 | s = '' 67 | 68 | for node in nodes: 69 | if dep.startswith('prep_') or dep.startswith('conj_'): 70 | s = dep[5:] + ' ' 71 | s = s + node.getTreeStr() 72 | id_str[node.getId()] = s 73 | 74 | id_str[self._id] = self._tkn.getLemma() 75 | result = ' '.join(id_str.values()) 76 | 77 | return result 78 | 79 | 80 | -------------------------------------------------------------------------------- /pymln/syntax/Nodes/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from multivac.pymln.syntax.Nodes.Article import Article 4 | from multivac.pymln.syntax.Nodes.Sentence import Sentence 5 | from multivac.pymln.syntax.Nodes.Token import Token 6 | from multivac.pymln.syntax.Nodes.TreeNode import TreeNode 7 | 8 | -------------------------------------------------------------------------------- /pymln/syntax/Relations/ArgType.py: -------------------------------------------------------------------------------- 1 | 2 | from multivac.pymln.syntax.Relations import RelType 3 | 4 | class ArgType(object): 5 | argTypes = [] 6 | # Dictionary mapping {str: int} 7 | argTypeStr_idx = {} 8 | 9 | def __init__(self, target): 10 | s = target.toString() 11 | self._dep = target.getDep() 12 | self._dep2 = target.getDep2() 13 | self._str = None 14 | 15 | if target.getTreeRoot() is not None: 16 | self._relTypeIdx = RelType.getRelType(target.getTreeRoot()) 17 | else: 18 | self._relTypeIdx = -1 19 | 20 | self._str = self.toString() 21 | ArgType.argTypes.append(self) 22 | i = len(ArgType.argTypes) - 1 23 | ArgType.argTypeStr_idx[s] = i 24 | 25 | def __hash__(self): 26 | return hash(self.toString()) 27 | 28 | def __eq__(self, other): 29 | return self.compareTo(other) == 0 30 | 31 | def __str__(self): 32 | return self.toString() 33 | 34 | def __repr__(self): 35 | return self.toString() 36 | 37 | def getArgType(target): 38 | if isinstance(target, int): 39 | return ArgType.argTypes[target] 40 | elif not isinstance(target, str): 41 | s = target.toString() 42 | 43 | if s not in ArgType.argTypeStr_idx: 44 | t = ArgType(target) 45 | 46 | return ArgType.argTypeStr_idx[s] 47 | 48 | def compareTo(self, z): 49 | if self._dep is None or z.GetDep() is None: 50 | return None 51 | 52 | this = sum([ord(x) for x in self._dep]) 53 | that = sum([ord(x) for x in z.getDep()]) 54 | result = this - that 55 | 56 | if result == 0: 57 | result = self._relTypeIdx - z._relTypeIdx 58 | 59 | if result == 0: 60 | if self._dep2 is not None: 61 | this = sum([ord(x) for x in self._dep2]) 62 | 63 | try: 64 | that = sum([ord(x) for x in z.getDep2()]) 65 | except TypeError: 66 | result = -1 67 | else: 68 | result = this - that 69 | 70 | return result 71 | 72 | def toString(self): 73 | if self._str is None: 74 | self._str = '<' + self._dep 75 | 76 | if self._relTypeIdx >= 0: 77 | self._str += ':{}:{}'.format(self._relTypeIdx,self._dep2) 78 | 79 | self._str += '>' 80 | 81 | return self._str 82 | 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /pymln/syntax/Relations/Path.py: -------------------------------------------------------------------------------- 1 | 2 | from multivac.pymln.syntax.Relations import RelType, ArgType 3 | 4 | class Path(object): 5 | def __init__(self, dep, treeRoot=None, argNode=None, dep2=None): 6 | self._dep = dep 7 | self._treeRoot = treeRoot 8 | self._argNode = argNode 9 | self._dep2 = dep2 10 | self._str = None 11 | 12 | self._argTypeIdx = ArgType.getArgType(self) 13 | 14 | def __str__(self): 15 | return 
self.toString() 16 | 17 | def __repr__(self): 18 | return self.toString() 19 | 20 | def getDep(self): 21 | return self._dep 22 | 23 | def getTreeRoot(self): 24 | return self._treeRoot 25 | 26 | def getArgNode(self): 27 | return self._argNode 28 | 29 | def getDep2(self): 30 | return self._dep2 31 | 32 | def getArgType(self): 33 | return self._argTypeIdx 34 | 35 | def toString(self): 36 | if self._str is None: 37 | self._str = self.genTypeStr() 38 | 39 | return self._str 40 | 41 | def genTypeStr(self): 42 | typ_str = '<' + self._dep 43 | 44 | if self._treeRoot is not None: 45 | rel_str = RelType.genTypeStr(self._treeRoot) 46 | typ_str += ':' + rel_str + ':' + self._dep2 47 | 48 | typ_str += '>' 49 | 50 | return typ_str 51 | 52 | -------------------------------------------------------------------------------- /pymln/syntax/Relations/RelType.py: -------------------------------------------------------------------------------- 1 | 2 | from multivac.pymln.syntax.Nodes import Token, TreeNode 3 | 4 | class RelType(object): 5 | relTypes = [] 6 | # Dictionary mapping {str: int} tracking RelType strings and 7 | # their unique indices. 8 | relTypeStr_idx = {} 9 | 10 | def __init__(self, target): 11 | self._str = RelType.genTypeStr(target) 12 | 13 | if Token.isContent(target._tkn): 14 | self._type = 'C' 15 | else: 16 | self._type = 'N' 17 | 18 | RelType.relTypeStr_idx[self._str] = len(RelType.relTypes) 19 | RelType.relTypes.append(self) 20 | 21 | def __hash__(self): 22 | return hash(self.toString()) 23 | 24 | def __eq__(self, other): 25 | return self.compareTo(other) == 0 26 | 27 | def getType(self): 28 | return self._type 29 | 30 | def getRelType(target): 31 | if target is None: 32 | result = None 33 | elif isinstance(target,int): 34 | result = RelType.relTypes[target] 35 | else: 36 | type_str = RelType.genTypeStr(target) 37 | 38 | if type_str not in RelType.relTypeStr_idx: 39 | t = RelType(target) 40 | 41 | result = RelType.relTypeStr_idx[type_str] 42 | 43 | return result 44 | 45 | def genTypeStr(tn): 46 | type_str = '(' 47 | type_str += tn.toString() 48 | children = tn.getChildren() 49 | 50 | if len(children) > 0: 51 | for child in children: 52 | type_str += ' (' + child 53 | tree_nodes = children[child] 54 | 55 | for node in tree_nodes: 56 | type_str += ' ' + RelType.genTypeStr(node) 57 | 58 | type_str += ')' 59 | 60 | type_str += ')' 61 | 62 | return type_str 63 | 64 | def compareTo(self, z): 65 | this = sum([ord(x) for x in self._str]) 66 | that = sum([ord(x) for x in z.toString()]) 67 | result = this - that 68 | 69 | return result 70 | 71 | def toString(self): 72 | return self._str 73 | 74 | -------------------------------------------------------------------------------- /pymln/syntax/Relations/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from . ArgType import ArgType 4 | from . Path import Path 5 | from . 
RelType import RelType 6 | 7 | -------------------------------------------------------------------------------- /pymln/syntax/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from multivac.pymln.syntax.Nodes import Article, Sentence, Token 3 | from multivac.pymln.syntax.Relations import ArgType, Path, RelType 4 | from multivac.pymln.syntax import StanfordParseReader 5 | 6 | -------------------------------------------------------------------------------- /pymln/utils/Utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 4 | # Utility functions for pymln parsing 5 | # 6 | 7 | import math 8 | 9 | def inc_key(d, key, inc=1): 10 | if key not in d: 11 | d[key] = inc 12 | else: 13 | d[key] += inc 14 | 15 | return d 16 | 17 | def dec_key(d, key, base=None, dec=1, remove=False): 18 | if key not in d: 19 | if base is None: 20 | d = None 21 | else: 22 | d[key] = base - dec 23 | else: 24 | d[key] -= dec 25 | 26 | if remove and d[key] <= 0: 27 | del d[key] 28 | 29 | return d 30 | 31 | 32 | def genTreeNodeID(aid, sid, wid): 33 | node_id = '{0}:{1}:{2:03d}'.format(aid, sid, wid) 34 | 35 | return node_id 36 | 37 | 38 | class java_iter(object): 39 | def __init__(self, it): 40 | self.it = iter(it) 41 | self._hasnext = None 42 | 43 | def __iter__(self): return self 44 | 45 | def next(self): 46 | if self._hasnext: 47 | result = self._thenext 48 | else: 49 | result = next(self.it) 50 | self._hasnext = None 51 | 52 | return result 53 | 54 | def hasnext(self): 55 | if self._hasnext is None: 56 | try: 57 | self._thenext = next(self.it) 58 | except StopIteration: 59 | self._hasnext = False 60 | else: 61 | self._hasnext = True 62 | 63 | return self._hasnext 64 | 65 | 66 | def compareStr(s, t): 67 | # compare each character until there's a difference!!! 
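# Note: as written, this orders strings by the sum of their character
# ordinals rather than by a character-by-character comparison, so two
# different strings can compare as equal. Token.compareTo() and
# RelType.compareTo() use the same scheme.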
68 | this = sum([ord(x) for x in s]) 69 | that = sum([ord(x) for x in t]) 70 | result = this - that 71 | 72 | return result 73 | 74 | 75 | def xlogx(x): 76 | if x <= 0: 77 | result = 0 78 | else: 79 | result = x * math.log(x) 80 | 81 | return result 82 | 83 | 84 | -------------------------------------------------------------------------------- /pymln/utils/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from multivac.pymln.utils.Utils import inc_key, dec_key, compareStr 3 | from multivac.pymln.utils.Utils import java_iter, genTreeNodeID, xlogx 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | stanfordnlp==0.2.0 2 | networkx==2.3 3 | numpy==1.16.2 4 | matplotlib==3.1.1 5 | nltk==3.4.5 6 | Flask==1.1.1 7 | scipy==1.2.1 8 | -e git://github.com/pgr-gallup/slate.git#egg=slate==0.5.2 9 | interruptingcow==0.8 10 | tqdm==4.34.0 11 | torch==1.2.0 12 | pandas==0.24.2 13 | tensorflow==1.15.2 14 | Unidecode==1.0.23 15 | fastcluster==1.1.25 16 | git+https://github.com/titipata/pubmed_parser.git 17 | stanford_corenlp==3.9.2 18 | requests==2.21.0 19 | py2neo==4.3.0 20 | sympy==1.3 21 | sortedcontainers==2.1.0 22 | spacy==2.1.3 23 | feedparser==5.2.1 24 | beautifulsoup4==4.8.1 25 | python-dotenv==0.10.3 26 | scikit_learn==0.21.3 27 | 28 | -------------------------------------------------------------------------------- /settings.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import os 3 | from pathlib import Path 4 | 5 | from dotenv import load_dotenv 6 | from multivac.src import utilities 7 | 8 | cfg = configparser.ConfigParser() 9 | cfgDIR = Path(__file__).resolve().parent 10 | 11 | try: 12 | cfg.read(cfgDIR / config_file_name) 13 | except NameError: 14 | cfg.read(cfgDIR / 'multivac.cfg') 15 | 16 | root_dir = cfg['PATHS'].get('root_dir', cfgDIR) 17 | qgnet_dir = cfg['PATHS'].get('qgnet_dir', root_dir / '..' 
/ 'qgnet') 18 | 19 | sys_dir = cfg['PATHS'].get('sys_dir', root_dir/'sys') 20 | data_dir = cfg['PATHS'].get('data_dir', sys_dir/'data') 21 | raw_dir = cfg['PATHS'].get('raw_dir', data_dir/'raw') 22 | interim_dir = cfg['PATHS'].get('interim_dir', data_dir/'interim') 23 | processed_dir = cfg['PATHS'].get('processed_dir', data_dir/'processed') 24 | metadata_dir = cfg['PATHS'].get('metadata_dir', processed_dir/'metadata') 25 | models_dir = cfg['PATHS'].get('models_dir', sys_dir/'models') 26 | stanf_nlp_dir = cfg['PATHS'].get('stanf_nlp_dir', 27 | root_dir/'stanford_nlp_models') 28 | mln_dir = cfg['PATHS'].get('mln_dir', root_dir/'mln_models') 29 | 30 | # Get search and filter settings; default to empty lists 31 | terms = eval(cfg['SEARCH'].get('terms', '[]')) 32 | sources = eval(cfg['SEARCH'].get('sources', '[]')) 33 | arxiv_drops = eval(cfg['FILTER'].get('drops', '[]')) 34 | 35 | # make data directories if they don't already exist 36 | dirs = [ 37 | data_dir, 38 | raw_dir, 39 | interim_dir, 40 | processed_dir, 41 | metadata_dir, 42 | models_dir, 43 | stanf_nlp_dir, 44 | mln_dir, 45 | ] 46 | dirs += [raw_dir / x for x in sources] 47 | for _dir in dirs: 48 | utilities.mkdir(_dir) 49 | -------------------------------------------------------------------------------- /src/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/src/.gitkeep -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/src/__init__.py -------------------------------------------------------------------------------- /src/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/src/data/.gitkeep -------------------------------------------------------------------------------- /src/data/clean_text.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import json 4 | import os 5 | 6 | import bs4 7 | 8 | 9 | def get_abstract(soup): 10 | 11 | abstract_element = soup.find('abstract').find('p') 12 | abstract_text = abstract_element.text if abstract_element else '**NONE**' 13 | 14 | # needs to be smarter 15 | if 'Abstract' in abstract_text: 16 | abstract_text = abstract_text.replace('Abstract', '').strip() 17 | if abstract_text.startswith('.'): 18 | abstract_text = abstract_text[1:].strip() 19 | 20 | return abstract_text 21 | 22 | 23 | def get_authors(soup): 24 | 25 | author_elements = soup.find_all('author') 26 | 27 | authors = [] 28 | for author in author_elements: 29 | firstname = author.find('forename') 30 | lastname = author.find('surname') 31 | 32 | first = firstname.text if firstname else '**NONE**' 33 | last = lastname.text if lastname else '**NONE**' 34 | out = f'{first} {last}' 35 | 36 | # needs to be smarter 37 | if out.startswith('&'): 38 | out = out.replace('&', '').strip() 39 | 40 | authors.append(out) 41 | 42 | authors_list = list(set(authors)) 43 | 44 | return authors_list 45 | 46 | 47 | def get_content(soup): 48 | 49 | paragraph_elements = soup.find_all('p') 50 | paragraphs_list = [e.text for e in paragraph_elements] 51 | # potentially more cleaning here 52 | 53 | 
return paragraphs_list 54 | 55 | 56 | def get_references(soup): 57 | 58 | reference_elements = soup.find_all('ref') 59 | references_list = [e.text for e in reference_elements] 60 | # potentially more cleaning here 61 | 62 | return references_list 63 | 64 | 65 | def get_formulas(soup): 66 | 67 | formula_elements = soup.find_all('formula') 68 | formulas_list = [e.text for e in formula_elements] 69 | # potentially more cleaning here 70 | 71 | return formulas_list 72 | 73 | 74 | def get_title(soup): 75 | 76 | title_element = soup.find('titleStmt') 77 | title = title_element.text.strip('\n') 78 | 79 | return title 80 | 81 | 82 | def run(args_dict): 83 | 84 | indir = os.path.abspath(args_dict['indir']) 85 | 86 | # get all files in specified directory 87 | files = [x for x in os.walk(indir)][0][2] 88 | 89 | # temporary placeholder for all data 90 | complete_list = [] 91 | for f in files: 92 | 93 | # full path to input file 94 | fin = f'{indir}/{f}' 95 | 96 | # only operate on proper files from extract_text module 97 | if fin.endswith('.tei.xml'): 98 | 99 | tmpf = open(fin, 'r') 100 | content = tmpf.read() 101 | tmpf.close() 102 | 103 | soup = bs4.BeautifulSoup(content, 'xml') 104 | 105 | # gather all parsed data 106 | abstract = get_abstract(soup) 107 | authors = get_authors(soup) 108 | references = get_references(soup) 109 | formulas = get_formulas(soup) 110 | title = get_title(soup) 111 | 112 | # comes in as list, combine to full text 113 | tmp_content = get_content(soup) 114 | content = ' '.join(tmp_content) 115 | 116 | # cleaning fluff from main content 117 | for ref in references: 118 | content = content.replace(ref, '') 119 | for frm in formulas: 120 | content = content.replace(frm, '') 121 | for atr in authors: 122 | content = content.replace(atr, '') 123 | content = content.replace(abstract, '') 124 | 125 | structure = { 126 | f: { 127 | 'meta': { 128 | 'abstract': abstract, 129 | 'authors': authors, 130 | 'title': title 131 | }, 132 | 'text': content 133 | } 134 | } 135 | 136 | complete_list.append(structure) 137 | 138 | else: 139 | 140 | pass 141 | 142 | # file outpu handling 143 | outdir = os.path.abspath(args_dict['outdir']) 144 | stamp = datetime.datetime.now().strftime('%Y%M%d_%H%M%S') 145 | fname = f'output_{stamp}.json' 146 | fout = f'{outdir}/{fname}' 147 | 148 | f = open(fout, 'w') 149 | json.dump(complete_list, f) 150 | f.close() 151 | 152 | 153 | if __name__ == "__main__": 154 | parser = argparse.ArgumentParser( 155 | description="Parser for XMLized scholarly publications." 156 | ) 157 | parser.add_argument( 158 | "--indir", 159 | required=True, 160 | help="Path to the directory containing XMLs to process." 161 | ) 162 | parser.add_argument( 163 | "--outdir", 164 | required=True, 165 | help="Path to output directory for processed files." 
166 | ) 167 | 168 | args_dict = vars(parser.parse_args()) 169 | 170 | run(args_dict) 171 | -------------------------------------------------------------------------------- /src/data/glove.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import json 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from scipy.stats import zscore 8 | from sklearn.cross_decomposition import CCA 9 | from unidecode import unidecode 10 | 11 | from multivac import settings 12 | from rpy2.robjects import numpy2ri, pandas2ri, r 13 | 14 | 15 | def domain_adapted_CCA(DG_embed, DS_embed, NC=100): 16 | # calculate the z-score 17 | DG_embed_norm = zscore(DG_embed) 18 | DS_embed_norm = zscore(DS_embed) 19 | 20 | # Initialize CCA Model 21 | cca = CCA(n_components=NC) 22 | cca.fit(DG_embed_norm, DS_embed_norm) 23 | 24 | DA_embeddings = (cca.x_scores_ + cca.y_scores_)/2 25 | 26 | return cca, DA_embeddings 27 | 28 | 29 | def glove_main(): 30 | # Load data from nlp parsing 31 | with open('{}/articles-with-equations.json'.format(settings.data_dir), 'r', 32 | encoding='utf-8') as jf: 33 | src_data = json.load(jf) 34 | 35 | texts = [src_data[art]['text'] for art in src_data if 36 | src_data[art]['text'] is not None] 37 | 38 | # The "unidecode" step simplifies non-ASCII chars which 39 | # mess up the R GloVe engine. 40 | texts_df = pd.Series(texts).apply(lambda x: unidecode(x)) 41 | texts_df = pd.DataFrame({'text': texts_df}) 42 | 43 | # Source all the functions contained in the 'trainEmbeddings' R file 44 | r("source('{}/trainEmbeddings.R')".format('src/data')) 45 | 46 | # Call the main GloVe-embedding function from the R script 47 | trainEmbeddings_R = r("trainEmbeddings") 48 | 49 | # Train domain-specific GloVe embedding model and output as a Numpy Matrix 50 | pandas2ri.activate() 51 | DS_embeddings_R = trainEmbeddings_R(texts_df) 52 | pandas2ri.deactivate() 53 | 54 | DS_embeddings = numpy2ri.rpy2py(DS_embeddings_R[0]) 55 | 56 | # Get domain-specific GloVe vocabulary 57 | domain_spec_vocab = list(DS_embeddings_R[1]) 58 | 59 | # Load in Stanford's 'Common Crawl' domain-general Glove Embedding Model 60 | # Only pull out the words that are contained in our corpus 61 | # * This can take a while (~30min) - could use some optimization * 62 | DG_embeddings = loadGloveModel( 63 | '{}/glove.42B.300d.txt'.format(settings.data_dir), 64 | domain_spec_vocab 65 | ) 66 | 67 | # Processing to ensure rows match between the domain-general and 68 | # domain-specific embeddings 69 | # Get the domain-general vocabulary and convert the embedding dict to an array 70 | domain_gen_vocab = list(DG_embeddings.keys()) 71 | DG_embeddings_mat = np.array([DG_embeddings[w] for w in domain_gen_vocab]) 72 | 73 | # Find the indices of matching words 74 | both = set(domain_gen_vocab).intersection(domain_spec_vocab) 75 | indices_gen = [domain_gen_vocab.index(x) for x in both] 76 | indices_spec = [domain_spec_vocab.index(x) for x in both] 77 | indices_spec_notDG = [domain_spec_vocab.index(x) for x in 78 | domain_spec_vocab if x not in both] 79 | 80 | # Sort and subset domain-specific array to match indices of domain-general 81 | # array 82 | DS_embeddings_subset = DS_embeddings[indices_spec, :].copy() 83 | DG_embeddings_subset = DG_embeddings_mat[indices_gen, :].copy() 84 | 85 | # fit cca model 86 | cca_res, DA_embeddings = domain_adapted_CCA(DG_embeddings_subset, 87 | DS_embeddings_subset, NC=100) 88 | 89 | DS_embeddings_notinDG = DS_embeddings[indices_spec_notDG, :] 90 | DS_embeddings_notinDG_norm = zscore(DS_embeddings_notinDG)
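# Words that occur only in the domain-specific corpus have no CCA score, so
# project their z-scored domain-specific vectors through the fitted CCA
# y-weights to place them in the shared, domain-adapted space, then append
# them to the averaged CCA scores computed above.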
91 | 92 | DA_notinDG_embeddings = cca_res.y_weights_.T @ DS_embeddings_notinDG_norm.T 93 | DA_embeddings_final = np.append(DA_embeddings, DA_notinDG_embeddings.T, 94 | axis=0) 95 | 96 | # write data to disk 97 | np.savetxt('{}/da_embeddings.txt'.format(settings.models_dir), 98 | DA_embeddings_final, fmt='%f') 99 | 100 | 101 | def loadGloveModel(gloveFile, vocab): 102 | f = open(gloveFile, 'r') 103 | 104 | model = {} 105 | for line in f: 106 | splitLine = line.split() 107 | word = splitLine[0] 108 | if word in vocab: 109 | embedding = np.array([float(val) for val in splitLine[1:]]) 110 | model[word] = embedding 111 | 112 | return model 113 | 114 | 115 | if __name__ == '__main__': 116 | glove_main() 117 | -------------------------------------------------------------------------------- /src/data/make.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | from multivac.src.data.get import collect_get_main 4 | from multivac.src.data.process import collect_process_main 5 | 6 | 7 | def collect_main(): 8 | # query apis to obtain articles 9 | collect_get_main() 10 | 11 | # process article data for models 12 | collect_process_main() 13 | 14 | 15 | if __name__ == '__main__': 16 | collect_main() 17 | -------------------------------------------------------------------------------- /src/data/process.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import copy 4 | import json 5 | import os 6 | import pickle 7 | from collections import OrderedDict 8 | 9 | import pubmed_parser 10 | from bs4 import BeautifulSoup as bs 11 | 12 | import slate 13 | from multivac import settings 14 | from multivac.src import utilities 15 | 16 | 17 | def aggregate_pubmed(srcs, verbose=False): 18 | """Aggregate a set of Pubmed article text and metadata.""" 19 | pubmed_data = OrderedDict() 20 | pubmed_metadata = OrderedDict() 21 | for src in srcs: 22 | if verbose: 23 | print(src) 24 | try: 25 | temp = OrderedDict() 26 | metadata, text = parse_pubmed(str(src.absolute())) 27 | temp['metadata'] = metadata 28 | temp['metadata']['source'] = 'pubmed' 29 | temp['text'] = text 30 | try: 31 | k = metadata['doi'] 32 | except (KeyError, AttributeError): 33 | k = src.stem 34 | if len(text) > 0: 35 | pubmed_data[k] = temp 36 | pubmed_metadata[k] = metadata 37 | print(src) 38 | except Exception: 39 | if verbose: 40 | print('Error: %s' % src) 41 | pass 42 | dst = settings.metadata_dir / 'pubmed.pkl' 43 | with open(dst, 'wb') as f: 44 | pickle.dump(pubmed_metadata, f) 45 | return pubmed_data 46 | 47 | 48 | def collect_process_main(): 49 | output = {} 50 | for source in settings.sources: 51 | data_raw_dir = settings.raw_dir / source 52 | if source in ['arxiv', 'springer']: 53 | data = parse_articles_data(source, data_raw_dir) 54 | elif source == 'pubmed': 55 | srcs = [data_raw_dir / x for x in os.listdir(data_raw_dir)] 56 | data = aggregate_pubmed(srcs) 57 | if len(output) == 0: 58 | output = copy.deepcopy(data) 59 | else: 60 | output.update(data) 61 | arxiv_drops = [x.split()[0] for x in settings.arxiv_drops] 62 | filtered_output = filter_arxiv(output, arxiv_drops) 63 | save_outputs(filtered_output) 64 | return True 65 | 66 | 67 | def filter_arxiv(output, arxiv_drops): 68 | filtered_output = OrderedDict() 69 | for k, v in output.items(): 70 | if v['metadata']['source'] == 'arxiv': 71 | for term in v['metadata']['tags']: 72 | if term['term'] not in arxiv_drops: 73 |
filtered_output[copy.deepcopy(k)] = copy.deepcopy(v) 74 | else: 75 | filtered_output[copy.deepcopy(k)] = copy.deepcopy(v) 76 | return filtered_output 77 | 78 | 79 | def parse_articles_data(source, data_raw_dir, verbose=False): 80 | """Parse Arxiv and Springer article data.""" 81 | # load metadata 82 | fn = source + '.pkl' 83 | metadata_src = settings.metadata_dir / fn 84 | with open(metadata_src, 'rb') as f: 85 | metadata_ = pickle.load(f) 86 | 87 | # we'll just add the text to a new arxiv object, an ordered dict keyed on 88 | # doi or other id 89 | data = OrderedDict() 90 | for ix, article_metadata in enumerate(metadata_): 91 | 92 | # initialize temp dictionary 93 | temp = OrderedDict() 94 | temp['metadata'] = copy.deepcopy(article_metadata) 95 | temp['metadata']['source'] = source 96 | article_fn = article_metadata['fn'] 97 | if verbose: 98 | print(article_fn) 99 | src = data_raw_dir / article_fn 100 | 101 | # define key and value 102 | if source == 'arxiv': 103 | k = article_metadata['fn'].strip('.pdf') 104 | temp['text'] = parse_pdf(src) 105 | elif source == 'springer': 106 | k = article_metadata['doi'] 107 | temp['text'] = parse_html(src) 108 | elif source == 'pubmed': 109 | raise ValueError('pubmed not supported. Only "arxiv" and "springer" supported. ' 110 | 'Try "parse_pubmed() function"') 111 | else: 112 | raise ValueError('Only "arxiv" and "springer" supported as sources.') 113 | 114 | # populate interim dictionary 115 | data[k] = temp 116 | 117 | # save intermediate outputs 118 | data_interim_dst = settings.interim_dir / fn 119 | with open(data_interim_dst, 'wb') as f: 120 | pickle.dump(data, f) 121 | return data 122 | 123 | 124 | def parse_html(src): 125 | """Parse research paper HTML and return text.""" 126 | with open(src, 'r', encoding='utf-8') as f: 127 | raw_data_ = f.read() 128 | soup = bs(raw_data_) 129 | try: 130 | text = ' '.join(soup.find('article').get_text().split()) 131 | except AttributeError: 132 | text = None 133 | return text 134 | 135 | 136 | def parse_pdf(src): 137 | """Parse research paper PDF and return text.""" 138 | try: 139 | # try to open file 140 | with open(src, 'rb') as f: 141 | doc = slate.PDF(f) 142 | 143 | # get text: strip out newlines and extra spaces 144 | doc = ' '.join([' '.join(x.split()) for x in doc]) 145 | text = (doc.split(' Abstract ')[-1] 146 | .split(' Acknowledgments ')[0] 147 | .split(' ∗ ∗ ∗ ')[0] 148 | .strip() 149 | ) 150 | 151 | except Exception: 152 | text = None 153 | 154 | return text 155 | 156 | 157 | def parse_pubmed(src): 158 | """Parse pubmed xml article data and return metadata and text.""" 159 | metadata = pubmed_parser.parse_pubmed_xml(src) 160 | text = pubmed_parser.parse_pubmed_paragraph(src, all_paragraph=True) 161 | text = ' '.join(' '.join([x['text'] for x in text]).split()) 162 | return metadata, text 163 | 164 | 165 | def save_outputs(output, dst_dir=None, fn_prefix=None): 166 | if dst_dir is None: 167 | dst_dir = settings.processed_dir / 'data' 168 | utilities.mkdir(dst_dir) 169 | fn = 'data.json' 170 | if fn_prefix is not None: 171 | fn = fn_prefix + '_' + fn 172 | dst = dst_dir / fn 173 | with open(dst, 'wb') as f: 174 | json.dump(output, f) 175 | 176 | 177 | if __name__ == '__main__': 178 | collect_process_main() 179 | -------------------------------------------------------------------------------- /src/data/qgnet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import argparse 4 | import os 5 | import subprocess 6 
| 7 | import corenlp 8 | 9 | from multivac import settings 10 | from multivac.src.data.parsing import load_data 11 | from qgnet.test.testinput.preprocessing_pdf import (create_tf_idf, 12 | preprocess_pdf) 13 | 14 | os.environ["CORENLP_HOME"] = ('{}/stanford-corenlp-full-2018-10-05' 15 | .format(settings.models_dir)) 16 | 17 | 18 | def qgnet_main(args_dict): 19 | # first, run shell script, if necessary, in qgnet to create model 20 | subprocess.call([ 21 | '../{}/download_QG-Net.sh'.format(settings.qgnet_dir), 22 | args_dict['qgnet_path'] 23 | ]) 24 | 25 | # second, pre-process the pdfs 26 | jsonObj, allDocs = load_data('{}/da_embeddings.txt' 27 | .format(settings.models_dir)) 28 | abstracts = [] 29 | for value in jsonObj.values(): 30 | if "summary" in value['metadata']: 31 | abstracts.append(value['metadata']["summary"]) 32 | elif "abstract" in value['metadata']: 33 | abstracts.append(value['metadata']["abstract"]) 34 | 35 | nlp = corenlp.CoreNLPClient(output_format='json', properties={ 36 | 'timeout': '50000'}) 37 | 38 | features, tfidf = create_tf_idf(abstracts, False) 39 | 40 | for i, abstract in enumerate(abstracts): 41 | preprocess_pdf(abstract, features[i, :].toarray(), tfidf, nlp) 42 | 43 | # third, generate qg-net questions 44 | subprocess.call([ 45 | '../{}/qg_reproduce_LS.sh'.format(settings.qgnet_dir), 46 | args_dict['qgnet_path'], 47 | settings.models_dir 48 | ]) 49 | 50 | 51 | if __name__ == '__main__': 52 | 53 | parser = argparse.ArgumentParser( 54 | description="Parser for QGNet." 55 | ) 56 | parser.add_argument( 57 | "--qgnet_path", 58 | required=True, 59 | help="Path to QGNet questions." 60 | ) 61 | 62 | args_dict = vars(parser.parse_args()) 63 | 64 | qgnet_main(args_dict) 65 | -------------------------------------------------------------------------------- /src/data/textparsing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import re as reg 4 | 5 | 6 | def clean_doc(doc, spacynlp): 7 | ''' 8 | Clean individual documents and remove citations, URLs, emails, other 9 | trivial content. Returns cleaned doc 10 | ''' 11 | # Regex for cleaning 12 | re_citationsNumeric = reg.compile(r'(\[\d+)(,\s*\d+)*]') 13 | re_url = reg.compile(r'((http|ftp|https):\/\/)?[-a-zA-Z0-9@:%._\+~#=]"' 14 | r'{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)') 15 | re_intextcite = reg.compile(r"((?:[A-Za-z][A-Za-z'`-éü-]+)(?:,? (?:(?:and |& )" 16 | r"?(?:[A-Za-z][A-Za-z'`-éü-]+)|(?:et al.?)))*(?:,* " 17 | r"*((?:19|20)[0-9][0-9][a-z]*)(\s*&\s*[0-9]*[a-z]*)" 18 | r"*(, (\d+))*(?:, p.? [0-9]+)?| *\\((?:19|20)[0-9]" 19 | r"[0-9][a-z](\s*&)(?:, p.? 
[0-9]+)?\\)))") 20 | 21 | re_emptyCite = reg.compile(r"\(([\s]*[;]+[\s]*)+\)") 22 | re_emptyEg = reg.compile(r'\(e.g.[\s*;\s*]*[,]*\s*\)') 23 | re_clickHere = reg.compile(r'Click here[^.]*\.') 24 | re_cid = reg.compile(r"\(cid:\d+\)") 25 | re_email = reg.compile(r"[\w.-]+@[\w.-]+") 26 | re_emptyParens = reg.compile(r"\(\s*\)") 27 | re_emptySee = reg.compile(r"\(see(\s)*\)") 28 | re_sponsors = reg.compile(r'(This work was supported).+') 29 | re_arxivHeader = reg.compile(r"(a r X i v).*?(?=[a-zA-Z]{2,})") 30 | re_vixraHeader = reg.compile(r"^(\s?.?\s)+(v i X r a)") 31 | re_hyphenatedWords = reg.compile(r'\S(?=\S*[-]\s)([a-zA-Z-]+)(\s)[A-za-z]+') 32 | 33 | # Actual cleaning 34 | doc = reg.sub(re_cid, ' ', doc) 35 | doc = reg.sub(re_citationsNumeric, ' NumericCitation ', doc) 36 | doc = reg.sub(re_url, ' ', doc) 37 | doc = reg.sub(re_intextcite, ' Citation ', doc) 38 | doc = reg.sub(re_emptyCite, ' ', doc) 39 | doc = reg.sub(re_emptyEg, ' ', doc) 40 | doc = reg.sub(re_clickHere, ' ', doc) 41 | doc = reg.sub(re_email, ' ', doc) 42 | doc = reg.sub(re_emptyParens, ' ', doc) 43 | doc = reg.sub(re_emptySee, ' ', doc) 44 | doc = reg.sub(re_arxivHeader, ' ', doc) 45 | doc = reg.sub(re_vixraHeader, ' ', doc) 46 | 47 | # This work supported by --> all the way to end of document 48 | # Only remove this when it appears in the second half of the article 49 | for m in reg.finditer(re_sponsors, doc): 50 | if m.start() > (len(doc)/2): 51 | doc = reg.sub(re_sponsors, ' ', doc) 52 | 53 | # Handling hyphens - 2-28-2018 54 | for m in reg.finditer(re_hyphenatedWords, doc): 55 | match = m.group(0) 56 | 57 | mergedWord = match.replace(' ', '').replace('-', '') 58 | if mergedWord in spacynlp.vocab: 59 | 60 | doc = doc.replace(match, mergedWord) 61 | else: 62 | allWords = True 63 | for i in match.replace(' ', '').split('-'): 64 | allWords = allWords and (i in spacynlp.vocab) 65 | if allWords: 66 | doc = doc.replace(match, (match.replace(' ', ''))) 67 | else: 68 | doc = doc.replace(match, mergedWord) 69 | 70 | # De-dup for PUBMED articles, where the main text is sometimes duplicated 71 | sliceText = doc[0:500] 72 | count = doc.count(sliceText) 73 | 74 | if count > 1: 75 | posDup = doc.find(sliceText, 1) 76 | doc = doc[0:posDup-1] 77 | 78 | return doc 79 | -------------------------------------------------------------------------------- /src/data/trainEmbeddings.R: -------------------------------------------------------------------------------- 1 | pks = c( 2 | 'data.table', 3 | 'dplyr', 4 | 'text2vec', 5 | 'Rtsne', 6 | 'quanteda', 7 | 'doParallel', 8 | 'foreach' 9 | ) 10 | 11 | # Takes a list or vector of package names and loads them, installing 12 | # first if they are not already installed. 
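# e.g. getPackages(c('text2vec', 'quanteda')) attaches both packages,
# installing any that are missing; the call at the bottom of this file loads
# everything listed in 'pks'.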
13 | getPackages <- function(list.of.packages) { 14 | new.packages <- list.of.packages[!( 15 | list.of.packages %in% installed.packages()[,"Package"] 16 | )] 17 | 18 | if(length(new.packages)) install.packages(new.packages) 19 | lapply(list.of.packages,require,character.only=T) 20 | } 21 | 22 | # Fits GloVe embeddings model on data 23 | trainEmbeddings <- function(docs, 24 | term_count_min=5L, 25 | skip_grams_window=10L, 26 | word_vectors_size=300, 27 | x_max=100, 28 | n_iter=100, 29 | convergence_tol=0.01, 30 | learning_rate=0.05, 31 | verbose=FALSE) { 32 | toks <- tokens(tolower(docs)) 33 | feats <- dfm(toks, verbose=verbose) %>% 34 | dfm_trim(min_termfreq=term_count_min) %>% 35 | featnames() 36 | toks <- tokens_select(toks, feats, 37 | selection='keep', 38 | valuetype='fixed', 39 | padding=TRUE, 40 | case_insensitive=FALSE, 41 | verbose=TRUE) 42 | my_fcm <- fcm(toks, 43 | context="window", 44 | window=skip_grams_window, 45 | count="weighted", 46 | weights=1/(1:skip_grams_window), 47 | tri=TRUE) 48 | 49 | glove <- GlobalVectors$new(word_vectors_size=word_vectors_size, 50 | vocabulary=featnames(my_fcm), 51 | x_max=x_max, 52 | learning_rate=learning_rate) 53 | 54 | if(verbose) print('Fitting GloVe model...') 55 | 56 | wv_main = glove$fit_transform(my_fcm, 57 | n_iter=n_iter, 58 | convergence_tol=convergence_tol) 59 | 60 | if(verbose) print('Done.') 61 | 62 | # Combine context and target word vectors in the same manner as 63 | # original GloVe research 64 | word_vectors = wv_main + t(glove$components) 65 | 66 | results = list(word_vectors,feats) 67 | return(results) 68 | } 69 | 70 | getPackages(pks) 71 | -------------------------------------------------------------------------------- /src/gan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/src/gan/__init__.py -------------------------------------------------------------------------------- /src/gan/config.cfg: -------------------------------------------------------------------------------- 1 | [GAN] 2 | ROLLOUT_NUM = 2 3 | 4 | G_STEPS = 1 5 | D_STEPS = 1 6 | K_STEPS = 4 7 | 8 | SEED = 12 9 | TOTAL_EPOCHS = 200 10 | GENERATED_NUM = 100 11 | 12 | kg_directory = ../../data 13 | glove_dir = ../../models 14 | glove_file = DA_glove_embeddings_300.pkl 15 | glove_lower = True 16 | verbose = True 17 | 18 | [GENERATOR] 19 | verbose = True 20 | annot_file = data/query_annots.txt 21 | texts_file = data/query_texts.txt 22 | sample_dir = data/samples 23 | output_dir = ../../models 24 | grammar = None 25 | cuda = False 26 | 27 | uniform_init = None 28 | glorot_init = False 29 | kaiming_init = True 30 | 31 | #### Model configuration #### 32 | batch_size = 32 33 | dropout = 0. 34 | word_dropout = 0. 
35 | primitive_token_label_smoothing = 0.1 36 | lstm = lstm 37 | encoder = lstm 38 | 39 | # Embedding sizes 40 | embed_size = 128 41 | action_embed_size = 128 42 | field_embed_size = 64 43 | type_embed_size = 64 44 | 45 | # Hidden sizes 46 | hidden_size = 256 47 | att_vec_size = 256 48 | 49 | # readout layer 50 | no_query_vec_to_action_map = False 51 | readout = non_linear 52 | query_vec_to_action_diff_map = False 53 | 54 | # supervised attention 55 | sup_attention = False 56 | 57 | # parent information switch for decoder LSTM 58 | no_parent_production_embed = False 59 | no_parent_field_embed = False 60 | no_parent_field_type_embed = False 61 | no_parent_state = False 62 | 63 | no_input_feed = False 64 | no_copy = False 65 | 66 | # training schedule details 67 | PRE_G_EPOCHS = 50 68 | optimizer = Adam 69 | lr = 0.0001 70 | lr_decay = 0. 71 | beta_1 = 0.5 72 | log_every = 10 73 | clip_grad = 5. 74 | 75 | #### decoding/validation/testing #### 76 | beam_size = 5 77 | decode_max_time_step = 100 78 | 79 | 80 | [DISCRIMINATOR] 81 | device = cpu 82 | cuda = False 83 | verbose = False 84 | data = discriminator/data/multivac 85 | label_smoothing = 0.9 86 | 87 | #### Model configuration #### 88 | vocab_size = 0 89 | num_epochs = 5 90 | filter_sizes = (10, 5, 4, 3) 91 | num_filters = 20 92 | hidden_dims = 10 93 | dropout_prob1 = 0.5 94 | dropout_prob2 = 0.8 95 | 96 | # training schedule details 97 | batch_size = 64 98 | optim = adam 99 | lr = 0.0004 100 | beta_1 = 0.5 101 | wd = 0. 102 | -------------------------------------------------------------------------------- /src/gan/discriminator/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataset import MULTIVACDataset 2 | from .model import QueryGAN_Discriminator_CNN 3 | from .trainer import Trainer 4 | from .tree import Tree 5 | 6 | __all__ = [MULTIVACDataset, QueryGAN_Discriminator_CNN, Trainer, Tree] 7 | -------------------------------------------------------------------------------- /src/gan/discriminator/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | from copy import deepcopy 3 | 4 | import torch 5 | import torch.utils.data as data 6 | from multivac.src.gan.discriminator.tree import Tree 7 | from tqdm import tqdm 8 | 9 | 10 | # Dataset class for MULTIVAC dataset 11 | class MULTIVACDataset(data.Dataset): 12 | 13 | def __init__(self, path, vocab): 14 | super().__init__() 15 | self.vocab = vocab 16 | self.sentences = self.read_sentences(os.path.join(path, 'text.toks')) 17 | self.labels = MULTIVACDataset.read_labels(os.path.join(path, 'cat.txt')) 18 | self.size = self.labels.size(0) 19 | 20 | def __len__(self): 21 | return self.size 22 | 23 | def __getitem__(self, index): 24 | sent = deepcopy(self.sentences[index]) 25 | label = deepcopy(self.labels[index]) 26 | return (sent, label) 27 | 28 | def read_sentences(self, filename): 29 | with open(filename, 'r') as f: 30 | sentences = [self.read_sentence(line) for line in tqdm(f.readlines())] 31 | 32 | return sentences 33 | 34 | def read_sentence(self, line): 35 | indices = self.vocab.convertToIdx(line.split()) 36 | result = torch.tensor(indices, dtype=torch.long, device='cpu') 37 | 38 | return result 39 | 40 | @staticmethod 41 | def read_trees(filename): 42 | with open(filename, 'r') as f: 43 | trees = [MULTIVACDataset.read_tree(line) for line in tqdm(f.readlines())] 44 | 45 | return trees 46 | 47 | @staticmethod 48 | def read_tree(line): 49 | if isinstance(line, list): 50 | parents = 
line 51 | else: 52 | parents = list(map(int, line.split())) 53 | 54 | trees = dict() 55 | root = None 56 | 57 | for i in range(1, len(parents) + 1): 58 | if i - 1 not in trees.keys() and parents[i - 1] != -1: 59 | idx = i 60 | prev = None 61 | 62 | while True: 63 | parent = parents[idx - 1] 64 | 65 | if parent == -1: 66 | break 67 | 68 | tree = Tree() 69 | 70 | if prev is not None: 71 | tree.add_child(prev) 72 | 73 | trees[idx - 1] = tree 74 | tree.idx = idx - 1 75 | 76 | if parent - 1 in trees.keys(): 77 | trees[parent - 1].add_child(tree) 78 | break 79 | elif parent == 0: 80 | root = tree 81 | break 82 | else: 83 | prev = tree 84 | idx = parent 85 | 86 | return root 87 | 88 | @staticmethod 89 | def read_labels(filename): 90 | with open(filename, 'r') as f: 91 | labels = list(map(float, f.readlines())) 92 | labels = torch.tensor(labels, dtype=torch.float, device='cpu') 93 | 94 | return labels 95 | -------------------------------------------------------------------------------- /src/gan/discriminator/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.utils.data import DataLoader 4 | from tqdm import tqdm 5 | 6 | 7 | class QueryGAN_Discriminator_CNN(nn.Module): 8 | 9 | def __init__(self, args, vocab, vectors, output_shape): 10 | super(QueryGAN_Discriminator_CNN, self).__init__() 11 | 12 | self.args = args 13 | self.filter_sizes = eval(self.args['filter_sizes']) 14 | self.num_filters = self.args['num_filters'] 15 | self.hidden_dims = self.args['hidden_dims'] 16 | self.dropout_prob1 = self.args['dropout_prob1'] 17 | self.dropout_prob2 = self.args['dropout_prob2'] 18 | self.num_classes = output_shape 19 | self.channels_out = sum([((150-(k-1))//2)*self.num_filters 20 | for k in self.filter_sizes]) 21 | self.vocab = vocab 22 | 23 | self.emb = nn.Embedding(vocab.size(), vectors.size(1)) 24 | emb = torch.zeros(vocab.size(), vectors.size(1), dtype=torch.float, 25 | device=args['device']) 26 | emb.normal_(0, 0.05) 27 | 28 | for word in vocab.labelToIdx.keys(): 29 | if vocab.getIndex(word) < vectors.size(0): 30 | emb[vocab.getIndex(word)] = vectors[vocab.getIndex(word)] 31 | else: 32 | emb[vocab.getIndex(word)].zero_() 33 | 34 | self.emb.weight.data.copy_(emb) 35 | del emb 36 | 37 | self.emb.weight.requires_grad = False 38 | self.dropout1 = nn.Dropout(self.dropout_prob1) 39 | 40 | self.vocab_size = len(vocab) 41 | self.batchsize = self.args['batch_size'] 42 | self.num_epochs = self.args['num_epochs'] 43 | 44 | self.conv_blocks = nn.ModuleList( 45 | [nn.Sequential( 46 | nn.Conv1d(in_channels=vectors.shape[1], 47 | out_channels=self.num_filters, 48 | kernel_size=sz, 49 | stride=1, 50 | padding=0), 51 | nn.LeakyReLU(negative_slope=0.2), 52 | nn.BatchNorm1d(self.num_filters), 53 | nn.MaxPool1d(kernel_size=2), 54 | nn.Flatten()) for sz in self.filter_sizes] 55 | ) 56 | 57 | self.out = nn.Sequential( 58 | nn.Dropout(self.dropout_prob2), 59 | nn.Linear(self.channels_out, self.hidden_dims), 60 | nn.Linear(self.hidden_dims, self.num_classes) 61 | ) 62 | 63 | for block in self.conv_blocks: 64 | block.apply(self.init_weights) 65 | 66 | self.out.apply(self.init_weights) 67 | 68 | if self.args['optim'] == 'adam': 69 | self.optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, 70 | self.parameters()), 71 | betas = (self.args['beta_1'], 0.999), 72 | lr=self.args['lr'], 73 | weight_decay=self.args['wd']) 74 | elif self.args['optim'] == 'adagrad': 75 | self.optimizer = torch.optim.Adagrad(filter(lambda p: 
p.requires_grad, 76 | self.parameters()), 77 | lr=self.args['lr'], 78 | weight_decay=self.args['wd']) 79 | elif self.args['optim'] == 'sgd': 80 | self.optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, 81 | self.parameters()), 82 | lr=self.args['lr'], 83 | weight_decay=self.args['wd']) 84 | 85 | def init_weights(self, m): 86 | if type(m) in (nn.Linear, nn.Conv1d): 87 | nn.init.kaiming_uniform_(m.weight) 88 | 89 | if m.bias is not None: 90 | nn.init.constant_(m.bias, 0) 91 | 92 | def forward(self, verbatim_indices): 93 | embeddings = self.emb(verbatim_indices) 94 | embeddings = embeddings.permute(0, 2, 1) 95 | X = self.dropout1(embeddings) 96 | 97 | X = [conv(embeddings) for conv in self.conv_blocks] 98 | X_cat = torch.cat(X, 1) 99 | 100 | return self.out(X_cat) 101 | 102 | def predict(self, X): 103 | self.eval() 104 | 105 | with torch.no_grad(): 106 | yhat = self(X).softmax(dim=-1) 107 | 108 | scores, labels = yhat.topk(1, -1, True, True) 109 | return scores, labels 110 | 111 | def train_single_code(self, train): 112 | 113 | if self.args['label_smoothing']: 114 | criterion = SmoothedCrossEntropy(self.args['label_smoothing']) 115 | else: 116 | criterion = nn.CrossEntropyLoss() 117 | 118 | return self.trainer(train, criterion) 119 | 120 | def trainer(self, train, criterion): 121 | trainloader = DataLoader(train, batch_size=self.args['batch_size'], 122 | shuffle=True, num_workers=4) 123 | steps = len(trainloader) 124 | 125 | if self.args['device'] == 'cuda': 126 | self.cuda() 127 | self.optimizer.cuda() 128 | 129 | self.train() 130 | 131 | for i, (x, y) in enumerate(tqdm(trainloader)): 132 | verbs = x.to(self.args['device']) 133 | labels = y.to(self.args['device']) 134 | 135 | # Forward pass 136 | outputs = self(verbs) 137 | 138 | if not self.args['label_smoothing']: 139 | labels = labels.argmax(1) 140 | 141 | loss = criterion(outputs, labels) 142 | 143 | # Backward and optimize 144 | self.optimizer.zero_grad() 145 | loss.backward() 146 | self.optimizer.step() 147 | 148 | return loss.item() 149 | 150 | class SmoothedCrossEntropy(nn.Module): 151 | ''' 152 | Adapted from https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/train.py#L38 153 | ''' 154 | def __init__(self, smoothing): 155 | super(SmoothedCrossEntropy, self).__init__() 156 | 157 | self.smoothing = smoothing 158 | self.softmax = nn.LogSoftmax(dim=1) 159 | 160 | def forward(self, output, target): 161 | ''' 162 | output: Tensor of predictions for class labels of size 163 | batchsize * n_classes 164 | target: Onehot Tensor indicating actual class labels of size 165 | batchsize * n_classes 166 | ''' 167 | target = target * self.smoothing + (1 - target) * (1 - self.smoothing) 168 | return -(target * self.softmax(output)).mean() 169 | 170 | 171 | 172 | -------------------------------------------------------------------------------- /src/gan/discriminator/scripts/preprocess-multivac.py: -------------------------------------------------------------------------------- 1 | """ 2 | Preprocessing script for MULTIVAC data. 
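Illustrative data layout, inferred from split() and the __main__ block below
(file and column names come from this script, not from separate documentation):

    data/multivac/extracted_questions_labels.txt
        header line, then one example per line:
        id <TAB> sentence <TAB> category      (category is the 0/1 "real"/"fake" flag)

Running the script writes id.txt / text.txt / cat.txt, tokenizes text.txt into
text.toks with a Stanford parser, and builds vocab.txt and vocab-cased.txt for
the discriminator's MULTIVACDataset.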
3 | 4 | """ 5 | import argparse 6 | import glob 7 | import os 8 | import re 9 | 10 | from sklearn.model_selection import train_test_split 11 | from tqdm import tqdm 12 | 13 | from multivac.src.gan.utilities.utils import build_vocab 14 | from multivac.src.rdf_graph.rdf_parse import StanfordParser 15 | 16 | 17 | def dep_parse(filepath, parser): 18 | print('\nDependency parsing ' + filepath) 19 | dirpath = os.path.dirname(filepath) 20 | 21 | with open(filepath, 'r') as f: 22 | examples = f.readlines() 23 | 24 | with open(os.path.join(dirpath, 'text.toks'), 'w') as tokfile, \ 25 | open(os.path.join(dirpath, 'text.parents'), 'w') as parfile: 26 | 27 | for example in tqdm(examples): 28 | text = example.strip() 29 | 30 | if not text.endswith("?"): 31 | text = re.sub(r"\?", "", text) 32 | text += "?" 33 | 34 | sample_parse = parser.get_parse(text)['sentences'][0] 35 | tokens = [x['word'] for x in sample_parse['tokens']] 36 | deps = sorted(sample_parse['basicDependencies'], 37 | key=lambda x: x['dependent']) 38 | parents = [x['governor'] for x in deps] 39 | 40 | parfile.write(' '.join([str(x) for x in parents]) + '\n') 41 | tokfile.write(' '.join(tokens) + '\n') 42 | 43 | 44 | def gen_tokens(filepath, parser): 45 | print('\nTokenizing ' + filepath) 46 | dirpath = os.path.dirname(filepath) 47 | 48 | with open(filepath, 'r') as f: 49 | examples = f.readlines() 50 | 51 | with open(os.path.join(dirpath, 'text.toks'), 'w') as tokfile: 52 | 53 | for example in tqdm(examples): 54 | text = example.strip() 55 | 56 | if not text.endswith("?"): 57 | text = re.sub(r"\?", "", text) 58 | text += "?" 59 | 60 | sample_parse = parser.get_parse(text) 61 | tokens = [x['word'] for x in sample_parse['tokens']] 62 | tokfile.write(' '.join(tokens) + '\n') 63 | 64 | 65 | def make_dirs(dirs): 66 | for d in dirs: 67 | if not os.path.exists(d): 68 | os.makedirs(d) 69 | 70 | 71 | def split(filepath, dst_dir): 72 | ''' 73 | Input datafiles now have form: 74 | id \t sentence \t category (0, 1) 75 | id = id number 76 | sentence = text of sentence/query 77 | category = whether this is a "real" or "fake" sentence 78 | ''' 79 | with open(filepath) as datafile, \ 80 | open(os.path.join(dst_dir, 'text.txt'), 'w') as textfile, \ 81 | open(os.path.join(dst_dir, 'id.txt'), 'w') as idfile, \ 82 | open(os.path.join(dst_dir, 'cat.txt'), 'w') as catfile: 83 | datafile.readline() 84 | 85 | for line in datafile: 86 | i, text, cat = line.strip().split('\t') 87 | idfile.write(i + '\n') 88 | textfile.write(text + '\n') 89 | catfile.write(cat + '\n') 90 | 91 | 92 | def train_dev_test_split(filepath, dst_dir, 93 | train=0.7, dev=0.2, test=0.1): 94 | test = test/(train + test) 95 | 96 | with open(filepath, "r") as datafile: 97 | data = datafile.readlines() 98 | 99 | header = data[0] 100 | 101 | x_train, x_dev = train_test_split(data[1:], test_size=dev, shuffle=True) 102 | x_train, x_test = train_test_split(x_train, test_size=test, shuffle=True) 103 | 104 | with open(os.path.join(dst_dir, "MULTIVAC_train.txt"), "w") as f: 105 | f.write(header) 106 | for line in x_train: 107 | f.write(line) 108 | 109 | with open(os.path.join(dst_dir, "MULTIVAC_test_annotated.txt"), "w") as f: 110 | f.write(header) 111 | for line in x_dev: 112 | f.write(line) 113 | 114 | with open(os.path.join(dst_dir, "MULTIVAC_trial.txt"), "w") as f: 115 | f.write(header) 116 | for line in x_test: 117 | f.write(line) 118 | 119 | 120 | if __name__ == '__main__': 121 | 122 | parser = argparse.ArgumentParser( 123 | description='Preprocessing of MULTIVAC data for QueryGAN ' 124 | 
'discriminator training.') 125 | # data arguments 126 | parser.add_argument('-d', '--data', required=False, 127 | help='Path to source dataset.') 128 | 129 | args = vars(parser.parse_args()) 130 | 131 | print('=' * 80) 132 | print('Preprocessing MULTIVAC dataset') 133 | print('=' * 80) 134 | 135 | base_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 136 | data_dir = os.path.join(base_dir, 'data') 137 | multivac_dir = os.path.join(data_dir, 'multivac') 138 | 139 | prs = StanfordParser(annots='tokenize') 140 | 141 | split(os.path.join(multivac_dir, 'extracted_questions_labels.txt'), multivac_dir) 142 | gen_tokens(os.path.join(multivac_dir, 'text.txt'), prs) 143 | 144 | # get vocabulary 145 | build_vocab(glob.glob(os.path.join(multivac_dir, '*/*.toks')), 146 | os.path.join(multivac_dir, 'vocab.txt')) 147 | build_vocab(glob.glob(os.path.join(multivac_dir, '*/*.toks')), 148 | os.path.join(multivac_dir, 'vocab-cased.txt'), 149 | lowercase=False) 150 | -------------------------------------------------------------------------------- /src/gan/discriminator/trainer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tqdm import tqdm 3 | 4 | 5 | class Trainer(object): 6 | def __init__(self, args, model, criterion, optimizer, device): 7 | super(Trainer, self).__init__() 8 | self.args = args 9 | self.model = model 10 | self.criterion = criterion 11 | self.optimizer = optimizer 12 | self.device = device 13 | self.epoch = 0 14 | 15 | # helper function for training 16 | def train(self, dataset): 17 | self.model.train() 18 | self.optimizer.zero_grad() 19 | total_loss = 0.0 20 | indices = torch.randperm(len(dataset), dtype=torch.long, device=self.device) 21 | 22 | for idx in tqdm(range(len(dataset)), desc='Training epoch ' + str(self.epoch + 1) + ''): 23 | tree, inputs, label = dataset[indices[idx]] 24 | inputs = inputs.to(self.device) 25 | label = label.to(self.device).view(1, 1) 26 | output = self.model(tree, inputs) 27 | loss = self.criterion(output, label) 28 | total_loss += loss.item() 29 | loss.backward() 30 | 31 | if idx % self.args['batchsize'] == 0 and idx > 0: 32 | self.optimizer.step() 33 | self.optimizer.zero_grad() 34 | 35 | self.epoch += 1 36 | return total_loss / len(dataset) 37 | 38 | # helper function for testing 39 | def test(self, dataset): 40 | self.model.eval() 41 | 42 | with torch.no_grad(): 43 | total_loss = 0.0 44 | predictions = torch.zeros(len(dataset), dtype=torch.float, device=self.device) 45 | 46 | for idx in tqdm(range(len(dataset)), desc='Testing epoch ' + str(self.epoch) + ''): 47 | tree, inputs, label = dataset[idx] 48 | inputs, label = inputs.to(self.device), label.to(self.device).view(1, 1) 49 | output = self.model(tree, inputs) 50 | loss = self.criterion(output, label) 51 | total_loss += loss.item() 52 | output = output.squeeze().to('cpu') 53 | predictions[idx] = torch.round(output) 54 | 55 | return total_loss / len(dataset), predictions 56 | -------------------------------------------------------------------------------- /src/gan/discriminator/tree.py: -------------------------------------------------------------------------------- 1 | # tree object from stanfordnlp/treelstm 2 | class Tree(object): 3 | 4 | def __init__(self): 5 | self.parent = None 6 | self.num_children = 0 7 | self.children = list() 8 | 9 | def add_child(self, child): 10 | child.parent = self 11 | self.num_children += 1 12 | self.children.append(child) 13 | 14 | def size(self): 15 | if getattr(self, '_size'): 16 | return 
self._size 17 | count = 1 18 | for i in range(self.num_children): 19 | count += self.children[i].size() 20 | self._size = count 21 | return self._size 22 | 23 | def depth(self): 24 | if getattr(self, '_depth'): 25 | return self._depth 26 | count = 0 27 | if self.num_children > 0: 28 | for i in range(self.num_children): 29 | child_depth = self.children[i].depth() 30 | if child_depth > count: 31 | count = child_depth 32 | count += 1 33 | self._depth = count 34 | return self._depth 35 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/src/gan/gen_pyt/__init__.py -------------------------------------------------------------------------------- /src/gan/gen_pyt/asdl/__init__.py: -------------------------------------------------------------------------------- 1 | # import six 2 | # from .lang.lambda_dcs.lambda_dcs_transition_system import LambdaCalculusTransitionSystem 3 | # from .lang.prolog.prolog_transition_system import PrologTransitionSystem 4 | 5 | # if six.PY2: 6 | # from .lang.py.py_transition_system import PythonTransitionSystem 7 | # else: 8 | # from .lang.py3.py3_transition_system import Python3TransitionSystem 9 | # from asdl.lang.sql.sql_transition_system import SqlTransitionSystem 10 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/asdl/hypothesis.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from multivac.src.gan.gen_pyt.asdl.asdl import ASDLCompositeType 4 | from multivac.src.gan.gen_pyt.asdl.asdl_ast import AbstractSyntaxTree 5 | from multivac.src.gan.gen_pyt.asdl.transition_system import (ApplyRuleAction, 6 | GenTokenAction, 7 | ReduceAction) 8 | 9 | 10 | class Hypothesis(object): 11 | 12 | def __init__(self): 13 | self.tree = None 14 | self.actions = [] 15 | self.score = 0. 
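        # frontier_node / frontier_field (set below) track the deepest unfinished AST
        # node and the field the next action will fill; update_frontier_info() keeps
        # them current after every applied action.  _value_buffer accumulates GenToken
        # strings for a primitive 'string' field until the stop signal flushes them
        # into a single value.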
16 | self.frontier_node = None 17 | self.frontier_field = None 18 | self._value_buffer = [] 19 | 20 | # record the current time step 21 | self.t = 0 22 | 23 | def apply_action(self, action): 24 | if self.tree is None: 25 | assert isinstance(action, ApplyRuleAction), 'Invalid action [%s], only ApplyRule action is valid ' \ 26 | 'at the beginning of decoding' 27 | 28 | self.tree = AbstractSyntaxTree(action.production) 29 | self.update_frontier_info() 30 | elif self.frontier_node: 31 | if isinstance(self.frontier_field.type, ASDLCompositeType): 32 | if isinstance(action, ApplyRuleAction): 33 | field_value = AbstractSyntaxTree(action.production) 34 | field_value.created_time = self.t 35 | self.frontier_field.add_value(field_value) 36 | self.update_frontier_info() 37 | elif isinstance(action, ReduceAction): 38 | assert self.frontier_field.cardinality in ('optional', 'multiple'), 'Reduce action can only be ' \ 39 | 'applied on field with multiple ' \ 40 | 'cardinality' 41 | self.frontier_field.set_finish() 42 | self.update_frontier_info() 43 | else: 44 | raise ValueError('Invalid action [%s] on field [%s]' % (action, self.frontier_field)) 45 | else: # fill in a primitive field 46 | if isinstance(action, GenTokenAction): 47 | # only field of type string requires termination signal 48 | end_primitive = False 49 | if self.frontier_field.type.name == 'string': 50 | if action.is_stop_signal(): 51 | self.frontier_field.add_value(' '.join(self._value_buffer)) 52 | self._value_buffer = [] 53 | 54 | end_primitive = True 55 | else: 56 | self._value_buffer.append(action.token) 57 | else: 58 | self.frontier_field.add_value(action.token) 59 | end_primitive = True 60 | 61 | if end_primitive and self.frontier_field.cardinality in ('single', 'optional'): 62 | self.frontier_field.set_finish() 63 | self.update_frontier_info() 64 | 65 | elif isinstance(action, ReduceAction): 66 | assert self.frontier_field.cardinality in ('optional', 'multiple'), 'Reduce action can only be ' \ 67 | 'applied on field with multiple ' \ 68 | 'cardinality' 69 | self.frontier_field.set_finish() 70 | self.update_frontier_info() 71 | else: 72 | raise ValueError('Can only invoke GenToken or Reduce actions on primitive fields') 73 | 74 | self.t += 1 75 | self.actions.append(action) 76 | 77 | def update_frontier_info(self): 78 | def _find_frontier_node_and_field(tree_node): 79 | if tree_node: 80 | for field in tree_node.fields: 81 | # if it's an intermediate node, check its children 82 | if isinstance(field.type, ASDLCompositeType) and field.value: 83 | if field.cardinality in ('single', 'optional'): 84 | iter_values = [field.value] 85 | else: 86 | iter_values = field.value 87 | 88 | for child_node in iter_values: 89 | result = _find_frontier_node_and_field(child_node) 90 | if result: 91 | return result 92 | 93 | # now all its possible children are checked 94 | if not field.finished: 95 | return tree_node, field 96 | 97 | return None 98 | else: 99 | return None 100 | 101 | frontier_info = _find_frontier_node_and_field(self.tree) 102 | if frontier_info: 103 | self.frontier_node, self.frontier_field = frontier_info 104 | else: 105 | self.frontier_node, self.frontier_field = None, None 106 | 107 | def clone_and_apply_action(self, action): 108 | new_hyp = self.copy() 109 | new_hyp.apply_action(action) 110 | 111 | return new_hyp 112 | 113 | def copy(self): 114 | new_hyp = Hypothesis() 115 | if self.tree: 116 | new_hyp.tree = self.tree.copy() 117 | 118 | new_hyp.actions = list(self.actions) 119 | new_hyp.score = self.score 120 | 
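        # frontier_node / frontier_field are intentionally not copied: they reference
        # nodes of the original tree, so update_frontier_info() below recomputes them
        # against the cloned tree.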
new_hyp._value_buffer = list(self._value_buffer) 121 | new_hyp.t = self.t 122 | 123 | new_hyp.update_frontier_info() 124 | 125 | return new_hyp 126 | 127 | @property 128 | def completed(self): 129 | return self.tree and self.frontier_field is None 130 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/asdl/lang/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/src/gan/gen_pyt/asdl/lang/__init__.py -------------------------------------------------------------------------------- /src/gan/gen_pyt/asdl/lang/eng/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/src/gan/gen_pyt/asdl/lang/eng/__init__.py -------------------------------------------------------------------------------- /src/gan/gen_pyt/asdl/lang/eng/eng_asdl_helper.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from multivac.src.gan.gen_pyt.asdl.asdl import (ASDLCompositeType, 4 | ASDLConstructor, 5 | ASDLPrimitiveType, 6 | ASDLProduction, Field) 7 | from multivac.src.gan.gen_pyt.asdl.asdl_ast import (AbstractSyntaxTree, 8 | RealizedField) 9 | 10 | 11 | def find_match_paren(s): 12 | count = 0 13 | 14 | for i, c in enumerate(s): 15 | if c == "(": 16 | count += 1 17 | elif c == ")": 18 | count -= 1 19 | 20 | if count == 0: 21 | return i 22 | 23 | 24 | def english_ast_to_asdl_ast(text, depth=0, debug=False): 25 | ''' Takes a constituency parse string of an English sentence and creates 26 | an AbstractSyntaxTree object from it. 27 | 28 | Example input: 29 | '(ROOT (SBARQ (WHADVP (WRB Why)) (SQ (VBP do) (NP (NNS birds)) (ADVP 30 | (RB suddenly)) (VP (VB appear) (SBAR (WHADVP (WRB whenever)) (S (NP 31 | (PRP you)) (VP (VBP are) (ADJP (JJ near))))))) (. 
?)))' 32 | ''' 33 | 34 | if debug: 35 | print(("\t" * depth + "String: '{}'".format(text))) 36 | 37 | try: 38 | tree_str = text[text.index("(") + 1:text.rfind(")")] 39 | except ValueError: 40 | print(("Malformatted parse string: '{}'".format(text))) 41 | raise ValueError 42 | 43 | all_fields = [] 44 | next_idx = tree_str.index(" ") 45 | 46 | if "(" in tree_str: 47 | node_type = ASDLCompositeType(tree_str[:next_idx]) 48 | node_fields = [] 49 | 50 | while "(" in tree_str: 51 | tree_str = tree_str[tree_str.index("("):] 52 | next_idx = find_match_paren(tree_str) + 1 53 | child = english_ast_to_asdl_ast(tree_str[:next_idx], depth+1, debug) 54 | 55 | if isinstance(child, AbstractSyntaxTree): 56 | asdl_field = Field(child.production.type.name, 57 | child.production.type, 58 | 'single') 59 | all_fields.append(RealizedField(asdl_field, value=child)) 60 | else: 61 | asdl_field = child.field 62 | all_fields.append(child) 63 | 64 | node_fields.append(asdl_field) 65 | tree_str = tree_str[next_idx + 1:] 66 | 67 | field_str = ', '.join(["({})".format(f.name) for f in node_fields]) 68 | rule_str = node_type.name + " -> " + field_str 69 | constructor = ASDLConstructor(rule_str, node_fields) 70 | production = ASDLProduction(node_type, constructor) 71 | 72 | result = AbstractSyntaxTree(production, realized_fields=all_fields) 73 | else: 74 | node_type = ASDLPrimitiveType(tree_str[:next_idx]) 75 | result = RealizedField(Field(node_type.name, node_type, 'single'), 76 | value=tree_str[next_idx + 1:]) 77 | 78 | return result 79 | 80 | 81 | def asdl_ast_to_english(asdl_ast_node): 82 | tokens = [] 83 | 84 | for field in asdl_ast_node.fields: 85 | # for composite node 86 | field_value = None 87 | 88 | if isinstance(field.type, ASDLCompositeType) and field.value: 89 | field_value = asdl_ast_to_english(field.value) 90 | else: 91 | field_value = field.value 92 | 93 | tokens.append(field_value) 94 | 95 | return ' '.join([x if x else '' for x in tokens]) 96 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/asdl/lang/eng/eng_transition_system.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from multivac.src.gan.gen_pyt.asdl.lang.eng.eng_asdl_helper import ( 4 | asdl_ast_to_english, english_ast_to_asdl_ast) 5 | from multivac.src.gan.gen_pyt.asdl.transition_system import (GenTokenAction, 6 | TransitionSystem) 7 | from multivac.src.rdf_graph.rdf_parse import tokenize_text 8 | 9 | 10 | class EnglishTransitionSystem(TransitionSystem): 11 | 12 | def __init__(self, grammar): 13 | super().__init__(grammar) 14 | 15 | def tokenize_text(self, text, mode=None): 16 | return tokenize_text(text, mode) 17 | 18 | def surface_text_to_ast(self, text, parser): 19 | p = parser.get_parse(text)['sentences'][0]['parse'] 20 | return english_ast_to_asdl_ast(p) 21 | 22 | def ast_to_surface_text(self, asdl_ast): 23 | text = asdl_ast_to_english(asdl_ast) 24 | return text 25 | 26 | def compare_ast(self, hyp_ast, ref_ast): 27 | hyp_text = self.ast_to_surface_text(hyp_ast) 28 | ref_reformatted_text = self.ast_to_surface_text(ref_ast) 29 | 30 | ref_text_tokens = tokenize_text(ref_reformatted_text) 31 | hyp_text_tokens = tokenize_text(hyp_text) 32 | 33 | return ref_text_tokens == hyp_text_tokens 34 | 35 | def get_primitive_field_actions(self, realized_field): 36 | actions = [] 37 | 38 | if realized_field.value is not None: 39 | field_values = [realized_field.value] 40 | 41 | tokens = [] 42 | 43 | for field_val in field_values: 44 | 
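            # each primitive field value is whitespace-split; every resulting token is
            # then emitted as its own GenTokenAction (no explicit stop token is added here)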
tokens.extend(field_val.split(' ')) 45 | 46 | for tok in tokens: 47 | actions.append(GenTokenAction(tok)) 48 | 49 | return actions 50 | 51 | def is_valid_hypothesis(self, hyp, parser, **kwargs): 52 | try: 53 | hyp_text = self.ast_to_surface_text(hyp.tree) 54 | new_tree = self.surface_text_to_ast(hyp_text, parser) 55 | assert hyp.tree == new_tree 56 | except Exception: 57 | return False 58 | return True 59 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/asdl/lang/eng/grammar.py: -------------------------------------------------------------------------------- 1 | """ 2 | English grammar and typing system 3 | """ 4 | from collections import OrderedDict 5 | 6 | from multivac.src.gan.gen_pyt.asdl.asdl import (ASDLCompositeType, 7 | ASDLConstructor, ASDLGrammar, 8 | ASDLPrimitiveType, 9 | ASDLProduction, Field) 10 | from multivac.src.gan.gen_pyt.asdl.lang.eng.eng_asdl_helper import \ 11 | english_ast_to_asdl_ast 12 | from multivac.src.gan.gen_pyt.asdl.lang.grammar import Grammar 13 | 14 | BRACKET_TYPES = { 15 | ASDLPrimitiveType('-LRB-'): '(', 16 | ASDLPrimitiveType('-RRB-'): ')', 17 | ASDLPrimitiveType('-LCB-'): '{', 18 | ASDLPrimitiveType('-RCB-'): '}', 19 | ASDLPrimitiveType('-LSB-'): '[', 20 | ASDLPrimitiveType('-RSB-'): ']', 21 | } 22 | 23 | TERMINAL_TYPES = { 24 | ASDLPrimitiveType('CC'), # Coordinating conjunction 25 | ASDLPrimitiveType('CD'), # Cardinal number 26 | ASDLPrimitiveType('DT'), # Determiner 27 | ASDLPrimitiveType('EX'), # Existential there 28 | ASDLPrimitiveType('FW'), # Foreign word 29 | ASDLPrimitiveType('IN'), # Preposition or subordinating conjunction 30 | ASDLPrimitiveType('JJ'), # Adjective 31 | ASDLPrimitiveType('JJR'), # Adjective, comparative 32 | ASDLPrimitiveType('JJS'), # Adjective, superlative 33 | ASDLPrimitiveType('LS'), # List item marker 34 | ASDLPrimitiveType('MD'), # Modals 35 | ASDLPrimitiveType('NN'), # Noun, singular or mass 36 | ASDLPrimitiveType('NNS'), # Noun, plural 37 | ASDLPrimitiveType('NNP'), # Proper noun, singular 38 | ASDLPrimitiveType('NNPS'), # Proper noun, plural 39 | ASDLPrimitiveType('PDT'), # Predeterminer 40 | ASDLPrimitiveType('POS'), # Possessive ending 41 | ASDLPrimitiveType('PRP'), # Personal pronoun 42 | ASDLPrimitiveType('PRP$'), # Possessive pronoun (prolog version PRP-S) 43 | ASDLPrimitiveType('RB'), # Adverb 44 | ASDLPrimitiveType('RBR'), # Adverb, comparative 45 | ASDLPrimitiveType('RBS'), # Adverb, superlative 46 | ASDLPrimitiveType('RP'), # Particle 47 | ASDLPrimitiveType('SYM'), # Symbol 48 | ASDLPrimitiveType('TO'), # to 49 | ASDLPrimitiveType('UH'), # Interjection 50 | ASDLPrimitiveType('VB'), # Verb, base form 51 | ASDLPrimitiveType('VBD'), # Verb, past tense 52 | ASDLPrimitiveType('VBG'), # Verb, gerund or present participle 53 | ASDLPrimitiveType('VBN'), # Verb, past participle 54 | ASDLPrimitiveType('VBP'), # Verb, non-3rd person singular present 55 | ASDLPrimitiveType('VBZ'), # Verb, 3rd person singular present 56 | ASDLPrimitiveType('WDT'), # Wh-determiner 57 | ASDLPrimitiveType('WP'), # Wh-pronoun 58 | ASDLPrimitiveType('WP$'), # Possessive wh-pronoun (prolog version WP-S) 59 | ASDLPrimitiveType('WRB') # Wh-adverb 60 | } 61 | 62 | 63 | class EnglishGrammar(Grammar): 64 | 65 | def __init__(self, rules): 66 | super().__init__(rules) 67 | 68 | self.terminal_types.update(TERMINAL_TYPES) 69 | self.terminal_types.update(BRACKET_TYPES) 70 | 71 | 72 | class EnglishASDLGrammar(ASDLGrammar): 73 | """ 74 | Collection of types, constructors and productions 75 | """ 76 | 77 | 
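    # Intended usage, as a sketch only (assumes a reachable CoreNLP server behind
    # StanfordParser, as in query_treebank.extract_grammar; the example sentence is
    # arbitrary):
    #
    #   parser  = StanfordParser(annots="tokenize ssplit parse")
    #   grammar = EnglishASDLGrammar.from_text(["Why do birds suddenly appear?"], parser)
    #   some_id = grammar.prod2id[grammar.productions[0]]
    #
    # Productions are grouped by their head type, and prod2id / type2id / field2id
    # supply the integer ids consumed by the neural parser.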
def __init__(self, grammar=None, productions=None): 78 | # productions are indexed by their head types 79 | self._productions = OrderedDict() 80 | self._constructor_production_map = dict() 81 | 82 | if productions is not None: 83 | english_prods = set(productions) 84 | 85 | for prod in english_prods: 86 | if prod.type not in self._productions: 87 | self._productions[prod.type] = list() 88 | self._productions[prod.type].append(prod) 89 | self._constructor_production_map[prod.constructor.name] = prod 90 | 91 | self.root_type = ASDLCompositeType("ROOT") 92 | elif grammar is not None: 93 | if isinstance(grammar, ASDLGrammar): 94 | self = grammar 95 | return 96 | 97 | for rule in grammar.rules: 98 | fields = [] 99 | 100 | for child in rule.children: 101 | if grammar.is_terminal(child): 102 | child_type = ASDLPrimitiveType(child.type) 103 | else: 104 | child_type = ASDLCompositeType(child.type) 105 | 106 | fields.append(Field(child.type, child_type, 'single')) 107 | 108 | constructor = ASDLConstructor(rule.type, fields) 109 | production = ASDLProduction(ASDLCompositeType(rule.type), 110 | constructor) 111 | 112 | if production.type not in self._productions: 113 | self._productions[production.type] = list() 114 | 115 | self._productions[production.type].append(production) 116 | self._constructor_production_map[constructor.name] = production 117 | 118 | self.root_type = ASDLCompositeType(grammar.root_node.type) 119 | 120 | self.size = sum(len(head) for head in self._productions.values()) 121 | self.terminal_types = set(self.primitive_types) 122 | self.terminal_types.update(TERMINAL_TYPES) 123 | self.terminal_types.update(BRACKET_TYPES.keys()) 124 | 125 | self._types = sorted(self.terminal_types.union(set(self.types)), 126 | key=lambda x: x.name) 127 | 128 | # get entities to their ids map 129 | self.prod2id = {prod: i for i, prod in enumerate(self.productions)} 130 | self.type2id = {type: i for i, type in enumerate(self.types)} 131 | self.field2id = {field: i for i, field in enumerate(self.fields)} 132 | 133 | self.id2prod = {i: prod for i, prod in enumerate(self.productions)} 134 | self.id2type = {i: type for i, type in enumerate(self.types)} 135 | self.id2field = {i: field for i, field in enumerate(self.fields)} 136 | 137 | @staticmethod 138 | def from_text(text, parser): 139 | productions = set() 140 | 141 | if isinstance(text, list): 142 | text = '\n'.join(text) 143 | 144 | for s in text: 145 | try: 146 | p = parser.get_parse(s)['sentences'][0]['parse'] 147 | except Exception: 148 | continue 149 | try: 150 | parse_tree = english_ast_to_asdl_ast(p.parse_string) 151 | except Exception: 152 | continue 153 | 154 | productions.update(parse_tree.get_productions()) 155 | 156 | productions = sorted(productions, key=lambda x: x.__repr__) 157 | 158 | grammar = EnglishASDLGrammar(productions=productions) 159 | return grammar 160 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/asdl/lang/grammar.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict, defaultdict 2 | 3 | from multivac.src.gan.gen_pyt.astnode import ASTNode 4 | from multivac.src.gan.utilities.utils import typename 5 | 6 | 7 | class Grammar(object): 8 | 9 | def __init__(self, rules): 10 | """ 11 | instantiate a grammar with a set of production rules of type Rule 12 | """ 13 | self.rules = rules 14 | self.rule_index = defaultdict(list) 15 | self.rule_to_id = OrderedDict() 16 | 17 | node_types = set() 18 | lhs_nodes = set() 
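        # lhs_nodes / rhs_nodes are used below to derive the grammar's root (the one
        # type that never appears on a right-hand side) and its terminal nodes (types
        # that never appear on a left-hand side).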
19 | rhs_nodes = set() 20 | 21 | for rule in self.rules: 22 | self.rule_index[rule.parent].append(rule) 23 | 24 | # we also store all unique node types 25 | for node in rule.nodes: 26 | node_types.add(typename(node.type)) 27 | 28 | lhs_nodes.add(rule.parent) 29 | 30 | for child in rule.children: 31 | rhs_nodes.add(child.as_type_node) 32 | 33 | root_node = lhs_nodes - rhs_nodes 34 | 35 | try: 36 | assert len(root_node) == 1 37 | except AssertionError: 38 | print(root_node) 39 | raise AssertionError 40 | 41 | self.root_node = next(iter(root_node)) 42 | 43 | self.terminal_nodes = rhs_nodes - lhs_nodes 44 | self.terminal_types = set([n.type for n in self.terminal_nodes]) 45 | 46 | self.node_type_to_id = OrderedDict() 47 | for i, type in enumerate(node_types, start=0): 48 | self.node_type_to_id[type] = i 49 | 50 | for gid, rule in enumerate(rules, start=0): 51 | self.rule_to_id[rule] = gid 52 | 53 | self.id_to_rule = OrderedDict((v, k) for (k, v) in list(self.rule_to_id.items())) 54 | 55 | def __iter__(self): 56 | return self.rules.__iter__() 57 | 58 | def __len__(self): 59 | return len(self.rules) 60 | 61 | def __getitem__(self, lhs): 62 | key_node = ASTNode(lhs.type, None) # Rules are indexed by types only 63 | if key_node in self.rule_index: 64 | return self.rule_index[key_node] 65 | else: 66 | KeyError('key=%s' % key_node) 67 | 68 | def get_node_type_id(self, node): 69 | if isinstance(node, ASTNode): 70 | type_repr = typename(node.type) 71 | return self.node_type_to_id[type_repr] 72 | else: 73 | # assert isinstance(node, str) 74 | # it is a type 75 | type_repr = typename(node) 76 | return self.node_type_to_id[type_repr] 77 | 78 | def is_terminal(self, node): 79 | return node.type in self.terminal_types 80 | 81 | def is_value_node(self, node): 82 | return node.type in self.terminal_types 83 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/asdl/transition_system.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | 4 | class Action(object): 5 | pass 6 | 7 | 8 | class ApplyRuleAction(Action): 9 | def __init__(self, production): 10 | self.production = production 11 | 12 | def __hash__(self): 13 | return hash(self.production) 14 | 15 | def __eq__(self, other): 16 | return isinstance(other, ApplyRuleAction) and self.production == other.production 17 | 18 | def __ne__(self, other): 19 | return not self.__eq__(other) 20 | 21 | def __repr__(self): 22 | return 'ApplyRule[%s]' % self.production.__repr__() 23 | 24 | 25 | class GenTokenAction(Action): 26 | def __init__(self, token): 27 | self.token = token 28 | 29 | def is_stop_signal(self): 30 | return self.token == '' 31 | 32 | def __repr__(self): 33 | return 'GenToken[%s]' % self.token 34 | 35 | 36 | class GenEngTokenAction(GenTokenAction): 37 | def __init__(self, token): 38 | self.token = token 39 | 40 | def is_stop_signal(self): 41 | return self.token == '' 42 | 43 | 44 | class ReduceAction(Action): 45 | def __repr__(self): 46 | return 'Reduce' 47 | 48 | 49 | class TransitionSystem(object): 50 | def __init__(self, grammar): 51 | self.grammar = grammar 52 | 53 | def get_actions(self, asdl_ast): 54 | """ 55 | generate action sequence given the ASDL Syntax Tree 56 | """ 57 | 58 | actions = [] 59 | 60 | parent_action = ApplyRuleAction(asdl_ast.production) 61 | actions.append(parent_action) 62 | 63 | for field in asdl_ast.fields: 64 | # is a composite field 65 | if self.grammar.is_composite_type(field.type): 66 | if field.cardinality == 'single': 
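                    # a single-cardinality composite field holds exactly one child AST
                    # node, so its action sequence comes from direct recursion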
67 | field_actions = self.get_actions(field.value) 68 | else: 69 | field_actions = [] 70 | 71 | if field.value is not None: 72 | if field.cardinality == 'multiple': 73 | for val in field.value: 74 | cur_child_actions = self.get_actions(val) 75 | field_actions.extend(cur_child_actions) 76 | elif field.cardinality == 'optional': 77 | field_actions = self.get_actions(field.value) 78 | 79 | # if an optional field is filled, then do not need Reduce action 80 | if field.cardinality == 'multiple' or field.cardinality == 'optional' and not field_actions: 81 | field_actions.append(ReduceAction()) 82 | else: # is a primitive field 83 | field_actions = self.get_primitive_field_actions(field) 84 | 85 | # if an optional field is filled, then do not need Reduce action 86 | if field.cardinality == 'multiple' or field.cardinality == 'optional' and not field_actions: 87 | # reduce action 88 | field_actions.append(ReduceAction()) 89 | 90 | actions.extend(field_actions) 91 | 92 | return actions 93 | 94 | def tokenize_code(self, code, mode): 95 | raise NotImplementedError 96 | 97 | def compare_ast(self, hyp_ast, ref_ast): 98 | raise NotImplementedError 99 | 100 | def ast_to_surface_code(self, asdl_ast): 101 | raise NotImplementedError 102 | 103 | def surface_code_to_ast(self, code): 104 | raise NotImplementedError 105 | 106 | def get_primitive_field_actions(self, realized_field): 107 | raise NotImplementedError 108 | 109 | def get_valid_continuation_types(self, hyp): 110 | if hyp.tree: 111 | if self.grammar.is_composite_type(hyp.frontier_field.type): 112 | if hyp.frontier_field.cardinality == 'single': 113 | return ApplyRuleAction, 114 | else: # optional, multiple 115 | return ApplyRuleAction, ReduceAction 116 | else: 117 | if hyp.frontier_field.cardinality == 'single': 118 | return GenTokenAction, 119 | elif hyp.frontier_field.cardinality == 'optional': 120 | if hyp._value_buffer: 121 | return GenTokenAction, 122 | else: 123 | return GenTokenAction, ReduceAction 124 | else: 125 | return GenTokenAction, ReduceAction 126 | else: 127 | return ApplyRuleAction, 128 | 129 | def get_valid_continuating_productions(self, hyp): 130 | if hyp.tree: 131 | if self.grammar.is_composite_type(hyp.frontier_field.type): 132 | return self.grammar[hyp.frontier_field.type] 133 | else: 134 | raise ValueError 135 | else: 136 | return self.grammar[self.grammar.root_type] 137 | 138 | @staticmethod 139 | def get_class_by_lang(lang): 140 | if lang == 'python': 141 | from .lang.py.py_transition_system import PythonTransitionSystem 142 | return PythonTransitionSystem 143 | elif lang == 'english': 144 | from .lang.eng.eng_transition_system import EnglishTransitionSystem 145 | return EnglishTransitionSystem 146 | elif lang == 'python3': 147 | from .lang.py3.py3_transition_system import Python3TransitionSystem 148 | return Python3TransitionSystem 149 | elif lang == 'lambda_dcs': 150 | from .lang.lambda_dcs.lambda_dcs_transition_system import LambdaCalculusTransitionSystem 151 | return LambdaCalculusTransitionSystem 152 | elif lang == 'prolog': 153 | from .lang.prolog.prolog_transition_system import PrologTransitionSystem 154 | return PrologTransitionSystem 155 | elif lang == 'wikisql': 156 | from .lang.sql.sql_transition_system import SqlTransitionSystem 157 | return SqlTransitionSystem 158 | 159 | raise ValueError('unknown language %s' % lang) 160 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/components/__init__.py: 
-------------------------------------------------------------------------------- 1 | 2 | import multivac.src.gan.utilities.vocab as vocab 3 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/components/action_info.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from multivac.src.gan.gen_pyt.asdl.hypothesis import Hypothesis 3 | from multivac.src.gan.gen_pyt.asdl.transition_system import GenTokenAction 4 | 5 | 6 | class ActionInfo(object): 7 | """sufficient statistics for making a prediction of an action at a time step""" 8 | 9 | def __init__(self, action=None): 10 | self.t = 0 11 | self.parent_t = -1 12 | self.action = action 13 | self.frontier_prod = None 14 | self.frontier_field = None 15 | 16 | # for GenToken actions only 17 | self.copy_from_src = False 18 | self.src_token_position = -1 19 | 20 | def __repr__(self, verbose=False): 21 | repr_str = '%s (t=%d, p_t=%d, frontier_field=%s)' % (repr(self.action), 22 | self.t, 23 | self.parent_t, 24 | self.frontier_field.__repr__(True) 25 | if self.frontier_field else 'None') 26 | 27 | if verbose: 28 | verbose_repr = 'action_prob=%.4f, ' % self.action_prob 29 | if isinstance(self.action, GenTokenAction): 30 | verbose_repr += 'in_vocab=%s, ' \ 31 | 'gen_copy_switch=%s, ' \ 32 | 'p(gen)=%s, p(copy)=%s, ' \ 33 | 'has_copy=%s, copy_pos=%s' % (self.in_vocab, 34 | self.gen_copy_switch, 35 | self.gen_token_prob, self.copy_token_prob, 36 | self.copy_from_src, self.src_token_position) 37 | 38 | repr_str += '\n' + verbose_repr 39 | 40 | return repr_str 41 | 42 | 43 | def get_action_infos(src_query, tgt_actions, force_copy=False, verbose=False): 44 | action_infos = [] 45 | hyp = Hypothesis() 46 | 47 | for t, action in enumerate(tgt_actions): 48 | action_info = ActionInfo(action) 49 | action_info.t = t 50 | 51 | if verbose: 52 | print(action) 53 | 54 | if hyp.frontier_node: 55 | action_info.parent_t = hyp.frontier_node.created_time 56 | action_info.frontier_prod = hyp.frontier_node.production 57 | action_info.frontier_field = hyp.frontier_field.field 58 | 59 | if verbose: 60 | print("Frontier node: {} :: {}".format(action_info.frontier_prod, action_info.frontier_field)) 61 | 62 | if isinstance(action, GenTokenAction): 63 | if verbose: 64 | print("GenToken: {}".format(str(action.token))) 65 | 66 | try: 67 | tok_src_idx = src_query.index(str(action.token)) 68 | action_info.copy_from_src = True 69 | action_info.src_token_position = tok_src_idx 70 | except ValueError: 71 | if force_copy: 72 | raise ValueError('cannot copy primitive token %s from source' % action.token) 73 | 74 | hyp.apply_action(action) 75 | action_infos.append(action_info) 76 | 77 | return action_infos 78 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/components/decode_hypothesis.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from multivac.src.gan.gen_pyt.asdl.hypothesis import Hypothesis 4 | 5 | 6 | class DecodeHypothesis(Hypothesis): 7 | 8 | def __init__(self): 9 | super(DecodeHypothesis, self).__init__() 10 | 11 | self.action_infos = [] 12 | self.code = None 13 | 14 | def clone_and_apply_action_info(self, action_info): 15 | action = action_info.action 16 | 17 | new_hyp = self.clone_and_apply_action(action) 18 | new_hyp.action_infos.append(action_info) 19 | 20 | return new_hyp 21 | 22 | def copy(self): 23 | new_hyp = DecodeHypothesis() 24 | if self.tree: 25 | 
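        # mirrors Hypothesis.copy(), additionally carrying over action_infos and code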
new_hyp.tree = self.tree.copy() 26 | 27 | new_hyp.actions = list(self.actions) 28 | new_hyp.action_infos = list(self.action_infos) 29 | new_hyp.score = self.score 30 | new_hyp._value_buffer = list(self._value_buffer) 31 | new_hyp.t = self.t 32 | new_hyp.code = self.code 33 | 34 | new_hyp.update_frontier_info() 35 | 36 | return new_hyp 37 | 38 | def apply_action_info(self, action_info): 39 | self.apply_action(action_info.action) 40 | self.action_infos.append(action_info) 41 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/components/vocab.py: -------------------------------------------------------------------------------- 1 | 2 | from collections import Counter 3 | from itertools import chain 4 | 5 | 6 | class Vocab(object): 7 | 8 | def __init__(self, filename=None, data=None, lower=False): 9 | self.idxToLabel = {} 10 | self.labelToIdx = {} 11 | self.lower = lower 12 | 13 | # Special entries will not be pruned. 14 | self.special = [] 15 | 16 | if data is not None: 17 | self.addSpecials(data) 18 | if filename is not None: 19 | self.loadFile(filename) 20 | 21 | self.add('') 22 | self.add('') 23 | self.add('') 24 | 25 | def __getitem__(self, item): 26 | return self.labelToIdx.get(item, self.unk) 27 | 28 | def __contains__(self, item): 29 | return item in self.labelToIdx 30 | 31 | def __setitem__(self, key, value): 32 | self.labelToIdx[key] = value 33 | 34 | def __len__(self): 35 | return len(self.labelToIdx) 36 | 37 | def __iter__(self): 38 | return iter(list(self.labelToIdx.keys())) 39 | 40 | def __eq__(self, other): 41 | return all([self.idxToLabel == other.idxToLabel, 42 | self.labelToIdx == other.labelToIdx, 43 | self.lower == other.lower, 44 | self.special == other.special]) 45 | 46 | @property 47 | def pad(self): 48 | return self.labelToIdx[''] 49 | 50 | @property 51 | def unk(self): 52 | return self.labelToIdx[''] 53 | 54 | @property 55 | def eos(self): 56 | return self.labelToIdx[''] 57 | 58 | def is_unk(self, word): 59 | return word not in self 60 | 61 | def size(self): 62 | return len(self.idxToLabel) 63 | 64 | # Load entries from a file. 65 | def loadFile(self, filename): 66 | idx = 0 67 | for line in open(filename, 'r', encoding='utf8', errors='ignore'): 68 | token = line.rstrip('\n') 69 | self.add(token) 70 | idx += 1 71 | 72 | def getIndex(self, key, default=None): 73 | key = key.lower() if self.lower else key 74 | 75 | return self.labelToIdx.get(key, default) 76 | 77 | def getLabel(self, idx, default=None): 78 | return self.idxToLabel.get(idx, default) 79 | 80 | def add_from_data(self, label, idx=None): 81 | if idx: 82 | self.idxToLabel[idx] = label 83 | self.labelToIdx[label] = idx 84 | else: 85 | idx = self.add(label) 86 | 87 | # Mark this `label` and `idx` as special 88 | def addSpecial(self, label): 89 | idx = self.add(label) 90 | self.special += [idx] 91 | 92 | # Mark all labels in `labels` as specials 93 | def addSpecials(self, labels): 94 | for label in labels: 95 | if isinstance(label, tuple): 96 | self.add_from_data(*label) 97 | else: 98 | self.addSpecial(label) 99 | 100 | # Add `label` in the dictionary. Use `idx` as its index if given. 101 | def add(self, label): 102 | label = label.lower() if self.lower else label 103 | 104 | if label in self.labelToIdx: 105 | idx = self.labelToIdx[label] 106 | else: 107 | idx = len(self.idxToLabel) 108 | self.idxToLabel[idx] = label 109 | self.labelToIdx[label] = idx 110 | return idx 111 | 112 | # Convert `labels` to indices. Use `unkWord` if not found. 
113 | # Optionally insert `bosWord` at the beginning and `eosWord` at the . 114 | def convertToIdx(self, labels, unkWord=None, bosWord=None, eosWord=None): 115 | if unkWord is None: 116 | unk = self.unk 117 | else: 118 | unk = self.getIndex(unkWord) 119 | 120 | vec = [] 121 | 122 | if bosWord is not None: 123 | vec += [self.getIndex(bosWord)] 124 | 125 | vec += [self.getIndex(label, default=unk) for label in labels] 126 | 127 | if eosWord is not None: 128 | vec += [self.getIndex(eosWord)] 129 | 130 | return vec 131 | 132 | # Convert `idx` to labels. If index `stop` is reached, convert it and return. 133 | def convertToLabels(self, idx, stop=None): 134 | labels = [] 135 | 136 | for i in idx: 137 | labels += [self.getLabel(i)] 138 | if i == stop: 139 | break 140 | 141 | return labels 142 | 143 | @staticmethod 144 | def from_corpus(corpus, size=None, freq_cutoff=0): 145 | vocab = Vocab() 146 | 147 | word_freq = Counter(chain(*corpus)) 148 | top_k_words = sorted(word_freq.keys(), reverse=True, key=word_freq.get) 149 | 150 | if size is not None: 151 | top_k_words = top_k_words[:size] 152 | 153 | words_not_included = [] 154 | 155 | for word in top_k_words: 156 | if word_freq[word] >= freq_cutoff: 157 | vocab.add(word) 158 | else: 159 | words_not_included.append(word) 160 | 161 | if len(vocab) == size: 162 | break 163 | 164 | return vocab 165 | 166 | @staticmethod 167 | def from_dict(vocab): 168 | vocab = Vocab() 169 | 170 | for key, value in vocab.items(): 171 | setattr(vocab, key, value) 172 | 173 | return vocab 174 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/src/gan/gen_pyt/datasets/__init__.py -------------------------------------------------------------------------------- /src/gan/gen_pyt/datasets/english/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/src/gan/gen_pyt/datasets/english/__init__.py -------------------------------------------------------------------------------- /src/gan/gen_pyt/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/src/gan/gen_pyt/model/__init__.py -------------------------------------------------------------------------------- /src/gan/gen_pyt/model/attention_util.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from multivac.src.gan.gen_pyt.asdl.transition_system import GenTokenAction 3 | 4 | LOGICAL_FORM_LEXICON = { 5 | 'city:t': ['citi'], 6 | 'density:i': ['densiti', 'averag', 'popul'], 7 | } 8 | 9 | 10 | class AttentionUtil(object): 11 | 12 | @staticmethod 13 | def get_candidate_tokens_to_attend(src_tokens, action): 14 | tokens_to_attend = dict() 15 | if isinstance(action, GenTokenAction): 16 | tgt_token = action.token 17 | for src_idx, src_token in enumerate(src_tokens): 18 | # match lemma 19 | if len(src_token) >= 3 and tgt_token.startswith(src_token) or \ 20 | src_token in LOGICAL_FORM_LEXICON.get(tgt_token, []): 21 | tokens_to_attend[src_idx] = src_token 22 | 23 | return tokens_to_attend 24 | 
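# Minimal usage sketch for the Vocab class defined above, mirroring how
# MULTIVACDataset.read_sentence converts a tokenized line to indices.  The file
# name 'vocab.txt' and the import path are placeholders assumed for illustration;
# the special pad/unk/eos symbols are whatever the calling code registers.

from multivac.src.gan.gen_pyt.components.vocab import Vocab

vocab = Vocab(filename='vocab.txt', lower=True)             # one token per line
ids = vocab.convertToIdx('why do birds suddenly appear ?'.split())
tokens = vocab.convertToLabels(ids)                         # back to strings
print(vocab.size(), ids, tokens)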
-------------------------------------------------------------------------------- /src/gan/gen_pyt/model/lstm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | import torch.nn.utils 6 | from torch.nn import Parameter, init 7 | from torch.nn.modules.rnn import RNNCellBase 8 | 9 | 10 | class ParentFeedingLSTMCell(RNNCellBase): 11 | 12 | def __init__(self, input_size, hidden_size): 13 | super(ParentFeedingLSTMCell, self).__init__() 14 | 15 | self.input_size = input_size 16 | self.hidden_size = hidden_size 17 | 18 | self.W_i = Parameter(torch.Tensor(hidden_size, input_size)) 19 | self.U_i = Parameter(torch.Tensor(hidden_size, hidden_size)) 20 | self.U_i_p = Parameter(torch.Tensor(hidden_size, hidden_size)) 21 | self.b_i = Parameter(torch.Tensor(hidden_size)) 22 | 23 | self.W_f = Parameter(torch.Tensor(hidden_size, input_size)) 24 | self.U_f = Parameter(torch.Tensor(hidden_size, hidden_size)) 25 | self.U_f_p = Parameter(torch.Tensor(hidden_size, hidden_size)) 26 | self.b_f = Parameter(torch.Tensor(hidden_size)) 27 | self.b_f_p = Parameter(torch.Tensor(hidden_size)) 28 | 29 | self.W_c = Parameter(torch.Tensor(hidden_size, input_size)) 30 | self.U_c = Parameter(torch.Tensor(hidden_size, hidden_size)) 31 | self.U_c_p = Parameter(torch.Tensor(hidden_size, hidden_size)) 32 | self.b_c = Parameter(torch.Tensor(hidden_size)) 33 | 34 | self.W_o = Parameter(torch.Tensor(hidden_size, input_size)) 35 | self.U_o = Parameter(torch.Tensor(hidden_size, hidden_size)) 36 | self.U_o_p = Parameter(torch.Tensor(hidden_size, hidden_size)) 37 | self.b_o = Parameter(torch.Tensor(hidden_size)) 38 | 39 | self.reset_parameters() 40 | 41 | def reset_parameters(self): 42 | init.orthogonal(self.W_i) 43 | init.orthogonal(self.U_i) 44 | init.orthogonal(self.U_i_p) 45 | 46 | init.orthogonal(self.W_f) 47 | init.orthogonal(self.U_f) 48 | init.orthogonal(self.U_f_p) 49 | 50 | init.orthogonal(self.W_c) 51 | init.orthogonal(self.U_c) 52 | init.orthogonal(self.U_c_p) 53 | 54 | init.orthogonal(self.W_o) 55 | init.orthogonal(self.U_o) 56 | init.orthogonal(self.U_o_p) 57 | 58 | self.b_i.data.fill_(0.) 59 | self.b_c.data.fill_(0.) 60 | self.b_o.data.fill_(0.) 61 | # forget bias set to 1. 62 | self.b_f.data.fill_(1.) 63 | self.b_f_p.data.fill_(1.) 
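        # The cell keeps two forget gates, one over its own previous memory (c_tm1)
        # and one over the parent's memory (c_tm1_p); both biases start at 1 so that
        # neither memory path is forgotten early in training.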
64 | 65 | def forward(self, input, hidden_states): 66 | h_tm1, c_tm1, h_tm1_p, c_tm1_p = hidden_states 67 | i_t = torch.sigmoid(F.linear(input, self.W_i) + F.linear(h_tm1, self.U_i) + 68 | F.linear(h_tm1_p, self.U_i_p) + self.b_i) 69 | 70 | xf_t = F.linear(input, self.W_f) 71 | f_t = torch.sigmoid(xf_t + F.linear(h_tm1, self.U_f) + self.b_f) 72 | f_t_p = torch.sigmoid(xf_t + F.linear(h_tm1_p, self.U_f_p) + self.b_f_p) 73 | 74 | xc_t = torch.linear(input, self.W_c) + F.linear(h_tm1, self.U_c) + F.linear(h_tm1_p, self.U_c_p) + self.b_c 75 | c_t = f_t * c_tm1 + f_t_p * c_tm1_p + i_t * torch.tanh(xc_t) 76 | 77 | o_t = torch.sigmoid(F.linear(input, self.W_o) + F.linear(h_tm1, self.U_o) + 78 | F.linear(h_tm1_p, self.U_o_p) + self.b_o) 79 | h_t = o_t * torch.tanh(c_t) 80 | 81 | return h_t, c_t 82 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/model/nn_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.nn.init as init 8 | 9 | 10 | def dot_prod_attention(h_t, src_encoding, src_encoding_att_linear, mask=None): 11 | """ 12 | :param h_t: (batch_size, hidden_size) 13 | :param src_encoding: (batch_size, src_sent_len, hidden_size * 2) 14 | :param src_encoding_att_linear: (batch_size, src_sent_len, hidden_size) 15 | :param mask: (batch_size, src_sent_len) 16 | """ 17 | # (batch_size, src_sent_len) 18 | att_weight = torch.bmm(src_encoding_att_linear, h_t.unsqueeze(2)).squeeze(2) 19 | if mask is not None: 20 | att_weight.data.masked_fill_(mask.bool(), -float('inf')) 21 | att_weight = F.softmax(att_weight, dim=-1) 22 | 23 | att_view = (att_weight.size(0), 1, att_weight.size(1)) 24 | # (batch_size, hidden_size) 25 | ctx_vec = torch.bmm(att_weight.view(*att_view), src_encoding).squeeze(1) 26 | 27 | return ctx_vec, att_weight 28 | 29 | 30 | def length_array_to_mask_tensor(length_array, cuda=False, valid_entry_has_mask_one=False): 31 | max_len = max(length_array) 32 | batch_size = len(length_array) 33 | 34 | mask = np.zeros((batch_size, max_len), dtype=np.uint8) 35 | for i, seq_len in enumerate(length_array): 36 | if valid_entry_has_mask_one: 37 | mask[i][:seq_len] = 1 38 | else: 39 | mask[i][seq_len:] = 1 40 | 41 | mask = torch.ByteTensor(mask) 42 | return mask.cuda() if cuda else mask 43 | 44 | 45 | def input_transpose(sents, pad_token): 46 | """ 47 | transform the input List[sequence] of size (batch_size, max_sent_len) 48 | into a list of size (max_sent_len, batch_size), with proper padding 49 | """ 50 | max_len = max(len(s) for s in sents) 51 | batch_size = len(sents) 52 | 53 | sents_t = [] 54 | for i in range(max_len): 55 | sents_t.append([sents[k][i] if len(sents[k]) > i else pad_token for k in range(batch_size)]) 56 | 57 | return sents_t 58 | 59 | 60 | def word2id(sents, vocab): 61 | if type(sents[0]) == list: 62 | return [[vocab[w] for w in s] for s in sents] 63 | else: 64 | return [vocab[w] for w in sents] 65 | 66 | 67 | def id2word(sents, vocab): 68 | if type(sents[0]) == list: 69 | return [[vocab.idxToLabel[w] for w in s] for s in sents] 70 | else: 71 | return [vocab.idxToLabel[w] for w in sents] 72 | 73 | 74 | def to_input_variable(sequences, vocab, cuda=False, training=True, append_boundary_sym=False): 75 | """ 76 | given a list of sequences, 77 | return a tensor of shape (max_sent_len, batch_size) 78 | """ 79 | if append_boundary_sym: 80 | sequences = [[''] + seq + 
[''] for seq in sequences] 81 | 82 | word_ids = word2id(sequences, vocab) 83 | sents_t = input_transpose(word_ids, vocab['']) 84 | 85 | sents_var = torch.LongTensor(sents_t) 86 | if cuda: 87 | sents_var = sents_var.cuda() 88 | 89 | return sents_var 90 | 91 | 92 | def uniform_init(lower, upper, params): 93 | for p in params: 94 | p.data.uniform_(lower, upper) 95 | 96 | 97 | def kaiming_init(params): 98 | for p in params: 99 | if len(p.data.size()) > 1: 100 | init.kaiming_normal_(p.data) 101 | 102 | 103 | def glorot_init(params): 104 | for p in params: 105 | if len(p.data.size()) > 1: 106 | init.xavier_normal_(p.data) 107 | 108 | 109 | def identity(x): 110 | return x 111 | 112 | 113 | class LabelSmoothing(nn.Module): 114 | """Implement label smoothing. 115 | 116 | Reference: the annotated transformer 117 | """ 118 | 119 | def __init__(self, smoothing, tgt_vocab_size, ignore_indices=None): 120 | if ignore_indices is None: 121 | ignore_indices = [] 122 | 123 | super(LabelSmoothing, self).__init__() 124 | 125 | self.criterion = nn.KLDivLoss(reduction='none') 126 | smoothing_value = smoothing / float(tgt_vocab_size - 1 - len(ignore_indices)) 127 | one_hot = torch.zeros((tgt_vocab_size,)).fill_(smoothing_value) 128 | for idx in ignore_indices: 129 | one_hot[idx] = 0. 130 | 131 | self.confidence = 1.0 - smoothing 132 | self.register_buffer('one_hot', one_hot.unsqueeze(0)) 133 | 134 | def forward(self, model_prob, target): 135 | # (batch_size, *, tgt_vocab_size) 136 | dim = list(model_prob.size())[:-1] + [1] 137 | true_dist = self.one_hot.repeat(*dim) 138 | true_dist.scatter_(-1, target.unsqueeze(-1), self.confidence) 139 | 140 | return self.criterion(model_prob, true_dist).sum(dim=-1) 141 | 142 | 143 | class FeedForward(nn.Module): 144 | """Feed forward neural network adapted from AllenNLP""" 145 | 146 | def __init__(self, input_dim, num_layers, hidden_dims, activations, dropout): 147 | super(FeedForward, self).__init__() 148 | 149 | if not isinstance(hidden_dims, list): 150 | hidden_dims = [hidden_dims] * num_layers # type: ignore 151 | if not isinstance(activations, list): 152 | activations = [activations] * num_layers # type: ignore 153 | if not isinstance(dropout, list): 154 | dropout = [dropout] * num_layers # type: ignore 155 | 156 | self.activations = activations 157 | input_dims = [input_dim] + hidden_dims[:-1] 158 | linear_layers = [] 159 | for layer_input_dim, layer_output_dim in zip(input_dims, hidden_dims): 160 | linear_layers.append(nn.Linear(layer_input_dim, layer_output_dim)) 161 | 162 | self.linear_layers = nn.ModuleList(linear_layers) 163 | dropout_layers = [nn.Dropout(p=value) for value in dropout] 164 | self.dropout = nn.ModuleList(dropout_layers) 165 | self.output_dim = hidden_dims[-1] 166 | self.input_dim = input_dim 167 | 168 | def forward(self, x): 169 | output = x 170 | for layer, activation, dropout in zip(self.linear_layers, self.activations, self.dropout): 171 | output = dropout(activation(layer(output))) 172 | return output 173 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/model/pointer_net.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.nn.utils 7 | 8 | 9 | class PointerNet(nn.Module): 10 | def __init__(self, query_vec_size, src_encoding_size, attention_type='affine'): 11 | super(PointerNet, self).__init__() 12 | 13 | assert attention_type in ('affine', 
'dot_prod') 14 | if attention_type == 'affine': 15 | self.src_encoding_linear = nn.Linear(src_encoding_size, query_vec_size, bias=False) 16 | 17 | self.attention_type = attention_type 18 | 19 | def forward(self, src_encodings, src_token_mask, query_vec): 20 | """ 21 | :param src_encodings: Variable(batch_size, src_sent_len, hidden_size * 2) 22 | :param src_token_mask: Variable(batch_size, src_sent_len) 23 | :param query_vec: Variable(tgt_action_num, batch_size, query_vec_size) 24 | :return: Variable(tgt_action_num, batch_size, src_sent_len) 25 | """ 26 | 27 | # (batch_size, 1, src_sent_len, query_vec_size) 28 | if self.attention_type == 'affine': 29 | src_encodings = self.src_encoding_linear(src_encodings) 30 | src_encodings = src_encodings.unsqueeze(1) 31 | 32 | # (batch_size, tgt_action_num, query_vec_size, 1) 33 | q = query_vec.permute(1, 0, 2).unsqueeze(3) 34 | 35 | # (batch_size, tgt_action_num, src_sent_len) 36 | weights = torch.matmul(src_encodings, q).squeeze(3) 37 | 38 | # (tgt_action_num, batch_size, src_sent_len) 39 | weights = weights.permute(1, 0, 2) 40 | 41 | if src_token_mask is not None: 42 | # (tgt_action_num, batch_size, src_sent_len) 43 | src_token_mask = src_token_mask.unsqueeze(0).expand_as(weights) 44 | weights.data.masked_fill_(src_token_mask.bool(), -float('inf')) 45 | 46 | ptr_weights = F.softmax(weights, dim=-1) 47 | 48 | return ptr_weights 49 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/query_treebank.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import pickle 4 | 5 | from multivac.src.gan.gen_pyt.asdl.lang.eng.eng_asdl_helper import \ 6 | english_ast_to_asdl_ast 7 | from multivac.src.gan.gen_pyt.asdl.lang.eng.grammar import (EnglishASDLGrammar, 8 | EnglishGrammar) 9 | from multivac.src.gan.gen_pyt.astnode import ASTNode 10 | from multivac.src.rdf_graph.rdf_parse import (StanfordParser, check_parse, 11 | clean_queries, stanford_parse) 12 | 13 | 14 | def find_match_paren(s): 15 | count = 0 16 | 17 | for i, c in enumerate(s): 18 | if c == "(": 19 | count += 1 20 | elif c == ")": 21 | count -= 1 22 | 23 | if count == 0: 24 | return i 25 | 26 | 27 | def get_eng_tree(text, depth=0, debug=False): 28 | ''' Takes a constituency parse string of an English sentence and creates 29 | an ASTNode tree from it. 30 | 31 | Example input: 32 | '(ROOT (SBARQ (WHADVP (WRB Why)) (SQ (VBP do) (NP (NNS birds)) (ADVP 33 | (RB suddenly)) (VP (VB appear) (SBAR (WHADVP (WRB whenever)) (S (NP 34 | (PRP you)) (VP (VBP are) (ADJP (JJ near))))))) (. 
?)))' 35 | ''' 36 | 37 | if debug: 38 | print(("\t" * depth + "String: '{}'".format(text))) 39 | 40 | try: 41 | tree_str = text[text.index("(") + 1:text.rfind(")")] 42 | except ValueError: 43 | print(("Malformatted parse string: '{}'".format(text))) 44 | raise ValueError 45 | 46 | next_idx = tree_str.index(" ") 47 | 48 | tree = ASTNode(tree_str[:next_idx]) 49 | if debug: 50 | print(("\t" * depth + "Type: '{}'".format(tree.type))) 51 | 52 | if "(" in tree_str: 53 | while "(" in tree_str: 54 | tree_str = tree_str[tree_str.index("("):] 55 | next_idx = find_match_paren(tree_str) + 1 56 | tree.add_child(get_eng_tree(tree_str[:next_idx], depth+1, debug)) 57 | tree_str = tree_str[next_idx + 1:] 58 | else: 59 | tree.value = tree_str[next_idx + 1:] 60 | if debug: 61 | print(("\t" * depth + "Value: " + tree.value)) 62 | 63 | return tree 64 | 65 | 66 | def get_grammar(parse_trees, verbose=False): 67 | rules = set() 68 | 69 | for parse_tree in parse_trees: 70 | parse_tree_rules, rule_parents = parse_tree.get_productions() 71 | for rule in parse_tree_rules: 72 | rules.add(rule) 73 | 74 | rules = list(sorted(rules, key=lambda x: x.__repr__())) 75 | grammar = EnglishGrammar(rules) 76 | 77 | if verbose: 78 | print(('num. rules: %d', len(rules))) 79 | 80 | return grammar 81 | 82 | 83 | def parse_raw(parser, query): 84 | try: 85 | query = stanford_parse(parser, query) 86 | except Exception: 87 | print('Could not parse query: {}'.format(query)) 88 | return None 89 | 90 | try: 91 | result = get_eng_tree(query.parse_string) 92 | except Exception: 93 | print("Could not interpret query parse: {}".format(query.parse_string)) 94 | return None 95 | 96 | return result 97 | 98 | 99 | def extract_grammar(source_file, output=None, clean=False, verbose=False, 100 | asdl=False): 101 | parse_trees = list() 102 | 103 | if asdl: 104 | parse_func = english_ast_to_asdl_ast 105 | else: 106 | parse_func = get_eng_tree 107 | 108 | parser = StanfordParser(annots="tokenize ssplit parse") 109 | 110 | with open(source_file, 'r') as f: 111 | queries = f.readlines() 112 | 113 | if clean: 114 | queries = clean_queries(queries, verbose) 115 | 116 | if verbose: 117 | print("Performing constituency parsing of queries") 118 | 119 | for i, q in enumerate(queries): 120 | if len(q) > 0: 121 | try: 122 | query = stanford_parse(parser, q) 123 | except Exception: 124 | print('Could not parse query {}: "{}"'.format(i, q)) 125 | continue 126 | 127 | if check_parse(query): 128 | try: 129 | parse_trees.append(parse_func(query.parse_string)) 130 | except Exception: 131 | print(("Could not interpret query parse {}: '{}'".format(i, query))) 132 | continue 133 | 134 | if i % 100 == 0: 135 | print("{} queries processed.".format(i)) 136 | 137 | if verbose: 138 | print(("{} queries successfully parsed.".format(len(parse_trees)))) 139 | print("Extracting grammar production rules.") 140 | 141 | if asdl: 142 | productions = set() 143 | 144 | for parse_tree in parse_trees: 145 | productions.update(parse_tree.get_productions()) 146 | 147 | grammar = EnglishASDLGrammar(productions=productions) 148 | else: 149 | rules = set() 150 | 151 | for parse_tree in parse_trees: 152 | parse_tree_rules, _ = parse_tree.get_productions() 153 | 154 | for rule in parse_tree_rules: 155 | rules.add(rule) 156 | 157 | rules = list(sorted(rules, key=lambda x: x.__repr__())) 158 | grammar = EnglishGrammar(rules) 159 | 160 | if verbose: 161 | print("Grammar induced successfully.") 162 | 163 | if output is not None: 164 | with open(output, 'wb') as f: 165 | pickle.dump(grammar, f) 166 | 
else: 167 | return grammar, parse_trees 168 | 169 | 170 | if __name__ == '__main__': 171 | parser = argparse.ArgumentParser( 172 | description='Compile grammar from query examples.') 173 | parser.add_argument('-q', '--queries', required=True, 174 | help='Path to queries.') 175 | parser.add_argument('-o', '--output', 176 | help='Filename for output.') 177 | parser.add_argument('-c', '--clean', action='store_true', default=False, 178 | help='Pre-clean queries before populating.') 179 | parser.add_argument('-v', '--verbose', action='store_true', default=False, 180 | help='Print verbose output on progress.') 181 | parser.add_argument('-a', '--asdl', action='store_true', default=False, 182 | help='Return grammar in ASDL mode.') 183 | 184 | args_dict = vars(parser.parse_args()) 185 | 186 | extract_grammar(args_dict['queries'], 187 | args_dict['output'], 188 | args_dict['clean'], 189 | args_dict['verbose'], 190 | args_dict['asdl']) 191 | -------------------------------------------------------------------------------- /src/gan/gen_test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from multivac.src.gan.gen_pyt.asdl.lang.eng.eng_asdl_helper import \ 4 | asdl_ast_to_english 5 | from multivac.src.gan.gen_pyt.model.parser import Parser 6 | 7 | 8 | def run(args): 9 | ''' 10 | Load GAN generator model 11 | Apply query items 12 | Return beam search results 13 | ''' 14 | 15 | if isinstance(args['model'], str): 16 | netG = Parser.load(args['model']) 17 | else: 18 | netG = args['model'] 19 | if isinstance(args['query'], str): 20 | query = args['query'].split() 21 | else: 22 | query = args['query'] 23 | 24 | results = netG.parse(query, beam_size=netG.args['beam_size']) 25 | texts = [asdl_ast_to_english(x.tree) for x in results] 26 | 27 | return texts 28 | 29 | 30 | if __name__ == '__main__': 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument('-m', '--model', 33 | help='Path to model checkpoint file.') 34 | parser.add_argument('-q', '--query', nargs='+', required=False, 35 | help='Query tokens for generating a question.') 36 | 37 | args = vars(parser.parse_args()) 38 | 39 | results = run(args) 40 | print(results) 41 | -------------------------------------------------------------------------------- /src/gan/utilities/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/src/gan/utilities/__init__.py -------------------------------------------------------------------------------- /src/gan/utilities/rollout.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | import os 4 | from tqdm import tqdm 5 | from spacy.tokenizer import Tokenizer 6 | from spacy.vocab import Vocab 7 | 8 | import torch 9 | import torch.nn as nn 10 | from torch.utils.data import Dataset, DataLoader 11 | 12 | from discriminator import MULTIVACDataset, Tree 13 | from gen_pyt.asdl.lang.eng.eng_asdl_helper import asdl_ast_to_english 14 | from gen_pyt.model.parser import Parser 15 | # from .tree_rollout import rollout_samples 16 | from multivac.src.gan.gen_pyt.components.decode_hypothesis import DecodeHypothesis 17 | 18 | from multivac.src.rdf_graph.rdf_parse import StanfordParser 19 | 20 | class RolloutDataset(Dataset): 21 | def __init__(self, data): 22 | super().__init__() 23 | self.data = data 24 | self.size = self.data.shape[0] 25 | def __len__(self): 26 | 
return self.size 27 | def __getitem__(self, index): 28 | return copy.deepcopy(self.data[index]) 29 | 30 | class Engine(object): 31 | def __init__(self, step=0): 32 | self.step = step 33 | 34 | class Rollout(object): 35 | def __init__(self, rollout_num, vocab): 36 | #self.new_net = copy.deepcopy(net) 37 | self.vocab = vocab 38 | self.tokenizer = Tokenizer(Vocab(strings=list(vocab.labelToIdx.keys()))) 39 | self.rollout_num = rollout_num 40 | self.parser = StanfordParser(annots='tokenize') 41 | 42 | def hyp_to_parse(self, hyp, vocab): 43 | if isinstance(hyp, str): 44 | text = hyp 45 | else: 46 | text = asdl_ast_to_english(hyp.tree) 47 | 48 | parse = self.parser.get_parse(text)['sentences'] 49 | 50 | if len(parse) > 0: 51 | tokens = [x['word'] for x in parse[0]['tokens']] 52 | deps = sorted(parse[0]['basicDependencies'], 53 | key=lambda x: x['dependent']) 54 | parents = [x['governor'] for x in deps] 55 | tree = MULTIVACDataset.read_tree(parents) 56 | inp = torch.tensor(vocab.convertToIdx(tokens, '<unk>'), 57 | dtype=torch.long, device='cpu') 58 | else: 59 | tree = Tree() 60 | inp = torch.tensor([]) 61 | 62 | return tree, inp 63 | 64 | def parse_tokens(self, tree): 65 | text = asdl_ast_to_english(tree) 66 | tokens = [x.text for x in self.tokenizer(text)] 67 | result = torch.tensor(self.vocab.convertToIdx(tokens, '<unk>'), 68 | dtype=torch.long, 69 | device='cpu') 70 | return result 71 | 72 | @staticmethod 73 | def parse_to_trees(parses, vocab): 74 | results = [''] * len(parses) 75 | 76 | for idx, parse in enumerate(parses): 77 | tokens = [x['word'] for x in parse['tokens']] 78 | deps = sorted(parse['basicDependencies'], 79 | key=lambda x: x['dependent']) 80 | parents = [x['governor'] for x in deps] 81 | tree = MULTIVACDataset.read_tree(parents) 82 | results[idx] = (tree, torch.tensor(vocab.convertToIdx(tokens, '<unk>'), 83 | dtype=torch.long, device='cpu')) 84 | 85 | return results 86 | 87 | @staticmethod 88 | def ffwd_hyp(hyp, j): 89 | new_hyp = DecodeHypothesis() 90 | 91 | for i in range(j): 92 | if i < len(hyp.action_infos): 93 | new_hyp.apply_action_info(hyp.action_infos[i]) 94 | 95 | return new_hyp 96 | 97 | def get_tree_reward(self, hyps, states, examples, 98 | netG, netD, vocab, verbose=False): 99 | batch_size = len(hyps) 100 | src_sents = [e.src_sent for e in examples] 101 | rewards = [] 102 | max_action_len = max([len(hyp.actions) for hyp in hyps]) 103 | 104 | netD.eval() 105 | 106 | for i in range(self.rollout_num): 107 | if verbose: print("Rollout step {}".format(i)) 108 | 109 | samples = [[0] * batch_size] * max_action_len 110 | inputs = [[0] * batch_size] * max_action_len 111 | # texts = [[0] * batch_size] * max_action_len 112 | 113 | for j in tqdm(range(1, max_action_len)): 114 | for n in range(batch_size): 115 | src = src_sents[n] 116 | hyp = Rollout.ffwd_hyp(hyps[n], j) 117 | state = states[n][:j] 118 | samples[j-1][n] = netG.sample(src, hyp, state) 119 | 120 | if verbose: print("Samples generated of shape " 121 | "({},{})".format(max_action_len, batch_size)) 122 | 123 | for x in tqdm(range(max_action_len), "Translating trees..."): 124 | for h, hyp in enumerate(samples[x]): 125 | inputs[x][h] = self.parse_tokens(hyp.tree) 126 | 127 | for j in range(max_action_len): 128 | samps = torch.full((len(inputs[j]), 150), vocab.pad) 129 | 130 | for idx, x in enumerate(inputs[j]): 131 | samps[idx, :len(x)] = x[:150] 132 | 133 | x = samps.long().to(netD.args['device']) 134 | out = netD(x).softmax(dim=-1).data[:,1].numpy() 135 | 136 | if i == 0: 137 | rewards.append(out) 138 | else: 139 | rewards[j] += out 140
| 141 | originals = [self.parse_tokens(hyp.tree) for hyp in hyps] 142 | 143 | for j in tqdm(range(batch_size), desc="Rating action step {}...".format(max_action_len)): 144 | samps = torch.full((len(originals), 150), vocab.pad) 145 | 146 | for idx, x in enumerate(originals): 147 | samps[idx, :len(x)] = x[:150] 148 | 149 | x = samps.long().to(netD.args['device']) 150 | out = netD(x).softmax(dim=-1).data[:,1].numpy() 151 | 152 | if i == 0: 153 | rewards.append(out) 154 | else: 155 | rewards[-1] += out 156 | 157 | rewards = np.array(rewards) / (1.0 * self.rollout_num) 158 | 159 | return rewards 160 | -------------------------------------------------------------------------------- /src/gan/utilities/shuffled_queries.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import os 4 | import pandas as pd 5 | from random import shuffle 6 | 7 | def run(args_dict): 8 | DIR = os.path.dirname(args_dict['file']) 9 | file = os.path.basename(args_dict['file']) 10 | 11 | with open(args_dict['file']) as f: 12 | clean_txt = f.readlines() 13 | 14 | df_clean = pd.DataFrame(clean_txt) 15 | df_clean.columns = ['query'] 16 | # clean queries will contain a label of 1 17 | df_clean['label'] = 1 18 | 19 | # Tokenizing and shuffling each query 20 | # These will be labeled as 0 21 | shuffles = [] 22 | for txt in clean_txt: 23 | txt = txt.split(" ") 24 | shuffle(txt) 25 | shuffles.append(" ".join(txt)) 26 | 27 | shuffled_df = pd.DataFrame() 28 | shuffled_df['query'] = shuffles 29 | shuffled_df['label'] = 0 30 | 31 | final_df = pd.concat([df_clean, shuffled_df]).reset_index() 32 | final_df.rename(columns={'index': 'id'}, inplace=True) 33 | final_df['query'] = final_df['query'].apply(lambda x: x.replace("\n", "")) 34 | 35 | np.savetxt(os.path.join(DIR,"extracted_questions_labels.txt"), 36 | final_df.values, newline='\n', fmt=["%s", "%s", "%s"], 37 | delimiter='\t', 38 | header='id query label') 39 | 40 | 41 | if __name__ == '__main__': 42 | parser = argparse.ArgumentParser(description='Create shuffled queries' 43 | 'from .txt file for GAN.') 44 | parser.add_argument('-f', '--file', required=True, 45 | help='Path to source query file.') 46 | 47 | args_dict = vars(parser.parse_args()) 48 | 49 | run(args_dict) 50 | 51 | -------------------------------------------------------------------------------- /src/gan/utilities/utils.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import pickle 4 | import math 5 | 6 | import torch 7 | 8 | from multivac.src.gan.utilities.vocab import Vocab 9 | 10 | 11 | # write unique words from a set of files to a new file 12 | def build_vocab(filenames, vocabfile, lowercase=True): 13 | vocab = set() 14 | 15 | for filename in filenames: 16 | with open(filename, 'r') as f: 17 | for line in f: 18 | if lowercase: 19 | line = line.lower() 20 | 21 | tokens = line.rstrip('\n').split(' ') 22 | vocab |= set(tokens) 23 | 24 | with open(vocabfile, 'w') as f: 25 | for token in sorted(vocab): 26 | f.write(token + '\n') 27 | 28 | class cached_property(object): 29 | """ A property that is only computed once per instance and then replaces 30 | itself with an ordinary attribute. Deleting the attribute resets the 31 | property. 
32 | 33 | Source: https://github.com/bottlepy/bottle/commit/fa7733e075da0d790d809aa3d2f53071897e6f76 34 | """ 35 | 36 | def __init__(self, func): 37 | self.__doc__ = getattr(func, '__doc__') 38 | self.func = func 39 | 40 | def __get__(self, obj, cls): 41 | if obj is None: 42 | return self 43 | value = obj.__dict__[self.func.__name__] = self.func(obj) 44 | return value 45 | 46 | def deserialize_from_file(path): 47 | with open(path, 'rb') as f: 48 | obj = pickle.load(f) 49 | 50 | return obj 51 | 52 | # loading GLOVE word vectors 53 | # if .pth file is found, will load that 54 | # else will load from .txt file & save 55 | def load_word_vectors(path, lowercase=True): 56 | if os.path.isfile(path + '.pth') and os.path.isfile(path + '.vocab'): 57 | print('==> File found, loading to memory') 58 | vectors = torch.load(path + '.pth') 59 | vocab = Vocab(filename=path + '.vocab', lower=lowercase) 60 | 61 | return vocab, vectors 62 | elif path.endswith('.pkl'): 63 | print('==> File found, loading to memory') 64 | 65 | with open(path, "rb") as f: 66 | glove = pickle.load(f) 67 | 68 | vectors = torch.from_numpy(glove['embeddings']).float() 69 | vocab = Vocab(data=glove['vocab'], lower=lowercase) 70 | 71 | return vocab, vectors 72 | 73 | # saved file not found, read from txt file 74 | # and create tensors for word vectors 75 | print('==> File not found, preparing, be patient') 76 | 77 | 78 | count = sum(1 for line in open(path + '.txt', 'r', encoding='utf8', errors='ignore')) 79 | 80 | with open(path + '.txt', 'r') as f: 81 | contents = f.readline().rstrip('\n').split(' ') 82 | dim = len(contents[1:]) 83 | 84 | words = [None] * (count) 85 | vectors = torch.zeros(count, dim, dtype=torch.float, device='cpu') 86 | 87 | with open(path + '.txt', 'r', encoding='utf8', errors='ignore') as f: 88 | idx = 0 89 | 90 | for line in f: 91 | contents = line.rstrip('\n').split(' ') 92 | words[idx] = contents[0] 93 | values = list(map(float, contents[1:])) 94 | vectors[idx] = torch.tensor(values, dtype=torch.float, device='cpu') 95 | idx += 1 96 | 97 | with open(path + '.vocab', 'w', encoding='utf8', errors='ignore') as f: 98 | for word in words: 99 | f.write(word + '\n') 100 | 101 | vocab = Vocab(filename=path + '.vocab') 102 | torch.save(vectors, path + '.pth') 103 | 104 | return vocab, vectors 105 | 106 | def serialize_to_file(obj, path, protocol=pickle.HIGHEST_PROTOCOL): 107 | with open(path, 'wb') as f: 108 | pickle.dump(obj, f, protocol=protocol) 109 | 110 | def typename(x): 111 | if isinstance(x, str): 112 | return x 113 | return x.__name__ 114 | -------------------------------------------------------------------------------- /src/gan/utilities/vocab.py: -------------------------------------------------------------------------------- 1 | 2 | from collections import Counter 3 | from itertools import chain 4 | 5 | class Vocab(object): 6 | def __init__(self, filename=None, data=None, lower=False): 7 | self.idxToLabel = {} 8 | self.labelToIdx = {} 9 | self.lower = lower 10 | 11 | # Special entries will not be pruned. 
12 | self.special = [] 13 | 14 | if data is not None: 15 | self.addSpecials(data) 16 | if filename is not None: 17 | self.loadFile(filename) 18 | 19 | self.add('<pad>') 20 | self.add('<unk>') 21 | self.add('<eos>') 22 | 23 | def __getitem__(self, item): 24 | return self.labelToIdx.get(item, self.unk) 25 | 26 | def __contains__(self, item): 27 | return item in self.labelToIdx 28 | 29 | @property 30 | def size(self): 31 | return len(self.idxToLabel) 32 | 33 | def __setitem__(self, key, value): 34 | self.labelToIdx[key] = value 35 | 36 | def __len__(self): 37 | return len(self.labelToIdx) 38 | 39 | def __iter__(self): 40 | return iter(list(self.labelToIdx.keys())) 41 | 42 | def __eq__(self, other): 43 | return all([self.idxToLabel == other.idxToLabel, 44 | self.labelToIdx == other.labelToIdx, 45 | self.lower == other.lower, 46 | self.special == other.special]) 47 | 48 | @property 49 | def pad(self): 50 | return self.labelToIdx['<pad>'] 51 | 52 | @property 53 | def unk(self): 54 | return self.labelToIdx['<unk>'] 55 | 56 | @property 57 | def eos(self): 58 | return self.labelToIdx['<eos>'] 59 | 60 | def is_unk(self, word): 61 | return word not in self 62 | 63 | def size(self): 64 | return len(self.idxToLabel) 65 | 66 | # Load entries from a file. 67 | def loadFile(self, filename): 68 | idx = 0 69 | for line in open(filename, 'r', encoding='utf8', errors='ignore'): 70 | token = line.rstrip('\n') 71 | self.add(token) 72 | idx += 1 73 | 74 | def getIndex(self, key, default=None): 75 | key = key.lower() if self.lower else key 76 | 77 | return self.labelToIdx.get(key, default) 78 | 79 | def getLabel(self, idx, default=None): 80 | return self.idxToLabel.get(idx, default) 81 | 82 | def add_from_data(self, label, idx=None): 83 | if idx is not None: 84 | self.idxToLabel[idx] = label 85 | self.labelToIdx[label] = idx 86 | else: 87 | idx = self.add(label) 88 | 89 | # Mark this `label` and `idx` as special 90 | def addSpecial(self, label): 91 | idx = self.add(label) 92 | self.special += [idx] 93 | 94 | # Mark all labels in `labels` as specials 95 | def addSpecials(self, labels): 96 | for label in labels: 97 | if isinstance(label, tuple): 98 | self.add_from_data(*label) 99 | else: 100 | self.addSpecial(label) 101 | 102 | # Add `label` in the dictionary. Use `idx` as its index if given. 103 | def add(self, label): 104 | label = label.lower() if self.lower else label 105 | 106 | if label in self.labelToIdx: 107 | idx = self.labelToIdx[label] 108 | else: 109 | idx = len(self.idxToLabel) 110 | self.idxToLabel[idx] = label 111 | self.labelToIdx[label] = idx 112 | return idx 113 | 114 | # Convert `labels` to indices. Use `unkWord` if not found. 115 | # Optionally insert `bosWord` at the beginning and `eosWord` at the end. 116 | def convertToIdx(self, labels, unkWord=None, bosWord=None, eosWord=None): 117 | if unkWord is None: 118 | unk = self.unk 119 | else: 120 | unk = self.getIndex(unkWord) 121 | 122 | vec = [] 123 | 124 | if bosWord is not None: 125 | vec += [self.getIndex(bosWord)] 126 | 127 | 128 | vec += [self.getIndex(label, default=unk) for label in labels] 129 | 130 | if eosWord is not None: 131 | vec += [self.getIndex(eosWord)] 132 | 133 | return vec 134 | 135 | # Convert `idx` to labels. If index `stop` is reached, convert it and return. 
136 | def convertToLabels(self, idx, stop=None): 137 | labels = [] 138 | 139 | for i in idx: 140 | labels += [self.getLabel(i)] 141 | if i == stop: 142 | break 143 | 144 | return labels 145 | 146 | 147 | @staticmethod 148 | def from_corpus(corpus, size=None, freq_cutoff=0): 149 | vocab = Vocab() 150 | 151 | word_freq = Counter(chain(*corpus)) 152 | non_singletons = [w for w in word_freq if word_freq[w] > 1] 153 | singletons = [w for w in word_freq if word_freq[w] == 1] 154 | top_k_words = sorted(word_freq.keys(), reverse=True, key=word_freq.get) 155 | 156 | if size is not None: 157 | top_k_words = top_k_words[:size] 158 | 159 | words_not_included = [] 160 | 161 | for word in top_k_words: 162 | if word_freq[word] >= freq_cutoff: 163 | vocab.add(word) 164 | else: 165 | words_not_included.append(word) 166 | 167 | if len(vocab) == size: 168 | break 169 | 170 | return vocab 171 | 172 | @staticmethod 173 | def from_dict(vocab): 174 | new_vocab = Vocab() 175 | 176 | for key, value in vocab.items(): 177 | setattr(new_vocab, key, value) 178 | 179 | return new_vocab 180 | -------------------------------------------------------------------------------- /src/rdf_graph/build_graph.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import argparse 4 | 5 | from datetime import datetime 6 | 7 | from rdf_graph import RDFGraph 8 | 9 | 10 | def run(args_dict): 11 | # create timestamp 12 | timestamp = datetime.now().strftime('%d%b%Y-%H:%M:%S') 13 | 14 | # instantiate class 15 | knowledge_graph = RDFGraph() 16 | 17 | # Associate a JSON file of source documents from which to induce 18 | # the knowledge graph. 19 | knowledge_graph.set_source(args_dict['sources']) 20 | 21 | print('\nExtracting relation triples from abstracts') 22 | knowledge_graph.extract_raw_tuples() 23 | 24 | # pre-process extracted tuples 25 | print('\nPreprocessing raw relation triples') 26 | knowledge_graph.preprocess_raw_tuples() 27 | 28 | # cluster all entities using fast 29 | # agglomerative clustering and cosine distance of averaged word embeddings 30 | print('\nClustering entities from relation triples') 31 | knowledge_graph.cluster_entities(args_dict['glove']) 32 | print('\n{} entity clusters were found' 33 | .format(len(knowledge_graph.entity_cluster_results['cluster_members']))) 34 | 35 | # output text files that will be used openke for knowledge graph creation 36 | # and embedding output .txt files for openke output 37 | print('\nSaving final tuples to .txt files for input to OpenKE') 38 | knowledge_graph.output_to_openke(timestamp) 39 | 40 | 41 | if __name__ == '__main__': 42 | parser = argparse.ArgumentParser(description='Fetcher to retrieve articles ' 43 | 'for modeling.') 44 | parser.add_argument('-s', '--sources', required=True, 45 | help='Select a source for article retrieval.') 46 | parser.add_argument('-g', '--glove', required=True, 47 | help='Path to pickle file containing glove embeddings') 48 | args_dict = vars(parser.parse_args()) 49 | 50 | run(args_dict) 51 | -------------------------------------------------------------------------------- /src/rdf_graph/environment.yml: -------------------------------------------------------------------------------- 1 | name: nlp 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.7 6 | - pandas 7 | - textacy 8 | - neuralcoref 9 | - tqdm 10 | - jupyterlab 11 | - pip 12 | - pip: 13 | - nltk 14 | - stanfordcorenlp==3.9.1.1 15 | - 
https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.1.3/en_core_web_sm-2.1.0.tar.gz 16 | -------------------------------------------------------------------------------- /src/utilities.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def mkdir(directory, verbose=False): 5 | """Make a directory if it doesn't already exist.""" 6 | if not os.path.exists(directory): 7 | if verbose: 8 | print('Make directory %s' % directory) 9 | 10 | os.makedirs(directory) 11 | elif verbose: 12 | print('Directory %s already exists' % directory) 13 | 14 | 15 | def dict_str(my_dict, results=''): 16 | '''Poor man's version of a pretty-ish print function for dictionaries to 17 | expose the basic structure without printing all the values. ''' 18 | results += '{' 19 | results += ', '.join(['{}: {}'.format(k, type(v)) if not isinstance(v, dict) else '{}:\n \t {} \n'.format(k, dict_str(v)) for k, v in my_dict.items()]) 20 | 21 | results += '}' 22 | 23 | return results 24 | -------------------------------------------------------------------------------- /stanford-corenlp-full/rdf_graph.properties: -------------------------------------------------------------------------------- 1 | port = 9000 2 | timeout = 45000 3 | ner.model = edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz 4 | ner.useSUTime = false 5 | ner.applyNumericClassifiers = false -------------------------------------------------------------------------------- /sys/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/sys/.gitkeep -------------------------------------------------------------------------------- /templates/base.html: --------------------------------------------------------------------------------
<!-- [static HTML markup omitted; surviving page text and Jinja blocks follow] -->
MULTIVAC | Gallup DSS
{% block header %}{% endblock %}
{% block title %}{% endblock %}
{% block description %}{% endblock %}
System Process
{% block leftcontent %}{% endblock %}
Code Review
{% block rightcontent %}{% endblock %}
{% block scripts %}{% endblock %}
-------------------------------------------------------------------------------- /templates/query.html: --------------------------------------------------------------------------------
{% extends 'base.html' %}
{% block title %}
Gallup - MULTIVAC
{% endblock %}
{% block description %}
A DARPA Automating Scientific Knowledge Extraction (ASKE) Artificial Intelligence Exploration (AIE) Program
{% endblock %}
{% block leftcontent %}
Map queries to knowledge graph
<!-- [query form markup omitted] -->
{% endblock %}
{% block rightcontent %}
<!-- [static HTML markup omitted] -->
{% endblock %}
--------------------------------------------------------------------------------