├── .dockerignore ├── .gitignore ├── Dockerfile ├── README.md ├── __init__.py ├── app.py ├── calculate_network_change.py ├── conductor.py ├── doc ├── README.md ├── aske_context.md ├── innovations.md ├── installation.md ├── lessons_learned.md ├── notebooks │ ├── Domain_Adapted_Glove.ipynb │ ├── Parsing.ipynb │ ├── alpha │ │ ├── Get.ipynb │ │ ├── Process.ipynb │ │ └── README.md │ ├── directed_query_gen_walkthrough.ipynb │ ├── gan_training_illustration.ipynb │ ├── key_triples_walkthrough.ipynb │ ├── kg_predict_walkthrough.ipynb │ ├── kg_query_walkthrough.ipynb │ ├── precooked_replication.ipynb │ ├── prepared_output.ipynb │ └── pure_generation_walkthrough.ipynb └── phase_two_developments.md ├── docker-compose.yml ├── get_kg_query_params.py ├── images ├── KCCA_equation.png ├── MULTIVAC_schematic.png ├── aske_schematic_v1.5.png ├── aske_schematic_v1.png ├── emulated.png ├── emulated_kg.png ├── formula.png ├── formula_dependencies.png ├── gan.png ├── gan_design.png ├── key_triples.png ├── krongen.png ├── latex_parse_1.png ├── latex_parse_2.png ├── multivac_concept.png ├── phase_one_system.png ├── qgnet.png ├── simple_kg.png └── stanford_dependecies.png ├── multivac.cfg ├── predict_kg.py ├── pymln ├── LICENSE ├── README.md ├── __init__.py ├── eval │ ├── Answer.py │ ├── Question.py │ ├── USP.py │ └── __init__.py ├── pymln.py ├── semantic │ ├── Agenda.py │ ├── Argument.py │ ├── Clust.py │ ├── Executor.py │ ├── MLN.py │ ├── Parse.py │ ├── ParseParams.py │ ├── Part.py │ ├── Scorer.py │ ├── SearchOp.py │ ├── __init__.py │ └── argclust.py ├── syntax │ ├── Nodes │ │ ├── Article.py │ │ ├── Sentence.py │ │ ├── Token.py │ │ ├── TreeNode.py │ │ └── __init__.py │ ├── Relations │ │ ├── ArgType.py │ │ ├── Path.py │ │ ├── RelType.py │ │ └── __init__.py │ ├── StanfordParseReader.py │ └── __init__.py └── utils │ ├── Utils.py │ └── __init__.py ├── requirements.txt ├── settings.py ├── src ├── .gitkeep ├── __init__.py ├── data │ ├── .gitkeep │ ├── clean_documents.py │ ├── clean_questions.py │ ├── clean_text.py │ ├── equationparsing.py │ ├── extract_text.py │ ├── get.py │ ├── glove.py │ ├── make.py │ ├── parsing.py │ ├── process.py │ ├── qgnet.py │ ├── textparsing.py │ ├── trainEmbeddings.R │ └── write_mln_to_graph_db.py ├── gan │ ├── __init__.py │ ├── config.cfg │ ├── discriminator │ │ ├── __init__.py │ │ ├── dataset.py │ │ ├── model.py │ │ ├── scripts │ │ │ └── preprocess-multivac.py │ │ ├── trainer.py │ │ └── tree.py │ ├── gen_pyt │ │ ├── __init__.py │ │ ├── asdl │ │ │ ├── __init__.py │ │ │ ├── asdl.py │ │ │ ├── asdl_ast.py │ │ │ ├── hypothesis.py │ │ │ ├── lang │ │ │ │ ├── __init__.py │ │ │ │ ├── eng │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── eng_asdl_helper.py │ │ │ │ │ ├── eng_transition_system.py │ │ │ │ │ └── grammar.py │ │ │ │ └── grammar.py │ │ │ └── transition_system.py │ │ ├── astnode.py │ │ ├── components │ │ │ ├── __init__.py │ │ │ ├── action_info.py │ │ │ ├── dataset.py │ │ │ ├── decode_hypothesis.py │ │ │ └── vocab.py │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ └── english │ │ │ │ ├── __init__.py │ │ │ │ └── dataset.py │ │ ├── model │ │ │ ├── __init__.py │ │ │ ├── attention_util.py │ │ │ ├── lstm.py │ │ │ ├── nn_utils.py │ │ │ ├── parser.py │ │ │ └── pointer_net.py │ │ └── query_treebank.py │ ├── gen_test.py │ ├── querygan_pyt.py │ └── utilities │ │ ├── __init__.py │ │ ├── rollout.py │ │ ├── shuffled_queries.py │ │ ├── tree_rollout.py │ │ ├── utils.py │ │ └── vocab.py ├── link_prediction │ └── MULTIVAC_link_prediction.py ├── rdf_graph │ ├── build_graph.py │ ├── environment.yml │ ├── map_queries.py │ ├── 
rdf_extract.py │ ├── rdf_graph.py │ └── rdf_parse.py └── utilities.py ├── stanford-corenlp-full └── rdf_graph.properties ├── sys └── .gitkeep └── templates ├── base.html └── query.html /.dockerignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | __pycache__ 3 | .git 4 | .gitignore 5 | venv 6 | env 7 | 8 | docker-compose.yml 9 | Dockerfile 10 | .dockerignore -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # directories 2 | /data/ 3 | /models/ 4 | 5 | # pycharm files 6 | .idea/ 7 | 8 | # scratch notebook directories 9 | scratch_notebooks/peter/ 10 | 11 | # downloaded stanford nlp models 12 | stanford_nlp_models/ 13 | 14 | # environment 15 | multivac/ 16 | .env 17 | src/pubmed-parser 18 | src/slate 19 | 20 | # juypter notebook 21 | .ipynb_checkpoints 22 | 23 | # Byte-compiled / optimized / DLL files 24 | __pycache__/ 25 | *.py[cod] 26 | *$py.class 27 | 28 | # workspaces 29 | .code-workspace 30 | 31 | # system files 32 | .DS_Store 33 | 34 | # logs 35 | *.log 36 | 37 | # flat-files 38 | *.json 39 | *.xml 40 | *.csv 41 | 42 | 43 | # envs 44 | venv/ 45 | env/ 46 | virtualenv/ -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # use ubuntu as the base image; install R and Python on top 2 | FROM ubuntu:latest 3 | 4 | # avoid humna input for geography and stuff 5 | ENV DEBIAN_FRONTEND=noninteractive 6 | 7 | # install R and python 8 | RUN apt-get update && apt-get install -y --no-install-recommends build-essential r-base python3.7 python3-pip python3-setuptools python3-dev git 9 | 10 | # copy requirements over to application 11 | COPY requirements.txt /multivac/requirements.txt 12 | 13 | WORKDIR /multivac 14 | 15 | # set up bdist_wheel 16 | RUN pip3 install wheel --no-cache-dir 17 | 18 | RUN pip3 install setuptools --no-cache-dir 19 | 20 | # env setup 21 | RUN pip3 install torch==1.2.0 --no-cache-dir 22 | RUN pip3 install -r requirements.txt --no-cache-dir 23 | 24 | RUN git clone https://github.com/thunlp/OpenKE && cd OpenKE && git checkout master && sh make.sh 25 | 26 | COPY . /multivac 27 | 28 | ENV PYTHONPATH "${PYTHONPATH}:/" 29 | 30 | EXPOSE 5000 31 | 32 | CMD python3 app.py 33 | 34 | 35 | ### Look into this if issues with OpenKE sh (production image) 36 | # https://forums.docker.com/t/best-practices-for-git-clone-make-etc-via-dockerfile-run/79152/3 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MULTIVAC 2 | DARPA’s Information Innovation Office’s Automating Scientific Knowledge Extraction (ASKE) program seeks to develop approaches to make it easier for scientists to build, maintain and reason over rich models of complex systems — which could include physical, biological, social, engineered or hybrid systems. By interpreting and exposing scientific knowledge and assumptions in existing model code and documentation, researchers can identify new data and information resources automatically, extracting useful information from these sources, and integrating this useful information into machine-curated expert models for robust modeling. 3 | 4 | MULTIVAC Schematic 5 | GAN Schematic 6 | 7 |
Gallup’s Meta-model Unification Learned Through Inquiry Vectorization and Automated Comprehension (MULTIVAC) effort supports these goals by developing a system that absorbs scientific knowledge — in the form of facts, relationships, models and equations — from a particular domain corpus into a semantic knowledge graph and learns to query that knowledge graph in order to accelerate scientific exploration within the target domain. MULTIVAC consists of an expert query generator trained on a corpus of historical expert queries and tuned dialectically with the use of a Generative Adversarial Network (GAN) architecture. As a prototype system, MULTIVAC focuses on the domain of epidemiological research, and specifically the realm of SIR/SEIR (Susceptible-Infected-Recovered, often with an additional “Exposed” element) compartmental model approaches. It is Gallup’s intent that this system includes a “human-in-the-loop” element, especially during training, to ensure that the system is properly tuned and responsive to the needs and interests of the human researchers it is intended to augment. 8 | 9 | ## System Setup and Operation 10 | - MULTIVAC Installation 11 | 12 | ## System Documentation 13 | - Phase I Development 14 | - Phase II Developments 15 | - Key Innovations 16 | 17 | For more information please contact Principal Investigator, Benjamin Ryan (ben_ryan@gallup.com). 18 | 19 | --- 20 | 21 | ## Acknowledgements 22 | This work is supported by the Defense Advanced Research Projects Agency (DARPA) under Agreement No. HR00111990008. 23 | 24 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/__init__.py -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from flask import Flask, redirect, render_template, request, url_for 4 | 5 | from multivac.src.rdf_graph import map_queries 6 | 7 | app = Flask(__name__) 8 | app.debug = True 9 | app.config['STATIC_FOLDER'] = f'{os.getcwd()}/sys' 10 | 11 | 12 | @app.route('/') 13 | def query(): 14 | 15 | return render_template( 16 | 'query.html' 17 | ) 18 | 19 | 20 | @app.route('/results') 21 | def results(): 22 | 23 | if request.method == 'GET': 24 | 25 | in_dir = os.path.abspath(request.values.get('dir-input')) 26 | out_dir = os.path.abspath(request.values.get('out-input')) 27 | 28 | # make sure these folders exist 29 | assert os.path.exists(out_dir) 30 | assert os.path.exists(in_dir) 31 | 32 | args_dict = { 33 | 'docker_folder_structure': [x for x in os.walk(os.getcwd())], 34 | 'dir': in_dir, 35 | 'model': request.values.get('model-type-input'), 36 | 'out': out_dir, 37 | 'run': request.values.get('run-input'), 38 | 'threshold': request.values.get('threshold-input'), 39 | 'verbose': request.values.get('verbosity-input'), 40 | 'num_top_rel': request.values.get('num-top-input'), 41 | 'search': request.values.get('search-input'), 42 | } 43 | 44 | results = map_queries.run(args_dict) 45 | 46 | return args_dict 47 | 48 | else: 49 | return redirect(url_for('query')) 50 | 51 | 52 | if __name__ == "__main__": 53 | app.run(host="0.0.0.0", port=5000) 54 | -------------------------------------------------------------------------------- /calculate_network_change.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This script is meant to identify relevant nodes based on differences of 5 | centrality measure of real and estimated networks. 6 | """ 7 | import argparse 8 | import json 9 | import networkx as nx 10 | import numpy as np 11 | import os 12 | 13 | from datetime import datetime 14 | 15 | from multivac.get_kg_query_params import build_network, read_txt 16 | 17 | def build_comparison_metrics(n1, n2, mtype): 18 | if 'degree' in mtype: 19 | n1x = nx.degree_centrality(n1) 20 | n2x = nx.degree_centrality(n2) 21 | else: 22 | tol = 1.0e-6 23 | 24 | while True: 25 | try: 26 | n1x = nx.eigenvector_centrality(n1, tol=tol) 27 | n2x = nx.eigenvector_centrality(n2, tol=tol) 28 | break 29 | except: 30 | tol *= 10 31 | print("Increasing tolerance to {}".format(tol)) 32 | continue 33 | 34 | net = {**n1x, **n2x} 35 | for k, v in net.items(): 36 | if k in n1x and k in n2x: 37 | net[k] = [n1x[k], v] 38 | elif k in n1x and k not in n2x: 39 | net[k] = [v, np.nan] 40 | else: 41 | net[k] = [np.nan, v] 42 | 43 | return net 44 | 45 | 46 | def generate_node_changes(net): 47 | res = {} 48 | for k, v in net.items(): 49 | pct_change = (net[k][1] - net[k][0]) / (net[k][0] + 1) 50 | 51 | if not np.isnan(pct_change): 52 | res.update({k: pct_change}) 53 | 54 | return res 55 | 56 | 57 | def generate_result_lists(net, num, ctype=['top', 'bottom']): 58 | res = {} 59 | if 'top' in ctype: 60 | keys = list(net.keys())[-num:] 61 | else: 62 | keys = list(net.keys())[:num] 63 | for key in keys: 64 | res.update({key: net[key]}) 65 | 66 | return res 67 | 68 | def get_items(fpath): 69 | items = {} 70 | 71 | for item, idx in read_txt(fpath): 72 | items[int(idx)] = item 73 | 74 | return items 75 | 76 | def triple_to_labels(triple, ents, rels): 77 | head, tail, rel = trip 78 | return " ".join([ents[head], rels[rel], ents[tail]]) 79 | 80 | 81 | def get_top_triples(ofile, nfile, kg_dir, measure='eigenvector', num_results=100, out=None): 82 | ents = get_items(os.path.join(kg_dir, 'entity2id.txt')) 83 | rels = get_items(os.path.join(kg_dir, 'relation2id.txt')) 84 | triples = read_txt(os.path.join(kg_dir, 'train2id.txt')) 85 | triples = np.array(triples).astype(int) 86 | 87 | # read in new file for comparison 88 | new = read_txt(nfile) 89 | 90 | # create networks 91 | neto = build_network(triples) 92 | netn = build_network(triples + new) 93 | net = build_comparison_metrics(neto, netn, measure) 94 | 95 | # calculate node changes 96 | result = generate_node_changes(net) 97 | result = {k: v for k, v in sorted(result.items(), 98 | key=lambda item: item[1])} 99 | 100 | # generate results of interest 101 | gains = generate_result_lists(result, len(result), 'top') 102 | 103 | trip_scores = np.zeros(triples.shape[0]) 104 | 105 | for i, trip in enumerate(triples): 106 | headgain = tailgain = 0 107 | head, tail, _ = trip 108 | trip_scores[i] = gains.get(str(head), 0) + gains.get(str(tail), 0) 109 | 110 | idxs = trip_scores.argsort()[::-1] 111 | top = triples[idxs,][:num_results,:] 112 | 113 | results = {} 114 | 115 | for i, t in enumerate(top): 116 | triple_id = idxs[i] 117 | h, t, r = t 118 | score = trip_scores[triple_id] 119 | 120 | try: 121 | label = " ".join([ents[h], rels[r], ents[t]]) 122 | except: 123 | label = "missing RDF-triple" 124 | 125 | results[triple_id] = {'label': label, 'score': score} 126 | 127 | if out: 128 | with open('{}/key_triples.json'.format(out), 'w') as f: 129 | json.dump(results, 
f) 130 | 131 | return True 132 | else: 133 | return results 134 | 135 | 136 | def run(args_dict): 137 | # read in files for comparison 138 | orig = read_txt(args_dict['files'][0]) 139 | new = read_txt(args_dict['files'][1]) 140 | 141 | # create networks 142 | neto = build_network(orig) 143 | netn = build_network(orig + new) 144 | net = build_comparison_metrics(neto, netn, args_dict['measure']) 145 | 146 | # calculate node changes 147 | result = generate_node_changes(net) 148 | result = {k: v for k, v in sorted(result.items(), 149 | key=lambda item: item[1])} 150 | 151 | # generate results of interest 152 | top_gain = generate_result_lists(result, args_dict['num_results'], 'top') 153 | top_loss = generate_result_lists(result, args_dict['num_results'], 'bottom') 154 | 155 | # dump results to disk 156 | time = datetime.now().strftime('%d%b%Y-%H:%M:%S') 157 | with open('{}/top_gains_{}.json'.format(args_dict['output'], time), 'w') as f: 158 | json.dump(top_gain, f) 159 | with open('{}/top_losses_{}.json'.format(args_dict['output'], time), 'w') as f: 160 | json.dump(top_loss, f) 161 | 162 | 163 | if __name__ == '__main__': 164 | parser = argparse.ArgumentParser(description='Calculate differences ' 165 | 'between networks.') 166 | parser.add_argument('-f', '--files', nargs=2, required=True, help='Two ' 167 | 'files -- the real network then estimated network -- ' 168 | 'over which to calculate differences.') 169 | parser.add_argument('-m', '--measure', required=False, 170 | default='eigenvector', choices=['degree', 171 | 'eigenvector'], help='Select which network centrality ' 172 | 'measure is required.') 173 | parser.add_argument('-n', '--num_results', required=False, default=10, 174 | type=int, help='Number of results to return from ' 175 | 'centrality calculation.') 176 | parser.add_argument('-o', '--output', required=True, help='Path to ' 177 | 'directory to write results to disk.') 178 | args_dict = vars(parser.parse_args()) 179 | 180 | run(args_dict) 181 | 182 | -------------------------------------------------------------------------------- /conductor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | this script conducts the entire flow of the multivac system to date. it has the 5 | following flow: 6 | 1. collect data 7 | a. these data come from arxiv, springer, and pubmed in this instance, but 8 | could be modified to include more 9 | b. it saves the downloaded pdf's to a directory and creates a json object 10 | for further use 11 | 2. parse data 12 | a. the json objects that are saved from the collection step are processed 13 | for dependencies, input (word position), and morphology (lemma) [dim] 14 | b. it also identifies and notates equations throughout articles 15 | 3. run glove models 16 | a. take article collection that is parsed and create glove word embeddings 17 | b. develops both domain-general and domain-specific models 18 | 4. build the query generation (qg) network 19 | a. uses context/answers as inputs to create questions as output 20 | b. builds off of the domain-adapted glove models to produces robust 21 | questions around a topic of interest (in this case, epidemiology) 22 | 5. build markov logic network (mln) 23 | a. compile parsed dim files into trees and semantically cluster 24 | b. 
produce a graphical model based on first-order logic for 25 | """ 26 | import argparse 27 | 28 | from multivac.src.data.glove import glove_main 29 | from multivac.src.data.make import collect_main 30 | from multivac.src.data.parsing import nlp_parse_main 31 | from multivac.src.data.qgnet import qgnet_main 32 | from multivac.pymln.pymln import mln_main 33 | 34 | 35 | def conduct(args_dict): 36 | # step 1: collect data 37 | collect_main() 38 | 39 | # step 2: 40 | nlp_parse_main(args_dict) 41 | 42 | # step 3: run glove models 43 | glove_main() 44 | 45 | # step 4: build qg network 46 | qgnet_main(args_dict) 47 | 48 | # step 5: build mln 49 | mln_main(args_dict) 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser(description='Orchestrate pipeline for ' 54 | 'MULTIVAC processing and modeling.') 55 | parser.add_argument('-bp', '--nlp_bp', required=False, type=int, 56 | help='Which document to start parsing with.') 57 | parser.add_argument('-js', '--nlp_newjson', action='store_true', 58 | help='Boolean; indicates whether to create new JSON ' 59 | 'file for glove embedding.') 60 | parser.add_argument('-an', '--subset', type=int, help='Number of articles ' 61 | 'for MLN run.') 62 | parser.add_argument('-pc', '--prior_num_conj', default=10, type=int, 63 | help='Prior on number of conjunctive parts assigned to ' 64 | 'same cluster in MLN.') 65 | parser.add_argument('-pp', '--prior_num_param', default=5, type=int, 66 | help='Prior on number of parameters for cluster ' 67 | 'merges.') 68 | parser.add_argument('-qp', '--qgnet_path', required=True, help='The ' 69 | 'top-level qgnet directory to create folders for ' 70 | 'models and data.') 71 | parser.add_argument('-v', "--verbose", action='store_true', help='Give ' 72 | 'verbose output during MLN modeling.') 73 | args_dict = vars(parser.parse_args()) 74 | 75 | conduct(args_dict) 76 | -------------------------------------------------------------------------------- /doc/README.md: -------------------------------------------------------------------------------- 1 | # MULTIVAC Documentation and References 2 | This page serves as an index of system design, theory, and walk through documentation for Gallup’s Meta-model Unification Learned Through Inquiry Vectorization and Automated Comprehension (MULTIVAC). DARPA’s Information Innovation Office’s Automating Scientific Knowledge Extraction (ASKE) program seeks to develop approaches to make it easier for scientists to build, maintain and reason over rich models of complex systems — which could include physical, biological, social, engineered or hybrid systems. MULTIVAC supports these goals by developing a system that absorbs scientific knowledge — in the form of facts, relationships, models and equations — from a particular domain corpus into a Markov Logic Network (MLN) ontology and learns to query that ontology in order to accelerate scientific exploration within the target domain. 3 | 4 | ## Key Innovations 5 | - Key Innovations 6 | 7 | ## Phase II Developments 8 | - Phase II Developments 9 | 10 | ## Phase I Development - System Overview 11 | - System Walk-Through (Jupyter Notebook): Piece-by-Piece Execution 12 | - Markov Logic Network Induction: Construction of the knowledge graph representation in the form of a Markov Logic Network. 13 | - Query Mapping: Query Mapping Execution 14 | - Phase I Lessons Learned: Review of lessons learned from implementing Phase I systems. 
15 | 16 | ## ASKE Community 17 | - MULTIVAC in the ASKE Context 18 | - Other ASKE repositories 19 | - ASKE official homepage 20 | 21 | ## Related Research and Resources 22 | - GANs for Text Generation: paperswithcode.com 23 | - Markov Logic Networks paperswithcode.com 24 | 25 | For more information please contact Principal Investigator, Benjamin Ryan (ben_ryan@gallup.com). 26 | 27 | --- 28 | ## Acknowledgements 29 | This work is supported by the Defense Advanced Research Projects Agency (DARPA) under Agreement No. HR00111990008. 30 | 31 | -------------------------------------------------------------------------------- /doc/aske_context.md: -------------------------------------------------------------------------------- 1 | # MULTIVAC in the ASKE Context 2 | ![ASKE Schematic v1.0](https://github.com/GallupGovt/multivac/blob/master/images/aske_schematic_v1.png) 3 | ![ASKE Schematic v1.5](https://github.com/GallupGovt/multivac/blob/master/images/aske_schematic_v1.5.png) 4 | 5 | Gallup’s MULTIVAC system extracts scientific knowledge — in the form of facts, relationships, equations — from a given domain corpus consisting of natural language text and formal mathematical equations. The system then compiles this knowledge into a curated probabilistic graphical model (specifically, a Markov Logic Network) knowledgebase. Finally, the system learns to query that knowledge base in order to accelerate scientific exploration within the target domain. 6 | 7 | With reference to the first ASKE program schematic on the previous slide, MULTIVAC is more or less vertically integrated across the discovery/extraction, curation, and inference. 8 | 9 | The end objective, however, is hypothesis generation. This feature situates the most novel contribution of MULTIVAC essentially outside these levels, at the top of the more process-oriented second schematic on the previous slide. In effect, MULTIVAC’s “inference” component inverts the standard intention and, instead of using the work done in the extraction and curation layers to arrive at new inferences, learns through observation and experimentation how to ask it’s own novel questions that then require more standard inference solutions to answer. Other projects in the program have presented innovative ways of automating or enhancing execution of human inquiries. Our system seeks to automate the production and evolution of those queries in the first place. 10 | 11 | The final goal of a MULTIVAC system for any given domain is to generate new scientific queries relevant to that domain that have not been asked before by humans. These inquiries, properly formatted, could in theory even act as inputs to many of the other TA2 systems. 12 | 13 | ### Wait, but Why? 14 | - The glacial pace of evolution in paradigms and modes of inquiry within domains. 15 | - Stove-pipes within and between domains of scientific inquiry 16 | 17 | 18 | ## ASKE Potential Use Cases 19 | ### Modernizing and consolidating old research: 20 | - While much research is available in digital form today, vast archives exist in hard copy in various forms that are far less searchable. Using an ASKE system to ingest and compile/curate these types of repositories could help revitalize forgotten areas of research. 21 | 22 | ### Breaking stovepipes: 23 | - Sometimes research fields become balkanized between different communities based on approaches, terminologies, or simply favored publication venues. 
An ASKE system that can comprehend a field at scale across these artificial segmentations could help break irrational logjams and cross-pollinate discoveries. 24 | 25 | ### Revitalizing stagnant areas of research: 26 | - Occasionally research fields lose momentum or interest, as consensus emerges on “big questions” or as unknowns become more apparently “unknowable.” Paradigm shifts can happen that help break this stagnation and revolutionize fields, but this can take a great deal of time and is never guaranteed. A system that can analyze a field of research and produce novel questions or avenues of inquiry can help inject new creativity and perspectives and revitalize research. 27 | -------------------------------------------------------------------------------- /doc/installation.md: -------------------------------------------------------------------------------- 1 | # MULTIVAC Installation Guide 2 | 3 | ### Installation Requirements 4 | MULTIVAC can be most easily and cleanly installed using `docker`. To use this method, Docker Desktop is required for launching the system on your local machine. Docker Desktop can be set up easily for either Mac or Windows machines with resources found at the following links: 5 | * For Mac users: https://docs.docker.com/docker-for-mac/install/ 6 | * For Windows users: https://docs.docker.com/docker-for-windows/install/ 7 | 8 | MULTIVAC makes use of multiple linked docker containers, so along with Docker Desktop, users will need to have set up `docker-compose`. Mac, Windows, and Linux instructions for installation can be found here: 9 | * Docker Compose: https://docs.docker.com/compose/install/ 10 | 11 | ### Downloading and Deploying MULTIVAC 12 | The first step is to clone this MULTIVAC repository from GitHub. With Git also locally installed: 13 | * Run the following command in your preferred directory: `git clone https://github.com/GallupGovt/multivac.git` 14 | * Next, change into the MULTIVAC directory you just created and run: `docker-compose up` 15 | 16 | This command will download and build the resources MULTIVAC depends on: Stanford CoreNLP, Grobid Publication Parsing, and Jupyter Notebook Viewer, as well as the core MULTIVAC system itself. This process will take some time on first use, and require well over 10 GB of hard drive space, so please plan accordingly. 17 | 18 | ### Basic Operations 19 | In order to see the running processes under Docker, you can use the `docker ps` command. You should see a running container named *multivac_multivac:latest*. This is the root source of our project. To interact with our code and system, you may use `docker exec -it {container-of-multivac-id} {command}`(i.e. `docker exec -it abd35789sbd2 python3 querygan_pyt.py --cuda`). You can also access our web application through port 5000 of your machine, i.e. http://0.0.0.0:5000 or http://your.ip.add:5000 if on a VM. 20 | 21 | To run any docker commands in the background, add the flag `-d` to your command. Once the system is built, you can always start and stop it with the commands `docker-compose start` and `docker-compose stop`. 22 | -------------------------------------------------------------------------------- /doc/notebooks/alpha/README.md: -------------------------------------------------------------------------------- 1 | # Working Files 2 | NOTE: The files and code in this directory and sub-directories are deprecated, work-in-progress, or both. This code is not intended to work. 
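Pulling the installation steps above together, a typical first session looks roughly like the sketch below. This is illustrative only: the container ID reported by `docker ps` will differ on your machine, and `querygan_pyt.py --cuda` is simply the example command from the installation guide; substitute whichever MULTIVAC script you need to run.

```bash
# clone the repository and build/launch the linked containers
# (MULTIVAC core, Stanford CoreNLP, Grobid, Jupyter nbviewer)
git clone https://github.com/GallupGovt/multivac.git
cd multivac
docker-compose up -d                 # -d runs the stack in the background

# list running containers and note the MULTIVAC container ID
docker ps

# run a command inside the MULTIVAC container
docker exec -it <container-id> python3 querygan_pyt.py --cuda

# the web application is exposed on port 5000:
#   http://0.0.0.0:5000 (or http://your.ip.add:5000 on a VM)

# once built, stop and restart the system without rebuilding
docker-compose stop
docker-compose start
```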
-------------------------------------------------------------------------------- /doc/notebooks/directed_query_gen_walkthrough.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Walk Through for Directed Query Generation\n", 8 | "This notebook outlines the process of generating novel questions based on a user's seed topic using MULTIVAC's semantic knowledge graph and trained query generator. \n", 9 | "First, we set up the required imports and arguments for the test. " 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from multivac.src.rdf_graph.map_queries import *\n", 19 | "tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)\n", 20 | "from multivac.src.gan.gen_test import run\n", 21 | "os.chdir('src/gan')" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "args_dict = {'dir': os.path.abspath('../../data'),\n", 31 | " 'out': os.path.abspath('../../models'),\n", 32 | " 'glove': '../../models/glove.42B.300d',\n", 33 | " 'run': 'model',\n", 34 | " 'model': 'transe',\n", 35 | " 'threshold': 0.1,\n", 36 | " 'num_top_rel': 10}\n" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "Next, we load up the knowledge graph embedding model previously calculated. This embedding model allows us to assign probabilities to missing nodes or relationships in the knowledge graph proposed via submitted queries. Here we are using TransE, an approach which models relationships by interpreting them as translations operating on the low-dimensional embeddings of entities." 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "con = config.Config()\n", 53 | "con.set_in_path(args_dict['dir']+os.path.sep)\n", 54 | "con.set_work_threads(8)\n", 55 | "con.set_dimension(100)\n", 56 | "con.set_test_link_prediction(True)\n", 57 | "con.set_test_triple_classification(True)\n", 58 | "\n", 59 | "files = glob.glob(os.path.join(args_dict['out'],'*tf*'))\n", 60 | "times = list(set([file.split('.')[2] for file in files]))\n", 61 | "ifile = max([datetime.strptime(x, '%d%b%Y-%H:%M:%S') for x in times]).strftime('%d%b%Y-%H:%M:%S')\n", 62 | "con.set_import_files(os.path.join(args_dict['out'], 'model.vec.{}.tf'.format(ifile)))\n", 63 | "\n", 64 | "con.init()\n", 65 | "kem = set_model_choice(args_dict['model'])\n", 66 | "con.set_model(kem)\n", 67 | "\n", 68 | "\n", 69 | "files = [x for x in os.listdir(con.in_path) if '2id' in x]\n", 70 | "rel_file = get_newest_file(con.in_path, files, 'relation')\n", 71 | "ent_file = get_newest_file(con.in_path, files, 'entity')\n", 72 | "trn_file = get_newest_file(con.in_path, files, 'train')\n", 73 | "\n", 74 | "entities = pd.read_csv(ent_file, sep='\\t', \n", 75 | " names=[\"Ent\",\"Id\"], skiprows=1)\n", 76 | "relations = pd.read_csv(rel_file, sep='\\t', \n", 77 | " names=[\"Rel\",\"Id\"], skiprows=1)\n", 78 | "train = pd.read_csv(trn_file, sep='\\t', \n", 79 | " names=[\"Head\",\"Tail\",\"Relation\"], skiprows=1)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "We then set up a GloVe embedding model. 
Here we use the large scale, pre-trained GloVe embedding model given the open domain nature of potential submitted questions." 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "glove_vocab, glove_emb = load_word_vectors(args_dict['glove'])\n" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "Finally, we input our seed topic and extract the knowledge graph elements and predicted elements most related to that topic. The system identifies all triples containing the topic or closely semantically related to it, and returns the top `num_top_rel` results (by default, 10)." 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "sample_topic = 'avian flu'" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "results = predict_object(con, sample_topic, relations, entities, train, glove_vocab, glove_emb, exact=False)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "These results are then fed to the query generator, which produces questions in response to each topic. The `run()` function called below does two main things: 1) submit the \"query\" triples to the Generator system to be parsed into a tree object representing the consituency parse of an English language question, and 2) translate that parse into the surface text for presentation:\n", 128 | "```python\n", 129 | " results = netG.parse(query, beam_size=netG.args['beam_size'])\n", 130 | " texts = [asdl_ast_to_english(x.tree) for x in results]\n", 131 | "\n", 132 | " return texts\n", 133 | "```" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "questions = results.Text.apply(lambda x: run({'query': list(x), \n", 143 | " 'model': os.path.join(args_dict['out'], 'gen_checkpoint.pth')}))" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "questions.values" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [] 161 | } 162 | ], 163 | "metadata": { 164 | "kernelspec": { 165 | "display_name": "Python 3", 166 | "language": "python", 167 | "name": "python3" 168 | }, 169 | "language_info": { 170 | "codemirror_mode": { 171 | "name": "ipython", 172 | "version": 3 173 | }, 174 | "file_extension": ".py", 175 | "mimetype": "text/x-python", 176 | "name": "python", 177 | "nbconvert_exporter": "python", 178 | "pygments_lexer": "ipython3", 179 | "version": "3.7.5" 180 | } 181 | }, 182 | "nbformat": 4, 183 | "nbformat_minor": 2 184 | } 185 | -------------------------------------------------------------------------------- /doc/notebooks/gan_training_illustration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Walk Through for GAN Training\n", 8 | "This notebook illustrates the training of MULTIVAC's generative adversarial network system for query generation.\n", 9 | "First, we set up the required imports and arguments for the test. 
This process can also be performed all at once from the command line:

\n", 10 | "`python3 querygan_pyt.py --gan_D_STEPS 1 --gan_K_STEPS 2 --gan_ROLLOUT_NUM 3 --gan_GENERATED_NUM 100`

\n", 11 | "(training and model parameters are read from a `config.cfg` file, but any of them may be overriden at run time with the appropriate arguments. Here, we reduce the number of steps and the generated samples batch size to better illustrate the entire training cycle in a more timely fashion. " 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import os\n", 21 | "os.chdir('src/gan')\n", 22 | "from multivac.src.gan.querygan_pyt import *" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "other_args = ['--gan_D_STEPS', '1', \n", 32 | " '--gan_K_STEPS', '2', \n", 33 | " '--gan_ROLLOUT_NUM', '3', \n", 34 | " '--gan_GENERATED_NUM', '100']\n", 35 | "\n", 36 | "args = {'config': 'config.cfg',\n", 37 | " 'cuda': False,\n", 38 | " 'continue': True,\n", 39 | " 'gen_chk': '../../models/gen_checkpoint.pth',\n", 40 | " 'disc_chk': '../../models/disc_checkpoint.pth'}\n", 41 | "\n", 42 | "overrides = {}\n", 43 | "\n", 44 | "i = 0\n", 45 | "\n", 46 | "while i < len(other_args):\n", 47 | " if other_args[i].startswith('--'):\n", 48 | " key = other_args[i][2:]\n", 49 | " value = other_args[i+1]\n", 50 | "\n", 51 | " if value.startswith('--'):\n", 52 | " overrides[key] = True\n", 53 | " i += 1\n", 54 | " continue\n", 55 | " else:\n", 56 | " overrides[key] = value\n", 57 | " i += 2\n", 58 | " else:\n", 59 | " i += 1\n", 60 | "\n", 61 | "cfg = configparser.ConfigParser()\n", 62 | "cfgDIR = os.path.dirname(os.getcwd())\n", 63 | "\n", 64 | "if args['config'] is not None:\n", 65 | " cfg.read(args['config'])\n", 66 | "else:\n", 67 | " cfg.read(os.path.join(cfgDIR, 'config.cfg'))\n", 68 | "\n", 69 | "cfg_dict = cfg._sections\n", 70 | "cfg_dict['ARGS'] = args\n", 71 | "\n", 72 | "for arg in overrides:\n", 73 | " section, param = arg.split(\"_\", 1)\n", 74 | " try:\n", 75 | " cfg[section.upper()][param] = overrides[arg]\n", 76 | " except KeyError:\n", 77 | " print(\"Section \" + section.upper() + \"not found in \"\n", 78 | " \"\" + args['config'] + \", skipping.\")\n", 79 | " continue\n", 80 | "\n", 81 | "for name, section in cfg_dict.items():\n", 82 | " for carg in section:\n", 83 | " # Cast all arguments to proper types\n", 84 | " if section[carg] == 'None':\n", 85 | " section[carg] = None\n", 86 | " continue\n", 87 | "\n", 88 | " try:\n", 89 | " section[carg] = int(section[carg])\n", 90 | " except:\n", 91 | " try:\n", 92 | " section[carg] = float(section[carg])\n", 93 | " except:\n", 94 | " if section[carg] in ['True','False']:\n", 95 | " section[carg] = eval(section[carg])\n" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "Next, we load up the knowledge graph embedding model previously calculated. This embedding model allows us to assign probabilities to missing nodes or relationships in the knowledge graph proposed via submitted queries. Here we are using TransE, an approach which models relationships by interpreting them as translations operating on the low-dimensional embeddings of entities." 
103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "continue_training(cfg_dict, args['gen_chk'], args['disc_chk'])" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [] 120 | } 121 | ], 122 | "metadata": { 123 | "kernelspec": { 124 | "display_name": "Python 3", 125 | "language": "python", 126 | "name": "python3" 127 | }, 128 | "language_info": { 129 | "codemirror_mode": { 130 | "name": "ipython", 131 | "version": 3 132 | }, 133 | "file_extension": ".py", 134 | "mimetype": "text/x-python", 135 | "name": "python", 136 | "nbconvert_exporter": "python", 137 | "pygments_lexer": "ipython3", 138 | "version": "3.7.5" 139 | } 140 | }, 141 | "nbformat": 4, 142 | "nbformat_minor": 2 143 | } 144 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | multivac: 4 | build: 5 | context: . 6 | dockerfile: Dockerfile 7 | ports: 8 | - "5000:5000" 9 | depends_on: 10 | - stanfordnlp 11 | - grobid 12 | - jupyter 13 | links: 14 | - "jupyter" 15 | volumes: 16 | - "./:/app" 17 | stanfordnlp: 18 | image: "graham3333/corenlp-complete" 19 | ports: 20 | - "9000:9000" 21 | grobid: 22 | image: "lfoppiano/grobid:0.5.5" 23 | ports: 24 | - "8070:8070" 25 | - "8071:8071" 26 | jupyter: 27 | image: "jupyter/nbviewer" 28 | ports: 29 | - "8080:8080" 30 | -------------------------------------------------------------------------------- /get_kg_query_params.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | This script is meant to use network centrality measures to identify and select 5 | nodes and edges (entities and relations) that would make good prediction 6 | starting points for the MULTIVAC system. It does this by using eigenvector 7 | centrality but can be extended to include additional network centrality 8 | measures. 
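An illustrative invocation (a sketch only; the file names below are
placeholders). The two inputs, entities file first and then the train/triples
file, are expected in the same tab-separated format, with a header line that
is skipped, as the entity2id.txt and train2id.txt files used elsewhere in
MULTIVAC:

    python3 get_kg_query_params.py \
        -f data/entity2id.txt data/train2id.txt \
        -m eigenvector -n 10

The highest-centrality entity names are written to
search_terms_<measure>.txt in the current working directory.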
9 | """ 10 | import argparse 11 | import sys 12 | 13 | import networkx as nx 14 | 15 | 16 | def analyze_network(net, args_dict): 17 | if 'degree' in args_dict['measure']: 18 | ans = nx.degree_centrality(net) 19 | elif 'eigenvector' in args_dict['measure']: 20 | ans = nx.eigenvector_centrality(net) 21 | else: 22 | sys.exit('Whoops; you must provide a valid network centrality measure.') 23 | ans = sorted(ans.items(), key=lambda x: x[1], reverse=True) 24 | 25 | return ans[:args_dict['num_results']] 26 | 27 | 28 | def build_network(data): 29 | tmp = [tuple(x[:2]) for x in data] 30 | g = nx.Graph() 31 | g.add_edges_from(tmp) 32 | 33 | return g 34 | 35 | 36 | def read_txt(file): 37 | with open(file) as f: 38 | tmp = f.readlines()[1:] 39 | 40 | return [x.rstrip(' \n').split('\t') for x in tmp] 41 | 42 | 43 | def run(args_dict): 44 | # read in data 45 | entities = read_txt(args_dict['files'][0]) 46 | network = read_txt(args_dict['files'][1]) 47 | 48 | # construct/analyze network 49 | net = build_network(network) 50 | results = analyze_network(net, args_dict) 51 | 52 | # return results 53 | named_entities = ['{}\n'.format(entity[0]) for entity in entities if 54 | entity[1] in [res[0] for res in results]] 55 | 56 | with open('search_terms_{}.txt'.format(args_dict['measure']), 'w') as f: 57 | f.writelines(named_entities) 58 | 59 | 60 | if __name__ == '__main__': 61 | parser = argparse.ArgumentParser(description='Run network centrality ' 62 | 'measures on data.') 63 | parser.add_argument('-f', '--files', nargs=2, required=True, help='Two ' 64 | 'files -- entities then train -- that are parsed to ' 65 | 'create a network.') 66 | parser.add_argument('-m', '--measure', required=False, 67 | default='eigenvector', choices=['degree', 'eigenvector'], 68 | help='Select which network centrality ' 69 | 'measure is required.') 70 | parser.add_argument('-n', '--num_results', required=False, default=10, 71 | type=int, help='Number of results to return from ' 72 | 'centrality calculation.') 73 | args_dict = vars(parser.parse_args()) 74 | 75 | run(args_dict) 76 | -------------------------------------------------------------------------------- /images/KCCA_equation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/KCCA_equation.png -------------------------------------------------------------------------------- /images/MULTIVAC_schematic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/MULTIVAC_schematic.png -------------------------------------------------------------------------------- /images/aske_schematic_v1.5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/aske_schematic_v1.5.png -------------------------------------------------------------------------------- /images/aske_schematic_v1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/aske_schematic_v1.png -------------------------------------------------------------------------------- /images/emulated.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/emulated.png -------------------------------------------------------------------------------- /images/emulated_kg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/emulated_kg.png -------------------------------------------------------------------------------- /images/formula.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/formula.png -------------------------------------------------------------------------------- /images/formula_dependencies.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/formula_dependencies.png -------------------------------------------------------------------------------- /images/gan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/gan.png -------------------------------------------------------------------------------- /images/gan_design.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/gan_design.png -------------------------------------------------------------------------------- /images/key_triples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/key_triples.png -------------------------------------------------------------------------------- /images/krongen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/krongen.png -------------------------------------------------------------------------------- /images/latex_parse_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/latex_parse_1.png -------------------------------------------------------------------------------- /images/latex_parse_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/latex_parse_2.png -------------------------------------------------------------------------------- /images/multivac_concept.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/multivac_concept.png -------------------------------------------------------------------------------- /images/phase_one_system.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/phase_one_system.png 
-------------------------------------------------------------------------------- /images/qgnet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/qgnet.png -------------------------------------------------------------------------------- /images/simple_kg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/simple_kg.png -------------------------------------------------------------------------------- /images/stanford_dependecies.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/images/stanford_dependecies.png -------------------------------------------------------------------------------- /multivac.cfg: -------------------------------------------------------------------------------- 1 | [PATHS] 2 | #root_dir = 3 | #data_dir = 4 | #raw_dir = 5 | #interim_dir = 6 | #processed_dir = 7 | #metadata_dir = 8 | #models_dir = 9 | #stanf_nlp_dir = 10 | 11 | [SEARCH] 12 | # define search terms 13 | terms = ['sir model', 'susceptible-infected-recovered', 'irSIR model', 14 | 'susceptible-infected', 'seir model', 15 | 'susceptible-exposed-infected-recovered'] 16 | 17 | # specify sources 18 | sources = ['arxiv', 'pubmed', 'springer'] 19 | 20 | # filter terms for selected apis 21 | 22 | [FILTER] 23 | # arxiv: filter out selected topics given default query that targets concepts related to 24 | # susceptible-infected-recovered 25 | drops = ['astro-ph Astrophysics', 26 | 'astro-ph.CO Cosmology and Nongalactic Astrophysics', 27 | 'astro-ph.EP Earth and Planetary Astrophysics', 28 | 'astro-ph.GA Astrophysics of Galaxies', 29 | 'astro-ph.HE High Energy Astrophysical Phenomena', 30 | 'astro-ph.IM Instrumentation and Methods for Astrophysics', 31 | 'astro-ph.SR Solar and Stellar Astrophysics', 32 | 'cond-mat.mes-hall Mesoscale and Nanoscale Physics', 33 | 'cond-mat.mtrl-sci Materials Science', 34 | 'cond-mat.other Other Condensed Matter', 35 | 'cond-mat.quant-gas Quantum Gases', 36 | 'cond-mat.soft Soft Condensed Matter', 37 | 'cond-mat.stat-mech Statistical Mechanics', 38 | 'cond-mat.str-el Strongly Correlated Electrons', 39 | 'cond-mat.supr-con Superconductivity', 40 | 'eess.AS Audio and Speech Processing', 41 | 'eess.IV Image and Video Processing', 42 | 'eess.SP Signal Processing', 43 | 'gr-qc General Relativity and Quantum Cosmology', 44 | 'hep-ex High Energy Physics - Experiment', 45 | 'hep-lat High Energy Physics - Lattice', 46 | 'hep-ph High Energy Physics - Phenomenology', 47 | 'hep-th High Energy Physics - Theory', 48 | 'math.AC Commutative Algebra', 49 | 'math.AG Algebraic Geometry', 50 | 'nucl-ex Nuclear Experiment', 51 | 'nucl-th Nuclear Theory', 52 | 'physics.acc-ph Accelerator Physics', 53 | 'physics.ao-ph Atmospheric and Oceanic Physics', 54 | 'physics.app-ph Applied Physics', 55 | 'physics.atm-clus Atomic and Molecular Clusters', 56 | 'physics.atom-ph Atomic Physics', 57 | 'physics.chem-ph Chemical Physics', 58 | 'physics.class-ph Classical Physics', 59 | 'physics.comp-ph Computational Physics', 60 | 'physics.ed-ph Physics Education', 61 | 'physics.flu-dyn Fluid Dynamics', 62 | 'physics.gen-ph General Physics', 63 | 'physics.geo-ph Geophysics', 64 | 'physics.hist-ph History and Philosophy of Physics', 
65 | 'physics.ins-det Instrumentation and Detectors', 66 | 'physics.med-ph Medical Physics', 67 | 'physics.optics Optics', 68 | 'physics.plasm-ph Plasma Physics', 69 | 'physics.space-ph Space Physics', 70 | 'q-fin.CP Computational Finance', 71 | 'q-fin.EC Economics', 72 | 'q-fin.GN General Finance', 73 | 'q-fin.MF Mathematical Finance', 74 | 'q-fin.PM Portfolio Management', 75 | 'q-fin.PR Pricing of Securities', 76 | 'q-fin.RM Risk Management', 77 | 'q-fin.ST Statistical Finance', 78 | 'q-fin.TR Trading and Market Microstructure', 79 | 'quant-ph Quantum Physics'] -------------------------------------------------------------------------------- /pymln/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Gallup Government, Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /pymln/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 4 | # Python implementation of Unsupervised Semantic Parsing system, from: 5 | # 6 | # Hoifung Poon and Pedro Domingos (2009). "Unsupervised Semantic Parsing", 7 | # in Proceedings of the Conference on Empirical Methods in Natural Language 8 | # Processing (EMNLP), 2009. http://alchemy.cs.washington.edu/usp. 
9 | # 10 | 11 | from multivac.pymln.semantic import * 12 | from multivac.pymln.syntax import * 13 | from multivac.pymln.utils import * 14 | from multivac.pymln.eval import * 15 | 16 | -------------------------------------------------------------------------------- /pymln/eval/Answer.py: -------------------------------------------------------------------------------- 1 | 2 | class Answer(object): 3 | def __init__(self, sid, rst): 4 | self._sid = sid 5 | self._rst = rst 6 | 7 | def __hash__(self): 8 | return hash(self.toString()) 9 | 10 | def __eq__(self, other): 11 | return self.compareTo(other) == 0 12 | 13 | def __lt__(self, other): 14 | return self.compareTo(other) < 0 15 | 16 | def __str__(self): 17 | return self.toString() 18 | 19 | def getSentId(self): 20 | return self._sid 21 | 22 | def getRst(self): 23 | return self._rst 24 | 25 | def compareTo(self, a): 26 | result = 0 27 | 28 | if self._rst != a.getRst(): 29 | if self._rst < a.getRst(): 30 | result -= 1 31 | else: 32 | result += 1 33 | elif self._sid != a.getSentId(): 34 | if self._sid < a.getSentId(): 35 | result -= 1 36 | else: 37 | result += 1 38 | 39 | return result 40 | 41 | def toString(self): 42 | return ' '.join([self._sid, self._rst]) 43 | 44 | -------------------------------------------------------------------------------- /pymln/eval/Question.py: -------------------------------------------------------------------------------- 1 | 2 | class Question(object): 3 | def __init__(self, rel, arg, dep): 4 | self._rel = rel 5 | self._dep = dep 6 | self._arg = arg 7 | self._argClustIdxSeq = None 8 | 9 | def __hash__(self): 10 | return hash(self.toString()) 11 | 12 | def __eq__(self, other): 13 | return self.compareTo(other) == 0 14 | 15 | def __lt__(self): 16 | return self.compareTo(other) < 0 17 | 18 | def __str__(self): 19 | return self.toString() 20 | 21 | def getRel(self): 22 | return self._rel 23 | 24 | def getArg(self): 25 | return self._arg 26 | 27 | def getDep(self): 28 | return self._dep 29 | 30 | def compareTo(self, q): 31 | result = 0 32 | 33 | if self._dep != q.getDep(): 34 | if self._dep < q.getDep(): 35 | result -= 1 36 | else: 37 | result += 1 38 | elif self._rel != q.getRel(): 39 | if self._rel < q.getRel(): 40 | result -= 1 41 | else: 42 | result += 1 43 | elif self._arg != q.getArg(): 44 | if self._arg < q.getArg(): 45 | result -= 1 46 | else: 47 | result += 1 48 | 49 | return result 50 | 51 | def getPattern(self): 52 | if self._dep == 'nsubj': 53 | return ' '.join([self._arg, self._rel]) 54 | elif self._dep == 'dobj': 55 | return ' '.join([self._rel, self._arg]) 56 | else: 57 | return None 58 | 59 | def toString(self): 60 | if self._dep == 'nsubj': 61 | return "What does {} {}?".format(self._arg, self._rel) 62 | elif self._dep == 'dobj': 63 | return "What {}s {}?".format(self._rel, self._arg) 64 | else: 65 | return "{} ::: {} ::: {}".format(self._rel, self._dep, self._arg) 66 | 67 | -------------------------------------------------------------------------------- /pymln/eval/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from . Answer import Answer 4 | from . Question import Question 5 | from . 
USP import USP 6 | 7 | -------------------------------------------------------------------------------- /pymln/pymln.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # Python implementation of Unsupervised Semantic Parsing system, from: 4 | # 5 | # Hoifung Poon and Pedro Domingos (2009). "Unsupervised Semantic Parsing", 6 | # in Proceedings of the Conference on Empirical Methods in Natural Language 7 | # Processing (EMNLP), 2009. http://alchemy.cs.washington.edu/usp. 8 | import os 9 | 10 | from datetime import datetime 11 | 12 | from multivac import settings 13 | from multivac.pymln.semantic import Parse, MLN, Clust 14 | from multivac.pymln.syntax.StanfordParseReader import StanfordParseReader 15 | 16 | 17 | def read_input_files(DIR): 18 | files = [] 19 | for file in os.listdir(DIR): 20 | if file.endswith(".dep"): 21 | files.append(file) 22 | 23 | return files 24 | 25 | 26 | def mln_main(args_dict): 27 | # set variables 28 | verbose = args_dict['verbose'] 29 | data_dir = settings.data_dir 30 | results_dir = settings.mln_dir 31 | parser = Parse(args_dict['prior_num_param'], args_dict['prior_num_conj']) 32 | 33 | # read in inputs 34 | input_files = read_input_files(data_dir) 35 | input_files.sort() 36 | 37 | # set final parameter 38 | if 'subset' in args_dict: 39 | subset = args_dict['subset'] 40 | else: 41 | subset = len(input_files) 42 | 43 | articles = [] 44 | for i, fileName in enumerate(input_files): 45 | try: 46 | a = StanfordParseReader.readParse(fileName, data_dir) 47 | except: 48 | print("Error on {}, {}".format(i, fileName)) 49 | raise Exception 50 | 51 | if i%100 == 0: 52 | print("{} articles parsed.".format(i)) 53 | 54 | if i >= subset: 55 | break 56 | 57 | articles.append(a) 58 | 59 | 60 | if verbose: 61 | print("{} Initializing...".format(datetime.now())) 62 | parser.initialize(articles, verbose) 63 | 64 | if verbose: 65 | print("{}: {} articles parsed, of {} sentences and {} total tokens." 66 | .format(datetime.now(), 67 | len(articles), 68 | parser.numSents, 69 | parser.numTkns)) 70 | num_arg_clusts = sum([len(x._argClusts) for x in Clust.clusts.values()]) 71 | 72 | if verbose: 73 | print("{}: {} initial clusters, with {} argument clusters." 74 | .format(datetime.now(), len(Clust.clusts), num_arg_clusts)) 75 | print("{} Merging arguments...".format(datetime.now())) 76 | parser.mergeArgs() 77 | num_arg_clusts = sum([len(x._argClusts) for x in Clust.clusts.values()]) 78 | 79 | if verbose: 80 | print("Now with {} initial clusters, {} argument clusters." 81 | .format(len(Clust.clusts), num_arg_clusts)) 82 | print("{} Creating agenda...".format(datetime.now())) 83 | parser.agenda.createAgenda(verbose) 84 | 85 | if verbose: 86 | print("{}: {} possible operations in queue, {} merges and {} composes." 87 | .format(datetime.now(), 88 | len(parser.agenda._agendaToScore), 89 | len(parser.agenda._mc_neighs), 90 | len(parser.agenda._compose_cnt))) 91 | print("{} Processing agenda...".format(datetime.now())) 92 | parser.agenda.procAgenda(verbose) 93 | 94 | num_arg_clusts = sum([len(x._argClusts) for x in Clust.clusts.values()]) 95 | 96 | if verbose: 97 | print("{}: {} final clusters, with {} argument clusters." 
98 | .format(datetime.now(), len(Clust.clusts), num_arg_clusts)) 99 | 100 | MLN.save_mln(results_dir / "mln.pkl") 101 | MLN.printModel(results_dir) 102 | 103 | if verbose: 104 | print("{} Induced MLN saved.".format(datetime.now())) 105 | 106 | 107 | if __name__ == '__main__': 108 | mln_main(args_dict) 109 | -------------------------------------------------------------------------------- /pymln/semantic/Argument.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Argument(object): 4 | def __init__(self, argNode, path, argPart): 5 | self._argNode = argNode 6 | self._path = path 7 | self._argPart = argPart 8 | 9 | return None 10 | 11 | def getPath(self): 12 | return self._path 13 | 14 | def getPart(self): 15 | return self._argPart 16 | 17 | def getNode(self): 18 | return self._argNode 19 | 20 | 21 | -------------------------------------------------------------------------------- /pymln/semantic/MLN.py: -------------------------------------------------------------------------------- 1 | 2 | from multivac.pymln.semantic import Clust, ArgClust, Part 3 | from multivac.pymln.syntax.Relations import ArgType, RelType 4 | 5 | import json 6 | import pickle 7 | import os 8 | 9 | class MLN(object): 10 | ''' 11 | Class for simply outputting the MLN structure parsed from source 12 | documents. 13 | 14 | ''' 15 | def __init__(self): 16 | return None 17 | 18 | def printModel(path): 19 | clustering = MLN.printClustering(path) 20 | mln = MLN.printMLN(path) 21 | prse = MLN.printParse(path) 22 | 23 | if path is None: 24 | return clustering, mln, prse 25 | else: 26 | return None 27 | 28 | def printClustering(path=None): 29 | out_str = "=== Clustering ===\n" 30 | 31 | for ci, clust in Clust.clusts.items(): 32 | # if len(clust._relTypeIdx_cnt) > 1: 33 | out_str += str(ci) + " " + clust.toString() + "\n" 34 | for aci, ac in clust._argClusts.items(): 35 | out_str += "\t{}\t{}\t{}\n".format(aci, ac.toString(), ac._ttlArgCnt) 36 | 37 | if path is not None: 38 | dst = "{}/{}.clustering".format(path, 39 | os.path.basename(os.path.dirname(path))) 40 | with open(dst, 'w') as f: 41 | f.write(out_str) 42 | 43 | return None 44 | else: 45 | return out_str 46 | 47 | def save_mln(path): 48 | ''' 49 | Save all objects necessary to recreate the MLN knowledgebase 50 | ''' 51 | with open(path, 'wb') as f: 52 | pickle.dump({'clusts': Clust.clusts, 53 | 'relTypeIdx_clustIdx': Clust.relTypeIdx_clustIdx, 54 | 'relTypes': RelType.relTypes, 55 | 'relTypeStr_idx': RelType.relTypeStr_idx, 56 | 'argTypes': ArgType.argTypes, 57 | 'argTypeStr_idx': ArgType.argTypeStr_idx, 58 | 'rootNodeId_part': Part.rootNodeId_part, 59 | 'clustIdx_partRootNodeIds': Part.clustIdx_partRootNodeIds, 60 | 'pairClustIdxs_pairPartRootNodeIds': Part.pairClustIdxs_pairPartRootNodeIds}, 61 | f) 62 | 63 | return None 64 | 65 | def load_mln(path, ret=False): 66 | with open(path, 'rb') as f: 67 | mln = pickle.load(f) 68 | 69 | try: 70 | _ = len(Clust.clusts) 71 | _ = len(ArgType.argTypes) 72 | _ = len(RelType.relTypes) 73 | _ = len(Part.rootNodeId_part) 74 | except NameError: 75 | from multivac.pymln.semantic import Clust, Part 76 | from multivac.pymln.syntax.Relations import ArgType, RelType 77 | 78 | Clust.clusts = mln['clusts'] 79 | Clust.relTypeIdx_clustIdx = mln['relTypeIdx_clustIdx'] 80 | RelType.relTypes = mln['relTypes'] 81 | RelType.relTypeStr_idx = mln['relTypeStr_idx'] 82 | ArgType.argTypes = mln['argTypes'] 83 | ArgType.argTypeStr_idx = mln['argTypeStr_idx'] 84 | Part.rootNodeId_part = mln['rootNodeId_part'] 
85 | Part.clustIdx_partRootNodeIds = mln['clustIdx_partRootNodeIds'] 86 | Part.pairClustIdxs_pairPartRootNodeIds = mln['pairClustIdxs_pairPartRootNodeIds'] 87 | 88 | if ret: 89 | return mln 90 | else: 91 | return None 92 | 93 | def printMLN(path=None): 94 | out_str = "" 95 | 96 | for ci in Clust.clusts: 97 | cl = Clust.getClust(ci) 98 | out_str += "{}\t{}\n".format(cl._clustIdx,cl) 99 | 100 | for aci in cl._argClusts: 101 | ac = cl._argClusts[aci] 102 | out_str += "\t{}: ".format(aci) 103 | 104 | out_str += "\t".join(["{}: {}".format(k, v) 105 | for k, v in ac._argNum_cnt.items()]) 106 | out_str += "\n\t" 107 | out_str += "\t".join(["{}: {}: {}".format(k, 108 | ArgType.getArgType(k).toString(), 109 | v) 110 | for k, v in ac._argTypeIdx_cnt.items()]) 111 | out_str += "\n\t" 112 | out_str += "\t".join(["{}: {}: {}".format(k, 113 | Clust.getClust(k), 114 | v) 115 | for k, v in ac._chdClustIdx_cnt.items()]) 116 | out_str += "\n" 117 | 118 | if path is not None: 119 | dst = "{}/{}.mln".format(path, 120 | os.path.basename(os.path.dirname(path))) 121 | 122 | with open(dst, 'w') as f: 123 | f.write(out_str) 124 | 125 | return None 126 | else: 127 | return out_str 128 | 129 | 130 | def printParse(path=None): 131 | out_str = "" 132 | 133 | for rnid, pt in Part.rootNodeId_part.items(): 134 | out_str += "{}\t{}\n".format(rnid, pt._relTreeRoot.getTreeStr()) 135 | out_str += "\t{}: {}\n".format(pt._clustIdx, 136 | Clust.getClust(pt._clustIdx).toString()) 137 | 138 | if pt._parPart is None: 139 | out_str += "\t\n\t\n" 140 | else: 141 | arg = pt._parPart.getArgument(pt._parArgIdx) 142 | out_str += "\t{}\t{}\t{}\n".format(pt._parPart._relTreeRoot.getId(), 143 | pt._parPart._clustIdx, 144 | Clust.getClust(pt._parPart._clustIdx)) 145 | out_str += "\t{}: {}: {}\n".format(pt._parPart.getArgClust(pt._parArgIdx), 146 | arg._path.getArgType(), 147 | ArgType.getArgType(arg._path.getArgType())) 148 | 149 | if path is not None: 150 | dst = "{}/{}.parse".format(path, 151 | os.path.basename(os.path.dirname(path))) 152 | with open(dst, 'w') as f: 153 | f.write(out_str) 154 | 155 | return None 156 | else: 157 | return out_str 158 | 159 | -------------------------------------------------------------------------------- /pymln/semantic/ParseParams.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class ParseParams(object): 4 | minMCCnt = 10 5 | minAbsCnt = 50 6 | priorCutOff = 10 7 | priorNumArgComb = 1 8 | priorMerge = 0 9 | priorNumParam = 5 10 | priorNumConj = 10 11 | 12 | -------------------------------------------------------------------------------- /pymln/semantic/SearchOp.py: -------------------------------------------------------------------------------- 1 | 2 | from multivac.pymln.semantic import Clust 3 | 4 | class SearchOp(object): 5 | # Why are these strings instead of just integers? 
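# The three search operations used by the USP learner: OP_MERGE_CLUST merges
# two clusters (_clustIdx1/_clustIdx2); OP_MERGE_ROLE merges two argument
# clusters within a single cluster (_clustIdx, _argIdx1/_argIdx2); and
# OP_COMPOSE composes a parent cluster with a child cluster
# (_parClustIdx/_chdClustIdx). genString() below shows how each is rendered.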
6 | OP_MERGE_CLUST = '0' 7 | OP_MERGE_ROLE = '1' 8 | OP_COMPOSE = '2' 9 | 10 | def __init__(self): 11 | self._op = '' 12 | self._clustIdx1 = None 13 | self._clustIdx2 = None 14 | self._clustIdx = None 15 | self._argIdx1 = None 16 | self._argIdx2 = None 17 | self._parClustIdx = None 18 | self._chdClustIdx = None 19 | self._str = None 20 | 21 | def __hash__(self): 22 | return hash(self.toString()) 23 | 24 | def __eq__(self, other): 25 | return self.compareTo(other) == 0 26 | 27 | def __lt__(self, other): 28 | return self.compareTo(other) < 0 29 | 30 | def __repr__(self): 31 | return self.toString() 32 | 33 | def compareTo(self, z): 34 | this = sum([ord(x) for x in self.toString()]) 35 | that = sum([ord(x) for x in z.toString()]) 36 | result = this - that 37 | 38 | return result 39 | 40 | def toString(self): 41 | if self._str is None: 42 | self.genString() 43 | 44 | return self._str 45 | 46 | def genString(self): 47 | self._str = "OP_{}:".format(self._op) 48 | 49 | if self._op == SearchOp.OP_MERGE_CLUST: 50 | c1 = Clust.getClust(self._clustIdx1) 51 | c2 = Clust.getClust(self._clustIdx2) 52 | self._str += "{} == {}".format(c1.toString(), c2.toString()) 53 | elif self._op == SearchOp.OP_MERGE_ROLE: 54 | self._str += "{}:{}:{}".format(self._clustIdx, 55 | self._argIdx1, 56 | self._argIdx2) 57 | elif self._op == SearchOp.OP_COMPOSE: 58 | rc = Clust.getClust(self._parClustIdx) 59 | ac = Clust.getClust(self._chdClustIdx) 60 | self._str += "{} ++ {}".format(rc.toString(), ac.toString()) 61 | 62 | 63 | -------------------------------------------------------------------------------- /pymln/semantic/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from . ParseParams import ParseParams 3 | from . Argument import Argument 4 | from . argclust import ArgClust 5 | 6 | from . Clust import Clust 7 | from . SearchOp import SearchOp 8 | from . Part import Part 9 | from . Agenda import Agenda 10 | from . Scorer import Scorer as Scorer 11 | from . Executor import Executor 12 | from . MLN import MLN 13 | 14 | from . Parse import Parse 15 | -------------------------------------------------------------------------------- /pymln/semantic/argclust.py: -------------------------------------------------------------------------------- 1 | 2 | # from collections import OrderedDict 3 | from sortedcontainers import SortedSet 4 | from multivac.pymln.syntax.Relations import ArgType 5 | 6 | class ArgClust(object): 7 | def __init__(self): 8 | # Dictionary mapping {int: int} 9 | self._argTypeIdx_cnt = {} 10 | # Dictionary mapping {int: int} 11 | self._chdClustIdx_cnt = {} 12 | # Dictionary mapping {int: int} 13 | self._argNum_cnt = {} 14 | self._ttlArgCnt = 0 15 | self._partRootTreeNodeIds = SortedSet() 16 | 17 | def toString(self): 18 | s = '' 19 | for k, v in self._argTypeIdx_cnt.items(): 20 | if len(s) > 0: 21 | s += ' ' 22 | s += '{}:{}'.format(ArgType.getArgType(k), v) 23 | 24 | return s 25 | 26 | 27 | -------------------------------------------------------------------------------- /pymln/syntax/Nodes/Article.py: -------------------------------------------------------------------------------- 1 | 2 | from multivac.pymln.syntax.Nodes import Sentence 3 | 4 | class Article(object): 5 | ''' 6 | An Article() is merely a collection of Sentences() (represented as a list) 7 | and an article id, which can be of any particular type but should be unique 8 | in a collection of Articles. 
9 | ''' 10 | def __init__(self, fn=None): 11 | self.uid = fn 12 | self.sentences = [] 13 | 14 | def __repr__(self): 15 | return str(self.__dict__) 16 | 17 | 18 | -------------------------------------------------------------------------------- /pymln/syntax/Nodes/Sentence.py: -------------------------------------------------------------------------------- 1 | from sortedcontainers import SortedSet 2 | from multivac.pymln.syntax.Nodes.Token import Token 3 | 4 | class Sentence(object): 5 | def __init__(self): 6 | ''' 7 | Each sentence consists of: 8 | _tokens: A list of individual tokens in the sentence, containing POS, 9 | lemma, and actual form of the word/item. 10 | _tkn_children: A dictionary mapping parents (denoted by the integer 11 | keys) to children (sets of integer, string tuples). 12 | _tkn_par: A dictionary mapping children (denoted by integer keys) to 13 | parents (tuples of string, integer values) 14 | ''' 15 | self._tokens = [] 16 | 17 | # Dictionary mapping {int: set((int, str))} 18 | self._tkn_children = {0: SortedSet()} 19 | # Dictionary mapping {int: (str, int)} 20 | self._tkn_par = {} 21 | 22 | return None 23 | 24 | 25 | def __repr__(self): 26 | return ('Tokens: ' + str([str(x) for x in self._tokens])) 27 | 28 | def get_tokens(self, idx=None): 29 | ''' 30 | Return Tokens at the specified indices. 31 | ''' 32 | if idx is None: 33 | return self._tokens 34 | elif isinstance(idx, list): 35 | return [self._tokens[i] for i in idx] 36 | elif isinstance(idx, int): 37 | return self.get_token(idx) 38 | else: 39 | raise ValueError 40 | 41 | 42 | def get_token(self, idx): 43 | ''' 44 | Return the Token() at the specified index. 45 | ''' 46 | return self._tokens[idx] 47 | 48 | def add_token(self, tok): 49 | ''' 50 | Append the Token() to the list of _tokens. 51 | ''' 52 | assert isinstance(tok, Token) 53 | self._tokens.append(tok) 54 | 55 | return None 56 | 57 | def get_children(self, parent=None): 58 | ''' 59 | Return the child/children of the parent specified by the given key. If 60 | no key specified, return them all. 61 | ''' 62 | if parent is not None: 63 | if parent in self._tkn_children: 64 | c = self._tkn_children[parent] 65 | else: 66 | c = None 67 | else: 68 | c = self._tkn_children 69 | 70 | return c 71 | 72 | def set_children(self, parent, kids): 73 | ''' 74 | Add the child/children specified by the key/kids key/value pair. 75 | ''' 76 | assert isinstance(kids, SortedSet) 77 | self._tkn_children[parent] = kids 78 | 79 | return None 80 | 81 | def add_child(self, parent, kid): 82 | ''' 83 | Add/update the child/children specified by the key/kids key/value pair. 84 | ''' 85 | assert parent in self._tkn_children 86 | self._tkn_children[parent].add(kid) 87 | 88 | return None 89 | 90 | def get_parent(self, kid): 91 | ''' 92 | Return the parent of the child specified by the given key. 93 | ''' 94 | if kid in self._tkn_par: 95 | return self._tkn_par[kid] 96 | else: 97 | return None 98 | 99 | def set_parent(self, kid, parent): 100 | ''' 101 | Add/update the parent specified by the given key/parent value pair. 
102 | ''' 103 | assert isinstance(parent, tuple) 104 | self._tkn_par[kid] = parent 105 | 106 | return None 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /pymln/syntax/Nodes/Token.py: -------------------------------------------------------------------------------- 1 | 2 | class Token(object): 3 | 4 | contentPOS = set(['J','R','V','N']) 5 | 6 | def isContent(t): 7 | return t._pos[0] in Token.contentPOS 8 | 9 | def isVerb(t): 10 | return t._pos[0] == 'V' 11 | 12 | def isNoun(t): 13 | return (t._pos[0] == 'N') | (self._pos.startswith('PRP')) 14 | 15 | 16 | def __init__(self, pos, lemma, form=None): 17 | self._pos = pos 18 | 19 | if Token.isContent(self): 20 | self._pos = pos[0] 21 | 22 | self._lemma = lemma 23 | 24 | if form is None: 25 | self._form = lemma 26 | else: 27 | self._form = form 28 | 29 | def __hash__(self): 30 | return hash(self.toString()) 31 | 32 | def __lt__(self, other): 33 | return self.compareTo(other) < 0 34 | 35 | def __eq__(self, other): 36 | return self.compareTo(other) == 0 37 | 38 | def __str__(self): 39 | return self.toString() 40 | 41 | def getForm(self): 42 | return self._form 43 | 44 | def getPOS(self): 45 | return self._pos 46 | 47 | def getLemma(self): 48 | return self._lemma 49 | 50 | def compareTo(self, t): 51 | this = sum([ord(x) for x in self._lemma]) 52 | that = sum([ord(x) for x in t._lemma]) 53 | result = this - that 54 | 55 | if result == 0: 56 | this = sum([ord(x) for x in self._pos]) 57 | that = sum([ord(x) for x in t._pos]) 58 | result = this - that 59 | return result 60 | 61 | def equals(self, t): 62 | return (self._pos == t._pos) & (self._lemma == t._lemma) 63 | 64 | def hashCode(self): 65 | return hash(self) 66 | 67 | def toString(self): 68 | return (self._pos + ":" + self._lemma) 69 | -------------------------------------------------------------------------------- /pymln/syntax/Nodes/TreeNode.py: -------------------------------------------------------------------------------- 1 | 2 | # from collections import OrderedDict 3 | from sortedcontainers import SortedDict, SortedSet 4 | from multivac.pymln.syntax.Nodes import Token 5 | 6 | class TreeNode(object): 7 | # map {str: TreeNode} 8 | id_treeNodes = {} 9 | 10 | def __init__(self, tree_node_id, token): 11 | self._id = tree_node_id 12 | self._tkn = token 13 | # map {str: set(TreeNodes)} 14 | self._children = SortedDict() 15 | TreeNode.id_treeNodes[tree_node_id] = self 16 | 17 | def __hash__(self): 18 | return hash(self.toString()) 19 | 20 | def __eq__(self, other): 21 | return self.compareTo(other) == 0 22 | 23 | def __lt__(self, other): 24 | return self.compareTo(other) < 0 25 | 26 | def __str__(self): 27 | return self.toString() 28 | 29 | def __repr__(self): 30 | return self.toString() 31 | 32 | def addChild(self, dep, child): 33 | if dep not in self._children: 34 | self._children[dep] = SortedSet() 35 | 36 | self._children[dep].add(child) 37 | 38 | return None 39 | 40 | def getId(self): 41 | return self._id 42 | 43 | def getToken(self): 44 | return self._tkn 45 | 46 | def getChildren(self): 47 | return self._children 48 | 49 | def compareTo(self, z): 50 | if not isinstance(z, TreeNode): 51 | raise ValueError 52 | 53 | return self._tkn.compareTo(z._tkn) 54 | 55 | def toString(self): 56 | return self._tkn.toString() 57 | 58 | def getTreeNode(tree_node_id): 59 | return TreeNode.id_treeNodes[tree_node_id] 60 | 61 | def getTreeStr(self): 62 | id_str = SortedDict() 63 | 64 | if (len(self._children) > 0): 65 | for dep, nodes in 
self._children.items(): 66 | s = '' 67 | 68 | for node in nodes: 69 | if dep.startswith('prep_') or dep.startswith('conj_'): 70 | s = dep[5:] + ' ' 71 | s = s + node.getTreeStr() 72 | id_str[node.getId()] = s 73 | 74 | id_str[self._id] = self._tkn.getLemma() 75 | result = ' '.join(id_str.values()) 76 | 77 | return result 78 | 79 | 80 | -------------------------------------------------------------------------------- /pymln/syntax/Nodes/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from multivac.pymln.syntax.Nodes.Article import Article 4 | from multivac.pymln.syntax.Nodes.Sentence import Sentence 5 | from multivac.pymln.syntax.Nodes.Token import Token 6 | from multivac.pymln.syntax.Nodes.TreeNode import TreeNode 7 | 8 | -------------------------------------------------------------------------------- /pymln/syntax/Relations/ArgType.py: -------------------------------------------------------------------------------- 1 | 2 | from multivac.pymln.syntax.Relations import RelType 3 | 4 | class ArgType(object): 5 | argTypes = [] 6 | # Dictionary mapping {str: int} 7 | argTypeStr_idx = {} 8 | 9 | def __init__(self, target): 10 | s = target.toString() 11 | self._dep = target.getDep() 12 | self._dep2 = target.getDep2() 13 | self._str = None 14 | 15 | if target.getTreeRoot() is not None: 16 | self._relTypeIdx = RelType.getRelType(target.getTreeRoot()) 17 | else: 18 | self._relTypeIdx = -1 19 | 20 | self._str = self.toString() 21 | ArgType.argTypes.append(self) 22 | i = len(ArgType.argTypes) - 1 23 | ArgType.argTypeStr_idx[s] = i 24 | 25 | def __hash__(self): 26 | return hash(self.toString()) 27 | 28 | def __eq__(self, other): 29 | return self.compareTo(other) == 0 30 | 31 | def __str__(self): 32 | return self.toString() 33 | 34 | def __repr__(self): 35 | return self.toString() 36 | 37 | def getArgType(target): 38 | if isinstance(target, int): 39 | return ArgType.argTypes[target] 40 | elif not isinstance(target, str): 41 | s = target.toString() 42 | 43 | if s not in ArgType.argTypeStr_idx: 44 | t = ArgType(target) 45 | 46 | return ArgType.argTypeStr_idx[s] 47 | 48 | def compareTo(self, z): 49 | if self._dep is None or z.GetDep() is None: 50 | return None 51 | 52 | this = sum([ord(x) for x in self._dep]) 53 | that = sum([ord(x) for x in z.getDep()]) 54 | result = this - that 55 | 56 | if result == 0: 57 | result = self._relTypeIdx - z._relTypeIdx 58 | 59 | if result == 0: 60 | if self._dep2 is not None: 61 | this = sum([ord(x) for x in self._dep2]) 62 | 63 | try: 64 | that = sum([ord(x) for x in z.getDep2()]) 65 | except TypeError: 66 | result = -1 67 | else: 68 | result = this - that 69 | 70 | return result 71 | 72 | def toString(self): 73 | if self._str is None: 74 | self._str = '<' + self._dep 75 | 76 | if self._relTypeIdx >= 0: 77 | self._str += ':{}:{}'.format(self._relTypeIdx,self._dep2) 78 | 79 | self._str += '>' 80 | 81 | return self._str 82 | 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /pymln/syntax/Relations/Path.py: -------------------------------------------------------------------------------- 1 | 2 | from multivac.pymln.syntax.Relations import RelType, ArgType 3 | 4 | class Path(object): 5 | def __init__(self, dep, treeRoot=None, argNode=None, dep2=None): 6 | self._dep = dep 7 | self._treeRoot = treeRoot 8 | self._argNode = argNode 9 | self._dep2 = dep2 10 | self._str = None 11 | 12 | self._argTypeIdx = ArgType.getArgType(self) 13 | 14 | def __str__(self): 15 | return 
self.toString() 16 | 17 | def __repr__(self): 18 | return self.toString() 19 | 20 | def getDep(self): 21 | return self._dep 22 | 23 | def getTreeRoot(self): 24 | return self._treeRoot 25 | 26 | def getArgNode(self): 27 | return self._argNode 28 | 29 | def getDep2(self): 30 | return self._dep2 31 | 32 | def getArgType(self): 33 | return self._argTypeIdx 34 | 35 | def toString(self): 36 | if self._str is None: 37 | self._str = self.genTypeStr() 38 | 39 | return self._str 40 | 41 | def genTypeStr(self): 42 | typ_str = '<' + self._dep 43 | 44 | if self._treeRoot is not None: 45 | rel_str = RelType.genTypeStr(self._treeRoot) 46 | typ_str += ':' + rel_str + ':' + self._dep2 47 | 48 | typ_str += '>' 49 | 50 | return typ_str 51 | 52 | -------------------------------------------------------------------------------- /pymln/syntax/Relations/RelType.py: -------------------------------------------------------------------------------- 1 | 2 | from multivac.pymln.syntax.Nodes import Token, TreeNode 3 | 4 | class RelType(object): 5 | relTypes = [] 6 | # Dictionary mapping {str: int} tracking RelType strings and 7 | # their unique indices. 8 | relTypeStr_idx = {} 9 | 10 | def __init__(self, target): 11 | self._str = RelType.genTypeStr(target) 12 | 13 | if Token.isContent(target._tkn): 14 | self._type = 'C' 15 | else: 16 | self._type = 'N' 17 | 18 | RelType.relTypeStr_idx[self._str] = len(RelType.relTypes) 19 | RelType.relTypes.append(self) 20 | 21 | def __hash__(self): 22 | return hash(self.toString()) 23 | 24 | def __eq__(self, other): 25 | return self.compareTo(other) == 0 26 | 27 | def getType(self): 28 | return self._type 29 | 30 | def getRelType(target): 31 | if target is None: 32 | result = None 33 | elif isinstance(target,int): 34 | result = RelType.relTypes[target] 35 | else: 36 | type_str = RelType.genTypeStr(target) 37 | 38 | if type_str not in RelType.relTypeStr_idx: 39 | t = RelType(target) 40 | 41 | result = RelType.relTypeStr_idx[type_str] 42 | 43 | return result 44 | 45 | def genTypeStr(tn): 46 | type_str = '(' 47 | type_str += tn.toString() 48 | children = tn.getChildren() 49 | 50 | if len(children) > 0: 51 | for child in children: 52 | type_str += ' (' + child 53 | tree_nodes = children[child] 54 | 55 | for node in tree_nodes: 56 | type_str += ' ' + RelType.genTypeStr(node) 57 | 58 | type_str += ')' 59 | 60 | type_str += ')' 61 | 62 | return type_str 63 | 64 | def compareTo(self, z): 65 | this = sum([ord(x) for x in self._str]) 66 | that = sum([ord(x) for x in z.toString()]) 67 | result = this - that 68 | 69 | return result 70 | 71 | def toString(self): 72 | return self._str 73 | 74 | -------------------------------------------------------------------------------- /pymln/syntax/Relations/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from . ArgType import ArgType 4 | from . Path import Path 5 | from . 
RelType import RelType 6 | 7 | -------------------------------------------------------------------------------- /pymln/syntax/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from multivac.pymln.syntax.Nodes import Article, Sentence, Token 3 | from multivac.pymln.syntax.Relations import ArgType, Path, RelType 4 | from multivac.pymln.syntax import StanfordParseReader 5 | 6 | -------------------------------------------------------------------------------- /pymln/utils/Utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 4 | # Utility functions for pymln parsing 5 | # 6 | 7 | import math 8 | 9 | def inc_key(d, key, inc=1): 10 | if key not in d: 11 | d[key] = inc 12 | else: 13 | d[key] += inc 14 | 15 | return d 16 | 17 | def dec_key(d, key, base=None, dec=1, remove=False): 18 | if key not in d: 19 | if base is None: 20 | d = None 21 | else: 22 | d[key] = base - dec 23 | else: 24 | d[key] -= dec 25 | 26 | if remove and d[key] <= 0: 27 | del d[key] 28 | 29 | return d 30 | 31 | 32 | def genTreeNodeID(aid, sid, wid): 33 | node_id = '{0}:{1}:{2:03d}'.format(aid, sid, wid) 34 | 35 | return node_id 36 | 37 | 38 | class java_iter(object): 39 | def __init__(self, it): 40 | self.it = iter(it) 41 | self._hasnext = None 42 | 43 | def __iter__(self): return self 44 | 45 | def next(self): 46 | if self._hasnext: 47 | result = self._thenext 48 | else: 49 | result = next(self.it) 50 | self._hasnext = None 51 | 52 | return result 53 | 54 | def hasnext(self): 55 | if self._hasnext is None: 56 | try: 57 | self._thenext = next(self.it) 58 | except StopIteration: 59 | self._hasnext = False 60 | else: 61 | self._hasnext = True 62 | 63 | return self._hasnext 64 | 65 | 66 | def compareStr(s, t): 67 | # compare each character until there's a difference!!! 
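# Note: as written, this orders strings by the sum of their character
# ordinals rather than by a character-by-character comparison, so two
# different strings can compare as equal. Token.compareTo() and
# RelType.compareTo() use the same scheme.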
68 | this = sum([ord(x) for x in s]) 69 | that = sum([ord(x) for x in t]) 70 | result = this - that 71 | 72 | return result 73 | 74 | 75 | def xlogx(x): 76 | if x <= 0: 77 | result = 0 78 | else: 79 | result = x * math.log(x) 80 | 81 | return result 82 | 83 | 84 | -------------------------------------------------------------------------------- /pymln/utils/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from multivac.pymln.utils.Utils import inc_key, dec_key, compareStr 3 | from multivac.pymln.utils.Utils import java_iter, genTreeNodeID, xlogx 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | stanfordnlp==0.2.0 2 | networkx==2.3 3 | numpy==1.16.2 4 | matplotlib==3.1.1 5 | nltk==3.4.5 6 | Flask==1.1.1 7 | scipy==1.2.1 8 | -e git://github.com/pgr-gallup/slate.git#egg=slate==0.5.2 9 | interruptingcow==0.8 10 | tqdm==4.34.0 11 | torch==1.2.0 12 | pandas==0.24.2 13 | tensorflow==1.15.2 14 | Unidecode==1.0.23 15 | fastcluster==1.1.25 16 | git+https://github.com/titipata/pubmed_parser.git 17 | stanford_corenlp==3.9.2 18 | requests==2.21.0 19 | py2neo==4.3.0 20 | sympy==1.3 21 | sortedcontainers==2.1.0 22 | spacy==2.1.3 23 | feedparser==5.2.1 24 | beautifulsoup4==4.8.1 25 | python-dotenv==0.10.3 26 | scikit_learn==0.21.3 27 | 28 | -------------------------------------------------------------------------------- /settings.py: -------------------------------------------------------------------------------- 1 | import configparser 2 | import os 3 | from pathlib import Path 4 | 5 | from dotenv import load_dotenv 6 | from multivac.src import utilities 7 | 8 | cfg = configparser.ConfigParser() 9 | cfgDIR = Path(__file__).resolve().parent 10 | 11 | try: 12 | cfg.read(cfgDIR / config_file_name) 13 | except NameError: 14 | cfg.read(cfgDIR / 'multivac.cfg') 15 | 16 | root_dir = cfg['PATHS'].get('root_dir', cfgDIR) 17 | qgnet_dir = cfg['PATHS'].get('qgnet_dir', root_dir / '..' 
/ 'qgnet') 18 | 19 | sys_dir = cfg['PATHS'].get('sys_dir', root_dir/'sys') 20 | data_dir = cfg['PATHS'].get('data_dir', sys_dir/'data') 21 | raw_dir = cfg['PATHS'].get('raw_dir', data_dir/'raw') 22 | interim_dir = cfg['PATHS'].get('interim_dir', data_dir/'interim') 23 | processed_dir = cfg['PATHS'].get('processed_dir', data_dir/'processed') 24 | metadata_dir = cfg['PATHS'].get('metadata_dir', processed_dir/'metadata') 25 | models_dir = cfg['PATHS'].get('models_dir', sys_dir/'models') 26 | stanf_nlp_dir = cfg['PATHS'].get('stanf_nlp_dir', 27 | root_dir/'stanford_nlp_models') 28 | mln_dir = cfg['PATHS'].get('mln_dir', root_dir/'mln_models') 29 | 30 | # Get search and filter settings; default to empty lists 31 | terms = eval(cfg['SEARCH'].get('terms', '[]')) 32 | sources = eval(cfg['SEARCH'].get('sources', '[]')) 33 | arxiv_drops = eval(cfg['FILTER'].get('drops', '[]')) 34 | 35 | # make data directories if they don't already exist 36 | dirs = [ 37 | data_dir, 38 | raw_dir, 39 | interim_dir, 40 | processed_dir, 41 | metadata_dir, 42 | models_dir, 43 | stanf_nlp_dir, 44 | mln_dir, 45 | ] 46 | dirs += [raw_dir / x for x in sources] 47 | for _dir in dirs: 48 | utilities.mkdir(_dir) 49 | -------------------------------------------------------------------------------- /src/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/src/.gitkeep -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/src/__init__.py -------------------------------------------------------------------------------- /src/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/src/data/.gitkeep -------------------------------------------------------------------------------- /src/data/clean_text.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import json 4 | import os 5 | 6 | import bs4 7 | 8 | 9 | def get_abstract(soup): 10 | 11 | abstract_element = soup.find('abstract').find('p') 12 | abstract_text = abstract_element.text if abstract_element else '**NONE**' 13 | 14 | # needs to be smarter 15 | if 'Abstract' in abstract_text: 16 | abstract_text = abstract_text.replace('Abstract', '').strip() 17 | if abstract_text.startswith('.'): 18 | abstract_text = abstract_text[1:].strip() 19 | 20 | return abstract_text 21 | 22 | 23 | def get_authors(soup): 24 | 25 | author_elements = soup.find_all('author') 26 | 27 | authors = [] 28 | for author in author_elements: 29 | firstname = author.find('forename') 30 | lastname = author.find('surname') 31 | 32 | first = firstname.text if firstname else '**NONE**' 33 | last = lastname.text if lastname else '**NONE**' 34 | out = f'{first} {last}' 35 | 36 | # needs to be smarter 37 | if out.startswith('&'): 38 | out = out.replace('&', '').strip() 39 | 40 | authors.append(out) 41 | 42 | authors_list = list(set(authors)) 43 | 44 | return authors_list 45 | 46 | 47 | def get_content(soup): 48 | 49 | paragraph_elements = soup.find_all('p') 50 | paragraphs_list = [e.text for e in paragraph_elements] 51 | # potentially more cleaning here 52 | 53 | 
return paragraphs_list 54 | 55 | 56 | def get_references(soup): 57 | 58 | reference_elements = soup.find_all('ref') 59 | references_list = [e.text for e in reference_elements] 60 | # potentially more cleaning here 61 | 62 | return references_list 63 | 64 | 65 | def get_formulas(soup): 66 | 67 | formula_elements = soup.find_all('formula') 68 | formulas_list = [e.text for e in formula_elements] 69 | # potentially more cleaning here 70 | 71 | return formulas_list 72 | 73 | 74 | def get_title(soup): 75 | 76 | title_element = soup.find('titleStmt') 77 | title = title_element.text.strip('\n') 78 | 79 | return title 80 | 81 | 82 | def run(args_dict): 83 | 84 | indir = os.path.abspath(args_dict['indir']) 85 | 86 | # get all files in specified directory 87 | files = [x for x in os.walk(indir)][0][2] 88 | 89 | # temporary placeholder for all data 90 | complete_list = [] 91 | for f in files: 92 | 93 | # full path to input file 94 | fin = f'{indir}/{f}' 95 | 96 | # only operate on proper files from extract_text module 97 | if fin.endswith('.tei.xml'): 98 | 99 | tmpf = open(fin, 'r') 100 | content = tmpf.read() 101 | tmpf.close() 102 | 103 | soup = bs4.BeautifulSoup(content, 'xml') 104 | 105 | # gather all parsed data 106 | abstract = get_abstract(soup) 107 | authors = get_authors(soup) 108 | references = get_references(soup) 109 | formulas = get_formulas(soup) 110 | title = get_title(soup) 111 | 112 | # comes in as list, combine to full text 113 | tmp_content = get_content(soup) 114 | content = ' '.join(tmp_content) 115 | 116 | # cleaning fluff from main content 117 | for ref in references: 118 | content = content.replace(ref, '') 119 | for frm in formulas: 120 | content = content.replace(frm, '') 121 | for atr in authors: 122 | content = content.replace(atr, '') 123 | content = content.replace(abstract, '') 124 | 125 | structure = { 126 | f: { 127 | 'meta': { 128 | 'abstract': abstract, 129 | 'authors': authors, 130 | 'title': title 131 | }, 132 | 'text': content 133 | } 134 | } 135 | 136 | complete_list.append(structure) 137 | 138 | else: 139 | 140 | pass 141 | 142 | # file outpu handling 143 | outdir = os.path.abspath(args_dict['outdir']) 144 | stamp = datetime.datetime.now().strftime('%Y%M%d_%H%M%S') 145 | fname = f'output_{stamp}.json' 146 | fout = f'{outdir}/{fname}' 147 | 148 | f = open(fout, 'w') 149 | json.dump(complete_list, f) 150 | f.close() 151 | 152 | 153 | if __name__ == "__main__": 154 | parser = argparse.ArgumentParser( 155 | description="Parser for XMLized scholarly publications." 156 | ) 157 | parser.add_argument( 158 | "--indir", 159 | required=True, 160 | help="Path to the directory containing XMLs to process." 161 | ) 162 | parser.add_argument( 163 | "--outdir", 164 | required=True, 165 | help="Path to output directory for processed files." 
166 | ) 167 | 168 | args_dict = vars(parser.parse_args()) 169 | 170 | run(args_dict) 171 | -------------------------------------------------------------------------------- /src/data/glove.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import json 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from scipy.stats import zscore 8 | from sklearn.cross_decomposition import CCA 9 | from unidecode import unidecode 10 | 11 | from multivac import settings 12 | from rpy2.robjects import numpy2ri, pandas2ri, r 13 | 14 | 15 | def domain_adapted_CCA(DG_embed, DS_embed, NC=100): 16 | # calculate the z-score 17 | DG_embed_norm = zscore(DG_embed) 18 | DS_embed_norm = zscore(DS_embed) 19 | 20 | # Initialize CCA Model 21 | cca = CCA(n_components=NC) 22 | cca.fit(DG_embed_norm, DS_embed_norm) 23 | 24 | DA_embeddings = (cca.x_scores_ + cca.y_scores_)/2 25 | 26 | return cca, DA_embeddings 27 | 28 | 29 | def glove_main(): 30 | # Load data from nlp parsing 31 | with open('{}/articles-with-equations.json'.format(settings.data_dir), 'r', 32 | encoding='utf-8') as jf: 33 | src_data = json.load(jf) 34 | 35 | texts = [src_data[art]['text'] for art in src_data if 36 | src_data[art]['text'] is not None] 37 | 38 | # The "unidecode" step simplifies non-ASCII chars which 39 | # mess up the R GloVe engine. 40 | texts_df = pd.Series(texts).apply(lambda x: unidecode(x)) 41 | texts_df = pd.DataFrame({'text': texts_df}) 42 | 43 | # Source all the functions contained in the 'trainEmbeddings' R file 44 | r("source('{}/trainEmbeddings.R')".format('src/data')) 45 | 46 | # Call the main GloVe-embedding function from the R script 47 | trainEmbeddings_R = r("trainEmbeddings") 48 | 49 | # Train domain-specific GloVe embedding model and output as a Numpy Matrix 50 | pandas2ri.activate() 51 | DS_embeddings_R = trainEmbeddings_R(texts_df) 52 | pandas2ri.deactivate() 53 | 54 | DS_embeddings = numpy2ri.rpy2py(DS_embeddings_R[0]) 55 | 56 | # Get domain-specific GloVe vocabulary 57 | domain_spec_vocab = list(DS_embeddings_R[1]) 58 | 59 | # Load in Stanford's 'Common Crawl' domain-general Glove Embedding Model 60 | # Only pull out the words that are contained in our corpus 61 | # * This can take a while (~30min) - could use some optimization * 62 | DG_embeddings = loadGloveModel( 63 | '{}/glove.42B.300d.txt'.format(settings.data_dir), 64 | domain_spec_vocab 65 | ) 66 | 67 | # Processing to ensure rows match between the domain-general and 68 | # domain-specific embeddings 69 | # Get the domain-general vocabulary and convert the embedding dict to an array 70 | domain_gen_vocab = list(DG_embeddings.keys()) 71 | DG_embeddings_mat = np.array([DG_embeddings[w] for w in domain_gen_vocab]) 72 | 73 | # Find the indices of matching words 74 | both = set(domain_gen_vocab).intersection(domain_spec_vocab) 75 | indices_gen = [domain_gen_vocab.index(x) for x in both] 76 | indices_spec = [domain_spec_vocab.index(x) for x in both] 77 | indices_spec_notDG = [domain_spec_vocab.index(x) for x in 78 | domain_spec_vocab if x not in both] 79 | 80 | # Sort and subset domain-specific array to match indices of domain-general 81 | # array 82 | DS_embeddings_subset = DS_embeddings[indices_spec, :].copy() 83 | DG_embeddings_subset = DG_embeddings_mat[indices_gen, :].copy() 84 | 85 | # fit cca model 86 | cca_res, DA_embeddings = domain_adapted_CCA(DG_embeddings_subset, 87 | DS_embeddings_subset, NC=100) 88 | 89 | DS_embeddings_notinDG = DS_embeddings[indices_spec_notDG, :] 90 | DS_embeddings_notinDG_norm = zscore(DS_embeddings_notinDG)
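# Words that occur only in the domain-specific corpus have no CCA score, so
# project their z-scored domain-specific vectors through the fitted CCA
# y-weights to place them in the shared, domain-adapted space, then append
# them to the averaged CCA scores computed above.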
91 | 92 | DA_notinDG_embeddings = cca_res.y_weights_.T @ DS_embeddings_notinDG_norm.T 93 | DA_embeddings_final = np.append(DA_embeddings, DA_notinDG_embeddings.T, 94 | axis=0) 95 | 96 | # write data to disk 97 | np.savetxt('{}/da_embeddings.txt'.format(settings.models_dir), 98 | DA_embeddings_final, fmt='%f') 99 | 100 | 101 | def loadGloveModel(gloveFile, vocab): 102 | f = open(gloveFile, 'r') 103 | 104 | model = {} 105 | for line in f: 106 | splitLine = line.split() 107 | word = splitLine[0] 108 | if word in vocab: 109 | embedding = np.array([float(val) for val in splitLine[1:]]) 110 | model[word] = embedding 111 | 112 | return model 113 | 114 | 115 | if __name__ == '__main__': 116 | glove_main() 117 | -------------------------------------------------------------------------------- /src/data/make.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | from multivac.src.data.get import collect_get_main 4 | from multivac.src.data.process import collect_process_main 5 | 6 | 7 | def collect_main(): 8 | # query apis to obtain articles 9 | collect_get_main() 10 | 11 | # process article data for models 12 | collect_process_main() 13 | 14 | 15 | if __name__ == '__main__': 16 | collect_main() 17 | -------------------------------------------------------------------------------- /src/data/process.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import copy 4 | import json 5 | import os 6 | import pickle 7 | from collections import OrderedDict 8 | 9 | import pubmed_parser 10 | from bs4 import BeautifulSoup as bs 11 | 12 | import slate 13 | from multivac import settings 14 | from multivac.src import utilities 15 | 16 | 17 | def aggregate_pubmed(srcs, verbose=False): 18 | """Aggregate a set of Pubmed article text and metadata.""" 19 | pubmed_data = OrderedDict() 20 | pubmed_metadata = OrderedDict() 21 | for src in srcs: 22 | if verbose: 23 | print(src) 24 | try: 25 | temp = OrderedDict() 26 | metadata, text = parse_pubmed(str(src.absolute())) 27 | temp['metadata'] = metadata 28 | temp['metadata']['source'] = 'pubmed' 29 | temp['text'] = text 30 | try: 31 | k = metadata['doi'] 32 | except (KeyError, AttributeError): 33 | k = src.stem 34 | if len(text) > 0: 35 | pubmed_data[k] = temp 36 | pubmed_metadata[k] = metadata 37 | print(src) 38 | except Exception: 39 | if verbose: 40 | print('Error: %s' % src) 41 | pass 42 | dst = settings.metadata_dir / 'pubmed.pkl' 43 | with open(dst, 'wb') as f: 44 | pickle.dump(pubmed_metadata, f) 45 | return pubmed_data 46 | 47 | 48 | def collect_process_main(): 49 | output = {} 50 | for source in settings.sources: 51 | data_raw_dir = settings.raw_dir / source 52 | if source in ['arxiv', 'springer']: 53 | data = parse_articles_data(source, data_raw_dir) 54 | elif source == 'pubmed': 55 | srcs = [data_raw_dir / x for x in os.listdir(data_raw_dir)] 56 | data = aggregate_pubmed(srcs) 57 | if len(output) == 0: 58 | output = copy.deepcopy(data) 59 | else: 60 | output.update(data) 61 | arxiv_drops = [x.split()[0] for x in settings.arxiv_drops] 62 | filtered_output = filter_arxiv(output, arxiv_drops) 63 | save_outputs(filtered_output) 64 | return True 65 | 66 | 67 | def filter_arxiv(output, arxiv_drops): 68 | filtered_output = OrderedDict() 69 | for k, v in output.items(): 70 | if v['metadata']['source'] == 'arxiv': 71 | for term in v['metadata']['tags']: 72 | if term['term'] not in arxiv_drops: 73 |
filtered_output[copy.deepcopy(k)] = copy.deepcopy(v) 74 | else: 75 | filtered_output[copy.deepcopy(k)] = copy.deepcopy(v) 76 | return filtered_output 77 | 78 | 79 | def parse_articles_data(source, data_raw_dir, verbose=False): 80 | """Parse Arxiv and Springer article data.""" 81 | # load metadata 82 | fn = source + '.pkl' 83 | metadata_src = settings.metadata_dir / fn 84 | with open(metadata_src, 'rb') as f: 85 | metadata_ = pickle.load(f) 86 | 87 | # we'll just add the text to a new arxiv object, an ordered dict keyed on 88 | # doi or other id 89 | data = OrderedDict() 90 | for ix, article_metadata in enumerate(metadata_): 91 | 92 | # initialize temp dictionary 93 | temp = OrderedDict() 94 | temp['metadata'] = copy.deepcopy(article_metadata) 95 | temp['metadata']['source'] = source 96 | article_fn = article_metadata['fn'] 97 | if verbose: 98 | print(article_fn) 99 | src = data_raw_dir / article_fn 100 | 101 | # define key and value 102 | if source == 'arxiv': 103 | k = article_metadata['fn'].strip('.pdf') 104 | temp['text'] = parse_pdf(src) 105 | elif source == 'springer': 106 | k = article_metadata['doi'] 107 | temp['text'] = parse_html(src) 108 | elif source == 'pubmed': 109 | raise ValueError('pubmed not supported. Only "arxiv" and "springer" supported. ' 110 | 'Try "parse_pubmed() function"') 111 | else: 112 | raise ValueError('Only "arxiv" and "springer" supported as sources.') 113 | 114 | # populate interim dictionary 115 | data[k] = temp 116 | 117 | # save intermediate outputs 118 | data_interim_dst = settings.interim_dir / fn 119 | with open(data_interim_dst, 'wb') as f: 120 | pickle.dump(data, f) 121 | return data 122 | 123 | 124 | def parse_html(src): 125 | """Parse research paper HTML and return text.""" 126 | with open(src, 'r', encoding='utf-8') as f: 127 | raw_data_ = f.read() 128 | soup = bs(raw_data_) 129 | try: 130 | text = ' '.join(soup.find('article').get_text().split()) 131 | except AttributeError: 132 | text = None 133 | return text 134 | 135 | 136 | def parse_pdf(src): 137 | """Parse research paper PDF and return text.""" 138 | try: 139 | # try to open file 140 | with open(src, 'rb') as f: 141 | doc = slate.PDF(f) 142 | 143 | # get text: strip out newlines and extra spaces 144 | doc = ' '.join([' '.join(x.split()) for x in doc]) 145 | text = (doc.split(' Abstract ')[-1] 146 | .split(' Acknowledgments ')[0] 147 | .split(' ∗ ∗ ∗ ')[0] 148 | .strip() 149 | ) 150 | 151 | except Exception: 152 | text = None 153 | 154 | return text 155 | 156 | 157 | def parse_pubmed(src): 158 | """Parse pubmed xml article data and return metadata and text.""" 159 | metadata = pubmed_parser.parse_pubmed_xml(src) 160 | text = pubmed_parser.parse_pubmed_paragraph(src, all_paragraph=True) 161 | text = ' '.join(' '.join([x['text'] for x in text]).split()) 162 | return metadata, text 163 | 164 | 165 | def save_outputs(output, dst_dir=None, fn_prefix=None): 166 | if dst_dir is None: 167 | dst_dir = settings.processed_dir / 'data' 168 | utilities.mkdir(dst_dir) 169 | fn = 'data.json' 170 | if fn_prefix is not None: 171 | fn = fn_prefix + '_' + fn 172 | dst = dst_dir / fn 173 | with open(dst, 'wb') as f: 174 | json.dump(output, f) 175 | 176 | 177 | if __name__ == '__main__': 178 | collect_process_main() 179 | -------------------------------------------------------------------------------- /src/data/qgnet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import argparse 4 | import os 5 | import subprocess 6 
| 7 | import corenlp 8 | 9 | from multivac import settings 10 | from multivac.src.data.parsing import load_data 11 | from qgnet.test.testinput.preprocessing_pdf import (create_tf_idf, 12 | preprocess_pdf) 13 | 14 | os.environ["CORENLP_HOME"] = ('{}/stanford-corenlp-full-2018-10-05' 15 | .format(settings.models_dir)) 16 | 17 | 18 | def qgnet_main(args_dict): 19 | # first, run shell script, if necessary, in qgnet to create model 20 | subprocess.call([ 21 | '../{}/download_QG-Net.sh'.format(settings.qgnet_dir), 22 | args_dict['qgnet_path'] 23 | ]) 24 | 25 | # second, pre-process the pdfs 26 | jsonObj, allDocs = load_data('{}/da_embeddings.txt' 27 | .format(settings.models_dir)) 28 | abstracts = [] 29 | for value in jsonObj.values(): 30 | if "summary" in value['metadata']: 31 | abstracts.append(value['metadata']["summary"]) 32 | elif "abstract" in value['metadata']: 33 | abstracts.append(value['metadata']["abstract"]) 34 | 35 | nlp = corenlp.CoreNLPClient(output_format='json', properties={ 36 | 'timeout': '50000'}) 37 | 38 | features, tfidf = create_tf_idf(abstracts, False) 39 | 40 | for i, abstract in enumerate(abstracts): 41 | preprocess_pdf(abstract, features[i, :].toarray(), tfidf, nlp) 42 | 43 | # third, generate qg-net questions 44 | subprocess.call([ 45 | '../{}/qg_reproduce_LS.sh'.format(settings.qgnet_dir), 46 | args_dict['qgnet_path'], 47 | settings.models_dir 48 | ]) 49 | 50 | 51 | if __name__ == '__main__': 52 | 53 | parser = argparse.ArgumentParser( 54 | description="Parser for QGNet." 55 | ) 56 | parser.add_argument( 57 | "--qgnet_path", 58 | required=True, 59 | help="Path to QGNet questions." 60 | ) 61 | 62 | args_dict = vars(parser.parse_args()) 63 | 64 | qgnet_main(args_dict) 65 | -------------------------------------------------------------------------------- /src/data/textparsing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import re as reg 4 | 5 | 6 | def clean_doc(doc, spacynlp): 7 | ''' 8 | Clean individual documents and remove citations, URLs, emails, other 9 | trivial content. Returns cleaned doc 10 | ''' 11 | # Regex for cleaning 12 | re_citationsNumeric = reg.compile(r'(\[\d+)(,\s*\d+)*]') 13 | re_url = reg.compile(r'((http|ftp|https):\/\/)?[-a-zA-Z0-9@:%._\+~#=]"' 14 | r'{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)') 15 | re_intextcite = reg.compile(r"((?:[A-Za-z][A-Za-z'`-éü-]+)(?:,? (?:(?:and |& )" 16 | r"?(?:[A-Za-z][A-Za-z'`-éü-]+)|(?:et al.?)))*(?:,* " 17 | r"*((?:19|20)[0-9][0-9][a-z]*)(\s*&\s*[0-9]*[a-z]*)" 18 | r"*(, (\d+))*(?:, p.? [0-9]+)?| *\\((?:19|20)[0-9]" 19 | r"[0-9][a-z](\s*&)(?:, p.? 
[0-9]+)?\\)))") 20 | 21 | re_emptyCite = reg.compile(r"\(([\s]*[;]+[\s]*)+\)") 22 | re_emptyEg = reg.compile(r'\(e.g.[\s*;\s*]*[,]*\s*\)') 23 | re_clickHere = reg.compile(r'Click here[^.]*\.') 24 | re_cid = reg.compile(r"\(cid:\d+\)") 25 | re_email = reg.compile(r"[\w.-]+@[\w.-]+") 26 | re_emptyParens = reg.compile(r"\(\s*\)") 27 | re_emptySee = reg.compile(r"\(see(\s)*\)") 28 | re_sponsors = reg.compile(r'(This work was supported).+') 29 | re_arxivHeader = reg.compile(r"(a r X i v).*?(?=[a-zA-Z]{2,})") 30 | re_vixraHeader = reg.compile(r"^(\s?.?\s)+(v i X r a)") 31 | re_hyphenatedWords = reg.compile(r'\S(?=\S*[-]\s)([a-zA-Z-]+)(\s)[A-za-z]+') 32 | 33 | # Actual cleaning 34 | doc = reg.sub(re_cid, ' ', doc) 35 | doc = reg.sub(re_citationsNumeric, ' NumericCitation ', doc) 36 | doc = reg.sub(re_url, ' ', doc) 37 | doc = reg.sub(re_intextcite, ' Citation ', doc) 38 | doc = reg.sub(re_emptyCite, ' ', doc) 39 | doc = reg.sub(re_emptyEg, ' ', doc) 40 | doc = reg.sub(re_clickHere, ' ', doc) 41 | doc = reg.sub(re_email, ' ', doc) 42 | doc = reg.sub(re_emptyParens, ' ', doc) 43 | doc = reg.sub(re_emptySee, ' ', doc) 44 | doc = reg.sub(re_arxivHeader, ' ', doc) 45 | doc = reg.sub(re_vixraHeader, ' ', doc) 46 | 47 | # This work supported by --> all the way to end of document 48 | # Only remove this when it appears in the second half of the article 49 | for m in reg.finditer(re_sponsors, doc): 50 | if m.start() > (len(doc)/2): 51 | doc = reg.sub(re_sponsors, ' ', doc) 52 | 53 | # Handling hyphens - 2-28-2018 54 | for m in reg.finditer(re_hyphenatedWords, doc): 55 | match = m.group(0) 56 | 57 | mergedWord = match.replace(' ', '').replace('-', '') 58 | if mergedWord in spacynlp.vocab: 59 | 60 | doc = doc.replace(match, mergedWord) 61 | else: 62 | allWords = True 63 | for i in match.replace(' ', '').split('-'): 64 | allWords = allWords and (i in spacynlp.vocab) 65 | if allWords: 66 | doc = doc.replace(match, (match.replace(' ', ''))) 67 | else: 68 | doc = doc.replace(match, mergedWord) 69 | 70 | # De-dup for PUBMED articles, where the main text is sometimes duplicated 71 | sliceText = doc[0:500] 72 | count = doc.count(sliceText) 73 | 74 | if count > 1: 75 | posDup = doc.find(sliceText, 1) 76 | doc = doc[0:posDup-1] 77 | 78 | return doc 79 | -------------------------------------------------------------------------------- /src/data/trainEmbeddings.R: -------------------------------------------------------------------------------- 1 | pks = c( 2 | 'data.table', 3 | 'dplyr', 4 | 'text2vec', 5 | 'Rtsne', 6 | 'quanteda', 7 | 'doParallel', 8 | 'foreach' 9 | ) 10 | 11 | # Takes a list or vector of package names and loads them, installing 12 | # first if they are not already installed. 
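# e.g. getPackages(c('text2vec', 'quanteda')) attaches both packages,
# installing any that are missing; the call at the bottom of this file loads
# everything listed in 'pks'.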
13 | getPackages <- function(list.of.packages) { 14 | new.packages <- list.of.packages[!( 15 | list.of.packages %in% installed.packages()[,"Package"] 16 | )] 17 | 18 | if(length(new.packages)) install.packages(new.packages) 19 | lapply(list.of.packages,require,character.only=T) 20 | } 21 | 22 | # Fits GloVe embeddings model on data 23 | trainEmbeddings <- function(docs, 24 | term_count_min=5L, 25 | skip_grams_window=10L, 26 | word_vectors_size=300, 27 | x_max=100, 28 | n_iter=100, 29 | convergence_tol=0.01, 30 | learning_rate=0.05, 31 | verbose=FALSE) { 32 | toks <- tokens(tolower(docs)) 33 | feats <- dfm(toks, verbose=verbose) %>% 34 | dfm_trim(min_termfreq=term_count_min) %>% 35 | featnames() 36 | toks <- tokens_select(toks, feats, 37 | selection='keep', 38 | valuetype='fixed', 39 | padding=TRUE, 40 | case_insensitive=FALSE, 41 | verbose=TRUE) 42 | my_fcm <- fcm(toks, 43 | context="window", 44 | window=skip_grams_window, 45 | count="weighted", 46 | weights=1/(1:skip_grams_window), 47 | tri=TRUE) 48 | 49 | glove <- GlobalVectors$new(word_vectors_size=word_vectors_size, 50 | vocabulary=featnames(my_fcm), 51 | x_max=x_max, 52 | learning_rate=learning_rate) 53 | 54 | if(verbose) print('Fitting GloVe model...') 55 | 56 | wv_main = glove$fit_transform(my_fcm, 57 | n_iter=n_iter, 58 | convergence_tol=convergence_tol) 59 | 60 | if(verbose) print('Done.') 61 | 62 | # Combine context and target word vectors in the same manner as 63 | # original GloVe research 64 | word_vectors = wv_main + t(glove$components) 65 | 66 | results = list(word_vectors,feats) 67 | return(results) 68 | } 69 | 70 | getPackages(pks) 71 | -------------------------------------------------------------------------------- /src/gan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/src/gan/__init__.py -------------------------------------------------------------------------------- /src/gan/config.cfg: -------------------------------------------------------------------------------- 1 | [GAN] 2 | ROLLOUT_NUM = 2 3 | 4 | G_STEPS = 1 5 | D_STEPS = 1 6 | K_STEPS = 4 7 | 8 | SEED = 12 9 | TOTAL_EPOCHS = 200 10 | GENERATED_NUM = 100 11 | 12 | kg_directory = ../../data 13 | glove_dir = ../../models 14 | glove_file = DA_glove_embeddings_300.pkl 15 | glove_lower = True 16 | verbose = True 17 | 18 | [GENERATOR] 19 | verbose = True 20 | annot_file = data/query_annots.txt 21 | texts_file = data/query_texts.txt 22 | sample_dir = data/samples 23 | output_dir = ../../models 24 | grammar = None 25 | cuda = False 26 | 27 | uniform_init = None 28 | glorot_init = False 29 | kaiming_init = True 30 | 31 | #### Model configuration #### 32 | batch_size = 32 33 | dropout = 0. 34 | word_dropout = 0. 
35 | primitive_token_label_smoothing = 0.1 36 | lstm = lstm 37 | encoder = lstm 38 | 39 | # Embedding sizes 40 | embed_size = 128 41 | action_embed_size = 128 42 | field_embed_size = 64 43 | type_embed_size = 64 44 | 45 | # Hidden sizes 46 | hidden_size = 256 47 | att_vec_size = 256 48 | 49 | # readout layer 50 | no_query_vec_to_action_map = False 51 | readout = non_linear 52 | query_vec_to_action_diff_map = False 53 | 54 | # supervised attention 55 | sup_attention = False 56 | 57 | # parent information switch for decoder LSTM 58 | no_parent_production_embed = False 59 | no_parent_field_embed = False 60 | no_parent_field_type_embed = False 61 | no_parent_state = False 62 | 63 | no_input_feed = False 64 | no_copy = False 65 | 66 | # training schedule details 67 | PRE_G_EPOCHS = 50 68 | optimizer = Adam 69 | lr = 0.0001 70 | lr_decay = 0. 71 | beta_1 = 0.5 72 | log_every = 10 73 | clip_grad = 5. 74 | 75 | #### decoding/validation/testing #### 76 | beam_size = 5 77 | decode_max_time_step = 100 78 | 79 | 80 | [DISCRIMINATOR] 81 | device = cpu 82 | cuda = False 83 | verbose = False 84 | data = discriminator/data/multivac 85 | label_smoothing = 0.9 86 | 87 | #### Model configuration #### 88 | vocab_size = 0 89 | num_epochs = 5 90 | filter_sizes = (10, 5, 4, 3) 91 | num_filters = 20 92 | hidden_dims = 10 93 | dropout_prob1 = 0.5 94 | dropout_prob2 = 0.8 95 | 96 | # training schedule details 97 | batch_size = 64 98 | optim = adam 99 | lr = 0.0004 100 | beta_1 = 0.5 101 | wd = 0. 102 | -------------------------------------------------------------------------------- /src/gan/discriminator/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataset import MULTIVACDataset 2 | from .model import QueryGAN_Discriminator_CNN 3 | from .trainer import Trainer 4 | from .tree import Tree 5 | 6 | __all__ = [MULTIVACDataset, QueryGAN_Discriminator_CNN, Trainer, Tree] 7 | -------------------------------------------------------------------------------- /src/gan/discriminator/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | from copy import deepcopy 3 | 4 | import torch 5 | import torch.utils.data as data 6 | from multivac.src.gan.discriminator.tree import Tree 7 | from tqdm import tqdm 8 | 9 | 10 | # Dataset class for MULTIVAC dataset 11 | class MULTIVACDataset(data.Dataset): 12 | 13 | def __init__(self, path, vocab): 14 | super().__init__() 15 | self.vocab = vocab 16 | self.sentences = self.read_sentences(os.path.join(path, 'text.toks')) 17 | self.labels = MULTIVACDataset.read_labels(os.path.join(path, 'cat.txt')) 18 | self.size = self.labels.size(0) 19 | 20 | def __len__(self): 21 | return self.size 22 | 23 | def __getitem__(self, index): 24 | sent = deepcopy(self.sentences[index]) 25 | label = deepcopy(self.labels[index]) 26 | return (sent, label) 27 | 28 | def read_sentences(self, filename): 29 | with open(filename, 'r') as f: 30 | sentences = [self.read_sentence(line) for line in tqdm(f.readlines())] 31 | 32 | return sentences 33 | 34 | def read_sentence(self, line): 35 | indices = self.vocab.convertToIdx(line.split()) 36 | result = torch.tensor(indices, dtype=torch.long, device='cpu') 37 | 38 | return result 39 | 40 | @staticmethod 41 | def read_trees(filename): 42 | with open(filename, 'r') as f: 43 | trees = [MULTIVACDataset.read_tree(line) for line in tqdm(f.readlines())] 44 | 45 | return trees 46 | 47 | @staticmethod 48 | def read_tree(line): 49 | if isinstance(line, list): 50 | parents = 
line 51 | else: 52 | parents = list(map(int, line.split())) 53 | 54 | trees = dict() 55 | root = None 56 | 57 | for i in range(1, len(parents) + 1): 58 | if i - 1 not in trees.keys() and parents[i - 1] != -1: 59 | idx = i 60 | prev = None 61 | 62 | while True: 63 | parent = parents[idx - 1] 64 | 65 | if parent == -1: 66 | break 67 | 68 | tree = Tree() 69 | 70 | if prev is not None: 71 | tree.add_child(prev) 72 | 73 | trees[idx - 1] = tree 74 | tree.idx = idx - 1 75 | 76 | if parent - 1 in trees.keys(): 77 | trees[parent - 1].add_child(tree) 78 | break 79 | elif parent == 0: 80 | root = tree 81 | break 82 | else: 83 | prev = tree 84 | idx = parent 85 | 86 | return root 87 | 88 | @staticmethod 89 | def read_labels(filename): 90 | with open(filename, 'r') as f: 91 | labels = list(map(float, f.readlines())) 92 | labels = torch.tensor(labels, dtype=torch.float, device='cpu') 93 | 94 | return labels 95 | -------------------------------------------------------------------------------- /src/gan/discriminator/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.utils.data import DataLoader 4 | from tqdm import tqdm 5 | 6 | 7 | class QueryGAN_Discriminator_CNN(nn.Module): 8 | 9 | def __init__(self, args, vocab, vectors, output_shape): 10 | super(QueryGAN_Discriminator_CNN, self).__init__() 11 | 12 | self.args = args 13 | self.filter_sizes = eval(self.args['filter_sizes']) 14 | self.num_filters = self.args['num_filters'] 15 | self.hidden_dims = self.args['hidden_dims'] 16 | self.dropout_prob1 = self.args['dropout_prob1'] 17 | self.dropout_prob2 = self.args['dropout_prob2'] 18 | self.num_classes = output_shape 19 | self.channels_out = sum([((150-(k-1))//2)*self.num_filters 20 | for k in self.filter_sizes]) 21 | self.vocab = vocab 22 | 23 | self.emb = nn.Embedding(vocab.size(), vectors.size(1)) 24 | emb = torch.zeros(vocab.size(), vectors.size(1), dtype=torch.float, 25 | device=args['device']) 26 | emb.normal_(0, 0.05) 27 | 28 | for word in vocab.labelToIdx.keys(): 29 | if vocab.getIndex(word) < vectors.size(0): 30 | emb[vocab.getIndex(word)] = vectors[vocab.getIndex(word)] 31 | else: 32 | emb[vocab.getIndex(word)].zero_() 33 | 34 | self.emb.weight.data.copy_(emb) 35 | del emb 36 | 37 | self.emb.weight.requires_grad = False 38 | self.dropout1 = nn.Dropout(self.dropout_prob1) 39 | 40 | self.vocab_size = len(vocab) 41 | self.batchsize = self.args['batch_size'] 42 | self.num_epochs = self.args['num_epochs'] 43 | 44 | self.conv_blocks = nn.ModuleList( 45 | [nn.Sequential( 46 | nn.Conv1d(in_channels=vectors.shape[1], 47 | out_channels=self.num_filters, 48 | kernel_size=sz, 49 | stride=1, 50 | padding=0), 51 | nn.LeakyReLU(negative_slope=0.2), 52 | nn.BatchNorm1d(self.num_filters), 53 | nn.MaxPool1d(kernel_size=2), 54 | nn.Flatten()) for sz in self.filter_sizes] 55 | ) 56 | 57 | self.out = nn.Sequential( 58 | nn.Dropout(self.dropout_prob2), 59 | nn.Linear(self.channels_out, self.hidden_dims), 60 | nn.Linear(self.hidden_dims, self.num_classes) 61 | ) 62 | 63 | for block in self.conv_blocks: 64 | block.apply(self.init_weights) 65 | 66 | self.out.apply(self.init_weights) 67 | 68 | if self.args['optim'] == 'adam': 69 | self.optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, 70 | self.parameters()), 71 | betas = (self.args['beta_1'], 0.999), 72 | lr=self.args['lr'], 73 | weight_decay=self.args['wd']) 74 | elif self.args['optim'] == 'adagrad': 75 | self.optimizer = torch.optim.Adagrad(filter(lambda p: 
p.requires_grad, 76 | self.parameters()), 77 | lr=self.args['lr'], 78 | weight_decay=self.args['wd']) 79 | elif self.args['optim'] == 'sgd': 80 | self.optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, 81 | self.parameters()), 82 | lr=self.args['lr'], 83 | weight_decay=self.args['wd']) 84 | 85 | def init_weights(self, m): 86 | if type(m) in (nn.Linear, nn.Conv1d): 87 | nn.init.kaiming_uniform_(m.weight) 88 | 89 | if m.bias is not None: 90 | nn.init.constant_(m.bias, 0) 91 | 92 | def forward(self, verbatim_indices): 93 | embeddings = self.emb(verbatim_indices) 94 | embeddings = embeddings.permute(0, 2, 1) 95 | X = self.dropout1(embeddings) 96 | 97 | X = [conv(embeddings) for conv in self.conv_blocks] 98 | X_cat = torch.cat(X, 1) 99 | 100 | return self.out(X_cat) 101 | 102 | def predict(self, X): 103 | self.eval() 104 | 105 | with torch.no_grad(): 106 | yhat = self(X).softmax(dim=-1) 107 | 108 | scores, labels = yhat.topk(1, -1, True, True) 109 | return scores, labels 110 | 111 | def train_single_code(self, train): 112 | 113 | if self.args['label_smoothing']: 114 | criterion = SmoothedCrossEntropy(self.args['label_smoothing']) 115 | else: 116 | criterion = nn.CrossEntropyLoss() 117 | 118 | return self.trainer(train, criterion) 119 | 120 | def trainer(self, train, criterion): 121 | trainloader = DataLoader(train, batch_size=self.args['batch_size'], 122 | shuffle=True, num_workers=4) 123 | steps = len(trainloader) 124 | 125 | if self.args['device'] == 'cuda': 126 | self.cuda() 127 | self.optimizer.cuda() 128 | 129 | self.train() 130 | 131 | for i, (x, y) in enumerate(tqdm(trainloader)): 132 | verbs = x.to(self.args['device']) 133 | labels = y.to(self.args['device']) 134 | 135 | # Forward pass 136 | outputs = self(verbs) 137 | 138 | if not self.args['label_smoothing']: 139 | labels = labels.argmax(1) 140 | 141 | loss = criterion(outputs, labels) 142 | 143 | # Backward and optimize 144 | self.optimizer.zero_grad() 145 | loss.backward() 146 | self.optimizer.step() 147 | 148 | return loss.item() 149 | 150 | class SmoothedCrossEntropy(nn.Module): 151 | ''' 152 | Adapted from https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/train.py#L38 153 | ''' 154 | def __init__(self, smoothing): 155 | super(SmoothedCrossEntropy, self).__init__() 156 | 157 | self.smoothing = smoothing 158 | self.softmax = nn.LogSoftmax(dim=1) 159 | 160 | def forward(self, output, target): 161 | ''' 162 | output: Tensor of predictions for class labels of size 163 | batchsize * n_classes 164 | target: Onehot Tensor indicating actual class labels of size 165 | batchsize * n_classes 166 | ''' 167 | target = target * self.smoothing + (1 - target) * (1 - self.smoothing) 168 | return -(target * self.softmax(output)).mean() 169 | 170 | 171 | 172 | -------------------------------------------------------------------------------- /src/gan/discriminator/scripts/preprocess-multivac.py: -------------------------------------------------------------------------------- 1 | """ 2 | Preprocessing script for MULTIVAC data. 
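Illustrative data layout, inferred from split() and the __main__ block below
(file and column names come from this script, not from separate documentation):

    data/multivac/extracted_questions_labels.txt
        header line, then one example per line:
        id <TAB> sentence <TAB> category      (category is the 0/1 "real"/"fake" flag)

Running the script writes id.txt / text.txt / cat.txt, tokenizes text.txt into
text.toks with a Stanford parser, and builds vocab.txt and vocab-cased.txt for
the discriminator's MULTIVACDataset.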
3 | 4 | """ 5 | import argparse 6 | import glob 7 | import os 8 | import re 9 | 10 | from sklearn.model_selection import train_test_split 11 | from tqdm import tqdm 12 | 13 | from multivac.src.gan.utilities.utils import build_vocab 14 | from multivac.src.rdf_graph.rdf_parse import StanfordParser 15 | 16 | 17 | def dep_parse(filepath, parser): 18 | print('\nDependency parsing ' + filepath) 19 | dirpath = os.path.dirname(filepath) 20 | 21 | with open(filepath, 'r') as f: 22 | examples = f.readlines() 23 | 24 | with open(os.path.join(dirpath, 'text.toks'), 'w') as tokfile, \ 25 | open(os.path.join(dirpath, 'text.parents'), 'w') as parfile: 26 | 27 | for example in tqdm(examples): 28 | text = example.strip() 29 | 30 | if not text.endswith("?"): 31 | text = re.sub(r"\?", "", text) 32 | text += "?" 33 | 34 | sample_parse = parser.get_parse(text)['sentences'][0] 35 | tokens = [x['word'] for x in sample_parse['tokens']] 36 | deps = sorted(sample_parse['basicDependencies'], 37 | key=lambda x: x['dependent']) 38 | parents = [x['governor'] for x in deps] 39 | 40 | parfile.write(' '.join([str(x) for x in parents]) + '\n') 41 | tokfile.write(' '.join(tokens) + '\n') 42 | 43 | 44 | def gen_tokens(filepath, parser): 45 | print('\nTokenizing ' + filepath) 46 | dirpath = os.path.dirname(filepath) 47 | 48 | with open(filepath, 'r') as f: 49 | examples = f.readlines() 50 | 51 | with open(os.path.join(dirpath, 'text.toks'), 'w') as tokfile: 52 | 53 | for example in tqdm(examples): 54 | text = example.strip() 55 | 56 | if not text.endswith("?"): 57 | text = re.sub(r"\?", "", text) 58 | text += "?" 59 | 60 | sample_parse = parser.get_parse(text) 61 | tokens = [x['word'] for x in sample_parse['tokens']] 62 | tokfile.write(' '.join(tokens) + '\n') 63 | 64 | 65 | def make_dirs(dirs): 66 | for d in dirs: 67 | if not os.path.exists(d): 68 | os.makedirs(d) 69 | 70 | 71 | def split(filepath, dst_dir): 72 | ''' 73 | Input datafiles now have form: 74 | id \t sentence \t category (0, 1) 75 | id = id number 76 | sentence = text of sentence/query 77 | category = whether this is a "real" or "fake" sentence 78 | ''' 79 | with open(filepath) as datafile, \ 80 | open(os.path.join(dst_dir, 'text.txt'), 'w') as textfile, \ 81 | open(os.path.join(dst_dir, 'id.txt'), 'w') as idfile, \ 82 | open(os.path.join(dst_dir, 'cat.txt'), 'w') as catfile: 83 | datafile.readline() 84 | 85 | for line in datafile: 86 | i, text, cat = line.strip().split('\t') 87 | idfile.write(i + '\n') 88 | textfile.write(text + '\n') 89 | catfile.write(cat + '\n') 90 | 91 | 92 | def train_dev_test_split(filepath, dst_dir, 93 | train=0.7, dev=0.2, test=0.1): 94 | test = test/(train + test) 95 | 96 | with open(filepath, "r") as datafile: 97 | data = datafile.readlines() 98 | 99 | header = data[0] 100 | 101 | x_train, x_dev = train_test_split(data[1:], test_size=dev, shuffle=True) 102 | x_train, x_test = train_test_split(x_train, test_size=test, shuffle=True) 103 | 104 | with open(os.path.join(dst_dir, "MULTIVAC_train.txt"), "w") as f: 105 | f.write(header) 106 | for line in x_train: 107 | f.write(line) 108 | 109 | with open(os.path.join(dst_dir, "MULTIVAC_test_annotated.txt"), "w") as f: 110 | f.write(header) 111 | for line in x_dev: 112 | f.write(line) 113 | 114 | with open(os.path.join(dst_dir, "MULTIVAC_trial.txt"), "w") as f: 115 | f.write(header) 116 | for line in x_test: 117 | f.write(line) 118 | 119 | 120 | if __name__ == '__main__': 121 | 122 | parser = argparse.ArgumentParser( 123 | description='Preprocessing of MULTIVAC data for QueryGAN ' 124 | 
'discriminator training.') 125 | # data arguments 126 | parser.add_argument('-d', '--data', required=False, 127 | help='Path to source dataset.') 128 | 129 | args = vars(parser.parse_args()) 130 | 131 | print('=' * 80) 132 | print('Preprocessing MULTIVAC dataset') 133 | print('=' * 80) 134 | 135 | base_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 136 | data_dir = os.path.join(base_dir, 'data') 137 | multivac_dir = os.path.join(data_dir, 'multivac') 138 | 139 | prs = StanfordParser(annots='tokenize') 140 | 141 | split(os.path.join(multivac_dir, 'extracted_questions_labels.txt'), multivac_dir) 142 | gen_tokens(os.path.join(multivac_dir, 'text.txt'), prs) 143 | 144 | # get vocabulary 145 | build_vocab(glob.glob(os.path.join(multivac_dir, '*/*.toks')), 146 | os.path.join(multivac_dir, 'vocab.txt')) 147 | build_vocab(glob.glob(os.path.join(multivac_dir, '*/*.toks')), 148 | os.path.join(multivac_dir, 'vocab-cased.txt'), 149 | lowercase=False) 150 | -------------------------------------------------------------------------------- /src/gan/discriminator/trainer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tqdm import tqdm 3 | 4 | 5 | class Trainer(object): 6 | def __init__(self, args, model, criterion, optimizer, device): 7 | super(Trainer, self).__init__() 8 | self.args = args 9 | self.model = model 10 | self.criterion = criterion 11 | self.optimizer = optimizer 12 | self.device = device 13 | self.epoch = 0 14 | 15 | # helper function for training 16 | def train(self, dataset): 17 | self.model.train() 18 | self.optimizer.zero_grad() 19 | total_loss = 0.0 20 | indices = torch.randperm(len(dataset), dtype=torch.long, device=self.device) 21 | 22 | for idx in tqdm(range(len(dataset)), desc='Training epoch ' + str(self.epoch + 1) + ''): 23 | tree, inputs, label = dataset[indices[idx]] 24 | inputs = inputs.to(self.device) 25 | label = label.to(self.device).view(1, 1) 26 | output = self.model(tree, inputs) 27 | loss = self.criterion(output, label) 28 | total_loss += loss.item() 29 | loss.backward() 30 | 31 | if idx % self.args['batchsize'] == 0 and idx > 0: 32 | self.optimizer.step() 33 | self.optimizer.zero_grad() 34 | 35 | self.epoch += 1 36 | return total_loss / len(dataset) 37 | 38 | # helper function for testing 39 | def test(self, dataset): 40 | self.model.eval() 41 | 42 | with torch.no_grad(): 43 | total_loss = 0.0 44 | predictions = torch.zeros(len(dataset), dtype=torch.float, device=self.device) 45 | 46 | for idx in tqdm(range(len(dataset)), desc='Testing epoch ' + str(self.epoch) + ''): 47 | tree, inputs, label = dataset[idx] 48 | inputs, label = inputs.to(self.device), label.to(self.device).view(1, 1) 49 | output = self.model(tree, inputs) 50 | loss = self.criterion(output, label) 51 | total_loss += loss.item() 52 | output = output.squeeze().to('cpu') 53 | predictions[idx] = torch.round(output) 54 | 55 | return total_loss / len(dataset), predictions 56 | -------------------------------------------------------------------------------- /src/gan/discriminator/tree.py: -------------------------------------------------------------------------------- 1 | # tree object from stanfordnlp/treelstm 2 | class Tree(object): 3 | 4 | def __init__(self): 5 | self.parent = None 6 | self.num_children = 0 7 | self.children = list() 8 | 9 | def add_child(self, child): 10 | child.parent = self 11 | self.num_children += 1 12 | self.children.append(child) 13 | 14 | def size(self): 15 | if getattr(self, '_size'): 16 | return 
self._size 17 | count = 1 18 | for i in range(self.num_children): 19 | count += self.children[i].size() 20 | self._size = count 21 | return self._size 22 | 23 | def depth(self): 24 | if getattr(self, '_depth'): 25 | return self._depth 26 | count = 0 27 | if self.num_children > 0: 28 | for i in range(self.num_children): 29 | child_depth = self.children[i].depth() 30 | if child_depth > count: 31 | count = child_depth 32 | count += 1 33 | self._depth = count 34 | return self._depth 35 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/src/gan/gen_pyt/__init__.py -------------------------------------------------------------------------------- /src/gan/gen_pyt/asdl/__init__.py: -------------------------------------------------------------------------------- 1 | # import six 2 | # from .lang.lambda_dcs.lambda_dcs_transition_system import LambdaCalculusTransitionSystem 3 | # from .lang.prolog.prolog_transition_system import PrologTransitionSystem 4 | 5 | # if six.PY2: 6 | # from .lang.py.py_transition_system import PythonTransitionSystem 7 | # else: 8 | # from .lang.py3.py3_transition_system import Python3TransitionSystem 9 | # from asdl.lang.sql.sql_transition_system import SqlTransitionSystem 10 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/asdl/hypothesis.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from multivac.src.gan.gen_pyt.asdl.asdl import ASDLCompositeType 4 | from multivac.src.gan.gen_pyt.asdl.asdl_ast import AbstractSyntaxTree 5 | from multivac.src.gan.gen_pyt.asdl.transition_system import (ApplyRuleAction, 6 | GenTokenAction, 7 | ReduceAction) 8 | 9 | 10 | class Hypothesis(object): 11 | 12 | def __init__(self): 13 | self.tree = None 14 | self.actions = [] 15 | self.score = 0. 
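        # frontier_node / frontier_field (set below) track the deepest unfinished AST
        # node and the field the next action will fill; update_frontier_info() keeps
        # them current after every applied action.  _value_buffer accumulates GenToken
        # strings for a primitive 'string' field until the stop signal flushes them
        # into a single value.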
16 | self.frontier_node = None 17 | self.frontier_field = None 18 | self._value_buffer = [] 19 | 20 | # record the current time step 21 | self.t = 0 22 | 23 | def apply_action(self, action): 24 | if self.tree is None: 25 | assert isinstance(action, ApplyRuleAction), 'Invalid action [%s], only ApplyRule action is valid ' \ 26 | 'at the beginning of decoding' 27 | 28 | self.tree = AbstractSyntaxTree(action.production) 29 | self.update_frontier_info() 30 | elif self.frontier_node: 31 | if isinstance(self.frontier_field.type, ASDLCompositeType): 32 | if isinstance(action, ApplyRuleAction): 33 | field_value = AbstractSyntaxTree(action.production) 34 | field_value.created_time = self.t 35 | self.frontier_field.add_value(field_value) 36 | self.update_frontier_info() 37 | elif isinstance(action, ReduceAction): 38 | assert self.frontier_field.cardinality in ('optional', 'multiple'), 'Reduce action can only be ' \ 39 | 'applied on field with multiple ' \ 40 | 'cardinality' 41 | self.frontier_field.set_finish() 42 | self.update_frontier_info() 43 | else: 44 | raise ValueError('Invalid action [%s] on field [%s]' % (action, self.frontier_field)) 45 | else: # fill in a primitive field 46 | if isinstance(action, GenTokenAction): 47 | # only field of type string requires termination signal 48 | end_primitive = False 49 | if self.frontier_field.type.name == 'string': 50 | if action.is_stop_signal(): 51 | self.frontier_field.add_value(' '.join(self._value_buffer)) 52 | self._value_buffer = [] 53 | 54 | end_primitive = True 55 | else: 56 | self._value_buffer.append(action.token) 57 | else: 58 | self.frontier_field.add_value(action.token) 59 | end_primitive = True 60 | 61 | if end_primitive and self.frontier_field.cardinality in ('single', 'optional'): 62 | self.frontier_field.set_finish() 63 | self.update_frontier_info() 64 | 65 | elif isinstance(action, ReduceAction): 66 | assert self.frontier_field.cardinality in ('optional', 'multiple'), 'Reduce action can only be ' \ 67 | 'applied on field with multiple ' \ 68 | 'cardinality' 69 | self.frontier_field.set_finish() 70 | self.update_frontier_info() 71 | else: 72 | raise ValueError('Can only invoke GenToken or Reduce actions on primitive fields') 73 | 74 | self.t += 1 75 | self.actions.append(action) 76 | 77 | def update_frontier_info(self): 78 | def _find_frontier_node_and_field(tree_node): 79 | if tree_node: 80 | for field in tree_node.fields: 81 | # if it's an intermediate node, check its children 82 | if isinstance(field.type, ASDLCompositeType) and field.value: 83 | if field.cardinality in ('single', 'optional'): 84 | iter_values = [field.value] 85 | else: 86 | iter_values = field.value 87 | 88 | for child_node in iter_values: 89 | result = _find_frontier_node_and_field(child_node) 90 | if result: 91 | return result 92 | 93 | # now all its possible children are checked 94 | if not field.finished: 95 | return tree_node, field 96 | 97 | return None 98 | else: 99 | return None 100 | 101 | frontier_info = _find_frontier_node_and_field(self.tree) 102 | if frontier_info: 103 | self.frontier_node, self.frontier_field = frontier_info 104 | else: 105 | self.frontier_node, self.frontier_field = None, None 106 | 107 | def clone_and_apply_action(self, action): 108 | new_hyp = self.copy() 109 | new_hyp.apply_action(action) 110 | 111 | return new_hyp 112 | 113 | def copy(self): 114 | new_hyp = Hypothesis() 115 | if self.tree: 116 | new_hyp.tree = self.tree.copy() 117 | 118 | new_hyp.actions = list(self.actions) 119 | new_hyp.score = self.score 120 | 
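        # frontier_node / frontier_field are intentionally not copied: they reference
        # nodes of the original tree, so update_frontier_info() below recomputes them
        # against the cloned tree.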
new_hyp._value_buffer = list(self._value_buffer) 121 | new_hyp.t = self.t 122 | 123 | new_hyp.update_frontier_info() 124 | 125 | return new_hyp 126 | 127 | @property 128 | def completed(self): 129 | return self.tree and self.frontier_field is None 130 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/asdl/lang/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/src/gan/gen_pyt/asdl/lang/__init__.py -------------------------------------------------------------------------------- /src/gan/gen_pyt/asdl/lang/eng/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/src/gan/gen_pyt/asdl/lang/eng/__init__.py -------------------------------------------------------------------------------- /src/gan/gen_pyt/asdl/lang/eng/eng_asdl_helper.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from multivac.src.gan.gen_pyt.asdl.asdl import (ASDLCompositeType, 4 | ASDLConstructor, 5 | ASDLPrimitiveType, 6 | ASDLProduction, Field) 7 | from multivac.src.gan.gen_pyt.asdl.asdl_ast import (AbstractSyntaxTree, 8 | RealizedField) 9 | 10 | 11 | def find_match_paren(s): 12 | count = 0 13 | 14 | for i, c in enumerate(s): 15 | if c == "(": 16 | count += 1 17 | elif c == ")": 18 | count -= 1 19 | 20 | if count == 0: 21 | return i 22 | 23 | 24 | def english_ast_to_asdl_ast(text, depth=0, debug=False): 25 | ''' Takes a constituency parse string of an English sentence and creates 26 | an AbstractSyntaxTree object from it. 27 | 28 | Example input: 29 | '(ROOT (SBARQ (WHADVP (WRB Why)) (SQ (VBP do) (NP (NNS birds)) (ADVP 30 | (RB suddenly)) (VP (VB appear) (SBAR (WHADVP (WRB whenever)) (S (NP 31 | (PRP you)) (VP (VBP are) (ADJP (JJ near))))))) (. 
?)))' 32 | ''' 33 | 34 | if debug: 35 | print(("\t" * depth + "String: '{}'".format(text))) 36 | 37 | try: 38 | tree_str = text[text.index("(") + 1:text.rfind(")")] 39 | except ValueError: 40 | print(("Malformatted parse string: '{}'".format(text))) 41 | raise ValueError 42 | 43 | all_fields = [] 44 | next_idx = tree_str.index(" ") 45 | 46 | if "(" in tree_str: 47 | node_type = ASDLCompositeType(tree_str[:next_idx]) 48 | node_fields = [] 49 | 50 | while "(" in tree_str: 51 | tree_str = tree_str[tree_str.index("("):] 52 | next_idx = find_match_paren(tree_str) + 1 53 | child = english_ast_to_asdl_ast(tree_str[:next_idx], depth+1, debug) 54 | 55 | if isinstance(child, AbstractSyntaxTree): 56 | asdl_field = Field(child.production.type.name, 57 | child.production.type, 58 | 'single') 59 | all_fields.append(RealizedField(asdl_field, value=child)) 60 | else: 61 | asdl_field = child.field 62 | all_fields.append(child) 63 | 64 | node_fields.append(asdl_field) 65 | tree_str = tree_str[next_idx + 1:] 66 | 67 | field_str = ', '.join(["({})".format(f.name) for f in node_fields]) 68 | rule_str = node_type.name + " -> " + field_str 69 | constructor = ASDLConstructor(rule_str, node_fields) 70 | production = ASDLProduction(node_type, constructor) 71 | 72 | result = AbstractSyntaxTree(production, realized_fields=all_fields) 73 | else: 74 | node_type = ASDLPrimitiveType(tree_str[:next_idx]) 75 | result = RealizedField(Field(node_type.name, node_type, 'single'), 76 | value=tree_str[next_idx + 1:]) 77 | 78 | return result 79 | 80 | 81 | def asdl_ast_to_english(asdl_ast_node): 82 | tokens = [] 83 | 84 | for field in asdl_ast_node.fields: 85 | # for composite node 86 | field_value = None 87 | 88 | if isinstance(field.type, ASDLCompositeType) and field.value: 89 | field_value = asdl_ast_to_english(field.value) 90 | else: 91 | field_value = field.value 92 | 93 | tokens.append(field_value) 94 | 95 | return ' '.join([x if x else '' for x in tokens]) 96 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/asdl/lang/eng/eng_transition_system.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from multivac.src.gan.gen_pyt.asdl.lang.eng.eng_asdl_helper import ( 4 | asdl_ast_to_english, english_ast_to_asdl_ast) 5 | from multivac.src.gan.gen_pyt.asdl.transition_system import (GenTokenAction, 6 | TransitionSystem) 7 | from multivac.src.rdf_graph.rdf_parse import tokenize_text 8 | 9 | 10 | class EnglishTransitionSystem(TransitionSystem): 11 | 12 | def __init__(self, grammar): 13 | super().__init__(grammar) 14 | 15 | def tokenize_text(self, text, mode=None): 16 | return tokenize_text(text, mode) 17 | 18 | def surface_text_to_ast(self, text, parser): 19 | p = parser.get_parse(text)['sentences'][0]['parse'] 20 | return english_ast_to_asdl_ast(p) 21 | 22 | def ast_to_surface_text(self, asdl_ast): 23 | text = asdl_ast_to_english(asdl_ast) 24 | return text 25 | 26 | def compare_ast(self, hyp_ast, ref_ast): 27 | hyp_text = self.ast_to_surface_text(hyp_ast) 28 | ref_reformatted_text = self.ast_to_surface_text(ref_ast) 29 | 30 | ref_text_tokens = tokenize_text(ref_reformatted_text) 31 | hyp_text_tokens = tokenize_text(hyp_text) 32 | 33 | return ref_text_tokens == hyp_text_tokens 34 | 35 | def get_primitive_field_actions(self, realized_field): 36 | actions = [] 37 | 38 | if realized_field.value is not None: 39 | field_values = [realized_field.value] 40 | 41 | tokens = [] 42 | 43 | for field_val in field_values: 44 | 
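            # each primitive field value is whitespace-split; every resulting token is
            # then emitted as its own GenTokenAction (no explicit stop token is added here)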
tokens.extend(field_val.split(' ')) 45 | 46 | for tok in tokens: 47 | actions.append(GenTokenAction(tok)) 48 | 49 | return actions 50 | 51 | def is_valid_hypothesis(self, hyp, parser, **kwargs): 52 | try: 53 | hyp_text = self.ast_to_surface_text(hyp.tree) 54 | new_tree = self.surface_text_to_ast(hyp_text, parser) 55 | assert hyp.tree == new_tree 56 | except Exception: 57 | return False 58 | return True 59 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/asdl/lang/eng/grammar.py: -------------------------------------------------------------------------------- 1 | """ 2 | English grammar and typing system 3 | """ 4 | from collections import OrderedDict 5 | 6 | from multivac.src.gan.gen_pyt.asdl.asdl import (ASDLCompositeType, 7 | ASDLConstructor, ASDLGrammar, 8 | ASDLPrimitiveType, 9 | ASDLProduction, Field) 10 | from multivac.src.gan.gen_pyt.asdl.lang.eng.eng_asdl_helper import \ 11 | english_ast_to_asdl_ast 12 | from multivac.src.gan.gen_pyt.asdl.lang.grammar import Grammar 13 | 14 | BRACKET_TYPES = { 15 | ASDLPrimitiveType('-LRB-'): '(', 16 | ASDLPrimitiveType('-RRB-'): ')', 17 | ASDLPrimitiveType('-LCB-'): '{', 18 | ASDLPrimitiveType('-RCB-'): '}', 19 | ASDLPrimitiveType('-LSB-'): '[', 20 | ASDLPrimitiveType('-RSB-'): ']', 21 | } 22 | 23 | TERMINAL_TYPES = { 24 | ASDLPrimitiveType('CC'), # Coordinating conjunction 25 | ASDLPrimitiveType('CD'), # Cardinal number 26 | ASDLPrimitiveType('DT'), # Determiner 27 | ASDLPrimitiveType('EX'), # Existential there 28 | ASDLPrimitiveType('FW'), # Foreign word 29 | ASDLPrimitiveType('IN'), # Preposition or subordinating conjunction 30 | ASDLPrimitiveType('JJ'), # Adjective 31 | ASDLPrimitiveType('JJR'), # Adjective, comparative 32 | ASDLPrimitiveType('JJS'), # Adjective, superlative 33 | ASDLPrimitiveType('LS'), # List item marker 34 | ASDLPrimitiveType('MD'), # Modals 35 | ASDLPrimitiveType('NN'), # Noun, singular or mass 36 | ASDLPrimitiveType('NNS'), # Noun, plural 37 | ASDLPrimitiveType('NNP'), # Proper noun, singular 38 | ASDLPrimitiveType('NNPS'), # Proper noun, plural 39 | ASDLPrimitiveType('PDT'), # Predeterminer 40 | ASDLPrimitiveType('POS'), # Possessive ending 41 | ASDLPrimitiveType('PRP'), # Personal pronoun 42 | ASDLPrimitiveType('PRP$'), # Possessive pronoun (prolog version PRP-S) 43 | ASDLPrimitiveType('RB'), # Adverb 44 | ASDLPrimitiveType('RBR'), # Adverb, comparative 45 | ASDLPrimitiveType('RBS'), # Adverb, superlative 46 | ASDLPrimitiveType('RP'), # Particle 47 | ASDLPrimitiveType('SYM'), # Symbol 48 | ASDLPrimitiveType('TO'), # to 49 | ASDLPrimitiveType('UH'), # Interjection 50 | ASDLPrimitiveType('VB'), # Verb, base form 51 | ASDLPrimitiveType('VBD'), # Verb, past tense 52 | ASDLPrimitiveType('VBG'), # Verb, gerund or present participle 53 | ASDLPrimitiveType('VBN'), # Verb, past participle 54 | ASDLPrimitiveType('VBP'), # Verb, non-3rd person singular present 55 | ASDLPrimitiveType('VBZ'), # Verb, 3rd person singular present 56 | ASDLPrimitiveType('WDT'), # Wh-determiner 57 | ASDLPrimitiveType('WP'), # Wh-pronoun 58 | ASDLPrimitiveType('WP$'), # Possessive wh-pronoun (prolog version WP-S) 59 | ASDLPrimitiveType('WRB') # Wh-adverb 60 | } 61 | 62 | 63 | class EnglishGrammar(Grammar): 64 | 65 | def __init__(self, rules): 66 | super().__init__(rules) 67 | 68 | self.terminal_types.update(TERMINAL_TYPES) 69 | self.terminal_types.update(BRACKET_TYPES) 70 | 71 | 72 | class EnglishASDLGrammar(ASDLGrammar): 73 | """ 74 | Collection of types, constructors and productions 75 | """ 76 | 77 | 
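    # Intended usage, as a sketch only (assumes a reachable CoreNLP server behind
    # StanfordParser, as in query_treebank.extract_grammar; the example sentence is
    # arbitrary):
    #
    #   parser  = StanfordParser(annots="tokenize ssplit parse")
    #   grammar = EnglishASDLGrammar.from_text(["Why do birds suddenly appear?"], parser)
    #   some_id = grammar.prod2id[grammar.productions[0]]
    #
    # Productions are grouped by their head type, and prod2id / type2id / field2id
    # supply the integer ids consumed by the neural parser.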
def __init__(self, grammar=None, productions=None): 78 | # productions are indexed by their head types 79 | self._productions = OrderedDict() 80 | self._constructor_production_map = dict() 81 | 82 | if productions is not None: 83 | english_prods = set(productions) 84 | 85 | for prod in english_prods: 86 | if prod.type not in self._productions: 87 | self._productions[prod.type] = list() 88 | self._productions[prod.type].append(prod) 89 | self._constructor_production_map[prod.constructor.name] = prod 90 | 91 | self.root_type = ASDLCompositeType("ROOT") 92 | elif grammar is not None: 93 | if isinstance(grammar, ASDLGrammar): 94 | self = grammar 95 | return 96 | 97 | for rule in grammar.rules: 98 | fields = [] 99 | 100 | for child in rule.children: 101 | if grammar.is_terminal(child): 102 | child_type = ASDLPrimitiveType(child.type) 103 | else: 104 | child_type = ASDLCompositeType(child.type) 105 | 106 | fields.append(Field(child.type, child_type, 'single')) 107 | 108 | constructor = ASDLConstructor(rule.type, fields) 109 | production = ASDLProduction(ASDLCompositeType(rule.type), 110 | constructor) 111 | 112 | if production.type not in self._productions: 113 | self._productions[production.type] = list() 114 | 115 | self._productions[production.type].append(production) 116 | self._constructor_production_map[constructor.name] = production 117 | 118 | self.root_type = ASDLCompositeType(grammar.root_node.type) 119 | 120 | self.size = sum(len(head) for head in self._productions.values()) 121 | self.terminal_types = set(self.primitive_types) 122 | self.terminal_types.update(TERMINAL_TYPES) 123 | self.terminal_types.update(BRACKET_TYPES.keys()) 124 | 125 | self._types = sorted(self.terminal_types.union(set(self.types)), 126 | key=lambda x: x.name) 127 | 128 | # get entities to their ids map 129 | self.prod2id = {prod: i for i, prod in enumerate(self.productions)} 130 | self.type2id = {type: i for i, type in enumerate(self.types)} 131 | self.field2id = {field: i for i, field in enumerate(self.fields)} 132 | 133 | self.id2prod = {i: prod for i, prod in enumerate(self.productions)} 134 | self.id2type = {i: type for i, type in enumerate(self.types)} 135 | self.id2field = {i: field for i, field in enumerate(self.fields)} 136 | 137 | @staticmethod 138 | def from_text(text, parser): 139 | productions = set() 140 | 141 | if isinstance(text, list): 142 | text = '\n'.join(text) 143 | 144 | for s in text: 145 | try: 146 | p = parser.get_parse(s)['sentences'][0]['parse'] 147 | except Exception: 148 | continue 149 | try: 150 | parse_tree = english_ast_to_asdl_ast(p.parse_string) 151 | except Exception: 152 | continue 153 | 154 | productions.update(parse_tree.get_productions()) 155 | 156 | productions = sorted(productions, key=lambda x: x.__repr__) 157 | 158 | grammar = EnglishASDLGrammar(productions=productions) 159 | return grammar 160 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/asdl/lang/grammar.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict, defaultdict 2 | 3 | from multivac.src.gan.gen_pyt.astnode import ASTNode 4 | from multivac.src.gan.utilities.utils import typename 5 | 6 | 7 | class Grammar(object): 8 | 9 | def __init__(self, rules): 10 | """ 11 | instantiate a grammar with a set of production rules of type Rule 12 | """ 13 | self.rules = rules 14 | self.rule_index = defaultdict(list) 15 | self.rule_to_id = OrderedDict() 16 | 17 | node_types = set() 18 | lhs_nodes = set() 
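        # lhs_nodes / rhs_nodes are used below to derive the grammar's root (the one
        # type that never appears on a right-hand side) and its terminal nodes (types
        # that never appear on a left-hand side).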
19 | rhs_nodes = set() 20 | 21 | for rule in self.rules: 22 | self.rule_index[rule.parent].append(rule) 23 | 24 | # we also store all unique node types 25 | for node in rule.nodes: 26 | node_types.add(typename(node.type)) 27 | 28 | lhs_nodes.add(rule.parent) 29 | 30 | for child in rule.children: 31 | rhs_nodes.add(child.as_type_node) 32 | 33 | root_node = lhs_nodes - rhs_nodes 34 | 35 | try: 36 | assert len(root_node) == 1 37 | except AssertionError: 38 | print(root_node) 39 | raise AssertionError 40 | 41 | self.root_node = next(iter(root_node)) 42 | 43 | self.terminal_nodes = rhs_nodes - lhs_nodes 44 | self.terminal_types = set([n.type for n in self.terminal_nodes]) 45 | 46 | self.node_type_to_id = OrderedDict() 47 | for i, type in enumerate(node_types, start=0): 48 | self.node_type_to_id[type] = i 49 | 50 | for gid, rule in enumerate(rules, start=0): 51 | self.rule_to_id[rule] = gid 52 | 53 | self.id_to_rule = OrderedDict((v, k) for (k, v) in list(self.rule_to_id.items())) 54 | 55 | def __iter__(self): 56 | return self.rules.__iter__() 57 | 58 | def __len__(self): 59 | return len(self.rules) 60 | 61 | def __getitem__(self, lhs): 62 | key_node = ASTNode(lhs.type, None) # Rules are indexed by types only 63 | if key_node in self.rule_index: 64 | return self.rule_index[key_node] 65 | else: 66 | KeyError('key=%s' % key_node) 67 | 68 | def get_node_type_id(self, node): 69 | if isinstance(node, ASTNode): 70 | type_repr = typename(node.type) 71 | return self.node_type_to_id[type_repr] 72 | else: 73 | # assert isinstance(node, str) 74 | # it is a type 75 | type_repr = typename(node) 76 | return self.node_type_to_id[type_repr] 77 | 78 | def is_terminal(self, node): 79 | return node.type in self.terminal_types 80 | 81 | def is_value_node(self, node): 82 | return node.type in self.terminal_types 83 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/asdl/transition_system.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | 4 | class Action(object): 5 | pass 6 | 7 | 8 | class ApplyRuleAction(Action): 9 | def __init__(self, production): 10 | self.production = production 11 | 12 | def __hash__(self): 13 | return hash(self.production) 14 | 15 | def __eq__(self, other): 16 | return isinstance(other, ApplyRuleAction) and self.production == other.production 17 | 18 | def __ne__(self, other): 19 | return not self.__eq__(other) 20 | 21 | def __repr__(self): 22 | return 'ApplyRule[%s]' % self.production.__repr__() 23 | 24 | 25 | class GenTokenAction(Action): 26 | def __init__(self, token): 27 | self.token = token 28 | 29 | def is_stop_signal(self): 30 | return self.token == '' 31 | 32 | def __repr__(self): 33 | return 'GenToken[%s]' % self.token 34 | 35 | 36 | class GenEngTokenAction(GenTokenAction): 37 | def __init__(self, token): 38 | self.token = token 39 | 40 | def is_stop_signal(self): 41 | return self.token == '' 42 | 43 | 44 | class ReduceAction(Action): 45 | def __repr__(self): 46 | return 'Reduce' 47 | 48 | 49 | class TransitionSystem(object): 50 | def __init__(self, grammar): 51 | self.grammar = grammar 52 | 53 | def get_actions(self, asdl_ast): 54 | """ 55 | generate action sequence given the ASDL Syntax Tree 56 | """ 57 | 58 | actions = [] 59 | 60 | parent_action = ApplyRuleAction(asdl_ast.production) 61 | actions.append(parent_action) 62 | 63 | for field in asdl_ast.fields: 64 | # is a composite field 65 | if self.grammar.is_composite_type(field.type): 66 | if field.cardinality == 'single': 
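                    # a single-cardinality composite field holds exactly one child AST
                    # node, so its action sequence comes from direct recursion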
67 | field_actions = self.get_actions(field.value) 68 | else: 69 | field_actions = [] 70 | 71 | if field.value is not None: 72 | if field.cardinality == 'multiple': 73 | for val in field.value: 74 | cur_child_actions = self.get_actions(val) 75 | field_actions.extend(cur_child_actions) 76 | elif field.cardinality == 'optional': 77 | field_actions = self.get_actions(field.value) 78 | 79 | # if an optional field is filled, then do not need Reduce action 80 | if field.cardinality == 'multiple' or field.cardinality == 'optional' and not field_actions: 81 | field_actions.append(ReduceAction()) 82 | else: # is a primitive field 83 | field_actions = self.get_primitive_field_actions(field) 84 | 85 | # if an optional field is filled, then do not need Reduce action 86 | if field.cardinality == 'multiple' or field.cardinality == 'optional' and not field_actions: 87 | # reduce action 88 | field_actions.append(ReduceAction()) 89 | 90 | actions.extend(field_actions) 91 | 92 | return actions 93 | 94 | def tokenize_code(self, code, mode): 95 | raise NotImplementedError 96 | 97 | def compare_ast(self, hyp_ast, ref_ast): 98 | raise NotImplementedError 99 | 100 | def ast_to_surface_code(self, asdl_ast): 101 | raise NotImplementedError 102 | 103 | def surface_code_to_ast(self, code): 104 | raise NotImplementedError 105 | 106 | def get_primitive_field_actions(self, realized_field): 107 | raise NotImplementedError 108 | 109 | def get_valid_continuation_types(self, hyp): 110 | if hyp.tree: 111 | if self.grammar.is_composite_type(hyp.frontier_field.type): 112 | if hyp.frontier_field.cardinality == 'single': 113 | return ApplyRuleAction, 114 | else: # optional, multiple 115 | return ApplyRuleAction, ReduceAction 116 | else: 117 | if hyp.frontier_field.cardinality == 'single': 118 | return GenTokenAction, 119 | elif hyp.frontier_field.cardinality == 'optional': 120 | if hyp._value_buffer: 121 | return GenTokenAction, 122 | else: 123 | return GenTokenAction, ReduceAction 124 | else: 125 | return GenTokenAction, ReduceAction 126 | else: 127 | return ApplyRuleAction, 128 | 129 | def get_valid_continuating_productions(self, hyp): 130 | if hyp.tree: 131 | if self.grammar.is_composite_type(hyp.frontier_field.type): 132 | return self.grammar[hyp.frontier_field.type] 133 | else: 134 | raise ValueError 135 | else: 136 | return self.grammar[self.grammar.root_type] 137 | 138 | @staticmethod 139 | def get_class_by_lang(lang): 140 | if lang == 'python': 141 | from .lang.py.py_transition_system import PythonTransitionSystem 142 | return PythonTransitionSystem 143 | elif lang == 'english': 144 | from .lang.eng.eng_transition_system import EnglishTransitionSystem 145 | return EnglishTransitionSystem 146 | elif lang == 'python3': 147 | from .lang.py3.py3_transition_system import Python3TransitionSystem 148 | return Python3TransitionSystem 149 | elif lang == 'lambda_dcs': 150 | from .lang.lambda_dcs.lambda_dcs_transition_system import LambdaCalculusTransitionSystem 151 | return LambdaCalculusTransitionSystem 152 | elif lang == 'prolog': 153 | from .lang.prolog.prolog_transition_system import PrologTransitionSystem 154 | return PrologTransitionSystem 155 | elif lang == 'wikisql': 156 | from .lang.sql.sql_transition_system import SqlTransitionSystem 157 | return SqlTransitionSystem 158 | 159 | raise ValueError('unknown language %s' % lang) 160 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/components/__init__.py: 
-------------------------------------------------------------------------------- 1 | 2 | import multivac.src.gan.utilities.vocab as vocab 3 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/components/action_info.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from multivac.src.gan.gen_pyt.asdl.hypothesis import Hypothesis 3 | from multivac.src.gan.gen_pyt.asdl.transition_system import GenTokenAction 4 | 5 | 6 | class ActionInfo(object): 7 | """sufficient statistics for making a prediction of an action at a time step""" 8 | 9 | def __init__(self, action=None): 10 | self.t = 0 11 | self.parent_t = -1 12 | self.action = action 13 | self.frontier_prod = None 14 | self.frontier_field = None 15 | 16 | # for GenToken actions only 17 | self.copy_from_src = False 18 | self.src_token_position = -1 19 | 20 | def __repr__(self, verbose=False): 21 | repr_str = '%s (t=%d, p_t=%d, frontier_field=%s)' % (repr(self.action), 22 | self.t, 23 | self.parent_t, 24 | self.frontier_field.__repr__(True) 25 | if self.frontier_field else 'None') 26 | 27 | if verbose: 28 | verbose_repr = 'action_prob=%.4f, ' % self.action_prob 29 | if isinstance(self.action, GenTokenAction): 30 | verbose_repr += 'in_vocab=%s, ' \ 31 | 'gen_copy_switch=%s, ' \ 32 | 'p(gen)=%s, p(copy)=%s, ' \ 33 | 'has_copy=%s, copy_pos=%s' % (self.in_vocab, 34 | self.gen_copy_switch, 35 | self.gen_token_prob, self.copy_token_prob, 36 | self.copy_from_src, self.src_token_position) 37 | 38 | repr_str += '\n' + verbose_repr 39 | 40 | return repr_str 41 | 42 | 43 | def get_action_infos(src_query, tgt_actions, force_copy=False, verbose=False): 44 | action_infos = [] 45 | hyp = Hypothesis() 46 | 47 | for t, action in enumerate(tgt_actions): 48 | action_info = ActionInfo(action) 49 | action_info.t = t 50 | 51 | if verbose: 52 | print(action) 53 | 54 | if hyp.frontier_node: 55 | action_info.parent_t = hyp.frontier_node.created_time 56 | action_info.frontier_prod = hyp.frontier_node.production 57 | action_info.frontier_field = hyp.frontier_field.field 58 | 59 | if verbose: 60 | print("Frontier node: {} :: {}".format(action_info.frontier_prod, action_info.frontier_field)) 61 | 62 | if isinstance(action, GenTokenAction): 63 | if verbose: 64 | print("GenToken: {}".format(str(action.token))) 65 | 66 | try: 67 | tok_src_idx = src_query.index(str(action.token)) 68 | action_info.copy_from_src = True 69 | action_info.src_token_position = tok_src_idx 70 | except ValueError: 71 | if force_copy: 72 | raise ValueError('cannot copy primitive token %s from source' % action.token) 73 | 74 | hyp.apply_action(action) 75 | action_infos.append(action_info) 76 | 77 | return action_infos 78 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/components/decode_hypothesis.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from multivac.src.gan.gen_pyt.asdl.hypothesis import Hypothesis 4 | 5 | 6 | class DecodeHypothesis(Hypothesis): 7 | 8 | def __init__(self): 9 | super(DecodeHypothesis, self).__init__() 10 | 11 | self.action_infos = [] 12 | self.code = None 13 | 14 | def clone_and_apply_action_info(self, action_info): 15 | action = action_info.action 16 | 17 | new_hyp = self.clone_and_apply_action(action) 18 | new_hyp.action_infos.append(action_info) 19 | 20 | return new_hyp 21 | 22 | def copy(self): 23 | new_hyp = DecodeHypothesis() 24 | if self.tree: 25 | 
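        # mirrors Hypothesis.copy(), additionally carrying over action_infos and code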
new_hyp.tree = self.tree.copy() 26 | 27 | new_hyp.actions = list(self.actions) 28 | new_hyp.action_infos = list(self.action_infos) 29 | new_hyp.score = self.score 30 | new_hyp._value_buffer = list(self._value_buffer) 31 | new_hyp.t = self.t 32 | new_hyp.code = self.code 33 | 34 | new_hyp.update_frontier_info() 35 | 36 | return new_hyp 37 | 38 | def apply_action_info(self, action_info): 39 | self.apply_action(action_info.action) 40 | self.action_infos.append(action_info) 41 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/components/vocab.py: -------------------------------------------------------------------------------- 1 | 2 | from collections import Counter 3 | from itertools import chain 4 | 5 | 6 | class Vocab(object): 7 | 8 | def __init__(self, filename=None, data=None, lower=False): 9 | self.idxToLabel = {} 10 | self.labelToIdx = {} 11 | self.lower = lower 12 | 13 | # Special entries will not be pruned. 14 | self.special = [] 15 | 16 | if data is not None: 17 | self.addSpecials(data) 18 | if filename is not None: 19 | self.loadFile(filename) 20 | 21 | self.add('') 22 | self.add('') 23 | self.add('') 24 | 25 | def __getitem__(self, item): 26 | return self.labelToIdx.get(item, self.unk) 27 | 28 | def __contains__(self, item): 29 | return item in self.labelToIdx 30 | 31 | def __setitem__(self, key, value): 32 | self.labelToIdx[key] = value 33 | 34 | def __len__(self): 35 | return len(self.labelToIdx) 36 | 37 | def __iter__(self): 38 | return iter(list(self.labelToIdx.keys())) 39 | 40 | def __eq__(self, other): 41 | return all([self.idxToLabel == other.idxToLabel, 42 | self.labelToIdx == other.labelToIdx, 43 | self.lower == other.lower, 44 | self.special == other.special]) 45 | 46 | @property 47 | def pad(self): 48 | return self.labelToIdx[''] 49 | 50 | @property 51 | def unk(self): 52 | return self.labelToIdx[''] 53 | 54 | @property 55 | def eos(self): 56 | return self.labelToIdx[''] 57 | 58 | def is_unk(self, word): 59 | return word not in self 60 | 61 | def size(self): 62 | return len(self.idxToLabel) 63 | 64 | # Load entries from a file. 65 | def loadFile(self, filename): 66 | idx = 0 67 | for line in open(filename, 'r', encoding='utf8', errors='ignore'): 68 | token = line.rstrip('\n') 69 | self.add(token) 70 | idx += 1 71 | 72 | def getIndex(self, key, default=None): 73 | key = key.lower() if self.lower else key 74 | 75 | return self.labelToIdx.get(key, default) 76 | 77 | def getLabel(self, idx, default=None): 78 | return self.idxToLabel.get(idx, default) 79 | 80 | def add_from_data(self, label, idx=None): 81 | if idx: 82 | self.idxToLabel[idx] = label 83 | self.labelToIdx[label] = idx 84 | else: 85 | idx = self.add(label) 86 | 87 | # Mark this `label` and `idx` as special 88 | def addSpecial(self, label): 89 | idx = self.add(label) 90 | self.special += [idx] 91 | 92 | # Mark all labels in `labels` as specials 93 | def addSpecials(self, labels): 94 | for label in labels: 95 | if isinstance(label, tuple): 96 | self.add_from_data(*label) 97 | else: 98 | self.addSpecial(label) 99 | 100 | # Add `label` in the dictionary. Use `idx` as its index if given. 101 | def add(self, label): 102 | label = label.lower() if self.lower else label 103 | 104 | if label in self.labelToIdx: 105 | idx = self.labelToIdx[label] 106 | else: 107 | idx = len(self.idxToLabel) 108 | self.idxToLabel[idx] = label 109 | self.labelToIdx[label] = idx 110 | return idx 111 | 112 | # Convert `labels` to indices. Use `unkWord` if not found. 
113 | # Optionally insert `bosWord` at the beginning and `eosWord` at the . 114 | def convertToIdx(self, labels, unkWord=None, bosWord=None, eosWord=None): 115 | if unkWord is None: 116 | unk = self.unk 117 | else: 118 | unk = self.getIndex(unkWord) 119 | 120 | vec = [] 121 | 122 | if bosWord is not None: 123 | vec += [self.getIndex(bosWord)] 124 | 125 | vec += [self.getIndex(label, default=unk) for label in labels] 126 | 127 | if eosWord is not None: 128 | vec += [self.getIndex(eosWord)] 129 | 130 | return vec 131 | 132 | # Convert `idx` to labels. If index `stop` is reached, convert it and return. 133 | def convertToLabels(self, idx, stop=None): 134 | labels = [] 135 | 136 | for i in idx: 137 | labels += [self.getLabel(i)] 138 | if i == stop: 139 | break 140 | 141 | return labels 142 | 143 | @staticmethod 144 | def from_corpus(corpus, size=None, freq_cutoff=0): 145 | vocab = Vocab() 146 | 147 | word_freq = Counter(chain(*corpus)) 148 | top_k_words = sorted(word_freq.keys(), reverse=True, key=word_freq.get) 149 | 150 | if size is not None: 151 | top_k_words = top_k_words[:size] 152 | 153 | words_not_included = [] 154 | 155 | for word in top_k_words: 156 | if word_freq[word] >= freq_cutoff: 157 | vocab.add(word) 158 | else: 159 | words_not_included.append(word) 160 | 161 | if len(vocab) == size: 162 | break 163 | 164 | return vocab 165 | 166 | @staticmethod 167 | def from_dict(vocab): 168 | vocab = Vocab() 169 | 170 | for key, value in vocab.items(): 171 | setattr(vocab, key, value) 172 | 173 | return vocab 174 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/src/gan/gen_pyt/datasets/__init__.py -------------------------------------------------------------------------------- /src/gan/gen_pyt/datasets/english/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/src/gan/gen_pyt/datasets/english/__init__.py -------------------------------------------------------------------------------- /src/gan/gen_pyt/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/src/gan/gen_pyt/model/__init__.py -------------------------------------------------------------------------------- /src/gan/gen_pyt/model/attention_util.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from multivac.src.gan.gen_pyt.asdl.transition_system import GenTokenAction 3 | 4 | LOGICAL_FORM_LEXICON = { 5 | 'city:t': ['citi'], 6 | 'density:i': ['densiti', 'averag', 'popul'], 7 | } 8 | 9 | 10 | class AttentionUtil(object): 11 | 12 | @staticmethod 13 | def get_candidate_tokens_to_attend(src_tokens, action): 14 | tokens_to_attend = dict() 15 | if isinstance(action, GenTokenAction): 16 | tgt_token = action.token 17 | for src_idx, src_token in enumerate(src_tokens): 18 | # match lemma 19 | if len(src_token) >= 3 and tgt_token.startswith(src_token) or \ 20 | src_token in LOGICAL_FORM_LEXICON.get(tgt_token, []): 21 | tokens_to_attend[src_idx] = src_token 22 | 23 | return tokens_to_attend 24 | 
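# Minimal usage sketch for the Vocab class defined above, mirroring how
# MULTIVACDataset.read_sentence converts a tokenized line to indices.  The file
# name 'vocab.txt' and the import path are placeholders assumed for illustration;
# the special pad/unk/eos symbols are whatever the calling code registers.

from multivac.src.gan.gen_pyt.components.vocab import Vocab

vocab = Vocab(filename='vocab.txt', lower=True)             # one token per line
ids = vocab.convertToIdx('why do birds suddenly appear ?'.split())
tokens = vocab.convertToLabels(ids)                         # back to strings
print(vocab.size(), ids, tokens)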
-------------------------------------------------------------------------------- /src/gan/gen_pyt/model/lstm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | import torch.nn.utils 6 | from torch.nn import Parameter, init 7 | from torch.nn.modules.rnn import RNNCellBase 8 | 9 | 10 | class ParentFeedingLSTMCell(RNNCellBase): 11 | 12 | def __init__(self, input_size, hidden_size): 13 | super(ParentFeedingLSTMCell, self).__init__() 14 | 15 | self.input_size = input_size 16 | self.hidden_size = hidden_size 17 | 18 | self.W_i = Parameter(torch.Tensor(hidden_size, input_size)) 19 | self.U_i = Parameter(torch.Tensor(hidden_size, hidden_size)) 20 | self.U_i_p = Parameter(torch.Tensor(hidden_size, hidden_size)) 21 | self.b_i = Parameter(torch.Tensor(hidden_size)) 22 | 23 | self.W_f = Parameter(torch.Tensor(hidden_size, input_size)) 24 | self.U_f = Parameter(torch.Tensor(hidden_size, hidden_size)) 25 | self.U_f_p = Parameter(torch.Tensor(hidden_size, hidden_size)) 26 | self.b_f = Parameter(torch.Tensor(hidden_size)) 27 | self.b_f_p = Parameter(torch.Tensor(hidden_size)) 28 | 29 | self.W_c = Parameter(torch.Tensor(hidden_size, input_size)) 30 | self.U_c = Parameter(torch.Tensor(hidden_size, hidden_size)) 31 | self.U_c_p = Parameter(torch.Tensor(hidden_size, hidden_size)) 32 | self.b_c = Parameter(torch.Tensor(hidden_size)) 33 | 34 | self.W_o = Parameter(torch.Tensor(hidden_size, input_size)) 35 | self.U_o = Parameter(torch.Tensor(hidden_size, hidden_size)) 36 | self.U_o_p = Parameter(torch.Tensor(hidden_size, hidden_size)) 37 | self.b_o = Parameter(torch.Tensor(hidden_size)) 38 | 39 | self.reset_parameters() 40 | 41 | def reset_parameters(self): 42 | init.orthogonal(self.W_i) 43 | init.orthogonal(self.U_i) 44 | init.orthogonal(self.U_i_p) 45 | 46 | init.orthogonal(self.W_f) 47 | init.orthogonal(self.U_f) 48 | init.orthogonal(self.U_f_p) 49 | 50 | init.orthogonal(self.W_c) 51 | init.orthogonal(self.U_c) 52 | init.orthogonal(self.U_c_p) 53 | 54 | init.orthogonal(self.W_o) 55 | init.orthogonal(self.U_o) 56 | init.orthogonal(self.U_o_p) 57 | 58 | self.b_i.data.fill_(0.) 59 | self.b_c.data.fill_(0.) 60 | self.b_o.data.fill_(0.) 61 | # forget bias set to 1. 62 | self.b_f.data.fill_(1.) 63 | self.b_f_p.data.fill_(1.) 
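        # The cell keeps two forget gates, one over its own previous memory (c_tm1)
        # and one over the parent's memory (c_tm1_p); both biases start at 1 so that
        # neither memory path is forgotten early in training.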
64 | 65 | def forward(self, input, hidden_states): 66 | h_tm1, c_tm1, h_tm1_p, c_tm1_p = hidden_states 67 | i_t = torch.sigmoid(F.linear(input, self.W_i) + F.linear(h_tm1, self.U_i) + 68 | F.linear(h_tm1_p, self.U_i_p) + self.b_i) 69 | 70 | xf_t = F.linear(input, self.W_f) 71 | f_t = torch.sigmoid(xf_t + F.linear(h_tm1, self.U_f) + self.b_f) 72 | f_t_p = torch.sigmoid(xf_t + F.linear(h_tm1_p, self.U_f_p) + self.b_f_p) 73 | 74 | xc_t = torch.linear(input, self.W_c) + F.linear(h_tm1, self.U_c) + F.linear(h_tm1_p, self.U_c_p) + self.b_c 75 | c_t = f_t * c_tm1 + f_t_p * c_tm1_p + i_t * torch.tanh(xc_t) 76 | 77 | o_t = torch.sigmoid(F.linear(input, self.W_o) + F.linear(h_tm1, self.U_o) + 78 | F.linear(h_tm1_p, self.U_o_p) + self.b_o) 79 | h_t = o_t * torch.tanh(c_t) 80 | 81 | return h_t, c_t 82 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/model/nn_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.nn.init as init 8 | 9 | 10 | def dot_prod_attention(h_t, src_encoding, src_encoding_att_linear, mask=None): 11 | """ 12 | :param h_t: (batch_size, hidden_size) 13 | :param src_encoding: (batch_size, src_sent_len, hidden_size * 2) 14 | :param src_encoding_att_linear: (batch_size, src_sent_len, hidden_size) 15 | :param mask: (batch_size, src_sent_len) 16 | """ 17 | # (batch_size, src_sent_len) 18 | att_weight = torch.bmm(src_encoding_att_linear, h_t.unsqueeze(2)).squeeze(2) 19 | if mask is not None: 20 | att_weight.data.masked_fill_(mask.bool(), -float('inf')) 21 | att_weight = F.softmax(att_weight, dim=-1) 22 | 23 | att_view = (att_weight.size(0), 1, att_weight.size(1)) 24 | # (batch_size, hidden_size) 25 | ctx_vec = torch.bmm(att_weight.view(*att_view), src_encoding).squeeze(1) 26 | 27 | return ctx_vec, att_weight 28 | 29 | 30 | def length_array_to_mask_tensor(length_array, cuda=False, valid_entry_has_mask_one=False): 31 | max_len = max(length_array) 32 | batch_size = len(length_array) 33 | 34 | mask = np.zeros((batch_size, max_len), dtype=np.uint8) 35 | for i, seq_len in enumerate(length_array): 36 | if valid_entry_has_mask_one: 37 | mask[i][:seq_len] = 1 38 | else: 39 | mask[i][seq_len:] = 1 40 | 41 | mask = torch.ByteTensor(mask) 42 | return mask.cuda() if cuda else mask 43 | 44 | 45 | def input_transpose(sents, pad_token): 46 | """ 47 | transform the input List[sequence] of size (batch_size, max_sent_len) 48 | into a list of size (max_sent_len, batch_size), with proper padding 49 | """ 50 | max_len = max(len(s) for s in sents) 51 | batch_size = len(sents) 52 | 53 | sents_t = [] 54 | for i in range(max_len): 55 | sents_t.append([sents[k][i] if len(sents[k]) > i else pad_token for k in range(batch_size)]) 56 | 57 | return sents_t 58 | 59 | 60 | def word2id(sents, vocab): 61 | if type(sents[0]) == list: 62 | return [[vocab[w] for w in s] for s in sents] 63 | else: 64 | return [vocab[w] for w in sents] 65 | 66 | 67 | def id2word(sents, vocab): 68 | if type(sents[0]) == list: 69 | return [[vocab.idxToLabel[w] for w in s] for s in sents] 70 | else: 71 | return [vocab.idxToLabel[w] for w in sents] 72 | 73 | 74 | def to_input_variable(sequences, vocab, cuda=False, training=True, append_boundary_sym=False): 75 | """ 76 | given a list of sequences, 77 | return a tensor of shape (max_sent_len, batch_size) 78 | """ 79 | if append_boundary_sym: 80 | sequences = [[''] + seq + 
[''] for seq in sequences] 81 | 82 | word_ids = word2id(sequences, vocab) 83 | sents_t = input_transpose(word_ids, vocab['']) 84 | 85 | sents_var = torch.LongTensor(sents_t) 86 | if cuda: 87 | sents_var = sents_var.cuda() 88 | 89 | return sents_var 90 | 91 | 92 | def uniform_init(lower, upper, params): 93 | for p in params: 94 | p.data.uniform_(lower, upper) 95 | 96 | 97 | def kaiming_init(params): 98 | for p in params: 99 | if len(p.data.size()) > 1: 100 | init.kaiming_normal_(p.data) 101 | 102 | 103 | def glorot_init(params): 104 | for p in params: 105 | if len(p.data.size()) > 1: 106 | init.xavier_normal_(p.data) 107 | 108 | 109 | def identity(x): 110 | return x 111 | 112 | 113 | class LabelSmoothing(nn.Module): 114 | """Implement label smoothing. 115 | 116 | Reference: the annotated transformer 117 | """ 118 | 119 | def __init__(self, smoothing, tgt_vocab_size, ignore_indices=None): 120 | if ignore_indices is None: 121 | ignore_indices = [] 122 | 123 | super(LabelSmoothing, self).__init__() 124 | 125 | self.criterion = nn.KLDivLoss(reduction='none') 126 | smoothing_value = smoothing / float(tgt_vocab_size - 1 - len(ignore_indices)) 127 | one_hot = torch.zeros((tgt_vocab_size,)).fill_(smoothing_value) 128 | for idx in ignore_indices: 129 | one_hot[idx] = 0. 130 | 131 | self.confidence = 1.0 - smoothing 132 | self.register_buffer('one_hot', one_hot.unsqueeze(0)) 133 | 134 | def forward(self, model_prob, target): 135 | # (batch_size, *, tgt_vocab_size) 136 | dim = list(model_prob.size())[:-1] + [1] 137 | true_dist = self.one_hot.repeat(*dim) 138 | true_dist.scatter_(-1, target.unsqueeze(-1), self.confidence) 139 | 140 | return self.criterion(model_prob, true_dist).sum(dim=-1) 141 | 142 | 143 | class FeedForward(nn.Module): 144 | """Feed forward neural network adapted from AllenNLP""" 145 | 146 | def __init__(self, input_dim, num_layers, hidden_dims, activations, dropout): 147 | super(FeedForward, self).__init__() 148 | 149 | if not isinstance(hidden_dims, list): 150 | hidden_dims = [hidden_dims] * num_layers # type: ignore 151 | if not isinstance(activations, list): 152 | activations = [activations] * num_layers # type: ignore 153 | if not isinstance(dropout, list): 154 | dropout = [dropout] * num_layers # type: ignore 155 | 156 | self.activations = activations 157 | input_dims = [input_dim] + hidden_dims[:-1] 158 | linear_layers = [] 159 | for layer_input_dim, layer_output_dim in zip(input_dims, hidden_dims): 160 | linear_layers.append(nn.Linear(layer_input_dim, layer_output_dim)) 161 | 162 | self.linear_layers = nn.ModuleList(linear_layers) 163 | dropout_layers = [nn.Dropout(p=value) for value in dropout] 164 | self.dropout = nn.ModuleList(dropout_layers) 165 | self.output_dim = hidden_dims[-1] 166 | self.input_dim = input_dim 167 | 168 | def forward(self, x): 169 | output = x 170 | for layer, activation, dropout in zip(self.linear_layers, self.activations, self.dropout): 171 | output = dropout(activation(layer(output))) 172 | return output 173 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/model/pointer_net.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.nn.utils 7 | 8 | 9 | class PointerNet(nn.Module): 10 | def __init__(self, query_vec_size, src_encoding_size, attention_type='affine'): 11 | super(PointerNet, self).__init__() 12 | 13 | assert attention_type in ('affine', 
'dot_prod') 14 | if attention_type == 'affine': 15 | self.src_encoding_linear = nn.Linear(src_encoding_size, query_vec_size, bias=False) 16 | 17 | self.attention_type = attention_type 18 | 19 | def forward(self, src_encodings, src_token_mask, query_vec): 20 | """ 21 | :param src_encodings: Variable(batch_size, src_sent_len, hidden_size * 2) 22 | :param src_token_mask: Variable(batch_size, src_sent_len) 23 | :param query_vec: Variable(tgt_action_num, batch_size, query_vec_size) 24 | :return: Variable(tgt_action_num, batch_size, src_sent_len) 25 | """ 26 | 27 | # (batch_size, 1, src_sent_len, query_vec_size) 28 | if self.attention_type == 'affine': 29 | src_encodings = self.src_encoding_linear(src_encodings) 30 | src_encodings = src_encodings.unsqueeze(1) 31 | 32 | # (batch_size, tgt_action_num, query_vec_size, 1) 33 | q = query_vec.permute(1, 0, 2).unsqueeze(3) 34 | 35 | # (batch_size, tgt_action_num, src_sent_len) 36 | weights = torch.matmul(src_encodings, q).squeeze(3) 37 | 38 | # (tgt_action_num, batch_size, src_sent_len) 39 | weights = weights.permute(1, 0, 2) 40 | 41 | if src_token_mask is not None: 42 | # (tgt_action_num, batch_size, src_sent_len) 43 | src_token_mask = src_token_mask.unsqueeze(0).expand_as(weights) 44 | weights.data.masked_fill_(src_token_mask.bool(), -float('inf')) 45 | 46 | ptr_weights = F.softmax(weights, dim=-1) 47 | 48 | return ptr_weights 49 | -------------------------------------------------------------------------------- /src/gan/gen_pyt/query_treebank.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import pickle 4 | 5 | from multivac.src.gan.gen_pyt.asdl.lang.eng.eng_asdl_helper import \ 6 | english_ast_to_asdl_ast 7 | from multivac.src.gan.gen_pyt.asdl.lang.eng.grammar import (EnglishASDLGrammar, 8 | EnglishGrammar) 9 | from multivac.src.gan.gen_pyt.astnode import ASTNode 10 | from multivac.src.rdf_graph.rdf_parse import (StanfordParser, check_parse, 11 | clean_queries, stanford_parse) 12 | 13 | 14 | def find_match_paren(s): 15 | count = 0 16 | 17 | for i, c in enumerate(s): 18 | if c == "(": 19 | count += 1 20 | elif c == ")": 21 | count -= 1 22 | 23 | if count == 0: 24 | return i 25 | 26 | 27 | def get_eng_tree(text, depth=0, debug=False): 28 | ''' Takes a constituency parse string of an English sentence and creates 29 | an ASTNode tree from it. 30 | 31 | Example input: 32 | '(ROOT (SBARQ (WHADVP (WRB Why)) (SQ (VBP do) (NP (NNS birds)) (ADVP 33 | (RB suddenly)) (VP (VB appear) (SBAR (WHADVP (WRB whenever)) (S (NP 34 | (PRP you)) (VP (VBP are) (ADJP (JJ near))))))) (. 
?)))' 35 | ''' 36 | 37 | if debug: 38 | print(("\t" * depth + "String: '{}'".format(text))) 39 | 40 | try: 41 | tree_str = text[text.index("(") + 1:text.rfind(")")] 42 | except ValueError: 43 | print(("Malformatted parse string: '{}'".format(text))) 44 | raise ValueError 45 | 46 | next_idx = tree_str.index(" ") 47 | 48 | tree = ASTNode(tree_str[:next_idx]) 49 | if debug: 50 | print(("\t" * depth + "Type: '{}'".format(tree.type))) 51 | 52 | if "(" in tree_str: 53 | while "(" in tree_str: 54 | tree_str = tree_str[tree_str.index("("):] 55 | next_idx = find_match_paren(tree_str) + 1 56 | tree.add_child(get_eng_tree(tree_str[:next_idx], depth+1, debug)) 57 | tree_str = tree_str[next_idx + 1:] 58 | else: 59 | tree.value = tree_str[next_idx + 1:] 60 | if debug: 61 | print(("\t" * depth + "Value: " + tree.value)) 62 | 63 | return tree 64 | 65 | 66 | def get_grammar(parse_trees, verbose=False): 67 | rules = set() 68 | 69 | for parse_tree in parse_trees: 70 | parse_tree_rules, rule_parents = parse_tree.get_productions() 71 | for rule in parse_tree_rules: 72 | rules.add(rule) 73 | 74 | rules = list(sorted(rules, key=lambda x: x.__repr__())) 75 | grammar = EnglishGrammar(rules) 76 | 77 | if verbose: 78 | print(('num. rules: %d', len(rules))) 79 | 80 | return grammar 81 | 82 | 83 | def parse_raw(parser, query): 84 | try: 85 | query = stanford_parse(parser, query) 86 | except Exception: 87 | print('Could not parse query: {}'.format(query)) 88 | return None 89 | 90 | try: 91 | result = get_eng_tree(query.parse_string) 92 | except Exception: 93 | print("Could not interpret query parse: {}".format(query.parse_string)) 94 | return None 95 | 96 | return result 97 | 98 | 99 | def extract_grammar(source_file, output=None, clean=False, verbose=False, 100 | asdl=False): 101 | parse_trees = list() 102 | 103 | if asdl: 104 | parse_func = english_ast_to_asdl_ast 105 | else: 106 | parse_func = get_eng_tree 107 | 108 | parser = StanfordParser(annots="tokenize ssplit parse") 109 | 110 | with open(source_file, 'r') as f: 111 | queries = f.readlines() 112 | 113 | if clean: 114 | queries = clean_queries(queries, verbose) 115 | 116 | if verbose: 117 | print("Performing constituency parsing of queries") 118 | 119 | for i, q in enumerate(queries): 120 | if len(q) > 0: 121 | try: 122 | query = stanford_parse(parser, q) 123 | except Exception: 124 | print('Could not parse query {}: "{}"'.format(i, q)) 125 | continue 126 | 127 | if check_parse(query): 128 | try: 129 | parse_trees.append(parse_func(query.parse_string)) 130 | except Exception: 131 | print(("Could not interpret query parse {}: '{}'".format(i, query))) 132 | continue 133 | 134 | if i % 100 == 0: 135 | print("{} queries processed.".format(i)) 136 | 137 | if verbose: 138 | print(("{} queries successfully parsed.".format(len(parse_trees)))) 139 | print("Extracting grammar production rules.") 140 | 141 | if asdl: 142 | productions = set() 143 | 144 | for parse_tree in parse_trees: 145 | productions.update(parse_tree.get_productions()) 146 | 147 | grammar = EnglishASDLGrammar(productions=productions) 148 | else: 149 | rules = set() 150 | 151 | for parse_tree in parse_trees: 152 | parse_tree_rules, _ = parse_tree.get_productions() 153 | 154 | for rule in parse_tree_rules: 155 | rules.add(rule) 156 | 157 | rules = list(sorted(rules, key=lambda x: x.__repr__())) 158 | grammar = EnglishGrammar(rules) 159 | 160 | if verbose: 161 | print("Grammar induced successfully.") 162 | 163 | if output is not None: 164 | with open(output, 'wb') as f: 165 | pickle.dump(grammar, f) 166 | 
else: 167 | return grammar, parse_trees 168 | 169 | 170 | if __name__ == '__main__': 171 | parser = argparse.ArgumentParser( 172 | description='Compile grammar from query examples.') 173 | parser.add_argument('-q', '--queries', required=True, 174 | help='Path to queries.') 175 | parser.add_argument('-o', '--output', 176 | help='Filename for output.') 177 | parser.add_argument('-c', '--clean', action='store_true', default=False, 178 | help='Pre-clean queries before populating.') 179 | parser.add_argument('-v', '--verbose', action='store_true', default=False, 180 | help='Print verbose output on progress.') 181 | parser.add_argument('-a', '--asdl', action='store_true', default=False, 182 | help='Return grammar in ASDL mode.') 183 | 184 | args_dict = vars(parser.parse_args()) 185 | 186 | extract_grammar(args_dict['queries'], 187 | args_dict['output'], 188 | args_dict['clean'], 189 | args_dict['verbose'], 190 | args_dict['asdl']) 191 | -------------------------------------------------------------------------------- /src/gan/gen_test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from multivac.src.gan.gen_pyt.asdl.lang.eng.eng_asdl_helper import \ 4 | asdl_ast_to_english 5 | from multivac.src.gan.gen_pyt.model.parser import Parser 6 | 7 | 8 | def run(args): 9 | ''' 10 | Load GAN generator model 11 | Apply query items 12 | Return beam search results 13 | ''' 14 | 15 | if isinstance(args['model'], str): 16 | netG = Parser.load(args['model']) 17 | else: 18 | netG = args['model'] 19 | if isinstance(args['query'], str): 20 | query = args['query'].split() 21 | else: 22 | query = args['query'] 23 | 24 | results = netG.parse(query, beam_size=netG.args['beam_size']) 25 | texts = [asdl_ast_to_english(x.tree) for x in results] 26 | 27 | return texts 28 | 29 | 30 | if __name__ == '__main__': 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument('-m', '--model', 33 | help='Path to model checkpoint file.') 34 | parser.add_argument('-q', '--query', nargs='+', required=False, 35 | help='Query tokens for generating a question.') 36 | 37 | args = vars(parser.parse_args()) 38 | 39 | results = run(args) 40 | print(results) 41 | -------------------------------------------------------------------------------- /src/gan/utilities/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/src/gan/utilities/__init__.py -------------------------------------------------------------------------------- /src/gan/utilities/rollout.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | import os 4 | from tqdm import tqdm 5 | from spacy.tokenizer import Tokenizer 6 | from spacy.vocab import Vocab 7 | 8 | import torch 9 | import torch.nn as nn 10 | from torch.utils.data import Dataset, DataLoader 11 | 12 | from discriminator import MULTIVACDataset, Tree 13 | from gen_pyt.asdl.lang.eng.eng_asdl_helper import asdl_ast_to_english 14 | from gen_pyt.model.parser import Parser 15 | # from .tree_rollout import rollout_samples 16 | from multivac.src.gan.gen_pyt.components.decode_hypothesis import DecodeHypothesis 17 | 18 | from multivac.src.rdf_graph.rdf_parse import StanfordParser 19 | 20 | class RolloutDataset(Dataset): 21 | def __init__(self, data): 22 | super().__init__() 23 | self.data = data 24 | self.size = self.data.shape[0] 25 | def __len__(self): 26 | 
return self.size 27 | def __getitem__(self, index): 28 | return copy.deepcopy(self.data[index]) 29 | 30 | class Engine(object): 31 | def __init__(self, step=0): 32 | self.step = step 33 | 34 | class Rollout(object): 35 | def __init__(self, rollout_num, vocab): 36 | #self.new_net = copy.deepcopy(net) 37 | self.vocab = vocab 38 | self.tokenizer = Tokenizer(Vocab(strings=list(vocab.labelToIdx.keys()))) 39 | self.rollout_num = rollout_num 40 | self.parser = StanfordParser(annots='tokenize') 41 | 42 | def hyp_to_parse(self, hyp, vocab): 43 | if isinstance(hyp, str): 44 | text = hyp 45 | else: 46 | text = asdl_ast_to_english(hyp.tree) 47 | 48 | parse = self.parser.get_parse(text)['sentences'] 49 | 50 | if len(parse) > 0: 51 | tokens = [x['word'] for x in parse[0]['tokens']] 52 | deps = sorted(parse[0]['basicDependencies'], 53 | key=lambda x: x['dependent']) 54 | parents = [x['governor'] for x in deps] 55 | tree = MULTIVACDataset.read_tree(parents) 56 | inp = torch.tensor(vocab.convertToIdx(tokens, '<unk>'), 57 | dtype=torch.long, device='cpu') 58 | else: 59 | tree = Tree() 60 | inp = torch.tensor([]) 61 | 62 | return tree, inp 63 | 64 | def parse_tokens(self, tree): 65 | text = asdl_ast_to_english(tree) 66 | tokens = [x.text for x in self.tokenizer(text)] 67 | result = torch.tensor(self.vocab.convertToIdx(tokens, '<unk>'), 68 | dtype=torch.long, 69 | device='cpu') 70 | return result 71 | 72 | @staticmethod 73 | def parse_to_trees(parses, vocab): 74 | results = [''] * len(parses) 75 | 76 | for idx, parse in enumerate(parses): 77 | tokens = [x['word'] for x in parse['tokens']] 78 | deps = sorted(parse['basicDependencies'], 79 | key=lambda x: x['dependent']) 80 | parents = [x['governor'] for x in deps] 81 | tree = MULTIVACDataset.read_tree(parents) 82 | results[idx] = (tree, torch.tensor(vocab.convertToIdx(tokens, '<unk>'), 83 | dtype=torch.long, device='cpu')) 84 | 85 | return results 86 | 87 | @staticmethod 88 | def ffwd_hyp(hyp, j): 89 | new_hyp = DecodeHypothesis() 90 | 91 | for i in range(j): 92 | if i < len(hyp.action_infos): 93 | new_hyp.apply_action_info(hyp.action_infos[i]) 94 | 95 | return new_hyp 96 | 97 | def get_tree_reward(self, hyps, states, examples, 98 | netG, netD, vocab, verbose=False): 99 | batch_size = len(hyps) 100 | src_sents = [e.src_sent for e in examples] 101 | rewards = [] 102 | max_action_len = max([len(hyp.actions) for hyp in hyps]) 103 | 104 | netD.eval() 105 | 106 | for i in range(self.rollout_num): 107 | if verbose: print("Rollout step {}".format(i)) 108 | 109 | samples = [[0] * batch_size] * max_action_len 110 | inputs = [[0] * batch_size] * max_action_len 111 | # texts = [[0] * batch_size] * max_action_len 112 | 113 | for j in tqdm(range(1, max_action_len)): 114 | for n in range(batch_size): 115 | src = src_sents[n] 116 | hyp = Rollout.ffwd_hyp(hyps[n], j) 117 | state = states[n][:j] 118 | samples[j-1][n] = netG.sample(src, hyp, state) 119 | 120 | if verbose: print("Samples generated of shape " 121 | "({},{})".format(max_action_len, batch_size)) 122 | 123 | for x in tqdm(range(max_action_len), "Translating trees..."): 124 | for h, hyp in enumerate(samples[x]): 125 | inputs[x][h] = self.parse_tokens(hyp.tree) 126 | 127 | for j in range(max_action_len): 128 | samps = torch.full((len(inputs[j]), 150), vocab.pad) 129 | 130 | for idx, x in enumerate(inputs[j]): 131 | samps[idx, :len(x)] = x[:150] 132 | 133 | x = samps.long().to(netD.args['device']) 134 | out = netD(x).softmax(dim=-1).data[:,1].numpy() 135 | 136 | if i == 0: 137 | rewards.append(out) 138 | else: 139 | rewards[j] += out 140
| 141 | originals = [self.parse_tokens(hyp.tree) for hyp in hyps] 142 | 143 | for j in tqdm(range(batch_size), desc="Rating action step {}...".format(max_action_len)): 144 | samps = torch.full((len(originals), 150), vocab.pad) 145 | 146 | for idx, x in enumerate(originals): 147 | samps[idx, :len(x)] = x[:150] 148 | 149 | x = samps.long().to(netD.args['device']) 150 | out = netD(x).softmax(dim=-1).data[:,1].numpy() 151 | 152 | if i == 0: 153 | rewards.append(out) 154 | else: 155 | rewards[-1] += out 156 | 157 | rewards = np.array(rewards) / (1.0 * self.rollout_num) 158 | 159 | return rewards 160 | -------------------------------------------------------------------------------- /src/gan/utilities/shuffled_queries.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import os 4 | import pandas as pd 5 | from random import shuffle 6 | 7 | def run(args_dict): 8 | DIR = os.path.dirname(args_dict['file']) 9 | file = os.path.basename(args_dict['file']) 10 | 11 | with open(args_dict['file']) as f: 12 | clean_txt = f.readlines() 13 | 14 | df_clean = pd.DataFrame(clean_txt) 15 | df_clean.columns = ['query'] 16 | # clean queries will contain a label of 1 17 | df_clean['label'] = 1 18 | 19 | # Tokenizing and shuffling each query 20 | # These will be labeled as 0 21 | shuffles = [] 22 | for txt in clean_txt: 23 | txt = txt.split(" ") 24 | shuffle(txt) 25 | shuffles.append(" ".join(txt)) 26 | 27 | shuffled_df = pd.DataFrame() 28 | shuffled_df['query'] = shuffles 29 | shuffled_df['label'] = 0 30 | 31 | final_df = pd.concat([df_clean, shuffled_df]).reset_index() 32 | final_df.rename(columns={'index': 'id'}, inplace=True) 33 | final_df['query'] = final_df['query'].apply(lambda x: x.replace("\n", "")) 34 | 35 | np.savetxt(os.path.join(DIR,"extracted_questions_labels.txt"), 36 | final_df.values, newline='\n', fmt=["%s", "%s", "%s"], 37 | delimiter='\t', 38 | header='id query label') 39 | 40 | 41 | if __name__ == '__main__': 42 | parser = argparse.ArgumentParser(description='Create shuffled queries' 43 | 'from .txt file for GAN.') 44 | parser.add_argument('-f', '--file', required=True, 45 | help='Path to source query file.') 46 | 47 | args_dict = vars(parser.parse_args()) 48 | 49 | run(args_dict) 50 | 51 | -------------------------------------------------------------------------------- /src/gan/utilities/utils.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import pickle 4 | import math 5 | 6 | import torch 7 | 8 | from multivac.src.gan.utilities.vocab import Vocab 9 | 10 | 11 | # write unique words from a set of files to a new file 12 | def build_vocab(filenames, vocabfile, lowercase=True): 13 | vocab = set() 14 | 15 | for filename in filenames: 16 | with open(filename, 'r') as f: 17 | for line in f: 18 | if lowercase: 19 | line = line.lower() 20 | 21 | tokens = line.rstrip('\n').split(' ') 22 | vocab |= set(tokens) 23 | 24 | with open(vocabfile, 'w') as f: 25 | for token in sorted(vocab): 26 | f.write(token + '\n') 27 | 28 | class cached_property(object): 29 | """ A property that is only computed once per instance and then replaces 30 | itself with an ordinary attribute. Deleting the attribute resets the 31 | property. 
32 | 33 | Source: https://github.com/bottlepy/bottle/commit/fa7733e075da0d790d809aa3d2f53071897e6f76 34 | """ 35 | 36 | def __init__(self, func): 37 | self.__doc__ = getattr(func, '__doc__') 38 | self.func = func 39 | 40 | def __get__(self, obj, cls): 41 | if obj is None: 42 | return self 43 | value = obj.__dict__[self.func.__name__] = self.func(obj) 44 | return value 45 | 46 | def deserialize_from_file(path): 47 | with open(path, 'rb') as f: 48 | obj = pickle.load(f) 49 | 50 | return obj 51 | 52 | # loading GLOVE word vectors 53 | # if .pth file is found, will load that 54 | # else will load from .txt file & save 55 | def load_word_vectors(path, lowercase=True): 56 | if os.path.isfile(path + '.pth') and os.path.isfile(path + '.vocab'): 57 | print('==> File found, loading to memory') 58 | vectors = torch.load(path + '.pth') 59 | vocab = Vocab(filename=path + '.vocab', lower=lowercase) 60 | 61 | return vocab, vectors 62 | elif path.endswith('.pkl'): 63 | print('==> File found, loading to memory') 64 | 65 | with open(path, "rb") as f: 66 | glove = pickle.load(f) 67 | 68 | vectors = torch.from_numpy(glove['embeddings']).float() 69 | vocab = Vocab(data=glove['vocab'], lower=lowercase) 70 | 71 | return vocab, vectors 72 | 73 | # saved file not found, read from txt file 74 | # and create tensors for word vectors 75 | print('==> File not found, preparing, be patient') 76 | 77 | 78 | count = sum(1 for line in open(path + '.txt', 'r', encoding='utf8', errors='ignore')) 79 | 80 | with open(path + '.txt', 'r') as f: 81 | contents = f.readline().rstrip('\n').split(' ') 82 | dim = len(contents[1:]) 83 | 84 | words = [None] * (count) 85 | vectors = torch.zeros(count, dim, dtype=torch.float, device='cpu') 86 | 87 | with open(path + '.txt', 'r', encoding='utf8', errors='ignore') as f: 88 | idx = 0 89 | 90 | for line in f: 91 | contents = line.rstrip('\n').split(' ') 92 | words[idx] = contents[0] 93 | values = list(map(float, contents[1:])) 94 | vectors[idx] = torch.tensor(values, dtype=torch.float, device='cpu') 95 | idx += 1 96 | 97 | with open(path + '.vocab', 'w', encoding='utf8', errors='ignore') as f: 98 | for word in words: 99 | f.write(word + '\n') 100 | 101 | vocab = Vocab(filename=path + '.vocab') 102 | torch.save(vectors, path + '.pth') 103 | 104 | return vocab, vectors 105 | 106 | def serialize_to_file(obj, path, protocol=pickle.HIGHEST_PROTOCOL): 107 | with open(path, 'wb') as f: 108 | pickle.dump(obj, f, protocol=protocol) 109 | 110 | def typename(x): 111 | if isinstance(x, str): 112 | return x 113 | return x.__name__ 114 | -------------------------------------------------------------------------------- /src/gan/utilities/vocab.py: -------------------------------------------------------------------------------- 1 | 2 | from collections import Counter 3 | from itertools import chain 4 | 5 | class Vocab(object): 6 | def __init__(self, filename=None, data=None, lower=False): 7 | self.idxToLabel = {} 8 | self.labelToIdx = {} 9 | self.lower = lower 10 | 11 | # Special entries will not be pruned. 
12 | self.special = [] 13 | 14 | if data is not None: 15 | self.addSpecials(data) 16 | if filename is not None: 17 | self.loadFile(filename) 18 | 19 | self.add('<pad>') 20 | self.add('<unk>') 21 | self.add('<eos>') 22 | 23 | def __getitem__(self, item): 24 | return self.labelToIdx.get(item, self.unk) 25 | 26 | def __contains__(self, item): 27 | return item in self.labelToIdx 28 | 29 | @property 30 | def size(self): 31 | return len(self.idxToLabel) 32 | 33 | def __setitem__(self, key, value): 34 | self.labelToIdx[key] = value 35 | 36 | def __len__(self): 37 | return len(self.labelToIdx) 38 | 39 | def __iter__(self): 40 | return iter(list(self.labelToIdx.keys())) 41 | 42 | def __eq__(self, other): 43 | return all([self.idxToLabel == other.idxToLabel, 44 | self.labelToIdx == other.labelToIdx, 45 | self.lower == other.lower, 46 | self.special == other.special]) 47 | 48 | @property 49 | def pad(self): 50 | return self.labelToIdx['<pad>'] 51 | 52 | @property 53 | def unk(self): 54 | return self.labelToIdx['<unk>'] 55 | 56 | @property 57 | def eos(self): 58 | return self.labelToIdx['<eos>'] 59 | 60 | def is_unk(self, word): 61 | return word not in self 62 | 63 | def size(self): 64 | return len(self.idxToLabel) 65 | 66 | # Load entries from a file. 67 | def loadFile(self, filename): 68 | idx = 0 69 | for line in open(filename, 'r', encoding='utf8', errors='ignore'): 70 | token = line.rstrip('\n') 71 | self.add(token) 72 | idx += 1 73 | 74 | def getIndex(self, key, default=None): 75 | key = key.lower() if self.lower else key 76 | 77 | return self.labelToIdx.get(key, default) 78 | 79 | def getLabel(self, idx, default=None): 80 | return self.idxToLabel.get(idx, default) 81 | 82 | def add_from_data(self, label, idx=None): 83 | if idx is not None: 84 | self.idxToLabel[idx] = label 85 | self.labelToIdx[label] = idx 86 | else: 87 | idx = self.add(label) 88 | 89 | # Mark this `label` and `idx` as special 90 | def addSpecial(self, label): 91 | idx = self.add(label) 92 | self.special += [idx] 93 | 94 | # Mark all labels in `labels` as specials 95 | def addSpecials(self, labels): 96 | for label in labels: 97 | if isinstance(label, tuple): 98 | self.add_from_data(*label) 99 | else: 100 | self.addSpecial(label) 101 | 102 | # Add `label` in the dictionary. Use `idx` as its index if given. 103 | def add(self, label): 104 | label = label.lower() if self.lower else label 105 | 106 | if label in self.labelToIdx: 107 | idx = self.labelToIdx[label] 108 | else: 109 | idx = len(self.idxToLabel) 110 | self.idxToLabel[idx] = label 111 | self.labelToIdx[label] = idx 112 | return idx 113 | 114 | # Convert `labels` to indices. Use `unkWord` if not found. 115 | # Optionally insert `bosWord` at the beginning and `eosWord` at the end. 116 | def convertToIdx(self, labels, unkWord=None, bosWord=None, eosWord=None): 117 | if unkWord is None: 118 | unk = self.unk 119 | else: 120 | unk = self.getIndex(unkWord) 121 | 122 | vec = [] 123 | 124 | if bosWord is not None: 125 | vec += [self.getIndex(bosWord)] 126 | 127 | 128 | vec += [self.getIndex(label, default=unk) for label in labels] 129 | 130 | if eosWord is not None: 131 | vec += [self.getIndex(eosWord)] 132 | 133 | return vec 134 | 135 | # Convert `idx` to labels. If index `stop` is reached, convert it and return. 
136 | def convertToLabels(self, idx, stop=None): 137 | labels = [] 138 | 139 | for i in idx: 140 | labels += [self.getLabel(i)] 141 | if i == stop: 142 | break 143 | 144 | return labels 145 | 146 | 147 | @staticmethod 148 | def from_corpus(corpus, size=None, freq_cutoff=0): 149 | vocab = Vocab() 150 | 151 | word_freq = Counter(chain(*corpus)) 152 | non_singletons = [w for w in word_freq if word_freq[w] > 1] 153 | singletons = [w for w in word_freq if word_freq[w] == 1] 154 | top_k_words = sorted(word_freq.keys(), reverse=True, key=word_freq.get) 155 | 156 | if size is not None: 157 | top_k_words = top_k_words[:size] 158 | 159 | words_not_included = [] 160 | 161 | for word in top_k_words: 162 | if word_freq[word] >= freq_cutoff: 163 | vocab.add(word) 164 | else: 165 | words_not_included.append(word) 166 | 167 | if len(vocab) == size: 168 | break 169 | 170 | return vocab 171 | 172 | @staticmethod 173 | def from_dict(vocab): 174 | new_vocab = Vocab() 175 | 176 | for key, value in vocab.items(): 177 | setattr(new_vocab, key, value) 178 | 179 | return new_vocab 180 | -------------------------------------------------------------------------------- /src/rdf_graph/build_graph.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import argparse 4 | 5 | from datetime import datetime 6 | 7 | from rdf_graph import RDFGraph 8 | 9 | 10 | def run(args_dict): 11 | # create timestamp 12 | timestamp = datetime.now().strftime('%d%b%Y-%H:%M:%S') 13 | 14 | # instantiate class 15 | knowledge_graph = RDFGraph() 16 | 17 | # Associate a JSON file of source documents from which to induce 18 | # the knowledge graph. 19 | knowledge_graph.set_source(args_dict['sources']) 20 | 21 | print('\nExtracting relation triples from abstracts') 22 | knowledge_graph.extract_raw_tuples() 23 | 24 | # pre-process extracted tuples 25 | print('\nPreprocessing raw relation triples') 26 | knowledge_graph.preprocess_raw_tuples() 27 | 28 | # cluster all entities using fast 29 | # agglomerative clustering and cosine distance of averaged word embeddings 30 | print('\nClustering entities from relation triples') 31 | knowledge_graph.cluster_entities(args_dict['glove']) 32 | print('\n{} entity clusters were found' 33 | .format(len(knowledge_graph.entity_cluster_results['cluster_members']))) 34 | 35 | # output text files that will be used openke for knowledge graph creation 36 | # and embedding output .txt files for openke output 37 | print('\nSaving final tuples to .txt files for input to OpenKE') 38 | knowledge_graph.output_to_openke(timestamp) 39 | 40 | 41 | if __name__ == '__main__': 42 | parser = argparse.ArgumentParser(description='Fetcher to retrieve articles ' 43 | 'for modeling.') 44 | parser.add_argument('-s', '--sources', required=True, 45 | help='Select a source for article retrieval.') 46 | parser.add_argument('-g', '--glove', required=True, 47 | help='Path to pickle file containing glove embeddings') 48 | args_dict = vars(parser.parse_args()) 49 | 50 | run(args_dict) 51 | -------------------------------------------------------------------------------- /src/rdf_graph/environment.yml: -------------------------------------------------------------------------------- 1 | name: nlp 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.7 6 | - pandas 7 | - textacy 8 | - neuralcoref 9 | - tqdm 10 | - jupyterlab 11 | - pip 12 | - pip: 13 | - nltk 14 | - stanfordcorenlp==3.9.1.1 15 | - 
https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.1.3/en_core_web_sm-2.1.0.tar.gz 16 | -------------------------------------------------------------------------------- /src/utilities.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def mkdir(directory, verbose=False): 5 | """Make a directory if it doesn't already exist.""" 6 | if not os.path.exists(directory): 7 | if verbose: 8 | print('Make directory %s' % directory) 9 | 10 | os.makedirs(directory) 11 | elif verbose: 12 | print('Directory %s already exists' % directory) 13 | 14 | 15 | def dict_str(my_dict, results=''): 16 | '''Poor man's version of a pretty-ish print function for dictionaries to 17 | expose the basic structure without printing all the values. ''' 18 | results += '{' 19 | results += ', '.join(['{}: {}'.format(k, type(v)) if not isinstance(v, dict) else '{}:\n \t {} \n'.format(k, dict_str(v)) for k, v in my_dict.items()]) 20 | 21 | results += '}' 22 | 23 | return results 24 | -------------------------------------------------------------------------------- /stanford-corenlp-full/rdf_graph.properties: -------------------------------------------------------------------------------- 1 | port = 9000 2 | timeout = 45000 3 | ner.model = edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz 4 | ner.useSUTime = false 5 | ner.applyNumericClassifiers = false -------------------------------------------------------------------------------- /sys/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GallupGovt/multivac/8786a75b62a96d090e13371d8156da97ece75ce3/sys/.gitkeep -------------------------------------------------------------------------------- /templates/base.html: --------------------------------------------------------------------------------
<!-- [static HTML markup omitted; surviving page text and Jinja blocks follow] -->
MULTIVAC | Gallup DSS
{% block header %}{% endblock %}
{% block title %}{% endblock %}
{% block description %}{% endblock %}
System Process
{% block leftcontent %}{% endblock %}
Code Review
{% block rightcontent %}{% endblock %}
{% block scripts %}{% endblock %}
-------------------------------------------------------------------------------- /templates/query.html: --------------------------------------------------------------------------------
{% extends 'base.html' %}
{% block title %}
Gallup - MULTIVAC
{% endblock %}
{% block description %}
A DARPA Automating Scientific Knowledge Extraction (ASKE) Artificial Intelligence Exploration (AIE) Program
{% endblock %}
{% block leftcontent %}
Map queries to knowledge graph
<!-- [query form markup omitted] -->
{% endblock %}
{% block rightcontent %}
<!-- [static HTML markup omitted] -->
{% endblock %}
--------------------------------------------------------------------------------