├── .gitignore ├── LICENSE ├── README.md ├── configs ├── configEN.config ├── configEN_test.config ├── configES.config └── configES_test.config ├── eval.py ├── install.sh ├── install_arc_solvers.sh ├── models.py ├── run.py ├── scripts ├── evaluate_arc_solvers.py ├── head2ARCformat.py ├── head2drqa.py └── pdfexams2txt.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 DVC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # HEAD-QA
2 | 
3 | **NEWS!** HEAD-QA can now be imported from [huggingface datasets](https://huggingface.co/datasets/head_qa). Thank you very much to [Maria Grandury](https://github.com/mariagrandury) for adding it.
4 | 
5 | 
6 | This repository contains the sources used in "HEAD-QA: A Healthcare Dataset for Complex Reasoning" (ACL, 2019).
7 | 
8 | HEAD-QA is a multiple-choice **HEA**lthcare **D**ataset. The questions come from exams to access specialized positions in the Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the [Ministerio de Sanidad, Consumo y Bienestar Social](https://www.mscbs.gob.es/), which also provides direct [access](https://fse.mscbs.gob.es/fseweb/view/public/datosanteriores/cuadernosExamen/busquedaConvocatoria.xhtml) to the exams of the last 5 years (in Spanish).
9 | 
10 | > Date of the last update of the reused documents: January 14th, 2019.
11 | 
12 | HEAD-QA tries to make these questions accessible to the Natural Language Processing community. We hope it is a useful resource towards building better QA systems. The dataset contains questions about the following topics:
13 | 
14 | - Medicine.
15 | - Nursing.
16 | - Psychology.
17 | - Chemistry.
18 | - Pharmacology.
19 | - Biology.
20 | 
21 | # Requirements
22 | 
23 | - Python 3.6.7
24 | - DrQA
25 | - scikit-learn==0.20.2
26 | - numpy==1.16.0
27 | - torch==1.0.0
28 | - torchvision
29 | - spacy==2.0.0
30 | - prettytable==0.7.2
31 | 
32 | ## Requirements for the ARC-Solvers
33 | 
34 | - Python 3.6.7
35 | - torch==0.3.1
36 | - torchvision
37 | - allennlp==0.2.1
38 | 
39 | ## Installation
40 | 
41 | We recommend creating a virtualenv first (e.g. `virtualenv -p python3.6 head-qa`).
42 | The script `install.sh` automatically installs the packages listed above, assuming that you have previously created and activated your virtualenv (tested on Ubuntu 18.04, 64 bits).
43 | The script `install_arc_solvers.sh` installs what is needed to run the ARC-Solvers (Clark et al., 2018).
44 | > We recommend using a separate virtualenv for the ARC-Solvers, as dependencies such as the PyTorch version might otherwise conflict.
45 | 
46 | # Datasets
47 | 
48 | [ES_HEAD dataset](https://drive.google.com/open?id=1dUIqVwvoZAtbX_-z5axCoe97XNcFo1No)
49 | [EN_HEAD dataset](https://drive.google.com/open?id=1phryJg4FjCFkn0mSCqIOP2-FscAeKGV0)
50 | Each dataset contains:
51 | - *.gold -> A TSV gold file that maps each question ID to its ground-truth answer ID. One file per exam.
52 | - HEAD[_EN].json -> The whole HEAD-QA data (used in the so-called 'unsupervised' setting).
53 | - train_HEAD[\_EN].json -> The training set of HEAD-QA (used as the training set in the so-called 'supervised' setting).
54 | - dev_HEAD[\_EN].json -> A JSON file containing the development set of HEAD-QA (used in the 'supervised' setting).
55 | - test_HEAD[\_EN].json -> A JSON file containing the test set of HEAD-QA (used in the 'supervised' setting).
56 | 
57 | [Data (images, pdfs, etc)](https://drive.google.com/open?id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t). Note that these are medical images and some of them might have sensitive content.
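If it helps, here is a minimal sketch of how to inspect these JSON files directly. It assumes the structure produced by `scripts/pdfexams2txt.py` (a top-level `exams` dictionary whose entries hold a `data` list of questions with `qtext`, `ra` and `answers` fields); the path `HEAD/HEAD.json` is just an example:

```
import json

with open("HEAD/HEAD.json") as f:
    head = json.load(f)

for exam_name, exam in head["exams"].items():
    print(exam_name, exam["category"], exam["year"])
    for question in exam["data"]:
        qtext = question["qtext"]        # question text
        right_answer = question["ra"]    # id of the gold answer
        # map answer id -> answer text
        options = {a["aid"]: a["atext"] for a in question["answers"]}
```

The `utils.Dataset` class used by `run.py` (`load_json`, `get_exams`, `get_qas`) wraps this same loading.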
58 | 
59 | 
60 | 
61 | # Run the baselines: Length, Random, Blind_n, IR and DrQA
62 | 
63 | Available baselines for Spanish HEAD-QA: Length, Random, Blind_n, IR.
64 | Available baselines for English HEAD-QA (HEAD-QA\_EN): Length, Random, Blind_n, IR, DrQA.
65 | 
66 | **Description of the baselines:**
67 | - Length: Chooses the longest answer.
68 | - Random: Chooses a random answer.
69 | - Blind_n: Chooses the *n*th answer.
70 | - IR: Chooses the answer whose query (question + *n*th answer) retrieves the most relevant document.
71 | - DrQA: A model based on DrQA (Chen, D., Fisch, A., Weston, J., & Bordes, A. Reading Wikipedia to Answer Open-Domain Questions).
72 | 
73 | 
74 | ## Creating an inverted index
75 | 
76 | IR and DrQA require an inverted index to be created in advance. This is done using [wikiextractor](https://github.com/attardi/wikiextractor) and following [DrQA's Document Retriever](https://github.com/facebookresearch/DrQA/blob/master/scripts/retriever/README.md) guidelines (visit their README.md for a detailed explanation of how to create the index; we summarize the main steps here):
77 | 
78 | In this work we used the following Wikipedia dumps:
79 | 
80 | - Spanish: [eswiki-20180620-pages-articles.xml.bz2](http://www.grupolys.org/software/head-qa-acl2019/eswiki-20180620-pages-articles.xml.bz2)
81 | - English: [enwiki-20180701-pages-articles.xml.bz2](http://www.grupolys.org/software/head-qa-acl2019/enwiki-20180701-pages-articles.xml.bz2)
82 | 
83 | Alternatively, you can try using a current Wikipedia dump from https://dumps.wikimedia.org/
84 | 
85 | ```
86 | PYTHONPATH="$HOME/git/wikiextractor" python $HOME/git/wikiextractor/WikiExtractor.py $PATH_WIKIPEDIA_DUMP -o $PATH_WIKI_JSON --json
87 | PYTHONPATH="$HOME/git/DrQA/" python $HOME/git/DrQA/scripts/retriever/build_db.py $PATH_WIKI_JSON $PATH_DB
88 | PYTHONPATH="$HOME/git/DrQA/" python $HOME/git/DrQA/scripts/retriever/build_tfidf.py --num-workers 2 $PATH_DB $PATH_TFIDF
89 | ```
90 | 
91 | The model created in $PATH_TFIDF is what will be used as our inverted index.
92 | If they are of any help, the indexes we used in our work can be found [here](http://www.grupolys.org/software/head-qa-acl2019/wiki-articles.tfidf.zip).
93 | 
94 | ## Updating DrQA's tokenizer
95 | 
96 | By default, DrQA uses the CoreNLP tokenizer. In this work we used the SpacyTokenizer instead. To use it, go to `DrQA/drqa/pipeline/__init__.py` and make sure the `DEFAULTS` dictionary looks like the one below. Also, we used `multitask.mdl` as the `reader_model`. Make sure you downloaded it when you installed DrQA.
97 | 
98 | ```
99 | from ..tokenizers import CoreNLPTokenizer, SpacyTokenizer
100 | 
101 | DEFAULTS = {
102 |     'tokenizer': SpacyTokenizer,  #CoreNLPTokenizer,
103 |     'ranker': TfidfDocRanker,
104 |     'db': DocDB,
105 |     'reader_model': os.path.join(DATA_DIR, 'reader/multitask.mdl'),
106 | }
107 | ```
108 | 
109 | 
110 | ## Create a configuration file
111 | 
112 | ```
113 | #A configuration file for Spanish
114 | 
115 | lang=es
116 | eval=eval.py
117 | #Path to your DrQA installation
118 | drqa=DrQA/
119 | use_stopwords=False
120 | ignore_questions=False
121 | negative_questions=False
122 | #The folder containing the .gold files
123 | path_solutions=HEAD/
124 | 
125 | es_head=HEAD/HEAD.json #HEAD-QA in json format
126 | #The inverted index that we have previously created.
127 | es_retriever=wikipedia/eswiki-20180620-articles.tfidf/eswiki-20180620-articles-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz
128 | 
129 | ```
130 | 
131 | 
132 | After this, you should be able to run the script `run.py`:
133 | 
134 | ```
135 | python run.py --config configs/config$LANG.config --answerer $ANSWERER --output $OUTPUT
136 | ```
137 | 
138 | - `--config` A path to a configuration file (see the folder `configs` for examples).
139 | - `--answerer` A string indicating which 'answerer' to use. Valid values are [length, random, ir, drqa, blind_n] (where *n* is a number indicating that the *n*th answer should be taken as the right one).
140 | - `--output` The path to the file where the results will be saved.
141 | 
142 | # Running the ARC-solvers
143 | 
144 | We also run the ARC-Solvers used in the ARC challenge (Clark, P., Cowhey, I., Etzioni, O., Khot, T., Sabharwal, A., Schoenick, C., & Tafjord, O. Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge). To install and run them, follow these steps:
145 | 
146 | 1 - Follow the [ARC-solvers README.md instructions](https://github.com/allenai/ARC-Solvers) to create a virtualenv, create the index and download the models and resources:
147 | 
148 | > NOTE that instead of using their ARC_corpus.txt as the inverted index, we again used Wikipedia. If you also want to use Wikipedia, you need to do two things:
149 | 1. Make sure you have downloaded our [Wikipedia corpus](http://www.grupolys.org/software/head-qa-acl2019/WikiCorpus.zip) in txt format.
150 | 2. Modify the file ARC-Solvers/scripts/download_data.sh and change the argument specifying the corpus from ARC_corpus.txt to the path where you have stored the Wikipedia corpus.
151 | 
152 | > NOTE 2: The ARC-Solvers need Elasticsearch 6+ to download the data. Download it and run it:
153 | ```
154 | cd elasticsearch-
155 | ./bin/elasticsearch
156 | ```
157 | 
158 | 2 - Convert HEAD_EN.json into the input format for the ARC solvers:
159 | 
160 | ```
161 | PYTHONPATH=. python scripts/head2ARCformat.py --input HEAD_EN/HEAD_EN.json --output HEAD_ARC/
162 | ```
163 | 
164 | 3 - Run the models using the evaluation scripts provided together with the ARC solvers:
165 | 
166 | ```
167 | cd ARC-Solvers
168 | sh scripts/evaluate_solver.sh ../HEAD_ARC/HEAD_EN.arc.txt data/ARC-V1-Models-Aug2018/dgem/
169 | sh scripts/evaluate_solver.sh ../HEAD_ARC/HEAD_EN.arc.txt data/ARC-V1-Models-Aug2018/decompatt/
170 | sh scripts/evaluate_bidaf.sh ../HEAD_ARC/HEAD_EN.arc.txt data/ARC-V1-Models-Aug2018/bidaf/
171 | ```
172 | 
173 | 4 - Compute the scores for HEAD-QA, based on the ARC-Solvers outputs:
174 | 
175 | ```
176 | cd ..
177 | PYTHONPATH=. python scripts/evaluate_arc_solvers.py --arc_results $PATH_RESULTS --output $PATH_OUTPUT_DIR --disambiguator length --breakdown_results --path_eval eval.py
178 | ```
179 | where:
180 | - `--arc_results` Path to the directory containing the outputs computed in step 3.
181 | - `--output` The path to the output directory where the results will be stored.
182 | - `--disambiguator` The strategy used to decide the right answer if several answers were selected as valid by an ARC-solver.
183 | - `--breakdown_results` Activate to report individual results for each exam.
184 | - `--path_eval` Path to the evaluation script.
185 | 
186 | 
187 | #### Issues
188 | 
189 | We had problems running some models because we were unable to find the `question-tuplizer.jar` used in the ARC-solvers.
If you experience the error `Error: Unable to access jarfile data/ARC-V1-Models-Feb2018/question-tuplizer.jar`, we recommend replacing, in the file `scripts/evaluate_solver.sh`, the line:
190 | `java -Xmx8G -jar data/ARC-V1-Models-Feb2018/question-tuplizer.jar`
191 | with
192 | `java -Xmx8G -jar data/ARC-V1-Models-Aug2018/question-tuplizer.jar`
193 | 
194 | We also had problems running the dgem baseline. The default torch version that is installed if you follow the instructions in the ARC-solvers README.md is 0.4.1. To make it work, we needed to install torch 0.3.1 instead.
195 | 
196 | ## Acknowledgements
197 | 
198 | This work has received funding from the European Research Council (ERC), under the European Union's Horizon 2020 research and innovation programme (FASTPARSE, grant agreement No 714150).
199 | 
200 | ### References
201 | 
202 | Vilares, David and Gómez-Rodríguez, Carlos. "HEAD-QA: A Healthcare Dataset for Complex Reasoning", to appear, ACL 2019.
203 | 
--------------------------------------------------------------------------------
/configs/configEN.config:
--------------------------------------------------------------------------------
1 | lang=en
2 | eval=eval.py
3 | drqa=DrQA/
4 | use_stopwords=False
5 | ignore_questions=False
6 | negative_questions=False
7 | path_solutions=HEAD_EN/
8 | 
9 | en_head=HEAD_EN/HEAD_EN.json
10 | en_retriever=wikipedia/enwiki-20180701-articles.tfidf/enwiki-20180701-articles-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz
11 | en_drqa_reader_model=DrQA/data/reader/multitask.mdl
12 | 
13 | 
14 | 
15 | 
--------------------------------------------------------------------------------
/configs/configEN_test.config:
--------------------------------------------------------------------------------
1 | lang=en
2 | eval=eval.py
3 | drqa=DrQA/
4 | use_stopwords=False
5 | ignore_questions=False
6 | negative_questions=False
7 | path_solutions=HEAD_EN/
8 | 
9 | en_head=HEAD_EN/test_HEAD_EN.json
10 | en_retriever=wikipedia/enwiki-20180701-articles.tfidf/enwiki-20180701-articles-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz
11 | en_drqa_reader_model=DrQA/data/reader/multitask.mdl
12 | 
13 | 
14 | 
15 | 
--------------------------------------------------------------------------------
/configs/configES.config:
--------------------------------------------------------------------------------
1 | lang=es
2 | eval=eval.py
3 | drqa=DrQA/
4 | use_stopwords=False
5 | ignore_questions=False
6 | negative_questions=False
7 | path_solutions=HEAD/
8 | 
9 | es_head=HEAD/HEAD.json
10 | es_retriever=wikipedia/eswiki-20180620-articles.tfidf/eswiki-20180620-articles-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz
11 | 
12 | 
13 | 
--------------------------------------------------------------------------------
/configs/configES_test.config:
--------------------------------------------------------------------------------
1 | lang=es
2 | eval=eval.py
3 | drqa=DrQA/
4 | use_stopwords=False
5 | ignore_questions=False
6 | negative_questions=False
7 | path_solutions=HEAD/
8 | 
9 | #Spanish
10 | es_head=HEAD/test_HEAD.json
11 | es_retriever=wikipedia/eswiki-20180620-articles.tfidf/eswiki-20180620-articles-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz
12 | 
13 | 
14 | 
--------------------------------------------------------------------------------
/eval.py:
--------------------------------------------------------------------------------
1 | from utils import F1_SCORE, RECALL, PRECISION, ACCURACY, NETAS, RIGHT, WRONG, UNANSWERED, ID_UNANSWERED
2 | from sklearn.metrics import
precision_recall_fscore_support
3 | from sklearn.metrics import accuracy_score
4 | from argparse import ArgumentParser
5 | import codecs
6 | import warnings
7 | warnings.filterwarnings('ignore')
8 | 
9 | def netas_score(gold, predicted, avg_10best_scores=574.7):
10 |     #Exam-style score: +3 per right answer, -1 per wrong answer, 0 if unanswered
11 |     right = 0
12 |     wrong = 0
13 |     unanswered = 0
14 |     brutas_score = 0
15 |     if len(gold) != len(predicted):
16 |         raise ValueError("The gold and predicted vectors must have the same length")
17 |     else:
18 |         for g, p in zip(gold, predicted):
19 |             if g == p:
20 |                 right+=1
21 |                 brutas_score+=3
22 |             elif p != ID_UNANSWERED:
23 |                 brutas_score-=1
24 |                 wrong+=1
25 |             else:
26 |                 unanswered+=1
27 |     return brutas_score, right, wrong, unanswered
28 | 
29 | 
30 | def scores(y_pred,y_gold):
31 |     p,r,f1,_ = precision_recall_fscore_support(y_gold, y_pred, average='macro')
32 |     net,right,wrong,unanswered = netas_score(y_gold,y_pred)
33 |     scores = ""
34 |     scores+=PRECISION+"\t"+str(round(p,3))+"\n"
35 |     scores+=RECALL+"\t"+str(round(r,3))+"\n"
36 |     scores+=F1_SCORE+"\t"+str(round(f1,3))+"\n"
37 |     scores+=ACCURACY+"\t"+str(round(accuracy_score(y_gold, y_pred),3))+"\n"
38 |     scores+=RIGHT+"\t"+str(right)+"\n"
39 |     scores+=WRONG+"\t"+str(wrong)+"\n"
40 |     scores+=UNANSWERED+"\t"+str(unanswered)+"\n"
41 |     scores+=NETAS+"\t"+str(net)+"\n"
42 |     return scores
43 | 
44 | if __name__ == '__main__':
45 | 
46 |     arg_parser = ArgumentParser()
47 |     arg_parser.add_argument("--gold", dest="gold", help="Path to the gold (tsv) file", default=None)
48 |     arg_parser.add_argument("--predicted", dest="predicted", help="Path to the file with the predictions", default=None)
49 | 
50 |     args = arg_parser.parse_args()
51 |     with codecs.open(args.gold) as f_gold:
52 |         gold = f_gold.readlines()
53 |         y_gold = [e.split()[1].strip() for e in gold]
54 | 
55 |     with codecs.open(args.predicted) as f_predicted:
56 |         predicted = f_predicted.readlines()
57 |         y_predicted = [e.split()[1].strip() for e in predicted]
58 | 
59 |     p,r,f1,_ = precision_recall_fscore_support(y_gold, y_predicted, average='macro')
60 |     net,right,wrong,unanswered = netas_score(y_gold,y_predicted)
61 |     print (PRECISION,round(p,3))
62 |     print (RECALL, round(r,3))
63 |     print (F1_SCORE, round(f1,3))
64 |     print (ACCURACY,round(accuracy_score(y_gold, y_predicted),3))
65 |     print (RIGHT, right)
66 |     print (WRONG, wrong)
67 |     print (UNANSWERED, unanswered)
68 |     print (NETAS, net)
69 | 
70 | 
--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
1 | ################################################
2 | # For the control, IR and DrQA methods
3 | ################################################
4 | 
5 | #Read additional instructions to install DrQA at https://github.com/facebookresearch/DrQA
6 | #For the most recent version, clone https://github.com/facebookresearch/DrQA.git instead
7 | git clone https://github.com/aghie/DrQA.git
8 | cd DrQA; pip install -r requirements.txt; python setup.py develop
9 | ./install_corenlp.sh
10 | ./download.sh
11 | 
12 | pip install scikit-learn==0.20.2
13 | pip install numpy==1.16.0
14 | pip install torch==1.0.0
15 | pip install torchvision
16 | pip install spacy==2.0.0
17 | 
18 | python -m spacy download en
19 | python -m spacy download es
20 | 
21 | wget http://www.grupolys.org/software/head-qa-acl2019/HEAD.zip
22 | wget http://www.grupolys.org/software/head-qa-acl2019/HEAD_EN.zip
23 | wget http://www.grupolys.org/software/head-qa-acl2019/data.zip
24 | wget http://www.grupolys.org/software/head-qa-acl2019/wiki-articles.tfidf.zip
25 | wget 
http://www.grupolys.org/software/head-qa-acl2019/WikiCorpus.zip 26 | 27 | unzip HEAD.zip 28 | unzip HEAD_EN.zip 29 | unzip data.zip 30 | mkdir wikipedia 31 | unzip wiki-articles.tfidf.zip -d wikipedia/ 32 | unzip WikiCorpus.zip -d wikipedia/ 33 | 34 | -------------------------------------------------------------------------------- /install_arc_solvers.sh: -------------------------------------------------------------------------------- 1 | #https://github.com/aghie/ARC-Solvers.git 2 | 3 | pip install allennlp==0.2.1 4 | pip install torch==0.3.1 5 | pip install torchvision 6 | pip install prettytable==0.7.2 7 | pip install scikit-learn==0.20.2 8 | pip install numpy==1.16.0 9 | -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | from sklearn.preprocessing import normalize 2 | from sklearn.metrics.pairwise import cosine_similarity 3 | from drqa import pipeline 4 | from drqa.retriever import utils 5 | from drqa import retriever 6 | from nltk.corpus import stopwords 7 | from utils import TextSimilarity 8 | import logging 9 | import random 10 | import itertools 11 | import prettytable 12 | import numpy as np 13 | import sys 14 | import utils 15 | import codecs 16 | import tempfile 17 | import subprocess 18 | import json 19 | import os 20 | import abc 21 | 22 | 23 | # class VotingAnswerer(object): 24 | # 25 | # def __init__(self, answerers=[]): 26 | # self.answerers = answerers 27 | # self.question_classifier = utils.QuestionClassifier() 28 | # 29 | # def add(self,answerer): 30 | # self.answerers.add(answerer) 31 | # 32 | # def remove(self, answerer): 33 | # self.answerers.remove(answerer) 34 | # 35 | # def predict(self, qas): 36 | # preds = [] 37 | # 38 | # answers = np.zeros((len(self.answerers), len(qas)), dtype=np.integer) 39 | # for j,answerer in enumerate(self.answerers): 40 | # answers[j]= [raid for qid, raid in answerer.predict(qas)] 41 | # 42 | # print (answers, answers.shape) 43 | # 44 | # for qid,i in enumerate(range(0, answers.shape[1]),1): 45 | # 46 | # preds.append((str(qid),np.argmax(np.bincount(answers[:,i])))) 47 | # 48 | # 49 | # return preds 50 | 51 | 52 | 53 | class Answerer(object): 54 | 55 | def __init__(self,qclassifier): 56 | self.qclassifier = qclassifier 57 | 58 | def predict(self, qas): 59 | 60 | not_to_answer = self.not_to_answer(qas) 61 | predictions = self._predict(qas) 62 | for qid in not_to_answer: 63 | predictions[qid] = utils.ID_UNANSWERED 64 | 65 | return predictions 66 | 67 | 68 | def not_to_answer(self, qas): 69 | 70 | unanswerable = [] 71 | for qid, question, answers in qas: 72 | if self.qclassifier is not None and self.qclassifier.is_unanswerable(question): 73 | unanswerable.append(qid) 74 | return unanswerable 75 | 76 | 77 | @abc.abstractmethod 78 | def _predict(self, qas): 79 | pass 80 | 81 | 82 | class LengthAnswerer(Answerer): 83 | """ 84 | A solver that selects as the right answer the longest one 85 | """ 86 | 87 | NAME = "LengthAnswerer" 88 | MAX_CRITERIA = "max" 89 | MIN_CRITERIA = "min" 90 | 91 | def __init__(self, criteria=MAX_CRITERIA,count_words=False, 92 | qclassifier=None): 93 | """ 94 | 95 | Args 96 | 97 | criteria (string). Criteria to choose the right answer based on their lengths. 98 | Valid values are "max" or "min" 99 | count_words (boolean): If True, we split the text and count the number of actual words. 
100 | Otherwise we simply count the length of the string 101 | 102 | """ 103 | 104 | Answerer.__init__(self,qclassifier) 105 | self.count_words = count_words 106 | 107 | if criteria.lower() == self.MAX_CRITERIA: 108 | self.criteria = max 109 | elif criteria.lower() == self.MIN_CRITERIA: 110 | self.criteria = min 111 | else: 112 | raise NotImplementedError 113 | 114 | 115 | def name(self): 116 | return self.NAME 117 | 118 | def _predict(self, qas): 119 | """ 120 | 121 | Returns a list of tuples (question_id, right_answer_id) 122 | 123 | Args 124 | 125 | qas (list). A list of tuples of strings of the form (Q, A1,...,AN) 126 | 127 | """ 128 | preds = {} 129 | for qid, question, answers in qas: 130 | 131 | if not self.count_words: 132 | answer_lengths = list(map(len, answers)) 133 | pred = answer_lengths.index(self.criteria(answer_lengths))+1 134 | else: 135 | answer_lengths = list(map(len, [a.split() for a in answers])) 136 | pred = answer_lengths.index(self.criteria(answer_lengths))+1 137 | preds[qid] = pred 138 | return preds 139 | 140 | def __str__(self): 141 | return self.NAME 142 | 143 | 144 | 145 | class RandomAnswerer(Answerer): 146 | """ 147 | A solver that select as the right answer a random one 148 | """ 149 | 150 | NAME = "RandomAnswerer" 151 | 152 | def __init__(self, qclassifier=None): 153 | Answerer.__init__(self,qclassifier) 154 | 155 | def name(self): 156 | return self.NAME 157 | 158 | def _predict(self, qas): 159 | """ 160 | 161 | Returns a list of tuples (question_id, right_answer_id) 162 | 163 | Args 164 | 165 | qas (list). A list of tuples of strings of the form (Q, A1,...,AN) 166 | 167 | """ 168 | 169 | preds = {} 170 | for qid, question, answers in qas: 171 | preds[qid] = random.randint(1,len(answers)) 172 | return preds 173 | 174 | def __str__(self): 175 | return self.NAME 176 | 177 | 178 | class BlindAnswerer(Answerer): 179 | 180 | NAME = "BlindAnswerer" 181 | 182 | 183 | def __init__(self, default, qclassifier=None): 184 | 185 | Answerer.__init__(self,qclassifier) 186 | self.default = default 187 | 188 | def name(self): 189 | return self.NAME+"-"+str(self.default) 190 | 191 | def _predict(self, qas): 192 | """ 193 | 194 | Returns a list of tuples (question_id, right_answer_id) 195 | 196 | Args 197 | 198 | qas (list). 
A list of tuples of strings of the form (Q, A1,...,AN) 199 | 200 | """ 201 | 202 | preds = {} 203 | for qid, question, answers in qas: 204 | if self.default in range(1,len(answers)+1): 205 | preds[qid] = self.default 206 | else: 207 | raise ValueError("The answer ID",self.default,"is not available in options 1 to ", len(answers)) 208 | return preds 209 | 210 | def __str__(self): 211 | return self.NAME+"-"+str(self.default) 212 | 213 | 214 | 215 | 216 | class WordSimilarityAnswerer(Answerer): 217 | """ 218 | This solver: (1) computes a question vector by summing the individual embeddings 219 | of its words (2) repeats the same process for each answer and (3) chooses as the right 220 | answer the asnwer that maximizes cosine_similarity(question_vector, answer_i_vector) 221 | """ 222 | 223 | NAME = "WordSimilarityAnswerer" 224 | 225 | def __init__(self, path_word_emb, qclassifier): 226 | 227 | """ 228 | 229 | Args 230 | 231 | path_word_emb (string): Path to the embeddings file 232 | """ 233 | 234 | Answerer.__init__(self,qclassifier) 235 | self.word2index = {} 236 | with codecs.open(path_word_emb) as f: 237 | self.n_words, self.embedding_size = tuple(map(int,f.readline().strip("\n").split())) 238 | self.word_embeddings = np.zeros(shape=(self.n_words,self.embedding_size), 239 | dtype=float) 240 | line = f.readline() 241 | idl = 0 242 | while line != "": 243 | 244 | word, vector = line.split()[0], line.split()[1:] 245 | self.word2index[word] = idl 246 | self.word_embeddings[idl] = list(map(float,vector)) 247 | line = f.readline() 248 | idl+=1 249 | print (" [OK]") 250 | 251 | 252 | def name(self): 253 | return self.NAME 254 | 255 | def _predict(self,qas): 256 | """ 257 | 258 | Returns a list of tuples (question_id, right_answer_id) 259 | 260 | Args 261 | 262 | qas (list). A list of tuples of strings of the form (QID, Q, A1,...,AN) 263 | 264 | """ 265 | 266 | preds = {} 267 | for qid, question, answers in qas: 268 | 269 | question_word_embs = [self.word_embeddings[self.word2index[word]] 270 | if word in self.word2index else np.zeros(self.embedding_size) 271 | for word in question] 272 | 273 | embedding_question = normalize(np.sum(question_word_embs, axis=0).reshape(1, -1)) 274 | 275 | best_score = -1 276 | for aid, answer in enumerate(answers,1): 277 | answer_word_embs = [self.word_embeddings[self.word2index[word]] 278 | if word in self.word2index else np.zeros(self.embedding_size) 279 | for word in answer] 280 | answer_vector = normalize(np.sum(answer_word_embs, axis=0).reshape(1, -1)) 281 | score = cosine_similarity(embedding_question, answer_vector)[0][0] 282 | if score > best_score: 283 | best_answer,best_score = aid, score 284 | 285 | preds[qid] = best_answer 286 | return preds 287 | 288 | def __str__(self): 289 | return self.NAME 290 | 291 | 292 | 293 | class IRAnswerer(Answerer): 294 | 295 | """ 296 | A solver that select as the right answer the answer that maximized 297 | the TF-IDF score of a Wikipedia document when the question+answer_i 298 | is used as the query. It not found it can choose between not to answer 299 | or answer randomly. 
300 | 301 | This implementation uses the IR system presented in DrQa (Chen et al., 2017) 302 | """ 303 | 304 | 305 | NAME = "IRAnswerer" 306 | 307 | def __init__(self,tfidf_path, 308 | tokenizer, 309 | use_stopwords = False, 310 | qclassifier = None): 311 | 312 | Answerer.__init__(self,qclassifier) 313 | self.tokenizer = tokenizer 314 | self.ranker =retriever.get_class('tfidf')(tfidf_path=tfidf_path) 315 | self.stopwords = stopwords 316 | self.use_stopwords = use_stopwords 317 | 318 | def _preprocess(self,query): 319 | 320 | if self.use_stopwords: 321 | return " ".join([token.text for token in list(self.tokenizer(query)) 322 | if not token.is_stop]) 323 | else: 324 | return " ".join([token.text for token in list(self.tokenizer(query))]) 325 | 326 | def name(self): 327 | return self.NAME 328 | 329 | def _process(self,query, k=1): 330 | doc_names, doc_scores = self.ranker.closest_docs(query, k) 331 | results = [] 332 | for i in range(len(doc_names)): 333 | results.append((doc_names[i], doc_scores[i])) 334 | return results 335 | 336 | 337 | def _predict(self,qas): 338 | preds = {} 339 | for qid, question, answers in qas: 340 | 341 | unanswerable = False if self.qclassifier is None else self.qclassifier.is_unanswerable(question) 342 | 343 | #If it is a negation question we look for the least similar answer 344 | if self.qclassifier.is_negation_question(question): 345 | best_answer, best_score = 0, 100000000 346 | f = min 347 | else: 348 | best_answer, best_score = 0,0 349 | f = max 350 | 351 | if not unanswerable: 352 | 353 | question = self._preprocess(question) 354 | 355 | for aid, answer in enumerate(answers,1): 356 | name, score = self._process(" ".join([question, answer]), k=1)[0] 357 | if f == max and score > best_score: 358 | best_answer,best_score = aid, score 359 | elif f == min and score < best_score: 360 | best_answer,best_score = aid, score 361 | 362 | preds[qid] = best_answer 363 | 364 | return preds 365 | 366 | def __str__(self): 367 | return self.NAME 368 | 369 | 370 | 371 | class DrQAAnswerer(Answerer): 372 | """ 373 | A solver that implements a simple wrapper to make predictions using 374 | DrQA (Chen et al. 2017) 375 | """ 376 | NAME = "DrQAAnswerer" 377 | 378 | def __init__(self, tokenizer, reader_model=None, batch_size=64, 379 | qclassifier=None, cuda=False): 380 | 381 | """ 382 | Args 383 | 384 | drqa (string): 385 | """ 386 | 387 | print ("Tokenizer", tokenizer) 388 | Answerer.__init__(self,qclassifier) 389 | self.batch_size = batch_size 390 | self.n_docs = 5 391 | self.top_n = 1 392 | self.ts = TextSimilarity() 393 | print ("Reader model", reader_model, cuda) 394 | self.drqa = pipeline.DrQA( 395 | reader_model=reader_model, 396 | fixed_candidates=None, 397 | embedding_file=None, 398 | tokenizer="spacy", 399 | batch_size=batch_size, 400 | cuda=cuda, 401 | data_parallel=False, 402 | ranker_config={'options': {'tfidf_path': None, 403 | 'strict': False}}, 404 | db_config={'options': {'db_path': None}}, 405 | num_workers=1, 406 | ) 407 | 408 | 409 | 410 | def name(self): 411 | return self.NAME 412 | 413 | def _predict(self, qas): 414 | """ 415 | 416 | Returns a list of tuples (question_id, right_answer_id) 417 | 418 | Args 419 | 420 | qas (list). 
A list of tuples of strings of the form (QID, Q, A1,...,AN) 421 | 422 | """ 423 | 424 | preds = {} 425 | queries = [question for qid, question, answers in qas] 426 | tmp_out = tempfile.NamedTemporaryFile(delete=False) 427 | 428 | drqa_answers = [] 429 | with open(tmp_out.name, 'w') as f: 430 | batches = [queries[i: i + self.batch_size] 431 | for i in range(0, len(queries), self.batch_size)] 432 | for i, batch in enumerate(batches): 433 | 434 | predictions = self.drqa.process_batch( 435 | batch, 436 | n_docs=self.n_docs, 437 | top_n=self.top_n, 438 | ) 439 | 440 | drqa_answers.extend([p[0]["span"] for p in predictions]) 441 | 442 | #Compare which answer is the closest one to the DrQA answers 443 | assert (len(drqa_answers) == len(qas)) 444 | for pred_answer, (qid,question,answers) in zip(drqa_answers, qas): 445 | similarities = sorted([(idanswer, self.ts.similarity(pred_answer.split(" "), answer.split(" "))) 446 | for idanswer,answer in enumerate(answers,1)], 447 | key= lambda x : x[1], reverse=True) 448 | 449 | #No question scored We select the longest answer instead 450 | if similarities[0][1] == 0: 451 | length_answers = [(ida,len(a)) for ida, a in enumerate(answers,1)] 452 | length_answers = sorted(length_answers, key = lambda a: a[1], reverse=True) 453 | preds[qid] = length_answers[0][0] 454 | 455 | else: 456 | if self.qclassifier.is_negation_question(question): 457 | preds[qid] = similarities[-1][0] 458 | else: 459 | preds[qid] = similarities[0][0] 460 | 461 | return preds 462 | 463 | 464 | def __str__(self): 465 | return self.NAME 466 | 467 | 468 | 469 | 470 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | from models import RandomAnswerer, LengthAnswerer, IRAnswerer, WordSimilarityAnswerer, DrQAAnswerer, BlindAnswerer 2 | from spacy.lang.es import Spanish 3 | from spacy.lang.en import English 4 | from utils import * 5 | from argparse import ArgumentParser 6 | from subprocess import PIPE,Popen 7 | from tqdm import tqdm 8 | import codecs 9 | import json 10 | import tempfile 11 | import os 12 | import random 13 | import subprocess 14 | import configparser 15 | import utils 16 | import spacy 17 | import en_core_web_sm 18 | 19 | 20 | SPANISH = "es" 21 | ENGLISH = "en" 22 | 23 | if __name__ == '__main__': 24 | 25 | arg_parser = ArgumentParser() 26 | arg_parser.add_argument("--config", dest="config", help="Path to the configuration file") 27 | arg_parser.add_argument("--output", dest="output", help="Path to the output to store the results") 28 | arg_parser.add_argument("--answerer", dest="answerer", help="Name of the answerer to be used to train the model") 29 | 30 | args = arg_parser.parse_args() 31 | config = config_file_to_dict(args.config) 32 | #Load the configuration for Spanish 33 | path_solutions = config["path_solutions"] 34 | 35 | if config["lang"].lower() == SPANISH: 36 | tfidf_retriever = config["es_retriever"] 37 | path_head =config["es_head"] 38 | 39 | unanswerable_sentences = [utils.BOS_IMAGE_QUESTION_ES] 40 | neg_words = utils.NEGATION_WORDS_ES 41 | nlp = spacy.load('es_core_news_sm') 42 | tokenizer = Spanish().Defaults.create_tokenizer(nlp) 43 | 44 | elif config["lang"].lower() == ENGLISH: 45 | tfidf_retriever = config["en_retriever"] 46 | path_head =config["en_head"] 47 | unanswerable_sentences = [utils.BOS_IMAGE_QUESTION_EN] 48 | neg_words = utils.NEGATION_WORDS_EN 49 | nlp = spacy.load('en_core_web_sm') 50 | tokenizer = 
English().Defaults.create_tokenizer(nlp) 51 | drqa_reader_model = config["en_drqa_reader_model"] 52 | 53 | else: 54 | raise NotImplementedError 55 | 56 | ignore_questions = True if config["ignore_questions"].lower() == "true" else False 57 | negative_questions = True if config["negative_questions"].lower() == "true" else False 58 | use_stopwords = True if config["use_stopwords"].lower() == "true" else False 59 | 60 | random.seed(17) 61 | unanswerable = [] 62 | 63 | if not negative_questions: 64 | neg_words = [] 65 | if not ignore_questions: 66 | unanswerable_sentences = [] 67 | 68 | qclassifier = QuestionClassifier(unanswerable= unanswerable_sentences, 69 | neg_words = neg_words) 70 | 71 | if args.answerer.lower() == "length": 72 | answerer = LengthAnswerer(qclassifier=qclassifier) 73 | elif args.answerer.lower().startswith("blind"): 74 | x = int(args.answerer.split("_")[1]) 75 | answerer = BlindAnswerer(default=x,qclassifier=qclassifier) 76 | elif args.answerer.lower() == "random": 77 | answerer = RandomAnswerer(qclassifier=qclassifier) 78 | elif args.answerer.lower() == "ir": 79 | answerer = IRAnswerer(tfidf_retriever, qclassifier=qclassifier, 80 | use_stopwords=False, tokenizer=tokenizer) 81 | elif args.answerer.lower() == "drqa": 82 | answerer = DrQAAnswerer(tokenizer=tokenizer, 83 | reader_model=drqa_reader_model, 84 | qclassifier=qclassifier, 85 | cuda=True) 86 | else: 87 | raise NotImplementedError("Answerer", args.answerer," is not available") 88 | 89 | systems = [answerer] 90 | solutions = {f.replace(".gold",""):path_solutions+os.sep+f for f in os.listdir(path_solutions) if f.endswith(".gold")} 91 | score = Score() 92 | dataset = Dataset() 93 | dataset.load_json(path_head) 94 | predictions = {} 95 | unanswerable = {} 96 | for answerer in systems: 97 | 98 | print ("Running ", answerer, "on ", path_head) 99 | avg_netas = 0 100 | avg_fscore = 0 101 | 102 | if answerer not in predictions: 103 | predictions[answerer.name()] = {} 104 | unanswerable[answerer.name()] = set([]) 105 | n_exams = len(dataset.get_exams()) 106 | for exam in tqdm(dataset.get_exams()): 107 | qas = dataset.get_qas(exam) 108 | preds = answerer.predict(qas) 109 | predictions[answerer.name()][exam] = preds 110 | 111 | 112 | systems = [] 113 | ir_answerer = None 114 | for answerer in predictions: 115 | for exam in predictions[answerer]: 116 | gold = solutions[exam] 117 | tmp = tempfile.NamedTemporaryFile(mode="w",delete=False) 118 | predicted = tmp.name 119 | for qid in predictions[answerer][exam]: 120 | tmp.write("\t".join([qid,str(predictions[answerer][exam][qid])])+"\n") 121 | tmp.close() 122 | 123 | command = ["python",config["eval"],"--gold",gold,"--predicted",predicted] 124 | p = subprocess.Popen(" ".join(command), stdout=subprocess.PIPE, shell=True) 125 | a, err = p.communicate() 126 | e = score.parse_eval(a.decode("utf-8")) 127 | score.add_exam(exam, e) 128 | os.remove(tmp.name) 129 | 130 | with codecs.open(args.output,"w") as f_out_results: 131 | f_out_results.write(score.get_table().get_string()) 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /scripts/evaluate_arc_solvers.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from subprocess import PIPE,Popen 3 | from utils import * 4 | from models import LengthAnswerer, BlindAnswerer 5 | from collections import OrderedDict 6 | import os 7 | import json 8 | import tempfile 9 | import subprocess 10 | import utils 11 | import sys 
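# Note: each *qapredictions* file produced by the ARC-Solvers evaluation scripts is
# read line by line below; every line is expected to be a JSON object with the
# fields this script accesses: "id", "answerKey", "selected_answers" (a
# comma-separated string of labels) and "question" (with "stem" and "choices",
# where each choice carries a "label", a "text" and a "score").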
12 | 13 | 14 | def disambiguate(qa,disambiguator, d): 15 | 16 | if disambiguator: 17 | 18 | if disambiguator.lower() == "length": 19 | return d["length"].predict([qa])[1]-1 20 | 21 | elif disambiguator.lower() == "blind": 22 | #We select as the right answer to choice located before the last one 23 | return len(qa[2])-1 24 | else: 25 | return 26 | 27 | 28 | def breakdown_output(l): 29 | """ 30 | Args 31 | l (string) A list of tuples (exam, questionid, answerid). Id must follow the format $BASENAME_$NAMEEXAM_$QID 32 | """ 33 | 34 | def parse_questionid(name): 35 | 36 | try: 37 | exam = name.rsplit("_",1)[0] 38 | qid = name.rsplit("_",1)[1] 39 | return exam, qid 40 | except IndexError: 41 | raise ValueError("The variable 'name' does not respect the format") 42 | 43 | d = OrderedDict() 44 | for questionid, answer in l: 45 | 46 | exam, qid = parse_questionid(questionid) 47 | 48 | if exam not in d: 49 | d[exam] = OrderedDict() 50 | if qid not in d[exam]: 51 | d[exam][qid] = answer 52 | else: 53 | raise KeyError("Key", qid,"already exists in dictionary") 54 | return d 55 | 56 | 57 | def _select_answer(line_json): 58 | 59 | selected_answers = line_json["selected_answers"].split(",") 60 | #An ARC solver, we need to disambiguate 61 | if len(selected_answers) > 1: 62 | # print ("MORE than one answer selected") 63 | # print ("line_json", line_json) 64 | qa = (1,"", [choice["text"] for choice in line_json["question"]["choices"] 65 | if choice["label"] in selected_answers]) 66 | # print ("options", qa) 67 | iselected = disambiguate(qa, args.disambiguator, disambiguators) 68 | # print ("iselected", iselected) 69 | # input("NEXT") 70 | return selected_answers[iselected] 71 | else: 72 | # print ("ONLY one answer selected") 73 | # print ("line_json", line_json) 74 | # print ("selected_answer", line_json["selected_answers"]) 75 | # input("NEXT") 76 | return line_json["selected_answers"] 77 | 78 | 79 | def _select_negative_answer(line_json): 80 | 81 | score = sys.maxsize 82 | answer = utils.ID_UNANSWERED 83 | 84 | 85 | for choice in line_json["question"]["choices"]: 86 | if score > choice["score"]: 87 | answer = choice["label"] 88 | score = min(score, choice["score"]) 89 | # print ("score", score) 90 | # print ("answer", answer) 91 | # input("NEXT") 92 | return answer 93 | 94 | 95 | # def all_scores_are_zero(line_json): 96 | # 97 | # return not any([1 for choice in line_json["question"]["choices"] 98 | # if choice["score"] != 0]) 99 | 100 | 101 | def select_answer(line_json, ignore, negative, 102 | qclassifier): 103 | 104 | if ignore: 105 | if (qclassifier is not None and qclassifier.is_unanswerable(line_json["question"]["stem"]) 106 | ): 107 | return utils.ID_UNANSWERED 108 | else: 109 | if not negative: 110 | return _select_answer(line_json) 111 | else: 112 | return _select_negative_answer(line_json) 113 | else: 114 | if not negative: 115 | return _select_answer(line_json) 116 | else: 117 | return _select_negative_answer(line_json) 118 | 119 | 120 | 121 | if __name__ == '__main__': 122 | 123 | arg_parser = ArgumentParser() 124 | arg_parser.add_argument("--arc_results", dest="arc_results", 125 | help="Path to the directory containing the ARC results") 126 | arg_parser.add_argument("--output", dest="output", 127 | help="Path to the output file where to store the results") 128 | arg_parser.add_argument("--disambiguator", dest="disambiguator", default=None, 129 | help="Backup answerer to use if an ARC solver returns multiple questions [LengthAnswerer, BlindAnswerer]") 130 | 
arg_parser.add_argument("--breakdown_results", dest="breakdown_results", action="store_true", 131 | help="To breakdown results for each exam (each JSON 'question' field must include both an 'exam' and a 'qid' fields") 132 | arg_parser.add_argument("--ignore_questions", dest="ignore_questions", 133 | help="To ignore questions according to the strategy described in the paper", action="store_true") 134 | arg_parser.add_argument("--negative_questions", dest="negative_questions", 135 | help="It deals with negative questions", action="store_true") 136 | arg_parser.add_argument("--path_eval", 137 | help="Path to head-qa/eval.py") 138 | args = arg_parser.parse_args() 139 | 140 | results_files = [args.arc_results+os.sep+file 141 | for file in os.listdir(args.arc_results) 142 | if "qapredictions" in file] 143 | 144 | 145 | neg_words = utils.NEGATION_WORDS_EN 146 | unanswerable_sentences = [utils.BOS_IMAGE_QUESTION_EN] 147 | 148 | if not args.negative_questions: 149 | neg_words = [] 150 | if not args.ignore_questions: 151 | unanswerable_sentences = [] 152 | 153 | qclassifier = QuestionClassifier(unanswerable= unanswerable_sentences, 154 | neg_words = neg_words) 155 | 156 | length_answerer = LengthAnswerer() 157 | disambiguators = {"length": length_answerer} 158 | scorer = Score() 159 | for file in results_files: 160 | 161 | name_dataset = file.split("qapredictions")[0].rsplit("/",1)[1] 162 | name_model = file.split("qapredictions")[1].split("_")[1] 163 | 164 | gold = [] 165 | pred = [] 166 | ids = [] 167 | 168 | with open(file) as f: 169 | 170 | line = f.readline() 171 | while line != "": 172 | 173 | line_json = json.loads(line) 174 | line = f.readline() 175 | id = line_json["id"] 176 | ids.append(id) 177 | gold.append(line_json["answerKey"]) 178 | pred.append(select_answer(line_json, args.ignore_questions, 179 | args.negative_questions, qclassifier)) 180 | 181 | 182 | 183 | if args.breakdown_results: 184 | 185 | assert(len(ids)==len(gold)) 186 | assert(len(ids)==len(pred)) 187 | d_gold = breakdown_output(zip(ids, gold)) 188 | d_pred = breakdown_output(zip(ids, pred)) 189 | 190 | for exam in d_gold: 191 | gold_file = args.arc_results+os.sep+exam+".arc_gold" 192 | with open(gold_file,"w") as f_gold: 193 | f_gold.write("\n".join( [qid+"\t"+d_gold[exam][qid] 194 | for qid in d_gold[exam] ])) 195 | 196 | pred_file = args.arc_results+os.sep+exam+".arc_pred" 197 | with open(pred_file,"w") as f_pred: 198 | f_pred.write("\n".join([qid+"\t"+d_pred[exam][qid] 199 | for qid in d_pred[exam] ])) 200 | 201 | 202 | command = ["python",args.path_eval,"--gold",gold_file,"--predicted",pred_file] 203 | # command = ["python","../eval.py","--gold",gold_file,"--predicted",pred_file] 204 | p = subprocess.Popen(" ".join(command), stdout=subprocess.PIPE, shell=True) 205 | out, err = p.communicate() 206 | exam_scores = scorer.parse_eval(out.decode("utf-8")) 207 | scorer.add_exam(exam, exam_scores) 208 | 209 | else: 210 | assert(len(id)==len(gold)) 211 | assert(len(id)==len(pred)) 212 | gold_file = args.arc_results+os.sep+name_dataset+".arc_gold" 213 | with open(gold_file,"w") as f_gold: 214 | f_gold.write("\n".join( [id+"\t"+p for id, p in zip(ids,gold) ])) 215 | 216 | pred_file = args.arc_results+os.sep+name_dataset+".arc_pred" 217 | with open(pred_file,"w") as f_pred: 218 | f_pred.write("\n".join( [id+"\t"+p for id, p in zip(ids,pred) ])) 219 | 220 | command = ["python",args.path_eval,"--gold",gold_file,"--predicted",pred_file] 221 | p = subprocess.Popen(" ".join(command), stdout=subprocess.PIPE, shell=True) 222 | out, err 
= p.communicate() 223 | 224 | exam_scores = scorer.parse_eval(out.decode("utf-8")) 225 | scorer.add_exam(name_dataset, exam_scores) 226 | 227 | print (args.output+os.sep+"EN-"+name_model+"ign="+str(args.ignore_questions)+".neg="+str(args.negative_questions)) 228 | with codecs.open(args.output+os.sep+"EN-"+name_model+"ign="+str(args.ignore_questions)+".neg="+str(args.negative_questions)+".ARC.results","w") as f_out_results: 229 | f_out_results.write(scorer.get_table().get_string()) 230 | 231 | 232 | -------------------------------------------------------------------------------- /scripts/head2ARCformat.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | import codecs 3 | import json 4 | import copy 5 | import os 6 | from utils import Dataset 7 | from collections import OrderedDict 8 | 9 | if __name__ == '__main__': 10 | 11 | arg_parser = ArgumentParser() 12 | arg_parser.add_argument("--input", dest="input", 13 | help="Path to the HEAD dataset", default=None) 14 | arg_parser.add_argument("--output", dest="output", 15 | help="Path to the output directory where to store the exams in a suitable format for the ARC solvers") 16 | 17 | args = arg_parser.parse_args() 18 | 19 | dataset = Dataset() 20 | dataset.load_json(args.input) 21 | exams = dataset.get_exams() 22 | name_head = args.input.rsplit("/",1)[1].replace(".json","") 23 | 24 | with codecs.open(args.output+os.sep+name_head+".arc.txt","w") as f: 25 | for exam in exams: 26 | data_exam = exams[exam]["data"] 27 | for ielement, element in enumerate(data_exam): 28 | 29 | arc_id = "_".join([exam, str(ielement)]) 30 | data = {"id":arc_id} 31 | stem = element["qtext"] 32 | question = {"stem": stem} 33 | question.update({"exam": exam}) 34 | question.update({"qid": ielement}) 35 | arc_answers = [] 36 | for answer in element["answers"]: 37 | arc_answers.append({"text":answer["atext"], "label":str(answer["aid"])}) 38 | 39 | question.update({"choices": arc_answers}) 40 | 41 | right_answer = str(element["answers"][int(element["ra"])-1]["aid"]) 42 | data.update({"question":question}) 43 | data.update({"answerKey": right_answer}) 44 | 45 | repr = json.dumps(data) 46 | f.write(repr+"\n") 47 | 48 | 49 | -------------------------------------------------------------------------------- /scripts/head2drqa.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | import codecs 3 | import json 4 | import copy 5 | import os 6 | from utils import Dataset 7 | 8 | if __name__ == '__main__': 9 | 10 | arg_parser = ArgumentParser() 11 | arg_parser.add_argument("--input", dest="input", 12 | help="Path to the HEAD dataset", default=None) 13 | arg_parser.add_argument("--output", dest="output", 14 | help="Path to the output directory where to store the exams in a suitable format for DrQa") 15 | 16 | args = arg_parser.parse_args() 17 | 18 | dataset = Dataset() 19 | dataset.load_json(args.input) 20 | exams = dataset.get_exams() 21 | 22 | for exam in exams: 23 | with codecs.open(args.output+os.sep+exam+".drqa.txt","w") as f: 24 | data_exam = exams[exam]["data"] 25 | for ielement, element in enumerate(data_exam): 26 | data = {} 27 | question = element["qtext"] 28 | question_id = element["qid"] 29 | right_answer = element["answers"][int(element["ra"])-1]["atext"] 30 | data["question"] = question 31 | data["qid"] = question_id 32 | data["answer"] = right_answer 33 | 34 | repr = json.dumps(data) 35 | f.write(repr+"\n") 36 | 
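# Example usage (hypothetical paths, analogous to the head2ARCformat.py command in the README):
#   PYTHONPATH=. python scripts/head2drqa.py --input HEAD_EN/HEAD_EN.json --output HEAD_DRQA/
# For each exam, this writes an <exam>.drqa.txt file with one JSON object per line,
# holding the "question", "qid" and "answer" fields built above.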
-------------------------------------------------------------------------------- /scripts/pdfexams2txt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | from argparse import ArgumentParser 6 | import codecs 7 | import os 8 | import json 9 | import tempfile 10 | 11 | 12 | VERSION = "1.0" 13 | LANGUAGE = "es" 14 | 15 | class ExamInfo(object): 16 | 17 | def __init__(self): 18 | 19 | self.pdf = None 20 | self.answers = None 21 | self.images = None 22 | 23 | 24 | def exam_2_txt(path_pdf, path_txt): 25 | """ 26 | Executes pdftotext to extract the raw text from a PDF file. 27 | The option -layout is used to maintain the two columns format of the exam, 28 | which simplifies post-processing 29 | 30 | Args 31 | 32 | path_pdf (string): Path to the exam in PDF format 33 | path_txt (string): Path to the output path to store the file in raw format 34 | 35 | """ 36 | os.system(" ".join(["pdftotext -layout",path_pdf,path_txt])) 37 | 38 | 39 | def preprocess_line(line, mid): 40 | """ 41 | Preprocesses a line and infers the text that belongs to the first and second 42 | columns. 43 | 44 | Args 45 | 46 | line (string): A line from the exam 47 | mid (int): Index where the second column is considered to start 48 | 49 | """ 50 | 51 | if len(line) <= mid+4: 52 | col1 = line[0:mid+4].strip() 53 | col2 = "" 54 | else: 55 | col1 = line[0:mid+4].strip() if line[0:mid+4] != " "*(mid+4) else "" 56 | col2 = line[mid+4:].strip() 57 | return col1, col2 58 | 59 | 60 | def is_new_element(phrase,cur): 61 | 62 | """ 63 | Here, an exam represents: (1) a question or (2) one of the answers. 64 | These can be easily determined because of the format of the ID. 65 | """ 66 | 67 | phrase_aux = phrase.strip().split(". ") 68 | if len(phrase_aux) > 1: 69 | #TODO: This is a bit of a mess, but the processing from the pdf is not easy. 70 | #To "ensure it is an ID at the beginning of the sentence. 71 | return phrase_aux[0].isdigit() and ( phrase_aux[0] in ["1","2","3","4","5"] or len(cur.strip()) == 0 72 | or (len(cur.strip()) != 0 and (cur.strip()[-1] in [".",":","?"]) ) ) 73 | 74 | return False 75 | 76 | def concat_element(cur,to_append): 77 | 78 | """ 79 | 80 | Args: 81 | 82 | cur (string): Part of the current element (a question or an answer) 83 | that we have processed 84 | to_append (string): Line that we are processing now 85 | 86 | """ 87 | 88 | #is_new_element estimates if we have finised processing the current 89 | #question or element 90 | if is_new_element(to_append, cur): 91 | 92 | return cur+"\n"+postprocess_phrase(to_append) 93 | else: 94 | return cur+postprocess_phrase(to_append) 95 | 96 | 97 | 98 | def postprocess_phrase(phrase): 99 | """ 100 | Some words might be 'cut' for an exam, as the text might take 101 | several lines. If we find that the last element of a sentence is 102 | '-', we remove it, and the word will continue in the next line. 103 | Otherwise, we are sure it is the end of a word. 104 | 105 | Args: 106 | 107 | phrase (string) 108 | 109 | Warnings: 110 | 111 | This implementation is specific for Spanish 112 | """ 113 | 114 | if len(phrase) == 0: return phrase 115 | 116 | if phrase[-1] == "-": 117 | return phrase[:-1] 118 | else: 119 | return phrase+" " 120 | 121 | 122 | def contains_all_answers(qas, n_answers): 123 | """ 124 | Checks if we have identified all the answers for our tuple (Q,A1,A2,A3,A4) 125 | 126 | Args: 127 | 128 | qas: A tuple of the form (Q,A1,A2,A3,A4). 
Each exam is a string 129 | n_answers: Number of expected answers 130 | """ 131 | 132 | all = True 133 | 134 | for i in range(1,n_answers+1): 135 | all = all and qas[i].startswith(str(i)+".") 136 | 137 | return all 138 | 139 | 140 | def _remove_ids(elements): 141 | """ 142 | We remove the question and answer ids from the text (they are noise for the input) 143 | prior to convert the corpus to JSON 144 | """ 145 | new_elements = [] 146 | for e in elements: 147 | new_elements.append(e.split(".",1)[1].strip()) 148 | 149 | return new_elements 150 | 151 | 152 | def format_txt_exam(path_text): 153 | 154 | """ 155 | Transforms an exam in text format (as processed by exam_2_text) into a list (Q,A1,...,AN). 156 | 157 | Args: 158 | 159 | path_text (string): Path to an exam in text format 160 | 161 | """ 162 | 163 | first_page = True 164 | col1 = "" 165 | col2 = "" 166 | 167 | qas = [] 168 | pages = "" 169 | 170 | #Get the indexes of the footnote pages to know where columns should 171 | #be split 172 | d_pages_indexes = {} 173 | with codecs.open(path_text) as f: 174 | n_page = 1 175 | line = f.readline() 176 | while line != "": 177 | if line.strip().strip("-").isdigit() or line.strip().replace("-","").replace(" ","").isdigit(): 178 | d_pages_indexes[n_page] = line.index(line.strip()) 179 | n_page+=1 180 | line = f.readline() 181 | 182 | with codecs.open(path_text) as f: 183 | 184 | line = f.readline() 185 | n_page=1 186 | while line != "": 187 | 188 | #Beginning of a new page 189 | if line.strip() == "-1-" or line.strip() == "- 1 -": 190 | n_page+=1 191 | first_page = False 192 | c1, c2 = "","" 193 | col1 = "" 194 | col2 = "" 195 | 196 | elif line.strip().strip("-").isdigit() or line.strip().replace("-","").replace(" ","").isdigit(): 197 | 198 | pages += col1+col2 199 | 200 | col1 = "" 201 | col2 = "" 202 | n_page+=1 203 | 204 | elif not first_page: 205 | if n_page not in d_pages_indexes: break 206 | c1,c2 = preprocess_line(line, d_pages_indexes[n_page]) 207 | col1 = concat_element(col1,c1) 208 | col2 = concat_element(col2,c2) 209 | line = f.readline() 210 | 211 | 212 | 213 | pages_split = pages.strip().split("\n") 214 | n_answers = int(pages_split[-1].split(".")[0]) 215 | j = 1+n_answers 216 | for i in range(0,len(pages_split),j): 217 | 218 | if not contains_all_answers(pages_split[i:i+j], n_answers=n_answers): 219 | raise ValueError("The sample does not contain all the expected answers") 220 | 221 | qas.append(_remove_ids(pages_split[i:i+j])) 222 | 223 | return qas 224 | 225 | 226 | def format_txt_answers(path): 227 | """ 228 | Converts the text file with the gold answers into a list of tuples 229 | (question_id, right_answer_id) 230 | 231 | Args 232 | 233 | path (string): Path the text file containing the gold answers 234 | 235 | """ 236 | 237 | with codecs.open(path) as f: 238 | lines = f.readlines() 239 | 240 | head_line = lines[0] 241 | content_lines = lines[1:] 242 | 243 | d_columns = {} 244 | d_columns_content = {} 245 | 246 | for j,key in enumerate(head_line.split()): 247 | if key not in d_columns: 248 | d_columns[key] = [] 249 | d_columns_content[key] = [] 250 | d_columns[key].append(j) 251 | 252 | RC_key = "RC" 253 | v_keys = [key for key in d_columns if key.startswith("V")] 254 | v_keys.sort(reverse=True) 255 | V_key = v_keys[0] 256 | 257 | v_rc_indexes = zip([index for index in d_columns[V_key]], [index for index in d_columns[RC_key]]) 258 | 259 | template =[] 260 | for iv,irc in v_rc_indexes: 261 | #print iv,irc, content_lines[0].split(), len(content_lines[0].split()) 262 | question_ids 
= [line.split()[iv] for line in content_lines] 263 | answer_ids = [line.split()[irc] for line in content_lines] 264 | 265 | if len(question_ids) != len(answer_ids): 266 | raise ValueError("question_ids and answer_ids vectors should have the same length, but they do not") 267 | else: 268 | template.extend(zip(question_ids, answer_ids)) 269 | 270 | return template 271 | 272 | 273 | def get_image_path(path): 274 | """ 275 | Args 276 | 277 | path(string): Gets a dictionary that maps an image name to its 278 | relative path 279 | """ 280 | 281 | images = [(path+os.sep+img,img) 282 | for img in os.listdir(path)] 283 | d = {} 284 | for image_path,name in images: 285 | abbr = name.split("-")[-1].split(".")[0].replace("img","") 286 | d[abbr] = "./data"+image_path.split("/data")[1] 287 | 288 | return d 289 | 290 | def corpus_to_json(qas, template, images, path): 291 | """ 292 | 293 | Args: 294 | 295 | qas: 296 | template: 297 | images: 298 | path: 299 | 300 | """ 301 | 302 | data = {} 303 | data["name"] = path.rsplit("/",1)[1] 304 | data["data"] = [] 305 | 306 | map_category = {"B": "biology", 307 | "M": "medicine", 308 | "E": "nursery", 309 | "F": "pharmacology", 310 | "P": "psychology", 311 | "Q": "chemistry", 312 | } 313 | 314 | 315 | if len(qas) != len(template): 316 | raise ValueError("qas and template vectors should have the same length, but they do not", len(qas), len(template)) 317 | else: 318 | for question_answer, rc in zip(qas, template): 319 | qid = rc[0] 320 | qtext = question_answer[0] 321 | right_answer = rc[1] 322 | 323 | if not right_answer.isdigit(): 324 | continue 325 | 326 | image= '' 327 | if qtext.startswith("Pregunta vinculada a la imagen nº"): 328 | n_image = qtext.replace("Pregunta vinculada a la imagen nº","").split()[0] 329 | image = images[n_image] 330 | 331 | #Obtaining the information from the answers 332 | answers = [] 333 | for ianswer, answer in enumerate(question_answer[1:],1): 334 | answers.append({"aid":ianswer, 335 | "atext": answer}) 336 | 337 | data["data"].append({"qid": qid, 338 | "qtext": qtext, 339 | "ra": right_answer, 340 | "answers": answers, 341 | "image": image, 342 | }) 343 | data["category"] = map_category[path[-1]] 344 | data["year"] = path.rsplit("/",1)[1].split("_")[1] 345 | 346 | return {path.rsplit("/",1)[1]:data} 347 | 348 | 349 | ############################################################################### 350 | # PDFEXAMS2TXT.PY # 351 | ############################################################################### 352 | if __name__ == '__main__': 353 | 354 | arg_parser = ArgumentParser() 355 | 356 | 357 | arg_parser.add_argument("--data", dest="data", 358 | help="Path to the directory containing the different files", default=None) 359 | 360 | arg_parser.add_argument("--output", dest="output", 361 | help="Path to the output directory containing the files", default=None) 362 | 363 | 364 | args = arg_parser.parse_args() 365 | 366 | healthcare_categories = [args.data+os.sep+subdir 367 | for subdir in os.listdir(args.data)] 368 | 369 | dict_exams = {} 370 | data_exams = {} 371 | dict_solutions = {} 372 | for category in healthcare_categories: 373 | files = [category+os.sep+f for f in os.listdir(category)] 374 | 375 | for f in sorted(files): 376 | 377 | if "1_R" in f: 378 | print (f) 379 | continue 380 | 381 | name = f.rsplit("/",1)[1] 382 | info = name.rsplit(".",1)[1] 383 | name = name.rsplit(".",1)[0] 384 | 385 | if name not in dict_exams: 386 | dict_exams[name] = ExamInfo() 387 | 388 | if info == "pdf": 389 | dict_exams[name].pdf = f 390 | 
elif info == "answers": 391 | dict_exams[name].answers = f 392 | elif info == "images": 393 | dict_exams[name].images = f 394 | else: 395 | raise ValueError("Extension of the file is not recognized") 396 | 397 | for name_exam in sorted(dict_exams.keys()): 398 | 399 | if dict_exams[name_exam].pdf is None or dict_exams[name_exam].answers is None: 400 | raise ValueError("pdf or answers attributes from ExamInfo() object ",name_exam," are None") 401 | 402 | aux_file = tempfile.NamedTemporaryFile() 403 | 404 | exam_2_txt(dict_exams[name_exam].pdf, aux_file.name) 405 | exam = format_txt_exam(aux_file.name) 406 | 407 | template = format_txt_answers(dict_exams[name_exam].answers) 408 | dict_solutions[name_exam] = template 409 | images = None 410 | if dict_exams[name_exam].images is not None: 411 | images = get_image_path(dict_exams[name_exam].images) 412 | 413 | data_exam = corpus_to_json(exam, template, images, args.output+os.sep+name_exam) 414 | data_exams.update(data_exam) 415 | print ("The exam has been temporarily dumped into", args.output+os.sep+name_exam+".json") 416 | 417 | data = {} 418 | data["version"] = VERSION 419 | data["language"] = LANGUAGE 420 | data["exams"] = data_exams 421 | 422 | #The corpus formatted as a JSON 423 | with codecs.open(args.output+"HEAD.json", 'w') as outfile: 424 | json.dump(data, outfile) 425 | 426 | #A file containing pairs (question_id, right_answer_id). For evaluation purposes 427 | #If ra is X, then the question was deleted by the committee. We set it to 0 428 | for exam in dict_solutions: 429 | with codecs.open(args.output+exam+".gold","w") as f_gold: 430 | for qid, ra in dict_solutions[exam]: 431 | if not ra.isdigit(): 432 | continue 433 | f_gold.write( "\t".join([qid, ra])+"\n" ) 434 | 435 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from numpy import intersect1d 2 | from nltk.stem import WordNetLemmatizer 3 | from nltk.corpus import stopwords 4 | from collections import defaultdict, Counter 5 | from prettytable import PrettyTable 6 | import codecs 7 | import json 8 | import string 9 | 10 | 11 | ACCURACY="accuracy" 12 | F1_SCORE = "F1-score" 13 | RECALL = "recall" 14 | PRECISION = "precision" 15 | RIGHT="right" 16 | WRONG="wrong" 17 | UNANSWERED="unanswered" 18 | NETAS="netas" 19 | NEGATION_WORDS_ES = ["NO","FALSA","INCORRECTA","FALSO","INCORRECTO","MENOR","MENOS"] 20 | NEGATION_WORDS_EN = ["NO", "FALSE", "INCORRECT", "LESS"] 21 | BOS_IMAGE_QUESTION_ES = "Pregunta vinculada a la imagen" 22 | BOS_IMAGE_QUESTION_EN = "Question linked to image" 23 | ID_UNANSWERED = "-" 24 | 25 | def config_file_to_dict(input_file): 26 | config = {} 27 | fins = open(input_file,'r').readlines() 28 | for line in fins: 29 | if len(line) > 0 and line[0] == "#": 30 | continue 31 | if "=" in line: 32 | pair = line.strip().split('#',1)[0].split('=',1) 33 | item = pair[0] 34 | if item in config: 35 | print("Warning: duplicated config item found: %s, updated."%(pair[0])) 36 | config[item] = pair[-1] 37 | return config 38 | 39 | 40 | def is_int(token): 41 | try: 42 | int(token) 43 | return True 44 | except ValueError: 45 | return False 46 | 47 | class QuestionClassifier(): 48 | 49 | QUESTION_WITH_NEGATION = "NEGATION" 50 | QUESTION_WITH_STATISTICS = "STATISTICS" 51 | QUESTION_WITH_IMAGE = "WITH_IMAGE" 52 | QUESTION_OTHER = "OTHER" 53 | 54 | CLASSES = [QUESTION_WITH_NEGATION, 55 | QUESTION_WITH_STATISTICS, 56 | QUESTION_WITH_IMAGE, 57 | QUESTION_OTHER] 
58 | 59 | def __init__(self, unanswerable=[], neg_words=[]): 60 | 61 | self.unanswerable = unanswerable 62 | self.neg_words = neg_words 63 | 64 | """ 65 | Preditcs the type of question 66 | """ 67 | def _predict_type(self, question): 68 | 69 | for unans in self.unanswerable: 70 | if question.startswith(unans): 71 | return self.QUESTION_WITH_IMAGE 72 | 73 | for negation in self.neg_words: 74 | if negation in question: 75 | return self.QUESTION_WITH_NEGATION 76 | 77 | return self.QUESTION_OTHER 78 | 79 | def is_unanswerable(self, question): 80 | return self._predict_type(question) in [self.QUESTION_WITH_IMAGE] 81 | 82 | def is_negation_question(self, question): 83 | return self._predict_type(question) == self.QUESTION_WITH_NEGATION 84 | 85 | 86 | 87 | class TextSimilarity(object): 88 | """ 89 | It measures the similarity between two texts (percentage of words shared 90 | between the answer and the span 91 | """ 92 | 93 | def __init__(self, stopwords=stopwords.words('english'), 94 | lemmatizer=WordNetLemmatizer()): 95 | 96 | self.stopwords = stopwords 97 | self.lemmatizer = lemmatizer 98 | 99 | 100 | def _preprocess(self, tokens): 101 | return [self.lemmatizer.lemmatize(t).lower() for t in tokens 102 | if t.lower() not in self.stopwords and t.lower() not in string.punctuation] 103 | 104 | def _compute_overlap(self,l1, l2): 105 | """ 106 | Computes the percentage of elements of l1 that is in l2 107 | 108 | Args 109 | 110 | l1 (list): A list of strings 111 | l2 (list): A list of strings 112 | """ 113 | 114 | d1 = Counter(l1) 115 | d2 = Counter(l2) 116 | 117 | o1 = 0. 118 | for k in d1: 119 | o1 += min(d1[k], d2[k]) 120 | 121 | if len(l2) == 0: 122 | return 0 123 | return o1 / len(l2) 124 | 125 | 126 | def similarity(self,tokens1,tokens2): 127 | 128 | ptokens1 = self._preprocess(tokens1) 129 | ptokens2 = self._preprocess(tokens2) 130 | # print ("ptokens1", ptokens1) 131 | # print ("ptokens2", ptokens2) 132 | return self._compute_overlap(ptokens1, ptokens2) 133 | 134 | 135 | 136 | class Score(object): 137 | 138 | iRIGHT = 0 139 | iWRONG = 1 140 | iUNANSWERED = 2 141 | iPRECISION = 3 142 | iRECALL = 4 143 | iF1 = 5 144 | iNETAS = 6 145 | 146 | 147 | def __init__(self): 148 | self.results = {} 149 | 150 | def parse_eval(self, output_eval): 151 | 152 | prec = 0.0 153 | recall = 0.0 154 | f1 = 0.0 155 | netas = 0.0 156 | 157 | d = {} 158 | for line in output_eval.split("\n"): 159 | 160 | if line.startswith("Number of valid predictions"): 161 | pass 162 | elif line.startswith(RIGHT): 163 | right = line.replace(RIGHT,"") 164 | d[RIGHT] = right 165 | elif line.startswith(WRONG): 166 | wrong = line.replace(WRONG,"") 167 | d[WRONG] = wrong 168 | elif line.startswith(UNANSWERED): 169 | unanswered = line.replace(UNANSWERED,"") 170 | d[UNANSWERED] = unanswered 171 | elif line.startswith(PRECISION): 172 | prec = line.replace(PRECISION,"") 173 | d[PRECISION] = prec 174 | elif line.startswith(RECALL): 175 | recall = line.replace(RECALL,"") 176 | d[RECALL] = recall 177 | elif line.startswith(F1_SCORE): 178 | f1_score = line.replace(F1_SCORE,"") 179 | d[F1_SCORE] = f1_score 180 | pass 181 | elif line.startswith(ACCURACY): 182 | acc = line.replace(ACCURACY, "") 183 | d[ACCURACY] = acc 184 | elif line.startswith(NETAS): 185 | netas = line.replace(NETAS,"") 186 | d[NETAS] = netas 187 | 188 | return self.scores_to_list(d) 189 | 190 | def scores_to_list(self, dscores): 191 | return list(map(float,[dscores[RIGHT],dscores[WRONG],dscores[UNANSWERED],dscores[PRECISION], 192 | dscores[RECALL],dscores[F1_SCORE], 
dscores[ACCURACY], dscores[NETAS]])) 193 | 194 | def add_exam(self, exam, scores): 195 | self.results[exam] = scores 196 | 197 | def get_exam_scores(self, exam): 198 | return self.results[exam] 199 | 200 | def get_category_scores(self, category): 201 | 202 | category_scores = [] 203 | for exam in self.results: 204 | if category in exam: 205 | category_scores.append(self.results[exam]) 206 | return category_scores 207 | 208 | def get_average_results(self, exams_scores): 209 | average = [0]*len(exams_scores[0]) 210 | for exam in exams_scores: 211 | for index,(s1, s2) in enumerate(zip(average, exam)): 212 | average[index] = s1+s2 213 | return [round(e/len(exams_scores),3) for e in average] 214 | 215 | def get_table(self): 216 | table = PrettyTable() 217 | table.field_names = ["Exam","Year","Right","Wrong","Unanswered","Precision","Recall","F1-score","Accuracy", "NETAS"] 218 | 219 | #Computing individual results 220 | for exam in self.results: 221 | e = [exam,""] 222 | e.extend(self.results[exam]) 223 | table.add_row(e) 224 | 225 | #Computing average results per category 226 | biology_exams = self.get_category_scores("_B") 227 | if len(biology_exams) != 0: 228 | biology_scores = self.get_average_results(biology_exams) 229 | biology_row = ["Biology (avg)", ""] 230 | biology_row.extend(biology_scores) 231 | table.add_row(biology_row) 232 | 233 | medicine_exams = self.get_category_scores("_M") 234 | if len(medicine_exams) != 0: 235 | medicine_scores = self.get_average_results(medicine_exams) 236 | medicine_row = ["Medicine (avg)", ""] 237 | medicine_row.extend(medicine_scores) 238 | table.add_row(medicine_row) 239 | 240 | nursery_exams = self.get_category_scores("_E") 241 | if len(nursery_exams) != 0: 242 | nursery_scores = self.get_average_results(nursery_exams) 243 | nursery_row = ["Nursery (avg)",""] 244 | nursery_row.extend(nursery_scores) 245 | table.add_row(nursery_row) 246 | 247 | pharma_exams = self.get_category_scores("_F") 248 | if len(pharma_exams) != 0: 249 | pharma_scores = self.get_average_results(pharma_exams) 250 | pharma_row = ["Pharmacology (avg)", ""] 251 | pharma_row.extend(pharma_scores) 252 | table.add_row(pharma_row) 253 | 254 | psycho_exams = self.get_category_scores("_P") 255 | if len(psycho_exams) != 0: 256 | psycho_scores = self.get_average_results(psycho_exams) 257 | psycho_row = ["Psychology (avg)", ""] 258 | psycho_row.extend(psycho_scores) 259 | table.add_row(psycho_row) 260 | 261 | chemistry_exams = self.get_category_scores("_Q") 262 | if len(chemistry_exams) != 0: 263 | chemistry_scores = self.get_average_results(chemistry_exams) 264 | chemistry_row = ["Chemistry (avg)", ""] 265 | chemistry_row.extend(chemistry_scores) 266 | table.add_row(chemistry_row) 267 | 268 | all_scores = self.get_average_results([self.results[exam] for exam in self.results]) 269 | all_row = ["All (avg)", ""] 270 | all_row.extend(all_scores) 271 | table.add_row(all_row) 272 | 273 | return table 274 | 275 | class Dataset(object): 276 | 277 | DATA = "data" 278 | VERSION = "version" 279 | EXAMS = "exams" 280 | 281 | def __init__(self): 282 | self.json = None 283 | 284 | def load_json(self,path): 285 | with codecs.open(path) as f: 286 | self.json = json.load(f) 287 | 288 | def get_version(self): 289 | return self.json[self.VERSION] 290 | 291 | def get_exam(self, name_exam): 292 | return self.json[self.EXAMS][name_exam] 293 | 294 | def get_exams(self): 295 | return self.json[self.EXAMS] 296 | 297 | def get_json(self): 298 | return self.json 299 | 300 | def get_qas(self, exam): 301 | qas = [] 302 
| if self.json is None: 303 | raise ValueError("Dataset not provided") 304 | 305 | for sample in self.get_exam(exam)[self.DATA]: 306 | qas.append((sample["qid"], sample["qtext"], [a["atext"] for a in sample["answers"]])) 307 | return qas 308 | 309 | --------------------------------------------------------------------------------
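The JSON produced by `pdfexams2txt.py` (one `HEAD.json` plus one `*.gold` TSV file per exam) is exactly what the `Dataset` helper in `utils.py` consumes. The sketch below is not part of the repository: the file path and the exam name are hypothetical placeholders, and it only uses the accessors shown in the listings above.

```python
# Minimal sketch (not part of the repository): reading the HEAD.json produced
# by pdfexams2txt.py through utils.Dataset, plus one of the *.gold TSV files.
# "output/HEAD.json" and "Cuaderno_2016_1_B" are hypothetical placeholders.
import codecs

from utils import Dataset

dataset = Dataset()
dataset.load_json("output/HEAD.json")

for name_exam in dataset.get_exams():
    # get_qas() yields tuples (question id, question text, [answer texts])
    for qid, qtext, answers in dataset.get_qas(name_exam):
        print(name_exam, qid, qtext[:40], len(answers))

# Each *.gold file maps a question id to the id of the right answer, one
# tab-separated pair per line.
with codecs.open("output/Cuaderno_2016_1_B.gold") as f_gold:
    gold = dict(line.strip().split("\t") for line in f_gold if line.strip())
```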
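The configuration files under `configs/` are read with `config_file_to_dict`, which ignores `#` comments and splits each remaining line on the first `=`. A quick sketch of its behaviour, using one of the shipped config files:

```python
# Sketch: loading a shipped config file with utils.config_file_to_dict.
# All values come back as plain strings, so numeric options need an explicit cast.
from utils import config_file_to_dict

config = config_file_to_dict("configs/configES.config")
for key, value in config.items():
    print(key, "->", value)
```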
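`QuestionClassifier` is driven entirely by the constants at the top of `utils.py`: a question is flagged as image-linked (and therefore unanswerable for a text-only system) when it starts with one of the image prefixes, and as a negation question when it contains one of the upper-cased negation words. A hedged usage sketch with an invented Spanish question:

```python
# Sketch: classifying questions with the Spanish constants from utils.py.
# The example question is invented for illustration.
from utils import QuestionClassifier, NEGATION_WORDS_ES, BOS_IMAGE_QUESTION_ES

classifier = QuestionClassifier(unanswerable=[BOS_IMAGE_QUESTION_ES],
                                neg_words=NEGATION_WORDS_ES)

# "Point out the INCORRECT statement about insulin" (invented example)
question = "Señale la respuesta INCORRECTA sobre la insulina:"

if classifier.is_unanswerable(question):
    print("Image-linked question: skipped")
elif classifier.is_negation_question(question):
    print("Negation question: it contains one of NEGATION_WORDS_ES")
else:
    print("Regular question")
```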
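`TextSimilarity` computes an asymmetric overlap: after lemmatisation, lowercasing and stopword/punctuation removal, it returns the fraction of tokens of the second argument that also appear in the first one. The short sketch below assumes the NLTK `stopwords` and `wordnet` corpora have already been downloaded; the sentences are invented.

```python
# Sketch: using utils.TextSimilarity to score how well a candidate answer is
# covered by a retrieved span. Requires the NLTK stopwords/wordnet corpora.
from utils import TextSimilarity

ts = TextSimilarity()

span = "The mitochondria produce most of the chemical energy of the cell".split()
answer = "Mitochondria are the main source of energy of the cell".split()

# Note the asymmetry: the score is normalised by the length of the second
# (preprocessed) token list, not the first one.
print(ts.similarity(span, answer))
```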
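Finally, `Score.parse_eval` expects plain-text metric lines prefixed by the constant names defined above (`right`, `wrong`, `unanswered`, `precision`, `recall`, `F1-score`, `accuracy`, `netas`). The sketch below feeds it an invented block of such lines and prints the aggregated table; the exam name is hypothetical but ends in `_B`, so the Biology average row is also produced.

```python
# Sketch: aggregating results with utils.Score. The metric lines and numbers
# below are invented; they only follow the per-line format parse_eval expects.
from utils import Score

fake_eval_output = "\n".join([
    "right 120",
    "wrong 90",
    "unanswered 10",
    "precision 0.571",
    "recall 0.545",
    "F1-score 0.558",
    "accuracy 0.545",
    "netas 90.0",
])

score = Score()
score.add_exam("Cuaderno_2016_1_B", score.parse_eval(fake_eval_output))
print(score.get_table())
```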