├── .gitignore ├── LICENSE ├── README.md ├── configs ├── configEN.config ├── configEN_test.config ├── configES.config └── configES_test.config ├── eval.py ├── install.sh ├── install_arc_solvers.sh ├── models.py ├── run.py ├── scripts ├── evaluate_arc_solvers.py ├── head2ARCformat.py ├── head2drqa.py └── pdfexams2txt.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 DVC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # HEAD-QA
2 | 
3 | **NEWS!** HEAD-QA can now be imported from [huggingface datasets](https://huggingface.co/datasets/head_qa). Thank you very much to [Maria Grandury](https://github.com/mariagrandury) for adding it.
4 | 
5 | 
6 | This repository contains the sources used in "HEAD-QA: A Healthcare Dataset for Complex Reasoning" (ACL, 2019).
7 | 
8 | HEAD-QA is a multiple-choice **HEA**lthcare **D**ataset. The questions come from exams to access specialized positions in the Spanish healthcare system, and are challenging even for highly specialized humans. They are designed by the [Ministerio de Sanidad, Consumo y Bienestar Social](https://www.mscbs.gob.es/), which also provides direct [access](https://fse.mscbs.gob.es/fseweb/view/public/datosanteriores/cuadernosExamen/busquedaConvocatoria.xhtml) to the exams of the last 5 years (in Spanish).
9 | 
10 | > Date of the last update of the reused documents: January 14th, 2019.
11 | 
12 | HEAD-QA tries to make these questions accessible to the Natural Language Processing community. We hope it is a useful resource towards building better QA systems. The dataset contains questions about the following topics:
13 | 
14 | - Medicine.
15 | - Nursing.
16 | - Psychology.
17 | - Chemistry.
18 | - Pharmacology.
19 | - Biology.
20 | 
21 | # Requirements
22 | 
23 | - Python 3.6.7
24 | - DrQA
25 | - scikit-learn==0.20.2
26 | - numpy==1.16.0
27 | - torch==1.0.0
28 | - torchvision
29 | - spacy==2.0.0
30 | - prettytable==0.7.2
31 | 
32 | ## Requirements for the ARC-Solvers
33 | 
34 | - Python 3.6.7
35 | - torch==0.3.1
36 | - torchvision
37 | - allennlp==0.2.1
38 | 
39 | ## Installation
40 | 
41 | We recommend creating a virtualenv first (e.g. `virtualenv -p python3.6 head-qa`).
42 | The script `install.sh` automatically installs the packages listed above, assuming that you have previously created and activated your virtualenv (tested on Ubuntu 18.04, 64 bits).
43 | The script `install_arc_solvers.sh` installs what is needed to run the ARC-Solvers (Clark et al., 2018).
44 | > We recommend using a separate virtualenv for the ARC-Solvers, as dependencies such as the PyTorch version might otherwise conflict.
45 | 
46 | # Datasets
47 | 
48 | [ES_HEAD dataset](https://drive.google.com/open?id=1dUIqVwvoZAtbX_-z5axCoe97XNcFo1No)
49 | [EN_HEAD dataset](https://drive.google.com/open?id=1phryJg4FjCFkn0mSCqIOP2-FscAeKGV0)
50 | Each dataset contains:
51 | - *.gold -> A TSV gold file that maps each question ID to its ground-truth answer ID. One file per exam.
52 | - HEAD[_EN].json -> The whole HEAD-QA data (used in the so-called 'unsupervised' setting).
53 | - train_HEAD[\_EN].json -> The training set of HEAD-QA (used as the training set in the so-called 'supervised' setting).
54 | - dev_HEAD[\_EN].json -> A JSON file containing the development set of HEAD-QA (used in the 'supervised' setting).
55 | - test_HEAD[\_EN].json -> A JSON file containing the test set of HEAD-QA (used in the 'supervised' setting).
56 | 
57 | [Data (images, pdfs, etc)](https://drive.google.com/open?id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t). Note that these are medical images and some of them might have sensitive content.
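If it helps, here is a minimal sketch of how to inspect these JSON files directly. It assumes the structure produced by `scripts/pdfexams2txt.py` (a top-level `exams` dictionary whose entries hold a `data` list of questions with `qtext`, `ra` and `answers` fields); the path `HEAD/HEAD.json` is just an example:

```
import json

with open("HEAD/HEAD.json") as f:
    head = json.load(f)

for exam_name, exam in head["exams"].items():
    print(exam_name, exam["category"], exam["year"])
    for question in exam["data"]:
        qtext = question["qtext"]        # question text
        right_answer = question["ra"]    # id of the gold answer
        # map answer id -> answer text
        options = {a["aid"]: a["atext"] for a in question["answers"]}
```

The `utils.Dataset` class used by `run.py` (`load_json`, `get_exams`, `get_qas`) wraps this same loading.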
58 | 
59 | 
60 | 
61 | # Run the baselines: Length, Random, Blind_n, IR and DrQA
62 | 
63 | Available baselines for Spanish HEAD-QA: Length, Random, Blind_n, IR.
64 | Available baselines for English HEAD-QA (HEAD-QA\_EN): Length, Random, Blind_n, IR, DrQA.
65 | 
66 | **Description of the baselines:**
67 | - Length: Chooses the longest answer.
68 | - Random: Chooses a random answer.
69 | - Blind_n: Chooses the *n*th answer.
70 | - IR: Chooses the answer whose query (question + *n*th answer) retrieves the most relevant document.
71 | - DrQA: A model based on DrQA (Chen, D., Fisch, A., Weston, J., & Bordes, A. Reading Wikipedia to Answer Open-Domain Questions).
72 | 
73 | 
74 | ## Creating an inverted index
75 | 
76 | IR and DrQA require an inverted index to be created in advance. This is done using [wikiextractor](https://github.com/attardi/wikiextractor) and following [DrQA's Document Retriever](https://github.com/facebookresearch/DrQA/blob/master/scripts/retriever/README.md) guidelines (visit their README.md for a detailed explanation of how to create the index; we summarize the main steps here):
77 | 
78 | In this work we used the following Wikipedia dumps:
79 | 
80 | - Spanish: [eswiki-20180620-pages-articles.xml.bz2](http://www.grupolys.org/software/head-qa-acl2019/eswiki-20180620-pages-articles.xml.bz2)
81 | - English: [enwiki-20180701-pages-articles.xml.bz2](http://www.grupolys.org/software/head-qa-acl2019/enwiki-20180701-pages-articles.xml.bz2)
82 | 
83 | Alternatively, you can try using a current Wikipedia dump from https://dumps.wikimedia.org/
84 | 
85 | ```
86 | PYTHONPATH="$HOME/git/wikiextractor" python $HOME/git/wikiextractor/WikiExtractor.py $PATH_WIKIPEDIA_DUMP -o $PATH_WIKI_JSON --json
87 | PYTHONPATH="$HOME/git/DrQA/" python $HOME/git/DrQA/scripts/retriever/build_db.py $PATH_WIKI_JSON $PATH_DB
88 | PYTHONPATH="$HOME/git/DrQA/" python $HOME/git/DrQA/scripts/retriever/build_tfidf.py --num-workers 2 $PATH_DB $PATH_TFIDF
89 | ```
90 | 
91 | The model created in $PATH_TFIDF is what will be used as our inverted index.
92 | If they are of any help, the indexes we used in our work can be found [here](http://www.grupolys.org/software/head-qa-acl2019/wiki-articles.tfidf.zip).
93 | 
94 | ## Updating DrQA's tokenizer
95 | 
96 | By default, DrQA uses the CoreNLP tokenizer. In this work we used the SpacyTokenizer instead. To use it, go to `DrQA/drqa/pipeline/__init__.py` and make sure the `DEFAULTS` dictionary looks like the one below. Also, we used `multitask.mdl` as the `reader_model`. Make sure you downloaded it when you installed DrQA.
97 | 
98 | ```
99 | from ..tokenizers import CoreNLPTokenizer, SpacyTokenizer
100 | 
101 | DEFAULTS = {
102 |     'tokenizer': SpacyTokenizer,  #CoreNLPTokenizer,
103 |     'ranker': TfidfDocRanker,
104 |     'db': DocDB,
105 |     'reader_model': os.path.join(DATA_DIR, 'reader/multitask.mdl'),
106 | }
107 | ```
108 | 
109 | 
110 | ## Create a configuration file
111 | 
112 | ```
113 | #A configuration file for Spanish
114 | 
115 | lang=es
116 | eval=eval.py
117 | #Path to your DrQA installation
118 | drqa=DrQA/
119 | use_stopwords=False
120 | ignore_questions=False
121 | negative_questions=False
122 | #The folder containing the .gold files
123 | path_solutions=HEAD/
124 | 
125 | es_head=HEAD/HEAD.json #HEAD-QA in json format
126 | #The inverted index that we have previously created.
127 | es_retriever=wikipedia/eswiki-20180620-articles.tfidf/eswiki-20180620-articles-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz
128 | 
129 | ```
130 | 
131 | 
132 | After this, you should be able to run the script `run.py`:
133 | 
134 | ```
135 | python run.py --config configs/config$LANG.config --answerer $ANSWERER --output $OUTPUT
136 | ```
137 | 
138 | - `--config` A path to a configuration file (see the folder `configs` for examples).
139 | - `--answerer` A string indicating which 'answerer' to use. Valid values are [length, random, ir, drqa, blind_n] (where *n* is a number indicating that the *n*th answer should be taken as the right one).
140 | - `--output` The path to the file where the results will be saved.
141 | 
142 | # Running the ARC-solvers
143 | 
144 | We also run the ARC-Solvers used in the ARC challenge (Clark, P., Cowhey, I., Etzioni, O., Khot, T., Sabharwal, A., Schoenick, C., & Tafjord, O. Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge). To install and run them, follow these steps:
145 | 
146 | 1 - Follow the [ARC-solvers README.md instructions](https://github.com/allenai/ARC-Solvers) to create a virtualenv, create the index and download the models and resources:
147 | 
148 | > NOTE that instead of using their ARC_corpus.txt as the inverted index, we again used Wikipedia. If you also want to use Wikipedia, you need to do two things:
149 | 1. Make sure you have downloaded our [Wikipedia corpus](http://www.grupolys.org/software/head-qa-acl2019/WikiCorpus.zip) in txt format.
150 | 2. Modify the file ARC-Solvers/scripts/download_data.sh and change the argument specifying the corpus from ARC_corpus.txt to the path where you have stored the Wikipedia corpus.
151 | 
152 | > NOTE 2: The ARC-Solvers need Elasticsearch 6+ to download the data. Download it and run it:
153 | ```
154 | cd elasticsearch-
155 | ./bin/elasticsearch
156 | ```
157 | 
158 | 2 - Convert HEAD_EN.json into the input format for the ARC solvers:
159 | 
160 | ```
161 | PYTHONPATH=. python scripts/head2ARCformat.py --input HEAD_EN/HEAD_EN.json --output HEAD_ARC/
162 | ```
163 | 
164 | 3 - Run the models using the evaluation scripts provided together with the ARC solvers:
165 | 
166 | ```
167 | cd ARC-Solvers
168 | sh scripts/evaluate_solver.sh ../HEAD_ARC/HEAD_EN.arc.txt data/ARC-V1-Models-Aug2018/dgem/
169 | sh scripts/evaluate_solver.sh ../HEAD_ARC/HEAD_EN.arc.txt data/ARC-V1-Models-Aug2018/decompatt/
170 | sh scripts/evaluate_bidaf.sh ../HEAD_ARC/HEAD_EN.arc.txt data/ARC-V1-Models-Aug2018/bidaf/
171 | ```
172 | 
173 | 4 - Compute the scores for HEAD-QA, based on the ARC-Solvers outputs:
174 | 
175 | ```
176 | cd ..
177 | PYTHONPATH=. python scripts/evaluate_arc_solvers.py --arc_results $PATH_RESULTS --output $PATH_OUTPUT_DIR --disambiguator length --breakdown_results --path_eval eval.py
178 | ```
179 | where:
180 | - `--arc_results` Path to the directory containing the outputs computed in step 3.
181 | - `--output` The path to the output directory where the results will be stored.
182 | - `--disambiguator` The strategy used to decide the right answer if several answers were selected as valid by an ARC-solver.
183 | - `--breakdown_results` Activate to report individual results for each exam.
184 | - `--path_eval` Path to the evaluation script.
185 | 
186 | 
187 | #### Issues
188 | 
189 | We had problems running some models because we were unable to find the `question-tuplizer.jar` used in the ARC-solvers.
If you experience the error `Error: Unable to access jarfile data/ARC-V1-Models-Feb2018/question-tuplizer.jar`, we recommend replacing, in the file `scripts/evaluate_solver.sh`, the line:
190 | `java -Xmx8G -jar data/ARC-V1-Models-Feb2018/question-tuplizer.jar`
191 | with
192 | `java -Xmx8G -jar data/ARC-V1-Models-Aug2018/question-tuplizer.jar`
193 | 
194 | We also had problems running the dgem baseline. The default torch version that is installed if you follow the instructions in the ARC-solvers README.md is 0.4.1. To make it work, we needed to install torch 0.3.1 instead.
195 | 
196 | ## Acknowledgements
197 | 
198 | This work has received funding from the European Research Council (ERC), under the European Union's Horizon 2020 research and innovation programme (FASTPARSE, grant agreement No 714150).
199 | 
200 | ### References
201 | 
202 | Vilares, David and Gómez-Rodríguez, Carlos. "HEAD-QA: A Healthcare Dataset for Complex Reasoning", to appear, ACL 2019.
203 | 
--------------------------------------------------------------------------------
/configs/configEN.config:
--------------------------------------------------------------------------------
1 | lang=en
2 | eval=eval.py
3 | drqa=DrQA/
4 | use_stopwords=False
5 | ignore_questions=False
6 | negative_questions=False
7 | path_solutions=HEAD_EN/
8 | 
9 | en_head=HEAD_EN/HEAD_EN.json
10 | en_retriever=wikipedia/enwiki-20180701-articles.tfidf/enwiki-20180701-articles-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz
11 | en_drqa_reader_model=DrQA/data/reader/multitask.mdl
12 | 
13 | 
14 | 
15 | 
--------------------------------------------------------------------------------
/configs/configEN_test.config:
--------------------------------------------------------------------------------
1 | lang=en
2 | eval=eval.py
3 | drqa=DrQA/
4 | use_stopwords=False
5 | ignore_questions=False
6 | negative_questions=False
7 | path_solutions=HEAD_EN/
8 | 
9 | en_head=HEAD_EN/test_HEAD_EN.json
10 | en_retriever=wikipedia/enwiki-20180701-articles.tfidf/enwiki-20180701-articles-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz
11 | en_drqa_reader_model=DrQA/data/reader/multitask.mdl
12 | 
13 | 
14 | 
15 | 
--------------------------------------------------------------------------------
/configs/configES.config:
--------------------------------------------------------------------------------
1 | lang=es
2 | eval=eval.py
3 | drqa=DrQA/
4 | use_stopwords=False
5 | ignore_questions=False
6 | negative_questions=False
7 | path_solutions=HEAD/
8 | 
9 | es_head=HEAD/HEAD.json
10 | es_retriever=wikipedia/eswiki-20180620-articles.tfidf/eswiki-20180620-articles-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz
11 | 
12 | 
13 | 
--------------------------------------------------------------------------------
/configs/configES_test.config:
--------------------------------------------------------------------------------
1 | lang=es
2 | eval=eval.py
3 | drqa=DrQA/
4 | use_stopwords=False
5 | ignore_questions=False
6 | negative_questions=False
7 | path_solutions=HEAD/
8 | 
9 | #Spanish
10 | es_head=HEAD/test_HEAD.json
11 | es_retriever=wikipedia/eswiki-20180620-articles.tfidf/eswiki-20180620-articles-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz
12 | 
13 | 
14 | 
--------------------------------------------------------------------------------
/eval.py:
--------------------------------------------------------------------------------
1 | from utils import F1_SCORE, RECALL, PRECISION, ACCURACY, NETAS, RIGHT, WRONG, UNANSWERED, ID_UNANSWERED
2 | from sklearn.metrics import
precision_recall_fscore_support
3 | from sklearn.metrics import accuracy_score
4 | from argparse import ArgumentParser
5 | import codecs
6 | import warnings
7 | warnings.filterwarnings('ignore')
8 | 
9 | def netas_score(gold, predicted, avg_10best_scores=574.7):
10 |     #Exam-style score: +3 per right answer, -1 per wrong answer, 0 if unanswered
11 |     right = 0
12 |     wrong = 0
13 |     unanswered = 0
14 |     brutas_score = 0
15 |     if len(gold) != len(predicted):
16 |         raise ValueError("The gold and predicted vectors must have the same length")
17 |     else:
18 |         for g, p in zip(gold, predicted):
19 |             if g == p:
20 |                 right+=1
21 |                 brutas_score+=3
22 |             elif p != ID_UNANSWERED:
23 |                 brutas_score-=1
24 |                 wrong+=1
25 |             else:
26 |                 unanswered+=1
27 |     return brutas_score, right, wrong, unanswered
28 | 
29 | 
30 | def scores(y_pred,y_gold):
31 |     p,r,f1,_ = precision_recall_fscore_support(y_gold, y_pred, average='macro')
32 |     net,right,wrong,unanswered = netas_score(y_gold,y_pred)
33 |     scores = ""
34 |     scores+=PRECISION+"\t"+str(round(p,3))+"\n"
35 |     scores+=RECALL+"\t"+str(round(r,3))+"\n"
36 |     scores+=F1_SCORE+"\t"+str(round(f1,3))+"\n"
37 |     scores+=ACCURACY+"\t"+str(round(accuracy_score(y_gold, y_pred),3))+"\n"
38 |     scores+=RIGHT+"\t"+str(right)+"\n"
39 |     scores+=WRONG+"\t"+str(wrong)+"\n"
40 |     scores+=UNANSWERED+"\t"+str(unanswered)+"\n"
41 |     scores+=NETAS+"\t"+str(net)+"\n"
42 |     return scores
43 | 
44 | if __name__ == '__main__':
45 | 
46 |     arg_parser = ArgumentParser()
47 |     arg_parser.add_argument("--gold", dest="gold", help="Path to the gold (tsv) file", default=None)
48 |     arg_parser.add_argument("--predicted", dest="predicted", help="Path to the file with the predictions", default=None)
49 | 
50 |     args = arg_parser.parse_args()
51 |     with codecs.open(args.gold) as f_gold:
52 |         gold = f_gold.readlines()
53 |         y_gold = [e.split()[1].strip() for e in gold]
54 | 
55 |     with codecs.open(args.predicted) as f_predicted:
56 |         predicted = f_predicted.readlines()
57 |         y_predicted = [e.split()[1].strip() for e in predicted]
58 | 
59 |     p,r,f1,_ = precision_recall_fscore_support(y_gold, y_predicted, average='macro')
60 |     net,right,wrong,unanswered = netas_score(y_gold,y_predicted)
61 |     print (PRECISION,round(p,3))
62 |     print (RECALL, round(r,3))
63 |     print (F1_SCORE, round(f1,3))
64 |     print (ACCURACY,round(accuracy_score(y_gold, y_predicted),3))
65 |     print (RIGHT, right)
66 |     print (WRONG, wrong)
67 |     print (UNANSWERED, unanswered)
68 |     print (NETAS, net)
69 | 
70 | 
--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
1 | ################################################
2 | # For the control, IR and DrQA methods
3 | ################################################
4 | 
5 | #Read additional instructions to install DrQA at https://github.com/facebookresearch/DrQA
6 | #For the most recent version, clone https://github.com/facebookresearch/DrQA.git instead
7 | git clone https://github.com/aghie/DrQA.git
8 | cd DrQA; pip install -r requirements.txt; python setup.py develop
9 | ./install_corenlp.sh
10 | ./download.sh
11 | 
12 | pip install scikit-learn==0.20.2
13 | pip install numpy==1.16.0
14 | pip install torch==1.0.0
15 | pip install torchvision
16 | pip install spacy==2.0.0
17 | 
18 | python -m spacy download en
19 | python -m spacy download es
20 | 
21 | wget http://www.grupolys.org/software/head-qa-acl2019/HEAD.zip
22 | wget http://www.grupolys.org/software/head-qa-acl2019/HEAD_EN.zip
23 | wget http://www.grupolys.org/software/head-qa-acl2019/data.zip
24 | wget http://www.grupolys.org/software/head-qa-acl2019/wiki-articles.tfidf.zip
25 | wget 
http://www.grupolys.org/software/head-qa-acl2019/WikiCorpus.zip 26 | 27 | unzip HEAD.zip 28 | unzip HEAD_EN.zip 29 | unzip data.zip 30 | mkdir wikipedia 31 | unzip wiki-articles.tfidf.zip -d wikipedia/ 32 | unzip WikiCorpus.zip -d wikipedia/ 33 | 34 | -------------------------------------------------------------------------------- /install_arc_solvers.sh: -------------------------------------------------------------------------------- 1 | #https://github.com/aghie/ARC-Solvers.git 2 | 3 | pip install allennlp==0.2.1 4 | pip install torch==0.3.1 5 | pip install torchvision 6 | pip install prettytable==0.7.2 7 | pip install scikit-learn==0.20.2 8 | pip install numpy==1.16.0 9 | -------------------------------------------------------------------------------- /models.py: -------------------------------------------------------------------------------- 1 | from sklearn.preprocessing import normalize 2 | from sklearn.metrics.pairwise import cosine_similarity 3 | from drqa import pipeline 4 | from drqa.retriever import utils 5 | from drqa import retriever 6 | from nltk.corpus import stopwords 7 | from utils import TextSimilarity 8 | import logging 9 | import random 10 | import itertools 11 | import prettytable 12 | import numpy as np 13 | import sys 14 | import utils 15 | import codecs 16 | import tempfile 17 | import subprocess 18 | import json 19 | import os 20 | import abc 21 | 22 | 23 | # class VotingAnswerer(object): 24 | # 25 | # def __init__(self, answerers=[]): 26 | # self.answerers = answerers 27 | # self.question_classifier = utils.QuestionClassifier() 28 | # 29 | # def add(self,answerer): 30 | # self.answerers.add(answerer) 31 | # 32 | # def remove(self, answerer): 33 | # self.answerers.remove(answerer) 34 | # 35 | # def predict(self, qas): 36 | # preds = [] 37 | # 38 | # answers = np.zeros((len(self.answerers), len(qas)), dtype=np.integer) 39 | # for j,answerer in enumerate(self.answerers): 40 | # answers[j]= [raid for qid, raid in answerer.predict(qas)] 41 | # 42 | # print (answers, answers.shape) 43 | # 44 | # for qid,i in enumerate(range(0, answers.shape[1]),1): 45 | # 46 | # preds.append((str(qid),np.argmax(np.bincount(answers[:,i])))) 47 | # 48 | # 49 | # return preds 50 | 51 | 52 | 53 | class Answerer(object): 54 | 55 | def __init__(self,qclassifier): 56 | self.qclassifier = qclassifier 57 | 58 | def predict(self, qas): 59 | 60 | not_to_answer = self.not_to_answer(qas) 61 | predictions = self._predict(qas) 62 | for qid in not_to_answer: 63 | predictions[qid] = utils.ID_UNANSWERED 64 | 65 | return predictions 66 | 67 | 68 | def not_to_answer(self, qas): 69 | 70 | unanswerable = [] 71 | for qid, question, answers in qas: 72 | if self.qclassifier is not None and self.qclassifier.is_unanswerable(question): 73 | unanswerable.append(qid) 74 | return unanswerable 75 | 76 | 77 | @abc.abstractmethod 78 | def _predict(self, qas): 79 | pass 80 | 81 | 82 | class LengthAnswerer(Answerer): 83 | """ 84 | A solver that selects as the right answer the longest one 85 | """ 86 | 87 | NAME = "LengthAnswerer" 88 | MAX_CRITERIA = "max" 89 | MIN_CRITERIA = "min" 90 | 91 | def __init__(self, criteria=MAX_CRITERIA,count_words=False, 92 | qclassifier=None): 93 | """ 94 | 95 | Args 96 | 97 | criteria (string). Criteria to choose the right answer based on their lengths. 98 | Valid values are "max" or "min" 99 | count_words (boolean): If True, we split the text and count the number of actual words. 
100 | Otherwise we simply count the length of the string 101 | 102 | """ 103 | 104 | Answerer.__init__(self,qclassifier) 105 | self.count_words = count_words 106 | 107 | if criteria.lower() == self.MAX_CRITERIA: 108 | self.criteria = max 109 | elif criteria.lower() == self.MIN_CRITERIA: 110 | self.criteria = min 111 | else: 112 | raise NotImplementedError 113 | 114 | 115 | def name(self): 116 | return self.NAME 117 | 118 | def _predict(self, qas): 119 | """ 120 | 121 | Returns a list of tuples (question_id, right_answer_id) 122 | 123 | Args 124 | 125 | qas (list). A list of tuples of strings of the form (Q, A1,...,AN) 126 | 127 | """ 128 | preds = {} 129 | for qid, question, answers in qas: 130 | 131 | if not self.count_words: 132 | answer_lengths = list(map(len, answers)) 133 | pred = answer_lengths.index(self.criteria(answer_lengths))+1 134 | else: 135 | answer_lengths = list(map(len, [a.split() for a in answers])) 136 | pred = answer_lengths.index(self.criteria(answer_lengths))+1 137 | preds[qid] = pred 138 | return preds 139 | 140 | def __str__(self): 141 | return self.NAME 142 | 143 | 144 | 145 | class RandomAnswerer(Answerer): 146 | """ 147 | A solver that select as the right answer a random one 148 | """ 149 | 150 | NAME = "RandomAnswerer" 151 | 152 | def __init__(self, qclassifier=None): 153 | Answerer.__init__(self,qclassifier) 154 | 155 | def name(self): 156 | return self.NAME 157 | 158 | def _predict(self, qas): 159 | """ 160 | 161 | Returns a list of tuples (question_id, right_answer_id) 162 | 163 | Args 164 | 165 | qas (list). A list of tuples of strings of the form (Q, A1,...,AN) 166 | 167 | """ 168 | 169 | preds = {} 170 | for qid, question, answers in qas: 171 | preds[qid] = random.randint(1,len(answers)) 172 | return preds 173 | 174 | def __str__(self): 175 | return self.NAME 176 | 177 | 178 | class BlindAnswerer(Answerer): 179 | 180 | NAME = "BlindAnswerer" 181 | 182 | 183 | def __init__(self, default, qclassifier=None): 184 | 185 | Answerer.__init__(self,qclassifier) 186 | self.default = default 187 | 188 | def name(self): 189 | return self.NAME+"-"+str(self.default) 190 | 191 | def _predict(self, qas): 192 | """ 193 | 194 | Returns a list of tuples (question_id, right_answer_id) 195 | 196 | Args 197 | 198 | qas (list). 
A list of tuples of strings of the form (Q, A1,...,AN) 199 | 200 | """ 201 | 202 | preds = {} 203 | for qid, question, answers in qas: 204 | if self.default in range(1,len(answers)+1): 205 | preds[qid] = self.default 206 | else: 207 | raise ValueError("The answer ID",self.default,"is not available in options 1 to ", len(answers)) 208 | return preds 209 | 210 | def __str__(self): 211 | return self.NAME+"-"+str(self.default) 212 | 213 | 214 | 215 | 216 | class WordSimilarityAnswerer(Answerer): 217 | """ 218 | This solver: (1) computes a question vector by summing the individual embeddings 219 | of its words (2) repeats the same process for each answer and (3) chooses as the right 220 | answer the asnwer that maximizes cosine_similarity(question_vector, answer_i_vector) 221 | """ 222 | 223 | NAME = "WordSimilarityAnswerer" 224 | 225 | def __init__(self, path_word_emb, qclassifier): 226 | 227 | """ 228 | 229 | Args 230 | 231 | path_word_emb (string): Path to the embeddings file 232 | """ 233 | 234 | Answerer.__init__(self,qclassifier) 235 | self.word2index = {} 236 | with codecs.open(path_word_emb) as f: 237 | self.n_words, self.embedding_size = tuple(map(int,f.readline().strip("\n").split())) 238 | self.word_embeddings = np.zeros(shape=(self.n_words,self.embedding_size), 239 | dtype=float) 240 | line = f.readline() 241 | idl = 0 242 | while line != "": 243 | 244 | word, vector = line.split()[0], line.split()[1:] 245 | self.word2index[word] = idl 246 | self.word_embeddings[idl] = list(map(float,vector)) 247 | line = f.readline() 248 | idl+=1 249 | print (" [OK]") 250 | 251 | 252 | def name(self): 253 | return self.NAME 254 | 255 | def _predict(self,qas): 256 | """ 257 | 258 | Returns a list of tuples (question_id, right_answer_id) 259 | 260 | Args 261 | 262 | qas (list). A list of tuples of strings of the form (QID, Q, A1,...,AN) 263 | 264 | """ 265 | 266 | preds = {} 267 | for qid, question, answers in qas: 268 | 269 | question_word_embs = [self.word_embeddings[self.word2index[word]] 270 | if word in self.word2index else np.zeros(self.embedding_size) 271 | for word in question] 272 | 273 | embedding_question = normalize(np.sum(question_word_embs, axis=0).reshape(1, -1)) 274 | 275 | best_score = -1 276 | for aid, answer in enumerate(answers,1): 277 | answer_word_embs = [self.word_embeddings[self.word2index[word]] 278 | if word in self.word2index else np.zeros(self.embedding_size) 279 | for word in answer] 280 | answer_vector = normalize(np.sum(answer_word_embs, axis=0).reshape(1, -1)) 281 | score = cosine_similarity(embedding_question, answer_vector)[0][0] 282 | if score > best_score: 283 | best_answer,best_score = aid, score 284 | 285 | preds[qid] = best_answer 286 | return preds 287 | 288 | def __str__(self): 289 | return self.NAME 290 | 291 | 292 | 293 | class IRAnswerer(Answerer): 294 | 295 | """ 296 | A solver that select as the right answer the answer that maximized 297 | the TF-IDF score of a Wikipedia document when the question+answer_i 298 | is used as the query. It not found it can choose between not to answer 299 | or answer randomly. 
300 | 301 | This implementation uses the IR system presented in DrQa (Chen et al., 2017) 302 | """ 303 | 304 | 305 | NAME = "IRAnswerer" 306 | 307 | def __init__(self,tfidf_path, 308 | tokenizer, 309 | use_stopwords = False, 310 | qclassifier = None): 311 | 312 | Answerer.__init__(self,qclassifier) 313 | self.tokenizer = tokenizer 314 | self.ranker =retriever.get_class('tfidf')(tfidf_path=tfidf_path) 315 | self.stopwords = stopwords 316 | self.use_stopwords = use_stopwords 317 | 318 | def _preprocess(self,query): 319 | 320 | if self.use_stopwords: 321 | return " ".join([token.text for token in list(self.tokenizer(query)) 322 | if not token.is_stop]) 323 | else: 324 | return " ".join([token.text for token in list(self.tokenizer(query))]) 325 | 326 | def name(self): 327 | return self.NAME 328 | 329 | def _process(self,query, k=1): 330 | doc_names, doc_scores = self.ranker.closest_docs(query, k) 331 | results = [] 332 | for i in range(len(doc_names)): 333 | results.append((doc_names[i], doc_scores[i])) 334 | return results 335 | 336 | 337 | def _predict(self,qas): 338 | preds = {} 339 | for qid, question, answers in qas: 340 | 341 | unanswerable = False if self.qclassifier is None else self.qclassifier.is_unanswerable(question) 342 | 343 | #If it is a negation question we look for the least similar answer 344 | if self.qclassifier.is_negation_question(question): 345 | best_answer, best_score = 0, 100000000 346 | f = min 347 | else: 348 | best_answer, best_score = 0,0 349 | f = max 350 | 351 | if not unanswerable: 352 | 353 | question = self._preprocess(question) 354 | 355 | for aid, answer in enumerate(answers,1): 356 | name, score = self._process(" ".join([question, answer]), k=1)[0] 357 | if f == max and score > best_score: 358 | best_answer,best_score = aid, score 359 | elif f == min and score < best_score: 360 | best_answer,best_score = aid, score 361 | 362 | preds[qid] = best_answer 363 | 364 | return preds 365 | 366 | def __str__(self): 367 | return self.NAME 368 | 369 | 370 | 371 | class DrQAAnswerer(Answerer): 372 | """ 373 | A solver that implements a simple wrapper to make predictions using 374 | DrQA (Chen et al. 2017) 375 | """ 376 | NAME = "DrQAAnswerer" 377 | 378 | def __init__(self, tokenizer, reader_model=None, batch_size=64, 379 | qclassifier=None, cuda=False): 380 | 381 | """ 382 | Args 383 | 384 | drqa (string): 385 | """ 386 | 387 | print ("Tokenizer", tokenizer) 388 | Answerer.__init__(self,qclassifier) 389 | self.batch_size = batch_size 390 | self.n_docs = 5 391 | self.top_n = 1 392 | self.ts = TextSimilarity() 393 | print ("Reader model", reader_model, cuda) 394 | self.drqa = pipeline.DrQA( 395 | reader_model=reader_model, 396 | fixed_candidates=None, 397 | embedding_file=None, 398 | tokenizer="spacy", 399 | batch_size=batch_size, 400 | cuda=cuda, 401 | data_parallel=False, 402 | ranker_config={'options': {'tfidf_path': None, 403 | 'strict': False}}, 404 | db_config={'options': {'db_path': None}}, 405 | num_workers=1, 406 | ) 407 | 408 | 409 | 410 | def name(self): 411 | return self.NAME 412 | 413 | def _predict(self, qas): 414 | """ 415 | 416 | Returns a list of tuples (question_id, right_answer_id) 417 | 418 | Args 419 | 420 | qas (list). 
A list of tuples of strings of the form (QID, Q, A1,...,AN) 421 | 422 | """ 423 | 424 | preds = {} 425 | queries = [question for qid, question, answers in qas] 426 | tmp_out = tempfile.NamedTemporaryFile(delete=False) 427 | 428 | drqa_answers = [] 429 | with open(tmp_out.name, 'w') as f: 430 | batches = [queries[i: i + self.batch_size] 431 | for i in range(0, len(queries), self.batch_size)] 432 | for i, batch in enumerate(batches): 433 | 434 | predictions = self.drqa.process_batch( 435 | batch, 436 | n_docs=self.n_docs, 437 | top_n=self.top_n, 438 | ) 439 | 440 | drqa_answers.extend([p[0]["span"] for p in predictions]) 441 | 442 | #Compare which answer is the closest one to the DrQA answers 443 | assert (len(drqa_answers) == len(qas)) 444 | for pred_answer, (qid,question,answers) in zip(drqa_answers, qas): 445 | similarities = sorted([(idanswer, self.ts.similarity(pred_answer.split(" "), answer.split(" "))) 446 | for idanswer,answer in enumerate(answers,1)], 447 | key= lambda x : x[1], reverse=True) 448 | 449 | #No question scored We select the longest answer instead 450 | if similarities[0][1] == 0: 451 | length_answers = [(ida,len(a)) for ida, a in enumerate(answers,1)] 452 | length_answers = sorted(length_answers, key = lambda a: a[1], reverse=True) 453 | preds[qid] = length_answers[0][0] 454 | 455 | else: 456 | if self.qclassifier.is_negation_question(question): 457 | preds[qid] = similarities[-1][0] 458 | else: 459 | preds[qid] = similarities[0][0] 460 | 461 | return preds 462 | 463 | 464 | def __str__(self): 465 | return self.NAME 466 | 467 | 468 | 469 | 470 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | from models import RandomAnswerer, LengthAnswerer, IRAnswerer, WordSimilarityAnswerer, DrQAAnswerer, BlindAnswerer 2 | from spacy.lang.es import Spanish 3 | from spacy.lang.en import English 4 | from utils import * 5 | from argparse import ArgumentParser 6 | from subprocess import PIPE,Popen 7 | from tqdm import tqdm 8 | import codecs 9 | import json 10 | import tempfile 11 | import os 12 | import random 13 | import subprocess 14 | import configparser 15 | import utils 16 | import spacy 17 | import en_core_web_sm 18 | 19 | 20 | SPANISH = "es" 21 | ENGLISH = "en" 22 | 23 | if __name__ == '__main__': 24 | 25 | arg_parser = ArgumentParser() 26 | arg_parser.add_argument("--config", dest="config", help="Path to the configuration file") 27 | arg_parser.add_argument("--output", dest="output", help="Path to the output to store the results") 28 | arg_parser.add_argument("--answerer", dest="answerer", help="Name of the answerer to be used to train the model") 29 | 30 | args = arg_parser.parse_args() 31 | config = config_file_to_dict(args.config) 32 | #Load the configuration for Spanish 33 | path_solutions = config["path_solutions"] 34 | 35 | if config["lang"].lower() == SPANISH: 36 | tfidf_retriever = config["es_retriever"] 37 | path_head =config["es_head"] 38 | 39 | unanswerable_sentences = [utils.BOS_IMAGE_QUESTION_ES] 40 | neg_words = utils.NEGATION_WORDS_ES 41 | nlp = spacy.load('es_core_news_sm') 42 | tokenizer = Spanish().Defaults.create_tokenizer(nlp) 43 | 44 | elif config["lang"].lower() == ENGLISH: 45 | tfidf_retriever = config["en_retriever"] 46 | path_head =config["en_head"] 47 | unanswerable_sentences = [utils.BOS_IMAGE_QUESTION_EN] 48 | neg_words = utils.NEGATION_WORDS_EN 49 | nlp = spacy.load('en_core_web_sm') 50 | tokenizer = 
English().Defaults.create_tokenizer(nlp) 51 | drqa_reader_model = config["en_drqa_reader_model"] 52 | 53 | else: 54 | raise NotImplementedError 55 | 56 | ignore_questions = True if config["ignore_questions"].lower() == "true" else False 57 | negative_questions = True if config["negative_questions"].lower() == "true" else False 58 | use_stopwords = True if config["use_stopwords"].lower() == "true" else False 59 | 60 | random.seed(17) 61 | unanswerable = [] 62 | 63 | if not negative_questions: 64 | neg_words = [] 65 | if not ignore_questions: 66 | unanswerable_sentences = [] 67 | 68 | qclassifier = QuestionClassifier(unanswerable= unanswerable_sentences, 69 | neg_words = neg_words) 70 | 71 | if args.answerer.lower() == "length": 72 | answerer = LengthAnswerer(qclassifier=qclassifier) 73 | elif args.answerer.lower().startswith("blind"): 74 | x = int(args.answerer.split("_")[1]) 75 | answerer = BlindAnswerer(default=x,qclassifier=qclassifier) 76 | elif args.answerer.lower() == "random": 77 | answerer = RandomAnswerer(qclassifier=qclassifier) 78 | elif args.answerer.lower() == "ir": 79 | answerer = IRAnswerer(tfidf_retriever, qclassifier=qclassifier, 80 | use_stopwords=False, tokenizer=tokenizer) 81 | elif args.answerer.lower() == "drqa": 82 | answerer = DrQAAnswerer(tokenizer=tokenizer, 83 | reader_model=drqa_reader_model, 84 | qclassifier=qclassifier, 85 | cuda=True) 86 | else: 87 | raise NotImplementedError("Answerer", args.answerer," is not available") 88 | 89 | systems = [answerer] 90 | solutions = {f.replace(".gold",""):path_solutions+os.sep+f for f in os.listdir(path_solutions) if f.endswith(".gold")} 91 | score = Score() 92 | dataset = Dataset() 93 | dataset.load_json(path_head) 94 | predictions = {} 95 | unanswerable = {} 96 | for answerer in systems: 97 | 98 | print ("Running ", answerer, "on ", path_head) 99 | avg_netas = 0 100 | avg_fscore = 0 101 | 102 | if answerer not in predictions: 103 | predictions[answerer.name()] = {} 104 | unanswerable[answerer.name()] = set([]) 105 | n_exams = len(dataset.get_exams()) 106 | for exam in tqdm(dataset.get_exams()): 107 | qas = dataset.get_qas(exam) 108 | preds = answerer.predict(qas) 109 | predictions[answerer.name()][exam] = preds 110 | 111 | 112 | systems = [] 113 | ir_answerer = None 114 | for answerer in predictions: 115 | for exam in predictions[answerer]: 116 | gold = solutions[exam] 117 | tmp = tempfile.NamedTemporaryFile(mode="w",delete=False) 118 | predicted = tmp.name 119 | for qid in predictions[answerer][exam]: 120 | tmp.write("\t".join([qid,str(predictions[answerer][exam][qid])])+"\n") 121 | tmp.close() 122 | 123 | command = ["python",config["eval"],"--gold",gold,"--predicted",predicted] 124 | p = subprocess.Popen(" ".join(command), stdout=subprocess.PIPE, shell=True) 125 | a, err = p.communicate() 126 | e = score.parse_eval(a.decode("utf-8")) 127 | score.add_exam(exam, e) 128 | os.remove(tmp.name) 129 | 130 | with codecs.open(args.output,"w") as f_out_results: 131 | f_out_results.write(score.get_table().get_string()) 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /scripts/evaluate_arc_solvers.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from subprocess import PIPE,Popen 3 | from utils import * 4 | from models import LengthAnswerer, BlindAnswerer 5 | from collections import OrderedDict 6 | import os 7 | import json 8 | import tempfile 9 | import subprocess 10 | import utils 11 | import sys 
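# Note: each *qapredictions* file produced by the ARC-Solvers evaluation scripts is
# read line by line below; every line is expected to be a JSON object with the
# fields this script accesses: "id", "answerKey", "selected_answers" (a
# comma-separated string of labels) and "question" (with "stem" and "choices",
# where each choice carries a "label", a "text" and a "score").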
12 | 13 | 14 | def disambiguate(qa,disambiguator, d): 15 | 16 | if disambiguator: 17 | 18 | if disambiguator.lower() == "length": 19 | return d["length"].predict([qa])[1]-1 20 | 21 | elif disambiguator.lower() == "blind": 22 | #We select as the right answer to choice located before the last one 23 | return len(qa[2])-1 24 | else: 25 | return 26 | 27 | 28 | def breakdown_output(l): 29 | """ 30 | Args 31 | l (string) A list of tuples (exam, questionid, answerid). Id must follow the format $BASENAME_$NAMEEXAM_$QID 32 | """ 33 | 34 | def parse_questionid(name): 35 | 36 | try: 37 | exam = name.rsplit("_",1)[0] 38 | qid = name.rsplit("_",1)[1] 39 | return exam, qid 40 | except IndexError: 41 | raise ValueError("The variable 'name' does not respect the format") 42 | 43 | d = OrderedDict() 44 | for questionid, answer in l: 45 | 46 | exam, qid = parse_questionid(questionid) 47 | 48 | if exam not in d: 49 | d[exam] = OrderedDict() 50 | if qid not in d[exam]: 51 | d[exam][qid] = answer 52 | else: 53 | raise KeyError("Key", qid,"already exists in dictionary") 54 | return d 55 | 56 | 57 | def _select_answer(line_json): 58 | 59 | selected_answers = line_json["selected_answers"].split(",") 60 | #An ARC solver, we need to disambiguate 61 | if len(selected_answers) > 1: 62 | # print ("MORE than one answer selected") 63 | # print ("line_json", line_json) 64 | qa = (1,"", [choice["text"] for choice in line_json["question"]["choices"] 65 | if choice["label"] in selected_answers]) 66 | # print ("options", qa) 67 | iselected = disambiguate(qa, args.disambiguator, disambiguators) 68 | # print ("iselected", iselected) 69 | # input("NEXT") 70 | return selected_answers[iselected] 71 | else: 72 | # print ("ONLY one answer selected") 73 | # print ("line_json", line_json) 74 | # print ("selected_answer", line_json["selected_answers"]) 75 | # input("NEXT") 76 | return line_json["selected_answers"] 77 | 78 | 79 | def _select_negative_answer(line_json): 80 | 81 | score = sys.maxsize 82 | answer = utils.ID_UNANSWERED 83 | 84 | 85 | for choice in line_json["question"]["choices"]: 86 | if score > choice["score"]: 87 | answer = choice["label"] 88 | score = min(score, choice["score"]) 89 | # print ("score", score) 90 | # print ("answer", answer) 91 | # input("NEXT") 92 | return answer 93 | 94 | 95 | # def all_scores_are_zero(line_json): 96 | # 97 | # return not any([1 for choice in line_json["question"]["choices"] 98 | # if choice["score"] != 0]) 99 | 100 | 101 | def select_answer(line_json, ignore, negative, 102 | qclassifier): 103 | 104 | if ignore: 105 | if (qclassifier is not None and qclassifier.is_unanswerable(line_json["question"]["stem"]) 106 | ): 107 | return utils.ID_UNANSWERED 108 | else: 109 | if not negative: 110 | return _select_answer(line_json) 111 | else: 112 | return _select_negative_answer(line_json) 113 | else: 114 | if not negative: 115 | return _select_answer(line_json) 116 | else: 117 | return _select_negative_answer(line_json) 118 | 119 | 120 | 121 | if __name__ == '__main__': 122 | 123 | arg_parser = ArgumentParser() 124 | arg_parser.add_argument("--arc_results", dest="arc_results", 125 | help="Path to the directory containing the ARC results") 126 | arg_parser.add_argument("--output", dest="output", 127 | help="Path to the output file where to store the results") 128 | arg_parser.add_argument("--disambiguator", dest="disambiguator", default=None, 129 | help="Backup answerer to use if an ARC solver returns multiple questions [LengthAnswerer, BlindAnswerer]") 130 | 
arg_parser.add_argument("--breakdown_results", dest="breakdown_results", action="store_true", 131 | help="To breakdown results for each exam (each JSON 'question' field must include both an 'exam' and a 'qid' fields") 132 | arg_parser.add_argument("--ignore_questions", dest="ignore_questions", 133 | help="To ignore questions according to the strategy described in the paper", action="store_true") 134 | arg_parser.add_argument("--negative_questions", dest="negative_questions", 135 | help="It deals with negative questions", action="store_true") 136 | arg_parser.add_argument("--path_eval", 137 | help="Path to head-qa/eval.py") 138 | args = arg_parser.parse_args() 139 | 140 | results_files = [args.arc_results+os.sep+file 141 | for file in os.listdir(args.arc_results) 142 | if "qapredictions" in file] 143 | 144 | 145 | neg_words = utils.NEGATION_WORDS_EN 146 | unanswerable_sentences = [utils.BOS_IMAGE_QUESTION_EN] 147 | 148 | if not args.negative_questions: 149 | neg_words = [] 150 | if not args.ignore_questions: 151 | unanswerable_sentences = [] 152 | 153 | qclassifier = QuestionClassifier(unanswerable= unanswerable_sentences, 154 | neg_words = neg_words) 155 | 156 | length_answerer = LengthAnswerer() 157 | disambiguators = {"length": length_answerer} 158 | scorer = Score() 159 | for file in results_files: 160 | 161 | name_dataset = file.split("qapredictions")[0].rsplit("/",1)[1] 162 | name_model = file.split("qapredictions")[1].split("_")[1] 163 | 164 | gold = [] 165 | pred = [] 166 | ids = [] 167 | 168 | with open(file) as f: 169 | 170 | line = f.readline() 171 | while line != "": 172 | 173 | line_json = json.loads(line) 174 | line = f.readline() 175 | id = line_json["id"] 176 | ids.append(id) 177 | gold.append(line_json["answerKey"]) 178 | pred.append(select_answer(line_json, args.ignore_questions, 179 | args.negative_questions, qclassifier)) 180 | 181 | 182 | 183 | if args.breakdown_results: 184 | 185 | assert(len(ids)==len(gold)) 186 | assert(len(ids)==len(pred)) 187 | d_gold = breakdown_output(zip(ids, gold)) 188 | d_pred = breakdown_output(zip(ids, pred)) 189 | 190 | for exam in d_gold: 191 | gold_file = args.arc_results+os.sep+exam+".arc_gold" 192 | with open(gold_file,"w") as f_gold: 193 | f_gold.write("\n".join( [qid+"\t"+d_gold[exam][qid] 194 | for qid in d_gold[exam] ])) 195 | 196 | pred_file = args.arc_results+os.sep+exam+".arc_pred" 197 | with open(pred_file,"w") as f_pred: 198 | f_pred.write("\n".join([qid+"\t"+d_pred[exam][qid] 199 | for qid in d_pred[exam] ])) 200 | 201 | 202 | command = ["python",args.path_eval,"--gold",gold_file,"--predicted",pred_file] 203 | # command = ["python","../eval.py","--gold",gold_file,"--predicted",pred_file] 204 | p = subprocess.Popen(" ".join(command), stdout=subprocess.PIPE, shell=True) 205 | out, err = p.communicate() 206 | exam_scores = scorer.parse_eval(out.decode("utf-8")) 207 | scorer.add_exam(exam, exam_scores) 208 | 209 | else: 210 | assert(len(id)==len(gold)) 211 | assert(len(id)==len(pred)) 212 | gold_file = args.arc_results+os.sep+name_dataset+".arc_gold" 213 | with open(gold_file,"w") as f_gold: 214 | f_gold.write("\n".join( [id+"\t"+p for id, p in zip(ids,gold) ])) 215 | 216 | pred_file = args.arc_results+os.sep+name_dataset+".arc_pred" 217 | with open(pred_file,"w") as f_pred: 218 | f_pred.write("\n".join( [id+"\t"+p for id, p in zip(ids,pred) ])) 219 | 220 | command = ["python",args.path_eval,"--gold",gold_file,"--predicted",pred_file] 221 | p = subprocess.Popen(" ".join(command), stdout=subprocess.PIPE, shell=True) 222 | out, err 
= p.communicate() 223 | 224 | exam_scores = scorer.parse_eval(out.decode("utf-8")) 225 | scorer.add_exam(name_dataset, exam_scores) 226 | 227 | print (args.output+os.sep+"EN-"+name_model+"ign="+str(args.ignore_questions)+".neg="+str(args.negative_questions)) 228 | with codecs.open(args.output+os.sep+"EN-"+name_model+"ign="+str(args.ignore_questions)+".neg="+str(args.negative_questions)+".ARC.results","w") as f_out_results: 229 | f_out_results.write(scorer.get_table().get_string()) 230 | 231 | 232 | -------------------------------------------------------------------------------- /scripts/head2ARCformat.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | import codecs 3 | import json 4 | import copy 5 | import os 6 | from utils import Dataset 7 | from collections import OrderedDict 8 | 9 | if __name__ == '__main__': 10 | 11 | arg_parser = ArgumentParser() 12 | arg_parser.add_argument("--input", dest="input", 13 | help="Path to the HEAD dataset", default=None) 14 | arg_parser.add_argument("--output", dest="output", 15 | help="Path to the output directory where to store the exams in a suitable format for the ARC solvers") 16 | 17 | args = arg_parser.parse_args() 18 | 19 | dataset = Dataset() 20 | dataset.load_json(args.input) 21 | exams = dataset.get_exams() 22 | name_head = args.input.rsplit("/",1)[1].replace(".json","") 23 | 24 | with codecs.open(args.output+os.sep+name_head+".arc.txt","w") as f: 25 | for exam in exams: 26 | data_exam = exams[exam]["data"] 27 | for ielement, element in enumerate(data_exam): 28 | 29 | arc_id = "_".join([exam, str(ielement)]) 30 | data = {"id":arc_id} 31 | stem = element["qtext"] 32 | question = {"stem": stem} 33 | question.update({"exam": exam}) 34 | question.update({"qid": ielement}) 35 | arc_answers = [] 36 | for answer in element["answers"]: 37 | arc_answers.append({"text":answer["atext"], "label":str(answer["aid"])}) 38 | 39 | question.update({"choices": arc_answers}) 40 | 41 | right_answer = str(element["answers"][int(element["ra"])-1]["aid"]) 42 | data.update({"question":question}) 43 | data.update({"answerKey": right_answer}) 44 | 45 | repr = json.dumps(data) 46 | f.write(repr+"\n") 47 | 48 | 49 | -------------------------------------------------------------------------------- /scripts/head2drqa.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | import codecs 3 | import json 4 | import copy 5 | import os 6 | from utils import Dataset 7 | 8 | if __name__ == '__main__': 9 | 10 | arg_parser = ArgumentParser() 11 | arg_parser.add_argument("--input", dest="input", 12 | help="Path to the HEAD dataset", default=None) 13 | arg_parser.add_argument("--output", dest="output", 14 | help="Path to the output directory where to store the exams in a suitable format for DrQa") 15 | 16 | args = arg_parser.parse_args() 17 | 18 | dataset = Dataset() 19 | dataset.load_json(args.input) 20 | exams = dataset.get_exams() 21 | 22 | for exam in exams: 23 | with codecs.open(args.output+os.sep+exam+".drqa.txt","w") as f: 24 | data_exam = exams[exam]["data"] 25 | for ielement, element in enumerate(data_exam): 26 | data = {} 27 | question = element["qtext"] 28 | question_id = element["qid"] 29 | right_answer = element["answers"][int(element["ra"])-1]["atext"] 30 | data["question"] = question 31 | data["qid"] = question_id 32 | data["answer"] = right_answer 33 | 34 | repr = json.dumps(data) 35 | f.write(repr+"\n") 36 | 
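# Example usage (hypothetical paths, analogous to the head2ARCformat.py command in the README):
#   PYTHONPATH=. python scripts/head2drqa.py --input HEAD_EN/HEAD_EN.json --output HEAD_DRQA/
# For each exam, this writes an <exam>.drqa.txt file with one JSON object per line,
# holding the "question", "qid" and "answer" fields built above.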
-------------------------------------------------------------------------------- /scripts/pdfexams2txt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | from argparse import ArgumentParser 6 | import codecs 7 | import os 8 | import json 9 | import tempfile 10 | 11 | 12 | VERSION = "1.0" 13 | LANGUAGE = "es" 14 | 15 | class ExamInfo(object): 16 | 17 | def __init__(self): 18 | 19 | self.pdf = None 20 | self.answers = None 21 | self.images = None 22 | 23 | 24 | def exam_2_txt(path_pdf, path_txt): 25 | """ 26 | Executes pdftotext to extract the raw text from a PDF file. 27 | The option -layout is used to maintain the two columns format of the exam, 28 | which simplifies post-processing 29 | 30 | Args 31 | 32 | path_pdf (string): Path to the exam in PDF format 33 | path_txt (string): Path to the output path to store the file in raw format 34 | 35 | """ 36 | os.system(" ".join(["pdftotext -layout",path_pdf,path_txt])) 37 | 38 | 39 | def preprocess_line(line, mid): 40 | """ 41 | Preprocesses a line and infers the text that belongs to the first and second 42 | columns. 43 | 44 | Args 45 | 46 | line (string): A line from the exam 47 | mid (int): Index where the second column is considered to start 48 | 49 | """ 50 | 51 | if len(line) <= mid+4: 52 | col1 = line[0:mid+4].strip() 53 | col2 = "" 54 | else: 55 | col1 = line[0:mid+4].strip() if line[0:mid+4] != " "*(mid+4) else "" 56 | col2 = line[mid+4:].strip() 57 | return col1, col2 58 | 59 | 60 | def is_new_element(phrase,cur): 61 | 62 | """ 63 | Here, an exam represents: (1) a question or (2) one of the answers. 64 | These can be easily determined because of the format of the ID. 65 | """ 66 | 67 | phrase_aux = phrase.strip().split(". ") 68 | if len(phrase_aux) > 1: 69 | #TODO: This is a bit of a mess, but the processing from the pdf is not easy. 70 | #To "ensure it is an ID at the beginning of the sentence. 71 | return phrase_aux[0].isdigit() and ( phrase_aux[0] in ["1","2","3","4","5"] or len(cur.strip()) == 0 72 | or (len(cur.strip()) != 0 and (cur.strip()[-1] in [".",":","?"]) ) ) 73 | 74 | return False 75 | 76 | def concat_element(cur,to_append): 77 | 78 | """ 79 | 80 | Args: 81 | 82 | cur (string): Part of the current element (a question or an answer) 83 | that we have processed 84 | to_append (string): Line that we are processing now 85 | 86 | """ 87 | 88 | #is_new_element estimates if we have finised processing the current 89 | #question or element 90 | if is_new_element(to_append, cur): 91 | 92 | return cur+"\n"+postprocess_phrase(to_append) 93 | else: 94 | return cur+postprocess_phrase(to_append) 95 | 96 | 97 | 98 | def postprocess_phrase(phrase): 99 | """ 100 | Some words might be 'cut' for an exam, as the text might take 101 | several lines. If we find that the last element of a sentence is 102 | '-', we remove it, and the word will continue in the next line. 103 | Otherwise, we are sure it is the end of a word. 104 | 105 | Args: 106 | 107 | phrase (string) 108 | 109 | Warnings: 110 | 111 | This implementation is specific for Spanish 112 | """ 113 | 114 | if len(phrase) == 0: return phrase 115 | 116 | if phrase[-1] == "-": 117 | return phrase[:-1] 118 | else: 119 | return phrase+" " 120 | 121 | 122 | def contains_all_answers(qas, n_answers): 123 | """ 124 | Checks if we have identified all the answers for our tuple (Q,A1,A2,A3,A4) 125 | 126 | Args: 127 | 128 | qas: A tuple of the form (Q,A1,A2,A3,A4). 
Each exam is a string 129 | n_answers: Number of expected answers 130 | """ 131 | 132 | all = True 133 | 134 | for i in range(1,n_answers+1): 135 | all = all and qas[i].startswith(str(i)+".") 136 | 137 | return all 138 | 139 | 140 | def _remove_ids(elements): 141 | """ 142 | We remove the question and answer ids from the text (they are noise for the input) 143 | prior to convert the corpus to JSON 144 | """ 145 | new_elements = [] 146 | for e in elements: 147 | new_elements.append(e.split(".",1)[1].strip()) 148 | 149 | return new_elements 150 | 151 | 152 | def format_txt_exam(path_text): 153 | 154 | """ 155 | Transforms an exam in text format (as processed by exam_2_text) into a list (Q,A1,...,AN). 156 | 157 | Args: 158 | 159 | path_text (string): Path to an exam in text format 160 | 161 | """ 162 | 163 | first_page = True 164 | col1 = "" 165 | col2 = "" 166 | 167 | qas = [] 168 | pages = "" 169 | 170 | #Get the indexes of the footnote pages to know where columns should 171 | #be split 172 | d_pages_indexes = {} 173 | with codecs.open(path_text) as f: 174 | n_page = 1 175 | line = f.readline() 176 | while line != "": 177 | if line.strip().strip("-").isdigit() or line.strip().replace("-","").replace(" ","").isdigit(): 178 | d_pages_indexes[n_page] = line.index(line.strip()) 179 | n_page+=1 180 | line = f.readline() 181 | 182 | with codecs.open(path_text) as f: 183 | 184 | line = f.readline() 185 | n_page=1 186 | while line != "": 187 | 188 | #Beginning of a new page 189 | if line.strip() == "-1-" or line.strip() == "- 1 -": 190 | n_page+=1 191 | first_page = False 192 | c1, c2 = "","" 193 | col1 = "" 194 | col2 = "" 195 | 196 | elif line.strip().strip("-").isdigit() or line.strip().replace("-","").replace(" ","").isdigit(): 197 | 198 | pages += col1+col2 199 | 200 | col1 = "" 201 | col2 = "" 202 | n_page+=1 203 | 204 | elif not first_page: 205 | if n_page not in d_pages_indexes: break 206 | c1,c2 = preprocess_line(line, d_pages_indexes[n_page]) 207 | col1 = concat_element(col1,c1) 208 | col2 = concat_element(col2,c2) 209 | line = f.readline() 210 | 211 | 212 | 213 | pages_split = pages.strip().split("\n") 214 | n_answers = int(pages_split[-1].split(".")[0]) 215 | j = 1+n_answers 216 | for i in range(0,len(pages_split),j): 217 | 218 | if not contains_all_answers(pages_split[i:i+j], n_answers=n_answers): 219 | raise ValueError("The sample does not contain all the expected answers") 220 | 221 | qas.append(_remove_ids(pages_split[i:i+j])) 222 | 223 | return qas 224 | 225 | 226 | def format_txt_answers(path): 227 | """ 228 | Converts the text file with the gold answers into a list of tuples 229 | (question_id, right_answer_id) 230 | 231 | Args 232 | 233 | path (string): Path the text file containing the gold answers 234 | 235 | """ 236 | 237 | with codecs.open(path) as f: 238 | lines = f.readlines() 239 | 240 | head_line = lines[0] 241 | content_lines = lines[1:] 242 | 243 | d_columns = {} 244 | d_columns_content = {} 245 | 246 | for j,key in enumerate(head_line.split()): 247 | if key not in d_columns: 248 | d_columns[key] = [] 249 | d_columns_content[key] = [] 250 | d_columns[key].append(j) 251 | 252 | RC_key = "RC" 253 | v_keys = [key for key in d_columns if key.startswith("V")] 254 | v_keys.sort(reverse=True) 255 | V_key = v_keys[0] 256 | 257 | v_rc_indexes = zip([index for index in d_columns[V_key]], [index for index in d_columns[RC_key]]) 258 | 259 | template =[] 260 | for iv,irc in v_rc_indexes: 261 | #print iv,irc, content_lines[0].split(), len(content_lines[0].split()) 262 | question_ids 
= [line.split()[iv] for line in content_lines] 263 | answer_ids = [line.split()[irc] for line in content_lines] 264 | 265 | if len(question_ids) != len(answer_ids): 266 | raise ValueError("question_ids and answer_ids vectors should have the same length, but they do not") 267 | else: 268 | template.extend(zip(question_ids, answer_ids)) 269 | 270 | return template 271 | 272 | 273 | def get_image_path(path): 274 | """ 275 | Args 276 | 277 | path(string): Gets a dictionary that maps an image name to its 278 | relative path 279 | """ 280 | 281 | images = [(path+os.sep+img,img) 282 | for img in os.listdir(path)] 283 | d = {} 284 | for image_path,name in images: 285 | abbr = name.split("-")[-1].split(".")[0].replace("img","") 286 | d[abbr] = "./data"+image_path.split("/data")[1] 287 | 288 | return d 289 | 290 | def corpus_to_json(qas, template, images, path): 291 | """ 292 | 293 | Args: 294 | 295 | qas: 296 | template: 297 | images: 298 | path: 299 | 300 | """ 301 | 302 | data = {} 303 | data["name"] = path.rsplit("/",1)[1] 304 | data["data"] = [] 305 | 306 | map_category = {"B": "biology", 307 | "M": "medicine", 308 | "E": "nursery", 309 | "F": "pharmacology", 310 | "P": "psychology", 311 | "Q": "chemistry", 312 | } 313 | 314 | 315 | if len(qas) != len(template): 316 | raise ValueError("qas and template vectors should have the same length, but they do not", len(qas), len(template)) 317 | else: 318 | for question_answer, rc in zip(qas, template): 319 | qid = rc[0] 320 | qtext = question_answer[0] 321 | right_answer = rc[1] 322 | 323 | if not right_answer.isdigit(): 324 | continue 325 | 326 | image= '' 327 | if qtext.startswith("Pregunta vinculada a la imagen nº"): 328 | n_image = qtext.replace("Pregunta vinculada a la imagen nº","").split()[0] 329 | image = images[n_image] 330 | 331 | #Obtaining the information from the answers 332 | answers = [] 333 | for ianswer, answer in enumerate(question_answer[1:],1): 334 | answers.append({"aid":ianswer, 335 | "atext": answer}) 336 | 337 | data["data"].append({"qid": qid, 338 | "qtext": qtext, 339 | "ra": right_answer, 340 | "answers": answers, 341 | "image": image, 342 | }) 343 | data["category"] = map_category[path[-1]] 344 | data["year"] = path.rsplit("/",1)[1].split("_")[1] 345 | 346 | return {path.rsplit("/",1)[1]:data} 347 | 348 | 349 | ############################################################################### 350 | # PDFEXAMS2TXT.PY # 351 | ############################################################################### 352 | if __name__ == '__main__': 353 | 354 | arg_parser = ArgumentParser() 355 | 356 | 357 | arg_parser.add_argument("--data", dest="data", 358 | help="Path to the directory containing the different files", default=None) 359 | 360 | arg_parser.add_argument("--output", dest="output", 361 | help="Path to the output directory containing the files", default=None) 362 | 363 | 364 | args = arg_parser.parse_args() 365 | 366 | healthcare_categories = [args.data+os.sep+subdir 367 | for subdir in os.listdir(args.data)] 368 | 369 | dict_exams = {} 370 | data_exams = {} 371 | dict_solutions = {} 372 | for category in healthcare_categories: 373 | files = [category+os.sep+f for f in os.listdir(category)] 374 | 375 | for f in sorted(files): 376 | 377 | if "1_R" in f: 378 | print (f) 379 | continue 380 | 381 | name = f.rsplit("/",1)[1] 382 | info = name.rsplit(".",1)[1] 383 | name = name.rsplit(".",1)[0] 384 | 385 | if name not in dict_exams: 386 | dict_exams[name] = ExamInfo() 387 | 388 | if info == "pdf": 389 | dict_exams[name].pdf = f 390 | 
elif info == "answers": 391 | dict_exams[name].answers = f 392 | elif info == "images": 393 | dict_exams[name].images = f 394 | else: 395 | raise ValueError("Extension of the file is not recognized") 396 | 397 | for name_exam in sorted(dict_exams.keys()): 398 | 399 | if dict_exams[name_exam].pdf is None or dict_exams[name_exam].answers is None: 400 | raise ValueError("pdf or answers attributes from ExamInfo() object ",name_exam," are None") 401 | 402 | aux_file = tempfile.NamedTemporaryFile() 403 | 404 | exam_2_txt(dict_exams[name_exam].pdf, aux_file.name) 405 | exam = format_txt_exam(aux_file.name) 406 | 407 | template = format_txt_answers(dict_exams[name_exam].answers) 408 | dict_solutions[name_exam] = template 409 | images = None 410 | if dict_exams[name_exam].images is not None: 411 | images = get_image_path(dict_exams[name_exam].images) 412 | 413 | data_exam = corpus_to_json(exam, template, images, args.output+os.sep+name_exam) 414 | data_exams.update(data_exam) 415 | print ("The exam has been temporarily dumped into", args.output+os.sep+name_exam+".json") 416 | 417 | data = {} 418 | data["version"] = VERSION 419 | data["language"] = LANGUAGE 420 | data["exams"] = data_exams 421 | 422 | #The corpus formatted as a JSON 423 | with codecs.open(args.output+"HEAD.json", 'w') as outfile: 424 | json.dump(data, outfile) 425 | 426 | #A file containing pairs (question_id, right_answer_id). For evaluation purposes 427 | #If ra is X, then the question was deleted by the committee. We set it to 0 428 | for exam in dict_solutions: 429 | with codecs.open(args.output+exam+".gold","w") as f_gold: 430 | for qid, ra in dict_solutions[exam]: 431 | if not ra.isdigit(): 432 | continue 433 | f_gold.write( "\t".join([qid, ra])+"\n" ) 434 | 435 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from numpy import intersect1d 2 | from nltk.stem import WordNetLemmatizer 3 | from nltk.corpus import stopwords 4 | from collections import defaultdict, Counter 5 | from prettytable import PrettyTable 6 | import codecs 7 | import json 8 | import string 9 | 10 | 11 | ACCURACY="accuracy" 12 | F1_SCORE = "F1-score" 13 | RECALL = "recall" 14 | PRECISION = "precision" 15 | RIGHT="right" 16 | WRONG="wrong" 17 | UNANSWERED="unanswered" 18 | NETAS="netas" 19 | NEGATION_WORDS_ES = ["NO","FALSA","INCORRECTA","FALSO","INCORRECTO","MENOR","MENOS"] 20 | NEGATION_WORDS_EN = ["NO", "FALSE", "INCORRECT", "LESS"] 21 | BOS_IMAGE_QUESTION_ES = "Pregunta vinculada a la imagen" 22 | BOS_IMAGE_QUESTION_EN = "Question linked to image" 23 | ID_UNANSWERED = "-" 24 | 25 | def config_file_to_dict(input_file): 26 | config = {} 27 | fins = open(input_file,'r').readlines() 28 | for line in fins: 29 | if len(line) > 0 and line[0] == "#": 30 | continue 31 | if "=" in line: 32 | pair = line.strip().split('#',1)[0].split('=',1) 33 | item = pair[0] 34 | if item in config: 35 | print("Warning: duplicated config item found: %s, updated."%(pair[0])) 36 | config[item] = pair[-1] 37 | return config 38 | 39 | 40 | def is_int(token): 41 | try: 42 | int(token) 43 | return True 44 | except ValueError: 45 | return False 46 | 47 | class QuestionClassifier(): 48 | 49 | QUESTION_WITH_NEGATION = "NEGATION" 50 | QUESTION_WITH_STATISTICS = "STATISTICS" 51 | QUESTION_WITH_IMAGE = "WITH_IMAGE" 52 | QUESTION_OTHER = "OTHER" 53 | 54 | CLASSES = [QUESTION_WITH_NEGATION, 55 | QUESTION_WITH_STATISTICS, 56 | QUESTION_WITH_IMAGE, 57 | QUESTION_OTHER] 
58 | 59 | def __init__(self, unanswerable=[], neg_words=[]): 60 | 61 | self.unanswerable = unanswerable 62 | self.neg_words = neg_words 63 | 64 | """ 65 | Preditcs the type of question 66 | """ 67 | def _predict_type(self, question): 68 | 69 | for unans in self.unanswerable: 70 | if question.startswith(unans): 71 | return self.QUESTION_WITH_IMAGE 72 | 73 | for negation in self.neg_words: 74 | if negation in question: 75 | return self.QUESTION_WITH_NEGATION 76 | 77 | return self.QUESTION_OTHER 78 | 79 | def is_unanswerable(self, question): 80 | return self._predict_type(question) in [self.QUESTION_WITH_IMAGE] 81 | 82 | def is_negation_question(self, question): 83 | return self._predict_type(question) == self.QUESTION_WITH_NEGATION 84 | 85 | 86 | 87 | class TextSimilarity(object): 88 | """ 89 | It measures the similarity between two texts (percentage of words shared 90 | between the answer and the span 91 | """ 92 | 93 | def __init__(self, stopwords=stopwords.words('english'), 94 | lemmatizer=WordNetLemmatizer()): 95 | 96 | self.stopwords = stopwords 97 | self.lemmatizer = lemmatizer 98 | 99 | 100 | def _preprocess(self, tokens): 101 | return [self.lemmatizer.lemmatize(t).lower() for t in tokens 102 | if t.lower() not in self.stopwords and t.lower() not in string.punctuation] 103 | 104 | def _compute_overlap(self,l1, l2): 105 | """ 106 | Computes the percentage of elements of l1 that is in l2 107 | 108 | Args 109 | 110 | l1 (list): A list of strings 111 | l2 (list): A list of strings 112 | """ 113 | 114 | d1 = Counter(l1) 115 | d2 = Counter(l2) 116 | 117 | o1 = 0. 118 | for k in d1: 119 | o1 += min(d1[k], d2[k]) 120 | 121 | if len(l2) == 0: 122 | return 0 123 | return o1 / len(l2) 124 | 125 | 126 | def similarity(self,tokens1,tokens2): 127 | 128 | ptokens1 = self._preprocess(tokens1) 129 | ptokens2 = self._preprocess(tokens2) 130 | # print ("ptokens1", ptokens1) 131 | # print ("ptokens2", ptokens2) 132 | return self._compute_overlap(ptokens1, ptokens2) 133 | 134 | 135 | 136 | class Score(object): 137 | 138 | iRIGHT = 0 139 | iWRONG = 1 140 | iUNANSWERED = 2 141 | iPRECISION = 3 142 | iRECALL = 4 143 | iF1 = 5 144 | iNETAS = 6 145 | 146 | 147 | def __init__(self): 148 | self.results = {} 149 | 150 | def parse_eval(self, output_eval): 151 | 152 | prec = 0.0 153 | recall = 0.0 154 | f1 = 0.0 155 | netas = 0.0 156 | 157 | d = {} 158 | for line in output_eval.split("\n"): 159 | 160 | if line.startswith("Number of valid predictions"): 161 | pass 162 | elif line.startswith(RIGHT): 163 | right = line.replace(RIGHT,"") 164 | d[RIGHT] = right 165 | elif line.startswith(WRONG): 166 | wrong = line.replace(WRONG,"") 167 | d[WRONG] = wrong 168 | elif line.startswith(UNANSWERED): 169 | unanswered = line.replace(UNANSWERED,"") 170 | d[UNANSWERED] = unanswered 171 | elif line.startswith(PRECISION): 172 | prec = line.replace(PRECISION,"") 173 | d[PRECISION] = prec 174 | elif line.startswith(RECALL): 175 | recall = line.replace(RECALL,"") 176 | d[RECALL] = recall 177 | elif line.startswith(F1_SCORE): 178 | f1_score = line.replace(F1_SCORE,"") 179 | d[F1_SCORE] = f1_score 180 | pass 181 | elif line.startswith(ACCURACY): 182 | acc = line.replace(ACCURACY, "") 183 | d[ACCURACY] = acc 184 | elif line.startswith(NETAS): 185 | netas = line.replace(NETAS,"") 186 | d[NETAS] = netas 187 | 188 | return self.scores_to_list(d) 189 | 190 | def scores_to_list(self, dscores): 191 | return list(map(float,[dscores[RIGHT],dscores[WRONG],dscores[UNANSWERED],dscores[PRECISION], 192 | dscores[RECALL],dscores[F1_SCORE], 
dscores[ACCURACY], dscores[NETAS]])) 193 | 194 | def add_exam(self, exam, scores): 195 | self.results[exam] = scores 196 | 197 | def get_exam_scores(self, exam): 198 | return self.results[exam] 199 | 200 | def get_category_scores(self, category): 201 | 202 | category_scores = [] 203 | for exam in self.results: 204 | if category in exam: 205 | category_scores.append(self.results[exam]) 206 | return category_scores 207 | 208 | def get_average_results(self, exams_scores): 209 | average = [0]*len(exams_scores[0]) 210 | for exam in exams_scores: 211 | for index,(s1, s2) in enumerate(zip(average, exam)): 212 | average[index] = s1+s2 213 | return [round(e/len(exams_scores),3) for e in average] 214 | 215 | def get_table(self): 216 | table = PrettyTable() 217 | table.field_names = ["Exam","Year","Right","Wrong","Unanswered","Precision","Recall","F1-score","Accuracy", "NETAS"] 218 | 219 | #Computing individual results 220 | for exam in self.results: 221 | e = [exam,""] 222 | e.extend(self.results[exam]) 223 | table.add_row(e) 224 | 225 | #Computing average results per category 226 | biology_exams = self.get_category_scores("_B") 227 | if len(biology_exams) != 0: 228 | biology_scores = self.get_average_results(biology_exams) 229 | biology_row = ["Biology (avg)", ""] 230 | biology_row.extend(biology_scores) 231 | table.add_row(biology_row) 232 | 233 | medicine_exams = self.get_category_scores("_M") 234 | if len(medicine_exams) != 0: 235 | medicine_scores = self.get_average_results(medicine_exams) 236 | medicine_row = ["Medicine (avg)", ""] 237 | medicine_row.extend(medicine_scores) 238 | table.add_row(medicine_row) 239 | 240 | nursery_exams = self.get_category_scores("_E") 241 | if len(nursery_exams) != 0: 242 | nursery_scores = self.get_average_results(nursery_exams) 243 | nursery_row = ["Nursery (avg)",""] 244 | nursery_row.extend(nursery_scores) 245 | table.add_row(nursery_row) 246 | 247 | pharma_exams = self.get_category_scores("_F") 248 | if len(pharma_exams) != 0: 249 | pharma_scores = self.get_average_results(pharma_exams) 250 | pharma_row = ["Pharmacology (avg)", ""] 251 | pharma_row.extend(pharma_scores) 252 | table.add_row(pharma_row) 253 | 254 | psycho_exams = self.get_category_scores("_P") 255 | if len(psycho_exams) != 0: 256 | psycho_scores = self.get_average_results(psycho_exams) 257 | psycho_row = ["Psychology (avg)", ""] 258 | psycho_row.extend(psycho_scores) 259 | table.add_row(psycho_row) 260 | 261 | chemistry_exams = self.get_category_scores("_Q") 262 | if len(chemistry_exams) != 0: 263 | chemistry_scores = self.get_average_results(chemistry_exams) 264 | chemistry_row = ["Chemistry (avg)", ""] 265 | chemistry_row.extend(chemistry_scores) 266 | table.add_row(chemistry_row) 267 | 268 | all_scores = self.get_average_results([self.results[exam] for exam in self.results]) 269 | all_row = ["All (avg)", ""] 270 | all_row.extend(all_scores) 271 | table.add_row(all_row) 272 | 273 | return table 274 | 275 | class Dataset(object): 276 | 277 | DATA = "data" 278 | VERSION = "version" 279 | EXAMS = "exams" 280 | 281 | def __init__(self): 282 | self.json = None 283 | 284 | def load_json(self,path): 285 | with codecs.open(path) as f: 286 | self.json = json.load(f) 287 | 288 | def get_version(self): 289 | return self.json[self.VERSION] 290 | 291 | def get_exam(self, name_exam): 292 | return self.json[self.EXAMS][name_exam] 293 | 294 | def get_exams(self): 295 | return self.json[self.EXAMS] 296 | 297 | def get_json(self): 298 | return self.json 299 | 300 | def get_qas(self, exam): 301 | qas = [] 302 
| if self.json is None: 303 | raise ValueError("Dataset not provided") 304 | 305 | for sample in self.get_exam(exam)[self.DATA]: 306 | qas.append((sample["qid"], sample["qtext"], [a["atext"] for a in sample["answers"]])) 307 | return qas 308 | 309 | --------------------------------------------------------------------------------
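The JSON produced by `pdfexams2txt.py` (one `HEAD.json` plus one `*.gold` TSV file per exam) is exactly what the `Dataset` helper in `utils.py` consumes. The sketch below is not part of the repository: the file path and the exam name are hypothetical placeholders, and it only uses the accessors shown in the listings above.

```python
# Minimal sketch (not part of the repository): reading the HEAD.json produced
# by pdfexams2txt.py through utils.Dataset, plus one of the *.gold TSV files.
# "output/HEAD.json" and "Cuaderno_2016_1_B" are hypothetical placeholders.
import codecs

from utils import Dataset

dataset = Dataset()
dataset.load_json("output/HEAD.json")

for name_exam in dataset.get_exams():
    # get_qas() yields tuples (question id, question text, [answer texts])
    for qid, qtext, answers in dataset.get_qas(name_exam):
        print(name_exam, qid, qtext[:40], len(answers))

# Each *.gold file maps a question id to the id of the right answer, one
# tab-separated pair per line.
with codecs.open("output/Cuaderno_2016_1_B.gold") as f_gold:
    gold = dict(line.strip().split("\t") for line in f_gold if line.strip())
```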
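The configuration files under `configs/` are read with `config_file_to_dict`, which ignores `#` comments and splits each remaining line on the first `=`. A quick sketch of its behaviour, using one of the shipped config files:

```python
# Sketch: loading a shipped config file with utils.config_file_to_dict.
# All values come back as plain strings, so numeric options need an explicit cast.
from utils import config_file_to_dict

config = config_file_to_dict("configs/configES.config")
for key, value in config.items():
    print(key, "->", value)
```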
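`QuestionClassifier` is driven entirely by the constants at the top of `utils.py`: a question is flagged as image-linked (and therefore unanswerable for a text-only system) when it starts with one of the image prefixes, and as a negation question when it contains one of the upper-cased negation words. A hedged usage sketch with an invented Spanish question:

```python
# Sketch: classifying questions with the Spanish constants from utils.py.
# The example question is invented for illustration.
from utils import QuestionClassifier, NEGATION_WORDS_ES, BOS_IMAGE_QUESTION_ES

classifier = QuestionClassifier(unanswerable=[BOS_IMAGE_QUESTION_ES],
                                neg_words=NEGATION_WORDS_ES)

# "Point out the INCORRECT statement about insulin" (invented example)
question = "Señale la respuesta INCORRECTA sobre la insulina:"

if classifier.is_unanswerable(question):
    print("Image-linked question: skipped")
elif classifier.is_negation_question(question):
    print("Negation question: it contains one of NEGATION_WORDS_ES")
else:
    print("Regular question")
```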
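`TextSimilarity` computes an asymmetric overlap: after lemmatisation, lowercasing and stopword/punctuation removal, it returns the fraction of tokens of the second argument that also appear in the first one. The short sketch below assumes the NLTK `stopwords` and `wordnet` corpora have already been downloaded; the sentences are invented.

```python
# Sketch: using utils.TextSimilarity to score how well a candidate answer is
# covered by a retrieved span. Requires the NLTK stopwords/wordnet corpora.
from utils import TextSimilarity

ts = TextSimilarity()

span = "The mitochondria produce most of the chemical energy of the cell".split()
answer = "Mitochondria are the main source of energy of the cell".split()

# Note the asymmetry: the score is normalised by the length of the second
# (preprocessed) token list, not the first one.
print(ts.similarity(span, answer))
```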
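Finally, `Score.parse_eval` expects plain-text metric lines prefixed by the constant names defined above (`right`, `wrong`, `unanswered`, `precision`, `recall`, `F1-score`, `accuracy`, `netas`). The sketch below feeds it an invented block of such lines and prints the aggregated table; the exam name is hypothetical but ends in `_B`, so the Biology average row is also produced.

```python
# Sketch: aggregating results with utils.Score. The metric lines and numbers
# below are invented; they only follow the per-line format parse_eval expects.
from utils import Score

fake_eval_output = "\n".join([
    "right 120",
    "wrong 90",
    "unanswered 10",
    "precision 0.571",
    "recall 0.545",
    "F1-score 0.558",
    "accuracy 0.545",
    "netas 90.0",
])

score = Score()
score.add_exam("Cuaderno_2016_1_B", score.parse_eval(fake_eval_output))
print(score.get_table())
```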