├── .gitignore ├── LICENSE ├── README.md ├── ds_formatter ├── cnnnews.py ├── insuranceqa.py ├── mctest.py ├── msmarco.py ├── narrativeqa.py ├── qangaroo.py ├── quasar.py ├── squad.py ├── triviaqa.py ├── ubuntudialogue.py └── wikiqa.py ├── executor.py ├── requirements.txt └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | # OSX specific 2 | .DS_Store 3 | __MACOSX 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | env/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *,cover 50 | .hypothesis/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # IPython Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | venv/ 87 | ENV/ 88 | 89 | # Spyder project settings 90 | .spyderproject 91 | 92 | # Rope project settings 93 | .ropeproject 94 | 95 | # Idea 96 | .idea/ 97 | 98 | # TensorBoard dirs 99 | .tb 100 | 101 | # Vim buffer files 102 | *.swp 103 | 104 | # Test result files 105 | testresult_* 106 | 107 | #datasets 108 | data/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Tolgahan Cakaloglu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dataset Converter for Question-Answering (QA) Tasks 2 | Dataset converter for natural language processing tasks such as QA (question answering): it converts a dataset from one format to another. 3 | 4 | #### QA Dataset Paper & Data : 5 | 6 | * [SQuAD v1 paper](https://arxiv.org/pdf/1606.05250) | [SQuAD v1 data](https://github.com/rajpurkar/SQuAD-explorer/blob/master/dataset) 7 | * [SQuAD v2 paper](https://arxiv.org/abs/1806.03822) | [SQuAD v2 data](https://github.com/rajpurkar/SQuAD-explorer/blob/master/dataset) (*NOTE: SQuAD v2 should also be compatible with this code [NOT TESTED]*) 8 | * [QAngaroo paper](https://transacl.org/ojs/index.php/tacl/article/viewFile/1325/299) | [QAngaroo data](http://bit.ly/2m0W32k) 9 | * [MCTest paper](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/11/MCTest_EMNLP2013.pdf) | [MCTest data](https://github.com/mcobzarenco/mctest/tree/master/data/MCTest) 10 | * [WikiQA paper](https://aclweb.org/anthology/D15-1237) | [WikiQA data](https://www.microsoft.com/en-us/download/details.aspx?id=52419) 11 | * [InsuranceQA paper](https://arxiv.org/abs/1508.01585) | [InsuranceQA data v1](https://github.com/shuzi/insuranceQA/tree/master/V1) - [InsuranceQA data v2](https://github.com/shuzi/insuranceQA/tree/master/V2) 12 | * [MS_MARCO paper](https://arxiv.org/pdf/1611.09268.pdf) | [MS_MARCO data](http://www.msmarco.org/dataset.aspx) 13 | * [WikiMovies](https://arxiv.org/abs/1606.03126) 14 | * [TriviaQA paper](https://arxiv.org/abs/1705.03551) | [TriviaQA data](http://nlp.cs.washington.edu/triviaqa/) 15 | * [Simple Questions](https://arxiv.org/abs/1506.02075) 16 | * [NarrativeQA paper](https://arxiv.org/abs/1712.07040) | [NarrativeQA data](https://github.com/deepmind/narrativeqa) 17 | * [Ubuntu Dialogue Corpus v2.0 paper](https://arxiv.org/abs/1506.08909) | [Ubuntu Dialogue Corpus v2.0 data](https://github.com/rkadlec/ubuntu-ranking-dataset-creator) 18 | * [NewsQA paper](https://arxiv.org/abs/1611.09830) | [NewsQA data](https://datasets.maluuba.com/NewsQA) 19 | * [Quasar data](http://curtis.ml.cmu.edu/datasets/quasar/) 20 | * **MatchZoo** Each line holds the raw query text and the raw document text of one document, in the format "label \t query \t document_txt". 21 | #### Supported Formats : 22 | Source | Destination | Status 23 | ------------ | ------------- | ------------- 24 | QAngaroo| SQuAD| **completed** 25 | MCTest| SQuAD| **completed** 26 | WikiQA| SQuAD| **completed** 27 | InsuranceQA v1| SQuAD| **completed** 28 | InsuranceQA v2| SQuAD| **completed** 29 | TriviaQA| SQuAD| **completed** 30 | NarrativeQA| SQuAD| **completed** 31 | MS MARCO| SQuAD| **completed** 32 | MS MARCO v2| SQuAD| **completed** 33 | WikiMovies| SQuAD| *on hold* 34 | Simple Questions| SQuAD| *on hold* 35 | Ubuntu Corpus v2| SQuAD| **completed** 36 | NewsQA| SQuAD| **completed** 37 | SQuAD| MatchZoo| **completed** 38 | Quasar-T| SQuAD| **completed** 39 | Quasar-S| SQuAD| **completed** 40 | 41 | #### Example Call : 42 | 43 | You can find a sample call for each format type in the ``` executor.py ``` file, such as the one below.
44 | 45 | ``` 46 | python executor.py 47 | --log_path="~/log.log" 48 | --data_path="~/data/" 49 | --from_files="source:question.train.token_idx.label,voc:vocabulary,answer:answers.label.token_idx" 50 | --from_format="insuranceqa" 51 | --to_format="squad" 52 | --to_file_name="filename.what" # it will be renamed to "[from_to]_filename.what" 53 | ``` -------------------------------------------------------------------------------- /ds_formatter/cnnnews.py: -------------------------------------------------------------------------------- 1 | def convert_to_squad(question_answer_content, context_content_path): 2 | """ 3 | :param question_answer_content: 4 | :param context_content_path: story files folder path 5 | :return: formatted SQUAD data 6 | At initial version, we are just focusing on the context and question, nothing more, 7 | therefore we are ignoring the answer part as of now 8 | """ 9 | # PARSE FILES 10 | import os 11 | 12 | squad_formatted_content = dict() 13 | squad_formatted_content['version'] = 'cnnnews_squad_format' 14 | data = [] 15 | #TODO: Each context has multiple questions and each row of the file has multiple questions in different columns (like every 4 columns), we need to handle this. 16 | for datum in question_answer_content.itertuples(index=False): 17 | # Format is deeply nested JSON -- prepare data structures 18 | if datum[3] > 0:  # answer is (at least partly) marked as absent, skip this question 19 | continue 20 | 21 | data_ELEMENT = dict() 22 | data_ELEMENT['title'] = 'dummyTitle' 23 | paragraphs = [] 24 | paragraphs_ELEMENT = dict() 25 | qas = [] 26 | qas_ELEMENT = dict() 27 | qas_ELEMENT_ANSWERS = [] 28 | ANSWERS_ELEMENT = dict() 29 | 30 | story_file_name = datum[0][(datum[0].rindex('/') + 1):] 31 | qas_ELEMENT['id'] = story_file_name 32 | qas_ELEMENT['question'] = datum[1] 33 | 34 | story_file_path = context_content_path + os.sep + story_file_name 35 | if not os.path.isfile(story_file_path): 36 | raise FileNotFoundError(story_file_path + " does not exist") 37 | with open(story_file_path) as story_file: superdocument = story_file.read() 38 | 39 | ANSWERS_ELEMENT['answer_start'] = -1 40 | ANSWERS_ELEMENT['text'] = 'dummyAnswer' 41 | 42 | paragraphs_ELEMENT['context'] = superdocument 43 | qas_ELEMENT_ANSWERS.append(ANSWERS_ELEMENT) 44 | 45 | qas_ELEMENT['answers'] = qas_ELEMENT_ANSWERS 46 | qas.append(qas_ELEMENT) 47 | 48 | paragraphs_ELEMENT['qas'] = qas 49 | paragraphs.append(paragraphs_ELEMENT) 50 | 51 | data_ELEMENT['paragraphs'] = paragraphs 52 | data.append(data_ELEMENT) 53 | 54 | squad_formatted_content['data'] = data 55 | 56 | return squad_formatted_content -------------------------------------------------------------------------------- /ds_formatter/insuranceqa.py: -------------------------------------------------------------------------------- 1 | def convert_to_squad(questions, answers, a_to_q_map): 2 | """ 3 | questions: questions 4 | answers: answers or context or paragraphs 5 | a_to_q_map: answers to questions mapping 6 | """ 7 | squad_formatted_content = dict() 8 | squad_formatted_content['version'] = 'insuranceqa_squad_format' 9 | data = [] 10 | 11 | 12 | for par_indx, ques in a_to_q_map.items(): 13 | # Format is deeply nested JSON -- prepare data structures 14 | data_ELEMENT = dict() 15 | data_ELEMENT['title'] = 'dummyTitle' 16 | 17 | paragraphs = [] 18 | paragraphs_ELEMENT = dict() 19 | 20 | superdocument = answers[par_indx] 21 | paragraphs_ELEMENT['context'] = superdocument 22 | 23 | 24 | qas = [] 25 | for q_indx in ques: 26 | qas_ELEMENT = dict() 27 | ANSWERS_ELEMENT = dict() 28 | qas_ELEMENT_ANSWERS = [] 29
| qas_ELEMENT['id'] = q_indx 30 | qas_ELEMENT['question'] = questions[q_indx] 31 | ANSWERS_ELEMENT['answer_start'] = -1 32 | ANSWERS_ELEMENT['text'] = 'dummyAnswer' 33 | qas_ELEMENT_ANSWERS.append(ANSWERS_ELEMENT) 34 | qas_ELEMENT['answers'] = qas_ELEMENT_ANSWERS 35 | qas.append(qas_ELEMENT) 36 | 37 | paragraphs_ELEMENT['qas'] = qas 38 | paragraphs.append(paragraphs_ELEMENT) 39 | 40 | data_ELEMENT['paragraphs'] = paragraphs 41 | data.append(data_ELEMENT) 42 | 43 | squad_formatted_content['data'] = data 44 | 45 | return squad_formatted_content 46 | 47 | def load_vocab(vocab_file): 48 | voc = {} 49 | with open(vocab_file, 'r') as f_in: 50 | for line in f_in: 51 | word, _id = line.strip().split('\t') 52 | voc[word] = _id 53 | return voc 54 | 55 | def load_answers(answers_file, voc): 56 | #answers = context 57 | _list = ["None"] 58 | with open(answers_file, 'r') as f_in: 59 | for line in f_in: 60 | _, sent = line.strip().split('\t') 61 | _list.append(' '.join([voc[wid] for wid in sent.split(' ')])) 62 | return _list 63 | 64 | 65 | def load_questions(question_file, voc): 66 | questions = [] 67 | a_to_q_map = dict() 68 | x = dict() 69 | ground_truth, no_ground_truth = 0, 0 70 | with open(question_file, 'r') as f_in: 71 | for q_indx, line in enumerate(f_in): 72 | try: 73 | type, q = line.strip().split('\t') 74 | except ValueError: 75 | type, q, ids, pooled_answers = line.strip().split('\t') 76 | q = ' '.join([voc[wid] for wid in q.split(' ')]) 77 | questions.append(q) 78 | if type not in x: 79 | x[type] = 1 80 | else: 81 | x[type] = x[type] + 1 82 | 83 | if len([1 for gt in ids.split(' ') if gt in pooled_answers.split(' ')]) <= 0: 84 | no_ground_truth += 1 85 | else: 86 | ground_truth += 1 87 | for _id in ids.split(' '): 88 | a_id = int(_id)  # keys are int answer ids; the raw token is a string 89 | if a_id not in a_to_q_map: 90 | a_to_q_map[a_id] = [q_indx] 91 | else: 92 | # append the question index in place (list.append mutates the list and returns None) 93 | a_to_q_map[a_id].append(q_indx) 94 | print(x) 95 | print("Total items: {}".format(sum([v for k, v in x.items()]))) 96 | print('Ground Truth: {}, No Ground_Truth: {}'.format(ground_truth, no_ground_truth)) 97 | return questions, a_to_q_map -------------------------------------------------------------------------------- /ds_formatter/mctest.py: -------------------------------------------------------------------------------- 1 | def convert_to_squad(story_question_content): 2 | """ 3 | :param story_question_content: 4 | :param answer_content: 5 | :return: formatted SQUAD data 6 | At initial version, we are just focusing on the context and question, nothing more, 7 | therefore we are ignoring the answer part as of now 8 | """ 9 | # PARSE FILES 10 | 11 | squad_formatted_content = dict() 12 | squad_formatted_content['version'] = 'mctest_squad_format' 13 | data = [] 14 | #TODO: Each context has multiple questions and each row of the file has multiple questions in different columns (a block of 5 columns per question), we need to handle this.
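# Note on the MCTest TSV layout assumed by the indexing below (based on the mc160/mc500 files;
# treat this as an assumption rather than a guarantee for every release): column 0 is the story id,
# column 1 holds author/property metadata, column 2 is the story text, and each of the 4 questions
# occupies a block of 5 columns (question text followed by 4 answer options), so the question text
# sits at columns 3, 8, 13 and 18.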
15 | for datum in story_question_content.itertuples(index=False): 16 | # Format is deeply nested JSON -- prepare data structures 17 | data_ELEMENT = dict() 18 | data_ELEMENT['title'] = 'dummyTitle' 19 | 20 | paragraphs = [] 21 | paragraphs_ELEMENT = dict() 22 | 23 | superdocument = datum[2].replace('\\newline', '') 24 | paragraphs_ELEMENT['context'] = superdocument 25 | 26 | qas = [] 27 | # it has 4 questions in each context 28 | question_column_start_indx = 3 29 | question_size = 4 30 | for q_indx in range(question_size): 31 | qas_ELEMENT = dict() 32 | ANSWERS_ELEMENT = dict() 33 | qas_ELEMENT_ANSWERS = [] 34 | qas_ELEMENT['id'] = datum[0] + "." +str(q_indx) 35 | qas_ELEMENT['question'] = datum[q_indx + question_column_start_indx if q_indx < 1 else q_indx * 5 + 3].replace("one: ", "").replace("multiple: ", "") 36 | ANSWERS_ELEMENT['answer_start'] = -1 37 | ANSWERS_ELEMENT['text'] = 'dummyAnswer' 38 | qas_ELEMENT_ANSWERS.append(ANSWERS_ELEMENT) 39 | qas_ELEMENT['answers'] = qas_ELEMENT_ANSWERS 40 | qas.append(qas_ELEMENT) 41 | 42 | paragraphs_ELEMENT['qas'] = qas 43 | paragraphs.append(paragraphs_ELEMENT) 44 | 45 | data_ELEMENT['paragraphs'] = paragraphs 46 | data.append(data_ELEMENT) 47 | 48 | squad_formatted_content['data'] = data 49 | 50 | return squad_formatted_content -------------------------------------------------------------------------------- /ds_formatter/msmarco.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from tqdm import tqdm 4 | from sklearn.utils import shuffle 5 | 6 | # def convert_to_squad(story_question_content): 7 | # """ 8 | # :param story_question_content: 9 | # :return: formatted SQUAD data 10 | # At initial version, we are just focusing on the context and question, nothing more, 11 | # therefore we are ignoring the answer part as of now 12 | # """ 13 | # # PARSE FILES 14 | # 15 | # squad_formatted_content = dict() 16 | # squad_formatted_content['version'] = 'msmarco_squad_format' 17 | # data = [] 18 | # query = story_question_content['query'] 19 | # query_keys = query.keys() 20 | # passages = story_question_content['passages'] 21 | # 22 | # id_index = 0 23 | # for key in query_keys: 24 | # # Format is deeply nested JSON -- prepare data structures 25 | # data_ELEMENT = dict() 26 | # data_ELEMENT['title'] = 'dummyTitle' 27 | # paragraphs = [] 28 | # paragraphs_ELEMENT = dict() 29 | # qas = [] 30 | # qas_ELEMENT = dict() 31 | # qas_ELEMENT_ANSWERS = [] 32 | # ANSWERS_ELEMENT = dict() 33 | # 34 | # qas_ELEMENT['id'] = id_index 35 | # qas_ELEMENT['question'] = query[key] 36 | # id_index += 1 37 | # 38 | # superdocument = ' '.join([onePassage['passage_text'] for onePassage in passages[key]]) 39 | # 40 | # ANSWERS_ELEMENT['answer_start'] = -1 41 | # ANSWERS_ELEMENT['text'] = 'dummyAnswer' 42 | # 43 | # paragraphs_ELEMENT['context'] = superdocument 44 | # qas_ELEMENT_ANSWERS.append(ANSWERS_ELEMENT) 45 | # 46 | # qas_ELEMENT['answers'] = qas_ELEMENT_ANSWERS 47 | # qas.append(qas_ELEMENT) 48 | # 49 | # paragraphs_ELEMENT['qas'] = qas 50 | # paragraphs.append(paragraphs_ELEMENT) 51 | # 52 | # data_ELEMENT['paragraphs'] = paragraphs 53 | # data.append(data_ELEMENT) 54 | # 55 | # squad_formatted_content['data'] = data 56 | # 57 | # return squad_formatted_content 58 | def convert_to_squad(input_dict): 59 | """ 60 | :param story_question_content: 61 | :return: formatted SQUAD data 62 | At initial version, we are just focusing on the context and question, nothing more, 63 | therefore we are 
ignoring the answer part as of now. 64 | The code is to process train and development sets of MS-MARCO, since test(eval) set doesn't has answer information 65 | """ 66 | # PARSE FILES 67 | squad_formatted_content=None 68 | if input_dict['v'] <= 2.0: 69 | squad_formatted_content = convert_v2(input_dict) 70 | else: 71 | squad_formatted_content = convert_v21(input_dict) 72 | return squad_formatted_content 73 | 74 | def convert_v21(input_dict): 75 | squad_formatted_content = dict() 76 | squad_formatted_content['version'] = 'msmarco_v21_squad_format' 77 | data=[] 78 | all_data = assign_mapped_document(input_dict['queries'], input_dict['mappings'], input_dict['documents']) 79 | # if input_dict['limit'] != -1: 80 | # all_data = shuffle(all_data) 81 | 82 | all_data = all_data.groupby(['p_id', 'p_content']) 83 | iterator = tqdm(enumerate(all_data)) 84 | for i, pack in iterator: 85 | if input_dict['limit'] != -1 and i > input_dict['limit']: 86 | print('Data is prepared at the index of {}'.format(i)) 87 | iterator.close() 88 | break 89 | p, qs = pack[0], pack[1] 90 | data_ELEMENT = dict() 91 | data_ELEMENT['title'] = 'dummyTitle' 92 | paragraphs = [] 93 | paragraphs_ELEMENT = dict() 94 | superdocument = p[1] 95 | paragraphs_ELEMENT['context'] = superdocument 96 | qas = [] 97 | for q in qs.itertuples(): 98 | _q_indx, _q = q.q_id, q.q_content 99 | qas_ELEMENT = dict() 100 | ANSWERS_ELEMENT = dict() 101 | qas_ELEMENT_ANSWERS = [] 102 | qas_ELEMENT['id'] = _q_indx 103 | qas_ELEMENT['question'] = _q 104 | ANSWERS_ELEMENT['answer_start'] = -1 105 | ANSWERS_ELEMENT['text'] = 'dummyAnswer' 106 | qas_ELEMENT_ANSWERS.append(ANSWERS_ELEMENT) 107 | qas_ELEMENT['answers'] = qas_ELEMENT_ANSWERS 108 | qas.append(qas_ELEMENT) 109 | paragraphs_ELEMENT['qas'] = qas 110 | paragraphs.append(paragraphs_ELEMENT) 111 | 112 | data_ELEMENT['paragraphs'] = paragraphs 113 | data.append(data_ELEMENT) 114 | squad_formatted_content['data'] = data 115 | return squad_formatted_content 116 | def assign_mapped_document(queries, mappings, documents): 117 | # queries ids 118 | print('Shape of query is {}'.format(queries.shape)) 119 | queries_mask = np.isin(mappings['q_id'], queries['id']) 120 | mappings = mappings[queries_mask] 121 | queries_dict = pd.Series(queries["content"].values, index=queries['id']).to_dict() 122 | print('Len of query dict is {}'.format(len(queries_dict))) 123 | print('Shape of mapping is {}'.format(mappings.shape)) 124 | 125 | document_mask = np.isin(documents['id'], mappings['p_id']) 126 | documents = documents[document_mask] 127 | print('Shape of documents is {}'.format(documents.shape)) 128 | 129 | documents_dict = pd.Series(documents["content"].values, index=documents['id']).to_dict() 130 | print('Len of document dict is {}'.format(len(documents_dict))) 131 | 132 | 133 | mappings['q_content'] = mappings['q_id'].map(queries_dict) 134 | mappings['p_content'] = mappings['p_id'].map(documents_dict) 135 | print('Shape of new mapping is {}'.format(mappings.shape)) 136 | return mappings 137 | def convert_v2(input_dict): 138 | """ 139 | :param story_question_content: 140 | :return: formatted SQUAD data 141 | At initial version, we are just focusing on the context and question, nothing more, 142 | therefore we are ignoring the answer part as of now. 
143 | The code is to process train and development sets of MS-MARCO, since test(eval) set doesn't has answer information 144 | """ 145 | # PARSE FILES 146 | story_question_content = input_dict['story_question_content'] 147 | squad_formatted_content = dict() 148 | squad_formatted_content['version'] = 'msmarco_squad_format' 149 | data = [] 150 | query = story_question_content['query'] 151 | #key list consists of keys of queries with answers 152 | keys_with_answer = [x for x, y in story_question_content['answers'].items() if y[0] != 'No Answer Present.' and y[0] != ''] 153 | passages = story_question_content['passages'] 154 | 155 | for key in keys_with_answer: 156 | # Format is deeply nested JSON -- prepare data structures 157 | data_ELEMENT = dict() 158 | data_ELEMENT['title'] = 'dummyTitle' 159 | paragraphs = [] 160 | paragraphs_ELEMENT = dict() 161 | qas = [] 162 | qas_ELEMENT = dict() 163 | qas_ELEMENT_ANSWERS = [] 164 | ANSWERS_ELEMENT = dict() 165 | 166 | qas_ELEMENT['id'] = key 167 | qas_ELEMENT['question'] = query[key] 168 | 169 | #correct_context is a list 170 | correct_context= [x for x in passages[key] if x['is_selected'] == 1] 171 | #some query(question) has more than 1 correct contexts, we just pick the first one as the context 172 | if len(correct_context) == 0: 173 | continue 174 | superdocument = correct_context[0]['passage_text'] 175 | 176 | ANSWERS_ELEMENT['answer_start'] = -1 177 | ANSWERS_ELEMENT['text'] = 'dummyAnswer' 178 | 179 | paragraphs_ELEMENT['context'] = superdocument 180 | qas_ELEMENT_ANSWERS.append(ANSWERS_ELEMENT) 181 | 182 | qas_ELEMENT['answers'] = qas_ELEMENT_ANSWERS 183 | qas.append(qas_ELEMENT) 184 | 185 | paragraphs_ELEMENT['qas'] = qas 186 | paragraphs.append(paragraphs_ELEMENT) 187 | 188 | data_ELEMENT['paragraphs'] = paragraphs 189 | data.append(data_ELEMENT) 190 | 191 | squad_formatted_content['data'] = data 192 | 193 | return squad_formatted_content -------------------------------------------------------------------------------- /ds_formatter/narrativeqa.py: -------------------------------------------------------------------------------- 1 | def convert_to_squad(story_summary_content, question_content, set_type): 2 | """ 3 | :param story_summary_content: 4 | :param question_content: 5 | :param category_content: 6 | :param set_type: 7 | :return: formatted SQUAD data 8 | At initial version, we are just focusing on the context and question, nothing more, 9 | therefore we are ignoring the answer part as of now 10 | """ 11 | squad_formatted_content = dict() 12 | squad_formatted_content['version'] = 'narrativeqa_squad_format' 13 | data = [] 14 | content = story_summary_content 15 | if set_type != 'all': 16 | content = story_summary_content[story_summary_content['set'] == set_type] 17 | 18 | for datum in content.itertuples(index=False): 19 | #print(datum.summary) 20 | data_ELEMENT = dict() 21 | data_ELEMENT['title'] = 'dummyTitle' 22 | 23 | paragraphs = [] 24 | paragraphs_ELEMENT = dict() 25 | 26 | superdocument = datum.summary 27 | paragraphs_ELEMENT['context'] = superdocument 28 | 29 | qas = [] 30 | sub_datum = question_content[question_content['document_id'] == datum.document_id] 31 | for q_datum in sub_datum.itertuples(): 32 | # print(indx) 33 | #print(q_datum) 34 | qas_ELEMENT = dict() 35 | ANSWERS_ELEMENT = dict() 36 | qas_ELEMENT_ANSWERS = [] 37 | qas_ELEMENT['id'] = q_datum.document_id + '-' + str(q_datum.Index) 38 | qas_ELEMENT['question'] = q_datum.question 39 | ANSWERS_ELEMENT['answer_start'] = -1 40 | ANSWERS_ELEMENT['text'] = 'dummyAnswer' 41 | 
qas_ELEMENT_ANSWERS.append(ANSWERS_ELEMENT) 42 | qas_ELEMENT['answers'] = qas_ELEMENT_ANSWERS 43 | qas.append(qas_ELEMENT) 44 | 45 | paragraphs_ELEMENT['qas'] = qas 46 | paragraphs.append(paragraphs_ELEMENT) 47 | 48 | data_ELEMENT['paragraphs'] = paragraphs 49 | data.append(data_ELEMENT) 50 | squad_formatted_content['data'] = data 51 | 52 | return squad_formatted_content -------------------------------------------------------------------------------- /ds_formatter/qangaroo.py: -------------------------------------------------------------------------------- 1 | def convert_to_squad(source_data): 2 | """ 3 | Converts QAngaroo data (hoppy_data) into SQuAD format. 4 | The SQuAD-formatted data is written to disk at write_file_name. 5 | Note: All given support documents per example are concatenated 6 | into one super-document. All text is lowercased. 7 | """ 8 | squad_formatted_content = dict() 9 | squad_formatted_content['version'] = 'hoppy_squad_format' 10 | data = [] 11 | 12 | 13 | for datum in source_data: 14 | 15 | # Format is deeply nested JSON -- prepare data structures 16 | data_ELEMENT = dict() 17 | data_ELEMENT['title'] = 'dummyTitle' 18 | paragraphs = [] 19 | paragraphs_ELEMENT = dict() 20 | qas = [] 21 | qas_ELEMENT = dict() 22 | qas_ELEMENT_ANSWERS = [] 23 | ANSWERS_ELEMENT = dict() 24 | 25 | qas_ELEMENT['id'] = datum['id'] 26 | qas_ELEMENT['question'] = datum['query'] 27 | 28 | superdocument = " ".join(datum['supports']) 29 | 30 | answer_position = superdocument.find(datum['answer']) 31 | if answer_position == -1: 32 | continue 33 | 34 | ANSWERS_ELEMENT['answer_start'] = answer_position 35 | ANSWERS_ELEMENT['text'] = datum['answer'] 36 | 37 | paragraphs_ELEMENT['context'] = superdocument 38 | qas_ELEMENT_ANSWERS.append(ANSWERS_ELEMENT) 39 | 40 | qas_ELEMENT['answers'] = qas_ELEMENT_ANSWERS 41 | qas.append(qas_ELEMENT) 42 | 43 | paragraphs_ELEMENT['qas'] = qas 44 | paragraphs.append(paragraphs_ELEMENT) 45 | 46 | data_ELEMENT['paragraphs'] = paragraphs 47 | data.append(data_ELEMENT) 48 | 49 | squad_formatted_content['data'] = data 50 | 51 | return squad_formatted_content -------------------------------------------------------------------------------- /ds_formatter/quasar.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from tqdm import tqdm 4 | from sklearn.utils import shuffle 5 | 6 | def convert_to_squad(queries, documents, is_null_tags_filter, limit): 7 | squad_formatted_content = dict() 8 | squad_formatted_content['version'] = 'quasar-t_squad_format' 9 | data=[] 10 | pairs = create_pairs(zip(queries, documents), is_null_tags_filter) 11 | # if limit != -1: 12 | # pairs = shuffle(pairs) 13 | 14 | pairs = pairs.groupby(['p_id', 'p_content']) 15 | iterator = tqdm(enumerate(pairs)) 16 | for i, pack in iterator: 17 | if limit != -1 and i > limit: 18 | print('Data is prepared at the index of {}'.format(i)) 19 | iterator.close() 20 | break 21 | p, qs = pack[0], pack[1] 22 | data_ELEMENT = dict() 23 | data_ELEMENT['title'] = 'dummyTitle' 24 | paragraphs = [] 25 | paragraphs_ELEMENT = dict() 26 | superdocument = p[1] 27 | paragraphs_ELEMENT['context'] = superdocument 28 | qas = [] 29 | for q in qs.itertuples(): 30 | _q_indx, _q = q.q_id, q.q_content 31 | qas_ELEMENT = dict() 32 | ANSWERS_ELEMENT = dict() 33 | qas_ELEMENT_ANSWERS = [] 34 | qas_ELEMENT['id'] = _q_indx 35 | qas_ELEMENT['question'] = _q 36 | ANSWERS_ELEMENT['answer_start'] = -1 37 | ANSWERS_ELEMENT['text'] = 'dummyAnswer' 38 | 
qas_ELEMENT_ANSWERS.append(ANSWERS_ELEMENT) 39 | qas_ELEMENT['answers'] = qas_ELEMENT_ANSWERS 40 | qas.append(qas_ELEMENT) 41 | paragraphs_ELEMENT['qas'] = qas 42 | paragraphs.append(paragraphs_ELEMENT) 43 | 44 | data_ELEMENT['paragraphs'] = paragraphs 45 | data.append(data_ELEMENT) 46 | squad_formatted_content['data'] = data 47 | return squad_formatted_content 48 | 49 | def create_pairs(query_document_pair, is_null_tags_filter): 50 | pairs = [] 51 | generator = enumerate(query_document_pair) 52 | for i, pair in generator: 53 | query, context = pair[0], pair[1] 54 | if is_null_tags_filter.lower() in ['true', 'True', 'TRUE']: 55 | if len(query['tags']) == 0: 56 | continue 57 | if query['uid'] != context['uid']: 58 | print(20 * '!') 59 | print('Query {} - Document {} is mismatched.'.format(query['uid'],context['uid'])) 60 | pairs.append((query['uid'], query['question'], i, context['contexts'][0][1])) 61 | return pd.DataFrame(pairs, columns=['q_id', 'q_content', 'p_id', 'p_content']) 62 | -------------------------------------------------------------------------------- /ds_formatter/squad.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import util as UTIL 3 | import pandas as pd 4 | from collections import Counter 5 | import matplotlib.pyplot as plt 6 | #from random import shuffle,random 7 | import os 8 | import random 9 | def convert_idx(text, tokens): 10 | current = 0 11 | spans = [] 12 | for token in tokens: 13 | current = text.find(token, current) 14 | if current < 0: 15 | print("Token {} cannot be found".format(token)) 16 | raise Exception() 17 | spans.append((current, current + len(token))) 18 | current += len(token) 19 | return spans 20 | 21 | def process_squad_file(data, word_counter, char_counter): 22 | print("Generating examples...") 23 | examples = [] 24 | eval_examples = {} 25 | total,_i_para = 0, 0 26 | questions = [] 27 | paragraphs = [] 28 | question_to_paragraph = [] 29 | for article in tqdm(data["data"]): 30 | title = article["title"] 31 | for para in article["paragraphs"]: 32 | context = para["context"].replace( 33 | "''", '" ').replace("``", '" ') 34 | paragraphs.append(context) 35 | context_tokens = UTIL.word_tokenize(context) 36 | context_chars = [list(token) for token in context_tokens] 37 | spans = convert_idx(context, context_tokens) 38 | for token in context_tokens: 39 | word_counter[token] += len(para["qas"]) 40 | for char in token: 41 | char_counter[char] += len(para["qas"]) 42 | for qa in para["qas"]: 43 | total += 1 44 | ques = qa["question"].replace( 45 | "''", '" ').replace("``", '" ') 46 | questions.append(ques) 47 | question_to_paragraph.append(_i_para) 48 | ques_tokens = UTIL.word_tokenize(ques) 49 | ques_chars = [list(token) for token in ques_tokens] 50 | for token in ques_tokens: 51 | word_counter[token] += 1 52 | for char in token: 53 | char_counter[char] += 1 54 | y1s, y2s = [], [] 55 | answer_texts = [] 56 | for answer in qa["answers"]: 57 | answer_text = answer["text"] 58 | answer_start = answer['answer_start'] 59 | answer_end = answer_start + len(answer_text) 60 | answer_texts.append(answer_text) 61 | answer_span = [] 62 | for idx, span in enumerate(spans): 63 | if not (answer_end <= span[0] or answer_start >= span[1]): 64 | answer_span.append(idx) 65 | y1, y2 = answer_span[0], answer_span[-1] 66 | y1s.append(y1) 67 | y2s.append(y2) 68 | example = {"context_tokens": context_tokens, "context_chars": context_chars, "ques_tokens": ques_tokens, 69 | "ques_chars": ques_chars, "y1s": y1s, 
"y2s": y2s, "id": total} 70 | examples.append(example) 71 | eval_examples[str(total)] = { 72 | "context": context, "spans": spans, 'ques': ques,"answers": answer_texts, "uuid": qa["id"], 'title': title} 73 | _i_para += 1 74 | print("{} questions in total".format(len(examples))) 75 | return examples, eval_examples, questions, paragraphs, question_to_paragraph 76 | def tokenize_contexts(contexts:list, max_tokens=-1): 77 | tokenized_context = [UTIL.word_tokenize(context.strip()) if max_tokens == -1 else UTIL.word_tokenize(context.strip())[0:max_tokens]for context in contexts] 78 | return tokenized_context 79 | 80 | def fixing_the_token_problem(tokenized_questions, tokenized_paragraphs): 81 | # fixing the '' problem: 82 | fixed_tokenized_question = [] 83 | for indx, question in enumerate(tokenized_questions): 84 | tokens = [] 85 | for token in question: 86 | t = token.strip() 87 | if t != "": 88 | tokens.append(t) 89 | fixed_tokenized_question.append(tokens) 90 | 91 | fixed_tokenized_paragraph = [] 92 | for indx, paragraph in enumerate(tokenized_paragraphs): 93 | tokens = [] 94 | for token in paragraph: 95 | t = token.strip() 96 | if t != "": 97 | tokens.append(t) 98 | fixed_tokenized_paragraph.append(tokens) 99 | return fixed_tokenized_question, fixed_tokenized_paragraph 100 | 101 | def yield_to_matchzoo(question_answer_content, q_len, negative_sampling_count=100, max_tokens=-1): 102 | """ 103 | :param question_answer_document content: 104 | :return: yield matchzoo data 105 | At initial version, we are just focusing on the context and question, nothing more, 106 | therefore we are ignoring the answer part as of now 107 | """ 108 | word_counter, char_counter = Counter(), Counter() 109 | examples, eval, questions, paragraphs, q_to_ps = process_squad_file(question_answer_content, word_counter, char_counter) 110 | tokenized_paragraphs = tokenize_contexts(paragraphs, max_tokens) 111 | tokenized_questions = tokenize_contexts(questions, max_tokens) 112 | tokenized_questions, tokenized_paragraphs = fixing_the_token_problem(tokenized_questions, tokenized_paragraphs) 113 | 114 | paragraphs_nontokenized = [" ".join(context) for context in tokenized_paragraphs] 115 | questions_nontokenized = [" ".join(context) for context in tokenized_questions] 116 | 117 | for q_indx, question in enumerate(tqdm(questions_nontokenized[0:q_len])): 118 | true_p_indx = q_to_ps[q_indx] 119 | true_paragraph = paragraphs_nontokenized[true_p_indx] 120 | temp_list = paragraphs_nontokenized.copy() 121 | del temp_list[true_p_indx] 122 | random.Random(q_indx).shuffle(temp_list) 123 | for p_indx, paragraph in enumerate([true_paragraph] + temp_list[:negative_sampling_count-1]): 124 | yield '\t'.join(['1' if p_indx == 0 else '0', question, paragraph]) 125 | def convert_to_lucene(question_answer_content, doc_type_verbose, source_path): 126 | """ 127 | :param question_answer_document content: 128 | :return: yield matchzoo data 129 | At initial version, we are just focusing on the context and question, nothing more, 130 | therefore we are ignoring the answer part as of now 131 | """ 132 | word_counter, char_counter = Counter(), Counter() 133 | examples, eval, questions, paragraphs, q_to_ps = process_squad_file(question_answer_content, word_counter, char_counter) 134 | tokenized_paragraphs = tokenize_contexts(paragraphs, -1) 135 | tokenized_questions = tokenize_contexts(questions, -1) 136 | tokenized_questions, tokenized_paragraphs = fixing_the_token_problem(tokenized_questions, tokenized_paragraphs) 137 | 138 | paragraphs_nontokenized = 
[" ".join(context) for context in tokenized_paragraphs] 139 | questions_nontokenized = [" ".join(context) for context in tokenized_questions] 140 | 141 | if doc_type_verbose == 1 or doc_type_verbose == 3: 142 | # questions 143 | print('Questions are getting dumped.') 144 | dst_dir = UTIL.create_dir(os.path.join(source_path, 'lucene_questions')) 145 | for indx, doc in tqdm(enumerate(questions_nontokenized)): 146 | as_json = dict() 147 | as_json['content'] = doc 148 | #as_json['doc_id'] = indx 149 | UTIL.dump_json_file(os.path.join(dst_dir, '{}.json'.format(indx)), as_json, None) 150 | elif doc_type_verbose == 2 or doc_type_verbose == 3: 151 | print('Paragraphs are getting dumped.') 152 | dst_dir = UTIL.create_dir(os.path.join(source_path, 'lucene_paragraphs')) 153 | for indx, doc in tqdm(enumerate(paragraphs_nontokenized)): 154 | as_json = dict() 155 | as_json['content'] = doc 156 | #as_json['doc_id'] = indx 157 | UTIL.dump_json_file(os.path.join(dst_dir, '{}.json'.format(indx)), as_json, None) 158 | print('Completed.') 159 | 160 | def print_statistics(question_answer_content, is_histogram, histogram_bin, document_type): 161 | word_counter, char_counter = Counter(), Counter() 162 | examples, eval, questions, paragraphs, q_to_ps = process_squad_file(question_answer_content, word_counter, 163 | char_counter) 164 | tokenized_paragraphs = tokenize_contexts(paragraphs, -1) 165 | tokenized_questions = tokenize_contexts(questions, -1) 166 | tokenized_questions, tokenized_paragraphs = fixing_the_token_problem(tokenized_questions, tokenized_paragraphs) 167 | 168 | paragraphs_nontokenized = [" ".join(context) for context in tokenized_paragraphs] 169 | questions_nontokenized = [" ".join(context) for context in tokenized_questions] 170 | 171 | data = [] 172 | corpus = [] 173 | if document_type in [1, 3]: 174 | corpus = corpus + tokenized_questions 175 | if document_type in [2, 3]: 176 | corpus = corpus + tokenized_paragraphs 177 | for doc in corpus: 178 | data.append(len(doc)) 179 | 180 | if is_histogram.lower() in ['true', 'True', 'TRUE']: 181 | data_df = pd.DataFrame(data, columns=['doc_len']) 182 | data_df.hist(bins=histogram_bin) 183 | plt.show() 184 | 185 | 186 | 187 | def convert_to_short_squad(question_answer_content, q_len, negative_sampling_count, max_tokens=-1): 188 | word_counter, char_counter = Counter(), Counter() 189 | examples, eval, questions, paragraphs, q_to_ps = process_squad_file(question_answer_content, word_counter, 190 | char_counter) 191 | tokenized_paragraphs = tokenize_contexts(paragraphs, max_tokens) 192 | tokenized_questions = tokenize_contexts(questions, max_tokens) 193 | tokenized_questions, tokenized_paragraphs = fixing_the_token_problem(tokenized_questions, tokenized_paragraphs) 194 | 195 | paragraphs_nontokenized = [" ".join(context) for context in tokenized_paragraphs] 196 | questions_nontokenized = [" ".join(context) for context in tokenized_questions] 197 | 198 | squad_formatted_content = dict() 199 | squad_formatted_content['version'] = 'short_squad_format' 200 | data = [] 201 | 202 | last_paragraph_indx = None 203 | questions = [] 204 | 205 | for q_indx, question in enumerate(tqdm(questions_nontokenized[0:q_len])): 206 | if len(data) > negative_sampling_count: 207 | break 208 | if last_paragraph_indx is None: 209 | last_paragraph_indx = q_to_ps[q_indx] 210 | #qs = [i for i in q_to_ps if i == 0] 211 | current_paragraph_indx = q_to_ps[q_indx] 212 | 213 | if current_paragraph_indx != last_paragraph_indx: 214 | data_ELEMENT = dict() 215 | data_ELEMENT['title'] = 
'dummyTitle' 216 | paragraphs = [] 217 | paragraphs_ELEMENT = dict() 218 | superdocument = paragraphs_nontokenized[last_paragraph_indx] 219 | paragraphs_ELEMENT['context'] = superdocument 220 | qas = [] 221 | for _q_item in questions: 222 | _q_indx, _q = _q_item[0], _q_item[1] 223 | qas_ELEMENT = dict() 224 | ANSWERS_ELEMENT = dict() 225 | qas_ELEMENT_ANSWERS = [] 226 | qas_ELEMENT['id'] = _q_indx 227 | qas_ELEMENT['question'] = _q 228 | ANSWERS_ELEMENT['answer_start'] = -1 229 | ANSWERS_ELEMENT['text'] = 'dummyAnswer' 230 | qas_ELEMENT_ANSWERS.append(ANSWERS_ELEMENT) 231 | qas_ELEMENT['answers'] = qas_ELEMENT_ANSWERS 232 | qas.append(qas_ELEMENT) 233 | paragraphs_ELEMENT['qas'] = qas 234 | paragraphs.append(paragraphs_ELEMENT) 235 | 236 | data_ELEMENT['paragraphs'] = paragraphs 237 | data.append(data_ELEMENT) 238 | questions = [] 239 | last_paragraph_indx = current_paragraph_indx 240 | questions.append((q_indx, question)) 241 | squad_formatted_content['data'] = data 242 | return squad_formatted_content -------------------------------------------------------------------------------- /ds_formatter/triviaqa.py: -------------------------------------------------------------------------------- 1 | import random 2 | from tqdm import tqdm 3 | import nltk 4 | sent_tokenize = nltk.data.load("tokenizers/punkt/english.pickle") 5 | import sys 6 | import os 7 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 8 | import util as UTIL 9 | 10 | def convert_to_squad_format(qa_content, wikipedia_dir, web_dir, sample_size, seed, max_num_of_tokens): 11 | qa_json = read_triviaqa_data(qa_content) 12 | qad_triples = get_qad_triples(qa_json) 13 | 14 | random.seed(int(seed)) 15 | random.shuffle(qad_triples) 16 | 17 | data = [] 18 | for qad in tqdm(qad_triples): 19 | qid = qad['QuestionId'] 20 | 21 | text = get_text(qad, qad['Source'], web_dir, wikipedia_dir) 22 | selected_text = select_relevant_portion(text, int(max_num_of_tokens)) 23 | 24 | question = qad['Question'] 25 | para = {'context': selected_text, 'qas': [{'question': question, 'answers': []}]} 26 | data.append({'paragraphs': [para]}) 27 | qa = para['qas'][0] 28 | qa['id'] = get_question_doc_string(qid, qad['Filename']) 29 | qa['qid'] = qid 30 | 31 | ans_string, index = answer_index_in_document(qad['Answer'], selected_text) 32 | if index == -1: 33 | if qa_json['Split'] == 'train': 34 | continue 35 | else: 36 | qa['answers'].append({'text': ans_string, 'answer_start': index}) 37 | 38 | if qa_json['Split'] == 'train' and len(data) >= int(sample_size) and qa_json['Domain'] == 'Web': 39 | break 40 | 41 | squad = {'data': data, 'version': qa_json['Version']} 42 | return squad 43 | 44 | 45 | def read_triviaqa_data(trivia_content): 46 | data = trivia_content #UTIL.load_json_file(file_path=qajson, logging=None) 47 | # read only documents and questions that are a part of clean data set 48 | if data['VerifiedEval']: 49 | clean_data = [] 50 | for datum in data['Data']: 51 | if datum['QuestionPartOfVerifiedEval']: 52 | if data['Domain'] == 'Web': 53 | datum = read_clean_part(datum) 54 | clean_data.append(datum) 55 | data['Data'] = clean_data 56 | return data 57 | 58 | def read_clean_part(datum): 59 | for key in ['EntityPages', 'SearchResults']: 60 | new_page_list = [] 61 | for page in datum.get(key, []): 62 | if page['DocPartOfVerifiedEval']: 63 | new_page_list.append(page) 64 | datum[key] = new_page_list 65 | assert len(datum['EntityPages']) + len(datum['SearchResults']) > 0 66 | return datum 67 | 68 | # Key for wikipedia eval is question-id. 
Key for web eval is the (question_id, filename) tuple 69 | def get_key_to_ground_truth(data): 70 | if data['Domain'] == 'Wikipedia': 71 | return {datum['QuestionId']: datum['Answer'] for datum in data['Data']} 72 | else: 73 | return get_qd_to_answer(data) 74 | 75 | 76 | def get_question_doc_string(qid, doc_name): 77 | return '{}--{}'.format(qid, doc_name) 78 | 79 | def get_qd_to_answer(data): 80 | key_to_answer = {} 81 | for datum in data['Data']: 82 | for page in datum.get('EntityPages', []) + datum.get('SearchResults', []): 83 | qd_tuple = get_question_doc_string(datum['QuestionId'], page['Filename']) 84 | key_to_answer[qd_tuple] = datum['Answer'] 85 | return key_to_answer 86 | 87 | def answer_index_in_document(answer, document): 88 | answer_list = answer['NormalizedAliases'] 89 | for answer_string_in_doc in answer_list: 90 | index = document.lower().find(answer_string_in_doc) 91 | if index != -1: 92 | return answer_string_in_doc, index 93 | return answer['NormalizedValue'], -1 94 | 95 | def get_text(qad, domain, web_dir, wikipedia_dir): 96 | local_file = os.path.join(web_dir, qad['Filename']) if domain == 'SearchResults' else os.path.join(wikipedia_dir, qad['Filename']) 97 | return UTIL.get_file_contents(local_file, encoding='utf-8') 98 | 99 | 100 | def select_relevant_portion(text, max_num_tokens): 101 | paras = text.split('\n') 102 | selected = [] 103 | done = False 104 | for para in paras: 105 | sents = sent_tokenize.tokenize(para) 106 | for sent in sents: 107 | words = nltk.word_tokenize(sent) 108 | for word in words: 109 | selected.append(word) 110 | if len(selected) >= max_num_tokens: 111 | done = True 112 | break 113 | if done: 114 | break 115 | if done: 116 | break 117 | selected.append('\n') 118 | st = ' '.join(selected).strip() 119 | return st 120 | 121 | 122 | def add_triple_data(datum, page, domain): 123 | qad = {'Source': domain} 124 | for key in ['QuestionId', 'Question', 'Answer']: 125 | qad[key] = datum[key] 126 | for key in page: 127 | qad[key] = page[key] 128 | return qad 129 | 130 | 131 | def get_qad_triples(data): 132 | qad_triples = [] 133 | for datum in data['Data']: 134 | for key in ['EntityPages', 'SearchResults']: 135 | for page in datum.get(key, []): 136 | qad = add_triple_data(datum, page, key) 137 | qad_triples.append(qad) 138 | return qad_triples -------------------------------------------------------------------------------- /ds_formatter/ubuntudialogue.py: -------------------------------------------------------------------------------- 1 | def convert_to_squad(story_question_content): 2 | """ 3 | :param story_question_content: 4 | :return: formatted SQUAD data 5 | At initial version, we are just focusing on the context and question, nothing more, 6 | therefore we are ignoring the answer part as of now 7 | """ 8 | # PARSE FILES 9 | 10 | squad_formatted_content = dict() 11 | squad_formatted_content['version'] = 'ubuntudialogue_squad_format' 12 | data = [] 13 | 14 | df = story_question_content.values 15 | # print(df.shape) 16 | id_index = 0 17 | # for valid and test dataset 18 | if(df.shape[1] == 11): 19 | for datum in df: 20 | data_ELEMENT = dict() 21 | data_ELEMENT['title'] = 'dummyTitle' 22 | paragraphs = [] 23 | paragraphs_ELEMENT = dict() 24 | qas = [] 25 | qas_ELEMENT = dict() 26 | qas_ELEMENT_ANSWERS = [] 27 | ANSWERS_ELEMENT = dict() 28 | 29 | qas_ELEMENT['id'] = id_index 30 | id_index += 1 31 | qas_ELEMENT['question'] = datum[1].replace("__eou__ __eot__", ".").replace("__eou__", ".").replace("__eot__", ".") 32 | 33 | superdocument = 
datum[0].replace("__eou__ __eot__", ".").replace("__eou__", ".").replace("__eot__", ".") 34 | 35 | ANSWERS_ELEMENT['answer_start'] = -1 36 | ANSWERS_ELEMENT['text'] = 'dummyAnswer' 37 | 38 | paragraphs_ELEMENT['context'] = superdocument 39 | qas_ELEMENT_ANSWERS.append(ANSWERS_ELEMENT) 40 | 41 | qas_ELEMENT['answers'] = qas_ELEMENT_ANSWERS 42 | qas.append(qas_ELEMENT) 43 | 44 | paragraphs_ELEMENT['qas'] = qas 45 | paragraphs.append(paragraphs_ELEMENT) 46 | 47 | data_ELEMENT['paragraphs'] = paragraphs 48 | data.append(data_ELEMENT) 49 | elif(df.shape[1] == 3): #for train set 50 | true_response = [x for x in df if x[2] == 1] 51 | for datum in true_response: 52 | data_ELEMENT = dict() 53 | data_ELEMENT['title'] = 'dummyTitle' 54 | paragraphs = [] 55 | paragraphs_ELEMENT = dict() 56 | qas = [] 57 | qas_ELEMENT = dict() 58 | qas_ELEMENT_ANSWERS = [] 59 | ANSWERS_ELEMENT = dict() 60 | 61 | qas_ELEMENT['id'] = id_index 62 | id_index += 1 63 | qas_ELEMENT['question'] = datum[1].replace("__eou__ __eot__", ".").replace("__eou__", ".").replace("__eot__", ".") 64 | 65 | superdocument = datum[0].replace("__eou__ __eot__", ".").replace("__eou__", ".").replace("__eot__", ".") 66 | 67 | ANSWERS_ELEMENT['answer_start'] = -1 68 | ANSWERS_ELEMENT['text'] = 'dummyAnswer' 69 | 70 | paragraphs_ELEMENT['context'] = superdocument 71 | qas_ELEMENT_ANSWERS.append(ANSWERS_ELEMENT) 72 | 73 | qas_ELEMENT['answers'] = qas_ELEMENT_ANSWERS 74 | qas.append(qas_ELEMENT) 75 | 76 | paragraphs_ELEMENT['qas'] = qas 77 | paragraphs.append(paragraphs_ELEMENT) 78 | 79 | data_ELEMENT['paragraphs'] = paragraphs 80 | data.append(data_ELEMENT) 81 | 82 | squad_formatted_content['data'] = data 83 | 84 | return squad_formatted_content -------------------------------------------------------------------------------- /ds_formatter/wikiqa.py: -------------------------------------------------------------------------------- 1 | def convert_to_squad(story_question_content): 2 | """ 3 | :param story_question_content:: 4 | :return: formatted SQUAD data 5 | At initial version, we are just focusing on the context and question, nothing more, 6 | therefore we are ignoring the answer part as of now 7 | """ 8 | squad_formatted_content = dict() 9 | squad_formatted_content['version'] = 'wikiqa_squad_format' 10 | data = [] 11 | grouped = story_question_content.groupby(['QuestionID']) 12 | for q, datum in grouped: 13 | 14 | datum = datum.loc[datum['Label'].isin([1])] 15 | #anyRelatedLabel = sum(datum['Label']) 16 | if len(datum) > 0: 17 | 18 | # Format is deeply nested JSON -- prepare data structures 19 | data_ELEMENT = dict() 20 | data_ELEMENT['title'] = datum.iloc[0]['DocumentTitle'] 21 | paragraphs = [] 22 | paragraphs_ELEMENT = dict() 23 | qas = [] 24 | qas_ELEMENT = dict() 25 | qas_ELEMENT_ANSWERS = [] 26 | ANSWERS_ELEMENT = dict() 27 | 28 | qas_ELEMENT['id'] = q 29 | qas_ELEMENT['question'] = datum.iloc[0]['Question'] 30 | 31 | superdocument = ' '.join(datum['Sentence'].tolist()) 32 | 33 | ANSWERS_ELEMENT['answer_start'] = -1 34 | ANSWERS_ELEMENT['text'] = 'dummyAnswer' 35 | 36 | paragraphs_ELEMENT['context'] = superdocument 37 | qas_ELEMENT_ANSWERS.append(ANSWERS_ELEMENT) 38 | 39 | qas_ELEMENT['answers'] = qas_ELEMENT_ANSWERS 40 | qas.append(qas_ELEMENT) 41 | 42 | paragraphs_ELEMENT['qas'] = qas 43 | paragraphs.append(paragraphs_ELEMENT) 44 | 45 | data_ELEMENT['paragraphs'] = paragraphs 46 | data.append(data_ELEMENT) 47 | 48 | squad_formatted_content['data'] = data 49 | 50 | return squad_formatted_content 
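# A minimal usage sketch (not part of the original module, added for illustration): it assumes the
# WikiQA TSV has already been loaded into a pandas DataFrame with at least the QuestionID, Question,
# DocumentTitle, Sentence and Label columns that convert_to_squad() above groups and filters on.
# The two toy rows below are made up purely for this example.
if __name__ == '__main__':
    import json
    import pandas as pd

    toy_rows = [
        {'QuestionID': 'Q1', 'Question': 'what is glass made of', 'DocumentTitle': 'Glass',
         'Sentence': 'Glass is made mostly of silica.', 'Label': 1},
        {'QuestionID': 'Q1', 'Question': 'what is glass made of', 'DocumentTitle': 'Glass',
         'Sentence': 'Glass has been used since antiquity.', 'Label': 0},
    ]
    squad_like = convert_to_squad(pd.DataFrame(toy_rows))
    # prints a SQuAD-style dict whose single paragraph is built from the Label == 1 sentence(s)
    print(json.dumps(squad_like, indent=1))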
-------------------------------------------------------------------------------- /executor.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import argparse 3 | import util as UTIL 4 | import sys 5 | import os 6 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 7 | from ds_formatter import qangaroo, mctest, insuranceqa, triviaqa, wikiqa, narrativeqa, msmarco, ubuntudialogue, cnnnews, squad, quasar 8 | 9 | 10 | def get_parser(): 11 | parser = argparse.ArgumentParser() 12 | 13 | parser.add_argument('--log_path',help="path to the log file") 14 | parser.add_argument('--log_info',default="INFO", help="logging level") 15 | parser.add_argument('--data_path', help="path to the source file to be converted") 16 | parser.add_argument('--from_files', help="addition/supporting files that are in the same path as source, could be coma-seperated ',', file type can be also identified with ':'. Ex: 'voc:vocabulary.txt, 'answer:answer.txt'") 17 | parser.add_argument('--from_format', help="dataset name of the source format") 18 | parser.add_argument('--to_format', help="dataset name of the destination format") 19 | parser.add_argument('--to_file_name', help="destination file name") 20 | return parser 21 | 22 | def main(args): 23 | try: 24 | logging.info('(function {}) Started'.format(main.__name__)) 25 | 26 | source_files = UTIL.parse_source_files(args.data_path, args.from_files, logging) 27 | source_file = source_files['source'] 28 | destination_file = os.path.join(args.data_path, args.from_format.lower() + '_to_' + args.to_format.lower() + '_'+args.to_file_name) 29 | 30 | # TODO: 1) We need to create a interface class to have the same signature for all the formatters in ds_formatter folder. 31 | # TODO: 2) We need to create a generic approach to convert any type to any type not only any type to squad. 32 | # TODO: 3) can we have better approach to handle the following if/else scenarios 33 | # TODO: 4) we may also put some kind of field wrapper to handle whether which fields are gonna be filled with dummy and which fields are gonna be filled with real values. 
34 | if args.from_format.lower() == 'qangaroo' and args.to_format.lower() == 'squad' : 35 | """ 36 | --log_path="~/log.log" 37 | --data_path="~/data/qangaroo_v1.1/wikihop" 38 | --from_files="source:dev.json" 39 | --from_format="qangaroo" 40 | --to_format="squad" 41 | --to_file_name="dev.json" #it is gonna be renamed as "[from_to]_filename.what" 42 | """ 43 | in_content = UTIL.load_json_file(source_file, logging) 44 | formatted_content = qangaroo.convert_to_squad(in_content) 45 | UTIL.dump_json_file(destination_file, formatted_content, logging) 46 | 47 | elif args.from_format.lower() == 'mctest' and args.to_format.lower() == 'squad': 48 | """ 49 | --log_path="~/log.log" 50 | --data_path="~/data/" 51 | --from_files="source:mc160.dev.tsv" 52 | --from_format="mctest" 53 | --to_format="squad" 54 | --to_file_name="mc160.dev.json" #it is gonna be renamed as "[from_to]_filename.what" 55 | """ 56 | 57 | 58 | story_question_content = UTIL.load_csv_file(source_file,"\t", None, logging) 59 | #answer_content = UTIL.load_csv_file(additional_files['answer'], "\t", None, logging) 60 | formatted_content = mctest.convert_to_squad(story_question_content) 61 | UTIL.dump_json_file(destination_file, formatted_content, logging) 62 | 63 | elif args.from_format.lower() == 'insuranceqa' and args.to_format.lower() == 'squad': 64 | """ 65 | --log_path="~/log.log" 66 | --data_path="~/data/insuranceqa_v2" 67 | --from_files="source:InsuranceQA.question.anslabel.token.1500.pool.solr.test.encoded,voc:vocabulary.txt,answer:InsuranceQA.label2answer.token.encoded" 68 | --from_format="insuranceqa" 69 | --to_format="squad" 70 | --to_file_name="1500.test.json" 71 | """ 72 | 73 | voc = insuranceqa.load_vocab(source_files['voc']) 74 | questions, a_to_q_map = insuranceqa.load_questions(source_file, voc) 75 | answers = insuranceqa.load_answers(source_files['answer'], voc) 76 | formatted_content = insuranceqa.convert_to_squad(questions, answers, a_to_q_map) 77 | UTIL.dump_json_file(destination_file, formatted_content, logging) 78 | 79 | elif args.from_format.lower() == 'triviaqa' and args.to_format.lower() == 'squad': 80 | """ 81 | --log_path="~/log.log" 82 | --data_path="~/data/triviaqa/" 83 | --from_files=""source:qa/wikipedia-train.json, wikipedia:evidence/wikipedia,web:evidence/web,seed:10,token_size:2000,sample_size:1000000" 84 | --from_format="triviaqa" 85 | --to_format="squad" 86 | --to_file_name="wikipedia-train-long.json" 87 | """ 88 | 89 | wiki = source_files['wikipedia'] 90 | web = source_files['web'] 91 | seed = source_files['seed'] 92 | max_num_of_tokens = source_files['token_size'] 93 | sample_size = source_files['sample_size'] 94 | qa_file = UTIL.load_json_file(source_file, logging) 95 | formatted_content = triviaqa.convert_to_squad_format(qa_file, wiki, web, sample_size, seed, max_num_of_tokens) 96 | UTIL.dump_json_file(destination_file, formatted_content, logging) 97 | elif args.from_format.lower() == 'wikiqa' and args.to_format.lower() == 'squad': 98 | """ 99 | --log_path="~/log.log" 100 | --data_path="~/data/WikiQACorpus" 101 | --from_files="source:WikiQA-dev.tsv" 102 | --from_format="wikiqa" 103 | --to_format="squad" 104 | --to_file_name="dev.json" 105 | """ 106 | 107 | story_question_content = UTIL.load_csv_file(source_file, "\t", 'infer', logging) 108 | formatted_content = wikiqa.convert_to_squad(story_question_content) 109 | UTIL.dump_json_file(destination_file, formatted_content, logging) 110 | 111 | elif args.from_format.lower() == 'squad' and args.to_format.lower() == 'matchzoo': 112 | """ 113 | 
**sample.txt**: Each line is the raw query and raw document text of a document. The format is "label \t query \t document_txt". 114 | --log_path="~/log.log" 115 | --data_path="~/data/squad" 116 | --from_files="source:dev-v1.1.json,q_len:1000,negative_sampling:100" 117 | --from_format="squad" 118 | --to_format="matchzoo" 119 | --to_file_name="dev.txt" 120 | """ 121 | negative_samp_count = int(source_files['negative_sampling']) 122 | q_len = int(source_files['q_len']) 123 | content = UTIL.load_json_file(source_file, logging) 124 | generator = squad.yield_to_matchzoo(content, q_len, negative_samp_count) 125 | open(destination_file, "w").write('\n'.join(data for data in generator)) 126 | 127 | #UTIL.dump_json_file(destination_file, formatted_content, logging) 128 | elif args.from_format.lower() == 'squad' and args.to_format.lower() == 'lucene': 129 | """ 130 | **sample.txt**: Each line is the raw query and raw document text of a document. The format is "label \t query \t document_txt". 131 | --log_path="~/log.log" 132 | --data_path="~/data/squad" 133 | --from_files="source:dev-v1.1.json,doc_type_verbose:2" 134 | --from_format="squad" 135 | --to_format="matchzoo" 136 | --to_file_name="dev.txt" 137 | """ 138 | doc_type_verbose = int(source_files['doc_type_verbose']) 139 | content = UTIL.load_json_file(source_file, logging) 140 | squad.convert_to_lucene(content, doc_type_verbose, args.data_path) 141 | elif args.from_format.lower() == 'squad' and args.to_format.lower() == 'short_squad': 142 | """ 143 | **sample.txt**: Each line is the raw query and raw document text of a document. The format is "label \t query \t document_txt". 144 | --log_path="~/log.log" 145 | --data_path="~/data/squad" 146 | --from_files="source:dev-v1.1.json,q_len:1000,negative_sampling:100" 147 | --from_format="squad" 148 | --to_format="short_squad" 149 | --to_file_name="dev.json" 150 | """ 151 | negative_samp_count = int(source_files['negative_sampling']) 152 | q_len = int(source_files['q_len']) 153 | content = UTIL.load_json_file(source_file, logging) 154 | formatted_content = squad.convert_to_short_squad(content, q_len, negative_samp_count) 155 | UTIL.dump_json_file(destination_file, formatted_content, logging) 156 | elif args.from_format.lower() == 'squad' and args.to_format.lower() == 'squad': 157 | """ 158 | In order to make some analyzes. 159 | --log_path="~/log.log" 160 | --data_path="~/data/squad" 161 | --from_files="source:dev-v1.1.json,is_histogram:True,document_type:1" #1 for question, #2 for paragraphs, #3 for both. 
162 | --from_format="squad" 163 | --to_format="squad" 164 | --to_file_name="dev.json" 165 | """ 166 | is_historgram = source_files['is_histogram'] 167 | document_type = int(source_files['document_type']) 168 | his_bin = int(source_files['histogram_bin']) 169 | content = UTIL.load_json_file(source_file, logging) 170 | squad.print_statistics(content, is_historgram, his_bin, document_type) 171 | 172 | elif args.from_format.lower() == 'narrativeqa' and args.to_format.lower() == 'squad': 173 | """ 174 | --log_path="~/log.log" 175 | --data_path="~/data/narrativeqa" 176 | --from_files="source:summaries.csv,set:train,qaps:qaps.csv" 177 | --from_format="narrativeqa" 178 | --to_format="squad" 179 | --to_file_name="train.json" #it is gonna be renamed as "[from_to]_filename.what" 180 | """ 181 | 182 | story_summary_content = UTIL.load_csv_file(source_file, ",", 'infer', logging) 183 | question_content = UTIL.load_csv_file(source_files['qaps'], ",", 'infer', logging) 184 | set_type = source_files['set'] 185 | formatted_content = narrativeqa.convert_to_squad(story_summary_content, question_content, set_type) 186 | UTIL.dump_json_file(destination_file, formatted_content, logging) 187 | 188 | elif args.from_format.lower() == 'webqa' and args.to_format.lower() == 'squad': 189 | " ************************************************************ " 190 | " *********************** ON-HOLD *****************************" 191 | " ************************************************************ " 192 | """ 193 | --log_path="~/log.log" 194 | --data_path="~/data/" 195 | --from_files="label:question.train.token_idx.label,voc:vocabulary,answer:answers.label.token_idx" 196 | --from_format="webqa" 197 | --to_format="squad" 198 | --to_file_name="filename.what" #it is gonna be renamed as "[from_to]_filename.what" 199 | """ 200 | 201 | story_summary_content = UTIL.load_csv_file(source_file, ",", 'infer', logging) 202 | question_content = UTIL.load_csv_file(source_files['qaps'], ",", 'infer', logging) 203 | set_type = source_files['set'] 204 | formatted_content = narrativeqa.convert_to_squad(story_summary_content, question_content, set_type) 205 | UTIL.dump_json_file(args.destination_file_path, formatted_content, logging) 206 | elif args.from_format.lower() == 'msmarco' and args.to_format.lower() == 'squad': 207 | """ 208 | --log_path="~/log.log" 209 | --data_path="~/data/msmarco" 210 | --from_format="msmarco" 211 | --to_format="squad" 212 | --to_file_name="dev_2.1.json" #it is gonna be renamed as "[from_to]_filename.what" 213 | """ 214 | input_dict = {} 215 | try: 216 | version = float(source_files['v']) 217 | except: 218 | version = 2.0 219 | 220 | input_dict['v'] = version 221 | if version <= 2.0: 222 | """ 223 | for version <= 2.0 224 | --from_files="source:dev_2.1.json, v:2.0" 225 | """ 226 | in_content = UTIL.load_json_file(source_file, logging) 227 | input_dict['story_question_content'] = in_content 228 | formatted_content = msmarco.convert_to_squad(in_content) 229 | else: 230 | """ 231 | for version > 2.0 232 | --from_files="source:queries.train.csv,document:collection.tsv,mapping:qrels.train.csv,v:2.1,limit:-1" 233 | """ 234 | queries = UTIL.load_csv_file(source_file, "\t", None, logging, ['id', 'content']) 235 | input_dict['queries'] = queries 236 | mappings = UTIL.load_csv_file(source_files['mapping'], "\t", None, logging, ['q_id', 'tmp1', 'p_id', 'tmp2'], [0,1,2,3]) 237 | input_dict['mappings'] = mappings 238 | documents = UTIL.load_csv_file(source_files['document'], "\t", None, logging, ['id', 'content']) 239 | 
        elif args.from_format.lower() == 'quasar' and args.to_format.lower() == 'squad':
            """
            --log_path="~/log.log"
            --data_path="~/data/quasar-t"
            --from_format="quasar"
            --to_format="squad"
            --from_files="source:train_questions.json,document:train_contexts.json,type:t,is_null_tags_filter:True,limit:-1"
            --to_file_name="train.json"
            """
            if source_files['type'].lower() == 't':
                # quasar-t (only the Quasar-T layout is handled in this branch)
                queries = UTIL.load_json_line_file(source_file, logging)
                documents = UTIL.load_json_line_file(source_files['document'], logging)
                formatted_content = quasar.convert_to_squad(queries, documents, source_files['is_null_tags_filter'], int(source_files['limit']))
                UTIL.dump_json_file(destination_file, formatted_content, logging)

        elif args.from_format.lower() == 'ubuntu' and args.to_format.lower() == 'squad':
            """
            --log_path="~/log.log"
            --data_path="~/data/ubuntu"
            --from_files="source:valid.csv"
            --from_format="ubuntu"
            --to_format="squad"
            --to_file_name="valid.json"
            """
            story_question_content = UTIL.load_csv_file(source_file, ",", 'infer', logging)
            formatted_content = ubuntudialogue.convert_to_squad(story_question_content)
            UTIL.dump_json_file(destination_file, formatted_content, logging)
        elif args.from_format.lower() == 'newsqa' and args.to_format.lower() == 'squad':
            """
            --log_path="~/log.log"
            --data_path="~/data/newsqa"
            --from_files="source:newsqa-data-v1.csv,story:cnn_stories/"
            --from_format="newsqa"
            --to_format="squad"
            --to_file_name="news.json"
            """

            story_question_content = UTIL.load_csv_file(source_file, ",", 'infer', logging)
            context_content_path = source_files['story']
            formatted_content = cnnnews.convert_to_squad(story_question_content, context_content_path)
            UTIL.dump_json_file(destination_file, formatted_content, logging)
        else:
            pass
        logging.info('(function {}) Finished'.format(main.__name__))
    except Exception as e:
        logging.error('(function {}) has an error: {}'.format(main.__name__, e))
        raise


if __name__ == '__main__':
    args = get_parser().parse_args()
    assert args.log_path is not None, "No log path provided (--log_path)"
    assert args.data_path is not None, "No data folder provided (--data_path)"
    assert args.from_format is not None, "No source format provided (--from_format)"
    assert args.from_files is not None, "No source files provided (--from_files)"
    assert args.to_format is not None, "No destination format provided (--to_format)"
    assert args.to_file_name is not None, "No destination file name provided (--to_file_name)"

    if args.log_info.lower() == 'info':
        log_info = logging.INFO
    elif args.log_info.lower() == 'debug':
        log_info = logging.DEBUG
    elif args.log_info.lower() == 'warn':
        log_info = logging.WARNING
    elif args.log_info.lower() == 'critical':
        log_info = logging.CRITICAL
    elif args.log_info.lower() == 'error':
        log_info = logging.ERROR
    else:
        log_info = logging.INFO

    logging.basicConfig(filename=args.log_path, level=log_info)
    main(args)
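
# Example end-to-end call (a sketch assembled from the docstrings above; adjust paths and file names to your setup):
#
#   python executor.py \
#       --log_path="~/log.log" \
#       --data_path="~/data/squad" \
#       --from_files="source:dev-v1.1.json,q_len:1000,negative_sampling:100" \
#       --from_format="squad" \
#       --to_format="matchzoo" \
#       --to_file_name="dev.txt"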
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
pandas==0.23.4
tqdm
spacy==2.0.8
scikit_learn==0.19.1
scipy
json-lines
matplotlib
--------------------------------------------------------------------------------
/util.py:
--------------------------------------------------------------------------------
import json
import pandas as pd
import os
import spacy
import json_lines
import matplotlib.pyplot as plt

nlp = spacy.blank("en")

def word_tokenize(sent):
    doc = nlp(sent)
    return [token.text for token in doc]

# The "*.json.gz" files can be read in python as follows:
#
# import gzip
# def read_data(path):
#     with gzip.open(path) as f:
#         for line in f:
#             yield eval(line)

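# A safer variant of the gzip reader sketched above (illustrative only; not used elsewhere in this module):
# json.loads avoids eval'ing arbitrary Python from the archive, assuming each line is a valid JSON record.
#
# import gzip
# def read_json_gz(path):
#     with gzip.open(path, 'rt', encoding='utf-8') as f:
#         for line in f:
#             yield json.loads(line)
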
def load_json_file(file_path, logging, encoding='utf-8'):
    content = None
    try:
        # with open(file_path, 'r') as f_in:
        #     content = json.load(f_in)
        content = json.loads(get_file_contents(file_path, encoding=encoding))
        if logging is not None:
            logging.info('(function {}) ran successfully and loaded the file: {}'.format(load_json_file.__name__, file_path))
    except Exception as e:
        if logging is not None:
            logging.error('(function {}) has an error: {}'.format(load_json_file.__name__, e))
        raise
    return content

def load_json_line_file(file_path, logging, encoding='utf-8'):
    content = []
    try:
        with open(file_path, 'r', encoding=encoding) as f:  # text mode; json_lines parses each line
            for item in json_lines.reader(f):
                content.append(item)

        if logging is not None:
            logging.info('(function {}) ran successfully and loaded the file: {}'.format(load_json_line_file.__name__, file_path))
    except Exception as e:
        if logging is not None:
            logging.error('(function {}) has an error: {}'.format(load_json_line_file.__name__, e))
        raise
    return content

def create_dir(dir):
    if not os.path.exists(dir):
        os.makedirs(dir)
    return dir

def dump_json_file(file_path, content, logging, encoding='utf-8'):
    try:
        with open(file_path, 'w', encoding=encoding) as f_out:
            json.dump(content, f_out, indent=1)
        if logging is not None:
            logging.info(
                '(function {}) ran successfully and wrote the file: {}'.format(dump_json_file.__name__, file_path))
    except Exception as e:
        if logging is not None:
            logging.error('(function {}) has an error: {}'.format(dump_json_file.__name__, e))
        raise

# def write_json_to_file(json_object, json_file, mode='w', encoding='utf-8'):
#     with open(json_file, mode, encoding=encoding) as outfile:
#         json.dump(json_object, outfile, indent=4, sort_keys=True, ensure_ascii=False)

# def dump_txt_contents(file_path, content, logging, encoding='utf-8'):
#     try:
#         with open(file_path, 'w', encoding=encoding) as f_out:
#             f_out.dump(content, f_out, indent=1)
#         if logging is not None:
#             logging.info(
#                 '(function {}) is run successfuly and write the file: {}'.format(dump_json_file.__name__,
#                                                                                  file_path))
#     except Exception as e:
#         if logging is not None:
#             logging.error('(function {}) has an error: {}'.format(dump_json_file.__name__, e))
#         raise

def get_file_contents(filename, encoding='utf-8'):
    with open(filename, encoding=encoding) as f:
        content = f.read()
    return content


def get_file_contents_as_list(file_path, encoding='utf-8', ignore_blanks=True):
    contents = get_file_contents(file_path, encoding=encoding)
    lines = contents.split('\n')
    lines = [line for line in lines if line != ''] if ignore_blanks else lines
    return lines

def load_csv_file(file_path, sep, header, logging, names=None, usecols=None):
    content = None
    try:
        # pandas opens the file itself, so no extra open() wrapper is needed; usecols is forwarded to read_csv
        content = pd.read_csv(file_path, sep=sep, header=header, names=names, usecols=usecols)
        if logging is not None:
            logging.info(
                '(function {}) ran successfully and loaded the file: {}'.format(load_csv_file.__name__, file_path))
    except Exception as e:
        if logging is not None:
            logging.error('(function {}) has an error: {}'.format(load_csv_file.__name__, e))
        raise
    return content

def parse_source_files(data_path, source_files, logging, item_seperator=',', k_v_seperator=':'):
    source_path = data_path
    _additional_files = dict()
    try:
        for item in source_files.split(item_seperator):
            _splitted = item.split(k_v_seperator)
            key = _splitted[0].strip()
            value = _splitted[1].strip()
            print("item:{}, key:{}, value:{}".format(item, key, value))
            # values that resolve to a file or folder under data_path are expanded to full paths
            if os.path.isfile(os.path.join(source_path, value)) or os.path.isdir(os.path.join(source_path, value)):
                _additional_files[key] = os.path.join(source_path, value)
            else:
                _additional_files[key] = value
        if logging is not None:
            logging.info(
                '(function {}) ran successfully for data path: {}'.format(parse_source_files.__name__, data_path))
    except Exception as e:
        if logging is not None:
            logging.error('(function {}) has an error: {}'.format(parse_source_files.__name__, e))
        raise
    return _additional_files
--------------------------------------------------------------------------------
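A minimal sketch of how `util.parse_source_files` resolves a `--from_files` string (assuming `/home/user/data/squad/dev-v1.1.json` exists on disk; values that do not resolve to a file or folder under the data path are kept as plain strings):

```python
from util import parse_source_files

source_files = parse_source_files(
    data_path="/home/user/data/squad",
    source_files="source:dev-v1.1.json,q_len:1000,negative_sampling:100",
    logging=None,  # logging may be None; executor.py passes the logging module here
)
# -> {'source': '/home/user/data/squad/dev-v1.1.json', 'q_len': '1000', 'negative_sampling': '100'}
# Values stay strings; callers cast them as needed, e.g. int(source_files['q_len']).
```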