├── .gitignore ├── LICENSE ├── README.md ├── ds_formatter ├── cnnnews.py ├── insuranceqa.py ├── mctest.py ├── msmarco.py ├── narrativeqa.py ├── qangaroo.py ├── quasar.py ├── squad.py ├── triviaqa.py ├── ubuntudialogue.py └── wikiqa.py ├── executor.py ├── requirements.txt └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | # OSX specific 2 | .DS_Store 3 | __MACOSX 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | env/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *,cover 50 | .hypothesis/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # IPython Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | venv/ 87 | ENV/ 88 | 89 | # Spyder project settings 90 | .spyderproject 91 | 92 | # Rope project settings 93 | .ropeproject 94 | 95 | # Idea 96 | .idea/ 97 | 98 | # TensorBoard dirs 99 | .tb 100 | 101 | # Vim buffer files 102 | *.swp 103 | 104 | # Test result files 105 | testresult_* 106 | 107 | #datasets 108 | data/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Tolgahan Cakaloglu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dataset Converter for Question-Answering (QA) Tasks 2 | Dataset converter for natural language processing tasks such as QA (question answering): it converts a dataset from one format to another. 3 | 4 | #### QA Dataset Paper & Data : 5 | 6 | * [SQuAD v1 paper](https://arxiv.org/pdf/1606.05250) | [SQuAD v1 data](https://github.com/rajpurkar/SQuAD-explorer/blob/master/dataset) 7 | * [SQuAD v2 paper](https://arxiv.org/abs/1806.03822) | [SQuAD v2 data](https://github.com/rajpurkar/SQuAD-explorer/blob/master/dataset) (*NOTE: SQuAD v2 should also be compatible with this code [NOT TESTED]*) 8 | * [QAngaroo paper](https://transacl.org/ojs/index.php/tacl/article/viewFile/1325/299) | [QAngaroo data](http://bit.ly/2m0W32k) 9 | * [MCTest paper](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/11/MCTest_EMNLP2013.pdf) | [MCTest data](https://github.com/mcobzarenco/mctest/tree/master/data/MCTest) 10 | * [WikiQA paper](https://aclweb.org/anthology/D15-1237) | [WikiQA data](https://www.microsoft.com/en-us/download/details.aspx?id=52419) 11 | * [InsuranceQA paper](https://arxiv.org/abs/1508.01585) | [InsuranceQA data v1](https://github.com/shuzi/insuranceQA/tree/master/V1) - [InsuranceQA data v2](https://github.com/shuzi/insuranceQA/tree/master/V2) 12 | * [MS_MARCO paper](https://arxiv.org/pdf/1611.09268.pdf) | [MS_MARCO data](http://www.msmarco.org/dataset.aspx) 13 | * [WikiMovies](https://arxiv.org/abs/1606.03126) 14 | * [TriviaQA paper](https://arxiv.org/abs/1705.03551) | [TriviaQA data](http://nlp.cs.washington.edu/triviaqa/) 15 | * [Simple Questions](https://arxiv.org/abs/1506.02075) 16 | * [NarrativeQA paper](https://arxiv.org/abs/1712.07040) | [NarrativeQA data](https://github.com/deepmind/narrativeqa) 17 | * [Ubuntu Dialogue Corpus v2.0 paper](https://arxiv.org/abs/1506.08909) | [Ubuntu Dialogue Corpus v2.0 data](https://github.com/rkadlec/ubuntu-ranking-dataset-creator) 18 | * [NewsQA paper](https://arxiv.org/abs/1611.09830) | [NewsQA data](https://datasets.maluuba.com/NewsQA) 19 | * [Quasar data](http://curtis.ml.cmu.edu/datasets/quasar/) 20 | * **MatchZoo** Each line holds the raw query text and the raw document text of one document, in the format "label \t query \t document_txt". 21 | #### Supported Formats : 22 | Source | Destination | Status 23 | ------------ | ------------- | ------------- 24 | QAngaroo| SQuAD| **completed** 25 | MCTest| SQuAD| **completed** 26 | WikiQA| SQuAD| **completed** 27 | InsuranceQA v1| SQuAD| **completed** 28 | InsuranceQA v2| SQuAD| **completed** 29 | TriviaQA| SQuAD| **completed** 30 | NarrativeQA| SQuAD| **completed** 31 | MS MARCO| SQuAD| **completed** 32 | MS MARCO v2| SQuAD| **completed** 33 | WikiMovies| SQuAD| *on hold* 34 | Simple Questions| SQuAD| *on hold* 35 | Ubuntu Corpus v2| SQuAD| **completed** 36 | NewsQA| SQuAD| **completed** 37 | SQuAD| MatchZoo| **completed** 38 | Quasar-T| SQuAD| **completed** 39 | Quasar-S| SQuAD| **completed** 40 | 41 | #### Example Call : 42 | 43 | You can find a sample call for each format type in the ``` executor.py ``` file, such as the one below.
44 | 45 | ``` 46 | python executor.py 47 | --log_path="~/log.log" 48 | --data_path="~/data/" 49 | --from_files="source:question.train.token_idx.label,voc:vocabulary,answer:answers.label.token_idx" 50 | --from_format="insuranceqa" 51 | --to_format="squad" 52 | --to_file_name="filename.what" # it will be renamed to "[from_to]_filename.what" 53 | ``` -------------------------------------------------------------------------------- /ds_formatter/cnnnews.py: -------------------------------------------------------------------------------- 1 | def convert_to_squad(question_answer_content, context_content_path): 2 | """ 3 | :param question_answer_content: 4 | :param context_content_path: story files folder path 5 | :return: formatted SQUAD data 6 | At initial version, we are just focusing on the context and question, nothing more, 7 | therefore we are ignoring the answer part as of now 8 | """ 9 | # PARSE FILES 10 | import os 11 | 12 | squad_formatted_content = dict() 13 | squad_formatted_content['version'] = 'cnnnews_squad_format' 14 | data = [] 15 | #TODO: Each context has multiple questions and each row of the file has multiple questions in different columns (like every 4 columns), we need to handle this. 16 | for datum in question_answer_content.itertuples(index=False): 17 | # Format is deeply nested JSON -- prepare data structures 18 | if datum[3] > 0:  # answer is (at least partly) marked as absent, skip this question 19 | continue 20 | 21 | data_ELEMENT = dict() 22 | data_ELEMENT['title'] = 'dummyTitle' 23 | paragraphs = [] 24 | paragraphs_ELEMENT = dict() 25 | qas = [] 26 | qas_ELEMENT = dict() 27 | qas_ELEMENT_ANSWERS = [] 28 | ANSWERS_ELEMENT = dict() 29 | 30 | story_file_name = datum[0][(datum[0].rindex('/') + 1):] 31 | qas_ELEMENT['id'] = story_file_name 32 | qas_ELEMENT['question'] = datum[1] 33 | 34 | story_file_path = context_content_path + os.sep + story_file_name 35 | if not os.path.isfile(story_file_path): 36 | raise FileNotFoundError(story_file_path + " does not exist") 37 | with open(story_file_path) as story_file: superdocument = story_file.read() 38 | 39 | ANSWERS_ELEMENT['answer_start'] = -1 40 | ANSWERS_ELEMENT['text'] = 'dummyAnswer' 41 | 42 | paragraphs_ELEMENT['context'] = superdocument 43 | qas_ELEMENT_ANSWERS.append(ANSWERS_ELEMENT) 44 | 45 | qas_ELEMENT['answers'] = qas_ELEMENT_ANSWERS 46 | qas.append(qas_ELEMENT) 47 | 48 | paragraphs_ELEMENT['qas'] = qas 49 | paragraphs.append(paragraphs_ELEMENT) 50 | 51 | data_ELEMENT['paragraphs'] = paragraphs 52 | data.append(data_ELEMENT) 53 | 54 | squad_formatted_content['data'] = data 55 | 56 | return squad_formatted_content -------------------------------------------------------------------------------- /ds_formatter/insuranceqa.py: -------------------------------------------------------------------------------- 1 | def convert_to_squad(questions, answers, a_to_q_map): 2 | """ 3 | questions: questions 4 | answers: answers or context or paragraphs 5 | a_to_q_map: answers to questions mapping 6 | """ 7 | squad_formatted_content = dict() 8 | squad_formatted_content['version'] = 'insuranceqa_squad_format' 9 | data = [] 10 | 11 | 12 | for par_indx, ques in a_to_q_map.items(): 13 | # Format is deeply nested JSON -- prepare data structures 14 | data_ELEMENT = dict() 15 | data_ELEMENT['title'] = 'dummyTitle' 16 | 17 | paragraphs = [] 18 | paragraphs_ELEMENT = dict() 19 | 20 | superdocument = answers[par_indx] 21 | paragraphs_ELEMENT['context'] = superdocument 22 | 23 | 24 | qas = [] 25 | for q_indx in ques: 26 | qas_ELEMENT = dict() 27 | ANSWERS_ELEMENT = dict() 28 | qas_ELEMENT_ANSWERS = [] 29
| qas_ELEMENT['id'] = q_indx 30 | qas_ELEMENT['question'] = questions[q_indx] 31 | ANSWERS_ELEMENT['answer_start'] = -1 32 | ANSWERS_ELEMENT['text'] = 'dummyAnswer' 33 | qas_ELEMENT_ANSWERS.append(ANSWERS_ELEMENT) 34 | qas_ELEMENT['answers'] = qas_ELEMENT_ANSWERS 35 | qas.append(qas_ELEMENT) 36 | 37 | paragraphs_ELEMENT['qas'] = qas 38 | paragraphs.append(paragraphs_ELEMENT) 39 | 40 | data_ELEMENT['paragraphs'] = paragraphs 41 | data.append(data_ELEMENT) 42 | 43 | squad_formatted_content['data'] = data 44 | 45 | return squad_formatted_content 46 | 47 | def load_vocab(vocab_file): 48 | voc = {} 49 | with open(vocab_file, 'r') as f_in: 50 | for line in f_in: 51 | word, _id = line.strip().split('\t') 52 | voc[word] = _id 53 | return voc 54 | 55 | def load_answers(answers_file, voc): 56 | #answers = context 57 | _list = ["None"] 58 | with open(answers_file, 'r') as f_in: 59 | for line in f_in: 60 | _, sent = line.strip().split('\t') 61 | _list.append(' '.join([voc[wid] for wid in sent.split(' ')])) 62 | return _list 63 | 64 | 65 | def load_questions(question_file, voc): 66 | questions = [] 67 | a_to_q_map = dict() 68 | x = dict() 69 | ground_truth, no_ground_truth = 0, 0 70 | with open(question_file, 'r') as f_in: 71 | for q_indx, line in enumerate(f_in): 72 | try: 73 | type, q = line.strip().split('\t') 74 | except ValueError: 75 | type, q, ids, pooled_answers = line.strip().split('\t') 76 | q = ' '.join([voc[wid] for wid in q.split(' ')]) 77 | questions.append(q) 78 | if type not in x: 79 | x[type] = 1 80 | else: 81 | x[type] = x[type] + 1 82 | 83 | if len([1 for gt in ids.split(' ') if gt in pooled_answers.split(' ')]) <= 0: 84 | no_ground_truth += 1 85 | else: 86 | ground_truth += 1 87 | for _id in ids.split(' '): 88 | a_id = int(_id)  # keys are int answer ids; the raw token is a string 89 | if a_id not in a_to_q_map: 90 | a_to_q_map[a_id] = [q_indx] 91 | else: 92 | # append the question index in place (list.append mutates the list and returns None) 93 | a_to_q_map[a_id].append(q_indx) 94 | print(x) 95 | print("Total items: {}".format(sum([v for k, v in x.items()]))) 96 | print('Ground Truth: {}, No Ground_Truth: {}'.format(ground_truth, no_ground_truth)) 97 | return questions, a_to_q_map -------------------------------------------------------------------------------- /ds_formatter/mctest.py: -------------------------------------------------------------------------------- 1 | def convert_to_squad(story_question_content): 2 | """ 3 | :param story_question_content: 4 | :param answer_content: 5 | :return: formatted SQUAD data 6 | At initial version, we are just focusing on the context and question, nothing more, 7 | therefore we are ignoring the answer part as of now 8 | """ 9 | # PARSE FILES 10 | 11 | squad_formatted_content = dict() 12 | squad_formatted_content['version'] = 'mctest_squad_format' 13 | data = [] 14 | #TODO: Each context has multiple questions and each row of the file has multiple questions in different columns (a block of 5 columns per question), we need to handle this.
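# Note on the MCTest TSV layout assumed by the indexing below (based on the mc160/mc500 files;
# treat this as an assumption rather than a guarantee for every release): column 0 is the story id,
# column 1 holds author/property metadata, column 2 is the story text, and each of the 4 questions
# occupies a block of 5 columns (question text followed by 4 answer options), so the question text
# sits at columns 3, 8, 13 and 18.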
15 | for datum in story_question_content.itertuples(index=False): 16 | # Format is deeply nested JSON -- prepare data structures 17 | data_ELEMENT = dict() 18 | data_ELEMENT['title'] = 'dummyTitle' 19 | 20 | paragraphs = [] 21 | paragraphs_ELEMENT = dict() 22 | 23 | superdocument = datum[2].replace('\\newline', '') 24 | paragraphs_ELEMENT['context'] = superdocument 25 | 26 | qas = [] 27 | # it has 4 questions in each context 28 | question_column_start_indx = 3 29 | question_size = 4 30 | for q_indx in range(question_size): 31 | qas_ELEMENT = dict() 32 | ANSWERS_ELEMENT = dict() 33 | qas_ELEMENT_ANSWERS = [] 34 | qas_ELEMENT['id'] = datum[0] + "." +str(q_indx) 35 | qas_ELEMENT['question'] = datum[q_indx + question_column_start_indx if q_indx < 1 else q_indx * 5 + 3].replace("one: ", "").replace("multiple: ", "") 36 | ANSWERS_ELEMENT['answer_start'] = -1 37 | ANSWERS_ELEMENT['text'] = 'dummyAnswer' 38 | qas_ELEMENT_ANSWERS.append(ANSWERS_ELEMENT) 39 | qas_ELEMENT['answers'] = qas_ELEMENT_ANSWERS 40 | qas.append(qas_ELEMENT) 41 | 42 | paragraphs_ELEMENT['qas'] = qas 43 | paragraphs.append(paragraphs_ELEMENT) 44 | 45 | data_ELEMENT['paragraphs'] = paragraphs 46 | data.append(data_ELEMENT) 47 | 48 | squad_formatted_content['data'] = data 49 | 50 | return squad_formatted_content -------------------------------------------------------------------------------- /ds_formatter/msmarco.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from tqdm import tqdm 4 | from sklearn.utils import shuffle 5 | 6 | # def convert_to_squad(story_question_content): 7 | # """ 8 | # :param story_question_content: 9 | # :return: formatted SQUAD data 10 | # At initial version, we are just focusing on the context and question, nothing more, 11 | # therefore we are ignoring the answer part as of now 12 | # """ 13 | # # PARSE FILES 14 | # 15 | # squad_formatted_content = dict() 16 | # squad_formatted_content['version'] = 'msmarco_squad_format' 17 | # data = [] 18 | # query = story_question_content['query'] 19 | # query_keys = query.keys() 20 | # passages = story_question_content['passages'] 21 | # 22 | # id_index = 0 23 | # for key in query_keys: 24 | # # Format is deeply nested JSON -- prepare data structures 25 | # data_ELEMENT = dict() 26 | # data_ELEMENT['title'] = 'dummyTitle' 27 | # paragraphs = [] 28 | # paragraphs_ELEMENT = dict() 29 | # qas = [] 30 | # qas_ELEMENT = dict() 31 | # qas_ELEMENT_ANSWERS = [] 32 | # ANSWERS_ELEMENT = dict() 33 | # 34 | # qas_ELEMENT['id'] = id_index 35 | # qas_ELEMENT['question'] = query[key] 36 | # id_index += 1 37 | # 38 | # superdocument = ' '.join([onePassage['passage_text'] for onePassage in passages[key]]) 39 | # 40 | # ANSWERS_ELEMENT['answer_start'] = -1 41 | # ANSWERS_ELEMENT['text'] = 'dummyAnswer' 42 | # 43 | # paragraphs_ELEMENT['context'] = superdocument 44 | # qas_ELEMENT_ANSWERS.append(ANSWERS_ELEMENT) 45 | # 46 | # qas_ELEMENT['answers'] = qas_ELEMENT_ANSWERS 47 | # qas.append(qas_ELEMENT) 48 | # 49 | # paragraphs_ELEMENT['qas'] = qas 50 | # paragraphs.append(paragraphs_ELEMENT) 51 | # 52 | # data_ELEMENT['paragraphs'] = paragraphs 53 | # data.append(data_ELEMENT) 54 | # 55 | # squad_formatted_content['data'] = data 56 | # 57 | # return squad_formatted_content 58 | def convert_to_squad(input_dict): 59 | """ 60 | :param story_question_content: 61 | :return: formatted SQUAD data 62 | At initial version, we are just focusing on the context and question, nothing more, 63 | therefore we are 
ignoring the answer part as of now. 64 | The code is to process train and development sets of MS-MARCO, since test(eval) set doesn't has answer information 65 | """ 66 | # PARSE FILES 67 | squad_formatted_content=None 68 | if input_dict['v'] <= 2.0: 69 | squad_formatted_content = convert_v2(input_dict) 70 | else: 71 | squad_formatted_content = convert_v21(input_dict) 72 | return squad_formatted_content 73 | 74 | def convert_v21(input_dict): 75 | squad_formatted_content = dict() 76 | squad_formatted_content['version'] = 'msmarco_v21_squad_format' 77 | data=[] 78 | all_data = assign_mapped_document(input_dict['queries'], input_dict['mappings'], input_dict['documents']) 79 | # if input_dict['limit'] != -1: 80 | # all_data = shuffle(all_data) 81 | 82 | all_data = all_data.groupby(['p_id', 'p_content']) 83 | iterator = tqdm(enumerate(all_data)) 84 | for i, pack in iterator: 85 | if input_dict['limit'] != -1 and i > input_dict['limit']: 86 | print('Data is prepared at the index of {}'.format(i)) 87 | iterator.close() 88 | break 89 | p, qs = pack[0], pack[1] 90 | data_ELEMENT = dict() 91 | data_ELEMENT['title'] = 'dummyTitle' 92 | paragraphs = [] 93 | paragraphs_ELEMENT = dict() 94 | superdocument = p[1] 95 | paragraphs_ELEMENT['context'] = superdocument 96 | qas = [] 97 | for q in qs.itertuples(): 98 | _q_indx, _q = q.q_id, q.q_content 99 | qas_ELEMENT = dict() 100 | ANSWERS_ELEMENT = dict() 101 | qas_ELEMENT_ANSWERS = [] 102 | qas_ELEMENT['id'] = _q_indx 103 | qas_ELEMENT['question'] = _q 104 | ANSWERS_ELEMENT['answer_start'] = -1 105 | ANSWERS_ELEMENT['text'] = 'dummyAnswer' 106 | qas_ELEMENT_ANSWERS.append(ANSWERS_ELEMENT) 107 | qas_ELEMENT['answers'] = qas_ELEMENT_ANSWERS 108 | qas.append(qas_ELEMENT) 109 | paragraphs_ELEMENT['qas'] = qas 110 | paragraphs.append(paragraphs_ELEMENT) 111 | 112 | data_ELEMENT['paragraphs'] = paragraphs 113 | data.append(data_ELEMENT) 114 | squad_formatted_content['data'] = data 115 | return squad_formatted_content 116 | def assign_mapped_document(queries, mappings, documents): 117 | # queries ids 118 | print('Shape of query is {}'.format(queries.shape)) 119 | queries_mask = np.isin(mappings['q_id'], queries['id']) 120 | mappings = mappings[queries_mask] 121 | queries_dict = pd.Series(queries["content"].values, index=queries['id']).to_dict() 122 | print('Len of query dict is {}'.format(len(queries_dict))) 123 | print('Shape of mapping is {}'.format(mappings.shape)) 124 | 125 | document_mask = np.isin(documents['id'], mappings['p_id']) 126 | documents = documents[document_mask] 127 | print('Shape of documents is {}'.format(documents.shape)) 128 | 129 | documents_dict = pd.Series(documents["content"].values, index=documents['id']).to_dict() 130 | print('Len of document dict is {}'.format(len(documents_dict))) 131 | 132 | 133 | mappings['q_content'] = mappings['q_id'].map(queries_dict) 134 | mappings['p_content'] = mappings['p_id'].map(documents_dict) 135 | print('Shape of new mapping is {}'.format(mappings.shape)) 136 | return mappings 137 | def convert_v2(input_dict): 138 | """ 139 | :param story_question_content: 140 | :return: formatted SQUAD data 141 | At initial version, we are just focusing on the context and question, nothing more, 142 | therefore we are ignoring the answer part as of now. 
143 | The code is to process train and development sets of MS-MARCO, since test(eval) set doesn't has answer information 144 | """ 145 | # PARSE FILES 146 | story_question_content = input_dict['story_question_content'] 147 | squad_formatted_content = dict() 148 | squad_formatted_content['version'] = 'msmarco_squad_format' 149 | data = [] 150 | query = story_question_content['query'] 151 | #key list consists of keys of queries with answers 152 | keys_with_answer = [x for x, y in story_question_content['answers'].items() if y[0] != 'No Answer Present.' and y[0] != ''] 153 | passages = story_question_content['passages'] 154 | 155 | for key in keys_with_answer: 156 | # Format is deeply nested JSON -- prepare data structures 157 | data_ELEMENT = dict() 158 | data_ELEMENT['title'] = 'dummyTitle' 159 | paragraphs = [] 160 | paragraphs_ELEMENT = dict() 161 | qas = [] 162 | qas_ELEMENT = dict() 163 | qas_ELEMENT_ANSWERS = [] 164 | ANSWERS_ELEMENT = dict() 165 | 166 | qas_ELEMENT['id'] = key 167 | qas_ELEMENT['question'] = query[key] 168 | 169 | #correct_context is a list 170 | correct_context= [x for x in passages[key] if x['is_selected'] == 1] 171 | #some query(question) has more than 1 correct contexts, we just pick the first one as the context 172 | if len(correct_context) == 0: 173 | continue 174 | superdocument = correct_context[0]['passage_text'] 175 | 176 | ANSWERS_ELEMENT['answer_start'] = -1 177 | ANSWERS_ELEMENT['text'] = 'dummyAnswer' 178 | 179 | paragraphs_ELEMENT['context'] = superdocument 180 | qas_ELEMENT_ANSWERS.append(ANSWERS_ELEMENT) 181 | 182 | qas_ELEMENT['answers'] = qas_ELEMENT_ANSWERS 183 | qas.append(qas_ELEMENT) 184 | 185 | paragraphs_ELEMENT['qas'] = qas 186 | paragraphs.append(paragraphs_ELEMENT) 187 | 188 | data_ELEMENT['paragraphs'] = paragraphs 189 | data.append(data_ELEMENT) 190 | 191 | squad_formatted_content['data'] = data 192 | 193 | return squad_formatted_content -------------------------------------------------------------------------------- /ds_formatter/narrativeqa.py: -------------------------------------------------------------------------------- 1 | def convert_to_squad(story_summary_content, question_content, set_type): 2 | """ 3 | :param story_summary_content: 4 | :param question_content: 5 | :param category_content: 6 | :param set_type: 7 | :return: formatted SQUAD data 8 | At initial version, we are just focusing on the context and question, nothing more, 9 | therefore we are ignoring the answer part as of now 10 | """ 11 | squad_formatted_content = dict() 12 | squad_formatted_content['version'] = 'narrativeqa_squad_format' 13 | data = [] 14 | content = story_summary_content 15 | if set_type != 'all': 16 | content = story_summary_content[story_summary_content['set'] == set_type] 17 | 18 | for datum in content.itertuples(index=False): 19 | #print(datum.summary) 20 | data_ELEMENT = dict() 21 | data_ELEMENT['title'] = 'dummyTitle' 22 | 23 | paragraphs = [] 24 | paragraphs_ELEMENT = dict() 25 | 26 | superdocument = datum.summary 27 | paragraphs_ELEMENT['context'] = superdocument 28 | 29 | qas = [] 30 | sub_datum = question_content[question_content['document_id'] == datum.document_id] 31 | for q_datum in sub_datum.itertuples(): 32 | # print(indx) 33 | #print(q_datum) 34 | qas_ELEMENT = dict() 35 | ANSWERS_ELEMENT = dict() 36 | qas_ELEMENT_ANSWERS = [] 37 | qas_ELEMENT['id'] = q_datum.document_id + '-' + str(q_datum.Index) 38 | qas_ELEMENT['question'] = q_datum.question 39 | ANSWERS_ELEMENT['answer_start'] = -1 40 | ANSWERS_ELEMENT['text'] = 'dummyAnswer' 41 | 
qas_ELEMENT_ANSWERS.append(ANSWERS_ELEMENT) 42 | qas_ELEMENT['answers'] = qas_ELEMENT_ANSWERS 43 | qas.append(qas_ELEMENT) 44 | 45 | paragraphs_ELEMENT['qas'] = qas 46 | paragraphs.append(paragraphs_ELEMENT) 47 | 48 | data_ELEMENT['paragraphs'] = paragraphs 49 | data.append(data_ELEMENT) 50 | squad_formatted_content['data'] = data 51 | 52 | return squad_formatted_content -------------------------------------------------------------------------------- /ds_formatter/qangaroo.py: -------------------------------------------------------------------------------- 1 | def convert_to_squad(source_data): 2 | """ 3 | Converts QAngaroo data (hoppy_data) into SQuAD format. 4 | The SQuAD-formatted data is written to disk at write_file_name. 5 | Note: All given support documents per example are concatenated 6 | into one super-document. All text is lowercased. 7 | """ 8 | squad_formatted_content = dict() 9 | squad_formatted_content['version'] = 'hoppy_squad_format' 10 | data = [] 11 | 12 | 13 | for datum in source_data: 14 | 15 | # Format is deeply nested JSON -- prepare data structures 16 | data_ELEMENT = dict() 17 | data_ELEMENT['title'] = 'dummyTitle' 18 | paragraphs = [] 19 | paragraphs_ELEMENT = dict() 20 | qas = [] 21 | qas_ELEMENT = dict() 22 | qas_ELEMENT_ANSWERS = [] 23 | ANSWERS_ELEMENT = dict() 24 | 25 | qas_ELEMENT['id'] = datum['id'] 26 | qas_ELEMENT['question'] = datum['query'] 27 | 28 | superdocument = " ".join(datum['supports']) 29 | 30 | answer_position = superdocument.find(datum['answer']) 31 | if answer_position == -1: 32 | continue 33 | 34 | ANSWERS_ELEMENT['answer_start'] = answer_position 35 | ANSWERS_ELEMENT['text'] = datum['answer'] 36 | 37 | paragraphs_ELEMENT['context'] = superdocument 38 | qas_ELEMENT_ANSWERS.append(ANSWERS_ELEMENT) 39 | 40 | qas_ELEMENT['answers'] = qas_ELEMENT_ANSWERS 41 | qas.append(qas_ELEMENT) 42 | 43 | paragraphs_ELEMENT['qas'] = qas 44 | paragraphs.append(paragraphs_ELEMENT) 45 | 46 | data_ELEMENT['paragraphs'] = paragraphs 47 | data.append(data_ELEMENT) 48 | 49 | squad_formatted_content['data'] = data 50 | 51 | return squad_formatted_content -------------------------------------------------------------------------------- /ds_formatter/quasar.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from tqdm import tqdm 4 | from sklearn.utils import shuffle 5 | 6 | def convert_to_squad(queries, documents, is_null_tags_filter, limit): 7 | squad_formatted_content = dict() 8 | squad_formatted_content['version'] = 'quasar-t_squad_format' 9 | data=[] 10 | pairs = create_pairs(zip(queries, documents), is_null_tags_filter) 11 | # if limit != -1: 12 | # pairs = shuffle(pairs) 13 | 14 | pairs = pairs.groupby(['p_id', 'p_content']) 15 | iterator = tqdm(enumerate(pairs)) 16 | for i, pack in iterator: 17 | if limit != -1 and i > limit: 18 | print('Data is prepared at the index of {}'.format(i)) 19 | iterator.close() 20 | break 21 | p, qs = pack[0], pack[1] 22 | data_ELEMENT = dict() 23 | data_ELEMENT['title'] = 'dummyTitle' 24 | paragraphs = [] 25 | paragraphs_ELEMENT = dict() 26 | superdocument = p[1] 27 | paragraphs_ELEMENT['context'] = superdocument 28 | qas = [] 29 | for q in qs.itertuples(): 30 | _q_indx, _q = q.q_id, q.q_content 31 | qas_ELEMENT = dict() 32 | ANSWERS_ELEMENT = dict() 33 | qas_ELEMENT_ANSWERS = [] 34 | qas_ELEMENT['id'] = _q_indx 35 | qas_ELEMENT['question'] = _q 36 | ANSWERS_ELEMENT['answer_start'] = -1 37 | ANSWERS_ELEMENT['text'] = 'dummyAnswer' 38 | 
qas_ELEMENT_ANSWERS.append(ANSWERS_ELEMENT) 39 | qas_ELEMENT['answers'] = qas_ELEMENT_ANSWERS 40 | qas.append(qas_ELEMENT) 41 | paragraphs_ELEMENT['qas'] = qas 42 | paragraphs.append(paragraphs_ELEMENT) 43 | 44 | data_ELEMENT['paragraphs'] = paragraphs 45 | data.append(data_ELEMENT) 46 | squad_formatted_content['data'] = data 47 | return squad_formatted_content 48 | 49 | def create_pairs(query_document_pair, is_null_tags_filter): 50 | pairs = [] 51 | generator = enumerate(query_document_pair) 52 | for i, pair in generator: 53 | query, context = pair[0], pair[1] 54 | if is_null_tags_filter.lower() in ['true', 'True', 'TRUE']: 55 | if len(query['tags']) == 0: 56 | continue 57 | if query['uid'] != context['uid']: 58 | print(20 * '!') 59 | print('Query {} - Document {} is mismatched.'.format(query['uid'],context['uid'])) 60 | pairs.append((query['uid'], query['question'], i, context['contexts'][0][1])) 61 | return pd.DataFrame(pairs, columns=['q_id', 'q_content', 'p_id', 'p_content']) 62 | -------------------------------------------------------------------------------- /ds_formatter/squad.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import util as UTIL 3 | import pandas as pd 4 | from collections import Counter 5 | import matplotlib.pyplot as plt 6 | #from random import shuffle,random 7 | import os 8 | import random 9 | def convert_idx(text, tokens): 10 | current = 0 11 | spans = [] 12 | for token in tokens: 13 | current = text.find(token, current) 14 | if current < 0: 15 | print("Token {} cannot be found".format(token)) 16 | raise Exception() 17 | spans.append((current, current + len(token))) 18 | current += len(token) 19 | return spans 20 | 21 | def process_squad_file(data, word_counter, char_counter): 22 | print("Generating examples...") 23 | examples = [] 24 | eval_examples = {} 25 | total,_i_para = 0, 0 26 | questions = [] 27 | paragraphs = [] 28 | question_to_paragraph = [] 29 | for article in tqdm(data["data"]): 30 | title = article["title"] 31 | for para in article["paragraphs"]: 32 | context = para["context"].replace( 33 | "''", '" ').replace("``", '" ') 34 | paragraphs.append(context) 35 | context_tokens = UTIL.word_tokenize(context) 36 | context_chars = [list(token) for token in context_tokens] 37 | spans = convert_idx(context, context_tokens) 38 | for token in context_tokens: 39 | word_counter[token] += len(para["qas"]) 40 | for char in token: 41 | char_counter[char] += len(para["qas"]) 42 | for qa in para["qas"]: 43 | total += 1 44 | ques = qa["question"].replace( 45 | "''", '" ').replace("``", '" ') 46 | questions.append(ques) 47 | question_to_paragraph.append(_i_para) 48 | ques_tokens = UTIL.word_tokenize(ques) 49 | ques_chars = [list(token) for token in ques_tokens] 50 | for token in ques_tokens: 51 | word_counter[token] += 1 52 | for char in token: 53 | char_counter[char] += 1 54 | y1s, y2s = [], [] 55 | answer_texts = [] 56 | for answer in qa["answers"]: 57 | answer_text = answer["text"] 58 | answer_start = answer['answer_start'] 59 | answer_end = answer_start + len(answer_text) 60 | answer_texts.append(answer_text) 61 | answer_span = [] 62 | for idx, span in enumerate(spans): 63 | if not (answer_end <= span[0] or answer_start >= span[1]): 64 | answer_span.append(idx) 65 | y1, y2 = answer_span[0], answer_span[-1] 66 | y1s.append(y1) 67 | y2s.append(y2) 68 | example = {"context_tokens": context_tokens, "context_chars": context_chars, "ques_tokens": ques_tokens, 69 | "ques_chars": ques_chars, "y1s": y1s, 
"y2s": y2s, "id": total} 70 | examples.append(example) 71 | eval_examples[str(total)] = { 72 | "context": context, "spans": spans, 'ques': ques,"answers": answer_texts, "uuid": qa["id"], 'title': title} 73 | _i_para += 1 74 | print("{} questions in total".format(len(examples))) 75 | return examples, eval_examples, questions, paragraphs, question_to_paragraph 76 | def tokenize_contexts(contexts:list, max_tokens=-1): 77 | tokenized_context = [UTIL.word_tokenize(context.strip()) if max_tokens == -1 else UTIL.word_tokenize(context.strip())[0:max_tokens]for context in contexts] 78 | return tokenized_context 79 | 80 | def fixing_the_token_problem(tokenized_questions, tokenized_paragraphs): 81 | # fixing the '' problem: 82 | fixed_tokenized_question = [] 83 | for indx, question in enumerate(tokenized_questions): 84 | tokens = [] 85 | for token in question: 86 | t = token.strip() 87 | if t != "": 88 | tokens.append(t) 89 | fixed_tokenized_question.append(tokens) 90 | 91 | fixed_tokenized_paragraph = [] 92 | for indx, paragraph in enumerate(tokenized_paragraphs): 93 | tokens = [] 94 | for token in paragraph: 95 | t = token.strip() 96 | if t != "": 97 | tokens.append(t) 98 | fixed_tokenized_paragraph.append(tokens) 99 | return fixed_tokenized_question, fixed_tokenized_paragraph 100 | 101 | def yield_to_matchzoo(question_answer_content, q_len, negative_sampling_count=100, max_tokens=-1): 102 | """ 103 | :param question_answer_document content: 104 | :return: yield matchzoo data 105 | At initial version, we are just focusing on the context and question, nothing more, 106 | therefore we are ignoring the answer part as of now 107 | """ 108 | word_counter, char_counter = Counter(), Counter() 109 | examples, eval, questions, paragraphs, q_to_ps = process_squad_file(question_answer_content, word_counter, char_counter) 110 | tokenized_paragraphs = tokenize_contexts(paragraphs, max_tokens) 111 | tokenized_questions = tokenize_contexts(questions, max_tokens) 112 | tokenized_questions, tokenized_paragraphs = fixing_the_token_problem(tokenized_questions, tokenized_paragraphs) 113 | 114 | paragraphs_nontokenized = [" ".join(context) for context in tokenized_paragraphs] 115 | questions_nontokenized = [" ".join(context) for context in tokenized_questions] 116 | 117 | for q_indx, question in enumerate(tqdm(questions_nontokenized[0:q_len])): 118 | true_p_indx = q_to_ps[q_indx] 119 | true_paragraph = paragraphs_nontokenized[true_p_indx] 120 | temp_list = paragraphs_nontokenized.copy() 121 | del temp_list[true_p_indx] 122 | random.Random(q_indx).shuffle(temp_list) 123 | for p_indx, paragraph in enumerate([true_paragraph] + temp_list[:negative_sampling_count-1]): 124 | yield '\t'.join(['1' if p_indx == 0 else '0', question, paragraph]) 125 | def convert_to_lucene(question_answer_content, doc_type_verbose, source_path): 126 | """ 127 | :param question_answer_document content: 128 | :return: yield matchzoo data 129 | At initial version, we are just focusing on the context and question, nothing more, 130 | therefore we are ignoring the answer part as of now 131 | """ 132 | word_counter, char_counter = Counter(), Counter() 133 | examples, eval, questions, paragraphs, q_to_ps = process_squad_file(question_answer_content, word_counter, char_counter) 134 | tokenized_paragraphs = tokenize_contexts(paragraphs, -1) 135 | tokenized_questions = tokenize_contexts(questions, -1) 136 | tokenized_questions, tokenized_paragraphs = fixing_the_token_problem(tokenized_questions, tokenized_paragraphs) 137 | 138 | paragraphs_nontokenized = 
[" ".join(context) for context in tokenized_paragraphs] 139 | questions_nontokenized = [" ".join(context) for context in tokenized_questions] 140 | 141 | if doc_type_verbose == 1 or doc_type_verbose == 3: 142 | # questions 143 | print('Questions are getting dumped.') 144 | dst_dir = UTIL.create_dir(os.path.join(source_path, 'lucene_questions')) 145 | for indx, doc in tqdm(enumerate(questions_nontokenized)): 146 | as_json = dict() 147 | as_json['content'] = doc 148 | #as_json['doc_id'] = indx 149 | UTIL.dump_json_file(os.path.join(dst_dir, '{}.json'.format(indx)), as_json, None) 150 | elif doc_type_verbose == 2 or doc_type_verbose == 3: 151 | print('Paragraphs are getting dumped.') 152 | dst_dir = UTIL.create_dir(os.path.join(source_path, 'lucene_paragraphs')) 153 | for indx, doc in tqdm(enumerate(paragraphs_nontokenized)): 154 | as_json = dict() 155 | as_json['content'] = doc 156 | #as_json['doc_id'] = indx 157 | UTIL.dump_json_file(os.path.join(dst_dir, '{}.json'.format(indx)), as_json, None) 158 | print('Completed.') 159 | 160 | def print_statistics(question_answer_content, is_histogram, histogram_bin, document_type): 161 | word_counter, char_counter = Counter(), Counter() 162 | examples, eval, questions, paragraphs, q_to_ps = process_squad_file(question_answer_content, word_counter, 163 | char_counter) 164 | tokenized_paragraphs = tokenize_contexts(paragraphs, -1) 165 | tokenized_questions = tokenize_contexts(questions, -1) 166 | tokenized_questions, tokenized_paragraphs = fixing_the_token_problem(tokenized_questions, tokenized_paragraphs) 167 | 168 | paragraphs_nontokenized = [" ".join(context) for context in tokenized_paragraphs] 169 | questions_nontokenized = [" ".join(context) for context in tokenized_questions] 170 | 171 | data = [] 172 | corpus = [] 173 | if document_type in [1, 3]: 174 | corpus = corpus + tokenized_questions 175 | if document_type in [2, 3]: 176 | corpus = corpus + tokenized_paragraphs 177 | for doc in corpus: 178 | data.append(len(doc)) 179 | 180 | if is_histogram.lower() in ['true', 'True', 'TRUE']: 181 | data_df = pd.DataFrame(data, columns=['doc_len']) 182 | data_df.hist(bins=histogram_bin) 183 | plt.show() 184 | 185 | 186 | 187 | def convert_to_short_squad(question_answer_content, q_len, negative_sampling_count, max_tokens=-1): 188 | word_counter, char_counter = Counter(), Counter() 189 | examples, eval, questions, paragraphs, q_to_ps = process_squad_file(question_answer_content, word_counter, 190 | char_counter) 191 | tokenized_paragraphs = tokenize_contexts(paragraphs, max_tokens) 192 | tokenized_questions = tokenize_contexts(questions, max_tokens) 193 | tokenized_questions, tokenized_paragraphs = fixing_the_token_problem(tokenized_questions, tokenized_paragraphs) 194 | 195 | paragraphs_nontokenized = [" ".join(context) for context in tokenized_paragraphs] 196 | questions_nontokenized = [" ".join(context) for context in tokenized_questions] 197 | 198 | squad_formatted_content = dict() 199 | squad_formatted_content['version'] = 'short_squad_format' 200 | data = [] 201 | 202 | last_paragraph_indx = None 203 | questions = [] 204 | 205 | for q_indx, question in enumerate(tqdm(questions_nontokenized[0:q_len])): 206 | if len(data) > negative_sampling_count: 207 | break 208 | if last_paragraph_indx is None: 209 | last_paragraph_indx = q_to_ps[q_indx] 210 | #qs = [i for i in q_to_ps if i == 0] 211 | current_paragraph_indx = q_to_ps[q_indx] 212 | 213 | if current_paragraph_indx != last_paragraph_indx: 214 | data_ELEMENT = dict() 215 | data_ELEMENT['title'] = 
'dummyTitle' 216 | paragraphs = [] 217 | paragraphs_ELEMENT = dict() 218 | superdocument = paragraphs_nontokenized[last_paragraph_indx] 219 | paragraphs_ELEMENT['context'] = superdocument 220 | qas = [] 221 | for _q_item in questions: 222 | _q_indx, _q = _q_item[0], _q_item[1] 223 | qas_ELEMENT = dict() 224 | ANSWERS_ELEMENT = dict() 225 | qas_ELEMENT_ANSWERS = [] 226 | qas_ELEMENT['id'] = _q_indx 227 | qas_ELEMENT['question'] = _q 228 | ANSWERS_ELEMENT['answer_start'] = -1 229 | ANSWERS_ELEMENT['text'] = 'dummyAnswer' 230 | qas_ELEMENT_ANSWERS.append(ANSWERS_ELEMENT) 231 | qas_ELEMENT['answers'] = qas_ELEMENT_ANSWERS 232 | qas.append(qas_ELEMENT) 233 | paragraphs_ELEMENT['qas'] = qas 234 | paragraphs.append(paragraphs_ELEMENT) 235 | 236 | data_ELEMENT['paragraphs'] = paragraphs 237 | data.append(data_ELEMENT) 238 | questions = [] 239 | last_paragraph_indx = current_paragraph_indx 240 | questions.append((q_indx, question)) 241 | squad_formatted_content['data'] = data 242 | return squad_formatted_content -------------------------------------------------------------------------------- /ds_formatter/triviaqa.py: -------------------------------------------------------------------------------- 1 | import random 2 | from tqdm import tqdm 3 | import nltk 4 | sent_tokenize = nltk.data.load("tokenizers/punkt/english.pickle") 5 | import sys 6 | import os 7 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 8 | import util as UTIL 9 | 10 | def convert_to_squad_format(qa_content, wikipedia_dir, web_dir, sample_size, seed, max_num_of_tokens): 11 | qa_json = read_triviaqa_data(qa_content) 12 | qad_triples = get_qad_triples(qa_json) 13 | 14 | random.seed(int(seed)) 15 | random.shuffle(qad_triples) 16 | 17 | data = [] 18 | for qad in tqdm(qad_triples): 19 | qid = qad['QuestionId'] 20 | 21 | text = get_text(qad, qad['Source'], web_dir, wikipedia_dir) 22 | selected_text = select_relevant_portion(text, int(max_num_of_tokens)) 23 | 24 | question = qad['Question'] 25 | para = {'context': selected_text, 'qas': [{'question': question, 'answers': []}]} 26 | data.append({'paragraphs': [para]}) 27 | qa = para['qas'][0] 28 | qa['id'] = get_question_doc_string(qid, qad['Filename']) 29 | qa['qid'] = qid 30 | 31 | ans_string, index = answer_index_in_document(qad['Answer'], selected_text) 32 | if index == -1: 33 | if qa_json['Split'] == 'train': 34 | continue 35 | else: 36 | qa['answers'].append({'text': ans_string, 'answer_start': index}) 37 | 38 | if qa_json['Split'] == 'train' and len(data) >= int(sample_size) and qa_json['Domain'] == 'Web': 39 | break 40 | 41 | squad = {'data': data, 'version': qa_json['Version']} 42 | return squad 43 | 44 | 45 | def read_triviaqa_data(trivia_content): 46 | data = trivia_content #UTIL.load_json_file(file_path=qajson, logging=None) 47 | # read only documents and questions that are a part of clean data set 48 | if data['VerifiedEval']: 49 | clean_data = [] 50 | for datum in data['Data']: 51 | if datum['QuestionPartOfVerifiedEval']: 52 | if data['Domain'] == 'Web': 53 | datum = read_clean_part(datum) 54 | clean_data.append(datum) 55 | data['Data'] = clean_data 56 | return data 57 | 58 | def read_clean_part(datum): 59 | for key in ['EntityPages', 'SearchResults']: 60 | new_page_list = [] 61 | for page in datum.get(key, []): 62 | if page['DocPartOfVerifiedEval']: 63 | new_page_list.append(page) 64 | datum[key] = new_page_list 65 | assert len(datum['EntityPages']) + len(datum['SearchResults']) > 0 66 | return datum 67 | 68 | # Key for wikipedia eval is question-id. 
Key for web eval is the (question_id, filename) tuple 69 | def get_key_to_ground_truth(data): 70 | if data['Domain'] == 'Wikipedia': 71 | return {datum['QuestionId']: datum['Answer'] for datum in data['Data']} 72 | else: 73 | return get_qd_to_answer(data) 74 | 75 | 76 | def get_question_doc_string(qid, doc_name): 77 | return '{}--{}'.format(qid, doc_name) 78 | 79 | def get_qd_to_answer(data): 80 | key_to_answer = {} 81 | for datum in data['Data']: 82 | for page in datum.get('EntityPages', []) + datum.get('SearchResults', []): 83 | qd_tuple = get_question_doc_string(datum['QuestionId'], page['Filename']) 84 | key_to_answer[qd_tuple] = datum['Answer'] 85 | return key_to_answer 86 | 87 | def answer_index_in_document(answer, document): 88 | answer_list = answer['NormalizedAliases'] 89 | for answer_string_in_doc in answer_list: 90 | index = document.lower().find(answer_string_in_doc) 91 | if index != -1: 92 | return answer_string_in_doc, index 93 | return answer['NormalizedValue'], -1 94 | 95 | def get_text(qad, domain, web_dir, wikipedia_dir): 96 | local_file = os.path.join(web_dir, qad['Filename']) if domain == 'SearchResults' else os.path.join(wikipedia_dir, qad['Filename']) 97 | return UTIL.get_file_contents(local_file, encoding='utf-8') 98 | 99 | 100 | def select_relevant_portion(text, max_num_tokens): 101 | paras = text.split('\n') 102 | selected = [] 103 | done = False 104 | for para in paras: 105 | sents = sent_tokenize.tokenize(para) 106 | for sent in sents: 107 | words = nltk.word_tokenize(sent) 108 | for word in words: 109 | selected.append(word) 110 | if len(selected) >= max_num_tokens: 111 | done = True 112 | break 113 | if done: 114 | break 115 | if done: 116 | break 117 | selected.append('\n') 118 | st = ' '.join(selected).strip() 119 | return st 120 | 121 | 122 | def add_triple_data(datum, page, domain): 123 | qad = {'Source': domain} 124 | for key in ['QuestionId', 'Question', 'Answer']: 125 | qad[key] = datum[key] 126 | for key in page: 127 | qad[key] = page[key] 128 | return qad 129 | 130 | 131 | def get_qad_triples(data): 132 | qad_triples = [] 133 | for datum in data['Data']: 134 | for key in ['EntityPages', 'SearchResults']: 135 | for page in datum.get(key, []): 136 | qad = add_triple_data(datum, page, key) 137 | qad_triples.append(qad) 138 | return qad_triples -------------------------------------------------------------------------------- /ds_formatter/ubuntudialogue.py: -------------------------------------------------------------------------------- 1 | def convert_to_squad(story_question_content): 2 | """ 3 | :param story_question_content: 4 | :return: formatted SQUAD data 5 | At initial version, we are just focusing on the context and question, nothing more, 6 | therefore we are ignoring the answer part as of now 7 | """ 8 | # PARSE FILES 9 | 10 | squad_formatted_content = dict() 11 | squad_formatted_content['version'] = 'ubuntudialogue_squad_format' 12 | data = [] 13 | 14 | df = story_question_content.values 15 | # print(df.shape) 16 | id_index = 0 17 | # for valid and test dataset 18 | if(df.shape[1] == 11): 19 | for datum in df: 20 | data_ELEMENT = dict() 21 | data_ELEMENT['title'] = 'dummyTitle' 22 | paragraphs = [] 23 | paragraphs_ELEMENT = dict() 24 | qas = [] 25 | qas_ELEMENT = dict() 26 | qas_ELEMENT_ANSWERS = [] 27 | ANSWERS_ELEMENT = dict() 28 | 29 | qas_ELEMENT['id'] = id_index 30 | id_index += 1 31 | qas_ELEMENT['question'] = datum[1].replace("__eou__ __eot__", ".").replace("__eou__", ".").replace("__eot__", ".") 32 | 33 | superdocument = 
datum[0].replace("__eou__ __eot__", ".").replace("__eou__", ".").replace("__eot__", ".") 34 | 35 | ANSWERS_ELEMENT['answer_start'] = -1 36 | ANSWERS_ELEMENT['text'] = 'dummyAnswer' 37 | 38 | paragraphs_ELEMENT['context'] = superdocument 39 | qas_ELEMENT_ANSWERS.append(ANSWERS_ELEMENT) 40 | 41 | qas_ELEMENT['answers'] = qas_ELEMENT_ANSWERS 42 | qas.append(qas_ELEMENT) 43 | 44 | paragraphs_ELEMENT['qas'] = qas 45 | paragraphs.append(paragraphs_ELEMENT) 46 | 47 | data_ELEMENT['paragraphs'] = paragraphs 48 | data.append(data_ELEMENT) 49 | elif(df.shape[1] == 3): #for train set 50 | true_response = [x for x in df if x[2] == 1] 51 | for datum in true_response: 52 | data_ELEMENT = dict() 53 | data_ELEMENT['title'] = 'dummyTitle' 54 | paragraphs = [] 55 | paragraphs_ELEMENT = dict() 56 | qas = [] 57 | qas_ELEMENT = dict() 58 | qas_ELEMENT_ANSWERS = [] 59 | ANSWERS_ELEMENT = dict() 60 | 61 | qas_ELEMENT['id'] = id_index 62 | id_index += 1 63 | qas_ELEMENT['question'] = datum[1].replace("__eou__ __eot__", ".").replace("__eou__", ".").replace("__eot__", ".") 64 | 65 | superdocument = datum[0].replace("__eou__ __eot__", ".").replace("__eou__", ".").replace("__eot__", ".") 66 | 67 | ANSWERS_ELEMENT['answer_start'] = -1 68 | ANSWERS_ELEMENT['text'] = 'dummyAnswer' 69 | 70 | paragraphs_ELEMENT['context'] = superdocument 71 | qas_ELEMENT_ANSWERS.append(ANSWERS_ELEMENT) 72 | 73 | qas_ELEMENT['answers'] = qas_ELEMENT_ANSWERS 74 | qas.append(qas_ELEMENT) 75 | 76 | paragraphs_ELEMENT['qas'] = qas 77 | paragraphs.append(paragraphs_ELEMENT) 78 | 79 | data_ELEMENT['paragraphs'] = paragraphs 80 | data.append(data_ELEMENT) 81 | 82 | squad_formatted_content['data'] = data 83 | 84 | return squad_formatted_content -------------------------------------------------------------------------------- /ds_formatter/wikiqa.py: -------------------------------------------------------------------------------- 1 | def convert_to_squad(story_question_content): 2 | """ 3 | :param story_question_content:: 4 | :return: formatted SQUAD data 5 | At initial version, we are just focusing on the context and question, nothing more, 6 | therefore we are ignoring the answer part as of now 7 | """ 8 | squad_formatted_content = dict() 9 | squad_formatted_content['version'] = 'wikiqa_squad_format' 10 | data = [] 11 | grouped = story_question_content.groupby(['QuestionID']) 12 | for q, datum in grouped: 13 | 14 | datum = datum.loc[datum['Label'].isin([1])] 15 | #anyRelatedLabel = sum(datum['Label']) 16 | if len(datum) > 0: 17 | 18 | # Format is deeply nested JSON -- prepare data structures 19 | data_ELEMENT = dict() 20 | data_ELEMENT['title'] = datum.iloc[0]['DocumentTitle'] 21 | paragraphs = [] 22 | paragraphs_ELEMENT = dict() 23 | qas = [] 24 | qas_ELEMENT = dict() 25 | qas_ELEMENT_ANSWERS = [] 26 | ANSWERS_ELEMENT = dict() 27 | 28 | qas_ELEMENT['id'] = q 29 | qas_ELEMENT['question'] = datum.iloc[0]['Question'] 30 | 31 | superdocument = ' '.join(datum['Sentence'].tolist()) 32 | 33 | ANSWERS_ELEMENT['answer_start'] = -1 34 | ANSWERS_ELEMENT['text'] = 'dummyAnswer' 35 | 36 | paragraphs_ELEMENT['context'] = superdocument 37 | qas_ELEMENT_ANSWERS.append(ANSWERS_ELEMENT) 38 | 39 | qas_ELEMENT['answers'] = qas_ELEMENT_ANSWERS 40 | qas.append(qas_ELEMENT) 41 | 42 | paragraphs_ELEMENT['qas'] = qas 43 | paragraphs.append(paragraphs_ELEMENT) 44 | 45 | data_ELEMENT['paragraphs'] = paragraphs 46 | data.append(data_ELEMENT) 47 | 48 | squad_formatted_content['data'] = data 49 | 50 | return squad_formatted_content 
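# A minimal usage sketch (not part of the original module, added for illustration): it assumes the
# WikiQA TSV has already been loaded into a pandas DataFrame with at least the QuestionID, Question,
# DocumentTitle, Sentence and Label columns that convert_to_squad() above groups and filters on.
# The two toy rows below are made up purely for this example.
if __name__ == '__main__':
    import json
    import pandas as pd

    toy_rows = [
        {'QuestionID': 'Q1', 'Question': 'what is glass made of', 'DocumentTitle': 'Glass',
         'Sentence': 'Glass is made mostly of silica.', 'Label': 1},
        {'QuestionID': 'Q1', 'Question': 'what is glass made of', 'DocumentTitle': 'Glass',
         'Sentence': 'Glass has been used since antiquity.', 'Label': 0},
    ]
    squad_like = convert_to_squad(pd.DataFrame(toy_rows))
    # prints a SQuAD-style dict whose single paragraph is built from the Label == 1 sentence(s)
    print(json.dumps(squad_like, indent=1))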
-------------------------------------------------------------------------------- /executor.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import argparse 3 | import util as UTIL 4 | import sys 5 | import os 6 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 7 | from ds_formatter import qangaroo, mctest, insuranceqa, triviaqa, wikiqa, narrativeqa, msmarco, ubuntudialogue, cnnnews, squad, quasar 8 | 9 | 10 | def get_parser(): 11 | parser = argparse.ArgumentParser() 12 | 13 | parser.add_argument('--log_path',help="path to the log file") 14 | parser.add_argument('--log_info',default="INFO", help="logging level") 15 | parser.add_argument('--data_path', help="path to the source file to be converted") 16 | parser.add_argument('--from_files', help="addition/supporting files that are in the same path as source, could be coma-seperated ',', file type can be also identified with ':'. Ex: 'voc:vocabulary.txt, 'answer:answer.txt'") 17 | parser.add_argument('--from_format', help="dataset name of the source format") 18 | parser.add_argument('--to_format', help="dataset name of the destination format") 19 | parser.add_argument('--to_file_name', help="destination file name") 20 | return parser 21 | 22 | def main(args): 23 | try: 24 | logging.info('(function {}) Started'.format(main.__name__)) 25 | 26 | source_files = UTIL.parse_source_files(args.data_path, args.from_files, logging) 27 | source_file = source_files['source'] 28 | destination_file = os.path.join(args.data_path, args.from_format.lower() + '_to_' + args.to_format.lower() + '_'+args.to_file_name) 29 | 30 | # TODO: 1) We need to create a interface class to have the same signature for all the formatters in ds_formatter folder. 31 | # TODO: 2) We need to create a generic approach to convert any type to any type not only any type to squad. 32 | # TODO: 3) can we have better approach to handle the following if/else scenarios 33 | # TODO: 4) we may also put some kind of field wrapper to handle whether which fields are gonna be filled with dummy and which fields are gonna be filled with real values. 
34 | if args.from_format.lower() == 'qangaroo' and args.to_format.lower() == 'squad' : 35 | """ 36 | --log_path="~/log.log" 37 | --data_path="~/data/qangaroo_v1.1/wikihop" 38 | --from_files="source:dev.json" 39 | --from_format="qangaroo" 40 | --to_format="squad" 41 | --to_file_name="dev.json" #it is gonna be renamed as "[from_to]_filename.what" 42 | """ 43 | in_content = UTIL.load_json_file(source_file, logging) 44 | formatted_content = qangaroo.convert_to_squad(in_content) 45 | UTIL.dump_json_file(destination_file, formatted_content, logging) 46 | 47 | elif args.from_format.lower() == 'mctest' and args.to_format.lower() == 'squad': 48 | """ 49 | --log_path="~/log.log" 50 | --data_path="~/data/" 51 | --from_files="source:mc160.dev.tsv" 52 | --from_format="mctest" 53 | --to_format="squad" 54 | --to_file_name="mc160.dev.json" #it is gonna be renamed as "[from_to]_filename.what" 55 | """ 56 | 57 | 58 | story_question_content = UTIL.load_csv_file(source_file,"\t", None, logging) 59 | #answer_content = UTIL.load_csv_file(additional_files['answer'], "\t", None, logging) 60 | formatted_content = mctest.convert_to_squad(story_question_content) 61 | UTIL.dump_json_file(destination_file, formatted_content, logging) 62 | 63 | elif args.from_format.lower() == 'insuranceqa' and args.to_format.lower() == 'squad': 64 | """ 65 | --log_path="~/log.log" 66 | --data_path="~/data/insuranceqa_v2" 67 | --from_files="source:InsuranceQA.question.anslabel.token.1500.pool.solr.test.encoded,voc:vocabulary.txt,answer:InsuranceQA.label2answer.token.encoded" 68 | --from_format="insuranceqa" 69 | --to_format="squad" 70 | --to_file_name="1500.test.json" 71 | """ 72 | 73 | voc = insuranceqa.load_vocab(source_files['voc']) 74 | questions, a_to_q_map = insuranceqa.load_questions(source_file, voc) 75 | answers = insuranceqa.load_answers(source_files['answer'], voc) 76 | formatted_content = insuranceqa.convert_to_squad(questions, answers, a_to_q_map) 77 | UTIL.dump_json_file(destination_file, formatted_content, logging) 78 | 79 | elif args.from_format.lower() == 'triviaqa' and args.to_format.lower() == 'squad': 80 | """ 81 | --log_path="~/log.log" 82 | --data_path="~/data/triviaqa/" 83 | --from_files=""source:qa/wikipedia-train.json, wikipedia:evidence/wikipedia,web:evidence/web,seed:10,token_size:2000,sample_size:1000000" 84 | --from_format="triviaqa" 85 | --to_format="squad" 86 | --to_file_name="wikipedia-train-long.json" 87 | """ 88 | 89 | wiki = source_files['wikipedia'] 90 | web = source_files['web'] 91 | seed = source_files['seed'] 92 | max_num_of_tokens = source_files['token_size'] 93 | sample_size = source_files['sample_size'] 94 | qa_file = UTIL.load_json_file(source_file, logging) 95 | formatted_content = triviaqa.convert_to_squad_format(qa_file, wiki, web, sample_size, seed, max_num_of_tokens) 96 | UTIL.dump_json_file(destination_file, formatted_content, logging) 97 | elif args.from_format.lower() == 'wikiqa' and args.to_format.lower() == 'squad': 98 | """ 99 | --log_path="~/log.log" 100 | --data_path="~/data/WikiQACorpus" 101 | --from_files="source:WikiQA-dev.tsv" 102 | --from_format="wikiqa" 103 | --to_format="squad" 104 | --to_file_name="dev.json" 105 | """ 106 | 107 | story_question_content = UTIL.load_csv_file(source_file, "\t", 'infer', logging) 108 | formatted_content = wikiqa.convert_to_squad(story_question_content) 109 | UTIL.dump_json_file(destination_file, formatted_content, logging) 110 | 111 | elif args.from_format.lower() == 'squad' and args.to_format.lower() == 'matchzoo': 112 | """ 113 | 
**sample.txt**: Each line is the raw query and raw document text of a document. The format is "label \t query \t document_txt". 114 | --log_path="~/log.log" 115 | --data_path="~/data/squad" 116 | --from_files="source:dev-v1.1.json,q_len:1000,negative_sampling:100" 117 | --from_format="squad" 118 | --to_format="matchzoo" 119 | --to_file_name="dev.txt" 120 | """ 121 | negative_samp_count = int(source_files['negative_sampling']) 122 | q_len = int(source_files['q_len']) 123 | content = UTIL.load_json_file(source_file, logging) 124 | generator = squad.yield_to_matchzoo(content, q_len, negative_samp_count) 125 | open(destination_file, "w").write('\n'.join(data for data in generator)) 126 | 127 | #UTIL.dump_json_file(destination_file, formatted_content, logging) 128 | elif args.from_format.lower() == 'squad' and args.to_format.lower() == 'lucene': 129 | """ 130 | **sample.txt**: Each line is the raw query and raw document text of a document. The format is "label \t query \t document_txt". 131 | --log_path="~/log.log" 132 | --data_path="~/data/squad" 133 | --from_files="source:dev-v1.1.json,doc_type_verbose:2" 134 | --from_format="squad" 135 | --to_format="matchzoo" 136 | --to_file_name="dev.txt" 137 | """ 138 | doc_type_verbose = int(source_files['doc_type_verbose']) 139 | content = UTIL.load_json_file(source_file, logging) 140 | squad.convert_to_lucene(content, doc_type_verbose, args.data_path) 141 | elif args.from_format.lower() == 'squad' and args.to_format.lower() == 'short_squad': 142 | """ 143 | **sample.txt**: Each line is the raw query and raw document text of a document. The format is "label \t query \t document_txt". 144 | --log_path="~/log.log" 145 | --data_path="~/data/squad" 146 | --from_files="source:dev-v1.1.json,q_len:1000,negative_sampling:100" 147 | --from_format="squad" 148 | --to_format="short_squad" 149 | --to_file_name="dev.json" 150 | """ 151 | negative_samp_count = int(source_files['negative_sampling']) 152 | q_len = int(source_files['q_len']) 153 | content = UTIL.load_json_file(source_file, logging) 154 | formatted_content = squad.convert_to_short_squad(content, q_len, negative_samp_count) 155 | UTIL.dump_json_file(destination_file, formatted_content, logging) 156 | elif args.from_format.lower() == 'squad' and args.to_format.lower() == 'squad': 157 | """ 158 | In order to make some analyzes. 159 | --log_path="~/log.log" 160 | --data_path="~/data/squad" 161 | --from_files="source:dev-v1.1.json,is_histogram:True,document_type:1" #1 for question, #2 for paragraphs, #3 for both. 
162 | --from_format="squad" 163 | --to_format="squad" 164 | --to_file_name="dev.json" 165 | """ 166 | is_historgram = source_files['is_histogram'] 167 | document_type = int(source_files['document_type']) 168 | his_bin = int(source_files['histogram_bin']) 169 | content = UTIL.load_json_file(source_file, logging) 170 | squad.print_statistics(content, is_historgram, his_bin, document_type) 171 | 172 | elif args.from_format.lower() == 'narrativeqa' and args.to_format.lower() == 'squad': 173 | """ 174 | --log_path="~/log.log" 175 | --data_path="~/data/narrativeqa" 176 | --from_files="source:summaries.csv,set:train,qaps:qaps.csv" 177 | --from_format="narrativeqa" 178 | --to_format="squad" 179 | --to_file_name="train.json" #it is gonna be renamed as "[from_to]_filename.what" 180 | """ 181 | 182 | story_summary_content = UTIL.load_csv_file(source_file, ",", 'infer', logging) 183 | question_content = UTIL.load_csv_file(source_files['qaps'], ",", 'infer', logging) 184 | set_type = source_files['set'] 185 | formatted_content = narrativeqa.convert_to_squad(story_summary_content, question_content, set_type) 186 | UTIL.dump_json_file(destination_file, formatted_content, logging) 187 | 188 | elif args.from_format.lower() == 'webqa' and args.to_format.lower() == 'squad': 189 | " ************************************************************ " 190 | " *********************** ON-HOLD *****************************" 191 | " ************************************************************ " 192 | """ 193 | --log_path="~/log.log" 194 | --data_path="~/data/" 195 | --from_files="label:question.train.token_idx.label,voc:vocabulary,answer:answers.label.token_idx" 196 | --from_format="webqa" 197 | --to_format="squad" 198 | --to_file_name="filename.what" #it is gonna be renamed as "[from_to]_filename.what" 199 | """ 200 | 201 | story_summary_content = UTIL.load_csv_file(source_file, ",", 'infer', logging) 202 | question_content = UTIL.load_csv_file(source_files['qaps'], ",", 'infer', logging) 203 | set_type = source_files['set'] 204 | formatted_content = narrativeqa.convert_to_squad(story_summary_content, question_content, set_type) 205 | UTIL.dump_json_file(args.destination_file_path, formatted_content, logging) 206 | elif args.from_format.lower() == 'msmarco' and args.to_format.lower() == 'squad': 207 | """ 208 | --log_path="~/log.log" 209 | --data_path="~/data/msmarco" 210 | --from_format="msmarco" 211 | --to_format="squad" 212 | --to_file_name="dev_2.1.json" #it is gonna be renamed as "[from_to]_filename.what" 213 | """ 214 | input_dict = {} 215 | try: 216 | version = float(source_files['v']) 217 | except: 218 | version = 2.0 219 | 220 | input_dict['v'] = version 221 | if version <= 2.0: 222 | """ 223 | for version <= 2.0 224 | --from_files="source:dev_2.1.json, v:2.0" 225 | """ 226 | in_content = UTIL.load_json_file(source_file, logging) 227 | input_dict['story_question_content'] = in_content 228 | formatted_content = msmarco.convert_to_squad(in_content) 229 | else: 230 | """ 231 | for version > 2.0 232 | --from_files="source:queries.train.csv,document:collection.tsv,mapping:qrels.train.csv,v:2.1,limit:-1" 233 | """ 234 | queries = UTIL.load_csv_file(source_file, "\t", None, logging, ['id', 'content']) 235 | input_dict['queries'] = queries 236 | mappings = UTIL.load_csv_file(source_files['mapping'], "\t", None, logging, ['q_id', 'tmp1', 'p_id', 'tmp2'], [0,1,2,3]) 237 | input_dict['mappings'] = mappings 238 | documents = UTIL.load_csv_file(source_files['document'], "\t", None, logging, ['id', 'content']) 239 | 
        elif args.from_format.lower() == 'quasar' and args.to_format.lower() == 'squad':
            """
            --log_path="~/log.log"
            --data_path="~/data/quasar-t"
            --from_format="quasar"
            --to_format="squad"
            --from_files="source:train_questions.json,document:train_contexts.json,type:t,is_null_tags_filter:True,limit:-1"
            --to_file_name="train.json"
            """
            if source_files['type'].lower() == 't':
                # quasar-t (only the Quasar-T layout is handled in this branch)
                queries = UTIL.load_json_line_file(source_file, logging)
                documents = UTIL.load_json_line_file(source_files['document'], logging)
                formatted_content = quasar.convert_to_squad(queries, documents, source_files['is_null_tags_filter'], int(source_files['limit']))
                UTIL.dump_json_file(destination_file, formatted_content, logging)

        elif args.from_format.lower() == 'ubuntu' and args.to_format.lower() == 'squad':
            """
            --log_path="~/log.log"
            --data_path="~/data/ubuntu"
            --from_files="source:valid.csv"
            --from_format="ubuntu"
            --to_format="squad"
            --to_file_name="valid.json"
            """
            story_question_content = UTIL.load_csv_file(source_file, ",", 'infer', logging)
            formatted_content = ubuntudialogue.convert_to_squad(story_question_content)
            UTIL.dump_json_file(destination_file, formatted_content, logging)
        elif args.from_format.lower() == 'newsqa' and args.to_format.lower() == 'squad':
            """
            --log_path="~/log.log"
            --data_path="~/data/newsqa"
            --from_files="source:newsqa-data-v1.csv,story:cnn_stories/"
            --from_format="newsqa"
            --to_format="squad"
            --to_file_name="news.json"
            """

            story_question_content = UTIL.load_csv_file(source_file, ",", 'infer', logging)
            context_content_path = source_files['story']
            formatted_content = cnnnews.convert_to_squad(story_question_content, context_content_path)
            UTIL.dump_json_file(destination_file, formatted_content, logging)
        else:
            pass
        logging.info('(function {}) Finished'.format(main.__name__))
    except Exception as e:
        logging.error('(function {}) has an error: {}'.format(main.__name__, e))
        raise


if __name__ == '__main__':
    args = get_parser().parse_args()
    assert args.log_path is not None, "No log path provided (--log_path)"
    assert args.data_path is not None, "No data folder provided (--data_path)"
    assert args.from_format is not None, "No source format provided (--from_format)"
    assert args.from_files is not None, "No source files provided (--from_files)"
    assert args.to_format is not None, "No destination format provided (--to_format)"
    assert args.to_file_name is not None, "No destination file name provided (--to_file_name)"

    if args.log_info.lower() == 'info':
        log_info = logging.INFO
    elif args.log_info.lower() == 'debug':
        log_info = logging.DEBUG
    elif args.log_info.lower() == 'warn':
        log_info = logging.WARNING
    elif args.log_info.lower() == 'critical':
        log_info = logging.CRITICAL
    elif args.log_info.lower() == 'error':
        log_info = logging.ERROR
    else:
        log_info = logging.INFO

    logging.basicConfig(filename=args.log_path, level=log_info)
    main(args)
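
# Example end-to-end call (a sketch assembled from the docstrings above; adjust paths and file names to your setup):
#
#   python executor.py \
#       --log_path="~/log.log" \
#       --data_path="~/data/squad" \
#       --from_files="source:dev-v1.1.json,q_len:1000,negative_sampling:100" \
#       --from_format="squad" \
#       --to_format="matchzoo" \
#       --to_file_name="dev.txt"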
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
pandas==0.23.4
tqdm
spacy==2.0.8
scikit_learn==0.19.1
scipy
json-lines
matplotlib
--------------------------------------------------------------------------------
/util.py:
--------------------------------------------------------------------------------
import json
import pandas as pd
import os
import spacy
import json_lines
import matplotlib.pyplot as plt

nlp = spacy.blank("en")

def word_tokenize(sent):
    doc = nlp(sent)
    return [token.text for token in doc]

# The "*.json.gz" files can be read in python as follows:
#
# import gzip
# def read_data(path):
#     with gzip.open(path) as f:
#         for line in f:
#             yield eval(line)

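# A safer variant of the gzip reader sketched above (illustrative only; not used elsewhere in this module):
# json.loads avoids eval'ing arbitrary Python from the archive, assuming each line is a valid JSON record.
#
# import gzip
# def read_json_gz(path):
#     with gzip.open(path, 'rt', encoding='utf-8') as f:
#         for line in f:
#             yield json.loads(line)
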
def load_json_file(file_path, logging, encoding='utf-8'):
    content = None
    try:
        # with open(file_path, 'r') as f_in:
        #     content = json.load(f_in)
        content = json.loads(get_file_contents(file_path, encoding=encoding))
        if logging is not None:
            logging.info('(function {}) ran successfully and loaded the file: {}'.format(load_json_file.__name__, file_path))
    except Exception as e:
        if logging is not None:
            logging.error('(function {}) has an error: {}'.format(load_json_file.__name__, e))
        raise
    return content

def load_json_line_file(file_path, logging, encoding='utf-8'):
    content = []
    try:
        with open(file_path, 'r', encoding=encoding) as f:  # text mode; json_lines parses each line
            for item in json_lines.reader(f):
                content.append(item)

        if logging is not None:
            logging.info('(function {}) ran successfully and loaded the file: {}'.format(load_json_line_file.__name__, file_path))
    except Exception as e:
        if logging is not None:
            logging.error('(function {}) has an error: {}'.format(load_json_line_file.__name__, e))
        raise
    return content

def create_dir(dir):
    if not os.path.exists(dir):
        os.makedirs(dir)
    return dir

def dump_json_file(file_path, content, logging, encoding='utf-8'):
    try:
        with open(file_path, 'w', encoding=encoding) as f_out:
            json.dump(content, f_out, indent=1)
        if logging is not None:
            logging.info(
                '(function {}) ran successfully and wrote the file: {}'.format(dump_json_file.__name__, file_path))
    except Exception as e:
        if logging is not None:
            logging.error('(function {}) has an error: {}'.format(dump_json_file.__name__, e))
        raise

# def write_json_to_file(json_object, json_file, mode='w', encoding='utf-8'):
#     with open(json_file, mode, encoding=encoding) as outfile:
#         json.dump(json_object, outfile, indent=4, sort_keys=True, ensure_ascii=False)

# def dump_txt_contents(file_path, content, logging, encoding='utf-8'):
#     try:
#         with open(file_path, 'w', encoding=encoding) as f_out:
#             f_out.dump(content, f_out, indent=1)
#         if logging is not None:
#             logging.info(
#                 '(function {}) is run successfuly and write the file: {}'.format(dump_json_file.__name__,
#                                                                                  file_path))
#     except Exception as e:
#         if logging is not None:
#             logging.error('(function {}) has an error: {}'.format(dump_json_file.__name__, e))
#         raise

def get_file_contents(filename, encoding='utf-8'):
    with open(filename, encoding=encoding) as f:
        content = f.read()
    return content


def get_file_contents_as_list(file_path, encoding='utf-8', ignore_blanks=True):
    contents = get_file_contents(file_path, encoding=encoding)
    lines = contents.split('\n')
    lines = [line for line in lines if line != ''] if ignore_blanks else lines
    return lines

def load_csv_file(file_path, sep, header, logging, names=None, usecols=None):
    content = None
    try:
        # pandas opens the file itself, so no extra open() wrapper is needed; usecols is forwarded to read_csv
        content = pd.read_csv(file_path, sep=sep, header=header, names=names, usecols=usecols)
        if logging is not None:
            logging.info(
                '(function {}) ran successfully and loaded the file: {}'.format(load_csv_file.__name__, file_path))
    except Exception as e:
        if logging is not None:
            logging.error('(function {}) has an error: {}'.format(load_csv_file.__name__, e))
        raise
    return content

def parse_source_files(data_path, source_files, logging, item_seperator=',', k_v_seperator=':'):
    source_path = data_path
    _additional_files = dict()
    try:
        for item in source_files.split(item_seperator):
            _splitted = item.split(k_v_seperator)
            key = _splitted[0].strip()
            value = _splitted[1].strip()
            print("item:{}, key:{}, value:{}".format(item, key, value))
            # values that resolve to a file or folder under data_path are expanded to full paths
            if os.path.isfile(os.path.join(source_path, value)) or os.path.isdir(os.path.join(source_path, value)):
                _additional_files[key] = os.path.join(source_path, value)
            else:
                _additional_files[key] = value
        if logging is not None:
            logging.info(
                '(function {}) ran successfully for data path: {}'.format(parse_source_files.__name__, data_path))
    except Exception as e:
        if logging is not None:
            logging.error('(function {}) has an error: {}'.format(parse_source_files.__name__, e))
        raise
    return _additional_files
--------------------------------------------------------------------------------
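A minimal sketch of how `util.parse_source_files` resolves a `--from_files` string (assuming `/home/user/data/squad/dev-v1.1.json` exists on disk; values that do not resolve to a file or folder under the data path are kept as plain strings):

```python
from util import parse_source_files

source_files = parse_source_files(
    data_path="/home/user/data/squad",
    source_files="source:dev-v1.1.json,q_len:1000,negative_sampling:100",
    logging=None,  # logging may be None; executor.py passes the logging module here
)
# -> {'source': '/home/user/data/squad/dev-v1.1.json', 'q_len': '1000', 'negative_sampling': '100'}
# Values stay strings; callers cast them as needed, e.g. int(source_files['q_len']).
```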