├── data ├── dev-v1.0.json ├── dev-v1.1.json ├── train-v1.0.json └── train-v1.1.json ├── README.md └── src └── SquadProcessing.py /data/dev-v1.0.json: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e909756bf3fd5889a3be15fc425a3e87248303a61f5cb5671b7e9e2bf3b4d3db 3 | size 4602468 4 | -------------------------------------------------------------------------------- /data/dev-v1.1.json: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:95aa6a52d5d6a735563366753ca50492a658031da74f301ac5238b03966972c9 3 | size 4854279 4 | -------------------------------------------------------------------------------- /data/train-v1.0.json: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:857d375a31e2884367e870ad9cbf47703bcc9c47af1fbf13dc647d4b3073472d 3 | size 36596194 4 | -------------------------------------------------------------------------------- /data/train-v1.1.json: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3527663986b8295af4f7fcdff1ba1ff3f72d07d61a20f487cb238a6ef92fd955 3 | size 30288272 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SQUAD Reformatter 2 | 3 | The [SQuAD dataset](https://rajpurkar.github.io/SQuAD-explorer/) was designed for the task of machine reading comprehension. The task consists in answering a question related to a specific passage by selecting which part of the passage answers the question. 
4 | 5 | In this repo you will find a simple tool that reformats the SQuAD dataset into a more useful form: it breaks the passages apart into sentences and annotates each answer with the sentence in which it is found. 6 | 7 | ## Usage 8 | 9 | ``` 10 | python SquadProcessing.py <path_to_squad_json> <path_to_output_json> 11 | ``` 12 | 13 | An example of a call from the root folder would be: 14 | 15 | ``` 16 | python src/SquadProcessing.py data/dev-v1.0.json out/proc-dev-v1.0.json 17 | ``` 18 | 19 | ## Data Description 20 | 21 | ### Original format 22 | The dataset was given in two different JSON files that have the following structure: 23 | 24 | - title : str 25 | - paragraphs : [] 26 | - context : str 27 | - qas : [] 28 | - question : str 29 | - id : int 30 | - answers : [] 31 | - text : str 32 | - answer_start : int 33 | 34 | The index described in *answer_start* is 0-based. 35 | 36 | ### Output format 37 | Processed QA JSON Structure: 38 | 39 | - root : [] 40 | - topic : str 41 | - paragraphs : [] 42 | - pid : int 43 | - sentences: [] 44 | - text : str 45 | - pos : int 46 | - start_idx : int 47 | - qas: [] 48 | - qid : int 49 | - question : str 50 | - answers: 51 | - sent_pos : int 52 | - text : str 53 | - answer_start : int 54 | 55 | The passage id embeds the id for the topic and its corresponding passages in the following way: 56 | 57 | - the 10k's describe the topic 58 | - the units describe the passage -------------------------------------------------------------------------------- /src/SquadProcessing.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import sys 3 | import json 4 | from textblob import TextBlob 5 | 6 | def sweep_through_data(target_data): 7 | # The structure of the train/test QA files is the following: 8 | # - title 9 | # - paragraphs 10 | # - context 11 | # - qas 12 | # - question 13 | # - id 14 | # - answers 15 | # - text 16 | # - answer_start 17 | # 18 | # The index described in *answer_start* is 0-based.
def sweep_through_data(target_data):
    """Exploration helper: collect every (question, answer) pair found in
    *target_data* into all_questions.json and print summary counts.

    target_data follows the SQuAD layout:
        title -> paragraphs[] -> (context, qas[]) -> (question, id, answers[])
    """
    total_topics = 0
    total_questions = 0
    qa_json_path = 'all_questions.json'
    qa_json = []
    for topic in target_data:
        total_topics += 1
        for passage in topic['paragraphs']:
            for qa in passage['qas']:
                total_questions += 1
                for answer in qa['answers']:
                    qa_json.append({'question': qa['question'],
                                    'answer': answer['text']})

    # BUGFIX: the original passed an unclosed handle opened in 'wb' mode to
    # json.dump, which leaks the descriptor and fails on Python 3 (json
    # writes str, not bytes). Use text mode and a context manager.
    with open(qa_json_path, 'w') as qa_file:
        json.dump(qa_json, qa_file)

    print('Total topics: %d' % total_topics)
    print('Total questions: %d' % total_questions)
    print('Total qa\'s: %d' % len(qa_json))
    # BUGFIX: the original indexed qa_json[0] unconditionally and crashed
    # on a dataset with no answers.
    if qa_json:
        print('qa sample:')
        print(qa_json[0])


def in_which_sentence(text_blob, char_pos):
    """Return the 0-based index of the sentence that contains *char_pos*.

    text_blob: TextBlob of the passage (any object with __len__ and a
        .sentences sequence whose items expose .end — the end character
        offset of each sentence — works).
    char_pos: starting character offset of the answer; the answer is
        assumed to lie within a single sentence.
    Returns -1 when char_pos lies outside the passage or beyond the last
    sentence boundary.
    """
    if char_pos < 0 or char_pos > len(text_blob):
        return -1
    # The first sentence whose end offset exceeds char_pos contains it.
    for pos, sentence in enumerate(text_blob.sentences):
        if char_pos < sentence.end:
            return pos
    return -1


def process_data(target_data, save_path):
    """Reformat SQuAD data and save it as JSON at *save_path*.

    Each passage is split into sentences and every answer is annotated
    with 'sent_pos', the index of the sentence the answer starts in
    (-1 when it cannot be located; the offending question id is printed).

    Output structure (list of topics):
        - topic : str
        - paragraphs : []
            - pid : int      (the 10k's encode the topic, the low digits
                              the passage's position within the topic)
            - sentences : [] ({'text', 'pos', 'start_idx'})
            - qas : []       ({'qid', 'question', 'answers'})
    """
    # Imported locally so the rest of the module (e.g. in_which_sentence,
    # sweep_through_data) stays usable when textblob is not installed.
    from textblob import TextBlob

    # Passage-id scheme: the ten-thousands encode the topic, the remaining
    # digits the passage's 1-based position within that topic.
    topic_id = 10000
    topic_id_inc = 10000
    start_passage_id = 1
    passage_id = topic_id + start_passage_id

    processed_qa = []
    for topic in target_data:
        print(topic['title'])  # progress indicator
        qa_topic = {'topic': topic['title']}

        passage_list = []
        for paragraph in topic['paragraphs']:
            # TextBlob performs the sentence segmentation.
            passage = TextBlob(paragraph['context'])

            new_passage = {'pid': passage_id}
            new_passage['sentences'] = [
                {'text': raw_sent, 'pos': pos, 'start_idx': sent.start}
                for pos, (sent, raw_sent) in enumerate(
                    zip(passage.sentences, passage.raw_sentences))
            ]

            qas_list = []
            for qa in paragraph['qas']:
                new_qa = {'qid': qa['id'], 'question': qa['question']}
                answer_list = []
                for answer in qa['answers']:
                    answer['sent_pos'] = in_which_sentence(
                        passage, answer['answer_start'])
                    if answer['sent_pos'] == -1:
                        # Could not map the answer to a sentence; log the
                        # question id for later inspection.
                        print(qa['id'])
                    answer_list.append(answer)
                new_qa['answers'] = answer_list
                qas_list.append(new_qa)
            new_passage['qas'] = qas_list

            passage_list.append(new_passage)
            passage_id += 1  # next passage within the current topic

        qa_topic['paragraphs'] = passage_list
        processed_qa.append(qa_topic)

        # Move to the next topic's id block and reset the passage counter.
        topic_id += topic_id_inc
        passage_id = topic_id + start_passage_id

    # BUGFIX: same unclosed-'wb'-handle problem as in sweep_through_data.
    with open(save_path, 'w') as out_file:
        json.dump(processed_qa, out_file)


if __name__ == '__main__':
    if len(sys.argv) < 3:
        # BUGFIX: the original usage string had lost its argument
        # placeholders (markdown rendering stripped the angle brackets).
        print('Usage: SquadProcessing.py <path_to_squad_json> <path_to_output_json>')
        sys.exit(-1)

    squad_filepath = sys.argv[1]
    save_path = sys.argv[2]

    # Load the raw SQuAD file; the payload lives under the 'data' key.
    with open(squad_filepath, 'r') as squad_file:
        target_data = json.load(squad_file)['data']

    process_data(target_data, save_path)