├── data
    ├── dev-v1.0.json
    ├── dev-v1.1.json
    ├── train-v1.0.json
    └── train-v1.1.json
├── README.md
└── src
    └── SquadProcessing.py


/data/dev-v1.0.json:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e909756bf3fd5889a3be15fc425a3e87248303a61f5cb5671b7e9e2bf3b4d3db
3 | size 4602468
4 | 


--------------------------------------------------------------------------------
/data/dev-v1.1.json:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:95aa6a52d5d6a735563366753ca50492a658031da74f301ac5238b03966972c9
3 | size 4854279
4 | 


--------------------------------------------------------------------------------
/data/train-v1.0.json:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:857d375a31e2884367e870ad9cbf47703bcc9c47af1fbf13dc647d4b3073472d
3 | size 36596194
4 | 


--------------------------------------------------------------------------------
/data/train-v1.1.json:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3527663986b8295af4f7fcdff1ba1ff3f72d07d61a20f487cb238a6ef92fd955
3 | size 30288272
4 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # SQUAD Reformatter
 2 | 
 3 | The [SQuAD dataset](https://rajpurkar.github.io/SQuAD-explorer/) was designed for the task of machine reading comprehension. The task consists in answering a question related to a specific passage by selecting which part of the passage answers the question.
 4 | 
 5 | In this repo you will find a simple tool which reformats the SQuAD dataset into a more useful way in which it breaks apart the passages into sentences, and adds to the answers the sentence in which it is found.
 6 | 
 7 | ## Usage
 8 | 
 9 | ```
10 | python SquadProcessing.py <squad_file> <save_file>
11 | ```
12 | 
13 | An example of a call from the root folder would be:
14 | 
15 | ```
16 | python src/SquadProcessing.py data/dev-v1.0.json out/proc-dev-v1.0.json
17 | ```
18 | 
19 | ## Data Description
20 | 
21 | ### Original format
22 | The dataset was given in 2 different JSON files that have the following structure:
23 | 
24 |   - title : str
25 |   - paragraphs : []
26 |       - context : str
27 |       - qas : []
28 |           - question : str
29 |           - id : int
30 |           - answers : []
31 |               - text : str
32 |               - answer_start : int
33 | 
34 |  The index described in *answer_start* is 0 based.
35 | 
36 | ### Output format
37 | Processed QA JSON Structure:
38 | 
39 | - root : []
40 |     - topic : str
41 |     - paragraphs : []
42 |         - pid : int
43 |         - sentences: []
44 |             - text : str
45 |             - pos : int
46 |             - start_idx : int
47 |         - qas: []
48 |             - qid : int
49 |             - question : str
50 |             - answers:
51 |                 - sent_pos : int
52 |                 - text : str
53 |                 - answer_start : int
54 | 
55 | The passage id embeds the id for the topic and its corresponding passages in the following way:
56 | 
57 | - the 10k's describe the topic
58 | - the units describe the passage


--------------------------------------------------------------------------------
/src/SquadProcessing.py:
--------------------------------------------------------------------------------
  1 | from __future__ import division
  2 | import sys
  3 | import json
  4 | from textblob import TextBlob
  5 | 
  6 | def sweep_through_data(target_data):
  7 |     # The structure of the train/test QA files is the following:
  8 |     #  - title
  9 |     #  - paragraphs
 10 |     #      - context
 11 |     #      - qas
 12 |     #          - question
 13 |     #          - id
 14 |     #          - answers
 15 |     #              - text
 16 |     #              - answer_start
 17 |     #
 18 |     # The index described in *answer_start* is 0 based.
 19 | 
 20 | 
 21 |     # ## Dataset exploration boilerplate code
 22 |     # Traverse all the qa's and store them into a file
 23 |     total_topics = 0
 24 |     total_questions = 0
 25 |     qa_json_path = 'all_questions.json'
 26 |     qa_json = []
 27 |     for topic in target_data:
 28 |         total_topics += 1
 29 |         for passage in topic['paragraphs']:
 30 |             #print passage['context']
 31 |             for qa in passage['qas']:
 32 |                 total_questions += 1
 33 |                 #print 'Q%d %s'%(qidx,qa['question'])
 34 |                 #print 'QID:',qa['id']
 35 |                 num_ans = 0
 36 |                 for answer in qa['answers']:
 37 |                     #print 'A%d %s'%(aidx,answer['text'])
 38 |                     #print 'StartIdx:',answer['answer_start']
 39 |                     cur_qa ={}
 40 |                     cur_qa['question']=qa['question']
 41 |                     cur_qa['answer'] = answer['text']
 42 |                     qa_json.append(cur_qa)
 43 |                     num_ans += 1
 44 | 
 45 |     json.dump(qa_json, open(qa_json_path,'wb'))
 46 | 
 47 |     print 'Total topics:    %d'%(total_topics)
 48 |     print 'Total questions: %d'%(total_questions)
 49 |     print 'Total qa\'s:     %d'%(len(qa_json))
 50 |     print 'qa sample:'
 51 |     print qa_json[0]
 52 | 
 53 | 
 54 | def in_which_sentence(text_blob, char_pos):
 55 |     '''
 56 |     @:param text_blob: the text blob of the passage
 57 |     @:param char_pos: the starting position of the answer (assume answer is only within one sentence)
 58 |     Returns: index of the sentence that contains the answer
 59 |     The index returned is 0 based
 60 |     '''
 61 |     if char_pos < 0 or char_pos > len(text_blob):
 62 |         return -1
 63 |     sent_lim = [ s.end for s in text_blob.sentences]
 64 |     for pos, lim in zip(range(len(sent_lim)),sent_lim):
 65 |         if char_pos < lim:
 66 |             return pos
 67 |     return -1
 68 | 
 69 | 
 70 | def process_data(target_data, save_path):
 71 |     # Meta-data Registers
 72 |     # 10k indicates the topic id
 73 |     # units indicate the passage id within the unit
 74 |     topic_id = 10000
 75 |     topic_id_inc = 10000
 76 |     start_passage_id = 1
 77 |     progressive_id_inc = 1
 78 |     passage_id = topic_id + start_passage_id
 79 | 
 80 |     # Processed QA JSON Structure
 81 |     # proc_qa is a list of topics:
 82 |     # 	- topic : str
 83 |     # 	- paragraphs : []
 84 |     # 		- pid : int
 85 |     # 		- sentences: []
 86 |     # 			- text : str
 87 |     # 			- pos : int
 88 |     # 			- start_idx : int
 89 |     # 		- qas: []
 90 |     # 			- qid : int
 91 |     # 			- question : str
 92 |     # 			- answers:
 93 |     # 				- sent_pos : int
 94 |     # 				- text : str
 95 |     # 				- answer_start : int
 96 | 
 97 |     processed_qa = []
 98 |     # Process the info
 99 |     for topic in target_data:
100 |         print topic['title'] #To keep track of the progress
101 |         qa_topic = {}
102 |         qa_topic['topic'] = topic['title']
103 | 
104 |         # For each PARAGRAPH
105 |         passage_list = []
106 |         for paragraph in topic['paragraphs']:
107 |             passage = TextBlob(paragraph['context']) #This helps extract the sentences
108 |             # ADD sentences to list to be indexed
109 |             new_passage = {}
110 |             new_passage['pid'] = passage_id
111 |             sentence_list = []
112 |             for pos, sent, raw_sent in zip(range(len(passage.sentences)), passage.sentences, passage.raw_sentences):
113 |                 new_sent_input = {}
114 |                 new_sent_input['text'] = raw_sent
115 |                 new_sent_input['pos'] = pos
116 |                 new_sent_input['start_idx'] = sent.start
117 |                 sentence_list.append(new_sent_input)
118 | 
119 |             new_passage['sentences'] = sentence_list
120 | 
121 |             # Then store the corresponding QA's in another structure for JSON file
122 |             qas_list = []
123 |             for qa in paragraph['qas']:
124 |                 new_qa_input = {}
125 |                 new_qa_input['qid'] = qa['id']
126 |                 new_qa_input['question'] = qa['question']
127 |                 answer_list = []
128 |                 for answer in qa['answers']:
129 |                     answer['sent_pos'] = in_which_sentence(passage, answer['answer_start'])
130 |                     if answer['sent_pos']==-1:
131 |                         print qa['id']
132 |                     answer_list.append(answer)
133 |                 new_qa_input['answers'] = answer_list
134 |                 qas_list.append(new_qa_input)
135 | 
136 |             new_passage['qas'] = qas_list
137 | 
138 |             passage_list.append(new_passage)
139 | 
140 |             # Update intra-topic passage id
141 |             passage_id += progressive_id_inc
142 | 
143 |         qa_topic['paragraphs'] = passage_list
144 | 
145 |         processed_qa.append(qa_topic)
146 | 
147 |         # Update the passage id for next topic
148 |         topic_id += topic_id_inc
149 |         passage_id = topic_id + start_passage_id
150 | 
151 | 
152 |     json.dump(processed_qa, open(save_path, 'wb'))
153 | 
154 | if __name__=='__main__':
155 |     if len(sys.argv) < 3:
156 |         print '''Usage: SquadProcessing.py <squad_file> <save_file>'''
157 |         sys.exit(-1)
158 | 
159 |     squad_filepath = sys.argv[1]
160 |     save_path = sys.argv[2]
161 | 
162 |     target_json = json.load(open(squad_filepath, 'r'))
163 |     target_data = target_json['data']
164 | 
165 | 
166 |     process_data(target_data, save_path)


--------------------------------------------------------------------------------