├── .ipynb_checkpoints
    └── Code-checkpoint.ipynb
├── Code.ipynb
├── NLP Project.pdf
├── README.md
└── input.txt


/.ipynb_checkpoints/Code-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 26,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "# Imports\n",
 10 |     "import nltk\n",
 11 |     "import nltk.data\n",
 12 |     "from nltk.stem.lancaster import LancasterStemmer\n",
 13 |     "from nltk.stem.wordnet import WordNetLemmatizer\n",
 14 |     "import re\n",
 15 |     "import spacy\n",
 16 |     "import pandas as pd"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "code",
 21 |    "execution_count": 9,
 22 |    "metadata": {},
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "# # Setting stanford environment variables\n",
 26 |     "# os.environ['STANFORD_PARSER'] = '/home/nishant/Downloads/stanford-parser-full-2015-12-09/jars'\n",
 27 |     "# os.environ['STANFORD_MODELS'] = '/home/nishant/Downloads/stanford-parser-full-2015-12-09/jars'\n",
 28 |     "# stanford_dir = '/home/nishant/Downloads/stanford-parser-full-2015-12-09'"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": 10,
 34 |    "metadata": {},
 35 |    "outputs": [],
 36 |    "source": [
 37 |     "# Class initializations\n",
 38 |     "nlp = spacy.load('en_core_web_sm')\n",
 39 |     "stemmer = LancasterStemmer()\n",
 40 |     "# parser = stanford.StanfordParser(model_path = '/home/nishant/Downloads/stanford-parser-full-2015-12-09/jars/englishPCFG.ser.gz')\n",
 41 |     "# parser._classpath = tuple(find_jars_within_path(stanford_dir))"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": 11,
 47 |    "metadata": {},
 48 |    "outputs": [],
 49 |    "source": [
 50 |     "# List to hold all input sentences\n",
 51 |     "sentences = []\n",
 52 |     "\n",
 53 |     "# Dictionary to hold sentences corresponding to respective discourse markers\n",
 54 |     "disc_sentences = {}\n",
 55 |     "\n",
 56 |     "# Remaining sentences which do not have discourse markers (To be used later to generate other kinds of questions)\n",
 57 |     "nondisc_sentences = []\n",
 58 |     "\n",
 59 |     "# List of auxiliary verbs\n",
 60 |     "aux_list = ['am', 'are', 'is', 'was', 'were', 'can', 'could', 'does', 'do', 'did', 'has', 'had', 'may', 'might', 'must', 'need',\n",
 61 |     " 'ought', 'shall', 'should', 'will', 'would']\n",
 62 |     "\n",
 63 |     "# List of all discourse markers\n",
 64 |     "discourse_markers = ['because', 'as a result', 'since', 'when', 'although', 'for example', 'for instance']\n",
 65 |     "\n",
 66 |     "# Different question types possible for each discourse marker\n",
 67 |     "qtype = {'because': ['Why'], 'since': ['When', 'Why'], 'when': ['When'], 'although': ['Yes/No'], 'as a result': ['Why'], \n",
 68 |     "'for example': ['Give an example where'], 'for instance': ['Give an instance where'], 'to': ['Why']}\n",
 69 |     "\n",
 70 |     "# The argument which forms a question\n",
 71 |     "target_arg = {'because': 1, 'since': 1, 'when': 1, 'although': 1, 'as a result': 2, 'for example': 1, 'for instance': 1, 'to': 1}\n"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "code",
 76 |    "execution_count": 12,
 77 |    "metadata": {},
 78 |    "outputs": [
 79 |     {
 80 |      "name": "stdout",
 81 |      "output_type": "stream",
 82 |      "text": [
 83 |       "[('Sachin', 'NNP'), ('Tendulkar', 'NNP'), ('was', 'VBD'), ('awarded', 'VBN'), ('Bharat', 'NNP'), ('Ratna', 'NNP'), ('in', 'IN'), ('2013', 'CD')]\n",
 84 |       "learn\n",
 85 |       "I eat apple\n"
 86 |      ]
 87 |     }
 88 |    ],
 89 |    "source": [
 90 |     "# Rough Work\n",
 91 |     "text = 'Sachin Tendulkar was awarded Bharat Ratna in 2013'\n",
 92 |     "text = nltk.word_tokenize(text)\n",
 93 |     "tags = nltk.pos_tag(text)\n",
 94 |     "print(tags)\n",
 95 |     "# question_part = 'Sanskar think he felt included'\n",
 96 |     "# question_part = question_part[:question_part.index(tags[0][0]) + len(tags[0][0])]\n",
 97 |     "# print(question_part)\n",
 98 |     "# text = nltk.word_tokenize('I eat apple')\n",
 99 |     "# tags = nltk.pos_tag(text)\n",
100 |     "# print(tags)\n",
101 |     "print(stemmer.stem('learned'))\n",
102 |     "s = \"I eat apple.\"\n",
103 |     "if(s[-1] in ['.', ',']):\n",
104 |     "    s = s[:-1]\n",
105 |     "print(s)"
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "code",
110 |    "execution_count": 13,
111 |    "metadata": {},
112 |    "outputs": [],
113 |    "source": [
114 |     "# This function is used to tokenize and split into sentences\n",
115 |     "def sentensify():\n",
116 |     "    global sentences\n",
117 |     "    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')\n",
118 |     "    fp = open('input.txt')\n",
119 |     "    data = fp.read()\n",
120 |     "    sentences = tokenizer.tokenize(data)\n",
121 |     "    discourse()\n"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": 14,
127 |    "metadata": {},
128 |    "outputs": [],
129 |    "source": [
130 |     "# Function used to generate the questions from sentences which have already been pre-processed.\n",
131 |     "def generate_question(question_part, type):\n",
132 |     "\n",
133 |     "    ''' Tree -> Input tree\n",
134 |     "        question_part -> Part of input sentence which forms a question\n",
135 |     "        type-> The type of question (why, where, etc)\n",
136 |     "    '''\n",
137 |     "    # Remove full stop and make first letter lower case\n",
138 |     "    question_part = question_part[0].lower() + question_part[1:]\n",
139 |     "    if(question_part[-1] == '.' or question_part[-1] == ','):\n",
140 |     "        question_part = question_part[:-1]\n",
141 |     "        \n",
142 |     "    # Capitalizing 'i' since 'I' is recognized by parsers appropriately    \n",
143 |     "    for i in range(0, len(question_part)):\n",
144 |     "        if(question_part[i] == 'i'):\n",
145 |     "            if((i == 0 and question_part[i + 1] == ' ') or (question_part[i - 1] == ' ' and question_part[i + 1] == ' ')):\n",
146 |     "                question_part = question_part[:i] + 'I' + question_part[i + 1: ]\n",
147 |     "                \n",
148 |     "    question = \"\"\n",
149 |     "    if(type == 'Give an example where' or type == 'Give an instance where'):\n",
150 |     "        question = type + \" \" + question_part + '?'\n",
151 |     "        return question\n",
152 |     "\n",
153 |     "    aux_verb = False\n",
154 |     "    res = None\n",
155 |     "    \n",
156 |     "    # Find out if auxiliary verb already exists\n",
157 |     "    for i in range(len(aux_list)):\n",
158 |     "        if(aux_list[i] in question_part.split()):\n",
159 |     "            aux_verb = True\n",
160 |     "            pos = i\n",
161 |     "            break\n",
162 |     "\n",
163 |     "    # If auxiliary verb exists\n",
164 |     "    if(aux_verb):\n",
165 |     "        \n",
166 |     "        # Tokenize the part of the sentence from which the question has to be made\n",
167 |     "        text = nltk.word_tokenize(question_part)\n",
168 |     "        tags = nltk.pos_tag(text)\n",
169 |     "        question_part = \"\"\n",
170 |     "        fP = False\n",
171 |     "        \n",
172 |     "        for word, tag in tags:\n",
173 |     "            if(word in ['I', 'We', 'we']):\n",
174 |     "                question_part += 'you' + \" \"\n",
175 |     "                fP = True\n",
176 |     "                continue\n",
177 |     "            question_part += word + \" \"\n",
178 |     "\n",
179 |     "        # Split across the auxiliary verb and prepend it at the start of question part\n",
180 |     "        question = question_part.split(\" \" + aux_list[pos])\n",
181 |     "        if(fP):\n",
182 |     "             question = [\"were \"] + question\n",
183 |     "        else:\n",
184 |     "            question = [aux_list[pos] + \" \"] + question\n",
185 |     "\n",
186 |     "        # If Yes/No, no need to introduce question phrase\n",
187 |     "        if(type == 'Yes/No'):\n",
188 |     "            question += ['?']\n",
189 |     "            \n",
190 |     "        elif(type != \"non_disc\"):\n",
191 |     "            question = [type + \" \"] + question + [\"?\"]\n",
192 |     "            \n",
193 |     "        else:\n",
194 |     "            question = question + [\"?\"]\n",
195 |     "         \n",
196 |     "        question = ''.join(question)\n",
197 |     "\n",
198 |     "    # If auxiliary verb does not exist, it can only be some form of verb 'do'\n",
199 |     "    else:\n",
200 |     "        aux = None\n",
201 |     "        text = nltk.word_tokenize(question_part)\n",
202 |     "        tags = nltk.pos_tag(text)\n",
203 |     "        comb = \"\"\n",
204 |     "\n",
205 |     "        '''There can be following combinations of nouns and verbs:\n",
206 |     "            NN/NNP and VBZ  -> Does\n",
207 |     "            NNS/NNPS(plural) and VBP -> Do\n",
208 |     "            NN/NNP and VBN -> Did\n",
209 |     "            NNS/NNPS(plural) and VBN -> Did\n",
210 |     "        '''\n",
211 |     "        \n",
212 |     "        for tag in tags:\n",
213 |     "            if(comb == \"\"):\n",
214 |     "                if(tag[1] == 'NN' or tag[1] == 'NNP'):\n",
215 |     "                    comb = 'NN'\n",
216 |     "\n",
217 |     "                elif(tag[1] == 'NNS' or tag[1] == 'NNPS'):\n",
218 |     "                    comb = 'NNS'\n",
219 |     "\n",
220 |     "                elif(tag[1] == 'PRP'):\n",
221 |     "                    if tag[0] in ['He','She','It']:\n",
222 |     "                        comb = 'PRPS'\n",
223 |     "                    else:\n",
224 |     "                        comb = 'PRPP'\n",
225 |     "                        tmp = question_part.split(\" \")\n",
226 |     "                        tmp = tmp[1: ]\n",
227 |     "                        if(tag[0] in ['I', 'we', 'We']):\n",
228 |     "                            question_part = 'you ' + ' '.join(tmp)\n",
229 |     "            if(res == None):\n",
230 |     "                res = re.match(r\"VB*\", tag[1])\n",
231 |     "                if(res):\n",
232 |     "                    # question_part = question_part[:question_part.index(tag[0]) + len(tag[0])]\n",
233 |     "\n",
234 |     "                    # Stem the verb\n",
235 |     "                    question_part = question_part.replace(tag[0], stemmer.stem(tag[0]))\n",
236 |     "                res = re.match(r\"VBN\", tag[1])\n",
237 |     "                res = re.match(r\"VBD\", tag[1])\n",
238 |     "\n",
239 |     "        if(comb == 'NN'):\n",
240 |     "            aux = 'does'\n",
241 |     "            \n",
242 |     "        elif(comb == 'NNS'):\n",
243 |     "            aux = 'do'\n",
244 |     "            \n",
245 |     "        elif(comb == 'PRPS'):\n",
246 |     "            aux = 'does'\n",
247 |     "            \n",
248 |     "        elif(comb == 'PRPP'):\n",
249 |     "            aux = 'do'\n",
250 |     "            \n",
251 |     "        if(res and res.group() in ['VBD', 'VBN']):\n",
252 |     "            aux = 'did'\n",
253 |     "\n",
254 |     "        if(aux):\n",
255 |     "            if(type == \"non_disc\" or type == \"Yes/No\"):\n",
256 |     "                question = aux + \" \" + question_part + \"?\"\n",
257 |     "\n",
258 |     "            else:\n",
259 |     "                question = type + \" \" + aux + \" \" + question_part + \"?\"\n",
260 |     "    if(question != \"\"):\n",
261 |     "        question = question[0].upper() + question[1:]\n",
262 |     "    return question"
263 |    ]
264 |   },
265 |   {
266 |    "cell_type": "code",
267 |    "execution_count": 15,
268 |    "metadata": {},
269 |    "outputs": [],
270 |    "source": [
271 |     "# This function is used to get the named entities\n",
272 |     "def get_named_entities(sent):\n",
273 |     "    \n",
274 |     "    doc = nlp(sent)\n",
275 |     "    named_entities = [(X.text, X.label_) for X in doc.ents]\n",
276 |     "    return named_entities"
277 |    ]
278 |   },
279 |   {
280 |    "cell_type": "code",
281 |    "execution_count": 16,
282 |    "metadata": {},
283 |    "outputs": [],
284 |    "source": [
285 |     "# This function is used to get the required wh word\n",
286 |     "def get_wh_word(entity, sent):\n",
287 |     "    wh_word = \"\"\n",
288 |     "    if entity[1] in ['TIME', 'DATE']:\n",
289 |     "        wh_word = 'When'\n",
290 |     "    elif entity[1] == ['PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW', 'LANGUAGE']:\n",
291 |     "        wh_word = 'What'\n",
292 |     "    elif entity[1] in ['PERSON']:\n",
293 |     "            wh_word = 'Who'\n",
294 |     "    elif entity[1] in ['NORP', 'FAC' ,'ORG', 'GPE', 'LOC']:\n",
295 |     "        index = sent.find(entity[0])\n",
296 |     "        if index == 0:\n",
297 |     "            wh_word = \"Who\"\n",
298 |     "        else:\n",
299 |     "            wh_word = \"Where\"\n",
300 |     "    else:\n",
301 |     "        wh_word = \"Where\"\n",
302 |     "    return wh_word"
303 |    ]
304 |   },
305 |   {
306 |    "cell_type": "code",
307 |    "execution_count": 35,
308 |    "metadata": {},
309 |    "outputs": [],
310 |    "source": [
311 |     "# This function generates questions based on NER templates\n",
312 |     "def generate_one_word_questions(sent):\n",
313 |     "    \n",
314 |     "    named_entities = get_named_entities(sent)\n",
315 |     "    questions = []\n",
316 |     "    \n",
317 |     "    if not named_entities:\n",
318 |     "        return questions\n",
319 |     "    \n",
320 |     "    for entity in named_entities:\n",
321 |     "        wh_word = get_wh_word(entity, sent)\n",
322 |     "        \n",
323 |     "        if(sent[-1]== '.'):\n",
324 |     "            sent= sent[:-1]\n",
325 |     "        \n",
326 |     "        if sent.find(entity[0]) == 0:\n",
327 |     "            questions.append(sent.replace(entity[0],wh_word) + '?')\n",
328 |     "            continue\n",
329 |     "       \n",
330 |     "        question= \"\"\n",
331 |     "        aux_verb= False\n",
332 |     "        res= None\n",
333 |     "\n",
334 |     "        for i in range(len(aux_list)):\n",
335 |     "            if(aux_list[i] in sent.split()):\n",
336 |     "                aux_verb= True\n",
337 |     "                pos= i\n",
338 |     "                break\n",
339 |     "            \n",
340 |     "        if not aux_verb:\n",
341 |     "            pos = 9\n",
342 |     "        \n",
343 |     "        text = nltk.word_tokenize(sent)\n",
344 |     "        tags= nltk.pos_tag(text)\n",
345 |     "        question_part= \"\"\n",
346 |     "        \n",
347 |     "        if wh_word == 'When':\n",
348 |     "            word_list = sent.split(entity[0])[0].split()\n",
349 |     "            if word_list[-1] in ['in', 'at', 'on']:\n",
350 |     "                question_part = \" \".join(word_list[:-1])\n",
351 |     "            else:\n",
352 |     "                question_part = \" \".join(word_list)\n",
353 |     "            \n",
354 |     "            qp_text = nltk.word_tokenize(question_part)\n",
355 |     "            qp_tags = nltk.pos_tag(qp_text)\n",
356 |     "            \n",
357 |     "            question_part = \"\"\n",
358 |     "            \n",
359 |     "            for i, grp in enumerate(qp_tags):\n",
360 |     "                word = grp[0]\n",
361 |     "                tag = grp[1]\n",
362 |     "                if(re.match(\"VB*\", tag) and word not in aux_list):\n",
363 |     "                    question_part += WordNetLemmatizer().lemmatize(word,'v') + \" \"\n",
364 |     "                else:\n",
365 |     "                    question_part += word + \" \"\n",
366 |     "                \n",
367 |     "            if question_part[-1] == ' ':\n",
368 |     "                question_part = question_part[:-1]\n",
369 |     "        \n",
370 |     "        else:\n",
371 |     "            for i, grp in enumerate(tags):\n",
372 |     "                #Break the sentence after the first non-auxiliary verb\n",
373 |     "\n",
374 |     "                word = grp[0]\n",
375 |     "                tag = grp[1]\n",
376 |     "\n",
377 |     "                if(re.match(\"VB*\", tag) and word not in aux_list):\n",
378 |     "                    question_part+= word\n",
379 |     "\n",
380 |     "                    if i<len(tags) and 'NN' not in tags[i+1][1] and wh_word != 'When':\n",
381 |     "                        question_part+= \" \"+tags[i+1][0]\n",
382 |     "\n",
383 |     "                    break\n",
384 |     "                question_part+= word+ \" \"\n",
385 |     "        question= question_part.split(\" \"+ aux_list[pos])\n",
386 |     "        question= [aux_list[pos]+ \" \"]+ question\n",
387 |     "\n",
388 |     "\n",
389 |     "        question= [wh_word+ \" \"]+ question + [\"?\"]\n",
390 |     "\n",
391 |     "        question= ''.join(question)\n",
392 |     "        \n",
393 |     "        questions.append(question)\n",
394 |     "    \n",
395 |     "    return questions        "
396 |    ]
397 |   },
398 |   {
399 |    "cell_type": "code",
400 |    "execution_count": 94,
401 |    "metadata": {},
402 |    "outputs": [],
403 |    "source": [
404 |     "# Function used to pre-process sentences which have discourse markers in them\n",
405 |     "def discourse():\n",
406 |     "    temp = []\n",
407 |     "    target = \"\"\n",
408 |     "    questions = []\n",
409 |     "    global disc_sentences\n",
410 |     "    disc_sentences = {}\n",
411 |     "    for i in range(len(sentences)):\n",
412 |     "        maxLen = 9999999\n",
413 |     "        val = -1\n",
414 |     "        for j in discourse_markers:\n",
415 |     "            tmp = len(sentences[i].split(j)[0].split(' '))  \n",
416 |     "            \n",
417 |     "            # To get valid, first discourse marker.   \n",
418 |     "            if(len(sentences[i].split(j)) > 1 and tmp >= 3 and tmp < maxLen):\n",
419 |     "                maxLen = tmp\n",
420 |     "                val = j\n",
421 |     "                \n",
422 |     "        if(val != -1):\n",
423 |     "\n",
424 |     "            # To initialize a list for every new key\n",
425 |     "            if(disc_sentences.get(val, 'empty') == 'empty'):\n",
426 |     "                disc_sentences[val] = []\n",
427 |     "                \n",
428 |     "            disc_sentences[val].append(sentences[i])\n",
429 |     "            temp.append(sentences[i])\n",
430 |     "\n",
431 |     "\n",
432 |     "    nondisc_sentences = list(set(sentences) - set(temp))\n",
433 |     "    \n",
434 |     "    t = []\n",
435 |     "    for k, v in disc_sentences.items():\n",
436 |     "        for val in range(len(v)):\n",
437 |     "            # Split the sentence on discourse marker and identify the question part\n",
438 |     "            question_part = disc_sentences[k][val].split(k)[target_arg[k] - 1]\n",
439 |     "            q = generate_question(question_part, qtype[k][0])\n",
440 |     "            if(q != \"\"):\n",
441 |     "                questions.append([disc_sentences[k][val],q])\n",
442 |     "                \n",
443 |     "                \n",
444 |     "    for question_part in nondisc_sentences:\n",
445 |     "        s = \"non_disc\"\n",
446 |     "        sentence = question_part\n",
447 |     "        text = nltk.word_tokenize(question_part)\n",
448 |     "        if(text[0] == 'Yes'):\n",
449 |     "            question_part = question_part[5:]\n",
450 |     "            s = \"Yes/No\"\n",
451 |     "            \n",
452 |     "        elif(text[0] == 'No'):\n",
453 |     "            question_part = question_part[4:]\n",
454 |     "            s = \"Yes/No\"\n",
455 |     "            \n",
456 |     "        q = generate_question(question_part, s)\n",
457 |     "        if(q != \"\"):\n",
458 |     "            questions.append([sentence,q])\n",
459 |     "        l = generate_one_word_questions(question_part)\n",
460 |     "        questions += [[sentence,i] for i in l]\n",
461 |     "    print(len(questions))\n",
462 |     "    \n",
463 |     "    for pair in questions:\n",
464 |     "        print(\"S: \",pair[0])\n",
465 |     "        print(\"Q: \",pair[1])\n",
466 |     "        print()"
467 |    ]
468 |   },
469 |   {
470 |    "cell_type": "code",
471 |    "execution_count": 104,
472 |    "metadata": {
473 |     "scrolled": false
474 |    },
475 |    "outputs": [
476 |     {
477 |      "name": "stdout",
478 |      "output_type": "stream",
479 |      "text": [
480 |       "92\n",
481 |       "0.7608695652173914\n",
482 |       "0.8428571428571429\n"
483 |      ]
484 |     }
485 |    ],
486 |    "source": [
487 |     "# Syntactic Score and Fluency using Manual Evaluation\n",
488 |     "\n",
489 |     "syntactic_score = [0,0,1,1,1,1,1,0,1,1,1,0,1,0,1,1,1,1,1,1,1,0,1,1,0,1,0,1,1,1,1,1,0,1,1,1,1,0,1,0,1,0,1,1,1,1,1,1,0,0,1,1,1,0,1,1,0,1,1,1,1,1,1,1,1,0,1,1,0,1,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,0,1,1,1,0,1,1]\n",
490 |     "fluency_score   = [0,0,1,1,1,0,1,0,1,1,1,0,1,0,1,1,1,1,1,1,1,0,0,0,0,1,0,1,1,1,1,1,0,0,1,1,1,0,0,0,0,1,1,1,1,1,1,0,0,0,1,1,1,0,1,1,0,1,1,1,1,1,1,1,1,0,1,1,0,0,1,1,1,1,0,1,1,0,1,1,1,0,1,1,1,0,0,0,0,0,1,1]\n",
491 |     "print(len(syntactic_score))\n",
492 |     "print(sum(syntactic_score)/len(syntactic_score))\n",
493 |     "print(sum(fluency_score)/sum(syntactic_score))"
494 |    ]
495 |   },
496 |   {
497 |    "cell_type": "code",
498 |    "execution_count": 95,
499 |    "metadata": {
500 |     "scrolled": true
501 |    },
502 |    "outputs": [
503 |     {
504 |      "name": "stdout",
505 |      "output_type": "stream",
506 |      "text": [
507 |       "92\n",
508 |       "S:  I had been playing the drums since school time.\n",
509 |       "Q:  When were you been playing the drums ?\n",
510 |       "\n",
511 |       "S:  I have been up since four.\n",
512 |       "Q:  When do you hav been up ?\n",
513 |       "\n",
514 |       "S:  They were angry because their plans had been discovered.\n",
515 |       "Q:  Why were they angry ?\n",
516 |       "\n",
517 |       "S:  I think he felt included because he was helping as much as we were.\n",
518 |       "Q:  Why did you think he felt included ?\n",
519 |       "\n",
520 |       "S:  I am studying English because I’d like to immigrate to the U.S.\n",
521 |       "Q:  Why were you studying English ?\n",
522 |       "\n",
523 |       "S:  Children often cry just because they want some attention.\n",
524 |       "Q:  Why do children often cry just ?\n",
525 |       "\n",
526 |       "S:  Tom repainted his mailbox because it was looking shabby.\n",
527 |       "Q:  Why did tom repaint his mailbox ?\n",
528 |       "\n",
529 |       "S:  They slept in the car because they couldn't find a hotel.\n",
530 |       "Q:  Why do they slept in the car ?\n",
531 |       "\n",
532 |       "S:  The accident happened because of the driver's negligence.\n",
533 |       "Q:  Why did the accident hap ?\n",
534 |       "\n",
535 |       "S:  He gave up traveling abroad because of his sudden illness.\n",
536 |       "Q:  Why did he gav up traveling abroad ?\n",
537 |       "\n",
538 |       "S:  Harry was late for class yesterday because of his accident.\n",
539 |       "Q:  Why was harry late for class yesterday ?\n",
540 |       "\n",
541 |       "S:  We had a bad rice crop last year because it rained a lot.\n",
542 |       "Q:  Why were you a bad rice crop last year ?\n",
543 |       "\n",
544 |       "S:  I can play quite a few musical instruments, for example, the flute, the guitar, and the piano.\n",
545 |       "Q:  Give an example where I can play quite a few musical instruments, ?\n",
546 |       "\n",
547 |       "S:  Calcium is found in green leafy vegetables, for example, broccoli, kale, arugula, or spinach have over 160 mg. per serving.\n",
548 |       "Q:  Give an example where calcium is found in green leafy vegetables, ?\n",
549 |       "\n",
550 |       "S:  It is possible to combine Computer Science with other subjects, for example Physics.\n",
551 |       "Q:  Give an example where it is possible to combine Computer Science with other subjects, ?\n",
552 |       "\n",
553 |       "S:  Fractions can be written with oblique strokes, for example 2/3.\n",
554 |       "Q:  Give an example where fractions can be written with oblique strokes, ?\n",
555 |       "\n",
556 |       "S:  Vitamin C is found in colorful vegetables, for instance, bell peppers have a lot of vitamin C.\n",
557 |       "John is feeling much better now.\n",
558 |       "Q:  Give an instance where vitamin C is found in colorful vegetables, ?\n",
559 |       "\n",
560 |       "S:  He hurt his hand when he fell.\n",
561 |       "Q:  When did he hurt his hand ?\n",
562 |       "\n",
563 |       "S:  Every child feels displaced to some degree when a new sibling arrives.\n",
564 |       "Q:  When does every child feels displac to some degree ?\n",
565 |       "\n",
566 |       "S:  She was angry when you told her about the accident.\n",
567 |       "Q:  When was she angry ?\n",
568 |       "\n",
569 |       "S:  I married her when she was 23.\n",
570 |       "Q:  When did you marry her ?\n",
571 |       "\n",
572 |       "S:  I shall have been living in Mumbai for five years by May 2019.\n",
573 |       "Q:  Were you have been living in Mumbai for five years by May 2019 ?\n",
574 |       "\n",
575 |       "S:  I shall have been living in Mumbai for five years by May 2019.\n",
576 |       "Q:  Where shall I have been?\n",
577 |       "\n",
578 |       "S:  I shall have been living in Mumbai for five years by May 2019.\n",
579 |       "Q:  When shall I have be live in Mumbai for?\n",
580 |       "\n",
581 |       "S:  I shall have been living in Mumbai for five years by May 2019.\n",
582 |       "Q:  When shall I have be live in Mumbai for five years by?\n",
583 |       "\n",
584 |       "S:  It is ten o’clock.\n",
585 |       "Q:  Is it ten o ’ clock ?\n",
586 |       "\n",
587 |       "S:  It is ten o’clock.\n",
588 |       "Q:  Where is It ten o ’ clock ?\n",
589 |       "\n",
590 |       "S:  Yes, I like coffee.\n",
591 |       "Q:  Do you lik coffee?\n",
592 |       "\n",
593 |       "S:  He will go to China tomorrow.\n",
594 |       "Q:  Will he go to China tomorrow ?\n",
595 |       "\n",
596 |       "S:  He will go to China tomorrow.\n",
597 |       "Q:  Where will He go to?\n",
598 |       "\n",
599 |       "S:  He will go to China tomorrow.\n",
600 |       "Q:  When will He go to China?\n",
601 |       "\n",
602 |       "S:  He was elected as the Prime Minister of India on 15th August 1947.\n",
603 |       "Q:  Was he elected as the Prime Minister of India on 15th August 1947 ?\n",
604 |       "\n",
605 |       "S:  He was elected as the Prime Minister of India on 15th August 1947.\n",
606 |       "Q:  Where was He elected as?\n",
607 |       "\n",
608 |       "S:  He was elected as the Prime Minister of India on 15th August 1947.\n",
609 |       "Q:  When was He elect as the Prime Minister of India?\n",
610 |       "\n",
611 |       "S:  I was playing tennis.\n",
612 |       "Q:  Were you playing tennis ?\n",
613 |       "\n",
614 |       "S:  Yes, she is working very hard.\n",
615 |       "Q:  Is she working very hard ?\n",
616 |       "\n",
617 |       "S:  Abdul Kalam was an aerospace scientist who served as the 11th President of India from 2002 to 2007.\n",
618 |       "Q:  Was abdul Kalam an aerospace scientist who served as the 11th President of India from 2002 to 2007 ?\n",
619 |       "\n",
620 |       "S:  Abdul Kalam was an aerospace scientist who served as the 11th President of India from 2002 to 2007.\n",
621 |       "Q:  Who was an aerospace scientist who served as the 11th President of India from 2002 to 2007?\n",
622 |       "\n",
623 |       "S:  Abdul Kalam was an aerospace scientist who served as the 11th President of India from 2002 to 2007.\n",
624 |       "Q:  Where was Abdul Kalam an aerospace scientist who served as?\n",
625 |       "\n",
626 |       "S:  Abdul Kalam was an aerospace scientist who served as the 11th President of India from 2002 to 2007.\n",
627 |       "Q:  Where was Abdul Kalam an aerospace scientist who served as?\n",
628 |       "\n",
629 |       "S:  Abdul Kalam was an aerospace scientist who served as the 11th President of India from 2002 to 2007.\n",
630 |       "Q:  When was Abdul Kalam an aerospace scientist who serve as the 11th President of India from?\n",
631 |       "\n",
632 |       "S:  Abdul Kalam was an aerospace scientist who served as the 11th President of India from 2002 to 2007.\n",
633 |       "Q:  When was Abdul Kalam an aerospace scientist who serve as the 11th President of India from 2002 to?\n",
634 |       "\n",
635 |       "S:  Population refers to the number of individuals in a particular place.\n",
636 |       "Q:  Does population refers to the number of individuals in a particular place?\n",
637 |       "\n",
638 |       "S:  John was held captive at Castle Black.\n",
639 |       "Q:  Was john held captive at Castle Black ?\n",
640 |       "\n",
641 |       "S:  John was held captive at Castle Black.\n",
642 |       "Q:  Who was held captive at Castle Black?\n",
643 |       "\n",
644 |       "S:  John was held captive at Castle Black.\n",
645 |       "Q:  Where was John held captive?\n",
646 |       "\n",
647 |       "S:  The Taj Mahal is a beautiful monument built in 1631 by an Emperor named Shah Jahan in memory of his wife Mumtaz Mahal.\n",
648 |       "Q:  Is the Taj Mahal a beautiful monument built in 1631 by an Emperor named Shah Jahan in memory of his wife Mumtaz Mahal ?\n",
649 |       "\n",
650 |       "S:  The Taj Mahal is a beautiful monument built in 1631 by an Emperor named Shah Jahan in memory of his wife Mumtaz Mahal.\n",
651 |       "Q:  When is The Taj Mahal a beautiful monument build?\n",
652 |       "\n",
653 |       "S:  The Taj Mahal is a beautiful monument built in 1631 by an Emperor named Shah Jahan in memory of his wife Mumtaz Mahal.\n",
654 |       "Q:  Who is The Taj Mahal a beautiful monument built in?\n",
655 |       "\n",
656 |       "S:  The Taj Mahal is a beautiful monument built in 1631 by an Emperor named Shah Jahan in memory of his wife Mumtaz Mahal.\n",
657 |       "Q:  Where is The Taj Mahal a beautiful monument built in?\n",
658 |       "\n",
659 |       "S:  Gandhi Jayanti is celebrated on 2nd October.\n",
660 |       "Q:  Is gandhi Jayanti celebrated on 2nd October ?\n",
661 |       "\n",
662 |       "S:  Gandhi Jayanti is celebrated on 2nd October.\n",
663 |       "Q:  Who is celebrated on 2nd October?\n",
664 |       "\n",
665 |       "S:  Gandhi Jayanti is celebrated on 2nd October.\n",
666 |       "Q:  When is Gandhi Jayanti celebrate?\n",
667 |       "\n",
668 |       "S:  They have been trying to contact her.\n",
669 |       "Q:  Do they hav been try to contact her?\n",
670 |       "\n",
671 |       "S:  Sun is the largest member of the Solar System.\n",
672 |       "Q:  Is sun the largest member of the Solar System ?\n",
673 |       "\n",
674 |       "S:  Sun is the largest member of the Solar System.\n",
675 |       "Q:  Who is the largest member of the Solar System?\n",
676 |       "\n",
677 |       "S:  Sun is the largest member of the Solar System.\n",
678 |       "Q:  Where is Sun the largest member of the Solar System ?\n",
679 |       "\n",
680 |       "S:  She is preparing chicken sandwiches for breakfast.\n",
681 |       "Q:  Is she preparing chicken sandwiches for breakfast ?\n",
682 |       "\n",
683 |       "S:  We were playing tennis at the club.\n",
684 |       "Q:  Were you playing tennis at the club ?\n",
685 |       "\n",
686 |       "S:  Jawaharlal Nehru was born on 14th November 1889 in Allahabad, Uttar Pradesh.\n",
687 |       "Q:  Was jawaharlal Nehru born on 14th November 1889 in Allahabad , Uttar Pradesh ?\n",
688 |       "\n",
689 |       "S:  Jawaharlal Nehru was born on 14th November 1889 in Allahabad, Uttar Pradesh.\n",
690 |       "Q:  Who was born on 14th November 1889 in Allahabad, Uttar Pradesh?\n",
691 |       "\n",
692 |       "S:  Jawaharlal Nehru was born on 14th November 1889 in Allahabad, Uttar Pradesh.\n",
693 |       "Q:  When was Jawaharlal Nehru bear?\n",
694 |       "\n",
695 |       "S:  Jawaharlal Nehru was born on 14th November 1889 in Allahabad, Uttar Pradesh.\n",
696 |       "Q:  Where was Jawaharlal Nehru born on?\n",
697 |       "\n",
698 |       "S:  Jawaharlal Nehru was born on 14th November 1889 in Allahabad, Uttar Pradesh.\n",
699 |       "Q:  Where was Jawaharlal Nehru born on?\n",
700 |       "\n",
701 |       "S:  Hindi Diwas was first celebrated in the year 1953.\n",
702 |       "Q:  Was hindi Diwas first celebrated in the year 1953 ?\n",
703 |       "\n",
704 |       "S:  Hindi Diwas was first celebrated in the year 1953.\n",
705 |       "Q:  Who was first celebrated in the year 1953?\n",
706 |       "\n",
707 |       "S:  Hindi Diwas was first celebrated in the year 1953.\n",
708 |       "Q:  When was Hindi Diwas first celebrate?\n",
709 |       "\n",
710 |       "S:  Delhi is the capital of India.\n",
711 |       "Q:  Is delhi the capital of India ?\n",
712 |       "\n",
713 |       "S:  Delhi is the capital of India.\n",
714 |       "Q:  Who is the capital of India?\n",
715 |       "\n",
716 |       "S:  Delhi is the capital of India.\n",
717 |       "Q:  Where is Delhi the capital of India ?\n",
718 |       "\n",
719 |       "S:  Sachin Tendulkar was awarded Bharat Ratna in 2013.\n",
720 |       "Q:  Was sachin Tendulkar awarded Bharat Ratna in 2013 ?\n",
721 |       "\n",
722 |       "S:  Sachin Tendulkar was awarded Bharat Ratna in 2013.\n",
723 |       "Q:  Who was awarded Bharat Ratna in 2013?\n",
724 |       "\n",
725 |       "S:  Sachin Tendulkar was awarded Bharat Ratna in 2013.\n",
726 |       "Q:  Who was Sachin Tendulkar awarded?\n",
727 |       "\n",
728 |       "S:  Sachin Tendulkar was awarded Bharat Ratna in 2013.\n",
729 |       "Q:  When was Sachin Tendulkar award Bharat Ratna?\n",
730 |       "\n",
731 |       "S:  His name is Peter.\n",
732 |       "Q:  Is his name Peter ?\n",
733 |       "\n",
734 |       "S:  His name is Peter.\n",
735 |       "Q:  Who is His name Peter ?\n",
736 |       "\n",
737 |       "S:  No, I was not playing cricket.\n",
738 |       "Q:  Were you not playing cricket ?\n",
739 |       "\n",
740 |       "S:  Darjeeling is known for its beautiful tea gardens.\n",
741 |       "Q:  Is darjeeling known for its beautiful tea gardens ?\n",
742 |       "\n",
743 |       "S:  Mahatma Gandhi was born on 2nd October 1869 in Porbandar, Gujarat.\n",
744 |       "Q:  Was mahatma Gandhi born on 2nd October 1869 in Porbandar , Gujarat ?\n",
745 |       "\n",
746 |       "S:  Mahatma Gandhi was born on 2nd October 1869 in Porbandar, Gujarat.\n",
747 |       "Q:  Who was born on 2nd October 1869 in Porbandar, Gujarat?\n",
748 |       "\n",
749 |       "S:  Mahatma Gandhi was born on 2nd October 1869 in Porbandar, Gujarat.\n",
750 |       "Q:  When was Mahatma Gandhi bear?\n",
751 |       "\n",
752 |       "S:  Mahatma Gandhi was born on 2nd October 1869 in Porbandar, Gujarat.\n",
753 |       "Q:  Where was Mahatma Gandhi born on?\n",
754 |       "\n",
755 |       "S:  Mahatma Gandhi was born on 2nd October 1869 in Porbandar, Gujarat.\n",
756 |       "Q:  Where was Mahatma Gandhi born on?\n",
757 |       "\n",
758 |       "S:  The British had introduced a National Flag for British India after the revolt of 1857.\n",
759 |       "Q:  Had the British introduced a National Flag for British India after the revolt of 1857 ?\n",
760 |       "\n",
761 |       "S:  The British had introduced a National Flag for British India after the revolt of 1857.\n",
762 |       "Q:  Where had The British introduced a?\n",
763 |       "\n",
764 |       "S:  The British had introduced a National Flag for British India after the revolt of 1857.\n",
765 |       "Q:  Where had The British introduced a?\n",
766 |       "\n",
767 |       "S:  The British had introduced a National Flag for British India after the revolt of 1857.\n",
768 |       "Q:  Where had The British introduced a?\n",
769 |       "\n",
770 |       "S:  The British had introduced a National Flag for British India after the revolt of 1857.\n",
771 |       "Q:  When had The British introduce a National Flag for British India after the revolt of?\n",
772 |       "\n",
773 |       "S:  They grow really well in pots.\n",
774 |       "Q:  Do they grow really well in pots?\n",
775 |       "\n",
776 |       "S:  You usually walk to work.\n",
777 |       "Q:  Do you usually walk to work?\n",
778 |       "\n",
779 |       "S:  I did go for fishing today.\n",
780 |       "Q:  Were you go for fishing today ?\n",
781 |       "\n",
782 |       "S:  I did go for fishing today.\n",
783 |       "Q:  When did I go for fishing?\n",
784 |       "\n"
785 |      ]
786 |     }
787 |    ],
788 |    "source": [
789 |     "sentensify()"
790 |    ]
791 |   },
792 |   {
793 |    "cell_type": "code",
794 |    "execution_count": null,
795 |    "metadata": {},
796 |    "outputs": [],
797 |    "source": []
798 |   },
799 |   {
800 |    "cell_type": "code",
801 |    "execution_count": null,
802 |    "metadata": {},
803 |    "outputs": [],
804 |    "source": []
805 |   }
806 |  ],
807 |  "metadata": {
808 |   "kernelspec": {
809 |    "display_name": "Python 3",
810 |    "language": "python",
811 |    "name": "python3"
812 |   },
813 |   "language_info": {
814 |    "codemirror_mode": {
815 |     "name": "ipython",
816 |     "version": 3
817 |    },
818 |    "file_extension": ".py",
819 |    "mimetype": "text/x-python",
820 |    "name": "python",
821 |    "nbconvert_exporter": "python",
822 |    "pygments_lexer": "ipython3",
823 |    "version": "3.6.7"
824 |   }
825 |  },
826 |  "nbformat": 4,
827 |  "nbformat_minor": 2
828 | }
829 | 


--------------------------------------------------------------------------------
/Code.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "# Imports\n",
 10 |     "import nltk\n",
 11 |     "import nltk.data\n",
 12 |     "from nltk.stem.lancaster import LancasterStemmer\n",
 13 |     "from nltk.stem.wordnet import WordNetLemmatizer\n",
 14 |     "import re\n",
 15 |     "import spacy\n",
 16 |     "import pandas as pd"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "code",
 21 |    "execution_count": 2,
 22 |    "metadata": {},
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "# Class initializations\n",
 26 |     "nlp = spacy.load('en_core_web_sm')\n",
 27 |     "stemmer = LancasterStemmer()"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": 3,
 33 |    "metadata": {},
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "# List to hold all input sentences\n",
 37 |     "sentences = []\n",
 38 |     "\n",
 39 |     "# Dictionary to hold sentences corresponding to respective discourse markers\n",
 40 |     "disc_sentences = {}\n",
 41 |     "\n",
 42 |     "# Remaining sentences which do not have discourse markers (To be used later to generate other kinds of questions)\n",
 43 |     "nondisc_sentences = []\n",
 44 |     "\n",
 45 |     "# List of auxiliary verbs\n",
 46 |     "aux_list = ['am', 'are', 'is', 'was', 'were', 'can', 'could', 'does', 'do', 'did', 'has', 'had', 'may', 'might', 'must',\n",
 47 |     "            'need', 'ought', 'shall', 'should', 'will', 'would']\n",
 48 |     "\n",
 49 |     "# List of all discourse markers\n",
 50 |     "discourse_markers = ['because', 'as a result', 'since', 'when', 'although', 'for example', 'for instance']\n",
 51 |     "\n",
 52 |     "# Different question types possible for each discourse marker\n",
 53 |     "qtype = {'because': ['Why'], 'since': ['When', 'Why'], 'when': ['When'], 'although': ['Yes/No'], 'as a result': ['Why'], \n",
 54 |     "        'for example': ['Give an example where'], 'for instance': ['Give an instance where'], 'to': ['Why']}\n",
 55 |     "\n",
 56 |     "# The argument which forms a question\n",
 57 |     "target_arg = {'because': 1, 'since': 1, 'when': 1, 'although': 1, 'as a result': 2, 'for example': 1, 'for instance': 1, \n",
 58 |     "              'to': 1}"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 4,
 64 |    "metadata": {},
 65 |    "outputs": [],
 66 |    "source": [
 67 |     "# This function is used to tokenize and split into sentences\n",
 68 |     "def sentensify():\n",
 69 |     "    global sentences\n",
 70 |     "    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')\n",
 71 |     "    fp = open('input.txt')\n",
 72 |     "    data = fp.read()\n",
 73 |     "    sentences = tokenizer.tokenize(data)\n",
 74 |     "    discourse()"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": 5,
 80 |    "metadata": {},
 81 |    "outputs": [],
 82 |    "source": [
 83 |     "# Function used to generate the questions from sentences which have already been pre-processed.\n",
 84 |     "def generate_question(question_part, type):\n",
 85 |     "\n",
 86 |     "    ''' \n",
 87 |     "        question_part -> Part of input sentence which forms a question\n",
 88 |     "        type-> The type of question (why, where, etc)\n",
 89 |     "    '''\n",
 90 |     "    # Remove full stop and make first letter lower case\n",
 91 |     "    question_part = question_part[0].lower() + question_part[1:]\n",
 92 |     "    if(question_part[-1] == '.' or question_part[-1] == ','):\n",
 93 |     "        question_part = question_part[:-1]\n",
 94 |     "        \n",
 95 |     "    # Capitalizing 'i' since 'I' is recognized by parsers appropriately    \n",
 96 |     "    for i in range(0, len(question_part)):\n",
 97 |     "        if(question_part[i] == 'i'):\n",
 98 |     "            if((i == 0 and question_part[i+1] == ' ') or (question_part[i-1] == ' ' and question_part[i+1] == ' ')):\n",
 99 |     "                question_part = question_part[:i] + 'I' + question_part[i + 1: ]\n",
100 |     "                \n",
101 |     "    question = \"\"\n",
102 |     "    if(type == 'Give an example where' or type == 'Give an instance where'):\n",
103 |     "        question = type + \" \" + question_part + '?'\n",
104 |     "        return question\n",
105 |     "\n",
106 |     "    aux_verb = False\n",
107 |     "    res = None\n",
108 |     "    \n",
109 |     "    # Find out if auxiliary verb already exists\n",
110 |     "    for i in range(len(aux_list)):\n",
111 |     "        if(aux_list[i] in question_part.split()):\n",
112 |     "            aux_verb = True\n",
113 |     "            pos = i\n",
114 |     "            break\n",
115 |     "\n",
116 |     "    # If auxiliary verb exists\n",
117 |     "    if(aux_verb):\n",
118 |     "        \n",
119 |     "        # Tokenize the part of the sentence from which the question has to be made\n",
120 |     "        text = nltk.word_tokenize(question_part)\n",
121 |     "        tags = nltk.pos_tag(text)\n",
122 |     "        question_part = \"\"\n",
123 |     "        fP = False\n",
124 |     "        \n",
125 |     "        for word, tag in tags:\n",
126 |     "            if(word in ['I', 'We', 'we']):\n",
127 |     "                question_part += 'you' + \" \"\n",
128 |     "                fP = True\n",
129 |     "                continue\n",
130 |     "            question_part += word + \" \"\n",
131 |     "\n",
132 |     "        # Split across the auxiliary verb and prepend it at the start of question part\n",
133 |     "        question = question_part.split(\" \" + aux_list[pos])\n",
134 |     "        if(fP):\n",
135 |     "             question = [\"were \"] + question\n",
136 |     "        else:\n",
137 |     "            question = [aux_list[pos] + \" \"] + question\n",
138 |     "\n",
139 |     "        # If Yes/No, no need to introduce question phrase\n",
140 |     "        if(type == 'Yes/No'):\n",
141 |     "            question += ['?']\n",
142 |     "            \n",
143 |     "        elif(type != \"non_disc\"):\n",
144 |     "            question = [type + \" \"] + question + [\"?\"]\n",
145 |     "            \n",
146 |     "        else:\n",
147 |     "            question = question + [\"?\"]\n",
148 |     "         \n",
149 |     "        question = ''.join(question)\n",
150 |     "\n",
151 |     "    # If auxiliary verb does not exist, it can only be some form of verb 'do'\n",
152 |     "    else:\n",
153 |     "        aux = None\n",
154 |     "        text = nltk.word_tokenize(question_part)\n",
155 |     "        tags = nltk.pos_tag(text)\n",
156 |     "        comb = \"\"\n",
157 |     "\n",
158 |     "        '''There can be following combinations of nouns and verbs:\n",
159 |     "            NN/NNP and VBZ  -> Does\n",
160 |     "            NNS/NNPS(plural) and VBP -> Do\n",
161 |     "            NN/NNP and VBN -> Did\n",
162 |     "            NNS/NNPS(plural) and VBN -> Did\n",
163 |     "        '''\n",
164 |     "        \n",
165 |     "        for tag in tags:\n",
166 |     "            if(comb == \"\"):\n",
167 |     "                if(tag[1] == 'NN' or tag[1] == 'NNP'):\n",
168 |     "                    comb = 'NN'\n",
169 |     "\n",
170 |     "                elif(tag[1] == 'NNS' or tag[1] == 'NNPS'):\n",
171 |     "                    comb = 'NNS'\n",
172 |     "\n",
173 |     "                elif(tag[1] == 'PRP'):\n",
174 |     "                    if tag[0] in ['He','She','It']:\n",
175 |     "                        comb = 'PRPS'\n",
176 |     "                    else:\n",
177 |     "                        comb = 'PRPP'\n",
178 |     "                        tmp = question_part.split(\" \")\n",
179 |     "                        tmp = tmp[1: ]\n",
180 |     "                        if(tag[0] in ['I', 'we', 'We']):\n",
181 |     "                            question_part = 'you ' + ' '.join(tmp)\n",
182 |     "                            \n",
183 |     "            if(res == None):\n",
184 |     "                res = re.match(r\"VB*\", tag[1])\n",
185 |     "                if(res):\n",
186 |     "                    \n",
187 |     "                    # Stem the verb\n",
188 |     "                    question_part = question_part.replace(tag[0], stemmer.stem(tag[0]))\n",
189 |     "                res = re.match(r\"VBN\", tag[1])\n",
190 |     "                res = re.match(r\"VBD\", tag[1])\n",
191 |     "\n",
192 |     "        if(comb == 'NN'):\n",
193 |     "            aux = 'does'\n",
194 |     "            \n",
195 |     "        elif(comb == 'NNS'):\n",
196 |     "            aux = 'do'\n",
197 |     "            \n",
198 |     "        elif(comb == 'PRPS'):\n",
199 |     "            aux = 'does'\n",
200 |     "            \n",
201 |     "        elif(comb == 'PRPP'):\n",
202 |     "            aux = 'do'\n",
203 |     "            \n",
204 |     "        if(res and res.group() in ['VBD', 'VBN']):\n",
205 |     "            aux = 'did'\n",
206 |     "\n",
207 |     "        if(aux):\n",
208 |     "            if(type == \"non_disc\" or type == \"Yes/No\"):\n",
209 |     "                question = aux + \" \" + question_part + \"?\"\n",
210 |     "\n",
211 |     "            else:\n",
212 |     "                question = type + \" \" + aux + \" \" + question_part + \"?\"\n",
213 |     "    if(question != \"\"):\n",
214 |     "        question = question[0].upper() + question[1:]\n",
215 |     "    return question"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "code",
220 |    "execution_count": 6,
221 |    "metadata": {},
222 |    "outputs": [],
223 |    "source": [
224 |     "# This function is used to get the named entities\n",
225 |     "def get_named_entities(sent):\n",
226 |     "    doc = nlp(sent)\n",
227 |     "    named_entities = [(X.text, X.label_) for X in doc.ents]\n",
228 |     "    return named_entities"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "code",
233 |    "execution_count": 7,
234 |    "metadata": {},
235 |    "outputs": [],
236 |    "source": [
237 |     "# This function is used to get the required wh word\n",
238 |     "def get_wh_word(entity, sent):\n",
239 |     "    wh_word = \"\"\n",
240 |     "    if entity[1] in ['TIME', 'DATE']:\n",
241 |     "        wh_word = 'When'\n",
242 |     "        \n",
243 |     "    elif entity[1] == ['PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW', 'LANGUAGE']:\n",
244 |     "        wh_word = 'What'\n",
245 |     "        \n",
246 |     "    elif entity[1] in ['PERSON']:\n",
247 |     "            wh_word = 'Who'\n",
248 |     "            \n",
249 |     "    elif entity[1] in ['NORP', 'FAC' ,'ORG', 'GPE', 'LOC']:\n",
250 |     "        index = sent.find(entity[0])\n",
251 |     "        if index == 0:\n",
252 |     "            wh_word = \"Who\"\n",
253 |     "            \n",
254 |     "        else:\n",
255 |     "            wh_word = \"Where\"\n",
256 |     "            \n",
257 |     "    else:\n",
258 |     "        wh_word = \"Where\"\n",
259 |     "    return wh_word"
260 |    ]
261 |   },
262 |   {
263 |    "cell_type": "code",
264 |    "execution_count": 8,
265 |    "metadata": {},
266 |    "outputs": [],
267 |    "source": [
268 |     "# This function generates questions based on NER templates\n",
269 |     "def generate_one_word_questions(sent):\n",
270 |     "    \n",
271 |     "    named_entities = get_named_entities(sent)\n",
272 |     "    questions = []\n",
273 |     "    \n",
274 |     "    if not named_entities:\n",
275 |     "        return questions\n",
276 |     "    \n",
277 |     "    for entity in named_entities:\n",
278 |     "        wh_word = get_wh_word(entity, sent)\n",
279 |     "        \n",
280 |     "        if(sent[-1] == '.'):\n",
281 |     "            sent = sent[:-1]\n",
282 |     "        \n",
283 |     "        if sent.find(entity[0]) == 0:\n",
284 |     "            questions.append(sent.replace(entity[0],wh_word) + '?')\n",
285 |     "            continue\n",
286 |     "       \n",
287 |     "        question = \"\"\n",
288 |     "        aux_verb = False\n",
289 |     "        res = None\n",
290 |     "\n",
291 |     "        for i in range(len(aux_list)):\n",
292 |     "            if(aux_list[i] in sent.split()):\n",
293 |     "                aux_verb = True\n",
294 |     "                pos = i\n",
295 |     "                break\n",
296 |     "            \n",
297 |     "        if not aux_verb:\n",
298 |     "            pos = 9\n",
299 |     "        \n",
300 |     "        text = nltk.word_tokenize(sent)\n",
301 |     "        tags = nltk.pos_tag(text)\n",
302 |     "        question_part = \"\"\n",
303 |     "        \n",
304 |     "        if wh_word == 'When':\n",
305 |     "            word_list = sent.split(entity[0])[0].split()\n",
306 |     "            if word_list[-1] in ['in', 'at', 'on']:\n",
307 |     "                question_part = \" \".join(word_list[:-1])\n",
308 |     "            else:\n",
309 |     "                question_part = \" \".join(word_list)\n",
310 |     "            \n",
311 |     "            qp_text = nltk.word_tokenize(question_part)\n",
312 |     "            qp_tags = nltk.pos_tag(qp_text)\n",
313 |     "            \n",
314 |     "            question_part = \"\"\n",
315 |     "            \n",
316 |     "            for i, grp in enumerate(qp_tags):\n",
317 |     "                word = grp[0]\n",
318 |     "                tag = grp[1]\n",
319 |     "                if(re.match(\"VB*\", tag) and word not in aux_list):\n",
320 |     "                    question_part += WordNetLemmatizer().lemmatize(word,'v') + \" \"\n",
321 |     "                else:\n",
322 |     "                    question_part += word + \" \"\n",
323 |     "                \n",
324 |     "            if question_part[-1] == ' ':\n",
325 |     "                question_part = question_part[:-1]\n",
326 |     "        \n",
327 |     "        else:\n",
328 |     "            for i, grp in enumerate(tags):\n",
329 |     "                \n",
330 |     "                #Break the sentence after the first non-auxiliary verb\n",
331 |     "                word = grp[0]\n",
332 |     "                tag = grp[1]\n",
333 |     "\n",
334 |     "                if(re.match(\"VB*\", tag) and word not in aux_list):\n",
335 |     "                    question_part += word\n",
336 |     "\n",
337 |     "                    if i<len(tags) and 'NN' not in tags[i+1][1] and wh_word != 'When':\n",
338 |     "                        question_part += \" \"+ tags[i+1][0]\n",
339 |     "\n",
340 |     "                    break\n",
341 |     "                question_part += word + \" \"\n",
342 |     "        question = question_part.split(\" \"+ aux_list[pos])\n",
343 |     "        question = [aux_list[pos] + \" \"] + question\n",
344 |     "        question = [wh_word+ \" \"] + question + [\"?\"]\n",
345 |     "        question = ''.join(question)\n",
346 |     "        questions.append(question)\n",
347 |     "    \n",
348 |     "    return questions        "
349 |    ]
350 |   },
351 |   {
352 |    "cell_type": "code",
353 |    "execution_count": 9,
354 |    "metadata": {},
355 |    "outputs": [],
356 |    "source": [
357 |     "# Function used to pre-process sentences which have discourse markers in them\n",
358 |     "def discourse():\n",
359 |     "    temp = []\n",
360 |     "    target = \"\"\n",
361 |     "    questions = []\n",
362 |     "    global disc_sentences\n",
363 |     "    disc_sentences = {}\n",
364 |     "    for i in range(len(sentences)):\n",
365 |     "        maxLen = 9999999\n",
366 |     "        val = -1\n",
367 |     "        for j in discourse_markers:\n",
368 |     "            tmp = len(sentences[i].split(j)[0].split(' '))  \n",
369 |     "            \n",
370 |     "            # To get valid, first discourse marker.   \n",
371 |     "            if(len(sentences[i].split(j)) > 1 and tmp >= 3 and tmp < maxLen):\n",
372 |     "                maxLen = tmp\n",
373 |     "                val = j\n",
374 |     "                \n",
375 |     "        if(val != -1):\n",
376 |     "\n",
377 |     "            # To initialize a list for every new key\n",
378 |     "            if(disc_sentences.get(val, 'empty') == 'empty'):\n",
379 |     "                disc_sentences[val] = []\n",
380 |     "                \n",
381 |     "            disc_sentences[val].append(sentences[i])\n",
382 |     "            temp.append(sentences[i])\n",
383 |     "\n",
384 |     "\n",
385 |     "    nondisc_sentences = list(set(sentences) - set(temp))\n",
386 |     "    \n",
387 |     "    t = []\n",
388 |     "    for k, v in disc_sentences.items():\n",
389 |     "        for val in range(len(v)):\n",
390 |     "            \n",
391 |     "            # Split the sentence on discourse marker and identify the question part\n",
392 |     "            question_part = disc_sentences[k][val].split(k)[target_arg[k] - 1]\n",
393 |     "            q = generate_question(question_part, qtype[k][0])\n",
394 |     "            if(q != \"\"):\n",
395 |     "                questions.append([disc_sentences[k][val],q])\n",
396 |     "                \n",
397 |     "                \n",
398 |     "    for question_part in nondisc_sentences:\n",
399 |     "        s = \"non_disc\"\n",
400 |     "        sentence = question_part\n",
401 |     "        text = nltk.word_tokenize(question_part)\n",
402 |     "        if(text[0] == 'Yes'):\n",
403 |     "            question_part = question_part[5:]\n",
404 |     "            s = \"Yes/No\"\n",
405 |     "            \n",
406 |     "        elif(text[0] == 'No'):\n",
407 |     "            question_part = question_part[4:]\n",
408 |     "            s = \"Yes/No\"\n",
409 |     "            \n",
410 |     "        q = generate_question(question_part, s)\n",
411 |     "        if(q != \"\"):\n",
412 |     "            questions.append([sentence,q])\n",
413 |     "        l = generate_one_word_questions(question_part)\n",
414 |     "        questions += [[sentence,i] for i in l]\n",
415 |     "    print(len(questions))\n",
416 |     "    \n",
417 |     "    for pair in questions:\n",
418 |     "        print(\"S: \",pair[0])\n",
419 |     "        print(\"Q: \",pair[1])\n",
420 |     "        print()"
421 |    ]
422 |   },
423 |   {
424 |    "cell_type": "code",
425 |    "execution_count": 10,
426 |    "metadata": {
427 |     "scrolled": false
428 |    },
429 |    "outputs": [
430 |     {
431 |      "name": "stdout",
432 |      "output_type": "stream",
433 |      "text": [
434 |       "92\n",
435 |       "0.7608695652173914\n",
436 |       "0.8428571428571429\n"
437 |      ]
438 |     }
439 |    ],
440 |    "source": [
441 |     "# Syntactic Score and Fluency using Manual Evaluation\n",
442 |     "\n",
443 |     "syntactic_score = [0,0,1,1,1,1,1,0,1,1,1,0,1,0,1,1,1,1,1,1,1,0,1,1,0,1,0,1,1,1,1,1,0,1,1,1,1,0,1,0,1,0,1,1,1,1,1,1,0,0,\n",
444 |     "                    1,1,1,0,1,1,0,1,1,1,1,1,1,1,1,0,1,1,0,1,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,0,1,1,1,0,1,1]\n",
445 |     "fluency_score   = [0,0,1,1,1,0,1,0,1,1,1,0,1,0,1,1,1,1,1,1,1,0,0,0,0,1,0,1,1,1,1,1,0,0,1,1,1,0,0,0,0,1,1,1,1,1,1,0,0,0,\n",
446 |     "                    1,1,1,0,1,1,0,1,1,1,1,1,1,1,1,0,1,1,0,0,1,1,1,1,0,1,1,0,1,1,1,0,1,1,1,0,0,0,0,0,1,1]\n",
447 |     "print(len(syntactic_score))\n",
448 |     "print(sum(syntactic_score)/len(syntactic_score))\n",
449 |     "print(sum(fluency_score)/sum(syntactic_score))"
450 |    ]
451 |   },
452 |   {
453 |    "cell_type": "code",
454 |    "execution_count": 11,
455 |    "metadata": {
456 |     "scrolled": true
457 |    },
458 |    "outputs": [
459 |     {
460 |      "name": "stdout",
461 |      "output_type": "stream",
462 |      "text": [
463 |       "92\n",
464 |       "S:  They were angry because their plans had been discovered.\n",
465 |       "Q:  Why were they angry ?\n",
466 |       "\n",
467 |       "S:  I think he felt included because he was helping as much as we were.\n",
468 |       "Q:  Why did you think he felt included ?\n",
469 |       "\n",
470 |       "S:  I am studying English because I’d like to immigrate to the U.S.\n",
471 |       "Q:  Why were you studying English ?\n",
472 |       "\n",
473 |       "S:  Children often cry just because they want some attention.\n",
474 |       "Q:  Why do children often cry just ?\n",
475 |       "\n",
476 |       "S:  Tom repainted his mailbox because it was looking shabby.\n",
477 |       "Q:  Why did tom repaint his mailbox ?\n",
478 |       "\n",
479 |       "S:  They slept in the car because they couldn't find a hotel.\n",
480 |       "Q:  Why do they slept in the car ?\n",
481 |       "\n",
482 |       "S:  The accident happened because of the driver's negligence.\n",
483 |       "Q:  Why did the accident hap ?\n",
484 |       "\n",
485 |       "S:  He gave up traveling abroad because of his sudden illness.\n",
486 |       "Q:  Why did he gav up traveling abroad ?\n",
487 |       "\n",
488 |       "S:  Harry was late for class yesterday because of his accident.\n",
489 |       "Q:  Why was harry late for class yesterday ?\n",
490 |       "\n",
491 |       "S:  We had a bad rice crop last year because it rained a lot.\n",
492 |       "Q:  Why were you a bad rice crop last year ?\n",
493 |       "\n",
494 |       "S:  I can play quite a few musical instruments, for example, the flute, the guitar, and the piano.\n",
495 |       "Q:  Give an example where I can play quite a few musical instruments, ?\n",
496 |       "\n",
497 |       "S:  Calcium is found in green leafy vegetables, for example, broccoli, kale, arugula, or spinach have over 160 mg. per serving.\n",
498 |       "Q:  Give an example where calcium is found in green leafy vegetables, ?\n",
499 |       "\n",
500 |       "S:  It is possible to combine Computer Science with other subjects, for example Physics.\n",
501 |       "Q:  Give an example where it is possible to combine Computer Science with other subjects, ?\n",
502 |       "\n",
503 |       "S:  Fractions can be written with oblique strokes, for example 2/3.\n",
504 |       "Q:  Give an example where fractions can be written with oblique strokes, ?\n",
505 |       "\n",
506 |       "S:  He hurt his hand when he fell.\n",
507 |       "Q:  When did he hurt his hand ?\n",
508 |       "\n",
509 |       "S:  Every child feels displaced to some degree when a new sibling arrives.\n",
510 |       "Q:  When does every child feels displac to some degree ?\n",
511 |       "\n",
512 |       "S:  She was angry when you told her about the accident.\n",
513 |       "Q:  When was she angry ?\n",
514 |       "\n",
515 |       "S:  I married her when she was 23.\n",
516 |       "Q:  When did you marry her ?\n",
517 |       "\n",
518 |       "S:  Vitamin C is found in colorful vegetables, for instance, bell peppers have a lot of vitamin C.\n",
519 |       "John is feeling much better now.\n",
520 |       "Q:  Give an instance where vitamin C is found in colorful vegetables, ?\n",
521 |       "\n",
522 |       "S:  I had been playing the drums since school time.\n",
523 |       "Q:  When were you been playing the drums ?\n",
524 |       "\n",
525 |       "S:  I have been up since four.\n",
526 |       "Q:  When do you hav been up ?\n",
527 |       "\n",
528 |       "S:  He will go to China tomorrow.\n",
529 |       "Q:  Will he go to China tomorrow ?\n",
530 |       "\n",
531 |       "S:  He will go to China tomorrow.\n",
532 |       "Q:  Where will He go to?\n",
533 |       "\n",
534 |       "S:  He will go to China tomorrow.\n",
535 |       "Q:  When will He go to China?\n",
536 |       "\n",
537 |       "S:  Yes, she is working very hard.\n",
538 |       "Q:  Is she working very hard ?\n",
539 |       "\n",
540 |       "S:  You usually walk to work.\n",
541 |       "Q:  Do you usually walk to work?\n",
542 |       "\n",
543 |       "S:  The British had introduced a National Flag for British India after the revolt of 1857.\n",
544 |       "Q:  Had the British introduced a National Flag for British India after the revolt of 1857 ?\n",
545 |       "\n",
546 |       "S:  The British had introduced a National Flag for British India after the revolt of 1857.\n",
547 |       "Q:  Where had The British introduced a?\n",
548 |       "\n",
549 |       "S:  The British had introduced a National Flag for British India after the revolt of 1857.\n",
550 |       "Q:  Where had The British introduced a?\n",
551 |       "\n",
552 |       "S:  The British had introduced a National Flag for British India after the revolt of 1857.\n",
553 |       "Q:  Where had The British introduced a?\n",
554 |       "\n",
555 |       "S:  The British had introduced a National Flag for British India after the revolt of 1857.\n",
556 |       "Q:  When had The British introduce a National Flag for British India after the revolt of?\n",
557 |       "\n",
558 |       "S:  Gandhi Jayanti is celebrated on 2nd October.\n",
559 |       "Q:  Is gandhi Jayanti celebrated on 2nd October ?\n",
560 |       "\n",
561 |       "S:  Gandhi Jayanti is celebrated on 2nd October.\n",
562 |       "Q:  Who is celebrated on 2nd October?\n",
563 |       "\n",
564 |       "S:  Gandhi Jayanti is celebrated on 2nd October.\n",
565 |       "Q:  When is Gandhi Jayanti celebrate?\n",
566 |       "\n",
567 |       "S:  I was playing tennis.\n",
568 |       "Q:  Were you playing tennis ?\n",
569 |       "\n",
570 |       "S:  No, I was not playing cricket.\n",
571 |       "Q:  Were you not playing cricket ?\n",
572 |       "\n",
573 |       "S:  Mahatma Gandhi was born on 2nd October 1869 in Porbandar, Gujarat.\n",
574 |       "Q:  Was mahatma Gandhi born on 2nd October 1869 in Porbandar , Gujarat ?\n",
575 |       "\n",
576 |       "S:  Mahatma Gandhi was born on 2nd October 1869 in Porbandar, Gujarat.\n",
577 |       "Q:  Who was born on 2nd October 1869 in Porbandar, Gujarat?\n",
578 |       "\n",
579 |       "S:  Mahatma Gandhi was born on 2nd October 1869 in Porbandar, Gujarat.\n",
580 |       "Q:  When was Mahatma Gandhi bear?\n",
581 |       "\n",
582 |       "S:  Mahatma Gandhi was born on 2nd October 1869 in Porbandar, Gujarat.\n",
583 |       "Q:  Where was Mahatma Gandhi born on?\n",
584 |       "\n",
585 |       "S:  Mahatma Gandhi was born on 2nd October 1869 in Porbandar, Gujarat.\n",
586 |       "Q:  Where was Mahatma Gandhi born on?\n",
587 |       "\n",
588 |       "S:  Sun is the largest member of the Solar System.\n",
589 |       "Q:  Is sun the largest member of the Solar System ?\n",
590 |       "\n",
591 |       "S:  Sun is the largest member of the Solar System.\n",
592 |       "Q:  Who is the largest member of the Solar System?\n",
593 |       "\n",
594 |       "S:  Sun is the largest member of the Solar System.\n",
595 |       "Q:  Where is Sun the largest member of the Solar System ?\n",
596 |       "\n",
597 |       "S:  They grow really well in pots.\n",
598 |       "Q:  Do they grow really well in pots?\n",
599 |       "\n",
600 |       "S:  John was held captive at Castle Black.\n",
601 |       "Q:  Was john held captive at Castle Black ?\n",
602 |       "\n",
603 |       "S:  John was held captive at Castle Black.\n",
604 |       "Q:  Who was held captive at Castle Black?\n",
605 |       "\n",
606 |       "S:  John was held captive at Castle Black.\n",
607 |       "Q:  Where was John held captive?\n",
608 |       "\n",
609 |       "S:  It is ten o’clock.\n",
610 |       "Q:  Is it ten o ’ clock ?\n",
611 |       "\n",
612 |       "S:  It is ten o’clock.\n",
613 |       "Q:  Where is It ten o ’ clock ?\n",
614 |       "\n",
615 |       "S:  She is preparing chicken sandwiches for breakfast.\n",
616 |       "Q:  Is she preparing chicken sandwiches for breakfast ?\n",
617 |       "\n",
618 |       "S:  He was elected as the Prime Minister of India on 15th August 1947.\n",
619 |       "Q:  Was he elected as the Prime Minister of India on 15th August 1947 ?\n",
620 |       "\n",
621 |       "S:  He was elected as the Prime Minister of India on 15th August 1947.\n",
622 |       "Q:  Where was He elected as?\n",
623 |       "\n",
624 |       "S:  He was elected as the Prime Minister of India on 15th August 1947.\n",
625 |       "Q:  When was He elect as the Prime Minister of India?\n",
626 |       "\n",
627 |       "S:  I shall have been living in Mumbai for five years by May 2019.\n",
628 |       "Q:  Were you have been living in Mumbai for five years by May 2019 ?\n",
629 |       "\n",
630 |       "S:  I shall have been living in Mumbai for five years by May 2019.\n",
631 |       "Q:  Where shall I have been?\n",
632 |       "\n",
633 |       "S:  I shall have been living in Mumbai for five years by May 2019.\n",
634 |       "Q:  When shall I have be live in Mumbai for?\n",
635 |       "\n",
636 |       "S:  I shall have been living in Mumbai for five years by May 2019.\n",
637 |       "Q:  When shall I have be live in Mumbai for five years by?\n",
638 |       "\n",
639 |       "S:  Darjeeling is known for its beautiful tea gardens.\n",
640 |       "Q:  Is darjeeling known for its beautiful tea gardens ?\n",
641 |       "\n",
642 |       "S:  Hindi Diwas was first celebrated in the year 1953.\n",
643 |       "Q:  Was hindi Diwas first celebrated in the year 1953 ?\n",
644 |       "\n",
645 |       "S:  Hindi Diwas was first celebrated in the year 1953.\n",
646 |       "Q:  Who was first celebrated in the year 1953?\n",
647 |       "\n",
648 |       "S:  Hindi Diwas was first celebrated in the year 1953.\n",
649 |       "Q:  When was Hindi Diwas first celebrate?\n",
650 |       "\n",
651 |       "S:  I did go for fishing today.\n",
652 |       "Q:  Were you go for fishing today ?\n",
653 |       "\n",
654 |       "S:  I did go for fishing today.\n",
655 |       "Q:  When did I go for fishing?\n",
656 |       "\n",
657 |       "S:  Abdul Kalam was an aerospace scientist who served as the 11th President of India from 2002 to 2007.\n",
658 |       "Q:  Was abdul Kalam an aerospace scientist who served as the 11th President of India from 2002 to 2007 ?\n",
659 |       "\n",
660 |       "S:  Abdul Kalam was an aerospace scientist who served as the 11th President of India from 2002 to 2007.\n",
661 |       "Q:  Who was an aerospace scientist who served as the 11th President of India from 2002 to 2007?\n",
662 |       "\n",
663 |       "S:  Abdul Kalam was an aerospace scientist who served as the 11th President of India from 2002 to 2007.\n",
664 |       "Q:  Where was Abdul Kalam an aerospace scientist who served as?\n",
665 |       "\n",
666 |       "S:  Abdul Kalam was an aerospace scientist who served as the 11th President of India from 2002 to 2007.\n",
667 |       "Q:  Where was Abdul Kalam an aerospace scientist who served as?\n",
668 |       "\n",
669 |       "S:  Abdul Kalam was an aerospace scientist who served as the 11th President of India from 2002 to 2007.\n",
670 |       "Q:  When was Abdul Kalam an aerospace scientist who serve as the 11th President of India from?\n",
671 |       "\n",
672 |       "S:  Abdul Kalam was an aerospace scientist who served as the 11th President of India from 2002 to 2007.\n",
673 |       "Q:  When was Abdul Kalam an aerospace scientist who serve as the 11th President of India from 2002 to?\n",
674 |       "\n",
675 |       "S:  Delhi is the capital of India.\n",
676 |       "Q:  Is delhi the capital of India ?\n",
677 |       "\n",
678 |       "S:  Delhi is the capital of India.\n",
679 |       "Q:  Who is the capital of India?\n",
680 |       "\n",
681 |       "S:  Delhi is the capital of India.\n",
682 |       "Q:  Where is Delhi the capital of India ?\n",
683 |       "\n",
684 |       "S:  Yes, I like coffee.\n",
685 |       "Q:  Do you lik coffee?\n",
686 |       "\n",
687 |       "S:  The Taj Mahal is a beautiful monument built in 1631 by an Emperor named Shah Jahan in memory of his wife Mumtaz Mahal.\n",
688 |       "Q:  Is the Taj Mahal a beautiful monument built in 1631 by an Emperor named Shah Jahan in memory of his wife Mumtaz Mahal ?\n",
689 |       "\n",
690 |       "S:  The Taj Mahal is a beautiful monument built in 1631 by an Emperor named Shah Jahan in memory of his wife Mumtaz Mahal.\n",
691 |       "Q:  When is The Taj Mahal a beautiful monument build?\n",
692 |       "\n",
693 |       "S:  The Taj Mahal is a beautiful monument built in 1631 by an Emperor named Shah Jahan in memory of his wife Mumtaz Mahal.\n",
694 |       "Q:  Who is The Taj Mahal a beautiful monument built in?\n",
695 |       "\n",
696 |       "S:  The Taj Mahal is a beautiful monument built in 1631 by an Emperor named Shah Jahan in memory of his wife Mumtaz Mahal.\n",
697 |       "Q:  Where is The Taj Mahal a beautiful monument built in?\n",
698 |       "\n",
699 |       "S:  Sachin Tendulkar was awarded Bharat Ratna in 2013.\n",
700 |       "Q:  Was sachin Tendulkar awarded Bharat Ratna in 2013 ?\n",
701 |       "\n",
702 |       "S:  Sachin Tendulkar was awarded Bharat Ratna in 2013.\n",
703 |       "Q:  Who was awarded Bharat Ratna in 2013?\n",
704 |       "\n",
705 |       "S:  Sachin Tendulkar was awarded Bharat Ratna in 2013.\n",
706 |       "Q:  Who was Sachin Tendulkar awarded?\n",
707 |       "\n",
708 |       "S:  Sachin Tendulkar was awarded Bharat Ratna in 2013.\n",
709 |       "Q:  When was Sachin Tendulkar award Bharat Ratna?\n",
710 |       "\n",
711 |       "S:  His name is Peter.\n",
712 |       "Q:  Is his name Peter ?\n",
713 |       "\n",
714 |       "S:  His name is Peter.\n",
715 |       "Q:  Who is His name Peter ?\n",
716 |       "\n",
717 |       "S:  Jawaharlal Nehru was born on 14th November 1889 in Allahabad, Uttar Pradesh.\n",
718 |       "Q:  Was jawaharlal Nehru born on 14th November 1889 in Allahabad , Uttar Pradesh ?\n",
719 |       "\n",
720 |       "S:  Jawaharlal Nehru was born on 14th November 1889 in Allahabad, Uttar Pradesh.\n",
721 |       "Q:  Who was born on 14th November 1889 in Allahabad, Uttar Pradesh?\n",
722 |       "\n",
723 |       "S:  Jawaharlal Nehru was born on 14th November 1889 in Allahabad, Uttar Pradesh.\n",
724 |       "Q:  When was Jawaharlal Nehru bear?\n",
725 |       "\n",
726 |       "S:  Jawaharlal Nehru was born on 14th November 1889 in Allahabad, Uttar Pradesh.\n",
727 |       "Q:  Where was Jawaharlal Nehru born on?\n",
728 |       "\n",
729 |       "S:  Jawaharlal Nehru was born on 14th November 1889 in Allahabad, Uttar Pradesh.\n",
730 |       "Q:  Where was Jawaharlal Nehru born on?\n",
731 |       "\n",
732 |       "S:  We were playing tennis at the club.\n",
733 |       "Q:  Were you playing tennis at the club ?\n",
734 |       "\n",
735 |       "S:  Population refers to the number of individuals in a particular place.\n",
736 |       "Q:  Does population refers to the number of individuals in a particular place?\n",
737 |       "\n",
738 |       "S:  They have been trying to contact her.\n",
739 |       "Q:  Do they hav been try to contact her?\n",
740 |       "\n"
741 |      ]
742 |     }
743 |    ],
744 |    "source": [
745 |     "sentensify()"
746 |    ]
747 |   }
748 |  ],
749 |  "metadata": {
750 |   "kernelspec": {
751 |    "display_name": "Python 3",
752 |    "language": "python",
753 |    "name": "python3"
754 |   },
755 |   "language_info": {
756 |    "codemirror_mode": {
757 |     "name": "ipython",
758 |     "version": 3
759 |    },
760 |    "file_extension": ".py",
761 |    "mimetype": "text/x-python",
762 |    "name": "python",
763 |    "nbconvert_exporter": "python",
764 |    "pygments_lexer": "ipython3",
765 |    "version": "3.6.7"
766 |   }
767 |  },
768 |  "nbformat": 4,
769 |  "nbformat_minor": 2
770 | }
771 | 


--------------------------------------------------------------------------------
/NLP Project.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sanskar-Jain/Automatic-Question-Generator/b64bf27d39c08e561b503fc493c03fc510603462/NLP Project.pdf


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Question-Generation
 2 | An automated system that takes a text as input and produces questions as output for assessing a reader’s knowledge of the information in the
 3 | text. The system applies a combination of manually encoded transformation rules to each sentence in a paragraph.
 4 | 
 5 | ### Libraries Used
 6 |   - spaCy
 7 |   - NLTK
 8 |   - pandas
 9 |   - re
10 | 
11 | ### Part-of-speech tags used in the project
12 |   - NNS 	Noun, plural
13 |   - JJ 	Adjective 
14 |   - NNP 	Proper noun, singular 
15 |   - VBG 	Verb, gerund or present participle 
16 |   - VBN 	Verb, past participle 
17 |   - VBZ 	Verb, 3rd person singular present 
18 |   - VBD 	Verb, past tense 
19 |   - IN 		Preposition or subordinating conjunction 
20 |   - PRP 	Personal pronoun 
21 |   - NN 	Noun, singular or mass 
22 | 


--------------------------------------------------------------------------------
/input.txt:
--------------------------------------------------------------------------------
 1 | They were angry because their plans had been discovered.
 2 | No, I was not playing cricket. 
 3 | I can play quite a few musical instruments, for example, the flute, the guitar, and the piano.
 4 | He hurt his hand when he fell.
 5 | I think he felt included because he was helping as much as we were.
 6 | Calcium is found in green leafy vegetables, for example, broccoli, kale, arugula, or spinach have over 160 mg. per serving.
 7 | Sachin Tendulkar was awarded Bharat Ratna in 2013.
 8 | John was held captive at Castle Black.
 9 | I am studying English because I’d like to immigrate to the U.S.
10 | It is ten o’clock.
11 | He will go to China tomorrow.
12 | Children often cry just because they want some attention.
13 | Abdul Kalam was an aerospace scientist who served as the 11th President of India from 2002 to 2007.
14 | Vitamin C is found in colorful vegetables, for instance, bell peppers have a lot of vitamin C.
15 | John is feeling much better now.
16 | His name is Peter.
17 | Tom repainted his mailbox because it was looking shabby.
18 | The Taj Mahal is a beautiful monument built in 1631 by an Emperor named Shah Jahan in memory of his wife Mumtaz Mahal.
19 | Delhi is the capital of India.
20 | It is possible to combine Computer Science with other subjects, for example Physics.
21 | Sun is the largest member of the Solar System.
22 | Mahatma Gandhi was born on 2nd October 1869 in Porbandar, Gujarat.
23 | The British had introduced a National Flag for British India after the revolt of 1857.
24 | Hindi Diwas was first celebrated in the year 1953.
25 | Population refers to the number of individuals in a particular place.
26 | Jawaharlal Nehru was born on 14th November 1889 in Allahabad, Uttar Pradesh.
27 | He was elected as the Prime Minister of India on 15th August 1947.
28 | They slept in the car because they couldn't find a hotel.
29 | The accident happened because of the driver's negligence.
30 | Yes, I like coffee.
31 | Darjeeling is known for its beautiful tea gardens.
32 | Gandhi Jayanti is celebrated on 2nd October.
33 | Every child feels displaced to some degree when a new sibling arrives.
34 | He gave up traveling abroad because of his sudden illness.
35 | I was playing tennis.
36 | We were playing tennis at the club. 
37 | I had been playing the drums since school time.
38 | She is preparing chicken sandwiches for breakfast.
39 | They have been trying to contact her.
40 | I shall have been living in Mumbai for five years by May 2019. 
41 | Harry was late for class yesterday because of his accident.
42 | I have been up since four.
43 | Fractions can be written with oblique strokes, for example 2/3.
44 | Yes, she is working very hard.
45 | You usually walk to work.
46 | She was angry when you told her about the accident.
47 | I did go for fishing today.
48 | We had a bad rice crop last year because it rained a lot.
49 | They grow really well in pots.
50 | I married her when she was 23.


--------------------------------------------------------------------------------