├── .gitignore
├── LICENSE
├── Question Classifier.ipynb
├── README.md
├── question_classification_taxanomy (1) (1).txt
├── traininig_dataset (1) (1).txt
└── validation_dataset (1) (1).txt
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Aman
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Question Classifier.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stderr",
10 | "output_type": "stream",
11 | "text": [
12 | "c:\\users\\i327950\\appdata\\local\\programs\\python\\python35\\lib\\site-packages\\gensim\\utils.py:843: UserWarning: detected Windows; aliasing chunkize to chunkize_serial\n",
13 | " warnings.warn(\"detected Windows; aliasing chunkize to chunkize_serial\")\n"
14 | ]
15 | }
16 | ],
17 | "source": [
18 | "import pandas as pd\n",
19 | "import numpy as np\n",
20 | "import matplotlib.pyplot as plt\n",
21 | "import re, nltk\n",
22 | "import gensim\n",
23 | "import codecs\n",
24 | "from sner import Ner\n",
25 | "import spacy\n",
26 | "from sklearn.metrics import confusion_matrix, accuracy_score, average_precision_score\n",
27 | "from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, GridSearchCV\n",
28 | "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n",
29 | "from nltk.internals import find_jars_within_path\n",
30 | "from nltk.tag import StanfordPOSTagger\n",
31 | "from nltk.tag import StanfordNERTagger\n",
32 | "import spacy\n",
33 | "from sklearn import linear_model\n",
34 | "from sklearn import svm\n",
35 | "from sklearn.metrics import fbeta_score, accuracy_score\n",
36 | "from scipy.sparse import hstack\n",
37 | "from sklearn.feature_extraction.text import CountVectorizer"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 2,
43 | "metadata": {
44 | "collapsed": true
45 | },
46 | "outputs": [],
47 | "source": [
48 | "f_train = open('traininig_dataset (1) (1).txt', 'r+')\n",
49 | "f_test = open('validation_dataset (1) (1).txt', 'r+')\n",
50 | "\n",
51 | "train = pd.DataFrame(f_train.readlines(), columns = ['Question'])\n",
52 | "test = pd.DataFrame(f_test.readlines(), columns = ['Question'])"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 3,
58 | "metadata": {
59 | "collapsed": true
60 | },
61 | "outputs": [],
62 | "source": [
63 | "train['QType'] = train.Question.apply(lambda x: x.split(' ', 1)[0])\n",
64 | "train['Question'] = train.Question.apply(lambda x: x.split(' ', 1)[1])\n",
65 | "train['QType-Coarse'] = train.QType.apply(lambda x: x.split(':')[0])\n",
66 | "train['QType-Fine'] = train.QType.apply(lambda x: x.split(':')[1])\n",
67 | "test['QType'] = test.Question.apply(lambda x: x.split(' ', 1)[0])\n",
68 | "test['Question'] = test.Question.apply(lambda x: x.split(' ', 1)[1])\n",
69 | "test['QType-Coarse'] = test.QType.apply(lambda x: x.split(':')[0])\n",
70 | "test['QType-Fine'] = test.QType.apply(lambda x: x.split(':')[1])"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 4,
76 | "metadata": {},
77 | "outputs": [
78 | {
79 | "data": {
80 | "text/html": [
81 | "
\n",
82 | "\n",
95 | "
\n",
96 | " \n",
97 | " \n",
98 | " | \n",
99 | " Question | \n",
100 | " QType | \n",
101 | " QType-Coarse | \n",
102 | " QType-Fine | \n",
103 | "
\n",
104 | " \n",
105 | " \n",
106 | " \n",
107 | " 0 | \n",
108 | " How did serfdom develop in and then leave Russ... | \n",
109 | " DESC:manner | \n",
110 | " DESC | \n",
111 | " manner | \n",
112 | "
\n",
113 | " \n",
114 | " 1 | \n",
115 | " What films featured the character Popeye Doyle... | \n",
116 | " ENTY:cremat | \n",
117 | " ENTY | \n",
118 | " cremat | \n",
119 | "
\n",
120 | " \n",
121 | " 2 | \n",
122 | " How can I find a list of celebrities ' real na... | \n",
123 | " DESC:manner | \n",
124 | " DESC | \n",
125 | " manner | \n",
126 | "
\n",
127 | " \n",
128 | " 3 | \n",
129 | " What fowl grabs the spotlight after the Chines... | \n",
130 | " ENTY:animal | \n",
131 | " ENTY | \n",
132 | " animal | \n",
133 | "
\n",
134 | " \n",
135 | " 4 | \n",
136 | " What is the full form of .com ?\\n | \n",
137 | " ABBR:exp | \n",
138 | " ABBR | \n",
139 | " exp | \n",
140 | "
\n",
141 | " \n",
142 | "
\n",
143 | "
"
144 | ],
145 | "text/plain": [
146 | " Question QType \\\n",
147 | "0 How did serfdom develop in and then leave Russ... DESC:manner \n",
148 | "1 What films featured the character Popeye Doyle... ENTY:cremat \n",
149 | "2 How can I find a list of celebrities ' real na... DESC:manner \n",
150 | "3 What fowl grabs the spotlight after the Chines... ENTY:animal \n",
151 | "4 What is the full form of .com ?\\n ABBR:exp \n",
152 | "\n",
153 | " QType-Coarse QType-Fine \n",
154 | "0 DESC manner \n",
155 | "1 ENTY cremat \n",
156 | "2 DESC manner \n",
157 | "3 ENTY animal \n",
158 | "4 ABBR exp "
159 | ]
160 | },
161 | "execution_count": 4,
162 | "metadata": {},
163 | "output_type": "execute_result"
164 | }
165 | ],
166 | "source": [
167 | "train.head()"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": 5,
173 | "metadata": {},
174 | "outputs": [
175 | {
176 | "data": {
177 | "text/html": [
178 | "\n",
179 | "\n",
192 | "
\n",
193 | " \n",
194 | " \n",
195 | " | \n",
196 | " Question | \n",
197 | " QType | \n",
198 | " QType-Coarse | \n",
199 | " QType-Fine | \n",
200 | "
\n",
201 | " \n",
202 | " \n",
203 | " \n",
204 | " count | \n",
205 | " 500 | \n",
206 | " 500 | \n",
207 | " 500 | \n",
208 | " 500 | \n",
209 | "
\n",
210 | " \n",
211 | " unique | \n",
212 | " 500 | \n",
213 | " 42 | \n",
214 | " 6 | \n",
215 | " 39 | \n",
216 | "
\n",
217 | " \n",
218 | " top | \n",
219 | " What is the most popular sport in Japan ?\\n | \n",
220 | " DESC:def | \n",
221 | " DESC | \n",
222 | " def | \n",
223 | "
\n",
224 | " \n",
225 | " freq | \n",
226 | " 1 | \n",
227 | " 123 | \n",
228 | " 138 | \n",
229 | " 123 | \n",
230 | "
\n",
231 | " \n",
232 | "
\n",
233 | "
"
234 | ],
235 | "text/plain": [
236 | " Question QType QType-Coarse \\\n",
237 | "count 500 500 500 \n",
238 | "unique 500 42 6 \n",
239 | "top What is the most popular sport in Japan ?\\n DESC:def DESC \n",
240 | "freq 1 123 138 \n",
241 | "\n",
242 | " QType-Fine \n",
243 | "count 500 \n",
244 | "unique 39 \n",
245 | "top def \n",
246 | "freq 123 "
247 | ]
248 | },
249 | "execution_count": 5,
250 | "metadata": {},
251 | "output_type": "execute_result"
252 | }
253 | ],
254 | "source": [
255 | "test.describe()"
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": 6,
261 | "metadata": {},
262 | "outputs": [
263 | {
264 | "data": {
265 | "text/html": [
266 | "\n",
267 | "\n",
280 | "
\n",
281 | " \n",
282 | " \n",
283 | " | \n",
284 | " Question | \n",
285 | " QType | \n",
286 | " QType-Coarse | \n",
287 | " QType-Fine | \n",
288 | "
\n",
289 | " \n",
290 | " \n",
291 | " \n",
292 | " 0 | \n",
293 | " How far is it from Denver to Aspen ?\\n | \n",
294 | " NUM:dist | \n",
295 | " NUM | \n",
296 | " dist | \n",
297 | "
\n",
298 | " \n",
299 | " 1 | \n",
300 | " What county is Modesto , California in ?\\n | \n",
301 | " LOC:city | \n",
302 | " LOC | \n",
303 | " city | \n",
304 | "
\n",
305 | " \n",
306 | " 2 | \n",
307 | " Who was Galileo ?\\n | \n",
308 | " HUM:desc | \n",
309 | " HUM | \n",
310 | " desc | \n",
311 | "
\n",
312 | " \n",
313 | " 3 | \n",
314 | " What is an atom ?\\n | \n",
315 | " DESC:def | \n",
316 | " DESC | \n",
317 | " def | \n",
318 | "
\n",
319 | " \n",
320 | " 4 | \n",
321 | " When did Hawaii become a state ?\\n | \n",
322 | " NUM:date | \n",
323 | " NUM | \n",
324 | " date | \n",
325 | "
\n",
326 | " \n",
327 | "
\n",
328 | "
"
329 | ],
330 | "text/plain": [
331 | " Question QType QType-Coarse \\\n",
332 | "0 How far is it from Denver to Aspen ?\\n NUM:dist NUM \n",
333 | "1 What county is Modesto , California in ?\\n LOC:city LOC \n",
334 | "2 Who was Galileo ?\\n HUM:desc HUM \n",
335 | "3 What is an atom ?\\n DESC:def DESC \n",
336 | "4 When did Hawaii become a state ?\\n NUM:date NUM \n",
337 | "\n",
338 | " QType-Fine \n",
339 | "0 dist \n",
340 | "1 city \n",
341 | "2 desc \n",
342 | "3 def \n",
343 | "4 date "
344 | ]
345 | },
346 | "execution_count": 6,
347 | "metadata": {},
348 | "output_type": "execute_result"
349 | }
350 | ],
351 | "source": [
352 | "test.head()"
353 | ]
354 | },
355 | {
356 | "cell_type": "code",
357 | "execution_count": 7,
358 | "metadata": {},
359 | "outputs": [
360 | {
361 | "data": {
362 | "text/html": [
363 | "\n",
364 | "\n",
377 | "
\n",
378 | " \n",
379 | " \n",
380 | " | \n",
381 | " Question | \n",
382 | " QType | \n",
383 | " QType-Coarse | \n",
384 | " QType-Fine | \n",
385 | "
\n",
386 | " \n",
387 | " \n",
388 | " \n",
389 | " count | \n",
390 | " 5952 | \n",
391 | " 5952 | \n",
392 | " 5952 | \n",
393 | " 5952 | \n",
394 | "
\n",
395 | " \n",
396 | " unique | \n",
397 | " 5871 | \n",
398 | " 50 | \n",
399 | " 6 | \n",
400 | " 47 | \n",
401 | "
\n",
402 | " \n",
403 | " top | \n",
404 | " What is the latitude and longitude of El Paso ... | \n",
405 | " HUM:ind | \n",
406 | " ENTY | \n",
407 | " ind | \n",
408 | "
\n",
409 | " \n",
410 | " freq | \n",
411 | " 3 | \n",
412 | " 1017 | \n",
413 | " 1344 | \n",
414 | " 1017 | \n",
415 | "
\n",
416 | " \n",
417 | "
\n",
418 | "
"
419 | ],
420 | "text/plain": [
421 | " Question QType \\\n",
422 | "count 5952 5952 \n",
423 | "unique 5871 50 \n",
424 | "top What is the latitude and longitude of El Paso ... HUM:ind \n",
425 | "freq 3 1017 \n",
426 | "\n",
427 | " QType-Coarse QType-Fine \n",
428 | "count 5952 5952 \n",
429 | "unique 6 47 \n",
430 | "top ENTY ind \n",
431 | "freq 1344 1017 "
432 | ]
433 | },
434 | "execution_count": 7,
435 | "metadata": {},
436 | "output_type": "execute_result"
437 | }
438 | ],
439 | "source": [
440 | "train.append(test).describe()"
441 | ]
442 | },
443 | {
444 | "cell_type": "markdown",
445 | "metadata": {},
446 | "source": [
447 | "As can be observed, the combined train and test data contains some duplicate questions (81 to be exact: 5952 rows but only 5871 unique questions).
\n",
448 | "The number of unique Coarse:Fine classes overall is 50, whereas only 42 of them appear in the test set.
\n",
449 | "The number of fine classes overall is 47, whereas only 39 of them appear in the test set."
450 | ]
451 | },
452 | {
453 | "cell_type": "code",
454 | "execution_count": 8,
455 | "metadata": {
456 | "collapsed": true
457 | },
458 | "outputs": [],
459 | "source": [
460 | "from sklearn.preprocessing import LabelEncoder\n",
461 | "le = LabelEncoder()\n",
462 | "le.fit(pd.Series(train.QType.tolist() + test.QType.tolist()).values)\n",
463 | "train['QType'] = le.transform(train.QType.values)\n",
464 | "test['QType'] = le.transform(test.QType.values)\n",
465 | "le2 = LabelEncoder()\n",
466 | "le2.fit(pd.Series(train['QType-Coarse'].tolist() + test['QType-Coarse'].tolist()).values)\n",
467 | "train['QType-Coarse'] = le2.transform(train['QType-Coarse'].values)\n",
468 | "test['QType-Coarse'] = le2.transform(test['QType-Coarse'].values)\n",
469 | "le3 = LabelEncoder()\n",
470 | "le3.fit(pd.Series(train['QType-Fine'].tolist() + test['QType-Fine'].tolist()).values)\n",
471 | "train['QType-Fine'] = le3.transform(train['QType-Fine'].values)\n",
472 | "test['QType-Fine'] = le3.transform(test['QType-Fine'].values)"
473 | ]
474 | },
475 | {
476 | "cell_type": "code",
477 | "execution_count": 9,
478 | "metadata": {},
479 | "outputs": [
480 | {
481 | "data": {
482 | "text/html": [
483 | "\n",
484 | "\n",
497 | "
\n",
498 | " \n",
499 | " \n",
500 | " | \n",
501 | " Question | \n",
502 | " QType | \n",
503 | " QType-Coarse | \n",
504 | " QType-Fine | \n",
505 | "
\n",
506 | " \n",
507 | " \n",
508 | " \n",
509 | " 0 | \n",
510 | " How did serfdom develop in and then leave Russ... | \n",
511 | " 4 | \n",
512 | " 1 | \n",
513 | " 23 | \n",
514 | "
\n",
515 | " \n",
516 | " 1 | \n",
517 | " What films featured the character Popeye Doyle... | \n",
518 | " 9 | \n",
519 | " 2 | \n",
520 | " 8 | \n",
521 | "
\n",
522 | " \n",
523 | " 2 | \n",
524 | " How can I find a list of celebrities ' real na... | \n",
525 | " 4 | \n",
526 | " 1 | \n",
527 | " 23 | \n",
528 | "
\n",
529 | " \n",
530 | " 3 | \n",
531 | " What fowl grabs the spotlight after the Chines... | \n",
532 | " 6 | \n",
533 | " 2 | \n",
534 | " 1 | \n",
535 | "
\n",
536 | " \n",
537 | " 4 | \n",
538 | " What is the full form of .com ?\\n | \n",
539 | " 1 | \n",
540 | " 0 | \n",
541 | " 16 | \n",
542 | "
\n",
543 | " \n",
544 | "
\n",
545 | "
"
546 | ],
547 | "text/plain": [
548 | " Question QType QType-Coarse \\\n",
549 | "0 How did serfdom develop in and then leave Russ... 4 1 \n",
550 | "1 What films featured the character Popeye Doyle... 9 2 \n",
551 | "2 How can I find a list of celebrities ' real na... 4 1 \n",
552 | "3 What fowl grabs the spotlight after the Chines... 6 2 \n",
553 | "4 What is the full form of .com ?\\n 1 0 \n",
554 | "\n",
555 | " QType-Fine \n",
556 | "0 23 \n",
557 | "1 8 \n",
558 | "2 23 \n",
559 | "3 1 \n",
560 | "4 16 "
561 | ]
562 | },
563 | "execution_count": 9,
564 | "metadata": {},
565 | "output_type": "execute_result"
566 | }
567 | ],
568 | "source": [
569 | "train.head()"
570 | ]
571 | },
572 | {
573 | "cell_type": "code",
574 | "execution_count": 10,
575 | "metadata": {
576 | "collapsed": true
577 | },
578 | "outputs": [],
579 | "source": [
580 | "all_corpus = pd.Series(train.Question.tolist() + test.Question.tolist()).astype(str)"
581 | ]
582 | },
583 | {
584 | "cell_type": "markdown",
585 | "metadata": {},
586 | "source": [
587 | "Obtaining dot-words (words that contain a '.', such as the abbreviation U.S.).
\n",
588 | "The text cleaning and pre-processing steps are also set up in the cells that follow."
589 | ]
590 | },
591 | {
592 | "cell_type": "code",
593 | "execution_count": 11,
594 | "metadata": {},
595 | "outputs": [
596 | {
597 | "name": "stdout",
598 | "output_type": "stream",
599 | "text": [
600 | "[nltk_data] Downloading package stopwords to\n",
601 | "[nltk_data] C:\\Users\\I327950\\AppData\\Roaming\\nltk_data...\n",
602 | "[nltk_data] Package stopwords is already up-to-date!\n",
603 | "[nltk_data] Downloading package wordnet to\n",
604 | "[nltk_data] C:\\Users\\I327950\\AppData\\Roaming\\nltk_data...\n",
605 | "[nltk_data] Package wordnet is already up-to-date!\n"
606 | ]
607 | }
608 | ],
609 | "source": [
610 | "nltk.download('stopwords')\n",
611 | "nltk.download('wordnet')\n",
612 | "from nltk.corpus import stopwords\n",
613 | "from nltk.stem.porter import PorterStemmer \n",
614 | "from nltk.stem.snowball import SnowballStemmer\n",
615 | "from nltk.stem.wordnet import WordNetLemmatizer\n",
616 | "\n",
617 | "# dot_words = []\n",
618 | "# for row in all_corpus:\n",
619 | "# for word in row.split():\n",
620 | "# if '.' in word and len(word)>2:\n",
621 | "# dot_words.append(word)"
622 | ]
623 | },
624 | {
625 | "cell_type": "code",
626 | "execution_count": 12,
627 | "metadata": {
628 | "collapsed": true
629 | },
630 | "outputs": [],
631 | "source": [
632 | "def text_clean(corpus, keep_list):\n",
633 | " '''\n",
634 | " Purpose : Function to keep only alphabets, digits and certain words (punctuations, qmarks, tabs etc. removed)\n",
635 | " \n",
636 | " Input : Takes a text corpus, 'corpus' to be cleaned along with a list of words, 'keep_list', which have to be retained\n",
637 | " even after the cleaning process\n",
638 | " \n",
639 | " Output : Returns the cleaned text corpus\n",
640 | " \n",
641 | " '''\n",
642 | " cleaned_corpus = pd.Series()\n",
643 | " for row in corpus:\n",
644 | " qs = []\n",
645 | " for word in row.split():\n",
646 | " if word not in keep_list:\n",
647 | " p1 = re.sub(pattern='[^a-zA-Z0-9]',repl=' ',string=word)\n",
648 | " p1 = p1.lower()\n",
649 | " qs.append(p1)\n",
650 | " else : qs.append(word)\n",
651 | " cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))\n",
652 | " return cleaned_corpus"
653 | ]
654 | },
655 | {
656 | "cell_type": "code",
657 | "execution_count": 13,
658 | "metadata": {
659 | "collapsed": true
660 | },
661 | "outputs": [],
662 | "source": [
663 | "def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):\n",
664 | " \n",
665 | " '''\n",
666 | " Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)\n",
667 | " \n",
668 | " Input : \n",
669 | " 'corpus' - Text corpus on which pre-processing tasks will be performed\n",
670 | " 'keep_list' - List of words to be retained during cleaning process\n",
671 | " 'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should \n",
672 | " be performed or not\n",
673 | " 'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is \"None\", which corresponds to Porter\n",
674 | " Stemmer. 'snowball' corresponds to Snowball Stemmer\n",
675 | " \n",
676 | " Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together\n",
677 | " \n",
678 | " Output : Returns the processed text corpus\n",
679 | " \n",
680 | " '''\n",
681 | " if cleaning == True:\n",
682 | " corpus = text_clean(corpus, keep_list)\n",
683 | " \n",
684 | " if remove_stopwords == True:\n",
685 | " wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']\n",
686 | " stop = set(stopwords.words('english'))\n",
687 | " for word in wh_words:\n",
688 | " stop.remove(word)\n",
689 | " corpus = [[x for x in x.split() if x not in stop] for x in corpus]\n",
690 | " else :\n",
691 | " corpus = [[x for x in x.split()] for x in corpus]\n",
692 | " \n",
693 | " if lemmatization == True:\n",
694 | " lem = WordNetLemmatizer()\n",
695 | " corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]\n",
696 | " \n",
697 | " if stemming == True:\n",
698 | " if stem_type == 'snowball':\n",
699 | " stemmer = SnowballStemmer(language = 'english')\n",
700 | " corpus = [[stemmer.stem(x) for x in x] for x in corpus]\n",
701 | " else :\n",
702 | " stemmer = PorterStemmer()\n",
703 | " corpus = [[stemmer.stem(x) for x in x] for x in corpus]\n",
704 | " \n",
705 | " corpus = [' '.join(x) for x in corpus]\n",
706 | " \n",
707 | "\n",
708 | " return corpus"
709 | ]
710 | },
711 | {
712 | "cell_type": "code",
713 | "execution_count": 14,
714 | "metadata": {
715 | "collapsed": true
716 | },
717 | "outputs": [],
718 | "source": [
719 | "common_dot_words = ['U.S.', 'St.', 'Mr.', 'Mrs.', 'D.C.']\n",
720 | "all_corpus = preprocess(all_corpus, keep_list = common_dot_words, remove_stopwords = True)"
721 | ]
722 | },
723 | {
724 | "cell_type": "markdown",
725 | "metadata": {},
726 | "source": [
727 | "# Splitting the preprocessed combined corpus back into train and test sets"
728 | ]
729 | },
730 | {
731 | "cell_type": "code",
732 | "execution_count": 16,
733 | "metadata": {},
734 | "outputs": [],
735 | "source": [
736 | "train_corpus = all_corpus[0:train.shape[0]]\n",
737 | "test_corpus = all_corpus[train.shape[0]:]"
738 | ]
739 | },
740 | {
741 | "cell_type": "markdown",
742 | "metadata": {},
743 | "source": [
744 | "Loading the English model for spaCy.
\n",
745 | "The NLTK equivalent runs too slowly, hence spaCy is used instead."
746 | ]
747 | },
748 | {
749 | "cell_type": "code",
750 | "execution_count": 17,
751 | "metadata": {
752 | "collapsed": true
753 | },
754 | "outputs": [],
755 | "source": [
756 | "nlp = spacy.load('en')"
757 | ]
758 | },
759 | {
760 | "cell_type": "markdown",
761 | "metadata": {},
762 | "source": [
763 | "# Obtaining Features from the Train Data, which will be fed to CountVectorizer\n",
764 | "\n",
765 | "Creating lists of Named Entities, Lemmas, POS Tags, Syntactic Dependency Relations and Orthographic Features (token shapes).
\n",
766 | "Later, these will be used as features for our model."
767 | ]
768 | },
769 | {
770 | "cell_type": "code",
771 | "execution_count": 18,
772 | "metadata": {
773 | "collapsed": true
774 | },
775 | "outputs": [],
776 | "source": [
777 | "all_ner = []\n",
778 | "all_lemma = []\n",
779 | "all_tag = []\n",
780 | "all_dep = []\n",
781 | "all_shape = []\n",
782 | "for row in train_corpus:\n",
783 | " doc = nlp(row)\n",
784 | " present_lemma = []\n",
785 | " present_tag = []\n",
786 | " present_dep = []\n",
787 | " present_shape = []\n",
788 | " present_ner = []\n",
789 | " #print(row)\n",
790 | " for token in doc:\n",
791 | " present_lemma.append(token.lemma_)\n",
792 | " present_tag.append(token.tag_)\n",
793 | " #print(present_tag)\n",
794 | " present_dep.append(token.dep_)\n",
795 | " present_shape.append(token.shape_)\n",
796 | " all_lemma.append(\" \".join(present_lemma))\n",
797 | " all_tag.append(\" \".join(present_tag))\n",
798 | " all_dep.append(\" \".join(present_dep))\n",
799 | " all_shape.append(\" \".join(present_shape))\n",
800 | " for ent in doc.ents:\n",
801 | " present_ner.append(ent.label_)\n",
802 | " all_ner.append(\" \".join(present_ner))"
803 | ]
804 | },
805 | {
806 | "cell_type": "markdown",
807 | "metadata": {},
808 | "source": [
809 | "Converting the attributes obtained above into vectors using CountVectorizer."
810 | ]
811 | },
812 | {
813 | "cell_type": "code",
814 | "execution_count": 19,
815 | "metadata": {
816 | "collapsed": true
817 | },
818 | "outputs": [],
819 | "source": [
820 | "count_vec_ner = CountVectorizer(ngram_range=(1, 2)).fit(all_ner)\n",
821 | "ner_ft = count_vec_ner.transform(all_ner)\n",
822 | "count_vec_lemma = CountVectorizer(ngram_range=(1, 2)).fit(all_lemma)\n",
823 | "lemma_ft = count_vec_lemma.transform(all_lemma)\n",
824 | "count_vec_tag = CountVectorizer(ngram_range=(1, 2)).fit(all_tag)\n",
825 | "tag_ft = count_vec_tag.transform(all_tag)\n",
826 | "count_vec_dep = CountVectorizer(ngram_range=(1, 2)).fit(all_dep)\n",
827 | "dep_ft = count_vec_dep.transform(all_dep)\n",
828 | "count_vec_shape = CountVectorizer(ngram_range=(1, 2)).fit(all_shape)\n",
829 | "shape_ft = count_vec_shape.transform(all_shape)"
830 | ]
831 | },
832 | {
833 | "cell_type": "markdown",
834 | "metadata": {},
835 | "source": [
836 | "Combining the features obtained into 1 matrix"
837 | ]
838 | },
839 | {
840 | "cell_type": "code",
841 | "execution_count": 20,
842 | "metadata": {
843 | "collapsed": true
844 | },
845 | "outputs": [],
846 | "source": [
847 | "#x_all_ft_train = hstack([ner_ft, lemma_ft, tag_ft, dep_ft, shape_ft])\n",
848 | "x_all_ft_train = hstack([ner_ft, lemma_ft, tag_ft])"
849 | ]
850 | },
851 | {
852 | "cell_type": "code",
853 | "execution_count": 21,
854 | "metadata": {},
855 | "outputs": [
856 | {
857 | "data": {
858 | "text/plain": [
859 | "<5452x27303 sparse matrix of type ''\n",
860 | "\twith 102689 stored elements in COOrdinate format>"
861 | ]
862 | },
863 | "execution_count": 21,
864 | "metadata": {},
865 | "output_type": "execute_result"
866 | }
867 | ],
868 | "source": [
869 | "x_all_ft_train"
870 | ]
871 | },
872 | {
873 | "cell_type": "markdown",
874 | "metadata": {},
875 | "source": [
876 | "Converting from COOrdinate format to Compressed Sparse Row format for easier mathematical computations."
877 | ]
878 | },
879 | {
880 | "cell_type": "code",
881 | "execution_count": 22,
882 | "metadata": {},
883 | "outputs": [
884 | {
885 | "data": {
886 | "text/plain": [
887 | "<5452x27303 sparse matrix of type ''\n",
888 | "\twith 102689 stored elements in Compressed Sparse Row format>"
889 | ]
890 | },
891 | "execution_count": 22,
892 | "metadata": {},
893 | "output_type": "execute_result"
894 | }
895 | ],
896 | "source": [
897 | "x_all_ft_train = x_all_ft_train.tocsr()\n",
898 | "x_all_ft_train"
899 | ]
900 | },
901 | {
902 | "cell_type": "markdown",
903 | "metadata": {},
904 | "source": [
905 | "# Now we will obtain the feature vectors for the test set using the CountVectorizers fitted on the training corpus"
906 | ]
907 | },
908 | {
909 | "cell_type": "code",
910 | "execution_count": 23,
911 | "metadata": {
912 | "collapsed": true
913 | },
914 | "outputs": [],
915 | "source": [
916 | "all_test_ner = []\n",
917 | "all_test_lemma = []\n",
918 | "all_test_tag = []\n",
919 | "all_test_dep = []\n",
920 | "all_test_shape = []\n",
921 | "for row in test_corpus:\n",
922 | " doc = nlp(row)\n",
923 | " present_lemma = []\n",
924 | " present_tag = []\n",
925 | " present_dep = []\n",
926 | " present_shape = []\n",
927 | " present_ner = []\n",
928 | " #print(row)\n",
929 | " for token in doc:\n",
930 | " present_lemma.append(token.lemma_)\n",
931 | " present_tag.append(token.tag_)\n",
932 | " #print(present_tag)\n",
933 | " present_dep.append(token.dep_)\n",
934 | " present_shape.append(token.shape_)\n",
935 | " all_test_lemma.append(\" \".join(present_lemma))\n",
936 | " all_test_tag.append(\" \".join(present_tag))\n",
937 | " all_test_dep.append(\" \".join(present_dep))\n",
938 | " all_test_shape.append(\" \".join(present_shape))\n",
939 | " for ent in doc.ents:\n",
940 | " present_ner.append(ent.label_)\n",
941 | " all_test_ner.append(\" \".join(present_ner))"
942 | ]
943 | },
944 | {
945 | "cell_type": "code",
946 | "execution_count": 24,
947 | "metadata": {
948 | "collapsed": true
949 | },
950 | "outputs": [],
951 | "source": [
952 | "ner_test_ft = count_vec_ner.transform(all_test_ner)\n",
953 | "lemma_test_ft = count_vec_lemma.transform(all_test_lemma)\n",
954 | "tag_test_ft = count_vec_tag.transform(all_test_tag)\n",
955 | "dep_test_ft = count_vec_dep.transform(all_test_dep)\n",
956 | "shape_test_ft = count_vec_shape.transform(all_test_shape)"
957 | ]
958 | },
959 | {
960 | "cell_type": "code",
961 | "execution_count": 25,
962 | "metadata": {},
963 | "outputs": [],
964 | "source": [
965 | "#x_all_ft_test = hstack([ner_test_ft, lemma_test_ft, tag_test_ft, dep_test_ft, shape_test_ft])\n",
966 | "x_all_ft_test = hstack([ner_test_ft, lemma_test_ft, tag_test_ft])"
967 | ]
968 | },
969 | {
970 | "cell_type": "code",
971 | "execution_count": 26,
972 | "metadata": {},
973 | "outputs": [
974 | {
975 | "data": {
976 | "text/plain": [
977 | "<500x27303 sparse matrix of type ''\n",
978 | "\twith 5270 stored elements in COOrdinate format>"
979 | ]
980 | },
981 | "execution_count": 26,
982 | "metadata": {},
983 | "output_type": "execute_result"
984 | }
985 | ],
986 | "source": [
987 | "x_all_ft_test"
988 | ]
989 | },
990 | {
991 | "cell_type": "code",
992 | "execution_count": 27,
993 | "metadata": {},
994 | "outputs": [
995 | {
996 | "data": {
997 | "text/plain": [
998 | "<500x27303 sparse matrix of type ''\n",
999 | "\twith 5270 stored elements in Compressed Sparse Row format>"
1000 | ]
1001 | },
1002 | "execution_count": 27,
1003 | "metadata": {},
1004 | "output_type": "execute_result"
1005 | }
1006 | ],
1007 | "source": [
1008 | "x_all_ft_test = x_all_ft_test.tocsr()\n",
1009 | "x_all_ft_test"
1010 | ]
1011 | },
1012 | {
1013 | "cell_type": "markdown",
1014 | "metadata": {},
1015 | "source": [
1016 | "# Model Training\n",
1017 | "Literature study over the years has shown Linear SVM performs best in this Use Case."
1018 | ]
1019 | },
1020 | {
1021 | "cell_type": "code",
1022 | "execution_count": 28,
1023 | "metadata": {
1024 | "collapsed": true
1025 | },
1026 | "outputs": [],
1027 | "source": [
1028 | "model = svm.LinearSVC()"
1029 | ]
1030 | },
1031 | {
1032 | "cell_type": "markdown",
1033 | "metadata": {},
1034 | "source": [
1035 | "First, we train the model on the Coarse classes."
1036 | ]
1037 | },
1038 | {
1039 | "cell_type": "code",
1040 | "execution_count": 29,
1041 | "metadata": {},
1042 | "outputs": [
1043 | {
1044 | "data": {
1045 | "text/plain": [
1046 | "LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n",
1047 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n",
1048 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n",
1049 | " verbose=0)"
1050 | ]
1051 | },
1052 | "execution_count": 29,
1053 | "metadata": {},
1054 | "output_type": "execute_result"
1055 | }
1056 | ],
1057 | "source": [
1058 | "model.fit(x_all_ft_train, train['QType-Coarse'].values)"
1059 | ]
1060 | },
1061 | {
1062 | "cell_type": "markdown",
1063 | "metadata": {},
1064 | "source": [
1065 | "# Model Evaluation"
1066 | ]
1067 | },
1068 | {
1069 | "cell_type": "code",
1070 | "execution_count": 30,
1071 | "metadata": {
1072 | "collapsed": true
1073 | },
1074 | "outputs": [],
1075 | "source": [
1076 | "preds = model.predict(x_all_ft_test)"
1077 | ]
1078 | },
1079 | {
1080 | "cell_type": "code",
1081 | "execution_count": 31,
1082 | "metadata": {},
1083 | "outputs": [
1084 | {
1085 | "data": {
1086 | "text/plain": [
1087 | "array([5, 4, 3, 1, 5, 5, 3, 1, 1, 1, 4, 1, 5, 3, 5, 5, 4, 3, 1, 5, 3, 1, 4,\n",
1088 | " 1, 1, 3, 1, 1, 4, 1, 5, 4, 1, 5, 5, 5, 4, 5, 5, 5, 2, 1, 1, 1, 3, 2,\n",
1089 | " 5, 1, 5, 3, 1, 3, 3, 1, 1, 1, 5, 4, 4, 5, 4, 3, 4, 2, 4, 3, 2, 1, 5,\n",
1090 | " 4, 5, 5, 4, 3, 4, 1, 2, 5, 5, 3, 1, 5, 3, 5, 5, 1, 1, 3, 1, 4, 3, 1,\n",
1091 | " 5, 5, 4, 4, 5, 1, 1, 3, 1, 3, 1, 3, 4, 1, 5, 2, 5, 4, 2, 1, 4, 2, 4,\n",
1092 | " 3, 5, 1, 5, 4, 5, 2, 1, 3, 1, 3, 1, 5, 1, 5, 5, 3, 1, 1, 1, 1, 4, 3,\n",
1093 | " 3, 1, 1, 2, 4, 2, 1, 2, 3, 2, 1, 1, 2, 3, 1, 5, 3, 4, 4, 1, 2, 4, 1,\n",
1094 | " 1, 5, 4, 2, 1, 5, 1, 4, 3, 5, 5, 5, 1, 4, 4, 4, 5, 2, 5, 4, 1, 4, 1,\n",
1095 | " 1, 3, 3, 1, 4, 1, 1, 4, 5, 5, 1, 4, 2, 3, 2, 2, 3, 4, 3, 2, 1, 4, 3,\n",
1096 | " 5, 1, 1, 5, 5, 1, 4, 1, 2, 1, 2, 5, 1, 1, 5, 1, 1, 4, 2, 5, 1, 4, 3,\n",
1097 | " 5, 3, 1, 5, 2, 1, 4, 1, 4, 5, 2, 1, 1, 1, 1, 4, 1, 1, 1, 1, 3, 1, 2,\n",
1098 | " 2, 1, 4, 4, 2, 1, 4, 3, 3, 5, 2, 5, 1, 1, 4, 5, 1, 2, 1, 1, 3, 1, 2,\n",
1099 | " 1, 5, 0, 2, 4, 3, 0, 1, 4, 1, 1, 1, 1, 1, 4, 2, 5, 2, 1, 1, 2, 5, 1,\n",
1100 | " 2, 0, 5, 1, 5, 5, 4, 3, 4, 3, 5, 4, 4, 5, 1, 4, 1, 3, 4, 2, 4, 1, 5,\n",
1101 | " 1, 2, 5, 1, 1, 5, 5, 1, 1, 5, 2, 2, 1, 4, 1, 2, 1, 5, 5, 2, 5, 3, 5,\n",
1102 | " 3, 3, 1, 5, 1, 5, 4, 4, 2, 1, 3, 5, 1, 1, 2, 1, 1, 3, 5, 1, 1, 2, 2,\n",
1103 | " 1, 4, 1, 2, 1, 1, 3, 5, 4, 1, 0, 1, 3, 3, 1, 3, 5, 5, 1, 3, 1, 1, 3,\n",
1104 | " 1, 2, 5, 1, 1, 1, 5, 5, 4, 1, 2, 5, 0, 5, 4, 1, 4, 5, 1, 3, 1, 4, 0,\n",
1105 | " 4, 1, 1, 1, 3, 3, 5, 1, 3, 1, 4, 2, 1, 4, 1, 3, 1, 2, 4, 3, 1, 1, 1,\n",
1106 | " 5, 0, 2, 3, 1, 4, 3, 3, 2, 4, 3, 5, 2, 2, 2, 2, 5, 1, 5, 2, 4, 1, 1,\n",
1107 | " 1, 2, 1, 5, 4, 2, 1, 3, 1, 1, 1, 1, 2, 5, 2, 2, 1, 4, 2, 4, 3, 2, 2,\n",
1108 | " 1, 4, 1, 4, 5, 1, 2, 1, 1, 2, 5, 5, 3, 2, 5, 1, 1], dtype=int64)"
1109 | ]
1110 | },
1111 | "execution_count": 31,
1112 | "metadata": {},
1113 | "output_type": "execute_result"
1114 | }
1115 | ],
1116 | "source": [
1117 | "preds"
1118 | ]
1119 | },
1120 | {
1121 | "cell_type": "code",
1122 | "execution_count": 32,
1123 | "metadata": {},
1124 | "outputs": [
1125 | {
1126 | "data": {
1127 | "text/plain": [
1128 | "0.88200000000000001"
1129 | ]
1130 | },
1131 | "execution_count": 32,
1132 | "metadata": {},
1133 | "output_type": "execute_result"
1134 | }
1135 | ],
1136 | "source": [
1137 | "accuracy_score(test['QType-Coarse'].values, preds)"
1138 | ]
1139 | },
1140 | {
1141 | "cell_type": "markdown",
1142 | "metadata": {},
1143 | "source": [
1144 | "Glad to announce, Feature Engineering has enabled us to achieve an Accuracy of 88.2% on the validation set.\n",
1145 | "The obtained accuracy is way higher than the 73% accuracy obtained without feature engineering"
1146 | ]
1147 | },
1148 | {
1149 | "cell_type": "markdown",
1150 | "metadata": {},
1151 | "source": [
1152 | "Next, we will obtain accuracies for Coarse:Fine combinations"
1153 | ]
1154 | },
1155 | {
1156 | "cell_type": "code",
1157 | "execution_count": 33,
1158 | "metadata": {},
1159 | "outputs": [
1160 | {
1161 | "data": {
1162 | "text/plain": [
1163 | "LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n",
1164 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n",
1165 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n",
1166 | " verbose=0)"
1167 | ]
1168 | },
1169 | "execution_count": 33,
1170 | "metadata": {},
1171 | "output_type": "execute_result"
1172 | }
1173 | ],
1174 | "source": [
1175 | "model.fit(x_all_ft_train, train['QType'].values)"
1176 | ]
1177 | },
1178 | {
1179 | "cell_type": "code",
1180 | "execution_count": 34,
1181 | "metadata": {
1182 | "collapsed": true
1183 | },
1184 | "outputs": [],
1185 | "source": [
1186 | "preds = model.predict(x_all_ft_test)"
1187 | ]
1188 | },
1189 | {
1190 | "cell_type": "code",
1191 | "execution_count": 35,
1192 | "metadata": {},
1193 | "outputs": [
1194 | {
1195 | "data": {
1196 | "text/plain": [
1197 | "0.81399999999999995"
1198 | ]
1199 | },
1200 | "execution_count": 35,
1201 | "metadata": {},
1202 | "output_type": "execute_result"
1203 | }
1204 | ],
1205 | "source": [
1206 | "accuracy_score(test['QType'].values, preds)"
1207 | ]
1208 | },
1209 | {
1210 | "cell_type": "markdown",
1211 | "metadata": {
1212 | "collapsed": true
1213 | },
1214 | "source": [
1215 | "Woah, up to 81.4% accuracy from 68% obtained earlier when modelled without Feature Engineering."
1216 | ]
1217 | },
1218 | {
1219 | "cell_type": "markdown",
1220 | "metadata": {},
1221 | "source": [
1222 | "Finally, we would evaluate our performance for the fine classes"
1223 | ]
1224 | },
1225 | {
1226 | "cell_type": "code",
1227 | "execution_count": 36,
1228 | "metadata": {},
1229 | "outputs": [
1230 | {
1231 | "data": {
1232 | "text/plain": [
1233 | "LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n",
1234 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n",
1235 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n",
1236 | " verbose=0)"
1237 | ]
1238 | },
1239 | "execution_count": 36,
1240 | "metadata": {},
1241 | "output_type": "execute_result"
1242 | }
1243 | ],
1244 | "source": [
1245 | "model.fit(x_all_ft_train, train['QType-Fine'].values)"
1246 | ]
1247 | },
1248 | {
1249 | "cell_type": "code",
1250 | "execution_count": 37,
1251 | "metadata": {
1252 | "collapsed": true
1253 | },
1254 | "outputs": [],
1255 | "source": [
1256 | "preds = model.predict(x_all_ft_test)"
1257 | ]
1258 | },
1259 | {
1260 | "cell_type": "code",
1261 | "execution_count": 38,
1262 | "metadata": {},
1263 | "outputs": [
1264 | {
1265 | "data": {
1266 | "text/plain": [
1267 | "0.81200000000000006"
1268 | ]
1269 | },
1270 | "execution_count": 38,
1271 | "metadata": {},
1272 | "output_type": "execute_result"
1273 | }
1274 | ],
1275 | "source": [
1276 | "accuracy_score(test['QType-Fine'].values, preds)"
1277 | ]
1278 | },
1279 | {
1280 | "cell_type": "markdown",
1281 | "metadata": {},
1282 | "source": [
1283 | "Not bad, we have achieved an accuracy of 81.2% over the Fine Classes."
1284 | ]
1285 | },
1286 | {
1287 | "cell_type": "markdown",
1288 | "metadata": {},
1289 | "source": [
1290 | "# Conclusion\n",
1291 | "\n",
1292 | "We achieved great accuracies using Feature Engineering as compared to accuracies obtained without feature engineering.\n",
1293 | "(The notebook for models obtained without feature engineering is not being shared and one can try implementing it easily).\n",
1294 | "\n",
1295 | "Experimenting with informer hypernyms can further help in accuracy improvement as suggested in https://nlp.stanford.edu/courses/cs224n/2010/reports/olalerew.pdf"
1296 | ]
1297 | }
1298 | ],
1299 | "metadata": {
1300 | "kernelspec": {
1301 | "display_name": "Python 3",
1302 | "language": "python",
1303 | "name": "python3"
1304 | },
1305 | "language_info": {
1306 | "codemirror_mode": {
1307 | "name": "ipython",
1308 | "version": 3
1309 | },
1310 | "file_extension": ".py",
1311 | "mimetype": "text/x-python",
1312 | "name": "python",
1313 | "nbconvert_exporter": "python",
1314 | "pygments_lexer": "ipython3",
1315 | "version": "3.5.0"
1316 | }
1317 | },
1318 | "nbformat": 4,
1319 | "nbformat_minor": 2
1320 | }
1321 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Question-Classification
2 | Classifying questions from UIUC's CogComp QC Dataset
3 |
4 | # Classifying Questions into Coarse (6 classes) and Fine (50 classes) classes.
5 |
6 | # Approach
7 | 1. Text Exploration
8 | 2. Text Cleaning
9 | 3. Obtaining POS Tags, Identifying Named Entities, Lemmas, Syntactic Dependency Relations and Orthographic Features.
10 | 4. Using the obtained properties as Features.
11 | 5. Using a Linear SVM model on the engineered features.
12 |
13 | # Results
14 | * 88.2% accuracy on Coarse classes.
15 | * 81.6% accuracy on Fine classes.
16 |
17 | | Variations in Features Used | Coarse Set Accuracy | Coarse:Fine Set Accuracy | Fine Set Accuracy |
18 | | ------------- | ------------- | ------------- | ------------- |
19 | | Named Entity Recognition + Lemmas + POS Tags + Syntactic Dependency + Shape | 87.8 | 80.4 | 80.8 |
20 | | Named Entity Recognition + Lemmas + POS Tags + Syntactic Dependency | 87.2 | 80.6 | 81.4 |
21 | | Named Entity Recognition + Lemmas + POS Tags | **88.2** | **81.4** | 81.2 |
22 | | Named Entity Recognition + Lemmas | 86.4 | 80.6 | **81.6** |
23 | | Lemmas | 86.2 | 80.4 | **81.6** |
24 |
25 | # References
26 | https://nlp.stanford.edu/courses/cs224n/2010/reports/olalerew.pdf
27 |
--------------------------------------------------------------------------------
/question_classification_taxanomy (1) (1).txt:
--------------------------------------------------------------------------------
1 | Class Definition
2 |
3 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
4 | ABBREVIATION abbreviation
5 | abb abbreviation
6 | exp expression abbreviated
7 |
8 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
9 | ENTITY entities
10 | animal animals
11 | body organs of body
12 | color colors
13 | creative inventions, books and other creative pieces
14 | currency currency names
15 | dis.med. diseases and medicine
16 | event events
17 | food food
18 | instrument musical instrument
19 | lang languages
20 | letter letters like a-z
21 | other other entities
22 | plant plants
23 | product products
24 | religion religions
25 | sport sports
26 | substance elements and substances
27 | symbol symbols and signs
28 | technique techniques and methods
29 | term equivalent terms
30 | vehicle vehicles
31 | word words with a special property
32 |
33 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
34 | DESCRIPTION description and abstract concepts
35 | definition definition of sth.
36 | description description of sth.
37 | manner manner of an action
38 | reason reasons
39 |
40 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
41 | HUMAN human beings
42 | group a group or organization of persons
43 | ind an individual
44 | title title of a person
45 | description description of a person
46 |
47 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
48 | LOCATION locations
49 | city cities
50 | country countries
51 | mountain mountains
52 | other other locations
53 | state states
54 |
55 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
56 | NUMERIC numeric values
57 | code postcodes or other codes
58 | count number of sth.
59 | date dates
60 | distance linear measures
61 | money prices
62 | order ranks
63 | other other numbers
64 | period the lasting time of sth.
65 | percent fractions
66 | speed speed
67 | temp temperature
68 | size size, area and volume
69 | weight weight
--------------------------------------------------------------------------------
/validation_dataset (1) (1).txt:
--------------------------------------------------------------------------------
1 | NUM:dist How far is it from Denver to Aspen ?
2 | LOC:city What county is Modesto , California in ?
3 | HUM:desc Who was Galileo ?
4 | DESC:def What is an atom ?
5 | NUM:date When did Hawaii become a state ?
6 | NUM:dist How tall is the Sears Building ?
7 | HUM:gr George Bush purchased a small interest in which baseball team ?
8 | ENTY:plant What is Australia 's national flower ?
9 | DESC:reason Why does the moon turn orange ?
10 | DESC:def What is autism ?
11 | LOC:city What city had a world fair in 1900 ?
12 | HUM:ind What person 's head is on a dime ?
13 | NUM:weight What is the average weight of a Yellow Labrador ?
14 | HUM:ind Who was the first man to fly across the Pacific Ocean ?
15 | NUM:date When did Idaho become a state ?
16 | NUM:other What is the life expectancy for crickets ?
17 | ENTY:substance What metal has the highest melting point ?
18 | HUM:ind Who developed the vaccination against polio ?
19 | DESC:def What is epilepsy ?
20 | NUM:date What year did the Titanic sink ?
21 | HUM:ind Who was the first American to walk in space ?
22 | DESC:def What is a biosphere ?
23 | LOC:other What river in the US is known as the Big Muddy ?
24 | DESC:def What is bipolar disorder ?
25 | DESC:def What is cholesterol ?
26 | HUM:ind Who developed the Macintosh computer ?
27 | DESC:def What is caffeine ?
28 | LOC:other What imaginary line is halfway between the North and South Poles ?
29 | LOC:other Where is John Wayne airport ?
30 | LOC:other What hemisphere is the Philippines in ?
31 | NUM:speed What is the average speed of the horses at the Kentucky Derby ?
32 | LOC:mount Where are the Rocky Mountains ?
33 | DESC:def What are invertebrates ?
34 | NUM:temp What is the temperature at the center of the earth ?
35 | NUM:date When did John F. Kennedy get elected as President ?
36 | NUM:period How old was Elvis Presley when he died ?
37 | LOC:other Where is the Orinoco River ?
38 | NUM:dist How far is the service line from the net in tennis ?
39 | NUM:count How much fiber should you have per day ?
40 | NUM:count How many Great Lakes are there ?
41 | ENTY:plant Material called linen is made from what plant ?
42 | DESC:def What is Teflon ?
43 | DESC:def What is amitriptyline ?
44 | DESC:def What is a shaman ?
45 | ENTY:animal What is the proper name for a female walrus ?
46 | ENTY:animal What is a group of turkeys called ?
47 | NUM:period How long did Rip Van Winkle sleep ?
48 | DESC:def What are triglycerides ?
49 | NUM:count How many liters in a gallon ?
50 | HUM:gr What is the name of the chocolate company in San Francisco ?
51 | DESC:def What are amphibians ?
52 | HUM:ind Who discovered x-rays ?
53 | HUM:ind Which comedian 's signature line is `` Can we talk '' ?
54 | DESC:def What is fibromyalgia ?
55 | DESC:desc What is done with worn or outdated flags ?
56 | DESC:def What does cc in engines mean ?
57 | NUM:date When did Elvis Presley die ?
58 | LOC:city What is the capital of Yugoslavia ?
59 | LOC:city Where is Milan ?
60 | NUM:speed What is the speed hummingbirds fly ?
61 | LOC:city What is the oldest city in the United States ?
62 | HUM:ind What was W.C. Fields ' real name ?
63 | LOC:other What river flows between Fargo , North Dakota and Moorhead , Minnesota ?
64 | ENTY:food What do bats eat ?
65 | LOC:state What state did the Battle of Bighorn take place in ?
66 | HUM:desc Who was Abraham Lincoln ?
67 | ENTY:termeq What do you call a newborn kangaroo ?
68 | DESC:def What are spider veins ?
69 | NUM:date What day and month did John Lennon die ?
70 | LOC:other What strait separates North America from Asia ?
71 | NUM:other What is the population of Seattle ?
72 | NUM:money How much was a ticket for the Titanic ?
73 | LOC:city What is the largest city in the world ?
74 | HUM:ind What American composer wrote the music for `` West Side Story '' ?
75 | LOC:other Where is the Mall of the America ?
76 | DESC:def What is the pH scale ?
77 | ENTY:currency What type of currency is used in Australia ?
78 | NUM:dist How tall is the Gateway Arch in St. Louis , MO ?
79 | NUM:weight How much does the human adult female brain weigh ?
80 | HUM:ind Who was the first governor of Alaska ?
81 | DESC:def What is a prism ?
82 | NUM:date When was the first liver transplant ?
83 | HUM:ind Who was elected president of South Africa in 1994 ?
84 | NUM:other What is the population of China ?
85 | NUM:date When was Rosa Parks born ?
86 | DESC:reason Why is a ladybug helpful ?
87 | DESC:def What is amoxicillin ?
88 | HUM:ind Who was the first female United States Representative ?
89 | DESC:def What are xerophytes ?
90 | LOC:country What country did Ponce de Leon come from ?
91 | ENTY:event The U.S. Department of Treasury first issued paper currency for the U.S. during which war ?
92 | DESC:def What is desktop publishing ?
93 | NUM:temp What is the temperature of the sun 's surface ?
94 | NUM:date What year did Canada join the United Nations ?
95 | HUM:gr What is the oldest university in the US ?
96 | LOC:other Where is Prince Edward Island ?
97 | NUM:date Mercury , what year was it discovered ?
98 | DESC:def What is cryogenics ?
99 | DESC:def What are coral reefs ?
100 | ENTY:other What is the longest major league baseball-winning streak ?
101 | DESC:def What is neurology ?
102 | HUM:ind Who invented the calculator ?
103 | DESC:manner How do you measure earthquakes ?
104 | HUM:desc Who is Duke Ellington ?
105 | LOC:city What county is Phoenix , AZ in ?
106 | DESC:def What is a micron ?
107 | NUM:temp The sun 's core , what is the temperature ?
108 | ENTY:animal What is the Ohio state bird ?
109 | NUM:date When were William Shakespeare 's twins born ?
110 | LOC:other What is the highest dam in the U.S. ?
111 | ENTY:color What color is a poison arrow frog ?
112 | DESC:def What is acupuncture ?
113 | NUM:dist What is the length of the coastline of the state of Alaska ?
114 | HUM:ind What is the name of Neil Armstrong 's wife ?
115 | ENTY:plant What is Hawaii 's state flower ?
116 | HUM:ind Who won Ms. American in 1989 ?
117 | NUM:date When did the Hindenberg crash ?
118 | ENTY:substance What mineral helps prevent osteoporosis ?
119 | NUM:date What was the last year that the Chicago Cubs won the World Series ?
120 | LOC:other Where is Perth ?
121 | NUM:date What year did WWII begin ?
122 | NUM:dist What is the diameter of a golf ball ?
123 | DESC:def What is an eclipse ?
124 | HUM:ind Who discovered America ?
125 | NUM:dist What is the earth 's diameter ?
126 | HUM:ind Which president was unmarried ?
127 | NUM:dist How wide is the Milky Way galaxy ?
128 | NUM:date During which season do most thunderstorms occur ?
129 | DESC:def What is Wimbledon ?
130 | NUM:period What is the gestation period for a cat ?
131 | NUM:dist How far is a nautical mile ?
132 | HUM:ind Who was the abolitionist who led the raid on Harper 's Ferry in 1859 ?
133 | DESC:def What does target heart rate mean ?
134 | ENTY:product What was the first satellite to go into space ?
135 | DESC:def What is foreclosure ?
136 | ENTY:other What is the major fault line near Kentucky ?
137 | LOC:other Where is the Holland Tunnel ?
138 | HUM:ind Who wrote the hymn `` Amazing Grace '' ?
139 | HUM:title What position did Willie Davis play in baseball ?
140 | DESC:def What are platelets ?
141 | DESC:def What is severance pay ?
142 | ENTY:animal What is the name of Roy Roger 's dog ?
143 | LOC:other Where are the National Archives ?
144 | ENTY:animal What is a baby turkey called ?
145 | DESC:def What is poliomyelitis ?
146 | ENTY:body What is the longest bone in the human body ?
147 | HUM:ind Who is a German philosopher ?
148 | ENTY:veh What were Christopher Columbus ' three ships ?
149 | DESC:def What does Phi Beta Kappa mean ?
150 | DESC:def What is nicotine ?
151 | ENTY:termeq What is another name for vitamin B1 ?
152 | HUM:ind Who discovered radium ?
153 | DESC:def What are sunspots ?
154 | NUM:date When was Algeria colonized ?
155 | HUM:gr What baseball team was the first to make numbers part of their uniform ?
156 | LOC:other What continent is Egypt on ?
157 | LOC:city What is the capital of Mongolia ?
158 | DESC:def What is nanotechnology ?
159 | LOC:other In the late 1700 's British convicts were used to populate which colony ?
160 | LOC:state What state is the geographic center of the lower 48 states ?
161 | DESC:def What is an obtuse angle ?
162 | DESC:def What are polymers ?
163 | NUM:date When is hurricane season in the Caribbean ?
164 | LOC:other Where is the volcano Mauna Loa ?
165 | ENTY:termeq What is another astronomic term for the Northern Lights ?
166 | LOC:other What peninsula is Spain part of ?
167 | NUM:date When was Lyndon B. Johnson born ?
168 | DESC:def What is acetaminophen ?
169 | LOC:state What state has the least amount of rain per year ?
170 | HUM:ind Who founded American Red Cross ?
171 | NUM:date What year did the Milwaukee Braves become the Atlanta Braves ?
172 | NUM:speed How fast is alcohol absorbed ?
173 | NUM:date When is the summer solstice ?
174 | DESC:def What is supernova ?
175 | LOC:other Where is the Shawnee National Forest ?
176 | LOC:state What U.S. state 's motto is `` Live free or Die '' ?
177 | LOC:other Where is the Lourve ?
178 | NUM:date When was the first stamp issued ?
179 | ENTY:color What primary colors do you mix to make orange ?
180 | NUM:dist How far is Pluto from the sun ?
181 | LOC:other What body of water are the Canary Islands in ?
182 | DESC:def What is neuropathy ?
183 | LOC:other Where is the Euphrates River ?
184 | DESC:def What is cryptography ?
185 | ENTY:substance What is natural gas composed of ?
186 | HUM:ind Who is the Prime Minister of Canada ?
187 | HUM:ind What French ruler was defeated at the battle of Waterloo ?
188 | DESC:def What is leukemia ?
189 | LOC:other Where did Howard Hughes die ?
190 | ENTY:substance What is the birthstone for June ?
191 | ENTY:other What is the sales tax in Minnesota ?
192 | NUM:dist What is the distance in miles from the earth to the sun ?
193 | NUM:period What is the average life span for a chicken ?
194 | NUM:date When was the first Wal-Mart store opened ?
195 | DESC:def What is relative humidity ?
196 | LOC:city What city has the zip code of 35824 ?
197 | ENTY:currency What currency is used in Algeria ?
198 | HUM:ind Who invented the hula hoop ?
199 | ENTY:product What was the most popular toy in 1957 ?
200 | ENTY:substance What is pastrami made of ?
201 | ENTY:product What is the name of the satellite that the Soviet Union sent into space in 1957 ?
202 | LOC:city What city 's newspaper is called `` The Enquirer '' ?
203 | HUM:ind Who invented the slinky ?
204 | ENTY:animal What are the animals that don 't have backbones called ?
205 | NUM:other What is the melting point of copper ?
206 | LOC:other Where is the volcano Olympus Mons located ?
207 | HUM:ind Who was the 23rd president of the United States ?
208 | NUM:temp What is the average body temperature ?
209 | DESC:desc What does a defibrillator do ?
210 | DESC:desc What is the effect of acid rain ?
211 | NUM:date What year did the United States abolish the draft ?
212 | NUM:speed How fast is the speed of light ?
213 | LOC:state What province is Montreal in ?
214 | LOC:other What New York City structure is also known as the Twin Towers ?
215 | DESC:def What is fungus ?
216 | ENTY:lang What is the most frequently spoken language in the Netherlands ?
217 | DESC:def What is sodium chloride ?
218 | ENTY:termeq What are the spots on dominoes called ?
219 | NUM:count How many pounds in a ton ?
220 | DESC:def What is influenza ?
221 | DESC:def What is ozone depletion ?
222 | NUM:date What year was the Mona Lisa painted ?
223 | DESC:def What does `` Sitting Shiva '' mean ?
224 | ENTY:other What is the electrical output in Madrid , Spain ?
225 | LOC:mount Which mountain range in North America stretches from Maine to Georgia ?
226 | ENTY:substance What is plastic made of ?
227 | NUM:other What is the population of Nigeria ?
228 | DESC:desc What does your spleen do ?
229 | LOC:other Where is the Grand Canyon ?
230 | HUM:ind Who invented the telephone ?
231 | NUM:date What year did the U.S. buy Alaska ?
232 | HUM:ind What is the name of the leader of Ireland ?
233 | DESC:def What is phenylalanine ?
234 | NUM:count How many gallons of water are there in a cubic foot ?
235 | ENTY:other What are the two houses of the Legislative branch ?
236 | DESC:def What is sonar ?
237 | LOC:other In Poland , where do most people live ?
238 | DESC:def What is phosphorus ?
239 | LOC:other What is the location of the Sea of Tranquility ?
240 | NUM:speed How fast is sound ?
241 | LOC:state What French province is cognac produced in ?
242 | DESC:def What is Valentine 's Day ?
243 | DESC:reason What causes gray hair ?
244 | DESC:def What is hypertension ?
245 | DESC:def What is bandwidth ?
246 | LOC:other What is the longest suspension bridge in the U.S. ?
247 | DESC:def What is a parasite ?
248 | DESC:def What is home equity ?
249 | DESC:desc What do meteorologists do ?
250 | ENTY:other What is the criterion for being legally blind ?
251 | HUM:ind Who is the tallest man in the world ?
252 | LOC:city What are the twin cities ?
253 | ENTY:other What did Edward Binney and Howard Smith invent in 1903 ?
254 | ENTY:substance What is the statue of liberty made of ?
255 | DESC:def What is pilates ?
256 | LOC:other What planet is known as the `` red '' planet ?
257 | NUM:dist What is the depth of the Nile river ?
258 | ENTY:termeq What is the colorful Korean traditional dress called ?
259 | DESC:def What is Mardi Gras ?
260 | NUM:money Mexican pesos are worth what in U.S. dollars ?
261 | HUM:ind Who was the first African American to play for the Brooklyn Dodgers ?
262 | HUM:ind Who was the first Prime Minister of Canada ?
263 | NUM:count How many Admirals are there in the U.S. Navy ?
264 | ENTY:instru What instrument did Glenn Miller play ?
265 | NUM:period How old was Joan of Arc when she died ?
266 | DESC:def What does the word fortnight mean ?
267 | DESC:def What is dianetics ?
268 | LOC:city What is the capital of Ethiopia ?
269 | NUM:period For how long is an elephant pregnant ?
270 | DESC:manner How did Janice Joplin die ?
271 | ENTY:lang What is the primary language in Iceland ?
272 | DESC:desc What is the difference between AM radio stations and FM radio stations ?
273 | DESC:def What is osteoporosis ?
274 | HUM:ind Who was the first woman governor in the U.S. ?
275 | DESC:def What is peyote ?
276 | DESC:reason What is the esophagus used for ?
277 | DESC:def What is viscosity ?
278 | NUM:date What year did Oklahoma become a state ?
279 | ABBR:abb What is the abbreviation for Texas ?
280 | ENTY:substance What is a mirror made out of ?
281 | LOC:other Where on the body is a mortarboard worn ?
282 | HUM:ind What was J.F.K. 's wife 's name ?
283 | ABBR:exp What does I.V. stand for ?
284 | DESC:def What is the chunnel ?
285 | LOC:other Where is Hitler buried ?
286 | DESC:def What are antacids ?
287 | DESC:def What is pulmonary fibrosis ?
288 | DESC:def What are Quaaludes ?
289 | DESC:def What is naproxen ?
290 | DESC:def What is strep throat ?
291 | LOC:city What is the largest city in the U.S. ?
292 | ENTY:dismed What is foot and mouth disease ?
293 | NUM:other What is the life expectancy of a dollar bill ?
294 | ENTY:termeq What do you call a professional map drawer ?
295 | DESC:def What are Aborigines ?
296 | DESC:def What is hybridization ?
297 | ENTY:color What color is indigo ?
298 | NUM:period How old do you have to be in order to rent a car in Italy ?
299 | ENTY:other What does a barometer measure ?
300 | ENTY:color What color is a giraffe 's tongue ?
301 | ABBR:exp What does USPS stand for ?
302 | NUM:date What year did the NFL go on strike ?
303 | DESC:def What is solar wind ?
304 | NUM:date What date did Neil Armstrong land on the moon ?
305 | NUM:date When was Hiroshima bombed ?
306 | LOC:other Where is the Savannah River ?
307 | HUM:ind Who was the first woman killed in the Vietnam War ?
308 | LOC:other What planet has the strongest magnetic field of all the planets ?
309 | HUM:ind Who is the governor of Alaska ?
310 | NUM:date What year did Mussolini seize power in Italy ?
311 | LOC:city What is the capital of Persia ?
312 | LOC:other Where is the Eiffel Tower ?
313 | NUM:count How many hearts does an octopus have ?
314 | DESC:def What is pneumonia ?
315 | LOC:other What is the deepest lake in the US ?
316 | DESC:def What is a fuel cell ?
317 | HUM:ind Who was the first U.S. president to appear on TV ?
318 | LOC:other Where is the Little League Museum ?
319 | ENTY:other What are the two types of twins ?
320 | LOC:other What is the brightest star ?
321 | DESC:def What is diabetes ?
322 | NUM:date When was President Kennedy shot ?
323 | ABBR:exp What is TMJ ?
324 | ENTY:color What color is yak milk ?
325 | NUM:date What date was Dwight D. Eisenhower born ?
326 | ABBR:exp What does the technical term ISDN mean ?
327 | DESC:reason Why is the sun yellow ?
328 | NUM:money What is the conversion rate between dollars and pounds ?
329 | NUM:date When was Abraham Lincoln born ?
330 | DESC:def What is the Milky Way ?
331 | DESC:def What is mold ?
332 | NUM:date What year was Mozart born ?
333 | ENTY:animal What is a group of frogs called ?
334 | ENTY:veh What is the name of William Penn 's ship ?
335 | NUM:other What is the melting point of gold ?
336 | LOC:other What is the street address of the White House ?
337 | DESC:def What is semolina ?
338 | ENTY:food What fruit is Melba sauce made from ?
339 | DESC:def What is Ursa Major ?
340 | NUM:perc What is the percentage of water content in the human body ?
341 | NUM:weight How much does water weigh ?
342 | ENTY:event What was President Lyndon Johnson 's reform program called ?
343 | NUM:perc What is the murder rate in Windsor , Ontario ?
344 | HUM:ind Who is the only president to serve 2 non-consecutive terms ?
345 | NUM:other What is the population of Australia ?
346 | HUM:ind Who painted the ceiling of the Sistine Chapel ?
347 | ENTY:dismed Name a stimulant .
348 | DESC:desc What is the effect of volcanoes on the climate ?
349 | NUM:date What year did the Andy Griffith show begin ?
350 | DESC:def What is acid rain ?
351 | NUM:date What is the date of Mexico 's independence ?
352 | LOC:other What is the location of Lake Champlain ?
353 | ENTY:plant What is the Illinois state flower ?
354 | ENTY:animal What is Maryland 's state bird ?
355 | DESC:def What is quicksilver ?
356 | HUM:ind Who wrote `` The Divine Comedy '' ?
357 | NUM:speed What is the speed of light ?
358 | NUM:dist What is the width of a football field ?
359 | DESC:reason Why in tennis are zero points called love ?
360 | ENTY:animal What kind of dog was Toto in the Wizard of Oz ?
361 | DESC:def What is a thyroid ?
362 | DESC:def What does ciao mean ?
363 | ENTY:body What is the only artery that carries blue blood from the heart to the lungs ?
364 | NUM:other How often does Old Faithful erupt at Yellowstone National Park ?
365 | DESC:def What is acetic acid ?
366 | NUM:dist What is the elevation of St. Louis , MO ?
367 | ENTY:color What color does litmus paper turn when it comes into contact with a strong acid ?
368 | ENTY:color What are the colors of the German flag ?
369 | DESC:def What is the Moulin Rouge ?
370 | LOC:other What soviet seaport is on the Black Sea ?
371 | NUM:weight What is the atomic weight of silver ?
372 | ENTY:currency What currency do they use in Brazil ?
373 | DESC:def What are pathogens ?
374 | DESC:def What is mad cow disease ?
375 | ENTY:food Name a food high in zinc .
376 | NUM:date When did North Carolina enter the union ?
377 | LOC:other Where do apple snails live ?
378 | DESC:def What are ethics ?
379 | ABBR:exp What does CPR stand for ?
380 | DESC:def What is an annuity ?
381 | HUM:ind Who killed John F. Kennedy ?
382 | HUM:ind Who was the first vice president of the U.S. ?
383 | ENTY:substance What birthstone is turquoise ?
384 | HUM:ind Who was the first US President to ride in an automobile to his inauguration ?
385 | NUM:period How old was the youngest president of the United States ?
386 | NUM:date When was Ulysses S. Grant born ?
387 | DESC:def What is Muscular Dystrophy ?
388 | HUM:ind Who lived in the Neuschwanstein castle ?
389 | DESC:def What is propylene glycol ?
390 | DESC:def What is a panic disorder ?
391 | HUM:ind Who invented the instant Polaroid camera ?
392 | DESC:def What is a carcinogen ?
393 | ENTY:animal What is a baby lion called ?
394 | NUM:other What is the world 's population ?
395 | DESC:def What is nepotism ?
396 | DESC:def What is die-casting ?
397 | DESC:def What is myopia ?
398 | NUM:other What is the sales tax rate in New York ?
399 | NUM:perc Developing nations comprise what percentage of the world 's population ?
400 | LOC:mount What is the fourth highest mountain in the world ?
401 | HUM:ind What is Shakespeare 's nickname ?
402 | ENTY:substance What is the heaviest naturally occurring element ?
403 | NUM:date When is Father 's Day ?
404 | ABBR:exp What does the acronym NASA stand for ?
405 | NUM:dist How long is the Columbia River in miles ?
406 | LOC:city What city 's newspaper is called `` The Star '' ?
407 | DESC:def What is carbon dioxide ?
408 | LOC:other Where is the Mason/Dixon line ?
409 | NUM:date When was the Boston tea party ?
410 | DESC:def What is metabolism ?
411 | HUM:ind Which U.S.A. president appeared on `` Laugh-In '' ?
412 | ENTY:substance What are cigarettes made of ?
413 | LOC:city What is the capital of Zimbabwe ?
414 | ABBR:exp What does NASA stand for ?
415 | ENTY:plant What is the state flower of Michigan ?
416 | DESC:def What are semiconductors ?
417 | DESC:def What is nuclear power ?
418 | DESC:def What is a tsunami ?
419 | HUM:ind Who is the congressman from state of Texas on the armed forces committee ?
420 | HUM:ind Who was president in 1913 ?
421 | NUM:date When was the first kidney transplant ?
422 | LOC:other What are Canada 's two territories ?
423 | ENTY:veh What was the name of the plane Lindbergh flew solo across the Atlantic ?
424 | DESC:def What is genocide ?
425 | LOC:other What continent is Argentina on ?
426 | ENTY:other What monastery was raided by Vikings in the late eighth century ?
427 | DESC:def What is an earthquake ?
428 | LOC:other Where is the tallest roller coaster located ?
429 | DESC:def What are enzymes ?
430 | HUM:ind Who discovered oxygen ?
431 | DESC:def What is bangers and mash ?
432 | ENTY:animal What is the name given to the Tiger at Louisiana State University ?
433 | LOC:other Where are the British crown jewels kept ?
434 | HUM:ind Who was the first person to reach the North Pole ?
435 | DESC:def What is an ulcer ?
436 | DESC:def What is vertigo ?
437 | DESC:def What is the spirometer test ?
438 | NUM:date When is the official first day of summer ?
439 | ABBR:exp What does the abbreviation SOS mean ?
440 | ENTY:animal What is the smallest bird in Britain ?
441 | HUM:ind Who invented Trivial Pursuit ?
442 | ENTY:substance What gasses are in the troposphere ?
443 | LOC:country Which country has the most water pollution ?
444 | ENTY:animal What is the scientific name for elephant ?
445 | HUM:ind Who is the actress known for her role in the movie `` Gypsy '' ?
446 | ENTY:animal What breed of hunting dog did the Beverly Hillbillies own ?
447 | LOC:other What is the rainiest place on Earth ?
448 | HUM:ind Who was the first African American to win the Nobel Prize in literature ?
449 | NUM:date When is St. Patrick 's Day ?
450 | ENTY:animal What was FDR 's dog 's name ?
451 | ENTY:color What colors need to be mixed to get the color pink ?
452 | ENTY:sport What is the most popular sport in Japan ?
453 | ENTY:food What is the active ingredient in baking soda ?
454 | NUM:date When was Thomas Jefferson born ?
455 | NUM:temp How cold should a refrigerator be ?
456 | NUM:date When was the telephone invented ?
457 | ENTY:color What is the most common eye color ?
458 | LOC:other Where was the first golf course in the United States ?
459 | DESC:def What is schizophrenia ?
460 | DESC:def What is angiotensin ?
461 | HUM:gr What did Jesse Jackson organize ?
462 | ENTY:animal What is New York 's state bird ?
463 | LOC:other What is the National Park in Utah ?
464 | NUM:date What is Susan B. Anthony 's birthday ?
465 | LOC:state In which state would you find the Catskill Mountains ?
466 | ENTY:termeq What do you call a word that is spelled the same backwards and forwards ?
467 | DESC:def What are pediatricians ?
468 | HUM:gr What chain store is headquartered in Bentonville , Arkansas ?
469 | DESC:def What are solar cells ?
470 | DESC:def What is compounded interest ?
471 | DESC:def What are capers ?
472 | DESC:def What is an antigen ?
473 | ENTY:currency What currency does Luxembourg use ?
474 | NUM:other What is the population of Venezuela ?
475 | ENTY:other What type of polymer is used for bulletproof vests ?
476 | ENTY:currency What currency does Argentina use ?
477 | DESC:def What is a thermometer ?
478 | LOC:city What Canadian city has the largest population ?
479 | ENTY:color What color are crickets ?
480 | LOC:country Which country gave New York the Statue of Liberty ?
481 | ENTY:product What was the name of the first U.S. satellite sent into space ?
482 | ENTY:substance What precious stone is a form of pure carbon ?
483 | ENTY:substance What kind of gas is in a fluorescent bulb ?
484 | DESC:def What is rheumatoid arthritis ?
485 | LOC:other What river runs through Rowe , Italy ?
486 | DESC:def What is cerebral palsy ?
487 | LOC:city What city is also known as `` The Gateway to the West '' ?
488 | NUM:dist How far away is the moon ?
489 | ENTY:other What is the source of natural gas ?
490 | ENTY:veh In what spacecraft did U.S. astronaut Alan Shepard make his historic 1961 flight ?
491 | DESC:def What is pectin ?
492 | DESC:def What is bio-diversity ?
493 | ENTY:techmeth What 's the easiest way to remove wallpaper ?
494 | NUM:date What year did the Titanic start on its journey ?
495 | NUM:count How much of an apple is water ?
496 | HUM:ind Who was the 22nd President of the US ?
497 | ENTY:currency What is the money they use in Zambia ?
498 | NUM:count How many feet in a mile ?
499 | ENTY:substance What is the birthstone of October ?
500 | DESC:def What is e-coli ?
--------------------------------------------------------------------------------