├── .gitignore ├── LICENSE ├── Question Classifier.ipynb ├── README.md ├── question_classification_taxanomy (1) (1).txt ├── traininig_dataset (1) (1).txt └── validation_dataset (1) (1).txt /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Aman 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Question Classifier.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "c:\\users\\i327950\\appdata\\local\\programs\\python\\python35\\lib\\site-packages\\gensim\\utils.py:843: UserWarning: detected Windows; aliasing chunkize to chunkize_serial\n", 13 | " warnings.warn(\"detected Windows; aliasing chunkize to chunkize_serial\")\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "import numpy as np\n", 20 | "import matplotlib.pyplot as plt\n", 21 | "import re, nltk\n", 22 | "import gensim\n", 23 | "import codecs\n", 24 | "from sner import Ner\n", 25 | "import spacy\n", 26 | "from sklearn.metrics import confusion_matrix, accuracy_score, average_precision_score\n", 27 | "from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, GridSearchCV\n", 28 | "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", 29 | "from nltk.internals import find_jars_within_path\n", 30 | "from nltk.tag import StanfordPOSTagger\n", 31 | "from nltk.tag import StanfordNERTagger\n", 32 | "import spacy\n", 33 | "from sklearn import linear_model\n", 34 | "from sklearn import svm\n", 35 | "from sklearn.metrics import fbeta_score, accuracy_score\n", 36 | "from scipy.sparse import hstack\n", 37 | "from sklearn.feature_extraction.text import CountVectorizer" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "metadata": { 44 | "collapsed": true 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "f_train = open('traininig_dataset (1) (1).txt', 'r+')\n", 49 | "f_test = open('validation_dataset (1) (1).txt', 'r+')\n", 50 | "\n", 51 | "train = pd.DataFrame(f_train.readlines(), columns = 
['Question'])\n", 52 | "test = pd.DataFrame(f_test.readlines(), columns = ['Question'])" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "metadata": { 59 | "collapsed": true 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "train['QType'] = train.Question.apply(lambda x: x.split(' ', 1)[0])\n", 64 | "train['Question'] = train.Question.apply(lambda x: x.split(' ', 1)[1])\n", 65 | "train['QType-Coarse'] = train.QType.apply(lambda x: x.split(':')[0])\n", 66 | "train['QType-Fine'] = train.QType.apply(lambda x: x.split(':')[1])\n", 67 | "test['QType'] = test.Question.apply(lambda x: x.split(' ', 1)[0])\n", 68 | "test['Question'] = test.Question.apply(lambda x: x.split(' ', 1)[1])\n", 69 | "test['QType-Coarse'] = test.QType.apply(lambda x: x.split(':')[0])\n", 70 | "test['QType-Fine'] = test.QType.apply(lambda x: x.split(':')[1])" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 4, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "data": { 80 | "text/html": [ 81 | "
\n", 82 | "\n", 95 | "\n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | "
QuestionQTypeQType-CoarseQType-Fine
0How did serfdom develop in and then leave Russ...DESC:mannerDESCmanner
1What films featured the character Popeye Doyle...ENTY:crematENTYcremat
2How can I find a list of celebrities ' real na...DESC:mannerDESCmanner
3What fowl grabs the spotlight after the Chines...ENTY:animalENTYanimal
4What is the full form of .com ?\\nABBR:expABBRexp
\n", 143 | "
" 144 | ], 145 | "text/plain": [ 146 | " Question QType \\\n", 147 | "0 How did serfdom develop in and then leave Russ... DESC:manner \n", 148 | "1 What films featured the character Popeye Doyle... ENTY:cremat \n", 149 | "2 How can I find a list of celebrities ' real na... DESC:manner \n", 150 | "3 What fowl grabs the spotlight after the Chines... ENTY:animal \n", 151 | "4 What is the full form of .com ?\\n ABBR:exp \n", 152 | "\n", 153 | " QType-Coarse QType-Fine \n", 154 | "0 DESC manner \n", 155 | "1 ENTY cremat \n", 156 | "2 DESC manner \n", 157 | "3 ENTY animal \n", 158 | "4 ABBR exp " 159 | ] 160 | }, 161 | "execution_count": 4, 162 | "metadata": {}, 163 | "output_type": "execute_result" 164 | } 165 | ], 166 | "source": [ 167 | "train.head()" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 5, 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "data": { 177 | "text/html": [ 178 | "
\n", 179 | "\n", 192 | "\n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | "
QuestionQTypeQType-CoarseQType-Fine
count500500500500
unique50042639
topWhat is the most popular sport in Japan ?\\nDESC:defDESCdef
freq1123138123
\n", 233 | "
" 234 | ], 235 | "text/plain": [ 236 | " Question QType QType-Coarse \\\n", 237 | "count 500 500 500 \n", 238 | "unique 500 42 6 \n", 239 | "top What is the most popular sport in Japan ?\\n DESC:def DESC \n", 240 | "freq 1 123 138 \n", 241 | "\n", 242 | " QType-Fine \n", 243 | "count 500 \n", 244 | "unique 39 \n", 245 | "top def \n", 246 | "freq 123 " 247 | ] 248 | }, 249 | "execution_count": 5, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "test.describe()" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 6, 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "data": { 265 | "text/html": [ 266 | "
\n", 267 | "\n", 280 | "\n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | "
QuestionQTypeQType-CoarseQType-Fine
0How far is it from Denver to Aspen ?\\nNUM:distNUMdist
1What county is Modesto , California in ?\\nLOC:cityLOCcity
2Who was Galileo ?\\nHUM:descHUMdesc
3What is an atom ?\\nDESC:defDESCdef
4When did Hawaii become a state ?\\nNUM:dateNUMdate
\n", 328 | "
" 329 | ], 330 | "text/plain": [ 331 | " Question QType QType-Coarse \\\n", 332 | "0 How far is it from Denver to Aspen ?\\n NUM:dist NUM \n", 333 | "1 What county is Modesto , California in ?\\n LOC:city LOC \n", 334 | "2 Who was Galileo ?\\n HUM:desc HUM \n", 335 | "3 What is an atom ?\\n DESC:def DESC \n", 336 | "4 When did Hawaii become a state ?\\n NUM:date NUM \n", 337 | "\n", 338 | " QType-Fine \n", 339 | "0 dist \n", 340 | "1 city \n", 341 | "2 desc \n", 342 | "3 def \n", 343 | "4 date " 344 | ] 345 | }, 346 | "execution_count": 6, 347 | "metadata": {}, 348 | "output_type": "execute_result" 349 | } 350 | ], 351 | "source": [ 352 | "test.head()" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 7, 358 | "metadata": {}, 359 | "outputs": [ 360 | { 361 | "data": { 362 | "text/html": [ 363 | "
\n", 364 | "\n", 377 | "\n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | "
QuestionQTypeQType-CoarseQType-Fine
count5952595259525952
unique587150647
topWhat is the latitude and longitude of El Paso ...HUM:indENTYind
freq3101713441017
\n", 418 | "
" 419 | ], 420 | "text/plain": [ 421 | " Question QType \\\n", 422 | "count 5952 5952 \n", 423 | "unique 5871 50 \n", 424 | "top What is the latitude and longitude of El Paso ... HUM:ind \n", 425 | "freq 3 1017 \n", 426 | "\n", 427 | " QType-Coarse QType-Fine \n", 428 | "count 5952 5952 \n", 429 | "unique 6 47 \n", 430 | "top ENTY ind \n", 431 | "freq 1344 1017 " 432 | ] 433 | }, 434 | "execution_count": 7, 435 | "metadata": {}, 436 | "output_type": "execute_result" 437 | } 438 | ], 439 | "source": [ 440 | "train.append(test).describe()" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "As can be observed, the train set consists of some duplicate question (81 to be exact).
\n", 448 | "The number of unique Coarse:Fine classes is 50, whereas only 42 of them are present in the test set.
\n", 449 | "The number of fine classes overall is 47 whereas entries corresponding to 39 are present in test." 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": 8, 455 | "metadata": { 456 | "collapsed": true 457 | }, 458 | "outputs": [], 459 | "source": [ 460 | "from sklearn.preprocessing import LabelEncoder\n", 461 | "le = LabelEncoder()\n", 462 | "le.fit(pd.Series(train.QType.tolist() + test.QType.tolist()).values)\n", 463 | "train['QType'] = le.transform(train.QType.values)\n", 464 | "test['QType'] = le.transform(test.QType.values)\n", 465 | "le2 = LabelEncoder()\n", 466 | "le2.fit(pd.Series(train['QType-Coarse'].tolist() + test['QType-Coarse'].tolist()).values)\n", 467 | "train['QType-Coarse'] = le2.transform(train['QType-Coarse'].values)\n", 468 | "test['QType-Coarse'] = le2.transform(test['QType-Coarse'].values)\n", 469 | "le3 = LabelEncoder()\n", 470 | "le3.fit(pd.Series(train['QType-Fine'].tolist() + test['QType-Fine'].tolist()).values)\n", 471 | "train['QType-Fine'] = le3.transform(train['QType-Fine'].values)\n", 472 | "test['QType-Fine'] = le3.transform(test['QType-Fine'].values)" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": 9, 478 | "metadata": {}, 479 | "outputs": [ 480 | { 481 | "data": { 482 | "text/html": [ 483 | "
\n", 484 | "\n", 497 | "\n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | "
QuestionQTypeQType-CoarseQType-Fine
0How did serfdom develop in and then leave Russ...4123
1What films featured the character Popeye Doyle...928
2How can I find a list of celebrities ' real na...4123
3What fowl grabs the spotlight after the Chines...621
4What is the full form of .com ?\\n1016
\n", 545 | "
" 546 | ], 547 | "text/plain": [ 548 | " Question QType QType-Coarse \\\n", 549 | "0 How did serfdom develop in and then leave Russ... 4 1 \n", 550 | "1 What films featured the character Popeye Doyle... 9 2 \n", 551 | "2 How can I find a list of celebrities ' real na... 4 1 \n", 552 | "3 What fowl grabs the spotlight after the Chines... 6 2 \n", 553 | "4 What is the full form of .com ?\\n 1 0 \n", 554 | "\n", 555 | " QType-Fine \n", 556 | "0 23 \n", 557 | "1 8 \n", 558 | "2 23 \n", 559 | "3 1 \n", 560 | "4 16 " 561 | ] 562 | }, 563 | "execution_count": 9, 564 | "metadata": {}, 565 | "output_type": "execute_result" 566 | } 567 | ], 568 | "source": [ 569 | "train.head()" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": 10, 575 | "metadata": { 576 | "collapsed": true 577 | }, 578 | "outputs": [], 579 | "source": [ 580 | "all_corpus = pd.Series(train.Question.tolist() + test.Question.tolist()).astype(str)" 581 | ] 582 | }, 583 | { 584 | "cell_type": "markdown", 585 | "metadata": {}, 586 | "source": [ 587 | "Obtaining Dotwords.
\n", 588 | "Also, performing text cleaning and pre-processing in the next two blocks" 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": 11, 594 | "metadata": {}, 595 | "outputs": [ 596 | { 597 | "name": "stdout", 598 | "output_type": "stream", 599 | "text": [ 600 | "[nltk_data] Downloading package stopwords to\n", 601 | "[nltk_data] C:\\Users\\I327950\\AppData\\Roaming\\nltk_data...\n", 602 | "[nltk_data] Package stopwords is already up-to-date!\n", 603 | "[nltk_data] Downloading package wordnet to\n", 604 | "[nltk_data] C:\\Users\\I327950\\AppData\\Roaming\\nltk_data...\n", 605 | "[nltk_data] Package wordnet is already up-to-date!\n" 606 | ] 607 | } 608 | ], 609 | "source": [ 610 | "nltk.download('stopwords')\n", 611 | "nltk.download('wordnet')\n", 612 | "from nltk.corpus import stopwords\n", 613 | "from nltk.stem.porter import PorterStemmer \n", 614 | "from nltk.stem.snowball import SnowballStemmer\n", 615 | "from nltk.stem.wordnet import WordNetLemmatizer\n", 616 | "\n", 617 | "# dot_words = []\n", 618 | "# for row in all_corpus:\n", 619 | "# for word in row.split():\n", 620 | "# if '.' in word and len(word)>2:\n", 621 | "# dot_words.append(word)" 622 | ] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": 12, 627 | "metadata": { 628 | "collapsed": true 629 | }, 630 | "outputs": [], 631 | "source": [ 632 | "def text_clean(corpus, keep_list):\n", 633 | " '''\n", 634 | " Purpose : Function to keep only alphabets, digits and certain words (punctuations, qmarks, tabs etc. 
removed)\n", 635 | " \n", 636 | " Input : Takes a text corpus, 'corpus' to be cleaned along with a list of words, 'keep_list', which have to be retained\n", 637 | " even after the cleaning process\n", 638 | " \n", 639 | " Output : Returns the cleaned text corpus\n", 640 | " \n", 641 | " '''\n", 642 | " cleaned_corpus = pd.Series()\n", 643 | " for row in corpus:\n", 644 | " qs = []\n", 645 | " for word in row.split():\n", 646 | " if word not in keep_list:\n", 647 | " p1 = re.sub(pattern='[^a-zA-Z0-9]',repl=' ',string=word)\n", 648 | " p1 = p1.lower()\n", 649 | " qs.append(p1)\n", 650 | " else : qs.append(word)\n", 651 | " cleaned_corpus = cleaned_corpus.append(pd.Series(' '.join(qs)))\n", 652 | " return cleaned_corpus" 653 | ] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": 13, 658 | "metadata": { 659 | "collapsed": true 660 | }, 661 | "outputs": [], 662 | "source": [ 663 | "def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):\n", 664 | " \n", 665 | " '''\n", 666 | " Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)\n", 667 | " \n", 668 | " Input : \n", 669 | " 'corpus' - Text corpus on which pre-processing tasks will be performed\n", 670 | " 'keep_list' - List of words to be retained during cleaning process\n", 671 | " 'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Boolean variables indicating whether a particular task should \n", 672 | " be performed or not\n", 673 | " 'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is \"None\", which corresponds to Porter\n", 674 | " Stemmer. 'snowball' corresponds to Snowball Stemmer\n", 675 | " \n", 676 | " Note : Either stemming or lemmatization should be used. 
There's no benefit of using both of them together\n", 677 | " \n", 678 | " Output : Returns the processed text corpus\n", 679 | " \n", 680 | " '''\n", 681 | " if cleaning == True:\n", 682 | " corpus = text_clean(corpus, keep_list)\n", 683 | " \n", 684 | " if remove_stopwords == True:\n", 685 | " wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']\n", 686 | " stop = set(stopwords.words('english'))\n", 687 | " for word in wh_words:\n", 688 | " stop.remove(word)\n", 689 | " corpus = [[x for x in x.split() if x not in stop] for x in corpus]\n", 690 | " else :\n", 691 | " corpus = [[x for x in x.split()] for x in corpus]\n", 692 | " \n", 693 | " if lemmatization == True:\n", 694 | " lem = WordNetLemmatizer()\n", 695 | " corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]\n", 696 | " \n", 697 | " if stemming == True:\n", 698 | " if stem_type == 'snowball':\n", 699 | " stemmer = SnowballStemmer(language = 'english')\n", 700 | " corpus = [[stemmer.stem(x) for x in x] for x in corpus]\n", 701 | " else :\n", 702 | " stemmer = PorterStemmer()\n", 703 | " corpus = [[stemmer.stem(x) for x in x] for x in corpus]\n", 704 | " \n", 705 | " corpus = [' '.join(x) for x in corpus]\n", 706 | " \n", 707 | "\n", 708 | " return corpus" 709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "execution_count": 14, 714 | "metadata": { 715 | "collapsed": true 716 | }, 717 | "outputs": [], 718 | "source": [ 719 | "common_dot_words = ['U.S.', 'St.', 'Mr.', 'Mrs.', 'D.C.']\n", 720 | "all_corpus = preprocess(all_corpus, keep_list = common_dot_words, remove_stopwords = True)" 721 | ] 722 | }, 723 | { 724 | "cell_type": "markdown", 725 | "metadata": {}, 726 | "source": [ 727 | "# Splitting the preprocessed combined corpus again into train and test set" 728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": 16, 733 | "metadata": {}, 734 | "outputs": [], 735 | "source": [ 736 | "train_corpus = all_corpus[0:train.shape[0]]\n", 737 | 
"test_corpus = all_corpus[train.shape[0]:]" 738 | ] 739 | }, 740 | { 741 | "cell_type": "markdown", 742 | "metadata": {}, 743 | "source": [ 744 | "Loading the English model for Spacy.
\n", 745 | "The NLTK equivalent performs too slowly, hence opting for Spacy." 746 | ] 747 | }, 748 | { 749 | "cell_type": "code", 750 | "execution_count": 17, 751 | "metadata": { 752 | "collapsed": true 753 | }, 754 | "outputs": [], 755 | "source": [ 756 | "nlp = spacy.load('en')" 757 | ] 758 | }, 759 | { 760 | "cell_type": "markdown", 761 | "metadata": {}, 762 | "source": [ 763 | "# Obtaining Features from Train Data, which would be fed to CountVectorizer\n", 764 | "\n", 765 | "Creating lists of Named Entities, Lemmas, POS Tags, Syntactic Dependency Relations and Orthographic Features using shape.
\n", 766 | "Later, these would be used as features for our model." 767 | ] 768 | }, 769 | { 770 | "cell_type": "code", 771 | "execution_count": 18, 772 | "metadata": { 773 | "collapsed": true 774 | }, 775 | "outputs": [], 776 | "source": [ 777 | "all_ner = []\n", 778 | "all_lemma = []\n", 779 | "all_tag = []\n", 780 | "all_dep = []\n", 781 | "all_shape = []\n", 782 | "for row in train_corpus:\n", 783 | " doc = nlp(row)\n", 784 | " present_lemma = []\n", 785 | " present_tag = []\n", 786 | " present_dep = []\n", 787 | " present_shape = []\n", 788 | " present_ner = []\n", 789 | " #print(row)\n", 790 | " for token in doc:\n", 791 | " present_lemma.append(token.lemma_)\n", 792 | " present_tag.append(token.tag_)\n", 793 | " #print(present_tag)\n", 794 | " present_dep.append(token.dep_)\n", 795 | " present_shape.append(token.shape_)\n", 796 | " all_lemma.append(\" \".join(present_lemma))\n", 797 | " all_tag.append(\" \".join(present_tag))\n", 798 | " all_dep.append(\" \".join(present_dep))\n", 799 | " all_shape.append(\" \".join(present_shape))\n", 800 | " for ent in doc.ents:\n", 801 | " present_ner.append(ent.label_)\n", 802 | " all_ner.append(\" \".join(present_ner))" 803 | ] 804 | }, 805 | { 806 | "cell_type": "markdown", 807 | "metadata": {}, 808 | "source": [ 809 | "Converting the attributes obtained above into vectors using CountVectorizer." 
810 | ] 811 | }, 812 | { 813 | "cell_type": "code", 814 | "execution_count": 19, 815 | "metadata": { 816 | "collapsed": true 817 | }, 818 | "outputs": [], 819 | "source": [ 820 | "count_vec_ner = CountVectorizer(ngram_range=(1, 2)).fit(all_ner)\n", 821 | "ner_ft = count_vec_ner.transform(all_ner)\n", 822 | "count_vec_lemma = CountVectorizer(ngram_range=(1, 2)).fit(all_lemma)\n", 823 | "lemma_ft = count_vec_lemma.transform(all_lemma)\n", 824 | "count_vec_tag = CountVectorizer(ngram_range=(1, 2)).fit(all_tag)\n", 825 | "tag_ft = count_vec_tag.transform(all_tag)\n", 826 | "count_vec_dep = CountVectorizer(ngram_range=(1, 2)).fit(all_dep)\n", 827 | "dep_ft = count_vec_dep.transform(all_dep)\n", 828 | "count_vec_shape = CountVectorizer(ngram_range=(1, 2)).fit(all_shape)\n", 829 | "shape_ft = count_vec_shape.transform(all_shape)" 830 | ] 831 | }, 832 | { 833 | "cell_type": "markdown", 834 | "metadata": {}, 835 | "source": [ 836 | "Combining the features obtained into 1 matrix" 837 | ] 838 | }, 839 | { 840 | "cell_type": "code", 841 | "execution_count": 20, 842 | "metadata": { 843 | "collapsed": true 844 | }, 845 | "outputs": [], 846 | "source": [ 847 | "#x_all_ft_train = hstack([ner_ft, lemma_ft, tag_ft, dep_ft, shape_ft])\n", 848 | "x_all_ft_train = hstack([ner_ft, lemma_ft, tag_ft])" 849 | ] 850 | }, 851 | { 852 | "cell_type": "code", 853 | "execution_count": 21, 854 | "metadata": {}, 855 | "outputs": [ 856 | { 857 | "data": { 858 | "text/plain": [ 859 | "<5452x27303 sparse matrix of type ''\n", 860 | "\twith 102689 stored elements in COOrdinate format>" 861 | ] 862 | }, 863 | "execution_count": 21, 864 | "metadata": {}, 865 | "output_type": "execute_result" 866 | } 867 | ], 868 | "source": [ 869 | "x_all_ft_train" 870 | ] 871 | }, 872 | { 873 | "cell_type": "markdown", 874 | "metadata": {}, 875 | "source": [ 876 | "Converting from COOrdinate format to Compressed Sparse Row format for easier mathematical computations." 
877 | ] 878 | }, 879 | { 880 | "cell_type": "code", 881 | "execution_count": 22, 882 | "metadata": {}, 883 | "outputs": [ 884 | { 885 | "data": { 886 | "text/plain": [ 887 | "<5452x27303 sparse matrix of type ''\n", 888 | "\twith 102689 stored elements in Compressed Sparse Row format>" 889 | ] 890 | }, 891 | "execution_count": 22, 892 | "metadata": {}, 893 | "output_type": "execute_result" 894 | } 895 | ], 896 | "source": [ 897 | "x_all_ft_train = x_all_ft_train.tocsr()\n", 898 | "x_all_ft_train" 899 | ] 900 | }, 901 | { 902 | "cell_type": "markdown", 903 | "metadata": {}, 904 | "source": [ 905 | "# Now we will obtain the Feature vectors for the test set using the CountVectorizers Obtained from the Training Corpus" 906 | ] 907 | }, 908 | { 909 | "cell_type": "code", 910 | "execution_count": 23, 911 | "metadata": { 912 | "collapsed": true 913 | }, 914 | "outputs": [], 915 | "source": [ 916 | "all_test_ner = []\n", 917 | "all_test_lemma = []\n", 918 | "all_test_tag = []\n", 919 | "all_test_dep = []\n", 920 | "all_test_shape = []\n", 921 | "for row in test_corpus:\n", 922 | " doc = nlp(row)\n", 923 | " present_lemma = []\n", 924 | " present_tag = []\n", 925 | " present_dep = []\n", 926 | " present_shape = []\n", 927 | " present_ner = []\n", 928 | " #print(row)\n", 929 | " for token in doc:\n", 930 | " present_lemma.append(token.lemma_)\n", 931 | " present_tag.append(token.tag_)\n", 932 | " #print(present_tag)\n", 933 | " present_dep.append(token.dep_)\n", 934 | " present_shape.append(token.shape_)\n", 935 | " all_test_lemma.append(\" \".join(present_lemma))\n", 936 | " all_test_tag.append(\" \".join(present_tag))\n", 937 | " all_test_dep.append(\" \".join(present_dep))\n", 938 | " all_test_shape.append(\" \".join(present_shape))\n", 939 | " for ent in doc.ents:\n", 940 | " present_ner.append(ent.label_)\n", 941 | " all_test_ner.append(\" \".join(present_ner))" 942 | ] 943 | }, 944 | { 945 | "cell_type": "code", 946 | "execution_count": 24, 947 | "metadata": { 948 | 
"collapsed": true 949 | }, 950 | "outputs": [], 951 | "source": [ 952 | "ner_test_ft = count_vec_ner.transform(all_test_ner)\n", 953 | "lemma_test_ft = count_vec_lemma.transform(all_test_lemma)\n", 954 | "tag_test_ft = count_vec_tag.transform(all_test_tag)\n", 955 | "dep_test_ft = count_vec_dep.transform(all_test_dep)\n", 956 | "shape_test_ft = count_vec_shape.transform(all_test_shape)" 957 | ] 958 | }, 959 | { 960 | "cell_type": "code", 961 | "execution_count": 25, 962 | "metadata": {}, 963 | "outputs": [], 964 | "source": [ 965 | "#x_all_ft_test = hstack([ner_test_ft, lemma_test_ft, tag_test_ft, dep_test_ft, shape_test_ft])\n", 966 | "x_all_ft_test = hstack([ner_test_ft, lemma_test_ft, tag_test_ft])" 967 | ] 968 | }, 969 | { 970 | "cell_type": "code", 971 | "execution_count": 26, 972 | "metadata": {}, 973 | "outputs": [ 974 | { 975 | "data": { 976 | "text/plain": [ 977 | "<500x27303 sparse matrix of type ''\n", 978 | "\twith 5270 stored elements in COOrdinate format>" 979 | ] 980 | }, 981 | "execution_count": 26, 982 | "metadata": {}, 983 | "output_type": "execute_result" 984 | } 985 | ], 986 | "source": [ 987 | "x_all_ft_test" 988 | ] 989 | }, 990 | { 991 | "cell_type": "code", 992 | "execution_count": 27, 993 | "metadata": {}, 994 | "outputs": [ 995 | { 996 | "data": { 997 | "text/plain": [ 998 | "<500x27303 sparse matrix of type ''\n", 999 | "\twith 5270 stored elements in Compressed Sparse Row format>" 1000 | ] 1001 | }, 1002 | "execution_count": 27, 1003 | "metadata": {}, 1004 | "output_type": "execute_result" 1005 | } 1006 | ], 1007 | "source": [ 1008 | "x_all_ft_test = x_all_ft_test.tocsr()\n", 1009 | "x_all_ft_test" 1010 | ] 1011 | }, 1012 | { 1013 | "cell_type": "markdown", 1014 | "metadata": {}, 1015 | "source": [ 1016 | "# Model Training\n", 1017 | "Literature study over the years has shown Linear SVM performs best in this Use Case." 
1018 | ] 1019 | }, 1020 | { 1021 | "cell_type": "code", 1022 | "execution_count": 28, 1023 | "metadata": { 1024 | "collapsed": true 1025 | }, 1026 | "outputs": [], 1027 | "source": [ 1028 | "model = svm.LinearSVC()" 1029 | ] 1030 | }, 1031 | { 1032 | "cell_type": "markdown", 1033 | "metadata": {}, 1034 | "source": [ 1035 | "First Modelling for Coarse Classes" 1036 | ] 1037 | }, 1038 | { 1039 | "cell_type": "code", 1040 | "execution_count": 29, 1041 | "metadata": {}, 1042 | "outputs": [ 1043 | { 1044 | "data": { 1045 | "text/plain": [ 1046 | "LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 1047 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 1048 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 1049 | " verbose=0)" 1050 | ] 1051 | }, 1052 | "execution_count": 29, 1053 | "metadata": {}, 1054 | "output_type": "execute_result" 1055 | } 1056 | ], 1057 | "source": [ 1058 | "model.fit(x_all_ft_train, train['QType-Coarse'].values)" 1059 | ] 1060 | }, 1061 | { 1062 | "cell_type": "markdown", 1063 | "metadata": {}, 1064 | "source": [ 1065 | "# Model Evaluation" 1066 | ] 1067 | }, 1068 | { 1069 | "cell_type": "code", 1070 | "execution_count": 30, 1071 | "metadata": { 1072 | "collapsed": true 1073 | }, 1074 | "outputs": [], 1075 | "source": [ 1076 | "preds = model.predict(x_all_ft_test)" 1077 | ] 1078 | }, 1079 | { 1080 | "cell_type": "code", 1081 | "execution_count": 31, 1082 | "metadata": {}, 1083 | "outputs": [ 1084 | { 1085 | "data": { 1086 | "text/plain": [ 1087 | "array([5, 4, 3, 1, 5, 5, 3, 1, 1, 1, 4, 1, 5, 3, 5, 5, 4, 3, 1, 5, 3, 1, 4,\n", 1088 | " 1, 1, 3, 1, 1, 4, 1, 5, 4, 1, 5, 5, 5, 4, 5, 5, 5, 2, 1, 1, 1, 3, 2,\n", 1089 | " 5, 1, 5, 3, 1, 3, 3, 1, 1, 1, 5, 4, 4, 5, 4, 3, 4, 2, 4, 3, 2, 1, 5,\n", 1090 | " 4, 5, 5, 4, 3, 4, 1, 2, 5, 5, 3, 1, 5, 3, 5, 5, 1, 1, 3, 1, 4, 3, 1,\n", 1091 | " 5, 5, 4, 4, 5, 1, 1, 3, 1, 3, 1, 3, 4, 1, 5, 2, 5, 4, 2, 1, 4, 2, 4,\n", 1092 | " 3, 5, 1, 5, 4, 5, 2, 1, 3, 1, 3, 1, 
5, 1, 5, 5, 3, 1, 1, 1, 1, 4, 3,\n", 1093 | " 3, 1, 1, 2, 4, 2, 1, 2, 3, 2, 1, 1, 2, 3, 1, 5, 3, 4, 4, 1, 2, 4, 1,\n", 1094 | " 1, 5, 4, 2, 1, 5, 1, 4, 3, 5, 5, 5, 1, 4, 4, 4, 5, 2, 5, 4, 1, 4, 1,\n", 1095 | " 1, 3, 3, 1, 4, 1, 1, 4, 5, 5, 1, 4, 2, 3, 2, 2, 3, 4, 3, 2, 1, 4, 3,\n", 1096 | " 5, 1, 1, 5, 5, 1, 4, 1, 2, 1, 2, 5, 1, 1, 5, 1, 1, 4, 2, 5, 1, 4, 3,\n", 1097 | " 5, 3, 1, 5, 2, 1, 4, 1, 4, 5, 2, 1, 1, 1, 1, 4, 1, 1, 1, 1, 3, 1, 2,\n", 1098 | " 2, 1, 4, 4, 2, 1, 4, 3, 3, 5, 2, 5, 1, 1, 4, 5, 1, 2, 1, 1, 3, 1, 2,\n", 1099 | " 1, 5, 0, 2, 4, 3, 0, 1, 4, 1, 1, 1, 1, 1, 4, 2, 5, 2, 1, 1, 2, 5, 1,\n", 1100 | " 2, 0, 5, 1, 5, 5, 4, 3, 4, 3, 5, 4, 4, 5, 1, 4, 1, 3, 4, 2, 4, 1, 5,\n", 1101 | " 1, 2, 5, 1, 1, 5, 5, 1, 1, 5, 2, 2, 1, 4, 1, 2, 1, 5, 5, 2, 5, 3, 5,\n", 1102 | " 3, 3, 1, 5, 1, 5, 4, 4, 2, 1, 3, 5, 1, 1, 2, 1, 1, 3, 5, 1, 1, 2, 2,\n", 1103 | " 1, 4, 1, 2, 1, 1, 3, 5, 4, 1, 0, 1, 3, 3, 1, 3, 5, 5, 1, 3, 1, 1, 3,\n", 1104 | " 1, 2, 5, 1, 1, 1, 5, 5, 4, 1, 2, 5, 0, 5, 4, 1, 4, 5, 1, 3, 1, 4, 0,\n", 1105 | " 4, 1, 1, 1, 3, 3, 5, 1, 3, 1, 4, 2, 1, 4, 1, 3, 1, 2, 4, 3, 1, 1, 1,\n", 1106 | " 5, 0, 2, 3, 1, 4, 3, 3, 2, 4, 3, 5, 2, 2, 2, 2, 5, 1, 5, 2, 4, 1, 1,\n", 1107 | " 1, 2, 1, 5, 4, 2, 1, 3, 1, 1, 1, 1, 2, 5, 2, 2, 1, 4, 2, 4, 3, 2, 2,\n", 1108 | " 1, 4, 1, 4, 5, 1, 2, 1, 1, 2, 5, 5, 3, 2, 5, 1, 1], dtype=int64)" 1109 | ] 1110 | }, 1111 | "execution_count": 31, 1112 | "metadata": {}, 1113 | "output_type": "execute_result" 1114 | } 1115 | ], 1116 | "source": [ 1117 | "preds" 1118 | ] 1119 | }, 1120 | { 1121 | "cell_type": "code", 1122 | "execution_count": 32, 1123 | "metadata": {}, 1124 | "outputs": [ 1125 | { 1126 | "data": { 1127 | "text/plain": [ 1128 | "0.88200000000000001" 1129 | ] 1130 | }, 1131 | "execution_count": 32, 1132 | "metadata": {}, 1133 | "output_type": "execute_result" 1134 | } 1135 | ], 1136 | "source": [ 1137 | "accuracy_score(test['QType-Coarse'].values, preds)" 1138 | ] 1139 | }, 1140 | { 1141 | "cell_type": "markdown", 1142 | 
"metadata": {}, 1143 | "source": [ 1144 | "Glad to announce, Feature Engineering has enabled us to achieve an Accuracy of 88.2% on the validation set.
\n", 1145 | "The obtained accuracy is way higher than the 73% accuracy obtained without feature engineering" 1146 | ] 1147 | }, 1148 | { 1149 | "cell_type": "markdown", 1150 | "metadata": {}, 1151 | "source": [ 1152 | "Next, we will obtain accuracies for Coarse:Fine combinations" 1153 | ] 1154 | }, 1155 | { 1156 | "cell_type": "code", 1157 | "execution_count": 33, 1158 | "metadata": {}, 1159 | "outputs": [ 1160 | { 1161 | "data": { 1162 | "text/plain": [ 1163 | "LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 1164 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 1165 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 1166 | " verbose=0)" 1167 | ] 1168 | }, 1169 | "execution_count": 33, 1170 | "metadata": {}, 1171 | "output_type": "execute_result" 1172 | } 1173 | ], 1174 | "source": [ 1175 | "model.fit(x_all_ft_train, train['QType'].values)" 1176 | ] 1177 | }, 1178 | { 1179 | "cell_type": "code", 1180 | "execution_count": 34, 1181 | "metadata": { 1182 | "collapsed": true 1183 | }, 1184 | "outputs": [], 1185 | "source": [ 1186 | "preds = model.predict(x_all_ft_test)" 1187 | ] 1188 | }, 1189 | { 1190 | "cell_type": "code", 1191 | "execution_count": 35, 1192 | "metadata": {}, 1193 | "outputs": [ 1194 | { 1195 | "data": { 1196 | "text/plain": [ 1197 | "0.81399999999999995" 1198 | ] 1199 | }, 1200 | "execution_count": 35, 1201 | "metadata": {}, 1202 | "output_type": "execute_result" 1203 | } 1204 | ], 1205 | "source": [ 1206 | "accuracy_score(test['QType'].values, preds)" 1207 | ] 1208 | }, 1209 | { 1210 | "cell_type": "markdown", 1211 | "metadata": { 1212 | "collapsed": true 1213 | }, 1214 | "source": [ 1215 | "Woah, up to 81.4% accuracy from 68% obtained earlier when modelled without Feature Engineering." 
1216 | ] 1217 | }, 1218 | { 1219 | "cell_type": "markdown", 1220 | "metadata": {}, 1221 | "source": [ 1222 | "Finally, we would evaluate our performance for the fine classes" 1223 | ] 1224 | }, 1225 | { 1226 | "cell_type": "code", 1227 | "execution_count": 36, 1228 | "metadata": {}, 1229 | "outputs": [ 1230 | { 1231 | "data": { 1232 | "text/plain": [ 1233 | "LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 1234 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 1235 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 1236 | " verbose=0)" 1237 | ] 1238 | }, 1239 | "execution_count": 36, 1240 | "metadata": {}, 1241 | "output_type": "execute_result" 1242 | } 1243 | ], 1244 | "source": [ 1245 | "model.fit(x_all_ft_train, train['QType-Fine'].values)" 1246 | ] 1247 | }, 1248 | { 1249 | "cell_type": "code", 1250 | "execution_count": 37, 1251 | "metadata": { 1252 | "collapsed": true 1253 | }, 1254 | "outputs": [], 1255 | "source": [ 1256 | "preds = model.predict(x_all_ft_test)" 1257 | ] 1258 | }, 1259 | { 1260 | "cell_type": "code", 1261 | "execution_count": 38, 1262 | "metadata": {}, 1263 | "outputs": [ 1264 | { 1265 | "data": { 1266 | "text/plain": [ 1267 | "0.81200000000000006" 1268 | ] 1269 | }, 1270 | "execution_count": 38, 1271 | "metadata": {}, 1272 | "output_type": "execute_result" 1273 | } 1274 | ], 1275 | "source": [ 1276 | "accuracy_score(test['QType-Fine'].values, preds)" 1277 | ] 1278 | }, 1279 | { 1280 | "cell_type": "markdown", 1281 | "metadata": {}, 1282 | "source": [ 1283 | "Not bad, we have achieved an accuracy of 81.2% over the Fine Classes." 
1284 | ] 1285 | }, 1286 | { 1287 | "cell_type": "markdown", 1288 | "metadata": {}, 1289 | "source": [ 1290 | "# Conclusion\n", 1291 | "\n", 1292 | "We achieved great accuracies using Feature Engineering as compared to accuracies obtained without feature engineering.\n", 1293 | "(The notebook for models obtained without feature engineering is not being shared and one can try implementing it easily).\n", 1294 | "\n", 1295 | "Experimenting with informer hypernyms can further help in accuracy improvement as suggested in https://nlp.stanford.edu/courses/cs224n/2010/reports/olalerew.pdf" 1296 | ] 1297 | } 1298 | ], 1299 | "metadata": { 1300 | "kernelspec": { 1301 | "display_name": "Python 3", 1302 | "language": "python", 1303 | "name": "python3" 1304 | }, 1305 | "language_info": { 1306 | "codemirror_mode": { 1307 | "name": "ipython", 1308 | "version": 3 1309 | }, 1310 | "file_extension": ".py", 1311 | "mimetype": "text/x-python", 1312 | "name": "python", 1313 | "nbconvert_exporter": "python", 1314 | "pygments_lexer": "ipython3", 1315 | "version": "3.5.0" 1316 | } 1317 | }, 1318 | "nbformat": 4, 1319 | "nbformat_minor": 2 1320 | } 1321 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Question-Classification 2 | Classifying questions from UIUC's CogComp QC Dataset 3 | 4 | # Classifying Questions into Coarse (6 classes) and Fine (50 classes) classes. 5 | 6 | # Approach 7 | 1. Text Exploration 8 | 2. Text Cleaning 9 | 3. Obtaining POS Tags, Identifying Named Entities, Lemmas, Syntactic Dependency Relations and Orthographic Features. 10 | 4. Using the obtained properties as Features. 11 | 5. Using a Linear SVM model on the engineered features. 12 | 13 | # Results 14 | * 88.2% accuracy on Coarse classes. 15 | * 81.6% accuracy on Fine classes. 
16 | 17 | | Variations in Features Used | Coarse Set Accuracy | Coarse:Fine Set Accuracy | Fine Set Accuracy | 18 | | ------------- | ------------- | ------------- | ------------- | 19 | | Named Entity Recognition + Lemmas + POS Tags + Syntactic Dependency + Shape | 87.8 | 80.4 | 80.8 | 20 | | Named Entity Recognition + Lemmas + POS Tags + Syntactic Dependency | 87.2 | 80.6 | 81.4 | 21 | | Named Entity Recognition + Lemmas + POS Tags | **88.2** | **81.4** | 81.2 | 22 | | Named Entity Recognition + Lemmas | 86.4 | 80.6 | **81.6** | 23 | | Lemmas | 86.2 | 80.4 | **81.6** | 24 | 25 | # References 26 | https://nlp.stanford.edu/courses/cs224n/2010/reports/olalerew.pdf 27 | -------------------------------------------------------------------------------- /question_classification_taxanomy (1) (1).txt: -------------------------------------------------------------------------------- 1 | Class Definition 2 | 3 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 | ABBREVIATION abbreviation 5 | abb abbreviation 6 | exp expression abbreviated 7 | 8 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 9 | ENTITY entities 10 | animal animals 11 | body organs of body 12 | color colors 13 | creative inventions, books and other creative pieces 14 | currency currency names 15 | dis.med. diseases and medicine 16 | event events 17 | food food 18 | instrument musical instrument 19 | lang languages 20 | letter letters like a-z 21 | other other entities 22 | plant plants 23 | product products 24 | religion religions 25 | sport sports 26 | substance elements and substances 27 | symbol symbols and signs 28 | technique techniques and methods 29 | term equivalent terms 30 | vehicle vehicles 31 | word words with a special property 32 | 33 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 34 | DESCRIPTION description and abstract concepts 35 | definition definition of sth. 36 | description description of sth. 
37 | manner manner of an action 38 | reason reasons 39 | 40 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 41 | HUMAN human beings 42 | group a group or organization of persons 43 | ind an individual 44 | title title of a person 45 | description description of a person 46 | 47 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 48 | LOCATION locations 49 | city cities 50 | country countries 51 | mountain mountains 52 | other other locations 53 | state states 54 | 55 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 56 | NUMERIC numeric values 57 | code postcodes or other codes 58 | count number of sth. 59 | date dates 60 | distance linear measures 61 | money prices 62 | order ranks 63 | other other numbers 64 | period the lasting time of sth. 65 | percent fractions 66 | speed speed 67 | temp temperature 68 | size size, area and volume 69 | weight weight -------------------------------------------------------------------------------- /validation_dataset (1) (1).txt: -------------------------------------------------------------------------------- 1 | NUM:dist How far is it from Denver to Aspen ? 2 | LOC:city What county is Modesto , California in ? 3 | HUM:desc Who was Galileo ? 4 | DESC:def What is an atom ? 5 | NUM:date When did Hawaii become a state ? 6 | NUM:dist How tall is the Sears Building ? 7 | HUM:gr George Bush purchased a small interest in which baseball team ? 8 | ENTY:plant What is Australia 's national flower ? 9 | DESC:reason Why does the moon turn orange ? 10 | DESC:def What is autism ? 11 | LOC:city What city had a world fair in 1900 ? 12 | HUM:ind What person 's head is on a dime ? 13 | NUM:weight What is the average weight of a Yellow Labrador ? 14 | HUM:ind Who was the first man to fly across the Pacific Ocean ? 15 | NUM:date When did Idaho become a state ? 16 | NUM:other What is the life expectancy for crickets ? 17 | ENTY:substance What metal has the highest melting point ? 
18 | HUM:ind Who developed the vaccination against polio ? 19 | DESC:def What is epilepsy ? 20 | NUM:date What year did the Titanic sink ? 21 | HUM:ind Who was the first American to walk in space ? 22 | DESC:def What is a biosphere ? 23 | LOC:other What river in the US is known as the Big Muddy ? 24 | DESC:def What is bipolar disorder ? 25 | DESC:def What is cholesterol ? 26 | HUM:ind Who developed the Macintosh computer ? 27 | DESC:def What is caffeine ? 28 | LOC:other What imaginary line is halfway between the North and South Poles ? 29 | LOC:other Where is John Wayne airport ? 30 | LOC:other What hemisphere is the Philippines in ? 31 | NUM:speed What is the average speed of the horses at the Kentucky Derby ? 32 | LOC:mount Where are the Rocky Mountains ? 33 | DESC:def What are invertebrates ? 34 | NUM:temp What is the temperature at the center of the earth ? 35 | NUM:date When did John F. Kennedy get elected as President ? 36 | NUM:period How old was Elvis Presley when he died ? 37 | LOC:other Where is the Orinoco River ? 38 | NUM:dist How far is the service line from the net in tennis ? 39 | NUM:count How much fiber should you have per day ? 40 | NUM:count How many Great Lakes are there ? 41 | ENTY:plant Material called linen is made from what plant ? 42 | DESC:def What is Teflon ? 43 | DESC:def What is amitriptyline ? 44 | DESC:def What is a shaman ? 45 | ENTY:animal What is the proper name for a female walrus ? 46 | ENTY:animal What is a group of turkeys called ? 47 | NUM:period How long did Rip Van Winkle sleep ? 48 | DESC:def What are triglycerides ? 49 | NUM:count How many liters in a gallon ? 50 | HUM:gr What is the name of the chocolate company in San Francisco ? 51 | DESC:def What are amphibians ? 52 | HUM:ind Who discovered x-rays ? 53 | HUM:ind Which comedian 's signature line is `` Can we talk '' ? 54 | DESC:def What is fibromyalgia ? 55 | DESC:desc What is done with worn or outdated flags ? 56 | DESC:def What does cc in engines mean ? 
57 | NUM:date When did Elvis Presley die ? 58 | LOC:city What is the capital of Yugoslavia ? 59 | LOC:city Where is Milan ? 60 | NUM:speed What is the speed hummingbirds fly ? 61 | LOC:city What is the oldest city in the United States ? 62 | HUM:ind What was W.C. Fields ' real name ? 63 | LOC:other What river flows between Fargo , North Dakota and Moorhead , Minnesota ? 64 | ENTY:food What do bats eat ? 65 | LOC:state What state did the Battle of Bighorn take place in ? 66 | HUM:desc Who was Abraham Lincoln ? 67 | ENTY:termeq What do you call a newborn kangaroo ? 68 | DESC:def What are spider veins ? 69 | NUM:date What day and month did John Lennon die ? 70 | LOC:other What strait separates North America from Asia ? 71 | NUM:other What is the population of Seattle ? 72 | NUM:money How much was a ticket for the Titanic ? 73 | LOC:city What is the largest city in the world ? 74 | HUM:ind What American composer wrote the music for `` West Side Story '' ? 75 | LOC:other Where is the Mall of the America ? 76 | DESC:def What is the pH scale ? 77 | ENTY:currency What type of currency is used in Australia ? 78 | NUM:dist How tall is the Gateway Arch in St. Louis , MO ? 79 | NUM:weight How much does the human adult female brain weigh ? 80 | HUM:ind Who was the first governor of Alaska ? 81 | DESC:def What is a prism ? 82 | NUM:date When was the first liver transplant ? 83 | HUM:ind Who was elected president of South Africa in 1994 ? 84 | NUM:other What is the population of China ? 85 | NUM:date When was Rosa Parks born ? 86 | DESC:reason Why is a ladybug helpful ? 87 | DESC:def What is amoxicillin ? 88 | HUM:ind Who was the first female United States Representative ? 89 | DESC:def What are xerophytes ? 90 | LOC:country What country did Ponce de Leon come from ? 91 | ENTY:event The U.S. Department of Treasury first issued paper currency for the U.S. during which war ? 92 | DESC:def What is desktop publishing ? 93 | NUM:temp What is the temperature of the sun 's surface ? 
94 | NUM:date What year did Canada join the United Nations ? 95 | HUM:gr What is the oldest university in the US ? 96 | LOC:other Where is Prince Edward Island ? 97 | NUM:date Mercury , what year was it discovered ? 98 | DESC:def What is cryogenics ? 99 | DESC:def What are coral reefs ? 100 | ENTY:other What is the longest major league baseball-winning streak ? 101 | DESC:def What is neurology ? 102 | HUM:ind Who invented the calculator ? 103 | DESC:manner How do you measure earthquakes ? 104 | HUM:desc Who is Duke Ellington ? 105 | LOC:city What county is Phoenix , AZ in ? 106 | DESC:def What is a micron ? 107 | NUM:temp The sun 's core , what is the temperature ? 108 | ENTY:animal What is the Ohio state bird ? 109 | NUM:date When were William Shakespeare 's twins born ? 110 | LOC:other What is the highest dam in the U.S. ? 111 | ENTY:color What color is a poison arrow frog ? 112 | DESC:def What is acupuncture ? 113 | NUM:dist What is the length of the coastline of the state of Alaska ? 114 | HUM:ind What is the name of Neil Armstrong 's wife ? 115 | ENTY:plant What is Hawaii 's state flower ? 116 | HUM:ind Who won Ms. American in 1989 ? 117 | NUM:date When did the Hindenberg crash ? 118 | ENTY:substance What mineral helps prevent osteoporosis ? 119 | NUM:date What was the last year that the Chicago Cubs won the World Series ? 120 | LOC:other Where is Perth ? 121 | NUM:date What year did WWII begin ? 122 | NUM:dist What is the diameter of a golf ball ? 123 | DESC:def What is an eclipse ? 124 | HUM:ind Who discovered America ? 125 | NUM:dist What is the earth 's diameter ? 126 | HUM:ind Which president was unmarried ? 127 | NUM:dist How wide is the Milky Way galaxy ? 128 | NUM:date During which season do most thunderstorms occur ? 129 | DESC:def What is Wimbledon ? 130 | NUM:period What is the gestation period for a cat ? 131 | NUM:dist How far is a nautical mile ? 132 | HUM:ind Who was the abolitionist who led the raid on Harper 's Ferry in 1859 ? 
133 | DESC:def What does target heart rate mean ? 134 | ENTY:product What was the first satellite to go into space ? 135 | DESC:def What is foreclosure ? 136 | ENTY:other What is the major fault line near Kentucky ? 137 | LOC:other Where is the Holland Tunnel ? 138 | HUM:ind Who wrote the hymn `` Amazing Grace '' ? 139 | HUM:title What position did Willie Davis play in baseball ? 140 | DESC:def What are platelets ? 141 | DESC:def What is severance pay ? 142 | ENTY:animal What is the name of Roy Roger 's dog ? 143 | LOC:other Where are the National Archives ? 144 | ENTY:animal What is a baby turkey called ? 145 | DESC:def What is poliomyelitis ? 146 | ENTY:body What is the longest bone in the human body ? 147 | HUM:ind Who is a German philosopher ? 148 | ENTY:veh What were Christopher Columbus ' three ships ? 149 | DESC:def What does Phi Beta Kappa mean ? 150 | DESC:def What is nicotine ? 151 | ENTY:termeq What is another name for vitamin B1 ? 152 | HUM:ind Who discovered radium ? 153 | DESC:def What are sunspots ? 154 | NUM:date When was Algeria colonized ? 155 | HUM:gr What baseball team was the first to make numbers part of their uniform ? 156 | LOC:other What continent is Egypt on ? 157 | LOC:city What is the capital of Mongolia ? 158 | DESC:def What is nanotechnology ? 159 | LOC:other In the late 1700 's British convicts were used to populate which colony ? 160 | LOC:state What state is the geographic center of the lower 48 states ? 161 | DESC:def What is an obtuse angle ? 162 | DESC:def What are polymers ? 163 | NUM:date When is hurricane season in the Caribbean ? 164 | LOC:other Where is the volcano Mauna Loa ? 165 | ENTY:termeq What is another astronomic term for the Northern Lights ? 166 | LOC:other What peninsula is Spain part of ? 167 | NUM:date When was Lyndon B. Johnson born ? 168 | DESC:def What is acetaminophen ? 169 | LOC:state What state has the least amount of rain per year ? 170 | HUM:ind Who founded American Red Cross ? 
171 | NUM:date What year did the Milwaukee Braves become the Atlanta Braves ? 172 | NUM:speed How fast is alcohol absorbed ? 173 | NUM:date When is the summer solstice ? 174 | DESC:def What is supernova ? 175 | LOC:other Where is the Shawnee National Forest ? 176 | LOC:state What U.S. state 's motto is `` Live free or Die '' ? 177 | LOC:other Where is the Lourve ? 178 | NUM:date When was the first stamp issued ? 179 | ENTY:color What primary colors do you mix to make orange ? 180 | NUM:dist How far is Pluto from the sun ? 181 | LOC:other What body of water are the Canary Islands in ? 182 | DESC:def What is neuropathy ? 183 | LOC:other Where is the Euphrates River ? 184 | DESC:def What is cryptography ? 185 | ENTY:substance What is natural gas composed of ? 186 | HUM:ind Who is the Prime Minister of Canada ? 187 | HUM:ind What French ruler was defeated at the battle of Waterloo ? 188 | DESC:def What is leukemia ? 189 | LOC:other Where did Howard Hughes die ? 190 | ENTY:substance What is the birthstone for June ? 191 | ENTY:other What is the sales tax in Minnesota ? 192 | NUM:dist What is the distance in miles from the earth to the sun ? 193 | NUM:period What is the average life span for a chicken ? 194 | NUM:date When was the first Wal-Mart store opened ? 195 | DESC:def What is relative humidity ? 196 | LOC:city What city has the zip code of 35824 ? 197 | ENTY:currency What currency is used in Algeria ? 198 | HUM:ind Who invented the hula hoop ? 199 | ENTY:product What was the most popular toy in 1957 ? 200 | ENTY:substance What is pastrami made of ? 201 | ENTY:product What is the name of the satellite that the Soviet Union sent into space in 1957 ? 202 | LOC:city What city 's newspaper is called `` The Enquirer '' ? 203 | HUM:ind Who invented the slinky ? 204 | ENTY:animal What are the animals that don 't have backbones called ? 205 | NUM:other What is the melting point of copper ? 206 | LOC:other Where is the volcano Olympus Mons located ? 
207 | HUM:ind Who was the 23rd president of the United States ? 208 | NUM:temp What is the average body temperature ? 209 | DESC:desc What does a defibrillator do ? 210 | DESC:desc What is the effect of acid rain ? 211 | NUM:date What year did the United States abolish the draft ? 212 | NUM:speed How fast is the speed of light ? 213 | LOC:state What province is Montreal in ? 214 | LOC:other What New York City structure is also known as the Twin Towers ? 215 | DESC:def What is fungus ? 216 | ENTY:lang What is the most frequently spoken language in the Netherlands ? 217 | DESC:def What is sodium chloride ? 218 | ENTY:termeq What are the spots on dominoes called ? 219 | NUM:count How many pounds in a ton ? 220 | DESC:def What is influenza ? 221 | DESC:def What is ozone depletion ? 222 | NUM:date What year was the Mona Lisa painted ? 223 | DESC:def What does `` Sitting Shiva '' mean ? 224 | ENTY:other What is the electrical output in Madrid , Spain ? 225 | LOC:mount Which mountain range in North America stretches from Maine to Georgia ? 226 | ENTY:substance What is plastic made of ? 227 | NUM:other What is the population of Nigeria ? 228 | DESC:desc What does your spleen do ? 229 | LOC:other Where is the Grand Canyon ? 230 | HUM:ind Who invented the telephone ? 231 | NUM:date What year did the U.S. buy Alaska ? 232 | HUM:ind What is the name of the leader of Ireland ? 233 | DESC:def What is phenylalanine ? 234 | NUM:count How many gallons of water are there in a cubic foot ? 235 | ENTY:other What are the two houses of the Legislative branch ? 236 | DESC:def What is sonar ? 237 | LOC:other In Poland , where do most people live ? 238 | DESC:def What is phosphorus ? 239 | LOC:other What is the location of the Sea of Tranquility ? 240 | NUM:speed How fast is sound ? 241 | LOC:state What French province is cognac produced in ? 242 | DESC:def What is Valentine 's Day ? 243 | DESC:reason What causes gray hair ? 244 | DESC:def What is hypertension ? 
245 | DESC:def What is bandwidth ? 246 | LOC:other What is the longest suspension bridge in the U.S. ? 247 | DESC:def What is a parasite ? 248 | DESC:def What is home equity ? 249 | DESC:desc What do meteorologists do ? 250 | ENTY:other What is the criterion for being legally blind ? 251 | HUM:ind Who is the tallest man in the world ? 252 | LOC:city What are the twin cities ? 253 | ENTY:other What did Edward Binney and Howard Smith invent in 1903 ? 254 | ENTY:substance What is the statue of liberty made of ? 255 | DESC:def What is pilates ? 256 | LOC:other What planet is known as the `` red '' planet ? 257 | NUM:dist What is the depth of the Nile river ? 258 | ENTY:termeq What is the colorful Korean traditional dress called ? 259 | DESC:def What is Mardi Gras ? 260 | NUM:money Mexican pesos are worth what in U.S. dollars ? 261 | HUM:ind Who was the first African American to play for the Brooklyn Dodgers ? 262 | HUM:ind Who was the first Prime Minister of Canada ? 263 | NUM:count How many Admirals are there in the U.S. Navy ? 264 | ENTY:instru What instrument did Glenn Miller play ? 265 | NUM:period How old was Joan of Arc when she died ? 266 | DESC:def What does the word fortnight mean ? 267 | DESC:def What is dianetics ? 268 | LOC:city What is the capital of Ethiopia ? 269 | NUM:period For how long is an elephant pregnant ? 270 | DESC:manner How did Janice Joplin die ? 271 | ENTY:lang What is the primary language in Iceland ? 272 | DESC:desc What is the difference between AM radio stations and FM radio stations ? 273 | DESC:def What is osteoporosis ? 274 | HUM:ind Who was the first woman governor in the U.S. ? 275 | DESC:def What is peyote ? 276 | DESC:reason What is the esophagus used for ? 277 | DESC:def What is viscosity ? 278 | NUM:date What year did Oklahoma become a state ? 279 | ABBR:abb What is the abbreviation for Texas ? 280 | ENTY:substance What is a mirror made out of ? 281 | LOC:other Where on the body is a mortarboard worn ? 
282 | HUM:ind What was J.F.K. 's wife 's name ? 283 | ABBR:exp What does I.V. stand for ? 284 | DESC:def What is the chunnel ? 285 | LOC:other Where is Hitler buried ? 286 | DESC:def What are antacids ? 287 | DESC:def What is pulmonary fibrosis ? 288 | DESC:def What are Quaaludes ? 289 | DESC:def What is naproxen ? 290 | DESC:def What is strep throat ? 291 | LOC:city What is the largest city in the U.S. ? 292 | ENTY:dismed What is foot and mouth disease ? 293 | NUM:other What is the life expectancy of a dollar bill ? 294 | ENTY:termeq What do you call a professional map drawer ? 295 | DESC:def What are Aborigines ? 296 | DESC:def What is hybridization ? 297 | ENTY:color What color is indigo ? 298 | NUM:period How old do you have to be in order to rent a car in Italy ? 299 | ENTY:other What does a barometer measure ? 300 | ENTY:color What color is a giraffe 's tongue ? 301 | ABBR:exp What does USPS stand for ? 302 | NUM:date What year did the NFL go on strike ? 303 | DESC:def What is solar wind ? 304 | NUM:date What date did Neil Armstrong land on the moon ? 305 | NUM:date When was Hiroshima bombed ? 306 | LOC:other Where is the Savannah River ? 307 | HUM:ind Who was the first woman killed in the Vietnam War ? 308 | LOC:other What planet has the strongest magnetic field of all the planets ? 309 | HUM:ind Who is the governor of Alaska ? 310 | NUM:date What year did Mussolini seize power in Italy ? 311 | LOC:city What is the capital of Persia ? 312 | LOC:other Where is the Eiffel Tower ? 313 | NUM:count How many hearts does an octopus have ? 314 | DESC:def What is pneumonia ? 315 | LOC:other What is the deepest lake in the US ? 316 | DESC:def What is a fuel cell ? 317 | HUM:ind Who was the first U.S. president to appear on TV ? 318 | LOC:other Where is the Little League Museum ? 319 | ENTY:other What are the two types of twins ? 320 | LOC:other What is the brightest star ? 321 | DESC:def What is diabetes ? 322 | NUM:date When was President Kennedy shot ? 
323 | ABBR:exp What is TMJ ? 324 | ENTY:color What color is yak milk ? 325 | NUM:date What date was Dwight D. Eisenhower born ? 326 | ABBR:exp What does the technical term ISDN mean ? 327 | DESC:reason Why is the sun yellow ? 328 | NUM:money What is the conversion rate between dollars and pounds ? 329 | NUM:date When was Abraham Lincoln born ? 330 | DESC:def What is the Milky Way ? 331 | DESC:def What is mold ? 332 | NUM:date What year was Mozart born ? 333 | ENTY:animal What is a group of frogs called ? 334 | ENTY:veh What is the name of William Penn 's ship ? 335 | NUM:other What is the melting point of gold ? 336 | LOC:other What is the street address of the White House ? 337 | DESC:def What is semolina ? 338 | ENTY:food What fruit is Melba sauce made from ? 339 | DESC:def What is Ursa Major ? 340 | NUM:perc What is the percentage of water content in the human body ? 341 | NUM:weight How much does water weigh ? 342 | ENTY:event What was President Lyndon Johnson 's reform program called ? 343 | NUM:perc What is the murder rate in Windsor , Ontario ? 344 | HUM:ind Who is the only president to serve 2 non-consecutive terms ? 345 | NUM:other What is the population of Australia ? 346 | HUM:ind Who painted the ceiling of the Sistine Chapel ? 347 | ENTY:dismed Name a stimulant . 348 | DESC:desc What is the effect of volcanoes on the climate ? 349 | NUM:date What year did the Andy Griffith show begin ? 350 | DESC:def What is acid rain ? 351 | NUM:date What is the date of Mexico 's independence ? 352 | LOC:other What is the location of Lake Champlain ? 353 | ENTY:plant What is the Illinois state flower ? 354 | ENTY:animal What is Maryland 's state bird ? 355 | DESC:def What is quicksilver ? 356 | HUM:ind Who wrote `` The Divine Comedy '' ? 357 | NUM:speed What is the speed of light ? 358 | NUM:dist What is the width of a football field ? 359 | DESC:reason Why in tennis are zero points called love ? 360 | ENTY:animal What kind of dog was Toto in the Wizard of Oz ? 
361 | DESC:def What is a thyroid ? 362 | DESC:def What does ciao mean ? 363 | ENTY:body What is the only artery that carries blue blood from the heart to the lungs ? 364 | NUM:other How often does Old Faithful erupt at Yellowstone National Park ? 365 | DESC:def What is acetic acid ? 366 | NUM:dist What is the elevation of St. Louis , MO ? 367 | ENTY:color What color does litmus paper turn when it comes into contact with a strong acid ? 368 | ENTY:color What are the colors of the German flag ? 369 | DESC:def What is the Moulin Rouge ? 370 | LOC:other What soviet seaport is on the Black Sea ? 371 | NUM:weight What is the atomic weight of silver ? 372 | ENTY:currency What currency do they use in Brazil ? 373 | DESC:def What are pathogens ? 374 | DESC:def What is mad cow disease ? 375 | ENTY:food Name a food high in zinc . 376 | NUM:date When did North Carolina enter the union ? 377 | LOC:other Where do apple snails live ? 378 | DESC:def What are ethics ? 379 | ABBR:exp What does CPR stand for ? 380 | DESC:def What is an annuity ? 381 | HUM:ind Who killed John F. Kennedy ? 382 | HUM:ind Who was the first vice president of the U.S. ? 383 | ENTY:substance What birthstone is turquoise ? 384 | HUM:ind Who was the first US President to ride in an automobile to his inauguration ? 385 | NUM:period How old was the youngest president of the United States ? 386 | NUM:date When was Ulysses S. Grant born ? 387 | DESC:def What is Muscular Dystrophy ? 388 | HUM:ind Who lived in the Neuschwanstein castle ? 389 | DESC:def What is propylene glycol ? 390 | DESC:def What is a panic disorder ? 391 | HUM:ind Who invented the instant Polaroid camera ? 392 | DESC:def What is a carcinogen ? 393 | ENTY:animal What is a baby lion called ? 394 | NUM:other What is the world 's population ? 395 | DESC:def What is nepotism ? 396 | DESC:def What is die-casting ? 397 | DESC:def What is myopia ? 398 | NUM:other What is the sales tax rate in New York ? 
399 | NUM:perc Developing nations comprise what percentage of the world 's population ? 400 | LOC:mount What is the fourth highest mountain in the world ? 401 | HUM:ind What is Shakespeare 's nickname ? 402 | ENTY:substance What is the heaviest naturally occurring element ? 403 | NUM:date When is Father 's Day ? 404 | ABBR:exp What does the acronym NASA stand for ? 405 | NUM:dist How long is the Columbia River in miles ? 406 | LOC:city What city 's newspaper is called `` The Star '' ? 407 | DESC:def What is carbon dioxide ? 408 | LOC:other Where is the Mason/Dixon line ? 409 | NUM:date When was the Boston tea party ? 410 | DESC:def What is metabolism ? 411 | HUM:ind Which U.S.A. president appeared on `` Laugh-In '' ? 412 | ENTY:substance What are cigarettes made of ? 413 | LOC:city What is the capital of Zimbabwe ? 414 | ABBR:exp What does NASA stand for ? 415 | ENTY:plant What is the state flower of Michigan ? 416 | DESC:def What are semiconductors ? 417 | DESC:def What is nuclear power ? 418 | DESC:def What is a tsunami ? 419 | HUM:ind Who is the congressman from state of Texas on the armed forces committee ? 420 | HUM:ind Who was president in 1913 ? 421 | NUM:date When was the first kidney transplant ? 422 | LOC:other What are Canada 's two territories ? 423 | ENTY:veh What was the name of the plane Lindbergh flew solo across the Atlantic ? 424 | DESC:def What is genocide ? 425 | LOC:other What continent is Argentina on ? 426 | ENTY:other What monastery was raided by Vikings in the late eighth century ? 427 | DESC:def What is an earthquake ? 428 | LOC:other Where is the tallest roller coaster located ? 429 | DESC:def What are enzymes ? 430 | HUM:ind Who discovered oxygen ? 431 | DESC:def What is bangers and mash ? 432 | ENTY:animal What is the name given to the Tiger at Louisiana State University ? 433 | LOC:other Where are the British crown jewels kept ? 434 | HUM:ind Who was the first person to reach the North Pole ? 435 | DESC:def What is an ulcer ? 
436 | DESC:def What is vertigo ? 437 | DESC:def What is the spirometer test ? 438 | NUM:date When is the official first day of summer ? 439 | ABBR:exp What does the abbreviation SOS mean ? 440 | ENTY:animal What is the smallest bird in Britain ? 441 | HUM:ind Who invented Trivial Pursuit ? 442 | ENTY:substance What gasses are in the troposphere ? 443 | LOC:country Which country has the most water pollution ? 444 | ENTY:animal What is the scientific name for elephant ? 445 | HUM:ind Who is the actress known for her role in the movie `` Gypsy '' ? 446 | ENTY:animal What breed of hunting dog did the Beverly Hillbillies own ? 447 | LOC:other What is the rainiest place on Earth ? 448 | HUM:ind Who was the first African American to win the Nobel Prize in literature ? 449 | NUM:date When is St. Patrick 's Day ? 450 | ENTY:animal What was FDR 's dog 's name ? 451 | ENTY:color What colors need to be mixed to get the color pink ? 452 | ENTY:sport What is the most popular sport in Japan ? 453 | ENTY:food What is the active ingredient in baking soda ? 454 | NUM:date When was Thomas Jefferson born ? 455 | NUM:temp How cold should a refrigerator be ? 456 | NUM:date When was the telephone invented ? 457 | ENTY:color What is the most common eye color ? 458 | LOC:other Where was the first golf course in the United States ? 459 | DESC:def What is schizophrenia ? 460 | DESC:def What is angiotensin ? 461 | HUM:gr What did Jesse Jackson organize ? 462 | ENTY:animal What is New York 's state bird ? 463 | LOC:other What is the National Park in Utah ? 464 | NUM:date What is Susan B. Anthony 's birthday ? 465 | LOC:state In which state would you find the Catskill Mountains ? 466 | ENTY:termeq What do you call a word that is spelled the same backwards and forwards ? 467 | DESC:def What are pediatricians ? 468 | HUM:gr What chain store is headquartered in Bentonville , Arkansas ? 469 | DESC:def What are solar cells ? 470 | DESC:def What is compounded interest ? 
471 | DESC:def What are capers ? 472 | DESC:def What is an antigen ? 473 | ENTY:currency What currency does Luxembourg use ? 474 | NUM:other What is the population of Venezuela ? 475 | ENTY:other What type of polymer is used for bulletproof vests ? 476 | ENTY:currency What currency does Argentina use ? 477 | DESC:def What is a thermometer ? 478 | LOC:city What Canadian city has the largest population ? 479 | ENTY:color What color are crickets ? 480 | LOC:country Which country gave New York the Statue of Liberty ? 481 | ENTY:product What was the name of the first U.S. satellite sent into space ? 482 | ENTY:substance What precious stone is a form of pure carbon ? 483 | ENTY:substance What kind of gas is in a fluorescent bulb ? 484 | DESC:def What is rheumatoid arthritis ? 485 | LOC:other What river runs through Rowe , Italy ? 486 | DESC:def What is cerebral palsy ? 487 | LOC:city What city is also known as `` The Gateway to the West '' ? 488 | NUM:dist How far away is the moon ? 489 | ENTY:other What is the source of natural gas ? 490 | ENTY:veh In what spacecraft did U.S. astronaut Alan Shepard make his historic 1961 flight ? 491 | DESC:def What is pectin ? 492 | DESC:def What is bio-diversity ? 493 | ENTY:techmeth What 's the easiest way to remove wallpaper ? 494 | NUM:date What year did the Titanic start on its journey ? 495 | NUM:count How much of an apple is water ? 496 | HUM:ind Who was the 22nd President of the US ? 497 | ENTY:currency What is the money they use in Zambia ? 498 | NUM:count How many feet in a mile ? 499 | ENTY:substance What is the birthstone of October ? 500 | DESC:def What is e-coli ? --------------------------------------------------------------------------------