├── Chapter1.ipynb
├── Chapter10.ipynb
├── Chapter11.ipynb
├── Chapter2.ipynb
├── Chapter3.ipynb
├── Chapter4.ipynb
├── Chapter9.ipynb
├── Chapters5-6.ipynb
├── Chapters7-8.ipynb
├── On-Friday-board-members-meet-with-senior-managers-to-discuss-future-development-of-the-company.svg
├── README.md
├── chapter1_1.py
├── chapter1_2.py
├── chapter1_3.py
├── chapter2_word_split.py
├── cisi.zip
├── enron1.zip
├── enron2.zip
├── review_polarity.zip
├── rt-polaritydata.zip
├── sentiment_words.zip
└── time.zip


/Chapter1.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Chapter 1: Introduction\n",
  8 |     "\n",
  9 |     "Let's build a vector for input text, e.g., from `doc1`:"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 14,
 15 |    "metadata": {},
 16 |    "outputs": [
 17 |     {
 18 |      "name": "stdout",
 19 |      "output_type": "stream",
 20 |      "text": [
 21 |       "[3, 5]\n"
 22 |      ]
 23 |     }
 24 |    ],
 25 |    "source": [
 26 |     "doc1 = \"meeting ... management ... meeting ... management ... meeting \"\n",
 27 |     "doc1 += \"... management ... meeting ... meeting\"\n",
 28 |     "\n",
 29 |     "vector = [0, 0]\n",
 30 |     "\n",
 31 |     "for word in doc1.split(\" \"):\n",
 32 |     "    if word==\"management\":\n",
 33 |     "        vector[0] = vector[0] + 1\n",
 34 |     "    if word==\"meeting\":\n",
 35 |     "        vector[1] = vector[1] + 1\n",
 36 |     "        \n",
 37 |     "print (vector)"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "markdown",
 42 |    "metadata": {},
 43 |    "source": [
 44 |     "Here is how you can calculate *Euclidean distance* between a document and a query:"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "code",
 49 |    "execution_count": 15,
 50 |    "metadata": {},
 51 |    "outputs": [
 52 |     {
 53 |      "name": "stdout",
 54 |      "output_type": "stream",
 55 |      "text": [
 56 |       "4.47213595499958\n"
 57 |      ]
 58 |     }
 59 |    ],
 60 |    "source": [
 61 |     "import math\n",
 62 |     "\n",
 63 |     "query = [1, 1]\n",
 64 |     "doc1 = [3, 5]\n",
 65 |     "sq_length = 0\n",
 66 |     "\n",
 67 |     "for index in range(0, len(query)):\n",
 68 |     "    sq_length += math.pow((doc1[index] - query[index]), 2)\n",
 69 |     "            \n",
 70 |     "print (math.sqrt(sq_length))"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "markdown",
 75 |    "metadata": {},
 76 |    "source": [
 77 |     "Finally, let's estimate *cosine similarity*:"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": 16,
 83 |    "metadata": {},
 84 |    "outputs": [
 85 |     {
 86 |      "name": "stdout",
 87 |      "output_type": "stream",
 88 |      "text": [
 89 |       "0.9701425001453319\n"
 90 |      ]
 91 |     }
 92 |    ],
 93 |    "source": [
 94 |     "import math\n",
 95 |     "\n",
 96 |     "query = [1, 1]\n",
 97 |     "doc1 = [3, 5]\n",
 98 |     "\n",
 99 |     "def length(vector):\n",
100 |     "    sq_length = 0\n",
101 |     "    for index in range(0, len(vector)):\n",
102 |     "        sq_length += math.pow(vector[index], 2)\n",
103 |     "    return math.sqrt(sq_length)\n",
104 |     "    \n",
105 |     "def dot_product(vector1, vector2):\n",
106 |     "    if len(vector1)==len(vector2):\n",
107 |     "        dot_prod = 0\n",
108 |     "        for index in range(0, len(vector1)):\n",
109 |     "            dot_prod += vector1[index]*vector2[index]\n",
110 |     "        return dot_prod\n",
111 |     "    else:\n",
112 |     "        return \"Unmatching dimensionality\"\n",
113 |     "\n",
114 |     "cosine=dot_product(query, doc1)/(length(query)*length(doc1))\n",
115 |     "print (cosine)"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": null,
121 |    "metadata": {},
122 |    "outputs": [],
123 |    "source": []
124 |   }
125 |  ],
126 |  "metadata": {
127 |   "kernelspec": {
128 |    "display_name": "Python 3",
129 |    "language": "python",
130 |    "name": "python3"
131 |   },
132 |   "language_info": {
133 |    "codemirror_mode": {
134 |     "name": "ipython",
135 |     "version": 3
136 |    },
137 |    "file_extension": ".py",
138 |    "mimetype": "text/x-python",
139 |    "name": "python",
140 |    "nbconvert_exporter": "python",
141 |    "pygments_lexer": "ipython3",
142 |    "version": "3.7.6"
143 |   }
144 |  },
145 |  "nbformat": 4,
146 |  "nbformat_minor": 4
147 | }
148 | 


--------------------------------------------------------------------------------
/Chapter2.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {},
   6 |    "source": [
   7 |     "# Chapter 2: Your first practical NLP application, spam filtering"
   8 |    ]
   9 |   },
  10 |   {
  11 |    "cell_type": "markdown",
  12 |    "metadata": {},
  13 |    "source": [
  14 |     "Read in spam and ham file lists:"
  15 |    ]
  16 |   },
  17 |   {
  18 |    "cell_type": "code",
  19 |    "execution_count": 1,
  20 |    "metadata": {},
  21 |    "outputs": [],
  22 |    "source": [
  23 |     "import os\n",
  24 |     "import codecs\n",
  25 |     "\n",
  26 |     "def read_in(folder):\n",
  27 |     "    files = os.listdir(folder)\n",
  28 |     "    a_list = []\n",
  29 |     "    for a_file in files:\n",
  30 |     "        if not a_file.startswith(\".\"):\n",
  31 |     "            f = codecs.open(folder + a_file, \"r\", encoding = \"ISO-8859-1\", errors=\"ignore\")\n",
  32 |     "            a_list.append(f.read())\n",
  33 |     "            f.close()\n",
  34 |     "    return a_list"
  35 |    ]
  36 |   },
  37 |   {
  38 |    "cell_type": "markdown",
  39 |    "metadata": {},
  40 |    "source": [
  41 |     "Initialise lists and print out length – this should return 1500 for `enron1/spam` and 3672 for `enron1/ham`:"
  42 |    ]
  43 |   },
  44 |   {
  45 |    "cell_type": "code",
  46 |    "execution_count": 2,
  47 |    "metadata": {},
  48 |    "outputs": [
  49 |     {
  50 |      "name": "stdout",
  51 |      "output_type": "stream",
  52 |      "text": [
  53 |       "1500\n",
  54 |       "Subject: what up,, your cam babe\r\n",
  55 |       "What are you looking for?\r\n",
  56 |       "If your looking for a companion for friendship, love, a date, or just good ole'\r\n",
  57 |       "Fashioned * * * * * *, then try our brand new site; it was developed and created\r\n",
  58 |       "To help anyone find what they' re looking for. A quick bio form and you' re\r\n",
  59 |       "On the road to satisfaction in every sense of the word.... No matter what\r\n",
  60 |       "That may be!\r\n",
  61 |       "Try it out and youll be amazed.\r\n",
  62 |       "Have a terrific time this evening\r\n",
  63 |       "Copy and pa ste the add. Ress you see on the line below into your browser to come to the site.\r\n",
  64 |       "Http:// www. Meganbang. Biz/bld/acc /\r\n",
  65 |       "No more plz\r\n",
  66 |       "Http:// www. Naturalgolden. Com/retract /\r\n",
  67 |       "Counterattack aitken step preemptive shoehorn scaup. Electrocardiograph movie honeycomb. Monster war brandywine pietism byrne catatonia. Encomia lookup intervenor skeleton turn catfish.\r\n",
  68 |       "\n",
  69 |       "3672\n",
  70 |       "Subject: ena sales on hpl\r\n",
  71 |       "Just to update you on this project' s status:\r\n",
  72 |       "Based on a new report that scott mills ran for me from sitara, I have come up\r\n",
  73 |       "With the following counterparties as the ones to which ena is selling gas off\r\n",
  74 |       "Of hpl' s pipe.\r\n",
  75 |       "Altrade transaction, l. L. C. Gulf gas utilities company\r\n",
  76 |       "Brazoria, city of panther pipeline, inc.\r\n",
  77 |       "Central illinois light company praxair, inc.\r\n",
  78 |       "Central power and light company reliant energy - entex\r\n",
  79 |       "Ces - equistar chemicals, lp reliant energy - hl & p\r\n",
  80 |       "Corpus christI gas marketing, lp southern union company\r\n",
  81 |       "D & h gas company, inc. Texas utilities fuel company\r\n",
  82 |       "Duke energy field services, inc. Txu gas distribution\r\n",
  83 |       "Entex gas marketing company union carbide corporation\r\n",
  84 |       "Equistar chemicals, lp unit gas transmission company inc.\r\n",
  85 |       "Since i' m not sure exactly what gets entered into sitara, pat clynes\r\n",
  86 |       "Suggested that I check with daren farmer to make sure that i' m not missing\r\n",
  87 |       "Something (which I did below). While I am waiting for a response from him\r\n",
  88 |       "And/or mary smith, I will begin gathering the contractual volumes under the\r\n",
  89 |       "Above contracts.\r\n",
  90 |       "- - - - - - - - - - - - - - - - - - - - - - forwarded by cheryl dudley/hou/ect on 05/10/2000 07: 56\r\n",
  91 |       "Am - - - - - - - - - - - - - - - - - - - - - - - - - - -\r\n",
  92 |       "Cheryl d king\r\n",
  93 |       "05/08/2000 04: 11 pm\r\n",
  94 |       "Sent by: cheryl dudley\r\n",
  95 |       "To: daren j farmer/hou/ect@ ect, mary m smith/hou/ect@ ect\r\n",
  96 |       "Cc:\r\n",
  97 |       "Subject: ena sales on hpl\r\n",
  98 |       "I am working on a project for brenda herod & was wondering if one of you\r\n",
  99 |       "Could tell me if i' m on the right track & if this will get everything for\r\n",
 100 |       "Which she is looking.\r\n",
 101 |       "She is trying to draft a long - term transport/storage agreement between ena &\r\n",
 102 |       "Hplc which will allow ena to move the gas to their markets. In order to\r\n",
 103 |       "Accomplish this, she needs to know all of the sales to customers that ena is\r\n",
 104 |       "Doing off of hpl' s pipe.\r\n",
 105 |       "I had scott mills run a report from sitara showing all ena buy/sell activity\r\n",
 106 |       "On hpl since 7/99. If I eliminate the buys & the desk - to - desk deals, will\r\n",
 107 |       "This give me everything that I need?\r\n",
 108 |       "Are there buy/sell deals done with ena on hpl' s pipe that wouldn' t show up in\r\n",
 109 |       "Sitara? Someone mentioned something about deals where hpl transports the gas\r\n",
 110 |       "On it' s own behalf then ena sells it to a customer at that same spot - -\r\n",
 111 |       "????? Do deals like that happen? Would they show up in sitara?\r\n",
 112 |       "Is there anything else that i' m missing? I' m not real familiar with how some\r\n",
 113 |       "Of these deals happen nowadays so am very receptive to any\r\n",
 114 |       "Ideas/suggestions/help that you can offer!!!\r\n",
 115 |       "Thanks in advance.\n"
 116 |      ]
 117 |     }
 118 |    ],
 119 |    "source": [
 120 |     "spam_list = read_in(\"enron1/spam/\")\n",
 121 |     "print(len(spam_list))\n",
 122 |     "print(spam_list[0])\n",
 123 |     "ham_list = read_in(\"enron1/ham/\")\n",
 124 |     "print(len(ham_list))\n",
 125 |     "print(ham_list[0])"
 126 |    ]
 127 |   },
 128 |   {
 129 |    "cell_type": "markdown",
 130 |    "metadata": {},
 131 |    "source": [
 132 |     "Combine all emails together, keeping the label, and shuffle them: "
 133 |    ]
 134 |   },
 135 |   {
 136 |    "cell_type": "code",
 137 |    "execution_count": 3,
 138 |    "metadata": {},
 139 |    "outputs": [
 140 |     {
 141 |      "name": "stdout",
 142 |      "output_type": "stream",
 143 |      "text": [
 144 |       "Dataset size = 5172 emails\n"
 145 |      ]
 146 |     }
 147 |    ],
 148 |    "source": [
 149 |     "import random\n",
 150 |     "\n",
 151 |     "all_emails = [(email_content, \"spam\") for email_content in spam_list]\n",
 152 |     "all_emails += [(email_content, \"ham\") for email_content in ham_list]\n",
 153 |     "random.seed(42)\n",
 154 |     "random.shuffle(all_emails)\n",
 155 |     "print (f\"Dataset size = {str(len(all_emails))} emails\")"
 156 |    ]
 157 |   },
 158 |   {
 159 |    "cell_type": "markdown",
 160 |    "metadata": {},
 161 |    "source": [
 162 |     "Preprocess the texts by tokenising them and removing the stopwords:\n"
 163 |    ]
 164 |   },
 165 |   {
 166 |    "cell_type": "code",
 167 |    "execution_count": 4,
 168 |    "metadata": {
 169 |     "scrolled": false
 170 |    },
 171 |    "outputs": [
 172 |     {
 173 |      "name": "stdout",
 174 |      "output_type": "stream",
 175 |      "text": [
 176 |       "{'participate': True, 'in': True, 'our': True, 'new': True, 'lottery': True, 'now': True, '!': True}\n",
 177 |       "5172\n",
 178 |       "38\n",
 179 |       "38\n"
 180 |      ]
 181 |     }
 182 |    ],
 183 |    "source": [
 184 |     "import nltk\n",
 185 |     "from nltk import word_tokenize\n",
 186 |     "\n",
 187 |     "def get_features(text): \n",
 188 |     "    features = {}\n",
 189 |     "    word_list = [word for word in word_tokenize(text.lower())]\n",
 190 |     "    for word in word_list:\n",
 191 |     "        features[word] = True\n",
 192 |     "    return features\n",
 193 |     "\n",
 194 |     "all_features = [(get_features(email), label) for (email, label) in all_emails]\n",
 195 |     "\n",
 196 |     "print(get_features(\"Participate In Our New Lottery NOW!\"))\n",
 197 |     "print(len(all_features))\n",
 198 |     "print(len(all_features[0][0]))\n",
 199 |     "print(len(all_features[99][0]))"
 200 |    ]
 201 |   },
 202 |   {
 203 |    "cell_type": "markdown",
 204 |    "metadata": {},
 205 |    "source": [
 206 |     "Apply Naive Bayes classifier:"
 207 |    ]
 208 |   },
 209 |   {
 210 |    "cell_type": "code",
 211 |    "execution_count": 5,
 212 |    "metadata": {},
 213 |    "outputs": [
 214 |     {
 215 |      "name": "stdout",
 216 |      "output_type": "stream",
 217 |      "text": [
 218 |       "Training set size = 4137 emails\n",
 219 |       "Test set size = 1035 emails\n"
 220 |      ]
 221 |     }
 222 |    ],
 223 |    "source": [
 224 |     "from nltk import NaiveBayesClassifier, classify\n",
 225 |     "\n",
 226 |     "def train(features, proportion):\n",
 227 |     "    train_size = int(len(features) * proportion)\n",
 228 |     "    # initialise the training and test sets\n",
 229 |     "    train_set, test_set = features[:train_size], features[train_size:]\n",
 230 |     "    print (f\"Training set size = {str(len(train_set))} emails\")\n",
 231 |     "    print (f\"Test set size = {str(len(test_set))} emails\")\n",
 232 |     "    # train the classifier\n",
 233 |     "    classifier = NaiveBayesClassifier.train(train_set)\n",
 234 |     "    return train_set, test_set, classifier\n",
 235 |     "\n",
 236 |     "train_set, test_set, classifier = train(all_features, 0.8)"
 237 |    ]
 238 |   },
 239 |   {
 240 |    "cell_type": "markdown",
 241 |    "metadata": {},
 242 |    "source": [
 243 |     "Evaluate the performance:"
 244 |    ]
 245 |   },
 246 |   {
 247 |    "cell_type": "code",
 248 |    "execution_count": 6,
 249 |    "metadata": {},
 250 |    "outputs": [
 251 |     {
 252 |      "name": "stdout",
 253 |      "output_type": "stream",
 254 |      "text": [
 255 |       "Accuracy on the training set = 0.9615663524292966\n",
 256 |       "Accuracy on the test set = 0.936231884057971\n",
 257 |       "Most Informative Features\n",
 258 |       "               forwarded = True              ham : spam   =    200.5 : 1.0\n",
 259 |       "                    2004 = True             spam : ham    =    148.6 : 1.0\n",
 260 |       "                     nom = True              ham : spam   =    125.8 : 1.0\n",
 261 |       "                    pain = True             spam : ham    =    103.6 : 1.0\n",
 262 |       "                    spam = True             spam : ham    =     92.4 : 1.0\n",
 263 |       "                  health = True             spam : ham    =     81.1 : 1.0\n",
 264 |       "                     sex = True             spam : ham    =     79.5 : 1.0\n",
 265 |       "                     ect = True              ham : spam   =     75.7 : 1.0\n",
 266 |       "              nomination = True              ham : spam   =     74.8 : 1.0\n",
 267 |       "                   super = True             spam : ham    =     74.7 : 1.0\n",
 268 |       "                featured = True             spam : ham    =     73.1 : 1.0\n",
 269 |       "                creative = True             spam : ham    =     71.5 : 1.0\n",
 270 |       "                  differ = True             spam : ham    =     71.5 : 1.0\n",
 271 |       "                      cc = True              ham : spam   =     64.9 : 1.0\n",
 272 |       "                     ibm = True             spam : ham    =     63.4 : 1.0\n",
 273 |       "                   adobe = True             spam : ham    =     61.8 : 1.0\n",
 274 |       "                  shares = True             spam : ham    =     61.8 : 1.0\n",
 275 |       "            solicitation = True             spam : ham    =     61.8 : 1.0\n",
 276 |       "               clearance = True             spam : ham    =     60.2 : 1.0\n",
 277 |       "             medications = True             spam : ham    =     60.2 : 1.0\n",
 278 |       "                congress = True             spam : ham    =     58.6 : 1.0\n",
 279 |       "                     pro = True             spam : ham    =     57.0 : 1.0\n",
 280 |       "                 dealers = True             spam : ham    =     55.4 : 1.0\n",
 281 |       "                    draw = True             spam : ham    =     53.8 : 1.0\n",
 282 |       "                    2005 = True             spam : ham    =     53.5 : 1.0\n",
 283 |       "                    2001 = True              ham : spam   =     53.4 : 1.0\n",
 284 |       "                   cheap = True             spam : ham    =     51.6 : 1.0\n",
 285 |       "                       u = True             spam : ham    =     49.6 : 1.0\n",
 286 |       "                 doctors = True             spam : ham    =     45.8 : 1.0\n",
 287 |       "               publisher = True             spam : ham    =     45.8 : 1.0\n",
 288 |       "                  sexual = True             spam : ham    =     45.8 : 1.0\n",
 289 |       "               thousands = True             spam : ham    =     44.2 : 1.0\n",
 290 |       "              affordable = True             spam : ham    =     42.6 : 1.0\n",
 291 |       "                licensed = True             spam : ham    =     42.6 : 1.0\n",
 292 |       "                mailings = True             spam : ham    =     42.6 : 1.0\n",
 293 |       "                reliable = True             spam : ham    =     42.6 : 1.0\n",
 294 |       "                   julie = True              ham : spam   =     42.2 : 1.0\n",
 295 |       "                 advises = True             spam : ham    =     41.0 : 1.0\n",
 296 |       "                      ex = True             spam : ham    =     41.0 : 1.0\n",
 297 |       "                 popular = True             spam : ham    =     41.0 : 1.0\n",
 298 |       "                    lisa = True              ham : spam   =     40.0 : 1.0\n",
 299 |       "                pipeline = True              ham : spam   =     39.4 : 1.0\n",
 300 |       "                 foresee = True             spam : ham    =     39.4 : 1.0\n",
 301 |       "                powerful = True             spam : ham    =     39.4 : 1.0\n",
 302 |       "                      wi = True             spam : ham    =     39.4 : 1.0\n",
 303 |       "                   risks = True             spam : ham    =     39.0 : 1.0\n",
 304 |       "                   steve = True              ham : spam   =     38.3 : 1.0\n",
 305 |       "                   susan = True              ham : spam   =     38.3 : 1.0\n",
 306 |       "                     ali = True             spam : ham    =     38.1 : 1.0\n",
 307 |       "                    anti = True             spam : ham    =     37.7 : 1.0\n"
 308 |      ]
 309 |     }
 310 |    ],
 311 |    "source": [
 312 |     "def evaluate(train_set, test_set, classifier):\n",
 313 |     "    # check how the classifier performs on the training and test sets\n",
 314 |     "    print (f\"Accuracy on the training set = {str(classify.accuracy(classifier, train_set))}\")\n",
 315 |     "    print (f\"Accuracy on the test set = {str(classify.accuracy(classifier, test_set))}\")    \n",
 316 |     "    # check which words are most informative for the classifier\n",
 317 |     "    classifier.show_most_informative_features(50)\n",
 318 |     "\n",
 319 |     "evaluate(train_set, test_set, classifier)"
 320 |    ]
 321 |   },
 322 |   {
 323 |    "cell_type": "markdown",
 324 |    "metadata": {},
 325 |    "source": [
 326 |     "Explore the contexts of use:"
 327 |    ]
 328 |   },
 329 |   {
 330 |    "cell_type": "code",
 331 |    "execution_count": 7,
 332 |    "metadata": {},
 333 |    "outputs": [
 334 |     {
 335 |      "name": "stdout",
 336 |      "output_type": "stream",
 337 |      "text": [
 338 |       "STOCKS in HAM:\n",
 339 |       "Displaying 1 of 1 matches:\n",
 340 |       "ur member directory . * follow your stocks and news headlines , exchange files\n",
 341 |       "Displaying 1 of 1 matches:\n",
 342 |       "ur member directory . * follow your stocks and news headlines , exchange files\n",
 343 |       "Displaying 1 of 1 matches:\n",
 344 |       "ur member directory . * follow your stocks and news headlines , exchange files\n",
 345 |       "Displaying 1 of 1 matches:\n",
 346 |       "ad my portfolio is diversified into stocks that have lost even more money than\n",
 347 |       "\n",
 348 |       "\n",
 349 |       "STOCKS in SPAM:\n",
 350 |       "Displaying 3 of 3 matches:\n",
 351 |       "report reveals this smallcap rocket stocks newsletter first we would like to s\n",
 352 |       "his email pertaining to investing , stocks , securities must be understood as \n",
 353 |       "ntative before deciding to trade in stocks featured within this email . none o\n",
 354 |       "Displaying 3 of 3 matches:\n",
 355 |       "might occur . as with many microcap stocks , today ' s company has additional \n",
 356 |       "is emai | pertaining to investing , stocks , securities must be understood as \n",
 357 |       "ntative before deciding to trade in stocks featured within this emai | . none \n",
 358 |       "Displaying 6 of 6 matches:\n",
 359 |       "hem : ( big money was made in these stocks by savvy investors who timed them r\n",
 360 |       "g filthy , stinking ri ' ch in tiny stocks no one has ever heard of until now \n",
 361 |       "ynamic things . some of these small stocks have absolutely exploded in price r\n",
 362 |       "'' occur . as with many micro - cap stocks , today ' s company has additional \n",
 363 |       " ema - il pertaining to investing , stocks or securities must be understood as\n",
 364 |       "ntative before deciding to trade in stocks featured within this ema - il . non\n",
 365 |       "Displaying 2 of 2 matches:\n",
 366 |       "his email pertaining to investing , stocks , securities must be understood as \n",
 367 |       "ntative before deciding to trade in stocks featured within this emai | . none \n",
 368 |       "Displaying 1 of 1 matches:\n",
 369 |       "in apple investments , inc profiled stocks . in order to be in full compliance\n",
 370 |       "Displaying 3 of 3 matches:\n",
 371 |       "his email pertaining to investing , stocks , securities must be understood as \n",
 372 |       "ntative before deciding to trade in stocks featured within this email . none o\n",
 373 |       " lose money from investing in penny stocks . if you wish to stop future mailin\n",
 374 |       "Displaying 3 of 3 matches:\n",
 375 |       " plays . widespread gains in energy stocks are inflating the portfolios of agg\n",
 376 |       "st levels of the year , with energy stocks outperforming all other market sect\n",
 377 |       "utions that sma | | and micro - cap stocks are high - risk investments and tha\n",
 378 |       "Displaying 4 of 4 matches:\n",
 379 |       "watch this one trade . these little stocks can surprise in a big way sometimes\n",
 380 |       "might occur . as with many microcap stocks , today ' s company has additional \n",
 381 |       "his email pertaining to investing , stocks , securities must be understood as \n",
 382 |       "ntative before deciding to trade in stocks featured within this email . none o\n",
 383 |       "Displaying 1 of 1 matches:\n",
 384 |       "_ _ _ _ _ _ _ _ _ _ _ _ _ _ penny - stocks are considered highly speculative a\n",
 385 |       "Displaying 3 of 3 matches:\n",
 386 |       "might occur . as with many microcap stocks , today ' s company has additiona |\n",
 387 |       "is emai | pertaining to investing , stocks , securities must be understood as \n",
 388 |       "ntative before deciding to trade in stocks featured within this email . none o\n",
 389 |       "Displaying 1 of 1 matches:\n",
 390 |       "ne trade thursday ! go fcdh . penny stocks are considered highiy specuiative a\n",
 391 |       "Displaying 2 of 2 matches:\n",
 392 |       "ims and do your own due diligence . stocks to play ( s 2 p ) profiles are not \n",
 393 |       "s obtained . investing in micro cap stocks is extremely risky and , investors \n",
 394 |       "Displaying 2 of 2 matches:\n",
 395 |       "his email pertaining to investing , stocks , securities must be understood as \n",
 396 |       "ntative before deciding to trade in stocks featured within this email . none o\n",
 397 |       "Displaying 2 of 2 matches:\n",
 398 |       "his email pertaining to investing , stocks , securities must be understood as \n",
 399 |       "ntative before deciding to trade in stocks featured within this email . none o\n",
 400 |       "Displaying 1 of 1 matches:\n",
 401 |       "scovering value in natural resource stocks elgin resources ( elr - tsx ) extra\n",
 402 |       "Displaying 2 of 2 matches:\n",
 403 |       "ck monday some of these little voip stocks have been really moving lately . an\n",
 404 |       " one trade monday ! go ypil . penny stocks are considered highiy specuiative a\n",
 405 |       "Displaying 1 of 1 matches:\n",
 406 |       " one trade monday ! go ndin . penny stocks are considered highly speculative a\n",
 407 |       "Displaying 1 of 1 matches:\n",
 408 |       " one trade monday ! go wysk . penny stocks are considered highiy specuiative a\n",
 409 |       "Displaying 1 of 1 matches:\n",
 410 |       "fessionally not multi - level - not stocks - not real estate no cost tele - se\n",
 411 |       "Displaying 2 of 2 matches:\n",
 412 |       " % on regular price we have massive stocks of drugs for same day dispatch fast\n",
 413 |       "e do have the lowest price and huge stocks ready for same - day dispatch . two\n",
 414 |       "Displaying 5 of 5 matches:\n",
 415 |       "ck monday some of these littie voip stocks have been really moving lately . an\n",
 416 |       "t can happen with these sma | | cap stocks when they take off . and it happens\n",
 417 |       " statements . as with many microcap stocks , today ' s company has additiona |\n",
 418 |       "is report pertaining to investing , stocks , securities must be understood as \n",
 419 |       "ntative before deciding to trade in stocks featured within this report . none \n",
 420 |       "Displaying 3 of 3 matches:\n",
 421 |       "might occur . as with many microcap stocks , today ' s company has additiona |\n",
 422 |       "is emai | pertaining to investing , stocks , securities must be understood as \n",
 423 |       "ntative before deciding to trade in stocks featured within this emai | . none \n",
 424 |       "Displaying 1 of 1 matches:\n",
 425 |       "subject : fwd : screw doctors . stocks available . vlagr @ . x _ a _ nax .\n",
 426 |       "Displaying 1 of 1 matches:\n",
 427 |       " one trade monday ! go wysk . penny stocks are considered highiy speculative a\n",
 428 |       "Displaying 2 of 2 matches:\n",
 429 |       "his email pertaining to investing , stocks , securities must be understood as \n",
 430 |       "ntative before deciding to trade in stocks featured within this email . none o\n",
 431 |       "Displaying 4 of 4 matches:\n",
 432 |       "his email pertaining to investing , stocks , securities must be understood as \n",
 433 |       "ntative before deciding to trade in stocks featured within this email . none o\n",
 434 |       "eep in mind that when trading small stocks like the company above there is a c\n",
 435 |       "t professional before investing any stocks or mutual funds .\n",
 436 |       "Displaying 1 of 1 matches:\n",
 437 |       " one trade monday ! go wysk . penny stocks are considered highiy specuiative a\n",
 438 |       "Displaying 1 of 1 matches:\n",
 439 |       "dge - ksige are you tired of buying stocks and not having them perform ? our s\n",
 440 |       "Displaying 4 of 4 matches:\n",
 441 |       "ck monday some of these little voip stocks have been rea | | y moving lately .\n",
 442 |       " statements . as with many microcap stocks , today ' s company has additiona |\n",
 443 |       "is report pertaining to investing , stocks , securities must be understood as \n",
 444 |       "ntative before deciding to trade in stocks featured within this report . none \n",
 445 |       "Displaying 4 of 4 matches:\n",
 446 |       "tion is key to stock success rocket stocks newsletter u r g e n t i n v e s t \n",
 447 |       "ht occur . as with many micro - cap stocks , today ' s company has additional \n",
 448 |       "his email pertaining to investing , stocks , securities must be understood as \n",
 449 |       "ntative before deciding to trade in stocks featured within this email . none o\n",
 450 |       "Displaying 4 of 4 matches:\n",
 451 |       " the last 12 months , many of these stocks made triple and even quadruple retu\n",
 452 |       " statements . as with many microcap stocks , today ' s company has additiona |\n",
 453 |       "is report pertaining to investing , stocks , securities must be understood as \n",
 454 |       "ntative before deciding to trade in stocks featured within this report . none \n",
 455 |       "Displaying 3 of 3 matches:\n",
 456 |       "ancements but may be one of the few stocks left in this industry group that is\n",
 457 |       "his email pertaining to investing , stocks , securities must be understood as \n",
 458 |       "ntative before deciding to trade in stocks featured within this email . none o\n",
 459 |       "Displaying 3 of 3 matches:\n",
 460 |       "might occur . as with many microcap stocks , today ' s company has additiona |\n",
 461 |       "is emai | pertaining to investing , stocks , securities must be understood as \n",
 462 |       "ntative before deciding to trade in stocks featured within this email . none o\n",
 463 |       "Displaying 2 of 2 matches:\n",
 464 |       " % on regular price we have massive stocks of drugs for same day dispatch fast\n",
 465 |       "e do have the lowest price and huge stocks ready for same - day dispatch . two\n",
 466 |       "Displaying 1 of 1 matches:\n",
 467 |       "or information puposes only . penny stocks are considered highly speculative a\n",
 468 |       "Displaying 4 of 4 matches:\n",
 469 |       "hree days . play of the week tracks stocks on downward trends , foresees botto\n",
 470 |       "mark is our uncanny ability to spot stocks that have bottomed - out and antici\n",
 471 |       "ound and upward trend . most of the stocks we track rebound and peak within ju\n",
 472 |       "om third party . investing in penny stocks is high risk and you should seek pr\n",
 473 |       "Displaying 4 of 4 matches:\n",
 474 |       "nt opportunity drummond , small cap stocks alert newsletter must read - alert \n",
 475 |       "his email pertaining to investing , stocks , securities must be understood as \n",
 476 |       "ntative before deciding to trade in stocks featured within this email . none o\n",
 477 |       " lose money from investing in penny stocks . - - - - - - - - - - - - - - - - -\n",
 478 |       "Displaying 4 of 4 matches:\n",
 479 |       "his email pertaining to investing , stocks , securities must be understood as \n",
 480 |       "ntative before deciding to trade in stocks featured within this email . none o\n",
 481 |       "eep in mind that when trading small stocks like the company above there is a c\n",
 482 |       "t professional before investing any stocks or mutual funds .\n",
 483 |       "Displaying 1 of 1 matches:\n",
 484 |       "the | ast 12 months , many of these stocks made triple and even quadruple retu\n",
 485 |       "Displaying 1 of 1 matches:\n",
 486 |       "cautions that small and micro - cap stocks are high - risk investments and tha\n",
 487 |       "Displaying 4 of 4 matches:\n",
 488 |       "k tuesday some of these littie voip stocks have been reaily moving lateiy . an\n",
 489 |       " statements . as with many microcap stocks , today ' s company has additional \n",
 490 |       "is report pertaining to investing , stocks , securities must be understood as \n",
 491 |       "ntative before deciding to trade in stocks featured within this report . none \n",
 492 |       "Displaying 2 of 2 matches:\n",
 493 |       "ck monday some of these little voip stocks have been realiy moving lately . an\n",
 494 |       " one trade monday ! go ypil . penny stocks are considered highiy specuiative a\n"
 495 |      ]
 496 |     },
 497 |     {
 498 |      "name": "stdout",
 499 |      "output_type": "stream",
 500 |      "text": [
 501 |       "Displaying 2 of 2 matches:\n",
 502 |       " % on regular price we have massive stocks of drugs for same day dispatch fast\n",
 503 |       "e do have the lowest price and huge stocks ready for same - day dispatch . two\n",
 504 |       "Displaying 2 of 2 matches:\n",
 505 |       "his email pertaining to investing , stocks , securities must be understood as \n",
 506 |       "ntative before deciding to trade in stocks featured within this email . none o\n",
 507 |       "Displaying 4 of 4 matches:\n",
 508 |       "n this stock . some of these smal | stocks are absoiuteiy fiying , as many of \n",
 509 |       " statements . as with many microcap stocks , todays company has additional ris\n",
 510 |       "biication pertaining to investing , stocks , securities must be understood as \n",
 511 |       "ntative before deciding to trade in stocks featured within this publication . \n",
 512 |       "Displaying 1 of 1 matches:\n",
 513 |       "s obtained . investing in micro cap stocks is extremely risky and , investors \n",
 514 |       "Displaying 2 of 2 matches:\n",
 515 |       "is emai | pertaining to investing , stocks , securities must be understood as \n",
 516 |       "ntative before deciding to trade in stocks featured within this email . none o\n",
 517 |       "Displaying 3 of 3 matches:\n",
 518 |       " statements . as with many microcap stocks , todays company has additional ris\n",
 519 |       "blication pertaining to investing , stocks , securities must be understood as \n",
 520 |       "ntative before deciding to trade in stocks featured within this publication . \n",
 521 |       "Displaying 1 of 1 matches:\n",
 522 |       " the last 12 months , many of these stocks made tripie and even quadruple retu\n",
 523 |       "Displaying 3 of 3 matches:\n",
 524 |       "torage inc. play of the week tracks stocks on downward trends , foresees botto\n",
 525 |       "his email pertaining to investing , stocks , securities must be understood as \n",
 526 |       "ntative before deciding to trade in stocks featured within this email . none o\n",
 527 |       "Displaying 1 of 1 matches:\n",
 528 |       "ecializing in undervalued small cap stocks for immediate breakout erhc and exx\n",
 529 |       "Displaying 3 of 3 matches:\n",
 530 |       "5 how many times have you seen good stocks but you couldn ' t get your hands o\n",
 531 |       "his email pertaining to investing , stocks , securities must be understood as \n",
 532 |       "ntative before deciding to trade in stocks featured within this email . none o\n",
 533 |       "Displaying 5 of 5 matches:\n",
 534 |       "5 where were you when the following stocks exploded : scos : exploded from . 3\n",
 535 |       "d . 80 on friday . face it . little stocks can mean big gains for you . this r\n",
 536 |       "might occur . as with many microcap stocks , today ' s company has additional \n",
 537 |       "his email pertaining to investing , stocks , securities must be understood as \n",
 538 |       "ntative before deciding to trade in stocks featured within this report . none \n",
 539 |       "Displaying 2 of 2 matches:\n",
 540 |       "subject : penny stocks are about timing nomad internationa\n",
 541 |       " one trade friday ! go ndin . penny stocks are considered highiy speculative a\n",
 542 |       "Displaying 2 of 2 matches:\n",
 543 |       "ng their gains . select gold mining stocks are the hot flyers of the otc . his\n",
 544 |       "is letter cautions that micro - cap stocks are high - risk investments and tha\n",
 545 |       "Displaying 1 of 1 matches:\n",
 546 |       "cautions that small and micro - cap stocks are high - risk investments and tha\n",
 547 |       "Displaying 5 of 5 matches:\n",
 548 |       "hursday ! some of these littie voip stocks have been realiy moving lateiy . an\n",
 549 |       "t can happen with these sma | | cap stocks when they take off . and it happens\n",
 550 |       " statements . as with many microcap stocks , today ' s company has additiona |\n",
 551 |       "is report pertaining to investing , stocks , securities must be understood as \n",
 552 |       "ntative before deciding to trade in stocks featured within this report . none \n",
 553 |       "Displaying 2 of 2 matches:\n",
 554 |       "rt identifying defense and security stocks ready to explode look at the moves \n",
 555 |       " actual exchanges where small - cap stocks are traded . silica stopband doorkn\n",
 556 |       "Displaying 6 of 6 matches:\n",
 557 |       " if you knew about these low priced stocks : otcbb : zapz : closed march 31 st\n",
 558 |       " following points : * many of these stocks are undiscovered and uncovered ! wh\n",
 559 |       " ! ! * * many of these undiscovered stocks are like coiled springs , wound tig\n",
 560 |       "might occur . as with many microcap stocks , today ' s company has additional \n",
 561 |       "his email pertaining to investing , stocks , securities must be understood as \n",
 562 |       "ntative before deciding to trade in stocks featured within this email . none o\n",
 563 |       "Displaying 2 of 2 matches:\n",
 564 |       " the last 12 months , many of these stocks made tripie and even quadruple retu\n",
 565 |       "one trade tuesday ! go mogi . penny stocks are considered highly speculative a\n",
 566 |       "Displaying 3 of 3 matches:\n",
 567 |       " statements . as with many microcap stocks , todays company has additional ris\n",
 568 |       "blication pertaining to investing , stocks , securities must be understood as \n",
 569 |       "ntative before deciding to trade in stocks featured within this publication . \n",
 570 |       "Displaying 3 of 3 matches:\n",
 571 |       " statements . as with many microcap stocks , today ' s company has additiona |\n",
 572 |       "is report pertaining to investing , stocks , securities must be understood as \n",
 573 |       "ntative before deciding to trade in stocks featured within this report . none \n",
 574 |       "Displaying 4 of 4 matches:\n",
 575 |       "y agree , some , not all , of these stocks move in price because they are prom\n",
 576 |       "tands or that as with many microcap stocks , today ' s company has additional \n",
 577 |       "is report pertaining to investing , stocks , securities must be understood as \n",
 578 |       "ntative before deciding to trade in stocks featured within this report . none \n",
 579 |       "Displaying 3 of 3 matches:\n",
 580 |       "might occur . as with many microcap stocks , today ' s company has additiona |\n",
 581 |       "his email pertaining to investing , stocks , securities must be understood as \n",
 582 |       "ntative before deciding to trade in stocks featured within this email . none o\n",
 583 |       "Displaying 3 of 3 matches:\n",
 584 |       "n how many times have you seen good stocks but you couldn ' t get your hands o\n",
 585 |       "his email pertaining to investing , stocks , securities must be understood as \n",
 586 |       "ntative before deciding to trade in stocks featured within this email . none o\n",
 587 |       "Displaying 1 of 1 matches:\n",
 588 |       " receive first notice on run - away stocks traders ' monthly alert january pic\n"
 589 |      ]
 590 |     }
 591 |    ],
 592 |    "source": [
 593 |     "from nltk.text import Text\n",
 594 |     "\n",
 595 |     "def concordance(data_list, search_word):\n",
 596 |     "    for email in data_list:\n",
 597 |     "        word_list = [word for word in word_tokenize(email.lower())]\n",
 598 |     "        text_list = Text(word_list)\n",
 599 |     "        if search_word in word_list:\n",
 600 |     "            text_list.concordance(search_word)\n",
 601 |     "\n",
 602 |     "\n",
 603 |     "print (\"STOCKS in HAM:\")\n",
 604 |     "concordance(ham_list, \"stocks\")\n",
 605 |     "print (\"\\n\\nSTOCKS in SPAM:\")\n",
 606 |     "concordance(spam_list, \"stocks\")"
 607 |    ]
 608 |   },
 609 |   {
 610 |    "cell_type": "markdown",
 611 |    "metadata": {},
 612 |    "source": [
 613 |     "Input some of your own messages:"
 614 |    ]
 615 |   },
 616 |   {
 617 |    "cell_type": "code",
 618 |    "execution_count": 8,
 619 |    "metadata": {},
 620 |    "outputs": [
 621 |     {
 622 |      "name": "stdout",
 623 |      "output_type": "stream",
 624 |      "text": [
 625 |       "Accuracy on the training set = 0.9615663524292966\n",
 626 |       "Accuracy on the test set = 1.0\n",
 627 |       "Most Informative Features\n",
 628 |       "               forwarded = True              ham : spam   =    200.5 : 1.0\n",
 629 |       "                    2004 = True             spam : ham    =    148.6 : 1.0\n",
 630 |       "                     nom = True              ham : spam   =    125.8 : 1.0\n",
 631 |       "                    pain = True             spam : ham    =    103.6 : 1.0\n",
 632 |       "                    spam = True             spam : ham    =     92.4 : 1.0\n",
 633 |       "                  health = True             spam : ham    =     81.1 : 1.0\n",
 634 |       "                     sex = True             spam : ham    =     79.5 : 1.0\n",
 635 |       "                     ect = True              ham : spam   =     75.7 : 1.0\n",
 636 |       "              nomination = True              ham : spam   =     74.8 : 1.0\n",
 637 |       "                   super = True             spam : ham    =     74.7 : 1.0\n",
 638 |       "                featured = True             spam : ham    =     73.1 : 1.0\n",
 639 |       "                creative = True             spam : ham    =     71.5 : 1.0\n",
 640 |       "                  differ = True             spam : ham    =     71.5 : 1.0\n",
 641 |       "                      cc = True              ham : spam   =     64.9 : 1.0\n",
 642 |       "                     ibm = True             spam : ham    =     63.4 : 1.0\n",
 643 |       "                   adobe = True             spam : ham    =     61.8 : 1.0\n",
 644 |       "                  shares = True             spam : ham    =     61.8 : 1.0\n",
 645 |       "            solicitation = True             spam : ham    =     61.8 : 1.0\n",
 646 |       "               clearance = True             spam : ham    =     60.2 : 1.0\n",
 647 |       "             medications = True             spam : ham    =     60.2 : 1.0\n",
 648 |       "                congress = True             spam : ham    =     58.6 : 1.0\n",
 649 |       "                     pro = True             spam : ham    =     57.0 : 1.0\n",
 650 |       "                 dealers = True             spam : ham    =     55.4 : 1.0\n",
 651 |       "                    draw = True             spam : ham    =     53.8 : 1.0\n",
 652 |       "                    2005 = True             spam : ham    =     53.5 : 1.0\n",
 653 |       "                    2001 = True              ham : spam   =     53.4 : 1.0\n",
 654 |       "                   cheap = True             spam : ham    =     51.6 : 1.0\n",
 655 |       "                       u = True             spam : ham    =     49.6 : 1.0\n",
 656 |       "                 doctors = True             spam : ham    =     45.8 : 1.0\n",
 657 |       "               publisher = True             spam : ham    =     45.8 : 1.0\n",
 658 |       "                  sexual = True             spam : ham    =     45.8 : 1.0\n",
 659 |       "               thousands = True             spam : ham    =     44.2 : 1.0\n",
 660 |       "              affordable = True             spam : ham    =     42.6 : 1.0\n",
 661 |       "                licensed = True             spam : ham    =     42.6 : 1.0\n",
 662 |       "                mailings = True             spam : ham    =     42.6 : 1.0\n",
 663 |       "                reliable = True             spam : ham    =     42.6 : 1.0\n",
 664 |       "                   julie = True              ham : spam   =     42.2 : 1.0\n",
 665 |       "                 advises = True             spam : ham    =     41.0 : 1.0\n",
 666 |       "                      ex = True             spam : ham    =     41.0 : 1.0\n",
 667 |       "                 popular = True             spam : ham    =     41.0 : 1.0\n",
 668 |       "                    lisa = True              ham : spam   =     40.0 : 1.0\n",
 669 |       "                pipeline = True              ham : spam   =     39.4 : 1.0\n",
 670 |       "                 foresee = True             spam : ham    =     39.4 : 1.0\n",
 671 |       "                powerful = True             spam : ham    =     39.4 : 1.0\n",
 672 |       "                      wi = True             spam : ham    =     39.4 : 1.0\n",
 673 |       "                   risks = True             spam : ham    =     39.0 : 1.0\n",
 674 |       "                   steve = True              ham : spam   =     38.3 : 1.0\n",
 675 |       "                   susan = True              ham : spam   =     38.3 : 1.0\n",
 676 |       "                     ali = True             spam : ham    =     38.1 : 1.0\n",
 677 |       "                    anti = True             spam : ham    =     37.7 : 1.0\n"
 678 |      ]
 679 |     }
 680 |    ],
 681 |    "source": [
 682 |     "test_spam_list = [\"Participate in our new lottery!\", \"Try out this new medicine\"]\n",
 683 |     "test_ham_list = [\"See the minutes from the last meeting attached\", \n",
 684 |     "                 \"Investors are coming to our office on Monday\"]\n",
 685 |     "\n",
 686 |     "test_emails = [(email_content, \"spam\") for email_content in test_spam_list]\n",
 687 |     "test_emails += [(email_content, \"ham\") for email_content in test_ham_list]\n",
 688 |     "\n",
 689 |     "new_test_set = [(get_features(email), label) for (email, label) in test_emails]\n",
 690 |     "\n",
 691 |     "evaluate(train_set, new_test_set, classifier)"
 692 |    ]
 693 |   },
 694 |   {
 695 |    "cell_type": "markdown",
 696 |    "metadata": {},
 697 |    "source": [
 698 |     "See how they get classified:"
 699 |    ]
 700 |   },
 701 |   {
 702 |    "cell_type": "code",
 703 |    "execution_count": 9,
 704 |    "metadata": {},
 705 |    "outputs": [
 706 |     {
 707 |      "name": "stdout",
 708 |      "output_type": "stream",
 709 |      "text": [
 710 |       "Participate in our new lottery!\n",
 711 |       "spam\n",
 712 |       "Try out this new medicine\n",
 713 |       "spam\n",
 714 |       "See the minutes from the last meeting attached\n",
 715 |       "ham\n",
 716 |       "Investors are coming to our office on Monday\n",
 717 |       "ham\n"
 718 |      ]
 719 |     }
 720 |    ],
 721 |    "source": [
 722 |     "for email in test_spam_list:\n",
 723 |     "    print (email)\n",
 724 |     "    print (classifier.classify(get_features(email)))\n",
 725 |     "for email in test_ham_list:\n",
 726 |     "    print (email)\n",
 727 |     "    print (classifier.classify(get_features(email)))"
 728 |    ]
 729 |   },
 730 |   {
 731 |    "cell_type": "markdown",
 732 |    "metadata": {},
 733 |    "source": [
 734 |     "Run in an interactive manner:"
 735 |    ]
 736 |   },
 737 |   {
 738 |    "cell_type": "code",
 739 |    "execution_count": 10,
 740 |    "metadata": {},
 741 |    "outputs": [
 742 |     {
 743 |      "name": "stdout",
 744 |      "output_type": "stream",
 745 |      "text": [
 746 |       "Type in your email here (or press 'Enter'): Buy new meds\n",
 747 |       "This email is likely spam\n",
 748 |       "\n",
 749 |       "Type in your email here (or press 'Enter'): Buy new meds here!\n",
 750 |       "This email is likely spam\n",
 751 |       "\n",
 752 |       "Type in your email here (or press 'Enter'): Get your stock options fast\n",
 753 |       "This email is likely spam\n",
 754 |       "\n",
 755 |       "Type in your email here (or press 'Enter'): Let's schedule a meeting for tomorrow\n",
 756 |       "This email is likely ham\n",
 757 |       "\n",
 758 |       "Type in your email here (or press 'Enter'): \n"
 759 |      ]
 760 |     }
 761 |    ],
 762 |    "source": [
 763 |     "while True:\n",
 764 |     "    email = input(\"Type in your email here (or press 'Enter'): \")\n",
 765 |     "    if len(email)==0:\n",
 766 |     "        break\n",
 767 |     "    else: \n",
 768 |     "        prediction = classifier.classify(get_features(email))\n",
 769 |     "        print (f\"This email is likely {prediction}\\n\")"
 770 |    ]
 771 |   },
 772 |   {
 773 |    "cell_type": "markdown",
 774 |    "metadata": {},
 775 |    "source": [
 776 |     "Run on a different dataset:"
 777 |    ]
 778 |   },
 779 |   {
 780 |    "cell_type": "markdown",
 781 |    "metadata": {},
 782 |    "source": [
 783 |     "# Assignment:\n",
 784 |     "\n",
 785 |     "Apply the classifier to a different test set, e.g. the emails from `enron2/`. As before, you need to read in the data, extract textual content, extract the features and evaluate the classifier. What do the results tell you?"
 786 |    ]
 787 |   },
 788 |   {
 789 |    "cell_type": "code",
 790 |    "execution_count": 11,
 791 |    "metadata": {},
 792 |    "outputs": [
 793 |     {
 794 |      "name": "stdout",
 795 |      "output_type": "stream",
 796 |      "text": [
 797 |       "1496\n",
 798 |       "Subject: big range of all types of downloadable software.\n",
 799 |       "Need software? Click here.\n",
 800 |       "Our american professors like their literature clear, cold, pure and very dead.\n",
 801 |       "Being another character is more interesting than being yourself.\n",
 802 |       "4361\n",
 803 |       "Subject: re: telephone interview with enron corp. Research dept.\n",
 804 |       "Dear shirley:\n",
 805 |       "Confirming that I will be waiting for the telephone interview at 1 pm\n",
 806 |       "Tomorrow.? I would like to give you my cell phone number, 713/907 - 6717, as a\n",
 807 |       "Back - up measure.? Please note that my first preference is to receive the call\n",
 808 |       "At my home number, 713/669 - 0923.\n",
 809 |       "Sincerely,\n",
 810 |       "RabI de\n",
 811 |       "?\n",
 812 |       "? Shirley. Crenshaw@ enron. Com wrote:\n",
 813 |       "Dear rabi:\n",
 814 |       "I have scheduled the telephone interview for 1: 00 pm on friday, july 7 th.\n",
 815 |       "We will call you at 713/669 - 0923. If there are any changes, please let\n",
 816 |       "Me know.\n",
 817 |       "Sincerely,\n",
 818 |       "Shirley crenshaw\n",
 819 |       "713 - 853 - 5290\n",
 820 |       "RabI deon 06/26/2000 10: 37: 24 pm\n",
 821 |       "To: shirley crenshaw\n",
 822 |       "Cc:\n",
 823 |       "Subject: re: telephone interview with enron corp. Research dept.\n",
 824 |       "Dear ms. Crenshaw:\n",
 825 |       "Thanks for your prompt response.? July 6 or 7 th will work best for me..? I\n",
 826 |       "Would prefer to be called at my home number.? Please let me know the\n",
 827 |       "Schedule and other details, if any.\n",
 828 |       "Sincerely,\n",
 829 |       "RabI de\n",
 830 |       "? Shirley crenshawwrote:\n",
 831 |       "Good afternoon mr. De:\n",
 832 |       "Your resume has been forwarded to the enron corp. Re! Search dept. And\n",
 833 |       "They would like to conduct a telephone interview with you at your\n",
 834 |       "Convenience.\n",
 835 |       "The interviewers would be:\n",
 836 |       "Vince kaminskI managing director\n",
 837 |       "Stinson gibner vice president\n",
 838 |       "Grant masson vice president\n",
 839 |       "P. V. Krishnarao director\n",
 840 |       "Paulo issler manager\n",
 841 |       "Please give me some dates and times this week or july 5, 6, and 7 th when\n",
 842 |       "You might be available and I will coordinate with the other calendars.\n",
 843 |       "I look forward to hearing from you.\n",
 844 |       "Sincerely,\n",
 845 |       "Shirley crenshaw\n",
 846 |       "Administrative coordinator\n",
 847 |       "Enron corp. Research\n",
 848 |       "713/853 - 5290\n",
 849 |       "Email: shirley. Crenshaw@ enron. Com\n",
 850 |       "Do you yahoo!?\n",
 851 |       "Get yahoo! Mail - free email you can access from anywhere!\n",
 852 |       "Do you yahoo!?\n",
 853 |       "Send instant messages & get email alerts with yahoo! Messenger.\n",
 854 |       "Accuracy on the training set = 0.9615663524292966\n",
 855 |       "Accuracy on the test set = 0.7430425132320301\n",
 856 |       "Most Informative Features\n",
 857 |       "               forwarded = True              ham : spam   =    200.5 : 1.0\n",
 858 |       "                    2004 = True             spam : ham    =    148.6 : 1.0\n",
 859 |       "                     nom = True              ham : spam   =    125.8 : 1.0\n",
 860 |       "                    pain = True             spam : ham    =    103.6 : 1.0\n",
 861 |       "                    spam = True             spam : ham    =     92.4 : 1.0\n",
 862 |       "                  health = True             spam : ham    =     81.1 : 1.0\n",
 863 |       "                     sex = True             spam : ham    =     79.5 : 1.0\n",
 864 |       "                     ect = True              ham : spam   =     75.7 : 1.0\n",
 865 |       "              nomination = True              ham : spam   =     74.8 : 1.0\n",
 866 |       "                   super = True             spam : ham    =     74.7 : 1.0\n",
 867 |       "                featured = True             spam : ham    =     73.1 : 1.0\n",
 868 |       "                creative = True             spam : ham    =     71.5 : 1.0\n",
 869 |       "                  differ = True             spam : ham    =     71.5 : 1.0\n",
 870 |       "                      cc = True              ham : spam   =     64.9 : 1.0\n",
 871 |       "                     ibm = True             spam : ham    =     63.4 : 1.0\n",
 872 |       "                   adobe = True             spam : ham    =     61.8 : 1.0\n",
 873 |       "                  shares = True             spam : ham    =     61.8 : 1.0\n",
 874 |       "            solicitation = True             spam : ham    =     61.8 : 1.0\n",
 875 |       "               clearance = True             spam : ham    =     60.2 : 1.0\n",
 876 |       "             medications = True             spam : ham    =     60.2 : 1.0\n",
 877 |       "                congress = True             spam : ham    =     58.6 : 1.0\n",
 878 |       "                     pro = True             spam : ham    =     57.0 : 1.0\n",
 879 |       "                 dealers = True             spam : ham    =     55.4 : 1.0\n",
 880 |       "                    draw = True             spam : ham    =     53.8 : 1.0\n",
 881 |       "                    2005 = True             spam : ham    =     53.5 : 1.0\n",
 882 |       "                    2001 = True              ham : spam   =     53.4 : 1.0\n",
 883 |       "                   cheap = True             spam : ham    =     51.6 : 1.0\n",
 884 |       "                       u = True             spam : ham    =     49.6 : 1.0\n",
 885 |       "                 doctors = True             spam : ham    =     45.8 : 1.0\n",
 886 |       "               publisher = True             spam : ham    =     45.8 : 1.0\n",
 887 |       "                  sexual = True             spam : ham    =     45.8 : 1.0\n",
 888 |       "               thousands = True             spam : ham    =     44.2 : 1.0\n",
 889 |       "              affordable = True             spam : ham    =     42.6 : 1.0\n",
 890 |       "                licensed = True             spam : ham    =     42.6 : 1.0\n",
 891 |       "                mailings = True             spam : ham    =     42.6 : 1.0\n",
 892 |       "                reliable = True             spam : ham    =     42.6 : 1.0\n",
 893 |       "                   julie = True              ham : spam   =     42.2 : 1.0\n",
 894 |       "                 advises = True             spam : ham    =     41.0 : 1.0\n",
 895 |       "                      ex = True             spam : ham    =     41.0 : 1.0\n",
 896 |       "                 popular = True             spam : ham    =     41.0 : 1.0\n",
 897 |       "                    lisa = True              ham : spam   =     40.0 : 1.0\n",
 898 |       "                pipeline = True              ham : spam   =     39.4 : 1.0\n",
 899 |       "                 foresee = True             spam : ham    =     39.4 : 1.0\n",
 900 |       "                powerful = True             spam : ham    =     39.4 : 1.0\n",
 901 |       "                      wi = True             spam : ham    =     39.4 : 1.0\n",
 902 |       "                   risks = True             spam : ham    =     39.0 : 1.0\n",
 903 |       "                   steve = True              ham : spam   =     38.3 : 1.0\n",
 904 |       "                   susan = True              ham : spam   =     38.3 : 1.0\n",
 905 |       "                     ali = True             spam : ham    =     38.1 : 1.0\n",
 906 |       "                    anti = True             spam : ham    =     37.7 : 1.0\n"
 907 |      ]
 908 |     }
 909 |    ],
 910 |    "source": [
 911 |     "test_spam_list = read_in(\"enron2/spam/\")\n",
 912 |     "print(len(test_spam_list))\n",
 913 |     "print(test_spam_list[0])\n",
 914 |     "test_ham_list = read_in(\"enron2/ham/\")\n",
 915 |     "print(len(test_ham_list))\n",
 916 |     "print(test_ham_list[0])\n",
 917 |     "\n",
 918 |     "test_emails = [(email_content, \"spam\") for email_content in test_spam_list]\n",
 919 |     "test_emails += [(email_content, \"ham\") for email_content in test_ham_list]\n",
 920 |     "random.shuffle(test_emails)\n",
 921 |     "\n",
 922 |     "new_test_set = [(get_features(email), label) for (email, label) in test_emails]\n",
 923 |     "\n",
 924 |     "evaluate(train_set, new_test_set, classifier)"
 925 |    ]
 926 |   },
 927 |   {
 928 |    "cell_type": "markdown",
 929 |    "metadata": {},
 930 |    "source": [
 931 |     "Combine the two datasets:"
 932 |    ]
 933 |   },
 934 |   {
 935 |    "cell_type": "code",
 936 |    "execution_count": 12,
 937 |    "metadata": {},
 938 |    "outputs": [
 939 |     {
 940 |      "name": "stdout",
 941 |      "output_type": "stream",
 942 |      "text": [
 943 |       "2996\n",
 944 |       "8033\n",
 945 |       "11029\n",
 946 |       "Training set size = 8823 emails\n",
 947 |       "Test set size = 2206 emails\n",
 948 |       "Accuracy on the training set = 0.9819789187351241\n",
 949 |       "Accuracy on the test set = 0.9810483182516647\n",
 950 |       "Most Informative Features\n",
 951 |       "                   meter = True              ham : spam   =    263.8 : 1.0\n",
 952 |       "                   vince = True              ham : spam   =    200.3 : 1.0\n",
 953 |       "                     sex = True             spam : ham    =    195.1 : 1.0\n",
 954 |       "                     nom = True              ham : spam   =    194.9 : 1.0\n",
 955 |       "                     php = True             spam : ham    =    182.1 : 1.0\n",
 956 |       "            prescription = True             spam : ham    =    169.2 : 1.0\n",
 957 |       "                     ect = True              ham : spam   =    167.7 : 1.0\n",
 958 |       "                    spam = True             spam : ham    =    145.8 : 1.0\n",
 959 |       "               forwarded = True              ham : spam   =    136.4 : 1.0\n",
 960 |       "                     fyi = True              ham : spam   =    134.6 : 1.0\n",
 961 |       "                    2005 = True             spam : ham    =    128.1 : 1.0\n",
 962 |       "                   logos = True             spam : ham    =    121.2 : 1.0\n",
 963 |       "              nomination = True              ham : spam   =    112.6 : 1.0\n",
 964 |       "              macromedia = True             spam : ham    =    109.5 : 1.0\n",
 965 |       "                  studio = True             spam : ham    =    108.3 : 1.0\n",
 966 |       "           uncertainties = True             spam : ham    =    108.3 : 1.0\n",
 967 |       "                   corel = True             spam : ham    =    104.4 : 1.0\n",
 968 |       "                  dealer = True             spam : ham    =    104.4 : 1.0\n",
 969 |       "                 readers = True             spam : ham    =     96.6 : 1.0\n",
 970 |       "                     pat = True              ham : spam   =     89.6 : 1.0\n",
 971 |       "                      cc = True              ham : spam   =     89.2 : 1.0\n",
 972 |       "                    2004 = True             spam : ham    =     81.9 : 1.0\n",
 973 |       "                     713 = True              ham : spam   =     81.0 : 1.0\n",
 974 |       "              materially = True             spam : ham    =     79.7 : 1.0\n",
 975 |       "             medications = True             spam : ham    =     78.4 : 1.0\n",
 976 |       "                     853 = True              ham : spam   =     78.3 : 1.0\n",
 977 |       "                identity = True             spam : ham    =     75.8 : 1.0\n",
 978 |       "                    8859 = True             spam : ham    =     72.0 : 1.0\n",
 979 |       "                 shirley = True              ham : spam   =     71.3 : 1.0\n",
 980 |       "                   penis = True             spam : ham    =     69.4 : 1.0\n",
 981 |       "                 removal = True             spam : ham    =     69.4 : 1.0\n",
 982 |       "                      wi = True             spam : ham    =     69.4 : 1.0\n",
 983 |       "                 artwork = True             spam : ham    =     68.1 : 1.0\n",
 984 |       "                 beliefs = True             spam : ham    =     68.1 : 1.0\n",
 985 |       "                   canon = True             spam : ham    =     66.8 : 1.0\n",
 986 |       "                 foresee = True             spam : ham    =     66.8 : 1.0\n",
 987 |       "                    pain = True             spam : ham    =     65.7 : 1.0\n",
 988 |       "             speculative = True             spam : ham    =     65.5 : 1.0\n",
 989 |       "                   epson = True             spam : ham    =     61.6 : 1.0\n",
 990 |       "                 factual = True             spam : ham    =     61.6 : 1.0\n",
 991 |       "                featured = True             spam : ham    =     60.8 : 1.0\n",
 992 |       "           advertisement = True             spam : ham    =     59.2 : 1.0\n",
 993 |       "                    drug = True             spam : ham    =     59.0 : 1.0\n",
 994 |       "                 hewlett = True             spam : ham    =     59.0 : 1.0\n",
 995 |       "                  weight = True             spam : ham    =     58.7 : 1.0\n",
 996 |       "                 packard = True             spam : ham    =     57.7 : 1.0\n",
 997 |       "                  sexual = True             spam : ham    =     57.2 : 1.0\n",
 998 |       "               enquiries = True             spam : ham    =     56.4 : 1.0\n",
 999 |       "              scheduling = True              ham : spam   =     55.4 : 1.0\n",
1000 |       "                   adult = True             spam : ham    =     55.1 : 1.0\n"
1001 |      ]
1002 |     }
1003 |    ],
1004 |    "source": [
1005 |     "spam_list = read_in(\"enron1/spam/\") + read_in(\"enron2/spam/\")\n",
1006 |     "print(len(spam_list))\n",
1007 |     "ham_list = read_in(\"enron1/ham/\") + read_in(\"enron2/ham/\")\n",
1008 |     "print(len(ham_list))\n",
1009 |     "\n",
1010 |     "all_emails = [(email_content, \"spam\") for email_content in spam_list]\n",
1011 |     "all_emails += [(email_content, \"ham\") for email_content in ham_list]\n",
1012 |     "random.shuffle(test_emails)\n",
1013 |     "\n",
1014 |     "all_features = [(get_features(email), label) for (email, label) in all_emails]\n",
1015 |     "print(len(all_features))\n",
1016 |     "\n",
1017 |     "train_set, test_set, classifier = train(all_features, 0.8)\n",
1018 |     "evaluate(train_set, new_test_set, classifier)"
1019 |    ]
1020 |   },
1021 |   {
1022 |    "cell_type": "code",
1023 |    "execution_count": null,
1024 |    "metadata": {},
1025 |    "outputs": [],
1026 |    "source": []
1027 |   }
1028 |  ],
1029 |  "metadata": {
1030 |   "kernelspec": {
1031 |    "display_name": "Python 3",
1032 |    "language": "python",
1033 |    "name": "python3"
1034 |   },
1035 |   "language_info": {
1036 |    "codemirror_mode": {
1037 |     "name": "ipython",
1038 |     "version": 3
1039 |    },
1040 |    "file_extension": ".py",
1041 |    "mimetype": "text/x-python",
1042 |    "name": "python",
1043 |    "nbconvert_exporter": "python",
1044 |    "pygments_lexer": "ipython3",
1045 |    "version": "3.7.6"
1046 |   }
1047 |  },
1048 |  "nbformat": 4,
1049 |  "nbformat_minor": 2
1050 | }
1051 | 


--------------------------------------------------------------------------------
/Chapter3.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {},
   6 |    "source": [
   7 |     "# Chapter 3: Information Retrieval"
   8 |    ]
   9 |   },
  10 |   {
  11 |    "cell_type": "markdown",
  12 |    "metadata": {},
  13 |    "source": [
  14 |     "## Step 1: Reading in data\n",
  15 |     "\n",
  16 |     "There are three components to this data:\n",
  17 |     "- documents with their ids and content – there are $1460$ of those to be precise;\n",
  18 |     "- questions / queries with their ids and content – there are $112$ of those;\n",
  19 |     "- mapping between the queries and relevant documents.\n",
  20 |     "\n",
  21 |     "First, let's read in documents from the `CISI.ALL` file and store the result in `documents` data structure – set of tuples of document ids matched with contents:"
  22 |    ]
  23 |   },
  24 |   {
  25 |    "cell_type": "code",
  26 |    "execution_count": 1,
  27 |    "metadata": {},
  28 |    "outputs": [
  29 |     {
  30 |      "name": "stdout",
  31 |      "output_type": "stream",
  32 |      "text": [
  33 |       "1460\n",
  34 |       " 18 Editions of the Dewey Decimal Classifications Comaromi, J.P. The present study is a history of the DEWEY Decimal Classification.  The first edition of the DDC was published in 1876, the eighteenth edition in 1971, and future editions will continue to appear as needed.  In spite of the DDC's long and healthy life, however, its full story has never been told.  There have been biographies of Dewey that briefly describe his system, but this is the first attempt to provide a detailed history of the work that more than any other has spurred the growth of librarianship in this country and abroad. \n"
  35 |      ]
  36 |     }
  37 |    ],
  38 |    "source": [
  39 |     "def read_documents():\n",
  40 |     "    f = open(\"cisi/CISI.ALL\")\n",
  41 |     "    merged = \"\"\n",
  42 |     "    \n",
  43 |     "    for a_line in f.readlines():\n",
  44 |     "        if a_line.startswith(\".\"):\n",
  45 |     "            merged += \"\\n\" + a_line.strip()\n",
  46 |     "        else:\n",
  47 |     "            merged += \" \" + a_line.strip()\n",
  48 |     "    \n",
  49 |     "    documents = {}\n",
  50 |     "\n",
  51 |     "    content = \"\"\n",
  52 |     "    doc_id = \"\"\n",
  53 |     "\n",
  54 |     "    for a_line in merged.split(\"\\n\"):\n",
  55 |     "        if a_line.startswith(\".I\"):\n",
  56 |     "            doc_id = a_line.split(\" \")[1].strip()\n",
  57 |     "        elif a_line.startswith(\".X\"):\n",
  58 |     "            documents[doc_id] = content\n",
  59 |     "            content = \"\"\n",
  60 |     "            doc_id = \"\"\n",
  61 |     "        else:\n",
  62 |     "            content += a_line.strip()[3:] + \" \"\n",
  63 |     "    f.close()\n",
  64 |     "    return documents\n",
  65 |     "\n",
  66 |     "documents = read_documents()\n",
  67 |     "print(len(documents))\n",
  68 |     "print(documents.get(\"1\"))"
  69 |    ]
  70 |   },
  71 |   {
  72 |    "cell_type": "markdown",
  73 |    "metadata": {},
  74 |    "source": [
  75 |     "Second, let's read in queries from the `CISI.QRY` file and store the result in `queries` data structure – set of tuples of query ids matched with contents:"
  76 |    ]
  77 |   },
  78 |   {
  79 |    "cell_type": "code",
  80 |    "execution_count": 2,
  81 |    "metadata": {},
  82 |    "outputs": [
  83 |     {
  84 |      "name": "stdout",
  85 |      "output_type": "stream",
  86 |      "text": [
  87 |       "112\n",
  88 |       "What problems and concerns are there in making up descriptive titles? What difficulties are involved in automatically retrieving articles from approximate titles? What is the usual relevance of the content of articles to their titles? \n"
  89 |      ]
  90 |     }
  91 |    ],
  92 |    "source": [
  93 |     "def read_queries():\n",
  94 |     "    f = open(\"cisi/CISI.QRY\")\n",
  95 |     "    merged = \"\"\n",
  96 |     "    \n",
  97 |     "    for a_line in f.readlines():\n",
  98 |     "        if a_line.startswith(\".\"):\n",
  99 |     "            merged += \"\\n\" + a_line.strip()\n",
 100 |     "        else:\n",
 101 |     "            merged += \" \" + a_line.strip()\n",
 102 |     "    \n",
 103 |     "    queries = {}\n",
 104 |     "\n",
 105 |     "    content = \"\"\n",
 106 |     "    qry_id = \"\"\n",
 107 |     "\n",
 108 |     "    for a_line in merged.split(\"\\n\"):\n",
 109 |     "        if a_line.startswith(\".I\"):\n",
 110 |     "            if not content==\"\":\n",
 111 |     "                queries[qry_id] = content\n",
 112 |     "                content = \"\"\n",
 113 |     "                qry_id = \"\"\n",
 114 |     "            qry_id = a_line.split(\" \")[1].strip()\n",
 115 |     "        elif a_line.startswith(\".W\") or a_line.startswith(\".T\"):\n",
 116 |     "            content += a_line.strip()[3:] + \" \"\n",
 117 |     "    queries[qry_id] = content\n",
 118 |     "    f.close()\n",
 119 |     "    return queries\n",
 120 |     "\n",
 121 |     "queries = read_queries()\n",
 122 |     "print(len(queries))\n",
 123 |     "print(queries.get(\"1\"))"
 124 |    ]
 125 |   },
 126 |   {
 127 |    "cell_type": "markdown",
 128 |    "metadata": {},
 129 |    "source": [
 130 |     "Finally, let's read in the mapping between the queries and the documents – we'll keep these in the `mappings` data structure – with tuples where each query index (key) corresponds to the list of one or more document indices (value):"
 131 |    ]
 132 |   },
 133 |   {
 134 |    "cell_type": "code",
 135 |    "execution_count": 3,
 136 |    "metadata": {},
 137 |    "outputs": [
 138 |     {
 139 |      "name": "stdout",
 140 |      "output_type": "stream",
 141 |      "text": [
 142 |       "76\n",
 143 |       "dict_keys(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '37', '39', '41', '42', '43', '44', '45', '46', '49', '50', '52', '54', '55', '56', '57', '58', '61', '62', '65', '66', '67', '69', '71', '76', '79', '81', '82', '84', '90', '92', '95', '96', '97', '98', '99', '100', '101', '102', '104', '109', '111'])\n",
 144 |       "['28', '35', '38', '42', '43', '52', '65', '76', '86', '150', '189', '192', '193', '195', '215', '269', '291', '320', '429', '465', '466', '482', '483', '510', '524', '541', '576', '582', '589', '603', '650', '680', '711', '722', '726', '783', '813', '820', '868', '869', '894', '1162', '1164', '1195', '1196', '1281']\n"
 145 |      ]
 146 |     }
 147 |    ],
 148 |    "source": [
 149 |     "def read_mappings():\n",
 150 |     "    f = open(\"cisi/CISI.REL\")\n",
 151 |     "    \n",
 152 |     "    mappings = {}\n",
 153 |     "\n",
 154 |     "    for a_line in f.readlines():\n",
 155 |     "        voc = a_line.strip().split()\n",
 156 |     "        key = voc[0].strip()\n",
 157 |     "        current_value = voc[1].strip()\n",
 158 |     "        value = []\n",
 159 |     "        if key in mappings.keys():\n",
 160 |     "            value = mappings.get(key)\n",
 161 |     "        value.append(current_value)\n",
 162 |     "        mappings[key] = value\n",
 163 |     "\n",
 164 |     "    f.close()\n",
 165 |     "    return mappings\n",
 166 |     "\n",
 167 |     "mappings = read_mappings()\n",
 168 |     "print(len(mappings))\n",
 169 |     "print(mappings.keys())\n",
 170 |     "print(mappings.get(\"1\"))"
 171 |    ]
 172 |   },
 173 |   {
 174 |    "cell_type": "markdown",
 175 |    "metadata": {},
 176 |    "source": [
 177 |     "## A simple Boolean search algorithm\n",
 178 |     "\n",
 179 |     "First perform simple preprocessing as in the previous chapter:"
 180 |    ]
 181 |   },
 182 |   {
 183 |    "cell_type": "code",
 184 |    "execution_count": 4,
 185 |    "metadata": {},
 186 |    "outputs": [
 187 |     {
 188 |      "name": "stdout",
 189 |      "output_type": "stream",
 190 |      "text": [
 191 |       "1460\n",
 192 |       "['18', 'editions', 'of', 'the', 'dewey', 'decimal', 'classifications', 'comaromi', ',', 'j.p.', 'the', 'present', 'study', 'is', 'a', 'history', 'of', 'the', 'dewey', 'decimal', 'classification', '.', 'the', 'first', 'edition', 'of', 'the', 'ddc', 'was', 'published', 'in', '1876', ',', 'the', 'eighteenth', 'edition', 'in', '1971', ',', 'and', 'future', 'editions', 'will', 'continue', 'to', 'appear', 'as', 'needed', '.', 'in', 'spite', 'of', 'the', 'ddc', \"'s\", 'long', 'and', 'healthy', 'life', ',', 'however', ',', 'its', 'full', 'story', 'has', 'never', 'been', 'told', '.', 'there', 'have', 'been', 'biographies', 'of', 'dewey', 'that', 'briefly', 'describe', 'his', 'system', ',', 'but', 'this', 'is', 'the', 'first', 'attempt', 'to', 'provide', 'a', 'detailed', 'history', 'of', 'the', 'work', 'that', 'more', 'than', 'any', 'other', 'has', 'spurred', 'the', 'growth', 'of', 'librarianship', 'in', 'this', 'country', 'and', 'abroad', '.']\n",
 193 |       "113\n",
 194 |       "112\n",
 195 |       "['what', 'problems', 'and', 'concerns', 'are', 'there', 'in', 'making', 'up', 'descriptive', 'titles', '?', 'what', 'difficulties', 'are', 'involved', 'in', 'automatically', 'retrieving', 'articles', 'from', 'approximate', 'titles', '?', 'what', 'is', 'the', 'usual', 'relevance', 'of', 'the', 'content', 'of', 'articles', 'to', 'their', 'titles', '?']\n",
 196 |       "38\n"
 197 |      ]
 198 |     }
 199 |    ],
 200 |    "source": [
 201 |     "import nltk\n",
 202 |     "from nltk import word_tokenize\n",
 203 |     "\n",
 204 |     "def get_words(text): \n",
 205 |     "    word_list = [word for word in word_tokenize(text.lower())]\n",
 206 |     "    return word_list\n",
 207 |     "\n",
 208 |     "doc_words = {}\n",
 209 |     "qry_words = {}\n",
 210 |     "for doc_id in documents.keys():\n",
 211 |     "    doc_words[doc_id] = get_words(documents.get(doc_id))\n",
 212 |     "for qry_id in queries.keys():\n",
 213 |     "    qry_words[qry_id] = get_words(queries.get(qry_id))\n",
 214 |     "\n",
 215 |     "print(len(doc_words))\n",
 216 |     "print(doc_words.get(\"1\"))\n",
 217 |     "print(len(doc_words.get(\"1\")))\n",
 218 |     "print(len(qry_words))\n",
 219 |     "print(qry_words.get(\"1\"))\n",
 220 |     "print(len(qry_words.get(\"1\")))"
 221 |    ]
 222 |   },
 223 |   {
 224 |    "cell_type": "markdown",
 225 |    "metadata": {},
 226 |    "source": [
 227 |     "And next match in a Boolean way:"
 228 |    ]
 229 |   },
 230 |   {
 231 |    "cell_type": "code",
 232 |    "execution_count": 5,
 233 |    "metadata": {},
 234 |    "outputs": [
 235 |     {
 236 |      "name": "stdout",
 237 |      "output_type": "stream",
 238 |      "text": [
 239 |       "['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100']\n",
 240 |       "1460\n"
 241 |      ]
 242 |     }
 243 |    ],
 244 |    "source": [
 245 |     "def retrieve_documents(doc_words, query):\n",
 246 |     "    docs = []\n",
 247 |     "    for doc_id in doc_words.keys():\n",
 248 |     "        found = False\n",
 249 |     "        i = 0\n",
 250 |     "        while i<len(query) and not found:\n",
 251 |     "            word = query[i]\n",
 252 |     "            if word in doc_words.get(doc_id):\n",
 253 |     "                docs.append(doc_id)\n",
 254 |     "                found=True\n",
 255 |     "            else:\n",
 256 |     "                i+=1\n",
 257 |     "    return docs\n",
 258 |     "\n",
 259 |     "docs = retrieve_documents(doc_words, qry_words.get(\"6\"))\n",
 260 |     "print(docs[:100])\n",
 261 |     "print(len(docs))"
 262 |    ]
 263 |   },
 264 |   {
 265 |    "cell_type": "markdown",
 266 |    "metadata": {},
 267 |    "source": [
 268 |     "**Exercise**: match the documents to the queries based on occurrence of *all* query words in the document:  "
 269 |    ]
 270 |   },
 271 |   {
 272 |    "cell_type": "code",
 273 |    "execution_count": 6,
 274 |    "metadata": {},
 275 |    "outputs": [
 276 |     {
 277 |      "name": "stdout",
 278 |      "output_type": "stream",
 279 |      "text": [
 280 |       "[]\n",
 281 |       "0\n"
 282 |      ]
 283 |     }
 284 |    ],
 285 |    "source": [
 286 |     "def retrieve_documents(doc_words, query):\n",
 287 |     "    docs = []\n",
 288 |     "    for doc_id in doc_words.keys():\n",
 289 |     "        #here, you are interested in the documents that contain all words    \n",
 290 |     "        found = True    \n",
 291 |     "        i = 0\n",
 292 |     "        #iterate through words in the query\n",
 293 |     "        while i<len(query) and found:    \n",
 294 |     "            word = query[i]\n",
 295 |     "            if not word in doc_words.get(doc_id):\n",
 296 |     "                #if the word is not in document, turn found flag off and stop\n",
 297 |     "                found=False    \n",
 298 |     "            else:\n",
 299 |     "                #otherwise, move on to the next query word\n",
 300 |     "                i+=1    \n",
 301 |     "        #if all words are found in the document, the last index is len(query)-1\n",
 302 |     "        #add the doc_id only in this case\n",
 303 |     "        if i==len(query)-1:\n",
 304 |     "            docs.append(doc_id)\n",
 305 |     "    return docs\n",
 306 |     "\n",
 307 |     "docs = retrieve_documents(doc_words, qry_words.get(\"112\"))\n",
 308 |     "print(docs[:100])\n",
 309 |     "print(len(docs))"
 310 |    ]
 311 |   },
 312 |   {
 313 |    "cell_type": "markdown",
 314 |    "metadata": {},
 315 |    "source": [
 316 |     "In fact, it is a very rare case that you may have any single document that contains all the words from the query, therefore, with this approach, you will likely get no relevant documents returned for any queries in this dataset."
 317 |    ]
 318 |   },
 319 |   {
 320 |    "cell_type": "markdown",
 321 |    "metadata": {},
 322 |    "source": [
 323 |     "## Step 2: Preprocessing the data\n",
 324 |     "\n",
 325 |     "Apply the following steps (some as before):\n",
 326 |     "- tokenize the text\n",
 327 |     "- put to lowercase\n",
 328 |     "- remove stopwords\n",
 329 |     "- lemmatize\n",
 330 |     "\n",
 331 |     "Now apply these steps to both documents and queries. Let's start with stopwords removal:"
 332 |    ]
 333 |   },
 334 |   {
 335 |    "cell_type": "code",
 336 |    "execution_count": 7,
 337 |    "metadata": {},
 338 |    "outputs": [
 339 |     {
 340 |      "name": "stdout",
 341 |      "output_type": "stream",
 342 |      "text": [
 343 |       "['18', 'editions', 'dewey', 'decimal', 'classifications', 'comaromi', 'j.p.', 'present', 'study', 'history', 'dewey', 'decimal', 'classification', 'first', 'edition', 'ddc', 'published', '1876', 'eighteenth', 'edition', '1971', 'future', 'editions', 'continue', 'appear', 'needed', 'spite', 'ddc', \"'s\", 'long', 'healthy', 'life', 'however', 'full', 'story', 'never', 'told', 'biographies', 'dewey', 'briefly', 'describe', 'system', 'first', 'attempt', 'provide', 'detailed', 'history', 'work', 'spurred', 'growth', 'librarianship', 'country', 'abroad']\n"
 344 |      ]
 345 |     }
 346 |    ],
 347 |    "source": [
 348 |     "import nltk\n",
 349 |     "import string\n",
 350 |     "from nltk import word_tokenize\n",
 351 |     "from nltk.corpus import stopwords\n",
 352 |     "\n",
 353 |     "def process(text): \n",
 354 |     "    stoplist = set(stopwords.words('english'))\n",
 355 |     "    word_list = [word for word in word_tokenize(text.lower())\n",
 356 |     "                 if not word in stoplist and not word in string.punctuation]\n",
 357 |     "    return word_list\n",
 358 |     "\n",
 359 |     "word_list = process(documents.get(\"1\"))\n",
 360 |     "print(word_list)    "
 361 |    ]
 362 |   },
 363 |   {
 364 |    "cell_type": "markdown",
 365 |    "metadata": {},
 366 |    "source": [
 367 |     "Lemmatization:"
 368 |    ]
 369 |   },
 370 |   {
 371 |    "cell_type": "code",
 372 |    "execution_count": 8,
 373 |    "metadata": {},
 374 |    "outputs": [
 375 |     {
 376 |      "name": "stdout",
 377 |      "output_type": "stream",
 378 |      "text": [
 379 |       "['cost', 'analysis', 'simulation', 'procedure', 'evaluation', 'large', 'information', 'system', 'bourne', 'c.p', 'ford', 'd.f', 'computer', 'program', 'write', 'use', 'simulate', 'several-year', 'operation', 'information', 'system', 'compute', 'estimate', 'expected', 'operating', 'cost', 'well', 'amount', 'equipment', 'personnel', 'require', 'time', 'period', 'program', 'use', 'analysis', 'several', 'large', 'system', 'prove', 'useful', 'research', 'tool', 'study', 'system', 'many', 'component', 'interrelated', 'operation', 'equivalent', 'manual', 'analysis', 'would', 'extremely', 'cumbersome', 'time', 'consuming', 'perhaps', 'even', 'impractical', 'paper', 'describe', 'program', 'show', 'example', 'result', 'simulation', 'two', 'several', 'suggested', 'design', 'specific', 'information', 'system']\n"
 380 |      ]
 381 |     }
 382 |    ],
 383 |    "source": [
 384 |     "import nltk\n",
 385 |     "import string\n",
 386 |     "from nltk import word_tokenize, WordNetLemmatizer, pos_tag\n",
 387 |     "from nltk.corpus import stopwords\n",
 388 |     "\n",
 389 |     "def process(text): \n",
 390 |     "    stoplist = set(stopwords.words('english'))\n",
 391 |     "    lemmatizer = WordNetLemmatizer()\n",
 392 |     "    pos_list = pos_tag(word_tokenize(text.lower()))\n",
 393 |     "    word_list = [entry for entry in pos_list\n",
 394 |     "                 if not entry[0] in stoplist and not entry[0] in string.punctuation]\n",
 395 |     "    lemmatized_wl = []\n",
 396 |     "    for entry in word_list:\n",
 397 |     "        if entry[1].startswith(\"V\"):\n",
 398 |     "            lemmatized_wl.append(lemmatizer.lemmatize(entry[0], \"v\"))\n",
 399 |     "        else:\n",
 400 |     "            lemmatized_wl.append(lemmatizer.lemmatize(entry[0]))\n",
 401 |     "    return lemmatized_wl\n",
 402 |     "\n",
 403 |     "word_list = process(documents.get(\"27\"))\n",
 404 |     "print(word_list)"
 405 |    ]
 406 |   },
 407 |   {
 408 |    "cell_type": "markdown",
 409 |    "metadata": {},
 410 |    "source": [
 411 |     "Preprocessing with a stemmer:"
 412 |    ]
 413 |   },
 414 |   {
 415 |    "cell_type": "code",
 416 |    "execution_count": 9,
 417 |    "metadata": {},
 418 |    "outputs": [
 419 |     {
 420 |      "name": "stdout",
 421 |      "output_type": "stream",
 422 |      "text": [
 423 |       "['cost', 'analys', 'sim', 'proc', 'evalu', 'larg', 'inform', 'system', 'bourn', 'c.p', 'ford', 'd.f', 'comput', 'program', 'writ', 'us', 'sim', 'several-year', 'op', 'inform', 'system', 'comput', 'estim', 'expect', 'op', 'cost', 'wel', 'amount', 'equip', 'personnel', 'requir', 'tim', 'period', 'program', 'us', 'analys', 'sev', 'larg', 'system', 'prov', 'us', 'research', 'tool', 'study', 'system', 'many', 'compon', 'interrel', 'op', 'equ', 'man', 'analys', 'would', 'extrem', 'cumbersom', 'tim', 'consum', 'perhap', 'ev', 'impract', 'pap', 'describ', 'program', 'show', 'exampl', 'result', 'sim', 'two', 'sev', 'suggest', 'design', 'spec', 'inform', 'system']\n",
 424 |       "['org', 'org', 'org', 'org', 'org', 'org']\n"
 425 |      ]
 426 |     }
 427 |    ],
 428 |    "source": [
 429 |     "import nltk\n",
 430 |     "import string\n",
 431 |     "from nltk import word_tokenize\n",
 432 |     "from nltk.corpus import stopwords\n",
 433 |     "from nltk.stem.lancaster import LancasterStemmer\n",
 434 |     "\n",
 435 |     "def process(text): \n",
 436 |     "    stoplist = set(stopwords.words('english'))\n",
 437 |     "    st = LancasterStemmer()\n",
 438 |     "    word_list = [st.stem(word) for word in word_tokenize(text.lower())\n",
 439 |     "                 if not word in stoplist and not word in string.punctuation]\n",
 440 |     "    return word_list\n",
 441 |     "  \n",
 442 |     "word_list = process(documents.get(\"27\"))\n",
 443 |     "print(word_list)\n",
 444 |     "word_list = process(\"organize, organizing, organizational, organ, organic, organizer\")\n",
 445 |     "print(word_list)"
 446 |    ]
 447 |   },
 448 |   {
 449 |    "cell_type": "markdown",
 450 |    "metadata": {
 451 |     "collapsed": true
 452 |    },
 453 |    "source": [
 454 |     "## Step 3: Term weighing\n",
 455 |     "\n",
 456 |     "First calculate the term frequency in each document:"
 457 |    ]
 458 |   },
 459 |   {
 460 |    "cell_type": "code",
 461 |    "execution_count": 10,
 462 |    "metadata": {},
 463 |    "outputs": [
 464 |     {
 465 |      "name": "stdout",
 466 |      "output_type": "stream",
 467 |      "text": [
 468 |       "1460\n",
 469 |       "{'18': 1, 'edit': 4, 'dewey': 3, 'decim': 2, 'class': 2, 'comarom': 1, 'j.p.': 1, 'pres': 1, 'study': 1, 'hist': 2, 'first': 2, 'ddc': 2, 'publ': 1, '1876': 1, 'eighteen': 1, '1971': 1, 'fut': 1, 'continu': 1, 'appear': 1, 'nee': 1, 'spit': 1, \"'s\": 1, 'long': 1, 'healthy': 1, 'lif': 1, 'howev': 1, 'ful': 1, 'story': 1, 'nev': 1, 'told': 1, 'biograph': 1, 'brief': 1, 'describ': 1, 'system': 1, 'attempt': 1, 'provid': 1, 'detail': 1, 'work': 1, 'spur': 1, 'grow': 1, 'libr': 1, 'country': 1, 'abroad': 1}\n",
 470 |       "43\n",
 471 |       "112\n",
 472 |       "{'problem': 1, 'concern': 1, 'mak': 1, 'describ': 1, 'titl': 3, 'difficul': 1, 'involv': 1, 'autom': 1, 'retriev': 1, 'artic': 2, 'approxim': 1, 'us': 1, 'relev': 1, 'cont': 1}\n",
 473 |       "14\n"
 474 |      ]
 475 |     }
 476 |    ],
 477 |    "source": [
 478 |     "import nltk\n",
 479 |     "import string\n",
 480 |     "from nltk import word_tokenize\n",
 481 |     "from nltk.corpus import stopwords\n",
 482 |     "from nltk.stem.lancaster import LancasterStemmer\n",
 483 |     "\n",
 484 |     "def get_terms(text): \n",
 485 |     "    stoplist = set(stopwords.words('english'))\n",
 486 |     "    terms = {}\n",
 487 |     "    st = LancasterStemmer()\n",
 488 |     "    word_list = [st.stem(word) for word in word_tokenize(text.lower())\n",
 489 |     "                 if not word in stoplist and not word in string.punctuation]\n",
 490 |     "    for word in word_list:\n",
 491 |     "        terms[word] = terms.get(word, 0) + 1\n",
 492 |     "    return terms\n",
 493 |     "\n",
 494 |     "doc_terms = {}\n",
 495 |     "qry_terms = {}\n",
 496 |     "for doc_id in documents.keys():\n",
 497 |     "    doc_terms[doc_id] = get_terms(documents.get(doc_id))\n",
 498 |     "for qry_id in queries.keys():\n",
 499 |     "    qry_terms[qry_id] = get_terms(queries.get(qry_id))\n",
 500 |     "\n",
 501 |     "print(len(doc_terms))\n",
 502 |     "print(doc_terms.get(\"1\"))\n",
 503 |     "print(len(doc_terms.get(\"1\")))\n",
 504 |     "print(len(qry_terms))\n",
 505 |     "print(qry_terms.get(\"1\"))\n",
 506 |     "print(len(qry_terms.get(\"1\")))"
 507 |    ]
 508 |   },
 509 |   {
 510 |    "cell_type": "markdown",
 511 |    "metadata": {},
 512 |    "source": [
 513 |     "Second, collect shared vocabulary from all documents and queries:"
 514 |    ]
 515 |   },
 516 |   {
 517 |    "cell_type": "code",
 518 |    "execution_count": 11,
 519 |    "metadata": {},
 520 |    "outputs": [
 521 |     {
 522 |      "name": "stdout",
 523 |      "output_type": "stream",
 524 |      "text": [
 525 |       "7775\n",
 526 |       "[\"''\", \"'60\", \"'70\", \"'anyhow\", \"'apparent\", \"'basic\", \"'better\", \"'bibliograph\", \"'bibliometrics\", \"'building\"]\n"
 527 |      ]
 528 |     }
 529 |    ],
 530 |    "source": [
 531 |     "def collect_vocabulary():\n",
 532 |     "    all_terms = []\n",
 533 |     "    for doc_id in doc_terms.keys():\n",
 534 |     "        for term in doc_terms.get(doc_id).keys():            \n",
 535 |     "            all_terms.append(term)\n",
 536 |     "    for qry_id in qry_terms.keys():\n",
 537 |     "        for term in qry_terms.get(qry_id).keys():\n",
 538 |     "            all_terms.append(term)\n",
 539 |     "    return sorted(set(all_terms))\n",
 540 |     "\n",
 541 |     "all_terms = collect_vocabulary()\n",
 542 |     "print(len(all_terms))\n",
 543 |     "print(all_terms[:10])"
 544 |    ]
 545 |   },
 546 |   {
 547 |    "cell_type": "markdown",
 548 |    "metadata": {},
 549 |    "source": [
 550 |     "Represent each document and query as the counts in the shared space:\n"
 551 |    ]
 552 |   },
 553 |   {
 554 |    "cell_type": "code",
 555 |    "execution_count": 12,
 556 |    "metadata": {},
 557 |    "outputs": [
 558 |     {
 559 |      "name": "stdout",
 560 |      "output_type": "stream",
 561 |      "text": [
 562 |       "1460\n",
 563 |       "7775\n",
 564 |       "112\n",
 565 |       "7775\n"
 566 |      ]
 567 |     }
 568 |    ],
 569 |    "source": [
 570 |     "def vectorize(input_features, vocabulary):\n",
 571 |     "    output = {}\n",
 572 |     "    for item_id in input_features.keys():\n",
 573 |     "        features = input_features.get(item_id)\n",
 574 |     "        output_vector = []\n",
 575 |     "        for word in vocabulary:\n",
 576 |     "            if word in features.keys():\n",
 577 |     "                output_vector.append(int(features.get(word)))\n",
 578 |     "            else:\n",
 579 |     "                output_vector.append(0)\n",
 580 |     "        output[item_id] = output_vector\n",
 581 |     "    return output\n",
 582 |     "\n",
 583 |     "doc_vectors = vectorize(doc_terms, all_terms)\n",
 584 |     "qry_vectors = vectorize(qry_terms, all_terms)\n",
 585 |     "\n",
 586 |     "print(len(doc_vectors))\n",
 587 |     "print(len(doc_vectors.get(\"1460\")))\n",
 588 |     "#print(doc_vectors.get(\"1460\")[:1000])\n",
 589 |     "print(len(qry_vectors))\n",
 590 |     "print(len(qry_vectors.get(\"112\")))\n",
 591 |     "#print(qry_vectors.get(\"112\")[:1000])"
 592 |    ]
 593 |   },
 594 |   {
 595 |    "cell_type": "code",
 596 |    "execution_count": 13,
 597 |    "metadata": {},
 598 |    "outputs": [
 599 |     {
 600 |      "name": "stdout",
 601 |      "output_type": "stream",
 602 |      "text": [
 603 |       "7775\n",
 604 |       "0.4287539560862571\n"
 605 |      ]
 606 |     }
 607 |    ],
 608 |    "source": [
 609 |     "import math\n",
 610 |     "\n",
 611 |     "def calculate_idfs(vocabulary, doc_features):\n",
 612 |     "    doc_idfs = {}\n",
 613 |     "    for term in vocabulary:\n",
 614 |     "        doc_count = 0\n",
 615 |     "        for doc_id in doc_features.keys():\n",
 616 |     "            terms = doc_features.get(doc_id)\n",
 617 |     "            if term in terms.keys():\n",
 618 |     "                doc_count += 1\n",
 619 |     "        doc_idfs[term] = math.log(float(len(doc_features.keys()))/float(1 + doc_count), 10)\n",
 620 |     "    return doc_idfs\n",
 621 |     "\n",
 622 |     "doc_idfs = calculate_idfs(all_terms, doc_terms)\n",
 623 |     "print(len(doc_idfs))\n",
 624 |     "print(doc_idfs.get(\"system\"))"
 625 |    ]
 626 |   },
 627 |   {
 628 |    "cell_type": "code",
 629 |    "execution_count": 14,
 630 |    "metadata": {},
 631 |    "outputs": [
 632 |     {
 633 |      "name": "stdout",
 634 |      "output_type": "stream",
 635 |      "text": [
 636 |       "1460\n",
 637 |       "7775\n"
 638 |      ]
 639 |     }
 640 |    ],
 641 |    "source": [
 642 |     "def vectorize_idf(input_terms, input_idfs, vocabulary):\n",
 643 |     "    output = {}\n",
 644 |     "    for item_id in input_terms.keys():\n",
 645 |     "        terms = input_terms.get(item_id)\n",
 646 |     "        output_vector = []\n",
 647 |     "        for term in vocabulary:\n",
 648 |     "            if term in terms.keys():\n",
 649 |     "                output_vector.append(input_idfs.get(term)*float(terms.get(term)))\n",
 650 |     "            else:\n",
 651 |     "                output_vector.append(float(0))\n",
 652 |     "        output[item_id] = output_vector\n",
 653 |     "    return output\n",
 654 |     "\n",
 655 |     "doc_vectors = vectorize_idf(doc_terms, doc_idfs, all_terms)\n",
 656 |     "\n",
 657 |     "print(len(doc_vectors))\n",
 658 |     "print(len(doc_vectors.get(\"1460\")))"
 659 |    ]
 660 |   },
 661 |   {
 662 |    "cell_type": "markdown",
 663 |    "metadata": {},
 664 |    "source": [
 665 |     "## Step 4: Retrieval of the most similar documents\n",
 666 |     "\n",
 667 |     "Use cosine similarity, as before, on unfiltered texts:\n"
 668 |    ]
 669 |   },
 670 |   {
 671 |    "cell_type": "code",
 672 |    "execution_count": 15,
 673 |    "metadata": {},
 674 |    "outputs": [
 675 |     {
 676 |      "name": "stdout",
 677 |      "output_type": "stream",
 678 |      "text": [
 679 |       "0.9701425001453319\n"
 680 |      ]
 681 |     }
 682 |    ],
 683 |    "source": [
 684 |     "import math\n",
 685 |     "\n",
 686 |     "query = [1, 1]\n",
 687 |     "document = [3, 5]\n",
 688 |     "\n",
 689 |     "def length(vector):\n",
 690 |     "    sq_length = 0\n",
 691 |     "    for index in range(0, len(vector)):\n",
 692 |     "        sq_length += math.pow(vector[index], 2)\n",
 693 |     "    return math.sqrt(sq_length)\n",
 694 |     "    \n",
 695 |     "def dot_product(vector1, vector2):\n",
 696 |     "    if len(vector1)==len(vector2):\n",
 697 |     "        dot_prod = 0\n",
 698 |     "        for index in range(0, len(vector1)):\n",
 699 |     "            if not vector1[index]==0 and not vector2[index]==0:\n",
 700 |     "                dot_prod += vector1[index]*vector2[index]\n",
 701 |     "        return dot_prod\n",
 702 |     "    else:\n",
 703 |     "        return \"Unmatching dimensionality\"\n",
 704 |     "\n",
 705 |     "def calculate_cosine(query, document):\n",
 706 |     "    cosine =  dot_product(query, document) / (length(query) * length(document)) \n",
 707 |     "    return cosine\n",
 708 |     "\n",
 709 |     "cosine = calculate_cosine(query, document)\n",
 710 |     "print (cosine)"
 711 |    ]
 712 |   },
 713 |   {
 714 |    "cell_type": "markdown",
 715 |    "metadata": {},
 716 |    "source": [
 717 |     "Get cosine similarity for some examples of a particular query and a particular document:"
 718 |    ]
 719 |   },
 720 |   {
 721 |    "cell_type": "code",
 722 |    "execution_count": 16,
 723 |    "metadata": {},
 724 |    "outputs": [
 725 |     {
 726 |      "name": "stdout",
 727 |      "output_type": "stream",
 728 |      "text": [
 729 |       "0.21799825905375303\n"
 730 |      ]
 731 |     }
 732 |    ],
 733 |    "source": [
 734 |     "#document = doc_vectors.get(\"27\")\n",
 735 |     "#query = qry_vectors.get(\"15\")\n",
 736 |     "\n",
 737 |     "document = doc_vectors.get(\"60\")\n",
 738 |     "query = qry_vectors.get(\"3\")\n",
 739 |     "\n",
 740 |     "cosine =  dot_product(query, document) / (length(query) * length(document))     \n",
 741 |     "print(cosine)"
 742 |    ]
 743 |   },
 744 |   {
 745 |    "cell_type": "markdown",
 746 |    "metadata": {},
 747 |    "source": [
 748 |     "Apply the search algorithm to find relevant documents for a particular query:"
 749 |    ]
 750 |   },
 751 |   {
 752 |    "cell_type": "code",
 753 |    "execution_count": 17,
 754 |    "metadata": {
 755 |     "scrolled": true
 756 |    },
 757 |    "outputs": [
 758 |     {
 759 |      "name": "stdout",
 760 |      "output_type": "stream",
 761 |      "text": [
 762 |       "469\n",
 763 |       "1179\n",
 764 |       "1181\n",
 765 |       "1142\n",
 766 |       "1190\n",
 767 |       "1116\n",
 768 |       "445\n",
 769 |       "85\n",
 770 |       "599\n",
 771 |       "60\n",
 772 |       "540\n",
 773 |       "640\n",
 774 |       "372\n",
 775 |       "1030\n",
 776 |       "1095\n",
 777 |       "965\n",
 778 |       "1161\n",
 779 |       "241\n",
 780 |       "1191\n",
 781 |       "899\n",
 782 |       "137\n",
 783 |       "535\n",
 784 |       "456\n",
 785 |       "803\n",
 786 |       "95\n",
 787 |       "544\n",
 788 |       "1077\n",
 789 |       "1111\n",
 790 |       "1103\n",
 791 |       "837\n",
 792 |       "560\n",
 793 |       "1133\n",
 794 |       "602\n",
 795 |       "166\n",
 796 |       "1080\n",
 797 |       "163\n",
 798 |       "686\n",
 799 |       "839\n",
 800 |       "1297\n",
 801 |       "1082\n",
 802 |       "1428\n",
 803 |       "1330\n",
 804 |       "1113\n",
 805 |       "110\n"
 806 |      ]
 807 |     }
 808 |    ],
 809 |    "source": [
 810 |     "from operator import itemgetter\n",
 811 |     "\n",
 812 |     "results = {}\n",
 813 |     "\n",
 814 |     "for doc_id in doc_vectors.keys():\n",
 815 |     "    document = doc_vectors.get(doc_id)\n",
 816 |     "    cosine = calculate_cosine(query, document)    \n",
 817 |     "    results[doc_id] = cosine\n",
 818 |     "\n",
 819 |     "for items in sorted(results.items(), key=itemgetter(1), reverse=True)[:44]:\n",
 820 |     "    print(items[0])"
 821 |    ]
 822 |   },
 823 |   {
 824 |    "cell_type": "markdown",
 825 |    "metadata": {},
 826 |    "source": [
 827 |     "## Step 5: Evaluation\n",
 828 |     "\n",
 829 |     "Prefilter – only keep the documents that contain at least one word from the query – to speed search up:"
 830 |    ]
 831 |   },
 832 |   {
 833 |    "cell_type": "code",
 834 |    "execution_count": 18,
 835 |    "metadata": {},
 836 |    "outputs": [
 837 |     {
 838 |      "name": "stdout",
 839 |      "output_type": "stream",
 840 |      "text": [
 841 |       "['5', '6', '10', '15', '16', '17', '21', '22', '25', '26', '27', '29', '30', '33', '38', '41', '42', '43', '45', '46', '47', '49', '51', '52', '56', '57', '58', '63', '64', '66', '68', '71', '74', '77', '78', '79', '80', '82', '87', '90', '91', '92', '95', '96', '97', '98', '101', '102', '104', '105', '106', '107', '109', '114', '116', '117', '122', '123', '124', '126', '129', '131', '132', '136', '140', '141', '142', '144', '150', '151', '155', '157', '158', '159', '160', '163', '168', '169', '175', '178', '179', '180', '181', '191', '194', '197', '206', '208', '211', '212', '214', '218', '220', '228', '229', '233', '237', '240', '241', '242']\n",
 842 |       "607\n"
 843 |      ]
 844 |     }
 845 |    ],
 846 |    "source": [
 847 |     "def prefilter(doc_terms, query):\n",
 848 |     "    docs = []\n",
 849 |     "    for doc_id in doc_terms.keys():\n",
 850 |     "        found = False\n",
 851 |     "        i = 0\n",
 852 |     "        while i<len(query.keys()) and not found:\n",
 853 |     "            term = list(query.keys())[i]\n",
 854 |     "            if term in doc_terms.get(doc_id).keys():\n",
 855 |     "                docs.append(doc_id)\n",
 856 |     "                found=True\n",
 857 |     "            else:\n",
 858 |     "                i+=1\n",
 859 |     "    return docs\n",
 860 |     "\n",
 861 |     "docs = prefilter(doc_terms, qry_terms.get(\"6\"))\n",
 862 |     "print(docs[:100])\n",
 863 |     "print(len(docs))\n",
 864 |     "\n",
 865 |     "prefiltered_docs = {}\n",
 866 |     "for query_id in mappings.keys():\n",
 867 |     "    prefiltered_docs[query_id] = prefilter(doc_terms, qry_terms.get(str(query_id)))"
 868 |    ]
 869 |   },
 870 |   {
 871 |    "cell_type": "markdown",
 872 |    "metadata": {},
 873 |    "source": [
 874 |     "Return the top-3 or top-10 results and evaluate in terms of precision:"
 875 |    ]
 876 |   },
 877 |   {
 878 |    "cell_type": "code",
 879 |    "execution_count": 19,
 880 |    "metadata": {},
 881 |    "outputs": [
 882 |     {
 883 |      "name": "stdout",
 884 |      "output_type": "stream",
 885 |      "text": [
 886 |       "1: 1.0\n",
 887 |       "2: 0.3333333333333333\n",
 888 |       "3: 1.0\n",
 889 |       "4: 0.0\n",
 890 |       "5: 0.0\n",
 891 |       "6: 0.0\n",
 892 |       "7: 0.0\n",
 893 |       "8: 0.0\n",
 894 |       "9: 0.3333333333333333\n",
 895 |       "10: 0.6666666666666666\n",
 896 |       "11: 0.3333333333333333\n",
 897 |       "12: 0.0\n",
 898 |       "13: 0.3333333333333333\n",
 899 |       "14: 0.0\n",
 900 |       "15: 0.0\n",
 901 |       "16: 0.0\n",
 902 |       "17: 0.0\n",
 903 |       "18: 0.0\n",
 904 |       "19: 0.0\n",
 905 |       "20: 0.3333333333333333\n",
 906 |       "21: 0.0\n",
 907 |       "22: 0.0\n",
 908 |       "23: 0.3333333333333333\n",
 909 |       "24: 1.0\n",
 910 |       "25: 0.0\n",
 911 |       "26: 0.6666666666666666\n",
 912 |       "27: 0.6666666666666666\n",
 913 |       "28: 0.6666666666666666\n",
 914 |       "29: 0.6666666666666666\n",
 915 |       "30: 1.0\n",
 916 |       "31: 0.3333333333333333\n",
 917 |       "32: 0.3333333333333333\n",
 918 |       "33: 0.0\n",
 919 |       "34: 0.6666666666666666\n",
 920 |       "35: 0.6666666666666666\n",
 921 |       "37: 0.3333333333333333\n",
 922 |       "39: 0.3333333333333333\n",
 923 |       "41: 0.3333333333333333\n",
 924 |       "42: 0.6666666666666666\n",
 925 |       "43: 0.0\n",
 926 |       "44: 0.3333333333333333\n",
 927 |       "45: 0.3333333333333333\n",
 928 |       "46: 0.6666666666666666\n",
 929 |       "49: 0.3333333333333333\n",
 930 |       "50: 0.6666666666666666\n",
 931 |       "52: 1.0\n",
 932 |       "54: 0.3333333333333333\n",
 933 |       "55: 1.0\n",
 934 |       "56: 0.6666666666666666\n",
 935 |       "57: 0.0\n",
 936 |       "58: 1.0\n",
 937 |       "61: 0.3333333333333333\n",
 938 |       "62: 1.0\n",
 939 |       "65: 0.6666666666666666\n",
 940 |       "66: 1.0\n",
 941 |       "67: 0.0\n",
 942 |       "69: 0.3333333333333333\n",
 943 |       "71: 0.0\n",
 944 |       "76: 1.0\n",
 945 |       "79: 0.3333333333333333\n",
 946 |       "81: 0.3333333333333333\n",
 947 |       "82: 0.3333333333333333\n",
 948 |       "84: 0.0\n",
 949 |       "90: 0.0\n",
 950 |       "92: 0.6666666666666666\n",
 951 |       "95: 0.6666666666666666\n",
 952 |       "96: 0.0\n",
 953 |       "97: 0.6666666666666666\n",
 954 |       "98: 1.0\n",
 955 |       "99: 0.3333333333333333\n",
 956 |       "100: 0.0\n",
 957 |       "101: 0.0\n",
 958 |       "102: 1.0\n",
 959 |       "104: 0.0\n",
 960 |       "109: 0.6666666666666666\n",
 961 |       "111: 1.0\n",
 962 |       "0.4035087719298246\n",
 963 |       "0.6578947368421053\n"
 964 |      ]
 965 |     }
 966 |    ],
 967 |    "source": [
 968 |     "def calculate_precision(model_output, gold_standard):\n",
 969 |     "    true_pos = 0\n",
 970 |     "    for item in model_output:\n",
 971 |     "        if item in gold_standard:\n",
 972 |     "            true_pos += 1\n",
 973 |     "    return float(true_pos)/float(len(model_output))\n",
 974 |     "\n",
 975 |     "def calculate_found(model_output, gold_standard):\n",
 976 |     "    found = 0\n",
 977 |     "    for item in model_output:\n",
 978 |     "        if item in gold_standard:\n",
 979 |     "            found = 1\n",
 980 |     "    return float(found)\n",
 981 |     "\n",
 982 |     "precision_all = 0.0\n",
 983 |     "found_all = 0.0\n",
 984 |     "for query_id in mappings.keys():\n",
 985 |     "    gold_standard = mappings.get(str(query_id))\n",
 986 |     "    query = qry_vectors.get(str(query_id))\n",
 987 |     "    results = {}\n",
 988 |     "    model_output = []\n",
 989 |     "    for doc_id in prefiltered_docs.get(str(query_id)):\n",
 990 |     "        document = doc_vectors.get(doc_id)\n",
 991 |     "        cosine = calculate_cosine(query, document)    \n",
 992 |     "        results[doc_id] = cosine\n",
 993 |     "    for items in sorted(results.items(), key=itemgetter(1), \n",
 994 |     "                        #reverse=True)[:min(10, len(gold_standard))]:\n",
 995 |     "                        #reverse=True)[:min(3, len(gold_standard))]:\n",
 996 |     "                        reverse=True)[:3]:\n",
 997 |     "        model_output.append(items[0])\n",
 998 |     "    precision = calculate_precision(model_output, gold_standard)\n",
 999 |     "    found = calculate_found(model_output, gold_standard)\n",
1000 |     "    print(f\"{str(query_id)}: {str(precision)}\")\n",
1001 |     "    precision_all += precision\n",
1002 |     "    found_all += found\n",
1003 |     "\n",
1004 |     "print(precision_all/float(len(mappings.keys())))\n",
1005 |     "print(found_all/float(len(mappings.keys())))"
1006 |    ]
1007 |   },
1008 |   {
1009 |    "cell_type": "code",
1010 |    "execution_count": 20,
1011 |    "metadata": {},
1012 |    "outputs": [
1013 |     {
1014 |      "name": "stdout",
1015 |      "output_type": "stream",
1016 |      "text": [
1017 |       "1: 1.0\n",
1018 |       "2: 0.0\n",
1019 |       "3: 1.0\n",
1020 |       "4: 0.0\n",
1021 |       "5: 0.0\n",
1022 |       "6: 0.0\n",
1023 |       "7: 0.0\n",
1024 |       "8: 0.0\n",
1025 |       "9: 0.0\n",
1026 |       "10: 1.0\n",
1027 |       "11: 1.0\n",
1028 |       "12: 0.0\n",
1029 |       "13: 1.0\n",
1030 |       "14: 0.0\n",
1031 |       "15: 0.0\n",
1032 |       "16: 0.0\n",
1033 |       "17: 0.0\n",
1034 |       "18: 0.0\n",
1035 |       "19: 0.0\n",
1036 |       "20: 0.0\n",
1037 |       "21: 0.0\n",
1038 |       "22: 0.0\n",
1039 |       "23: 0.0\n",
1040 |       "24: 1.0\n",
1041 |       "25: 0.0\n",
1042 |       "26: 1.0\n",
1043 |       "27: 1.0\n",
1044 |       "28: 1.0\n",
1045 |       "29: 1.0\n",
1046 |       "30: 1.0\n",
1047 |       "31: 0.0\n",
1048 |       "32: 0.0\n",
1049 |       "33: 0.0\n",
1050 |       "34: 1.0\n",
1051 |       "35: 0.0\n",
1052 |       "37: 1.0\n",
1053 |       "39: 1.0\n",
1054 |       "41: 1.0\n",
1055 |       "42: 1.0\n",
1056 |       "43: 0.0\n",
1057 |       "44: 0.0\n",
1058 |       "45: 1.0\n",
1059 |       "46: 0.0\n",
1060 |       "49: 0.0\n",
1061 |       "50: 1.0\n",
1062 |       "52: 1.0\n",
1063 |       "54: 0.0\n",
1064 |       "55: 1.0\n",
1065 |       "56: 1.0\n",
1066 |       "57: 0.0\n",
1067 |       "58: 1.0\n",
1068 |       "61: 0.0\n",
1069 |       "62: 1.0\n",
1070 |       "65: 1.0\n",
1071 |       "66: 1.0\n",
1072 |       "67: 0.0\n",
1073 |       "69: 1.0\n",
1074 |       "71: 0.0\n",
1075 |       "76: 1.0\n",
1076 |       "79: 1.0\n",
1077 |       "81: 1.0\n",
1078 |       "82: 0.0\n",
1079 |       "84: 0.0\n",
1080 |       "90: 0.0\n",
1081 |       "92: 1.0\n",
1082 |       "95: 0.0\n",
1083 |       "96: 0.0\n",
1084 |       "97: 1.0\n",
1085 |       "98: 1.0\n",
1086 |       "99: 0.0\n",
1087 |       "100: 0.0\n",
1088 |       "101: 0.0\n",
1089 |       "102: 1.0\n",
1090 |       "104: 0.0\n",
1091 |       "109: 0.0\n",
1092 |       "111: 1.0\n",
1093 |       "0.4473684210526316\n"
1094 |      ]
1095 |     }
1096 |    ],
1097 |    "source": [
1098 |     "precision_all = 0.0\n",
1099 |     "for query_id in mappings.keys():\n",
1100 |     "    gold_standard = mappings.get(str(query_id))\n",
1101 |     "    query = qry_vectors.get(str(query_id))\n",
1102 |     "    result = \"\"\n",
1103 |     "    model_output = []\n",
1104 |     "    max_sim = 0.0\n",
1105 |     "    prefiltered_docs = prefilter(doc_terms, qry_terms.get(str(query_id)))\n",
1106 |     "    for doc_id in prefiltered_docs:\n",
1107 |     "        document = doc_vectors.get(doc_id)\n",
1108 |     "        cosine = calculate_cosine(query, document) \n",
1109 |     "        if cosine >= max_sim:\n",
1110 |     "            max_sim = cosine\n",
1111 |     "            result = doc_id\n",
1112 |     "    model_output.append(result)\n",
1113 |     "    precision = calculate_precision(model_output, gold_standard)\n",
1114 |     "    print(f\"{str(query_id)}: {str(precision)}\")\n",
1115 |     "    precision_all += precision\n",
1116 |     "\n",
1117 |     "print(precision_all/len(mappings.keys()))"
1118 |    ]
1119 |   },
1120 |   {
1121 |    "cell_type": "markdown",
1122 |    "metadata": {
1123 |     "collapsed": true
1124 |    },
1125 |    "source": [
1126 |     "MRR – rank of the first relevant entry:"
1127 |    ]
1128 |   },
1129 |   {
1130 |    "cell_type": "code",
1131 |    "execution_count": 21,
1132 |    "metadata": {},
1133 |    "outputs": [
1134 |     {
1135 |      "name": "stdout",
1136 |      "output_type": "stream",
1137 |      "text": [
1138 |       "1: 1.0\n",
1139 |       "2: 0.3333333333333333\n",
1140 |       "3: 1.0\n",
1141 |       "4: 0.09090909090909091\n",
1142 |       "5: 0.14285714285714285\n",
1143 |       "6: 0.038461538461538464\n",
1144 |       "7: 0.043478260869565216\n",
1145 |       "8: 0.02857142857142857\n",
1146 |       "9: 0.5\n",
1147 |       "10: 1.0\n",
1148 |       "11: 1.0\n",
1149 |       "12: 0.1\n",
1150 |       "13: 1.0\n",
1151 |       "14: 0.011494252873563218\n",
1152 |       "15: 0.125\n",
1153 |       "16: 0.029411764705882353\n",
1154 |       "17: 0.25\n",
1155 |       "18: 0.25\n",
1156 |       "19: 0.25\n",
1157 |       "20: 0.5\n",
1158 |       "21: 0.05555555555555555\n",
1159 |       "22: 0.09090909090909091\n",
1160 |       "23: 0.5\n",
1161 |       "24: 1.0\n",
1162 |       "25: 0.1111111111111111\n",
1163 |       "26: 1.0\n",
1164 |       "27: 1.0\n",
1165 |       "28: 1.0\n",
1166 |       "29: 1.0\n",
1167 |       "30: 1.0\n",
1168 |       "31: 0.5\n",
1169 |       "32: 0.3333333333333333\n",
1170 |       "33: 0.05555555555555555\n",
1171 |       "34: 1.0\n",
1172 |       "35: 0.5\n",
1173 |       "37: 1.0\n",
1174 |       "39: 1.0\n",
1175 |       "41: 1.0\n",
1176 |       "42: 1.0\n",
1177 |       "43: 0.14285714285714285\n",
1178 |       "44: 0.5\n",
1179 |       "45: 1.0\n",
1180 |       "46: 0.5\n",
1181 |       "49: 0.3333333333333333\n",
1182 |       "50: 1.0\n",
1183 |       "52: 1.0\n",
1184 |       "54: 0.3333333333333333\n",
1185 |       "55: 1.0\n",
1186 |       "56: 1.0\n",
1187 |       "57: 0.09090909090909091\n",
1188 |       "58: 1.0\n",
1189 |       "61: 0.3333333333333333\n",
1190 |       "62: 1.0\n",
1191 |       "65: 1.0\n",
1192 |       "66: 1.0\n",
1193 |       "67: 0.25\n",
1194 |       "69: 1.0\n",
1195 |       "71: 0.25\n",
1196 |       "76: 1.0\n",
1197 |       "79: 1.0\n",
1198 |       "81: 1.0\n",
1199 |       "82: 0.5\n",
1200 |       "84: 0.05\n",
1201 |       "90: 0.2\n",
1202 |       "92: 1.0\n",
1203 |       "95: 0.5\n",
1204 |       "96: 0.08333333333333333\n",
1205 |       "97: 1.0\n",
1206 |       "98: 1.0\n",
1207 |       "99: 0.3333333333333333\n",
1208 |       "100: 0.1\n",
1209 |       "101: 0.020833333333333332\n",
1210 |       "102: 1.0\n",
1211 |       "104: 0.25\n",
1212 |       "109: 0.5\n",
1213 |       "111: 1.0\n",
1214 |       "0.5804111538527951\n"
1215 |      ]
1216 |     }
1217 |    ],
1218 |    "source": [
1219 |     "rank_all = 0.0\n",
1220 |     "for query_id in mappings.keys():\n",
1221 |     "    gold_standard = mappings.get(str(query_id))\n",
1222 |     "    query = qry_vectors.get(str(query_id))\n",
1223 |     "    results = {}\n",
1224 |     "    for doc_id in doc_vectors.keys():\n",
1225 |     "        document = doc_vectors.get(doc_id)\n",
1226 |     "        cosine = calculate_cosine(query, document)    \n",
1227 |     "        results[doc_id] = cosine\n",
1228 |     "    sorted_results = sorted(results.items(), key=itemgetter(1), reverse=True)\n",
1229 |     "    index = 0\n",
1230 |     "    found = False\n",
1231 |     "    while found==False:\n",
1232 |     "        item = sorted_results[index]\n",
1233 |     "        index += 1\n",
1234 |     "        if index==len(sorted_results):\n",
1235 |     "            found = True\n",
1236 |     "        if item[0] in gold_standard:\n",
1237 |     "            found = True\n",
1238 |     "            print(f\"{str(query_id)}: {str(float(1) / float(index))}\")\n",
1239 |     "            rank_all += float(1) / float(index)\n",
1240 |     "            \n",
1241 |     "            \n",
1242 |     "print(rank_all/float(len(mappings.keys())))"
1243 |    ]
1244 |   },
1245 |   {
1246 |    "cell_type": "code",
1247 |    "execution_count": null,
1248 |    "metadata": {},
1249 |    "outputs": [],
1250 |    "source": []
1251 |   }
1252 |  ],
1253 |  "metadata": {
1254 |   "kernelspec": {
1255 |    "display_name": "Python 3",
1256 |    "language": "python",
1257 |    "name": "python3"
1258 |   },
1259 |   "language_info": {
1260 |    "codemirror_mode": {
1261 |     "name": "ipython",
1262 |     "version": 3
1263 |    },
1264 |    "file_extension": ".py",
1265 |    "mimetype": "text/x-python",
1266 |    "name": "python",
1267 |    "nbconvert_exporter": "python",
1268 |    "pygments_lexer": "ipython3",
1269 |    "version": "3.7.6"
1270 |   }
1271 |  },
1272 |  "nbformat": 4,
1273 |  "nbformat_minor": 2
1274 | }
1275 | 


--------------------------------------------------------------------------------
/Chapter4.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Understanding the task\n",
  8 |     "\n",
  9 |     "First, a simple example:"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 1,
 15 |    "metadata": {},
 16 |    "outputs": [
 17 |     {
 18 |      "name": "stdout",
 19 |      "output_type": "stream",
 20 |      "text": [
 21 |       "Participant1 = Harry\n",
 22 |       "Action = met\n",
 23 |       "Participant2 = Sally\n"
 24 |      ]
 25 |     }
 26 |    ],
 27 |    "source": [
 28 |     "information = \"When Harry met Sally\"\n",
 29 |     "words = information.split()\n",
 30 |     "print (f\"Participant1 = {words[words.index('met')-1]}\")\n",
 31 |     "print (f\"Action = met\")\n",
 32 |     "print (f\"Participant2 = {words[words.index('met')+1]}\")"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "markdown",
 37 |    "metadata": {},
 38 |    "source": [
 39 |     "Represent the action and its participants as tuples. Then you can easily extract the answers to queries:"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": 1,
 45 |    "metadata": {},
 46 |    "outputs": [
 47 |     {
 48 |      "name": "stdout",
 49 |      "output_type": "stream",
 50 |      "text": [
 51 |       "['his cabinet', 'Finnish President']\n"
 52 |      ]
 53 |     }
 54 |    ],
 55 |    "source": [
 56 |     "meetings = [('Boris Johnson', 'meets with', 'the Queen'),\n",
 57 |     "            ('Donald Trump', 'meets with', 'his cabinet'),\n",
 58 |     "            ('administration', 'meets with', 'tech giants'),\n",
 59 |     "            ('the Queen', 'meets with', 'the Prime Minister'),\n",
 60 |     "            ('Donald Trump', 'meets with', 'Finnish President')]\n",
 61 |     "query = [p2 for (p1, act, p2) in meetings if p1=='Donald Trump']\n",
 62 |     "print(query)"
 63 |    ]
 64 |   },
 65 |   {
 66 |    "cell_type": "markdown",
 67 |    "metadata": {},
 68 |    "source": [
 69 |     "As \"meeting\" is a mutual action, a participant may appear on the right or on the left – make sure both cases are covered:"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "code",
 74 |    "execution_count": 2,
 75 |    "metadata": {},
 76 |    "outputs": [
 77 |     {
 78 |      "name": "stdout",
 79 |      "output_type": "stream",
 80 |      "text": [
 81 |       "['the Prime Minister', 'Boris Johnson']\n"
 82 |      ]
 83 |     }
 84 |    ],
 85 |    "source": [
 86 |     "query = [p2 for (p1, act, p2) in meetings if p1=='the Queen']\n",
 87 |     "query += [p1 for (p1, act, p2) in meetings if p2=='the Queen']\n",
 88 |     "print(query)"
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "markdown",
 93 |    "metadata": {},
 94 |    "source": [
 95 |     "# Natural Language Processing with spaCy\n",
 96 |     "\n",
 97 |     "## Part-of-speech tagging\n",
 98 |     "\n",
 99 |     "Run `nlp` pipeline on some input text:"
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "code",
104 |    "execution_count": 3,
105 |    "metadata": {},
106 |    "outputs": [
107 |     {
108 |      "name": "stdout",
109 |      "output_type": "stream",
110 |      "text": [
111 |       " Word         Position  Lowercase    Lemma        POS    Alphanumeric  Stopword \n",
112 |       " On           0         on           on           ADP    True          True     \n",
113 |       " Friday       1         friday       Friday       PROPN  True          False    \n",
114 |       " board        2         board        board        NOUN   True          False    \n",
115 |       " members      3         members      member       NOUN   True          False    \n",
116 |       " meet         4         meet         meet         VERB   True          False    \n",
117 |       " with         5         with         with         ADP    True          True     \n",
118 |       " senior       6         senior       senior       ADJ    True          False    \n",
119 |       " managers     7         managers     manager      NOUN   True          False    \n",
120 |       " to           8         to           to           PART   True          True     \n",
121 |       " discuss      9         discuss      discuss      VERB   True          False    \n",
122 |       " future       10        future       future       ADJ    True          False    \n",
123 |       " development  11        development  development  NOUN   True          False    \n",
124 |       " of           12        of           of           ADP    True          True     \n",
125 |       " the          13        the          the          DET    True          True     \n",
126 |       " company      14        company      company      NOUN   True          False    \n",
127 |       " .            15        .            .            PUNCT  False         False    \n"
128 |      ]
129 |     }
130 |    ],
131 |    "source": [
132 |     "import spacy\n",
133 |     "\n",
134 |     "nlp = spacy.load(\"en_core_web_sm\")\n",
135 |     "doc = nlp(\"On Friday board members meet with senior managers \" +\n",
136 |     "          \"to discuss future development of the company.\")\n",
137 |     "\n",
138 |     "rows = []\n",
139 |     "rows.append([\"Word\", \"Position\", \"Lowercase\", \"Lemma\", \"POS\", \"Alphanumeric\", \"Stopword\"])\n",
140 |     "for token in doc:\n",
141 |     "    rows.append([token.text, str(token.i), token.lower_, token.lemma_, \n",
142 |     "                 token.pos_, str(token.is_alpha), str(token.is_stop)])\n",
143 |     "\n",
144 |     "columns = zip(*rows)\n",
145 |     "column_widths = [max(len(item) for item in col) for col in columns]\n",
146 |     "for row in rows:\n",
147 |     "    print(''.join(' {:{width}} '.format(row[i], width=column_widths[i]) \n",
148 |     "                  for i in range(0, len(row))))\n"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "markdown",
153 |    "metadata": {},
154 |    "source": [
155 |     "Use more challenging text, for example \"Jabberwocky\":"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": 4,
161 |    "metadata": {},
162 |    "outputs": [
163 |     {
164 |      "name": "stdout",
165 |      "output_type": "stream",
166 |      "text": [
167 |       " Word          Position  Lowercase     Lemma         POS    Alphanumeric  Stopword \n",
168 |       " Beware        0         beware        beware        VERB   True          False    \n",
169 |       " the           1         the           the           DET    True          True     \n",
170 |       " Jabberwock    2         jabberwock    Jabberwock    PROPN  True          False    \n",
171 |       " ,             3         ,             ,             PUNCT  False         False    \n",
172 |       " my            4         my            my            PRON   True          True     \n",
173 |       " son           5         son           son           NOUN   True          False    \n",
174 |       " !             6         !             !             PUNCT  False         False    \n",
175 |       " The           7         the           the           DET    True          True     \n",
176 |       " jaws          8         jaws          jaw           NOUN   True          False    \n",
177 |       " that          9         that          that          DET    True          True     \n",
178 |       " bite          10        bite          bite          VERB   True          False    \n",
179 |       " ,             11        ,             ,             PUNCT  False         False    \n",
180 |       " the           12        the           the           DET    True          True     \n",
181 |       " claws         13        claws         claw          NOUN   True          False    \n",
182 |       " that          14        that          that          DET    True          True     \n",
183 |       " catch         15        catch         catch         VERB   True          False    \n",
184 |       " !             16        !             !             PUNCT  False         False    \n",
185 |       " Beware        17        beware        beware        VERB   True          False    \n",
186 |       " the           18        the           the           DET    True          True     \n",
187 |       " Jubjub        19        jubjub        Jubjub        PROPN  True          False    \n",
188 |       " bird          20        bird          bird          NOUN   True          False    \n",
189 |       " ,             21        ,             ,             PUNCT  False         False    \n",
190 |       " and           22        and           and           CCONJ  True          True     \n",
191 |       " shun          23        shun          shun          VERB   True          False    \n",
192 |       " The           24        the           the           DET    True          True     \n",
193 |       " frumious      25        frumious      frumious      ADJ    True          False    \n",
194 |       " Bandersnatch  26        bandersnatch  Bandersnatch  PROPN  True          False    \n",
195 |       " !             27        !             !             PUNCT  False         False    \n"
196 |      ]
197 |     }
198 |    ],
199 |    "source": [
200 |     "import spacy\n",
201 |     "\n",
202 |     "nlp = spacy.load(\"en_core_web_sm\")\n",
203 |     "doc = nlp(\"Beware the Jabberwock, my son! The jaws that bite, the claws that catch! \" +\n",
204 |     "          \"Beware the Jubjub bird, and shun The frumious Bandersnatch!\")\n",
205 |     "\n",
206 |     "rows = []\n",
207 |     "rows.append([\"Word\", \"Position\", \"Lowercase\", \"Lemma\", \"POS\", \"Alphanumeric\", \"Stopword\"])\n",
208 |     "for token in doc:\n",
209 |     "    rows.append([token.text, str(token.i), token.lower_, token.lemma_, \n",
210 |     "                 token.pos_, str(token.is_alpha), str(token.is_stop)])\n",
211 |     "\n",
212 |     "columns = zip(*rows)\n",
213 |     "column_widths = [max(len(item) for item in col) for col in columns]\n",
214 |     "for row in rows:\n",
215 |     "    print(''.join(' {:{width}} '.format(row[i], width=column_widths[i]) \n",
216 |     "                  for i in range(0, len(row))))"
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "markdown",
221 |    "metadata": {},
222 |    "source": [
223 |     "## Parsing\n",
224 |     "\n",
225 |     "Identify all noun phrases (groups of words that include a noun and all related words). These will be good candidates for the participants of the event:"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "code",
230 |    "execution_count": 5,
231 |    "metadata": {},
232 |    "outputs": [
233 |     {
234 |      "name": "stdout",
235 |      "output_type": "stream",
236 |      "text": [
237 |       "Friday\tFriday\tpobj\tOn\n",
238 |       "board members\tmembers\tnsubj\tmeet\n",
239 |       "senior managers\tmanagers\tpobj\twith\n",
240 |       "future development\tdevelopment\tdobj\tdiscuss\n",
241 |       "the company\tcompany\tpobj\tof\n"
242 |      ]
243 |     }
244 |    ],
245 |    "source": [
246 |     "import spacy\n",
247 |     "\n",
248 |     "nlp = spacy.load(\"en_core_web_sm\")\n",
249 |     "doc = nlp(\"On Friday, board members meet with senior managers \" +\n",
250 |     "          \"to discuss future development of the company.\")\n",
251 |     "\n",
252 |     "for chunk in doc.noun_chunks:\n",
253 |     "    print('\\t'.join([chunk.text, chunk.root.text, chunk.root.dep_,\n",
254 |     "            chunk.root.head.text]))"
255 |    ]
256 |   },
257 |   {
258 |    "cell_type": "markdown",
259 |    "metadata": {},
260 |    "source": [
261 |     "Here is how you can visualize the dependencies – call on `displacy` and store output in a file:"
262 |    ]
263 |   },
264 |   {
265 |    "cell_type": "code",
266 |    "execution_count": 6,
267 |    "metadata": {},
268 |    "outputs": [
269 |     {
270 |      "data": {
271 |       "text/plain": [
272 |        "12233"
273 |       ]
274 |      },
275 |      "execution_count": 6,
276 |      "metadata": {},
277 |      "output_type": "execute_result"
278 |     }
279 |    ],
280 |    "source": [
281 |     "from spacy import displacy\n",
282 |     "from pathlib import Path\n",
283 |     "\n",
284 |     "svg = displacy.render(doc, style='dep', jupyter=False)\n",
285 |     "file_name = '-'.join([w.text for w in doc if not w.is_punct]) + \".svg\"\n",
286 |     "output_path = Path(file_name)\n",
287 |     "output_path.open(\"w\", encoding=\"utf-8\").write(svg)"
288 |    ]
289 |   },
290 |   {
291 |    "cell_type": "markdown",
292 |    "metadata": {},
293 |    "source": [
294 |     "Get all the dependencies for all the words in the sentence:"
295 |    ]
296 |   },
297 |   {
298 |    "cell_type": "code",
299 |    "execution_count": 7,
300 |    "metadata": {},
301 |    "outputs": [
302 |     {
303 |      "name": "stdout",
304 |      "output_type": "stream",
305 |      "text": [
306 |       "On prep meet VERB [Friday]\n",
307 |       "Friday pobj On ADP []\n",
308 |       ", punct meet VERB []\n",
309 |       "board compound members NOUN []\n",
310 |       "members nsubj meet VERB [board]\n",
311 |       "meet ROOT meet VERB [On, ,, members, with, discuss, .]\n",
312 |       "with prep meet VERB [managers]\n",
313 |       "senior amod managers NOUN []\n",
314 |       "managers pobj with ADP [senior]\n",
315 |       "to aux discuss VERB []\n",
316 |       "discuss advcl meet VERB [to, development]\n",
317 |       "future amod development NOUN []\n",
318 |       "development dobj discuss VERB [future, of]\n",
319 |       "of prep development NOUN [company]\n",
320 |       "the det company NOUN []\n",
321 |       "company pobj of ADP [the]\n",
322 |       ". punct meet VERB []\n"
323 |      ]
324 |     }
325 |    ],
326 |    "source": [
327 |     "for token in doc:\n",
328 |     "    print(token.text, token.dep_, token.head.text, token.head.pos_,\n",
329 |     "            [child for child in token.children])"
330 |    ]
331 |   },
332 |   {
333 |    "cell_type": "markdown",
334 |    "metadata": {},
335 |    "source": [
336 |     "Now let's iterate through the words and only identify the participants of the action when the action is expressed with \"meet\":"
337 |    ]
338 |   },
339 |   {
340 |    "cell_type": "code",
341 |    "execution_count": 8,
342 |    "metadata": {},
343 |    "outputs": [
344 |     {
345 |      "name": "stdout",
346 |      "output_type": "stream",
347 |      "text": [
348 |       "Participant1 = board members\n",
349 |       "Action = meet with\n",
350 |       "Participant2 = senior managers\n"
351 |      ]
352 |     }
353 |    ],
354 |    "source": [
355 |     "for token in doc:\n",
356 |     "    if token.lemma_==\"meet\" and token.pos_==\"VERB\" and token.dep_==\"ROOT\":\n",
357 |     "        action = token.text\n",
358 |     "        children = [child for child in token.children]\n",
359 |     "        participant1 = \"\"\n",
360 |     "        participant2 = \"\"\n",
361 |     "        for child1 in children:\n",
362 |     "            if child1.dep_==\"nsubj\":\n",
363 |     "                participant1 = \" \".join([attr.text for \n",
364 |     "                                         attr in child1.children]) + \" \" + child1.text\n",
365 |     "            elif child1.text==\"with\":\n",
366 |     "                action += \" \" + child1.text\n",
367 |     "                child1_children = [child for child in child1.children]\n",
368 |     "                for child2 in child1_children:\n",
369 |     "                    if child2.pos_ == \"NOUN\":\n",
370 |     "                        participant2 = \" \".join([attr.text for \n",
371 |     "                                             attr in child2.children]) + \" \" + child2.text\n",
372 |     "print (f\"Participant1 = {participant1}\")\n",
373 |     "print (f\"Action = {action}\")\n",
374 |     "print (f\"Participant2 = {participant2}\")    "
375 |    ]
376 |   },
377 |   {
378 |    "cell_type": "markdown",
379 |    "metadata": {},
380 |    "source": [
381 |     "Use various sentences and improve the code so that it can deal with different formats of the expression:"
382 |    ]
383 |   },
384 |   {
385 |    "cell_type": "code",
386 |    "execution_count": 9,
387 |    "metadata": {},
388 |    "outputs": [
389 |     {
390 |      "name": "stdout",
391 |      "output_type": "stream",
392 |      "text": [
393 |       "\n",
394 |       "Sentence = On Friday, board members meet with senior managers to discuss future development of the company.\n",
395 |       "Participant1 = board members\n",
396 |       "Action = meet with\n",
397 |       "Participant2 = senior managers\n",
398 |       "\n",
399 |       "Sentence = Boris Johnson met with the Queen last week.\n",
400 |       "Participant1 = Boris Johnson\n",
401 |       "Action = met with\n",
402 |       "Participant2 = the Queen\n",
403 |       "\n",
404 |       "Sentence = Donald Trump meets the Queen at Buckingham Palace.\n",
405 |       "Participant1 = Donald Trump\n",
406 |       "Action = meets\n",
407 |       "Participant2 = the Queen\n",
408 |       "\n",
409 |       "Sentence = The two leaders also posed for photographs and the President talked to reporters.\n",
410 |       "Participant1 = \n",
411 |       "Action = \n",
412 |       "Participant2 = \n"
413 |      ]
414 |     }
415 |    ],
416 |    "source": [
417 |     "sentences = [\"On Friday, board members meet with senior managers \" +\n",
418 |     "             \"to discuss future development of the company.\", \n",
419 |     "             \"Boris Johnson met with the Queen last week.\",\n",
420 |     "             \"Donald Trump meets the Queen at Buckingham Palace.\",\n",
421 |     "             \"The two leaders also posed for photographs and \" +\n",
422 |     "             \"the President talked to reporters.\"]\n",
423 |     "\n",
424 |     "def extract_information(doc):\n",
425 |     "    action=\"\"\n",
426 |     "    participant1 = \"\"\n",
427 |     "    participant2 = \"\"\n",
428 |     "    for token in doc:\n",
429 |     "        if token.lemma_==\"meet\" and token.pos_==\"VERB\" and token.dep_==\"ROOT\":\n",
430 |     "            action = token.text\n",
431 |     "            children = [child for child in token.children]   \n",
432 |     "            for child1 in children:\n",
433 |     "                if child1.dep_==\"nsubj\":\n",
434 |     "                    participant1 = \" \".join([attr.text for \n",
435 |     "                                             attr in child1.children]) + \" \" + child1.text\n",
436 |     "                elif child1.text==\"with\":\n",
437 |     "                    action += \" \" + child1.text\n",
438 |     "                    child1_children = [child for child in child1.children]\n",
439 |     "                    for child2 in child1_children:\n",
440 |     "                        if child2.pos_ == \"NOUN\" or child2.pos_ == \"PROPN\":\n",
441 |     "                            participant2 = \" \".join([attr.text for \n",
442 |     "                                                 attr in child2.children]) + \" \" + child2.text\n",
443 |     "                elif child1.dep_==\"dobj\" and (child1.pos_ == \"NOUN\"\n",
444 |     "                                              or child1.pos_ == \"PROPN\"):\n",
445 |     "                    participant2 = \" \".join([attr.text for \n",
446 |     "                                             attr in child1.children]) + \" \" + child1.text\n",
447 |     "    print (f\"Participant1 = {participant1}\")\n",
448 |     "    print (f\"Action = {action}\")\n",
449 |     "    print (f\"Participant2 = {participant2}\")\n",
450 |     "\n",
451 |     "for sent in sentences:\n",
452 |     "    print(f\"\\nSentence = {sent}\")\n",
453 |     "    doc = nlp(sent)\n",
454 |     "    extract_information(doc)"
455 |    ]
456 |   },
457 |   {
458 |    "cell_type": "code",
459 |    "execution_count": null,
460 |    "metadata": {},
461 |    "outputs": [],
462 |    "source": []
463 |   }
464 |  ],
465 |  "metadata": {
466 |   "kernelspec": {
467 |    "display_name": "Python 3",
468 |    "language": "python",
469 |    "name": "python3"
470 |   },
471 |   "language_info": {
472 |    "codemirror_mode": {
473 |     "name": "ipython",
474 |     "version": 3
475 |    },
476 |    "file_extension": ".py",
477 |    "mimetype": "text/x-python",
478 |    "name": "python",
479 |    "nbconvert_exporter": "python",
480 |    "pygments_lexer": "ipython3",
481 |    "version": "3.7.6"
482 |   }
483 |  },
484 |  "nbformat": 4,
485 |  "nbformat_minor": 2
486 | }
487 | 


--------------------------------------------------------------------------------
/Chapter9.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Chapter 9: Topic Analysis\n",
  8 |     "\n",
  9 |     "## Supervised Approach\n",
 10 |     "\n",
 11 |     "First, let's import the libraries that we are going to use in this notebook:"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": 1,
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "from sklearn.datasets import fetch_20newsgroups\n",
 21 |     "import numpy as np"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "markdown",
 26 |    "metadata": {},
 27 |    "source": [
 28 |     "Now, let's define a method to load *training* and *test* subsets using a predefined list of categories. Note that following options are also available:\n",
 29 |     "- you can use `load_dataset('all', categories)` to load the whole dataset\n",
 30 |     "- you can use `load_dataset('train', None)` to load the set of all topics"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": 2,
 36 |    "metadata": {},
 37 |    "outputs": [],
 38 |    "source": [
 39 |     "def load_dataset(a_set, cats):\n",
 40 |     "    dataset = fetch_20newsgroups(subset=a_set, categories=cats,\n",
 41 |     "                          remove=('headers', 'footers', 'quotes'),\n",
 42 |     "                          shuffle=True)\n",
 43 |     "    return dataset"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": 3,
 49 |    "metadata": {},
 50 |    "outputs": [],
 51 |    "source": [
 52 |     "categories = [\"comp.windows.x\", \"misc.forsale\", \"rec.autos\", \"rec.motorcycles\", \"rec.sport.baseball\"]\n",
 53 |     "categories += [\"rec.sport.hockey\", \"sci.crypt\", \"sci.med\", \"sci.space\", \"talk.politics.mideast\"]\n",
 54 |     "\n",
 55 |     "newsgroups_train = load_dataset('train', categories)\n",
 56 |     "newsgroups_test = load_dataset('test', categories)"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "markdown",
 61 |    "metadata": {},
 62 |    "source": [
 63 |     "Let's check our uploaded data subsets:"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": 4,
 69 |    "metadata": {},
 70 |    "outputs": [],
 71 |    "source": [
 72 |     "def check_data(dataset):\n",
 73 |     "    print(list(dataset.target_names))\n",
 74 |     "    print(dataset.filenames.shape)\n",
 75 |     "    print(dataset.target.shape)\n",
 76 |     "    if dataset.filenames.shape[0]==dataset.target.shape[0]:\n",
 77 |     "        print(\"Equal sizes for data and targets\")\n",
 78 |     "    print(dataset.filenames[0])\n",
 79 |     "    print(dataset.data[0])\n",
 80 |     "    print(dataset.target[:10])"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": 5,
 86 |    "metadata": {},
 87 |    "outputs": [
 88 |     {
 89 |      "name": "stdout",
 90 |      "output_type": "stream",
 91 |      "text": [
 92 |       "['comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.med', 'sci.space', 'talk.politics.mideast']\n",
 93 |       "(5913,)\n",
 94 |       "(5913,)\n",
 95 |       "Equal sizes for data and targets\n",
 96 |       "/Users/ekaterinakochmar/scikit_learn_data/20news_home/20news-bydate-train/rec.sport.baseball/102665\n",
 97 |       "I have posted the logos of the NL East teams to alt.binaries.pictures.misc \n",
 98 |       " Hopefully, I'll finish the series up next week with the NL West.\n",
 99 |       "\n",
100 |       " Darren\n",
101 |       "\n",
102 |       "[4 3 9 7 4 3 0 5 7 8]\n",
103 |       "\n",
104 |       "***\n",
105 |       "\n",
106 |       "['comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.med', 'sci.space', 'talk.politics.mideast']\n",
107 |       "(3937,)\n",
108 |       "(3937,)\n",
109 |       "Equal sizes for data and targets\n",
110 |       "/Users/ekaterinakochmar/scikit_learn_data/20news_home/20news-bydate-test/misc.forsale/76785\n",
111 |       "As the title says. I would like to sell my Star LV2010 9 pin printer.\n",
112 |       "Its a narrow colum dot matrix, supports both parallel and serial\n",
113 |       "interfacing, prints at 200 characters per second, has a 16K buffer, \n",
114 |       "and is very dependable...\n",
115 |       "\n",
116 |       "Drop some mail if your interested in it. $55 Plus shipping get the\n",
117 |       "printer, and 6 extra srink-wraped ribbons, parallel connection\n",
118 |       "cable, power cord, manual, and ONE sheet of paper (smile)...\n",
119 |       "[1 7 2 5 3 5 7 3 0 2]\n"
120 |      ]
121 |     }
122 |    ],
123 |    "source": [
124 |     "check_data(newsgroups_train)\n",
125 |     "print(\"\\n***\\n\")\n",
126 |     "check_data(newsgroups_test)"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "markdown",
131 |    "metadata": {},
132 |    "source": [
133 |     "Now let's create word vectors based on the content of the posts:"
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "code",
138 |    "execution_count": 6,
139 |    "metadata": {},
140 |    "outputs": [],
141 |    "source": [
142 |     "from sklearn.feature_extraction.text import TfidfVectorizer\n",
143 |     "\n",
144 |     "vectorizer = TfidfVectorizer(stop_words = 'english')\n",
145 |     "\n",
146 |     "def text2vec(vectorizer, train_set, test_set):\n",
147 |     "    vectors_train = vectorizer.fit_transform(train_set.data)\n",
148 |     "    vectors_test = vectorizer.transform(test_set.data)\n",
149 |     "    return vectors_train, vectors_test\n",
150 |     "\n",
151 |     "vectors_train, vectors_test = text2vec(vectorizer, newsgroups_train, newsgroups_test)"
152 |    ]
153 |   },
154 |   {
155 |    "cell_type": "markdown",
156 |    "metadata": {},
157 |    "source": [
158 |     "Let's check how the data looks like now:"
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "code",
163 |    "execution_count": 7,
164 |    "metadata": {},
165 |    "outputs": [
166 |     {
167 |      "name": "stdout",
168 |      "output_type": "stream",
169 |      "text": [
170 |       "(5913, 52746)\n",
171 |       "(3937, 52746)\n",
172 |       "  (0, 15218)\t0.31618146678372416\n",
173 |       "  (0, 50534)\t0.20153071455804605\n",
174 |       "  (0, 50435)\t0.1817612919269656\n",
175 |       "  (0, 42031)\t0.1891577831889085\n",
176 |       "  (0, 20349)\t0.2372918776268056\n",
177 |       "  (0, 29215)\t0.14244326085583361\n",
178 |       "  (0, 24214)\t0.23045715683316248\n",
179 |       "  (0, 31546)\t0.21952696479551445\n",
180 |       "  (0, 36274)\t0.23637098993673133\n",
181 |       "  (0, 9616)\t0.2606508810838842\n",
182 |       "  (0, 6736)\t0.23045715683316248\n",
183 |       "  (0, 46098)\t0.18751137951875305\n",
184 |       "  (0, 17820)\t0.1996672692556469\n",
185 |       "  (0, 33404)\t0.47274197987346267\n",
186 |       "  (0, 29330)\t0.32348469409130415\n",
187 |       "  (0, 36985)\t0.1806134526365663\n",
188 |       "nl\n"
189 |      ]
190 |     }
191 |    ],
192 |    "source": [
193 |     "print(vectors_train.shape)\n",
194 |     "print(vectors_test.shape)\n",
195 |     "print(vectors_train[0])\n",
196 |     "print(vectorizer.get_feature_names()[33404])"
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "markdown",
201 |    "metadata": {},
202 |    "source": [
203 |     "Next, let's apply the Multinomial Naive Bayes classifier:"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": 8,
209 |    "metadata": {},
210 |    "outputs": [],
211 |    "source": [
212 |     "from sklearn.naive_bayes import MultinomialNB\n",
213 |     "\n",
214 |     "clf = MultinomialNB(alpha=0.1)\n",
215 |     "clf.fit(vectors_train, newsgroups_train.target)\n",
216 |     "predictions = clf.predict(vectors_test)"
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "markdown",
221 |    "metadata": {},
222 |    "source": [
223 |     "Finally, let's evaluate the results, extract the most informative terms per topic, and print out and vosialise the confusion matrix:"
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "code",
228 |    "execution_count": 11,
229 |    "metadata": {},
230 |    "outputs": [
231 |     {
232 |      "name": "stdout",
233 |      "output_type": "stream",
234 |      "text": [
235 |       "                       precision    recall  f1-score   support\n",
236 |       "\n",
237 |       "       comp.windows.x       0.92      0.90      0.91       395\n",
238 |       "         misc.forsale       0.88      0.87      0.87       390\n",
239 |       "            rec.autos       0.83      0.78      0.80       396\n",
240 |       "      rec.motorcycles       0.85      0.80      0.83       398\n",
241 |       "   rec.sport.baseball       0.92      0.84      0.88       397\n",
242 |       "     rec.sport.hockey       0.71      0.94      0.81       399\n",
243 |       "            sci.crypt       0.82      0.85      0.83       396\n",
244 |       "              sci.med       0.92      0.82      0.87       396\n",
245 |       "            sci.space       0.86      0.82      0.84       394\n",
246 |       "talk.politics.mideast       0.86      0.90      0.88       376\n",
247 |       "\n",
248 |       "             accuracy                           0.85      3937\n",
249 |       "            macro avg       0.86      0.85      0.85      3937\n",
250 |       "         weighted avg       0.86      0.85      0.85      3937\n",
251 |       "\n",
252 |       "comp.windows.x: program using application windows widget use thanks motif server window\n",
253 |       "misc.forsale: asking email sell price condition new shipping offer 00 sale\n",
254 |       "rec.autos: know don new good dealer engine just like cars car\n",
255 |       "rec.motorcycles: don helmet riding just like motorcycle ride bikes dod bike\n",
256 |       "rec.sport.baseball: braves pitching hit think runs games game baseball team year\n",
257 |       "rec.sport.hockey: think year nhl season games players play hockey team game\n",
258 |       "sci.crypt: escrow people use nsa keys government clipper chip encryption key\n",
259 |       "sci.med: cadre dsl chastity n3jxp skepticism banks pitt geb gordon msg\n",
260 |       "sci.space: lunar just shuttle earth like moon orbit launch nasa space\n",
261 |       "talk.politics.mideast: just said arab turkish armenians people armenian jews israeli israel\n"
262 |      ]
263 |     }
264 |    ],
265 |    "source": [
266 |     "from sklearn import metrics\n",
267 |     "\n",
268 |     "def show_top(classifier, categories, vectorizer, n):\n",
269 |     "    feature_names = np.asarray(vectorizer.get_feature_names())\n",
270 |     "    for i, category in enumerate(categories):\n",
271 |     "        top = np.argsort(classifier.coef_[i])[-n:]\n",
272 |     "        print(f'{category}: {\" \".join(feature_names[top])}')\n",
273 |     "        \n",
274 |     "\n",
275 |     "full_report = metrics.classification_report(newsgroups_test.target, \n",
276 |     "                                            predictions, target_names=newsgroups_test.target_names)\n",
277 |     "print(full_report)\n",
278 |     "show_top(clf, categories, vectorizer, 10)"
279 |    ]
280 |   },
281 |   {
282 |    "cell_type": "code",
283 |    "execution_count": 12,
284 |    "metadata": {},
285 |    "outputs": [
286 |     {
287 |      "name": "stdout",
288 |      "output_type": "stream",
289 |      "text": [
290 |       "[[355   8   2   2   1   8  12   2   5   0]\n",
291 |       " [  3 339  16   9   4  11   2   1   5   0]\n",
292 |       " [  0   9 308  25   3  27   7   3   9   5]\n",
293 |       " [  2   8  26 320   4  15   9   4   6   4]\n",
294 |       " [  3   7   0   2 333  33   4   5   2   8]\n",
295 |       " [  1   0   0   2   6 374   8   2   2   4]\n",
296 |       " [  9   7   3   3   2  18 335   2   7  10]\n",
297 |       " [  2   4   9   5   3  17   7 324  13  12]\n",
298 |       " [  9   3   7   2   3  18  11   6 324  11]\n",
299 |       " [  2   0   2   7   2   9  13   2   2 337]]\n"
300 |      ]
301 |     },
302 |     {
303 |      "data": {
304 |       "image/png": "iVBORw0KGgoAAAANSUhEUgAAATgAAAEGCAYAAADxD4m3AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nOydeZxN9f/Hn+9778wwDDMMYyzZd2UYiWSJlC1Lq1JJshQpLVKJypK0yZcWosiSFkopQtmX7HsYIluYbGOY7c7n98e5M8Yydz2Hmfl9nh7n4d5zz+d93vO5977vZ32/RCmFRqPR5EVs19sBjUajsQod4DQaTZ5FBziNRpNn0QFOo9HkWXSA02g0eRbH9XYgKxIUqiRfuOl2Y6qUNN1mbkMssmvVHLxVk/tiVUVYgBWuHjiwn/j4+IBM2wuVVSrtglfXqgsn5iulWgVyv0DIWQEuXzghdXqYbnfpgiGm2wSw23LPt0Us+manOdMtsetMtybCBTvM77RY5Koln69Gt9QL2IZKu0BI1Qe8ujZp07jIgG8YADkqwGk0mtyAgOSO0S0d4DQajW8IYLNfby+8Qgc4jUbjO7lkMFMHOI1G4yO6i6rRaPIyugXnPyFBDuZ+1I2QIAd2u405S3YwcvIfjBvQkUa1y3E2MQmAp9/5gW17/6VR7XJMH/oQB/49BcBPy3by7ldLfLrnpzP+YOqcVYgI1StGM2ZQF/KFBPn9Nxw6doqn3/iKY/+dxSZC106N6N25md/2rLbb962pzF++jciIMFbNfC1ge1kxq26fGz6dBSu2ExlRkCXTXgHgzbE/smD5NoKC7JQrFcno1x6mcFioX35aVbcAdToOoWBoCHabDbvdxqLJA0yxu3DlDl55/zuc6ek82uFW+j9+pyl23SLoFhyAiLQCPgLswOdKqZHelEtOTaPD85NJTErBYbfx65juLPxzDwCDP/uNOUt3XFFm1dYDdH5tul9+Hj1+mgnfLGH5jFfJny+Y7q9NYvaCDTzU7ha/7AE47DaGPtuJ2tXKkJCYRPPHRtGsflWqVYj226aVdh9q14AeDzSl95ApAdm5HDPr9sE29XnivsY889bUzHNNb67Ka73b4XDYGTpuDmOmLOT1Pu398tWqus3gh4/7UTS8oCm2AJzOdF4a9Q2zx/alZFQ4zbu+S+smN5rmb/ZIrmnBWRaGRcQOjANaAzWAh0SkhrflE5NSAAhy2Aly2LA6rVOaM52k5FTS0pxcSEqlRLFCAdkrEVmY2tXKABBWIB9Vypfg6IkzAftpld1GdSsRUci/lo8nzKrbhnUqEX6Zj81uqYbDYczoxdYqy9ETp/3206q6tYr12/dToUwk5UpHEhzk4J6WdfllyZZrc3Ob3bvjOmNlO7M+EKeU2qeUSgG+Bjp4W9hmE5aO783uWS+xeN0+1v91GIBB3VuwfMJTDH+6FcFBFyvw5hplWDbhKb59+xGqlSvmk6PRxcN5uktzYjoOoVa7QRQqkI/bb6nukw13/HPkP7bsOkRszbKm2bTSrplYXbdZmfHzGpo3MMe22XUrwH39xtH8sVFMnr3CFJtHT5yhVFRE5vOSURHXKCC7Jhm8Oa4zVnpQCjiY5fkh17lLEJGeIrJORNap1POZ59PTFU16fkrNBz6gbrVSVC9XnLc+X0j9rv+j+dPjiQjLz7OdbwNgy56j3PTQhzTu8Qnjf1jD1Lce8snR02fPM2/pVtbPGsLWn4dxPimFb39d68/ffAXnzifTdeBERjx/D4UK5jfFppV2zcbKus3K6C9/w2G3ce9dga/Ut6Ju5054nj+mvMzM0U8x6bulrNwYF7DNq/VqrknPUVw38uZwZ0Ykn4j8KSKbRWS7iLzpOv+liPwtIptcR4zrvIjIGBGJE5EtIlLXk6tWBrir/XVXvCNKqfFKqXpKqXoSdGUX6WxiEss376dF/UocO3kOgJRUJ9PmbSS2mhEvE84nZ3ZpF6zZQ5DDRhEfultL1u7ihpJFiYwII8hhp22z2qzd+rfX5bMjNc1J15c/57676nH37TEB27ParhVYVbdZmfnLnyxYsZ1xbzwW8JY0q+o2ulhhAIoVCaNNs9ps2H4gYJsli4dz+NipzOdHjp2iRGThgO16hTktuGSguVKqNhADtBKRBq7XXlJKxbiOTa5zrYHKrqMn8ImnG1gZ4A4BZbI8Lw0c8aZg0cKhFCqQD4B8wQ6a1a3Ann/iiSpycYC27W3V2Ln/OADFIy6er1utFDYRTp49j7eUjopg/bb9nE9KQSnF0nW7qVwuyuvyV0MpRb+h06hSvgR9ujQPyNa1sGsVVtRtVn5fvZOxUxcyeVQPQvMFB2TLqrpNvJBMgmvmP/FCMovX/EX1ioFPBNStUZa9/5zgwOF4UlLTmLVgA62b3BSwXc+Y00VVBudcT4Nch7vB9g7AFFe51UC4iLitSCtnUdcClUWkPHAY6Aw87E3BEkXD+PjlTthtgs0mzF68nfmrd/Pj+12JLFwAEdga9y/Pf/gzAB2a1qBb+5txOtO5kJxK92Hf+eRobK1y3N08hhZdR+Gw27mxSike63irb3/tZazZvI+Zv66lRqWSNOliTB6//vTdtGxUM0fa7f7aF6xYv4f/Tp+jZttBDOzZhkc7BFYHYG7d9h48mZUb4zh5+hx1OgzmpSdbM2bKQlJS03jwuY+N+9Usy6gBD/pl36q6PXEyga4DJgDGhMu9d9WjRUOv59uyxeGwM2rAA9zbbxxOp6JL+wamBE6PCGD3egIhUkTWZXk+Xik1PtOUMRm5HqgEjFNKrRGRp4DhIjIYWAQMVEolk/2w19FsXbVydlJE2gCjMZaJTFJKDXd3vS2spLIim8gJnU1EZxNxobOJ1GP9+nUBGbYVKqVCbu7j1bVJv7+2XinlcWBURMKB2cAzwH/Av0AwMB7Yq5R6S0TmAm8rpZa7yiwCBiil1mdn19J1cEqpX4BfrLyHRqO51pi/VUspdVpEFgOtlFLvuU4ni8gXwIuu5z4Pe13/eVyNRpP7MGcWtZir5YaI5AfuAP7KGFcTo9vREdjmKjIHeMw1m9oAOKOUyrZ7Cjl0q5ZGo8nhmNOCiwYmu8bhbMA3SqmfReR3ESmGMdq3Cejtuv4XoA0QB5wHunm6gQ5wGo3GN7xonXmDUmoLUOcq5686ha2MCQPvBv9c6ACn0Wh8Jwdsw/IGHeA0Go2P6HxwfhFTpSTLF75hut2i7UebbhPg6Kx+ptu0aumJ3aLPo1VLJKxYzgEWLZexOBFEjiSXZBPJUQFOo9HkAnQ+OI1Gk3fRXVSNRpOX0ZMMGo0mz6LH4DQaTZ5EdBdVo9HkZXQLznySklNp13s0KSlppDnTad88hoE923pVNiTIztx3HiAkyI7dZmPOij2MnL6KMf1aUqdyFALEHTlNnw/nk5iUSpliYfzvuTuJLJSfU+eS6PXePI78d87jffqPuKj8tHjqK5nnJ367lC++X4bdbuOOW2vweh+vs7dfwfiZi5k6ZxVKKR5p35BenW/329blOJ3ptHj8XaKLFWbGB709F7gKzw6bxoKV24mMCGOpS/1qzqKNvDfxV3bvP8a8iS8QU/0Gv320Uv3KKnWxPKWqhXXZaczGsgAnIpOAdsBxpVQtM2yGBDv4YVw/CoaGkJrmpE3PD2nRsAY331jeY9nkVCcdXv2OxKRUQ6lr1AMsXP83r01YQsIFIxvwsCeb0KNdDKO/W8tb3Zvw9aKdfP37DhrfVIbBXW+j9wfzPN7ngTb16XZvY/oNvaj8tGL9HuYv38qiKS8TEuwg/lSC33Wwc+8Rps5ZxbyJLxDssPNg/09o2agmFcoU99tmVj6buZgq5aIyEzT6Q+e2t9D9/ib0zaJ+Va1iNJPe7s5L78wM2Ecr1a+sUheDvKOqZWQszx0BzsqO9JdAKzMNiggFQ0MAI7V0WprTp4pOTEoFIMhhI8huQykygxtA/mBHZp77qmWKsnTzPwAs23KQ1g0qeHWPhjFXqlNN/mE5fR+5g5Bg4/ckMiLMa58vZ8/+Y8TWLEtovmAcDju31qnEXJOUlA4fO8VvK7bzSIeGAdm5mvpVlXIlqFTWnEy+VqpfWakuZjbXTVVLBLF5d1xvLAtwSqmlwEmz7Tqd6TR9ZCTVWr1C0/rVqFernNdlbTZh6Zgu7J7ai8Wb/mH97n8BGPvsnez6qieVSxdh/M9G+vftf5/g7kaVAWjXsBKFQkOICMvnl8/7/jnBms17adPjAzr1GcOmnf7n469WMZpVm/Zy8kwi55NSWLhqB0eO+S+Vl5XXPpzFG307YMslA8iQO5TFIK+pahmNDW+O6811/yRnVdWKjz/h8Xq73caSqQPZ+tNQNm4/wM69Xsk8AC6lrn7TqPn459StUoLqZYsC0Pej36jedQK7D56kU+MqALw+aSmNapViyUddaHRjaQ7HJ+D0M3ttmtPJmYQLzB3fn8F9OtDz9S/91nmtUq4EzzxyB/f3G0fn/p9Qs1IpHCbsw5q/fBuRRQoGNDZ2rcktymKQx1S10AHOa7KqakVGeq9nWjgslEaxlVi0aqfP9zybmMzyrYdoUbdc5rn0dMWsZbtof6vRavv3ZCKPjfiZps9OY9gU4xf37PmUq5nzSHTxcNo0vQkRoU6NsthE+O90ol+2ALq0b8iiyQOY88mzRBQKpXwZ33Rgr8aazfuYt3QbMR2H0GPQFyxbt5teQyYHbNcqcpOyGOQ9VS0d4Cwg/lQCZxIMtawLSSks+XOX1wpNRQvlp1ABY/wuX7CdZjE3EHf4FOWjL34gWtWvwO5DRq+6SKF8mb+G/e+/mWkLtvvtd6vGN7J8/R4A9v5znNQ0J0XDC/ht78RJY5Li0L8nmbt4M/e0jPXbVgaD+7Rn289D2fTDm0wY1o3G9arw2ZtdA7ZrBblNWSzPqWqJD8d1JlctEzkWf5Y+b03FmZ5OerqiY4s63HWbdxO0JYoU4OP+d11U6lq2m/lr9/HrOw8SFhqMCGz7+wQvjPsdgNtuLMPgro1QClZuO8RLn/zh1X2eGnJR+alux8G82L01D7VrQP8R02n2yNsEBTn4aFCXgH7dnnh1IqfOJOJw2Bn54v1XDOhfb3oN/pKVG4w6iGn/Oi892YaIQqG8+sF3/Hf6HF1e+IxaVUoxc/TTftm3Sv0KrFEXy2uqWkLOaJ15g2WqWiIyA2gGRALHgCFKqYnuytSNraeWrzJf9VynS7LObppF+ZKC7Nb4a8UX0yoFsJyqquUoWkEVajPMq2tPTe3ilaqWVVjWglNKPWSVbY1Gc33JLS24XNVF1Wg0OYAcMr7mDblqkkGj0eQMzJhFFZF8IvKniGwWke0i8qbrfHkRWSMie0RkpogEu86HuJ7HuV4v58lPHeA0Go1PZEwymLBMJBlorpSqDcQArVx6p+8AHyqlKgOngO6u67sDp5RSlYAPXde5RQc4jUbjM2Zs1VIGGRksglyHApoD37nOT8YQfwbo4HqO6/UW4iGK6gCn0Wh8Q3zqokZm7FRyHT0vMSViF5FNwHFgAbAXOK2USnNdcggo5XpcCjgI4Hr9DFDUnas5apJBMPaLms3xH5413SZAuV6BZ8a4nJ1j7zPdJkDBEGve6gspTkvs2kKsSYltxWqZdKukxSzALE99mEWNd7dMRCnlBGJEJByYDVS/2mUZt3Xz2lXRLTiNRuMzZm/VUkqdBhYDDYBwEcn4RS4NZGw4PwSUcd3fARTGQ0IPHeA0Go1PmDXJICLFXC03RCQ/cAewE/gDyOjKdAV+dD2e43qO6/XflYedCjmqi6rRaHIJ5nT1o4HJImLHaGx9o5T6WUR2AF+LyDBgI5CxA2oi8JWIxGG03Dp7uoEOcBqNxjcEbLbAO39KqS1Anauc3wfUv8r5JOB+X+6hA5xGo/EZvVVLo9HkXXJHfMt9Ac4qFSF/lapCHDa+f/kOQoJs2G025q7/h/d/3EaZyAJ83OtWIgqEsPXASfp9vppUZzoli4TyUfcGFAoNxibC299v4vetR93e48ixU/QfMZ0T/53FZhMevrshT9zflA8nzWPGz6szc8u91KMtzf1MwxOIYtkV/h4/xfPDp3PipOHvQ3c35In7mtLnjcnsO3gcgLPnLlCoYH5+nfiSX/eIO3CMJwd9mfn8wOF4Xu7Zht4BKIyZWQeXY5USmlVqXZ74f9+CE5EywBSgBJAOjFdKfRSITatUhAJRqkpOS+eB937nfHIaDrswe+Ad/LH1KD3vrMaEBbuY8+c/jHy0Hg81rsCUxXE8264mP639hymL46gcXYivnmtKg5d/cnsPu93GoKfbc2PVMpw7n0S7Jz/gtpurAtD9/qb0eijwL0sgimWX47DbGNSnPbWqGP7e3eMDGteryrg3LibQHDbuR8IK+KdxAVCpbBSLv3oZMD4XN979Om2b1vbbHphbB1mxWgnNbLUuT+SUbL3eYOUykTTgBaVUdYy1LX1EJKAsf1apCAWqVHU+2Vh07bBfVOtqVC2KuesOAvDtyr+5q05p42IFBfMHAVAoNIhjpy94tB8VWZgbqxoqUgVD81GpbBTHTBYXkQAVy7JSvGhhalW56G/FslH8m8VfpRRz/9hE+zvqBu44sHTdLsqViqRMdJGA7JhZB1mxUgntevH/PmW5UuqoUmqD63ECxvqWUu5LuccqFaFAlapsIvw2pBVbPuzE0h3/sv9EAmfOp2QmQjx68gIlIgxRlPfnbOWeBuVY924HpjzbjEHT1/vk68GjJ9m+5xAxNQwVqSmzl3HX46N4ceSMzHTu/hKIYpk7f3dk8Rfgzy37iCxSkPKlA9eSAJi9YAP33Bl42nawpg6sVEKzQq3Lq/vmEtnAazIG50prUgdYc5XXegI9Acrc4F7RySoVoaxKVQVCQ3xWqkpXijvfnEeh/EFM7NuYytFXCn9kuN7xlrJ8u+JvPvvtL2IrFmXMkw1pPvgXvEmsnHg+md6vf8HgZzoRViAfj3RsRL+udyIC7038laHjfuS9gf7nGc1QLDuTcJ7HBnzOzr1HqF6xpN/2Es8n89Tgi/5mMGfhBtq3MKf1lpKaxvxl2xj01N2m2DO7DiDwz5c75k54nuhihTlxMoH7nhlL5XJR3Fqnkim23ZETWmfeYPlOBhEpCHwPPKeUOnv561lVtYp5UNWyUkXIDKWqsxdSWbnrOHUrFKVwaHBmyunoIvkzu6Kdb6vIT2sNQen1e/8jJMhOkYIhHm2npjnp/foXdGwZS+umhrBIsSJh2O02bDYbD7VryOad//js89UIRLHsEn8Hf0HHO2JplUUIJS3NyfxlW2h3+xXLn/xi0aod3FS1NMWLFjLFXgZm1EFWrFBCA2vUujzi22b764qlAU5EgjCC2zSl1KxA7VmpIuSvUlWRgiEUco2p5Quy07h6FHFHz7Jy1zHa1jPGoe6/tTy/bToEwOGTidxWw1ACqxRdiJAgG/8lJLu9h1KKAe98TaWyUfR4sFnm+WPxF7vn85dtoWp5/ydbAlEsu5q/L7v8fTKLvwDL1++mwg1RRBcP99vXrMz6bQOdTOqemlkHl2OFEppVal2eEIyekzfH9cbKWVTB2FqxUyn1gRk2rVQR8lepKio8P6O7N8Amgs0GP639h4VbjrD7yBk+7tWIAR1vYvvBU8xYtg+At2Zu5N2u9enRsipKQf9JV/Tar2Dd1r+ZNX8d1SpE0/qJdwFjScicRRvYsecIIlC6RBFGvOjTIu9LCESx7Kr+/ubyt7vh74Aebbm9QQ1++n0j7VuY03o7n5TCkj//4v2BD5piz8w6uBwrlNCsUuvyTM5onXmDlapatwHLgK0Yy0QAXlVK/ZJdmdjYemrFmnWm+5Ka5p8ivSd0uiRISErzfJEfFLAsXVLuUdWyInVY44Y3syFAVa18Jaqosl3/59W1u0e1yrOqWsvJNeudNRqN1+SQ7qc35LqdDBqN5vpiVWJaK9ABTqPR+IxuwWk0mjxLbplk0AFOo9H4hh6D8w/F1XcrBIpVb4YVM54Vuk8z3SbA8aldPV/kByEOa5ZSWjS5b8m0l1Wfr5waQwQxJeHltSBHBTiNRpM70C04jUaTZ9FjcBqNJm+Si8bgckdHWqPR5BiMvaimyAaWEZE/RGSniGwXkWdd598QkcMissl1tMlS5hURiRORXSJylydfdQtOo9H4jEktuIykuBtEJAxYLyILXK99qJR679J7Sg0MqcCaQElgoYhUUUo5s7uBDnAajcZnzNjJoJQ6Chx1PU4QEU9JcTsAXyulkoG/Xfqo9YFV2RXIVQHu0LFTPP3GVxz77yw2Ebp2akTvzs1Msf3pjD+YOmcVIkL1itGMGdSFfCFBPtnIThwG4IvvlzJl1nLsdhvNG9bg1afau7UVEmTnp8GtCHbYcdiFn9Yc4J3vN/Fpn8bElI8k1ZnOhr3xvDBxJWlOY03FiMfqc0dMaS6kpPHMp8vZsv+kz/XgdKbT4vF3iS5WmBkf9Pa5PED/EdNZsGI7kREFWTz1FcBIyDltzqpM7YBXerWlxa01/bIP1om4gDl1cDlnEs7z3IgZ/LXvKILw0aCHA9Z6AGt89Yj4NMkQKSJZM2iMV0qNv8LkpUlxGwF9ReQxYB1GK+8URvBbnaXYITxkCbcyXVI+YCkQ4rrPd0qpIYHYdNhtDH22E7WrlSEhMYnmj42iWf2qAYvOHD1+mgnfLGH5jFfJny+Y7q9NYvaCDTzU7haf7GQnDhN/MoEFy7cx74sBhAQ7iD+V4NFWcqqTTsPmk+gSs5k7pA0LNx/muxX76D1uGQDj+zbh0dur8MXCXdwRU4oKJQpR//lZxFYqxrtPNOSuwXN9rovPZi6mSrmozDxj/vBAm/p0u7cx/YZOveR8zweb8dTDzf22m4HVIi5m1MHlvPrhLJo3qM4Xb3cnJTWNC0kppti1wldPZOSD85J4T9lELk+KKyKfAEMxlsYOBd4HnuDqSwPdrpi0cpIhGWiulKoNxACtRKRBIAZLRBamdjUjiWRYgXxUKV/CFE0GMPJpJSWnkpbm5EJSKiWK+Z4hNjtxmKk/ruDpLi0ICTZ+TyIjwryyl+gSswnKFLNRLNx0OPP1DXvjiS5i5BVrHXsD3yzbC8D6uBMUDg0mKjy/T/4fPnaK31Zs55EODX0qdzkNYyoRYUK+s+ywUsTFrDrISkLiBVZvjOOR9obN4CAHhcMCrx8rfPUO7yYYvGnlXS0prlLqmFLKqZRKByZwUeX+EFAmS/HSwBF39q0UnVFKqXOup0Guw7T16f8c+Y8tuw4RW7Os54s9EF08nKe7NCem4xBqtRtEoQL5uP2W6gHZzCoO8/fBE/y5ZR8den3IA8+M9Tq1uE2EP0a0Z+ennVm89Qgb9sZnvuawCw/cVpHfNxsBLzoilMMnEzNfP3IykegI375Er304izf6dsAm1nwsJn2/jOaPjaT/iOmcPuu/QI6VIi5W1MH+w/9RNKIgzwydxu2PvcNzw6eTeMF9FmdvsPr9cocZGX2zS4orIlm7ZJ2Aba7Hc4DOIhIiIuWBysCf7u5hdcpyu4hsAo4DC5RSVxWdEZF1IrIuPv6EV3bPnU+m68CJjHj+HgoV9K2VcjVOnz3PvKVbWT9rCFt/Hsb5pBS+/XWt3/YuF4dJc6ZzJuECP3z6HK8+dTdPD5ns1Za0dKW4/dU53NT3W+pWjKRa6Ytpvt/t1pCVfx1j9S5DSPlqHyZffk3mL99GZJGCxFR3L/zjL107NWL1N6+z8MsBFC9aiDfH/uC3rawiLp37f2KaiItVdeB0prNl1yG63XMbf0x5mdD8IYyZsjAgm1a/X24RY5LBm8MDjYBHgeaXLQkZJSJbRWQLcDvQH0AptR34BtgBzAP6uJtBBYsnGVw3jxGRcGC2iNRSSm277JrxwHiAurH1PH4nU9OcdH35c+67qx533x5jip9L1u7ihpJFM7uObZvVZu3Wv7m/9c0+27qaOEx0sXBaNbkJESGmRllsNuHkmUSvxXrPnk9hxc5/aVG7FH8dOs1L99SmaKF8PP/h75nXHDl5nlJFCmQ+L1mkAP+e8r6VtGbzPuYt3cbClTtITk4lITGJXkMm89mb5uxhLVbkYpf/kfYNefSlK8aZfaJL+4Z0cXX5hn/ykykaD1bVQXTxcEoWCyfWJUF4d/MYxkxZ4L7QdfLVGzLWwQWKm6S42Wb9VkoNB4Z7e49r0rZVSp0GFgOtArRDv6HTqFK+BH26BD5YnUHpqAjWb9vP+aQUlFIsXbfbL7GR7MRh7mxci5Ub9gCw7+BxUlOdFClcIBsrBkXDQigUGgwYYjZNapVkz5EzPNKsMrffVIqe/1tyyYb0eesP8kDjigDEVirG2QspXolKZzC4T3u2/TyUTT+8yYRh3Whcr4qpX5asAjm/LNkS8MSQFSIuVtVBVNFClIwKJ+7AMQCWrd1F1fIlcqSv3pJbVLWsnEUtBqQqpU6LSH7gDuCdQGyu2byPmb+upUalkjTpMhKA15++m5aN/F9uABBbqxx3N4+hRddROOx2bqxSisc63uqznezEYR5ocwsvjfyall3fIchh5/1XH/b45keFhzL2qduw2wSbCD+u3s9vGw/x71ePcTD+HL++2RaAuWsP8N7szSzYdIg7Ykqx9sN7uJDspN9ny32vCJN4ashkVm6M4+Tpc9TtOJgXu7dm5cY4tu85jAiUKVGUUQMeCOgeVoi4WMnbL9xH7yFTSE11UrZUUcYM6nK9XQqIHBC7vMJK0ZmbgMmAHaOl+I1S6i13ZerG1lMrVvs/9pUdVomCnE9x2/33i9yWLinFIkEfu0Upsa2wm27Rd8gKgZzbTBCdCStTTcW+MMmra5f0b5RnRWe2YCzc02g0eYlctNk+V+1k0Gg01x8j4WXuiHA6wGk0Gp+xovtsBTrAaTQan8kl8U0HOI1G4xvi22b760q2AU5E3G7GVEqdNd8djUaTG8glQ3BuW3DbMXb7ZP1TMp4rwJI9IpapKVlAofy+pVPyhhPTrFnOUeSeTy2xu/er7pbYDQ81v27BxM3QWbBqSYtVy5vMINdPMiilymT3mkaj+f+LYMyk5ga82qolIp1F5FXX49IiEvi+GI1Gk2uxiXfH9cZjgBORsRg7+h91nToPWNPf0Wg0OR8v97n4uVEAACAASURBVKHmhIkIb2ZRb1VK1RWRjQBKqZMiEmyxXxqNJgeTA2KXV3gT4FJFxIZrfFZEigLWbEDUaDQ5HiFvLfQdh5FSuJiIvAk8ALxpqVcajSZHk+tnUTNQSk0RkfUY6Y4A7r88aeW1Iik5lXa9R5OSkkaaM532zWMY2LNtwHbjDhzjyUFfZj4/cDiel3u2oXeASk0LV+7glfe/w5mezqMdbqX/43cG6GngymIhQXbmDu9AiMOO3W5jzqp9jPx6LWP6NKNOxWKICHFHTtPnf7+TmJRGt7tq8GTrWjjTFYlJqTz38RJ2HTrl9h7JKal0fnYsKSlpOJ3ptGpam+e6tWLgqK/ZuusgCihfuhijBj5EgfwhfteFFYpSdToOoWBoCHabDbvdxqLJAwK2mdPV4HzFm3TkOQVvdzLYgVSMbqpPSTJFxI4h/XVYKdXON/cuJSTYwQ/j+lEwNITUNCdten5Ii4Y1ApZfq1Q2isVfvQwYX5ob736dtk1rB2TT6UznpVHfMHtsX0pGhdO867u0bnJjwIkeA1UWS0510mHwHBKT0nDYbfw6oiMLN/zDa5NWkHAhFYBh3W6lR5sbGT1rI98t3cMX83cA0Prmcgzrdiv3D3Wv1hUc5GDqB09TIL/xPj34zP9oeks1XuvTkbAC+QAYPu5Hvpq9nN4Pt/C7LqxSlPrh435eZ1v2hpyuBucPuaWL6s0s6mvADAwl6dLAdBF5xYd7PAvs9M+9K3yhYKjxi5+a5iQtzWn6TM3SdbsoVyqSMtFFArKzfvt+KpSJpFzpSIKDHNzTsi6/mKD8ZIayWGLSlWpdGcENIH+wI1MzIuv50BDvfg9FJLNllpbmJM3pRJDM4KaUIiklNaBWwPVTlPKdnK4G5w/i5XG98eYT+wgQq5Q6DyAiw4H1wNueCopIaaAtRg715wPwMxOnM53mXUfx96ETPHFfE+q58tybxewFG7jnzsCX+R09cYZSURGZz0u60qKbib/KYjabsPi9+yhfojATf93G+j2GcM3YvrfTMvYGdh08xaAvVmZe/2TrmjzdvjbBDjvtB8/x6h5OZzoden3AgcPxPNKxETE1DB8HvDODxWt2UqlslEfxa3dkKEqdOx+4OlVWBLiv3zgEoyvZtVMjU+1bpQaXPySIZvWrBawG5y05YQmIN3jT3TzApYHQAezz0v5oYABuZl19VdWy220smTqQrT8NZeP2A+zc61YW0SdSUtOYv2wb7ZsHLmZztUzJZn4mAlEWS09XNHn+W2o+OYW6lYtT/Qajtdp37B9U7z6F3YdO0em2ipnXf/7rduo+NZ03pqzmxfu9C/52u42fP3+RFd8OYfNf/7Dr76MAjHr5IVZ9+waVbohi7h+bfPI7AysVpeZOeJ4/przMzNFPMem7pazcGGea7ZyuBuctxixq4At9RaSMiPwhIjtFZLuIPOs6X0REFojIHtf/Ea7zIiJjRCRORLaISF1PvmYb4ETkQxH5AGNh73YR+VxEJgBbAY8ilCLSDjiulFrv7jql1HilVD2lVL3IyGKezGZSOCyURrGVWLTKlN4vAItW7eCmqqUpXjTwZn7J4uEcPnZxMP7IsVOUiCwcsF0wT1ns7PkUlm87Qos6F3flpacrZq2Io33DCldc//3yPbStX86nexQqmJ8GMZVY+udfmefsdhttb49h3lL/uuwZilIxHYfQY9AXLFu3m15DJvtl63KiixnvUbEiYbRpVpsN2w+YYtdqNbgghz1TDc5yxDvJQC9mWtOAF5RS1YEGQB8RqQEMBBYppSoDi1zPAVpjaKFWBnoCn3i6gbsW3DaMDfdzgTeAVcBq4C3g9+yLZdIIaC8i+4GvMbQPp3pRLlviTyVwJsGQwruQlMKSP3f5pX6VHbN+20AnE7qnAHVrlGXvPyc4cDielNQ0Zi3YQOsmNwVsN1BlsaKF8l1U6wq206x2aeIOn6Z8iYtBvVW9cuw+ZPyGVYi+GJTvii3L3qOex47+O32Os+cMRa+k5BRWrN9NhTLF2X/4RObfsGjVDircUNxn/8E6RanEC8mZExaJF5JZvOYvqlcMbCIAcr4anD+YsZNBKXVUKbXB9TgBY6y+FNABQ88F1/8dXY87AFNcovKrgfDLRKKvwN1m+4ne/KFuyr8CvAIgIs2AF5VSjwRi81j8Wfq8NRVnejrp6YqOLepw1221AjGZyfmkFJb8+RfvD3zQFHsOh51RAx7g3n7jcDoVXdo3MOXLEqiyWImIUD7u1xy7zYbNJsxeEcf89Qf4dXhHwkKDERG2/R3PC58tBaBHm1o0vak0ac50Tp9L5ukxnn/bTvx3lpdGzsh8n9o2q83tDarzYL+xnDufhFJQvWJJ3up/n/8VYQEnTibQdcAEwBi8v/euerRoWCNguzldDc5XMrqoptoUKYeh4bIGiFJKHQUjCIpIxi9hKeBglmKHXOeOZmvXk6qWiFTEmCSoAeTLOK+UquKD880wApzbZSJ1Y+up5avMH0OwSvXIDDX1y7FK5UynSzKwonatWvNqRbqkJrfWD1hVK7JCTXX3iK+9uvbLh246AMRnOTXeJfaeiYgUBJYAw5VSs0TktFIqPMvrp5RSESIyF3jbJRiNiCwCBrgbBvNmFvVLYBjwHkYfuBs+btVSSi3GEH7WaDR5AB8iZLw72UARCcLYKTVNKTXLdfqYiES7Wm/RwHHX+UNA1jRupQG3s4zeNEFClVLzAZRSe5VSgzCyi2g0mv+HiBhJPr053NsRASYCO5VSH2R5aQ6QMajaFfgxy/nHXLOpDYAzGV3Z7PCmBZfscmSviPQGDgP+jQ5rNJo8gUnr4BphpGHbKiIZa4ZeBUYC34hId+Af4H7Xa78AbYA4jNUd3TzdwJsA1x8oCPTDGIsrDDzh/d+g0WjyGmbEN9dYWnaWrtjDp4wB6j6+3MObzfZrXA8TuJj0UqPR/D9FkFyzF9WdqtZs3Ew6KaXuscQjjUaTs8kj2UTGXjMvXAjW5Jmy5Yhtv9eXE9/1ssRusQb9LLF7YvUYS+xapYBlBTm5lZRb9qK6W+i76Fo6otFocgcC2HN7gNNoNJrsyC0NYR3gNBqNz+S5ACciIUopc5NvaTSaXIeRsjx3RDhvMvrWF5GtwB7X89oi8j/LPdNoNDmWPCP8DIwB2gH/ASilNqO3amk0/6/JEJ7xdFxvvOmi2pRSBy5rkjot8sctfd+aamRzjQhj1czXTLVthQJWTlTVcseZhPM8N2IGf+07iiB8NOhhrwR9QoIdzB3/HCFBDuwOO3MWbWTk+F/4ZfxzFHTpMERGhLFh+34eeWlCZrk6NW5gwaQXeeLVScz53bfsvlaoSVlVt1a+Z1Yoi3lCAEdOiF5e4E2AOygi9QHlUsh6BtjtjXFXsssEjICY5i6rgDc81K4BPR5oSu8hUwIxcwVWKGDlVFUtd7z64SyaN6jOF293JyU1jQtJKV6VS05Jo8NTY0i8kGIodX3+PAtX7qBNz9GZ10x+58lLRHdsNuGNvh34fbXvGZmtUpOyqm6tfM+sUhbzRC6Jb151UZ/CEIy5ATiGkVr4KR/ucbtSKibQ4AbQqG4lIgqFBmrmCqxQwMrJqlpXIyHxAqs3xvFIe0OlKjjIQeEw7+s68YIRDIMcdoIc9kvy2hUMDaFJvSqX/P09H2zKT39s5sSpBL/8tUJNyqq6tcru9VIWEzG2anlzXG88Bjil1HGlVGelVKTr6KyUivdULjdxNQWsQD+AVti8HDMVmvYf/o+iEQV5Zug0bn/sHZ4bPp3EC95PmttswtJpA9n920gWr/mL9Vm0DNo2q82StbsyWxnRxQrTrlltJn2/zC9fs6pJ1Wo3iEIF8pmuJmVm3VplN0NZzCbmJ171RG4Zg/NmFnWCiIy//PDSvgJ+E5H1ItIzG/uZqlonvFDVsgIrFLBysqrW1XA609my6xDd7rmNP6a8TGj+EMZMWeh1+fR0RZMuI6nZdhB1a5a9JD37fXfF8v38i0lXRzx/L2/870fS/cxYa7WalNl1a4VdK5XFvCG3zKJ6MwaX9VOeD+jEpXnR3dFIKXXElVN9gYj8pZRamvUCV/ri8QCxsfWsydftASsUsHKDqlZWoouHU7JYOLEundm7m8cwZsoCn+2cPXeB5ev30KJhDXbuPUpE4QLUrVHu0smF6jcwcbiRyqtIeEFa3lqTNGe61134rGpSQKaa1P2tb/bZ38uxom6tsJuhLLZw5Q6Sk1NJSEyi15DJpojveELIPXt6vUmXNDPrcxH5CvDqk6+UOuL6/7grO0l9YKn7UteerApY0cXDmbVgAxOGPp7jbIJ1Ck1RRQtRMiqcuAPHqFQ2imVrd1G1fAmvyhYNL0hqmpOz5y6QLySIZvWr8pGr9dexRR3mL99Gckpa5vUxHd/IfDxuyCPMX7bNp/HJrGpS+UOCWLpuNzHVyngu6AGr6tYKu4P7tGdwH0M4e/n6PYybtuiaBDcAckjrzBv82apVHvA4gCAiBTCWmCS4Ht+JITnoN91f+4IV6/fw3+lz1Gw7iIE92/Boh8BVhKxQwMqpqlruePuF++g9ZAqpqU7KlirKmEFdvCpXIrIQH7/x6EWlroUbmL98GwD33BnL6Mm/BexbVqxSk7Kqbq18z64Xkksy9HijqnWKi3nhbMBJYKBS6hsP5SoAs11PHcB0pdRwd2ViY+upFWvWeeN3nsUqVS0rFJpAp0uyEis+Crc1vDlgVa3SVW9UfT/5watrX2lRab0ZKyj8xW0LzqXFUBtDhwEgXXn5DVRK7XOV1Wg0eYzc8jvhdhbVFcxmK6WcruO6TAJoNJqchRnK9tcCbxbQ/CkidS33RKPR5AoM2UDvDs+2ZJKIHBeRbVnOvSEih0Vkk+tok+W1V0QkTkR2ichdnuy702RwKKXSgNuAHiKyF0jEmCVWSikd9DSa/6eYuEvhSwx5hMv3X36olHov6wkRqQF0BmoCJYGFIlJFKZXt3nh3Y3B/AnWBjn44rdFo8iiCeWNwSqmlIlLOy8s7AF+78lL+LSJxGEvPVmVXwF2AE5cDe728uUaj+X/CNRhe6ysijwHrgBeUUqeAUsDqLNcccp3LFncBrpiIPJ/di0qpD3xw1isUkJqWbrZZy96MnLCZ2FusWh5xePlozxf5QbF7xlli9+QPfU23adUSnJw7pSe+KNVFikjWtV/jXbuX3PEJMBQjJAwF3scQm7/aTd3WkrsAZ8dQtM8932KNRmM5gk+Nhnhf18EppY5l3ktkAvCz6+khIOuWldLAEXe23AW4o0qpgHYeaDSaPIiAw8KFcCISrZQ66nraCciYYZ0DTBeRDzAmGSpjzBVki8cxOI1Go8mKjy0497ZEZgDNMLqyh4AhQDMRicHofu4HegEopbaLyDfADiAN6ONuBhXcB7gWAXuv0WjyJGaNPyulHrrK6Ylurh8OuN3ymRV3yvYnvTWi0Wj+f5Fb5te08LNGo/EJwbstUDmBXBfgxs9czNQ5q1BK8Uj7hvTqHLiCYdyBYzw56MvM5wcOx/Nyzzb0DtB2UnIq7XqPJiUljTRnOu2bxzCwZ9scZxPMVX56fsR0Fq7cQWREQX7/aiAA2/YcYuC735KckorDbmfEC/dRp4b7rFshQXbmjriXkCA7drswZ+VeRs5Yw5i+zalTKQoRiDtymj4fLSQxKZWHmlfjrcdv4+h/5wCY8MsWvlqw47rUQVbM/Hw9O2waC1ZuJzIijKXTXgHg1JlEerz+JQePnqRMdBE+H9aNcAu0SzKR3LNEytIAJyLhwOdALYwBwyeUUtmuOvbEzr1HmDpnFfMmvkCww86D/T+hZaOaVChTPCA/K5WNYvFXLwNG6u4b736dtk0DT4QSEuzgh3H9KBgaQmqakzY9P6RFwxpeSfFdS5tgrvLTA21uodu9jXl22LTMc8M//onnu91F84Y1WLRqB8M/nsN3Y59xayc51UmH12eTmJRqqHWNvJeF6/fz2sRlJFxIBWDYE7fRo+1NjP7eSIk+e/keBoxf4rPPYJ36lZmfr85tb6H7/U3o+9bUzHNjvlpIk3pV6PdYS8ZMWcCYrxYwuE+HgHx2h7GTIXcEOKtbmh8B85RS1TBSJ/muEZeFPfuPEVuzLKH5gnE47NxapxJzTVCqysrSdbsoVyqSMtFFArYlIhQMDQGMlNVpac6AMyxYYRPMVX5qEFPxihaECCScN0RnEs5dIMrL9O2JSUYgC7LbCLLbUJAZ3ADyBztMy6FnlfpVVgL9fDWsU+mKup23bCsPtqkPwINt6vPr0q0B++kJ8fK43ljWghORQkAT4HEApVQK4J3QZjZUqxjNiM9+5uSZRPKFBLFw1Q5iqpkrujF7wQbuuTPWNHtOZzrNu47i70MneOK+JtRzaR7kNJtZsUJR6s1+nXj4+U8ZOm4OKl3x46fPelXOZhMWv/8g5aMLM/GXrazfbawBHduvBS1jy7Hr4EkGTVqeef3dDStya82SxB05zWsTl3E4/pxf/lqlqmX25wvgxMmEzB+MqMjCxPspxegLuaQBZ2kLrgJwAvhCRDaKyOeu1OWXkFVVK/6Ee1WtKuVK8Mwjd3B/v3F07v8JNSuVwuFNThYvSUlNY/6ybbRvbp7YiN1uY8nUgWz9aSgbtx9g5163C6+vm80MrFKUmvLDCt7o14l1s95gyDMdeeHtr70ql56uaNL/a2p2/4K6VaKofoPR8uk7ZhHVu01i98GTdGpcGYB5a/dTu8eX3PbsDJZsPsjHz97hl69W1YEVn6/rg3e54HJLPjh/cWBkI/lEKVUHI9XSwMsvUkqNV0rVU0rViyxWzKPRLu0bsmjyAOZ88iwRhUIpX8ZzGW9ZtGoHN1UtTfGigYsIX07hsFAaxVZi0aqAeumW2rRKUQrg21/X0qbpTYCh2LVp5wEPJS7lbGIKy7cepkXdiy2q9HTFrOV7aN+wIgCnEpJIce1lnvzbdmIq+j42a2UdWPX5KlYkjGPxRlf6WPyZTLUxq8iYRfXmuN5Y6cMh4JBSao3r+XcYAS8gTpw0mt+H/j3J3MWbuaelec39Wb9toJOJ3Yf4UwmcSTgPwIWkFJb8uYvK5aJynE2wTlEqg6jIQqzaGAcYKlDlS3v+YSpaKB+FCgQDkC/YTrPaZYg7fIryJS6O37W6uTy7DxnyjFERF8emWtcvz65Dp/AFq+vA7M9XBnfdVouZvxg7lmb+8ietGt9o+j0uJ7co21s2BqeU+ldEDopIVaXULoydEd7P2WfDE69O5NSZRBwOOyNfvN+06fDzSSks+fMv3h/4oCn2AI7Fn6XPW1NxpqeTnq7o2KIOd91WK8fZBHOVn54eMplVm/Zy8vQ5YjsN4cXurXl3QGcGfzSLNGc6+YIdjBrguZ5LRBTg4+daYrcZX5bZK/Ywf91+fn37XsLyByMibNsfzwuf/AFAr3a1aVW/PE6n4tS5JPp85L1wtdl1cDlmfb56Df6SFRviOHn6HLXbv86AJ9vQ77GW9HjtC6b9tJrSURF87tKctQwhR3Q/vcGjqlZAxo39ZJ8DwcA+oJsrr9NVqRtbTy1Z4XbvrJ9+mG4SyD1T5WBdHVxIcbsV0G9KPfCJJXb/v6dLatqofsCqWpVq1lajps/z6tp7Y0rmXFWtQFFKbQKu2x+n0WisIbe04HLdTgaNRnP9yR3hTQc4jUbjIwLYdQtOo9HkVXJJfNMBTqPR+IoguaSTqgOcRqPxGd2C8xOH3fyas2oljBXLA2xW5bq3qA6smk07MauPJXaLPvSF6TaPftXVdJtWYcZ3wdjJkDsiXI4LcBqNJocjugWn0WjyMLllkbsOcBqNxieMhJfX2wvvyAkb/jUaTS5DvPzn0Y7IJBE5LiLbspwrIiILRGSP6/8I13kRkTEiEiciW0TEY/IOHeA0Go3PiHh3eMGXQKvLzg0EFimlKgOLuJhmrTWG2HNloCfgccNyruqiWiUKkoHTmU6Lx98lulhhZnzQO2B7VgjkANTpOISCoSHYbTbsdhuLJg8I2KaZYjb9R0xn4YrtREYU5I+phjBKr9e/ZO8/xwE4e+4ChQrmZ2EAfgci5BISZOfnIW0ICbLjsAlz1uxn5Hcb+axvU2IqRJLmTGdD3An6f76CNOfFacc6FSL5bVg7un+0mDlr9ru9x3PDp7PAVQdLXOIw74yfy7xlW7HZbESGF+SjQV0oUcy71O3u7L459kcWLN9GUJCdcqUiGf3awxQOs1B0BkxbB6eUWioi5S473QFDDBpgMrAYeNl1fooyMoSsFpFwEYlWSh3Nzr6VKcurAjOznKoADFZKjfbXplWiIBl8NnMxVcpFkZCYFLAtqwRyMvjh434UDS9oii0wV8zmwTb1DdGZoReFUT4b+njm4zf/N5uwAoFlyg1EyCU51UnHob+SmJyGwy78+mY7Fm46xLfL99JrrCFYM+GZZjzavCpfLPgLMAbVhzxcj983H/bqHg+2qc8T9zXmmSziME93acHLrh+Nz79ZwgdfzPMqbZQnu01vrsprvdvhcNgZOm4OY6Ys5PU+7X2y6ws+jsFFisi6LM/HK6XGeygTlRG0lFJHRSTjS1MKOJjlukOuc9kGOMu6qEqpXUqpGKVUDBALnAdmB2LTSlGQw8dO8duK7TzSoaEp9q6FQI6ZmClm0yCmEhHZ5OlTSjHn9010bBlw7tNM/BFySUxOAwwxG4ddUMDCTYcyX9+w9wQli1zMsN+zVXV++vMAJ8569+N3NXGYsAL5Mh+fT0rxa63F1ew2u6UaDocdgNhaZTl64rTPdn3Cy2SXrpnW+IyM3a7DU3Bze+ernHO7su9ajcG1APYqpXzLU+0Gs0VBXvtwFm/07YBNzKmSahWjjaSPZxI5n5TCwlU7OHLMnA+eAPf1G0fzx0YxefYKU2yC0RJq+shIqrV6hab1q5kuZgOwZvNeikWEmdaSBf+EXGwiLBnZgV3jH2bx1iOsj7uoB+KwCw80rsiizUbAi44Ipe3NZTNbc4Hw9qc/U7fjEL6fv44BT7YJ2N7lzPh5Dc0bVDfd7uVYrKp1TESiAVz/H3edPwSUyXJdacCtIMm1CnCdgRlXe+ES0Zl496IzGZgtCjJ/+TYiixQkprp5Cl1WCuTMnfA8f0x5mZmjn2LSd0tZ6UoFHihWitlk8MOCDaa23vwVcklXiqYDf6TW0zOpW7EY1UuHZ7723hO3smrnMVb/ZSh4jeh6C29OX0e6CdsAXundjg0/vMm9d9Vj0vdLA7aXldFf/obDbuPeu6xNwZihi2phyvI5QMb2kK7Aj1nOP+aaTW0AnHE3/gbXIMCJSDDQHvj2aq9fIjoT6TlPvxWiIGs272Pe0m3EdBxCj0FfsGzdbnoNmRywXasEcqJdA9PFioTRplltNmw3rWEMWCOQA5CW5uSXJZtp38K8ABeokMvZ8yms2HGUFjGlARhwbwxFC+Xjta/WZF4TUyGSz59txqb/3U/7W8rx7hMNaVMvsB/DTi1jmfvH5oBsZGXmL3+yYMV2xr3x2DVJRmlWC05EZgCrgKoickhEugMjgZYisgdo6XoO8AtGZvA4YALwtCf712IWtTWwQSl1LFBDVomCDO7TnsGuQdnl6/cwbtoiPnsz8P2FJ04mUKxIWKZAzi8Tng/YZuKFZNLTFWEF8pF4IZnFa/7ixe6Xz7L7TvypBIIcdgqHhWaK2fR7rGXAdrOybN1uKpWNomTxcM8Xe4k/Qi5Fw/KR6kzn7PkU8gXZaXpjST6as5VHb69C89ql6Dh03iV7Nuv0u/jbPPapxvy24SC/rPvHZ1/3HTye2TWfv3wblcoGLhYE8PvqnYydupDZ4/oRmi/YFJseMSmGKqUeyualFle5VgE+bVK+FgHuIbLpnvqKlaIgVmCFQM6Jkwl0HTABgDRnOvfeVY8WDWsEbNdMMZunhkxm1UZDGCW242Be6N6ah+9uyI8LN9DxDvNab/4KuURF5Ofjp5oYYjY24YdVf/PbhoMcn/Y4B+PPMX9oOwB+/vMA787a5JdvvQdPZqWrDup0GMxLT7Zm0aodxB04js0mlC5RhFEDHjDF7pgpC0lJTePB5z4GILZmWZ9nZ30lt2zVslp0JhRjWreCUsrjdGfd2Hpqxeq1pvuhs4lYl2I62aVDajZBFmSVASje5UvTbeambCLNGt3Cxg2Bic5Uv7GOmvLjYq+urV8xPE+LzpwHilp5D41Gcx3IHQ243LWTQaPRXH+MCYTcEeF0gNNoNL6h88FpNJq8TC6JbzrAaTQaXxEt/KzRaPIuuSS+5bwAlzU9jVnYrVp6YYFZyzRnLFoqY9Uyo3RlTUUcn/a46TatWHoC8O/Ux803akK1BrjP9JqS4wKcRqPJBeSSCKcDnEaj8Rm9TESj0eRZ9BicRqPJm+h1cBqNJi+ju6gajSZPIugWnGVYoVRlpqJUVgJRfsoOq5TFzKyD50dMZ+HKHURGFOT3rwzFt217DjHw3W9JTknFYbcz4oX7qFPDt3Tzzw6bxoKV24mMCGOpS1FqzqKNvDfxV3bvP8a8iS8EnJX5TMJ5nhsxg7/2HUUQPhr0sNfCO9dCreta1IE35JL4Zm2AE5H+wJMYwhBbgW5KKb8lq6xSqjJTUSorgSg/ZYdVymJm1sEDbW4xVLWGTcs8N/zjn3i+2100b1iDRat2MPzjOXw39hmf7HZuewvd729C3yyKUtUqRjPp7e689M5MNyW959UPZ9G8QXW+eLs7KalpXEhK8brstVDruhZ14BW5JMJZlrJcREoB/YB6SqlagB1Dm8FvrFKqMlNRKjv8UX66GlYpi5mrqlXxiuSeIpBw3vhtSzh3gahI3/RA4eqKUlXKlTAtM25C4gVWb4zjkfaGslpwkMNnfdHrodZlZh14i8WaDOb5abF9B5BfRBxAKB4UcDxhpVKV1YpS/ig/ecJsZTEr6+DNfp0YNm4O9e55g6Hj5vBK73am2TaL/Yf/o2hEQZ4ZOo3bmKUbXwAAEGJJREFUH3uH54ZPJ/FCsk82rpda17XGYlUt07BSF/Uw8B7wD4Yw6xml1G+XX3eJqtYJ96paVipVWako5a/ykzvMVhYDa+tgyg8reKNfJ9bNeoMhz3Tkhbe/Ns22WTid6WzZdYhu99zGH1NeJjR/CGOmLPTJxvVS67rm5JIIZ2UXNQLoAJQHSgIFROSRy6+7RFWrmGfVKauUqjKwQlEqUOWny7FCWSwrVtTBt7+upU3TmwC4u3kMm3aaqwRmBtHFwylZLJxYV8v17uYxbNl10H2hbLheal3XgoyEl97882hLZL+IbBWRTSKyznWuiIgsEJE9rv8j/PXVyi7qHcDfSqkTSqlUYBZwa6BGT5xMAMhUqrqnZeDdvvhTCZxJOA+QqShVuZx5Yxr+KD9lh1XKYlbXQVRkIVa59FuXr99D+dLm/jCZQVTRQpSMCifugNHCWrZ2F1XLl/C6fNGwfBQKNVStMtS6dh85k6nW1WPM4ivUumKeMY45a/bz0qRVfql1XXNcC329ObzkdqVUTBbthoHAIqVUZWCR67lfWDmL+g/QwCU8cwFDBmxdoEatUKoyU1HqcvxVfsoOq5TFzKyDp4dMNsZKT58jttMQXuzemncHdGbwR7NIc6aTL9jhl+pTr8FfsnKDoSgV0/51XnqyDRGFQnn1g+/47/Q5urzwGbWqlGLmaI9ymdny9gv30XvIFFJTnZQtVZQxg7p4XfZaqHVdizrwBot7nx2AZq7Hk4HFwMv+GLJaVetN4EEgDdgIPKmUynbUtm5sPbVkxZ+m+2FVuiQrxk6s8tWqtzkp1WmJXbPGVi/HiurNTemSmjaqz8b1galq3RgTq35csMKraysWz38AiM9yarxSanzGExH5GziFsZTsM6XUeBE5rZQKz3LNKaWUX91Uq1W1hgBDrLyHRqO59vjQ/Yz3IBvYSCl1RESKAwtExNQpZauXiWg0mjyGtxOo3sRApdQR1//HgdlAfeCYiEQDuP4/7q+vOsBpNBrfMSHCiUgBEQnLeAzcCWwD5gAZatpdgR/9dTPX7UXVaDTXH5OyiUQBs107ZhzAdKXUPBFZC3wjIt0xJivv9/cGOsBpNBqfMWMXllJqH3DF5myl1H8Yqy4CRgc4jUbjG2KdOJLZ5LgA57CbX3NWaTiKBUsvnOnWrOewavlJkEXLOazavmS3wN//ZnQz3SZAkQ7/M91mcpzf4/WXkTsiXI4LcBqNJmejE15qNJo8TS6JbzrAaTQa39EtOI1Gk2exalzbbHSA02g0PpM7wpsOcBqNxkd8TIV0XclVAc4qRSmAhSt38Mr73+FMT+fRDrfS//E7c6SvVih1gbV1a5YS2nPDp7NgxXYiIwqyxKUo9c74ucxbthWbzUZkeEE+GtSFEsV813sA6+ogELshQXbmvn0vIUF27HYbc1bEMXLGGsY804I6lYojIsQdPk2fjxaQmJTK8O6NaXyjkWQzf4iDYoVDKffwZwH/DZeTW3RRrU6X9CzQA6NFO0EpNdrd9XVj66kVq9dm+/q/8Wc4Fn/2EkWpr97t4VFRytN4gdOZTr1732L22L6UjAqnedd3+XzY4x7tuqs7f331ZR1chlLX/IkveBSz8bQOzl9/s8rfXY2de4/Qa/DkS5TQ3h3wgEcltKutg1u1MY4CoSE889bUzACXkJhEWIF8AHz+zRJ27//Xba65YEf26+D8rQNP+Gs3Yx1cgXxBJCal4rDb+HXkfbzy+VJ2/XOShAuG4tewJxoTf+Y8o79ff0n5Hm1v4qaKxXhmzKLMc8kr3iP9zD8BRaeYurFqwdI1ni8EiocFrfeQTcRSrExZXgsjuNXH2I7RTkQqB2LTKkWp9dv3U6FMJOVKRxIc5OCelnX5JUC1Lqt8zYpZSl1gnb9mKqFdTVEqI7iBkVw0kL6TVXUQqN3EpFTAWFQd5LChlMoMbgD5Q+xXzfd3X5OqfL90d2DOZ0MukWSwNJtIdWC1Uuq8UioNWAJ0Msu4mYpSR0+coVTUxXx6JaMiTA1GZqtfZWCFUheY66+VSmgZvP3pz9TtOITv569jwJNtTLFp1Xvmj12bTVg6+iF2f/Ukizf9w/rdRkr1sf3uYNeUJ6lcqgjjf958SZkyxcK4IaoQS7ccuprJAPFOMjCvywZuA5qISFFX2vI2QJnLL7pEVSvevapWBmYrSl2tq2nWe2OF+hVYo9QF5vtrpRJaBq/0bseGH97k3rvqMen7pQHbs+o989duerqiyXMzqPnEJOpWLkH1G4wWe98xC6n++ER2HzpJp8aXdo7uaVyFOSvjSLdg61/GTgYTNRksw0rZwJ3AO8ACYB6wGSN1+eXXXVTVivQsRGKFolTJ4uEcPnYq8/mRY6co4Ycw8eVYqX5ltlIXWOev1UpoGXRqGcvcPzZ7vtANVtWBGXbPJqawfNshWtS92PpLT1fMWraH9rdWuuTa/2vv3IOtrqo4/vly4crloaSkKJA8RJJhFHAwkvHGKDEwJZqaI6M1FFqQmWTB9DCrmWZitGkaJ7VITJuUUh4zjmX4KAMMH3CBAHmDICoBU/nKBsTVH3sfONxB7rn89i/u78f6zJy5v/M7+3z3Or9z7jp777PXWlc0ns3chesz2VwGck14aWazzGyYmTUC/wQ2ZtTLpaLUsEFnsnn7bra9uoe9+95j3pNNjGs8N5NmXrZWSFmpC/K1N49KaBW2vHIweHzB4tWZKrzndQ2y6J5yYgMndo6VuurrGHVebza9+m/6nn7wC3jsBX3ZsOPgF/RZPbvRrfMJvLBuZ5oXcBiKMoLLdZuIpFPNbJekjwBXAB/PopdXRan27eu4ffrVXPm1u9i/37h2/AjO6Z/tl7O8bIX0lbogX3tTVUKbfNsD/G15qCg19LLbmHb9OJ5e8hKbtu2iXTvRq8fJ3D796qO2M69rkEW3x8mduHvqmFCpS2L+4o0sWLqVx2dcRdeGeiSxeutuvnHPMweec2XjQOYtyufHhQq+TQSQtAg4BdgH3GJmTx+pfUvbRDLYkVwTjrxN5GgpWrqklraJHC15pUs60jaRtkYu6ZISbBMZ2orqdyc11B3TbSJ5V9W6KE99x3H+/3i6JMdxSk1Rpqju4BzHaTVFGcEVZ0HCcZw2Q6pIBkljJa2XtEnSt1Lb6Q7OcZzWk6Yuah1wFzAOGARMkDQopZnu4BzHaRWCVKFaFwCbzGyLme0FfgdcltLWNrUGt7xp2Z5O9e221dC0O7AnBxNct1i2Fk23LdiaObi2qWnZgoYO6l5j846Sllbdn2lmM+NxT+CVqsd2AB/Lal81bcrBmVlNMTySluaxt8Z1i2Vr0XSLZOuRMLOxiaQON8RLugHSp6iO4xwrdnBoAo5ewGspO3AH5zjOseJFYICkvpLqgWuAR1N20KamqK1gZstNXLcNabpufpp56uaKmb0n6avAAqAOuM/M1qTsI9dYVMdxnGOJT1Edxykt7uAcxykthXNweYR2SLpP0i5Jq1PoRc3ekv4iaa2kNbHCWArdjpJekLQy6v4whW6Vfp2k5ZIeS6j5sqRVklY02xOVRbObpDmS1sVrnCnXYNQcGG2s3N6UNDWRvV+P79dqSbMldWz5WTXp3hw116SytVSYWWFuhIXIzUA/oJ6QBn1QAt1GYBiwOqGtpwPD4nFXYEMiWwV0iccdgOeBEQntvgV4CHgsoebLQPfEn4UHgOvjcT3QLYfP2k7gzARaPYGtQEO8/zAwMYHuYELtk06EHwyfAgakvA5FvxVtBJdLaIeZLSSkVE+Gmb1uZk3x+C1gLeGDnlXXzOzteLdDvCX5pUhSL+BTwL0p9PJC0omEL6VZAGa218zSluqCS4DNZlZLZE0ttAcaJLUnOKQU+71yrVxXBorm4A4X2pHZaeSNpD7AUMJoK4VenaQVwC7gSTNLogv8DJgOvJ9Ir4IBT0haJulLCfT6AbuBX8fp9L2SOifQreYaYHYKITN7FfgJsB14HXjDzJ5IIF1T5brjmaI5uNxDO1IjqQswF5hqZm+m0DSz/WY2hLDz+4JYZDsTkj4N7DKzZS02bj0jzWwYIWvEjZIaM+q1Jywp3GNmQ4F3gGSpduKm0/HAI4n0PkSYafQFzgA6S7ouq67VWLnueKZoDi730I6USOpAcG4Pmtm81PpxWvYMkCI2cCQwXtLLhKn/xZJ+m0AXM3st/t0FzCcsNWRhB7CjauQ6h+DwUjEOaDKzfyTSGw1sNbPdZrYPmAdcmELYEleuKxtFc3C5h3akQqHSzSxgrZn9NKHuhyV1i8cNhH+edVl1zezbZtbLzPoQruufzSzzKENSZ0ldK8fAGMLUKoutO4FXJA2Mpy4BXspk6KFMINH0NLIdGCGpU/xcXEJYk82MpFPj30rlupR2F55ChWpZTqEdkmYDo4DuknYA3zezWRllRwKfA1bF9TKA75jZHzPqng48EJMFtgMeNrNkWzpy4DRgfqxs1h54yMz+lED3JuDB+EW3BfhCAk3iWtYngS+n0AMws+clzQGaCFPI5aQLr5orqVK57kYz+1dLTzie8FAtx3FKS9GmqI7jODXjDs5xnNLiDs5xnNLiDs5xnNLiDs5xnNLiDq5ASNofs1yslvRI3NJwtFqjKhlDJI0/UmaWmLnjK0fRxw8kfbPW883a3C/pqlb01SdlNhinHLiDKxbvmtkQMxsM7AUmVz+oQKvfUzN71MxmHKFJN6DVDs5xjjXu4IrLIuCsOHJZK+luwkbS3pLGSFoiqSmO9LrAgVx66yQtJux6J56fKOnn8fg0SfNjvrmVki4EZgD94+jxjthumqQXJf29OiedpO8q5Ot7ChhIC0i6IeqslDS32ah0tKRFkjbEWNlKooE7qvpOtiHXKR/u4ApITLkzDlgVTw0EflMVeH4rMDoGuC8FbokJFn8FXApcBPT4APk7gb+a2XmE+M41hED2zXH0OE3SGGAAIaZ0CHC+pEZJ5xPCvIYSHOjwGl7OPDMbHvtbC0yqeqwP8AlCCqdfxNcwiZCNY3jUv0FS3xr6cY5DChWq5dBQFfa1iBDregawzcyei+dHAIOAZ2N4VD2wBPgoIeB7I0AMpD9c6qKLgc9DyFoCvBGzYVQzJt6Wx/tdCA6vKzDfzP4T+6glTniwpB8RpsFdCGF4FR42s/eBjZK2xNcwBji3an3upNj3hhr6co4z3MEVi3djmqQDRCf2TvUpQo64Cc3aDSFdaikBPzazXzbrY+pR9HE/cLmZrZQ0kRATXKG5lsW+bzKzakdYybnnOIfgU9Ty8RwwUtJZEILHJZ1NyDjSV1L/2G7CBzz/aWBKfG6dQvbctwijswoLgC9Wre31jFktFgKfkdQQM4hcWoO9XYHXY2qpa5s99llJ7aLN/YD1se8psT2Szlb6ZJdOSfARXMkws91xJDRb0gnx9K1mtkEhm+4fJO0BFhNy+jfnZmCmpEnAfmCKmS2R9GzchvF4XIc7B1gSR5BvA9eZWZOk3wMrgG2EaXRLfI+Q6XgbYU2x2pGuJ6ThPg2YbGb/lXQvYW2uSaHz3cDltV0d53jDs4k4jlNafIrqOE5pcQfnOE5pcQfnOE5pcQfnOE5pcQfnOE5pcQfnOE5pcQfnOE5p+R9ayFB7LGdZ4gAAAABJRU5ErkJggg==\n",
305 |       "text/plain": [
306 |        "<Figure size 432x288 with 2 Axes>"
307 |       ]
308 |      },
309 |      "metadata": {
310 |       "needs_background": "light"
311 |      },
312 |      "output_type": "display_data"
313 |     },
314 |     {
315 |      "name": "stdout",
316 |      "output_type": "stream",
317 |      "text": [
318 |       "0 comp.windows.x\n",
319 |       "1 misc.forsale\n",
320 |       "2 rec.autos\n",
321 |       "3 rec.motorcycles\n",
322 |       "4 rec.sport.baseball\n",
323 |       "5 rec.sport.hockey\n",
324 |       "6 sci.crypt\n",
325 |       "7 sci.med\n",
326 |       "8 sci.space\n",
327 |       "9 talk.politics.mideast\n"
328 |      ]
329 |     }
330 |    ],
331 |    "source": [
332 |     "from sklearn.metrics import plot_confusion_matrix\n",
333 |     "import matplotlib.pyplot as plt\n",
334 |     "\n",
335 |     "classifier = clf.fit(vectors_train, newsgroups_train.target)\n",
336 |     "\n",
337 |     "disp = plot_confusion_matrix(classifier, vectors_test, \n",
338 |     "                             newsgroups_test.target,\n",
339 |     "                             values_format=\"0.0f\",\n",
340 |     "                             cmap=plt.cm.Blues)\n",
341 |     "    \n",
342 |     "print(disp.confusion_matrix)\n",
343 |     "\n",
344 |     "plt.show()\n",
345 |     "for i, category in enumerate(newsgroups_train.target_names):\n",
346 |     "    print(i, category)"
347 |    ]
348 |   },
349 |   {
350 |    "cell_type": "markdown",
351 |    "metadata": {},
352 |    "source": [
353 |     "# Unsupervised Approach\n",
354 |     "\n",
355 |     "First, let's prepare the data:"
356 |    ]
357 |   },
358 |   {
359 |    "cell_type": "code",
360 |    "execution_count": 51,
361 |    "metadata": {},
362 |    "outputs": [
363 |     {
364 |      "name": "stdout",
365 |      "output_type": "stream",
366 |      "text": [
367 |       "Data:\n",
368 |       "9850 posts in 10 categories\n",
369 |       "\n",
370 |       "Labels: \n",
371 |       "[2, 6, 1, 9, 0, 5, 1, 2, 9, 0]\n",
372 |       "Assumed number of clusters: 10\n"
373 |      ]
374 |     }
375 |    ],
376 |    "source": [
377 |     "import random\n",
378 |     "random.seed(42)\n",
379 |     "\n",
380 |     "all_news = list(zip(newsgroups_train.data, newsgroups_train.target))\n",
381 |     "all_news += list(zip(newsgroups_test.data, newsgroups_test.target))\n",
382 |     "random.shuffle(all_news)\n",
383 |     "\n",
384 |     "all_news_data = [text for (text, label) in all_news]\n",
385 |     "all_news_labels = [label for (text, label) in all_news]\n",
386 |     "\n",
387 |     "print(\"Data:\")\n",
388 |     "print(str(len(all_news_data)) + \" posts in \"\n",
389 |     "      + str(np.unique(all_news_labels).shape[0]) + \" categories\\n\")\n",
390 |     "\n",
391 |     "print(\"Labels: \")\n",
392 |     "print(all_news_labels[:10])\n",
393 |     "num_clusters = np.unique(all_news_labels).shape[0]\n",
394 |     "print(\"Assumed number of clusters: \" + str(num_clusters))"
395 |    ]
396 |   },
397 |   {
398 |    "cell_type": "markdown",
399 |    "metadata": {},
400 |    "source": [
401 |     "Since the original dimensionality of the data is prohibitively large to allow for efficient clustering, let's reduce its dimensionality using [`Singular Value Decomposition`](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html#sklearn.decomposition.TruncatedSVD):"
402 |    ]
403 |   },
404 |   {
405 |    "cell_type": "code",
406 |    "execution_count": 46,
407 |    "metadata": {},
408 |    "outputs": [],
409 |    "source": [
410 |     "from sklearn.decomposition import TruncatedSVD\n",
411 |     "from sklearn.pipeline import make_pipeline\n",
412 |     "from sklearn.preprocessing import Normalizer\n",
413 |     "\n",
414 |     "vectorizer = TfidfVectorizer(min_df=2, max_df=0.5,\n",
415 |     "                             stop_words='english',\n",
416 |     "                             use_idf=True)\n",
417 |     "\n",
418 |     "def transform(data, vectorizer, dimensions):\n",
419 |     "    trans_data = vectorizer.fit_transform(data)\n",
420 |     "    print(\"Transformed data contains: \" + str(trans_data.shape[0]) +\n",
421 |     "          \" with \" + str(trans_data.shape[1]) + \" features =>\")\n",
422 |     "\n",
423 |     "    #See more examples here:\n",
424 |     "    #https://scikit-learn.org/stable/auto_examples/text/plot_document_clustering.html#sphx-glr-auto-examples-text-plot-document-clustering-py\n",
425 |     "    svd = TruncatedSVD(dimensions)\n",
426 |     "    pipe = make_pipeline(svd, Normalizer(copy=False))\n",
427 |     "    reduced_data = pipe.fit_transform(trans_data)\n",
428 |     "\n",
429 |     "    return reduced_data, svd"
430 |    ]
431 |   },
432 |   {
433 |    "cell_type": "code",
434 |    "execution_count": 47,
435 |    "metadata": {},
436 |    "outputs": [
437 |     {
438 |      "name": "stdout",
439 |      "output_type": "stream",
440 |      "text": [
441 |       "Transformed data contains: 9850 with 33976 features =>\n",
442 |       "Reduced data contains: 9850 with 300 features\n"
443 |      ]
444 |     }
445 |    ],
446 |    "source": [
447 |     "reduced_data, svd = transform(all_news_data, vectorizer, 300)\n",
448 |     "print(\"Reduced data contains: \" + str(reduced_data.shape[0]) +\n",
449 |     "        \" with \" + str(reduced_data.shape[1]) + \" features\")"
450 |    ]
451 |   },
452 |   {
453 |    "cell_type": "markdown",
454 |    "metadata": {},
455 |    "source": [
456 |     "Now, let's cluster the data using [`KMeans`](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html) algorithm:"
457 |    ]
458 |   },
459 |   {
460 |    "cell_type": "code",
461 |    "execution_count": 70,
462 |    "metadata": {},
463 |    "outputs": [],
464 |    "source": [
465 |     "from sklearn.cluster import KMeans\n",
466 |     "\n",
467 |     "def cluster(data, num_clusters):\n",
468 |     "    km = KMeans(n_clusters=num_clusters, init='k-means++', \n",
469 |     "                max_iter=100, random_state=0)\n",
470 |     "    km.fit(data)\n",
471 |     "    return km\n",
472 |     "\n",
473 |     "km = cluster(reduced_data, num_clusters)"
474 |    ]
475 |   },
476 |   {
477 |    "cell_type": "markdown",
478 |    "metadata": {},
479 |    "source": [
480 |     "And finally evaluate:"
481 |    ]
482 |   },
483 |   {
484 |    "cell_type": "code",
485 |    "execution_count": 75,
486 |    "metadata": {},
487 |    "outputs": [],
488 |    "source": [
489 |     "def evaluate(km, labels, svd):\n",
490 |     "    print(\"Clustering report:\\n\")\n",
491 |     "    print(f\"* Homogeneity: {str(metrics.homogeneity_score(labels, km.labels_))}\")\n",
492 |     "    print(f\"* Completeness: {str(metrics.completeness_score(labels, km.labels_))}\")\n",
493 |     "    print(f\"* V-measure: {str(metrics.v_measure_score(labels, km.labels_))}\")\n",
494 |     "\n",
495 |     "    print(\"\\nMost discriminative words per cluster:\")\n",
496 |     "    original_space_centroids = svd.inverse_transform(km.cluster_centers_)\n",
497 |     "    order_centroids = original_space_centroids.argsort()[:, ::-1]\n",
498 |     "\n",
499 |     "    terms = vectorizer.get_feature_names()\n",
500 |     "    for i in range(num_clusters):\n",
501 |     "        print(\"Cluster \" + str(i) + \": \")\n",
502 |     "        cl_terms = \"\"\n",
503 |     "        for ind in order_centroids[i, :50]:\n",
504 |     "            cl_terms += terms[ind] + \" \"\n",
505 |     "        print(cl_terms + \"\\n\")"
506 |    ]
507 |   },
508 |   {
509 |    "cell_type": "code",
510 |    "execution_count": 76,
511 |    "metadata": {},
512 |    "outputs": [
513 |     {
514 |      "name": "stdout",
515 |      "output_type": "stream",
516 |      "text": [
517 |       "Clustering report:\n",
518 |       "\n",
519 |       "* Homogeneity: 0.4905834160659784\n",
520 |       "* Completeness: 0.5545553250427578\n",
521 |       "* V-measure: 0.5206115419058042\n",
522 |       "\n",
523 |       "Most discriminative words per cluster:\n",
524 |       "Cluster 0: \n",
525 |       "key chip clipper encryption government keys nsa escrow algorithm use des people secure security phone public law crypto don privacy secret encrypted just data enforcement think bit know phones chips message number used using like agencies make wiretap way scheme trust rsa time private court serial fbi does police right \n",
526 |       "\n",
527 |       "Cluster 1: \n",
528 |       "doctor disease medical msg patients pain don cause cancer know treatment food people like patient gordon drug body good geb candida yeast time skepticism effects banks pitt diet n3jxp dsl cadre chastity intellect health shameful medicine surrender edu ve just does blood effect soon years think use drugs chronic kidney \n",
529 |       "\n",
530 |       "Cluster 2: \n",
531 |       "game team games year hockey players season play think don win baseball good player teams league like time nhl espn just did hit better fans best know really series played years night detroit playing great pens playoffs boston toronto runs goal won going ll pitching 10 leafs braves way lot \n",
532 |       "\n",
533 |       "Cluster 3: \n",
534 |       "just don like think know people does good right say time make did edu sure way new ll want got use ve really going better com years let actually read probably thought things need used article come maybe dod didn stuff heard believe day problem little thing long lot oh \n",
535 |       "\n",
536 |       "Cluster 4: \n",
537 |       "window server motif widget use using display application file windows program running code set x11r5 sun problem xterm does run color like x11 mit help version screen error openwindows manager know work files want user way thanks lib include client ve hi widgets just available unix xlib usr colormap source \n",
538 |       "\n",
539 |       "Cluster 5: \n",
540 |       "sale 00 offer shipping condition asking new drive sell interested price email original 10 card excellent used edu 50 best 25 cd mail 20 monitor brand meg includes software obo following box 15 old manuals disk make disks included great like power modem model 40 manual hard ram 30 printer \n",
541 |       "\n",
542 |       "Cluster 6: \n",
543 |       "car bike engine cars just like new miles good don ride ve rear know speed oil road ford think dealer really time right drive riding driving used got make bikes honda tires power way gear problem buy does price little want clutch manual thing wheel turn year work auto need \n",
544 |       "\n",
545 |       "Cluster 7: \n",
546 |       "thanks mail advance know address send list email edu hi info does information com looking like mailing interested help reply appreciate anybody ve tell new use copy post net appreciated don responses just group need good thank hello wondering replies people read time request kind want available internet file contact \n",
547 |       "\n",
548 |       "Cluster 8: \n",
549 |       "space orbit launch nasa shuttle moon earth mission lunar solar satellite spacecraft like hst cost just think program time station use data long mars don know low years sky science satellites project idea dc new sun people sci missions high technology need going power large money work rocket mass commercial \n",
550 |       "\n",
551 |       "Cluster 9: \n",
552 |       "israel jews israeli armenian arab people jewish armenians turkish arabs muslims war said killed muslim state government just palestinian peace genocide palestinians did world like turks armenia turkey right israelis 000 soldiers don population rights land children human anti know greek fact think serbs time soviet say does villages bosnian \n",
553 |       "\n",
554 |       "\n",
555 |       "Categories:\n",
556 |       "* comp.windows.x\n",
557 |       "* misc.forsale\n",
558 |       "* rec.autos\n",
559 |       "* rec.motorcycles\n",
560 |       "* rec.sport.baseball\n",
561 |       "* rec.sport.hockey\n",
562 |       "* sci.crypt\n",
563 |       "* sci.med\n",
564 |       "* sci.space\n",
565 |       "* talk.politics.mideast\n"
566 |      ]
567 |     }
568 |    ],
569 |    "source": [
570 |     "evaluate(km, all_news_labels, svd)\n",
571 |     "\n",
572 |     "print(\"\\nCategories:\")\n",
573 |     "for i, category in enumerate(newsgroups_train.target_names):\n",
574 |     "    print(\"*\", category)"
575 |    ]
576 |   },
577 |   {
578 |    "cell_type": "code",
579 |    "execution_count": null,
580 |    "metadata": {},
581 |    "outputs": [],
582 |    "source": []
583 |   }
584 |  ],
585 |  "metadata": {
586 |   "kernelspec": {
587 |    "display_name": "Python 3",
588 |    "language": "python",
589 |    "name": "python3"
590 |   },
591 |   "language_info": {
592 |    "codemirror_mode": {
593 |     "name": "ipython",
594 |     "version": 3
595 |    },
596 |    "file_extension": ".py",
597 |    "mimetype": "text/x-python",
598 |    "name": "python",
599 |    "nbconvert_exporter": "python",
600 |    "pygments_lexer": "ipython3",
601 |    "version": "3.7.6"
602 |   }
603 |  },
604 |  "nbformat": 4,
605 |  "nbformat_minor": 4
606 | }
607 | 


--------------------------------------------------------------------------------
/On-Friday-board-members-meet-with-senior-managers-to-discuss-future-development-of-the-company.svg:
--------------------------------------------------------------------------------
  1 | <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:lang="en" id="5f6d0197b0f948e981f77c6b3a45d199-0" class="displacy" width="2675" height="487.0" direction="ltr" style="max-width: none; height: 487.0px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr">
  2 | <text class="displacy-token" fill="currentColor" text-anchor="middle" y="397.0">
  3 |     <tspan class="displacy-word" fill="currentColor" x="50">On</tspan>
  4 |     <tspan class="displacy-tag" dy="2em" fill="currentColor" x="50">ADP</tspan>
  5 | </text>
  6 | 
  7 | <text class="displacy-token" fill="currentColor" text-anchor="middle" y="397.0">
  8 |     <tspan class="displacy-word" fill="currentColor" x="225">Friday,</tspan>
  9 |     <tspan class="displacy-tag" dy="2em" fill="currentColor" x="225">PROPN</tspan>
 10 | </text>
 11 | 
 12 | <text class="displacy-token" fill="currentColor" text-anchor="middle" y="397.0">
 13 |     <tspan class="displacy-word" fill="currentColor" x="400">board</tspan>
 14 |     <tspan class="displacy-tag" dy="2em" fill="currentColor" x="400">NOUN</tspan>
 15 | </text>
 16 | 
 17 | <text class="displacy-token" fill="currentColor" text-anchor="middle" y="397.0">
 18 |     <tspan class="displacy-word" fill="currentColor" x="575">members</tspan>
 19 |     <tspan class="displacy-tag" dy="2em" fill="currentColor" x="575">NOUN</tspan>
 20 | </text>
 21 | 
 22 | <text class="displacy-token" fill="currentColor" text-anchor="middle" y="397.0">
 23 |     <tspan class="displacy-word" fill="currentColor" x="750">meet</tspan>
 24 |     <tspan class="displacy-tag" dy="2em" fill="currentColor" x="750">VERB</tspan>
 25 | </text>
 26 | 
 27 | <text class="displacy-token" fill="currentColor" text-anchor="middle" y="397.0">
 28 |     <tspan class="displacy-word" fill="currentColor" x="925">with</tspan>
 29 |     <tspan class="displacy-tag" dy="2em" fill="currentColor" x="925">ADP</tspan>
 30 | </text>
 31 | 
 32 | <text class="displacy-token" fill="currentColor" text-anchor="middle" y="397.0">
 33 |     <tspan class="displacy-word" fill="currentColor" x="1100">senior</tspan>
 34 |     <tspan class="displacy-tag" dy="2em" fill="currentColor" x="1100">ADJ</tspan>
 35 | </text>
 36 | 
 37 | <text class="displacy-token" fill="currentColor" text-anchor="middle" y="397.0">
 38 |     <tspan class="displacy-word" fill="currentColor" x="1275">managers</tspan>
 39 |     <tspan class="displacy-tag" dy="2em" fill="currentColor" x="1275">NOUN</tspan>
 40 | </text>
 41 | 
 42 | <text class="displacy-token" fill="currentColor" text-anchor="middle" y="397.0">
 43 |     <tspan class="displacy-word" fill="currentColor" x="1450">to</tspan>
 44 |     <tspan class="displacy-tag" dy="2em" fill="currentColor" x="1450">PART</tspan>
 45 | </text>
 46 | 
 47 | <text class="displacy-token" fill="currentColor" text-anchor="middle" y="397.0">
 48 |     <tspan class="displacy-word" fill="currentColor" x="1625">discuss</tspan>
 49 |     <tspan class="displacy-tag" dy="2em" fill="currentColor" x="1625">VERB</tspan>
 50 | </text>
 51 | 
 52 | <text class="displacy-token" fill="currentColor" text-anchor="middle" y="397.0">
 53 |     <tspan class="displacy-word" fill="currentColor" x="1800">future</tspan>
 54 |     <tspan class="displacy-tag" dy="2em" fill="currentColor" x="1800">ADJ</tspan>
 55 | </text>
 56 | 
 57 | <text class="displacy-token" fill="currentColor" text-anchor="middle" y="397.0">
 58 |     <tspan class="displacy-word" fill="currentColor" x="1975">development</tspan>
 59 |     <tspan class="displacy-tag" dy="2em" fill="currentColor" x="1975">NOUN</tspan>
 60 | </text>
 61 | 
 62 | <text class="displacy-token" fill="currentColor" text-anchor="middle" y="397.0">
 63 |     <tspan class="displacy-word" fill="currentColor" x="2150">of</tspan>
 64 |     <tspan class="displacy-tag" dy="2em" fill="currentColor" x="2150">ADP</tspan>
 65 | </text>
 66 | 
 67 | <text class="displacy-token" fill="currentColor" text-anchor="middle" y="397.0">
 68 |     <tspan class="displacy-word" fill="currentColor" x="2325">the</tspan>
 69 |     <tspan class="displacy-tag" dy="2em" fill="currentColor" x="2325">DET</tspan>
 70 | </text>
 71 | 
 72 | <text class="displacy-token" fill="currentColor" text-anchor="middle" y="397.0">
 73 |     <tspan class="displacy-word" fill="currentColor" x="2500">company.</tspan>
 74 |     <tspan class="displacy-tag" dy="2em" fill="currentColor" x="2500">NOUN</tspan>
 75 | </text>
 76 | 
 77 | <g class="displacy-arrow">
 78 |     <path class="displacy-arc" id="arrow-5f6d0197b0f948e981f77c6b3a45d199-0-0" stroke-width="2px" d="M70,352.0 C70,89.5 745.0,89.5 745.0,352.0" fill="none" stroke="currentColor"/>
 79 |     <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
 80 |         <textPath xlink:href="#arrow-5f6d0197b0f948e981f77c6b3a45d199-0-0" class="displacy-label" startOffset="50%" side="left" fill="currentColor" text-anchor="middle">prep</textPath>
 81 |     </text>
 82 |     <path class="displacy-arrowhead" d="M70,354.0 L62,342.0 78,342.0" fill="currentColor"/>
 83 | </g>
 84 | 
 85 | <g class="displacy-arrow">
 86 |     <path class="displacy-arc" id="arrow-5f6d0197b0f948e981f77c6b3a45d199-0-1" stroke-width="2px" d="M70,352.0 C70,264.5 210.0,264.5 210.0,352.0" fill="none" stroke="currentColor"/>
 87 |     <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
 88 |         <textPath xlink:href="#arrow-5f6d0197b0f948e981f77c6b3a45d199-0-1" class="displacy-label" startOffset="50%" side="left" fill="currentColor" text-anchor="middle">pobj</textPath>
 89 |     </text>
 90 |     <path class="displacy-arrowhead" d="M210.0,354.0 L218.0,342.0 202.0,342.0" fill="currentColor"/>
 91 | </g>
 92 | 
 93 | <g class="displacy-arrow">
 94 |     <path class="displacy-arc" id="arrow-5f6d0197b0f948e981f77c6b3a45d199-0-2" stroke-width="2px" d="M420,352.0 C420,264.5 560.0,264.5 560.0,352.0" fill="none" stroke="currentColor"/>
 95 |     <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
 96 |         <textPath xlink:href="#arrow-5f6d0197b0f948e981f77c6b3a45d199-0-2" class="displacy-label" startOffset="50%" side="left" fill="currentColor" text-anchor="middle">compound</textPath>
 97 |     </text>
 98 |     <path class="displacy-arrowhead" d="M420,354.0 L412,342.0 428,342.0" fill="currentColor"/>
 99 | </g>
100 | 
101 | <g class="displacy-arrow">
102 |     <path class="displacy-arc" id="arrow-5f6d0197b0f948e981f77c6b3a45d199-0-3" stroke-width="2px" d="M595,352.0 C595,264.5 735.0,264.5 735.0,352.0" fill="none" stroke="currentColor"/>
103 |     <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
104 |         <textPath xlink:href="#arrow-5f6d0197b0f948e981f77c6b3a45d199-0-3" class="displacy-label" startOffset="50%" side="left" fill="currentColor" text-anchor="middle">nsubj</textPath>
105 |     </text>
106 |     <path class="displacy-arrowhead" d="M595,354.0 L587,342.0 603,342.0" fill="currentColor"/>
107 | </g>
108 | 
109 | <g class="displacy-arrow">
110 |     <path class="displacy-arc" id="arrow-5f6d0197b0f948e981f77c6b3a45d199-0-4" stroke-width="2px" d="M770,352.0 C770,264.5 910.0,264.5 910.0,352.0" fill="none" stroke="currentColor"/>
111 |     <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
112 |         <textPath xlink:href="#arrow-5f6d0197b0f948e981f77c6b3a45d199-0-4" class="displacy-label" startOffset="50%" side="left" fill="currentColor" text-anchor="middle">prep</textPath>
113 |     </text>
114 |     <path class="displacy-arrowhead" d="M910.0,354.0 L918.0,342.0 902.0,342.0" fill="currentColor"/>
115 | </g>
116 | 
117 | <g class="displacy-arrow">
118 |     <path class="displacy-arc" id="arrow-5f6d0197b0f948e981f77c6b3a45d199-0-5" stroke-width="2px" d="M1120,352.0 C1120,264.5 1260.0,264.5 1260.0,352.0" fill="none" stroke="currentColor"/>
119 |     <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
120 |         <textPath xlink:href="#arrow-5f6d0197b0f948e981f77c6b3a45d199-0-5" class="displacy-label" startOffset="50%" side="left" fill="currentColor" text-anchor="middle">amod</textPath>
121 |     </text>
122 |     <path class="displacy-arrowhead" d="M1120,354.0 L1112,342.0 1128,342.0" fill="currentColor"/>
123 | </g>
124 | 
125 | <g class="displacy-arrow">
126 |     <path class="displacy-arc" id="arrow-5f6d0197b0f948e981f77c6b3a45d199-0-6" stroke-width="2px" d="M945,352.0 C945,177.0 1265.0,177.0 1265.0,352.0" fill="none" stroke="currentColor"/>
127 |     <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
128 |         <textPath xlink:href="#arrow-5f6d0197b0f948e981f77c6b3a45d199-0-6" class="displacy-label" startOffset="50%" side="left" fill="currentColor" text-anchor="middle">pobj</textPath>
129 |     </text>
130 |     <path class="displacy-arrowhead" d="M1265.0,354.0 L1273.0,342.0 1257.0,342.0" fill="currentColor"/>
131 | </g>
132 | 
133 | <g class="displacy-arrow">
134 |     <path class="displacy-arc" id="arrow-5f6d0197b0f948e981f77c6b3a45d199-0-7" stroke-width="2px" d="M1470,352.0 C1470,264.5 1610.0,264.5 1610.0,352.0" fill="none" stroke="currentColor"/>
135 |     <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
136 |         <textPath xlink:href="#arrow-5f6d0197b0f948e981f77c6b3a45d199-0-7" class="displacy-label" startOffset="50%" side="left" fill="currentColor" text-anchor="middle">aux</textPath>
137 |     </text>
138 |     <path class="displacy-arrowhead" d="M1470,354.0 L1462,342.0 1478,342.0" fill="currentColor"/>
139 | </g>
140 | 
141 | <g class="displacy-arrow">
142 |     <path class="displacy-arc" id="arrow-5f6d0197b0f948e981f77c6b3a45d199-0-8" stroke-width="2px" d="M770,352.0 C770,2.0 1625.0,2.0 1625.0,352.0" fill="none" stroke="currentColor"/>
143 |     <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
144 |         <textPath xlink:href="#arrow-5f6d0197b0f948e981f77c6b3a45d199-0-8" class="displacy-label" startOffset="50%" side="left" fill="currentColor" text-anchor="middle">advcl</textPath>
145 |     </text>
146 |     <path class="displacy-arrowhead" d="M1625.0,354.0 L1633.0,342.0 1617.0,342.0" fill="currentColor"/>
147 | </g>
148 | 
149 | <g class="displacy-arrow">
150 |     <path class="displacy-arc" id="arrow-5f6d0197b0f948e981f77c6b3a45d199-0-9" stroke-width="2px" d="M1820,352.0 C1820,264.5 1960.0,264.5 1960.0,352.0" fill="none" stroke="currentColor"/>
151 |     <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
152 |         <textPath xlink:href="#arrow-5f6d0197b0f948e981f77c6b3a45d199-0-9" class="displacy-label" startOffset="50%" side="left" fill="currentColor" text-anchor="middle">amod</textPath>
153 |     </text>
154 |     <path class="displacy-arrowhead" d="M1820,354.0 L1812,342.0 1828,342.0" fill="currentColor"/>
155 | </g>
156 | 
157 | <g class="displacy-arrow">
158 |     <path class="displacy-arc" id="arrow-5f6d0197b0f948e981f77c6b3a45d199-0-10" stroke-width="2px" d="M1645,352.0 C1645,177.0 1965.0,177.0 1965.0,352.0" fill="none" stroke="currentColor"/>
159 |     <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
160 |         <textPath xlink:href="#arrow-5f6d0197b0f948e981f77c6b3a45d199-0-10" class="displacy-label" startOffset="50%" side="left" fill="currentColor" text-anchor="middle">dobj</textPath>
161 |     </text>
162 |     <path class="displacy-arrowhead" d="M1965.0,354.0 L1973.0,342.0 1957.0,342.0" fill="currentColor"/>
163 | </g>
164 | 
165 | <g class="displacy-arrow">
166 |     <path class="displacy-arc" id="arrow-5f6d0197b0f948e981f77c6b3a45d199-0-11" stroke-width="2px" d="M1995,352.0 C1995,264.5 2135.0,264.5 2135.0,352.0" fill="none" stroke="currentColor"/>
167 |     <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
168 |         <textPath xlink:href="#arrow-5f6d0197b0f948e981f77c6b3a45d199-0-11" class="displacy-label" startOffset="50%" side="left" fill="currentColor" text-anchor="middle">prep</textPath>
169 |     </text>
170 |     <path class="displacy-arrowhead" d="M2135.0,354.0 L2143.0,342.0 2127.0,342.0" fill="currentColor"/>
171 | </g>
172 | 
173 | <g class="displacy-arrow">
174 |     <path class="displacy-arc" id="arrow-5f6d0197b0f948e981f77c6b3a45d199-0-12" stroke-width="2px" d="M2345,352.0 C2345,264.5 2485.0,264.5 2485.0,352.0" fill="none" stroke="currentColor"/>
175 |     <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
176 |         <textPath xlink:href="#arrow-5f6d0197b0f948e981f77c6b3a45d199-0-12" class="displacy-label" startOffset="50%" side="left" fill="currentColor" text-anchor="middle">det</textPath>
177 |     </text>
178 |     <path class="displacy-arrowhead" d="M2345,354.0 L2337,342.0 2353,342.0" fill="currentColor"/>
179 | </g>
180 | 
181 | <g class="displacy-arrow">
182 |     <path class="displacy-arc" id="arrow-5f6d0197b0f948e981f77c6b3a45d199-0-13" stroke-width="2px" d="M2170,352.0 C2170,177.0 2490.0,177.0 2490.0,352.0" fill="none" stroke="currentColor"/>
183 |     <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
184 |         <textPath xlink:href="#arrow-5f6d0197b0f948e981f77c6b3a45d199-0-13" class="displacy-label" startOffset="50%" side="left" fill="currentColor" text-anchor="middle">pobj</textPath>
185 |     </text>
186 |     <path class="displacy-arrowhead" d="M2490.0,354.0 L2498.0,342.0 2482.0,342.0" fill="currentColor"/>
187 | </g>
188 | </svg>


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Getting Started with Natural Language Processing
 2 | 
 3 | This repository accompanies the book **Getting Started with Natural Language Processing**, which you can get from [Manning](https://www.manning.com/books/getting-started-with-natural-language-processing?query=natural). Use the coupon code "slkochmar" to get a 42% discount there. It is also available on [Amazon](https://www.amazon.com/Getting-Started-Natural-Language-Processing/dp/1617296767).
 4 | 
 5 | [**Here**](https://ekochmar.github.io/nlp-course/) you will also find an **NLP course** that uses this book.
 6 | 
 7 | - [Chapter 1 – Introduction](https://github.com/ekochmar/Getting-Started-with-NLP/blob/master/Chapter1.ipynb)
 8 | - [Chapter 2 – Your first NLP example](https://github.com/ekochmar/Essential-NLP/blob/master/Chapter2.ipynb)
 9 | - [Chapter 3 – Introduction to Information Search](https://github.com/ekochmar/Essential-NLP/blob/master/Chapter3.ipynb)
10 | - [Chapter 4 – Information Extraction](https://github.com/ekochmar/Essential-NLP/blob/master/Chapter4.ipynb)
11 | - [Chapters 5 & 6 – Author Attribution and User Profiling](https://github.com/ekochmar/Essential-NLP/blob/master/Chapters5-6.ipynb)
12 | - [Chapters 7 & 8 – Sentiment Analysis](https://github.com/ekochmar/Essential-NLP/blob/master/Chapters7-8.ipynb)
13 | - [Chapter 9 – Topic Analysis](https://github.com/ekochmar/Essential-NLP/blob/master/Chapter9.ipynb)
14 | - [Chapter 10 – Topic Modeling](https://github.com/ekochmar/Essential-NLP/blob/master/Chapter10.ipynb)
15 | - [Chapter 11 – Named Entity Recognition](https://github.com/ekochmar/Essential-NLP/blob/master/Chapter11.ipynb)
16 | 
17 | ## Installation Instructions
18 | 
19 | To run the notebooks on your machine, check if `Python 3` is installed (all code was written and tested with `Python 3.7`). In addition, you will need the following libraries (notebooks tested with the versions indicated in the brackets):
20 | 
21 | - `NLTK` (v 3.5): check installation instructions for the toolkit at https://www.nltk.org/install.html and the accompanying data at https://www.nltk.org/data.html
22 | - `SpaCy` (v 3.1.3): check installation instructions at https://spacy.io/usage. You will also need to install models (e.g., `en_core_web_sm`, `en_core_web_md`, and `en_core_web_lg`) using the instructions on the website.
23 | - `Gensim` (v 3.8.0): check installation instructions at https://radimrehurek.com/gensim/
24 | - `Matplotlib` (v 3.1.3): check installation instructions at https://matplotlib.org/users/installing.html
25 | - `Scikit-learn` (v 0.22.1): check installation instructions at http://scikit-learn.org/stable/install.html
26 | - `NumPy` (v 1.18.1): check installation instructions at https://www.scipy.org/install.html
27 | - `Pandas` (v 1.0.1) check installation instructions at https://pandas.pydata.org/pandas-docs/stable/getting_started/install.html
28 | 
29 | Alternatively, a number of these libraries can be installed in one go through [Anaconda](https://www.anaconda.com/products/individual) distribution. 
30 | 
31 | For more information on `Jupyter` notebooks, check https://jupyter.org.
32 | 
33 | 
34 | 


--------------------------------------------------------------------------------
/chapter1_1.py:
--------------------------------------------------------------------------------
 1 | document1 = "meeting ... management ... meeting ... management ... meeting ... management ... meeting ... meeting"
 2 | 
 3 | vector = [0, 0]
 4 | 
 5 | for word in document1.split(" "):
 6 |     if word=="management":
 7 |         vector[0] = vector[0] + 1
 8 |     if word=="meeting":
 9 |         vector[1] = vector[1] + 1
10 |         
11 | print (vector)
12 | 


--------------------------------------------------------------------------------
/chapter1_2.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | 
 3 | query = [1, 1]
 4 | document = [3, 5]
 5 | sq_length = 0
 6 | 
 7 | for index in range(0, len(query)):
 8 |     sq_length += math.pow((document[index] - query[index]), 2)
 9 |     
10 |         
11 | print (math.sqrt(sq_length))
12 | 


--------------------------------------------------------------------------------
/chapter1_3.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | 
 3 | query = [1, 1]
 4 | document = [3, 5]
 5 | 
 6 | def length(vector):
 7 |     sq_length = 0
 8 |     for index in range(0, len(vector)):
 9 |         sq_length += math.pow(vector[index], 2)
10 |     return math.sqrt(sq_length)
11 |     
12 | def dot_product(vector1, vector2):
13 |     if len(vector1)==len(vector2):
14 |         dot_prod = 0
15 |         for index in range(0, len(vector1)):
16 |             dot_prod += vector1[index]*vector2[index]
17 |         return dot_prod
18 |     else:
19 |         return "Unmatching dimensionality"
20 | 
21 | cosine =  dot_product(query, document) / (length(query) * length(document))     
22 | print (cosine)
23 | 


--------------------------------------------------------------------------------
/chapter2_word_split.py:
--------------------------------------------------------------------------------
 1 | text = 'Define which data represents "ham" class and which data represents "spam" class for the machine-learning algorithm.'
 2 | text = "i. e."
 3 | delimiters = ['"', "."]
 4 | words = []
 5 | current_word = ""
 6 | for char in text:
 7 |     if char==" ":
 8 |         if not current_word=="":
 9 |             words.append(current_word)
10 |             current_word = ""
11 |     elif char in delimiters:
12 |         if current_word=="":
13 |             words.append(char)
14 |         else:
15 |             words.append(current_word)
16 |             words.append(char)
17 |             current_word = ""
18 |     else:
19 |         current_word += char
20 | 
21 | print(words)
22 | 


--------------------------------------------------------------------------------
/cisi.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ekochmar/Getting-Started-with-NLP/df1b3dae0164b5f5beeabbc37d732e9ef6412a4b/cisi.zip


--------------------------------------------------------------------------------
/enron1.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ekochmar/Getting-Started-with-NLP/df1b3dae0164b5f5beeabbc37d732e9ef6412a4b/enron1.zip


--------------------------------------------------------------------------------
/enron2.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ekochmar/Getting-Started-with-NLP/df1b3dae0164b5f5beeabbc37d732e9ef6412a4b/enron2.zip


--------------------------------------------------------------------------------
/review_polarity.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ekochmar/Getting-Started-with-NLP/df1b3dae0164b5f5beeabbc37d732e9ef6412a4b/review_polarity.zip


--------------------------------------------------------------------------------
/rt-polaritydata.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ekochmar/Getting-Started-with-NLP/df1b3dae0164b5f5beeabbc37d732e9ef6412a4b/rt-polaritydata.zip


--------------------------------------------------------------------------------
/sentiment_words.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ekochmar/Getting-Started-with-NLP/df1b3dae0164b5f5beeabbc37d732e9ef6412a4b/sentiment_words.zip


--------------------------------------------------------------------------------
/time.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ekochmar/Getting-Started-with-NLP/df1b3dae0164b5f5beeabbc37d732e9ef6412a4b/time.zip


--------------------------------------------------------------------------------