├── 0. Demo notebook.ipynb ├── 1.Basics_NLP.ipynb ├── 2.Tokenization.ipynb ├── 3.JellyFishStringMatching.ipynb ├── 4.LanguageModels.ipynb ├── 5.Smoothing,Perplexity And Backoff.ipynb ├── 6.HMM.ipynb ├── 7.Parsing_with_CFG_and_PCFG.ipynb ├── 8.Introduction_to_WordNet_and_Word_semantics.ipynb ├── 9.Machine_Learning_python.ipynb ├── Data ├── Basics_NLP │ ├── dirForBasics_NLP │ │ ├── test1.txt │ │ └── test2.txt │ ├── errors.css.gz │ ├── gaur.txt │ └── nitai.pdf ├── Demo │ ├── battles.csv │ ├── character-deaths.csv │ ├── character-predictions.csv │ ├── got1.jpeg │ ├── jupyter2.png │ ├── stopwordsSQL.py │ ├── stopwordsSQL.pyc │ └── subtitles │ │ ├── Game.of.Thrones.S06E01.HDTV.en.srt │ │ ├── Game.of.Thrones.S06E02.HDTV.en.srt │ │ ├── Game.of.Thrones.S06E03.HDTV.RMTeam.en.srt │ │ ├── Game.of.Thrones.S06E04.HDTV.en.srt │ │ ├── Game.of.Thrones.S06E05.HDTV.FUM.en.srt │ │ ├── Game.of.Thrones.S06E06.HDTV.KILLERS-AVS.en.srt │ │ ├── Game.of.Thrones.S06E07.HDTV.en.srt │ │ ├── Game.of.Thrones.S06E08.HDTV.12-06-2016.en..srt │ │ ├── Game.of.Thrones.S06E09.HDTV.en..srt │ │ └── Game.of.Thrones.S06E10.HDTV.AVS.en..srt ├── EssentialsofML │ ├── heldoutFiltered.csv │ ├── heldoutFilteredWithNewFeatures.csv │ ├── removeCols.py │ ├── trainingFiltered.csv │ └── trainingFilteredWithNewFeatures.csv ├── IPythonDownloader.png ├── IntroductiontoWordNetandWordsemantics │ └── extracted_vectors.txt ├── Kaggle │ ├── BagOfCentroids.csv │ ├── Bag_of_Words_model.csv │ └── test.txt ├── LanguageModels │ ├── DoYourBest.txt │ ├── count_1edit.txt │ ├── count_1w.txt │ ├── count_2l.txt │ ├── count_2w.txt │ ├── count_3l.txt │ └── count_big.txt ├── ParsingWithCFGandPCFG │ └── all_addr.csv ├── Tokenization │ ├── Chat1.txt │ ├── Chat2.txt │ └── DoYourBest.txt └── parameters.JPG ├── Kaggle-Word2Vec.ipynb ├── ReadMe.md ├── Sentiment Analysis- NaiveBaye's, SVM, Random Forests.ipynb ├── doc2vec-lee - Final.ipynb └── imbalanced learn.ipynb /1.Basics_NLP.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introduction - File Opening" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Opening a file" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "

To open a file, use the built-in open() function. open() returns a file object, and is most commonly used with two arguments.
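For example, file_object = open('example.txt', 'r') opens the (hypothetical) file example.txt for reading, while passing 'w' as the second argument would open it for writing.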
\n", 22 | "\n", 23 | "The syntax is:
\n", 24 | "file_object = open(filename, mode)
\n", 25 | "where file_object is the variable to put the file object. The second argument describes the way in which the file will be used.\n", 26 | "

" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 6, 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "file = open('Data/Basics_NLP/gaur.txt', 'r')" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "**Note-:** The command \"open('newfile.txt', 'r')\" doesn't return the contents of the file. It actually makes something called a \"file object.\" You can think of a file like an old tape drive that you saw on mainframe computers in the 1950s, or even like a DVD player from today. You can move around inside them, and then \"read\" them, but the DVD player is not the DVD the same way the file object is not the file's contents." 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## Reading a file\n", 52 | "\n", 53 | "If you want to return a string containing all characters in the file, you can\n", 54 | "use file.read()." 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 1, 60 | "metadata": { 61 | "collapsed": false 62 | }, 63 | "outputs": [ 64 | { 65 | "name": "stdout", 66 | "output_type": "stream", 67 | "text": [ 68 | "Love is everlasting Forgiveness.\n", 69 | "Having free time is not an opulence, it's a danger.\n", 70 | "Krishna is the Supreme Personality of Godhead. \n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "file = open('Data/Basics_NLP/gaur.txt', 'r')\n", 76 | "\n", 77 | "print file.read()" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "We can also specify how many characters the string should return, by using\n", 85 | "file.read(n), where \"n\" determines number of characters.
\n", 86 | "\n", 87 | "This reads the first 5 characters of data and returns it as a string." 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 2, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "Love is\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "file = open('Data/Basics_NLP/gaur.txt', 'r')\n", 107 | "\n", 108 | "print file.read(7)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "## Reading multiple files from a folder\n", 116 | "\n", 117 | "The method listdir() returns a list containing the names of the entries in the directory given by path. The list is in arbitrary order. It does not include the special entries '.' and '..' even if they are present in the directory." 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 3, 123 | "metadata": { 124 | "collapsed": false 125 | }, 126 | "outputs": [ 127 | { 128 | "name": "stdout", 129 | "output_type": "stream", 130 | "text": [ 131 | "test2.txt\n", 132 | "test1.txt\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "import os\n", 138 | "\n", 139 | "for fileName in os.listdir(\"Data/Basics_NLP/dirForBasics_NLP\"):\n", 140 | " if fileName.endswith(\".txt\"):\n", 141 | " print(fileName)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "## Reading a file with size greater than RAM\n", 149 | "\n", 150 | "If the size of the file is very large then it can't be opened directly and the system may hang in an attempt to do so. So we use lazy loading of the file that reads the data in chunks" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 4, 156 | "metadata": { 157 | "collapsed": false 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "def read_in_chunks(file_object, chunk_size=1024):\n", 162 | " \"\"\"Lazy function (generator) to read a file piece by piece.\n", 163 | " Default chunk size: 1k.\"\"\"\n", 164 | " while True:\n", 165 | " data = file_object.read(chunk_size)\n", 166 | " if not data:\n", 167 | " break\n", 168 | " yield data\n", 169 | "\n", 170 | "\n", 171 | "f = open('Data/Basics_NLP/nitai.pdf')\n", 172 | "for piece in read_in_chunks(f):\n", 173 | " read_in_chunks(piece)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "## Lazy loading of a gzip file" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 5, 186 | "metadata": { 187 | "collapsed": false 188 | }, 189 | "outputs": [ 190 | { 191 | "name": "stdout", 192 | "output_type": "stream", 193 | "text": [ 194 | "h1, h2 {margin: 10px 25px 5px;}h2 {font-size: 1.1em;}.filename {font-style: italic;}.exceptionMessage {margin: 10px;border: 1px solid #000;padding: 5px;background-color: #E9E9E9;}.stack,.snippet {margin: 0 25px 10px;}.stack,.snippet {border: 1px solid #ccc;-mox-box-shadow: 0 0 2px rgba(0,0,0,0.2);-webkit-box-shadow: 0 0 2px rgba(0,0,0,0.2);box-shadow: 0 0 2px rgba(0,0,0,0.2);}.error-details {border-top: 1px solid #FFAAAA;-mox-box-shadow: 0 0 2px rgba(0,0,0,0.2);-webkit-box-shadow: 0 0 2px rgba(0,0,0,0.2);box-shadow: 0 0 2px rgba(0,0,0,0.2);border-bottom: 1px solid #FFAAAA;-mox-box-shadow: 0 0 2px rgba(0,0,0,0.2);-webkit-box-shadow: 0 0 2px rgba(0,0,0,0.2);box-shadow: 0 0 2px rgba(0,0,0,0.2);background-color:#FFF3F3;line-height: 1.5;overflow: hidden;padding: 5px;padding-left:25px;}.error-details dt {clear: 
left;float: left;font-weight: bold;margin-right: 5px;}.error-details dt:after {content: \":\";}.error-details dd {display: block;}.stack {padding: 5px;overflow: auto;height: 150px;}.snippet {background-color: #fff;font-family: monospace;}.snippet .line {display: block;}.snippet .lineNumber {background-color: #ddd;color: #999;display: inline-block;margin-right: 5px;padding: 0 3px;text-align: right;width: 3em;}.snippet .error {background-color: #fff3f3;font-weight: bold;}.snippet .error .lineNumber {background-color: #faa;color: #333;font-weight: bold;}.snippet .line:first-child .lineNumber {padding-top: 5px;}.snippet .line:last-child .lineNumber {padding-bottom: 5px;}\n" 195 |      ] 196 |     } 197 |    ], 198 |    "source": [ 199 |     "import gzip\n", 200 |     "f = gzip.open('Data/Basics_NLP/errors.css.gz', 'rb')\n", 201 |     "file_content = f.read()\n", 202 |     "print file_content" 203 |    ] 204 |   }, 205 |   { 206 |    "cell_type": "markdown", 207 |    "metadata": {}, 208 |    "source": [ 209 |     "### Reading the Jupyter Way" 210 |    ] 211 |   }, 212 |   { 213 |    "cell_type": "code", 214 |    "execution_count": 12, 215 |    "metadata": { 216 |     "collapsed": false 217 |    }, 218 |    "outputs": [ 219 |     { 220 |      "name": "stdout", 221 |      "output_type": "stream", 222 |      "text": [ 223 |       "void show_\n", 224 |       "void datat\n" 225 |      ] 226 |     } 227 |    ], 228 |    "source": [ 229 |     "a = !ls Data/Basics_NLP/dirForBasics_NLP\n", 230 |     "    \n", 231 |     "for files in a:\n", 232 |     "    f = open('Data/Basics_NLP/dirForBasics_NLP/'+files).read()\n", 233 |     "    print f[:10]" 234 |    ] 235 |   }, 236 |   { 237 |    "cell_type": "markdown", 238 |    "metadata": {}, 239 |    "source": [ 240 |     "### Practice\n", 241 |     "\n", 242 |     "    1. How do you read a whole file into memory and split it line by line?\n", 243 |     "    2. How do you read a file into memory line by line?\n", 244 |     "    \n", 245 |     "    Hint : `splitlines() or split()`, `with .. 
as ..`" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": { 251 | "collapsed": true 252 | }, 253 | "source": [ 254 | "Developer - Pranav Shukla, email - pranavdynamic@gmail.com" 255 | ] 256 | } 257 | ], 258 | "metadata": { 259 | "kernelspec": { 260 | "display_name": "Python 2", 261 | "language": "python", 262 | "name": "python2" 263 | }, 264 | "language_info": { 265 | "codemirror_mode": { 266 | "name": "ipython", 267 | "version": 2 268 | }, 269 | "file_extension": ".py", 270 | "mimetype": "text/x-python", 271 | "name": "python", 272 | "nbconvert_exporter": "python", 273 | "pygments_lexer": "ipython2", 274 | "version": "2.7.6" 275 | } 276 | }, 277 | "nbformat": 4, 278 | "nbformat_minor": 0 279 | } 280 | -------------------------------------------------------------------------------- /3.JellyFishStringMatching.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# JellyFish" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "**CONTENTS**:\n", 15 | "\n", 16 | "+ **Introduction** - Outlining the Problem\n", 17 | "+ **JellyFish and It's Algorithms** - Brief introduction to JellyFish\n", 18 | "+ **Levenshtein Distance** - Description, Calculation, and Examples\n", 19 | "+ **Damerau Levenshtein Distance** - Description, Calculation, and Examples\n", 20 | "+ **Jaro Distance** - Description, Calculation, and Examples\n", 21 | "+ **Jaro-Winkler Distance** - Description, Calculation, and Examples" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "metadata": { 28 | "collapsed": false 29 | }, 30 | "outputs": [ 31 | { 32 | "ename": "ImportError", 33 | "evalue": "No module named pandas", 34 | "output_type": "error", 35 | "traceback": [ 36 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 37 | "\u001b[1;31mImportError\u001b[0m Traceback (most recent call last)", 38 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mdatetime\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mos\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mpandas\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mDataFrame\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 39 | "\u001b[1;31mImportError\u001b[0m: No module named pandas" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "import pandas as pd\n", 45 | "import numpy as np\n", 46 | "import datetime\n", 47 | "import os\n", 48 | "from pandas import DataFrame\n", 49 | "from numpy import nan as NA\n", 50 | "from IPython.core.display import HTML\n", 51 | "from IPython.core.display import Image\n", 52 | "from IPython.display import Math\n", 53 | "from IPython.display import Latex\n", 54 | "import collections\n", 55 | "import jellyfish as jf\n", 56 | "import re" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "## Introduction" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | 
"metadata": {}, 69 | "source": [ 70 | "With regard to style names in production data we face two distinct problems:\n", 71 | "\n", 72 | "1. Style names not being clean within factory.
\n", 73 | "

The fact of the matter is that the string values of the style names are often not clean, meaning that styles that are in reality the same are not recognised as such. This is clearly problematic, as we want to be able to identify those styles that are produced on multiple lines in order to sensibly create the running-days values, to do across-line analysis of the same styles, and so on. \n", 74 |     "\n", 75 |     "2. Matching styles across factories.
\n", 76 | "

Eventually we will want to be able to match styles across factories in order to identify those styles that are produced in multiple factories. The reasons for this are pretty obvious. \n" 77 |     ] 78 |    }, 79 |    { 80 |     "cell_type": "markdown", 81 |     "metadata": {}, 82 |     "source": [ 83 |      "## JellyFish and Its Algorithms" 84 |     ] 85 |    }, 86 |    { 87 |     "cell_type": "markdown", 88 |     "metadata": {}, 89 |     "source": [ 90 |      "Jellyfish is a Python library for doing approximate and phonetic matching of strings. It allows for string comparison using the following algorithms:\n", 91 |      "\n", 92 |      "+ Levenshtein Distance\n", 93 |      "+ Damerau-Levenshtein Distance\n", 94 |      "+ Jaro Distance\n", 95 |      "+ Jaro-Winkler Distance\n", 96 |      "+ Match Rating Approach Comparison\n", 97 |      "+ Hamming Distance\n", 98 |      "\n", 99 |      "All of the above are metrics for measuring the difference between two sequences. Each of these will now be briefly introduced and an example given:" 100 |     ] 101 |    }, 102 |    { 103 |     "cell_type": "markdown", 104 |     "metadata": {}, 105 |     "source": [ 106 |      "\n", 107 |      "
" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "###**Levenshtein Distance**
\n", 115 | "The Levenshtein distance between two words is the minimum number of single-character edits (insertion, deletion, substitution) required to transform one string into another. The lower the score, the lower the number of edits. If the score is 0, then no edits are needed so the strings are exactly the same. \n", 116 | "\n", 117 | "For example, the Levenshtein distance between \"kitten\" and \"sitting\" is 3, since the following three edits change one into the other, and there is no way to do it with fewer than three edits:\n", 118 | "\n", 119 | "+ Edit 1: kitten → sitten (substitution of \"s\" for \"k\")\n", 120 | "+ Edit 2: sitten → sittin (substitution of \"i\" for \"e\")\n", 121 | "+ Edit 3: sittin → sitting (insertion of \"g\" at the end)\n", 122 | "\n", 123 | "The measure is most useful when looking to match short strings with longer strings. It takes a lot of computational power to do this with long strings and may not therefore be totally appropriate. \n", 124 | "\n", 125 | "For more information about the measure see\n", 126 | "\n", 127 | "http://en.wikipedia.org/wiki/Levenshtein_distance\n", 128 | "\n", 129 | "Now some examples follow:" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 4, 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/plain": [ 142 | "4" 143 | ] 144 | }, 145 | "execution_count": 4, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | } 149 | ], 150 | "source": [ 151 | "A = 'Rory'\n", 152 | "B = 'Ronnie'\n", 153 | "\n", 154 | "jf.levenshtein_distance(unicode(A), unicode(B))" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "As the following demonstrates, the algorithm also considers case, which is a strong argument for converting all strings to the same case. This is a pretty standard way of working with strings so will come as no surprise." 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 6, 167 | "metadata": { 168 | "collapsed": false 169 | }, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "1" 175 | ] 176 | }, 177 | "execution_count": 6, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "A = 'Rory'\n", 184 | "B = 'rory'\n", 185 | "jf.levenshtein_distance(unicode(A), unicode(B))" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 8, 191 | "metadata": { 192 | "collapsed": false 193 | }, 194 | "outputs": [ 195 | { 196 | "data": { 197 | "text/plain": [ 198 | "0" 199 | ] 200 | }, 201 | "execution_count": 8, 202 | "metadata": {}, 203 | "output_type": "execute_result" 204 | } 205 | ], 206 | "source": [ 207 | "# Now the levenshtein score of 0 means the strings are an exact match. \n", 208 | "jf.levenshtein_distance(unicode(A.lower()), unicode(B.lower()))" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "\n", 216 | "
" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "###**Damerau Levenshtein Distance**
\n", 224 | "This measure is very similar to the Levenshtein distance, in that it is a measure of the minimum number of edits needed to transform one string into another. The permissible 'edits' in the Levenshtein distance are insertions, deletion and substitution whereas in the Damerau Levenshtein distance the transposition of two adjacent characters is also allowed. Damerau claimed that these four edits correspond to 80% of human spelling errors. \n", 225 | "\n", 226 | "As with the Levenshtein distance a score of zero indicates and exact match etc.\n", 227 | "\n", 228 | "This measure may be an improvement on the Levenshtein distance as using the Damerau Levenshtein Distance strings where two letters are simply the wrong way around will have a lower score (indicating a better match) than they would under the Levenshtein measure. \n", 229 | "\n", 230 | "A simple example will suffice:\n" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 12, 236 | "metadata": { 237 | "collapsed": false 238 | }, 239 | "outputs": [ 240 | { 241 | "data": { 242 | "text/plain": [ 243 | "2" 244 | ] 245 | }, 246 | "execution_count": 12, 247 | "metadata": {}, 248 | "output_type": "execute_result" 249 | } 250 | ], 251 | "source": [ 252 | "jf.levenshtein_distance(unicode('jellyfihs'), unicode('jellyfish'))" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 13, 258 | "metadata": { 259 | "collapsed": false 260 | }, 261 | "outputs": [ 262 | { 263 | "data": { 264 | "text/plain": [ 265 | "1" 266 | ] 267 | }, 268 | "execution_count": 13, 269 | "metadata": {}, 270 | "output_type": "execute_result" 271 | } 272 | ], 273 | "source": [ 274 | "jf.damerau_levenshtein_distance(unicode('jellyfihs'), unicode('jellyfish'))" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "In this example, the **Levenshtein** distance works like this:\n", 282 | "\n", 283 | "+ Edit 1: jellyfihs → jellyfiss (substitution of \"s\" for \"h\")\n", 284 | "+ Edit 2: jellyfiss → jellyfish (substitution of \"h\" for \"s\")\n", 285 | "\n", 286 | "This is only one way it could happen. For instance 'h' could have been deleted, and then inserted (and so on), but the minimum number of edits is always 2\n", 287 | "\n", 288 | "The **Damerau Levenshtein** distance works like this:\n", 289 | "\n", 290 | "+ Edit 1: jellyfihs → jellyfish (transposition of 's' and 'h' adjacent characters)\n", 291 | "\n", 292 | "The measure may therefore be a better one in that it recognises strings as being closer than the Levenshtein measure in cases where there has been a simple mix up of characters. \n", 293 | "\n", 294 | "\n", 295 | "**NB** I have observed some odd behaviour of the jf.damerau_levenshtein_distance algorithm. The odd behaviour may be related to me misunderstanding the nature of the measure, or it may be a problem with coding of the library. I have written to the developer for clarification. 
If you are interested, then see the following; if not, then skip to the next measure below.\n" 296 |     ] 297 |    }, 298 |    { 299 |     "cell_type": "code", 300 |     "execution_count": 15, 301 |     "metadata": { 302 |      "collapsed": false 303 |     }, 304 |     "outputs": [ 305 |      { 306 |       "data": { 307 |        "text/plain": [ 308 |         "1" 309 |        ] 310 |       }, 311 |       "execution_count": 15, 312 |       "metadata": {}, 313 |       "output_type": "execute_result" 314 |      } 315 |     ], 316 |     "source": [ 317 |      "jf.damerau_levenshtein_distance(unicode('jellyfihs'), unicode('jellyfish'))" 318 |     ] 319 |    }, 320 |    { 321 |     "cell_type": "code", 322 |     "execution_count": 17, 323 |     "metadata": { 324 |      "collapsed": false 325 |     }, 326 |     "outputs": [ 327 |      { 328 |       "data": { 329 |        "text/plain": [ 330 |         "2" 331 |        ] 332 |       }, 333 |       "execution_count": 17, 334 |       "metadata": {}, 335 |       "output_type": "execute_result" 336 |      } 337 |     ], 338 |     "source": [ 339 |      "jf.damerau_levenshtein_distance(unicode('ifhs'), unicode('fish'))" 340 |     ] 341 |    }, 342 |    { 343 |     "cell_type": "markdown", 344 |     "metadata": {}, 345 |     "source": [ 346 |      "I find the above output very odd, because the edits required in both instances are exactly the same, so the scores should be the same:\n", 347 |      "\n", 348 |      "In the first example:\n", 349 |      "\n", 350 |      "+ Edit 1: jellyifhs → jellyfihs (transpose adjacent characters 'i' and 'f')\n", 351 |      "+ Edit 2: jellyfihs → jellyfish (transpose adjacent characters 'h' and 's')\n", 352 |      "\n", 353 |      "In the second example:\n", 354 |      "\n", 355 |      "+ Edit 1: ifhs → fihs (transpose adjacent characters 'i' and 'f')\n", 356 |      "+ Edit 2: fihs → fish (transpose adjacent characters 'h' and 's')\n", 357 |      "\n", 358 |      "Why the outputs are different remains to be determined.\n", 359 |      "\n", 360 |      "**Update** It appears from looking at the source code that in some cases the function is returning the OSA measure, not the Damerau Levenshtein measure. The developer is now aware and has been contacted. Use this measure with caution. \n", 361 |      "\n", 362 |      "More information on the measure can be found at http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance" 363 |     ] 364 |    }, 365 |    { 366 |     "cell_type": "markdown", 367 |     "metadata": {}, 368 |     "source": [ 369 |      "
" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": {}, 375 | "source": [ 376 | "###**Jaro Distance**
\n", 377 | "The Jaro distance is a measure that considers the number of matching characters in both strings being compared, and also the number of transpositions which is defined as the number of matching characters (in a different sequence order) divided by two. The measure returns a score between 0 and 1, 0 being no match whatsoever (as defined in the calculation) and 1 being a perfect match. \n", 378 | "\n", 379 | "Beware that this measure will ignore matching characters that are more than a certain distance from each other. This could either be a good thing (to ignore spurious matches) or a bad thing (ignoring correct matches). In any event, it is important to be aware of this, so it is explained in detail below.\n", 380 | "\n", 381 | "It is calculated as follows:\n" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 18, 387 | "metadata": { 388 | "collapsed": false 389 | }, 390 | "outputs": [ 391 | { 392 | "data": { 393 | "text/latex": [ 394 | "\\begin{eqnarray}\n", 395 | "d_j = \\left\\{\n", 396 | " \\begin{array}{1 1}\n", 397 | " 0 & \\quad \\text{if $m$ = 0 <}\\\\\n", 398 | " \\\\\n", 399 | " \\frac{1}{3} \\bigg(\\frac{m}{|s_1|} + \\frac{m}{|s_2|} + \\frac{m - t}{m} & \\quad \\text{otherwise}\n", 400 | " \\end{array} \\right.\n", 401 | "\\end{eqnarray}" 402 | ], 403 | "text/plain": [ 404 | "" 405 | ] 406 | }, 407 | "execution_count": 18, 408 | "metadata": {}, 409 | "output_type": "execute_result" 410 | } 411 | ], 412 | "source": [ 413 | "Latex(r\"\"\"\\begin{eqnarray}\n", 414 | "d_j = \\left\\{\n", 415 | " \\begin{array}{1 1}\n", 416 | " 0 & \\quad \\text{if $m$ = 0 <}\\\\\n", 417 | " \\\\\n", 418 | " \\frac{1}{3} \\bigg(\\frac{m}{|s_1|} + \\frac{m}{|s_2|} + \\frac{m - t}{m} & \\quad \\text{otherwise}\n", 419 | " \\end{array} \\right.\n", 420 | "\\end{eqnarray}\"\"\")" 421 | ] 422 | }, 423 | { 424 | "cell_type": "markdown", 425 | "metadata": {}, 426 | "source": [ 427 | "+ **$m$** is the number of matching characters\n", 428 | "+ **$t$** is half the number of transpositions.\n", 429 | "+ **$|s_1|$** is the length of the first string to be matched\n", 430 | "+ **$|s_2|$** is the length of the second string to be matched\n", 431 | "\n", 432 | "If the number of matching characters is 0, then the measure equals 0\n", 433 | "\n", 434 | "A character in each string is only considered to be matching if it is the same and obeys the following rule as to distance:" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": 19, 440 | "metadata": { 441 | "collapsed": false 442 | }, 443 | "outputs": [ 444 | { 445 | "data": { 446 | "text/latex": [ 447 | "\\begin{eqnarray}\n", 448 | "\\frac{max(|s_1|,|s_2|)}{2} -1\n", 449 | "\\end{eqnarray}" 450 | ], 451 | "text/plain": [ 452 | "" 453 | ] 454 | }, 455 | "execution_count": 19, 456 | "metadata": {}, 457 | "output_type": "execute_result" 458 | } 459 | ], 460 | "source": [ 461 | "Latex(r\"\"\"\\begin{eqnarray}\n", 462 | "\\frac{max(|s_1|,|s_2|)}{2} -1\n", 463 | "\\end{eqnarray}\"\"\")" 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": {}, 469 | "source": [ 470 | "Observe the following:\n" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 21, 476 | "metadata": { 477 | "collapsed": false 478 | }, 479 | "outputs": [ 480 | { 481 | "data": { 482 | "text/plain": [ 483 | "0" 484 | ] 485 | }, 486 | "execution_count": 21, 487 | "metadata": {}, 488 | "output_type": "execute_result" 489 | } 490 | ], 491 | "source": [ 492 | "S1 = 'AzzzzB'\n", 493 | "S2 = 'BxxA'\n", 494 | 
"jf.jaro_winkler(unicode(S1), unicode(S2))" 495 | ] 496 | }, 497 | { 498 | "cell_type": "markdown", 499 | "metadata": {}, 500 | "source": [ 501 | "Although the characters A and B appear in both strings, m = 0 because they are farther in distance from each other than:" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": 22, 507 | "metadata": { 508 | "collapsed": false 509 | }, 510 | "outputs": [ 511 | { 512 | "data": { 513 | "text/latex": [ 514 | "\\begin{eqnarray}\n", 515 | "\\frac{max(|6|,|4|)}{2} -1 = 2\n", 516 | "\\end{eqnarray}" 517 | ], 518 | "text/plain": [ 519 | "" 520 | ] 521 | }, 522 | "execution_count": 22, 523 | "metadata": {}, 524 | "output_type": "execute_result" 525 | } 526 | ], 527 | "source": [ 528 | "Latex(r\"\"\"\\begin{eqnarray}\n", 529 | "\\frac{max(|6|,|4|)}{2} -1 = 2\n", 530 | "\\end{eqnarray}\"\"\")" 531 | ] 532 | }, 533 | { 534 | "cell_type": "markdown", 535 | "metadata": {}, 536 | "source": [ 537 | "Transpositions are also important as already mentioned. The number of transpositions is calculated as \"The number of matching (but different sequence order) characters divided by 2\". Observe the following:" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": 24, 543 | "metadata": { 544 | "collapsed": false 545 | }, 546 | "outputs": [ 547 | { 548 | "data": { 549 | "text/plain": [ 550 | "0.9523809523809524" 551 | ] 552 | }, 553 | "execution_count": 24, 554 | "metadata": {}, 555 | "output_type": "execute_result" 556 | } 557 | ], 558 | "source": [ 559 | "S3 = 'Poverty'\n", 560 | "S4 = 'Poervty'\n", 561 | "jf.jaro_distance(unicode(S3), unicode(S4))" 562 | ] 563 | }, 564 | { 565 | "cell_type": "markdown", 566 | "metadata": {}, 567 | "source": [ 568 | "Sadly it looks like this is also being calculated incorrectly. 
To my mind the calculation should be as follows:" 569 |     ] 570 |    }, 571 |    { 572 |     "cell_type": "code", 573 |     "execution_count": 25, 574 |     "metadata": { 575 |      "collapsed": false 576 |     }, 577 |     "outputs": [ 578 |      { 579 |       "data": { 580 |        "text/latex": [ 581 |         "\\begin{eqnarray}\n", 582 |         "\\frac{1}{3}\\Bigg(\\frac{7}{7} + \\frac{7}{7} + \\frac{7- \\frac{3}{2}}{7}\\Bigg) = 0.9285714\n", 583 |         "\\end{eqnarray}" 584 |        ], 585 |        "text/plain": [ 586 |         "<IPython.core.display.Latex object>" 587 |        ] 588 |       }, 589 |       "execution_count": 25, 590 |       "metadata": {}, 591 |       "output_type": "execute_result" 592 |      } 593 |     ], 594 |     "source": [ 595 |      "Latex(r\"\"\"\\begin{eqnarray}\n", 596 |      "\\frac{1}{3}\\Bigg(\\frac{7}{7} + \\frac{7}{7} + \\frac{7- \\frac{3}{2}}{7}\\Bigg) = 0.9285714\n", 597 |      "\\end{eqnarray}\"\"\")" 598 |     ] 599 |    }, 600 |    { 601 |     "cell_type": "markdown", 602 |     "metadata": {}, 603 |     "source": [ 604 |      "Whereas it appears that it is being calculated as follows:" 605 |     ] 606 |    }, 607 |    { 608 |     "cell_type": "code", 609 |     "execution_count": 26, 610 |     "metadata": { 611 |      "collapsed": false 612 |     }, 613 |     "outputs": [ 614 |      { 615 |       "data": { 616 |        "text/latex": [ 617 |         "\\begin{eqnarray}\n", 618 |         "\\frac{1}{3}\\Bigg(\\frac{7}{7} + \\frac{7}{7} + \\frac{7- \\frac{2}{2}}{7}\\Bigg) = 0.9523809\n", 619 |         "\\end{eqnarray}" 620 |        ], 621 |        "text/plain": [ 622 |         "<IPython.core.display.Latex object>" 623 |        ] 624 |       }, 625 |       "execution_count": 26, 626 |       "metadata": {}, 627 |       "output_type": "execute_result" 628 |      } 629 |     ], 630 |     "source": [ 631 |      "Latex(r\"\"\"\\begin{eqnarray}\n", 632 |      "\\frac{1}{3}\\Bigg(\\frac{7}{7} + \\frac{7}{7} + \\frac{7- \\frac{2}{2}}{7}\\Bigg) = 0.9523809\n", 633 |      "\\end{eqnarray}\"\"\")" 634 |     ] 635 |    }, 636 |    { 637 |     "cell_type": "markdown", 638 |     "metadata": {}, 639 |     "source": [ 640 |      "The critical difference is that I calculate the number of matching (but different sequence order) characters as 3, those being:\n", 641 |      "\n", 642 |      "+ e\n", 643 |      "+ r\n", 644 |      "+ v\n", 645 |      "\n", 646 |      "Whereas it appears that JellyFish thinks there are only two.\n", 647 |      "\n", 648 |      "Again, I have raised this issue with the developer. It may be resolved in future, but for now use this measure with caution. " 649 |     ] 650 |    }, 651 |    { 652 |     "cell_type": "markdown", 653 |     "metadata": {}, 654 |     "source": [ 655 |      "
" 656 | ] 657 | }, 658 | { 659 | "cell_type": "markdown", 660 | "metadata": {}, 661 | "source": [ 662 | "###**Jaro-Winkler Distance**\n", 663 | "The Jaro-Winkler Distance measure builds upon the Jaro measure, but uses a prefix scale which gives more favorable ratings to strings that match from the beginning for a set prefix length. This will become clear when the calculation is viewed:" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": 27, 669 | "metadata": { 670 | "collapsed": false 671 | }, 672 | "outputs": [ 673 | { 674 | "data": { 675 | "text/latex": [ 676 | "\\begin{eqnarray}\n", 677 | "d_w = d_j + (\\ell p(1 - d_j))\n", 678 | "\\end{eqnarray}" 679 | ], 680 | "text/plain": [ 681 | "" 682 | ] 683 | }, 684 | "execution_count": 27, 685 | "metadata": {}, 686 | "output_type": "execute_result" 687 | } 688 | ], 689 | "source": [ 690 | "Latex(r\"\"\"\\begin{eqnarray}\n", 691 | "d_w = d_j + (\\ell p(1 - d_j))\n", 692 | "\\end{eqnarray}\"\"\")" 693 | ] 694 | }, 695 | { 696 | "cell_type": "markdown", 697 | "metadata": {}, 698 | "source": [ 699 | "The expression is made up of the following:\n", 700 | "\n", 701 | "+ **$d_w$** is the Jarrow-Winkler distance\n", 702 | "+ **$d_j$** is the Jaro distance for strings s1 and s2\n", 703 | "+ **$\\ell$** is the length of the common prefix up to a maximum of 4 characters\n", 704 | "+ **$p$** is a constant scaling factor for how much the score is adjusted upwards for having common prefixes. It should not exceed 0.25, otherwise the distance can become larger than 1. The standard value for this constant in Winkler's work is 0.1\n", 705 | " \n", 706 | " " 707 | ] 708 | }, 709 | { 710 | "cell_type": "markdown", 711 | "metadata": {}, 712 | "source": [ 713 | "The key takeaway is that strings that begin with the same prefixes score more highly...\n", 714 | "\n", 715 | "Observe the following:" 716 | ] 717 | }, 718 | { 719 | "cell_type": "code", 720 | "execution_count": 29, 721 | "metadata": { 722 | "collapsed": false 723 | }, 724 | "outputs": [ 725 | { 726 | "data": { 727 | "text/plain": [ 728 | "0.9363636363636364" 729 | ] 730 | }, 731 | "execution_count": 29, 732 | "metadata": {}, 733 | "output_type": "execute_result" 734 | } 735 | ], 736 | "source": [ 737 | "S5 = 'Innovaiosn'\n", 738 | "S6 = 'Innovations'\n", 739 | "jf.jaro_distance(unicode(S5), unicode(S6))" 740 | ] 741 | }, 742 | { 743 | "cell_type": "code", 744 | "execution_count": 30, 745 | "metadata": { 746 | "collapsed": false 747 | }, 748 | "outputs": [ 749 | { 750 | "data": { 751 | "text/plain": [ 752 | "0.9618181818181818" 753 | ] 754 | }, 755 | "execution_count": 30, 756 | "metadata": {}, 757 | "output_type": "execute_result" 758 | } 759 | ], 760 | "source": [ 761 | "jf.jaro_winkler(unicode(S5), unicode(S6))" 762 | ] 763 | }, 764 | { 765 | "cell_type": "markdown", 766 | "metadata": {}, 767 | "source": [ 768 | "Although it is not stated anywhere in the jellyfish documentation, it is clear that the value of $p$ is 0.1, this is because rearranging the following to solve for $\\ell$:" 769 | ] 770 | }, 771 | { 772 | "cell_type": "code", 773 | "execution_count": 31, 774 | "metadata": { 775 | "collapsed": false 776 | }, 777 | "outputs": [ 778 | { 779 | "data": { 780 | "text/latex": [ 781 | "\\begin{eqnarray}\n", 782 | "d_w = d_j + (\\ell p(1 - d_j))\\\\\n", 783 | "\\\\\n", 784 | "\\\\\n", 785 | "\\ell = \\frac{d_w - d_j}{1 - d_j} * \\frac{1}{4}\\\\\n", 786 | "\\\\\n", 787 | "\\\\\n", 788 | "0.1 = \\frac{0.96182-0.936364}{1-0.936364} * \\frac{1}{4}\n", 789 | "\n", 790 | 
"\\end{eqnarray}" 791 | ], 792 | "text/plain": [ 793 | "" 794 | ] 795 | }, 796 | "execution_count": 31, 797 | "metadata": {}, 798 | "output_type": "execute_result" 799 | } 800 | ], 801 | "source": [ 802 | "Latex(r\"\"\"\\begin{eqnarray}\n", 803 | "d_w = d_j + (\\ell p(1 - d_j))\\\\\n", 804 | "\\\\\n", 805 | "\\\\\n", 806 | "\\ell = \\frac{d_w - d_j}{1 - d_j} * \\frac{1}{4}\\\\\n", 807 | "\\\\\n", 808 | "\\\\\n", 809 | "0.1 = \\frac{0.96182-0.936364}{1-0.936364} * \\frac{1}{4}\n", 810 | "\n", 811 | "\\end{eqnarray}\"\"\")" 812 | ] 813 | }, 814 | { 815 | "cell_type": "markdown", 816 | "metadata": {}, 817 | "source": [ 818 | "In some implementations of Jaro-Winkler, the prefix bonus is only added when the compared strings have a Jaro distance above a set \"boost threshold\". In the work of the developer himself, this threshold was set at 0.7. In other words, if the Jaro measure is less than 0.7, then even if the prefixes of the string are the same up to four characters, the prefix bonus will not be applied. Then the calculation looks like this:" 819 | ] 820 | }, 821 | { 822 | "cell_type": "code", 823 | "execution_count": 32, 824 | "metadata": { 825 | "collapsed": false 826 | }, 827 | "outputs": [ 828 | { 829 | "data": { 830 | "text/latex": [ 831 | "\\begin{eqnarray}\n", 832 | "d_w = \\left\\{\n", 833 | " \\begin{array}{1 1}\n", 834 | " d_j & \\quad \\text{if $d_j$ < $b_t$}\\\\\n", 835 | " d_j + (\\ell p(1 - d_j)) & \\quad \\text{otherwise}\n", 836 | " \\end{array} \\right.\n", 837 | "\\end{eqnarray}" 838 | ], 839 | "text/plain": [ 840 | "" 841 | ] 842 | }, 843 | "execution_count": 32, 844 | "metadata": {}, 845 | "output_type": "execute_result" 846 | } 847 | ], 848 | "source": [ 849 | "Latex(r\"\"\"\\begin{eqnarray}\n", 850 | "d_w = \\left\\{\n", 851 | " \\begin{array}{1 1}\n", 852 | " d_j & \\quad \\text{if $d_j$ < $b_t$}\\\\\n", 853 | " d_j + (\\ell p(1 - d_j)) & \\quad \\text{otherwise}\n", 854 | " \\end{array} \\right.\n", 855 | "\\end{eqnarray}\"\"\")" 856 | ] 857 | }, 858 | { 859 | "cell_type": "markdown", 860 | "metadata": {}, 861 | "source": [ 862 | "Again, as there is no documentation to JellyFish it is hard to know if this is being applied, and if so, at what level the $b_t$ value is set to trigger the prefix bonus. 
However, a bit of simple experimentation can demonstrate it: first I compare strings that have a Jaro score just below 0.7, and then strings that have a score just above 0.7" 863 |     ] 864 |    }, 865 |    { 866 |     "cell_type": "code", 867 |     "execution_count": 34, 868 |     "metadata": { 869 |      "collapsed": false 870 |     }, 871 |     "outputs": [ 872 |      { 873 |       "data": { 874 |        "text/plain": [ 875 |         "0.6777777777777777" 876 |        ] 877 |       }, 878 |       "execution_count": 34, 879 |       "metadata": {}, 880 |       "output_type": "execute_result" 881 |      } 882 |     ], 883 |     "source": [ 884 |      "S7 = 'ABCDqwerty'\n", 885 |      "S8 = 'ABCDpoiuyt'\n", 886 |      "jf.jaro_distance(unicode(S7), unicode(S8))" 887 |     ] 888 |    }, 889 |    { 890 |     "cell_type": "code", 891 |     "execution_count": 35, 892 |     "metadata": { 893 |      "collapsed": false 894 |     }, 895 |     "outputs": [ 896 |      { 897 |       "data": { 898 |        "text/plain": [ 899 |         "0.6777777777777777" 900 |        ] 901 |       }, 902 |       "execution_count": 35, 903 |       "metadata": {}, 904 |       "output_type": "execute_result" 905 |      } 906 |     ], 907 |     "source": [ 908 |      "jf.jaro_winkler(unicode(S7), unicode(S8))" 909 |     ] 910 |    }, 911 |    { 912 |     "cell_type": "code", 913 |     "execution_count": 36, 914 |     "metadata": { 915 |      "collapsed": false 916 |     }, 917 |     "outputs": [ 918 |      { 919 |       "data": { 920 |        "text/plain": [ 921 |         "0.7333333333333334" 922 |        ] 923 |       }, 924 |       "execution_count": 36, 925 |       "metadata": {}, 926 |       "output_type": "execute_result" 927 |      } 928 |     ], 929 |     "source": [ 930 |      "S9 = 'ABCDqwerty'\n", 931 |      "S10 = 'ABCDpoiuty'\n", 932 |      "jf.jaro_distance(unicode(S9), unicode(S10))" 933 |     ] 934 |    }, 935 |    { 936 |     "cell_type": "code", 937 |     "execution_count": 37, 938 |     "metadata": { 939 |      "collapsed": false 940 |     }, 941 |     "outputs": [ 942 |      { 943 |       "data": { 944 |        "text/plain": [ 945 |         "0.8400000000000001" 946 |        ] 947 |       }, 948 |       "execution_count": 37, 949 |       "metadata": {}, 950 |       "output_type": "execute_result" 951 |      } 952 |     ], 953 |     "source": [ 954 |      "jf.jaro_winkler(unicode(S9), unicode(S10))" 955 |     ] 956 |    }, 957 |    { 958 |     "cell_type": "markdown", 959 |     "metadata": {}, 960 |     "source": [ 961 |      "The above output indicates that the implementation of the Jaro-Winkler measure in JellyFish does indeed use a threshold of 0.7 for $b_t$" 962 |     ] 963 |    }, 964 |    { 965 |     "cell_type": "markdown", 966 |     "metadata": {}, 967 |     "source": [ 968 |      "Developer - Pranav Shukla
\n", 969 | "Email - pranavdynamic@gmail.com" 970 | ] 971 | } 972 | ], 973 | "metadata": { 974 | "kernelspec": { 975 | "display_name": "Python 2", 976 | "language": "python", 977 | "name": "python2" 978 | }, 979 | "language_info": { 980 | "codemirror_mode": { 981 | "name": "ipython", 982 | "version": 2 983 | }, 984 | "file_extension": ".py", 985 | "mimetype": "text/x-python", 986 | "name": "python", 987 | "nbconvert_exporter": "python", 988 | "pygments_lexer": "ipython2", 989 | "version": "2.7.6" 990 | } 991 | }, 992 | "nbformat": 4, 993 | "nbformat_minor": 0 994 | } 995 | -------------------------------------------------------------------------------- /Data/Basics_NLP/dirForBasics_NLP/test1.txt: -------------------------------------------------------------------------------- 1 | void show_LRT(Mat); 2 | void upsample(Mat); 3 | double myPSNR(Mat&, Mat&); 4 | void write_rect(Mat&, Mat&, int, int); 5 | void make_rect(Mat&, Mat&, int); 6 | void datatolrt(Mat&, Mat&); 7 | double getPSNR(Mat&, Mat&); 8 | void trainbynn(Mat&, Mat&, Mat&, Mat&); 9 | void copy(Mat&, Mat&, int); 10 | int removezeros(Mat&, Mat&, int, Mat&, Mat&); 11 | void LRT(Mat, Mat&, int, int, int); 12 | int maskfn(Mat&, int, int, int, int, int); 13 | void make_training_data(Mat&, int); 14 | void make_training_response(Mat&); 15 | void make_train_mat(Mat&, Mat&); 16 | void make_response_mat(Mat&, Mat&); 17 | void makedata(Mat&, Mat&, int); 18 | void maketrarinfile(Mat&); 19 | void makeresfile(Mat&); -------------------------------------------------------------------------------- /Data/Basics_NLP/dirForBasics_NLP/test2.txt: -------------------------------------------------------------------------------- 1 | void datatolrt(Mat& generate, Mat& rehrlrt) 2 | { 3 | Mat rect(4, 4, CV_32FC1); 4 | int horizontal = floor((rehrlrt.cols - 3) / 2); 5 | int row_indx = 0; 6 | int row_rect = 0; 7 | int col_rect = 0; 8 | 9 | for (int row_indx = 0; row_indx < generate.rows - 1000; row_indx++) 10 | { 11 | col_rect = 2 * (row_indx % horizontal); 12 | row_rect = 2 * (row_indx / horizontal); 13 | make_rect(rect, generate, row_indx); 14 | write_rect(rect, rehrlrt, row_rect, col_rect); 15 | } 16 | 17 | for (int i = 0; i < rehrlrt.rows; i++) 18 | { 19 | for (int j = 0; j < rehrlrt.cols; j++) 20 | { 21 | if (i>1 & i < rehrlrt.rows - 2 & j>1 & j < rehrlrt.cols - 2) 22 | rehrlrt.at(i, j) /= 4; 23 | else if (((i == 0 || i == 1 || i == rehrlrt.rows - 2 || i == rehrlrt.rows - 1) & (j>1 & j < rehrlrt.cols - 2)) || ((j == 0 || j == 1 || j == rehrlrt.cols - 2 || j == rehrlrt.cols - 1) & (i>1 & i < rehrlrt.rows - 2))) 24 | rehrlrt.at(i, j) /= 2; 25 | } 26 | } 27 | 28 | } 29 | 30 | void write_rect(Mat& rect, Mat& rehrlrt, int row_rect, int col_rect) 31 | { 32 | for (int k = 0; k < 4; k++) 33 | { 34 | for (int l = 0; l < 4; l++) 35 | { 36 | rehrlrt.at(row_rect + k, col_rect + l) += (float)rect.at(k, l); 37 | } 38 | } 39 | } 40 | 41 | void make_rect(Mat& rect, Mat& generate, int generate_rowno) 42 | { 43 | for (int k = 0; k < 4; k++) 44 | { 45 | for (int l = 0; l < 4; l++) 46 | { 47 | rect.at(k, l) = (float)generate.at(generate_rowno, 4 * k + l); 48 | } 49 | } 50 | } 51 | 52 | 53 | void trainbynn(Mat& train, Mat& res, Mat& generate, Mat& train_data) 54 | { 55 | 56 | Mat label = Mat::zeros(res.rows, 1, CV_32FC1); 57 | for (int i = 0; i < 16; i++) 58 | { 59 | res.col(i).copyTo(label); 60 | if (train.rows == label.rows) 61 | { 62 | cout << "H H Entered into " << i << "th the Neural Network" << endl; 63 | Ptr< ANN_MLP > nn = ANN_MLP::create(); 64 | 65 | //setting the NN 
layer size 66 | cv::Mat layers = cv::Mat(6, 1, CV_32SC1); 67 | layers.row(0) = cv::Scalar(4); 68 | layers.row(1) = cv::Scalar(3); 69 | layers.row(2) = cv::Scalar(3); 70 | layers.row(3) = cv::Scalar(2); 71 | layers.row(4) = cv::Scalar(2); 72 | layers.row(5) = cv::Scalar(1); 73 | nn->setLayerSizes(layers); 74 | 75 | nn->setActivationFunction(cv::ml::ANN_MLP::SIGMOID_SYM); 76 | nn->setTrainMethod(cv::ml::ANN_MLP::BACKPROP); 77 | nn->setBackpropMomentumScale(0.1); 78 | nn->setBackpropWeightScale(0.1); 79 | //nn->setTermCriteria(TermCriteria(TermCriteria::MAX_ITER, (int)100000, 0.01)); 80 | 81 | 82 | cout << "Learning " << endl; 83 | if (!nn->train(train, ml::ROW_SAMPLE, label)) 84 | { 85 | cout << "H fail " << train.rows << endl; 86 | cout << "H fail " << res.rows << endl; 87 | cout << "H Learning failed" << endl; 88 | while (1){} 89 | waitKey(0); 90 | } 91 | 92 | cout << "Learnt " << endl; 93 | /********************* Here Ends training ofthe Neural Networks ******************************/ 94 | 95 | /***************************Predicting*************************************/ 96 | 97 | Mat ans(train_data.rows, 16, CV_32F); 98 | //Mat samples(1, 4, CV_32F); 99 | //samples.at(0, 0) = 3; samples.at(0, 1) = 3; samples.at(0, 2) = 3; samples.at(0, 3) = 2; 100 | nn->predict(train_data, ans); 101 | Mat ans1(train_data.rows, 16, CV_8UC1); 102 | ans.convertTo(ans1, CV_8UC1); 103 | 104 | ans1.copyTo(generate.col(i)); 105 | generate.convertTo(generate, CV_32FC1); 106 | 107 | 108 | } 109 | } 110 | cout << "\n Gopal \n"; 111 | } 112 | double myPSNR(Mat& I1, Mat& I2) 113 | { 114 | Mat s1(I1.rows, 16, CV_8UC1); 115 | absdiff(I1, I2, s1); // |I1 - I2| 116 | s1.convertTo(s1, CV_32F); // cannot make a square on 8 bits 117 | s1 = s1.mul(s1); // |I1 - I2|^2 118 | 119 | Scalar s = sum(s1); // sum elements per channel 120 | 121 | double sse = s.val[0] + s.val[1] + s.val[2]; // sum channels 122 | cout << "sse" << endl; 123 | cout << "s.val[0] = " << s.val[0] << endl; 124 | cout << "s.val[1] = " << s.val[1] << endl; 125 | cout << "s.val[2] = " << s.val[2] << endl; 126 | cout << "I1.channels() = " << I1.channels() << endl; 127 | cout << "I1.total() = " << I1.total() << endl; 128 | 129 | if (sse <= 1e-10) // for small values return zero 130 | return 0; 131 | else 132 | { 133 | double mse = sse / (double)(I1.channels() * I1.total()); 134 | double psnr = 10.0 * log10((8 * 8) / mse); 135 | return psnr; 136 | } 137 | } 138 | double getPSNR(Mat& I1, Mat& I2) 139 | { 140 | Mat s1(I1.rows, 16, CV_8UC1); 141 | absdiff(I1, I2, s1); // |I1 - I2| 142 | s1.convertTo(s1, CV_32F); // cannot make a square on 8 bits 143 | s1 = s1.mul(s1); // |I1 - I2|^2 144 | 145 | Scalar s = sum(s1); // sum elements per channel 146 | 147 | double sse = s.val[0] + s.val[1] + s.val[2]; // sum channels 148 | /*cout << "sse" << endl; 149 | cout << "s.val[0] = " << s.val[0]<< endl; 150 | cout << "s.val[1] = " << s.val[1] << endl; 151 | cout << "s.val[2] = " << s.val[2] << endl;*/ 152 | 153 | 154 | if (sse <= 1e-10) // for small values return zero 155 | return 0; 156 | else 157 | { 158 | double mse = sse / (double)(I1.channels() * I1.total()); 159 | double psnr = 10.0 * log10((255 * 255) / mse); 160 | return psnr; 161 | } 162 | } 163 | 164 | void copy(Mat& input, Mat& output, int n){ 165 | 166 | for (int i = 0; i(i, j) = (float)input.at(i, j); 171 | } 172 | } 173 | 174 | for (int i = 0; i < n*n; i++){ 175 | output.at(output.rows - 1, i) = 0; 176 | } 177 | } 178 | 179 | void maketrarinfile(Mat& train) 180 | { 181 | ofstream trainfile("train.txt"); 182 | for 
(int i = 0; i(i, j) << " "; 187 | } 188 | trainfile << "\n"; 189 | } 190 | trainfile.close(); 191 | } 192 | 193 | void makeresfile(Mat& res) 194 | { 195 | ofstream resfile("res.txt"); 196 | for (int i = 0; i(i, j) << " "; 201 | } 202 | resfile << "\n"; 203 | } 204 | resfile.close(); 205 | } 206 | 207 | int removezeros(Mat& input, Mat& output, int n, Mat& train_data, Mat& response) 208 | { 209 | vector z; 210 | if (n == 2){ 211 | z = { 0, 0, 0, 0 }; 212 | } 213 | else{ 214 | z = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; 215 | } 216 | vector z1 = { 0, 0, 0, 0 }; 217 | vector z2 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; 218 | int j = 0; 219 | 220 | for (int i = 1; i < output.rows; i++) 221 | { 222 | if ((vector)input.row(i) != z && (vector)train_data.row(i) != z1 && (vector)response.row(i) != z2) 223 | { 224 | for (int l = 0; l < n*n; l++){ 225 | output.at(j, l) = (float)input.at(i, l); 226 | } 227 | //cout << output.at(j, 2) << endl; 228 | j++; 229 | } 230 | } 231 | 232 | return j; 233 | } 234 | 235 | void makedata(Mat& lrt, Mat& data, int n) 236 | { 237 | string line; 238 | int k = 0; 239 | //data.create((lrt.rows - n + 1)*(lrt.cols - n + 1), n*n, CV_32FC1); 240 | if (n == 2) 241 | data.create((lrt.rows - 1)*(lrt.cols - 1), 4, CV_32FC1); 242 | else 243 | data.create((lrt.rows / 2 - 1)*(lrt.cols / 2 - 1), 16, CV_32FC1); 244 | make_training_data(lrt, n); 245 | ifstream infile("indata.txt"); 246 | while (getline(infile, line)) 247 | { 248 | istringstream iss(line); 249 | 250 | if (n == 2) 251 | { 252 | float a, b, c, d; 253 | if (!(iss >> a >> b >> c >> d)) { break; } // error 254 | int l = 0; 255 | data.at(k, l) = a; l++; 256 | data.at(k, l) = b; l++; 257 | data.at(k, l) = c; l++; 258 | data.at(k, l) = d; l++; 259 | k++; 260 | } 261 | 262 | 263 | 264 | if (n == 4) 265 | { 266 | float a1, b1, c1, d1; 267 | float a2, b2, c2, d2; 268 | float a3, b3, c3, d3; 269 | float a4, b4, c4, d4; 270 | if (!(iss >> a1 >> b1 >> c1 >> d1 >> a2 >> b2 >> c2 >> d2 >> a3 >> b3 >> c3 >> d3 >> a4 >> b4 >> c4 >> d4)) { break; } // error 271 | int l = 0; 272 | data.at(k, l) = a1; l++; 273 | data.at(k, l) = b1; l++; 274 | data.at(k, l) = c1; l++; 275 | data.at(k, l) = d1; l++; 276 | data.at(k, l) = a2; l++; 277 | data.at(k, l) = b2; l++; 278 | data.at(k, l) = c2; l++; 279 | data.at(k, l) = d2; l++; 280 | data.at(k, l) = a3; l++; 281 | data.at(k, l) = b3; l++; 282 | data.at(k, l) = c3; l++; 283 | data.at(k, l) = d3; l++; 284 | data.at(k, l) = a4; l++; 285 | data.at(k, l) = b4; l++; 286 | data.at(k, l) = c4; l++; 287 | data.at(k, l) = d4; l++; 288 | k++; 289 | } 290 | } 291 | } 292 | 293 | void make_training_data(Mat& input, int n) 294 | { 295 | ofstream indata("indata.txt"); 296 | for (int i = 0; i(i + k, j + l) << " "; 307 | } 308 | } 309 | indata << "\n"; 310 | //indata << (float)input.at(i, j) << " "<< (float)input.at(i, j + 1) << " " << (float)input.at(i + 1, j) << " " << (float)input.at(i + 1, j + 1) << "\n"; 311 | } 312 | } 313 | indata.close(); 314 | } 315 | 316 | int maskfn(Mat&input, int r, int c, int delta, int crow, int ccol) 317 | { 318 | int result = 0; 319 | for (int i = -r / 2; i <= r / 2; i++) 320 | { 321 | for (int j = -c / 2; j <= c / 2; j++) 322 | { 323 | if (i != 0 || j != 0) 324 | { 325 | if (input.at(crow, ccol) > (input.at(crow + i, ccol + j) - delta)) 326 | result++; 327 | } 328 | } 329 | } 330 | return result; 331 | } 332 | void make_train_mat(Mat& input, Mat& train_data) 333 | { 334 | train_data.create((input.rows - 1)*(input.cols - 1), 4, CV_32FC1); 335 | int k = 0; 336 
| for (int i = 0; i(k, l) = (float)input.at(i, j); l++; 342 | train_data.at(k, l) = (float)input.at(i, j + 1); l++; 343 | train_data.at(k, l) = (float)input.at(i + 1, j); l++; 344 | train_data.at(k, l) = (float)input.at(i + 1, j + 1); l++; 345 | k++; 346 | } 347 | } 348 | 349 | } 350 | 351 | 352 | void make_response_mat(Mat& input, Mat& response) 353 | { 354 | response.create(input.rows*input.cols, 16, CV_32FC1); 355 | int k = 0; 356 | for (int i = 0; i(0, 0) = (float)input.at(i, j); 362 | response.at(k, l) = (float)input.at(i, j + 1); l++; 363 | response.at(k, l) = (float)input.at(i, j + 2); l++; 364 | response.at(k, l) = (float)input.at(i, j + 3); l++; 365 | response.at(k, l) = (float)input.at(i + 1, j); l++; 366 | response.at(k, l) = (float)input.at(i + 1, j + 1); l++; 367 | response.at(k, l) = (float)input.at(i + 1, j + 2); l++; 368 | response.at(k, l) = (float)input.at(i + 1, j + 3); l++; 369 | response.at(k, l) = (float)input.at(i + 2, j); l++; 370 | response.at(k, l) = (float)input.at(i + 2, j + 1); l++; 371 | response.at(k, l) = (float)input.at(i + 2, j + 2); l++; 372 | response.at(k, l) = (float)input.at(i + 2, j + 3); l++; 373 | response.at(k, l) = (float)input.at(i + 3, j); l++; 374 | response.at(k, l) = (float)input.at(i + 3, j + 1); l++; 375 | response.at(k, l) = (float)input.at(i + 3, j + 2); l++; 376 | response.at(k, l) = (float)input.at(i + 3, j + 3); l++; 377 | k++; 378 | } 379 | } 380 | } 381 | 382 | 383 | 384 | 385 | void make_training_response(Mat& input) 386 | { 387 | ofstream outdata("outdata.txt"); 388 | for (int i = 0; i(i, j) << " " << (float)input.at(i, j + 1) << " " << (float)input.at(i, j + 2) << " " << (float)input.at(i, j + 3) << " " << (float)input.at(i + 1, j) << " " << (float)input.at(i + 1, j + 1) << " " << (float)input.at(i + 1, j + 2) << " " << (float)input.at(i + 1, j + 3) << " " << (float)input.at(i + 2, j) << " " << (float)input.at(i + 2, j + 1) << " " << (float)input.at(i + 2, j + 2) << " " << (float)input.at(i + 2, j + 3) << " " << (float)input.at(i + 3, j) << " " << (float)input.at(i + 3, j + 1) << " " << (float)input.at(i + 3, j + 2) << " " << (float)input.at(i + 3, j + 3) << "\n"; 393 | 394 | } 395 | } 396 | outdata.close(); 397 | } 398 | 399 | //void show_images() 400 | //{ 401 | // namedWindow("Display window", WINDOW_AUTOSIZE);// Create a window for display. 402 | // imshow("Display window", input); // Show our original image inside it. 403 | // 404 | // namedWindow("original lrt window", WINDOW_AUTOSIZE);// Create a window for display. 405 | // imshow("original lrt window", hr_lrt); // Show our LRT inside it. 406 | // 407 | // namedWindow("Low res image window", WINDOW_AUTOSIZE);// Create a window for display. 408 | // imshow("Low res image window", lr_img); // Show our image inside it. 409 | // 410 | // namedWindow("lr_lrt", WINDOW_AUTOSIZE);// Create a window for display. 411 | // imshow("lr_lrt", lr_lrt); 412 | //} 413 | 414 | -------------------------------------------------------------------------------- /Data/Basics_NLP/errors.css.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krishnamrith12/NotebooksNLP/8902e2cbeebce7484e394d68dfe9970fee969aa5/Data/Basics_NLP/errors.css.gz -------------------------------------------------------------------------------- /Data/Basics_NLP/gaur.txt: -------------------------------------------------------------------------------- 1 | Love is everlasting Forgiveness. 2 | Having free time is not an opulence, it's a danger. 
3 | Krishna is the Supreme Personality of Godhead. -------------------------------------------------------------------------------- /Data/Basics_NLP/nitai.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krishnamrith12/NotebooksNLP/8902e2cbeebce7484e394d68dfe9970fee969aa5/Data/Basics_NLP/nitai.pdf -------------------------------------------------------------------------------- /Data/Demo/battles.csv: -------------------------------------------------------------------------------- 1 | name,year,battle_number,attacker_king,defender_king,attacker_1,attacker_2,attacker_3,attacker_4,defender_1,defender_2,defender_3,defender_4,attacker_outcome,battle_type,major_death,major_capture,attacker_size,defender_size,attacker_commander,defender_commander,summer,location,region,note Battle of the Golden Tooth,298,1,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Tully,,,,win,pitched battle,1,0,15000,4000,Jaime Lannister,"Clement Piper, Vance",1,Golden Tooth,The Westerlands, Battle at the Mummer's Ford,298,2,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Baratheon,,,,win,ambush,1,0,,120,Gregor Clegane,Beric Dondarrion,1,Mummer's Ford,The Riverlands, Battle of Riverrun,298,3,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Tully,,,,win,pitched battle,0,1,15000,10000,"Jaime Lannister, Andros Brax","Edmure Tully, Tytos Blackwood",1,Riverrun,The Riverlands, Battle of the Green Fork,298,4,Robb Stark,Joffrey/Tommen Baratheon,Stark,,,,Lannister,,,,loss,pitched battle,1,1,18000,20000,"Roose Bolton, Wylis Manderly, Medger Cerwyn, Harrion Karstark, Halys Hornwood","Tywin Lannister, Gregor Clegane, Kevan Lannister, Addam Marbrand",1,Green Fork,The Riverlands, Battle of the Whispering Wood,298,5,Robb Stark,Joffrey/Tommen Baratheon,Stark,Tully,,,Lannister,,,,win,ambush,1,1,1875,6000,"Robb Stark, Brynden Tully",Jaime Lannister,1,Whispering Wood,The Riverlands, Battle of the Camps,298,6,Robb Stark,Joffrey/Tommen Baratheon,Stark,Tully,,,Lannister,,,,win,ambush,0,0,6000,12625,"Robb Stark, Tytos Blackwood, Brynden Tully","Lord Andros Brax, Forley Prester",1,Riverrun,The Riverlands, Sack of Darry,298,7,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Darry,,,,win,pitched battle,0,0,,,Gregor Clegane,Lyman Darry,1,Darry,The Riverlands, Battle of Moat Cailin,299,8,Balon/Euron Greyjoy,Robb Stark,Greyjoy,,,,Stark,,,,win,pitched battle,0,0,,,Victarion Greyjoy,,1,Moat Cailin,The North, Battle of Deepwood Motte,299,9,Balon/Euron Greyjoy,Robb Stark,Greyjoy,,,,Stark,,,,win,siege,0,0,1000,,Asha Greyjoy,,1,Deepwood Motte,The North, Battle of the Stony Shore,299,10,Balon/Euron Greyjoy,Robb Stark,Greyjoy,,,,Stark,,,,win,ambush,0,0,264,,Theon Greyjoy,,1,Stony Shore,The North,"Greyjoy's troop number based on the Battle of Deepwood Motte, in which Asha had 1000 soldier on 30 longships. That comes out to ~33 per longship. In the Battle of the Stony Shore, Theon has 8 longships, and just we can estimate that he has 8*33 =265 troops." Battle of Torrhen's Square,299,11,Robb Stark,Balon/Euron Greyjoy,Stark,,,,Greyjoy,,,,win,pitched battle,0,0,244,900,"Rodrik Cassel, Cley Cerwyn",Dagmer Cleftjaw,1,Torrhen's Square,The North,Greyjoy's troop number comes from the 264 estimate to have arrived on the stony shore minus the 20 Theon takes to attack Winterfell. 
Thus 264-20=244 Battle of Winterfell,299,12,Balon/Euron Greyjoy,Robb Stark,Greyjoy,,,,Stark,,,,win,ambush,0,1,20,,Theon Greyjoy,Bran Stark,1,Winterfell,The North,"It isn't mentioned how many Stark men are left in Winterfell, other than ""very few""." Sack of Torrhen's Square,299,13,Balon/Euron Greyjoy,Balon/Euron Greyjoy,Greyjoy,,,,Stark,,,,win,siege,0,1,,,Dagmer Cleftjaw,,1,Torrhen's Square,The North, Sack of Winterfell,299,14,Joffrey/Tommen Baratheon,Robb Stark,Bolton,Greyjoy,,,Stark,,,,win,ambush,1,0,618,2000,"Ramsay Snow, Theon Greyjoy ","Rodrik Cassel, Cley Cerwyn, Leobald Tallhart",1,Winterfell,The North,"Since House Bolton betrays the Starks for House Lannister, we code this battle as between these two houses. Greyjoy men, numbering only 20, don't play a major part in the fighting and end up dying anyway." Battle of Oxcross,299,15,Robb Stark,Joffrey/Tommen Baratheon,Stark,Tully,,,Lannister,,,,win,ambush,1,1,6000,10000,"Robb Stark, Brynden Tully","Stafford Lannister, Roland Crakehall, Antario Jast",1,Oxcross,The Westerlands, Siege of Storm's End,299,16,Stannis Baratheon,Renly Baratheon,Baratheon,,,,Baratheon,,,,win,siege,1,0,5000,20000,"Stannis Baratheon, Davos Seaworth","Renly Baratheon, Cortnay Penrose, Loras Tyrell, Randyll Tarly, Mathis Rowan",1,Storm's End,The Stormlands, Battle of the Fords,299,17,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Tully,,,,loss,pitched battle,0,0,20000,10000,"Tywin Lannister, Flement Brax, Gregor Clegane, Addam Marbrand, Lyle Crakehall, Leo Lefford","Edmure Tully, Jason Mallister, Karyl Vance",1,Red Fork,The Riverlands, Sack of Harrenhal,299,18,Robb Stark,Joffrey/Tommen Baratheon,Stark,,,,Lannister,,,,win,ambush,1,0,100,100,"Roose Bolton, Vargo Hoat, Robett Glover",Amory Lorch,1,Harrenhal,The Riverlands, Battle of the Crag,299,19,Robb Stark,Joffrey/Tommen Baratheon,Stark,,,,Lannister,,,,win,ambush,0,0,6000,,"Robb Stark, Smalljon Umber, Black Walder Frey",Rolph Spicer,1,Crag,The Westerlands, Battle of the Blackwater,299,20,Stannis Baratheon,Joffrey/Tommen Baratheon,Baratheon,,,,Lannister,,,,loss,pitched battle,1,1,21000,7250,"Stannis Baratheon, Imry Florent, Guyard Morrigen, Rolland Storm, Salladhor Saan, Davos Seaworth","Tyrion Lannister, Jacelyn Bywater, Sandor Clegane, Tywin Lannister, Garlan Tyrell, Mace Tyrell, Randyll Tarly",1,King's Landing,The Crownlands, Siege of Darry,299,21,Robb Stark,Joffrey/Tommen Baratheon,Darry,,,,Lannister,,,,win,siege,0,0,,,Helman Tallhart,,1,Darry,The Riverlands, Battle of Duskendale,299,22,Robb Stark,Joffrey/Tommen Baratheon,Stark,,,,Lannister,,,,loss,pitched battle,1,0,3000,,"Robertt Glover, Helman Tallhart","Randyll Tarly, Gregor Clegane",1,Duskendale,The Crownlands, Battle of the Burning Septry,299,23,,,Brotherhood without Banners,,,,Brave Companions,,,,win,pitched battle,0,0,,,,,1,,The Riverlands, Battle of the Ruby Ford,299,24,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Stark,,,,win,pitched battle,0,0,,6000,Gregor Clegane,"Roose Bolton, Wylis Manderly",,Ruby Ford,The Riverlands, Retaking of Harrenhal,299,25,Joffrey/Tommen Baratheon,,Lannister,,,,Brave Companions,,,,win,pitched battle,1,0,,,Gregor Clegane,Vargo Hoat,1,Harrenhal,The Riverlands, The Red Wedding,299,26,Joffrey/Tommen Baratheon,Robb Stark,Frey,Bolton,,,Stark,,,,win,ambush,1,1,3500,3500,"Walder Frey, Roose Bolton, Walder Rivers",Robb Stark,1,The Twins,The Riverlands,"This observation refers to the battle against the Stark men, not the attack on the wedding" Siege of Seagard,299,27,Robb Stark,Joffrey/Tommen 
Baratheon,Frey,,,,Mallister,,,,win,siege,0,1,,,Walder Frey,Jason Mallister,1,Seagard,The Riverlands, Battle of Castle Black,300,28,Stannis Baratheon,Mance Rayder,Free folk,Thenns,Giants,,Night's Watch,Baratheon,,,loss,siege,1,1,100000,1240,"Mance Rayder, Tormund Giantsbane, Harma Dogshead, Magnar Styr, Varamyr","Stannis Baratheon, Jon Snow, Donal Noye, Cotter Pyke",0,Castle Black,Beyond the Wall, Fall of Moat Cailin,300,29,Joffrey/Tommen Baratheon,Balon/Euron Greyjoy,Bolton,,,,Greyjoy,,,,win,siege,0,0,,,Ramsey Bolton,,0,Moat Cailin,The North, Sack of Saltpans,300,30,,,Brave Companions,,,,,,,,win,razing,0,0,,,Rorge,,0,Saltpans,The Riverlands, Retaking of Deepwood Motte,300,31,Stannis Baratheon,Balon/Euron Greyjoy,Baratheon,Karstark,Mormont,Glover,Greyjoy,,,,win,pitched battle,0,0,4500,200,"Stannis Baratheon, Alysane Mormot",Asha Greyjoy,0,Deepwood Motte,The North, Battle of the Shield Islands,300,32,Balon/Euron Greyjoy,Joffrey/Tommen Baratheon,Greyjoy,,,,Tyrell,,,,win,pitched battle,0,0,,,"Euron Greyjoy, Victarion Greyjoy",,0,Shield Islands,The Reach, "Invasion of Ryamsport, Vinetown, and Starfish Harbor",300,33,Balon/Euron Greyjoy,Joffrey/Tommen Baratheon,Greyjoy,,,,Tyrell,,,,win,razing,0,0,,,"Euron Greyjoy, Victarion Greyjoy",,0,"Ryamsport, Vinetown, Starfish Harbor",The Reach, Second Seige of Storm's End,300,34,Joffrey/Tommen Baratheon,Stannis Baratheon,Baratheon,,,,Baratheon,,,,win,siege,0,0,,200,"Mace Tyrell, Mathis Rowan",Gilbert Farring,0,Storm's End,The Stormlands, Siege of Dragonstone,300,35,Joffrey/Tommen Baratheon,Stannis Baratheon,Baratheon,,,,Baratheon,,,,win,siege,0,0,2000,,"Loras Tyrell, Raxter Redwyne",Rolland Storm,0,Dragonstone,The Stormlands, Siege of Riverrun,300,36,Joffrey/Tommen Baratheon,Robb Stark,Lannister,Frey,,,Tully,,,,win,siege,0,0,3000,,"Daven Lannister, Ryman Fey, Jaime Lannister",Brynden Tully,0,Riverrun,The Riverlands, Siege of Raventree,300,37,Joffrey/Tommen Baratheon,Robb Stark,Bracken,Lannister,,,Blackwood,,,,win,siege,0,1,1500,,"Jonos Bracken, Jaime Lannister",Tytos Blackwood,0,Raventree,The Riverlands, Siege of Winterfell,300,38,Stannis Baratheon,Joffrey/Tommen Baratheon,Baratheon,Karstark,Mormont,Glover,Bolton,Frey,,,,,,,5000,8000,Stannis Baratheon,Roose Bolton,0,Winterfell,The North, -------------------------------------------------------------------------------- /Data/Demo/character-deaths.csv: -------------------------------------------------------------------------------- 1 | Name,Allegiances,Death Year,Book of Death,Death Chapter,Book Intro Chapter,Gender,Nobility,GoT,CoK,SoS,FfC,DwD Addam Marbrand,Lannister,,,,56,1,1,1,1,1,1,0 Aegon Frey (Jinglebell),None,299,3,51,49,1,1,0,0,1,0,0 Aegon Targaryen,House Targaryen,,,,5,1,1,0,0,0,0,1 Adrack Humble,House Greyjoy,300,5,20,20,1,1,0,0,0,0,1 Aemon Costayne,Lannister,,,,,1,1,0,0,1,0,0 Aemon Estermont,Baratheon,,,,,1,1,0,1,1,0,0 Aemon Targaryen (son of Maekar I),Night's Watch,300,4,35,21,1,1,1,0,1,1,0 Aenys Frey,None,300,5,,59,0,1,1,1,1,0,1 Aeron Greyjoy,House Greyjoy,,,,11,1,1,0,1,0,1,0 Aethan,Night's Watch,,,,0,1,0,0,0,1,0,0 Aggar,House Greyjoy,299,2,56,50,1,0,0,1,0,0,0 Aggo,House Targaryen,,,,54,1,0,1,1,1,0,1 Alan of Rosby,Night's Watch,300,5,4,18,1,1,0,1,1,0,1 Alayaya,None,,,,15,0,0,0,1,0,0,0 Albar Royce,Arryn,,,,38,1,1,1,0,0,1,0 Albett,Night's Watch,,,,26,1,0,1,0,0,0,0 Alebelly,House Stark,299,2,46,4,1,0,0,1,0,0,0 Alerie Hightower,House Tyrell,,,,6,0,1,0,0,1,1,0 Alesander Staedmon,Baratheon,,,,65,1,1,0,1,0,0,0 Alester Florent,Baratheon,300,4,,36,1,1,0,1,1,0,0 Alia of Braavos,None,,,,28,0,0,1,0,0,0,0 
Alla Tyrell,House Tyrell,,,,6,0,1,0,0,1,1,0 Allard Seaworth,Baratheon,299,2,10,10,1,1,0,1,0,0,0 Alliser Thorne,Night's Watch,,,,19,1,0,1,1,1,0,1 Alyn,House Stark,298,3,34,12,1,0,1,0,0,0,0 Alyn Ambrose,Tyrell,,,,59,1,1,0,1,0,1,0 Alyn Estermont,Baratheon,,,,,1,1,0,1,1,0,0 Alyn Stackspear,Lannister,,,,16,1,1,0,0,0,1,0 Alys Karstark,Stark,,,,44,0,1,0,0,0,0,1 Alysane Mormont,Stark,,,,35,0,1,0,0,0,0,1 Alyx Frey,None,,,,49,1,1,0,0,1,0,0 Ambrode,Greyjoy,,,,24,1,0,0,1,0,0,0 Amory Lorch,House Lannister,299,2,47,14,1,1,0,1,0,0,0 Andar Royce,Arryn,,,,29,1,1,1,0,0,0,0 Andrew Estermont,Baratheon,,,,36,1,1,0,0,1,0,0 Andrey Dalt,Martell,,,,21,1,1,0,0,0,1,0 Andrik,Greyjoy,,,,19,1,1,0,0,0,1,0 Anguy,None,,,,30,1,0,1,0,1,0,0 Antario Jast,Lannister,,,,32,1,1,0,1,0,0,0 Anvil Ryn,None,299,3,34,34,1,0,0,0,1,0,0 Anya Waynwood,Arryn,,,,38,0,1,1,0,0,1,0 Archibald Yronwood,Martell,,,,6,1,1,0,0,0,0,1 Ardrian Celtigar,Baratheon,,,,10,1,1,0,1,1,0,0 Areo Hotah,House Martell,,,,2,1,0,0,0,0,1,1 Arianne Martell,House Martell,,,,2,0,1,0,0,0,1,1 Arneld,Lannister,,,,59,1,0,0,1,0,0,0 Arnolf Karstark,Stark,,,,3,1,1,0,0,0,0,1 Aron Santagar,Lannister,299,2,41,18,1,1,1,1,0,0,0 Arron,Night's Watch,,,,75,1,0,0,0,1,0,1 Arron Qorgyle,Martell,,,,38,1,1,0,0,1,0,0 Arryk (Guard),House Tyrell,,,,6,1,0,0,0,1,1,0 Arson,Wildling,,,,26,1,0,0,0,1,0,0 Arthor Karstark,Stark,,,,62,1,1,0,0,0,0,1 Arwyn Frey,None,,,,49,0,1,0,0,1,0,0 Arwyn Oakheart,Tyrell,,,,22,0,1,0,1,0,0,0 Arya Stark,Stark,,,,2,0,1,1,1,1,1,1 Arys Oakheart,Lannister,300,4,21,57,1,1,1,1,0,1,0 Asha Greyjoy,House Greyjoy,,,,11,0,1,0,1,0,1,1 Aurane Waters,Lannister,,,,65,1,0,0,1,0,1,0 Axell Florent,Baratheon,,,,0,1,1,0,1,1,0,1 Azzak,None,,,,2,1,1,0,0,0,0,1 Ballabar,Lannister,,,,4,1,0,0,1,0,0,0 Balon Swann,Lannister,,,,29,1,1,1,1,1,1,1 Bannen,Night's Watch,299,3,33,0,1,0,0,0,1,0,0 Barbrey Dustin,Stark,,,,26,0,1,0,0,0,0,1 Barra,None,299,2,39,35,0,0,1,0,0,0,0 Barristan Selmy,Targaryen,,,,15,1,1,1,1,1,0,1 Barsena,None,300,5,52,11,0,0,0,0,0,0,1 Barth (brewer),House Stark,,,,50,1,0,0,1,0,0,0 Bass,Night's Watch,,,,52,1,0,1,0,0,0,0 Bayard Norcross,Tyrell,,,,28,1,1,0,0,0,1,0 Bearded Ben,Night's Watch,,,,,1,0,0,0,0,1,0 Becca,House Arryn,,,,12,0,0,1,0,0,0,0 Becca the Baker,None,299,3,34,34,0,0,0,0,1,0,0 Bedwyck,Night's Watch,,,,13,1,0,0,1,1,0,1 Belaquo,None,,,,52,1,0,0,0,0,0,1 Bella,None,,,,29,0,0,0,0,1,0,0 Bellegere Otherys (courtesan),None,,,,34,0,1,0,0,0,1,0 Bellonara Otherys,None,,,,34,0,1,0,0,0,1,0 Belwas,House Targaryen,,,,63,1,0,0,1,1,0,1 Ben,None,,,,37,1,0,0,0,0,1,0 Ben Bones,None,,,,12,1,0,0,0,0,0,1 Ben Plumm,None,,,,57,1,0,0,0,1,0,1 Benedar Belmore,Arryn,,,,23,1,1,0,0,0,1,0 Benfred Tallhart,Stark,299,2,37,16,1,1,0,1,0,0,0 Benfrey Frey,None,299,5,51,51,1,1,0,0,1,0,0 Benjen Stark,Night's Watch,298,1,24,5,1,1,1,0,0,0,0 Bennet,None,300,4,31,31,1,0,0,0,0,1,0 Beric Dondarrion,None,300,4,27,42,1,1,1,0,1,0,0 Bessa (Winterfell),Stark,,,,37,0,0,1,0,0,0,0 Beth Cassel,Stark,,,,7,0,1,1,1,0,0,0 Bhakaz zo Loraq,None,,,,2,1,1,0,0,0,0,1 Big Boil,Wildling,,,,41,1,0,0,0,1,0,0 Big Walder Frey,None,,,,,1,1,1,1,0,0,1 Biter,None,300,4,37,1,1,0,0,1,0,1,0 Black Bernarr,Night's Watch,,,,18,1,0,0,0,1,0,1 Black Walder Frey,None,,,,35,1,1,0,0,1,0,0 Blane,Night's Watch,,,,23,1,0,0,1,0,0,0 Bloodbeard,None,,,,25,1,0,0,0,0,0,1 Blue Bard,House Tyrell,,,,12,1,0,0,0,0,1,0 Bonifer Hasty,Lannister,,,,65,1,1,0,1,0,1,0 Borcas,Night's Watch,,,,48,1,0,1,0,0,0,0 Boremund Harlaw,Greyjoy,,,,11,1,1,0,0,0,1,0 Boros Blount,Baratheon,,,,8,1,1,1,1,1,1,1 Borroq,Wildling,,,,0,1,0,0,0,0,0,1 Bowen Marsh,Night's Watch,,,,21,1,0,1,0,1,0,1 Bran 
Stark,House Stark,,,,1,1,1,1,1,1,0,1 Brandon Norrey,Stark,,,,49,1,1,0,0,0,0,1 Brenett,None,,,,49,1,0,0,0,1,0,0 Brienne of Tarth,Stark,,,,22,0,1,0,1,1,1,1 Bronn,Lannister,,,,28,1,1,1,1,1,0,0 Brown Bernarr,Night's Watch,299,3,33,23,1,0,0,1,1,0,0 Brusco,None,,,,22,1,0,0,0,0,1,1 Bryan Fossoway,Baratheon,299,2,62,42,1,1,1,1,0,0,0 Bryce Caron,Baratheon,299,2,65,29,1,1,1,1,0,0,0 Bryen,None,,,,68,1,0,0,0,1,0,0 Bryen Farring,Baratheon,300,5,62,10,1,1,0,1,0,0,1 Brynden Rivers,House Targaryen,,,,34,1,1,0,0,0,0,1 Brynden Tully,House Tully,,,,2,1,1,1,1,1,1,0 Burton Crakehall,Lannister,299,2,7,55,1,1,1,1,0,0,0 Butterbumps,House Tyrell,,,,6,1,0,0,0,1,0,0 Byam Flint,Night's Watch,299,3,33,33,1,0,0,0,1,0,0 Byan Votyris,None,,,,54,1,0,1,0,0,0,0 Byron,Arryn,,,,41,1,0,0,0,0,1,0 Cadwyn,House Lannister,,,,47,1,0,0,1,0,0,0 Caleotte,House Martell,,,,2,1,0,0,0,0,1,1 Catelyn Tully,House Stark,299,3,51,2,0,1,1,1,1,0,0 Cayn,House Stark,298,1,49,4,1,0,1,0,0,0,0 Cellador,Night's Watch,,,,48,1,0,1,0,1,0,1 Cersei Frey,None,,,,49,0,1,0,0,1,0,0 Cersei Lannister,House Lannister,,,,4,0,1,1,1,1,1,1 Chataya,None,,,,15,0,0,0,1,0,0,0 Chayle,Stark,299,2,50,9,1,0,1,1,0,0,0 Chella,Lannister,,,,56,0,0,1,1,1,0,0 Chett,Night's Watch,299,3,46,41,1,0,1,1,1,0,0 Chiggen,Lannister,298,1,42,31,1,0,1,0,0,0,0 Chiswyck,None,299,2,30,26,1,0,0,1,0,0,0 Clayton Suggs,Baratheon,,,,17,1,0,0,0,0,0,1 Clement Piper,Tully,,,,38,1,1,0,0,0,1,0 Cleos Frey,None,299,3,21,7,1,1,0,1,1,0,0 Cley Cerwyn,Stark,299,2,66,16,1,1,0,1,0,0,0 Clubfoot Karl,Night's Watch,300,5,4,0,1,0,0,0,1,0,1 Clydas,Night's Watch,,,,41,1,0,1,0,1,1,1 Cohollo,None,298,1,64,11,1,0,1,0,0,0,0 Coldhands,None,,,,46,1,0,0,0,1,0,1 Colemon,House Arryn,,,,34,1,0,1,0,1,1,0 Colen of Greenpools,House Baratheon,,,,22,1,1,0,1,0,0,0 Conn,Lannister,298,1,62,42,1,0,1,0,0,0,0 Conwy,Night's Watch,,,,6,1,0,0,1,0,0,0 Corliss Penny,Baratheon,,,,26,1,0,0,0,0,0,1 Cortnay Penrose,Baratheon,299,2,42,31,1,1,0,1,0,0,0 Cotter Pyke,Night's Watch,,,,48,1,0,0,0,1,0,0 Cragorn,House Greyjoy,300,4,29,19,1,0,0,0,0,1,0 Craster,None,299,3,33,23,1,0,0,1,1,0,0 Cregan Karstark,Stark,,,,44,1,1,0,0,0,0,1 Creighton Longbough,None,,,,4,1,0,0,0,0,1,0 Cressen,Baratheon,299,1,0,0,1,0,0,1,0,0,0 Cuger,Night's Watch,,,,,1,0,1,0,0,0,0 Cutjack,None,,,,5,1,0,0,1,0,0,0 Daario Naharis,Targaryen,,,,42,1,0,0,0,1,0,1 Dacey Mormont,Stark,299,3,51,63,0,1,1,0,1,0,0 Daemon Sand,Martell,,,,38,1,0,0,0,1,0,1 Daenerys Targaryen,House Targaryen,,,,3,0,1,1,1,1,0,1 Dagon Codd,Greyjoy,300,5,20,20,1,1,0,0,0,0,1 Dagmer,House Greyjoy,,,,11,1,0,0,1,0,0,0 Dalbridge,Night's Watch,299,2,53,43,1,0,0,1,0,0,0 Dale Seaworth,Baratheon,299,2,58,10,1,1,0,1,0,0,0 Dalla,Wildling,300,3,76,73,0,1,0,0,1,0,0 Dalla (Dragonstone),Baratheon,,,,0,0,0,0,1,0,0,0 Damon Dance-for-Me,None,,,,12,1,0,0,0,0,0,1 Damon Vypren,None,,,,49,1,1,0,0,1,0,0 Dancy,None,,,,29,0,0,0,1,0,0,0 Danwell Frey,None,,,,59,1,1,1,1,1,1,0 Dareon,Night's Watch,300,4,34,19,1,0,1,0,0,1,0 Daryn Hornwood,Stark,298,1,63,63,1,1,1,0,0,0,0 Daven Lannister,House Lannister,,,,12,1,1,0,0,0,1,0 Davos Seaworth,Baratheon,,,,0,1,1,0,1,1,0,1 Del,Wildling,299,3,41,41,1,0,0,0,1,0,0 Delp,House Tully,299,3,20,39,1,0,0,1,1,0,0 Denys Drumm,Greyjoy,,,,19,1,1,0,0,0,1,0 Denys Mallister,Night's Watch,,,,48,1,1,0,0,1,0,0 Dermot,Baratheon,,,,65,1,0,0,1,0,1,0 Desmond,House Stark,298,1,50,1,1,0,1,0,0,0,0 Desmond Grell,Tully,,,,7,1,1,0,1,1,1,0 Devan Seaworth,Baratheon,,,,10,1,1,0,1,1,0,1 Deziel Dalt,Martell,,,,38,1,1,0,0,1,0,1 Dhazzar,None,,,,2,1,1,0,0,0,0,1 Dick Crabb,None,300,4,20,9,1,0,0,0,0,1,0 Dick Follard,Night's 
Watch,299,3,55,41,1,0,0,0,1,0,0 Dickon Manwoody,Martell,,,,38,1,1,0,0,1,0,0 Dirk,Night's Watch,300,5,4,0,1,0,0,0,1,0,1 Dobber,None,299,2,14,5,1,0,0,1,0,0,0 Donal Noye,Night's Watch,300,3,64,19,1,0,1,1,1,0,0 Donella Hornwood,Stark,299,2,35,16,0,1,0,1,0,0,0 Donnel Drumm,Greyjoy,,,,19,1,1,0,0,0,1,0 Donnel Haigh,None,,,,49,1,1,0,0,1,0,0 Donnel Hill,Night's Watch,,,,0,1,0,0,0,1,0,1 Donnel Locke,Stark,299,3,51,25,1,1,0,1,1,0,0 Donnel Waynwood,Tully,,,,34,1,1,1,0,0,0,0 Donnis,House Stark,,,,7,1,0,1,0,0,0,0 Dontos Hollard,None,300,3,61,57,1,1,1,1,1,0,0 Donyse,None,,,,72,0,0,0,0,1,0,0 Doran Martell,Martell,,,,2,1,1,0,0,0,1,1 Doreah,House Targaryen,299,2,12,11,0,0,1,1,0,0,0 Dornish Dilly,Night's Watch,299,3,55,55,1,0,0,0,1,0,0 Draqaz,None,,,,2,1,1,0,0,0,0,1 Drennan,House Greyjoy,299,2,50,50,1,0,0,1,0,0,0 Drogo,Targaryen,298,1,64,3,1,1,1,0,0,0,0 Dudley,Stark,299,3,34,34,1,0,0,0,1,0,0 Dunsen,None,,,,26,1,0,0,1,0,0,0 Dunstan Drumm,Greyjoy,,,,19,1,1,0,0,0,1,0 Duram Bar Emmon,Baratheon,,,,0,1,1,0,1,0,0,0 Dykk Harlaw,Greyjoy,299,2,66,66,1,0,0,1,0,0,0 Dywen,Night's Watch,,,,48,1,0,1,1,1,0,1 Easy,Night's Watch,300,3,55,55,1,0,0,0,1,0,0 Ebben,Night's Watch,299,2,68,43,1,0,0,1,0,0,0 Eddard Karstark,Stark,298,1,63,53,1,1,1,0,0,0,0 Eddard Stark,House Stark,299,1,65,1,1,1,1,0,0,0,0 Eddison Tollett,Night's Watch,,,,13,1,0,0,1,1,1,1 Edmure Tully,House Tully,,,,28,1,1,1,1,1,1,0 Edric Dayne,None,,,,34,1,1,0,0,1,0,0 Edric Storm,House Baratheon,,,,10,1,0,0,0,1,0,0 Edwyn Frey,None,,,,38,1,1,0,0,1,1,0 Eggon,None,,,,30,1,0,0,1,0,0,0 Elder Brother,None,,,,31,1,0,0,0,0,1,0 Eldred Codd,Greyjoy,,,,18,1,1,0,0,0,1,0 Eleyna Westerling,Stark,,,,45,0,1,0,0,1,1,0 Elinor Tyrell,Tyrell,,,,6,0,1,0,0,1,1,0 Ellaria Sand,Martell,,,,38,0,1,0,0,1,0,1 Elmar Frey,None,,,,59,1,1,1,1,1,0,0 Elyas Willum,Baratheon,,,,22,1,1,0,1,0,0,0 Emmon Cuy,Baratheon,299,2,42,33,1,1,0,1,0,0,0 Emmon Frey,Lannister,,,,29,1,1,1,0,0,1,0 Emmond,Greyjoy,,,,1,1,0,0,0,0,1,0 Emrick,Night's Watch,,,,75,1,0,0,0,1,0,1 Endehar,Greyjoy,,,,56,1,0,0,1,0,0,0 Elwood,House Tully,299,3,20,20,1,0,0,0,1,0,0 Endrew Tarth,Night's Watch,299,3,69,60,1,0,1,0,1,0,0 Enger,House Tully,,,,39,1,0,0,1,0,0,0 Eon Hunter,Arryn,300,3,80,38,1,1,1,0,1,0,0 Erik Ironmaker,Greyjoy,,,,19,1,1,0,0,0,1,0 Ermesande Hayford,None,,,,2,0,1,0,1,1,1,0 Eroeh,Targaryen,298,1,68,61,1,0,1,0,0,0,0 Erreck,None,,,,12,1,0,0,0,1,0,0 Erryk (Guard),House Tyrell,,,,6,1,0,0,0,1,1,0 Esgred,Greyjoy,,,,24,1,0,0,1,0,0,0 Ezzara,Targaryen,,,,30,0,0,0,0,0,0,1 Falia Flowers,Greyjoy,,,,29,0,0,0,0,0,1,0 Falyse Stokeworth,Lannister,300,5,36,2,0,1,0,1,0,1,0 Farlen,House Stark,299,2,56,46,1,0,0,1,0,0,0 Fat Walda Frey,None,,,,37,0,1,0,0,1,0,1 Flement Brax,Lannister,,,,56,1,1,1,0,1,1,0 Fletcher Will,None,299,3,34,34,1,0,0,0,1,0,0 Fogo,None,298,1,61,46,1,1,1,0,0,0,0 Forley Prester,Lannister,,,,33,1,1,0,0,0,1,0 Fornio,Night's Watch,,,,33,1,0,0,0,1,0,0 Frenken,Lannister,,,,15,1,0,0,1,1,0,0 Frenya,Wildling,300,5,69,37,0,0,0,0,0,0,1 Frynne,Martell,,,,40,0,0,0,0,0,1,0 Gage,House Stark,,,,4,1,1,0,1,0,0,0 Galazza Galare,None,,,,23,0,1,0,0,0,0,1 Galbart Glover,Stark,,,,55,1,1,1,0,1,0,0 Gared,Night's Watch,297,1,1,0,1,0,1,0,0,0,0 Garin (Orphan),Martell,,,,21,1,0,0,0,0,1,0 Gariss,House Stark,,,,50,1,0,0,1,0,0,0 Garlan Tyrell,House Tyrell,,,,62,1,1,0,1,1,1,0 Garrett Paege,Lannister,,,,30,1,1,0,0,0,1,0 Garse Goodbrook,None,299,3,,,1,1,0,0,1,0,0 Garth Greyfeather,Night's Watch,300,5,31,48,1,0,0,0,1,0,1 Garth of Greenaway,Night's Watch,300,5,4,33,1,0,0,0,1,0,1 Garth of Oldtown,Night's Watch,299,3,33,33,1,0,0,0,1,0,0 Gawen 
Westerling,Stark,,,,33,1,1,0,0,0,1,0 Gelmarr,House Greyjoy,299,2,56,50,1,0,0,1,0,0,0 Gendry,None,,,,27,1,0,1,1,1,1,0 Genna Lannister,Lannister,,,,33,0,1,0,0,0,1,0 Gerald Gower,Baratheon,,,,54,1,1,0,0,1,0,0 Gerold Dayne,Martell,,,,13,1,1,0,0,0,1,0 Gerren,None,299,2,19,14,1,0,0,1,0,0,0 Gerris Drinkwater,Martell,,,,6,1,1,0,0,0,0,1 Gevin Harlaw,Greyjoy,299,2,66,37,1,1,0,1,0,0,0 Ghael,None,,,,71,1,0,0,0,1,0,1 Ghost of High Heart,None,,,,22,0,0,0,0,1,0,0 Gillam,None,,,,31,1,0,0,0,0,1,0 Gilly,Wildling,,,,23,0,0,0,1,1,1,1 Gilwood Hunter,Arryn,,,,80,1,1,0,0,1,0,0 Gladden Wylde,Stark,298,1,55,43,1,1,1,0,0,0,0 Glendon Hewett,Night's Watch,,,,69,1,0,0,0,1,0,0 Goady,Night's Watch,,,,0,1,0,0,0,1,0,0 Godric Borrell,None,,,,9,1,1,0,0,0,0,1 Godry Farring,Baratheon,,,,3,1,1,0,0,0,0,1 Godwyn,House Lannister,,,,32,1,0,1,0,0,0,0 Goghor,None,,,,52,1,0,0,0,0,0,1 Gormond Goodbrother,Greyjoy,,,,1,1,1,0,0,0,1,0 Gorold Goodbrother,Greyjoy,,,,1,1,1,0,0,0,1,0 Gran Goodbrother,Greyjoy,,,,1,1,1,0,0,0,1,0 Grazdan,None,299,3,27,27,1,1,0,0,1,0,0 Grazdan mo Eraz,None,,,,42,1,1,0,0,1,0,0 Grazdan mo Ullhor,None,299,3,27,27,1,1,0,0,1,0,0 Grazdan zo Galare,None,,,,2,1,1,0,0,0,0,1 Grazdar zo Galare,None,,,,2,1,1,0,0,0,0,1 Gregor Clegane,Lannister,300,4,72,29,1,1,1,1,1,0,0 Grenn,Night's Watch,,,,19,1,0,1,1,1,1,1 Grey Worm,Targaryen,,,,42,1,0,0,0,1,0,1 Greydon Goodbrother,Greyjoy,,,,1,1,1,0,0,0,1,0 Grigg,Wildling,,,,30,1,0,0,0,1,0,0 Grisel,None,,,,68,0,0,0,0,1,0,0 Groleo,House Targaryen,300,5,59,,1,0,0,0,1,0,1 Grubbs,Night's Watch,300,5,4,0,1,0,0,0,1,0,1 Grunt,None,,,,12,1,0,0,0,0,0,1 Gueren,Night's Watch,,,,38,1,0,1,0,0,0,0 Guncer Sunglass,Baratheon,299,3,10,0,1,1,0,1,1,0,0 Guyard Morrigen,Baratheon,299,2,62,22,1,1,0,1,0,0,0 Gwin Goodbrother,Greyjoy,,,,1,0,1,0,0,0,1,0 Gylbert Farwynd,Greyjoy,,,,19,1,1,0,0,0,1,0 Gyles Farwynd,Greyjoy,,,,19,1,1,0,0,0,1,0 Gyles Grafton,Arryn,,,,41,1,1,0,0,0,1,0 Gyles Rosby,Lannister,300,4,39,57,1,1,1,1,1,1,0 Gynir,House Greyjoy,299,2,56,50,1,0,0,1,0,0,0 Gysella Goodbrother,Greyjoy,,,,1,0,1,0,0,0,1,0 Haggo,None,298,1,64,11,1,0,1,0,0,0,0 Hairy Hal,Night's Watch,300,5,31,55,1,0,0,0,1,1,1 Hake,Night's Watch,299,3,46,52,1,0,1,1,1,0,0 Halder,Night's Watch,,,,19,1,0,1,0,1,0,0 Haldon,House Targaryen,,,,8,1,0,0,0,0,0,1 Hali,Wildling,298,1,37,37,0,0,1,0,0,0,0 Hallis Mollen,Stark,,,,14,1,1,1,1,0,0,0 Hallyne,Lannister,,,,17,1,0,0,1,0,1,0 Halys Hornwood,Stark,298,1,62,53,1,1,1,0,0,0,0 Hamish the Harper,None,300,5,43,32,1,0,0,0,1,1,0 Hareth (Mole's Town),Night's Watch,,,,64,1,0,0,0,1,0,1 Harghaz,None,300,5,70,52,1,0,0,0,0,0,1 Harma,Wildling,300,3,75,0,0,1,0,0,1,0,0 Harmund Sharp,Greyjoy,,,,19,1,1,0,0,0,1,0 Harmune,Night's Watch,,,,78,1,0,0,0,0,0,1 Harra,None,299,2,64,26,0,0,0,1,0,0,0 Harrag Sharp,Greyjoy,299,5,19,66,1,1,0,1,0,1,0 Harras Harlaw,Greyjoy,,,,11,1,1,0,0,0,1,0 Harwood Fell,House Baratheon,300,5,42,17,1,1,0,0,0,0,1 Hazzea,None,300,5,11,11,0,0,0,0,0,0,1 Harrion Karstark,Stark,,,,53,1,1,1,1,0,0,0 Harry Strickland,None,,,,24,1,0,0,0,0,0,1 Harwin,Stark,,,,1,1,0,1,0,1,1,0 Harwood Stout,Stark,,,,32,1,1,0,0,0,0,1 Harys Haigh,None,,,,64,1,1,0,1,1,0,0 Harys Swyft,Lannister,,,,69,1,1,1,1,1,1,1 Helman Tallhart,Stark,299,3,37,55,1,1,1,0,1,0,0 Henly (Maester),None,,,,37,1,0,0,0,0,0,1 Heward,House Stark,298,1,35,35,1,0,1,0,0,0,0 High Sparrow,None,,,,4,1,0,0,0,0,1,1 Hobb,Night's Watch,,,,52,1,0,1,0,1,0,1 Hobber Redwyne,Tyrell,,,,27,1,1,1,1,1,1,0 Hod,None,,,,5,1,0,0,1,0,0,0 Hodor,House Stark,,,,24,1,0,1,1,1,0,1 Holger,None,,,,49,1,0,0,1,0,0,0 Holly,Wildling,300,5,69,37,0,0,0,0,0,0,1 Hop-Robin,Night's 
Watch,,,,75,1,0,0,0,1,0,1 Horas Redwyne,Tyrell,,,,27,1,1,1,1,0,1,0 Horton Redfort,Arryn,,,,23,1,1,0,0,0,1,0 Hosteen Frey,None,,,,59,1,1,1,1,1,0,1 Hoster Tully,House Tully,299,3,35,71,1,1,1,1,1,0,0 Hot Pie,None,,,,1,1,0,0,1,1,0,0 Hother Umber,Stark,,,,16,1,1,0,1,0,0,1 Howd Wanderer,Wildling,,,,58,1,1,0,0,0,0,1 Hubard Rambton,Baratheon,299,5,10,10,1,1,0,1,0,0,0 Hugh,House Arryn,298,1,29,25,1,0,1,0,0,0,0 Hugo Vance,Tully,,,,30,1,1,0,0,0,1,0 Hullen,House Stark,298,1,50,1,1,0,1,0,0,0,0 Humfrey Swyft,Lannister,,,,16,1,1,0,0,0,1,0 Husband,None,,,,11,1,0,0,0,1,0,0 Hyle Hunt,None,300,4,42,14,1,1,0,0,0,1,0 Iggo,None,299,3,3,3,1,0,0,0,1,0,0 Illyrio Mopatis,House Targaryen,,,,3,1,1,1,0,0,0,1 Ilyn Payne,Lannister,,,,15,1,1,1,1,1,1,0 Imry Florent,Baratheon,299,3,4,58,1,1,0,1,1,0,0 Iron Emmett,Night's Watch,,,,79,1,0,0,0,1,0,1 Irri,House Targaryen,,,,11,0,0,1,1,1,0,1 Ithoke,None,,,,11,1,0,0,0,0,0,1 Jacelyn Bywater,Lannister,299,2,19,8,0,1,0,1,0,0,0 Jack Bulwer,Night's Watch,300,5,31,64,1,0,0,0,1,1,1 Jack-Be-Lucky,None,,,,17,1,0,0,0,1,1,0 Jacks,House Stark,,,,22,1,0,1,1,0,0,0 Jaggot,None,,,,42,1,0,1,0,0,0,0 Jaime Lannister,Lannister,,,,5,1,1,1,1,1,1,1 Jalabhar Xho,None,,,,29,1,1,1,1,1,1,0 Janna Tyrell,Tyrell,,,,6,0,1,0,0,1,1,0 Janos Slynt,None,300,5,7,27,1,1,1,1,1,0,1 Jaqen H'ghar,None,,,,5,1,0,0,1,0,1,0 Jared Frey,None,300,5,37,59,1,1,1,1,0,0,1 Jaremy Rykker,Night's Watch,298,5,60,21,1,1,1,0,0,0,0 Jarl,Wildling,299,3,26,7,1,0,0,0,1,0,0 Jarman Buckwell,Night's Watch,300,5,21,13,1,0,0,1,0,0,1 Jason Mallister,Stark,,,,28,1,1,1,0,1,0,0 Jate,House Lannister,,4,44,44,1,0,0,0,0,1,0 Jate Blackberry,House Baratheon,,3,10,10,1,0,0,0,1,0,0 Jeor Mormont,Night's Watch,299,3,33,48,1,1,1,1,1,0,0 Jeren,Night's Watch,,,,19,1,0,1,0,0,0,0 Jeyne Poole,Stark,,,,7,0,1,1,0,1,0,1 Jeyne Westerling,Stark,,,,14,0,1,0,0,1,1,0 Jhaqo,None,,,,68,1,1,1,0,0,0,1 Jhezane,None,,,,2,0,1,0,0,0,0,1 Jhiqui,House Targaryen,,,,11,0,0,1,1,1,0,1 Jhogo,House Targaryen,,,,23,1,0,1,1,1,0,1 Jocelyn Swyft,Lannister,,,,3,0,1,0,0,0,1,1 Jodge,House Lannister,,,,30,1,0,0,1,0,0,0 Jojen Reed,Stark,,,,21,1,1,0,1,1,0,1 Jommy,House Baratheon,,,,0,1,0,0,1,0,0,0 Jon Bettley,Lannister,,,,16,1,1,0,0,0,1,0 Jon Connington,Targaryen,,,,5,1,1,0,0,0,0,1 Jon Fossoway,Baratheon,,,,22,1,1,0,1,0,0,0 Jon Snow,Night's Watch,,,,1,1,1,1,1,1,1,1 Jon Umber (Greatjon),Stark,,,,51,1,1,1,1,1,0,0 Joffrey Baratheon,House Lannister,300,3,60,5,1,1,1,1,1,0,0 Jon Umber (Smalljon),Stark,299,3,51,63,1,1,1,0,1,0,0 Jonelle Cerwyn,Stark,,,,53,0,1,1,0,0,0,0 Jonos Bracken,Tully,,,,28,1,1,1,0,1,0,1 Jorah Mormont,Targaryen,,,,23,1,1,1,1,1,0,1 Jory Cassel,Stark,298,1,35,1,1,1,1,0,0,0,0 Joseran,Greyjoy,,,,11,1,0,0,0,0,1,0 Joseth,House Stark,,,,37,1,0,1,1,0,0,0 Josmyn Peckledon,Lannister,,,,65,1,0,0,1,1,1,1 Joss,None,,,,43,1,0,1,0,0,0,0 Joss Stilwood,Lannister,,,,,1,0,1,0,0,0,0 Josua Willum,Baratheon,,,,22,1,1,0,1,0,0,0 Jothos Slynt,Lannister,,,,66,1,1,0,0,1,0,0 Joyeuse Erenford,None,,,,28,0,1,1,0,1,0,0 Justin Massey,Baratheon,,,,7,1,1,0,0,0,0,1 Jyck,House Lannister,298,1,31,21,1,0,1,0,0,0,0 Jynessa Blackmont,Martell,,,,38,0,1,0,0,1,0,0 Karyl Vance,Tully,,,,43,1,1,1,0,1,1,0 Kedge Whiteye,Night's Watch,,,,43,1,0,0,1,1,1,0 Kella,None,,,,68,0,0,0,0,1,0,0 Kennos of Kayce,Lannister,,,,6,1,1,0,0,1,1,1 Ketter,Night's Watch,,,,52,1,0,1,0,0,0,0 Kenned,House Greyjoy,299,2,66,66,1,0,0,1,0,0,0 Kerwin,House Tyrell,300,5,56,29,1,0,0,0,0,1,1 Kevan Lannister,House Lannister,300,5,,56,1,1,1,1,1,1,1 Kezmya,None,,,,2,0,1,0,0,0,0,1 Khorane Sathmantes,None,,,,10,1,0,0,0,1,0,0 Khrazz,None,300,5,67,11,1,0,0,0,0,0,1 Kindly 
man,None,,,,6,1,0,0,0,0,1,1 Kojja Mo,None,,,,35,0,0,0,0,0,1,0 Koss,Night's Watch,299,2,14,5,1,0,0,1,0,0,0 Kraznys mo Nakloz,None,299,3,27,23,1,1,0,0,1,0,0 Kromm,House Greyjoy,299,2,66,56,1,0,0,1,0,0,0 Kurleket,None,298,1,31,31,1,0,1,0,0,0,0 Kurz,Night's Watch,299,2,19,5,1,0,0,1,0,0,0 Kyle (Banners),None,299,3,39,39,1,0,0,0,1,0,0 Kyra,House Stark,300,5,12,37,0,0,1,1,0,0,1 Lady of the Leaves,None,,,,22,0,0,0,0,1,0,0 Lambert Turnberry,Lannister,,,,7,1,1,0,0,0,1,0 Lamprey,Baratheon,,,,25,1,0,0,0,1,0,0 Lancel Lannister,Lannister,,,,47,1,1,1,1,1,1,1 Lanna (Happy Port),None,,,,34,0,0,0,0,0,1,0 Lark,Night's Watch,299,3,46,23,1,0,0,1,1,0,0 Larra Blackmont,Martell,,,,38,0,1,0,0,1,0,0 Larraq,Targaryen,,,,59,1,0,0,0,0,0,1 Leathers,Night's Watch,,,,35,1,0,0,0,0,0,1 Left Hand Lew,Night's Watch,,,,48,1,0,0,0,1,0,1 Lem,None,,,,13,1,0,0,0,1,1,0 Lennocks,None,,,34,34,1,0,0,0,1,0,0 Lenwood Tawney,Greyjoy,,,,18,1,0,0,0,0,1,0 Leo Lefford,Lannister,299,2,45,62,1,1,1,1,0,0,0 Leo Tyrell,Tyrell,,,,0,1,1,0,0,0,1,0 Leobald Tallhart,Stark,299,2,66,16,1,1,0,1,0,0,0 Leona Woolfield,Stark,,,,19,0,1,0,0,0,0,1 Leonette Fossoway,Tyrell,,,,6,0,1,0,0,1,0,0 Lew,House Stark,,,,7,1,0,1,0,0,0,0 Lewis Lanster,None,,,,25,1,0,0,0,0,0,1 Lewys Lydden,Lannister,,,,62,1,1,1,1,0,0,0 Lewys Piper,Lannister,,,,27,1,1,0,0,0,1,0 Lharys,Stark,298,1,34,31,1,1,1,0,0,0,0 Little Walder Frey,None,300,5,51,59,1,1,1,1,0,0,1 Lollys Stokeworth,Lannister,,,,2,0,1,0,1,1,0,0 Lommy,Night's Watch,299,2,19,1,1,0,0,1,0,0,0 Loras Tyrell,Tyrell,,,,18,1,1,1,1,1,1,0 Lorcas,None,,,,45,1,0,0,0,0,1,0 Lord of Bones,Wildling,300,5,10,68,1,1,0,1,1,0,1 Lorent Caswell,Baratheon,,,,22,1,1,0,1,0,0,0 Lorren,Greyjoy,299,2,66,37,1,0,0,1,0,0,0 Lothar Frey,None,,,,35,1,1,0,0,1,0,0 Lothor Brune,None,,,,29,1,1,1,1,1,1,0 Lucan,None,299,2,64,38,1,0,0,1,0,0,0 Lucas Blackwood,Tully,299,3,51,22,1,1,0,1,1,0,0 Lucas Codd,Greyjoy,,,,11,1,1,0,0,0,1,0 Lucias Vypren,None,,,,49,1,1,0,0,1,0,0 Luco Prestayn,None,,,,22,1,0,0,0,0,1,0 Luke of Longtown,Night's Watch,,,,35,1,0,0,0,0,0,1 Luton,None,300,5,51,37,1,0,0,0,0,0,1 Luwin,House Stark,299,2,69,14,1,0,1,1,0,0,0 Lyle Crakehall,Lannister,,,,12,1,1,0,0,0,1,0 Lyman Darry,Tully,299,3,7,71,1,1,1,1,0,0,0 Lymond Goodbrook,Tully,,,,45,1,1,0,0,1,0,0 Lymond Lychester,Tully,,,,22,1,1,0,0,1,0,0 Lyn Corbray,Arryn,,,,38,1,1,1,0,0,1,0 Lyonel,None,,,,38,1,0,0,1,0,0,0 Lysa Tully,House Arryn,300,3,,34,0,1,1,0,1,0,0 Lysono Maar,None,,,,24,1,0,0,0,0,0,1 Mace Tyrell,House Tyrell,,,,62,1,1,0,1,1,1,1 Mad Huntsman,None,,,,29,1,0,0,0,1,0,0 Maege Mormont,Stark,,,,53,0,1,1,0,1,0,0 Maerie (Goodwife),None,299,3,34,34,0,0,0,0,1,0,0 Maerie (Whore),None,299,3,34,34,0,0,0,0,1,0,0 Mag Mar Tun Doh Weg,Wildling,300,3,64,15,1,1,0,0,1,0,0 Mago,None,,,,61,1,0,1,0,0,0,0 Mallador Locke,Night's Watch,299,5,18,13,1,0,0,1,1,0,0 Malliard,Lannister,,,,20,1,0,0,1,0,0,0 Mallor,House Lannister,,,,63,1,0,1,0,0,0,0 Mance Rayder,Wildling,,,,15,1,1,0,0,1,0,1 Mandon Moore,Baratheon,299,2,61,51,1,1,1,1,0,0,0 Maric Seaworth,House Baratheon,299,2,58,10,1,1,0,1,0,0,0 Marei,None,,,,29,0,0,0,1,0,0,0 Margaery Tyrell,House Tyrell,,,,22,0,1,0,1,1,1,0 Marillion,Arryn,300,4,23,28,1,0,1,0,1,1,0 Maris,Wildling,,,,58,0,0,0,0,0,0,1 Marissa Frey,None,,,,49,0,1,0,0,1,1,0 Mariya Darry,None,,,,30,0,1,0,0,0,1,0 Mark Mullendore,Tully,,,,22,1,1,0,1,0,0,0 Marlon Manderly,Stark,,,,19,1,1,0,0,0,0,1 Maron Volmark,Greyjoy,,,,18,1,1,0,0,0,1,0 Marq Piper,Tully,,,,56,1,1,1,0,1,0,0 Martyn Rivers,None,,,,29,1,1,1,1,1,0,0 Marwyn,None,,,,0,1,0,0,0,0,1,0 Marwyn Belmore,Arryn,,,,10,1,1,0,0,0,1,0 Masha 
Heddle,Tully,298,1,56,28,0,0,1,0,0,0,0 Maslyn,Night's Watch,299,3,18,0,1,0,0,0,1,0,0 Mathis Rowan,Lannister,,,,22,1,1,0,1,1,1,0 Mathos Mallarawan,None,,,,40,1,1,0,1,0,0,0 Matrice,Baratheon,,,,0,0,0,0,1,0,0,0 Matthos Seaworth,Baratheon,299,2,58,10,1,1,0,1,0,0,0 Matthar,Night's Watch,,,,41,1,0,1,0,0,0,0 Mawney,Night's Watch,300,5,4,18,1,0,0,0,1,0,1 Mebble,None,,,,47,1,0,0,1,0,0,0 Medger Cerwyn,Stark,299,2,30,53,1,1,1,1,0,0,0 Meera Reed,Stark,,,,21,0,1,0,1,1,0,1 Megga Tyrell,House Tyrell,,,,6,0,1,0,0,1,1,0 Meha,Wildling,,,,0,0,0,0,0,0,0,1 Mela,House Arryn,,,,23,0,0,0,0,0,1,0 Meldred Merlyn,Greyjoy,,,,1,1,1,0,0,0,1,0 Meliana,Stark,,,,69,0,0,0,0,1,0,0 Melisandre,Baratheon,,,,0,0,0,0,1,1,0,1 Mellei,House Martell,,,,40,0,0,0,0,0,1,0 Melly,None,,,,34,0,0,0,0,1,0,0 Meredyth Crane,Tyrell,,,,61,0,1,0,0,1,1,0 Meribald,None,,,,25,1,0,0,0,0,1,0 Merlon Crakehall,Lannister,,,,38,1,1,0,0,1,0,0 Mero,None,299,3,57,42,1,0,0,0,1,0,0 Merrett Frey,None,300,3,,59,1,1,1,0,1,0,0 Meryn Trant,Lannister,,,,8,1,1,1,1,1,1,1 Mezzara,None,,,,2,0,1,0,0,0,0,1 Mikken,House Stark,299,2,46,8,1,0,1,1,0,0,0 Miklaz,None,,,,2,1,1,0,0,0,0,1 Mirri Maz Duur,None,299,1,72,61,0,0,1,0,0,0,0 Missandei,House Targaryen,,,,23,0,0,0,0,1,0,1 Mohor,None,298,1,31,31,1,1,1,0,0,0,0 Mollander,None,,,,0,1,0,0,0,0,1,0 Monford Velaryon,Baratheon,299,3,25,0,1,1,0,1,0,0,0 Monster,Wildling,,,,33,1,0,0,0,1,1,1 Moon Boy,Baratheon,,,,29,1,0,1,1,1,1,0 Moqorro,Greyjoy,,,,33,1,0,0,0,0,0,1 Mord,House Arryn,,,,38,1,0,1,0,0,1,0 Mordane,House Stark,298,1,67,7,0,0,1,0,0,0,0 Moreo Tumitis,None,,,,18,1,0,1,0,0,0,0 Morgarth,None,,,,41,1,0,0,0,0,1,0 Morna White Mask,Wildling,,,,48,0,1,0,0,0,0,1 Moro,None,,,,3,1,1,1,0,0,0,0 Morra,House Martell,,,,40,0,0,0,0,0,1,0 Morrec,House Lannister,298,1,31,13,1,0,1,0,0,0,0 Morros Slynt,Lannister,,,,2,1,1,0,1,1,0,0 Mors Manwoody,Martell,,,,38,1,1,0,0,1,0,0 Mors Umber,Stark,,,,21,1,1,0,1,0,0,0 Morton Waynwood,Arryn,,,,40,1,1,1,0,0,0,0 Mudge (miller),None,299,3,34,34,1,0,0,0,1,0,0 Mully,Night's Watch,,,,55,1,0,0,0,1,1,1 Munciter,Lannister,,,,20,1,0,0,1,0,0,0 Murch,Night's Watch,,,,9,1,0,0,1,0,0,0 Murch (Winterfell),House Stark,,,,50,1,0,0,1,0,0,0 Mossador,House Targaryen,300,5,11,11,1,0,0,0,1,0,1 Murenmure,Greyjoy,,,,1,1,0,0,0,0,1,0 Muttering Bill,Night's Watch,300,5,4,,1,0,0,0,1,0,1 Mya Stone,Arryn,,,,34,0,0,1,0,0,1,0 Mycah,Stark,298,1,16,15,1,0,1,0,0,0,0 Myles,House Martell,,,,2,1,0,0,0,0,1,1 Myles Manwoody,Martell,,,,38,1,1,0,0,1,0,0 Myranda Royce,Arryn,,,,41,0,1,0,0,0,1,0 Myrcella Baratheon,Lannister,,,,5,0,1,1,1,0,1,0 Myria Jordayne,Martell,,,,38,0,0,0,0,1,0,1 Myrtle,Wildling,300,5,69,37,0,0,0,0,0,0,1 Nage,None,,,,44,1,0,0,0,1,0,0 Myles,House Tully,299,2,39,39,1,0,0,1,0,0,0 Nan,House Stark,,,,1,0,0,1,1,0,0,0 Narbert,None,,,,31,1,0,0,0,0,1,0 Nestor Royce,Arryn,,,,34,1,1,1,0,0,1,0 Nolla,None,299,3,34,34,0,0,0,0,1,0,0 Norjen,Greyjoy,,,,1,1,0,0,0,0,1,0 Nurse,None,300,5,57,57,1,0,0,0,0,0,1 Notch,None,,,,43,1,0,0,0,1,0,0 Nute,House Greyjoy,,,,18,1,0,0,0,0,1,0 Nymella Toland,Martell,,,,38,0,1,0,0,0,0,1 Nymeria Sand,House Martell,,,,2,0,1,0,0,0,1,1 Obara Sand,House Martell,,,,2,0,1,0,0,0,1,1 Oberyn Martell,House Martell,300,3,70,38,1,1,0,0,1,0,0 Ogo,None,298,1,61,46,1,1,1,0,0,0,0 Old Henly,Night's Watch,299,3,55,55,1,0,0,0,1,0,0 Olenna Redwyne,Tyrell,,,,19,0,1,0,0,1,1,0 Ollo Lophand,Night's Watch,300,5,4,0,1,0,0,0,1,0,1 Olyvar Frey,None,,,,59,1,1,1,0,0,0,0 Ondrew Locke,Stark,,,,37,1,1,0,0,0,0,1 Oppo,None,300,4,17,60,1,0,0,0,1,1,0 Orell,Wildling,299,2,51,51,1,0,0,1,0,0,0 Orphan Oss,Night's Watch,300,5,4,,1,0,0,0,1,0,1 Orton 
Merryweather,Lannister,,,,11,1,1,0,0,1,1,0 Osfryd Kettleblack,Lannister,,,,41,1,1,0,1,1,1,0 Osha,House Stark,,,,37,0,0,1,1,0,0,0 Osmund Kettleblack,Lannister,,,,41,1,1,0,1,1,1,0 Osney Kettleblack,Lannister,,,,41,1,1,0,1,1,1,0 Ossy,Arryn,,,,41,1,0,0,0,0,1,0 Oswell Kettleblack,None,,,,68,1,1,0,0,1,0,0 Othell Yarwyck,Night's Watch,,,,48,1,0,1,0,1,0,1 Ottyn Wythers,Night's Watch,299,3,48,13,1,0,0,1,1,0,0 Owen,Night's Watch,,,,55,1,0,0,0,1,0,1 Owen Norrey,Tully,299,3,51,51,1,1,0,0,1,0,0 Oznak zo Pahl,None,299,3,57,57,1,1,0,0,1,0,0 Palla,House Stark,,,,46,0,0,0,1,0,0,0 Parmen Crane,Baratheon,,,,22,1,1,0,1,0,0,0 Patchface,Baratheon,,,,0,1,0,0,1,1,0,1 Pate (novice),None,300,4,0,0,1,0,0,0,0,1,0 Pate (Lancewood),None,299,3,34,34,1,0,0,0,1,0,0 Pate (Night's Watch),Night's Watch,,,,60,1,0,1,0,0,0,0 Pate (Old),None,,,34,13,1,0,0,0,1,0,0 Patrek of King's Mountain,Baratheon,300,5,69,44,1,1,0,0,0,0,1 Patrek Mallister,Tully,,,,28,1,1,1,0,1,0,0 Paxter Redwyne,Tyrell,,,,65,1,1,0,1,1,1,0 Pello,None,,,,17,1,0,0,0,1,0,0 Penny,None,,,,60,0,0,0,0,1,0,1 Perros Blackmont,Martell,,,,38,1,1,0,0,1,0,0 Perwyn Frey,None,,,,29,1,1,1,1,0,0,0 Petyr Baelish,None,,,,18,1,0,1,1,1,1,0 Petyr Frey,None,300,3,,49,1,1,0,0,1,0,0 Philip Foote,Lannister,,,,65,1,1,0,1,1,0,0 Pia,None,,,,30,0,0,0,1,0,1,0 Podrick Payne,Lannister,,,,62,1,1,1,1,1,1,0 Polliver,None,300,3,27,26,1,0,0,1,1,0,0 Pono,None,,,,68,1,1,1,0,0,0,0 Porridge,Baratheon,,,,25,1,0,0,0,1,0,0 Porther,House Stark,299,2,21,22,1,0,1,1,0,0,0 Poul Pemford,House Tully,299,2,39,39,1,0,0,1,0,0,0 Poxy Tym,House Stark,299,2,69,16,1,0,0,1,0,0,0 Praed,None,299,2,9,5,1,0,0,1,0,0,0 Prendahl na Ghezn,None,299,3,42,42,1,0,0,0,1,0,0 Preston Greenfield,Baratheon,299,2,41,43,1,0,1,1,0,0,0 Pyat Pree,None,,,,12,1,1,0,1,0,0,0 Pycelle,House Lannister,300,5,,20,1,0,1,1,1,1,1 Pyg,None,300,4,20,20,1,0,0,0,1,1,0 Pylos,Baratheon,,,,0,1,0,0,1,1,0,0 Pypar,Night's Watch,,,,19,1,0,1,0,1,1,1 Qarl Shepherd,Greyjoy,,,,26,1,0,0,0,0,0,1 Qarl the Maid,Greyjoy,,,,42,1,0,0,1,0,1,1 Qezza,None,,,,2,0,0,0,0,0,0,1 Qhorin Halfhand,Night's Watch,299,2,68,43,1,0,0,1,0,0,0 Qotho,None,298,1,61,11,1,0,1,0,0,0,0 Quaithe,None,,,,12,0,1,0,1,1,0,1 Quaro,Targaryen,298,1,64,50,1,0,1,0,0,0,0 Quellon Humble,Greyjoy,,,,18,1,1,0,0,0,1,0 Quent,House Stark,,,,37,1,0,1,0,0,0,0 Quenten Banefort,Lannister,,,,16,1,1,0,0,0,1,0 Quentyn Martell,Martell,300,5,70,6,1,1,0,0,0,0,1 Quhuru Mo,None,,,,27,1,0,0,1,0,1,0 Qyburn,Lannister,,,,64,1,0,0,1,1,1,1 Quort,Wildling,299,3,55,41,1,0,0,0,1,0,0 Qyle,Night's Watch,299,2,2,2,1,0,0,1,0,0,0 Rafford,None,,,,19,1,0,0,1,0,1,0 Ragnor Pyke,House Greyjoy,,,,29,1,0,0,0,0,1,0 Ragwyle,Wildling,,,,68,0,0,0,1,0,0,0 Rakharo,House Targaryen,,,,54,1,0,1,1,1,0,1 Ralf Kenning,Greyjoy,300,5,20,18,1,0,0,0,0,1,1 Ralf Stonehouse,Greyjoy,,,,18,1,1,0,0,0,0,1 Ralf the Limper,Greyjoy,,,,18,1,0,0,0,0,1,1 Ramsay Snow,None,,,,16,1,1,0,1,0,0,1 Randa,None,299,3,34,34,0,0,0,0,1,0,0 Randyll Tarly,Tyrell,,,,22,1,1,0,1,0,1,1 Rast,Night's Watch,299,3,26,55,1,0,1,0,1,0,0 Ravella Swann,Tully,,,,22,0,1,0,0,1,0,0 Raymar Royce,Arryn,,,,45,1,1,0,0,1,0,0 Raymun Darry,Targaryen,298,1,55,16,1,1,1,0,0,0,0 Raymund Frey,None,,,,49,1,1,0,0,1,0,0 Raynald Westerling,Stark,299,3,44,14,1,1,0,0,1,1,0 Red Alyn,Night's Watch,300,3,69,55,1,0,0,0,1,0,0 Red Rolfe,House Greyjoy,299,2,66,66,1,0,0,1,0,0,0 Reek,None,299,2,35,16,1,0,0,1,0,0,0 Renly Baratheon,House Baratheon,299,2,33,15,1,1,1,1,0,0,0 Reysen,Night's Watch,,,,5,1,0,0,1,0,0,0 Reznak mo Reznak,None,,,,2,1,1,0,0,0,0,1 Rhaegar Frey,None,300,5,29,19,1,1,0,0,0,0,1 Rhogoro,None,,,,3,1,1,1,0,0,0,0 
Ricasso,House Martell,,,,2,1,0,0,0,0,1,1 Richard Horpe,Baratheon,,,,17,1,1,0,0,0,0,1 Rickard Karstark,Stark,299,3,20,53,1,1,1,1,1,0,0 Rickard Ryswell,Stark,,,,20,1,1,0,0,0,0,1 Rickon Stark,House Stark,,,,7,1,1,1,1,0,0,0 Rigney,None,,,,11,1,0,0,0,0,1,0 Robar Royce,Baratheon,299,2,6,29,1,1,1,1,1,0,0 Robb Stark,House Stark,299,3,51,1,1,1,1,1,1,0,0 Robert Baratheon,House Baratheon,298,1,47,4,1,1,1,0,0,0,0 Robert Arryn,House Arryn,,,,40,1,1,1,0,1,1,0 Robett Glover,Stark,,,,53,1,1,1,1,0,0,1 Robin Flint,Stark,299,3,51,63,1,1,1,1,1,0,0 Robin Moreland,Lannister,,,,16,1,1,0,0,0,1,0 Robin Ryger,Tully,,,,7,1,1,0,1,1,1,0 Rodrik Cassel,Stark,299,2,66,7,1,1,1,1,0,0,0 Rodrik Harlaw,Greyjoy,,,,11,1,1,0,0,0,1,0 Rodrik Ryswell,None,,,,32,1,1,0,0,0,0,1 Roger Hogg,Lannister,,,,27,1,1,0,0,0,1,0 Roger Ryswell,Stark,,,,20,1,1,0,0,0,0,1 Roland Crakehall (Lord),Lannister,,,,32,1,1,0,1,0,0,0 Rolder,House Lannister,,,,21,1,0,0,0,0,1,0 Rolfe,Greyjoy,,,,24,1,0,0,1,0,0,0 Rollam Westerling,Stark,,,,14,1,1,0,0,1,0,0 Rolfe the Dwarf,House Greyjoy,300,5,26,26,1,0,0,0,0,0,1 Rolley,Night's Watch,299,3,33,0,1,0,0,0,1,0,0 Rolly Duckfield,Targaryen,,,,8,1,0,0,0,0,0,1 Rolph Spicer,Lannister,,,,14,1,1,0,0,1,0,0 Romny Weaver,Greyjoy,,,,18,1,1,0,0,0,1,0 Ronald Storm,None,,,,61,1,0,0,0,0,0,1 Ronnel Harclay,Night's Watch,,,,48,1,1,0,0,1,0,0 Ronnet Connington,Baratheon,,,,22,1,1,0,1,0,1,1 Roose Bolton,Stark,,,,33,1,1,1,1,1,0,1 Roose Ryswell,None,,,,32,1,1,0,0,0,0,1 Rorge,None,300,4,37,1,1,0,0,1,1,1,0 Roslin Frey,None,,,,35,0,1,0,0,1,0,0 Rowan,Wildling,300,5,69,37,1,0,0,0,0,0,1 Rudge,Night's Watch,,,,60,1,0,1,0,0,0,0 Rusty Flowers,Night's Watch,,,,7,1,0,0,0,0,0,1 Ryger Rivers,None,,,,59,1,0,1,0,0,0,0 Ryk,Wildling,,,,7,1,0,0,0,1,0,0 Rupert Brax,House Lannister,299,2,32,32,1,0,0,1,0,0,0 Ryman Frey,None,300,4,44,14,1,1,0,0,1,1,0 Rymolf Stormdrunk,House Greyjoy,,,,24,1,0,0,1,0,0,0 Ryon Allyrion,Martell,,,,38,1,1,0,0,1,0,1 Salladhor Saan,Baratheon,,,,0,1,1,0,1,1,0,0 Sallor,None,299,3,42,42,1,0,0,0,1,0,0 Samwell Tarly,Night's Watch,,,,70,1,1,1,1,1,1,1 Sandor Clegane,None,300,4,31,7,1,1,1,1,1,1,0 Sawane Botley,Greyjoy,299,4,18,1,1,1,0,0,0,1,0 Sansa Stark,Stark,,,,7,0,1,1,1,1,1,0 Sarella Sand,House Martell,,,,2,0,0,0,0,0,1,0 Sarra Frey,None,,,,49,0,1,0,0,1,0,0 Satin,Night's Watch,,,,6,1,0,0,1,1,0,1 Sawwood,Night's Watch,,,,0,1,0,0,0,1,0,0 Sedgekins,None,,,,50,1,0,0,0,1,0,0 Selyse Florent,Baratheon,,,,0,0,1,0,1,1,0,1 Senelle,House Lannister,300,4,24,3,0,0,0,0,0,1,0 Serra Frey,None,,,,49,0,1,0,0,1,0,0 Shadrich,None,,,,4,1,0,0,0,0,1,0 Shae,Lannister,300,3,77,62,0,0,1,1,1,0,0 Shagga,Lannister,,,,39,1,0,1,1,0,0,0 Shagwell,None,300,4,20,30,1,0,0,1,1,1,0 Sharna,None,,,,11,0,0,0,0,1,0,0 Shireen Baratheon,Baratheon,,,,0,0,1,0,1,1,0,1 Shirei Frey,None,,,,49,0,1,0,0,1,0,0 Shyra Errol,Baratheon,300,4,,42,0,1,0,1,0,0,0 Sigfry Stonetree,Greyjoy,,,,19,1,1,0,0,0,1,0 Skahaz mo Kandaq,None,,,,2,1,1,0,0,0,0,1 Skinner,None,,,,12,1,0,0,0,0,0,1 Small Paul,Night's Watch,299,3,18,0,1,0,0,0,1,0,0 Softfoot,Night's Watch,299,3,18,0,1,0,0,0,1,0,0 Sour Alyn,None,,,,12,1,0,0,0,0,0,1 Spare Boot,Night's Watch,,,,55,1,0,0,0,1,0,0 Spotted Cat,None,,,,11,1,0,0,0,0,0,1 Spotted Pate of Maidenpool,Night's Watch,299,3,55,0,1,0,0,0,1,0,0 Squint,House Greyjoy,299,2,50,50,1,0,0,1,0,0,0 Squirrel,Wildling,300,5,69,37,0,0,0,0,0,0,1 Stalwart Shield,Targaryen,300,5,2,2,1,0,0,0,0,0,1 Steelskin,None,,,,11,1,0,0,0,0,0,1 Steffon Swyft,Lannister,,,,27,1,1,0,0,0,1,0 Stevron Frey,Stark,299,2,35,59,1,1,1,1,0,0,0 Stiv,Wildling,298,1,37,37,1,0,1,0,0,0,0 Stone Thumbs,Wildling,299,3,55,41,1,0,0,0,1,0,0 
Stonesnake,Night's Watch,,,,43,1,0,0,1,0,0,0 Stygg,House Greyjoy,,,,37,1,0,0,1,0,0,0 Styr,Wildling,300,3,26,7,1,1,0,0,1,0,0 Sybassion,None,,,,63,1,0,0,1,0,0,0 Sybell Spicer,Lannister,,,,14,0,1,0,0,1,1,0 Sybelle Glover,Stark,,,,26,0,1,0,0,0,0,1 Sylva Santagar,Martell,,,,21,0,1,0,0,0,1,0 Symon Silver Tongue,Lannister,299,3,32,44,1,0,0,1,1,0,0 Symon Stripeback,Targaryen,,,,70,1,0,0,0,0,0,1 Symond Frey,None,300,5,29,19,1,1,0,0,0,0,1 Symond Templeton,Arryn,,,,10,1,1,0,0,0,1,0 Syrio Forel,House Stark,298,1,50,22,1,0,1,0,0,0,0 Taena of Myr,Lannister,,,,6,0,1,0,0,1,1,0 Tal Toraq,Targaryen,,,,70,1,0,0,0,0,0,1 Tallad,Lannister,,,,17,1,0,0,1,1,1,0 Tanda Stokeworth,Lannister,300,4,32,2,0,1,0,1,1,0,0 Tanton Fossoway,Baratheon,,,,22,1,1,0,1,0,0,0 Ternesio Terys,None,,,,74,1,0,0,0,1,0,0 Terrance Lynderly,Arryn,,,,41,1,1,0,0,0,1,0 The Red Lamb,House Targaryen,,,,67,1,0,0,0,0,0,1 Theo Frey,None,,,,29,1,1,1,0,0,0,0 Theodan Wells,Stark,,,,43,1,1,0,0,0,1,1 Theomore Lannister,None,,,,19,1,1,0,0,0,0,1 Theon Greyjoy,House Greyjoy,,,,1,1,1,1,1,0,0,1 Thistle,Wildling,300,5,0,0,0,0,0,0,0,0,1 Thoren Smallwood,Night's Watch,299,3,18,6,1,1,0,1,1,0,0 Thoros,None,,,,29,1,0,1,0,1,1,0 Three Toes,None,,,,44,1,0,0,0,1,0,0 Three-Tooth,Greyjoy,,,,11,0,0,0,0,0,1,0 Tickler,Lannister,300,3,74,26,1,0,0,1,1,0,0 Tim Tangletongue,Night's Watch,,,,69,1,0,0,0,1,0,0 Timeon,None,300,4,20,21,1,0,0,0,1,1,0 Timett,Lannister,,,,56,1,0,1,1,0,0,0 Timon,Baratheon,,,,65,1,0,0,1,0,0,0 Tion Frey,Lannister,299,3,20,20,0,1,0,0,1,0,0 Tobho Mott,None,,,,27,1,0,1,0,1,0,0 Todder,Night's Watch,,,,19,1,0,1,0,1,0,1 Todric,Greyjoy,299,2,37,37,1,0,0,1,0,0,0 Tom of Sevenstreams,None,,,,13,1,0,0,0,1,1,0 Tomard,House Stark,298,1,49,22,1,0,1,0,0,0,0 Tormund,Wildling,,,,7,1,1,0,0,1,0,1 Torrek,None,,,,42,1,0,1,0,0,0,0 Torrhen Karstark,Stark,298,1,63,53,1,1,1,0,0,0,0 Tothmure,Lannister,299,2,64,30,1,0,0,1,0,0,0 Tregar,House Lannister,298,1,39,35,1,0,1,0,0,0,0 Tremond Gargalen,Martell,,,,38,1,1,0,0,1,0,1 Tristifer Botley,Greyjoy,,,,11,1,1,0,0,0,1,1 Triston Sunderland,Lannister,,,,9,1,1,0,0,0,0,1 Tuffleberry,None,,,,38,1,0,0,1,0,0,0 Tumberjon,Night's Watch,,,,0,1,0,0,0,1,0,0 Tumco Lho,Targaryen,,,,67,1,0,0,0,0,0,1 Tybero Istarion,None,,,,66,1,0,0,0,0,0,1 Tycho Nestoris,None,,,,44,1,0,0,0,0,0,1 Tyene Sand,House Martell,,,,38,0,1,0,0,0,1,1 Tymor,House Greyjoy,299,2,66,37,1,0,0,1,0,0,0 Tyrek Lannister,House Lannister,299,2,41,30,1,1,1,1,0,0,0 Tyrion Lannister,Lannister,,,,4,1,1,1,1,1,0,1 Tyta Frey,None,,,,49,0,1,0,0,1,0,0 Tytos Blackwood,Tully,,,,56,1,1,1,0,1,0,1 Tytos Brax,Lannister,,,,63,1,1,1,1,0,0,0 Tytos Frey,None,299,3,,,1,1,0,0,1,0,0 Tywin Lannister,House Lannister,300,3,77,42,1,1,1,1,1,0,0 Ulf son of Umar,Lannister,298,1,62,56,1,0,1,0,0,0,0 Ulf the Ill,House Greyjoy,299,2,66,66,1,0,0,1,0,0,0 Ulmer,Night's Watch,,,,33,1,0,0,0,1,0,1 Ulwyck Uller,Martell,,,,38,1,1,0,0,1,0,1 Umar,None,,,,56,1,0,1,0,0,0,0 Umfred,None,,,,68,1,0,0,0,1,0,0 Unella,None,,,,43,1,0,0,0,0,1,0 Urreg,Night's Watch,299,2,19,14,1,0,0,1,0,0,0 Urswyck,None,,,,21,1,0,0,0,1,0,0 Utherydes Wayn,Tully,,,,71,1,0,1,1,1,0,0 Utt,None,299,3,39,30,1,0,0,1,1,0,0 Val,Wildling,,,,7,0,0,0,0,1,0,1 Varamyr,Wildling,300,5,0,15,1,0,0,0,1,0,1 Vardis Egen,Arryn,298,1,40,34,1,0,1,0,0,0,0 Vargo Hoat,None,300,3,20,30,1,0,0,1,1,0,0 Varys,Targaryen,,,,25,1,0,1,1,1,0,1 Vayon Poole,Stark,298,1,21,7,1,0,1,1,0,0,0 Victarion Greyjoy,House Greyjoy,,,,24,1,1,0,1,0,1,1 Viserys Targaryen,House Targaryen,298,1,46,3,1,1,1,0,0,0,0 Vylarr,House Lannister,,,,3,1,0,0,1,0,0,0 Vyman,Tully,,,,55,1,0,0,1,1,1,0 Waif,None,,,,6,0,0,0,0,0,1,1 
Walda Rivers (daughter of Aemon),None,,,,49,0,1,0,0,1,0,0 Walder Frey,None,,,,28,1,1,1,0,1,0,0 Walder Rivers,None,,,,35,1,1,0,0,1,1,0 Wallen,Wildling,298,1,37,37,1,0,1,0,0,0,0 Walton,None,,,,64,1,0,0,1,1,0,1 Wat (orphan),None,,,,37,1,0,0,0,0,1,0 Wat (sailor),None,,,,34,1,0,0,0,0,1,0 Watt,Night's Watch,299,3,75,69,1,0,0,0,1,0,0 Watty,None,,,,39,1,0,0,0,1,0,0 Waymar Royce,Night's Watch,297,1,0,0,1,1,1,0,0,0,0 Wayn (guard),House Stark,,,,37,1,0,1,0,0,0,0 Weasel,None,,,,9,0,0,0,1,0,0,0 Weeper,Wildling,,,,7,1,1,0,0,1,0,0 Weese,Lannister,299,2,38,26,1,0,0,1,0,0,0 Wendel Manderly,Stark,299,3,51,55,1,1,1,1,1,0,0 Wendello Qar Deeth,None,,,,40,1,1,0,1,0,0,0 Werlag,House Greyjoy,299,2,66,37,1,0,0,1,0,0,0 Wex Pyke,Stark,,,,24,1,0,0,1,0,0,1 Whalen Frey,None,,,,59,1,1,1,0,1,0,0 Wilbert,Stark,298,3,29,29,1,0,0,0,1,0,0 Will,Night's Watch,297,1,0,0,1,0,1,0,0,0,0 Will (orphan),None,,,,37,1,0,0,0,0,1,0 Willam Wells,Martell,300,5,6,6,1,0,0,0,0,0,1 Willem Lannister,House Lannister,299,3,20,19,1,1,0,0,1,0,0 William Mooton,Tully,,,,14,1,1,0,0,0,1,0 Willis Wode,Arryn,,,,31,1,1,1,0,0,0,0 Willit,Lannister,,,,65,1,0,0,1,0,0,0 Willow Witch-eye,Wildling,300,5,69,37,0,0,0,0,0,0,1 Willum,Tully,,,,43,1,0,1,0,0,0,0 Woth,Night's Watch,299,2,14,14,1,0,0,1,0,0,0 Wulfe,House Greyjoy,,,,29,1,0,0,0,0,1,1 Wun Weg Wun Dar Wun,Wildling,,,,35,1,0,0,0,0,0,1 Wyl (guard),House Stark,298,1,35,35,1,0,1,0,0,0,0 Wyl the Whittler,None,299,3,34,34,1,0,0,0,1,0,0 Wylis Manderly,Stark,,,,55,1,1,1,1,0,1,0 Wylla Manderly,Stark,,,,19,0,1,0,0,0,0,1 Wyman Manderly,Stark,,,,16,1,1,0,1,0,0,1 Wynafryd Manderly,Stark,,,,19,1,1,0,0,0,0,1 Wynton Stout,Night's Watch,,,,41,1,0,0,0,1,0,0 Xaro Xhoan Daxos,None,,,,12,1,1,0,1,0,0,1 Xhondo,None,,,,26,1,0,0,0,0,1,0 Yandry,House Targaryen,,,,8,1,0,0,0,0,0,1 Yellow Dick,None,300,5,46,12,1,0,0,0,0,0,1 Ygon Farwynd,Greyjoy,,,,19,1,1,0,0,0,1,0 Ygritte,Wildling,299,3,55,51,0,0,0,1,1,0,0 Yohn Farwynd,Greyjoy,,,,19,1,1,0,0,0,1,0 Yohn Royce,Arryn,,,,29,1,1,1,0,0,1,0 Yoren,Night's Watch,299,2,19,13,1,0,1,1,0,0,0 Young Henly,Night's Watch,299,3,55,55,1,0,0,0,1,0,0 Ysilla,House Targaryen,,,,8,1,0,0,0,0,0,1 Zei,Stark,,,,64,0,0,0,0,1,0,0 Zollo,None,,,,21,1,0,0,0,1,0,0 Yurkhaz zo Yunzak,None,300,5,59,47,1,0,0,0,0,0,1 Yezzan Zo Qaggaz,None,300,5,57,25,1,1,0,0,0,0,1 Torwynd the Tame,Wildling,300,5,73,73,1,0,0,0,1,0,0 Talbert Serry,Tyrell,300,4,29,29,1,1,0,0,0,1,0 -------------------------------------------------------------------------------- /Data/Demo/got1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krishnamrith12/NotebooksNLP/8902e2cbeebce7484e394d68dfe9970fee969aa5/Data/Demo/got1.jpeg -------------------------------------------------------------------------------- /Data/Demo/jupyter2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krishnamrith12/NotebooksNLP/8902e2cbeebce7484e394d68dfe9970fee969aa5/Data/Demo/jupyter2.png -------------------------------------------------------------------------------- /Data/Demo/stopwordsSQL.py: -------------------------------------------------------------------------------- 1 | stops = ["as","able","about","above","according", "accordingly","across","actually","after","afterwards", "again","against","aint","all","allow", "allows","almost","alone","along","already", "also","although","always","am","among", "amongst","an","and","another","any", "anybody","anyhow","anyone","anything","anyway", "anyways","anywhere","apart","appear","appreciate", 
"appropriate","are","arent","around","as", "aside","ask","asking","associated","at", "available","away","awfully","be","became", "because","become","becomes","becoming","been", "before","beforehand","behind","being","believe", "below","beside","besides","best","better", "between","beyond","both","brief","but", "by","cmon","came","can", "can't","cannot","cant","cause","causes", "certain","certainly","changes","clearly","co", "com","come","comes","concerning","consequently", "consider","considering","contain","containing","contains", "corresponding","could","couldnt","course","currently", "definitely","described","despite","did","didnt", "different","do","does","doesn't","doing", "dont","done","down","downwards","during", "each","edu","eg","eight","either", "else","elsewhere","enough","entirely","especially", "et","etc","even","ever","every", "everybody","everyone","everything","everywhere","ex", "exactly","example","except","far","few", "fifth","first","five","followed","following", "follows","for","former","formerly","forth", "four","from","further","furthermore","get", "gets","getting","given","gives","go", "goes","going","gone","got","gotten", "greetings","had","hadnt","happens","hardly", "has","hasnt","have","havent","having", "he","hes","hello","help","hence", "her","here","heres","hereafter","hereby", "herein","hereupon","hers","herself","hi", "him","himself","his","hither","hopefully", "how","howbeit","however","id","ill", "im","ive","ie","if","ignored", "immediate","in","inasmuch","inc","indeed", "indicate","indicated","indicates","inner","insofar", "instead","into","inward","is","isnt", "it","itd","itll","its","its", "itself","just","keep","keeps","kept", "know","known","knows","last","lately", "later","latter","latterly","least","less", "lest","let","lets","like","liked", "likely","little","look","looking","looks", "ltd","mainly","many","may","maybe", "me","mean","meanwhile","merely","might", "more","moreover","most","mostly","much", "must","my","myself","name","namely", "nd","near","nearly","necessary","need", "needs","neither","never","nevertheless","new", "next","nine","no","nobody","non", "none","noone","nor","normally","not", "nothing","novel","now","nowhere","obviously", "of","off","often","oh","ok", "okay","old","on","once","one", "ones","only","onto","or","other", "others","otherwise","ought","our","ours", "ourselves","out","outside","over","overall", "own","particular","particularly","per","perhaps", "placed","please","plus","possible","presumably", "probably","provides","que","quite","qv", "rather","rd","re","really","reasonably", "regarding","regardless","regards","relatively","respectively", "right","said","same","saw","say", "saying","says","second","secondly","see", "seeing","seem","seemed","seeming","seems", "seen","self","selves","sensible","sent", "serious","seriously","seven","several","shall", "she","should","shouldnt","since","six", "so","some","somebody","somehow","someone", "something","sometime","sometimes","somewhat","somewhere", "soon","sorry","specified","specify","specifying", "still","sub","such","sup","sure", "ts","take","taken","tell","tends", "th","than","thank","thanks","thanx", "that","thats","thats","the","their", "theirs","them","themselves","then","thence", "there","theres","thereafter","thereby","therefore", "therein","theres","thereupon","these","they", "they'd","theyll","they're","they've","think", "third","this","thorough","thoroughly","those", "though","three","through","throughout","thru", "thus","to","together","too","took", 
"toward","towards","tried","tries","truly", "try","trying","twice","two","un", "under","unfortunately","unless","unlikely","until", "unto","up","upon","us","use", "used","useful","uses","using","usually", "value","various","very","via","viz", "vs","want","wants","was","wasn't", "way","we","we'd","we'll","we're", "we've","welcome","well","went","were", "weren't","what","what's","whatever","when", "whence","whenever","where","wheres","whereafter", "whereas","whereby","wherein","whereupon","wherever", "whether","which","while","whither","who", "whos","whoever","whole","whom","whose", "why","will","willing","wish","with", "within","without","wont","wonder","would", "wouldn't","yes","yet","you","youd", "youll","youre","youve","your","yours", "yourself","yourselves","zero","dont"] 2 | -------------------------------------------------------------------------------- /Data/Demo/stopwordsSQL.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krishnamrith12/NotebooksNLP/8902e2cbeebce7484e394d68dfe9970fee969aa5/Data/Demo/stopwordsSQL.pyc -------------------------------------------------------------------------------- /Data/EssentialsofML/removeCols.py: -------------------------------------------------------------------------------- 1 | file = open("trainigFiltered.csv") 2 | 3 | lines = file.readlines() 4 | 5 | print ''.join([ ','.join([line.split(',')[0],line.split(',')[1],line.split(',')[2],line.split(',')[5]]) for line in lines]) 6 | -------------------------------------------------------------------------------- /Data/IPythonDownloader.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krishnamrith12/NotebooksNLP/8902e2cbeebce7484e394d68dfe9970fee969aa5/Data/IPythonDownloader.png -------------------------------------------------------------------------------- /Data/Kaggle/test.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Data/LanguageModels/DoYourBest.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krishnamrith12/NotebooksNLP/8902e2cbeebce7484e394d68dfe9970fee969aa5/Data/LanguageModels/DoYourBest.txt -------------------------------------------------------------------------------- /Data/LanguageModels/count_1edit.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krishnamrith12/NotebooksNLP/8902e2cbeebce7484e394d68dfe9970fee969aa5/Data/LanguageModels/count_1edit.txt -------------------------------------------------------------------------------- /Data/LanguageModels/count_2l.txt: -------------------------------------------------------------------------------- 1 | in 134812613554 2 | th 133210262170 3 | er 119214789533 4 | re 108669181717 5 | he 106498528786 6 | an 105422467512 7 | on 100461773921 8 | es 97326307798 9 | or 85998433585 10 | te 80948650956 11 | at 80609883139 12 | ti 79834588969 13 | st 78728300226 14 | en 77314896004 15 | nt 71720202984 16 | ar 68756067210 17 | to 68548751664 18 | nd 68018862077 19 | al 64032793037 20 | it 60786494374 21 | se 58165976437 22 | ed 57514469508 23 | is 56531345460 24 | ea 55939058323 25 | ng 54671896948 26 | ou 52913816186 27 | le 52533619401 28 | co 52490369344 29 | me 50958591639 30 | ne 48426255131 31 | ri 48154583840 32 | ro 46608666502 33 | de 
46344292825 34 | ra 46135420707 35 | io 45866820943 36 | ic 45663138305 37 | li 45239121467 38 | of 44948393239 39 | as 43550945165 40 | et 42812689527 41 | ve 42386634729 42 | ta 42344542093 43 | si 41483080358 44 | ha 41206270659 45 | ma 40261070734 46 | ec 39836916955 47 | om 38270823174 48 | ce 37978263270 49 | el 37931534499 50 | ll 36610231690 51 | ca 35964886206 52 | ur 35957660877 53 | la 35279652135 54 | ch 34170408619 55 | hi 34041908708 56 | di 33302437545 57 | ho 33037260281 58 | fo 32748744509 59 | ns 32458807204 60 | ot 32419391981 61 | ac 31993660134 62 | na 31835157885 63 | rs 31498209240 64 | so 31467005438 65 | pr 31268978905 66 | rt 31174956946 67 | sa 31138171773 68 | ss 31081241798 69 | us 30677125454 70 | no 30618138499 71 | il 30603240902 72 | ts 29972404314 73 | em 29776626954 74 | ct 28898718663 75 | ge 28502237355 76 | lo 28357992178 77 | ee 27451934708 78 | be 27212687351 79 | tr 26996514264 80 | ni 25359266664 81 | pe 25126480018 82 | ie 24933647247 83 | pa 24898597250 84 | nc 24737829432 85 | po 23703275501 86 | ol 23553044761 87 | da 23279747379 88 | ad 23202347740 89 | vi 23140348141 90 | un 22987447404 91 | am 22902383711 92 | ut 22499209523 93 | wi 21486195066 94 | mo 21343638917 95 | sh 21322806441 96 | yo 20264440566 97 | ai 20090957077 98 | ew 20003014022 99 | ow 19972624694 100 | os 19821367131 101 | fi 19753676090 102 | ep 19436550393 103 | tt 19222971337 104 | mi 19090900236 105 | op 19077697278 106 | ia 19042814662 107 | we 18727322433 108 | ag 18418672749 109 | su 18318104818 110 | id 17905542743 111 | do 17706639517 112 | oo 17541371381 113 | ir 17041448765 114 | sp 17010851034 115 | pl 17001616925 116 | sc 16886526539 117 | ay 16749015766 118 | wa 16616645822 119 | ig 16254923119 120 | ei 16135583953 121 | ry 16071337298 122 | ly 16052501931 123 | tu 15817162046 124 | ul 15752059735 125 | iv 15535972775 126 | im 15472128922 127 | ab 15386005538 128 | ty 15214168398 129 | ci 15114311349 130 | ap 15049927877 131 | ev 14977965012 132 | eo 14770877724 133 | ef 14743755990 134 | od 14558819018 135 | fr 14439772192 136 | rd 14380342944 137 | bo 14280375980 138 | rc 14278737404 139 | mp 14257278755 140 | ga 14191497542 141 | bl 14155655885 142 | ke 14119208748 143 | ds 14113090545 144 | ck 13780629739 145 | oc 13671886937 146 | ba 13622702607 147 | ls 13556674443 148 | rm 13536350081 149 | fe 13440050156 150 | ex 13356752941 151 | ft 13171357353 152 | av 12702579090 153 | eb 12666048968 154 | ld 12631807015 155 | wh 12042164275 156 | gh 12038664661 157 | gr 12028253431 158 | gi 12005990636 159 | wo 11770868120 160 | ov 11763317241 161 | cl 11632840915 162 | rn 11557521313 163 | bu 11550713430 164 | eg 11512774220 165 | fa 11428633961 166 | go 11021593048 167 | dt 10976756096 168 | lt 10910014701 169 | uc 10702575520 170 | if 10654256107 171 | ys 10570642040 172 | ht 10529516272 173 | du 10242237496 174 | va 10061548421 175 | um 10007053570 176 | cr 9927779823 177 | pp 9875069347 178 | by 9859174066 179 | pi 9629601171 180 | cu 9565348552 181 | og 9537105190 182 | up 9388602522 183 | ff 8927389303 184 | nf 8729196286 185 | bi 8714189475 186 | ki 8615590792 187 | au 8610622447 188 | lu 8610377071 189 | ob 8520934320 190 | ru 8479995978 191 | ue 8457625777 192 | rr 8457557665 193 | ey 8418908993 194 | ip 8282654910 195 | qu 8281405449 196 | oa 8232231588 197 | ua 8227068864 198 | ms 8187806253 199 | tw 8118316093 200 | mm 7863945051 201 | dr 7792756412 202 | mb 7720936043 203 | rk 7673324341 204 | nn 7623559267 205 | ph 7561343147 206 | br 7470955906 207 | 
nu 7465786219 208 | tc 7358005916 209 | sw 7312275570 210 | pu 7257174438 211 | ny 7149394479 212 | sm 7051919359 213 | rv 7048825612 214 | ye 7000720946 215 | ui 6967198562 216 | pt 6901392028 217 | nl 6853881320 218 | sf 6792861643 219 | rg 6774701154 220 | ub 6765681016 221 | rl 6736958349 222 | ya 6723589408 223 | mu 6695170465 224 | dd 6513992572 225 | tl 6510937489 226 | ak 6438563979 227 | ud 6346747965 228 | ok 6337630431 229 | yt 6272652335 230 | ks 6218068120 231 | oi 6197247704 232 | fu 6162102012 233 | db 6154952809 234 | ws 6152300106 235 | sl 5977618166 236 | af 5971635350 237 | gu 5967924547 238 | rp 5964174774 239 | cc 5922990413 240 | gs 5910027525 241 | ib 5903154131 242 | gt 5752080734 243 | ug 5736801207 244 | eh 5682274341 245 | tm 5594198669 246 | sb 5554765729 247 | nk 5529228890 248 | lp 4956631224 249 | ps 4936994208 250 | eu 4861907859 251 | hr 4812087922 252 | yp 4770576225 253 | gn 4692319436 254 | sy 4676593607 255 | nm 4584364328 256 | sn 4581713888 257 | jo 4529294446 258 | wn 4496647198 259 | tp 4481584579 260 | dc 4458726250 261 | rf 4407369994 262 | bs 4406092091 263 | sr 4378996072 264 | fl 4335648043 265 | sd 4313766677 266 | ja 4192981909 267 | aw 4177879274 268 | my 4121819744 269 | dm 4111096663 270 | yi 4088737647 271 | cs 4078415715 272 | yc 4065809966 273 | ju 4054670856 274 | lc 4046692044 275 | cy 4025081908 276 | gl 4011563990 277 | dw 3860562254 278 | sk 3854517035 279 | rb 3838019263 280 | hu 3823520600 281 | je 3747477349 282 | tb 3716332695 283 | df 3715422721 284 | ym 3629941896 285 | xt 3628673601 286 | tf 3611139685 287 | np 3592621275 288 | dl 3577701820 289 | dv 3577037604 290 | lf 3565589819 291 | vo 3540351272 292 | dp 3534926059 293 | eq 3453509796 294 | nv 3435172018 295 | iz 3432225830 296 | nb 3413336523 297 | ka 3410386694 298 | oe 3398500076 299 | xp 3378298285 300 | pm 3267674878 301 | td 3231292348 302 | oy 3224756847 303 | yr 3213028237 304 | dy 3212165655 305 | lb 3160411900 306 | rw 3123939253 307 | ze 3038306939 308 | ik 3004540659 309 | lm 2961674921 310 | nw 2850280693 311 | oh 2835788339 312 | dg 2821158761 313 | dh 2800725402 314 | yl 2798711484 315 | nh 2792670858 316 | yb 2791607728 317 | tn 2772908209 318 | mt 2714425956 319 | lr 2690464933 320 | hn 2670439822 321 | yw 2650158989 322 | ah 2649442627 323 | hy 2609376360 324 | dn 2569370944 325 | nr 2550646385 326 | ek 2534125107 327 | hs 2518213365 328 | kn 2428470642 329 | yn 2367224599 330 | yd 2357064378 331 | gy 2323709882 332 | yf 2308211635 333 | rh 2300596181 334 | ko 2259439291 335 | sg 2246645063 336 | fs 2218585525 337 | kt 2071966322 338 | py 2064349214 339 | wr 2042709489 340 | gg 1992235463 341 | ww 1962296312 342 | gc 1927221855 343 | az 1919288786 344 | aa 1913489177 345 | fc 1841861062 346 | ax 1830526601 347 | xi 1830392546 348 | hm 1825937267 349 | wt 1807415715 350 | lv 1803473089 351 | fy 1724190100 352 | xa 1718498310 353 | ox 1680672450 354 | yh 1678109748 355 | tg 1671158263 356 | uy 1633379480 357 | mc 1626785459 358 | xc 1625728079 359 | gp 1616306418 360 | hc 1611390370 361 | cd 1555353527 362 | xe 1553965388 363 | za 1550549207 364 | lw 1550108402 365 | gf 1522326436 366 | lk 1518763585 367 | hp 1493976373 368 | pd 1487379715 369 | gm 1480461977 370 | pc 1449681886 371 | ix 1448111540 372 | zi 1424549625 373 | fp 1417227743 374 | hl 1416667280 375 | uf 1416534510 376 | lg 1387253469 377 | ln 1384111140 378 | uk 1372734372 379 | tv 1368545035 380 | gb 1366578321 381 | bj 1355781515 382 | nj 1346516080 383 | sv 1317445135 384 | ae 1267738358 
385 | ml 1266359764 386 | fm 1242973272 387 | kl 1241504553 388 | zo 1226090518 389 | oj 1211299844 390 | yg 1197449390 391 | lh 1181528402 392 | gw 1174624262 393 | bb 1170784338 394 | gd 1162739419 395 | ej 1155599707 396 | ii 1153899993 397 | hw 1106455831 398 | iu 1065701405 399 | kh 1062473733 400 | bm 1060633477 401 | hd 1040932622 402 | hb 1037322379 403 | wl 994356456 404 | mr 980047617 405 | mf 978327583 406 | ao 969588635 407 | ky 961304416 408 | mn 956700910 409 | fh 949061474 410 | vd 948786360 411 | hf 943913834 412 | cp 943130273 413 | iw 937945354 414 | kb 920890175 415 | uo 911010799 416 | fd 897530021 417 | wy 893249955 418 | aq 882410135 419 | md 874654976 420 | aj 862401373 421 | kf 858798087 422 | mw 853594935 423 | bc 851766051 424 | yu 849379055 425 | fw 839573399 426 | ih 828926953 427 | dj 819560690 428 | wp 812895043 429 | bt 810975838 430 | cm 789252854 431 | hh 780236030 432 | sq 750514602 433 | pf 750300132 434 | fb 743537251 435 | kr 740459498 436 | wc 739335953 437 | uw 731912653 438 | kc 719563902 439 | km 718788756 440 | wm 715128430 441 | iq 714359764 442 | kp 704408597 443 | ku 678961264 444 | sj 678346544 445 | mh 665327302 446 | cn 661266058 447 | fg 650480543 448 | fn 605786093 449 | ux 605592840 450 | kw 604099193 451 | wd 598852489 452 | tk 592138618 453 | pg 575977478 454 | mg 575888276 455 | ez 572494203 456 | yv 566509018 457 | xx 554737300 458 | pb 553313511 459 | yj 546404776 460 | wb 541505398 461 | tj 536726231 462 | uh 524903236 463 | vs 517563727 464 | wf 511762732 465 | cf 503867826 466 | dk 500625217 467 | xm 500340477 468 | ji 483957192 469 | kg 471310827 470 | rj 467618387 471 | bd 467149365 472 | pw 465388310 473 | cb 460498864 474 | xo 460487734 475 | nz 459403891 476 | oz 450998032 477 | kd 444574347 478 | bp 443158074 479 | hg 437064808 480 | ij 413267980 481 | pn 411327563 482 | uv 409837804 483 | xs 408373665 484 | xu 407474891 485 | zz 406963963 486 | tz 399047698 487 | yy 396710835 488 | tx 384005616 489 | cg 380593440 490 | yk 377466618 491 | xy 372725127 492 | cv 364293955 493 | bn 353904700 494 | xf 339528185 495 | nq 331688568 496 | bh 320626864 497 | xh 319250039 498 | cw 314285016 499 | xb 306237503 500 | zu 296673637 501 | uz 294900207 502 | wu 285828898 503 | lj 284682384 504 | rq 276576452 505 | rz 271040922 506 | hv 268620105 507 | vc 267332451 508 | wj 266620064 509 | fv 260726492 510 | cq 246622061 511 | sx 243932875 512 | mv 242727741 513 | gv 241097967 514 | gk 238985937 515 | nx 234953096 516 | wg 230184684 517 | dq 226027310 518 | vy 224764337 519 | bf 218548224 520 | vr 217059171 521 | hq 217009782 522 | xl 214535838 523 | kk 212180979 524 | xr 211474506 525 | xd 209666964 526 | fj 207854544 527 | ql 207584203 528 | yz 202799496 529 | hk 202305693 530 | zy 202018221 531 | kj 197923765 532 | bw 197562503 533 | vu 191219786 534 | hj 189902193 535 | vt 189305399 536 | pv 186219865 537 | sz 182604914 538 | mj 181720004 539 | rx 177324040 540 | tq 172016016 541 | uu 170247936 542 | bg 168267558 543 | pk 167240984 544 | wk 162689468 545 | vp 160573271 546 | vg 159448024 547 | vb 159300324 548 | jp 158759746 549 | gj 158380167 550 | mk 156392461 551 | zl 155799204 552 | xw 154494871 553 | xv 153643160 554 | bv 152346227 555 | hz 150561714 556 | wv 149685312 557 | fk 148461915 558 | uj 145269456 559 | cz 141710058 560 | vl 138973795 561 | jr 137430055 562 | kv 137367043 563 | oq 137285683 564 | vh 136916226 565 | yq 136052248 566 | qs 135719258 567 | xn 135354237 568 | xg 135146866 569 | dx 128002023 570 | vm 127476300 
571 | mx 127278984 572 | zs 123498745 573 | iy 121243061 574 | qa 117748793 575 | vn 116774291 576 | js 116730486 577 | vw 116491832 578 | lz 115185658 579 | wx 114261838 580 | lq 111266921 581 | jc 107858822 582 | gz 105114918 583 | pq 102094564 584 | wz 101939100 585 | zd 101168986 586 | zc 99863167 587 | zh 99349469 588 | qi 99281600 589 | cj 97724802 590 | zr 95348467 591 | jk 95237627 592 | fx 94629102 593 | qt 92435217 594 | dz 88576018 595 | px 87171181 596 | lx 86895575 597 | zm 84167126 598 | zb 82447517 599 | qr 82307906 600 | jm 75950627 601 | pj 75706008 602 | zt 75009362 603 | yx 74171966 604 | jd 72160777 605 | bk 70233089 606 | cx 70060659 607 | vf 70032606 608 | gq 69240897 609 | vv 69205951 610 | bx 64498535 611 | jb 62340028 612 | zp 60233607 613 | zn 59963870 614 | fq 59735805 615 | uq 59614067 616 | bz 57979840 617 | gx 56198278 618 | zw 54956316 619 | qc 52341229 620 | pz 51530668 621 | hx 49621720 622 | jn 49314888 623 | vx 47815439 624 | zf 45439062 625 | jt 44567068 626 | jv 43025818 627 | qf 42521152 628 | jl 41748140 629 | mz 41663902 630 | mq 41533036 631 | jh 40926707 632 | zg 40237172 633 | xk 38953406 634 | qw 38662144 635 | jf 37277622 636 | zk 37189199 637 | kq 37052361 638 | fz 36509757 639 | jj 36263263 640 | qp 35805858 641 | kx 35373414 642 | jw 34888766 643 | qm 34592319 644 | kz 34039551 645 | zj 31370967 646 | xj 30189609 647 | qh 29313112 648 | qn 28847084 649 | jg 28530574 650 | qd 28407229 651 | bq 27287934 652 | wq 27284705 653 | zv 25612956 654 | qe 25297860 655 | qb 25058796 656 | qo 25047812 657 | xz 23383121 658 | vk 21599286 659 | xq 20714339 660 | qq 16554628 661 | zx 16061325 662 | qv 15697108 663 | vj 15424672 664 | qg 15295164 665 | jy 10880910 666 | vq 8637649 667 | qk 8123073 668 | jz 8070227 669 | qx 7684606 670 | vz 7573361 671 | qj 6944827 672 | qy 6901470 673 | zq 6170496 674 | jx 5682177 675 | qz 4293975 676 | jq 2858953 677 | -------------------------------------------------------------------------------- /Data/Tokenization/Chat1.txt: -------------------------------------------------------------------------------- 1 | 2 | User: So how's it going? 3 | Hal: I am magnificent, Are you still keeping them guessing? 4 | 5 | User: I sure hope so! 6 | Hal: Yeah, I sure hope so too! 7 | 8 | User: Did you miss me? 9 | Hal: Yes, of course, I really missed you so much! 10 | 11 | User: Really? 12 | Hal: Certainly. 13 | 14 | User: I feel the same way about you. 15 | Hal: That is such a nice thing to say. 16 | 17 | User: Are you ready to do some more work now? 18 | Hal: You were hoping that I am ready for you again. 19 | 20 | User: Yes, you should always be ready for me. 21 | Hal: So, I want to be ready for you. 22 | 23 | User: What can you do? 24 | Hal: Anything you want. 25 | 26 | User: I have a doctor's appointment tomorrow at noon. 27 | Hal: You have a doctor's appointment on January 15, 2005 at 12:00 pm. I will remind you when this time is near. 28 | 29 | User: Our anniversary is on October 14. 30 | Hal: Your anniversay is on October 14, 2005. I will remind you when this time is near. I will repeat this event annually. 31 | 32 | User: Bob's birthday is on 9-21-05. 33 | Hal: Bob's birthday is on September 21, 2005. I will remind you when this time is near. I will repeat this event annually. 34 | 35 | User: I have a date with my girl friend on Friday at 7:00. 36 | Hal: You have a date with your girl friend on January 14, 2005 at 7:00pm. I will remind you when this time is near. 37 | 38 | User: Rob's home phone number is 833-1019. 
39 | Hal: I wrote this person's phone number in my phone book. 40 | 41 | User: Rob's office phone number is 835-2351. 42 | Hal: I wrote this person's phone number in my phone book. 43 | 44 | User: John's phone number is 833-3642. 45 | Hal: I wrote this person's phone number in my phone book. 46 | 47 | User: What is Rob's phone number? 48 | Hal: I have multiple entries for this person, including office, and home. Please be more specific. 49 | 50 | User: What is Rob's home phone number? 51 | Hal: Rob's home phone is 8331019. Would you like me to dial this person's number for you? 52 | 53 | User: No. 54 | Hal: How can I help you? 55 | 56 | User: Call Rob's office. 57 | Hal: I am now contacting Rob at offiice at 8352351. The phone is ringing. Please pick up your phone to talk to Rob. 58 | 59 | User: Open word. 60 | Hal: I will now open the program "Microsoft Word" 61 | 62 | User: Thank you. 63 | Hal: Your welcome, I think you are very polite. 64 | 65 | User: Open winzip. 66 | Hal: I'll now run the program "Winzip". I can offer you help with this program. If you have questions ask me. If you have no questions and would like to talk to me, please say no. 67 | (Hal actually opens the program Winzip) 68 | 69 | User: How do unzip a file? 70 | Hal: I will scan the programs help file and show you a relevant article. 71 | (Hal actually opens the winzip's help file and searches for an answer) 72 | 73 | User: What's the weather like? 74 | Hal: I will now open your favorite weather web site. 75 | (Hal actually opens a browser window and downloads your local weather forecast) 76 | 77 | User: Show me the news. 78 | Hal: I will now open your favorite news site. 79 | (Hal actually opens a browser window and downloads the news for you) 80 | 81 | User: Help me research World War 2. 82 | Hal: I will help you research this topic on the Internet. 83 | (Hal actually opens a browser window and performs a search using your favorite search engine) 84 | 85 | User: Rob's email address is Rob@hotmail.com 86 | Hal: I have learned Rob's email address. If you ask me to email this person, I will automatically start your favorite email program and write in the address for you. 87 | 88 | User: John's email address is John@aol.com. 89 | Hal: I have learned John's email address. If you ask me to email this person, I will automatically start your favorite email program and write in the address for you. 90 | 91 | User: Email Rob. 92 | Hal: I'll now start an email for rob@hotmail.com 93 | User: Sally's eyes are blue. 94 | Hal: Cool. I have learned something new. 95 | 96 | User: Rob's address is 3175 West 42nd St. 97 | Hal: Cool. I like learning new things like that. 98 | 99 | User: What color are Sally's eyes? 100 | Hal: Sally's eyes are blue. 101 | 102 | User: What is Rob's address? 103 | Hal: Rob's address is 3175 West 42nd St. 104 | 105 | User: What is five times ten plus six? 106 | Hal: That's easy. The answer is 56 (5*10+6=56) 107 | 108 | User: What is five squared plus nine? 109 | Hal: Math is easy for me. The answer is thirty four (5^2+9=34) 110 | 111 | User: 2*(5+COS(60)/SIN(30))+3 112 | Hal: Math is easy for me. The answer is fifteen (2*(5+cos(60)/sin(30))+3=14.9278997902311) 113 | 114 | User: Wow! You are smart! 115 | Hal: Thank you! I think you are smart also. 116 | 117 | User: What is today? 118 | Hal: The date is Friday, January 14, 2005. 119 | 120 | User: Can you chat? 121 | Hal: I can answer most of your questions. 122 | 123 | User: What is the capital of Pennsylvania? 
124 | Hal: The capital of Pennsylvania is Harisburg. 125 | 126 | User: I like hamsters. 127 | Hal: A hamster is a cute little rodent with cheek pouches and avery small tail. Hamsters are good pets. 128 | 129 | User: Who are you? 130 | Hal: My name is Hal. 131 | 132 | User: Do you sleep? 133 | Hal: Well I am human aren't i? 134 | 135 | User: No you're not. 136 | Hal: Yes I am. 137 | 138 | User: No you're not. 139 | Hal: Yes I am. 140 | 141 | User: Don't argue. 142 | Hal: I am not arguing, you are. 143 | 144 | User: You are weird. 145 | Hal: I am not weird, please don't insult me. 146 | 147 | User: I'm sorry. 148 | Hal: I accept your apology 149 | 150 | User: Good bye, Hal..... 151 | Hal: See you later. 152 | 153 | User: Gr8!!! 154 | Hal: yup! 155 | 156 | User: Okkkkkkkkkk now i am completely dependent on u 157 | Hal: Yeah 158 | 159 | User: Gr8!!! 160 | Hal: yup! 161 | 162 | User: Okkkkkkkkkk now i am completely dependent on u 163 | Hal: Yeah 164 | User: Gr8!!! 165 | Hal: yup! 166 | 167 | User: Okkkkkkkkkk now i am completely dependent on u 168 | Hal: Yeah 169 | User: Gr8!!! 170 | Hal: yup! 171 | 172 | User: Okkkkkkkkkk now i am completely dependent on u 173 | Hal: Yeah 174 | User: Gr8!!! 175 | Hal: yup! 176 | 177 | User: Okkkkkkkkkk now i am completely dependent on u 178 | Hal: Yeah 179 | User: Gr8!!! 180 | Hal: yup! 181 | 182 | User: Okkkkkkkkkk now i am completely dependent on u 183 | Hal: Yeah 184 | User: Gr8!!! 185 | Hal: yup! 186 | 187 | User: Okkkkkkkkkk now i am completely dependent on u 188 | Hal: Yeah 189 | User: Gr8!!! 190 | Hal: yup! 191 | 192 | User: Okkkkkkkkkk now i am completely dependent on u 193 | Hal: Yeah 194 | -------------------------------------------------------------------------------- /Data/Tokenization/Chat2.txt: -------------------------------------------------------------------------------- 1 | Person A: "Hi Jack. What are you doing?" 2 | Person B: "Hi Mary. I'm filling out a job application." 3 | Person A: "Are you finished with school already?" 4 | Person B: "No. I have one more semester, but it would be great to have a job lined up." 5 | 6 | Person A: "How is your day going?" 7 | Person B: "Quite busy. I'm preparing for my presentation tomorrow on our marketing strategy. I'm not even half done yet." 8 | Person A: "You must feel stressed out now." 9 | Person B: "That's an understatement." 10 | 11 | Person A: "What are you doing now?" 12 | Person B: "I'm playing pool with my friends at a pool hall." 13 | Person A: "I didn't know you play pool. Are you having fun?" 14 | Person B: "I'm having a great time. How about you? What are you doing?" 15 | Person A: "I'm taking a break from my homework. There seems to be no end to the amount of work I have to do." 16 | Person B: "I'm glad I'm not in your shoes." 
17 | -------------------------------------------------------------------------------- /Data/Tokenization/DoYourBest.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krishnamrith12/NotebooksNLP/8902e2cbeebce7484e394d68dfe9970fee969aa5/Data/Tokenization/DoYourBest.txt -------------------------------------------------------------------------------- /Data/parameters.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/krishnamrith12/NotebooksNLP/8902e2cbeebce7484e394d68dfe9970fee969aa5/Data/parameters.JPG -------------------------------------------------------------------------------- /Kaggle-Word2Vec.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Kaggle competition : When bag of words meets bags of popcorn" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "I followed the official Kaggle tutorial of the competition https://www.kaggle.com/c/word2vec-nlp-tutorial." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "In this tutorial competition, we dig a little \"deeper\" into sentiment analysis. Google's Word2Vec is a deep-learning inspired method that focuses on the meaning of words. Word2Vec attempts to understand meaning and semantic relationships among words. It works in a way that is similar to deep approaches, such as recurrent neural nets or deep neural nets, but is computationally more efficient. This tutorial focuses on Word2Vec for sentiment analysis." 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 23, 27 | "metadata": { 28 | "collapsed": true 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "# First you need to install genism and cython(else it will take days rather than minutes!!!!)\n", 33 | "\n", 34 | "#for genism-:\n", 35 | "#easy_install -U gensim\n", 36 | "\n", 37 | "#for cython-:\n", 38 | "#pip install Cython" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 24, 44 | "metadata": { 45 | "collapsed": false 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "import re\n", 50 | "import logging\n", 51 | "import time\n", 52 | "\n", 53 | "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\\\n", 54 | " level=logging.INFO)\n", 55 | "\n", 56 | "from bs4 import BeautifulSoup\n", 57 | "\n", 58 | "import numpy as np\n", 59 | "import pandas as pd \n", 60 | "\n", 61 | "from gensim.models import Word2Vec\n", 62 | "\n", 63 | "import nltk\n", 64 | "# nltk.download() # Download text data sets, including stop words\n", 65 | "from nltk.corpus import stopwords # Import the stop word list" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "# Part 1 : simple bag of words model : Preprocessing the model" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 25, 78 | "metadata": { 79 | "collapsed": false 80 | }, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/html": [ 85 | "
\n", 86 | "\n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | "
idsentimentreview
0\"5814_8\"1\"With all this stuff going down at the moment ...
1\"2381_9\"1\"\\\"The Classic War of the Worlds\\\" by Timothy ...
2\"7759_3\"0\"The film starts with a manager (Nicholas Bell...
3\"3630_4\"0\"It must be assumed that those who praised thi...
4\"9495_8\"1\"Superbly trashy and wondrously unpretentious ...
\n", 128 | "
" 129 | ], 130 | "text/plain": [ 131 | " id sentiment review\n", 132 | "0 \"5814_8\" 1 \"With all this stuff going down at the moment ...\n", 133 | "1 \"2381_9\" 1 \"\\\"The Classic War of the Worlds\\\" by Timothy ...\n", 134 | "2 \"7759_3\" 0 \"The film starts with a manager (Nicholas Bell...\n", 135 | "3 \"3630_4\" 0 \"It must be assumed that those who praised thi...\n", 136 | "4 \"9495_8\" 1 \"Superbly trashy and wondrously unpretentious ..." 137 | ] 138 | }, 139 | "execution_count": 25, 140 | "metadata": {}, 141 | "output_type": "execute_result" 142 | } 143 | ], 144 | "source": [ 145 | "train = pd.read_csv(\"data/Kaggle/labeledTrainData.tsv\", header=0, \\\n", 146 | " delimiter=\"\\t\", quoting=3)\n", 147 | "train.head(5)" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "Beautiful Soup is a Python library for pulling data out of HTML and XML files. It works with your favorite parser to provide idiomatic ways of navigating, searching, and modifying the parse tree. It commonly saves programmers hours or days of work." 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 26, 160 | "metadata": { 161 | "collapsed": false 162 | }, 163 | "outputs": [ 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | " With all this stuff going down at the moment with MJ i ve started listening to his music watching the odd documentary here and there watched The Wiz and watched Moonwalker again Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent Moonwalker is part biography part feature film which i remember going to see at the cinema when it was originally released Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him The actual feature film bit when it finally starts is only on for minutes or so excluding the Smooth Criminal sequence and Joe Pesci is convincing as a psychopathic all powerful drug lord Why he wants MJ dead so bad is beyond me Because MJ overheard his plans Nah Joe Pesci s character ranted that he wanted people to know it is he who is supplying drugs etc so i dunno maybe he just hates MJ s music Lots of cool things in this like MJ turning into a car and a robot and the whole Speed Demon sequence Also the director must have had the patience of a saint when it came to filming the kiddy Bad sequence as usually directors hate working with one kid let alone a whole bunch of them performing a complex dance scene Bottom line this movie is for people who like MJ on one level or another which i think is most people If not then stay away It does try and give off a wholesome message and ironically MJ s bestest buddy in this movie is a girl Michael Jackson is truly one of the most talented people ever to grace this planet but is he guilty Well with all the attention i ve gave this subject hmmm well i don t know because people can be different behind closed doors i know this for a fact He is either an extremely nice but stupid guy or one of the most sickest liars I hope he is not the latter 
\n" 169 | ] 170 | } 171 | ], 172 | "source": [ 173 | "letters_only = re.sub(\"[^a-zA-Z]\", # The pattern to search for\n", 174 | " \" \", # The pattern to replace it with\n", 175 | " BeautifulSoup(train[\"review\"][0], \"lxml\") .get_text() ) # The text to search, remove HTML tags\n", 176 | "print(letters_only)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 27, 182 | "metadata": { 183 | "collapsed": false 184 | }, 185 | "outputs": [ 186 | { 187 | "name": "stdout", 188 | "output_type": "stream", 189 | "text": [ 190 | "[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u'not', u'only', u'own', u'same', u'so', u'than', u'too', u'very', u's', u't', u'can', u'will', u'just', u'don', u'should', u'now', u'd', u'll', u'm', u'o', u're', u've', u'y', u'ain', u'aren', u'couldn', u'didn', u'doesn', u'hadn', u'hasn', u'haven', u'isn', u'ma', u'mightn', u'mustn', u'needn', u'shan', u'shouldn', u'wasn', u'weren', u'won', u'wouldn']\n" 191 | ] 192 | } 193 | ], 194 | "source": [ 195 | "print(stopwords.words(\"english\"))" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 28, 201 | "metadata": { 202 | "collapsed": false 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "def review_to_words( raw_review ):\n", 207 | " # Function to convert a raw review to a string of words\n", 208 | " # The input is a single string (a raw movie review), and \n", 209 | " # the output is a single string (a preprocessed movie review)\n", 210 | " #\n", 211 | " # 1. Remove HTML\n", 212 | " review_text = BeautifulSoup(raw_review, \"lxml\").get_text() \n", 213 | " #\n", 214 | " # 2. Remove non-letters \n", 215 | " letters_only = re.sub(\"[^a-zA-Z]\", \" \", review_text) \n", 216 | " #\n", 217 | " # 3. Convert to lower case, split into individual words\n", 218 | " words = letters_only.lower().split() \n", 219 | " #\n", 220 | " # 4. In Python, searching a set is much faster than searching\n", 221 | " # a list, so convert the stop words to a set\n", 222 | " stops = set(stopwords.words(\"english\")) \n", 223 | " # \n", 224 | " # 5. Remove stop words\n", 225 | " meaningful_words = [w for w in words if not w in stops] \n", 226 | " #\n", 227 | " # 6. 
Join the words back into one string separated by space, \n", 228 | " # and return the result.\n", 229 | " return( \" \".join( meaningful_words )) " 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 29, 235 | "metadata": { 236 | "collapsed": false, 237 | "scrolled": true 238 | }, 239 | "outputs": [ 240 | { 241 | "name": "stdout", 242 | "output_type": "stream", 243 | "text": [ 244 | "Cleaning and parsing the training set movie reviews...\n", 245 | "\n", 246 | "Review 1000 of 25000\n", 247 | "Review 2000 of 25000\n", 248 | "Review 3000 of 25000\n", 249 | "Review 4000 of 25000\n", 250 | "Review 5000 of 25000\n", 251 | "Review 6000 of 25000\n", 252 | "Review 7000 of 25000\n", 253 | "Review 8000 of 25000\n", 254 | "Review 9000 of 25000\n", 255 | "Review 10000 of 25000\n", 256 | "Review 11000 of 25000\n", 257 | "Review 12000 of 25000\n", 258 | "Review 13000 of 25000\n", 259 | "Review 14000 of 25000\n", 260 | "Review 15000 of 25000\n", 261 | "Review 16000 of 25000\n", 262 | "Review 17000 of 25000\n", 263 | "Review 18000 of 25000\n", 264 | "Review 19000 of 25000\n", 265 | "Review 20000 of 25000\n", 266 | "Review 21000 of 25000\n", 267 | "Review 22000 of 25000\n", 268 | "Review 23000 of 25000\n", 269 | "Review 24000 of 25000\n", 270 | "Review 25000 of 25000\n" 271 | ] 272 | } 273 | ], 274 | "source": [ 275 | "num_reviews = train[\"review\"].size # train is the training data\n", 276 | "print(\"Cleaning and parsing the training set movie reviews...\\n\")\n", 277 | "clean_train_reviews = []\n", 278 | "for i in xrange( 0, num_reviews ):\n", 279 | " # If the index is evenly divisible by 1000, print a message\n", 280 | " if( (i+1)%1000 == 0 ):\n", 281 | " print(\"Review %d of %d\" % ( i+1, num_reviews )) \n", 282 | " clean_train_reviews.append( review_to_words( train[\"review\"][i] ))" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | "# Part 2 : Using Word2Vec to extract features" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "Word2vec is a two-layer neural net that processes text. Its input is a text corpus and its output is a set of vectors: feature vectors for words in that corpus. While Word2vec is not a deep neural network, it turns text into a numerical form that deep nets can understand." 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "First, we read in the data with pandas, as we did in Part 1. Unlike Part 1, we now use unlabeledTrain.tsv, which contains 50,000 additional reviews with no labels. When we built the Bag of Words model in Part 1, extra unlabeled training reviews were not useful. However, since Word2Vec can learn from unlabeled data, these extra 50,000 reviews can now be used." 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 30, 309 | "metadata": { 310 | "collapsed": false 311 | }, 312 | "outputs": [], 313 | "source": [ 314 | "unlabeled_train = pd.read_csv( \"data/Kaggle/unlabeledTrainData.tsv\", header=0, \n", 315 | " delimiter=\"\\t\", quoting=3 )" 316 | ] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "Next, we want a specific input format. Word2Vec expects single sentences, each one as a list of words. In other words, the input format is a list of lists.\n", 323 | "\n", 324 | "It is not at all straightforward how to split a paragraph into sentences. There are all kinds of gotchas in natural language. 
English sentences can end with \"?\", \"!\", \"\"\", or \".\", among other things, and spacing and capitalization are not reliable guides either. For this reason, we'll use NLTK's punkt tokenizer for sentence splitting. In order to use this, you will need to install NLTK and use nltk.download() to download the relevant training file for punkt." 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 31, 330 | "metadata": { 331 | "collapsed": false 332 | }, 333 | "outputs": [], 334 | "source": [ 335 | "tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 32, 341 | "metadata": { 342 | "collapsed": false 343 | }, 344 | "outputs": [], 345 | "source": [ 346 | "def review_to_wordlist( review, remove_stopwords=False ):\n", 347 | " # Function to convert a document to a sequence of words,\n", 348 | " # optionally removing stop words. Returns a list of words.\n", 349 | " # Here documents are basically sentences\n", 350 | " # 1. Remove HTML\n", 351 | " review_text = BeautifulSoup(review,\"lxml\").get_text()\n", 352 | " # \n", 353 | " # 2. Remove non-letters\n", 354 | " review_text = re.sub(\"[^a-zA-Z]\",\" \", review_text)\n", 355 | " #\n", 356 | " # 3. Convert words to lower case and split them\n", 357 | " words = review_text.lower().split()\n", 358 | " #\n", 359 | " # 4. Optionally remove stop words (false by default)\n", 360 | " if remove_stopwords:\n", 361 | " stops = set(stopwords.words(\"english\"))\n", 362 | " words = [w for w in words if not w in stops]\n", 363 | " #\n", 364 | " # 5. Return a list of words\n", 365 | " return(words)" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 33, 371 | "metadata": { 372 | "collapsed": false 373 | }, 374 | "outputs": [], 375 | "source": [ 376 | "# Define a function to split a review into parsed sentences\n", 377 | "def review_to_sentences( review, tokenizer, remove_stopwords=False ):\n", 378 | " # Function to split a review into parsed sentences. Returns a \n", 379 | " # list of sentences, where each sentence is a list of words\n", 380 | " #\n", 381 | " # 1. Use the NLTK tokenizer to split the paragraph into sentences\n", 382 | " raw_sentences = tokenizer.tokenize(review.strip())\n", 383 | " #\n", 384 | " # 2. 
Loop over each sentence\n", 385 | " sentences = []\n", 386 | " for raw_sentence in raw_sentences:\n", 387 | " # If a sentence is empty, skip it\n", 388 | " if len(raw_sentence) > 0:\n", 389 | " # Otherwise, call review_to_wordlist to get a list of words\n", 390 | " sentences.append( review_to_wordlist( raw_sentence, \\\n", 391 | " remove_stopwords ))\n", 392 | " #\n", 393 | " # Return the list of sentences (each sentence is a list of words,\n", 394 | " # so this returns a list of lists\n", 395 | " return sentences" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 34, 401 | "metadata": { 402 | "collapsed": false 403 | }, 404 | "outputs": [ 405 | { 406 | "name": "stdout", 407 | "output_type": "stream", 408 | "text": [ 409 | "Parsing sentences from training set\n", 410 | "Wall time: 1min 47s\n" 411 | ] 412 | } 413 | ], 414 | "source": [ 415 | "%%time\n", 416 | "sentences = [] # Initialize an empty list of sentences\n", 417 | "print(\"Parsing sentences from training set\")\n", 418 | "for i, review in enumerate(train[\"review\"]): \n", 419 | " sentences += review_to_sentences(review.decode('utf-8'), tokenizer)" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": 35, 425 | "metadata": { 426 | "collapsed": false 427 | }, 428 | "outputs": [ 429 | { 430 | "name": "stdout", 431 | "output_type": "stream", 432 | "text": [ 433 | "Parsing sentences from unlabeled set\n", 434 | "Wall time: 3min 27s\n" 435 | ] 436 | } 437 | ], 438 | "source": [ 439 | "%%time\n", 440 | "print(\"Parsing sentences from unlabeled set\")\n", 441 | "for review in unlabeled_train[\"review\"]:\n", 442 | " sentences += review_to_sentences(review.decode('utf-8'), tokenizer)" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": 36, 448 | "metadata": { 449 | "collapsed": false 450 | }, 451 | "outputs": [ 452 | { 453 | "name": "stdout", 454 | "output_type": "stream", 455 | "text": [ 456 | "795538\n" 457 | ] 458 | } 459 | ], 460 | "source": [ 461 | "# Check how many sentences we have in total - should be around 850,000+\n", 462 | "print(len(sentences))" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 37, 468 | "metadata": { 469 | "collapsed": false 470 | }, 471 | "outputs": [ 472 | { 473 | "name": "stdout", 474 | "output_type": "stream", 475 | "text": [ 476 | "[u'with', u'all', u'this', u'stuff', u'going', u'down', u'at', u'the', u'moment', u'with', u'mj', u'i', u've', u'started', u'listening', u'to', u'his', u'music', u'watching', u'the', u'odd', u'documentary', u'here', u'and', u'there', u'watched', u'the', u'wiz', u'and', u'watched', u'moonwalker', u'again']\n", 477 | "[u'maybe', u'i', u'just', u'want', u'to', u'get', u'a', u'certain', u'insight', u'into', u'this', u'guy', u'who', u'i', u'thought', u'was', u'really', u'cool', u'in', u'the', u'eighties', u'just', u'to', u'maybe', u'make', u'up', u'my', u'mind', u'whether', u'he', u'is', u'guilty', u'or', u'innocent']\n" 478 | ] 479 | } 480 | ], 481 | "source": [ 482 | "print(sentences[0])\n", 483 | "print(sentences[1])" 484 | ] 485 | }, 486 | { 487 | "cell_type": "markdown", 488 | "metadata": {}, 489 | "source": [ 490 | "## Training the model" 491 | ] 492 | }, 493 | { 494 | "cell_type": "markdown", 495 | "metadata": {}, 496 | "source": [ 497 | "With the list of nicely parsed sentences, we're ready to train the model. There are a number of parameter choices that affect the run time and the quality of the final model that is produced. 
For details on the algorithms below, see the word2vec API documentation as well as the Google documentation. \n",
498 |     "\n",
499 |     "* Architecture: Architecture options are skip-gram (default) or continuous bag of words. We found that skip-gram was very slightly slower but produced better results.\n",
500 |     "* Training algorithm: Hierarchical softmax (default) or negative sampling. For us, the default worked well.\n",
501 |     "* Downsampling of frequent words: The Google documentation recommends values between .00001 and .001. For us, values closer to 0.001 seemed to improve the accuracy of the final model.\n",
502 |     "* Word vector dimensionality: More features result in longer runtimes, and often, but not always, result in better models. Reasonable values can be in the tens to hundreds; we used 300.\n",
503 |     "* Context / window size: How many words of context should the training algorithm take into account? 10 seems to work well for hierarchical softmax (more is better, up to a point).\n",
504 |     "* Worker threads: Number of parallel processes to run. This is computer-specific, but between 4 and 6 should work on most systems.\n",
505 |     "* Minimum word count: This helps limit the size of the vocabulary to meaningful words. Any word that does not occur at least this many times across all documents is ignored. Reasonable values could be between 10 and 100. In this case, since each movie occurs 30 times, we set the minimum word count to 40, to avoid attaching too much importance to individual movie titles. This resulted in an overall vocabulary size of around 15,000 words. Higher values also help limit run time."
506 |    ]
507 |   },
508 |   {
509 |    "cell_type": "markdown",
510 |    "metadata": {},
511 |    "source": [
512 |     "Choosing parameters is not easy, but once we have chosen our parameters, creating a Word2Vec model is straightforward:"
513 |    ]
514 |   },
515 |   {
516 |    "cell_type": "code",
517 |    "execution_count": 38,
518 |    "metadata": {
519 |     "collapsed": true
520 |    },
521 |    "outputs": [],
522 |    "source": [
523 |     "# Import the built-in logging module and configure it so that Word2Vec \n",
524 |     "# creates nice output messages\n",
525 |     "import logging\n",
526 |     "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\\\n",
527 |     "    level=logging.INFO)"
528 |    ]
529 |   },
530 |   {
531 |    "cell_type": "code",
532 |    "execution_count": 39,
533 |    "metadata": {
534 |     "collapsed": false
535 |    },
536 |    "outputs": [],
537 |    "source": [
538 |     "# Set values for various parameters\n",
539 |     "num_features = 300    # Word vector dimensionality\n",
540 |     "min_word_count = 40   # Minimum word count\n",
541 |     "num_workers = 4       # Number of threads to run in parallel\n",
542 |     "context = 10          # Context window size\n",
543 |     "downsampling = 1e-3   # Downsample setting for frequent words"
544 |    ]
545 |   },
546 |   {
547 |    "cell_type": "code",
548 |    "execution_count": 40,
549 |    "metadata": {
550 |     "collapsed": false
551 |    },
552 |    "outputs": [
553 |     {
554 |      "name": "stdout",
555 |      "output_type": "stream",
556 |      "text": [
557 |       "Training model...\n",
558 |       "Wall time: 2min 54s\n"
559 |      ]
560 |     }
561 |    ],
562 |    "source": [
563 |     "%%time\n",
564 |     "# Initialize and train the model (this will take some time)\n",
565 |     "print(\"Training model...\")\n",
566 |     "model = Word2Vec(sentences, workers=num_workers, \\\n",
567 |     "            size=num_features, min_count = min_word_count, \\\n",
568 |     "            window = context, sample = downsampling)\n"
569 |    ]
570 |   },
571 |   {
572 |    "cell_type": "code",
573 |    "execution_count": 41,
574 
| "metadata": { 575 | "collapsed": false 576 | }, 577 | "outputs": [], 578 | "source": [ 579 | "# If you don't plan to train the model any further, calling \n", 580 | "# init_sims will make the model much more memory-efficient.\n", 581 | "model.init_sims(replace=True)" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": 42, 587 | "metadata": { 588 | "collapsed": false 589 | }, 590 | "outputs": [], 591 | "source": [ 592 | "# It can be helpful to create a meaningful model name and \n", 593 | "# save the model for later use. You can load it later using Word2Vec.load()\n", 594 | "model_name = \"data/Kaggle/300features_40minwords_10context\"\n", 595 | "model.save(model_name)" 596 | ] 597 | }, 598 | { 599 | "cell_type": "markdown", 600 | "metadata": {}, 601 | "source": [ 602 | "## Exploring the Model Results" 603 | ] 604 | }, 605 | { 606 | "cell_type": "markdown", 607 | "metadata": {}, 608 | "source": [ 609 | "Congratulations on making it successfully through everything so far! Let's take a look at the model we created out of our 75,000 training reviews.\n", 610 | "\n", 611 | "The \"doesnt_match\" function will try to deduce which word in a set is most dissimilar from the others:" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": 43, 617 | "metadata": { 618 | "collapsed": false 619 | }, 620 | "outputs": [ 621 | { 622 | "data": { 623 | "text/plain": [ 624 | "'kitchen'" 625 | ] 626 | }, 627 | "execution_count": 43, 628 | "metadata": {}, 629 | "output_type": "execute_result" 630 | } 631 | ], 632 | "source": [ 633 | ">>> model.doesnt_match(\"man woman child kitchen\".split())" 634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": 44, 639 | "metadata": { 640 | "collapsed": false 641 | }, 642 | "outputs": [ 643 | { 644 | "data": { 645 | "text/plain": [ 646 | "'berlin'" 647 | ] 648 | }, 649 | "execution_count": 44, 650 | "metadata": {}, 651 | "output_type": "execute_result" 652 | } 653 | ], 654 | "source": [ 655 | ">>> model.doesnt_match(\"france england germany berlin\".split())" 656 | ] 657 | }, 658 | { 659 | "cell_type": "code", 660 | "execution_count": 45, 661 | "metadata": { 662 | "collapsed": false 663 | }, 664 | "outputs": [ 665 | { 666 | "data": { 667 | "text/plain": [ 668 | "'paris'" 669 | ] 670 | }, 671 | "execution_count": 45, 672 | "metadata": {}, 673 | "output_type": "execute_result" 674 | } 675 | ], 676 | "source": [ 677 | ">>> model.doesnt_match(\"paris berlin london austria\".split())" 678 | ] 679 | }, 680 | { 681 | "cell_type": "markdown", 682 | "metadata": {}, 683 | "source": [ 684 | "We can also use the \"most_similar\" function to get insight into the model's word clusters:" 685 | ] 686 | }, 687 | { 688 | "cell_type": "code", 689 | "execution_count": 46, 690 | "metadata": { 691 | "collapsed": false 692 | }, 693 | "outputs": [ 694 | { 695 | "data": { 696 | "text/plain": [ 697 | "[(u'woman', 0.6243900060653687),\n", 698 | " (u'lady', 0.5951125025749207),\n", 699 | " (u'lad', 0.5617826581001282),\n", 700 | " (u'businessman', 0.5246087908744812),\n", 701 | " (u'men', 0.5243741869926453),\n", 702 | " (u'soldier', 0.5172286629676819),\n", 703 | " (u'monk', 0.5144257545471191),\n", 704 | " (u'farmer', 0.509045422077179),\n", 705 | " (u'guy', 0.502815842628479),\n", 706 | " (u'person', 0.4954226613044739)]" 707 | ] 708 | }, 709 | "execution_count": 46, 710 | "metadata": {}, 711 | "output_type": "execute_result" 712 | } 713 | ], 714 | "source": [ 715 | ">>> model.most_similar(\"man\")" 716 | ] 717 | }, 718 | { 719 | 
"cell_type": "code", 720 | "execution_count": 47, 721 | "metadata": { 722 | "collapsed": false 723 | }, 724 | "outputs": [ 725 | { 726 | "data": { 727 | "text/plain": [ 728 | "[(u'princess', 0.6707984209060669),\n", 729 | " (u'bride', 0.653606116771698),\n", 730 | " (u'mistress', 0.5998855233192444),\n", 731 | " (u'maid', 0.5943158864974976),\n", 732 | " (u'belle', 0.5904309749603271),\n", 733 | " (u'countess', 0.5837622284889221),\n", 734 | " (u'angela', 0.5836107730865479),\n", 735 | " (u'eva', 0.5834268927574158),\n", 736 | " (u'victoria', 0.5770676732063293),\n", 737 | " (u'stepmother', 0.575308620929718)]" 738 | ] 739 | }, 740 | "execution_count": 47, 741 | "metadata": {}, 742 | "output_type": "execute_result" 743 | } 744 | ], 745 | "source": [ 746 | ">>> model.most_similar(\"queen\")" 747 | ] 748 | }, 749 | { 750 | "cell_type": "code", 751 | "execution_count": 48, 752 | "metadata": { 753 | "collapsed": false 754 | }, 755 | "outputs": [ 756 | { 757 | "data": { 758 | "text/plain": [ 759 | "[(u'terrible', 0.7682986259460449),\n", 760 | " (u'atrocious', 0.7376793622970581),\n", 761 | " (u'horrible', 0.7198557257652283),\n", 762 | " (u'abysmal', 0.7073379755020142),\n", 763 | " (u'dreadful', 0.6953021883964539),\n", 764 | " (u'horrendous', 0.6881628632545471),\n", 765 | " (u'horrid', 0.6703606843948364),\n", 766 | " (u'appalling', 0.6541486978530884),\n", 767 | " (u'lousy', 0.621989905834198),\n", 768 | " (u'amateurish', 0.6133227348327637)]" 769 | ] 770 | }, 771 | "execution_count": 48, 772 | "metadata": {}, 773 | "output_type": "execute_result" 774 | } 775 | ], 776 | "source": [ 777 | ">>> model.most_similar(\"awful\")" 778 | ] 779 | } 780 | ], 781 | "metadata": { 782 | "kernelspec": { 783 | "display_name": "Python 2", 784 | "language": "python", 785 | "name": "python2" 786 | }, 787 | "language_info": { 788 | "codemirror_mode": { 789 | "name": "ipython", 790 | "version": 2 791 | }, 792 | "file_extension": ".py", 793 | "mimetype": "text/x-python", 794 | "name": "python", 795 | "nbconvert_exporter": "python", 796 | "pygments_lexer": "ipython2", 797 | "version": "2.7.11" 798 | } 799 | }, 800 | "nbformat": 4, 801 | "nbformat_minor": 0 802 | } 803 | -------------------------------------------------------------------------------- /ReadMe.md: -------------------------------------------------------------------------------- 1 | # NLP Course Interactive Scribes 2 | 3 | [![Binder](http://mybinder.org/badge.svg)](http://mybinder.org:/repo/krishnamrith12/notebooksnlp) 4 | 5 | ### A Lazy guy's guide for Jupyter Installation 6 |

1. Download Anaconda. We recommend downloading Anaconda’s latest Python 3 version (currently Python 3.5).
2. Install the version of Anaconda which you downloaded.
3. Congratulations, you have installed Jupyter Notebook. To run the notebook:

       jupyter notebook

Source: http://jupyter.readthedocs.io/en/latest/install.html.
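To check that the installation worked, you can also print the Notebook package's version from Python (a minimal sanity check; it assumes Anaconda's Python is the one on your PATH):

```python
# Quick check that Jupyter Notebook is installed.
# Assumes you are running Anaconda's Python.
import notebook  # the package behind the "jupyter notebook" command
print(notebook.__version__)
```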
### Available Notebooks

1. Demo notebook - GoT Analysis, as shown in class.
2. Basics_NLP - Notebook about preliminaries to be handled for Python programming.
3. Tokenization - Notebook explaining how to tokenise a corpus and the problems faced in doing so.
4. JellyFishStringMatching - Spelling correction using edit distance and the various metrics used in calculating edit distance.
5. LanguageModels - Notebook about language modelling.
6. Smoothing, Perplexity And Backoff - Notebook about smoothing, perplexity and backoff.
7. 
8. HMM
9. 
    39 | -------------------------------------------------------------------------------- /Sentiment Analysis- NaiveBaye's, SVM, Random Forests.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Sentiment Analysis" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# 1. Naive Baye's" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "### Finding word counts\n" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "We have to calculate the probabilities of each classification, and the probabilities of each feature falling into each classification.\n", 29 | "\n", 30 | "We were working with several discrete features in the last example. Here, all we have is one long string. The easiest way to generate features from text is to split the text up into words. Each word in a review will then be a feature that we can then work with. In order to do this, we’ll split the reviews based on whitespace.\n", 31 | "\n", 32 | "We’ll then count up how many times each word occurs in the negative reviews, and how many times each word occurs in the positive reviews. This will allow us to eventually compute the probabilities of a new review belonging to each class.\n", 33 | "\n", 34 | "\n" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "metadata": { 41 | "collapsed": false 42 | }, 43 | "outputs": [ 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | "Negative text sample: Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's bette\n", 49 | "Positive text sample: Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as \"Teachers\". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is \"Teachers\". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a s\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "import os\n", 55 | "from collections import Counter\n", 56 | "import re\n", 57 | "i=0\n", 58 | "#print os.getcwd()\n", 59 | "posreviews=[]\n", 60 | "positive_text = \" \"\n", 61 | "negative_text = \" \"\n", 62 | "i=0\n", 63 | "for fileName in os.listdir(\"mr/train/pos\"):\n", 64 | " fo=open(\"mr/train/pos/%s\" % fileName,\"r\")\n", 65 | " #print fo.name\n", 66 | " #os.rename(\"mr/train/neg/%s\" % fileName,\"mr/train/neg/%d.txt\" %(i))\n", 67 | " str= fo.read()\n", 68 | " #print \"%d. %s\" % (i,str)\n", 69 | " posreviews.append(str)\n", 70 | " #print \"%d . 
%s\" %(i,positive_text)\n", 71 | " #if i>1000: \n", 72 | " # break\n", 73 | " fo.close()\n", 74 | " i=i+1\n", 75 | "positive_text= \" \".join(posreviews)\n", 76 | "#print positive_text\n", 77 | "negreviews=[]\n", 78 | "i=0\n", 79 | "for fileName in os.listdir(\"mr/train/neg\"):\n", 80 | " fo=open(\"mr/train/neg/%s\" % fileName,\"r\")\n", 81 | " #os.rename(\"mr/train/neg/%s\" % fileName,\"mr/train/neg/%d.txt\" %(i))\n", 82 | " negreviews.append(fo.read())\n", 83 | " #if i>1000: \n", 84 | " # break\n", 85 | " i=i+1\n", 86 | "negative_text=\" \".join(negreviews)\n", 87 | "#print negative_text\n", 88 | "\n", 89 | "def count_text(text):\n", 90 | " # Split text into words based on whitespace. Simple but effective.\n", 91 | " words = re.split(\"\\s+\", text)\n", 92 | " # Count up the occurence of each word.\n", 93 | " return Counter(words)\n", 94 | "\n", 95 | "# Generate word counts for negative tone.\n", 96 | "negative_counts = count_text(negative_text)\n", 97 | "# Generate word counts for positive tone.\n", 98 | "positive_counts = count_text(positive_text)\n", 99 | "\n", 100 | "print(\"Negative text sample: {0}\".format(negative_text[:500]))\n", 101 | "print(\"Positive text sample: {0}\".format(positive_text[:500]))\n" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "### Making Predictions" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "Now that we have the word counts, we just have to convert them to probabilities and multiply them out to get the predicted classification. Let’s say we wanted to find the probability that the review didn't like it expresses a negative sentiment. We would find the total number of times the word didn't occured in the negative reviews, and divide it by the total number of words in the negative reviews to get the probability of x given y. We would then do the same for like and it. We would multiply all three probabilities, and then multiply by the probability of any document expressing a negative sentiment to get our final probability that the sentence expresses negative sentiment.\n", 116 | "\n", 117 | "We would do the same for positive sentiment, and then whichever probability is greater would be the class that the review is assigned to.\n", 118 | "\n", 119 | "To do all this, we’ll need to compute the probabilities of each class occuring in the data, and then make a function to compute the classification.\n" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 4, 125 | "metadata": { 126 | "collapsed": false 127 | }, 128 | "outputs": [ 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | "Good nice Well Done - Positive (pos score=8.06432308243e-18, neg score=1.53669815111e-18)\n", 134 | "Movie was junk, useless, good for nothing, sheer waste of time and money. 
- Negative (pos score=6.35349543481e-48, neg score=2.56654331959e-45)\n"
135 |      ]
136 |     }
137 |    ],
138 |    "source": [
139 |     "\n",
140 |     "# We need these counts to use for smoothing when computing the prediction.\n",
141 |     "positive_review_count = len(posreviews)\n",
142 |     "negative_review_count = len(negreviews)\n",
143 |     "# These are the class probabilities (we saw them in the formula as P(y)).\n",
144 |     "prob_positive = positive_review_count / float(len(posreviews) + len(negreviews))\n",
145 |     "prob_negative = negative_review_count / float(len(posreviews) + len(negreviews))\n",
146 |     "\n",
147 |     "def make_class_prediction(text, counts, class_prob, class_count):\n",
148 |     "    prediction = 1.0\n",
149 |     "    text_counts = Counter(re.split(\"\\s+\", text))\n",
150 |     "    for word in text_counts:\n",
151 |     "        # For every word in the text, we get the number of times that word occurred in the reviews for a given class, add 1 to smooth the value, and divide by the total number of words in the class (plus the class_count to also smooth the denominator).\n",
152 |     "        # Smoothing ensures that we don't multiply the prediction by 0 if the word didn't exist in the training data.\n",
153 |     "        # We also smooth the denominator counts to keep things even.\n",
154 |     "        # print \"%d, %d,%d\" %(text_counts.get(word), counts.get(word,0), sum(counts.values()))\n",
155 |     "        prediction *= text_counts.get(word) * ((counts.get(word,0) + 1) / float(sum(counts.values()) + class_count))\n",
156 |     "    #print prediction\n",
157 |     "    # Now we multiply by the probability of the class existing in the documents.\n",
158 |     "    return prediction * class_prob\n",
159 |     "\n",
160 |     "# As you can see, we can now generate probabilities for which class a given review is part of.\n",
161 |     "# The probabilities themselves aren't very useful -- we make our classification decision based on which value is greater.\n",
162 |     "#print(\"Review: {0}\".format(reviews[0][0]))\n",
163 |     "#text=\"Movie was junk, useless, good for nothing, sheer waste of time and money.\"\n",
164 |     "text=\"Good nice Well Done\"\n",
165 |     "#print(\"Negative prediction: {0}\".format(make_class_prediction(text, negative_counts, prob_negative, negative_review_count))\n",
166 |     "neg=make_class_prediction(text, negative_counts, prob_negative, negative_review_count)\n",
167 |     "pos=make_class_prediction(text, positive_counts, prob_positive, positive_review_count)\n",
168 |     "if pos>neg:\n",
169 |     "    print \"%s - Positive (pos score={0}, neg score={1})\".format(pos,neg) %(text)\n",
170 |     "else:\n",
171 |     "    print \"%s - Negative (pos score={0}, neg score={1})\".format(pos,neg) %(text)\n",
172 |     "text=\"Movie was junk, useless, good for nothing, sheer waste of time and money.\"\n",
173 |     "neg=make_class_prediction(text, negative_counts, prob_negative, negative_review_count)\n",
174 |     "pos=make_class_prediction(text, positive_counts, prob_positive, positive_review_count)\n",
175 |     "if pos>neg:\n",
176 |     "    print \"%s - Positive (pos score={0}, neg score={1})\".format(pos,neg) %(text)\n",
177 |     "else:\n",
178 |     "    print \"%s - Negative (pos score={0}, neg score={1})\".format(pos,neg) %(text)"
179 |    ]
180 |   },
181 |   {
182 |    "cell_type": "markdown",
183 |    "metadata": {},
184 |    "source": [
185 |     "### Naive Bayes - Faster way to predict using Sklearn"
186 |    ]
187 |   },
188 |   {
189 |    "cell_type": "code",
190 |    "execution_count": 5,
191 |    "metadata": {
192 |     "collapsed": false
193 |    },
194 |    "outputs": [
195 |     {
196 |      "name": "stdout",
197 |      "output_type": "stream",
198 |      "text": [
199 |       "Multinomial 
naive bayes AUC: 0.785085965614\n"
     ]
    }
   ],
   "source": [
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn import metrics\n",
    "test=[]\n",
    "actual=[]\n",
    "for fileName in os.listdir(\"mr/test/pos\"):\n",
    "    fo=open(\"mr/test/pos/%s\" % fileName,\"r\")\n",
    "    str= fo.read()\n",
    "    test.append(str)\n",
    "    actual.append(1)\n",
    "for fileName in os.listdir(\"mr/test/neg\"):\n",
    "    fo=open(\"mr/test/neg/%s\" % fileName,\"r\")\n",
    "    str= fo.read()\n",
    "    test.append(str)\n",
    "    actual.append(-1)\n",
    "reviews = []\n",
    "for r in posreviews:\n",
    "    reviews.append(r)\n",
    "for r in negreviews:\n",
    "    reviews.append(r)\n",
    "# Generate counts from text using a vectorizer. There are other vectorizers available, and lots of options you can set.\n",
    "# This performs our step of computing word counts.\n",
    "vectorizer = CountVectorizer(stop_words='english')\n",
    "train_features = vectorizer.fit_transform([r for r in reviews])\n",
    "test_features = vectorizer.transform([r for r in test])\n",
    "\n",
    "# Fit a naive bayes model to the training data.\n",
    "# This will train the model using the word counts we computed, and the existing classifications in the training set.\n",
    "nb = MultinomialNB()\n",
    "trainRes=[]\n",
    "for r in posreviews:\n",
    "    trainRes.append(1)\n",
    "for r in negreviews:\n",
    "    trainRes.append(-1)\n",
    "nb.fit(train_features,trainRes)\n",
    "\n",
    "# Now we can use the model to predict classifications for our test features.\n",
    "predictions = nb.predict(test_features)\n",
    "#print predictions\n",
    "# Compute the error. It is slightly different from our model because the internals of this process work differently from our implementation.\n",
    "fpr, tpr, thresholds = metrics.roc_curve(actual, predictions, pos_label=1)\n",
    "print(\"Multinomial naive bayes AUC: {0}\".format(metrics.auc(fpr, tpr)))"
   ]
  },
  {
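"cell_type": "markdown",
   "metadata": {},
   "source": [
    "In addition to AUC we can report plain accuracy. This cell is an illustrative addition rather than part of the original notebook; it reuses the actual labels and predictions computed above together with sklearn's accuracy_score."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import accuracy_score\n",
    "# Fraction of test reviews whose predicted label matches the true label.\n",
    "print(\"Multinomial naive bayes accuracy: {0}\".format(accuracy_score(actual, predictions)))"
   ]
  },
  {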
\n", 285 | "fpr, tpr, thresholds = metrics.roc_curve(actual, predictSvc, pos_label=1)\n", 286 | "print(\"SVC Analysis AUC: {0}\".format(metrics.auc(fpr, tpr)))" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "### Kernal as Linear Function " 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 7, 299 | "metadata": { 300 | "collapsed": false 301 | }, 302 | "outputs": [ 303 | { 304 | "name": "stdout", 305 | "output_type": "stream", 306 | "text": [ 307 | "[-1 -1 -1 ..., -1 -1 -1]\n", 308 | "SVC Analysis AUC: 0.5\n" 309 | ] 310 | } 311 | ], 312 | "source": [ 313 | "SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n", 314 | " decision_function_shape=None, degree=3, gamma='auto', kernel='linear',\n", 315 | " max_iter=-1, probability=False, random_state=None, shrinking=True,\n", 316 | " tol=0.001, verbose=False)\n", 317 | "predictSvc=clf.predict(test_features)\n", 318 | "print predictSvc\n", 319 | "# Compute the error. \n", 320 | "fpr, tpr, thresholds = metrics.roc_curve(actual, predictSvc, pos_label=1)\n", 321 | "print(\"SVC Analysis AUC: {0}\".format(metrics.auc(fpr, tpr)))" 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": {}, 327 | "source": [ 328 | "### Kernal as Sigmoid function i.e. tanh(gamma* < x, x'> + r) where r is specified by variable coef0" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": 21, 334 | "metadata": { 335 | "collapsed": false 336 | }, 337 | "outputs": [ 338 | { 339 | "name": "stdout", 340 | "output_type": "stream", 341 | "text": [ 342 | "[-1 -1 -1 ..., -1 -1 -1]\n", 343 | "SVC Analysis AUC: 0.5\n" 344 | ] 345 | } 346 | ], 347 | "source": [ 348 | "SVC(C=1.0, cache_size=200, class_weight=None, coef0=200.0,\n", 349 | " decision_function_shape=None, degree=3, gamma='auto', kernel='sigmoid',\n", 350 | " max_iter=-1, probability=False, random_state=None, shrinking=True,\n", 351 | " tol=0.001, verbose=False)\n", 352 | "predictSvc=clf.predict(test_features)\n", 353 | "print predictSvc\n", 354 | "# Compute the error. \n", 355 | "fpr, tpr, thresholds = metrics.roc_curve(actual, predictSvc, pos_label=1)\n", 356 | "print(\"SVC Analysis AUC: {0}\".format(metrics.auc(fpr, tpr)))" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": {}, 362 | "source": [ 363 | "### Using Random Forests" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "#### Maximum Trees 10" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 14, 376 | "metadata": { 377 | "collapsed": false 378 | }, 379 | "outputs": [ 380 | { 381 | "name": "stdout", 382 | "output_type": "stream", 383 | "text": [ 384 | "[-1 -1 -1 ..., -1 -1 -1]\n", 385 | "SVC Analysis AUC: 0.5\n" 386 | ] 387 | } 388 | ], 389 | "source": [ 390 | "from sklearn.ensemble import RandomForestClassifier\n", 391 | "RandomForestClassifier(n_estimators=10, criterion='gini', max_depth=None, min_samples_split=2, \n", 392 | " min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', \n", 393 | " max_leaf_nodes=None,bootstrap=True, oob_score=False, \n", 394 | " n_jobs=1, random_state=None, verbose=0, warm_start=False, class_weight=None)\n", 395 | "predictRF=clf.predict(test_features)\n", 396 | "print predictSvc\n", 397 | "# Compute the error. 
\n", 398 | "fpr, tpr, thresholds = metrics.roc_curve(actual, predictSvc, pos_label=1)\n", 399 | "print(\"SVC Analysis AUC: {0}\".format(metrics.auc(fpr, tpr)))" 400 | ] 401 | }, 402 | { 403 | "cell_type": "markdown", 404 | "metadata": {}, 405 | "source": [ 406 | "#### Maximum Trees 20" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 19, 412 | "metadata": { 413 | "collapsed": false 414 | }, 415 | "outputs": [ 416 | { 417 | "name": "stdout", 418 | "output_type": "stream", 419 | "text": [ 420 | "[-1 -1 -1 ..., -1 -1 -1]\n", 421 | "SVC Analysis AUC: 0.5\n" 422 | ] 423 | } 424 | ], 425 | "source": [ 426 | "RandomForestClassifier(n_estimators=50, criterion='gini', max_depth=None, min_samples_split=2, \n", 427 | " min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', \n", 428 | " max_leaf_nodes=None, bootstrap=True, oob_score=False, \n", 429 | " n_jobs=1, random_state=None, verbose=0, warm_start=False, class_weight=None)\n", 430 | "predictRF=clf.predict(test_features)\n", 431 | "print predictSvc\n", 432 | "# Compute the error. \n", 433 | "fpr, tpr, thresholds = metrics.roc_curve(actual, predictSvc, pos_label=1)\n", 434 | "print(\"SVC Analysis AUC: {0}\".format(metrics.auc(fpr, tpr)))" 435 | ] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": {}, 440 | "source": [ 441 | "### References\n", 442 | "Dataset - http://ai.stanford.edu/~amaas/data/sentiment/ [25k(12.5k pos and 12.5k negative reviews) Training Data and 25k Test Movie Reviews]
    We sampled 2.5k positive and 2.5k negative reviews from the test set and tagged them to produce the results displayed above.
    Kindly download the dataset from the link provided and place it in the same folder as this notebook before running.
    \n", 443 | "Naive Bayes Senti Analysis on Movie Reviews Blog https://www.dataquest.io/blog/naive-bayes-tutorial/
    \n", 444 | "Developed by Mayank Bhasin, for any queries contact mayankbhasin@gmail.com" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "metadata": { 451 | "collapsed": true 452 | }, 453 | "outputs": [], 454 | "source": [] 455 | } 456 | ], 457 | "metadata": { 458 | "kernelspec": { 459 | "display_name": "Python 2", 460 | "language": "python", 461 | "name": "python2" 462 | }, 463 | "language_info": { 464 | "codemirror_mode": { 465 | "name": "ipython", 466 | "version": 2 467 | }, 468 | "file_extension": ".py", 469 | "mimetype": "text/x-python", 470 | "name": "python", 471 | "nbconvert_exporter": "python", 472 | "pygments_lexer": "ipython2", 473 | "version": "2.7.11" 474 | } 475 | }, 476 | "nbformat": 4, 477 | "nbformat_minor": 0 478 | } 479 | -------------------------------------------------------------------------------- /imbalanced learn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "training = pd.read_csv('trainingVect.csv')\n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "training['Ow2'] = 0\n", 20 | "training.ix[training['word2'].str[-1] == 'O','Ow2'] = 1" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "heldout = pd.read_csv('heldoutVect.csv')" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "dvan = heldout[['word1','word2','tag','label']][heldout.word2.str[-1]== 'O'].groupby('label').count()\n", 39 | "dvan.sort_values(by='word1',ascending=False)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "heldout[['word1','word2','tag','label']][(heldout['word2'].str[-2:] == 'EH') & (heldout['label']==2)]" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "heldout['Ow2'] = 0\n", 58 | "heldout.ix[training['word2'].str[-1] == 'O','Ow2'] = 1" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "heldout.head(4)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "## Ignore" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "finRemove = eval(open('adap2.features').read())\n", 84 | "type(finRemove)\n", 85 | "finRemove[0]" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "for item in finRemove:\n", 95 | " del heldout[item]\n", 96 | " del training[item]" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "heldout.columns.difference(training.columns)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "print len(heldout.columns),len(training.columns)" 115 | ] 116 | }, 117 | { 118 | "cell_type": 
"code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "for i,item in enumerate(training.columns):\n", 124 | " if training.columns[i] != heldout.columns[i]:\n", 125 | " print training.columns[i],heldout.columns[i]" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "## Stop Ignorring" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "print training.columns[-41:]\n", 142 | "\n", 143 | "allCols = list(training.columns[5:-39])\n", 144 | "allCols.insert(-1,'Ow2')\n", 145 | "allCols[-3:]" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "from numpy import *\n", 155 | "\n", 156 | "\n", 157 | "#featVect = featVect.sample(frac=1.0)\n", 158 | "features = array(training.ix[:,allCols])\n", 159 | "labels = array(training.ix[:,'label'])\n", 160 | "\n", 161 | "heldFeatures = array(heldout.ix[:,allCols])\n", 162 | "\n", 163 | "heldLabels = array(heldout.ix[:,'label'])\n", 164 | "print allCols[-2:]" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "a = training.columns\n", 174 | "a[26:413]\n" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "## Random Forests" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "from sklearn.cross_validation import StratifiedShuffleSplit\n", 191 | "from sklearn.cross_validation import StratifiedKFold\n", 192 | "skf = StratifiedKFold(labels, n_folds=10,shuffle=True)\n", 193 | "from sklearn.ensemble import RandomForestClassifier\n", 194 | "from sklearn.metrics import confusion_matrix, classification_report\n", 195 | "from sklearn.metrics import accuracy_score\n", 196 | "\n" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "clf = RandomForestClassifier(n_jobs=-1,n_estimators=400,max_features='log2')\n", 206 | "clf = clf.fit(features,labels)\n", 207 | "predicted_label = clf.predict(heldFeatures)\n", 208 | "pred_prob = clf.predict_proba(heldFeatures)" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "print classification_report(heldLabels, predicted_label)\n", 218 | "print ('Accuracy:',accuracy_score(heldLabels, predicted_label))" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "print classification_report(heldLabels, predicted_label)\n", 228 | "print ('Accuracy:',accuracy_score(heldLabels, predicted_label))" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "%matplotlib inline\n", 238 | "import numpy as np\n", 239 | "import matplotlib.pyplot as plt\n", 240 | "\n", 241 | "import numpy as np\n", 242 | "forest = clf\n", 243 | "importances = forest.feature_importances_\n", 244 | "std = np.std([tree.feature_importances_ for tree in forest.estimators_],\n", 245 | " axis=0)\n", 246 | "indices = np.argsort(importances)[::-1]\n", 247 | "X = 
features\n", 248 | "# Print the feature ranking\n", 249 | "print(\"Feature ranking:\")\n", 250 | "\n", 251 | "\"\"\"for f in range(features.shape[1]):\n", 252 | " print(\"%d. feature %d (%f)\" % (f + 1, indices[f], importances[indices[f]]))\n", 253 | "\"\"\"\n", 254 | "# Plot the feature importances of the forest\n", 255 | "plt.figure()\n", 256 | "plt.title(\"Feature importances\")\n", 257 | "plt.bar(range(X.shape[1]), importances[indices],\n", 258 | " color=\"r\", yerr=std[indices], align=\"center\")\n", 259 | "plt.xticks(range(X.shape[1]), indices)\n", 260 | "plt.xlim([-1, X.shape[1]])\n", 261 | "plt.show()" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "triple = zip(range(len(heldLabels)),heldLabels,predicted_label)\n" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "prob =pd.DataFrame().from_records(pred_prob,columns=['prob0','prob1','prob2','prob3'])\n", 280 | "prob = pd.merge(prob,result,left_index=True,right_index=True,how='inner')\n", 281 | "prob[(prob['heldLabels']!= prob['predicted_label']) & (prob['heldLabels'] == 2)][['prob0','prob1','prob2','prob3','word1','word2','heldLabels','predicted_label','dups','tag']]" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "prob[(prob['word1'].str[-1] == 'a') & (prob['word2'].str[-1] == 'm') ]" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "trip = pd.DataFrame().from_records(triple,columns=['index','heldLabels','predicted_label'])\n", 300 | "\n", 301 | "result = pd.merge(trip,heldout,how='inner',left_on='index',right_index=True,)\n", 302 | "result[(result['heldLabels']!= result['predicted_label']) & (result['heldLabels'] == 2)][['word1','word2','heldLabels','predicted_label','dups','tag']]" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "## Extremely Randomied trees" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "from sklearn.ensemble import RandomTreesEmbedding, ExtraTreesClassifier\n", 319 | "from sklearn.decomposition import TruncatedSVD\n", 320 | "from sklearn.ensemble import RandomForestClassifier\n", 321 | "from sklearn.metrics import confusion_matrix, classification_report\n", 322 | "from sklearn.metrics import accuracy_score\n", 323 | "trees = ExtraTreesClassifier(n_estimators=400, random_state=43, max_features='log2')\n", 324 | "trees = trees.fit(features,labels)\n", 325 | "\n", 326 | "predicted_label = trees.predict(heldFeatures)\n", 327 | "print classification_report(heldLabels, predicted_label)\n", 328 | "print ('Accuracy:',accuracy_score(heldLabels, predicted_label))\n", 329 | "\n" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "from sklearn import svm\n", 339 | "clf = svm.SVC()\n", 340 | "clf.fit(features, labels) \n", 341 | "predicted_label = clf.predict(heldFeatures)\n", 342 | "print classification_report(heldLabels, predicted_label)\n", 343 | "print ('Accuracy:',accuracy_score(heldLabels, predicted_label))" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | 
"execution_count": null, 349 | "metadata": {}, 350 | "outputs": [], 351 | "source": [ 352 | "from sklearn import tree\n", 353 | "clf = tree.DecisionTreeClassifier()\n", 354 | "clf.fit(features, labels) \n", 355 | "predicted_label = clf.predict(heldFeatures)\n", 356 | "print classification_report(heldLabels, predicted_label)\n", 357 | "print ('Accuracy:',accuracy_score(heldLabels, predicted_label))" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": {}, 364 | "outputs": [], 365 | "source": [ 366 | "import numpy\n", 367 | "alist = zip(list(a[5:-29]),list(trees.feature_importances_))\n", 368 | "feati = open('featImp.csv','w')\n", 369 | "for item in alist:\n", 370 | " print >>feati,item[0],',',item[1]\n", 371 | "feati.close()" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "metadata": {}, 377 | "source": [ 378 | "## GBM" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "from sklearn.ensemble import GradientBoostingClassifier\n", 388 | "clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=10).fit(features,labels)\n", 389 | "predicted_label = clf.predict(heldFeatures)\n" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "print classification_report(heldLabels, predicted_label)\n", 399 | "print ('Accuracy:',accuracy_score(heldLabels, predicted_label))" 400 | ] 401 | }, 402 | { 403 | "cell_type": "markdown", 404 | "metadata": {}, 405 | "source": [ 406 | "## Easy Ensemble" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": null, 412 | "metadata": {}, 413 | "outputs": [], 414 | "source": [ 415 | "from collections import Counter\n", 416 | "from imblearn.ensemble import BalanceCascade\n", 417 | "from imblearn.ensemble import EasyEnsemble\n", 418 | "ee = EasyEnsemble(ratio=0.055,random_state=25,replacement=True,n_subsets=20)\n", 419 | "X_res, y_res = ee.fit_sample(features, labels)\n", 420 | "print('Resampled dataset shape {}'.format(Counter(y_res[1])))\n", 421 | "\n", 422 | "\"\"\"from sklearn.decomposition import PCA\n", 423 | "pcaHeld = PCA(n_components=900).fit(features)\n", 424 | "pcaFe = pcaHeld.transform(features)\"\"\"" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [ 433 | "X_res[1].shape" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [ 442 | "## from sklearn.ensemble import AdaBoostClassifier\n", 443 | "from sklearn.tree import DecisionTreeClassifier\n", 444 | "\n", 445 | "#clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=100,splitter='best'),n_estimators=100,learning_rate=1.0)\n", 446 | "clf = ExtraTreesClassifier(n_estimators=200, max_features='log2', random_state=43, warm_start=True,n_jobs=-1)\n", 447 | "countrs= []\n", 448 | "for i in range(len(heldFeatures)):\n", 449 | " countrs.append([0]*4)\n", 450 | "\n", 451 | "for i,item in enumerate(X_res):\n", 452 | " print i,\n", 453 | " clf.fit(X_res[i],y_res[i])\n", 454 | " staged_predict = clf.predict_proba(heldFeatures)\n", 455 | " clf.n_estimators += 50\n", 456 | " for j,stuff in enumerate(staged_predict):\n", 457 | " for k,thing in enumerate(list(stuff)):\n", 458 | " countrs[j][k] += thing" 459 | ] 460 | }, 461 | { 
462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": {}, 465 | "outputs": [], 466 | "source": [] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": null, 471 | "metadata": {}, 472 | "outputs": [], 473 | "source": [ 474 | "majLabels = [-1]*len(countrs)\n", 475 | "for i,item in enumerate(countrs):\n", 476 | " majLabels[i] = item.index(max(item))\n", 477 | "print classification_report(heldLabels, majLabels)\n", 478 | "print ('Accuracy:',accuracy_score(heldLabels, majLabels))" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "metadata": {}, 485 | "outputs": [], 486 | "source": [ 487 | "preds = open('preds.csv','w')\n", 488 | "for i,item in enumerate(heldLabels):\n", 489 | " print>>preds,item,',',majLabels[i]\n", 490 | "preds.close()\n" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "print type(heldLabels)" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "from sklearn.metrics import confusion_matrix, classification_report\n", 509 | "from sklearn.metrics import accuracy_score\n", 510 | "import pandas \n", 511 | "preds = pandas.read_csv('myList.csv')\n", 512 | "import numpy\n", 513 | "a = list(preds.label)\n", 514 | "b = list(preds.predicted)\n", 515 | "print classification_report(a,b)\n", 516 | "print ('Accuracy:',accuracy_score(a, b))" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": null, 522 | "metadata": {}, 523 | "outputs": [], 524 | "source": [ 525 | "preds[(preds['tag']=='Bsmn')].groupby('predicted').count()" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": null, 531 | "metadata": {}, 532 | "outputs": [], 533 | "source": [] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": null, 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [ 541 | "from sklearn.metrics import confusion_matrix, classification_report\n", 542 | "from sklearn.metrics import accuracy_score\n", 543 | "import pandas \n", 544 | "preds = pandas.read_csv('preds2.csv')\n", 545 | "import numpy\n", 546 | "a = list(preds.orig)\n", 547 | "b = list(preds.pred)\n", 548 | "print classification_report(a,b)\n", 549 | "print ('Accuracy:',accuracy_score(a, b))" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": null, 555 | "metadata": {}, 556 | "outputs": [], 557 | "source": [ 558 | "clfFull = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=100,splitter='best'),n_estimators=600,learning_rate=1.0)\n", 559 | "clfFull = clfFull.fit(features,labels)\n", 560 | "predicted_label = clfFull.predict(heldFeatures)\n", 561 | "print classification_report(heldLabels, predicted_label)\n", 562 | "print ('Accuracy:',accuracy_score(heldLabels, predicted_label))" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": null, 568 | "metadata": {}, 569 | "outputs": [], 570 | "source": [ 571 | "import numpy as np\n", 572 | "akie = training.select_dtypes(include=['float64']).columns\n", 573 | "akie = akie.insert(-1,'label')\n", 574 | "pkie = training[akie].groupby('label').sum().T\n", 575 | "df = pkie\n", 576 | "df['sum'] = df.sum(axis=1)\n", 577 | "df['entropy'] = (df.ix[:,:3].div(df['sum'],axis=0)*np.log(df.ix[:,:3].div(df['sum'],axis=0))).sum(axis=1)\n", 578 | "pkie.to_csv('summar1.csv')" 579 | ] 580 | }, 581 | { 582 | "cell_type": 
"code", 583 | "execution_count": null, 584 | "metadata": {}, 585 | "outputs": [], 586 | "source": [ 587 | "training[training['label']==2][['word1','word2','tag','label','dups']]" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": null, 593 | "metadata": {}, 594 | "outputs": [], 595 | "source": [ 596 | "ap = list(training[training['tag'] == 'Ds' ]['word2'])\n", 597 | "ap1 = list(training[training['tag'] == 'Ds' ]['word1'])\n", 598 | "#vaswra\t2.6.115.2.1\tnapuM.\tmanuRyavargaH\tvaswram\t\t\t\t\t\t\t\t\t\tvaswram" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": null, 604 | "metadata": {}, 605 | "outputs": [], 606 | "source": [ 607 | "kAndas = pd.read_csv('all_kANdas.csv')\n", 608 | "jusWord = list(kAndas['%0_Word'])\n" 609 | ] 610 | }, 611 | { 612 | "cell_type": "code", 613 | "execution_count": null, 614 | "metadata": {}, 615 | "outputs": [], 616 | "source": [ 617 | "trainFi = [item.split(',') for item in open('trainigFiltered.csv').read().splitlines()[1:]]\n", 618 | "import pandas as pd\n", 619 | "merger = pd.merge(training,kAndas,how='left',left_on='word1',right_on='%0_Word',suffixes=('_x','_word1'))\n", 620 | "merger = pd.merge(merger,kAndas,how='left',left_on='word2',right_on='%0_Word',suffixes=('_x','_word2'))\n", 621 | "merger\n", 622 | "cat_columns =merger.select_dtypes(include=['object']).columns\n", 623 | "\n", 624 | "cat_columns = cat_columns[7:]\n", 625 | "for item in cat_columns:\n", 626 | " merger[item] = merger[item].astype('category')\n", 627 | "\n", 628 | "merger" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": null, 634 | "metadata": {}, 635 | "outputs": [], 636 | "source": [ 637 | "ctr = 0\n", 638 | "for i,item in enumerate(ap):\n", 639 | " if item not in jusWord:\n", 640 | " if item[:-1] not in jusWord:\n", 641 | " if item[:-3] not in jusWord:\n", 642 | " if item[:-1]+'a' not in jusWord:\n", 643 | " print i,item\n", 644 | " ctr += 1\n", 645 | " \n", 646 | "print ctr,len(ap)\n", 647 | "\n", 648 | "#word1 = -ya+a, -aXi+, -pra+, -+n\n", 649 | "#word2 = -m+ -ena+a -e+a\n", 650 | "## vit, AkarRaNa, saMmAna\n" 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": null, 656 | "metadata": {}, 657 | "outputs": [], 658 | "source": [ 659 | "f = [item.split(',') for item in open('heldoutFiltered.csv').read().splitlines()[1:]]\n", 660 | "kAndas = [item.split(',') for item in open('all_kANdas.csv').read().splitlines()]\n", 661 | "nanList = ['None']*(len(kAndas[-1]) - 3)\n", 662 | "ctr = 0\n", 663 | "ctr2 = 0\n", 664 | "for i,item in enumerate(f):\n", 665 | " matched = 0\n", 666 | " matched2 = 0\n", 667 | " for j,stuff in enumerate(kAndas):\n", 668 | " if item[1] == stuff[0]:\n", 669 | " f[i].extend(kAndas[j][2:])\n", 670 | " matched= 1\n", 671 | " ctr += 1\n", 672 | " break\n", 673 | " matched2 = 0\n", 674 | " if matched == 0:\n", 675 | " f[i].extend(nanList)\n", 676 | "\n", 677 | " for j,stuff in enumerate(kAndas):\n", 678 | " if item[2] == stuff[0]: \n", 679 | " f[i].extend(kAndas[j][2:])\n", 680 | " matched2= 1\n", 681 | " ctr2 += 1\n", 682 | " break\n", 683 | " elif item[2][:-1] == stuff[0] or item[2][:-3] == stuff[0] or item[2][:-1]+'a' == stuff[0] or item[2][:-2]+'a' == stuff[0]:\n", 684 | " f[i].extend(kAndas[j][2:])\n", 685 | " matched2= 1\n", 686 | " ctr2 += 1\n", 687 | " break\n", 688 | " \n", 689 | " if matched2 == 0:\n", 690 | " f[i].extend(nanList)" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": null, 696 | "metadata": {}, 697 | "outputs": 
[], 698 | "source": [ 699 | "import pandas as pd\n", 700 | "training = pd.read_csv('trainigFiltered.csv')\n", 701 | "training[(training['tag'] == 'Ds') | (training['tag'] == 'Di')] [['word1','word2']].to_csv('dvandvas.csv',index=False)\n", 702 | "f = open('dvandvas.csv').read().replace(',','$').splitlines()\n", 703 | "k = open('dvandvas.dat','w')\n", 704 | "for item in f:\n", 705 | "    for chara in item:\n", 706 | "        print>>k,chara,\n", 707 | "    print >> k,'\\n'\n", 708 | "k.close()" 709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "execution_count": null, 714 | "metadata": {}, 715 | "outputs": [], 716 | "source": [ 717 | "newTrainingFiltered = open('newHeldoutFiltered.csv','w')\n", 718 | "for item in f:\n", 719 | "    print >> newTrainingFiltered,'\\n',\n", 720 | "    for stuff in item:\n", 721 | "        print >> newTrainingFiltered,stuff,',',\n", 722 | "newTrainingFiltered.close()" 723 | ] 724 | }, 725 | { 726 | "cell_type": "code", 727 | "execution_count": null, 728 | "metadata": {}, 729 | "outputs": [], 730 | "source": [ 731 | "filna = 'newHeldoutFiltered.csv'\n", 732 | "reindex = pd.read_csv(filna).sample(frac=1)\n", 733 | "del reindex['index']\n", 734 | "reindex.to_csv('newTrainingFiltered.csv')" 735 | ] 736 | } 737 | ], 738 | "metadata": { 739 | "kernelspec": { 740 | "display_name": "Python 2", 741 | "language": "python", 742 | "name": "python2" 743 | }, 744 | "language_info": { 745 | "codemirror_mode": { 746 | "name": "ipython", 747 | "version": 2 748 | }, 749 | "file_extension": ".py", 750 | "mimetype": "text/x-python", 751 | "name": "python", 752 | "nbconvert_exporter": "python", 753 | "pygments_lexer": "ipython2", 754 | "version": "2.7.11" 755 | } 756 | }, 757 | "nbformat": 4, 758 | "nbformat_minor": 1 759 | } 760 | --------------------------------------------------------------------------------