├── Advanced_Lexial_Processing
│   ├── spell_corrector.py
│   ├── soundex.ipynb
│   ├── edit+distance.ipynb
│   └── spell-corrector.ipynb
├── Introduction_to_NLP
│   ├── Bonus+exercise.ipynb
│   ├── Bonus+exercise+with+solution.ipynb
│   └── Regular_Expressions .ipynb
└── Basic_Lexical_Processing
    ├── tokenisation .ipynb
    ├── stemming.ipynb
    └── bag+of+words.ipynb

/Advanced_Lexial_Processing/spell_corrector.py:
--------------------------------------------------------------------------------
import re
from collections import Counter

def words(document):
    "Convert text to lower case and tokenize the document"
    return re.findall(r'\w+', document.lower())

# create a frequency table of all the words of the document
all_words = Counter(words(open('big.txt').read()))

def edits_one(word):
    "Create all edits that are one edit away from `word`."
    alphabets = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [left + right[1:] for left, right in splits if right]
    inserts = [left + c + right for left, right in splits for c in alphabets]
    replaces = [left + c + right[1:] for left, right in splits if right for c in alphabets]
    transposes = [left + right[1] + right[0] + right[2:] for left, right in splits if len(right) > 1]
    return set(deletes + inserts + replaces + transposes)

def edits_two(word):
    "Create all edits that are two edits away from `word`."
    return (e2 for e1 in edits_one(word) for e2 in edits_one(e1))

def known(words):
    "The subset of `words` that appear in `all_words`."
    return set(word for word in words if word in all_words)

def possible_corrections(word):
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits_one(word)) or known(edits_two(word)) or [word])

def prob(word, N=sum(all_words.values())):
    "Probability of `word`: Number of appearances of 'word' / total number of tokens"
    return all_words[word] / N

def rectify(word):
    "Return the most probable spelling correction for `word` out of all the `possible_corrections`"
    correct_word = max(possible_corrections(word), key=prob)
    return correct_word
--------------------------------------------------------------------------------
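A quick usage sketch of the module above (my own illustration, not part of the repo; it assumes the Norvig-style corpus file big.txt sits in the working directory, as the code itself expects):

    # hypothetical session with spell_corrector.py
    from spell_corrector import rectify

    # falls back from the word itself to one-edit, then two-edit candidates
    print(rectify("monney"))   # most likely 'money', given big.txt frequencies

rectify picks the candidate with the highest unigram probability, so a rarer but closer word can lose out to a very frequent one.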
/Advanced_Lexial_Processing/soundex.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Soundex"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's create a function that calculates the soundex of any given string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_soundex(token):\n",
    "    \"\"\"Get the soundex code for the string\"\"\"\n",
    "    token = token.upper()\n",
    "\n",
    "    soundex = \"\"\n",
    "    \n",
    "    # first letter of input is always the first letter of soundex\n",
    "    soundex += token[0]\n",
    "    \n",
    "    # create a dictionary which maps letters to respective soundex codes. Vowels and 'H', 'W' and 'Y' will be represented by '.'\n",
    "    dictionary = {\"BFPV\": \"1\", \"CGJKQSXZ\": \"2\", \"DT\": \"3\", \"L\": \"4\", \"MN\": \"5\", \"R\": \"6\", \"AEIOUHWY\": \".\"}\n",
    "    \n",
    "    for char in token[1:]:\n",
    "        for key in dictionary.keys():\n",
    "            if char in key:\n",
    "                code = dictionary[key]\n",
    "                if code != '.':\n",
    "                    if code != soundex[-1]:\n",
    "                        soundex += code\n",
    "    \n",
    "    # trim or pad to make soundex a 4-character code\n",
    "    soundex = soundex[:4].ljust(4, \"0\")\n",
    "    \n",
    "    return soundex"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's see the soundex of 'STOUT' and 'Ashcraft'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "S300\n",
      "A261\n"
     ]
    }
   ],
   "source": [
    "print(get_soundex(\"STOUT\"))\n",
    "print(get_soundex(\"Ashcraft\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's see the soundex of 'Aggrawal', 'Agrawal', 'Aggarwal' and 'Agarwal'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "A264\n",
      "A264\n",
      "A264\n",
      "A264\n"
     ]
    }
   ],
   "source": [
    "print(get_soundex(\"Aggrawal\"))\n",
    "print(get_soundex(\"Agrawal\"))\n",
    "print(get_soundex(\"Aggarwal\"))\n",
    "print(get_soundex(\"Agarwal\"))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
--------------------------------------------------------------------------------
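Soundex is most interesting on variant spellings of the same name; a small sketch (my own example — the expected codes are reasoned from the mapping above, not recorded output):

    # 'Bombay' and 'Bambai' differ only in vowels, which soundex drops,
    # so both should reduce to B-5-1 padded with '0', i.e. 'B510'
    for name in ("Bombay", "Bambai"):
        print(name, get_soundex(name))

That is exactly why the four 'Aggrawal' spellings above all collapse to the single code A264.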
/Advanced_Lexial_Processing/edit+distance.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Levenshtein Edit Distance\n",
    "The Levenshtein distance is the number of steps (insertions, deletions or substitutions) required to go from the source string to the target string."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def lev_distance(source='', target=''):\n",
    "    \"\"\"Make a Levenshtein Distances Matrix\"\"\"\n",
    "    \n",
    "    # get length of both strings\n",
    "    n1, n2 = len(source), len(target)\n",
    "    \n",
    "    # create matrix using length of both strings - source string sits on columns, target string sits on rows\n",
    "    matrix = [ [ 0 for i1 in range(n1 + 1) ] for i2 in range(n2 + 1) ]\n",
    "    \n",
    "    # fill the first row - (1 to n1)\n",
    "    for i1 in range(1, n1 + 1):\n",
    "        matrix[0][i1] = i1\n",
    "    \n",
    "    # fill the first column - (1 to n2)\n",
    "    for i2 in range(1, n2 + 1):\n",
    "        matrix[i2][0] = i2\n",
    "    \n",
    "    # fill the matrix\n",
    "    for i2 in range(1, n2 + 1):\n",
    "        for i1 in range(1, n1 + 1):\n",
    "            \n",
    "            # check whether letters being compared are same\n",
    "            if (source[i1-1] == target[i2-1]):\n",
    "                value = matrix[i2-1][i1-1]           # top-left cell value\n",
    "            else:\n",
    "                value = min(matrix[i2-1][i1] + 1,    # top cell value + 1\n",
    "                            matrix[i2][i1-1] + 1,    # left cell value + 1\n",
    "                            matrix[i2-1][i1-1] + 1)  # top-left cell value + 1\n",
    "            \n",
    "            matrix[i2][i1] = value\n",
    "    \n",
    "    # return bottom-right cell value\n",
    "    return matrix[-1][-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lev_distance('cat', 'cta')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Levenshtein distance in the nltk library"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# import library\n",
    "from nltk.metrics.distance import edit_distance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "edit_distance(\"apple\", \"appel\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Damerau-Levenshtein Distance\n",
    "The Damerau-Levenshtein distance allows transpositions (swaps of two adjacent letters) as well."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "edit_distance(\"apple\", \"appel\", transpositions=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
--------------------------------------------------------------------------------
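To make the recurrence concrete, here is the matrix lev_distance builds for lev_distance('cat', 'cta') as I trace it by hand (source 'cat' on the columns, target 'cta' on the rows; values reasoned from the recurrence, not program output):

        ''  c  a  t
    ''   0  1  2  3
    c    1  0  1  2
    t    2  1  1  1
    a    3  2  1  2

The bottom-right cell is 2 — matching the output above — because plain Levenshtein must pay one substitution for each of the two swapped letters, while Damerau-Levenshtein counts the adjacent swap 'at' → 'ta' as a single transposition.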
/Introduction_to_NLP/Bonus+exercise.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import re"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Q1. \n",
    "Write a regular expression to match all the files that have either .exe, .xml or .jar extensions. A valid file name can contain letters, digits and underscores, followed by the extension."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "files = ['employees.xml', 'calculator.jar', 'nfsmw.exe', 'bkgrnd001.jpg', 'sales_report.ppt']\n",
    "\n",
    "result = []\n",
    "\n",
    "# write your code here\n",
    "\n",
    "# print result - result should only contain the items that match the pattern. In this case, result should be ['employees.xml', 'calculator.jar', 'nfsmw.exe']\n",
    "print(result)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Q2. \n",
    "Write a regular expression to match all the addresses that have Koramangala embedded in them.\n",
    "\n",
    "Strings that should match:\n",
    "* 466, 5th block, Koramangala, Bangalore\n",
    "* 4th BLOCK, KORAMANGALA - 560034\n",
    "\n",
    "Strings that shouldn't match:\n",
    "* 999, St. Marks Road, Bangalore\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "addresses = ['466, 5th block, Koramangala, Bangalore', '4th BLOCK, KORAMANGALA - 560034', '999, St. Marks Road, Bangalore']\n",
    "\n",
    "result = []\n",
    "\n",
    "# write your code here\n",
    "\n",
    "\n",
    "# print result - result should only contain the items that match the pattern\n",
    "print(result)"
   ]
  },
\n", 75 | "Write a regular expression that matches either integer numbers or floats upto 2 decimal places.\n", 76 | "\n", 77 | "Strings that should match: \n", 78 | "* 2\n", 79 | "* 2.3\n", 80 | "* 4.56\n", 81 | "* .61\n", 82 | "\n", 83 | "Strings that shoudln't match:\n", 84 | "* 4.567\n", 85 | "* 75.8792\n", 86 | "* abc\n" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "numbers = ['2', '2.3', '4.56', '.61', '4.567', '75.8792', 'abc']\n", 96 | "\n", 97 | "result = []\n", 98 | "\n", 99 | "# write your code here\n", 100 | "\n", 101 | "\n", 102 | "# print result - result should only contain the items that match the pattern\n", 103 | "print(result)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "### Q4. \n", 111 | "Write a regular expression to match the model names of smartphones which follow the following pattern: \n", 112 | "\n", 113 | "mobile company name followed by underscore followed by model name followed by underscore followed by model number\n", 114 | "\n", 115 | "Strings that should match:\n", 116 | "* apple_iphone_6\n", 117 | "* samsung_note_4\n", 118 | "* google_pixel_2\n", 119 | "\n", 120 | "Strings that shouldn’t match:\n", 121 | "* apple_6\n", 122 | "* iphone_6\n", 123 | "* google\\_pixel\\_\n" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "phones = ['apple_iphone_6', 'samsung_note_4', 'google_pixel_2', 'apple_6', 'iphone_6', 'google_pixel_']\n", 133 | "\n", 134 | "result = []\n", 135 | "\n", 136 | "# write your code here\n", 137 | "\n", 138 | "\n", 139 | "# print result - result should only contain the items that match the pattern\n", 140 | "print(result)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "### Q5.\n", 148 | "Write a regular expression that can be used to match the emails present in a database. \n", 149 | "\n", 150 | "The pattern of a valid email address is defined as follows:\n", 151 | "The '@' character can be preceded either by alphanumeric characters, period characters or underscore characters. The length of the part that precedes the '@' character should be between 4 to 20 characters.\n", 152 | "\n", 153 | "The '@' character should be followed by a domain name (e.g. gmail.com). The domain name has three parts - a prefix (e.g. 'gmail'), the period character and a suffix (e.g. 'com'). 
The prefix can be between 3 and 15 characters long, followed by the period character and one of these suffixes - 'com', 'in' or 'org'.\n",
    "\n",
    "\n",
    "Emails that should match:\n",
    "* random.guy123@gmail.com\n",
    "* mr_x_in_bombay@gov.in\n",
    "\n",
    "Emails that shouldn't match:\n",
    "* 1@ued.org\n",
    "* @gmail.com\n",
    "* abc!@yahoo.in\n",
    "* sam_12@gov.us\n",
    "* neeraj@"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "emails = ['random.guy123@gmail.com', 'mr_x_in_bombay@gov.in', '1@ued.org',\n",
    "          '@gmail.com','abc!@yahoo.in', 'sam_12@gov.us', 'neeraj@']\n",
    "\n",
    "result = []\n",
    "\n",
    "# write your code here\n",
    "\n",
    "\n",
    "# print result - result should only contain the items that match the pattern\n",
    "print(result)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
--------------------------------------------------------------------------------
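All five exercises share the same test loop; a minimal harness sketch for trying candidate patterns (my own helper — the notebook leaves the loop to the reader):

    import re

    def matching(items, pattern, flags=0):
        # keep only the strings the anchored pattern matches
        return [s for s in items if re.search(pattern, s, flags) is not None]

    # e.g. one workable Q1 pattern (an assumption, not the official answer):
    # matching(files, r'^[A-Za-z0-9_]+\.(exe|xml|jar)$')

The solution notebook further down uses exactly this search-and-append structure, adding re.I where case-insensitive matching is needed.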
/Advanced_Lexial_Processing/spell-corrector.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import re\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# function to tokenise words\n",
    "def words(document):\n",
    "    \"Convert text to lower case and tokenise the document\"\n",
    "    return re.findall(r'\\w+', document.lower())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# create a frequency table of all the words of the document\n",
    "all_words = Counter(words(open('big.txt').read()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# check frequency of a random word, say, 'chair'\n",
    "all_words['chair']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# look at top 10 frequent words\n",
    "all_words.most_common(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def edits_one(word):\n",
    "    \"Create all edits that are one edit away from `word`.\"\n",
    "    alphabets = 'abcdefghijklmnopqrstuvwxyz'\n",
    "    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]\n",
    "    deletes = [left + right[1:] for left, right in splits if right]\n",
    "    inserts = [left + c + right for left, right in splits for c in alphabets]\n",
    "    replaces = [left + c + right[1:] for left, right in splits if right for c in alphabets]\n",
    "    transposes = [left + right[1] + right[0] + right[2:] for left, right in splits if len(right) > 1]\n",
    "    return set(deletes + inserts + replaces + transposes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def edits_two(word):\n",
    "    \"Create all edits that are two edits away from `word`.\"\n",
    "    return (e2 for e1 in edits_one(word) for e2 in edits_one(e1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def known(words):\n",
    "    \"The subset of `words` that appear in `all_words`.\"\n",
    "    return set(word for word in words if word in all_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def possible_corrections(word):\n",
    "    \"Generate possible spelling corrections for word.\"\n",
    "    return (known([word]) or known(edits_one(word)) or known(edits_two(word)) or [word])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def prob(word, N=sum(all_words.values())): \n",
    "    \"Probability of `word`: Number of appearances of 'word' / total number of tokens\"\n",
    "    return all_words[word] / N"
   ]
  },
"source": [ 204 | "# test spell check\n", 205 | "print(spell_check(\"monney\"))" 206 | ] 207 | } 208 | ], 209 | "metadata": { 210 | "kernelspec": { 211 | "display_name": "Python 3", 212 | "language": "python", 213 | "name": "python3" 214 | }, 215 | "language_info": { 216 | "codemirror_mode": { 217 | "name": "ipython", 218 | "version": 3 219 | }, 220 | "file_extension": ".py", 221 | "mimetype": "text/x-python", 222 | "name": "python", 223 | "nbconvert_exporter": "python", 224 | "pygments_lexer": "ipython3", 225 | "version": "3.6.4" 226 | } 227 | }, 228 | "nbformat": 4, 229 | "nbformat_minor": 2 230 | } 231 | -------------------------------------------------------------------------------- /Introduction_to_NLP/Bonus+exercise+with+solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import re" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "### Q1.\n", 19 | "Write a regular expression to match all the files that have either .exe, .xml or .jar extensions. A valid file name can contain any alphabet, digit and underscore followed by the extension." 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "name": "stdout", 29 | "output_type": "stream", 30 | "text": [ 31 | "['employees.xml', 'calculator.jar', 'nfsmw.exe']\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "files = ['employees.xml', 'calculator.jar', 'nfsmw.exe', 'bkgrnd001.jpg', 'sales_report.ppt']\n", 37 | "\n", 38 | "pattern = \"^.+\\.(xml|jar|exe)$\"\n", 39 | "\n", 40 | "result = []\n", 41 | "\n", 42 | "for file in files:\n", 43 | " match = re.search(pattern, file)\n", 44 | " if match !=None:\n", 45 | " result.append(file)\n", 46 | "\n", 47 | "# print result - result should only contain the items that match the pattern\n", 48 | "print(result)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "### Q2\n", 56 | "Write a regular expression to match all the addresses that have Koramangala embedded in them.\n", 57 | "\n", 58 | "Strings that should match:\n", 59 | "* 466, 5th block, Koramangala, Bangalore\n", 60 | "* 4th BLOCK, KORAMANGALA - 560034\n", 61 | "\n", 62 | "Strings that shouldn't match:\n", 63 | "* 999, St. Marks Road, Bangalore\n" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "name": "stdout", 73 | "output_type": "stream", 74 | "text": [ 75 | "['466, 5th block, Koramangala, Bangalore', '4th BLOCK, KORAMANGALA - 560034']\n" 76 | ] 77 | } 78 | ], 79 | "source": [ 80 | "addresses = ['466, 5th block, Koramangala, Bangalore', '4th BLOCK, KORAMANGALA - 560034', '999, St. Marks Road, Bangalore']\n", 81 | "\n", 82 | "pattern = \"^[\\w\\d\\s,-]*koramangala[\\w\\d\\s,-]*$\"\n", 83 | "\n", 84 | "result = []\n", 85 | "\n", 86 | "for address in addresses:\n", 87 | " match = re.search(pattern, address, re.I)\n", 88 | " if match !=None:\n", 89 | " result.append(address)\n", 90 | "\n", 91 | "# print result - result should only contain the items that match the pattern\n", 92 | "print(result)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "### Q3. 
\n", 100 | "Write a regular expression that matches either integer numbers or floats upto 2 decimal places.\n", 101 | "\n", 102 | "Strings that should match: \n", 103 | "* 2\n", 104 | "* 2.3\n", 105 | "* 4.56\n", 106 | "* .61\n", 107 | "\n", 108 | "Strings that shoudln't match:\n", 109 | "* 4.567\n", 110 | "* 75.8792\n", 111 | "* abc\n" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 4, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "['2', '2.3', '4.56', '.61']\n" 124 | ] 125 | } 126 | ], 127 | "source": [ 128 | "numbers = ['2', '2.3', '4.56', '.61', '4.567', '75.8792', 'abc']\n", 129 | "\n", 130 | "pattern = \"^[0-9]*(\\.[0-9]{,2})?$\"\n", 131 | "\n", 132 | "result = []\n", 133 | "\n", 134 | "for number in numbers:\n", 135 | " match = re.search(pattern, number)\n", 136 | " if match != None:\n", 137 | " result.append(number)\n", 138 | "\n", 139 | "# print result - result should only contain the items that match the pattern\n", 140 | "print(result)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "### Q4. \n", 148 | "Write a regular expression to match the model names of smartphones which follow the following pattern: \n", 149 | "\n", 150 | "mobile company name followed by underscore followed by model name followed by underscore followed by model number\n", 151 | "\n", 152 | "Strings that should match:\n", 153 | "* apple_iphone_6\n", 154 | "* samsung_note_4\n", 155 | "* google_pixel_2\n", 156 | "\n", 157 | "Strings that shouldn’t match:\n", 158 | "* apple_6\n", 159 | "* iphone_6\n", 160 | "* google\\_pixel\\_\n" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 5, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "['apple_iphone_6', 'samsung_note_4', 'google_pixel_2']\n" 173 | ] 174 | } 175 | ], 176 | "source": [ 177 | "phones = ['apple_iphone_6', 'samsung_note_4', 'google_pixel_2', 'apple_6', 'iphone_6', 'google_pixel_']\n", 178 | "\n", 179 | "pattern = \"^.*_.*_\\d$\"\n", 180 | "\n", 181 | "result = []\n", 182 | "\n", 183 | "for phone in phones:\n", 184 | " match = re.search(pattern, phone)\n", 185 | " if match !=None:\n", 186 | " result.append(phone)\n", 187 | "\n", 188 | "# print result - result should only contain the items that match the pattern\n", 189 | "print(result)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "### Q5. \n", 197 | "Write a regular expression that can be used to match the emails present in a database. \n", 198 | "\n", 199 | "The pattern of a valid email address is defined as follows:\n", 200 | "The '@' character can be preceded either by alphanumeric characters, period characters or underscore characters. The length of the part that precedes the '@' character should be between 4 to 20 characters.\n", 201 | "\n", 202 | "The '@' character should be followed by a domain name (e.g. gmail.com). The domain name has three parts - a prefix (e.g. 'gmail'), the period character and a suffix (e.g. 'com'). 
The prefix can be between 3 and 15 characters long, followed by the period character and one of these suffixes - 'com', 'in' or 'org'.\n",
    "\n",
    "\n",
    "Emails that should match:\n",
    "* random.guy123@gmail.com\n",
    "* mr_x_in_bombay@gov.in\n",
    "\n",
    "Emails that shouldn't match:\n",
    "* 1@ued.org\n",
    "* @gmail.com\n",
    "* abc!@yahoo.in\n",
    "* sam_12@gov.us\n",
    "* neeraj@"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['random.guy123@gmail.com', 'mr_x_in_bombay@gov.in']\n"
     ]
    }
   ],
   "source": [
    "emails = ['random.guy123@gmail.com', 'mr_x_in_bombay@gov.in', '1@ued.org',\n",
    "          '@gmail.com', 'abc!@yahoo.in', 'sam_12@gov.us', 'neeraj@']\n",
    "\n",
    "pattern = \"^[a-z_.0-9]{4,20}@[a-z]{3,15}\\\\.(com|in|org)$\"\n",
    "\n",
    "result = []\n",
    "\n",
    "for email in emails:\n",
    "    match = re.search(pattern, email, re.I)\n",
    "    if match != None:\n",
    "        result.append(email)\n",
    "\n",
    "# print result - result should only contain the items that match the pattern\n",
    "print(result)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
--------------------------------------------------------------------------------
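One caveat about the Q3 solution worth flagging: the pattern ^[0-9]*(\.[0-9]{,2})?$ also accepts the empty string and forms like '2.' (the leading [0-9]* may match nothing, and {,2} allows zero digits after the period); it behaves correctly on the given list only because no such input appears. A tighter alternative sketch (my own, not the notebook's answer):

    pattern = r"^(\d+|\d*\.\d{1,2})$"   # an integer, or a float with 1-2 digits after the dot

Both patterns agree on every string in the `numbers` list; they differ only on edge cases like '' and '2.'.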
/Basic_Lexical_Processing/tokenisation .ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tokenisation\n",
    "\n",
    "The notebook contains four types of tokenisation techniques:\n",
    "1. Word tokenisation\n",
    "2. Sentence tokenisation\n",
    "3. Tweet tokenisation\n",
    "4. Custom tokenisation using regular expressions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. Word tokenisation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "At nine o'clock I visited him myself. It looks like religious mania, and he'll soon think that he himself is God.\n"
     ]
    }
   ],
   "source": [
    "document = \"At nine o'clock I visited him myself. It looks like religious mania, and he'll soon think that he himself is God.\"\n",
    "print(document)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Tokenising on spaces using python"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['At', 'nine', \"o'clock\", 'I', 'visited', 'him', 'myself.', 'It', 'looks', 'like', 'religious', 'mania,', 'and', \"he'll\", 'soon', 'think', 'that', 'he', 'himself', 'is', 'God.']\n"
     ]
    }
   ],
   "source": [
    "print(document.split())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Tokenising using nltk word tokeniser"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from nltk.tokenize import word_tokenize\n",
    "words = word_tokenize(document)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['At', 'nine', \"o'clock\", 'I', 'visited', 'him', 'myself', '.', 'It', 'looks', 'like', 'religious', 'mania', ',', 'and', 'he', \"'ll\", 'soon', 'think', 'that', 'he', 'himself', 'is', 'God', '.']\n"
     ]
    }
   ],
   "source": [
    "print(words)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "NLTK's word tokeniser not only breaks on whitespaces but also breaks contraction words such as he'll into \"he\" and \"'ll\". On the other hand, it doesn't break \"o'clock\", treating it as a single token."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. Sentence tokeniser"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Tokenising based on sentences requires you to split on the period ('.'). Let's use the nltk sentence tokeniser."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from nltk.tokenize import sent_tokenize\n",
    "sentences = sent_tokenize(document)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[\"At nine o'clock I visited him myself.\", \"It looks like religious mania, and he'll soon think that he himself is God.\"]\n"
     ]
    }
   ],
   "source": [
    "print(sentences)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. Tweet tokeniser"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A problem with the word tokeniser is that it fails to tokenise emojis and other complex special characters such as words with hashtags. Emojis are common these days and people use them all the time."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "message = \"i recently watched this show called mindhunters:). i totally loved it 😍. it was gr8 <3. #bingewatching #nothingtodo 😎\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['i', 'recently', 'watched', 'this', 'show', 'called', 'mindhunters', ':', ')', '.', 'i', 'totally', 'loved', 'it', '😍', '.', 'it', 'was', 'gr8', '<', '3', '.', '#', 'bingewatching', '#', 'nothingtodo', '😎']\n"
     ]
    }
   ],
   "source": [
    "print(word_tokenize(message))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The word tokeniser breaks the emoji '<3' into '<' and '3', which is something that we don't want. Emojis have their own significance in areas like sentiment analysis, where a happy or sad face alone can prove to be a really good predictor of the sentiment. Similarly, the hashtags are broken into two tokens. A hashtag is used for searching specific topics or photos in social media apps such as Instagram and Facebook, so there you want to use the hashtag as is.\n",
    "\n",
    "Let's use the tweet tokeniser of nltk to tokenise this message."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from nltk.tokenize import TweetTokenizer\n",
    "tknzr = TweetTokenizer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['i',\n",
       " 'recently',\n",
       " 'watched',\n",
       " 'this',\n",
       " 'show',\n",
       " 'called',\n",
       " 'mindhunters',\n",
       " ':)',\n",
       " '.',\n",
       " 'i',\n",
       " 'totally',\n",
       " 'loved',\n",
       " 'it',\n",
       " '😍',\n",
       " '.',\n",
       " 'it',\n",
       " 'was',\n",
       " 'gr8',\n",
       " '<3',\n",
       " '.',\n",
       " '#bingewatching',\n",
       " '#nothingtodo',\n",
       " '😎']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tknzr.tokenize(message)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As you can see, it handles all the emojis and the hashtags pretty well.\n",
    "\n",
    "Now, there is a tokeniser that takes a regular expression and returns the tokens that match that pattern.\n",
    "\n",
    "Let's look at how you can use the regular expression tokeniser."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from nltk.tokenize import regexp_tokenize\n",
    "message = \"i recently watched this show called mindhunters:). i totally loved it 😍. it was gr8 <3. #bingewatching #nothingtodo 😎\"\n",
    "pattern = \"#[\\\\w]+\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['#bingewatching', '#nothingtodo']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "regexp_tokenize(message, pattern)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
--------------------------------------------------------------------------------
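TweetTokenizer also takes a few useful constructor options; a short sketch (flag names from nltk's documented API; the output shown is my expectation, not recorded above):

    from nltk.tokenize import TweetTokenizer

    # strip_handles drops @mentions; reduce_len trims runs like 'heyyyyy' to 'heyyy'
    tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
    tknzr.tokenize("@user heyyyyy #nlp 😎")   # expected: ['heyyy', '#nlp', '😎']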
/Basic_Lexical_Processing/stemming.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Stemming"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# import libraries\n",
    "import pandas as pd\n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.stem.porter import PorterStemmer\n",
    "from nltk.stem.snowball import SnowballStemmer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Very orderly and methodical he looked, with a hand on each knee, and a loud watch ticking a sonorous sermon under his flapped newly bought waist-coat, as though it pitted its gravity and longevity against the levity and evanescence of the brisk fire.\n"
     ]
    }
   ],
   "source": [
    "text = \"Very orderly and methodical he looked, with a hand on each knee, and a loud watch ticking a sonorous sermon under his flapped newly bought waist-coat, as though it pitted its gravity and longevity against the levity and evanescence of the brisk fire.\"\n",
    "print(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['very', 'orderly', 'and', 'methodical', 'he', 'looked', ',', 'with', 'a', 'hand', 'on', 'each', 'knee', ',', 'and', 'a', 'loud', 'watch', 'ticking', 'a', 'sonorous', 'sermon', 'under', 'his', 'flapped', 'newly', 'bought', 'waist-coat', ',', 'as', 'though', 'it', 'pitted', 'its', 'gravity', 'and', 'longevity', 'against', 'the', 'levity', 'and', 'evanescence', 'of', 'the', 'brisk', 'fire', '.']\n"
     ]
    }
   ],
   "source": [
    "tokens = word_tokenize(text.lower())\n",
    "print(tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['veri', 'orderli', 'and', 'method', 'he', 'look', ',', 'with', 'a', 'hand', 'on', 'each', 'knee', ',', 'and', 'a', 'loud', 'watch', 'tick', 'a', 'sonor', 'sermon', 'under', 'hi', 'flap', 'newli', 'bought', 'waist-coat', ',', 'as', 'though', 'it', 'pit', 'it', 'graviti', 'and', 'longev', 'against', 'the', 'leviti', 'and', 'evanesc', 'of', 'the', 'brisk', 'fire', '.']\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "47"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stemmer = PorterStemmer()\n",
    "porter_stemmed = [stemmer.stem(token) for token in tokens]\n",
    "print(porter_stemmed)\n",
    "len(porter_stemmed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['veri', 'order', 'and', 'method', 'he', 'look', ',', 'with', 'a', 'hand', 'on', 'each', 'knee', ',', 'and', 'a', 'loud', 'watch', 'tick', 'a', 'sonor', 'sermon', 'under', 'his', 'flap', 'newli', 'bought', 'waist-coat', ',', 'as', 'though', 'it', 'pit', 'it', 'graviti', 'and', 'longev', 'against', 'the', 'leviti', 'and', 'evanesc', 'of', 'the', 'brisk', 'fire', '.']\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "47"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# snowball stemmer\n",
    "stemmer = SnowballStemmer(\"english\")\n",
    "snowball_stemmed = [stemmer.stem(token) for token in tokens]\n",
    "print(snowball_stemmed)\n",
    "len(snowball_stemmed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df = pd.DataFrame({'token': tokens, 'porter_stemmed': porter_stemmed, 'snowball_stemmed': snowball_stemmed})\n",
    "df = df[['token', 'porter_stemmed', 'snowball_stemmed']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
| \n", 161 | " | token | \n", 162 | "porter_stemmed | \n", 163 | "snowball_stemmed | \n", 164 | "
|---|---|---|---|
| 0 | \n", 169 | "very | \n", 170 | "veri | \n", 171 | "veri | \n", 172 | "
| 1 | \n", 175 | "orderly | \n", 176 | "orderli | \n", 177 | "order | \n", 178 | "
| 3 | \n", 181 | "methodical | \n", 182 | "method | \n", 183 | "method | \n", 184 | "
| 5 | \n", 187 | "looked | \n", 188 | "look | \n", 189 | "look | \n", 190 | "
| 18 | \n", 193 | "ticking | \n", 194 | "tick | \n", 195 | "tick | \n", 196 | "
| 20 | \n", 199 | "sonorous | \n", 200 | "sonor | \n", 201 | "sonor | \n", 202 | "
| 23 | \n", 205 | "his | \n", 206 | "hi | \n", 207 | "his | \n", 208 | "
| 24 | \n", 211 | "flapped | \n", 212 | "flap | \n", 213 | "flap | \n", 214 | "
| 25 | \n", 217 | "newly | \n", 218 | "newli | \n", 219 | "newli | \n", 220 | "
| 32 | \n", 223 | "pitted | \n", 224 | "pit | \n", 225 | "pit | \n", 226 | "
| 33 | \n", 229 | "its | \n", 230 | "it | \n", 231 | "it | \n", 232 | "
| 34 | \n", 235 | "gravity | \n", 236 | "graviti | \n", 237 | "graviti | \n", 238 | "
| 36 | \n", 241 | "longevity | \n", 242 | "longev | \n", 243 | "longev | \n", 244 | "
| 39 | \n", 247 | "levity | \n", 248 | "leviti | \n", 249 | "leviti | \n", 250 | "
| 41 | \n", 253 | "evanescence | \n", 254 | "evanesc | \n", 255 | "evanesc | \n", 256 | "
| \n", 460 | " | 000 | \n", 461 | "07732584351 | \n", 462 | "0800 | \n", 463 | "08000930705 | \n", 464 | "08002986030 | \n", 465 | "08452810075over18 | \n", 466 | "09061209465 | \n", 467 | "09061701461 | \n", 468 | "09066364589 | \n", 469 | "10 | \n", 470 | "100 | \n", 471 | "1000 | \n", 472 | "10am | \n", 473 | "11 | \n", 474 | "12 | \n", 475 | "1500 | \n", 476 | "150p | \n", 477 | "150pm | \n", 478 | "16 | \n", 479 | "169 | \n", 480 | "18 | \n", 481 | "20 | \n", 482 | "2005 | \n", 483 | "21st | \n", 484 | "2nd | \n", 485 | "3aj | \n", 486 | "4403ldnw1a7rw18 | \n", 487 | "450ppw | \n", 488 | "4txt | \n", 489 | "50 | \n", 490 | "5000 | \n", 491 | "5249 | \n", 492 | "530 | \n", 493 | "5we | \n", 494 | "6031 | \n", 495 | "6days | \n", 496 | "81010 | \n", 497 | "85069 | \n", 498 | "87077 | \n", 499 | "87121 | \n", 500 | "... | \n", 501 | "wat | \n", 502 | "watching | \n", 503 | "watts | \n", 504 | "way | \n", 505 | "weak | \n", 506 | "week | \n", 507 | "weekend | \n", 508 | "well | \n", 509 | "wen | \n", 510 | "wet | \n", 511 | "whole | \n", 512 | "wif | \n", 513 | "win | \n", 514 | "wings | \n", 515 | "winner | \n", 516 | "wk | \n", 517 | "wkly | \n", 518 | "wo | \n", 519 | "wonderful | \n", 520 | "wont | \n", 521 | "word | \n", 522 | "words | \n", 523 | "work | \n", 524 | "world | \n", 525 | "worried | \n", 526 | "wow | \n", 527 | "wun | \n", 528 | "www | \n", 529 | "xuhui | \n", 530 | "xx | \n", 531 | "xxx | \n", 532 | "xxxmobilemovieclub | \n", 533 | "yeah | \n", 534 | "year | \n", 535 | "yes | \n", 536 | "yesterday | \n", 537 | "yo | \n", 538 | "yummy | \n", 539 | "yup | \n", 540 | "ú1 | \n", 541 | "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", 546 | "0 | \n", 547 | "0 | \n", 548 | "0 | \n", 549 | "0 | \n", 550 | "0 | \n", 551 | "0 | \n", 552 | "0 | \n", 553 | "0 | \n", 554 | "0 | \n", 555 | "0 | \n", 556 | "0 | \n", 557 | "0 | \n", 558 | "0 | \n", 559 | "0 | \n", 560 | "0 | \n", 561 | "0 | \n", 562 | "0 | \n", 563 | "0 | \n", 564 | "0 | \n", 565 | "0 | \n", 566 | "0 | \n", 567 | "0 | \n", 568 | "0 | \n", 569 | "0 | \n", 570 | "0 | \n", 571 | "0 | \n", 572 | "0 | \n", 573 | "0 | \n", 574 | "0 | \n", 575 | "0 | \n", 576 | "0 | \n", 577 | "0 | \n", 578 | "0 | \n", 579 | "0 | \n", 580 | "0 | \n", 581 | "0 | \n", 582 | "0 | \n", 583 | "0 | \n", 584 | "0 | \n", 585 | "0 | \n", 586 | "... | \n", 587 | "1 | \n", 588 | "0 | \n", 589 | "0 | \n", 590 | "0 | \n", 591 | "0 | \n", 592 | "0 | \n", 593 | "0 | \n", 594 | "0 | \n", 595 | "0 | \n", 596 | "0 | \n", 597 | "0 | \n", 598 | "0 | \n", 599 | "0 | \n", 600 | "0 | \n", 601 | "0 | \n", 602 | "0 | \n", 603 | "0 | \n", 604 | "0 | \n", 605 | "0 | \n", 606 | "0 | \n", 607 | "0 | \n", 608 | "0 | \n", 609 | "0 | \n", 610 | "1 | \n", 611 | "0 | \n", 612 | "0 | \n", 613 | "0 | \n", 614 | "0 | \n", 615 | "0 | \n", 616 | "0 | \n", 617 | "0 | \n", 618 | "0 | \n", 619 | "0 | \n", 620 | "0 | \n", 621 | "0 | \n", 622 | "0 | \n", 623 | "0 | \n", 624 | "0 | \n", 625 | "0 | \n", 626 | "0 | \n", 627 | "
| 1 | \n", 630 | "0 | \n", 631 | "0 | \n", 632 | "0 | \n", 633 | "0 | \n", 634 | "0 | \n", 635 | "0 | \n", 636 | "0 | \n", 637 | "0 | \n", 638 | "0 | \n", 639 | "0 | \n", 640 | "0 | \n", 641 | "0 | \n", 642 | "0 | \n", 643 | "0 | \n", 644 | "0 | \n", 645 | "0 | \n", 646 | "0 | \n", 647 | "0 | \n", 648 | "0 | \n", 649 | "0 | \n", 650 | "0 | \n", 651 | "0 | \n", 652 | "0 | \n", 653 | "0 | \n", 654 | "0 | \n", 655 | "0 | \n", 656 | "0 | \n", 657 | "0 | \n", 658 | "0 | \n", 659 | "0 | \n", 660 | "0 | \n", 661 | "0 | \n", 662 | "0 | \n", 663 | "0 | \n", 664 | "0 | \n", 665 | "0 | \n", 666 | "0 | \n", 667 | "0 | \n", 668 | "0 | \n", 669 | "0 | \n", 670 | "... | \n", 671 | "0 | \n", 672 | "0 | \n", 673 | "0 | \n", 674 | "0 | \n", 675 | "0 | \n", 676 | "0 | \n", 677 | "0 | \n", 678 | "0 | \n", 679 | "0 | \n", 680 | "0 | \n", 681 | "0 | \n", 682 | "1 | \n", 683 | "0 | \n", 684 | "0 | \n", 685 | "0 | \n", 686 | "0 | \n", 687 | "0 | \n", 688 | "0 | \n", 689 | "0 | \n", 690 | "0 | \n", 691 | "0 | \n", 692 | "0 | \n", 693 | "0 | \n", 694 | "0 | \n", 695 | "0 | \n", 696 | "0 | \n", 697 | "0 | \n", 698 | "0 | \n", 699 | "0 | \n", 700 | "0 | \n", 701 | "0 | \n", 702 | "0 | \n", 703 | "0 | \n", 704 | "0 | \n", 705 | "0 | \n", 706 | "0 | \n", 707 | "0 | \n", 708 | "0 | \n", 709 | "0 | \n", 710 | "0 | \n", 711 | "
| 2 | \n", 714 | "0 | \n", 715 | "0 | \n", 716 | "0 | \n", 717 | "0 | \n", 718 | "0 | \n", 719 | "1 | \n", 720 | "0 | \n", 721 | "0 | \n", 722 | "0 | \n", 723 | "0 | \n", 724 | "0 | \n", 725 | "0 | \n", 726 | "0 | \n", 727 | "0 | \n", 728 | "0 | \n", 729 | "0 | \n", 730 | "0 | \n", 731 | "0 | \n", 732 | "0 | \n", 733 | "0 | \n", 734 | "0 | \n", 735 | "0 | \n", 736 | "1 | \n", 737 | "1 | \n", 738 | "0 | \n", 739 | "0 | \n", 740 | "0 | \n", 741 | "0 | \n", 742 | "0 | \n", 743 | "0 | \n", 744 | "0 | \n", 745 | "0 | \n", 746 | "0 | \n", 747 | "0 | \n", 748 | "0 | \n", 749 | "0 | \n", 750 | "0 | \n", 751 | "0 | \n", 752 | "0 | \n", 753 | "1 | \n", 754 | "... | \n", 755 | "0 | \n", 756 | "0 | \n", 757 | "0 | \n", 758 | "0 | \n", 759 | "0 | \n", 760 | "0 | \n", 761 | "0 | \n", 762 | "0 | \n", 763 | "0 | \n", 764 | "0 | \n", 765 | "0 | \n", 766 | "0 | \n", 767 | "1 | \n", 768 | "0 | \n", 769 | "0 | \n", 770 | "0 | \n", 771 | "1 | \n", 772 | "0 | \n", 773 | "0 | \n", 774 | "0 | \n", 775 | "0 | \n", 776 | "0 | \n", 777 | "0 | \n", 778 | "0 | \n", 779 | "0 | \n", 780 | "0 | \n", 781 | "0 | \n", 782 | "0 | \n", 783 | "0 | \n", 784 | "0 | \n", 785 | "0 | \n", 786 | "0 | \n", 787 | "0 | \n", 788 | "0 | \n", 789 | "0 | \n", 790 | "0 | \n", 791 | "0 | \n", 792 | "0 | \n", 793 | "0 | \n", 794 | "0 | \n", 795 | "
| 3 | \n", 798 | "0 | \n", 799 | "0 | \n", 800 | "0 | \n", 801 | "0 | \n", 802 | "0 | \n", 803 | "0 | \n", 804 | "0 | \n", 805 | "0 | \n", 806 | "0 | \n", 807 | "0 | \n", 808 | "0 | \n", 809 | "0 | \n", 810 | "0 | \n", 811 | "0 | \n", 812 | "0 | \n", 813 | "0 | \n", 814 | "0 | \n", 815 | "0 | \n", 816 | "0 | \n", 817 | "0 | \n", 818 | "0 | \n", 819 | "0 | \n", 820 | "0 | \n", 821 | "0 | \n", 822 | "0 | \n", 823 | "0 | \n", 824 | "0 | \n", 825 | "0 | \n", 826 | "0 | \n", 827 | "0 | \n", 828 | "0 | \n", 829 | "0 | \n", 830 | "0 | \n", 831 | "0 | \n", 832 | "0 | \n", 833 | "0 | \n", 834 | "0 | \n", 835 | "0 | \n", 836 | "0 | \n", 837 | "0 | \n", 838 | "... | \n", 839 | "0 | \n", 840 | "0 | \n", 841 | "0 | \n", 842 | "0 | \n", 843 | "0 | \n", 844 | "0 | \n", 845 | "0 | \n", 846 | "0 | \n", 847 | "0 | \n", 848 | "0 | \n", 849 | "0 | \n", 850 | "0 | \n", 851 | "0 | \n", 852 | "0 | \n", 853 | "0 | \n", 854 | "0 | \n", 855 | "0 | \n", 856 | "0 | \n", 857 | "0 | \n", 858 | "0 | \n", 859 | "0 | \n", 860 | "0 | \n", 861 | "0 | \n", 862 | "0 | \n", 863 | "0 | \n", 864 | "0 | \n", 865 | "0 | \n", 866 | "0 | \n", 867 | "0 | \n", 868 | "0 | \n", 869 | "0 | \n", 870 | "0 | \n", 871 | "0 | \n", 872 | "0 | \n", 873 | "0 | \n", 874 | "0 | \n", 875 | "0 | \n", 876 | "0 | \n", 877 | "0 | \n", 878 | "0 | \n", 879 | "
| 4 | \n", 882 | "0 | \n", 883 | "0 | \n", 884 | "0 | \n", 885 | "0 | \n", 886 | "0 | \n", 887 | "0 | \n", 888 | "0 | \n", 889 | "0 | \n", 890 | "0 | \n", 891 | "0 | \n", 892 | "0 | \n", 893 | "0 | \n", 894 | "0 | \n", 895 | "0 | \n", 896 | "0 | \n", 897 | "0 | \n", 898 | "0 | \n", 899 | "0 | \n", 900 | "0 | \n", 901 | "0 | \n", 902 | "0 | \n", 903 | "0 | \n", 904 | "0 | \n", 905 | "0 | \n", 906 | "0 | \n", 907 | "0 | \n", 908 | "0 | \n", 909 | "0 | \n", 910 | "0 | \n", 911 | "0 | \n", 912 | "0 | \n", 913 | "0 | \n", 914 | "0 | \n", 915 | "0 | \n", 916 | "0 | \n", 917 | "0 | \n", 918 | "0 | \n", 919 | "0 | \n", 920 | "0 | \n", 921 | "0 | \n", 922 | "... | \n", 923 | "0 | \n", 924 | "0 | \n", 925 | "0 | \n", 926 | "0 | \n", 927 | "0 | \n", 928 | "0 | \n", 929 | "0 | \n", 930 | "0 | \n", 931 | "0 | \n", 932 | "0 | \n", 933 | "0 | \n", 934 | "0 | \n", 935 | "0 | \n", 936 | "0 | \n", 937 | "0 | \n", 938 | "0 | \n", 939 | "0 | \n", 940 | "0 | \n", 941 | "0 | \n", 942 | "0 | \n", 943 | "0 | \n", 944 | "0 | \n", 945 | "0 | \n", 946 | "0 | \n", 947 | "0 | \n", 948 | "0 | \n", 949 | "0 | \n", 950 | "0 | \n", 951 | "0 | \n", 952 | "0 | \n", 953 | "0 | \n", 954 | "0 | \n", 955 | "0 | \n", 956 | "0 | \n", 957 | "0 | \n", 958 | "0 | \n", 959 | "0 | \n", 960 | "0 | \n", 961 | "0 | \n", 962 | "0 | \n", 963 | "
| ... | \n", 966 | "... | \n", 967 | "... | \n", 968 | "... | \n", 969 | "... | \n", 970 | "... | \n", 971 | "... | \n", 972 | "... | \n", 973 | "... | \n", 974 | "... | \n", 975 | "... | \n", 976 | "... | \n", 977 | "... | \n", 978 | "... | \n", 979 | "... | \n", 980 | "... | \n", 981 | "... | \n", 982 | "... | \n", 983 | "... | \n", 984 | "... | \n", 985 | "... | \n", 986 | "... | \n", 987 | "... | \n", 988 | "... | \n", 989 | "... | \n", 990 | "... | \n", 991 | "... | \n", 992 | "... | \n", 993 | "... | \n", 994 | "... | \n", 995 | "... | \n", 996 | "... | \n", 997 | "... | \n", 998 | "... | \n", 999 | "... | \n", 1000 | "... | \n", 1001 | "... | \n", 1002 | "... | \n", 1003 | "... | \n", 1004 | "... | \n", 1005 | "... | \n", 1006 | "... | \n", 1007 | "... | \n", 1008 | "... | \n", 1009 | "... | \n", 1010 | "... | \n", 1011 | "... | \n", 1012 | "... | \n", 1013 | "... | \n", 1014 | "... | \n", 1015 | "... | \n", 1016 | "... | \n", 1017 | "... | \n", 1018 | "... | \n", 1019 | "... | \n", 1020 | "... | \n", 1021 | "... | \n", 1022 | "... | \n", 1023 | "... | \n", 1024 | "... | \n", 1025 | "... | \n", 1026 | "... | \n", 1027 | "... | \n", 1028 | "... | \n", 1029 | "... | \n", 1030 | "... | \n", 1031 | "... | \n", 1032 | "... | \n", 1033 | "... | \n", 1034 | "... | \n", 1035 | "... | \n", 1036 | "... | \n", 1037 | "... | \n", 1038 | "... | \n", 1039 | "... | \n", 1040 | "... | \n", 1041 | "... | \n", 1042 | "... | \n", 1043 | "... | \n", 1044 | "... | \n", 1045 | "... | \n", 1046 | "... | \n", 1047 | "
| 95 | \n", 1050 | "0 | \n", 1051 | "0 | \n", 1052 | "0 | \n", 1053 | "0 | \n", 1054 | "0 | \n", 1055 | "0 | \n", 1056 | "0 | \n", 1057 | "0 | \n", 1058 | "0 | \n", 1059 | "0 | \n", 1060 | "0 | \n", 1061 | "0 | \n", 1062 | "0 | \n", 1063 | "0 | \n", 1064 | "0 | \n", 1065 | "0 | \n", 1066 | "0 | \n", 1067 | "0 | \n", 1068 | "1 | \n", 1069 | "0 | \n", 1070 | "0 | \n", 1071 | "0 | \n", 1072 | "0 | \n", 1073 | "0 | \n", 1074 | "0 | \n", 1075 | "0 | \n", 1076 | "0 | \n", 1077 | "1 | \n", 1078 | "0 | \n", 1079 | "0 | \n", 1080 | "0 | \n", 1081 | "1 | \n", 1082 | "0 | \n", 1083 | "0 | \n", 1084 | "0 | \n", 1085 | "0 | \n", 1086 | "0 | \n", 1087 | "1 | \n", 1088 | "0 | \n", 1089 | "0 | \n", 1090 | "... | \n", 1091 | "0 | \n", 1092 | "0 | \n", 1093 | "0 | \n", 1094 | "0 | \n", 1095 | "0 | \n", 1096 | "0 | \n", 1097 | "0 | \n", 1098 | "0 | \n", 1099 | "0 | \n", 1100 | "0 | \n", 1101 | "0 | \n", 1102 | "0 | \n", 1103 | "0 | \n", 1104 | "0 | \n", 1105 | "0 | \n", 1106 | "0 | \n", 1107 | "0 | \n", 1108 | "0 | \n", 1109 | "0 | \n", 1110 | "0 | \n", 1111 | "0 | \n", 1112 | "0 | \n", 1113 | "0 | \n", 1114 | "0 | \n", 1115 | "0 | \n", 1116 | "0 | \n", 1117 | "0 | \n", 1118 | "0 | \n", 1119 | "0 | \n", 1120 | "0 | \n", 1121 | "0 | \n", 1122 | "0 | \n", 1123 | "0 | \n", 1124 | "0 | \n", 1125 | "0 | \n", 1126 | "0 | \n", 1127 | "0 | \n", 1128 | "0 | \n", 1129 | "0 | \n", 1130 | "0 | \n", 1131 | "
| 96 | \n", 1134 | "0 | \n", 1135 | "0 | \n", 1136 | "0 | \n", 1137 | "0 | \n", 1138 | "0 | \n", 1139 | "0 | \n", 1140 | "0 | \n", 1141 | "0 | \n", 1142 | "0 | \n", 1143 | "0 | \n", 1144 | "0 | \n", 1145 | "0 | \n", 1146 | "0 | \n", 1147 | "0 | \n", 1148 | "0 | \n", 1149 | "0 | \n", 1150 | "0 | \n", 1151 | "0 | \n", 1152 | "0 | \n", 1153 | "0 | \n", 1154 | "0 | \n", 1155 | "0 | \n", 1156 | "0 | \n", 1157 | "0 | \n", 1158 | "0 | \n", 1159 | "0 | \n", 1160 | "0 | \n", 1161 | "0 | \n", 1162 | "0 | \n", 1163 | "0 | \n", 1164 | "0 | \n", 1165 | "0 | \n", 1166 | "0 | \n", 1167 | "0 | \n", 1168 | "0 | \n", 1169 | "0 | \n", 1170 | "0 | \n", 1171 | "0 | \n", 1172 | "0 | \n", 1173 | "0 | \n", 1174 | "... | \n", 1175 | "1 | \n", 1176 | "1 | \n", 1177 | "0 | \n", 1178 | "0 | \n", 1179 | "0 | \n", 1180 | "0 | \n", 1181 | "0 | \n", 1182 | "0 | \n", 1183 | "0 | \n", 1184 | "0 | \n", 1185 | "0 | \n", 1186 | "0 | \n", 1187 | "0 | \n", 1188 | "0 | \n", 1189 | "0 | \n", 1190 | "0 | \n", 1191 | "0 | \n", 1192 | "0 | \n", 1193 | "0 | \n", 1194 | "0 | \n", 1195 | "0 | \n", 1196 | "0 | \n", 1197 | "0 | \n", 1198 | "0 | \n", 1199 | "0 | \n", 1200 | "0 | \n", 1201 | "0 | \n", 1202 | "0 | \n", 1203 | "0 | \n", 1204 | "0 | \n", 1205 | "0 | \n", 1206 | "0 | \n", 1207 | "0 | \n", 1208 | "0 | \n", 1209 | "0 | \n", 1210 | "0 | \n", 1211 | "0 | \n", 1212 | "0 | \n", 1213 | "0 | \n", 1214 | "0 | \n", 1215 | "
| 97 | \n", 1218 | "0 | \n", 1219 | "0 | \n", 1220 | "0 | \n", 1221 | "0 | \n", 1222 | "0 | \n", 1223 | "0 | \n", 1224 | "0 | \n", 1225 | "0 | \n", 1226 | "0 | \n", 1227 | "0 | \n", 1228 | "0 | \n", 1229 | "0 | \n", 1230 | "0 | \n", 1231 | "0 | \n", 1232 | "0 | \n", 1233 | "0 | \n", 1234 | "0 | \n", 1235 | "0 | \n", 1236 | "0 | \n", 1237 | "0 | \n", 1238 | "0 | \n", 1239 | "0 | \n", 1240 | "0 | \n", 1241 | "0 | \n", 1242 | "0 | \n", 1243 | "0 | \n", 1244 | "0 | \n", 1245 | "0 | \n", 1246 | "0 | \n", 1247 | "0 | \n", 1248 | "0 | \n", 1249 | "0 | \n", 1250 | "0 | \n", 1251 | "0 | \n", 1252 | "0 | \n", 1253 | "0 | \n", 1254 | "0 | \n", 1255 | "0 | \n", 1256 | "0 | \n", 1257 | "0 | \n", 1258 | "... | \n", 1259 | "0 | \n", 1260 | "0 | \n", 1261 | "0 | \n", 1262 | "0 | \n", 1263 | "0 | \n", 1264 | "0 | \n", 1265 | "0 | \n", 1266 | "0 | \n", 1267 | "0 | \n", 1268 | "0 | \n", 1269 | "0 | \n", 1270 | "0 | \n", 1271 | "0 | \n", 1272 | "0 | \n", 1273 | "0 | \n", 1274 | "0 | \n", 1275 | "0 | \n", 1276 | "0 | \n", 1277 | "0 | \n", 1278 | "0 | \n", 1279 | "0 | \n", 1280 | "0 | \n", 1281 | "0 | \n", 1282 | "0 | \n", 1283 | "0 | \n", 1284 | "0 | \n", 1285 | "0 | \n", 1286 | "0 | \n", 1287 | "0 | \n", 1288 | "0 | \n", 1289 | "0 | \n", 1290 | "0 | \n", 1291 | "0 | \n", 1292 | "0 | \n", 1293 | "0 | \n", 1294 | "0 | \n", 1295 | "0 | \n", 1296 | "0 | \n", 1297 | "0 | \n", 1298 | "0 | \n", 1299 | "
| 98 | \n", 1302 | "0 | \n", 1303 | "0 | \n", 1304 | "0 | \n", 1305 | "0 | \n", 1306 | "0 | \n", 1307 | "0 | \n", 1308 | "0 | \n", 1309 | "0 | \n", 1310 | "0 | \n", 1311 | "0 | \n", 1312 | "0 | \n", 1313 | "0 | \n", 1314 | "0 | \n", 1315 | "0 | \n", 1316 | "0 | \n", 1317 | "0 | \n", 1318 | "0 | \n", 1319 | "0 | \n", 1320 | "0 | \n", 1321 | "0 | \n", 1322 | "0 | \n", 1323 | "0 | \n", 1324 | "0 | \n", 1325 | "0 | \n", 1326 | "0 | \n", 1327 | "0 | \n", 1328 | "0 | \n", 1329 | "0 | \n", 1330 | "0 | \n", 1331 | "0 | \n", 1332 | "0 | \n", 1333 | "0 | \n", 1334 | "0 | \n", 1335 | "0 | \n", 1336 | "0 | \n", 1337 | "0 | \n", 1338 | "0 | \n", 1339 | "0 | \n", 1340 | "0 | \n", 1341 | "0 | \n", 1342 | "... | \n", 1343 | "0 | \n", 1344 | "0 | \n", 1345 | "0 | \n", 1346 | "0 | \n", 1347 | "0 | \n", 1348 | "0 | \n", 1349 | "0 | \n", 1350 | "0 | \n", 1351 | "0 | \n", 1352 | "0 | \n", 1353 | "0 | \n", 1354 | "0 | \n", 1355 | "0 | \n", 1356 | "0 | \n", 1357 | "0 | \n", 1358 | "1 | \n", 1359 | "0 | \n", 1360 | "0 | \n", 1361 | "0 | \n", 1362 | "0 | \n", 1363 | "0 | \n", 1364 | "0 | \n", 1365 | "0 | \n", 1366 | "0 | \n", 1367 | "0 | \n", 1368 | "0 | \n", 1369 | "0 | \n", 1370 | "0 | \n", 1371 | "0 | \n", 1372 | "0 | \n", 1373 | "0 | \n", 1374 | "0 | \n", 1375 | "0 | \n", 1376 | "0 | \n", 1377 | "1 | \n", 1378 | "0 | \n", 1379 | "0 | \n", 1380 | "0 | \n", 1381 | "0 | \n", 1382 | "0 | \n", 1383 | "
| 99 | \n", 1386 | "0 | \n", 1387 | "0 | \n", 1388 | "0 | \n", 1389 | "0 | \n", 1390 | "0 | \n", 1391 | "0 | \n", 1392 | "0 | \n", 1393 | "0 | \n", 1394 | "0 | \n", 1395 | "0 | \n", 1396 | "0 | \n", 1397 | "0 | \n", 1398 | "0 | \n", 1399 | "0 | \n", 1400 | "0 | \n", 1401 | "0 | \n", 1402 | "0 | \n", 1403 | "0 | \n", 1404 | "0 | \n", 1405 | "0 | \n", 1406 | "0 | \n", 1407 | "0 | \n", 1408 | "0 | \n", 1409 | "0 | \n", 1410 | "0 | \n", 1411 | "0 | \n", 1412 | "0 | \n", 1413 | "0 | \n", 1414 | "0 | \n", 1415 | "0 | \n", 1416 | "0 | \n", 1417 | "0 | \n", 1418 | "0 | \n", 1419 | "0 | \n", 1420 | "0 | \n", 1421 | "0 | \n", 1422 | "0 | \n", 1423 | "0 | \n", 1424 | "0 | \n", 1425 | "0 | \n", 1426 | "... | \n", 1427 | "0 | \n", 1428 | "0 | \n", 1429 | "0 | \n", 1430 | "0 | \n", 1431 | "0 | \n", 1432 | "0 | \n", 1433 | "0 | \n", 1434 | "0 | \n", 1435 | "0 | \n", 1436 | "0 | \n", 1437 | "0 | \n", 1438 | "0 | \n", 1439 | "0 | \n", 1440 | "0 | \n", 1441 | "0 | \n", 1442 | "0 | \n", 1443 | "0 | \n", 1444 | "0 | \n", 1445 | "0 | \n", 1446 | "0 | \n", 1447 | "0 | \n", 1448 | "0 | \n", 1449 | "0 | \n", 1450 | "0 | \n", 1451 | "0 | \n", 1452 | "0 | \n", 1453 | "0 | \n", 1454 | "0 | \n", 1455 | "0 | \n", 1456 | "0 | \n", 1457 | "0 | \n", 1458 | "0 | \n", 1459 | "0 | \n", 1460 | "0 | \n", 1461 | "0 | \n", 1462 | "0 | \n", 1463 | "0 | \n", 1464 | "0 | \n", 1465 | "0 | \n", 1466 | "0 | \n", 1467 | "
100 rows × 640 columns
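A minimal sketch of how a document-term matrix like the one above is typically produced (assuming scikit-learn's CountVectorizer; the notebook's own code is not shown above, so the variable names here are illustrative):

    import pandas as pd
    from sklearn.feature_extraction.text import CountVectorizer

    # messages: a list of 100 raw text documents (e.g. SMS messages)
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(messages)   # sparse 100 x 640 count matrix

    # dense dataframe with one column per vocabulary term
    bow = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

CountVectorizer lowercases and tokenises by default, which matches the all-lowercase vocabulary visible in the column headers.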
\n", 1471 | "