├── Advanced_Lexial_Processing
│   ├── spell_corrector.py
│   ├── soundex.ipynb
│   ├── edit+distance.ipynb
│   └── spell-corrector.ipynb
├── Introduction_to_NLP
│   ├── Bonus+exercise.ipynb
│   ├── Bonus+exercise+with+solution.ipynb
│   └── Regular_Expressions .ipynb
└── Basic_Lexical_Processing
    ├── tokenisation .ipynb
    ├── stemming.ipynb
    └── bag+of+words.ipynb
/Advanced_Lexial_Processing/spell_corrector.py: -------------------------------------------------------------------------------- 1 | import re 2 | from collections import Counter 3 | 4 | def words(document): 5 | "Convert text to lower case and tokenise the document" 6 | return re.findall(r'\w+', document.lower()) 7 | 8 | # create a frequency table of all the words of the document 9 | all_words = Counter(words(open('big.txt').read())) 10 | 11 | def edits_one(word): 12 | "Create all edits that are one edit away from `word`." 13 | alphabets = 'abcdefghijklmnopqrstuvwxyz' 14 | splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] 15 | deletes = [left + right[1:] for left, right in splits if right] 16 | inserts = [left + c + right for left, right in splits for c in alphabets] 17 | replaces = [left + c + right[1:] for left, right in splits if right for c in alphabets] 18 | transposes = [left + right[1] + right[0] + right[2:] for left, right in splits if len(right)>1] 19 | return set(deletes + inserts + replaces + transposes) 20 | 21 | def edits_two(word): 22 | "Create all edits that are two edits away from `word`." 23 | return (e2 for e1 in edits_one(word) for e2 in edits_one(e1)) 24 | 25 | def known(words): 26 | "The subset of `words` that appear in `all_words`." 27 | return set(word for word in words if word in all_words) 28 | 29 | def possible_corrections(word): 30 | "Generate possible spelling corrections for `word`." 31 | return (known([word]) or known(edits_one(word)) or known(edits_two(word)) or [word]) 32 | 33 | def prob(word, N=sum(all_words.values())): 34 | "Probability of `word`: number of appearances of `word` / total number of tokens" 35 | return all_words[word] / N 36 | 37 | def rectify(word): 38 | "Return the most probable spelling correction for `word` out of all the `possible_corrections`" 39 | correct_word = max(possible_corrections(word), key=prob) 40 | return correct_word 41 | --------------------------------------------------------------------------------
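A minimal usage sketch for the module above (our own illustration, not a file in the repo). It assumes spell_corrector.py is importable from the working directory and that big.txt — a large plain-text corpus such as the one used by Norvig's classic spell corrector — sits alongside it:

    # usage sketch — the test words are illustrative
    from spell_corrector import rectify

    print(rectify('monney'))   # with a typical English corpus this suggests 'money'
    print(rectify('chair'))    # a correctly spelt known word is returned unchanged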
/Advanced_Lexial_Processing/soundex.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Soundex" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Let's create a function which calculates the soundex code of any given string" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 8, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "def get_soundex(token):\n", 24 | "    \"\"\"Get the soundex code for the string\"\"\"\n", 25 | "    token = token.upper()\n", 26 | "\n", 27 | "    soundex = \"\"\n", 28 | "    \n", 29 | "    # first letter of input is always the first letter of soundex\n", 30 | "    soundex += token[0]\n", 31 | "    \n", 32 | "    # create a dictionary which maps letters to respective soundex codes. Vowels and 'H', 'W' and 'Y' will be represented by '.'\n", 33 | "    dictionary = {\"BFPV\": \"1\", \"CGJKQSXZ\":\"2\", \"DT\":\"3\", \"L\":\"4\", \"MN\":\"5\", \"R\":\"6\", \"AEIOUHWY\":\".\"}\n", 34 | "    \n", 35 | "    for char in token[1:]:\n", 36 | "        for key in dictionary.keys():\n", 37 | "            if char in key:\n", 38 | "                code = dictionary[key]\n", 39 | "                if code != '.':\n", 40 | "                    if code != soundex[-1]:\n", 41 | "                        soundex += code\n", 42 | "    \n", 43 | "    \n", 44 | "    # trim or pad to make soundex a 4-character code\n", 45 | "    soundex = soundex[:4].ljust(4, \"0\")\n", 46 | "    \n", 47 | "    return soundex" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "Let's see the soundex of 'STOUT' and 'Ashcraft'" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 9, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "name": "stdout", 64 | "output_type": "stream", 65 | "text": [ 66 | "S300\n", 67 | "A261\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "print(get_soundex(\"STOUT\"))\n", 73 | "print(get_soundex(\"Ashcraft\"))" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "Let's see the soundex of 'Aggrawal', 'Agrawal', 'Aggarwal' and 'Agarwal'" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 11, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "A264\n", 93 | "A264\n", 94 | "A264\n", 95 | "A264\n" 96 | ] 97 | } 98 | ], 99 | "source": [ 100 | "print(get_soundex(\"Aggrawal\"))\n", 101 | "print(get_soundex(\"Agrawal\"))\n", 102 | "print(get_soundex(\"Aggarwal\"))\n", 103 | "print(get_soundex(\"Agarwal\"))" 104 | ] 105 | } 106 | ], 107 | "metadata": { 108 | "kernelspec": { 109 | "display_name": "Python 3", 110 | "language": "python", 111 | "name": "python3" 112 | }, 113 | "language_info": { 114 | "codemirror_mode": { 115 | "name": "ipython", 116 | "version": 3 117 | }, 118 | "file_extension": ".py", 119 | "mimetype": "text/x-python", 120 | "name": "python", 121 | "nbconvert_exporter": "python", 122 | "pygments_lexer": "ipython3", 123 | "version": "3.7.3" 124 | } 125 | }, 126 | "nbformat": 4, 127 | "nbformat_minor": 2 128 | } 129 | -------------------------------------------------------------------------------- /Advanced_Lexial_Processing/edit+distance.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Levenshtein Edit Distance\n", 8 | "The Levenshtein distance calculates the number of steps (insertions, deletions or substitutions) required to go from the source string to the target string."
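A worked illustration, derived by hand from the algorithm implemented below: with source 'cat' on the columns and target 'cta' on the rows, the filled matrix is

        ""  c  a  t
    ""   0  1  2  3
    c    1  0  1  2
    t    2  1  1  1
    a    3  2  1  2

and the bottom-right cell gives the distance: 2 (substitute 'a' -> 't', then 't' -> 'a'; plain Levenshtein has no transposition operation). This agrees with the lev_distance('cat', 'cta') call further down.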
9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": { 15 | "collapsed": true 16 | }, 17 | "outputs": [], 18 | "source": [ 19 | "def lev_distance(source='', target=''):\n", 20 | " \"\"\"Make a Levenshtein Distances Matrix\"\"\"\n", 21 | " \n", 22 | " # get length of both strings\n", 23 | " n1, n2 = len(source), len(target)\n", 24 | " \n", 25 | " # create matrix using length of both strings - source string sits on columns, target string sits on rows\n", 26 | " matrix = [ [ 0 for i1 in range(n1 + 1) ] for i2 in range(n2 + 1) ]\n", 27 | " \n", 28 | " # fill the first row - (0 to n1-1)\n", 29 | " for i1 in range(1, n1 + 1):\n", 30 | " matrix[0][i1] = i1\n", 31 | " \n", 32 | " # fill the first column - (0 to n2-1)\n", 33 | " for i2 in range(1, n2 + 1):\n", 34 | " matrix[i2][0] = i2\n", 35 | " \n", 36 | " # fill the matrix\n", 37 | " for i2 in range(1, n2 + 1):\n", 38 | " for i1 in range(1, n1 + 1):\n", 39 | " \n", 40 | " # check whether letters being compared are same\n", 41 | " if (source[i1-1] == target[i2-1]):\n", 42 | " value = matrix[i2-1][i1-1] # top-left cell value\n", 43 | " else:\n", 44 | " value = min(matrix[i2-1][i1] + 1, # left cell value + 1\n", 45 | " matrix[i2][i1-1] + 1, # top cell value + 1\n", 46 | " matrix[i2-1][i1-1] + 1) # top-left cell value + 1\n", 47 | " \n", 48 | " matrix[i2][i1] = value\n", 49 | " \n", 50 | " # return bottom-right cell value\n", 51 | " return matrix[-1][-1]" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 2, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/plain": [ 62 | "2" 63 | ] 64 | }, 65 | "execution_count": 2, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "lev_distance('cat', 'cta')" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "## Levenshtein distance in nltk library" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 1, 84 | "metadata": { 85 | "collapsed": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "# import library\n", 90 | "from nltk.metrics.distance import edit_distance" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 4, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "data": { 100 | "text/plain": [ 101 | "2" 102 | ] 103 | }, 104 | "execution_count": 4, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "edit_distance(\"apple\", \"appel\")" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "## Damerau-Levenshtein Distance\n", 118 | "The Damerau-Levenshtein distance allows transpositions (swap of two letters which are adjacent to each other) as well." 
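Before reaching for the library call below, here is a sketch of how the lev_distance implementation above could be extended to allow transpositions — the restricted "optimal string alignment" variant. This is our own illustrative addition (the function name is ours), reusing the same row/column convention:

    def damerau_lev_distance(source='', target=''):
        """Levenshtein distance extended with adjacent-transposition edits (OSA variant)."""
        n1, n2 = len(source), len(target)
        matrix = [[0 for i1 in range(n1 + 1)] for i2 in range(n2 + 1)]
        for i1 in range(1, n1 + 1):
            matrix[0][i1] = i1
        for i2 in range(1, n2 + 1):
            matrix[i2][0] = i2
        for i2 in range(1, n2 + 1):
            for i1 in range(1, n1 + 1):
                if source[i1 - 1] == target[i2 - 1]:
                    value = matrix[i2 - 1][i1 - 1]
                else:
                    value = min(matrix[i2 - 1][i1] + 1,      # deletion
                                matrix[i2][i1 - 1] + 1,      # insertion
                                matrix[i2 - 1][i1 - 1] + 1)  # substitution
                # the extra Damerau case: two adjacent letters swapped
                if (i1 > 1 and i2 > 1 and source[i1 - 1] == target[i2 - 2]
                        and source[i1 - 2] == target[i2 - 1]):
                    value = min(value, matrix[i2 - 2][i1 - 2] + 1)
                matrix[i2][i1] = value
        return matrix[-1][-1]

    print(damerau_lev_distance('apple', 'appel'))   # 1 — 'le' -> 'el' counts as one swap
    print(damerau_lev_distance('cat', 'cta'))       # 1 instead of the plain-Levenshtein 2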
119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 3, 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "data": { 128 | "text/plain": [ 129 | "1" 130 | ] 131 | }, 132 | "execution_count": 3, 133 | "metadata": {}, 134 | "output_type": "execute_result" 135 | } 136 | ], 137 | "source": [ 138 | "edit_distance(\"apple\", \"appel\", transpositions=True)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": { 145 | "collapsed": true 146 | }, 147 | "outputs": [], 148 | "source": [] 149 | } 150 | ], 151 | "metadata": { 152 | "kernelspec": { 153 | "display_name": "Python 3", 154 | "language": "python", 155 | "name": "python3" 156 | }, 157 | "language_info": { 158 | "codemirror_mode": { 159 | "name": "ipython", 160 | "version": 3 161 | }, 162 | "file_extension": ".py", 163 | "mimetype": "text/x-python", 164 | "name": "python", 165 | "nbconvert_exporter": "python", 166 | "pygments_lexer": "ipython3", 167 | "version": "3.6.5" 168 | } 169 | }, 170 | "nbformat": 4, 171 | "nbformat_minor": 2 172 | } 173 | -------------------------------------------------------------------------------- /Introduction_to_NLP/Bonus+exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import re" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "### Q1. \n", 19 | "Write a regular expression to match all the files that have either .exe, .xml or .jar extensions. A valid file name can contain any alphabet, digit and underscore followed by the extension." 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "files = ['employees.xml', 'calculator.jar', 'nfsmw.exe', 'bkgrnd001.jpg', 'sales_report.ppt']\n", 29 | "\n", 30 | "result = []\n", 31 | "\n", 32 | "# write your code here\n", 33 | "\n", 34 | "# print result - result should only contain the items that match the pattern. In this case, result should be ['employees.xml', 'calculator.jar', 'nfsmw.exe']\n", 35 | "print(result)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "### Q2. \n", 43 | "Write a regular expression to match all the addresses that have Koramangala embedded in them.\n", 44 | "\n", 45 | "Strings that should match:\n", 46 | "* 466, 5th block, Koramangala, Bangalore\n", 47 | "* 4th BLOCK, KORAMANGALA - 560034\n", 48 | "\n", 49 | "Strings that shouldn't match:\n", 50 | "* 999, St. Marks Road, Bangalore\n" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "addresses = ['466, 5th block, Koramangala, Bangalore', '4th BLOCK, KORAMANGALA - 560034', '999, St. Marks Road, Bangalore']\n", 60 | "\n", 61 | "result = []\n", 62 | "\n", 63 | "# write your code here\n", 64 | "\n", 65 | "\n", 66 | "# print result - result should only contain the items that match the pattern\n", 67 | "print(result)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "### Q3. 
\n", 75 | "Write a regular expression that matches either integer numbers or floats up to 2 decimal places.\n", 76 | "\n", 77 | "Strings that should match: \n", 78 | "* 2\n", 79 | "* 2.3\n", 80 | "* 4.56\n", 81 | "* .61\n", 82 | "\n", 83 | "Strings that shouldn't match:\n", 84 | "* 4.567\n", 85 | "* 75.8792\n", 86 | "* abc\n" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "numbers = ['2', '2.3', '4.56', '.61', '4.567', '75.8792', 'abc']\n", 96 | "\n", 97 | "result = []\n", 98 | "\n", 99 | "# write your code here\n", 100 | "\n", 101 | "\n", 102 | "# print result - result should only contain the items that match the pattern\n", 103 | "print(result)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "### Q4. \n", 111 | "Write a regular expression to match the model names of smartphones which follow this pattern: \n", 112 | "\n", 113 | "mobile company name followed by underscore followed by model name followed by underscore followed by model number\n", 114 | "\n", 115 | "Strings that should match:\n", 116 | "* apple_iphone_6\n", 117 | "* samsung_note_4\n", 118 | "* google_pixel_2\n", 119 | "\n", 120 | "Strings that shouldn’t match:\n", 121 | "* apple_6\n", 122 | "* iphone_6\n", 123 | "* google\\_pixel\\_\n" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "phones = ['apple_iphone_6', 'samsung_note_4', 'google_pixel_2', 'apple_6', 'iphone_6', 'google_pixel_']\n", 133 | "\n", 134 | "result = []\n", 135 | "\n", 136 | "# write your code here\n", 137 | "\n", 138 | "\n", 139 | "# print result - result should only contain the items that match the pattern\n", 140 | "print(result)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "### Q5.\n", 148 | "Write a regular expression that can be used to match the emails present in a database. \n", 149 | "\n", 150 | "The pattern of a valid email address is defined as follows:\n", 151 | "The '@' character can be preceded either by alphanumeric characters, period characters or underscore characters. The length of the part that precedes the '@' character should be between 4 to 20 characters.\n", 152 | "\n", 153 | "The '@' character should be followed by a domain name (e.g. gmail.com). The domain name has three parts - a prefix (e.g. 'gmail'), the period character and a suffix (e.g. 'com'). 
The prefix can have a length between 3 to 15 characters followed by a period character followed by either of these suffixes - 'com', 'in' or 'org'.\n", 154 | "\n", 155 | "\n", 156 | "Emails that should match:\n", 157 | "* random.guy123@gmail.com\n", 158 | "* mr_x_in_bombay@gov.in\n", 159 | "\n", 160 | "Emails that shouldn’t match:\n", 161 | "* 1@ued.org\n", 162 | "* @gmail.com\n", 163 | "* abc!@yahoo.in\n", 164 | "* sam_12@gov.us\n", 165 | "* neeraj@" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "emails = ['random.guy123@gmail.com', 'mr_x_in_bombay@gov.in', '1@ued.org',\n", 175 | " '@gmail.com','abc!@yahoo.in', 'sam_12@gov.us', 'neeraj@']\n", 176 | "\n", 177 | "result = []\n", 178 | "\n", 179 | "# write your code here\n", 180 | "\n", 181 | "\n", 182 | "# print result - result should only contain the items that match the pattern\n", 183 | "print(result)" 184 | ] 185 | } 186 | ], 187 | "metadata": { 188 | "kernelspec": { 189 | "display_name": "Python 3", 190 | "language": "python", 191 | "name": "python3" 192 | }, 193 | "language_info": { 194 | "codemirror_mode": { 195 | "name": "ipython", 196 | "version": 3 197 | }, 198 | "file_extension": ".py", 199 | "mimetype": "text/x-python", 200 | "name": "python", 201 | "nbconvert_exporter": "python", 202 | "pygments_lexer": "ipython3", 203 | "version": "3.6.5" 204 | } 205 | }, 206 | "nbformat": 4, 207 | "nbformat_minor": 2 208 | } 209 | -------------------------------------------------------------------------------- /Advanced_Lexial_Processing/spell-corrector.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import re\n", 12 | "from collections import Counter" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "# function to tokenise words\n", 24 | "def words(document):\n", 25 | " \"Convert text to lower case and tokenise the document\"\n", 26 | " return re.findall(r'\\w+', document.lower())" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "# create a frequency table of all the words of the document\n", 38 | "all_words = Counter(words(open('big.txt').read()))" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "# check frequency of a random word, say, 'chair'\n", 48 | "all_words['chair']" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "# look at top 10 frequent words\n", 58 | "all_words.most_common(10)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": { 65 | "collapsed": true 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "def edits_one(word):\n", 70 | " \"Create all edits that are one edit away from `word`.\"\n", 71 | " alphabets = 'abcdefghijklmnopqrstuvwxyz'\n", 72 | " splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]\n", 73 | " deletes = [left + right[1:] for left, right in splits if right]\n", 74 | " inserts = [left + c + right for left, right in splits for c 
in alphabets]\n", 75 | "    replaces = [left + c + right[1:] for left, right in splits if right for c in alphabets]\n", 76 | "    transposes = [left + right[1] + right[0] + right[2:] for left, right in splits if len(right)>1]\n", 77 | "    return set(deletes + inserts + replaces + transposes)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": { 84 | "collapsed": true 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "def edits_two(word):\n", 89 | "    \"Create all edits that are two edits away from `word`.\"\n", 90 | "    return (e2 for e1 in edits_one(word) for e2 in edits_one(e1))" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": { 97 | "collapsed": true 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "def known(words):\n", 102 | "    \"The subset of `words` that appear in `all_words`.\"\n", 103 | "    return set(word for word in words if word in all_words)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "collapsed": true 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "def possible_corrections(word):\n", 115 | "    \"Generate possible spelling corrections for `word`.\"\n", 116 | "    return (known([word]) or known(edits_one(word)) or known(edits_two(word)) or [word])" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": { 123 | "collapsed": true 124 | }, 125 | "outputs": [], 126 | "source": [ 127 | "def prob(word, N=sum(all_words.values())): \n", 128 | "    \"Probability of `word`: number of appearances of `word` / total number of tokens\"\n", 129 | "    return all_words[word] / N" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "print(len(set(edits_one(\"monney\"))))\n", 139 | "print(edits_one(\"monney\"))" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "print(known(edits_one(\"monney\")))" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "# Let's look at words that are two edits away\n", 158 | "print(len(set(edits_two(\"monney\"))))\n", 159 | "print(known(edits_two(\"monney\")))" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "# Let's look at possible corrections of a word\n", 169 | "print(possible_corrections(\"monney\"))" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "# Let's look at probability of a word\n", 179 | "print(prob(\"money\"))\n", 180 | "print(prob(\"monkey\"))" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "def spell_check(word):\n", 190 | "    \"Return the most probable spelling correction for `word` out of all the `possible_corrections`\"\n", 191 | "    correct_word = max(possible_corrections(word), key=prob)\n", 192 | "    if correct_word != word:\n", 193 | "        return \"Did you mean \" + correct_word + \"?\"\n", 194 | "    else:\n", 195 | "        return \"Correct spelling.\"" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | 
"source": [ 204 | "# test spell check\n", 205 | "print(spell_check(\"monney\"))" 206 | ] 207 | } 208 | ], 209 | "metadata": { 210 | "kernelspec": { 211 | "display_name": "Python 3", 212 | "language": "python", 213 | "name": "python3" 214 | }, 215 | "language_info": { 216 | "codemirror_mode": { 217 | "name": "ipython", 218 | "version": 3 219 | }, 220 | "file_extension": ".py", 221 | "mimetype": "text/x-python", 222 | "name": "python", 223 | "nbconvert_exporter": "python", 224 | "pygments_lexer": "ipython3", 225 | "version": "3.6.4" 226 | } 227 | }, 228 | "nbformat": 4, 229 | "nbformat_minor": 2 230 | } 231 | -------------------------------------------------------------------------------- /Introduction_to_NLP/Bonus+exercise+with+solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import re" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "### Q1.\n", 19 | "Write a regular expression to match all the files that have either .exe, .xml or .jar extensions. A valid file name can contain any alphabet, digit and underscore followed by the extension." 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 2, 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "name": "stdout", 29 | "output_type": "stream", 30 | "text": [ 31 | "['employees.xml', 'calculator.jar', 'nfsmw.exe']\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "files = ['employees.xml', 'calculator.jar', 'nfsmw.exe', 'bkgrnd001.jpg', 'sales_report.ppt']\n", 37 | "\n", 38 | "pattern = \"^.+\\.(xml|jar|exe)$\"\n", 39 | "\n", 40 | "result = []\n", 41 | "\n", 42 | "for file in files:\n", 43 | " match = re.search(pattern, file)\n", 44 | " if match !=None:\n", 45 | " result.append(file)\n", 46 | "\n", 47 | "# print result - result should only contain the items that match the pattern\n", 48 | "print(result)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "### Q2\n", 56 | "Write a regular expression to match all the addresses that have Koramangala embedded in them.\n", 57 | "\n", 58 | "Strings that should match:\n", 59 | "* 466, 5th block, Koramangala, Bangalore\n", 60 | "* 4th BLOCK, KORAMANGALA - 560034\n", 61 | "\n", 62 | "Strings that shouldn't match:\n", 63 | "* 999, St. Marks Road, Bangalore\n" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "name": "stdout", 73 | "output_type": "stream", 74 | "text": [ 75 | "['466, 5th block, Koramangala, Bangalore', '4th BLOCK, KORAMANGALA - 560034']\n" 76 | ] 77 | } 78 | ], 79 | "source": [ 80 | "addresses = ['466, 5th block, Koramangala, Bangalore', '4th BLOCK, KORAMANGALA - 560034', '999, St. Marks Road, Bangalore']\n", 81 | "\n", 82 | "pattern = \"^[\\w\\d\\s,-]*koramangala[\\w\\d\\s,-]*$\"\n", 83 | "\n", 84 | "result = []\n", 85 | "\n", 86 | "for address in addresses:\n", 87 | " match = re.search(pattern, address, re.I)\n", 88 | " if match !=None:\n", 89 | " result.append(address)\n", 90 | "\n", 91 | "# print result - result should only contain the items that match the pattern\n", 92 | "print(result)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "### Q3. 
\n", 100 | "Write a regular expression that matches either integer numbers or floats up to 2 decimal places.\n", 101 | "\n", 102 | "Strings that should match: \n", 103 | "* 2\n", 104 | "* 2.3\n", 105 | "* 4.56\n", 106 | "* .61\n", 107 | "\n", 108 | "Strings that shouldn't match:\n", 109 | "* 4.567\n", 110 | "* 75.8792\n", 111 | "* abc\n" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 4, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "['2', '2.3', '4.56', '.61']\n" 124 | ] 125 | } 126 | ], 127 | "source": [ 128 | "numbers = ['2', '2.3', '4.56', '.61', '4.567', '75.8792', 'abc']\n", 129 | "\n", 130 | "pattern = \"^[0-9]*(\\.[0-9]{,2})?$\"\n", 131 | "\n", 132 | "result = []\n", 133 | "\n", 134 | "for number in numbers:\n", 135 | "    match = re.search(pattern, number)\n", 136 | "    if match != None:\n", 137 | "        result.append(number)\n", 138 | "\n", 139 | "# print result - result should only contain the items that match the pattern\n", 140 | "print(result)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "### Q4. \n", 148 | "Write a regular expression to match the model names of smartphones which follow this pattern: \n", 149 | "\n", 150 | "mobile company name followed by underscore followed by model name followed by underscore followed by model number\n", 151 | "\n", 152 | "Strings that should match:\n", 153 | "* apple_iphone_6\n", 154 | "* samsung_note_4\n", 155 | "* google_pixel_2\n", 156 | "\n", 157 | "Strings that shouldn’t match:\n", 158 | "* apple_6\n", 159 | "* iphone_6\n", 160 | "* google\\_pixel\\_\n" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 5, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "['apple_iphone_6', 'samsung_note_4', 'google_pixel_2']\n" 173 | ] 174 | } 175 | ], 176 | "source": [ 177 | "phones = ['apple_iphone_6', 'samsung_note_4', 'google_pixel_2', 'apple_6', 'iphone_6', 'google_pixel_']\n", 178 | "\n", 179 | "pattern = \"^.*_.*_\\d$\"\n", 180 | "\n", 181 | "result = []\n", 182 | "\n", 183 | "for phone in phones:\n", 184 | "    match = re.search(pattern, phone)\n", 185 | "    if match !=None:\n", 186 | "        result.append(phone)\n", 187 | "\n", 188 | "# print result - result should only contain the items that match the pattern\n", 189 | "print(result)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "### Q5. \n", 197 | "Write a regular expression that can be used to match the emails present in a database. \n", 198 | "\n", 199 | "The pattern of a valid email address is defined as follows:\n", 200 | "The '@' character can be preceded either by alphanumeric characters, period characters or underscore characters. The length of the part that precedes the '@' character should be between 4 to 20 characters.\n", 201 | "\n", 202 | "The '@' character should be followed by a domain name (e.g. gmail.com). The domain name has three parts - a prefix (e.g. 'gmail'), the period character and a suffix (e.g. 'com'). 
The prefix can have a length between 3 to 15 characters followed by a period character followed by either of these suffixes - 'com', 'in' or 'org'.\n", 203 | "\n", 204 | "\n", 205 | "Emails that should match:\n", 206 | "* random.guy123@gmail.com\n", 207 | "* mr_x_in_bombay@gov.in\n", 208 | "\n", 209 | "Emails that shouldn’t match:\n", 210 | "* 1@ued.org\n", 211 | "* @gmail.com\n", 212 | "* abc!@yahoo.in\n", 213 | "* sam_12@gov.us\n", 214 | "* neeraj@" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 6, 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "name": "stdout", 224 | "output_type": "stream", 225 | "text": [ 226 | "['random.guy123@gmail.com', 'mr_x_in_bombay@gov.in']\n" 227 | ] 228 | } 229 | ], 230 | "source": [ 231 | "emails = ['random.guy123@gmail.com', 'mr_x_in_bombay@gov.in', '1@ued.org',\n", 232 | "          '@gmail.com', 'abc!@yahoo.in', 'sam_12@gov.us', 'neeraj@']\n", 233 | "\n", 234 | "pattern = \"^[a-z_.0-9]{4,20}@[a-z]{3,15}\\.(com|in|org)$\"\n", 235 | "\n", 236 | "result = []\n", 237 | "\n", 238 | "for email in emails:\n", 239 | "    match = re.search(pattern, email, re.I)\n", 240 | "    if match !=None:\n", 241 | "        result.append(email)\n", 242 | "\n", 243 | "# print result - result should only contain the items that match the pattern\n", 244 | "print(result)" 245 | ] 246 | } 247 | ], 248 | "metadata": { 249 | "kernelspec": { 250 | "display_name": "Python 3", 251 | "language": "python", 252 | "name": "python3" 253 | }, 254 | "language_info": { 255 | "codemirror_mode": { 256 | "name": "ipython", 257 | "version": 3 258 | }, 259 | "file_extension": ".py", 260 | "mimetype": "text/x-python", 261 | "name": "python", 262 | "nbconvert_exporter": "python", 263 | "pygments_lexer": "ipython3", 264 | "version": "3.6.5" 265 | } 266 | }, 267 | "nbformat": 4, 268 | "nbformat_minor": 2 269 | } 270 | -------------------------------------------------------------------------------- /Basic_Lexical_Processing/tokenisation .ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tokenisation\n", 8 | "\n", 9 | "The notebook contains four types of tokenisation techniques:\n", 10 | "1. Word tokenisation\n", 11 | "2. Sentence tokenisation\n", 12 | "3. Tweet tokenisation\n", 13 | "4. Custom tokenisation using regular expressions" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "### 1. Word tokenisation" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "At nine o'clock I visited him myself. It looks like religious mania, and he'll soon think that he himself is God.\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "document = \"At nine o'clock I visited him myself. 
It looks like religious mania, and he'll soon think that he himself is God.\"\n", 38 | "print(document)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "Tokenising on spaces using Python" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 2, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "name": "stdout", 55 | "output_type": "stream", 56 | "text": [ 57 | "['At', 'nine', \"o'clock\", 'I', 'visited', 'him', 'myself.', 'It', 'looks', 'like', 'religious', 'mania,', 'and', \"he'll\", 'soon', 'think', 'that', 'he', 'himself', 'is', 'God.']\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "print(document.split())" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "Tokenising using the nltk word tokeniser" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 3, 75 | "metadata": { 76 | "collapsed": true 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "from nltk.tokenize import word_tokenize\n", 81 | "words = word_tokenize(document)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 4, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "name": "stdout", 91 | "output_type": "stream", 92 | "text": [ 93 | "['At', 'nine', \"o'clock\", 'I', 'visited', 'him', 'myself', '.', 'It', 'looks', 'like', 'religious', 'mania', ',', 'and', 'he', \"'ll\", 'soon', 'think', 'that', 'he', 'himself', 'is', 'God', '.']\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "print(words)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "NLTK's word tokeniser not only breaks on whitespace but also breaks contraction words such as he'll into \"he\" and \"'ll\". On the other hand, it doesn't break \"o'clock\", treating it as a single token." 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "### 2. Sentence tokeniser" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "Tokenising a document into sentences requires splitting on the period ('.'). Let's use the nltk sentence tokeniser." 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 5, 125 | "metadata": { 126 | "collapsed": true 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "from nltk.tokenize import sent_tokenize\n", 131 | "sentences = sent_tokenize(document)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 6, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "name": "stdout", 141 | "output_type": "stream", 142 | "text": [ 143 | "[\"At nine o'clock I visited him myself.\", \"It looks like religious mania, and he'll soon think that he himself is God.\"]\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "print(sentences)" 149 | ] 150 | },
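Note that sent_tokenize is smarter than a naive split on '.', which matters once abbreviations appear. A small illustration (the example sentence is our own, and exact punkt behaviour can vary slightly between nltk versions):

    # naive splitting breaks abbreviations apart; sent_tokenize keeps them together
    text = "Dr. Watson met Mr. Holmes at the station. They left soon after."
    print(text.split('.'))      # produces fragments such as 'Dr' and ' Watson met Mr'
    print(sent_tokenize(text))  # 'Dr.' and 'Mr.' no longer end a sentence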
 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "### 3. Tweet tokeniser" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "A problem with the word tokeniser is that it fails to tokenise emojis and other complex special characters such as words with hashtags. Emojis are common these days and people use them all the time." 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 7, 168 | "metadata": { 169 | "collapsed": true 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "message = \"i recently watched this show called mindhunters:). i totally loved it 😍. it was gr8 <3. #bingewatching #nothingtodo 😎\"" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 8, 179 | "metadata": { 180 | "scrolled": true 181 | }, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "['i', 'recently', 'watched', 'this', 'show', 'called', 'mindhunters', ':', ')', '.', 'i', 'totally', 'loved', 'it', '😍', '.', 'it', 'was', 'gr8', '<', '3', '.', '#', 'bingewatching', '#', 'nothingtodo', '😎']\n" 188 | ] 189 | } 190 | ], 191 | "source": [ 192 | "print(word_tokenize(message))" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "The word tokeniser breaks the emoji '<3' into '<' and '3', which is something we don't want. Emojis have their own significance in areas like sentiment analysis, where a happy face and a sad face alone can prove to be really good predictors of the sentiment. Similarly, the hashtags are broken into two tokens. A hashtag is used for searching specific topics or photos in social media apps such as Instagram and Facebook. So there, you want to use the hashtag as is.\n", 200 | "\n", 201 | "Let's use the tweet tokeniser of nltk to tokenise this message." 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 9, 207 | "metadata": { 208 | "collapsed": true 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | "from nltk.tokenize import TweetTokenizer\n", 213 | "tknzr = TweetTokenizer()" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 10, 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "data": { 223 | "text/plain": [ 224 | "['i',\n", 225 | " 'recently',\n", 226 | " 'watched',\n", 227 | " 'this',\n", 228 | " 'show',\n", 229 | " 'called',\n", 230 | " 'mindhunters',\n", 231 | " ':)',\n", 232 | " '.',\n", 233 | " 'i',\n", 234 | " 'totally',\n", 235 | " 'loved',\n", 236 | " 'it',\n", 237 | " '😍',\n", 238 | " '.',\n", 239 | " 'it',\n", 240 | " 'was',\n", 241 | " 'gr8',\n", 242 | " '<3',\n", 243 | " '.',\n", 244 | " '#bingewatching',\n", 245 | " '#nothingtodo',\n", 246 | " '😎']" 247 | ] 248 | }, 249 | "execution_count": 10, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "tknzr.tokenize(message)" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "As you can see, it handles all the emojis and the hashtags pretty well.\n", 263 | "\n", 264 | "There is also a tokeniser that takes a regular expression and returns the tokens that match its pattern.\n", 265 | "\n", 266 | "Let's look at how you can use the regular expression tokeniser." 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 11, 272 | "metadata": { 273 | "collapsed": true 274 | }, 275 | "outputs": [], 276 | "source": [ 277 | "from nltk.tokenize import regexp_tokenize\n", 278 | "message = \"i recently watched this show called mindhunters:). i totally loved it 😍. it was gr8 <3. 
#bingewatching #nothingtodo 😎\"\n", 279 | "pattern = \"#[\\w]+\"" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 12, 285 | "metadata": {}, 286 | "outputs": [ 287 | { 288 | "data": { 289 | "text/plain": [ 290 | "['#bingewatching', '#nothingtodo']" 291 | ] 292 | }, 293 | "execution_count": 12, 294 | "metadata": {}, 295 | "output_type": "execute_result" 296 | } 297 | ], 298 | "source": [ 299 | "regexp_tokenize(message, pattern)" 300 | ] 301 | } 302 | ], 303 | "metadata": { 304 | "kernelspec": { 305 | "display_name": "Python 3", 306 | "language": "python", 307 | "name": "python3" 308 | }, 309 | "language_info": { 310 | "codemirror_mode": { 311 | "name": "ipython", 312 | "version": 3 313 | }, 314 | "file_extension": ".py", 315 | "mimetype": "text/x-python", 316 | "name": "python", 317 | "nbconvert_exporter": "python", 318 | "pygments_lexer": "ipython3", 319 | "version": "3.6.5" 320 | } 321 | }, 322 | "nbformat": 4, 323 | "nbformat_minor": 2 324 | } 325 | -------------------------------------------------------------------------------- /Basic_Lexical_Processing/stemming.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Stemming" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "# import libraries\n", 19 | "import pandas as pd\n", 20 | "from nltk.tokenize import word_tokenize\n", 21 | "from nltk.stem.porter import PorterStemmer\n", 22 | "from nltk.stem.snowball import SnowballStemmer" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "name": "stdout", 32 | "output_type": "stream", 33 | "text": [ 34 | "Very orderly and methodical he looked, with a hand on each knee, and a loud watch ticking a sonorous sermon under his flapped newly bought waist-coat, as though it pitted its gravity and longevity against the levity and evanescence of the brisk fire.\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "text = \"Very orderly and methodical he looked, with a hand on each knee, and a loud watch ticking a sonorous sermon under his flapped newly bought waist-coat, as though it pitted its gravity and longevity against the levity and evanescence of the brisk fire.\"\n", 40 | "print(text)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "name": "stdout", 50 | "output_type": "stream", 51 | "text": [ 52 | "['very', 'orderly', 'and', 'methodical', 'he', 'looked', ',', 'with', 'a', 'hand', 'on', 'each', 'knee', ',', 'and', 'a', 'loud', 'watch', 'ticking', 'a', 'sonorous', 'sermon', 'under', 'his', 'flapped', 'newly', 'bought', 'waist-coat', ',', 'as', 'though', 'it', 'pitted', 'its', 'gravity', 'and', 'longevity', 'against', 'the', 'levity', 'and', 'evanescence', 'of', 'the', 'brisk', 'fire', '.']\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "tokens = word_tokenize(text.lower())\n", 58 | "print(tokens)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 4, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "name": "stdout", 68 | "output_type": "stream", 69 | "text": [ 70 | "['veri', 'orderli', 'and', 'method', 'he', 'look', ',', 'with', 'a', 'hand', 'on', 'each', 'knee', ',', 'and', 'a', 'loud', 'watch', 'tick', 'a', 'sonor', 'sermon', 'under', 'hi', 'flap', 
'newli', 'bought', 'waist-coat', ',', 'as', 'though', 'it', 'pit', 'it', 'graviti', 'and', 'longev', 'against', 'the', 'leviti', 'and', 'evanesc', 'of', 'the', 'brisk', 'fire', '.']\n" 71 | ] 72 | }, 73 | { 74 | "data": { 75 | "text/plain": [ 76 | "47" 77 | ] 78 | }, 79 | "execution_count": 4, 80 | "metadata": {}, 81 | "output_type": "execute_result" 82 | } 83 | ], 84 | "source": [ 85 | "stemmer = PorterStemmer()\n", 86 | "porter_stemmed = [stemmer.stem(token) for token in tokens]\n", 87 | "print(porter_stemmed)\n", 88 | "len(porter_stemmed)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 5, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "name": "stdout", 98 | "output_type": "stream", 99 | "text": [ 100 | "['veri', 'order', 'and', 'method', 'he', 'look', ',', 'with', 'a', 'hand', 'on', 'each', 'knee', ',', 'and', 'a', 'loud', 'watch', 'tick', 'a', 'sonor', 'sermon', 'under', 'his', 'flap', 'newli', 'bought', 'waist-coat', ',', 'as', 'though', 'it', 'pit', 'it', 'graviti', 'and', 'longev', 'against', 'the', 'leviti', 'and', 'evanesc', 'of', 'the', 'brisk', 'fire', '.']\n" 101 | ] 102 | }, 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "47" 107 | ] 108 | }, 109 | "execution_count": 5, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "# snowball stemmer\n", 116 | "stemmer = SnowballStemmer(\"english\")\n", 117 | "snowball_stemmed = [stemmer.stem(token) for token in tokens]\n", 118 | "print(snowball_stemmed)\n", 119 | "len(snowball_stemmed)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 6, 125 | "metadata": { 126 | "collapsed": true, 127 | "scrolled": true 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "df = pd.DataFrame({'token': tokens, 'porter_stemmed': porter_stemmed, 'snowball_stemmed': snowball_stemmed})\n", 132 | "df = df[['token', 'porter_stemmed', 'snowball_stemmed']]" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 7, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/html": [ 143 | "
\n", 144 | "[HTML rendering of the dataframe omitted — it duplicates the text/plain output below]
\n", 259 | " 260 | ], "text/plain": [ 261 | "          token porter_stemmed snowball_stemmed\n", 262 | "0          very           veri             veri\n", 263 | "1       orderly        orderli            order\n", 264 | "3    methodical         method           method\n", 265 | "5        looked           look             look\n", 266 | "18      ticking           tick             tick\n", 267 | "20     sonorous          sonor            sonor\n", 268 | "23          his             hi              his\n", 269 | "24      flapped           flap             flap\n", 270 | "25        newly          newli            newli\n", 271 | "32       pitted            pit              pit\n", 272 | "33          its             it               it\n", 273 | "34      gravity        graviti          graviti\n", 274 | "36    longevity         longev           longev\n", 275 | "39       levity         leviti           leviti\n", 276 | "41  evanescence        evanesc          evanesc" 277 | ] 278 | }, 279 | "execution_count": 7, 280 | "metadata": {}, 281 | "output_type": "execute_result" 282 | } 283 | ], 284 | "source": [ 285 | "df[(df.token != df.porter_stemmed) | (df.token != df.snowball_stemmed)]" 286 | ] 287 | } 288 | ], 289 | "metadata": { 290 | "kernelspec": { 291 | "display_name": "Python 3", 292 | "language": "python", 293 | "name": "python3" 294 | }, 295 | "language_info": { 296 | "codemirror_mode": { 297 | "name": "ipython", 298 | "version": 3 299 | }, 300 | "file_extension": ".py", 301 | "mimetype": "text/x-python", 302 | "name": "python", 303 | "nbconvert_exporter": "python", 304 | "pygments_lexer": "ipython3", 305 | "version": "3.6.5" 306 | } 307 | }, 308 | "nbformat": 4, 309 | "nbformat_minor": 2 310 | } 311 | -------------------------------------------------------------------------------- /Introduction_to_NLP/Regular_Expressions .ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Regular Expressions\n", 8 | "A regular expression is a set of characters, called a pattern, which helps in finding substrings in a given string. The pattern is used to detect the substrings.\n", 9 | "\n", 10 | "For example, suppose you have a dataset of customer reviews about your restaurant. Say you want to extract the emojis from the reviews because they are a good predictor of the sentiment of the review.\n", 11 | "\n", 12 | "Take another example: artificial assistants such as Siri and Google Now use information retrieval to give you better results. When you ask them a query or ask them to search for something interesting on the screen, they look for common patterns such as emails, phone numbers, place names, dates and times, and so on. This is because the assistant can then automatically make a booking or ask you to call the restaurant to make one.\n", 13 | "\n", 14 | "Regular expressions are a very powerful tool in text processing. They will help you to clean and handle your text in a much better way." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "### Let's import the regular expression library in Python." 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import re" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "Let's do a quick search using a pattern." 
38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "data": { 47 | "text/plain": [ 48 | "" 49 | ] 50 | }, 51 | "execution_count": 2, 52 | "metadata": {}, 53 | "output_type": "execute_result" 54 | } 55 | ], 56 | "source": [ 57 | "re.search('Ravi', 'Ravi is an exceptional student!')" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "Ravi\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "# print output of re.search()\n", 75 | "match = re.search('Ravi', 'Ravi is an exceptional student!')\n", 76 | "print(match.group())" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "Let's define a function to match regular expression patterns" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 4, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "def find_pattern(text, patterns):\n", 93 | "    if re.search(patterns, text):\n", 94 | "        return re.search(patterns, text)\n", 95 | "    else:\n", 96 | "        return 'Not Found!'" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "### Quantifiers" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 5, 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "name": "stdout", 113 | "output_type": "stream", 114 | "text": [ 115 | "\n", 116 | "\n", 117 | "\n" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "# '*': Zero or more \n", 123 | "print(find_pattern(\"ac\", \"ab*\"))\n", 124 | "print(find_pattern(\"abc\", \"ab*\"))\n", 125 | "print(find_pattern(\"abbc\", \"ab*\"))" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 6, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "name": "stdout", 135 | "output_type": "stream", 136 | "text": [ 137 | "\n", 138 | "\n", 139 | "\n" 140 | ] 141 | } 142 | ], 143 | "source": [ 144 | "# '?': Zero or one (tells whether a pattern is absent or present)\n", 145 | "print(find_pattern(\"ac\", \"ab?\"))\n", 146 | "print(find_pattern(\"abc\", \"ab?\"))\n", 147 | "print(find_pattern(\"abbc\", \"ab?\"))" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 7, 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "name": "stdout", 157 | "output_type": "stream", 158 | "text": [ 159 | "Not Found!\n", 160 | "\n", 161 | "\n" 162 | ] 163 | } 164 | ], 165 | "source": [ 166 | "# '+': One or more\n", 167 | "print(find_pattern(\"ac\", \"ab+\"))\n", 168 | "print(find_pattern(\"abc\", \"ab+\"))\n", 169 | "print(find_pattern(\"abbc\", \"ab+\"))" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 8, 175 | "metadata": {}, 176 | "outputs": [ 177 | { 178 | "name": "stdout", 179 | "output_type": "stream", 180 | "text": [ 181 | "\n" 182 | ] 183 | } 184 | ], 185 | "source": [ 186 | "# {n}: Matches if a character is present exactly n number of times\n", 187 | "print(find_pattern(\"abbc\", \"ab{2}\"))\n" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 9, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "\n", 200 | "Not Found!\n", 201 | "\n", 202 | "Not Found!\n" 203 | ] 204 | } 205 | ], 206 | "source": [ 207 | "# {m,n}: Matches if a character is present from m to n number of times\n", 208 | "print(find_pattern(\"aabbbbbbc\", \"ab{3,5}\"))  # return true if 'b' is present 3-5 times\n", 209 | "print(find_pattern(\"aabbbbbbc\", \"ab{7,10}\")) # return true if 'b' is present 7-10 times\n", 210 | "print(find_pattern(\"aabbbbbbc\", \"ab{,10}\"))  # return true if 'b' is present at most 10 times\n", 211 | "print(find_pattern(\"aabbbbbbc\", \"ab{10,}\"))  # return true if 'b' is present at least 10 times" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "### Anchors" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 10, 224 | "metadata": {}, 225 | "outputs": [ 226 | { 227 | "name": "stdout", 228 | "output_type": "stream", 229 | "text": [ 230 | "\n", 231 | "Not Found!\n", 232 | "\n", 233 | "Not Found!\n" 234 | ] 235 | } 236 | ], 237 | "source": [ 238 | "# '^': Indicates start of a string\n", 239 | "# '$': Indicates end of string\n", 240 | "\n", 241 | "print(find_pattern(\"James\", \"^J\"))  # return true if string starts with 'J' \n", 242 | "print(find_pattern(\"Pramod\", \"^J\")) # return true if string starts with 'J' \n", 243 | "print(find_pattern(\"India\", \"a$\"))  # return true if string ends with 'a'\n", 244 | "print(find_pattern(\"Japan\", \"a$\"))  # return true if string ends with 'a'\n" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "### Wildcard" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 11, 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "name": "stdout", 261 | "output_type": "stream", 262 | "text": [ 263 | "\n", 264 | "\n" 265 | ] 266 | } 267 | ], 268 | "source": [ 269 | "# '.': Matches any character\n", 270 | "print(find_pattern(\"a\", \".\"))\n", 271 | "print(find_pattern(\"#\", \".\"))\n" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "### Character sets" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 12, 284 | "metadata": {}, 285 | "outputs": [ 286 | { 287 | "name": "stdout", 288 | "output_type": "stream", 289 | "text": [ 290 | "\n", 291 | "\n" 292 | ] 293 | } 294 | ], 295 | "source": [ 296 | "# Now we will look at '[' and ']'.\n", 297 | "# They're used for specifying a character class, which is a set of characters that you wish to match.\n", 298 | "# Characters can be listed individually as follows\n", 299 | "print(find_pattern(\"a\", \"[abc]\"))\n", 300 | "\n", 301 | "# Or a range of characters can be indicated by giving two characters and separating them by a '-'.\n", 302 | "print(find_pattern(\"c\", \"[a-c]\")) # same as above" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 13, 308 | "metadata": {}, 309 | "outputs": [ 310 | { 311 | "name": "stdout", 312 | "output_type": "stream", 313 | "text": [ 314 | "Not Found!\n" 315 | ] 316 | } 317 | ], 318 | "source": [ 319 | "# '^' is used inside a character set to indicate the complementary set\n", 320 | "print(find_pattern(\"a\", \"[^abc]\")) # return true if none of these is present - a, b or c" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": {}, 326 | "source": [ 327 | "### Character sets\n", 328 | "| Pattern | Matches |\n", 329 | "|----------|--------------------------------------------------------------------------------------------|\n", 330 | "| [abc] | Matches either an a, b or c character |\n", 331 | "| [abcABC] | Matches either an a, A, b, B, c or C character |\n", 332 | "| [a-z] | Matches any characters between a and z, including a and z |\n", 333 | "| [A-Z] | Matches any characters between A and Z, including A and Z |\n", 334 | "| [a-zA-Z] | Matches any characters between a and z, including a and z, ignoring the case of the characters |\n", 335 | "| [0-9] | Matches any character which is a number between 0 and 9 |" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "### Meta sequences\n", 343 | "\n", 344 | "| Pattern | Equivalent to |\n", 345 | "|----------|------------------|\n", 346 | "| \\s | [ \\t\\n\\r\\f\\v] |\n", 347 | "| \\S | [^ \\t\\n\\r\\f\\v] |\n", 348 | "| \\d | [0-9] |\n", 349 | "| \\D | [^0-9] |\n", 350 | "| \\w | [a-zA-Z0-9_] |\n", 351 | "| \\W | [^a-zA-Z0-9_] |" 352 | ] 353 | },
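The meta sequences above aren't exercised anywhere in this notebook, so here is a small usage sketch (the example strings are our own, reusing the find_pattern helper defined above):

    print(find_pattern("order 66", r"\d+"))         # \d+ matches the digits '66'
    print(find_pattern("hello world", r"\w+\s\w+")) # word chars, whitespace, word chars
    print(find_pattern("hello", r"\d"))             # no digit present, so 'Not Found!'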
 354 | { 355 | "cell_type": "markdown", 356 | "metadata": {}, 357 | "source": [ 358 | "### Greedy vs non-greedy regex" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 14, 364 | "metadata": {}, 365 | "outputs": [ 366 | { 367 | "name": "stdout", 368 | "output_type": "stream", 369 | "text": [ 370 | "\n" 371 | ] 372 | } 373 | ], 374 | "source": [ 375 | "print(find_pattern(\"aabbbbbb\", \"ab{3,5}\")) # return if a is followed by b 3-5 times - GREEDY, matches as many b's as possible" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 15, 381 | "metadata": {}, 382 | "outputs": [ 383 | { 384 | "name": "stdout", 385 | "output_type": "stream", 386 | "text": [ 387 | "\n" 388 | ] 389 | } 390 | ], 391 | "source": [ 392 | "print(find_pattern(\"aabbbbbb\", \"ab{3,5}?\")) # return if a is followed by b 3-5 times - NON-GREEDY, stops at the minimum of 3 b's" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 16, 398 | "metadata": {}, 399 | "outputs": [ 400 | { 401 | "name": "stdout", 402 | "output_type": "stream", 403 | "text": [ 404 | "\n" 405 | ] 406 | } 407 | ], 408 | "source": [ 409 | "# Example of HTML code\n", 410 | "print(re.search(\"<.*>\",\"<html><title>My Page</title></html>\"))" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 17, 416 | "metadata": {}, 417 | "outputs": [ 418 | { 419 | "name": "stdout", 420 | "output_type": "stream", 421 | "text": [ 422 | "\n" 423 | ] 424 | } 425 | ], 426 | "source": [ 427 | "# Example of HTML code\n", 428 | "print(re.search(\"<.*?>\",\"<html><title>My Page</title></html>\"))" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": {}, 434 | "source": [ 435 | "### The five most important re functions that you would be required to use most of the time are\n", 436 | "\n", 437 | "match() Determine if the RE matches at the beginning of the string\n", 438 | "\n", 439 | "search() Scan through a string, looking for any location where this RE matches\n", 440 | "\n", 441 | "findall() Find all the substrings where the RE matches, and return them as a list\n", 442 | "\n", 443 | "finditer() Find all substrings where the RE matches and return them as an iterator\n", 444 | "\n", 445 | "sub() Find all substrings where the RE matches and substitute them with the given string" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 18, 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [ 454 | "# this function uses re.match(); let's see how it differs from re.search()\n", 455 | "def match_pattern(text, patterns):\n", 456 | "    if re.match(patterns, text):\n", 457 | "        return re.match(patterns, text)\n", 458 | "    else:\n", 459 | "        return ('Not found!')" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": 19, 465 | "metadata": {}, 466 | "outputs": [ 467 | { 468 | "name": "stdout", 469 | "output_type": "stream", 470 | "text": 
462 | { 463 | "cell_type": "code", 464 | "execution_count": 19, 465 | "metadata": {}, 466 | "outputs": [ 467 | { 468 | "name": "stdout", 469 | "output_type": "stream", 470 | "text": [ 471 | "<re.Match object; span=(1, 3), match='bb'>\n" 472 | ] 473 | } 474 | ], 475 | "source": [ 476 | "print(find_pattern(\"abbc\", \"b+\"))" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 20, 482 | "metadata": {}, 483 | "outputs": [ 484 | { 485 | "name": "stdout", 486 | "output_type": "stream", 487 | "text": [ 488 | "Not found!\n" 489 | ] 490 | } 491 | ], 492 | "source": [ 493 | "print(match_pattern(\"abbc\", \"b+\"))  # 'abbc' does not start with 'b', so re.match() fails" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 21, 499 | "metadata": {}, 500 | "outputs": [ 501 | { 502 | "name": "stdout", 503 | "output_type": "stream", 504 | "text": [ 505 | "21 Ramakrishna Rd\n" 506 | ] 507 | } 508 | ], 509 | "source": [ 510 | "## Example usage of the sub() function. Replace 'Road' with 'Rd'.\n", 511 | "\n", 512 | "street = '21 Ramakrishna Road'\n", 513 | "print(re.sub('Road', 'Rd', street))" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": 22, 519 | "metadata": {}, 520 | "outputs": [ 521 | { 522 | "name": "stdout", 523 | "output_type": "stream", 524 | "text": [ 525 | "21 Rd Rd\n" 526 | ] 527 | } 528 | ], 529 | "source": [ 530 | "# note: 'R\w+' also swallows 'Ramakrishna'; a word-bounded pattern such as r'\bRoad\b' would replace only the word 'Road'\n", "print(re.sub('R\w+', 'Rd', street))" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": 23, 536 | "metadata": {}, 537 | "outputs": [ 538 | { 539 | "name": "stdout", 540 | "output_type": "stream", 541 | "text": [ 542 | "START - 12 END - 20\n", 543 | "START - 42 END - 50\n" 544 | ] 545 | } 546 | ], 547 | "source": [ 548 | "## Example usage of finditer(). Find all occurrences of the word 'festival' in the given sentence\n", 549 | "\n", 550 | "text = 'Diwali is a festival of lights, Holi is a festival of colors!'\n", 551 | "pattern = 'festival'\n", 552 | "for match in re.finditer(pattern, text):\n", 553 | "    print('START -', match.start(), end=\" \")\n", 554 | "    print('END -', match.end())" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": 24, 560 | "metadata": {}, 561 | "outputs": [ 562 | { 563 | "name": "stdout", 564 | "output_type": "stream", 565 | "text": [ 566 | "[('2017', '10', '28')]\n" 567 | ] 568 | } 569 | ], 570 | "source": [ 571 | "# Example usage of findall(). 
In the given URL find all dates\n", 572 | "url = \"http://www.telegraph.co.uk/formula-1/2017/10/28/mexican-grand-prix-2017-time-does-start-tv-channel-odds-lewisl/2017/05/12\"\n", 573 | "date_regex = '/(\\d{4})/(\\d{1,2})/(\\d{1,2})/'\n", 574 | "print(re.findall(date_regex, url))" 575 | ] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": 25, 580 | "metadata": {}, 581 | "outputs": [ 582 | { 583 | "name": "stdout", 584 | "output_type": "stream", 585 | "text": [ 586 | "/2017/10/28/\n" 587 | ] 588 | } 589 | ], 590 | "source": [ 591 | "## Exploring Groups\n", 592 | "m1 = re.search(date_regex, url)\n", 593 | "print(m1.group()) ## print the matched group" 594 | ] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "execution_count": 26, 599 | "metadata": {}, 600 | "outputs": [ 601 | { 602 | "name": "stdout", 603 | "output_type": "stream", 604 | "text": [ 605 | "2017\n" 606 | ] 607 | } 608 | ], 609 | "source": [ 610 | "print(m1.group(1)) # - Print first group" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": 27, 616 | "metadata": {}, 617 | "outputs": [ 618 | { 619 | "name": "stdout", 620 | "output_type": "stream", 621 | "text": [ 622 | "10\n" 623 | ] 624 | } 625 | ], 626 | "source": [ 627 | "print(m1.group(2)) # - Print second group" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": 28, 633 | "metadata": {}, 634 | "outputs": [ 635 | { 636 | "name": "stdout", 637 | "output_type": "stream", 638 | "text": [ 639 | "28\n" 640 | ] 641 | } 642 | ], 643 | "source": [ 644 | "print(m1.group(3)) # - Print third group" 645 | ] 646 | }, 647 | { 648 | "cell_type": "code", 649 | "execution_count": 29, 650 | "metadata": {}, 651 | "outputs": [ 652 | { 653 | "name": "stdout", 654 | "output_type": "stream", 655 | "text": [ 656 | "/2017/10/28/\n" 657 | ] 658 | } 659 | ], 660 | "source": [ 661 | "print(m1.group(0)) # - Print zero or the default group" 662 | ] 663 | } 664 | ], 665 | "metadata": { 666 | "kernelspec": { 667 | "display_name": "Python 3", 668 | "language": "python", 669 | "name": "python3" 670 | }, 671 | "language_info": { 672 | "codemirror_mode": { 673 | "name": "ipython", 674 | "version": 3 675 | }, 676 | "file_extension": ".py", 677 | "mimetype": "text/x-python", 678 | "name": "python", 679 | "nbconvert_exporter": "python", 680 | "pygments_lexer": "ipython3", 681 | "version": "3.7.0" 682 | } 683 | }, 684 | "nbformat": 4, 685 | "nbformat_minor": 2 686 | } 687 | -------------------------------------------------------------------------------- /Basic_Lexical_Processing/bag+of+words.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "kernelspec": { 6 | "display_name": "Python 3", 7 | "language": "python", 8 | "name": "python3" 9 | }, 10 | "language_info": { 11 | "codemirror_mode": { 12 | "name": "ipython", 13 | "version": 3 14 | }, 15 | "file_extension": ".py", 16 | "mimetype": "text/x-python", 17 | "name": "python", 18 | "nbconvert_exporter": "python", 19 | "pygments_lexer": "ipython3", 20 | "version": "3.6.5" 21 | }, 22 | "colab": { 23 | "name": "bag+of+words (1).ipynb", 24 | "provenance": [], 25 | "collapsed_sections": [], 26 | "toc_visible": true 27 | } 28 | }, 29 | "cells": [ 30 | { 31 | "cell_type": "markdown", 32 | "metadata": { 33 | "id": "hk69s6t2XztA" 34 | }, 35 | "source": [ 36 | "### Bag of words model" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "metadata": { 42 | "collapsed": true, 43 | "id": "he5k-KJWXztO" 
44 | }, 45 | "source": [ 46 | "# load all necessary libraries\n", 47 | "import pandas as pd\n", 48 | "from nltk.tokenize import word_tokenize\n", 49 | "from nltk.corpus import stopwords\n", 50 | "from sklearn.feature_extraction.text import CountVectorizer\n", 51 | "\n", 52 | "pd.set_option('max_colwidth', 100)" 53 | ], 54 | "execution_count": 3, 55 | "outputs": [] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": { 60 | "id": "c5I-jasIXztR" 61 | }, 62 | "source": [ 63 | "#### Let's build a basic bag of words model on three sample documents" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "metadata": { 69 | "colab": { 70 | "base_uri": "https://localhost:8080/" 71 | }, 72 | "id": "QxBuvHZjXztR", 73 | "outputId": "e803296f-fe4e-4a20-f07b-cf78593c2ed6" 74 | }, 75 | "source": [ 76 | "documents = [\"Gangs of Wasseypur is a great movie.\", \"The success of a movie depends on the performance of the actors.\", \"There are no new movies releasing this week.\"]\n", 77 | "print(documents)" 78 | ], 79 | "execution_count": 4, 80 | "outputs": [ 81 | { 82 | "output_type": "stream", 83 | "text": [ 84 | "['Gangs of Wasseypur is a great movie.', 'The success of a movie depends on the performance of the actors.', 'There are no new movies releasing this week.']\n" 85 | ], 86 | "name": "stdout" 87 | } 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "metadata": { 93 | "colab": { 94 | "base_uri": "https://localhost:8080/" 95 | }, 96 | "id": "TiZl45PlXztT", 97 | "outputId": "96504c23-bb0f-4621-8cbc-94e04c1fe222" 98 | }, 99 | "source": [ 100 | "import nltk\n", 101 | "nltk.download('punkt')\n", 102 | "nltk.download('stopwords')\n", 103 | "\n", 104 | "def preprocess(document):\n", 105 | "    'changes document to lower case and removes stopwords'\n", 106 | "\n", 107 | "    # change sentence to lower case\n", 108 | "    document = document.lower()\n", 109 | "\n", 110 | "    # tokenize into words\n", 111 | "    words = word_tokenize(document)\n", 112 | "\n", 113 | "    # remove stop words\n", 114 | "    words = [word for word in words if word not in stopwords.words(\"english\")]\n", 115 | "\n", 116 | "    # join words to make sentence\n", 117 | "    document = \" \".join(words)\n", 118 | " \n", 119 | "    return document\n", 120 | "\n", 121 | "documents = [preprocess(document) for document in documents]\n", 122 | "print(documents)\n" 123 | ], 124 | "execution_count": 5, 125 | "outputs": [ 126 | { 127 | "output_type": "stream", 128 | "text": [ 129 | "[nltk_data] Downloading package punkt to /root/nltk_data...\n", 130 | "[nltk_data] Unzipping tokenizers/punkt.zip.\n", 131 | "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", 132 | "[nltk_data] Unzipping corpora/stopwords.zip.\n", 133 | "['gangs wasseypur great movie .', 'success movie depends performance actors .', 'new movies releasing week .']\n" 134 | ], 135 | "name": "stdout" 136 | } 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": { 142 | "id": "lqtU89tUXztU" 143 | }, 144 | "source": [ 145 | "#### Creating bag of words model using count vectorizer function" 146 | ] 147 | },
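Before calling CountVectorizer, it helps to see what a bag-of-words matrix is mechanically: one row per document, one column per vocabulary term, each cell a count. Here is a minimal hand-rolled sketch with collections.Counter; it is illustrative only, and its printed rows should match the bow_model.toarray() output in the cells below:

import re
from collections import Counter

docs = ['gangs wasseypur great movie .',
        'success movie depends performance actors .',
        'new movies releasing week .']

# vocabulary: every word token across all documents, sorted alphabetically
vocab = sorted({tok for d in docs for tok in re.findall(r'\w+', d)})
print(vocab)

# one row per document, one column per vocabulary word
for d in docs:
    counts = Counter(re.findall(r'\w+', d))
    print([counts[v] for v in vocab])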
148 | { 149 | "cell_type": "code", 150 | "metadata": { 151 | "colab": { 152 | "base_uri": "https://localhost:8080/" 153 | }, 154 | "id": "-pU0GI7gXztV", 155 | "outputId": "bf5bfd07-79a4-49a8-b14f-1acccd1a425c" 156 | }, 157 | "source": [ 158 | "vectorizer = CountVectorizer()\n", 159 | "bow_model = vectorizer.fit_transform(documents)\n", 160 | "print(bow_model) # prints the (row, column) index and the count of every non-zero cell in the sparse matrix" 161 | ], 162 | "execution_count": 6, 163 | "outputs": [ 164 | { 165 | "output_type": "stream", 166 | "text": [ 167 | "  (0, 2)\t1\n", 168 | "  (0, 10)\t1\n", 169 | "  (0, 3)\t1\n", 170 | "  (0, 4)\t1\n", 171 | "  (1, 4)\t1\n", 172 | "  (1, 9)\t1\n", 173 | "  (1, 1)\t1\n", 174 | "  (1, 7)\t1\n", 175 | "  (1, 0)\t1\n", 176 | "  (2, 6)\t1\n", 177 | "  (2, 5)\t1\n", 178 | "  (2, 8)\t1\n", 179 | "  (2, 11)\t1\n" 180 | ], 181 | "name": "stdout" 182 | } 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "metadata": { 188 | "colab": { 189 | "base_uri": "https://localhost:8080/" 190 | }, 191 | "id": "9WO_W821XztW", 192 | "outputId": "92340ba9-0ca3-45ad-f4fd-678d3e1584bc" 193 | }, 194 | "source": [ 195 | "# print the full sparse matrix\n", 196 | "print(bow_model.toarray())" 197 | ], 198 | "execution_count": 7, 199 | "outputs": [ 200 | { 201 | "output_type": "stream", 202 | "text": [ 203 | "[[0 0 1 1 1 0 0 0 0 0 1 0]\n", 204 | " [1 1 0 0 1 0 0 1 0 1 0 0]\n", 205 | " [0 0 0 0 0 1 1 0 1 0 0 1]]\n" 206 | ], 207 | "name": "stdout" 208 | } 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "metadata": { 214 | "colab": { 215 | "base_uri": "https://localhost:8080/" 216 | }, 217 | "id": "itSbT_CrXztW", 218 | "outputId": "6d2e0b10-6418-446f-e809-474389e27653" 219 | }, 220 | "source": [ 221 | "print(bow_model.shape)\n", 222 | "print(vectorizer.get_feature_names())" 223 | ], 224 | "execution_count": 8, 225 | "outputs": [ 226 | { 227 | "output_type": "stream", 228 | "text": [ 229 | "(3, 12)\n", 230 | "['actors', 'depends', 'gangs', 'great', 'movie', 'movies', 'new', 'performance', 'releasing', 'success', 'wasseypur', 'week']\n" 231 | ], 232 | "name": "stdout" 233 | } 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": { 239 | "id": "9Q4L0Qg_XztX" 240 | }, 241 | "source": [ 242 | "### Let's create a bag of words model on the spam dataset." 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "metadata": { 248 | "colab": { 249 | "base_uri": "https://localhost:8080/" 250 | }, 251 | "id": "mRGAhBjaXztY", 252 | "outputId": "acda0870-01ca-48fe-acde-1aa6dc5f52b9" 253 | }, 254 | "source": [ 255 | "# load data\n", 256 | "spam = pd.read_csv(\"SMSSpamCollection.txt\", sep = \"\\t\", names=[\"label\", \"message\"])\n", 257 | "spam.shape" 258 | ], 259 | "execution_count": 9, 260 | "outputs": [ 261 | { 262 | "output_type": "execute_result", 263 | "data": { 264 | "text/plain": [ 265 | "(5572, 2)" 266 | ] 267 | }, 268 | "metadata": { 269 | "tags": [] 270 | }, 271 | "execution_count": 9 272 | } 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": { 278 | "id": "rqPrAVYSXztZ" 279 | }, 280 | "source": [ 281 | "##### Let's take a subset of the data (first 100 rows only) and create a bag of words model on that." 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "metadata": { 287 | "scrolled": true, 288 | "id": "6yKYa3_cXztZ", 289 | "colab": { 290 | "base_uri": "https://localhost:8080/" 291 | }, 292 | "outputId": "84763f1e-48e3-4200-8326-7f9de84e2a00" 293 | }, 294 | "source": [ 295 | "spam = spam.iloc[0:100,:]\n", 296 | "print(spam)" 297 | ], 298 | "execution_count": 10, 299 | "outputs": [ 300 | { 301 | "output_type": "stream", 302 | "text": [ 303 | "   label                                                                                             message\n", 304 | "0    ham  Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...\n", 305 | "1    ham  Ok lar... Joking wif u oni...\n", 306 | "2   spam  Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...\n", 307 | "3    ham  U dun say so early hor... 
U c already then say...\n", 308 | "4 ham Nah I don't think he goes to usf, he lives around here though\n", 309 | ".. ... ...\n", 310 | "95 spam Your free ringtone is waiting to be collected. Simply text the password \"MIX\" to 85069 to verify...\n", 311 | "96 ham Watching telugu movie..wat abt u?\n", 312 | "97 ham i see. When we finish we have loads of loans to pay\n", 313 | "98 ham Hi. Wk been ok - on hols now! Yes on for a bit of a run. Forgot that i have hairdressers appoint...\n", 314 | "99 ham I see a cup of coffee animation\n", 315 | "\n", 316 | "[100 rows x 2 columns]\n" 317 | ], 318 | "name": "stdout" 319 | } 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "metadata": { 325 | "id": "7a_4nefxXzta", 326 | "colab": { 327 | "base_uri": "https://localhost:8080/" 328 | }, 329 | "outputId": "22a56995-1f40-4328-a735-41c75cc92461" 330 | }, 331 | "source": [ 332 | "# extract the messages from the dataframe\n", 333 | "messages = spam.message\n", 334 | "print(messages)" 335 | ], 336 | "execution_count": 11, 337 | "outputs": [ 338 | { 339 | "output_type": "stream", 340 | "text": [ 341 | "0 Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...\n", 342 | "1 Ok lar... Joking wif u oni...\n", 343 | "2 Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...\n", 344 | "3 U dun say so early hor... U c already then say...\n", 345 | "4 Nah I don't think he goes to usf, he lives around here though\n", 346 | " ... \n", 347 | "95 Your free ringtone is waiting to be collected. Simply text the password \"MIX\" to 85069 to verify...\n", 348 | "96 Watching telugu movie..wat abt u?\n", 349 | "97 i see. When we finish we have loads of loans to pay\n", 350 | "98 Hi. Wk been ok - on hols now! Yes on for a bit of a run. Forgot that i have hairdressers appoint...\n", 351 | "99 I see a cup of coffee animation\n", 352 | "Name: message, Length: 100, dtype: object\n" 353 | ], 354 | "name": "stdout" 355 | } 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "metadata": { 361 | "id": "OkRKQc9pXzta", 362 | "colab": { 363 | "base_uri": "https://localhost:8080/" 364 | }, 365 | "outputId": "16b3d8f8-a568-43eb-b12a-fafa4e1ff9b7" 366 | }, 367 | "source": [ 368 | "# convert messages into list\n", 369 | "messages = [message for message in messages]\n", 370 | "print(messages)" 371 | ], 372 | "execution_count": 12, 373 | "outputs": [ 374 | { 375 | "output_type": "stream", 376 | "text": [ 377 | "['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'Ok lar... Joking wif u oni...', \"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\", 'U dun say so early hor... U c already then say...', \"Nah I don't think he goes to usf, he lives around here though\", \"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv\", 'Even my brother is not like to speak with me. They treat me like aids patent.', \"As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune\", 'WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.', 'Had your mobile 11 months or more? 
U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030', \"I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today.\", 'SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info', 'URGENT! You have won a 1 week FREE membership in our £100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18', \"I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.\", 'I HAVE A DATE ON SUNDAY WITH WILL!!', 'XXXMobileMovieClub: To use your credit, click the WAP link in the next txt message or click here>> http://wap. xxxmobilemovieclub.com?n=QJKGIGHJJGCBL', \"Oh k...i'm watching here:)\", 'Eh u remember how 2 spell his name... Yes i did. He v naughty make until i v wet.', 'Fine if that\\x92s the way u feel. That\\x92s the way its gota b', 'England v Macedonia - dont miss the goals/team news. Txt ur national team to 87077 eg ENGLAND to 87077 Try:WALES, SCOTLAND 4txt/ú1.20 POBOXox36504W45WQ 16+', 'Is that seriously how you spell his name?', 'I‘m going to try for 2 months ha ha only joking', 'So ü pay first lar... Then when is da stock comin...', 'Aft i finish my lunch then i go str down lor. Ard 3 smth lor. U finish ur lunch already?', 'Ffffffffff. Alright no way I can meet up with you sooner?', \"Just forced myself to eat a slice. I'm really not hungry tho. This sucks. Mark is getting worried. He knows I'm sick when I turn down pizza. Lol\", 'Lol your always so convincing.', \"Did you catch the bus ? Are you frying an egg ? Did you make a tea? Are you eating your mom's left over dinner ? Do you feel my Love ?\", \"I'm back & we're packing the car now, I'll let you know if there's room\", 'Ahhh. Work. I vaguely remember that! What does it feel like? Lol', \"Wait that's still not all that clear, were you not sure about me being sarcastic or that that's why x doesn't want to live with us\", \"Yeah he got in at 2 and was v apologetic. n had fallen out and she was actin like spoilt child and he got caught up in that. Till 2! But we won't go there! Not doing too badly cheers. You? \", 'K tell me anything about you.', 'For fear of fainting with the of all that housework you just did? Quick have a cuppa', 'Thanks for your subscription to Ringtone UK your mobile will be charged £5/month Please confirm by replying YES or NO. If you reply NO you will not be charged', 'Yup... Ok i go home look at the timings then i msg ü again... Xuhui going to learn on 2nd may too but her lesson is at 8am', \"Oops, I'll let you know when my roommate's done\", 'I see the letter B on my car', 'Anything lor... U decide...', \"Hello! How's you and how did saturday go? I was just texting to see if you'd decided to do anything tomo. Not that i'm trying to invite myself or anything!\", 'Pls go ahead with watts. I just wanted to be sure. Do have a great weekend. Abiola', 'Did I forget to tell you ? I want you , I need you, I crave you ... But most of all ... I love you my sweet Arabian steed ... Mmmmmm ... Yummy', '07732584351 - Rodger Burns - MSG = We tried to call you re your reply to our sms for a free nokia mobile + free camcorder. Please call now 08000930705 for delivery tomorrow', 'WHO ARE YOU SEEING?', 'Great! I hope you like your man well endowed. 
I am <#> inches...', 'No calls..messages..missed calls', \"Didn't you get hep b immunisation in nigeria.\", 'Fair enough, anything going on?', \"Yeah hopefully, if tyler can't do it I could maybe ask around a bit\", \"U don't know how stubborn I am. I didn't even want to go to the hospital. I kept telling Mark I'm not a weak sucker. Hospitals are for weak suckers.\", 'What you thinked about me. First time you saw me in class.', 'A gram usually runs like <#> , a half eighth is smarter though and gets you almost a whole second gram for <#>', \"K fyi x has a ride early tomorrow morning but he's crashing at our place tonight\", 'Wow. I never realized that you were so embarassed by your accomodations. I thought you liked it, since i was doing the best i could and you always seemed so happy about \"the cave\". I\\'m sorry I didn\\'t and don\\'t have more to give. I\\'m sorry i offered. I\\'m sorry your room was so embarassing.', 'SMS. ac Sptv: The New Jersey Devils and the Detroit Red Wings play Ice Hockey. Correct or Incorrect? End? Reply END SPTV', 'Do you know what Mallika Sherawat did yesterday? Find out now @ <URL>', 'Congrats! 1 year special cinema pass for 2 is yours. call 09061209465 now! C Suprman V, Matrix3, StarWars3, etc all 4 FREE! bx420-ip4-5we. 150pm. Dont miss out! ', \"Sorry, I'll call later in meeting.\", 'Tell where you reached', 'Yes..gauti and sehwag out of odi series.', \"Your gonna have to pick up a $1 burger for yourself on your way home. I can't even move. Pain is killing me.\", 'Ha ha ha good joke. Girls are situation seekers.', 'Its a part of checking IQ', 'Sorry my roommates took forever, it ok if I come by now?', 'Ok lar i double check wif da hair dresser already he said wun cut v short. He said will cut until i look nice.', 'As a valued customer, I am pleased to advise you that following recent review of your Mob No. you are awarded with a £1500 Bonus Prize, call 09066364589', 'Today is \"song dedicated day..\" Which song will u dedicate for me? Send this to all ur valuable frnds but first rply me...', 'Urgent UR awarded a complimentary trip to EuroDisinc Trav, Aco&Entry41 Or £1000. To claim txt DIS to 87121 18+6*£1.50(moreFrmMob. ShrAcomOrSglSuplt)10, LS1 3AJ', 'Did you hear about the new \"Divorce Barbie\"? It comes with all of Ken\\'s stuff!', 'I plane to give on this month end.', 'Wah lucky man... Then can save money... Hee...', 'Finished class where are you.', 'HI BABE IM AT HOME NOW WANNA DO SOMETHING? XX', 'K..k:)where are you?how did you performed?', 'U can call me now...', 'I am waiting machan. Call me once you free.', 'Thats cool. i am a gentleman and will treat you with dignity and respect.', 'I like you peoples very much:) but am very shy pa.', 'Does not operate after <#> or what', \"Its not the same here. Still looking for a job. How much do Ta's earn there.\", \"Sorry, I'll call later\", 'K. Did you call me just now ah? ', 'Ok i am on the way to home hi hi', 'You will be in the place of that man', 'Yup next stop.', \"I call you later, don't have network. If urgnt, sms me.\", \"For real when u getting on yo? I only need 2 more tickets and one more jacket and I'm done. I already used all my multis.\", \"Yes I started to send requests to make it but pain came back so I'm back in bed. Double coins at the factory too. I gotta cash in all my nitros.\", \"I'm really not up to it still tonight babe\", 'Ela kano.,il download, come wen ur free..', 'Yeah do! Don‘t stand to close tho- you‘ll catch something!', \"Sorry to be a pain. 
Is it ok if we meet another night? I spent late afternoon in casualty and that means i haven't done any of y stuff42moro and that includes all my time sheets and that. Sorry. \", 'Smile in Pleasure Smile in Pain Smile when trouble pours like Rain Smile when sum1 Hurts U Smile becoz SOMEONE still Loves to see u Smiling!!', 'Please call our customer service representative on 0800 169 6031 between 10am-9pm as you have WON a guaranteed £1000 cash or £5000 prize!', 'Havent planning to buy later. I check already lido only got 530 show in e afternoon. U finish work already?', 'Your free ringtone is waiting to be collected. Simply text the password \"MIX\" to 85069 to verify. Get Usher and Britney. FML, PO Box 5249, MK17 92H. 450Ppw 16', 'Watching telugu movie..wat abt u?', 'i see. When we finish we have loads of loans to pay', 'Hi. Wk been ok - on hols now! Yes on for a bit of a run. Forgot that i have hairdressers appointment at four so need to get home n shower beforehand. Does that cause prob for u?\"', 'I see a cup of coffee animation']\n" 378 | ], 379 | "name": "stdout" 380 | } 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "metadata": { 386 | "id": "jxXtqpIbXztb", 387 | "colab": { 388 | "base_uri": "https://localhost:8080/" 389 | }, 390 | "outputId": "1f261e69-cfaa-4d32-a1a7-68d07728b49e" 391 | }, 392 | "source": [ 393 | "# preprocess messages using the preprocess function\n", 394 | "messages = [preprocess(message) for message in messages]\n", 395 | "print(messages)" 396 | ], 397 | "execution_count": 13, 398 | "outputs": [ 399 | { 400 | "output_type": "stream", 401 | "text": [ 402 | "['go jurong point , crazy.. available bugis n great world la e buffet ... cine got amore wat ...', 'ok lar ... joking wif u oni ...', \"free entry 2 wkly comp win fa cup final tkts 21st may 2005. text fa 87121 receive entry question ( std txt rate ) & c 's apply 08452810075over18 's\", 'u dun say early hor ... u c already say ...', \"nah n't think goes usf , lives around though\", \"freemsg hey darling 's 3 week 's word back ! 'd like fun still ? tb ok ! xxx std chgs send , £1.50 rcv\", 'even brother like speak . treat like aids patent .', \"per request 'melle melle ( oru minnaminunginte nurungu vettam ) ' set callertune callers . press *9 copy friends callertune\", 'winner ! ! valued network customer selected receivea £900 prize reward ! claim call 09061701461. claim code kl341 . valid 12 hours .', 'mobile 11 months ? u r entitled update latest colour mobiles camera free ! call mobile update co free 08002986030', \"'m gon na home soon n't want talk stuff anymore tonight , k ? 've cried enough today .\", 'six chances win cash ! 100 20,000 pounds txt > csh11 send 87575. cost 150p/day , 6days , 16+ tsandcs apply reply hl 4 info', 'urgent ! 1 week free membership £100,000 prize jackpot ! txt word : claim : 81010 & c www.dbuk.net lccltd pobox 4403ldnw1a7rw18', \"'ve searching right words thank breather . promise wont take help granted fulfil promise . wonderful blessing times .\", 'date sunday ! !', 'xxxmobilemovieclub : use credit , click wap link next txt message click > > http : //wap . xxxmobilemovieclub.com ? n=qjkgighjjgcbl', \"oh k ... 'm watching : )\", 'eh u remember 2 spell name ... yes . v naughty make v wet .', 'fine that\\x92s way u feel . that\\x92s way gota b', 'england v macedonia - dont miss goals/team news . 
txt ur national team 87077 eg england 87077 try : wales , scotland 4txt/ú1.20 poboxox36504w45wq 16+', 'seriously spell name ?', '‘ going try 2 months ha ha joking', 'ü pay first lar ... da stock comin ...', 'aft finish lunch go str lor . ard 3 smth lor . u finish ur lunch already ?', 'ffffffffff . alright way meet sooner ?', \"forced eat slice . 'm really hungry tho . sucks . mark getting worried . knows 'm sick turn pizza . lol\", 'lol always convincing .', \"catch bus ? frying egg ? make tea ? eating mom 's left dinner ? feel love ?\", \"'m back & amp ; 're packing car , 'll let know 's room\", 'ahhh . work . vaguely remember ! feel like ? lol', \"wait 's still clear , sure sarcastic 's x n't want live us\", \"yeah got 2 v apologetic . n fallen actin like spoilt child got caught . till 2 ! wo n't go ! badly cheers . ?\", 'k tell anything .', 'fear fainting housework ? quick cuppa', 'thanks subscription ringtone uk mobile charged £5/month please confirm replying yes . reply charged', 'yup ... ok go home look timings msg ü ... xuhui going learn 2nd may lesson 8am', \"oops , 'll let know roommate 's done\", 'see letter b car', 'anything lor ... u decide ...', \"hello ! 's saturday go ? texting see 'd decided anything tomo . 'm trying invite anything !\", 'pls go ahead watts . wanted sure . great weekend . abiola', 'forget tell ? want , need , crave ... ... love sweet arabian steed ... mmmmmm ... yummy', '07732584351 - rodger burns - msg = tried call reply sms free nokia mobile + free camcorder . please call 08000930705 delivery tomorrow', 'seeing ?', 'great ! hope like man well endowed . & lt ; # & gt ; inches ...', 'calls..messages..missed calls', \"n't get hep b immunisation nigeria .\", 'fair enough , anything going ?', \"yeah hopefully , tyler ca n't could maybe ask around bit\", \"u n't know stubborn . n't even want go hospital . kept telling mark 'm weak sucker . hospitals weak suckers .\", 'thinked . first time saw class .', 'gram usually runs like & lt ; # & gt ; , half eighth smarter though gets almost whole second gram & lt ; # & gt ;', \"k fyi x ride early tomorrow morning 's crashing place tonight\", \"wow . never realized embarassed accomodations . thought liked , since best could always seemed happy `` cave '' . 'm sorry n't n't give . 'm sorry offered . 'm sorry room embarassing .\", 'sms . ac sptv : new jersey devils detroit red wings play ice hockey . correct incorrect ? end ? reply end sptv', 'know mallika sherawat yesterday ? find @ & lt ; url & gt ;', 'congrats ! 1 year special cinema pass 2 . call 09061209465 ! c suprman v , matrix3 , starwars3 , etc 4 free ! bx420-ip4-5we . 150pm . dont miss !', \"sorry , 'll call later meeting .\", 'tell reached', 'yes..gauti sehwag odi series .', \"gon na pick $ 1 burger way home . ca n't even move . pain killing .\", 'ha ha ha good joke . girls situation seekers .', 'part checking iq', 'sorry roommates took forever , ok come ?', 'ok lar double check wif da hair dresser already said wun cut v short . said cut look nice .', 'valued customer , pleased advise following recent review mob . awarded £1500 bonus prize , call 09066364589', \"today `` song dedicated day.. '' song u dedicate ? send ur valuable frnds first rply ...\", 'urgent ur awarded complimentary trip eurodisinc trav , aco & entry41 £1000 . claim txt dis 87121 18+6*£1.50 ( morefrmmob . shracomorsglsuplt ) 10 , ls1 3aj', \"hear new `` divorce barbie '' ? comes ken 's stuff !\", 'plane give month end .', 'wah lucky man ... save money ... 
hee ...', 'finished class .', 'hi babe im home wan na something ? xx', 'k..k : ) ? performed ?', 'u call ...', 'waiting machan . call free .', 'thats cool . gentleman treat dignity respect .', 'like peoples much : ) shy pa .', 'operate & lt ; # & gt ;', \". still looking job . much ta 's earn .\", \"sorry , 'll call later\", 'k. call ah ?', 'ok way home hi hi', 'place man', 'yup next stop .', \"call later , n't network . urgnt , sms .\", \"real u getting yo ? need 2 tickets one jacket 'm done . already used multis .\", \"yes started send requests make pain came back 'm back bed . double coins factory . got ta cash nitros .\", \"'m really still tonight babe\", 'ela kano. , il download , come wen ur free..', 'yeah ! ‘ stand close tho- ‘ catch something !', \"sorry pain . ok meet another night ? spent late afternoon casualty means n't done stuff42moro includes time sheets . sorry .\", 'smile pleasure smile pain smile trouble pours like rain smile sum1 hurts u smile becoz someone still loves see u smiling ! !', 'please call customer service representative 0800 169 6031 10am-9pm guaranteed £1000 cash £5000 prize !', 'havent planning buy later . check already lido got 530 show e afternoon . u finish work already ?', \"free ringtone waiting collected . simply text password `` mix '' 85069 verify . get usher britney . fml , po box 5249 , mk17 92h . 450ppw 16\", 'watching telugu movie..wat abt u ?', 'see . finish loads loans pay', \"hi . wk ok - hols ! yes bit run . forgot hairdressers appointment four need get home n shower beforehand . cause prob u ? ''\", 'see cup coffee animation']\n" 403 | ], 404 | "name": "stdout" 405 | } 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "metadata": { 411 | "collapsed": true, 412 | "id": "yZL5fre7Xztc" 413 | }, 414 | "source": [ 415 | "# bag of words model\n", 416 | "vectorizer = CountVectorizer()\n", 417 | "bow_model = vectorizer.fit_transform(messages)" 418 | ], 419 | "execution_count": 14, 420 | "outputs": [] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "metadata": { 425 | "id": "iDTNt7-zXztc", 426 | "colab": { 427 | "base_uri": "https://localhost:8080/", 428 | "height": 439 429 | }, 430 | "outputId": "2e44995b-e45a-40a5-e210-d031edb93b5c" 431 | }, 432 | "source": [ 433 | "# look at the dataframe\n", 434 | "pd.DataFrame(bow_model.toarray(), columns = vectorizer.get_feature_names())" 435 | ], 436 | "execution_count": 15, 437 | "outputs": [ 438 | { 439 | "output_type": "execute_result", 440 | "data": { 441 | "text/html": [ 442 | "
\n", 443 | "\n", 456 | "\n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 
728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 
| " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 
1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | " \n", 1420 | " \n", 1421 | " \n", 1422 | " \n", 1423 | " \n", 1424 | " \n", 1425 | " \n", 1426 | " \n", 1427 | " \n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | " \n", 1435 | " \n", 1436 | " \n", 1437 | " \n", 1438 | " \n", 1439 | " \n", 1440 | " \n", 1441 | " \n", 1442 | " \n", 1443 | " \n", 1444 | " \n", 1445 | " \n", 1446 | " \n", 1447 | " \n", 1448 | " \n", 1449 | " \n", 1450 | " \n", 1451 | " \n", 1452 | " \n", 1453 | " \n", 1454 | " \n", 1455 | " \n", 1456 | " \n", 1457 | " \n", 1458 | " \n", 1459 | " \n", 1460 | " \n", 1461 | " \n", 1462 | " \n", 1463 | " \n", 1464 | " \n", 1465 | " \n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | "
000077325843510800080009307050800298603008452810075over1809061209465090617014610906636458910100100010am11121500150p150pm161691820200521st2nd3aj4403ldnw1a7rw18450ppw4txt50500052495305we60316days81010850698707787121...watwatchingwattswayweakweekweekendwellwenwetwholewifwinwingswinnerwkwklywowonderfulwontwordwordsworkworldworriedwowwunwwwxuhuixxxxxxxxmobilemovieclubyeahyearyesyesterdayyoyummyyupú1
00000000000000000000000000000000000000000...1000000000000000000000010000000000000000
10000000000000000000000000000000000000000...0000000000010000000000000000000000000000
20000010000000000000000110000000000000001...0000000000001000100000000000000000000000
30000000000000000000000000000000000000000...0000000000000000000000000000000000000000
40000000000000000000000000000000000000000...0000000000000000000000000000000000000000
......................................................................................................................................................................................................................................................
950000000000000000001000000001000100000100...0000000000000000000000000000000000000000
960000000000000000000000000000000000000000...1100000000000000000000000000000000000000
970000000000000000000000000000000000000000...0000000000000000000000000000000000000000
980000000000000000000000000000000000000000...0000000000000001000000000000000000100000
990000000000000000000000000000000000000000...0000000000000000000000000000000000000000
\n", 1470 | "

100 rows × 640 columns

\n", 1471 | "
" 1472 | ], 1473 | "text/plain": [ 1474 | " 000 07732584351 0800 08000930705 ... yo yummy yup ú1\n", 1475 | "0 0 0 0 0 ... 0 0 0 0\n", 1476 | "1 0 0 0 0 ... 0 0 0 0\n", 1477 | "2 0 0 0 0 ... 0 0 0 0\n", 1478 | "3 0 0 0 0 ... 0 0 0 0\n", 1479 | "4 0 0 0 0 ... 0 0 0 0\n", 1480 | ".. ... ... ... ... ... .. ... ... ..\n", 1481 | "95 0 0 0 0 ... 0 0 0 0\n", 1482 | "96 0 0 0 0 ... 0 0 0 0\n", 1483 | "97 0 0 0 0 ... 0 0 0 0\n", 1484 | "98 0 0 0 0 ... 0 0 0 0\n", 1485 | "99 0 0 0 0 ... 0 0 0 0\n", 1486 | "\n", 1487 | "[100 rows x 640 columns]" 1488 | ] 1489 | }, 1490 | "metadata": { 1491 | "tags": [] 1492 | }, 1493 | "execution_count": 15 1494 | } 1495 | ] 1496 | }, 1497 | { 1498 | "cell_type": "code", 1499 | "metadata": { 1500 | "id": "dVDxYkp4Xztd", 1501 | "colab": { 1502 | "base_uri": "https://localhost:8080/" 1503 | }, 1504 | "outputId": "c99ab8c9-cbbf-4e5a-ff06-1eca085c2138" 1505 | }, 1506 | "source": [ 1507 | "print(vectorizer.get_feature_names())" 1508 | ], 1509 | "execution_count": 16, 1510 | "outputs": [ 1511 | { 1512 | "output_type": "stream", 1513 | "text": [ 1514 | "['000', '07732584351', '0800', '08000930705', '08002986030', '08452810075over18', '09061209465', '09061701461', '09066364589', '10', '100', '1000', '10am', '11', '12', '1500', '150p', '150pm', '16', '169', '18', '20', '2005', '21st', '2nd', '3aj', '4403ldnw1a7rw18', '450ppw', '4txt', '50', '5000', '5249', '530', '5we', '6031', '6days', '81010', '85069', '87077', '87121', '87575', '8am', '900', '92h', '9pm', 'abiola', 'abt', 'ac', 'accomodations', 'aco', 'actin', 'advise', 'aft', 'afternoon', 'ah', 'ahead', 'ahhh', 'aids', 'almost', 'already', 'alright', 'always', 'amore', 'amp', 'animation', 'another', 'anymore', 'anything', 'apologetic', 'apply', 'appointment', 'arabian', 'ard', 'around', 'ask', 'available', 'awarded', 'babe', 'back', 'badly', 'barbie', 'becoz', 'bed', 'beforehand', 'best', 'bit', 'blessing', 'bonus', 'box', 'breather', 'britney', 'brother', 'buffet', 'bugis', 'burger', 'burns', 'bus', 'buy', 'bx420', 'ca', 'call', 'callers', 'callertune', 'calls', 'camcorder', 'came', 'camera', 'car', 'cash', 'casualty', 'catch', 'caught', 'cause', 'cave', 'chances', 'charged', 'check', 'checking', 'cheers', 'chgs', 'child', 'cine', 'cinema', 'claim', 'class', 'clear', 'click', 'close', 'co', 'code', 'coffee', 'coins', 'collected', 'colour', 'com', 'come', 'comes', 'comin', 'comp', 'complimentary', 'confirm', 'congrats', 'convincing', 'cool', 'copy', 'correct', 'cost', 'could', 'crashing', 'crave', 'crazy', 'credit', 'cried', 'csh11', 'cup', 'cuppa', 'customer', 'cut', 'da', 'darling', 'date', 'day', 'dbuk', 'decide', 'decided', 'dedicate', 'dedicated', 'delivery', 'detroit', 'devils', 'dignity', 'dinner', 'dis', 'divorce', 'done', 'dont', 'double', 'download', 'dresser', 'dun', 'early', 'earn', 'eat', 'eating', 'eg', 'egg', 'eh', 'eighth', 'ela', 'embarassed', 'embarassing', 'end', 'endowed', 'england', 'enough', 'entitled', 'entry', 'entry41', 'etc', 'eurodisinc', 'even', 'fa', 'factory', 'fainting', 'fair', 'fallen', 'fear', 'feel', 'ffffffffff', 'final', 'find', 'fine', 'finish', 'finished', 'first', 'fml', 'following', 'forced', 'forever', 'forget', 'forgot', 'four', 'free', 'freemsg', 'friends', 'frnds', 'frying', 'fulfil', 'fun', 'fyi', 'gauti', 'gentleman', 'get', 'gets', 'getting', 'girls', 'give', 'go', 'goals', 'goes', 'going', 'gon', 'good', 'got', 'gota', 'gram', 'granted', 'great', 'gt', 'guaranteed', 'ha', 'hair', 'hairdressers', 'half', 'happy', 'havent', 'hear', 'hee', 'hello', 'help', 'hep', 'hey', 'hi', 
'hl', 'hockey', 'hols', 'home', 'hope', 'hopefully', 'hor', 'hospital', 'hospitals', 'hours', 'housework', 'http', 'hungry', 'hurts', 'ice', 'il', 'im', 'immunisation', 'inches', 'includes', 'incorrect', 'info', 'invite', 'ip4', 'iq', 'jacket', 'jackpot', 'jersey', 'job', 'joke', 'joking', 'jurong', 'kano', 'ken', 'kept', 'killing', 'kl341', 'know', 'knows', 'la', 'lar', 'late', 'later', 'latest', 'lccltd', 'learn', 'left', 'lesson', 'let', 'letter', 'lido', 'like', 'liked', 'link', 'live', 'lives', 'll', 'loads', 'loans', 'lol', 'look', 'looking', 'lor', 'love', 'loves', 'ls1', 'lt', 'lucky', 'lunch', 'macedonia', 'machan', 'make', 'mallika', 'man', 'mark', 'matrix3', 'may', 'maybe', 'means', 'meet', 'meeting', 'melle', 'membership', 'message', 'messages', 'minnaminunginte', 'miss', 'missed', 'mix', 'mk17', 'mmmmmm', 'mob', 'mobile', 'mobiles', 'mom', 'money', 'month', 'months', 'morefrmmob', 'morning', 'move', 'movie', 'msg', 'much', 'multis', 'na', 'nah', 'name', 'national', 'naughty', 'need', 'net', 'network', 'never', 'new', 'news', 'next', 'nice', 'nigeria', 'night', 'nitros', 'nokia', 'nurungu', 'odi', 'offered', 'oh', 'ok', 'one', 'oni', 'oops', 'operate', 'oru', 'pa', 'packing', 'pain', 'part', 'pass', 'password', 'patent', 'pay', 'peoples', 'per', 'performed', 'pick', 'pizza', 'place', 'plane', 'planning', 'play', 'please', 'pleased', 'pleasure', 'pls', 'po', 'pobox', 'poboxox36504w45wq', 'point', 'pounds', 'pours', 'press', 'prize', 'prob', 'promise', 'qjkgighjjgcbl', 'question', 'quick', 'rain', 'rate', 'rcv', 're', 'reached', 'real', 'realized', 'really', 'receive', 'receivea', 'recent', 'red', 'remember', 'reply', 'replying', 'representative', 'request', 'requests', 'respect', 'review', 'reward', 'ride', 'right', 'ringtone', 'rodger', 'room', 'roommate', 'roommates', 'rply', 'run', 'runs', 'said', 'sarcastic', 'saturday', 'save', 'saw', 'say', 'scotland', 'searching', 'second', 'see', 'seeing', 'seekers', 'seemed', 'sehwag', 'selected', 'send', 'series', 'seriously', 'service', 'set', 'sheets', 'sherawat', 'short', 'show', 'shower', 'shracomorsglsuplt', 'shy', 'sick', 'simply', 'since', 'situation', 'six', 'slice', 'smarter', 'smile', 'smiling', 'sms', 'smth', 'someone', 'something', 'song', 'soon', 'sooner', 'sorry', 'speak', 'special', 'spell', 'spent', 'spoilt', 'sptv', 'stand', 'started', 'starwars3', 'std', 'steed', 'still', 'stock', 'stop', 'str', 'stubborn', 'stuff', 'stuff42moro', 'subscription', 'sucker', 'suckers', 'sucks', 'sum1', 'sunday', 'suprman', 'sure', 'sweet', 'ta', 'take', 'talk', 'tb', 'tea', 'team', 'tell', 'telling', 'telugu', 'text', 'texting', 'thank', 'thanks', 'that', 'thats', 'think', 'thinked', 'tho', 'though', 'thought', 'tickets', 'till', 'time', 'times', 'timings', 'tkts', 'today', 'tomo', 'tomorrow', 'tonight', 'took', 'trav', 'treat', 'tried', 'trip', 'trouble', 'try', 'trying', 'tsandcs', 'turn', 'txt', 'tyler', 'uk', 'update', 'ur', 'urgent', 'urgnt', 'url', 'us', 'use', 'used', 'usf', 'usher', 'usually', 'vaguely', 'valid', 'valuable', 'valued', 've', 'verify', 'vettam', 'wah', 'wait', 'waiting', 'wales', 'wan', 'want', 'wanted', 'wap', 'wat', 'watching', 'watts', 'way', 'weak', 'week', 'weekend', 'well', 'wen', 'wet', 'whole', 'wif', 'win', 'wings', 'winner', 'wk', 'wkly', 'wo', 'wonderful', 'wont', 'word', 'words', 'work', 'world', 'worried', 'wow', 'wun', 'www', 'xuhui', 'xx', 'xxx', 'xxxmobilemovieclub', 'yeah', 'year', 'yes', 'yesterday', 'yo', 'yummy', 'yup', 'ú1']\n" 1515 | ], 1516 | "name": "stdout" 1517 | } 1518 | ] 1519 | }, 1520 
| { 1521 | "cell_type": "markdown", 1522 | "metadata": { 1523 | "id": "tK85YunWXztd" 1524 | }, 1525 | "source": [ 1526 | "* A lot of near-duplicate tokens, such as 'win' and 'winner'; 'reply' and 'replying'; 'want' and 'wanted', etc. Stemming or lemmatisation would collapse many of these (see the sketch below)." 1527 | ] 1528 | }, 1529 | { 1530 | "cell_type": "code", 1531 | "metadata": { 1532 | "colab": { 1533 | "base_uri": "https://localhost:8080/" 1534 | }, 1535 | "id": "WVjDGT6zauCV", 1536 | "outputId": "0a145f7a-c925-4288-a0ce-864526fa663a" 1537 | }, 1538 | "source": [ 1539 | "bow_model.shape" 1540 | ], 1541 | "execution_count": 19, 1542 | "outputs": [ 1543 | { 1544 | "output_type": "execute_result", 1545 | "data": { 1546 | "text/plain": [ 1547 | "(100, 640)" 1548 | ] 1549 | }, 1550 | "metadata": { 1551 | "tags": [] 1552 | }, 1553 | "execution_count": 19 1554 | } 1555 | ] 1556 | }, 1557 | { 1558 | "cell_type": "code", 1559 | "metadata": { 1560 | "id": "bq41cvribFf0" 1561 | }, 1562 | "source": [ 1563 | "" 1564 | ], 1565 | "execution_count": 19, 1566 | "outputs": [] 1567 | }, 1568 | { 1569 | "cell_type": "code", 1570 | "metadata": { 1571 | "id": "YTpYEHfMcapY" 1572 | }, 1573 | "source": [ 1574 | "" 1575 | ], 1576 | "execution_count": 18, 1577 | "outputs": [] 1578 | } 1579 | ] 1580 | } --------------------------------------------------------------------------------
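The near-duplicate tokens flagged at the end of bag+of+words.ipynb are exactly what stemming is meant to reduce. A minimal sketch with NLTK's PorterStemmer, illustrative only:

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
for token in ['win', 'winner', 'reply', 'replying', 'want', 'wanted']:
    print(token, '->', stemmer.stem(token))

# 'reply'/'replying' both reduce to 'repli' and 'want'/'wanted' to 'want',
# while 'win'/'winner' stay distinct: rule-based stemmers merge many,
# but not all, related forms.

Applying such a stemmer inside the preprocess() function before CountVectorizer would shrink the 640-column vocabulary accordingly.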