├── .gitattributes ├── .hgignore ├── HW1 - Implement a Trie.ipynb ├── HW1 - Regular Expression Parsing.ipynb ├── HW1 - Reproduce Power Laws.ipynb ├── HW2 - Classifier-Based Tagging.ipynb ├── HW2 - Improve Tagging with Wordnet.ipynb ├── HW3 - HMMs and FSTs.ipynb ├── HW3 - regex generation.ipynb ├── README.md ├── brown.zip ├── college-degrees-perc.tsv ├── crawling-with-scrapy.ipynb ├── fstutils.py ├── household-ppp.tsv ├── internet-archive-query.ipynb ├── letter.png ├── nlpa-basic-nltk.ipynb ├── nlpa-classification-intro.ipynb ├── nlpa-classification-tagging.ipynb ├── nlpa-classifier-dialog-acts.ipynb ├── nlpa-classifier-errors.ipynb ├── nlpa-classifier-sentence-segmentation-Copy0.ipynb ├── nlpa-classifier-sentence-segmentation.ipynb ├── nlpa-corpora.ipynb ├── nlpa-course.ipynb ├── nlpa-downloading-tomsawyer.ipynb ├── nlpa-edit-distance.ipynb ├── nlpa-find-xargs.ipynb ├── nlpa-hmm-ocr.ipynb ├── nlpa-intro-demo-videos.ipynb ├── nlpa-intro.ipynb ├── nlpa-lexical-resources.ipynb ├── nlpa-markov-models.ipynb ├── nlpa-memm.ipynb ├── nlpa-nltk-automated-tagging-old.ipynb ├── nlpa-nltk-automated-tagging.ipynb ├── nlpa-nltk-basic.ipynb ├── nlpa-nltk-corpora.ipynb ├── nlpa-nltk-lexical-resources.ipynb ├── nlpa-nltk-reading-german.ipynb ├── nlpa-nltk-wordnet.ipynb ├── nlpa-openfst-edit-distance.ipynb ├── nlpa-openfst-edit-distance.py ├── nlpa-openfst.ipynb ├── nlpa-openfst2.ipynb ├── nlpa-pos-tagging.ipynb ├── nlpa-re-fsa.ipynb ├── nlpa-re-intro.ipynb ├── nlpa-regular-expressions.ipynb ├── nlpa-simple-ir.ipynb ├── nlpa-unicode.ipynb ├── nlpa-unix-cleanup.ipynb ├── nlpa-unix-join.ipynb ├── nlpa-vectorspace.ipynb ├── nlpa-word-histograms.ipynb ├── nlpa-wordnet.ipynb ├── nltk-available-taggers.ipynb ├── nltk-ngram-taggers.ipynb ├── nltk-summary-stemming-lemmatizing.ipynb ├── nltk-tagging-from-scratch.ipynb ├── openfst-weights-and-forwardbackward.ipynb ├── tagutils.py ├── tomsawyer.html ├── tomsawyer.txt └── zipf-law-example.ipynb /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb linguist-language=Python 2 | *.tex linguist-detectable=false 3 | *.html linguist-detectable=false 4 | -------------------------------------------------------------------------------- /.hgignore: -------------------------------------------------------------------------------- 1 | syntax: glob 2 | ? 
3 | *.bak 4 | brown 5 | .* 6 | *~ 7 | *.o 8 | *.so 9 | *.a 10 | *.err 11 | *.log 12 | *.os 13 | *.pyc 14 | *.png 15 | *.jpg 16 | [0-9] 17 | _* 18 | book 19 | book-* 20 | unlv 21 | unlv-* 22 | Volume-* 23 | Volume_* 24 | *[0-9][0-9][0-9][0-9]* 25 | TAGS 26 | build 27 | *.db 28 | OLD 29 | *.tgz 30 | *.zip 31 | html 32 | apidocs 33 | JUNK 34 | OLD 35 | models*/* 36 | *.orig 37 | temp/ 38 | temp.* 39 | *.temp 40 | -------------------------------------------------------------------------------- /HW1 - Implement a Trie.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "49477604", 6 | "metadata": {}, 7 | "source": [ 8 | "# Trie Data Structure" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "d6fbd5f6", 14 | "metadata": {}, 15 | "source": [ 16 | "Define a class `Trie` that can be used for fast lookups of strings.\n", 17 | "Such classes are used in many natural language processing applications.\n", 18 | "\n", 19 | "(Writing such a class is also a common interview question; you need\n", 20 | "to be able to do it in real time.)\n", 21 | "\n", 22 | "Have your class update a global variable `nops` for each node traversal during\n", 23 | "`add`, `lookup`, and `remove` operations." 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 11, 29 | "id": "7da39832", 30 | "metadata": { 31 | "collapsed": false 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "class Trie:\n", 36 | " def __init__(self):\n", 37 | " pass\n", 38 | " def add(self,s,value):\n", 39 | " \"\"\"Add the string `s` to the `Trie` and\n", 40 | " map it to the given value.\"\"\"\n", 41 | " global nops\n", 42 | " nops += 1 # this is just a placeholder\n", 43 | " pass\n", 44 | " def lookup(self,s,default=None):\n", 45 | " \"\"\"Look up the value corresponding to the\n", 46 | " string `s`.\"\"\"\n", 47 | " def remove(self,s):\n", 48 | " \"\"\"Remove the string s from the Trie.\n", 49 | " Returns True if the string was a member.\"\"\"\n", 50 | " pass\n", 51 | " def prefix(self,s):\n", 52 | " \"\"\"Check whether the string `s` is a prefix\n", 53 | " of some member.\"\"\"\n", 54 | " pass\n", 55 | " def items(self):\n", 56 | " \"\"\"Return an iterator over the items of the `Trie`.\"\"\"" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "id": "e241371b", 62 | "metadata": {}, 63 | "source": [ 64 | "# Unit Tests" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "id": "c10be409", 70 | "metadata": {}, 71 | "source": [ 72 | "Write some unit tests demonstrating that your class works as intended.\n", 73 | "The next cell gives some examples, but you need to write additional tests\n", 74 | "for other methods and common sequences of operations.
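 Useful cases include: keys where one is a prefix of another (\"he\" and \"hello\"), lookups after a `remove`, and `prefix` queries for strings that are prefixes of members but not members themselves."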
75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 21, 80 | "id": "9965f973", 81 | "metadata": { 82 | "collapsed": false 83 | }, 84 | "outputs": [ 85 | { 86 | "ename": "AssertionError", 87 | "evalue": "", 88 | "output_type": "error", 89 | "traceback": [ 90 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", 91 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mtrie\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlookup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"street\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mtrie\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlookup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"house\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mtrie\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlookup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"hello\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mtrie\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlookup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"world\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 92 | "\u001b[0;31mAssertionError\u001b[0m: " 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "trie = Trie()\n", 98 | "trie.add(\"hello\",1)\n", 99 | "trie.add(\"world\",2)\n", 100 | "assert not trie.lookup(\"street\")\n", 101 | "assert not trie.lookup(\"house\")\n", 102 | "assert trie.lookup(\"hello\")\n", 103 | "assert trie.lookup(\"world\")" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 23, 109 | "id": "4b23d401", 110 | "metadata": { 111 | "collapsed": false 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "trie = Trie()\n", 116 | "nops = 0\n", 117 | "trie.add(\"hello\",1)\n", 118 | "assert nops>0" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "id": "71de5b4e", 124 | "metadata": {}, 125 | "source": [ 126 | "# Performance Measurements" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "id": "9fbef093", 132 | "metadata": {}, 133 | "source": [ 134 | "Next, let's measure how `Trie` performance scales on real data." 
135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 24, 140 | "id": "7d114a5d", 141 | "metadata": { 142 | "collapsed": false 143 | }, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "74354 ['the', 'adventures', 'of', 'tom', 'sawyer', 'mark', 'twain', 'harper', 'and', 'brothers']\n" 150 | ] 151 | } 152 | ], 153 | "source": [ 154 | "import re\n", 155 | "words = re.findall(r'\w+',open(\"tomsawyer.txt\").read())\n", 156 | "words = [w.lower() for w in words]\n", 157 | "print len(words),words[:10]" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 17, 163 | "id": "5be56794", 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "outputs": [], 168 | "source": [ 169 | "counts = []\n", 170 | "for n in range(1000,70000,1000):\n", 171 | " trie = Trie()\n", 172 | " nops = 0\n", 173 | " for i,w in enumerate(words[:n]):\n", 174 | " trie.add(w,i)\n", 175 | " counts.append((n,nops))" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 18, 181 | "id": "dd835354", 182 | "metadata": { 183 | "collapsed": false 184 | }, 185 | "outputs": [ 186 | { 187 | "data": { 188 | "text/plain": [ 189 | "[]" 190 | ] 191 | }, 192 | "execution_count": 18, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | }, 196 | { 197 | "data": { 198 | "image/png": "<base64-encoded PNG elided: line plot of cumulative trie node operations versus number of words inserted>"
199 | }, 200 | "metadata": {}, 201 | "output_type": "display_data" 202 | } 203 | ], 204 | "source": [ 205 | "counts = array(counts)\n", 206 | "plot(counts[:,0],counts[:,1])" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "id": "2f2b2953", 213 | "metadata": { 214 | "collapsed": false 215 | }, 216 | "outputs": [], 217 | "source": [] 218 | } 219 | ], 220 | "metadata": {}, 221 | "nbformat": 4, 222 | "nbformat_minor": 5 223 | } 224 | -------------------------------------------------------------------------------- /HW1 - Regular Expression Parsing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4f9f453d", 6 | "metadata": {}, 7 | "source": [ 8 | "# HW1 - Regular Expression Parsing" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "29688f82", 14 | "metadata": {}, 15 | "source": [ 16 | "Here is a set of strings with balanced parentheses." 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "id": "43e994bf", 23 | "metadata": { 24 | "collapsed": false 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "yes1 = \"x (a b c) y\"\n", 29 | "yes2 = \"((((x))))\"\n", 30 | "yes3 = \"a (b (c d) ((e)) f) (g)\"" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "d0539d90", 36 | "metadata": {}, 37 | "source": [ 38 | "Here is a set of strings with unbalanced parentheses." 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "id": "4680f0fa", 45 | "metadata": { 46 | "collapsed": false 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "no1 = \"(a b c\"\n", 51 | "no2 = \"((())))\"\n", 52 | "no3 = \"a (b (c d) (e)) f) (g)\"" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "id": "1fa04fb8", 58 | "metadata": {}, 59 | "source": [ 60 | "Write a function `check_balanced` that uses regular expressions\n", 61 | "to check whether the parentheses are balanced.\n", 62 | "Note that you can't do this with a single regular expression;\n", 63 | "you need to write a little loop around it.\n", 64 | "Your code structure might differ a little from the\n", 65 | "function below, but it shouldn't be much longer." 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": "1f7ce8e6", 72 | "metadata": { 73 | "collapsed": false 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "import re\n", 78 | "def check_balanced(s):\n", 79 | " ...\n", 80 | " while ...:\n", 81 | " ...\n", 82 | " ... = re.subn(...)\n", 83 | " ...\n", 84 | " ..." 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "id": "4048dab7", 90 | "metadata": {}, 91 | "source": [ 92 | "Now show that it works.
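 One workable scheme (a sketch of the intended loop, not the only solution): repeatedly delete innermost groups with `re.subn(r'\([^()]*\)','',s)` until the substitution count is zero, then report balanced only if no parentheses remain."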
93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "6cf53b8c", 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "print check_balanced(yes1)\n", 105 | "print check_balanced(yes2)\n", 106 | "print check_balanced(yes3)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "id": "ce592615", 113 | "metadata": { 114 | "collapsed": false 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "print check_balanced(no1)\n", 119 | "print check_balanced(no2)\n", 120 | "print check_balanced(no3)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "id": "d8a02e88", 127 | "metadata": { 128 | "collapsed": false 129 | }, 130 | "outputs": [], 131 | "source": [] 132 | } 133 | ], 134 | "metadata": {}, 135 | "nbformat": 4, 136 | "nbformat_minor": 5 137 | } 138 | -------------------------------------------------------------------------------- /HW1 - Reproduce Power Laws.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "2f1bdeea", 6 | "metadata": {}, 7 | "source": [ 8 | "# HW1 - Reproduce Power Laws" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "5429ce97", 14 | "metadata": {}, 15 | "source": [ 16 | "In class, we talked about how various papers claim that Zipf's law is a general\n", 17 | "property of many discrete distributions.\n", 18 | "[Wikipedia](http://en.wikipedia.org/wiki/Zipf's_law)\n", 19 | "[Mathworld](http://mathworld.wolfram.com/ZipfsLaw.html)\n", 20 | "\n", 21 | "The original paper by Belevitch examined this question theoretically.\n", 22 | "However, if this is true, it should be easy to reproduce experimentally,\n", 23 | "namely by picking various discrete distributions, computing word frequency\n", 24 | "by rank, and plotting the result.\n", 25 | "\n", 26 | "For working with the worksheet, remember that you can insert additional\n", 27 | "cells, both to add text and explanations, and to add additional code cells.\n", 28 | "You can change existing cells if you like; they are just there to help you\n", 29 | "get started." 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 21, 35 | "id": "baa1d02e", 36 | "metadata": { 37 | "collapsed": false 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "from pylab import *\n", 42 | "from collections import Counter" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "id": "8654f3c8", 48 | "metadata": {}, 49 | "source": [ 50 | "Start by generating a random sample.\n", 51 | "Here is an example of a function that generates a uniform sample.\n", 52 | "Obviously, this particular choice of distribution will\n", 53 | "not reproduce Zipf's law, so\n", 54 | "you need to modify this to try to come up with distributions\n", 55 | "that will reproduce Zipf's law." 
56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 26, 61 | "id": "3221154a", 62 | "metadata": { 63 | "collapsed": false 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "def generate_sample(nsamples,vocabulary_size):\n", 68 | " return array(rand(nsamples)*vocabulary_size,'i')" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 28, 74 | "id": "8b5d2fea", 75 | "metadata": { 76 | "collapsed": false 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "data = generate_sample(100000,1000)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "id": "fcf041dc", 86 | "metadata": {}, 87 | "source": [ 88 | "Compute a histogram." 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 14, 94 | "id": "7668e2be", 95 | "metadata": { 96 | "collapsed": false 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "histogram = Counter(data)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "id": "6bc1d8f0", 106 | "metadata": {}, 107 | "source": [ 108 | "Compute frequency by rank." 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 16, 114 | "id": "9702b0a7", 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "frequencies = ..." 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "id": "504c9406", 126 | "metadata": {}, 127 | "source": [ 128 | "Plot the result." 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 19, 134 | "id": "fb4c1706", 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/plain": [ 142 | "[]" 143 | ] 144 | }, 145 | "execution_count": 19, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | }, 149 | { 150 | "data": { 151 | "image/png": "<base64-encoded PNG elided: log-log plot of frequency versus rank for the uniform sample>"
"iVBORw0KGgoAAAANSUhEUgAAAW4AAAD9CAYAAACcJ53WAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAGxxJREFUeJzt3XuczfW+x/HXaCiZPbMpZmSmzZnYZsZlBplNYck90Qjl\nfos8TnU6PKqdOvYj225HW3YbOWfvnLKTQkQhRhzNuIWcQQ4yhM1oRodyGzK33/njW6TjMmvNWuu7\nfjPv5+Mxjxhrrd/nUXr7+l4+3zDHcRxERMQ1KtkuQEREvKPgFhFxGQW3iIjLKLhFRFxGwS0i4jIK\nbhERl7lucI8YMYLo6GgaN2586XvffvstnTp1okGDBnTu3JlTp04FvEgREbnsusE9fPhw0tPTr/je\n5MmT6dSpE9nZ2XTo0IHJkycHtEAREblS2I0O4Bw+fJgePXqwa9cuABo2bEhmZibR0dHk5eXh8Xj4\n8ssvg1KsiIhAuLdvOH78ONHR0QBER0dz/Pjx//easLCwslcmIlIBleYwe5kWJ8PCwq4Z0o7juPbr\nxRdftF6D6rdfR0WrXfXb/yotr4P7xykSgNzcXGrVquXtR4iISBl4Hdw9e/bk7bffBuDtt98mLS3N\n70WJiMi1XTe4+/fvT+vWrdm3bx9xcXHMnj2bcePGsXr1aho0aMDatWsZN27cVd978WJA6g0Kj8dj\nu4QyUf32uLl2UP1uccNdJT59aFgYDzzgsGQJhHu9/CkiUjGFhYWVaq47YCcnCwvh8cfB/38siIhU\nbAEL7kWLICsLJk4M1BNERCqmgE1kRETAxx9D69Zwxx0walSgniQiUrEEdAY6OhrS06FtW4iJgR49\nAvk0EZGKIeDdAevXh48+gkcfhc2bA/00EZHyLyhtXVu2hL//HdLSYN++YDxRRKT8Clo/7vvvh0mT\noFs3yM0N1lNFRMqfoO6yHj4cjh0zIZ6ZCZGRwXy6iEj5ELADONf6WMcx+7v374cVK6BKFX8/XUTE\nnUp7ACfowQ1QXAx9+kC1ajBnDlTSBWoiIvZPTl7PTTfBe+/BoUNwjVYnIiJyDdbGulWrwrJl5mva\nNFtViIi4j9UWUDVqmAM6994LtWvDww/brEZExB2s9+771a/M0fiOHaFmTWjf3nZFIiKhLSSWBZs0\ngQUL4JFH4Ic7iUVE5BpCIrjBjLSnTzd7vI8csV2NiEjosj5V8lP9+plTlV27woYNZg5cRESuZGUf\n940884xpSLV6tdl9IiJSEYT0AZwbKSmBwYPhwgVYuNDs+xYRKe9C+gDOjVSqBLNnw5kz8NRTuv5M\nROSnQjK4wfQwWbwYNm40XQVFRMQIqcXJn4uMhJUrL19/NmyY7YpEROwL6eAGc6IyPR3atTNXoXXr\nZrsiERG7Qnaq5Kd+/WtYsgSGDoXPP7ddjYiIXa4IboBWreA//xMefBAOHLBdjYiIPSE/VfJTPXtC\nXp45oLNpE9SqZbsiEZHgc82I+0ePPQYDB0L37nDunO1qRESCLyQP4NyI45gAz8mBpUuhcuWAPUpE\nJGhcfXKyNIqKIC0Nbr/dHNYJCwvo40REAs7VJydLIzzctILduxfGj7ddjYhI8Lg2uMFcNrx8ueln\n8u//brsaEZHgcNWukqupWRNWrTLXn8XEwEMP2a5IRCSwXB/cAPXqmZF3ly5mi+C999quSEQkcFw9\nVfJTKSnw7rvQuzfs2WO7GhGRwCk3wQ3QqRNMnWr6meTk2K5GRCQwfA7uSZMmkZSUROPGjRkwYAAX\nL170Z10+GzQInnjChPepU7arERHxP5+C+/Dhw8yaNYusrCx27dpFcXEx8+fP93dtPnv2WbjvPmjT\nBj74AIqLbVckIuI/PgV3ZGQklStX5vz58xQVFXH+/Hnq1Knj79p8FhYGr70GEyfClCmQkACzZkGI\n/KVARKRMfNpVUqNGDZ5++mnuvPNOqlatSpcuXejYseMVr5kwYcKlH3s8HjweT1nq9FqlStCrlzld\nuW4dvPIKTJgA//qvMHo0REUFtRwRkf8nIyODjIwMr9/n05H3r776ih49erB+/XqioqLo27cvffr0\nYeDAgeZDg3Dk3Rc7d8Kf/mQuZhg1yoR47dq2qxIRMQJ65H3btm20bt2a2267jfDwcB566CE2bdrk\ny0cFVdOmZsvgtm2Qnw+JiaZZ1f79tisTESk9n4K7YcOGbN68mQsXLuA4DmvWrCExMdHftQVMvXow\nYwZkZ5vTlq1bQ9++JtBFREKdT8HdtGlThgwZQosWLWjSpAkAjz32mF8LC4aaNc0C5qFDcM895rh8\nhw6werVpHSsiEopc29Y1EAoLYd48s5B5883w3HPmJGZ4uWgMICKhrtz34w6kkhL4+GMT4Lm58Mwz\nMGwYVK1quzIRKc8U3H6yYYMJ8M8/h379zFdqqi5uEBH/U3D7WXa2mUaZPx8uXIBHHjFfKSkKcRHx\nDwV3gDgOfPGFuX1n/nwz/92vnwnxpCTb1YmImym4g8BxzBbC+fNNkFevfnkkXr++7epExG0U3EFW\nUgKbNpkQX7gQ4uIuh/idd9quTkTcQMFtUVERZGaaEF+82LSYfeEFc1JTRORayv0t76EsPNwc5Jk1\nCw4eNIHdvj306QPbt9uuTkTcTsEdYFFRZrR98KA5nfnAA+Zr82bblYmIWym4g6RaNRg7Fr76Crp3\nN3PfHTtCRoaO14uIdzTHbUlhIcydC5MmmZvpx483t9RrT7hIxaXFSZcoLja7UP74R7Oo+fDD5kt7\nwkUqHgW3yzgObNliQvz99yEy0gR4377ajSJSUSi4XaykxIT4++/DokVmgfPHEE9IsF2diASKgruc\nKCkxO1B+HIknJsIf/gC/+Y3tykTE3xTc5VBhIfz97/DSS9CokbkEonlz21WJiL/oAE45VLmyueQ4\nOxvuvx969jQ32X/xhe3KRCSYFNwudPPN8MQTcOAAtG0LnTubfeE7d9quTESCQVMl5UB+Prz+uvmq\nXh0GDID+/eFXv7JdmYh4Q3PcFVBJibmx5733zG6UhAQT4n37wu23265ORG5EwV3BFRTAJ5+YEF++\nHP7pn6BVK2jd2vwzPl6nNEVCjYJbLikoMF0JP/vM9Az/7DO4eBHS0sx9mtWr265QREDBLTdw9ChM\nmWL6hf/1r6ZjoYjYpeCWUsnMhBEj4N574S9/0ehbxCbt45ZSadfO7AOPjDSHet55B06ftl2ViFyP\nRtxySWammfPesAGaNTOHfO6/3xyzr6Q/4kUCTlMl4rPz5+HTT2HFCli5Ek6cMKPxJk2gcWNITYWU\nFLjpJtuVipQvCm7xm2+/hV27zNcXX8DGjZCba6ZZ7rsPhg2DX/zCdpUi7qfgloDKzTWj8iVLTKAv\nWmRG5SLiOwW3BM2cOfD00zB1KgwZYrsaEfdScEtQ/c//QO/e0LAhNG0Kd9wBsbHQvr25KFlEbkzB\nLUF39qy57CEnB77+2nQvzMoyrWeHDoV77oHwcNtVioQu
BbeEhNxcePddc6P9wYPm5p527Uxb2l/+\n0nZ1IqFFwS0h5+RJ0ytlyRLTAGvmTHjwQdtViYQOBbeEtMxMGDnS9AyPj4caNaBlS+jeHapUsV2d\niB0Kbgl5Fy7Axx+bAz4nTphR+N69pof4mDFQr57tCkWCK+DBferUKUaOHMnu3bsJCwvjrbfe4jc/\nXD2u4BZfHTwIb7wBs2ZBly7mcI/Ho1G4VAwBD+6hQ4fSrl07RowYQVFREfn5+URFRXn1cJFrOX0a\n3nzTHOzZuxeGD4fx482Uikh5FdDgPn36NCkpKRw8eLBMDxcpjWPH4KWX4IMP4PnnzY4UjcClPApo\ncO/YsYPRo0eTmJjIzp07ad68OdOmTePWW2+99PAXX3zx0us9Hg8ej8fbx4hcYfduePZZyM42UyhR\nUeZwT0SE2SNep47tCkW8k5GRQUZGxqWf//73vw9ccG/bto1WrVqxadMm7r77bsaMGUNkZCQTJ040\nH6oRtwTQmjWwerW53T4/H86cMX1TBgww3QujoqBPHx32EfcJ6Ig7Ly+PVq1acejQIQA2bNjA5MmT\nWb58uVcPF/GX48fNvvDjx83IvLAQZs82vcRF3KK02enTmCQmJoa4uDiys7Np0KABa9asISkpyZeP\nEvGL6Gj44S98lJSYezTbtYOuXaFHD7NXPDIS7rwTbr7Zbq0iZeXzrpKdO3cycuRICgoKiI+PZ/bs\n2dpVIiHl9GmztXDzZvjqKzOlcvw4JCebOfGRI6FBA9tVilymAzgiV3H2LGzdCmvXwt/+BoMHw6RJ\ncMsttisTUXCL3NDJkzB6tNknPnq0aUcbHQ21a5sFTpFgU3CLlILjQHo6LFwI+/fDN9+YlrSVK8O9\n95rLkxMSbFcpFYWCW8RHjmMCfMEC+OMfzQURgwZB69a2K5PyrrTZWSkItYi4SliYmTJ56inTBKtu\nXUhLg/XrbVcmYmjELVIKS5eaIG/YEN57Tz1TJDA04hbxo549zZbCxo3NfZrx8WaP+F/+YvaNiwST\nRtwiXrp4Ef7xD9i3D159FQ4dMic0u3c3YV6/vu0Kxa20OCkSBMXFZjdKVhZkZMDixTBlimlDK+It\nBbeIBfv2QbduZmdKYqK5HPnJJ6F6dduViRsouEUsKSoy0yd795pFzQ8/NHPkLVrAr38Nd99t+qaI\n/JyCWyREHDxo7tP87/82I/Ldu82FEGPGqPWsXEnBLRKiDhyAf/5nc+T+jTfMSFwEFNwiIc1xYO5c\nc6NP27bQqhW0aQOxsebwT1iY7QrFBgW3iAucOAEffWRaz27ZArm5plPhQw+ZY/ZJSfDDjYBSASi4\nRVxq1y6YMwdWrYLvvoO33oKOHTUKrwgU3CLlwNKl8NxzZhTu8UCHDma74U032a5MAkHBLVJOFBfD\nunXmQuRVqyAnxzS9+u1vIS4OKqlxRbmh4BYpp3buNHdqLlxo9oXPmWN6p4j7KbhFyrmSEnPRw5//\nDC1bmt0pjzxiRuGaSnEnBbdIBXHihOmTsno1fPABnDtnTmp262YaXt1zjxY23ULBLVJB5eTAihWQ\nmQnbt5uFzZdfNkftb7vNdnVyPQpuEcFx4E9/Mjf5bNt2eVEzOdl2ZXI1Cm4RucKRI/D++zB1qula\nmJgIw4apf3goUXCLyFXl5cHy5WYa5Y03zHVsyclQpw5MnAhVqtiusOJScIvIDeXnw44d8OWXsGSJ\nudnn6adh6FAtaNqg4BYRrzgOLFsGf/iDaUXbujU8+CA88ADExNiurmJQcIuIT0pKzMXI69ebDobb\ntpmr2O6/Hzp31kg8kBTcIuIXmzaZnilLlsCFC+a0Zo8e8PDDGon7m4JbRPzqx9vtMzNh+nRzIUSj\nRqZz4bBh0KCBRuNlpeAWkYA6d85cybZwodknHhMDo0bBY49BVJTt6txJwS0iQVNQYML7z3+GDRtM\n35S0NOjaFRISbFfnHqXNTjWEFJEyq1IFevUyC5p5eTB4MPzXf5lDPr/9rZkbF//RiFtEAmbnThPi\nu3ebezWbNoVmzcw2w9tvt11d6NFUiYiEjH37YP9+2LjRjMQ//9wE+MMPm1t9UlLUihYU3CISwnJz\nIT0d3nsPsrPNlEr37mZU3qxZxd2douAWEVc4cQJmzYKVK82USqVKMGaM2aFSq5bt6oIr4MFdXFxM\nixYtiI2NZdmyZT49XETkpwoKYPFieO012LoVkpIgNRWefdbc7FOtmu0KAyvgu0qmTZtGYmIiYRX1\n7zQi4ndVqkC/frBliznsM20afPONuZotIsJczbZ5s+0q7fMpuHNyclixYgUjR47UyFpEAuLOO83C\n5bJlcOaM6WIYEWHmwWvXhpdeguPHbVdpR7gvbxo7dixTpkzhzJkz13zNhAkTLv3Y4/Hg8Xh8eZSI\nCGC2Er75JsycCfPnm2P3v/sdtGljrmT73e/MwqabZGRkkJGR4fX7vJ7jXr58OStXrmTmzJlkZGQw\ndepUzXGLiBX798Phw+ZCiEWLYNAgMz/u1j3iAVucfOGFF3jnnXcIDw/n+++/58yZM/Tu3Zs5c+Z4\n/XAREX/59FP4l38xO1M6dzbH7ocPN9MqblmKC8p2wMzMTF599VWNuEUkZHzxBaxdC3PmmOvZHnjA\n9FC5667QD/Cg9SrRrhIRCSVNmph94FlZZo/4mTNmW2HLlma/+LlztissOx3AEZFy73//F95+29xw\nX1hoplKaN4fevSEyEmrUsF2hoZOTIiI/U1JithceO2YWMQsK4MgReOIJGDfOnNS0ecu9gltEpBTW\nrIFXX4VVq8zP/+3fzE331asHvxYFt4iIlxYvhueeM7fc//KX5jafJ5+EOnWC83wFt4iIDy5eNHPi\nn34KL74I330HHo+5yWfcOLjllsBNpyi4RUTK6Px5WL0aiorMTpXvvoM77oCxYyE8HIYMgZtv9t/z\nFNwiIn5WVATPP2+2FKanmxOaDz4IzzxjRuJlpeAWEQmgf/wDtm0z8+COY3qo3Hor9Ozp+0EfBbeI\nSJA8+yzk5JjLILp1g5EjTWdDbym4RUSCbN06mDfPXMkWEwPvvmu6GlauXLr3B+3Iu4iIGG3bwuuv\nm+P27dpB+/ZmDnzPHv8+R8EtIuJHN90E8fGm1eyXX5rTmW3bmqZX/pqIUHCLiARInTpmO+Hjj8NT\nT5mr2PxBc9wiIkHw4YfQvz8MGwYzZph94D+nOW4RkRCSlmZu6VmwAKZMKdtnacQtIhJE//EfZupk\n+3ZITr7y17QdUEQkBBUVQadOZovgJ59c+WsKbhGRELVrl7mp55tvoGbNy9/XHLeISIhq1AgaNDBz\n3r5QcIuIBFlYmLnE+LPPfHu/gltExILWrWHHDt/eq+AWEbEgPh4q+ZjACm4REQtuuQUuXPDtvQpu\nERELqlZVcIuIuIqCW0TEZRTcIiIuEx4OxcW+vVfBLSLiMgpuERGXUXCLiLiMgltExBJfe/EpuEVE\nLAgL8/29Cm4
REZdRcIuIuIyCW0TEZRTcIiKWBHVx8ujRo7Rv356kpCQaNWrE9OnTfXu6iEgFVZbF\nSZ/unMzLyyMvL4/k5GTOnTtH8+bN+fDDD0lISPihIN05KSJyPRcvQmSk+eePAnrnZExMDMk/3Csf\nERFBQkICX3/9tS8fJSIiXgov6wccPnyY7du3k5qaesX3J0yYcOnHHo8Hj8dT1keJiJQrJSUZTJiQ\n4fX7fJoq+dG5c+fweDyMHz+etLS0yx+qqRIRkeu6eBF+8QsoKLj8vYBOlQAUFhbSu3dvBg0adEVo\ni4jIjQV9cdJxHIYOHcptt93Ga6+9dpWCNOIWEbmeggKIiPBtxO1TcG/YsIG2bdvSpEkTwn74Y2PS\npEl07drVq4eLiFRUQQ/uG36ogltE5LrKEtw6OSkiYonauoqIuIjauoqIVCAKbhERl1Fwi4i4jIJb\nRMQSLU6KiLiIFidFRCoQBbeIiMsouEVELNEct4iIi2iOW0SkAlFwi4i4jIJbRMRlFNwiIpZocVJE\nxEW0OCkiUoEouEVEXEbBLSLiMgpuERFLtDgpIuIiWpwUEalAFNwiIi6j4BYRcRkFt4iIyyi4RUQs\n0OKkiEgFouAWEXEZBbeIiMsouEVELPLl9KSCW0TEZRTcIiIuo+AWEXEZBbeIiMsouEVELNLipJ9k\nZGTYLqFMVL89bq4dVH+w+Xp60ufgTk9Pp2HDhtSvX59XXnnF148JSW77j/9zqt8eN9cOqt8tfAru\n4uJinnzySdLT09mzZw/z5s1j7969/q5NRESuwqfg3rp1K3fddRd169alcuXK9OvXj48++sjftYmI\nlHu+zHGHOY73b1u0aBGrVq1i1qxZAMydO5ctW7YwY8YM86FlaXslIlKBlSaSw3354BsFsw9/FoiI\nSCn5NFVSp04djh49eunnR48eJTY21m9FiYjItfkU3C1atGD//v0cPnyYgoICFixYQM+ePf1dm4iI\nXIVPUyXh4eG8/vrrdOnSheLiYh599FESEhL8XZuIiFyFz/u4u3Xrxr59+zhw4ADPP//8pe+7eX/3\niBEjiI6OpnHjxrZL8cnRo0dp3749SUlJNGrUiOnTp9suySvff/89qampJCcnk5iYeMXvK7coLi4m\nJSWFHj162C7Fa3Xr1qVJkyakpKTQsmVL2+V47dSpU/Tp04eEhAQSExPZvHmz7ZJKbd++faSkpFz6\nioqKuv7/v44fFRUVOfHx8c6hQ4ecgoICp2nTps6ePXv8+YiAWrdunZOVleU0atTIdik+yc3NdbZv\n3+44juOcPXvWadCggav+/TuO4+Tn5zuO4ziFhYVOamqqs379essVeWfq1KnOgAEDnB49etguxWt1\n69Z1Tp48absMnw0ZMsR58803Hccxv39OnTpluSLfFBcXOzExMc6RI0eu+Rq/Hnl3+/7uNm3aUL16\nddtl+CwmJobk5GQAIiIiSEhI4Ouvv7ZclXduvfVWAAoKCiguLqZGjRqWKyq9nJwcVqxYwciRI127\ns8qtdZ8+fZr169czYsQIwEznRkVFWa7KN2vWrCE+Pp64uLhrvsavwX3s2LErHhYbG8uxY8f8+Qgp\npcOHD7N9+3ZSU1Ntl+KVkpISkpOTiY6Opn379iQmJtouqdTGjh3LlClTqFTJnS2AwsLC6NixIy1a\ntLh0RsMtDh06RM2aNRk+fDjNmjVj1KhRnD9/3nZZPpk/fz4DBgy47mv8+jtMB29Cw7lz5+jTpw/T\npk0jIiLCdjleqVSpEjt27CAnJ4d169a5pvfE8uXLqVWrFikpKa4dtW7cuJHt27ezcuVKZs6cyfr1\n622XVGpFRUVkZWXx+OOPk5WVRbVq1Zg8ebLtsrxWUFDAsmXL6Nu373Vf59fg1v5u+woLC+nduzeD\nBg0iLS3Ndjk+i4qKonv37mzbts12KaWyadMmli5dSr169ejfvz9r165lyJAhtsvySu3atQGoWbMm\nvXr1YuvWrZYrKr3Y2FhiY2O5++67AejTpw9ZWVmWq/LeypUrad68OTVr1rzu6/wa3NrfbZfjODz6\n6KMkJiYyZswY2+V47cSJE5w6dQqACxcusHr1alJSUixXVTovv/wyR48e5dChQ8yfP5/77ruPOXPm\n2C6r1M6fP8/Zs2cByM/P55NPPnHV7qqYmBji4uLIzs4GzDxxUlKS5aq8N2/ePPr373/D1/m0j/ua\nH+by/d39+/cnMzOTkydPEhcXx8SJExk+fLjtskpt48aNzJ0799KWLoBJkybRtWtXy5WVTm5uLkOH\nDqWkpISSkhIGDx5Mhw4dbJflE7dNGx4/fpxevXoBZtph4MCBdO7c2XJV3pkxYwYDBw6koKCA+Ph4\nZs+ebbskr+Tn57NmzZpSrS/41GRKRETscefyt4hIBabgFhFxGQW3iIjLKLhFRFxGwS0i4jIKbhER\nl/k/0reyxz2zb7MAAAAASUVORK5CYII=\n" 152 | }, 153 | "metadata": {}, 154 | "output_type": "display_data" 155 | } 156 | ], 157 | "source": [ 158 | "plot(log(1+arange(len(frequencies))),log(frequencies))" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "id": "b41ac5c1", 164 | "metadata": {}, 165 | "source": [ 166 | "Now wrap this up as a function." 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "id": "f1c0ab77", 173 | "metadata": { 174 | "collapsed": false 175 | }, 176 | "outputs": [], 177 | "source": [ 178 | "def frequency_by_rank_plot(samples):\n", 179 | " ..." 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "id": "bec889a9", 185 | "metadata": {}, 186 | "source": [ 187 | "Now define multiple distributions and plot their frequency ranks.\n", 188 | "Which ones give rise to power laws?" 
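 Remember that on log-log axes a power law appears as an approximately straight line, so the plot above is the diagnostic to apply to each distribution."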
189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "id": "d00e4a4e", 195 | "metadata": { 196 | "collapsed": false 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "def distribution1(n,q):\n", 201 | " ...\n", 202 | "frequency_by_rank_plot(distribution1(100000,1000))" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "id": "8c57b066", 209 | "metadata": { 210 | "collapsed": false 211 | }, 212 | "outputs": [], 213 | "source": [ 214 | "def distribution2(n,q):\n", 215 | " ...\n", 216 | "frequency_by_rank_plot(distribution2(100000,1000))" 217 | ] 218 | } 219 | ], 220 | "metadata": {}, 221 | "nbformat": 4, 222 | "nbformat_minor": 5 223 | } 224 | -------------------------------------------------------------------------------- /HW2 - Classifier-Based Tagging.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "f1b129e9", 7 | "metadata": { 8 | "collapsed": false 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import nltk\n", 13 | "import tagutils; reload(tagutils)\n", 14 | "from tagutils import *\n", 15 | "from IPython.core.display import HTML\n", 16 | "from nltk.corpus import brown\n", 17 | "import random as pyrand\n", 18 | "from tagutils import *" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "367e4acf", 24 | "metadata": {}, 25 | "source": [ 26 | "# Evaluation Framework" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 5, 32 | "id": "ce49e28b", 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "sents = list(brown.tagged_sents())\n", 39 | "n = len(sents)\n", 40 | "test = sorted(list(set(range(0,n,10))))\n", 41 | "training = sorted(list(set(range(n))-set(test)))\n", 42 | "training_set = [sents[i] for i in training]\n", 43 | "test_set = [sents[i] for i in test]" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 7, 49 | "id": "41dd57a9", 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "51606\n", 59 | "5734\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "print len(training_set)\n", 65 | "print len(test_set)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 6, 71 | "id": "4ff74a64", 72 | "metadata": { 73 | "collapsed": false 74 | }, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "0.9236947791164659" 80 | ] 81 | }, 82 | "execution_count": 6, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "t0 = nltk.DefaultTagger('NN')\n", 89 | "t1 = nltk.UnigramTagger(training_set, backoff=t0)\n", 90 | "t2 = nltk.BigramTagger(training_set, backoff=t1)\n", 91 | "t2.evaluate(test_set)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "id": "1914cd9d", 97 | "metadata": {}, 98 | "source": [ 99 | "# Classifier-Based Tagging" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 14, 105 | "id": "bdae973b", 106 | "metadata": { 107 | "collapsed": false 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "import nltk.tag.api\n", 112 | "# help(nltk.tag.api.TaggerI)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "id": "452f7133", 118 | "metadata": {}, 119 | "source": [ 120 | "Implement a new tagger based on classifiers.\n", 121 | "\n", 122 | "When applying a classifier, you need to 
transform the input into a feature vector.\n", 123 | "In this case, we are trying to predict $P(t_n \mid \langle \hbox{input words} \rangle)$. How do we do this?\n", 124 | "\n", 125 | "For a simple unigram tagger, we are estimating $P(t_n | w_n)$.\n", 126 | "If $w_n \in V = \\{1,...,N\\}$, where $V$ is a vocabulary of size $N$ representing each word\n", 127 | "as an integer, then the input feature vector might be a binary vector $\vec{x} = (x_1 ... x_N)$ where\n", 128 | "\n", 129 | "$$ x_i = \delta_{i,w_n} $$\n", 130 | "\n", 131 | "For a simple bigram tagger, we are estimating something like $P(t_n | w_n t_{n-1})$, which\n", 132 | "we could similarly represent as a concatenation of two large binary input vectors.\n", 133 | "\n", 134 | "However, such a brute force approach may not work very well because we have a very high\n", 135 | "dimensional input vector and classifiers often need a lot of training data.\n", 136 | "We are free to preprocess the data in any form we like in order to get better feature\n", 137 | "vectors. \n", 138 | "\n", 139 | "Here are some ideas:\n", 140 | "\n", 141 | "- use the posterior probabilities for tags returned by a unigram and bigram tagger as feature vectors\n", 142 | "- use possible grammatical categories and semantic categories from Wordnet as feature vectors\n", 143 | "- use simple features like capitalization, word length, and position in sentence\n", 144 | "- provide information about word frequency in input\n", 145 | "- \"hash\" the large range of possible words $V$ down to a much smaller vocabulary\n", 146 | "- same as before, but do the hashing somewhat more intelligently: leave all the stop words alone, but hash down the content words\n", 147 | "- do the \"hashing\" in some way that's informed by Wordnet\n", 148 | "\n", 149 | "Note that in order to be able to tag using the algorithms we have described, you can use tags assigned to previous words, but you cannot use tags assigned to subsequent words.\n", 150 | "\n", 151 | "Try to beat the bigram-with-backoff tagger above, using the same evaluation paradigm.\n", 152 | "Your tagger should implement the standard NLTK tagging API.\n", 153 | "\n", 154 | "Two classifiers to try are logistic regression and decision tree classifiers.\n", 155 | "You can use implementations from the `sklearn` package.
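 For instance, `sklearn.linear_model.LogisticRegression` and `sklearn.tree.DecisionTreeClassifier` both follow the usual `fit(X,y)`/`predict(X)` interface, so most of the work is in constructing a good feature matrix `X`."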
156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "id": "cb74b2b4", 162 | "metadata": { 163 | "collapsed": false 164 | }, 165 | "outputs": [], 166 | "source": [] 167 | } 168 | ], 169 | "metadata": {}, 170 | "nbformat": 4, 171 | "nbformat_minor": 5 172 | } 173 | -------------------------------------------------------------------------------- /HW2 - Improve Tagging with Wordnet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "0a68f3d1", 7 | "metadata": { 8 | "collapsed": false 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import nltk\n", 13 | "import tagutils; reload(tagutils)\n", 14 | "from tagutils import *\n", 15 | "from IPython.core.display import HTML\n", 16 | "from nltk.corpus import brown\n", 17 | "import random as pyrand\n", 18 | "from tagutils import *" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "id": "0f36a27e", 24 | "metadata": {}, 25 | "source": [ 26 | "# Evaluation Framework" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 5, 32 | "id": "5325e07c", 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "sents = list(brown.tagged_sents())\n", 39 | "n = len(sents)\n", 40 | "test = sorted(list(set(range(0,n,10))))\n", 41 | "training = sorted(list(set(range(n))-set(test)))\n", 42 | "training_set = [sents[i] for i in training]\n", 43 | "test_set = [sents[i] for i in test]" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 7, 49 | "id": "3ee2d098", 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "51606\n", 59 | "5734\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "print len(training_set)\n", 65 | "print len(test_set)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 6, 71 | "id": "290c50e0", 72 | "metadata": { 73 | "collapsed": false 74 | }, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "0.9236947791164659" 80 | ] 81 | }, 82 | "execution_count": 6, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "t0 = nltk.DefaultTagger('NN')\n", 89 | "t1 = nltk.UnigramTagger(training_set, backoff=t0)\n", 90 | "t2 = nltk.BigramTagger(training_set, backoff=t1)\n", 91 | "t2.evaluate(test_set)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "id": "e909843a", 97 | "metadata": {}, 98 | "source": [ 99 | "# Wordnet-Based Improvements" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 14, 105 | "id": "79bafa77", 106 | "metadata": { 107 | "collapsed": false 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "import nltk.tag.api\n", 112 | "# help(nltk.tag.api.TaggerI)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "id": "b913049e", 118 | "metadata": {}, 119 | "source": [ 120 | "Your homework consists of implementing new taggers based on Wordnet.\n", 121 | "With regular taggers, we have a problem of sparsity; that is, we don't know\n", 122 | "what tag to assign to a word if we have never seen it in a context.\n", 123 | "\n", 124 | "However, for many words, Wordnet may give us useful information to help with\n", 125 | "tagging. 
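For example, `wn.synsets(word)` lists the senses a word can have, and `synset.pos()` on each sense tells you which parts of speech are possible at all, which already rules out many candidate tags for unseen words. 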
You need to work out some ideas, implement them, and test them.\n", 126 | "\n", 127 | "There are different implementation strategies, but a simple one might be:\n", 128 | "\n", 129 | "- write classes that map token sequences to other token sequences using\n", 130 | " WordNet; for example, you might map an input sentence to some collection\n", 131 | " of hyponyms\n", 132 | "- then, apply the regular NLTK n-gram taggers to the modified output sequences\n", 133 | "- use backoff (as above) when the WordNet mapping fails for some reason\n", 134 | " (you can't find the word, or maybe the mapping would be ambiguous and\n", 135 | " you don't know how to handle it)\n", 136 | "\n", 137 | "This may not be the best strategy, but it's a good way of getting started.\n", 138 | "\n", 139 | "Another strategy is to use WordNet to generate a cloud of related words\n", 140 | "around a given word, and then see whether you can find bigrams in an existing\n", 141 | "model for any of the related words.\n", 142 | "\n", 143 | "Implement your model(s) so that they conform to the NLTK tagging APIs,\n", 144 | "perform evaluations on the training and test sets defined above,\n", 145 | "and be ready to present your work (idea, evaluation, results) in \n", 146 | "the exercises.\n", 147 | "\n" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "id": "49438688", 154 | "metadata": { 155 | "collapsed": false 156 | }, 157 | "outputs": [], 158 | "source": [] 159 | } 160 | ], 161 | "metadata": {}, 162 | "nbformat": 4, 163 | "nbformat_minor": 5 164 | } 165 | -------------------------------------------------------------------------------- /HW3 - HMMs and FSTs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "c2cfa1b2", 6 | "metadata": {}, 7 | "source": [ 8 | "# Transform HMMs into FSTs" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "5bbd63a2", 14 | "metadata": {}, 15 | "source": [ 16 | "Assume you are given a Hidden Markov Model (HMM) described\n", 17 | "by a transition matrix $A$ and an emission matrix $B$,\n", 18 | "with $n$ states and $m$ symbols.\n", 19 | "Assume state $0$ is the start state.\n", 20 | "\n", 21 | "Write a function that converts the Hidden Markov Model into an\n", 22 | "equivalent FST and demonstrate that the two models give equivalent results.\n" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "1308396d", 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "def hmm_to_fst(A,B):\n", 35 | " \"\"\"Convert an HMM to an equivalent FST. State 0\n", 36 | " is always the start state, and state n-1 is always\n", 37 | " the accept state.\"\"\"\n", 38 | " n,n1 = A.shape\n", 39 | " assert n==n1\n", 40 | " m,n2 = B.shape\n", 41 | " assert n==n2\n", 42 | " accept_state = n-1\n", 43 | " raise Exception(\"IMPLEMENT ME\")\n", 44 | " return fst" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "id": "595857b4", 50 | "metadata": {}, 51 | "source": [ 52 | "For the following, just call the OpenFST functions." 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "id": "5a4dabef", 59 | "metadata": { 60 | "collapsed": false 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "def fst_log_probability(fst,s):\n", 65 | " \"\"\"Find the lowest cost path corresponding to the string `s`\n", 66 | " through `fst`. 
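A standard route (one option, not a requirement) is to turn `s` into a linear chain acceptor, compose it with `fst`, and run the shortest-path operation on the composition. 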
The string `s` is given as a 1D numpy array.\"\"\"\n", 67 | " raise Exception(\"IMPLEMENT ME\")\n", 68 | " return log_p" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "id": "589aef5c", 74 | "metadata": {}, 75 | "source": [ 76 | "For the following, you can use the Viterbi algorithm from lecture as a starting point." 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 4, 82 | "id": "2e099820", 83 | "metadata": { 84 | "collapsed": false 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "def hmm_log_probability(A,B,s):\n", 89 | " \"\"\"Find the lowest cost path through the HMM corresponding\n", 90 | " to the given string `s`.\"\"\"\n", 91 | " raise Exception(\"IMPLEMENT ME\")\n", 92 | " return log_p" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "id": "f6147853", 98 | "metadata": {}, 99 | "source": [ 100 | "Now write some unit tests." 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "id": "39258b72", 107 | "metadata": { 108 | "collapsed": false 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "A = ones((1,1))\n", 113 | "B = ones((1,1))\n", 114 | "assert abs(hmm_log_probability(A,B,zeros(10,'i'))) < 1e-4\n", 115 | "\n", 116 | "fst = hmm_to_fst(A,B)\n", 117 | "assert abs(fst_log_probability(fst,zeros(10,'i'))) < 1e-4\n", 118 | "\n", 119 | "# more meaningful unit tests" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "id": "849bd7c9", 125 | "metadata": {}, 126 | "source": [ 127 | "And write some general purpose random tests." 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "id": "f42bf203", 134 | "metadata": { 135 | "collapsed": false 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "for trial in range(10):\n", 140 | " A = rand(10,10)\n", 141 | " # normalize appropriately\n", 142 | " B = rand(17,10)\n", 143 | " # normalize appropriately\n", 144 | " fst = hmm_to_fst(A,B)\n", 145 | " for trial2 in range(10):\n", 146 | " s = array(10*rand(7),'i')\n", 147 | " p1 = fst_log_probability(fst,s)\n", 148 | " p2 = hmm_log_probability(A,B,s)\n", 149 | " assert abs(p1-p2)/min(abs(p1),abs(p2)) < 1e-4" 150 | ] 151 | } 152 | ], 153 | "metadata": {}, 154 | "nbformat": 4, 155 | "nbformat_minor": 5 156 | } 157 | -------------------------------------------------------------------------------- /HW3 - regex generation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "a57cac02", 6 | "metadata": {}, 7 | "source": [ 8 | "# Regular Expression Class based on OpenFST" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "44d783c1", 14 | "metadata": {}, 15 | "source": [ 16 | "The goal of this exercise is to write a small regular expression class\n", 17 | "that internally uses OpenFST to perform the matching.
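 A classic way to build such a transducer is Thompson's construction: assemble it recursively from the parsed expression, using epsilon transitions to wire up concatenation, alternation, and the star/plus operators."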
18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "id": "058dfa5c", 24 | "metadata": { 25 | "collapsed": false 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "class OpenRE:\n", 30 | " def __init__(self,regex=None,cost=0.0):\n", 31 | " if regex is not None:\n", 32 | " self.add(regex,cost)\n", 33 | " self.compile()\n", 34 | " # IMPLEMENT ME\n", 35 | " def add(self,regex,cost=0.0):\n", 36 | " \"\"\"Add a regular expression to the overall\n", 37 | " regular expression using a disjunction.\"\"\"\n", 38 | " # IMPLEMENT ME\n", 39 | " def compile(self):\n", 40 | " \"\"\"After adding component regular expressions,\n", 41 | " compile the internal fst.\"\"\"\n", 42 | " # IMPLEMENT ME\n", 43 | " self.fst = something\n", 44 | " def cost(self,s):\n", 45 | " \"\"\"Match the given string against the compiled\n", 46 | " regular expression and return the cost. Returns\n", 47 | " `inf` if there is no match.\"\"\"\n", 48 | " # IMPLEMENT ME\n", 49 | " return cost" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "id": "df247c82", 55 | "metadata": {}, 56 | "source": [ 57 | "Your package should understand the following expressions:\n", 58 | "\n", 59 | "- \"ABC\" - simple strings\n", 60 | "- \"AB|CD\" - alternation\n", 61 | "- \"AB*C\" - regex star (zero or more repeats)\n", 62 | "- \"AB+C\" - regex plus (one or more repeats)\n", 63 | "- \"A(B|C)*D\" - parentheses and optional operators\n", 64 | "\n", 65 | "Assume that expressions are implicitly anchored at the beginning and end\n", 66 | "(no partial matches).\n", 67 | "\n", 68 | "It's OK if you limit yourself to ASCII strings. Use `ord` to encode characters\n", 69 | "to integers. Do not worry about escape characters or wildcards." 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "id": "599cd492", 75 | "metadata": {}, 76 | "source": [ 77 | "# Unit Tests" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "id": "4ef79ae4", 83 | "metadata": {}, 84 | "source": [ 85 | "Write a set of unit tests demonstrating that your code works." 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "id": "0e32cd81", 92 | "metadata": { 93 | "collapsed": false 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "assert OpenRE(\"abc\").cost(\"abc\") == 0\n", 98 | "assert OpenRE(\"abC\").cost(\"abc\") == inf\n", 99 | "assert OpenRE(\"ab\").cost(\"abc\") == inf # anchored: no partial matches\n", 100 | "assert OpenRE(\"(a|b)\").cost(\"a\") == 0\n", 101 | "assert OpenRE(\"(a|b)\").cost(\"b\") == 0\n", 102 | "assert OpenRE(\"a|b\").cost(\"a\") == 0\n", 103 | "# etc." 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "id": "cb2ff370", 109 | "metadata": {}, 110 | "source": [ 111 | "# Parsing" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "id": "32eb7e36", 117 | "metadata": {}, 118 | "source": [ 119 | "For parsing the regular expression itself, you may want to use the `pyparsing` module.\n", 120 | "\n", 121 | "Here is a simple example of how you might go about this. Note that this is _not_ a correct\n", 122 | "regular expression parser yet and that you may want to generate a different kind of structure.\n", 123 | "\n", 124 | "Read the documentation to figure out how to deal with whitespace and more characters.
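 (One pitfall: pyparsing skips leading whitespace by default, which `leaveWhitespace()` disables, and the character class in `Regex('[^()|+*]+')` above determines which literal characters your grammar accepts.)"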
125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "id": "3459ffe3", 131 | "metadata": { 132 | "collapsed": false 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "from pyparsing import *\n", 137 | "postfix = Literal('+') | Literal('*')\n", 138 | "alt = Literal( '|' )\n", 139 | "lpar = Literal( '(' ).suppress()\n", 140 | "rpar = Literal( ')' ).suppress()\n", 141 | "lit = Regex('[^()|+*]+')\n", 142 | "expr = Forward()\n", 143 | "term = lit | alt + expr | Group( lpar + expr + rpar + Optional(postfix) )\n", 144 | "expr << ZeroOrMore( term )\n", 145 | "expr.parseString(\"hello, (world|there)+|(a(b)c)\")" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "id": "730deba5", 152 | "metadata": { 153 | "collapsed": false 154 | }, 155 | "outputs": [], 156 | "source": [] 157 | } 158 | ], 159 | "metadata": {}, 160 | "nbformat": 4, 161 | "nbformat_minor": 5 162 | } 163 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Natural Language Processing and Applications 2 | 3 | [Course](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-course.ipynb) 4 | 5 | [Intro](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-intro.ipynb) 6 | 7 | [Demo Videos](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-intro-demo-videos.ipynb) 8 | 9 | ## UNIX Command Line Tools 10 | 11 | [Database Join](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-unix-join.ipynb) 12 | 13 | [Making the Brown Corpus Readable](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-unix-cleanup.ipynb) 14 | 15 | [Downloading Text from the Internet](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-downloading-tomsawyer.ipynb) 16 | 17 | [Word Histogram](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-word-histograms.ipynb) 18 | 19 | [find and xargs](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-find-xargs.ipynb) 20 | 21 | [Zipf Law](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/zipf-law-example.ipynb) 22 | 23 | ## Web Data 24 | 25 | [Crawling with Scrapy](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/crawling-with-scrapy.ipynb) 26 | 27 | [Internet Archive Query](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/internet-archive-query.ipynb) 28 | 29 | ## Text Data and Algorithms 30 | 31 | [Unicode](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-unicode.ipynb) 32 | 33 | [Intro to Regular Expressions](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-re-intro.ipynb) 34 | 35 | [Regular Expressions](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-regular-expressions.ipynb) 36 | 37 | [Regular Expressions and FSA](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-re-fsa.ipynb) 38 | 39 | [Homework Regular Expressions](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/HW1 - Regular Expression Parsing.ipynb) 40 | 41 | [Edit Distance](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-edit-distance.ipynb) 42 | 43 | ## NLTK 44 | 45 | [NLTK 
Basics](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-basic-nltk.ipynb) 46 | 47 | [NLTK Basics (2)](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-nltk-basic.ipynb) 48 | 49 | [Corpora](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-corpora.ipynb) 50 | 51 | [NLTK Corpora](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-nltk-corpora.ipynb) 52 | 53 | [Lexical Resources](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-lexical-resources.ipynb) 54 | 55 | [NLTK Lexical Resources](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-nltk-lexical-resources.ipynb) 56 | 57 | [NLTK Reading German](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-nltk-reading-german.ipynb) 58 | 59 | [Wordnet](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-wordnet.ipynb) 60 | 61 | [NLTK Wordnet](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-nltk-wordnet.ipynb) 62 | 63 | [NLTK POS Tagging](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-pos-tagging.ipynb) 64 | 65 | [NLTK Automated Tagging](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-nltk-automated-tagging.ipynb) 66 | 67 | [NLTK Available Taggers](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nltk-available-taggers.ipynb) 68 | 69 | [NLTK n-gram Taggers](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nltk-ngram-taggers.ipynb) 70 | 71 | [NLTK Tagging from Scratch](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nltk-tagging-from-scratch.ipynb) 72 | 73 | [NLTK Stemming and Lemmatizing](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nltk-summary-stemming-lemmatizing.ipynb) 74 | 75 | ## Markov Models and OpenFST 76 | 77 | [Markov Models](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-markov-models.ipynb) 78 | 79 | [HMM OCR](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-hmm-ocr.ipynb) 80 | 81 | ## Finite State Transducers 82 | 83 | [OpenFST](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-openfst.ipynb) 84 | 85 | [OpenFST (2)](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-openfst2.ipynb) 86 | 87 | [OpenFST Edit Distance](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-openfst-edit-distance.ipynb) 88 | 89 | [OpenFST Weights and Forward-Backward](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/openfst-weights-and-forwardbackward.ipynb) 90 | 91 | ## Classification 92 | 93 | [Introduction to Classification](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-classification-intro.ipynb) 94 | 95 | [Classification for Tagging](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-classification-tagging.ipynb) 96 | 97 | [Sentence Segmentation](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-classifier-sentence-segmentation.ipynb) 98 | 99 | [Classifier for Dialog Acts](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-classifier-dialog-acts.ipynb) 100 | 101 | [Classifier 
Errors](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-classifier-errors.ipynb) 102 | 103 | ## Information Retrieval 104 | 105 | [Simple Information Retrieval](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-simple-ir.ipynb) 106 | 107 | [Vectorspace Model](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-vectorspace.ipynb) 108 | 109 | [MEMM](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-memm.ipynb) 110 | 111 | ## Homework 112 | 113 | [Homework Implement a Trie](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/HW1 - Implement a Trie.ipynb) 114 | 115 | [Homework Reproduce Power Laws](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/HW1 - Reproduce Power Laws.ipynb) 116 | 117 | [Homework NLTK Wordnet](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/HW2+-+Improve+Tagging+with+Wordnet.ipynb) 118 | 119 | [Homework Classifier Based Tagging](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/HW2 - Classifier-Based Tagging.ipynb) 120 | 121 | [Homework HMMs and FSTs](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/HW3 - HMMs and FSTs.ipynb) 122 | 123 | [Homework Regex Generation](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/HW3 - regex generation.ipynb) 124 | 125 | -------------------------------------------------------------------------------- /brown.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmbdev-teaching/teaching-nlpa/5409d4ad0d762c76e262e130f4fcbf6bcdfc2317/brown.zip -------------------------------------------------------------------------------- /college-degrees-perc.tsv: -------------------------------------------------------------------------------- 1 | 1 Norway 32 2 | 2 United States 31 3 | 3 Netherlands 29 4 | 4 Iceland 26 5 | 5 Denmark 25 6 | 6 New Zealand 25 7 | 7 Canada 25 8 | 8 South Korea 24 9 | 9 Australia 24 10 | 10 Sweden 23 11 | 11 United Kingdom 23 12 | 12 Japan 23 13 | 13 Finland 21 14 | 14 Ireland 21 15 | 15 Switzerland 21 16 | 16 Spain 20 17 | 17 Poland 19 18 | 18 Hungary 18 19 | 19 Luxembourg 18 20 | 20 Germany 16 21 | 21 France 16 22 | 22 Greece 15 23 | 23 Mexico 15 24 | 24 Belgium 14 25 | 25 Portugal 14 26 | -------------------------------------------------------------------------------- /crawling-with-scrapy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "070ce222", 7 | "metadata": { 8 | "collapsed": false 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "from scrapy.spider import BaseSpider" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 14, 18 | "id": "4803dd7e", 19 | "metadata": { 20 | "collapsed": false 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "class MySpider(BaseSpider):\n", 25 | " name = \"uni-kl.de\"\n", 26 | " allowed_domains = [\"uni-kl.de\"]\n", 27 | " start_urls = [\n", 28 | " \"http://www.uni-kl.de/\",\n", 29 | " ]\n", 30 | "\n", 31 | " def parse(self, response):\n", 32 | " filename = response.url.split(\"/\")[-2]\n", 33 | " open(filename, 'wb').write(response.body)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 17, 39 | "id": "6a9b5c8b", 40 | "metadata": { 41 | "collapsed": false 42 | }, 43 | "outputs": 
[], 44 | "source": [ 45 | "# see: http://stackoverflow.com/questions/7993680/running-scrapy-tasks-in-python\n", 46 | "\n", 47 | "from scrapy import project, signals\n", 48 | "from scrapy.conf import settings\n", 49 | "from scrapy.crawler import CrawlerProcess\n", 50 | "from scrapy.xlib.pydispatch import dispatcher\n", 51 | "from multiprocessing.queues import Queue\n", 52 | "from multiprocessing import Process\n", 53 | "\n", 54 | "class CrawlerWorker(Process):\n", 55 | " def __init__(self, spider, results):\n", 56 | " Process.__init__(self)\n", 57 | " self.results = results\n", 58 | "\n", 59 | " self.crawler = CrawlerProcess(settings)\n", 60 | " if not hasattr(project, 'crawler'):\n", 61 | " self.crawler.install()\n", 62 | " self.crawler.configure()\n", 63 | "\n", 64 | " self.items = []\n", 65 | " self.spider = spider\n", 66 | " dispatcher.connect(self._item_passed, signals.item_passed)\n", 67 | "\n", 68 | " def _item_passed(self, item):\n", 69 | " self.items.append(item)\n", 70 | "\n", 71 | " def run(self):\n", 72 | " self.crawler.crawl(self.spider)\n", 73 | " self.crawler.start()\n", 74 | " self.crawler.stop()\n", 75 | " self.results.put(self.items)\n" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 18, 81 | "id": "7d601d0c", 82 | "metadata": { 83 | "collapsed": false 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "# The part below can be called as often as you want\n", 88 | "results = Queue()\n", 89 | "crawler = CrawlerWorker(MySpider(), results)\n", 90 | "crawler.start()\n", 91 | "for item in results.get():\n", 92 | " print item" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "cd6b067d", 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [], 103 | "source": [] 104 | } 105 | ], 106 | "metadata": {}, 107 | "nbformat": 4, 108 | "nbformat_minor": 5 109 | } 110 | -------------------------------------------------------------------------------- /fstutils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from pylab import * 4 | import openfst 5 | from openfst import StdVectorFst as FST 6 | from openfst import LogVectorFst as LFST 7 | 8 | ASCII = openfst.SymbolTable("ASCII") 9 | 10 | for i in range(127): 11 | if i==0: 12 | ASCII.AddSymbol("ϵ",i) 13 | elif i<=32: 14 | ASCII.AddSymbol("$%02x"%i,i) 15 | else: 16 | ASCII.AddSymbol(chr(i),i) 17 | 18 | def minimize(fst): 19 | dfst = FST() 20 | openfst.Determinize(fst,dfst) 21 | openfst.Minimize(dfst) 22 | return dfst 23 | 24 | def log_minimize(fst): 25 | dfst = LFST() 26 | openfst.Determinize(fst,dfst) 27 | openfst.Minimize(dfst) 28 | return dfst 29 | 30 | def show_fst(fst): 31 | import pydot,pylab 32 | graph = pydot.Dot(rankdir="LR") 33 | isyms = fst.InputSymbols() 34 | if not isyms: isyms = ASCII 35 | osyms = fst.OutputSymbols() 36 | if not osyms: osyms = ASCII 37 | for s in range(fst.NumStates()): 38 | if s==fst.Start(): 39 | n = pydot.Node("%d"%s,shape="box") 40 | graph.add_node(n) 41 | if fst.IsFinal(s): 42 | l = '"' 43 | l += "%d"%s # node id 44 | if fst.Final(s).Value()!=0.0: # optional non-zero accept cost 45 | l += "/%s"%fst.Final(s).Value() 46 | l += '"' 47 | n = pydot.Node("%d"%s,label=l,penwidth="3") 48 | graph.add_node(n) 49 | for t in range(fst.NumArcs(s)): 50 | a = fst.GetArc(s,t) 51 | l = '"' 52 | l += '%s'%isyms.Find(a.ilabel) 53 | if a.olabel!=a.ilabel: l += ":%s"%osyms.Find(a.olabel) 54 | v = a.weight.Value() 55 | if v!=0.0: l += "/%s"%v 56 | l += '"' 57 | n = a.nextstate 58 | 
e = pydot.Edge("%d"%s,"%d"%n,label=l) 59 | graph.add_edge(e) 60 | graph.write_png("/tmp/_test.png") 61 | pylab.gca().set_xticks([]); pylab.gca().set_yticks([]) 62 | pylab.clf() 63 | pylab.imshow(pylab.imread("/tmp/_test.png")) 64 | 65 | def fstsize(fst): 66 | edges = 0 67 | for s in range(fst.NumStates()): 68 | edges += fst.NumArcs(s) 69 | return fst.NumStates(),edges 70 | -------------------------------------------------------------------------------- /household-ppp.tsv: -------------------------------------------------------------------------------- 1 | 1 Luxembourg 34407 2 | 2 United States 31111 3 | 3 Norway 31011 4 | 4 Iceland 28166 5 | 5 Australia 26915 6 | 6 Switzerland 26844 7 | 7 Canada 25363 8 | 8 United Kingdom 25168 9 | 9 Ireland 24677 10 | 10 Austria 24114 11 | 11 Netherlands 24024 12 | 12 Sweden 22889 13 | 13 Denmark 22461 14 | 14 Belgium 21532 15 | 15 Germany 21241 16 | 16 Finland 20875 17 | 17 New Zealand 20679 18 | 18 France 19615 19 | 19 Japan 19432 20 | 20 South Korea 19179 21 | -------------------------------------------------------------------------------- /internet-archive-query.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "813a86b3", 6 | "metadata": {}, 7 | "source": [ 8 | "# Querying and Downloading from the Internet Archive" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "027159cd", 14 | "metadata": {}, 15 | "source": [ 16 | "This worksheet shows how to query the Internet Archive with JSON and how to download from it." 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 127, 22 | "id": "16257e0b", 23 | "metadata": { 24 | "collapsed": false 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "import urllib2\n", 29 | "import json\n", 30 | "import re\n", 31 | "import pprint\n", 32 | "pp = pprint.PrettyPrinter(indent=4).pprint\n", 33 | "Q = urllib2.quote\n", 34 | "U = urllib2.unquote" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "id": "ed7fca79", 40 | "metadata": {}, 41 | "source": [ 42 | "We construct a url-encoded query (can we also post JSON?)." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 97, 48 | "id": "b09830f8", 49 | "metadata": { 50 | "collapsed": false 51 | }, 52 | "outputs": [ 53 | { 54 | "name": "stdout", 55 | "output_type": "stream", 56 | "text": [ 57 | "http://archive.org/advancedsearch.php?q=title%3A%28alice%20in%20wonderland%29%20AND%20format%3A%28djvu%29&fl%5B%5D%3Didentifier&fl%5B%5D%3Dsource&fl%5B%5D%3Dtitle&rows=100&page=1&output=json\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "query = Q(\"title:(alice in wonderland) AND format:(djvu)\")\n", 63 | "columns = \"&\".join([Q(s) for s in \"fl[]=identifier fl[]=source fl[]=title\".split()])\n", 64 | "params = \"rows=100&page=1&output=json\"\n", 65 | "url = \"http://archive.org/advancedsearch.php?q=\"+query+\"&\"+columns+\"&\"+params\n", 66 | "print url" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 128, 72 | "id": "1e296144", 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "# could we also post the query?\n", 79 | "#jdata = json.dumps({\"username\":\"...\", \"password\":\"...\"})\n", 80 | "#urllib2.urlopen(\"http://www.example.com/\", jdata)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "id": "e53b48f4", 86 | "metadata": {}, 87 | "source": [ 88 | "Now we read and parse the response." 
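(An aside on the query construction above: `urllib.urlencode` can do the escaping from key-value pairs, which avoids one quirk of the hand-built URL, namely that quoting the whole `fl[]=identifier` token also percent-encodes the `=`; the endpoint happens to tolerate that. A sketch of the equivalent construction; note that `urlencode` escapes spaces as `+`, which is also accepted here.)

```python
import urllib

pairs = [("q", "title:(alice in wonderland) AND format:(djvu)"),
         ("fl[]", "identifier"),   # repeat the key to request
         ("fl[]", "source"),       # several columns
         ("fl[]", "title"),
         ("rows", 100), ("page", 1), ("output", "json")]
url = "http://archive.org/advancedsearch.php?" + urllib.urlencode(pairs)
```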
89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 129, 94 | "id": "1bac2f60", 95 | "metadata": { 96 | "collapsed": false 97 | }, 98 | "outputs": [ 99 | { 100 | "data": { 101 | "text/plain": [ 102 | "[u'start', u'numFound', u'docs']" 103 | ] 104 | }, 105 | "execution_count": 129, 106 | "metadata": {}, 107 | "output_type": "execute_result" 108 | } 109 | ], 110 | "source": [ 111 | "response = urllib2.urlopen(url).read()\n", 112 | "response = json.loads(response)[\"response\"]\n", 113 | "response.keys()" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 131, 119 | "id": "c19f9166", 120 | "metadata": { 121 | "collapsed": false 122 | }, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "text/plain": [ 127 | "43" 128 | ] 129 | }, 130 | "execution_count": 131, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "response[\"numFound\"]" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 132, 142 | "id": "f58eb244", 143 | "metadata": { 144 | "collapsed": false 145 | }, 146 | "outputs": [ 147 | { 148 | "data": { 149 | "text/plain": [ 150 | "43" 151 | ] 152 | }, 153 | "execution_count": 132, 154 | "metadata": {}, 155 | "output_type": "execute_result" 156 | } 157 | ], 158 | "source": [ 159 | "len(response[\"docs\"])" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "id": "2e9afd0a", 165 | "metadata": {}, 166 | "source": [ 167 | "Each doc contains a title and an identifier (we asked for those):" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 101, 173 | "id": "bd307dc0", 174 | "metadata": { 175 | "collapsed": false 176 | }, 177 | "outputs": [ 178 | { 179 | "name": "stdout", 180 | "output_type": "stream", 181 | "text": [ 182 | "0 Alice In Wonderland caralic\n", 183 | "1 Alice's Adventures In Wonderland AlicesAdventuresInWonderland\n", 184 | "2 Alice's Adventures in Wonderland AlicesAdventuresInWonderland_917\n", 185 | "3 Alice in Wonderland aliceinwonderla00carrgoog\n", 186 | "4 Alice's Adventures in Wonderland alicesadventure00jackgoog\n", 187 | "5 Alice's Adventures in Wonderland AlicesAdventuresInWonderland_841\n", 188 | "6 Alice's adventures in Wonderland alicesadventure00tenngoog\n", 189 | "7 Alice's Adventures in Wonderland alicesadventures00011gut\n", 190 | "8 Alice's adventures in Wonderland adventuresalices00carrrich\n", 191 | "9 Alice in Wonderland aliceinwonderlan00carriala\n" 192 | ] 193 | } 194 | ], 195 | "source": [ 196 | "for i,e in enumerate(response[\"docs\"][:10]):\n", 197 | " print i,e[\"title\"],e[\"identifier\"]" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 102, 203 | "id": "2a9dd657", 204 | "metadata": { 205 | "collapsed": false 206 | }, 207 | "outputs": [ 208 | { 209 | "data": { 210 | "text/plain": [ 211 | "u'AlicesAdventuresInWonderland_841'" 212 | ] 213 | }, 214 | "execution_count": 102, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | } 218 | ], 219 | "source": [ 220 | "identifier = response[\"docs\"][5][\"identifier\"]\n", 221 | "identifier" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "id": "4c6aa2b4", 227 | "metadata": {}, 228 | "source": [ 229 | "# Retrieving Details using the Identifier" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "id": "2d5f9448", 235 | "metadata": {}, 236 | "source": [ 237 | "Once we have the identifier for a document, we can retrieve more info about it,\n", 238 | "again in JSON." 
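(Since `numFound` may exceed `rows`, a small helper can page through the full result set first. This is a sketch assuming the `rows`/`page` parameters behave as documented above; `Q`, `json`, and `urllib2` are as imported at the top of this notebook.)

```python
import json, urllib2
Q = urllib2.quote

def search_all(query, rows=100):
    """Yield every matching doc, fetching pages until numFound is covered."""
    page = 1
    while True:
        url = ("http://archive.org/advancedsearch.php?q=" + Q(query) +
               "&rows=%d&page=%d&output=json" % (rows, page))
        response = json.loads(urllib2.urlopen(url).read())["response"]
        for doc in response["docs"]:
            yield doc
        if page * rows >= response["numFound"]:
            break
        page += 1

# e.g.: ids = [d["identifier"] for d in search_all("title:(alice in wonderland)")]
```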
239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 111, 244 | "id": "d7504a13", 245 | "metadata": { 246 | "collapsed": false 247 | }, 248 | "outputs": [ 249 | { 250 | "name": "stdout", 251 | "output_type": "stream", 252 | "text": [ 253 | "[u'files', u'misc', u'server', u'item', u'creativecommons', u'dir', u'metadata']\n", 254 | "ia701208.us.archive.org\n" 255 | ] 256 | } 257 | ], 258 | "source": [ 259 | "hit = urllib2.urlopen(\"http://archive.org/details/\"+Q(identifier)+\"?output=json\").read()\n", 260 | "hit = json.loads(hit)\n", 261 | "print hit.keys()\n", 262 | "print hit[\"server\"]" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "id": "1c3d32e1", 268 | "metadata": {}, 269 | "source": [ 270 | "We're particularly interested in the files." 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 112, 276 | "id": "c78731ca", 277 | "metadata": { 278 | "collapsed": false 279 | }, 280 | "outputs": [ 281 | { 282 | "name": "stdout", 283 | "output_type": "stream", 284 | "text": [ 285 | "[u'sha1', u'format', u'source', u'mtime', u'crc32', u'md5', u'original', u'size']\n" 286 | ] 287 | } 288 | ], 289 | "source": [ 290 | "print hit[\"files\"].items()[0][1].keys()" 291 | ] 292 | }, 293 | { 294 | "cell_type": "markdown", 295 | "id": "dbcca733", 296 | "metadata": {}, 297 | "source": [ 298 | "The file list contains information about formats, sources, sizes, etc.\n", 299 | "We're looking for text." 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 123, 305 | "id": "30ae9136", 306 | "metadata": { 307 | "collapsed": false 308 | }, 309 | "outputs": [ 310 | { 311 | "name": "stdout", 312 | "output_type": "stream", 313 | "text": [ 314 | "u'DjVu' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll.djvu\n", 315 | "u'Abbyy GZ' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_abbyy.gz\n", 316 | "u'Image Container PDF' u'original' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll.pdf\n", 317 | "u'Metadata' u'original' /AlicesAdventuresInWonderland_841_meta.xml\n", 318 | "u'Single Page Processed JP2 ZIP' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_jp2.zip\n", 319 | "u'DjVuTXT' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_djvu.txt\n", 320 | "u'Scandata' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_scandata.xml\n", 321 | "u'EPUB' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll.epub\n", 322 | "u'Metadata' u'original' /AlicesAdventuresInWonderland_841_files.xml\n", 323 | "u'Animated GIF' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll.gif\n", 324 | "u'Djvu XML' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_djvu.xml\n", 325 | "u'Additional Text PDF' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_text.pdf\n", 326 | "/86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_djvu.txt\n" 327 | ] 328 | } 329 | ], 330 | "source": [ 331 | "fname = None\n", 332 | "for k,v in hit[\"files\"].items():\n", 333 | " print repr(v[\"format\"]),repr(v[\"source\"]),k\n", 334 | " if v[\"format\"]==\"DjVuTXT\": fname = k\n", 335 | "print fname" 336 | ] 337 | }, 338 | { 339 | 
"cell_type": "markdown", 340 | "id": "f9c6f7ad", 341 | "metadata": {}, 342 | "source": [ 343 | "# Retrieving the File" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "id": "e7c579fc", 349 | "metadata": {}, 350 | "source": [ 351 | "We can retrieve files from the `archive.org/download` URL, combining the identifier for the document and the specific file name." 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 138, 357 | "id": "cfedd3cf", 358 | "metadata": { 359 | "collapsed": false 360 | }, 361 | "outputs": [ 362 | { 363 | "data": { 364 | "text/plain": [ 365 | "u'\\n\\n\\n1 \\n\\n\\n\\n\\nwtx \\n\\n\\n\\n\\n\\n% \\xa7eb vtfy \"tired of $LM&$ \\nby nzr sisfer* ojl. tdthlmnh \\n\\ndo : once or \"twice, sit ka.A \\nfittfottL tufa i&& Irotk ktv \\nS^^Mt Si ^ r w <-7i. fi\\xa3r own niind, \\n^aS w&^ as S/te- Could, fa'" 366 | ] 367 | }, 368 | "execution_count": 138, 369 | "metadata": {}, 370 | "output_type": "execute_result" 371 | } 372 | ], 373 | "source": [ 374 | "text = urllib2.urlopen(\"http://archive.org/download/\"+Q(identifier)+fname).read()\n", 375 | "text = text.decode(\"utf-8\")\n", 376 | "text[:400]" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "id": "68e95f62", 382 | "metadata": {}, 383 | "source": [ 384 | "We can now continue to process this text, for example with NLTK." 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 139, 390 | "id": "ad6c305b", 391 | "metadata": { 392 | "collapsed": false 393 | }, 394 | "outputs": [], 395 | "source": [ 396 | "import nltk\n", 397 | "tokens = nltk.tokenize.word_tokenize(text)" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 140, 403 | "id": "d0495f59", 404 | "metadata": { 405 | "collapsed": false 406 | }, 407 | "outputs": [ 408 | { 409 | "data": { 410 | "text/plain": [ 411 | "[u'.', u'\\u25a0', u'/I', u'nor', u'way', u'\"', u'to', u'Uar', u'^', u'U', u'T', u'^', u'Ub', u'say', u'~', u'6', u'>', u'rfs', u'\\xab', u'#', u\"'\", u'cU', u'\\xb1', u'r', u',', u'dear', u';', u'*', u'UtL', u'U', u'too', u'\\xa3', u'*', u'\\xa3', u'e', u'r', u'(', u'vji', u'*', u'n.', u'$', u'U', u'idLca.', u'3', u'i&', u'A', u'ovtr', u'*', u'.', u'\\xa3', u'&-', u'-WO-rcLS', u')', u'tir', u'occurred', u'&', u'A-', u'*', u'*', u'*', u'that', u's', u'^', u'e', u'oll', u'^', u'H', u'tfi', u'kavt', u'woTuLkfttL', u'at', u'-tiiis', u',', u'(', u'rat', u'ai', u'Ofb', u'tirae', u',', u'l&', u'alt', u'\\u2022', u'seemed', u'auitl', u'natural', u'}', u'>', u'bu.t', u'wAe', u'*', u'.', u'\\xb1', u'kt', u'raUit', u'actadly', u'-took', u'QL', u'w', u'atch.', u'out']" 412 | ] 413 | }, 414 | "execution_count": 140, 415 | "metadata": {}, 416 | "output_type": "execute_result" 417 | } 418 | ], 419 | "source": [ 420 | "tokens[200:300]" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "id": "f3a8be5e", 427 | "metadata": { 428 | "collapsed": false 429 | }, 430 | "outputs": [], 431 | "source": [] 432 | } 433 | ], 434 | "metadata": {}, 435 | "nbformat": 4, 436 | "nbformat_minor": 5 437 | } 438 | -------------------------------------------------------------------------------- /letter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tmbdev-teaching/teaching-nlpa/5409d4ad0d762c76e262e130f4fcbf6bcdfc2317/letter.png -------------------------------------------------------------------------------- /nlpa-classification-tagging.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 23, 6 | "id": "03d3a48a", 7 | "metadata": { 8 | "collapsed": false 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import nltk\n", 13 | "from nltk.corpus import names\n", 14 | "from pylab import *\n", 15 | "import random as pyrandom" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "id": "38897036", 21 | "metadata": {}, 22 | "source": [ 23 | "# Parts of Speech Tagging" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 24, 29 | "id": "7101d69a", 30 | "metadata": { 31 | "collapsed": false 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "from nltk.corpus import brown" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 25, 41 | "id": "893549d7", 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "tagged_words = brown.tagged_words(categories='news')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 44, 53 | "id": "2d8fdc90", 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "def features(s,i,y):\n", 60 | " f = dict(ltag=y[i-1] if i>0 else \"^\", # previous tag\n", 61 | " lword=s[i-1] if i>0 else \"^\", # previous word\n", 62 | " s1 = s[i][-1:], # current word features\n", 63 | " s2 = s[i][-2:],\n", 64 | " s3 = s[i][-3:])\n", 65 | " return f" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "id": "fbe6ff18", 71 | "metadata": {}, 72 | "source": [ 73 | "# Training" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 35, 79 | "id": "7ccac27e", 80 | "metadata": { 81 | "collapsed": false 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "data = []\n", 86 | "for sy in brown.tagged_sents(categories='news'):\n", 87 | " s,y = zip(*sy)\n", 88 | " for i in range(len(s)):\n", 89 | " data.append((features(s,i,y),y[i]))\n", 90 | "n = len(data)\n", 91 | "training_set = data[n//10:]\n", 92 | "test_set = data[:n//10]" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 36, 98 | "id": "66ba816e", 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "classifier = nltk.NaiveBayesClassifier.train(training_set)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 37, 110 | "id": "fd3afe50", 111 | "metadata": { 112 | "collapsed": false 113 | }, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "0.8176031824962705" 119 | ] 120 | }, 121 | "execution_count": 37, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "nltk.classify.accuracy(classifier,test_set)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "id": "24401659", 133 | "metadata": {}, 134 | "source": [ 135 | "# Greedy Decoding" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 46, 141 | "id": "596ce943", 142 | "metadata": { 143 | "collapsed": false 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "class MyTagger:\n", 148 | " def __init__(self,classifier):\n", 149 | " self.classifier = classifier\n", 150 | " def tag(self,s):\n", 151 | " y = []\n", 152 | " for i in range(len(s)):\n", 153 | " f = features(s,i,y)\n", 154 | " y.append(self.classifier.classify(f))\n", 155 | " return zip(s,y) " 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 40, 161 | "id": "bb6adcd5", 162 | "metadata": { 
163 | "collapsed": false 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "tagger = MyTagger(classifier)" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 45, 173 | "id": "cab4c5cb", 174 | "metadata": { 175 | "collapsed": false 176 | }, 177 | "outputs": [ 178 | { 179 | "data": { 180 | "text/plain": [ 181 | "[('The', 'AT'),\n", 182 | " ('quick', 'NN'),\n", 183 | " ('brown', 'NN'),\n", 184 | " ('fox', 'NPS-TL'),\n", 185 | " ('jumped', 'VBD'),\n", 186 | " ('over', 'RP'),\n", 187 | " ('the', 'AT'),\n", 188 | " ('lazy', 'JJ'),\n", 189 | " ('dogs.', 'NP')]" 190 | ] 191 | }, 192 | "execution_count": 45, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "tagger.tag(\"The quick brown fox jumped over the lazy dogs.\".split())" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "id": "dd64592b", 204 | "metadata": {}, 205 | "source": [ 206 | "# More Advanced Models" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "id": "662450d0", 212 | "metadata": {}, 213 | "source": [ 214 | "- Viterbi Decoding\n", 215 | "- MEMM\n", 216 | "- Conditional Random Fields" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "id": "05f70ce3", 223 | "metadata": { 224 | "collapsed": false 225 | }, 226 | "outputs": [], 227 | "source": [] 228 | } 229 | ], 230 | "metadata": {}, 231 | "nbformat": 4, 232 | "nbformat_minor": 5 233 | } 234 | -------------------------------------------------------------------------------- /nlpa-classifier-dialog-acts.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "id": "285c5b90", 7 | "metadata": { 8 | "collapsed": false 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import nltk\n", 13 | "from nltk.corpus import names\n", 14 | "from pylab import *\n", 15 | "import random as pyrandom" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "id": "a179c9e6", 21 | "metadata": {}, 22 | "source": [ 23 | "# Dialog Act Type Classification" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 4, 29 | "id": "a2f720fc", 30 | "metadata": { 31 | "collapsed": false 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "posts = nltk.corpus.nps_chat.xml_posts()[:10000]" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 5, 41 | "id": "290cf628", 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "def features(post):\n", 48 | " f = {}\n", 49 | " for w in nltk.word_tokenize(post): f[w.lower()] = True\n", 50 | " return f" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 6, 56 | "id": "1f37d87f", 57 | "metadata": { 58 | "collapsed": false 59 | }, 60 | "outputs": [ 61 | { 62 | "data": { 63 | "text/plain": [ 64 | "'wouldnt let her date'" 65 | ] 66 | }, 67 | "execution_count": 6, 68 | "metadata": {}, 69 | "output_type": "execute_result" 70 | } 71 | ], 72 | "source": [ 73 | "posts[333].text" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 7, 79 | "id": "4e23cd97", 80 | "metadata": { 81 | "collapsed": false 82 | }, 83 | "outputs": [ 84 | { 85 | "data": { 86 | "text/plain": [ 87 | "'Emotion'" 88 | ] 89 | }, 90 | "execution_count": 7, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | } 94 | ], 95 | "source": [ 96 | "posts[333].get('class')" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 8, 102 | 
"id": "f8965b14", 103 | "metadata": { 104 | "collapsed": false 105 | }, 106 | "outputs": [ 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "set(['Emotion', 'ynQuestion', 'yAnswer', 'Continuer', 'whQuestion', 'System', 'Accept', 'Clarify', 'Emphasis', 'nAnswer', 'Greet', 'Statement', 'Reject', 'Bye', 'Other'])\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "print set([p.get('class') for p in posts])" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 9, 122 | "id": "b63c66a7", 123 | "metadata": { 124 | "collapsed": false 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "featuresets = [(features(p.text),p.get('class')) for p in posts]" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 10, 134 | "id": "eedff20d", 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "training_set = featuresets[1000:]\n", 141 | "test_set = featuresets[:1000]" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 11, 147 | "id": "6f6671f5", 148 | "metadata": { 149 | "collapsed": false 150 | }, 151 | "outputs": [ 152 | { 153 | "name": "stdout", 154 | "output_type": "stream", 155 | "text": [ 156 | "0.66\n" 157 | ] 158 | } 159 | ], 160 | "source": [ 161 | "classifier = nltk.NaiveBayesClassifier.train(training_set)\n", 162 | "print nltk.classify.accuracy(classifier,test_set)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "id": "19972e3d", 169 | "metadata": { 170 | "collapsed": false 171 | }, 172 | "outputs": [], 173 | "source": [] 174 | } 175 | ], 176 | "metadata": {}, 177 | "nbformat": 4, 178 | "nbformat_minor": 5 179 | } 180 | -------------------------------------------------------------------------------- /nlpa-classifier-sentence-segmentation-Copy0.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 5 6 | } 7 | -------------------------------------------------------------------------------- /nlpa-classifier-sentence-segmentation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "38c97229", 7 | "metadata": { 8 | "collapsed": false 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import nltk\n", 13 | "from nltk.corpus import names\n", 14 | "from pylab import *\n", 15 | "import random as pyrandom" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "id": "01d28868", 21 | "metadata": {}, 22 | "source": [ 23 | "# Sentence Segmentation" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 34, 29 | "id": "4b05b3e9", 30 | "metadata": { 31 | "collapsed": false 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "sents = nltk.corpus.treebank_raw.sents()\n", 36 | "sents = [s for s in sents if len(s)>3]\n", 37 | "sents = [s for s in sents if \"START\" not in s]" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 35, 43 | "id": "98e6db7e", 44 | "metadata": { 45 | "collapsed": false 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "tokens = []\n", 50 | "boundaries = []\n", 51 | "for s in sents:\n", 52 | " tokens += s\n", 53 | " boundaries.append(len(tokens)-1)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 36, 59 | "id": "6990a8a0", 60 | "metadata": { 61 | "collapsed": false 62 | }, 63 | "outputs": [ 64 | { 
65 | "name": "stdout", 66 | "output_type": "stream", 67 | "text": [ 68 | "['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov', '.', '29', '.', 'Mr', '.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N', '.', 'V', '.,', 'the', 'Dutch', 'publishing', 'group', '.', 'Rudolph', 'Agnew', ',', '55', 'years', 'old', 'and', 'former', 'chairman', 'of', 'Consolidated', 'Gold', 'Fields', 'PLC', ',', 'was', 'named', 'a', 'nonexecutive', 'director', 'of', 'this', 'British', 'industrial', 'conglomerate', '.', 'A', 'form', 'of', 'asbestos', 'once', 'used', 'to', 'make', 'Kent', 'cigarette', 'filters', 'has', 'caused', 'a', 'high', 'percentage', 'of', 'cancer', 'deaths', 'among', 'a', 'group', 'of', 'workers', 'exposed', 'to', 'it', 'more', 'than', '30', 'years', 'ago', ',', 'researchers', 'reported', '.', 'The', 'asbestos', 'fiber', ',', 'crocidolite', ',', 'is', 'unusually', 'resilient', 'once', 'it', 'enters', 'the', 'lungs', ',', 'with', 'even', 'brief', 'exposures', 'to', 'it', 'causing', 'symptoms', 'that', 'show', 'up', 'decades', 'later', ',', 'researchers', 'said', '.', 'Lorillard', 'Inc', '.,', 'the', 'unit', 'of', 'New', 'York', '-', 'based', 'Loews', 'Corp', '.', 'that', 'makes', 'Kent', 'cigarettes', ',', 'stopped', 'using', 'crocidolite', 'in', 'its', 'Micronite', 'cigarette', 'filters', 'in', '1956', '.', 'Although', 'preliminary', 'findings', 'were', 'reported', 'more', 'than', 'a', 'year', 'ago', ',', 'the', 'latest', 'results', 'appear', 'in', 'today', \"'\", 's', 'New', 'England', 'Journal', 'of', 'Medicine', ',', 'a', 'forum', 'likely', 'to', 'bring', 'new', 'attention', 'to', 'the', 'problem', '.', 'A', 'Lorillard', 'spokewoman', 'said', ',', '\"']\n" 69 | ] 70 | } 71 | ], 72 | "source": [ 73 | "print tokens[:200]" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 42, 79 | "id": "9c84af47", 80 | "metadata": { 81 | "collapsed": false 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "def features(s,i):\n", 86 | " return dict(current=tokens[i],\n", 87 | " prev=tokens[i-1],\n", 88 | " next=tokens[i+1],\n", 89 | " upper=tokens[i+1][0].isupper(),\n", 90 | " plen=len(tokens[i-1]),\n", 91 | " nlen=len(tokens[i+1]))\n", 92 | " " 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 46, 98 | "id": "fa07a737", 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "5951\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "data = []\n", 113 | "for i in range(1,len(tokens)-1):\n", 114 | " if tokens[i] not in [\".\",\"?\",\"!\"]: continue\n", 115 | " c = (i in boundaries)\n", 116 | " f = features(tokens,i)\n", 117 | " data.append((f,c))\n", 118 | "pyrandom.shuffle(data)\n", 119 | "n = len(data)\n", 120 | "print n\n", 121 | "training_set = data[n//10:]\n", 122 | "test_set = data[:n//10]" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 47, 128 | "id": "7262c00e", 129 | "metadata": { 130 | "collapsed": false 131 | }, 132 | "outputs": [ 133 | { 134 | "data": { 135 | "text/plain": [ 136 | "0.9798319327731092" 137 | ] 138 | }, 139 | "execution_count": 47, 140 | "metadata": {}, 141 | "output_type": "execute_result" 142 | } 143 | ], 144 | "source": [ 145 | "classifier = nltk.NaiveBayesClassifier.train(training_set)\n", 146 | "nltk.classify.accuracy(classifier,test_set)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 48, 152 | 
"id": "06641c58", 153 | "metadata": { 154 | "collapsed": false 155 | }, 156 | "outputs": [ 157 | { 158 | "data": { 159 | "text/plain": [ 160 | "True" 161 | ] 162 | }, 163 | "execution_count": 48, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "classifier.classify(features(\"The quick . brown\".split(),2))" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 59, 175 | "id": "bf296af7", 176 | "metadata": { 177 | "collapsed": false 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "def segment_sentences(words):\n", 182 | " sentences = [[words[0]]]\n", 183 | " for i in range(1,len(words)):\n", 184 | " sentences[-1].append(words[i])\n", 185 | " c = words[i] in [\".\",\"?\",\"!\"] and classifier.classify(features(words,i))\n", 186 | " if c: sentences.append([])\n", 187 | " if sentences[-1]==[]: sentences = sentences[:-1]\n", 188 | " return sentences\n", 189 | " " 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 61, 195 | "id": "40939b59", 196 | "metadata": { 197 | "collapsed": false 198 | }, 199 | "outputs": [ 200 | { 201 | "data": { 202 | "text/plain": [ 203 | "[['Smith', 'ran', '.'], ['J', '.', 'Smith', 'really', 'ran', '.']]" 204 | ] 205 | }, 206 | "execution_count": 61, 207 | "metadata": {}, 208 | "output_type": "execute_result" 209 | } 210 | ], 211 | "source": [ 212 | "segment_sentences(\"\"\"Smith ran . J . Smith really ran . \"\"\".split())" 213 | ] 214 | } 215 | ], 216 | "metadata": {}, 217 | "nbformat": 4, 218 | "nbformat_minor": 5 219 | } 220 | -------------------------------------------------------------------------------- /nlpa-corpora.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "1ce9fe94", 6 | "metadata": {}, 7 | "source": [ 8 | "# Properties of Corpora" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "90005a64", 15 | "metadata": { 16 | "collapsed": false 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "from nltk.corpus import brown" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "2980d0d7", 26 | "metadata": {}, 27 | "source": [ 28 | "## Corpora are Collections of Files" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 17, 34 | "id": "47104c75", 35 | "metadata": { 36 | "collapsed": false 37 | }, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/plain": [ 42 | "FileSystemPathPointer('/home/tmb/nltk_data/corpora/brown')" 43 | ] 44 | }, 45 | "execution_count": 17, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "brown.root" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 18, 57 | "id": "9886db10", 58 | "metadata": { 59 | "collapsed": false 60 | }, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/plain": [ 65 | "'BROWN CORPUS\\n\\nA Standard Corpus of Present-Day Edited American\\nEnglish, for use with Digital Computers.\\n\\nby W. N. Francis and H. 
Kucera (1964)\\nDepartment of Linguistics, Brown University\\nProvidence, Rhode Island, USA\\n\\nRevised 1971, Revised and Amplified 1979\\n\\nhttp://www.hit.uib.no/icame/brown/bcm.html\\n\\nDistributed with the permission of the copyright holder,\\nredistribution permitted.\\n'" 66 | ] 67 | }, 68 | "execution_count": 18, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "brown.readme()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 15, 80 | "id": "1d690b25", 81 | "metadata": { 82 | "collapsed": false 83 | }, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/plain": [ 88 | "['ca01',\n", 89 | " 'ca02',\n", 90 | " 'ca03',\n", 91 | " 'ca04',\n", 92 | " 'ca05',\n", 93 | " 'ca06',\n", 94 | " 'ca07',\n", 95 | " 'ca08',\n", 96 | " 'ca09',\n", 97 | " 'ca10']" 98 | ] 99 | }, 100 | "execution_count": 15, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "brown.fileids()[:10]" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "id": "2e7d5c4b", 112 | "metadata": {}, 113 | "source": [ 114 | "Files may have different encodings; the default is ASCII processed as `str`." 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 16, 120 | "id": "d2efd5df", 121 | "metadata": { 122 | "collapsed": false 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "brown.encoding(\"ca01\")" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "id": "f9b70616", 132 | "metadata": {}, 133 | "source": [ 134 | "Files may also be in different categories." 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 19, 140 | "id": "39af3549", 141 | "metadata": { 142 | "collapsed": false 143 | }, 144 | "outputs": [ 145 | { 146 | "data": { 147 | "text/plain": [ 148 | "['adventure',\n", 149 | " 'belles_lettres',\n", 150 | " 'editorial',\n", 151 | " 'fiction',\n", 152 | " 'government',\n", 153 | " 'hobbies',\n", 154 | " 'humor',\n", 155 | " 'learned',\n", 156 | " 'lore',\n", 157 | " 'mystery',\n", 158 | " 'news',\n", 159 | " 'religion',\n", 160 | " 'reviews',\n", 161 | " 'romance',\n", 162 | " 'science_fiction']" 163 | ] 164 | }, 165 | "execution_count": 19, 166 | "metadata": {}, 167 | "output_type": "execute_result" 168 | } 169 | ], 170 | "source": [ 171 | "brown.categories()" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "id": "0ce78685", 177 | "metadata": {}, 178 | "source": [ 179 | "## Accessing Content" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "id": "9fe8b232", 185 | "metadata": {}, 186 | "source": [ 187 | "The corpus abstraction allows you to avoid having to deal with individual files, encodings, etc.\n", 188 | "\n", 189 | "That is, you can access all the words, all the text, all the sentences etc. 
in a corpus from a single object.\n" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 54, 195 | "id": "caddbbd4", 196 | "metadata": { 197 | "collapsed": false 198 | }, 199 | "outputs": [ 200 | { 201 | "data": { 202 | "text/plain": [ 203 | "'\\n\\n\\tThe/at Fulton/np-tl County/nn-tl Grand/jj-tl Jury/nn-tl said/vbd Friday/nr an/at investigation/nn'" 204 | ] 205 | }, 206 | "execution_count": 54, 207 | "metadata": {}, 208 | "output_type": "execute_result" 209 | } 210 | ], 211 | "source": [ 212 | "brown.raw()[:100]" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 2, 218 | "id": "f281d7f8", 219 | "metadata": { 220 | "collapsed": false 221 | }, 222 | "outputs": [ 223 | { 224 | "data": { 225 | "text/plain": [ 226 | "['The',\n", 227 | " 'Fulton',\n", 228 | " 'County',\n", 229 | " 'Grand',\n", 230 | " 'Jury',\n", 231 | " 'said',\n", 232 | " 'Friday',\n", 233 | " 'an',\n", 234 | " 'investigation',\n", 235 | " 'of']" 236 | ] 237 | }, 238 | "execution_count": 2, 239 | "metadata": {}, 240 | "output_type": "execute_result" 241 | } 242 | ], 243 | "source": [ 244 | "brown.words()[:10]" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 5, 250 | "id": "ed836db2", 251 | "metadata": { 252 | "collapsed": false 253 | }, 254 | "outputs": [ 255 | { 256 | "name": "stdout", 257 | "output_type": "stream", 258 | "text": [ 259 | "['The', 'Fulton', 'County', 'Grand', 'Jury']\n", 260 | "['The', 'jury', 'further', 'said', 'in']\n", 261 | "['The', 'September-October', 'term', 'jury', 'had']\n", 262 | "['``', 'Only', 'a', 'relative', 'handful']\n", 263 | "['The', 'jury', 'said', 'it', 'did']\n", 264 | "['It', 'recommended', 'that', 'Fulton', 'legislators']\n", 265 | "['The', 'grand', 'jury', 'commented', 'on']\n", 266 | "['Merger', 'proposed']\n", 267 | "['However', ',', 'the', 'jury', 'said']\n", 268 | "['The', 'City', 'Purchasing', 'Department', ',']\n" 269 | ] 270 | } 271 | ], 272 | "source": [ 273 | "for s in brown.sents()[:10]: print s[:5]" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 6, 279 | "id": "0ac68d8b", 280 | "metadata": { 281 | "collapsed": false 282 | }, 283 | "outputs": [ 284 | { 285 | "data": { 286 | "text/plain": [ 287 | "[('The', 'AT'),\n", 288 | " ('Fulton', 'NP-TL'),\n", 289 | " ('County', 'NN-TL'),\n", 290 | " ('Grand', 'JJ-TL'),\n", 291 | " ('Jury', 'NN-TL'),\n", 292 | " ('said', 'VBD'),\n", 293 | " ('Friday', 'NR'),\n", 294 | " ('an', 'AT'),\n", 295 | " ('investigation', 'NN'),\n", 296 | " ('of', 'IN')]" 297 | ] 298 | }, 299 | "execution_count": 6, 300 | "metadata": {}, 301 | "output_type": "execute_result" 302 | } 303 | ], 304 | "source": [ 305 | "brown.tagged_words()[:10]" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 8, 311 | "id": "3f909b3b", 312 | "metadata": { 313 | "collapsed": false 314 | }, 315 | "outputs": [ 316 | { 317 | "data": { 318 | "text/plain": [ 319 | "[('The', 'AT'),\n", 320 | " ('Fulton', 'NP-TL'),\n", 321 | " ('County', 'NN-TL'),\n", 322 | " ('Grand', 'JJ-TL'),\n", 323 | " ('Jury', 'NN-TL'),\n", 324 | " ('said', 'VBD'),\n", 325 | " ('Friday', 'NR'),\n", 326 | " ('an', 'AT'),\n", 327 | " ('investigation', 'NN'),\n", 328 | " ('of', 'IN')]" 329 | ] 330 | }, 331 | "execution_count": 8, 332 | "metadata": {}, 333 | "output_type": "execute_result" 334 | } 335 | ], 336 | "source": [ 337 | "brown.tagged_sents()[0][:10]" 338 | ] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "id": "edac51c0", 343 | "metadata": {}, 344 | "source": [ 345 | 
"# Reading New Corpora" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 20, 351 | "id": "79f45f8a", 352 | "metadata": { 353 | "collapsed": false 354 | }, 355 | "outputs": [], 356 | "source": [ 357 | "import nltk.corpus.reader" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 30, 363 | "id": "88ef0e47", 364 | "metadata": { 365 | "collapsed": false 366 | }, 367 | "outputs": [], 368 | "source": [ 369 | "corpus = nltk.corpus.reader.plaintext.PlaintextCorpusReader(\".\",r\"[ft].*txt\",encoding=\"utf8\")" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 31, 375 | "id": "045a15a0", 376 | "metadata": { 377 | "collapsed": false 378 | }, 379 | "outputs": [ 380 | { 381 | "data": { 382 | "text/plain": [ 383 | "['faust.txt', 'tomsawyer.txt']" 384 | ] 385 | }, 386 | "execution_count": 31, 387 | "metadata": {}, 388 | "output_type": "execute_result" 389 | } 390 | ], 391 | "source": [ 392 | "corpus.fileids()" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 32, 398 | "id": "d25b1b3d", 399 | "metadata": { 400 | "collapsed": false 401 | }, 402 | "outputs": [ 403 | { 404 | "data": { 405 | "text/plain": [ 406 | "u'Faust: Der Trag\\xf6die erster Teil\\n\\nJohann Wolfgang von Goethe\\n\\n\\nZueignung.\\n\\nIhr naht euch wieder, schw'" 407 | ] 408 | }, 409 | "execution_count": 32, 410 | "metadata": {}, 411 | "output_type": "execute_result" 412 | } 413 | ], 414 | "source": [ 415 | "corpus.raw()[:100]" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 33, 421 | "id": "b69ac46c", 422 | "metadata": { 423 | "collapsed": false 424 | }, 425 | "outputs": [ 426 | { 427 | "data": { 428 | "text/plain": [ 429 | "[[[u'Faust', u':', u'Der', u'Trag\\xf6die', u'erster', u'Teil']],\n", 430 | " [[u'Johann', u'Wolfgang', u'von', u'Goethe']]]" 431 | ] 432 | }, 433 | "execution_count": 33, 434 | "metadata": {}, 435 | "output_type": "execute_result" 436 | } 437 | ], 438 | "source": [ 439 | "corpus.paras()[:2]" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 39, 445 | "id": "9331354f", 446 | "metadata": { 447 | "collapsed": false 448 | }, 449 | "outputs": [ 450 | { 451 | "name": "stdout", 452 | "output_type": "stream", 453 | "text": [ 454 | "[u'FAUST', u':', u'Vor', u'jenem', u'droben', u'steht', u'geb\\xfcckt', u',', u'Der', u'helfen', u'lehrt', u'und', u'H\\xfclfe', u'schickt', u'.']\n" 455 | ] 456 | } 457 | ], 458 | "source": [ 459 | "print corpus.sents()[500]" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": 40, 465 | "id": "68818aff", 466 | "metadata": { 467 | "collapsed": false 468 | }, 469 | "outputs": [ 470 | { 471 | "name": "stdout", 472 | "output_type": "stream", 473 | "text": [ 474 | "[u'heute', u'!', u'DICHTER', u':', u'O', u'sprich', u'mir', u'nicht', u'von', u'jener']\n" 475 | ] 476 | } 477 | ], 478 | "source": [ 479 | "print corpus.words()[500:510]" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 44, 485 | "id": "23376f31", 486 | "metadata": { 487 | "collapsed": false 488 | }, 489 | "outputs": [], 490 | "source": [ 491 | "from nltk import Text\n", 492 | "text = Text(corpus.words(\"tomsawyer.txt\"))" 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": 47, 498 | "id": "1b2dcb00", 499 | "metadata": { 500 | "collapsed": false 501 | }, 502 | "outputs": [ 503 | { 504 | "name": "stdout", 505 | "output_type": "stream", 506 | "text": [ 507 | "Building index...\n", 508 | "Displaying 
25 of 647 matches:\n", 509 | "\" TOM !\" No answer . \" What ' s gone with that boy , I wonder ? You TOM !\" No \n", 510 | "ding down and punching under the bed with the broom , and so she needed breath\n", 511 | "eded breath to punctuate the punches with . She resurrected nothing but the ca\n", 512 | " - brother ) Sid was already through with his part of the work ( picking up ch\n", 513 | "et vanity to believe she was endowed with a talent for dark and mysterious dip\n", 514 | " sewed . \" Bother ! Well , go ' long with you . I ' d made sure you ' d played\n", 515 | " didn ' t think you sewed his collar with white thread , but it ' s black .\" \"\n", 516 | "it ' s black .\" \" Why , I did sew it with white ! Tom !\" But Tom did not wait \n", 517 | " Confound it ! sometimes she sews it with white , and sometimes she sews it wi\n", 518 | "th white , and sometimes she sews it with black . I wish to geeminy she ' d st\n", 519 | "f it , and he strode down the street with his mouth full of harmony __________\n", 520 | "ure is concerned , the advantage was with the boy , not the astronomer . The s\n", 521 | "art , don ' t you ? I could lick you with one hand tied behind me , if I wante\n", 522 | "do it .\" \" Well I will , if you fool with me .\" \" Oh yes -- I ' ve seen whole \n", 523 | "n ' t either .\" So they stood , each with a foot placed at an angle as a brace\n", 524 | " angle as a brace , and both shoving with might and main , and glowering at ea\n", 525 | "d main , and glowering at each other with hate . But neither could get an adva\n", 526 | "nd flushed , each relaxed his strain with watchful caution , and Tom said : \" \n", 527 | "other on you , and he can thrash you with his little finger , and I ' ll make \n", 528 | "it so .\" Tom drew a line in the dust with his big toe , and said : \" I dare yo\n", 529 | " out of his pocket and held them out with derision . Tom struck them to the gr\n", 530 | "er ' s nose , and covered themselves with dust and glory . Presently the confu\n", 531 | "tride the new boy , and pounding him with his fists . 
\" Holler ' nuff !\" said \n", 532 | "Better look out who you ' re fooling with next time .\" The new boy went off br\n", 533 | "ht him out .\" To which Tom responded with jeers , and started off in high feat\n" 534 | ] 535 | } 536 | ], 537 | "source": [ 538 | "text.concordance(\"with\")" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": 48, 544 | "id": "487f92bf", 545 | "metadata": { 546 | "collapsed": false 547 | }, 548 | "outputs": [ 549 | { 550 | "name": "stdout", 551 | "output_type": "stream", 552 | "text": [ 553 | "Building word-context index...\n", 554 | "and in on to for of was at into up s that through but if just upon\n", 555 | "what as by\n" 556 | ] 557 | } 558 | ], 559 | "source": [ 560 | "text.similar(\"with\")" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 50, 566 | "id": "fdaee38c", 567 | "metadata": { 568 | "collapsed": false 569 | }, 570 | "outputs": [ 571 | { 572 | "name": "stdout", 573 | "output_type": "stream", 574 | "text": [ 575 | "but_the is_a long_you up_a\n" 576 | ] 577 | } 578 | ], 579 | "source": [ 580 | "text.common_contexts([\"with\",\"as\"])" 581 | ] 582 | } 583 | ], 584 | "metadata": {}, 585 | "nbformat": 4, 586 | "nbformat_minor": 5 587 | } 588 | -------------------------------------------------------------------------------- /nlpa-course.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 8, 6 | "id": "b4913d2b", 7 | "metadata": { 8 | "collapsed": false 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "from IPython.core.display import Image,HTML\n", 13 | "from IPython.lib.display import YouTubeVideo" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "id": "928e08dc", 19 | "metadata": {}, 20 | "source": [ 21 | "# Introduction to Natural Language Processing and Applications" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "id": "724cc955", 27 | "metadata": {}, 28 | "source": [ 29 | "Goals of the course:\n", 30 | "\n", 31 | "- understanding of major concepts and algorithms in text and natural language processing\n", 32 | "- acquire the skills to work with research code in NLP, and make contributions to it\n", 33 | "\n", 34 | "*It is not just sufficient to memorize some terms and algorithms, you must be able to apply them.*" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "id": "6a673f8e", 40 | "metadata": {}, 41 | "source": [ 42 | "Course Content:\n", 43 | "\n", 44 | "- text processing and encoding\n", 45 | "- string algorithms, edit distance\n", 46 | "- statistical language models\n", 47 | "- spell correction\n", 48 | "- n-gram models\n", 49 | "- word sense disambiguation\n", 50 | "- Markov models, parts-of-speech tagging\n", 51 | "- probabilistic grammars and parsing\n", 52 | "- text alignment, clustering, text categorization\n", 53 | "- statistical machine translation\n", 54 | "- applications in speech recognition, handwriting recognition, and OCR\n", 55 | "- language acquisition\n", 56 | "- machine learning for NLP\n", 57 | "- cognitive and psychological aspects of NLP" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 24, 63 | "id": "e3f6696d", 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [ 68 | { 69 | "data": { 70 | "text/html": [ 71 | "\n", 72 | " \n", 79 | " " 80 | ], 81 | "text/plain": [ 82 | "" 83 | ] 84 | }, 85 | "execution_count": 24, 86 | "metadata": {}, 87 | "output_type": "execute_result" 88 | } 89 | ], 90 | 
"source": [ 91 | "YouTubeVideo(\"https://www.youtube.com/watch?v=PHzoX2AIzqo\")" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "id": "f79f105c", 97 | "metadata": {}, 98 | "source": [ 99 | "## IUPR Home Page" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 20, 105 | "id": "d829365f", 106 | "metadata": { 107 | "collapsed": false 108 | }, 109 | "outputs": [ 110 | { 111 | "data": { 112 | "text/html": [ 113 | "\n", 35 | " " 36 | ], 37 | "text/plain": [ 38 | "" 39 | ] 40 | }, 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "output_type": "execute_result" 44 | } 45 | ], 46 | "source": [ 47 | "YouTubeVideo(\"PHzoX2AIzqo\")" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "id": "f1027548", 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/html": [ 61 | "\n", 62 | " \n", 69 | " " 70 | ], 71 | "text/plain": [ 72 | "" 73 | ] 74 | }, 75 | "execution_count": 3, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | } 79 | ], 80 | "source": [ 81 | "YouTubeVideo(\"RAJIDH5d4C4\")" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 4, 87 | "id": "b5790420", 88 | "metadata": { 89 | "collapsed": false 90 | }, 91 | "outputs": [ 92 | { 93 | "data": { 94 | "text/html": [ 95 | "\n", 96 | " \n", 103 | " " 104 | ], 105 | "text/plain": [ 106 | "" 107 | ] 108 | }, 109 | "execution_count": 4, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "YouTubeVideo(\"WuP6AQPRpUg\")" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 5, 121 | "id": "25330a69", 122 | "metadata": { 123 | "collapsed": false 124 | }, 125 | "outputs": [ 126 | { 127 | "data": { 128 | "text/html": [ 129 | "\n", 130 | " \n", 137 | " " 138 | ], 139 | "text/plain": [ 140 | "" 141 | ] 142 | }, 143 | "execution_count": 5, 144 | "metadata": {}, 145 | "output_type": "execute_result" 146 | } 147 | ], 148 | "source": [ 149 | "YouTubeVideo(\"s8WFctIbt84\")" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 6, 155 | "id": "72d55ba1", 156 | "metadata": { 157 | "collapsed": false 158 | }, 159 | "outputs": [ 160 | { 161 | "data": { 162 | "text/html": [ 163 | "\n", 164 | " \n", 171 | " " 172 | ], 173 | "text/plain": [ 174 | "" 175 | ] 176 | }, 177 | "execution_count": 6, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "YouTubeVideo(\"Qy6zhx9gndI\")" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 7, 189 | "id": "37aaf341", 190 | "metadata": { 191 | "collapsed": false 192 | }, 193 | "outputs": [ 194 | { 195 | "data": { 196 | "text/html": [ 197 | "\n", 198 | " \n", 205 | " " 206 | ], 207 | "text/plain": [ 208 | "" 209 | ] 210 | }, 211 | "execution_count": 7, 212 | "metadata": {}, 213 | "output_type": "execute_result" 214 | } 215 | ], 216 | "source": [ 217 | "YouTubeVideo(\"c9jk3P0GqLU\")" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 8, 223 | "id": "430610d9", 224 | "metadata": { 225 | "collapsed": false 226 | }, 227 | "outputs": [ 228 | { 229 | "data": { 230 | "text/html": [ 231 | "\n", 232 | " \n", 239 | " " 240 | ], 241 | "text/plain": [ 242 | "" 243 | ] 244 | }, 245 | "execution_count": 8, 246 | "metadata": {}, 247 | "output_type": "execute_result" 248 | } 249 | ], 250 | "source": [ 251 | "YouTubeVideo(\"YKjo9dldp2g\")" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 
9, 257 | "id": "ba43e62f", 258 | "metadata": { 259 | "collapsed": false 260 | }, 261 | "outputs": [ 262 | { 263 | "data": { 264 | "text/html": [ 265 | "\n", 266 | " \n", 273 | " " 274 | ], 275 | "text/plain": [ 276 | "" 277 | ] 278 | }, 279 | "execution_count": 9, 280 | "metadata": {}, 281 | "output_type": "execute_result" 282 | } 283 | ], 284 | "source": [ 285 | "YouTubeVideo(\"nXgboDb9ucE\")" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 10, 291 | "id": "89f6f7c6", 292 | "metadata": { 293 | "collapsed": false 294 | }, 295 | "outputs": [ 296 | { 297 | "data": { 298 | "text/html": [ 299 | "\n", 300 | " \n", 307 | " " 308 | ], 309 | "text/plain": [ 310 | "" 311 | ] 312 | }, 313 | "execution_count": 10, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "YouTubeVideo(\"BOUTfUmI8vs\")" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "id": "1cca4c44", 326 | "metadata": { 327 | "collapsed": false 328 | }, 329 | "outputs": [], 330 | "source": [] 331 | } 332 | ], 333 | "metadata": {}, 334 | "nbformat": 4, 335 | "nbformat_minor": 5 336 | } 337 | -------------------------------------------------------------------------------- /nlpa-intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "5a7b3a4e", 7 | "metadata": { 8 | "collapsed": false 9 | }, 10 | "outputs": [], 11 | "source": [] 12 | } 13 | ], 14 | "metadata": {}, 15 | "nbformat": 4, 16 | "nbformat_minor": 5 17 | } 18 | -------------------------------------------------------------------------------- /nlpa-memm.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 52, 6 | "id": "e37e73b1", 7 | "metadata": { 8 | "collapsed": false 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "from pylab import *\n", 13 | "import bisect\n", 14 | "def unit(i,n):\n", 15 | " result = zeros(n)\n", 16 | " result[i] = 1.0\n", 17 | " return result" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "id": "dbf90496", 23 | "metadata": {}, 24 | "source": [ 25 | "# Generating a Dataset" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 10, 31 | "id": "24d5a838", 32 | "metadata": { 33 | "collapsed": false 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "M = roll(diag(ones(4)) + 0.1*rand(4,4),-1,1)\n", 38 | "M /= sum(M,axis=0)[newaxis,:]" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 55, 44 | "id": "0405b0ea", 45 | "metadata": { 46 | "collapsed": false 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "def generate(M,s=0,n=10):\n", 51 | " result = []\n", 52 | " for i in range(n):\n", 53 | " result.append(s)\n", 54 | " x = unit(s,len(M))\n", 55 | " x = dot(M,x)\n", 56 | " s = bisect.bisect_left(add.accumulate(x),rand())\n", 57 | " return result" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 63, 63 | "id": "7e0a5123", 64 | "metadata": { 65 | "collapsed": false 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "def glabel(s):\n", 70 | " result = [0]\n", 71 | " for i in range(1,len(s)):\n", 72 | " if s[i]==s[i-1]+1 and result[-1]:\n", 73 | " result.append(s[i])\n", 74 | " else:\n", 75 | " result.append(0)\n", 76 | " return result" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 78, 82 | "id": "851dbbf2", 83 | 
"metadata": { 84 | "collapsed": false 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "xs = generate(M,0,10000)\n", 89 | "ys = glabel(xs)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 110, 95 | "id": "92d0b0a5", 96 | "metadata": { 97 | "collapsed": false 98 | }, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "[0, 1, 2, 3, 1, 2, 3, 0, 1, 2, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]\n", 105 | "[0, 1, 2, 3, 0, 2, 3, 0, 1, 2, 0, 3, 0, 1, 2, 3, 0, 1, 2, 3]\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "print xs[:20]\n", 111 | "print ys[:20]" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "id": "8e8504d4", 117 | "metadata": {}, 118 | "source": [ 119 | "# Learning the Transitions" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 87, 125 | "id": "3064f77d", 126 | "metadata": { 127 | "collapsed": false 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "cs = []\n", 132 | "ps = []\n", 133 | "for i in range(1,len(xs)):\n", 134 | " c = concatenate([unit(ys[i-1],4),unit(xs[i],4)])\n", 135 | " p = unit(xs[i],4)\n", 136 | " cs.append(c)\n", 137 | " ps.append(p)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 88, 143 | "id": "be0e8b63", 144 | "metadata": { 145 | "collapsed": false 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "from sklearn.linear_model import LogisticRegression" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 89, 155 | "id": "839b6d16", 156 | "metadata": { 157 | "collapsed": false 158 | }, 159 | "outputs": [ 160 | { 161 | "data": { 162 | "text/plain": [ 163 | "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", 164 | " intercept_scaling=1, penalty='l2', tol=0.0001)" 165 | ] 166 | }, 167 | "execution_count": 89, 168 | "metadata": {}, 169 | "output_type": "execute_result" 170 | } 171 | ], 172 | "source": [ 173 | "lr = LogisticRegression()\n", 174 | "lr.fit(cs,ys[1:])" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 103, 180 | "id": "649ffa79", 181 | "metadata": { 182 | "collapsed": false 183 | }, 184 | "outputs": [ 185 | { 186 | "data": { 187 | "text/plain": [ 188 | "442" 189 | ] 190 | }, 191 | "execution_count": 103, 192 | "metadata": {}, 193 | "output_type": "execute_result" 194 | } 195 | ], 196 | "source": [ 197 | "sum(array(lr.predict(cs),'i')!=array(ys[1:],'i'))" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "id": "4b5aee0f", 203 | "metadata": {}, 204 | "source": [ 205 | "# Forward Algorithm" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 105, 211 | "id": "c9318bc6", 212 | "metadata": { 213 | "collapsed": false 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "pys = [0]\n", 218 | "for i in range(1,len(xs)):\n", 219 | " c = concatenate([unit(pys[i-1],4),unit(xs[i],4)])\n", 220 | " y = lr.predict([c])\n", 221 | " pys.append(y)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 108, 227 | "id": "9cf12afb", 228 | "metadata": { 229 | "collapsed": false 230 | }, 231 | "outputs": [ 232 | { 233 | "data": { 234 | "text/plain": [ 235 | "382" 236 | ] 237 | }, 238 | "execution_count": 108, 239 | "metadata": {}, 240 | "output_type": "execute_result" 241 | } 242 | ], 243 | "source": [ 244 | "sum(array(ys)!=array(pys))" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 91, 250 | "id": "297824fb", 251 | "metadata": { 252 | "collapsed": false 253 | }, 
254 | "outputs": [ 255 | { 256 | "data": { 257 | "text/plain": [ 258 | "array([[-11, -18, -10, 48, 49, -15, -8, -17],\n", 259 | " [ 36, -31, -28, -30, -31, 47, -34, -34],\n", 260 | " [ -2, 41, -44, -37, -28, -35, 51, -31],\n", 261 | " [ -2, -40, 40, -40, -21, -34, -37, 51]], dtype=int32)" 262 | ] 263 | }, 264 | "execution_count": 91, 265 | "metadata": {}, 266 | "output_type": "execute_result" 267 | } 268 | ], 269 | "source": [ 270 | "array(lr.coef_*10,'i')" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 111, 276 | "id": "3a1ce39f", 277 | "metadata": { 278 | "collapsed": false 279 | }, 280 | "outputs": [], 281 | "source": [ 282 | "lprobs = zeros((100,4))" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 112, 288 | "id": "4e75a7eb", 289 | "metadata": { 290 | "collapsed": false 291 | }, 292 | "outputs": [], 293 | "source": [ 294 | "lprobs[0,:] = log(1.0/4)" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "id": "01a98b08", 301 | "metadata": { 302 | "collapsed": false 303 | }, 304 | "outputs": [], 305 | "source": [] 306 | } 307 | ], 308 | "metadata": {}, 309 | "nbformat": 4, 310 | "nbformat_minor": 5 311 | } 312 | -------------------------------------------------------------------------------- /nlpa-nltk-automated-tagging.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 51, 6 | "id": "54edbe68", 7 | "metadata": { 8 | "collapsed": false 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import nltk\n", 13 | "import urllib2\n", 14 | "import re" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "id": "4e25d1be", 20 | "metadata": {}, 21 | "source": [ 22 | "# Automatic Tagging with NLTK" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "id": "5196f354", 28 | "metadata": {}, 29 | "source": [ 30 | "Although the above results are neat, they aren't all that useful in practice\n", 31 | "because most texts we want to visualize in such ways aren't tagged, and tagging\n", 32 | "them by hand ist costly.\n", 33 | "\n", 34 | "What we need is an *automated tagger*." 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "id": "6770f788", 40 | "metadata": {}, 41 | "source": [ 42 | "Let's take a page off Wikipedia and tag it automatically." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 52, 48 | "id": "dd9fe4bf", 49 | "metadata": { 50 | "collapsed": false 51 | }, 52 | "outputs": [ 53 | { 54 | "data": { 55 | "text/plain": [ 56 | "u'\\n\\n\\nGeorge Washington - Wikipedia, the free encyclopedia\\n\\n\\n\\n\\n\\n3.0 3 | 4 | # 5 | 6 | from pylab import * 7 | import openfst 8 | from openfst import StdVectorFst as FST 9 | from openfst import LogVectorFst as LFST 10 | from fstutils import * 11 | 12 | # 13 | 14 | # Simple Edit Distance 15 | 16 | # 17 | 18 | # It's easy in principle to compute edit distance with finite state transducers. 19 | # We construct a transducer that takes takes each symbol in the alphabet to itself with cost 0, 20 | # and takes each symbol to a different symbol, or to/from epsilon with cost 1. 21 | # This transducer is called a *flower transducer* because of its appearance. 22 | # 23 | # We then compose an FST corresponding to the first string with this transducer, 24 | # compose the result with an FST corresponding to the second string, and compute 25 | # the cost of the shortest path. 
26 | 27 | # 28 | 29 | def make_flower(chars): 30 | epsilon = 0 31 | fst = FST() 32 | s = fst.AddState() 33 | fst.SetStart(s) 34 | fst.SetFinal(s,0.0) 35 | for c in chars: 36 | c = ord(c) 37 | fst.AddArc(s,c,c,0.0,s) 38 | fst.AddArc(s,c,epsilon,1.0,s) 39 | fst.AddArc(s,epsilon,c,1.0,s) 40 | for c2 in chars: 41 | c2 = ord(c2) 42 | fst.AddArc(s,c,c2,1.0,s) 43 | return fst 44 | 45 | # 46 | 47 | flower = make_flower("AB") 48 | show_fst(flower) 49 | 50 | # 51 | 52 | fst1 = FST() 53 | fst1.AddString("AABBAAA") 54 | fst2 = FST() 55 | fst2.AddString("AABBABAB") 56 | 57 | # 58 | 59 | temp1 = FST() 60 | openfst.ArcSortOutput(fst1) 61 | openfst.ArcSortInput(flower) 62 | openfst.Compose(fst1,flower,temp1) 63 | show_fst(temp1) 64 | 65 | # 66 | 67 | temp2 = FST() 68 | openfst.ArcSortOutput(temp1) 69 | openfst.ArcSortInput(fst2) 70 | openfst.Compose(temp1,fst2,temp2) 71 | show_fst(temp2) 72 | 73 | # 74 | 75 | result = FST() 76 | openfst.ShortestPath(temp2,result,1) 77 | show_fst(result) 78 | 79 | # 80 | 81 | print fstsize(temp1),fstsize(temp2) 82 | 83 | # 84 | 85 | # Factoring the Edit Distance Transducer 86 | 87 | # 88 | 89 | # The problem with the previous transducer is that it gets very large very quickly when 90 | # composed with the original string. In fact, the size ends up being quadratic. 91 | # 92 | # We can fix this by introducing some additional symbols. 93 | # (Here, we're just using ASCII symbols to represent insertion, deletion, and substitution, but we could 94 | # be using something fancier.) 95 | 96 | # 97 | 98 | epsilon = 0 99 | insertion = ord("#") 100 | deletion = ord("_") 101 | substitution = ord("~") 102 | 103 | def make_left(chars): 104 | fst = FST() 105 | s = fst.AddState() 106 | fst.SetStart(s) 107 | fst.SetFinal(s,0.0) 108 | fst.AddArc(s,epsilon,insertion,0.5,s) 109 | for c in chars: 110 | c = ord(c) 111 | fst.AddArc(s,c,c,0.0,s) 112 | fst.AddArc(s,c,substitution,0.5,s) 113 | fst.AddArc(s,c,deletion,0.5,s) 114 | return fst 115 | 116 | def make_right(chars): 117 | fst = FST() 118 | s = fst.AddState() 119 | fst.SetStart(s) 120 | fst.SetFinal(s,0.0) 121 | fst.AddArc(s,deletion,epsilon,0.5,s) 122 | for c in chars: 123 | c = ord(c) 124 | fst.AddArc(s,c,c,0.0,s) 125 | fst.AddArc(s,substitution,c,0.5,s) 126 | fst.AddArc(s,insertion,c,0.5,s) 127 | return fst 128 | 129 | # 130 | 131 | temp1 = FST() 132 | temp2 = FST() 133 | openfst.Compose(fst1,make_left("AB"),temp1) 134 | openfst.Compose(make_right("AB"),fst2,temp2) 135 | print fstsize(temp1),fstsize(temp2) 136 | 137 | # 138 | 139 | show_fst(temp1) 140 | 141 | # 142 | 143 | show_fst(temp2) 144 | 145 | # 146 | 147 | temp3 = FST() 148 | openfst.ArcSortOutput(temp1) 149 | openfst.ArcSortInput(temp2) 150 | openfst.Compose(temp1,temp2,temp3) 151 | result = FST() 152 | openfst.ShortestPath(temp3,result,1) 153 | print fstsize(result) 154 | show_fst(result) 155 | 156 | # 157 | 158 | # This becomes particularly important when using larger alphabets. Here is an illustration. 
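# 

# (An added aside before the illustration: the effect already shows up in the
# transducers themselves. The flower transducer needs on the order of
# |alphabet|**2 arcs, while each of the two factors only needs on the order of
# |alphabet| arcs, so the factored construction scales much better as the
# alphabet grows.)

abc = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
print fstsize(make_flower(abc)),fstsize(make_left(abc)),fstsize(make_right(abc))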
159 | 160 | # 161 | 162 | ascii = "".join([chr(c) for c in range(32,127) if c not in [ord("~"),ord("_"),ord("#")]]) 163 | 164 | # 165 | 166 | ascii_left = make_left(ascii) 167 | ascii_right = make_right(ascii) 168 | 169 | # 170 | 171 | def edit_distance(s1,s2): 172 | fst1 = FST() 173 | fst1.AddString(s1) 174 | fst2 = FST() 175 | fst2.AddString(s2) 176 | temp1 = FST() 177 | temp2 = FST() 178 | openfst.Compose(fst1,ascii_left,temp1) 179 | openfst.Compose(ascii_right,fst2,temp2) 180 | print fstsize(temp1),fstsize(temp2) 181 | temp3 = FST() 182 | openfst.ArcSortOutput(temp1) 183 | openfst.ArcSortInput(temp2) 184 | openfst.Compose(temp1,temp2,temp3) 185 | print fstsize(temp3) 186 | result = FST() 187 | openfst.ShortestPath(temp3,result,1) 188 | return result 189 | 190 | # 191 | 192 | show_fst(edit_distance("quick fox","quack fowl")) 193 | 194 | # 195 | 196 | # Limited Contiguous Insertions / Deletions 197 | 198 | # 199 | 200 | # A second way in which we can make edit distance computations more efficient 201 | # is to limit the number of consecutive deletions/insertions that can occur. 202 | # 203 | # (Think about what constraint this corresponds to for a "manual" computation of the edit distance.) 204 | 205 | # 206 | 207 | epsilon = 0 208 | def make_edit1(chars): 209 | fst = FST() 210 | s = fst.AddState() 211 | s2 = fst.AddState() 212 | fst.SetStart(s) 213 | fst.SetFinal(s,0.0) 214 | fst.SetFinal(s2,0.0) 215 | for c in chars: 216 | c = ord(c) 217 | fst.AddArc(s,c,c,0.0,s) 218 | fst.AddArc(s,c,epsilon,1.0,s2) 219 | fst.AddArc(s,epsilon,c,1.0,s2) 220 | fst.AddArc(s2,c,c,0.0,s) 221 | for c2 in chars: 222 | c2 = ord(c2) 223 | fst.AddArc(s,c,c2,1.0,s) 224 | fst.AddArc(s2,c,c2,1.0,s) 225 | return fst 226 | 227 | # 228 | 229 | temp1 = FST() 230 | openfst.ArcSortOutput(fst1) 231 | efst = make_edit1("AB") 232 | openfst.ArcSortInput(efst) 233 | openfst.Compose(fst1,efst,temp1) 234 | show_fst(temp1) 235 | temp2 = FST() 236 | openfst.ArcSortOutput(temp1) 237 | openfst.ArcSortInput(fst2) 238 | openfst.Compose(temp1,fst2,temp2) 239 | show_fst(temp2) 240 | print fstsize(temp2) 241 | 242 | # 243 | 244 | result = FST() 245 | openfst.ShortestPath(temp2,result,1) 246 | show_fst(result) 247 | 248 | # 249 | 250 | temp1 = FST() 251 | openfst.ArcSortOutput(fst1) 252 | efst = make_flower("AB") 253 | openfst.ArcSortInput(efst) 254 | openfst.Compose(fst1,efst,temp1) 255 | show_fst(temp1) 256 | temp2 = FST() 257 | openfst.ArcSortOutput(temp1) 258 | openfst.ArcSortInput(fst2) 259 | openfst.Compose(temp1,fst2,temp2) 260 | show_fst(temp2) 261 | print fstsize(temp2) 262 | 263 | # 264 | 265 | result = FST() 266 | openfst.ShortestPath(temp2,result,1) 267 | show_fst(result) 268 | 269 | # 270 | 271 | # Oracle Edit Distance 272 | 273 | # 274 | 275 | # The regular edit distance is limited to computing the best match between two strings. 276 | # However, with finite state transducers, we can compute the best match between two 277 | # sets of strings. 
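# 

# (An added aside: conceptually, this oracle distance is just the minimum
# pairwise edit distance over the two sets. A brute-force sketch, using the
# dp_edit_distance cross-check from above and a tiny made-up word list standing
# in for the real dictionary; the composition below computes the same minimum
# without enumerating all the pairs.)

hypotheses = ["qulck","qwck","quidc"]   # the same recognition outputs as below
tiny_dict = ["quick","quack","wick"]    # hypothetical stand-in for basic-english.txt
print min([(dp_edit_distance(a,b),a,b) for a in hypotheses for b in tiny_dict])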
278 | 279 | # 280 | 281 | # recognition output 282 | fst1 = FST() 283 | fst1.AddString("qulck") 284 | fst1.AddString("qwck") 285 | fst1.AddString("quidc") 286 | fst1 = minimize(fst1) 287 | show_fst(fst1) 288 | 289 | # 290 | 291 | # English dictionary 292 | fst2 = FST() 293 | with open("basic-english.txt") as stream: 294 | for line in stream.readlines(): 295 | line = line.strip() 296 | fst2.AddString(line) 297 | print fstsize(fst2) 298 | fst2 = minimize(fst2) 299 | print fstsize(fst2) 300 | 301 | # 302 | 303 | temp2 = FST() 304 | openfst.ArcSortOutput(ascii_right) 305 | openfst.ArcSortInput(fst2) 306 | openfst.Compose(ascii_right,fst2,temp2) 307 | print fstsize(temp2) 308 | 309 | # 310 | 311 | temp2 = minimize(temp2) 312 | 313 | # 314 | 315 | temp1 = FST() 316 | openfst.Compose(fst1,ascii_left,temp1) 317 | print fstsize(temp1),fstsize(temp2) 318 | temp3 = FST() 319 | openfst.ArcSortOutput(temp1) 320 | openfst.ArcSortInput(temp2) 321 | openfst.Compose(temp1,temp2,temp3) 322 | print fstsize(temp3) 323 | result = FST() 324 | openfst.ShortestPath(temp3,result,1) 325 | show_fst(result) 326 | 327 | # 328 | 329 | 330 | -------------------------------------------------------------------------------- /nlpa-unix-cleanup.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "3468f54a", 6 | "metadata": {}, 7 | "source": [ 8 | "# Making the Brown Corpus Readable" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "3906c193", 14 | "metadata": {}, 15 | "source": [ 16 | "Here's a simple example of developing a command-line pipeline that removes the tags from the Brown corpus files." 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "id": "4b86f5be", 23 | "metadata": { 24 | "collapsed": false 25 | }, 26 | "outputs": [ 27 | { 28 | "name": "stdout", 29 | "output_type": "stream", 30 | "text": [ 31 | "CONTENTS\r\n", 32 | "README\r\n", 33 | "ca01\r\n", 34 | "ca02\r\n", 35 | "ca03\r\n", 36 | "ca04\r\n", 37 | "ca05\r\n", 38 | "ca06\r\n", 39 | "ca07\r\n", 40 | "ca08\r\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "!ls brown/. | head" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "id": "ce619cd5", 51 | "metadata": {}, 52 | "source": [ 53 | "We should probably look at the README for the definition of the tagging format, but let's just figure this out." 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 2, 59 | "id": "88e3f236", 60 | "metadata": { 61 | "collapsed": false 62 | }, 63 | "outputs": [ 64 | { 65 | "name": "stdout", 66 | "output_type": "stream", 67 | "text": [ 68 | "BROWN CORPUS\r\n", 69 | "\r\n", 70 | "A Standard Corpus of Present-Day Edited American\r\n", 71 | "English, for use with Digital Computers.\r\n", 72 | "\r\n", 73 | "by W. N. Francis and H. Kucera (1964)\r\n", 74 | "Department of Linguistics, Brown University\r\n", 75 | "Providence, Rhode Island, USA\r\n", 76 | "\r\n", 77 | "Revised 1971, Revised and Amplified 1979\r\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "!head brown/README" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "id": "7a58cf6a", 88 | "metadata": {}, 89 | "source": [ 90 | "Here are the first 10 lines of the file `brown/ca07`."
91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 3, 96 | "id": "c9f39ff7", 97 | "metadata": { 98 | "collapsed": false 99 | }, 100 | "outputs": [ 101 | { 102 | "name": "stdout", 103 | "output_type": "stream", 104 | "text": [ 105 | "\r\n", 106 | "\r\n", 107 | "\tResentment/nn welled/vbd up/rp yesterday/nr among/in Democratic/jj-tl district/nn leaders/nns and/cc some/dti county/nn leaders/nns at/in reports/nns that/cs Mayor/nn-tl Wagner/np had/hvd decided/vbn to/to seek/vb a/at third/od term/nn with/in Paul/np R./np Screvane/np and/cc Abraham/np D./np Beame/np as/cs running/vbg mates/nns ./.\r\n", 108 | "\r\n", 109 | "\r\n", 110 | "\tAt/in the/at same/ap time/nn reaction/nn among/in anti-organization/jj Democratic/jj-tl leaders/nns and/cc in/in the/at Liberal/jj-tl party/nn to/in the/at Mayor's/nn$-tl reported/vbn plan/nn was/bedz generally/rb favorable/jj ./.\r\n", 111 | "\r\n", 112 | "\r\n", 113 | "\tSome/dti anti-organization/jj Democrats/nps saw/vbd in/in the/at program/nn an/at opportunity/nn to/to end/vb the/at bitter/jj internal/jj fight/nn within/in the/at Democratic/jj-tl party/nn that/wps has/hvz been/ben going/vbg on/rp for/in the/at last/ap three/cd years/nns ./.\r\n", 114 | "\r\n" 115 | ] 116 | } 117 | ], 118 | "source": [ 119 | "!sed 10q brown/ca07" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "id": "7e376c42", 125 | "metadata": {}, 126 | "source": [ 127 | "The main thing is that every word or punctuation mark is followed by a `/something`.\n", 128 | "We can remove that with a simple regular expression.\n", 129 | "Well, it's not quite so simple...\n", 130 | "\n", 131 | "- We want to replace `/`, but that's already the delimiter of sed's `s///` command, so we need to escape it: `\/`\n", 132 | "- the `g` flag is needed because we want to replace all occurrences, not just the first one on each line" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 7, 138 | "id": "6106ffad", 139 | "metadata": { 140 | "collapsed": false 141 | }, 142 | "outputs": [ 143 | { 144 | "name": "stdout", 145 | "output_type": "stream", 146 | "text": [ 147 | "\r\n", 148 | "\r\n", 149 | "\tResentment welled up yesterday among Democratic district leaders and some county leaders at reports that Mayor Wagner had decided to seek a third term with Paul R. Screvane and Abraham D. Beame as running mates .\r\n", 150 | "\r\n", 151 | "\r\n", 152 | "\tAt the same time reaction among anti-organization Democratic leaders and in the Liberal party to the Mayor's reported plan was generally favorable .\r\n", 153 | "\r\n", 154 | "\r\n", 155 | "\tSome anti-organization Democrats saw in the program an opportunity to end the bitter internal fight within the Democratic party that has been going on for the last three years .\r\n", 156 | "\r\n" 157 | ] 158 | } 159 | ], 160 | "source": [ 161 | "!sed 's/\/[^ ]*//g;10q' brown/ca07" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "id": "a78c6e56", 167 | "metadata": {}, 168 | "source": [ 169 | "Let's now clean up the whitespace at the beginning of each line. `\t` is a shorthand for the tab character." 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 8, 175 | "id": "1f7307db", 176 | "metadata": { 177 | "collapsed": false 178 | }, 179 | "outputs": [ 180 | { 181 | "name": "stdout", 182 | "output_type": "stream", 183 | "text": [ 184 | "\r\n", 185 | "\r\n", 186 | "Resentment welled up yesterday among Democratic district leaders and some county leaders at reports that Mayor Wagner had decided to seek a third term with Paul R. 
Screvane and Abraham D. Beame as running mates .\r\n", 187 | "\r\n", 188 | "\r\n", 189 | "At the same time reaction among anti-organization Democratic leaders and in the Liberal party to the Mayor's reported plan was generally favorable .\r\n", 190 | "\r\n", 191 | "\r\n", 192 | "Some anti-organization Democrats saw in the program an opportunity to end the bitter internal fight within the Democratic party that has been going on for the last three years .\r\n", 193 | "\r\n" 194 | ] 195 | } 196 | ], 197 | "source": [ 198 | "!sed 's/\/[^ ]*//g;s/^[ \t]*//;10q' brown/ca07" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "id": "211b87da", 204 | "metadata": {}, 205 | "source": [ 206 | "There are a lot of blank lines; the `cat -s` (squeeze) command gets rid of them." 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 9, 212 | "id": "32f18008", 213 | "metadata": { 214 | "collapsed": false 215 | }, 216 | "outputs": [ 217 | { 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "\r\n", 222 | "Resentment welled up yesterday among Democratic district leaders and some county leaders at reports that Mayor Wagner had decided to seek a third term with Paul R. Screvane and Abraham D. Beame as running mates .\r\n", 223 | "\r\n", 224 | "At the same time reaction among anti-organization Democratic leaders and in the Liberal party to the Mayor's reported plan was generally favorable .\r\n", 225 | "\r\n", 226 | "Some anti-organization Democrats saw in the program an opportunity to end the bitter internal fight within the Democratic party that has been going on for the last three years .\r\n", 227 | "\r\n" 228 | ] 229 | } 230 | ], 231 | "source": [ 232 | "!sed 's/\/[^ ]*//g;s/^[ \t]*//;10q' brown/ca07 | cat -s" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "id": "8135c8f7", 238 | "metadata": {}, 239 | "source": [ 240 | "Now we still have a problem with extra spaces before punctuation.\n", 241 | "We can fix that with another regular expression.\n", 242 | "This one contains *grouping* `\(...\)` and a back-reference `\1` to the group." 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 13, 248 | "id": "99222f49", 249 | "metadata": { 250 | "collapsed": false 251 | }, 252 | "outputs": [ 253 | { 254 | "name": "stdout", 255 | "output_type": "stream", 256 | "text": [ 257 | "\r\n", 258 | "Resentment welled up yesterday among Democratic district leaders and some county leaders at reports that Mayor Wagner had decided to seek a third term with Paul R. Screvane and Abraham D. Beame as running mates.\r\n", 259 | "\r\n", 260 | "At the same time reaction among anti-organization Democratic leaders and in the Liberal party to the Mayor's reported plan was generally favorable.\r\n", 261 | "\r\n", 262 | "Some anti-organization Democrats saw in the program an opportunity to end the bitter internal fight within the Democratic party that has been going on for the last three years.\r\n", 263 | "\r\n" 264 | ] 265 | } 266 | ], 267 | "source": [ 268 | "!sed 's/\/[^ ]*//g;s/^[ \t]*//;s/ \([.,]\)/\1/;10q' brown/ca07 | cat -s" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "id": "f8378292", 274 | "metadata": {}, 275 | "source": [ 276 | "Finally, let's wrap the long lines back around. The `fmt` command is handy for that."
277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 14, 282 | "id": "4e221ba4", 283 | "metadata": { 284 | "collapsed": false 285 | }, 286 | "outputs": [ 287 | { 288 | "name": "stdout", 289 | "output_type": "stream", 290 | "text": [ 291 | "\r\n", 292 | "Resentment welled up yesterday among Democratic district leaders and\r\n", 293 | "some county leaders at reports that Mayor Wagner had decided to seek a\r\n", 294 | "third term with Paul R. Screvane and Abraham D. Beame as running mates.\r\n", 295 | "\r\n", 296 | "At the same time reaction among anti-organization Democratic leaders and\r\n", 297 | "in the Liberal party to the Mayor's reported plan was generally favorable.\r\n", 298 | "\r\n", 299 | "Some anti-organization Democrats saw in the program an opportunity to\r\n", 300 | "end the bitter internal fight within the Democratic party that has been\r\n", 301 | "going on for the last three years.\r\n", 302 | "\r\n" 303 | ] 304 | } 305 | ], 306 | "source": [ 307 | "!sed 's/\/[^ ]*//g;s/^[ \t]*//;s/ \([.,]\)/\1/;10q' brown/ca07 | cat -s | fmt" 308 | ] 309 | } 310 | ], 311 | "metadata": {}, 312 | "nbformat": 4, 313 | "nbformat_minor": 5 314 | } 315 | -------------------------------------------------------------------------------- /nltk-summary-stemming-lemmatizing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 39, 6 | "id": "b28adc0f", 7 | "metadata": { 8 | "collapsed": false 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "from nltk.stem import SnowballStemmer,PorterStemmer,WordNetLemmatizer" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "id": "c0db7299", 18 | "metadata": {}, 19 | "source": [ 20 | "# Grammar" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "f108dbfd", 26 | "metadata": {}, 27 | "source": [ 28 | "Recall:\n", 29 | "\n", 30 | "- inflection - systematic alteration of words according to grammatical rules\n", 31 | "- declension - nouns, adjectives, articles, pronouns - number, gender, case\n", 32 | "- conjugation - verbs - person, number, tense, gender, aspect, mood, voice\n", 33 | "\n", 34 | "Some of the terms:\n", 35 | "\n", 36 | "- person, number, tense, gender... pretty obvious\n", 37 | "- voice - relationship between verb and its arguments (subject, object, ...)\n", 38 | "- aspect - ongoing, completed, habitual, consequential, ...\n", 39 | "- mood - actual, hypothetical, counterfactual, wished for, conditional, command, question, ..."
40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "id": "5dcc0d8c", 45 | "metadata": {}, 46 | "source": [ 47 | "# Porter Stemmer on English" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 43, 53 | "id": "b431c369", 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "en_nouns = \\\n", 60 | "[\n", 61 | " \"house houses house's\",\n", 62 | " \"child children\",\n", 63 | "]\n", 64 | "en_verbs = \\\n", 65 | "[\n", 66 | " \"walk walked walking walks\",\n", 67 | " \"see saw sees seen seeing\",\n", 68 | "]\n", 69 | "en_cases = en_nouns + en_verbs" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 44, 75 | "id": "5801e1c4", 76 | "metadata": { 77 | "collapsed": false 78 | }, 79 | "outputs": [ 80 | { 81 | "name": "stdout", 82 | "output_type": "stream", 83 | "text": [ 84 | "hous hous house'\n", 85 | "child children\n", 86 | "walk walk walk walk\n", 87 | "see saw see seen see\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "pen = PorterStemmer()\n", 93 | "for c in en_cases:\n", 94 | " for w in c.split():\n", 95 | " print pen.stem(w),\n", 96 | " print" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "id": "10cfb39d", 102 | "metadata": {}, 103 | "source": [ 104 | "# Snowball Stemmer on German" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 36, 110 | "id": "e4ff6b4a", 111 | "metadata": { 112 | "collapsed": false 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "de_nouns = \\\n", 117 | "[\n", 118 | " u\"Bruder Bruders Brüder Brüdern\",\n", 119 | " u\"Leuchte Leuchten\",\n", 120 | " u\"Haus Hauses Hause Häuser Häusern\",\n", 121 | "]\n", 122 | "de_verbs = \\\n", 123 | "[\n", 124 | " u\"geb geben gebe gibst gibt gebt gab gabst gaben gabt gegeben gäbe gäbst gäb gäben gäbet\",\n", 125 | " u\"fangen fang fange fängst fängt fangen fangt fing fingst fingen fingt\",\n", 126 | " u\"backen backe backst backt backte backtest backten backtet gebackt gebackte\",\n", 127 | " u\"bäckst bäckt bukest bükest\",\n", 128 | " \n", 129 | "]\n", 130 | "de_cases = de_nouns+de_verbs" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 38, 136 | "id": "eff68904", 137 | "metadata": { 138 | "collapsed": false 139 | }, 140 | "outputs": [ 141 | { 142 | "name": "stdout", 143 | "output_type": "stream", 144 | "text": [ 145 | "brud brud brud brud\n", 146 | "leucht leucht\n", 147 | "haus haus haus haus haus\n", 148 | "geb geb geb gibst gibt gebt gab gabst gab gabt gegeb gab gabst gab gab gabet\n", 149 | "fang fang fang fang fangt fang fangt fing fing fing fingt\n", 150 | "back back back backt backt backt backt backtet gebackt gebackt\n", 151 | "back backt buk buk\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "des = SnowballStemmer(\"german\")\n", 157 | "for c in de_cases:\n", 158 | " for w in c.split():\n", 159 | " print des.stem(w),\n", 160 | " print" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "id": "0ffdca22", 166 | "metadata": {}, 167 | "source": [ 168 | "# WordNet Lemmatization" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 40, 174 | "id": "a766109f", 175 | "metadata": { 176 | "collapsed": false 177 | }, 178 | "outputs": [], 179 | "source": [ 180 | "wnl = WordNetLemmatizer()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 45, 186 | "id": "f5e43ff7", 187 | "metadata": { 188 | "collapsed": false 189 | }, 190 | "outputs": [ 191 | { 192 | "name": "stdout", 193 | "output_type": "stream", 194 | 
"text": [ 195 | "house house house's\n", 196 | "child child\n" 197 | ] 198 | } 199 | ], 200 | "source": [ 201 | "for c in en_nouns:\n", 202 | " for w in c.split():\n", 203 | " print wnl.lemmatize(w,pos='n'),\n", 204 | " print" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 46, 210 | "id": "08bc0780", 211 | "metadata": { 212 | "collapsed": false 213 | }, 214 | "outputs": [ 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | "walk walk walk walk\n", 220 | "see saw see see see\n" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "for c in en_verbs:\n", 226 | " for w in c.split():\n", 227 | " print wnl.lemmatize(w,pos='v'),\n", 228 | " print" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 47, 234 | "id": "623e970f", 235 | "metadata": { 236 | "collapsed": false 237 | }, 238 | "outputs": [ 239 | { 240 | "name": "stdout", 241 | "output_type": "stream", 242 | "text": [ 243 | "walk walked walking walk\n", 244 | "see saw see seen seeing\n", 245 | "house house house's\n", 246 | "child child\n" 247 | ] 248 | } 249 | ], 250 | "source": [ 251 | "for c in en_verbs+en_nouns:\n", 252 | " for w in c.split():\n", 253 | " print wnl.lemmatize(w),\n", 254 | " print" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "id": "f48f8926", 261 | "metadata": { 262 | "collapsed": false 263 | }, 264 | "outputs": [], 265 | "source": [] 266 | } 267 | ], 268 | "metadata": {}, 269 | "nbformat": 4, 270 | "nbformat_minor": 5 271 | } 272 | -------------------------------------------------------------------------------- /tagutils.py: -------------------------------------------------------------------------------- 1 | def stag(wt): 2 | w,t = wt 3 | if t=="Unk": return "[{}]".format(w) 4 | if t=="AT": return "{}".format(w) 5 | if t=="IN": return "{}".format(w) 6 | if t=="CS": return "{}".format(w) 7 | if t=="CC": return "{}".format(w) 8 | if t[0]=="N": return "{}".format(w) 9 | if t[:2]=="PP": return "{}".format(w) 10 | if t[:2]=="DO" or t[:2]=="EX" or t[:2]=="HV" or t[:2]=="MD": 11 | # do, be, have, modal 12 | return "{}".format(w) 13 | if t[0]=="V": 14 | return "{}".format(w) 15 | if "JJ" in t: return "{}".format(w) 16 | if "RB" in t and "WRB" not in t: 17 | return "{}".format(w) 18 | return w 19 | 20 | def stags(tagged): 21 | return " ".join([stag(x) for x in tagged]) 22 | 23 | def mstags(sentences): 24 | return "
\n".join(stags(s) for s in sentences) 25 | --------------------------------------------------------------------------------