├── .gitattributes
├── .hgignore
├── HW1 - Implement a Trie.ipynb
├── HW1 - Regular Expression Parsing.ipynb
├── HW1 - Reproduce Power Laws.ipynb
├── HW2 - Classifier-Based Tagging.ipynb
├── HW2 - Improve Tagging with Wordnet.ipynb
├── HW3 - HMMs and FSTs.ipynb
├── HW3 - regex generation.ipynb
├── README.md
├── brown.zip
├── college-degrees-perc.tsv
├── crawling-with-scrapy.ipynb
├── fstutils.py
├── household-ppp.tsv
├── internet-archive-query.ipynb
├── letter.png
├── nlpa-basic-nltk.ipynb
├── nlpa-classification-intro.ipynb
├── nlpa-classification-tagging.ipynb
├── nlpa-classifier-dialog-acts.ipynb
├── nlpa-classifier-errors.ipynb
├── nlpa-classifier-sentence-segmentation-Copy0.ipynb
├── nlpa-classifier-sentence-segmentation.ipynb
├── nlpa-corpora.ipynb
├── nlpa-course.ipynb
├── nlpa-downloading-tomsawyer.ipynb
├── nlpa-edit-distance.ipynb
├── nlpa-find-xargs.ipynb
├── nlpa-hmm-ocr.ipynb
├── nlpa-intro-demo-videos.ipynb
├── nlpa-intro.ipynb
├── nlpa-lexical-resources.ipynb
├── nlpa-markov-models.ipynb
├── nlpa-memm.ipynb
├── nlpa-nltk-automated-tagging-old.ipynb
├── nlpa-nltk-automated-tagging.ipynb
├── nlpa-nltk-basic.ipynb
├── nlpa-nltk-corpora.ipynb
├── nlpa-nltk-lexical-resources.ipynb
├── nlpa-nltk-reading-german.ipynb
├── nlpa-nltk-wordnet.ipynb
├── nlpa-openfst-edit-distance.ipynb
├── nlpa-openfst-edit-distance.py
├── nlpa-openfst.ipynb
├── nlpa-openfst2.ipynb
├── nlpa-pos-tagging.ipynb
├── nlpa-re-fsa.ipynb
├── nlpa-re-intro.ipynb
├── nlpa-regular-expressions.ipynb
├── nlpa-simple-ir.ipynb
├── nlpa-unicode.ipynb
├── nlpa-unix-cleanup.ipynb
├── nlpa-unix-join.ipynb
├── nlpa-vectorspace.ipynb
├── nlpa-word-histograms.ipynb
├── nlpa-wordnet.ipynb
├── nltk-available-taggers.ipynb
├── nltk-ngram-taggers.ipynb
├── nltk-summary-stemming-lemmatizing.ipynb
├── nltk-tagging-from-scratch.ipynb
├── openfst-weights-and-forwardbackward.ipynb
├── tagutils.py
├── tomsawyer.html
├── tomsawyer.txt
└── zipf-law-example.ipynb


/.gitattributes:
--------------------------------------------------------------------------------
1 | *.ipynb linguist-language=Python
2 | *.tex linguist-detectable=false
3 | *.html linguist-detectable=false
4 | 


--------------------------------------------------------------------------------
/.hgignore:
--------------------------------------------------------------------------------
 1 | syntax: glob
 2 | ?
 3 | *.bak
 4 | brown
 5 | .*
 6 | *~
 7 | *.o
 8 | *.so
 9 | *.a
10 | *.err
11 | *.log
12 | *.os
13 | *.pyc
14 | *.png
15 | *.jpg
16 | [0-9]
17 | _*
18 | book
19 | book-*
20 | unlv
21 | unlv-*
22 | Volume-*
23 | Volume_*
24 | *[0-9][0-9][0-9][0-9]*
25 | TAGS
26 | build
27 | *.db
28 | OLD
29 | *.tgz
30 | *.zip
31 | html
32 | apidocs
33 | JUNK
34 | OLD
35 | models*/*
36 | *.orig
37 | temp/
38 | temp.*
39 | *.temp
40 | 


--------------------------------------------------------------------------------
/HW1 - Implement a Trie.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "49477604",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Trie Data Structure"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "markdown",
 13 |    "id": "d6fbd5f6",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "Define a class `Trie` that can be used for fast lookups of strings.\n",
 17 |     "Such classes are frequently used in many natural language processing applications.\n",
 18 |     "\n",
 19 |     "(Writing such a class is also a common interview question; you need\n",
 20 |     "to be able to do it in real time.)\n",
 21 |     "\n",
 22 |     "Have your class update a global variable `nops` for each node traversal during\n",
 23 |     "`add`, `lookup`, and `remove` operations."
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": 11,
 29 |    "id": "7da39832",
 30 |    "metadata": {
 31 |     "collapsed": false
 32 |    },
 33 |    "outputs": [],
 34 |    "source": [
 35 |     "class Trie:\n",
 36 |     "    def __init__(self):\n",
 37 |     "        pass\n",
 38 |     "    def add(self,s,value):\n",
 39 |     "        \"\"\"Add the string `s` to the `Trie` and\n",
 40 |     "        map it to the given value.\"\"\"\n",
 41 |     "        global nops\n",
 42 |     "        nops += 1 # this is just a placeholder\n",
 43 |     "        pass\n",
 44 |     "    def lookup(self,s,default=None):\n",
 45 |     "        \"\"\"Look up the value corresponding to the\n",
 46 |     "        string `s`.\"\"\"\n",
 47 |     "    def remove(self,s):\n",
 48 |     "        \"\"\"Remove the string s from the Trie.\n",
 49 |     "        Returns True if the string was a member.\"\"\"\n",
 50 |     "        pass\n",
 51 |     "    def prefix(self,s):\n",
 52 |     "        \"\"\"Check whether the string `s` is a prefix\n",
 53 |     "        of some member.\"\"\"\n",
 54 |     "        pass\n",
 55 |     "    def items(self):\n",
 56 |     "        \"\"\"Return an iterator over the items of the `Trie`.\"\"\""
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "markdown",
 61 |    "id": "e241371b",
 62 |    "metadata": {},
 63 |    "source": [
 64 |     "# Unit Tests"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "markdown",
 69 |    "id": "c10be409",
 70 |    "metadata": {},
 71 |    "source": [
 72 |     "Write some unit tests demonstrating that your class works as intended.\n",
 73 |     "The next cell gives some examples, but you need to write additional tests\n",
 74 |     "for other methods and common sequences of operations."
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": 21,
 80 |    "id": "9965f973",
 81 |    "metadata": {
 82 |     "collapsed": false
 83 |    },
 84 |    "outputs": [
 85 |     {
 86 |      "ename": "AssertionError",
 87 |      "evalue": "",
 88 |      "output_type": "error",
 89 |      "traceback": [
 90 |       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mAssertionError\u001b[0m                            Traceback (most recent call last)",
 91 |       "\u001b[0;32m<ipython-input-21-18e07a553991>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mtrie\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlookup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"street\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mtrie\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlookup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"house\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mtrie\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlookup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"hello\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      7\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mtrie\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlookup\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"world\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
 92 |       "\u001b[0;31mAssertionError\u001b[0m: "
 93 |      ]
 94 |     }
 95 |    ],
 96 |    "source": [
 97 |     "trie = Trie()\n",
 98 |     "trie.add(\"hello\",1)\n",
 99 |     "trie.add(\"world\",2)\n",
100 |     "assert not trie.lookup(\"street\")\n",
101 |     "assert not trie.lookup(\"house\")\n",
102 |     "assert trie.lookup(\"hello\")\n",
103 |     "assert trie.lookup(\"world\")"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "code",
108 |    "execution_count": 23,
109 |    "id": "4b23d401",
110 |    "metadata": {
111 |     "collapsed": false
112 |    },
113 |    "outputs": [],
114 |    "source": [
115 |     "trie = Trie()\n",
116 |     "nops = 0\n",
117 |     "trie.add(\"hello\",1)\n",
118 |     "assert nops>0"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "markdown",
123 |    "id": "71de5b4e",
124 |    "metadata": {},
125 |    "source": [
126 |     "# Performance Measurements"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "markdown",
131 |    "id": "9fbef093",
132 |    "metadata": {},
133 |    "source": [
134 |     "Next, let's measure how `Trie` performance scales on real data."
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": 24,
140 |    "id": "7d114a5d",
141 |    "metadata": {
142 |     "collapsed": false
143 |    },
144 |    "outputs": [
145 |     {
146 |      "name": "stdout",
147 |      "output_type": "stream",
148 |      "text": [
149 |       "74354 ['the', 'adventures', 'of', 'tom', 'sawyer', 'mark', 'twain', 'harper', 'and', 'brothers']\n"
150 |      ]
151 |     }
152 |    ],
153 |    "source": [
154 |     "import re\n",
155 |     "words = re.findall(r'\\w+',open(\"tomsawyer.txt\").read())\n",
156 |     "words = [w.lower() for  w in words]\n",
157 |     "print len(words),words[:10]"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": 17,
163 |    "id": "5be56794",
164 |    "metadata": {
165 |     "collapsed": false
166 |    },
167 |    "outputs": [],
168 |    "source": [
169 |     "counts = []\n",
170 |     "for n in range(1000,70000,1000):\n",
171 |     "    trie = Trie()\n",
172 |     "    nops = 0\n",
173 |     "    for i,w in enumerate(words[:n]):\n",
174 |     "        trie.add(w,i)\n",
175 |     "    counts.append((n,nops))"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "code",
180 |    "execution_count": 18,
181 |    "id": "dd835354",
182 |    "metadata": {
183 |     "collapsed": false
184 |    },
185 |    "outputs": [
186 |     {
187 |      "data": {
188 |       "text/plain": [
189 |        "[<matplotlib.lines.Line2D at 0x2fd9a10>]"
190 |       ]
191 |      },
192 |      "execution_count": 18,
193 |      "metadata": {},
194 |      "output_type": "execute_result"
195 |     },
196 |     {
197 |      "data": {
198 |       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY8AAAD9CAYAAABEB/uZAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3W9MXNed//H3ZOFJuzWqCQzuDFoqGEywCdDYwINdiWY6\njuOuwQ62CVvZxPVWFdZGrlVH3krd1pZqQ7ZaafPPWmlFV4hsM7g8sN0qpiRu2Fqxd6JgZ50N3Xh2\nOzHMGFBiAiE2NsE+vwd07g9Mwh+bYeYyn5eEBGfmznyPhefwueeecx3GGIOIiMgCPBDvAkRExH40\neIiIyIJp8BARkQXT4CEiIgumwUNERBZMg4eIiCzYrIPH+++/T2lpqfWVlpbG888/z9DQED6fj/z8\nfDZs2MDw8LB1TGNjIx6Ph4KCAjo7O6327u5uioqK8Hg87Nu3z2q/desWtbW1eDweKioquHLlSgy6\nKSIii2nWwWP16tVcvHiRixcv0t3dzZe+9CW2bt1KU1MTPp+Py5cv4/V6aWpqAqCnp4e2tjZ6enro\n6Ohg7969RJeRNDQ00NzcTDAYJBgM0tHRAUBzczPp6ekEg0H279/PwYMHY9xlERG5X/M+bfX666+T\nl5dHdnY2p06dor6+HoD6+npOnDgBwMmTJ6mrqyM1NZWcnBzy8vIIBAL09/czOjpKWVkZALt27bKO\nmfpaNTU1nDlzZlE7KCIiiy9lvk/0+/3U1dUBMDg4iNPpBMDpdDI4OAjA1atXqaiosI5xu91EIhFS\nU1Nxu91Wu8vlIhKJABCJRMjOzp4sJiWFtLQ0hoaGWLlypfV8h8Nxr/0TEUlqsdpEZF7JY3x8nF//\n+tds3759xmMOh2NJPtyNMbb9+ulPfxr3GpK1fjvXrvrj/2X3+mNpXoPH6dOneeSRR8jIyAAm08bA\nwAAA/f39ZGZmApOJoq+vzzouHA7jdrtxuVyEw+EZ7dFjent7AZiYmGBkZGRa6hARkcQzr8HjlVde\nsU5ZAVRVVdHS0gJAS0sLW7Zssdr9fj/j4+OEQiGCwSBlZWVkZWWxYsUKAoEAxhhaW1uprq6e8Vrt\n7e14vd5F7aCIyHI3NgYHDsDJk0v4pmYOn376qUlPTzeffPKJ1Xbt2jXj9XqNx+MxPp/PfPzxx9Zj\nR44cMbm5uWb16tWmo6PDan/77bfN2rVrTW5urnn66aet9ps3b5rt27ebvLw8U15ebkKh0Iwa5lFm\nQnvjjTfiXcJ9sXP9dq7dGNUfb3ao/803jcnPN6a21pgPP5z+WCw/Ox1/eoOE5nA4Yn7+TkTETsbG\n4B/+Af793+HFF6GmZuZzYvnZqRXmIiI2c/48lJZCXx9cuvT5A0eszftSXRERia+xMfjJT+Dll+GF\nF2DbtvjVouQhImID0bTR2zuZNuI5cICSh4hIQkuktDGVkoeISII6dw5KShInbUyl5CEikmCmXkmV\nSGljKiUPEZEEEk0b4XDipY2plDxERBLAfNZtJBIlDxGROJuaNt59N/EHDlDyEBGJG7uljamUPERE\n4sCOaWMqJQ8RkSVk57QxlZKHiMgSia4St2vamErJQ0QkxqauErdz2phKyUNEJIbu3pNqOQwcoOQh\nIhITdlglfj+UPEREFtn585NXUkXvt7HcBg5Q8hARWTTLPW1MpeQhIrIIous2lnPamErJQ0TkPiRT\n2phKyUNE5B4lW9qYSslDRGSBkjVtTKXkISKyANF1G8mYNqaac/AYHh5m27ZtPPTQQxQWFhIIBBga\nGsLn85Gfn8+GDRsYHh62nt/Y2IjH46GgoIDOzk6rvbu7m6KiIjweD/v27bPab926RW1tLR6Ph4qK\nCq5cubLIXRQRuX9jY/DMM/DEE/Czn0FbG2RkxLuq+Jlz8Ni3bx+bNm3iD3/4A5cuXaKgoICmpiZ8\nPh+XL1/G6/XS1NQEQE9PD21tbfT09NDR0cHevXsxxgDQ0NBAc3MzwWCQYDBIR0cHAM3NzaSnpxMM\nBtm/fz8HDx6MYXdFRBbu7lXiyZo2pnKY6Kf75xgZGaG0tJQ//vGP09oLCgr4j//4D5xOJwMDA1RW\nVvI///M/NDY28sADD1gDwMaNGzl06BB/8Rd/waOPPsof/vAHAPx+P11dXfzLv/wLGzdu5PDhw5SX\nlzMxMcGqVav48MMPpxfpcPDTn/7U+rmyspLKysrF+jcQEflcU/ekssPcRldXF11dXdbPhw8fZpaP\n+Psy64R5KBQiIyOD3bt381//9V888sgj/PM//zODg4M4nU4AnE4ng4ODAFy9epWKigrreLfbTSQS\nITU1FbfbbbW7XC4ikQgAkUiE7OzsyWJSUkhLS2NoaIiVK1dOq+XQoUP331sRkXk6fx5274bi4sm0\nYYdTVHf/YX348OGYvdesp60mJia4cOECe/fu5cKFC3z5y1+2TlFFORwOHA5HzAoUEVlKY2Nw4IDm\nNuYy6+Dhdrtxu92sX78egG3btnHhwgWysrIYGBgAoL+/n8zMTGAyUfT19VnHh8Nh3G43LpeLcDg8\noz16TG9vLzA5WI2MjMxIHSIiS2Hq3f00tzG7WQePrKwssrOzuXz5MgCvv/46a9asYfPmzbS0tADQ\n0tLCli1bAKiqqsLv9zM+Pk4oFCIYDFJWVkZWVhYrVqwgEAhgjKG1tZXq6mrrmOhrtbe34/V6Y9ZZ\nEZHPE00bNTVw9Cj4/Uobc5lzkeALL7zAd77zHcbHx8nNzeXf/u3fuH37Njt27KC5uZmcnByOHz8O\nQGFhITt27KCwsJCUlBSOHTtmndI6duwYTz31FGNjY2zatImNGzcCsGfPHnbu3InH4yE9PR2/3x/D\n7oqITHfu3OTcRmnp5N39Hnww3hXZw6xXWyUKh8MRsysGRCQ5LZd7ic8mlp+dWmEuIkln6tyG3e8l\nHi/a20pEkkYypI2louQhIkkhenc/pY3FoeQhIsua3VaJ24WSh4gsW9qTKnaUPERk2dH9NmJPyUNE\nlpXo3Eay328j1pQ8RGRZUNpYWkoeImJ7yXwv8XhR8hAR21LaiB8lDxGxJaWN+FLyEBFb0SrxxKDk\nISK2oT2pEoeSh4gkvLEx+PGP4Ze/VNpIFEoeIpLQomkjElHaSCRKHiKSkDS3kdiUPEQk4UT3pNLc\nRuJS8hCRhKEdcO1DyUNEEoJ2wLUXJQ8RiSulDXtS8hCRuIleSaW0YT9KHiKy5LQnlf3NmTxycnJ4\n+OGHKS0tpaysDIChoSF8Ph/5+fls2LCB4eFh6/mNjY14PB4KCgro7Oy02ru7uykqKsLj8bBv3z6r\n/datW9TW1uLxeKioqODKlSuL2T8RSTBTV4krbdjXnIOHw+Ggq6uLixcv8tZbbwHQ1NSEz+fj8uXL\neL1empqaAOjp6aGtrY2enh46OjrYu3cvxhgAGhoaaG5uJhgMEgwG6ejoAKC5uZn09HSCwSD79+/n\n4MGDseqriMTR2BgcODB52e3Ro+D3Q0ZGvKuSezWvOY/oABB16tQp6uvrAaivr+fEiRMAnDx5krq6\nOlJTU8nJySEvL49AIEB/fz+jo6NWctm1a5d1zNTXqqmp4cyZM4vTMxFJGNG7+2ndxvIx55yHw+Hg\nW9/6Fn/2Z3/G97//fb73ve8xODiI0+kEwOl0Mjg4CMDVq1epqKiwjnW73UQiEVJTU3G73Va7y+Ui\nEokAEIlEyM7OniwmJYW0tDSGhoZYuXLltDoOHTpkfV9ZWUllZeW99VhEloyupFpaXV1ddHV1Lcl7\nzTl4vPnmm6xatYoPP/wQn89HQUHBtMcdDgcOhyNmBUZNHTxEJPGdPw+7d0Nx8eTchk5Rxd7df1gf\nPnw4Zu8152mrVatWAZCRkcHWrVt56623cDqdDAwMANDf309mZiYwmSj6+vqsY8PhMG63G5fLRTgc\nntEePaa3txeAiYkJRkZGZqQOEbGP6NzGE0/Az34GbW0aOJajWQePGzduMDo6CsD169fp7OykqKiI\nqqoqWlpaAGhpaWHLli0AVFVV4ff7GR8fJxQKEQwGKSsrIysrixUrVhAIBDDG0NraSnV1tXVM9LXa\n29vxer0x66yIxJbu7pc8Zj1tNTg4yNatW4HJVPCd73yHDRs2sG7dOnbs2EFzczM5OTkcP34cgMLC\nQnbs2EFhYSEpKSkcO3bMOqV17NgxnnrqKcbGxti0aRMbN24EYM+ePezcuROPx0N6ejp+vz+W/RWR\nGNC6jeTjMHdfSpWAHA7HjCu+RCQxnDs3ObdRUjK5dbpOUSWOWH52aoW5iNwTpY3kpr2tRGTBNLch\nSh4iMm9KGxKl5CEi86K0IVMpeYjIrJQ25PMoeYjIF1LakC+i5CEiMyhtyFyUPERkmui9xJU2ZDZK\nHiICaAdcWRglDxGx0obuJS7zpeQhksSUNuReKXmIJKnolVRKG3IvlDxEkoyupJLFoOQhkkSiaSMc\nVtqQ+6PkIZIEpqaNF1+Empp4VyR2p+QhssxNTRvvvquBQxaHkofIMqW5DYklJQ+RZUirxCXWlDxE\nlhGlDVkqSh4iy4R2wJWlpOQhYnNKGxIPSh4iNqa0IfGi5CFiQ0obEm/zSh63b9+mtLSUzZs3AzA0\nNITP5yM/P58NGzYwPDxsPbexsRGPx0NBQQGdnZ1We3d3N0VFRXg8Hvbt22e137p1i9raWjweDxUV\nFVy5cmWx+iayLJ0/r7Qh8TevweO5556jsLAQh8MBQFNTEz6fj8uXL+P1emlqagKgp6eHtrY2enp6\n6OjoYO/evRhjAGhoaKC5uZlgMEgwGKSjowOA5uZm0tPTCQaD7N+/n4MHD8ainyK2NzYGBw7AE0/A\nkSPQ1gYZGfGuSpLVnINHOBzm1Vdf5W//9m+tgeDUqVPU19cDUF9fz4kTJwA4efIkdXV1pKamkpOT\nQ15eHoFAgP7+fkZHRykrKwNg165d1jFTX6umpoYzZ84sfi9FbE5zG5Jo5pzz2L9/Pz//+c/55JNP\nrLbBwUGcTicATqeTwcFBAK5evUpFRYX1PLfbTSQSITU1FbfbbbW7XC4ikQgAkUiE7OzsyWJSUkhL\nS2NoaIiVK1dOq+PQoUPW95WVlVRWVi6wqyL2o7kNWYiuri66urqW5L1mHTx+85vfkJmZSWlp6RcW\n5HA4rNNZsTR18BBJBufOwe7dk4nj0iWdopK53f2H9eHDh2P2XrMOHufOnePUqVO8+uqr3Lx5k08+\n+YSdO3fidDoZGBggKyuL/v5+MjMzgclE0dfXZx0fDodxu924XC7C4fCM9ugxvb29fO1rX2NiYoKR\nkZEZqUMkmShtiB3MOudx9OhR+vr6CIVC+P1+Hn30UVpbW6mqqqKlpQWAlpYWtmzZAkBVVRV+v5/x\n8XFCoRDBYJCysjKysrJYsWIFgUAAYwytra1UV1dbx0Rfq729Ha/XG8v+iiQ0zW2IXSxonUf09NTf\n//3fs2PHDpqbm8nJyeH48eMAFBYWsmPHDgoLC0lJSeHYsWPWMceOHeOpp55ibGyMTZs2sXHjRgD2\n7NnDzp078Xg8pKen4/f7F7N/Irage4mL3ThM9BKqBOZwOLBBmSL35Pz5ybmN4uLJGzVpbkMWSyw/\nO7XCXCROlDbEzrS3lUgcRO+30duruQ2xJyUPkSWkK6lkuVDyEFkiU+8lrrQhdqfkIRJjU9PGiy9C\nTU28KxK5f0oeIjF0d9rQwCHLhZKHSAwobchyp+Qhssii99sIh+HddzVwyPKk5CGySJQ2JJkoeYgs\nAqUNSTZKHiL3QavEJVkpeYjcI60Sl2Sm5CGyQFolLqLkIbIg0bkN3W9Dkp2Sh8g8KG2ITKfkITIH\n3d1PZCYlD5EvoLQh8sWUPEQ+h9KGyOyUPESmUNoQmR8lD5E/UdoQmT8lD0l6ShsiC6fkIUktukpc\naUNkYZQ8JClpTyqR+zNr8rh58ybl5eWUlJRQWFjIj370IwCGhobw+Xzk5+ezYcMGhoeHrWMaGxvx\neDwUFBTQ2dlptXd3d1NUVITH42Hfvn1W+61bt6itrcXj8VBRUcGVK1cWu48i02hPKpFFYOZw/fp1\nY4wxn332mSkvLzdnz541zzzzjHn22WeNMcY0NTWZgwcPGmOMee+990xxcbEZHx83oVDI5Obmmjt3\n7hhjjFm/fr0JBALGGGMef/xxc/r0aWOMMS+99JJpaGgwxhjj9/tNbW3tjBrmUabInG7cMObAAWOy\nsoz51a/iXY1I7MXys3POOY8vfelLAIyPj3P79m2++tWvcurUKerr6wGor6/nxIkTAJw8eZK6ujpS\nU1PJyckhLy+PQCBAf38/o6OjlJWVAbBr1y7rmKmvVVNTw5kzZxZ7fBRR2hBZZHPOedy5c4dvfOMb\n/N///R8NDQ2sWbOGwcFBnE4nAE6nk8HBQQCuXr1KRUWFdazb7SYSiZCamorb7bbaXS4XkUgEgEgk\nQnZ29mQxKSmkpaUxNDTEypUrp9Vx6NAh6/vKykoqKyvvrceSVDS3Icmkq6uLrq6uJXmvOQePBx54\ngHfeeYeRkREee+wx3njjjWmPOxwOHA5HzAqMmjp4iMzHuXOwe/fk2o1LlyAjI94VicTW3X9YHz58\nOGbvNe9LddPS0vj2t79Nd3c3TqeTgYEBAPr7+8nMzAQmE0VfX591TDgcxu1243K5CIfDM9qjx/T2\n9gIwMTHByMjIjNQhshBjY3DgwOStYI8cgbY2DRwii23WweOjjz6yrqQaGxvjtddeo7S0lKqqKlpa\nWgBoaWlhy5YtAFRVVeH3+xkfHycUChEMBikrKyMrK4sVK1YQCAQwxtDa2kp1dbV1TPS12tvb8Xq9\nMeusLH/RVeLhsOY2RGJp1tNW/f391NfXc+fOHe7cucPOnTvxer2UlpayY8cOmpubycnJ4fjx4wAU\nFhayY8cOCgsLSUlJ4dixY9YprWPHjvHUU08xNjbGpk2b2LhxIwB79uxh586deDwe0tPT8fv9Me6y\nLEdTV4m/+OJk6hCR2HH86XKuhOZwOLBBmRIn0bmN0tLJgePBB+NdkUhiiOVnp1aYi21pTyqR+NHe\nVmJL2pNKJL6UPMRWlDZEEoOSh9iG7rchkjiUPCThKW2IJB4lD0loShsiiUnJQxKS0oZIYlPykIRz\n/rzShkiiU/KQhKG0IWIfSh6SEKbuSfXuuxo4RBKdkofE1dgY/PjH8Mtfak8qETtR8pC4iaaNSGQy\nbWjgELEPJQ9ZctoBV8T+lDxkSUX3pIrObWjgELEnJQ9ZErqXuMjyouQhMRdNG729WrchslwoeUjM\nKG2ILF9KHhITShsiy5uShywqrRIXSQ5KHrJopq4SV9oQWd6UPOS+ad2GSPJR8pD7cveeVBo4RJKD\nkofcE6UNkeQ2a/Lo6+vjm9/8JmvWrGHt2rU8//zzAAwNDeHz+cjPz2fDhg0MDw9bxzQ2NuLxeCgo\nKKCzs9Nq7+7upqioCI/Hw759+6z2W7duUVtbi8fjoaKigitXrix2H2WRRa+kit5vQwOHSBIys+jv\n7zcXL140xhgzOjpq8vPzTU9Pj3nmmWfMs88+a4wxpqmpyRw8eNAYY8x7771niouLzfj4uAmFQiY3\nN9fcuXPHGGPM+vXrTSAQMMYY8/jjj5vTp08bY4x56aWXTENDgzHGGL/fb2pra2fUMUeZskRu3DDm\nwAFjsrKM+dWv4l2NiMwllp+dsyaPrKwsSkpKAPjzP/9zHnroISKRCKdOnaK+vh6A+vp6Tpw4AcDJ\nkyepq6sjNTWVnJwc8vLyCAQC9Pf3Mzo6SllZGQC7du2yjpn6WjU1NZw5cyYWY6TcJ63bEJGp5j3n\n8cEHH3Dx4kXKy8sZHBzE6XQC4HQ6GRwcBODq1atUVFRYx7jdbiKRCKmpqbjdbqvd5XIRiUQAiEQi\nZGdnTxaTkkJaWhpDQ0OsXLly2vsfOnTI+r6yspLKysqF9VTuidZtiNhHV1cXXV1dS/Je8xo8Pv30\nU2pqanjuuef4yle+Mu0xh8OBw+GISXFTTR08ZGmcOwe7d09eTXXpEmRkxLsiEZnN3X9YHz58OGbv\nNeelup999hk1NTXs3LmTLVu2AJNpY2BgAID+/n4yMzOByUTR19dnHRsOh3G73bhcLsLh8Iz26DG9\nvb0ATExMMDIyMiN1yNIaG4MDByYnwo8cgbY2DRwiMt2sg4cxhj179lBYWMgPfvADq72qqoqWlhYA\nWlparEGlqqoKv9/P+Pg4oVCIYDBIWVkZWVlZrFixgkAggDGG1tZWqqurZ7xWe3s7Xq83Jh2V+Tl/\nfjJpRK+k0mkqEflcs82mnz171jgcDlNcXGxKSkpMSUmJOX36tLl27Zrxer3G4/EYn89nPv74Y+uY\nI0eOmNzcXLN69WrT0dFhtb/99ttm7dq1Jjc31zz99NNW+82bN8327dtNXl6eKS8vN6FQaEYdc5Qp\ni+DGDWN++ENdSSWynMTys9PxpzdIaA6HAxuUaVtT5zZefFGnqESWi1h+dmqFeRLTlVQicq+0t1WS\niu5JpbkNEbkXSh5JRmlDRBaDkkcSUdoQkcWi5JEElDZEZLEpeSxzd++Aq4FDRBaDkscyNTYGP/kJ\nvPyy0oaILD4lj2VIO+CKSKwpeSwjmtsQkaWi5LFMTL2XuNKGiMSakofN6V7iIhIPSh42NjVtvPuu\nBg4RWTpKHjaktCEi8abkYTNKGyKSCJQ8bEJpQ0QSiZKHDUTv7qe0ISKJQskjgWmVuIgkKiWPBKVV\n4iKSyJQ8EoxWiYuIHSh5JBDdb0NE7ELJIwEobYiI3Sh5xJnShojYkZJHnChtiIidzZo8vvvd7+J0\nOikqKrLahoaG8Pl85Ofns2HDBoaHh63HGhsb8Xg8FBQU0NnZabV3d3dTVFSEx+Nh3759VvutW7eo\nra3F4/FQUVHBlStXFrNvCUtpQ0TsbtbBY/fu3XR0dExra2pqwufzcfnyZbxeL01NTQD09PTQ1tZG\nT08PHR0d7N27F2MMAA0NDTQ3NxMMBgkGg9ZrNjc3k56eTjAYZP/+/Rw8eDAWfUwYY2Nw4MDkIr8j\nR6CtDTIy4l2ViMjCzTp4/NVf/RVf/epXp7WdOnWK+vp6AOrr6zlx4gQAJ0+epK6ujtTUVHJycsjL\nyyMQCNDf38/o6ChlZWUA7Nq1yzpm6mvV1NRw5syZxe1dAlHaEJHlZMFzHoODgzidTgCcTieDg4MA\nXL16lYqKCut5brebSCRCamoqbrfbane5XEQiEQAikQjZ2dmThaSkkJaWxtDQECtXrpzxvocOHbK+\nr6yspLKycqGlx4XmNkRkqXR1ddHV1bUk73VfE+YOhwOHw7FYtcxq6uBhF+fPw+7dUFw8mTZ0ikpE\nYunuP6wPHz4cs/da8KW6TqeTgYEBAPr7+8nMzAQmE0VfX5/1vHA4jNvtxuVyEQ6HZ7RHj+nt7QVg\nYmKCkZGRz00ddjM2Bs88A088AT/7meY2RGT5WfDgUVVVRUtLCwAtLS1s2bLFavf7/YyPjxMKhQgG\ng5SVlZGVlcWKFSsIBAIYY2htbaW6unrGa7W3t+P1eherX3GjPalEJCmYWTz55JNm1apVJjU11bjd\nbvOLX/zCXLt2zXi9XuPxeIzP5zMff/yx9fwjR46Y3Nxcs3r1atPR0WG1v/3222bt2rUmNzfXPP30\n01b7zZs3zfbt201eXp4pLy83oVDoc+uYo8yEcOOGMT/8oTFZWcb86lfxrkZEJLafnY4/vUFCczgc\nJHKZ585Nzm2UlEzeqEmnqEQkEcTys1MrzO+DrqQSkWSlva3u0d33EtfAISLJRMljgXQvcRERJY8F\nuTttaOAQkWSl5DEPShsiItMpecxBaUNEZCYljy+gK6lERL6YksfniK4S1w64IiKfT8ljCqUNEZH5\nUfL4E91vQ0Rk/pI+eShtiIgsXFInD6UNEZF7k5TJQ2lDROT+JF3yOH9eaUNE5H4lTfJQ2hARWTxJ\nkTy0A66IyOJa1sljbAx+/GP45S+1J5WIyGJatskjmjYiEe1JJSKy2JZd8tAOuCIisbeskod2wBUR\nWRrLInnoSioRkaVl++ShVeIiIkvPtoPH2BgcODB5aurIEWhrg4yMeFf1+bq6uuJdwn2xc/12rh1U\nf7zZvf5YSojBo6Ojg4KCAjweD88+++yczzcGfD77pA27/wLauX471w6qP97sXn8sxX3O4/bt2/zd\n3/0dr7/+Oi6Xi/Xr11NVVcVDDz30hcc4HHD8OHzta0tYqIiIWOKePN566y3y8vLIyckhNTWVJ598\nkpMnT855nAYOEZH4cRhjTDwLaG9v57e//S3/+q//CsDLL79MIBDghRdesJ7jcDjiVZ6IiK3F6iM+\n7qet5jMwxHl8ExGRu8T9tJXL5aKvr8/6ua+vD7fbHceKRERkLnEfPNatW0cwGOSDDz5gfHyctrY2\nqqqq4l2WiIjMIu6nrVJSUnjxxRd57LHHuH37Nnv27Jn1SisREYm/uCcPgMcff5z333+f//3f/+VH\nP/rRtMcWugYkVr773e/idDopKiqy2oaGhvD5fOTn57NhwwaGh4etxxobG/F4PBQUFNDZ2Wm1d3d3\nU1RUhMfjYd++fVb7rVu3qK2txePxUFFRwZUrVxa1/r6+Pr75zW+yZs0a1q5dy/PPP2+rPty8eZPy\n8nJKSkooLCy0fk/sUj9MXpZeWlrK5s2bbVd7Tk4ODz/8MKWlpZSVldmu/uHhYbZt28ZDDz1EYWEh\ngUDANvW///77lJaWWl9paWk8//zz8a/fJLCJiQmTm5trQqGQGR8fN8XFxaanpycutfz+9783Fy5c\nMGvXrrXannnmGfPss88aY4xpamoyBw8eNMYY895775ni4mIzPj5uQqGQyc3NNXfu3DHGGLN+/XoT\nCASMMcY8/vjj5vTp08YYY1566SXT0NBgjDHG7/eb2traRa2/v7/fXLx40RhjzOjoqMnPzzc9PT22\n6sP169eNMcZ89tlnpry83Jw9e9ZW9f/TP/2T+Zu/+RuzefNmY4y9fn9ycnLMtWvXprXZqf5du3aZ\n5uZmY8zk78/w8LCt6o+6ffu2ycrKMr29vXGvP6EHj3PnzpnHHnvM+rmxsdE0NjbGrZ5QKDRt8Fi9\nerUZGBjNVUVnAAAETUlEQVQwxkx+OK9evdoYY8zRo0dNU1OT9bzHHnvMnD9/3ly9etUUFBRY7a+8\n8or5/ve/bz3nP//zP40xk7/cDz74YEz7Ul1dbV577TVb9uH69etm3bp15r//+79tU39fX5/xer3m\nd7/7nfnrv/5rY4y9fn9ycnLMRx99NK3NLvUPDw+br3/96zPa7VL/VL/97W/NX/7lXyZE/Qlx2uqL\nRCIRsrOzrZ/dbjeRSCSOFU03ODiI0+kEwOl0Mjg4CMDVq1enXTEWrfvudpfLZfVnal9TUlJIS0tj\naGgoJnV/8MEHXLx4kfLyclv14c6dO5SUlOB0Oq1TcHapf//+/fz85z/ngQf+/385u9QOk5fUf+tb\n32LdunXWmiy71B8KhcjIyGD37t184xvf4Hvf+x7Xr1+3Tf1T+f1+6urqgPj/+yf04GGnxYEOh8MW\n9X766afU1NTw3HPP8ZWvfGXaY4nehwceeIB33nmHcDjM73//e954441pjydq/b/5zW/IzMyktLT0\nC9csJWrtUW+++SYXL17k9OnTvPTSS5w9e3ba44lc/8TEBBcuXGDv3r1cuHCBL3/5yzQ1NU17TiLX\nHzU+Ps6vf/1rtm/fPuOxeNSf0INHoq8BcTqdDAwMANDf309mZiYws+5wOIzb7cblchEOh2e0R4/p\n7e0FJn/ZR0ZGWLly5aLW+9lnn1FTU8POnTvZsmWLLfsAkJaWxre//W26u7ttUf+5c+c4deoUX//6\n16mrq+N3v/sdO3futEXtUatWrQIgIyODrVu38tZbb9mmfrfbjdvtZv369QBs27aNCxcukJWVZYv6\no06fPs0jjzxCxp+2D4/3v39CDx6JvgakqqqKlpYWAFpaWqwP5KqqKvx+P+Pj44RCIYLBIGVlZWRl\nZbFixQoCgQDGGFpbW6murp7xWu3t7Xi93kWt1RjDnj17KCws5Ac/+IHt+vDRRx9ZV5OMjY3x2muv\nUVpaaov6jx49Sl9fH6FQCL/fz6OPPkpra6stage4ceMGo6OjAFy/fp3Ozk6KiopsU39WVhbZ2dlc\nvnwZgNdff501a9awefNmW9Qf9corr1inrO5+z7jUf1+zN0vg1VdfNfn5+SY3N9ccPXo0bnU8+eST\nZtWqVSY1NdW43W7zi1/8wly7ds14vV7j8XiMz+czH3/8sfX8I0eOmNzcXLN69WrT0dFhtb/99ttm\n7dq1Jjc31zz99NNW+82bN8327dtNXl6eKS8vN6FQaFHrP3v2rHE4HKa4uNiUlJSYkpISc/r0adv0\n4dKlS6a0tNQUFxeboqIi84//+I/GGGOb+qO6urqsq63sUvsf//hHU1xcbIqLi82aNWus/4d2qd8Y\nY9555x2zbt068/DDD5utW7ea4eFhW9X/6aefmvT0dPPJJ59YbfGuP+4bI4qIiP0k9GkrERFJTBo8\nRERkwTR4iIjIgmnwEBGRBdPgISIiC6bBQ0REFuz/AbN6wdoZghAKAAAAAElFTkSuQmCC\n"
199 |      },
200 |      "metadata": {},
201 |      "output_type": "display_data"
202 |     }
203 |    ],
204 |    "source": [
205 |     "counts = array(counts)\n",
206 |     "plot(counts[:,0],counts[:,1])"
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "code",
211 |    "execution_count": null,
212 |    "id": "2f2b2953",
213 |    "metadata": {
214 |     "collapsed": false
215 |    },
216 |    "outputs": [],
217 |    "source": []
218 |   }
219 |  ],
220 |  "metadata": {},
221 |  "nbformat": 4,
222 |  "nbformat_minor": 5
223 | }
224 | 


--------------------------------------------------------------------------------
/HW1 - Regular Expression Parsing.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "4f9f453d",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# HW1 - Regular Expression Parsing"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "markdown",
 13 |    "id": "29688f82",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "Here is a set of strings with balanced parentheses."
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "code",
 21 |    "execution_count": null,
 22 |    "id": "43e994bf",
 23 |    "metadata": {
 24 |     "collapsed": false
 25 |    },
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "yes1 = \"x (a b c) y\"\n",
 29 |     "yes2 = \"((((x))))\"\n",
 30 |     "yes3 = \"a (b (c d) ((e)) f) (g)\""
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "markdown",
 35 |    "id": "d0539d90",
 36 |    "metadata": {},
 37 |    "source": [
 38 |     "Here is a set of strings with unbalanced parentheses."
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "code",
 43 |    "execution_count": null,
 44 |    "id": "4680f0fa",
 45 |    "metadata": {
 46 |     "collapsed": false
 47 |    },
 48 |    "outputs": [],
 49 |    "source": [
 50 |     "no1 = \"(a b c\"\n",
 51 |     "no2 = \"((())))\"\n",
 52 |     "no3 = \"a (b (c d) (e)) f) (g)\""
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "markdown",
 57 |    "id": "1fa04fb8",
 58 |    "metadata": {},
 59 |    "source": [
 60 |     "Write a function `check_balanced` that uses regular expressions\n",
 61 |     "to check whether the parentheses are balanced.\n",
 62 |     "Note that you can't do this with a single regular expression,\n",
 63 |     "you need to write a little loop around it.\n",
 64 |     "Your code structure might differ a little from the\n",
 65 |     "function below, but it shouldn't be much longer."
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": null,
 71 |    "id": "1f7ce8e6",
 72 |    "metadata": {
 73 |     "collapsed": false
 74 |    },
 75 |    "outputs": [],
 76 |    "source": [
 77 |     "import re\n",
 78 |     "def check_balanced(s):\n",
 79 |     "    ...\n",
 80 |     "    while ...:\n",
 81 |     "        ...\n",
 82 |     "        ... = re.subn(...)\n",
 83 |     "        ...\n",
 84 |     "    ..."
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "markdown",
 89 |    "id": "4048dab7",
 90 |    "metadata": {},
 91 |    "source": [
 92 |     "Now show that it works."
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "code",
 97 |    "execution_count": null,
 98 |    "id": "6cf53b8c",
 99 |    "metadata": {
100 |     "collapsed": false
101 |    },
102 |    "outputs": [],
103 |    "source": [
104 |     "print check_balanced(yes1)\n",
105 |     "print check_balanced(yes2)\n",
106 |     "print check_balanced(yes3)"
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "code",
111 |    "execution_count": null,
112 |    "id": "ce592615",
113 |    "metadata": {
114 |     "collapsed": false
115 |    },
116 |    "outputs": [],
117 |    "source": [
118 |     "print check_balanced(no1)\n",
119 |     "print check_balanced(no2)\n",
120 |     "print check_balanced(no3)"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "code",
125 |    "execution_count": null,
126 |    "id": "d8a02e88",
127 |    "metadata": {
128 |     "collapsed": false
129 |    },
130 |    "outputs": [],
131 |    "source": []
132 |   }
133 |  ],
134 |  "metadata": {},
135 |  "nbformat": 4,
136 |  "nbformat_minor": 5
137 | }
138 | 


--------------------------------------------------------------------------------
/HW1 - Reproduce Power Laws.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "2f1bdeea",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# HW1 - Reproduce Power Laws"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "markdown",
 13 |    "id": "5429ce97",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "In class, we talked about how various papers claim that Zipf's law is a general\n",
 17 |     "property of many discrete distributions.\n",
 18 |     "[Wikipedia](http://en.wikipedia.org/wiki/Zipf's_law)\n",
 19 |     "[Mathworld](http://mathworld.wolfram.com/ZipfsLaw.html)\n",
 20 |     "\n",
 21 |     "The original paper by Belevitch examined this question theoretically.\n",
 22 |     "However, if this is true, it should be easy to reproduce experimentally,\n",
 23 |     "namely by picking various discrete distributions, computing word frequency\n",
 24 |     "by rank, and plotting the result.\n",
 25 |     "\n",
 26 |     "For working with the worksheet, remember that you can insert additional\n",
 27 |     "cells, both to add text and explanations, and to add additional code cells.\n",
 28 |     "You can change existing cells if you like; they are just there to help you\n",
 29 |     "get started."
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "code",
 34 |    "execution_count": 21,
 35 |    "id": "baa1d02e",
 36 |    "metadata": {
 37 |     "collapsed": false
 38 |    },
 39 |    "outputs": [],
 40 |    "source": [
 41 |     "from pylab import *\n",
 42 |     "from collections import Counter"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "markdown",
 47 |    "id": "8654f3c8",
 48 |    "metadata": {},
 49 |    "source": [
 50 |     "Start by generating a random sample.\n",
 51 |     "Here is an example of a function that generates a uniform sample.\n",
 52 |     "Obviously, this particular choice of distribution will\n",
 53 |     "not reproduce Zipf's law, so\n",
 54 |     "you need to modify this to try to come up with distributions\n",
 55 |     "that will reproduce Zipf's law."
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "code",
 60 |    "execution_count": 26,
 61 |    "id": "3221154a",
 62 |    "metadata": {
 63 |     "collapsed": false
 64 |    },
 65 |    "outputs": [],
 66 |    "source": [
 67 |     "def generate_sample(nsamples,vocabulary_size):\n",
 68 |     "    return array(rand(nsamples)*vocabulary_size,'i')"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "code",
 73 |    "execution_count": 28,
 74 |    "id": "8b5d2fea",
 75 |    "metadata": {
 76 |     "collapsed": false
 77 |    },
 78 |    "outputs": [],
 79 |    "source": [
 80 |     "data = generate_sample(100000,1000)"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "markdown",
 85 |    "id": "fcf041dc",
 86 |    "metadata": {},
 87 |    "source": [
 88 |     "Compute a histogram."
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "code",
 93 |    "execution_count": 14,
 94 |    "id": "7668e2be",
 95 |    "metadata": {
 96 |     "collapsed": false
 97 |    },
 98 |    "outputs": [],
 99 |    "source": [
100 |     "histogram = Counter(samples)"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "markdown",
105 |    "id": "6bc1d8f0",
106 |    "metadata": {},
107 |    "source": [
108 |     "Compute frequency by rank."
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": 16,
114 |    "id": "9702b0a7",
115 |    "metadata": {
116 |     "collapsed": false
117 |    },
118 |    "outputs": [],
119 |    "source": [
120 |     "frequencies = ..."
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "markdown",
125 |    "id": "504c9406",
126 |    "metadata": {},
127 |    "source": [
128 |     "Plot the result."
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": 19,
134 |    "id": "fb4c1706",
135 |    "metadata": {
136 |     "collapsed": false
137 |    },
138 |    "outputs": [
139 |     {
140 |      "data": {
141 |       "text/plain": [
142 |        "[<matplotlib.lines.Line2D at 0x3f727d0>]"
143 |       ]
144 |      },
145 |      "execution_count": 19,
146 |      "metadata": {},
147 |      "output_type": "execute_result"
148 |     },
149 |     {
150 |      "data": {
151 |       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAW4AAAD9CAYAAACcJ53WAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAGxxJREFUeJzt3XuczfW+x/HXaCiZPbMpZmSmzZnYZsZlBplNYck90Qjl\nfos8TnU6PKqdOvYj225HW3YbOWfvnLKTQkQhRhzNuIWcQQ4yhM1oRodyGzK33/njW6TjMmvNWuu7\nfjPv5+Mxjxhrrd/nUXr7+l4+3zDHcRxERMQ1KtkuQEREvKPgFhFxGQW3iIjLKLhFRFxGwS0i4jIK\nbhERl7lucI8YMYLo6GgaN2586XvffvstnTp1okGDBnTu3JlTp04FvEgREbnsusE9fPhw0tPTr/je\n5MmT6dSpE9nZ2XTo0IHJkycHtEAREblS2I0O4Bw+fJgePXqwa9cuABo2bEhmZibR0dHk5eXh8Xj4\n8ssvg1KsiIhAuLdvOH78ONHR0QBER0dz/Pjx//easLCwslcmIlIBleYwe5kWJ8PCwq4Z0o7juPbr\nxRdftF6D6rdfR0WrXfXb/yotr4P7xykSgNzcXGrVquXtR4iISBl4Hdw9e/bk7bffBuDtt98mLS3N\n70WJiMi1XTe4+/fvT+vWrdm3bx9xcXHMnj2bcePGsXr1aho0aMDatWsZN27cVd978WJA6g0Kj8dj\nu4QyUf32uLl2UP1uccNdJT59aFgYDzzgsGQJhHu9/CkiUjGFhYWVaq47YCcnCwvh8cfB/38siIhU\nbAEL7kWLICsLJk4M1BNERCqmgE1kRETAxx9D69Zwxx0walSgniQiUrEEdAY6OhrS06FtW4iJgR49\nAvk0EZGKIeDdAevXh48+gkcfhc2bA/00EZHyLyhtXVu2hL//HdLSYN++YDxRRKT8Clo/7vvvh0mT\noFs3yM0N1lNFRMqfoO6yHj4cjh0zIZ6ZCZGRwXy6iEj5ELADONf6WMcx+7v374cVK6BKFX8/XUTE\nnUp7ACfowQ1QXAx9+kC1ajBnDlTSBWoiIvZPTl7PTTfBe+/BoUNwjVYnIiJyDdbGulWrwrJl5mva\nNFtViIi4j9UWUDVqmAM6994LtWvDww/brEZExB2s9+771a/M0fiOHaFmTWjf3nZFIiKhLSSWBZs0\ngQUL4JFH4Ic7iUVE5BpCIrjBjLSnTzd7vI8csV2NiEjosj5V8lP9+plTlV27woYNZg5cRESuZGUf\n940884xpSLV6tdl9IiJSEYT0AZwbKSmBwYPhwgVYuNDs+xYRKe9C+gDOjVSqBLNnw5kz8NRTuv5M\nROSnQjK4wfQwWbwYNm40XQVFRMQIqcXJn4uMhJUrL19/NmyY7YpEROwL6eAGc6IyPR3atTNXoXXr\nZrsiERG7Qnaq5Kd+/WtYsgSGDoXPP7ddjYiIXa4IboBWreA//xMefBAOHLBdjYiIPSE/VfJTPXtC\nXp45oLNpE9SqZbsiEZHgc82I+0ePPQYDB0L37nDunO1qRESCLyQP4NyI45gAz8mBpUuhcuWAPUpE\nJGhcfXKyNIqKIC0Nbr/dHNYJCwvo40REAs7VJydLIzzctILduxfGj7ddjYhI8Lg2uMFcNrx8ueln\n8u//brsaEZHgcNWukqupWRNWrTLXn8XEwEMP2a5IRCSwXB/cAPXqmZF3ly5mi+C999quSEQkcFw9\nVfJTKSnw7rvQuzfs2WO7GhGRwCk3wQ3QqRNMnWr6meTk2K5GRCQwfA7uSZMmkZSUROPGjRkwYAAX\nL170Z10+GzQInnjChPepU7arERHxP5+C+/Dhw8yaNYusrCx27dpFcXEx8+fP93dtPnv2WbjvPmjT\nBj74AIqLbVckIuI/PgV3ZGQklStX5vz58xQVFXH+/Hnq1Knj79p8FhYGr70GEyfClCmQkACzZkGI\n/KVARKRMfNpVUqNGDZ5++mnuvPNOqlatSpcuXejYseMVr5kwYcKlH3s8HjweT1nq9FqlStCrlzld\nuW4dvPIKTJgA//qvMHo0REUFtRwRkf8nIyODjIwMr9/n05H3r776ih49erB+/XqioqLo27cvffr0\nYeDAgeZDg3Dk3Rc7d8Kf/mQuZhg1yoR47dq2qxIRMQJ65H3btm20bt2a2267jfDwcB566CE2bdrk\ny0cFVdOmZsvgtm2Qnw+JiaZZ1f79tisTESk9n4K7YcOGbN68mQsXLuA4DmvWrCExMdHftQVMvXow\nYwZkZ5vTlq1bQ9++JtBFREKdT8HdtGlThgwZQosWLWjSpAkAjz32mF8LC4aaNc0C5qFDcM895rh8\nhw6werVpHSsiEopc29Y1EAoLYd48s5B5883w3HPmJGZ4uWgMICKhrtz34w6kkhL4+GMT4Lm58Mwz\nMGwYVK1quzIRKc8U3H6yYYMJ8M8/h379zFdqqi5uEBH/U3D7WXa2mUaZPx8uXIBHHjFfKSkKcRHx\nDwV3gDgOfPGFuX1n/nwz/92vnwnxpCTb1YmImym4g8BxzBbC+fNNkFevfnkkXr++7epExG0U3EFW\nUgKbNpkQX7gQ4uIuh/idd9quTkTcQMFtUVERZGaaEF+82LSYfeEFc1JTRORayv0t76EsPNwc5Jk1\nCw4eNIHdvj306QPbt9uuTkTcTsEdYFFRZrR98KA5nfnAA+Zr82bblYmIWym4g6RaNRg7Fr76Crp3\nN3PfHTtCRoaO14uIdzTHbUlhIcydC5MmmZvpx483t9RrT7hIxaXFSZcoLja7UP74R7Oo+fDD5kt7\nwkUqHgW3yzgObNliQvz99yEy0gR4377ajSJSUSi4XaykxIT4++/DokVmgfPHEE9IsF2diASKgruc\nKCkxO1B+HIknJsIf/gC/+Y3tykTE3xTc5VBhIfz97/DSS9CokbkEonlz21WJiL/oAE45VLmyueQ4\nOxvuvx969jQ32X/xhe3KRCSYFNwudPPN8MQTcOAAtG0LnTubfeE7d9quTESCQVMl5UB+Prz+uvmq\nXh0GDID+/eFXv7JdmYh4Q3PcFVBJibmx5733zG6UhAQT4n37wu23265ORG5EwV3BFRTAJ5+YEF++\nHP7pn6BVK2jd2vwzPl6nNEVCjYJbLikoMF0JP/vM9Az/7DO4eBHS0sx9mtWr265QREDBLTdw9ChM\nmWL6hf/1r6ZjoYjYpeCWUsnMhBEj4N574S9/0ehbxCbt45ZSadfO7AOPjDSHet55B06ftl2ViFyP\nRtxySWammfPesAGaNTOHfO6/3xyzr6Q/4kUCTlMl4rPz5+HTT2HFCli5Ek6cMKPxJk2gcWNITYWU\nFLjpJtuVipQvCm7xm2+/hV27zNcXX8DGjZCba6ZZ7rsPhg2DX/zCdpUi7qfgloDKzTWj8iVLTKAv\nWmRG5SLiOwW3BM2cOfD00zB1KgwZYrsaEfdScEtQ/c//QO/e0LAhNG0Kd9wBsbHQvr25KFlEbkzB\nLUF39qy57CEnB77+2nQvzMoyrWeHDoV77oHwcNtVioQuBbeEhNxcePddc6P9wYPm5p527Uxb2l/+\n0nZ1IqFFwS0h5+RJ0ytlyRLTAGvmTHjwQdtViYQOBbeEtMxMGDnS9AyPj4caNaBlS+jeHapUsV2d\niB0Kbgl5Fy7Axx+bAz4nTphR+N69pof4mDFQr57tCkWCK+DBferUKUaOHMnu3bsJCwvjrbfe4jc/\nXD2u4BZfHTwIb7wBs2ZBly7mcI/Ho1G4VAwBD+6hQ4fSrl07RowYQVFREfn5+URFRXn1cJFrOX0a\n3nzTHOzZuxeGD4fx482Uikh5FdDgPn36NCkpKRw8eLBMDxcpjWPH4KWX4IMP4PnnzY4UjcClPApo\ncO/YsYPRo0eTmJjIzp07ad68OdOmTePWW2+99PAXX3zx0us9Hg8ej8fbx4hcYfduePZZyM42UyhR\nUeZwT0SE2SNep47tCkW8k5GRQUZGxqWf//73vw9ccG/bto1WrVqxadMm7r77bsaMGUNkZCQTJ040\nH6oRtwTQmjWwerW53T4/H86cMX1TBgww3QujoqBPHx32EfcJ6Ig7Ly+PVq1acejQIQA2bNjA5MmT\nWb58uVcPF/GX48fNvvDjx83IvLAQZs82vcRF3KK02enTmCQmJoa4uDiys7Np0KABa9asISkpyZeP\nEvGL6Gj44S98lJSYezTbtYOuXaFHD7NXPDIS7rwTbr7Zbq0iZeXzrpKdO3cycuRICgoKiI+PZ/bs\n2dpVIiHl9GmztXDzZvjqKzOlcvw4JCebOfGRI6FBA9tVilymAzgiV3H2LGzdCmvXwt/+BoMHw6RJ\ncMsttisTUXCL3NDJkzB6tNknPnq0aUcbHQ21a5sFTpFgU3CLlILjQHo6LFwI+/fDN9+YlrSVK8O9\n95rLkxMSbFcpFYWCW8RHjmMCfMEC+OMfzQURgwZB69a2K5PyrrTZWSkItYi4SliYmTJ56inTBKtu\nXUhLg/XrbVcmYmjELVIKS5eaIG/YEN57Tz1TJDA04hbxo549zZbCxo3NfZrx8WaP+F/+YvaNiwST\nRtwiXrp4Ef7xD9i3D159FQ4dMic0u3c3YV6/vu0Kxa20OCkSBMXFZjdKVhZkZMDixTBlimlDK+It\nBbeIBfv2QbduZmdKYqK5HPnJJ6F6dduViRsouEUsKSoy0yd795pFzQ8/NHPkLVrAr38Nd99t+qaI\n/JyCWyREHDxo7tP87/82I/Ldu82FEGPGqPWsXEnBLRKiDhyAf/5nc+T+jTfMSFwEFNwiIc1xYO5c\nc6NP27bQqhW0aQOxsebwT1iY7QrFBgW3iAucOAEffWRaz27ZArm5plPhQw+ZY/ZJSfDDjYBSASi4\nRVxq1y6YMwdWrYLvvoO33oKOHTUKrwgU3CLlwNKl8NxzZhTu8UCHDma74U032a5MAkHBLVJOFBfD\nunXmQuRVqyAnxzS9+u1vIS4OKqlxRbmh4BYpp3buNHdqLlxo9oXPmWN6p4j7KbhFyrmSEnPRw5//\nDC1bmt0pjzxiRuGaSnEnBbdIBXHihOmTsno1fPABnDtnTmp262YaXt1zjxY23ULBLVJB5eTAihWQ\nmQnbt5uFzZdfNkftb7vNdnVyPQpuEcFx4E9/Mjf5bNt2eVEzOdl2ZXI1Cm4RucKRI/D++zB1qula\nmJgIw4apf3goUXCLyFXl5cHy5WYa5Y03zHVsyclQpw5MnAhVqtiusOJScIvIDeXnw44d8OWXsGSJ\nudnn6adh6FAtaNqg4BYRrzgOLFsGf/iDaUXbujU8+CA88ADExNiurmJQcIuIT0pKzMXI69ebDobb\ntpmr2O6/Hzp31kg8kBTcIuIXmzaZnilLlsCFC+a0Zo8e8PDDGon7m4JbRPzqx9vtMzNh+nRzIUSj\nRqZz4bBh0KCBRuNlpeAWkYA6d85cybZwodknHhMDo0bBY49BVJTt6txJwS0iQVNQYML7z3+GDRtM\n35S0NOjaFRISbFfnHqXNTjWEFJEyq1IFevUyC5p5eTB4MPzXf5lDPr/9rZkbF//RiFtEAmbnThPi\nu3ebezWbNoVmzcw2w9tvt11d6NFUiYiEjH37YP9+2LjRjMQ//9wE+MMPm1t9UlLUihYU3CISwnJz\nIT0d3nsPsrPNlEr37mZU3qxZxd2douAWEVc4cQJmzYKVK82USqVKMGaM2aFSq5bt6oIr4MFdXFxM\nixYtiI2NZdmyZT49XETkpwoKYPFieO012LoVkpIgNRWefdbc7FOtmu0KAyvgu0qmTZtGYmIiYRX1\n7zQi4ndVqkC/frBliznsM20afPONuZotIsJczbZ5s+0q7fMpuHNyclixYgUjR47UyFpEAuLOO83C\n5bJlcOaM6WIYEWHmwWvXhpdeguPHbVdpR7gvbxo7dixTpkzhzJkz13zNhAkTLv3Y4/Hg8Xh8eZSI\nCGC2Er75JsycCfPnm2P3v/sdtGljrmT73e/MwqabZGRkkJGR4fX7vJ7jXr58OStXrmTmzJlkZGQw\ndepUzXGLiBX798Phw+ZCiEWLYNAgMz/u1j3iAVucfOGFF3jnnXcIDw/n+++/58yZM/Tu3Zs5c+Z4\n/XAREX/59FP4l38xO1M6dzbH7ocPN9MqblmKC8p2wMzMTF599VWNuEUkZHzxBaxdC3PmmOvZHnjA\n9FC5667QD/Cg9SrRrhIRCSVNmph94FlZZo/4mTNmW2HLlma/+LlztissOx3AEZFy73//F95+29xw\nX1hoplKaN4fevSEyEmrUsF2hoZOTIiI/U1JithceO2YWMQsK4MgReOIJGDfOnNS0ecu9gltEpBTW\nrIFXX4VVq8zP/+3fzE331asHvxYFt4iIlxYvhueeM7fc//KX5jafJ5+EOnWC83wFt4iIDy5eNHPi\nn34KL74I330HHo+5yWfcOLjllsBNpyi4RUTK6Px5WL0aiorMTpXvvoM77oCxYyE8HIYMgZtv9t/z\nFNwiIn5WVATPP2+2FKanmxOaDz4IzzxjRuJlpeAWEQmgf/wDtm0z8+COY3qo3Hor9Ozp+0EfBbeI\nSJA8+yzk5JjLILp1g5EjTWdDbym4RUSCbN06mDfPXMkWEwPvvmu6GlauXLr3B+3Iu4iIGG3bwuuv\nm+P27dpB+/ZmDnzPHv8+R8EtIuJHN90E8fGm1eyXX5rTmW3bmqZX/pqIUHCLiARInTpmO+Hjj8NT\nT5mr2PxBc9wiIkHw4YfQvz8MGwYzZph94D+nOW4RkRCSlmZu6VmwAKZMKdtnacQtIhJE//EfZupk\n+3ZITr7y17QdUEQkBBUVQadOZovgJ59c+WsKbhGRELVrl7mp55tvoGbNy9/XHLeISIhq1AgaNDBz\n3r5QcIuIBFlYmLnE+LPPfHu/gltExILWrWHHDt/eq+AWEbEgPh4q+ZjACm4REQtuuQUuXPDtvQpu\nERELqlZVcIuIuIqCW0TEZRTcIiIuEx4OxcW+vVfBLSLiMgpuERGXUXCLiLiMgltExBJfe/EpuEVE\nLAgL8/29Cm4REZdRcIuIuIyCW0TEZRTcIiKWBHVx8ujRo7Rv356kpCQaNWrE9OnTfXu6iEgFVZbF\nSZ/unMzLyyMvL4/k5GTOnTtH8+bN+fDDD0lISPihIN05KSJyPRcvQmSk+eePAnrnZExMDMk/3Csf\nERFBQkICX3/9tS8fJSIiXgov6wccPnyY7du3k5qaesX3J0yYcOnHHo8Hj8dT1keJiJQrJSUZTJiQ\n4fX7fJoq+dG5c+fweDyMHz+etLS0yx+qqRIRkeu6eBF+8QsoKLj8vYBOlQAUFhbSu3dvBg0adEVo\ni4jIjQV9cdJxHIYOHcptt93Ga6+9dpWCNOIWEbmeggKIiPBtxO1TcG/YsIG2bdvSpEkTwn74Y2PS\npEl07drVq4eLiFRUQQ/uG36ogltE5LrKEtw6OSkiYonauoqIuIjauoqIVCAKbhERl1Fwi4i4jIJb\nRMQSLU6KiLiIFidFRCoQBbeIiMsouEVELNEct4iIi2iOW0SkAlFwi4i4jIJbRMRlFNwiIpZocVJE\nxEW0OCkiUoEouEVEXEbBLSLiMgpuERFLtDgpIuIiWpwUEalAFNwiIi6j4BYRcRkFt4iIyyi4RUQs\n0OKkiEgFouAWEXEZBbeIiMsouEVELPLl9KSCW0TEZRTcIiIuo+AWEXEZBbeIiMsouEVELNLipJ9k\nZGTYLqFMVL89bq4dVH+w+Xp60ufgTk9Pp2HDhtSvX59XXnnF148JSW77j/9zqt8eN9cOqt8tfAru\n4uJinnzySdLT09mzZw/z5s1j7969/q5NRESuwqfg3rp1K3fddRd169alcuXK9OvXj48++sjftYmI\nlHu+zHGHOY73b1u0aBGrVq1i1qxZAMydO5ctW7YwY8YM86FlaXslIlKBlSaSw3354BsFsw9/FoiI\nSCn5NFVSp04djh49eunnR48eJTY21m9FiYjItfkU3C1atGD//v0cPnyYgoICFixYQM+ePf1dm4iI\nXIVPUyXh4eG8/vrrdOnSheLiYh599FESEhL8XZuIiFyFz/u4u3Xrxr59+zhw4ADPP//8pe+7eX/3\niBEjiI6OpnHjxrZL8cnRo0dp3749SUlJNGrUiOnTp9suySvff/89qampJCcnk5iYeMXvK7coLi4m\nJSWFHj162C7Fa3Xr1qVJkyakpKTQsmVL2+V47dSpU/Tp04eEhAQSExPZvHmz7ZJKbd++faSkpFz6\nioqKuv7/v44fFRUVOfHx8c6hQ4ecgoICp2nTps6ePXv8+YiAWrdunZOVleU0atTIdik+yc3NdbZv\n3+44juOcPXvWadCggav+/TuO4+Tn5zuO4ziFhYVOamqqs379essVeWfq1KnOgAEDnB49etguxWt1\n69Z1Tp48absMnw0ZMsR58803Hccxv39OnTpluSLfFBcXOzExMc6RI0eu+Rq/Hnl3+/7uNm3aUL16\nddtl+CwmJobk5GQAIiIiSEhI4Ouvv7ZclXduvfVWAAoKCiguLqZGjRqWKyq9nJwcVqxYwciRI127\ns8qtdZ8+fZr169czYsQIwEznRkVFWa7KN2vWrCE+Pp64uLhrvsavwX3s2LErHhYbG8uxY8f8+Qgp\npcOHD7N9+3ZSU1Ntl+KVkpISkpOTiY6Opn379iQmJtouqdTGjh3LlClTqFTJnS2AwsLC6NixIy1a\ntLh0RsMtDh06RM2aNRk+fDjNmjVj1KhRnD9/3nZZPpk/fz4DBgy47mv8+jtMB29Cw7lz5+jTpw/T\npk0jIiLCdjleqVSpEjt27CAnJ4d169a5pvfE8uXLqVWrFikpKa4dtW7cuJHt27ezcuVKZs6cyfr1\n622XVGpFRUVkZWXx+OOPk5WVRbVq1Zg8ebLtsrxWUFDAsmXL6Nu373Vf59fg1v5u+woLC+nduzeD\nBg0iLS3Ndjk+i4qKonv37mzbts12KaWyadMmli5dSr169ejfvz9r165lyJAhtsvySu3atQGoWbMm\nvXr1YuvWrZYrKr3Y2FhiY2O5++67AejTpw9ZWVmWq/LeypUrad68OTVr1rzu6/wa3NrfbZfjODz6\n6KMkJiYyZswY2+V47cSJE5w6dQqACxcusHr1alJSUixXVTovv/wyR48e5dChQ8yfP5/77ruPOXPm\n2C6r1M6fP8/Zs2cByM/P55NPPnHV7qqYmBji4uLIzs4GzDxxUlKS5aq8N2/ePPr373/D1/m0j/ua\nH+by/d39+/cnMzOTkydPEhcXx8SJExk+fLjtskpt48aNzJ0799KWLoBJkybRtWtXy5WVTm5uLkOH\nDqWkpISSkhIGDx5Mhw4dbJflE7dNGx4/fpxevXoBZtph4MCBdO7c2XJV3pkxYwYDBw6koKCA+Ph4\nZs+ebbskr+Tn57NmzZpSrS/41GRKRETscefyt4hIBabgFhFxGQW3iIjLKLhFRFxGwS0i4jIKbhER\nl/k/0reyxz2zb7MAAAAASUVORK5CYII=\n"
152 |      },
153 |      "metadata": {},
154 |      "output_type": "display_data"
155 |     }
156 |    ],
157 |    "source": [
158 |     "plot(log(1+arange(len(frequencies))),log(frequencies))"
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "markdown",
163 |    "id": "b41ac5c1",
164 |    "metadata": {},
165 |    "source": [
166 |     "Now wrap this up as a function."
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": null,
172 |    "id": "f1c0ab77",
173 |    "metadata": {
174 |     "collapsed": false
175 |    },
176 |    "outputs": [],
177 |    "source": [
178 |     "def frequency_by_rank_plot(samples):\n",
179 |     "    ..."
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "markdown",
184 |    "id": "bec889a9",
185 |    "metadata": {},
186 |    "source": [
187 |     "Now define multiple distributions and plot their frequency ranks.\n",
188 |     "Which ones give rise to power laws?"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "code",
193 |    "execution_count": null,
194 |    "id": "d00e4a4e",
195 |    "metadata": {
196 |     "collapsed": false
197 |    },
198 |    "outputs": [],
199 |    "source": [
200 |     "def distribution1(n,q):\n",
201 |     "    ...\n",
202 |     "frequency_by_rank_plot(distribution1(100000,1000))"
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "code",
207 |    "execution_count": null,
208 |    "id": "8c57b066",
209 |    "metadata": {
210 |     "collapsed": false
211 |    },
212 |    "outputs": [],
213 |    "source": [
214 |     "def distribution2(n,q):\n",
215 |     "    ...\n",
216 |     "frequency_by_rank_plot(distribution2(100000,1000))"
217 |    ]
218 |   }
219 |  ],
220 |  "metadata": {},
221 |  "nbformat": 4,
222 |  "nbformat_minor": 5
223 | }
224 | 


--------------------------------------------------------------------------------
/HW2 - Classifier-Based Tagging.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "id": "f1b129e9",
  7 |    "metadata": {
  8 |     "collapsed": false
  9 |    },
 10 |    "outputs": [],
 11 |    "source": [
 12 |     "import nltk\n",
 13 |     "import tagutils; reload(tagutils)\n",
 14 |     "from tagutils import *\n",
 15 |     "from IPython.core.display import HTML\n",
 16 |     "from nltk.corpus import brown\n",
 17 |     "import random as pyrand\n",
 18 |     "from tagutils import *"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "markdown",
 23 |    "id": "367e4acf",
 24 |    "metadata": {},
 25 |    "source": [
 26 |     "# Evaluation Framework"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "code",
 31 |    "execution_count": 5,
 32 |    "id": "ce49e28b",
 33 |    "metadata": {
 34 |     "collapsed": false
 35 |    },
 36 |    "outputs": [],
 37 |    "source": [
 38 |     "sents = list(brown.tagged_sents())\n",
 39 |     "n = len(sents)\n",
 40 |     "test = sorted(list(set(range(0,n,10))))\n",
 41 |     "training = sorted(list(set(range(n))-set(test)))\n",
 42 |     "training_set = [sents[i] for i in training]\n",
 43 |     "test_set = [sents[i] for i in test]"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": 7,
 49 |    "id": "41dd57a9",
 50 |    "metadata": {
 51 |     "collapsed": false
 52 |    },
 53 |    "outputs": [
 54 |     {
 55 |      "name": "stdout",
 56 |      "output_type": "stream",
 57 |      "text": [
 58 |       "51606\n",
 59 |       "5734\n"
 60 |      ]
 61 |     }
 62 |    ],
 63 |    "source": [
 64 |     "print len(training_set)\n",
 65 |     "print len(test_set)"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": 6,
 71 |    "id": "4ff74a64",
 72 |    "metadata": {
 73 |     "collapsed": false
 74 |    },
 75 |    "outputs": [
 76 |     {
 77 |      "data": {
 78 |       "text/plain": [
 79 |        "0.9236947791164659"
 80 |       ]
 81 |      },
 82 |      "execution_count": 6,
 83 |      "metadata": {},
 84 |      "output_type": "execute_result"
 85 |     }
 86 |    ],
 87 |    "source": [
 88 |     "t0 = nltk.DefaultTagger('NN')\n",
 89 |     "t1 = nltk.UnigramTagger(training_set, backoff=t0)\n",
 90 |     "t2 = nltk.BigramTagger(training_set, backoff=t1)\n",
 91 |     "t2.evaluate(test_set)"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "markdown",
 96 |    "id": "1914cd9d",
 97 |    "metadata": {},
 98 |    "source": [
 99 |     "# Classifier-Based Tagging"
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "code",
104 |    "execution_count": 14,
105 |    "id": "bdae973b",
106 |    "metadata": {
107 |     "collapsed": false
108 |    },
109 |    "outputs": [],
110 |    "source": [
111 |     "import nltk.tag.api\n",
112 |     "# help(nltk.tag.api.TaggerI)"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "markdown",
117 |    "id": "452f7133",
118 |    "metadata": {},
119 |    "source": [
120 |     "Implement a new tagger based on classifiers.\n",
121 |     "\n",
122 |     "When applying a classifier, you need to transform the input into a feature vector.\n",
123 |     "In this case, we are trying to predict $P(t_n| < \\hbox{input words} > )$.  How do we do this?\n",
124 |     "\n",
125 |     "For a simple unigram tagger, we are estimating $P(t_n | w_n)$.\n",
126 |     "If $w_n \\in V = \\\\{1,...,N\\\\}$, where $V$ is a vocabulary of size $N$ representing each word\n",
127 |     "as an integer, then the input feature vector might be a binary vector $\\vec{x} = (x_1 ... x_N)$ where\n",
128 |     "\n",
129 |     "$$ x_i = \\delta_{i,w_n} $$\n",
130 |     "\n",
131 |     "For a simple bigram tagger, we are estimating something like $P(t_n | w_n t_{n-1})$, which\n",
132 |     "we could similarly represent as a concatenation of two large binary input vectors.\n",
133 |     "\n",
134 |     "However, such a brute force approach may not work very well because we have a very high\n",
135 |     "dimensional input vector and classifiers often need a lot of training data.\n",
136 |     "We are free to preprocess the data in any form we like in order to get better feature\n",
137 |     "vectors. \n",
138 |     "\n",
139 |     "Here are some ideas:\n",
140 |     "\n",
141 |     "- use the posterior probabilities for tags returned by a unigram and bigram tagger as feature vectors\n",
142 |     "- use possible grammatical categories and semantic categories from Wordnet as feature vectors\n",
143 |     "- use simple features like capitalization, word length, and position in sentence\n",
144 |     "- provide information about word frequency in input\n",
145 |     "- \"hash\" the large range of possible words $V$ down to a much smaller vocabulary\n",
146 |     "- same as before, but do the hasing somewhat more intelligently: leave all the stop words alone, but has down the content words\n",
147 |     "- do the \"hashing\" in some way that's informed by Wordnet\n",
148 |     "\n",
149 |     "Note that in order to be able to tag using the algorithms we have described, you can use tags assigned to previous words, but you cannot use tags assigned to subsequent words.\n",
150 |     "\n",
151 |     "Try to beat the bigram-with-backoff tagger above, using the same evaluation paradigm.\n",
152 |     "Your tagger should implement the standard NLTK tagging API.\n",
153 |     "\n",
154 |     "Two classifiers to try are logistic regression and decision tree classifiers.\n",
155 |     "You can use implementations from the `sklearn` package."
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": null,
161 |    "id": "cb74b2b4",
162 |    "metadata": {
163 |     "collapsed": false
164 |    },
165 |    "outputs": [],
166 |    "source": []
167 |   }
168 |  ],
169 |  "metadata": {},
170 |  "nbformat": 4,
171 |  "nbformat_minor": 5
172 | }
173 | 


--------------------------------------------------------------------------------
/HW2 - Improve Tagging with Wordnet.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "id": "0a68f3d1",
  7 |    "metadata": {
  8 |     "collapsed": false
  9 |    },
 10 |    "outputs": [],
 11 |    "source": [
 12 |     "import nltk\n",
 13 |     "import tagutils; reload(tagutils)\n",
 14 |     "from tagutils import *\n",
 15 |     "from IPython.core.display import HTML\n",
 16 |     "from nltk.corpus import brown\n",
 17 |     "import random as pyrand\n",
 18 |     "from tagutils import *"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "markdown",
 23 |    "id": "0f36a27e",
 24 |    "metadata": {},
 25 |    "source": [
 26 |     "# Evaluation Framework"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "code",
 31 |    "execution_count": 5,
 32 |    "id": "5325e07c",
 33 |    "metadata": {
 34 |     "collapsed": false
 35 |    },
 36 |    "outputs": [],
 37 |    "source": [
 38 |     "sents = list(brown.tagged_sents())\n",
 39 |     "n = len(sents)\n",
 40 |     "test = sorted(list(set(range(0,n,10))))\n",
 41 |     "training = sorted(list(set(range(n))-set(test)))\n",
 42 |     "training_set = [sents[i] for i in training]\n",
 43 |     "test_set = [sents[i] for i in test]"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": 7,
 49 |    "id": "3ee2d098",
 50 |    "metadata": {
 51 |     "collapsed": false
 52 |    },
 53 |    "outputs": [
 54 |     {
 55 |      "name": "stdout",
 56 |      "output_type": "stream",
 57 |      "text": [
 58 |       "51606\n",
 59 |       "5734\n"
 60 |      ]
 61 |     }
 62 |    ],
 63 |    "source": [
 64 |     "print len(training_set)\n",
 65 |     "print len(test_set)"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": 6,
 71 |    "id": "290c50e0",
 72 |    "metadata": {
 73 |     "collapsed": false
 74 |    },
 75 |    "outputs": [
 76 |     {
 77 |      "data": {
 78 |       "text/plain": [
 79 |        "0.9236947791164659"
 80 |       ]
 81 |      },
 82 |      "execution_count": 6,
 83 |      "metadata": {},
 84 |      "output_type": "execute_result"
 85 |     }
 86 |    ],
 87 |    "source": [
 88 |     "t0 = nltk.DefaultTagger('NN')\n",
 89 |     "t1 = nltk.UnigramTagger(training_set, backoff=t0)\n",
 90 |     "t2 = nltk.BigramTagger(training_set, backoff=t1)\n",
 91 |     "t2.evaluate(test_set)"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "markdown",
 96 |    "id": "e909843a",
 97 |    "metadata": {},
 98 |    "source": [
 99 |     "# Wordnet-Based Improvements"
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "code",
104 |    "execution_count": 14,
105 |    "id": "79bafa77",
106 |    "metadata": {
107 |     "collapsed": false
108 |    },
109 |    "outputs": [],
110 |    "source": [
111 |     "import nltk.tag.api\n",
112 |     "# help(nltk.tag.api.TaggerI)"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "markdown",
117 |    "id": "b913049e",
118 |    "metadata": {},
119 |    "source": [
120 |     "Your homework consists of implementing new taggers based on Wordnet.\n",
121 |     "With regular taggers, we have a problem of sparsity; that is, we don't know\n",
122 |     "what tag to assign to a word if we have never seen it in a context.\n",
123 |     "\n",
124 |     "However, for many words, Wordnet may give us useful information to help with\n",
125 |     "tagging. You need to work out some ideas, implement them, and test them.\n",
126 |     "\n",
127 |     "There are different implementation strategies, but a simple one might be:\n",
128 |     "\n",
129 |     "- write classes that map token sequences to other token sequences using\n",
130 |     "  WordNet; for example, you might map an input sentence to some collection\n",
131 |     "  of hyponyms\n",
132 |     "- then, apply the regular NLTK n-gram taggers to the modified output sequences\n",
133 |     "- use backoff (as above) when the WordNet mapping fails for some reason\n",
134 |     "  (you can't find the word, or maybe the mapping would be ambiguous and\n",
135 |     "  you don't know how to handle it)\n",
136 |     "\n",
137 |     "This may not be the best strategy, but it's a good way of getting started.\n",
138 |     "\n",
139 |     "Another strategy is to use use WordNet to generate a cloud of related words\n",
140 |     "around a given word, and then see whether you can find bigrams in an existing\n",
141 |     "model for any of the related words.\n",
142 |     "\n",
143 |     "Implement your model(s) so that they conform to the NLTK tagging APIs,\n",
144 |     "perform evaluations on the training and test sets defined above,\n",
145 |     "and be ready to present your results (idea, evaluation, results) in \n",
146 |     "the exercises.\n",
147 |     "\n"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": null,
153 |    "id": "49438688",
154 |    "metadata": {
155 |     "collapsed": false
156 |    },
157 |    "outputs": [],
158 |    "source": []
159 |   }
160 |  ],
161 |  "metadata": {},
162 |  "nbformat": 4,
163 |  "nbformat_minor": 5
164 | }
165 | 


--------------------------------------------------------------------------------
/HW3 - HMMs and FSTs.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "c2cfa1b2",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Transform HMMs into FSTs"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "markdown",
 13 |    "id": "5bbd63a2",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "Assume you are given a Hidden Markov Model (HMM) described\n",
 17 |     "by a transition matrix $A$ and an emission matrix $B$,\n",
 18 |     "with $n$ states and $m$ symbols.\n",
 19 |     "Assume state $0$ is the start state.\n",
 20 |     "\n",
 21 |     "Write a function that converts the Hidden Markov Model into an\n",
 22 |     "equivalent FST and demonstrate that the two models give equivalent results.\n"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": null,
 28 |    "id": "1308396d",
 29 |    "metadata": {
 30 |     "collapsed": false
 31 |    },
 32 |    "outputs": [],
 33 |    "source": [
 34 |     "def hmm_to_fst(A,B):\n",
 35 |     "    \"\"\"Convert an HMM to an equivalent FST. State 0\n",
 36 |     "    is always the start state, and state n-1 is always\n",
 37 |     "    the accept state.\"\"\"\n",
 38 |     "    n,n1 = A.shape\n",
 39 |     "    assert n==n1\n",
 40 |     "    m,n2 = B.shape\n",
 41 |     "    assert n==n2\n",
 42 |     "    if accept is None: accept_state = n-1\n",
 43 |     "    raise Exception(\"IMPLEMENT ME\")\n",
 44 |     "    return fst"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "markdown",
 49 |    "id": "595857b4",
 50 |    "metadata": {},
 51 |    "source": [
 52 |     "For the following, just call the OpenFST functions."
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "code",
 57 |    "execution_count": 3,
 58 |    "id": "5a4dabef",
 59 |    "metadata": {
 60 |     "collapsed": false
 61 |    },
 62 |    "outputs": [],
 63 |    "source": [
 64 |     "def fst_log_probability(fst,s):\n",
 65 |     "    \"\"\"Find the lowest cost path corresponding to the string `s`\n",
 66 |     "    through `fst`. The string `s` is given as a 1D numpy array.\"\"\"\n",
 67 |     "    raise Exception(\"IMPLEMENT ME\")\n",
 68 |     "    return log_p"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "markdown",
 73 |    "id": "589aef5c",
 74 |    "metadata": {},
 75 |    "source": [
 76 |     "For the following, you can use the Viterbi algorithm from lecture as a starting point."
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": 4,
 82 |    "id": "2e099820",
 83 |    "metadata": {
 84 |     "collapsed": false
 85 |    },
 86 |    "outputs": [],
 87 |    "source": [
 88 |     "def hmm_log_probability(A,B,s):\n",
 89 |     "    \"\"\"Find the lowest cost path through the HMM corresponding\n",
 90 |     "    to the given string `s`.\"\"\"\n",
 91 |     "    raise Exception(\"IMPLEMENT ME\")\n",
 92 |     "    return log_p"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "markdown",
 97 |    "id": "f6147853",
 98 |    "metadata": {},
 99 |    "source": [
100 |     "Now write some unit tests."
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "code",
105 |    "execution_count": null,
106 |    "id": "39258b72",
107 |    "metadata": {
108 |     "collapsed": false
109 |    },
110 |    "outputs": [],
111 |    "source": [
112 |     "A = ones((1,1))\n",
113 |     "B = ones((1,1))\n",
114 |     "assert abs(hmm_log_probability(A,B,zeros(10,'i'))) < 1e-4\n",
115 |     "\n",
116 |     "fst = hmm_to_fst(A,B)\n",
117 |     "assert abs(fst_log_probability(fst,zeros(10,'i'))) < 1e-4\n",
118 |     "\n",
119 |     "# more meaningful unit tests"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "markdown",
124 |    "id": "849bd7c9",
125 |    "metadata": {},
126 |    "source": [
127 |     "And write some general purpose random tests."
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": null,
133 |    "id": "f42bf203",
134 |    "metadata": {
135 |     "collapsed": false
136 |    },
137 |    "outputs": [],
138 |    "source": [
139 |     "for trial in range(10):\n",
140 |     "    A = rand((10,10))\n",
141 |     "    # normalize appropriately\n",
142 |     "    B = rand((17,10))\n",
143 |     "    # normalize appropriately\n",
144 |     "    fst = hmm_to_fst(A,B)\n",
145 |     "    for trial2 in range(10):\n",
146 |     "        s = array(10*rand(7),'i')\n",
147 |     "        p1 = fst_log_probability(fst,s)\n",
148 |     "        p2 = hmm_log_probability(A,B,s)\n",
149 |     "        assert abs(p1-p2)/min(abs(p1),abs(p2)) < 1e-4"
150 |    ]
151 |   }
152 |  ],
153 |  "metadata": {},
154 |  "nbformat": 4,
155 |  "nbformat_minor": 5
156 | }
157 | 


--------------------------------------------------------------------------------
/HW3 - regex generation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "a57cac02",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Regular Expression Class based on OpenFST"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "markdown",
 13 |    "id": "44d783c1",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "The goal of this exercise is to write a small regular expression class\n",
 17 |     "that internally uses OpenFST to perform the matching."
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "code",
 22 |    "execution_count": null,
 23 |    "id": "058dfa5c",
 24 |    "metadata": {
 25 |     "collapsed": false
 26 |    },
 27 |    "outputs": [],
 28 |    "source": [
 29 |     "class OpenRE:\n",
 30 |     "    def __init__(self,regex=None,cost=0.0):\n",
 31 |     "        if regex is not None:\n",
 32 |     "            self.add(regex,cost)\n",
 33 |     "            self.compile()\n",
 34 |     "        # IMPLEMENT ME\n",
 35 |     "    def add(self,regex,cost=0.0):\n",
 36 |     "        \"\"\"Add a regular expression to the overall\n",
 37 |     "        regular expression using a disjunction.\"\"\"\n",
 38 |     "        # IMPLEMENT ME\n",
 39 |     "    def compile(self):\n",
 40 |     "        \"\"\"After adding component regular expressions,\n",
 41 |     "        compile the internal fst.\"\"\"\n",
 42 |     "        # IMPLEMENT ME\n",
 43 |     "        self.fst = something\n",
 44 |     "    def cost(self,s):\n",
 45 |     "        \"\"\"Match the given string against the compiled\n",
 46 |     "        regular expression and return the cost. Returns\n",
 47 |     "        `inf` if there is no match.\"\"\"\n",
 48 |     "        # IMPLEMENT ME\n",
 49 |     "        return cost"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "markdown",
 54 |    "id": "df247c82",
 55 |    "metadata": {},
 56 |    "source": [
 57 |     "Your package should understand the following expressions:\n",
 58 |     "\n",
 59 |     "- \"ABC\" - simple strings\n",
 60 |     "- \"AB|CD\" - alternation\n",
 61 |     "- \"AB*C\" - regex star (zero or more repeats)\n",
 62 |     "- \"AB+C\" - regex plus (one or more repeats)\n",
 63 |     "- \"A(B|C)*D\" - parentheses and optional operators\n",
 64 |     "\n",
 65 |     "Assume that expressions are implicitly anchored at the beginning and end\n",
 66 |     "(no partial matches).\n",
 67 |     "\n",
 68 |     "It's OK if you limit yourself to ASCII strings. Use `ord` to encode characters\n",
 69 |     "to integers. Do not worry about escape characters or wildcards."
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "markdown",
 74 |    "id": "599cd492",
 75 |    "metadata": {},
 76 |    "source": [
 77 |     "# Unit Tests"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "markdown",
 82 |    "id": "4ef79ae4",
 83 |    "metadata": {},
 84 |    "source": [
 85 |     "Write a set of unit tests demonstrating that your code works."
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "code",
 90 |    "execution_count": null,
 91 |    "id": "0e32cd81",
 92 |    "metadata": {
 93 |     "collapsed": false
 94 |    },
 95 |    "outputs": [],
 96 |    "source": [
 97 |     "assert OpenRE(\"abc\").cost(\"abc\") == 0\n",
 98 |     "assert OpenRE(\"abC\").cost(\"abc\") == inf\n",
 99 |     "assert OpenRE(\"ab\").cost(\"abc\") == inf # no anchoring\n",
100 |     "assert OpenRE(\"(a|b)\").cost(\"a\") == 0\n",
101 |     "assert OpenRE(\"(a|b)\").cost(\"b\") == 0\n",
102 |     "assert OpenRE(\"a|b\").cost(\"a\") == 0\n",
103 |     "# etc."
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "markdown",
108 |    "id": "cb2ff370",
109 |    "metadata": {},
110 |    "source": [
111 |     "# Parsing"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "markdown",
116 |    "id": "32eb7e36",
117 |    "metadata": {},
118 |    "source": [
119 |     "For parsing the regular expression itself, you may want to use the `pyparsing` module.\n",
120 |     "\n",
121 |     "Here is a simple example of how you might go about this. Note that this is _not_ a correct\n",
122 |     "regular expression parser yet and that you may want to generate a different kind of structure.\n",
123 |     "\n",
124 |     "Read the documentation to figure out how to deal with whitespace and more characters."
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "code",
129 |    "execution_count": null,
130 |    "id": "3459ffe3",
131 |    "metadata": {
132 |     "collapsed": false
133 |    },
134 |    "outputs": [],
135 |    "source": [
136 |     "from pyparsing import *\n",
137 |     "postfix = Literal('+') | Literal('*')\n",
138 |     "alt = Literal( '|' )\n",
139 |     "lpar  = Literal( '(' ).suppress()\n",
140 |     "rpar  = Literal( ')' ).suppress()\n",
141 |     "lit = Regex('[^()|+*]+')\n",
142 |     "expr = Forward()\n",
143 |     "term = lit | alt + expr | Group( lpar + expr + rpar + Optional(postfix) )\n",
144 |     "expr << ZeroOrMore( term  )\n",
145 |     "expr.parseString(\"hello, (world|there)+|(a(b)c)\")"
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "code",
150 |    "execution_count": null,
151 |    "id": "730deba5",
152 |    "metadata": {
153 |     "collapsed": false
154 |    },
155 |    "outputs": [],
156 |    "source": []
157 |   }
158 |  ],
159 |  "metadata": {},
160 |  "nbformat": 4,
161 |  "nbformat_minor": 5
162 | }
163 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Natural Language Processing and Applications
  2 | 
  3 | [Course](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-course.ipynb)
  4 | 
  5 | [Intro](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-intro.ipynb)
  6 | 
  7 | [Demo Videos](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-intro-demo-videos.ipynb)
  8 | 
  9 | ## UNIX Command Line Tools
 10 | 
 11 | [Database Join](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-unix-join.ipynb)
 12 | 
 13 | [Making the Brown Corpus Readable](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-unix-cleanup.ipynb)
 14 | 
 15 | [Downloading Text from the Internet](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-downloading-tomsawyer.ipynb)
 16 | 
 17 | [Word Histogram](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-word-histograms.ipynb)
 18 | 
 19 | [find and xargs](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-find-xargs.ipynb)
 20 | 
 21 | [Zipf Law](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/zipf-law-example.ipynb)
 22 | 
 23 | ## Web Data
 24 | 
 25 | [Crawling with Scrapy](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/crawling-with-scrapy.ipynb)
 26 | 
 27 | [Internet Archive Query](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/internet-archive-query.ipynb)
 28 | 
 29 | ## Text Data and Algorithms
 30 | 
 31 | [Unicode](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-unicode.ipynb)
 32 | 
 33 | [Intro to Regular Expressions](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-re-intro.ipynb)
 34 | 
 35 | [Regular Expressions](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-regular-expressions.ipynb)
 36 | 
 37 | [Regular Expressions and FSA](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-re-fsa.ipynb)
 38 | 
 39 | [Homework Regular Expressions](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/HW1 - Regular Expression Parsing.ipynb)
 40 | 
 41 | [Edit Distance](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-edit-distance.ipynb)
 42 | 
 43 | ## NLTK
 44 | 
 45 | [NLTK Basics](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-basic-nltk.ipynb)
 46 | 
 47 | [NLTK Basics (2)](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-nltk-basic.ipynb)
 48 | 
 49 | [Corpora](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-corpora.ipynb)
 50 | 
 51 | [NLTK Corpora](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-nltk-corpora.ipynb)
 52 | 
 53 | [Lexical Resources](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-lexical-resources.ipynb)
 54 | 
 55 | [NLTK Lexical Resources](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-nltk-lexical-resources.ipynb)
 56 | 
 57 | [NLTK Reading German](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-nltk-reading-german.ipynb)
 58 | 
 59 | [Wordnet](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-wordnet.ipynb)
 60 | 
 61 | [NLTK Wordnet](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-nltk-wordnet.ipynb)
 62 | 
 63 | [NLTK POS Tagging](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-pos-tagging.ipynb)
 64 | 
 65 | [NLTK Automated Tagging](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-nltk-automated-tagging.ipynb)
 66 | 
 67 | [NLTK Available Taggers](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nltk-available-taggers.ipynb)
 68 | 
 69 | [NLTK n-gram Taggers](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nltk-ngram-taggers.ipynb)
 70 | 
 71 | [NLTK Tagging from Scratch](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nltk-tagging-from-scratch.ipynb)
 72 | 
 73 | [NLTK Stemming and Lemmatizing](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nltk-summary-stemming-lemmatizing.ipynb)
 74 | 
 75 | ## Markov Models and OpenFST
 76 | 
 77 | [Markov Models](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-markov-models.ipynb)
 78 | 
 79 | [HMM OCR](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-hmm-ocr.ipynb)
 80 | 
 81 | ## Finite State Transducers
 82 | 
 83 | [OpenFST](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-openfst.ipynb)
 84 | 
 85 | [OpenFST (2)](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-openfst2.ipynb)
 86 | 
 87 | [OpenFST Edit Distance](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-openfst-edit-distance.ipynb)
 88 | 
 89 | [OpenFST Weights and Forward-Backward](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/openfst-weights-and-forwardbackward.ipynb)
 90 | 
 91 | ## Classification
 92 | 
 93 | [Introduction to Classification](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-classification-intro.ipynb)
 94 | 
 95 | [Classification for Tagging](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-classification-tagging.ipynb)
 96 | 
 97 | [Sentence Segmentation](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-classifier-sentence-segmentation.ipynb)
 98 | 
 99 | [Classifier for Dialog Acts](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-classifier-dialog-acts.ipynb)
100 | 
101 | [Classifier Errors](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-classifier-errors.ipynb)
102 | 
103 | ## Information Retrieval
104 | 
105 | [Simple Information Retrieval](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-simple-ir.ipynb)
106 | 
107 | [Vectorspace Model](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-vectorspace.ipynb)
108 | 
109 | [MEMM](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/nlpa-memm.ipynb)
110 | 
111 | ## Homework
112 | 
113 | [Homework Implement a Trie](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/HW1 - Implement a Trie.ipynb)
114 | 
115 | [Homework Reproduce Power Laws](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/HW1 - Reproduce Power Laws.ipynb)
116 | 
117 | [Homework NLTK Wordnet](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/HW2+-+Improve+Tagging+with+Wordnet.ipynb)
118 | 
119 | [Homework Classifier Based Tagging](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/HW2 - Classifier-Based Tagging.ipynb)
120 | 
121 | [Homework HMMs and FSTs](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/HW3 - HMMs and FSTs.ipynb)
122 | 
123 | [Homework Regex Generation](http://nbviewer.ipython.org/urls/bitbucket.org/tmbdev/teaching-nlpa/raw/tip/HW3 - regex generation.ipynb)
124 | 
125 | 


--------------------------------------------------------------------------------
/brown.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tmbdev-teaching/teaching-nlpa/5409d4ad0d762c76e262e130f4fcbf6bcdfc2317/brown.zip


--------------------------------------------------------------------------------
/college-degrees-perc.tsv:
--------------------------------------------------------------------------------
 1 | 1	Norway	32
 2 | 2	United States	31
 3 | 3	Netherlands	29
 4 | 4	Iceland	26
 5 | 5	Denmark	25
 6 | 6	New Zealand	25
 7 | 7	Canada	25
 8 | 8	South Korea	24
 9 | 9	Australia	24
10 | 10	Sweden	23
11 | 11	United Kingdom	23
12 | 12	Japan	23
13 | 13	Finland	21
14 | 14	Ireland	21
15 | 15	Switzerland	21
16 | 16	Spain	20
17 | 17	Poland	19
18 | 18	Hungary	18
19 | 19	Luxembourg	18
20 | 20	Germany	16
21 | 21	France	16
22 | 22	Greece	15
23 | 23	Mexico	15
24 | 24	Belgium	14
25 | 25	Portugal	14
26 | 


--------------------------------------------------------------------------------
/crawling-with-scrapy.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "id": "070ce222",
  7 |    "metadata": {
  8 |     "collapsed": false
  9 |    },
 10 |    "outputs": [],
 11 |    "source": [
 12 |     "from scrapy.spider import BaseSpider"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 14,
 18 |    "id": "4803dd7e",
 19 |    "metadata": {
 20 |     "collapsed": false
 21 |    },
 22 |    "outputs": [],
 23 |    "source": [
 24 |     "class MySpider(BaseSpider):\n",
 25 |     "    name = \"uni-kl.de\"\n",
 26 |     "    allowed_domains = [\"uni-kl.de\"]\n",
 27 |     "    start_urls = [\n",
 28 |     "        \"http://www.uni-kl.de/\",\n",
 29 |     "    ]\n",
 30 |     "\n",
 31 |     "    def parse(self, response):\n",
 32 |     "        filename = response.url.split(\"/\")[-2]\n",
 33 |     "        open(filename, 'wb').write(response.body)"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": 17,
 39 |    "id": "6a9b5c8b",
 40 |    "metadata": {
 41 |     "collapsed": false
 42 |    },
 43 |    "outputs": [],
 44 |    "source": [
 45 |     "# see: http://stackoverflow.com/questions/7993680/running-scrapy-tasks-in-python\n",
 46 |     "\n",
 47 |     "from scrapy import project, signals\n",
 48 |     "from scrapy.conf import settings\n",
 49 |     "from scrapy.crawler import CrawlerProcess\n",
 50 |     "from scrapy.xlib.pydispatch import dispatcher\n",
 51 |     "from multiprocessing.queues import Queue\n",
 52 |     "from multiprocessing import Process\n",
 53 |     "\n",
 54 |     "class CrawlerWorker(Process):\n",
 55 |     "    def __init__(self, spider, results):\n",
 56 |     "        Process.__init__(self)\n",
 57 |     "        self.results = results\n",
 58 |     "\n",
 59 |     "        self.crawler = CrawlerProcess(settings)\n",
 60 |     "        if not hasattr(project, 'crawler'):\n",
 61 |     "            self.crawler.install()\n",
 62 |     "        self.crawler.configure()\n",
 63 |     "\n",
 64 |     "        self.items = []\n",
 65 |     "        self.spider = spider\n",
 66 |     "        dispatcher.connect(self._item_passed, signals.item_passed)\n",
 67 |     "\n",
 68 |     "    def _item_passed(self, item):\n",
 69 |     "        self.items.append(item)\n",
 70 |     "\n",
 71 |     "    def run(self):\n",
 72 |     "        self.crawler.crawl(self.spider)\n",
 73 |     "        self.crawler.start()\n",
 74 |     "        self.crawler.stop()\n",
 75 |     "        self.results.put(self.items)\n"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": 18,
 81 |    "id": "7d601d0c",
 82 |    "metadata": {
 83 |     "collapsed": false
 84 |    },
 85 |    "outputs": [],
 86 |    "source": [
 87 |     "# The part below can be called as often as you want\n",
 88 |     "results = Queue()\n",
 89 |     "crawler = CrawlerWorker(MySpider(), results)\n",
 90 |     "crawler.start()\n",
 91 |     "for item in results.get():\n",
 92 |     "    print item"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "code",
 97 |    "execution_count": null,
 98 |    "id": "cd6b067d",
 99 |    "metadata": {
100 |     "collapsed": false
101 |    },
102 |    "outputs": [],
103 |    "source": []
104 |   }
105 |  ],
106 |  "metadata": {},
107 |  "nbformat": 4,
108 |  "nbformat_minor": 5
109 | }
110 | 


--------------------------------------------------------------------------------
/fstutils.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from pylab import *
 4 | import openfst
 5 | from openfst import StdVectorFst as FST
 6 | from openfst import LogVectorFst as LFST
 7 | 
 8 | ASCII = openfst.SymbolTable("ASCII")
 9 | 
10 | for i in range(127):
11 |     if i==0:
12 |         ASCII.AddSymbol("ϵ",i)
13 |     elif i<=32: 
14 |         ASCII.AddSymbol("$%02x"%i,i)
15 |     else:
16 |         ASCII.AddSymbol(chr(i),i)
17 | 
18 | def minimize(fst):
19 |     dfst = FST()
20 |     openfst.Determinize(fst,dfst)
21 |     openfst.Minimize(dfst)
22 |     return dfst
23 | 
24 | def log_minimize(fst):
25 |     dfst = LFST()
26 |     openfst.Determinize(fst,dfst)
27 |     openfst.Minimize(dfst)
28 |     return dfst
29 | 
30 | def show_fst(fst):
31 |     import pydot,pylab
32 |     graph = pydot.Dot(rankdir="LR")
33 |     isyms = fst.InputSymbols()
34 |     if not isyms: isyms = ASCII
35 |     osyms = fst.OutputSymbols()
36 |     if not osyms: osyms = ASCII
37 |     for s in range(fst.NumStates()):
38 |         if s==fst.Start():
39 |             n = pydot.Node("%d"%s,shape="box")
40 |             graph.add_node(n)
41 |         if fst.IsFinal(s):
42 |             l = '"'
43 |             l += "%d"%s # node id
44 |             if fst.Final(s).Value()!=0.0: # optional non-zero accept cost
45 |                 l += "/%s"%fst.Final(s).Value()
46 |             l += '"'
47 |             n = pydot.Node("%d"%s,label=l,penwidth="3")
48 |             graph.add_node(n)
49 |         for t in range(fst.NumArcs(s)):
50 |             a = fst.GetArc(s,t)
51 |             l = '"'
52 |             l += '%s'%isyms.Find(a.ilabel)
53 |             if a.olabel!=a.ilabel: l += ":%s"%osyms.Find(a.olabel)
54 |             v = a.weight.Value()
55 |             if v!=0.0: l += "/%s"%v
56 |             l += '"'
57 |             n = a.nextstate
58 |             e = pydot.Edge("%d"%s,"%d"%n,label=l)
59 |             graph.add_edge(e)
60 |     graph.write_png("/tmp/_test.png")
61 |     pylab.gca().set_xticks([]); pylab.gca().set_yticks([])
62 |     pylab.clf()
63 |     pylab.imshow(pylab.imread("/tmp/_test.png"))   
64 | 
65 | def fstsize(fst):
66 |     edges = 0
67 |     for s in range(fst.NumStates()):
68 |         edges += fst.NumArcs(s)
69 |     return fst.NumStates(),edges
70 | 


--------------------------------------------------------------------------------
/household-ppp.tsv:
--------------------------------------------------------------------------------
 1 | 1	Luxembourg	34407
 2 | 2	United States	31111
 3 | 3	Norway	31011
 4 | 4	Iceland	28166
 5 | 5	Australia	26915
 6 | 6	Switzerland	26844
 7 | 7	Canada	25363
 8 | 8	United Kingdom	25168
 9 | 9	Ireland	24677
10 | 10	Austria	24114
11 | 11	Netherlands	24024
12 | 12	Sweden	22889
13 | 13	Denmark	22461
14 | 14	Belgium	21532
15 | 15	Germany	21241
16 | 16	Finland	20875
17 | 17	New Zealand	20679
18 | 18	France	19615
19 | 19	Japan	19432
20 | 20	South Korea	19179
21 | 


--------------------------------------------------------------------------------
/internet-archive-query.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "813a86b3",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Querying and Downloading from the Internet Archive"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "markdown",
 13 |    "id": "027159cd",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "This worksheet shows how to query the Internet Archive with JSON and how to download from it."
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "code",
 21 |    "execution_count": 127,
 22 |    "id": "16257e0b",
 23 |    "metadata": {
 24 |     "collapsed": false
 25 |    },
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "import urllib2\n",
 29 |     "import json\n",
 30 |     "import re\n",
 31 |     "import pprint\n",
 32 |     "pp = pprint.PrettyPrinter(indent=4).pprint\n",
 33 |     "Q = urllib2.quote\n",
 34 |     "U = urllib2.unquote"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "markdown",
 39 |    "id": "ed7fca79",
 40 |    "metadata": {},
 41 |    "source": [
 42 |     "We construct a url-encoded query (can we also post JSON?)."
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": 97,
 48 |    "id": "b09830f8",
 49 |    "metadata": {
 50 |     "collapsed": false
 51 |    },
 52 |    "outputs": [
 53 |     {
 54 |      "name": "stdout",
 55 |      "output_type": "stream",
 56 |      "text": [
 57 |       "http://archive.org/advancedsearch.php?q=title%3A%28alice%20in%20wonderland%29%20AND%20format%3A%28djvu%29&fl%5B%5D%3Didentifier&fl%5B%5D%3Dsource&fl%5B%5D%3Dtitle&rows=100&page=1&output=json\n"
 58 |      ]
 59 |     }
 60 |    ],
 61 |    "source": [
 62 |     "query = Q(\"title:(alice in wonderland) AND format:(djvu)\")\n",
 63 |     "columns = \"&\".join([Q(s) for s in \"fl[]=identifier fl[]=source fl[]=title\".split()])\n",
 64 |     "params = \"rows=100&page=1&output=json\"\n",
 65 |     "url = \"http://archive.org/advancedsearch.php?q=\"+query+\"&\"+columns+\"&\"+params\n",
 66 |     "print url"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "code",
 71 |    "execution_count": 128,
 72 |    "id": "1e296144",
 73 |    "metadata": {
 74 |     "collapsed": false
 75 |    },
 76 |    "outputs": [],
 77 |    "source": [
 78 |     "# could we also post the query?\n",
 79 |     "#jdata = json.dumps({\"username\":\"...\", \"password\":\"...\"})\n",
 80 |     "#urllib2.urlopen(\"http://www.example.com/\", jdata)"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "markdown",
 85 |    "id": "e53b48f4",
 86 |    "metadata": {},
 87 |    "source": [
 88 |     "Now we read and parse the response."
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "code",
 93 |    "execution_count": 129,
 94 |    "id": "1bac2f60",
 95 |    "metadata": {
 96 |     "collapsed": false
 97 |    },
 98 |    "outputs": [
 99 |     {
100 |      "data": {
101 |       "text/plain": [
102 |        "[u'start', u'numFound', u'docs']"
103 |       ]
104 |      },
105 |      "execution_count": 129,
106 |      "metadata": {},
107 |      "output_type": "execute_result"
108 |     }
109 |    ],
110 |    "source": [
111 |     "response = urllib2.urlopen(url).read()\n",
112 |     "response = json.loads(response)[\"response\"]\n",
113 |     "response.keys()"
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "code",
118 |    "execution_count": 131,
119 |    "id": "c19f9166",
120 |    "metadata": {
121 |     "collapsed": false
122 |    },
123 |    "outputs": [
124 |     {
125 |      "data": {
126 |       "text/plain": [
127 |        "43"
128 |       ]
129 |      },
130 |      "execution_count": 131,
131 |      "metadata": {},
132 |      "output_type": "execute_result"
133 |     }
134 |    ],
135 |    "source": [
136 |     "response[\"numFound\"]"
137 |    ]
138 |   },
139 |   {
140 |    "cell_type": "code",
141 |    "execution_count": 132,
142 |    "id": "f58eb244",
143 |    "metadata": {
144 |     "collapsed": false
145 |    },
146 |    "outputs": [
147 |     {
148 |      "data": {
149 |       "text/plain": [
150 |        "43"
151 |       ]
152 |      },
153 |      "execution_count": 132,
154 |      "metadata": {},
155 |      "output_type": "execute_result"
156 |     }
157 |    ],
158 |    "source": [
159 |     "len(response[\"docs\"])"
160 |    ]
161 |   },
162 |   {
163 |    "cell_type": "markdown",
164 |    "id": "2e9afd0a",
165 |    "metadata": {},
166 |    "source": [
167 |     "Each doc contains a title and an identifier (we asked for those):"
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "code",
172 |    "execution_count": 101,
173 |    "id": "bd307dc0",
174 |    "metadata": {
175 |     "collapsed": false
176 |    },
177 |    "outputs": [
178 |     {
179 |      "name": "stdout",
180 |      "output_type": "stream",
181 |      "text": [
182 |       "0 Alice In Wonderland caralic\n",
183 |       "1 Alice's Adventures In Wonderland AlicesAdventuresInWonderland\n",
184 |       "2 Alice's Adventures in Wonderland AlicesAdventuresInWonderland_917\n",
185 |       "3 Alice in Wonderland aliceinwonderla00carrgoog\n",
186 |       "4 Alice's Adventures in Wonderland alicesadventure00jackgoog\n",
187 |       "5 Alice's Adventures in Wonderland AlicesAdventuresInWonderland_841\n",
188 |       "6 Alice's adventures in Wonderland alicesadventure00tenngoog\n",
189 |       "7 Alice's Adventures in Wonderland alicesadventures00011gut\n",
190 |       "8 Alice's adventures in Wonderland adventuresalices00carrrich\n",
191 |       "9 Alice in Wonderland aliceinwonderlan00carriala\n"
192 |      ]
193 |     }
194 |    ],
195 |    "source": [
196 |     "for i,e in enumerate(response[\"docs\"][:10]):\n",
197 |     "    print i,e[\"title\"],e[\"identifier\"]"
198 |    ]
199 |   },
200 |   {
201 |    "cell_type": "code",
202 |    "execution_count": 102,
203 |    "id": "2a9dd657",
204 |    "metadata": {
205 |     "collapsed": false
206 |    },
207 |    "outputs": [
208 |     {
209 |      "data": {
210 |       "text/plain": [
211 |        "u'AlicesAdventuresInWonderland_841'"
212 |       ]
213 |      },
214 |      "execution_count": 102,
215 |      "metadata": {},
216 |      "output_type": "execute_result"
217 |     }
218 |    ],
219 |    "source": [
220 |     "identifier = response[\"docs\"][5][\"identifier\"]\n",
221 |     "identifier"
222 |    ]
223 |   },
224 |   {
225 |    "cell_type": "markdown",
226 |    "id": "4c6aa2b4",
227 |    "metadata": {},
228 |    "source": [
229 |     "# Retrieving Details using the Identifier"
230 |    ]
231 |   },
232 |   {
233 |    "cell_type": "markdown",
234 |    "id": "2d5f9448",
235 |    "metadata": {},
236 |    "source": [
237 |     "Once we have the identifier for a document, we can retrieve more info about it,\n",
238 |     "again in JSON."
239 |    ]
240 |   },
241 |   {
242 |    "cell_type": "code",
243 |    "execution_count": 111,
244 |    "id": "d7504a13",
245 |    "metadata": {
246 |     "collapsed": false
247 |    },
248 |    "outputs": [
249 |     {
250 |      "name": "stdout",
251 |      "output_type": "stream",
252 |      "text": [
253 |       "[u'files', u'misc', u'server', u'item', u'creativecommons', u'dir', u'metadata']\n",
254 |       "ia701208.us.archive.org\n"
255 |      ]
256 |     }
257 |    ],
258 |    "source": [
259 |     "hit = urllib2.urlopen(\"http://archive.org/details/\"+Q(identifier)+\"?output=json\").read()\n",
260 |     "hit = json.loads(hit)\n",
261 |     "print hit.keys()\n",
262 |     "print hit[\"server\"]"
263 |    ]
264 |   },
265 |   {
266 |    "cell_type": "markdown",
267 |    "id": "1c3d32e1",
268 |    "metadata": {},
269 |    "source": [
270 |     "We're particularly interested in the files."
271 |    ]
272 |   },
273 |   {
274 |    "cell_type": "code",
275 |    "execution_count": 112,
276 |    "id": "c78731ca",
277 |    "metadata": {
278 |     "collapsed": false
279 |    },
280 |    "outputs": [
281 |     {
282 |      "name": "stdout",
283 |      "output_type": "stream",
284 |      "text": [
285 |       "[u'sha1', u'format', u'source', u'mtime', u'crc32', u'md5', u'original', u'size']\n"
286 |      ]
287 |     }
288 |    ],
289 |    "source": [
290 |     "print hit[\"files\"].items()[0][1].keys()"
291 |    ]
292 |   },
293 |   {
294 |    "cell_type": "markdown",
295 |    "id": "dbcca733",
296 |    "metadata": {},
297 |    "source": [
298 |     "The file list contains information about formats, sources, sizes, etc.\n",
299 |     "We're looking for text."
300 |    ]
301 |   },
302 |   {
303 |    "cell_type": "code",
304 |    "execution_count": 123,
305 |    "id": "30ae9136",
306 |    "metadata": {
307 |     "collapsed": false
308 |    },
309 |    "outputs": [
310 |     {
311 |      "name": "stdout",
312 |      "output_type": "stream",
313 |      "text": [
314 |       "u'DjVu' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll.djvu\n",
315 |       "u'Abbyy GZ' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_abbyy.gz\n",
316 |       "u'Image Container PDF' u'original' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll.pdf\n",
317 |       "u'Metadata' u'original' /AlicesAdventuresInWonderland_841_meta.xml\n",
318 |       "u'Single Page Processed JP2 ZIP' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_jp2.zip\n",
319 |       "u'DjVuTXT' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_djvu.txt\n",
320 |       "u'Scandata' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_scandata.xml\n",
321 |       "u'EPUB' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll.epub\n",
322 |       "u'Metadata' u'original' /AlicesAdventuresInWonderland_841_files.xml\n",
323 |       "u'Animated GIF' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll.gif\n",
324 |       "u'Djvu XML' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_djvu.xml\n",
325 |       "u'Additional Text PDF' u'derivative' /86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_text.pdf\n",
326 |       "/86311283-Original-Version-of-Alice-s-Adventures-in-Wonderland-by-Lewis-Carroll_djvu.txt\n"
327 |      ]
328 |     }
329 |    ],
330 |    "source": [
331 |     "fname = None\n",
332 |     "for k,v in hit[\"files\"].items():\n",
333 |     "    print repr(v[\"format\"]),repr(v[\"source\"]),k\n",
334 |     "    if v[\"format\"]==\"DjVuTXT\": fname = k\n",
335 |     "print fname"
336 |    ]
337 |   },
338 |   {
339 |    "cell_type": "markdown",
340 |    "id": "f9c6f7ad",
341 |    "metadata": {},
342 |    "source": [
343 |     "# Retrieving the File"
344 |    ]
345 |   },
346 |   {
347 |    "cell_type": "markdown",
348 |    "id": "e7c579fc",
349 |    "metadata": {},
350 |    "source": [
351 |     "We can retrieve files from the `archive.org/download` URL, combining the identifier for the document and the specific file name."
352 |    ]
353 |   },
354 |   {
355 |    "cell_type": "code",
356 |    "execution_count": 138,
357 |    "id": "cfedd3cf",
358 |    "metadata": {
359 |     "collapsed": false
360 |    },
361 |    "outputs": [
362 |     {
363 |      "data": {
364 |       "text/plain": [
365 |        "u'\\n\\n\\n1 \\n\\n\\n\\n\\nwtx \\n\\n\\n\\n\\n\\n% \\xa7eb vtfy \"tired of $LM&$ \\nby nzr sisfer* ojl. tdthlmnh \\n\\ndo : once or \"twice, sit ka.A \\nfittfottL tufa i&& Irotk ktv \\nS^^Mt Si ^ r w<t ^ riding, frat it \\nk<U Ko pictures or conversation*- in lt } ajruL wh&*. is tfa, \\nU&& of a- (rook t -ikoiL^kir Alice, , wii&out- pictures &<r can.* \\n-VtrScrtio-ns t So ska, MCLS cons.ttle.rino> <-7i. fi\\xa3r own niind, \\n^aS w&^ as S/te- Could, fa'"
366 |       ]
367 |      },
368 |      "execution_count": 138,
369 |      "metadata": {},
370 |      "output_type": "execute_result"
371 |     }
372 |    ],
373 |    "source": [
374 |     "text = urllib2.urlopen(\"http://archive.org/download/\"+Q(identifier)+fname).read()\n",
375 |     "text = text.decode(\"utf-8\")\n",
376 |     "text[:400]"
377 |    ]
378 |   },
379 |   {
380 |    "cell_type": "markdown",
381 |    "id": "68e95f62",
382 |    "metadata": {},
383 |    "source": [
384 |     "We can now continue to process this text, for example with NLTK."
385 |    ]
386 |   },
387 |   {
388 |    "cell_type": "code",
389 |    "execution_count": 139,
390 |    "id": "ad6c305b",
391 |    "metadata": {
392 |     "collapsed": false
393 |    },
394 |    "outputs": [],
395 |    "source": [
396 |     "import nltk\n",
397 |     "tokens = nltk.tokenize.word_tokenize(text)"
398 |    ]
399 |   },
400 |   {
401 |    "cell_type": "code",
402 |    "execution_count": 140,
403 |    "id": "d0495f59",
404 |    "metadata": {
405 |     "collapsed": false
406 |    },
407 |    "outputs": [
408 |     {
409 |      "data": {
410 |       "text/plain": [
411 |        "[u'.', u'\\u25a0', u'/I', u'nor', u'way', u'\"', u'to', u'Uar', u'^', u'U', u'T', u'^', u'Ub', u'say', u'~', u'6', u'>', u'rfs', u'\\xab', u'#', u\"'\", u'cU', u'\\xb1', u'r', u',', u'dear', u';', u'*', u'UtL', u'U', u'too', u'\\xa3', u'*', u'\\xa3', u'e', u'r', u'(', u'vji', u'*', u'n.', u'$', u'U', u'idLca.', u'3', u'i&', u'A', u'ovtr', u'*', u'.', u'\\xa3', u'&-', u'-WO-rcLS', u')', u'tir', u'occurred', u'&', u'A-', u'*', u'*', u'*', u'that', u's', u'^', u'e', u'oll', u'^', u'H', u'tfi', u'kavt', u'woTuLkfttL', u'at', u'-tiiis', u',', u'(', u'rat', u'ai', u'Ofb', u'tirae', u',', u'l&', u'alt', u'\\u2022', u'seemed', u'auitl', u'natural', u'}', u'>', u'bu.t', u'wAe', u'*', u'.', u'\\xb1', u'kt', u'raUit', u'actadly', u'-took', u'QL', u'w', u'atch.', u'out']"
412 |       ]
413 |      },
414 |      "execution_count": 140,
415 |      "metadata": {},
416 |      "output_type": "execute_result"
417 |     }
418 |    ],
419 |    "source": [
420 |     "tokens[200:300]"
421 |    ]
422 |   },
423 |   {
424 |    "cell_type": "code",
425 |    "execution_count": null,
426 |    "id": "f3a8be5e",
427 |    "metadata": {
428 |     "collapsed": false
429 |    },
430 |    "outputs": [],
431 |    "source": []
432 |   }
433 |  ],
434 |  "metadata": {},
435 |  "nbformat": 4,
436 |  "nbformat_minor": 5
437 | }
438 | 


--------------------------------------------------------------------------------
/letter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tmbdev-teaching/teaching-nlpa/5409d4ad0d762c76e262e130f4fcbf6bcdfc2317/letter.png


--------------------------------------------------------------------------------
/nlpa-classification-tagging.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 23,
  6 |    "id": "03d3a48a",
  7 |    "metadata": {
  8 |     "collapsed": false
  9 |    },
 10 |    "outputs": [],
 11 |    "source": [
 12 |     "import nltk\n",
 13 |     "from nltk.corpus import names\n",
 14 |     "from pylab import *\n",
 15 |     "import random as pyrandom"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "markdown",
 20 |    "id": "38897036",
 21 |    "metadata": {},
 22 |    "source": [
 23 |     "# Parts of Speech Tagging"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": 24,
 29 |    "id": "7101d69a",
 30 |    "metadata": {
 31 |     "collapsed": false
 32 |    },
 33 |    "outputs": [],
 34 |    "source": [
 35 |     "from nltk.corpus import brown"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": 25,
 41 |    "id": "893549d7",
 42 |    "metadata": {
 43 |     "collapsed": false
 44 |    },
 45 |    "outputs": [],
 46 |    "source": [
 47 |     "tagged_words = brown.tagged_words(categories='news')"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 44,
 53 |    "id": "2d8fdc90",
 54 |    "metadata": {
 55 |     "collapsed": false
 56 |    },
 57 |    "outputs": [],
 58 |    "source": [
 59 |     "def features(s,i,y):\n",
 60 |     "    f = dict(ltag=y[i-1] if i>0 else \"^\", # previous tag\n",
 61 |     "             lword=s[i-1] if i>0 else \"^\", # previous word\n",
 62 |     "             s1 = s[i][-1:], # current word features\n",
 63 |     "             s2 = s[i][-2:],\n",
 64 |     "             s3 = s[i][-3:])\n",
 65 |     "    return f"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "markdown",
 70 |    "id": "fbe6ff18",
 71 |    "metadata": {},
 72 |    "source": [
 73 |     "# Training"
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "code",
 78 |    "execution_count": 35,
 79 |    "id": "7ccac27e",
 80 |    "metadata": {
 81 |     "collapsed": false
 82 |    },
 83 |    "outputs": [],
 84 |    "source": [
 85 |     "data = []\n",
 86 |     "for sy in brown.tagged_sents(categories='news'):\n",
 87 |     "    s,y = zip(*sy)\n",
 88 |     "    for i in range(len(s)):\n",
 89 |     "        data.append((features(s,i,y),y[i]))\n",
 90 |     "n = len(data)\n",
 91 |     "training_set = data[n//10:]\n",
 92 |     "test_set = data[:n//10]"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "code",
 97 |    "execution_count": 36,
 98 |    "id": "66ba816e",
 99 |    "metadata": {
100 |     "collapsed": false
101 |    },
102 |    "outputs": [],
103 |    "source": [
104 |     "classifier = nltk.NaiveBayesClassifier.train(training_set)"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "code",
109 |    "execution_count": 37,
110 |    "id": "fd3afe50",
111 |    "metadata": {
112 |     "collapsed": false
113 |    },
114 |    "outputs": [
115 |     {
116 |      "data": {
117 |       "text/plain": [
118 |        "0.8176031824962705"
119 |       ]
120 |      },
121 |      "execution_count": 37,
122 |      "metadata": {},
123 |      "output_type": "execute_result"
124 |     }
125 |    ],
126 |    "source": [
127 |     "nltk.classify.accuracy(classifier,test_set)"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "markdown",
132 |    "id": "24401659",
133 |    "metadata": {},
134 |    "source": [
135 |     "# Greedy Decoding"
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "code",
140 |    "execution_count": 46,
141 |    "id": "596ce943",
142 |    "metadata": {
143 |     "collapsed": false
144 |    },
145 |    "outputs": [],
146 |    "source": [
147 |     "class MyTagger:\n",
148 |     "    def __init__(self,classifier):\n",
149 |     "        self.classifier = classifier\n",
150 |     "    def tag(self,s):\n",
151 |     "        y = []\n",
152 |     "        for i in range(len(s)):\n",
153 |     "            f = features(s,i,y)\n",
154 |     "            y.append(classifier.classify(features(s,i,y)))\n",
155 |     "        return zip(s,y)            "
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": 40,
161 |    "id": "bb6adcd5",
162 |    "metadata": {
163 |     "collapsed": false
164 |    },
165 |    "outputs": [],
166 |    "source": [
167 |     "tagger = MyTagger(classifier)"
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "code",
172 |    "execution_count": 45,
173 |    "id": "cab4c5cb",
174 |    "metadata": {
175 |     "collapsed": false
176 |    },
177 |    "outputs": [
178 |     {
179 |      "data": {
180 |       "text/plain": [
181 |        "[('The', 'AT'),\n",
182 |        " ('quick', 'NN'),\n",
183 |        " ('brown', 'NN'),\n",
184 |        " ('fox', 'NPS-TL'),\n",
185 |        " ('jumped', 'VBD'),\n",
186 |        " ('over', 'RP'),\n",
187 |        " ('the', 'AT'),\n",
188 |        " ('lazy', 'JJ'),\n",
189 |        " ('dogs.', 'NP')]"
190 |       ]
191 |      },
192 |      "execution_count": 45,
193 |      "metadata": {},
194 |      "output_type": "execute_result"
195 |     }
196 |    ],
197 |    "source": [
198 |     "tagger.tag(\"The quick brown fox jumped over the lazy dogs.\".split())"
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "markdown",
203 |    "id": "dd64592b",
204 |    "metadata": {},
205 |    "source": [
206 |     "# More Advanced Models"
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "markdown",
211 |    "id": "662450d0",
212 |    "metadata": {},
213 |    "source": [
214 |     "- Viterbi Decoding\n",
215 |     "- MEMM\n",
216 |     "- Conditional Random Fields"
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "code",
221 |    "execution_count": null,
222 |    "id": "05f70ce3",
223 |    "metadata": {
224 |     "collapsed": false
225 |    },
226 |    "outputs": [],
227 |    "source": []
228 |   }
229 |  ],
230 |  "metadata": {},
231 |  "nbformat": 4,
232 |  "nbformat_minor": 5
233 | }
234 | 


--------------------------------------------------------------------------------
/nlpa-classifier-dialog-acts.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 3,
  6 |    "id": "285c5b90",
  7 |    "metadata": {
  8 |     "collapsed": false
  9 |    },
 10 |    "outputs": [],
 11 |    "source": [
 12 |     "import nltk\n",
 13 |     "from nltk.corpus import names\n",
 14 |     "from pylab import *\n",
 15 |     "import random as pyrandom"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "markdown",
 20 |    "id": "a179c9e6",
 21 |    "metadata": {},
 22 |    "source": [
 23 |     "# Dialog Act Type Classification"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": 4,
 29 |    "id": "a2f720fc",
 30 |    "metadata": {
 31 |     "collapsed": false
 32 |    },
 33 |    "outputs": [],
 34 |    "source": [
 35 |     "posts = nltk.corpus.nps_chat.xml_posts()[:10000]"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": 5,
 41 |    "id": "290cf628",
 42 |    "metadata": {
 43 |     "collapsed": false
 44 |    },
 45 |    "outputs": [],
 46 |    "source": [
 47 |     "def features(post):\n",
 48 |     "    f = {}\n",
 49 |     "    for w in nltk.word_tokenize(post): f[w.lower()] = True\n",
 50 |     "    return f"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": 6,
 56 |    "id": "1f37d87f",
 57 |    "metadata": {
 58 |     "collapsed": false
 59 |    },
 60 |    "outputs": [
 61 |     {
 62 |      "data": {
 63 |       "text/plain": [
 64 |        "'wouldnt let her date'"
 65 |       ]
 66 |      },
 67 |      "execution_count": 6,
 68 |      "metadata": {},
 69 |      "output_type": "execute_result"
 70 |     }
 71 |    ],
 72 |    "source": [
 73 |     "posts[333].text"
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "code",
 78 |    "execution_count": 7,
 79 |    "id": "4e23cd97",
 80 |    "metadata": {
 81 |     "collapsed": false
 82 |    },
 83 |    "outputs": [
 84 |     {
 85 |      "data": {
 86 |       "text/plain": [
 87 |        "'Emotion'"
 88 |       ]
 89 |      },
 90 |      "execution_count": 7,
 91 |      "metadata": {},
 92 |      "output_type": "execute_result"
 93 |     }
 94 |    ],
 95 |    "source": [
 96 |     "posts[333].get('class')"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": 8,
102 |    "id": "f8965b14",
103 |    "metadata": {
104 |     "collapsed": false
105 |    },
106 |    "outputs": [
107 |     {
108 |      "name": "stdout",
109 |      "output_type": "stream",
110 |      "text": [
111 |       "set(['Emotion', 'ynQuestion', 'yAnswer', 'Continuer', 'whQuestion', 'System', 'Accept', 'Clarify', 'Emphasis', 'nAnswer', 'Greet', 'Statement', 'Reject', 'Bye', 'Other'])\n"
112 |      ]
113 |     }
114 |    ],
115 |    "source": [
116 |     "print set([p.get('class') for p in posts])"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "code",
121 |    "execution_count": 9,
122 |    "id": "b63c66a7",
123 |    "metadata": {
124 |     "collapsed": false
125 |    },
126 |    "outputs": [],
127 |    "source": [
128 |     "featuresets = [(features(p.text),p.get('class')) for p in posts]"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": 10,
134 |    "id": "eedff20d",
135 |    "metadata": {
136 |     "collapsed": false
137 |    },
138 |    "outputs": [],
139 |    "source": [
140 |     "training_set = featuresets[1000:]\n",
141 |     "test_set = featuresets[:1000]"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": 11,
147 |    "id": "6f6671f5",
148 |    "metadata": {
149 |     "collapsed": false
150 |    },
151 |    "outputs": [
152 |     {
153 |      "name": "stdout",
154 |      "output_type": "stream",
155 |      "text": [
156 |       "0.66\n"
157 |      ]
158 |     }
159 |    ],
160 |    "source": [
161 |     "classifier = nltk.NaiveBayesClassifier.train(training_set)\n",
162 |     "print nltk.classify.accuracy(classifier,test_set)"
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "code",
167 |    "execution_count": null,
168 |    "id": "19972e3d",
169 |    "metadata": {
170 |     "collapsed": false
171 |    },
172 |    "outputs": [],
173 |    "source": []
174 |   }
175 |  ],
176 |  "metadata": {},
177 |  "nbformat": 4,
178 |  "nbformat_minor": 5
179 | }
180 | 


--------------------------------------------------------------------------------
/nlpa-classifier-sentence-segmentation-Copy0.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [],
3 |  "metadata": {},
4 |  "nbformat": 4,
5 |  "nbformat_minor": 5
6 | }
7 | 


--------------------------------------------------------------------------------
/nlpa-classifier-sentence-segmentation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "id": "38c97229",
  7 |    "metadata": {
  8 |     "collapsed": false
  9 |    },
 10 |    "outputs": [],
 11 |    "source": [
 12 |     "import nltk\n",
 13 |     "from nltk.corpus import names\n",
 14 |     "from pylab import *\n",
 15 |     "import random as pyrandom"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "markdown",
 20 |    "id": "01d28868",
 21 |    "metadata": {},
 22 |    "source": [
 23 |     "# Sentence Segmentation"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": 34,
 29 |    "id": "4b05b3e9",
 30 |    "metadata": {
 31 |     "collapsed": false
 32 |    },
 33 |    "outputs": [],
 34 |    "source": [
 35 |     "sents = nltk.corpus.treebank_raw.sents()\n",
 36 |     "sents = [s for s in sents if len(s)>3]\n",
 37 |     "sents = [s for s in sents if \"START\" not in s]"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": 35,
 43 |    "id": "98e6db7e",
 44 |    "metadata": {
 45 |     "collapsed": false
 46 |    },
 47 |    "outputs": [],
 48 |    "source": [
 49 |     "tokens = []\n",
 50 |     "boundaries = []\n",
 51 |     "for s in sents:\n",
 52 |     "    tokens += s\n",
 53 |     "    boundaries.append(len(tokens)-1)"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": 36,
 59 |    "id": "6990a8a0",
 60 |    "metadata": {
 61 |     "collapsed": false
 62 |    },
 63 |    "outputs": [
 64 |     {
 65 |      "name": "stdout",
 66 |      "output_type": "stream",
 67 |      "text": [
 68 |       "['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov', '.', '29', '.', 'Mr', '.', 'Vinken', 'is', 'chairman', 'of', 'Elsevier', 'N', '.', 'V', '.,', 'the', 'Dutch', 'publishing', 'group', '.', 'Rudolph', 'Agnew', ',', '55', 'years', 'old', 'and', 'former', 'chairman', 'of', 'Consolidated', 'Gold', 'Fields', 'PLC', ',', 'was', 'named', 'a', 'nonexecutive', 'director', 'of', 'this', 'British', 'industrial', 'conglomerate', '.', 'A', 'form', 'of', 'asbestos', 'once', 'used', 'to', 'make', 'Kent', 'cigarette', 'filters', 'has', 'caused', 'a', 'high', 'percentage', 'of', 'cancer', 'deaths', 'among', 'a', 'group', 'of', 'workers', 'exposed', 'to', 'it', 'more', 'than', '30', 'years', 'ago', ',', 'researchers', 'reported', '.', 'The', 'asbestos', 'fiber', ',', 'crocidolite', ',', 'is', 'unusually', 'resilient', 'once', 'it', 'enters', 'the', 'lungs', ',', 'with', 'even', 'brief', 'exposures', 'to', 'it', 'causing', 'symptoms', 'that', 'show', 'up', 'decades', 'later', ',', 'researchers', 'said', '.', 'Lorillard', 'Inc', '.,', 'the', 'unit', 'of', 'New', 'York', '-', 'based', 'Loews', 'Corp', '.', 'that', 'makes', 'Kent', 'cigarettes', ',', 'stopped', 'using', 'crocidolite', 'in', 'its', 'Micronite', 'cigarette', 'filters', 'in', '1956', '.', 'Although', 'preliminary', 'findings', 'were', 'reported', 'more', 'than', 'a', 'year', 'ago', ',', 'the', 'latest', 'results', 'appear', 'in', 'today', \"'\", 's', 'New', 'England', 'Journal', 'of', 'Medicine', ',', 'a', 'forum', 'likely', 'to', 'bring', 'new', 'attention', 'to', 'the', 'problem', '.', 'A', 'Lorillard', 'spokewoman', 'said', ',', '\"']\n"
 69 |      ]
 70 |     }
 71 |    ],
 72 |    "source": [
 73 |     "print tokens[:200]"
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "code",
 78 |    "execution_count": 42,
 79 |    "id": "9c84af47",
 80 |    "metadata": {
 81 |     "collapsed": false
 82 |    },
 83 |    "outputs": [],
 84 |    "source": [
 85 |     "def features(s,i):\n",
 86 |     "    return dict(current=tokens[i],\n",
 87 |     "                prev=tokens[i-1],\n",
 88 |     "                next=tokens[i+1],\n",
 89 |     "                upper=tokens[i+1][0].isupper(),\n",
 90 |     "                plen=len(tokens[i-1]),\n",
 91 |     "                nlen=len(tokens[i+1]))\n",
 92 |     "    "
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "code",
 97 |    "execution_count": 46,
 98 |    "id": "fa07a737",
 99 |    "metadata": {
100 |     "collapsed": false
101 |    },
102 |    "outputs": [
103 |     {
104 |      "name": "stdout",
105 |      "output_type": "stream",
106 |      "text": [
107 |       "5951\n"
108 |      ]
109 |     }
110 |    ],
111 |    "source": [
112 |     "data = []\n",
113 |     "for i in range(1,len(tokens)-1):\n",
114 |     "    if tokens[i] not in [\".\",\"?\",\"!\"]: continue\n",
115 |     "    c = (i in boundaries)\n",
116 |     "    f = features(tokens,i)\n",
117 |     "    data.append((f,c))\n",
118 |     "pyrandom.shuffle(data)\n",
119 |     "n = len(data)\n",
120 |     "print n\n",
121 |     "training_set = data[n//10:]\n",
122 |     "test_set = data[:n//10]"
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "code",
127 |    "execution_count": 47,
128 |    "id": "7262c00e",
129 |    "metadata": {
130 |     "collapsed": false
131 |    },
132 |    "outputs": [
133 |     {
134 |      "data": {
135 |       "text/plain": [
136 |        "0.9798319327731092"
137 |       ]
138 |      },
139 |      "execution_count": 47,
140 |      "metadata": {},
141 |      "output_type": "execute_result"
142 |     }
143 |    ],
144 |    "source": [
145 |     "classifier = nltk.NaiveBayesClassifier.train(training_set)\n",
146 |     "nltk.classify.accuracy(classifier,test_set)"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": 48,
152 |    "id": "06641c58",
153 |    "metadata": {
154 |     "collapsed": false
155 |    },
156 |    "outputs": [
157 |     {
158 |      "data": {
159 |       "text/plain": [
160 |        "True"
161 |       ]
162 |      },
163 |      "execution_count": 48,
164 |      "metadata": {},
165 |      "output_type": "execute_result"
166 |     }
167 |    ],
168 |    "source": [
169 |     "classifier.classify(features(\"The quick . brown\".split(),2))"
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "code",
174 |    "execution_count": 59,
175 |    "id": "bf296af7",
176 |    "metadata": {
177 |     "collapsed": false
178 |    },
179 |    "outputs": [],
180 |    "source": [
181 |     "def segment_sentences(words):\n",
182 |     "    sentences = [[words[0]]]\n",
183 |     "    for i in range(1,len(words)):\n",
184 |     "        sentences[-1].append(words[i])\n",
185 |     "        c = words[i] in [\".\",\"?\",\"!\"] and classifier.classify(features(words,i))\n",
186 |     "        if c: sentences.append([])\n",
187 |     "    if sentences[-1]==[]: sentences = sentences[:-1]\n",
188 |     "    return sentences\n",
189 |     "    "
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "code",
194 |    "execution_count": 61,
195 |    "id": "40939b59",
196 |    "metadata": {
197 |     "collapsed": false
198 |    },
199 |    "outputs": [
200 |     {
201 |      "data": {
202 |       "text/plain": [
203 |        "[['Smith', 'ran', '.'], ['J', '.', 'Smith', 'really', 'ran', '.']]"
204 |       ]
205 |      },
206 |      "execution_count": 61,
207 |      "metadata": {},
208 |      "output_type": "execute_result"
209 |     }
210 |    ],
211 |    "source": [
212 |     "segment_sentences(\"\"\"Smith ran . J . Smith really ran . \"\"\".split())"
213 |    ]
214 |   }
215 |  ],
216 |  "metadata": {},
217 |  "nbformat": 4,
218 |  "nbformat_minor": 5
219 | }
220 | 


--------------------------------------------------------------------------------
/nlpa-corpora.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "1ce9fe94",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Properties of Corpora"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": 1,
 14 |    "id": "90005a64",
 15 |    "metadata": {
 16 |     "collapsed": false
 17 |    },
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "from nltk.corpus import brown"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "markdown",
 25 |    "id": "2980d0d7",
 26 |    "metadata": {},
 27 |    "source": [
 28 |     "## Corpora are Collections of Files"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": 17,
 34 |    "id": "47104c75",
 35 |    "metadata": {
 36 |     "collapsed": false
 37 |    },
 38 |    "outputs": [
 39 |     {
 40 |      "data": {
 41 |       "text/plain": [
 42 |        "FileSystemPathPointer('/home/tmb/nltk_data/corpora/brown')"
 43 |       ]
 44 |      },
 45 |      "execution_count": 17,
 46 |      "metadata": {},
 47 |      "output_type": "execute_result"
 48 |     }
 49 |    ],
 50 |    "source": [
 51 |     "brown.root"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": 18,
 57 |    "id": "9886db10",
 58 |    "metadata": {
 59 |     "collapsed": false
 60 |    },
 61 |    "outputs": [
 62 |     {
 63 |      "data": {
 64 |       "text/plain": [
 65 |        "'BROWN CORPUS\\n\\nA Standard Corpus of Present-Day Edited American\\nEnglish, for use with Digital Computers.\\n\\nby W. N. Francis and H. Kucera (1964)\\nDepartment of Linguistics, Brown University\\nProvidence, Rhode Island, USA\\n\\nRevised 1971, Revised and Amplified 1979\\n\\nhttp://www.hit.uib.no/icame/brown/bcm.html\\n\\nDistributed with the permission of the copyright holder,\\nredistribution permitted.\\n'"
 66 |       ]
 67 |      },
 68 |      "execution_count": 18,
 69 |      "metadata": {},
 70 |      "output_type": "execute_result"
 71 |     }
 72 |    ],
 73 |    "source": [
 74 |     "brown.readme()"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": 15,
 80 |    "id": "1d690b25",
 81 |    "metadata": {
 82 |     "collapsed": false
 83 |    },
 84 |    "outputs": [
 85 |     {
 86 |      "data": {
 87 |       "text/plain": [
 88 |        "['ca01',\n",
 89 |        " 'ca02',\n",
 90 |        " 'ca03',\n",
 91 |        " 'ca04',\n",
 92 |        " 'ca05',\n",
 93 |        " 'ca06',\n",
 94 |        " 'ca07',\n",
 95 |        " 'ca08',\n",
 96 |        " 'ca09',\n",
 97 |        " 'ca10']"
 98 |       ]
 99 |      },
100 |      "execution_count": 15,
101 |      "metadata": {},
102 |      "output_type": "execute_result"
103 |     }
104 |    ],
105 |    "source": [
106 |     "brown.fileids()[:10]"
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "markdown",
111 |    "id": "2e7d5c4b",
112 |    "metadata": {},
113 |    "source": [
114 |     "Files may have different encodings; the default is ASCII processed as `str`."
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": 16,
120 |    "id": "d2efd5df",
121 |    "metadata": {
122 |     "collapsed": false
123 |    },
124 |    "outputs": [],
125 |    "source": [
126 |     "brown.encoding(\"ca01\")"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "markdown",
131 |    "id": "f9b70616",
132 |    "metadata": {},
133 |    "source": [
134 |     "Files may also be in different categories."
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": 19,
140 |    "id": "39af3549",
141 |    "metadata": {
142 |     "collapsed": false
143 |    },
144 |    "outputs": [
145 |     {
146 |      "data": {
147 |       "text/plain": [
148 |        "['adventure',\n",
149 |        " 'belles_lettres',\n",
150 |        " 'editorial',\n",
151 |        " 'fiction',\n",
152 |        " 'government',\n",
153 |        " 'hobbies',\n",
154 |        " 'humor',\n",
155 |        " 'learned',\n",
156 |        " 'lore',\n",
157 |        " 'mystery',\n",
158 |        " 'news',\n",
159 |        " 'religion',\n",
160 |        " 'reviews',\n",
161 |        " 'romance',\n",
162 |        " 'science_fiction']"
163 |       ]
164 |      },
165 |      "execution_count": 19,
166 |      "metadata": {},
167 |      "output_type": "execute_result"
168 |     }
169 |    ],
170 |    "source": [
171 |     "brown.categories()"
172 |    ]
173 |   },
174 |   {
175 |    "cell_type": "markdown",
176 |    "id": "0ce78685",
177 |    "metadata": {},
178 |    "source": [
179 |     "## Accessing Content"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "markdown",
184 |    "id": "9fe8b232",
185 |    "metadata": {},
186 |    "source": [
187 |     "The corpus abstraction allows you to avoid having to deal with individual files, encodings, etc.\n",
188 |     "\n",
189 |     "That is, you can access all the words, all the text, all the sentences etc. in a corpus from a single object.\n"
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "code",
194 |    "execution_count": 54,
195 |    "id": "caddbbd4",
196 |    "metadata": {
197 |     "collapsed": false
198 |    },
199 |    "outputs": [
200 |     {
201 |      "data": {
202 |       "text/plain": [
203 |        "'\\n\\n\\tThe/at Fulton/np-tl County/nn-tl Grand/jj-tl Jury/nn-tl said/vbd Friday/nr an/at investigation/nn'"
204 |       ]
205 |      },
206 |      "execution_count": 54,
207 |      "metadata": {},
208 |      "output_type": "execute_result"
209 |     }
210 |    ],
211 |    "source": [
212 |     "brown.raw()[:100]"
213 |    ]
214 |   },
215 |   {
216 |    "cell_type": "code",
217 |    "execution_count": 2,
218 |    "id": "f281d7f8",
219 |    "metadata": {
220 |     "collapsed": false
221 |    },
222 |    "outputs": [
223 |     {
224 |      "data": {
225 |       "text/plain": [
226 |        "['The',\n",
227 |        " 'Fulton',\n",
228 |        " 'County',\n",
229 |        " 'Grand',\n",
230 |        " 'Jury',\n",
231 |        " 'said',\n",
232 |        " 'Friday',\n",
233 |        " 'an',\n",
234 |        " 'investigation',\n",
235 |        " 'of']"
236 |       ]
237 |      },
238 |      "execution_count": 2,
239 |      "metadata": {},
240 |      "output_type": "execute_result"
241 |     }
242 |    ],
243 |    "source": [
244 |     "brown.words()[:10]"
245 |    ]
246 |   },
247 |   {
248 |    "cell_type": "code",
249 |    "execution_count": 5,
250 |    "id": "ed836db2",
251 |    "metadata": {
252 |     "collapsed": false
253 |    },
254 |    "outputs": [
255 |     {
256 |      "name": "stdout",
257 |      "output_type": "stream",
258 |      "text": [
259 |       "['The', 'Fulton', 'County', 'Grand', 'Jury']\n",
260 |       "['The', 'jury', 'further', 'said', 'in']\n",
261 |       "['The', 'September-October', 'term', 'jury', 'had']\n",
262 |       "['``', 'Only', 'a', 'relative', 'handful']\n",
263 |       "['The', 'jury', 'said', 'it', 'did']\n",
264 |       "['It', 'recommended', 'that', 'Fulton', 'legislators']\n",
265 |       "['The', 'grand', 'jury', 'commented', 'on']\n",
266 |       "['Merger', 'proposed']\n",
267 |       "['However', ',', 'the', 'jury', 'said']\n",
268 |       "['The', 'City', 'Purchasing', 'Department', ',']\n"
269 |      ]
270 |     }
271 |    ],
272 |    "source": [
273 |     "for s in brown.sents()[:10]: print s[:5]"
274 |    ]
275 |   },
276 |   {
277 |    "cell_type": "code",
278 |    "execution_count": 6,
279 |    "id": "0ac68d8b",
280 |    "metadata": {
281 |     "collapsed": false
282 |    },
283 |    "outputs": [
284 |     {
285 |      "data": {
286 |       "text/plain": [
287 |        "[('The', 'AT'),\n",
288 |        " ('Fulton', 'NP-TL'),\n",
289 |        " ('County', 'NN-TL'),\n",
290 |        " ('Grand', 'JJ-TL'),\n",
291 |        " ('Jury', 'NN-TL'),\n",
292 |        " ('said', 'VBD'),\n",
293 |        " ('Friday', 'NR'),\n",
294 |        " ('an', 'AT'),\n",
295 |        " ('investigation', 'NN'),\n",
296 |        " ('of', 'IN')]"
297 |       ]
298 |      },
299 |      "execution_count": 6,
300 |      "metadata": {},
301 |      "output_type": "execute_result"
302 |     }
303 |    ],
304 |    "source": [
305 |     "brown.tagged_words()[:10]"
306 |    ]
307 |   },
308 |   {
309 |    "cell_type": "code",
310 |    "execution_count": 8,
311 |    "id": "3f909b3b",
312 |    "metadata": {
313 |     "collapsed": false
314 |    },
315 |    "outputs": [
316 |     {
317 |      "data": {
318 |       "text/plain": [
319 |        "[('The', 'AT'),\n",
320 |        " ('Fulton', 'NP-TL'),\n",
321 |        " ('County', 'NN-TL'),\n",
322 |        " ('Grand', 'JJ-TL'),\n",
323 |        " ('Jury', 'NN-TL'),\n",
324 |        " ('said', 'VBD'),\n",
325 |        " ('Friday', 'NR'),\n",
326 |        " ('an', 'AT'),\n",
327 |        " ('investigation', 'NN'),\n",
328 |        " ('of', 'IN')]"
329 |       ]
330 |      },
331 |      "execution_count": 8,
332 |      "metadata": {},
333 |      "output_type": "execute_result"
334 |     }
335 |    ],
336 |    "source": [
337 |     "brown.tagged_sents()[0][:10]"
338 |    ]
339 |   },
340 |   {
341 |    "cell_type": "markdown",
342 |    "id": "edac51c0",
343 |    "metadata": {},
344 |    "source": [
345 |     "# Reading New Corpora"
346 |    ]
347 |   },
348 |   {
349 |    "cell_type": "code",
350 |    "execution_count": 20,
351 |    "id": "79f45f8a",
352 |    "metadata": {
353 |     "collapsed": false
354 |    },
355 |    "outputs": [],
356 |    "source": [
357 |     "import nltk.corpus.reader"
358 |    ]
359 |   },
360 |   {
361 |    "cell_type": "code",
362 |    "execution_count": 30,
363 |    "id": "88ef0e47",
364 |    "metadata": {
365 |     "collapsed": false
366 |    },
367 |    "outputs": [],
368 |    "source": [
369 |     "corpus = nltk.corpus.reader.plaintext.PlaintextCorpusReader(\".\",r\"[ft].*txt\",encoding=\"utf8\")"
370 |    ]
371 |   },
372 |   {
373 |    "cell_type": "code",
374 |    "execution_count": 31,
375 |    "id": "045a15a0",
376 |    "metadata": {
377 |     "collapsed": false
378 |    },
379 |    "outputs": [
380 |     {
381 |      "data": {
382 |       "text/plain": [
383 |        "['faust.txt', 'tomsawyer.txt']"
384 |       ]
385 |      },
386 |      "execution_count": 31,
387 |      "metadata": {},
388 |      "output_type": "execute_result"
389 |     }
390 |    ],
391 |    "source": [
392 |     "corpus.fileids()"
393 |    ]
394 |   },
395 |   {
396 |    "cell_type": "code",
397 |    "execution_count": 32,
398 |    "id": "d25b1b3d",
399 |    "metadata": {
400 |     "collapsed": false
401 |    },
402 |    "outputs": [
403 |     {
404 |      "data": {
405 |       "text/plain": [
406 |        "u'Faust: Der Trag\\xf6die erster Teil\\n\\nJohann Wolfgang von Goethe\\n\\n\\nZueignung.\\n\\nIhr naht euch wieder, schw'"
407 |       ]
408 |      },
409 |      "execution_count": 32,
410 |      "metadata": {},
411 |      "output_type": "execute_result"
412 |     }
413 |    ],
414 |    "source": [
415 |     "corpus.raw()[:100]"
416 |    ]
417 |   },
418 |   {
419 |    "cell_type": "code",
420 |    "execution_count": 33,
421 |    "id": "b69ac46c",
422 |    "metadata": {
423 |     "collapsed": false
424 |    },
425 |    "outputs": [
426 |     {
427 |      "data": {
428 |       "text/plain": [
429 |        "[[[u'Faust', u':', u'Der', u'Trag\\xf6die', u'erster', u'Teil']],\n",
430 |        " [[u'Johann', u'Wolfgang', u'von', u'Goethe']]]"
431 |       ]
432 |      },
433 |      "execution_count": 33,
434 |      "metadata": {},
435 |      "output_type": "execute_result"
436 |     }
437 |    ],
438 |    "source": [
439 |     "corpus.paras()[:2]"
440 |    ]
441 |   },
442 |   {
443 |    "cell_type": "code",
444 |    "execution_count": 39,
445 |    "id": "9331354f",
446 |    "metadata": {
447 |     "collapsed": false
448 |    },
449 |    "outputs": [
450 |     {
451 |      "name": "stdout",
452 |      "output_type": "stream",
453 |      "text": [
454 |       "[u'FAUST', u':', u'Vor', u'jenem', u'droben', u'steht', u'geb\\xfcckt', u',', u'Der', u'helfen', u'lehrt', u'und', u'H\\xfclfe', u'schickt', u'.']\n"
455 |      ]
456 |     }
457 |    ],
458 |    "source": [
459 |     "print corpus.sents()[500]"
460 |    ]
461 |   },
462 |   {
463 |    "cell_type": "code",
464 |    "execution_count": 40,
465 |    "id": "68818aff",
466 |    "metadata": {
467 |     "collapsed": false
468 |    },
469 |    "outputs": [
470 |     {
471 |      "name": "stdout",
472 |      "output_type": "stream",
473 |      "text": [
474 |       "[u'heute', u'!', u'DICHTER', u':', u'O', u'sprich', u'mir', u'nicht', u'von', u'jener']\n"
475 |      ]
476 |     }
477 |    ],
478 |    "source": [
479 |     "print corpus.words()[500:510]"
480 |    ]
481 |   },
482 |   {
483 |    "cell_type": "code",
484 |    "execution_count": 44,
485 |    "id": "23376f31",
486 |    "metadata": {
487 |     "collapsed": false
488 |    },
489 |    "outputs": [],
490 |    "source": [
491 |     "from nltk import Text\n",
492 |     "text = Text(corpus.words(\"tomsawyer.txt\"))"
493 |    ]
494 |   },
495 |   {
496 |    "cell_type": "code",
497 |    "execution_count": 47,
498 |    "id": "1b2dcb00",
499 |    "metadata": {
500 |     "collapsed": false
501 |    },
502 |    "outputs": [
503 |     {
504 |      "name": "stdout",
505 |      "output_type": "stream",
506 |      "text": [
507 |       "Building index...\n",
508 |       "Displaying 25 of 647 matches:\n",
509 |       "\" TOM !\" No answer . \" What ' s gone with that boy , I wonder ? You TOM !\" No \n",
510 |       "ding down and punching under the bed with the broom , and so she needed breath\n",
511 |       "eded breath to punctuate the punches with . She resurrected nothing but the ca\n",
512 |       " - brother ) Sid was already through with his part of the work ( picking up ch\n",
513 |       "et vanity to believe she was endowed with a talent for dark and mysterious dip\n",
514 |       " sewed . \" Bother ! Well , go ' long with you . I ' d made sure you ' d played\n",
515 |       " didn ' t think you sewed his collar with white thread , but it ' s black .\" \"\n",
516 |       "it ' s black .\" \" Why , I did sew it with white ! Tom !\" But Tom did not wait \n",
517 |       " Confound it ! sometimes she sews it with white , and sometimes she sews it wi\n",
518 |       "th white , and sometimes she sews it with black . I wish to geeminy she ' d st\n",
519 |       "f it , and he strode down the street with his mouth full of harmony __________\n",
520 |       "ure is concerned , the advantage was with the boy , not the astronomer . The s\n",
521 |       "art , don ' t you ? I could lick you with one hand tied behind me , if I wante\n",
522 |       "do it .\" \" Well I will , if you fool with me .\" \" Oh yes -- I ' ve seen whole \n",
523 |       "n ' t either .\" So they stood , each with a foot placed at an angle as a brace\n",
524 |       " angle as a brace , and both shoving with might and main , and glowering at ea\n",
525 |       "d main , and glowering at each other with hate . But neither could get an adva\n",
526 |       "nd flushed , each relaxed his strain with watchful caution , and Tom said : \" \n",
527 |       "other on you , and he can thrash you with his little finger , and I ' ll make \n",
528 |       "it so .\" Tom drew a line in the dust with his big toe , and said : \" I dare yo\n",
529 |       " out of his pocket and held them out with derision . Tom struck them to the gr\n",
530 |       "er ' s nose , and covered themselves with dust and glory . Presently the confu\n",
531 |       "tride the new boy , and pounding him with his fists . \" Holler ' nuff !\" said \n",
532 |       "Better look out who you ' re fooling with next time .\" The new boy went off br\n",
533 |       "ht him out .\" To which Tom responded with jeers , and started off in high feat\n"
534 |      ]
535 |     }
536 |    ],
537 |    "source": [
538 |     "text.concordance(\"with\")"
539 |    ]
540 |   },
541 |   {
542 |    "cell_type": "code",
543 |    "execution_count": 48,
544 |    "id": "487f92bf",
545 |    "metadata": {
546 |     "collapsed": false
547 |    },
548 |    "outputs": [
549 |     {
550 |      "name": "stdout",
551 |      "output_type": "stream",
552 |      "text": [
553 |       "Building word-context index...\n",
554 |       "and in on to for of was at into up s that through but if just upon\n",
555 |       "what as by\n"
556 |      ]
557 |     }
558 |    ],
559 |    "source": [
560 |     "text.similar(\"with\")"
561 |    ]
562 |   },
563 |   {
564 |    "cell_type": "code",
565 |    "execution_count": 50,
566 |    "id": "fdaee38c",
567 |    "metadata": {
568 |     "collapsed": false
569 |    },
570 |    "outputs": [
571 |     {
572 |      "name": "stdout",
573 |      "output_type": "stream",
574 |      "text": [
575 |       "but_the is_a long_you up_a\n"
576 |      ]
577 |     }
578 |    ],
579 |    "source": [
580 |     "text.common_contexts([\"with\",\"as\"])"
581 |    ]
582 |   }
583 |  ],
584 |  "metadata": {},
585 |  "nbformat": 4,
586 |  "nbformat_minor": 5
587 | }
588 | 


--------------------------------------------------------------------------------
/nlpa-course.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 8,
  6 |    "id": "b4913d2b",
  7 |    "metadata": {
  8 |     "collapsed": false
  9 |    },
 10 |    "outputs": [],
 11 |    "source": [
 12 |     "from IPython.core.display import Image,HTML\n",
 13 |     "from IPython.lib.display import YouTubeVideo"
 14 |    ]
 15 |   },
 16 |   {
 17 |    "cell_type": "markdown",
 18 |    "id": "928e08dc",
 19 |    "metadata": {},
 20 |    "source": [
 21 |     "# Introduction to Natural Language Processing and Applications"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "markdown",
 26 |    "id": "724cc955",
 27 |    "metadata": {},
 28 |    "source": [
 29 |     "Goals of the course:\n",
 30 |     "\n",
 31 |     "- understanding of major concepts and algorithms in text and natural language processing\n",
 32 |     "- acquire the skills to work with research code in NLP, and make contributions to it\n",
 33 |     "\n",
 34 |     "*It is not just sufficient to memorize some terms and algorithms, you must be able to apply them.*"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "markdown",
 39 |    "id": "6a673f8e",
 40 |    "metadata": {},
 41 |    "source": [
 42 |     "Course Content:\n",
 43 |     "\n",
 44 |     "- text processing and encoding\n",
 45 |     "- string algorithms, edit distance\n",
 46 |     "- statistical language models\n",
 47 |     "- spell correction\n",
 48 |     "- n-gram models\n",
 49 |     "- word sense disambiguation\n",
 50 |     "- Markov models, parts-of-speech tagging\n",
 51 |     "- probabilistic grammars and parsing\n",
 52 |     "- text alignment, clustering, text categorization\n",
 53 |     "- statistical machine translation\n",
 54 |     "- applications in speech recognition, handwriting recognition, and OCR\n",
 55 |     "- language acquisition\n",
 56 |     "- machine learning for NLP\n",
 57 |     "- cognitive and psychological aspects of NLP"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "code",
 62 |    "execution_count": 24,
 63 |    "id": "e3f6696d",
 64 |    "metadata": {
 65 |     "collapsed": false
 66 |    },
 67 |    "outputs": [
 68 |     {
 69 |      "data": {
 70 |       "text/html": [
 71 |        "\n",
 72 |        "            <iframe\n",
 73 |        "                width=\"400\"\n",
 74 |        "                height=\"300\"\n",
 75 |        "                src=\"http://www.youtube.com/embed/https://www.youtube.com/watch?v=PHzoX2AIzqo\"\n",
 76 |        "                frameborder=\"0\"\n",
 77 |        "                allowfullscreen\n",
 78 |        "            ></iframe>\n",
 79 |        "        "
 80 |       ],
 81 |       "text/plain": [
 82 |        "<IPython.lib.display.YouTubeVideo at 0x3e6d710>"
 83 |       ]
 84 |      },
 85 |      "execution_count": 24,
 86 |      "metadata": {},
 87 |      "output_type": "execute_result"
 88 |     }
 89 |    ],
 90 |    "source": [
 91 |     "YouTubeVideo(\"https://www.youtube.com/watch?v=PHzoX2AIzqo\")"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "markdown",
 96 |    "id": "f79f105c",
 97 |    "metadata": {},
 98 |    "source": [
 99 |     "## IUPR Home Page"
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "code",
104 |    "execution_count": 20,
105 |    "id": "d829365f",
106 |    "metadata": {
107 |     "collapsed": false
108 |    },
109 |    "outputs": [
110 |     {
111 |      "data": {
112 |       "text/html": [
113 |        "<iframe src='http://www.iupr.com/' width=1000 height=500>"
114 |       ],
115 |       "text/plain": [
116 |        "<IPython.core.display.HTML at 0x3e6d510>"
117 |       ]
118 |      },
119 |      "execution_count": 20,
120 |      "metadata": {},
121 |      "output_type": "execute_result"
122 |     }
123 |    ],
124 |    "source": [
125 |     "HTML(\"<iframe src='http://www.iupr.com/' width=1000 height=500>\")"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "markdown",
130 |    "id": "c72e177c",
131 |    "metadata": {},
132 |    "source": [
133 |     "## NLPA Home Page"
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "code",
138 |    "execution_count": 21,
139 |    "id": "912645e4",
140 |    "metadata": {
141 |     "collapsed": false
142 |    },
143 |    "outputs": [
144 |     {
145 |      "data": {
146 |       "text/html": [
147 |        "<iframe src='http://nlpa.iupr.com/' width=1000 height=500>"
148 |       ],
149 |       "text/plain": [
150 |        "<IPython.core.display.HTML at 0x3e6d610>"
151 |       ]
152 |      },
153 |      "execution_count": 21,
154 |      "metadata": {},
155 |      "output_type": "execute_result"
156 |     }
157 |    ],
158 |    "source": [
159 |     "HTML(\"<iframe src='http://nlpa.iupr.com/' width=1000 height=500>\")"
160 |    ]
161 |   },
162 |   {
163 |    "cell_type": "markdown",
164 |    "id": "d5f2746d",
165 |    "metadata": {},
166 |    "source": [
167 |     "## Oral Exam Guide"
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "code",
172 |    "execution_count": 23,
173 |    "id": "b1b7071f",
174 |    "metadata": {
175 |     "collapsed": false
176 |    },
177 |    "outputs": [
178 |     {
179 |      "data": {
180 |       "text/html": [
181 |        "<iframe src='http://goo.gl/SllCx' width=1000 height=500>"
182 |       ],
183 |       "text/plain": [
184 |        "<IPython.core.display.HTML at 0x3e6d290>"
185 |       ]
186 |      },
187 |      "execution_count": 23,
188 |      "metadata": {},
189 |      "output_type": "execute_result"
190 |     }
191 |    ],
192 |    "source": [
193 |     "HTML(\"<iframe src='http://goo.gl/SllCx' width=1000 height=500>\")"
194 |    ]
195 |   },
196 |   {
197 |    "cell_type": "markdown",
198 |    "id": "80b620e7",
199 |    "metadata": {},
200 |    "source": [
201 |     "## Keep in Mind"
202 |    ]
203 |   },
204 |   {
205 |    "cell_type": "markdown",
206 |    "id": "ae5c007d",
207 |    "metadata": {},
208 |    "source": [
209 |     "- after each lecture, identify definition, concepts, algorithms, proofs\n",
210 |     "- do all the exercises\n",
211 |     "- come up with, and solve, your own problems"
212 |    ]
213 |   },
214 |   {
215 |    "cell_type": "code",
216 |    "execution_count": null,
217 |    "id": "4dc12b84",
218 |    "metadata": {
219 |     "collapsed": false
220 |    },
221 |    "outputs": [],
222 |    "source": []
223 |   }
224 |  ],
225 |  "metadata": {},
226 |  "nbformat": 4,
227 |  "nbformat_minor": 5
228 | }
229 | 


--------------------------------------------------------------------------------
/nlpa-downloading-tomsawyer.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "47e4a028",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Downloading Text from the Internet"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "markdown",
 13 |    "id": "d22cfae0",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "Here is a simple example of how to download and reformat text from the Internet."
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "markdown",
 21 |    "id": "c1f802f0",
 22 |    "metadata": {},
 23 |    "source": [
 24 |     "Let's start by using `curl` to get some text data.\n",
 25 |     "We need the `-L` because this is a URL with a redirect built in."
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 21,
 31 |    "id": "2abed08d",
 32 |    "metadata": {
 33 |     "collapsed": false
 34 |    },
 35 |    "outputs": [
 36 |     {
 37 |      "name": "stdout",
 38 |      "output_type": "stream",
 39 |      "text": [
 40 |       "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\r\n",
 41 |       "                                 Dload  Upload   Total   Spent    Left  Speed\r\n",
 42 |       "100   336    0   336    0     0   5905      0 --:--:-- --:--:-- --:--:-- 22400\r\n",
 43 |       "100  467k  100  467k    0     0   182k      0  0:00:02  0:00:02 --:--:--  255k\r\n"
 44 |      ]
 45 |     }
 46 |    ],
 47 |    "source": [
 48 |     "!curl -L 'http://goo.gl/g3aE4' > tomsawyer.html"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "markdown",
 53 |    "id": "dcf4c30e",
 54 |    "metadata": {},
 55 |    "source": [
 56 |     "For single pages, `curl` is generally the best tool to use.\n",
 57 |     "For whole directory trees and mirroring, `wget` is what people usually use."
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "markdown",
 62 |    "id": "8038bc1c",
 63 |    "metadata": {},
 64 |    "source": [
 65 |     "If we look at it, we got the page in HTML format."
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": 22,
 71 |    "id": "6471e588",
 72 |    "metadata": {
 73 |     "collapsed": false
 74 |    },
 75 |    "outputs": [
 76 |     {
 77 |      "name": "stdout",
 78 |      "output_type": "stream",
 79 |      "text": [
 80 |       "\r\n",
 81 |       "\r\n",
 82 |       "<HTML>\r\n",
 83 |       "<HEAD>\r\n",
 84 |       "<TITLE>The Adventures of Tom Sawyer </TITLE>\r\n",
 85 |       "</HEAD>\r\n",
 86 |       "<BODY BGCOLOR=\"#FFFFF2\">\r\n",
 87 |       "<CENTER><B>Twain, Mark, 1835-1910. The Adventures of Tom Sawyer </B>\r\n",
 88 |       "<BR>\r\n",
 89 |       "Electronic Text Center, University of Virginia Library</CENTER>\r\n"
 90 |      ]
 91 |     }
 92 |    ],
 93 |    "source": [
 94 |     "!head tomsawyer.html"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "markdown",
 99 |    "id": "b55d4388",
100 |    "metadata": {},
101 |    "source": [
102 |     "Now let's convert it to text format (we could also have used `lynx -dump URL` directly).\n",
103 |     "There are several other tools for converting HTML to text; they may or may not work better."
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "code",
108 |    "execution_count": 23,
109 |    "id": "212d3e8f",
110 |    "metadata": {
111 |     "collapsed": false
112 |    },
113 |    "outputs": [],
114 |    "source": [
115 |     "!lynx -dump tomsawyer.html > tomsawyer.txt"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "markdown",
120 |    "id": "732de11b",
121 |    "metadata": {},
122 |    "source": [
123 |     "The output is pretty good, but has some extra spaces at the beginning."
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "code",
128 |    "execution_count": 24,
129 |    "id": "262528d6",
130 |    "metadata": {
131 |     "collapsed": false
132 |    },
133 |    "outputs": [
134 |     {
135 |      "name": "stdout",
136 |      "output_type": "stream",
137 |      "text": [
138 |       "            Twain, Mark, 1835-1910. The Adventures of Tom Sawyer\r\n",
139 |       "           Electronic Text Center, University of Virginia Library\r\n",
140 |       "\r\n",
141 |       "                   | [1]Table of Contents for this work |\r\n",
142 |       "\r\n",
143 |       "           | [2]All on-line databases | [3]Etext Center Homepage |\r\n",
144 |       "     __________________________________________________________________\r\n",
145 |       "\r\n",
146 |       "   About the electronic version\r\n",
147 |       "   The Adventures of Tom Sawyer\r\n"
148 |      ]
149 |     }
150 |    ],
151 |    "source": [
152 |     "!head tomsawyer.txt"
153 |    ]
154 |   },
155 |   {
156 |    "cell_type": "code",
157 |    "execution_count": 25,
158 |    "id": "e11f51b3",
159 |    "metadata": {
160 |     "collapsed": false
161 |    },
162 |    "outputs": [
163 |     {
164 |      "name": "stdout",
165 |      "output_type": "stream",
166 |      "text": [
167 |       "   curtain of a second-story window. Was the sacred presence there? He\r\n",
168 |       "   climbed the fence, threaded his stealthy way through the plants, till\r\n",
169 |       "   he stood under that window; he looked up at it long, and with emotion;\r\n",
170 |       "   then he laid him down on the ground under it, disposing himself upon\r\n",
171 |       "   his back, with his hands clasped upon his breast and holding his poor\r\n",
172 |       "   wilted flower. And thus he would\r\n",
173 |       "            _____________________________________________________\r\n",
174 |       "\r\n",
175 |       "                                    -43-\r\n",
176 |       "\r\n",
177 |       "   die -- out in the cold world, with no shelter over his homeless head,\r\n",
178 |       "   no friendly hand to wipe the death-damps from his brow, no loving face\r\n",
179 |       "   to bend pityingly over him when the great agony came. And thus she\r\n",
180 |       "   would see him when she looked out upon the glad morning, and oh! would\r\n",
181 |       "   she drop one little tear upon his poor, lifeless form, would she heave\r\n",
182 |       "   one little sigh to see a bright young life so rudely blighted, so\r\n",
183 |       "   untimely cut down?\r\n",
184 |       "\r\n",
185 |       "      The window went up, a maid-servant's discordant voice profaned the\r\n",
186 |       "   holy calm, and a deluge of water drenched the prone martyr's remains!\r\n",
187 |       "\r\n"
188 |      ]
189 |     }
190 |    ],
191 |    "source": [
192 |     "!sed '1000,1020!d' tomsawyer.txt"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "markdown",
197 |    "id": "04d6068a",
198 |    "metadata": {},
199 |    "source": [
200 |     "Let's fix these with `sed`."
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "code",
205 |    "execution_count": 26,
206 |    "id": "d6cdd4cf",
207 |    "metadata": {
208 |     "collapsed": false
209 |    },
210 |    "outputs": [],
211 |    "source": [
212 |     "!sed 's/^ *//' -i tomsawyer.txt"
213 |    ]
214 |   },
215 |   {
216 |    "cell_type": "markdown",
217 |    "id": "9948e20c",
218 |    "metadata": {},
219 |    "source": [
220 |     "Also, there's a header at the beginning of the file (we can see that from the web page)."
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "code",
225 |    "execution_count": 27,
226 |    "id": "86c376c0",
227 |    "metadata": {
228 |     "collapsed": false
229 |    },
230 |    "outputs": [
231 |     {
232 |      "name": "stdout",
233 |      "output_type": "stream",
234 |      "text": [
235 |       "About the print version\r\n",
236 |       "The Adventures of Tom Sawyer\r\n",
237 |       "Mark Twain\r\n",
238 |       "Harper and Brothers\r\n",
239 |       "New York and London\r\n",
240 |       "1903\r\n",
241 |       "\r\n",
242 |       "Author's National Edition: The Writings of Mark Twain, Vol. XII\r\n",
243 |       "\r\n",
244 |       "Spell-check not performed due to presence of dialect.\r\n",
245 |       "Published: 1876\r\n"
246 |      ]
247 |     }
248 |    ],
249 |    "source": [
250 |     "!sed '/About the print/,+10!d' tomsawyer.txt"
251 |    ]
252 |   },
253 |   {
254 |    "cell_type": "code",
255 |    "execution_count": 28,
256 |    "id": "a604f6ef",
257 |    "metadata": {
258 |     "collapsed": false
259 |    },
260 |    "outputs": [],
261 |    "source": [
262 |     "!sed '1,/About the print/d' -i tomsawyer.txt"
263 |    ]
264 |   },
265 |   {
266 |    "cell_type": "code",
267 |    "execution_count": 29,
268 |    "id": "a3ff9f2c",
269 |    "metadata": {
270 |     "collapsed": false
271 |    },
272 |    "outputs": [
273 |     {
274 |      "name": "stdout",
275 |      "output_type": "stream",
276 |      "text": [
277 |       "The Adventures of Tom Sawyer\r\n",
278 |       "Mark Twain\r\n",
279 |       "Harper and Brothers\r\n",
280 |       "New York and London\r\n",
281 |       "1903\r\n",
282 |       "\r\n",
283 |       "Author's National Edition: The Writings of Mark Twain, Vol. XII\r\n",
284 |       "\r\n",
285 |       "Spell-check not performed due to presence of dialect.\r\n",
286 |       "Published: 1876\r\n"
287 |      ]
288 |     }
289 |    ],
290 |    "source": [
291 |     "!head tomsawyer.txt"
292 |    ]
293 |   },
294 |   {
295 |    "cell_type": "markdown",
296 |    "id": "bc5eda7f",
297 |    "metadata": {},
298 |    "source": [
299 |     "# Other Converters"
300 |    ]
301 |   },
302 |   {
303 |    "cell_type": "markdown",
304 |    "id": "6d0008bb",
305 |    "metadata": {},
306 |    "source": [
307 |     "If you don't like that kind of editing, there are a bunch of other converters you can use.\n",
308 |     "To find them, use `apt-cache search` or `synaptic` on Ubuntu or Debian.\n",
309 |     "Other Linux distributions have their own search tools.\n",
310 |     "You can also check [Freecode](http://www.freecode.com/) (AKA Freshmeat)"
311 |    ]
312 |   },
313 |   {
314 |    "cell_type": "code",
315 |    "execution_count": 33,
316 |    "id": "8b6a29fd",
317 |    "metadata": {
318 |     "collapsed": false
319 |    },
320 |    "outputs": [
321 |     {
322 |      "name": "stdout",
323 |      "output_type": "stream",
324 |      "text": [
325 |       "html2text - advanced HTML to text converter\r\n",
326 |       "poppler-utils - PDF utilities (based on Poppler)\r\n",
327 |       "xmlto - XML-to-any converter\r\n",
328 |       "wap-wml-tools - Wireless Markup Language development and test tools\r\n",
329 |       "highlight - Universal source code to formatted text converter\r\n",
330 |       "highlight-common - source code to formatted text converter (architecture independent files)\r\n",
331 |       "html2ps - HTML to PostScript converter\r\n",
332 |       "khmerconverter - converts between legacy Khmer encodings and Unicode\r\n",
333 |       "libghc-pandoc-dev - general markup converter\r\n",
334 |       "libghc-pandoc-doc - general markup converter\r\n",
335 |       "libghc-pandoc-prof - general markup converter\r\n",
336 |       "libhighlight-perl - perl bindings for highlight source code to formatted text converter\r\n",
337 |       "mira-assembler - Whole Genome Shotgun and EST Sequence Assembler\r\n",
338 |       "pandoc - general markup converter\r\n",
339 |       "php-text-wiki - transforms Wiki and BBCode markup into XHTML, LaTeX or plain text markup\r\n",
340 |       "pod2pdf - Plain Old Documentation to Portable Document Format converter\r\n",
341 |       "python-pdfminer - PDF parser and analyser\r\n",
342 |       "python-zope.app.renderer - Zope 3 Text Renderer Framework\r\n",
343 |       "src2tex - A converter from source program files to TeX format files\r\n",
344 |       "stx2any - Converter from structured plain text to other formats\r\n",
345 |       "t2html - text to HTML converter implemented in Perl\r\n",
346 |       "txt2html - Text to HTML converter\r\n",
347 |       "uni2ascii - UTF-8 to 7-bit ASCII and vice versa converter\r\n",
348 |       "unrtf - RTF to other formats converter\r\n",
349 |       "vilistextum - a HTML to text converter\r\n",
350 |       "wp2x - WordPerfect 5.x documents to whatever converter\r\n",
351 |       "yodl - Your Own Document Language (Yodl) is a pre-document language\r\n",
352 |       "yodl-doc - Documenation for Your Own Document Language (Yodl)\r\n"
353 |      ]
354 |     }
355 |    ],
356 |    "source": [
357 |     "!apt-cache search html text converter\n",
358 |     "# install with \"sudo apt-get install ...\" if needed"
359 |    ]
360 |   },
361 |   {
362 |    "cell_type": "code",
363 |    "execution_count": 31,
364 |    "id": "102decb1",
365 |    "metadata": {
366 |     "collapsed": false
367 |    },
368 |    "outputs": [
369 |     {
370 |      "name": "stdout",
371 |      "output_type": "stream",
372 |      "text": [
373 |       "his speckled straw hat. He now looked exceedingly improved and uncomfortable.\r\n",
374 |       "He was fully as uncomfortable as he looked; for there was a restraint about\r\n",
375 |       "whole clothes and cleanliness that galled him. He hoped that Mary would forget\r\n",
376 |       "his shoes, but the hope was blighted; she coated them thoroughly with tallow,\r\n",
377 |       "as was the\r\n",
378 |       "\r\n",
379 |       "===============================================================================\r\n",
380 |       "\r\n",
381 |       "                                     -48-\r\n",
382 |       "\r\n",
383 |       "\r\n",
384 |       "custom, and brought them out. He lost his temper and said he was always being\r\n",
385 |       "made to do everything he didn't want to do. But Mary said, persuasively:\r\n",
386 |       "   \"Please, Tom -- that's a good boy.\"\r\n",
387 |       "   So he got into the shoes snarling. Mary was soon ready, and the three\r\n",
388 |       "children set out for Sunday-school -- a place that Tom hated with his whole\r\n",
389 |       "heart; but Sid and Mary were fond of it.\r\n",
390 |       "   Sabbath-school hours were from nine to half-past ten; and then church\r\n",
391 |       "service. Two of the children always remained for the sermon voluntarily, and\r\n",
392 |       "the other always remained too -- for stronger reasons. The church's high-\r\n",
393 |       "backed, uncushioned pews would seat about three hundred persons; the edifice\r\n"
394 |      ]
395 |     }
396 |    ],
397 |    "source": [
398 |     "!html2text tomsawyer.html | sed '1000,1020!d'"
399 |    ]
400 |   },
401 |   {
402 |    "cell_type": "code",
403 |    "execution_count": 32,
404 |    "id": "ae0a1239",
405 |    "metadata": {
406 |     "collapsed": false
407 |    },
408 |    "outputs": [
409 |     {
410 |      "name": "stdout",
411 |      "output_type": "stream",
412 |      "text": [
413 |       "pityingly over him when the great agony came. And thus she would see him\r\n",
414 |       "when she looked out upon the glad morning, and oh! would she drop one little\r\n",
415 |       "tear upon his poor, lifeless form, would she heave one little sigh to see a\r\n",
416 |       "bright young life so rudely blighted, so untimely cut down?\r\n",
417 |       "\r\n",
418 |       "���The window went up, a maid-servant's discordant voice profaned the holy\r\n",
419 |       "calm, and a deluge of water drenched the prone martyr's remains!\r\n",
420 |       "\r\n",
421 |       "���The strangling hero sprang up with a relieving snort. There was a whiz as\r\n",
422 |       "of a missile in the air, mingled with the murmur of a curse, a sound as of\r\n",
423 |       "shivering glass followed, and a small, vague form went over the fence and\r\n",
424 |       "shot away in the gloom.\r\n",
425 |       "\r\n",
426 |       "���Not long after, as Tom, all undressed for bed, was surveying his drenched\r\n",
427 |       "garments by the light of a tallow dip, Sid woke up; but if he had any dim\r\n",
428 |       "idea of making any \"references to allusions,\" he thought better of it and\r\n",
429 |       "held his peace, for there was danger in Tom's eye.\r\n",
430 |       "\r\n",
431 |       "���Tom turned in without the added vexation of prayers, and Sid made mental\r\n",
432 |       "note of the omission.\r\n",
433 |       "\r\n"
434 |      ]
435 |     }
436 |    ],
437 |    "source": [
438 |     "!vilistextum tomsawyer.html - | sed '1000,1020!d'"
439 |    ]
440 |   }
441 |  ],
442 |  "metadata": {},
443 |  "nbformat": 4,
444 |  "nbformat_minor": 5
445 | }
446 | 


--------------------------------------------------------------------------------
/nlpa-find-xargs.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "69e195ea",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Operating on Directory Trees"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": 46,
 14 |    "id": "5871d5a9",
 15 |    "metadata": {
 16 |     "collapsed": false
 17 |    },
 18 |    "outputs": [
 19 |     {
 20 |      "name": "stdout",
 21 |      "output_type": "stream",
 22 |      "text": [
 23 |       "CONTENTS  ca41\t\t     cc10  ce18  cf24  cg18  cg60  ch27  cj39  ck01  cl14  cn26\r\n",
 24 |       "README\t  ca42\t\t     cc11  ce19  cf25  cg19  cg61  ch28  cj40  ck02  cl15  cn27\r\n",
 25 |       "ca01\t  ca43\t\t     cc12  ce20  cf26  cg20  cg62  ch29  cj41  ck03  cl16  cn28\r\n",
 26 |       "ca02\t  ca44\t\t     cc13  ce21  cf27  cg21  cg63  ch30  cj42  ck04  cl17  cn29\r\n",
 27 |       "ca03\t  categories.pickle  cc14  ce22  cf28  cg22  cg64  cj01  cj43  ck05  cl18  cp01\r\n",
 28 |       "ca04\t  cats.txt\t     cc15  ce23  cf29  cg23  cg65  cj02  cj44  ck06  cl19  cp02\r\n",
 29 |       "ca05\t  cb01\t\t     cc16  ce24  cf30  cg24  cg66  cj03  cj45  ck07  cl20  cp03\r\n",
 30 |       "ca06\t  cb02\t\t     cc17  ce25  cf31  cg25  cg67  cj04  cj46  ck08  cl21  cp04\r\n",
 31 |       "ca07\t  cb03\t\t     cd01  ce26  cf32  cg26  cg68  cj05  cj47  ck09  cl22  cp05\r\n",
 32 |       "ca08\t  cb04\t\t     cd02  ce27  cf33  cg27  cg69  cj06  cj48  ck10  cl23  cp06\r\n",
 33 |       "ca09\t  cb05\t\t     cd03  ce28  cf34  cg28  cg70  cj07  cj49  ck11  cl24  cp07\r\n",
 34 |       "ca10\t  cb06\t\t     cd04  ce29  cf35  cg29  cg71  cj08  cj50  ck12  cm01  cp08\r\n",
 35 |       "ca11\t  cb07\t\t     cd05  ce30  cf36  cg30  cg72  cj09  cj51  ck13  cm02  cp09\r\n",
 36 |       "ca12\t  cb08\t\t     cd06  ce31  cf37  cg31  cg73  cj10  cj52  ck14  cm03  cp10\r\n",
 37 |       "ca13\t  cb09\t\t     cd07  ce32  cf38  cg32  cg74  cj11  cj53  ck15  cm04  cp11\r\n",
 38 |       "ca14\t  cb10\t\t     cd08  ce33  cf39  cg33  cg75  cj12  cj54  ck16  cm05  cp12\r\n",
 39 |       "ca15\t  cb11\t\t     cd09  ce34  cf40  cg34  ch01  cj13  cj55  ck17  cm06  cp13\r\n",
 40 |       "ca16\t  cb12\t\t     cd10  ce35  cf41  cg35  ch02  cj14  cj56  ck18  cn01  cp14\r\n",
 41 |       "ca17\t  cb13\t\t     cd11  ce36  cf42  cg36  ch03  cj15  cj57  ck19  cn02  cp15\r\n",
 42 |       "ca18\t  cb14\t\t     cd12  cf01  cf43  cg37  ch04  cj16  cj58  ck20  cn03  cp16\r\n",
 43 |       "ca19\t  cb15\t\t     cd13  cf02  cf44  cg38  ch05  cj17  cj59  ck21  cn04  cp17\r\n",
 44 |       "ca20\t  cb16\t\t     cd14  cf03  cf45  cg39  ch06  cj18  cj60  ck22  cn05  cp18\r\n",
 45 |       "ca21\t  cb17\t\t     cd15  cf04  cf46  cg40  ch07  cj19  cj61  ck23  cn06  cp19\r\n",
 46 |       "ca22\t  cb18\t\t     cd16  cf05  cf47  cg41  ch08  cj20  cj62  ck24  cn07  cp20\r\n",
 47 |       "ca23\t  cb19\t\t     cd17  cf06  cf48  cg42  ch09  cj21  cj63  ck25  cn08  cp21\r\n",
 48 |       "ca24\t  cb20\t\t     ce01  cf07  cg01  cg43  ch10  cj22  cj64  ck26  cn09  cp22\r\n",
 49 |       "ca25\t  cb21\t\t     ce02  cf08  cg02  cg44  ch11  cj23  cj65  ck27  cn10  cp23\r\n",
 50 |       "ca26\t  cb22\t\t     ce03  cf09  cg03  cg45  ch12  cj24  cj66  ck28  cn11  cp24\r\n",
 51 |       "ca27\t  cb23\t\t     ce04  cf10  cg04  cg46  ch13  cj25  cj67  ck29  cn12  cp25\r\n",
 52 |       "ca28\t  cb24\t\t     ce05  cf11  cg05  cg47  ch14  cj26  cj68  cl01  cn13  cp26\r\n",
 53 |       "ca29\t  cb25\t\t     ce06  cf12  cg06  cg48  ch15  cj27  cj69  cl02  cn14  cp27\r\n",
 54 |       "ca30\t  cb26\t\t     ce07  cf13  cg07  cg49  ch16  cj28  cj70  cl03  cn15  cp28\r\n",
 55 |       "ca31\t  cb27\t\t     ce08  cf14  cg08  cg50  ch17  cj29  cj71  cl04  cn16  cp29\r\n",
 56 |       "ca32\t  cc01\t\t     ce09  cf15  cg09  cg51  ch18  cj30  cj72  cl05  cn17  cr01\r\n",
 57 |       "ca33\t  cc02\t\t     ce10  cf16  cg10  cg52  ch19  cj31  cj73  cl06  cn18  cr02\r\n",
 58 |       "ca34\t  cc03\t\t     ce11  cf17  cg11  cg53  ch20  cj32  cj74  cl07  cn19  cr03\r\n",
 59 |       "ca35\t  cc04\t\t     ce12  cf18  cg12  cg54  ch21  cj33  cj75  cl08  cn20  cr04\r\n",
 60 |       "ca36\t  cc05\t\t     ce13  cf19  cg13  cg55  ch22  cj34  cj76  cl09  cn21  cr05\r\n",
 61 |       "ca37\t  cc06\t\t     ce14  cf20  cg14  cg56  ch23  cj35  cj77  cl10  cn22  cr06\r\n",
 62 |       "ca38\t  cc07\t\t     ce15  cf21  cg15  cg57  ch24  cj36  cj78  cl11  cn23  cr07\r\n",
 63 |       "ca39\t  cc08\t\t     ce16  cf22  cg16  cg58  ch25  cj37  cj79  cl12  cn24  cr08\r\n",
 64 |       "ca40\t  cc09\t\t     ce17  cf23  cg17  cg59  ch26  cj38  cj80  cl13  cn25  cr09\r\n"
 65 |      ]
 66 |     }
 67 |    ],
 68 |    "source": [
 69 |     "!ls brown"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "markdown",
 74 |    "id": "6ca95130",
 75 |    "metadata": {},
 76 |    "source": [
 77 |     "Let's look at operating on directory trees, a fairly common operation\n",
 78 |     "when dealing with files.\n",
 79 |     "\n",
 80 |     "It's common to want to search through a directory tree of files for matches.\n",
 81 |     "These days, `grep` has a built-in option for that, but let's\n",
 82 |     "see whether we can write that in some other (and more flexible) way."
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": 15,
 88 |    "id": "730058e3",
 89 |    "metadata": {
 90 |     "collapsed": false
 91 |    },
 92 |    "outputs": [
 93 |     {
 94 |      "name": "stdout",
 95 |      "output_type": "stream",
 96 |      "text": [
 97 |       "    107    3132   28944\r\n"
 98 |      ]
 99 |     }
100 |    ],
101 |    "source": [
102 |     "!grep -r nuclear brown/. | wc"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "markdown",
107 |    "id": "68e357ca",
108 |    "metadata": {},
109 |    "source": [
110 |     "The first thing people tend to do is look at the `find` command and\n",
111 |     "see its `-exec` option; they then write something like this command.\n",
112 |     "Do not use this kind of command; `-exec` is rarely the right thing to use\n",
113 |     "because it is quite inefficient, because it is limited in what you can do with it,\n",
114 |     "and because the syntax and quoting can get tricky."
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": 44,
120 |    "id": "294b7597",
121 |    "metadata": {
122 |     "collapsed": false
123 |    },
124 |    "outputs": [
125 |     {
126 |      "name": "stdout",
127 |      "output_type": "stream",
128 |      "text": [
129 |       "    107    3099   27553\r\n"
130 |      ]
131 |     }
132 |    ],
133 |    "source": [
134 |     "!find brown/. -type f -exec grep nuclear '{}' \\; | wc # DO NOT USE"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "markdown",
139 |    "id": "bff52247",
140 |    "metadata": {},
141 |    "source": [
142 |     "A better way of dealing with this is the `xargs` command.\n",
143 |     "It takes a partial command as its arguments, reads a list of file names\n",
144 |     "on its standard input, and then applies the command to all those\n",
145 |     "file names.  It can do this in parallel (and there are even distributed\n",
146 |     "versions of it)."
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": 16,
152 |    "id": "1a7bdc18",
153 |    "metadata": {
154 |     "collapsed": false
155 |    },
156 |    "outputs": [
157 |     {
158 |      "name": "stdout",
159 |      "output_type": "stream",
160 |      "text": [
161 |       "    107    3132   28944\r\n"
162 |      ]
163 |     }
164 |    ],
165 |    "source": [
166 |     "!find brown/. | xargs grep nuclear | wc "
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "markdown",
171 |    "id": "411423c5",
172 |    "metadata": {},
173 |    "source": [
174 |     "To deal properly with file names containing spaces, you need to use one of the following two commands (look at the manual pages to see why that works).  The latter is probably better behaved, since most UNIX commands expect line-oriented inputs, not null terminated inputs."
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "code",
179 |    "execution_count": 42,
180 |    "id": "c95295cb",
181 |    "metadata": {
182 |     "collapsed": false
183 |    },
184 |    "outputs": [
185 |     {
186 |      "name": "stdout",
187 |      "output_type": "stream",
188 |      "text": [
189 |       "    107    3132   28944\r\n"
190 |      ]
191 |     }
192 |    ],
193 |    "source": [
194 |     "!find brown/. -print0 | xargs -0 grep nuclear | wc "
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "code",
199 |    "execution_count": 45,
200 |    "id": "e3bee5c4",
201 |    "metadata": {
202 |     "collapsed": false
203 |    },
204 |    "outputs": [
205 |     {
206 |      "name": "stdout",
207 |      "output_type": "stream",
208 |      "text": [
209 |       "    107    3132   28944\r\n"
210 |      ]
211 |     }
212 |    ],
213 |    "source": [
214 |     "!find brown/. | xargs -d '\\n' grep nuclear | wc   # THIS REALLY SHOULD BE THE DEFAULT"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "markdown",
219 |    "id": "8dbbbdff",
220 |    "metadata": {},
221 |    "source": [
222 |     "The `-l` option to `grep` tells it only to list the names of matching files.\n",
223 |     "So, if we want to know the number of matching files (instead of the number of\n",
224 |     "matching lines), we use this command:"
225 |    ]
226 |   },
227 |   {
228 |    "cell_type": "code",
229 |    "execution_count": 36,
230 |    "id": "7058a80e",
231 |    "metadata": {
232 |     "collapsed": false
233 |    },
234 |    "outputs": [
235 |     {
236 |      "name": "stdout",
237 |      "output_type": "stream",
238 |      "text": [
239 |       "brown/./cj72\r\n",
240 |       "brown/./cj74\r\n",
241 |       "brown/./cb21\r\n",
242 |       "brown/./ch21\r\n",
243 |       "brown/./cg03\r\n"
244 |      ]
245 |     }
246 |    ],
247 |    "source": [
248 |     "!find brown/. | xargs grep -l nuclear | sed 5q"
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "code",
253 |    "execution_count": 37,
254 |    "id": "36d4d79a",
255 |    "metadata": {
256 |     "collapsed": false
257 |    },
258 |    "outputs": [
259 |     {
260 |      "name": "stdout",
261 |      "output_type": "stream",
262 |      "text": [
263 |       "     35      35     455\r\n"
264 |      ]
265 |     }
266 |    ],
267 |    "source": [
268 |     "!find brown/. | xargs grep -l nuclear | wc "
269 |    ]
270 |   },
271 |   {
272 |    "cell_type": "markdown",
273 |    "id": "fd5632f0",
274 |    "metadata": {},
275 |    "source": [
276 |     "Since the output of `find` is just a list of lines, we can apply filters to it as well,\n",
277 |     "for example searching for specific file names, file name extensions, or other conditions.\n",
278 |     "So, if we want to look for the term `nuclear` only in the `ch` files of the Brown corpus,\n",
279 |     "we can use this command:"
280 |    ]
281 |   },
282 |   {
283 |    "cell_type": "code",
284 |    "execution_count": 39,
285 |    "id": "84bfda2f",
286 |    "metadata": {
287 |     "collapsed": false
288 |    },
289 |    "outputs": [
290 |     {
291 |      "name": "stdout",
292 |      "output_type": "stream",
293 |      "text": [
294 |       "      3       3      39\r\n"
295 |      ]
296 |     }
297 |    ],
298 |    "source": [
299 |     "!find brown/. | fgrep brown/./ch | xargs grep -l nuclear | wc"
300 |    ]
301 |   },
302 |   {
303 |    "cell_type": "markdown",
304 |    "id": "339900bb",
305 |    "metadata": {},
306 |    "source": [
307 |     "We can even put another grep in between there to filter things:"
308 |    ]
309 |   },
310 |   {
311 |    "cell_type": "code",
312 |    "execution_count": 41,
313 |    "id": "196285a8",
314 |    "metadata": {
315 |     "collapsed": false
316 |    },
317 |    "outputs": [
318 |     {
319 |      "name": "stdout",
320 |      "output_type": "stream",
321 |      "text": [
322 |       "     11      11     143\r\n"
323 |      ]
324 |     }
325 |    ],
326 |    "source": [
327 |     "!find brown/. | xargs grep -l Kennedy | xargs grep -l nuclear | wc"
328 |    ]
329 |   },
330 |   {
331 |    "cell_type": "markdown",
332 |    "id": "2037a2de",
333 |    "metadata": {},
334 |    "source": [
335 |     "Finally, let's add our little `sed` script back in to format the output."
336 |    ]
337 |   },
338 |   {
339 |    "cell_type": "code",
340 |    "execution_count": 47,
341 |    "id": "169b8d14",
342 |    "metadata": {
343 |     "collapsed": false
344 |    },
345 |    "outputs": [
346 |     {
347 |      "name": "stdout",
348 |      "output_type": "stream",
349 |      "text": [
350 |       "Until Moscow resumed nuclear testing last September 1 , the US and UK had released more than twice as much radiation into the atmosphere as the Russians , and the fallout from the earlier blasts is still coming down .\r\n",
351 |       "On October 19 , after the Soviets had detonated at least 20 nuclear devices , Ambassador Stevenson warned the UN General Assembly that this country , in `` self protection '' , might have to resume above-ground tests .\r\n",
352 |       "Now , of course , that the Russians are the nuclear villains , radiation is a nastier word than it was in the mid , when the US was testing in the atmosphere .\r\n",
353 |       "After a nuclear blast , one bureaucrat suggested in those halcyon days , about all you had to do was haul out the broom and sweep off your sidewalks and roof .\r\n",
354 |       "Can thermonuclear war be set off by accident ? ?\r\n",
355 |       "`` E '' stands for `` execution '' -- the moment a `` go order '' would unleash an American nuclear strike .\r\n",
356 |       "Work is under way to see whether new restraining devices should be installed on all nuclear weapons .\r\n",
357 |       "Only the President is permitted to authorize the use of nuclear weapons .\r\n",
358 |       "The President cannot personally remove the safety devices from every nuclear trigger .\r\n",
359 |       "However , the system is designed , ingeniously and hopefully , so that no one man could initiate a thermonuclear war .\r\n",
360 |       "sed: couldn't flush stdout: Broken pipe\r\n"
361 |      ]
362 |     }
363 |    ],
364 |    "source": [
365 |     "!find brown/. | xargs grep -l Kennedy | xargs grep -h nuclear | sed 's/\\/[^ ]*//g;s/^\\s//' | head"
366 |    ]
367 |   },
368 |   {
369 |    "cell_type": "code",
370 |    "execution_count": null,
371 |    "id": "0aa1a43e",
372 |    "metadata": {
373 |     "collapsed": false
374 |    },
375 |    "outputs": [],
376 |    "source": []
377 |   }
378 |  ],
379 |  "metadata": {},
380 |  "nbformat": 4,
381 |  "nbformat_minor": 5
382 | }
383 | 


--------------------------------------------------------------------------------
/nlpa-intro-demo-videos.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "id": "2e727f66",
  7 |    "metadata": {
  8 |     "collapsed": false
  9 |    },
 10 |    "outputs": [],
 11 |    "source": [
 12 |     "from IPython.core.display import Image,HTML\n",
 13 |     "from IPython.lib.display import YouTubeVideo"
 14 |    ]
 15 |   },
 16 |   {
 17 |    "cell_type": "code",
 18 |    "execution_count": 2,
 19 |    "id": "4b312fd7",
 20 |    "metadata": {
 21 |     "collapsed": false
 22 |    },
 23 |    "outputs": [
 24 |     {
 25 |      "data": {
 26 |       "text/html": [
 27 |        "\n",
 28 |        "            <iframe\n",
 29 |        "                width=\"400\"\n",
 30 |        "                height=\"300\"\n",
 31 |        "                src=\"http://www.youtube.com/embed/PHzoX2AIzqo\"\n",
 32 |        "                frameborder=\"0\"\n",
 33 |        "                allowfullscreen\n",
 34 |        "            ></iframe>\n",
 35 |        "        "
 36 |       ],
 37 |       "text/plain": [
 38 |        "<IPython.lib.display.YouTubeVideo at 0x3e473d0>"
 39 |       ]
 40 |      },
 41 |      "execution_count": 2,
 42 |      "metadata": {},
 43 |      "output_type": "execute_result"
 44 |     }
 45 |    ],
 46 |    "source": [
 47 |     "YouTubeVideo(\"PHzoX2AIzqo\")"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 3,
 53 |    "id": "f1027548",
 54 |    "metadata": {
 55 |     "collapsed": false
 56 |    },
 57 |    "outputs": [
 58 |     {
 59 |      "data": {
 60 |       "text/html": [
 61 |        "\n",
 62 |        "            <iframe\n",
 63 |        "                width=\"400\"\n",
 64 |        "                height=\"300\"\n",
 65 |        "                src=\"http://www.youtube.com/embed/RAJIDH5d4C4\"\n",
 66 |        "                frameborder=\"0\"\n",
 67 |        "                allowfullscreen\n",
 68 |        "            ></iframe>\n",
 69 |        "        "
 70 |       ],
 71 |       "text/plain": [
 72 |        "<IPython.lib.display.YouTubeVideo at 0x3e474d0>"
 73 |       ]
 74 |      },
 75 |      "execution_count": 3,
 76 |      "metadata": {},
 77 |      "output_type": "execute_result"
 78 |     }
 79 |    ],
 80 |    "source": [
 81 |     "YouTubeVideo(\"RAJIDH5d4C4\")"
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "code",
 86 |    "execution_count": 4,
 87 |    "id": "b5790420",
 88 |    "metadata": {
 89 |     "collapsed": false
 90 |    },
 91 |    "outputs": [
 92 |     {
 93 |      "data": {
 94 |       "text/html": [
 95 |        "\n",
 96 |        "            <iframe\n",
 97 |        "                width=\"400\"\n",
 98 |        "                height=\"300\"\n",
 99 |        "                src=\"http://www.youtube.com/embed/WuP6AQPRpUg\"\n",
100 |        "                frameborder=\"0\"\n",
101 |        "                allowfullscreen\n",
102 |        "            ></iframe>\n",
103 |        "        "
104 |       ],
105 |       "text/plain": [
106 |        "<IPython.lib.display.YouTubeVideo at 0x3e47590>"
107 |       ]
108 |      },
109 |      "execution_count": 4,
110 |      "metadata": {},
111 |      "output_type": "execute_result"
112 |     }
113 |    ],
114 |    "source": [
115 |     "YouTubeVideo(\"WuP6AQPRpUg\")"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": 5,
121 |    "id": "25330a69",
122 |    "metadata": {
123 |     "collapsed": false
124 |    },
125 |    "outputs": [
126 |     {
127 |      "data": {
128 |       "text/html": [
129 |        "\n",
130 |        "            <iframe\n",
131 |        "                width=\"400\"\n",
132 |        "                height=\"300\"\n",
133 |        "                src=\"http://www.youtube.com/embed/s8WFctIbt84\"\n",
134 |        "                frameborder=\"0\"\n",
135 |        "                allowfullscreen\n",
136 |        "            ></iframe>\n",
137 |        "        "
138 |       ],
139 |       "text/plain": [
140 |        "<IPython.lib.display.YouTubeVideo at 0x3e47650>"
141 |       ]
142 |      },
143 |      "execution_count": 5,
144 |      "metadata": {},
145 |      "output_type": "execute_result"
146 |     }
147 |    ],
148 |    "source": [
149 |     "YouTubeVideo(\"s8WFctIbt84\")"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": 6,
155 |    "id": "72d55ba1",
156 |    "metadata": {
157 |     "collapsed": false
158 |    },
159 |    "outputs": [
160 |     {
161 |      "data": {
162 |       "text/html": [
163 |        "\n",
164 |        "            <iframe\n",
165 |        "                width=\"400\"\n",
166 |        "                height=\"300\"\n",
167 |        "                src=\"http://www.youtube.com/embed/Qy6zhx9gndI\"\n",
168 |        "                frameborder=\"0\"\n",
169 |        "                allowfullscreen\n",
170 |        "            ></iframe>\n",
171 |        "        "
172 |       ],
173 |       "text/plain": [
174 |        "<IPython.lib.display.YouTubeVideo at 0x3e475d0>"
175 |       ]
176 |      },
177 |      "execution_count": 6,
178 |      "metadata": {},
179 |      "output_type": "execute_result"
180 |     }
181 |    ],
182 |    "source": [
183 |     "YouTubeVideo(\"Qy6zhx9gndI\")"
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "code",
188 |    "execution_count": 7,
189 |    "id": "37aaf341",
190 |    "metadata": {
191 |     "collapsed": false
192 |    },
193 |    "outputs": [
194 |     {
195 |      "data": {
196 |       "text/html": [
197 |        "\n",
198 |        "            <iframe\n",
199 |        "                width=\"400\"\n",
200 |        "                height=\"300\"\n",
201 |        "                src=\"http://www.youtube.com/embed/c9jk3P0GqLU\"\n",
202 |        "                frameborder=\"0\"\n",
203 |        "                allowfullscreen\n",
204 |        "            ></iframe>\n",
205 |        "        "
206 |       ],
207 |       "text/plain": [
208 |        "<IPython.lib.display.YouTubeVideo at 0x3e47390>"
209 |       ]
210 |      },
211 |      "execution_count": 7,
212 |      "metadata": {},
213 |      "output_type": "execute_result"
214 |     }
215 |    ],
216 |    "source": [
217 |     "YouTubeVideo(\"c9jk3P0GqLU\")"
218 |    ]
219 |   },
220 |   {
221 |    "cell_type": "code",
222 |    "execution_count": 8,
223 |    "id": "430610d9",
224 |    "metadata": {
225 |     "collapsed": false
226 |    },
227 |    "outputs": [
228 |     {
229 |      "data": {
230 |       "text/html": [
231 |        "\n",
232 |        "            <iframe\n",
233 |        "                width=\"400\"\n",
234 |        "                height=\"300\"\n",
235 |        "                src=\"http://www.youtube.com/embed/YKjo9dldp2g\"\n",
236 |        "                frameborder=\"0\"\n",
237 |        "                allowfullscreen\n",
238 |        "            ></iframe>\n",
239 |        "        "
240 |       ],
241 |       "text/plain": [
242 |        "<IPython.lib.display.YouTubeVideo at 0x3e47790>"
243 |       ]
244 |      },
245 |      "execution_count": 8,
246 |      "metadata": {},
247 |      "output_type": "execute_result"
248 |     }
249 |    ],
250 |    "source": [
251 |     "YouTubeVideo(\"YKjo9dldp2g\")"
252 |    ]
253 |   },
254 |   {
255 |    "cell_type": "code",
256 |    "execution_count": 9,
257 |    "id": "ba43e62f",
258 |    "metadata": {
259 |     "collapsed": false
260 |    },
261 |    "outputs": [
262 |     {
263 |      "data": {
264 |       "text/html": [
265 |        "\n",
266 |        "            <iframe\n",
267 |        "                width=\"400\"\n",
268 |        "                height=\"300\"\n",
269 |        "                src=\"http://www.youtube.com/embed/nXgboDb9ucE\"\n",
270 |        "                frameborder=\"0\"\n",
271 |        "                allowfullscreen\n",
272 |        "            ></iframe>\n",
273 |        "        "
274 |       ],
275 |       "text/plain": [
276 |        "<IPython.lib.display.YouTubeVideo at 0x3e47850>"
277 |       ]
278 |      },
279 |      "execution_count": 9,
280 |      "metadata": {},
281 |      "output_type": "execute_result"
282 |     }
283 |    ],
284 |    "source": [
285 |     "YouTubeVideo(\"nXgboDb9ucE\")"
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "code",
290 |    "execution_count": 10,
291 |    "id": "89f6f7c6",
292 |    "metadata": {
293 |     "collapsed": false
294 |    },
295 |    "outputs": [
296 |     {
297 |      "data": {
298 |       "text/html": [
299 |        "\n",
300 |        "            <iframe\n",
301 |        "                width=\"400\"\n",
302 |        "                height=\"300\"\n",
303 |        "                src=\"http://www.youtube.com/embed/BOUTfUmI8vs\"\n",
304 |        "                frameborder=\"0\"\n",
305 |        "                allowfullscreen\n",
306 |        "            ></iframe>\n",
307 |        "        "
308 |       ],
309 |       "text/plain": [
310 |        "<IPython.lib.display.YouTubeVideo at 0x3e478d0>"
311 |       ]
312 |      },
313 |      "execution_count": 10,
314 |      "metadata": {},
315 |      "output_type": "execute_result"
316 |     }
317 |    ],
318 |    "source": [
319 |     "YouTubeVideo(\"BOUTfUmI8vs\")"
320 |    ]
321 |   },
322 |   {
323 |    "cell_type": "code",
324 |    "execution_count": null,
325 |    "id": "1cca4c44",
326 |    "metadata": {
327 |     "collapsed": false
328 |    },
329 |    "outputs": [],
330 |    "source": []
331 |   }
332 |  ],
333 |  "metadata": {},
334 |  "nbformat": 4,
335 |  "nbformat_minor": 5
336 | }
337 | 


--------------------------------------------------------------------------------
/nlpa-intro.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": null,
 6 |    "id": "5a7b3a4e",
 7 |    "metadata": {
 8 |     "collapsed": false
 9 |    },
10 |    "outputs": [],
11 |    "source": []
12 |   }
13 |  ],
14 |  "metadata": {},
15 |  "nbformat": 4,
16 |  "nbformat_minor": 5
17 | }
18 | 


--------------------------------------------------------------------------------
/nlpa-memm.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 52,
  6 |    "id": "e37e73b1",
  7 |    "metadata": {
  8 |     "collapsed": false
  9 |    },
 10 |    "outputs": [],
 11 |    "source": [
 12 |     "from pylab import *\n",
 13 |     "import bisect\n",
 14 |     "def unit(i,n):\n",
 15 |     "    result = zeros(n)\n",
 16 |     "    result[i] = 1.0\n",
 17 |     "    return result"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "markdown",
 22 |    "id": "dbf90496",
 23 |    "metadata": {},
 24 |    "source": [
 25 |     "# Generating a Dataset"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 10,
 31 |    "id": "24d5a838",
 32 |    "metadata": {
 33 |     "collapsed": false
 34 |    },
 35 |    "outputs": [],
 36 |    "source": [
 37 |     "M = roll(diag(ones(4)) + 0.1*rand(4,4),-1,1)\n",
 38 |     "M /= sum(M,axis=0)[newaxis,:]"
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "code",
 43 |    "execution_count": 55,
 44 |    "id": "0405b0ea",
 45 |    "metadata": {
 46 |     "collapsed": false
 47 |    },
 48 |    "outputs": [],
 49 |    "source": [
 50 |     "def generate(M,s=0,n=10):\n",
 51 |     "    result = []\n",
 52 |     "    for i in range(n):\n",
 53 |     "        result.append(s)\n",
 54 |     "        x = unit(s,len(M))\n",
 55 |     "        x = dot(M,x)\n",
 56 |     "        s = bisect.bisect_left(add.accumulate(x),rand())\n",
 57 |     "    return result"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "code",
 62 |    "execution_count": 63,
 63 |    "id": "7e0a5123",
 64 |    "metadata": {
 65 |     "collapsed": false
 66 |    },
 67 |    "outputs": [],
 68 |    "source": [
 69 |     "def glabel(s):\n",
 70 |     "    result = [0]\n",
 71 |     "    for i in range(1,len(s)):\n",
 72 |     "        if s[i]==s[i-1]+1 and result[-1]:\n",
 73 |     "            result.append(s[i])\n",
 74 |     "        else:\n",
 75 |     "            result.append(0)\n",
 76 |     "    return result"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": 78,
 82 |    "id": "851dbbf2",
 83 |    "metadata": {
 84 |     "collapsed": false
 85 |    },
 86 |    "outputs": [],
 87 |    "source": [
 88 |     "xs = generate(M,0,10000)\n",
 89 |     "ys = glabel(xs)"
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "code",
 94 |    "execution_count": 110,
 95 |    "id": "92d0b0a5",
 96 |    "metadata": {
 97 |     "collapsed": false
 98 |    },
 99 |    "outputs": [
100 |     {
101 |      "name": "stdout",
102 |      "output_type": "stream",
103 |      "text": [
104 |       "[0, 1, 2, 3, 1, 2, 3, 0, 1, 2, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]\n",
105 |       "[0, 1, 2, 3, 0, 2, 3, 0, 1, 2, 0, 3, 0, 1, 2, 3, 0, 1, 2, 3]\n"
106 |      ]
107 |     }
108 |    ],
109 |    "source": [
110 |     "print xs[:20]\n",
111 |     "print ys[:20]"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "markdown",
116 |    "id": "8e8504d4",
117 |    "metadata": {},
118 |    "source": [
119 |     "# Learning the Transitions"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "code",
124 |    "execution_count": 87,
125 |    "id": "3064f77d",
126 |    "metadata": {
127 |     "collapsed": false
128 |    },
129 |    "outputs": [],
130 |    "source": [
131 |     "cs = []\n",
132 |     "ps = []\n",
133 |     "for i in range(1,len(xs)):\n",
134 |     "    c = concatenate([unit(ys[i-1],4),unit(xs[i],4)])\n",
135 |     "    p = unit(xs[i],4)\n",
136 |     "    cs.append(c)\n",
137 |     "    ps.append(p)"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "code",
142 |    "execution_count": 88,
143 |    "id": "be0e8b63",
144 |    "metadata": {
145 |     "collapsed": false
146 |    },
147 |    "outputs": [],
148 |    "source": [
149 |     "from sklearn.linear_model import LogisticRegression"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": 89,
155 |    "id": "839b6d16",
156 |    "metadata": {
157 |     "collapsed": false
158 |    },
159 |    "outputs": [
160 |     {
161 |      "data": {
162 |       "text/plain": [
163 |        "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
164 |        "          intercept_scaling=1, penalty='l2', tol=0.0001)"
165 |       ]
166 |      },
167 |      "execution_count": 89,
168 |      "metadata": {},
169 |      "output_type": "execute_result"
170 |     }
171 |    ],
172 |    "source": [
173 |     "lr = LogisticRegression()\n",
174 |     "lr.fit(cs,ys[1:])"
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "code",
179 |    "execution_count": 103,
180 |    "id": "649ffa79",
181 |    "metadata": {
182 |     "collapsed": false
183 |    },
184 |    "outputs": [
185 |     {
186 |      "data": {
187 |       "text/plain": [
188 |        "442"
189 |       ]
190 |      },
191 |      "execution_count": 103,
192 |      "metadata": {},
193 |      "output_type": "execute_result"
194 |     }
195 |    ],
196 |    "source": [
197 |     "sum(array(lr.predict(cs),'i')!=array(ys[1:],'i'))"
198 |    ]
199 |   },
200 |   {
201 |    "cell_type": "markdown",
202 |    "id": "4b5aee0f",
203 |    "metadata": {},
204 |    "source": [
205 |     "# Forward Algorithm"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "code",
210 |    "execution_count": 105,
211 |    "id": "c9318bc6",
212 |    "metadata": {
213 |     "collapsed": false
214 |    },
215 |    "outputs": [],
216 |    "source": [
217 |     "pys = [0]\n",
218 |     "for i in range(1,len(xs)):\n",
219 |     "    c = concatenate([unit(pys[i-1],4),unit(xs[i],4)])\n",
220 |     "    y = lr.predict([c])\n",
221 |     "    pys.append(y)"
222 |    ]
223 |   },
224 |   {
225 |    "cell_type": "code",
226 |    "execution_count": 108,
227 |    "id": "9cf12afb",
228 |    "metadata": {
229 |     "collapsed": false
230 |    },
231 |    "outputs": [
232 |     {
233 |      "data": {
234 |       "text/plain": [
235 |        "382"
236 |       ]
237 |      },
238 |      "execution_count": 108,
239 |      "metadata": {},
240 |      "output_type": "execute_result"
241 |     }
242 |    ],
243 |    "source": [
244 |     "sum(array(ys)!=array(pys))"
245 |    ]
246 |   },
247 |   {
248 |    "cell_type": "code",
249 |    "execution_count": 91,
250 |    "id": "297824fb",
251 |    "metadata": {
252 |     "collapsed": false
253 |    },
254 |    "outputs": [
255 |     {
256 |      "data": {
257 |       "text/plain": [
258 |        "array([[-11, -18, -10,  48,  49, -15,  -8, -17],\n",
259 |        "       [ 36, -31, -28, -30, -31,  47, -34, -34],\n",
260 |        "       [ -2,  41, -44, -37, -28, -35,  51, -31],\n",
261 |        "       [ -2, -40,  40, -40, -21, -34, -37,  51]], dtype=int32)"
262 |       ]
263 |      },
264 |      "execution_count": 91,
265 |      "metadata": {},
266 |      "output_type": "execute_result"
267 |     }
268 |    ],
269 |    "source": [
270 |     "array(lr.coef_*10,'i')"
271 |    ]
272 |   },
273 |   {
274 |    "cell_type": "code",
275 |    "execution_count": 111,
276 |    "id": "3a1ce39f",
277 |    "metadata": {
278 |     "collapsed": false
279 |    },
280 |    "outputs": [],
281 |    "source": [
282 |     "lprobs = zeros((100,4))"
283 |    ]
284 |   },
285 |   {
286 |    "cell_type": "code",
287 |    "execution_count": 112,
288 |    "id": "4e75a7eb",
289 |    "metadata": {
290 |     "collapsed": false
291 |    },
292 |    "outputs": [],
293 |    "source": [
294 |     "lprobs[0,:] = log(1.0/4)"
295 |    ]
296 |   },
297 |   {
298 |    "cell_type": "code",
299 |    "execution_count": null,
300 |    "id": "01a98b08",
301 |    "metadata": {
302 |     "collapsed": false
303 |    },
304 |    "outputs": [],
305 |    "source": []
306 |   }
307 |  ],
308 |  "metadata": {},
309 |  "nbformat": 4,
310 |  "nbformat_minor": 5
311 | }
312 | 


--------------------------------------------------------------------------------
/nlpa-nltk-automated-tagging.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 51,
  6 |    "id": "54edbe68",
  7 |    "metadata": {
  8 |     "collapsed": false
  9 |    },
 10 |    "outputs": [],
 11 |    "source": [
 12 |     "import nltk\n",
 13 |     "import urllib2\n",
 14 |     "import re"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "markdown",
 19 |    "id": "4e25d1be",
 20 |    "metadata": {},
 21 |    "source": [
 22 |     "# Automatic Tagging with NLTK"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "markdown",
 27 |    "id": "5196f354",
 28 |    "metadata": {},
 29 |    "source": [
 30 |     "Although the above results are neat, they aren't all that useful in practice\n",
 31 |     "because most texts we want to visualize in such ways aren't tagged, and tagging\n",
 32 |     "them by hand ist costly.\n",
 33 |     "\n",
 34 |     "What we need is an *automated tagger*."
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "markdown",
 39 |    "id": "6770f788",
 40 |    "metadata": {},
 41 |    "source": [
 42 |     "Let's take a page off Wikipedia and tag it automatically."
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": 52,
 48 |    "id": "dd9fe4bf",
 49 |    "metadata": {
 50 |     "collapsed": false
 51 |    },
 52 |    "outputs": [
 53 |     {
 54 |      "data": {
 55 |       "text/plain": [
 56 |        "u'<!DOCTYPE html>\\n<html lang=\"en\" dir=\"ltr\" class=\"client-nojs\">\\n<head>\\n<title>George Washington - Wikipedia, the free encyclopedia</title>\\n<meta charset=\"UTF-8\" />\\n<meta name=\"generator\" content=\"MediaWiki 1.21wmf4\" />\\n<meta name=\"robots\" content=\"noindex,follow\" />\\n<link rel=\"apple-touch-icon\" href=\"//en.wikipedia.org/apple-touch-icon.png\" />\\n<link rel=\"shortcut icon\" href=\"/favicon.ico\" />\\n<link '"
 57 |       ]
 58 |      },
 59 |      "execution_count": 52,
 60 |      "metadata": {},
 61 |      "output_type": "execute_result"
 62 |     }
 63 |    ],
 64 |    "source": [
 65 |     "opener = urllib2.build_opener()\n",
 66 |     "opener.addheaders = [('User-agent', 'Mozilla/5.0')]\n",
 67 |     "infile = opener.open('http://en.wikipedia.org/w/index.php?title=George_Washington&printable=yes')\n",
 68 |     "page = infile.read().decode(\"utf-8\")\n",
 69 |     "page[:400]"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "markdown",
 74 |    "id": "90b04f54",
 75 |    "metadata": {},
 76 |    "source": [
 77 |     "This is in HTML format, so we first need to clean it up.\n",
 78 |     "\n",
 79 |     "(There are other ways of cleaning up and analyzing HTML.\n",
 80 |     "A good HTML library is BeautifulSoup.)"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": 53,
 86 |    "id": "d134f139",
 87 |    "metadata": {
 88 |     "collapsed": false
 89 |    },
 90 |    "outputs": [
 91 |     {
 92 |      "name": "stdout",
 93 |      "output_type": "stream",
 94 |      "text": [
 95 |       "George Washington - Wikipedia, the free encyclopedia\n",
 96 |       "\t\t\t\t\t\t\t\t\t\t\t\t George Washington\n",
 97 |       "\t\t\t\t\t\t\t\t\t\t\t\t From Wikipedia, the free encyclopedia\n",
 98 |       "\t\t\t\t\tJump to:\t\t\t\t\t navigation , \t\t\t\t\t search\n",
 99 |       "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t This article is about the first President of the United States. For other uses, see George Washington (disambiguation) .\n",
100 |       " For a simpler version of this article, see the Simple English Wikipedia article: \n"
101 |      ]
102 |     }
103 |    ],
104 |    "source": [
105 |     "page = nltk.util.clean_html(page)\n",
106 |     "page = re.sub(r'\\s*\\n+','\\n',page)\n",
107 |     "print page[:400]"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "markdown",
112 |    "id": "fdb46466",
113 |    "metadata": {},
114 |    "source": [
115 |     "All the rest of the software works on tokenized data."
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": 54,
121 |    "id": "c7d69732",
122 |    "metadata": {
123 |     "collapsed": false
124 |    },
125 |    "outputs": [
126 |     {
127 |      "name": "stdout",
128 |      "output_type": "stream",
129 |      "text": [
130 |       "[u'Washington', u'quickly', u'became', u'a', u'senior', u'officer', u'in', u'the', u'colonial', u'forces', u'during', u'the', u'first', u'stages', u'of', u'the', u'French', u'and', u'Indian', u'War', u'.']\n"
131 |      ]
132 |     }
133 |    ],
134 |    "source": [
135 |     "sents = nltk.sent_tokenize(page)\n",
136 |     "sents = [nltk.word_tokenize(s) for s in sents]\n",
137 |     "print sents[17]"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "markdown",
142 |    "id": "21e6e7b2",
143 |    "metadata": {},
144 |    "source": [
145 |     "This data hasn't been manually tagged, so we need an automatic tagger."
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "code",
150 |    "execution_count": 55,
151 |    "id": "17b505e6",
152 |    "metadata": {
153 |     "collapsed": false
154 |    },
155 |    "outputs": [
156 |     {
157 |      "name": "stdout",
158 |      "output_type": "stream",
159 |      "text": [
160 |       "[(u'Washington', 'NNP'), (u'quickly', 'RB'), (u'became', 'VBD'), (u'a', 'DT'), (u'senior', 'JJ'), (u'officer', 'NN'), (u'in', 'IN'), (u'the', 'DT'), (u'colonial', 'JJ'), (u'forces', 'NNS'), (u'during', 'IN'), (u'the', 'DT'), (u'first', 'JJ'), (u'stages', 'NNS'), (u'of', 'IN'), (u'the', 'DT'), (u'French', 'JJ'), (u'and', 'CC'), (u'Indian', 'JJ'), (u'War', 'NN'), (u'.', '.')]\n"
161 |      ]
162 |     }
163 |    ],
164 |    "source": [
165 |     "print nltk.pos_tag(sents[17])"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "markdown",
170 |    "id": "1c6a5247",
171 |    "metadata": {},
172 |    "source": [
173 |     "To perform named entity extraction, we run `ne_chunk` on the output of `pos_tag`.\n",
174 |     "\n",
175 |     "The output is a mix of tree nodes combining multiple tagged words,\n",
176 |     "together with raw tagged words."
177 |    ]
178 |   },
179 |   {
180 |    "cell_type": "code",
181 |    "execution_count": 56,
182 |    "id": "ae90265a",
183 |    "metadata": {
184 |     "collapsed": false
185 |    },
186 |    "outputs": [
187 |     {
188 |      "name": "stdout",
189 |      "output_type": "stream",
190 |      "text": [
191 |       "(GPE Washington/NNP)\n",
192 |       "(u'quickly', 'RB')\n",
193 |       "(u'became', 'VBD')\n",
194 |       "(u'a', 'DT')\n",
195 |       "(u'senior', 'JJ')\n",
196 |       "(u'officer', 'NN')\n",
197 |       "(u'in', 'IN')\n",
198 |       "(u'the', 'DT')\n",
199 |       "(u'colonial', 'JJ')\n",
200 |       "(u'forces', 'NNS')\n",
201 |       "(u'during', 'IN')\n",
202 |       "(u'the', 'DT')\n",
203 |       "(u'first', 'JJ')\n",
204 |       "(u'stages', 'NNS')\n",
205 |       "(u'of', 'IN')\n",
206 |       "(u'the', 'DT')\n",
207 |       "(GPE French/JJ)\n",
208 |       "(u'and', 'CC')\n",
209 |       "(GPE Indian/JJ)\n",
210 |       "(u'War', 'NN')\n",
211 |       "(u'.', '.')\n"
212 |      ]
213 |     }
214 |    ],
215 |    "source": [
216 |     "chunked = nltk.ne_chunk(nltk.pos_tag(sents[17]))\n",
217 |     "for node in chunked:\n",
218 |     "    print node"
219 |    ]
220 |   },
221 |   {
222 |    "cell_type": "markdown",
223 |    "id": "e4f83e4d",
224 |    "metadata": {},
225 |    "source": [
226 |     "There are actually several different kinds of named entities."
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": 57,
232 |    "id": "5eec5314",
233 |    "metadata": {
234 |     "collapsed": false
235 |    },
236 |    "outputs": [],
237 |    "source": [
238 |     "\"\"\"\n",
239 |     "ORGANIZATION\tGeorgia-Pacific Corp., WHO\n",
240 |     "PERSON\t      Eddy Bonte, President Obama\n",
241 |     "LOCATION\t    Murray River, Mount Everest\n",
242 |     "DATE\t        June, 2008-06-29\n",
243 |     "TIME\t        two fifty a m, 1:30 p.m.\n",
244 |     "MONEY\t       175 million Canadian Dollars, GBP 10.40\n",
245 |     "PERCENT\t     twenty pct, 18.75 %\n",
246 |     "FACILITY\t    Washington Monument, Stonehenge\n",
247 |     "GPE\t         South East Asia, Midlothian\n",
248 |     "\"\"\";None"
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "markdown",
253 |    "id": "83113387",
254 |    "metadata": {},
255 |    "source": [
256 |     "We now need to dig through this data to extract the actual information about\n",
257 |     "the named entity."
258 |    ]
259 |   },
260 |   {
261 |    "cell_type": "code",
262 |    "execution_count": 58,
263 |    "id": "8cafa8f0",
264 |    "metadata": {
265 |     "collapsed": false
266 |    },
267 |    "outputs": [
268 |     {
269 |      "data": {
270 |       "text/plain": [
271 |        "[Tree('GPE', [(u'Washington', 'NNP')]),\n",
272 |        " Tree('GPE', [(u'French', 'JJ')]),\n",
273 |        " Tree('GPE', [(u'Indian', 'JJ')])]"
274 |       ]
275 |      },
276 |      "execution_count": 58,
277 |      "metadata": {},
278 |      "output_type": "execute_result"
279 |     }
280 |    ],
281 |    "source": [
282 |     "def nextract(tokens,types=[\"GPE\",\"PERSON\"]):\n",
283 |     "    chunked = nltk.ne_chunk(nltk.pos_tag(tokens))\n",
284 |     "    return [c for c in chunked if hasattr(c,\"node\") and c.node in types]\n",
285 |     "nes = nextract(sents[17])\n",
286 |     "nes"
287 |    ]
288 |   },
289 |   {
290 |    "cell_type": "code",
291 |    "execution_count": 59,
292 |    "id": "1c43f5f2",
293 |    "metadata": {
294 |     "collapsed": false
295 |    },
296 |    "outputs": [
297 |     {
298 |      "data": {
299 |       "text/plain": [
300 |        "[(u'Washington', 'NNP')]"
301 |       ]
302 |      },
303 |      "execution_count": 59,
304 |      "metadata": {},
305 |      "output_type": "execute_result"
306 |     }
307 |    ],
308 |    "source": [
309 |     "nes[0].leaves()"
310 |    ]
311 |   },
312 |   {
313 |    "cell_type": "code",
314 |    "execution_count": 60,
315 |    "id": "0fbafcb8",
316 |    "metadata": {
317 |     "collapsed": false
318 |    },
319 |    "outputs": [],
320 |    "source": [
321 |     "def nextract_text(tokens,types=[\"GPE\",\"PERSON\"]):\n",
322 |     "    nodes = nextract(tokens,types)\n",
323 |     "    return [\" \".join(c[0] for c in chunk.leaves()) for chunk in nodes]"
324 |    ]
325 |   },
326 |   {
327 |    "cell_type": "code",
328 |    "execution_count": 61,
329 |    "id": "206e1703",
330 |    "metadata": {
331 |     "collapsed": false
332 |    },
333 |    "outputs": [
334 |     {
335 |      "data": {
336 |       "text/plain": [
337 |        "[u'Washington', u'French', u'Indian']"
338 |       ]
339 |      },
340 |      "execution_count": 61,
341 |      "metadata": {},
342 |      "output_type": "execute_result"
343 |     }
344 |    ],
345 |    "source": [
346 |     "nextract_text(sents[17])"
347 |    ]
348 |   },
349 |   {
350 |    "cell_type": "code",
351 |    "execution_count": 62,
352 |    "id": "fe9894a1",
353 |    "metadata": {
354 |     "collapsed": false
355 |    },
356 |    "outputs": [],
357 |    "source": [
358 |     "nes = [nextract_text(s,[\"PERSON\"]) for s in sents]"
359 |    ]
360 |   },
361 |   {
362 |    "cell_type": "markdown",
363 |    "id": "a4c1b802",
364 |    "metadata": {},
365 |    "source": [
366 |     "Let's look at what this extracted."
367 |    ]
368 |   },
369 |   {
370 |    "cell_type": "code",
371 |    "execution_count": 49,
372 |    "id": "b2ed9b28",
373 |    "metadata": {
374 |     "collapsed": false
375 |    },
376 |    "outputs": [
377 |     {
378 |      "data": {
379 |       "text/plain": [
380 |        "[(u'George Washington', 90),\n",
381 |        " (u'George', 44),\n",
382 |        " (u'Mount Vernon', 22),\n",
383 |        " (u'Martha', 14),\n",
384 |        " (u'Jefferson', 8),\n",
385 |        " (u'Chernow', 8),\n",
386 |        " (u'John Adams', 8),\n",
387 |        " (u'See', 7),\n",
388 |        " (u'Oxford University', 6),\n",
389 |        " (u'John', 6)]"
390 |       ]
391 |      },
392 |      "execution_count": 49,
393 |      "metadata": {},
394 |      "output_type": "execute_result"
395 |     }
396 |    ],
397 |    "source": [
398 |     "from collections import Counter\n",
399 |     "Counter([x for l in nes for x in l]).most_common(10)"
400 |    ]
401 |   },
402 |   {
403 |    "cell_type": "markdown",
404 |    "id": "b87d96e7",
405 |    "metadata": {},
406 |    "source": [
407 |     "As you can see, the named entity extractor has a significant error rate.\n",
408 |     "\"Mount Vernon\", \"See\", and \"Oxford University\" are not  persons.\n",
409 |     "Also, we don't know the identity of \"George\", \"Martha\", and \"John\".\n",
410 |     "But generally, it seems to return the right thing."
411 |    ]
412 |   },
413 |   {
414 |    "cell_type": "markdown",
415 |    "id": "50b81d2d",
416 |    "metadata": {},
417 |    "source": [
418 |     "Now let's look at co-occurences of named entities."
419 |    ]
420 |   },
421 |   {
422 |    "cell_type": "code",
423 |    "execution_count": 50,
424 |    "id": "91fd94ce",
425 |    "metadata": {
426 |     "collapsed": false
427 |    },
428 |    "outputs": [
429 |     {
430 |      "data": {
431 |       "text/plain": [
432 |        "[((u'George Washington', u'Mount Vernon'), 17),\n",
433 |        " ((u'George Washington', u'Lawrence Washington'), 12),\n",
434 |        " ((u'John Adams', u'Position'), 10),\n",
435 |        " ((u'George Washington', u'Henry Knox'), 8),\n",
436 |        " ((u'George Washington', u'Henry Compton'), 8),\n",
437 |        " ((u'George Washington', u'Martha Washington'), 8),\n",
438 |        " ((u'George Washington', u'John Adams'), 8),\n",
439 |        " ((u'George Washington', u'William Wake'), 8),\n",
440 |        " ((u'George Washington', u'John Tyler'), 8),\n",
441 |        " ((u'George Washington', u'Gibson'), 8),\n",
442 |        " ((u'George Washington', u'William &amp'), 8),\n",
443 |        " ((u'George Washington', u'Timothy Pickering'), 8),\n",
444 |        " ((u'Gardens Discover', u'George Washington'), 7),\n",
445 |        " ((u'George Washington', u'George Washington Birthplace National Monument'),\n",
446 |        "  7),\n",
447 |        " ((u'George Washington', u'Mount Vernon Estate'), 7),\n",
448 |        " ((u'George Washington', u'Miller Center'), 7),\n",
449 |        " ((u'George Washington', u'Museum &amp'), 7),\n",
450 |        " ((u'George Washington', u'Made George Washington'), 7)]"
451 |       ]
452 |      },
453 |      "execution_count": 50,
454 |      "metadata": {},
455 |      "output_type": "execute_result"
456 |     }
457 |    ],
458 |    "source": [
459 |     "from itertools import *\n",
460 |     "pairs = Counter([tuple(sorted(list(p))) for s in nes for p in combinations(s,2)])\n",
461 |     "[p for p in pairs.most_common(20) if p[0][0]!=p[0][1]]"
462 |    ]
463 |   },
464 |   {
465 |    "cell_type": "code",
466 |    "execution_count": null,
467 |    "id": "75cbe7bf",
468 |    "metadata": {
469 |     "collapsed": false
470 |    },
471 |    "outputs": [],
472 |    "source": []
473 |   }
474 |  ],
475 |  "metadata": {},
476 |  "nbformat": 4,
477 |  "nbformat_minor": 5
478 | }
479 | 


--------------------------------------------------------------------------------
/nlpa-nltk-corpora.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "1531ac8b",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Properties of Corpora"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": 1,
 14 |    "id": "ca3ec458",
 15 |    "metadata": {
 16 |     "collapsed": false
 17 |    },
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "from nltk.corpus import brown"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "markdown",
 25 |    "id": "d9d77308",
 26 |    "metadata": {},
 27 |    "source": [
 28 |     "## Corpora are Collections of Files"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": 17,
 34 |    "id": "2e30d45a",
 35 |    "metadata": {
 36 |     "collapsed": false
 37 |    },
 38 |    "outputs": [
 39 |     {
 40 |      "data": {
 41 |       "text/plain": [
 42 |        "FileSystemPathPointer('/home/tmb/nltk_data/corpora/brown')"
 43 |       ]
 44 |      },
 45 |      "execution_count": 17,
 46 |      "metadata": {},
 47 |      "output_type": "execute_result"
 48 |     }
 49 |    ],
 50 |    "source": [
 51 |     "brown.root"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": 18,
 57 |    "id": "b80d5ffa",
 58 |    "metadata": {
 59 |     "collapsed": false
 60 |    },
 61 |    "outputs": [
 62 |     {
 63 |      "data": {
 64 |       "text/plain": [
 65 |        "'BROWN CORPUS\\n\\nA Standard Corpus of Present-Day Edited American\\nEnglish, for use with Digital Computers.\\n\\nby W. N. Francis and H. Kucera (1964)\\nDepartment of Linguistics, Brown University\\nProvidence, Rhode Island, USA\\n\\nRevised 1971, Revised and Amplified 1979\\n\\nhttp://www.hit.uib.no/icame/brown/bcm.html\\n\\nDistributed with the permission of the copyright holder,\\nredistribution permitted.\\n'"
 66 |       ]
 67 |      },
 68 |      "execution_count": 18,
 69 |      "metadata": {},
 70 |      "output_type": "execute_result"
 71 |     }
 72 |    ],
 73 |    "source": [
 74 |     "brown.readme()"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": 15,
 80 |    "id": "220fca12",
 81 |    "metadata": {
 82 |     "collapsed": false
 83 |    },
 84 |    "outputs": [
 85 |     {
 86 |      "data": {
 87 |       "text/plain": [
 88 |        "['ca01',\n",
 89 |        " 'ca02',\n",
 90 |        " 'ca03',\n",
 91 |        " 'ca04',\n",
 92 |        " 'ca05',\n",
 93 |        " 'ca06',\n",
 94 |        " 'ca07',\n",
 95 |        " 'ca08',\n",
 96 |        " 'ca09',\n",
 97 |        " 'ca10']"
 98 |       ]
 99 |      },
100 |      "execution_count": 15,
101 |      "metadata": {},
102 |      "output_type": "execute_result"
103 |     }
104 |    ],
105 |    "source": [
106 |     "brown.fileids()[:10]"
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "markdown",
111 |    "id": "e0cc3e31",
112 |    "metadata": {},
113 |    "source": [
114 |     "Files may have different encodings; the default is ASCII processed as `str`."
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": 16,
120 |    "id": "4a480457",
121 |    "metadata": {
122 |     "collapsed": false
123 |    },
124 |    "outputs": [],
125 |    "source": [
126 |     "brown.encoding(\"ca01\")"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "markdown",
131 |    "id": "a7bb635b",
132 |    "metadata": {},
133 |    "source": [
134 |     "Files may also be in different categories."
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": 19,
140 |    "id": "2dc85ab3",
141 |    "metadata": {
142 |     "collapsed": false
143 |    },
144 |    "outputs": [
145 |     {
146 |      "data": {
147 |       "text/plain": [
148 |        "['adventure',\n",
149 |        " 'belles_lettres',\n",
150 |        " 'editorial',\n",
151 |        " 'fiction',\n",
152 |        " 'government',\n",
153 |        " 'hobbies',\n",
154 |        " 'humor',\n",
155 |        " 'learned',\n",
156 |        " 'lore',\n",
157 |        " 'mystery',\n",
158 |        " 'news',\n",
159 |        " 'religion',\n",
160 |        " 'reviews',\n",
161 |        " 'romance',\n",
162 |        " 'science_fiction']"
163 |       ]
164 |      },
165 |      "execution_count": 19,
166 |      "metadata": {},
167 |      "output_type": "execute_result"
168 |     }
169 |    ],
170 |    "source": [
171 |     "brown.categories()"
172 |    ]
173 |   },
174 |   {
175 |    "cell_type": "markdown",
176 |    "id": "b0a39b77",
177 |    "metadata": {},
178 |    "source": [
179 |     "## Accessing Content"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "markdown",
184 |    "id": "4ef0e3bd",
185 |    "metadata": {},
186 |    "source": [
187 |     "The corpus abstraction allows you to avoid having to deal with individual files, encodings, etc.\n",
188 |     "\n",
189 |     "That is, you can access all the words, all the text, all the sentences etc. in a corpus from a single object.\n"
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "code",
194 |    "execution_count": 54,
195 |    "id": "566b63f4",
196 |    "metadata": {
197 |     "collapsed": false
198 |    },
199 |    "outputs": [
200 |     {
201 |      "data": {
202 |       "text/plain": [
203 |        "'\\n\\n\\tThe/at Fulton/np-tl County/nn-tl Grand/jj-tl Jury/nn-tl said/vbd Friday/nr an/at investigation/nn'"
204 |       ]
205 |      },
206 |      "execution_count": 54,
207 |      "metadata": {},
208 |      "output_type": "execute_result"
209 |     }
210 |    ],
211 |    "source": [
212 |     "brown.raw()[:100]"
213 |    ]
214 |   },
215 |   {
216 |    "cell_type": "code",
217 |    "execution_count": 2,
218 |    "id": "897f7165",
219 |    "metadata": {
220 |     "collapsed": false
221 |    },
222 |    "outputs": [
223 |     {
224 |      "data": {
225 |       "text/plain": [
226 |        "['The',\n",
227 |        " 'Fulton',\n",
228 |        " 'County',\n",
229 |        " 'Grand',\n",
230 |        " 'Jury',\n",
231 |        " 'said',\n",
232 |        " 'Friday',\n",
233 |        " 'an',\n",
234 |        " 'investigation',\n",
235 |        " 'of']"
236 |       ]
237 |      },
238 |      "execution_count": 2,
239 |      "metadata": {},
240 |      "output_type": "execute_result"
241 |     }
242 |    ],
243 |    "source": [
244 |     "brown.words()[:10]"
245 |    ]
246 |   },
247 |   {
248 |    "cell_type": "code",
249 |    "execution_count": 5,
250 |    "id": "f1d12f36",
251 |    "metadata": {
252 |     "collapsed": false
253 |    },
254 |    "outputs": [
255 |     {
256 |      "name": "stdout",
257 |      "output_type": "stream",
258 |      "text": [
259 |       "['The', 'Fulton', 'County', 'Grand', 'Jury']\n",
260 |       "['The', 'jury', 'further', 'said', 'in']\n",
261 |       "['The', 'September-October', 'term', 'jury', 'had']\n",
262 |       "['``', 'Only', 'a', 'relative', 'handful']\n",
263 |       "['The', 'jury', 'said', 'it', 'did']\n",
264 |       "['It', 'recommended', 'that', 'Fulton', 'legislators']\n",
265 |       "['The', 'grand', 'jury', 'commented', 'on']\n",
266 |       "['Merger', 'proposed']\n",
267 |       "['However', ',', 'the', 'jury', 'said']\n",
268 |       "['The', 'City', 'Purchasing', 'Department', ',']\n"
269 |      ]
270 |     }
271 |    ],
272 |    "source": [
273 |     "for s in brown.sents()[:10]: print s[:5]"
274 |    ]
275 |   },
276 |   {
277 |    "cell_type": "code",
278 |    "execution_count": 6,
279 |    "id": "d9898140",
280 |    "metadata": {
281 |     "collapsed": false
282 |    },
283 |    "outputs": [
284 |     {
285 |      "data": {
286 |       "text/plain": [
287 |        "[('The', 'AT'),\n",
288 |        " ('Fulton', 'NP-TL'),\n",
289 |        " ('County', 'NN-TL'),\n",
290 |        " ('Grand', 'JJ-TL'),\n",
291 |        " ('Jury', 'NN-TL'),\n",
292 |        " ('said', 'VBD'),\n",
293 |        " ('Friday', 'NR'),\n",
294 |        " ('an', 'AT'),\n",
295 |        " ('investigation', 'NN'),\n",
296 |        " ('of', 'IN')]"
297 |       ]
298 |      },
299 |      "execution_count": 6,
300 |      "metadata": {},
301 |      "output_type": "execute_result"
302 |     }
303 |    ],
304 |    "source": [
305 |     "brown.tagged_words()[:10]"
306 |    ]
307 |   },
308 |   {
309 |    "cell_type": "code",
310 |    "execution_count": 8,
311 |    "id": "0dd9d6e1",
312 |    "metadata": {
313 |     "collapsed": false
314 |    },
315 |    "outputs": [
316 |     {
317 |      "data": {
318 |       "text/plain": [
319 |        "[('The', 'AT'),\n",
320 |        " ('Fulton', 'NP-TL'),\n",
321 |        " ('County', 'NN-TL'),\n",
322 |        " ('Grand', 'JJ-TL'),\n",
323 |        " ('Jury', 'NN-TL'),\n",
324 |        " ('said', 'VBD'),\n",
325 |        " ('Friday', 'NR'),\n",
326 |        " ('an', 'AT'),\n",
327 |        " ('investigation', 'NN'),\n",
328 |        " ('of', 'IN')]"
329 |       ]
330 |      },
331 |      "execution_count": 8,
332 |      "metadata": {},
333 |      "output_type": "execute_result"
334 |     }
335 |    ],
336 |    "source": [
337 |     "brown.tagged_sents()[0][:10]"
338 |    ]
339 |   },
340 |   {
341 |    "cell_type": "markdown",
342 |    "id": "9e53c928",
343 |    "metadata": {},
344 |    "source": [
345 |     "# Reading New Corpora"
346 |    ]
347 |   },
348 |   {
349 |    "cell_type": "code",
350 |    "execution_count": 20,
351 |    "id": "83bfd117",
352 |    "metadata": {
353 |     "collapsed": false
354 |    },
355 |    "outputs": [],
356 |    "source": [
357 |     "import nltk.corpus.reader"
358 |    ]
359 |   },
360 |   {
361 |    "cell_type": "code",
362 |    "execution_count": 30,
363 |    "id": "85dd6da1",
364 |    "metadata": {
365 |     "collapsed": false
366 |    },
367 |    "outputs": [],
368 |    "source": [
369 |     "corpus = nltk.corpus.reader.plaintext.PlaintextCorpusReader(\".\",r\"[ft].*txt\",encoding=\"utf8\")"
370 |    ]
371 |   },
372 |   {
373 |    "cell_type": "code",
374 |    "execution_count": 31,
375 |    "id": "31347fd3",
376 |    "metadata": {
377 |     "collapsed": false
378 |    },
379 |    "outputs": [
380 |     {
381 |      "data": {
382 |       "text/plain": [
383 |        "['faust.txt', 'tomsawyer.txt']"
384 |       ]
385 |      },
386 |      "execution_count": 31,
387 |      "metadata": {},
388 |      "output_type": "execute_result"
389 |     }
390 |    ],
391 |    "source": [
392 |     "corpus.fileids()"
393 |    ]
394 |   },
395 |   {
396 |    "cell_type": "code",
397 |    "execution_count": 32,
398 |    "id": "82606afb",
399 |    "metadata": {
400 |     "collapsed": false
401 |    },
402 |    "outputs": [
403 |     {
404 |      "data": {
405 |       "text/plain": [
406 |        "u'Faust: Der Trag\\xf6die erster Teil\\n\\nJohann Wolfgang von Goethe\\n\\n\\nZueignung.\\n\\nIhr naht euch wieder, schw'"
407 |       ]
408 |      },
409 |      "execution_count": 32,
410 |      "metadata": {},
411 |      "output_type": "execute_result"
412 |     }
413 |    ],
414 |    "source": [
415 |     "corpus.raw()[:100]"
416 |    ]
417 |   },
418 |   {
419 |    "cell_type": "code",
420 |    "execution_count": 33,
421 |    "id": "00d3e6bb",
422 |    "metadata": {
423 |     "collapsed": false
424 |    },
425 |    "outputs": [
426 |     {
427 |      "data": {
428 |       "text/plain": [
429 |        "[[[u'Faust', u':', u'Der', u'Trag\\xf6die', u'erster', u'Teil']],\n",
430 |        " [[u'Johann', u'Wolfgang', u'von', u'Goethe']]]"
431 |       ]
432 |      },
433 |      "execution_count": 33,
434 |      "metadata": {},
435 |      "output_type": "execute_result"
436 |     }
437 |    ],
438 |    "source": [
439 |     "corpus.paras()[:2]"
440 |    ]
441 |   },
442 |   {
443 |    "cell_type": "code",
444 |    "execution_count": 39,
445 |    "id": "46725ea0",
446 |    "metadata": {
447 |     "collapsed": false
448 |    },
449 |    "outputs": [
450 |     {
451 |      "name": "stdout",
452 |      "output_type": "stream",
453 |      "text": [
454 |       "[u'FAUST', u':', u'Vor', u'jenem', u'droben', u'steht', u'geb\\xfcckt', u',', u'Der', u'helfen', u'lehrt', u'und', u'H\\xfclfe', u'schickt', u'.']\n"
455 |      ]
456 |     }
457 |    ],
458 |    "source": [
459 |     "print corpus.sents()[500]"
460 |    ]
461 |   },
462 |   {
463 |    "cell_type": "code",
464 |    "execution_count": 40,
465 |    "id": "eb78fdf1",
466 |    "metadata": {
467 |     "collapsed": false
468 |    },
469 |    "outputs": [
470 |     {
471 |      "name": "stdout",
472 |      "output_type": "stream",
473 |      "text": [
474 |       "[u'heute', u'!', u'DICHTER', u':', u'O', u'sprich', u'mir', u'nicht', u'von', u'jener']\n"
475 |      ]
476 |     }
477 |    ],
478 |    "source": [
479 |     "print corpus.words()[500:510]"
480 |    ]
481 |   },
482 |   {
483 |    "cell_type": "code",
484 |    "execution_count": 44,
485 |    "id": "bc47a810",
486 |    "metadata": {
487 |     "collapsed": false
488 |    },
489 |    "outputs": [],
490 |    "source": [
491 |     "from nltk import Text\n",
492 |     "text = Text(corpus.words(\"tomsawyer.txt\"))"
493 |    ]
494 |   },
495 |   {
496 |    "cell_type": "code",
497 |    "execution_count": 47,
498 |    "id": "0317977b",
499 |    "metadata": {
500 |     "collapsed": false
501 |    },
502 |    "outputs": [
503 |     {
504 |      "name": "stdout",
505 |      "output_type": "stream",
506 |      "text": [
507 |       "Building index...\n",
508 |       "Displaying 25 of 647 matches:\n",
509 |       "\" TOM !\" No answer . \" What ' s gone with that boy , I wonder ? You TOM !\" No \n",
510 |       "ding down and punching under the bed with the broom , and so she needed breath\n",
511 |       "eded breath to punctuate the punches with . She resurrected nothing but the ca\n",
512 |       " - brother ) Sid was already through with his part of the work ( picking up ch\n",
513 |       "et vanity to believe she was endowed with a talent for dark and mysterious dip\n",
514 |       " sewed . \" Bother ! Well , go ' long with you . I ' d made sure you ' d played\n",
515 |       " didn ' t think you sewed his collar with white thread , but it ' s black .\" \"\n",
516 |       "it ' s black .\" \" Why , I did sew it with white ! Tom !\" But Tom did not wait \n",
517 |       " Confound it ! sometimes she sews it with white , and sometimes she sews it wi\n",
518 |       "th white , and sometimes she sews it with black . I wish to geeminy she ' d st\n",
519 |       "f it , and he strode down the street with his mouth full of harmony __________\n",
520 |       "ure is concerned , the advantage was with the boy , not the astronomer . The s\n",
521 |       "art , don ' t you ? I could lick you with one hand tied behind me , if I wante\n",
522 |       "do it .\" \" Well I will , if you fool with me .\" \" Oh yes -- I ' ve seen whole \n",
523 |       "n ' t either .\" So they stood , each with a foot placed at an angle as a brace\n",
524 |       " angle as a brace , and both shoving with might and main , and glowering at ea\n",
525 |       "d main , and glowering at each other with hate . But neither could get an adva\n",
526 |       "nd flushed , each relaxed his strain with watchful caution , and Tom said : \" \n",
527 |       "other on you , and he can thrash you with his little finger , and I ' ll make \n",
528 |       "it so .\" Tom drew a line in the dust with his big toe , and said : \" I dare yo\n",
529 |       " out of his pocket and held them out with derision . Tom struck them to the gr\n",
530 |       "er ' s nose , and covered themselves with dust and glory . Presently the confu\n",
531 |       "tride the new boy , and pounding him with his fists . \" Holler ' nuff !\" said \n",
532 |       "Better look out who you ' re fooling with next time .\" The new boy went off br\n",
533 |       "ht him out .\" To which Tom responded with jeers , and started off in high feat\n"
534 |      ]
535 |     }
536 |    ],
537 |    "source": [
538 |     "text.concordance(\"with\")"
539 |    ]
540 |   },
541 |   {
542 |    "cell_type": "code",
543 |    "execution_count": 48,
544 |    "id": "cb9e02f5",
545 |    "metadata": {
546 |     "collapsed": false
547 |    },
548 |    "outputs": [
549 |     {
550 |      "name": "stdout",
551 |      "output_type": "stream",
552 |      "text": [
553 |       "Building word-context index...\n",
554 |       "and in on to for of was at into up s that through but if just upon\n",
555 |       "what as by\n"
556 |      ]
557 |     }
558 |    ],
559 |    "source": [
560 |     "text.similar(\"with\")"
561 |    ]
562 |   },
563 |   {
564 |    "cell_type": "code",
565 |    "execution_count": 50,
566 |    "id": "ee7c7ec1",
567 |    "metadata": {
568 |     "collapsed": false
569 |    },
570 |    "outputs": [
571 |     {
572 |      "name": "stdout",
573 |      "output_type": "stream",
574 |      "text": [
575 |       "but_the is_a long_you up_a\n"
576 |      ]
577 |     }
578 |    ],
579 |    "source": [
580 |     "text.common_contexts([\"with\",\"as\"])"
581 |    ]
582 |   }
583 |  ],
584 |  "metadata": {},
585 |  "nbformat": 4,
586 |  "nbformat_minor": 5
587 | }
588 | 


--------------------------------------------------------------------------------
/nlpa-openfst-edit-distance.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # <nbformat>3.0</nbformat>
  3 | 
  4 | # <codecell>
  5 | 
  6 | from pylab import *
  7 | import openfst
  8 | from openfst import StdVectorFst as FST
  9 | from openfst import LogVectorFst as LFST
 10 | from fstutils import *
 11 | 
 12 | # <headingcell level=1>
 13 | 
 14 | # Simple Edit Distance
 15 | 
 16 | # <markdowncell>
 17 | 
 18 | # It's easy in principle to compute edit distance with finite state transducers.
 19 | # We construct a transducer that takes takes each symbol in the alphabet to itself with cost 0,
 20 | # and takes each symbol to a different symbol, or to/from epsilon with cost 1.
 21 | # This transducer is called a *flower transducer* because of its appearance.
 22 | # 
 23 | # We then compose an FST corresponding to the first string with this transducer,
 24 | # compose the result with an FST corresponding to the second string, and compute
 25 | # the cost of the shortest path.
 26 | 
 27 | # <codecell>
 28 | 
 29 | def make_flower(chars):
 30 |     epsilon = 0
 31 |     fst = FST()
 32 |     s = fst.AddState()
 33 |     fst.SetStart(s)
 34 |     fst.SetFinal(s,0.0)
 35 |     for c in chars:
 36 |         c = ord(c)
 37 |         fst.AddArc(s,c,c,0.0,s)
 38 |         fst.AddArc(s,c,epsilon,1.0,s)
 39 |         fst.AddArc(s,epsilon,c,1.0,s)
 40 |         for c2 in chars:
 41 |             c2 = ord(c2)
 42 |             fst.AddArc(s,c,c2,1.0,s)
 43 |     return fst
 44 | 
 45 | # <codecell>
 46 | 
 47 | flower = make_flower("AB")
 48 | show_fst(flower)
 49 | 
 50 | # <codecell>
 51 | 
 52 | fst1 = FST()
 53 | fst1.AddString("AABBAAA")
 54 | fst2 = FST()
 55 | fst2.AddString("AABBABAB")
 56 | 
 57 | # <codecell>
 58 | 
 59 | temp1 = FST()
 60 | openfst.ArcSortOutput(fst1)
 61 | openfst.ArcSortInput(flower)
 62 | openfst.Compose(fst1,flower,temp1)
 63 | show_fst(temp1)
 64 | 
 65 | # <codecell>
 66 | 
 67 | temp2 = FST()
 68 | openfst.ArcSortOutput(temp1)
 69 | openfst.ArcSortInput(fst2)
 70 | openfst.Compose(temp1,fst2,temp2)
 71 | show_fst(temp2)
 72 | 
 73 | # <codecell>
 74 | 
 75 | result = FST()
 76 | openfst.ShortestPath(temp2,result,1)
 77 | show_fst(result)
 78 | 
 79 | # <codecell>
 80 | 
 81 | print fstsize(temp1),fstsize(temp2)
 82 | 
 83 | # <headingcell level=1>
 84 | 
 85 | # Factoring the Edit Distance Transducer
 86 | 
 87 | # <markdowncell>
 88 | 
 89 | # The problem with the previous transducer is that it gets very large very quickly when
 90 | # composed with the original string. In fact, the size ends up being quadratic.
 91 | # 
 92 | # We can fix this by introducing some additional symbols. 
 93 | # (Here, we're just using ASCII symbols to represent insertion, deletion, and substitution, but we could
 94 | # be using something fancier.)
 95 | 
 96 | # <codecell>
 97 | 
 98 | epsilon = 0
 99 | insertion = ord("#")
100 | deletion = ord("_")
101 | substitution = ord("~")
102 | 
103 | def make_left(chars):
104 |     fst = FST()
105 |     s = fst.AddState()
106 |     fst.SetStart(s)
107 |     fst.SetFinal(s,0.0)
108 |     fst.AddArc(s,epsilon,insertion,0.5,s)
109 |     for c in chars:
110 |         c = ord(c)
111 |         fst.AddArc(s,c,c,0.0,s)
112 |         fst.AddArc(s,c,substitution,0.5,s)
113 |         fst.AddArc(s,c,deletion,0.5,s)
114 |     return fst
115 | 
116 | def make_right(chars):
117 |     fst = FST()
118 |     s = fst.AddState()
119 |     fst.SetStart(s)
120 |     fst.SetFinal(s,0.0)
121 |     fst.AddArc(s,deletion,epsilon,0.5,s)
122 |     for c in chars:
123 |         c = ord(c)
124 |         fst.AddArc(s,c,c,0.0,s)
125 |         fst.AddArc(s,substitution,c,0.5,s)
126 |         fst.AddArc(s,insertion,c,0.5,s)
127 |     return fst
128 | 
129 | # <codecell>
130 | 
131 | temp1 = FST()
132 | temp2 = FST()
133 | openfst.Compose(fst1,make_left("AB"),temp1)
134 | openfst.Compose(make_right("AB"),fst2,temp2)
135 | print fstsize(temp1),fstsize(temp2)
136 | 
137 | # <codecell>
138 | 
139 | show_fst(temp1)
140 | 
141 | # <codecell>
142 | 
143 | show_fst(temp2)
144 | 
145 | # <codecell>
146 | 
147 | temp3 = FST()
148 | openfst.ArcSortOutput(temp1)
149 | openfst.ArcSortInput(temp2)
150 | openfst.Compose(temp1,temp2,temp3)
151 | result = FST()
152 | openfst.ShortestPath(temp3,result,1)
153 | print fstsize(result)
154 | show_fst(result)
155 | 
156 | # <markdowncell>
157 | 
158 | # This becomes particularly important when using larger alphabets. Here is an illustration.
159 | 
160 | # <codecell>
161 | 
162 | ascii = "".join([chr(c) for c in range(32,127) if c not in [ord("~"),ord("_"),ord("#")]])
163 | 
164 | # <codecell>
165 | 
166 | ascii_left = make_left(ascii)
167 | ascii_right = make_right(ascii)
168 | 
169 | # <codecell>
170 | 
171 | def edit_distance(s1,s2):
172 |     fst1 = FST()
173 |     fst1.AddString(s1)
174 |     fst2 = FST()
175 |     fst2.AddString(s2)
176 |     temp1 = FST()
177 |     temp2 = FST()
178 |     openfst.Compose(fst1,ascii_left,temp1)
179 |     openfst.Compose(ascii_right,fst2,temp2)
180 |     print fstsize(temp1),fstsize(temp2)
181 |     temp3 = FST()
182 |     openfst.ArcSortOutput(temp1)
183 |     openfst.ArcSortInput(temp2)
184 |     openfst.Compose(temp1,temp2,temp3)
185 |     print fstsize(temp3)
186 |     result = FST()
187 |     openfst.ShortestPath(temp3,result,1)
188 |     return result
189 | 
190 | # <codecell>
191 | 
192 | show_fst(edit_distance("quick fox","quack fowl"))
193 | 
194 | # <headingcell level=1>
195 | 
196 | # Limited Contiguous Insertions / Deletions
197 | 
198 | # <markdowncell>
199 | 
200 | # A second way in which we can make edit distance computations more efficient
201 | # is to limit the number of consecutive deletions/insertions that can occur.
202 | # 
203 | # (Think about what constraint this corresponds to for a "manual" computation of the edit distance.)
204 | 
205 | # <codecell>
206 | 
207 | epsilon = 0
208 | def make_edit1(chars):
209 |     fst = FST()
210 |     s = fst.AddState()
211 |     s2 = fst.AddState()
212 |     fst.SetStart(s)
213 |     fst.SetFinal(s,0.0)
214 |     fst.SetFinal(s2,0.0)
215 |     for c in chars:
216 |         c = ord(c)
217 |         fst.AddArc(s,c,c,0.0,s)
218 |         fst.AddArc(s,c,epsilon,1.0,s2)
219 |         fst.AddArc(s,epsilon,c,1.0,s2)
220 |         fst.AddArc(s2,c,c,0.0,s)
221 |         for c2 in chars:
222 |             c2 = ord(c2)
223 |             fst.AddArc(s,c,c2,1.0,s)
224 |             fst.AddArc(s2,c,c2,1.0,s)
225 |     return fst
226 | 
227 | # <codecell>
228 | 
229 | temp1 = FST()
230 | openfst.ArcSortOutput(fst1)
231 | efst = make_edit1("AB")
232 | openfst.ArcSortInput(efst)
233 | openfst.Compose(fst1,efst,temp1)
234 | show_fst(temp1)
235 | temp2 = FST()
236 | openfst.ArcSortOutput(temp1)
237 | openfst.ArcSortInput(fst2)
238 | openfst.Compose(temp1,fst2,temp2)
239 | show_fst(temp2)
240 | print fstsize(temp2)
241 | 
242 | # <codecell>
243 | 
244 | result = FST()
245 | openfst.ShortestPath(temp2,result,1)
246 | show_fst(result)
247 | 
248 | # <codecell>
249 | 
250 | temp1 = FST()
251 | openfst.ArcSortOutput(fst1)
252 | efst = make_flower("AB")
253 | openfst.ArcSortInput(efst)
254 | openfst.Compose(fst1,efst,temp1)
255 | show_fst(temp1)
256 | temp2 = FST()
257 | openfst.ArcSortOutput(temp1)
258 | openfst.ArcSortInput(fst2)
259 | openfst.Compose(temp1,fst2,temp2)
260 | show_fst(temp2)
261 | print fstsize(temp2)
262 | 
263 | # <codecell>
264 | 
265 | result = FST()
266 | openfst.ShortestPath(temp2,result,1)
267 | show_fst(result)
268 | 
269 | # <headingcell level=1>
270 | 
271 | # Oracle Edit Distance
272 | 
273 | # <markdowncell>
274 | 
275 | # The regular edit distance is limited to computing the best match between two strings.
276 | # However, with finite state transducers, we can compute the best match between two
277 | # sets of strings.
278 | 
279 | # <codecell>
280 | 
281 | # recognition output
282 | fst1 = FST()
283 | fst1.AddString("qulck")
284 | fst1.AddString("qwck")
285 | fst1.AddString("quidc")
286 | fst1 = minimize(fst1)
287 | show_fst(fst1)
288 | 
289 | # <codecell>
290 | 
291 | # English dictionary
292 | fst2 = FST()
293 | with open("basic-english.txt") as stream:
294 |     for line in stream.readlines():
295 |         line = line.strip()
296 |         fst2.AddString(line)
297 | print fstsize(fst2)
298 | fst2 = minimize(fst2)
299 | print fstsize(fst2)
300 | 
301 | # <codecell>
302 | 
303 | temp2 = FST()
304 | openfst.ArcSortOutput(ascii_right)
305 | openfst.ArcSortInput(fst2)
306 | openfst.Compose(ascii_right,fst2,temp2)
307 | print fstsize(temp2)
308 | 
309 | # <codecell>
310 | 
311 | temp2 = minimize(temp2)
312 | 
313 | # <codecell>
314 | 
315 | temp1 = FST()
316 | openfst.Compose(fst1,ascii_left,temp1)
317 | print fstsize(temp1),fstsize(temp2)
318 | temp3 = FST()
319 | openfst.ArcSortOutput(temp1)
320 | openfst.ArcSortInput(temp2)
321 | openfst.Compose(temp1,temp2,temp3)
322 | print fstsize(temp3)
323 | result = FST()
324 | openfst.ShortestPath(temp3,result,1)
325 | show_fst(result)
326 | 
327 | # <codecell>
328 | 
329 | 
330 | 


--------------------------------------------------------------------------------
/nlpa-unix-cleanup.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "3468f54a",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Making the Brown Corpus Readable"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "markdown",
 13 |    "id": "3906c193",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "Here's a simple example of developing a command line removing the tags from the Brown corpus files."
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "code",
 21 |    "execution_count": 1,
 22 |    "id": "4b86f5be",
 23 |    "metadata": {
 24 |     "collapsed": false
 25 |    },
 26 |    "outputs": [
 27 |     {
 28 |      "name": "stdout",
 29 |      "output_type": "stream",
 30 |      "text": [
 31 |       "CONTENTS\r\n",
 32 |       "README\r\n",
 33 |       "ca01\r\n",
 34 |       "ca02\r\n",
 35 |       "ca03\r\n",
 36 |       "ca04\r\n",
 37 |       "ca05\r\n",
 38 |       "ca06\r\n",
 39 |       "ca07\r\n",
 40 |       "ca08\r\n"
 41 |      ]
 42 |     }
 43 |    ],
 44 |    "source": [
 45 |     "!ls brown/. | head"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "markdown",
 50 |    "id": "ce619cd5",
 51 |    "metadata": {},
 52 |    "source": [
 53 |     "We should probably look at the README for the definition of the tag file, but let's just figure this out."
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": 2,
 59 |    "id": "88e3f236",
 60 |    "metadata": {
 61 |     "collapsed": false
 62 |    },
 63 |    "outputs": [
 64 |     {
 65 |      "name": "stdout",
 66 |      "output_type": "stream",
 67 |      "text": [
 68 |       "BROWN CORPUS\r\n",
 69 |       "\r\n",
 70 |       "A Standard Corpus of Present-Day Edited American\r\n",
 71 |       "English, for use with Digital Computers.\r\n",
 72 |       "\r\n",
 73 |       "by W. N. Francis and H. Kucera (1964)\r\n",
 74 |       "Department of Linguistics, Brown University\r\n",
 75 |       "Providence, Rhode Island, USA\r\n",
 76 |       "\r\n",
 77 |       "Revised 1971, Revised and Amplified 1979\r\n"
 78 |      ]
 79 |     }
 80 |    ],
 81 |    "source": [
 82 |     "!head brown/README"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "markdown",
 87 |    "id": "7a58cf6a",
 88 |    "metadata": {},
 89 |    "source": [
 90 |     "Here's the first 10 lines from the file `brown/ca07`."
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "code",
 95 |    "execution_count": 3,
 96 |    "id": "c9f39ff7",
 97 |    "metadata": {
 98 |     "collapsed": false
 99 |    },
100 |    "outputs": [
101 |     {
102 |      "name": "stdout",
103 |      "output_type": "stream",
104 |      "text": [
105 |       "\r\n",
106 |       "\r\n",
107 |       "\tResentment/nn welled/vbd up/rp yesterday/nr among/in Democratic/jj-tl district/nn leaders/nns and/cc some/dti county/nn leaders/nns at/in reports/nns that/cs Mayor/nn-tl Wagner/np had/hvd decided/vbn to/to seek/vb a/at third/od term/nn with/in Paul/np R./np Screvane/np and/cc Abraham/np D./np Beame/np as/cs running/vbg mates/nns ./.\r\n",
108 |       "\r\n",
109 |       "\r\n",
110 |       "\tAt/in the/at same/ap time/nn reaction/nn among/in anti-organization/jj Democratic/jj-tl leaders/nns and/cc in/in the/at Liberal/jj-tl party/nn to/in the/at Mayor's/nn$-tl reported/vbn plan/nn was/bedz generally/rb favorable/jj ./.\r\n",
111 |       "\r\n",
112 |       "\r\n",
113 |       "\tSome/dti anti-organization/jj Democrats/nps saw/vbd in/in the/at program/nn an/at opportunity/nn to/to end/vb the/at bitter/jj internal/jj fight/nn within/in the/at Democratic/jj-tl party/nn that/wps has/hvz been/ben going/vbg on/rp for/in the/at last/ap three/cd years/nns ./.\r\n",
114 |       "\r\n"
115 |      ]
116 |     }
117 |    ],
118 |    "source": [
119 |     "!sed 10q brown/ca07"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "markdown",
124 |    "id": "7e376c42",
125 |    "metadata": {},
126 |    "source": [
127 |     "The main thing is that every word or punctuation is followed by a `/something`.\n",
128 |     "We can remove that with a simple regular expression.\n",
129 |     "Well, it's not quite so simple...\n",
130 |     "\n",
131 |     "- We want to replace `/`, but that's already the regular expression delimiter, so we need to escape it: `\\/`\n",
132 |     "- the `g` is needed because we want to replace all occurrences"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": 7,
138 |    "id": "6106ffad",
139 |    "metadata": {
140 |     "collapsed": false
141 |    },
142 |    "outputs": [
143 |     {
144 |      "name": "stdout",
145 |      "output_type": "stream",
146 |      "text": [
147 |       "\r\n",
148 |       "\r\n",
149 |       "\tResentment welled up yesterday among Democratic district leaders and some county leaders at reports that Mayor Wagner had decided to seek a third term with Paul R. Screvane and Abraham D. Beame as running mates .\r\n",
150 |       "\r\n",
151 |       "\r\n",
152 |       "\tAt the same time reaction among anti-organization Democratic leaders and in the Liberal party to the Mayor's reported plan was generally favorable .\r\n",
153 |       "\r\n",
154 |       "\r\n",
155 |       "\tSome anti-organization Democrats saw in the program an opportunity to end the bitter internal fight within the Democratic party that has been going on for the last three years .\r\n",
156 |       "\r\n"
157 |      ]
158 |     }
159 |    ],
160 |    "source": [
161 |     "!sed 's/\\/[^ ]*//g;10q' brown/ca07"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "markdown",
166 |    "id": "a78c6e56",
167 |    "metadata": {},
168 |    "source": [
169 |     "Let's now clean up the whitespace at the beginning of the line.  `\\t` is a shorthand for the tab character."
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "code",
174 |    "execution_count": 8,
175 |    "id": "1f7307db",
176 |    "metadata": {
177 |     "collapsed": false
178 |    },
179 |    "outputs": [
180 |     {
181 |      "name": "stdout",
182 |      "output_type": "stream",
183 |      "text": [
184 |       "\r\n",
185 |       "\r\n",
186 |       "Resentment welled up yesterday among Democratic district leaders and some county leaders at reports that Mayor Wagner had decided to seek a third term with Paul R. Screvane and Abraham D. Beame as running mates .\r\n",
187 |       "\r\n",
188 |       "\r\n",
189 |       "At the same time reaction among anti-organization Democratic leaders and in the Liberal party to the Mayor's reported plan was generally favorable .\r\n",
190 |       "\r\n",
191 |       "\r\n",
192 |       "Some anti-organization Democrats saw in the program an opportunity to end the bitter internal fight within the Democratic party that has been going on for the last three years .\r\n",
193 |       "\r\n"
194 |      ]
195 |     }
196 |    ],
197 |    "source": [
198 |     "!sed 's/\\/[^ ]*//g;s/^[ \\t]*//;10q' brown/ca07"
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "markdown",
203 |    "id": "211b87da",
204 |    "metadata": {},
205 |    "source": [
206 |     "There are a lot of blank lines; the `cat -s` (squeeze) command gets rid of them."
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "code",
211 |    "execution_count": 9,
212 |    "id": "32f18008",
213 |    "metadata": {
214 |     "collapsed": false
215 |    },
216 |    "outputs": [
217 |     {
218 |      "name": "stdout",
219 |      "output_type": "stream",
220 |      "text": [
221 |       "\r\n",
222 |       "Resentment welled up yesterday among Democratic district leaders and some county leaders at reports that Mayor Wagner had decided to seek a third term with Paul R. Screvane and Abraham D. Beame as running mates .\r\n",
223 |       "\r\n",
224 |       "At the same time reaction among anti-organization Democratic leaders and in the Liberal party to the Mayor's reported plan was generally favorable .\r\n",
225 |       "\r\n",
226 |       "Some anti-organization Democrats saw in the program an opportunity to end the bitter internal fight within the Democratic party that has been going on for the last three years .\r\n",
227 |       "\r\n"
228 |      ]
229 |     }
230 |    ],
231 |    "source": [
232 |     "!sed 's/\\/[^ ]*//g;s/^[ \\t]*//;10q' brown/ca07 | cat -s"
233 |    ]
234 |   },
235 |   {
236 |    "cell_type": "markdown",
237 |    "id": "8135c8f7",
238 |    "metadata": {},
239 |    "source": [
240 |     "Now we still have a problem with extra spaces before punctuation.\n",
241 |     "We can fix that with another regular expression.\n",
242 |     "This one contains *grouping* `\\(...\\)` and a backwards reference to the group `\\1`"
243 |    ]
244 |   },
245 |   {
246 |    "cell_type": "code",
247 |    "execution_count": 13,
248 |    "id": "99222f49",
249 |    "metadata": {
250 |     "collapsed": false
251 |    },
252 |    "outputs": [
253 |     {
254 |      "name": "stdout",
255 |      "output_type": "stream",
256 |      "text": [
257 |       "\r\n",
258 |       "Resentment welled up yesterday among Democratic district leaders and some county leaders at reports that Mayor Wagner had decided to seek a third term with Paul R. Screvane and Abraham D. Beame as running mates.\r\n",
259 |       "\r\n",
260 |       "At the same time reaction among anti-organization Democratic leaders and in the Liberal party to the Mayor's reported plan was generally favorable.\r\n",
261 |       "\r\n",
262 |       "Some anti-organization Democrats saw in the program an opportunity to end the bitter internal fight within the Democratic party that has been going on for the last three years.\r\n",
263 |       "\r\n"
264 |      ]
265 |     }
266 |    ],
267 |    "source": [
268 |     "!sed 's/\\/[^ ]*//g;s/^[ \\t]*//;s/ \\([.,]\\)/\\1/;10q' brown/ca07 | cat -s"
269 |    ]
270 |   },
271 |   {
272 |    "cell_type": "markdown",
273 |    "id": "f8378292",
274 |    "metadata": {},
275 |    "source": [
276 |     "Finally, let's wrap the long lines back around.  The `fmt` command is handy for that."
277 |    ]
278 |   },
279 |   {
280 |    "cell_type": "code",
281 |    "execution_count": 14,
282 |    "id": "4e221ba4",
283 |    "metadata": {
284 |     "collapsed": false
285 |    },
286 |    "outputs": [
287 |     {
288 |      "name": "stdout",
289 |      "output_type": "stream",
290 |      "text": [
291 |       "\r\n",
292 |       "Resentment welled up yesterday among Democratic district leaders and\r\n",
293 |       "some county leaders at reports that Mayor Wagner had decided to seek a\r\n",
294 |       "third term with Paul R. Screvane and Abraham D. Beame as running mates.\r\n",
295 |       "\r\n",
296 |       "At the same time reaction among anti-organization Democratic leaders and\r\n",
297 |       "in the Liberal party to the Mayor's reported plan was generally favorable.\r\n",
298 |       "\r\n",
299 |       "Some anti-organization Democrats saw in the program an opportunity to\r\n",
300 |       "end the bitter internal fight within the Democratic party that has been\r\n",
301 |       "going on for the last three years.\r\n",
302 |       "\r\n"
303 |      ]
304 |     }
305 |    ],
306 |    "source": [
307 |     "!sed 's/\\/[^ ]*//g;s/^[ \\t]*//;s/ \\([.,]\\)/\\1/;10q' brown/ca07 | cat -s | fmt"
308 |    ]
309 |   }
310 |  ],
311 |  "metadata": {},
312 |  "nbformat": 4,
313 |  "nbformat_minor": 5
314 | }
315 | 


--------------------------------------------------------------------------------
/nltk-summary-stemming-lemmatizing.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 39,
  6 |    "id": "b28adc0f",
  7 |    "metadata": {
  8 |     "collapsed": false
  9 |    },
 10 |    "outputs": [],
 11 |    "source": [
 12 |     "from nltk.stem import SnowballStemmer,PorterStemmer,WordNetLemmatizer"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "markdown",
 17 |    "id": "c0db7299",
 18 |    "metadata": {},
 19 |    "source": [
 20 |     "# Grammar"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "markdown",
 25 |    "id": "f108dbfd",
 26 |    "metadata": {},
 27 |    "source": [
 28 |     "Recall:\n",
 29 |     "\n",
 30 |     "- inflection - systematic alteration of words according to grammatical rules\n",
 31 |     "- declension - nouns, adjectives, articles, pronouns - number, gender, case\n",
 32 |     "- conjugation - verbs - person, number, tense, gender, aspect, mood, voice\n",
 33 |     "\n",
 34 |     "Some of the terms:\n",
 35 |     "\n",
 36 |     "- person, number tense, gender... pretty obvious\n",
 37 |     "- voice - relationship between verb and its arguments (subject, object, ...)\n",
 38 |     "- aspect - ongoing, completed, habitual, consequential, ...\n",
 39 |     "- mood - actual, hypothetical, counterfactual, wished for, conditional, command, question, ..."
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "markdown",
 44 |    "id": "5dcc0d8c",
 45 |    "metadata": {},
 46 |    "source": [
 47 |     "# Porter Stemmer on English"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 43,
 53 |    "id": "b431c369",
 54 |    "metadata": {
 55 |     "collapsed": false
 56 |    },
 57 |    "outputs": [],
 58 |    "source": [
 59 |     "en_nouns = \\\n",
 60 |     "[\n",
 61 |     "    \"house houses house's\",\n",
 62 |     "    \"child children\",\n",
 63 |     "]\n",
 64 |     "en_verbs = \\\n",
 65 |     "[\n",
 66 |     "    \"walk walked walking walks\",\n",
 67 |     "    \"see saw sees seen seeing\",\n",
 68 |     "]\n",
 69 |     "en_cases = en_nouns + en_verbs"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "code",
 74 |    "execution_count": 44,
 75 |    "id": "5801e1c4",
 76 |    "metadata": {
 77 |     "collapsed": false
 78 |    },
 79 |    "outputs": [
 80 |     {
 81 |      "name": "stdout",
 82 |      "output_type": "stream",
 83 |      "text": [
 84 |       "hous hous house'\n",
 85 |       "child children\n",
 86 |       "walk walk walk walk\n",
 87 |       "see saw see seen see\n"
 88 |      ]
 89 |     }
 90 |    ],
 91 |    "source": [
 92 |     "pen = PorterStemmer()\n",
 93 |     "for c in en_cases:\n",
 94 |     "    for w in c.split():\n",
 95 |     "        print pen.stem(w),\n",
 96 |     "    print"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "markdown",
101 |    "id": "10cfb39d",
102 |    "metadata": {},
103 |    "source": [
104 |     "# Snowball Stemmer on German"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "code",
109 |    "execution_count": 36,
110 |    "id": "e4ff6b4a",
111 |    "metadata": {
112 |     "collapsed": false
113 |    },
114 |    "outputs": [],
115 |    "source": [
116 |     "de_nouns = \\\n",
117 |     "[\n",
118 |     "    u\"Bruder Bruders Brüder Brüdern\",\n",
119 |     "    u\"Leuchte Leuchten\",\n",
120 |     "    u\"Haus Hauses Hause Häuser Häusern\",\n",
121 |     "]\n",
122 |     "de_verbs = \\\n",
123 |     "[\n",
124 |     "    u\"geb geben gebe gibst gibt gebt gab gabst gaben gabt gegeben gäbe gäbst gäb gäben gäbet\",\n",
125 |     "    u\"fangen fang fange fängst fängt fangen fangt fing fingst fingen fingt\",\n",
126 |     "    u\"backen backe backst backt backte backtest backten backtet gebackt gebackte\",\n",
127 |     "    u\"bäckst bäckt bukest bükest\",\n",
128 |     "    \n",
129 |     "]\n",
130 |     "de_cases = de_nouns+de_verbs"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "code",
135 |    "execution_count": 38,
136 |    "id": "eff68904",
137 |    "metadata": {
138 |     "collapsed": false
139 |    },
140 |    "outputs": [
141 |     {
142 |      "name": "stdout",
143 |      "output_type": "stream",
144 |      "text": [
145 |       "brud brud brud brud\n",
146 |       "leucht leucht\n",
147 |       "haus haus haus haus haus\n",
148 |       "geb geb geb gibst gibt gebt gab gabst gab gabt gegeb gab gabst gab gab gabet\n",
149 |       "fang fang fang fang fangt fang fangt fing fing fing fingt\n",
150 |       "back back back backt backt backt backt backtet gebackt gebackt\n",
151 |       "back backt buk buk\n"
152 |      ]
153 |     }
154 |    ],
155 |    "source": [
156 |     "des = SnowballStemmer(\"german\")\n",
157 |     "for c in de_cases:\n",
158 |     "    for w in c.split():\n",
159 |     "        print des.stem(w),\n",
160 |     "    print"
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "markdown",
165 |    "id": "0ffdca22",
166 |    "metadata": {},
167 |    "source": [
168 |     "# WordNet Lemmatization"
169 |    ]
170 |   },
171 |   {
172 |    "cell_type": "code",
173 |    "execution_count": 40,
174 |    "id": "a766109f",
175 |    "metadata": {
176 |     "collapsed": false
177 |    },
178 |    "outputs": [],
179 |    "source": [
180 |     "wnl = WordNetLemmatizer()"
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "code",
185 |    "execution_count": 45,
186 |    "id": "f5e43ff7",
187 |    "metadata": {
188 |     "collapsed": false
189 |    },
190 |    "outputs": [
191 |     {
192 |      "name": "stdout",
193 |      "output_type": "stream",
194 |      "text": [
195 |       "house house house's\n",
196 |       "child child\n"
197 |      ]
198 |     }
199 |    ],
200 |    "source": [
201 |     "for c in en_nouns:\n",
202 |     "    for w in c.split():\n",
203 |     "        print wnl.lemmatize(w,pos='n'),\n",
204 |     "    print"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": 46,
210 |    "id": "08bc0780",
211 |    "metadata": {
212 |     "collapsed": false
213 |    },
214 |    "outputs": [
215 |     {
216 |      "name": "stdout",
217 |      "output_type": "stream",
218 |      "text": [
219 |       "walk walk walk walk\n",
220 |       "see saw see see see\n"
221 |      ]
222 |     }
223 |    ],
224 |    "source": [
225 |     "for c in en_verbs:\n",
226 |     "    for w in c.split():\n",
227 |     "        print wnl.lemmatize(w,pos='v'),\n",
228 |     "    print"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "code",
233 |    "execution_count": 47,
234 |    "id": "623e970f",
235 |    "metadata": {
236 |     "collapsed": false
237 |    },
238 |    "outputs": [
239 |     {
240 |      "name": "stdout",
241 |      "output_type": "stream",
242 |      "text": [
243 |       "walk walked walking walk\n",
244 |       "see saw see seen seeing\n",
245 |       "house house house's\n",
246 |       "child child\n"
247 |      ]
248 |     }
249 |    ],
250 |    "source": [
251 |     "for c in en_verbs+en_nouns:\n",
252 |     "    for w in c.split():\n",
253 |     "        print wnl.lemmatize(w),\n",
254 |     "    print"
255 |    ]
256 |   },
257 |   {
258 |    "cell_type": "code",
259 |    "execution_count": null,
260 |    "id": "f48f8926",
261 |    "metadata": {
262 |     "collapsed": false
263 |    },
264 |    "outputs": [],
265 |    "source": []
266 |   }
267 |  ],
268 |  "metadata": {},
269 |  "nbformat": 4,
270 |  "nbformat_minor": 5
271 | }
272 | 


--------------------------------------------------------------------------------
/tagutils.py:
--------------------------------------------------------------------------------
 1 | def stag(wt):
 2 |     w,t = wt
 3 |     if t=="Unk": return "<i>[{}]</i>".format(w)
 4 |     if t=="AT": return "<font size=-2>{}</font>".format(w)
 5 |     if t=="IN": return "<font color='#c08000' size=+2>{}</font>".format(w)
 6 |     if t=="CS": return "<font color='purple' size=+1>{}</font>".format(w)
 7 |     if t=="CC": return "<font color='purple' size=+1>{}</font>".format(w)
 8 |     if t[0]=="N": return "<font color='blue' size=+1>{}</font>".format(w)
 9 |     if t[:2]=="PP": return "<font color='#8080ff' size=+1>{}</font>".format(w)    
10 |     if t[:2]=="DO" or t[:2]=="EX" or t[:2]=="HV" or t[:2]=="MD": 
11 |         # do, be, have, modal
12 |         return "<font color='red'>{}</font>".format(w)
13 |     if t[0]=="V": 
14 |         return "<font color='red' size=+2>{}</font>".format(w)
15 |     if "JJ" in t: return "<font color='green'>{}</font>".format(w)
16 |     if "RB" in t and "WRB" not in t: 
17 |         return "<font color='#00c080''>{}</font>".format(w)
18 |     return w
19 | 
20 | def stags(tagged):
21 |     return " ".join([stag(x) for x in tagged])
22 | 
23 | def mstags(sentences):
24 |     return "<p />\n".join(stags(s) for s in sentences)
25 | 


--------------------------------------------------------------------------------