├── .gitignore ├── BPE └── BPE.py ├── Chapter12.ipynb ├── Chapter13.ipynb ├── Chapter14.ipynb ├── Chapter2.ipynb ├── Chapter3.ipynb ├── Chapter4.ipynb ├── Chapter8.ipynb ├── Constituency Parsing ├── CKY.py ├── Chmosky.py ├── __pycache__ │ ├── Chmosky.cpython-36.pyc │ └── utils.cpython-36.pyc └── utils.py ├── POS Tagging ├── __pycache__ │ └── taggers.cpython-36.pyc ├── main.py └── taggers.py ├── README.md ├── Statistical Constituency Parsing ├── CKY.py ├── __pycache__ │ ├── Chmosky.cpython-36.pyc │ └── utils.cpython-36.pyc └── utils.py └── data └── conll2000 └── README /.gitignore: -------------------------------------------------------------------------------- 1 | # txt file 2 | *.txt 3 | *.xlsx -------------------------------------------------------------------------------- /BPE/BPE.py: --------------------------------------------------------------------------------
1 | import re, collections 2 | 3 | def get_stats(vocab): 4 | """ get frequencies from adjacent pairs 5 | @param vocab: a vocabulary with sequential pairs 6 | 7 | @returns pairs: a dictionary with counted byte pairs 8 | """ 9 | pairs = collections.defaultdict(int) 10 | for word, freq in vocab.items(): 11 | symbols = word.split('_') 12 | for i in range(len(symbols)-1): 13 | pairs[symbols[i], symbols[i+1]] += freq 14 | return pairs 15 | 16 | def merge_vocab(pair, v_in): 17 | v_out = {} 18 | print(v_in) 19 | bigram = re.escape('_'.join(pair)) # re.escape() escapes any special regex characters in the pair 20 | print(bigram) 21 | p = re.compile(r'(?<![^_])' + bigram + r'(?![^_])') # match the pair only at '_'-separated symbol boundaries 22 | for word in v_in: 23 | # rewrite every word, fusing the chosen pair into a single symbol 24 | w_out = p.sub(''.join(pair), word) 25 | v_out[w_out] = v_in[word] 26 | return v_out 27 | 28 | # toy vocabulary: the symbols of each word are separated by '_' 29 | vocab = {'l_o_w_': 5, 'l_o_w_e_s_t_':2, 30 | 'n_e_w_e_r_': 6, 'w_i_d_e_r_':3, 'n_e_w_':2 31 | } 32 | num_merges = 8 33 | 34 | for i in range(num_merges): 35 | pairs = get_stats(vocab) 36 | best = max(pairs, key=pairs.get) 37 | vocab = merge_vocab(best, vocab) 38 | print(vocab) -------------------------------------------------------------------------------- /Chapter12.ipynb: --------------------------------------------------------------------------------
1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 2, 4 | "metadata": { 5 | "language_info": { 6 | "name": "python", 7 | "codemirror_mode": { 8 | "name": "ipython", 9 | "version": 3 10 | }, 11 | "version": "3.6.5-final" 12 | }, 13 | "orig_nbformat": 2, 14 | "file_extension": ".py", 15 | "mimetype": "text/x-python", 16 | "name": "python", 17 | "npconvert_exporter": "python", 18 | "pygments_lexer": "ipython3", 19 | "version": 3, 20 | "kernelspec": { 21 | "name": "python36564bitanaconda3conda4d2a131d00244148923b6e8eafe61e3b", 22 | "display_name": "Python 3.6.5 64-bit ('Anaconda3': conda)" 23 | } 24 | }, 25 | "cells": [ 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## 12.1" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 49, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": "['after', 'five', 'p.m.']\n" 42 | } 43 | ], 44 | "source": [ 45 | "import nltk\n", 46 | "grammar = nltk.CFG.fromstring(\"\"\"\n", 47 | " S -> NP | PP | VP | NP PP\n", 48 | " NP -> Pronoun | Proper-Noun | Det Nominal | Nominal | Adj \n", 49 | " Nominal -> Nominal N | Adj N | N \n", 50 | " VP -> V | V NP | V NP PP | V PP\n", 51 | " PP -> Preposition NP\n", 52 | " Adj -> 'early' | 'all' | 'one-way' | 'any'\n", 53 | " N -> 'p.m.'
| 'flights' | 'redeye' | 'fare' | 'delays' | 'five'\n", 54 | " Det -> 'a'\n", 55 | " V -> 'arriving'\n", 56 | " Preposition -> 'from' | 'in' | 'after' | 'on'\n", 57 | " Proper-Noun -> 'Denver' | 'Dallas' | 'Washington' | 'Thursday'\n", 58 | " \"\"\")\n", 59 | "# NPChunker = nltk.ChartParser(pattern)\n", 60 | "\n", 61 | "parser = nltk.ChartParser(grammar)\n", 62 | "\n", 63 | "def parse(sent):\n", 64 | " #Returns nltk.Tree.Tree format output\n", 65 | " a = [] \n", 66 | " for tree in parser.parse(sent):\n", 67 | " a.append(tree)\n", 68 | " return a[0]\n", 69 | "\n", 70 | "# 1. Dallas\n", 71 | "sentence = ['Dallas']\n", 72 | "result = parse(sentence)\n", 73 | "result.draw()\n", 74 | "\n", 75 | "# 2. from Denver\n", 76 | "sentence = 'from Denver'.split()\n", 77 | "result = parse(sentence)\n", 78 | "result.draw()\n", 79 | "\n", 80 | "# 3. after five p.m.\n", 81 | "sentence = 'after five p.m.'.split()\n", 82 | "result = parse(sentence)\n", 83 | "result.draw()\n", 84 | "\n", 85 | "# 4. arriving in Washington\n", 86 | "sentence = 'arriving in Washington'.split()\n", 87 | "result = parse(sentence)\n", 88 | "result.draw()\n", 89 | "\n", 90 | "# 5. early flights\n", 91 | "sentence = 'early flights'.split()\n", 92 | "result = parse(sentence)\n", 93 | "result.draw()\n", 94 | "\n", 95 | "# 6. all redeye flights\n", 96 | "sentence = 'all redeye flights'.split()\n", 97 | "result = parse(sentence)\n", 98 | "result.draw()\n", 99 | "\n", 100 | "# 7. on Tursday\n", 101 | "sentence = 'on Thursday'.split()\n", 102 | "result = parse(sentence)\n", 103 | "result.draw()\n", 104 | "\n", 105 | "# 8. a one-way fare\n", 106 | "sentence = 'a one-way fare'.split()\n", 107 | "result = parse(sentence)\n", 108 | "result.draw()\n", 109 | "\n", 110 | "# 9. any delays in Denver\n", 111 | "sentence = 'any delays in Denver'.split()\n", 112 | "result = parse(sentence)\n", 113 | "result.draw()" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "## 12.2" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 68, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "\n", 130 | "# NPChunker = nltk.ChartParser(pattern)\n", 131 | "def parse(sent, grammar):\n", 132 | " #Returns nltk.Tree.Tree format output\n", 133 | " a = []\n", 134 | " parser = nltk.ChartParser(grammar)\n", 135 | " for tree in parser.parse(sent):\n", 136 | " a.append(tree)\n", 137 | " return a[0]\n", 138 | "\n", 139 | "sentence = 'Does AmericanAirlines have a flight between five a.m. and six a.m.'.split()\n", 140 | "grammar = nltk.CFG.fromstring(\"\"\"\n", 141 | " S -> Aux NP VP\n", 142 | " VP -> V NP PP\n", 143 | " NP -> Prop-Noun | Det Nominal | NP 'and' NP | Nominal\n", 144 | " PP -> Prep NP\n", 145 | " Nominal -> Nominal N | N\n", 146 | " V -> 'have'\n", 147 | " Det -> 'a'\n", 148 | " N -> 'five' | 'six' | 'a.m.' 
| 'flight'\n", 149 | " Prep -> 'between'\n", 150 | " Prop-Noun -> 'AmericanAirlines'\n", 151 | " Aux -> 'Does'\n", 152 | " \"\"\")\n", 153 | "result = parse(sentence, grammar)\n", 154 | "result.draw()\n", 155 | "\n", 156 | "sentence = 'I wouldlike tofly on AmericanAirlines'.split()\n", 157 | "grammar = nltk.CFG.fromstring(\"\"\"\n", 158 | " S -> NP VP\n", 159 | " VP -> V VP | V PP\n", 160 | " NP -> Prop-Noun | Pronoun\n", 161 | " PP -> Prep NP\n", 162 | " Pronoun -> 'I'\n", 163 | " V -> 'wouldlike' | 'tofly'\n", 164 | " Prep -> 'on'\n", 165 | " Prop-Noun -> 'AmericanAirlines'\n", 166 | " \"\"\")\n", 167 | "result = parse(sentence, grammar)\n", 168 | "result.draw()\n", 169 | "\n", 170 | "sentence = 'Please repeat that'.split()\n", 171 | "grammar = nltk.CFG.fromstring(\"\"\"\n", 172 | " S -> VP\n", 173 | " VP -> V VP | V NP\n", 174 | " NP -> Pronoun\n", 175 | " Pronoun -> 'that'\n", 176 | " V -> 'Please' | 'repeat'\n", 177 | " \"\"\")\n", 178 | "result = parse(sentence, grammar)\n", 179 | "result.draw()\n", 180 | "\n", 181 | "sentence = 'Does American487 have a first-class section'.split()\n", 182 | "grammar = nltk.CFG.fromstring(\"\"\"\n", 183 | " S -> Aux NP VP\n", 184 | " VP -> V NP\n", 185 | " NP -> Prop-Noun | Det Nominal\n", 186 | " Nominal -> Nominal N | N\n", 187 | " V -> 'have'\n", 188 | " Det -> 'a'\n", 189 | " N -> 'first-class' | 'section'\n", 190 | " Prop-Noun -> 'American487'\n", 191 | " Aux -> 'Does'\n", 192 | " \"\"\")\n", 193 | "result = parse(sentence, grammar)\n", 194 | "result.draw()\n", 195 | "\n", 196 | "sentence = 'I need tofly between Philadelphia and Atlanta'.split()\n", 197 | "grammar = nltk.CFG.fromstring(\"\"\"\n", 198 | " S -> NP VP\n", 199 | " VP -> V VP | V PP\n", 200 | " NP -> NP 'and' NP | Prop-Noun | Pronoun\n", 201 | " PP -> Prep NP\n", 202 | " Pronoun -> 'I'\n", 203 | " Prop-Noun -> 'Philadelphia' | 'Atlanta'\n", 204 | " V -> 'need' | 'tofly'\n", 205 | " Prep -> 'between'\n", 206 | " \"\"\")\n", 207 | "result = parse(sentence, grammar)\n", 208 | "result.draw()\n", 209 | "\n", 210 | "sentence = 'What is the fare from Atlanta to Denver'.split()\n", 211 | "grammar = nltk.CFG.fromstring(\"\"\"\n", 212 | " S -> Wh VP\n", 213 | " VP -> V NP\n", 214 | " NP -> Det Nominal | Prop-Noun\n", 215 | " PP -> Prep NP Prep NP\n", 216 | " Nominal -> Nominal PP | N\n", 217 | " N -> 'fare'\n", 218 | " Det -> 'the'\n", 219 | " Wh -> 'What'\n", 220 | " Prep -> 'from' | 'to'\n", 221 | " Prop-Noun -> 'Denver' | 'Atlanta'\n", 222 | " V -> 'is'\n", 223 | " Prep -> 'between'\n", 224 | " \"\"\")\n", 225 | "result = parse(sentence, grammar)\n", 226 | "result.draw()\n", 227 | "\n", 228 | "sentence = 'Is there an AmericanAirlines flight from Philadelphia to Dallas'.split()\n", 229 | "grammar = nltk.CFG.fromstring(\"\"\"\n", 230 | " S -> VP NP\n", 231 | " VP -> V NP\n", 232 | " NP -> Pronoun | Det Nominal | Prop-Noun\n", 233 | " PP -> Prep NP Prep NP\n", 234 | " Nominal -> Nominal PP | NP Nominal | N\n", 235 | " Pronoun -> 'there'\n", 236 | " V -> 'Is'\n", 237 | " Prep -> 'from' | 'to'\n", 238 | " Det -> 'an'\n", 239 | " N -> 'flight'\n", 240 | " Prop-Noun -> 'AmericanAirlines' | 'Philadelphia' | 'Dallas'\n", 241 | " Aux -> 'Does'\n", 242 | " \"\"\")\n", 243 | "result = parse(sentence, grammar)\n", 244 | "result.draw()" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "## 12.3" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "*RelClause $\\to$ (who|that) VP*\n", 259 | "\n", 260 | 
"*RelClause $\\to$ (who|that) NoObjS*\n", 261 | "\n", 262 | "*NoObjS $\\to$ NP NoObjVP*\n", 263 | "\n", 264 | "*NoObjVP $\\to$ (Aux) Verb-with-NP-Comp (PP)*\n", 265 | "\n", 266 | "*NoObjVP $\\to$ (Aux) Verb-with-S-Comp (NoObjS)*\n", 267 | "\n", 268 | "*NoObjVP $\\to$ (Aux) Verb-with-Inf-VP-Comp ((NP) to NoObjVP)*" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": {}, 274 | "source": [ 275 | "## 12.4" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "Yes, the optional *Aux* elements allow for auxiliaries like *can*, and the recursive uses of NoObjS and NoObjVP in the last two rules allow for the long-distance dependencies." 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | "## 12.5" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "*verb-with-bare-stem-VP-complement$\\to$can*" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "## 12.6" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "*Det$\\to$NP'S*\n", 311 | "\n", 312 | "*NP$\\to$(Det)Nominal*" 313 | ] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "metadata": {}, 318 | "source": [ 319 | "## 12.7" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [] 326 | } 327 | ] 328 | } -------------------------------------------------------------------------------- /Chapter13.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 2, 4 | "metadata": { 5 | "language_info": { 6 | "name": "python", 7 | "codemirror_mode": { 8 | "name": "ipython", 9 | "version": 3 10 | }, 11 | "version": "3.6.5-final" 12 | }, 13 | "orig_nbformat": 2, 14 | "file_extension": ".py", 15 | "mimetype": "text/x-python", 16 | "name": "python", 17 | "npconvert_exporter": "python", 18 | "pygments_lexer": "ipython3", 19 | "version": 3, 20 | "kernelspec": { 21 | "name": "python36564bitanaconda3conda4d2a131d00244148923b6e8eafe61e3b", 22 | "display_name": "Python 3.6.5 64-bit ('Anaconda3': conda)" 23 | } 24 | }, 25 | "cells": [ 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## 13.1\n", 31 | "Please see \"Chomsky.py\" under the folder \"Constituency Parsing\"." 
32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "## 13.2\n", 39 | "\n", 40 | "Please see \"CKY.py\" under the folder \"Constituency Parsing\".\n" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "## 13.3\n", 48 | "\n", 49 | "For terminals we have\n", 50 | "$$\n", 51 | "\\text{for all } \\{A \\mid A \\rightarrow words[j] \\in grammar\\}: \\\\\n", 52 | "\\quad \\text{if } \\exists B \\text{ such that } B \\rightarrow A \\in grammar \\\\\n", 53 | "\\qquad table[j-1,j] \\leftarrow table[j-1,j] \\cup B \\\\\n", 54 | "\\quad \\text{else} \\\\\n", 55 | "\\qquad table[j-1,j] \\leftarrow table[j-1,j] \\cup A\n", 56 | "$$\n", 57 | "\n", 58 | "For the combination step we have\n", 59 | "$$\n", 60 | "\\text{for all } \\{A \\mid A \\rightarrow B\\ C \\in grammar \\text{ and } B \\in table[i,k] \\text{ and } C \\in table[k,j]\\}: \\\\\n", 61 | "\\quad \\text{if } \\exists D \\text{ such that } D \\rightarrow A \\in grammar \\\\\n", 62 | "\\qquad table[i,j] \\leftarrow table[i,j] \\cup D \\\\\n", 63 | "\\quad \\text{else} \\\\\n", 64 | "\\qquad table[i,j] \\leftarrow table[i,j] \\cup A\n", 65 | "$$\n" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [] 74 | } 75 | ] 76 | } -------------------------------------------------------------------------------- /Chapter14.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 2, 4 | "metadata": { 5 | "language_info": { 6 | "name": "python", 7 | "codemirror_mode": { 8 | "name": "ipython", 9 | "version": 3 10 | }, 11 | "version": "3.6.5-final" 12 | }, 13 | "orig_nbformat": 2, 14 | "file_extension": ".py", 15 | "mimetype": "text/x-python", 16 | "name": "python", 17 | "npconvert_exporter": "python", 18 | "pygments_lexer": "ipython3", 19 | "version": 3, 20 | "kernelspec": { 21 | "name": "python36564bitanaconda3conda4d2a131d00244148923b6e8eafe61e3b", 22 | "display_name": "Python 3.6.5 64-bit ('Anaconda3': conda)" 23 | } 24 | }, 25 | "cells": [ 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## 14.1\n", 31 | "Please see the code in CKY.py" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "## 14.2&14.3&14.4" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "Please see the code under the \"Statistical Constituency Parsing\" folder." 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## 14.5\n", 53 | "\n", 54 | "The rule-matching process in lexicalized CKY does not work on constituent types alone; it also has to match words.\n", 55 | "In other words, the tags used for matching have to be expanded so that each carries both a label and a head word.\n", 56 | "\n", 57 | "In my implementation the two cases are handled the same way, because the lexicalized grammar is written in the fixed form \"A->B1(word) B2(word)\". In the previous CKY, we compare each target only with the two labels on the right of the arrow. 
However in current CKY, " 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "## 14.6" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "## 14.6" 79 | ] 80 | } 81 | ] 82 | } -------------------------------------------------------------------------------- /Chapter2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 2, 4 | "metadata": { 5 | "language_info": { 6 | "name": "python", 7 | "codemirror_mode": { 8 | "name": "ipython", 9 | "version": 3 10 | }, 11 | "version": "3.6.5-final" 12 | }, 13 | "orig_nbformat": 2, 14 | "file_extension": ".py", 15 | "mimetype": "text/x-python", 16 | "name": "python", 17 | "npconvert_exporter": "python", 18 | "pygments_lexer": "ipython3", 19 | "version": 3, 20 | "kernelspec": { 21 | "name": "python36564bitanaconda3conda4d2a131d00244148923b6e8eafe61e3b", 22 | "display_name": "Python 3.6.5 64-bit ('Anaconda3': conda)" 23 | } 24 | }, 25 | "cells": [ 26 | { 27 | "cell_type": "markdown", 28 | "execution_count": 1, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "ename": "SyntaxError", 33 | "evalue": "unexpected character after line continuation character (, line 1)", 34 | "output_type": "error", 35 | "traceback": [ 36 | "\u001b[1;36m File \u001b[1;32m\"\"\u001b[1;36m, line \u001b[1;32m1\u001b[0m\n\u001b[1;33m 1. \\[A-Za-z\\]+\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m unexpected character after line continuation character\n" 37 | ] 38 | } 39 | ], 40 | "source": [ 41 | "2.1\n", 42 | "\n", 43 | "1. \\[A-Za-z\\]+\n", 44 | "2. \\[a-z]+b\b", 45 | "3. (b+(ab+)+)?" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "2.2\n", 55 | "\n", 56 | "1. (\\[A-Za-z\\]+)\\b+\\1\n", 57 | "2. ^\\d+\\b.*\\b\\[A-Za-z\\]+$\n", 58 | "3. 
\\bgrotto\\b.\\*\\braven\\b | \\braven\\b.*\\bgrotto\\b" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 1, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "output_type": "stream", 68 | "name": "stdout", 69 | "text": "Delete r\nInsert r\nInsert s\n" 70 | } 71 | ], 72 | "source": [ 73 | "# -*- coding: utf-8 -*-\n", 74 | "\n", 75 | "def compute_ed(word1, word2):\n", 76 | " ed_table = [[0 for i in range(len(word2)+1)] for j in range(len(word1)+1)]\n", 77 | " ed_table[0][0] = 0\n", 78 | " trace_table = [[[] for i in range(len(word2)+1)] for j in range(len(word1)+1)]\n", 79 | " for i in range(len(word1)+1):\n", 80 | " for j in range(len(word2)+1):\n", 81 | " if i == 0:\n", 82 | " ed_table[0][j] = j\n", 83 | " if j > 0:\n", 84 | " trace_table[0][j].append(\"Insert {}\".format(word2[j-1]))\n", 85 | " elif j == 0:\n", 86 | " ed_table[i][0] = i\n", 87 | " if i > 0:\n", 88 | " trace_table[i][0].append(\"Detele {}\".format(word1[i-1]))\n", 89 | " else:\n", 90 | " ed_table[i][j] = min(ed_table[i-1][j]+1, ed_table[i][j-1]+1, \\\n", 91 | " ed_table[i-1][j-1]+(word1[i-1]!=word2[j-1]))\n", 92 | " if ed_table[i-1][j] + 1 == ed_table[i][j]:\n", 93 | " trace_table[i][j] = trace_table[i-1][j] + [\"Delete {}\".format(word1[i-1])]\n", 94 | " elif ed_table[i][j-1] + 1 == ed_table[i][j]:\n", 95 | " trace_table[i][j] = trace_table[i][j-1] + [\"Insert {}\".format(word2[j-1])]\n", 96 | " elif word1[i-1]!=word2[j-1]:\n", 97 | " trace_table[i][j] = trace_table[i-1][j-1] + [\"Substitute {0} with {1}\".format(word1[i-1],word2[j-1])]\n", 98 | " else:\n", 99 | " trace_table[i][j] = trace_table[i-1][j-1]\n", 100 | " \n", 101 | " return ed_table[-1][-1] ,trace_table[-1][-1]\n", 102 | "\n", 103 | "word1 = \"drive\"\n", 104 | "word2 = \"divers\"\n", 105 | "ed, trace = compute_ed(word1,word2)\n", 106 | "for tr in trace:\n", 107 | " print(tr)" 108 | ] 109 | } 110 | ] 111 | } 112 | -------------------------------------------------------------------------------- /Chapter3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 2, 4 | "metadata": { 5 | "language_info": { 6 | "name": "python", 7 | "codemirror_mode": { 8 | "name": "ipython", 9 | "version": 3 10 | }, 11 | "version": "3.6.5-final" 12 | }, 13 | "orig_nbformat": 2, 14 | "file_extension": ".py", 15 | "mimetype": "text/x-python", 16 | "name": "python", 17 | "npconvert_exporter": "python", 18 | "pygments_lexer": "ipython3", 19 | "version": 3, 20 | "kernelspec": { 21 | "name": "python36564bitanaconda3conda4d2a131d00244148923b6e8eafe61e3b", 22 | "display_name": "Python 3.6.5 64-bit ('Anaconda3': conda)" 23 | } 24 | }, 25 | "cells": [ 26 | { 27 | "cell_type": "markdown", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "## 3.1\n", 33 | "\n", 34 | "$$P(am|,I)=\\frac{1}{2}=0.5$$\n", 35 | "$$P(Sam | I,am)=\\frac{1}{2}=0.5$$\n", 36 | "$$P(|am Sam)=\\frac{1}{1}=1$$\n", 37 | "$$P(I|,Sam)=\\frac{1}{1}=1$$" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "## 3.2\n", 47 | "$$P( i want chinese food) = P(i|)P(want|i)P(chinese|want)P(food|chinese)P(<\\s>|food)\\\\=0.19\\times0.33\\times0.0065\\times0.52\\times0.40=0.0000847704$$\n", 48 | "\n", 49 | "$$P( i want chinese food) = P(i|)P(want|i)P(chinese|want)P(food|chinese)P(<\\s>|food)\\\\=0.19\\times0.21\\times0.0029\\times0.052\\times0.40=0.00000240676$$" 50 | ] 51 | }, 52 | { 53 | "cell_type": 
"markdown", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "## 3.3\n", 59 | "The first probability is higher without smoothing. Because without applyting smooth, frequent bigrams have higher probability than those in smoothed distribution." 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "## 3.4\n" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 34, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "name": "stdout", 78 | "output_type": "stream", 79 | "text": "[['', 'I', 'am', 'Sam', ''], ['', 'Sam', 'I', 'am', ''], ['', 'I', 'am', 'Sam', ''], ['', 'I', 'do', 'not', 'like', 'green', 'eggs', 'and', 'Sam', '']]\n0.21428571428571427\n" 80 | } 81 | ], 82 | "source": [ 83 | "sens = [' I am Sam ',' Sam I am ', ' I am Sam ', ' I do not like green eggs and Sam ']\n", 84 | "sen_wds = []\n", 85 | "for sen in sens:\n", 86 | " sen_wds.append(sen.split())\n", 87 | "print(sen_wds)\n", 88 | "\n", 89 | "def add1smooth(sens, pre, cur):\n", 90 | " Vocab = set()\n", 91 | " for sen in sens:\n", 92 | " for wd in sen:\n", 93 | " Vocab.add(wd)\n", 94 | " V = len(Vocab)\n", 95 | " total = target = 0\n", 96 | " for sen in sens:\n", 97 | " for i in range(len(sen)-1):\n", 98 | " if sen[i] == pre:\n", 99 | " total += 1\n", 100 | " if sen[i+1] == cur:\n", 101 | " target += 1\n", 102 | " return (target+1)/(total+V)\n", 103 | " \n", 104 | "print(add1smooth(sen_wds, 'am', 'Sam'))" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "## 3.5" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 35, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": "0.8333333333333333\n" 125 | } 126 | ], 127 | "source": [ 128 | "def sumadd1smmoth(sens, pres, curs):\n", 129 | " total = 0.0\n", 130 | " for i in range(len(pres)):\n", 131 | " total += add1smooth(sens, pres[i], curs[i])\n", 132 | " return total\n", 133 | "\n", 134 | "sens = [' a b',' b b',' b a',' a a']\n", 135 | "print(sumadd1smmoth(sens, ['','','a','b','a','b'],['a','b','a','b','b','a']))" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "## 3.6\n", 145 | "\n", 146 | "$$P(w_3|w_1,w_2)=\\frac{c(w_1,w_2,w_3)}{c(w_1,w_2)}$$\n", 147 | "$$P^{*}(w_3|w_1,w_2)=\\frac{c(w_1,w_2,w_3)+1}{c(w_1,w_2)+V}$$" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "## 3.7\n", 157 | "$$P\\hat(Sam|am)=\\lambda_1P(Sam|am)+\\lambda_2P(Sam)=0.5\\times\\frac{2}{3}+0.5\\times\\frac{4}{25}=0.08+0.33=0.41$$" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "## 3.8" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 36, 172 | "metadata": {}, 173 | "outputs": [ 174 | { 175 | "name": "stdout", 176 | "output_type": "stream", 177 | "text": "[['', 'dear', 'members', 'of', 'hkust', '<\\\\s>'], ['', 'in', 'view', 'of', 'the', 'latest', 'development', 'of', 'the', 'novel', 'coronavirus', 'outbreak', 'in', 'hong', 'kong', '<\\\\s>'], ['', 'the', \"university's\", 'management', 'has', 'decided', 'to', 'implement', 'procedures', 
'to', 'provide', 'disposable', 'face', 'mask', 'to', 'students', 'and', 'staff', 'who', 'need', 'to', 'be', 'on', 'campus', 'starting', 'from', 'the', 'week', 'of', '10', 'feb', '2020', '<\\\\s>'], ['', 'campus', 'mask', 'supply', 'scheme', '<\\\\s>'], ['', '1', 'staff', 'members', 'who', 'have', 'to', 'provide', 'essential', 'services', 'on', 'campus', 'during', 'the', '\"work', 'at', 'home', 'period\"', 'will', 'be', 'issued', 'a', 'mask', 'on', 'a', 'daily', 'basis', 'departments', 'units', 'are', 'requested', 'to', 'assess', 'the', 'quantities', 'of', 'masks', 'needed', 'and', 'submit', 'requests', 'to', 'respective', \"dean's\", 'office', '<\\\\s>'], ['', 'while', 'non-academic', 'offices', 'should', 'submit', 'the', 'requests', 'to', 'respective', 'p/vp', 'office', 'on', 'or', 'before', 'feb', '12', '<\\\\s>'], ['', '2020', '(wed)', 'cls', 'will', 'coordinate', 'the', 'supply', 'of', 'masks', 'and', 'work', 'with', 'the', \"dean's\", 'offices', 'and', 'p/vp', 'offices', 'for', 'mask', 'dissemination', '<\\\\s>'], ['', '2', 'students', 'are', 'requested', 'to', 'stay', 'away', 'from', 'campus', 'if', 'at', 'all', 'possible', 'those', 'who', 'must', 'be', 'on', 'campus', 'will', 'be', 'issued', 'a', 'mask', 'on', 'a', 'daily', 'basis', 'students', 'must', 'collect', 'their', 'mask', 'in-person', 'and', 'present', 'their', 'student', 'id', 'which', 'is', 'for', 'validation', 'only', 'and', 'no', 'personal', 'data', 'will', 'be', 'captured', 'hall', 'residents', 'can', 'collect', 'their', 'masks', 'at', 'the', 'student', 'hall', 'counter', '<\\\\s>'], ['', 'while', 'other', 'students', 'can', 'collect', 'outside', 'the', 'security', 'office', 'at', 'the', 'atrium', 'and', 'lskbb', '<\\\\s>'], ['', '3', 'considering', 'the', 'scarce', 'supply', 'of', 'disposable', 'mask', 'in', 'the', 'community', '<\\\\s>'], ['', 'a', 'limited', 'quantity', 'of', 'masks', 'will', 'be', 'made', 'available', 'for', 'purchase', 'by', 'staff', 'for', 'personal', 'use', 'on', 'a', 'one-off', '<\\\\s>'], ['', 'cost', 'recovery', 'basis', 'staff', 'association', 'has', 'agreed', 'to', 'help', 'with', 'the', 'distribution', '<\\\\s>'], ['', 'please', 'visit', 'http://staff.ust.hk/sa-maskorder.pdf', 'for', 'details', 'and', 'registration', 'on', 'or', 'before', 'feb', '12', '<\\\\s>'], ['', '2020', '(wed)', 'while', 'the', 'mask', 'supply', 'is', 'limited', 'and', 'may', 'not', 'be', 'sufficient', 'to', 'satisfy', 'all', 'requests', '<\\\\s>'], ['', 'it', 'is', 'the', \"university's\", 'desire', 'to', 'help', 'alleviate', 'the', 'impact', 'of', 'the', 'virus', 'outbreak', 'in', 'our', 'campus', 'community', 'by', 'this', 'special', 'arrangement', '<\\\\s>'], ['', 'important', 'reminder', 'about', 'personal', 'hygiene', '<\\\\s>'], ['', 'please', 'be', 'reminded', 'that', 'wearing', 'mask', 'is', 'only', 'a', 'small', 'part', 'of', 'a', 'holistic', 'approach', 'to', 'combat', 'spreading', 'of', 'the', 'coronavirus', 'keeping', 'up', 'other', 'good', 'personal', 'hygiene', 'practices', '<\\\\s>'], ['', 'especially', 'hand', 'hygiene', '<\\\\s>'], ['', 'is', 'crucial', 'for', 'information', 'on', 'hygiene', 'and', 'preventive', 'measures', '<\\\\s>'], ['', 'please', 'visit', 'https://www.chp.gov.hk/en/features/102465.html', 'for', 'details', '<\\\\s>'], ['', 'health', '<\\\\s>'], ['', 'safety', 'and', 'environment', 'office', '<\\\\s>']]\n" 178 | } 179 | ], 180 | "source": [ 181 | "import collections\n", 182 | "\n", 183 | "def preprocessing(filename):\n", 184 | " f = open(filename,'r',encoding='gb18030', 
errors='ignore')\n", 185 | " copora = [x.strip().strip(',') for x in f]\n", 186 | " copora = list(filter(None,copora))\n", 187 | " sents = []\n", 188 | " for cop in copora:\n", 189 | " cop = cop.lower().strip('.')\n", 190 | " sent = cop.split(',')\n", 191 | " for sen in sent:\n", 192 | " new_sent = sen.split()\n", 193 | " new_sent = [''] + new_sent + ['<\\s>']\n", 194 | " new_sent = [word.strip('.') for word in new_sent]\n", 195 | " sents.append(new_sent)\n", 196 | " return sents\n", 197 | "\n", 198 | "def bigram(sents, ug_count_dic):\n", 199 | " bg_dic = collections.defaultdict(float)\n", 200 | " for sen in sents:\n", 201 | " for i in range(len(sen)-1):\n", 202 | " if (sen[i], sen[i+1]) not in bg_dic:\n", 203 | " bg_dic[(sen[i],sen[i+1])] = 1\n", 204 | " else:\n", 205 | " bg_dic[(sen[i],sen[i+1])] += 1\n", 206 | " for bi in bg_dic:\n", 207 | " bg_dic[bi] = bg_dic[bi]/ug_count_dic[bi[0]]\n", 208 | " return bg_dic\n", 209 | "\n", 210 | "def unigrams(sents):\n", 211 | " \"\"\" \n", 212 | " @param sents: a list of list of word, each list contain a sentences of a long text file\n", 213 | " @returns (ug_count_dic, unigram_dic): a tuple of two dictionaries with each item as \"unigram:count\" and \"unigram:probability\" respectively\n", 214 | " \"\"\"\n", 215 | " ug_count_dic = collections.defaultdict(int)\n", 216 | " ug_prob_dic = collections.defaultdict(float)\n", 217 | " total = sum(len(sent) for sent in sents)\n", 218 | " for sent in sents:\n", 219 | " for word in sent:\n", 220 | " if word not in ug_count_dic:\n", 221 | " ug_count_dic[word] = 1\n", 222 | " else:\n", 223 | " ug_count_dic[word] += 1\n", 224 | " for word in ug_count_dic:\n", 225 | " ug_prob_dic[word] = ug_count_dic[word]/total\n", 226 | " return ug_count_dic, ug_prob_dic\n", 227 | "\n", 228 | "print(preprocessing('email.txt'))" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "## 3.9" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 37, 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "name": "stdout", 247 | "output_type": "stream", 248 | "text": "{'': 0.05472636815920398, '<\\\\s>': 0.05472636815920398, 'the': 0.04975124378109453, 'to': 0.03233830845771144, 'of': 0.02736318407960199, 'and': 0.02736318407960199, 'mask': 0.022388059701492536, 'on': 0.022388059701492536, 'be': 0.01990049751243781, 'a': 0.01990049751243781, 'for': 0.017412935323383085, 'campus': 0.014925373134328358, 'will': 0.012437810945273632, 'is': 0.012437810945273632, 'in': 0.009950248756218905, 'students': 0.009950248756218905, 'staff': 0.009950248756218905, 'supply': 0.009950248756218905, 'at': 0.009950248756218905, 'masks': 0.009950248756218905, 'office': 0.009950248756218905, 'personal': 0.009950248756218905, 'hygiene': 0.009950248756218905, 'who': 0.007462686567164179, 'feb': 0.007462686567164179, '2020': 0.007462686567164179, 'basis': 0.007462686567164179, 'requests': 0.007462686567164179, 'while': 0.007462686567164179, 'offices': 0.007462686567164179, 'collect': 0.007462686567164179, 'their': 0.007462686567164179, 'please': 0.007462686567164179, 'members': 0.004975124378109453, 'coronavirus': 0.004975124378109453, 'outbreak': 0.004975124378109453, \"university's\": 0.004975124378109453, 'has': 0.004975124378109453, 'provide': 0.004975124378109453, 'disposable': 0.004975124378109453, 'from': 0.004975124378109453, 'issued': 0.004975124378109453, 'daily': 0.004975124378109453, 'are': 0.004975124378109453, 'requested': 
0.004975124378109453, 'submit': 0.004975124378109453, 'respective': 0.004975124378109453, \"dean's\": 0.004975124378109453, 'p/vp': 0.004975124378109453, 'or': 0.004975124378109453, 'before': 0.004975124378109453, '12': 0.004975124378109453, '(wed)': 0.004975124378109453, 'with': 0.004975124378109453, 'all': 0.004975124378109453, 'must': 0.004975124378109453, 'student': 0.004975124378109453, 'only': 0.004975124378109453, 'hall': 0.004975124378109453, 'can': 0.004975124378109453, 'other': 0.004975124378109453, 'community': 0.004975124378109453, 'limited': 0.004975124378109453, 'by': 0.004975124378109453, 'help': 0.004975124378109453, 'visit': 0.004975124378109453, 'details': 0.004975124378109453, 'dear': 0.0024875621890547263, 'hkust': 0.0024875621890547263, 'view': 0.0024875621890547263, 'latest': 0.0024875621890547263, 'development': 0.0024875621890547263, 'novel': 0.0024875621890547263, 'hong': 0.0024875621890547263, 'kong': 0.0024875621890547263, 'management': 0.0024875621890547263, 'decided': 0.0024875621890547263, 'implement': 0.0024875621890547263, 'procedures': 0.0024875621890547263, 'face': 0.0024875621890547263, 'need': 0.0024875621890547263, 'starting': 0.0024875621890547263, 'week': 0.0024875621890547263, '10': 0.0024875621890547263, 'scheme': 0.0024875621890547263, '1': 0.0024875621890547263, 'have': 0.0024875621890547263, 'essential': 0.0024875621890547263, 'services': 0.0024875621890547263, 'during': 0.0024875621890547263, '\"work': 0.0024875621890547263, 'home': 0.0024875621890547263, 'period\"': 0.0024875621890547263, 'departments': 0.0024875621890547263, 'units': 0.0024875621890547263, 'assess': 0.0024875621890547263, 'quantities': 0.0024875621890547263, 'needed': 0.0024875621890547263, 'non-academic': 0.0024875621890547263, 'should': 0.0024875621890547263, 'cls': 0.0024875621890547263, 'coordinate': 0.0024875621890547263, 'work': 0.0024875621890547263, 'dissemination': 0.0024875621890547263, '2': 0.0024875621890547263, 'stay': 0.0024875621890547263, 'away': 0.0024875621890547263, 'if': 0.0024875621890547263, 'possible': 0.0024875621890547263, 'those': 0.0024875621890547263, 'in-person': 0.0024875621890547263, 'present': 0.0024875621890547263, 'id': 0.0024875621890547263, 'which': 0.0024875621890547263, 'validation': 0.0024875621890547263, 'no': 0.0024875621890547263, 'data': 0.0024875621890547263, 'captured': 0.0024875621890547263, 'residents': 0.0024875621890547263, 'counter': 0.0024875621890547263, 'outside': 0.0024875621890547263, 'security': 0.0024875621890547263, 'atrium': 0.0024875621890547263, 'lskbb': 0.0024875621890547263, '3': 0.0024875621890547263, 'considering': 0.0024875621890547263, 'scarce': 0.0024875621890547263, 'quantity': 0.0024875621890547263, 'made': 0.0024875621890547263, 'available': 0.0024875621890547263, 'purchase': 0.0024875621890547263, 'use': 0.0024875621890547263, 'one-off': 0.0024875621890547263, 'cost': 0.0024875621890547263, 'recovery': 0.0024875621890547263, 'association': 0.0024875621890547263, 'agreed': 0.0024875621890547263, 'distribution': 0.0024875621890547263, 'http://staff.ust.hk/sa-maskorder.pdf': 0.0024875621890547263, 'registration': 0.0024875621890547263, 'may': 0.0024875621890547263, 'not': 0.0024875621890547263, 'sufficient': 0.0024875621890547263, 'satisfy': 0.0024875621890547263, 'it': 0.0024875621890547263, 'desire': 0.0024875621890547263, 'alleviate': 0.0024875621890547263, 'impact': 0.0024875621890547263, 'virus': 0.0024875621890547263, 'our': 0.0024875621890547263, 'this': 0.0024875621890547263, 'special': 
0.0024875621890547263, 'arrangement': 0.0024875621890547263, 'important': 0.0024875621890547263, 'reminder': 0.0024875621890547263, 'about': 0.0024875621890547263, 'reminded': 0.0024875621890547263, 'that': 0.0024875621890547263, 'wearing': 0.0024875621890547263, 'small': 0.0024875621890547263, 'part': 0.0024875621890547263, 'holistic': 0.0024875621890547263, 'approach': 0.0024875621890547263, 'combat': 0.0024875621890547263, 'spreading': 0.0024875621890547263, 'keeping': 0.0024875621890547263, 'up': 0.0024875621890547263, 'good': 0.0024875621890547263, 'practices': 0.0024875621890547263, 'especially': 0.0024875621890547263, 'hand': 0.0024875621890547263, 'crucial': 0.0024875621890547263, 'information': 0.0024875621890547263, 'preventive': 0.0024875621890547263, 'measures': 0.0024875621890547263, 'https://www.chp.gov.hk/en/features/102465.html': 0.0024875621890547263, 'health': 0.0024875621890547263, 'safety': 0.0024875621890547263, 'environment': 0.0024875621890547263}\n{('dear', 'members'): 1.0, ('hkust', '<\\\\s>'): 1.0, ('view', 'of'): 1.0, ('latest', 'development'): 1.0, ('development', 'of'): 1.0, ('novel', 'coronavirus'): 1.0, ('outbreak', 'in'): 1.0, ('hong', 'kong'): 1.0, ('kong', '<\\\\s>'): 1.0, ('management', 'has'): 1.0, ('decided', 'to'): 1.0, ('implement', 'procedures'): 1.0, ('procedures', 'to'): 1.0, ('face', 'mask'): 1.0, ('need', 'to'): 1.0, ('starting', 'from'): 1.0, ('week', 'of'): 1.0, ('10', 'feb'): 1.0, ('scheme', '<\\\\s>'): 1.0, ('1', 'staff'): 1.0, ('have', 'to'): 1.0, ('essential', 'services'): 1.0, ('services', 'on'): 1.0, ('during', 'the'): 1.0, ('\"work', 'at'): 1.0, ('home', 'period\"'): 1.0, ('period\"', 'will'): 1.0, ('issued', 'a'): 1.0, ('daily', 'basis'): 1.0, ('departments', 'units'): 1.0, ('units', 'are'): 1.0, ('are', 'requested'): 1.0, ('requested', 'to'): 1.0, ('assess', 'the'): 1.0, ('quantities', 'of'): 1.0, ('needed', 'and'): 1.0, ('non-academic', 'offices'): 1.0, ('should', 'submit'): 1.0, ('or', 'before'): 1.0, ('before', 'feb'): 1.0, ('12', '<\\\\s>'): 1.0, ('cls', 'will'): 1.0, ('coordinate', 'the'): 1.0, ('work', 'with'): 1.0, ('with', 'the'): 1.0, ('dissemination', '<\\\\s>'): 1.0, ('2', 'students'): 1.0, ('stay', 'away'): 1.0, ('away', 'from'): 1.0, ('if', 'at'): 1.0, ('possible', 'those'): 1.0, ('those', 'who'): 1.0, ('in-person', 'and'): 1.0, ('present', 'their'): 1.0, ('id', 'which'): 1.0, ('which', 'is'): 1.0, ('validation', 'only'): 1.0, ('no', 'personal'): 1.0, ('data', 'will'): 1.0, ('captured', 'hall'): 1.0, ('residents', 'can'): 1.0, ('can', 'collect'): 1.0, ('counter', '<\\\\s>'): 1.0, ('outside', 'the'): 1.0, ('security', 'office'): 1.0, ('atrium', 'and'): 1.0, ('lskbb', '<\\\\s>'): 1.0, ('3', 'considering'): 1.0, ('considering', 'the'): 1.0, ('scarce', 'supply'): 1.0, ('quantity', 'of'): 1.0, ('made', 'available'): 1.0, ('available', 'for'): 1.0, ('purchase', 'by'): 1.0, ('use', 'on'): 1.0, ('one-off', '<\\\\s>'): 1.0, ('cost', 'recovery'): 1.0, ('recovery', 'basis'): 1.0, ('association', 'has'): 1.0, ('agreed', 'to'): 1.0, ('distribution', '<\\\\s>'): 1.0, ('http://staff.ust.hk/sa-maskorder.pdf', 'for'): 1.0, ('registration', 'on'): 1.0, ('may', 'not'): 1.0, ('not', 'be'): 1.0, ('sufficient', 'to'): 1.0, ('satisfy', 'all'): 1.0, ('it', 'is'): 1.0, ('desire', 'to'): 1.0, ('alleviate', 'the'): 1.0, ('impact', 'of'): 1.0, ('virus', 'outbreak'): 1.0, ('our', 'campus'): 1.0, ('this', 'special'): 1.0, ('special', 'arrangement'): 1.0, ('arrangement', '<\\\\s>'): 1.0, ('important', 'reminder'): 1.0, ('reminder', 'about'): 1.0, 
('about', 'personal'): 1.0, ('reminded', 'that'): 1.0, ('that', 'wearing'): 1.0, ('wearing', 'mask'): 1.0, ('small', 'part'): 1.0, ('part', 'of'): 1.0, ('holistic', 'approach'): 1.0, ('approach', 'to'): 1.0, ('combat', 'spreading'): 1.0, ('spreading', 'of'): 1.0, ('keeping', 'up'): 1.0, ('up', 'other'): 1.0, ('good', 'personal'): 1.0, ('practices', '<\\\\s>'): 1.0, ('especially', 'hand'): 1.0, ('hand', 'hygiene'): 1.0, ('crucial', 'for'): 1.0, ('information', 'on'): 1.0, ('preventive', 'measures'): 1.0, ('measures', '<\\\\s>'): 1.0, ('https://www.chp.gov.hk/en/features/102465.html', 'for'): 1.0, ('health', '<\\\\s>'): 1.0, ('safety', 'and'): 1.0, ('environment', 'office'): 1.0, ('will', 'be'): 0.8, ('requests', 'to'): 0.6666666666666666, ('feb', '12'): 0.6666666666666666, ('2020', '(wed)'): 0.6666666666666666, ('collect', 'their'): 0.6666666666666666, ('please', 'visit'): 0.6666666666666666, ('members', 'of'): 0.5, ('coronavirus', 'outbreak'): 0.5, (\"university's\", 'management'): 0.5, ('has', 'decided'): 0.5, ('provide', 'disposable'): 0.5, ('disposable', 'face'): 0.5, ('from', 'the'): 0.5, ('members', 'who'): 0.5, ('provide', 'essential'): 0.5, ('submit', 'requests'): 0.5, ('respective', \"dean's\"): 0.5, (\"dean's\", 'office'): 0.5, ('office', '<\\\\s>'): 0.5, ('submit', 'the'): 0.5, ('respective', 'p/vp'): 0.5, ('p/vp', 'office'): 0.5, ('(wed)', 'cls'): 0.5, ('supply', 'of'): 0.5, (\"dean's\", 'offices'): 0.5, ('p/vp', 'offices'): 0.5, ('from', 'campus'): 0.5, ('all', 'possible'): 0.5, ('must', 'be'): 0.5, ('must', 'collect'): 0.5, ('student', 'id'): 0.5, ('only', 'and'): 0.5, ('hall', 'residents'): 0.5, ('at', 'the'): 0.5, ('student', 'hall'): 0.5, ('hall', 'counter'): 0.5, ('other', 'students'): 0.5, ('disposable', 'mask'): 0.5, ('community', '<\\\\s>'): 0.5, ('limited', 'quantity'): 0.5, ('by', 'staff'): 0.5, ('has', 'agreed'): 0.5, ('help', 'with'): 0.5, ('visit', 'http://staff.ust.hk/sa-maskorder.pdf'): 0.5, ('details', 'and'): 0.5, ('(wed)', 'while'): 0.5, ('limited', 'and'): 0.5, ('all', 'requests'): 0.5, (\"university's\", 'desire'): 0.5, ('help', 'alleviate'): 0.5, ('community', 'by'): 0.5, ('by', 'this'): 0.5, ('personal', 'hygiene'): 0.5, ('hygiene', '<\\\\s>'): 0.5, ('only', 'a'): 0.5, ('coronavirus', 'keeping'): 0.5, ('other', 'good'): 0.5, ('visit', 'https://www.chp.gov.hk/en/features/102465.html'): 0.5, ('details', '<\\\\s>'): 0.5, ('of', 'the'): 0.36363636363636365, ('who', 'need'): 0.3333333333333333, ('on', 'campus'): 0.3333333333333333, ('feb', '2020'): 0.3333333333333333, ('2020', '<\\\\s>'): 0.3333333333333333, ('who', 'have'): 0.3333333333333333, ('on', 'a'): 0.3333333333333333, ('basis', 'departments'): 0.3333333333333333, ('while', 'non-academic'): 0.3333333333333333, ('offices', 'should'): 0.3333333333333333, ('offices', 'and'): 0.3333333333333333, ('offices', 'for'): 0.3333333333333333, ('who', 'must'): 0.3333333333333333, ('basis', 'students'): 0.3333333333333333, ('their', 'mask'): 0.3333333333333333, ('their', 'student'): 0.3333333333333333, ('their', 'masks'): 0.3333333333333333, ('while', 'other'): 0.3333333333333333, ('collect', 'outside'): 0.3333333333333333, ('basis', 'staff'): 0.3333333333333333, ('while', 'the'): 0.3333333333333333, ('requests', '<\\\\s>'): 0.3333333333333333, ('please', 'be'): 0.3333333333333333, ('for', 'details'): 0.2857142857142857, ('of', 'masks'): 0.2727272727272727, ('in', 'view'): 0.25, ('in', 'hong'): 0.25, ('students', 'and'): 0.25, ('staff', 'who'): 0.25, ('be', 'on'): 0.25, ('supply', 'scheme'): 0.25, ('staff', 
'members'): 0.25, ('at', 'home'): 0.25, ('be', 'issued'): 0.25, ('a', 'mask'): 0.25, ('a', 'daily'): 0.25, ('masks', 'needed'): 0.25, ('office', 'on'): 0.25, ('masks', 'and'): 0.25, ('students', 'are'): 0.25, ('at', 'all'): 0.25, ('students', 'must'): 0.25, ('personal', 'data'): 0.25, ('masks', 'at'): 0.25, ('students', 'can'): 0.25, ('office', 'at'): 0.25, ('in', 'the'): 0.25, ('masks', 'will'): 0.25, ('staff', 'for'): 0.25, ('personal', 'use'): 0.25, ('staff', 'association'): 0.25, ('supply', 'is'): 0.25, ('in', 'our'): 0.25, ('hygiene', 'practices'): 0.25, ('hygiene', 'and'): 0.25, ('mask', 'supply'): 0.2222222222222222, ('mask', 'on'): 0.2222222222222222, ('on', 'or'): 0.2222222222222222, ('will', 'coordinate'): 0.2, ('is', 'for'): 0.2, ('is', 'limited'): 0.2, ('is', 'the'): 0.2, ('is', 'only'): 0.2, ('is', 'crucial'): 0.2, ('campus', 'starting'): 0.16666666666666666, ('campus', 'mask'): 0.16666666666666666, ('campus', 'during'): 0.16666666666666666, ('campus', 'if'): 0.16666666666666666, ('campus', 'will'): 0.16666666666666666, ('campus', 'community'): 0.16666666666666666, ('to', 'provide'): 0.15384615384615385, ('to', 'respective'): 0.15384615384615385, ('to', 'help'): 0.15384615384615385, ('for', 'mask'): 0.14285714285714285, ('for', 'validation'): 0.14285714285714285, ('for', 'purchase'): 0.14285714285714285, ('for', 'personal'): 0.14285714285714285, ('for', 'information'): 0.14285714285714285, ('', 'please'): 0.13636363636363635, ('be', 'captured'): 0.125, ('a', 'limited'): 0.125, ('be', 'made'): 0.125, ('a', 'one-off'): 0.125, ('be', 'sufficient'): 0.125, ('be', 'reminded'): 0.125, ('a', 'small'): 0.125, ('a', 'holistic'): 0.125, ('mask', 'to'): 0.1111111111111111, ('mask', 'dissemination'): 0.1111111111111111, ('mask', 'in-person'): 0.1111111111111111, ('mask', 'in'): 0.1111111111111111, ('mask', 'is'): 0.1111111111111111, ('on', 'hygiene'): 0.1111111111111111, ('the', \"university's\"): 0.1, ('of', 'hkust'): 0.09090909090909091, ('and', 'staff'): 0.09090909090909091, ('of', '10'): 0.09090909090909091, ('and', 'submit'): 0.09090909090909091, ('', 'while'): 0.09090909090909091, ('', '2020'): 0.09090909090909091, ('and', 'work'): 0.09090909090909091, ('and', 'p/vp'): 0.09090909090909091, ('and', 'present'): 0.09090909090909091, ('and', 'no'): 0.09090909090909091, ('and', 'lskbb'): 0.09090909090909091, ('of', 'disposable'): 0.09090909090909091, ('and', 'registration'): 0.09090909090909091, ('and', 'may'): 0.09090909090909091, ('of', 'a'): 0.09090909090909091, ('and', 'preventive'): 0.09090909090909091, ('and', 'environment'): 0.09090909090909091, ('to', 'implement'): 0.07692307692307693, ('to', 'students'): 0.07692307692307693, ('to', 'be'): 0.07692307692307693, ('to', 'assess'): 0.07692307692307693, ('to', 'stay'): 0.07692307692307693, ('to', 'satisfy'): 0.07692307692307693, ('to', 'combat'): 0.07692307692307693, ('the', 'latest'): 0.05, ('the', 'novel'): 0.05, ('the', 'week'): 0.05, ('the', '\"work'): 0.05, ('the', 'quantities'): 0.05, ('the', 'requests'): 0.05, ('the', 'supply'): 0.05, ('the', \"dean's\"): 0.05, ('the', 'student'): 0.05, ('the', 'security'): 0.05, ('the', 'atrium'): 0.05, ('the', 'scarce'): 0.05, ('the', 'community'): 0.05, ('the', 'distribution'): 0.05, ('the', 'mask'): 0.05, ('the', 'impact'): 0.05, ('the', 'virus'): 0.05, ('the', 'coronavirus'): 0.05, ('', 'dear'): 0.045454545454545456, ('', 'in'): 0.045454545454545456, ('', 'the'): 0.045454545454545456, ('', 'campus'): 0.045454545454545456, ('', '1'): 0.045454545454545456, ('', '2'): 0.045454545454545456, 
('', '3'): 0.045454545454545456, ('', 'a'): 0.045454545454545456, ('', 'cost'): 0.045454545454545456, ('', 'it'): 0.045454545454545456, ('', 'important'): 0.045454545454545456, ('', 'especially'): 0.045454545454545456, ('', 'is'): 0.045454545454545456, ('', 'health'): 0.045454545454545456, ('', 'safety'): 0.045454545454545456}\n" 249 | } 250 | ], 251 | "source": [ 252 | "sents = preprocessing('email.txt')\n", 253 | "ug_count_dic, ug_prob_dic = unigrams(sents)\n", 254 | "bi_dic = bigram(sents, ug_count_dic)\n", 255 | "# print(ug_prob_dic)\n", 256 | "# print(bi_dic)\n", 257 | "\n", 258 | "ug_sorted_dic = {x:v for x,v in sorted(ug_prob_dic.items(), key=lambda item:item[1], reverse=True)}\n", 259 | "print(ug_sorted_dic)\n", 260 | "bi_sorted_dic = {x:v for x,v in sorted(bi_dic.items(), key=lambda item:item[1], reverse=True)}\n", 261 | "print(bi_sorted_dic)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "execution_count": null, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "## 3.10 & 3.11" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 38, 276 | "metadata": {}, 277 | "outputs": [ 278 | { 279 | "name": "stdout", 280 | "output_type": "stream", 281 | "text": "sentence 1: ppl = 2.15\nsafety and may not be on a daily basis departments units are requested to help alleviate the supply of the university's desire to\nsentence 2: ppl = 2.34\n2020 (wed) while non-academic offices for details and may not be reminded that wearing mask in-person and p/vp offices and present their masks\nsentence 3: ppl = 2.61\nsafety and registration on campus starting from campus during the latest development of the university's desire to stay away from campus mask dissemination\nsentence 4: ppl = 2.70\nis for information on campus starting from campus community\nsentence 5: ppl = 1.92\n2020 (wed) while other students can collect their student hall counter\nsentence 6: ppl = 2.93\nplease be sufficient to implement procedures to respective p/vp offices for information on campus starting from the security office on a mask to\nsentence 7: ppl = 1.00\nhealth\nsentence 8: ppl = 2.24\nwhile the university's management has agreed to respective dean's office\nsentence 9: ppl = 2.35\ndear members of masks at home period\" will be on a limited quantity of masks needed and no personal use on campus mask\nsentence 10: ppl = 1.00\nhealth\nsentence 11: ppl = 1.78\nplease visit https://www.chp.gov.hk/en/features/102465.html for personal use on or before feb 12\nsentence 12: ppl = 1.41\nespecially hand hygiene practices\nsentence 13: ppl = 2.59\nin our campus during the atrium and preventive measures\nsentence 14: ppl = 2.52\nwhile other good personal hygiene and p/vp offices should submit requests to combat spreading of the community\nsentence 15: ppl = 5.65\nthe mask dissemination\nsentence 16: ppl = 2.44\n2 students must collect their student id which is the atrium and work with the distribution\nsentence 17: ppl = 2.94\nthe supply is the \"work at all possible those who have to provide disposable mask supply scheme\nsentence 18: ppl = 2.16\nis only a small part of the virus outbreak in the novel coronavirus keeping up other students can collect their student id which\nsentence 19: ppl = 2.15\ncost recovery basis departments units are requested to combat spreading of the latest development of masks needed and staff for information on or\nsentence 20: ppl = 1.98\n1 staff association has decided to provide disposable face mask dissemination\nAverage ppl: 2.34\n" 282 | } 283 
| ], 284 | "source": [ 285 | "import random\n", 286 | "\n", 287 | "def rand_sent(bi_dic, max_len):\n", 288 | " sent = ['']\n", 289 | " while sent[-1]!='<\\s>' and len(sent)+1' else sent+['<\\s>']\n", 299 | "\n", 300 | "def cal_ppl(sent, bi_dic):\n", 301 | " ppl = 1\n", 302 | " for i in range(1,len(sent)-2):\n", 303 | " w1, w2 = sent[i], sent[i+1]\n", 304 | " ppl *= bi_dic[(w1,w2)]\n", 305 | " return pow(ppl, -1/(len(sent)-2))\n", 306 | "\n", 307 | "test_times = 20\n", 308 | "\n", 309 | "total = 0\n", 310 | "for i in range(test_times):\n", 311 | " sent = rand_sent(bi_dic, 25)\n", 312 | " ppl = cal_ppl(sent, bi_dic)\n", 313 | " print(\"sentence %d: ppl = %.2f\"%(i+1,ppl))\n", 314 | " print(' '.join([word for word in sent[1:-1]]))\n", 315 | " total += ppl\n", 316 | "print(\"Average ppl: %.2f\"%(total/test_times))" 317 | ] 318 | } 319 | ] 320 | } 321 | -------------------------------------------------------------------------------- /Chapter4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 2, 4 | "metadata": { 5 | "language_info": { 6 | "name": "python", 7 | "codemirror_mode": { 8 | "name": "ipython", 9 | "version": 3 10 | } 11 | }, 12 | "orig_nbformat": 2, 13 | "file_extension": ".py", 14 | "mimetype": "text/x-python", 15 | "name": "python", 16 | "npconvert_exporter": "python", 17 | "pygments_lexer": "ipython3", 18 | "version": 3 19 | }, 20 | "cells": [ 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## 4.1\n", 26 | "\n", 27 | "$$c_{pos}=P(I|pos)P(always|pos)P(like|pos)P(foreign|pos)P(films|pos)P(pos) = 0.09\\times0.07\\times0.29\\times0.04\\times0.08\\times0.5=0.0000029232$$\n", 28 | "$$c_{neg}=P(I|pos)P(always|pos)P(like|pos)P(foreign|pos)P(films|pos)P(pos) = 0.16\\times0.06\\times0.06\\times0.15\\times0.11\\times0.5=0.000004752$$\n", 29 | "Hence,\n", 30 | "$$\\hat{c}=argmax_c(c_{pos},c_{neg})=neg$$" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "## 4.2\n", 38 | "\n", 39 | "Class probability: \n", 40 | "$$logprior[comedy]=\\log{P(comedy)} = \\log\\frac{2}{5} = -0.3979$$\n", 41 | "$$logprior[action]=\\log{P(action)} = \\log\\frac{3}{5} = -0.2218$$\n", 42 | "\n", 43 | "Vocabulary:\n", 44 | "V = {fun, couple, love, fast, furious, shoot, fly}\n", 45 | "\n", 46 | "bigdoc\\[comedy\\] = {fun, couple, love, fly, fast}\n", 47 | "\n", 48 | "bigdoc\\[action\\] = {fast, furious, shoot, fun, fly, love}\n", 49 | "\n", 50 | "loglikelihood\n", 51 | "$$loglikelihood[fun,comedy]=\\frac{3+1}{9+7}=\\frac{4}{16}$$\n", 52 | "$$loglikelihood[fun,action]=\\frac{1+1}{11+7}=\\frac{3}{18}$$\n", 53 | "$$loglikelihood[couple,comedy]=\\frac{2+1}{9+7}=\\frac{3}{16}$$\n", 54 | "$$loglikelihood[couple,action]=\\frac{0+1}{11+7}=\\frac{1}{18}$$\n", 55 | "$$loglikelihood[love,comedy]=\\frac{2+1}{9+7}=\\frac{3}{16}$$\n", 56 | "$$loglikelihood[love,action]=\\frac{1+1}{11+7}=\\frac{2}{18}$$\n", 57 | "$$loglikelihood[fast,comedy]=\\frac{1+1}{9+7}=\\frac{2}{16}$$\n", 58 | "$$loglikelihood[fast,action]=\\frac{2+1}{11+7}=\\frac{3}{18}$$\n", 59 | "$$loglikelihood[furious,comedy]=\\frac{0+1}{9+7}=\\frac{1}{16}$$\n", 60 | "$$loglikelihood[furious,action]=\\frac{2+1}{11+7}=\\frac{3}{18}$$\n", 61 | "$$loglikelihood[shoot,comedy]=\\frac{0+1}{9+7}=\\frac{1}{16}$$\n", 62 | "$$loglikelihood[shoot,action]=\\frac{3+1}{11+7}=\\frac{4}{18}$$\n", 63 | "$$loglikelihood[fly,comedy]=\\frac{1+1}{9+7}=\\frac{2}{16}$$\n", 64 | 
"$$loglikelihood[fly,action]=\\frac{1+1}{11+7}=\\frac{2}{18}$$\n", 65 | "\n", 66 | "We have the following table\n", 67 | "\n", 68 | "
\n", 69 | "\n", 70 | "|word|comedy|action|\n", 71 | "|:----:|:---:|:---:|\n", 72 | "|fun|4/16|3/18|\n", 73 | "|couple|3/16|1/18|\n", 74 | "|love|3/16|2/18|\n", 75 | "|fast|2/16|3/18|\n", 76 | "|furious|1/16|3/18|\n", 77 | "|shoot|1/16|4/18|\n", 78 | "|fly|2/16|2/18|\n", 79 | "\n", 80 | "
\n", 81 | "\n", 82 | "Next we can compute classifier output\n", 83 | "For comedy class:\n", 84 | "$$sum[comedy]=-0.3979+\\log\\frac{2}{16}+\\log\\frac{3}{16}+\\log\\frac{1}{16}+\\log\\frac{2}{16}=-4.1351$$\n", 85 | "$$sum[action]=-0.2218+\\log\\frac{3}{18}+\\log\\frac{1}{18}+\\log\\frac{4}{18}+\\log\\frac{2}{18}=-3.8627$$\n", 86 | "\n", 87 | "Because sum\\[action\\]>sum\\[comedy\\], we assert that D should be in the action class." 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "## 4.3\n", 95 | "\n", 96 | "### 1) Multinomial Naive Bayes\n", 97 | "Class prior\n", 98 | "$$logprior[positive]=\\log(2/5)=-0.3979$$\n", 99 | "$$logprior[negative]=\\log(3/5)=-0.2218$$\n", 100 | "\n", 101 | "loglikelihood\n", 102 | "$$loglikelihood[good,pos]=\\log\\frac{3+1}{9+3}=-0.4771$$\n", 103 | "$$loglikelihood[good,neg]=\\log\\frac{2+1}{14+3}=-0.7533$$\n", 104 | "$$loglikelihood[poor,pos]=\\log\\frac{1+1}{9+3}=-0.7782$$\n", 105 | "$$loglikelihood[poor,neg]=\\log\\frac{10+1}{14+3}=-0.1891$$\n", 106 | "$$loglikelihood[great,pos]=\\log\\frac{5+1}{9+3}=-0.3010$$\n", 107 | "$$loglikelihood[great,neg]=\\log\\frac{2+1}{14+3}=-0.7533$$\n", 108 | "\n", 109 | "Next we can compute classifier output\n", 110 | "For comedy class:\n", 111 | "$$sum[pos]=-0.3979-0.4771\\times2-0.7782-0.3010=-2.4313$$\n", 112 | "$$sum[neg]=-0.2218-0.7533\\times2-0.1891-0.7533=-2.6708$$\n", 113 | "\n", 114 | "Because sum\\[pos\\]>sum\\[neg\\], we assert it should be classified as pos.\n", 115 | "\n", 116 | "## 2) Binarized naive Bayes\n", 117 | "loglikelihood\n", 118 | "$$loglikelihood[good,pos]=\\log\\frac{1+1}{4+3}=--0.5441$$\n", 119 | "$$loglikelihood[good,neg]=\\log\\frac{2+1}{6+3}=-0.4771$$\n", 120 | "$$loglikelihood[poor,pos]=\\log\\frac{1+1}{4+3}=-0.5441$$\n", 121 | "$$loglikelihood[poor,neg]=\\log\\frac{3+1}{6+3}=-0.3522$$\n", 122 | "$$loglikelihood[great,pos]=\\log\\frac{2+1}{4+3}=-0.3680$$\n", 123 | "$$loglikelihood[great,neg]=\\log\\frac{1+1}{6+3}=-0.6532$$\n", 124 | "\n", 125 | "$$sum[pos]=-0.3979-0.5441-0.5441-0.3680=-1.8541$$\n", 126 | "$$sum[neg]=-0.2218-0.4771-0.3522-0.6532=-1.7043$$\n", 127 | "\n", 128 | "Because sum\\[pos\\] NP VP | Aux NP VP | VP 41 | NP -> Pronoun | Proper-Noun | Det Nominal 42 | Nominal -> Noun | Nominal Noun | Nominal PP 43 | VP -> Verb | Verb NP | Verb NP PP | Verb PP | VP PP 44 | PP -> Preposition NP 45 | Det -> that | this | a | the 46 | Noun -> book | flight | meal | money 47 | Verb -> book | include | prefer 48 | Pronoun -> I | she | me 49 | Proper-Noun -> Houston | TWA 50 | Aux -> does 51 | Preposition -> from | to | on | near | through''') 52 | new_grammar = Chmosky(grammar) 53 | # display(grammar) 54 | display_grammar(new_grammar) 55 | words = "book the flight through Houston".split() 56 | table = CKY(words, new_grammar) 57 | display_table(words, table) -------------------------------------------------------------------------------- /Constituency Parsing/Chmosky.py: -------------------------------------------------------------------------------- 1 | from utils import get_grammar, display_grammar 2 | import copy 3 | 4 | """TODO: Buildup a non-terminal list 5 | """ 6 | 7 | def Chmosky(grammar): 8 | """ Convert arbitrary grammar to a CNF grammar 9 | @param grammar: list of tuples, each of which is in type: (str, list[str]) 10 | """ 11 | # grammar = set(grammar) 12 | nonterminals = list(g[0] for g in grammar) 13 | 14 | def isTerminal(p): 15 | return not p in nonterminals 16 | 17 | new_grammar = copy.deepcopy(grammar) 18 | temp_grammar = 
copy.deepcopy(grammar) 19 | # A flag indicating if the processing happens 20 | process = True 21 | # remove epsilon rules 22 | while process: 23 | process = False 24 | # remove single symbol nonterminal rules 25 | for l, r in temp_grammar: 26 | if l != 'S' and not r: 27 | process = True 28 | new_grammar.remove((l,r)) 29 | temp_grammar = copy.deepcopy(new_grammar) 30 | for l_, r_ in temp_grammar: 31 | if r in r_: 32 | new_grammar.remove((l_, r_)) 33 | new_grammar.append((l_, r_.pop(r))) 34 | temp_grammar = copy.deepcopy(new_grammar) 35 | 36 | process = True 37 | # remove single symbol nonterminal rules 38 | while process: 39 | process = False 40 | for l, r in temp_grammar: 41 | if len(r) == 1 and not isTerminal(r[0]): 42 | process = True 43 | new_grammar.remove((l,r)) 44 | for l_, r_ in temp_grammar: 45 | if l_ == r[0]: 46 | new_grammar.append((l, r_)) 47 | temp_grammar = copy.deepcopy(new_grammar) 48 | 49 | """ process = True 50 | # move terminals to their own rules 51 | while process: 52 | process = False 53 | for l, r in temp_grammar: 54 | if len(r) > 1: 55 | for i, rr in enumerate(r): 56 | if isTerminal(rr): 57 | process = True 58 | new_l = rr.upper() 59 | new_grammar.append((new_l, [rr])) 60 | new_grammar.remove(l, r) 61 | new_grammar.append((l, r[:i]+[new_l]+r[i+1:])) 62 | temp_grammar = copy.deepcopy(new_grammar) """ 63 | 64 | process = True 65 | # ensure there are only two nonterminals per rule 66 | index = 1 67 | while process: 68 | process = False 69 | # remove single symbol nonterminal rules 70 | for l, r in temp_grammar: 71 | if len(r) > 2: 72 | process = True 73 | unknown = True 74 | # if rule exists, don't create a new rule 75 | new_r = r[0:2] 76 | for l_, r_ in temp_grammar: 77 | if r_ == new_r and l_[0]=='X': 78 | unknown = False 79 | new_l = l_ 80 | break 81 | if unknown: 82 | new_l = 'X%d'%index 83 | index += 1 84 | new_grammar.append((new_l, new_r)) 85 | # replace all 86 | new_grammar.remove((l,r)) 87 | new_grammar.append((l,[new_l]+r[2:])) 88 | temp_grammar = copy.deepcopy(new_grammar) 89 | 90 | return new_grammar 91 | 92 | if __name__ == '__main__': 93 | grammar = get_grammar('''\ 94 | S -> NP VP | Aux NP VP | VP 95 | NP -> Pronoun | Proper-Noun | Det Nominal 96 | Nominal -> Noun | Nominal Noun | Nominal PP 97 | VP -> Verb | Verb NP | Verb NP PP | Verb PP | VP PP 98 | PP -> Preposition NP 99 | Det -> that | this | a 100 | Noun -> book | flight | meal | money 101 | Verb -> book | include | prefer 102 | Pronoun -> I | she | me 103 | Proper-Noun -> Houston | TWA 104 | Aux -> does 105 | Preposition -> from | to | on | near | through''') 106 | new_grammar = Chmosky(grammar) 107 | -------------------------------------------------------------------------------- /Constituency Parsing/__pycache__/Chmosky.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Clement25/Speech-and-Language-Processing-ver3-solutions/24af9e49568187aeb062beacd33eba562f2e485c/Constituency Parsing/__pycache__/Chmosky.cpython-36.pyc -------------------------------------------------------------------------------- /Constituency Parsing/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Clement25/Speech-and-Language-Processing-ver3-solutions/24af9e49568187aeb062beacd33eba562f2e485c/Constituency Parsing/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /Constituency 
Parsing/utils.py: -------------------------------------------------------------------------------- 1 | from prettytable import PrettyTable 2 | 3 | def get_grammar(string): 4 | grammar = list() 5 | for line in string.splitlines(): 6 | left, rights = line.split(' -> ') 7 | for right in rights.split('|'): 8 | grammar.append((left.strip(),[r.strip() for r in right.split()])) 9 | return grammar 10 | 11 | def display_grammar(grammar): 12 | disp_dic = {} 13 | for g in grammar: 14 | try: 15 | disp_dic[g[0]].append(g[1]) 16 | except: 17 | disp_dic[g[0]] = [g[1]] 18 | for key, value in disp_dic.items(): 19 | print(key+" -> "+" | ".join([" ".join(v) for v in value])) 20 | 21 | def display_table(words, table): 22 | """display CKY result table 23 | @param (table): list of list of set. Each set contains all possible results from CKY algorithm. 24 | @return: None 25 | """ 26 | pt = PrettyTable(words) 27 | for row in table: 28 | if row == table[-1]: 29 | break 30 | new_row = [','.join(list(item)) for item in row[1:]] 31 | pt.add_row(new_row) 32 | print(pt) -------------------------------------------------------------------------------- /POS Tagging/__pycache__/taggers.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Clement25/Speech-and-Language-Processing-ver3-solutions/24af9e49568187aeb062beacd33eba562f2e485c/POS Tagging/__pycache__/taggers.cpython-36.pyc -------------------------------------------------------------------------------- /POS Tagging/main.py: -------------------------------------------------------------------------------- 1 | from taggers import BLTagger, HMMTagger 2 | import ipdb 3 | 4 | 5 | def main(): 6 | filepath = './data/conll2000/' 7 | file_name_train, file_name_test = 'train.txt', 'test.txt' 8 | 9 | train_file, test_file = open(filepath+file_name_train),open(filepath+file_name_test) 10 | train_data = [line.strip() for line in train_file] 11 | test_data = [line.strip() for line in test_file] 12 | 13 | print(train_data[0:50]) 14 | 15 | # Baseline model, "most likely tag" 16 | train_pairs = [(pair.split()[0], pair.split()[1]) for pair in train_data if pair] 17 | test_pairs = [(pair.split()[0], pair.split()[1]) for pair in test_data if pair] 18 | 19 | Base_Tagger = BLTagger(train_pairs) 20 | Base_Tagger.test(test_pairs) 21 | 22 | # HMM model 23 | train_pairs_bi = [((train_data[i].split()[0], train_data[i].split()[1]),(train_data[i+1].split()[0], train_data[i+1].split()[1])) \ 24 | for i in range(len(train_data)-1) if train_data[i+1] and train_data[i]] 25 | # test_pairs_bi = [((test_data[i].split()[0], test_data[i].split()[1]),(test_data[i+1].split()[0], test_data[i+1].split()[1])) \ 26 | # for i in range(len(test_data)-1) if test_data[i+1] and test_data[i]] 27 | 28 | # exclude the last word in each scentence because they have no contribution to the transition probability 29 | train_pairs_hmm = [pair[0] for pair in train_pairs_bi] 30 | 31 | # start of each sentence 32 | train_starts = [train_data[i].split()[1] for i in range(len(train_data)) if i == 0 or not train_data[i-1]] 33 | # test_pairs_hmm = [pair[0] for pair in test_pairs_bi] 34 | test_sents = [] 35 | test_tags = [] 36 | sent = [] 37 | sent_tags = [] 38 | for pair in test_data: 39 | if pair: 40 | sent.append(pair.split()[0]) 41 | sent_tags.append(pair.split()[1]) 42 | else: 43 | test_sents.append(sent) 44 | test_tags.append(sent_tags) 45 | sent = [] 46 | sent_tags = [] 47 | 48 | vocab = [pair.split()[0] for pair in train_data if pair] 49 | vocab = 
list(set(vocab)) 50 | 51 | # all tags 52 | tags = [pair[1] for pair in train_pairs] 53 | all_tags = list(set(tags)) 54 | 55 | print(train_pairs[:20]) 56 | print(train_pairs_bi[:20]) 57 | 58 | HMM_Tagger = HMMTagger(train_pairs, train_pairs_bi, train_starts, all_tags, len(vocab)) 59 | HMM_Tagger.test(test_sents, test_tags) 60 | 61 | if __name__ == '__main__': 62 | main() -------------------------------------------------------------------------------- /POS Tagging/taggers.py: -------------------------------------------------------------------------------- 1 | import time 2 | import ipdb 3 | import numpy as np 4 | import pandas as pd 5 | from collections import defaultdict 6 | from prettytable import PrettyTable 7 | 8 | class BLTagger(object): 9 | """Baseline Tagger 10 | Using max likelihood to tag all words 11 | """ 12 | def __init__(self, train_pairs): 13 | """Initialize a baseline tagger 14 | @param train_pairs (List of Tuple): Each with format (word, tag) 15 | """ 16 | self.word_tag_dic = self._build(train_pairs) 17 | print("Finish building baseline tagger") 18 | 19 | def _build(self, train_pairs): 20 | freq_dict = {} 21 | for word, tag in train_pairs: 22 | if word not in freq_dict: 23 | word_dict = {} 24 | word_dict[tag] = 1 25 | freq_dict[word] = word_dict 26 | else: 27 | word_dict = freq_dict[word] 28 | if tag not in word_dict: 29 | word_dict[tag] = 1 30 | else: 31 | word_dict[tag] += 1 32 | word_tag_dic = {} 33 | for word, word_dict in freq_dict.items(): 34 | best_tag = max(word_dict, key=word_dict.get) 35 | word_tag_dic[word] = best_tag 36 | return word_tag_dic 37 | 38 | def test(self, test_pairs): 39 | correct = 0 40 | start_time = time.time() 41 | for word, tag in test_pairs: 42 | if word not in self.word_tag_dic: 43 | pred_tag = 'NN' 44 | else: 45 | pred_tag = self.word_tag_dic[word] 46 | correct += (pred_tag==tag) 47 | print("Test %d examples in %.5f seconds"%(len(test_pairs),time.time()-start_time)) 48 | print("Test Accuracy: %.2f%%"%(100*correct/len(test_pairs))) 49 | 50 | class HMMTagger(object): 51 | def __init__(self, train_pairs, train_pairs_bi, train_starts, all_tags, vocab_size): 52 | self.all_tags = all_tags 53 | self.tag2id = {tag:index for index,tag in enumerate(all_tags)} 54 | self.id2tag = {index:tag for index,tag in enumerate(all_tags)} 55 | self.vocab_size = vocab_size 56 | self.tag_freq_dic = self._get_tag_freqs(train_pairs) 57 | self.total_tags = sum([tag_freq for tag_freq in self.tag_freq_dic.values()]) 58 | self._build(train_pairs, train_pairs_bi, train_starts) 59 | print("Finish building HMM tagger!") 60 | 61 | def _build(self, train_pairs, train_pairs_bi, train_starts): 62 | self.A = self._init_trans_matrix(train_pairs_bi, train_pairs) 63 | self.B = self._init_emission_prob(train_pairs) 64 | self.C = self._init_start_distrib(train_starts) 65 | 66 | def _init_trans_matrix(self, train_pairs_bi, train_pairs): 67 | num_tag = len(self.tag2id) 68 | A = np.zeros((num_tag, num_tag),dtype=np.float32) 69 | bigram_freq_dic = {} 70 | for bigram in train_pairs_bi: 71 | tag1, tag2 = bigram[0][1], bigram[1][1] 72 | try: 73 | bigram_freq_dic[(tag1, tag2)] += 1 74 | except: 75 | bigram_freq_dic[(tag1, tag2)] = 1 76 | for i in range(num_tag): 77 | for j in range(num_tag): 78 | try: 79 | A[i][j] = bigram_freq_dic[(self.id2tag[i], self.id2tag[j])]/ \ 80 | self.tag_freq_dic[self.id2tag[i]] # add smoothing 81 | except: 82 | A[i][j] = 0 83 | return A 84 | 85 | def _init_emission_prob(self, train_pairs): 86 | emission_prob_dic = {} 87 | for pair in train_pairs: 88 | try: 89 | 
emission_prob_dic[pair] += 1 90 | except: 91 | emission_prob_dic[pair] = 1 92 | for pair in emission_prob_dic: 93 | # emission_prob_dic[pair] = (emission_prob_dic[pair]+1)/(self.tag_freq_dic[pair[1]] + self.vocab_size) 94 | emission_prob_dic[pair] = float(emission_prob_dic[pair])/float(self.tag_freq_dic[pair[1]]) 95 | return emission_prob_dic 96 | 97 | def _init_start_distrib(self, train_starts): 98 | count = 0 99 | start_distrib_dic = {tag:0 for tag in self.all_tags} 100 | for start in train_starts: 101 | try: 102 | start_distrib_dic[start] += 1 103 | except: 104 | start_distrib_dic[start] = 1 105 | for start in start_distrib_dic: 106 | start_distrib_dic[start] /= len(train_starts) 107 | return start_distrib_dic 108 | 109 | def _get_tag_freqs(self, train_pairs): 110 | tag_freq_dic = {} 111 | for word, tag in train_pairs: 112 | if tag not in tag_freq_dic: 113 | tag_freq_dic[tag] = 1 114 | else: 115 | tag_freq_dic[tag] += 1 116 | return tag_freq_dic 117 | 118 | def search_word(self, word): 119 | for pair in self.B: 120 | if pair[0] == word: 121 | print(pair,self.B[pair]) 122 | 123 | def _save_confusion(self, conf_matrix): 124 | """ Save the confusion matrix to an excel file for easy checking 125 | """ 126 | # table = PrettyTable(['\t']+[self.id2tag[i] for i in range(len(self.id2tag))]) 127 | # for i in range(len(self.id2tag)): 128 | # table.add_row([self.id2tag[i]]+list(conf_matrix[i])) 129 | # print(table) 130 | writer = pd.ExcelWriter('confusion.xlsx') 131 | df1 = pd.DataFrame(conf_matrix, index=[self.id2tag[i] for i in range(len(self.id2tag))], columns=[self.id2tag[i] for i in range(len(self.id2tag))]) 132 | df1.to_excel(writer,'Sheet1') 133 | writer.save() 134 | 135 | def test(self, test_sents, test_tags): 136 | correct = 0 137 | total_test = 0 138 | start_time = time.time() 139 | conf_matrix = np.zeros((len(self.all_tags),len(self.all_tags)),dtype=np.int) 140 | print_every = 10000 141 | for idx, sent in enumerate(test_sents): 142 | T = len(sent) 143 | num_tag = len(self.tag_freq_dic) 144 | viterbi = np.zeros((num_tag, T), dtype=np.float) 145 | back_trace = np.full((num_tag,T), -1, dtype=np.int) 146 | 147 | for s in range(num_tag): 148 | try: 149 | viterbi[s][0] = self.C[self.id2tag[s]]*self.B[(sent[0],self.id2tag[s])] 150 | except: 151 | # if (word, tag) pair doesn't exist, treat as NNP 152 | viterbi[s][0] = 0 153 | 154 | if np.sum(viterbi[:][0]) == 0: 155 | viterbi[self.tag2id['NNP']][0] = 1 156 | 157 | for t in range(1,len(sent)): 158 | for s in range(num_tag): 159 | max_score = 0 160 | for s_ in range(num_tag): 161 | try: 162 | score = viterbi[s_][t-1]*self.A[s_][s]*self.B[(sent[t],self.id2tag[s])] 163 | except: 164 | score = 0 165 | if max_score < score: 166 | max_score = score 167 | last_tag = s_ 168 | viterbi[s][t] = max_score 169 | try: 170 | back_trace[s][t] = last_tag 171 | except: 172 | back_trace[s][t] = -1 173 | 174 | bestpath = [np.argmax(viterbi[:,len(sent)-1])] 175 | for i in range(T-1): 176 | bestpath.append(back_trace[bestpath[-1]][T-1-i]) 177 | bestpath.reverse() 178 | bestpath = [self.id2tag[aa] for aa in bestpath] 179 | 180 | for i in range(T): 181 | correct += bestpath[i] == test_tags[idx][i] 182 | conf_matrix[self.tag2id[bestpath[i]]][self.tag2id[test_tags[idx][i]]] += 1 183 | total_test += T 184 | if idx % print_every == 0: 185 | print("Processing %d item..."%idx) 186 | print("Test %d examples in %.5f seconds"%(total_test,time.time()-start_time)) 187 | print("Test Accuracy: %.2f%%"%(100*correct/total_test)) 188 | self._save_confusion(conf_matrix) 
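A note on smoothing: `_init_trans_matrix` above carries an "# add smoothing" comment but falls back to a zero transition probability for unseen tag bigrams, and the add-one emission estimate in `_init_emission_prob` is left commented out, so unseen events make the Viterbi products collapse to zero. The helpers below are a minimal sketch of add-one (Laplace) smoothing over the same count dictionaries; the names `smoothed_transition` and `smoothed_emission` are illustrative and are not part of taggers.py.

import numpy as np

def smoothed_transition(bigram_freq_dic, tag_freq_dic, id2tag):
    """Add-one smoothed transition matrix, A[i][j] ~ P(tag_j | tag_i)."""
    num_tag = len(id2tag)
    A = np.zeros((num_tag, num_tag), dtype=np.float64)
    for i in range(num_tag):
        for j in range(num_tag):
            count = bigram_freq_dic.get((id2tag[i], id2tag[j]), 0)
            # add 1 to every bigram count and the tag-set size to the denominator
            A[i][j] = (count + 1) / (tag_freq_dic[id2tag[i]] + num_tag)
    return A

def smoothed_emission(pair_freq_dic, tag_freq_dic, vocab_size):
    """Add-one smoothed P(word | tag); unseen (word, tag) pairs should fall back
    to 1 / (C(tag) + vocab_size) at lookup time."""
    return {(word, tag): (count + 1) / (tag_freq_dic[tag] + vocab_size)
            for (word, tag), count in pair_freq_dic.items()}

if __name__ == '__main__':
    # Tiny toy counts, for illustration only.
    id2tag = {0: 'DT', 1: 'NN'}
    tag_freq_dic = {'DT': 2, 'NN': 2}
    print(smoothed_transition({('DT', 'NN'): 2}, tag_freq_dic, id2tag))
    print(smoothed_emission({('the', 'DT'): 2, ('flight', 'NN'): 1}, tag_freq_dic, vocab_size=3))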
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # slp3 2 | The most comprehensive set of solutions to this book on GitHub. 3 | Solutions and code for the book "Speech and Language Processing (3rd Edition draft)" by Daniel Jurafsky and James H. Martin. 4 | 5 | The book can be found [here](https://web.stanford.edu/~jurafsky/slp3/ed3book.pdf). 6 | 7 | Feel free to contact me should you have any questions about this repository. 8 | 9 | # slp3 10 | Solutions to the exercises in "Speech and Language Processing (3rd Edition draft)". 11 | This is currently the most comprehensive set of solutions to the book on GitHub! 12 | 13 | If you have any questions about this repository, feel free to contact me by email! 14 | -------------------------------------------------------------------------------- /Statistical Constituency Parsing/CKY.py: -------------------------------------------------------------------------------- 1 | import operator 2 | from utils import get_grammar, display_grammar, display_table 3 | 4 | def FindGrammar(grammar, word=None, prod1=None, prod2=None): 5 | """Find matching rules in the grammar 6 | @param word (string): the word to be searched as a terminal 7 | @param prod1, prod2 (list of tuple): cells in the CKY table, each tuple is in the format (head, probability, backnode) 8 | @return: a list of (head, probability, backnode) tuples, or None if nothing matches 9 | """ 10 | if word: 11 | # (head, probability, back) 12 | heads = [(head, probability, None) for (head, probability, symbol) in grammar if symbol==[word]] 13 | return heads 14 | # Find the maximum probability 15 | elif prod1 and prod2: 16 | result = None 17 | max_prob = 0 18 | for B in prod1: # The second element is the probability 19 | for C in prod2: 20 | if B and C: 21 | for (head, prob, symbol) in grammar: 22 | if symbol==[B[0], C[0]] and prob*B[1]*C[1] > max_prob: 23 | max_prob = prob*B[1]*C[1] 24 | result = [(head, max_prob, [B[0],C[0]])] 25 | # For unit productions 26 | for (head_, prob_, symbol_) in grammar: 27 | if result and symbol_ == [head]: 28 | result.append((head_, max_prob*prob_, [B[0],C[0]])) 29 | 30 | if not result: 31 | return None 32 | return result 33 | else: 34 | return None 35 | # raise ValueError("Either word or nt1 and nt2 must not be none") 36 | 37 | def CKY(words, grammar): 38 | words = [" "] + words 39 | table = [[list() for _ in range(len(words))] for _ in range(len(words))] 40 | for j in range(1, len(words)): 41 | res = FindGrammar(grammar, word=words[j]) 42 | if res: 43 | table[j-1][j] = res 44 | for i in range(j-2, -1, -1): 45 | max_prob = 0 46 | for k in range(i+1, j): 47 | result = FindGrammar(grammar, prod1=table[i][k], prod2=table[k][j]) 48 | if result: 49 | table[i][j] += result 50 | # Find the maximum probability 51 | return table 52 | 53 | if __name__ == '__main__': 54 | # grammar = get_grammar('''\ 55 | # S -> NP VP 0.8 | Aux NP VP 0.15 | VP 0.05 56 | # NP -> Pronoun 0.35 | Proper-Noun 0.30 | Det Noun 0.20 | Nominal 0.15 57 | # Nominal -> Noun 0.75 | Nominal Noun 0.20 | Nominal PP 0.05 58 | # VP -> Verb 0.35 | Verb NP 0.20 | Verb NP PP 0.10 | Verb PP 0.15 | VP PP 0.15 | Verb NP NP 0.05 59 | # PP -> Preposition NP 1.0 60 | # Det -> that 0.10 | a 0.30 | the 0.60 61 | # Noun -> book 0.10 | flight 0.70 | meal 0.05 | money 0.05 | dinner 0.10 62 | # Verb -> book 0.30 | include 0.30 | prefer 0.40 63 | # Pronoun -> I 0.40 | she 0.05 | me 0.15 | you 0.40 64 | # Proper-Noun -> Houston 0.60 | TWA 0.40 65 | # Aux -> does 0.60 | can 0.40 66 | # Preposition -> from 0.30 | to 0.30 | on 0.20 | near 0.15 | through 0.05''') 67 | 68 | grammar = get_grammar('''\ 69 | S -> NP VP 0.8 70 | NP -> Det N 0.3 71 | VP -> V NP 0.2 72 | V -> includes 0.05 73 | Det -> a 0.40 |
the 0.40 74 | N -> meal 0.1 | flight 0.2''') 75 | 76 | display_grammar(grammar) 77 | words = "the flight includes a meal".split() 78 | table = CKY(words, grammar) 79 | print(table) 80 | # display_table(words, table) -------------------------------------------------------------------------------- /Statistical Constituency Parsing/__pycache__/Chmosky.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Clement25/Speech-and-Language-Processing-ver3-solutions/24af9e49568187aeb062beacd33eba562f2e485c/Statistical Constituency Parsing/__pycache__/Chmosky.cpython-36.pyc -------------------------------------------------------------------------------- /Statistical Constituency Parsing/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Clement25/Speech-and-Language-Processing-ver3-solutions/24af9e49568187aeb062beacd33eba562f2e485c/Statistical Constituency Parsing/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /Statistical Constituency Parsing/utils.py: -------------------------------------------------------------------------------- 1 | from prettytable import PrettyTable 2 | 3 | def get_grammar(string, probalistic=True): 4 | """Build up a grammar data structure 5 | @param string (str): a long string representing all grammar rules with format " A -> B1 | B2 | ... | B_n |", 6 | where A is a single grammar element and B_k is either a single and multiple grammar element(s). 7 | @return grammar (list of tuple): Tuples storing the grammar, each tuple is in the format (A, list[B, B_prob]) 8 | """ 9 | grammar = list() 10 | for line in string.splitlines(): 11 | left, rights = line.split(' -> ') 12 | for right in rights.split('|'): 13 | r_sp = right.split() 14 | if probalistic: 15 | grammar.append((left.strip(),float(r_sp[-1]),[r.strip() for r in r_sp[:-1]])) 16 | else: 17 | grammar.append((left.strip(),1.0,[r.strip() for r in r_sp[:-1]])) 18 | return grammar 19 | 20 | def get_lexcializeg_grammar(string, probalistic=True): 21 | """Build up a grammar data structure 22 | @param string (str): a long string representing all grammar rules with format " A -> B1(word) | B2 | ... | B_n |", 23 | where A is a single grammar element and B_k is either a single or multiple grammar element(s). 24 | @return grammar (list of tuple): Tuples storing the grammar, each tuple is in the format (A, list[B, B_prob]) 25 | """ 26 | grammar = list() 27 | for line in string.splitlines(): 28 | left, rights = line.split(' -> ') 29 | for right in rights.split('|'): 30 | r_sp = right.split() 31 | if probalistic: 32 | grammar.append((left.strip(),float(r_sp[-1]),[r.strip() for r in r_sp[:-1]])) 33 | else: 34 | grammar.append((left.strip(),1.0,[r.strip() for r in r_sp[:-1]])) 35 | return grammar 36 | 37 | def display_grammar(grammar): 38 | disp_dic = {} 39 | for g in grammar: 40 | try: 41 | disp_dic[g[0]].append(g[2]) 42 | except: 43 | disp_dic[g[0]] = [g[2]] 44 | for key, value in disp_dic.items(): 45 | print(key+" -> "+" | ".join([" ".join(v) for v in value])) 46 | 47 | def display_table(words, table): 48 | """display CKY result table 49 | @param (table): list of list of set. Each set contains all possible results from CKY algorithm. 
50 | @return: None 51 | """ 52 | pt = PrettyTable(words) 53 | for row in table: 54 | print(row) 55 | if row == table[-1]: 56 | break 57 | new_row = [','.join(list(item)) for item in row[1:]] 58 | pt.add_row(new_row) 59 | print(pt) -------------------------------------------------------------------------------- /data/conll2000/README: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Clement25/Speech-and-Language-Processing-ver3-solutions/24af9e49568187aeb062beacd33eba562f2e485c/data/conll2000/README --------------------------------------------------------------------------------
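A note on the probabilistic CKY above: `CKY` in Statistical Constituency Parsing/CKY.py accumulates every candidate returned by `FindGrammar` into each cell and leaves the "Find the maximum probability" step as a comment. The snippet below is a minimal, self-contained sketch, not the repository's implementation, of a probabilistic CKY pass that keeps only the best-scoring derivation per nonterminal in each cell, using the same toy grammar and the same (head, probability, right-hand side) rule convention as `get_grammar`; the function name `pcky` is illustrative.

# Illustrative sketch of probabilistic CKY over the toy grammar used above.
GRAMMAR = [
    ('S',   0.8,  ['NP', 'VP']),
    ('NP',  0.3,  ['Det', 'N']),
    ('VP',  0.2,  ['V', 'NP']),
    ('V',   0.05, ['includes']),
    ('Det', 0.4,  ['a']),
    ('Det', 0.4,  ['the']),
    ('N',   0.1,  ['meal']),
    ('N',   0.2,  ['flight']),
]

def pcky(words, grammar):
    n = len(words)
    # table[i][j] maps a nonterminal to (probability, backpointer) for the span words[i:j]
    table = [[dict() for _ in range(n + 1)] for _ in range(n + 1)]
    for j in range(1, n + 1):
        for head, prob, rhs in grammar:              # terminal rules
            if rhs == [words[j - 1]]:
                table[j - 1][j][head] = (prob, words[j - 1])
        for i in range(j - 2, -1, -1):               # binary rules over split point k
            for k in range(i + 1, j):
                for head, prob, rhs in grammar:
                    if len(rhs) != 2:
                        continue
                    B, C = rhs
                    if B in table[i][k] and C in table[k][j]:
                        p = prob * table[i][k][B][0] * table[k][j][C][0]
                        # keep only the highest-probability derivation of `head` for this span
                        if p > table[i][j].get(head, (0.0, None))[0]:
                            table[i][j][head] = (p, (B, k, C))
    return table

if __name__ == '__main__':
    words = "the flight includes a meal".split()
    chart = pcky(words, GRAMMAR)
    print(chart[0][len(words)].get('S'))  # best S derivation: (probability, backpointer)

On "the flight includes a meal" this prints an S entry with probability 0.8 * 0.024 * 0.00012 ≈ 2.3e-06 in the top-right cell, which matches a hand computation with the grammar's rule probabilities.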