├── .gitignore ├── README.md ├── hmm ├── README.md ├── hmm_tutorial.ipynb └── src │ ├── const.py │ ├── corpus │ └── toy │ │ └── train.txt │ ├── dataset.py │ ├── hmm.py │ ├── main.py │ └── processing.py ├── language_model ├── README.md ├── languange_model_tutorial.ipynb └── src │ ├── const.py │ ├── corpus │ └── toy │ │ ├── test.txt │ │ └── train.txt │ ├── dataset.py │ ├── evaluate.py │ ├── main.py │ ├── ngram.py │ ├── processing.py │ └── smooth.py ├── lsa └── lsa.py ├── nbayes ├── nbayes.py └── tfidf_nbayes.py ├── pca └── pca.py ├── pcfg ├── README.md ├── pcfg_tutorial.ipynb └── src │ ├── corpus │ └── toy │ │ └── train.txt │ ├── main.py │ └── pcfg.py ├── reading_comprehension ├── README.md ├── const.py ├── corpus │ ├── bAbI │ │ ├── LICENSE.txt │ │ ├── README.txt │ │ └── en-10k │ │ │ ├── qa5_three-arg-relations_test.txt │ │ │ └── qa5_three-arg-relations_train.txt │ ├── reading_comprehension.png │ └── result.png ├── dataset.py ├── main.py └── model.py ├── text_similarity └── vsm_sim.py └── word2vec ├── README.md ├── cbow ├── pytorch │ ├── negative_sampling │ │ ├── cbow.py │ │ ├── const.py │ │ ├── dataset.py │ │ ├── main.py │ │ └── utils.py │ └── softmax │ │ ├── cbow.py │ │ ├── const.py │ │ ├── dataset.py │ │ ├── main.py │ │ └── utils.py └── tensorflow │ ├── negative_sampling │ ├── cbow.py │ ├── const.py │ ├── dataset.py │ └── main.py │ └── softmax │ ├── cbow.py │ ├── const.py │ ├── dataset.py │ └── main.py ├── corpus ├── articles.txt ├── result.png └── trans_code.py └── skipgram ├── pytorch ├── negative_sampling │ ├── const.py │ ├── dataset.py │ ├── main.py │ ├── skipgram.py │ └── utils.py └── softmax │ ├── const.py │ ├── dataset.py │ ├── main.py │ ├── skipgram.py │ └── utils.py └── tensorflow ├── negative_sampling ├── const.py ├── dataset.py ├── main.py └── skipgram.py └── softmax ├── const.py ├── dataset.py ├── main.py └── skipgram.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

/ NLP Learning /
2 | 
3 | 结合python学习自然语言处理
4 | 5 | ## 目录 6 | 7 | - [python实现语言模型](https://github.com/SeanLee97/nlp_learning/tree/master/language_model) 8 | - [python实现HMM](https://github.com/SeanLee97/nlp_learning/tree/master/hmm) 9 | - [python实现PCFG](https://github.com/SeanLee97/nlp_learning/tree/master/pcfg) 10 | - [pytorch&tensorflow实现word2vec (CBOW softmax, CBOW negative_sampling, Skipgram softmax, Skipgram negative_sampling)](https://github.com/SeanLee97/nlp_learning/tree/master/word2vec) 11 | - [reading comprehension 阅读理解任务](https://github.com/SeanLee97/nlp_learning/tree/master/reading_comprehension) 12 | - [tfidf + 朴素贝叶斯](https://seanlee97.github.io/2018/08/25/%E4%B8%BA%E6%9C%B4%E7%B4%A0%E8%B4%9D%E5%8F%B6%E6%96%AF%E5%8A%A0%E5%85%A5TF-IDF%E7%89%B9%E5%BE%81/) 13 | - [text_similarity 文本相似度计算](https://seanlee97.github.io/2018/08/31/%E4%BD%99%E5%BC%A6%E5%AE%9A%E7%90%86%E5%92%8C%E6%96%87%E6%9C%AC%E7%9B%B8%E4%BC%BC%E5%BA%A6/) 14 | - [从特征值特征向量去理解PCA](https://seanlee97.github.io/2018/03/29/%E4%BB%8E%E7%89%B9%E5%BE%81%E5%80%BC%E7%89%B9%E5%BE%81%E5%90%91%E9%87%8F%E5%8E%BB%E7%90%86%E8%A7%A3PCA/) 15 | - [SVD的原理及LSA的求解](https://seanlee97.github.io/2018/09/01/SVD%E7%9A%84%E5%8E%9F%E7%90%86%E5%8F%8ALSA%E7%9A%84%E6%B1%82%E8%A7%A3/) 16 | 17 | 18 | ## 论文实现 19 | - [QANet](https://github.com/SeanLee97/QANet_dureader) 20 | - [bimpm](https://github.com/SeanLee97/bimpm) 21 | -------------------------------------------------------------------------------- /hmm/README.md: -------------------------------------------------------------------------------- 1 | # HMM 2 | 3 | 基于bigram, trigram实现的HMM, 支持viterbi解码输出更高效! 4 | -------------------------------------------------------------------------------- /hmm/hmm_tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# python 实现 隐马尔可夫模型(HMM)\n", 8 | "\n", 9 | "本例子主要受 Michael Collins 教授的 Tagging Problems, and Hidden Markov Models 启发而编写,为了帮助大家理解,我在我的博客、公众号上发表了文章[一文读懂NLP中的HMM(公众号)](https://mp.weixin.qq.com/s?__biz=MzIwNDM1NjUzMA==&mid=2247483662&idx=1&sn=cf463dde9af1844a3fd1e3e4fec26f5c&chksm=96c02fd3a1b7a6c5cfabe53efbff54af33cd2f61d13064645fbff92ce1b024d82acb2375d9b0#rd),欢迎大家阅读。当然强烈推荐Michael Collins 教授的 [Tagging Problems, and Hidden Markov Models](http://www.cs.columbia.edu/~mcollins/hmms-spring2013.pdf)" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## 目录\n", 17 | "\n", 18 | "1. [项目结构](#项目结构)\n", 19 | "2. [环境要求](#环境要求)\n", 20 | "3. [代码分析](#代码分析)\n", 21 | "4. [结果分析](#结果分析)\n", 22 | "5. [项目后续](#项目后续)\n", 23 | "6. 
[联系作者](#联系作者)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## 项目结构\n", 31 | "\n", 32 | "| - src\n", 33 | "\n", 34 | " | - const.py 常量定义文件\n", 35 | "\n", 36 | " | - corpus 语料库\n", 37 | "\n", 38 | " | - dataset.py 加载语料\n", 39 | "\n", 40 | " | - hmm.py bigram hmm, trigram hmm, viterbi\n", 41 | "\n", 42 | " | - main.py 例子程序\n", 43 | "\n", 44 | " | - processing.py 字典的生成等处理方法" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## 环境要求\n", 52 | "\n", 53 | " python3" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## 代码分析" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "### const.py\n", 68 | "\n", 69 | "在这里定义了三个常量" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 1, 75 | "metadata": { 76 | "collapsed": true 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "# 未登录词\n", 81 | "UNK = None\n", 82 | "# 句子开始标记,代表句子的开头\n", 83 | "START_TOKEN = ''\n", 84 | "# 句子结束标记,代表句子的结尾\n", 85 | "END_TOKEN = ''" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "### processing.py\n", 93 | "\n", 94 | "字典的构建" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": { 101 | "collapsed": true 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "# !/usr/bin/env python3\n", 106 | "# -*- coding: utf-8 -*-\n", 107 | "\n", 108 | "'''\n", 109 | "@description: 句子的处理,字典的构建\n", 110 | "@author: Sean QQ: 929325776\n", 111 | "'''\n", 112 | "\n", 113 | "import const\n", 114 | "\n", 115 | "#加入起始标记\n", 116 | "def build_tags(tags):\n", 117 | "\tout = []\n", 118 | "\tfor sentence in tags:\n", 119 | "\t\titems = [x.lower() for x in sentence]\n", 120 | "\t\titems.insert(0, const.START_TOKEN)\n", 121 | "\t\titems.append(const.END_TOKEN)\n", 122 | "\t\tout.append(items)\n", 123 | "\treturn out\n", 124 | "\n", 125 | "# 构建ungram词频词典\n", 126 | "def build_undict(tags):\n", 127 | "\tundict = {}\n", 128 | "\tfor items in tags:\n", 129 | "\t\tfor word in items:\n", 130 | "\t\t\tif word == const.START_TOKEN or word == const.END_TOKEN:\n", 131 | "\t\t\t\tcontinue\n", 132 | "\t\t\tif word not in undict:\n", 133 | "\t\t\t\tundict[word] = 1\n", 134 | "\t\t\telse:\n", 135 | "\t\t\t\tundict[word] += 1\n", 136 | "\treturn undict\n", 137 | "\n", 138 | "\n", 139 | "# 构建bigram词频词典,其中以三元组(u, v)作为词典的键\n", 140 | "def build_bidict(tags):\n", 141 | "\tbidict = {}\n", 142 | "\tfor items in tags: \n", 143 | "\t\tfor i in range(len(items)-1):\n", 144 | "\t\t\ttup = (items[i], items[i+1])\n", 145 | "\t\t\tif tup not in bidict:\n", 146 | "\t\t\t\tbidict[tup] = 1\n", 147 | "\t\t\telse:\n", 148 | "\t\t\t\tbidict[tup] += 1\n", 149 | "\treturn bidict\n", 150 | "\n", 151 | "# 构建trigram词频词典,其中以三元组(u, v, w)作为词典的键\n", 152 | "def build_tridict(tags):\n", 153 | "\ttridict = {}\n", 154 | "\tfor items in tags:\n", 155 | "\t\titems.insert(0, const.START_TOKEN)\n", 156 | "\t\tfor i in range(len(items) -2):\n", 157 | "\t\t\ttup = (items[i], items[i+1], items[i+2])\n", 158 | "\t\t\tif tup not in tridict:\n", 159 | "\t\t\t\ttridict[tup] = 1\n", 160 | "\t\t\telse:\n", 161 | "\t\t\t\ttridict[tup] += 1\n", 162 | "\treturn tridict\n", 163 | "\n", 164 | "# 构建(词,词性)词频字典,以及统计词频\n", 165 | "def build_count_dict(datas, tags):\n", 166 | "\ttagword_dict = {}\n", 167 | "\twordcount = {}\n", 168 | "\ttagcount = {}\n", 169 | "\tfor i, data in enumerate(datas):\n", 170 | "\t\ttag = tags[i][1:-1]\n", 171 | 
"\t\tfor idx, d in enumerate(data):\n", 172 | "\t\t\ttup = (tag[idx], d)\n", 173 | "\t\t\tif tup not in tagword_dict:\n", 174 | "\t\t\t\ttagword_dict[tup] = 1\n", 175 | "\t\t\telse:\n", 176 | "\t\t\t\ttagword_dict[tup] += 1\n", 177 | "\n", 178 | "\t\t\tif d not in wordcount:\n", 179 | "\t\t\t\twordcount[d] = 1\n", 180 | "\t\t\telse:\n", 181 | "\t\t\t\twordcount[d] += 1\n", 182 | "\t\t\tif tag[idx] not in tagcount:\n", 183 | "\t\t\t\ttagcount[tag[idx]] = 1\n", 184 | "\t\t\telse:\n", 185 | "\t\t\t\ttagcount[tag[idx]] += 1\n", 186 | "\treturn tagword_dict, wordcount, tagcount" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "### hmm.py\n", 194 | "\n", 195 | "基于bigram, trigram实现了hmm, 支持viterbi解码" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "collapsed": true 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "# !/usr/bin/env python3\n", 207 | "# -*- coding: utf-8 -*-\n", 208 | "\n", 209 | "'''\n", 210 | "@description: bigram hmm, trigram hmm, _viterbi\n", 211 | "@author: Sean QQ: 929325776\n", 212 | "'''\n", 213 | "\n", 214 | "import math\n", 215 | "import const\n", 216 | "from processing import *\n", 217 | "\n", 218 | "'''bigram hmm'''\n", 219 | "class BiHMM(object):\n", 220 | "\tdef __init__(self, datas, tags):\n", 221 | "\t\tself.datas = datas\n", 222 | "\t\tself.tags = build_tags(tags)\n", 223 | "\t\tself.undict = build_undict(self.tags) \n", 224 | "\t\tself.bidict = build_bidict(self.tags)\n", 225 | "\t\tself.tagword, self.wordcount, self.tagcount = build_count_dict(datas, self.tags)\n", 226 | "\t\tself.postags = [tag for tag in self.undict]\n", 227 | "\n", 228 | "\tdef calc_e_prob(self, *args):\n", 229 | "\t\tif len(args) != 2:\n", 230 | "\t\t\traise ValueError('two tags is required')\n", 231 | "\n", 232 | "\t\tn = 0.0\n", 233 | "\t\tm = 0.0\n", 234 | "\t\tif args in self.tagword:\n", 235 | "\t\t\tn = self.tagword[args]\n", 236 | "\t\tif args[0] in self.undict:\n", 237 | "\t\t\tm = self.undict[args[0]]\n", 238 | "\t\treturn (n + 1) * 1.0 / (m + len(self.wordcount)*len(self.undict))\n", 239 | "\n", 240 | "\tdef calc_prob(self, *args):\n", 241 | "\t\tif len(args) != 2:\n", 242 | "\t\t\traise ValueError('two tags is required')\n", 243 | "\n", 244 | "\t\tn = 0.0\n", 245 | "\t\tm = 0.0\n", 246 | "\t\tif args in self.bidict:\n", 247 | "\t\t\tn = self.bidict[args]\n", 248 | "\t\tif args[0] in self.undict:\n", 249 | "\t\t\tm = self.undict[args[0]]\n", 250 | "\t\treturn (n + 1) * 1.0 / (m + len(self.postags)**2)\n", 251 | "\n", 252 | "\tdef calc_tags_prob(self, tags):\n", 253 | "\t\tprob = 0\n", 254 | "\t\tprev_tag = const.START_TOKEN\n", 255 | "\t\tfor tag in tags:\n", 256 | "\t\t\ttag_prob = self.calc_prob(prev_tag, tag)\n", 257 | "\t\t\tprob += tag_prob\n", 258 | "\t\t\tprev_tag = tag\n", 259 | "\t\treturn prob\n", 260 | "\n", 261 | "\tdef calc_tagword_proba(self, tag, word):\n", 262 | "\t\tprob = 0.0\n", 263 | "\t\ttagword = (tag, word)\n", 264 | "\t\tif tagword in self.tagword:\n", 265 | "\t\t\tprob = float(self.tagword[tagword]) / self.tagcount[tag]\n", 266 | "\t\treturn prob\n", 267 | "\n", 268 | "\t# @param vb _viterbi\n", 269 | "\tdef pred(self, sentence, vb=False):\n", 270 | "\t\tif vb:\n", 271 | "\t\t\t# _viterbi\n", 272 | "\t\t\treturn self._viterbi(sentence)\n", 273 | "\n", 274 | "\t\twordtag = []\n", 275 | "\t\tmax_prob = 0.0\n", 276 | "\t\tmax_tag = None\n", 277 | "\t\t#total_prob = None\n", 278 | "\t\tfor word in sentence:\n", 279 | "\t\t\tfor tag1 in 
self.postags:\n", 280 | "\t\t\t\tfor tag2 in self.postags:\n", 281 | "\t\t\t\t\tq = self.calc_tags_prob((tag1, tag2))\n", 282 | "\t\t\t\t\te = self.calc_tagword_proba(tag2, word)\n", 283 | "\t\t\t\t\tprob = q*e*1.0\n", 284 | "\t\t\t\t\tif prob >= max_prob:\n", 285 | "\t\t\t\t\t\tmax_prob = prob\n", 286 | "\t\t\t\t\t\tmax_tag = tag2\n", 287 | "\t\t\twordtag.append((word, max_tag))\n", 288 | "\t\t\t'''\n", 289 | "\t\t\tif total_prob == None:\n", 290 | "\t\t\t\ttotal_prob = max_prob\n", 291 | "\t\t\telse:\n", 292 | "\t\t\t\ttotal_prob *= max_prob \n", 293 | "\t\t\t'''\n", 294 | "\t\t\tmax_prob = 0.0\t\t\n", 295 | "\t\treturn wordtag\n", 296 | "\n", 297 | "\n", 298 | "\tdef _viterbi_decode(self, sentence, score, trace):\n", 299 | "\t\tresult = []\n", 300 | "\t\ttmp = -float('inf')\n", 301 | "\t\tres_x = 0\n", 302 | "\t\tfor idx, val in enumerate(self.postags):\n", 303 | "\t\t\tif tmp < score[idx][len(sentence)-1]:\n", 304 | "\t\t\t\ttmp = score[idx][len(sentence)-1]\n", 305 | "\t\t\t\tres_x = idx\n", 306 | "\t\tresult.append(res_x)\n", 307 | "\t\tfor idx in range(len(sentence)-1, 0, -1):\n", 308 | "\t\t\tresult.append(trace[result[-1]][idx])\n", 309 | "\t\tresult.reverse()\n", 310 | "\t\tresult_pos = []\n", 311 | "\t\tresult_pos = [self.postags[k] for k in result]\n", 312 | "\t\twordtag = list(zip(sentence, result_pos))\n", 313 | "\t\treturn wordtag\n", 314 | "\n", 315 | "\tdef _viterbi(self, sentence):\n", 316 | "\t\trow = len(self.postags)\n", 317 | "\t\tcol = len(sentence)\n", 318 | "\n", 319 | "\t\ttrace = [[-1 for i in range(col)] for i in range(row)]\n", 320 | "\t\tscore = [[-1 for i in range(col)] for i in range(row)]\n", 321 | "\n", 322 | "\t\tfor idx, val in enumerate(sentence):\n", 323 | "\t\t\tif idx == 0:\n", 324 | "\t\t\t\tfor idx_pos, val_pos in enumerate(self.postags):\n", 325 | "\t\t\t\t\tscore[idx_pos][idx] = self.calc_e_prob(val_pos, sentence[idx]) # emit\n", 326 | "\t\t\telse:\n", 327 | "\t\t\t\tfor idx_pos, val_pos in enumerate(self.postags):\n", 328 | "\t\t\t\t\ttmp = -float('inf')\n", 329 | "\t\t\t\t\ttrace_tmp = -1\n", 330 | "\t\t\t\t\tfor idx_pos2, val_pos2 in enumerate(self.postags):\n", 331 | "\t\t\t\t\t\tr = score[idx_pos2][idx-1]*self.calc_prob(val_pos2, val_pos)\n", 332 | "\t\t\t\t\t\tif r > tmp:\n", 333 | "\t\t\t\t\t\t\ttmp = r\n", 334 | "\t\t\t\t\t\t\ttrace_tmp = idx_pos2\n", 335 | "\t\t\t\t\t\ttrace[idx_pos][idx] = trace_tmp\n", 336 | "\t\t\t\t\t\tscore[idx_pos][idx] = tmp*self.calc_e_prob(val_pos, val)\n", 337 | "\t\treturn self._viterbi_decode(sentence, score, trace)\n", 338 | "\n", 339 | "class TriHMM(BiHMM):\n", 340 | "\tdef __init__(self, datas, tags):\n", 341 | "\t\tBiHMM.__init__(self, datas, tags)\n", 342 | "\t\tself.tridict = build_tridict(self.tags)\n", 343 | "\n", 344 | "\tdef calc_prob(self, *args):\n", 345 | "\t\tif len(args) != 3:\n", 346 | "\t\t\traise ValueError('three tags is required')\n", 347 | "\n", 348 | "\t\tn = 0.0\n", 349 | "\t\tm = 0.0\n", 350 | "\t\tbitup = (args[0], args[1])\n", 351 | "\t\tif args in self.tridict:\n", 352 | "\t\t\tn = self.tridict[args]\n", 353 | "\t\tif bitup in self.bidict:\n", 354 | "\t\t\tm = self.bidict[bitup]\n", 355 | "\t\treturn (n + 1) * 1.0 / (m + len(self.postags)**2)\n", 356 | "\n", 357 | "\n", 358 | "\t\tprob = 0\n", 359 | "\t\tif self.smooth != None:\n", 360 | "\t\t\tprob = self.smooth(args[0], args[1], args[2], tridict=self.tridict, bidict=self.bidict, undict=self.undict)\n", 361 | "\t\telse:\n", 362 | "\t\t\tbitup = (args[0], args[1])\t\t\t\t\n", 363 | "\t\t\tif args in self.tridict and bitup in 
self.bidict:\n", 364 | "\t\t\t\treturn float(self.tridict[args]) / self.bidict[bitup]\n", 365 | "\t\treturn prob\n", 366 | "\n", 367 | "\tdef calc_tags_prob(self, tags):\n", 368 | "\t\tprob = 0\n", 369 | "\t\tprev_stack = [const.START_TOKEN, const.START_TOKEN]\n", 370 | "\t\tfor tag in tags:\n", 371 | "\t\t\ttag_prob = self.calc_prob(prev_stack[0], prev_stack[1], tag)\n", 372 | "\t\t\tprob += tag_prob\n", 373 | "\t\t\tprev_stack[0] = prev_stack[1]\n", 374 | "\t\t\tprev_stack[1] = tag\n", 375 | "\t\treturn prob\n", 376 | "\n", 377 | "\t# @param vb _viterbi\n", 378 | "\tdef pred(self, sentence, vb=False):\n", 379 | "\t\tif vb:\n", 380 | "\t\t\treturn self._viterbi(sentence)\n", 381 | "\t\twordtag = []\n", 382 | "\t\tmax_prob = 0.0\n", 383 | "\t\tmax_tag = None\n", 384 | "\t\t#total_prob = None\n", 385 | "\t\tfor word in sentence:\n", 386 | "\t\t\tfor tag1 in self.postags:\n", 387 | "\t\t\t\tfor tag2 in self.postags:\n", 388 | "\t\t\t\t\tfor tag3 in self.postags:\n", 389 | "\t\t\t\t\t\tq = self.calc_tags_prob((tag1, tag2, tag3))\n", 390 | "\t\t\t\t\t\te = self.calc_tagword_proba(tag3, word)\n", 391 | "\t\t\t\t\t\tprob = q*e*1.0\n", 392 | "\t\t\t\t\t\tif prob >= max_prob:\n", 393 | "\t\t\t\t\t\t\tmax_prob = prob\n", 394 | "\t\t\t\t\t\t\tmax_tag = tag3\n", 395 | "\t\t\twordtag.append((word, max_tag))\n", 396 | "\t\t\t'''\n", 397 | "\t\t\tif total_prob == None:\n", 398 | "\t\t\t\ttotal_prob = max_prob\n", 399 | "\t\t\telse:\n", 400 | "\t\t\t\ttotal_prob *= max_prob \n", 401 | "\t\t\t'''\n", 402 | "\t\t\tmax_prob = 0.0\t\t\n", 403 | "\t\treturn wordtag\n", 404 | "\n", 405 | "\tdef _viterbi_decode(self, sentence, score, trace):\n", 406 | "\t\tresult = []\n", 407 | "\t\ttmp = -float('inf')\n", 408 | "\t\tres_x = 0\n", 409 | "\t\tres_y = 0\n", 410 | "\t\tfor idx, val in enumerate(self.postags):\n", 411 | "\t\t\tfor idx_pos2, val_pos2 in enumerate(self.postags):\n", 412 | "\t\t\t\tif tmp < score[idx_pos2][idx][len(sentence)-1]:\n", 413 | "\t\t\t\t\ttmp = score[idx_pos2][idx][len(sentence)-1]\n", 414 | "\t\t\t\t\tres_x = idx\n", 415 | "\t\t\t\t\tres_y = idx_pos2\n", 416 | "\t\tresult.extend([res_x, res_y])\n", 417 | "\t\tfor idx in range(len(sentence)-1, 0, -1):\n", 418 | "\t\t\tresult.append(trace[result[-2]][result[-1]][idx])\n", 419 | "\t\tresult.reverse()\n", 420 | "\t\tresult_pos = []\n", 421 | "\t\tresult_pos = [self.postags[k] for k in result]\n", 422 | "\t\twordtag = list(zip(sentence, result_pos))\n", 423 | "\t\treturn wordtag\n", 424 | "\n", 425 | "\tdef _viterbi(self, sentence):\n", 426 | "\t\trow = len(self.postags)\n", 427 | "\t\tcol = len(sentence)\n", 428 | "\n", 429 | "\t\ttrace = [[[-1 for i in range(col)] for i in range(row)] for i in range(row)]\n", 430 | "\t\tscore = [[[-1 for i in range(col)] for i in range(row)] for i in range(row)]\n", 431 | "\n", 432 | "\t\tfor idx, val in enumerate(sentence):\n", 433 | "\t\t\tif idx == 0:\n", 434 | "\t\t\t\tfor idx_pos, val_pos in enumerate(self.postags):\n", 435 | "\t\t\t\t\tscore[idx_pos][0][idx] = self.calc_e_prob(val_pos, sentence[idx]) # emit\n", 436 | "\t\t\telse:\n", 437 | "\t\t\t\tfor idx_pos, val_pos in enumerate(self.postags):\n", 438 | "\t\t\t\t\ttmp = -float('inf')\n", 439 | "\t\t\t\t\ttrace_tmp = -1\n", 440 | "\t\t\t\t\tfor idx_pos2, val_pos2 in enumerate(self.postags):\n", 441 | "\t\t\t\t\t\tfor idx_pos3, val_pos3 in enumerate(self.postags):\n", 442 | "\t\t\t\t\t\t\tr = score[idx_pos3][idx_pos2][idx-1]*self.calc_prob(val_pos3, val_pos2 ,val_pos)\n", 443 | "\t\t\t\t\t\t\tif r > tmp:\n", 444 | "\t\t\t\t\t\t\t\ttmp = r\n", 445 | 
"\t\t\t\t\t\t\t\ttrace_tmp = idx_pos3\n", 446 | "\t\t\t\t\t\t\ttrace[idx_pos][idx_pos2][idx] = trace_tmp\n", 447 | "\t\t\t\t\t\t\tscore[idx_pos][idx_pos2][idx] = tmp*self.calc_e_prob(val_pos, val)\n", 448 | "\t\treturn self._viterbi_decode(sentence, score, trace)" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": {}, 454 | "source": [ 455 | "## 结果分析" 456 | ] 457 | }, 458 | { 459 | "cell_type": "markdown", 460 | "metadata": {}, 461 | "source": [ 462 | "** bigram hmm **\n", 463 | "\n", 464 | "bigram hmm\n", 465 | "\n", 466 | "[('小明', 'nr'), ('爱', 'v'), ('老鼠', 'n'), ('和', 'c'), ('狗', 'n')]\n", 467 | "\n", 468 | "bigram hmm with viterbi decode\n", 469 | "\n", 470 | "[('小明', 'nr'), ('爱', 'v'), ('老鼠', 'n'), ('和', 'v'), ('狗', 'n')]\n", 471 | "\n", 472 | "**trigram hmm**\n", 473 | "\n", 474 | "trigram hmm\n", 475 | "\n", 476 | "[('小明', 'nr'), ('爱', 'v'), ('老鼠', 'n'), ('和', 'c'), ('狗', 'n')]\n", 477 | "\n", 478 | "trigram hmm with viterbi decode\n", 479 | "\n", 480 | "[('小明', 'nr'), ('爱', 'v'), ('老鼠', 'n'), ('和', 'c'), ('狗', 'n')]\n" 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": {}, 486 | "source": [ 487 | "## 项目后续\n", 488 | "\n", 489 | "过段时间会加入深度学习在NLP上的应用,如果你感兴趣,可以关注我的公众号,或者star, watch 本项目哦" 490 | ] 491 | }, 492 | { 493 | "cell_type": "markdown", 494 | "metadata": {}, 495 | "source": [ 496 | "## 联系作者\n", 497 | "\n", 498 | "@author sean\n", 499 | "\n", 500 | "@qq 929325776\n", 501 | "\n", 502 | "有什么问题,可以联系我,一起讨论" 503 | ] 504 | } 505 | ], 506 | "metadata": { 507 | "kernelspec": { 508 | "display_name": "Python 3", 509 | "language": "python", 510 | "name": "python3" 511 | }, 512 | "language_info": { 513 | "codemirror_mode": { 514 | "name": "ipython", 515 | "version": 3 516 | }, 517 | "file_extension": ".py", 518 | "mimetype": "text/x-python", 519 | "name": "python", 520 | "nbconvert_exporter": "python", 521 | "pygments_lexer": "ipython3", 522 | "version": "3.6.1" 523 | } 524 | }, 525 | "nbformat": 4, 526 | "nbformat_minor": 2 527 | } 528 | -------------------------------------------------------------------------------- /hmm/src/const.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | ''' 6 | @description: 定义常量 7 | @author: Sean QQ: 929325776 8 | ''' 9 | 10 | UNK = None 11 | START_TOKEN = '' 12 | END_TOKEN = '' 13 | -------------------------------------------------------------------------------- /hmm/src/corpus/toy/train.txt: -------------------------------------------------------------------------------- 1 | 猫/n 抓/v 老鼠/n 2 | 狗/n 追/v 猫/n 3 | 小明/nr 爱/v 狗/n 和/c 猫/n 4 | -------------------------------------------------------------------------------- /hmm/src/dataset.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | def load_data(file_path): 5 | datas, tags = [], [] 6 | with open(file_path, 'r') as f: 7 | for line in f: 8 | line = line.strip() 9 | splits = line.split(' ') 10 | data, tag = [], [] 11 | for part in splits: 12 | parts = part.split('/') 13 | data.append(parts[0]) 14 | tag.append(parts[1]) 15 | datas.append(data) 16 | tags.append(tag) 17 | return datas, tags 18 | 19 | -------------------------------------------------------------------------------- /hmm/src/hmm.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | ''' 5 | @description: bigram hmm, 
trigram hmm, _viterbi 6 | @author: Sean QQ: 929325776 7 | ''' 8 | 9 | import math 10 | import const 11 | from processing import * 12 | 13 | '''bigram hmm''' 14 | class BiHMM(object): 15 | def __init__(self, datas, tags): 16 | self.datas = datas 17 | self.tags = build_tags(tags) 18 | self.undict = build_undict(self.tags) 19 | self.bidict = build_bidict(self.tags) 20 | self.tagword, self.wordcount, self.tagcount = build_count_dict(datas, self.tags) 21 | self.postags = [tag for tag in self.undict] 22 | 23 | def calc_e_prob(self, *args): 24 | if len(args) != 2: 25 | raise ValueError('two tags is required') 26 | 27 | n = 0.0 28 | m = 0.0 29 | if args in self.tagword: 30 | n = self.tagword[args] 31 | if args[0] in self.undict: 32 | m = self.undict[args[0]] 33 | return (n + 1) * 1.0 / (m + len(self.wordcount)*len(self.undict)) 34 | 35 | def calc_prob(self, *args): 36 | if len(args) != 2: 37 | raise ValueError('two tags is required') 38 | 39 | n = 0.0 40 | m = 0.0 41 | if args in self.bidict: 42 | n = self.bidict[args] 43 | if args[0] in self.undict: 44 | m = self.undict[args[0]] 45 | return (n + 1) * 1.0 / (m + len(self.postags)**2) 46 | 47 | def calc_tags_prob(self, tags): 48 | prob = 0 49 | prev_tag = const.START_TOKEN 50 | for tag in tags: 51 | tag_prob = self.calc_prob(prev_tag, tag) 52 | prob += tag_prob 53 | prev_tag = tag 54 | return prob 55 | 56 | def calc_tagword_proba(self, tag, word): 57 | prob = 0.0 58 | tagword = (tag, word) 59 | if tagword in self.tagword: 60 | prob = float(self.tagword[tagword]) / self.tagcount[tag] 61 | return prob 62 | 63 | # @param vb _viterbi 64 | def pred(self, sentence, vb=False): 65 | if vb: 66 | # _viterbi 67 | return self._viterbi(sentence) 68 | 69 | wordtag = [] 70 | max_prob = 0.0 71 | max_tag = None 72 | #total_prob = None 73 | for word in sentence: 74 | for tag1 in self.postags: 75 | for tag2 in self.postags: 76 | q = self.calc_tags_prob((tag1, tag2)) 77 | e = self.calc_tagword_proba(tag2, word) 78 | prob = q*e*1.0 79 | if prob >= max_prob: 80 | max_prob = prob 81 | max_tag = tag2 82 | wordtag.append((word, max_tag)) 83 | ''' 84 | if total_prob == None: 85 | total_prob = max_prob 86 | else: 87 | total_prob *= max_prob 88 | ''' 89 | max_prob = 0.0 90 | return wordtag 91 | 92 | 93 | def _viterbi_decode(self, sentence, score, trace): 94 | result = [] 95 | tmp = -float('inf') 96 | res_x = 0 97 | for idx, val in enumerate(self.postags): 98 | if tmp < score[idx][len(sentence)-1]: 99 | tmp = score[idx][len(sentence)-1] 100 | res_x = idx 101 | result.append(res_x) 102 | for idx in range(len(sentence)-1, 0, -1): 103 | result.append(trace[result[-1]][idx]) 104 | result.reverse() 105 | result_pos = [] 106 | result_pos = [self.postags[k] for k in result] 107 | wordtag = list(zip(sentence, result_pos)) 108 | return wordtag 109 | 110 | def _viterbi(self, sentence): 111 | row = len(self.postags) 112 | col = len(sentence) 113 | 114 | trace = [[-1 for i in range(col)] for i in range(row)] 115 | score = [[-1 for i in range(col)] for i in range(row)] 116 | 117 | for idx, val in enumerate(sentence): 118 | if idx == 0: 119 | for idx_pos, val_pos in enumerate(self.postags): 120 | score[idx_pos][idx] = self.calc_e_prob(val_pos, sentence[idx]) # emit 121 | else: 122 | for idx_pos, val_pos in enumerate(self.postags): 123 | tmp = -float('inf') 124 | trace_tmp = -1 125 | for idx_pos2, val_pos2 in enumerate(self.postags): 126 | r = score[idx_pos2][idx-1]*self.calc_prob(val_pos2, val_pos) 127 | if r > tmp: 128 | tmp = r 129 | trace_tmp = idx_pos2 130 | trace[idx_pos][idx] = 
trace_tmp 131 | score[idx_pos][idx] = tmp*self.calc_e_prob(val_pos, val) 132 | return self._viterbi_decode(sentence, score, trace) 133 | 134 | class TriHMM(BiHMM): 135 | def __init__(self, datas, tags): 136 | BiHMM.__init__(self, datas, tags) 137 | self.tridict = build_tridict(self.tags) 138 | 139 | def calc_prob(self, *args): 140 | if len(args) != 3: 141 | raise ValueError('three tags is required') 142 | 143 | n = 0.0 144 | m = 0.0 145 | bitup = (args[0], args[1]) 146 | if args in self.tridict: 147 | n = self.tridict[args] 148 | if bitup in self.bidict: 149 | m = self.bidict[bitup] 150 | return (n + 1) * 1.0 / (m + len(self.postags)**2) 151 | 152 | 153 | prob = 0 154 | if self.smooth != None: 155 | prob = self.smooth(args[0], args[1], args[2], tridict=self.tridict, bidict=self.bidict, undict=self.undict) 156 | else: 157 | bitup = (args[0], args[1]) 158 | if args in self.tridict and bitup in self.bidict: 159 | return float(self.tridict[args]) / self.bidict[bitup] 160 | return prob 161 | 162 | def calc_tags_prob(self, tags): 163 | prob = 0 164 | prev_stack = [const.START_TOKEN, const.START_TOKEN] 165 | for tag in tags: 166 | tag_prob = self.calc_prob(prev_stack[0], prev_stack[1], tag) 167 | prob += tag_prob 168 | prev_stack[0] = prev_stack[1] 169 | prev_stack[1] = tag 170 | return prob 171 | 172 | # @param vb _viterbi 173 | def pred(self, sentence, vb=False): 174 | if vb: 175 | return self._viterbi(sentence) 176 | wordtag = [] 177 | max_prob = 0.0 178 | max_tag = None 179 | #total_prob = None 180 | for word in sentence: 181 | for tag1 in self.postags: 182 | for tag2 in self.postags: 183 | for tag3 in self.postags: 184 | q = self.calc_tags_prob((tag1, tag2, tag3)) 185 | e = self.calc_tagword_proba(tag3, word) 186 | prob = q*e*1.0 187 | if prob >= max_prob: 188 | max_prob = prob 189 | max_tag = tag3 190 | wordtag.append((word, max_tag)) 191 | ''' 192 | if total_prob == None: 193 | total_prob = max_prob 194 | else: 195 | total_prob *= max_prob 196 | ''' 197 | max_prob = 0.0 198 | return wordtag 199 | 200 | def _viterbi_decode(self, sentence, score, trace): 201 | result = [] 202 | tmp = -float('inf') 203 | res_x = 0 204 | res_y = 0 205 | for idx, val in enumerate(self.postags): 206 | for idx_pos2, val_pos2 in enumerate(self.postags): 207 | if tmp < score[idx_pos2][idx][len(sentence)-1]: 208 | tmp = score[idx_pos2][idx][len(sentence)-1] 209 | res_x = idx 210 | res_y = idx_pos2 211 | result.extend([res_x, res_y]) 212 | for idx in range(len(sentence)-1, 0, -1): 213 | result.append(trace[result[-2]][result[-1]][idx]) 214 | result.reverse() 215 | result_pos = [] 216 | result_pos = [self.postags[k] for k in result] 217 | wordtag = list(zip(sentence, result_pos)) 218 | return wordtag 219 | 220 | def _viterbi(self, sentence): 221 | row = len(self.postags) 222 | col = len(sentence) 223 | 224 | trace = [[[-1 for i in range(col)] for i in range(row)] for i in range(row)] 225 | score = [[[-1 for i in range(col)] for i in range(row)] for i in range(row)] 226 | 227 | for idx, val in enumerate(sentence): 228 | if idx == 0: 229 | for idx_pos, val_pos in enumerate(self.postags): 230 | score[idx_pos][0][idx] = self.calc_e_prob(val_pos, sentence[idx]) # emit 231 | else: 232 | for idx_pos, val_pos in enumerate(self.postags): 233 | tmp = -float('inf') 234 | trace_tmp = -1 235 | for idx_pos2, val_pos2 in enumerate(self.postags): 236 | for idx_pos3, val_pos3 in enumerate(self.postags): 237 | r = score[idx_pos3][idx_pos2][idx-1]*self.calc_prob(val_pos3, val_pos2 ,val_pos) 238 | if r > tmp: 239 | tmp = r 240 | 
trace_tmp = idx_pos3 241 | trace[idx_pos][idx_pos2][idx] = trace_tmp 242 | score[idx_pos][idx_pos2][idx] = tmp*self.calc_e_prob(val_pos, val) 243 | return self._viterbi_decode(sentence, score, trace) -------------------------------------------------------------------------------- /hmm/src/main.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | from dataset import load_data 5 | from hmm import * 6 | 7 | datas, tags = load_data('./corpus/toy/train.txt') 8 | 9 | ################## Bigram HMM start ##################### 10 | print('\n************** bigram hmm **************\n') 11 | bihmm = BiHMM(datas, tags) 12 | print("bigram hmm") 13 | print(bihmm.pred(['小明', '爱', '老鼠', '和', '狗'])) 14 | print("bigram hmm with viterbi decode") 15 | print(bihmm.pred(['小明', '爱', '老鼠', '和', '狗'], vb=True)) 16 | ################## Bigram HMM end ##################### 17 | 18 | ################## Trigram HMM start ##################### 19 | print('\n************* trigram hmm *************\n') 20 | trihmm = TriHMM(datas, tags) 21 | print("trigram hmm") 22 | print(trihmm.pred(['小明', '爱', '老鼠', '和', '狗'])) 23 | print("trigram hmm with viterbi decode") 24 | print(trihmm.pred(['小明', '爱', '老鼠', '和', '狗'], vb=True)) 25 | ################## Trigram HMM end ##################### 26 | -------------------------------------------------------------------------------- /hmm/src/processing.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | ''' 5 | @description: 句子的处理,字典的构建 6 | @author: Sean QQ: 929325776 7 | ''' 8 | 9 | import const 10 | 11 | #加入起始标记 12 | def build_tags(tags): 13 | out = [] 14 | for sentence in tags: 15 | items = [x.lower() for x in sentence] 16 | items.insert(0, const.START_TOKEN) 17 | items.append(const.END_TOKEN) 18 | out.append(items) 19 | return out 20 | 21 | # 构建ungram词频词典 22 | def build_undict(tags): 23 | undict = {} 24 | for items in tags: 25 | for word in items: 26 | if word == const.START_TOKEN or word == const.END_TOKEN: 27 | continue 28 | if word not in undict: 29 | undict[word] = 1 30 | else: 31 | undict[word] += 1 32 | return undict 33 | 34 | 35 | # 构建bigram词频词典,其中以三元组(u, v)作为词典的键 36 | def build_bidict(tags): 37 | bidict = {} 38 | for items in tags: 39 | for i in range(len(items)-1): 40 | tup = (items[i], items[i+1]) 41 | if tup not in bidict: 42 | bidict[tup] = 1 43 | else: 44 | bidict[tup] += 1 45 | return bidict 46 | 47 | # 构建trigram词频词典,其中以三元组(u, v, w)作为词典的键 48 | def build_tridict(tags): 49 | tridict = {} 50 | for items in tags: 51 | items.insert(0, const.START_TOKEN) 52 | for i in range(len(items) -2): 53 | tup = (items[i], items[i+1], items[i+2]) 54 | if tup not in tridict: 55 | tridict[tup] = 1 56 | else: 57 | tridict[tup] += 1 58 | return tridict 59 | 60 | # 构建(词,词性)词频字典,以及统计词频 61 | def build_count_dict(datas, tags): 62 | tagword_dict = {} 63 | wordcount = {} 64 | tagcount = {} 65 | for i, data in enumerate(datas): 66 | tag = tags[i][1:-1] 67 | for idx, d in enumerate(data): 68 | tup = (tag[idx], d) 69 | if tup not in tagword_dict: 70 | tagword_dict[tup] = 1 71 | else: 72 | tagword_dict[tup] += 1 73 | 74 | if d not in wordcount: 75 | wordcount[d] = 1 76 | else: 77 | wordcount[d] += 1 78 | if tag[idx] not in tagcount: 79 | tagcount[tag[idx]] = 1 80 | else: 81 | tagcount[tag[idx]] += 1 82 | return tagword_dict, wordcount, tagcount 83 | 
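84 | # Usage sketch (hypothetical __main__ demo, mirroring the tags of the toy corpus above):
85 | # build_tags adds the start/end markers, then the dictionary builders simply count
86 | # unigram/bigram tag tuples. The expected outputs in the comments hold for this toy input.
87 | if __name__ == '__main__':
88 |     toy_tags = build_tags([['n', 'v', 'n'], ['nr', 'v', 'n', 'c', 'n']])
89 |     print(build_undict(toy_tags))  # {'n': 4, 'v': 2, 'nr': 1, 'c': 1}
90 |     print(build_bidict(toy_tags))  # counts of adjacent (tag, tag) pairs, incl. start/end markers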
-------------------------------------------------------------------------------- /language_model/README.md: -------------------------------------------------------------------------------- 1 | # language model 2 | 实现了 3 | 4 | * unigram 5 | * bigram 6 | * trigram 7 | 8 | 采用了困惑度perplexity对模型评价, 采用了smooth方法 9 | -------------------------------------------------------------------------------- /language_model/languange_model_tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# python实现n-gram\n", 8 | "\n", 9 | "本例子主要受 Michael Collins 教授的 Language Modeling 启发而编写,为了帮助大家理解语言模型,我在我的博客、公众号上发表了文章[一文读懂NLP中的语言模型(公众号)](http://mp.weixin.qq.com/s?__biz=MzIwNDM1NjUzMA==&mid=2247483658&idx=1&sn=9c5e7cc50b65cf31a08f1e2a0046ceb1&chksm=96c02fd7a1b7a6c1bbabe19145665d370020f4a3e89ebdc1226a1ec4ed110ef089c6fb0212c4&mpshare=1&scene=1&srcid=1114A1PGK4rDqKMMbsAmplr3#rd),欢迎大家阅读。当然强烈推荐[Michael Collins 教授的 Language Modeling 原文](http://www.cs.columbia.edu/~mcollins/lm-spring2013.pdf)" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## 目录\n", 17 | "\n", 18 | "1. [项目结构](#项目结构)\n", 19 | "\n", 20 | "2. [环境要求](#环境要求)\n", 21 | "\n", 22 | "3. [代码分析](#代码分析)\n", 23 | "\n", 24 | "4. [结果分析](#结果分析)\n", 25 | "\n", 26 | "5. [项目后续](#项目后续)\n", 27 | "\n", 28 | "6. [联系作者](#联系作者)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "### 项目结构\n", 36 | "\n", 37 | "| - src\n", 38 | " \n", 39 | " | - const.py 常量定义文件\n", 40 | " \n", 41 | " | - corpus 语料库\n", 42 | " \n", 43 | " | - dataset.py 加载语料\n", 44 | " \n", 45 | " | - evaluate.py 模型的评估方法\n", 46 | " \n", 47 | " | - main.py 例子程序\n", 48 | " \n", 49 | " | - ngram.py ungram, bigram, trigram 模型,以及一些模型方法\n", 50 | " \n", 51 | " | - processing.py 字典的生成等处理方法\n", 52 | " \n", 53 | " | - smooth.py 平滑方法" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## 环境要求\n", 61 | "\n", 62 | " python3" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "## 代码分析" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "### const.py\n", 77 | "\n", 78 | "在这里定义了三个常量" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 1, 84 | "metadata": { 85 | "collapsed": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "# 未登录词\n", 90 | "UNK = None\n", 91 | "# 句子开始标记,代表句子的开头\n", 92 | "START_TOKEN = ''\n", 93 | "# 句子结束标记,代表句子的结尾\n", 94 | "END_TOKEN = ''" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "### processing.py" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": { 108 | "collapsed": true 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "import const\n", 113 | "\n", 114 | "#加入起始标记\n", 115 | "def build_sentences(sentences):\n", 116 | " out = []\n", 117 | " for sentence in sentences:\n", 118 | " words = [x.lower() for x in sentence]\n", 119 | " words.insert(0, \"\")\n", 120 | " words.append(\"\")\n", 121 | " out.append(words)\n", 122 | " return out\n", 123 | "\n", 124 | "# 构建ungram词频词典\n", 125 | "def build_undict(sentences):\n", 126 | " undict = {}\n", 127 | " total = 0\n", 128 | " for words in sentences:\n", 129 | " for word in words:\n", 130 | " if word not in undict:\n", 131 | " undict[word] = 1\n", 132 | " else:\n", 133 | " undict[word] += 
1\n", 134 | " if word != const.START_TOKEN and word != const.END_TOKEN:\n", 135 | " total += 1\n", 136 | " return undict, total\n", 137 | "\n", 138 | "# 构建bigram词频词典,其中以三元组(u, v)作为词典的键\n", 139 | "def build_bidict(sentences):\n", 140 | " bidict = {}\n", 141 | " for words in sentences:\n", 142 | " for i in range(len(words)-1):\n", 143 | " tup = (words[i], words[i+1])\n", 144 | " if tup not in bidict:\n", 145 | " bidict[tup] = 1\n", 146 | " else:\n", 147 | " bidict[tup] += 1\n", 148 | " return bidict\n", 149 | "\n", 150 | "# 构建trigram词频词典,其中以三元组(u, v, w)作为词典的键\n", 151 | "def build_tridict(sentences):\n", 152 | " tridict = {}\n", 153 | " for words in sentences:\n", 154 | " for i in range(len(words) -2):\n", 155 | " tup = (words[i], words[i+1], words[i+2])\n", 156 | " if tup not in tridict:\n", 157 | " tridict[tup] = 1\n", 158 | " else:\n", 159 | " tridict[tup] += 1\n", 160 | " return tridict" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "### ngram.py\n", 168 | "\n", 169 | "n-gram模型,实现了ungram, bigram, trigram" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": { 176 | "collapsed": true 177 | }, 178 | "outputs": [], 179 | "source": [ 180 | "import math\n", 181 | "import const\n", 182 | "from processing import *\n", 183 | "\n", 184 | "'''\n", 185 | "@function calc_prob \t\t\t计算条件概率,这里使用最大似然估计(max-likelihood estimate)去计算概率\n", 186 | "@function calc_sentence_prob\t计算句子的条件概率\n", 187 | "'''\n", 188 | "class UnGram(object):\n", 189 | "\tdef __init__(self, sentences, smooth = None):\n", 190 | "\t\tself.undict, self.total = build_undict(sentences)\n", 191 | "\t\tself.smooth = smooth\n", 192 | "\n", 193 | "\tdef calc_prob(self, word):\n", 194 | "\t\tprob = 0\n", 195 | "\t\tif self.smooth != None:\n", 196 | "\t\t\tprob = self.smooth(word, undict=self.undict, total=self.total)\n", 197 | "\t\telse:\n", 198 | "\t\t\tif word in self.undict:\n", 199 | "\t\t\t\tprob = float(self.undict[word]) / self.total\n", 200 | "\t\treturn prob\n", 201 | "\n", 202 | "\tdef calc_sentence_prob(self, sentence, prob_log=True):\n", 203 | "\t\tprob_log_sum = 0\n", 204 | "\t\tfor word in sentence:\n", 205 | "\t\t\tif word != const.START_TOKEN and word != const.END_TOKEN:\n", 206 | "\t\t\t\tword_prob = self.calc_prob(word)\n", 207 | "\t\t\t\tif word_prob != 0:\n", 208 | "\t\t\t\t\tprob_log_sum += math.log(word_prob, 2)\n", 209 | "\t\treturn math.pow(2, prob_log_sum) if prob_log else prob_log_sum\n", 210 | "\n", 211 | "\tdef sort_vocab(self):\n", 212 | "\t\tvocabs = list(self.undict.keys())\n", 213 | "\t\tvocabs.remove(const.START_TOKEN)\n", 214 | "\t\tvocabs.remove(const.END_TOKEN)\n", 215 | "\t\tvocabs.sort()\n", 216 | "\t\tvocabs.append(const.UNK)\n", 217 | "\t\tvocabs.append(const.START_TOKEN)\n", 218 | "\t\tvocabs.append(const.END_TOKEN)\n", 219 | "\t\treturn vocabs\n", 220 | "\n", 221 | "class BiGram(UnGram):\n", 222 | "\tdef __init__(self, sentences, smooth = None):\n", 223 | "\t\tUnGram.__init__(self, sentences, smooth)\n", 224 | "\t\tself.bidict = build_bidict(sentences)\n", 225 | "\n", 226 | "\tdef calc_prob(self, *args):\n", 227 | "\t\tif len(args) != 2:\n", 228 | "\t\t\traise ValueError('two words is required')\n", 229 | "\n", 230 | "\t\tprob = 0\n", 231 | "\t\tif self.smooth != None:\n", 232 | "\t\t\tprob = self.smooth(args[0], args[1], bidict=self.bidict, undict=self.undict)\n", 233 | "\t\telse:\n", 234 | "\t\t\tif args in self.bidict and args[0] in self.undict:\n", 235 | "\t\t\t\treturn float(self.bidict[args]) / 
self.undict[args[0]]\n", 236 | "\t\treturn prob\n", 237 | "\n", 238 | "\tdef calc_sentence_prob(self, sentence, prob_log=True):\n", 239 | "\t\tprob_log_sum = 0\n", 240 | "\t\tprev_word = None\n", 241 | "\t\tfor word in sentence:\n", 242 | "\t\t\tif prev_word != None:\n", 243 | "\t\t\t\tword_prob = self.calc_prob(prev_word, word)\n", 244 | "\t\t\t\tprob_log_sum += word_prob\n", 245 | "\t\t\tprev_word = word\n", 246 | "\t\treturn math.pow(2, prob_log_sum) if prob_log else prob_log_sum\n", 247 | "\n", 248 | "\n", 249 | "class TriGram(BiGram):\n", 250 | "\tdef __init__(self, sentences, smooth = None):\n", 251 | "\t\tBiGram.__init__(self, sentences, smooth)\n", 252 | "\t\tself.tridict = build_tridict(sentences)\n", 253 | "\n", 254 | "\tdef calc_prob(self, *args):\n", 255 | "\t\tif len(args) != 3:\n", 256 | "\t\t\traise ValueError('three words is required')\n", 257 | "\n", 258 | "\t\tprob = 0\n", 259 | "\t\tif self.smooth != None:\n", 260 | "\t\t\tprob = self.smooth(args[0], args[1], args[2], tridict=self.tridict, bidict=self.bidict, undict=self.undict)\n", 261 | "\t\telse:\n", 262 | "\t\t\tbitup = (args[0], args[1])\t\t\t\t\n", 263 | "\t\t\tif args in self.tridict and bitup in self.bidict:\n", 264 | "\t\t\t\treturn float(self.tridict[args]) / self.bidict[bitup]\n", 265 | "\t\treturn prob\n", 266 | "\n", 267 | "\tdef calc_sentence_prob(self, sentence, prob_log=True):\n", 268 | "\t\tprob_log_sum = 0\n", 269 | "\t\tprev_stack = []\n", 270 | "\t\tfor word in sentence:\n", 271 | "\t\t\tif len(prev_stack) < 2:\n", 272 | "\t\t\t\tprev_stack.append(word)\n", 273 | "\t\t\telif len(prev_stack) == 2:\n", 274 | "\t\t\t\tword_prob = self.calc_prob(prev_stack[0], prev_stack[1], word)\n", 275 | "\t\t\t\tprob_log_sum += word_prob\n", 276 | "\t\t\t\tprev_stack[0] = prev_stack[1]\n", 277 | "\t\t\t\tprev_stack[1] = word\n", 278 | "\t\treturn math.pow(2, prob_log_sum) if prob_log else prob_log_sum\n", 279 | "\n", 280 | "'''\n", 281 | "@function: calc_xxgram_count 主要用来统计语料库中词的总数\n", 282 | "@function: print_xxgram_probas 格式化输出概率 \n", 283 | "'''\n", 284 | "class GramUtil(object):\n", 285 | "\n", 286 | "\t@staticmethod\n", 287 | "\tdef calc_ungram_count(sentences):\n", 288 | "\t\tcount = 0\n", 289 | "\t\tfor sentence in sentences:\n", 290 | "\t\t\t# except START_TOKEN and END_TOKEN\n", 291 | "\t\t\tcount += len(sentence) - 2\n", 292 | "\t\treturn count\n", 293 | "\n", 294 | "\t@staticmethod\n", 295 | "\tdef calc_bigram_count(sentences):\n", 296 | "\t\tcount = 0\n", 297 | "\t\tfor sentence in sentences:\n", 298 | "\t\t\tcount += len(sentence) - 1\n", 299 | "\t\treturn count\n", 300 | "\n", 301 | "\t@staticmethod\n", 302 | "\tdef calc_trigram_count(sentences):\n", 303 | "\t\tcount = 0\n", 304 | "\t\tfor sentence in sentences:\n", 305 | "\t\t\tcount += len(sentence)\n", 306 | "\t\treturn count\n", 307 | "\n", 308 | "\t@staticmethod\n", 309 | "\tdef print_ungram_probs(model, vocabs):\n", 310 | "\t\tfor vocab in vocabs:\n", 311 | "\t\t\tif vocab != const.START_TOKEN and vocab != const.END_TOKEN:\n", 312 | "\t\t\t\tprint(\"{} \\t {}\".format(vocab if vocab != const.UNK else 'UNK', model.calc_prob(vocab)))\n", 313 | "\n", 314 | "\t@staticmethod\n", 315 | "\tdef print_bigram_probs(model, vocabs):\n", 316 | "\t\tprint(\"\\t\\t\", end=\"\")\n", 317 | "\t\tfor vocab in vocabs:\n", 318 | "\t\t\tif vocab != const.START_TOKEN:\n", 319 | "\t\t\t\tprint(vocab if vocab != const.UNK else \"UNK\", end=\"\\t\\t\")\n", 320 | "\t\tprint(\"\")\n", 321 | "\t\tfor vocab in vocabs:\n", 322 | "\t\t\tif vocab != const.END_TOKEN:\n", 323 | 
"\t\t\t\tprint(vocab if vocab != const.UNK else \"UNK\", end=\"\\t\\t\")\n", 324 | "\t\t\t\tfor vocab2 in vocabs:\n", 325 | "\t\t\t\t\tif vocab2 != const.START_TOKEN:\n", 326 | "\t\t\t\t\t\tprint(\"{0:.3f}\".format(model.calc_prob(vocab, vocab2)), end=\"\\t\\t\")\n", 327 | "\t\t\t\tprint(\"\")\n", 328 | "\n", 329 | "\t@staticmethod\n", 330 | "\tdef print_trigram_probs(model, vocabs):\n", 331 | "\t\tprint(\"\\t\\t\", end=\"\")\n", 332 | "\t\tfor vocab in vocabs:\n", 333 | "\t\t\tif vocab != const.START_TOKEN:\n", 334 | "\t\t\t\tprint(vocab if vocab != const.UNK else \"UNK\", end=\"\\t\")\n", 335 | "\t\tprint(\"\")\n", 336 | "\t\tfor vocab in vocabs:\n", 337 | "\t\t\tif vocab != const.END_TOKEN:\n", 338 | "\t\t\t\tfor vocab2 in vocabs:\n", 339 | "\t\t\t\t\tif vocab2 != const.START_TOKEN and vocab != const.UNK and vocab2 != const.UNK and vocab2 != const.END_TOKEN:\n", 340 | "\t\t\t\t\t\tprint(vocab, vocab2 if vocab2 != const.UNK else \"UNK\", end=\"\\t\\t\")\n", 341 | "\t\t\t\t\t\tfor vocab3 in vocabs:\n", 342 | "\t\t\t\t\t\t\tif vocab3 != const.END_TOKEN\n", 343 | "\t\t\t\t\t\t\t\tprint(\"{0:.3f}\".format(model.calc_prob(vocab, vocab2, vocab3)), end=\"\\t\")\n", 344 | "\t\t\t\t\t\tprint(\"\")\n" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": {}, 350 | "source": [ 351 | "### evaluate.py\n", 352 | "\n", 353 | "模型的评估,这里主要用了困惑度Perplexity" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": { 360 | "collapsed": true 361 | }, 362 | "outputs": [], 363 | "source": [ 364 | "import math\n", 365 | "\n", 366 | "# 计算困惑度\n", 367 | "def perplexity(model, sentences, cal_gram_func):\n", 368 | " # gram_count 词的总数,对应教程中的 M\n", 369 | "\tgram_count = cal_gram_func(sentences)\n", 370 | "\tprob_log_sum = 0\n", 371 | "\tfor sentence in sentences:\n", 372 | "\t\ttry:\n", 373 | "\t\t\tprob_log_sum -= math.log(model.calc_sentence_prob(sentence), 2)\n", 374 | "\t\texcept:\n", 375 | "\t\t\tprob_log_sum -= float('-inf')\n", 376 | "\t\treturn math.pow(2, prob_log_sum/gram_count)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": {}, 382 | "source": [ 383 | "## 结果分析" 384 | ] 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "metadata": {}, 389 | "source": [ 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | "
| **#** | **smooth** | **unsmooth** |
| --- | --- | --- |
| 你好不 | 2.99167 | 3.97368 |
| 好不你 | 1.10409 | 1.21901 |
| 你是不 | 1.75263 | 2.06712 |
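For reference, the smooth column above corresponds to the discounting estimate implemented in smooth.py (default discount d = 0.5): a bigram seen in training is estimated as

$$ p_{\text{smooth}}(w_i \mid w_{i-1}) = \frac{c(w_{i-1}, w_i) - d}{c(w_{i-1})}, \qquad d = 0.5, $$

while an unseen bigram still receives probability 0. Note that the sentence scores can exceed 1 because calc_sentence_prob in ngram.py accumulates the raw conditional probabilities and then applies math.pow(2, ·) rather than summing log2-probabilities, so they are relative scores, not normalized probabilities.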
\n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | "
| **#** | **smooth** | **unsmooth** |
| --- | --- | --- |
| Perplexity | 0.91272 | 0.89138 |
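The perplexity row follows the definition used in evaluate.py, where $M$ is the token count returned by GramUtil.calc_bigram_count and $p(s_i)$ is the sentence score from calc_sentence_prob:

$$ \mathrm{PP} = 2^{-\frac{1}{M}\sum_{i}\log_2 p(s_i)} $$

Lower perplexity indicates that the model assigns higher probability to the test sentences.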
" 424 | ] 425 | }, 426 | { 427 | "cell_type": "markdown", 428 | "metadata": {}, 429 | "source": [ 430 | "## 项目后续\n", 431 | "\n", 432 | "过段时间会加入深度学习在语言模型上的应用,如果你感兴趣,可以关注我的公众号,或者star, watch 本项目哦" 433 | ] 434 | }, 435 | { 436 | "cell_type": "markdown", 437 | "metadata": {}, 438 | "source": [ 439 | "## 联系作者\n", 440 | "\n", 441 | "@author sean\n", 442 | "\n", 443 | "@qq 929325776\n", 444 | "\n", 445 | "有什么问题,可以联系我,一起讨论" 446 | ] 447 | } 448 | ], 449 | "metadata": { 450 | "kernelspec": { 451 | "display_name": "Python 3", 452 | "language": "python", 453 | "name": "python3" 454 | }, 455 | "language_info": { 456 | "codemirror_mode": { 457 | "name": "ipython", 458 | "version": 3 459 | }, 460 | "file_extension": ".py", 461 | "mimetype": "text/x-python", 462 | "name": "python", 463 | "nbconvert_exporter": "python", 464 | "pygments_lexer": "ipython3", 465 | "version": "3.6.1" 466 | } 467 | }, 468 | "nbformat": 4, 469 | "nbformat_minor": 2 470 | } 471 | -------------------------------------------------------------------------------- /language_model/src/const.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | ''' 6 | @description: 定义常量 7 | @author: Sean QQ: 929325776 8 | ''' 9 | 10 | UNK = None 11 | START_TOKEN = '' 12 | END_TOKEN = '' 13 | -------------------------------------------------------------------------------- /language_model/src/corpus/toy/test.txt: -------------------------------------------------------------------------------- 1 | 你 好 不 2 | 好 不 你 3 | 你 是 不 4 | -------------------------------------------------------------------------------- /language_model/src/corpus/toy/train.txt: -------------------------------------------------------------------------------- 1 | 你 好 2 | 你 好 吗 3 | 好 了 吗 4 | 你 好 了 5 | 不 好 了 6 | 你 不 好 7 | 你 好 不 8 | -------------------------------------------------------------------------------- /language_model/src/dataset.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | ''' 5 | @description: 加载语料,并加入起始标记 6 | @author: Sean QQ: 929325776 7 | ''' 8 | import re 9 | from processing import build_sentences 10 | 11 | def load_dataset(file_path): 12 | with open(file_path, "r") as f: 13 | return build_sentences([re.split("\s+", line.rstrip('\n')) for line in f]) 14 | -------------------------------------------------------------------------------- /language_model/src/evaluate.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | ''' 5 | @description: 模型评估 6 | @author: Sean QQ: 929325776 7 | ''' 8 | 9 | import math 10 | 11 | # 计算困惑度 12 | def perplexity(model, sentences, cal_gram_func): 13 | # gram_count 词的总数,对应教程中的 M 14 | gram_count = cal_gram_func(sentences) 15 | prob_log_sum = 0 16 | for sentence in sentences: 17 | try: 18 | prob_log_sum -= math.log(model.calc_sentence_prob(sentence), 2) 19 | except: 20 | prob_log_sum -= float('-inf') 21 | return math.pow(2, prob_log_sum/gram_count) 22 | 23 | -------------------------------------------------------------------------------- /language_model/src/main.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | from dataset import load_dataset 5 | from smooth import Smooth 6 | from ngram import * 7 | from evaluate import perplexity 8 | 9 | train_dataset = 
load_dataset('./corpus/toy/train.txt') 10 | test_dataset = load_dataset('./corpus/toy/test.txt') 11 | 12 | 13 | ###################### ungram start ###################### 14 | 15 | ''' 16 | model_unsmooth = UnGram(train_dataset) 17 | model_smooth = UnGram(train_dataset, Smooth.discounting) 18 | 19 | vocabs = model_unsmooth.sort_vocab() 20 | 21 | print("- ungram unsmooth -") 22 | GramUtil.print_ungram_probs(model_unsmooth, vocabs) 23 | 24 | print("- ungram smooth -") 25 | GramUtil.print_ungram_probs(model_smooth, vocabs) 26 | 27 | print('- sentence_prob -') 28 | print("\t\t smooth\t\t unsmooth") 29 | for sentence in test_dataset: 30 | smooth = "{0:.5f}".format(model_smooth.calc_sentence_prob(sentence)) 31 | unsmooth = "{0:.5f}".format(model_unsmooth.calc_sentence_prob(sentence)) 32 | print("".join(sentence), "\t", smooth, "\t", unsmooth) 33 | 34 | print("- test perplexity -") 35 | print("unsmooth: ", perplexity(model_smooth, test_dataset, GramUtil.calc_ungram_count)) 36 | print("smooth: ", perplexity(model_unsmooth, test_dataset, GramUtil.calc_ungram_count)) 37 | ''' 38 | ###################### ungram end ###################### 39 | 40 | 41 | ###################### bigram start ###################### 42 | 43 | model_unsmooth = BiGram(train_dataset) 44 | model_smooth = BiGram(train_dataset, Smooth.discounting) 45 | 46 | vocabs = model_unsmooth.sort_vocab() 47 | 48 | print("- bigram unsmooth -") 49 | GramUtil.print_bigram_probs(model_unsmooth, vocabs) 50 | 51 | print("- bigram smooth -") 52 | GramUtil.print_bigram_probs(model_smooth, vocabs) 53 | 54 | print('- sentence_prob -') 55 | print("\t\t smooth\t\t unsmooth") 56 | for sentence in test_dataset: 57 | smooth = "{0:.5f}".format(model_smooth.calc_sentence_prob(sentence)) 58 | unsmooth = "{0:.5f}".format(model_unsmooth.calc_sentence_prob(sentence)) 59 | print("".join(sentence), "\t", smooth, "\t", unsmooth) 60 | 61 | print("- test perplexity -") 62 | print("unsmooth: ", perplexity(model_smooth, test_dataset, GramUtil.calc_bigram_count)) 63 | print("smooth: ", perplexity(model_unsmooth, test_dataset, GramUtil.calc_bigram_count)) 64 | 65 | ###################### ungram end ###################### 66 | 67 | 68 | ###################### trigram start ###################### 69 | ''' 70 | model_unsmooth = TriGram(train_dataset) 71 | model_smooth = TriGram(train_dataset, Smooth.discounting) 72 | 73 | vocabs = model_unsmooth.sort_vocab() 74 | 75 | print("- ungram unsmooth -") 76 | GramUtil.print_trigram_probs(model_unsmooth, vocabs) 77 | 78 | print("- ungram smooth -") 79 | GramUtil.print_trigram_probs(model_smooth, vocabs) 80 | 81 | print('- sentence_prob -') 82 | print("\t\t smooth\t\t unsmooth") 83 | for sentence in test_dataset: 84 | smooth = "{0:.5f}".format(model_smooth.calc_sentence_prob(sentence)) 85 | unsmooth = "{0:.5f}".format(model_unsmooth.calc_sentence_prob(sentence)) 86 | print("".join(sentence), "\t", smooth, "\t", unsmooth) 87 | 88 | print("- test perplexity -") 89 | print("unsmooth: ", perplexity(model_smooth, test_dataset, GramUtil.calc_bigram_count)) 90 | print("smooth: ", perplexity(model_unsmooth, test_dataset, GramUtil.calc_bigram_count)) 91 | ''' 92 | ###################### ungram end ###################### 93 | -------------------------------------------------------------------------------- /language_model/src/ngram.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | ''' 5 | @description: 语言模型 6 | UniGram 7 | BiGram 8 | TriGram 9 | 
GramUtil - 工具函数 10 | @author: Sean QQ: 929325776 11 | ''' 12 | 13 | import math 14 | import const 15 | from processing import * 16 | 17 | ''' 18 | @function calc_prob 计算条件概率,这里使用最大似然估计(max-likelihood estimate)去计算概率 19 | @function calc_sentence_prob 计算句子的条件概率 20 | ''' 21 | class UnGram(object): 22 | def __init__(self, sentences, smooth = None): 23 | self.undict, self.total = build_undict(sentences) 24 | self.smooth = smooth 25 | 26 | def calc_prob(self, word): 27 | prob = 0 28 | if self.smooth != None: 29 | prob = self.smooth(word, undict=self.undict, total=self.total) 30 | else: 31 | if word in self.undict: 32 | prob = float(self.undict[word]) / self.total 33 | return prob 34 | 35 | def calc_sentence_prob(self, sentence, prob_log=True): 36 | prob_log_sum = 0 37 | for word in sentence: 38 | if word != const.START_TOKEN and word != const.END_TOKEN: 39 | word_prob = self.calc_prob(word) 40 | if word_prob != 0: 41 | prob_log_sum += math.log(word_prob, 2) 42 | return math.pow(2, prob_log_sum) if prob_log else prob_log_sum 43 | 44 | def sort_vocab(self): 45 | vocabs = list(self.undict.keys()) 46 | vocabs.remove(const.START_TOKEN) 47 | vocabs.remove(const.END_TOKEN) 48 | vocabs.sort() 49 | vocabs.append(const.UNK) 50 | vocabs.append(const.START_TOKEN) 51 | vocabs.append(const.END_TOKEN) 52 | return vocabs 53 | 54 | class BiGram(UnGram): 55 | def __init__(self, sentences, smooth = None): 56 | UnGram.__init__(self, sentences, smooth) 57 | self.bidict = build_bidict(sentences) 58 | 59 | def calc_prob(self, *args): 60 | if len(args) != 2: 61 | raise ValueError('two words is required') 62 | 63 | prob = 0 64 | if self.smooth != None: 65 | prob = self.smooth(args[0], args[1], bidict=self.bidict, undict=self.undict) 66 | else: 67 | if args in self.bidict and args[0] in self.undict: 68 | return float(self.bidict[args]) / self.undict[args[0]] 69 | return prob 70 | 71 | def calc_sentence_prob(self, sentence, prob_log=True): 72 | prob_log_sum = 0 73 | prev_word = None 74 | for word in sentence: 75 | if prev_word != None: 76 | word_prob = self.calc_prob(prev_word, word) 77 | prob_log_sum += word_prob 78 | prev_word = word 79 | return math.pow(2, prob_log_sum) if prob_log else prob_log_sum 80 | 81 | 82 | class TriGram(BiGram): 83 | def __init__(self, sentences, smooth = None): 84 | BiGram.__init__(self, sentences, smooth) 85 | self.tridict = build_tridict(sentences) 86 | 87 | def calc_prob(self, *args): 88 | if len(args) != 3: 89 | raise ValueError('three words is required') 90 | 91 | prob = 0 92 | if self.smooth != None: 93 | prob = self.smooth(args[0], args[1], args[2], tridict=self.tridict, bidict=self.bidict, undict=self.undict) 94 | else: 95 | bitup = (args[0], args[1]) 96 | if args in self.tridict and bitup in self.bidict: 97 | return float(self.tridict[args]) / self.bidict[bitup] 98 | return prob 99 | 100 | def calc_sentence_prob(self, sentence, prob_log=True): 101 | prob_log_sum = 0 102 | prev_stack = [] 103 | for word in sentence: 104 | if len(prev_stack) < 2: 105 | prev_stack.append(word) 106 | elif len(prev_stack) == 2: 107 | word_prob = self.calc_prob(prev_stack[0], prev_stack[1], word) 108 | prob_log_sum += word_prob 109 | prev_stack[0] = prev_stack[1] 110 | prev_stack[1] = word 111 | return math.pow(2, prob_log_sum) if prob_log else prob_log_sum 112 | 113 | ''' 114 | @function: calc_xxgram_count 主要用来统计语料库中词的总数 115 | @function: print_xxgram_probas 格式化输出概率 116 | ''' 117 | class GramUtil(object): 118 | 119 | @staticmethod 120 | def calc_ungram_count(sentences): 121 | count = 0 122 | for sentence in 
sentences: 123 | # excluding START_TOKEN and END_TOKEN 124 | count += len(sentence) - 2 125 | return count 126 | 127 | @staticmethod 128 | def calc_bigram_count(sentences): 129 | count = 0 130 | for sentence in sentences: 131 | count += len(sentence) - 1 132 | return count 133 | 134 | @staticmethod 135 | def calc_trigram_count(sentences): 136 | count = 0 137 | for sentence in sentences: 138 | count += len(sentence) 139 | return count 140 | 141 | @staticmethod 142 | def print_ungram_probs(model, vocabs): 143 | for vocab in vocabs: 144 | if vocab != const.START_TOKEN and vocab != const.END_TOKEN: 145 | print("{} \t {}".format(vocab if vocab != const.UNK else 'UNK', model.calc_prob(vocab))) 146 | 147 | @staticmethod 148 | def print_bigram_probs(model, vocabs): 149 | print("\t\t", end="") 150 | for vocab in vocabs: 151 | if vocab != const.START_TOKEN: 152 | print(vocab if vocab != const.UNK else "UNK", end="\t\t") 153 | print("") 154 | for vocab in vocabs: 155 | if vocab != const.END_TOKEN: 156 | print(vocab if vocab != const.UNK else "UNK", end="\t\t") 157 | for vocab2 in vocabs: 158 | if vocab2 != const.START_TOKEN: 159 | print("{0:.3f}".format(model.calc_prob(vocab, vocab2)), end="\t\t") 160 | print("") 161 | 162 | @staticmethod 163 | def print_trigram_probs(model, vocabs): 164 | print("\t\t", end="") 165 | for vocab in vocabs: 166 | if vocab != const.START_TOKEN: 167 | print(vocab if vocab != const.UNK else "UNK", end="\t") 168 | print("") 169 | for vocab in vocabs: 170 | if vocab != const.END_TOKEN: 171 | for vocab2 in vocabs: 172 | if vocab2 != const.START_TOKEN and vocab != const.UNK and vocab2 != const.UNK and vocab2 != const.END_TOKEN: 173 | print(vocab, vocab2 if vocab2 != const.UNK else "UNK", end="\t\t") 174 | for vocab3 in vocabs: 175 | if vocab3 != const.END_TOKEN: 176 | print("{0:.3f}".format(model.calc_prob(vocab, vocab2, vocab3)), end="\t") 177 | print("") 178 | -------------------------------------------------------------------------------- /language_model/src/processing.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | ''' 5 | @description: sentence preprocessing and dictionary construction 6 | @author: Sean QQ: 929325776 7 | ''' 8 | 9 | import const 10 | 11 | # add start and end markers 12 | def build_sentences(sentences): 13 | out = [] 14 | for sentence in sentences: 15 | words = [x.lower() for x in sentence] 16 | words.insert(0, const.START_TOKEN) 17 | words.append(const.END_TOKEN) 18 | out.append(words) 19 | return out 20 | 21 | # build the unigram frequency dictionary 22 | def build_undict(sentences): 23 | undict = {} 24 | total = 0 25 | for words in sentences: 26 | for word in words: 27 | if word not in undict: 28 | undict[word] = 1 29 | else: 30 | undict[word] += 1 31 | if word != const.START_TOKEN and word != const.END_TOKEN: 32 | total += 1 33 | return undict, total 34 | 35 | 36 | # build the bigram frequency dictionary, keyed by the pair (u, v) 37 | def build_bidict(sentences): 38 | bidict = {} 39 | for words in sentences: 40 | for i in range(len(words)-1): 41 | tup = (words[i], words[i+1]) 42 | if tup not in bidict: 43 | bidict[tup] = 1 44 | else: 45 | bidict[tup] += 1 46 | return bidict 47 | 48 | # build the trigram frequency dictionary, keyed by the triple (u, v, w) 49 | def build_tridict(sentences): 50 | tridict = {} 51 | # sentences already carry the START_TOKEN / END_TOKEN added by build_sentences 52 | for words in sentences: 53 | for i in range(len(words) -2): 54 | tup = (words[i], words[i+1], words[i+2]) 55 | if tup not in tridict: 56 | tridict[tup] = 1 57 | else: 58 | tridict[tup] += 1 59 | return tridict 60 | 
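The UnGram / BiGram / TriGram classes and the dictionary builders above are meant to be used together. A rough usage sketch (not a repository file): it assumes that language_model/src/const.py, which is not shown here, defines START_TOKEN, END_TOKEN and UNK, and it reuses Smooth.discounting from smooth.py shown next.

from ngram import BiGram
from processing import build_sentences
from smooth import Smooth

# toy corpus; build_sentences lowercases tokens and adds START_TOKEN / END_TOKEN
sentences = build_sentences([['I', 'like', 'NLP'], ['I', 'like', 'deep', 'learning']])
model = BiGram(sentences, smooth=Smooth.discounting)
print(model.calc_prob('i', 'like'))    # discounted estimate of P(like | i)
test = build_sentences([['I', 'like', 'deep', 'NLP']])[0]
print(model.calc_sentence_prob(test))  # sentence probability; zero-probability bigrams are skipped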
-------------------------------------------------------------------------------- /language_model/src/smooth.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | ''' 5 | @description: smoothing estimators 6 | @author: Sean QQ: 929325776 7 | ''' 8 | 9 | class Smooth(object): 10 | @staticmethod 11 | def discounting(*args, **kwargs): 12 | discount_value = 0.5 13 | if 'discount_value' in kwargs: 14 | discount_value = kwargs['discount_value'] 15 | if len(args) == 1: 16 | if 'undict' not in kwargs: 17 | raise ValueError('undict is required') 18 | if 'total' not in kwargs: 19 | raise ValueError('total (word count of the corpus) is required') 20 | undict = kwargs['undict'] 21 | total = kwargs['total'] 22 | word = args[0] 23 | if word in undict: 24 | return float(undict[word] - discount_value) / total 25 | if len(args) == 2: 26 | if 'bidict' not in kwargs or 'undict' not in kwargs: 27 | raise ValueError('bidict and undict are required') 28 | bidict = kwargs['bidict'] 29 | undict = kwargs['undict'] 30 | if args in bidict and args[0] in undict: 31 | return float(bidict[args] - discount_value) / undict[args[0]] 32 | else: 33 | return 0 34 | elif len(args) == 3: 35 | if 'tridict' not in kwargs or 'bidict' not in kwargs: 36 | raise ValueError('tridict and bidict are required') 37 | tridict = kwargs['tridict'] 38 | bidict = kwargs['bidict'] 39 | bitup = (args[0], args[1]) 40 | if args in tridict and bitup in bidict: 41 | return float(tridict[args] - discount_value) / bidict[bitup] 42 | else: 43 | return 0 44 | else: 45 | return 0 46 | -------------------------------------------------------------------------------- /lsa/lsa.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import jieba 5 | 6 | class LSA(object): 7 | def __init__(self, docs, kernel=None): 8 | super(LSA, self).__init__() 9 | self.kernel = kernel 10 | self.docs = [] 11 | self.vocabs = set() 12 | self.build_vocab(docs) 13 | 14 | def tokenizer(self, sent): 15 | return jieba.lcut(sent) 16 | 17 | def build_vocab(self, docs): 18 | for doc in docs: 19 | doc = doc.strip() 20 | # for simplicity, keep only tokens longer than one character 21 | words = list(filter(lambda x: len(x) > 1, self.tokenizer(doc))) 22 | self.docs.append(words) 23 | self.vocabs.update(words) 24 | 25 | self.vocabs = list(self.vocabs) 26 | self.word2idx = dict(zip(self.vocabs, range(len(self.vocabs)))) 27 | 28 | def build_bow_matrix(self): 29 | matrix = np.zeros([len(self.vocabs), len(self.docs)]) 30 | for docidx, words in enumerate(self.docs): 31 | for word in words: 32 | matrix[self.word2idx[word], docidx] += 1 33 | return matrix 34 | 35 | def build_tfidf_matrix(self): 36 | tf = self.build_bow_matrix() 37 | print(tf) 38 | df = np.ones([len(self.vocabs), 1]) 39 | 40 | for docidx, words in enumerate(self.docs): 41 | tf[:, docidx] /= np.max(tf[:, docidx]) 42 | for word in words: 43 | df[self.word2idx[word], 0] += 1 44 | idf = np.log(len(self.docs)) - np.log(df) 45 | 46 | return tf*idf 47 | 48 | def sim_words(self, k=3): 49 | if self.kernel == 'tfidf': 50 | matrix = self.build_tfidf_matrix() 51 | else: 52 | matrix = self.build_bow_matrix() 53 | 54 | U, S, Vt = np.linalg.svd(matrix) 55 | 56 | sort_idx = np.argsort(-U) 57 | # usually skip the first column, which tends to map back to the word itself 58 | topk = sort_idx[:, 1:k+1] 59 | print("word \t similarity") 60 | for widx, word in enumerate(self.vocabs): 61 | line = word + ":\t" 62 | idxs = topk[widx] 63 | for idx in idxs: 64 | line += 
str(self.vocabs[idx]) + " " 65 | print(line) 66 | 67 | def topic_relate(self, k=2): 68 | if self.kernel == 'tfidf': 69 | matrix = self.build_tfidf_matrix() 70 | else: 71 | matrix = self.build_bow_matrix() 72 | 73 | U, S, Vt = np.linalg.svd(matrix) 74 | 75 | sort_idx = np.argsort(-Vt, axis=1) 76 | # 一般不取第一行,第一行是自己本身 77 | topk = sort_idx[1:k+1, :] 78 | print(topk) 79 | 80 | if __name__ == '__main__': 81 | doc1 = """计算机科学是系统性研究信息与计算的理论基础以及它们在计算机系统中如何实现与应用的实用技术的学科""" 82 | 83 | doc2 = """自然语言处理是人工智能和语言学领域的分支学科。此领域探讨如何处理及运用自然语言;自然语言认知则是指让电脑“懂”人类的语言。 84 | 自然语言生成系统把计算机数据转化为自然语言。自然语言理解系统把自然语言转化为计算机程序更易于处理的形式。""" 85 | 86 | doc3 = """人工智能是计算机科学的一个分支,它企图了解智能的实质,并生产出一种新的能以人类智能相似的方式做出反应的智能机器, 87 | 该领域的研究包括机器人、语言识别、图像识别、自然语言处理和专家系统等""" 88 | 89 | doc4 = """《瓦尔登湖》是美国作家梭罗独居瓦尔登湖畔的记录,描绘了他两年多时间里的所见、所闻和所思。 90 | 该书崇尚简朴生活,热爱大自然的风光,内容丰厚,意义深远,语言生动""" 91 | 92 | docs = [doc1, doc2, doc3, doc4] 93 | 94 | lsa = LSA(docs, kernel=None) 95 | lsa.sim_words() 96 | lsa.topic_relate() -------------------------------------------------------------------------------- /nbayes/nbayes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import math 4 | 5 | from collections import defaultdict 6 | 7 | class NBayes(object): 8 | def __init__(self, trainSet): 9 | self.data = trainSet 10 | self.tags = defaultdict(int) 11 | self.tagwords = defaultdict(int) 12 | self.total = 0 13 | 14 | def _tokenizer(self, sent): 15 | return list(sent) 16 | 17 | def train(self): 18 | for tag, doc in self.data: 19 | words = self._tokenizer(doc) 20 | for word in words: 21 | self.tags[tag] += 1 22 | self.tagwords[(tag, word)] += 1 23 | self.total += 1 24 | 25 | def predict(self, inp): 26 | words = self._tokenizer(inp) 27 | 28 | tmp = {} 29 | for tag in self.tags.keys(): 30 | tmp[tag] = math.log(self.tags[tag]) - math.log(self.total) 31 | for word in words: 32 | tmp[tag] += math.log(self.tagwords.get((tag, word), 1.0)) - math.log(self.tags[tag]) 33 | ret, score = 0, 0.0 34 | for t in self.tags.keys(): 35 | cnt = 0.0 36 | for tt in self.tags.keys(): 37 | cnt += math.exp(tmp[tt] - tmp[t]) 38 | cnt = 1.0 / cnt 39 | if cnt > score: 40 | ret, score = t, cnt 41 | return ret, score 42 | 43 | 44 | 45 | if __name__ == '__main__': 46 | trainSet = [("pos", "good job !"), 47 | ("pos", "表现不错哦"), 48 | ("pos", "厉害咯"), 49 | ("pos", "做的很好啊"), 50 | ("pos", "做得不错继续努力"), 51 | ("pos", "不错!点赞"), 52 | ("neg", "太差了"), 53 | ("neg", "太糟糕了"), 54 | ("neg", "你做的一点都不好"), 55 | ("neg", "so bad"), 56 | ("non", "一般般吧,还过的去"), 57 | ("non", "不算太好,也不算太差"), 58 | ("non", "继续努力吧") 59 | ] 60 | clf = NBayes(trainSet) 61 | clf.train() 62 | print(clf.predict("不错哦")) 63 | -------------------------------------------------------------------------------- /nbayes/tfidf_nbayes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import jieba 4 | import numpy as np 5 | from collections import defaultdict 6 | 7 | class Corpus(object): 8 | def __init__(self): 9 | self.word2idx = {} 10 | self.tags = defaultdict(int) 11 | self.docs = [] 12 | self.total = 0 13 | 14 | # 分词器 15 | def tokenizer(self, sent): 16 | return jieba.lcut(sent) 17 | 18 | # 构建字典,获取分类标记集 19 | def process_data(self, docs): 20 | vocabs = set() 21 | for tag, doc in docs: 22 | words = self.tokenizer(doc) 23 | if len(words) == 0: 24 | continue 25 | self.tags[tag] += 1 26 | self.total += 1 27 | self.docs.append((tag, words)) 28 | vocabs.update(words) 29 | vocabs = list(vocabs) 30 | self.word2idx = dict(zip(vocabs, 
range(len(vocabs)))) 31 | 32 | # build the bag-of-words matrix 33 | def calc_bow(self): 34 | bow = np.zeros([self.total, len(self.word2idx)]) 35 | 36 | for docidx, (tag, doc) in enumerate(self.docs): 37 | for word in doc: 38 | bow[docidx, self.word2idx[word]] += 1 39 | return bow 40 | 41 | # compute the tf-idf matrix 42 | def calc_tfidf(self): 43 | tf = self.calc_bow() 44 | df = np.ones([1, len(self.word2idx)]) 45 | 46 | for docidx, (tag, doc) in enumerate(self.docs): 47 | tf[docidx] /= np.max(tf[docidx]) 48 | for word in doc: 49 | df[0, self.word2idx[word]] += 1 50 | idf = np.log(float(self.total)) - np.log(df) 51 | return np.multiply(tf, idf) 52 | 53 | # build the count vector for the input tokens 54 | def get_vec(self, words): 55 | vec = np.zeros([1, len(self.word2idx)]) 56 | for word in words: 57 | if word in self.word2idx: 58 | vec[0, self.word2idx[word]] += 1 59 | return vec 60 | 61 | class NBayes(Corpus): 62 | def __init__(self, docs, kernel='tfidf'): 63 | super(NBayes, self).__init__() 64 | self.kernel = kernel 65 | self.process_data(docs) 66 | self.y_prob = {} 67 | self.c_prob = None 68 | 69 | def train(self): 70 | if self.kernel == 'tfidf': 71 | self.feature = self.calc_tfidf() 72 | else: 73 | self.feature = self.calc_bow() 74 | 75 | # estimate p(y) by maximum likelihood 76 | for tag in self.tags: 77 | self.y_prob[tag] = float(self.tags[tag]) / self.total 78 | 79 | # estimate the conditional probability p(x|y_i) 80 | self.c_prob = np.zeros([len(self.tags), len(self.word2idx)]) 81 | Z = np.zeros([len(self.tags), 1]) 82 | for docidx in range(len(self.docs)): 83 | # get the class label id 84 | tid = list(self.tags.keys()).index(self.docs[docidx][0]) 85 | self.c_prob[tid] += self.feature[docidx] 86 | Z[tid] = np.sum(self.c_prob[tid]) 87 | self.c_prob /= Z # normalize 88 | 89 | def predict(self, sent): 90 | words = self.tokenizer(sent) 91 | vec = self.get_vec(words) 92 | ret, max_score = None, -1.0 93 | for y, pc in zip(self.y_prob, self.c_prob): 94 | score = np.sum(vec * pc * self.y_prob[y]) # p(x1....xn|yi)p(yi) 95 | if score > max_score: 96 | max_score = score 97 | ret = y 98 | return ret, 1 - max_score 99 | 100 | if __name__ == '__main__': 101 | trainSet = [("pos", "good job !"), 102 | ("pos", "表现不错哦"), 103 | ("pos", "厉害咯"), 104 | ("pos", "做的很好啊"), 105 | ("pos", "做得不错继续努力"), 106 | ("pos", "不错!点赞"), 107 | ("neg", "太差了"), 108 | ("neg", "太糟糕了"), 109 | ("neg", "你做的一点都不好"), 110 | ("neg", "不行,重做"), 111 | ("neg", "so bad"), 112 | ("non", "一般般吧,还过的去"), 113 | ("non", "不算太好,也不算太差"), 114 | ("non", "继续努力吧") 115 | ] 116 | 117 | nb = NBayes(trainSet) 118 | nb.train() 119 | print(nb.predict("不错哦")) # ('pos', 0.9286) -------------------------------------------------------------------------------- /pca/pca.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import numpy as np 5 | 6 | def PCA(x, n_components=2): 7 | # 1. center each feature (each row) by subtracting its mean 8 | mean_val = np.mean(x, axis=1) 9 | mean_x = x - mean_val 10 | 11 | # 2. covariance matrix of mean_x 12 | C_x = np.cov(mean_x, rowvar=True) 13 | 14 | # 3. eigenvalues and eigenvectors of C_x 15 | eig_vals, eig_vects = np.linalg.eig(np.mat(C_x)) 16 | 17 | # 4. sort the eigenvalues in descending order 18 | sorted_idx = np.argsort(-eig_vals) 19 | 20 | # 5. keep the top n_components eigenvectors 21 | topn_index = sorted_idx[:n_components] 22 | topn_vects = eig_vects[topn_index, :] 23 | 24 | # 6. 
投影到低维空间 25 | pca_x = topn_vects * x 26 | return pca_x 27 | 28 | if __name__ == '__main__': 29 | x = np.mat([[-1, -1, 0, 2, 0], 30 | [-2, 0, 0, 1, 1]]) 31 | x_ = PCA(x, n_components=1) 32 | print(x_) -------------------------------------------------------------------------------- /pcfg/README.md: -------------------------------------------------------------------------------- 1 | # PCFG 2 | 3 | PCFG, Probabilistic Context-Free Grammars 4 | 5 | 使用了CKY算法实现了CNF(Chomsky Normal Form)下的文法解析 6 | -------------------------------------------------------------------------------- /pcfg/pcfg_tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# python 实现PCFG\n", 8 | "\n", 9 | "本例子主要受 Michael Collins 教授的 Probabilistic Context-Free Grammars (PCFGs) 启发而编写,为了帮助大家理解,我在我的博客、公众号上发表了文章[一文读懂NLP中的PCFG(公众号)](https://mp.weixin.qq.com/s?__biz=MzIwNDM1NjUzMA==&mid=2247483666&idx=1&sn=708dcbce5be808b3be273838db298da7&chksm=96c02fcfa1b7a6d99a69c35e0de413488d4da4dc13c4ab3d21c8a415c8f2310c141676a068e0#rd),欢迎大家阅读。当然强烈推荐Michael Collins 教授的 [Probabilistic Context-Free Grammars (PCFGs)](http://www.cs.columbia.edu/~mcollins/courses/nlp2011/notes/pcfgs.pdf)\n", 10 | "\n", 11 | "pcfg 常用于生成文法解析树,再这里使用CKY算法对CNF(Chomsky Normal Form)的文法进行解析" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## 目录\n", 19 | "\n", 20 | "1. [项目结构](#项目结构)\n", 21 | "2. [环境要求](#环境要求)\n", 22 | "3. [代码分析](#代码分析)\n", 23 | "4. [项目后续](#项目后续)\n", 24 | "5. [联系作者](#联系作者)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## 项目结构\n", 32 | "\n", 33 | "| - src\n", 34 | "\n", 35 | " | - corpus 语料库\n", 36 | "\n", 37 | " | - pcfg.py \n", 38 | "\n", 39 | " | - main.py 例子程序" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "## 环境要求\n", 47 | "\n", 48 | " python3" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "## 代码分析" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "### pcfg.py\n" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 1, 68 | "metadata": { 69 | "collapsed": true 70 | }, 71 | "outputs": [], 72 | "source": [ 73 | "# !/usr/bin/env python3\n", 74 | "# -*- coding: utf-8 -*-\n", 75 | "\n", 76 | "# -------------------------------------------#\n", 77 | "# PCFG Parser\t #\n", 78 | "# author: sean lee #\n", 79 | "# qq: 929325776\t\t\t\t\t\t\t #\n", 80 | "# email: lxm_0828@163.com #\n", 81 | "#--------------------------------------------#\n", 82 | "\n", 83 | "from collections import defaultdict\n", 84 | "\n", 85 | "class PCFG(object):\n", 86 | "\n", 87 | "\t# N_dict - count nonterminal\n", 88 | "\t# NR_dict - count relation X->Y1 Y2 (X Y1 Y2 are nonterminal)\n", 89 | "\t# TR_dict - count relation X->y (X is nonterminal y is terminal)\n", 90 | "\tdef __init__(self):\n", 91 | "\t\tself.N_dict = defaultdict(int)\n", 92 | "\t\tself.NR_dict = defaultdict(int)\n", 93 | "\t\tself.TR_dict = defaultdict(int)\n", 94 | "\n", 95 | "\tdef fit(self, train_corpus):\n", 96 | "\t\twith open(train_corpus, 'r') as f:\n", 97 | "\t\t\tfor line in f:\n", 98 | "\t\t\t\tarr = line.strip().split('->')\n", 99 | "\t\t\t\tself.N_dict[arr[0]] += 1;\n", 100 | "\t\t\t\tif ' ' in arr[1].strip():\n", 101 | "\t\t\t\t\tarr2 = arr[1].split()\n", 102 | "\t\t\t\t\tif len(arr2) > 2:\n", 103 | "\t\t\t\t\t\tcontinue\n", 
104 | "\t\t\t\t\tself.N_dict[arr2[0]] += 1\n", 105 | "\t\t\t\t\tself.N_dict[arr2[1]] += 1\n", 106 | "\t\t\t\t\tself.NR_dict[(arr[0], arr2[0], arr2[1])] += 1\n", 107 | "\t\t\t\telse:\n", 108 | "\t\t\t\t\tself.TR_dict[(arr[0], arr[1])] += 1\n", 109 | "\t# q(X->Y Z)\n", 110 | "\tdef calc_NR_proba(self, x, y1, y2):\n", 111 | "\t\treturn float(self.NR_dict[(x, y1, y2)]) / self.N_dict[x]\n", 112 | "\n", 113 | "\t# q(X->y)\n", 114 | "\tdef calc_TR_proba(self, x, y):\n", 115 | "\t\treturn float(self.TR_dict[(x, y)]) / self.N_dict[x]\n", 116 | "\n", 117 | "\t# Return parse tree\n", 118 | "\tdef parse(self, sentence):\n", 119 | "\t\timport json\n", 120 | "\t\tprint(json.dumps(self.CKY(sentence.split())))\n", 121 | "\n", 122 | "\t# CKY algorithm \n", 123 | "\t# 适用于CNF (Chomsky normal form)\n", 124 | "\tdef CKY(self, sentence):\n", 125 | "\t\tn = len(sentence)\n", 126 | "\t\tpi = defaultdict(float) \n", 127 | "\t\tbp = {}\t# backpointer\n", 128 | "\t\tN = self.N_dict.keys()\n", 129 | "\n", 130 | "\t\tfor i in range(n):\n", 131 | "\t\t\tword = sentence[i]\n", 132 | "\t\t\tfor X in N:\n", 133 | "\t\t\t\tpi[(i, i, X)] = self.calc_TR_proba(X, word)\n", 134 | "\n", 135 | "\t\tfor i in range(1, n):\n", 136 | "\t\t\tfor j in range(n-1):\n", 137 | "\t\t\t\tk = i + j\n", 138 | "\t\t\t\tfor X in N:\n", 139 | "\t\t\t\t\tmax_score = 0\n", 140 | "\t\t\t\t\targmax = None\n", 141 | "\t\t\t\t\tfor R in self.NR_dict.keys():\n", 142 | "\t\t\t\t\t\tif R[0] == X: # start from X\n", 143 | "\t\t\t\t\t\t\tY, Z = R[1:]\n", 144 | "\t\t\t\t\t\t\tfor s in range(j, k):\n", 145 | "\t\t\t\t\t\t\t\tif pi[(j, s, Y)] and pi[s+1, k, Z]:\n", 146 | "\t\t\t\t\t\t\t\t\tscore = self.calc_NR_proba(X, Y, Z) * pi[(j, s, Y)] * pi[s+1, k, Z]\n", 147 | "\t\t\t\t\t\t\t\t\tif max_score < score:\n", 148 | "\t\t\t\t\t\t\t\t\t\tmax_score = score\n", 149 | "\t\t\t\t\t\t\t\t\t\targmax = Y, Z, s\n", 150 | "\t\t\t\t\tif max_score:\n", 151 | "\t\t\t\t\t\tpi[j, k, X] = max_score\n", 152 | "\t\t\t\t\t\tbp[j, k, X] = argmax\n", 153 | "\n", 154 | "\t\t# return\n", 155 | "\t\tif pi[(0, n-1, 'S')]:\n", 156 | "\t\t\treturn self.recover(sentence, bp, 0, n-1, 'S')\n", 157 | "\t\telse:\n", 158 | "\t\t\tmax_score = 0\n", 159 | "\t\t\targmax = 0, 0, 'S'\n", 160 | "\t\t\tfor X in N:\n", 161 | "\t\t\t\tif max_score < pi[(0, n-1, X)]:\n", 162 | "\t\t\t\t\tmax_score = pi[(0, n-1, X)]\n", 163 | "\t\t\t\t\targmax = 0, n-1, X\n", 164 | "\t\t\treturn self.recover(sentence, bp, *argmax)\n", 165 | "\n", 166 | "\t# Return the list of the parsed tree with back pointers.\n", 167 | "\tdef recover(self, sentence, bp, i, j, X):\n", 168 | "\t\tif i == j:\n", 169 | "\t\t\treturn [X, sentence[i]]\n", 170 | "\t\telse:\n", 171 | "\t\t\tY, Z, s = bp[i, j, X]\n", 172 | "\t\t\treturn [X, self.recover(sentence, bp, i, s, Y), self.recover(sentence, bp, s+1, j, Z)]" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "### main.py" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 7, 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "name": "stdout", 189 | "output_type": "stream", 190 | "text": [ 191 | "sentence: the man saw the dog\n", 192 | "parse tree\n", 193 | "[\"S\", [\"NP\", [\"DT\", \"the\"], [\"NN\", \"man\"]], [\"VP\", [\"Vt\", \"saw\"], [\"NP\", [\"DT\", \"the\"], [\"NN\", \"dog\"]]]]\n" 194 | ] 195 | } 196 | ], 197 | "source": [ 198 | "# !/usr/bin/env python3\n", 199 | "# -*- coding: utf-8 -*-\n", 200 | "\n", 201 | "# -------------------------------------------#\n", 202 | "# main.py \t #\n", 203 | "# 
author: sean lee #\n", 204 | "# qq: 929325776\t\t\t\t\t\t\t #\n", 205 | "# email: lxm_0828@163.com #\n", 206 | "#--------------------------------------------#\n", 207 | "\n", 208 | "parser = PCFG()\n", 209 | "parser.fit('./corpus/toy/train.txt')\n", 210 | "\n", 211 | "'''\n", 212 | "print(parser.N_dict)\n", 213 | "print(parser.NR_dict)\n", 214 | "print(parser.TR_dict)\n", 215 | "'''\n", 216 | "\n", 217 | "sentence = \"the man saw the dog\"\n", 218 | "print(\"sentence:\", sentence)\n", 219 | "print(\"parse tree\")\n", 220 | "parser.parse(sentence)" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "## 项目后续\n", 228 | "\n", 229 | "过段时间会加入深度学习在NLP上的应用,如果你感兴趣,可以关注我的公众号,或者star, watch 本项目哦" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "## 联系作者\n", 237 | "\n", 238 | "@author sean\n", 239 | "\n", 240 | "@qq 929325776\n", 241 | "\n", 242 | "有什么问题,可以联系我,一起讨论" 243 | ] 244 | } 245 | ], 246 | "metadata": { 247 | "kernelspec": { 248 | "display_name": "Python 3", 249 | "language": "python", 250 | "name": "python3" 251 | }, 252 | "language_info": { 253 | "codemirror_mode": { 254 | "name": "ipython", 255 | "version": 3 256 | }, 257 | "file_extension": ".py", 258 | "mimetype": "text/x-python", 259 | "name": "python", 260 | "nbconvert_exporter": "python", 261 | "pygments_lexer": "ipython3", 262 | "version": "3.6.1" 263 | } 264 | }, 265 | "nbformat": 4, 266 | "nbformat_minor": 2 267 | } 268 | -------------------------------------------------------------------------------- /pcfg/src/corpus/toy/train.txt: -------------------------------------------------------------------------------- 1 | S->NP VP 2 | VP->Vt NP 3 | VP->VP PP 4 | NP->DT NN 5 | NP->NP PP 6 | PP->IN NP 7 | Vi->sleeps 8 | Vt->saw 9 | NN->man 10 | NN->woman 11 | NN->telescope 12 | NN->dog 13 | DT->the 14 | IN->with 15 | IN->in 16 | -------------------------------------------------------------------------------- /pcfg/src/main.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # main.py # 6 | # author: sean lee # 7 | # qq: 929325776 # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | from pcfg import PCFG 12 | 13 | parser = PCFG() 14 | parser.fit('./corpus/toy/train.txt') 15 | parser.parse("the man saw the dog") 16 | ''' 17 | print(parser.N_dict) 18 | print(parser.NR_dict) 19 | print(parser.TR_dict) 20 | ''' -------------------------------------------------------------------------------- /pcfg/src/pcfg.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # PCFG Parser # 6 | # author: sean lee # 7 | # qq: 929325776 # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | from collections import defaultdict 12 | 13 | class PCFG(object): 14 | 15 | # N_dict - count nonterminal 16 | # NR_dict - count relation X->Y1 Y2 (X Y1 Y2 are nonterminal) 17 | # TR_dict - count relation X->y (X is nonterminal y is terminal) 18 | def __init__(self): 19 | self.N_dict = defaultdict(int) 20 | self.NR_dict = defaultdict(int) 21 | self.TR_dict = defaultdict(int) 22 | 23 | def fit(self, train_corpus): 24 | with open(train_corpus, 'r') as f: 25 | for line in f: 26 | arr = line.strip().split('->') 27 
| self.N_dict[arr[0]] += 1; 28 | if ' ' in arr[1].strip(): 29 | arr2 = arr[1].split() 30 | if len(arr2) > 2: 31 | continue 32 | self.N_dict[arr2[0]] += 1 33 | self.N_dict[arr2[1]] += 1 34 | self.NR_dict[(arr[0], arr2[0], arr2[1])] += 1 35 | else: 36 | self.TR_dict[(arr[0], arr[1])] += 1 37 | # q(X->Y Z) 38 | def calc_NR_proba(self, x, y1, y2): 39 | return float(self.NR_dict[(x, y1, y2)]) / self.N_dict[x] 40 | 41 | # q(X->y) 42 | def calc_TR_proba(self, x, y): 43 | return float(self.TR_dict[(x, y)]) / self.N_dict[x] 44 | 45 | # Return parse tree 46 | def parse(self, sentence): 47 | import json 48 | print(json.dumps(self.CKY(sentence.split()))) 49 | 50 | # CKY algorithm 51 | # 适用于CNF (Chomsky normal form) 52 | def CKY(self, sentence): 53 | n = len(sentence) 54 | pi = defaultdict(float) 55 | bp = {} # backpointer 56 | N = self.N_dict.keys() 57 | 58 | for i in range(n): 59 | word = sentence[i] 60 | for X in N: 61 | pi[(i, i, X)] = self.calc_TR_proba(X, word) 62 | 63 | for i in range(1, n): 64 | for j in range(n-1): 65 | k = i + j 66 | for X in N: 67 | max_score = 0 68 | argmax = None 69 | for R in self.NR_dict.keys(): 70 | if R[0] == X: # start from X 71 | Y, Z = R[1:] 72 | for s in range(j, k): 73 | if pi[(j, s, Y)] and pi[s+1, k, Z]: 74 | score = self.calc_NR_proba(X, Y, Z) * pi[(j, s, Y)] * pi[s+1, k, Z] 75 | if max_score < score: 76 | max_score = score 77 | argmax = Y, Z, s 78 | if max_score: 79 | pi[j, k, X] = max_score 80 | bp[j, k, X] = argmax 81 | 82 | # return 83 | if pi[(0, n-1, 'S')]: 84 | return self.recover(sentence, bp, 0, n-1, 'S') 85 | else: 86 | max_score = 0 87 | argmax = 0, 0, 'S' 88 | for X in N: 89 | if max_score < pi[(0, n-1, X)]: 90 | max_score = pi[(0, n-1, X)] 91 | argmax = 0, n-1, X 92 | return self.recover(sentence, bp, *argmax) 93 | 94 | # Return the list of the parsed tree with back pointers. 
95 | def recover(self, sentence, bp, i, j, X): 96 | if i == j: 97 | return [X, sentence[i]] 98 | else: 99 | Y, Z, s = bp[i, j, X] 100 | return [X, self.recover(sentence, bp, i, s, Y), self.recover(sentence, bp, s+1, j, Z)] -------------------------------------------------------------------------------- /reading_comprehension/README.md: -------------------------------------------------------------------------------- 1 | ### Reading Comprehension 2 | 3 | Reading comprehension is a hot topic in NLP, and a hard one. 4 | 5 | To help readers better understand reading comprehension, this project implements a simple baseline (in PyTorch). 6 | 7 | ### baseline 8 | ![baseline](./corpus/reading_comprehension.png) 9 | 10 | ### dataset 11 | 12 | We use the [bAbI dataset from Facebook](https://research.fb.com/downloads/babi/); since the full corpus is fairly large, only the qa5-\* data under en-10k is used here. 13 | 14 | ### Result 15 | ![baseline](./corpus/result.png) 16 | 17 | ### Recommend 18 | Recommended reading: my WeChat article [DeepNLP之阅读理解](https://mp.weixin.qq.com/s?__biz=MzIwNDM1NjUzMA==&mid=2247483674&idx=1&sn=8b7e470b8a8222b057d715d3ec48dd74&chksm=96c02fc7a1b7a6d1688351b4c2bc393ffdcd5d1686344f5a6500613b4e44f08bb696d830f45c#rd) 19 | 20 | ### Reference 21 | 22 | * [Dynamic-memory-networks-plus-Pytorch](https://github.com/dandelin/Dynamic-memory-networks-plus-Pytorch) 23 | -------------------------------------------------------------------------------- /reading_comprehension/const.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # const.python # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import torch 11 | 12 | # tokens 13 | unk = '<unk>' 14 | pad = '<pad>' 15 | sos = '<sos>' 16 | eos = '<eos>' 17 | 18 | # network 19 | lr_rate = 0.001 20 | batch_size = 16 21 | hidden_size = 128 22 | epochs = 10 23 | task_id = 5 # matches the task id under bAbI/en-10k 24 | 25 | use_cuda = torch.cuda.is_available() 26 | -------------------------------------------------------------------------------- /reading_comprehension/corpus/bAbI/LICENSE.txt: -------------------------------------------------------------------------------- 1 | CC License 2 | 3 | bAbI tasks data 4 | 5 | Copyright (c) 2015-present, Facebook, Inc. All rights reserved. 6 | 7 | Creative Commons Legal Code 8 | 9 | Attribution 3.0 Unported 10 | 11 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 12 | LEGAL SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN 13 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 14 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 15 | REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR 16 | DAMAGES RESULTING FROM ITS USE. 17 | 18 | License 19 | 20 | THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE 21 | COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY 22 | COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS 23 | AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED. 24 | 25 | BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE 26 | TO BE BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY 27 | BE CONSIDERED TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS 28 | CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND 29 | CONDITIONS. 30 | 31 | 1. Definitions 32 | 33 | a. 
"Adaptation" means a work based upon the Work, or upon the Work and 34 | other pre-existing works, such as a translation, adaptation, 35 | derivative work, arrangement of music or other alterations of a 36 | literary or artistic work, or phonogram or performance and includes 37 | cinematographic adaptations or any other form in which the Work may be 38 | recast, transformed, or adapted including in any form recognizably 39 | derived from the original, except that a work that constitutes a 40 | Collection will not be considered an Adaptation for the purpose of 41 | this License. For the avoidance of doubt, where the Work is a musical 42 | work, performance or phonogram, the synchronization of the Work in 43 | timed-relation with a moving image ("synching") will be considered an 44 | Adaptation for the purpose of this License. 45 | b. "Collection" means a collection of literary or artistic works, such as 46 | encyclopedias and anthologies, or performances, phonograms or 47 | broadcasts, or other works or subject matter other than works listed 48 | in Section 1(f) below, which, by reason of the selection and 49 | arrangement of their contents, constitute intellectual creations, in 50 | which the Work is included in its entirety in unmodified form along 51 | with one or more other contributions, each constituting separate and 52 | independent works in themselves, which together are assembled into a 53 | collective whole. A work that constitutes a Collection will not be 54 | considered an Adaptation (as defined above) for the purposes of this 55 | License. 56 | c. "Distribute" means to make available to the public the original and 57 | copies of the Work or Adaptation, as appropriate, through sale or 58 | other transfer of ownership. 59 | d. "Licensor" means the individual, individuals, entity or entities that 60 | offer(s) the Work under the terms of this License. 61 | e. "Original Author" means, in the case of a literary or artistic work, 62 | the individual, individuals, entity or entities who created the Work 63 | or if no individual or entity can be identified, the publisher; and in 64 | addition (i) in the case of a performance the actors, singers, 65 | musicians, dancers, and other persons who act, sing, deliver, declaim, 66 | play in, interpret or otherwise perform literary or artistic works or 67 | expressions of folklore; (ii) in the case of a phonogram the producer 68 | being the person or legal entity who first fixes the sounds of a 69 | performance or other sounds; and, (iii) in the case of broadcasts, the 70 | organization that transmits the broadcast. 71 | f. 
"Work" means the literary and/or artistic work offered under the terms 72 | of this License including without limitation any production in the 73 | literary, scientific and artistic domain, whatever may be the mode or 74 | form of its expression including digital form, such as a book, 75 | pamphlet and other writing; a lecture, address, sermon or other work 76 | of the same nature; a dramatic or dramatico-musical work; a 77 | choreographic work or entertainment in dumb show; a musical 78 | composition with or without words; a cinematographic work to which are 79 | assimilated works expressed by a process analogous to cinematography; 80 | a work of drawing, painting, architecture, sculpture, engraving or 81 | lithography; a photographic work to which are assimilated works 82 | expressed by a process analogous to photography; a work of applied 83 | art; an illustration, map, plan, sketch or three-dimensional work 84 | relative to geography, topography, architecture or science; a 85 | performance; a broadcast; a phonogram; a compilation of data to the 86 | extent it is protected as a copyrightable work; or a work performed by 87 | a variety or circus performer to the extent it is not otherwise 88 | considered a literary or artistic work. 89 | g. "You" means an individual or entity exercising rights under this 90 | License who has not previously violated the terms of this License with 91 | respect to the Work, or who has received express permission from the 92 | Licensor to exercise rights under this License despite a previous 93 | violation. 94 | h. "Publicly Perform" means to perform public recitations of the Work and 95 | to communicate to the public those public recitations, by any means or 96 | process, including by wire or wireless means or public digital 97 | performances; to make available to the public Works in such a way that 98 | members of the public may access these Works from a place and at a 99 | place individually chosen by them; to perform the Work to the public 100 | by any means or process and the communication to the public of the 101 | performances of the Work, including by public digital performance; to 102 | broadcast and rebroadcast the Work by any means including signs, 103 | sounds or images. 104 | i. "Reproduce" means to make copies of the Work by any means including 105 | without limitation by sound or visual recordings and the right of 106 | fixation and reproducing fixations of the Work, including storage of a 107 | protected performance or phonogram in digital form or other electronic 108 | medium. 109 | 110 | 2. Fair Dealing Rights. Nothing in this License is intended to reduce, 111 | limit, or restrict any uses free from copyright or rights arising from 112 | limitations or exceptions that are provided for in connection with the 113 | copyright protection under copyright law or other applicable laws. 114 | 115 | 3. License Grant. Subject to the terms and conditions of this License, 116 | Licensor hereby grants You a worldwide, royalty-free, non-exclusive, 117 | perpetual (for the duration of the applicable copyright) license to 118 | exercise the rights in the Work as stated below: 119 | 120 | a. to Reproduce the Work, to incorporate the Work into one or more 121 | Collections, and to Reproduce the Work as incorporated in the 122 | Collections; 123 | b. 
to create and Reproduce Adaptations provided that any such Adaptation, 124 | including any translation in any medium, takes reasonable steps to 125 | clearly label, demarcate or otherwise identify that changes were made 126 | to the original Work. For example, a translation could be marked "The 127 | original work was translated from English to Spanish," or a 128 | modification could indicate "The original work has been modified."; 129 | c. to Distribute and Publicly Perform the Work including as incorporated 130 | in Collections; and, 131 | d. to Distribute and Publicly Perform Adaptations. 132 | e. For the avoidance of doubt: 133 | 134 | i. Non-waivable Compulsory License Schemes. In those jurisdictions in 135 | which the right to collect royalties through any statutory or 136 | compulsory licensing scheme cannot be waived, the Licensor 137 | reserves the exclusive right to collect such royalties for any 138 | exercise by You of the rights granted under this License; 139 | ii. Waivable Compulsory License Schemes. In those jurisdictions in 140 | which the right to collect royalties through any statutory or 141 | compulsory licensing scheme can be waived, the Licensor waives the 142 | exclusive right to collect such royalties for any exercise by You 143 | of the rights granted under this License; and, 144 | iii. Voluntary License Schemes. The Licensor waives the right to 145 | collect royalties, whether individually or, in the event that the 146 | Licensor is a member of a collecting society that administers 147 | voluntary licensing schemes, via that society, from any exercise 148 | by You of the rights granted under this License. 149 | 150 | The above rights may be exercised in all media and formats whether now 151 | known or hereafter devised. The above rights include the right to make 152 | such modifications as are technically necessary to exercise the rights in 153 | other media and formats. Subject to Section 8(f), all rights not expressly 154 | granted by Licensor are hereby reserved. 155 | 156 | 4. Restrictions. The license granted in Section 3 above is expressly made 157 | subject to and limited by the following restrictions: 158 | 159 | a. You may Distribute or Publicly Perform the Work only under the terms 160 | of this License. You must include a copy of, or the Uniform Resource 161 | Identifier (URI) for, this License with every copy of the Work You 162 | Distribute or Publicly Perform. You may not offer or impose any terms 163 | on the Work that restrict the terms of this License or the ability of 164 | the recipient of the Work to exercise the rights granted to that 165 | recipient under the terms of the License. You may not sublicense the 166 | Work. You must keep intact all notices that refer to this License and 167 | to the disclaimer of warranties with every copy of the Work You 168 | Distribute or Publicly Perform. When You Distribute or Publicly 169 | Perform the Work, You may not impose any effective technological 170 | measures on the Work that restrict the ability of a recipient of the 171 | Work from You to exercise the rights granted to that recipient under 172 | the terms of the License. This Section 4(a) applies to the Work as 173 | incorporated in a Collection, but this does not require the Collection 174 | apart from the Work itself to be made subject to the terms of this 175 | License. 
If You create a Collection, upon notice from any Licensor You 176 | must, to the extent practicable, remove from the Collection any credit 177 | as required by Section 4(b), as requested. If You create an 178 | Adaptation, upon notice from any Licensor You must, to the extent 179 | practicable, remove from the Adaptation any credit as required by 180 | Section 4(b), as requested. 181 | b. If You Distribute, or Publicly Perform the Work or any Adaptations or 182 | Collections, You must, unless a request has been made pursuant to 183 | Section 4(a), keep intact all copyright notices for the Work and 184 | provide, reasonable to the medium or means You are utilizing: (i) the 185 | name of the Original Author (or pseudonym, if applicable) if supplied, 186 | and/or if the Original Author and/or Licensor designate another party 187 | or parties (e.g., a sponsor institute, publishing entity, journal) for 188 | attribution ("Attribution Parties") in Licensor's copyright notice, 189 | terms of service or by other reasonable means, the name of such party 190 | or parties; (ii) the title of the Work if supplied; (iii) to the 191 | extent reasonably practicable, the URI, if any, that Licensor 192 | specifies to be associated with the Work, unless such URI does not 193 | refer to the copyright notice or licensing information for the Work; 194 | and (iv) , consistent with Section 3(b), in the case of an Adaptation, 195 | a credit identifying the use of the Work in the Adaptation (e.g., 196 | "French translation of the Work by Original Author," or "Screenplay 197 | based on original Work by Original Author"). The credit required by 198 | this Section 4 (b) may be implemented in any reasonable manner; 199 | provided, however, that in the case of a Adaptation or Collection, at 200 | a minimum such credit will appear, if a credit for all contributing 201 | authors of the Adaptation or Collection appears, then as part of these 202 | credits and in a manner at least as prominent as the credits for the 203 | other contributing authors. For the avoidance of doubt, You may only 204 | use the credit required by this Section for the purpose of attribution 205 | in the manner set out above and, by exercising Your rights under this 206 | License, You may not implicitly or explicitly assert or imply any 207 | connection with, sponsorship or endorsement by the Original Author, 208 | Licensor and/or Attribution Parties, as appropriate, of You or Your 209 | use of the Work, without the separate, express prior written 210 | permission of the Original Author, Licensor and/or Attribution 211 | Parties. 212 | c. Except as otherwise agreed in writing by the Licensor or as may be 213 | otherwise permitted by applicable law, if You Reproduce, Distribute or 214 | Publicly Perform the Work either by itself or as part of any 215 | Adaptations or Collections, You must not distort, mutilate, modify or 216 | take other derogatory action in relation to the Work which would be 217 | prejudicial to the Original Author's honor or reputation. Licensor 218 | agrees that in those jurisdictions (e.g. 
Japan), in which any exercise 219 | of the right granted in Section 3(b) of this License (the right to 220 | make Adaptations) would be deemed to be a distortion, mutilation, 221 | modification or other derogatory action prejudicial to the Original 222 | Author's honor and reputation, the Licensor will waive or not assert, 223 | as appropriate, this Section, to the fullest extent permitted by the 224 | applicable national law, to enable You to reasonably exercise Your 225 | right under Section 3(b) of this License (right to make Adaptations) 226 | but not otherwise. 227 | 228 | 5. Representations, Warranties and Disclaimer 229 | 230 | UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, LICENSOR 231 | OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY 232 | KIND CONCERNING THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, 233 | INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY, 234 | FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF 235 | LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS, 236 | WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION 237 | OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU. 238 | 239 | 6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE 240 | LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR 241 | ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES 242 | ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS 243 | BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 244 | 245 | 7. Termination 246 | 247 | a. This License and the rights granted hereunder will terminate 248 | automatically upon any breach by You of the terms of this License. 249 | Individuals or entities who have received Adaptations or Collections 250 | from You under this License, however, will not have their licenses 251 | terminated provided such individuals or entities remain in full 252 | compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will 253 | survive any termination of this License. 254 | b. Subject to the above terms and conditions, the license granted here is 255 | perpetual (for the duration of the applicable copyright in the Work). 256 | Notwithstanding the above, Licensor reserves the right to release the 257 | Work under different license terms or to stop distributing the Work at 258 | any time; provided, however that any such election will not serve to 259 | withdraw this License (or any other license that has been, or is 260 | required to be, granted under the terms of this License), and this 261 | License will continue in full force and effect unless terminated as 262 | stated above. 263 | 264 | 8. Miscellaneous 265 | 266 | a. Each time You Distribute or Publicly Perform the Work or a Collection, 267 | the Licensor offers to the recipient a license to the Work on the same 268 | terms and conditions as the license granted to You under this License. 269 | b. Each time You Distribute or Publicly Perform an Adaptation, Licensor 270 | offers to the recipient a license to the original Work on the same 271 | terms and conditions as the license granted to You under this License. 272 | c. 
If any provision of this License is invalid or unenforceable under 273 | applicable law, it shall not affect the validity or enforceability of 274 | the remainder of the terms of this License, and without further action 275 | by the parties to this agreement, such provision shall be reformed to 276 | the minimum extent necessary to make such provision valid and 277 | enforceable. 278 | d. No term or provision of this License shall be deemed waived and no 279 | breach consented to unless such waiver or consent shall be in writing 280 | and signed by the party to be charged with such waiver or consent. 281 | e. This License constitutes the entire agreement between the parties with 282 | respect to the Work licensed here. There are no understandings, 283 | agreements or representations with respect to the Work not specified 284 | here. Licensor shall not be bound by any additional provisions that 285 | may appear in any communication from You. This License may not be 286 | modified without the mutual written agreement of the Licensor and You. 287 | f. The rights granted under, and the subject matter referenced, in this 288 | License were drafted utilizing the terminology of the Berne Convention 289 | for the Protection of Literary and Artistic Works (as amended on 290 | September 28, 1979), the Rome Convention of 1961, the WIPO Copyright 291 | Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996 292 | and the Universal Copyright Convention (as revised on July 24, 1971). 293 | These rights and subject matter take effect in the relevant 294 | jurisdiction in which the License terms are sought to be enforced 295 | according to the corresponding provisions of the implementation of 296 | those treaty provisions in the applicable national law. If the 297 | standard suite of rights granted under applicable copyright law 298 | includes additional rights not granted under this License, such 299 | additional rights are deemed to be included in the License; this 300 | License is not intended to restrict the license of any rights under 301 | applicable law. 302 | 303 | 304 | Creative Commons Notice 305 | 306 | Creative Commons is not a party to this License, and makes no warranty 307 | whatsoever in connection with the Work. Creative Commons will not be 308 | liable to You or any party on any legal theory for any damages 309 | whatsoever, including without limitation any general, special, 310 | incidental or consequential damages arising in connection to this 311 | license. Notwithstanding the foregoing two (2) sentences, if Creative 312 | Commons has expressly identified itself as the Licensor hereunder, it 313 | shall have all rights and obligations of Licensor. 314 | 315 | Except for the limited purpose of indicating to the public that the 316 | Work is licensed under the CCPL, Creative Commons does not authorize 317 | the use by either party of the trademark "Creative Commons" or any 318 | related trademark or logo of Creative Commons without the prior 319 | written consent of Creative Commons. Any permitted use will be in 320 | compliance with Creative Commons' then-current trademark usage 321 | guidelines, as may be published on its website or otherwise made 322 | available upon request from time to time. For the avoidance of doubt, 323 | this trademark restriction does not form part of this License. 324 | 325 | Creative Commons may be contacted at https://creativecommons.org/. 
326 | -------------------------------------------------------------------------------- /reading_comprehension/corpus/bAbI/README.txt: -------------------------------------------------------------------------------- 1 | Towards AI Complete Question Answering: A Set of Prerequisite Toy Tasks 2 | ----------------------------------------------------------------------- 3 | In this directory is the first set of 20 tasks for testing text understanding and reasoning in the bAbI project. 4 | The aim is that each task tests a unique aspect of text and reasoning, and hence test different capabilities of learning models. More tasks are planned in the future to capture more aspects. 5 | 6 | For each task, there are 1000 questions for training, and 1000 for testing. 7 | However, we emphasize that the goal is still to use as little data as possible to do well on the task (i.e. if you can use less than 1000 that's even better) -- and without resorting to engineering task-specific tricks that will not generalize to other tasks, as they may not be of much use subsequently. Note that the aim during evaluation is to use the _same_ learner across all tasks to evaluate its skills and capabilities. 8 | Further while the MemNN results in the paper use full supervision (including of the supporting facts) results with weak supervision would also be ultimately preferable as this kind of data is easier to collect. Hence results of that form are very welcome. 9 | 10 | For the reasons above there are currently several directories: 11 | 12 | 1) en/ -- the tasks in English, readable by humans. 13 | 2) hn/ -- the tasks in Hindi, readable by humans. 14 | 3) shuffled/ -- the same tasks with shuffled letters so they are not readable by humans, and for existing parsers and taggers cannot be used in a straight-forward fashion to leverage extra resources-- in this case the learner is more forced to rely on the given training data. This mimics a learner being first presented a language and having to learn from scratch. 15 | 4) en-10k/ shuffled-10k/ and hn-10k/ -- the same tasks in the three formats, but with 10,000 training examples, rather than 1000 training examples. 16 | 5) en-valid/ and en-valid-10k/ are the same as en/ and en10k/ except the train sets have been conveniently split into train and valid portions (90% and 10% split). 17 | 18 | The file format for each task is as follows: 19 | ID text 20 | ID text 21 | ID text 22 | ID question[tab]answer[tab]supporting fact IDS. 23 | ... 24 | 25 | The IDs for a given "story" start at 1 and increase. 26 | When the IDs in a file reset back to 1 you can consider the following sentences as a new "story". 27 | Supporting fact IDs only ever reference the sentences within a "story". 28 | 29 | For Example: 30 | 1 Mary moved to the bathroom. 31 | 2 John went to the hallway. 32 | 3 Where is Mary? bathroom 1 33 | 4 Daniel went back to the hallway. 34 | 5 Sandra moved to the garden. 35 | 6 Where is Daniel? hallway 4 36 | 7 John moved to the office. 37 | 8 Sandra journeyed to the bathroom. 38 | 9 Where is Daniel? hallway 4 39 | 10 Mary moved to the hallway. 40 | 11 Daniel travelled to the office. 41 | 12 Where is Daniel? office 11 42 | 13 John went back to the garden. 43 | 14 John moved to the bedroom. 44 | 15 Where is Sandra? bathroom 8 45 | 1 Sandra travelled to the office. 46 | 2 Sandra went to the bathroom. 47 | 3 Where is Sandra? bathroom 2 48 | 49 | Changes between versions. 50 | ========================= 51 | V1.2 (this version) - Added Hindi versions of all the tasks. 
Fixed some problems with task 16, and added a separate set of directories for 10k training data, as we received requests for this. 52 | V1.1 (this version) - Fixed some problems with task 3, and reduced the training set size available to 1000 as this matches the results in the paper cited above, in order to avoid confusion. 53 | -------------------------------------------------------------------------------- /reading_comprehension/corpus/reading_comprehension.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SeanLee97/nlp_learning/82f158f63c7b943dabc0fb18ed7ebde5c655214a/reading_comprehension/corpus/reading_comprehension.png -------------------------------------------------------------------------------- /reading_comprehension/corpus/result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SeanLee97/nlp_learning/82f158f63c7b943dabc0fb18ed7ebde5c655214a/reading_comprehension/corpus/result.png -------------------------------------------------------------------------------- /reading_comprehension/dataset.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # const.python # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | from glob import glob 11 | from torch.utils.data.dataset import Dataset 12 | from torch.utils.data import DataLoader 13 | from torch.utils.data.dataloader import default_collate 14 | import re 15 | import numpy as np 16 | import const 17 | 18 | class adict(dict): 19 | def __init__(self, *av, **kav): 20 | dict.__init__(self, *av, **kav) 21 | self.__dict__ = self 22 | 23 | def pad_collate(batch): 24 | max_context_sen_len = float('-inf') 25 | max_context_len = float('-inf') 26 | max_question_len = float('-inf') 27 | for elem in batch: 28 | context, question, _ = elem 29 | max_context_len = max_context_len if max_context_len > len(context) else len(context) 30 | max_question_len = max_question_len if max_question_len > len(question) else len(question) 31 | for sen in context: 32 | max_context_sen_len = max_context_sen_len if max_context_sen_len > len(sen) else len(sen) 33 | max_context_len = min(max_context_len, 70) 34 | for i, elem in enumerate(batch): 35 | _context, question, answer = elem 36 | _context = _context[-max_context_len:] 37 | context = np.zeros((max_context_len, max_context_sen_len)) 38 | for j, sen in enumerate(_context): 39 | context[j] = np.pad(sen, (0, max_context_sen_len - len(sen)), 'constant', constant_values=0) 40 | question = np.pad(question, (0, max_question_len - len(question)), 'constant', constant_values=0) 41 | batch[i] = (context, question, answer) 42 | return default_collate(batch) 43 | 44 | class BabiDataset(Dataset): 45 | def __init__(self, task_id, mode='train'): 46 | self.vocab_path = 'dataset/babi{}_vocab.pkl'.format(task_id) 47 | self.mode = mode 48 | raw_train, raw_test = get_raw_babi(task_id) 49 | self.QA = adict() 50 | self.QA.VOCAB = {const.pad: 0, const.eos: 1} 51 | self.QA.IVOCAB = {0: const.pad, 1: const.eos} 52 | self.train = self.get_indexed_qa(raw_train) 53 | self.valid = [self.train[i][int(-len(self.train[i])/10):] for i in range(3)] 54 | self.train = [self.train[i][:int(9 * len(self.train[i])/10)] for i in range(3)] 55 | self.test = self.get_indexed_qa(raw_test) 56 | 57 | def 
set_mode(self, mode): 58 | self.mode = mode 59 | 60 | def __len__(self): 61 | if self.mode == 'train': 62 | return len(self.train[0]) 63 | elif self.mode == 'valid': 64 | return len(self.valid[0]) 65 | elif self.mode == 'test': 66 | return len(self.test[0]) 67 | 68 | def __getitem__(self, index): 69 | if self.mode == 'train': 70 | contexts, questions, answers = self.train 71 | elif self.mode == 'valid': 72 | contexts, questions, answers = self.valid 73 | elif self.mode == 'test': 74 | contexts, questions, answers = self.test 75 | return contexts[index], questions[index], answers[index] 76 | 77 | def get_indexed_qa(self, raw_babi): 78 | unindexed = get_unindexed_qa(raw_babi) 79 | questions = [] 80 | contexts = [] 81 | answers = [] 82 | for qa in unindexed: 83 | context = [c.lower().split() + [const.eos] for c in qa['C']] 84 | 85 | for con in context: 86 | for token in con: 87 | self.build_vocab(token) 88 | context = [[self.QA.VOCAB[token] for token in sentence] for sentence in context] 89 | question = qa['Q'].lower().split() + [const.eos] 90 | 91 | for token in question: 92 | self.build_vocab(token) 93 | question = [self.QA.VOCAB[token] for token in question] 94 | 95 | self.build_vocab(qa['A'].lower()) 96 | answer = self.QA.VOCAB[qa['A'].lower()] 97 | 98 | 99 | contexts.append(context) 100 | questions.append(question) 101 | answers.append(answer) 102 | return (contexts, questions, answers) 103 | 104 | def build_vocab(self, token): 105 | if not token in self.QA.VOCAB: 106 | next_index = len(self.QA.VOCAB) 107 | self.QA.VOCAB[token] = next_index 108 | self.QA.IVOCAB[next_index] = token 109 | 110 | 111 | def get_raw_babi(taskid): 112 | paths = glob('corpus/bAbI/en-10k/qa{}_*'.format(taskid)) 113 | for path in paths: 114 | if 'train' in path: 115 | with open(path, 'r') as fp: 116 | train = fp.read() 117 | elif 'test' in path: 118 | with open(path, 'r') as fp: 119 | test = fp.read() 120 | return train, test 121 | 122 | def build_vocab(raw_babi): 123 | lowered = raw_babi.lower() 124 | tokens = re.findall('[a-zA-Z]+', lowered) 125 | types = set(tokens) 126 | return types 127 | 128 | # adapted from https://github.com/YerevaNN/Dynamic-memory-networks-in-Theano/ 129 | def get_unindexed_qa(raw_babi): 130 | tasks = [] 131 | task = None 132 | babi = raw_babi.strip().split('\n') 133 | for i, line in enumerate(babi): 134 | id = int(line[0:line.find(' ')]) 135 | if id == 1: 136 | task = {"C": "", "Q": "", "A": "", "S": ""} 137 | counter = 0 138 | id_map = {} 139 | 140 | line = line.strip() 141 | line = line.replace('.', ' . 
') 142 | line = line[line.find(' ')+1:] 143 | # if not a question 144 | if line.find('?') == -1: 145 | task["C"] += line + '<line>'  # '<line>' marks sentence boundaries 146 | id_map[id] = counter 147 | counter += 1 148 | else: 149 | idx = line.find('?') 150 | tmp = line[idx+1:].split('\t') 151 | task["Q"] = line[:idx] 152 | task["A"] = tmp[1].strip() 153 | task["S"] = [] # Supporting facts 154 | for num in tmp[2].split(): 155 | task["S"].append(id_map[int(num.strip())]) 156 | tc = task.copy() 157 | tc['C'] = tc['C'].split('<line>')[:-1] 158 | tasks.append(tc) 159 | return tasks 160 | 161 | if __name__ == '__main__': 162 | dset_train = BabiDataset(20, mode='train') 163 | train_loader = DataLoader(dset_train, batch_size=2, shuffle=True, collate_fn=pad_collate) 164 | for batch_idx, data in enumerate(train_loader): 165 | contexts, questions, answers = data 166 | break -------------------------------------------------------------------------------- /reading_comprehension/main.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # const.python # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import const 11 | import numpy as np 12 | import torch 13 | import torch.nn as nn 14 | import torch.optim as optim 15 | from model import * 16 | from dataset import * 17 | 18 | import argparse 19 | parser = argparse.ArgumentParser(description='gca main.py') 20 | parser.add_argument('-train', action='store_true', default=False, help='train model') 21 | parser.add_argument('-test', action='store_true', default=False, help='test model') 22 | parser.add_argument('-evaluate', action='store_true', default=False, help='evaluate') 23 | args = parser.parse_args() 24 | 25 | def train(): 26 | dataset = BabiDataset(const.task_id) 27 | model = ReaderNet(len(dataset.QA.VOCAB), const.hidden_size) 28 | model = load_model(model) 29 | model = model.cuda() if const.use_cuda else model 30 | optimizer = optim.Adam(model.parameters(), lr=const.lr_rate) 31 | 32 | best_acc = 0 33 | for epoch in range(const.epochs): 34 | model.train() 35 | total_acc = 0.0 36 | cnt = 0 37 | dataset.set_mode('train') 38 | train_loader = DataLoader(dataset, batch_size=const.batch_size, shuffle=True, collate_fn=pad_collate) 39 | losses = [] 40 | for batch_idx, data in enumerate(train_loader): 41 | optimizer.zero_grad() 42 | 43 | contexts, questions, answers = data 44 | 45 | contexts = contexts.long() 46 | contexts = contexts.cuda() if const.use_cuda else contexts 47 | contexts = Variable(contexts) 48 | 49 | questions = questions.long() 50 | questions = questions.cuda() if const.use_cuda else questions 51 | questions = Variable(questions) 52 | 53 | #answers = answers.long() 54 | answers = answers.cuda() if const.use_cuda else answers 55 | answers = Variable(answers) 56 | 57 | loss, acc = model.get_loss(contexts, questions, answers) 58 | losses.append(loss.data[0]) 59 | total_acc += acc * const.batch_size 60 | if batch_idx % 50 == 0: 61 | print('loss', np.mean(losses)) 62 | losses = [] 63 | cnt += const.batch_size 64 | loss.backward() 65 | optimizer.step() 66 | 67 | dataset.set_mode('valid') 68 | valid_loader = DataLoader( 69 | dataset, batch_size=const.batch_size, shuffle=False, collate_fn=pad_collate 70 | ) 71 | 72 | model.eval() 73 | total_acc = 0.0 74 | cnt = 0 75 | for batch_idx, data in enumerate(valid_loader): 76 | contexts, questions, answers = data 77 | batch_size = 
contexts.size()[0] 78 | 79 | contexts = contexts.long() 80 | contexts = contexts.cuda() if const.use_cuda else contexts 81 | contexts = Variable(contexts) 82 | 83 | questions = questions.long() 84 | questions = questions.cuda() if const.use_cuda else questions 85 | questions = Variable(questions) 86 | 87 | answers = answers.cuda() if const.use_cuda else answers 88 | answers = Variable(answers) 89 | 90 | _, acc = model.get_loss(contexts, questions, answers) 91 | total_acc += acc * const.batch_size 92 | cnt += const.batch_size 93 | 94 | total_acc = total_acc / cnt 95 | print('accuracy: %.4f' % total_acc) 96 | if total_acc > best_acc: 97 | best_acc = total_acc 98 | best_state = model.state_dict() 99 | save_model(model) 100 | print('save model') 101 | 102 | def evaluate(): 103 | dataset = BabiDataset(const.task_id) 104 | model = ReaderNet(len(dataset.QA.VOCAB), const.hidden_size) 105 | model = load_model(model) 106 | model = model.cuda() if const.use_cuda else model 107 | 108 | model.eval() 109 | dataset.set_mode('test') 110 | test_loader = DataLoader( 111 | dataset, batch_size=1, shuffle=True, collate_fn=pad_collate 112 | ) 113 | for batch_idx, data in enumerate(test_loader): 114 | contexts, questions, answers = data 115 | 116 | print(contexts.size()) 117 | print(questions.size()) 118 | print(answers.size()) 119 | contexts_raw = [] 120 | for cont in contexts.numpy().tolist()[0]: 121 | c = [] 122 | [c.append(dataset.QA.IVOCAB[w]) for w in cont] 123 | contexts_raw.append(c) 124 | 125 | q_raw = [] 126 | [q_raw.append(dataset.QA.IVOCAB[w]) for w in questions.numpy().tolist()[0]] 127 | 128 | a_raw = dataset.QA.IVOCAB[answers.numpy().tolist()[0]] 129 | 130 | print('\n>facts: ') 131 | for cont in contexts_raw: 132 | print(cont) 133 | 134 | contexts = contexts.long() 135 | contexts = contexts.cuda() if const.use_cuda else contexts 136 | contexts = Variable(contexts) 137 | while True: 138 | question = input('\n>input your question: ') 139 | questions = list(map(lambda w: dataset.QA.VOCAB[w] if w in dataset.QA.VOCAB else dataset.QA.VOCAB[const.pad], question.split(' '))) 140 | 141 | #print(questions) 142 | 143 | questions = torch.LongTensor(questions) 144 | questions = questions.cuda() if const.use_cuda else questions 145 | questions = Variable(questions).unsqueeze(0) 146 | 147 | pred = model.predict(contexts, questions) 148 | print(">pred: ", dataset.QA.IVOCAB[pred]) 149 | 150 | break; 151 | pass 152 | 153 | if args.train: 154 | train() 155 | elif args.evaluate: 156 | evaluate() -------------------------------------------------------------------------------- /reading_comprehension/model.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # const.python # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import os 11 | import const 12 | import torch 13 | import torch.nn as nn 14 | import torch.nn.init as init 15 | import torch.nn.functional as F 16 | from torch.autograd import Variable 17 | 18 | def save_model(model, path=f'models/model.pth'): 19 | os.makedirs('models', exist_ok=True) 20 | with open(path, 'wb') as fp: 21 | torch.save(model.state_dict(), fp) 22 | 23 | def load_model(model, path=f'models/model.pth'): 24 | if not os.path.exists(path): 25 | return model 26 | model.load_state_dict(torch.load(path)) 27 | return model 28 | 29 | def position_encoding(embedded_sentence): 30 | ''' 31 
| embedded_sentence.size() -> (#batch, #sentence, #token, #embedding) 32 | l.size() -> (#sentence, #embedding) 33 | output.size() -> (#batch, #sentence, #embedding) 34 | ''' 35 | _, _, slen, elen = embedded_sentence.size() 36 | 37 | l = [[(1 - s/(slen-1)) - (e/(elen-1)) * (1 - 2*s/(slen-1)) for e in range(elen)] for s in range(slen)] 38 | l = torch.FloatTensor(l) 39 | l = l.unsqueeze(0) # for #batch 40 | l = l.unsqueeze(1) # for #sen 41 | l = l.expand_as(embedded_sentence) 42 | l = l.cuda() if const.use_cuda else l 43 | weighted = embedded_sentence * Variable(l) 44 | return torch.sum(weighted, dim=2).squeeze(2) # sum with tokens 45 | 46 | class InputNet(nn.Module): 47 | def __init__(self, input_size, hidden_size): 48 | super(InputNet, self).__init__() 49 | self.hidden_size = hidden_size 50 | self.gru = nn.GRU(hidden_size, hidden_size, bidirectional=True, batch_first=True) 51 | for name, param in self.gru.state_dict().items(): 52 | if 'weight' in name: init.xavier_normal(param) 53 | self.dropout = nn.Dropout(0.1) 54 | 55 | def forward(self, contexts, embedding): 56 | ''' 57 | contexts.size() -> (#batch, #sentence, #token) 58 | embedding() -> (#batch, #sentence x #token, #embedding) 59 | position_encoding() -> (#batch, #sentence, #embedding) 60 | facts.size() -> (#batch, #sentence, #hidden = #embedding) 61 | ''' 62 | batch_size, sen_size, token_size = contexts.size() 63 | 64 | contexts = contexts.view(batch_size, -1) 65 | contexts = embedding(contexts) 66 | 67 | contexts = contexts.view(batch_size, sen_size, token_size, -1) 68 | contexts = position_encoding(contexts) 69 | contexts = self.dropout(contexts) 70 | 71 | # init hidden 72 | h0 = torch.zeros(2, batch_size, self.hidden_size) 73 | h0 = h0.cuda() if const.use_cuda else h0 74 | h0 = Variable(h0) 75 | 76 | facts, hdn = self.gru(contexts, h0) 77 | facts = facts[:, :, :self.hidden_size] + facts[:, :, self.hidden_size:] 78 | return facts 79 | 80 | class QuestionNet(nn.Module): 81 | def __init__(self, input_size, hidden_size): 82 | super(QuestionNet, self).__init__() 83 | self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True) 84 | 85 | def forward(self, questions, embedding): 86 | ''' 87 | questions.size() -> (#batch, #token) 88 | embedding() -> (#batch, #token, #embedding) 89 | gru() -> (1, #batch, #hidden) 90 | ''' 91 | questions = embedding(questions) 92 | _, questions = self.gru(questions) # last hidden as questions, (num_layers * num_directions, B, hidden_size) 93 | 94 | questions = questions.transpose(0, 1) # B x 1 x hidden_size 95 | return questions 96 | 97 | class AttnNet(nn.Module): 98 | def __init__(self, hidden_size): 99 | super(AttnNet, self).__init__() 100 | self.hidden_size = hidden_size 101 | 102 | def forward(self, questions, facts): 103 | batch_size, seqnum, _ = facts.size() 104 | 105 | attn_energies = Variable(torch.zeros(batch_size, seqnum)) # B x S 106 | for b in range(batch_size): 107 | for i in range(seqnum): 108 | attn_energies[b, i] = self.score(facts[b, i], questions[b]) # calc Ct 109 | 110 | attn_energies = attn_energies.cuda() if const.use_cuda else attn_energies 111 | return F.softmax(attn_energies.unsqueeze(1)) 112 | 113 | def score(self, fact, question): 114 | energy = fact.dot(question) 115 | return energy 116 | 117 | class ReaderNet(nn.Module): 118 | def __init__(self, input_size, hidden_size, dropout_p=0.1): 119 | super(ReaderNet, self).__init__() 120 | 121 | self.hidden_size = hidden_size 122 | self.embedding = nn.Embedding(input_size, hidden_size) 123 | self.embedding = self.embedding.cuda() if 
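The docstring and formula above define the DMN-style positional weighting. The toy snippet below (a standalone sketch with made-up sizes slen=3, elen=4) just prints the weight matrix that position_encoding() multiplies into the token embeddings before summing over tokens.

```python
# Toy illustration of the positional weights used in position_encoding() above
# (standalone sketch; slen/elen are made-up sizes, not taken from the model).
import torch

slen, elen = 3, 4  # tokens per sentence, embedding dimensions
l = [[(1 - s / (slen - 1)) - (e / (elen - 1)) * (1 - 2 * s / (slen - 1))
      for e in range(elen)] for s in range(slen)]
print(torch.FloatTensor(l))
# row s weights token position s; multiplying the embeddings by these weights
# and summing over tokens yields one vector per sentence
```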
const.use_cuda else self.embedding 124 | init.uniform(self.embedding.state_dict()['weight'], a=-1.0, b=1.0) 125 | 126 | self.input_net = InputNet(input_size, hidden_size) 127 | self.question_net = QuestionNet(input_size, hidden_size) 128 | self.attn_net = AttnNet(hidden_size) 129 | self.h2o = nn.Linear(hidden_size, input_size) 130 | 131 | self.criterion = nn.CrossEntropyLoss() 132 | 133 | def forward(self, contexts, questions): 134 | facts = self.input_net(contexts, self.embedding) 135 | questions = self.question_net(questions, self.embedding).squeeze(1) 136 | 137 | facts_attn = self.attn_net(questions, facts) 138 | facts = torch.bmm(facts_attn, facts).squeeze(1) 139 | 140 | outputs = questions * facts 141 | outputs = self.h2o(F.tanh(outputs)) 142 | return outputs 143 | 144 | def get_loss(self, contexts, questions, targets): 145 | output = self.forward(contexts, questions) 146 | loss = self.criterion(output.view(targets.size(0), -1), targets) 147 | reg_loss = 0 148 | for param in self.parameters(): 149 | reg_loss += 0.001 * torch.sum(param * param) 150 | preds = F.softmax(output) 151 | _, pred_ids = torch.max(preds, dim=1) 152 | corrects = (pred_ids.data == targets.data) 153 | acc = torch.mean(corrects.float()) 154 | return loss + reg_loss, acc 155 | 156 | def predict(self, contexts, questions): 157 | output = self.forward(contexts, questions) 158 | preds = F.softmax(output) 159 | _, pred_ids = torch.max(preds, dim=1) 160 | pred_value, pred_ids = torch.topk(preds, 1) 161 | return pred_ids.data.tolist()[0][0] -------------------------------------------------------------------------------- /text_similarity/vsm_sim.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import jieba 5 | 6 | class Sim(object): 7 | def __init__(self, kernel='tfidf'): 8 | self.word2idx = {} 9 | self.kernel = kernel 10 | 11 | def tokenizer(self, sent): 12 | return jieba.lcut(sent) 13 | 14 | def calc_bow(self, docs): 15 | bow = np.zeros([len(docs), len(self.word2idx)]) 16 | for docidx, words in enumerate(docs): 17 | for word in words: 18 | if word in self.word2idx: 19 | bow[docidx, self.word2idx[word]] += 1 20 | return bow 21 | 22 | def calc_tfidf(self, docs): 23 | tf = self.calc_bow(docs) 24 | df = np.ones([1, len(self.word2idx)]) 25 | 26 | for docidx, words in enumerate(docs): 27 | tf[docidx] /= np.max(tf[docidx]) 28 | for word in words: 29 | if word in self.word2idx: 30 | df[0, self.word2idx[word]] += 1 31 | idf = np.log(len(docs)) - np.log(df) 32 | tfidf = tf * idf 33 | return tfidf 34 | 35 | def cos(self, vec1, vec2): 36 | cos = np.dot(vec1, vec2) / (np.linalg.norm(vec1)*np.linalg.norm(vec2)) 37 | try: 38 | cos = np.dot(vec1, vec2) / (np.linalg.norm(vec1)*np.linalg.norm(vec2)) 39 | except: 40 | cos = None 41 | 42 | return cos 43 | 44 | def similarity(self, doc1, doc2): 45 | words1 = self.tokenizer(doc1) 46 | words2 = self.tokenizer(doc2) 47 | 48 | words = set(words1) | set(words2) 49 | self.word2idx = dict(zip(words, range(len(words)))) 50 | 51 | if self.kernel == 'tfidf': 52 | feature = self.calc_tfidf 53 | else: 54 | feature = self.calc_bow 55 | 56 | vec = feature([words1, words2]) 57 | vec1 = vec[0] 58 | vec2 = vec[1] 59 | 60 | return self.cos(vec1, vec2) 61 | 62 | if __name__ == '__main__': 63 | doc1 = """计算机科学(英语:computer science,有时缩写为CS)是系统性研究信息与计算的理论基础以及它们在计算机系统中如何实现与应用的实用技术的学科。 64 | [1] [2]它通常被形容为对那些创造、描述以及转换信息的算法处理的系统研究。 65 | 计算机科学包含很多分支领域;有些强调特定结果的计算,比如计算机图形学; 66 | 
而有些是探讨计算问题的性质，比如计算复杂性理论；还有一些领域专注于怎样实现计算，比如编程语言理论是研究描述计算的方法， 67 | 而程序设计是应用特定的编程语言解决特定的计算问题，人机交互则是专注于怎样使计算机和计算变得有用、好用，以及随时随地为人所用。""" 68 | 69 | doc2 = """自然语言处理（英语：natural language processing，缩写作 NLP）是人工智能和语言学领域的分支学科。此领域探讨如何处理及运用自然语言；自然语言认知则是指让电脑"懂"人类的语言。 70 | 自然语言生成系统把计算机数据转化为自然语言。自然语言理解系统把自然语言转化为计算机程序更易于处理的形式。""" 71 | sim = Sim() 72 | print(sim.similarity(doc1, doc2)) -------------------------------------------------------------------------------- /word2vec/README.md: -------------------------------------------------------------------------------- 1 | ### Word2vec 2 | 3 | word2vec is one of the workhorses of NLP: it can be used for many things, such as finding synonyms, relation mining, and transfer learning. 4 | 5 | To help readers understand word2vec better, the main models are implemented here with deep learning frameworks: 6 | * CBOW naive softmax, CBOW negative_sampling 7 | * Skipgram naive softmax, Skipgram negative_sampling 8 | 9 | Since readers may use different deep learning frameworks, two frameworks are covered: 10 | * pytorch 11 | * tensorflow 12 | 13 | Note: the code is for learning purposes only and should be used in real projects with caution; for production work, mature solutions are recommended: 14 | * gensim word2vec 15 | * word2vec 16 | 17 | ### Result 18 | ![result.png](./corpus/result.png) 19 | 20 | ### Recommended 21 | * [Distributed Representations of Words and Phrases and their Compositionality](https://arxiv.org/abs/1310.4546) 22 | * [word2vec Parameter Learning Explained](https://arxiv.org/abs/1411.2738) 23 | * [The mathematics behind word2vec (in Chinese)](http://blog.csdn.net/itplus/article/details/37969519) 24 | 25 | ### Reference 26 | Parts of the code are adapted from: 27 | * [CBOW_on_TensorFlow](https://github.com/edugp/CBOW_on_TensorFlow/blob/master/CBOW.ipynb) 28 | * [Skip-gram with naive softmax](https://nbviewer.jupyter.org/github/DSKSD/DeepNLP-models-Pytorch/blob/master/notebooks/01.Skip-gram-Naive-Softmax.ipynb) 29 | -------------------------------------------------------------------------------- /word2vec/cbow/pytorch/negative_sampling/cbow.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # cbow # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | 14 | class Cbow(nn.Module): 15 | def __init__(self, input_size, projection_size): 16 | super(Cbow, self).__init__() 17 | self.V = nn.Embedding(input_size, projection_size) 18 | self.U = nn.Embedding(input_size, projection_size) 19 | self.logsigmoid = nn.LogSigmoid() 20 | 21 | initrange = (2.0 / (input_size + projection_size))**0.5 # Xavier-style bound 22 | self.V.weight.data.uniform_(-initrange, initrange) 23 | self.U.weight.data.uniform_(-0.0, 0.0) # zero 24 | 25 | def forward(self, center_words, target_words, neg_words): 26 | v = self.V(center_words) # batch_size x 1 x projection_size 27 | u = self.U(target_words) # batch_size x 1 x projection_size 28 | u_neg = -self.U(neg_words) 29 | 30 | pos_score = u.bmm(v.transpose(1, 2)).squeeze(2) # batch_size x 1 31 | neg_score = torch.sum(u_neg.bmm(v.transpose(1, 2)).squeeze(2), 1).view(neg_words.size(0), -1) # batch_size x input_size 32 | 33 | return self.loss(pos_score, neg_score) 34 | 35 | def loss(self, pos_score, neg_score): 36 | loss = self.logsigmoid(pos_score) + self.logsigmoid(neg_score) 37 | return -torch.mean(loss) 38 | 39 | def pred(self, inp): 40 | return self.V(inp) 41 | 42 | -------------------------------------------------------------------------------- /word2vec/cbow/pytorch/negative_sampling/const.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # 
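As the README above notes, mature implementations are preferable for real projects. The sketch below shows the gensim route it recommends (assumes gensim 4.x and jieba; the corpus path is relative to the word2vec/ directory, and the word queried at the end is only a placeholder).

```python
# Minimal gensim sketch of the CBOW + negative-sampling setup implemented in
# this directory (assumes gensim 4.x; path relative to word2vec/).
import jieba
from gensim.models import Word2Vec

def load_sentences(path='corpus/articles.txt'):
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                yield jieba.lcut(line)  # tokenize Chinese text, as dataset.py does

model = Word2Vec(sentences=list(load_sentences()), vector_size=100, window=5,
                 sg=0, negative=10, min_count=3, workers=4)  # sg=0 selects CBOW
print(model.wv.most_similar('计算机', topn=10))  # replace with any word from the corpus
```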
-------------------------------------------# 5 | # const.python # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | S_TOKEN = '' # start token 11 | E_TOKEN = '' # end token 12 | U_TOKEN = '' # unknown token 13 | D_TOKEN = '' # dummy token 14 | 15 | WIN_SIZE = 4 # window size 16 | SKIP_WIN = 2 # skip window siaze 17 | Z = 0.01 18 | 19 | # nnwork 20 | EMBEDDING_SIZE = 100 21 | BATCH_SIZE = 128 22 | EPOCH = 10000 23 | LR_RATE = 0.001 24 | NEG = 10 # Num of Negative Sampling 25 | -------------------------------------------------------------------------------- /word2vec/cbow/pytorch/negative_sampling/dataset.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # dataset.py # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import const 11 | import re 12 | import random 13 | import numpy as np 14 | import nltk 15 | import jieba 16 | 17 | import torch 18 | from torch.autograd import Variable 19 | 20 | import collections 21 | from collections import defaultdict, Counter 22 | 23 | def rm_sign(string): 24 | string = re.sub("[\.\!_,\$\(\)\"\'\]\[!!\?,。?、~@#¥……&]+", "", string) 25 | return string 26 | 27 | def load_data(corpus_dir = '../../../corpus/articles.txt'): 28 | with open(corpus_dir, 'r') as f: 29 | for line in f: 30 | line = line.strip() 31 | if len(line) == 0: 32 | continue 33 | yield jieba.lcut(rm_sign(line)) 34 | 35 | class Corpus(object): 36 | def __init__(self, data): 37 | 38 | # data sample 39 | data_split = len(data) // 10 40 | neg_data = data[-data_split:] 41 | data = data[:1-data_split] 42 | flatten = lambda l: [item.lower() for sublist in l for item in sublist] 43 | 44 | self.neg_vocab = list(set(flatten(neg_data))) 45 | 46 | word_count = Counter(flatten(data)) 47 | self.word2idx = {const.U_TOKEN: 0} 48 | self.n_words = 1 49 | for word, _ in word_count.items(): 50 | self.word2idx[word] = self.n_words 51 | self.n_words += 1 52 | self.idx2word = dict(zip(self.word2idx.values(), self.word2idx.keys())) 53 | self.vocab = list(self.word2idx.keys()) 54 | 55 | # unigram_table 56 | vocab_total_words = sum([c for w, c in word_count.items() if w not in self.neg_vocab]) 57 | self.unigram_table = [] 58 | for v in self.vocab: 59 | self.unigram_table.extend([v]*int(((word_count[v]/vocab_total_words)**(3/4))/const.Z)) 60 | 61 | # @return batch data 62 | # @generator 63 | def batch_data(self): 64 | batch_size = const.BATCH_SIZE * const.WIN_SIZE 65 | data = self.vocab 66 | data_index = 0 67 | assert batch_size % const.WIN_SIZE == 0 68 | assert const.WIN_SIZE <= 2 * const.SKIP_WIN 69 | 70 | batch = np.ndarray(shape=(batch_size), dtype=np.int32) 71 | labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32) 72 | span = 2 * const.SKIP_WIN + 1 # [ const.SKIP_WIN target const.SKIP_WIN ] 73 | buffers = collections.deque(maxlen=span) 74 | 75 | for _ in range(span): 76 | buffers.append(data[data_index]) 77 | data_index = (data_index + 1) % len(data) 78 | 79 | for i in range(batch_size // const.WIN_SIZE): 80 | 81 | target = const.SKIP_WIN # target label at the center of the buffers 82 | targets_to_avoid = [const.SKIP_WIN] 83 | for j in range(const.WIN_SIZE): 84 | while target in targets_to_avoid: 85 | target = random.randint(0, span - 1) 86 | targets_to_avoid.append(target) 87 | batch[i * const.WIN_SIZE + j] = 
self.var_word(buffers[const.SKIP_WIN])[0] 88 | labels[i * const.WIN_SIZE + j, 0] = self.var_word(buffers[target])[0] 89 | buffers.append(data[data_index]) 90 | data_index = (data_index + 1) % len(data) 91 | 92 | label_CBOW = [] 93 | context_CBOW = [] 94 | for i in range(0,len(batch), const.WIN_SIZE): 95 | label_CBOW.append(batch[i]) 96 | context_CBOW.append([l[0] for l in labels[i:i+const.WIN_SIZE]]) 97 | return np.array(context_CBOW), np.array(label_CBOW).reshape(batch_size // const.WIN_SIZE, 1) 98 | 99 | def negative_sampling(self, targets): 100 | batch_size = targets.size(0) 101 | neg_samples = [] 102 | for i in range(batch_size): 103 | sample = [] 104 | target_idx = targets[i].data.tolist()[0] 105 | while len(sample) < const.NEG: 106 | if self.word2idx == target_idx: 107 | continue 108 | sample.append(random.choice(self.unigram_table)) 109 | neg_samples.append(Variable(torch.LongTensor(self.var_sentence(sample))).view(1, -1)) 110 | return torch.cat(neg_samples) 111 | 112 | # @input sentence [w1, w2, ... , wn] 113 | def var_sentence(self, sentence): 114 | idxs = list(map(lambda w: self.word2idx[w] if w in self.vocab else self.word2idx[const.U_TOKEN], sentence)) 115 | return idxs 116 | 117 | # @input word 118 | def var_word(self, word): 119 | idx = [self.word2idx[const.U_TOKEN]] 120 | if word in self.word2idx: 121 | idx = [self.word2idx[word]] 122 | return idx 123 | -------------------------------------------------------------------------------- /word2vec/cbow/pytorch/negative_sampling/main.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # main.py # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | 11 | import argparse 12 | parser = argparse.ArgumentParser(description='main.py') 13 | parser.add_argument('-train', action='store_true', default=False, help='train model') 14 | parser.add_argument('-retrain', action='store_true', default=False, help='train model') 15 | parser.add_argument('-test', action='store_true', default=False, help='test model') 16 | args = parser.parse_args() 17 | 18 | import const 19 | import numpy as np 20 | import torch 21 | import torch.optim as optim 22 | import torch.nn.functional as F 23 | from torch.autograd import Variable 24 | 25 | from dataset import Corpus, load_data 26 | from cbow import Cbow 27 | from utils import Utils 28 | 29 | def test(word, corpus, k=10): 30 | vocab = corpus.vocab 31 | model,_ = Utils.load_previous_model('model') 32 | target_V = model.pred(Variable(torch.LongTensor(corpus.var_word(word)))) 33 | scores=[] 34 | for i in range(len(vocab)): 35 | if vocab[i] == word or vocab[i] == const.U_TOKEN: 36 | continue 37 | vector = model.pred(Variable(torch.LongTensor(corpus.var_word(list(vocab)[i])))) 38 | cosine_sim = F.cosine_similarity(target_V, vector).data.tolist()[0] 39 | scores.append([vocab[i],cosine_sim]) 40 | return sorted(scores, key=lambda x: x[1], reverse=True)[:k] # sort by similarity 41 | 42 | def train(corpus): 43 | if args.retrain: 44 | Utils.remove_models('model') 45 | 46 | losses = [] 47 | 48 | start_epoch = 0 49 | model, start_epoch = Utils.load_previous_model('model') 50 | if model == None: 51 | model = Cbow(corpus.n_words, const.EMBEDDING_SIZE) 52 | 53 | if torch.cuda.is_available(): 54 | model.cuda() 55 | optimizer = optim.Adam(model.parameters(), const.LR_RATE) 56 | 57 | for epoch in range(start_epoch, 
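Corpus above builds `unigram_table` by repeating every word in proportion to count^(3/4) (scaled by 1/const.Z), so that `negative_sampling()` can draw negatives with a plain `random.choice`. A standalone toy version of that construction, with made-up counts:

```python
# Toy version of the 3/4-power unigram table used by Corpus.negative_sampling()
# (standalone sketch; the counts below are made up).
import random

word_count = {'the': 100, 'cat': 20, 'sat': 5}
Z = 0.01  # same role as const.Z: smaller Z -> larger table
total = sum(word_count.values())
unigram_table = []
for word, count in word_count.items():
    unigram_table.extend([word] * int(((count / total) ** 0.75) / Z))

print(len(unigram_table))                                # table size
print([random.choice(unigram_table) for _ in range(5)])  # five negative samples
```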
const.EPOCH): 58 | inputs, targets = corpus.batch_data() 59 | 60 | 61 | inputs = Variable(torch.from_numpy(inputs).long()) 62 | targets = Variable(torch.from_numpy(targets).long()) 63 | 64 | negs = corpus.negative_sampling(targets) 65 | #print(inputs.size(), targets.size(), vocabs.size()) 66 | #exit() 67 | model.zero_grad() 68 | loss = model(inputs, targets, negs) 69 | loss.backward() 70 | optimizer.step() 71 | 72 | losses.append(loss.data.tolist()[0]) 73 | if epoch % 100 == 0: 74 | print("Epoch : %d, mean_loss : %.02f" % (epoch , np.mean(losses))) 75 | Utils.save_model(model, epoch, 'model') 76 | losses = [] 77 | Utils.save_model(model, epoch, 'model') 78 | 79 | data = list(load_data()) 80 | corpus = Corpus(data) 81 | if args.train or args.retrain: 82 | train(corpus) 83 | elif args.test: 84 | word = input('Input word> ') 85 | print(test(word, corpus)) -------------------------------------------------------------------------------- /word2vec/cbow/pytorch/negative_sampling/utils.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # model utils # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | import torch 12 | import os, glob 13 | import numpy as np 14 | 15 | class Utils(object): 16 | 17 | @staticmethod 18 | def save_model(model, epoch, save_dir, max_keep=5): 19 | if not os.path.exists(save_dir): 20 | os.makedirs(save_dir) 21 | f_list = glob.glob(os.path.join(save_dir, 'model') + '_*.ckpt') 22 | if len(f_list) >= max_keep + 2: 23 | epoch_list = [int(i.split('_')[-1].split('.')[0]) for i in f_list] 24 | to_delete = [f_list[i] for i in np.argsort(epoch_list)[-max_keep:]] 25 | for f in to_delete: 26 | os.remove(f) 27 | name = 'model_{}.ckpt'.format(epoch) 28 | file_path = os.path.join(save_dir, name) 29 | #torch.save(model.state_dict(), file_path) 30 | torch.save(model, file_path) 31 | 32 | @staticmethod 33 | def load_previous_model(save_dir): 34 | if not os.path.exists(save_dir): 35 | os.makedirs(save_dir) 36 | f_list = glob.glob(os.path.join(save_dir, 'model') + '_*.ckpt') 37 | start_epoch = 1 38 | model = None 39 | if len(f_list) >= 1: 40 | epoch_list = [int(i.split('_')[-1].split('.')[0]) for i in f_list] 41 | last_checkpoint = f_list[np.argmax(epoch_list)] 42 | if os.path.exists(last_checkpoint): 43 | #print('load from {}'.format(last_checkpoint)) 44 | # CNN 不支持参数保存 45 | #model.load_state_dict(torch.load(last_checkpoint)) 46 | model = torch.load(last_checkpoint) 47 | start_epoch = np.max(epoch_list) 48 | return model, start_epoch 49 | 50 | @staticmethod 51 | def remove_models(save_dir): 52 | f_list = glob.glob(os.path.join(save_dir, 'model') + '_*.ckpt') 53 | f_list.append(os.path.join(save_dir, 'param.pkl')) 54 | f_list.append(os.path.join(save_dir, 'log.txt')) 55 | for filename in f_list: 56 | try: 57 | os.remove(filename) 58 | except: 59 | pass 60 | -------------------------------------------------------------------------------- /word2vec/cbow/pytorch/softmax/cbow.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # skipgram # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import torch 11 | import torch.nn as 
nn 12 | import torch.nn.functional as F 13 | 14 | class Cbow(nn.Module): 15 | def __init__(self, input_size, projection_size): 16 | super(Cbow, self).__init__() 17 | self.V = nn.Embedding(input_size, projection_size) 18 | self.U = nn.Embedding(input_size, projection_size) 19 | 20 | self.V.weight.data.uniform_(-1.0, 1.0) 21 | self.U.weight.data.uniform_(0.0, 0.0) # zero 22 | 23 | def forward(self, center_words, target_words, out_words): 24 | v = self.V(center_words) # batch_size x win_size x projection_size 25 | u = self.U(target_words) # batch_size x 1 x projection_size 26 | u_actual = self.U(out_words) # batch_size x input_size x projection_size 27 | 28 | scores = u.bmm(v.transpose(1, 2)).squeeze(2) # batch_size x win_size 29 | norm_scores = u_actual.bmm(v.transpose(1, 2)).squeeze(2) # batch_size x input_size 30 | return self.nll_loss(scores, norm_scores) 31 | 32 | def nll_loss(self, scores, norm_scores): 33 | # 34 | softmax = torch.exp(scores)/torch.sum(torch.exp(norm_scores),1).unsqueeze(1) 35 | return -torch.mean(torch.log(softmax)) 36 | 37 | def pred(self, inp): 38 | return self.V(inp) 39 | 40 | -------------------------------------------------------------------------------- /word2vec/cbow/pytorch/softmax/const.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # const.python # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | S_TOKEN = '' # start token 12 | E_TOKEN = '' # end token 13 | U_TOKEN = '' # unknown token 14 | D_TOKEN = '' # dummy token 15 | 16 | WIN_SIZE = 4 # window size 17 | SKIP_WIN = 2 18 | 19 | # nnwork 20 | EMBEDDING_SIZE = 100 21 | BATCH_SIZE = 128 22 | EPOCH = 10000 23 | LR_RATE = 0.0001 24 | -------------------------------------------------------------------------------- /word2vec/cbow/pytorch/softmax/dataset.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # dataset.py # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import const 11 | import re 12 | import random 13 | import numpy as np 14 | import nltk 15 | import jieba 16 | import collections 17 | from collections import defaultdict, Counter 18 | 19 | def rm_sign(string): 20 | string = re.sub("[\.\!_,\$\(\)\"\'\]\[!!\?,。?、~@#¥……&]+", "", string) 21 | return string 22 | 23 | def load_data(corpus_dir = '../../../corpus/articles.txt'): 24 | with open(corpus_dir, 'r') as f: 25 | for line in f: 26 | line = line.strip() 27 | if len(line) == 0: 28 | continue 29 | yield jieba.lcut(rm_sign(line)) 30 | 31 | class Corpus(object): 32 | def __init__(self, data): 33 | flatten = lambda l: [item.lower() for sublist in l for item in sublist] 34 | word_count = Counter(flatten(data)).most_common() 35 | self.word2idx = {const.U_TOKEN: 0} 36 | self.n_words = 1 37 | for word, _ in word_count: 38 | self.word2idx[word] = self.n_words 39 | self.n_words += 1 40 | self.idx2word = dict(zip(self.word2idx.values(), self.word2idx.keys())) 41 | self.vocab = list(self.word2idx.keys()) 42 | 43 | # @return batch data 44 | # @generator 45 | def batch_data(self): 46 | batch_size = const.BATCH_SIZE * const.WIN_SIZE 47 | data = self.vocab 48 | data_index = 0 49 | assert 
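Cbow.nll_loss above exponentiates raw scores, which can overflow once the scores grow. An equivalent but numerically safer formulation (a sketch, not part of the repo) rewrites the same quantity with torch.logsumexp:

```python
# Equivalent, numerically safer form of Cbow.nll_loss above (sketch only;
# scores and norm_scores have the shapes produced by Cbow.forward()).
import torch

def nll_loss_stable(scores, norm_scores):
    # log( exp(scores) / sum_vocab exp(norm_scores) ) = scores - logsumexp(norm_scores)
    log_softmax = scores - torch.logsumexp(norm_scores, dim=1, keepdim=True)
    return -torch.mean(log_softmax)
```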
batch_size % const.WIN_SIZE == 0 50 | assert const.WIN_SIZE <= 2 * const.SKIP_WIN 51 | 52 | batch = np.ndarray(shape=(batch_size), dtype=np.int32) 53 | labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32) 54 | span = 2 * const.SKIP_WIN + 1 # [ const.SKIP_WIN target const.SKIP_WIN ] 55 | buffers = collections.deque(maxlen=span) 56 | 57 | for _ in range(span): 58 | buffers.append(data[data_index]) 59 | data_index = (data_index + 1) % len(data) 60 | 61 | for i in range(batch_size // const.WIN_SIZE): 62 | 63 | target = const.SKIP_WIN # target label at the center of the buffers 64 | targets_to_avoid = [const.SKIP_WIN] 65 | for j in range(const.WIN_SIZE): 66 | while target in targets_to_avoid: 67 | target = random.randint(0, span - 1) 68 | targets_to_avoid.append(target) 69 | batch[i * const.WIN_SIZE + j] = self.var_word(buffers[const.SKIP_WIN])[0] 70 | labels[i * const.WIN_SIZE + j, 0] = self.var_word(buffers[target])[0] 71 | buffers.append(data[data_index]) 72 | data_index = (data_index + 1) % len(data) 73 | 74 | label_CBOW = [] 75 | context_CBOW = [] 76 | for i in range(0,len(batch), const.WIN_SIZE): 77 | label_CBOW.append(batch[i]) 78 | context_CBOW.append([l[0] for l in labels[i:i+const.WIN_SIZE]]) 79 | return np.array(context_CBOW), np.array(label_CBOW).reshape(batch_size // const.WIN_SIZE, 1) 80 | 81 | # @input sentence [w1, w2, ... , wn] 82 | def var_sentence(self, sentence): 83 | idxs = list(map(lambda w: self.word2idx[w] if w in self.vocab else self.word2idx[const.U_TOKEN], sentence)) 84 | return idxs 85 | 86 | # @input word 87 | def var_word(self, word): 88 | idx = [self.word2idx[const.U_TOKEN]] 89 | if word in self.word2idx: 90 | idx = [self.word2idx[word]] 91 | return idx 92 | -------------------------------------------------------------------------------- /word2vec/cbow/pytorch/softmax/main.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # main.py # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | 12 | import argparse 13 | parser = argparse.ArgumentParser(description='main.py') 14 | parser.add_argument('-train', action='store_true', default=False, help='train model') 15 | parser.add_argument('-retrain', action='store_true', default=False, help='train model') 16 | parser.add_argument('-test', action='store_true', default=False, help='test model') 17 | args = parser.parse_args() 18 | 19 | import const 20 | import numpy as np 21 | import torch 22 | import torch.optim as optim 23 | import torch.nn.functional as F 24 | from torch.autograd import Variable 25 | from dataset import Corpus, load_data 26 | from cbow import Cbow 27 | from utils import Utils 28 | 29 | def test(word, corpus, k=10): 30 | vocab = corpus.vocab 31 | model,_ = Utils.load_previous_model('model') 32 | target_V = model.pred(Variable(torch.LongTensor(corpus.var_word(word)))) 33 | scores=[] 34 | for i in range(len(vocab)): 35 | if vocab[i] == word or vocab[i] == const.U_TOKEN: 36 | continue 37 | vector = model.pred(Variable(torch.LongTensor(corpus.var_word(list(vocab)[i])))) 38 | cosine_sim = F.cosine_similarity(target_V, vector).data.tolist()[0] 39 | scores.append([vocab[i],cosine_sim]) 40 | return sorted(scores, key=lambda x: x[1], reverse=True)[:k] # sort by similarity 41 | 42 | def train(corpus): 43 | if args.retrain: 44 | 
Utils.remove_models('model') 45 | 46 | losses = [] 47 | 48 | start_epoch = 0 49 | model, start_epoch = Utils.load_previous_model('model') 50 | if model == None: 51 | model = Cbow(corpus.n_words, const.EMBEDDING_SIZE) 52 | 53 | if torch.cuda.is_available(): 54 | model.cuda() 55 | 56 | optimizer = optim.Adam(model.parameters(), const.LR_RATE) 57 | 58 | for epoch in range(start_epoch, const.EPOCH): 59 | inputs, targets = corpus.batch_data() 60 | 61 | 62 | inputs = Variable(torch.from_numpy(inputs).long()) 63 | targets = Variable(torch.from_numpy(targets).long()) 64 | vocabs = Variable(torch.LongTensor(corpus.var_sentence(corpus.vocab))).expand(inputs.size(0), corpus.n_words) 65 | 66 | model.zero_grad() 67 | loss = model(inputs, targets, vocabs) 68 | loss.backward() 69 | optimizer.step() 70 | 71 | losses.append(loss.data.tolist()[0]) 72 | if epoch % 100 == 0: 73 | print("Epoch : %d, mean_loss : %.02f" % (epoch , np.mean(losses))) 74 | Utils.save_model(model, epoch, 'model') 75 | losses = [] 76 | Utils.save_model(model, epoch, 'model') 77 | 78 | data = list(load_data()) 79 | corpus = Corpus(data) 80 | if args.train or args.retrain: 81 | train(corpus) 82 | elif args.test: 83 | word = input('Input word> ') 84 | print(test(word, corpus)) 85 | -------------------------------------------------------------------------------- /word2vec/cbow/pytorch/softmax/utils.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # model utils # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | import torch 12 | import os, glob 13 | import numpy as np 14 | 15 | class Utils(object): 16 | 17 | @staticmethod 18 | def save_model(model, epoch, save_dir, max_keep=5): 19 | if not os.path.exists(save_dir): 20 | os.makedirs(save_dir) 21 | f_list = glob.glob(os.path.join(save_dir, 'model') + '_*.ckpt') 22 | if len(f_list) >= max_keep + 2: 23 | epoch_list = [int(i.split('_')[-1].split('.')[0]) for i in f_list] 24 | to_delete = [f_list[i] for i in np.argsort(epoch_list)[-max_keep:]] 25 | for f in to_delete: 26 | os.remove(f) 27 | name = 'model_{}.ckpt'.format(epoch) 28 | file_path = os.path.join(save_dir, name) 29 | #torch.save(model.state_dict(), file_path) 30 | torch.save(model, file_path) 31 | 32 | @staticmethod 33 | def load_previous_model(save_dir): 34 | if not os.path.exists(save_dir): 35 | os.makedirs(save_dir) 36 | f_list = glob.glob(os.path.join(save_dir, 'model') + '_*.ckpt') 37 | start_epoch = 1 38 | model = None 39 | if len(f_list) >= 1: 40 | epoch_list = [int(i.split('_')[-1].split('.')[0]) for i in f_list] 41 | last_checkpoint = f_list[np.argmax(epoch_list)] 42 | if os.path.exists(last_checkpoint): 43 | #print('load from {}'.format(last_checkpoint)) 44 | # CNN 不支持参数保存 45 | #model.load_state_dict(torch.load(last_checkpoint)) 46 | model = torch.load(last_checkpoint) 47 | start_epoch = np.max(epoch_list) 48 | return model, start_epoch 49 | 50 | @staticmethod 51 | def remove_models(save_dir): 52 | f_list = glob.glob(os.path.join(save_dir, 'model') + '_*.ckpt') 53 | f_list.append(os.path.join(save_dir, 'param.pkl')) 54 | f_list.append(os.path.join(save_dir, 'log.txt')) 55 | for filename in f_list: 56 | try: 57 | os.remove(filename) 58 | except: 59 | pass 60 | -------------------------------------------------------------------------------- 
/word2vec/cbow/tensorflow/negative_sampling/cbow.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # skipgram # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | import const 12 | import numpy as np 13 | import math 14 | import tensorflow as tf 15 | 16 | class Cbow(object): 17 | def __init__(self, corpus): 18 | self.corpus = corpus 19 | 20 | def test(self, word, k=10): 21 | Weight = tf.Variable(tf.random_normal([self.corpus.n_words, const.EMBEDDING_SIZE], -1.0, 1.0)) 22 | inputs = tf.placeholder(tf.int32, [None]) 23 | embed = tf.nn.embedding_lookup(Weight, inputs) 24 | 25 | # cosine 26 | test_embed = tf.placeholder(tf.float32, [None]) 27 | test_input = tf.placeholder(tf.float32, [None]) 28 | normed_embed = tf.nn.l2_normalize(test_embed, dim=0) 29 | normed_array = tf.nn.l2_normalize(test_input, dim=0) 30 | cosine_similarity = tf.reduce_sum(tf.multiply(normed_array, normed_embed)) 31 | 32 | with tf.Session() as sess: 33 | tf.global_variables_initializer().run() 34 | #restore model 35 | tf.train.Saver().restore(sess, const.MODEL_PATH) 36 | 37 | vectors = sess.run(embed, feed_dict={inputs: range(self.corpus.n_words)}) 38 | vocab = self.corpus.vocab 39 | idx = self.corpus.var_word(word) 40 | scores = [] 41 | for i in range(len(vocab)): 42 | if vocab[i] == word or vocab[i] == const.U_TOKEN: 43 | continue 44 | vec_a = vectors[i].reshape([-1]) 45 | vec_b = vectors[idx].reshape([-1]) 46 | cosine_sim = sess.run(cosine_similarity, feed_dict={test_embed: vec_a, test_input: vec_b}) 47 | scores.append([vocab[i], cosine_sim]) #calculates cosine similarity 48 | return sorted(scores, key=lambda x: x[1], reverse=True)[:k] 49 | 50 | def train(self): 51 | Weight = tf.Variable(tf.truncated_normal([self.corpus.n_words, const.EMBEDDING_SIZE], stddev=1.0/math.sqrt(const.EMBEDDING_SIZE))) 52 | bias = tf.Variable(tf.zeros([self.corpus.n_words])) 53 | 54 | inputs = tf.placeholder(tf.int32, [const.BATCH_SIZE, const.WIN_SIZE]) 55 | outputs = tf.placeholder(tf.int32, [const.BATCH_SIZE, 1]) 56 | embed = tf.nn.embedding_lookup(tf.random_normal([self.corpus.n_words, const.EMBEDDING_SIZE], -1.0, 1.0), inputs) 57 | 58 | embed_sum = tf.reduce_sum(embed, 1) 59 | loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(Weight, bias, outputs, embed_sum, 3, self.corpus.n_words)) # negative sampling 60 | optimizer = tf.train.AdamOptimizer(learning_rate=const.LR_RATE).minimize(loss) 61 | 62 | saver = tf.train.Saver() 63 | 64 | losses = [] 65 | with tf.Session() as sess: 66 | tf.global_variables_initializer().run() 67 | 68 | for epoch in range(const.EPOCH): 69 | inps, targets = self.corpus.batch_data() 70 | _, _loss = sess.run([optimizer, loss], feed_dict={inputs:inps, outputs:targets}) 71 | 72 | losses.append(_loss) 73 | if epoch % 100 == 0: 74 | print('epoch, ', epoch, 'mean loss', np.mean(losses)) 75 | losses= [] 76 | 77 | # save model 78 | saver.save(sess, const.MODEL_PATH) -------------------------------------------------------------------------------- /word2vec/cbow/tensorflow/negative_sampling/const.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # const.python # 6 | # author: sean lee # 7 | # locate: Shanxi university, 
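In the tf.nn.sampled_softmax_loss call above, the bare `3` is the number of sampled negative classes. The standalone sketch below spells the same objective out with keyword arguments (assumes TF 1.x; the sizes are toy values, not the ones in const.py).

```python
# Standalone sketch of the sampled-softmax (negative sampling) objective used
# in Cbow.train() above (assumes TF 1.x; sizes are toy values).
import math
import tensorflow as tf

n_words, emb_size, batch_size, win_size = 5000, 100, 128, 4
weights = tf.Variable(tf.truncated_normal([n_words, emb_size],
                                          stddev=1.0 / math.sqrt(emb_size)))
biases = tf.Variable(tf.zeros([n_words]))
contexts = tf.placeholder(tf.int32, [batch_size, win_size])   # context word ids
labels = tf.placeholder(tf.int64, [batch_size, 1])            # center word ids
embeddings = tf.Variable(tf.random_uniform([n_words, emb_size], -1.0, 1.0))
context_sum = tf.reduce_sum(tf.nn.embedding_lookup(embeddings, contexts), 1)

loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(
    weights=weights,      # output-side embeddings, (n_words, emb_size)
    biases=biases,
    labels=labels,        # true center words
    inputs=context_sum,   # summed context embeddings
    num_sampled=3,        # negatives drawn per example
    num_classes=n_words))
```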
Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | S_TOKEN = '' # start token 12 | E_TOKEN = '' # end token 13 | U_TOKEN = '' # unknown token 14 | D_TOKEN = '' # dummy token 15 | 16 | WIN_SIZE = 4 # window size 17 | SKIP_WIN = 2 18 | 19 | # nnwork 20 | EMBEDDING_SIZE = 100 21 | BATCH_SIZE = 128 22 | EPOCH = 10000 23 | LR_RATE = 0.001 24 | 25 | MODEL_PATH = './model/word2vec.bin' 26 | -------------------------------------------------------------------------------- /word2vec/cbow/tensorflow/negative_sampling/dataset.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # dataset.py # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import const 11 | import re 12 | import random 13 | import numpy as np 14 | import nltk 15 | import jieba 16 | import collections 17 | from collections import defaultdict, Counter 18 | 19 | def rm_sign(string): 20 | string = re.sub("[\.\!_,\$\(\)\"\'\]\[!!\?,。?、~@#¥……&]+", "", string) 21 | return string 22 | 23 | def load_data(corpus_dir = '../../../corpus/articles.txt'): 24 | with open(corpus_dir, 'r') as f: 25 | for line in f: 26 | line = line.strip() 27 | if len(line) == 0: 28 | continue 29 | yield jieba.lcut(rm_sign(line)) 30 | 31 | class Corpus(object): 32 | def __init__(self, data): 33 | flatten = lambda l: [item.lower() for sublist in l for item in sublist] 34 | word_count = Counter(flatten(data)).most_common() 35 | self.word2idx = {const.U_TOKEN: 0} 36 | self.n_words = 1 37 | for word, _ in word_count: 38 | self.word2idx[word] = self.n_words 39 | self.n_words += 1 40 | self.idx2word = dict(zip(self.word2idx.values(), self.word2idx.keys())) 41 | self.vocab = list(self.word2idx.keys()) 42 | 43 | # @return batch data 44 | # @generator 45 | def batch_data(self): 46 | batch_size = const.BATCH_SIZE * const.WIN_SIZE 47 | data = self.vocab 48 | data_index = 0 49 | assert batch_size % const.WIN_SIZE == 0 50 | assert const.WIN_SIZE <= 2 * const.SKIP_WIN 51 | batch = np.ndarray(shape=(batch_size), dtype=np.int32) 52 | labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32) 53 | span = 2 * const.SKIP_WIN + 1 # [ const.SKIP_WIN target const.SKIP_WIN ] 54 | buffers = collections.deque(maxlen=span) 55 | for _ in range(span): 56 | buffers.append(data[data_index]) 57 | data_index = (data_index + 1) % len(data) 58 | for i in range(batch_size // const.WIN_SIZE): 59 | 60 | target = const.SKIP_WIN # target label at the center of the buffers 61 | targets_to_avoid = [const.SKIP_WIN] 62 | for j in range(const.WIN_SIZE): 63 | while target in targets_to_avoid: 64 | target = random.randint(0, span - 1) 65 | targets_to_avoid.append(target) 66 | batch[i * const.WIN_SIZE + j] = self.var_word(buffers[const.SKIP_WIN])[0] 67 | labels[i * const.WIN_SIZE + j, 0] = self.var_word(buffers[target])[0] 68 | buffers.append(data[data_index]) 69 | data_index = (data_index + 1) % len(data) 70 | label_CBOW = [] 71 | context_CBOW = [] 72 | for i in range(0,len(batch), const.WIN_SIZE): 73 | label_CBOW.append(batch[i]) 74 | context_CBOW.append([l[0] for l in labels[i:i+const.WIN_SIZE]]) 75 | return np.array(context_CBOW), np.array(label_CBOW).reshape(batch_size // const.WIN_SIZE, 1) 76 | 77 | # @input sentence [w1, w2, ... 
, wn] 78 | def var_sentence(self, sentence): 79 | idxs = list(map(lambda w: self.word2idx[w] if w in self.word2idx.keys() \ 80 | else self.word2idx[const.U_TOKEN], sentence)) 81 | return idxs 82 | 83 | # @input word 84 | def var_word(self, word): 85 | idx = [self.word2idx[const.U_TOKEN]] 86 | if word in self.word2idx: 87 | idx = [self.word2idx[word]] 88 | return idx 89 | -------------------------------------------------------------------------------- /word2vec/cbow/tensorflow/negative_sampling/main.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # main.py # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import argparse 11 | parser = argparse.ArgumentParser(description='main.py') 12 | parser.add_argument('-train', action='store_true', default=False, help='train model') 13 | parser.add_argument('-test', action='store_true', default=False, help='test model') 14 | args = parser.parse_args() 15 | 16 | from dataset import Corpus, load_data 17 | from cbow import Cbow 18 | 19 | if __name__ == '__main__': 20 | 21 | data = list(load_data()) 22 | corpus = Corpus(data) 23 | cbow = Cbow(corpus) 24 | 25 | 26 | if args.train: 27 | cbow.train() 28 | elif args.test: 29 | word = input('Input word> ') 30 | print(cbow.test(word)) -------------------------------------------------------------------------------- /word2vec/cbow/tensorflow/softmax/cbow.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # skipgram # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import const 11 | import math 12 | import numpy as np 13 | import tensorflow as tf 14 | 15 | class Cbow(object): 16 | def __init__(self, corpus): 17 | self.corpus = corpus 18 | 19 | def test(self, word, k=10): 20 | Weight = tf.Variable(tf.random_normal([self.corpus.n_words, const.EMBEDDING_SIZE], -1.0, 1.0)) 21 | inputs = tf.placeholder(tf.int32, [None]) 22 | embed = tf.nn.embedding_lookup(Weight, inputs) 23 | 24 | # cosine 25 | test_embed = tf.placeholder(tf.float32, [None]) 26 | test_input = tf.placeholder(tf.float32, [None]) 27 | normed_embed = tf.nn.l2_normalize(test_embed, dim=0) 28 | normed_array = tf.nn.l2_normalize(test_input, dim=0) 29 | cosine_similarity = tf.reduce_sum(tf.multiply(normed_array, normed_embed)) 30 | 31 | with tf.Session() as sess: 32 | tf.global_variables_initializer().run() 33 | #restore model 34 | tf.train.Saver().restore(sess, const.MODEL_PATH) 35 | 36 | vectors = sess.run(embed, feed_dict={inputs: range(self.corpus.n_words)}) 37 | vocab = self.corpus.vocab 38 | idx = self.corpus.var_word(word) 39 | scores = [] 40 | for i in range(len(vocab)): 41 | if vocab[i] == word or vocab[i] == const.U_TOKEN: 42 | continue 43 | vec_a = vectors[i].reshape([-1]) 44 | vec_b = vectors[idx].reshape([-1]) 45 | cosine_sim = sess.run(cosine_similarity, feed_dict={test_embed: vec_a, test_input: vec_b}) 46 | scores.append([vocab[i], cosine_sim]) #cosine similarity 47 | return sorted(scores, key=lambda x: x[1], reverse=True)[:k] 48 | 49 | def train(self): 50 | Weight = tf.Variable(tf.truncated_normal([self.corpus.n_words, const.EMBEDDING_SIZE], stddev=1.0/math.sqrt(const.EMBEDDING_SIZE))) 51 | bias = 
tf.Variable(tf.random_normal([self.corpus.n_words])) 52 | 53 | inputs = tf.placeholder(tf.int32, [const.BATCH_SIZE, const.WIN_SIZE]) 54 | targets = tf.placeholder(tf.int32, [const.BATCH_SIZE, 1]) 55 | vocabs = tf.placeholder(tf.int32, [const.BATCH_SIZE, self.corpus.n_words]) 56 | 57 | embed_weight_v = tf.Variable(tf.random_normal([self.corpus.n_words, const.EMBEDDING_SIZE], -1.0, 1.0)) 58 | embed_weight_u = tf.Variable(tf.random_normal([self.corpus.n_words, const.EMBEDDING_SIZE], -1.0, 1.0)) 59 | embed_weight_actual = tf.Variable(tf.random_normal([self.corpus.n_words, const.EMBEDDING_SIZE], -1.0, 1.0)) 60 | embed_v = tf.nn.embedding_lookup(embed_weight_v, inputs) 61 | embed_u = tf.nn.embedding_lookup(embed_weight_u, targets) 62 | embed_actual = tf.nn.embedding_lookup(embed_weight_actual, vocabs) 63 | 64 | ''' 65 | print(embed_u.shape) 66 | print(embed_v.shape) 67 | print(embed_actual.shape) 68 | exit() 69 | ''' 70 | embed_v_trans = tf.transpose(embed_v, [0, 2, 1]) 71 | 72 | #print(embed_v_trans.shape) 73 | scores = tf.matmul(embed_u, embed_v_trans) 74 | norm_scores = tf.matmul(embed_actual, embed_v_trans) 75 | 76 | softmax = tf.exp(scores) / tf.reduce_sum(tf.exp(norm_scores), 1) 77 | softmax = tf.expand_dims(softmax, 1) 78 | nll_loss = -tf.reduce_mean(tf.log(tf.clip_by_value(softmax,1e-10,1.0))) 79 | 80 | optimizer = tf.train.AdamOptimizer(learning_rate=const.LR_RATE).minimize(nll_loss) 81 | 82 | saver = tf.train.Saver() 83 | 84 | losses = [] 85 | with tf.Session() as sess: 86 | tf.global_variables_initializer().run() 87 | 88 | for epoch in range(const.EPOCH): 89 | _inputs, _targets = self.corpus.batch_data() 90 | 91 | #print(_inputs.shape, _targets.shape) 92 | #continue 93 | #_inputs = np.hstack(_inputs) # (2, ) 94 | #_inputs = _inputs.reshape(_inputs.shape[0], 1) 95 | _targets = np.vstack(_targets) # (2, 1) 96 | 97 | vocab = self.corpus.var_sentence(self.corpus.vocab) 98 | _vocabs = [] 99 | [_vocabs.append(vocab) for x in range(inputs.shape[0])] 100 | _vocabs = np.array(_vocabs) 101 | 102 | _, _loss = sess.run([optimizer, nll_loss], feed_dict={inputs:_inputs, targets:_targets, vocabs: _vocabs}) 103 | losses.append(_loss) 104 | 105 | if epoch % 10 == 0: 106 | print('epoch, ', epoch, 'mean loss', np.mean(losses)) 107 | losses= [] 108 | 109 | # save model 110 | saver.save(sess, const.MODEL_PATH) -------------------------------------------------------------------------------- /word2vec/cbow/tensorflow/softmax/const.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # const.python # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | S_TOKEN = '' # start token 12 | E_TOKEN = '' # end token 13 | U_TOKEN = '' # unknown token 14 | D_TOKEN = '' # dummy token 15 | 16 | WIN_SIZE = 4 # window size 17 | SKIP_WIN = 2 18 | 19 | # nnwork 20 | EMBEDDING_SIZE = 100 21 | BATCH_SIZE = 128 22 | EPOCH = 10000 23 | LR_RATE = 0.0001 24 | 25 | MODEL_PATH = './model/word2vec.bin' 26 | -------------------------------------------------------------------------------- /word2vec/cbow/tensorflow/softmax/dataset.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # dataset.py # 6 | # author: sean lee # 7 | # email: 
lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import const 11 | import re 12 | import random 13 | import numpy as np 14 | import nltk 15 | import jieba 16 | import collections 17 | from collections import defaultdict, Counter 18 | 19 | def rm_sign(string): 20 | string = re.sub("[\.\!_,\$\(\)\"\'\]\[!!\?,。?、~@#¥……&]+", "", string) 21 | return string 22 | 23 | def load_data(corpus_dir = '../../../corpus/articles.txt'): 24 | with open(corpus_dir, 'r') as f: 25 | for line in f: 26 | line = line.strip() 27 | if len(line) == 0: 28 | continue 29 | yield jieba.lcut(rm_sign(line)) 30 | 31 | class Corpus(object): 32 | def __init__(self, data): 33 | flatten = lambda l: [item.lower() for sublist in l for item in sublist] 34 | word_count = Counter(flatten(data)).most_common() 35 | self.word2idx = {const.U_TOKEN: 0} 36 | self.n_words = 1 37 | for word, _ in word_count: 38 | self.word2idx[word] = self.n_words 39 | self.n_words += 1 40 | self.idx2word = dict(zip(self.word2idx.values(), self.word2idx.keys())) 41 | self.vocab = list(self.word2idx.keys()) 42 | 43 | # @return batch data 44 | # @generator 45 | def batch_data(self): 46 | batch_size = const.BATCH_SIZE * const.WIN_SIZE 47 | data = self.vocab 48 | data_index = 0 49 | assert batch_size % const.WIN_SIZE == 0 50 | assert const.WIN_SIZE <= 2 * const.SKIP_WIN 51 | batch = np.ndarray(shape=(batch_size), dtype=np.int32) 52 | labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32) 53 | span = 2 * const.SKIP_WIN + 1 # [ const.SKIP_WIN target const.SKIP_WIN ] 54 | buffers = collections.deque(maxlen=span) 55 | for _ in range(span): 56 | buffers.append(data[data_index]) 57 | data_index = (data_index + 1) % len(data) 58 | for i in range(batch_size // const.WIN_SIZE): 59 | 60 | target = const.SKIP_WIN # target label at the center of the buffers 61 | targets_to_avoid = [const.SKIP_WIN] 62 | for j in range(const.WIN_SIZE): 63 | while target in targets_to_avoid: 64 | target = random.randint(0, span - 1) 65 | targets_to_avoid.append(target) 66 | batch[i * const.WIN_SIZE + j] = self.var_word(buffers[const.SKIP_WIN])[0] 67 | labels[i * const.WIN_SIZE + j, 0] = self.var_word(buffers[target])[0] 68 | buffers.append(data[data_index]) 69 | data_index = (data_index + 1) % len(data) 70 | label_CBOW = [] 71 | context_CBOW = [] 72 | for i in range(0,len(batch), const.WIN_SIZE): 73 | label_CBOW.append(batch[i]) 74 | context_CBOW.append([l[0] for l in labels[i:i+const.WIN_SIZE]]) 75 | return np.array(context_CBOW), np.array(label_CBOW).reshape(batch_size // const.WIN_SIZE, 1) 76 | 77 | # @input sentence [w1, w2, ... 
, wn] 78 | def var_sentence(self, sentence): 79 | idxs = list(map(lambda w: self.word2idx[w] if w in self.word2idx.keys() \ 80 | else self.word2idx[const.U_TOKEN], sentence)) 81 | return idxs 82 | 83 | # @input word 84 | def var_word(self, word): 85 | idx = [self.word2idx[const.U_TOKEN]] 86 | if word in self.word2idx: 87 | idx = [self.word2idx[word]] 88 | return idx 89 | -------------------------------------------------------------------------------- /word2vec/cbow/tensorflow/softmax/main.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # main.py # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | import argparse 12 | parser = argparse.ArgumentParser(description='main.py') 13 | parser.add_argument('-train', action='store_true', default=False, help='train model') 14 | parser.add_argument('-test', action='store_true', default=False, help='test model') 15 | args = parser.parse_args() 16 | 17 | from dataset import Corpus, load_data 18 | from cbow import Cbow 19 | 20 | if __name__ == '__main__': 21 | 22 | data = list(load_data()) 23 | corpus = Corpus(data) 24 | cbow = Cbow(corpus) 25 | 26 | if args.train: 27 | cbow.train() 28 | elif args.test: 29 | word = input('Input word> ') 30 | print(cbow.test(word)) -------------------------------------------------------------------------------- /word2vec/corpus/result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SeanLee97/nlp_learning/82f158f63c7b943dabc0fb18ed7ebde5c655214a/word2vec/corpus/result.png -------------------------------------------------------------------------------- /word2vec/corpus/trans_code.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | 4 | ### 5 | # Linux下GB*转UTF-8 6 | ### 7 | fin = open('articles.txt', 'r') 8 | fou = open('articles_uft8.txt', 'w') 9 | line = fin.readline() 10 | while line: 11 | newline = line.decode('GB18030').encode('utf-8') #用GBK、GB2312都会出错 12 | print newline, 13 | print >> fou, newline, 14 | line = fin.readline() 15 | fin.close() 16 | fou.close() 17 | -------------------------------------------------------------------------------- /word2vec/skipgram/pytorch/negative_sampling/const.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # const.python # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | S_TOKEN = '' # start token 12 | E_TOKEN = '' # end token 13 | U_TOKEN = '' # unknown token 14 | D_TOKEN = '' # dummy token 15 | 16 | WIN_SIZE = 3 # window size 17 | 18 | # nnwork 19 | EMBEDDING_SIZE = 30 20 | BATCH_SIZE = 128 21 | EPOCH = 1000 22 | LR_RATE = 0.001 23 | NEG = 10 # Num of Negative Sampling 24 | -------------------------------------------------------------------------------- /word2vec/skipgram/pytorch/negative_sampling/dataset.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # 
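trans_code.py above is Python 2 only (print statements, implicit str.decode). A minimal Python 3 sketch of the same GB18030-to-UTF-8 conversion, keeping the filenames it uses, would be:

```python
# Python 3 sketch of the conversion done by trans_code.py above
# (same input/output filenames; reads GB18030, writes UTF-8).
with open('articles.txt', 'r', encoding='GB18030') as fin, \
     open('articles_uft8.txt', 'w', encoding='utf-8') as fout:
    for line in fin:
        fout.write(line)
```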
-------------------------------------------# 5 | # dataset.py # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import const 11 | import re 12 | import random 13 | import nltk 14 | import jieba 15 | import torch 16 | from torch.autograd import Variable 17 | from collections import defaultdict, Counter 18 | 19 | 20 | if torch.cuda.is_available(): 21 | FloatTensor = torch.cuda.FloatTensor 22 | LongTensor = torch.cuda.LongTensor 23 | ByteTensor = torch.cuda.ByteTensor 24 | 25 | def rm_sign(string): 26 | string = re.sub("[\.\!_,\$\(\)\"\'\]\[!!\?,。?、~@#¥……&]+", "", string) 27 | return string 28 | 29 | def load_data(corpus_dir = '../../../corpus/articles.txt'): 30 | with open(corpus_dir, 'r') as f: 31 | for line in f: 32 | line = line.strip() 33 | if len(line) == 0: 34 | continue 35 | yield jieba.lcut(rm_sign(line)) 36 | 37 | class Corpus(object): 38 | def __init__(self, data): 39 | self.vocab, self.neg_vocab, self.unigram_table = self.get_vocab(data) 40 | self.windows = [] 41 | self.vocab.append(const.U_TOKEN) 42 | self.word2idx = {} 43 | self.idx2word = {} 44 | self.n_words = 0 45 | 46 | for word in self.vocab: 47 | if word not in self.word2idx: 48 | self.word2idx[word] = self.n_words 49 | self.idx2word[self.n_words] = word 50 | self.n_words += 1 51 | 52 | for sentence in data: 53 | # n-gram 54 | self.windows.extend(\ 55 | list(\ 56 | nltk.ngrams([const.D_TOKEN]*const.WIN_SIZE+sentence+[const.D_TOKEN]*const.WIN_SIZE, const.WIN_SIZE*2+1)\ 57 | )\ 58 | ) 59 | 60 | dataset = [] 61 | for window in self.windows: 62 | for i in range(const.WIN_SIZE*2+1): 63 | if window[i] in self.neg_vocab or window[const.WIN_SIZE] in self.neg_vocab: 64 | continue 65 | if i == const.WIN_SIZE or window[i] == const.D_TOKEN: 66 | continue 67 | dataset.append((window[const.WIN_SIZE], window[i])) 68 | X_p, y_p = [], [] 69 | for d in dataset: 70 | X_p.append(self.var_word(d[0]).view(1,-1)) 71 | y_p.append(self.var_word(d[1]).view(1,-1)) 72 | self.dataset = list(zip(X_p, y_p)) 73 | 74 | def get_vocab(self, data, min_count=3, Z=0.01): 75 | # [[]] -> [] 76 | flatten = lambda l: [item.lower() for sublist in l for item in sublist] 77 | word_count = Counter(flatten(data)) 78 | neg_vocab = [w for w, c in word_count.items() if c < min_count] 79 | vocab = list(set(flatten(data))-set(neg_vocab)) 80 | vocab_total_words = sum([c for w, c in word_count.items() if w not in neg_vocab]) 81 | unigram_table = [] 82 | for v in vocab: 83 | unigram_table.extend([v]*int(((word_count[v]/vocab_total_words)**(3/4))/Z)) 84 | return vocab, neg_vocab, unigram_table 85 | 86 | def negative_sampling(self, targets): 87 | batch_size = targets.size(0) 88 | neg_samples = [] 89 | for i in range(batch_size): 90 | sample = [] 91 | target_idx = targets[i].data.tolist()[0] 92 | while len(sample) < const.NEG: 93 | neg = random.choice(self.unigram_table) # draw a candidate from the unigram^(3/4) table 94 | if self.word2idx[neg] != target_idx: 95 | sample.append(neg) 96 | neg_samples.append(self.var_sentence(sample).view(1, -1)) 97 | return torch.cat(neg_samples) 98 | 99 | # @return batch data 100 | # @generator 101 | def batch_data(self, batch_size): 102 | random.shuffle(self.dataset) 103 | sidx = 0 # start index 104 | eidx = batch_size # end index 105 | while eidx < len(self.dataset): 106 | batch = self.dataset[sidx:eidx] 107 | sidx = eidx 108 | eidx += batch_size 109 | yield batch 110 | 111 | if eidx >= len(self.dataset): 112 | batch = self.dataset[sidx: ] 113 | yield batch 114 | 115 | # @input sentence [w1, w2, ... 
, wn] 116 | # @return Variable 117 | def var_sentence(self, sentence): 118 | idxs = list(map(lambda w: self.word2idx[w] if w in self.word2idx.keys() \ 119 | else self.word2idx[const.U_TOKEN], sentence)) 120 | return Variable(torch.LongTensor(idxs)) 121 | 122 | # @input word 123 | # @return Variable 124 | def var_word(self, word): 125 | return Variable(torch.LongTensor([self.word2idx[word]]) if word in self.word2idx.keys() \ 126 | else torch.LongTensor([self.word2idx[const.U_TOKEN]])) 127 | -------------------------------------------------------------------------------- /word2vec/skipgram/pytorch/negative_sampling/main.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # main.py # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | 12 | import argparse 13 | parser = argparse.ArgumentParser(description='main.py') 14 | parser.add_argument('-train', action='store_true', default=False, help='train model') 15 | parser.add_argument('-retrain', action='store_true', default=False, help='train model') 16 | parser.add_argument('-test', action='store_true', default=False, help='test model') 17 | args = parser.parse_args() 18 | 19 | import const 20 | import numpy as np 21 | import torch 22 | import torch.optim as optim 23 | import torch.nn.functional as F 24 | from dataset import Corpus, load_data 25 | from skipgram import Skipgram 26 | from utils import Utils 27 | 28 | def test(word, corpus, k=10): 29 | vocab = corpus.vocab 30 | model,_ = Utils.load_previous_model('model') 31 | target_V = model.pred(corpus.var_word(word)) 32 | scores=[] 33 | for i in range(len(vocab)): 34 | if vocab[i] == word or vocab[i] == const.U_TOKEN: 35 | continue 36 | vector = model.pred(corpus.var_word(list(vocab)[i])) 37 | cosine_sim = F.cosine_similarity(target_V, vector).data.tolist()[0] 38 | scores.append([vocab[i],cosine_sim]) 39 | return sorted(scores, key=lambda x: x[1], reverse=True)[:k] # sort by similarity 40 | 41 | def train(corpus): 42 | if args.retrain: 43 | Utils.remove_models('model') 44 | 45 | losses = [] 46 | 47 | start_epoch = 0 48 | model, start_epoch = Utils.load_previous_model('model') 49 | if model == None: 50 | model = Skipgram(corpus.n_words, const.EMBEDDING_SIZE) 51 | 52 | if torch.cuda.is_available(): 53 | model.cuda() 54 | optimizer = optim.Adam(model.parameters(), const.LR_RATE) 55 | 56 | for epoch in range(start_epoch, const.EPOCH): 57 | for i, batch in enumerate(corpus.batch_data(const.BATCH_SIZE)): 58 | inputs, targets = zip(*batch) # unzip 59 | inputs = torch.cat(inputs) 60 | targets = torch.cat(targets) 61 | negs = corpus.negative_sampling(targets) 62 | #print(inputs.size(), targets.size(), vocabs.size()) 63 | #exit() 64 | model.zero_grad() 65 | loss = model(inputs, targets, negs) 66 | loss.backward() 67 | optimizer.step() 68 | 69 | losses.append(loss.data.tolist()[0]) 70 | if epoch % 10 == 0: 71 | print("Epoch : %d, mean_loss : %.02f" % (epoch , np.mean(losses))) 72 | Utils.save_model(model, epoch, 'model') 73 | losses = [] 74 | Utils.save_model(model, epoch, 'model') 75 | 76 | data = list(load_data()) 77 | corpus = Corpus(data) 78 | if args.train or args.retrain: 79 | train(corpus) 80 | elif args.test: 81 | word = input('Input word> ') 82 | print(test(word, corpus)) 
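Note: the driver above and the skipgram.py/dataset.py files below implement skip-gram with negative sampling (SGNS). For reference, here is a minimal, self-contained PyTorch sketch of that objective; the vocabulary size, embedding dimension, batch size, and the randomly drawn word indices are toy values invented purely for illustration and are not taken from this repository.

import torch
import torch.nn.functional as F

torch.manual_seed(0)

VOCAB, DIM, K, B = 50, 8, 5, 4        # toy vocabulary size, embedding dim, negatives per pair, batch size
V = torch.nn.Embedding(VOCAB, DIM)    # center-word ("input") embeddings
U = torch.nn.Embedding(VOCAB, DIM)    # context-word ("output") embeddings

def sgns_loss(center, context, negatives):
    # center: [B], context: [B], negatives: [B, K] -- integer word indices
    v = V(center)                                             # [B, D]
    u_pos = U(context)                                        # [B, D]
    u_neg = U(negatives)                                      # [B, K, D]
    pos_score = (v * u_pos).sum(-1)                           # [B]
    neg_score = torch.bmm(u_neg, v.unsqueeze(2)).squeeze(2)   # [B, K]
    # SGNS: maximize log sigmoid(u_o . v_c) + sum_k log sigmoid(-u_k . v_c), so minimize the negation
    return -(F.logsigmoid(pos_score) + F.logsigmoid(-neg_score).sum(-1)).mean()

center = torch.randint(0, VOCAB, (B,))
context = torch.randint(0, VOCAB, (B,))
negatives = torch.randint(0, VOCAB, (B, K))
loss = sgns_loss(center, context, negatives)
loss.backward()                                               # gradients flow into both embedding tables
print(loss.item())

The point of the sampled term is that each update touches only the observed context word plus NEG sampled words, so the cost per training pair is proportional to NEG rather than to the vocabulary size that the full-softmax variants in the sibling directories pay.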
-------------------------------------------------------------------------------- /word2vec/skipgram/pytorch/negative_sampling/skipgram.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # skipgram # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | class Skipgram(nn.Module): 16 | def __init__(self, input_size, projection_size): 17 | super(Skipgram, self).__init__() 18 | self.V = nn.Embedding(input_size, projection_size) 19 | self.U = nn.Embedding(input_size, projection_size) 20 | self.logsigmoid = nn.LogSigmoid() 21 | 22 | initrange = (2.0 / (input_size + projection_size))**5 23 | self.V.weight.data.uniform_(-initrange, initrange) 24 | self.U.weight.data.uniform_(-0.0, 0.0) # zero 25 | 26 | def forward(self, center_words, target_words, neg_words): 27 | v = self.V(center_words) # batch_size x 1 x projection_size 28 | u = self.U(target_words) # batch_size x 1 x projection_size 29 | u_neg = -self.U(neg_words) 30 | 31 | pos_score = u.bmm(v.transpose(1, 2)).squeeze(2) # batch_size x 1 32 | neg_score = torch.sum(u_neg.bmm(v.transpose(1, 2)).squeeze(2), 1).view(neg_words.size(0), -1) # batch_size x input_size 33 | 34 | return self.loss(pos_score, neg_score) 35 | 36 | def loss(self, pos_score, neg_score): 37 | loss = self.logsigmoid(pos_score) + self.logsigmoid(neg_score) 38 | return -torch.mean(loss) 39 | 40 | def pred(self, inp): 41 | return self.V(inp) 42 | 43 | -------------------------------------------------------------------------------- /word2vec/skipgram/pytorch/negative_sampling/utils.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # model utils # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | import torch 12 | import os, glob 13 | import numpy as np 14 | 15 | class Utils(object): 16 | 17 | @staticmethod 18 | def save_model(model, epoch, save_dir, max_keep=5): 19 | if not os.path.exists(save_dir): 20 | os.makedirs(save_dir) 21 | f_list = glob.glob(os.path.join(save_dir, 'model') + '_*.ckpt') 22 | if len(f_list) >= max_keep + 2: 23 | epoch_list = [int(i.split('_')[-1].split('.')[0]) for i in f_list] 24 | to_delete = [f_list[i] for i in np.argsort(epoch_list)[-max_keep:]] 25 | for f in to_delete: 26 | os.remove(f) 27 | name = 'model_{}.ckpt'.format(epoch) 28 | file_path = os.path.join(save_dir, name) 29 | #torch.save(model.state_dict(), file_path) 30 | torch.save(model, file_path) 31 | 32 | @staticmethod 33 | def load_previous_model(save_dir): 34 | if not os.path.exists(save_dir): 35 | os.makedirs(save_dir) 36 | f_list = glob.glob(os.path.join(save_dir, 'model') + '_*.ckpt') 37 | start_epoch = 1 38 | model = None 39 | if len(f_list) >= 1: 40 | epoch_list = [int(i.split('_')[-1].split('.')[0]) for i in f_list] 41 | last_checkpoint = f_list[np.argmax(epoch_list)] 42 | if os.path.exists(last_checkpoint): 43 | #print('load from {}'.format(last_checkpoint)) 44 | # CNN 不支持参数保存 45 | #model.load_state_dict(torch.load(last_checkpoint)) 46 | model = torch.load(last_checkpoint) 47 | 
start_epoch = np.max(epoch_list) 48 | return model, start_epoch 49 | 50 | @staticmethod 51 | def remove_models(save_dir): 52 | f_list = glob.glob(os.path.join(save_dir, 'model') + '_*.ckpt') 53 | f_list.append(os.path.join(save_dir, 'param.pkl')) 54 | f_list.append(os.path.join(save_dir, 'log.txt')) 55 | for filename in f_list: 56 | try: 57 | os.remove(filename) 58 | except: 59 | pass 60 | -------------------------------------------------------------------------------- /word2vec/skipgram/pytorch/softmax/const.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # const.python # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | S_TOKEN = '' # start token 12 | E_TOKEN = '' # end token 13 | U_TOKEN = '' # unknown token 14 | D_TOKEN = '' # dummy token 15 | 16 | WIN_SIZE = 5 # window size 17 | 18 | # nnwork 19 | EMBEDDING_SIZE = 30 20 | BATCH_SIZE = 256 21 | EPOCH = 1000 22 | LR_RATE = 0.001 23 | -------------------------------------------------------------------------------- /word2vec/skipgram/pytorch/softmax/dataset.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # dataset.py # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import const 11 | import re 12 | import random 13 | import nltk 14 | import jieba 15 | import torch 16 | from torch.autograd import Variable 17 | from collections import defaultdict, Counter 18 | 19 | 20 | if torch.cuda.is_available(): 21 | FloatTensor = torch.cuda.FloatTensor 22 | LongTensor = torch.cuda.LongTensor 23 | ByteTensor = torch.cuda.ByteTensor 24 | 25 | def rm_sign(string): 26 | string = re.sub("[\.\!_,\$\(\)\"\'\]\[!!\?,。?、~@#¥……&]+", "", string) 27 | return string 28 | 29 | def load_data(corpus_dir = '../../../corpus/articles.txt'): 30 | with open(corpus_dir, 'r') as f: 31 | for line in f: 32 | line = line.strip() 33 | if len(line) == 0: 34 | continue 35 | yield jieba.lcut(rm_sign(line)) 36 | 37 | class Corpus(object): 38 | def __init__(self, data): 39 | self.vocab = self.get_vocab(data) 40 | self.windows = [] 41 | self.vocab.append(const.U_TOKEN) 42 | self.word2idx = {} 43 | self.idx2word = {} 44 | self.n_words = 0 45 | 46 | for word in self.vocab: 47 | if word not in self.word2idx: 48 | self.word2idx[word] = self.n_words 49 | self.idx2word[self.n_words] = word 50 | self.n_words += 1 51 | 52 | for sentence in data: 53 | # n-gram 54 | self.windows.extend(\ 55 | list(\ 56 | nltk.ngrams([const.D_TOKEN]*const.WIN_SIZE+sentence+[const.D_TOKEN]*const.WIN_SIZE, const.WIN_SIZE*2+1)\ 57 | )\ 58 | ) 59 | 60 | dataset = [] 61 | for window in self.windows: 62 | for i in range(const.WIN_SIZE*2+1): 63 | if i == const.WIN_SIZE or window[i] == const.D_TOKEN: 64 | continue 65 | dataset.append((window[const.WIN_SIZE], window[i])) 66 | X_p, y_p = [], [] 67 | for d in dataset: 68 | X_p.append(self.var_word(d[0]).view(1,-1)) 69 | y_p.append(self.var_word(d[1]).view(1,-1)) 70 | self.dataset = list(zip(X_p, y_p)) 71 | 72 | def get_vocab(self, data): 73 | # [[]] -> [] 74 | flatten = lambda l: [item.lower() for sublist in l for item in sublist] 75 | word_count = Counter(flatten(data)) 76 | border = 
int(len(word_count)*0.01) 77 | stopwords = word_count.most_common()[:border]+list(reversed(word_count.most_common()))[:border] 78 | stopwords = [s[0] for s in stopwords] 79 | vocab = list(set(flatten(data))-set(stopwords)) 80 | return vocab 81 | 82 | # @return batch data 83 | # @generator 84 | def batch_data(self, batch_size): 85 | random.shuffle(self.dataset) 86 | sidx = 0 # start index 87 | eidx = batch_size # end index 88 | while eidx < len(self.dataset): 89 | batch = self.dataset[sidx:eidx] 90 | sidx = eidx 91 | eidx += batch_size 92 | yield batch 93 | 94 | if eidx >= len(self.dataset): 95 | batch = self.dataset[sidx: ] 96 | yield batch 97 | 98 | # @input sentence [w1, w2, ... , wn] 99 | # @return Variable 100 | def var_sentence(self, sentence): 101 | idxs = list(map(lambda w: self.word2idx[w] if w in self.word2idx.keys() \ 102 | else self.word2idx[const.U_TOKEN], sentence)) 103 | return Variable(torch.LongTensor(idxs)) 104 | 105 | # @input word 106 | # @return Variable 107 | def var_word(self, word): 108 | return Variable(torch.LongTensor([self.word2idx[word]]) if word in self.word2idx.keys() \ 109 | else torch.LongTensor([self.word2idx[const.U_TOKEN]])) 110 | -------------------------------------------------------------------------------- /word2vec/skipgram/pytorch/softmax/main.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # main.py # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | 12 | import argparse 13 | parser = argparse.ArgumentParser(description='main.py') 14 | parser.add_argument('-train', action='store_true', default=False, help='train model') 15 | parser.add_argument('-retrain', action='store_true', default=False, help='train model') 16 | parser.add_argument('-test', action='store_true', default=False, help='test model') 17 | args = parser.parse_args() 18 | 19 | import const 20 | import numpy as np 21 | import torch 22 | import torch.optim as optim 23 | import torch.nn.functional as F 24 | from dataset import Corpus, load_data 25 | from skipgram import Skipgram 26 | from utils import Utils 27 | 28 | def test(word, corpus, k=10): 29 | vocab = corpus.vocab 30 | model,_ = Utils.load_previous_model('model') 31 | target_V = model.pred(corpus.var_word(word)) 32 | scores=[] 33 | for i in range(len(vocab)): 34 | if vocab[i] == word or vocab[i] == const.U_TOKEN: 35 | continue 36 | vector = model.pred(corpus.var_word(list(vocab)[i])) 37 | cosine_sim = F.cosine_similarity(target_V, vector).data.tolist()[0] 38 | scores.append([vocab[i],cosine_sim]) 39 | return sorted(scores, key=lambda x: x[1], reverse=True)[:k] # sort by similarity 40 | 41 | def train(corpus): 42 | if args.retrain: 43 | Utils.remove_models('model') 44 | 45 | losses = [] 46 | 47 | start_epoch = 0 48 | model, start_epoch = Utils.load_previous_model('model') 49 | if model == None: 50 | model = Skipgram(corpus.n_words, const.EMBEDDING_SIZE) 51 | 52 | if torch.cuda.is_available(): 53 | model.cuda() 54 | optimizer = optim.Adam(model.parameters(), const.LR_RATE) 55 | 56 | for epoch in range(start_epoch, const.EPOCH): 57 | for i, batch in enumerate(corpus.batch_data(const.BATCH_SIZE)): 58 | inputs, targets = zip(*batch) # unzip 59 | inputs = torch.cat(inputs) 60 | targets = torch.cat(targets) 61 | vocabs = 
corpus.var_sentence(corpus.vocab).expand(inputs.size(0), corpus.n_words) 62 | #print(inputs.size(), targets.size(), vocabs.size()) 63 | #exit() 64 | 65 | model.zero_grad() 66 | loss = model(inputs, targets, vocabs) 67 | loss.backward() 68 | optimizer.step() 69 | 70 | losses.append(loss.data.tolist()[0]) 71 | if epoch % 10 == 0: 72 | print("Epoch : %d, mean_loss : %.02f" % (epoch , np.mean(losses))) 73 | Utils.save_model(model, epoch, 'model') 74 | losses = [] 75 | Utils.save_model(model, epoch, 'model') 76 | 77 | data = list(load_data()) 78 | corpus = Corpus(data) 79 | if args.train or args.retrain: 80 | train(corpus) 81 | elif args.test: 82 | word = input('Input word> ') 83 | print(test(word, corpus)) -------------------------------------------------------------------------------- /word2vec/skipgram/pytorch/softmax/skipgram.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # skipgram # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | class Skipgram(nn.Module): 16 | def __init__(self, input_size, projection_size): 17 | super(Skipgram, self).__init__() 18 | self.V = nn.Embedding(input_size, projection_size) 19 | self.U = nn.Embedding(input_size, projection_size) 20 | 21 | self.V.weight.data.uniform_(-1.0, 1.0) 22 | self.U.weight.data.uniform_(0.0, 0.0) # zero 23 | 24 | def forward(self, center_words, target_words, out_words): 25 | v = self.V(center_words) # batch_size x 1 x projection_size 26 | u = self.U(target_words) # batch_size x 1 x projection_size 27 | u_actual = self.U(out_words) # batch_size x input_size x projection_size 28 | 29 | scores = u.bmm(v.transpose(1, 2)).squeeze(2) # batch_size x 1 30 | norm_scores = u_actual.bmm(v.transpose(1, 2)).squeeze(2) # batch_size x input_size 31 | return self.nll_loss(scores, norm_scores) 32 | 33 | def nll_loss(self, scores, norm_scores): 34 | # full softmax: p(o|c) = exp(u_o . v_c) / sum_w exp(u_w . v_c) 35 | softmax = torch.exp(scores)/torch.sum(torch.exp(norm_scores),1).unsqueeze(1) 36 | return -torch.mean(torch.log(softmax)) 37 | 38 | def pred(self, inp): 39 | return self.V(inp) 40 | 41 | -------------------------------------------------------------------------------- /word2vec/skipgram/pytorch/softmax/utils.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # model utils # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | import torch 12 | import os, glob 13 | import numpy as np 14 | 15 | class Utils(object): 16 | 17 | @staticmethod 18 | def save_model(model, epoch, save_dir, max_keep=5): 19 | if not os.path.exists(save_dir): 20 | os.makedirs(save_dir) 21 | f_list = glob.glob(os.path.join(save_dir, 'model') + '_*.ckpt') 22 | if len(f_list) >= max_keep + 2: 23 | epoch_list = [int(i.split('_')[-1].split('.')[0]) for i in f_list] 24 | to_delete = [f_list[i] for i in np.argsort(epoch_list)[-max_keep:]] 25 | for f in to_delete: 26 | os.remove(f) 27 | name = 'model_{}.ckpt'.format(epoch) 28 | file_path = os.path.join(save_dir, name) 29 | #torch.save(model.state_dict(), file_path) 30 | 
torch.save(model, file_path) 31 | 32 | @staticmethod 33 | def load_previous_model(save_dir): 34 | if not os.path.exists(save_dir): 35 | os.makedirs(save_dir) 36 | f_list = glob.glob(os.path.join(save_dir, 'model') + '_*.ckpt') 37 | start_epoch = 1 38 | model = None 39 | if len(f_list) >= 1: 40 | epoch_list = [int(i.split('_')[-1].split('.')[0]) for i in f_list] 41 | last_checkpoint = f_list[np.argmax(epoch_list)] 42 | if os.path.exists(last_checkpoint): 43 | #print('load from {}'.format(last_checkpoint)) 44 | # CNN 不支持参数保存 45 | #model.load_state_dict(torch.load(last_checkpoint)) 46 | model = torch.load(last_checkpoint) 47 | start_epoch = np.max(epoch_list) 48 | return model, start_epoch 49 | 50 | @staticmethod 51 | def remove_models(save_dir): 52 | f_list = glob.glob(os.path.join(save_dir, 'model') + '_*.ckpt') 53 | f_list.append(os.path.join(save_dir, 'param.pkl')) 54 | f_list.append(os.path.join(save_dir, 'log.txt')) 55 | for filename in f_list: 56 | try: 57 | os.remove(filename) 58 | except: 59 | pass 60 | -------------------------------------------------------------------------------- /word2vec/skipgram/tensorflow/negative_sampling/const.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # const.python # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | S_TOKEN = '' # start token 12 | E_TOKEN = '' # end token 13 | U_TOKEN = '' # unknown token 14 | D_TOKEN = '' # dummy token 15 | 16 | WIN_SIZE = 5 # window size 17 | 18 | # nnwork 19 | EMBEDDING_SIZE = 100 20 | BATCH_SIZE = 128 21 | EPOCH = 100 22 | LR_RATE = 0.001 23 | 24 | MODEL_PATH = './model/word2vec.bin' 25 | -------------------------------------------------------------------------------- /word2vec/skipgram/tensorflow/negative_sampling/dataset.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # dataset.py # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import const 11 | import re 12 | import random 13 | import numpy as np 14 | import nltk 15 | import jieba 16 | from collections import defaultdict, Counter 17 | 18 | def rm_sign(string): 19 | string = re.sub("[\.\!_,\$\(\)\"\'\]\[!!\?,。?、~@#¥……&]+", "", string) 20 | return string 21 | 22 | def load_data(corpus_dir = '../../../corpus/articles.txt'): 23 | with open(corpus_dir, 'r') as f: 24 | for line in f: 25 | line = line.strip() 26 | if len(line) == 0: 27 | continue 28 | yield jieba.lcut(rm_sign(line)) 29 | 30 | class Corpus(object): 31 | def __init__(self, data): 32 | self.vocab = self.get_vocab(data) 33 | self.windows = [] 34 | self.vocab.append(const.U_TOKEN) 35 | self.word2idx = {} 36 | self.idx2word = {} 37 | self.n_words = 0 38 | 39 | for word in self.vocab: 40 | if word not in self.word2idx: 41 | self.word2idx[word] = self.n_words 42 | self.idx2word[self.n_words] = word 43 | self.n_words += 1 44 | 45 | for sentence in data: 46 | # n-gram 47 | self.windows.extend(\ 48 | list(\ 49 | nltk.ngrams([const.D_TOKEN]*const.WIN_SIZE+sentence+[const.D_TOKEN]*const.WIN_SIZE, const.WIN_SIZE*2+1)\ 50 | )\ 51 | ) 52 | 53 | dataset = [] 54 | for window in self.windows: 55 | for i in range(const.WIN_SIZE*2+1): 56 | 
if i == const.WIN_SIZE or window[i] == const.D_TOKEN: 57 | continue 58 | dataset.append((window[const.WIN_SIZE], window[i])) 59 | X_p, y_p = [], [] 60 | for d in dataset: 61 | X_p.append(self.var_word(d[0])) 62 | y_p.append(self.var_word(d[1])) 63 | self.dataset = list(zip(X_p, y_p)) 64 | 65 | def get_vocab(self, data): 66 | # [[]] -> [] 67 | flatten = lambda l: [item.lower() for sublist in l for item in sublist] 68 | word_count = Counter(flatten(data)) 69 | border = int(len(word_count)*0.01) 70 | stopwords = word_count.most_common()[:border]+list(reversed(word_count.most_common()))[:border] 71 | stopwords = [s[0] for s in stopwords] 72 | vocab = list(set(flatten(data))-set(stopwords)) 73 | return vocab 74 | 75 | # @return batch data 76 | # @generator 77 | def batch_data(self, batch_size): 78 | random.shuffle(self.dataset) 79 | sidx = 0 # start index 80 | eidx = batch_size # end index 81 | while eidx < len(self.dataset): 82 | batch = self.dataset[sidx:eidx] 83 | sidx = eidx 84 | eidx += batch_size 85 | yield batch 86 | 87 | if eidx >= len(self.dataset): 88 | batch = self.dataset[sidx: ] 89 | diff = eidx - len(self.dataset) 90 | inps, targets = zip(*batch) # unzip 91 | inps = list(inps) 92 | targets = list(targets) 93 | diff_vec = [self.word2idx[const.U_TOKEN]]*diff 94 | inps = inps + diff_vec 95 | targets = targets + diff_vec 96 | inps = tuple(inps) 97 | targets = tuple(targets) 98 | batch = zip(inps, targets) 99 | yield batch 100 | 101 | # @input sentence [w1, w2, ... , wn] 102 | def var_sentence(self, sentence): 103 | idxs = list(map(lambda w: self.word2idx[w] if w in self.word2idx.keys() \ 104 | else self.word2idx[const.U_TOKEN], sentence)) 105 | return idxs 106 | 107 | # @input word 108 | def var_word(self, word): 109 | idx = [self.word2idx[const.U_TOKEN]] 110 | if word in self.word2idx: 111 | idx = [self.word2idx[word]] 112 | return idx 113 | -------------------------------------------------------------------------------- /word2vec/skipgram/tensorflow/negative_sampling/main.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # main.py # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import argparse 11 | parser = argparse.ArgumentParser(description='main.py') 12 | parser.add_argument('-train', action='store_true', default=False, help='train model') 13 | parser.add_argument('-test', action='store_true', default=False, help='test model') 14 | args = parser.parse_args() 15 | 16 | from dataset import Corpus, load_data 17 | from skipgram import Skipgram 18 | 19 | if __name__ == '__main__': 20 | 21 | data = list(load_data()) 22 | corpus = Corpus(data) 23 | skipgram = Skipgram(corpus) 24 | 25 | if args.train: 26 | skipgram.train() 27 | elif args.test: 28 | word = input('Input word> ') 29 | print(skipgram.test(word)) -------------------------------------------------------------------------------- /word2vec/skipgram/tensorflow/negative_sampling/skipgram.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # skipgram # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | import const 12 | import math 13 | import numpy as np 14 
| import tensorflow as tf 15 | 16 | class Skipgram(object): 17 | def __init__(self, corpus): 18 | self.corpus = corpus 19 | 20 | def test(self, word, k=10): 21 | Weight = tf.Variable(tf.random_normal([self.corpus.n_words, const.EMBEDDING_SIZE], -1.0, 1.0)) 22 | inputs = tf.placeholder(tf.int32, [None]) 23 | embed = tf.nn.embedding_lookup(Weight, inputs) 24 | 25 | # cosine 26 | test_embed = tf.placeholder(tf.float32, [None]) 27 | test_input = tf.placeholder(tf.float32, [None]) 28 | normed_embed = tf.nn.l2_normalize(test_embed, dim=0) 29 | normed_array = tf.nn.l2_normalize(test_input, dim=0) 30 | cosine_similarity = tf.reduce_sum(tf.multiply(normed_array, normed_embed)) 31 | 32 | with tf.Session() as sess: 33 | tf.global_variables_initializer().run() 34 | #restore model 35 | tf.train.Saver().restore(sess, const.MODEL_PATH) 36 | 37 | vectors = sess.run(embed, feed_dict={inputs: range(self.corpus.n_words)}) 38 | vocab = self.corpus.vocab 39 | idx = self.corpus.var_word(word) 40 | scores = [] 41 | for i in range(len(vocab)): 42 | if vocab[i] == word or vocab[i] == const.U_TOKEN: 43 | continue 44 | vec_a = vectors[i].reshape([-1]) 45 | vec_b = vectors[idx].reshape([-1]) 46 | cosine_sim = sess.run(cosine_similarity, feed_dict={test_embed: vec_a, test_input: vec_b}) 47 | scores.append([vocab[i], cosine_sim]) #calculates cosine similarity 48 | return sorted(scores, key=lambda x: x[1], reverse=True)[:k] 49 | 50 | def train(self): 51 | Weight = tf.Variable(tf.truncated_normal([self.corpus.n_words, const.EMBEDDING_SIZE], stddev=1.0/math.sqrt(const.EMBEDDING_SIZE))) 52 | bias = tf.Variable(tf.zeros([self.corpus.n_words])) 53 | 54 | inputs = tf.placeholder(tf.int32, [const.BATCH_SIZE]) 55 | outputs = tf.placeholder(tf.int32, [const.BATCH_SIZE, 1]) 56 | embed = tf.nn.embedding_lookup(tf.random_normal([self.corpus.n_words, const.EMBEDDING_SIZE], -1.0, 1.0), inputs) 57 | 58 | loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(Weight, bias, outputs, embed, 3, self.corpus.n_words)) # negative sampling 59 | optimizer = tf.train.AdamOptimizer(learning_rate=const.LR_RATE).minimize(loss) 60 | 61 | saver = tf.train.Saver() 62 | 63 | losses = [] 64 | with tf.Session() as sess: 65 | tf.global_variables_initializer().run() 66 | 67 | for epoch in range(const.EPOCH): 68 | for i, batch in enumerate(self.corpus.batch_data(const.BATCH_SIZE)): 69 | inps, targets = zip(*batch) # unzip 70 | inps = np.hstack(inps) # (2, ) 71 | targets = np.vstack(targets) # (2, 1) 72 | #print(inps.shape, targets.shape) 73 | _, _loss = sess.run([optimizer, loss], feed_dict={inputs:inps, outputs:targets}) 74 | 75 | losses.append(_loss) 76 | if epoch % 10 == 0: 77 | print('epoch, ', epoch, 'mean loss', np.mean(losses)) 78 | losses= [] 79 | 80 | # save model 81 | saver.save(sess, const.MODEL_PATH) -------------------------------------------------------------------------------- /word2vec/skipgram/tensorflow/softmax/const.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # const.python # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | S_TOKEN = '' # start token 12 | E_TOKEN = '' # end token 13 | U_TOKEN = '' # unknown token 14 | D_TOKEN = '' # dummy token 15 | 16 | WIN_SIZE = 5 # window size 17 | 18 | # nnwork 19 | EMBEDDING_SIZE = 30 20 | BATCH_SIZE = 128 21 | EPOCH = 1000 22 | 
LR_RATE = 0.001 23 | 24 | MODEL_PATH = './model/word2vec.bin' 25 | -------------------------------------------------------------------------------- /word2vec/skipgram/tensorflow/softmax/dataset.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # dataset.py # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import const 11 | import re 12 | import random 13 | import numpy as np 14 | import nltk 15 | import jieba 16 | from collections import defaultdict, Counter 17 | 18 | def rm_sign(string): 19 | string = re.sub("[\.\!_,\$\(\)\"\'\]\[!!\?,。?、~@#¥……&]+", "", string) 20 | return string 21 | 22 | def load_data(corpus_dir = '../../../corpus/articles.txt'): 23 | with open(corpus_dir, 'r') as f: 24 | for line in f: 25 | line = line.strip() 26 | if len(line) == 0: 27 | continue 28 | yield jieba.lcut(rm_sign(line)) 29 | 30 | class Corpus(object): 31 | def __init__(self, data): 32 | self.vocab = self.get_vocab(data) 33 | self.windows = [] 34 | self.vocab.append(const.U_TOKEN) 35 | self.word2idx = {} 36 | self.idx2word = {} 37 | self.n_words = 0 38 | 39 | for word in self.vocab: 40 | if word not in self.word2idx: 41 | self.word2idx[word] = self.n_words 42 | self.idx2word[self.n_words] = word 43 | self.n_words += 1 44 | 45 | for sentence in data: 46 | # n-gram 47 | self.windows.extend(\ 48 | list(\ 49 | nltk.ngrams([const.D_TOKEN]*const.WIN_SIZE+sentence+[const.D_TOKEN]*const.WIN_SIZE, const.WIN_SIZE*2+1)\ 50 | )\ 51 | ) 52 | 53 | dataset = [] 54 | for window in self.windows: 55 | for i in range(const.WIN_SIZE*2+1): 56 | if i == const.WIN_SIZE or window[i] == const.D_TOKEN: 57 | continue 58 | dataset.append((window[const.WIN_SIZE], window[i])) 59 | X_p, y_p = [], [] 60 | for d in dataset: 61 | X_p.append(self.var_word(d[0])) 62 | y_p.append(self.var_word(d[1])) 63 | self.dataset = list(zip(X_p, y_p)) 64 | 65 | def get_vocab(self, data): 66 | # [[]] -> [] 67 | flatten = lambda l: [item.lower() for sublist in l for item in sublist] 68 | word_count = Counter(flatten(data)) 69 | border = int(len(word_count)*0.01) 70 | stopwords = word_count.most_common()[:border]+list(reversed(word_count.most_common()))[:border] 71 | stopwords = [s[0] for s in stopwords] 72 | vocab = list(set(flatten(data))-set(stopwords)) 73 | return vocab 74 | 75 | # @return batch data 76 | # @generator 77 | def batch_data(self, batch_size): 78 | random.shuffle(self.dataset) 79 | sidx = 0 # start index 80 | eidx = batch_size # end index 81 | while eidx < len(self.dataset): 82 | batch = self.dataset[sidx:eidx] 83 | sidx = eidx 84 | eidx += batch_size 85 | yield batch 86 | 87 | if eidx >= len(self.dataset): 88 | batch = self.dataset[sidx: ] 89 | diff = eidx - len(self.dataset) 90 | inps, targets = zip(*batch) # unzip 91 | inps = list(inps) 92 | targets = list(targets) 93 | diff_vec = [self.word2idx[const.U_TOKEN]]*diff 94 | inps = inps + diff_vec 95 | targets = targets + diff_vec 96 | inps = tuple(inps) 97 | targets = tuple(targets) 98 | batch = zip(inps, targets) 99 | yield batch 100 | 101 | # @input sentence [w1, w2, ... 
, wn] 102 | def var_sentence(self, sentence): 103 | idxs = list(map(lambda w: self.word2idx[w] if w in self.word2idx.keys() \ 104 | else self.word2idx[const.U_TOKEN], sentence)) 105 | return idxs 106 | 107 | # @input word 108 | def var_word(self, word): 109 | idx = [self.word2idx[const.U_TOKEN]] 110 | if word in self.word2idx: 111 | idx = [self.word2idx[word]] 112 | return idx 113 | -------------------------------------------------------------------------------- /word2vec/skipgram/tensorflow/softmax/main.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # main.py # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import argparse 11 | parser = argparse.ArgumentParser(description='main.py') 12 | parser.add_argument('-train', action='store_true', default=False, help='train model') 13 | parser.add_argument('-test', action='store_true', default=False, help='test model') 14 | args = parser.parse_args() 15 | 16 | from dataset import Corpus, load_data 17 | from skipgram import Skipgram 18 | 19 | if __name__ == '__main__': 20 | 21 | data = list(load_data()) 22 | corpus = Corpus(data) 23 | skipgram = Skipgram(corpus) 24 | 25 | if args.train: 26 | skipgram.train() 27 | elif args.test: 28 | word = input('Input word> ') 29 | print(skipgram.test(word)) -------------------------------------------------------------------------------- /word2vec/skipgram/tensorflow/softmax/skipgram.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # skipgram # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import const 11 | import math 12 | import numpy as np 13 | import tensorflow as tf 14 | 15 | class Skipgram(object): 16 | def __init__(self, corpus): 17 | self.corpus = corpus 18 | 19 | def test(self, word, k=10): 20 | Weight = tf.Variable(tf.random_normal([self.corpus.n_words, const.EMBEDDING_SIZE], -1.0, 1.0)) 21 | inputs = tf.placeholder(tf.int32, [None]) 22 | embed = tf.nn.embedding_lookup(Weight, inputs) 23 | 24 | # cosine 25 | test_embed = tf.placeholder(tf.float32, [None]) 26 | test_input = tf.placeholder(tf.float32, [None]) 27 | normed_embed = tf.nn.l2_normalize(test_embed, dim=0) 28 | normed_array = tf.nn.l2_normalize(test_input, dim=0) 29 | cosine_similarity = tf.reduce_sum(tf.multiply(normed_array, normed_embed)) 30 | 31 | with tf.Session() as sess: 32 | tf.global_variables_initializer().run() 33 | #restore model 34 | tf.train.Saver().restore(sess, const.MODEL_PATH) 35 | 36 | vectors = sess.run(embed, feed_dict={inputs: range(self.corpus.n_words)}) 37 | vocab = self.corpus.vocab 38 | idx = self.corpus.var_word(word) 39 | scores = [] 40 | for i in range(len(vocab)): 41 | if vocab[i] == word or vocab[i] == const.U_TOKEN: 42 | continue 43 | vec_a = vectors[i].reshape([-1]) 44 | vec_b = vectors[idx].reshape([-1]) 45 | cosine_sim = sess.run(cosine_similarity, feed_dict={test_embed: vec_a, test_input: vec_b}) 46 | scores.append([vocab[i], cosine_sim]) #cosine similarity 47 | return sorted(scores, key=lambda x: x[1], reverse=True)[:k] 48 | 49 | def train(self): 50 | Weight = tf.Variable(tf.truncated_normal([self.corpus.n_words, const.EMBEDDING_SIZE], 
stddev=1.0/math.sqrt(const.EMBEDDING_SIZE))) 51 | bias = tf.Variable(tf.random_normal([self.corpus.n_words])) 52 | 53 | inputs = tf.placeholder(tf.int32, [const.BATCH_SIZE, 1]) 54 | targets = tf.placeholder(tf.int32, [const.BATCH_SIZE, 1]) 55 | vocabs = tf.placeholder(tf.int32, [const.BATCH_SIZE, self.corpus.n_words]) 56 | 57 | embed_weight_v = tf.Variable(tf.random_normal([self.corpus.n_words, const.EMBEDDING_SIZE], -1.0, 1.0)) 58 | embed_weight_u = tf.Variable(tf.random_normal([self.corpus.n_words, const.EMBEDDING_SIZE], -1.0, 1.0)) 59 | embed_weight_actual = tf.Variable(tf.random_normal([self.corpus.n_words, const.EMBEDDING_SIZE], -1.0, 1.0)) 60 | embed_v = tf.nn.embedding_lookup(embed_weight_v, inputs) 61 | embed_u = tf.nn.embedding_lookup(embed_weight_u, targets) 62 | embed_actual = tf.nn.embedding_lookup(embed_weight_actual, vocabs) 63 | 64 | ''' 65 | print(embed_u.shape) 66 | print(embed_v.shape) 67 | print(embed_actual.shape) 68 | exit() 69 | ''' 70 | embed_v_trans = tf.transpose(embed_v, [0, 2, 1]) 71 | 72 | #print(embed_v_trans.shape) 73 | scores = tf.squeeze(tf.matmul(embed_u, embed_v_trans), [2]) # batch_size x 1 74 | norm_scores = tf.squeeze(tf.matmul(embed_actual, embed_v_trans), [2]) # batch_size x input_size 75 | 76 | softmax = tf.exp(scores) / tf.reduce_sum(tf.exp(norm_scores), 1) 77 | softmax = tf.expand_dims(softmax, 1) 78 | nll_loss = -tf.reduce_mean(tf.log(tf.clip_by_value(softmax,1e-10,1.0))) 79 | 80 | optimizer = tf.train.AdamOptimizer(learning_rate=const.LR_RATE).minimize(nll_loss) 81 | 82 | saver = tf.train.Saver() 83 | 84 | losses = [] 85 | with tf.Session() as sess: 86 | tf.global_variables_initializer().run() 87 | 88 | for epoch in range(const.EPOCH): 89 | for i, batch in enumerate(self.corpus.batch_data(const.BATCH_SIZE)): 90 | 91 | _inputs, _targets = zip(*batch) # unzip 92 | 93 | _inputs = np.hstack(_inputs) # (2, ) 94 | _inputs = _inputs.reshape(_inputs.shape[0], 1) 95 | _targets = np.vstack(_targets) # (2, 1) 96 | 97 | vocab = self.corpus.var_sentence(self.corpus.vocab) 98 | _vocabs = [] 99 | [_vocabs.append(vocab) for _ in range(_inputs.shape[0])] 100 | _vocabs = np.array(_vocabs) 101 | 102 | _, _loss = sess.run([optimizer, nll_loss], feed_dict={inputs:_inputs, targets:_targets, vocabs: _vocabs}) 103 | losses.append(_loss) 104 | if i % 500 == 0: 105 | print('i, ', i, 'loss', _loss) 106 | 107 | if epoch % 10 == 0: 108 | print('epoch, ', epoch, 'mean loss', np.mean(losses)) 109 | losses = [] 110 | 111 | # save model 112 | saver.save(sess, const.MODEL_PATH) --------------------------------------------------------------------------------
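The TensorFlow graph above computes a full softmax: the score u_target · v_center is normalized against the scores of every word in the vocabulary, which is why each step feeds a [BATCH_SIZE, n_words] vocabs matrix. A minimal NumPy sketch of that objective, with toy sizes and randomly drawn indices invented purely for illustration (not taken from this repository), looks like this:

import numpy as np

rng = np.random.default_rng(0)

VOCAB, DIM, B = 50, 8, 4                          # toy vocabulary size, embedding dim, batch size
V = rng.normal(scale=0.1, size=(VOCAB, DIM))      # center-word embeddings
U = rng.normal(scale=0.1, size=(VOCAB, DIM))      # context-word embeddings

def full_softmax_nll(center_ids, context_ids):
    # p(context | center) = exp(u_context . v_center) / sum_w exp(u_w . v_center)
    v = V[center_ids]                              # [B, D]
    scores = v @ U.T                               # [B, VOCAB]
    scores -= scores.max(axis=1, keepdims=True)    # stabilize the exponentials
    log_z = np.log(np.exp(scores).sum(axis=1))     # [B] log partition over the whole vocabulary
    return float(np.mean(log_z - scores[np.arange(len(center_ids)), context_ids]))

centers = rng.integers(0, VOCAB, size=B)
contexts = rng.integers(0, VOCAB, size=B)
print(full_softmax_nll(centers, contexts))

Because the partition function sums over the whole vocabulary, the per-example cost grows with n_words; the negative-sampling and sampled-softmax variants elsewhere under word2vec/ approximate this denominator with a handful of sampled words instead.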