├── .gitignore ├── README.md ├── hmm ├── README.md ├── hmm_tutorial.ipynb └── src │ ├── const.py │ ├── corpus │ └── toy │ │ └── train.txt │ ├── dataset.py │ ├── hmm.py │ ├── main.py │ └── processing.py ├── language_model ├── README.md ├── languange_model_tutorial.ipynb └── src │ ├── const.py │ ├── corpus │ └── toy │ │ ├── test.txt │ │ └── train.txt │ ├── dataset.py │ ├── evaluate.py │ ├── main.py │ ├── ngram.py │ ├── processing.py │ └── smooth.py ├── lsa └── lsa.py ├── nbayes ├── nbayes.py └── tfidf_nbayes.py ├── pca └── pca.py ├── pcfg ├── README.md ├── pcfg_tutorial.ipynb └── src │ ├── corpus │ └── toy │ │ └── train.txt │ ├── main.py │ └── pcfg.py ├── reading_comprehension ├── README.md ├── const.py ├── corpus │ ├── bAbI │ │ ├── LICENSE.txt │ │ ├── README.txt │ │ └── en-10k │ │ │ ├── qa5_three-arg-relations_test.txt │ │ │ └── qa5_three-arg-relations_train.txt │ ├── reading_comprehension.png │ └── result.png ├── dataset.py ├── main.py └── model.py ├── text_similarity └── vsm_sim.py └── word2vec ├── README.md ├── cbow ├── pytorch │ ├── negative_sampling │ │ ├── cbow.py │ │ ├── const.py │ │ ├── dataset.py │ │ ├── main.py │ │ └── utils.py │ └── softmax │ │ ├── cbow.py │ │ ├── const.py │ │ ├── dataset.py │ │ ├── main.py │ │ └── utils.py └── tensorflow │ ├── negative_sampling │ ├── cbow.py │ ├── const.py │ ├── dataset.py │ └── main.py │ └── softmax │ ├── cbow.py │ ├── const.py │ ├── dataset.py │ └── main.py ├── corpus ├── articles.txt ├── result.png └── trans_code.py └── skipgram ├── pytorch ├── negative_sampling │ ├── const.py │ ├── dataset.py │ ├── main.py │ ├── skipgram.py │ └── utils.py └── softmax │ ├── const.py │ ├── dataset.py │ ├── main.py │ ├── skipgram.py │ └── utils.py └── tensorflow ├── negative_sampling ├── const.py ├── dataset.py ├── main.py └── skipgram.py └── softmax ├── const.py ├── dataset.py ├── main.py └── skipgram.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

/ NLP Learning /
2 | 
3 | 结合python学习自然语言处理
4 | 5 | ## 目录 6 | 7 | - [python实现语言模型](https://github.com/SeanLee97/nlp_learning/tree/master/language_model) 8 | - [python实现HMM](https://github.com/SeanLee97/nlp_learning/tree/master/hmm) 9 | - [python实现PCFG](https://github.com/SeanLee97/nlp_learning/tree/master/pcfg) 10 | - [pytorch&tensorflow实现word2vec (CBOW softmax, CBOW negative_sampling, Skipgram softmax, Skipgram negative_sampling)](https://github.com/SeanLee97/nlp_learning/tree/master/word2vec) 11 | - [reading comprehension 阅读理解任务](https://github.com/SeanLee97/nlp_learning/tree/master/reading_comprehension) 12 | - [tfidf + 朴素贝叶斯](https://seanlee97.github.io/2018/08/25/%E4%B8%BA%E6%9C%B4%E7%B4%A0%E8%B4%9D%E5%8F%B6%E6%96%AF%E5%8A%A0%E5%85%A5TF-IDF%E7%89%B9%E5%BE%81/) 13 | - [text_similarity 文本相似度计算](https://seanlee97.github.io/2018/08/31/%E4%BD%99%E5%BC%A6%E5%AE%9A%E7%90%86%E5%92%8C%E6%96%87%E6%9C%AC%E7%9B%B8%E4%BC%BC%E5%BA%A6/) 14 | - [从特征值特征向量去理解PCA](https://seanlee97.github.io/2018/03/29/%E4%BB%8E%E7%89%B9%E5%BE%81%E5%80%BC%E7%89%B9%E5%BE%81%E5%90%91%E9%87%8F%E5%8E%BB%E7%90%86%E8%A7%A3PCA/) 15 | - [SVD的原理及LSA的求解](https://seanlee97.github.io/2018/09/01/SVD%E7%9A%84%E5%8E%9F%E7%90%86%E5%8F%8ALSA%E7%9A%84%E6%B1%82%E8%A7%A3/) 16 | 17 | 18 | ## 论文实现 19 | - [QANet](https://github.com/SeanLee97/QANet_dureader) 20 | - [bimpm](https://github.com/SeanLee97/bimpm) 21 | -------------------------------------------------------------------------------- /hmm/README.md: -------------------------------------------------------------------------------- 1 | # HMM 2 | 3 | 基于bigram, trigram实现的HMM, 支持viterbi解码输出更高效! 4 | -------------------------------------------------------------------------------- /hmm/hmm_tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# python 实现 隐马尔可夫模型(HMM)\n", 8 | "\n", 9 | "本例子主要受 Michael Collins 教授的 Tagging Problems, and Hidden Markov Models 启发而编写,为了帮助大家理解,我在我的博客、公众号上发表了文章[一文读懂NLP中的HMM(公众号)](https://mp.weixin.qq.com/s?__biz=MzIwNDM1NjUzMA==&mid=2247483662&idx=1&sn=cf463dde9af1844a3fd1e3e4fec26f5c&chksm=96c02fd3a1b7a6c5cfabe53efbff54af33cd2f61d13064645fbff92ce1b024d82acb2375d9b0#rd),欢迎大家阅读。当然强烈推荐Michael Collins 教授的 [Tagging Problems, and Hidden Markov Models](http://www.cs.columbia.edu/~mcollins/hmms-spring2013.pdf)" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## 目录\n", 17 | "\n", 18 | "1. [项目结构](#项目结构)\n", 19 | "2. [环境要求](#环境要求)\n", 20 | "3. [代码分析](#代码分析)\n", 21 | "4. [结果分析](#结果分析)\n", 22 | "5. [项目后续](#项目后续)\n", 23 | "6. 
[联系作者](#联系作者)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## 项目结构\n", 31 | "\n", 32 | "| - src\n", 33 | "\n", 34 | " | - const.py 常量定义文件\n", 35 | "\n", 36 | " | - corpus 语料库\n", 37 | "\n", 38 | " | - dataset.py 加载语料\n", 39 | "\n", 40 | " | - hmm.py bigram hmm, trigram hmm, viterbi\n", 41 | "\n", 42 | " | - main.py 例子程序\n", 43 | "\n", 44 | " | - processing.py 字典的生成等处理方法" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## 环境要求\n", 52 | "\n", 53 | " python3" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## 代码分析" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "### const.py\n", 68 | "\n", 69 | "在这里定义了三个常量" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 1, 75 | "metadata": { 76 | "collapsed": true 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "# 未登录词\n", 81 | "UNK = None\n", 82 | "# 句子开始标记,代表句子的开头\n", 83 | "START_TOKEN = ''\n", 84 | "# 句子结束标记,代表句子的结尾\n", 85 | "END_TOKEN = ''" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "### processing.py\n", 93 | "\n", 94 | "字典的构建" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": { 101 | "collapsed": true 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "# !/usr/bin/env python3\n", 106 | "# -*- coding: utf-8 -*-\n", 107 | "\n", 108 | "'''\n", 109 | "@description: 句子的处理,字典的构建\n", 110 | "@author: Sean QQ: 929325776\n", 111 | "'''\n", 112 | "\n", 113 | "import const\n", 114 | "\n", 115 | "#加入起始标记\n", 116 | "def build_tags(tags):\n", 117 | "\tout = []\n", 118 | "\tfor sentence in tags:\n", 119 | "\t\titems = [x.lower() for x in sentence]\n", 120 | "\t\titems.insert(0, const.START_TOKEN)\n", 121 | "\t\titems.append(const.END_TOKEN)\n", 122 | "\t\tout.append(items)\n", 123 | "\treturn out\n", 124 | "\n", 125 | "# 构建ungram词频词典\n", 126 | "def build_undict(tags):\n", 127 | "\tundict = {}\n", 128 | "\tfor items in tags:\n", 129 | "\t\tfor word in items:\n", 130 | "\t\t\tif word == const.START_TOKEN or word == const.END_TOKEN:\n", 131 | "\t\t\t\tcontinue\n", 132 | "\t\t\tif word not in undict:\n", 133 | "\t\t\t\tundict[word] = 1\n", 134 | "\t\t\telse:\n", 135 | "\t\t\t\tundict[word] += 1\n", 136 | "\treturn undict\n", 137 | "\n", 138 | "\n", 139 | "# 构建bigram词频词典,其中以三元组(u, v)作为词典的键\n", 140 | "def build_bidict(tags):\n", 141 | "\tbidict = {}\n", 142 | "\tfor items in tags: \n", 143 | "\t\tfor i in range(len(items)-1):\n", 144 | "\t\t\ttup = (items[i], items[i+1])\n", 145 | "\t\t\tif tup not in bidict:\n", 146 | "\t\t\t\tbidict[tup] = 1\n", 147 | "\t\t\telse:\n", 148 | "\t\t\t\tbidict[tup] += 1\n", 149 | "\treturn bidict\n", 150 | "\n", 151 | "# 构建trigram词频词典,其中以三元组(u, v, w)作为词典的键\n", 152 | "def build_tridict(tags):\n", 153 | "\ttridict = {}\n", 154 | "\tfor items in tags:\n", 155 | "\t\titems.insert(0, const.START_TOKEN)\n", 156 | "\t\tfor i in range(len(items) -2):\n", 157 | "\t\t\ttup = (items[i], items[i+1], items[i+2])\n", 158 | "\t\t\tif tup not in tridict:\n", 159 | "\t\t\t\ttridict[tup] = 1\n", 160 | "\t\t\telse:\n", 161 | "\t\t\t\ttridict[tup] += 1\n", 162 | "\treturn tridict\n", 163 | "\n", 164 | "# 构建(词,词性)词频字典,以及统计词频\n", 165 | "def build_count_dict(datas, tags):\n", 166 | "\ttagword_dict = {}\n", 167 | "\twordcount = {}\n", 168 | "\ttagcount = {}\n", 169 | "\tfor i, data in enumerate(datas):\n", 170 | "\t\ttag = tags[i][1:-1]\n", 171 | 
"\t\tfor idx, d in enumerate(data):\n", 172 | "\t\t\ttup = (tag[idx], d)\n", 173 | "\t\t\tif tup not in tagword_dict:\n", 174 | "\t\t\t\ttagword_dict[tup] = 1\n", 175 | "\t\t\telse:\n", 176 | "\t\t\t\ttagword_dict[tup] += 1\n", 177 | "\n", 178 | "\t\t\tif d not in wordcount:\n", 179 | "\t\t\t\twordcount[d] = 1\n", 180 | "\t\t\telse:\n", 181 | "\t\t\t\twordcount[d] += 1\n", 182 | "\t\t\tif tag[idx] not in tagcount:\n", 183 | "\t\t\t\ttagcount[tag[idx]] = 1\n", 184 | "\t\t\telse:\n", 185 | "\t\t\t\ttagcount[tag[idx]] += 1\n", 186 | "\treturn tagword_dict, wordcount, tagcount" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "### hmm.py\n", 194 | "\n", 195 | "基于bigram, trigram实现了hmm, 支持viterbi解码" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "collapsed": true 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "# !/usr/bin/env python3\n", 207 | "# -*- coding: utf-8 -*-\n", 208 | "\n", 209 | "'''\n", 210 | "@description: bigram hmm, trigram hmm, _viterbi\n", 211 | "@author: Sean QQ: 929325776\n", 212 | "'''\n", 213 | "\n", 214 | "import math\n", 215 | "import const\n", 216 | "from processing import *\n", 217 | "\n", 218 | "'''bigram hmm'''\n", 219 | "class BiHMM(object):\n", 220 | "\tdef __init__(self, datas, tags):\n", 221 | "\t\tself.datas = datas\n", 222 | "\t\tself.tags = build_tags(tags)\n", 223 | "\t\tself.undict = build_undict(self.tags) \n", 224 | "\t\tself.bidict = build_bidict(self.tags)\n", 225 | "\t\tself.tagword, self.wordcount, self.tagcount = build_count_dict(datas, self.tags)\n", 226 | "\t\tself.postags = [tag for tag in self.undict]\n", 227 | "\n", 228 | "\tdef calc_e_prob(self, *args):\n", 229 | "\t\tif len(args) != 2:\n", 230 | "\t\t\traise ValueError('two tags is required')\n", 231 | "\n", 232 | "\t\tn = 0.0\n", 233 | "\t\tm = 0.0\n", 234 | "\t\tif args in self.tagword:\n", 235 | "\t\t\tn = self.tagword[args]\n", 236 | "\t\tif args[0] in self.undict:\n", 237 | "\t\t\tm = self.undict[args[0]]\n", 238 | "\t\treturn (n + 1) * 1.0 / (m + len(self.wordcount)*len(self.undict))\n", 239 | "\n", 240 | "\tdef calc_prob(self, *args):\n", 241 | "\t\tif len(args) != 2:\n", 242 | "\t\t\traise ValueError('two tags is required')\n", 243 | "\n", 244 | "\t\tn = 0.0\n", 245 | "\t\tm = 0.0\n", 246 | "\t\tif args in self.bidict:\n", 247 | "\t\t\tn = self.bidict[args]\n", 248 | "\t\tif args[0] in self.undict:\n", 249 | "\t\t\tm = self.undict[args[0]]\n", 250 | "\t\treturn (n + 1) * 1.0 / (m + len(self.postags)**2)\n", 251 | "\n", 252 | "\tdef calc_tags_prob(self, tags):\n", 253 | "\t\tprob = 0\n", 254 | "\t\tprev_tag = const.START_TOKEN\n", 255 | "\t\tfor tag in tags:\n", 256 | "\t\t\ttag_prob = self.calc_prob(prev_tag, tag)\n", 257 | "\t\t\tprob += tag_prob\n", 258 | "\t\t\tprev_tag = tag\n", 259 | "\t\treturn prob\n", 260 | "\n", 261 | "\tdef calc_tagword_proba(self, tag, word):\n", 262 | "\t\tprob = 0.0\n", 263 | "\t\ttagword = (tag, word)\n", 264 | "\t\tif tagword in self.tagword:\n", 265 | "\t\t\tprob = float(self.tagword[tagword]) / self.tagcount[tag]\n", 266 | "\t\treturn prob\n", 267 | "\n", 268 | "\t# @param vb _viterbi\n", 269 | "\tdef pred(self, sentence, vb=False):\n", 270 | "\t\tif vb:\n", 271 | "\t\t\t# _viterbi\n", 272 | "\t\t\treturn self._viterbi(sentence)\n", 273 | "\n", 274 | "\t\twordtag = []\n", 275 | "\t\tmax_prob = 0.0\n", 276 | "\t\tmax_tag = None\n", 277 | "\t\t#total_prob = None\n", 278 | "\t\tfor word in sentence:\n", 279 | "\t\t\tfor tag1 in 
self.postags:\n", 280 | "\t\t\t\tfor tag2 in self.postags:\n", 281 | "\t\t\t\t\tq = self.calc_tags_prob((tag1, tag2))\n", 282 | "\t\t\t\t\te = self.calc_tagword_proba(tag2, word)\n", 283 | "\t\t\t\t\tprob = q*e*1.0\n", 284 | "\t\t\t\t\tif prob >= max_prob:\n", 285 | "\t\t\t\t\t\tmax_prob = prob\n", 286 | "\t\t\t\t\t\tmax_tag = tag2\n", 287 | "\t\t\twordtag.append((word, max_tag))\n", 288 | "\t\t\t'''\n", 289 | "\t\t\tif total_prob == None:\n", 290 | "\t\t\t\ttotal_prob = max_prob\n", 291 | "\t\t\telse:\n", 292 | "\t\t\t\ttotal_prob *= max_prob \n", 293 | "\t\t\t'''\n", 294 | "\t\t\tmax_prob = 0.0\t\t\n", 295 | "\t\treturn wordtag\n", 296 | "\n", 297 | "\n", 298 | "\tdef _viterbi_decode(self, sentence, score, trace):\n", 299 | "\t\tresult = []\n", 300 | "\t\ttmp = -float('inf')\n", 301 | "\t\tres_x = 0\n", 302 | "\t\tfor idx, val in enumerate(self.postags):\n", 303 | "\t\t\tif tmp < score[idx][len(sentence)-1]:\n", 304 | "\t\t\t\ttmp = score[idx][len(sentence)-1]\n", 305 | "\t\t\t\tres_x = idx\n", 306 | "\t\tresult.append(res_x)\n", 307 | "\t\tfor idx in range(len(sentence)-1, 0, -1):\n", 308 | "\t\t\tresult.append(trace[result[-1]][idx])\n", 309 | "\t\tresult.reverse()\n", 310 | "\t\tresult_pos = []\n", 311 | "\t\tresult_pos = [self.postags[k] for k in result]\n", 312 | "\t\twordtag = list(zip(sentence, result_pos))\n", 313 | "\t\treturn wordtag\n", 314 | "\n", 315 | "\tdef _viterbi(self, sentence):\n", 316 | "\t\trow = len(self.postags)\n", 317 | "\t\tcol = len(sentence)\n", 318 | "\n", 319 | "\t\ttrace = [[-1 for i in range(col)] for i in range(row)]\n", 320 | "\t\tscore = [[-1 for i in range(col)] for i in range(row)]\n", 321 | "\n", 322 | "\t\tfor idx, val in enumerate(sentence):\n", 323 | "\t\t\tif idx == 0:\n", 324 | "\t\t\t\tfor idx_pos, val_pos in enumerate(self.postags):\n", 325 | "\t\t\t\t\tscore[idx_pos][idx] = self.calc_e_prob(val_pos, sentence[idx]) # emit\n", 326 | "\t\t\telse:\n", 327 | "\t\t\t\tfor idx_pos, val_pos in enumerate(self.postags):\n", 328 | "\t\t\t\t\ttmp = -float('inf')\n", 329 | "\t\t\t\t\ttrace_tmp = -1\n", 330 | "\t\t\t\t\tfor idx_pos2, val_pos2 in enumerate(self.postags):\n", 331 | "\t\t\t\t\t\tr = score[idx_pos2][idx-1]*self.calc_prob(val_pos2, val_pos)\n", 332 | "\t\t\t\t\t\tif r > tmp:\n", 333 | "\t\t\t\t\t\t\ttmp = r\n", 334 | "\t\t\t\t\t\t\ttrace_tmp = idx_pos2\n", 335 | "\t\t\t\t\t\ttrace[idx_pos][idx] = trace_tmp\n", 336 | "\t\t\t\t\t\tscore[idx_pos][idx] = tmp*self.calc_e_prob(val_pos, val)\n", 337 | "\t\treturn self._viterbi_decode(sentence, score, trace)\n", 338 | "\n", 339 | "class TriHMM(BiHMM):\n", 340 | "\tdef __init__(self, datas, tags):\n", 341 | "\t\tBiHMM.__init__(self, datas, tags)\n", 342 | "\t\tself.tridict = build_tridict(self.tags)\n", 343 | "\n", 344 | "\tdef calc_prob(self, *args):\n", 345 | "\t\tif len(args) != 3:\n", 346 | "\t\t\traise ValueError('three tags is required')\n", 347 | "\n", 348 | "\t\tn = 0.0\n", 349 | "\t\tm = 0.0\n", 350 | "\t\tbitup = (args[0], args[1])\n", 351 | "\t\tif args in self.tridict:\n", 352 | "\t\t\tn = self.tridict[args]\n", 353 | "\t\tif bitup in self.bidict:\n", 354 | "\t\t\tm = self.bidict[bitup]\n", 355 | "\t\treturn (n + 1) * 1.0 / (m + len(self.postags)**2)\n", 356 | "\n", 357 | "\n", 358 | "\t\tprob = 0\n", 359 | "\t\tif self.smooth != None:\n", 360 | "\t\t\tprob = self.smooth(args[0], args[1], args[2], tridict=self.tridict, bidict=self.bidict, undict=self.undict)\n", 361 | "\t\telse:\n", 362 | "\t\t\tbitup = (args[0], args[1])\t\t\t\t\n", 363 | "\t\t\tif args in self.tridict and bitup in 
self.bidict:\n", 364 | "\t\t\t\treturn float(self.tridict[args]) / self.bidict[bitup]\n", 365 | "\t\treturn prob\n", 366 | "\n", 367 | "\tdef calc_tags_prob(self, tags):\n", 368 | "\t\tprob = 0\n", 369 | "\t\tprev_stack = [const.START_TOKEN, const.START_TOKEN]\n", 370 | "\t\tfor tag in tags:\n", 371 | "\t\t\ttag_prob = self.calc_prob(prev_stack[0], prev_stack[1], tag)\n", 372 | "\t\t\tprob += tag_prob\n", 373 | "\t\t\tprev_stack[0] = prev_stack[1]\n", 374 | "\t\t\tprev_stack[1] = tag\n", 375 | "\t\treturn prob\n", 376 | "\n", 377 | "\t# @param vb _viterbi\n", 378 | "\tdef pred(self, sentence, vb=False):\n", 379 | "\t\tif vb:\n", 380 | "\t\t\treturn self._viterbi(sentence)\n", 381 | "\t\twordtag = []\n", 382 | "\t\tmax_prob = 0.0\n", 383 | "\t\tmax_tag = None\n", 384 | "\t\t#total_prob = None\n", 385 | "\t\tfor word in sentence:\n", 386 | "\t\t\tfor tag1 in self.postags:\n", 387 | "\t\t\t\tfor tag2 in self.postags:\n", 388 | "\t\t\t\t\tfor tag3 in self.postags:\n", 389 | "\t\t\t\t\t\tq = self.calc_tags_prob((tag1, tag2, tag3))\n", 390 | "\t\t\t\t\t\te = self.calc_tagword_proba(tag3, word)\n", 391 | "\t\t\t\t\t\tprob = q*e*1.0\n", 392 | "\t\t\t\t\t\tif prob >= max_prob:\n", 393 | "\t\t\t\t\t\t\tmax_prob = prob\n", 394 | "\t\t\t\t\t\t\tmax_tag = tag3\n", 395 | "\t\t\twordtag.append((word, max_tag))\n", 396 | "\t\t\t'''\n", 397 | "\t\t\tif total_prob == None:\n", 398 | "\t\t\t\ttotal_prob = max_prob\n", 399 | "\t\t\telse:\n", 400 | "\t\t\t\ttotal_prob *= max_prob \n", 401 | "\t\t\t'''\n", 402 | "\t\t\tmax_prob = 0.0\t\t\n", 403 | "\t\treturn wordtag\n", 404 | "\n", 405 | "\tdef _viterbi_decode(self, sentence, score, trace):\n", 406 | "\t\tresult = []\n", 407 | "\t\ttmp = -float('inf')\n", 408 | "\t\tres_x = 0\n", 409 | "\t\tres_y = 0\n", 410 | "\t\tfor idx, val in enumerate(self.postags):\n", 411 | "\t\t\tfor idx_pos2, val_pos2 in enumerate(self.postags):\n", 412 | "\t\t\t\tif tmp < score[idx_pos2][idx][len(sentence)-1]:\n", 413 | "\t\t\t\t\ttmp = score[idx_pos2][idx][len(sentence)-1]\n", 414 | "\t\t\t\t\tres_x = idx\n", 415 | "\t\t\t\t\tres_y = idx_pos2\n", 416 | "\t\tresult.extend([res_x, res_y])\n", 417 | "\t\tfor idx in range(len(sentence)-1, 0, -1):\n", 418 | "\t\t\tresult.append(trace[result[-2]][result[-1]][idx])\n", 419 | "\t\tresult.reverse()\n", 420 | "\t\tresult_pos = []\n", 421 | "\t\tresult_pos = [self.postags[k] for k in result]\n", 422 | "\t\twordtag = list(zip(sentence, result_pos))\n", 423 | "\t\treturn wordtag\n", 424 | "\n", 425 | "\tdef _viterbi(self, sentence):\n", 426 | "\t\trow = len(self.postags)\n", 427 | "\t\tcol = len(sentence)\n", 428 | "\n", 429 | "\t\ttrace = [[[-1 for i in range(col)] for i in range(row)] for i in range(row)]\n", 430 | "\t\tscore = [[[-1 for i in range(col)] for i in range(row)] for i in range(row)]\n", 431 | "\n", 432 | "\t\tfor idx, val in enumerate(sentence):\n", 433 | "\t\t\tif idx == 0:\n", 434 | "\t\t\t\tfor idx_pos, val_pos in enumerate(self.postags):\n", 435 | "\t\t\t\t\tscore[idx_pos][0][idx] = self.calc_e_prob(val_pos, sentence[idx]) # emit\n", 436 | "\t\t\telse:\n", 437 | "\t\t\t\tfor idx_pos, val_pos in enumerate(self.postags):\n", 438 | "\t\t\t\t\ttmp = -float('inf')\n", 439 | "\t\t\t\t\ttrace_tmp = -1\n", 440 | "\t\t\t\t\tfor idx_pos2, val_pos2 in enumerate(self.postags):\n", 441 | "\t\t\t\t\t\tfor idx_pos3, val_pos3 in enumerate(self.postags):\n", 442 | "\t\t\t\t\t\t\tr = score[idx_pos3][idx_pos2][idx-1]*self.calc_prob(val_pos3, val_pos2 ,val_pos)\n", 443 | "\t\t\t\t\t\t\tif r > tmp:\n", 444 | "\t\t\t\t\t\t\t\ttmp = r\n", 445 | 
"\t\t\t\t\t\t\t\ttrace_tmp = idx_pos3\n", 446 | "\t\t\t\t\t\t\ttrace[idx_pos][idx_pos2][idx] = trace_tmp\n", 447 | "\t\t\t\t\t\t\tscore[idx_pos][idx_pos2][idx] = tmp*self.calc_e_prob(val_pos, val)\n", 448 | "\t\treturn self._viterbi_decode(sentence, score, trace)" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": {}, 454 | "source": [ 455 | "## 结果分析" 456 | ] 457 | }, 458 | { 459 | "cell_type": "markdown", 460 | "metadata": {}, 461 | "source": [ 462 | "** bigram hmm **\n", 463 | "\n", 464 | "bigram hmm\n", 465 | "\n", 466 | "[('小明', 'nr'), ('爱', 'v'), ('老鼠', 'n'), ('和', 'c'), ('狗', 'n')]\n", 467 | "\n", 468 | "bigram hmm with viterbi decode\n", 469 | "\n", 470 | "[('小明', 'nr'), ('爱', 'v'), ('老鼠', 'n'), ('和', 'v'), ('狗', 'n')]\n", 471 | "\n", 472 | "**trigram hmm**\n", 473 | "\n", 474 | "trigram hmm\n", 475 | "\n", 476 | "[('小明', 'nr'), ('爱', 'v'), ('老鼠', 'n'), ('和', 'c'), ('狗', 'n')]\n", 477 | "\n", 478 | "trigram hmm with viterbi decode\n", 479 | "\n", 480 | "[('小明', 'nr'), ('爱', 'v'), ('老鼠', 'n'), ('和', 'c'), ('狗', 'n')]\n" 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": {}, 486 | "source": [ 487 | "## 项目后续\n", 488 | "\n", 489 | "过段时间会加入深度学习在NLP上的应用,如果你感兴趣,可以关注我的公众号,或者star, watch 本项目哦" 490 | ] 491 | }, 492 | { 493 | "cell_type": "markdown", 494 | "metadata": {}, 495 | "source": [ 496 | "## 联系作者\n", 497 | "\n", 498 | "@author sean\n", 499 | "\n", 500 | "@qq 929325776\n", 501 | "\n", 502 | "有什么问题,可以联系我,一起讨论" 503 | ] 504 | } 505 | ], 506 | "metadata": { 507 | "kernelspec": { 508 | "display_name": "Python 3", 509 | "language": "python", 510 | "name": "python3" 511 | }, 512 | "language_info": { 513 | "codemirror_mode": { 514 | "name": "ipython", 515 | "version": 3 516 | }, 517 | "file_extension": ".py", 518 | "mimetype": "text/x-python", 519 | "name": "python", 520 | "nbconvert_exporter": "python", 521 | "pygments_lexer": "ipython3", 522 | "version": "3.6.1" 523 | } 524 | }, 525 | "nbformat": 4, 526 | "nbformat_minor": 2 527 | } 528 | -------------------------------------------------------------------------------- /hmm/src/const.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | ''' 6 | @description: 定义常量 7 | @author: Sean QQ: 929325776 8 | ''' 9 | 10 | UNK = None 11 | START_TOKEN = '' 12 | END_TOKEN = '' 13 | -------------------------------------------------------------------------------- /hmm/src/corpus/toy/train.txt: -------------------------------------------------------------------------------- 1 | 猫/n 抓/v 老鼠/n 2 | 狗/n 追/v 猫/n 3 | 小明/nr 爱/v 狗/n 和/c 猫/n 4 | -------------------------------------------------------------------------------- /hmm/src/dataset.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | def load_data(file_path): 5 | datas, tags = [], [] 6 | with open(file_path, 'r') as f: 7 | for line in f: 8 | line = line.strip() 9 | splits = line.split(' ') 10 | data, tag = [], [] 11 | for part in splits: 12 | parts = part.split('/') 13 | data.append(parts[0]) 14 | tag.append(parts[1]) 15 | datas.append(data) 16 | tags.append(tag) 17 | return datas, tags 18 | 19 | -------------------------------------------------------------------------------- /hmm/src/hmm.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | ''' 5 | @description: bigram hmm, 
trigram hmm, _viterbi 6 | @author: Sean QQ: 929325776 7 | ''' 8 | 9 | import math 10 | import const 11 | from processing import * 12 | 13 | '''bigram hmm''' 14 | class BiHMM(object): 15 | def __init__(self, datas, tags): 16 | self.datas = datas 17 | self.tags = build_tags(tags) 18 | self.undict = build_undict(self.tags) 19 | self.bidict = build_bidict(self.tags) 20 | self.tagword, self.wordcount, self.tagcount = build_count_dict(datas, self.tags) 21 | self.postags = [tag for tag in self.undict] 22 | 23 | def calc_e_prob(self, *args): 24 | if len(args) != 2: 25 | raise ValueError('two tags is required') 26 | 27 | n = 0.0 28 | m = 0.0 29 | if args in self.tagword: 30 | n = self.tagword[args] 31 | if args[0] in self.undict: 32 | m = self.undict[args[0]] 33 | return (n + 1) * 1.0 / (m + len(self.wordcount)*len(self.undict)) 34 | 35 | def calc_prob(self, *args): 36 | if len(args) != 2: 37 | raise ValueError('two tags is required') 38 | 39 | n = 0.0 40 | m = 0.0 41 | if args in self.bidict: 42 | n = self.bidict[args] 43 | if args[0] in self.undict: 44 | m = self.undict[args[0]] 45 | return (n + 1) * 1.0 / (m + len(self.postags)**2) 46 | 47 | def calc_tags_prob(self, tags): 48 | prob = 0 49 | prev_tag = const.START_TOKEN 50 | for tag in tags: 51 | tag_prob = self.calc_prob(prev_tag, tag) 52 | prob += tag_prob 53 | prev_tag = tag 54 | return prob 55 | 56 | def calc_tagword_proba(self, tag, word): 57 | prob = 0.0 58 | tagword = (tag, word) 59 | if tagword in self.tagword: 60 | prob = float(self.tagword[tagword]) / self.tagcount[tag] 61 | return prob 62 | 63 | # @param vb _viterbi 64 | def pred(self, sentence, vb=False): 65 | if vb: 66 | # _viterbi 67 | return self._viterbi(sentence) 68 | 69 | wordtag = [] 70 | max_prob = 0.0 71 | max_tag = None 72 | #total_prob = None 73 | for word in sentence: 74 | for tag1 in self.postags: 75 | for tag2 in self.postags: 76 | q = self.calc_tags_prob((tag1, tag2)) 77 | e = self.calc_tagword_proba(tag2, word) 78 | prob = q*e*1.0 79 | if prob >= max_prob: 80 | max_prob = prob 81 | max_tag = tag2 82 | wordtag.append((word, max_tag)) 83 | ''' 84 | if total_prob == None: 85 | total_prob = max_prob 86 | else: 87 | total_prob *= max_prob 88 | ''' 89 | max_prob = 0.0 90 | return wordtag 91 | 92 | 93 | def _viterbi_decode(self, sentence, score, trace): 94 | result = [] 95 | tmp = -float('inf') 96 | res_x = 0 97 | for idx, val in enumerate(self.postags): 98 | if tmp < score[idx][len(sentence)-1]: 99 | tmp = score[idx][len(sentence)-1] 100 | res_x = idx 101 | result.append(res_x) 102 | for idx in range(len(sentence)-1, 0, -1): 103 | result.append(trace[result[-1]][idx]) 104 | result.reverse() 105 | result_pos = [] 106 | result_pos = [self.postags[k] for k in result] 107 | wordtag = list(zip(sentence, result_pos)) 108 | return wordtag 109 | 110 | def _viterbi(self, sentence): 111 | row = len(self.postags) 112 | col = len(sentence) 113 | 114 | trace = [[-1 for i in range(col)] for i in range(row)] 115 | score = [[-1 for i in range(col)] for i in range(row)] 116 | 117 | for idx, val in enumerate(sentence): 118 | if idx == 0: 119 | for idx_pos, val_pos in enumerate(self.postags): 120 | score[idx_pos][idx] = self.calc_e_prob(val_pos, sentence[idx]) # emit 121 | else: 122 | for idx_pos, val_pos in enumerate(self.postags): 123 | tmp = -float('inf') 124 | trace_tmp = -1 125 | for idx_pos2, val_pos2 in enumerate(self.postags): 126 | r = score[idx_pos2][idx-1]*self.calc_prob(val_pos2, val_pos) 127 | if r > tmp: 128 | tmp = r 129 | trace_tmp = idx_pos2 130 | trace[idx_pos][idx] = 
trace_tmp 131 | score[idx_pos][idx] = tmp*self.calc_e_prob(val_pos, val) 132 | return self._viterbi_decode(sentence, score, trace) 133 | 134 | class TriHMM(BiHMM): 135 | def __init__(self, datas, tags): 136 | BiHMM.__init__(self, datas, tags) 137 | self.tridict = build_tridict(self.tags) 138 | 139 | def calc_prob(self, *args): 140 | if len(args) != 3: 141 | raise ValueError('three tags is required') 142 | 143 | n = 0.0 144 | m = 0.0 145 | bitup = (args[0], args[1]) 146 | if args in self.tridict: 147 | n = self.tridict[args] 148 | if bitup in self.bidict: 149 | m = self.bidict[bitup] 150 | return (n + 1) * 1.0 / (m + len(self.postags)**2) 151 | 152 | 153 | prob = 0 154 | if self.smooth != None: 155 | prob = self.smooth(args[0], args[1], args[2], tridict=self.tridict, bidict=self.bidict, undict=self.undict) 156 | else: 157 | bitup = (args[0], args[1]) 158 | if args in self.tridict and bitup in self.bidict: 159 | return float(self.tridict[args]) / self.bidict[bitup] 160 | return prob 161 | 162 | def calc_tags_prob(self, tags): 163 | prob = 0 164 | prev_stack = [const.START_TOKEN, const.START_TOKEN] 165 | for tag in tags: 166 | tag_prob = self.calc_prob(prev_stack[0], prev_stack[1], tag) 167 | prob += tag_prob 168 | prev_stack[0] = prev_stack[1] 169 | prev_stack[1] = tag 170 | return prob 171 | 172 | # @param vb _viterbi 173 | def pred(self, sentence, vb=False): 174 | if vb: 175 | return self._viterbi(sentence) 176 | wordtag = [] 177 | max_prob = 0.0 178 | max_tag = None 179 | #total_prob = None 180 | for word in sentence: 181 | for tag1 in self.postags: 182 | for tag2 in self.postags: 183 | for tag3 in self.postags: 184 | q = self.calc_tags_prob((tag1, tag2, tag3)) 185 | e = self.calc_tagword_proba(tag3, word) 186 | prob = q*e*1.0 187 | if prob >= max_prob: 188 | max_prob = prob 189 | max_tag = tag3 190 | wordtag.append((word, max_tag)) 191 | ''' 192 | if total_prob == None: 193 | total_prob = max_prob 194 | else: 195 | total_prob *= max_prob 196 | ''' 197 | max_prob = 0.0 198 | return wordtag 199 | 200 | def _viterbi_decode(self, sentence, score, trace): 201 | result = [] 202 | tmp = -float('inf') 203 | res_x = 0 204 | res_y = 0 205 | for idx, val in enumerate(self.postags): 206 | for idx_pos2, val_pos2 in enumerate(self.postags): 207 | if tmp < score[idx_pos2][idx][len(sentence)-1]: 208 | tmp = score[idx_pos2][idx][len(sentence)-1] 209 | res_x = idx 210 | res_y = idx_pos2 211 | result.extend([res_x, res_y]) 212 | for idx in range(len(sentence)-1, 0, -1): 213 | result.append(trace[result[-2]][result[-1]][idx]) 214 | result.reverse() 215 | result_pos = [] 216 | result_pos = [self.postags[k] for k in result] 217 | wordtag = list(zip(sentence, result_pos)) 218 | return wordtag 219 | 220 | def _viterbi(self, sentence): 221 | row = len(self.postags) 222 | col = len(sentence) 223 | 224 | trace = [[[-1 for i in range(col)] for i in range(row)] for i in range(row)] 225 | score = [[[-1 for i in range(col)] for i in range(row)] for i in range(row)] 226 | 227 | for idx, val in enumerate(sentence): 228 | if idx == 0: 229 | for idx_pos, val_pos in enumerate(self.postags): 230 | score[idx_pos][0][idx] = self.calc_e_prob(val_pos, sentence[idx]) # emit 231 | else: 232 | for idx_pos, val_pos in enumerate(self.postags): 233 | tmp = -float('inf') 234 | trace_tmp = -1 235 | for idx_pos2, val_pos2 in enumerate(self.postags): 236 | for idx_pos3, val_pos3 in enumerate(self.postags): 237 | r = score[idx_pos3][idx_pos2][idx-1]*self.calc_prob(val_pos3, val_pos2 ,val_pos) 238 | if r > tmp: 239 | tmp = r 240 | 
trace_tmp = idx_pos3 241 | trace[idx_pos][idx_pos2][idx] = trace_tmp 242 | score[idx_pos][idx_pos2][idx] = tmp*self.calc_e_prob(val_pos, val) 243 | return self._viterbi_decode(sentence, score, trace) -------------------------------------------------------------------------------- /hmm/src/main.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | from dataset import load_data 5 | from hmm import * 6 | 7 | datas, tags = load_data('./corpus/toy/train.txt') 8 | 9 | ################## Bigram HMM start ##################### 10 | print('\n************** bigram hmm **************\n') 11 | bihmm = BiHMM(datas, tags) 12 | print("bigram hmm") 13 | print(bihmm.pred(['小明', '爱', '老鼠', '和', '狗'])) 14 | print("bigram hmm with viterbi decode") 15 | print(bihmm.pred(['小明', '爱', '老鼠', '和', '狗'], vb=True)) 16 | ################## Bigram HMM end ##################### 17 | 18 | ################## Trigram HMM start ##################### 19 | print('\n************* trigram hmm *************\n') 20 | trihmm = TriHMM(datas, tags) 21 | print("trigram hmm") 22 | print(trihmm.pred(['小明', '爱', '老鼠', '和', '狗'])) 23 | print("trigram hmm with viterbi decode") 24 | print(trihmm.pred(['小明', '爱', '老鼠', '和', '狗'], vb=True)) 25 | ################## Trigram HMM end ##################### 26 | -------------------------------------------------------------------------------- /hmm/src/processing.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | ''' 5 | @description: 句子的处理,字典的构建 6 | @author: Sean QQ: 929325776 7 | ''' 8 | 9 | import const 10 | 11 | #加入起始标记 12 | def build_tags(tags): 13 | out = [] 14 | for sentence in tags: 15 | items = [x.lower() for x in sentence] 16 | items.insert(0, const.START_TOKEN) 17 | items.append(const.END_TOKEN) 18 | out.append(items) 19 | return out 20 | 21 | # 构建ungram词频词典 22 | def build_undict(tags): 23 | undict = {} 24 | for items in tags: 25 | for word in items: 26 | if word == const.START_TOKEN or word == const.END_TOKEN: 27 | continue 28 | if word not in undict: 29 | undict[word] = 1 30 | else: 31 | undict[word] += 1 32 | return undict 33 | 34 | 35 | # 构建bigram词频词典,其中以三元组(u, v)作为词典的键 36 | def build_bidict(tags): 37 | bidict = {} 38 | for items in tags: 39 | for i in range(len(items)-1): 40 | tup = (items[i], items[i+1]) 41 | if tup not in bidict: 42 | bidict[tup] = 1 43 | else: 44 | bidict[tup] += 1 45 | return bidict 46 | 47 | # 构建trigram词频词典,其中以三元组(u, v, w)作为词典的键 48 | def build_tridict(tags): 49 | tridict = {} 50 | for items in tags: 51 | items.insert(0, const.START_TOKEN) 52 | for i in range(len(items) -2): 53 | tup = (items[i], items[i+1], items[i+2]) 54 | if tup not in tridict: 55 | tridict[tup] = 1 56 | else: 57 | tridict[tup] += 1 58 | return tridict 59 | 60 | # 构建(词,词性)词频字典,以及统计词频 61 | def build_count_dict(datas, tags): 62 | tagword_dict = {} 63 | wordcount = {} 64 | tagcount = {} 65 | for i, data in enumerate(datas): 66 | tag = tags[i][1:-1] 67 | for idx, d in enumerate(data): 68 | tup = (tag[idx], d) 69 | if tup not in tagword_dict: 70 | tagword_dict[tup] = 1 71 | else: 72 | tagword_dict[tup] += 1 73 | 74 | if d not in wordcount: 75 | wordcount[d] = 1 76 | else: 77 | wordcount[d] += 1 78 | if tag[idx] not in tagcount: 79 | tagcount[tag[idx]] = 1 80 | else: 81 | tagcount[tag[idx]] += 1 82 | return tagword_dict, wordcount, tagcount 83 | 
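84 | # Usage sketch (hypothetical __main__ demo, mirroring the tags of the toy corpus above):
85 | # build_tags adds the start/end markers, then the dictionary builders simply count
86 | # unigram/bigram tag tuples. The expected outputs in the comments hold for this toy input.
87 | if __name__ == '__main__':
88 |     toy_tags = build_tags([['n', 'v', 'n'], ['nr', 'v', 'n', 'c', 'n']])
89 |     print(build_undict(toy_tags))  # {'n': 4, 'v': 2, 'nr': 1, 'c': 1}
90 |     print(build_bidict(toy_tags))  # counts of adjacent (tag, tag) pairs, incl. start/end markers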
-------------------------------------------------------------------------------- /language_model/README.md: -------------------------------------------------------------------------------- 1 | # language model 2 | 实现了 3 | 4 | * unigram 5 | * bigram 6 | * trigram 7 | 8 | 采用了困惑度perplexity对模型评价, 采用了smooth方法 9 | -------------------------------------------------------------------------------- /language_model/languange_model_tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# python实现n-gram\n", 8 | "\n", 9 | "本例子主要受 Michael Collins 教授的 Language Modeling 启发而编写,为了帮助大家理解语言模型,我在我的博客、公众号上发表了文章[一文读懂NLP中的语言模型(公众号)](http://mp.weixin.qq.com/s?__biz=MzIwNDM1NjUzMA==&mid=2247483658&idx=1&sn=9c5e7cc50b65cf31a08f1e2a0046ceb1&chksm=96c02fd7a1b7a6c1bbabe19145665d370020f4a3e89ebdc1226a1ec4ed110ef089c6fb0212c4&mpshare=1&scene=1&srcid=1114A1PGK4rDqKMMbsAmplr3#rd),欢迎大家阅读。当然强烈推荐[Michael Collins 教授的 Language Modeling 原文](http://www.cs.columbia.edu/~mcollins/lm-spring2013.pdf)" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## 目录\n", 17 | "\n", 18 | "1. [项目结构](#项目结构)\n", 19 | "\n", 20 | "2. [环境要求](#环境要求)\n", 21 | "\n", 22 | "3. [代码分析](#代码分析)\n", 23 | "\n", 24 | "4. [结果分析](#结果分析)\n", 25 | "\n", 26 | "5. [项目后续](#项目后续)\n", 27 | "\n", 28 | "6. [联系作者](#联系作者)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "### 项目结构\n", 36 | "\n", 37 | "| - src\n", 38 | " \n", 39 | " | - const.py 常量定义文件\n", 40 | " \n", 41 | " | - corpus 语料库\n", 42 | " \n", 43 | " | - dataset.py 加载语料\n", 44 | " \n", 45 | " | - evaluate.py 模型的评估方法\n", 46 | " \n", 47 | " | - main.py 例子程序\n", 48 | " \n", 49 | " | - ngram.py ungram, bigram, trigram 模型,以及一些模型方法\n", 50 | " \n", 51 | " | - processing.py 字典的生成等处理方法\n", 52 | " \n", 53 | " | - smooth.py 平滑方法" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## 环境要求\n", 61 | "\n", 62 | " python3" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "## 代码分析" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "### const.py\n", 77 | "\n", 78 | "在这里定义了三个常量" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 1, 84 | "metadata": { 85 | "collapsed": true 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "# 未登录词\n", 90 | "UNK = None\n", 91 | "# 句子开始标记,代表句子的开头\n", 92 | "START_TOKEN = ''\n", 93 | "# 句子结束标记,代表句子的结尾\n", 94 | "END_TOKEN = ''" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "### processing.py" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": { 108 | "collapsed": true 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "import const\n", 113 | "\n", 114 | "#加入起始标记\n", 115 | "def build_sentences(sentences):\n", 116 | " out = []\n", 117 | " for sentence in sentences:\n", 118 | " words = [x.lower() for x in sentence]\n", 119 | " words.insert(0, \"\")\n", 120 | " words.append(\"\")\n", 121 | " out.append(words)\n", 122 | " return out\n", 123 | "\n", 124 | "# 构建ungram词频词典\n", 125 | "def build_undict(sentences):\n", 126 | " undict = {}\n", 127 | " total = 0\n", 128 | " for words in sentences:\n", 129 | " for word in words:\n", 130 | " if word not in undict:\n", 131 | " undict[word] = 1\n", 132 | " else:\n", 133 | " undict[word] += 
1\n", 134 | " if word != const.START_TOKEN and word != const.END_TOKEN:\n", 135 | " total += 1\n", 136 | " return undict, total\n", 137 | "\n", 138 | "# 构建bigram词频词典,其中以三元组(u, v)作为词典的键\n", 139 | "def build_bidict(sentences):\n", 140 | " bidict = {}\n", 141 | " for words in sentences:\n", 142 | " for i in range(len(words)-1):\n", 143 | " tup = (words[i], words[i+1])\n", 144 | " if tup not in bidict:\n", 145 | " bidict[tup] = 1\n", 146 | " else:\n", 147 | " bidict[tup] += 1\n", 148 | " return bidict\n", 149 | "\n", 150 | "# 构建trigram词频词典,其中以三元组(u, v, w)作为词典的键\n", 151 | "def build_tridict(sentences):\n", 152 | " tridict = {}\n", 153 | " for words in sentences:\n", 154 | " for i in range(len(words) -2):\n", 155 | " tup = (words[i], words[i+1], words[i+2])\n", 156 | " if tup not in tridict:\n", 157 | " tridict[tup] = 1\n", 158 | " else:\n", 159 | " tridict[tup] += 1\n", 160 | " return tridict" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "### ngram.py\n", 168 | "\n", 169 | "n-gram模型,实现了ungram, bigram, trigram" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": { 176 | "collapsed": true 177 | }, 178 | "outputs": [], 179 | "source": [ 180 | "import math\n", 181 | "import const\n", 182 | "from processing import *\n", 183 | "\n", 184 | "'''\n", 185 | "@function calc_prob \t\t\t计算条件概率,这里使用最大似然估计(max-likelihood estimate)去计算概率\n", 186 | "@function calc_sentence_prob\t计算句子的条件概率\n", 187 | "'''\n", 188 | "class UnGram(object):\n", 189 | "\tdef __init__(self, sentences, smooth = None):\n", 190 | "\t\tself.undict, self.total = build_undict(sentences)\n", 191 | "\t\tself.smooth = smooth\n", 192 | "\n", 193 | "\tdef calc_prob(self, word):\n", 194 | "\t\tprob = 0\n", 195 | "\t\tif self.smooth != None:\n", 196 | "\t\t\tprob = self.smooth(word, undict=self.undict, total=self.total)\n", 197 | "\t\telse:\n", 198 | "\t\t\tif word in self.undict:\n", 199 | "\t\t\t\tprob = float(self.undict[word]) / self.total\n", 200 | "\t\treturn prob\n", 201 | "\n", 202 | "\tdef calc_sentence_prob(self, sentence, prob_log=True):\n", 203 | "\t\tprob_log_sum = 0\n", 204 | "\t\tfor word in sentence:\n", 205 | "\t\t\tif word != const.START_TOKEN and word != const.END_TOKEN:\n", 206 | "\t\t\t\tword_prob = self.calc_prob(word)\n", 207 | "\t\t\t\tif word_prob != 0:\n", 208 | "\t\t\t\t\tprob_log_sum += math.log(word_prob, 2)\n", 209 | "\t\treturn math.pow(2, prob_log_sum) if prob_log else prob_log_sum\n", 210 | "\n", 211 | "\tdef sort_vocab(self):\n", 212 | "\t\tvocabs = list(self.undict.keys())\n", 213 | "\t\tvocabs.remove(const.START_TOKEN)\n", 214 | "\t\tvocabs.remove(const.END_TOKEN)\n", 215 | "\t\tvocabs.sort()\n", 216 | "\t\tvocabs.append(const.UNK)\n", 217 | "\t\tvocabs.append(const.START_TOKEN)\n", 218 | "\t\tvocabs.append(const.END_TOKEN)\n", 219 | "\t\treturn vocabs\n", 220 | "\n", 221 | "class BiGram(UnGram):\n", 222 | "\tdef __init__(self, sentences, smooth = None):\n", 223 | "\t\tUnGram.__init__(self, sentences, smooth)\n", 224 | "\t\tself.bidict = build_bidict(sentences)\n", 225 | "\n", 226 | "\tdef calc_prob(self, *args):\n", 227 | "\t\tif len(args) != 2:\n", 228 | "\t\t\traise ValueError('two words is required')\n", 229 | "\n", 230 | "\t\tprob = 0\n", 231 | "\t\tif self.smooth != None:\n", 232 | "\t\t\tprob = self.smooth(args[0], args[1], bidict=self.bidict, undict=self.undict)\n", 233 | "\t\telse:\n", 234 | "\t\t\tif args in self.bidict and args[0] in self.undict:\n", 235 | "\t\t\t\treturn float(self.bidict[args]) / 
self.undict[args[0]]\n", 236 | "\t\treturn prob\n", 237 | "\n", 238 | "\tdef calc_sentence_prob(self, sentence, prob_log=True):\n", 239 | "\t\tprob_log_sum = 0\n", 240 | "\t\tprev_word = None\n", 241 | "\t\tfor word in sentence:\n", 242 | "\t\t\tif prev_word != None:\n", 243 | "\t\t\t\tword_prob = self.calc_prob(prev_word, word)\n", 244 | "\t\t\t\tprob_log_sum += word_prob\n", 245 | "\t\t\tprev_word = word\n", 246 | "\t\treturn math.pow(2, prob_log_sum) if prob_log else prob_log_sum\n", 247 | "\n", 248 | "\n", 249 | "class TriGram(BiGram):\n", 250 | "\tdef __init__(self, sentences, smooth = None):\n", 251 | "\t\tBiGram.__init__(self, sentences, smooth)\n", 252 | "\t\tself.tridict = build_tridict(sentences)\n", 253 | "\n", 254 | "\tdef calc_prob(self, *args):\n", 255 | "\t\tif len(args) != 3:\n", 256 | "\t\t\traise ValueError('three words is required')\n", 257 | "\n", 258 | "\t\tprob = 0\n", 259 | "\t\tif self.smooth != None:\n", 260 | "\t\t\tprob = self.smooth(args[0], args[1], args[2], tridict=self.tridict, bidict=self.bidict, undict=self.undict)\n", 261 | "\t\telse:\n", 262 | "\t\t\tbitup = (args[0], args[1])\t\t\t\t\n", 263 | "\t\t\tif args in self.tridict and bitup in self.bidict:\n", 264 | "\t\t\t\treturn float(self.tridict[args]) / self.bidict[bitup]\n", 265 | "\t\treturn prob\n", 266 | "\n", 267 | "\tdef calc_sentence_prob(self, sentence, prob_log=True):\n", 268 | "\t\tprob_log_sum = 0\n", 269 | "\t\tprev_stack = []\n", 270 | "\t\tfor word in sentence:\n", 271 | "\t\t\tif len(prev_stack) < 2:\n", 272 | "\t\t\t\tprev_stack.append(word)\n", 273 | "\t\t\telif len(prev_stack) == 2:\n", 274 | "\t\t\t\tword_prob = self.calc_prob(prev_stack[0], prev_stack[1], word)\n", 275 | "\t\t\t\tprob_log_sum += word_prob\n", 276 | "\t\t\t\tprev_stack[0] = prev_stack[1]\n", 277 | "\t\t\t\tprev_stack[1] = word\n", 278 | "\t\treturn math.pow(2, prob_log_sum) if prob_log else prob_log_sum\n", 279 | "\n", 280 | "'''\n", 281 | "@function: calc_xxgram_count 主要用来统计语料库中词的总数\n", 282 | "@function: print_xxgram_probas 格式化输出概率 \n", 283 | "'''\n", 284 | "class GramUtil(object):\n", 285 | "\n", 286 | "\t@staticmethod\n", 287 | "\tdef calc_ungram_count(sentences):\n", 288 | "\t\tcount = 0\n", 289 | "\t\tfor sentence in sentences:\n", 290 | "\t\t\t# except START_TOKEN and END_TOKEN\n", 291 | "\t\t\tcount += len(sentence) - 2\n", 292 | "\t\treturn count\n", 293 | "\n", 294 | "\t@staticmethod\n", 295 | "\tdef calc_bigram_count(sentences):\n", 296 | "\t\tcount = 0\n", 297 | "\t\tfor sentence in sentences:\n", 298 | "\t\t\tcount += len(sentence) - 1\n", 299 | "\t\treturn count\n", 300 | "\n", 301 | "\t@staticmethod\n", 302 | "\tdef calc_trigram_count(sentences):\n", 303 | "\t\tcount = 0\n", 304 | "\t\tfor sentence in sentences:\n", 305 | "\t\t\tcount += len(sentence)\n", 306 | "\t\treturn count\n", 307 | "\n", 308 | "\t@staticmethod\n", 309 | "\tdef print_ungram_probs(model, vocabs):\n", 310 | "\t\tfor vocab in vocabs:\n", 311 | "\t\t\tif vocab != const.START_TOKEN and vocab != const.END_TOKEN:\n", 312 | "\t\t\t\tprint(\"{} \\t {}\".format(vocab if vocab != const.UNK else 'UNK', model.calc_prob(vocab)))\n", 313 | "\n", 314 | "\t@staticmethod\n", 315 | "\tdef print_bigram_probs(model, vocabs):\n", 316 | "\t\tprint(\"\\t\\t\", end=\"\")\n", 317 | "\t\tfor vocab in vocabs:\n", 318 | "\t\t\tif vocab != const.START_TOKEN:\n", 319 | "\t\t\t\tprint(vocab if vocab != const.UNK else \"UNK\", end=\"\\t\\t\")\n", 320 | "\t\tprint(\"\")\n", 321 | "\t\tfor vocab in vocabs:\n", 322 | "\t\t\tif vocab != const.END_TOKEN:\n", 323 | 
"\t\t\t\tprint(vocab if vocab != const.UNK else \"UNK\", end=\"\\t\\t\")\n", 324 | "\t\t\t\tfor vocab2 in vocabs:\n", 325 | "\t\t\t\t\tif vocab2 != const.START_TOKEN:\n", 326 | "\t\t\t\t\t\tprint(\"{0:.3f}\".format(model.calc_prob(vocab, vocab2)), end=\"\\t\\t\")\n", 327 | "\t\t\t\tprint(\"\")\n", 328 | "\n", 329 | "\t@staticmethod\n", 330 | "\tdef print_trigram_probs(model, vocabs):\n", 331 | "\t\tprint(\"\\t\\t\", end=\"\")\n", 332 | "\t\tfor vocab in vocabs:\n", 333 | "\t\t\tif vocab != const.START_TOKEN:\n", 334 | "\t\t\t\tprint(vocab if vocab != const.UNK else \"UNK\", end=\"\\t\")\n", 335 | "\t\tprint(\"\")\n", 336 | "\t\tfor vocab in vocabs:\n", 337 | "\t\t\tif vocab != const.END_TOKEN:\n", 338 | "\t\t\t\tfor vocab2 in vocabs:\n", 339 | "\t\t\t\t\tif vocab2 != const.START_TOKEN and vocab != const.UNK and vocab2 != const.UNK and vocab2 != const.END_TOKEN:\n", 340 | "\t\t\t\t\t\tprint(vocab, vocab2 if vocab2 != const.UNK else \"UNK\", end=\"\\t\\t\")\n", 341 | "\t\t\t\t\t\tfor vocab3 in vocabs:\n", 342 | "\t\t\t\t\t\t\tif vocab3 != const.END_TOKEN\n", 343 | "\t\t\t\t\t\t\t\tprint(\"{0:.3f}\".format(model.calc_prob(vocab, vocab2, vocab3)), end=\"\\t\")\n", 344 | "\t\t\t\t\t\tprint(\"\")\n" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": {}, 350 | "source": [ 351 | "### evaluate.py\n", 352 | "\n", 353 | "模型的评估,这里主要用了困惑度Perplexity" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": { 360 | "collapsed": true 361 | }, 362 | "outputs": [], 363 | "source": [ 364 | "import math\n", 365 | "\n", 366 | "# 计算困惑度\n", 367 | "def perplexity(model, sentences, cal_gram_func):\n", 368 | " # gram_count 词的总数,对应教程中的 M\n", 369 | "\tgram_count = cal_gram_func(sentences)\n", 370 | "\tprob_log_sum = 0\n", 371 | "\tfor sentence in sentences:\n", 372 | "\t\ttry:\n", 373 | "\t\t\tprob_log_sum -= math.log(model.calc_sentence_prob(sentence), 2)\n", 374 | "\t\texcept:\n", 375 | "\t\t\tprob_log_sum -= float('-inf')\n", 376 | "\t\treturn math.pow(2, prob_log_sum/gram_count)" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": {}, 382 | "source": [ 383 | "## 结果分析" 384 | ] 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "metadata": {}, 389 | "source": [ 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | "
| **#** | **smooth** | **unsmooth** |
| --- | --- | --- |
| 你好不 | 2.99167 | 3.97368 |
| 好不你 | 1.10409 | 1.21901 |
| 你是不 | 1.75263 | 2.06712 |
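For reference, the smooth column above corresponds to the discounting estimate implemented in smooth.py (default discount d = 0.5): a bigram seen in training is estimated as

$$ p_{\text{smooth}}(w_i \mid w_{i-1}) = \frac{c(w_{i-1}, w_i) - d}{c(w_{i-1})}, \qquad d = 0.5, $$

while an unseen bigram still receives probability 0. Note that the sentence scores can exceed 1 because calc_sentence_prob in ngram.py accumulates the raw conditional probabilities and then applies math.pow(2, ·) rather than summing log2-probabilities, so they are relative scores, not normalized probabilities.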
\n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | "
| **#** | **smooth** | **unsmooth** |
| --- | --- | --- |
| Perplexity | 0.91272 | 0.89138 |
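The perplexity row follows the definition used in evaluate.py, where $M$ is the token count returned by GramUtil.calc_bigram_count and $p(s_i)$ is the sentence score from calc_sentence_prob:

$$ \mathrm{PP} = 2^{-\frac{1}{M}\sum_{i}\log_2 p(s_i)} $$

Lower perplexity indicates that the model assigns higher probability to the test sentences.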
" 424 | ] 425 | }, 426 | { 427 | "cell_type": "markdown", 428 | "metadata": {}, 429 | "source": [ 430 | "## 项目后续\n", 431 | "\n", 432 | "过段时间会加入深度学习在语言模型上的应用,如果你感兴趣,可以关注我的公众号,或者star, watch 本项目哦" 433 | ] 434 | }, 435 | { 436 | "cell_type": "markdown", 437 | "metadata": {}, 438 | "source": [ 439 | "## 联系作者\n", 440 | "\n", 441 | "@author sean\n", 442 | "\n", 443 | "@qq 929325776\n", 444 | "\n", 445 | "有什么问题,可以联系我,一起讨论" 446 | ] 447 | } 448 | ], 449 | "metadata": { 450 | "kernelspec": { 451 | "display_name": "Python 3", 452 | "language": "python", 453 | "name": "python3" 454 | }, 455 | "language_info": { 456 | "codemirror_mode": { 457 | "name": "ipython", 458 | "version": 3 459 | }, 460 | "file_extension": ".py", 461 | "mimetype": "text/x-python", 462 | "name": "python", 463 | "nbconvert_exporter": "python", 464 | "pygments_lexer": "ipython3", 465 | "version": "3.6.1" 466 | } 467 | }, 468 | "nbformat": 4, 469 | "nbformat_minor": 2 470 | } 471 | -------------------------------------------------------------------------------- /language_model/src/const.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | ''' 6 | @description: 定义常量 7 | @author: Sean QQ: 929325776 8 | ''' 9 | 10 | UNK = None 11 | START_TOKEN = '' 12 | END_TOKEN = '' 13 | -------------------------------------------------------------------------------- /language_model/src/corpus/toy/test.txt: -------------------------------------------------------------------------------- 1 | 你 好 不 2 | 好 不 你 3 | 你 是 不 4 | -------------------------------------------------------------------------------- /language_model/src/corpus/toy/train.txt: -------------------------------------------------------------------------------- 1 | 你 好 2 | 你 好 吗 3 | 好 了 吗 4 | 你 好 了 5 | 不 好 了 6 | 你 不 好 7 | 你 好 不 8 | -------------------------------------------------------------------------------- /language_model/src/dataset.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | ''' 5 | @description: 加载语料,并加入起始标记 6 | @author: Sean QQ: 929325776 7 | ''' 8 | import re 9 | from processing import build_sentences 10 | 11 | def load_dataset(file_path): 12 | with open(file_path, "r") as f: 13 | return build_sentences([re.split("\s+", line.rstrip('\n')) for line in f]) 14 | -------------------------------------------------------------------------------- /language_model/src/evaluate.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | ''' 5 | @description: 模型评估 6 | @author: Sean QQ: 929325776 7 | ''' 8 | 9 | import math 10 | 11 | # 计算困惑度 12 | def perplexity(model, sentences, cal_gram_func): 13 | # gram_count 词的总数,对应教程中的 M 14 | gram_count = cal_gram_func(sentences) 15 | prob_log_sum = 0 16 | for sentence in sentences: 17 | try: 18 | prob_log_sum -= math.log(model.calc_sentence_prob(sentence), 2) 19 | except: 20 | prob_log_sum -= float('-inf') 21 | return math.pow(2, prob_log_sum/gram_count) 22 | 23 | -------------------------------------------------------------------------------- /language_model/src/main.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | from dataset import load_dataset 5 | from smooth import Smooth 6 | from ngram import * 7 | from evaluate import perplexity 8 | 9 | train_dataset = 
load_dataset('./corpus/toy/train.txt') 10 | test_dataset = load_dataset('./corpus/toy/test.txt') 11 | 12 | 13 | ###################### ungram start ###################### 14 | 15 | ''' 16 | model_unsmooth = UnGram(train_dataset) 17 | model_smooth = UnGram(train_dataset, Smooth.discounting) 18 | 19 | vocabs = model_unsmooth.sort_vocab() 20 | 21 | print("- ungram unsmooth -") 22 | GramUtil.print_ungram_probs(model_unsmooth, vocabs) 23 | 24 | print("- ungram smooth -") 25 | GramUtil.print_ungram_probs(model_smooth, vocabs) 26 | 27 | print('- sentence_prob -') 28 | print("\t\t smooth\t\t unsmooth") 29 | for sentence in test_dataset: 30 | smooth = "{0:.5f}".format(model_smooth.calc_sentence_prob(sentence)) 31 | unsmooth = "{0:.5f}".format(model_unsmooth.calc_sentence_prob(sentence)) 32 | print("".join(sentence), "\t", smooth, "\t", unsmooth) 33 | 34 | print("- test perplexity -") 35 | print("unsmooth: ", perplexity(model_smooth, test_dataset, GramUtil.calc_ungram_count)) 36 | print("smooth: ", perplexity(model_unsmooth, test_dataset, GramUtil.calc_ungram_count)) 37 | ''' 38 | ###################### ungram end ###################### 39 | 40 | 41 | ###################### bigram start ###################### 42 | 43 | model_unsmooth = BiGram(train_dataset) 44 | model_smooth = BiGram(train_dataset, Smooth.discounting) 45 | 46 | vocabs = model_unsmooth.sort_vocab() 47 | 48 | print("- bigram unsmooth -") 49 | GramUtil.print_bigram_probs(model_unsmooth, vocabs) 50 | 51 | print("- bigram smooth -") 52 | GramUtil.print_bigram_probs(model_smooth, vocabs) 53 | 54 | print('- sentence_prob -') 55 | print("\t\t smooth\t\t unsmooth") 56 | for sentence in test_dataset: 57 | smooth = "{0:.5f}".format(model_smooth.calc_sentence_prob(sentence)) 58 | unsmooth = "{0:.5f}".format(model_unsmooth.calc_sentence_prob(sentence)) 59 | print("".join(sentence), "\t", smooth, "\t", unsmooth) 60 | 61 | print("- test perplexity -") 62 | print("unsmooth: ", perplexity(model_smooth, test_dataset, GramUtil.calc_bigram_count)) 63 | print("smooth: ", perplexity(model_unsmooth, test_dataset, GramUtil.calc_bigram_count)) 64 | 65 | ###################### ungram end ###################### 66 | 67 | 68 | ###################### trigram start ###################### 69 | ''' 70 | model_unsmooth = TriGram(train_dataset) 71 | model_smooth = TriGram(train_dataset, Smooth.discounting) 72 | 73 | vocabs = model_unsmooth.sort_vocab() 74 | 75 | print("- ungram unsmooth -") 76 | GramUtil.print_trigram_probs(model_unsmooth, vocabs) 77 | 78 | print("- ungram smooth -") 79 | GramUtil.print_trigram_probs(model_smooth, vocabs) 80 | 81 | print('- sentence_prob -') 82 | print("\t\t smooth\t\t unsmooth") 83 | for sentence in test_dataset: 84 | smooth = "{0:.5f}".format(model_smooth.calc_sentence_prob(sentence)) 85 | unsmooth = "{0:.5f}".format(model_unsmooth.calc_sentence_prob(sentence)) 86 | print("".join(sentence), "\t", smooth, "\t", unsmooth) 87 | 88 | print("- test perplexity -") 89 | print("unsmooth: ", perplexity(model_smooth, test_dataset, GramUtil.calc_bigram_count)) 90 | print("smooth: ", perplexity(model_unsmooth, test_dataset, GramUtil.calc_bigram_count)) 91 | ''' 92 | ###################### ungram end ###################### 93 | -------------------------------------------------------------------------------- /language_model/src/ngram.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | ''' 5 | @description: 语言模型 6 | UniGram 7 | BiGram 8 | TriGram 9 | 
GramUtil - 工具函数 10 | @author: Sean QQ: 929325776 11 | ''' 12 | 13 | import math 14 | import const 15 | from processing import * 16 | 17 | ''' 18 | @function calc_prob 计算条件概率,这里使用最大似然估计(max-likelihood estimate)去计算概率 19 | @function calc_sentence_prob 计算句子的条件概率 20 | ''' 21 | class UnGram(object): 22 | def __init__(self, sentences, smooth = None): 23 | self.undict, self.total = build_undict(sentences) 24 | self.smooth = smooth 25 | 26 | def calc_prob(self, word): 27 | prob = 0 28 | if self.smooth != None: 29 | prob = self.smooth(word, undict=self.undict, total=self.total) 30 | else: 31 | if word in self.undict: 32 | prob = float(self.undict[word]) / self.total 33 | return prob 34 | 35 | def calc_sentence_prob(self, sentence, prob_log=True): 36 | prob_log_sum = 0 37 | for word in sentence: 38 | if word != const.START_TOKEN and word != const.END_TOKEN: 39 | word_prob = self.calc_prob(word) 40 | if word_prob != 0: 41 | prob_log_sum += math.log(word_prob, 2) 42 | return math.pow(2, prob_log_sum) if prob_log else prob_log_sum 43 | 44 | def sort_vocab(self): 45 | vocabs = list(self.undict.keys()) 46 | vocabs.remove(const.START_TOKEN) 47 | vocabs.remove(const.END_TOKEN) 48 | vocabs.sort() 49 | vocabs.append(const.UNK) 50 | vocabs.append(const.START_TOKEN) 51 | vocabs.append(const.END_TOKEN) 52 | return vocabs 53 | 54 | class BiGram(UnGram): 55 | def __init__(self, sentences, smooth = None): 56 | UnGram.__init__(self, sentences, smooth) 57 | self.bidict = build_bidict(sentences) 58 | 59 | def calc_prob(self, *args): 60 | if len(args) != 2: 61 | raise ValueError('two words is required') 62 | 63 | prob = 0 64 | if self.smooth != None: 65 | prob = self.smooth(args[0], args[1], bidict=self.bidict, undict=self.undict) 66 | else: 67 | if args in self.bidict and args[0] in self.undict: 68 | return float(self.bidict[args]) / self.undict[args[0]] 69 | return prob 70 | 71 | def calc_sentence_prob(self, sentence, prob_log=True): 72 | prob_log_sum = 0 73 | prev_word = None 74 | for word in sentence: 75 | if prev_word != None: 76 | word_prob = self.calc_prob(prev_word, word) 77 | prob_log_sum += word_prob 78 | prev_word = word 79 | return math.pow(2, prob_log_sum) if prob_log else prob_log_sum 80 | 81 | 82 | class TriGram(BiGram): 83 | def __init__(self, sentences, smooth = None): 84 | BiGram.__init__(self, sentences, smooth) 85 | self.tridict = build_tridict(sentences) 86 | 87 | def calc_prob(self, *args): 88 | if len(args) != 3: 89 | raise ValueError('three words is required') 90 | 91 | prob = 0 92 | if self.smooth != None: 93 | prob = self.smooth(args[0], args[1], args[2], tridict=self.tridict, bidict=self.bidict, undict=self.undict) 94 | else: 95 | bitup = (args[0], args[1]) 96 | if args in self.tridict and bitup in self.bidict: 97 | return float(self.tridict[args]) / self.bidict[bitup] 98 | return prob 99 | 100 | def calc_sentence_prob(self, sentence, prob_log=True): 101 | prob_log_sum = 0 102 | prev_stack = [] 103 | for word in sentence: 104 | if len(prev_stack) < 2: 105 | prev_stack.append(word) 106 | elif len(prev_stack) == 2: 107 | word_prob = self.calc_prob(prev_stack[0], prev_stack[1], word) 108 | prob_log_sum += word_prob 109 | prev_stack[0] = prev_stack[1] 110 | prev_stack[1] = word 111 | return math.pow(2, prob_log_sum) if prob_log else prob_log_sum 112 | 113 | ''' 114 | @function: calc_xxgram_count 主要用来统计语料库中词的总数 115 | @function: print_xxgram_probas 格式化输出概率 116 | ''' 117 | class GramUtil(object): 118 | 119 | @staticmethod 120 | def calc_ungram_count(sentences): 121 | count = 0 122 | for sentence in 
sentences: 123 | # excluding START_TOKEN and END_TOKEN 124 | count += len(sentence) - 2 125 | return count 126 | 127 | @staticmethod 128 | def calc_bigram_count(sentences): 129 | count = 0 130 | for sentence in sentences: 131 | count += len(sentence) - 1 132 | return count 133 | 134 | @staticmethod 135 | def calc_trigram_count(sentences): 136 | count = 0 137 | for sentence in sentences: 138 | count += len(sentence) 139 | return count 140 | 141 | @staticmethod 142 | def print_ungram_probs(model, vocabs): 143 | for vocab in vocabs: 144 | if vocab != const.START_TOKEN and vocab != const.END_TOKEN: 145 | print("{} \t {}".format(vocab if vocab != const.UNK else 'UNK', model.calc_prob(vocab))) 146 | 147 | @staticmethod 148 | def print_bigram_probs(model, vocabs): 149 | print("\t\t", end="") 150 | for vocab in vocabs: 151 | if vocab != const.START_TOKEN: 152 | print(vocab if vocab != const.UNK else "UNK", end="\t\t") 153 | print("") 154 | for vocab in vocabs: 155 | if vocab != const.END_TOKEN: 156 | print(vocab if vocab != const.UNK else "UNK", end="\t\t") 157 | for vocab2 in vocabs: 158 | if vocab2 != const.START_TOKEN: 159 | print("{0:.3f}".format(model.calc_prob(vocab, vocab2)), end="\t\t") 160 | print("") 161 | 162 | @staticmethod 163 | def print_trigram_probs(model, vocabs): 164 | print("\t\t", end="") 165 | for vocab in vocabs: 166 | if vocab != const.START_TOKEN: 167 | print(vocab if vocab != const.UNK else "UNK", end="\t") 168 | print("") 169 | for vocab in vocabs: 170 | if vocab != const.END_TOKEN: 171 | for vocab2 in vocabs: 172 | if vocab2 != const.START_TOKEN and vocab != const.UNK and vocab2 != const.UNK and vocab2 != const.END_TOKEN: 173 | print(vocab, vocab2 if vocab2 != const.UNK else "UNK", end="\t\t") 174 | for vocab3 in vocabs: 175 | if vocab3 != const.END_TOKEN: 176 | print("{0:.3f}".format(model.calc_prob(vocab, vocab2, vocab3)), end="\t") 177 | print("") 178 | -------------------------------------------------------------------------------- /language_model/src/processing.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | ''' 5 | @description: sentence preprocessing and dictionary construction 6 | @author: Sean QQ: 929325776 7 | ''' 8 | 9 | import const 10 | 11 | # add start and end markers 12 | def build_sentences(sentences): 13 | out = [] 14 | for sentence in sentences: 15 | words = [x.lower() for x in sentence] 16 | words.insert(0, const.START_TOKEN) 17 | words.append(const.END_TOKEN) 18 | out.append(words) 19 | return out 20 | 21 | # build the unigram frequency dictionary 22 | def build_undict(sentences): 23 | undict = {} 24 | total = 0 25 | for words in sentences: 26 | for word in words: 27 | if word not in undict: 28 | undict[word] = 1 29 | else: 30 | undict[word] += 1 31 | if word != const.START_TOKEN and word != const.END_TOKEN: 32 | total += 1 33 | return undict, total 34 | 35 | 36 | # build the bigram frequency dictionary, keyed by the pair (u, v) 37 | def build_bidict(sentences): 38 | bidict = {} 39 | for words in sentences: 40 | for i in range(len(words)-1): 41 | tup = (words[i], words[i+1]) 42 | if tup not in bidict: 43 | bidict[tup] = 1 44 | else: 45 | bidict[tup] += 1 46 | return bidict 47 | 48 | # build the trigram frequency dictionary, keyed by the triple (u, v, w) 49 | def build_tridict(sentences): 50 | tridict = {} 51 | # sentences already carry the START_TOKEN / END_TOKEN added by build_sentences 52 | for words in sentences: 53 | for i in range(len(words) -2): 54 | tup = (words[i], words[i+1], words[i+2]) 55 | if tup not in tridict: 56 | tridict[tup] = 1 57 | else: 58 | tridict[tup] += 1 59 | return tridict 60 | 
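The UnGram / BiGram / TriGram classes and the dictionary builders above are meant to be used together. A rough usage sketch (not a repository file): it assumes that language_model/src/const.py, which is not shown here, defines START_TOKEN, END_TOKEN and UNK, and it reuses Smooth.discounting from smooth.py shown next.

from ngram import BiGram
from processing import build_sentences
from smooth import Smooth

# toy corpus; build_sentences lowercases tokens and adds START_TOKEN / END_TOKEN
sentences = build_sentences([['I', 'like', 'NLP'], ['I', 'like', 'deep', 'learning']])
model = BiGram(sentences, smooth=Smooth.discounting)
print(model.calc_prob('i', 'like'))    # discounted estimate of P(like | i)
test = build_sentences([['I', 'like', 'deep', 'NLP']])[0]
print(model.calc_sentence_prob(test))  # sentence probability; zero-probability bigrams are skipped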
-------------------------------------------------------------------------------- /language_model/src/smooth.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | ''' 5 | @description: smoothing estimators 6 | @author: Sean QQ: 929325776 7 | ''' 8 | 9 | class Smooth(object): 10 | @staticmethod 11 | def discounting(*args, **kwargs): 12 | discount_value = 0.5 13 | if 'discount_value' in kwargs: 14 | discount_value = kwargs['discount_value'] 15 | if len(args) == 1: 16 | if 'undict' not in kwargs: 17 | raise ValueError('undict is required') 18 | if 'total' not in kwargs: 19 | raise ValueError('total (word count of the corpus) is required') 20 | undict = kwargs['undict'] 21 | total = kwargs['total'] 22 | word = args[0] 23 | if word in undict: 24 | return float(undict[word] - discount_value) / total 25 | if len(args) == 2: 26 | if 'bidict' not in kwargs or 'undict' not in kwargs: 27 | raise ValueError('bidict and undict are required') 28 | bidict = kwargs['bidict'] 29 | undict = kwargs['undict'] 30 | if args in bidict and args[0] in undict: 31 | return float(bidict[args] - discount_value) / undict[args[0]] 32 | else: 33 | return 0 34 | elif len(args) == 3: 35 | if 'tridict' not in kwargs or 'bidict' not in kwargs: 36 | raise ValueError('tridict and bidict are required') 37 | tridict = kwargs['tridict'] 38 | bidict = kwargs['bidict'] 39 | bitup = (args[0], args[1]) 40 | if args in tridict and bitup in bidict: 41 | return float(tridict[args] - discount_value) / bidict[bitup] 42 | else: 43 | return 0 44 | else: 45 | return 0 46 | -------------------------------------------------------------------------------- /lsa/lsa.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import jieba 5 | 6 | class LSA(object): 7 | def __init__(self, docs, kernel=None): 8 | super(LSA, self).__init__() 9 | self.kernel = kernel 10 | self.docs = [] 11 | self.vocabs = set() 12 | self.build_vocab(docs) 13 | 14 | def tokenizer(self, sent): 15 | return jieba.lcut(sent) 16 | 17 | def build_vocab(self, docs): 18 | for doc in docs: 19 | doc = doc.strip() 20 | # for simplicity, keep only tokens longer than one character 21 | words = list(filter(lambda x: len(x) > 1, self.tokenizer(doc))) 22 | self.docs.append(words) 23 | self.vocabs.update(words) 24 | 25 | self.vocabs = list(self.vocabs) 26 | self.word2idx = dict(zip(self.vocabs, range(len(self.vocabs)))) 27 | 28 | def build_bow_matrix(self): 29 | matrix = np.zeros([len(self.vocabs), len(self.docs)]) 30 | for docidx, words in enumerate(self.docs): 31 | for word in words: 32 | matrix[self.word2idx[word], docidx] += 1 33 | return matrix 34 | 35 | def build_tfidf_matrix(self): 36 | tf = self.build_bow_matrix() 37 | print(tf) 38 | df = np.ones([len(self.vocabs), 1]) 39 | 40 | for docidx, words in enumerate(self.docs): 41 | tf[:, docidx] /= np.max(tf[:, docidx]) 42 | for word in words: 43 | df[self.word2idx[word], 0] += 1 44 | idf = np.log(len(self.docs)) - np.log(df) 45 | 46 | return tf*idf 47 | 48 | def sim_words(self, k=3): 49 | if self.kernel == 'tfidf': 50 | matrix = self.build_tfidf_matrix() 51 | else: 52 | matrix = self.build_bow_matrix() 53 | 54 | U, S, Vt = np.linalg.svd(matrix) 55 | 56 | sort_idx = np.argsort(-U) 57 | # usually skip the first column, which tends to map back to the word itself 58 | topk = sort_idx[:, 1:k+1] 59 | print("word \t similarity") 60 | for widx, word in enumerate(self.vocabs): 61 | line = word + ":\t" 62 | idxs = topk[widx] 63 | for idx in idxs: 64 | line += 
str(self.vocabs[idx]) + " " 65 | print(line) 66 | 67 | def topic_relate(self, k=2): 68 | if self.kernel == 'tfidf': 69 | matrix = self.build_tfidf_matrix() 70 | else: 71 | matrix = self.build_bow_matrix() 72 | 73 | U, S, Vt = np.linalg.svd(matrix) 74 | 75 | sort_idx = np.argsort(-Vt, axis=1) 76 | # 一般不取第一行,第一行是自己本身 77 | topk = sort_idx[1:k+1, :] 78 | print(topk) 79 | 80 | if __name__ == '__main__': 81 | doc1 = """计算机科学是系统性研究信息与计算的理论基础以及它们在计算机系统中如何实现与应用的实用技术的学科""" 82 | 83 | doc2 = """自然语言处理是人工智能和语言学领域的分支学科。此领域探讨如何处理及运用自然语言;自然语言认知则是指让电脑“懂”人类的语言。 84 | 自然语言生成系统把计算机数据转化为自然语言。自然语言理解系统把自然语言转化为计算机程序更易于处理的形式。""" 85 | 86 | doc3 = """人工智能是计算机科学的一个分支,它企图了解智能的实质,并生产出一种新的能以人类智能相似的方式做出反应的智能机器, 87 | 该领域的研究包括机器人、语言识别、图像识别、自然语言处理和专家系统等""" 88 | 89 | doc4 = """《瓦尔登湖》是美国作家梭罗独居瓦尔登湖畔的记录,描绘了他两年多时间里的所见、所闻和所思。 90 | 该书崇尚简朴生活,热爱大自然的风光,内容丰厚,意义深远,语言生动""" 91 | 92 | docs = [doc1, doc2, doc3, doc4] 93 | 94 | lsa = LSA(docs, kernel=None) 95 | lsa.sim_words() 96 | lsa.topic_relate() -------------------------------------------------------------------------------- /nbayes/nbayes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import math 4 | 5 | from collections import defaultdict 6 | 7 | class NBayes(object): 8 | def __init__(self, trainSet): 9 | self.data = trainSet 10 | self.tags = defaultdict(int) 11 | self.tagwords = defaultdict(int) 12 | self.total = 0 13 | 14 | def _tokenizer(self, sent): 15 | return list(sent) 16 | 17 | def train(self): 18 | for tag, doc in self.data: 19 | words = self._tokenizer(doc) 20 | for word in words: 21 | self.tags[tag] += 1 22 | self.tagwords[(tag, word)] += 1 23 | self.total += 1 24 | 25 | def predict(self, inp): 26 | words = self._tokenizer(inp) 27 | 28 | tmp = {} 29 | for tag in self.tags.keys(): 30 | tmp[tag] = math.log(self.tags[tag]) - math.log(self.total) 31 | for word in words: 32 | tmp[tag] += math.log(self.tagwords.get((tag, word), 1.0)) - math.log(self.tags[tag]) 33 | ret, score = 0, 0.0 34 | for t in self.tags.keys(): 35 | cnt = 0.0 36 | for tt in self.tags.keys(): 37 | cnt += math.exp(tmp[tt] - tmp[t]) 38 | cnt = 1.0 / cnt 39 | if cnt > score: 40 | ret, score = t, cnt 41 | return ret, score 42 | 43 | 44 | 45 | if __name__ == '__main__': 46 | trainSet = [("pos", "good job !"), 47 | ("pos", "表现不错哦"), 48 | ("pos", "厉害咯"), 49 | ("pos", "做的很好啊"), 50 | ("pos", "做得不错继续努力"), 51 | ("pos", "不错!点赞"), 52 | ("neg", "太差了"), 53 | ("neg", "太糟糕了"), 54 | ("neg", "你做的一点都不好"), 55 | ("neg", "so bad"), 56 | ("non", "一般般吧,还过的去"), 57 | ("non", "不算太好,也不算太差"), 58 | ("non", "继续努力吧") 59 | ] 60 | clf = NBayes(trainSet) 61 | clf.train() 62 | print(clf.predict("不错哦")) 63 | -------------------------------------------------------------------------------- /nbayes/tfidf_nbayes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import jieba 4 | import numpy as np 5 | from collections import defaultdict 6 | 7 | class Corpus(object): 8 | def __init__(self): 9 | self.word2idx = {} 10 | self.tags = defaultdict(int) 11 | self.docs = [] 12 | self.total = 0 13 | 14 | # 分词器 15 | def tokenizer(self, sent): 16 | return jieba.lcut(sent) 17 | 18 | # 构建字典,获取分类标记集 19 | def process_data(self, docs): 20 | vocabs = set() 21 | for tag, doc in docs: 22 | words = self.tokenizer(doc) 23 | if len(words) == 0: 24 | continue 25 | self.tags[tag] += 1 26 | self.total += 1 27 | self.docs.append((tag, words)) 28 | vocabs.update(words) 29 | vocabs = list(vocabs) 30 | self.word2idx = dict(zip(vocabs, 
range(len(vocabs)))) 31 | 32 | # build the bag-of-words matrix 33 | def calc_bow(self): 34 | bow = np.zeros([self.total, len(self.word2idx)]) 35 | 36 | for docidx, (tag, doc) in enumerate(self.docs): 37 | for word in doc: 38 | bow[docidx, self.word2idx[word]] += 1 39 | return bow 40 | 41 | # compute the tf-idf matrix 42 | def calc_tfidf(self): 43 | tf = self.calc_bow() 44 | df = np.ones([1, len(self.word2idx)]) 45 | 46 | for docidx, (tag, doc) in enumerate(self.docs): 47 | tf[docidx] /= np.max(tf[docidx]) 48 | for word in doc: 49 | df[0, self.word2idx[word]] += 1 50 | idf = np.log(float(self.total)) - np.log(df) 51 | return np.multiply(tf, idf) 52 | 53 | # build the count vector for the input tokens 54 | def get_vec(self, words): 55 | vec = np.zeros([1, len(self.word2idx)]) 56 | for word in words: 57 | if word in self.word2idx: 58 | vec[0, self.word2idx[word]] += 1 59 | return vec 60 | 61 | class NBayes(Corpus): 62 | def __init__(self, docs, kernel='tfidf'): 63 | super(NBayes, self).__init__() 64 | self.kernel = kernel 65 | self.process_data(docs) 66 | self.y_prob = {} 67 | self.c_prob = None 68 | 69 | def train(self): 70 | if self.kernel == 'tfidf': 71 | self.feature = self.calc_tfidf() 72 | else: 73 | self.feature = self.calc_bow() 74 | 75 | # estimate p(y) by maximum likelihood 76 | for tag in self.tags: 77 | self.y_prob[tag] = float(self.tags[tag]) / self.total 78 | 79 | # estimate the conditional probability p(x|y_i) 80 | self.c_prob = np.zeros([len(self.tags), len(self.word2idx)]) 81 | Z = np.zeros([len(self.tags), 1]) 82 | for docidx in range(len(self.docs)): 83 | # get the class label id 84 | tid = list(self.tags.keys()).index(self.docs[docidx][0]) 85 | self.c_prob[tid] += self.feature[docidx] 86 | Z[tid] = np.sum(self.c_prob[tid]) 87 | self.c_prob /= Z # normalize 88 | 89 | def predict(self, sent): 90 | words = self.tokenizer(sent) 91 | vec = self.get_vec(words) 92 | ret, max_score = None, -1.0 93 | for y, pc in zip(self.y_prob, self.c_prob): 94 | score = np.sum(vec * pc * self.y_prob[y]) # p(x1....xn|yi)p(yi) 95 | if score > max_score: 96 | max_score = score 97 | ret = y 98 | return ret, 1 - max_score 99 | 100 | if __name__ == '__main__': 101 | trainSet = [("pos", "good job !"), 102 | ("pos", "表现不错哦"), 103 | ("pos", "厉害咯"), 104 | ("pos", "做的很好啊"), 105 | ("pos", "做得不错继续努力"), 106 | ("pos", "不错!点赞"), 107 | ("neg", "太差了"), 108 | ("neg", "太糟糕了"), 109 | ("neg", "你做的一点都不好"), 110 | ("neg", "不行,重做"), 111 | ("neg", "so bad"), 112 | ("non", "一般般吧,还过的去"), 113 | ("non", "不算太好,也不算太差"), 114 | ("non", "继续努力吧") 115 | ] 116 | 117 | nb = NBayes(trainSet) 118 | nb.train() 119 | print(nb.predict("不错哦")) # ('pos', 0.9286) -------------------------------------------------------------------------------- /pca/pca.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import numpy as np 5 | 6 | def PCA(x, n_components=2): 7 | # 1. center each feature (each row) by subtracting its mean 8 | mean_val = np.mean(x, axis=1) 9 | mean_x = x - mean_val 10 | 11 | # 2. covariance matrix of mean_x 12 | C_x = np.cov(mean_x, rowvar=True) 13 | 14 | # 3. eigenvalues and eigenvectors of C_x 15 | eig_vals, eig_vects = np.linalg.eig(np.mat(C_x)) 16 | 17 | # 4. sort the eigenvalues in descending order 18 | sorted_idx = np.argsort(-eig_vals) 19 | 20 | # 5. keep the top n_components eigenvectors 21 | topn_index = sorted_idx[:n_components] 22 | topn_vects = eig_vects[topn_index, :] 23 | 24 | # 6. 
投影到低维空间 25 | pca_x = topn_vects * x 26 | return pca_x 27 | 28 | if __name__ == '__main__': 29 | x = np.mat([[-1, -1, 0, 2, 0], 30 | [-2, 0, 0, 1, 1]]) 31 | x_ = PCA(x, n_components=1) 32 | print(x_) -------------------------------------------------------------------------------- /pcfg/README.md: -------------------------------------------------------------------------------- 1 | # PCFG 2 | 3 | PCFG, Probabilistic Context-Free Grammars 4 | 5 | 使用了CKY算法实现了CNF(Chomsky Normal Form)下的文法解析 6 | -------------------------------------------------------------------------------- /pcfg/pcfg_tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# python 实现PCFG\n", 8 | "\n", 9 | "本例子主要受 Michael Collins 教授的 Probabilistic Context-Free Grammars (PCFGs) 启发而编写,为了帮助大家理解,我在我的博客、公众号上发表了文章[一文读懂NLP中的PCFG(公众号)](https://mp.weixin.qq.com/s?__biz=MzIwNDM1NjUzMA==&mid=2247483666&idx=1&sn=708dcbce5be808b3be273838db298da7&chksm=96c02fcfa1b7a6d99a69c35e0de413488d4da4dc13c4ab3d21c8a415c8f2310c141676a068e0#rd),欢迎大家阅读。当然强烈推荐Michael Collins 教授的 [Probabilistic Context-Free Grammars (PCFGs)](http://www.cs.columbia.edu/~mcollins/courses/nlp2011/notes/pcfgs.pdf)\n", 10 | "\n", 11 | "pcfg 常用于生成文法解析树,再这里使用CKY算法对CNF(Chomsky Normal Form)的文法进行解析" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## 目录\n", 19 | "\n", 20 | "1. [项目结构](#项目结构)\n", 21 | "2. [环境要求](#环境要求)\n", 22 | "3. [代码分析](#代码分析)\n", 23 | "4. [项目后续](#项目后续)\n", 24 | "5. [联系作者](#联系作者)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## 项目结构\n", 32 | "\n", 33 | "| - src\n", 34 | "\n", 35 | " | - corpus 语料库\n", 36 | "\n", 37 | " | - pcfg.py \n", 38 | "\n", 39 | " | - main.py 例子程序" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "## 环境要求\n", 47 | "\n", 48 | " python3" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "## 代码分析" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "### pcfg.py\n" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 1, 68 | "metadata": { 69 | "collapsed": true 70 | }, 71 | "outputs": [], 72 | "source": [ 73 | "# !/usr/bin/env python3\n", 74 | "# -*- coding: utf-8 -*-\n", 75 | "\n", 76 | "# -------------------------------------------#\n", 77 | "# PCFG Parser\t #\n", 78 | "# author: sean lee #\n", 79 | "# qq: 929325776\t\t\t\t\t\t\t #\n", 80 | "# email: lxm_0828@163.com #\n", 81 | "#--------------------------------------------#\n", 82 | "\n", 83 | "from collections import defaultdict\n", 84 | "\n", 85 | "class PCFG(object):\n", 86 | "\n", 87 | "\t# N_dict - count nonterminal\n", 88 | "\t# NR_dict - count relation X->Y1 Y2 (X Y1 Y2 are nonterminal)\n", 89 | "\t# TR_dict - count relation X->y (X is nonterminal y is terminal)\n", 90 | "\tdef __init__(self):\n", 91 | "\t\tself.N_dict = defaultdict(int)\n", 92 | "\t\tself.NR_dict = defaultdict(int)\n", 93 | "\t\tself.TR_dict = defaultdict(int)\n", 94 | "\n", 95 | "\tdef fit(self, train_corpus):\n", 96 | "\t\twith open(train_corpus, 'r') as f:\n", 97 | "\t\t\tfor line in f:\n", 98 | "\t\t\t\tarr = line.strip().split('->')\n", 99 | "\t\t\t\tself.N_dict[arr[0]] += 1;\n", 100 | "\t\t\t\tif ' ' in arr[1].strip():\n", 101 | "\t\t\t\t\tarr2 = arr[1].split()\n", 102 | "\t\t\t\t\tif len(arr2) > 2:\n", 103 | "\t\t\t\t\t\tcontinue\n", 
104 | "\t\t\t\t\tself.N_dict[arr2[0]] += 1\n", 105 | "\t\t\t\t\tself.N_dict[arr2[1]] += 1\n", 106 | "\t\t\t\t\tself.NR_dict[(arr[0], arr2[0], arr2[1])] += 1\n", 107 | "\t\t\t\telse:\n", 108 | "\t\t\t\t\tself.TR_dict[(arr[0], arr[1])] += 1\n", 109 | "\t# q(X->Y Z)\n", 110 | "\tdef calc_NR_proba(self, x, y1, y2):\n", 111 | "\t\treturn float(self.NR_dict[(x, y1, y2)]) / self.N_dict[x]\n", 112 | "\n", 113 | "\t# q(X->y)\n", 114 | "\tdef calc_TR_proba(self, x, y):\n", 115 | "\t\treturn float(self.TR_dict[(x, y)]) / self.N_dict[x]\n", 116 | "\n", 117 | "\t# Return parse tree\n", 118 | "\tdef parse(self, sentence):\n", 119 | "\t\timport json\n", 120 | "\t\tprint(json.dumps(self.CKY(sentence.split())))\n", 121 | "\n", 122 | "\t# CKY algorithm \n", 123 | "\t# 适用于CNF (Chomsky normal form)\n", 124 | "\tdef CKY(self, sentence):\n", 125 | "\t\tn = len(sentence)\n", 126 | "\t\tpi = defaultdict(float) \n", 127 | "\t\tbp = {}\t# backpointer\n", 128 | "\t\tN = self.N_dict.keys()\n", 129 | "\n", 130 | "\t\tfor i in range(n):\n", 131 | "\t\t\tword = sentence[i]\n", 132 | "\t\t\tfor X in N:\n", 133 | "\t\t\t\tpi[(i, i, X)] = self.calc_TR_proba(X, word)\n", 134 | "\n", 135 | "\t\tfor i in range(1, n):\n", 136 | "\t\t\tfor j in range(n-1):\n", 137 | "\t\t\t\tk = i + j\n", 138 | "\t\t\t\tfor X in N:\n", 139 | "\t\t\t\t\tmax_score = 0\n", 140 | "\t\t\t\t\targmax = None\n", 141 | "\t\t\t\t\tfor R in self.NR_dict.keys():\n", 142 | "\t\t\t\t\t\tif R[0] == X: # start from X\n", 143 | "\t\t\t\t\t\t\tY, Z = R[1:]\n", 144 | "\t\t\t\t\t\t\tfor s in range(j, k):\n", 145 | "\t\t\t\t\t\t\t\tif pi[(j, s, Y)] and pi[s+1, k, Z]:\n", 146 | "\t\t\t\t\t\t\t\t\tscore = self.calc_NR_proba(X, Y, Z) * pi[(j, s, Y)] * pi[s+1, k, Z]\n", 147 | "\t\t\t\t\t\t\t\t\tif max_score < score:\n", 148 | "\t\t\t\t\t\t\t\t\t\tmax_score = score\n", 149 | "\t\t\t\t\t\t\t\t\t\targmax = Y, Z, s\n", 150 | "\t\t\t\t\tif max_score:\n", 151 | "\t\t\t\t\t\tpi[j, k, X] = max_score\n", 152 | "\t\t\t\t\t\tbp[j, k, X] = argmax\n", 153 | "\n", 154 | "\t\t# return\n", 155 | "\t\tif pi[(0, n-1, 'S')]:\n", 156 | "\t\t\treturn self.recover(sentence, bp, 0, n-1, 'S')\n", 157 | "\t\telse:\n", 158 | "\t\t\tmax_score = 0\n", 159 | "\t\t\targmax = 0, 0, 'S'\n", 160 | "\t\t\tfor X in N:\n", 161 | "\t\t\t\tif max_score < pi[(0, n-1, X)]:\n", 162 | "\t\t\t\t\tmax_score = pi[(0, n-1, X)]\n", 163 | "\t\t\t\t\targmax = 0, n-1, X\n", 164 | "\t\t\treturn self.recover(sentence, bp, *argmax)\n", 165 | "\n", 166 | "\t# Return the list of the parsed tree with back pointers.\n", 167 | "\tdef recover(self, sentence, bp, i, j, X):\n", 168 | "\t\tif i == j:\n", 169 | "\t\t\treturn [X, sentence[i]]\n", 170 | "\t\telse:\n", 171 | "\t\t\tY, Z, s = bp[i, j, X]\n", 172 | "\t\t\treturn [X, self.recover(sentence, bp, i, s, Y), self.recover(sentence, bp, s+1, j, Z)]" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "### main.py" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 7, 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "name": "stdout", 189 | "output_type": "stream", 190 | "text": [ 191 | "sentence: the man saw the dog\n", 192 | "parse tree\n", 193 | "[\"S\", [\"NP\", [\"DT\", \"the\"], [\"NN\", \"man\"]], [\"VP\", [\"Vt\", \"saw\"], [\"NP\", [\"DT\", \"the\"], [\"NN\", \"dog\"]]]]\n" 194 | ] 195 | } 196 | ], 197 | "source": [ 198 | "# !/usr/bin/env python3\n", 199 | "# -*- coding: utf-8 -*-\n", 200 | "\n", 201 | "# -------------------------------------------#\n", 202 | "# main.py \t #\n", 203 | "# 
author: sean lee #\n", 204 | "# qq: 929325776\t\t\t\t\t\t\t #\n", 205 | "# email: lxm_0828@163.com #\n", 206 | "#--------------------------------------------#\n", 207 | "\n", 208 | "parser = PCFG()\n", 209 | "parser.fit('./corpus/toy/train.txt')\n", 210 | "\n", 211 | "'''\n", 212 | "print(parser.N_dict)\n", 213 | "print(parser.NR_dict)\n", 214 | "print(parser.TR_dict)\n", 215 | "'''\n", 216 | "\n", 217 | "sentence = \"the man saw the dog\"\n", 218 | "print(\"sentence:\", sentence)\n", 219 | "print(\"parse tree\")\n", 220 | "parser.parse(sentence)" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "## 项目后续\n", 228 | "\n", 229 | "过段时间会加入深度学习在NLP上的应用,如果你感兴趣,可以关注我的公众号,或者star, watch 本项目哦" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "## 联系作者\n", 237 | "\n", 238 | "@author sean\n", 239 | "\n", 240 | "@qq 929325776\n", 241 | "\n", 242 | "有什么问题,可以联系我,一起讨论" 243 | ] 244 | } 245 | ], 246 | "metadata": { 247 | "kernelspec": { 248 | "display_name": "Python 3", 249 | "language": "python", 250 | "name": "python3" 251 | }, 252 | "language_info": { 253 | "codemirror_mode": { 254 | "name": "ipython", 255 | "version": 3 256 | }, 257 | "file_extension": ".py", 258 | "mimetype": "text/x-python", 259 | "name": "python", 260 | "nbconvert_exporter": "python", 261 | "pygments_lexer": "ipython3", 262 | "version": "3.6.1" 263 | } 264 | }, 265 | "nbformat": 4, 266 | "nbformat_minor": 2 267 | } 268 | -------------------------------------------------------------------------------- /pcfg/src/corpus/toy/train.txt: -------------------------------------------------------------------------------- 1 | S->NP VP 2 | VP->Vt NP 3 | VP->VP PP 4 | NP->DT NN 5 | NP->NP PP 6 | PP->IN NP 7 | Vi->sleeps 8 | Vt->saw 9 | NN->man 10 | NN->woman 11 | NN->telescope 12 | NN->dog 13 | DT->the 14 | IN->with 15 | IN->in 16 | -------------------------------------------------------------------------------- /pcfg/src/main.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # main.py # 6 | # author: sean lee # 7 | # qq: 929325776 # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | from pcfg import PCFG 12 | 13 | parser = PCFG() 14 | parser.fit('./corpus/toy/train.txt') 15 | parser.parse("the man saw the dog") 16 | ''' 17 | print(parser.N_dict) 18 | print(parser.NR_dict) 19 | print(parser.TR_dict) 20 | ''' -------------------------------------------------------------------------------- /pcfg/src/pcfg.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # PCFG Parser # 6 | # author: sean lee # 7 | # qq: 929325776 # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | from collections import defaultdict 12 | 13 | class PCFG(object): 14 | 15 | # N_dict - count nonterminal 16 | # NR_dict - count relation X->Y1 Y2 (X Y1 Y2 are nonterminal) 17 | # TR_dict - count relation X->y (X is nonterminal y is terminal) 18 | def __init__(self): 19 | self.N_dict = defaultdict(int) 20 | self.NR_dict = defaultdict(int) 21 | self.TR_dict = defaultdict(int) 22 | 23 | def fit(self, train_corpus): 24 | with open(train_corpus, 'r') as f: 25 | for line in f: 26 | arr = line.strip().split('->') 27 
| self.N_dict[arr[0]] += 1; 28 | if ' ' in arr[1].strip(): 29 | arr2 = arr[1].split() 30 | if len(arr2) > 2: 31 | continue 32 | self.N_dict[arr2[0]] += 1 33 | self.N_dict[arr2[1]] += 1 34 | self.NR_dict[(arr[0], arr2[0], arr2[1])] += 1 35 | else: 36 | self.TR_dict[(arr[0], arr[1])] += 1 37 | # q(X->Y Z) 38 | def calc_NR_proba(self, x, y1, y2): 39 | return float(self.NR_dict[(x, y1, y2)]) / self.N_dict[x] 40 | 41 | # q(X->y) 42 | def calc_TR_proba(self, x, y): 43 | return float(self.TR_dict[(x, y)]) / self.N_dict[x] 44 | 45 | # Return parse tree 46 | def parse(self, sentence): 47 | import json 48 | print(json.dumps(self.CKY(sentence.split()))) 49 | 50 | # CKY algorithm 51 | # 适用于CNF (Chomsky normal form) 52 | def CKY(self, sentence): 53 | n = len(sentence) 54 | pi = defaultdict(float) 55 | bp = {} # backpointer 56 | N = self.N_dict.keys() 57 | 58 | for i in range(n): 59 | word = sentence[i] 60 | for X in N: 61 | pi[(i, i, X)] = self.calc_TR_proba(X, word) 62 | 63 | for i in range(1, n): 64 | for j in range(n-1): 65 | k = i + j 66 | for X in N: 67 | max_score = 0 68 | argmax = None 69 | for R in self.NR_dict.keys(): 70 | if R[0] == X: # start from X 71 | Y, Z = R[1:] 72 | for s in range(j, k): 73 | if pi[(j, s, Y)] and pi[s+1, k, Z]: 74 | score = self.calc_NR_proba(X, Y, Z) * pi[(j, s, Y)] * pi[s+1, k, Z] 75 | if max_score < score: 76 | max_score = score 77 | argmax = Y, Z, s 78 | if max_score: 79 | pi[j, k, X] = max_score 80 | bp[j, k, X] = argmax 81 | 82 | # return 83 | if pi[(0, n-1, 'S')]: 84 | return self.recover(sentence, bp, 0, n-1, 'S') 85 | else: 86 | max_score = 0 87 | argmax = 0, 0, 'S' 88 | for X in N: 89 | if max_score < pi[(0, n-1, X)]: 90 | max_score = pi[(0, n-1, X)] 91 | argmax = 0, n-1, X 92 | return self.recover(sentence, bp, *argmax) 93 | 94 | # Return the list of the parsed tree with back pointers. 
95 | def recover(self, sentence, bp, i, j, X): 96 | if i == j: 97 | return [X, sentence[i]] 98 | else: 99 | Y, Z, s = bp[i, j, X] 100 | return [X, self.recover(sentence, bp, i, s, Y), self.recover(sentence, bp, s+1, j, Z)] -------------------------------------------------------------------------------- /reading_comprehension/README.md: -------------------------------------------------------------------------------- 1 | ### Reading Comprehension 2 | 3 | Reading comprehension is a hot topic in NLP, and a hard one. 4 | 5 | To help readers better understand reading comprehension, this project implements a simple baseline (in PyTorch). 6 | 7 | ### baseline 8 | ![baseline](./corpus/reading_comprehension.png) 9 | 10 | ### dataset 11 | 12 | We use the [bAbI dataset from Facebook](https://research.fb.com/downloads/babi/); since the full corpus is fairly large, only the qa5-\* data under en-10k is used here. 13 | 14 | ### Result 15 | ![baseline](./corpus/result.png) 16 | 17 | ### Recommend 18 | Recommended reading: my WeChat article [DeepNLP之阅读理解](https://mp.weixin.qq.com/s?__biz=MzIwNDM1NjUzMA==&mid=2247483674&idx=1&sn=8b7e470b8a8222b057d715d3ec48dd74&chksm=96c02fc7a1b7a6d1688351b4c2bc393ffdcd5d1686344f5a6500613b4e44f08bb696d830f45c#rd) 19 | 20 | ### Reference 21 | 22 | * [Dynamic-memory-networks-plus-Pytorch](https://github.com/dandelin/Dynamic-memory-networks-plus-Pytorch) 23 | -------------------------------------------------------------------------------- /reading_comprehension/const.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # const.python # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import torch 11 | 12 | # tokens 13 | unk = '<unk>' 14 | pad = '<pad>' 15 | sos = '<sos>' 16 | eos = '<eos>' 17 | 18 | # network 19 | lr_rate = 0.001 20 | batch_size = 16 21 | hidden_size = 128 22 | epochs = 10 23 | task_id = 5 # matches the task id under bAbI/en-10k 24 | 25 | use_cuda = torch.cuda.is_available() 26 | -------------------------------------------------------------------------------- /reading_comprehension/corpus/bAbI/LICENSE.txt: -------------------------------------------------------------------------------- 1 | CC License 2 | 3 | bAbI tasks data 4 | 5 | Copyright (c) 2015-present, Facebook, Inc. All rights reserved. 6 | 7 | Creative Commons Legal Code 8 | 9 | Attribution 3.0 Unported 10 | 11 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 12 | LEGAL SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN 13 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 14 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 15 | REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR 16 | DAMAGES RESULTING FROM ITS USE. 17 | 18 | License 19 | 20 | THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE 21 | COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY 22 | COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS 23 | AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED. 24 | 25 | BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE 26 | TO BE BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY 27 | BE CONSIDERED TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS 28 | CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND 29 | CONDITIONS. 30 | 31 | 1. Definitions 32 | 33 | a. 
"Adaptation" means a work based upon the Work, or upon the Work and 34 | other pre-existing works, such as a translation, adaptation, 35 | derivative work, arrangement of music or other alterations of a 36 | literary or artistic work, or phonogram or performance and includes 37 | cinematographic adaptations or any other form in which the Work may be 38 | recast, transformed, or adapted including in any form recognizably 39 | derived from the original, except that a work that constitutes a 40 | Collection will not be considered an Adaptation for the purpose of 41 | this License. For the avoidance of doubt, where the Work is a musical 42 | work, performance or phonogram, the synchronization of the Work in 43 | timed-relation with a moving image ("synching") will be considered an 44 | Adaptation for the purpose of this License. 45 | b. "Collection" means a collection of literary or artistic works, such as 46 | encyclopedias and anthologies, or performances, phonograms or 47 | broadcasts, or other works or subject matter other than works listed 48 | in Section 1(f) below, which, by reason of the selection and 49 | arrangement of their contents, constitute intellectual creations, in 50 | which the Work is included in its entirety in unmodified form along 51 | with one or more other contributions, each constituting separate and 52 | independent works in themselves, which together are assembled into a 53 | collective whole. A work that constitutes a Collection will not be 54 | considered an Adaptation (as defined above) for the purposes of this 55 | License. 56 | c. "Distribute" means to make available to the public the original and 57 | copies of the Work or Adaptation, as appropriate, through sale or 58 | other transfer of ownership. 59 | d. "Licensor" means the individual, individuals, entity or entities that 60 | offer(s) the Work under the terms of this License. 61 | e. "Original Author" means, in the case of a literary or artistic work, 62 | the individual, individuals, entity or entities who created the Work 63 | or if no individual or entity can be identified, the publisher; and in 64 | addition (i) in the case of a performance the actors, singers, 65 | musicians, dancers, and other persons who act, sing, deliver, declaim, 66 | play in, interpret or otherwise perform literary or artistic works or 67 | expressions of folklore; (ii) in the case of a phonogram the producer 68 | being the person or legal entity who first fixes the sounds of a 69 | performance or other sounds; and, (iii) in the case of broadcasts, the 70 | organization that transmits the broadcast. 71 | f. 
"Work" means the literary and/or artistic work offered under the terms 72 | of this License including without limitation any production in the 73 | literary, scientific and artistic domain, whatever may be the mode or 74 | form of its expression including digital form, such as a book, 75 | pamphlet and other writing; a lecture, address, sermon or other work 76 | of the same nature; a dramatic or dramatico-musical work; a 77 | choreographic work or entertainment in dumb show; a musical 78 | composition with or without words; a cinematographic work to which are 79 | assimilated works expressed by a process analogous to cinematography; 80 | a work of drawing, painting, architecture, sculpture, engraving or 81 | lithography; a photographic work to which are assimilated works 82 | expressed by a process analogous to photography; a work of applied 83 | art; an illustration, map, plan, sketch or three-dimensional work 84 | relative to geography, topography, architecture or science; a 85 | performance; a broadcast; a phonogram; a compilation of data to the 86 | extent it is protected as a copyrightable work; or a work performed by 87 | a variety or circus performer to the extent it is not otherwise 88 | considered a literary or artistic work. 89 | g. "You" means an individual or entity exercising rights under this 90 | License who has not previously violated the terms of this License with 91 | respect to the Work, or who has received express permission from the 92 | Licensor to exercise rights under this License despite a previous 93 | violation. 94 | h. "Publicly Perform" means to perform public recitations of the Work and 95 | to communicate to the public those public recitations, by any means or 96 | process, including by wire or wireless means or public digital 97 | performances; to make available to the public Works in such a way that 98 | members of the public may access these Works from a place and at a 99 | place individually chosen by them; to perform the Work to the public 100 | by any means or process and the communication to the public of the 101 | performances of the Work, including by public digital performance; to 102 | broadcast and rebroadcast the Work by any means including signs, 103 | sounds or images. 104 | i. "Reproduce" means to make copies of the Work by any means including 105 | without limitation by sound or visual recordings and the right of 106 | fixation and reproducing fixations of the Work, including storage of a 107 | protected performance or phonogram in digital form or other electronic 108 | medium. 109 | 110 | 2. Fair Dealing Rights. Nothing in this License is intended to reduce, 111 | limit, or restrict any uses free from copyright or rights arising from 112 | limitations or exceptions that are provided for in connection with the 113 | copyright protection under copyright law or other applicable laws. 114 | 115 | 3. License Grant. Subject to the terms and conditions of this License, 116 | Licensor hereby grants You a worldwide, royalty-free, non-exclusive, 117 | perpetual (for the duration of the applicable copyright) license to 118 | exercise the rights in the Work as stated below: 119 | 120 | a. to Reproduce the Work, to incorporate the Work into one or more 121 | Collections, and to Reproduce the Work as incorporated in the 122 | Collections; 123 | b. 
to create and Reproduce Adaptations provided that any such Adaptation, 124 | including any translation in any medium, takes reasonable steps to 125 | clearly label, demarcate or otherwise identify that changes were made 126 | to the original Work. For example, a translation could be marked "The 127 | original work was translated from English to Spanish," or a 128 | modification could indicate "The original work has been modified."; 129 | c. to Distribute and Publicly Perform the Work including as incorporated 130 | in Collections; and, 131 | d. to Distribute and Publicly Perform Adaptations. 132 | e. For the avoidance of doubt: 133 | 134 | i. Non-waivable Compulsory License Schemes. In those jurisdictions in 135 | which the right to collect royalties through any statutory or 136 | compulsory licensing scheme cannot be waived, the Licensor 137 | reserves the exclusive right to collect such royalties for any 138 | exercise by You of the rights granted under this License; 139 | ii. Waivable Compulsory License Schemes. In those jurisdictions in 140 | which the right to collect royalties through any statutory or 141 | compulsory licensing scheme can be waived, the Licensor waives the 142 | exclusive right to collect such royalties for any exercise by You 143 | of the rights granted under this License; and, 144 | iii. Voluntary License Schemes. The Licensor waives the right to 145 | collect royalties, whether individually or, in the event that the 146 | Licensor is a member of a collecting society that administers 147 | voluntary licensing schemes, via that society, from any exercise 148 | by You of the rights granted under this License. 149 | 150 | The above rights may be exercised in all media and formats whether now 151 | known or hereafter devised. The above rights include the right to make 152 | such modifications as are technically necessary to exercise the rights in 153 | other media and formats. Subject to Section 8(f), all rights not expressly 154 | granted by Licensor are hereby reserved. 155 | 156 | 4. Restrictions. The license granted in Section 3 above is expressly made 157 | subject to and limited by the following restrictions: 158 | 159 | a. You may Distribute or Publicly Perform the Work only under the terms 160 | of this License. You must include a copy of, or the Uniform Resource 161 | Identifier (URI) for, this License with every copy of the Work You 162 | Distribute or Publicly Perform. You may not offer or impose any terms 163 | on the Work that restrict the terms of this License or the ability of 164 | the recipient of the Work to exercise the rights granted to that 165 | recipient under the terms of the License. You may not sublicense the 166 | Work. You must keep intact all notices that refer to this License and 167 | to the disclaimer of warranties with every copy of the Work You 168 | Distribute or Publicly Perform. When You Distribute or Publicly 169 | Perform the Work, You may not impose any effective technological 170 | measures on the Work that restrict the ability of a recipient of the 171 | Work from You to exercise the rights granted to that recipient under 172 | the terms of the License. This Section 4(a) applies to the Work as 173 | incorporated in a Collection, but this does not require the Collection 174 | apart from the Work itself to be made subject to the terms of this 175 | License. 
If You create a Collection, upon notice from any Licensor You 176 | must, to the extent practicable, remove from the Collection any credit 177 | as required by Section 4(b), as requested. If You create an 178 | Adaptation, upon notice from any Licensor You must, to the extent 179 | practicable, remove from the Adaptation any credit as required by 180 | Section 4(b), as requested. 181 | b. If You Distribute, or Publicly Perform the Work or any Adaptations or 182 | Collections, You must, unless a request has been made pursuant to 183 | Section 4(a), keep intact all copyright notices for the Work and 184 | provide, reasonable to the medium or means You are utilizing: (i) the 185 | name of the Original Author (or pseudonym, if applicable) if supplied, 186 | and/or if the Original Author and/or Licensor designate another party 187 | or parties (e.g., a sponsor institute, publishing entity, journal) for 188 | attribution ("Attribution Parties") in Licensor's copyright notice, 189 | terms of service or by other reasonable means, the name of such party 190 | or parties; (ii) the title of the Work if supplied; (iii) to the 191 | extent reasonably practicable, the URI, if any, that Licensor 192 | specifies to be associated with the Work, unless such URI does not 193 | refer to the copyright notice or licensing information for the Work; 194 | and (iv) , consistent with Section 3(b), in the case of an Adaptation, 195 | a credit identifying the use of the Work in the Adaptation (e.g., 196 | "French translation of the Work by Original Author," or "Screenplay 197 | based on original Work by Original Author"). The credit required by 198 | this Section 4 (b) may be implemented in any reasonable manner; 199 | provided, however, that in the case of a Adaptation or Collection, at 200 | a minimum such credit will appear, if a credit for all contributing 201 | authors of the Adaptation or Collection appears, then as part of these 202 | credits and in a manner at least as prominent as the credits for the 203 | other contributing authors. For the avoidance of doubt, You may only 204 | use the credit required by this Section for the purpose of attribution 205 | in the manner set out above and, by exercising Your rights under this 206 | License, You may not implicitly or explicitly assert or imply any 207 | connection with, sponsorship or endorsement by the Original Author, 208 | Licensor and/or Attribution Parties, as appropriate, of You or Your 209 | use of the Work, without the separate, express prior written 210 | permission of the Original Author, Licensor and/or Attribution 211 | Parties. 212 | c. Except as otherwise agreed in writing by the Licensor or as may be 213 | otherwise permitted by applicable law, if You Reproduce, Distribute or 214 | Publicly Perform the Work either by itself or as part of any 215 | Adaptations or Collections, You must not distort, mutilate, modify or 216 | take other derogatory action in relation to the Work which would be 217 | prejudicial to the Original Author's honor or reputation. Licensor 218 | agrees that in those jurisdictions (e.g. 
Japan), in which any exercise 219 | of the right granted in Section 3(b) of this License (the right to 220 | make Adaptations) would be deemed to be a distortion, mutilation, 221 | modification or other derogatory action prejudicial to the Original 222 | Author's honor and reputation, the Licensor will waive or not assert, 223 | as appropriate, this Section, to the fullest extent permitted by the 224 | applicable national law, to enable You to reasonably exercise Your 225 | right under Section 3(b) of this License (right to make Adaptations) 226 | but not otherwise. 227 | 228 | 5. Representations, Warranties and Disclaimer 229 | 230 | UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, LICENSOR 231 | OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY 232 | KIND CONCERNING THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, 233 | INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY, 234 | FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF 235 | LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS, 236 | WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION 237 | OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU. 238 | 239 | 6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE 240 | LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR 241 | ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES 242 | ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS 243 | BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 244 | 245 | 7. Termination 246 | 247 | a. This License and the rights granted hereunder will terminate 248 | automatically upon any breach by You of the terms of this License. 249 | Individuals or entities who have received Adaptations or Collections 250 | from You under this License, however, will not have their licenses 251 | terminated provided such individuals or entities remain in full 252 | compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will 253 | survive any termination of this License. 254 | b. Subject to the above terms and conditions, the license granted here is 255 | perpetual (for the duration of the applicable copyright in the Work). 256 | Notwithstanding the above, Licensor reserves the right to release the 257 | Work under different license terms or to stop distributing the Work at 258 | any time; provided, however that any such election will not serve to 259 | withdraw this License (or any other license that has been, or is 260 | required to be, granted under the terms of this License), and this 261 | License will continue in full force and effect unless terminated as 262 | stated above. 263 | 264 | 8. Miscellaneous 265 | 266 | a. Each time You Distribute or Publicly Perform the Work or a Collection, 267 | the Licensor offers to the recipient a license to the Work on the same 268 | terms and conditions as the license granted to You under this License. 269 | b. Each time You Distribute or Publicly Perform an Adaptation, Licensor 270 | offers to the recipient a license to the original Work on the same 271 | terms and conditions as the license granted to You under this License. 272 | c. 
If any provision of this License is invalid or unenforceable under 273 | applicable law, it shall not affect the validity or enforceability of 274 | the remainder of the terms of this License, and without further action 275 | by the parties to this agreement, such provision shall be reformed to 276 | the minimum extent necessary to make such provision valid and 277 | enforceable. 278 | d. No term or provision of this License shall be deemed waived and no 279 | breach consented to unless such waiver or consent shall be in writing 280 | and signed by the party to be charged with such waiver or consent. 281 | e. This License constitutes the entire agreement between the parties with 282 | respect to the Work licensed here. There are no understandings, 283 | agreements or representations with respect to the Work not specified 284 | here. Licensor shall not be bound by any additional provisions that 285 | may appear in any communication from You. This License may not be 286 | modified without the mutual written agreement of the Licensor and You. 287 | f. The rights granted under, and the subject matter referenced, in this 288 | License were drafted utilizing the terminology of the Berne Convention 289 | for the Protection of Literary and Artistic Works (as amended on 290 | September 28, 1979), the Rome Convention of 1961, the WIPO Copyright 291 | Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996 292 | and the Universal Copyright Convention (as revised on July 24, 1971). 293 | These rights and subject matter take effect in the relevant 294 | jurisdiction in which the License terms are sought to be enforced 295 | according to the corresponding provisions of the implementation of 296 | those treaty provisions in the applicable national law. If the 297 | standard suite of rights granted under applicable copyright law 298 | includes additional rights not granted under this License, such 299 | additional rights are deemed to be included in the License; this 300 | License is not intended to restrict the license of any rights under 301 | applicable law. 302 | 303 | 304 | Creative Commons Notice 305 | 306 | Creative Commons is not a party to this License, and makes no warranty 307 | whatsoever in connection with the Work. Creative Commons will not be 308 | liable to You or any party on any legal theory for any damages 309 | whatsoever, including without limitation any general, special, 310 | incidental or consequential damages arising in connection to this 311 | license. Notwithstanding the foregoing two (2) sentences, if Creative 312 | Commons has expressly identified itself as the Licensor hereunder, it 313 | shall have all rights and obligations of Licensor. 314 | 315 | Except for the limited purpose of indicating to the public that the 316 | Work is licensed under the CCPL, Creative Commons does not authorize 317 | the use by either party of the trademark "Creative Commons" or any 318 | related trademark or logo of Creative Commons without the prior 319 | written consent of Creative Commons. Any permitted use will be in 320 | compliance with Creative Commons' then-current trademark usage 321 | guidelines, as may be published on its website or otherwise made 322 | available upon request from time to time. For the avoidance of doubt, 323 | this trademark restriction does not form part of this License. 324 | 325 | Creative Commons may be contacted at https://creativecommons.org/. 
326 | -------------------------------------------------------------------------------- /reading_comprehension/corpus/bAbI/README.txt: -------------------------------------------------------------------------------- 1 | Towards AI Complete Question Answering: A Set of Prerequisite Toy Tasks 2 | ----------------------------------------------------------------------- 3 | In this directory is the first set of 20 tasks for testing text understanding and reasoning in the bAbI project. 4 | The aim is that each task tests a unique aspect of text and reasoning, and hence test different capabilities of learning models. More tasks are planned in the future to capture more aspects. 5 | 6 | For each task, there are 1000 questions for training, and 1000 for testing. 7 | However, we emphasize that the goal is still to use as little data as possible to do well on the task (i.e. if you can use less than 1000 that's even better) -- and without resorting to engineering task-specific tricks that will not generalize to other tasks, as they may not be of much use subsequently. Note that the aim during evaluation is to use the _same_ learner across all tasks to evaluate its skills and capabilities. 8 | Further while the MemNN results in the paper use full supervision (including of the supporting facts) results with weak supervision would also be ultimately preferable as this kind of data is easier to collect. Hence results of that form are very welcome. 9 | 10 | For the reasons above there are currently several directories: 11 | 12 | 1) en/ -- the tasks in English, readable by humans. 13 | 2) hn/ -- the tasks in Hindi, readable by humans. 14 | 3) shuffled/ -- the same tasks with shuffled letters so they are not readable by humans, and for existing parsers and taggers cannot be used in a straight-forward fashion to leverage extra resources-- in this case the learner is more forced to rely on the given training data. This mimics a learner being first presented a language and having to learn from scratch. 15 | 4) en-10k/ shuffled-10k/ and hn-10k/ -- the same tasks in the three formats, but with 10,000 training examples, rather than 1000 training examples. 16 | 5) en-valid/ and en-valid-10k/ are the same as en/ and en10k/ except the train sets have been conveniently split into train and valid portions (90% and 10% split). 17 | 18 | The file format for each task is as follows: 19 | ID text 20 | ID text 21 | ID text 22 | ID question[tab]answer[tab]supporting fact IDS. 23 | ... 24 | 25 | The IDs for a given "story" start at 1 and increase. 26 | When the IDs in a file reset back to 1 you can consider the following sentences as a new "story". 27 | Supporting fact IDs only ever reference the sentences within a "story". 28 | 29 | For Example: 30 | 1 Mary moved to the bathroom. 31 | 2 John went to the hallway. 32 | 3 Where is Mary? bathroom 1 33 | 4 Daniel went back to the hallway. 34 | 5 Sandra moved to the garden. 35 | 6 Where is Daniel? hallway 4 36 | 7 John moved to the office. 37 | 8 Sandra journeyed to the bathroom. 38 | 9 Where is Daniel? hallway 4 39 | 10 Mary moved to the hallway. 40 | 11 Daniel travelled to the office. 41 | 12 Where is Daniel? office 11 42 | 13 John went back to the garden. 43 | 14 John moved to the bedroom. 44 | 15 Where is Sandra? bathroom 8 45 | 1 Sandra travelled to the office. 46 | 2 Sandra went to the bathroom. 47 | 3 Where is Sandra? bathroom 2 48 | 49 | Changes between versions. 50 | ========================= 51 | V1.2 (this version) - Added Hindi versions of all the tasks. 
Fixed some problems with task 16, and added a separate set of directories for 10k training data, as we received requests for this. 52 | V1.1 (this version) - Fixed some problems with task 3, and reduced the training set size available to 1000 as this matches the results in the paper cited above, in order to avoid confusion. 53 | -------------------------------------------------------------------------------- /reading_comprehension/corpus/reading_comprehension.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SeanLee97/nlp_learning/82f158f63c7b943dabc0fb18ed7ebde5c655214a/reading_comprehension/corpus/reading_comprehension.png -------------------------------------------------------------------------------- /reading_comprehension/corpus/result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SeanLee97/nlp_learning/82f158f63c7b943dabc0fb18ed7ebde5c655214a/reading_comprehension/corpus/result.png -------------------------------------------------------------------------------- /reading_comprehension/dataset.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # const.python # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | from glob import glob 11 | from torch.utils.data.dataset import Dataset 12 | from torch.utils.data import DataLoader 13 | from torch.utils.data.dataloader import default_collate 14 | import re 15 | import numpy as np 16 | import const 17 | 18 | class adict(dict): 19 | def __init__(self, *av, **kav): 20 | dict.__init__(self, *av, **kav) 21 | self.__dict__ = self 22 | 23 | def pad_collate(batch): 24 | max_context_sen_len = float('-inf') 25 | max_context_len = float('-inf') 26 | max_question_len = float('-inf') 27 | for elem in batch: 28 | context, question, _ = elem 29 | max_context_len = max_context_len if max_context_len > len(context) else len(context) 30 | max_question_len = max_question_len if max_question_len > len(question) else len(question) 31 | for sen in context: 32 | max_context_sen_len = max_context_sen_len if max_context_sen_len > len(sen) else len(sen) 33 | max_context_len = min(max_context_len, 70) 34 | for i, elem in enumerate(batch): 35 | _context, question, answer = elem 36 | _context = _context[-max_context_len:] 37 | context = np.zeros((max_context_len, max_context_sen_len)) 38 | for j, sen in enumerate(_context): 39 | context[j] = np.pad(sen, (0, max_context_sen_len - len(sen)), 'constant', constant_values=0) 40 | question = np.pad(question, (0, max_question_len - len(question)), 'constant', constant_values=0) 41 | batch[i] = (context, question, answer) 42 | return default_collate(batch) 43 | 44 | class BabiDataset(Dataset): 45 | def __init__(self, task_id, mode='train'): 46 | self.vocab_path = 'dataset/babi{}_vocab.pkl'.format(task_id) 47 | self.mode = mode 48 | raw_train, raw_test = get_raw_babi(task_id) 49 | self.QA = adict() 50 | self.QA.VOCAB = {const.pad: 0, const.eos: 1} 51 | self.QA.IVOCAB = {0: const.pad, 1: const.eos} 52 | self.train = self.get_indexed_qa(raw_train) 53 | self.valid = [self.train[i][int(-len(self.train[i])/10):] for i in range(3)] 54 | self.train = [self.train[i][:int(9 * len(self.train[i])/10)] for i in range(3)] 55 | self.test = self.get_indexed_qa(raw_test) 56 | 57 | def 
set_mode(self, mode): 58 | self.mode = mode 59 | 60 | def __len__(self): 61 | if self.mode == 'train': 62 | return len(self.train[0]) 63 | elif self.mode == 'valid': 64 | return len(self.valid[0]) 65 | elif self.mode == 'test': 66 | return len(self.test[0]) 67 | 68 | def __getitem__(self, index): 69 | if self.mode == 'train': 70 | contexts, questions, answers = self.train 71 | elif self.mode == 'valid': 72 | contexts, questions, answers = self.valid 73 | elif self.mode == 'test': 74 | contexts, questions, answers = self.test 75 | return contexts[index], questions[index], answers[index] 76 | 77 | def get_indexed_qa(self, raw_babi): 78 | unindexed = get_unindexed_qa(raw_babi) 79 | questions = [] 80 | contexts = [] 81 | answers = [] 82 | for qa in unindexed: 83 | context = [c.lower().split() + [const.eos] for c in qa['C']] 84 | 85 | for con in context: 86 | for token in con: 87 | self.build_vocab(token) 88 | context = [[self.QA.VOCAB[token] for token in sentence] for sentence in context] 89 | question = qa['Q'].lower().split() + [const.eos] 90 | 91 | for token in question: 92 | self.build_vocab(token) 93 | question = [self.QA.VOCAB[token] for token in question] 94 | 95 | self.build_vocab(qa['A'].lower()) 96 | answer = self.QA.VOCAB[qa['A'].lower()] 97 | 98 | 99 | contexts.append(context) 100 | questions.append(question) 101 | answers.append(answer) 102 | return (contexts, questions, answers) 103 | 104 | def build_vocab(self, token): 105 | if not token in self.QA.VOCAB: 106 | next_index = len(self.QA.VOCAB) 107 | self.QA.VOCAB[token] = next_index 108 | self.QA.IVOCAB[next_index] = token 109 | 110 | 111 | def get_raw_babi(taskid): 112 | paths = glob('corpus/bAbI/en-10k/qa{}_*'.format(taskid)) 113 | for path in paths: 114 | if 'train' in path: 115 | with open(path, 'r') as fp: 116 | train = fp.read() 117 | elif 'test' in path: 118 | with open(path, 'r') as fp: 119 | test = fp.read() 120 | return train, test 121 | 122 | def build_vocab(raw_babi): 123 | lowered = raw_babi.lower() 124 | tokens = re.findall('[a-zA-Z]+', lowered) 125 | types = set(tokens) 126 | return types 127 | 128 | # adapted from https://github.com/YerevaNN/Dynamic-memory-networks-in-Theano/ 129 | def get_unindexed_qa(raw_babi): 130 | tasks = [] 131 | task = None 132 | babi = raw_babi.strip().split('\n') 133 | for i, line in enumerate(babi): 134 | id = int(line[0:line.find(' ')]) 135 | if id == 1: 136 | task = {"C": "", "Q": "", "A": "", "S": ""} 137 | counter = 0 138 | id_map = {} 139 | 140 | line = line.strip() 141 | line = line.replace('.', ' . 
') 142 | line = line[line.find(' ')+1:] 143 | # if not a question 144 | if line.find('?') == -1: 145 | task["C"] += line + '<line>'  # '<line>' marks sentence boundaries 146 | id_map[id] = counter 147 | counter += 1 148 | else: 149 | idx = line.find('?') 150 | tmp = line[idx+1:].split('\t') 151 | task["Q"] = line[:idx] 152 | task["A"] = tmp[1].strip() 153 | task["S"] = [] # Supporting facts 154 | for num in tmp[2].split(): 155 | task["S"].append(id_map[int(num.strip())]) 156 | tc = task.copy() 157 | tc['C'] = tc['C'].split('<line>')[:-1] 158 | tasks.append(tc) 159 | return tasks 160 | 161 | if __name__ == '__main__': 162 | dset_train = BabiDataset(20, mode='train') 163 | train_loader = DataLoader(dset_train, batch_size=2, shuffle=True, collate_fn=pad_collate) 164 | for batch_idx, data in enumerate(train_loader): 165 | contexts, questions, answers = data 166 | break -------------------------------------------------------------------------------- /reading_comprehension/main.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # const.python # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import const 11 | import numpy as np 12 | import torch 13 | import torch.nn as nn 14 | import torch.optim as optim 15 | from model import * 16 | from dataset import * 17 | 18 | import argparse 19 | parser = argparse.ArgumentParser(description='gca main.py') 20 | parser.add_argument('-train', action='store_true', default=False, help='train model') 21 | parser.add_argument('-test', action='store_true', default=False, help='test model') 22 | parser.add_argument('-evaluate', action='store_true', default=False, help='evaluate') 23 | args = parser.parse_args() 24 | 25 | def train(): 26 | dataset = BabiDataset(const.task_id) 27 | model = ReaderNet(len(dataset.QA.VOCAB), const.hidden_size) 28 | model = load_model(model) 29 | model = model.cuda() if const.use_cuda else model 30 | optimizer = optim.Adam(model.parameters(), lr=const.lr_rate) 31 | 32 | best_acc = 0 33 | for epoch in range(const.epochs): 34 | model.train() 35 | total_acc = 0.0 36 | cnt = 0 37 | dataset.set_mode('train') 38 | train_loader = DataLoader(dataset, batch_size=const.batch_size, shuffle=True, collate_fn=pad_collate) 39 | losses = [] 40 | for batch_idx, data in enumerate(train_loader): 41 | optimizer.zero_grad() 42 | 43 | contexts, questions, answers = data 44 | 45 | contexts = contexts.long() 46 | contexts = contexts.cuda() if const.use_cuda else contexts 47 | contexts = Variable(contexts) 48 | 49 | questions = questions.long() 50 | questions = questions.cuda() if const.use_cuda else questions 51 | questions = Variable(questions) 52 | 53 | #answers = answers.long() 54 | answers = answers.cuda() if const.use_cuda else answers 55 | answers = Variable(answers) 56 | 57 | loss, acc = model.get_loss(contexts, questions, answers) 58 | losses.append(loss.data[0]) 59 | total_acc += acc * const.batch_size 60 | if batch_idx % 50 == 0: 61 | print('loss', np.mean(losses)) 62 | losses = [] 63 | cnt += const.batch_size 64 | loss.backward() 65 | optimizer.step() 66 | 67 | dataset.set_mode('valid') 68 | valid_loader = DataLoader( 69 | dataset, batch_size=const.batch_size, shuffle=False, collate_fn=pad_collate 70 | ) 71 | 72 | model.eval() 73 | total_acc = 0.0 74 | cnt = 0 75 | for batch_idx, data in enumerate(valid_loader): 76 | contexts, questions, answers = data 77 | batch_size = 
contexts.size()[0] 78 | 79 | contexts = contexts.long() 80 | contexts = contexts.cuda() if const.use_cuda else contexts 81 | contexts = Variable(contexts) 82 | 83 | questions = questions.long() 84 | questions = questions.cuda() if const.use_cuda else questions 85 | questions = Variable(questions) 86 | 87 | answers = answers.cuda() if const.use_cuda else answers 88 | answers = Variable(answers) 89 | 90 | _, acc = model.get_loss(contexts, questions, answers) 91 | total_acc += acc * const.batch_size 92 | cnt += const.batch_size 93 | 94 | total_acc = total_acc / cnt 95 | print('accuracy: %.4f' % total_acc) 96 | if total_acc > best_acc: 97 | best_acc = total_acc 98 | best_state = model.state_dict() 99 | save_model(model) 100 | print('save model') 101 | 102 | def evaluate(): 103 | dataset = BabiDataset(const.task_id) 104 | model = ReaderNet(len(dataset.QA.VOCAB), const.hidden_size) 105 | model = load_model(model) 106 | model = model.cuda() if const.use_cuda else model 107 | 108 | model.eval() 109 | dataset.set_mode('test') 110 | test_loader = DataLoader( 111 | dataset, batch_size=1, shuffle=True, collate_fn=pad_collate 112 | ) 113 | for batch_idx, data in enumerate(test_loader): 114 | contexts, questions, answers = data 115 | 116 | print(contexts.size()) 117 | print(questions.size()) 118 | print(answers.size()) 119 | contexts_raw = [] 120 | for cont in contexts.numpy().tolist()[0]: 121 | c = [] 122 | [c.append(dataset.QA.IVOCAB[w]) for w in cont] 123 | contexts_raw.append(c) 124 | 125 | q_raw = [] 126 | [q_raw.append(dataset.QA.IVOCAB[w]) for w in questions.numpy().tolist()[0]] 127 | 128 | a_raw = dataset.QA.IVOCAB[answers.numpy().tolist()[0]] 129 | 130 | print('\n>facts: ') 131 | for cont in contexts_raw: 132 | print(cont) 133 | 134 | contexts = contexts.long() 135 | contexts = contexts.cuda() if const.use_cuda else contexts 136 | contexts = Variable(contexts) 137 | while True: 138 | question = input('\n>input your question: ') 139 | questions = list(map(lambda w: dataset.QA.VOCAB[w] if w in dataset.QA.VOCAB else dataset.QA.VOCAB[const.pad], question.split(' '))) 140 | 141 | #print(questions) 142 | 143 | questions = torch.LongTensor(questions) 144 | questions = questions.cuda() if const.use_cuda else questions 145 | questions = Variable(questions).unsqueeze(0) 146 | 147 | pred = model.predict(contexts, questions) 148 | print(">pred: ", dataset.QA.IVOCAB[pred]) 149 | 150 | break; 151 | pass 152 | 153 | if args.train: 154 | train() 155 | elif args.evaluate: 156 | evaluate() -------------------------------------------------------------------------------- /reading_comprehension/model.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # const.python # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import os 11 | import const 12 | import torch 13 | import torch.nn as nn 14 | import torch.nn.init as init 15 | import torch.nn.functional as F 16 | from torch.autograd import Variable 17 | 18 | def save_model(model, path=f'models/model.pth'): 19 | os.makedirs('models', exist_ok=True) 20 | with open(path, 'wb') as fp: 21 | torch.save(model.state_dict(), fp) 22 | 23 | def load_model(model, path=f'models/model.pth'): 24 | if not os.path.exists(path): 25 | return model 26 | model.load_state_dict(torch.load(path)) 27 | return model 28 | 29 | def position_encoding(embedded_sentence): 30 | ''' 31 
| embedded_sentence.size() -> (#batch, #sentence, #token, #embedding) 32 | l.size() -> (#sentence, #embedding) 33 | output.size() -> (#batch, #sentence, #embedding) 34 | ''' 35 | _, _, slen, elen = embedded_sentence.size() 36 | 37 | l = [[(1 - s/(slen-1)) - (e/(elen-1)) * (1 - 2*s/(slen-1)) for e in range(elen)] for s in range(slen)] 38 | l = torch.FloatTensor(l) 39 | l = l.unsqueeze(0) # for #batch 40 | l = l.unsqueeze(1) # for #sen 41 | l = l.expand_as(embedded_sentence) 42 | l = l.cuda() if const.use_cuda else l 43 | weighted = embedded_sentence * Variable(l) 44 | return torch.sum(weighted, dim=2).squeeze(2) # sum with tokens 45 | 46 | class InputNet(nn.Module): 47 | def __init__(self, input_size, hidden_size): 48 | super(InputNet, self).__init__() 49 | self.hidden_size = hidden_size 50 | self.gru = nn.GRU(hidden_size, hidden_size, bidirectional=True, batch_first=True) 51 | for name, param in self.gru.state_dict().items(): 52 | if 'weight' in name: init.xavier_normal(param) 53 | self.dropout = nn.Dropout(0.1) 54 | 55 | def forward(self, contexts, embedding): 56 | ''' 57 | contexts.size() -> (#batch, #sentence, #token) 58 | embedding() -> (#batch, #sentence x #token, #embedding) 59 | position_encoding() -> (#batch, #sentence, #embedding) 60 | facts.size() -> (#batch, #sentence, #hidden = #embedding) 61 | ''' 62 | batch_size, sen_size, token_size = contexts.size() 63 | 64 | contexts = contexts.view(batch_size, -1) 65 | contexts = embedding(contexts) 66 | 67 | contexts = contexts.view(batch_size, sen_size, token_size, -1) 68 | contexts = position_encoding(contexts) 69 | contexts = self.dropout(contexts) 70 | 71 | # init hidden 72 | h0 = torch.zeros(2, batch_size, self.hidden_size) 73 | h0 = h0.cuda() if const.use_cuda else h0 74 | h0 = Variable(h0) 75 | 76 | facts, hdn = self.gru(contexts, h0) 77 | facts = facts[:, :, :self.hidden_size] + facts[:, :, self.hidden_size:] 78 | return facts 79 | 80 | class QuestionNet(nn.Module): 81 | def __init__(self, input_size, hidden_size): 82 | super(QuestionNet, self).__init__() 83 | self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True) 84 | 85 | def forward(self, questions, embedding): 86 | ''' 87 | questions.size() -> (#batch, #token) 88 | embedding() -> (#batch, #token, #embedding) 89 | gru() -> (1, #batch, #hidden) 90 | ''' 91 | questions = embedding(questions) 92 | _, questions = self.gru(questions) # last hidden as questions, (num_layers * num_directions, B, hidden_size) 93 | 94 | questions = questions.transpose(0, 1) # B x 1 x hidden_size 95 | return questions 96 | 97 | class AttnNet(nn.Module): 98 | def __init__(self, hidden_size): 99 | super(AttnNet, self).__init__() 100 | self.hidden_size = hidden_size 101 | 102 | def forward(self, questions, facts): 103 | batch_size, seqnum, _ = facts.size() 104 | 105 | attn_energies = Variable(torch.zeros(batch_size, seqnum)) # B x S 106 | for b in range(batch_size): 107 | for i in range(seqnum): 108 | attn_energies[b, i] = self.score(facts[b, i], questions[b]) # calc Ct 109 | 110 | attn_energies = attn_energies.cuda() if const.use_cuda else attn_energies 111 | return F.softmax(attn_energies.unsqueeze(1)) 112 | 113 | def score(self, fact, question): 114 | energy = fact.dot(question) 115 | return energy 116 | 117 | class ReaderNet(nn.Module): 118 | def __init__(self, input_size, hidden_size, dropout_p=0.1): 119 | super(ReaderNet, self).__init__() 120 | 121 | self.hidden_size = hidden_size 122 | self.embedding = nn.Embedding(input_size, hidden_size) 123 | self.embedding = self.embedding.cuda() if 
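The docstring and formula above define the DMN-style positional weighting. The toy snippet below (a standalone sketch with made-up sizes slen=3, elen=4) just prints the weight matrix that position_encoding() multiplies into the token embeddings before summing over tokens.

```python
# Toy illustration of the positional weights used in position_encoding() above
# (standalone sketch; slen/elen are made-up sizes, not taken from the model).
import torch

slen, elen = 3, 4  # tokens per sentence, embedding dimensions
l = [[(1 - s / (slen - 1)) - (e / (elen - 1)) * (1 - 2 * s / (slen - 1))
      for e in range(elen)] for s in range(slen)]
print(torch.FloatTensor(l))
# row s weights token position s; multiplying the embeddings by these weights
# and summing over tokens yields one vector per sentence
```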
const.use_cuda else self.embedding 124 | init.uniform(self.embedding.state_dict()['weight'], a=-1.0, b=1.0) 125 | 126 | self.input_net = InputNet(input_size, hidden_size) 127 | self.question_net = QuestionNet(input_size, hidden_size) 128 | self.attn_net = AttnNet(hidden_size) 129 | self.h2o = nn.Linear(hidden_size, input_size) 130 | 131 | self.criterion = nn.CrossEntropyLoss() 132 | 133 | def forward(self, contexts, questions): 134 | facts = self.input_net(contexts, self.embedding) 135 | questions = self.question_net(questions, self.embedding).squeeze(1) 136 | 137 | facts_attn = self.attn_net(questions, facts) 138 | facts = torch.bmm(facts_attn, facts).squeeze(1) 139 | 140 | outputs = questions * facts 141 | outputs = self.h2o(F.tanh(outputs)) 142 | return outputs 143 | 144 | def get_loss(self, contexts, questions, targets): 145 | output = self.forward(contexts, questions) 146 | loss = self.criterion(output.view(targets.size(0), -1), targets) 147 | reg_loss = 0 148 | for param in self.parameters(): 149 | reg_loss += 0.001 * torch.sum(param * param) 150 | preds = F.softmax(output) 151 | _, pred_ids = torch.max(preds, dim=1) 152 | corrects = (pred_ids.data == targets.data) 153 | acc = torch.mean(corrects.float()) 154 | return loss + reg_loss, acc 155 | 156 | def predict(self, contexts, questions): 157 | output = self.forward(contexts, questions) 158 | preds = F.softmax(output) 159 | _, pred_ids = torch.max(preds, dim=1) 160 | pred_value, pred_ids = torch.topk(preds, 1) 161 | return pred_ids.data.tolist()[0][0] -------------------------------------------------------------------------------- /text_similarity/vsm_sim.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import jieba 5 | 6 | class Sim(object): 7 | def __init__(self, kernel='tfidf'): 8 | self.word2idx = {} 9 | self.kernel = kernel 10 | 11 | def tokenizer(self, sent): 12 | return jieba.lcut(sent) 13 | 14 | def calc_bow(self, docs): 15 | bow = np.zeros([len(docs), len(self.word2idx)]) 16 | for docidx, words in enumerate(docs): 17 | for word in words: 18 | if word in self.word2idx: 19 | bow[docidx, self.word2idx[word]] += 1 20 | return bow 21 | 22 | def calc_tfidf(self, docs): 23 | tf = self.calc_bow(docs) 24 | df = np.ones([1, len(self.word2idx)]) 25 | 26 | for docidx, words in enumerate(docs): 27 | tf[docidx] /= np.max(tf[docidx]) 28 | for word in words: 29 | if word in self.word2idx: 30 | df[0, self.word2idx[word]] += 1 31 | idf = np.log(len(docs)) - np.log(df) 32 | tfidf = tf * idf 33 | return tfidf 34 | 35 | def cos(self, vec1, vec2): 36 | cos = np.dot(vec1, vec2) / (np.linalg.norm(vec1)*np.linalg.norm(vec2)) 37 | try: 38 | cos = np.dot(vec1, vec2) / (np.linalg.norm(vec1)*np.linalg.norm(vec2)) 39 | except: 40 | cos = None 41 | 42 | return cos 43 | 44 | def similarity(self, doc1, doc2): 45 | words1 = self.tokenizer(doc1) 46 | words2 = self.tokenizer(doc2) 47 | 48 | words = set(words1) | set(words2) 49 | self.word2idx = dict(zip(words, range(len(words)))) 50 | 51 | if self.kernel == 'tfidf': 52 | feature = self.calc_tfidf 53 | else: 54 | feature = self.calc_bow 55 | 56 | vec = feature([words1, words2]) 57 | vec1 = vec[0] 58 | vec2 = vec[1] 59 | 60 | return self.cos(vec1, vec2) 61 | 62 | if __name__ == '__main__': 63 | doc1 = """计算机科学(英语:computer science,有时缩写为CS)是系统性研究信息与计算的理论基础以及它们在计算机系统中如何实现与应用的实用技术的学科。 64 | [1] [2]它通常被形容为对那些创造、描述以及转换信息的算法处理的系统研究。 65 | 计算机科学包含很多分支领域;有些强调特定结果的计算,比如计算机图形学; 66 | 
而有些是探讨计算问题的性质，比如计算复杂性理论；还有一些领域专注于怎样实现计算，比如编程语言理论是研究描述计算的方法， 67 | 而程序设计是应用特定的编程语言解决特定的计算问题，人机交互则是专注于怎样使计算机和计算变得有用、好用，以及随时随地为人所用。""" 68 | 69 | doc2 = """自然语言处理（英语：natural language processing，缩写作 NLP）是人工智能和语言学领域的分支学科。此领域探讨如何处理及运用自然语言；自然语言认知则是指让电脑"懂"人类的语言。 70 | 自然语言生成系统把计算机数据转化为自然语言。自然语言理解系统把自然语言转化为计算机程序更易于处理的形式。""" 71 | sim = Sim() 72 | print(sim.similarity(doc1, doc2)) -------------------------------------------------------------------------------- /word2vec/README.md: -------------------------------------------------------------------------------- 1 | ### Word2vec 2 | 3 | word2vec is one of the workhorses of NLP: it can be used for many things, such as finding synonyms, relation mining, and transfer learning. 4 | 5 | To help readers understand word2vec better, the main models are implemented here with deep learning frameworks: 6 | * CBOW naive softmax, CBOW negative_sampling 7 | * Skipgram naive softmax, Skipgram negative_sampling 8 | 9 | Since readers may use different deep learning frameworks, two frameworks are covered: 10 | * pytorch 11 | * tensorflow 12 | 13 | Note: the code is for learning purposes only and should be used in real projects with caution; for production work, mature solutions are recommended: 14 | * gensim word2vec 15 | * word2vec 16 | 17 | ### Result 18 | ![result.png](./corpus/result.png) 19 | 20 | ### Recommended 21 | * [Distributed Representations of Words and Phrases and their Compositionality](https://arxiv.org/abs/1310.4546) 22 | * [word2vec Parameter Learning Explained](https://arxiv.org/abs/1411.2738) 23 | * [The mathematics behind word2vec (in Chinese)](http://blog.csdn.net/itplus/article/details/37969519) 24 | 25 | ### Reference 26 | Parts of the code are adapted from: 27 | * [CBOW_on_TensorFlow](https://github.com/edugp/CBOW_on_TensorFlow/blob/master/CBOW.ipynb) 28 | * [Skip-gram with naive softmax](https://nbviewer.jupyter.org/github/DSKSD/DeepNLP-models-Pytorch/blob/master/notebooks/01.Skip-gram-Naive-Softmax.ipynb) 29 | -------------------------------------------------------------------------------- /word2vec/cbow/pytorch/negative_sampling/cbow.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # cbow # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | 14 | class Cbow(nn.Module): 15 | def __init__(self, input_size, projection_size): 16 | super(Cbow, self).__init__() 17 | self.V = nn.Embedding(input_size, projection_size) 18 | self.U = nn.Embedding(input_size, projection_size) 19 | self.logsigmoid = nn.LogSigmoid() 20 | 21 | initrange = (2.0 / (input_size + projection_size))**0.5 # Xavier-style bound 22 | self.V.weight.data.uniform_(-initrange, initrange) 23 | self.U.weight.data.uniform_(-0.0, 0.0) # zero 24 | 25 | def forward(self, center_words, target_words, neg_words): 26 | v = self.V(center_words) # batch_size x 1 x projection_size 27 | u = self.U(target_words) # batch_size x 1 x projection_size 28 | u_neg = -self.U(neg_words) 29 | 30 | pos_score = u.bmm(v.transpose(1, 2)).squeeze(2) # batch_size x 1 31 | neg_score = torch.sum(u_neg.bmm(v.transpose(1, 2)).squeeze(2), 1).view(neg_words.size(0), -1) # batch_size x input_size 32 | 33 | return self.loss(pos_score, neg_score) 34 | 35 | def loss(self, pos_score, neg_score): 36 | loss = self.logsigmoid(pos_score) + self.logsigmoid(neg_score) 37 | return -torch.mean(loss) 38 | 39 | def pred(self, inp): 40 | return self.V(inp) 41 | 42 | -------------------------------------------------------------------------------- /word2vec/cbow/pytorch/negative_sampling/const.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # 
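As the README above notes, mature implementations are preferable for real projects. The sketch below shows the gensim route it recommends (assumes gensim 4.x and jieba; the corpus path is relative to the word2vec/ directory, and the word queried at the end is only a placeholder).

```python
# Minimal gensim sketch of the CBOW + negative-sampling setup implemented in
# this directory (assumes gensim 4.x; path relative to word2vec/).
import jieba
from gensim.models import Word2Vec

def load_sentences(path='corpus/articles.txt'):
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                yield jieba.lcut(line)  # tokenize Chinese text, as dataset.py does

model = Word2Vec(sentences=list(load_sentences()), vector_size=100, window=5,
                 sg=0, negative=10, min_count=3, workers=4)  # sg=0 selects CBOW
print(model.wv.most_similar('计算机', topn=10))  # replace with any word from the corpus
```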
-------------------------------------------# 5 | # const.python # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | S_TOKEN = '' # start token 11 | E_TOKEN = '' # end token 12 | U_TOKEN = '' # unknown token 13 | D_TOKEN = '' # dummy token 14 | 15 | WIN_SIZE = 4 # window size 16 | SKIP_WIN = 2 # skip window siaze 17 | Z = 0.01 18 | 19 | # nnwork 20 | EMBEDDING_SIZE = 100 21 | BATCH_SIZE = 128 22 | EPOCH = 10000 23 | LR_RATE = 0.001 24 | NEG = 10 # Num of Negative Sampling 25 | -------------------------------------------------------------------------------- /word2vec/cbow/pytorch/negative_sampling/dataset.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # dataset.py # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import const 11 | import re 12 | import random 13 | import numpy as np 14 | import nltk 15 | import jieba 16 | 17 | import torch 18 | from torch.autograd import Variable 19 | 20 | import collections 21 | from collections import defaultdict, Counter 22 | 23 | def rm_sign(string): 24 | string = re.sub("[\.\!_,\$\(\)\"\'\]\[!!\?,。?、~@#¥……&]+", "", string) 25 | return string 26 | 27 | def load_data(corpus_dir = '../../../corpus/articles.txt'): 28 | with open(corpus_dir, 'r') as f: 29 | for line in f: 30 | line = line.strip() 31 | if len(line) == 0: 32 | continue 33 | yield jieba.lcut(rm_sign(line)) 34 | 35 | class Corpus(object): 36 | def __init__(self, data): 37 | 38 | # data sample 39 | data_split = len(data) // 10 40 | neg_data = data[-data_split:] 41 | data = data[:1-data_split] 42 | flatten = lambda l: [item.lower() for sublist in l for item in sublist] 43 | 44 | self.neg_vocab = list(set(flatten(neg_data))) 45 | 46 | word_count = Counter(flatten(data)) 47 | self.word2idx = {const.U_TOKEN: 0} 48 | self.n_words = 1 49 | for word, _ in word_count.items(): 50 | self.word2idx[word] = self.n_words 51 | self.n_words += 1 52 | self.idx2word = dict(zip(self.word2idx.values(), self.word2idx.keys())) 53 | self.vocab = list(self.word2idx.keys()) 54 | 55 | # unigram_table 56 | vocab_total_words = sum([c for w, c in word_count.items() if w not in self.neg_vocab]) 57 | self.unigram_table = [] 58 | for v in self.vocab: 59 | self.unigram_table.extend([v]*int(((word_count[v]/vocab_total_words)**(3/4))/const.Z)) 60 | 61 | # @return batch data 62 | # @generator 63 | def batch_data(self): 64 | batch_size = const.BATCH_SIZE * const.WIN_SIZE 65 | data = self.vocab 66 | data_index = 0 67 | assert batch_size % const.WIN_SIZE == 0 68 | assert const.WIN_SIZE <= 2 * const.SKIP_WIN 69 | 70 | batch = np.ndarray(shape=(batch_size), dtype=np.int32) 71 | labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32) 72 | span = 2 * const.SKIP_WIN + 1 # [ const.SKIP_WIN target const.SKIP_WIN ] 73 | buffers = collections.deque(maxlen=span) 74 | 75 | for _ in range(span): 76 | buffers.append(data[data_index]) 77 | data_index = (data_index + 1) % len(data) 78 | 79 | for i in range(batch_size // const.WIN_SIZE): 80 | 81 | target = const.SKIP_WIN # target label at the center of the buffers 82 | targets_to_avoid = [const.SKIP_WIN] 83 | for j in range(const.WIN_SIZE): 84 | while target in targets_to_avoid: 85 | target = random.randint(0, span - 1) 86 | targets_to_avoid.append(target) 87 | batch[i * const.WIN_SIZE + j] = 
self.var_word(buffers[const.SKIP_WIN])[0] 88 | labels[i * const.WIN_SIZE + j, 0] = self.var_word(buffers[target])[0] 89 | buffers.append(data[data_index]) 90 | data_index = (data_index + 1) % len(data) 91 | 92 | label_CBOW = [] 93 | context_CBOW = [] 94 | for i in range(0,len(batch), const.WIN_SIZE): 95 | label_CBOW.append(batch[i]) 96 | context_CBOW.append([l[0] for l in labels[i:i+const.WIN_SIZE]]) 97 | return np.array(context_CBOW), np.array(label_CBOW).reshape(batch_size // const.WIN_SIZE, 1) 98 | 99 | def negative_sampling(self, targets): 100 | batch_size = targets.size(0) 101 | neg_samples = [] 102 | for i in range(batch_size): 103 | sample = [] 104 | target_idx = targets[i].data.tolist()[0] 105 | while len(sample) < const.NEG: 106 | if self.word2idx == target_idx: 107 | continue 108 | sample.append(random.choice(self.unigram_table)) 109 | neg_samples.append(Variable(torch.LongTensor(self.var_sentence(sample))).view(1, -1)) 110 | return torch.cat(neg_samples) 111 | 112 | # @input sentence [w1, w2, ... , wn] 113 | def var_sentence(self, sentence): 114 | idxs = list(map(lambda w: self.word2idx[w] if w in self.vocab else self.word2idx[const.U_TOKEN], sentence)) 115 | return idxs 116 | 117 | # @input word 118 | def var_word(self, word): 119 | idx = [self.word2idx[const.U_TOKEN]] 120 | if word in self.word2idx: 121 | idx = [self.word2idx[word]] 122 | return idx 123 | -------------------------------------------------------------------------------- /word2vec/cbow/pytorch/negative_sampling/main.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # main.py # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | 11 | import argparse 12 | parser = argparse.ArgumentParser(description='main.py') 13 | parser.add_argument('-train', action='store_true', default=False, help='train model') 14 | parser.add_argument('-retrain', action='store_true', default=False, help='train model') 15 | parser.add_argument('-test', action='store_true', default=False, help='test model') 16 | args = parser.parse_args() 17 | 18 | import const 19 | import numpy as np 20 | import torch 21 | import torch.optim as optim 22 | import torch.nn.functional as F 23 | from torch.autograd import Variable 24 | 25 | from dataset import Corpus, load_data 26 | from cbow import Cbow 27 | from utils import Utils 28 | 29 | def test(word, corpus, k=10): 30 | vocab = corpus.vocab 31 | model,_ = Utils.load_previous_model('model') 32 | target_V = model.pred(Variable(torch.LongTensor(corpus.var_word(word)))) 33 | scores=[] 34 | for i in range(len(vocab)): 35 | if vocab[i] == word or vocab[i] == const.U_TOKEN: 36 | continue 37 | vector = model.pred(Variable(torch.LongTensor(corpus.var_word(list(vocab)[i])))) 38 | cosine_sim = F.cosine_similarity(target_V, vector).data.tolist()[0] 39 | scores.append([vocab[i],cosine_sim]) 40 | return sorted(scores, key=lambda x: x[1], reverse=True)[:k] # sort by similarity 41 | 42 | def train(corpus): 43 | if args.retrain: 44 | Utils.remove_models('model') 45 | 46 | losses = [] 47 | 48 | start_epoch = 0 49 | model, start_epoch = Utils.load_previous_model('model') 50 | if model == None: 51 | model = Cbow(corpus.n_words, const.EMBEDDING_SIZE) 52 | 53 | if torch.cuda.is_available(): 54 | model.cuda() 55 | optimizer = optim.Adam(model.parameters(), const.LR_RATE) 56 | 57 | for epoch in range(start_epoch, 
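Corpus above builds `unigram_table` by repeating every word in proportion to count^(3/4) (scaled by 1/const.Z), so that `negative_sampling()` can draw negatives with a plain `random.choice`. A standalone toy version of that construction, with made-up counts:

```python
# Toy version of the 3/4-power unigram table used by Corpus.negative_sampling()
# (standalone sketch; the counts below are made up).
import random

word_count = {'the': 100, 'cat': 20, 'sat': 5}
Z = 0.01  # same role as const.Z: smaller Z -> larger table
total = sum(word_count.values())
unigram_table = []
for word, count in word_count.items():
    unigram_table.extend([word] * int(((count / total) ** 0.75) / Z))

print(len(unigram_table))                                # table size
print([random.choice(unigram_table) for _ in range(5)])  # five negative samples
```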
const.EPOCH): 58 | inputs, targets = corpus.batch_data() 59 | 60 | 61 | inputs = Variable(torch.from_numpy(inputs).long()) 62 | targets = Variable(torch.from_numpy(targets).long()) 63 | 64 | negs = corpus.negative_sampling(targets) 65 | #print(inputs.size(), targets.size(), vocabs.size()) 66 | #exit() 67 | model.zero_grad() 68 | loss = model(inputs, targets, negs) 69 | loss.backward() 70 | optimizer.step() 71 | 72 | losses.append(loss.data.tolist()[0]) 73 | if epoch % 100 == 0: 74 | print("Epoch : %d, mean_loss : %.02f" % (epoch , np.mean(losses))) 75 | Utils.save_model(model, epoch, 'model') 76 | losses = [] 77 | Utils.save_model(model, epoch, 'model') 78 | 79 | data = list(load_data()) 80 | corpus = Corpus(data) 81 | if args.train or args.retrain: 82 | train(corpus) 83 | elif args.test: 84 | word = input('Input word> ') 85 | print(test(word, corpus)) -------------------------------------------------------------------------------- /word2vec/cbow/pytorch/negative_sampling/utils.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # model utils # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | import torch 12 | import os, glob 13 | import numpy as np 14 | 15 | class Utils(object): 16 | 17 | @staticmethod 18 | def save_model(model, epoch, save_dir, max_keep=5): 19 | if not os.path.exists(save_dir): 20 | os.makedirs(save_dir) 21 | f_list = glob.glob(os.path.join(save_dir, 'model') + '_*.ckpt') 22 | if len(f_list) >= max_keep + 2: 23 | epoch_list = [int(i.split('_')[-1].split('.')[0]) for i in f_list] 24 | to_delete = [f_list[i] for i in np.argsort(epoch_list)[-max_keep:]] 25 | for f in to_delete: 26 | os.remove(f) 27 | name = 'model_{}.ckpt'.format(epoch) 28 | file_path = os.path.join(save_dir, name) 29 | #torch.save(model.state_dict(), file_path) 30 | torch.save(model, file_path) 31 | 32 | @staticmethod 33 | def load_previous_model(save_dir): 34 | if not os.path.exists(save_dir): 35 | os.makedirs(save_dir) 36 | f_list = glob.glob(os.path.join(save_dir, 'model') + '_*.ckpt') 37 | start_epoch = 1 38 | model = None 39 | if len(f_list) >= 1: 40 | epoch_list = [int(i.split('_')[-1].split('.')[0]) for i in f_list] 41 | last_checkpoint = f_list[np.argmax(epoch_list)] 42 | if os.path.exists(last_checkpoint): 43 | #print('load from {}'.format(last_checkpoint)) 44 | # CNN 不支持参数保存 45 | #model.load_state_dict(torch.load(last_checkpoint)) 46 | model = torch.load(last_checkpoint) 47 | start_epoch = np.max(epoch_list) 48 | return model, start_epoch 49 | 50 | @staticmethod 51 | def remove_models(save_dir): 52 | f_list = glob.glob(os.path.join(save_dir, 'model') + '_*.ckpt') 53 | f_list.append(os.path.join(save_dir, 'param.pkl')) 54 | f_list.append(os.path.join(save_dir, 'log.txt')) 55 | for filename in f_list: 56 | try: 57 | os.remove(filename) 58 | except: 59 | pass 60 | -------------------------------------------------------------------------------- /word2vec/cbow/pytorch/softmax/cbow.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # skipgram # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import torch 11 | import torch.nn as 
nn 12 | import torch.nn.functional as F 13 | 14 | class Cbow(nn.Module): 15 | def __init__(self, input_size, projection_size): 16 | super(Cbow, self).__init__() 17 | self.V = nn.Embedding(input_size, projection_size) 18 | self.U = nn.Embedding(input_size, projection_size) 19 | 20 | self.V.weight.data.uniform_(-1.0, 1.0) 21 | self.U.weight.data.uniform_(0.0, 0.0) # zero 22 | 23 | def forward(self, center_words, target_words, out_words): 24 | v = self.V(center_words) # batch_size x win_size x projection_size 25 | u = self.U(target_words) # batch_size x 1 x projection_size 26 | u_actual = self.U(out_words) # batch_size x input_size x projection_size 27 | 28 | scores = u.bmm(v.transpose(1, 2)).squeeze(2) # batch_size x win_size 29 | norm_scores = u_actual.bmm(v.transpose(1, 2)).squeeze(2) # batch_size x input_size 30 | return self.nll_loss(scores, norm_scores) 31 | 32 | def nll_loss(self, scores, norm_scores): 33 | # 34 | softmax = torch.exp(scores)/torch.sum(torch.exp(norm_scores),1).unsqueeze(1) 35 | return -torch.mean(torch.log(softmax)) 36 | 37 | def pred(self, inp): 38 | return self.V(inp) 39 | 40 | -------------------------------------------------------------------------------- /word2vec/cbow/pytorch/softmax/const.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # const.python # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | S_TOKEN = '' # start token 12 | E_TOKEN = '' # end token 13 | U_TOKEN = '' # unknown token 14 | D_TOKEN = '' # dummy token 15 | 16 | WIN_SIZE = 4 # window size 17 | SKIP_WIN = 2 18 | 19 | # nnwork 20 | EMBEDDING_SIZE = 100 21 | BATCH_SIZE = 128 22 | EPOCH = 10000 23 | LR_RATE = 0.0001 24 | -------------------------------------------------------------------------------- /word2vec/cbow/pytorch/softmax/dataset.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # dataset.py # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import const 11 | import re 12 | import random 13 | import numpy as np 14 | import nltk 15 | import jieba 16 | import collections 17 | from collections import defaultdict, Counter 18 | 19 | def rm_sign(string): 20 | string = re.sub("[\.\!_,\$\(\)\"\'\]\[!!\?,。?、~@#¥……&]+", "", string) 21 | return string 22 | 23 | def load_data(corpus_dir = '../../../corpus/articles.txt'): 24 | with open(corpus_dir, 'r') as f: 25 | for line in f: 26 | line = line.strip() 27 | if len(line) == 0: 28 | continue 29 | yield jieba.lcut(rm_sign(line)) 30 | 31 | class Corpus(object): 32 | def __init__(self, data): 33 | flatten = lambda l: [item.lower() for sublist in l for item in sublist] 34 | word_count = Counter(flatten(data)).most_common() 35 | self.word2idx = {const.U_TOKEN: 0} 36 | self.n_words = 1 37 | for word, _ in word_count: 38 | self.word2idx[word] = self.n_words 39 | self.n_words += 1 40 | self.idx2word = dict(zip(self.word2idx.values(), self.word2idx.keys())) 41 | self.vocab = list(self.word2idx.keys()) 42 | 43 | # @return batch data 44 | # @generator 45 | def batch_data(self): 46 | batch_size = const.BATCH_SIZE * const.WIN_SIZE 47 | data = self.vocab 48 | data_index = 0 49 | assert 
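Cbow.nll_loss above exponentiates raw scores, which can overflow once the scores grow. An equivalent but numerically safer formulation (a sketch, not part of the repo) rewrites the same quantity with torch.logsumexp:

```python
# Equivalent, numerically safer form of Cbow.nll_loss above (sketch only;
# scores and norm_scores have the shapes produced by Cbow.forward()).
import torch

def nll_loss_stable(scores, norm_scores):
    # log( exp(scores) / sum_vocab exp(norm_scores) ) = scores - logsumexp(norm_scores)
    log_softmax = scores - torch.logsumexp(norm_scores, dim=1, keepdim=True)
    return -torch.mean(log_softmax)
```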
batch_size % const.WIN_SIZE == 0 50 | assert const.WIN_SIZE <= 2 * const.SKIP_WIN 51 | 52 | batch = np.ndarray(shape=(batch_size), dtype=np.int32) 53 | labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32) 54 | span = 2 * const.SKIP_WIN + 1 # [ const.SKIP_WIN target const.SKIP_WIN ] 55 | buffers = collections.deque(maxlen=span) 56 | 57 | for _ in range(span): 58 | buffers.append(data[data_index]) 59 | data_index = (data_index + 1) % len(data) 60 | 61 | for i in range(batch_size // const.WIN_SIZE): 62 | 63 | target = const.SKIP_WIN # target label at the center of the buffers 64 | targets_to_avoid = [const.SKIP_WIN] 65 | for j in range(const.WIN_SIZE): 66 | while target in targets_to_avoid: 67 | target = random.randint(0, span - 1) 68 | targets_to_avoid.append(target) 69 | batch[i * const.WIN_SIZE + j] = self.var_word(buffers[const.SKIP_WIN])[0] 70 | labels[i * const.WIN_SIZE + j, 0] = self.var_word(buffers[target])[0] 71 | buffers.append(data[data_index]) 72 | data_index = (data_index + 1) % len(data) 73 | 74 | label_CBOW = [] 75 | context_CBOW = [] 76 | for i in range(0,len(batch), const.WIN_SIZE): 77 | label_CBOW.append(batch[i]) 78 | context_CBOW.append([l[0] for l in labels[i:i+const.WIN_SIZE]]) 79 | return np.array(context_CBOW), np.array(label_CBOW).reshape(batch_size // const.WIN_SIZE, 1) 80 | 81 | # @input sentence [w1, w2, ... , wn] 82 | def var_sentence(self, sentence): 83 | idxs = list(map(lambda w: self.word2idx[w] if w in self.vocab else self.word2idx[const.U_TOKEN], sentence)) 84 | return idxs 85 | 86 | # @input word 87 | def var_word(self, word): 88 | idx = [self.word2idx[const.U_TOKEN]] 89 | if word in self.word2idx: 90 | idx = [self.word2idx[word]] 91 | return idx 92 | -------------------------------------------------------------------------------- /word2vec/cbow/pytorch/softmax/main.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # main.py # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | 12 | import argparse 13 | parser = argparse.ArgumentParser(description='main.py') 14 | parser.add_argument('-train', action='store_true', default=False, help='train model') 15 | parser.add_argument('-retrain', action='store_true', default=False, help='train model') 16 | parser.add_argument('-test', action='store_true', default=False, help='test model') 17 | args = parser.parse_args() 18 | 19 | import const 20 | import numpy as np 21 | import torch 22 | import torch.optim as optim 23 | import torch.nn.functional as F 24 | from torch.autograd import Variable 25 | from dataset import Corpus, load_data 26 | from cbow import Cbow 27 | from utils import Utils 28 | 29 | def test(word, corpus, k=10): 30 | vocab = corpus.vocab 31 | model,_ = Utils.load_previous_model('model') 32 | target_V = model.pred(Variable(torch.LongTensor(corpus.var_word(word)))) 33 | scores=[] 34 | for i in range(len(vocab)): 35 | if vocab[i] == word or vocab[i] == const.U_TOKEN: 36 | continue 37 | vector = model.pred(Variable(torch.LongTensor(corpus.var_word(list(vocab)[i])))) 38 | cosine_sim = F.cosine_similarity(target_V, vector).data.tolist()[0] 39 | scores.append([vocab[i],cosine_sim]) 40 | return sorted(scores, key=lambda x: x[1], reverse=True)[:k] # sort by similarity 41 | 42 | def train(corpus): 43 | if args.retrain: 44 | 
Utils.remove_models('model') 45 | 46 | losses = [] 47 | 48 | start_epoch = 0 49 | model, start_epoch = Utils.load_previous_model('model') 50 | if model == None: 51 | model = Cbow(corpus.n_words, const.EMBEDDING_SIZE) 52 | 53 | if torch.cuda.is_available(): 54 | model.cuda() 55 | 56 | optimizer = optim.Adam(model.parameters(), const.LR_RATE) 57 | 58 | for epoch in range(start_epoch, const.EPOCH): 59 | inputs, targets = corpus.batch_data() 60 | 61 | 62 | inputs = Variable(torch.from_numpy(inputs).long()) 63 | targets = Variable(torch.from_numpy(targets).long()) 64 | vocabs = Variable(torch.LongTensor(corpus.var_sentence(corpus.vocab))).expand(inputs.size(0), corpus.n_words) 65 | 66 | model.zero_grad() 67 | loss = model(inputs, targets, vocabs) 68 | loss.backward() 69 | optimizer.step() 70 | 71 | losses.append(loss.data.tolist()[0]) 72 | if epoch % 100 == 0: 73 | print("Epoch : %d, mean_loss : %.02f" % (epoch , np.mean(losses))) 74 | Utils.save_model(model, epoch, 'model') 75 | losses = [] 76 | Utils.save_model(model, epoch, 'model') 77 | 78 | data = list(load_data()) 79 | corpus = Corpus(data) 80 | if args.train or args.retrain: 81 | train(corpus) 82 | elif args.test: 83 | word = input('Input word> ') 84 | print(test(word, corpus)) 85 | -------------------------------------------------------------------------------- /word2vec/cbow/pytorch/softmax/utils.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # model utils # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | import torch 12 | import os, glob 13 | import numpy as np 14 | 15 | class Utils(object): 16 | 17 | @staticmethod 18 | def save_model(model, epoch, save_dir, max_keep=5): 19 | if not os.path.exists(save_dir): 20 | os.makedirs(save_dir) 21 | f_list = glob.glob(os.path.join(save_dir, 'model') + '_*.ckpt') 22 | if len(f_list) >= max_keep + 2: 23 | epoch_list = [int(i.split('_')[-1].split('.')[0]) for i in f_list] 24 | to_delete = [f_list[i] for i in np.argsort(epoch_list)[-max_keep:]] 25 | for f in to_delete: 26 | os.remove(f) 27 | name = 'model_{}.ckpt'.format(epoch) 28 | file_path = os.path.join(save_dir, name) 29 | #torch.save(model.state_dict(), file_path) 30 | torch.save(model, file_path) 31 | 32 | @staticmethod 33 | def load_previous_model(save_dir): 34 | if not os.path.exists(save_dir): 35 | os.makedirs(save_dir) 36 | f_list = glob.glob(os.path.join(save_dir, 'model') + '_*.ckpt') 37 | start_epoch = 1 38 | model = None 39 | if len(f_list) >= 1: 40 | epoch_list = [int(i.split('_')[-1].split('.')[0]) for i in f_list] 41 | last_checkpoint = f_list[np.argmax(epoch_list)] 42 | if os.path.exists(last_checkpoint): 43 | #print('load from {}'.format(last_checkpoint)) 44 | # CNN 不支持参数保存 45 | #model.load_state_dict(torch.load(last_checkpoint)) 46 | model = torch.load(last_checkpoint) 47 | start_epoch = np.max(epoch_list) 48 | return model, start_epoch 49 | 50 | @staticmethod 51 | def remove_models(save_dir): 52 | f_list = glob.glob(os.path.join(save_dir, 'model') + '_*.ckpt') 53 | f_list.append(os.path.join(save_dir, 'param.pkl')) 54 | f_list.append(os.path.join(save_dir, 'log.txt')) 55 | for filename in f_list: 56 | try: 57 | os.remove(filename) 58 | except: 59 | pass 60 | -------------------------------------------------------------------------------- 
/word2vec/cbow/tensorflow/negative_sampling/cbow.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # skipgram # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | import const 12 | import numpy as np 13 | import math 14 | import tensorflow as tf 15 | 16 | class Cbow(object): 17 | def __init__(self, corpus): 18 | self.corpus = corpus 19 | 20 | def test(self, word, k=10): 21 | Weight = tf.Variable(tf.random_normal([self.corpus.n_words, const.EMBEDDING_SIZE], -1.0, 1.0)) 22 | inputs = tf.placeholder(tf.int32, [None]) 23 | embed = tf.nn.embedding_lookup(Weight, inputs) 24 | 25 | # cosine 26 | test_embed = tf.placeholder(tf.float32, [None]) 27 | test_input = tf.placeholder(tf.float32, [None]) 28 | normed_embed = tf.nn.l2_normalize(test_embed, dim=0) 29 | normed_array = tf.nn.l2_normalize(test_input, dim=0) 30 | cosine_similarity = tf.reduce_sum(tf.multiply(normed_array, normed_embed)) 31 | 32 | with tf.Session() as sess: 33 | tf.global_variables_initializer().run() 34 | #restore model 35 | tf.train.Saver().restore(sess, const.MODEL_PATH) 36 | 37 | vectors = sess.run(embed, feed_dict={inputs: range(self.corpus.n_words)}) 38 | vocab = self.corpus.vocab 39 | idx = self.corpus.var_word(word) 40 | scores = [] 41 | for i in range(len(vocab)): 42 | if vocab[i] == word or vocab[i] == const.U_TOKEN: 43 | continue 44 | vec_a = vectors[i].reshape([-1]) 45 | vec_b = vectors[idx].reshape([-1]) 46 | cosine_sim = sess.run(cosine_similarity, feed_dict={test_embed: vec_a, test_input: vec_b}) 47 | scores.append([vocab[i], cosine_sim]) #calculates cosine similarity 48 | return sorted(scores, key=lambda x: x[1], reverse=True)[:k] 49 | 50 | def train(self): 51 | Weight = tf.Variable(tf.truncated_normal([self.corpus.n_words, const.EMBEDDING_SIZE], stddev=1.0/math.sqrt(const.EMBEDDING_SIZE))) 52 | bias = tf.Variable(tf.zeros([self.corpus.n_words])) 53 | 54 | inputs = tf.placeholder(tf.int32, [const.BATCH_SIZE, const.WIN_SIZE]) 55 | outputs = tf.placeholder(tf.int32, [const.BATCH_SIZE, 1]) 56 | embed = tf.nn.embedding_lookup(tf.random_normal([self.corpus.n_words, const.EMBEDDING_SIZE], -1.0, 1.0), inputs) 57 | 58 | embed_sum = tf.reduce_sum(embed, 1) 59 | loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(Weight, bias, outputs, embed_sum, 3, self.corpus.n_words)) # negative sampling 60 | optimizer = tf.train.AdamOptimizer(learning_rate=const.LR_RATE).minimize(loss) 61 | 62 | saver = tf.train.Saver() 63 | 64 | losses = [] 65 | with tf.Session() as sess: 66 | tf.global_variables_initializer().run() 67 | 68 | for epoch in range(const.EPOCH): 69 | inps, targets = self.corpus.batch_data() 70 | _, _loss = sess.run([optimizer, loss], feed_dict={inputs:inps, outputs:targets}) 71 | 72 | losses.append(_loss) 73 | if epoch % 100 == 0: 74 | print('epoch, ', epoch, 'mean loss', np.mean(losses)) 75 | losses= [] 76 | 77 | # save model 78 | saver.save(sess, const.MODEL_PATH) -------------------------------------------------------------------------------- /word2vec/cbow/tensorflow/negative_sampling/const.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # const.python # 6 | # author: sean lee # 7 | # locate: Shanxi university, 
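In the tf.nn.sampled_softmax_loss call above, the bare `3` is the number of sampled negative classes. The standalone sketch below spells the same objective out with keyword arguments (assumes TF 1.x; the sizes are toy values, not the ones in const.py).

```python
# Standalone sketch of the sampled-softmax (negative sampling) objective used
# in Cbow.train() above (assumes TF 1.x; sizes are toy values).
import math
import tensorflow as tf

n_words, emb_size, batch_size, win_size = 5000, 100, 128, 4
weights = tf.Variable(tf.truncated_normal([n_words, emb_size],
                                          stddev=1.0 / math.sqrt(emb_size)))
biases = tf.Variable(tf.zeros([n_words]))
contexts = tf.placeholder(tf.int32, [batch_size, win_size])   # context word ids
labels = tf.placeholder(tf.int64, [batch_size, 1])            # center word ids
embeddings = tf.Variable(tf.random_uniform([n_words, emb_size], -1.0, 1.0))
context_sum = tf.reduce_sum(tf.nn.embedding_lookup(embeddings, contexts), 1)

loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(
    weights=weights,      # output-side embeddings, (n_words, emb_size)
    biases=biases,
    labels=labels,        # true center words
    inputs=context_sum,   # summed context embeddings
    num_sampled=3,        # negatives drawn per example
    num_classes=n_words))
```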
Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | S_TOKEN = '' # start token 12 | E_TOKEN = '' # end token 13 | U_TOKEN = '' # unknown token 14 | D_TOKEN = '' # dummy token 15 | 16 | WIN_SIZE = 4 # window size 17 | SKIP_WIN = 2 18 | 19 | # nnwork 20 | EMBEDDING_SIZE = 100 21 | BATCH_SIZE = 128 22 | EPOCH = 10000 23 | LR_RATE = 0.001 24 | 25 | MODEL_PATH = './model/word2vec.bin' 26 | -------------------------------------------------------------------------------- /word2vec/cbow/tensorflow/negative_sampling/dataset.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # dataset.py # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import const 11 | import re 12 | import random 13 | import numpy as np 14 | import nltk 15 | import jieba 16 | import collections 17 | from collections import defaultdict, Counter 18 | 19 | def rm_sign(string): 20 | string = re.sub("[\.\!_,\$\(\)\"\'\]\[!!\?,。?、~@#¥……&]+", "", string) 21 | return string 22 | 23 | def load_data(corpus_dir = '../../../corpus/articles.txt'): 24 | with open(corpus_dir, 'r') as f: 25 | for line in f: 26 | line = line.strip() 27 | if len(line) == 0: 28 | continue 29 | yield jieba.lcut(rm_sign(line)) 30 | 31 | class Corpus(object): 32 | def __init__(self, data): 33 | flatten = lambda l: [item.lower() for sublist in l for item in sublist] 34 | word_count = Counter(flatten(data)).most_common() 35 | self.word2idx = {const.U_TOKEN: 0} 36 | self.n_words = 1 37 | for word, _ in word_count: 38 | self.word2idx[word] = self.n_words 39 | self.n_words += 1 40 | self.idx2word = dict(zip(self.word2idx.values(), self.word2idx.keys())) 41 | self.vocab = list(self.word2idx.keys()) 42 | 43 | # @return batch data 44 | # @generator 45 | def batch_data(self): 46 | batch_size = const.BATCH_SIZE * const.WIN_SIZE 47 | data = self.vocab 48 | data_index = 0 49 | assert batch_size % const.WIN_SIZE == 0 50 | assert const.WIN_SIZE <= 2 * const.SKIP_WIN 51 | batch = np.ndarray(shape=(batch_size), dtype=np.int32) 52 | labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32) 53 | span = 2 * const.SKIP_WIN + 1 # [ const.SKIP_WIN target const.SKIP_WIN ] 54 | buffers = collections.deque(maxlen=span) 55 | for _ in range(span): 56 | buffers.append(data[data_index]) 57 | data_index = (data_index + 1) % len(data) 58 | for i in range(batch_size // const.WIN_SIZE): 59 | 60 | target = const.SKIP_WIN # target label at the center of the buffers 61 | targets_to_avoid = [const.SKIP_WIN] 62 | for j in range(const.WIN_SIZE): 63 | while target in targets_to_avoid: 64 | target = random.randint(0, span - 1) 65 | targets_to_avoid.append(target) 66 | batch[i * const.WIN_SIZE + j] = self.var_word(buffers[const.SKIP_WIN])[0] 67 | labels[i * const.WIN_SIZE + j, 0] = self.var_word(buffers[target])[0] 68 | buffers.append(data[data_index]) 69 | data_index = (data_index + 1) % len(data) 70 | label_CBOW = [] 71 | context_CBOW = [] 72 | for i in range(0,len(batch), const.WIN_SIZE): 73 | label_CBOW.append(batch[i]) 74 | context_CBOW.append([l[0] for l in labels[i:i+const.WIN_SIZE]]) 75 | return np.array(context_CBOW), np.array(label_CBOW).reshape(batch_size // const.WIN_SIZE, 1) 76 | 77 | # @input sentence [w1, w2, ... 
, wn] 78 | def var_sentence(self, sentence): 79 | idxs = list(map(lambda w: self.word2idx[w] if w in self.word2idx.keys() \ 80 | else self.word2idx[const.U_TOKEN], sentence)) 81 | return idxs 82 | 83 | # @input word 84 | def var_word(self, word): 85 | idx = [self.word2idx[const.U_TOKEN]] 86 | if word in self.word2idx: 87 | idx = [self.word2idx[word]] 88 | return idx 89 | -------------------------------------------------------------------------------- /word2vec/cbow/tensorflow/negative_sampling/main.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # main.py # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import argparse 11 | parser = argparse.ArgumentParser(description='main.py') 12 | parser.add_argument('-train', action='store_true', default=False, help='train model') 13 | parser.add_argument('-test', action='store_true', default=False, help='test model') 14 | args = parser.parse_args() 15 | 16 | from dataset import Corpus, load_data 17 | from cbow import Cbow 18 | 19 | if __name__ == '__main__': 20 | 21 | data = list(load_data()) 22 | corpus = Corpus(data) 23 | cbow = Cbow(corpus) 24 | 25 | 26 | if args.train: 27 | cbow.train() 28 | elif args.test: 29 | word = input('Input word> ') 30 | print(cbow.test(word)) -------------------------------------------------------------------------------- /word2vec/cbow/tensorflow/softmax/cbow.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # skipgram # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import const 11 | import math 12 | import numpy as np 13 | import tensorflow as tf 14 | 15 | class Cbow(object): 16 | def __init__(self, corpus): 17 | self.corpus = corpus 18 | 19 | def test(self, word, k=10): 20 | Weight = tf.Variable(tf.random_normal([self.corpus.n_words, const.EMBEDDING_SIZE], -1.0, 1.0)) 21 | inputs = tf.placeholder(tf.int32, [None]) 22 | embed = tf.nn.embedding_lookup(Weight, inputs) 23 | 24 | # cosine 25 | test_embed = tf.placeholder(tf.float32, [None]) 26 | test_input = tf.placeholder(tf.float32, [None]) 27 | normed_embed = tf.nn.l2_normalize(test_embed, dim=0) 28 | normed_array = tf.nn.l2_normalize(test_input, dim=0) 29 | cosine_similarity = tf.reduce_sum(tf.multiply(normed_array, normed_embed)) 30 | 31 | with tf.Session() as sess: 32 | tf.global_variables_initializer().run() 33 | #restore model 34 | tf.train.Saver().restore(sess, const.MODEL_PATH) 35 | 36 | vectors = sess.run(embed, feed_dict={inputs: range(self.corpus.n_words)}) 37 | vocab = self.corpus.vocab 38 | idx = self.corpus.var_word(word) 39 | scores = [] 40 | for i in range(len(vocab)): 41 | if vocab[i] == word or vocab[i] == const.U_TOKEN: 42 | continue 43 | vec_a = vectors[i].reshape([-1]) 44 | vec_b = vectors[idx].reshape([-1]) 45 | cosine_sim = sess.run(cosine_similarity, feed_dict={test_embed: vec_a, test_input: vec_b}) 46 | scores.append([vocab[i], cosine_sim]) #cosine similarity 47 | return sorted(scores, key=lambda x: x[1], reverse=True)[:k] 48 | 49 | def train(self): 50 | Weight = tf.Variable(tf.truncated_normal([self.corpus.n_words, const.EMBEDDING_SIZE], stddev=1.0/math.sqrt(const.EMBEDDING_SIZE))) 51 | bias = 
tf.Variable(tf.random_normal([self.corpus.n_words])) 52 | 53 | inputs = tf.placeholder(tf.int32, [const.BATCH_SIZE, const.WIN_SIZE]) 54 | targets = tf.placeholder(tf.int32, [const.BATCH_SIZE, 1]) 55 | vocabs = tf.placeholder(tf.int32, [const.BATCH_SIZE, self.corpus.n_words]) 56 | 57 | embed_weight_v = tf.Variable(tf.random_normal([self.corpus.n_words, const.EMBEDDING_SIZE], -1.0, 1.0)) 58 | embed_weight_u = tf.Variable(tf.random_normal([self.corpus.n_words, const.EMBEDDING_SIZE], -1.0, 1.0)) 59 | embed_weight_actual = tf.Variable(tf.random_normal([self.corpus.n_words, const.EMBEDDING_SIZE], -1.0, 1.0)) 60 | embed_v = tf.nn.embedding_lookup(embed_weight_v, inputs) 61 | embed_u = tf.nn.embedding_lookup(embed_weight_u, targets) 62 | embed_actual = tf.nn.embedding_lookup(embed_weight_actual, vocabs) 63 | 64 | ''' 65 | print(embed_u.shape) 66 | print(embed_v.shape) 67 | print(embed_actual.shape) 68 | exit() 69 | ''' 70 | embed_v_trans = tf.transpose(embed_v, [0, 2, 1]) 71 | 72 | #print(embed_v_trans.shape) 73 | scores = tf.matmul(embed_u, embed_v_trans) 74 | norm_scores = tf.matmul(embed_actual, embed_v_trans) 75 | 76 | softmax = tf.exp(scores) / tf.reduce_sum(tf.exp(norm_scores), 1) 77 | softmax = tf.expand_dims(softmax, 1) 78 | nll_loss = -tf.reduce_mean(tf.log(tf.clip_by_value(softmax,1e-10,1.0))) 79 | 80 | optimizer = tf.train.AdamOptimizer(learning_rate=const.LR_RATE).minimize(nll_loss) 81 | 82 | saver = tf.train.Saver() 83 | 84 | losses = [] 85 | with tf.Session() as sess: 86 | tf.global_variables_initializer().run() 87 | 88 | for epoch in range(const.EPOCH): 89 | _inputs, _targets = self.corpus.batch_data() 90 | 91 | #print(_inputs.shape, _targets.shape) 92 | #continue 93 | #_inputs = np.hstack(_inputs) # (2, ) 94 | #_inputs = _inputs.reshape(_inputs.shape[0], 1) 95 | _targets = np.vstack(_targets) # (2, 1) 96 | 97 | vocab = self.corpus.var_sentence(self.corpus.vocab) 98 | _vocabs = [] 99 | [_vocabs.append(vocab) for x in range(inputs.shape[0])] 100 | _vocabs = np.array(_vocabs) 101 | 102 | _, _loss = sess.run([optimizer, nll_loss], feed_dict={inputs:_inputs, targets:_targets, vocabs: _vocabs}) 103 | losses.append(_loss) 104 | 105 | if epoch % 10 == 0: 106 | print('epoch, ', epoch, 'mean loss', np.mean(losses)) 107 | losses= [] 108 | 109 | # save model 110 | saver.save(sess, const.MODEL_PATH) -------------------------------------------------------------------------------- /word2vec/cbow/tensorflow/softmax/const.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # const.python # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | S_TOKEN = '' # start token 12 | E_TOKEN = '' # end token 13 | U_TOKEN = '' # unknown token 14 | D_TOKEN = '' # dummy token 15 | 16 | WIN_SIZE = 4 # window size 17 | SKIP_WIN = 2 18 | 19 | # nnwork 20 | EMBEDDING_SIZE = 100 21 | BATCH_SIZE = 128 22 | EPOCH = 10000 23 | LR_RATE = 0.0001 24 | 25 | MODEL_PATH = './model/word2vec.bin' 26 | -------------------------------------------------------------------------------- /word2vec/cbow/tensorflow/softmax/dataset.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # dataset.py # 6 | # author: sean lee # 7 | # email: 
lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import const 11 | import re 12 | import random 13 | import numpy as np 14 | import nltk 15 | import jieba 16 | import collections 17 | from collections import defaultdict, Counter 18 | 19 | def rm_sign(string): 20 | string = re.sub("[\.\!_,\$\(\)\"\'\]\[!!\?,。?、~@#¥……&]+", "", string) 21 | return string 22 | 23 | def load_data(corpus_dir = '../../../corpus/articles.txt'): 24 | with open(corpus_dir, 'r') as f: 25 | for line in f: 26 | line = line.strip() 27 | if len(line) == 0: 28 | continue 29 | yield jieba.lcut(rm_sign(line)) 30 | 31 | class Corpus(object): 32 | def __init__(self, data): 33 | flatten = lambda l: [item.lower() for sublist in l for item in sublist] 34 | word_count = Counter(flatten(data)).most_common() 35 | self.word2idx = {const.U_TOKEN: 0} 36 | self.n_words = 1 37 | for word, _ in word_count: 38 | self.word2idx[word] = self.n_words 39 | self.n_words += 1 40 | self.idx2word = dict(zip(self.word2idx.values(), self.word2idx.keys())) 41 | self.vocab = list(self.word2idx.keys()) 42 | 43 | # @return batch data 44 | # @generator 45 | def batch_data(self): 46 | batch_size = const.BATCH_SIZE * const.WIN_SIZE 47 | data = self.vocab 48 | data_index = 0 49 | assert batch_size % const.WIN_SIZE == 0 50 | assert const.WIN_SIZE <= 2 * const.SKIP_WIN 51 | batch = np.ndarray(shape=(batch_size), dtype=np.int32) 52 | labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32) 53 | span = 2 * const.SKIP_WIN + 1 # [ const.SKIP_WIN target const.SKIP_WIN ] 54 | buffers = collections.deque(maxlen=span) 55 | for _ in range(span): 56 | buffers.append(data[data_index]) 57 | data_index = (data_index + 1) % len(data) 58 | for i in range(batch_size // const.WIN_SIZE): 59 | 60 | target = const.SKIP_WIN # target label at the center of the buffers 61 | targets_to_avoid = [const.SKIP_WIN] 62 | for j in range(const.WIN_SIZE): 63 | while target in targets_to_avoid: 64 | target = random.randint(0, span - 1) 65 | targets_to_avoid.append(target) 66 | batch[i * const.WIN_SIZE + j] = self.var_word(buffers[const.SKIP_WIN])[0] 67 | labels[i * const.WIN_SIZE + j, 0] = self.var_word(buffers[target])[0] 68 | buffers.append(data[data_index]) 69 | data_index = (data_index + 1) % len(data) 70 | label_CBOW = [] 71 | context_CBOW = [] 72 | for i in range(0,len(batch), const.WIN_SIZE): 73 | label_CBOW.append(batch[i]) 74 | context_CBOW.append([l[0] for l in labels[i:i+const.WIN_SIZE]]) 75 | return np.array(context_CBOW), np.array(label_CBOW).reshape(batch_size // const.WIN_SIZE, 1) 76 | 77 | # @input sentence [w1, w2, ... 
, wn] 78 | def var_sentence(self, sentence): 79 | idxs = list(map(lambda w: self.word2idx[w] if w in self.word2idx.keys() \ 80 | else self.word2idx[const.U_TOKEN], sentence)) 81 | return idxs 82 | 83 | # @input word 84 | def var_word(self, word): 85 | idx = [self.word2idx[const.U_TOKEN]] 86 | if word in self.word2idx: 87 | idx = [self.word2idx[word]] 88 | return idx 89 | -------------------------------------------------------------------------------- /word2vec/cbow/tensorflow/softmax/main.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # main.py # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | import argparse 12 | parser = argparse.ArgumentParser(description='main.py') 13 | parser.add_argument('-train', action='store_true', default=False, help='train model') 14 | parser.add_argument('-test', action='store_true', default=False, help='test model') 15 | args = parser.parse_args() 16 | 17 | from dataset import Corpus, load_data 18 | from cbow import Cbow 19 | 20 | if __name__ == '__main__': 21 | 22 | data = list(load_data()) 23 | corpus = Corpus(data) 24 | cbow = Cbow(corpus) 25 | 26 | if args.train: 27 | cbow.train() 28 | elif args.test: 29 | word = input('Input word> ') 30 | print(cbow.test(word)) -------------------------------------------------------------------------------- /word2vec/corpus/result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SeanLee97/nlp_learning/82f158f63c7b943dabc0fb18ed7ebde5c655214a/word2vec/corpus/result.png -------------------------------------------------------------------------------- /word2vec/corpus/trans_code.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | 4 | ### 5 | # Linux下GB*转UTF-8 6 | ### 7 | fin = open('articles.txt', 'r') 8 | fou = open('articles_uft8.txt', 'w') 9 | line = fin.readline() 10 | while line: 11 | newline = line.decode('GB18030').encode('utf-8') #用GBK、GB2312都会出错 12 | print newline, 13 | print >> fou, newline, 14 | line = fin.readline() 15 | fin.close() 16 | fou.close() 17 | -------------------------------------------------------------------------------- /word2vec/skipgram/pytorch/negative_sampling/const.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # const.python # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | S_TOKEN = '' # start token 12 | E_TOKEN = '' # end token 13 | U_TOKEN = '' # unknown token 14 | D_TOKEN = '' # dummy token 15 | 16 | WIN_SIZE = 3 # window size 17 | 18 | # nnwork 19 | EMBEDDING_SIZE = 30 20 | BATCH_SIZE = 128 21 | EPOCH = 1000 22 | LR_RATE = 0.001 23 | NEG = 10 # Num of Negative Sampling 24 | -------------------------------------------------------------------------------- /word2vec/skipgram/pytorch/negative_sampling/dataset.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # 
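trans_code.py above is Python 2 only (print statements, implicit str.decode). A minimal Python 3 sketch of the same GB18030-to-UTF-8 conversion, keeping the filenames it uses, would be:

```python
# Python 3 sketch of the conversion done by trans_code.py above
# (same input/output filenames; reads GB18030, writes UTF-8).
with open('articles.txt', 'r', encoding='GB18030') as fin, \
     open('articles_uft8.txt', 'w', encoding='utf-8') as fout:
    for line in fin:
        fout.write(line)
```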
-------------------------------------------# 5 | # dataset.py # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import const 11 | import re 12 | import random 13 | import nltk 14 | import jieba 15 | import torch 16 | from torch.autograd import Variable 17 | from collections import defaultdict, Counter 18 | 19 | 20 | if torch.cuda.is_available(): 21 | FloatTensor = torch.cuda.FloatTensor 22 | LongTensor = torch.cuda.LongTensor 23 | ByteTensor = torch.cuda.ByteTensor 24 | 25 | def rm_sign(string): 26 | string = re.sub("[\.\!_,\$\(\)\"\'\]\[!!\?,。?、~@#¥……&]+", "", string) 27 | return string 28 | 29 | def load_data(corpus_dir = '../../../corpus/articles.txt'): 30 | with open(corpus_dir, 'r') as f: 31 | for line in f: 32 | line = line.strip() 33 | if len(line) == 0: 34 | continue 35 | yield jieba.lcut(rm_sign(line)) 36 | 37 | class Corpus(object): 38 | def __init__(self, data): 39 | self.vocab, self.neg_vocab, self.unigram_table = self.get_vocab(data) 40 | self.windows = [] 41 | self.vocab.append(const.U_TOKEN) 42 | self.word2idx = {} 43 | self.idx2word = {} 44 | self.n_words = 0 45 | 46 | for word in self.vocab: 47 | if word not in self.word2idx: 48 | self.word2idx[word] = self.n_words 49 | self.idx2word[self.n_words] = word 50 | self.n_words += 1 51 | 52 | for sentence in data: 53 | # n-gram 54 | self.windows.extend(\ 55 | list(\ 56 | nltk.ngrams([const.D_TOKEN]*const.WIN_SIZE+sentence+[const.D_TOKEN]*const.WIN_SIZE, const.WIN_SIZE*2+1)\ 57 | )\ 58 | ) 59 | 60 | dataset = [] 61 | for window in self.windows: 62 | for i in range(const.WIN_SIZE*2+1): 63 | if window[i] in self.neg_vocab or window[const.WIN_SIZE] in self.neg_vocab: 64 | continue 65 | if i == const.WIN_SIZE or window[i] == const.D_TOKEN: 66 | continue 67 | dataset.append((window[const.WIN_SIZE], window[i])) 68 | X_p, y_p = [], [] 69 | for d in dataset: 70 | X_p.append(self.var_word(d[0]).view(1,-1)) 71 | y_p.append(self.var_word(d[1]).view(1,-1)) 72 | self.dataset = list(zip(X_p, y_p)) 73 | 74 | def get_vocab(self, data, min_count=3, Z=0.01): 75 | # [[]] -> [] 76 | flatten = lambda l: [item.lower() for sublist in l for item in sublist] 77 | word_count = Counter(flatten(data)) 78 | neg_vocab = [w for w, c in word_count.items() if c < min_count] 79 | vocab = list(set(flatten(data))-set(neg_vocab)) 80 | vocab_total_words = sum([c for w, c in word_count.items() if w not in neg_vocab]) 81 | unigram_table = [] 82 | for v in vocab: 83 | unigram_table.extend([v]*int(((word_count[v]/vocab_total_words)**(3/4))/Z)) 84 | return vocab, neg_vocab, unigram_table 85 | 86 | def negative_sampling(self, targets): 87 | batch_size = targets.size(0) 88 | neg_samples = [] 89 | for i in range(batch_size): 90 | sample = [] 91 | target_idx = targets[i].data.tolist()[0] 92 | while len(sample) < const.NEG: 93 | neg = random.choice(self.unigram_table) # draw a candidate from the unigram^(3/4) table 94 | if self.word2idx[neg] != target_idx: 95 | sample.append(neg) 96 | neg_samples.append(self.var_sentence(sample).view(1, -1)) 97 | return torch.cat(neg_samples) 98 | 99 | # @return batch data 100 | # @generator 101 | def batch_data(self, batch_size): 102 | random.shuffle(self.dataset) 103 | sidx = 0 # start index 104 | eidx = batch_size # end index 105 | while eidx < len(self.dataset): 106 | batch = self.dataset[sidx:eidx] 107 | sidx = eidx 108 | eidx += batch_size 109 | yield batch 110 | 111 | if eidx >= len(self.dataset): 112 | batch = self.dataset[sidx: ] 113 | yield batch 114 | 115 | # @input sentence [w1, w2, ... 
, wn] 116 | # @return Variable 117 | def var_sentence(self, sentence): 118 | idxs = list(map(lambda w: self.word2idx[w] if w in self.word2idx.keys() \ 119 | else self.word2idx[const.U_TOKEN], sentence)) 120 | return Variable(torch.LongTensor(idxs)) 121 | 122 | # @input word 123 | # @return Variable 124 | def var_word(self, word): 125 | return Variable(torch.LongTensor([self.word2idx[word]]) if word in self.word2idx.keys() \ 126 | else torch.LongTensor([self.word2idx[const.U_TOKEN]])) 127 | -------------------------------------------------------------------------------- /word2vec/skipgram/pytorch/negative_sampling/main.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # main.py # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | 12 | import argparse 13 | parser = argparse.ArgumentParser(description='main.py') 14 | parser.add_argument('-train', action='store_true', default=False, help='train model') 15 | parser.add_argument('-retrain', action='store_true', default=False, help='train model') 16 | parser.add_argument('-test', action='store_true', default=False, help='test model') 17 | args = parser.parse_args() 18 | 19 | import const 20 | import numpy as np 21 | import torch 22 | import torch.optim as optim 23 | import torch.nn.functional as F 24 | from dataset import Corpus, load_data 25 | from skipgram import Skipgram 26 | from utils import Utils 27 | 28 | def test(word, corpus, k=10): 29 | vocab = corpus.vocab 30 | model,_ = Utils.load_previous_model('model') 31 | target_V = model.pred(corpus.var_word(word)) 32 | scores=[] 33 | for i in range(len(vocab)): 34 | if vocab[i] == word or vocab[i] == const.U_TOKEN: 35 | continue 36 | vector = model.pred(corpus.var_word(list(vocab)[i])) 37 | cosine_sim = F.cosine_similarity(target_V, vector).data.tolist()[0] 38 | scores.append([vocab[i],cosine_sim]) 39 | return sorted(scores, key=lambda x: x[1], reverse=True)[:k] # sort by similarity 40 | 41 | def train(corpus): 42 | if args.retrain: 43 | Utils.remove_models('model') 44 | 45 | losses = [] 46 | 47 | start_epoch = 0 48 | model, start_epoch = Utils.load_previous_model('model') 49 | if model == None: 50 | model = Skipgram(corpus.n_words, const.EMBEDDING_SIZE) 51 | 52 | if torch.cuda.is_available(): 53 | model.cuda() 54 | optimizer = optim.Adam(model.parameters(), const.LR_RATE) 55 | 56 | for epoch in range(start_epoch, const.EPOCH): 57 | for i, batch in enumerate(corpus.batch_data(const.BATCH_SIZE)): 58 | inputs, targets = zip(*batch) # unzip 59 | inputs = torch.cat(inputs) 60 | targets = torch.cat(targets) 61 | negs = corpus.negative_sampling(targets) 62 | #print(inputs.size(), targets.size(), vocabs.size()) 63 | #exit() 64 | model.zero_grad() 65 | loss = model(inputs, targets, negs) 66 | loss.backward() 67 | optimizer.step() 68 | 69 | losses.append(loss.data.tolist()[0]) 70 | if epoch % 10 == 0: 71 | print("Epoch : %d, mean_loss : %.02f" % (epoch , np.mean(losses))) 72 | Utils.save_model(model, epoch, 'model') 73 | losses = [] 74 | Utils.save_model(model, epoch, 'model') 75 | 76 | data = list(load_data()) 77 | corpus = Corpus(data) 78 | if args.train or args.retrain: 79 | train(corpus) 80 | elif args.test: 81 | word = input('Input word> ') 82 | print(test(word, corpus)) 
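Note: the driver above and the skipgram.py/dataset.py files below implement skip-gram with negative sampling (SGNS). For reference, here is a minimal, self-contained PyTorch sketch of that objective; the vocabulary size, embedding dimension, batch size, and the randomly drawn word indices are toy values invented purely for illustration and are not taken from this repository.

import torch
import torch.nn.functional as F

torch.manual_seed(0)

VOCAB, DIM, K, B = 50, 8, 5, 4        # toy vocabulary size, embedding dim, negatives per pair, batch size
V = torch.nn.Embedding(VOCAB, DIM)    # center-word ("input") embeddings
U = torch.nn.Embedding(VOCAB, DIM)    # context-word ("output") embeddings

def sgns_loss(center, context, negatives):
    # center: [B], context: [B], negatives: [B, K] -- integer word indices
    v = V(center)                                             # [B, D]
    u_pos = U(context)                                        # [B, D]
    u_neg = U(negatives)                                      # [B, K, D]
    pos_score = (v * u_pos).sum(-1)                           # [B]
    neg_score = torch.bmm(u_neg, v.unsqueeze(2)).squeeze(2)   # [B, K]
    # SGNS: maximize log sigmoid(u_o . v_c) + sum_k log sigmoid(-u_k . v_c), so minimize the negation
    return -(F.logsigmoid(pos_score) + F.logsigmoid(-neg_score).sum(-1)).mean()

center = torch.randint(0, VOCAB, (B,))
context = torch.randint(0, VOCAB, (B,))
negatives = torch.randint(0, VOCAB, (B, K))
loss = sgns_loss(center, context, negatives)
loss.backward()                                               # gradients flow into both embedding tables
print(loss.item())

The point of the sampled term is that each update touches only the observed context word plus NEG sampled words, so the cost per training pair is proportional to NEG rather than to the vocabulary size that the full-softmax variants in the sibling directories pay.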
-------------------------------------------------------------------------------- /word2vec/skipgram/pytorch/negative_sampling/skipgram.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # skipgram # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | class Skipgram(nn.Module): 16 | def __init__(self, input_size, projection_size): 17 | super(Skipgram, self).__init__() 18 | self.V = nn.Embedding(input_size, projection_size) 19 | self.U = nn.Embedding(input_size, projection_size) 20 | self.logsigmoid = nn.LogSigmoid() 21 | 22 | initrange = (2.0 / (input_size + projection_size))**5 23 | self.V.weight.data.uniform_(-initrange, initrange) 24 | self.U.weight.data.uniform_(-0.0, 0.0) # zero 25 | 26 | def forward(self, center_words, target_words, neg_words): 27 | v = self.V(center_words) # batch_size x 1 x projection_size 28 | u = self.U(target_words) # batch_size x 1 x projection_size 29 | u_neg = -self.U(neg_words) 30 | 31 | pos_score = u.bmm(v.transpose(1, 2)).squeeze(2) # batch_size x 1 32 | neg_score = torch.sum(u_neg.bmm(v.transpose(1, 2)).squeeze(2), 1).view(neg_words.size(0), -1) # batch_size x input_size 33 | 34 | return self.loss(pos_score, neg_score) 35 | 36 | def loss(self, pos_score, neg_score): 37 | loss = self.logsigmoid(pos_score) + self.logsigmoid(neg_score) 38 | return -torch.mean(loss) 39 | 40 | def pred(self, inp): 41 | return self.V(inp) 42 | 43 | -------------------------------------------------------------------------------- /word2vec/skipgram/pytorch/negative_sampling/utils.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # model utils # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | import torch 12 | import os, glob 13 | import numpy as np 14 | 15 | class Utils(object): 16 | 17 | @staticmethod 18 | def save_model(model, epoch, save_dir, max_keep=5): 19 | if not os.path.exists(save_dir): 20 | os.makedirs(save_dir) 21 | f_list = glob.glob(os.path.join(save_dir, 'model') + '_*.ckpt') 22 | if len(f_list) >= max_keep + 2: 23 | epoch_list = [int(i.split('_')[-1].split('.')[0]) for i in f_list] 24 | to_delete = [f_list[i] for i in np.argsort(epoch_list)[-max_keep:]] 25 | for f in to_delete: 26 | os.remove(f) 27 | name = 'model_{}.ckpt'.format(epoch) 28 | file_path = os.path.join(save_dir, name) 29 | #torch.save(model.state_dict(), file_path) 30 | torch.save(model, file_path) 31 | 32 | @staticmethod 33 | def load_previous_model(save_dir): 34 | if not os.path.exists(save_dir): 35 | os.makedirs(save_dir) 36 | f_list = glob.glob(os.path.join(save_dir, 'model') + '_*.ckpt') 37 | start_epoch = 1 38 | model = None 39 | if len(f_list) >= 1: 40 | epoch_list = [int(i.split('_')[-1].split('.')[0]) for i in f_list] 41 | last_checkpoint = f_list[np.argmax(epoch_list)] 42 | if os.path.exists(last_checkpoint): 43 | #print('load from {}'.format(last_checkpoint)) 44 | # CNN 不支持参数保存 45 | #model.load_state_dict(torch.load(last_checkpoint)) 46 | model = torch.load(last_checkpoint) 47 | 
start_epoch = np.max(epoch_list) 48 | return model, start_epoch 49 | 50 | @staticmethod 51 | def remove_models(save_dir): 52 | f_list = glob.glob(os.path.join(save_dir, 'model') + '_*.ckpt') 53 | f_list.append(os.path.join(save_dir, 'param.pkl')) 54 | f_list.append(os.path.join(save_dir, 'log.txt')) 55 | for filename in f_list: 56 | try: 57 | os.remove(filename) 58 | except: 59 | pass 60 | -------------------------------------------------------------------------------- /word2vec/skipgram/pytorch/softmax/const.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # const.python # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | S_TOKEN = '' # start token 12 | E_TOKEN = '' # end token 13 | U_TOKEN = '' # unknown token 14 | D_TOKEN = '' # dummy token 15 | 16 | WIN_SIZE = 5 # window size 17 | 18 | # nnwork 19 | EMBEDDING_SIZE = 30 20 | BATCH_SIZE = 256 21 | EPOCH = 1000 22 | LR_RATE = 0.001 23 | -------------------------------------------------------------------------------- /word2vec/skipgram/pytorch/softmax/dataset.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # dataset.py # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import const 11 | import re 12 | import random 13 | import nltk 14 | import jieba 15 | import torch 16 | from torch.autograd import Variable 17 | from collections import defaultdict, Counter 18 | 19 | 20 | if torch.cuda.is_available(): 21 | FloatTensor = torch.cuda.FloatTensor 22 | LongTensor = torch.cuda.LongTensor 23 | ByteTensor = torch.cuda.ByteTensor 24 | 25 | def rm_sign(string): 26 | string = re.sub("[\.\!_,\$\(\)\"\'\]\[!!\?,。?、~@#¥……&]+", "", string) 27 | return string 28 | 29 | def load_data(corpus_dir = '../../../corpus/articles.txt'): 30 | with open(corpus_dir, 'r') as f: 31 | for line in f: 32 | line = line.strip() 33 | if len(line) == 0: 34 | continue 35 | yield jieba.lcut(rm_sign(line)) 36 | 37 | class Corpus(object): 38 | def __init__(self, data): 39 | self.vocab = self.get_vocab(data) 40 | self.windows = [] 41 | self.vocab.append(const.U_TOKEN) 42 | self.word2idx = {} 43 | self.idx2word = {} 44 | self.n_words = 0 45 | 46 | for word in self.vocab: 47 | if word not in self.word2idx: 48 | self.word2idx[word] = self.n_words 49 | self.idx2word[self.n_words] = word 50 | self.n_words += 1 51 | 52 | for sentence in data: 53 | # n-gram 54 | self.windows.extend(\ 55 | list(\ 56 | nltk.ngrams([const.D_TOKEN]*const.WIN_SIZE+sentence+[const.D_TOKEN]*const.WIN_SIZE, const.WIN_SIZE*2+1)\ 57 | )\ 58 | ) 59 | 60 | dataset = [] 61 | for window in self.windows: 62 | for i in range(const.WIN_SIZE*2+1): 63 | if i == const.WIN_SIZE or window[i] == const.D_TOKEN: 64 | continue 65 | dataset.append((window[const.WIN_SIZE], window[i])) 66 | X_p, y_p = [], [] 67 | for d in dataset: 68 | X_p.append(self.var_word(d[0]).view(1,-1)) 69 | y_p.append(self.var_word(d[1]).view(1,-1)) 70 | self.dataset = list(zip(X_p, y_p)) 71 | 72 | def get_vocab(self, data): 73 | # [[]] -> [] 74 | flatten = lambda l: [item.lower() for sublist in l for item in sublist] 75 | word_count = Counter(flatten(data)) 76 | border = 
int(len(word_count)*0.01) 77 | stopwords = word_count.most_common()[:border]+list(reversed(word_count.most_common()))[:border] 78 | stopwords = [s[0] for s in stopwords] 79 | vocab = list(set(flatten(data))-set(stopwords)) 80 | return vocab 81 | 82 | # @return batch data 83 | # @generator 84 | def batch_data(self, batch_size): 85 | random.shuffle(self.dataset) 86 | sidx = 0 # start index 87 | eidx = batch_size # end index 88 | while eidx < len(self.dataset): 89 | batch = self.dataset[sidx:eidx] 90 | sidx = eidx 91 | eidx += batch_size 92 | yield batch 93 | 94 | if eidx >= len(self.dataset): 95 | batch = self.dataset[sidx: ] 96 | yield batch 97 | 98 | # @input sentence [w1, w2, ... , wn] 99 | # @return Variable 100 | def var_sentence(self, sentence): 101 | idxs = list(map(lambda w: self.word2idx[w] if w in self.word2idx.keys() \ 102 | else self.word2idx[const.U_TOKEN], sentence)) 103 | return Variable(torch.LongTensor(idxs)) 104 | 105 | # @input word 106 | # @return Variable 107 | def var_word(self, word): 108 | return Variable(torch.LongTensor([self.word2idx[word]]) if word in self.word2idx.keys() \ 109 | else torch.LongTensor([self.word2idx[const.U_TOKEN]])) 110 | -------------------------------------------------------------------------------- /word2vec/skipgram/pytorch/softmax/main.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # main.py # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | 12 | import argparse 13 | parser = argparse.ArgumentParser(description='main.py') 14 | parser.add_argument('-train', action='store_true', default=False, help='train model') 15 | parser.add_argument('-retrain', action='store_true', default=False, help='train model') 16 | parser.add_argument('-test', action='store_true', default=False, help='test model') 17 | args = parser.parse_args() 18 | 19 | import const 20 | import numpy as np 21 | import torch 22 | import torch.optim as optim 23 | import torch.nn.functional as F 24 | from dataset import Corpus, load_data 25 | from skipgram import Skipgram 26 | from utils import Utils 27 | 28 | def test(word, corpus, k=10): 29 | vocab = corpus.vocab 30 | model,_ = Utils.load_previous_model('model') 31 | target_V = model.pred(corpus.var_word(word)) 32 | scores=[] 33 | for i in range(len(vocab)): 34 | if vocab[i] == word or vocab[i] == const.U_TOKEN: 35 | continue 36 | vector = model.pred(corpus.var_word(list(vocab)[i])) 37 | cosine_sim = F.cosine_similarity(target_V, vector).data.tolist()[0] 38 | scores.append([vocab[i],cosine_sim]) 39 | return sorted(scores, key=lambda x: x[1], reverse=True)[:k] # sort by similarity 40 | 41 | def train(corpus): 42 | if args.retrain: 43 | Utils.remove_models('model') 44 | 45 | losses = [] 46 | 47 | start_epoch = 0 48 | model, start_epoch = Utils.load_previous_model('model') 49 | if model == None: 50 | model = Skipgram(corpus.n_words, const.EMBEDDING_SIZE) 51 | 52 | if torch.cuda.is_available(): 53 | model.cuda() 54 | optimizer = optim.Adam(model.parameters(), const.LR_RATE) 55 | 56 | for epoch in range(start_epoch, const.EPOCH): 57 | for i, batch in enumerate(corpus.batch_data(const.BATCH_SIZE)): 58 | inputs, targets = zip(*batch) # unzip 59 | inputs = torch.cat(inputs) 60 | targets = torch.cat(targets) 61 | vocabs = 
corpus.var_sentence(corpus.vocab).expand(inputs.size(0), corpus.n_words) 62 | #print(inputs.size(), targets.size(), vocabs.size()) 63 | #exit() 64 | 65 | model.zero_grad() 66 | loss = model(inputs, targets, vocabs) 67 | loss.backward() 68 | optimizer.step() 69 | 70 | losses.append(loss.data.tolist()[0]) 71 | if epoch % 10 == 0: 72 | print("Epoch : %d, mean_loss : %.02f" % (epoch , np.mean(losses))) 73 | Utils.save_model(model, epoch, 'model') 74 | losses = [] 75 | Utils.save_model(model, epoch, 'model') 76 | 77 | data = list(load_data()) 78 | corpus = Corpus(data) 79 | if args.train or args.retrain: 80 | train(corpus) 81 | elif args.test: 82 | word = input('Input word> ') 83 | print(test(word, corpus)) -------------------------------------------------------------------------------- /word2vec/skipgram/pytorch/softmax/skipgram.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # skipgram # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | class Skipgram(nn.Module): 16 | def __init__(self, input_size, projection_size): 17 | super(Skipgram, self).__init__() 18 | self.V = nn.Embedding(input_size, projection_size) 19 | self.U = nn.Embedding(input_size, projection_size) 20 | 21 | self.V.weight.data.uniform_(-1.0, 1.0) 22 | self.U.weight.data.uniform_(0.0, 0.0) # zero 23 | 24 | def forward(self, center_words, target_words, out_words): 25 | v = self.V(center_words) # batch_size x 1 x projection_size 26 | u = self.U(target_words) # batch_size x 1 x projection_size 27 | u_actual = self.U(out_words) # batch_size x input_size x projection_size 28 | 29 | scores = u.bmm(v.transpose(1, 2)).squeeze(2) # batch_size x 1 30 | norm_scores = u_actual.bmm(v.transpose(1, 2)).squeeze(2) # batch_size x input_size 31 | return self.nll_loss(scores, norm_scores) 32 | 33 | def nll_loss(self, scores, norm_scores): 34 | # full softmax: p(o|c) = exp(u_o . v_c) / sum_w exp(u_w . v_c) 35 | softmax = torch.exp(scores)/torch.sum(torch.exp(norm_scores),1).unsqueeze(1) 36 | return -torch.mean(torch.log(softmax)) 37 | 38 | def pred(self, inp): 39 | return self.V(inp) 40 | 41 | -------------------------------------------------------------------------------- /word2vec/skipgram/pytorch/softmax/utils.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # model utils # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | import torch 12 | import os, glob 13 | import numpy as np 14 | 15 | class Utils(object): 16 | 17 | @staticmethod 18 | def save_model(model, epoch, save_dir, max_keep=5): 19 | if not os.path.exists(save_dir): 20 | os.makedirs(save_dir) 21 | f_list = glob.glob(os.path.join(save_dir, 'model') + '_*.ckpt') 22 | if len(f_list) >= max_keep + 2: 23 | epoch_list = [int(i.split('_')[-1].split('.')[0]) for i in f_list] 24 | to_delete = [f_list[i] for i in np.argsort(epoch_list)[-max_keep:]] 25 | for f in to_delete: 26 | os.remove(f) 27 | name = 'model_{}.ckpt'.format(epoch) 28 | file_path = os.path.join(save_dir, name) 29 | #torch.save(model.state_dict(), file_path) 30 | 
torch.save(model, file_path) 31 | 32 | @staticmethod 33 | def load_previous_model(save_dir): 34 | if not os.path.exists(save_dir): 35 | os.makedirs(save_dir) 36 | f_list = glob.glob(os.path.join(save_dir, 'model') + '_*.ckpt') 37 | start_epoch = 1 38 | model = None 39 | if len(f_list) >= 1: 40 | epoch_list = [int(i.split('_')[-1].split('.')[0]) for i in f_list] 41 | last_checkpoint = f_list[np.argmax(epoch_list)] 42 | if os.path.exists(last_checkpoint): 43 | #print('load from {}'.format(last_checkpoint)) 44 | # CNN 不支持参数保存 45 | #model.load_state_dict(torch.load(last_checkpoint)) 46 | model = torch.load(last_checkpoint) 47 | start_epoch = np.max(epoch_list) 48 | return model, start_epoch 49 | 50 | @staticmethod 51 | def remove_models(save_dir): 52 | f_list = glob.glob(os.path.join(save_dir, 'model') + '_*.ckpt') 53 | f_list.append(os.path.join(save_dir, 'param.pkl')) 54 | f_list.append(os.path.join(save_dir, 'log.txt')) 55 | for filename in f_list: 56 | try: 57 | os.remove(filename) 58 | except: 59 | pass 60 | -------------------------------------------------------------------------------- /word2vec/skipgram/tensorflow/negative_sampling/const.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # const.python # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | S_TOKEN = '' # start token 12 | E_TOKEN = '' # end token 13 | U_TOKEN = '' # unknown token 14 | D_TOKEN = '' # dummy token 15 | 16 | WIN_SIZE = 5 # window size 17 | 18 | # nnwork 19 | EMBEDDING_SIZE = 100 20 | BATCH_SIZE = 128 21 | EPOCH = 100 22 | LR_RATE = 0.001 23 | 24 | MODEL_PATH = './model/word2vec.bin' 25 | -------------------------------------------------------------------------------- /word2vec/skipgram/tensorflow/negative_sampling/dataset.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # dataset.py # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import const 11 | import re 12 | import random 13 | import numpy as np 14 | import nltk 15 | import jieba 16 | from collections import defaultdict, Counter 17 | 18 | def rm_sign(string): 19 | string = re.sub("[\.\!_,\$\(\)\"\'\]\[!!\?,。?、~@#¥……&]+", "", string) 20 | return string 21 | 22 | def load_data(corpus_dir = '../../../corpus/articles.txt'): 23 | with open(corpus_dir, 'r') as f: 24 | for line in f: 25 | line = line.strip() 26 | if len(line) == 0: 27 | continue 28 | yield jieba.lcut(rm_sign(line)) 29 | 30 | class Corpus(object): 31 | def __init__(self, data): 32 | self.vocab = self.get_vocab(data) 33 | self.windows = [] 34 | self.vocab.append(const.U_TOKEN) 35 | self.word2idx = {} 36 | self.idx2word = {} 37 | self.n_words = 0 38 | 39 | for word in self.vocab: 40 | if word not in self.word2idx: 41 | self.word2idx[word] = self.n_words 42 | self.idx2word[self.n_words] = word 43 | self.n_words += 1 44 | 45 | for sentence in data: 46 | # n-gram 47 | self.windows.extend(\ 48 | list(\ 49 | nltk.ngrams([const.D_TOKEN]*const.WIN_SIZE+sentence+[const.D_TOKEN]*const.WIN_SIZE, const.WIN_SIZE*2+1)\ 50 | )\ 51 | ) 52 | 53 | dataset = [] 54 | for window in self.windows: 55 | for i in range(const.WIN_SIZE*2+1): 56 | 
if i == const.WIN_SIZE or window[i] == const.D_TOKEN: 57 | continue 58 | dataset.append((window[const.WIN_SIZE], window[i])) 59 | X_p, y_p = [], [] 60 | for d in dataset: 61 | X_p.append(self.var_word(d[0])) 62 | y_p.append(self.var_word(d[1])) 63 | self.dataset = list(zip(X_p, y_p)) 64 | 65 | def get_vocab(self, data): 66 | # [[]] -> [] 67 | flatten = lambda l: [item.lower() for sublist in l for item in sublist] 68 | word_count = Counter(flatten(data)) 69 | border = int(len(word_count)*0.01) 70 | stopwords = word_count.most_common()[:border]+list(reversed(word_count.most_common()))[:border] 71 | stopwords = [s[0] for s in stopwords] 72 | vocab = list(set(flatten(data))-set(stopwords)) 73 | return vocab 74 | 75 | # @return batch data 76 | # @generator 77 | def batch_data(self, batch_size): 78 | random.shuffle(self.dataset) 79 | sidx = 0 # start index 80 | eidx = batch_size # end index 81 | while eidx < len(self.dataset): 82 | batch = self.dataset[sidx:eidx] 83 | sidx = eidx 84 | eidx += batch_size 85 | yield batch 86 | 87 | if eidx >= len(self.dataset): 88 | batch = self.dataset[sidx: ] 89 | diff = eidx - len(self.dataset) 90 | inps, targets = zip(*batch) # unzip 91 | inps = list(inps) 92 | targets = list(targets) 93 | diff_vec = [self.word2idx[const.U_TOKEN]]*diff 94 | inps = inps + diff_vec 95 | targets = targets + diff_vec 96 | inps = tuple(inps) 97 | targets = tuple(targets) 98 | batch = zip(inps, targets) 99 | yield batch 100 | 101 | # @input sentence [w1, w2, ... , wn] 102 | def var_sentence(self, sentence): 103 | idxs = list(map(lambda w: self.word2idx[w] if w in self.word2idx.keys() \ 104 | else self.word2idx[const.U_TOKEN], sentence)) 105 | return idxs 106 | 107 | # @input word 108 | def var_word(self, word): 109 | idx = [self.word2idx[const.U_TOKEN]] 110 | if word in self.word2idx: 111 | idx = [self.word2idx[word]] 112 | return idx 113 | -------------------------------------------------------------------------------- /word2vec/skipgram/tensorflow/negative_sampling/main.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # main.py # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import argparse 11 | parser = argparse.ArgumentParser(description='main.py') 12 | parser.add_argument('-train', action='store_true', default=False, help='train model') 13 | parser.add_argument('-test', action='store_true', default=False, help='test model') 14 | args = parser.parse_args() 15 | 16 | from dataset import Corpus, load_data 17 | from skipgram import Skipgram 18 | 19 | if __name__ == '__main__': 20 | 21 | data = list(load_data()) 22 | corpus = Corpus(data) 23 | skipgram = Skipgram(corpus) 24 | 25 | if args.train: 26 | skipgram.train() 27 | elif args.test: 28 | word = input('Input word> ') 29 | print(skipgram.test(word)) -------------------------------------------------------------------------------- /word2vec/skipgram/tensorflow/negative_sampling/skipgram.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # skipgram # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | import const 12 | import math 13 | import numpy as np 14 
| import tensorflow as tf 15 | 16 | class Skipgram(object): 17 | def __init__(self, corpus): 18 | self.corpus = corpus 19 | 20 | def test(self, word, k=10): 21 | Weight = tf.Variable(tf.random_normal([self.corpus.n_words, const.EMBEDDING_SIZE], -1.0, 1.0)) 22 | inputs = tf.placeholder(tf.int32, [None]) 23 | embed = tf.nn.embedding_lookup(Weight, inputs) 24 | 25 | # cosine 26 | test_embed = tf.placeholder(tf.float32, [None]) 27 | test_input = tf.placeholder(tf.float32, [None]) 28 | normed_embed = tf.nn.l2_normalize(test_embed, dim=0) 29 | normed_array = tf.nn.l2_normalize(test_input, dim=0) 30 | cosine_similarity = tf.reduce_sum(tf.multiply(normed_array, normed_embed)) 31 | 32 | with tf.Session() as sess: 33 | tf.global_variables_initializer().run() 34 | #restore model 35 | tf.train.Saver().restore(sess, const.MODEL_PATH) 36 | 37 | vectors = sess.run(embed, feed_dict={inputs: range(self.corpus.n_words)}) 38 | vocab = self.corpus.vocab 39 | idx = self.corpus.var_word(word) 40 | scores = [] 41 | for i in range(len(vocab)): 42 | if vocab[i] == word or vocab[i] == const.U_TOKEN: 43 | continue 44 | vec_a = vectors[i].reshape([-1]) 45 | vec_b = vectors[idx].reshape([-1]) 46 | cosine_sim = sess.run(cosine_similarity, feed_dict={test_embed: vec_a, test_input: vec_b}) 47 | scores.append([vocab[i], cosine_sim]) #calculates cosine similarity 48 | return sorted(scores, key=lambda x: x[1], reverse=True)[:k] 49 | 50 | def train(self): 51 | Weight = tf.Variable(tf.truncated_normal([self.corpus.n_words, const.EMBEDDING_SIZE], stddev=1.0/math.sqrt(const.EMBEDDING_SIZE))) 52 | bias = tf.Variable(tf.zeros([self.corpus.n_words])) 53 | 54 | inputs = tf.placeholder(tf.int32, [const.BATCH_SIZE]) 55 | outputs = tf.placeholder(tf.int32, [const.BATCH_SIZE, 1]) 56 | embed = tf.nn.embedding_lookup(tf.random_normal([self.corpus.n_words, const.EMBEDDING_SIZE], -1.0, 1.0), inputs) 57 | 58 | loss = tf.reduce_mean(tf.nn.sampled_softmax_loss(Weight, bias, outputs, embed, 3, self.corpus.n_words)) # negative sampling 59 | optimizer = tf.train.AdamOptimizer(learning_rate=const.LR_RATE).minimize(loss) 60 | 61 | saver = tf.train.Saver() 62 | 63 | losses = [] 64 | with tf.Session() as sess: 65 | tf.global_variables_initializer().run() 66 | 67 | for epoch in range(const.EPOCH): 68 | for i, batch in enumerate(self.corpus.batch_data(const.BATCH_SIZE)): 69 | inps, targets = zip(*batch) # unzip 70 | inps = np.hstack(inps) # (2, ) 71 | targets = np.vstack(targets) # (2, 1) 72 | #print(inps.shape, targets.shape) 73 | _, _loss = sess.run([optimizer, loss], feed_dict={inputs:inps, outputs:targets}) 74 | 75 | losses.append(_loss) 76 | if epoch % 10 == 0: 77 | print('epoch, ', epoch, 'mean loss', np.mean(losses)) 78 | losses= [] 79 | 80 | # save model 81 | saver.save(sess, const.MODEL_PATH) -------------------------------------------------------------------------------- /word2vec/skipgram/tensorflow/softmax/const.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # const.python # 6 | # author: sean lee # 7 | # locate: Shanxi university, Taiyuan, China # 8 | # email: lxm_0828@163.com # 9 | #--------------------------------------------# 10 | 11 | S_TOKEN = '' # start token 12 | E_TOKEN = '' # end token 13 | U_TOKEN = '' # unknown token 14 | D_TOKEN = '' # dummy token 15 | 16 | WIN_SIZE = 5 # window size 17 | 18 | # nnwork 19 | EMBEDDING_SIZE = 30 20 | BATCH_SIZE = 128 21 | EPOCH = 1000 22 | 
LR_RATE = 0.001 23 | 24 | MODEL_PATH = './model/word2vec.bin' 25 | -------------------------------------------------------------------------------- /word2vec/skipgram/tensorflow/softmax/dataset.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # dataset.py # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import const 11 | import re 12 | import random 13 | import numpy as np 14 | import nltk 15 | import jieba 16 | from collections import defaultdict, Counter 17 | 18 | def rm_sign(string): 19 | string = re.sub("[\.\!_,\$\(\)\"\'\]\[!!\?,。?、~@#¥……&]+", "", string) 20 | return string 21 | 22 | def load_data(corpus_dir = '../../../corpus/articles.txt'): 23 | with open(corpus_dir, 'r') as f: 24 | for line in f: 25 | line = line.strip() 26 | if len(line) == 0: 27 | continue 28 | yield jieba.lcut(rm_sign(line)) 29 | 30 | class Corpus(object): 31 | def __init__(self, data): 32 | self.vocab = self.get_vocab(data) 33 | self.windows = [] 34 | self.vocab.append(const.U_TOKEN) 35 | self.word2idx = {} 36 | self.idx2word = {} 37 | self.n_words = 0 38 | 39 | for word in self.vocab: 40 | if word not in self.word2idx: 41 | self.word2idx[word] = self.n_words 42 | self.idx2word[self.n_words] = word 43 | self.n_words += 1 44 | 45 | for sentence in data: 46 | # n-gram 47 | self.windows.extend(\ 48 | list(\ 49 | nltk.ngrams([const.D_TOKEN]*const.WIN_SIZE+sentence+[const.D_TOKEN]*const.WIN_SIZE, const.WIN_SIZE*2+1)\ 50 | )\ 51 | ) 52 | 53 | dataset = [] 54 | for window in self.windows: 55 | for i in range(const.WIN_SIZE*2+1): 56 | if i == const.WIN_SIZE or window[i] == const.D_TOKEN: 57 | continue 58 | dataset.append((window[const.WIN_SIZE], window[i])) 59 | X_p, y_p = [], [] 60 | for d in dataset: 61 | X_p.append(self.var_word(d[0])) 62 | y_p.append(self.var_word(d[1])) 63 | self.dataset = list(zip(X_p, y_p)) 64 | 65 | def get_vocab(self, data): 66 | # [[]] -> [] 67 | flatten = lambda l: [item.lower() for sublist in l for item in sublist] 68 | word_count = Counter(flatten(data)) 69 | border = int(len(word_count)*0.01) 70 | stopwords = word_count.most_common()[:border]+list(reversed(word_count.most_common()))[:border] 71 | stopwords = [s[0] for s in stopwords] 72 | vocab = list(set(flatten(data))-set(stopwords)) 73 | return vocab 74 | 75 | # @return batch data 76 | # @generator 77 | def batch_data(self, batch_size): 78 | random.shuffle(self.dataset) 79 | sidx = 0 # start index 80 | eidx = batch_size # end index 81 | while eidx < len(self.dataset): 82 | batch = self.dataset[sidx:eidx] 83 | sidx = eidx 84 | eidx += batch_size 85 | yield batch 86 | 87 | if eidx >= len(self.dataset): 88 | batch = self.dataset[sidx: ] 89 | diff = eidx - len(self.dataset) 90 | inps, targets = zip(*batch) # unzip 91 | inps = list(inps) 92 | targets = list(targets) 93 | diff_vec = [self.word2idx[const.U_TOKEN]]*diff 94 | inps = inps + diff_vec 95 | targets = targets + diff_vec 96 | inps = tuple(inps) 97 | targets = tuple(targets) 98 | batch = zip(inps, targets) 99 | yield batch 100 | 101 | # @input sentence [w1, w2, ... 
, wn] 102 | def var_sentence(self, sentence): 103 | idxs = list(map(lambda w: self.word2idx[w] if w in self.word2idx.keys() \ 104 | else self.word2idx[const.U_TOKEN], sentence)) 105 | return idxs 106 | 107 | # @input word 108 | def var_word(self, word): 109 | idx = [self.word2idx[const.U_TOKEN]] 110 | if word in self.word2idx: 111 | idx = [self.word2idx[word]] 112 | return idx 113 | -------------------------------------------------------------------------------- /word2vec/skipgram/tensorflow/softmax/main.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # main.py # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import argparse 11 | parser = argparse.ArgumentParser(description='main.py') 12 | parser.add_argument('-train', action='store_true', default=False, help='train model') 13 | parser.add_argument('-test', action='store_true', default=False, help='test model') 14 | args = parser.parse_args() 15 | 16 | from dataset import Corpus, load_data 17 | from skipgram import Skipgram 18 | 19 | if __name__ == '__main__': 20 | 21 | data = list(load_data()) 22 | corpus = Corpus(data) 23 | skipgram = Skipgram(corpus) 24 | 25 | if args.train: 26 | skipgram.train() 27 | elif args.test: 28 | word = input('Input word> ') 29 | print(skipgram.test(word)) -------------------------------------------------------------------------------- /word2vec/skipgram/tensorflow/softmax/skipgram.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # -------------------------------------------# 5 | # skipgram # 6 | # author: sean lee # 7 | # email: lxm_0828@163.com # 8 | #--------------------------------------------# 9 | 10 | import const 11 | import math 12 | import numpy as np 13 | import tensorflow as tf 14 | 15 | class Skipgram(object): 16 | def __init__(self, corpus): 17 | self.corpus = corpus 18 | 19 | def test(self, word, k=10): 20 | Weight = tf.Variable(tf.random_normal([self.corpus.n_words, const.EMBEDDING_SIZE], -1.0, 1.0)) 21 | inputs = tf.placeholder(tf.int32, [None]) 22 | embed = tf.nn.embedding_lookup(Weight, inputs) 23 | 24 | # cosine 25 | test_embed = tf.placeholder(tf.float32, [None]) 26 | test_input = tf.placeholder(tf.float32, [None]) 27 | normed_embed = tf.nn.l2_normalize(test_embed, dim=0) 28 | normed_array = tf.nn.l2_normalize(test_input, dim=0) 29 | cosine_similarity = tf.reduce_sum(tf.multiply(normed_array, normed_embed)) 30 | 31 | with tf.Session() as sess: 32 | tf.global_variables_initializer().run() 33 | #restore model 34 | tf.train.Saver().restore(sess, const.MODEL_PATH) 35 | 36 | vectors = sess.run(embed, feed_dict={inputs: range(self.corpus.n_words)}) 37 | vocab = self.corpus.vocab 38 | idx = self.corpus.var_word(word) 39 | scores = [] 40 | for i in range(len(vocab)): 41 | if vocab[i] == word or vocab[i] == const.U_TOKEN: 42 | continue 43 | vec_a = vectors[i].reshape([-1]) 44 | vec_b = vectors[idx].reshape([-1]) 45 | cosine_sim = sess.run(cosine_similarity, feed_dict={test_embed: vec_a, test_input: vec_b}) 46 | scores.append([vocab[i], cosine_sim]) #cosine similarity 47 | return sorted(scores, key=lambda x: x[1], reverse=True)[:k] 48 | 49 | def train(self): 50 | Weight = tf.Variable(tf.truncated_normal([self.corpus.n_words, const.EMBEDDING_SIZE], 
stddev=1.0/math.sqrt(const.EMBEDDING_SIZE))) 51 | bias = tf.Variable(tf.random_normal([self.corpus.n_words])) 52 | 53 | inputs = tf.placeholder(tf.int32, [const.BATCH_SIZE, 1]) 54 | targets = tf.placeholder(tf.int32, [const.BATCH_SIZE, 1]) 55 | vocabs = tf.placeholder(tf.int32, [const.BATCH_SIZE, self.corpus.n_words]) 56 | 57 | embed_weight_v = tf.Variable(tf.random_normal([self.corpus.n_words, const.EMBEDDING_SIZE], -1.0, 1.0)) 58 | embed_weight_u = tf.Variable(tf.random_normal([self.corpus.n_words, const.EMBEDDING_SIZE], -1.0, 1.0)) 59 | embed_weight_actual = tf.Variable(tf.random_normal([self.corpus.n_words, const.EMBEDDING_SIZE], -1.0, 1.0)) 60 | embed_v = tf.nn.embedding_lookup(embed_weight_v, inputs) 61 | embed_u = tf.nn.embedding_lookup(embed_weight_u, targets) 62 | embed_actual = tf.nn.embedding_lookup(embed_weight_actual, vocabs) 63 | 64 | ''' 65 | print(embed_u.shape) 66 | print(embed_v.shape) 67 | print(embed_actual.shape) 68 | exit() 69 | ''' 70 | embed_v_trans = tf.transpose(embed_v, [0, 2, 1]) 71 | 72 | #print(embed_v_trans.shape) 73 | scores = tf.squeeze(tf.matmul(embed_u, embed_v_trans), [2]) # batch_size x 1 74 | norm_scores = tf.squeeze(tf.matmul(embed_actual, embed_v_trans), [2]) # batch_size x input_size 75 | 76 | softmax = tf.exp(scores) / tf.reduce_sum(tf.exp(norm_scores), 1) 77 | softmax = tf.expand_dims(softmax, 1) 78 | nll_loss = -tf.reduce_mean(tf.log(tf.clip_by_value(softmax,1e-10,1.0))) 79 | 80 | optimizer = tf.train.AdamOptimizer(learning_rate=const.LR_RATE).minimize(nll_loss) 81 | 82 | saver = tf.train.Saver() 83 | 84 | losses = [] 85 | with tf.Session() as sess: 86 | tf.global_variables_initializer().run() 87 | 88 | for epoch in range(const.EPOCH): 89 | for i, batch in enumerate(self.corpus.batch_data(const.BATCH_SIZE)): 90 | 91 | _inputs, _targets = zip(*batch) # unzip 92 | 93 | _inputs = np.hstack(_inputs) # (2, ) 94 | _inputs = _inputs.reshape(_inputs.shape[0], 1) 95 | _targets = np.vstack(_targets) # (2, 1) 96 | 97 | vocab = self.corpus.var_sentence(self.corpus.vocab) 98 | _vocabs = [] 99 | [_vocabs.append(vocab) for _ in range(_inputs.shape[0])] 100 | _vocabs = np.array(_vocabs) 101 | 102 | _, _loss = sess.run([optimizer, nll_loss], feed_dict={inputs:_inputs, targets:_targets, vocabs: _vocabs}) 103 | losses.append(_loss) 104 | if i % 500 == 0: 105 | print('i, ', i, 'loss', _loss) 106 | 107 | if epoch % 10 == 0: 108 | print('epoch, ', epoch, 'mean loss', np.mean(losses)) 109 | losses = [] 110 | 111 | # save model 112 | saver.save(sess, const.MODEL_PATH) --------------------------------------------------------------------------------
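The TensorFlow graph above computes a full softmax: the score u_target · v_center is normalized against the scores of every word in the vocabulary, which is why each step feeds a [BATCH_SIZE, n_words] vocabs matrix. A minimal NumPy sketch of that objective, with toy sizes and randomly drawn indices invented purely for illustration (not taken from this repository), looks like this:

import numpy as np

rng = np.random.default_rng(0)

VOCAB, DIM, B = 50, 8, 4                          # toy vocabulary size, embedding dim, batch size
V = rng.normal(scale=0.1, size=(VOCAB, DIM))      # center-word embeddings
U = rng.normal(scale=0.1, size=(VOCAB, DIM))      # context-word embeddings

def full_softmax_nll(center_ids, context_ids):
    # p(context | center) = exp(u_context . v_center) / sum_w exp(u_w . v_center)
    v = V[center_ids]                              # [B, D]
    scores = v @ U.T                               # [B, VOCAB]
    scores -= scores.max(axis=1, keepdims=True)    # stabilize the exponentials
    log_z = np.log(np.exp(scores).sum(axis=1))     # [B] log partition over the whole vocabulary
    return float(np.mean(log_z - scores[np.arange(len(center_ids)), context_ids]))

centers = rng.integers(0, VOCAB, size=B)
contexts = rng.integers(0, VOCAB, size=B)
print(full_softmax_nll(centers, contexts))

Because the partition function sums over the whole vocabulary, the per-example cost grows with n_words; the negative-sampling and sampled-softmax variants elsewhere under word2vec/ approximate this denominator with a handful of sampled words instead.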