├── README.md
└── Generate_japanese_text.ipynb


/README.md:
--------------------------------------------------------------------------------
1 | 　ipynbファイルをクリックするとJupyter notebook形式のファイルが表示されます。先頭にある「Open in Colab」ボタンを押すと、Google Colab上で実行できます。使用環境は、Google Colab が動作すれば、どんなものでも構いません。
2 | 
3 | # Generate_japanese_text.ipynb
4 | 　今年の４月、「ゼロから作るDeep Learning3 フレームワーク編」が発売されました。\
5 | ゼロ作は１・２と読んでいて大変勉強になったので、今度はフレームワーク編に挑戦することにしました。\
6 | ということで最近本を購入したのですが、１ステップづつ勉強を始める前に、まずフレームワークの全体像をザッと知るために、とりあえずコードを書いてみようと思いました。\
7 | Githubにある [DeZero](https://github.com/oreilly-japan/deep-learning-from-scratch-3) のライブラリー と example を参考に、本をパラパラ見ながら自然言語処理の簡単なコードを google colab で書いてみたので、備忘録として残します。詳細は、[Qiita](https://qiita.com/jun40vn/items/18ae6a89dba9c312e8c6) を参照下さい。
8 | 


--------------------------------------------------------------------------------
/Generate_japanese_text.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "nbformat": 4,
  3 |   "nbformat_minor": 0,
  4 |   "metadata": {
  5 |     "colab": {
  6 |       "name": "DeZero_generate_japanese_text",
  7 |       "provenance": [],
  8 |       "include_colab_link": true
  9 |     },
 10 |     "kernelspec": {
 11 |       "name": "python3",
 12 |       "display_name": "Python 3"
 13 |     },
 14 |     "accelerator": "GPU"
 15 |   },
 16 |   "cells": [
 17 |     {
 18 |       "cell_type": "markdown",
 19 |       "metadata": {
 20 |         "id": "view-in-github",
 21 |         "colab_type": "text"
 22 |       },
 23 |       "source": [
 24 |         "<a href=\"https://colab.research.google.com/github/cedro3/DeZero_generate_japanese_text/blob/master/DeZero_generate_japanese_text.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
 25 |       ]
 26 |     },
 27 |     {
 28 |       "cell_type": "markdown",
 29 |       "metadata": {
 30 |         "id": "ZU6uMMgamNUp",
 31 |         "colab_type": "text"
 32 |       },
 33 |       "source": [
 34 |         "フレームワーク **DeZero** のインストール"
 35 |       ]
 36 |     },
 37 |     {
 38 |       "cell_type": "code",
 39 |       "metadata": {
 40 |         "id": "V9d_aWXHcNhF",
 41 |         "colab_type": "code",
 42 |         "colab": {}
 43 |       },
 44 |       "source": [
 45 |         "!pip install dezero"
 46 |       ],
 47 |       "execution_count": null,
 48 |       "outputs": []
 49 |     },
 50 |     {
 51 |       "cell_type": "markdown",
 52 |       "metadata": {
 53 |         "id": "q3i1Z2blmVtG",
 54 |         "colab_type": "text"
 55 |       },
 56 |       "source": [
 57 |         "形態素分析ライブラリー **janome** のインストール"
 58 |       ]
 59 |     },
 60 |     {
 61 |       "cell_type": "code",
 62 |       "metadata": {
 63 |         "id": "WFHkiEghA8qT",
 64 |         "colab_type": "code",
 65 |         "colab": {}
 66 |       },
 67 |       "source": [
 68 |         "!pip install janome"
 69 |       ],
 70 |       "execution_count": null,
 71 |       "outputs": []
 72 |     },
 73 |     {
 74 |       "cell_type": "markdown",
 75 |       "metadata": {
 76 |         "id": "oa4uTXHmmljw",
 77 |         "colab_type": "text"
 78 |       },
 79 |       "source": [
 80 |         "日本語データセット **Nekoクラス**"
 81 |       ]
 82 |     },
 83 |     {
 84 |       "cell_type": "code",
 85 |       "metadata": {
 86 |         "id": "gu38qd0ya-Sb",
 87 |         "colab_type": "code",
 88 |         "colab": {}
 89 |       },
 90 |       "source": [
 91 |         "import numpy as np\n",
 92 |         "import dezero\n",
 93 |         "from dezero.datasets import Dataset\n",
 94 |         "from dezero.utils import get_file, cache_dir\n",
 95 |         "import zipfile\n",
 96 |         "import re\n",
 97 |         "from janome.tokenizer import Tokenizer\n",
 98 |         "\n",
 99 |         "class Neko(Dataset):\n",
100 |         "    \n",
101 |         "    def prepare(self):\n",
102 |         "        url = 'https://www.aozora.gr.jp/cards/000148/files/789_ruby_5639.zip'\n",
103 |         "        file = get_file(url)  \n",
104 |         "        data = self.unzip(cache_dir + '/' + '789_ruby_5639.zip')  \n",
105 |         "        self.text = self.preprocess(cache_dir + '/' + 'wagahaiwa_nekodearu.txt')\n",
106 |         "        self.wakati = self.keitaiso(self.text)\n",
107 |         "        self.corpus, self.word_to_id, self.id_to_word = self.process(self.wakati)\n",
108 |         "        self.data = np.array(self.corpus[:-1])\n",
109 |         "        self.label = np.array(self.corpus[1:])\n",
110 |         "    \n",
111 |         "    def unzip(self, file_path):\n",
112 |         "        with zipfile.ZipFile(file_path) as existing_zip:\n",
113 |         "            existing_zip.extractall(cache_dir)\n",
114 |         "            \n",
115 |         "    def preprocess(self, file_path):\n",
116 |         "        binarydata = open(file_path, 'rb').read()\n",
117 |         "        text = binarydata.decode('shift_jis')        \n",
118 |         "                   \n",
119 |         "        text = re.split(r'\\-{5,}', text)[2]  # ヘッダの削除\n",
120 |         "        text = re.split('底本：',text)[0]   # フッタの削除\n",
121 |         "        text = re.sub('｜', '', text)  # | の削除\n",
122 |         "        text = re.sub('［.+?］', '', text)  # 入力注の削除\n",
123 |         "        text = re.sub(r'《.+?》', '', text)  # ルビの削除\n",
124 |         "        text = re.sub(r'\\u3000', '', text)  # 空白の削除\n",
125 |         "        text = re.sub(r'\\r\\n', '', text)  # 改行の削除\n",
126 |         "        text = text[1:]  # 先頭の１文字を削除（調整）\n",
127 |         "        return text\n",
128 |         " \n",
129 |         "    def keitaiso(self, text):\n",
130 |         "        t = Tokenizer()\n",
131 |         "        output = t.tokenize(text, wakati=True)\n",
132 |         "        return output\n",
133 |         "     \n",
134 |         "    def process(self, text):\n",
135 |         "        # word_to_id, id_to_ward の作成\n",
136 |         "        word_to_id, id_to_word = {}, {}\n",
137 |         "        for word in text:\n",
138 |         "            if word not in word_to_id:\n",
139 |         "                new_id = len(word_to_id)\n",
140 |         "                word_to_id[word] = new_id\n",
141 |         "                id_to_word[new_id] = word\n",
142 |         "\n",
143 |         "        # corpus の作成\n",
144 |         "        corpus = np.array([word_to_id[W] for W in text])\n",
145 |         "        return corpus, word_to_id, id_to_word"
146 |       ],
147 |       "execution_count": null,
148 |       "outputs": []
149 |     },
150 |     {
151 |       "cell_type": "markdown",
152 |       "metadata": {
153 |         "id": "XBYXnooWm1nT",
154 |         "colab_type": "text"
155 |       },
156 |       "source": [
157 |         "データセットの作成 ( **Nekoクラス**のインスタンス化。実行に数十秒ほど掛かります)"
158 |       ]
159 |     },
160 |     {
161 |       "cell_type": "code",
162 |       "metadata": {
163 |         "id": "gIq-wn8FBF_8",
164 |         "colab_type": "code",
165 |         "colab": {}
166 |       },
167 |       "source": [
168 |         "neko = Neko()"
169 |       ],
170 |       "execution_count": null,
171 |       "outputs": []
172 |     },
173 |     {
174 |       "cell_type": "markdown",
175 |       "metadata": {
176 |         "id": "fLlULX5EIp6P",
177 |         "colab_type": "text"
178 |       },
179 |       "source": [
180 |         "作成したデータセットの内容を見てみます。"
181 |       ]
182 |     },
183 |     {
184 |       "cell_type": "code",
185 |       "metadata": {
186 |         "id": "M5D0D16TBkIk",
187 |         "colab_type": "code",
188 |         "colab": {}
189 |       },
190 |       "source": [
191 |         "print('neko.text = ', neko.text[:50])\n",
192 |         "print('neko.wakati = ', neko.wakati[:15])\n",
193 |         "print('neko.corpus = ',neko.corpus[:15])"
194 |       ],
195 |       "execution_count": null,
196 |       "outputs": []
197 |     },
198 |     {
199 |       "cell_type": "code",
200 |       "metadata": {
201 |         "id": "LfHWWMevQPDM",
202 |         "colab_type": "code",
203 |         "colab": {}
204 |       },
205 |       "source": [
206 |         "print('neko.word_to_id [猫] = ', neko.word_to_id['猫'])\n",
207 |         "print('neko.id_to_word [6] = ', neko.id_to_word[6])"
208 |       ],
209 |       "execution_count": null,
210 |       "outputs": []
211 |     },
212 |     {
213 |       "cell_type": "code",
214 |       "metadata": {
215 |         "id": "Y64F6lHqM3Gd",
216 |         "colab_type": "code",
217 |         "colab": {}
218 |       },
219 |       "source": [
220 |         "print('neko.data = ', neko.data[:15])\n",
221 |         "print('neko.label = ', neko.label[:15])"
222 |       ],
223 |       "execution_count": null,
224 |       "outputs": []
225 |     },
226 |     {
227 |       "cell_type": "code",
228 |       "metadata": {
229 |         "id": "rMghg_w3dPEZ",
230 |         "colab_type": "code",
231 |         "colab": {}
232 |       },
233 |       "source": [
234 |         "print('length od data = ', len(neko.data))\n",
235 |         "print('vaocab_size = ', len(neko.word_to_id))"
236 |       ],
237 |       "execution_count": null,
238 |       "outputs": []
239 |     },
240 |     {
241 |       "cell_type": "markdown",
242 |       "metadata": {
243 |         "id": "pDltLJ1AnFez",
244 |         "colab_type": "text"
245 |       },
246 |       "source": [
247 |         "**コード本体**です。１epoch毎に、100単語の文章を生成します。1eopch当たりの処理時間は2分くらいです。"
248 |       ]
249 |     },
250 |     {
251 |       "cell_type": "code",
252 |       "metadata": {
253 |         "id": "xLY49G3RbdKw",
254 |         "colab_type": "code",
255 |         "colab": {}
256 |       },
257 |       "source": [
258 |         "import numpy as np\n",
259 |         "import dezero\n",
260 |         "from dezero import Model\n",
261 |         "from dezero import SeqDataLoader\n",
262 |         "import dezero.functions as F\n",
263 |         "import dezero.layers as L\n",
264 |         "import random\n",
265 |         "from dezero import cuda \n",
266 |         "import textwrap\n",
267 |         "\n",
268 |         "max_epoch = 70\n",
269 |         "batch_size = 30 \n",
270 |         "vocab_size = len(neko.word_to_id)  \n",
271 |         "wordvec_size = 650  \n",
272 |         "hidden_size = 650\n",
273 |         "bptt_length = 30  \n",
274 |         "\n",
275 |         "class Lstm_nlp(Model):\n",
276 |         "    def __init__(self, vocab_size, wordvec_size, hidden_size, out_size):\n",
277 |         "        super().__init__()\n",
278 |         "        self.embed = L.EmbedID(vocab_size, wordvec_size)\n",
279 |         "        self.rnn = L.LSTM(hidden_size)\n",
280 |         "        self.fc = L.Linear(out_size)\n",
281 |         "\n",
282 |         "    def reset_state(self):  # 状態リセット\n",
283 |         "        self.rnn.reset_state()\n",
284 |         "\n",
285 |         "    def __call__(self, x):  # レイヤの接続内容を記載\n",
286 |         "        y = self.embed(x) \n",
287 |         "        y = self.rnn(y)\n",
288 |         "        y = self.fc(y)\n",
289 |         "        return y\n",
290 |         "\n",
291 |         "model = Lstm_nlp(vocab_size, wordvec_size, hidden_size, vocab_size)  # モデル生成\n",
292 |         "dataloader = SeqDataLoader(neko, batch_size=batch_size)  # データローダ生成\n",
293 |         "seqlen = len(neko)\n",
294 |         "optimizer = dezero.optimizers.Adam().setup(model)  # 最適化手法は Adam\n",
295 |         "\n",
296 |         "# GPUの有無判定と処理\n",
297 |         "if dezero.cuda.gpu_enable:  # GPUが有効であれば下記を実行\n",
298 |         "    dataloader.to_gpu()  # データローダをGPUへ\n",
299 |         "    model.to_gpu()  # モデルをGPUへ\n",
300 |         "\n",
301 |         "# 学習ループ\n",
302 |         "for epoch in range(max_epoch):\n",
303 |         "    model.reset_state()\n",
304 |         "    loss, count = 0, 0\n",
305 |         "\n",
306 |         "    for x, t in dataloader:\n",
307 |         "        y = model(x)  # 順伝播\n",
308 |         "\n",
309 |         "        # y は次の単語の出現度合いを表すベクトル(vocab_size次元)。\n",
310 |         "        # y にsoftmaxを掛け出現確率にしたものとワンホットの次の正解データからロス計算。\n",
311 |         "        # 但し、入力 t はワンホットベクトルの何番目に1が立っているかを表す数字(整数)。\n",
312 |         "        loss += F.softmax_cross_entropy_simple(y, t)  \n",
313 |         "        count += 1\n",
314 |         "\n",
315 |         "        if count % bptt_length == 0 or count == seqlen:\n",
316 |         "            model.cleargrads()  # 微分の初期化\n",
317 |         "            loss.backward()  # 逆伝播\n",
318 |         "            loss.unchain_backward()  # 計算グラフを遡ってつながりを切る\n",
319 |         "            optimizer.update()  # 重みの更新\n",
320 |         "    avg_loss = float(loss.data) / count\n",
321 |         "    print('| epoch %d | loss %f' % (epoch + 1, avg_loss))\n",
322 |         "\n",
323 |         "    # 文章生成\n",
324 |         "    model.reset_state()  # 状態をリセット\n",
325 |         "    with dezero.no_grad():  # 重みの更新をしない\n",
326 |         "         text = []\n",
327 |         "         x = random.randint(0,vocab_size)  # 最初の単語番号をランダムに選ぶ\n",
328 |         "         while len(text)  < 100:  # 100単語になるまで繰り返す\n",
329 |         "               x = np.array(int(x))\n",
330 |         "               y = model(x)  # yは次の単語の出現度合い\n",
331 |         "               p = F.softmax_simple(y, axis=0)  # softmax を掛けて出現確率にする\n",
332 |         "               xp = cuda.get_array_module(p)  # GPUがあれば xp=cp なければ xp=np\n",
333 |         "               sampled = xp.random.choice(len(p.data), size=1, p=p.data)  #　出現確率を考慮して数字(インデックス)を選ぶ\n",
334 |         "               word = neko.id_to_word[int(sampled)]  # 数字を単語に変換\n",
335 |         "               text.append(word)  # text に単語を追加\n",
336 |         "               x = sampled  # sampledを次の入力にする\n",
337 |         "         text = ''.join(text)\n",
338 |         "         print(textwrap.fill(text, 60))  # 60文字で改行して表示\n",
339 |         "    "
340 |       ],
341 |       "execution_count": null,
342 |       "outputs": []
343 |     }
344 |   ]
345 | }
346 | 


--------------------------------------------------------------------------------