├── README.md
└── Generate_japanese_text.ipynb

/README.md:
--------------------------------------------------------------------------------
Clicking the ipynb file displays it as a Jupyter notebook. Pressing the "Open in Colab" button at the top runs it on Google Colab. Any environment will do, as long as Google Colab works.

# Generate_japanese_text.ipynb
This April, "ゼロから作るDeep Learning 3 フレームワーク編" (Deep Learning from Scratch 3: the framework edition) went on sale.\
I had read volumes 1 and 2 of the series and learned a great deal from them, so I decided to take on the framework edition next.\
I bought the book recently, and before studying it one step at a time, I wanted to get a rough picture of the framework as a whole, so I started by just writing some code.\
Referring to the [DeZero](https://github.com/oreilly-japan/deep-learning-from-scratch-3) library and examples on GitHub, and flipping through the book, I wrote some simple natural language processing code on Google Colab, which I leave here as a memo. For details, see [Qiita](https://qiita.com/jun40vn/items/18ae6a89dba9c312e8c6).
--------------------------------------------------------------------------------
/Generate_japanese_text.ipynb:
--------------------------------------------------------------------------------
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "DeZero_generate_japanese_text",
      "provenance": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "\"Open"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ZU6uMMgamNUp",
        "colab_type": "text"
      },
      "source": [
        "Install the **DeZero** framework"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "V9d_aWXHcNhF",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "!pip install dezero"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "q3i1Z2blmVtG",
        "colab_type": "text"
      },
      "source": [
        "Install the morphological analysis library **janome**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "WFHkiEghA8qT",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "!pip install janome"
      ],
      "execution_count": null,
      "outputs": []
    },
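    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "A quick sanity check of janome before building the dataset: with `wakati=True`, `tokenize` splits a sentence into surface forms. (A minimal illustrative example; the exact split depends on the dictionary janome bundles.)"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "from janome.tokenizer import Tokenizer\n",
        "\n",
        "t = Tokenizer()\n",
        "# wakati=True returns surface forms only, e.g. ['吾輩', 'は', '猫', 'で', 'ある', '。']\n",
        "print(list(t.tokenize('吾輩は猫である。', wakati=True)))"
      ],
      "execution_count": null,
      "outputs": []
    },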
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "oa4uTXHmmljw",
        "colab_type": "text"
      },
      "source": [
        "Japanese text dataset: the **Neko** class"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "gu38qd0ya-Sb",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import numpy as np\n",
        "import dezero\n",
        "from dezero.datasets import Dataset\n",
        "from dezero.utils import get_file, cache_dir\n",
        "import zipfile\n",
        "import re\n",
        "from janome.tokenizer import Tokenizer\n",
        "\n",
        "class Neko(Dataset):\n",
        "\n",
        "    def prepare(self):\n",
        "        url = 'https://www.aozora.gr.jp/cards/000148/files/789_ruby_5639.zip'\n",
        "        file_path = get_file(url)  # download (cached) and get the local path\n",
        "        self.unzip(file_path)      # extract into cache_dir\n",
        "        self.text = self.preprocess(cache_dir + '/' + 'wagahaiwa_nekodearu.txt')\n",
        "        self.wakati = self.keitaiso(self.text)\n",
        "        self.corpus, self.word_to_id, self.id_to_word = self.process(self.wakati)\n",
        "        self.data = np.array(self.corpus[:-1])   # each word ...\n",
        "        self.label = np.array(self.corpus[1:])   # ... predicts the next word\n",
        "\n",
        "    def unzip(self, file_path):\n",
        "        with zipfile.ZipFile(file_path) as existing_zip:\n",
        "            existing_zip.extractall(cache_dir)\n",
        "\n",
        "    def preprocess(self, file_path):\n",
        "        with open(file_path, 'rb') as f:\n",
        "            binarydata = f.read()\n",
        "        text = binarydata.decode('shift_jis')\n",
        "\n",
        "        text = re.split(r'\\-{5,}', text)[2]  # remove the header\n",
        "        text = re.split('底本:', text)[0]    # remove the footer\n",
        "        text = re.sub('|', '', text)        # remove | (ruby start markers)\n",
        "        text = re.sub(r'[.+?]', '', text)  # remove transcriber's notes\n",
        "        text = re.sub(r'《.+?》', '', text)   # remove ruby\n",
        "        text = re.sub(r'\\u3000', '', text)   # remove ideographic spaces\n",
        "        text = re.sub(r'\\r\\n', '', text)     # remove line breaks\n",
        "        text = text[1:]                       # drop the first character (adjustment)\n",
        "        return text\n",
        "\n",
        "    def keitaiso(self, text):\n",
        "        t = Tokenizer()\n",
        "        # wrap in list() so the result is indexable even if tokenize returns a generator\n",
        "        return list(t.tokenize(text, wakati=True))\n",
        "\n",
        "    def process(self, text):\n",
        "        # build word_to_id and id_to_word\n",
        "        word_to_id, id_to_word = {}, {}\n",
        "        for word in text:\n",
        "            if word not in word_to_id:\n",
        "                new_id = len(word_to_id)\n",
        "                word_to_id[word] = new_id\n",
        "                id_to_word[new_id] = word\n",
        "\n",
        "        # build the corpus (the text as a sequence of word IDs)\n",
        "        corpus = np.array([word_to_id[w] for w in text])\n",
        "        return corpus, word_to_id, id_to_word"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "XBYXnooWm1nT",
        "colab_type": "text"
      },
      "source": [
        "Create the dataset (instantiate the **Neko** class; this takes a few dozen seconds)"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "gIq-wn8FBF_8",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "neko = Neko()"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "fLlULX5EIp6P",
        "colab_type": "text"
      },
      "source": [
        "Let's look at the contents of the dataset we created."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "M5D0D16TBkIk",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "print('neko.text = ', neko.text[:50])\n",
        "print('neko.wakati = ', neko.wakati[:15])\n",
        "print('neko.corpus = ', neko.corpus[:15])"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "LfHWWMevQPDM",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "print('neko.word_to_id [猫] = ', neko.word_to_id['猫'])\n",
        "print('neko.id_to_word [6] = ', neko.id_to_word[6])"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Y64F6lHqM3Gd",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "print('neko.data = ', neko.data[:15])\n",
        "print('neko.label = ', neko.label[:15])"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "rMghg_w3dPEZ",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "print('length of data = ', len(neko.data))\n",
        "print('vocab_size = ', len(neko.word_to_id))"
      ],
      "execution_count": null,
      "outputs": []
    },
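    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Since `corpus`, `data` and `label` are just the tokenized text encoded as word IDs, decoding a few IDs with `id_to_word` should recover the opening of the novel. (A minimal round-trip check of the data structures above.)"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# decode the first 15 word IDs back into text\n",
        "print(''.join(neko.id_to_word[int(i)] for i in neko.corpus[:15]))\n",
        "# data[i] -> label[i] is one (current word -> next word) training pair\n",
        "print(neko.id_to_word[int(neko.data[0])], '->', neko.id_to_word[int(neko.label[0])])"
      ],
      "execution_count": null,
      "outputs": []
    },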
"**コード本体**です。1epoch毎に、100単語の文章を生成します。1eopch当たりの処理時間は2分くらいです。" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "metadata": { 253 | "id": "xLY49G3RbdKw", 254 | "colab_type": "code", 255 | "colab": {} 256 | }, 257 | "source": [ 258 | "import numpy as np\n", 259 | "import dezero\n", 260 | "from dezero import Model\n", 261 | "from dezero import SeqDataLoader\n", 262 | "import dezero.functions as F\n", 263 | "import dezero.layers as L\n", 264 | "import random\n", 265 | "from dezero import cuda \n", 266 | "import textwrap\n", 267 | "\n", 268 | "max_epoch = 70\n", 269 | "batch_size = 30 \n", 270 | "vocab_size = len(neko.word_to_id) \n", 271 | "wordvec_size = 650 \n", 272 | "hidden_size = 650\n", 273 | "bptt_length = 30 \n", 274 | "\n", 275 | "class Lstm_nlp(Model):\n", 276 | " def __init__(self, vocab_size, wordvec_size, hidden_size, out_size):\n", 277 | " super().__init__()\n", 278 | " self.embed = L.EmbedID(vocab_size, wordvec_size)\n", 279 | " self.rnn = L.LSTM(hidden_size)\n", 280 | " self.fc = L.Linear(out_size)\n", 281 | "\n", 282 | " def reset_state(self): # 状態リセット\n", 283 | " self.rnn.reset_state()\n", 284 | "\n", 285 | " def __call__(self, x): # レイヤの接続内容を記載\n", 286 | " y = self.embed(x) \n", 287 | " y = self.rnn(y)\n", 288 | " y = self.fc(y)\n", 289 | " return y\n", 290 | "\n", 291 | "model = Lstm_nlp(vocab_size, wordvec_size, hidden_size, vocab_size) # モデル生成\n", 292 | "dataloader = SeqDataLoader(neko, batch_size=batch_size) # データローダ生成\n", 293 | "seqlen = len(neko)\n", 294 | "optimizer = dezero.optimizers.Adam().setup(model) # 最適化手法は Adam\n", 295 | "\n", 296 | "# GPUの有無判定と処理\n", 297 | "if dezero.cuda.gpu_enable: # GPUが有効であれば下記を実行\n", 298 | " dataloader.to_gpu() # データローダをGPUへ\n", 299 | " model.to_gpu() # モデルをGPUへ\n", 300 | "\n", 301 | "# 学習ループ\n", 302 | "for epoch in range(max_epoch):\n", 303 | " model.reset_state()\n", 304 | " loss, count = 0, 0\n", 305 | "\n", 306 | " for x, t in dataloader:\n", 307 | " y = model(x) # 順伝播\n", 308 | "\n", 309 | " # y は次の単語の出現度合いを表すベクトル(vocab_size次元)。\n", 310 | " # y にsoftmaxを掛け出現確率にしたものとワンホットの次の正解データからロス計算。\n", 311 | " # 但し、入力 t はワンホットベクトルの何番目に1が立っているかを表す数字(整数)。\n", 312 | " loss += F.softmax_cross_entropy_simple(y, t) \n", 313 | " count += 1\n", 314 | "\n", 315 | " if count % bptt_length == 0 or count == seqlen:\n", 316 | " model.cleargrads() # 微分の初期化\n", 317 | " loss.backward() # 逆伝播\n", 318 | " loss.unchain_backward() # 計算グラフを遡ってつながりを切る\n", 319 | " optimizer.update() # 重みの更新\n", 320 | " avg_loss = float(loss.data) / count\n", 321 | " print('| epoch %d | loss %f' % (epoch + 1, avg_loss))\n", 322 | "\n", 323 | " # 文章生成\n", 324 | " model.reset_state() # 状態をリセット\n", 325 | " with dezero.no_grad(): # 重みの更新をしない\n", 326 | " text = []\n", 327 | " x = random.randint(0,vocab_size) # 最初の単語番号をランダムに選ぶ\n", 328 | " while len(text) < 100: # 100単語になるまで繰り返す\n", 329 | " x = np.array(int(x))\n", 330 | " y = model(x) # yは次の単語の出現度合い\n", 331 | " p = F.softmax_simple(y, axis=0) # softmax を掛けて出現確率にする\n", 332 | " xp = cuda.get_array_module(p) # GPUがあれば xp=cp なければ xp=np\n", 333 | " sampled = xp.random.choice(len(p.data), size=1, p=p.data) # 出現確率を考慮して数字(インデックス)を選ぶ\n", 334 | " word = neko.id_to_word[int(sampled)] # 数字を単語に変換\n", 335 | " text.append(word) # text に単語を追加\n", 336 | " x = sampled # sampledを次の入力にする\n", 337 | " text = ''.join(text)\n", 338 | " print(textwrap.fill(text, 60)) # 60文字で改行して表示\n", 339 | " " 340 | ], 341 | "execution_count": null, 342 | "outputs": [] 343 | } 344 | ] 345 | } 346 | 
--------------------------------------------------------------------------------