├── README.md
└── notebooks
    └── English SentSeg.ipynb

/README.md:
--------------------------------------------------------------------------------
# classical_arabic_models
Statistical models for Classical Arabic
--------------------------------------------------------------------------------

/notebooks/English SentSeg.ipynb:
--------------------------------------------------------------------------------
In [1]:
```python
import pandas as pd
import numpy as np
import tensorflow as tf
```

In [2]:
```python
# Brown corpus, one tokenized sentence per row
b = pd.read_csv('../data/brown.csv')
```

In [3]:
```python
import re

docs = []
non_alpha = re.compile(r"[^ a-z\-']")

# Rebuild each corpus file as one string, marking every sentence
# boundary with a trailing '||' on the sentence-final word.
doc_name = None
doc = ''
for i in range(b.shape[0]):
    if b['filename'][i] != doc_name:
        if doc_name is not None:
            docs.append(doc)
        doc_name = b['filename'][i]
        doc = ''

    sent = re.sub(non_alpha, '',
                  b['tokenized_text'][i].lower()
                   .replace('--', ' ').replace("''", '')).strip() + "|| "
    doc += sent

# Append the final document as well
if doc_name is not None:
    docs.append(doc)
```

In [4]:
```python
# Load GloVe vectors; row 0 stays all zeros and serves as the
# padding / unknown-word embedding.
vocab = {}
vectors = [np.zeros(300)]

idx = 1
with open('../data/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        tokens = line.split()
        vocab[tokens[0]] = idx
        idx += 1
        vectors.append(np.array(tokens[1:], dtype='float'))

embedding_matrix = np.vstack(vectors)
```

In [14]:
```python
window_width = 13
mid_point = window_width // 2
final = []         # windows whose centre word ends a sentence
non_final = []     # all other windows
sample_factor = 1  # negatives sampled per positive

for doc in docs:
    tokens = doc.split()
    for pos in range(len(tokens) - window_width):
        window = tokens[pos:(pos + window_width)]
        indices = [vocab.get(word.replace('||', ''), 0) for word in window]

        if window[mid_point].endswith('||'):
            final.append([1] + indices)
        else:
            non_final.append([0] + indices)

# Downsample the (much larger) non-final class to balance the labels
sample_idx = np.random.choice(len(non_final), len(final) * sample_factor,
                              replace=False)
sampled_non_final = [non_final[i] for i in sample_idx]

YX = np.vstack([np.vstack(final), np.vstack(sampled_non_final)])
np.random.shuffle(YX)
Y = YX[:, 0]
X = YX[:, 1:]
```

In [6]:
```python
Y
```
Out[6]:
```
array([1, 0, 0, ..., 1, 1, 1])
```
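To make the window labeling above concrete, here is a minimal sketch with a hypothetical two-sentence document (not from the corpus), using a width-3 window instead of 13 for readability:

```python
# Hypothetical toy document; '||' marks the last word of each sentence.
toy = "the cat sat|| the dog ran||"
tokens = toy.split()     # ['the', 'cat', 'sat||', 'the', 'dog', 'ran||']

width, mid = 3, 3 // 2   # same scheme as In [14], just a narrower window
for pos in range(len(tokens) - width):
    window = tokens[pos:pos + width]
    print(1 if window[mid].endswith('||') else 0, window)
# 0 ['the', 'cat', 'sat||']
# 1 ['cat', 'sat||', 'the']
# 0 ['sat||', 'the', 'dog']
```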
"cell_type": "code", 129 | "execution_count": 7, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "from tensorflow.keras import layers, Input\n", 134 | "from tensorflow.keras.models import Model\n", 135 | "from tensorflow.keras import regularizers\n", 136 | "\n", 137 | "\n", 138 | "def make_simple_model(word_vectors, window_width, dense_size):\n", 139 | " embed = layers.Embedding(word_vectors.shape[0],\n", 140 | " word_vectors.shape[1],\n", 141 | " input_length = window_width,\n", 142 | " weights = [word_vectors],\n", 143 | " trainable = False,\n", 144 | " mask_zero = True)\n", 145 | " \n", 146 | " word_input = Input(shape=(window_width,), dtype='float32')\n", 147 | " vectors = embed(word_input)\n", 148 | " \n", 149 | " out = layers.Flatten()(vectors)\n", 150 | " out = layers.Dropout(rate=0.4)(out)\n", 151 | " \n", 152 | " out = layers.Dense(dense_size, activation='relu')(out)\n", 153 | " out = layers.Dropout(rate=0.2)(out)\n", 154 | " out = layers.Dense(dense_size, activation='relu')(out)\n", 155 | " out = layers.Dropout(rate=0.2)(out)\n", 156 | " \n", 157 | " output = layers.Dense(1, activation='sigmoid')(out)\n", 158 | " \n", 159 | " model = Model(word_input, output)\n", 160 | "\n", 161 | " model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])\n", 162 | " \n", 163 | " \n", 164 | " return model" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 9, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "m1 = make_simple_model(embedding_matrix, window_width, 200)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "m1.summary()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": { 189 | "scrolled": true 190 | }, 191 | "outputs": [], 192 | "source": [ 193 | "m1.fit(X, Y, batch_size=64, epochs=20, validation_split=0.2)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 57, 199 | "metadata": {}, 200 | "outputs": [ 201 | { 202 | "data": { 203 | "text/plain": [ 204 | "(array([0, 1]), array([56694, 56694]))" 205 | ] 206 | }, 207 | "execution_count": 57, 208 | "metadata": {}, 209 | "output_type": "execute_result" 210 | } 211 | ], 212 | "source": [ 213 | "np.unique(Y, return_counts=True)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 58, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "YX1 = np.vstack([non_final, final])\n", 223 | "np.random.shuffle(YX1)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 61, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "Y1 = YX1[:, 0]\n", 233 | "X1 = YX1[:, 1:]" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 62, 239 | "metadata": {}, 240 | "outputs": [ 241 | { 242 | "name": "stdout", 243 | "output_type": "stream", 244 | "text": [ 245 | "Train on 801446 samples, validate on 200362 samples\n", 246 | "801446/801446 [==============================] - 1901s 2ms/sample - loss: 0.1415 - acc: 0.9460 - val_loss: 0.1217 - val_acc: 0.9496\n" 247 | ] 248 | }, 249 | { 250 | "data": { 251 | "text/plain": [ 252 | "" 253 | ] 254 | }, 255 | "execution_count": 62, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | "source": [ 261 | "m1.fit(X1, Y1, batch_size=128, epochs=1, validation_split=0.2)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 
In [82]:
```python
gold = Y1[:1000]
```

In [80]:
```python
scores = m1.predict(X1[:1000])
```

In [91]:
```python
# Confusion counts at a deliberately low decision threshold of 0.1
tp, tn, fp, fn = 0, 0, 0, 0
for i, l in enumerate(gold):
    s = scores[i]

    if s >= 0.1:
        if l == 1:
            tp += 1
        else:
            fp += 1
    else:
        if l == 1:
            fn += 1
        else:
            tn += 1
```

In [92]:
```python
print(tp, tn, fp, fn)
```
```
50 829 114 7
```

In [93]:
```python
# False positive rate
fp / (fp + tn)
```
Out[93]:
```
0.12089077412513255
```

In [94]:
```python
# Recall (true positive rate)
tp / (tp + fn)
```
Out[94]:
```
0.8771929824561403
```

In [15]:
```python
def make_self_attention_model(word_vectors, window_width, dense_size):
    embed = layers.Embedding(word_vectors.shape[0],
                             word_vectors.shape[1],
                             input_length=window_width,
                             weights=[word_vectors],
                             trainable=False,
                             mask_zero=True)

    word_input = Input(shape=(window_width,), dtype='int32')
    vectors = embed(word_input)

    # Conv1D output serves as both query and value below
    cnn_layer = layers.Conv1D(filters=100, kernel_size=4, padding='same')
    query_value = cnn_layer(vectors)

    # Dot-product self-attention over the convolved window
    self_attended = layers.Attention()([query_value, query_value])
    out = layers.Concatenate()([query_value, self_attended])

    out = layers.Dropout(rate=0.4)(out)

    out = layers.Flatten()(out)
    out = layers.Dense(dense_size, activation='relu')(out)
    out = layers.Dropout(rate=0.2)(out)
    out = layers.Dense(dense_size, activation='relu')(out)
    out = layers.Dropout(rate=0.2)(out)

    output = layers.Dense(1, activation='sigmoid')(out)

    model = Model(word_input, output)
    model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=['acc'])
    return model
```

In [16]:
```python
m2 = make_self_attention_model(embedding_matrix, window_width, 200)
```
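`layers.Attention` is Luong-style dot-product attention; calling it with the same tensor as query and value, as above, yields a simple self-attention. A standalone sketch of the computation it performs, assuming an input of shape `(batch, steps, dim)`:

```python
import tensorflow as tf

def dot_product_self_attention(qv):
    # scores[b, i, j] = dot(qv[b, i], qv[b, j])
    scores = tf.matmul(qv, qv, transpose_b=True)
    weights = tf.nn.softmax(scores, axis=-1)
    # every position becomes a weighted mix of all positions
    return tf.matmul(weights, qv)

qv = tf.random.normal((2, 13, 100))          # (batch, window_width, filters)
print(dot_product_self_attention(qv).shape)  # (2, 13, 100)
```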
In [17]:
```python
m2.summary()
```
```
Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to
==================================================================================================
input_3 (InputLayer)            [(None, 13)]         0
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 13, 300)      120000300   input_3[0][0]
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 13, 100)      120100      embedding_2[0][0]
__________________________________________________________________________________________________
attention_1 (Attention)         (None, 13, 100)      0           conv1d_1[0][0]
                                                                 conv1d_1[0][0]
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 13, 200)      0           conv1d_1[0][0]
                                                                 attention_1[0][0]
__________________________________________________________________________________________________
dropout_6 (Dropout)             (None, 13, 200)      0           concatenate_1[0][0]
__________________________________________________________________________________________________
flatten_2 (Flatten)             (None, 2600)         0           dropout_6[0][0]
__________________________________________________________________________________________________
dense_6 (Dense)                 (None, 200)          520200      flatten_2[0][0]
__________________________________________________________________________________________________
dropout_7 (Dropout)             (None, 200)          0           dense_6[0][0]
__________________________________________________________________________________________________
dense_7 (Dense)                 (None, 200)          40200       dropout_7[0][0]
__________________________________________________________________________________________________
dropout_8 (Dropout)             (None, 200)          0           dense_7[0][0]
__________________________________________________________________________________________________
dense_8 (Dense)                 (None, 1)            201         dropout_8[0][0]
==================================================================================================
Total params: 120,681,001
Trainable params: 680,701
Non-trainable params: 120,000,300
__________________________________________________________________________________________________
```

In [18]:
```python
m2.fit(X, Y, batch_size=64, epochs=20, validation_split=0.2)
```
```
Train on 90580 samples, validate on 22646 samples
Epoch 1/20
90580/90580 [==============================] - 444s 5ms/sample - loss: 0.4997 - acc: 0.7528 - val_loss: 0.4489 - val_acc: 0.7838
Epoch 2/20
90580/90580 [==============================] - 445s 5ms/sample - loss: 0.4404 - acc: 0.7928 - val_loss: 0.4236 - val_acc: 0.8046
Epoch 3/20
90580/90580 [==============================] - 445s 5ms/sample - loss: 0.4155 - acc: 0.8092 - val_loss: 0.4167 - val_acc: 0.8049
Epoch 4/20
90580/90580 [==============================] - 444s 5ms/sample - loss: 0.3965 - acc: 0.8193 - val_loss: 0.4123 - val_acc: 0.8123
Epoch 5/20
90580/90580 [==============================] - 446s 5ms/sample - loss: 0.3787 - acc: 0.8293 - val_loss: 0.4080 - val_acc: 0.8134
Epoch 6/20
90580/90580 [==============================] - 444s 5ms/sample - loss: 0.3613 - acc: 0.8377 - val_loss: 0.4048 - val_acc: 0.8154
Epoch 7/20
90580/90580 [==============================] - 444s 5ms/sample - loss: 0.3451 - acc: 0.8463 - val_loss: 0.4113 - val_acc: 0.8145
Epoch 8/20
90580/90580 [==============================] - 445s 5ms/sample - loss: 0.3310 - acc: 0.8535 - val_loss: 0.4148 - val_acc: 0.8176
Epoch 9/20
90580/90580 [==============================] - 445s 5ms/sample - loss: 0.3162 - acc: 0.8610 - val_loss: 0.4127 - val_acc: 0.8190
Epoch 10/20
90580/90580 [==============================] - 446s 5ms/sample - loss: 0.3025 - acc: 0.8675 - val_loss: 0.4155 - val_acc: 0.8183
Epoch 11/20
90580/90580 [==============================] - 445s 5ms/sample - loss: 0.2899 - acc: 0.8750 - val_loss: 0.4169 - val_acc: 0.8197
Epoch 12/20
90580/90580 [==============================] - 444s 5ms/sample - loss: 0.2791 - acc: 0.8810 - val_loss: 0.4332 - val_acc: 0.8208
Epoch 13/20
90580/90580 [==============================] - 445s 5ms/sample - loss: 0.2638 - acc: 0.8872 - val_loss: 0.4474 - val_acc: 0.8170
Epoch 14/20
90580/90580 [==============================] - 445s 5ms/sample - loss: 0.2573 - acc: 0.8897 - val_loss: 0.4406 - val_acc: 0.8171
Epoch 15/20
90580/90580 [==============================] - 446s 5ms/sample - loss: 0.2468 - acc: 0.8955 - val_loss: 0.4646 - val_acc: 0.8142
Epoch 16/20
90580/90580 [==============================] - 445s 5ms/sample - loss: 0.2401 - acc: 0.8987 - val_loss: 0.4564 - val_acc: 0.8163
Epoch 17/20
90580/90580 [==============================] - 444s 5ms/sample - loss: 0.2315 - acc: 0.9029 - val_loss: 0.4743 - val_acc: 0.8148
Epoch 18/20
90580/90580 [==============================] - 445s 5ms/sample - loss: 0.2263 - acc: 0.9059 - val_loss: 0.4868 - val_acc: 0.8164
Epoch 19/20
90580/90580 [==============================] - 446s 5ms/sample - loss: 0.2160 - acc: 0.9108 - val_loss: 0.4756 - val_acc: 0.8141
Epoch 20/20
90580/90580 [==============================] - 445s 5ms/sample - loss: 0.2119 - acc: 0.9134 - val_loss: 0.4726 - val_acc: 0.8136
```
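The notebook stops after training; here is a minimal inference sketch, assuming the preprocessing above (lowercasing, `vocab` lookup with 0 for unknown words, and zero-padding so every word can occupy the window's mid point). The `segment` helper and its threshold are hypothetical names, not part of the notebook:

```python
def segment(text, model, threshold=0.5):
    """Mark each word the model scores as sentence-final with ' ||'."""
    words = text.lower().split()
    ids = [vocab.get(w, 0) for w in words]
    # pad with the zero (padding) index on both sides
    padded = [0] * mid_point + ids + [0] * mid_point
    windows = np.array([padded[i:i + window_width]
                        for i in range(len(words))])
    scores = model.predict(windows)[:, 0]
    return ' '.join(w + (' ||' if s >= threshold else '')
                    for w, s in zip(words, scores))

# e.g. segment("the meeting ended everyone went home early", m2)
```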
"version": 3 549 | }, 550 | "file_extension": ".py", 551 | "mimetype": "text/x-python", 552 | "name": "python", 553 | "nbconvert_exporter": "python", 554 | "pygments_lexer": "ipython3", 555 | "version": "3.7.3" 556 | } 557 | }, 558 | "nbformat": 4, 559 | "nbformat_minor": 2 560 | } 561 | --------------------------------------------------------------------------------