├── .gitignore ├── README.md └── POS_tagger_for_Hebrew.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CACCHT: Creating Annotated Corpora of Classical Hebrew Texts 2 | 3 | The CACCHT project is a collaboration between Martijn Naaijer (University of Zurich), Willem van Peursen (Vrije Universiteit Amsterdam), Oliver Glanz (Andrews University), Christian Canu Højgaard (Fjellhaug International University College), Martin Ehrensvärd (University of Copenhagen) and Robert Rezetko (University of Copenhagen). 4 | Together with specialists in the field, we develop linguistically annotated datasets of Semitic texts. These datasets are publicly available and can be used freely for research and education. Some datasets have only word-level annotations, while others also contain syntactic features. 5 | 6 | ## Datasets 7 | We are working on the following datasets: 8 | 9 | - [The Dead Sea Scrolls](https://github.com/etcbc/dss) 10 | - [The ETCBC Syriac Corpus](https://github.com/etcbc/syriac) 11 | - [The Samaritan Pentateuch](https://github.com/DT-UCPH/sp) 12 | - [The Copenhagen Ugaritic Corpus](https://github.com/dt-ucph/cuc) 13 | - [The Septuagint](https://github.com/CenterBLC/MT-LXX) 14 | 15 | ## Text-Fabric 16 | All the datasets are [Text-Fabric](https://annotation.github.io/text-fabric/tf/) datasets and can be accessed and used with Python. 17 | 18 | ## BHSA 19 | The [Biblia Hebraica Stuttgartensia Amstelodamensis (BHSA)](https://etcbc.github.io/bhsa) plays an important role in this project. The BHSA is a linguistically annotated dataset of the Masoretic Text of the Hebrew Bible, developed and maintained by the [ETCBC](https://etcbc.nl). In general, CACCHT follows the annotation conventions of the BHSA, adapting them to the specific characteristics of each language or text. 20 | -------------------------------------------------------------------------------- /POS_tagger_for_Hebrew.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Character-based POS-tagger for Biblical Hebrew" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This notebook contains a character-based POS-tagger for Biblical Hebrew. The input of the model consists of clauses of Biblical Hebrew text, and the output is the corresponding sequence of parts of speech. The model is not told where the word boundaries are, because the space is simply another character. (A small illustration of this representation can be found just before the model definition below.)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "First, some libraries are imported: NumPy, Keras and, of course, Text-Fabric." 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "name": "stderr", 31 | "output_type": "stream", 32 | "text": [ 33 | "C:\\Users\\geitb\\Anaconda3\\lib\\site-packages\\h5py\\__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. 
In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 34 | " from ._conv import register_converters as _register_converters\n", 35 | "Using TensorFlow backend.\n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "from keras.models import Model\n", 41 | "from keras.layers import Input, LSTM, Dense\n", 42 | "import numpy as np" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "TF app is up-to-date.\n", 55 | "Using annotation/app-bhsa commit 43c1c5e88b371f575cdbbf57e38167deb8725f7f (=latest)\n", 56 | " in C:\\Users\\geitb/text-fabric-data/__apps__/bhsa.\n", 57 | "Using etcbc/bhsa/tf - c r1.5 in C:\\Users\\geitb/text-fabric-data\n", 58 | "Using etcbc/phono/tf - c r1.2 in C:\\Users\\geitb/text-fabric-data\n", 59 | "Using etcbc/parallels/tf - c r1.2 in C:\\Users\\geitb/text-fabric-data\n" 60 | ] 61 | }, 62 | { 63 | "data": { 64 | "text/markdown": [ 65 | "**Documentation:** BHSA Character table Feature docs bhsa API Text-Fabric API 7.3.15 Search Reference" 66 | ], 67 | "text/plain": [ 68 | "" 69 | ] 70 | }, 71 | "metadata": {}, 72 | "output_type": "display_data" 73 | }, 74 | { 75 | "data": { 76 | "text/html": [ 77 | "
Loaded features:\n", 78 | "

BHSA = Biblia Hebraica Stuttgartensia Amstelodamensis: book book@ll chapter code det freq_lex function g_word g_word_utf8 gloss gn label language lex lex_utf8 ls nametype nu number otype pdp prs_gn prs_nu prs_ps ps qere qere_trailer qere_trailer_utf8 qere_utf8 rank_lex rela sp st trailer trailer_utf8 txt typ verse voc_lex voc_lex_utf8 vs vt mother oslots

Parallel Passages: crossref

Phonetic Transcriptions: phono phono_trailer

" 79 | ], 80 | "text/plain": [ 81 | "" 82 | ] 83 | }, 84 | "metadata": {}, 85 | "output_type": "display_data" 86 | }, 87 | { 88 | "data": { 89 | "text/html": [ 90 | "" 355 | ], 356 | "text/plain": [ 357 | "" 358 | ] 359 | }, 360 | "metadata": {}, 361 | "output_type": "display_data" 362 | }, 363 | { 364 | "data": { 365 | "text/html": [ 366 | "
API members:\n", 367 | "C Computed, Call AllComputeds, Cs ComputedString
\n", 368 | "E Edge, Eall AllEdges, Es EdgeString
\n", 369 | "ensureLoaded, TF, ignored, loadLog
\n", 370 | "L Locality
\n", 371 | "cache, error, indent, info, reset
\n", 372 | "N Nodes, sortKey, sortKeyTuple, otypeRank, sortNodes
\n", 373 | "F Feature, Fall AllFeatures, Fs FeatureString
\n", 374 | "S Search
\n", 375 | "T Text
" 376 | ], 377 | "text/plain": [ 378 | "" 379 | ] 380 | }, 381 | "metadata": {}, 382 | "output_type": "display_data" 383 | } 384 | ], 385 | "source": [ 386 | "from tf.app import use\n", 387 | "A = use('bhsa', hoist=globals())\n", 388 | "A.displaySetup(extraFeatures='g_cons')" 389 | ] 390 | }, 391 | { 392 | "cell_type": "markdown", 393 | "metadata": {}, 394 | "source": [ 395 | "Training and test sets are defined. The model is trained on all the books of the MT except Jonah; it will then be used to predict parts of speech for that book." 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 74, 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "train_books = ['Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy', 'Joshua', 'Judges', '1_Samuel', \n", 405 | "                '2_Samuel','1_Kings', '2_Kings', 'Isaiah', 'Jeremiah', 'Ezekiel', 'Hosea', 'Joel', 'Amos', \n", 406 | "                'Obadiah', 'Micah', 'Nahum', 'Habakkuk', 'Zephaniah', 'Haggai', 'Zechariah', 'Malachi', \n", 407 | "                'Psalms', 'Job', 'Proverbs', 'Ruth', 'Song_of_songs', 'Ecclesiastes', 'Lamentations',\n", 408 | "                'Esther', 'Daniel', 'Ezra', 'Nehemiah', '1_Chronicles', '2_Chronicles']\n", 409 | "\n", 410 | "test_books = ['Jonah']" 411 | ] 412 | }, 413 | { 414 | "cell_type": "markdown", 415 | "metadata": {}, 416 | "source": [ 417 | "The data are prepared: clauses and their part-of-speech sequences are extracted from the BHSA and one-hot encoded." 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 75, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "def prepare_train_data(books):\n", 427 | "    \"\"\"\n", 428 | "    books is a list containing the books of the training set.\n", 429 | "    The function returns:\n", 430 | "    input_clauses, a list containing strings with the text of BH clauses\n", 431 | "    output_pos, a list containing lists with all the pos of BH clauses\n", 432 | "    input_chars, a list containing the characters occurring in the input_clauses (the input vocabulary)\n", 433 | "    output_vocab, a list containing all the pos occurring in the training books\n", 434 | "    max_len_input, the maximum length of all the input clauses in number of characters\n", 435 | "    max_len_output, the maximum length of all the output sequences in number of pos tags (+2, because a \n", 436 | "    start and a stop symbol are added)\n", 437 | "    \"\"\"\n", 438 | "\n", 439 | "    input_clauses = []\n", 440 | "    output_pos = []\n", 441 | "    input_chars = set()\n", 442 | "    output_vocab = set()\n", 443 | "\n", 444 | "    for cl in F.otype.s(\"clause\"): \n", 445 | "        \n", 446 | "        bo, _, _ = T.sectionFromNode(cl)\n", 447 | "        if bo not in books:\n", 448 | "            continue\n", 449 | "        \n", 450 | "        # clauses longer than 10 words are skipped\n", 451 | "        if len(L.d(cl, \"word\")) > 10:\n", 452 | "            continue\n", 453 | "        \n", 454 | "        # input and output are extracted from the bhsa\n", 455 | "        words = \" \".join([F.g_cons.v(w) for w in L.d(cl, \"word\")])\n", 456 | "        pos_prepare = [F.sp.v(w) for w in L.d(cl, \"word\")]\n", 457 | "        \n", 458 | "        poss = ['\\t']\n", 459 | "        for elem in pos_prepare:\n", 460 | "            poss.append(elem)\n", 461 | "        poss.append('\\n')\n", 462 | "        \n", 463 | "        input_clauses.append(words)\n", 464 | "        output_pos.append(poss)\n", 465 | "        \n", 466 | "        for ch in words:\n", 467 | "            input_chars.add(ch)\n", 468 | "        \n", 469 | "        for pos in poss:\n", 470 | "            output_vocab.add(pos)\n", 471 | "    \n", 472 | "    input_chars = sorted(list(input_chars))\n", 473 | "    output_vocab = sorted(list(output_vocab))\n", 474 | "    \n", 475 | "    max_len_input = max([len(clause) for clause in input_clauses])\n", 476 | "    max_len_output = max([len(poss) for poss in output_pos])\n", 477 | "    \n", 478 | "    return input_clauses, output_pos, input_chars, output_vocab, max_len_input, max_len_output" 479 | ] 480 | },
481 | { 482 | "cell_type": "code", 483 | "execution_count": 76, 484 | "metadata": {}, 485 | "outputs": [], 486 | "source": [ 487 | "def prepare_test_data(books):\n", 488 | "    \"\"\"\n", 489 | "    books is a list containing the test books.\n", 490 | "    The function returns:\n", 491 | "    input_clauses_test, a list containing the text of clauses in the test books\n", 492 | "    \"\"\"\n", 493 | "\n", 494 | "    input_clauses_test = []\n", 495 | "    for cl in F.otype.s(\"clause\"): \n", 496 | "        \n", 497 | "        bo, _, _ = T.sectionFromNode(cl)\n", 498 | "        if bo not in books:\n", 499 | "            continue\n", 500 | "        \n", 501 | "        if len(L.d(cl, \"word\")) > 10:\n", 502 | "            continue\n", 503 | "\n", 504 | "        words = \" \".join([F.g_cons.v(w) for w in L.d(cl, \"word\")])\n", 505 | "        input_clauses_test.append(words)\n", 506 | "        \n", 507 | "    return input_clauses_test" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": 77, 513 | "metadata": {}, 514 | "outputs": [], 515 | "source": [ 516 | "def create_dicts(input_chars, output_vocab):\n", 517 | "    \"\"\"\n", 518 | "    The network can only handle numeric data. This function provides four dicts. \n", 519 | "    Two of them map between integers and the input characters (one dict for every direction), the other two \n", 520 | "    map between integers and parts of speech.\n", 521 | "    \"\"\"\n", 522 | "    \n", 523 | "    input_idx2char = {}\n", 524 | "    input_char2idx = {}\n", 525 | "\n", 526 | "    for k, v in enumerate(input_chars):\n", 527 | "        input_idx2char[k] = v\n", 528 | "        input_char2idx[v] = k\n", 529 | "    \n", 530 | "    output_idx2char = {}\n", 531 | "    output_char2idx = {}\n", 532 | "    \n", 533 | "    for k, v in enumerate(output_vocab):\n", 534 | "        output_idx2char[k] = v\n", 535 | "        output_char2idx[v] = k\n", 536 | "    \n", 537 | "    return input_idx2char, input_char2idx, output_idx2char, output_char2idx" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": 78, 543 | "metadata": {}, 544 | "outputs": [], 545 | "source": [ 546 | "def one_hot_encode(nb_samples, max_len_input, max_len_output, input_chars, output_vocab, input_char2idx, output_char2idx, input_clauses, output_pos):\n", 547 | "    \"\"\"\n", 548 | "    Categorical data are generally one-hot encoded in neural networks, which is done here.\n", 549 | "    The function returns the encoder input, the decoder input, and the (shifted) decoder target.\n", 550 | "    \"\"\"\n", 551 | "\n", 552 | "    tokenized_input_data = np.zeros(shape = (nb_samples,max_len_input,len(input_chars)), dtype='float32')\n", 553 | "    tokenized_output = np.zeros(shape = (nb_samples,max_len_output,len(output_vocab)), dtype='float32')\n", 554 | "    target_data = np.zeros((nb_samples, max_len_output, len(output_vocab)),dtype='float32')\n", 555 | "\n", 556 | "    for i in range(nb_samples):\n", 557 | "        for k, ch in enumerate(input_clauses[i]):\n", 558 | "            tokenized_input_data[i, k, input_char2idx[ch]] = 1\n", 559 | "        \n", 560 | "        for k, ch in enumerate(output_pos[i]):\n", 561 | "            tokenized_output[i, k, output_char2idx[ch]] = 1\n", 562 | "\n", 563 | "            # target_data is ahead of tokenized_output by one timestep and does not include the start character.\n", 564 | "            if k > 0:\n", 565 | "                target_data[i, k-1, output_char2idx[ch]] = 1\n", 566 | "    \n", 567 | "    return tokenized_input_data, tokenized_output, target_data" 568 | ] 569 | },
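To make the data format concrete: below is a small illustrative sketch, not a cell of the original notebook. It shows one training pair in the shape produced by `prepare_train_data`, using a clause that occurs in Jonah; the tags follow the BHSA `sp` values.

```python
# Illustrative sketch only: one (input, output) training pair.
# The input is a whole clause as a single string of consonantal transliteration;
# the space is just another character, so word boundaries stay implicit.
clause = "W J>MR JHWH"                       # 'and YHWH said'
poss = ['\t', 'conj', 'verb', 'nmpr', '\n']  # start symbol, one tag per word, stop symbol

# The input vocabulary is simply every character occurring in the clauses:
input_chars = sorted(set(clause))
print(input_chars)  # [' ', '>', 'H', 'J', 'M', 'R', 'W']
```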
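The one-timestep shift between `tokenized_output` and `target_data` in `one_hot_encode` implements teacher forcing: during training the decoder is fed the gold tag sequence starting at the start symbol, and it has to predict the same sequence one step ahead. A minimal sketch of the shift, with a hypothetical tag sequence:

```python
# Illustrative sketch only: the decoder input/target shift (teacher forcing).
poss = ['\t', 'conj', 'verb', 'nmpr', '\n']

decoder_input = poss  # what the decoder is fed at each timestep
target = poss[1:]     # what it has to predict: the same sequence, one step ahead

for fed, predicted in zip(decoder_input, target):
    print(f"{fed!r:>8} -> {predicted!r}")
```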
570 | { 571 | "cell_type": "code", 572 | "execution_count": 79, 573 | "metadata": {}, 574 | "outputs": [], 575 | "source": [ 576 | "def define_LSTM_model(input_chars, output_vocab):\n", 577 | "    \"\"\"\n", 578 | "    Defines the seq2seq model: a stacked two-layer LSTM encoder whose final states\n", 579 | "    initialize an LSTM decoder, topped by a softmax layer over the output vocabulary.\n", 580 | "    \"\"\"\n", 581 | "\n", 582 | "    # Encoder model\n", 583 | "\n", 584 | "    encoder_input = Input(shape=(None,len(input_chars)))\n", 585 | "    encoder_LSTM = LSTM(512,activation = 'relu',return_state = True, return_sequences=True)(encoder_input)\n", 586 | "    encoder_LSTM = LSTM(512,return_state = True)(encoder_LSTM)\n", 587 | "    encoder_outputs, encoder_h, encoder_c = encoder_LSTM\n", 588 | "    encoder_states = [encoder_h, encoder_c]\n", 589 | "    \n", 590 | "    # Decoder model\n", 591 | "\n", 592 | "    decoder_input = Input(shape=(None,len(output_vocab)))\n", 593 | "    decoder_LSTM = LSTM(512, return_sequences=True, return_state = True)\n", 594 | "    decoder_out, _ , _ = decoder_LSTM(decoder_input, initial_state=encoder_states)\n", 595 | "    decoder_dense = Dense(len(output_vocab), activation='softmax')\n", 596 | "    decoder_out = decoder_dense(decoder_out)\n", 597 | "    \n", 598 | "    model = Model(inputs=[encoder_input, decoder_input],outputs=[decoder_out])\n", 599 | "\n", 600 | "    model.summary()\n", 601 | "\n", 602 | "    return encoder_input, encoder_states, decoder_input, decoder_LSTM, decoder_dense, model" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": 80, 608 | "metadata": {}, 609 | "outputs": [], 610 | "source": [ 611 | "def compile_and_train(model, tokenized_input, tokenized_output, batch_size, epochs, validation_split):\n", 612 | "    # note that target_data is read from the surrounding scope, not passed as an argument\n", 613 | "    model.compile(optimizer='adam', loss='categorical_crossentropy')\n", 614 | "    model.fit(x=[tokenized_input,tokenized_output], \n", 615 | "              y=target_data,\n", 616 | "              batch_size=batch_size,\n", 617 | "              epochs=epochs,\n", 618 | "              validation_split=validation_split)\n", 619 | "    \n", 620 | "    return model" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": 81, 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | "nb_samples = 70000  # number of clauses that will be one-hot encoded\n", 630 | "\n", 631 | "input_clauses, output_pos, input_chars, output_vocab, max_len_input, max_len_output = prepare_train_data(train_books)\n", 632 | "input_idx2char, input_char2idx, output_idx2char, output_char2idx = create_dicts(input_chars, output_vocab)\n", 633 | "tokenized_input, tokenized_output, target_data = one_hot_encode(nb_samples, max_len_input, max_len_output, input_chars, output_vocab, input_char2idx, output_char2idx, input_clauses, output_pos)" 634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": 82, 639 | "metadata": {}, 640 | "outputs": [], 641 | "source": [ 642 | "test_clauses = prepare_test_data(test_books)\n", 643 | "tokenized_test_data, _, _ = one_hot_encode(len(test_clauses), max_len_input, max_len_output, input_chars, output_vocab, input_char2idx, output_char2idx, test_clauses, output_pos)" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": 83, 649 | "metadata": {}, 650 | "outputs": [ 651 | { 652 | "name": "stdout", 653 | "output_type": "stream", 654 | "text": [ 655 | "__________________________________________________________________________________________________\n", 656 | "Layer (type) Output Shape Param # Connected to \n", 657 | "==================================================================================================\n", 658 | "input_19 (InputLayer) (None, None, 25) 0 \n", 659 | "__________________________________________________________________________________________________\n", 660 | "lstm_19 (LSTM) [(None, None, 512), 1101824 input_19[0][0] \n", 661 | 
"__________________________________________________________________________________________________\n", 662 | "input_20 (InputLayer) (None, None, 16) 0 \n", 663 | "__________________________________________________________________________________________________\n", 664 | "lstm_20 (LSTM) [(None, 512), (None, 2099200 lstm_19[0][0] \n", 665 | " lstm_19[0][1] \n", 666 | " lstm_19[0][2] \n", 667 | "__________________________________________________________________________________________________\n", 668 | "lstm_21 (LSTM) [(None, None, 512), 1083392 input_20[0][0] \n", 669 | " lstm_20[0][1] \n", 670 | " lstm_20[0][2] \n", 671 | "__________________________________________________________________________________________________\n", 672 | "dense_7 (Dense) (None, None, 16) 8208 lstm_21[0][0] \n", 673 | "==================================================================================================\n", 674 | "Total params: 4,292,624\n", 675 | "Trainable params: 4,292,624\n", 676 | "Non-trainable params: 0\n", 677 | "__________________________________________________________________________________________________\n", 678 | "Train on 63000 samples, validate on 7000 samples\n", 679 | "Epoch 1/70\n", 680 | "63000/63000 [==============================] - 52s 825us/step - loss: 0.7861 - val_loss: 0.6606\n", 681 | "Epoch 2/70\n", 682 | "63000/63000 [==============================] - 44s 699us/step - loss: 0.6634 - val_loss: 0.5711\n", 683 | "Epoch 3/70\n", 684 | "63000/63000 [==============================] - 44s 701us/step - loss: 0.6318 - val_loss: 0.5787\n", 685 | "Epoch 4/70\n", 686 | "63000/63000 [==============================] - 44s 702us/step - loss: 0.5980 - val_loss: 0.5530\n", 687 | "Epoch 5/70\n", 688 | "63000/63000 [==============================] - 44s 705us/step - loss: 0.5784 - val_loss: 0.5595\n", 689 | "Epoch 6/70\n", 690 | "63000/63000 [==============================] - 45s 707us/step - loss: 0.5473 - val_loss: 0.5091\n", 691 | "Epoch 7/70\n", 692 | "63000/63000 [==============================] - 44s 704us/step - loss: 0.5193 - val_loss: 0.4841\n", 693 | "Epoch 8/70\n", 694 | "63000/63000 [==============================] - 44s 706us/step - loss: 0.4964 - val_loss: 0.4656\n", 695 | "Epoch 9/70\n", 696 | "63000/63000 [==============================] - 44s 706us/step - loss: 0.4872 - val_loss: 0.4460\n", 697 | "Epoch 10/70\n", 698 | "63000/63000 [==============================] - 44s 704us/step - loss: 0.4749 - val_loss: 0.4367\n", 699 | "Epoch 11/70\n", 700 | "63000/63000 [==============================] - 44s 704us/step - loss: 0.4578 - val_loss: 0.4266\n", 701 | "Epoch 12/70\n", 702 | "63000/63000 [==============================] - 45s 710us/step - loss: 0.4450 - val_loss: 0.4114\n", 703 | "Epoch 13/70\n", 704 | "63000/63000 [==============================] - 45s 707us/step - loss: 0.4334 - val_loss: 0.4190\n", 705 | "Epoch 14/70\n", 706 | "63000/63000 [==============================] - 45s 710us/step - loss: 0.4264 - val_loss: 0.4050\n", 707 | "Epoch 15/70\n", 708 | "63000/63000 [==============================] - 46s 726us/step - loss: 0.4168 - val_loss: 0.3859\n", 709 | "Epoch 16/70\n", 710 | "63000/63000 [==============================] - 45s 710us/step - loss: 0.3936 - val_loss: 0.4555\n", 711 | "Epoch 17/70\n", 712 | "63000/63000 [==============================] - 45s 709us/step - loss: 0.3846 - val_loss: 0.3311\n", 713 | "Epoch 18/70\n", 714 | "63000/63000 [==============================] - 45s 711us/step - loss: 0.3189 - val_loss: 0.2705\n", 715 | "Epoch 19/70\n", 716 | 
"63000/63000 [==============================] - 45s 717us/step - loss: 0.2588 - val_loss: 0.2386\n", 717 | "Epoch 20/70\n", 718 | "63000/63000 [==============================] - 45s 717us/step - loss: 0.2141 - val_loss: 0.2200\n", 719 | "Epoch 21/70\n", 720 | "63000/63000 [==============================] - 45s 714us/step - loss: 0.2089 - val_loss: 0.1941\n", 721 | "Epoch 22/70\n", 722 | "63000/63000 [==============================] - 45s 715us/step - loss: 0.1698 - val_loss: 0.2052\n", 723 | "Epoch 23/70\n", 724 | "63000/63000 [==============================] - 45s 715us/step - loss: 0.1563 - val_loss: 0.2052\n", 725 | "Epoch 24/70\n", 726 | "63000/63000 [==============================] - 44s 703us/step - loss: 0.4989 - val_loss: 0.4322\n", 727 | "Epoch 25/70\n", 728 | "63000/63000 [==============================] - 44s 705us/step - loss: 0.4049 - val_loss: 0.3274\n", 729 | "Epoch 26/70\n", 730 | "63000/63000 [==============================] - 45s 714us/step - loss: 0.2439 - val_loss: 0.2030\n", 731 | "Epoch 27/70\n", 732 | "63000/63000 [==============================] - 45s 712us/step - loss: 0.1683 - val_loss: 0.1778\n", 733 | "Epoch 28/70\n", 734 | "63000/63000 [==============================] - 45s 714us/step - loss: 0.1444 - val_loss: 0.1777\n", 735 | "Epoch 29/70\n", 736 | "63000/63000 [==============================] - 45s 714us/step - loss: 0.1347 - val_loss: 0.1568\n", 737 | "Epoch 30/70\n", 738 | "63000/63000 [==============================] - 45s 714us/step - loss: 0.1195 - val_loss: 0.1517\n", 739 | "Epoch 31/70\n", 740 | "63000/63000 [==============================] - 45s 711us/step - loss: 0.1107 - val_loss: 0.1488\n", 741 | "Epoch 32/70\n", 742 | "63000/63000 [==============================] - 45s 712us/step - loss: 0.1039 - val_loss: 0.1895\n", 743 | "Epoch 33/70\n", 744 | "63000/63000 [==============================] - 45s 709us/step - loss: 0.2501 - val_loss: 0.4811\n", 745 | "Epoch 34/70\n", 746 | "63000/63000 [==============================] - 44s 706us/step - loss: 0.3742 - val_loss: 0.2239\n", 747 | "Epoch 35/70\n", 748 | "63000/63000 [==============================] - 45s 709us/step - loss: 0.1743 - val_loss: 0.1717\n", 749 | "Epoch 36/70\n", 750 | "63000/63000 [==============================] - 45s 708us/step - loss: 0.1316 - val_loss: 0.1513\n", 751 | "Epoch 37/70\n", 752 | "63000/63000 [==============================] - 45s 712us/step - loss: 0.1174 - val_loss: 0.1447\n", 753 | "Epoch 38/70\n", 754 | "63000/63000 [==============================] - 45s 710us/step - loss: 0.1036 - val_loss: 0.1453\n", 755 | "Epoch 39/70\n", 756 | "63000/63000 [==============================] - 45s 713us/step - loss: 0.0972 - val_loss: 0.1380\n", 757 | "Epoch 40/70\n", 758 | "63000/63000 [==============================] - 45s 712us/step - loss: 0.0903 - val_loss: 0.1353\n", 759 | "Epoch 41/70\n", 760 | "63000/63000 [==============================] - 45s 711us/step - loss: 0.0846 - val_loss: 0.1305\n", 761 | "Epoch 42/70\n", 762 | "63000/63000 [==============================] - 45s 712us/step - loss: 0.0802 - val_loss: 0.1491\n", 763 | "Epoch 43/70\n", 764 | "63000/63000 [==============================] - 45s 713us/step - loss: 0.0757 - val_loss: 0.1978\n", 765 | "Epoch 44/70\n", 766 | "63000/63000 [==============================] - 45s 710us/step - loss: 0.0739 - val_loss: 0.1352\n", 767 | "Epoch 45/70\n", 768 | "63000/63000 [==============================] - 45s 712us/step - loss: 0.0681 - val_loss: 0.1400\n", 769 | "Epoch 46/70\n", 770 | "63000/63000 
[==============================] - 45s 712us/step - loss: 0.0641 - val_loss: 0.1324\n", 771 | "Epoch 47/70\n", 772 | "63000/63000 [==============================] - 45s 711us/step - loss: 0.0683 - val_loss: 0.1251\n", 773 | "Epoch 48/70\n", 774 | "63000/63000 [==============================] - 45s 711us/step - loss: 0.0765 - val_loss: 0.1211\n", 775 | "Epoch 49/70\n", 776 | "63000/63000 [==============================] - 45s 711us/step - loss: 0.0606 - val_loss: 0.1417\n", 777 | "Epoch 50/70\n", 778 | "63000/63000 [==============================] - 45s 710us/step - loss: 0.0565 - val_loss: 0.1354\n", 779 | "Epoch 51/70\n", 780 | "63000/63000 [==============================] - 45s 711us/step - loss: 0.0522 - val_loss: 0.1170\n", 781 | "Epoch 52/70\n", 782 | "63000/63000 [==============================] - 45s 714us/step - loss: 0.0486 - val_loss: 0.1247\n", 783 | "Epoch 53/70\n", 784 | "63000/63000 [==============================] - 45s 713us/step - loss: 0.0476 - val_loss: 0.1172\n", 785 | "Epoch 54/70\n", 786 | "63000/63000 [==============================] - 45s 711us/step - loss: 0.0436 - val_loss: 0.1193\n", 787 | "Epoch 55/70\n", 788 | "63000/63000 [==============================] - 45s 713us/step - loss: 0.0402 - val_loss: 0.1202\n", 789 | "Epoch 56/70\n", 790 | "63000/63000 [==============================] - 45s 711us/step - loss: 0.0374 - val_loss: 0.1219\n", 791 | "Epoch 57/70\n" 792 | ] 793 | }, 794 | { 795 | "name": "stdout", 796 | "output_type": "stream", 797 | "text": [ 798 | "63000/63000 [==============================] - 45s 712us/step - loss: 0.0350 - val_loss: 0.1209\n", 799 | "Epoch 58/70\n", 800 | "63000/63000 [==============================] - 45s 711us/step - loss: 0.0314 - val_loss: 0.1267\n", 801 | "Epoch 59/70\n", 802 | "63000/63000 [==============================] - 45s 713us/step - loss: 0.0310 - val_loss: 0.1213\n", 803 | "Epoch 60/70\n", 804 | "63000/63000 [==============================] - 45s 712us/step - loss: 0.0288 - val_loss: 0.1251\n", 805 | "Epoch 61/70\n", 806 | "63000/63000 [==============================] - 45s 715us/step - loss: 0.0277 - val_loss: 0.1496\n", 807 | "Epoch 62/70\n", 808 | "63000/63000 [==============================] - 45s 710us/step - loss: 0.0245 - val_loss: 0.1470\n", 809 | "Epoch 63/70\n", 810 | "63000/63000 [==============================] - 45s 715us/step - loss: 0.0299 - val_loss: 0.1292\n", 811 | "Epoch 64/70\n", 812 | "63000/63000 [==============================] - 45s 715us/step - loss: 0.0212 - val_loss: 0.1357\n", 813 | "Epoch 65/70\n", 814 | "63000/63000 [==============================] - 45s 716us/step - loss: 0.0175 - val_loss: 0.1396\n", 815 | "Epoch 66/70\n", 816 | "63000/63000 [==============================] - 45s 712us/step - loss: 0.0194 - val_loss: 0.1389\n", 817 | "Epoch 67/70\n", 818 | "63000/63000 [==============================] - 45s 711us/step - loss: 0.0155 - val_loss: 0.1395\n", 819 | "Epoch 68/70\n", 820 | "63000/63000 [==============================] - 45s 711us/step - loss: 0.0131 - val_loss: 0.1545\n", 821 | "Epoch 69/70\n", 822 | "63000/63000 [==============================] - 45s 711us/step - loss: 0.0144 - val_loss: 0.1578\n", 823 | "Epoch 70/70\n", 824 | "63000/63000 [==============================] - 45s 713us/step - loss: 0.0206 - val_loss: 0.1472\n" 825 | ] 826 | } 827 | ], 828 | "source": [ 829 | "encoder_input, encoder_states, decoder_input, decoder_LSTM, decoder_dense, model = define_LSTM_model(input_chars, output_vocab)\n", 830 | "model = compile_and_train(model, tokenized_input, 
tokenized_output, 512, 70, 0.1)" 831 | ] 832 | }, 833 | { 834 | "cell_type": "code", 835 | "execution_count": 84, 836 | "metadata": {}, 837 | "outputs": [], 838 | "source": [ 839 | "# Inference models for testing\n", 840 | "\n", 841 | "# Encoder inference model\n", 842 | "encoder_model_inf = Model(encoder_input, encoder_states)\n", 843 | "\n", 844 | "# Decoder inference model\n", 845 | "decoder_state_input_h = Input(shape=(512,))\n", 846 | "decoder_state_input_c = Input(shape=(512,))\n", 847 | "decoder_input_states = [decoder_state_input_h, decoder_state_input_c]\n", 848 | "\n", 849 | "decoder_out, decoder_h, decoder_c = decoder_LSTM(decoder_input, \n", 850 | " initial_state=decoder_input_states)\n", 851 | "\n", 852 | "decoder_states = [decoder_h , decoder_c]\n", 853 | "\n", 854 | "decoder_out = decoder_dense(decoder_out)\n", 855 | "\n", 856 | "decoder_model_inf = Model(inputs=[decoder_input] + decoder_input_states,\n", 857 | " outputs=[decoder_out] + decoder_states )" 858 | ] 859 | }, 860 | { 861 | "cell_type": "code", 862 | "execution_count": 85, 863 | "metadata": {}, 864 | "outputs": [], 865 | "source": [ 866 | "def decode_seq(inp_seq):\n", 867 | " \n", 868 | " # Initial states value is coming from the encoder \n", 869 | " states_val = encoder_model_inf.predict(inp_seq)\n", 870 | " \n", 871 | " target_seq = np.zeros((1, 1, len(output_vocab)))\n", 872 | " target_seq[0, 0, output_char2idx['\\t']] = 1\n", 873 | " \n", 874 | " translated_sent = ''\n", 875 | " pred_pos = []\n", 876 | " stop_condition = False\n", 877 | " \n", 878 | " while not stop_condition:\n", 879 | " \n", 880 | " decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)\n", 881 | " \n", 882 | " max_val_index = np.argmax(decoder_out[0,-1,:])\n", 883 | " sampled_out_char = output_idx2char[max_val_index]\n", 884 | " pred_pos.append(sampled_out_char)\n", 885 | " \n", 886 | " if (sampled_out_char == '\\n'):\n", 887 | " stop_condition = True\n", 888 | " \n", 889 | " target_seq = np.zeros((1, 1, len(output_vocab)))\n", 890 | " target_seq[0, 0, max_val_index] = 1\n", 891 | " \n", 892 | " states_val = [decoder_h, decoder_c]\n", 893 | " \n", 894 | " return pred_pos\n", 895 | "\n" 896 | ] 897 | }, 898 | { 899 | "cell_type": "code", 900 | "execution_count": 86, 901 | "metadata": {}, 902 | "outputs": [ 903 | { 904 | "name": "stdout", 905 | "output_type": "stream", 906 | "text": [ 907 | "-\n", 908 | "Input sentence: W JHJ DBR JHWH >L JWNH BN >MTJ\n", 909 | "Decoded sentence: ['conj', 'verb', 'subs', 'nmpr', 'prep', 'subs', 'subs', 'subs']\n", 910 | "-\n", 911 | "Input sentence: L >MR\n", 912 | "Decoded sentence: ['prep', 'verb']\n", 913 | "-\n", 914 | "Input sentence: QWM\n", 915 | "Decoded sentence: ['verb']\n", 916 | "-\n", 917 | "Input sentence: LK >L NJNWH H >NJH\n", 936 | "Decoded sentence: ['conj', 'verb', 'nmpr']\n", 937 | "-\n", 938 | "Input sentence: B>H TRCJC\n", 939 | "Decoded sentence: ['verb', 'nmpr']\n", 940 | "-\n", 941 | "Input sentence: W JTN FKRH\n", 942 | "Decoded sentence: ['conj', 'verb', 'subs']\n", 943 | "-\n", 944 | "Input sentence: W JRD BH\n", 945 | "Decoded sentence: ['conj', 'verb', 'prep']\n", 946 | "-\n", 947 | "Input sentence: L BW> L H JM\n", 951 | "Decoded sentence: ['conj', 'nmpr', 'verb', 'subs', 'subs', 'prep', 'art', 'subs']\n", 952 | "-\n", 953 | "Input sentence: W JHJ SNJH XCBH\n", 957 | "Decoded sentence: ['conj', 'art', 'subs', 'verb']\n", 958 | "-\n", 959 | "Input sentence: L HCBR\n", 960 | "Decoded sentence: ['prep', 'verb']\n", 961 | "-\n", 962 | "Input 
sentence: W JJR>W H MLXJM\n", 963 | "Decoded sentence: ['conj', 'verb', 'art', 'subs']\n", 964 | "-\n", 965 | "Input sentence: W JZJC >L >LHJW\n", 969 | "Decoded sentence: ['subs', 'prep', 'subs']\n", 970 | "-\n", 971 | "Input sentence: W JVLW >T H KLJM >L H JM\n", 972 | "Decoded sentence: ['conj', 'verb', 'prep', 'art', 'subs', 'prep', 'art', 'subs']\n", 973 | "-\n", 974 | "Input sentence: >CR B >NJH\n", 975 | "Decoded sentence: ['conj', 'prep', 'art', 'subs']\n", 976 | "-\n", 977 | "Input sentence: L HQL M L JRKTJ H SPJNH\n", 981 | "Decoded sentence: ['conj', 'nmpr', 'verb', 'prep', 'subs', 'art', 'nmpr']\n", 982 | "-\n", 983 | "Input sentence: W JCKB\n", 984 | "Decoded sentence: ['conj', 'verb']\n", 985 | "-\n", 986 | "Input sentence: W JRDM\n", 987 | "Decoded sentence: ['conj', 'verb']\n", 988 | "-\n", 989 | "Input sentence: W JQRB >LJW RB H XBL\n", 990 | "Decoded sentence: ['conj', 'verb', 'prep', 'subs', 'art', 'subs']\n", 991 | "-\n", 992 | "Input sentence: W J>MR LW\n", 993 | "Decoded sentence: ['conj', 'verb', 'prep']\n", 994 | "-\n", 995 | "Input sentence: MH LK\n", 996 | "Decoded sentence: ['prin', 'prep']\n", 997 | "-\n", 998 | "Input sentence: NRDM\n", 999 | "Decoded sentence: ['verb']\n", 1000 | "-\n", 1001 | "Input sentence: QWM\n", 1002 | "Decoded sentence: ['verb']\n", 1003 | "-\n", 1004 | "Input sentence: QR> >L >LHJK\n", 1005 | "Decoded sentence: ['verb', 'prep', 'subs']\n", 1006 | "-\n", 1007 | "Input sentence: >WLJ JTLHJM LNW\n", 1008 | "Decoded sentence: ['advb', 'verb', 'art', 'subs', 'prep']\n", 1009 | "-\n", 1010 | "Input sentence: W L> N>BD\n", 1011 | "Decoded sentence: ['conj', 'nega', 'verb']\n", 1012 | "-\n", 1013 | "Input sentence: W J>MRW\n", 1014 | "Decoded sentence: ['conj', 'verb']\n", 1015 | "-\n", 1016 | "Input sentence: >JC >L RT LNW\n", 1029 | "Decoded sentence: ['prep', 'prep', 'prep', 'prep', 'art', 'subs', 'art', 'prde']\n", 1030 | "-\n", 1031 | "Input sentence: W JPLW GWRLWT\n", 1032 | "Decoded sentence: ['conj', 'verb', 'subs']\n", 1033 | "-\n", 1034 | "Input sentence: W JPL H GWRL MRW >LJW\n", 1038 | "Decoded sentence: ['conj', 'verb', 'prep']\n", 1039 | "-\n", 1040 | "Input sentence: HGJDH N> LNW\n", 1041 | "Decoded sentence: ['verb', 'intj', 'prep']\n", 1042 | "-\n", 1043 | "Input sentence: B >CR L MJ H RT LNW\n", 1044 | "Decoded sentence: ['prep', 'conj', 'prep', 'subs', 'art', 'subs', 'art', 'prde', 'prep']\n", 1045 | "-\n", 1046 | "Input sentence: MH ML>KTK\n", 1047 | "Decoded sentence: ['prin', 'subs']\n", 1048 | "-\n", 1049 | "Input sentence: W M >JN TBW>\n", 1050 | "Decoded sentence: ['conj', 'prep', 'inrg', 'verb']\n", 1051 | "-\n", 1052 | "Input sentence: MH >RYK\n", 1053 | "Decoded sentence: ['prin', 'subs']\n", 1054 | "-\n", 1055 | "Input sentence: W >J M ZH TH\n", 1056 | "Decoded sentence: ['conj', 'prin', 'prep', 'prde', 'prep', 'prps']\n", 1057 | "-\n", 1058 | "Input sentence: W J>MR >LJHM\n", 1059 | "Decoded sentence: ['conj', 'verb', 'prep']\n", 1060 | "-\n", 1061 | "Input sentence: NKJ\n", 1062 | "Decoded sentence: ['verb', 'prps']\n", 1063 | "-\n", 1064 | "Input sentence: W >T JHWH >LHJ H CMJM >NJ JR>\n", 1065 | "Decoded sentence: ['conj', 'prep', 'nmpr', 'subs', 'art', 'subs', 'prps', 'verb']\n", 1066 | "-\n", 1067 | "Input sentence: >CR T H JM W >T H JBCH\n", 1068 | "Decoded sentence: ['conj', 'verb', 'prep', 'art', 'subs', 'conj', 'prep', 'art', 'subs']\n", 1069 | "-\n", 1070 | "Input sentence: W JJR>W H >NCJM JR>H GDWLH\n", 1071 | "Decoded sentence: ['conj', 'verb', 'art', 'subs', 'subs', 'adjv']\n", 1072 | "-\n", 1073 | 
"Input sentence: W J>MRW >LJW\n", 1074 | "Decoded sentence: ['conj', 'verb', 'prep']\n", 1075 | "-\n", 1076 | "Input sentence: MH Z>T NCJM\n", 1080 | "Decoded sentence: ['conj', 'verb', 'art', 'subs']\n", 1081 | "-\n", 1082 | "Input sentence: KJ M L PNJ JHWH HW> BRX\n", 1083 | "Decoded sentence: ['conj', 'prep', 'prep', 'subs', 'nmpr', 'prps', 'verb']\n", 1084 | "-\n", 1085 | "Input sentence: KJ HGJD LHM\n", 1086 | "Decoded sentence: ['conj', 'verb', 'prep']\n", 1087 | "-\n", 1088 | "Input sentence: W J>MRW >LJW\n", 1089 | "Decoded sentence: ['conj', 'verb', 'prep']\n", 1090 | "-\n", 1091 | "Input sentence: MH NMR >LJHM\n", 1104 | "Decoded sentence: ['conj', 'verb', 'prep']\n", 1105 | "-\n", 1106 | "Input sentence: F>WNJ\n", 1107 | "Decoded sentence: ['verb']\n", 1108 | "-\n", 1109 | "Input sentence: W HVJLNJ >L H JM\n", 1110 | "Decoded sentence: ['conj', 'verb', 'prep', 'art', 'subs']\n", 1111 | "-\n", 1112 | "Input sentence: W JCTQ H JM M NJ\n", 1116 | "Decoded sentence: ['conj', 'verb', 'prps']\n", 1117 | "-\n", 1118 | "Input sentence: W JXTRW H >NCJM\n", 1119 | "Decoded sentence: ['conj', 'verb', 'art', 'subs']\n", 1120 | "-\n", 1121 | "Input sentence: L HCJB >L H JBCH\n", 1122 | "Decoded sentence: ['prep', 'verb', 'prep', 'art', 'subs']\n", 1123 | "-\n", 1124 | "Input sentence: W L> JKLW\n", 1125 | "Decoded sentence: ['conj', 'nega', 'verb']\n", 1126 | "-\n", 1127 | "Input sentence: KJ H JM HWLK\n", 1128 | "Decoded sentence: ['conj', 'art', 'subs', 'verb']\n", 1129 | "-\n", 1130 | "Input sentence: W SW >L JHWH\n", 1134 | "Decoded sentence: ['conj', 'verb', 'prep', 'nmpr']\n", 1135 | "-\n", 1136 | "Input sentence: W J>MRW\n", 1137 | "Decoded sentence: ['conj', 'verb']\n", 1138 | "-\n", 1139 | "Input sentence: >NH JHWH\n", 1140 | "Decoded sentence: ['inrg', 'nmpr']\n", 1141 | "-\n", 1142 | "Input sentence: >L N> N>BDH B NPC H >JC H ZH\n", 1143 | "Decoded sentence: ['nega', 'intj', 'verb', 'prep', 'art', 'subs', 'art', 'prde', 'art', 'prde']\n", 1144 | "-\n", 1145 | "Input sentence: W >L TTN \n", 1146 | "Decoded sentence: ['conj', 'nega', 'verb', 'prep', 'subs', 'adjv']\n", 1147 | "-\n", 1148 | "Input sentence: KJ >TH CR XPYT\n", 1155 | "Decoded sentence: ['prep', 'conj', 'verb']\n", 1156 | "-\n", 1157 | "Input sentence: W JF>W >T JWNH\n", 1158 | "Decoded sentence: ['conj', 'verb', 'prep', 'nmpr']\n", 1159 | "-\n", 1160 | "Input sentence: W JVLHW >L H JM\n", 1161 | "Decoded sentence: ['conj', 'verb', 'prep', 'art', 'subs']\n", 1162 | "-\n", 1163 | "Input sentence: W JW H >NCJM JR>H GDWLH >T JHWH\n", 1167 | "Decoded sentence: ['conj', 'verb', 'art', 'subs', 'subs', 'nmpr', 'prep', 'nmpr']\n", 1168 | "-\n", 1169 | "Input sentence: W JZBXW ZBX L JHWH\n", 1170 | "Decoded sentence: ['conj', 'verb', 'subs', 'prep', 'nmpr']\n", 1171 | "-\n", 1172 | "Input sentence: W JDRW NDRJM\n", 1173 | "Decoded sentence: ['conj', 'verb', 'subs']\n", 1174 | "-\n", 1175 | "Input sentence: W JMN JHWH DG GDWL\n", 1176 | "Decoded sentence: ['conj', 'verb', 'nmpr', 'subs', 'adjv']\n", 1177 | "-\n", 1178 | "Input sentence: L BL< >T JWNH\n", 1179 | "Decoded sentence: ['prep', 'verb', 'prep', 'nmpr']\n", 1180 | "-\n", 1181 | "Input sentence: W JTPLL JWNH >L JHWH >LHJW M MMR\n", 1185 | "Decoded sentence: ['conj', 'verb']\n", 1186 | "-\n", 1187 | "Input sentence: QR>TJ M YRH LJ >L JHWH\n", 1188 | "Decoded sentence: ['verb', 'prep', 'subs', 'prep', 'prep', 'nmpr']\n", 1189 | "-\n", 1190 | "Input sentence: W JWL CWNJ >MRTJ\n", 1209 | "Decoded sentence: ['conj', 'prps', 'verb']\n", 1210 | "-\n", 1211 | "Input sentence: 
NGRCTJ M NGD K >WSJP\n", 1221 | "Decoded sentence: ['advb', 'verb']\n", 1222 | "-\n", 1223 | "Input sentence: L HBJV >L HJKL QDCK\n", 1224 | "Decoded sentence: ['prep', 'verb', 'prep', 'subs', 'subs']\n", 1225 | "-\n", 1226 | "Input sentence: >PPWNJ MJM CJ\n", 1233 | "Decoded sentence: ['verb', 'subs', 'prep', 'subs']\n", 1234 | "-\n", 1235 | "Input sentence: L QYBJ HRJM JRDTJ\n", 1236 | "Decoded sentence: ['prep', 'subs', 'verb', 'verb']\n", 1237 | "-\n", 1238 | "Input sentence: H >RY\n", 1239 | "Decoded sentence: ['art', 'subs']\n", 1240 | "-\n", 1241 | "Input sentence: BRXJH BLHJ\n", 1248 | "Decoded sentence: ['nmpr', 'subs']\n", 1249 | "-\n", 1250 | "Input sentence: B HTT JHWH ZKRTJ\n", 1254 | "Decoded sentence: ['prep', 'nmpr', 'verb']\n", 1255 | "-\n", 1256 | "Input sentence: W TBW> >LJK TPLTJ >L HJKL QDCK\n", 1257 | "Decoded sentence: ['conj', 'verb', 'prep', 'subs', 'prep', 'subs', 'subs']\n", 1258 | "-\n", 1259 | "Input sentence: MCMRJM HBLJ CW>\n", 1260 | "Decoded sentence: ['verb', 'subs', 'subs']\n", 1261 | "-\n", 1262 | "Input sentence: XSDM JNJ\n", 1266 | "Decoded sentence: ['conj', 'prps']\n", 1267 | "-\n", 1268 | "Input sentence: B QWL TWDH >ZBXH LK\n", 1269 | "Decoded sentence: ['prep', 'subs', 'subs', 'verb', 'prep']\n", 1270 | "-\n", 1271 | "Input sentence: >CR NDRTJ\n", 1272 | "Decoded sentence: ['conj', 'verb']\n", 1273 | "-\n", 1274 | "Input sentence: >CLMH\n", 1275 | "Decoded sentence: ['verb']\n", 1276 | "-\n", 1277 | "Input sentence: JCWMR JHWH L DG\n", 1281 | "Decoded sentence: ['conj', 'verb', 'nmpr', 'prep', 'art', 'subs']\n", 1282 | "-\n", 1283 | "Input sentence: W JQ> >T JWNH >L H JBCH\n", 1284 | "Decoded sentence: ['conj', 'verb', 'prep', 'nmpr', 'prep', 'art', 'subs']\n", 1285 | "-\n", 1286 | "Input sentence: W JHJ DBR JHWH >L JWNH CNJT\n", 1287 | "Decoded sentence: ['conj', 'verb', 'subs', 'nmpr', 'prep', 'subs', 'nmpr']\n", 1288 | "-\n", 1289 | "Input sentence: L >MR\n", 1290 | "Decoded sentence: ['prep', 'verb']\n", 1291 | "-\n", 1292 | "Input sentence: QWM\n", 1293 | "Decoded sentence: ['verb']\n", 1294 | "-\n", 1295 | "Input sentence: LK >L NJNWH H >LJH >T H QRJ>H\n", 1299 | "Decoded sentence: ['conj', 'verb', 'prep', 'prep', 'art', 'subs']\n", 1300 | "-\n", 1301 | "Input sentence: >CR >NKJ DBR >LJK\n", 1302 | "Decoded sentence: ['conj', 'prps', 'verb', 'prep']\n", 1303 | "-\n", 1304 | "Input sentence: W JQM JWNH\n", 1305 | "Decoded sentence: ['conj', 'verb', 'nmpr']\n", 1306 | "-\n", 1307 | "Input sentence: W JLK >L NJNWH K DBR JHWH\n", 1308 | "Decoded sentence: ['conj', 'verb', 'prep', 'nmpr', 'prep', 'subs', 'nmpr']\n", 1309 | "-\n", 1310 | "Input sentence: W NJNWH HJTH LHJM\n", 1311 | "Decoded sentence: ['conj', 'nmpr', 'verb', 'subs', 'adjv', 'prep', 'subs']\n", 1312 | "-\n", 1313 | "Input sentence: MHLK CLCT JMJM\n", 1314 | "Decoded sentence: ['verb', 'adjv', 'subs']\n", 1315 | "-\n", 1316 | "Input sentence: W JXL JWNH\n", 1317 | "Decoded sentence: ['conj', 'verb', 'nmpr']\n", 1318 | "-\n", 1319 | "Input sentence: L BW> B XD\n", 1320 | "Decoded sentence: ['prep', 'verb', 'prep', 'art', 'subs', 'prep', 'subs', 'subs']\n", 1321 | "-\n", 1322 | "Input sentence: W JQR>\n", 1323 | "Decoded sentence: ['conj', 'verb']\n", 1324 | "-\n", 1325 | "Input sentence: W J>MR\n", 1326 | "Decoded sentence: ['conj', 'verb']\n", 1327 | "-\n", 1328 | "Input sentence: RBMJNW >NCJ NJNWH B >LHJM\n", 1335 | "Decoded sentence: ['conj', 'verb', 'subs', 'nmpr', 'prep', 'subs']\n", 1336 | "-\n", 1337 | "Input sentence: W JQR>W YWM\n", 1338 | "Decoded sentence: ['conj', 
'verb', 'subs']\n", 1339 | "-\n", 1340 | "Input sentence: W JLBCW FQJM M GDWLM W L MLK NJNWH\n", 1344 | "Decoded sentence: ['conj', 'verb', 'art', 'subs', 'prep', 'subs', 'nmpr']\n", 1345 | "-\n", 1346 | "Input sentence: W JQM M KS>W\n", 1347 | "Decoded sentence: ['conj', 'verb', 'prep', 'subs']\n", 1348 | "-\n", 1349 | "Input sentence: W J
DRTW M PR\n", 1356 | "Decoded sentence: ['conj', 'verb', 'prep', 'art', 'subs']\n", 1357 | "-\n", 1358 | "Input sentence: W JZMR\n", 1362 | "Decoded sentence: ['conj', 'verb']\n", 1363 | "-\n", 1364 | "Input sentence: B NJNWH M VMR\n", 1368 | "Decoded sentence: ['prep', 'verb']\n", 1369 | "-\n", 1370 | "Input sentence: >L JRL JCTW\n", 1374 | "Decoded sentence: ['conj', 'subs', 'prep', 'verb']\n", 1375 | "-\n", 1376 | "Input sentence: W JTKSW FQJM H >DM W H BHMH\n", 1377 | "Decoded sentence: ['conj', 'verb', 'subs', 'art', 'subs', 'conj', 'art', 'subs']\n", 1378 | "-\n", 1379 | "Input sentence: W JQR>W >L >LHJM B XZQH\n", 1380 | "Decoded sentence: ['conj', 'verb', 'prep', 'subs', 'prep', 'nmpr']\n", 1381 | "-\n", 1382 | "Input sentence: W JCBW\n", 1383 | "Decoded sentence: ['conj', 'verb']\n", 1384 | "-\n", 1385 | "Input sentence: >JC M DRKW H RCR B KPJHM\n", 1389 | "Decoded sentence: ['conj', 'prep', 'subs']\n", 1390 | "-\n", 1391 | "Input sentence: MJ JWD<\n", 1392 | "Decoded sentence: ['prin', 'verb']\n", 1393 | "-\n", 1394 | "Input sentence: JCWB\n", 1395 | "Decoded sentence: ['verb']\n", 1396 | "-\n", 1397 | "Input sentence: W NXM H >LHJM\n", 1398 | "Decoded sentence: ['conj', 'verb', 'art', 'subs']\n", 1399 | "-\n", 1400 | "Input sentence: W CB M XRWN >PW\n", 1401 | "Decoded sentence: ['conj', 'verb', 'prep', 'subs', 'subs']\n", 1402 | "-\n", 1403 | "Input sentence: W L> N>BD\n", 1404 | "Decoded sentence: ['conj', 'nega', 'verb']\n", 1405 | "-\n", 1406 | "Input sentence: W JR> H >LHJM >T MLHJM CR DBR\n", 1416 | "Decoded sentence: ['conj', 'verb']\n", 1417 | "-\n", 1418 | "Input sentence: L L JWNH RL JHWH\n", 1431 | "Decoded sentence: ['conj', 'verb', 'prep', 'nmpr']\n", 1432 | "-\n", 1433 | "Input sentence: W J>MR\n", 1434 | "Decoded sentence: ['conj', 'verb']\n", 1435 | "-\n", 1436 | "Input sentence: >NH JHWH\n", 1437 | "Decoded sentence: ['inrg', 'nmpr']\n", 1438 | "-\n", 1439 | "Input sentence: H LW> ZH DBRJ\n", 1440 | "Decoded sentence: ['inrg', 'nega', 'prde', 'subs']\n", 1441 | "-\n", 1442 | "Input sentence: DMTJ\n", 1443 | "Decoded sentence: ['prep', 'verb', 'prep', 'subs']\n", 1444 | "-\n", 1445 | "Input sentence: TH >L XNWN W RXWM\n", 1455 | "Decoded sentence: ['conj', 'prps', 'subs', 'nmpr', 'conj', 'subs']\n", 1456 | "-\n", 1457 | "Input sentence: >RK >PJM\n", 1458 | "Decoded sentence: ['adjv', 'subs']\n", 1459 | "-\n", 1460 | "Input sentence: W RB XSD\n", 1461 | "Decoded sentence: ['conj', 'adjv', 'subs']\n", 1462 | "-\n", 1463 | "Input sentence: W NXM >T NPCJ MMNJ\n", 1473 | "Decoded sentence: ['verb', 'intj', 'prep', 'subs', 'prep']\n", 1474 | "-\n", 1475 | "Input sentence: KJ VWB MWTJ M XJJ\n", 1476 | "Decoded sentence: ['conj', 'adjv', 'subs', 'prep', 'adjv']\n", 1477 | "-\n", 1478 | "Input sentence: W J>MR JHWH\n", 1479 | "Decoded sentence: ['conj', 'verb', 'nmpr']\n", 1480 | "-\n", 1481 | "Input sentence: H HJVB\n", 1482 | "Decoded sentence: ['inrg', 'verb']\n", 1483 | "-\n", 1484 | "Input sentence: XRH LK\n", 1485 | "Decoded sentence: ['verb', 'prep']\n", 1486 | "-\n", 1487 | "Input sentence: W JY> JWNH MN H CR JR>H\n", 1500 | "Decoded sentence: ['prep', 'conj', 'verb']\n", 1501 | "-\n", 1502 | "Input sentence: MH JHJH B LHJM QJQJWN\n", 1506 | "Decoded sentence: ['conj', 'verb', 'nmpr', 'subs', 'adjv']\n", 1507 | "-\n", 1508 | "Input sentence: W JCW\n", 1512 | "Decoded sentence: ['prep', 'verb', 'subs', 'prep', 'subs']\n", 1513 | "-\n", 1514 | "Input sentence: L HYJL LW M RLHJM TWLT H QJQJWN\n", 1533 | "Decoded sentence: ['conj', 'verb', 'prep', 'art', 'adjv']\n", 
1534 | "-\n", 1535 | "Input sentence: W JJBC\n", 1536 | "Decoded sentence: ['conj', 'verb']\n", 1537 | "-\n", 1538 | "Input sentence: W JHJ\n", 1539 | "Decoded sentence: ['conj', 'verb']\n", 1540 | "-\n", 1541 | "Input sentence: K ZRX H CMC\n", 1542 | "Decoded sentence: ['prep', 'verb', 'art', 'subs']\n", 1543 | "-\n", 1544 | "Input sentence: W JMN >LHJM RWX QDJM XRJCJT\n", 1545 | "Decoded sentence: ['conj', 'verb', 'subs', 'subs', 'subs', 'adjv']\n", 1546 | "-\n", 1547 | "Input sentence: W TK H CMC C JWNH\n", 1548 | "Decoded sentence: ['conj', 'verb', 'art', 'subs', 'prep', 'subs', 'nmpr']\n", 1549 | "-\n", 1550 | "Input sentence: W JTL >T NPCW\n", 1554 | "Decoded sentence: ['conj', 'verb', 'prep', 'subs']\n", 1555 | "-\n", 1556 | "Input sentence: L MWT\n", 1557 | "Decoded sentence: ['prep', 'verb']\n", 1558 | "-\n", 1559 | "Input sentence: W J>MR\n", 1560 | "Decoded sentence: ['conj', 'verb']\n", 1561 | "-\n", 1562 | "Input sentence: VWB MWTJ M XJJ\n", 1563 | "Decoded sentence: ['adjv', 'subs', 'prep', 'adjv']\n", 1564 | "-\n", 1565 | "Input sentence: W J>MR >LHJM >L JWNH\n", 1566 | "Decoded sentence: ['conj', 'verb', 'subs', 'prep', 'nmpr']\n", 1567 | "-\n", 1568 | "Input sentence: H HJVB\n", 1569 | "Decoded sentence: ['inrg', 'verb']\n", 1570 | "-\n", 1571 | "Input sentence: XRH LK MR\n", 1575 | "Decoded sentence: ['conj', 'verb']\n", 1576 | "-\n", 1577 | "Input sentence: HJVB\n", 1578 | "Decoded sentence: ['verb']\n" 1579 | ] 1580 | } 1581 | ], 1582 | "source": [ 1583 | "for seq_index in range(220):\n", 1584 | " inp_seq = tokenized_test_data[seq_index:seq_index+1]\n", 1585 | " \n", 1586 | " pred_pos = decode_seq(inp_seq)\n", 1587 | " print('-')\n", 1588 | " print('Input sentence:', test_clauses[seq_index])\n", 1589 | " print('Decoded sentence:', pred_pos[:-1])" 1590 | ] 1591 | } 1592 | ], 1593 | "metadata": { 1594 | "kernelspec": { 1595 | "display_name": "Python 3", 1596 | "language": "python", 1597 | "name": "python3" 1598 | }, 1599 | "language_info": { 1600 | "codemirror_mode": { 1601 | "name": "ipython", 1602 | "version": 3 1603 | }, 1604 | "file_extension": ".py", 1605 | "mimetype": "text/x-python", 1606 | "name": "python", 1607 | "nbconvert_exporter": "python", 1608 | "pygments_lexer": "ipython3", 1609 | "version": "3.6.5" 1610 | } 1611 | }, 1612 | "nbformat": 4, 1613 | "nbformat_minor": 2 1614 | } 1615 | --------------------------------------------------------------------------------
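The notebook prints the predicted tags for Jonah, but it does not score them against the gold annotation. A possible follow-up is sketched below; it is not part of the original notebook. Since `prepare_test_data` walks the clauses in the same order and with the same filters, the predictions can be aligned with the gold `sp` feature of the BHSA to compute a simple per-tag accuracy. The sketch assumes the variables and Text-Fabric handles (`F`, `T`, `L`) defined above are still in scope.

```python
# Illustrative sketch (not in the original notebook): per-tag accuracy on Jonah.
# Gold tags are collected with the same filters as prepare_test_data, so the
# i-th gold sequence corresponds to the i-th row of tokenized_test_data.
gold_sequences = []
for cl in F.otype.s("clause"):
    bo, _, _ = T.sectionFromNode(cl)
    if bo != "Jonah" or len(L.d(cl, "word")) > 10:
        continue
    gold_sequences.append([F.sp.v(w) for w in L.d(cl, "word")])

correct = 0
total = 0
for i, gold in enumerate(gold_sequences):
    pred = decode_seq(tokenized_test_data[i:i+1])[:-1]  # drop the stop symbol
    total += len(gold)
    # zip truncates when the predicted sequence is shorter or longer than the gold one
    correct += sum(g == p for g, p in zip(gold, pred))

print(f"Tag accuracy on Jonah: {correct / total:.3f}")
```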