├── .gitignore
├── LICENSE
├── README.md
├── XLNet.ipynb
├── data.txt
├── data_utils.py
├── images
│   ├── AEmodel.png
│   ├── ARmodel.png
│   ├── PLM.png
│   ├── ParPrediction.png
│   ├── hyperparameters.png
│   ├── target-aware.png
│   └── twoattn.png
├── main.py
└── xlnet.py

/.gitignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graykode/xlnet-Pytorch/cb793a1c75bdc59e3360f04ec641af726719811f/.gitignore
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 | 
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 | 1. Definitions.
8 | 
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 | 
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 | 
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 | 
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 | 
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 | 
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 | 
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 | 
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner.
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 | 
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 | 
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 | 
176 | END OF TERMS AND CONDITIONS
177 | 
178 | APPENDIX: How to apply the Apache License to your work.
179 | 
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 | 
189 | Copyright 2019 Tae Hwan Jung
190 | 
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 | 
195 | http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## XLNet-Pytorch [arxiv:1906.08237](https://arxiv.org/pdf/1906.08237.pdf)
2 | 
3 | **Simple XLNet implementation with Pytorch Wrapper!**
4 | 
5 | #### You can see how the XLNet architecture works in pre-training with a small batch size (=1) example.
6 | 
7 | #### Usage
8 | 
9 | ```shell
10 | $ git clone https://github.com/graykode/xlnet-Pytorch && cd xlnet-Pytorch
11 | 
12 | # To use the pretrained BERT tokenizer as the subword tokenizer
13 | $ pip install pytorch_pretrained_bert
14 | 
15 | $ python main.py --data ./data.txt --tokenizer bert-base-uncased \
16 | --seq_len 512 --reuse_len 256 --perm_size 256 \
17 | --bi_data True --mask_alpha 6 --mask_beta 1 \
18 | --num_predict 85 --mem_len 384 --num_epoch 100
19 | ```
20 | 
21 | You can also easily run the code in [Google Colab](https://colab.research.google.com/github/graykode/xlnet-Pytorch/blob/master/XLNet.ipynb).
22 | 
23 | - Hyperparameters for pretraining, as given in the paper.
24 | 
25 | ![](images/hyperparameters.png)
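
To make the masking-related options below (`--seq_len`, `--perm_size`, `--mask_alpha`, `--mask_beta`, `--num_predict`) a bit more concrete, here is a minimal, self-contained PyTorch sketch of the idea behind permutation language modeling with partial prediction. It is an illustration only, not the repository's `data_utils.py`; the function name `sample_permutation_targets` and the small default shapes are assumptions made for this example.

```python
import torch


def sample_permutation_targets(seq_len=16, num_predict=4, seed=0):
    """Illustrative only: sample a factorization order, a permutation attention
    mask, and partial-prediction targets for one sequence."""
    g = torch.Generator().manual_seed(seed)
    order = torch.randperm(seq_len, generator=g)   # random factorization order

    # rank[i] = position of token i inside the sampled factorization order
    rank = torch.empty(seq_len, dtype=torch.long)
    rank[order] = torch.arange(seq_len)

    # perm_mask[i, j] = 1.0 means token i may NOT attend to the content of
    # token j, i.e. j does not precede i in the sampled order.
    perm_mask = (rank[None, :] >= rank[:, None]).float()

    # Partial prediction: only the last `num_predict` tokens of the order are
    # predicted, since they see the most context under that order.
    predict_pos = order[-num_predict:]
    target_mapping = torch.zeros(num_predict, seq_len)
    target_mapping[torch.arange(num_predict), predict_pos] = 1.0
    return order, perm_mask, target_mapping


if __name__ == "__main__":
    order, perm_mask, target_mapping = sample_permutation_targets()
    print("factorization order :", order.tolist())
    print("perm_mask shape     :", tuple(perm_mask.shape))        # (seq_len, seq_len)
    print("target_mapping shape:", tuple(target_mapping.shape))   # (num_predict, seq_len)
```

In the actual training command above, `--num_predict 85` plays the role of `num_predict` here, while `--mask_alpha` and `--mask_beta` control how the predicted positions are grouped into contiguous spans rather than being chosen one position at a time as in this sketch.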
26 | #### Options
27 | 
28 | - `--data` (String) : `.txt` file to train on. Multiline text is fine, and one file becomes one batch tensor. Default : `data.txt`
29 | - `--tokenizer` (String) : [huggingface/pytorch-pretrained-BERT's Tokenizer](https://github.com/huggingface/pytorch-pretrained-BERT) is used as the subword tokenizer (it will be switched to SentencePiece soon). You can choose from `bert-base-uncased`, `bert-large-uncased`, `bert-base-cased`, `bert-large-cased`. Default : `bert-base-uncased`
30 | - `--seq_len` (Integer) : Sequence length. Default : `512`
31 | - `--reuse_len` (Integer) : Number of tokens that can be reused as memory. Could be half of `seq_len`. Default : `256`
32 | - `--perm_size` (Integer) : Length of the longest permutation. Could be set equal to `reuse_len`. Default : `256`
33 | 
34 | - `--bi_data` (Boolean) : Whether to create bidirectional data. If `bi_data` is `True`, `bsz` (batch size) should be an even number. Default : `False`
35 | - `--mask_alpha` (Integer) : How many tokens form a group. Default : `6`
36 | - `--mask_beta` (Integer) : How many tokens to mask within each group. Default : `1`
37 | - `--num_predict` (Integer) : Number of tokens to predict. In the paper, this is the partial prediction target count. Default : `85`
38 | - `--mem_len` (Integer) : Number of steps to cache in the Transformer-XL architecture. Default : `384`
39 | - `--num_epoch` (Integer) : Number of epochs. Default : `100`
40 | 
41 | 
42 | 
43 | ## What is XLNet?
44 | 
45 | **XLNet** is a new unsupervised language representation learning method based on a novel generalized permutation language modeling objective. Additionally, XLNet employs [Transformer-XL](https://arxiv.org/abs/1901.02860) as the backbone model, exhibiting excellent performance for language tasks involving long context.
46 | 
47 | - [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237)
48 | - [Paper Author's XLNet Github](https://github.com/zihangdai/xlnet)
49 | 
50 | | Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B |
51 | | ----- | -------- | -------- | -------- | -------- | -------- | -------- | -------- | -------- |
52 | | BERT | 86.6 | 92.3 | 91.3 | 70.4 | 93.2 | 88.0 | 60.6 | 90.0 |
53 | | XLNet | **89.8** | **93.9** | **91.8** | **83.8** | **95.6** | **89.2** | **63.6** | **91.8** |
54 | 
55 | 
56 | 
57 | ### Keywords in XLNet
58 | 
59 | 1. How does XLNet benefit from Auto-Regressive and Auto-Encoding models?
60 | 
61 |    - Auto-Regressive Model
62 |      ![](images/ARmodel.png)
63 |    - Auto-Encoding Model
64 |      ![](images/AEmodel.png)
65 | 
66 | 2. Permutation Language Modeling with Partial Prediction
67 |    - Permutation Language Modeling
68 |      ![](images/PLM.png)
69 | 
70 |    - Partial Prediction
71 |      ![](images/ParPrediction.png)
72 | 
73 | 3. Two-Stream Self-Attention with Target-Aware Representation
74 | 
75 |    - Two-Stream Self-Attention
76 | 
77 |      ![](images/twoattn.png)
78 | 
79 |    - Target-Aware Representation
80 | 
81 |      ![](images/target-aware.png)
82 | 
83 | 
84 | 
85 | ## Author
86 | 
87 | - Because the original repository is released under the **Apache 2.0 license**, this repository is subject to the same license.
88 | - Tae Hwan Jung (Jeff Jung) @graykode, Kyung Hee Univ. CE (Undergraduate).
89 | - Author Email : [nlkey2022@gmail.com](mailto:nlkey2022@gmail.com) -------------------------------------------------------------------------------- /XLNet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "XLNet.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "collapsed_sections": [] 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "GPU" 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "code", 20 | "metadata": { 21 | "id": "ayaYxaMPq5RF", 22 | "colab_type": "code", 23 | "outputId": "7ece05c3-51fc-43de-bfae-22d47cc97a10", 24 | "colab": { 25 | "base_uri": "https://localhost:8080/", 26 | "height": 119 27 | } 28 | }, 29 | "source": [ 30 | "!git clone https://github.com/graykode/xlnet-Pytorch" 31 | ], 32 | "execution_count": 5, 33 | "outputs": [ 34 | { 35 | "output_type": "stream", 36 | "text": [ 37 | "Cloning into 'xlnet-Pytorch'...\n", 38 | "remote: Enumerating objects: 32, done.\u001b[K\n", 39 | "remote: Counting objects: 100% (32/32), done.\u001b[K\n", 40 | "remote: Compressing objects: 100% (20/20), done.\u001b[K\n", 41 | "remote: Total 32 (delta 16), reused 27 (delta 11), pack-reused 0\n", 42 | "Unpacking objects: 100% (32/32), done.\n" 43 | ], 44 | "name": "stdout" 45 | } 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "metadata": { 51 | "id": "iP2YYduXrFWb", 52 | "colab_type": "code", 53 | "outputId": "fe3851e1-e1cb-4913-eb8b-02724538ffc6", 54 | "colab": { 55 | "base_uri": "https://localhost:8080/", 56 | "height": 34 57 | } 58 | }, 59 | "source": [ 60 | "%cd xlnet-Pytorch" 61 | ], 62 | "execution_count": 6, 63 | "outputs": [ 64 | { 65 | "output_type": "stream", 66 | "text": [ 67 | "/content/xlnet-Pytorch/xlnet-Pytorch\n" 68 | ], 69 | "name": "stdout" 70 | } 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "metadata": { 76 | "id": "ijR047EprRIH", 77 | "colab_type": "code", 78 | "outputId": "ed80eb1e-f2f7-4035-d7b4-6dae19dced9a", 79 | "colab": { 80 | "base_uri": "https://localhost:8080/", 81 | "height": 326 82 | } 83 | }, 84 | "source": [ 85 | "!pip install pytorch_pretrained_bert" 86 | ], 87 | "execution_count": 7, 88 | "outputs": [ 89 | { 90 | "output_type": "stream", 91 | "text": [ 92 | "Requirement already satisfied: pytorch_pretrained_bert in /usr/local/lib/python3.6/dist-packages (0.6.2)\n", 93 | "Requirement already satisfied: torch>=0.4.1 in /usr/local/lib/python3.6/dist-packages (from pytorch_pretrained_bert) (1.1.0)\n", 94 | "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from pytorch_pretrained_bert) (2.21.0)\n", 95 | "Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from pytorch_pretrained_bert) (4.28.1)\n", 96 | "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from pytorch_pretrained_bert) (1.16.4)\n", 97 | "Requirement already satisfied: regex in /usr/local/lib/python3.6/dist-packages (from pytorch_pretrained_bert) (2019.6.8)\n", 98 | "Requirement already satisfied: boto3 in /usr/local/lib/python3.6/dist-packages (from pytorch_pretrained_bert) (1.9.167)\n", 99 | "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->pytorch_pretrained_bert) (1.24.3)\n", 100 | "Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->pytorch_pretrained_bert) (2.8)\n", 101 | 
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->pytorch_pretrained_bert) (2019.3.9)\n", 102 | "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->pytorch_pretrained_bert) (3.0.4)\n", 103 | "Requirement already satisfied: s3transfer<0.3.0,>=0.2.0 in /usr/local/lib/python3.6/dist-packages (from boto3->pytorch_pretrained_bert) (0.2.1)\n", 104 | "Requirement already satisfied: botocore<1.13.0,>=1.12.167 in /usr/local/lib/python3.6/dist-packages (from boto3->pytorch_pretrained_bert) (1.12.167)\n", 105 | "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.6/dist-packages (from boto3->pytorch_pretrained_bert) (0.9.4)\n", 106 | "Requirement already satisfied: docutils>=0.10 in /usr/local/lib/python3.6/dist-packages (from botocore<1.13.0,>=1.12.167->boto3->pytorch_pretrained_bert) (0.14)\n", 107 | "Requirement already satisfied: python-dateutil<3.0.0,>=2.1; python_version >= \"2.7\" in /usr/local/lib/python3.6/dist-packages (from botocore<1.13.0,>=1.12.167->boto3->pytorch_pretrained_bert) (2.5.3)\n", 108 | "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.6/dist-packages (from python-dateutil<3.0.0,>=2.1; python_version >= \"2.7\"->botocore<1.13.0,>=1.12.167->boto3->pytorch_pretrained_bert) (1.12.0)\n" 109 | ], 110 | "name": "stdout" 111 | } 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "metadata": { 117 | "id": "BkV6fPyArNbN", 118 | "colab_type": "code", 119 | "outputId": "6c84ac20-3e16-4abc-c9ab-0d5d540d090a", 120 | "colab": { 121 | "base_uri": "https://localhost:8080/", 122 | "height": 1000 123 | } 124 | }, 125 | "source": [ 126 | "!python main.py" 127 | ], 128 | "execution_count": 4, 129 | "outputs": [ 130 | { 131 | "output_type": "stream", 132 | "text": [ 133 | "100% 231508/231508 [00:00<00:00, 418264.00B/s]\n", 134 | "Number of Step: 0001 cost = 26.299185\n", 135 | "Number of Step: 0001 cost = 25.861174\n", 136 | "Number of Step: 0001 cost = 26.162863\n", 137 | "Number of Step: 0001 cost = 26.079767\n", 138 | "Number of Step: 0001 cost = 25.013128\n", 139 | "Number of Step: 0001 cost = 25.666861\n", 140 | "Number of Step: 0001 cost = 25.705088\n", 141 | "Number of Step: 0001 cost = 24.955669\n", 142 | "Number of Step: 0002 cost = 24.116352\n", 143 | "Number of Step: 0002 cost = 25.042078\n", 144 | "Number of Step: 0002 cost = 24.091347\n", 145 | "Number of Step: 0002 cost = 25.537655\n", 146 | "Number of Step: 0002 cost = 24.248133\n", 147 | "Number of Step: 0002 cost = 25.824280\n", 148 | "Number of Step: 0002 cost = 24.787397\n", 149 | "Number of Step: 0002 cost = 26.053616\n", 150 | "Number of Step: 0003 cost = 25.367481\n", 151 | "Number of Step: 0003 cost = 25.543600\n", 152 | "Number of Step: 0003 cost = 25.891920\n", 153 | "Number of Step: 0003 cost = 24.980560\n", 154 | "Number of Step: 0003 cost = 25.297611\n", 155 | "Number of Step: 0003 cost = 25.850960\n", 156 | "Number of Step: 0003 cost = 25.115484\n", 157 | "Number of Step: 0003 cost = 23.884501\n", 158 | "Number of Step: 0004 cost = 24.838873\n", 159 | "Number of Step: 0004 cost = 25.575075\n", 160 | "Number of Step: 0004 cost = 26.392899\n", 161 | "Number of Step: 0004 cost = 24.445896\n", 162 | "Number of Step: 0004 cost = 24.398573\n", 163 | "Number of Step: 0004 cost = 25.106733\n", 164 | "Number of Step: 0004 cost = 26.140654\n", 165 | "Number of Step: 0004 cost = 25.236158\n", 166 | "Number of Step: 0005 cost = 24.996521\n", 167 | "Number of 
Step: 0005 cost = 25.761152\n", 168 | "Number of Step: 0005 cost = 25.200850\n", 169 | "Number of Step: 0005 cost = 23.871277\n", 170 | "Number of Step: 0005 cost = 24.210709\n", 171 | "Number of Step: 0005 cost = 23.889694\n", 172 | "Number of Step: 0005 cost = 24.945341\n", 173 | "Number of Step: 0005 cost = 25.475309\n", 174 | "Number of Step: 0006 cost = 25.655682\n", 175 | "Number of Step: 0006 cost = 24.298883\n", 176 | "Number of Step: 0006 cost = 25.119278\n", 177 | "Number of Step: 0006 cost = 25.098862\n", 178 | "Number of Step: 0006 cost = 25.151922\n", 179 | "Number of Step: 0006 cost = 25.285501\n", 180 | "Number of Step: 0006 cost = 24.107182\n", 181 | "Number of Step: 0006 cost = 23.748384\n", 182 | "Number of Step: 0007 cost = 24.113546\n", 183 | "Number of Step: 0007 cost = 25.006632\n", 184 | "Number of Step: 0007 cost = 24.332354\n", 185 | "Number of Step: 0007 cost = 25.120981\n", 186 | "Number of Step: 0007 cost = 24.404642\n", 187 | "Number of Step: 0007 cost = 24.650999\n", 188 | "Number of Step: 0007 cost = 24.360918\n", 189 | "Number of Step: 0007 cost = 23.753399\n", 190 | "Number of Step: 0008 cost = 24.496143\n", 191 | "Number of Step: 0008 cost = 24.459608\n", 192 | "Number of Step: 0008 cost = 24.346823\n", 193 | "Number of Step: 0008 cost = 25.390636\n", 194 | "Number of Step: 0008 cost = 25.017641\n", 195 | "Number of Step: 0008 cost = 24.501677\n", 196 | "Number of Step: 0008 cost = 23.598795\n", 197 | "Number of Step: 0008 cost = 24.301554\n", 198 | "Number of Step: 0009 cost = 25.410679\n", 199 | "Number of Step: 0009 cost = 24.467159\n", 200 | "Number of Step: 0009 cost = 24.054974\n", 201 | "Number of Step: 0009 cost = 23.793539\n", 202 | "Number of Step: 0009 cost = 23.268635\n", 203 | "Number of Step: 0009 cost = 24.078022\n", 204 | "Number of Step: 0009 cost = 24.316879\n", 205 | "Number of Step: 0009 cost = 23.515970\n", 206 | "Number of Step: 0010 cost = 24.067686\n", 207 | "Number of Step: 0010 cost = 24.360920\n", 208 | "Number of Step: 0010 cost = 25.631681\n", 209 | "Number of Step: 0010 cost = 23.218294\n", 210 | "Number of Step: 0010 cost = 24.856960\n", 211 | "Number of Step: 0010 cost = 23.544510\n", 212 | "Number of Step: 0010 cost = 23.410534\n", 213 | "Number of Step: 0010 cost = 23.849169\n", 214 | "Number of Step: 0011 cost = 24.627134\n", 215 | "Number of Step: 0011 cost = 25.954533\n", 216 | "Number of Step: 0011 cost = 24.426609\n", 217 | "Number of Step: 0011 cost = 25.764784\n", 218 | "Number of Step: 0011 cost = 25.585995\n", 219 | "Number of Step: 0011 cost = 25.143883\n", 220 | "Number of Step: 0011 cost = 22.926973\n", 221 | "Number of Step: 0011 cost = 24.910206\n", 222 | "Number of Step: 0012 cost = 23.564455\n", 223 | "Number of Step: 0012 cost = 24.482298\n", 224 | "Number of Step: 0012 cost = 24.107456\n", 225 | "Number of Step: 0012 cost = 23.281023\n", 226 | "Number of Step: 0012 cost = 24.001156\n", 227 | "Number of Step: 0012 cost = 23.877611\n", 228 | "Number of Step: 0012 cost = 23.695135\n", 229 | "Number of Step: 0012 cost = 23.912651\n", 230 | "Number of Step: 0013 cost = 24.788204\n", 231 | "Number of Step: 0013 cost = 24.605497\n", 232 | "Number of Step: 0013 cost = 23.714304\n", 233 | "Number of Step: 0013 cost = 24.666931\n", 234 | "Number of Step: 0013 cost = 24.020756\n", 235 | "Number of Step: 0013 cost = 23.386766\n", 236 | "Number of Step: 0013 cost = 24.413082\n", 237 | "Number of Step: 0013 cost = 24.092968\n", 238 | "Number of Step: 0014 cost = 22.983570\n", 239 | "Number of Step: 0014 cost = 
25.068871\n", 240 | "Number of Step: 0014 cost = 23.518705\n", 241 | "Number of Step: 0014 cost = 23.637272\n", 242 | "Number of Step: 0014 cost = 22.940498\n", 243 | "Number of Step: 0014 cost = 25.140924\n", 244 | "Number of Step: 0014 cost = 23.010714\n", 245 | "Number of Step: 0014 cost = 23.191177\n", 246 | "Number of Step: 0015 cost = 22.630165\n", 247 | "Number of Step: 0015 cost = 23.283859\n", 248 | "Number of Step: 0015 cost = 23.364052\n", 249 | "Number of Step: 0015 cost = 24.416988\n", 250 | "Number of Step: 0015 cost = 24.014668\n", 251 | "Number of Step: 0015 cost = 22.869274\n", 252 | "Number of Step: 0015 cost = 23.760340\n", 253 | "Number of Step: 0015 cost = 23.840309\n", 254 | "Number of Step: 0016 cost = 23.310936\n", 255 | "Number of Step: 0016 cost = 24.574957\n", 256 | "Number of Step: 0016 cost = 23.376127\n", 257 | "Number of Step: 0016 cost = 24.164631\n", 258 | "Number of Step: 0016 cost = 23.071663\n", 259 | "Number of Step: 0016 cost = 23.024294\n", 260 | "Number of Step: 0016 cost = 23.181185\n", 261 | "Number of Step: 0016 cost = 24.051889\n", 262 | "Number of Step: 0017 cost = 23.288946\n", 263 | "Number of Step: 0017 cost = 24.085411\n", 264 | "Number of Step: 0017 cost = 22.379274\n", 265 | "Number of Step: 0017 cost = 24.024132\n", 266 | "Number of Step: 0017 cost = 24.557764\n", 267 | "Number of Step: 0017 cost = 24.751358\n", 268 | "Number of Step: 0017 cost = 23.339399\n", 269 | "Number of Step: 0017 cost = 23.635082\n", 270 | "Number of Step: 0018 cost = 22.173685\n", 271 | "Number of Step: 0018 cost = 23.776503\n", 272 | "Number of Step: 0018 cost = 23.752548\n", 273 | "Number of Step: 0018 cost = 23.538460\n", 274 | "Number of Step: 0018 cost = 22.357494\n", 275 | "Number of Step: 0018 cost = 23.622934\n", 276 | "Number of Step: 0018 cost = 23.631004\n", 277 | "Number of Step: 0018 cost = 22.551394\n", 278 | "Number of Step: 0019 cost = 22.947685\n", 279 | "Number of Step: 0019 cost = 22.403502\n", 280 | "Number of Step: 0019 cost = 22.972301\n", 281 | "Number of Step: 0019 cost = 21.893215\n", 282 | "Number of Step: 0019 cost = 23.263187\n", 283 | "Number of Step: 0019 cost = 22.995371\n", 284 | "Number of Step: 0019 cost = 22.956581\n", 285 | "Number of Step: 0019 cost = 23.096869\n", 286 | "Number of Step: 0020 cost = 23.094641\n", 287 | "Number of Step: 0020 cost = 22.921461\n", 288 | "Number of Step: 0020 cost = 23.298218\n", 289 | "Number of Step: 0020 cost = 22.579227\n", 290 | "Number of Step: 0020 cost = 24.224125\n", 291 | "Number of Step: 0020 cost = 23.881729\n", 292 | "Number of Step: 0020 cost = 21.861792\n", 293 | "Number of Step: 0020 cost = 22.777273\n", 294 | "Number of Step: 0021 cost = 22.222363\n", 295 | "Number of Step: 0021 cost = 21.119030\n", 296 | "Number of Step: 0021 cost = 23.907051\n", 297 | "Number of Step: 0021 cost = 23.819723\n", 298 | "Number of Step: 0021 cost = 23.788166\n", 299 | "Number of Step: 0021 cost = 23.229645\n", 300 | "Number of Step: 0021 cost = 24.274410\n", 301 | "Number of Step: 0021 cost = 22.877367\n", 302 | "Number of Step: 0022 cost = 22.700832\n", 303 | "Number of Step: 0022 cost = 23.184784\n", 304 | "Number of Step: 0022 cost = 22.714603\n", 305 | "Number of Step: 0022 cost = 23.461004\n", 306 | "Number of Step: 0022 cost = 21.954988\n", 307 | "Number of Step: 0022 cost = 21.957075\n", 308 | "Number of Step: 0022 cost = 22.306690\n", 309 | "Number of Step: 0022 cost = 23.725677\n", 310 | "Number of Step: 0023 cost = 22.276127\n", 311 | "Number of Step: 0023 cost = 23.766611\n", 312 | 
"Number of Step: 0023 cost = 22.990093\n", 313 | "Number of Step: 0023 cost = 23.687693\n", 314 | "Number of Step: 0023 cost = 23.503782\n", 315 | "Number of Step: 0023 cost = 22.529320\n", 316 | "Number of Step: 0023 cost = 23.205400\n", 317 | "Number of Step: 0023 cost = 22.896313\n", 318 | "Number of Step: 0024 cost = 22.109776\n", 319 | "Number of Step: 0024 cost = 22.556622\n", 320 | "Number of Step: 0024 cost = 22.328550\n", 321 | "Number of Step: 0024 cost = 22.092735\n", 322 | "Number of Step: 0024 cost = 23.011541\n", 323 | "Number of Step: 0024 cost = 23.419533\n", 324 | "Number of Step: 0024 cost = 22.668753\n", 325 | "Number of Step: 0024 cost = 22.147079\n", 326 | "Number of Step: 0025 cost = 21.928825\n", 327 | "Number of Step: 0025 cost = 23.278080\n", 328 | "Number of Step: 0025 cost = 23.145954\n", 329 | "Number of Step: 0025 cost = 22.317205\n", 330 | "Number of Step: 0025 cost = 22.680893\n", 331 | "Number of Step: 0025 cost = 22.483362\n", 332 | "Number of Step: 0025 cost = 22.365522\n", 333 | "Number of Step: 0025 cost = 22.656649\n", 334 | "Number of Step: 0026 cost = 21.763489\n", 335 | "Number of Step: 0026 cost = 21.822924\n", 336 | "Number of Step: 0026 cost = 22.338774\n", 337 | "Number of Step: 0026 cost = 22.268866\n", 338 | "Number of Step: 0026 cost = 22.320282\n", 339 | "Number of Step: 0026 cost = 22.410757\n", 340 | "Number of Step: 0026 cost = 22.637495\n", 341 | "Number of Step: 0026 cost = 22.134693\n", 342 | "Number of Step: 0027 cost = 23.152620\n", 343 | "Number of Step: 0027 cost = 21.620134\n", 344 | "Number of Step: 0027 cost = 22.444149\n", 345 | "Number of Step: 0027 cost = 22.397623\n", 346 | "Number of Step: 0027 cost = 22.449764\n", 347 | "Number of Step: 0027 cost = 22.904938\n", 348 | "Number of Step: 0027 cost = 22.601612\n", 349 | "Number of Step: 0027 cost = 22.304131\n", 350 | "Number of Step: 0028 cost = 20.434067\n", 351 | "Number of Step: 0028 cost = 22.794069\n", 352 | "Number of Step: 0028 cost = 23.426819\n", 353 | "Number of Step: 0028 cost = 22.357069\n", 354 | "Number of Step: 0028 cost = 22.608589\n", 355 | "Number of Step: 0028 cost = 22.465258\n", 356 | "Number of Step: 0028 cost = 22.891714\n", 357 | "Number of Step: 0028 cost = 21.988979\n", 358 | "Number of Step: 0029 cost = 22.498970\n", 359 | "Number of Step: 0029 cost = 21.521902\n", 360 | "Number of Step: 0029 cost = 23.053669\n", 361 | "Number of Step: 0029 cost = 22.357464\n", 362 | "Number of Step: 0029 cost = 21.904669\n", 363 | "Number of Step: 0029 cost = 21.634151\n", 364 | "Number of Step: 0029 cost = 20.249266\n", 365 | "Number of Step: 0029 cost = 22.062521\n", 366 | "Number of Step: 0030 cost = 22.270859\n", 367 | "Number of Step: 0030 cost = 22.696949\n", 368 | "Number of Step: 0030 cost = 22.070248\n", 369 | "Number of Step: 0030 cost = 22.543518\n", 370 | "Number of Step: 0030 cost = 23.614525\n", 371 | "Number of Step: 0030 cost = 22.858721\n", 372 | "Number of Step: 0030 cost = 21.260269\n", 373 | "Number of Step: 0030 cost = 21.462461\n", 374 | "Number of Step: 0031 cost = 22.491512\n", 375 | "Number of Step: 0031 cost = 21.529919\n", 376 | "Number of Step: 0031 cost = 22.923733\n", 377 | "Number of Step: 0031 cost = 21.869879\n", 378 | "Number of Step: 0031 cost = 22.435644\n", 379 | "Number of Step: 0031 cost = 22.430405\n", 380 | "Number of Step: 0031 cost = 22.144842\n", 381 | "Number of Step: 0031 cost = 21.964941\n", 382 | "Number of Step: 0032 cost = 22.055660\n", 383 | "Number of Step: 0032 cost = 22.045565\n", 384 | "Number of Step: 0032 cost 
= 21.927069\n", 385 | "Number of Step: 0032 cost = 21.248281\n", 386 | "Number of Step: 0032 cost = 21.617807\n", 387 | "Number of Step: 0032 cost = 20.935833\n", 388 | "Number of Step: 0032 cost = 22.303843\n", 389 | "Number of Step: 0032 cost = 22.419876\n", 390 | "Number of Step: 0033 cost = 21.030119\n", 391 | "Number of Step: 0033 cost = 22.250040\n", 392 | "Number of Step: 0033 cost = 20.650230\n", 393 | "Number of Step: 0033 cost = 22.192366\n", 394 | "Number of Step: 0033 cost = 21.154890\n", 395 | "Number of Step: 0033 cost = 22.080959\n", 396 | "Number of Step: 0033 cost = 21.871065\n", 397 | "Number of Step: 0033 cost = 22.260691\n", 398 | "Number of Step: 0034 cost = 21.728571\n", 399 | "Number of Step: 0034 cost = 22.007542\n", 400 | "Number of Step: 0034 cost = 21.078880\n", 401 | "Number of Step: 0034 cost = 21.850500\n", 402 | "Number of Step: 0034 cost = 21.490084\n", 403 | "Number of Step: 0034 cost = 21.671848\n", 404 | "Number of Step: 0034 cost = 22.391680\n", 405 | "Number of Step: 0034 cost = 21.705509\n", 406 | "Number of Step: 0035 cost = 21.543411\n", 407 | "Number of Step: 0035 cost = 21.777857\n", 408 | "Number of Step: 0035 cost = 21.447199\n", 409 | "Number of Step: 0035 cost = 22.119251\n", 410 | "Number of Step: 0035 cost = 22.220009\n", 411 | "Number of Step: 0035 cost = 19.819977\n", 412 | "Number of Step: 0035 cost = 21.531845\n", 413 | "Number of Step: 0035 cost = 21.998089\n", 414 | "Number of Step: 0036 cost = 21.735443\n", 415 | "Number of Step: 0036 cost = 21.648676\n", 416 | "Number of Step: 0036 cost = 21.940975\n", 417 | "Number of Step: 0036 cost = 20.961882\n", 418 | "Number of Step: 0036 cost = 20.211729\n", 419 | "Number of Step: 0036 cost = 22.080381\n", 420 | "Number of Step: 0036 cost = 21.906378\n", 421 | "Number of Step: 0036 cost = 20.907700\n", 422 | "Number of Step: 0037 cost = 22.363924\n", 423 | "Number of Step: 0037 cost = 21.551998\n", 424 | "Number of Step: 0037 cost = 21.930584\n", 425 | "Number of Step: 0037 cost = 21.773323\n", 426 | "Number of Step: 0037 cost = 21.142616\n", 427 | "Number of Step: 0037 cost = 21.637774\n", 428 | "Number of Step: 0037 cost = 22.236561\n", 429 | "Number of Step: 0037 cost = 22.778532\n", 430 | "Number of Step: 0038 cost = 21.648951\n", 431 | "Number of Step: 0038 cost = 21.444340\n", 432 | "Number of Step: 0038 cost = 21.856207\n", 433 | "Number of Step: 0038 cost = 22.085463\n", 434 | "Number of Step: 0038 cost = 21.862345\n", 435 | "Number of Step: 0038 cost = 21.632576\n", 436 | "Number of Step: 0038 cost = 20.948652\n", 437 | "Number of Step: 0038 cost = 21.160299\n", 438 | "Number of Step: 0039 cost = 22.679569\n", 439 | "Number of Step: 0039 cost = 22.160774\n", 440 | "Number of Step: 0039 cost = 20.349392\n", 441 | "Number of Step: 0039 cost = 21.426294\n", 442 | "Number of Step: 0039 cost = 21.323292\n", 443 | "Number of Step: 0039 cost = 20.700750\n", 444 | "Number of Step: 0039 cost = 19.951761\n", 445 | "Number of Step: 0039 cost = 21.798811\n", 446 | "Number of Step: 0040 cost = 20.747545\n", 447 | "Number of Step: 0040 cost = 21.327257\n", 448 | "Number of Step: 0040 cost = 20.517118\n", 449 | "Number of Step: 0040 cost = 20.885836\n", 450 | "Number of Step: 0040 cost = 22.016651\n", 451 | "Number of Step: 0040 cost = 21.293856\n", 452 | "Number of Step: 0040 cost = 20.931459\n", 453 | "Number of Step: 0040 cost = 21.169933\n", 454 | "Number of Step: 0041 cost = 20.563751\n", 455 | "Number of Step: 0041 cost = 21.146351\n", 456 | "Number of Step: 0041 cost = 20.402386\n", 457 | 
"Number of Step: 0041 cost = 20.247446\n", 458 | "Number of Step: 0041 cost = 20.776196\n", 459 | "Number of Step: 0041 cost = 19.906986\n", 460 | "Number of Step: 0041 cost = 21.070517\n", 461 | "Number of Step: 0041 cost = 20.296288\n", 462 | "Number of Step: 0042 cost = 22.144644\n", 463 | "Number of Step: 0042 cost = 21.261848\n", 464 | "Number of Step: 0042 cost = 21.801069\n", 465 | "Number of Step: 0042 cost = 21.594389\n", 466 | "Number of Step: 0042 cost = 21.601309\n", 467 | "Number of Step: 0042 cost = 21.305853\n", 468 | "Number of Step: 0042 cost = 22.010830\n", 469 | "Number of Step: 0042 cost = 20.897848\n", 470 | "Number of Step: 0043 cost = 20.145580\n", 471 | "Number of Step: 0043 cost = 20.813745\n", 472 | "Number of Step: 0043 cost = 20.590549\n", 473 | "Number of Step: 0043 cost = 21.520615\n", 474 | "Number of Step: 0043 cost = 21.588079\n", 475 | "Number of Step: 0043 cost = 21.597328\n", 476 | "Number of Step: 0043 cost = 21.607746\n", 477 | "Number of Step: 0043 cost = 21.700638\n", 478 | "Number of Step: 0044 cost = 21.086859\n", 479 | "Number of Step: 0044 cost = 20.642982\n", 480 | "Number of Step: 0044 cost = 20.240429\n", 481 | "Number of Step: 0044 cost = 21.358030\n", 482 | "Number of Step: 0044 cost = 20.559532\n", 483 | "Number of Step: 0044 cost = 21.468231\n", 484 | "Number of Step: 0044 cost = 18.892157\n", 485 | "Number of Step: 0044 cost = 20.416586\n", 486 | "Number of Step: 0045 cost = 22.413452\n", 487 | "Number of Step: 0045 cost = 20.464434\n", 488 | "Number of Step: 0045 cost = 20.259840\n", 489 | "Number of Step: 0045 cost = 19.961233\n", 490 | "Number of Step: 0045 cost = 21.648184\n", 491 | "Number of Step: 0045 cost = 20.172035\n", 492 | "Number of Step: 0045 cost = 20.020864\n", 493 | "Number of Step: 0045 cost = 21.115805\n", 494 | "Number of Step: 0046 cost = 20.295105\n", 495 | "Number of Step: 0046 cost = 21.123190\n", 496 | "Number of Step: 0046 cost = 20.192799\n", 497 | "Number of Step: 0046 cost = 21.497696\n", 498 | "Number of Step: 0046 cost = 20.283506\n", 499 | "Number of Step: 0046 cost = 20.353373\n", 500 | "Number of Step: 0046 cost = 21.410826\n", 501 | "Number of Step: 0046 cost = 20.060562\n", 502 | "Number of Step: 0047 cost = 20.613169\n", 503 | "Number of Step: 0047 cost = 21.671728\n", 504 | "Number of Step: 0047 cost = 19.864946\n", 505 | "Number of Step: 0047 cost = 20.989864\n", 506 | "Number of Step: 0047 cost = 20.588034\n", 507 | "Number of Step: 0047 cost = 20.573233\n", 508 | "Number of Step: 0047 cost = 20.943060\n", 509 | "Number of Step: 0047 cost = 20.618057\n", 510 | "Number of Step: 0048 cost = 20.469118\n", 511 | "Number of Step: 0048 cost = 19.834934\n", 512 | "Number of Step: 0048 cost = 20.439869\n", 513 | "Number of Step: 0048 cost = 19.947546\n", 514 | "Number of Step: 0048 cost = 19.776983\n", 515 | "Number of Step: 0048 cost = 20.157717\n", 516 | "Number of Step: 0048 cost = 20.530584\n", 517 | "Number of Step: 0048 cost = 20.621025\n", 518 | "Number of Step: 0049 cost = 20.297810\n", 519 | "Number of Step: 0049 cost = 20.466293\n", 520 | "Number of Step: 0049 cost = 20.277691\n", 521 | "Number of Step: 0049 cost = 19.828459\n", 522 | "Number of Step: 0049 cost = 20.133368\n", 523 | "Number of Step: 0049 cost = 20.968479\n", 524 | "Number of Step: 0049 cost = 19.882719\n", 525 | "Number of Step: 0049 cost = 18.925854\n", 526 | "Number of Step: 0050 cost = 19.301132\n", 527 | "Number of Step: 0050 cost = 20.110096\n", 528 | "Number of Step: 0050 cost = 19.726845\n", 529 | "Number of Step: 0050 cost 
= 19.612841\n", 530 | "Number of Step: 0050 cost = 21.341433\n", 531 | "Number of Step: 0050 cost = 19.957525\n", 532 | "Number of Step: 0050 cost = 20.750641\n", 533 | "Number of Step: 0050 cost = 19.585604\n", 534 | "Number of Step: 0051 cost = 20.252506\n", 535 | "Number of Step: 0051 cost = 20.737688\n", 536 | "Number of Step: 0051 cost = 19.447847\n", 537 | "Number of Step: 0051 cost = 21.109488\n", 538 | "Number of Step: 0051 cost = 20.200754\n", 539 | "Number of Step: 0051 cost = 20.505079\n", 540 | "Number of Step: 0051 cost = 20.696692\n", 541 | "Number of Step: 0051 cost = 21.314342\n", 542 | "Number of Step: 0052 cost = 18.995667\n", 543 | "Number of Step: 0052 cost = 19.546761\n", 544 | "Number of Step: 0052 cost = 20.188692\n", 545 | "Number of Step: 0052 cost = 20.453053\n", 546 | "Number of Step: 0052 cost = 18.985550\n", 547 | "Number of Step: 0052 cost = 20.688198\n", 548 | "Number of Step: 0052 cost = 19.881287\n", 549 | "Number of Step: 0052 cost = 19.692705\n", 550 | "Number of Step: 0053 cost = 21.027081\n", 551 | "Number of Step: 0053 cost = 19.673756\n", 552 | "Number of Step: 0053 cost = 20.669489\n", 553 | "Number of Step: 0053 cost = 19.392540\n", 554 | "Number of Step: 0053 cost = 19.796257\n", 555 | "Number of Step: 0053 cost = 20.843779\n", 556 | "Number of Step: 0053 cost = 20.343000\n", 557 | "Number of Step: 0053 cost = 19.988203\n", 558 | "Number of Step: 0054 cost = 19.376358\n", 559 | "Number of Step: 0054 cost = 18.737743\n", 560 | "Number of Step: 0054 cost = 19.616598\n", 561 | "Number of Step: 0054 cost = 18.932737\n", 562 | "Number of Step: 0054 cost = 20.735094\n", 563 | "Number of Step: 0054 cost = 20.219381\n", 564 | "Number of Step: 0054 cost = 20.130972\n", 565 | "Number of Step: 0054 cost = 19.450148\n", 566 | "Number of Step: 0055 cost = 18.508867\n", 567 | "Number of Step: 0055 cost = 20.512484\n", 568 | "Number of Step: 0055 cost = 20.341122\n", 569 | "Number of Step: 0055 cost = 20.939161\n", 570 | "Number of Step: 0055 cost = 20.921871\n", 571 | "Number of Step: 0055 cost = 18.596462\n", 572 | "Number of Step: 0055 cost = 19.717844\n", 573 | "Number of Step: 0055 cost = 19.101641\n", 574 | "Number of Step: 0056 cost = 19.201128\n", 575 | "Number of Step: 0056 cost = 19.193102\n", 576 | "Number of Step: 0056 cost = 20.116600\n", 577 | "Number of Step: 0056 cost = 19.417250\n", 578 | "Number of Step: 0056 cost = 20.242432\n", 579 | "Number of Step: 0056 cost = 19.264370\n", 580 | "Number of Step: 0056 cost = 19.881472\n", 581 | "Number of Step: 0056 cost = 18.565615\n", 582 | "Number of Step: 0057 cost = 18.786987\n", 583 | "Number of Step: 0057 cost = 19.718266\n", 584 | "Number of Step: 0057 cost = 20.988651\n", 585 | "Number of Step: 0057 cost = 18.925314\n", 586 | "Number of Step: 0057 cost = 19.604288\n", 587 | "Number of Step: 0057 cost = 20.408951\n", 588 | "Number of Step: 0057 cost = 19.542969\n", 589 | "Number of Step: 0057 cost = 19.413477\n", 590 | "Number of Step: 0058 cost = 20.094835\n", 591 | "Number of Step: 0058 cost = 19.935198\n", 592 | "Number of Step: 0058 cost = 20.000544\n", 593 | "Number of Step: 0058 cost = 19.038767\n", 594 | "Number of Step: 0058 cost = 19.766483\n", 595 | "Number of Step: 0058 cost = 19.640085\n", 596 | "Number of Step: 0058 cost = 19.713690\n", 597 | "Number of Step: 0058 cost = 19.069868\n", 598 | "Number of Step: 0059 cost = 20.695675\n", 599 | "Number of Step: 0059 cost = 18.600542\n", 600 | "Number of Step: 0059 cost = 20.206831\n", 601 | "Number of Step: 0059 cost = 19.218363\n", 602 | 
"Number of Step: 0059 cost = 20.146311\n", 603 | "Number of Step: 0059 cost = 20.238882\n", 604 | "Number of Step: 0059 cost = 19.937920\n", 605 | "Number of Step: 0059 cost = 18.828552\n", 606 | "Number of Step: 0060 cost = 18.958557\n", 607 | "Number of Step: 0060 cost = 20.044477\n", 608 | "Number of Step: 0060 cost = 19.873934\n", 609 | "Number of Step: 0060 cost = 19.420803\n", 610 | "Number of Step: 0060 cost = 19.914637\n", 611 | "Number of Step: 0060 cost = 18.239677\n", 612 | "Number of Step: 0060 cost = 18.858553\n", 613 | "Number of Step: 0060 cost = 19.074484\n", 614 | "Number of Step: 0061 cost = 19.019659\n", 615 | "Number of Step: 0061 cost = 19.953152\n", 616 | "Number of Step: 0061 cost = 19.777113\n", 617 | "Number of Step: 0061 cost = 20.127518\n", 618 | "Number of Step: 0061 cost = 19.441587\n", 619 | "Number of Step: 0061 cost = 20.103010\n", 620 | "Number of Step: 0061 cost = 19.744200\n", 621 | "Number of Step: 0061 cost = 20.644508\n", 622 | "Number of Step: 0062 cost = 19.728378\n", 623 | "Number of Step: 0062 cost = 19.435120\n", 624 | "Number of Step: 0062 cost = 19.840559\n", 625 | "Number of Step: 0062 cost = 18.457581\n", 626 | "Number of Step: 0062 cost = 19.384420\n", 627 | "Number of Step: 0062 cost = 19.274870\n", 628 | "Number of Step: 0062 cost = 19.981462\n", 629 | "Number of Step: 0062 cost = 18.948893\n", 630 | "Number of Step: 0063 cost = 20.056086\n", 631 | "Number of Step: 0063 cost = 18.939281\n", 632 | "Number of Step: 0063 cost = 19.173927\n", 633 | "Number of Step: 0063 cost = 18.720533\n", 634 | "Number of Step: 0063 cost = 17.662331\n", 635 | "Number of Step: 0063 cost = 18.570833\n", 636 | "Number of Step: 0063 cost = 19.597286\n", 637 | "Number of Step: 0063 cost = 20.456881\n", 638 | "Number of Step: 0064 cost = 20.077738\n", 639 | "Number of Step: 0064 cost = 19.628847\n", 640 | "Number of Step: 0064 cost = 19.091299\n", 641 | "Number of Step: 0064 cost = 18.709854\n", 642 | "Number of Step: 0064 cost = 18.878860\n", 643 | "Number of Step: 0064 cost = 19.234978\n", 644 | "Number of Step: 0064 cost = 19.819929\n", 645 | "Number of Step: 0064 cost = 19.293816\n", 646 | "Number of Step: 0065 cost = 18.452431\n", 647 | "Number of Step: 0065 cost = 18.643803\n", 648 | "Number of Step: 0065 cost = 18.060612\n", 649 | "Number of Step: 0065 cost = 19.449501\n", 650 | "Number of Step: 0065 cost = 18.175749\n", 651 | "Number of Step: 0065 cost = 19.265959\n", 652 | "Number of Step: 0065 cost = 19.055420\n", 653 | "Number of Step: 0065 cost = 20.121172\n", 654 | "Number of Step: 0066 cost = 18.595167\n", 655 | "Number of Step: 0066 cost = 18.821838\n", 656 | "Number of Step: 0066 cost = 17.799852\n", 657 | "Number of Step: 0066 cost = 17.803329\n", 658 | "Number of Step: 0066 cost = 18.767082\n", 659 | "Number of Step: 0066 cost = 19.105862\n", 660 | "Number of Step: 0066 cost = 20.024843\n", 661 | "Number of Step: 0066 cost = 19.094280\n", 662 | "Number of Step: 0067 cost = 19.719334\n", 663 | "Number of Step: 0067 cost = 19.748051\n", 664 | "Number of Step: 0067 cost = 18.628956\n", 665 | "Number of Step: 0067 cost = 19.446445\n", 666 | "Number of Step: 0067 cost = 18.864361\n", 667 | "Number of Step: 0067 cost = 19.247576\n", 668 | "Number of Step: 0067 cost = 19.597681\n", 669 | "Number of Step: 0067 cost = 18.425842\n", 670 | "Number of Step: 0068 cost = 18.854548\n", 671 | "Number of Step: 0068 cost = 19.533266\n", 672 | "Number of Step: 0068 cost = 18.578531\n", 673 | "Number of Step: 0068 cost = 19.277075\n", 674 | "Number of Step: 0068 cost 
= 19.370052\n", 675 | "Number of Step: 0068 cost = 18.791796\n", 676 | "Number of Step: 0068 cost = 19.355459\n", 677 | "Number of Step: 0068 cost = 19.153137\n", 678 | "Number of Step: 0069 cost = 19.308121\n", 679 | "Number of Step: 0069 cost = 19.086754\n", 680 | "Number of Step: 0069 cost = 18.994562\n", 681 | "Number of Step: 0069 cost = 18.890036\n", 682 | "Number of Step: 0069 cost = 19.419216\n", 683 | "Number of Step: 0069 cost = 18.398075\n", 684 | "Number of Step: 0069 cost = 18.872486\n", 685 | "Number of Step: 0069 cost = 19.547575\n", 686 | "Number of Step: 0070 cost = 20.047623\n", 687 | "Number of Step: 0070 cost = 18.613075\n", 688 | "Number of Step: 0070 cost = 17.807789\n", 689 | "Number of Step: 0070 cost = 18.225952\n", 690 | "Number of Step: 0070 cost = 17.380775\n", 691 | "Number of Step: 0070 cost = 19.216991\n", 692 | "Number of Step: 0070 cost = 18.267038\n", 693 | "Number of Step: 0070 cost = 18.534729\n", 694 | "Number of Step: 0071 cost = 18.784967\n", 695 | "Number of Step: 0071 cost = 19.547152\n", 696 | "Number of Step: 0071 cost = 18.409006\n", 697 | "Number of Step: 0071 cost = 19.480865\n", 698 | "Number of Step: 0071 cost = 18.878609\n", 699 | "Number of Step: 0071 cost = 18.263588\n", 700 | "Number of Step: 0071 cost = 18.590086\n", 701 | "Number of Step: 0071 cost = 18.835884\n", 702 | "Number of Step: 0072 cost = 19.132475\n", 703 | "Number of Step: 0072 cost = 18.259565\n", 704 | "Number of Step: 0072 cost = 21.376810\n", 705 | "Number of Step: 0072 cost = 17.928442\n", 706 | "Number of Step: 0072 cost = 18.454258\n", 707 | "Number of Step: 0072 cost = 18.397722\n", 708 | "Number of Step: 0072 cost = 19.086702\n", 709 | "Number of Step: 0072 cost = 19.379057\n", 710 | "Number of Step: 0073 cost = 18.311708\n", 711 | "Number of Step: 0073 cost = 18.536705\n", 712 | "Number of Step: 0073 cost = 18.127934\n", 713 | "Number of Step: 0073 cost = 18.702974\n", 714 | "Number of Step: 0073 cost = 19.017019\n", 715 | "Number of Step: 0073 cost = 18.554033\n", 716 | "Number of Step: 0073 cost = 19.887222\n", 717 | "Number of Step: 0073 cost = 19.013105\n", 718 | "Number of Step: 0074 cost = 18.646910\n", 719 | "Number of Step: 0074 cost = 18.836882\n", 720 | "Number of Step: 0074 cost = 18.586220\n", 721 | "Number of Step: 0074 cost = 19.116037\n", 722 | "Number of Step: 0074 cost = 18.636166\n", 723 | "Number of Step: 0074 cost = 18.499201\n", 724 | "Number of Step: 0074 cost = 19.262115\n", 725 | "Number of Step: 0074 cost = 18.619135\n", 726 | "Number of Step: 0075 cost = 18.495358\n", 727 | "Number of Step: 0075 cost = 18.133579\n", 728 | "Number of Step: 0075 cost = 18.864721\n", 729 | "Number of Step: 0075 cost = 17.838591\n", 730 | "Number of Step: 0075 cost = 17.295948\n", 731 | "Number of Step: 0075 cost = 18.530239\n", 732 | "Number of Step: 0075 cost = 18.883099\n", 733 | "Number of Step: 0075 cost = 18.843021\n", 734 | "Number of Step: 0076 cost = 18.259241\n", 735 | "Number of Step: 0076 cost = 17.779800\n", 736 | "Number of Step: 0076 cost = 18.148663\n", 737 | "Number of Step: 0076 cost = 18.095509\n", 738 | "Number of Step: 0076 cost = 18.393051\n", 739 | "Number of Step: 0076 cost = 17.617201\n", 740 | "Number of Step: 0076 cost = 20.075979\n", 741 | "Number of Step: 0076 cost = 18.985180\n", 742 | "Number of Step: 0077 cost = 18.118486\n", 743 | "Number of Step: 0077 cost = 18.579576\n", 744 | "Number of Step: 0077 cost = 18.543633\n", 745 | "Number of Step: 0077 cost = 19.796936\n", 746 | "Number of Step: 0077 cost = 18.677710\n", 747 | 
"Number of Step: 0077 cost = 16.775335\n", 748 | "Number of Step: 0077 cost = 18.743027\n", 749 | "Number of Step: 0077 cost = 18.926441\n", 750 | "Number of Step: 0078 cost = 19.418146\n", 751 | "Number of Step: 0078 cost = 17.935238\n", 752 | "Number of Step: 0078 cost = 18.198158\n", 753 | "Number of Step: 0078 cost = 17.662447\n", 754 | "Number of Step: 0078 cost = 18.733952\n", 755 | "Number of Step: 0078 cost = 18.023073\n", 756 | "Number of Step: 0078 cost = 18.234447\n", 757 | "Number of Step: 0078 cost = 17.417873\n", 758 | "Number of Step: 0079 cost = 17.578777\n", 759 | "Number of Step: 0079 cost = 18.905436\n", 760 | "Number of Step: 0079 cost = 18.485249\n", 761 | "Number of Step: 0079 cost = 18.892134\n", 762 | "Number of Step: 0079 cost = 18.983599\n", 763 | "Number of Step: 0079 cost = 18.547895\n", 764 | "Number of Step: 0079 cost = 18.229612\n", 765 | "Number of Step: 0079 cost = 18.261204\n", 766 | "Number of Step: 0080 cost = 19.453548\n", 767 | "Number of Step: 0080 cost = 18.618380\n", 768 | "Number of Step: 0080 cost = 18.912674\n", 769 | "Number of Step: 0080 cost = 18.301924\n", 770 | "Number of Step: 0080 cost = 17.535427\n", 771 | "Number of Step: 0080 cost = 18.374660\n", 772 | "Number of Step: 0080 cost = 19.154884\n", 773 | "Number of Step: 0080 cost = 18.170778\n", 774 | "Number of Step: 0081 cost = 18.523289\n", 775 | "Number of Step: 0081 cost = 18.059502\n", 776 | "Number of Step: 0081 cost = 18.606636\n", 777 | "Number of Step: 0081 cost = 17.094723\n", 778 | "Number of Step: 0081 cost = 17.885996\n", 779 | "Number of Step: 0081 cost = 17.596682\n", 780 | "Number of Step: 0081 cost = 20.653311\n", 781 | "Number of Step: 0081 cost = 17.972937\n", 782 | "Number of Step: 0082 cost = 19.049471\n", 783 | "Number of Step: 0082 cost = 17.324112\n", 784 | "Number of Step: 0082 cost = 17.414450\n", 785 | "Number of Step: 0082 cost = 18.378273\n", 786 | "Number of Step: 0082 cost = 18.309870\n", 787 | "Number of Step: 0082 cost = 17.814247\n", 788 | "Number of Step: 0082 cost = 19.608297\n", 789 | "Number of Step: 0082 cost = 17.104284\n", 790 | "Number of Step: 0083 cost = 17.783604\n", 791 | "Number of Step: 0083 cost = 17.561996\n", 792 | "Number of Step: 0083 cost = 17.339228\n", 793 | "Number of Step: 0083 cost = 17.625277\n", 794 | "Number of Step: 0083 cost = 17.664207\n", 795 | "Number of Step: 0083 cost = 17.919678\n", 796 | "Number of Step: 0083 cost = 17.632580\n", 797 | "Number of Step: 0083 cost = 17.944408\n", 798 | "Number of Step: 0084 cost = 17.866018\n", 799 | "Number of Step: 0084 cost = 19.102129\n", 800 | "Number of Step: 0084 cost = 18.013046\n", 801 | "Number of Step: 0084 cost = 17.948154\n", 802 | "Number of Step: 0084 cost = 17.853260\n", 803 | "Number of Step: 0084 cost = 16.999918\n", 804 | "Number of Step: 0084 cost = 18.198866\n", 805 | "Number of Step: 0084 cost = 17.912989\n", 806 | "Number of Step: 0085 cost = 18.169626\n", 807 | "Number of Step: 0085 cost = 18.324541\n", 808 | "Number of Step: 0085 cost = 18.369005\n", 809 | "Number of Step: 0085 cost = 18.447588\n", 810 | "Number of Step: 0085 cost = 18.641312\n", 811 | "Number of Step: 0085 cost = 17.931910\n", 812 | "Number of Step: 0085 cost = 18.035240\n", 813 | "Number of Step: 0085 cost = 19.216930\n", 814 | "Number of Step: 0086 cost = 17.682716\n", 815 | "Number of Step: 0086 cost = 17.063988\n", 816 | "Number of Step: 0086 cost = 18.460468\n", 817 | "Number of Step: 0086 cost = 18.014000\n", 818 | "Number of Step: 0086 cost = 16.637476\n", 819 | "Number of Step: 0086 cost 
= 18.234005\n", 820 | "Number of Step: 0086 cost = 17.542858\n", 821 | "Number of Step: 0086 cost = 18.681959\n", 822 | "Number of Step: 0087 cost = 17.915825\n", 823 | "Number of Step: 0087 cost = 18.332932\n", 824 | "Number of Step: 0087 cost = 18.023544\n", 825 | "Number of Step: 0087 cost = 17.747066\n", 826 | "Number of Step: 0087 cost = 18.476677\n", 827 | "Number of Step: 0087 cost = 18.061317\n", 828 | "Number of Step: 0087 cost = 17.447443\n", 829 | "Number of Step: 0087 cost = 17.476879\n", 830 | "Number of Step: 0088 cost = 17.811800\n", 831 | "Number of Step: 0088 cost = 17.839411\n", 832 | "Number of Step: 0088 cost = 17.015297\n", 833 | "Number of Step: 0088 cost = 18.072769\n", 834 | "Number of Step: 0088 cost = 17.579687\n", 835 | "Number of Step: 0088 cost = 17.227621\n", 836 | "Number of Step: 0088 cost = 19.641462\n", 837 | "Number of Step: 0088 cost = 17.546465\n", 838 | "Number of Step: 0089 cost = 18.312948\n", 839 | "Number of Step: 0089 cost = 17.350624\n", 840 | "Number of Step: 0089 cost = 18.149803\n", 841 | "Number of Step: 0089 cost = 16.513401\n", 842 | "Number of Step: 0089 cost = 17.910816\n", 843 | "Number of Step: 0089 cost = 17.231394\n", 844 | "Number of Step: 0089 cost = 18.227589\n", 845 | "Number of Step: 0089 cost = 16.880251\n", 846 | "Number of Step: 0090 cost = 15.888194\n", 847 | "Number of Step: 0090 cost = 18.700552\n", 848 | "Number of Step: 0090 cost = 17.834127\n", 849 | "Number of Step: 0090 cost = 16.903624\n", 850 | "Number of Step: 0090 cost = 17.001898\n", 851 | "Number of Step: 0090 cost = 16.596476\n", 852 | "Number of Step: 0090 cost = 17.636972\n", 853 | "Number of Step: 0090 cost = 18.484329\n", 854 | "Number of Step: 0091 cost = 18.303387\n", 855 | "Number of Step: 0091 cost = 17.834642\n", 856 | "Number of Step: 0091 cost = 17.869686\n", 857 | "Number of Step: 0091 cost = 16.905575\n", 858 | "Number of Step: 0091 cost = 17.179218\n", 859 | "Number of Step: 0091 cost = 17.584888\n", 860 | "Number of Step: 0091 cost = 17.895836\n", 861 | "Number of Step: 0091 cost = 15.996117\n", 862 | "Number of Step: 0092 cost = 17.124043\n", 863 | "Number of Step: 0092 cost = 16.982082\n", 864 | "Number of Step: 0092 cost = 18.135927\n", 865 | "Number of Step: 0092 cost = 18.133181\n", 866 | "Number of Step: 0092 cost = 17.417768\n", 867 | "Number of Step: 0092 cost = 17.834192\n", 868 | "Number of Step: 0092 cost = 19.040405\n", 869 | "Number of Step: 0092 cost = 18.214203\n", 870 | "Number of Step: 0093 cost = 17.028940\n", 871 | "Number of Step: 0093 cost = 17.721834\n", 872 | "Number of Step: 0093 cost = 17.565159\n", 873 | "Number of Step: 0093 cost = 17.463390\n", 874 | "Number of Step: 0093 cost = 19.197693\n", 875 | "Number of Step: 0093 cost = 16.874548\n", 876 | "Number of Step: 0093 cost = 18.761587\n", 877 | "Number of Step: 0093 cost = 17.809910\n", 878 | "Number of Step: 0094 cost = 18.114273\n", 879 | "Number of Step: 0094 cost = 17.609636\n", 880 | "Number of Step: 0094 cost = 17.840187\n", 881 | "Number of Step: 0094 cost = 17.969526\n", 882 | "Number of Step: 0094 cost = 18.489599\n", 883 | "Number of Step: 0094 cost = 16.545397\n", 884 | "Number of Step: 0094 cost = 17.046900\n", 885 | "Number of Step: 0094 cost = 17.239439\n", 886 | "Number of Step: 0095 cost = 17.767996\n", 887 | "Number of Step: 0095 cost = 16.229298\n", 888 | "Number of Step: 0095 cost = 16.676342\n", 889 | "Number of Step: 0095 cost = 17.530474\n", 890 | "Number of Step: 0095 cost = 17.203407\n", 891 | "Number of Step: 0095 cost = 17.162613\n", 892 | 
"Number of Step: 0095 cost = 15.997195\n", 893 | "Number of Step: 0095 cost = 17.632429\n", 894 | "Number of Step: 0096 cost = 18.374067\n", 895 | "Number of Step: 0096 cost = 17.368975\n", 896 | "Number of Step: 0096 cost = 17.390326\n", 897 | "Number of Step: 0096 cost = 18.181248\n", 898 | "Number of Step: 0096 cost = 17.433788\n", 899 | "Number of Step: 0096 cost = 17.595045\n", 900 | "Number of Step: 0096 cost = 17.498888\n", 901 | "Number of Step: 0096 cost = 17.583176\n", 902 | "Number of Step: 0097 cost = 17.504669\n", 903 | "Number of Step: 0097 cost = 17.342361\n", 904 | "Number of Step: 0097 cost = 17.802631\n", 905 | "Number of Step: 0097 cost = 16.414183\n", 906 | "Number of Step: 0097 cost = 16.846893\n", 907 | "Number of Step: 0097 cost = 17.576498\n", 908 | "Number of Step: 0097 cost = 18.128235\n", 909 | "Number of Step: 0097 cost = 18.250656\n", 910 | "Number of Step: 0098 cost = 18.195713\n", 911 | "Number of Step: 0098 cost = 16.961403\n", 912 | "Number of Step: 0098 cost = 16.245911\n", 913 | "Number of Step: 0098 cost = 16.547680\n", 914 | "Number of Step: 0098 cost = 17.897562\n", 915 | "Number of Step: 0098 cost = 17.286310\n", 916 | "Number of Step: 0098 cost = 17.829388\n", 917 | "Number of Step: 0098 cost = 18.228642\n", 918 | "Number of Step: 0099 cost = 16.456144\n", 919 | "Number of Step: 0099 cost = 17.276258\n", 920 | "Number of Step: 0099 cost = 16.501991\n", 921 | "Number of Step: 0099 cost = 17.593954\n", 922 | "Number of Step: 0099 cost = 17.236393\n", 923 | "Number of Step: 0099 cost = 17.581354\n", 924 | "Number of Step: 0099 cost = 17.807911\n", 925 | "Number of Step: 0099 cost = 17.202646\n", 926 | "Number of Step: 0100 cost = 17.215481\n", 927 | "Number of Step: 0100 cost = 16.990873\n", 928 | "Number of Step: 0100 cost = 16.657173\n", 929 | "Number of Step: 0100 cost = 17.039188\n", 930 | "Number of Step: 0100 cost = 17.793745\n", 931 | "Number of Step: 0100 cost = 17.052168\n", 932 | "Number of Step: 0100 cost = 17.739040\n", 933 | "Number of Step: 0100 cost = 17.128641\n" 934 | ], 935 | "name": "stdout" 936 | } 937 | ] 938 | } 939 | ] 940 | } -------------------------------------------------------------------------------- /data.txt: -------------------------------------------------------------------------------- 1 | But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account of the system, and expound the actual teachings of the great explorer of the truth, the master-builder of human happiness. No one rejects, dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, but because occasionally circumstances occur in which toil and pain can procure him some great pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, except to obtain some advantage from it? But who has any right to find fault with a man who chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that produces no resultant pleasure? 2 | But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account of the system, and expound the actual teachings of the great explorer of the truth, the master-builder of human happiness. 
No one rejects, dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, but because occasionally circumstances occur in which toil and pain can procure him some great pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, except to obtain some advantage from it? But who has any right to find fault with a man who chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that produces no resultant pleasure? 3 | But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account of the system, and expound the actual teachings of the great explorer of the truth, the master-builder of human happiness. No one rejects, dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, but because occasionally circumstances occur in which toil and pain can procure him some great pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, except to obtain some advantage from it? But who has any right to find fault with a man who chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that produces no resultant pleasure? 4 | But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account of the system, and expound the actual teachings of the great explorer of the truth, the master-builder of human happiness. No one rejects, dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, but because occasionally circumstances occur in which toil and pain can procure him some great pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, except to obtain some advantage from it? But who has any right to find fault with a man who chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that produces no resultant pleasure? 5 | But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account of the system, and expound the actual teachings of the great explorer of the truth, the master-builder of human happiness. No one rejects, dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, but because occasionally circumstances occur in which toil and pain can procure him some great pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, except to obtain some advantage from it? 
But who has any right to find fault with a man who chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that produces no resultant pleasure? 6 | But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account of the system, and expound the actual teachings of the great explorer of the truth, the master-builder of human happiness. No one rejects, dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, but because occasionally circumstances occur in which toil and pain can procure him some great pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, except to obtain some advantage from it? But who has any right to find fault with a man who chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that produces no resultant pleasure? 7 | But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account of the system, and expound the actual teachings of the great explorer of the truth, the master-builder of human happiness. No one rejects, dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, but because occasionally circumstances occur in which toil and pain can procure him some great pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, except to obtain some advantage from it? But who has any right to find fault with a man who chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that produces no resultant pleasure? 8 | But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account of the system, and expound the actual teachings of the great explorer of the truth, the master-builder of human happiness. No one rejects, dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, but because occasionally circumstances occur in which toil and pain can procure him some great pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, except to obtain some advantage from it? But who has any right to find fault with a man who chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that produces no resultant pleasure? 9 | But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account of the system, and expound the actual teachings of the great explorer of the truth, the master-builder of human happiness. 
No one rejects, dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, but because occasionally circumstances occur in which toil and pain can procure him some great pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, except to obtain some advantage from it? But who has any right to find fault with a man who chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that produces no resultant pleasure? 10 | But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account of the system, and expound the actual teachings of the great explorer of the truth, the master-builder of human happiness. No one rejects, dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, but because occasionally circumstances occur in which toil and pain can procure him some great pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, except to obtain some advantage from it? But who has any right to find fault with a man who chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that produces no resultant pleasure? 11 | But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account of the system, and expound the actual teachings of the great explorer of the truth, the master-builder of human happiness. No one rejects, dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, but because occasionally circumstances occur in which toil and pain can procure him some great pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, except to obtain some advantage from it? But who has any right to find fault with a man who chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that produces no resultant pleasure? 12 | But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account of the system, and expound the actual teachings of the great explorer of the truth, the master-builder of human happiness. No one rejects, dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, but because occasionally circumstances occur in which toil and pain can procure him some great pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, except to obtain some advantage from it? 
But who has any right to find fault with a man who chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that produces no resultant pleasure? -------------------------------------------------------------------------------- /data_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Copyright 2019 Tae Hwan Jung 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | """ 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | import random 24 | 25 | import torch 26 | import numpy as np 27 | 28 | special_symbols = { 29 | "[UNK]" : 0, 30 | "[CLS]" : 1, 31 | "[SEP]" : 2, 32 | "[PAD]" : 3, 33 | "[MASK]" : 4, 34 | } 35 | UNK_ID = special_symbols["[UNK]"] 36 | CLS_ID = special_symbols["[CLS]"] 37 | SEP_ID = special_symbols["[SEP]"] 38 | MASK_ID = special_symbols["[MASK]"] 39 | 40 | def _split_a_and_b(data, sent_ids, begin_idx, tot_len, extend_target=False): 41 | """Split two segments from `data` starting from the index `begin_idx`.""" 42 | 43 | data_len = data.shape[0] 44 | if begin_idx + tot_len >= data_len: 45 | print("[_split_a_and_b] returns None: " 46 | "begin_idx %d + tot_len %d >= data_len %d", 47 | begin_idx, tot_len, data_len) 48 | return None 49 | 50 | end_idx = begin_idx + 1 51 | cut_points = [] 52 | while end_idx < data_len: 53 | if sent_ids[end_idx] != sent_ids[end_idx - 1]: 54 | if end_idx - begin_idx >= tot_len: break 55 | cut_points.append(end_idx) 56 | end_idx += 1 57 | 58 | a_begin = begin_idx 59 | if len(cut_points) == 0 or random.random() < 0.5: 60 | # NotNext 61 | label = 0 62 | if len(cut_points) == 0: 63 | a_end = end_idx 64 | else: 65 | a_end = random.choice(cut_points) 66 | 67 | b_len = max(1, tot_len - (a_end - a_begin)) 68 | # (zihang): `data_len - 1` to account for extend_target 69 | b_begin = random.randint(0, data_len - 1 - b_len) 70 | b_end = b_begin + b_len 71 | while b_begin > 0 and sent_ids[b_begin - 1] == sent_ids[b_begin]: 72 | b_begin -= 1 73 | # (zihang): `data_len - 1` to account for extend_target 74 | while b_end < data_len - 1 and sent_ids[b_end - 1] == sent_ids[b_end]: 75 | b_end += 1 76 | 77 | new_begin = a_end 78 | else: 79 | # isNext 80 | label = 1 81 | a_end = random.choice(cut_points) 82 | b_begin = a_end 83 | b_end = end_idx 84 | 85 | new_begin = b_end 86 | 87 | while a_end - a_begin + b_end - b_begin > tot_len: 88 | if a_end - a_begin > b_end - b_begin: 89 | # delete the right side only for the LM objective 90 | a_end -= 1 91 | else: 92 | b_end -= 1 93 | 94 | ret = [data[a_begin: a_end], data[b_begin: b_end], label, new_begin] 95 | 96 | if extend_target: 97 | if a_end >= data_len or b_end >= data_len: 98 | print("[_split_a_and_b] returns None: " 99 | "a_end %d or b_end %d >= data_len %d", 100 | a_end, b_end, data_len) 101 | return None 102 | a_target = data[a_begin + 1: a_end + 1] 103 | b_target = data[b_begin: b_end + 1] 104 | 
ret.extend([a_target, b_target]) 105 | 106 | return ret 107 | 108 | def _is_start_piece(piece): 109 | special_pieces = set(list('!"#$%&\"()*+,-./:;?@[\\]^_`{|}~')) 110 | piece = ''.join(piece) 111 | if (piece.startswith("▁") or piece.startswith("<") 112 | or piece in special_pieces): 113 | return True 114 | else: 115 | return False 116 | 117 | def _sample_mask(sp, seg, mask_alpha, mask_beta, 118 | reverse=False, max_gram=5, goal_num_predict=None): 119 | """Sample `goal_num_predict` tokens for partial prediction. 120 | About `mask_beta` tokens are chosen in a context of `mask_alpha` tokens.""" 121 | 122 | seg_len = len(seg) 123 | mask = np.array([False] * seg_len, dtype=np.bool) 124 | 125 | num_predict = 0 126 | 127 | ngrams = np.arange(1, max_gram + 1, dtype=np.int64) 128 | pvals = 1. / np.arange(1, max_gram + 1) 129 | pvals /= pvals.sum(keepdims=True) 130 | 131 | if reverse: 132 | seg = np.flip(seg, 0) 133 | 134 | cur_len = 0 135 | while cur_len < seg_len: 136 | if goal_num_predict is not None and num_predict >= goal_num_predict: break 137 | 138 | n = np.random.choice(ngrams, p=pvals) 139 | if goal_num_predict is not None: 140 | n = min(n, goal_num_predict - num_predict) 141 | ctx_size = (n * mask_alpha) // mask_beta 142 | l_ctx = np.random.choice(ctx_size) 143 | r_ctx = ctx_size - l_ctx 144 | 145 | # Find the start position of a complete token 146 | beg = cur_len + l_ctx 147 | while beg < seg_len and not _is_start_piece(sp.convert_ids_to_tokens([seg[beg].item()])): 148 | beg += 1 149 | if beg >= seg_len: 150 | break 151 | 152 | # Find the end position of the n-gram (start pos of the n+1-th gram) 153 | end = beg + 1 154 | cnt_ngram = 1 155 | while end < seg_len: 156 | if _is_start_piece(sp.convert_ids_to_tokens([seg[beg].item()])): 157 | cnt_ngram += 1 158 | if cnt_ngram > n: 159 | break 160 | end += 1 161 | if end >= seg_len: 162 | break 163 | 164 | # Update 165 | mask[beg:end] = True 166 | num_predict += end - beg 167 | 168 | cur_len = end + r_ctx 169 | 170 | while goal_num_predict is not None and num_predict < goal_num_predict: 171 | i = np.random.randint(seg_len) 172 | if not mask[i]: 173 | mask[i] = True 174 | num_predict += 1 175 | 176 | if reverse: 177 | mask = np.flip(mask, 0) 178 | 179 | return mask 180 | 181 | def _create_data(sp, input_paths, seq_len, reuse_len, 182 | bi_data, num_predict, mask_alpha, mask_beta): 183 | features = [] 184 | 185 | f = open(input_paths, 'r') 186 | lines = f.readlines() 187 | input_data, sent_ids, sent_id = [], [], True 188 | 189 | for line in lines: 190 | tokens = sp.tokenize(line) 191 | cur_sent = sp.convert_tokens_to_ids(tokens) 192 | input_data.extend(cur_sent) 193 | sent_ids.extend([sent_id] * len(cur_sent)) 194 | sent_id = not sent_id 195 | 196 | # shape of data : [1, 582] 197 | data = np.array([input_data], dtype=np.int64) 198 | sent_ids = np.array([sent_ids], dtype=np.bool) 199 | 200 | assert reuse_len < seq_len - 3 201 | 202 | data_len = data.shape[1] 203 | sep_array = np.array([SEP_ID], dtype=np.int64) 204 | cls_array = np.array([CLS_ID], dtype=np.int64) 205 | 206 | i = 0 207 | while i + seq_len <= data_len: 208 | inp = data[0, i: i + reuse_len] 209 | tgt = data[0, i + 1: i + reuse_len + 1] 210 | 211 | results = _split_a_and_b( 212 | data[0], # all line in one Text file. 
213 | sent_ids[0], 214 | begin_idx=i + reuse_len, 215 | tot_len=seq_len - reuse_len - 3, 216 | extend_target=True) 217 | 218 | # unpack the results 219 | (a_data, b_data, label, _, a_target, b_target) = tuple(results) 220 | 221 | # sample ngram spans to predict 222 | reverse = bi_data 223 | if num_predict is None: 224 | num_predict_0 = num_predict_1 = None 225 | else: 226 | num_predict_1 = num_predict // 2 227 | num_predict_0 = num_predict - num_predict_1 228 | 229 | mask_0 = _sample_mask(sp, inp, mask_alpha, mask_beta, reverse=reverse, 230 | goal_num_predict=num_predict_0) 231 | mask_1 = _sample_mask(sp, np.concatenate([a_data, sep_array, b_data, 232 | sep_array, cls_array]), 233 | mask_alpha, mask_beta, 234 | reverse=reverse, goal_num_predict=num_predict_1) 235 | 236 | # concatenate data 237 | cat_data = np.concatenate([inp, a_data, sep_array, b_data, 238 | sep_array, cls_array]) 239 | seg_id = ([0] * (reuse_len + a_data.shape[0]) + [0] + 240 | [1] * b_data.shape[0] + [1] + [2]) 241 | assert cat_data.shape[0] == seq_len 242 | assert mask_0.shape[0] == seq_len // 2 243 | assert mask_1.shape[0] == seq_len // 2 244 | 245 | # the last two CLS's are not used, just for padding purposes 246 | tgt = np.concatenate([tgt, a_target, b_target, cls_array, cls_array]) 247 | assert tgt.shape[0] == seq_len 248 | 249 | is_masked = np.concatenate([mask_0, mask_1], 0) 250 | if num_predict is not None: 251 | assert np.sum(is_masked) == num_predict 252 | 253 | feature = { 254 | "input": cat_data, 255 | "is_masked": is_masked, 256 | "target": tgt, 257 | "seg_id": seg_id, 258 | "label": [label], 259 | } 260 | features.append(feature) 261 | 262 | i += reuse_len 263 | 264 | f.close() 265 | return features 266 | 267 | def _local_perm(inputs, targets, is_masked, perm_size, seq_len): 268 | """ 269 | Sample a permutation of the factorization order, and create an 270 | attention mask accordingly. 271 | 272 | Args: 273 | inputs: int64 Tensor in shape [seq_len], input ids. 274 | targets: int64 Tensor in shape [seq_len], target ids. 275 | is_masked: bool Tensor in shape [seq_len]. True means being selected 276 | for partial prediction. 277 | perm_size: the length of longest permutation. Could be set to be reuse_len. 278 | Should not be larger than reuse_len or there will be data leaks. 279 | seq_len: int, sequence length. 
280 | """ 281 | 282 | # Generate permutation indices 283 | index = torch.arange(seq_len, dtype=torch.int64) 284 | 285 | index = torch.reshape(index, [-1, perm_size]).t() 286 | index = index[torch.randperm(index.shape[0])] 287 | index = torch.reshape(index.t(), [-1]) 288 | 289 | # `perm_mask` and `target_mask` 290 | # non-functional tokens 291 | non_func_tokens = ~(torch.eq(inputs, SEP_ID) | torch.eq(inputs, CLS_ID)) 292 | non_mask_tokens = (~is_masked) & non_func_tokens 293 | masked_or_func_tokens = ~non_mask_tokens 294 | 295 | # Set the permutation indices of non-masked (& non-funcional) tokens to the 296 | # smallest index (-1): 297 | # (1) they can be seen by all other positions 298 | # (2) they cannot see masked positions, so there won"t be information leak 299 | smallest_index = -torch.ones([seq_len], dtype=torch.int64) 300 | 301 | # put -1 if `non_mask_tokens(real token not cls or sep)` not permutation index 302 | rev_index = torch.where(non_mask_tokens, smallest_index, index) 303 | 304 | # Create `target_mask`: non-funcional and maksed tokens 305 | # 1: use mask as input and have loss 306 | # 0: use token (or [SEP], [CLS]) as input and do not have loss 307 | target_tokens = masked_or_func_tokens & non_func_tokens 308 | target_mask = target_tokens.type(torch.float32) 309 | 310 | # Create `perm_mask` 311 | # `target_tokens` cannot see themselves 312 | # put `rev_index` if real mask(not cls or sep) else `rev_index + 1` 313 | self_rev_index = torch.where(target_tokens, rev_index, rev_index + 1) 314 | 315 | # 1: cannot attend if i <= j and j is not non-masked (masked_or_func_tokens) 316 | # 0: can attend if i > j or j is non-masked 317 | perm_mask = (self_rev_index[:, None] <= rev_index[None, :]) & masked_or_func_tokens 318 | perm_mask = perm_mask.type(torch.float32) 319 | 320 | # new target: [next token] for LM and [curr token] (self) for PLM 321 | new_targets = torch.cat([inputs[0: 1], targets[: -1]], dim=0) 322 | 323 | # construct inputs_k 324 | inputs_k = inputs 325 | 326 | # construct inputs_q 327 | inputs_q = target_mask 328 | 329 | return perm_mask, new_targets, target_mask, inputs_k, inputs_q 330 | 331 | def make_permute(feature, reuse_len, seq_len, perm_size, num_predict): 332 | 333 | inputs = torch.LongTensor(feature.pop("input")) 334 | target = torch.LongTensor(feature.pop("target")) 335 | is_masked = torch.ByteTensor(feature.pop("is_masked")) 336 | 337 | non_reuse_len = seq_len - reuse_len 338 | assert perm_size <= reuse_len and perm_size <= non_reuse_len 339 | 340 | perm_mask_0, target_0, target_mask_0, input_k_0, input_q_0 = _local_perm( 341 | inputs[:reuse_len], # inp 342 | target[:reuse_len], 343 | is_masked[:reuse_len], 344 | perm_size, 345 | reuse_len) 346 | 347 | perm_mask_1, target_1, target_mask_1, input_k_1, input_q_1 = _local_perm( 348 | inputs[reuse_len:], # (senA, seq, senBm seq, cls) 349 | target[reuse_len:], 350 | is_masked[reuse_len:], 351 | perm_size, 352 | non_reuse_len) 353 | 354 | perm_mask_0 = torch.cat([perm_mask_0, torch.ones([reuse_len, non_reuse_len])], 355 | dim=1) 356 | perm_mask_1 = torch.cat([torch.zeros([non_reuse_len, reuse_len]), perm_mask_1], 357 | dim=1) 358 | 359 | perm_mask = torch.cat([perm_mask_0, perm_mask_1], dim=0) 360 | target = torch.cat([target_0, target_1], dim=0) 361 | target_mask = torch.cat([target_mask_0, target_mask_1], dim=0) 362 | input_k = torch.cat([input_k_0, input_k_1], dim=0) 363 | input_q = torch.cat([input_q_0, input_q_1], dim=0) 364 | 365 | if num_predict is not None: 366 | indices = torch.arange(seq_len, 
dtype=torch.int64) 367 | bool_target_mask = target_mask.byte() 368 | indices = indices[bool_target_mask] 369 | 370 | ##### extra padding due to CLS/SEP introduced after prepro 371 | actual_num_predict = indices.shape[0] 372 | pad_len = num_predict - actual_num_predict 373 | 374 | assert seq_len >= actual_num_predict 375 | 376 | ##### target_mapping 377 | target_mapping = torch.eye(seq_len, dtype=torch.float32)[indices] 378 | paddings = torch.zeros([pad_len, seq_len], dtype=target_mapping.dtype) 379 | target_mapping = torch.cat([target_mapping, paddings], dim=0) 380 | feature["target_mapping"] = torch.reshape(target_mapping, 381 | [num_predict, seq_len]) 382 | ##### target 383 | target = target[bool_target_mask] 384 | paddings = torch.zeros([pad_len], dtype=target.dtype) 385 | target = torch.cat([target, paddings], dim=0) 386 | feature["target"] = torch.reshape(target, [num_predict]) 387 | 388 | ##### target mask 389 | target_mask = torch.cat( 390 | [torch.ones([actual_num_predict], dtype=torch.float32), 391 | torch.zeros([pad_len], dtype=torch.float32)], 392 | dim=0) 393 | feature["target_mask"] = torch.reshape(target_mask, [num_predict]) 394 | else: 395 | feature["target"] = torch.reshape(target, [seq_len]) 396 | feature["target_mask"] = torch.reshape(target_mask, [seq_len]) 397 | 398 | # reshape back to fixed shape 399 | feature["seg_id"] = torch.IntTensor(feature["seg_id"]) 400 | feature["perm_mask"] = torch.reshape(perm_mask, [seq_len, seq_len]) 401 | feature["input_k"] = torch.reshape(input_k, [seq_len]) 402 | feature["input_q"] = torch.reshape(input_q, [seq_len]) 403 | 404 | return feature -------------------------------------------------------------------------------- /images/AEmodel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graykode/xlnet-Pytorch/cb793a1c75bdc59e3360f04ec641af726719811f/images/AEmodel.png -------------------------------------------------------------------------------- /images/ARmodel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graykode/xlnet-Pytorch/cb793a1c75bdc59e3360f04ec641af726719811f/images/ARmodel.png -------------------------------------------------------------------------------- /images/PLM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graykode/xlnet-Pytorch/cb793a1c75bdc59e3360f04ec641af726719811f/images/PLM.png -------------------------------------------------------------------------------- /images/ParPrediction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graykode/xlnet-Pytorch/cb793a1c75bdc59e3360f04ec641af726719811f/images/ParPrediction.png -------------------------------------------------------------------------------- /images/hyperparameters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graykode/xlnet-Pytorch/cb793a1c75bdc59e3360f04ec641af726719811f/images/hyperparameters.png -------------------------------------------------------------------------------- /images/target-aware.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/graykode/xlnet-Pytorch/cb793a1c75bdc59e3360f04ec641af726719811f/images/target-aware.png -------------------------------------------------------------------------------- /images/twoattn.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/graykode/xlnet-Pytorch/cb793a1c75bdc59e3360f04ec641af726719811f/images/twoattn.png -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Copyright 2019 Tae Hwan Jung 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | """ 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | import data_utils 24 | import argparse 25 | 26 | import xlnet 27 | import torch 28 | import numpy as np 29 | import torch.nn as nn 30 | import torch.optim as optim 31 | from pytorch_pretrained_bert import BertTokenizer 32 | 33 | if __name__ == "__main__": 34 | parser = argparse.ArgumentParser(description='PyTorch XLNet Language Model') 35 | parser.add_argument('--data', type=str, default='data.txt') 36 | parser.add_argument('--tokenizer', type=str, default='bert-base-uncased', 37 | help='Path to the sentence piece model from pytorch-pretrained-BERT') 38 | parser.add_argument('--seq_len', type=int, default=512, help="Sequence length.") 39 | parser.add_argument('--reuse_len', type=int, default=256, 40 | help="Number of token that can be reused as memory. " 41 | "Could be half of `seq_len`.") 42 | parser.add_argument('--perm_size', type=int, 43 | default=256, 44 | help="the length of longest permutation. 
Could be set to be reuse_len.") 45 | parser.add_argument('--bi_data', type=bool, default=False, 46 | help="whether to create bidirectional data") 47 | parser.add_argument('--mask_alpha', type=int, 48 | default=6, help="How many tokens to form a group.") 49 | parser.add_argument('--mask_beta', type=int, 50 | default=1, help="How many tokens to mask within each group.") 51 | parser.add_argument('--num_predict', type=int, 52 | default=85, help="Num of tokens to predict.") 53 | parser.add_argument('--mem_len', type=int, 54 | default=384, help="Number of steps to cache") 55 | parser.add_argument('--num_epoch', type=int, 56 | default=100, help="Number of epochs") 57 | 58 | args = parser.parse_args() 59 | 60 | sp = BertTokenizer.from_pretrained(args.tokenizer) 61 | model = xlnet.XLNet(n_token=len(sp.vocab), n_layer=6, n_head=4, d_head=8, 62 | d_inner=32, d_model=32, 63 | dropout=0.1, dropatt=0.1, 64 | attn_type="bi", bi_data=args.bi_data, 65 | clamp_len=-1, same_length=False, 66 | reuse_len=args.reuse_len, mem_len=args.mem_len) 67 | 68 | criterion = nn.CrossEntropyLoss() 69 | optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.01) 70 | 71 | for num_epoch in range(args.num_epoch): 72 | mems = None 73 | 74 | features = data_utils._create_data(sp=sp, 75 | input_paths=args.data, 76 | seq_len=args.seq_len, 77 | reuse_len=args.reuse_len, 78 | bi_data=args.bi_data, 79 | num_predict=args.num_predict, 80 | mask_alpha=args.mask_alpha, 81 | mask_beta=args.mask_beta) 82 | 83 | num_step = 0 84 | for feature in features: 85 | permutation = data_utils.make_permute(feature, 86 | reuse_len=args.reuse_len, 87 | seq_len=args.seq_len, 88 | perm_size=args.perm_size, 89 | num_predict=args.num_predict) 90 | 91 | # batch size is 1 92 | inp_k = permutation['input_k'].unsqueeze(-1) # [seq_len, 1(=bsz)] 93 | seg_id = permutation['seg_id'].unsqueeze(-1) # [seq_len, 1(=bsz)] 94 | target = permutation['target'].unsqueeze(-1) # [num_predict, 1(=bsz)] 95 | perm_mask = permutation['perm_mask'].unsqueeze(-1) # [seq_len, seq_len, 1(=bsz)] 96 | target_mapping = \ 97 | permutation['target_mapping'].unsqueeze(-1) # [num_predict, seq_len, 1(=bsz)] 98 | inp_q = permutation['input_q'].unsqueeze(-1) # [seq_len, 1(=bsz)] 99 | tgt_mask = permutation['target_mask'].unsqueeze(-1) # [num_predict, 1(=bsz)] 100 | 101 | logits, new_mems = model(inp_k=inp_k, seg_id=seg_id, input_mask=None, 102 | mems=mems, perm_mask=perm_mask, 103 | target_mapping=target_mapping, inp_q=inp_q) 104 | 105 | lm_loss = criterion(logits.transpose(1, 2), target).type(torch.float32) 106 | tgt_mask_sum = tgt_mask.reshape(-1).sum() 107 | lm_loss_sum = (lm_loss * tgt_mask).reshape(-1).sum() 108 | 109 | optimizer.zero_grad() 110 | total_loss = lm_loss_sum / tgt_mask_sum 111 | print('Number of Epoch: %04d in %04d Step' % ((num_epoch + 1), (num_step + 1)), 112 | 'cost =', '{:.6f}'.format(total_loss)) 113 | num_step += 1 114 | 115 | total_loss.backward() 116 | optimizer.step() 117 | 118 | mems = new_mems -------------------------------------------------------------------------------- /xlnet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Copyright 2019 Tae Hwan Jung 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | """ 18 | 19 | from __future__ import absolute_import 20 | from __future__ import division 21 | from __future__ import print_function 22 | 23 | import numpy as np 24 | 25 | import torch 26 | import torch.nn as nn 27 | import torch.nn.functional as F 28 | 29 | class XLNet(nn.Module): 30 | """ 31 | Defines a Transformer-XL computation graph with additional 32 | support for XLNet. 33 | 34 | Args: 35 | 36 | inp_k: int32 Tensor in shape [len, bsz], the input token IDs. 37 | seg_id: int32 Tensor in shape [len, bsz], the input segment IDs. 38 | input_mask: float32 Tensor in shape [len, bsz], the input mask. 39 | 0 for real tokens and 1 for padding. 40 | mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory 41 | from previous batches. The length of the list equals n_layer. 42 | If None, no memory is used. 43 | perm_mask: float32 Tensor in shape [len, len, bsz]. 44 | If perm_mask[i, j, k] = 0, i attend to j in batch k; 45 | if perm_mask[i, j, k] = 1, i does not attend to j in batch k. 46 | If None, each position attends to all the others. 47 | target_mapping: float32 Tensor in shape [num_predict, len, bsz]. 48 | If target_mapping[i, j, k] = 1, the i-th predict in batch k is 49 | on the j-th token. 50 | Only used during pretraining for partial prediction. 51 | Set to None during finetuning. 52 | inp_q: float32 Tensor in shape [len, bsz]. 53 | 1 for tokens with losses and 0 for tokens without losses. 54 | Only used during pretraining for two-stream attention. 55 | Set to None during finetuning. 56 | 57 | n_layer: int, the number of layers. 58 | d_model: int, the hidden size. 59 | n_head: int, the number of attention heads. 60 | d_head: int, the dimension size of each attention head. 61 | d_inner: int, the hidden size in feed-forward layers. 62 | ff_activation: str, "relu" or "gelu". 63 | n_token: int, the vocab size. 64 | 65 | dropout: float, dropout rate. 66 | dropatt: float, dropout rate on attention probabilities. 67 | 68 | mem_len: int, the number of tokens to cache. 69 | reuse_len: int, the number of tokens in the currect batch to be cached 70 | and reused in the future. 71 | bi_data: bool, whether to use bidirectional input pipeline. 72 | Usually set to True during pretraining and False during finetuning. 73 | clamp_len: int, clamp all relative distances larger than clamp_len. 74 | -1 means no clamping. 
75 | 76 | """ 77 | def __init__(self, n_token, n_layer, n_head, d_head, d_inner, d_model, dropout, dropatt, 78 | attn_type, bi_data, clamp_len, same_length, reuse_len, mem_len): 79 | super(XLNet, self).__init__() 80 | 81 | self.n_token = n_token 82 | self.n_layer = n_layer 83 | self.n_head = n_head 84 | self.d_head = d_head 85 | self.d_inner = d_inner 86 | self.d_model = d_model 87 | self.dropout = dropout 88 | self.dropatt = dropatt 89 | self.attn_type = attn_type 90 | self.bi_data = bi_data 91 | self.clamp_len = clamp_len 92 | self.same_length = same_length 93 | self.reuse_len = reuse_len 94 | self.mem_len = mem_len 95 | 96 | self.embedding = nn.Embedding(n_token, d_model) 97 | self.Dropout = nn.Dropout(p=dropout) 98 | self.DropAttn = nn.Dropout(p=dropatt) 99 | 100 | self.r_w_bias = nn.Parameter(torch.randn(self.n_layer, 101 | self.n_head,self.d_head)) 102 | self.r_r_bias = nn.Parameter(torch.randn(self.n_layer, 103 | self.n_head, self.d_head)) 104 | 105 | ##### Segment embedding 106 | self.r_s_bias = nn.Parameter(torch.randn(self.n_layer, 107 | self.n_head,self.d_head)) 108 | 109 | self.seg_embed = nn.Parameter(torch.randn(self.n_layer, 2, 110 | self.n_head, self.d_head)) 111 | 112 | self.mask_emb = nn.Parameter(torch.randn(1, 1, d_model)) 113 | 114 | # post-attention projection (back to `d_model`) 115 | self.proj_o = nn.Parameter(torch.randn(self.d_model, 116 | self.n_head, self.d_head)) 117 | 118 | #### Project hidden states to a specific head with a 4D-shape. 119 | self.q_proj_weight = nn.Parameter(torch.randn(self.d_model, 120 | self.n_head, self.d_head)) 121 | self.k_proj_weight = nn.Parameter(torch.randn(self.d_model, 122 | self.n_head, self.d_head)) 123 | self.v_proj_weight = nn.Parameter(torch.randn(self.d_model, 124 | self.n_head, self.d_head)) 125 | self.r_proj_weight = nn.Parameter(torch.randn(self.d_model, 126 | self.n_head, self.d_head)) 127 | 128 | self.layer_norm = nn.LayerNorm(d_model) 129 | 130 | self.conv1 = nn.Linear(d_model, d_inner) 131 | self.conv2 = nn.Linear(d_inner, d_model) 132 | self.relu = nn.ReLU(inplace=True) 133 | 134 | self.softmax_b = nn.Parameter(torch.zeros(self.n_token)) 135 | 136 | 137 | def gelu(self, x): 138 | """Gaussian Error Linear Unit. 139 | 140 | This is a smoother version of the RELU. 141 | Original paper: https://arxiv.org/abs/1606.08415 142 | Args: 143 | x: float Tensor to perform activation. 144 | 145 | Returns: 146 | `x` with the GELU activation applied. 
147 | """ 148 | cdf = 0.5 * (1.0 + torch.tanh( 149 | (np.sqrt(2 / np.pi) * (x + 0.044715 * torch.pow(x, 3))))) 150 | return x * cdf 151 | 152 | def rel_shift(self, x, klen=-1): 153 | """perform relative shift to form the relative attention score.""" 154 | x_size = x.shape 155 | 156 | x = torch.reshape(x, [x_size[1], x_size[0], x_size[2], x_size[3]]) 157 | x = x[1:, 0:, 0:, 0:] # tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1]) 158 | x = torch.reshape(x, [x_size[0], x_size[1] - 1, x_size[2], x_size[3]]) 159 | x = x[0:, 0:klen, 0:, 0:] # tf.slice(x, [0, 0, 0, 0], [-1, klen, -1, -1]) 160 | 161 | return x 162 | 163 | def positionwise_ffn(self, inp, activation_type='relu'): 164 | 165 | """Position-wise Feed-forward Network.""" 166 | output = self.conv1(inp) 167 | output = self.Dropout(output) 168 | if activation_type == 'relu': 169 | output = self.relu(output) 170 | elif activation_type == 'gelu': 171 | output = self.gelu(output) 172 | else: 173 | raise ValueError('Unsupported activation type {}'.format(activation_type)) 174 | 175 | output = self.layer_norm(output + inp) 176 | return output 177 | 178 | def post_attention(self, h, attn_vec, residual=True): 179 | """Post-attention processing.""" 180 | 181 | # post-attention projection (back to `d_model`) 182 | attn_out = torch.einsum('ibnd,hnd->ibh', attn_vec, self.proj_o) 183 | 184 | attn_out = self.Dropout(attn_out) 185 | if residual: 186 | output = self.layer_norm(attn_out + h) 187 | else: 188 | output = self.layer_norm(attn_out) 189 | 190 | return output 191 | 192 | def head_projection(self, h, name): 193 | """Project hidden states to a specific head with a 4D-shape.""" 194 | proj_weight = None 195 | if name == 'q': 196 | proj_weight = self.q_proj_weight 197 | elif name == 'k': 198 | proj_weight = self.k_proj_weight 199 | elif name =='v': 200 | proj_weight = self.v_proj_weight 201 | elif name == 'r': 202 | proj_weight = self.r_proj_weight 203 | else: 204 | raise ValueError('Unknown `name` {}.'.format(name)) 205 | 206 | head = torch.einsum('ibh,hnd->ibnd', h, proj_weight) 207 | 208 | return head 209 | 210 | def rel_attn_core(self, q_head, k_head_h, v_head_h, k_head_r, seg_embed, seg_mat, 211 | r_w_bias, r_r_bias, r_s_bias, attn_mask, scale): 212 | 213 | """Core relative positional attention operations.""" 214 | 215 | # content based attention score 216 | ac = torch.einsum('ibnd,jbnd->ijbn', q_head + r_w_bias, k_head_h) 217 | 218 | # position based attention score 219 | bd = torch.einsum('ibnd,jbnd->ijbn', q_head + r_r_bias, k_head_r) 220 | bd = self.rel_shift(bd, klen=ac.shape[1]) 221 | 222 | # segment based attention score 223 | if seg_mat is None: 224 | ef = 0 225 | else: 226 | ef = torch.einsum('ibnd,snd->ibns', q_head + r_s_bias, seg_embed) 227 | ef = torch.einsum('ijbs,ibns->ijbn', seg_mat, ef) 228 | 229 | # merge attention scores and perform masking 230 | attn_score = (ac + bd + ef) * scale 231 | if attn_mask is not None: 232 | # attn_score = attn_score * (1 - attn_mask) - 1e30 * attn_mask 233 | attn_score = attn_score - 1e30 * attn_mask 234 | 235 | # attention probability 236 | attn_prob = F.softmax(attn_score, dim=1) 237 | attn_prob = self.DropAttn(attn_prob) 238 | 239 | # attention output 240 | attn_vec = torch.einsum('ijbn,jbnd->ibnd', attn_prob, v_head_h) 241 | 242 | return attn_vec 243 | 244 | def rel_multihead_attn(self, h, r, r_w_bias, r_r_bias, seg_mat, r_s_bias, seg_embed, 245 | attn_mask, mems, d_model, n_head, d_head, dropout, dropatt): 246 | """Multi-head attention with relative positional encoding.""" 247 | 248 | scale = 1 / 
(d_head ** 0.5) 249 | if mems is not None and len(mems.size()) > 1: 250 | cat = torch.cat([mems, h], dim=0) 251 | else: 252 | cat = h 253 | 254 | # content heads 255 | q_head_h = self.head_projection(h, 'q') 256 | k_head_h = self.head_projection(cat, 'k') 257 | v_head_h = self.head_projection(cat, 'v') 258 | 259 | # positional heads 260 | k_head_r = self.head_projection(r, 'r') 261 | 262 | # core attention ops 263 | attn_vec = self.rel_attn_core( 264 | q_head_h, k_head_h, v_head_h, k_head_r, seg_embed, seg_mat, r_w_bias, 265 | r_r_bias, r_s_bias, attn_mask, scale) 266 | 267 | # post processing 268 | output = self.post_attention(h, attn_vec) 269 | 270 | return output 271 | 272 | def two_stream_rel_attn(self, h, g, r, mems, r_w_bias, r_r_bias, seg_mat, r_s_bias, 273 | seg_embed, attn_mask_h, attn_mask_g, target_mapping): 274 | scale = 1 / (self.d_head ** 0.5) 275 | 276 | # content based attention score 277 | if mems is not None and len(mems.size()) > 1: 278 | cat = torch.cat([mems, h], dim=0) 279 | else: 280 | cat = h 281 | 282 | # content-based key head 283 | k_head_h = self.head_projection(cat, 'k') 284 | 285 | # content-based value head 286 | v_head_h = self.head_projection(cat, 'v') 287 | 288 | # position-based key head 289 | k_head_r = self.head_projection(r, 'r') 290 | 291 | ##### h-stream 292 | # content-stream query head 293 | q_head_h = self.head_projection(h, 'q') 294 | 295 | # core attention ops 296 | # hˆ(m)_zt = LayerNorm(h^(m-1)_zt + RelAttn(h^(m-1)_zt + [h~^(m-1), hT(m-1)_z<=t])) 297 | attn_vec_h = self.rel_attn_core( 298 | q_head_h, k_head_h, v_head_h, k_head_r, seg_embed, seg_mat, r_w_bias, 299 | r_r_bias, r_s_bias, attn_mask_h, scale) 300 | 301 | # post processing 302 | output_h = self.post_attention(h, attn_vec_h) 303 | 304 | ##### g-stream 305 | # query-stream query head 306 | q_head_g = self.head_projection(g, 'q') 307 | 308 | # core attention ops 309 | # gˆ(m)_zt = LayerNorm(g^(m-1)_zt + RelAttn(g^(m-1)_zt + [h~^(m-1), hT(m-1)_z<=t])) 310 | if target_mapping is not None: 311 | q_head_g = torch.einsum('mbnd,mlb->lbnd', q_head_g, target_mapping) 312 | attn_vec_g = self.rel_attn_core( 313 | q_head_g, k_head_h, v_head_h, k_head_r, seg_embed, seg_mat, r_w_bias, 314 | r_r_bias, r_s_bias, attn_mask_g, scale) 315 | attn_vec_g = torch.einsum('lbnd,mlb->mbnd', attn_vec_g, target_mapping) 316 | else: 317 | attn_vec_g = self.rel_attn_core( 318 | q_head_g, k_head_h, v_head_h, k_head_r, seg_embed, seg_mat, r_w_bias, 319 | r_r_bias, r_s_bias, attn_mask_g, scale) 320 | 321 | # post processing 322 | output_g = self.post_attention(g, attn_vec_g) 323 | 324 | return output_h, output_g 325 | 326 | 327 | def _create_mask(self, qlen, mlen, dtype, same_length=False): 328 | """create causal attention mask.""" 329 | # [[0,1,1], 330 | # [0,0,1], 331 | # [0,0,0]] 332 | attn_mask = torch.ones([qlen, qlen], dtype=dtype) 333 | mask_u = torch.triu(attn_mask) # Upper triangular part. 334 | mask_dia = torch.tril(attn_mask) & torch.triu(attn_mask) # Diagonal. Figure 2(c) 335 | attn_mask_pad = torch.zeros([qlen, mlen], dtype=dtype) 336 | ret = torch.cat([attn_mask_pad, mask_u - mask_dia], dim=1) # [qlen, mlen] 337 | if same_length: 338 | # [[0,1,1], 339 | # [1,0,1], 340 | # [1,1,0]] 341 | mask_l = torch.tril(attn_mask) # Lower triangular part. 
342 | ret = torch.cat([ret[:, :qlen] + mask_l - mask_dia, ret[:, qlen:]], dim=1) 343 | 344 | return ret.type(dtype=torch.float32) # [qlen, qlen] 345 | 346 | def positional_embedding(self, pos_seq, inv_freq): 347 | sinusoid_inp = torch.einsum('i,d->id', pos_seq, inv_freq) 348 | pos_emb = torch.cat([torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)], dim=-1) 349 | pos_emb = pos_emb[:, None, :] 350 | 351 | return pos_emb 352 | 353 | def _cache_mem(self, curr_out, prev_mem, mem_len, reuse_len=None): 354 | """cache hidden states into memory.""" 355 | 356 | with torch.no_grad(): 357 | if mem_len is None or mem_len == 0: 358 | return None 359 | else: 360 | if reuse_len is not None and reuse_len > 0: 361 | curr_out = curr_out[:reuse_len] 362 | 363 | if prev_mem is None: 364 | new_mem = curr_out[-mem_len:] 365 | else: 366 | new_mem = torch.cat([prev_mem, curr_out], dim=0)[-mem_len:] 367 | 368 | return new_mem 369 | 370 | 371 | def relative_positional_encoding(self, qlen, klen, d_model, clamp_len, attn_type, 372 | bi_data, bsz=None, dtype=None): 373 | """create relative positional encoding.""" 374 | 375 | freq_seq = torch.arange(0, d_model, 2.0) 376 | if dtype is not None and dtype != torch.float32: 377 | freq_seq = freq_seq.type(dtype) 378 | inv_freq = 1 / (10000 ** (freq_seq / d_model)) 379 | 380 | if attn_type == 'bi': 381 | # beg, end = klen - 1, -qlen 382 | beg, end = klen, -qlen 383 | elif attn_type == 'uni': 384 | # beg, end = klen - 1, -1 385 | beg, end = klen, -1 386 | else: 387 | raise ValueError('Unknown `attn_type` {}.'.format(attn_type)) 388 | 389 | if bi_data and bsz%2 is 0: 390 | fwd_pos_seq = torch.arange(beg, end, -1.0) 391 | bwd_pos_seq = torch.arange(-beg, -end, 1.0) 392 | 393 | if dtype is not None and dtype != torch.float32: 394 | fwd_pos_seq = fwd_pos_seq.type(dtype=dtype) 395 | bwd_pos_seq = bwd_pos_seq.type(dtype=dtype) 396 | 397 | if clamp_len > 0: 398 | fwd_pos_seq = torch.clamp(fwd_pos_seq, -clamp_len, clamp_len) 399 | bwd_pos_seq = torch.clamp(bwd_pos_seq, -clamp_len, clamp_len) 400 | 401 | fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq) 402 | bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq) 403 | 404 | pos_emb = torch.cat([fwd_pos_emb, bwd_pos_emb], dim=1) 405 | else: 406 | fwd_pos_seq = torch.arange(beg, end, -1.0) 407 | if dtype is not None and dtype != torch.float32: 408 | fwd_pos_seq = fwd_pos_seq.type(dtype=dtype) 409 | if clamp_len > 0: 410 | fwd_pos_seq = torch.clamp(fwd_pos_seq, -clamp_len, clamp_len) 411 | pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq) 412 | 413 | return pos_emb 414 | 415 | def forward(self, inp_k, seg_id, input_mask, mems, perm_mask, target_mapping, inp_q): 416 | new_mems = [] 417 | 418 | bsz = inp_k.shape[1] 419 | qlen = inp_k.shape[0] 420 | mlen = mems[0].size(0) if mems is not None else 0 421 | klen = mlen + qlen 422 | 423 | ##### Attention mask 424 | # causal attention mask 425 | if self.attn_type == 'uni': 426 | attn_mask = self._create_mask(qlen, mlen, torch.int64, self.same_length) 427 | attn_mask = attn_mask[:, :, None, None] 428 | elif self.attn_type == 'bi': 429 | attn_mask = None 430 | else: 431 | raise ValueError('Unsupported attention type: {}'.format(self.attn_type)) 432 | 433 | # data mask: input mask & perm mask 434 | if input_mask is not None and perm_mask is not None: 435 | data_mask = input_mask[None] + perm_mask 436 | elif input_mask is not None and perm_mask is None: 437 | data_mask = input_mask[None] 438 | elif input_mask is None and perm_mask is not None: 439 | data_mask = perm_mask 
440 | else: 441 | data_mask = None 442 | 443 | if data_mask is not None: 444 | # all mems can be attended to 445 | mems_mask = torch.zeros([data_mask.shape[0], mlen, bsz], 446 | dtype=torch.float32) 447 | data_mask = torch.cat([mems_mask, data_mask], dim=1) 448 | if attn_mask is None: 449 | attn_mask = data_mask[:, :, :, None] 450 | else: 451 | attn_mask += data_mask[:, :, :, None] 452 | 453 | if attn_mask is not None: 454 | attn_mask = attn_mask.gt(0).type(torch.float32) 455 | 456 | if attn_mask is not None: 457 | non_tgt_mask = -torch.eye(qlen, dtype=torch.float32) # [qlen, qlen] 458 | non_tgt_mask = torch.cat([torch.zeros([qlen, mlen], dtype=torch.float32), # [qlen, klen] 459 | non_tgt_mask], 460 | dim=-1) 461 | non_tgt_mask = (attn_mask + 462 | non_tgt_mask[:, :, None, None]).gt(0).type(dtype=torch.float32) 463 | else: 464 | non_tgt_mask = None 465 | 466 | ##### Word embedding 467 | lookup_table = self.embedding 468 | word_emb_k = lookup_table(inp_k) 469 | 470 | if inp_q is not None: 471 | if target_mapping is not None: 472 | word_emb_q = self.mask_emb.repeat(target_mapping.shape[0], bsz, 1) 473 | else: 474 | inp_q_ext = inp_q[:, :, None] 475 | word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k 476 | 477 | #### Figure 2(a), Content Stream(Original Attention), h^(0)_t = e(x_i) = e(inp_k) 478 | output_h = self.Dropout(word_emb_k) 479 | if inp_q is not None: 480 | #### Query Stream, g^(0)_t = w 481 | #### the first layer query stream is initialized with a trainable vector 482 | output_g = self.Dropout(word_emb_q) 483 | 484 | ##### Segment embedding 485 | # paper 486 | # Given a pair of positions i and j in the sequence, if 487 | # i and j are from the same segment 488 | if seg_id is not None: 489 | # Convert `seg_id` to one-hot `seg_mat` 490 | mem_pad = torch.zeros([mlen, bsz], dtype=torch.int32) 491 | cat_ids = torch.cat([mem_pad, seg_id], dim=0) 492 | 493 | # `1` indicates not in the same segment [qlen x klen x bsz] 494 | seg_mat = (~torch.eq(seg_id[:, None], cat_ids[None, :])).type(torch.long) 495 | seg_mat = torch.eye(2, dtype=torch.float32)[seg_mat] 496 | else: 497 | seg_mat = None 498 | 499 | ##### Positional encoding 500 | pos_emb = self.relative_positional_encoding( 501 | qlen, klen, self.d_model, self.clamp_len, self.attn_type, self.bi_data, 502 | bsz=bsz, dtype=torch.float32) 503 | pos_emb = self.Dropout(pos_emb) 504 | 505 | ##### Attention layers 506 | if mems is None: 507 | mems = [None] * self.n_layer 508 | 509 | for i in range(self.n_layer): 510 | # cache new mems 511 | new_mems.append(self._cache_mem(output_h, mems[i], self.mem_len, self.reuse_len)) 512 | 513 | # segment bias 514 | if seg_id is None: 515 | r_s_bias_i = None 516 | seg_embed_i = None 517 | else: 518 | r_s_bias_i = self.r_s_bias[i] 519 | seg_embed_i = self.seg_embed[i] 520 | 521 | if inp_q is not None: 522 | output_h, output_g = self.two_stream_rel_attn( 523 | h=output_h, 524 | g=output_g, 525 | r=pos_emb, 526 | r_w_bias= self.r_w_bias[i], 527 | r_r_bias= self.r_r_bias[i], 528 | seg_mat=seg_mat, 529 | r_s_bias=r_s_bias_i, 530 | seg_embed=seg_embed_i, 531 | attn_mask_h=non_tgt_mask, 532 | attn_mask_g=attn_mask, 533 | mems=mems[i], 534 | target_mapping=target_mapping) 535 | else: 536 | output_h = self.rel_multihead_attn( 537 | h=output_h, 538 | r=pos_emb, 539 | r_w_bias=self.r_w_bias[i], 540 | r_r_bias=self.r_r_bias[i], 541 | seg_mat=seg_mat, 542 | r_s_bias=r_s_bias_i, 543 | seg_embed=seg_embed_i, 544 | attn_mask=non_tgt_mask, 545 | mems=mems[i]) 546 | 547 | if inp_q is not None: 548 | 
output_g = self.positionwise_ffn(inp=output_g) 549 | 550 | output_h = self.positionwise_ffn(inp=output_h) 551 | 552 | if inp_q is not None: 553 | output = self.Dropout(output_g) 554 | else: 555 | output = self.Dropout(output_h) 556 | 557 | logits = torch.einsum('ibd,nd->ibn', output, lookup_table.weight) + self.softmax_b 558 | 559 | return logits, new_mems --------------------------------------------------------------------------------
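
The XLNet class docstring above documents the tensor shapes that forward() expects: [len, bsz] token and segment ids, a [len, len, bsz] permutation mask, a [num_predict, len, bsz] target mapping, and a [len, bsz] query-stream indicator. The snippet below is a minimal sketch, not part of the repository; it reuses the model hyperparameters from main.py (with a toy reuse_len and mem_len) and invents small sizes (seq_len 16, num_predict 4, a 100-token vocabulary) purely to push dummy tensors through the model and confirm the logit shape.

import torch
import xlnet

# Toy sizes, chosen only so the example runs quickly (assumed, not from the repo).
seq_len, bsz, num_predict, vocab = 16, 1, 4, 100

# Same model hyperparameters as main.py; d_inner is kept equal to d_model because
# positionwise_ffn adds its output back onto the input before the layer norm.
model = xlnet.XLNet(n_token=vocab, n_layer=6, n_head=4, d_head=8,
                    d_inner=32, d_model=32, dropout=0.1, dropatt=0.1,
                    attn_type="bi", bi_data=False, clamp_len=-1,
                    same_length=False, reuse_len=8, mem_len=8)

inp_k = torch.randint(0, vocab, (seq_len, bsz))           # [len, bsz] token ids
seg_id = torch.zeros(seq_len, bsz, dtype=torch.int32)     # one segment everywhere
perm_mask = torch.zeros(seq_len, seq_len, bsz)            # 0 = position i may attend to j
target_mapping = torch.eye(seq_len)[:num_predict].unsqueeze(-1)  # predict the first 4 positions
inp_q = torch.zeros(seq_len, bsz)
inp_q[:num_predict] = 1.0                                 # 1 = position carries a loss

logits, new_mems = model(inp_k=inp_k, seg_id=seg_id, input_mask=None,
                         mems=None, perm_mask=perm_mask,
                         target_mapping=target_mapping, inp_q=inp_q)
print(logits.shape)   # torch.Size([4, 1, 100]) = [num_predict, bsz, n_token]

For training on the bundled data.txt, main.py wires this same forward call together with data_utils._create_data and data_utils.make_permute; with the argparse defaults shown in its source it is launched as: python main.py --data data.txt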