├── .gitignore
├── LICENSE
├── README.md
├── XLNet.ipynb
├── data.txt
├── data_utils.py
├── images
│   ├── AEmodel.png
│   ├── ARmodel.png
│   ├── PLM.png
│   ├── ParPrediction.png
│   ├── hyperparameters.png
│   ├── target-aware.png
│   └── twoattn.png
├── main.py
└── xlnet.py
/.gitignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graykode/xlnet-Pytorch/cb793a1c75bdc59e3360f04ec641af726719811f/.gitignore
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2019 Tae Hwan Jung
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## XLNet-Pytorch [arxiv:1906.08237](https://arxiv.org/pdf/1906.08237.pdf)
2 |
 3 | **A simple XLNet implementation with a PyTorch wrapper!**
4 |
 5 | #### See how the XLNet architecture works in pre-training with a small batch size (=1) example.
6 |
 7 | #### Usage
8 |
9 | ```shell
10 | $ git clone https://github.com/graykode/xlnet-Pytorch && cd xlnet-Pytorch
11 |
12 | # To use the subword tokenizer (pretrained BERT tokenizer)
13 | $ pip install pytorch_pretrained_bert
14 |
15 | $ python main.py --data ./data.txt --tokenizer bert-base-uncased \
16 | --seq_len 512 --reuse_len 256 --perm_size 256 \
17 | --bi_data True --mask_alpha 6 --mask_beta 1 \
18 | --num_predict 85 --mem_len 384 --num_epoch 100
19 | ```
20 |
21 | You can also run the code easily in [Google Colab](https://colab.research.google.com/github/graykode/xlnet-Pytorch/blob/master/XLNet.ipynb).
22 |
23 | - Hyperparameters for pretraining, as reported in the paper:
24 | ![Hyperparameters for pretraining](images/hyperparameters.png)
25 |
26 | #### Options
27 |
28 | - `--data` (String) : `.txt` file to train on. Multi-line text is fine; one file becomes one batch tensor. Default : `data.txt`
29 | - `--tokenizer` (String) : The [huggingface/pytorch-pretrained-BERT tokenizer](https://github.com/huggingface/pytorch-pretrained-BERT) is used as the subword tokenizer (SentencePiece support is planned). You can choose from `bert-base-uncased`, `bert-large-uncased`, `bert-base-cased`, `bert-large-cased`. Default : `bert-base-uncased`
30 | - `--seq_len` (Integer) : Sequence length. Default : `512`
31 | - `--reuse_len` (Integer) : Number of tokens that can be reused as memory. Typically half of `seq_len`. Default : `256`
32 | - `--perm_size` (Integer) : Length of the longest permutation. Typically set equal to `reuse_len`. Default : `256`
33 |
34 | - `--bi_data` (Boolean) : Whether to create bidirectional data. If `bi_data` is `True`, `bsz` (batch size) should be an even number. Default : `False`
35 | - `--mask_alpha` (Integer) : How many tokens form a group. Default : `6`
36 | - `--mask_beta` (Integer) : How many tokens to mask within each group. Default : `1`
37 | - `--num_predict` (Integer) : Number of tokens to predict. In the paper, this corresponds to partial prediction. Default : `85`
38 | - `--mem_len` (Integer) : Number of steps to cache in the Transformer-XL architecture. Default : `384`
39 | - `--num_epoch` (Integer) : Number of epochs. Default : `100`
40 |
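If you want to see how the options above map onto a command-line parser, here is a minimal, hypothetical `argparse` sketch that mirrors the defaults listed; the repository's actual `main.py` may differ in details.

```python
# Hypothetical sketch of the command-line options above (not the repo's exact main.py).
import argparse

def str2bool(v: str) -> bool:
    # argparse does not convert "False" to a boolean, so parse the string explicitly
    return str(v).lower() in ("true", "1", "yes")

parser = argparse.ArgumentParser(description="XLNet pre-training (illustrative)")
parser.add_argument("--data", type=str, default="data.txt")
parser.add_argument("--tokenizer", type=str, default="bert-base-uncased",
                    choices=["bert-base-uncased", "bert-large-uncased",
                             "bert-base-cased", "bert-large-cased"])
parser.add_argument("--seq_len", type=int, default=512)
parser.add_argument("--reuse_len", type=int, default=256)
parser.add_argument("--perm_size", type=int, default=256)
parser.add_argument("--bi_data", type=str2bool, default=False)
parser.add_argument("--mask_alpha", type=int, default=6)
parser.add_argument("--mask_beta", type=int, default=1)
parser.add_argument("--num_predict", type=int, default=85)
parser.add_argument("--mem_len", type=int, default=384)
parser.add_argument("--num_epoch", type=int, default=100)
args = parser.parse_args()
```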
41 |
42 |
43 | ## What is XLNet?
44 |
45 | **XLNet** is a new unsupervised language representation learning method based on a novel generalized permutation language modeling objective. Additionally, XLNet employs [Transformer-XL](https://arxiv.org/abs/1901.02860) as the backbone model, exhibiting excellent performance for language tasks involving long context.
46 |
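The `--mem_len` option above corresponds to Transformer-XL's segment-level recurrence: hidden states from the previous segment are cached and reused as extra context. Below is a minimal, hypothetical sketch of that caching step (the function name and shapes are illustrative; see `xlnet.py` for the repository's actual implementation).

```python
from typing import Optional
import torch

def cache_mem(curr_out: torch.Tensor, prev_mem: Optional[torch.Tensor], mem_len: int) -> torch.Tensor:
    """Hypothetical Transformer-XL memory update: keep the last `mem_len` hidden
    states of [previous memory; current segment output], shape [seq_len, batch, hidden]."""
    with torch.no_grad():
        if prev_mem is None:
            new_mem = curr_out
        else:
            new_mem = torch.cat([prev_mem, curr_out], dim=0)  # concat along the sequence axis
        # detach so gradients never flow back into past segments
        return new_mem[-mem_len:].detach()
```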
47 | - [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237)
48 | - [Paper Author's XLNet Github](https://github.com/zihangdai/xlnet)
49 |
50 | | Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B |
51 | | ----- | -------- | -------- | -------- | -------- | -------- | -------- | -------- | -------- |
52 | | BERT | 86.6 | 92.3 | 91.3 | 70.4 | 93.2 | 88.0 | 60.6 | 90.0 |
53 | | XLNet | **89.8** | **93.9** | **91.8** | **83.8** | **95.6** | **89.2** | **63.6** | **91.8** |
54 |
55 |
56 |
57 | ### Keyword in XLNet
58 |
59 | 1. How does XLNet benefit from both auto-regressive (AR) and auto-encoding (AE) models?
60 |
61 | - Auto-Regression Model
62 | ![Auto-Regression Model](images/ARmodel.png)
63 | - Auto-Encoding Model
64 | ![Auto-Encoding Model](images/AEmodel.png)
65 |
66 | 2. Permutation Language Modeling with Partial Prediction
67 | - Permutation Language Modeling
68 | ![Permutation Language Modeling](images/PLM.png)
69 |
70 | - Partial Prediction
71 | ![Partial Prediction](images/ParPrediction.png)
72 |
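For reference (written in the paper's notation rather than this repository's code), permutation language modeling maximizes the expected log-likelihood over all factorization orders z of a length-T sequence, and partial prediction only predicts the tokens after a cutting point c. In this repo, the `--mask_alpha`, `--mask_beta`, and `--num_predict` options control roughly how many tokens fall into that predicted tail.

```latex
% Permutation language modeling objective
\max_{\theta}\;\mathbb{E}_{\mathbf{z}\sim\mathcal{Z}_T}
  \left[\sum_{t=1}^{T}\log p_{\theta}\!\left(x_{z_t}\mid\mathbf{x}_{\mathbf{z}_{<t}}\right)\right]

% Partial prediction: only the tokens after the cutting point c are predicted
\max_{\theta}\;\mathbb{E}_{\mathbf{z}\sim\mathcal{Z}_T}
  \left[\sum_{t=c+1}^{|\mathbf{z}|}\log p_{\theta}\!\left(x_{z_t}\mid\mathbf{x}_{\mathbf{z}_{<t}}\right)\right]
```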
73 | 3. Two-Stream Self-Attention with Target-Aware Representation
74 |
75 | - Two-Stream Self-Attention
76 |
77 | ![Two-Stream Self-Attention](images/twoattn.png)
78 |
79 | - Target-Aware Representation
80 |
81 | ![Target-Aware Representation](images/target-aware.png)
82 |
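In the paper's notation (again, not code from this repository), the target-aware distribution conditions on the target position z_t through a query representation g, and the two streams are updated as follows:

```latex
% Target-aware prediction distribution
p_{\theta}\!\left(X_{z_t}=x\mid\mathbf{x}_{\mathbf{z}_{<t}}\right)
  =\frac{\exp\!\left(e(x)^{\top} g_{\theta}(\mathbf{x}_{\mathbf{z}_{<t}}, z_t)\right)}
        {\sum_{x'}\exp\!\left(e(x')^{\top} g_{\theta}(\mathbf{x}_{\mathbf{z}_{<t}}, z_t)\right)}

% Query stream: uses the position z_t but not the content x_{z_t}
g_{z_t}^{(m)} \leftarrow \mathrm{Attention}\!\left(Q=g_{z_t}^{(m-1)},\; KV=\mathbf{h}_{\mathbf{z}_{<t}}^{(m-1)};\,\theta\right)

% Content stream: standard self-attention that can also see x_{z_t}
h_{z_t}^{(m)} \leftarrow \mathrm{Attention}\!\left(Q=h_{z_t}^{(m-1)},\; KV=\mathbf{h}_{\mathbf{z}_{\le t}}^{(m-1)};\,\theta\right)
```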
83 |
84 |
85 | ## Author
86 |
87 | - Because the original repository is under the **Apache 2.0 license**, this repository is distributed under the same license.
88 | - Tae Hwan Jung (Jeff Jung) @graykode, Kyung Hee Univ. CE (Undergraduate).
89 | - Author Email : [nlkey2022@gmail.com](mailto:nlkey2022@gmail.com)
--------------------------------------------------------------------------------
/XLNet.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "XLNet.ipynb",
7 | "version": "0.3.2",
8 | "provenance": [],
9 | "collapsed_sections": []
10 | },
11 | "kernelspec": {
12 | "name": "python3",
13 | "display_name": "Python 3"
14 | },
15 | "accelerator": "GPU"
16 | },
17 | "cells": [
18 | {
19 | "cell_type": "code",
20 | "metadata": {
21 | "id": "ayaYxaMPq5RF",
22 | "colab_type": "code",
23 | "outputId": "7ece05c3-51fc-43de-bfae-22d47cc97a10",
24 | "colab": {
25 | "base_uri": "https://localhost:8080/",
26 | "height": 119
27 | }
28 | },
29 | "source": [
30 | "!git clone https://github.com/graykode/xlnet-Pytorch"
31 | ],
32 | "execution_count": 5,
33 | "outputs": [
34 | {
35 | "output_type": "stream",
36 | "text": [
37 | "Cloning into 'xlnet-Pytorch'...\n",
38 | "remote: Enumerating objects: 32, done.\u001b[K\n",
39 | "remote: Counting objects: 100% (32/32), done.\u001b[K\n",
40 | "remote: Compressing objects: 100% (20/20), done.\u001b[K\n",
41 | "remote: Total 32 (delta 16), reused 27 (delta 11), pack-reused 0\n",
42 | "Unpacking objects: 100% (32/32), done.\n"
43 | ],
44 | "name": "stdout"
45 | }
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "metadata": {
51 | "id": "iP2YYduXrFWb",
52 | "colab_type": "code",
53 | "outputId": "fe3851e1-e1cb-4913-eb8b-02724538ffc6",
54 | "colab": {
55 | "base_uri": "https://localhost:8080/",
56 | "height": 34
57 | }
58 | },
59 | "source": [
60 | "%cd xlnet-Pytorch"
61 | ],
62 | "execution_count": 6,
63 | "outputs": [
64 | {
65 | "output_type": "stream",
66 | "text": [
67 | "/content/xlnet-Pytorch/xlnet-Pytorch\n"
68 | ],
69 | "name": "stdout"
70 | }
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "metadata": {
76 | "id": "ijR047EprRIH",
77 | "colab_type": "code",
78 | "outputId": "ed80eb1e-f2f7-4035-d7b4-6dae19dced9a",
79 | "colab": {
80 | "base_uri": "https://localhost:8080/",
81 | "height": 326
82 | }
83 | },
84 | "source": [
85 | "!pip install pytorch_pretrained_bert"
86 | ],
87 | "execution_count": 7,
88 | "outputs": [
89 | {
90 | "output_type": "stream",
91 | "text": [
92 | "Requirement already satisfied: pytorch_pretrained_bert in /usr/local/lib/python3.6/dist-packages (0.6.2)\n",
93 | "Requirement already satisfied: torch>=0.4.1 in /usr/local/lib/python3.6/dist-packages (from pytorch_pretrained_bert) (1.1.0)\n",
94 | "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from pytorch_pretrained_bert) (2.21.0)\n",
95 | "Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from pytorch_pretrained_bert) (4.28.1)\n",
96 | "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from pytorch_pretrained_bert) (1.16.4)\n",
97 | "Requirement already satisfied: regex in /usr/local/lib/python3.6/dist-packages (from pytorch_pretrained_bert) (2019.6.8)\n",
98 | "Requirement already satisfied: boto3 in /usr/local/lib/python3.6/dist-packages (from pytorch_pretrained_bert) (1.9.167)\n",
99 | "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->pytorch_pretrained_bert) (1.24.3)\n",
100 | "Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->pytorch_pretrained_bert) (2.8)\n",
101 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->pytorch_pretrained_bert) (2019.3.9)\n",
102 | "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->pytorch_pretrained_bert) (3.0.4)\n",
103 | "Requirement already satisfied: s3transfer<0.3.0,>=0.2.0 in /usr/local/lib/python3.6/dist-packages (from boto3->pytorch_pretrained_bert) (0.2.1)\n",
104 | "Requirement already satisfied: botocore<1.13.0,>=1.12.167 in /usr/local/lib/python3.6/dist-packages (from boto3->pytorch_pretrained_bert) (1.12.167)\n",
105 | "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.6/dist-packages (from boto3->pytorch_pretrained_bert) (0.9.4)\n",
106 | "Requirement already satisfied: docutils>=0.10 in /usr/local/lib/python3.6/dist-packages (from botocore<1.13.0,>=1.12.167->boto3->pytorch_pretrained_bert) (0.14)\n",
107 | "Requirement already satisfied: python-dateutil<3.0.0,>=2.1; python_version >= \"2.7\" in /usr/local/lib/python3.6/dist-packages (from botocore<1.13.0,>=1.12.167->boto3->pytorch_pretrained_bert) (2.5.3)\n",
108 | "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.6/dist-packages (from python-dateutil<3.0.0,>=2.1; python_version >= \"2.7\"->botocore<1.13.0,>=1.12.167->boto3->pytorch_pretrained_bert) (1.12.0)\n"
109 | ],
110 | "name": "stdout"
111 | }
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "metadata": {
117 | "id": "BkV6fPyArNbN",
118 | "colab_type": "code",
119 | "outputId": "6c84ac20-3e16-4abc-c9ab-0d5d540d090a",
120 | "colab": {
121 | "base_uri": "https://localhost:8080/",
122 | "height": 1000
123 | }
124 | },
125 | "source": [
126 | "!python main.py"
127 | ],
128 | "execution_count": 4,
129 | "outputs": [
130 | {
131 | "output_type": "stream",
132 | "text": [
133 | "100% 231508/231508 [00:00<00:00, 418264.00B/s]\n",
134 | "Number of Step: 0001 cost = 26.299185\n",
135 | "Number of Step: 0001 cost = 25.861174\n",
136 | "Number of Step: 0001 cost = 26.162863\n",
137 | "Number of Step: 0001 cost = 26.079767\n",
138 | "Number of Step: 0001 cost = 25.013128\n",
139 | "Number of Step: 0001 cost = 25.666861\n",
140 | "Number of Step: 0001 cost = 25.705088\n",
141 | "Number of Step: 0001 cost = 24.955669\n",
142 | "Number of Step: 0002 cost = 24.116352\n",
143 | "Number of Step: 0002 cost = 25.042078\n",
144 | "Number of Step: 0002 cost = 24.091347\n",
145 | "Number of Step: 0002 cost = 25.537655\n",
146 | "Number of Step: 0002 cost = 24.248133\n",
147 | "Number of Step: 0002 cost = 25.824280\n",
148 | "Number of Step: 0002 cost = 24.787397\n",
149 | "Number of Step: 0002 cost = 26.053616\n",
150 | "Number of Step: 0003 cost = 25.367481\n",
151 | "Number of Step: 0003 cost = 25.543600\n",
152 | "Number of Step: 0003 cost = 25.891920\n",
153 | "Number of Step: 0003 cost = 24.980560\n",
154 | "Number of Step: 0003 cost = 25.297611\n",
155 | "Number of Step: 0003 cost = 25.850960\n",
156 | "Number of Step: 0003 cost = 25.115484\n",
157 | "Number of Step: 0003 cost = 23.884501\n",
158 | "Number of Step: 0004 cost = 24.838873\n",
159 | "Number of Step: 0004 cost = 25.575075\n",
160 | "Number of Step: 0004 cost = 26.392899\n",
161 | "Number of Step: 0004 cost = 24.445896\n",
162 | "Number of Step: 0004 cost = 24.398573\n",
163 | "Number of Step: 0004 cost = 25.106733\n",
164 | "Number of Step: 0004 cost = 26.140654\n",
165 | "Number of Step: 0004 cost = 25.236158\n",
166 | "Number of Step: 0005 cost = 24.996521\n",
167 | "Number of Step: 0005 cost = 25.761152\n",
168 | "Number of Step: 0005 cost = 25.200850\n",
169 | "Number of Step: 0005 cost = 23.871277\n",
170 | "Number of Step: 0005 cost = 24.210709\n",
171 | "Number of Step: 0005 cost = 23.889694\n",
172 | "Number of Step: 0005 cost = 24.945341\n",
173 | "Number of Step: 0005 cost = 25.475309\n",
174 | "Number of Step: 0006 cost = 25.655682\n",
175 | "Number of Step: 0006 cost = 24.298883\n",
176 | "Number of Step: 0006 cost = 25.119278\n",
177 | "Number of Step: 0006 cost = 25.098862\n",
178 | "Number of Step: 0006 cost = 25.151922\n",
179 | "Number of Step: 0006 cost = 25.285501\n",
180 | "Number of Step: 0006 cost = 24.107182\n",
181 | "Number of Step: 0006 cost = 23.748384\n",
182 | "Number of Step: 0007 cost = 24.113546\n",
183 | "Number of Step: 0007 cost = 25.006632\n",
184 | "Number of Step: 0007 cost = 24.332354\n",
185 | "Number of Step: 0007 cost = 25.120981\n",
186 | "Number of Step: 0007 cost = 24.404642\n",
187 | "Number of Step: 0007 cost = 24.650999\n",
188 | "Number of Step: 0007 cost = 24.360918\n",
189 | "Number of Step: 0007 cost = 23.753399\n",
190 | "Number of Step: 0008 cost = 24.496143\n",
191 | "Number of Step: 0008 cost = 24.459608\n",
192 | "Number of Step: 0008 cost = 24.346823\n",
193 | "Number of Step: 0008 cost = 25.390636\n",
194 | "Number of Step: 0008 cost = 25.017641\n",
195 | "Number of Step: 0008 cost = 24.501677\n",
196 | "Number of Step: 0008 cost = 23.598795\n",
197 | "Number of Step: 0008 cost = 24.301554\n",
198 | "Number of Step: 0009 cost = 25.410679\n",
199 | "Number of Step: 0009 cost = 24.467159\n",
200 | "Number of Step: 0009 cost = 24.054974\n",
201 | "Number of Step: 0009 cost = 23.793539\n",
202 | "Number of Step: 0009 cost = 23.268635\n",
203 | "Number of Step: 0009 cost = 24.078022\n",
204 | "Number of Step: 0009 cost = 24.316879\n",
205 | "Number of Step: 0009 cost = 23.515970\n",
206 | "Number of Step: 0010 cost = 24.067686\n",
207 | "Number of Step: 0010 cost = 24.360920\n",
208 | "Number of Step: 0010 cost = 25.631681\n",
209 | "Number of Step: 0010 cost = 23.218294\n",
210 | "Number of Step: 0010 cost = 24.856960\n",
211 | "Number of Step: 0010 cost = 23.544510\n",
212 | "Number of Step: 0010 cost = 23.410534\n",
213 | "Number of Step: 0010 cost = 23.849169\n",
214 | "Number of Step: 0011 cost = 24.627134\n",
215 | "Number of Step: 0011 cost = 25.954533\n",
216 | "Number of Step: 0011 cost = 24.426609\n",
217 | "Number of Step: 0011 cost = 25.764784\n",
218 | "Number of Step: 0011 cost = 25.585995\n",
219 | "Number of Step: 0011 cost = 25.143883\n",
220 | "Number of Step: 0011 cost = 22.926973\n",
221 | "Number of Step: 0011 cost = 24.910206\n",
222 | "Number of Step: 0012 cost = 23.564455\n",
223 | "Number of Step: 0012 cost = 24.482298\n",
224 | "Number of Step: 0012 cost = 24.107456\n",
225 | "Number of Step: 0012 cost = 23.281023\n",
226 | "Number of Step: 0012 cost = 24.001156\n",
227 | "Number of Step: 0012 cost = 23.877611\n",
228 | "Number of Step: 0012 cost = 23.695135\n",
229 | "Number of Step: 0012 cost = 23.912651\n",
230 | "Number of Step: 0013 cost = 24.788204\n",
231 | "Number of Step: 0013 cost = 24.605497\n",
232 | "Number of Step: 0013 cost = 23.714304\n",
233 | "Number of Step: 0013 cost = 24.666931\n",
234 | "Number of Step: 0013 cost = 24.020756\n",
235 | "Number of Step: 0013 cost = 23.386766\n",
236 | "Number of Step: 0013 cost = 24.413082\n",
237 | "Number of Step: 0013 cost = 24.092968\n",
238 | "Number of Step: 0014 cost = 22.983570\n",
239 | "Number of Step: 0014 cost = 25.068871\n",
240 | "Number of Step: 0014 cost = 23.518705\n",
241 | "Number of Step: 0014 cost = 23.637272\n",
242 | "Number of Step: 0014 cost = 22.940498\n",
243 | "Number of Step: 0014 cost = 25.140924\n",
244 | "Number of Step: 0014 cost = 23.010714\n",
245 | "Number of Step: 0014 cost = 23.191177\n",
246 | "Number of Step: 0015 cost = 22.630165\n",
247 | "Number of Step: 0015 cost = 23.283859\n",
248 | "Number of Step: 0015 cost = 23.364052\n",
249 | "Number of Step: 0015 cost = 24.416988\n",
250 | "Number of Step: 0015 cost = 24.014668\n",
251 | "Number of Step: 0015 cost = 22.869274\n",
252 | "Number of Step: 0015 cost = 23.760340\n",
253 | "Number of Step: 0015 cost = 23.840309\n",
254 | "Number of Step: 0016 cost = 23.310936\n",
255 | "Number of Step: 0016 cost = 24.574957\n",
256 | "Number of Step: 0016 cost = 23.376127\n",
257 | "Number of Step: 0016 cost = 24.164631\n",
258 | "Number of Step: 0016 cost = 23.071663\n",
259 | "Number of Step: 0016 cost = 23.024294\n",
260 | "Number of Step: 0016 cost = 23.181185\n",
261 | "Number of Step: 0016 cost = 24.051889\n",
262 | "Number of Step: 0017 cost = 23.288946\n",
263 | "Number of Step: 0017 cost = 24.085411\n",
264 | "Number of Step: 0017 cost = 22.379274\n",
265 | "Number of Step: 0017 cost = 24.024132\n",
266 | "Number of Step: 0017 cost = 24.557764\n",
267 | "Number of Step: 0017 cost = 24.751358\n",
268 | "Number of Step: 0017 cost = 23.339399\n",
269 | "Number of Step: 0017 cost = 23.635082\n",
270 | "Number of Step: 0018 cost = 22.173685\n",
271 | "Number of Step: 0018 cost = 23.776503\n",
272 | "Number of Step: 0018 cost = 23.752548\n",
273 | "Number of Step: 0018 cost = 23.538460\n",
274 | "Number of Step: 0018 cost = 22.357494\n",
275 | "Number of Step: 0018 cost = 23.622934\n",
276 | "Number of Step: 0018 cost = 23.631004\n",
277 | "Number of Step: 0018 cost = 22.551394\n",
278 | "Number of Step: 0019 cost = 22.947685\n",
279 | "Number of Step: 0019 cost = 22.403502\n",
280 | "Number of Step: 0019 cost = 22.972301\n",
281 | "Number of Step: 0019 cost = 21.893215\n",
282 | "Number of Step: 0019 cost = 23.263187\n",
283 | "Number of Step: 0019 cost = 22.995371\n",
284 | "Number of Step: 0019 cost = 22.956581\n",
285 | "Number of Step: 0019 cost = 23.096869\n",
286 | "Number of Step: 0020 cost = 23.094641\n",
287 | "Number of Step: 0020 cost = 22.921461\n",
288 | "Number of Step: 0020 cost = 23.298218\n",
289 | "Number of Step: 0020 cost = 22.579227\n",
290 | "Number of Step: 0020 cost = 24.224125\n",
291 | "Number of Step: 0020 cost = 23.881729\n",
292 | "Number of Step: 0020 cost = 21.861792\n",
293 | "Number of Step: 0020 cost = 22.777273\n",
294 | "Number of Step: 0021 cost = 22.222363\n",
295 | "Number of Step: 0021 cost = 21.119030\n",
296 | "Number of Step: 0021 cost = 23.907051\n",
297 | "Number of Step: 0021 cost = 23.819723\n",
298 | "Number of Step: 0021 cost = 23.788166\n",
299 | "Number of Step: 0021 cost = 23.229645\n",
300 | "Number of Step: 0021 cost = 24.274410\n",
301 | "Number of Step: 0021 cost = 22.877367\n",
302 | "Number of Step: 0022 cost = 22.700832\n",
303 | "Number of Step: 0022 cost = 23.184784\n",
304 | "Number of Step: 0022 cost = 22.714603\n",
305 | "Number of Step: 0022 cost = 23.461004\n",
306 | "Number of Step: 0022 cost = 21.954988\n",
307 | "Number of Step: 0022 cost = 21.957075\n",
308 | "Number of Step: 0022 cost = 22.306690\n",
309 | "Number of Step: 0022 cost = 23.725677\n",
310 | "Number of Step: 0023 cost = 22.276127\n",
311 | "Number of Step: 0023 cost = 23.766611\n",
312 | "Number of Step: 0023 cost = 22.990093\n",
313 | "Number of Step: 0023 cost = 23.687693\n",
314 | "Number of Step: 0023 cost = 23.503782\n",
315 | "Number of Step: 0023 cost = 22.529320\n",
316 | "Number of Step: 0023 cost = 23.205400\n",
317 | "Number of Step: 0023 cost = 22.896313\n",
318 | "Number of Step: 0024 cost = 22.109776\n",
319 | "Number of Step: 0024 cost = 22.556622\n",
320 | "Number of Step: 0024 cost = 22.328550\n",
321 | "Number of Step: 0024 cost = 22.092735\n",
322 | "Number of Step: 0024 cost = 23.011541\n",
323 | "Number of Step: 0024 cost = 23.419533\n",
324 | "Number of Step: 0024 cost = 22.668753\n",
325 | "Number of Step: 0024 cost = 22.147079\n",
326 | "Number of Step: 0025 cost = 21.928825\n",
327 | "Number of Step: 0025 cost = 23.278080\n",
328 | "Number of Step: 0025 cost = 23.145954\n",
329 | "Number of Step: 0025 cost = 22.317205\n",
330 | "Number of Step: 0025 cost = 22.680893\n",
331 | "Number of Step: 0025 cost = 22.483362\n",
332 | "Number of Step: 0025 cost = 22.365522\n",
333 | "Number of Step: 0025 cost = 22.656649\n",
334 | "Number of Step: 0026 cost = 21.763489\n",
335 | "Number of Step: 0026 cost = 21.822924\n",
336 | "Number of Step: 0026 cost = 22.338774\n",
337 | "Number of Step: 0026 cost = 22.268866\n",
338 | "Number of Step: 0026 cost = 22.320282\n",
339 | "Number of Step: 0026 cost = 22.410757\n",
340 | "Number of Step: 0026 cost = 22.637495\n",
341 | "Number of Step: 0026 cost = 22.134693\n",
342 | "Number of Step: 0027 cost = 23.152620\n",
343 | "Number of Step: 0027 cost = 21.620134\n",
344 | "Number of Step: 0027 cost = 22.444149\n",
345 | "Number of Step: 0027 cost = 22.397623\n",
346 | "Number of Step: 0027 cost = 22.449764\n",
347 | "Number of Step: 0027 cost = 22.904938\n",
348 | "Number of Step: 0027 cost = 22.601612\n",
349 | "Number of Step: 0027 cost = 22.304131\n",
350 | "Number of Step: 0028 cost = 20.434067\n",
351 | "Number of Step: 0028 cost = 22.794069\n",
352 | "Number of Step: 0028 cost = 23.426819\n",
353 | "Number of Step: 0028 cost = 22.357069\n",
354 | "Number of Step: 0028 cost = 22.608589\n",
355 | "Number of Step: 0028 cost = 22.465258\n",
356 | "Number of Step: 0028 cost = 22.891714\n",
357 | "Number of Step: 0028 cost = 21.988979\n",
358 | "Number of Step: 0029 cost = 22.498970\n",
359 | "Number of Step: 0029 cost = 21.521902\n",
360 | "Number of Step: 0029 cost = 23.053669\n",
361 | "Number of Step: 0029 cost = 22.357464\n",
362 | "Number of Step: 0029 cost = 21.904669\n",
363 | "Number of Step: 0029 cost = 21.634151\n",
364 | "Number of Step: 0029 cost = 20.249266\n",
365 | "Number of Step: 0029 cost = 22.062521\n",
366 | "Number of Step: 0030 cost = 22.270859\n",
367 | "Number of Step: 0030 cost = 22.696949\n",
368 | "Number of Step: 0030 cost = 22.070248\n",
369 | "Number of Step: 0030 cost = 22.543518\n",
370 | "Number of Step: 0030 cost = 23.614525\n",
371 | "Number of Step: 0030 cost = 22.858721\n",
372 | "Number of Step: 0030 cost = 21.260269\n",
373 | "Number of Step: 0030 cost = 21.462461\n",
374 | "Number of Step: 0031 cost = 22.491512\n",
375 | "Number of Step: 0031 cost = 21.529919\n",
376 | "Number of Step: 0031 cost = 22.923733\n",
377 | "Number of Step: 0031 cost = 21.869879\n",
378 | "Number of Step: 0031 cost = 22.435644\n",
379 | "Number of Step: 0031 cost = 22.430405\n",
380 | "Number of Step: 0031 cost = 22.144842\n",
381 | "Number of Step: 0031 cost = 21.964941\n",
382 | "Number of Step: 0032 cost = 22.055660\n",
383 | "Number of Step: 0032 cost = 22.045565\n",
384 | "Number of Step: 0032 cost = 21.927069\n",
385 | "Number of Step: 0032 cost = 21.248281\n",
386 | "Number of Step: 0032 cost = 21.617807\n",
387 | "Number of Step: 0032 cost = 20.935833\n",
388 | "Number of Step: 0032 cost = 22.303843\n",
389 | "Number of Step: 0032 cost = 22.419876\n",
390 | "Number of Step: 0033 cost = 21.030119\n",
391 | "Number of Step: 0033 cost = 22.250040\n",
392 | "Number of Step: 0033 cost = 20.650230\n",
393 | "Number of Step: 0033 cost = 22.192366\n",
394 | "Number of Step: 0033 cost = 21.154890\n",
395 | "Number of Step: 0033 cost = 22.080959\n",
396 | "Number of Step: 0033 cost = 21.871065\n",
397 | "Number of Step: 0033 cost = 22.260691\n",
398 | "Number of Step: 0034 cost = 21.728571\n",
399 | "Number of Step: 0034 cost = 22.007542\n",
400 | "Number of Step: 0034 cost = 21.078880\n",
401 | "Number of Step: 0034 cost = 21.850500\n",
402 | "Number of Step: 0034 cost = 21.490084\n",
403 | "Number of Step: 0034 cost = 21.671848\n",
404 | "Number of Step: 0034 cost = 22.391680\n",
405 | "Number of Step: 0034 cost = 21.705509\n",
406 | "Number of Step: 0035 cost = 21.543411\n",
407 | "Number of Step: 0035 cost = 21.777857\n",
408 | "Number of Step: 0035 cost = 21.447199\n",
409 | "Number of Step: 0035 cost = 22.119251\n",
410 | "Number of Step: 0035 cost = 22.220009\n",
411 | "Number of Step: 0035 cost = 19.819977\n",
412 | "Number of Step: 0035 cost = 21.531845\n",
413 | "Number of Step: 0035 cost = 21.998089\n",
414 | "Number of Step: 0036 cost = 21.735443\n",
415 | "Number of Step: 0036 cost = 21.648676\n",
416 | "Number of Step: 0036 cost = 21.940975\n",
417 | "Number of Step: 0036 cost = 20.961882\n",
418 | "Number of Step: 0036 cost = 20.211729\n",
419 | "Number of Step: 0036 cost = 22.080381\n",
420 | "Number of Step: 0036 cost = 21.906378\n",
421 | "Number of Step: 0036 cost = 20.907700\n",
422 | "Number of Step: 0037 cost = 22.363924\n",
423 | "Number of Step: 0037 cost = 21.551998\n",
424 | "Number of Step: 0037 cost = 21.930584\n",
425 | "Number of Step: 0037 cost = 21.773323\n",
426 | "Number of Step: 0037 cost = 21.142616\n",
427 | "Number of Step: 0037 cost = 21.637774\n",
428 | "Number of Step: 0037 cost = 22.236561\n",
429 | "Number of Step: 0037 cost = 22.778532\n",
430 | "Number of Step: 0038 cost = 21.648951\n",
431 | "Number of Step: 0038 cost = 21.444340\n",
432 | "Number of Step: 0038 cost = 21.856207\n",
433 | "Number of Step: 0038 cost = 22.085463\n",
434 | "Number of Step: 0038 cost = 21.862345\n",
435 | "Number of Step: 0038 cost = 21.632576\n",
436 | "Number of Step: 0038 cost = 20.948652\n",
437 | "Number of Step: 0038 cost = 21.160299\n",
438 | "Number of Step: 0039 cost = 22.679569\n",
439 | "Number of Step: 0039 cost = 22.160774\n",
440 | "Number of Step: 0039 cost = 20.349392\n",
441 | "Number of Step: 0039 cost = 21.426294\n",
442 | "Number of Step: 0039 cost = 21.323292\n",
443 | "Number of Step: 0039 cost = 20.700750\n",
444 | "Number of Step: 0039 cost = 19.951761\n",
445 | "Number of Step: 0039 cost = 21.798811\n",
446 | "Number of Step: 0040 cost = 20.747545\n",
447 | "Number of Step: 0040 cost = 21.327257\n",
448 | "Number of Step: 0040 cost = 20.517118\n",
449 | "Number of Step: 0040 cost = 20.885836\n",
450 | "Number of Step: 0040 cost = 22.016651\n",
451 | "Number of Step: 0040 cost = 21.293856\n",
452 | "Number of Step: 0040 cost = 20.931459\n",
453 | "Number of Step: 0040 cost = 21.169933\n",
454 | "Number of Step: 0041 cost = 20.563751\n",
455 | "Number of Step: 0041 cost = 21.146351\n",
456 | "Number of Step: 0041 cost = 20.402386\n",
457 | "Number of Step: 0041 cost = 20.247446\n",
458 | "Number of Step: 0041 cost = 20.776196\n",
459 | "Number of Step: 0041 cost = 19.906986\n",
460 | "Number of Step: 0041 cost = 21.070517\n",
461 | "Number of Step: 0041 cost = 20.296288\n",
462 | "Number of Step: 0042 cost = 22.144644\n",
463 | "Number of Step: 0042 cost = 21.261848\n",
464 | "Number of Step: 0042 cost = 21.801069\n",
465 | "Number of Step: 0042 cost = 21.594389\n",
466 | "Number of Step: 0042 cost = 21.601309\n",
467 | "Number of Step: 0042 cost = 21.305853\n",
468 | "Number of Step: 0042 cost = 22.010830\n",
469 | "Number of Step: 0042 cost = 20.897848\n",
470 | "Number of Step: 0043 cost = 20.145580\n",
471 | "Number of Step: 0043 cost = 20.813745\n",
472 | "Number of Step: 0043 cost = 20.590549\n",
473 | "Number of Step: 0043 cost = 21.520615\n",
474 | "Number of Step: 0043 cost = 21.588079\n",
475 | "Number of Step: 0043 cost = 21.597328\n",
476 | "Number of Step: 0043 cost = 21.607746\n",
477 | "Number of Step: 0043 cost = 21.700638\n",
478 | "Number of Step: 0044 cost = 21.086859\n",
479 | "Number of Step: 0044 cost = 20.642982\n",
480 | "Number of Step: 0044 cost = 20.240429\n",
481 | "Number of Step: 0044 cost = 21.358030\n",
482 | "Number of Step: 0044 cost = 20.559532\n",
483 | "Number of Step: 0044 cost = 21.468231\n",
484 | "Number of Step: 0044 cost = 18.892157\n",
485 | "Number of Step: 0044 cost = 20.416586\n",
486 | "Number of Step: 0045 cost = 22.413452\n",
487 | "Number of Step: 0045 cost = 20.464434\n",
488 | "Number of Step: 0045 cost = 20.259840\n",
489 | "Number of Step: 0045 cost = 19.961233\n",
490 | "Number of Step: 0045 cost = 21.648184\n",
491 | "Number of Step: 0045 cost = 20.172035\n",
492 | "Number of Step: 0045 cost = 20.020864\n",
493 | "Number of Step: 0045 cost = 21.115805\n",
494 | "Number of Step: 0046 cost = 20.295105\n",
495 | "Number of Step: 0046 cost = 21.123190\n",
496 | "Number of Step: 0046 cost = 20.192799\n",
497 | "Number of Step: 0046 cost = 21.497696\n",
498 | "Number of Step: 0046 cost = 20.283506\n",
499 | "Number of Step: 0046 cost = 20.353373\n",
500 | "Number of Step: 0046 cost = 21.410826\n",
501 | "Number of Step: 0046 cost = 20.060562\n",
502 | "Number of Step: 0047 cost = 20.613169\n",
503 | "Number of Step: 0047 cost = 21.671728\n",
504 | "Number of Step: 0047 cost = 19.864946\n",
505 | "Number of Step: 0047 cost = 20.989864\n",
506 | "Number of Step: 0047 cost = 20.588034\n",
507 | "Number of Step: 0047 cost = 20.573233\n",
508 | "Number of Step: 0047 cost = 20.943060\n",
509 | "Number of Step: 0047 cost = 20.618057\n",
510 | "Number of Step: 0048 cost = 20.469118\n",
511 | "Number of Step: 0048 cost = 19.834934\n",
512 | "Number of Step: 0048 cost = 20.439869\n",
513 | "Number of Step: 0048 cost = 19.947546\n",
514 | "Number of Step: 0048 cost = 19.776983\n",
515 | "Number of Step: 0048 cost = 20.157717\n",
516 | "Number of Step: 0048 cost = 20.530584\n",
517 | "Number of Step: 0048 cost = 20.621025\n",
518 | "Number of Step: 0049 cost = 20.297810\n",
519 | "Number of Step: 0049 cost = 20.466293\n",
520 | "Number of Step: 0049 cost = 20.277691\n",
521 | "Number of Step: 0049 cost = 19.828459\n",
522 | "Number of Step: 0049 cost = 20.133368\n",
523 | "Number of Step: 0049 cost = 20.968479\n",
524 | "Number of Step: 0049 cost = 19.882719\n",
525 | "Number of Step: 0049 cost = 18.925854\n",
526 | "Number of Step: 0050 cost = 19.301132\n",
527 | "Number of Step: 0050 cost = 20.110096\n",
528 | "Number of Step: 0050 cost = 19.726845\n",
529 | "Number of Step: 0050 cost = 19.612841\n",
530 | "Number of Step: 0050 cost = 21.341433\n",
531 | "Number of Step: 0050 cost = 19.957525\n",
532 | "Number of Step: 0050 cost = 20.750641\n",
533 | "Number of Step: 0050 cost = 19.585604\n",
534 | "Number of Step: 0051 cost = 20.252506\n",
535 | "Number of Step: 0051 cost = 20.737688\n",
536 | "Number of Step: 0051 cost = 19.447847\n",
537 | "Number of Step: 0051 cost = 21.109488\n",
538 | "Number of Step: 0051 cost = 20.200754\n",
539 | "Number of Step: 0051 cost = 20.505079\n",
540 | "Number of Step: 0051 cost = 20.696692\n",
541 | "Number of Step: 0051 cost = 21.314342\n",
542 | "Number of Step: 0052 cost = 18.995667\n",
543 | "Number of Step: 0052 cost = 19.546761\n",
544 | "Number of Step: 0052 cost = 20.188692\n",
545 | "Number of Step: 0052 cost = 20.453053\n",
546 | "Number of Step: 0052 cost = 18.985550\n",
547 | "Number of Step: 0052 cost = 20.688198\n",
548 | "Number of Step: 0052 cost = 19.881287\n",
549 | "Number of Step: 0052 cost = 19.692705\n",
550 | "Number of Step: 0053 cost = 21.027081\n",
551 | "Number of Step: 0053 cost = 19.673756\n",
552 | "Number of Step: 0053 cost = 20.669489\n",
553 | "Number of Step: 0053 cost = 19.392540\n",
554 | "Number of Step: 0053 cost = 19.796257\n",
555 | "Number of Step: 0053 cost = 20.843779\n",
556 | "Number of Step: 0053 cost = 20.343000\n",
557 | "Number of Step: 0053 cost = 19.988203\n",
558 | "Number of Step: 0054 cost = 19.376358\n",
559 | "Number of Step: 0054 cost = 18.737743\n",
560 | "Number of Step: 0054 cost = 19.616598\n",
561 | "Number of Step: 0054 cost = 18.932737\n",
562 | "Number of Step: 0054 cost = 20.735094\n",
563 | "Number of Step: 0054 cost = 20.219381\n",
564 | "Number of Step: 0054 cost = 20.130972\n",
565 | "Number of Step: 0054 cost = 19.450148\n",
566 | "Number of Step: 0055 cost = 18.508867\n",
567 | "Number of Step: 0055 cost = 20.512484\n",
568 | "Number of Step: 0055 cost = 20.341122\n",
569 | "Number of Step: 0055 cost = 20.939161\n",
570 | "Number of Step: 0055 cost = 20.921871\n",
571 | "Number of Step: 0055 cost = 18.596462\n",
572 | "Number of Step: 0055 cost = 19.717844\n",
573 | "Number of Step: 0055 cost = 19.101641\n",
574 | "Number of Step: 0056 cost = 19.201128\n",
575 | "Number of Step: 0056 cost = 19.193102\n",
576 | "Number of Step: 0056 cost = 20.116600\n",
577 | "Number of Step: 0056 cost = 19.417250\n",
578 | "Number of Step: 0056 cost = 20.242432\n",
579 | "Number of Step: 0056 cost = 19.264370\n",
580 | "Number of Step: 0056 cost = 19.881472\n",
581 | "Number of Step: 0056 cost = 18.565615\n",
582 | "Number of Step: 0057 cost = 18.786987\n",
583 | "Number of Step: 0057 cost = 19.718266\n",
584 | "Number of Step: 0057 cost = 20.988651\n",
585 | "Number of Step: 0057 cost = 18.925314\n",
586 | "Number of Step: 0057 cost = 19.604288\n",
587 | "Number of Step: 0057 cost = 20.408951\n",
588 | "Number of Step: 0057 cost = 19.542969\n",
589 | "Number of Step: 0057 cost = 19.413477\n",
590 | "Number of Step: 0058 cost = 20.094835\n",
591 | "Number of Step: 0058 cost = 19.935198\n",
592 | "Number of Step: 0058 cost = 20.000544\n",
593 | "Number of Step: 0058 cost = 19.038767\n",
594 | "Number of Step: 0058 cost = 19.766483\n",
595 | "Number of Step: 0058 cost = 19.640085\n",
596 | "Number of Step: 0058 cost = 19.713690\n",
597 | "Number of Step: 0058 cost = 19.069868\n",
598 | "Number of Step: 0059 cost = 20.695675\n",
599 | "Number of Step: 0059 cost = 18.600542\n",
600 | "Number of Step: 0059 cost = 20.206831\n",
601 | "Number of Step: 0059 cost = 19.218363\n",
602 | "Number of Step: 0059 cost = 20.146311\n",
603 | "Number of Step: 0059 cost = 20.238882\n",
604 | "Number of Step: 0059 cost = 19.937920\n",
605 | "Number of Step: 0059 cost = 18.828552\n",
606 | "Number of Step: 0060 cost = 18.958557\n",
607 | "Number of Step: 0060 cost = 20.044477\n",
608 | "Number of Step: 0060 cost = 19.873934\n",
609 | "Number of Step: 0060 cost = 19.420803\n",
610 | "Number of Step: 0060 cost = 19.914637\n",
611 | "Number of Step: 0060 cost = 18.239677\n",
612 | "Number of Step: 0060 cost = 18.858553\n",
613 | "Number of Step: 0060 cost = 19.074484\n",
614 | "Number of Step: 0061 cost = 19.019659\n",
615 | "Number of Step: 0061 cost = 19.953152\n",
616 | "Number of Step: 0061 cost = 19.777113\n",
617 | "Number of Step: 0061 cost = 20.127518\n",
618 | "Number of Step: 0061 cost = 19.441587\n",
619 | "Number of Step: 0061 cost = 20.103010\n",
620 | "Number of Step: 0061 cost = 19.744200\n",
621 | "Number of Step: 0061 cost = 20.644508\n",
622 | "Number of Step: 0062 cost = 19.728378\n",
623 | "Number of Step: 0062 cost = 19.435120\n",
624 | "Number of Step: 0062 cost = 19.840559\n",
625 | "Number of Step: 0062 cost = 18.457581\n",
626 | "Number of Step: 0062 cost = 19.384420\n",
627 | "Number of Step: 0062 cost = 19.274870\n",
628 | "Number of Step: 0062 cost = 19.981462\n",
629 | "Number of Step: 0062 cost = 18.948893\n",
630 | "Number of Step: 0063 cost = 20.056086\n",
631 | "Number of Step: 0063 cost = 18.939281\n",
632 | "Number of Step: 0063 cost = 19.173927\n",
633 | "Number of Step: 0063 cost = 18.720533\n",
634 | "Number of Step: 0063 cost = 17.662331\n",
635 | "Number of Step: 0063 cost = 18.570833\n",
636 | "Number of Step: 0063 cost = 19.597286\n",
637 | "Number of Step: 0063 cost = 20.456881\n",
638 | "Number of Step: 0064 cost = 20.077738\n",
639 | "Number of Step: 0064 cost = 19.628847\n",
640 | "Number of Step: 0064 cost = 19.091299\n",
641 | "Number of Step: 0064 cost = 18.709854\n",
642 | "Number of Step: 0064 cost = 18.878860\n",
643 | "Number of Step: 0064 cost = 19.234978\n",
644 | "Number of Step: 0064 cost = 19.819929\n",
645 | "Number of Step: 0064 cost = 19.293816\n",
646 | "Number of Step: 0065 cost = 18.452431\n",
647 | "Number of Step: 0065 cost = 18.643803\n",
648 | "Number of Step: 0065 cost = 18.060612\n",
649 | "Number of Step: 0065 cost = 19.449501\n",
650 | "Number of Step: 0065 cost = 18.175749\n",
651 | "Number of Step: 0065 cost = 19.265959\n",
652 | "Number of Step: 0065 cost = 19.055420\n",
653 | "Number of Step: 0065 cost = 20.121172\n",
654 | "Number of Step: 0066 cost = 18.595167\n",
655 | "Number of Step: 0066 cost = 18.821838\n",
656 | "Number of Step: 0066 cost = 17.799852\n",
657 | "Number of Step: 0066 cost = 17.803329\n",
658 | "Number of Step: 0066 cost = 18.767082\n",
659 | "Number of Step: 0066 cost = 19.105862\n",
660 | "Number of Step: 0066 cost = 20.024843\n",
661 | "Number of Step: 0066 cost = 19.094280\n",
662 | "Number of Step: 0067 cost = 19.719334\n",
663 | "Number of Step: 0067 cost = 19.748051\n",
664 | "Number of Step: 0067 cost = 18.628956\n",
665 | "Number of Step: 0067 cost = 19.446445\n",
666 | "Number of Step: 0067 cost = 18.864361\n",
667 | "Number of Step: 0067 cost = 19.247576\n",
668 | "Number of Step: 0067 cost = 19.597681\n",
669 | "Number of Step: 0067 cost = 18.425842\n",
670 | "Number of Step: 0068 cost = 18.854548\n",
671 | "Number of Step: 0068 cost = 19.533266\n",
672 | "Number of Step: 0068 cost = 18.578531\n",
673 | "Number of Step: 0068 cost = 19.277075\n",
674 | "Number of Step: 0068 cost = 19.370052\n",
675 | "Number of Step: 0068 cost = 18.791796\n",
676 | "Number of Step: 0068 cost = 19.355459\n",
677 | "Number of Step: 0068 cost = 19.153137\n",
678 | "Number of Step: 0069 cost = 19.308121\n",
679 | "Number of Step: 0069 cost = 19.086754\n",
680 | "Number of Step: 0069 cost = 18.994562\n",
681 | "Number of Step: 0069 cost = 18.890036\n",
682 | "Number of Step: 0069 cost = 19.419216\n",
683 | "Number of Step: 0069 cost = 18.398075\n",
684 | "Number of Step: 0069 cost = 18.872486\n",
685 | "Number of Step: 0069 cost = 19.547575\n",
686 | "Number of Step: 0070 cost = 20.047623\n",
687 | "Number of Step: 0070 cost = 18.613075\n",
688 | "Number of Step: 0070 cost = 17.807789\n",
689 | "Number of Step: 0070 cost = 18.225952\n",
690 | "Number of Step: 0070 cost = 17.380775\n",
691 | "Number of Step: 0070 cost = 19.216991\n",
692 | "Number of Step: 0070 cost = 18.267038\n",
693 | "Number of Step: 0070 cost = 18.534729\n",
694 | "Number of Step: 0071 cost = 18.784967\n",
695 | "Number of Step: 0071 cost = 19.547152\n",
696 | "Number of Step: 0071 cost = 18.409006\n",
697 | "Number of Step: 0071 cost = 19.480865\n",
698 | "Number of Step: 0071 cost = 18.878609\n",
699 | "Number of Step: 0071 cost = 18.263588\n",
700 | "Number of Step: 0071 cost = 18.590086\n",
701 | "Number of Step: 0071 cost = 18.835884\n",
702 | "Number of Step: 0072 cost = 19.132475\n",
703 | "Number of Step: 0072 cost = 18.259565\n",
704 | "Number of Step: 0072 cost = 21.376810\n",
705 | "Number of Step: 0072 cost = 17.928442\n",
706 | "Number of Step: 0072 cost = 18.454258\n",
707 | "Number of Step: 0072 cost = 18.397722\n",
708 | "Number of Step: 0072 cost = 19.086702\n",
709 | "Number of Step: 0072 cost = 19.379057\n",
710 | "Number of Step: 0073 cost = 18.311708\n",
711 | "Number of Step: 0073 cost = 18.536705\n",
712 | "Number of Step: 0073 cost = 18.127934\n",
713 | "Number of Step: 0073 cost = 18.702974\n",
714 | "Number of Step: 0073 cost = 19.017019\n",
715 | "Number of Step: 0073 cost = 18.554033\n",
716 | "Number of Step: 0073 cost = 19.887222\n",
717 | "Number of Step: 0073 cost = 19.013105\n",
718 | "Number of Step: 0074 cost = 18.646910\n",
719 | "Number of Step: 0074 cost = 18.836882\n",
720 | "Number of Step: 0074 cost = 18.586220\n",
721 | "Number of Step: 0074 cost = 19.116037\n",
722 | "Number of Step: 0074 cost = 18.636166\n",
723 | "Number of Step: 0074 cost = 18.499201\n",
724 | "Number of Step: 0074 cost = 19.262115\n",
725 | "Number of Step: 0074 cost = 18.619135\n",
726 | "Number of Step: 0075 cost = 18.495358\n",
727 | "Number of Step: 0075 cost = 18.133579\n",
728 | "Number of Step: 0075 cost = 18.864721\n",
729 | "Number of Step: 0075 cost = 17.838591\n",
730 | "Number of Step: 0075 cost = 17.295948\n",
731 | "Number of Step: 0075 cost = 18.530239\n",
732 | "Number of Step: 0075 cost = 18.883099\n",
733 | "Number of Step: 0075 cost = 18.843021\n",
734 | "Number of Step: 0076 cost = 18.259241\n",
735 | "Number of Step: 0076 cost = 17.779800\n",
736 | "Number of Step: 0076 cost = 18.148663\n",
737 | "Number of Step: 0076 cost = 18.095509\n",
738 | "Number of Step: 0076 cost = 18.393051\n",
739 | "Number of Step: 0076 cost = 17.617201\n",
740 | "Number of Step: 0076 cost = 20.075979\n",
741 | "Number of Step: 0076 cost = 18.985180\n",
742 | "Number of Step: 0077 cost = 18.118486\n",
743 | "Number of Step: 0077 cost = 18.579576\n",
744 | "Number of Step: 0077 cost = 18.543633\n",
745 | "Number of Step: 0077 cost = 19.796936\n",
746 | "Number of Step: 0077 cost = 18.677710\n",
747 | "Number of Step: 0077 cost = 16.775335\n",
748 | "Number of Step: 0077 cost = 18.743027\n",
749 | "Number of Step: 0077 cost = 18.926441\n",
750 | "Number of Step: 0078 cost = 19.418146\n",
751 | "Number of Step: 0078 cost = 17.935238\n",
752 | "Number of Step: 0078 cost = 18.198158\n",
753 | "Number of Step: 0078 cost = 17.662447\n",
754 | "Number of Step: 0078 cost = 18.733952\n",
755 | "Number of Step: 0078 cost = 18.023073\n",
756 | "Number of Step: 0078 cost = 18.234447\n",
757 | "Number of Step: 0078 cost = 17.417873\n",
758 | "Number of Step: 0079 cost = 17.578777\n",
759 | "Number of Step: 0079 cost = 18.905436\n",
760 | "Number of Step: 0079 cost = 18.485249\n",
761 | "Number of Step: 0079 cost = 18.892134\n",
762 | "Number of Step: 0079 cost = 18.983599\n",
763 | "Number of Step: 0079 cost = 18.547895\n",
764 | "Number of Step: 0079 cost = 18.229612\n",
765 | "Number of Step: 0079 cost = 18.261204\n",
766 | "Number of Step: 0080 cost = 19.453548\n",
767 | "Number of Step: 0080 cost = 18.618380\n",
768 | "Number of Step: 0080 cost = 18.912674\n",
769 | "Number of Step: 0080 cost = 18.301924\n",
770 | "Number of Step: 0080 cost = 17.535427\n",
771 | "Number of Step: 0080 cost = 18.374660\n",
772 | "Number of Step: 0080 cost = 19.154884\n",
773 | "Number of Step: 0080 cost = 18.170778\n",
774 | "Number of Step: 0081 cost = 18.523289\n",
775 | "Number of Step: 0081 cost = 18.059502\n",
776 | "Number of Step: 0081 cost = 18.606636\n",
777 | "Number of Step: 0081 cost = 17.094723\n",
778 | "Number of Step: 0081 cost = 17.885996\n",
779 | "Number of Step: 0081 cost = 17.596682\n",
780 | "Number of Step: 0081 cost = 20.653311\n",
781 | "Number of Step: 0081 cost = 17.972937\n",
782 | "Number of Step: 0082 cost = 19.049471\n",
783 | "Number of Step: 0082 cost = 17.324112\n",
784 | "Number of Step: 0082 cost = 17.414450\n",
785 | "Number of Step: 0082 cost = 18.378273\n",
786 | "Number of Step: 0082 cost = 18.309870\n",
787 | "Number of Step: 0082 cost = 17.814247\n",
788 | "Number of Step: 0082 cost = 19.608297\n",
789 | "Number of Step: 0082 cost = 17.104284\n",
790 | "Number of Step: 0083 cost = 17.783604\n",
791 | "Number of Step: 0083 cost = 17.561996\n",
792 | "Number of Step: 0083 cost = 17.339228\n",
793 | "Number of Step: 0083 cost = 17.625277\n",
794 | "Number of Step: 0083 cost = 17.664207\n",
795 | "Number of Step: 0083 cost = 17.919678\n",
796 | "Number of Step: 0083 cost = 17.632580\n",
797 | "Number of Step: 0083 cost = 17.944408\n",
798 | "Number of Step: 0084 cost = 17.866018\n",
799 | "Number of Step: 0084 cost = 19.102129\n",
800 | "Number of Step: 0084 cost = 18.013046\n",
801 | "Number of Step: 0084 cost = 17.948154\n",
802 | "Number of Step: 0084 cost = 17.853260\n",
803 | "Number of Step: 0084 cost = 16.999918\n",
804 | "Number of Step: 0084 cost = 18.198866\n",
805 | "Number of Step: 0084 cost = 17.912989\n",
806 | "Number of Step: 0085 cost = 18.169626\n",
807 | "Number of Step: 0085 cost = 18.324541\n",
808 | "Number of Step: 0085 cost = 18.369005\n",
809 | "Number of Step: 0085 cost = 18.447588\n",
810 | "Number of Step: 0085 cost = 18.641312\n",
811 | "Number of Step: 0085 cost = 17.931910\n",
812 | "Number of Step: 0085 cost = 18.035240\n",
813 | "Number of Step: 0085 cost = 19.216930\n",
814 | "Number of Step: 0086 cost = 17.682716\n",
815 | "Number of Step: 0086 cost = 17.063988\n",
816 | "Number of Step: 0086 cost = 18.460468\n",
817 | "Number of Step: 0086 cost = 18.014000\n",
818 | "Number of Step: 0086 cost = 16.637476\n",
819 | "Number of Step: 0086 cost = 18.234005\n",
820 | "Number of Step: 0086 cost = 17.542858\n",
821 | "Number of Step: 0086 cost = 18.681959\n",
822 | "Number of Step: 0087 cost = 17.915825\n",
823 | "Number of Step: 0087 cost = 18.332932\n",
824 | "Number of Step: 0087 cost = 18.023544\n",
825 | "Number of Step: 0087 cost = 17.747066\n",
826 | "Number of Step: 0087 cost = 18.476677\n",
827 | "Number of Step: 0087 cost = 18.061317\n",
828 | "Number of Step: 0087 cost = 17.447443\n",
829 | "Number of Step: 0087 cost = 17.476879\n",
830 | "Number of Step: 0088 cost = 17.811800\n",
831 | "Number of Step: 0088 cost = 17.839411\n",
832 | "Number of Step: 0088 cost = 17.015297\n",
833 | "Number of Step: 0088 cost = 18.072769\n",
834 | "Number of Step: 0088 cost = 17.579687\n",
835 | "Number of Step: 0088 cost = 17.227621\n",
836 | "Number of Step: 0088 cost = 19.641462\n",
837 | "Number of Step: 0088 cost = 17.546465\n",
838 | "Number of Step: 0089 cost = 18.312948\n",
839 | "Number of Step: 0089 cost = 17.350624\n",
840 | "Number of Step: 0089 cost = 18.149803\n",
841 | "Number of Step: 0089 cost = 16.513401\n",
842 | "Number of Step: 0089 cost = 17.910816\n",
843 | "Number of Step: 0089 cost = 17.231394\n",
844 | "Number of Step: 0089 cost = 18.227589\n",
845 | "Number of Step: 0089 cost = 16.880251\n",
846 | "Number of Step: 0090 cost = 15.888194\n",
847 | "Number of Step: 0090 cost = 18.700552\n",
848 | "Number of Step: 0090 cost = 17.834127\n",
849 | "Number of Step: 0090 cost = 16.903624\n",
850 | "Number of Step: 0090 cost = 17.001898\n",
851 | "Number of Step: 0090 cost = 16.596476\n",
852 | "Number of Step: 0090 cost = 17.636972\n",
853 | "Number of Step: 0090 cost = 18.484329\n",
854 | "Number of Step: 0091 cost = 18.303387\n",
855 | "Number of Step: 0091 cost = 17.834642\n",
856 | "Number of Step: 0091 cost = 17.869686\n",
857 | "Number of Step: 0091 cost = 16.905575\n",
858 | "Number of Step: 0091 cost = 17.179218\n",
859 | "Number of Step: 0091 cost = 17.584888\n",
860 | "Number of Step: 0091 cost = 17.895836\n",
861 | "Number of Step: 0091 cost = 15.996117\n",
862 | "Number of Step: 0092 cost = 17.124043\n",
863 | "Number of Step: 0092 cost = 16.982082\n",
864 | "Number of Step: 0092 cost = 18.135927\n",
865 | "Number of Step: 0092 cost = 18.133181\n",
866 | "Number of Step: 0092 cost = 17.417768\n",
867 | "Number of Step: 0092 cost = 17.834192\n",
868 | "Number of Step: 0092 cost = 19.040405\n",
869 | "Number of Step: 0092 cost = 18.214203\n",
870 | "Number of Step: 0093 cost = 17.028940\n",
871 | "Number of Step: 0093 cost = 17.721834\n",
872 | "Number of Step: 0093 cost = 17.565159\n",
873 | "Number of Step: 0093 cost = 17.463390\n",
874 | "Number of Step: 0093 cost = 19.197693\n",
875 | "Number of Step: 0093 cost = 16.874548\n",
876 | "Number of Step: 0093 cost = 18.761587\n",
877 | "Number of Step: 0093 cost = 17.809910\n",
878 | "Number of Step: 0094 cost = 18.114273\n",
879 | "Number of Step: 0094 cost = 17.609636\n",
880 | "Number of Step: 0094 cost = 17.840187\n",
881 | "Number of Step: 0094 cost = 17.969526\n",
882 | "Number of Step: 0094 cost = 18.489599\n",
883 | "Number of Step: 0094 cost = 16.545397\n",
884 | "Number of Step: 0094 cost = 17.046900\n",
885 | "Number of Step: 0094 cost = 17.239439\n",
886 | "Number of Step: 0095 cost = 17.767996\n",
887 | "Number of Step: 0095 cost = 16.229298\n",
888 | "Number of Step: 0095 cost = 16.676342\n",
889 | "Number of Step: 0095 cost = 17.530474\n",
890 | "Number of Step: 0095 cost = 17.203407\n",
891 | "Number of Step: 0095 cost = 17.162613\n",
892 | "Number of Step: 0095 cost = 15.997195\n",
893 | "Number of Step: 0095 cost = 17.632429\n",
894 | "Number of Step: 0096 cost = 18.374067\n",
895 | "Number of Step: 0096 cost = 17.368975\n",
896 | "Number of Step: 0096 cost = 17.390326\n",
897 | "Number of Step: 0096 cost = 18.181248\n",
898 | "Number of Step: 0096 cost = 17.433788\n",
899 | "Number of Step: 0096 cost = 17.595045\n",
900 | "Number of Step: 0096 cost = 17.498888\n",
901 | "Number of Step: 0096 cost = 17.583176\n",
902 | "Number of Step: 0097 cost = 17.504669\n",
903 | "Number of Step: 0097 cost = 17.342361\n",
904 | "Number of Step: 0097 cost = 17.802631\n",
905 | "Number of Step: 0097 cost = 16.414183\n",
906 | "Number of Step: 0097 cost = 16.846893\n",
907 | "Number of Step: 0097 cost = 17.576498\n",
908 | "Number of Step: 0097 cost = 18.128235\n",
909 | "Number of Step: 0097 cost = 18.250656\n",
910 | "Number of Step: 0098 cost = 18.195713\n",
911 | "Number of Step: 0098 cost = 16.961403\n",
912 | "Number of Step: 0098 cost = 16.245911\n",
913 | "Number of Step: 0098 cost = 16.547680\n",
914 | "Number of Step: 0098 cost = 17.897562\n",
915 | "Number of Step: 0098 cost = 17.286310\n",
916 | "Number of Step: 0098 cost = 17.829388\n",
917 | "Number of Step: 0098 cost = 18.228642\n",
918 | "Number of Step: 0099 cost = 16.456144\n",
919 | "Number of Step: 0099 cost = 17.276258\n",
920 | "Number of Step: 0099 cost = 16.501991\n",
921 | "Number of Step: 0099 cost = 17.593954\n",
922 | "Number of Step: 0099 cost = 17.236393\n",
923 | "Number of Step: 0099 cost = 17.581354\n",
924 | "Number of Step: 0099 cost = 17.807911\n",
925 | "Number of Step: 0099 cost = 17.202646\n",
926 | "Number of Step: 0100 cost = 17.215481\n",
927 | "Number of Step: 0100 cost = 16.990873\n",
928 | "Number of Step: 0100 cost = 16.657173\n",
929 | "Number of Step: 0100 cost = 17.039188\n",
930 | "Number of Step: 0100 cost = 17.793745\n",
931 | "Number of Step: 0100 cost = 17.052168\n",
932 | "Number of Step: 0100 cost = 17.739040\n",
933 | "Number of Step: 0100 cost = 17.128641\n"
934 | ],
935 | "name": "stdout"
936 | }
937 | ]
938 | }
939 | ]
940 | }
--------------------------------------------------------------------------------
/data.txt:
--------------------------------------------------------------------------------
1 | But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account of the system, and expound the actual teachings of the great explorer of the truth, the master-builder of human happiness. No one rejects, dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, but because occasionally circumstances occur in which toil and pain can procure him some great pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, except to obtain some advantage from it? But who has any right to find fault with a man who chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that produces no resultant pleasure?
2 | But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account of the system, and expound the actual teachings of the great explorer of the truth, the master-builder of human happiness. No one rejects, dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, but because occasionally circumstances occur in which toil and pain can procure him some great pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, except to obtain some advantage from it? But who has any right to find fault with a man who chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that produces no resultant pleasure?
3 | But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account of the system, and expound the actual teachings of the great explorer of the truth, the master-builder of human happiness. No one rejects, dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, but because occasionally circumstances occur in which toil and pain can procure him some great pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, except to obtain some advantage from it? But who has any right to find fault with a man who chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that produces no resultant pleasure?
4 | But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account of the system, and expound the actual teachings of the great explorer of the truth, the master-builder of human happiness. No one rejects, dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, but because occasionally circumstances occur in which toil and pain can procure him some great pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, except to obtain some advantage from it? But who has any right to find fault with a man who chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that produces no resultant pleasure?
5 | But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account of the system, and expound the actual teachings of the great explorer of the truth, the master-builder of human happiness. No one rejects, dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, but because occasionally circumstances occur in which toil and pain can procure him some great pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, except to obtain some advantage from it? But who has any right to find fault with a man who chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that produces no resultant pleasure?
6 | But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account of the system, and expound the actual teachings of the great explorer of the truth, the master-builder of human happiness. No one rejects, dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, but because occasionally circumstances occur in which toil and pain can procure him some great pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, except to obtain some advantage from it? But who has any right to find fault with a man who chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that produces no resultant pleasure?
7 | But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account of the system, and expound the actual teachings of the great explorer of the truth, the master-builder of human happiness. No one rejects, dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, but because occasionally circumstances occur in which toil and pain can procure him some great pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, except to obtain some advantage from it? But who has any right to find fault with a man who chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that produces no resultant pleasure?
8 | But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account of the system, and expound the actual teachings of the great explorer of the truth, the master-builder of human happiness. No one rejects, dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, but because occasionally circumstances occur in which toil and pain can procure him some great pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, except to obtain some advantage from it? But who has any right to find fault with a man who chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that produces no resultant pleasure?
9 | But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account of the system, and expound the actual teachings of the great explorer of the truth, the master-builder of human happiness. No one rejects, dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, but because occasionally circumstances occur in which toil and pain can procure him some great pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, except to obtain some advantage from it? But who has any right to find fault with a man who chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that produces no resultant pleasure?
10 | But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account of the system, and expound the actual teachings of the great explorer of the truth, the master-builder of human happiness. No one rejects, dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, but because occasionally circumstances occur in which toil and pain can procure him some great pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, except to obtain some advantage from it? But who has any right to find fault with a man who chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that produces no resultant pleasure?
11 | But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account of the system, and expound the actual teachings of the great explorer of the truth, the master-builder of human happiness. No one rejects, dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, but because occasionally circumstances occur in which toil and pain can procure him some great pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, except to obtain some advantage from it? But who has any right to find fault with a man who chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that produces no resultant pleasure?
12 | But I must explain to you how all this mistaken idea of denouncing pleasure and praising pain was born and I will give you a complete account of the system, and expound the actual teachings of the great explorer of the truth, the master-builder of human happiness. No one rejects, dislikes, or avoids pleasure itself, because it is pleasure, but because those who do not know how to pursue pleasure rationally encounter consequences that are extremely painful. Nor again is there anyone who loves or pursues or desires to obtain pain of itself, because it is pain, but because occasionally circumstances occur in which toil and pain can procure him some great pleasure. To take a trivial example, which of us ever undertakes laborious physical exercise, except to obtain some advantage from it? But who has any right to find fault with a man who chooses to enjoy a pleasure that has no annoying consequences, or one who avoids a pain that produces no resultant pleasure?
--------------------------------------------------------------------------------
/data_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | Copyright 2019 Tae Hwan Jung
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | """
18 |
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 |
23 | import random
24 |
25 | import torch
26 | import numpy as np
27 |
28 | special_symbols = {
29 | "[UNK]" : 0,
30 | "[CLS]" : 1,
31 | "[SEP]" : 2,
32 | "[PAD]" : 3,
33 | "[MASK]" : 4,
34 | }
35 | UNK_ID = special_symbols["[UNK]"]
36 | CLS_ID = special_symbols["[CLS]"]
37 | SEP_ID = special_symbols["[SEP]"]
38 | MASK_ID = special_symbols["[MASK]"]
39 |
40 | def _split_a_and_b(data, sent_ids, begin_idx, tot_len, extend_target=False):
41 | """Split two segments from `data` starting from the index `begin_idx`."""
42 |
43 | data_len = data.shape[0]
44 | if begin_idx + tot_len >= data_len:
45 |         print("[_split_a_and_b] returns None: "
46 |               "begin_idx %d + tot_len %d >= data_len %d"
47 |               % (begin_idx, tot_len, data_len))
48 | return None
49 |
50 | end_idx = begin_idx + 1
51 | cut_points = []
52 | while end_idx < data_len:
53 | if sent_ids[end_idx] != sent_ids[end_idx - 1]:
54 | if end_idx - begin_idx >= tot_len: break
55 | cut_points.append(end_idx)
56 | end_idx += 1
57 |
58 | a_begin = begin_idx
59 | if len(cut_points) == 0 or random.random() < 0.5:
60 | # NotNext
61 | label = 0
62 | if len(cut_points) == 0:
63 | a_end = end_idx
64 | else:
65 | a_end = random.choice(cut_points)
66 |
67 | b_len = max(1, tot_len - (a_end - a_begin))
68 | # (zihang): `data_len - 1` to account for extend_target
69 | b_begin = random.randint(0, data_len - 1 - b_len)
70 | b_end = b_begin + b_len
71 | while b_begin > 0 and sent_ids[b_begin - 1] == sent_ids[b_begin]:
72 | b_begin -= 1
73 | # (zihang): `data_len - 1` to account for extend_target
74 | while b_end < data_len - 1 and sent_ids[b_end - 1] == sent_ids[b_end]:
75 | b_end += 1
76 |
77 | new_begin = a_end
78 | else:
79 | # isNext
80 | label = 1
81 | a_end = random.choice(cut_points)
82 | b_begin = a_end
83 | b_end = end_idx
84 |
85 | new_begin = b_end
86 |
87 | while a_end - a_begin + b_end - b_begin > tot_len:
88 | if a_end - a_begin > b_end - b_begin:
89 | # delete the right side only for the LM objective
90 | a_end -= 1
91 | else:
92 | b_end -= 1
93 |
94 | ret = [data[a_begin: a_end], data[b_begin: b_end], label, new_begin]
95 |
96 | if extend_target:
97 | if a_end >= data_len or b_end >= data_len:
98 |             print("[_split_a_and_b] returns None: "
99 |                   "a_end %d or b_end %d >= data_len %d"
100 |                   % (a_end, b_end, data_len))
101 | return None
102 | a_target = data[a_begin + 1: a_end + 1]
103 | b_target = data[b_begin: b_end + 1]
104 | ret.extend([a_target, b_target])
105 |
106 | return ret
107 |
108 | def _is_start_piece(piece):
109 | special_pieces = set(list('!"#$%&\"()*+,-./:;?@[\\]^_`{|}~'))
110 | piece = ''.join(piece)
111 | if (piece.startswith("▁") or piece.startswith("<")
112 | or piece in special_pieces):
113 | return True
114 | else:
115 | return False
116 |
117 | def _sample_mask(sp, seg, mask_alpha, mask_beta,
118 | reverse=False, max_gram=5, goal_num_predict=None):
119 | """Sample `goal_num_predict` tokens for partial prediction.
120 | About `mask_beta` tokens are chosen in a context of `mask_alpha` tokens."""
121 |
122 | seg_len = len(seg)
123 |     mask = np.array([False] * seg_len, dtype=bool)
124 |
125 | num_predict = 0
126 |
127 | ngrams = np.arange(1, max_gram + 1, dtype=np.int64)
128 | pvals = 1. / np.arange(1, max_gram + 1)
129 | pvals /= pvals.sum(keepdims=True)
130 |
131 | if reverse:
132 | seg = np.flip(seg, 0)
133 |
134 | cur_len = 0
135 | while cur_len < seg_len:
136 | if goal_num_predict is not None and num_predict >= goal_num_predict: break
137 |
138 | n = np.random.choice(ngrams, p=pvals)
139 | if goal_num_predict is not None:
140 | n = min(n, goal_num_predict - num_predict)
141 | ctx_size = (n * mask_alpha) // mask_beta
142 | l_ctx = np.random.choice(ctx_size)
143 | r_ctx = ctx_size - l_ctx
144 |
145 | # Find the start position of a complete token
146 | beg = cur_len + l_ctx
147 | while beg < seg_len and not _is_start_piece(sp.convert_ids_to_tokens([seg[beg].item()])):
148 | beg += 1
149 | if beg >= seg_len:
150 | break
151 |
152 | # Find the end position of the n-gram (start pos of the n+1-th gram)
153 | end = beg + 1
154 | cnt_ngram = 1
155 | while end < seg_len:
156 |             if _is_start_piece(sp.convert_ids_to_tokens([seg[end].item()])):
157 | cnt_ngram += 1
158 | if cnt_ngram > n:
159 | break
160 | end += 1
161 | if end >= seg_len:
162 | break
163 |
164 | # Update
165 | mask[beg:end] = True
166 | num_predict += end - beg
167 |
168 | cur_len = end + r_ctx
169 |
170 | while goal_num_predict is not None and num_predict < goal_num_predict:
171 | i = np.random.randint(seg_len)
172 | if not mask[i]:
173 | mask[i] = True
174 | num_predict += 1
175 |
176 | if reverse:
177 | mask = np.flip(mask, 0)
178 |
179 | return mask
180 |
181 | def _create_data(sp, input_paths, seq_len, reuse_len,
182 | bi_data, num_predict, mask_alpha, mask_beta):
183 | features = []
184 |
185 | f = open(input_paths, 'r')
186 | lines = f.readlines()
187 | input_data, sent_ids, sent_id = [], [], True
188 |
189 | for line in lines:
190 | tokens = sp.tokenize(line)
191 | cur_sent = sp.convert_tokens_to_ids(tokens)
192 | input_data.extend(cur_sent)
193 | sent_ids.extend([sent_id] * len(cur_sent))
194 | sent_id = not sent_id
195 |
196 | # shape of data : [1, 582]
197 | data = np.array([input_data], dtype=np.int64)
198 |     sent_ids = np.array([sent_ids], dtype=bool)
199 |
200 | assert reuse_len < seq_len - 3
201 |
202 | data_len = data.shape[1]
203 | sep_array = np.array([SEP_ID], dtype=np.int64)
204 | cls_array = np.array([CLS_ID], dtype=np.int64)
205 |
206 | i = 0
207 | while i + seq_len <= data_len:
208 | inp = data[0, i: i + reuse_len]
209 | tgt = data[0, i + 1: i + reuse_len + 1]
210 |
211 | results = _split_a_and_b(
212 |             data[0], # all lines of the text file, flattened into one array
213 | sent_ids[0],
214 | begin_idx=i + reuse_len,
215 | tot_len=seq_len - reuse_len - 3,
216 | extend_target=True)
217 |
218 | # unpack the results
219 | (a_data, b_data, label, _, a_target, b_target) = tuple(results)
220 |
221 | # sample ngram spans to predict
222 | reverse = bi_data
223 | if num_predict is None:
224 | num_predict_0 = num_predict_1 = None
225 | else:
226 | num_predict_1 = num_predict // 2
227 | num_predict_0 = num_predict - num_predict_1
228 |
229 | mask_0 = _sample_mask(sp, inp, mask_alpha, mask_beta, reverse=reverse,
230 | goal_num_predict=num_predict_0)
231 | mask_1 = _sample_mask(sp, np.concatenate([a_data, sep_array, b_data,
232 | sep_array, cls_array]),
233 | mask_alpha, mask_beta,
234 | reverse=reverse, goal_num_predict=num_predict_1)
235 |
236 | # concatenate data
237 | cat_data = np.concatenate([inp, a_data, sep_array, b_data,
238 | sep_array, cls_array])
239 | seg_id = ([0] * (reuse_len + a_data.shape[0]) + [0] +
240 | [1] * b_data.shape[0] + [1] + [2])
241 | assert cat_data.shape[0] == seq_len
242 | assert mask_0.shape[0] == seq_len // 2
243 | assert mask_1.shape[0] == seq_len // 2
244 |
245 | # the last two CLS's are not used, just for padding purposes
246 | tgt = np.concatenate([tgt, a_target, b_target, cls_array, cls_array])
247 | assert tgt.shape[0] == seq_len
248 |
249 | is_masked = np.concatenate([mask_0, mask_1], 0)
250 | if num_predict is not None:
251 | assert np.sum(is_masked) == num_predict
252 |
253 | feature = {
254 | "input": cat_data,
255 | "is_masked": is_masked,
256 | "target": tgt,
257 | "seg_id": seg_id,
258 | "label": [label],
259 | }
260 | features.append(feature)
261 |
262 | i += reuse_len
263 |
264 | f.close()
265 | return features
266 |
267 | def _local_perm(inputs, targets, is_masked, perm_size, seq_len):
268 | """
269 | Sample a permutation of the factorization order, and create an
270 | attention mask accordingly.
271 |
272 | Args:
273 | inputs: int64 Tensor in shape [seq_len], input ids.
274 | targets: int64 Tensor in shape [seq_len], target ids.
275 | is_masked: bool Tensor in shape [seq_len]. True means being selected
276 | for partial prediction.
277 | perm_size: the length of longest permutation. Could be set to be reuse_len.
278 | Should not be larger than reuse_len or there will be data leaks.
279 | seq_len: int, sequence length.
280 | """
281 |
282 | # Generate permutation indices
283 | index = torch.arange(seq_len, dtype=torch.int64)
284 |
285 | index = torch.reshape(index, [-1, perm_size]).t()
286 | index = index[torch.randperm(index.shape[0])]
287 | index = torch.reshape(index.t(), [-1])
288 |
289 | # `perm_mask` and `target_mask`
290 | # non-functional tokens
291 | non_func_tokens = ~(torch.eq(inputs, SEP_ID) | torch.eq(inputs, CLS_ID))
292 | non_mask_tokens = (~is_masked) & non_func_tokens
293 | masked_or_func_tokens = ~non_mask_tokens
294 |
295 |     # Set the permutation indices of non-masked (& non-functional) tokens to the
296 |     # smallest index (-1):
297 |     # (1) they can be seen by all other positions
298 |     # (2) they cannot see masked positions, so there won't be information leak
299 | smallest_index = -torch.ones([seq_len], dtype=torch.int64)
300 |
301 |     # use -1 for `non_mask_tokens` (real tokens that are neither masked nor [CLS]/[SEP]); keep the permutation index otherwise
302 | rev_index = torch.where(non_mask_tokens, smallest_index, index)
303 |
304 |     # Create `target_mask`: masked and non-functional tokens
305 | # 1: use mask as input and have loss
306 | # 0: use token (or [SEP], [CLS]) as input and do not have loss
307 | target_tokens = masked_or_func_tokens & non_func_tokens
308 | target_mask = target_tokens.type(torch.float32)
309 |
310 | # Create `perm_mask`
311 | # `target_tokens` cannot see themselves
312 | # put `rev_index` if real mask(not cls or sep) else `rev_index + 1`
313 | self_rev_index = torch.where(target_tokens, rev_index, rev_index + 1)
314 |
315 | # 1: cannot attend if i <= j and j is not non-masked (masked_or_func_tokens)
316 | # 0: can attend if i > j or j is non-masked
317 | perm_mask = (self_rev_index[:, None] <= rev_index[None, :]) & masked_or_func_tokens
318 | perm_mask = perm_mask.type(torch.float32)
319 |
320 | # new target: [next token] for LM and [curr token] (self) for PLM
321 | new_targets = torch.cat([inputs[0: 1], targets[: -1]], dim=0)
322 |
323 | # construct inputs_k
324 | inputs_k = inputs
325 |
326 | # construct inputs_q
327 | inputs_q = target_mask
328 |
329 | return perm_mask, new_targets, target_mask, inputs_k, inputs_q
330 |
331 | def make_permute(feature, reuse_len, seq_len, perm_size, num_predict):
332 |
333 | inputs = torch.LongTensor(feature.pop("input"))
334 | target = torch.LongTensor(feature.pop("target"))
335 | is_masked = torch.ByteTensor(feature.pop("is_masked"))
336 |
337 | non_reuse_len = seq_len - reuse_len
338 | assert perm_size <= reuse_len and perm_size <= non_reuse_len
339 |
340 | perm_mask_0, target_0, target_mask_0, input_k_0, input_q_0 = _local_perm(
341 | inputs[:reuse_len], # inp
342 | target[:reuse_len],
343 | is_masked[:reuse_len],
344 | perm_size,
345 | reuse_len)
346 |
347 | perm_mask_1, target_1, target_mask_1, input_k_1, input_q_1 = _local_perm(
348 |         inputs[reuse_len:], # (senA, [SEP], senB, [SEP], [CLS])
349 | target[reuse_len:],
350 | is_masked[reuse_len:],
351 | perm_size,
352 | non_reuse_len)
353 |
354 | perm_mask_0 = torch.cat([perm_mask_0, torch.ones([reuse_len, non_reuse_len])],
355 | dim=1)
356 | perm_mask_1 = torch.cat([torch.zeros([non_reuse_len, reuse_len]), perm_mask_1],
357 | dim=1)
358 |
359 | perm_mask = torch.cat([perm_mask_0, perm_mask_1], dim=0)
360 | target = torch.cat([target_0, target_1], dim=0)
361 | target_mask = torch.cat([target_mask_0, target_mask_1], dim=0)
362 | input_k = torch.cat([input_k_0, input_k_1], dim=0)
363 | input_q = torch.cat([input_q_0, input_q_1], dim=0)
364 |
365 | if num_predict is not None:
366 | indices = torch.arange(seq_len, dtype=torch.int64)
367 | bool_target_mask = target_mask.byte()
368 | indices = indices[bool_target_mask]
369 |
370 | ##### extra padding due to CLS/SEP introduced after prepro
371 | actual_num_predict = indices.shape[0]
372 | pad_len = num_predict - actual_num_predict
373 |
374 | assert seq_len >= actual_num_predict
375 |
376 | ##### target_mapping
377 | target_mapping = torch.eye(seq_len, dtype=torch.float32)[indices]
378 | paddings = torch.zeros([pad_len, seq_len], dtype=target_mapping.dtype)
379 | target_mapping = torch.cat([target_mapping, paddings], dim=0)
380 | feature["target_mapping"] = torch.reshape(target_mapping,
381 | [num_predict, seq_len])
382 | ##### target
383 | target = target[bool_target_mask]
384 | paddings = torch.zeros([pad_len], dtype=target.dtype)
385 | target = torch.cat([target, paddings], dim=0)
386 | feature["target"] = torch.reshape(target, [num_predict])
387 |
388 | ##### target mask
389 | target_mask = torch.cat(
390 | [torch.ones([actual_num_predict], dtype=torch.float32),
391 | torch.zeros([pad_len], dtype=torch.float32)],
392 | dim=0)
393 | feature["target_mask"] = torch.reshape(target_mask, [num_predict])
394 | else:
395 | feature["target"] = torch.reshape(target, [seq_len])
396 | feature["target_mask"] = torch.reshape(target_mask, [seq_len])
397 |
398 | # reshape back to fixed shape
399 | feature["seg_id"] = torch.IntTensor(feature["seg_id"])
400 | feature["perm_mask"] = torch.reshape(perm_mask, [seq_len, seq_len])
401 | feature["input_k"] = torch.reshape(input_k, [seq_len])
402 | feature["input_q"] = torch.reshape(input_q, [seq_len])
403 |
404 | return feature
--------------------------------------------------------------------------------
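A minimal sketch (not part of the repository) of how `data_utils._local_perm` could be exercised on a toy 8-token sequence. The token ids, targets, and mask below are made up purely for illustration; `[SEP]` (id 2) and `[CLS]` (id 1) follow the `special_symbols` table above, and `seq_len` is chosen to be divisible by `perm_size` as the reshape in `_local_perm` requires.

import torch
import data_utils

seq_len, perm_size = 8, 4
inputs    = torch.tensor([5, 6, 7, 8, 2, 9, 10, 1], dtype=torch.int64)   # 6 ordinary tokens + [SEP] + [CLS]
targets   = torch.tensor([6, 7, 8, 2, 9, 10, 1, 5], dtype=torch.int64)   # shifted-by-one toy targets
is_masked = torch.tensor([False, True, False, True, False, False, True, False])

perm_mask, new_targets, target_mask, inp_k, inp_q = data_utils._local_perm(
    inputs, targets, is_masked, perm_size, seq_len)

print(perm_mask.shape)   # [8, 8]; perm_mask[i, j] = 1 means position i may not attend to position j
print(target_mask)       # 1.0 exactly at the three masked, non-functional positions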
/images/AEmodel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graykode/xlnet-Pytorch/cb793a1c75bdc59e3360f04ec641af726719811f/images/AEmodel.png
--------------------------------------------------------------------------------
/images/ARmodel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graykode/xlnet-Pytorch/cb793a1c75bdc59e3360f04ec641af726719811f/images/ARmodel.png
--------------------------------------------------------------------------------
/images/PLM.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graykode/xlnet-Pytorch/cb793a1c75bdc59e3360f04ec641af726719811f/images/PLM.png
--------------------------------------------------------------------------------
/images/ParPrediction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graykode/xlnet-Pytorch/cb793a1c75bdc59e3360f04ec641af726719811f/images/ParPrediction.png
--------------------------------------------------------------------------------
/images/hyperparameters.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graykode/xlnet-Pytorch/cb793a1c75bdc59e3360f04ec641af726719811f/images/hyperparameters.png
--------------------------------------------------------------------------------
/images/target-aware.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graykode/xlnet-Pytorch/cb793a1c75bdc59e3360f04ec641af726719811f/images/target-aware.png
--------------------------------------------------------------------------------
/images/twoattn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/graykode/xlnet-Pytorch/cb793a1c75bdc59e3360f04ec641af726719811f/images/twoattn.png
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | Copyright 2019 Tae Hwan Jung
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | """
18 |
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 |
23 | import data_utils
24 | import argparse
25 |
26 | import xlnet
27 | import torch
28 | import numpy as np
29 | import torch.nn as nn
30 | import torch.optim as optim
31 | from pytorch_pretrained_bert import BertTokenizer
32 |
33 | if __name__ == "__main__":
34 | parser = argparse.ArgumentParser(description='PyTorch XLNet Language Model')
35 | parser.add_argument('--data', type=str, default='data.txt')
36 | parser.add_argument('--tokenizer', type=str, default='bert-base-uncased',
37 |                         help='Pretrained tokenizer name or vocab path, loaded via pytorch-pretrained-BERT BertTokenizer')
38 | parser.add_argument('--seq_len', type=int, default=512, help="Sequence length.")
39 | parser.add_argument('--reuse_len', type=int, default=256,
40 |                         help="Number of tokens that can be reused as memory. "
41 | "Could be half of `seq_len`.")
42 | parser.add_argument('--perm_size', type=int,
43 | default=256,
44 | help="the length of longest permutation. Could be set to be reuse_len.")
45 | parser.add_argument('--bi_data', type=bool, default=False,
46 | help="whether to create bidirectional data")
47 | parser.add_argument('--mask_alpha', type=int,
48 | default=6, help="How many tokens to form a group.")
49 | parser.add_argument('--mask_beta', type=int,
50 | default=1, help="How many tokens to mask within each group.")
51 | parser.add_argument('--num_predict', type=int,
52 | default=85, help="Num of tokens to predict.")
53 | parser.add_argument('--mem_len', type=int,
54 | default=384, help="Number of steps to cache")
55 | parser.add_argument('--num_epoch', type=int,
56 | default=100, help="Number of epochs")
57 |
58 | args = parser.parse_args()
59 |
60 | sp = BertTokenizer.from_pretrained(args.tokenizer)
61 | model = xlnet.XLNet(n_token=len(sp.vocab), n_layer=6, n_head=4, d_head=8,
62 | d_inner=32, d_model=32,
63 | dropout=0.1, dropatt=0.1,
64 | attn_type="bi", bi_data=args.bi_data,
65 | clamp_len=-1, same_length=False,
66 | reuse_len=args.reuse_len, mem_len=args.mem_len)
67 |
68 |     criterion = nn.CrossEntropyLoss(reduction='none')  # per-token loss; masked by tgt_mask below
69 | optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.01)
70 |
71 | for num_epoch in range(args.num_epoch):
72 | mems = None
73 |
74 | features = data_utils._create_data(sp=sp,
75 | input_paths=args.data,
76 | seq_len=args.seq_len,
77 | reuse_len=args.reuse_len,
78 | bi_data=args.bi_data,
79 | num_predict=args.num_predict,
80 | mask_alpha=args.mask_alpha,
81 | mask_beta=args.mask_beta)
82 |
83 | num_step = 0
84 | for feature in features:
85 | permutation = data_utils.make_permute(feature,
86 | reuse_len=args.reuse_len,
87 | seq_len=args.seq_len,
88 | perm_size=args.perm_size,
89 | num_predict=args.num_predict)
90 |
91 | # batch size is 1
92 | inp_k = permutation['input_k'].unsqueeze(-1) # [seq_len, 1(=bsz)]
93 | seg_id = permutation['seg_id'].unsqueeze(-1) # [seq_len, 1(=bsz)]
94 | target = permutation['target'].unsqueeze(-1) # [num_predict, 1(=bsz)]
95 | perm_mask = permutation['perm_mask'].unsqueeze(-1) # [seq_len, seq_len, 1(=bsz)]
96 | target_mapping = \
97 | permutation['target_mapping'].unsqueeze(-1) # [num_predict, seq_len, 1(=bsz)]
98 | inp_q = permutation['input_q'].unsqueeze(-1) # [seq_len, 1(=bsz)]
99 | tgt_mask = permutation['target_mask'].unsqueeze(-1) # [num_predict, 1(=bsz)]
100 |
101 | logits, new_mems = model(inp_k=inp_k, seg_id=seg_id, input_mask=None,
102 | mems=mems, perm_mask=perm_mask,
103 | target_mapping=target_mapping, inp_q=inp_q)
104 |
105 | lm_loss = criterion(logits.transpose(1, 2), target).type(torch.float32)
106 | tgt_mask_sum = tgt_mask.reshape(-1).sum()
107 | lm_loss_sum = (lm_loss * tgt_mask).reshape(-1).sum()
108 |
109 | optimizer.zero_grad()
110 | total_loss = lm_loss_sum / tgt_mask_sum
111 | print('Number of Epoch: %04d in %04d Step' % ((num_epoch + 1), (num_step + 1)),
112 | 'cost =', '{:.6f}'.format(total_loss))
113 | num_step += 1
114 |
115 | total_loss.backward()
116 | optimizer.step()
117 |
118 | mems = new_mems
--------------------------------------------------------------------------------
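A minimal sketch, with toy shapes and random tensors standing in for the model output, of the masked LM loss computed in the training loop above: per-token cross entropy (hence `reduction='none'`) that is zeroed out wherever `target_mask` is 0 and averaged over the real prediction positions. The sizes below are illustrative only.

import torch
import torch.nn as nn

num_predict, bsz, n_token = 85, 1, 100            # toy sizes
logits = torch.randn(num_predict, bsz, n_token)   # stand-in for the model's logits
target = torch.randint(n_token, (num_predict, bsz))
tgt_mask = torch.zeros(num_predict, bsz)
tgt_mask[:40] = 1.0                               # pretend only 40 positions carry a loss

criterion = nn.CrossEntropyLoss(reduction='none')
lm_loss = criterion(logits.transpose(1, 2), target)        # [num_predict, bsz]
total_loss = (lm_loss * tgt_mask).sum() / tgt_mask.sum()   # masked mean
print(total_loss)

Training itself can be started with the defaults defined above, e.g. `python main.py --data data.txt --num_epoch 100`.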
/xlnet.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | Copyright 2019 Tae Hwan Jung
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | """
18 |
19 | from __future__ import absolute_import
20 | from __future__ import division
21 | from __future__ import print_function
22 |
23 | import numpy as np
24 |
25 | import torch
26 | import torch.nn as nn
27 | import torch.nn.functional as F
28 |
29 | class XLNet(nn.Module):
30 | """
31 | Defines a Transformer-XL computation graph with additional
32 | support for XLNet.
33 |
34 | Args:
35 |
36 | inp_k: int32 Tensor in shape [len, bsz], the input token IDs.
37 | seg_id: int32 Tensor in shape [len, bsz], the input segment IDs.
38 | input_mask: float32 Tensor in shape [len, bsz], the input mask.
39 | 0 for real tokens and 1 for padding.
40 | mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
41 | from previous batches. The length of the list equals n_layer.
42 | If None, no memory is used.
43 | perm_mask: float32 Tensor in shape [len, len, bsz].
44 | If perm_mask[i, j, k] = 0, i attend to j in batch k;
45 | if perm_mask[i, j, k] = 1, i does not attend to j in batch k.
46 | If None, each position attends to all the others.
47 | target_mapping: float32 Tensor in shape [num_predict, len, bsz].
48 | If target_mapping[i, j, k] = 1, the i-th predict in batch k is
49 | on the j-th token.
50 | Only used during pretraining for partial prediction.
51 | Set to None during finetuning.
52 | inp_q: float32 Tensor in shape [len, bsz].
53 | 1 for tokens with losses and 0 for tokens without losses.
54 | Only used during pretraining for two-stream attention.
55 | Set to None during finetuning.
56 |
57 | n_layer: int, the number of layers.
58 | d_model: int, the hidden size.
59 | n_head: int, the number of attention heads.
60 | d_head: int, the dimension size of each attention head.
61 | d_inner: int, the hidden size in feed-forward layers.
62 | ff_activation: str, "relu" or "gelu".
63 | n_token: int, the vocab size.
64 |
65 | dropout: float, dropout rate.
66 | dropatt: float, dropout rate on attention probabilities.
67 |
68 | mem_len: int, the number of tokens to cache.
69 |         reuse_len: int, the number of tokens in the current batch to be cached
70 | and reused in the future.
71 | bi_data: bool, whether to use bidirectional input pipeline.
72 | Usually set to True during pretraining and False during finetuning.
73 | clamp_len: int, clamp all relative distances larger than clamp_len.
74 | -1 means no clamping.
75 |
76 | """
77 | def __init__(self, n_token, n_layer, n_head, d_head, d_inner, d_model, dropout, dropatt,
78 | attn_type, bi_data, clamp_len, same_length, reuse_len, mem_len):
79 | super(XLNet, self).__init__()
80 |
81 | self.n_token = n_token
82 | self.n_layer = n_layer
83 | self.n_head = n_head
84 | self.d_head = d_head
85 | self.d_inner = d_inner
86 | self.d_model = d_model
87 | self.dropout = dropout
88 | self.dropatt = dropatt
89 | self.attn_type = attn_type
90 | self.bi_data = bi_data
91 | self.clamp_len = clamp_len
92 | self.same_length = same_length
93 | self.reuse_len = reuse_len
94 | self.mem_len = mem_len
95 |
96 | self.embedding = nn.Embedding(n_token, d_model)
97 | self.Dropout = nn.Dropout(p=dropout)
98 | self.DropAttn = nn.Dropout(p=dropatt)
99 |
100 | self.r_w_bias = nn.Parameter(torch.randn(self.n_layer,
101 | self.n_head,self.d_head))
102 | self.r_r_bias = nn.Parameter(torch.randn(self.n_layer,
103 | self.n_head, self.d_head))
104 |
105 | ##### Segment embedding
106 | self.r_s_bias = nn.Parameter(torch.randn(self.n_layer,
107 | self.n_head,self.d_head))
108 |
109 | self.seg_embed = nn.Parameter(torch.randn(self.n_layer, 2,
110 | self.n_head, self.d_head))
111 |
112 | self.mask_emb = nn.Parameter(torch.randn(1, 1, d_model))
113 |
114 | # post-attention projection (back to `d_model`)
115 | self.proj_o = nn.Parameter(torch.randn(self.d_model,
116 | self.n_head, self.d_head))
117 |
118 | #### Project hidden states to a specific head with a 4D-shape.
119 | self.q_proj_weight = nn.Parameter(torch.randn(self.d_model,
120 | self.n_head, self.d_head))
121 | self.k_proj_weight = nn.Parameter(torch.randn(self.d_model,
122 | self.n_head, self.d_head))
123 | self.v_proj_weight = nn.Parameter(torch.randn(self.d_model,
124 | self.n_head, self.d_head))
125 | self.r_proj_weight = nn.Parameter(torch.randn(self.d_model,
126 | self.n_head, self.d_head))
127 |
128 | self.layer_norm = nn.LayerNorm(d_model)
129 |
130 | self.conv1 = nn.Linear(d_model, d_inner)
131 | self.conv2 = nn.Linear(d_inner, d_model)
132 | self.relu = nn.ReLU(inplace=True)
133 |
134 | self.softmax_b = nn.Parameter(torch.zeros(self.n_token))
135 |
136 |
137 | def gelu(self, x):
138 | """Gaussian Error Linear Unit.
139 |
140 | This is a smoother version of the RELU.
141 | Original paper: https://arxiv.org/abs/1606.08415
142 | Args:
143 | x: float Tensor to perform activation.
144 |
145 | Returns:
146 | `x` with the GELU activation applied.
147 | """
148 | cdf = 0.5 * (1.0 + torch.tanh(
149 | (np.sqrt(2 / np.pi) * (x + 0.044715 * torch.pow(x, 3)))))
150 | return x * cdf
151 |
152 | def rel_shift(self, x, klen=-1):
153 | """perform relative shift to form the relative attention score."""
154 | x_size = x.shape
155 |
156 | x = torch.reshape(x, [x_size[1], x_size[0], x_size[2], x_size[3]])
157 | x = x[1:, 0:, 0:, 0:] # tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1])
158 | x = torch.reshape(x, [x_size[0], x_size[1] - 1, x_size[2], x_size[3]])
159 | x = x[0:, 0:klen, 0:, 0:] # tf.slice(x, [0, 0, 0, 0], [-1, klen, -1, -1])
160 |
161 | return x
162 |
163 | def positionwise_ffn(self, inp, activation_type='relu'):
164 |
165 | """Position-wise Feed-forward Network."""
166 | output = self.conv1(inp)
167 | output = self.Dropout(output)
168 | if activation_type == 'relu':
169 | output = self.relu(output)
170 | elif activation_type == 'gelu':
171 | output = self.gelu(output)
172 | else:
173 | raise ValueError('Unsupported activation type {}'.format(activation_type))
174 |
175 | output = self.layer_norm(output + inp)
176 | return output
177 |
178 | def post_attention(self, h, attn_vec, residual=True):
179 | """Post-attention processing."""
180 |
181 | # post-attention projection (back to `d_model`)
182 | attn_out = torch.einsum('ibnd,hnd->ibh', attn_vec, self.proj_o)
183 |
184 | attn_out = self.Dropout(attn_out)
185 | if residual:
186 | output = self.layer_norm(attn_out + h)
187 | else:
188 | output = self.layer_norm(attn_out)
189 |
190 | return output
191 |
192 | def head_projection(self, h, name):
193 | """Project hidden states to a specific head with a 4D-shape."""
194 | proj_weight = None
195 | if name == 'q':
196 | proj_weight = self.q_proj_weight
197 | elif name == 'k':
198 | proj_weight = self.k_proj_weight
199 | elif name =='v':
200 | proj_weight = self.v_proj_weight
201 | elif name == 'r':
202 | proj_weight = self.r_proj_weight
203 | else:
204 | raise ValueError('Unknown `name` {}.'.format(name))
205 |
206 | head = torch.einsum('ibh,hnd->ibnd', h, proj_weight)
207 |
208 | return head
209 |
210 | def rel_attn_core(self, q_head, k_head_h, v_head_h, k_head_r, seg_embed, seg_mat,
211 | r_w_bias, r_r_bias, r_s_bias, attn_mask, scale):
212 |
213 | """Core relative positional attention operations."""
214 |
215 | # content based attention score
216 | ac = torch.einsum('ibnd,jbnd->ijbn', q_head + r_w_bias, k_head_h)
217 |
218 | # position based attention score
219 | bd = torch.einsum('ibnd,jbnd->ijbn', q_head + r_r_bias, k_head_r)
220 | bd = self.rel_shift(bd, klen=ac.shape[1])
221 |
222 | # segment based attention score
223 | if seg_mat is None:
224 | ef = 0
225 | else:
226 | ef = torch.einsum('ibnd,snd->ibns', q_head + r_s_bias, seg_embed)
227 | ef = torch.einsum('ijbs,ibns->ijbn', seg_mat, ef)
228 |
229 | # merge attention scores and perform masking
230 | attn_score = (ac + bd + ef) * scale
231 | if attn_mask is not None:
232 | # attn_score = attn_score * (1 - attn_mask) - 1e30 * attn_mask
233 | attn_score = attn_score - 1e30 * attn_mask
234 |
235 | # attention probability
236 | attn_prob = F.softmax(attn_score, dim=1)
237 | attn_prob = self.DropAttn(attn_prob)
238 |
239 | # attention output
240 | attn_vec = torch.einsum('ijbn,jbnd->ibnd', attn_prob, v_head_h)
241 |
242 | return attn_vec
243 |
244 | def rel_multihead_attn(self, h, r, r_w_bias, r_r_bias, seg_mat, r_s_bias, seg_embed,
245 | attn_mask, mems, d_model, n_head, d_head, dropout, dropatt):
246 | """Multi-head attention with relative positional encoding."""
247 |
248 | scale = 1 / (d_head ** 0.5)
249 | if mems is not None and len(mems.size()) > 1:
250 | cat = torch.cat([mems, h], dim=0)
251 | else:
252 | cat = h
253 |
254 | # content heads
255 | q_head_h = self.head_projection(h, 'q')
256 | k_head_h = self.head_projection(cat, 'k')
257 | v_head_h = self.head_projection(cat, 'v')
258 |
259 | # positional heads
260 | k_head_r = self.head_projection(r, 'r')
261 |
262 | # core attention ops
263 | attn_vec = self.rel_attn_core(
264 | q_head_h, k_head_h, v_head_h, k_head_r, seg_embed, seg_mat, r_w_bias,
265 | r_r_bias, r_s_bias, attn_mask, scale)
266 |
267 | # post processing
268 | output = self.post_attention(h, attn_vec)
269 |
270 | return output
271 |
272 | def two_stream_rel_attn(self, h, g, r, mems, r_w_bias, r_r_bias, seg_mat, r_s_bias,
273 | seg_embed, attn_mask_h, attn_mask_g, target_mapping):
274 | scale = 1 / (self.d_head ** 0.5)
275 |
276 | # content based attention score
277 | if mems is not None and len(mems.size()) > 1:
278 | cat = torch.cat([mems, h], dim=0)
279 | else:
280 | cat = h
281 |
282 | # content-based key head
283 | k_head_h = self.head_projection(cat, 'k')
284 |
285 | # content-based value head
286 | v_head_h = self.head_projection(cat, 'v')
287 |
288 | # position-based key head
289 | k_head_r = self.head_projection(r, 'r')
290 |
291 | ##### h-stream
292 | # content-stream query head
293 | q_head_h = self.head_projection(h, 'q')
294 |
295 | # core attention ops
296 |         # content stream: h^(m)_{z_t} = Attn(Q = h^(m-1)_{z_t}, KV = [mem^(m-1), h^(m-1)_{z<=t}])
297 | attn_vec_h = self.rel_attn_core(
298 | q_head_h, k_head_h, v_head_h, k_head_r, seg_embed, seg_mat, r_w_bias,
299 | r_r_bias, r_s_bias, attn_mask_h, scale)
300 |
301 | # post processing
302 | output_h = self.post_attention(h, attn_vec_h)
303 |
304 | ##### g-stream
305 | # query-stream query head
306 | q_head_g = self.head_projection(g, 'q')
307 |
308 | # core attention ops
309 |         # query stream: g^(m)_{z_t} = Attn(Q = g^(m-1)_{z_t}, KV = [mem^(m-1), h^(m-1)_{z<t}])
310 | if target_mapping is not None:
311 | q_head_g = torch.einsum('mbnd,mlb->lbnd', q_head_g, target_mapping)
312 | attn_vec_g = self.rel_attn_core(
313 | q_head_g, k_head_h, v_head_h, k_head_r, seg_embed, seg_mat, r_w_bias,
314 | r_r_bias, r_s_bias, attn_mask_g, scale)
315 | attn_vec_g = torch.einsum('lbnd,mlb->mbnd', attn_vec_g, target_mapping)
316 | else:
317 | attn_vec_g = self.rel_attn_core(
318 | q_head_g, k_head_h, v_head_h, k_head_r, seg_embed, seg_mat, r_w_bias,
319 | r_r_bias, r_s_bias, attn_mask_g, scale)
320 |
321 | # post processing
322 | output_g = self.post_attention(g, attn_vec_g)
323 |
324 | return output_h, output_g
325 |
326 |
327 | def _create_mask(self, qlen, mlen, dtype, same_length=False):
328 | """create causal attention mask."""
329 | # [[0,1,1],
330 | # [0,0,1],
331 | # [0,0,0]]
332 | attn_mask = torch.ones([qlen, qlen], dtype=dtype)
333 | mask_u = torch.triu(attn_mask) # Upper triangular part.
334 | mask_dia = torch.tril(attn_mask) & torch.triu(attn_mask) # Diagonal. Figure 2(c)
335 | attn_mask_pad = torch.zeros([qlen, mlen], dtype=dtype)
336 |         ret = torch.cat([attn_mask_pad, mask_u - mask_dia], dim=1) # [qlen, klen]
337 | if same_length:
338 | # [[0,1,1],
339 | # [1,0,1],
340 | # [1,1,0]]
341 | mask_l = torch.tril(attn_mask) # Lower triangular part.
342 | ret = torch.cat([ret[:, :qlen] + mask_l - mask_dia, ret[:, qlen:]], dim=1)
343 |
344 |         return ret.type(dtype=torch.float32) # [qlen, klen]
345 |
346 | def positional_embedding(self, pos_seq, inv_freq):
347 | sinusoid_inp = torch.einsum('i,d->id', pos_seq, inv_freq)
348 | pos_emb = torch.cat([torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)], dim=-1)
349 | pos_emb = pos_emb[:, None, :]
350 |
351 | return pos_emb
352 |
353 | def _cache_mem(self, curr_out, prev_mem, mem_len, reuse_len=None):
354 | """cache hidden states into memory."""
355 |
356 | with torch.no_grad():
357 | if mem_len is None or mem_len == 0:
358 | return None
359 | else:
360 | if reuse_len is not None and reuse_len > 0:
361 | curr_out = curr_out[:reuse_len]
362 |
363 | if prev_mem is None:
364 | new_mem = curr_out[-mem_len:]
365 | else:
366 | new_mem = torch.cat([prev_mem, curr_out], dim=0)[-mem_len:]
367 |
368 | return new_mem
369 |
370 |
371 | def relative_positional_encoding(self, qlen, klen, d_model, clamp_len, attn_type,
372 | bi_data, bsz=None, dtype=None):
373 | """create relative positional encoding."""
374 |
375 | freq_seq = torch.arange(0, d_model, 2.0)
376 | if dtype is not None and dtype != torch.float32:
377 | freq_seq = freq_seq.type(dtype)
378 | inv_freq = 1 / (10000 ** (freq_seq / d_model))
379 |
380 | if attn_type == 'bi':
381 | # beg, end = klen - 1, -qlen
382 | beg, end = klen, -qlen
383 | elif attn_type == 'uni':
384 | # beg, end = klen - 1, -1
385 | beg, end = klen, -1
386 | else:
387 | raise ValueError('Unknown `attn_type` {}.'.format(attn_type))
388 |
389 |         if bi_data and bsz % 2 == 0:
390 | fwd_pos_seq = torch.arange(beg, end, -1.0)
391 | bwd_pos_seq = torch.arange(-beg, -end, 1.0)
392 |
393 | if dtype is not None and dtype != torch.float32:
394 | fwd_pos_seq = fwd_pos_seq.type(dtype=dtype)
395 | bwd_pos_seq = bwd_pos_seq.type(dtype=dtype)
396 |
397 | if clamp_len > 0:
398 | fwd_pos_seq = torch.clamp(fwd_pos_seq, -clamp_len, clamp_len)
399 | bwd_pos_seq = torch.clamp(bwd_pos_seq, -clamp_len, clamp_len)
400 |
401 | fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq)
402 | bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq)
403 |
404 | pos_emb = torch.cat([fwd_pos_emb, bwd_pos_emb], dim=1)
405 | else:
406 | fwd_pos_seq = torch.arange(beg, end, -1.0)
407 | if dtype is not None and dtype != torch.float32:
408 | fwd_pos_seq = fwd_pos_seq.type(dtype=dtype)
409 | if clamp_len > 0:
410 | fwd_pos_seq = torch.clamp(fwd_pos_seq, -clamp_len, clamp_len)
411 | pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq)
412 |
413 | return pos_emb
414 |
415 | def forward(self, inp_k, seg_id, input_mask, mems, perm_mask, target_mapping, inp_q):
416 | new_mems = []
417 |
418 | bsz = inp_k.shape[1]
419 | qlen = inp_k.shape[0]
420 | mlen = mems[0].size(0) if mems is not None else 0
421 | klen = mlen + qlen
422 |
423 | ##### Attention mask
424 | # causal attention mask
425 | if self.attn_type == 'uni':
426 | attn_mask = self._create_mask(qlen, mlen, torch.int64, self.same_length)
427 | attn_mask = attn_mask[:, :, None, None]
428 | elif self.attn_type == 'bi':
429 | attn_mask = None
430 | else:
431 | raise ValueError('Unsupported attention type: {}'.format(self.attn_type))
432 |
433 | # data mask: input mask & perm mask
434 | if input_mask is not None and perm_mask is not None:
435 | data_mask = input_mask[None] + perm_mask
436 | elif input_mask is not None and perm_mask is None:
437 | data_mask = input_mask[None]
438 | elif input_mask is None and perm_mask is not None:
439 | data_mask = perm_mask
440 | else:
441 | data_mask = None
442 |
443 | if data_mask is not None:
444 | # all mems can be attended to
445 | mems_mask = torch.zeros([data_mask.shape[0], mlen, bsz],
446 | dtype=torch.float32)
447 | data_mask = torch.cat([mems_mask, data_mask], dim=1)
448 | if attn_mask is None:
449 | attn_mask = data_mask[:, :, :, None]
450 | else:
451 | attn_mask += data_mask[:, :, :, None]
452 |
453 | if attn_mask is not None:
454 | attn_mask = attn_mask.gt(0).type(torch.float32)
455 |
456 | if attn_mask is not None:
457 | non_tgt_mask = -torch.eye(qlen, dtype=torch.float32) # [qlen, qlen]
458 | non_tgt_mask = torch.cat([torch.zeros([qlen, mlen], dtype=torch.float32), # [qlen, klen]
459 | non_tgt_mask],
460 | dim=-1)
461 | non_tgt_mask = (attn_mask +
462 | non_tgt_mask[:, :, None, None]).gt(0).type(dtype=torch.float32)
463 | else:
464 | non_tgt_mask = None
465 |
466 | ##### Word embedding
467 | lookup_table = self.embedding
468 | word_emb_k = lookup_table(inp_k)
469 |
470 | if inp_q is not None:
471 | if target_mapping is not None:
472 | word_emb_q = self.mask_emb.repeat(target_mapping.shape[0], bsz, 1)
473 | else:
474 | inp_q_ext = inp_q[:, :, None]
475 | word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k
476 |
477 | #### Figure 2(a), Content Stream(Original Attention), h^(0)_t = e(x_i) = e(inp_k)
478 | output_h = self.Dropout(word_emb_k)
479 | if inp_q is not None:
480 | #### Query Stream, g^(0)_t = w
481 | #### the first layer query stream is initialized with a trainable vector
482 | output_g = self.Dropout(word_emb_q)
483 |
484 | ##### Segment embedding
485 | # paper
486 | # Given a pair of positions i and j in the sequence, if
487 | # i and j are from the same segment
488 | if seg_id is not None:
489 | # Convert `seg_id` to one-hot `seg_mat`
490 | mem_pad = torch.zeros([mlen, bsz], dtype=torch.int32)
491 | cat_ids = torch.cat([mem_pad, seg_id], dim=0)
492 |
493 | # `1` indicates not in the same segment [qlen x klen x bsz]
494 | seg_mat = (~torch.eq(seg_id[:, None], cat_ids[None, :])).type(torch.long)
495 | seg_mat = torch.eye(2, dtype=torch.float32)[seg_mat]
496 | else:
497 | seg_mat = None
498 |
499 | ##### Positional encoding
500 | pos_emb = self.relative_positional_encoding(
501 | qlen, klen, self.d_model, self.clamp_len, self.attn_type, self.bi_data,
502 | bsz=bsz, dtype=torch.float32)
503 | pos_emb = self.Dropout(pos_emb)
504 |
505 | ##### Attention layers
506 | if mems is None:
507 | mems = [None] * self.n_layer
508 |
509 | for i in range(self.n_layer):
510 | # cache new mems
511 | new_mems.append(self._cache_mem(output_h, mems[i], self.mem_len, self.reuse_len))
512 |
513 | # segment bias
514 | if seg_id is None:
515 | r_s_bias_i = None
516 | seg_embed_i = None
517 | else:
518 | r_s_bias_i = self.r_s_bias[i]
519 | seg_embed_i = self.seg_embed[i]
520 |
521 | if inp_q is not None:
522 | output_h, output_g = self.two_stream_rel_attn(
523 | h=output_h,
524 | g=output_g,
525 | r=pos_emb,
526 | r_w_bias= self.r_w_bias[i],
527 | r_r_bias= self.r_r_bias[i],
528 | seg_mat=seg_mat,
529 | r_s_bias=r_s_bias_i,
530 | seg_embed=seg_embed_i,
531 | attn_mask_h=non_tgt_mask,
532 | attn_mask_g=attn_mask,
533 | mems=mems[i],
534 | target_mapping=target_mapping)
535 | else:
536 | output_h = self.rel_multihead_attn(
537 | h=output_h,
538 | r=pos_emb,
539 | r_w_bias=self.r_w_bias[i],
540 | r_r_bias=self.r_r_bias[i],
541 | seg_mat=seg_mat,
542 | r_s_bias=r_s_bias_i,
543 | seg_embed=seg_embed_i,
544 | attn_mask=non_tgt_mask,
545 | mems=mems[i])
546 |
547 | if inp_q is not None:
548 | output_g = self.positionwise_ffn(inp=output_g)
549 |
550 | output_h = self.positionwise_ffn(inp=output_h)
551 |
552 | if inp_q is not None:
553 | output = self.Dropout(output_g)
554 | else:
555 | output = self.Dropout(output_h)
556 |
557 | logits = torch.einsum('ibd,nd->ibn', output, lookup_table.weight) + self.softmax_b
558 |
559 | return logits, new_mems
--------------------------------------------------------------------------------
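A minimal sketch (toy hyper-parameters, random inputs, batch size 1, not part of the repository) of instantiating `XLNet` and running one pretraining-style forward pass with the tensor shapes documented in the class docstring; all sizes below are made up and much smaller than the ones used in `main.py`.

import torch
import xlnet

seq_len, num_predict, bsz, n_token = 16, 4, 1, 32
model = xlnet.XLNet(n_token=n_token, n_layer=2, n_head=2, d_head=4,
                    d_inner=8, d_model=8, dropout=0.1, dropatt=0.1,
                    attn_type="bi", bi_data=False, clamp_len=-1,
                    same_length=False, reuse_len=8, mem_len=8)

inp_k = torch.randint(n_token, (seq_len, bsz))              # [len, bsz] token ids
seg_id = torch.zeros(seq_len, bsz, dtype=torch.int32)       # single segment
perm_mask = torch.zeros(seq_len, seq_len, bsz)              # every position may attend to every other
target_mapping = torch.zeros(num_predict, seq_len, bsz)
target_mapping[torch.arange(num_predict), torch.arange(num_predict), 0] = 1.0
inp_q = torch.zeros(seq_len, bsz)
inp_q[:num_predict, 0] = 1.0                                # first 4 positions carry a loss

logits, new_mems = model(inp_k=inp_k, seg_id=seg_id, input_mask=None,
                         mems=None, perm_mask=perm_mask,
                         target_mapping=target_mapping, inp_q=inp_q)
print(logits.shape)                        # [num_predict, bsz, n_token]
print(len(new_mems), new_mems[0].shape)    # n_layer tensors of shape [reuse_len, bsz, d_model]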