├── HISTORY.rst
├── doc
│   ├── source
│   │   ├── _static
│   │   │   └── output_change.png
│   │   ├── contents.rst
│   │   ├── index.rst
│   │   └── conf.py
│   └── Makefile
├── admin_torch
│   ├── __init__.py
│   └── admin.py
├── MANIFEST.in
├── CODE_OF_CONDUCT.md
├── .gitignore
├── example
│   ├── eval_wmt_en-de.sh
│   ├── train_wmt_en-de_huge_batch.sh
│   ├── train_big_wmt_en-de_huge_batch.sh
│   ├── train_wmt_en-de.sh
│   ├── average_checkpoints.py
│   ├── README.md
│   └── profile.ratio.init
├── LICENSE
├── setup.py
├── SUPPORT.md
├── .github
│   └── workflows
│       └── codeql-analysis.yml
├── SECURITY.md
└── README.md

--------------------------------------------------------------------------------
/HISTORY.rst:
--------------------------------------------------------------------------------
1 | History
2 | =======
3 | 
4 | 0.1.0 (2022/3/3)
5 | ------------------
6 | * implemented Admin
--------------------------------------------------------------------------------
/doc/source/_static/output_change.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/admin-torch/main/doc/source/_static/output_change.png
--------------------------------------------------------------------------------
/admin_torch/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 | 
4 | __author__ = "Liyuan Liu"
5 | 
6 | __maintainer__ = "Liyuan Liu"
7 | __email__ = "llychinalz@gmail.com"
8 | 
9 | from admin_torch.admin import *
--------------------------------------------------------------------------------
/doc/source/contents.rst:
--------------------------------------------------------------------------------
1 | .. Admin-Torch documentation file.
2 | 
3 | :github_url: https://github.com/microsoft/admin-torch
4 | 
5 | *************************
6 | Admin-Torch documentation
7 | *************************
8 | 
9 | .. toctree::
10 |    :maxdepth: 2
11 | 
12 |    index
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | # Include the README
2 | include *.md
3 | 
4 | # Include the license file
5 | include LICENSE
6 | 
7 | # Include the history
8 | include HISTORY.rst
9 | 
10 | # Include image
11 | include doc/source/_static/output_change.png
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Microsoft Open Source Code of Conduct
2 | 
3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4 | 
5 | Resources:
6 | 
7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
--------------------------------------------------------------------------------
/doc/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 | 
4 | # You can set these variables from the command line.
5 | SPHINXOPTS    =
6 | SPHINXBUILD   = python -msphinx
7 | SPHINXPROJ    = Admin
8 | SOURCEDIR     = source
9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # yaml files 7 | *.yaml 8 | 9 | # aml files 10 | .amltconfig 11 | .amltignore 12 | 13 | # macOS dir files 14 | .DS_Store 15 | 16 | # Distribution / packaging 17 | .Python 18 | env/ 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # PyBuilder 46 | target/ 47 | 48 | # Jupyter Notebook 49 | .ipynb_checkpoints 50 | 51 | # pyenv 52 | .python-version 53 | 54 | # dotenv 55 | .env 56 | 57 | # mypy 58 | .mypy_cache/ 59 | -------------------------------------------------------------------------------- /example/eval_wmt_en-de.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | DATADIR=${1:-"./wmt14_en_de_joined_dict"} 3 | MODELDIR=${2:-"None"} 4 | 5 | SAVEDIR=${3:-"None"} 6 | UPPER_BOUND=${4:-100} 7 | CP_POINT_NUM=${5:-10} 8 | 9 | if [[ $MODELDIR == "None" ]] 10 | then 11 | if [[ $SAVEDIR == "None" ]] 12 | then 13 | echo "SAVEDIR and MODELDIR cannot be None at the same time." 14 | exit 15 | fi 16 | MODELDIR=$SAVEDIR/model_${UPPER_BOUND}_${CP_POINT_NUM}.pt 17 | if [ -f $MODELDIR ]; then 18 | echo $MODELDIR "already exists" 19 | else 20 | echo "Start averaging model" 21 | python average_checkpoints.py --inputs $SAVEDIR --num-epoch-checkpoints ${CP_POINT_NUM} --output $MODELDIR --checkpoint-upper-bound $UPPER_BOUND | grep 'Finish' 22 | echo "End averaging model" 23 | fi 24 | fi 25 | 26 | echo "Model path" $MODELDIR 27 | 28 | CUDA_VISIBLE_DEVICES=0 fairseq-generate $DATADIR \ 29 | --path $MODELDIR \ 30 | --batch-size 128 --beam 4 --lenpen 0.6 --remove-bpe \ 31 | --quiet --fp16 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | from setuptools import setup, find_packages
5 | 
6 | def read_readme():
7 |     with open('README.md') as f:
8 |         return f.read()
9 | 
10 | with open('HISTORY.rst') as history_file:
11 |     history = history_file.read()
12 | 
13 | requirements = [
14 |     'torch'
15 | ]
16 | 
17 | setup(
18 |     name='admin_torch',
19 |     version='0.1.0',
20 |     description='Plug-in-and-Play Toolbox for Stabilizing Transformer Training',
21 |     long_description=read_readme(),
22 |     long_description_content_type="text/markdown",
23 |     author='Lucas Liu',
24 |     author_email='llychinalz@gmail.com',
25 |     url='https://github.com/microsoft/admin-torch',
26 |     packages=find_packages(exclude=['docs']),
27 |     include_package_data=True,
28 |     install_requires=requirements,
29 |     license='MIT',
30 |     zip_safe=False,
31 |     classifiers=[
32 |         'Development Status :: 2 - Pre-Alpha',
33 |         'Intended Audience :: Developers',
34 |         'Natural Language :: English',
35 |         'Programming Language :: Python :: 3.7',
36 |         'Programming Language :: Python :: 3.8',
37 |         'Programming Language :: Python :: 3.9',
38 |     ]
39 | )
40 | 
41 | # python setup.py sdist bdist_wheel --universal
42 | # twine upload dist/*
--------------------------------------------------------------------------------
/SUPPORT.md:
--------------------------------------------------------------------------------
1 | # TODO: The maintainer of this repo has not yet edited this file
2 | 
3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?
4 | 
5 | - **No CSS support:** Fill out this template with information about how to file issues and get help.
6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport).
7 | - **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide.
8 | 
9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.*
10 | 
11 | # Support
12 | 
13 | ## How to file issues and get help
14 | 
15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing
16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or
17 | feature request as a new Issue.
18 | 
19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE
20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**.
22 | 
23 | ## Microsoft Support Policy
24 | 
25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above.
26 | 
--------------------------------------------------------------------------------
/doc/source/index.rst:
--------------------------------------------------------------------------------
1 | .. Admin-Torch documentation file.
2 | 
3 | :github_url: https://github.com/microsoft/admin-torch
4 | 
5 | *************************
6 | Admin-Torch documentation
7 | *************************
8 | 
9 | A plug-in-and-play PyTorch wrapper for `Adaptive model initialization (Admin)`__.
10 | 
11 | For a neural network f, an input x, and a randomly initialized weight w, we describe its stability
12 | (``output_change_scale``) as
13 | 
14 | .. math:: E[|f(x, w) - f(x, w + \delta)|_2^2], \mbox{where } \delta \mbox{ is a random perturbation.}
15 | 
16 | In `our study`__, we show that an original n-layer Transformer's ``output_change_scale`` is ``O(n)``,
17 | which destabilizes its training. Admin stabilizes Transformer training by regulating this scale to
18 | ``O(logn)`` or ``O(1)``. We keep ``O(logn)`` as the ``default`` setting, which handles most scenarios.
19 | If additional stability is needed, set ``output_change_scale`` to ``O(1)`` instead. A short usage sketch is included at the end of this page.
20 | 
21 | __ https://arxiv.org/abs/2004.08249
22 | __ https://arxiv.org/abs/2004.08249
23 | 
24 | 
25 | admin_torch\.as_module()
26 | ===============================
27 | .. autofunction:: admin_torch.as_module
28 | 
29 | 
30 | admin_torch\.as_parameter()
31 | ===============================
32 | .. autofunction:: admin_torch.as_parameter
33 | 
34 | 
35 | admin_torch\.as_buffer()
36 | ===============================
37 | .. autofunction:: admin_torch.as_buffer
38 | 
39 | 
40 | admin_torch\.OmegaResidual
41 | ================================
42 | .. autoclass:: admin_torch.OmegaResidual
43 |    :members:
44 | 
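Usage sketch
===============================

A minimal sketch of the intended wiring (mirroring the ``as_module`` example documented above; the layer count here is an arbitrary placeholder):

.. code-block:: python

    import admin_torch

    num_res_layers = 2 * 6  # a 6-layer encoder has two residual connections per layer
    residual = admin_torch.as_module(num_res_layers)  # O(logn) output change, the default
    residual_stable = admin_torch.as_module(num_res_layers, output_change_scale='O(1)')
    # in forward(): x = residual(x, f_x) replaces x = x + f_x, before the layer norm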
--------------------------------------------------------------------------------
/example/train_wmt_en-de_huge_batch.sh:
--------------------------------------------------------------------------------
1 | DATA_PATH=${1:-"./wmt14_en_de_joined_dict"}
2 | LAYERS=${2:-18}
3 | OUTPUT_PATH=${3:-"./admin_18L_asParameter"}
4 | ADDITIONAL_ARGS=${4:-"--encoder-as-parameter --decoder-as-parameter --share-all-embeddings"}
5 | 
6 | TOKENS=2048
7 | DEVICE_NUMBER=8
8 | FREQ=32
9 | 
10 | NUMBER_OF_GPUS=$(nvidia-smi --list-gpus | wc -l)
11 | if [[ $NUMBER_OF_GPUS != $DEVICE_NUMBER ]]
12 | then
13 | echo "This script is written for $DEVICE_NUMBER GPUs, but only $NUMBER_OF_GPUS were found."
14 | echo "Please modify TOKENS, DEVICE_NUMBER, and FREQ in the script accordingly."
15 | echo
16 | echo "Note that you need to keep device_number * tokens * freq = 524288 (i.e., 128 x 4096)."
17 | exit
18 | fi
19 | 
20 | DEVICE_LIST=$(( DEVICE_NUMBER - 1 ))
21 | DEVICE_LIST=$(seq -s "," 0 $DEVICE_LIST)
22 | 
23 | echo "Using GPUs $DEVICE_LIST for training"
24 | 
25 | CUDA_VISIBLE_DEVICES=$DEVICE_LIST fairseq-train \
26 |     $DATA_PATH $ADDITIONAL_ARGS -s en -t de \
27 |     --arch transformer_wmt_en_de \
28 |     --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
29 |     --lr-scheduler inverse_sqrt --max-update 100000 \
30 |     --warmup-init-lr 1e-07 --warmup-updates 4000 --lr 0.0015 \
31 |     --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
32 |     --weight-decay 0.0001 --dropout 0.4 \
33 |     --max-tokens $TOKENS --update-freq $FREQ \
34 |     --save-dir $OUTPUT_PATH --seed 1111 --restore-file x.pt \
35 |     --log-format simple --log-interval 30 --memory-efficient-fp16 \
36 |     --encoder-layers $LAYERS --decoder-layers $LAYERS \
37 |     --threshold-loss-scale 0.0625 --fp16-scale-window 256 --fp16
--------------------------------------------------------------------------------
/example/train_big_wmt_en-de_huge_batch.sh:
--------------------------------------------------------------------------------
1 | DATA_PATH=${1:-"./wmt14_en_de_joined_dict"}
2 | LAYERS=${2:-18}
3 | OUTPUT_PATH=${3:-"./admin_18L_asParameter"}
4 | ADDITIONAL_ARGS=${4:-"--encoder-as-parameter --decoder-as-parameter --share-all-embeddings --lr 0.0007"}
5 | 
6 | TOKENS=2048
7 | DEVICE_NUMBER=16
8 | FREQ=16
9 | 
10 | NUMBER_OF_GPUS=$(nvidia-smi --list-gpus | wc -l)
11 | if [[ $NUMBER_OF_GPUS != $DEVICE_NUMBER ]]
12 | then
13 | echo "This script is written for $DEVICE_NUMBER GPUs, but only $NUMBER_OF_GPUS were found."
14 | echo "Please modify TOKENS, DEVICE_NUMBER, and FREQ in the script accordingly."
15 | echo
16 | echo "Note that you need to keep device_number * tokens * freq = 524288 (i.e., 128 x 4096)."
17 | exit
18 | fi
19 | 
20 | DEVICE_LIST=$(( DEVICE_NUMBER - 1 ))
21 | DEVICE_LIST=$(seq -s "," 0 $DEVICE_LIST)
22 | 
23 | echo "Using GPUs $DEVICE_LIST for training"
24 | 
25 | CUDA_VISIBLE_DEVICES=$DEVICE_LIST fairseq-train \
26 |     $DATA_PATH $ADDITIONAL_ARGS -s en -t de \
27 |     --arch transformer_vaswani_wmt_en_de_big \
28 |     --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
29 |     --lr-scheduler inverse_sqrt --max-update 100000 \
30 |     --warmup-init-lr 1e-07 --warmup-updates 4000 \
31 |     --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
32 |     --weight-decay 0.0001 --dropout 0.4 \
33 |     --max-tokens $TOKENS --update-freq $FREQ \
34 |     --save-dir $OUTPUT_PATH --seed 1111 --restore-file x.pt \
35 |     --log-format simple --log-interval 30 --memory-efficient-fp16 \
36 |     --encoder-layers $LAYERS --decoder-layers $LAYERS \
37 |     --threshold-loss-scale 0.0625 --fp16-scale-window 256 --fp16
--------------------------------------------------------------------------------
/example/train_wmt_en-de.sh:
--------------------------------------------------------------------------------
1 | DATA_PATH=${1:-"./wmt14_en_de_joined_dict"}
2 | LAYERS=${2:-18}
3 | OUTPUT_PATH=${3:-"./admin_18L_asParameter"}
4 | ADDITIONAL_ARGS=${4:-"--share-all-embeddings --encoder-as-parameter --decoder-as-parameter"}
5 | 
6 | TOKENS=4096
7 | DEVICE_NUMBER=8
8 | FREQ=1
9 | 
10 | NUMBER_OF_GPUS=$(nvidia-smi --list-gpus | wc -l)
11 | if [[ $NUMBER_OF_GPUS != $DEVICE_NUMBER ]]
12 | then
13 | echo "This script is written for $DEVICE_NUMBER GPUs, but only $NUMBER_OF_GPUS were found."
14 | echo "Please modify TOKENS, DEVICE_NUMBER, and FREQ in the script accordingly."
15 | echo 16 | echo "Note that you need to keep device_number * tokens * freq = 32768" 17 | exit 18 | fi 19 | 20 | DEVICE_LIST=$(( DEVICE_NUMBER - 1 )) 21 | DEVICE_LIST=$(seq -s "," 0 $DEVICE_LIST) 22 | 23 | echo "Using GPUs $DEVICE_LIST for training" 24 | 25 | CUDA_VISIBLE_DEVICES=$DEVICE_LIST fairseq-train \ 26 | $DATA_PATH $ADDITIONAL_ARGS -s en -t de \ 27 | --arch transformer_wmt_en_de \ 28 | --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ 29 | --lr-scheduler inverse_sqrt --max-update 1000000 \ 30 | --warmup-init-lr 1e-07 --warmup-updates 8000 --lr 0.001 \ 31 | --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ 32 | --weight-decay 0.0 --attention-dropout 0.1 --relu-dropout 0.1 \ 33 | --max-tokens $TOKENS --update-freq $FREQ \ 34 | --save-dir $OUTPUT_PATH --seed 1111 --restore-file x.pt \ 35 | --log-format simple --log-interval 30 --memory-efficient-fp16 \ 36 | --encoder-layers $LAYERS --decoder-layers $LAYERS \ 37 | --threshold-loss-scale 0.0625 --fp16-scale-window 256 --fp16 38 | 39 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ main ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ main ] 20 | schedule: 21 | - cron: '43 10 * * 6' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Learn more about CodeQL language support at https://git.io/codeql-language-support 38 | 39 | steps: 40 | - name: Checkout repository 41 | uses: actions/checkout@v3 42 | 43 | # Initializes the CodeQL tools for scanning. 44 | - name: Initialize CodeQL 45 | uses: github/codeql-action/init@v2 46 | with: 47 | languages: ${{ matrix.language }} 48 | # If you wish to specify custom queries, you can do so here or in a config file. 49 | # By default, queries listed here will override any specified in a config file. 50 | # Prefix the list here with "+" to use these queries and those in the config file. 51 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 52 | 53 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 54 | # If this step fails, then you should remove it and run the build manually (see below) 55 | - name: Autobuild 56 | uses: github/codeql-action/autobuild@v2 57 | 58 | # ℹ️ Command-line programs to run using the OS shell. 
59 | # 📚 https://git.io/JvXDl 60 | 61 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 62 | # and modify them (or add more) to build your code if your project 63 | # uses a compiled language 64 | 65 | #- run: | 66 | # make bootstrap 67 | # make release 68 | 69 | - name: Perform CodeQL Analysis 70 | uses: github/codeql-action/analyze@v2 71 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 
40 | 41 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/very-deep-transformers-for-neural-machine/machine-translation-on-wmt2014-english-french)](https://paperswithcode.com/sota/machine-translation-on-wmt2014-english-french?p=very-deep-transformers-for-neural-machine) 2 | ![PyTorch](https://img.shields.io/badge/PyTorch-%23EE4C2C.svg?style=flat&logo=PyTorch&logoColor=white) 3 | ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/admin-torch) 4 | ![GitHub](https://img.shields.io/github/license/microsoft/admin-Torch) 5 | [![Maintenance](https://img.shields.io/badge/doc-yes-success.svg)](https://microsoft.github.io/admin-torch/) 6 | ![PyPI](https://img.shields.io/pypi/v/admin-torch) 7 | 8 |

<h1 align="center">Admin-Torch</h1>
9 | <h3 align="center">Transformers Training <b>Stabilized</b></h3>
10 | 
11 | <p align="center">
12 | <a href="#whats-new">What's New?</a> •
13 | <a href="#key-idea">Key Idea</a> •
14 | <a href="#how-to-use">How To Use</a> •
15 | <a href="https://microsoft.github.io/admin-torch/">Docs</a> •
16 | <a href="https://github.com/microsoft/admin-torch/tree/main/example">Examples</a> •
17 | <a href="#citation">Citation</a> •
18 | <a href="https://github.com/microsoft/admin-torch/blob/main/LICENSE">License</a>
19 | </p>
20 | 
21 | Here, we provide a plug-in-and-play implementation of [Admin](https://arxiv.org/abs/2004.08249),
22 | which stabilizes previously-diverged Transformer training and achieves better performance,
23 | **without introducing additional hyper-parameters**. The design of Admin is half-precision
24 | friendly and can be **reparameterized into the original Transformer**.
25 | 
26 | ______________________________________________________________________
27 | ## What's New?
28 | 
29 | Beyond the [original Admin implementation](https://github.com/LiyuanLucasLiu/Transformer-Clinic):
30 | 1. `admin-torch` removes the profiling stage and is **plug-in-and-play**.
31 | 2. `admin-torch`'s implementation is **more robust** (see below).
32 | 
33 | Comparison with the [DeepNet Init](https://arxiv.org/abs/2203.00555) and the [Original Admin Init](https://github.com/LiyuanLucasLiu/Transformer-Clinic)
34 | (on WMT'17 En-De):
35 | 
36 | | | Regular batch size (8x4096) | Huge batch size (128x4096) |
37 | |---------------|--------------------|------------------|
38 | | [Original Admin](https://github.com/LiyuanLucasLiu/Transformer-Clinic)| ✅ | ❌ |
39 | | [DeepNet](https://arxiv.org/abs/2203.00555) | ❌ | ✅ |
40 | | `admin-torch` | ✅ | ✅ |
41 | 
42 | More details can be found in [our example](https://github.com/microsoft/admin-torch/tree/main/example).
43 | 
44 | ## Key Idea
45 | <h5 align="center">What complicates Transformer training?</h5>
46 | 
47 | For a Transformer f, an input x, and a randomly initialized weight w, we describe its stability (``output_change_scale``) as
48 | 
49 | <p align="center">E[ |f(x, w) - f(x, w + δ)|<sub>2</sub><sup>2</sup> ], where δ is a random perturbation</p>
50 | 
51 | 
52 | 
53 | In [our study](https://arxiv.org/abs/2004.08249), we show that an original n-layer Transformer's
54 | ``output_change_scale`` is ``O(n)``, which destabilizes its training. Admin stabilizes Transformer
55 | training by regulating this scale to ``O(logn)`` or ``O(1)``.
56 | 
57 | <p align="center"><img src="https://raw.githubusercontent.com/microsoft/admin-torch/main/doc/source/_static/output_change.png" width="100%"/></p>
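To make the regulation concrete, the snippet below reproduces the omega-initialization rule from `admin_torch/admin.py` (the depths used here are arbitrary examples):

```python
import math

# omega per output-change scale (see calculate_init in admin_torch/admin.py):
#   O(logn): omega^2 = (n + 1) / log(n + 1) - 1
#   O(1):    omega^2 = n
#   O(n):    omega   = 1 (the ordinary residual connection)
for n in (12, 36, 200):  # total residual layers, i.e., 2x the number of encoder layers
    omega_logn = ((n + 1) / math.log(n + 1) - 1) ** 0.5
    omega_o1 = n ** 0.5
    print(f"n={n:3d}  omega[O(logn)]={omega_logn:.2f}  omega[O(1)]={omega_o1:.2f}")
```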
58 | 
59 | More details can be found in our [paper](https://arxiv.org/abs/2004.08249).
60 | 
61 | 
62 | ## How to use?
63 | 
64 | ### install
65 | ```
66 | pip install admin-torch==0.1.0
67 | ```
68 | 
69 | ### import
70 | ```
71 | import admin_torch
72 | ```
73 | 
74 | ### enjoy
75 | 
76 | ```diff
77 |  def __init__(self, ...):
78 |      ...
79 | +    self.residual = admin_torch.as_module(self.number_of_sub_layers)
80 |      ...
81 | 
82 |  def forward(self, ...):
83 |      ...
84 | -    x = x + self.f(x)
85 | +    x = self.residual(x, self.f(x))
86 |      x = self.LN(x)
87 |      ...
88 | ```
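For a fully self-contained version of the pattern above, here is a minimal runnable sketch adapted from the `admin_torch.as_module` docstring (the dimensions and module names below are our placeholders, not part of the API):

```python
import torch
import torch.nn as nn
import admin_torch

class EncoderLayer(nn.Module):
    def __init__(self, embed_dim=512, ffn_dim=2048, num_heads=8, encoder_layers=6):
        super().__init__()
        num_res_layers = 2 * encoder_layers  # two residual connections per encoder layer
        self.attn = nn.MultiheadAttention(embed_dim, num_heads)
        self.residual_attn = admin_torch.as_module(num_res_layers)
        self.ln_attn = nn.LayerNorm(embed_dim)
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, ffn_dim), nn.ReLU(), nn.Linear(ffn_dim, embed_dim),
        )
        self.residual_ffn = admin_torch.as_module(num_res_layers)
        self.ln_ffn = nn.LayerNorm(embed_dim)

    def forward(self, x):
        f_x, _ = self.attn(x, x, x)  # self-attention sub-layer
        x = self.ln_attn(self.residual_attn(x, f_x))
        f_x = self.ffn(x)            # feed-forward sub-layer
        x = self.ln_ffn(self.residual_ffn(x, f_x))
        return x

x = torch.randn(10, 2, 512)  # (seq_len, batch, embed_dim)
print(EncoderLayer()(x).shape)  # torch.Size([10, 2, 512])
```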
89 | 
90 | An elaborated example can be found in [our doc](https://microsoft.github.io/admin-torch/), and a real working example can be found at [LiyuanLucasLiu/fairseq](https://github.com/LiyuanLucasLiu/fairseq/commit/33ad76ae5dc927bc32b9594f9728a367c45680bb) (the training recipe is available in [our example](https://github.com/microsoft/admin-torch/tree/main/example)).
91 | 
92 | ## Citation
93 | Please cite the following papers if you find our model useful. Thanks!
94 | 
95 | >Liyuan Liu, Xiaodong Liu, Jianfeng Gao, Weizhu Chen, and Jiawei Han (2020). Understanding the Difficulty of Training Transformers. Proc. 2020 Conf. on Empirical Methods in Natural Language Processing (EMNLP'20).
96 | ```
97 | @inproceedings{liu2020admin,
98 |   title={Understanding the Difficulty of Training Transformers},
99 |   author = {Liu, Liyuan and Liu, Xiaodong and Gao, Jianfeng and Chen, Weizhu and Han, Jiawei},
100 |   booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP 2020)},
101 |   year={2020}
102 | }
103 | ```
104 | > Xiaodong Liu, Kevin Duh, Liyuan Liu, and Jianfeng Gao (2020). Very Deep Transformers for Neural Machine Translation. arXiv preprint arXiv:2008.07772 (2020).
105 | ```
106 | @inproceedings{liu_deep_2020,
107 |   author = {Liu, Xiaodong and Duh, Kevin and Liu, Liyuan and Gao, Jianfeng},
108 |   booktitle = {arXiv:2008.07772 [cs]},
109 |   title = {Very Deep Transformers for Neural Machine Translation},
110 |   year = {2020}
111 | }
112 | ```
113 | 
--------------------------------------------------------------------------------
/example/average_checkpoints.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (c) 2017-present, Facebook, Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the license found in the LICENSE file in
6 | # the root directory of this source tree. An additional grant of patent rights
7 | # can be found in the PATENTS file in the same directory.
8 | 
9 | import argparse
10 | import collections
11 | import torch
12 | import os
13 | import re
14 | 
15 | 
16 | def average_checkpoints(inputs):
17 |     """Loads checkpoints from inputs and returns a model with averaged weights.
18 | 
19 |     Args:
20 |       inputs: An iterable of string paths of checkpoints to load from.
21 | 
22 |     Returns:
23 |       A dict of string keys mapping to various values. The 'model' key
24 |       from the returned dict should correspond to an OrderedDict mapping
25 |       string parameter names to torch Tensors.
26 |     """
27 |     params_dict = collections.OrderedDict()
28 |     params_keys = None
29 |     new_state = None
30 |     num_models = len(inputs)
31 | 
32 |     for f in inputs:
33 |         state = torch.load(
34 |             f,
35 |             map_location=(
36 |                 lambda s, _: torch.serialization.default_restore_location(s, 'cpu')
37 |             ),
38 |         )
39 |         # Copies over the settings from the first checkpoint
40 |         if new_state is None:
41 |             new_state = state
42 | 
43 |         model_params = state['model']
44 | 
45 |         model_params_keys = list(model_params.keys())
46 |         if params_keys is None:
47 |             params_keys = model_params_keys
48 |         elif params_keys != model_params_keys:
49 |             raise KeyError(
50 |                 'For checkpoint {}, expected list of params: {}, '
51 |                 'but found: {}'.format(f, params_keys, model_params_keys)
52 |             )
53 | 
54 |         for k in params_keys:
55 |             p = model_params[k]
56 |             if isinstance(p, torch.HalfTensor):
57 |                 p = p.float()
58 |             if k not in params_dict:
59 |                 params_dict[k] = p.clone()
60 |                 # NOTE: clone() is needed in case p is a shared parameter
61 |             else:
62 |                 params_dict[k] += p
63 | 
64 |     averaged_params = collections.OrderedDict()
65 |     for k, v in params_dict.items():
66 |         averaged_params[k] = v
67 |         averaged_params[k].div_(num_models)
68 |     new_state['model'] = averaged_params
69 |     return new_state
70 | 
71 | 
72 | def last_n_checkpoints(paths, n, update_based, upper_bound=None):
73 |     assert len(paths) == 1
74 |     path = paths[0]
75 |     if update_based:
76 |         pt_regexp = re.compile(r'checkpoint_\d+_(\d+)\.pt')
77 |     else:
78 |         pt_regexp = re.compile(r'checkpoint(\d+)\.pt')
79 |     files = os.listdir(path)
80 | 
81 |     entries = []
82 |     for f in files:
83 |         m = pt_regexp.fullmatch(f)
84 |         if m is not None:
85 |             sort_key = int(m.group(1))
86 |             if upper_bound is None or sort_key <= upper_bound:
87 |                 entries.append((sort_key, m.group(0)))
88 |     if len(entries) < n:
89 |         raise Exception('Found {} checkpoint files but need at least {}'.format(len(entries), n))
90 |     return [os.path.join(path, x[1]) for x in sorted(entries, reverse=True)[:n]]
91 | 
92 | 
93 | def main():
94 |     parser = argparse.ArgumentParser(
95 |         description='Tool to average the params of input checkpoints to '
96 |                     'produce a new checkpoint',
97 |     )
98 |     # fmt: off
99 |     parser.add_argument('--inputs', required=True, nargs='+',
100 |                         help='Input checkpoint file paths.')
101 |     parser.add_argument('--output', required=True, metavar='FILE',
102 |                         help='Write the new checkpoint containing the averaged weights to this path.')
103 |     num_group = parser.add_mutually_exclusive_group()
104 |     num_group.add_argument('--num-epoch-checkpoints', type=int,
105 |                            help='if set, will try to find checkpoints with names checkpoint_xx.pt in the path specified by input, '
106 |                                 'and average last this many of them.')
107 |     num_group.add_argument('--num-update-checkpoints', type=int,
108 |                            help='if set, will try to find checkpoints with names checkpoint_ee_xx.pt in the path specified by input, '
109 |                                 'and average last this many of them.')
110 |     parser.add_argument('--checkpoint-upper-bound', type=int,
111 |                         help='when using --num-epoch-checkpoints, this will set an upper bound on which checkpoint to use, '
112 |                              'e.g., with --num-epoch-checkpoints=10 --checkpoint-upper-bound=50, checkpoints 41-50 would be averaged.')
113 |     # fmt: on
114 |     args = parser.parse_args()
115 |     print(args)
116 | 
117 |     num = None
118 |     is_update_based = False
119 |     if args.num_update_checkpoints is not None:
120 |         num = args.num_update_checkpoints
121 |         is_update_based = True
122 |     elif args.num_epoch_checkpoints is not None:
123 |         num = 
args.num_epoch_checkpoints 124 | 125 | assert args.checkpoint_upper_bound is None or args.num_epoch_checkpoints is not None, \ 126 | '--checkpoint-upper-bound requires --num-epoch-checkpoints' 127 | assert args.num_epoch_checkpoints is None or args.num_update_checkpoints is None, \ 128 | 'Cannot combine --num-epoch-checkpoints and --num-update-checkpoints' 129 | 130 | if num is not None: 131 | args.inputs = last_n_checkpoints( 132 | args.inputs, num, is_update_based, upper_bound=args.checkpoint_upper_bound, 133 | ) 134 | print('averaging checkpoints: ', args.inputs) 135 | 136 | new_state = average_checkpoints(args.inputs) 137 | torch.save(new_state, args.output) 138 | print('Finished writing averaged checkpoint to {}.'.format(args.output)) 139 | 140 | 141 | if __name__ == '__main__': 142 | main() 143 | -------------------------------------------------------------------------------- /doc/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Wrapper documentation build configuration file, created by 5 | # sphinx-quickstart on Thu Sep 14 03:49:01 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | 20 | import os 21 | import sys 22 | 23 | sys.path.insert(0, os.path.abspath('../..')) 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [ 35 | 'sphinx.ext.autodoc', 36 | 'sphinx.ext.autosummary', 37 | 'sphinx.ext.doctest', 38 | 'sphinx.ext.intersphinx', 39 | 'sphinx.ext.todo', 40 | 'sphinx.ext.coverage', 41 | 'sphinx.ext.mathjax', 42 | 'sphinx.ext.napoleon', 43 | 'sphinx.ext.viewcode', 44 | 'sphinx.ext.githubpages', 45 | ] 46 | 47 | napoleon_use_ivar = True 48 | 49 | # Add any paths that contain templates here, relative to this directory. 50 | templates_path = ['_templates'] 51 | 52 | # The suffix(es) of source filenames. 53 | # You can specify multiple suffix as a list of string: 54 | # 55 | # source_suffix = ['.rst', '.md'] 56 | source_suffix = '.rst' 57 | 58 | # The master toctree document. 59 | master_doc = 'contents' 60 | 61 | # General information about the project. 62 | project = 'Admin-Torch' 63 | copyright = '2022, Liyuan Liu' 64 | author = 'Liyuan Liu' 65 | 66 | # The version info for the project you're documenting, acts as replacement for 67 | # |version| and |release|, also used in various other places throughout the 68 | # built documents. 69 | # 70 | # The short X.Y version. 71 | version = '' 72 | # The full version, including alpha/beta/rc tags. 73 | release = '' 74 | 75 | # The language for content autogenerated by Sphinx. Refer to documentation 76 | # for a list of supported languages. 
77 | # 78 | # This is also used if you do content translation via gettext catalogs. 79 | # Usually you set "language" from the command line for these cases. 80 | language = None 81 | 82 | # List of patterns, relative to source directory, that match files and 83 | # directories to ignore when looking for source files. 84 | # This patterns also effect to html_static_path and html_extra_path 85 | exclude_patterns = [] 86 | 87 | # The name of the Pygments (syntax highlighting) style to use. 88 | pygments_style = 'sphinx' 89 | 90 | # If true, `todo` and `todoList` produce output, else they produce nothing. 91 | todo_include_todos = False 92 | 93 | # -- Options for HTML output ---------------------------------------------- 94 | 95 | # The theme to use for HTML and HTML Help pages. See the documentation for 96 | # a list of builtin themes. 97 | # 98 | 99 | html_theme = 'sphinx_rtd_theme' 100 | # html_theme = 'sphinx_documatt_theme' 101 | 102 | # Theme options are theme-specific and customize the look and feel of a theme 103 | # further. For a list of options available for each theme, see the 104 | # documentation. 105 | # 106 | # html_theme_options = {} 107 | html_theme_options = { 108 | 'globaltoc_maxdepth': 5, 109 | } 110 | 111 | # Add any paths that contain custom static files (such as style sheets) here, 112 | # relative to this directory. They are copied after the builtin static files, 113 | # so a file named "default.css" will overwrite the builtin "default.css". 114 | html_static_path = ['_static'] 115 | 116 | # Custom sidebar templates, must be a dictionary that maps document names 117 | # to template names. 118 | # 119 | # This is required for the alabaster theme 120 | # # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars 121 | html_sidebars = { 122 | '**': ['globaltoc.html', 'relations.html', 'sourcelink.html', 'searchbox.html'] 123 | } 124 | 125 | # -- Options for HTMLHelp output ------------------------------------------ 126 | 127 | # Output file base name for HTML help builder. 128 | htmlhelp_basename = 'Admin-Torch' 129 | 130 | # -- Options for LaTeX output --------------------------------------------- 131 | 132 | latex_elements = { 133 | # The paper size ('letterpaper' or 'a4paper'). 134 | # 135 | # 'papersize': 'letterpaper', 136 | 137 | # The font size ('10pt', '11pt' or '12pt'). 138 | # 139 | # 'pointsize': '10pt', 140 | 141 | # Additional stuff for the LaTeX preamble. 142 | # 143 | # 'preamble': '', 144 | 145 | # Latex figure (float) alignment 146 | # 147 | # 'figure_align': 'htbp', 148 | } 149 | 150 | # Grouping the document tree into LaTeX files. List of tuples 151 | # (source start file, target name, title, 152 | # author, documentclass [howto, manual, or own class]). 153 | latex_documents = [ 154 | (master_doc, 'admin_torch.tex', 'Admin-Torch Documentation', 155 | 'Admin-Torch', 'manual'), 156 | ] 157 | 158 | # -- Options for manual page output --------------------------------------- 159 | 160 | # One entry per manual page. List of tuples 161 | # (source start file, name, description, authors, manual section). 162 | man_pages = [ 163 | (master_doc, 'Admin-Torch', 'Admin-Torch Documentation', 164 | [author], 1) 165 | ] 166 | 167 | # -- Options for Texinfo output ------------------------------------------- 168 | 169 | # Grouping the document tree into Texinfo files. 
List of tuples
170 | # (source start file, target name, title, author,
171 | #  dir menu entry, description, category)
172 | texinfo_documents = [
173 |     (master_doc, 'Admin-Torch', 'Admin-Torch Documentation',
174 |      author, 'Admin-Torch', 'Adaptive Model Initialization.',
175 |      'Miscellaneous'),
176 | ]
177 | 
178 | autodoc_mock_imports = ['torch']
179 | 
180 | intersphinx_mapping = {
181 |     'python': ('https://docs.python.org/3', None),
182 |     'torch': ('http://pytorch.org/docs/master', None)
183 | }
184 | 
185 | autodoc_member_order = 'bysource'
--------------------------------------------------------------------------------
/example/README.md:
--------------------------------------------------------------------------------
1 | # Table of Contents
2 | 
3 | - [Real example: `admin-torch` on WMT'14 En-De](#real-example-admin-torch-on-wmt14-en-de)
4 | - [Comparison with original Admin and DeepNet](#comparison-with-original-admin-and-deepnet-on-wmt17-en-de)
5 | 
6 | # Real example: `admin-torch` on WMT'14 En-De
7 | 
8 | As an example, we apply `admin_torch` to `fairseq` and train a Transformer on WMT'14 En-De.
9 | 
10 | > Note: the efforts to incorporate `admin-torch` into fairseq are summarized in [this commit](https://github.com/LiyuanLucasLiu/fairseq/commit/33ad76ae5dc927bc32b9594f9728a367c45680bb).
11 | 
12 | ## 1. Pre-processing
13 | 
14 | ### 1.1. Data Preparation
15 | 
16 | Please refer to [the Transformer-Clinic repo](https://github.com/LiyuanLucasLiu/Transformer-Clinic/blob/master/pre-process/wmt14en-de.sh) for data preparation.
17 | 
18 | ### 1.2. Package Install
19 | 
20 | ```
21 | pip install admin_torch==0.1.0
22 | pip uninstall fairseq
23 | pip install https://github.com/LiyuanLucasLiu/fairseq/archive/refs/tags/admin-torch.zip
24 | ```
25 | 
26 | ## 2. Training and Evaluation
27 | 
28 | ### 2.1. Training
29 | ```
30 | bash train_wmt_en-de.sh $PATH-to-WMT14 $NUMBER_LAYER $OUTPUT_PATH
31 | ```
32 | Note that `$PATH-to-WMT14` is the path to the `wmt14_en_de_joined_dict` data
33 | folder from data preparation, `$NUMBER_LAYER` is the encoder/decoder layer number, and
34 | `$OUTPUT_PATH` is the path where you want to save your checkpoints.
35 | 
36 | ### 2.2. Evaluation
37 | ```
38 | bash eval_wmt_en-de.sh $PATH-to-WMT14 None $OUTPUT_PATH
39 | ```
40 | Note that `$PATH-to-WMT14` is the path to the `wmt14_en_de_joined_dict` data folder
41 | from data preparation, and `$OUTPUT_PATH` is the path used in the training step.
42 | 
43 | ## 3. Pre-trained Weights
44 | 
45 | | Layer Number | BLEU | PATH |
46 | |--------------|-------|------|
47 | | 6L-6L | 27.84 | TBD |
48 | | 18L-18L | 28.91 | TBD |
49 | | 100L-100L* | 29.65 | TBD |
50 | 
51 | *: trained with the [huge-batch-size setting](#comparison-with-original-admin-and-deepnet-on-wmt17-en-de),
52 | but only for 40 epochs, due to the huge cost of the training.
53 | 
54 | ## 4. Discussion on the `admin-torch` setting
55 | 
56 | `admin_torch.as_module` can be configured by changing `output_change_scale` and
57 | `as_parameter`. `output_change_scale` can be set to `O(1)` for additional stability, but this
58 | results in a performance drop in our experiments. `as_parameter` can be set to `False` to
59 | make `omega` (the shortcut-connection scaler) a constant (no updates).
Their performance is listed
60 | below:
61 | 
62 | | Layer Number | Output Change | Omega | BLEU |
63 | |-----------------|---------------|-----------------|-------|
64 | | 6L-6L | O(1) | as a constant | 27.71 |
65 | | 6L-6L | O(1) | as a parameter | 27.79 |
66 | | 6L-6L | O(logn) | as a constant | 27.83 |
67 | | 6L-6L | O(logn) | as a parameter | 27.84 |
68 | | 18L-18L | O(1) | as a constant | 28.66 |
69 | | 18L-18L | O(1) | as a parameter | 28.89 |
70 | | 18L-18L | O(logn) | as a constant | 28.78 |
71 | | 18L-18L | O(logn) | as a parameter | 28.91 |
72 | 
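For reference, the four settings in the table correspond to calls like the following (a sketch; the `num_res_layers` and `embed_dim` values are placeholders for an 18L-18L model):

```python
import admin_torch

num_res_layers, embed_dim = 2 * 18, 512  # an 18-layer stack has 2 * 18 residual connections

# O(logn) output change, omega as a (vector) parameter -- the best-BLEU row above
residual = admin_torch.as_module(num_res_layers, output_change_scale='O(logn)',
                                 as_parameter=True, embed_dim=embed_dim)

# O(1) output change, omega as a constant (registered as a buffer, no updates)
residual = admin_torch.as_module(num_res_layers, output_change_scale='O(1)')
```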
73 | # Comparison with original Admin and DeepNet on WMT'17 En-De
74 | 
75 | We choose to make comparisons with DeepNet and the original Admin implementation on WMT'17 En-De,
76 | the dataset used in the DeepNet paper.
77 | 
78 | We noticed that the training configuration in the DeepNet paper is different from the setting used
79 | in the original Admin repo. The major difference is the batch size (i.e., regular batch size vs.
80 | huge batch size). We refer to the setting used in the DeepNet paper as `Huge batch size (128x4096)`,
81 | and to the setting used in the original Admin repo as `Regular batch size (8x4096)`.
82 | 
83 | We find that each of them only works in its own setting.
84 | 
85 | | | Regular batch size (8x4096) | Huge batch size (128x4096) |
86 | |---------------|--------------------|------------------|
87 | | [Original Admin](https://github.com/LiyuanLucasLiu/Transformer-Clinic)| ✅ | ❌ |
88 | | [DeepNet](https://arxiv.org/abs/2203.00555) | ❌ | ✅ |
89 | | `admin-torch` | ✅ | ✅ |
90 | 
91 | Here, we re-implemented Admin as `admin-torch`, and the new `admin-torch`
92 | implementation works well in both settings.
93 | 
94 | All implementations are publicly released (as elaborated below).
95 | 
96 | 
97 | ## 1. Data Preparation
98 | Please refer to the DeepNet paper for data preparation. Here we used the same data shared by the
99 | DeepNet team.
100 | 
101 | ## 2. Original Admin and DeepNet
102 | 
103 | ### 2.1. Implementation Download and Code Install
104 | ```
105 | pip uninstall fairseq
106 | git clone https://github.com/LiyuanLucasLiu/Transformer-Clinic.git
107 | cd Transformer-Clinic/fairseq
108 | pip install --editable .
109 | ```
110 | 
111 | ### 2.2. Training
112 | 
113 | #### 2.2.1. Original Admin
114 | ```
115 | # Before running the training, the original Admin requires profiling
116 | # the network. The profiling result for 100L-100L is included in this repo
117 | # (i.e., example/profile.ratio.init). The command to generate this profile
118 | # can be found at https://github.com/LiyuanLucasLiu/Transformer-Clinic/blob/master/nmt-experiments/wmt14_en-de.md#100l-100l-admin-without-any-hyper-parameter-tuning
119 | 
120 | # regular batch size (4096 x 8)
121 | bash train_wmt_en-de.sh $PATH-to-WMT17 100 $OUTPUT_PATH_REG "--init-type adaptive"
122 | 
123 | # huge batch size (4096 x 128)
124 | bash train_wmt_en-de_huge_batch.sh $PATH-to-WMT17 100 $OUTPUT_PATH_HUG "--init-type adaptive"
125 | 
126 | # evaluate
127 | bash eval_wmt_en-de.sh $PATH-to-WMT17 None $OUTPUT_PATH_HUG/REG 45 10
128 | ```
129 | 
130 | #### 2.2.2. DeepNet
131 | ```
132 | # regular batch size (4096 x 8)
133 | bash train_wmt_en-de.sh $PATH-to-WMT17 100 $OUTPUT_PATH_REG "--init-type deepnet"
134 | 
135 | # huge batch size (4096 x 128)
136 | bash train_wmt_en-de_huge_batch.sh $PATH-to-WMT17 100 $OUTPUT_PATH_HUG "--init-type deepnet"
137 | 
138 | # evaluate
139 | bash eval_wmt_en-de.sh $PATH-to-WMT17 None $OUTPUT_PATH_HUG/REG 45 10
140 | ```
141 | 
142 | ## 3. `admin-torch`
143 | 
144 | ### 3.1. Package Install
145 | 
146 | ```
147 | pip install admin_torch==0.1.0
148 | pip uninstall fairseq
149 | pip install https://github.com/LiyuanLucasLiu/fairseq/archive/refs/tags/admin-torch.zip
150 | ```
151 | 
152 | ### 3.2. Training and Evaluation
153 | 
154 | ```
155 | # regular batch size (4096 x 8)
156 | bash train_wmt_en-de.sh $PATH-to-WMT17 100 $OUTPUT_PATH_REG
157 | 
158 | # huge batch size (4096 x 128)
159 | bash train_wmt_en-de_huge_batch.sh $PATH-to-WMT17 100 $OUTPUT_PATH_HUG
160 | 
161 | # evaluate
162 | bash eval_wmt_en-de.sh $PATH-to-WMT17 None $OUTPUT_PATH_HUG/REG 45 10
163 | ```
--------------------------------------------------------------------------------
/admin_torch/admin.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 | 
4 | import torch
5 | import math
6 | 
7 | class OmegaResidual(torch.nn.Module):
8 |     """
9 |     Residual connection module with shortcut connection rescaling.
10 | 
11 |     Parameters
12 |     ----------
13 |     init_value: ``float``, required.
14 |         The initialization value of the shortcut connection rescalar, omega.
15 |     as_parameter: ``bool``, optional (default = False).
16 |         Whether to set the rescalar as a trainable parameter. Note that, when set as a trainable
17 |         parameter, the rescalar would be set as a vector (similar to the weight vector in layer
18 |         norm), and the embed_dim input is required.
19 |     embed_dim: ``int``, optional (default = None).
20 |         The hidden state dimension of the shortcut connection. This field is required and only used
21 |         when ``as_parameter == True``.
22 |     """
23 | 
24 |     def __init__(self, init_value, as_parameter=False, embed_dim=None):
25 |         super().__init__()
26 |         if as_parameter:
27 |             assert embed_dim is not None, 'embed_dim is required when as_parameter is set as True'
28 |             self.omega = torch.nn.Parameter(torch.ones(embed_dim))
29 |             self.omega.data.fill_(init_value)
30 |             self.forward = self.forward_omega
31 |         else:
32 |             self.register_buffer('omega', torch.FloatTensor([init_value]))
33 |             if 1.0 == init_value:
34 |                 self.forward = self.forward_original
35 |             else:
36 |                 self.forward = self.forward_omega
37 | 
38 |     def forward(self, x, f_x):
39 |         """
40 |         Calculate x * omega + f_x. The output shape is the same as the input shape.
41 | 
42 |         When omega is set to be a constant 1 (``as buffer`` and ``O(n)`` output change), the
43 |         ``OmegaResidual`` would downgrade to the ordinary residual module, and x + f_x would be
44 |         calculated instead.
45 |         """
46 |         raise NotImplementedError("Placeholder forward function used in OmegaResidual")
47 | 
48 |     def forward_original(self, x, f_x):
49 |         return x + f_x
50 | 
51 |     def forward_omega(self, x, f_x):
52 |         return x * self.omega + f_x
53 | 
54 | def calculate_init(
55 |     num_res_layers,
56 |     output_change_scale='O(logn)',
57 | ) -> float:
58 |     r"""
59 |     Calculate initialization for omega.
60 | 
61 |     Parameters
62 |     ----------
63 |     num_res_layers: ``int``, required.
64 |         The total number of residual layers. A typical n-layer Transformer encoder has 2n residual layers.
65 |     output_change_scale: ``str``, optional (default = ``'O(logn)'``).
66 |         The desired output change scale at initialization. Only ``'O(n)'``, ``'O(logn)'`` / ``'default'``,
67 |         and ``'O(1)'`` are supported.
68 | 
69 |     Returns
70 |     -------
71 |     float: The initialization value for omega.
72 |     """
73 |     if 'O(logn)' == output_change_scale or 'default' == output_change_scale:
74 |         omega_value = (num_res_layers + 1) / math.log(num_res_layers + 1) - 1
75 |     elif 'O(n)' == output_change_scale:
76 |         omega_value = 1.
77 |     else:
78 |         assert 'O(1)' == output_change_scale, \
79 |             'only O(n), O(logn), and O(1) output changes are supported.'
80 |         omega_value = num_res_layers
81 |     return omega_value ** 0.5
82 | 
83 | def as_module(
84 |     num_res_layers,
85 |     output_change_scale='default',
86 |     as_parameter=False,
87 |     embed_dim=None
88 | ) -> OmegaResidual:
89 |     r"""
90 |     Calculate initialization for omega and return a residual module with the initialized omega.
91 | 
92 |     Parameters
93 |     ----------
94 |     num_res_layers: ``int``, required.
95 |         The total number of residual layers. A typical n-layer Transformer encoder has 2n residual layers.
96 |     output_change_scale: ``str``, optional (default = ``'default'``, i.e., ``'O(logn)'``).
97 |         The desired output change scale at initialization. Only ``'O(n)'``, ``'O(logn)'`` / ``'default'``,
98 |         and ``'O(1)'`` are supported.
99 |     as_parameter: ``bool``, optional (default = False).
100 |         Whether to set the rescalar as a trainable parameter. Note that, when set as a trainable
101 |         parameter, the rescalar would be set as a vector (similar to the weight vector in
102 |         layer norm), and the embed_dim input is required.
103 |     embed_dim: ``int``, optional (default = None).
104 |         The hidden state dimension of the shortcut connection. This field is required and only
105 |         used when as_parameter == True.
106 | 
107 |     Returns
108 |     -------
109 |     admin_torch.OmegaResidual: An ``OmegaResidual`` module with the properly initialized omega inside.
110 | 
111 |     Example
112 |     -------
113 | 
114 |     .. highlight:: python
115 |     .. code-block:: python
116 | 
117 |         import torch.nn as nn
118 |         import admin_torch
119 | 
120 |         class TransformerEncoderLayer(nn.Module):
121 | 
122 |             def __init__(self, cfg):
123 |                 super().__init__()
124 | 
125 |                 num_layer = 2 * cfg.encoder_layers # number of residual layers
126 | 
127 |                 self.attn = nn.MultiheadAttention(cfg.embed_dim, cfg.num_heads)
128 |                 self.residual_attn = admin_torch.as_module(num_layer)
129 |                 self.ln_attn = nn.LayerNorm(cfg.embed_dim)
130 | 
131 |                 self.ffn = nn.Sequential(
132 |                     nn.Linear(cfg.embed_dim, cfg.feedforward_dim),
133 |                     nn.ReLU(),
134 |                     nn.Linear(cfg.feedforward_dim, cfg.embed_dim)
135 |                 )
136 |                 self.residual_ffn = admin_torch.as_module(num_layer)
137 |                 self.ln_ffn = nn.LayerNorm(cfg.embed_dim)
138 | 
139 |             def forward(self, x):
140 | 
141 |                 f_x, _ = self.attn(x, x, x)
142 |                 x = self.residual_attn(x, f_x)
143 |                 x = self.ln_attn(x)
144 | 
145 |                 f_x = self.ffn(x)
146 |                 x = self.residual_ffn(x, f_x)
147 |                 x = self.ln_ffn(x)
148 | 
149 |                 return x
150 |     """
151 |     omega_value = calculate_init(num_res_layers, output_change_scale)
152 |     return OmegaResidual(omega_value, as_parameter=as_parameter, embed_dim=embed_dim)
153 | 
154 | def as_buffer(
155 |     network,
156 |     buffer_name,
157 |     num_res_layers,
158 |     output_change_scale='default',
159 | ) -> None:
160 |     r"""
161 |     Calculate initialization for omega and *register* omega as a buffer (not trainable).
162 | 
163 |     Parameters
164 |     ----------
165 |     network: ``torch.nn.Module``, required.
166 |         The ``torch.nn.Module`` that contains the residual network. This is where the omega would
167 |         be registered to.
168 |     buffer_name: ``str``, required.
169 |         The name of omega (as a buffer). The omega can be accessed in the network, using the
170 |         given name.
171 |     num_res_layers: ``int``, required.
172 |         The total number of residual layers. A typical n-layer Transformer encoder has 2n residual layers.
173 |     output_change_scale: ``str``, optional (default = ``'default'``, i.e., ``'O(logn)'``).
174 |         The desired output change scale at initialization. Only ``'O(n)'``, ``'O(logn)'`` / ``'default'``,
175 |         and ``'O(1)'`` are supported.
176 | 
177 |     Returns
178 |     -------
179 |     None: No returns. The initialized omega would be registered as a buffer within `network`.
180 | 
181 |     Example
182 |     -------
183 | 
184 |     .. highlight:: python
185 |     .. code-block:: python
186 | 
187 |         import torch.nn as nn
188 |         import admin_torch
189 | 
190 |         class TransformerEncoderLayer(nn.Module):
191 | 
192 |             def __init__(self, cfg):
193 |                 super().__init__()
194 | 
195 |                 num_layer = 2 * cfg.encoder_layers # number of residual layers
196 | 
197 |                 self.attn = nn.MultiheadAttention(cfg.embed_dim, cfg.num_heads)
198 |                 admin_torch.as_buffer(self, 'attn_omega', num_layer)
199 |                 self.ln_attn = nn.LayerNorm(cfg.embed_dim)
200 | 
201 |                 self.ffn = nn.Sequential(
202 |                     nn.Linear(cfg.embed_dim, cfg.feedforward_dim),
203 |                     nn.ReLU(),
204 |                     nn.Linear(cfg.feedforward_dim, cfg.embed_dim)
205 |                 )
206 |                 admin_torch.as_buffer(self, 'ffn_omega', num_layer)
207 |                 self.ln_ffn = nn.LayerNorm(cfg.embed_dim)
208 | 
209 |             def forward(self, x):
210 | 
211 |                 f_x, _ = self.attn(x, x, x)
212 |                 x = x * self.attn_omega + f_x
213 |                 x = self.ln_attn(x)
214 | 
215 |                 f_x = self.ffn(x)
216 |                 x = x * self.ffn_omega + f_x
217 |                 x = self.ln_ffn(x)
218 | 
219 |                 return x
220 |     """
221 |     assert isinstance(network, torch.nn.Module), \
222 |         'the input network has to be a torch.nn.Module object'
223 |     omega_value = calculate_init(num_res_layers, output_change_scale)
224 |     network.register_buffer(buffer_name, torch.FloatTensor([omega_value]))
225 | 
226 | def as_parameter(
227 |     network,
228 |     parameter_name,
229 |     num_res_layers,
230 |     embed_dim,
231 |     output_change_scale='default',
232 | ) -> None:
233 |     r"""
234 |     Calculate initialization for omega and *register* omega as a parameter (trainable).
235 | 
236 |     Parameters
237 |     ----------
238 |     network: ``torch.nn.Module``, required.
239 |         The ``torch.nn.Module`` that contains the residual network. This is where the omega would
240 |         be registered to.
241 |     parameter_name: ``str``, required.
242 |         The name of omega (as a parameter). The omega can be accessed in the network, using the
243 |         given name.
244 |     num_res_layers: ``int``, required.
245 |         The total number of residual layers. A typical n-layer Transformer encoder has 2n residual layers.
246 |     embed_dim: ``int``, required.
247 |         The hidden state dimension of the shortcut connection.
248 |     output_change_scale: ``str``, optional (default = ``'default'``, i.e., ``'O(logn)'``).
249 |         The desired output change scale at initialization. Only ``'O(n)'``, ``'O(logn)'`` / ``'default'``,
250 |         and ``'O(1)'`` are supported.
251 | 
252 |     Returns
253 |     -------
254 |     None: No returns. The initialized omega would be registered as a parameter within `network`.
255 | 
256 |     Example
257 |     -------
258 | 
259 |     .. highlight:: python
260 |     .. code-block:: python
261 | 
262 |         import torch.nn as nn
263 |         import admin_torch
264 | 
265 |         class TransformerEncoderLayer(nn.Module):
266 | 
267 |             def __init__(self, cfg):
268 |                 super().__init__()
269 | 
270 |                 num_layer = 2 * cfg.encoder_layers # number of residual layers
271 | 
272 |                 self.attn = nn.MultiheadAttention(cfg.embed_dim, cfg.num_heads)
273 |                 admin_torch.as_parameter(self, 'attn_omega', num_layer, cfg.embed_dim)
274 |                 self.ln_attn = nn.LayerNorm(cfg.embed_dim)
275 | 
276 |                 self.ffn = nn.Sequential(
277 |                     nn.Linear(cfg.embed_dim, cfg.feedforward_dim),
278 |                     nn.ReLU(),
279 |                     nn.Linear(cfg.feedforward_dim, cfg.embed_dim)
280 |                 )
281 |                 admin_torch.as_parameter(self, 'ffn_omega', num_layer, cfg.embed_dim)
282 |                 self.ln_ffn = nn.LayerNorm(cfg.embed_dim)
283 | 
284 |             def forward(self, x):
285 | 
286 |                 f_x, _ = self.attn(x, x, x)
287 |                 x = x * self.attn_omega + f_x
288 |                 x = self.ln_attn(x)
289 | 
290 |                 f_x = self.ffn(x)
291 |                 x = x * self.ffn_omega + f_x
292 |                 x = self.ln_ffn(x)
293 | 
294 |                 return x
295 |     """
296 |     omega_vector = torch.ones(embed_dim)
297 |     omega_vector.fill_(calculate_init(num_res_layers, output_change_scale))
298 |     network.register_parameter(parameter_name, torch.nn.Parameter(omega_vector))
--------------------------------------------------------------------------------
/example/profile.ratio.init:
--------------------------------------------------------------------------------
1 | 1 1.0
2 | 2 1.3190397024154663
3 | 3 1.4550316333770752
4 | 4 1.5123295783996582
5 | 5 1.6428167819976807
6 | 6 1.695853352546692
7 | 7 1.8099972009658813
8 | 8 1.8667889833450317
9 | 9 1.9713013172149658
10 | 10 2.028479814529419
11 | 11 2.1205849647521973
12 | 12 2.1734628677368164
13 | 13 2.2616868019104004
14 | 14 2.3189287185668945
15 | 15 2.404047966003418
16 | 16 2.456615924835205
17 | 17 2.5324201583862305
18 | 18 2.5882620811462402
19 | 19 2.669269561767578
20 | 20 2.7247939109802246
21 | 21 2.7918922901153564
22 | 22 2.8465681076049805
23 | 23 2.9145007133483887
24 | 24 2.961702585220337
25 | 25 3.027456045150757
26 | 26 3.080919027328491
27 | 27 3.1440911293029785
28 | 28 3.198875904083252
29 | 29 3.2594544887542725
30 | 30 3.306119203567505
31 | 31 3.3605387210845947
32 | 32 3.4074313640594482
33 | 33 3.4670989513397217
34 | 34 3.5158419609069824
35 | 35 3.573145627975464
36 | 36 3.621101140975952
37 | 37 3.6756269931793213
38 | 38 3.723588705062866
39 | 39 3.7751662731170654
40 | 40 3.820997714996338
41 | 41 3.874152183532715
42 | 42 3.919919729232788
43 | 43 3.971492052078247
44 | 44 4.015556335449219
45 | 45 4.062773704528809
46 | 46 4.102989673614502
47 | 47 4.147921562194824
48 | 48 4.188112735748291
49 | 49 4.236110210418701
50 | 50 4.280117988586426
51 | 51 4.3282270431518555
52 | 52 4.3700642585754395
53 | 53 4.413156986236572
54 | 54 4.457295894622803
55 | 55 4.502137184143066
56 | 56 4.543152332305908
57 | 57 4.5831618309021
58 | 58 4.6219892501831055
59 | 59 4.665818214416504
60 | 60 4.704627513885498
61 | 61 4.7454071044921875
62 | 62 4.782839298248291
63 | 63 4.824032783508301
64 | 64 4.865396499633789
65 | 65 4.905492782592773
66 | 66 4.944614410400391
67 | 67 4.986900329589844
68 | 68 5.027820110321045
69 | 69 5.068670272827148
70 | 70 5.105625152587891
71 | 71 5.1475629806518555
72 | 72 5.185201168060303
73 | 73 5.222909450531006
74 | 74 5.258800029754639
75 | 75 5.293256759643555
76 | 76 5.3247480392456055
77 | 77 5.362242221832275
78 | 78 5.398958206176758
79 | 79 5.436041831970215
80 | 80 5.472423076629639
81 | 81 5.50972318649292
82 | 82 5.542351722717285
83 | 83 
5.580074310302734 84 | 84 5.614049434661865 85 | 85 5.647655487060547 86 | 86 5.685954570770264 87 | 87 5.72280740737915 88 | 88 5.756107807159424 89 | 89 5.7939252853393555 90 | 90 5.8227972984313965 91 | 91 5.860756874084473 92 | 92 5.893098831176758 93 | 93 5.931689262390137 94 | 94 5.965618133544922 95 | 95 6.000150203704834 96 | 96 6.031996726989746 97 | 97 6.064528465270996 98 | 98 6.096097469329834 99 | 99 6.130829334259033 100 | 100 6.1620001792907715 101 | 101 6.192485809326172 102 | 102 6.223220348358154 103 | 103 6.255505084991455 104 | 104 6.285265922546387 105 | 105 6.317674160003662 106 | 106 6.347247123718262 107 | 107 6.380263805389404 108 | 108 6.4059295654296875 109 | 109 6.4350199699401855 110 | 110 6.463878631591797 111 | 111 6.495331287384033 112 | 112 6.530038356781006 113 | 113 6.560511112213135 114 | 114 6.592663288116455 115 | 115 6.6225266456604 116 | 116 6.649158477783203 117 | 117 6.678005695343018 118 | 118 6.706629753112793 119 | 119 6.735388278961182 120 | 120 6.765207767486572 121 | 121 6.796474933624268 122 | 122 6.829586505889893 123 | 123 6.8563995361328125 124 | 124 6.884836196899414 125 | 125 6.913089752197266 126 | 126 6.945504665374756 127 | 127 6.972911834716797 128 | 128 7.0005574226379395 129 | 129 7.029700756072998 130 | 130 7.057539463043213 131 | 131 7.087018966674805 132 | 132 7.115658283233643 133 | 133 7.145423889160156 134 | 134 7.176324844360352 135 | 135 7.203213214874268 136 | 136 7.2326130867004395 137 | 137 7.263206481933594 138 | 138 7.290313243865967 139 | 139 7.316440105438232 140 | 140 7.342780590057373 141 | 141 7.371157646179199 142 | 142 7.401747226715088 143 | 143 7.428928375244141 144 | 144 7.455725193023682 145 | 145 7.482446670532227 146 | 146 7.509575843811035 147 | 147 7.534206390380859 148 | 148 7.561554431915283 149 | 149 7.5872907638549805 150 | 150 7.610199928283691 151 | 151 7.634483814239502 152 | 152 7.664854526519775 153 | 153 7.689417362213135 154 | 154 7.711843490600586 155 | 155 7.73599910736084 156 | 156 7.759965896606445 157 | 157 7.788935661315918 158 | 158 7.814115524291992 159 | 159 7.835615158081055 160 | 160 7.8606743812561035 161 | 161 7.882746696472168 162 | 162 7.907750606536865 163 | 163 7.9333109855651855 164 | 164 7.958787441253662 165 | 165 7.981550693511963 166 | 166 8.011580467224121 167 | 167 8.03337287902832 168 | 168 8.056806564331055 169 | 169 8.079192161560059 170 | 170 8.103809356689453 171 | 171 8.125045776367188 172 | 172 8.15402889251709 173 | 173 8.181538581848145 174 | 174 8.207011222839355 175 | 175 8.227130889892578 176 | 176 8.252174377441406 177 | 177 8.272038459777832 178 | 178 8.298123359680176 179 | 179 8.317887306213379 180 | 180 8.347379684448242 181 | 181 8.365970611572266 182 | 182 8.39012336730957 183 | 183 8.413267135620117 184 | 184 8.441601753234863 185 | 185 8.469433784484863 186 | 186 8.492582321166992 187 | 187 8.515233993530273 188 | 188 8.537558555603027 189 | 189 8.561620712280273 190 | 190 8.58764362335205 191 | 191 8.611865043640137 192 | 192 8.638022422790527 193 | 193 8.662741661071777 194 | 194 8.689210891723633 195 | 195 8.71304702758789 196 | 196 8.739282608032227 197 | 197 8.76534366607666 198 | 198 8.791733741760254 199 | 199 8.812337875366211 200 | 200 8.836019515991211 201 | 1 1.0 202 | 2 1.3425869941711426 203 | 3 1.4800665378570557 204 | 4 1.610133409500122 205 | 5 1.6861705780029297 206 | 6 1.8158448934555054 207 | 7 1.9169756174087524 208 | 8 1.9898321628570557 209 | 9 2.103006362915039 210 | 10 2.198607921600342 211 | 11 2.2673819065093994 212 | 12 
2.379521131515503 213 | 13 2.453937292098999 214 | 14 2.521658420562744 215 | 15 2.6081600189208984 216 | 16 2.6755027770996094 217 | 17 2.7483041286468506 218 | 18 2.8297157287597656 219 | 19 2.905391216278076 220 | 20 2.9680662155151367 221 | 21 3.0330522060394287 222 | 22 3.1018290519714355 223 | 23 3.178995132446289 224 | 24 3.243086099624634 225 | 25 3.304431438446045 226 | 26 3.3670051097869873 227 | 27 3.4263482093811035 228 | 28 3.48586368560791 229 | 29 3.5485644340515137 230 | 30 3.60994553565979 231 | 31 3.658292293548584 232 | 32 3.7145533561706543 233 | 33 3.764017343521118 234 | 34 3.820631742477417 235 | 35 3.8807389736175537 236 | 36 3.927197217941284 237 | 37 3.9778075218200684 238 | 38 4.032394886016846 239 | 39 4.079466342926025 240 | 40 4.121256351470947 241 | 41 4.179348945617676 242 | 42 4.235202789306641 243 | 43 4.276139736175537 244 | 44 4.322049617767334 245 | 45 4.375305652618408 246 | 46 4.421659469604492 247 | 47 4.465145111083984 248 | 48 4.511823654174805 249 | 49 4.558897972106934 250 | 50 4.612059116363525 251 | 51 4.6558332443237305 252 | 52 4.701802730560303 253 | 53 4.749824523925781 254 | 54 4.791557788848877 255 | 55 4.836613655090332 256 | 56 4.882650375366211 257 | 57 4.921323776245117 258 | 58 4.962111949920654 259 | 59 4.998560905456543 260 | 60 5.037952899932861 261 | 61 5.07641077041626 262 | 62 5.117368698120117 263 | 63 5.156766414642334 264 | 64 5.189724445343018 265 | 65 5.232119083404541 266 | 66 5.274285316467285 267 | 67 5.309042453765869 268 | 68 5.348519325256348 269 | 69 5.385103225708008 270 | 70 5.4217658042907715 271 | 71 5.458439350128174 272 | 72 5.49429988861084 273 | 73 5.532592296600342 274 | 74 5.5714898109436035 275 | 75 5.615612030029297 276 | 76 5.6548895835876465 277 | 77 5.69318962097168 278 | 78 5.726015090942383 279 | 79 5.7607831954956055 280 | 80 5.803152084350586 281 | 81 5.840878963470459 282 | 82 5.873714447021484 283 | 83 5.916788578033447 284 | 84 5.954561233520508 285 | 85 5.986266613006592 286 | 86 6.01418924331665 287 | 87 6.048747539520264 288 | 88 6.081315517425537 289 | 89 6.119509220123291 290 | 90 6.158118724822998 291 | 91 6.1867289543151855 292 | 92 6.221379280090332 293 | 93 6.256557464599609 294 | 94 6.288797855377197 295 | 95 6.326879024505615 296 | 96 6.361721038818359 297 | 97 6.393381118774414 298 | 98 6.428704261779785 299 | 99 6.463216304779053 300 | 100 6.492666244506836 301 | 101 6.531671524047852 302 | 102 6.56462287902832 303 | 103 6.595141410827637 304 | 104 6.63072395324707 305 | 105 6.664488792419434 306 | 106 6.691591262817383 307 | 107 6.724743843078613 308 | 108 6.752674102783203 309 | 109 6.781800270080566 310 | 110 6.8150529861450195 311 | 111 6.846014022827148 312 | 112 6.875141620635986 313 | 113 6.913217544555664 314 | 114 6.9485979080200195 315 | 115 6.978128433227539 316 | 116 7.006283283233643 317 | 117 7.036981105804443 318 | 118 7.06494665145874 319 | 119 7.097784519195557 320 | 120 7.124679088592529 321 | 121 7.151909828186035 322 | 122 7.189268112182617 323 | 123 7.214508056640625 324 | 124 7.244059085845947 325 | 125 7.272270202636719 326 | 126 7.302449703216553 327 | 127 7.325904846191406 328 | 128 7.352625370025635 329 | 129 7.382406234741211 330 | 130 7.409889221191406 331 | 131 7.441695213317871 332 | 132 7.465174198150635 333 | 133 7.490511894226074 334 | 134 7.52184534072876 335 | 135 7.552266597747803 336 | 136 7.576979160308838 337 | 137 7.60615873336792 338 | 138 7.631077766418457 339 | 139 7.659272193908691 340 | 140 7.689055442810059 341 | 141 7.714266777038574 
342 | 142 7.740371227264404 343 | 143 7.770144462585449 344 | 144 7.797163963317871 345 | 145 7.825152397155762 346 | 146 7.850924015045166 347 | 147 7.8776116371154785 348 | 148 7.905229568481445 349 | 149 7.938175678253174 350 | 150 7.963441371917725 351 | 151 7.989377021789551 352 | 152 8.015571594238281 353 | 153 8.03981876373291 354 | 154 8.061357498168945 355 | 155 8.089569091796875 356 | 156 8.110694885253906 357 | 157 8.13357162475586 358 | 158 8.16172981262207 359 | 159 8.186487197875977 360 | 160 8.212444305419922 361 | 161 8.237144470214844 362 | 162 8.259098052978516 363 | 163 8.284126281738281 364 | 164 8.307903289794922 365 | 165 8.32878303527832 366 | 166 8.352269172668457 367 | 167 8.380142211914062 368 | 168 8.406278610229492 369 | 169 8.428736686706543 370 | 170 8.453876495361328 371 | 171 8.476970672607422 372 | 172 8.50069808959961 373 | 173 8.524378776550293 374 | 174 8.547581672668457 375 | 175 8.568750381469727 376 | 176 8.596118927001953 377 | 177 8.616921424865723 378 | 178 8.64217472076416 379 | 179 8.666587829589844 380 | 180 8.689314842224121 381 | 181 8.712116241455078 382 | 182 8.737107276916504 383 | 183 8.7545166015625 384 | 184 8.781569480895996 385 | 185 8.804463386535645 386 | 186 8.828217506408691 387 | 187 8.84929370880127 388 | 188 8.87789535522461 389 | 189 8.89671516418457 390 | 190 8.919512748718262 391 | 191 8.948515892028809 392 | 192 8.968647956848145 393 | 193 8.989168167114258 394 | 194 9.019471168518066 395 | 195 9.040534019470215 396 | 196 9.059708595275879 397 | 197 9.086166381835938 398 | 198 9.106014251708984 399 | 199 9.12833309173584 400 | 200 9.152287483215332 401 | 201 9.17170524597168 402 | 202 9.192819595336914 403 | 203 9.222025871276855 404 | 204 9.242000579833984 405 | 205 9.262640953063965 406 | 206 9.292340278625488 407 | 207 9.312517166137695 408 | 208 9.332563400268555 409 | 209 9.362652778625488 410 | 210 9.37942886352539 411 | 211 9.403473854064941 412 | 212 9.430370330810547 413 | 213 9.448878288269043 414 | 214 9.467549324035645 415 | 215 9.49483585357666 416 | 216 9.513956069946289 417 | 217 9.536771774291992 418 | 218 9.561524391174316 419 | 219 9.583642959594727 420 | 220 9.608484268188477 421 | 221 9.632624626159668 422 | 222 9.656436920166016 423 | 223 9.676606178283691 424 | 224 9.702705383300781 425 | 225 9.725610733032227 426 | 226 9.74528980255127 427 | 227 9.76707935333252 428 | 228 9.788287162780762 429 | 229 9.808671951293945 430 | 230 9.835151672363281 431 | 231 9.85518741607666 432 | 232 9.870915412902832 433 | 233 9.89185905456543 434 | 234 9.909582138061523 435 | 235 9.925028800964355 436 | 236 9.948447227478027 437 | 237 9.964569091796875 438 | 238 9.98108959197998 439 | 239 10.01058292388916 440 | 240 10.029886245727539 441 | 241 10.049802780151367 442 | 242 10.071462631225586 443 | 243 10.090588569641113 444 | 244 10.113792419433594 445 | 245 10.143721580505371 446 | 246 10.159034729003906 447 | 247 10.17558479309082 448 | 248 10.193281173706055 449 | 249 10.21686840057373 450 | 250 10.241169929504395 451 | 251 10.26410961151123 452 | 252 10.285745620727539 453 | 253 10.306694984436035 454 | 254 10.328060150146484 455 | 255 10.34904956817627 456 | 256 10.371853828430176 457 | 257 10.396636962890625 458 | 258 10.417922019958496 459 | 259 10.433265686035156 460 | 260 10.45438003540039 461 | 261 10.473858833312988 462 | 262 10.496040344238281 463 | 263 10.512752532958984 464 | 264 10.536433219909668 465 | 265 10.55765438079834 466 | 266 10.574774742126465 467 | 267 10.599035263061523 468 | 268 
10.620415687561035 469 | 269 10.63953971862793 470 | 270 10.661383628845215 471 | 271 10.681900024414062 472 | 272 10.70407772064209 473 | 273 10.722298622131348 474 | 274 10.737654685974121 475 | 275 10.756811141967773 476 | 276 10.776389122009277 477 | 277 10.792834281921387 478 | 278 10.81033992767334 479 | 279 10.83313274383545 480 | 280 10.854608535766602 481 | 281 10.872381210327148 482 | 282 10.895024299621582 483 | 283 10.916109085083008 484 | 284 10.934562683105469 485 | 285 10.954834938049316 486 | 286 10.973169326782227 487 | 287 11.00400447845459 488 | 288 11.027678489685059 489 | 289 11.048644065856934 490 | 290 11.068979263305664 491 | 291 11.088113784790039 492 | 292 11.103851318359375 493 | 293 11.122899055480957 494 | 294 11.144107818603516 495 | 295 11.157708168029785 496 | 296 11.176496505737305 497 | 297 11.197405815124512 498 | 298 11.212449073791504 499 | 299 11.23056411743164 500 | 300 11.249613761901855 501 | --------------------------------------------------------------------------------
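A minimal end-to-end sketch of ``as_parameter`` under stated assumptions: ``admin_torch`` is installed, and ``ResidualBlock``, its dimensions, and the parameter name are illustrative inventions, not part of this repository.

.. code-block:: python

    import torch
    import torch.nn as nn
    import admin_torch

    class ResidualBlock(nn.Module):
        """A toy residual block whose shortcut branch is rescaled by omega."""

        def __init__(self, dim, num_res_layers):
            super().__init__()
            self.linear = nn.Linear(dim, dim)
            # Registers a trainable vector 'omega' of size dim on this module.
            admin_torch.as_parameter(self, 'omega', num_res_layers, dim)

        def forward(self, x):
            # Shortcut rescaled element-wise by omega, as in the docstring examples.
            return x * self.omega + self.linear(x)

    block = ResidualBlock(16, num_res_layers=12)
    # Unlike as_buffer, as_parameter makes omega visible to optimizers:
    assert 'omega' in dict(block.named_parameters())
    y = block(torch.randn(4, 16))

The practical difference between the two entry points: ``as_buffer`` freezes omega at its computed initialization, while ``as_parameter`` starts from the same initialization but lets training adjust omega afterwards.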