├── HISTORY.rst
├── doc
├── source
│ ├── _static
│ │ └── output_change.png
│ ├── contents.rst
│ ├── index.rst
│ └── conf.py
└── Makefile
├── admin_torch
├── __init__.py
└── admin.py
├── MANIFEST.in
├── CODE_OF_CONDUCT.md
├── .gitignore
├── example
├── eval_wmt_en-de.sh
├── train_wmt_en-de_huge_batch.sh
├── train_big_wmt_en-de_huge_batch.sh
├── train_wmt_en-de.sh
├── average_checkpoints.py
├── README.md
└── profile.ratio.init
├── LICENSE
├── setup.py
├── SUPPORT.md
├── .github
└── workflows
│ └── codeql-analysis.yml
├── SECURITY.md
└── README.md
/HISTORY.rst:
--------------------------------------------------------------------------------
1 | History
2 | =======
3 |
4 | 0.1.0 (2022/3/3)
5 | ------------------
6 | * implemented Admin
7 |
--------------------------------------------------------------------------------
/doc/source/_static/output_change.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/admin-torch/main/doc/source/_static/output_change.png
--------------------------------------------------------------------------------
/admin_torch/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 |
4 | __author__ = "Liyuan Liu"
5 |
6 | __maintainer__ = "Liyuan Liu"
7 | __email__ = "llychinalz@gmail.com"
8 |
9 | from admin_torch.admin import *
--------------------------------------------------------------------------------
/doc/source/contents.rst:
--------------------------------------------------------------------------------
1 | .. Admin-Torch documentation file.
2 |
3 | :github_url: https://github.com/microsoft/admin-torch
4 |
5 | *************************
6 | Admin-Torch documentation
7 | *************************
8 |
9 | .. toctree::
10 | :maxdepth: 2
11 |
12 | index
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | # Include the README
2 | include *.md
3 |
4 | # Include the license file
5 | include LICENSE.txt
6 |
7 | # Include the history
8 | include HISTORY.rst
9 |
10 | # Include the script
11 | include bin/torch_scope
12 |
13 | # Include image
14 | include doc/source/_static/output_change.png
15 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Microsoft Open Source Code of Conduct
2 |
3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4 |
5 | Resources:
6 |
7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 |
--------------------------------------------------------------------------------
/doc/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = python -msphinx
7 | SPHINXPROJ = Admin
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # yaml files
7 | *.yaml
8 |
9 | # aml files
10 | .amltconfig
11 | .amltignore
12 |
13 | # macOS dir files
14 | .DS_Store
15 |
16 | # Distribution / packaging
17 | .Python
18 | env/
19 | build/
20 | develop-eggs/
21 | dist/
22 | downloads/
23 | eggs/
24 | .eggs/
25 | lib/
26 | lib64/
27 | parts/
28 | sdist/
29 | var/
30 | wheels/
31 | *.egg-info/
32 | .installed.cfg
33 | *.egg
34 |
35 | # PyInstaller
36 | # Usually these files are written by a python script from a template
37 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
38 | *.manifest
39 | *.spec
40 |
41 | # Installer logs
42 | pip-log.txt
43 | pip-delete-this-directory.txt
44 |
45 | # PyBuilder
46 | target/
47 |
48 | # Jupyter Notebook
49 | .ipynb_checkpoints
50 |
51 | # pyenv
52 | .python-version
53 |
54 | # dotenv
55 | .env
56 |
57 | # mypy
58 | .mypy_cache/
59 |
--------------------------------------------------------------------------------
/example/eval_wmt_en-de.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | DATADIR=${1:-"./wmt14_en_de_joined_dict"}
3 | MODELDIR=${2:-"None"}
4 |
5 | SAVEDIR=${3:-"None"}
6 | UPPER_BOUND=${4:-100}
7 | CP_POINT_NUM=${5:-10}
8 |
9 | if [[ $MODELDIR == "None" ]]
10 | then
11 | if [[ $SAVEDIR == "None" ]]
12 | then
13 | echo "SAVEDIR and MODELDIR cannot be None at the same time."
14 | exit
15 | fi
16 | MODELDIR=$SAVEDIR/model_${UPPER_BOUND}_${CP_POINT_NUM}.pt
17 | if [ -f $MODELDIR ]; then
18 | echo $MODELDIR "already exists"
19 | else
20 | echo "Start averaging model"
21 | python average_checkpoints.py --inputs $SAVEDIR --num-epoch-checkpoints ${CP_POINT_NUM} --output $MODELDIR --checkpoint-upper-bound $UPPER_BOUND | grep 'Finish'
22 | echo "End averaging model"
23 | fi
24 | fi
25 |
26 | echo "Model path" $MODELDIR
27 |
28 | CUDA_VISIBLE_DEVICES=0 fairseq-generate $DATADIR \
29 | --path $MODELDIR \
30 | --batch-size 128 --beam 4 --lenpen 0.6 --remove-bpe \
31 | --quiet --fp16
32 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) Microsoft Corporation.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE
22 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from setuptools import setup, find_packages
5 |
6 | def read_readme():
7 | with open('README.md') as f:
8 | return f.read()
9 |
10 | with open('HISTORY.rst') as history_file:
11 | history = history_file.read()
12 |
13 | requirements = [
14 | 'torch'
15 | ]
16 |
17 | setup(
18 | name='admin_torch',
19 | version='0.1.0',
20 | description='Plug-in-and-Play Toolbox for Stablizing Transformer Training',
21 | long_description= read_readme(),
22 | long_description_content_type="text/markdown",
23 | author='Lucas Liu',
24 | author_email='llychinalz@gmail.com',
25 | url='https://github.com/microsoft/admin-torch',
26 | packages=find_packages(exclude=['docs']),
27 | include_package_data=True,
28 | install_requires=requirements,
29 | license='Apache License 2.0',
30 | zip_safe=False,
31 | classifiers=[
32 | 'Development Status :: 2 - Pre-Alpha',
33 | 'Intended Audience :: Developers',
34 | 'Natural Language :: English',
35 | 'Programming Language :: Python :: 3.7',
36 | 'Programming Language :: Python :: 3.8',
37 | 'Programming Language :: Python :: 3.9',
38 | ]
39 | )
40 |
41 | # python setup.py sdist bdist_wheel --universal
42 | # twine upload dist/*
--------------------------------------------------------------------------------
/SUPPORT.md:
--------------------------------------------------------------------------------
1 | # TODO: The maintainer of this repo has not yet edited this file
2 |
3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?
4 |
5 | - **No CSS support:** Fill out this template with information about how to file issues and get help.
6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport).
7 | - **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide.
8 |
9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.*
10 |
11 | # Support
12 |
13 | ## How to file issues and get help
14 |
15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing
16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or
17 | feature request as a new Issue.
18 |
19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE
20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**.
22 |
23 | ## Microsoft Support Policy
24 |
25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above.
26 |
--------------------------------------------------------------------------------
/doc/source/index.rst:
--------------------------------------------------------------------------------
1 | .. Admin-Torch documentation file.
2 |
3 | :github_url: https://github.com/LiyuanLucasLiu/Admin
4 |
5 | *************************
6 | Admin-Torch documentation
7 | *************************
8 |
9 | A plug-in-and-play PyTorch wrapper for `Adaptive model initialization (Admin)`__.
10 |
11 | For a neural network f, input x, randomly initialized weight w, we describe its stability (
12 | ``output_change_scale``) as
13 |
14 | .. math:: E[|f(x, w) - f(x, w + \delta)|_2^2], \mbox{where } \delta \mbox{ is a random perturbation.}
15 |
16 | In `our study`__, we show that, an original N-layer Transformer's ``output_change_scale`` is ``O(n)``,
17 | which unstabilizes its training. Admin stabilize Transformer's training by regulating this scale to
18 | ``O(logn)`` and ``O(1)``. We keep ``O(logn)`` as the ``default`` setting, which can handle most scenarios.
19 | In need of additional stability, set ``output_change_scale`` to ``O(1)`` instead.
20 |
21 | __ https://arxiv.org/abs/2004.08249
22 | __ https://arxiv.org/abs/2004.08249
23 |
24 |
25 | admin_torch\.as_module()
26 | ===============================
27 | .. autofunction:: admin_torch.as_module
28 |
29 |
30 | admin_torch\.as_parameter()
31 | ===============================
32 | .. autofunction:: admin_torch.as_parameter
33 |
34 |
35 | admin_torch\.as_buffer()
36 | ===============================
37 | .. autofunction:: admin_torch.as_buffer
38 |
39 |
40 | admin_torch\.OmegaResidual
41 | ================================
42 | .. autoclass:: admin_torch.OmegaResidual
43 | :members:
44 |
--------------------------------------------------------------------------------
/example/train_wmt_en-de_huge_batch.sh:
--------------------------------------------------------------------------------
1 | DATA_PATH=${1:-"./wmt14_en_de_joined_dict"}
2 | LAYERS=${2:-18}
3 | OUTPUT_PATH=${3:-"./admin_18L_asParameter"}
4 | ADDITIONAL_ARGS=${4:-"--encoder-as-parameter --decoder-as-parameter --share-all-embeddings"}
5 |
6 | TOKENS=2048
7 | DEVICE_NUMBER=8
8 | FREQ=32
9 |
10 | NUMBER_OF_GPUS=$(nvidia-smi --list-gpus | wc -l)
11 | if [[ $NUMBER_OF_GPUS != $DEVICE_NUMBER ]]
12 | then
13 | echo "The script is for $DEVICE_NUMBER card, but only find $NUMBER_OF_GPUS cards."
14 | echo "Please modify TOKENS, DEVICES, and FREQ in the script accordingly."
15 | echo
16 | echo "Note that you need to keep device_number * tokens * freq = 32768"
17 | exit
18 | fi
19 |
20 | DEVICE_LIST=$(( DEVICE_NUMBER - 1 ))
21 | DEVICE_LIST=$(seq -s "," 0 $DEVICE_LIST)
22 |
23 | echo "Using GPUs $DEVICE_LIST for training"
24 |
25 | CUDA_VISIBLE_DEVICES=$DEVICE_LIST fairseq-train \
26 | $DATA_PATH $ADDITIONAL_ARGS -s en -t de \
27 | --arch transformer_wmt_en_de \
28 | --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
29 | --lr-scheduler inverse_sqrt --max-update 100000 \
30 | --warmup-init-lr 1e-07 --warmup-updates 4000 --lr 0.0015 \
31 | --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
32 | --weight-decay 0.0001 --dropout 0.4 \
33 | --max-tokens $TOKENS --update-freq $FREQ \
34 | --save-dir $OUTPUT_PATH --seed 1111 --restore-file x.pt \
35 | --log-format simple --log-interval 30 --memory-efficient-fp16 \
36 | --encoder-layers $LAYERS --decoder-layers $LAYERS \
37 | --threshold-loss-scale 0.0625 --fp16-scale-window 256 --fp16
38 |
--------------------------------------------------------------------------------
/example/train_big_wmt_en-de_huge_batch.sh:
--------------------------------------------------------------------------------
1 | DATA_PATH=${1:-"./wmt14_en_de_joined_dict"}
2 | LAYERS=${2:-18}
3 | OUTPUT_PATH=${3:-"./admin_18L_asParameter"}
4 | ADDITIONAL_ARGS=${4:-"--encoder-as-parameter --decoder-as-parameter --share-all-embeddings --lr 0.0007"}
5 |
6 | TOKENS=2048
7 | DEVICE_NUMBER=16
8 | FREQ=16
9 |
10 | NUMBER_OF_GPUS=$(nvidia-smi --list-gpus | wc -l)
11 | if [[ $NUMBER_OF_GPUS != $DEVICE_NUMBER ]]
12 | then
13 | echo "The script is for $DEVICE_NUMBER card, but only find $NUMBER_OF_GPUS cards."
14 | echo "Please modify TOKENS, DEVICES, and FREQ in the script accordingly."
15 | echo
16 | echo "Note that you need to keep device_number * tokens * freq = 32768"
17 | exit
18 | fi
19 |
20 | DEVICE_LIST=$(( DEVICE_NUMBER - 1 ))
21 | DEVICE_LIST=$(seq -s "," 0 $DEVICE_LIST)
22 |
23 | echo "Using GPUs $DEVICE_LIST for training"
24 |
25 | CUDA_VISIBLE_DEVICES=$DEVICE_LIST fairseq-train \
26 | $DATA_PATH $ADDITIONAL_ARGS -s en -t de \
27 | --arch transformer_vaswani_wmt_en_de_big \
28 | --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
29 | --lr-scheduler inverse_sqrt --max-update 100000 \
30 | --warmup-init-lr 1e-07 --warmup-updates 4000 \
31 | --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
32 | --weight-decay 0.0001 --dropout 0.4 \
33 | --max-tokens $TOKENS --update-freq $FREQ \
34 | --save-dir $OUTPUT_PATH --seed 1111 --restore-file x.pt \
35 | --log-format simple --log-interval 30 --memory-efficient-fp16 \
36 | --encoder-layers $LAYERS --decoder-layers $LAYERS \
37 | --threshold-loss-scale 0.0625 --fp16-scale-window 256 --fp16
38 |
--------------------------------------------------------------------------------
/example/train_wmt_en-de.sh:
--------------------------------------------------------------------------------
1 | DATA_PATH=${1:-"./wmt14_en_de_joined_dict"}
2 | LAYERS=${2:-18}
3 | OUTPUT_PATH=${3:-"./admin_18L_asParameter"}
4 | ADDITIONAL_ARGS=${4:-"--share-all-embeddings --encoder-as-parameter --decoder-as-parameter"}
5 |
6 | TOKENS=4096
7 | DEVICE_NUMBER=8
8 | FREQ=1
9 |
10 | NUMBER_OF_GPUS=$(nvidia-smi --list-gpus | wc -l)
11 | if [[ $NUMBER_OF_GPUS != $DEVICE_NUMBER ]]
12 | then
13 | echo "The script is for $DEVICE_NUMBER card, but only find $NUMBER_OF_GPUS cards."
14 | echo "Please modify TOKENS, DEVICES, and FREQ in the script accordingly."
15 | echo
16 | echo "Note that you need to keep device_number * tokens * freq = 32768"
17 | exit
18 | fi
19 |
20 | DEVICE_LIST=$(( DEVICE_NUMBER - 1 ))
21 | DEVICE_LIST=$(seq -s "," 0 $DEVICE_LIST)
22 |
23 | echo "Using GPUs $DEVICE_LIST for training"
24 |
25 | CUDA_VISIBLE_DEVICES=$DEVICE_LIST fairseq-train \
26 | $DATA_PATH $ADDITIONAL_ARGS -s en -t de \
27 | --arch transformer_wmt_en_de \
28 | --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
29 | --lr-scheduler inverse_sqrt --max-update 1000000 \
30 | --warmup-init-lr 1e-07 --warmup-updates 8000 --lr 0.001 \
31 | --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
32 | --weight-decay 0.0 --attention-dropout 0.1 --relu-dropout 0.1 \
33 | --max-tokens $TOKENS --update-freq $FREQ \
34 | --save-dir $OUTPUT_PATH --seed 1111 --restore-file x.pt \
35 | --log-format simple --log-interval 30 --memory-efficient-fp16 \
36 | --encoder-layers $LAYERS --decoder-layers $LAYERS \
37 | --threshold-loss-scale 0.0625 --fp16-scale-window 256 --fp16
38 |
39 |
--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | # For most projects, this workflow file will not need changing; you simply need
2 | # to commit it to your repository.
3 | #
4 | # You may wish to alter this file to override the set of languages analyzed,
5 | # or to provide custom queries or build logic.
6 | #
7 | # ******** NOTE ********
8 | # We have attempted to detect the languages in your repository. Please check
9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 |
14 | on:
15 | push:
16 | branches: [ main ]
17 | pull_request:
18 | # The branches below must be a subset of the branches above
19 | branches: [ main ]
20 | schedule:
21 | - cron: '43 10 * * 6'
22 |
23 | jobs:
24 | analyze:
25 | name: Analyze
26 | runs-on: ubuntu-latest
27 | permissions:
28 | actions: read
29 | contents: read
30 | security-events: write
31 |
32 | strategy:
33 | fail-fast: false
34 | matrix:
35 | language: [ 'python' ]
36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
37 | # Learn more about CodeQL language support at https://git.io/codeql-language-support
38 |
39 | steps:
40 | - name: Checkout repository
41 | uses: actions/checkout@v3
42 |
43 | # Initializes the CodeQL tools for scanning.
44 | - name: Initialize CodeQL
45 | uses: github/codeql-action/init@v2
46 | with:
47 | languages: ${{ matrix.language }}
48 | # If you wish to specify custom queries, you can do so here or in a config file.
49 | # By default, queries listed here will override any specified in a config file.
50 | # Prefix the list here with "+" to use these queries and those in the config file.
51 | # queries: ./path/to/local/query, your-org/your-repo/queries@main
52 |
53 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
54 | # If this step fails, then you should remove it and run the build manually (see below)
55 | - name: Autobuild
56 | uses: github/codeql-action/autobuild@v2
57 |
58 | # ℹ️ Command-line programs to run using the OS shell.
59 | # 📚 https://git.io/JvXDl
60 |
61 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
62 | # and modify them (or add more) to build your code if your project
63 | # uses a compiled language
64 |
65 | #- run: |
66 | # make bootstrap
67 | # make release
68 |
69 | - name: Perform CodeQL Analysis
70 | uses: github/codeql-action/analyze@v2
71 |
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Security
4 |
5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
6 |
7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below.
8 |
9 | ## Reporting Security Issues
10 |
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 |
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report).
14 |
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc).
16 |
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
18 |
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 |
21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 | * Full paths of source file(s) related to the manifestation of the issue
23 | * The location of the affected source code (tag/branch/commit or direct URL)
24 | * Any special configuration required to reproduce the issue
25 | * Step-by-step instructions to reproduce the issue
26 | * Proof-of-concept or exploit code (if possible)
27 | * Impact of the issue, including how an attacker might exploit the issue
28 |
29 | This information will help us triage your report more quickly.
30 |
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs.
32 |
33 | ## Preferred Languages
34 |
35 | We prefer all communications to be in English.
36 |
37 | ## Policy
38 |
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd).
40 |
41 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [](https://paperswithcode.com/sota/machine-translation-on-wmt2014-english-french?p=very-deep-transformers-for-neural-machine)
2 | 
3 | 
4 | 
5 | [](https://microsoft.github.io/admin-torch/)
6 | 
7 |
8 |
Admin-Torch
9 | Transformers Training **Stabilized**
10 |
11 |
12 | What's New? •
13 | Key Idea •
14 | How To Use •
15 | Docs •
16 | Examples •
17 | Citation •
18 | License
19 |
20 |
21 | Here, we provide a plug-in-and-play implementation of [Admin](https://arxiv.org/abs/2004.08249),
22 | which stabilizes previously-diverged Transformer training and achieves better performance,
23 | **without introducing additional hyper-parameters**. The design of Admin is half-precision
24 | friendly and can be **reparameterized into the original Transformer**.
25 |
26 | ______________________________________________________________________
27 | ## What's New?
28 |
29 | Beyond the [original admin implementation](https://github.com/LiyuanLucasLiu/Transformer-Clinic):
30 | 1. `admin-torch` removed the profilling stage and is **plug-in-and-play**.
31 | 2. `admin-torch`'s implementation is **more robust** (see below).
32 |
33 | Comparison w. the [DeepNet Init](https://arxiv.org/abs/2203.00555) and the [Original Admin Init](https://github.com/LiyuanLucasLiu/Transformer-Clinic)
34 | (on WMT'17 En-De).
35 |
36 | | | Regular batch size (8x4096) | Huge batch size (128x4096) |
37 | |---------------|--------------------|------------------|
38 | | [Original Admin](https://github.com/LiyuanLucasLiu/Transformer-Clinic)| ✅ | ❌ |
39 | | [DeepNet](https://arxiv.org/abs/2203.00555) | ❌ | ✅ |
40 | | `admin-torch` | ✅ | ✅ |
41 |
42 | More details can be found in [our example](https://github.com/microsoft/admin-torch/tree/main/example).
43 |
44 | ## Key Idea
45 | What complicates Transformer training?
46 |
47 | For Transformer f, input x, randomly initialized weight w, we describe its stability (``output_change_scale``) as
48 |
49 |
50 |
51 |
52 |
53 | In [our study](https://arxiv.org/abs/2004.08249), we show that, an original n-layer Transformer's
54 | ``output_change_scale`` is ``O(n)``, which unstabilizes its training. Admin stabilize Transformer's
55 | training by regulating this scale to ``O(logn)`` or ``O(1)``.
56 |
57 | 
58 |
59 | More details can be found in our [paper](https://arxiv.org/abs/2004.08249).
60 |
61 |
62 | ## How to use?
63 |
64 | ### install
65 | ```
66 | pip install admin-torch==0.1.0
67 | ```
68 |
69 | ### import
70 | ```
71 | import admin_torch
72 | ```
73 |
74 | ### enjoy
75 |
76 | ```diff
77 | def __init__(self, ...):
78 | ...
79 | +(self.residual = admin_torch.as_module(self, self.number_of_sub_layers))+
80 | ...
81 |
82 | def forward(self, ...):
83 | ...
84 | -!x = x + self.f(x)!-
85 | +(x = self.residual(x, self.f(x)))+
86 | x = self.LN(x)
87 | ...
88 | ```
89 |
90 | An elaborated example can be found at [our doc](https://microsoft.github.io/admin-torch/), and a real working example can be found at [LiyuanLucasLiu/fairseq](https://github.com/LiyuanLucasLiu/fairseq/commit/33ad76ae5dc927bc32b9594f9728a367c45680bb) (training recipe is available at [our example](https://github.com/microsoft/admin-torch/tree/main/example)).
91 |
92 | ## Citation
93 | Please cite the following papers if you found our model useful. Thanks!
94 |
95 | >Liyuan Liu, Xiaodong Liu, Jianfeng Gao, Weizhu Chen, and Jiawei Han (2020). Understanding the Difficulty of Training Transformers. Proc. 2020 Conf. on Empirical Methods in Natural Language Processing (EMNLP'20).
96 | ```
97 | @inproceedings{liu2020admin,
98 | title={Understanding the Difficulty of Training Transformers},
99 | author = {Liu, Liyuan and Liu, Xiaodong and Gao, Jianfeng and Chen, Weizhu and Han, Jiawei},
100 | booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP 2020)},
101 | year={2020}
102 | }
103 | ```
104 | > Xiaodong Liu, Kevin Duh, Liyuan Liu, and Jianfeng Gao (2020). Very Deep Transformers for Neural Machine Translation. arXiv preprint arXiv:2008.07772 (2020).
105 | ```
106 | @inproceedings{liu_deep_2020,
107 | author = {Liu, Xiaodong and Duh, Kevin and Liu, Liyuan and Gao, Jianfeng},
108 | booktitle = {arXiv:2008.07772 [cs]},
109 | title = {Very Deep Transformers for Neural Machine Translation},
110 | year = {2020}
111 | }
112 | ```
113 |
--------------------------------------------------------------------------------
/example/average_checkpoints.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (c) 2017-present, Facebook, Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the license found in the LICENSE file in
6 | # the root directory of this source tree. An additional grant of patent rights
7 | # can be found in the PATENTS file in the same directory.
8 |
9 | import argparse
10 | import collections
11 | import torch
12 | import os
13 | import re
14 |
15 |
16 | def average_checkpoints(inputs):
17 | """Loads checkpoints from inputs and returns a model with averaged weights.
18 |
19 | Args:
20 | inputs: An iterable of string paths of checkpoints to load from.
21 |
22 | Returns:
23 | A dict of string keys mapping to various values. The 'model' key
24 | from the returned dict should correspond to an OrderedDict mapping
25 | string parameter names to torch Tensors.
26 | """
27 | params_dict = collections.OrderedDict()
28 | params_keys = None
29 | new_state = None
30 | num_models = len(inputs)
31 |
32 | for f in inputs:
33 | state = torch.load(
34 | f,
35 | map_location=(
36 | lambda s, _: torch.serialization.default_restore_location(s, 'cpu')
37 | ),
38 | )
39 | # Copies over the settings from the first checkpoint
40 | if new_state is None:
41 | new_state = state
42 |
43 | model_params = state['model']
44 |
45 | model_params_keys = list(model_params.keys())
46 | if params_keys is None:
47 | params_keys = model_params_keys
48 | elif params_keys != model_params_keys:
49 | raise KeyError(
50 | 'For checkpoint {}, expected list of params: {}, '
51 | 'but found: {}'.format(f, params_keys, model_params_keys)
52 | )
53 |
54 | for k in params_keys:
55 | p = model_params[k]
56 | if isinstance(p, torch.HalfTensor):
57 | p = p.float()
58 | if k not in params_dict:
59 | params_dict[k] = p.clone()
60 | # NOTE: clone() is needed in case of p is a shared parameter
61 | else:
62 | params_dict[k] += p
63 |
64 | averaged_params = collections.OrderedDict()
65 | for k, v in params_dict.items():
66 | averaged_params[k] = v
67 | averaged_params[k].div_(num_models)
68 | new_state['model'] = averaged_params
69 | return new_state
70 |
71 |
72 | def last_n_checkpoints(paths, n, update_based, upper_bound=None):
73 | assert len(paths) == 1
74 | path = paths[0]
75 | if update_based:
76 | pt_regexp = re.compile(r'checkpoint_\d+_(\d+)\.pt')
77 | else:
78 | pt_regexp = re.compile(r'checkpoint(\d+)\.pt')
79 | files = os.listdir(path)
80 |
81 | entries = []
82 | for f in files:
83 | m = pt_regexp.fullmatch(f)
84 | if m is not None:
85 | sort_key = int(m.group(1))
86 | if upper_bound is None or sort_key <= upper_bound:
87 | entries.append((sort_key, m.group(0)))
88 | if len(entries) < n:
89 | raise Exception('Found {} checkpoint files but need at least {}', len(entries), n)
90 | return [os.path.join(path, x[1]) for x in sorted(entries, reverse=True)[:n]]
91 |
92 |
93 | def main():
94 | parser = argparse.ArgumentParser(
95 | description='Tool to average the params of input checkpoints to '
96 | 'produce a new checkpoint',
97 | )
98 | # fmt: off
99 | parser.add_argument('--inputs', required=True, nargs='+',
100 | help='Input checkpoint file paths.')
101 | parser.add_argument('--output', required=True, metavar='FILE',
102 | help='Write the new checkpoint containing the averaged weights to this path.')
103 | num_group = parser.add_mutually_exclusive_group()
104 | num_group.add_argument('--num-epoch-checkpoints', type=int,
105 | help='if set, will try to find checkpoints with names checkpoint_xx.pt in the path specified by input, '
106 | 'and average last this many of them.')
107 | num_group.add_argument('--num-update-checkpoints', type=int,
108 | help='if set, will try to find checkpoints with names checkpoint_ee_xx.pt in the path specified by input, '
109 | 'and average last this many of them.')
110 | parser.add_argument('--checkpoint-upper-bound', type=int,
111 | help='when using --num-epoch-checkpoints, this will set an upper bound on which checkpoint to use, '
112 | 'e.g., with --num-epoch-checkpoints=10 --checkpoint-upper-bound=50, checkpoints 41-50 would be averaged.')
113 | # fmt: on
114 | args = parser.parse_args()
115 | print(args)
116 |
117 | num = None
118 | is_update_based = False
119 | if args.num_update_checkpoints is not None:
120 | num = args.num_update_checkpoints
121 | is_update_based = True
122 | elif args.num_epoch_checkpoints is not None:
123 | num = args.num_epoch_checkpoints
124 |
125 | assert args.checkpoint_upper_bound is None or args.num_epoch_checkpoints is not None, \
126 | '--checkpoint-upper-bound requires --num-epoch-checkpoints'
127 | assert args.num_epoch_checkpoints is None or args.num_update_checkpoints is None, \
128 | 'Cannot combine --num-epoch-checkpoints and --num-update-checkpoints'
129 |
130 | if num is not None:
131 | args.inputs = last_n_checkpoints(
132 | args.inputs, num, is_update_based, upper_bound=args.checkpoint_upper_bound,
133 | )
134 | print('averaging checkpoints: ', args.inputs)
135 |
136 | new_state = average_checkpoints(args.inputs)
137 | torch.save(new_state, args.output)
138 | print('Finished writing averaged checkpoint to {}.'.format(args.output))
139 |
140 |
141 | if __name__ == '__main__':
142 | main()
143 |
--------------------------------------------------------------------------------
/doc/source/conf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | #
4 | # Wrapper documentation build configuration file, created by
5 | # sphinx-quickstart on Thu Sep 14 03:49:01 2017.
6 | #
7 | # This file is execfile()d with the current directory set to its
8 | # containing dir.
9 | #
10 | # Note that not all possible configuration values are present in this
11 | # autogenerated file.
12 | #
13 | # All configuration values have a default; values that are commented out
14 | # serve to show the default.
15 |
16 | # If extensions (or modules to document with autodoc) are in another directory,
17 | # add these directories to sys.path here. If the directory is relative to the
18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
19 |
20 | import os
21 | import sys
22 |
23 | sys.path.insert(0, os.path.abspath('../..'))
24 |
25 | # -- General configuration ------------------------------------------------
26 |
27 | # If your documentation needs a minimal Sphinx version, state it here.
28 | #
29 | # needs_sphinx = '1.0'
30 |
31 | # Add any Sphinx extension module names here, as strings. They can be
32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
33 | # ones.
34 | extensions = [
35 | 'sphinx.ext.autodoc',
36 | 'sphinx.ext.autosummary',
37 | 'sphinx.ext.doctest',
38 | 'sphinx.ext.intersphinx',
39 | 'sphinx.ext.todo',
40 | 'sphinx.ext.coverage',
41 | 'sphinx.ext.mathjax',
42 | 'sphinx.ext.napoleon',
43 | 'sphinx.ext.viewcode',
44 | 'sphinx.ext.githubpages',
45 | ]
46 |
47 | napoleon_use_ivar = True
48 |
49 | # Add any paths that contain templates here, relative to this directory.
50 | templates_path = ['_templates']
51 |
52 | # The suffix(es) of source filenames.
53 | # You can specify multiple suffix as a list of string:
54 | #
55 | # source_suffix = ['.rst', '.md']
56 | source_suffix = '.rst'
57 |
58 | # The master toctree document.
59 | master_doc = 'contents'
60 |
61 | # General information about the project.
62 | project = 'Admin-Torch'
63 | copyright = '2022, Liyuan Liu'
64 | author = 'Liyuan Liu'
65 |
66 | # The version info for the project you're documenting, acts as replacement for
67 | # |version| and |release|, also used in various other places throughout the
68 | # built documents.
69 | #
70 | # The short X.Y version.
71 | version = ''
72 | # The full version, including alpha/beta/rc tags.
73 | release = ''
74 |
75 | # The language for content autogenerated by Sphinx. Refer to documentation
76 | # for a list of supported languages.
77 | #
78 | # This is also used if you do content translation via gettext catalogs.
79 | # Usually you set "language" from the command line for these cases.
80 | language = None
81 |
82 | # List of patterns, relative to source directory, that match files and
83 | # directories to ignore when looking for source files.
84 | # This patterns also effect to html_static_path and html_extra_path
85 | exclude_patterns = []
86 |
87 | # The name of the Pygments (syntax highlighting) style to use.
88 | pygments_style = 'sphinx'
89 |
90 | # If true, `todo` and `todoList` produce output, else they produce nothing.
91 | todo_include_todos = False
92 |
93 | # -- Options for HTML output ----------------------------------------------
94 |
95 | # The theme to use for HTML and HTML Help pages. See the documentation for
96 | # a list of builtin themes.
97 | #
98 |
99 | html_theme = 'sphinx_rtd_theme'
100 | # html_theme = 'sphinx_documatt_theme'
101 |
102 | # Theme options are theme-specific and customize the look and feel of a theme
103 | # further. For a list of options available for each theme, see the
104 | # documentation.
105 | #
106 | # html_theme_options = {}
107 | html_theme_options = {
108 | 'globaltoc_maxdepth': 5,
109 | }
110 |
111 | # Add any paths that contain custom static files (such as style sheets) here,
112 | # relative to this directory. They are copied after the builtin static files,
113 | # so a file named "default.css" will overwrite the builtin "default.css".
114 | html_static_path = ['_static']
115 |
116 | # Custom sidebar templates, must be a dictionary that maps document names
117 | # to template names.
118 | #
119 | # This is required for the alabaster theme
120 | # # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
121 | html_sidebars = {
122 | '**': ['globaltoc.html', 'relations.html', 'sourcelink.html', 'searchbox.html']
123 | }
124 |
125 | # -- Options for HTMLHelp output ------------------------------------------
126 |
127 | # Output file base name for HTML help builder.
128 | htmlhelp_basename = 'Admin-Torch'
129 |
130 | # -- Options for LaTeX output ---------------------------------------------
131 |
132 | latex_elements = {
133 | # The paper size ('letterpaper' or 'a4paper').
134 | #
135 | # 'papersize': 'letterpaper',
136 |
137 | # The font size ('10pt', '11pt' or '12pt').
138 | #
139 | # 'pointsize': '10pt',
140 |
141 | # Additional stuff for the LaTeX preamble.
142 | #
143 | # 'preamble': '',
144 |
145 | # Latex figure (float) alignment
146 | #
147 | # 'figure_align': 'htbp',
148 | }
149 |
150 | # Grouping the document tree into LaTeX files. List of tuples
151 | # (source start file, target name, title,
152 | # author, documentclass [howto, manual, or own class]).
153 | latex_documents = [
154 | (master_doc, 'admin_torch.tex', 'Admin-Torch Documentation',
155 | 'Admin-Torch', 'manual'),
156 | ]
157 |
158 | # -- Options for manual page output ---------------------------------------
159 |
160 | # One entry per manual page. List of tuples
161 | # (source start file, name, description, authors, manual section).
162 | man_pages = [
163 | (master_doc, 'Admin-Torch', 'Admin-Torch Documentation',
164 | [author], 1)
165 | ]
166 |
167 | # -- Options for Texinfo output -------------------------------------------
168 |
169 | # Grouping the document tree into Texinfo files. List of tuples
170 | # (source start file, target name, title, author,
171 | # dir menu entry, description, category)
172 | texinfo_documents = [
173 | (master_doc, 'Admin-Torch', 'Admin-Torch Documentation',
174 | author, 'Admin-Torch', 'Adaptive Model Initialization.',
175 | 'Miscellaneous'),
176 | ]
177 |
178 | autodoc_mock_imports = ['torch']
179 |
180 | intersphinx_mapping = {
181 | 'python':('https://docs.python.org/3', None),
182 | 'torch': ('http://pytorch.org/docs/master', None)
183 | }
184 |
185 | autodoc_member_order = 'bysource'
--------------------------------------------------------------------------------
/example/README.md:
--------------------------------------------------------------------------------
1 | # Table of Contents
2 |
3 | - [Real example: `admin_torch` on WMT'14 En-De](#admin_torch-on-WMT14-En-De)
4 | - [Comparison with original Admin and DeepNet](#comparison-with-original-admin-and-deepnet-on-wmt17-en-de)
5 |
6 | # Real example: `admin-torch` on WMT'14 En-De
7 |
8 | As an example, we apply `admin_torch` to `fairseq` and train Transformer on WMT'14 En-De.
9 |
10 | > Note: the efforts to incorporate `admin-torch` into fairseq are summarized as [this commit](https://github.com/LiyuanLucasLiu/fairseq/commit/33ad76ae5dc927bc32b9594f9728a367c45680bb):
11 |
12 | ## 1. Pre-processing
13 |
14 | ### 1.1. Data Preparation
15 |
16 | please refer to [the Transformer-Clinic repo](https://github.com/LiyuanLucasLiu/Transformer-Clinic/blob/master/pre-process/wmt14en-de.sh) for data preparation.
17 |
18 | ### 1.2. Package Install
19 |
20 | ```
21 | pip install admin_torch==0.1.0
22 | pip uninstall fairseq
23 | pip install https://github.com/LiyuanLucasLiu/fairseq/archive/refs/tags/admin-torch.zip
24 | ```
25 |
26 | ## 2. Training and Evaluation
27 |
28 | ### 2.1. Training
29 | ```
30 | bash train_wmt_en-de.sh $PATH-to-WMT14 $NUBMER_LAYER $OUTPUT_PATH
31 | ```
32 | Not that `$PATH-to-WMT14` is the path to the `wmt14_en_de_joined_dict` data
33 | folder from data preparation. `$NUMBER_LAYER` is the encoder/decoder layer number.
34 | `$OUTPUT_PATH` is the path where you want to save your checkpoints.
35 |
36 | ### 2.2. Evaluation
37 | ```
38 | bash eval_wmt_en-de.sh $PATH-to-WMT14 NONE $OUTPUT_PATH
39 | ```
40 | Not that `$PATH-to-WMT14` is the path to the `wmt14_en_de_joined_dict` data folder
41 | from data preparation. `$OUTPUT_PATH` is the path used in the training step.
42 |
43 | ## 3. Pre-trained Weights
44 |
45 | | Layer Number | BLEU | PATH |
46 | |--------------|-------|------|
47 | | 6L-6L | 27.84 | TBD |
48 | | 18L-18L | 28.91 | TBD |
49 | | 100L-100L* | 29.65 | TBD |
50 |
51 | *: trained with the [huge-batch-size setting](#omparison-with-original-admin-and-deepnet-on-wmt17-en-de),
52 | but only for 40 epochs, due to the huge cost of the training.
53 |
54 | ## 4. Discussion on the `admin-torch` setting.
55 |
56 | `admin-torch.as_module` can be configured by changing `output_change_scale` and
57 | `as_parameter`. `output_change_scale` can be set to `O(1)` for additional stability, but
58 | results in a performance drop in our experiments. `as_parameter` can be set to `False` to
59 | make `omega` (the shortcut connection scaler) as a constant (no updates). Their performance are listed
60 | as below:
61 |
62 | | Layer Number | Output Change | Omega | BLEU |
63 | |-----------------|---------------|-----------------|-------|
64 | | 6L-6L | O(1) | as a constant | 27.71 |
65 | | 6L-6L | O(1) | as a parameter | 27.79 |
66 | | 6L-6L | O(logn) | as a constant | 27.83 |
67 | | 6L-6L | O(logn) | as a parameter | 27.84 |
68 | | 18L-18L | O(1) | as a constant | 28.66 |
69 | | 18L-18L | O(1) | as a parameter | 28.89 |
70 | | 18L-18L | O(logn) | as a constant | 28.78 |
71 | | 18L-18L | O(logn) | as a parameter | 28.91 |
72 |
73 | # Comparison with original Admin and DeepNet on WMT'17 En-De
74 |
75 | We choose to make comparisons with DeepNet and the original Admin implementation on WMT'17 En-De,
76 | the dataset used in the DeepNet paper.
77 |
78 | We noticed that the training configuration in the DeepNet paper is different from the setting used
79 | in the original Admin repo. Their major difference is the batch size (i.e., regular batch size and
80 | huge batch size). We refer the setting used in the DeepNet paper as `Huge batch size (128x4096)`,
81 | and they refer the setting with changed batch size as `Regular batch size (8x4096)`.
82 |
83 | We can find that they can only work on their own settings.
84 |
85 | | | Regular batch size (8x4096) | Huge batch size (128x4096) |
86 | |---------------|--------------------|------------------|
87 | | [Original Admin](https://github.com/LiyuanLucasLiu/Transformer-Clinic)| ✅ | ❌ |
88 | | [DeepNet](https://arxiv.org/abs/2203.00555) | ❌ | ✅ |
89 | | `admin-torch` | ✅ | ✅ |
90 |
91 | Here, we re-implemented admin as `admin-torch`, and we can find that the new `admin-torch`
92 | implementation works well on both settings.
93 |
94 | All implementations are publicly released (elaborated as below).
95 |
96 |
97 | ## 1. Data Preparation
98 | Please refer to the DeepNet paper for data preparation. Here we used the same data shared by the
99 | DeepNet team.
100 |
101 | ## 2. Original Admin and DeepNet
102 |
103 | ### 2.1. Implementation Download and Code Install
104 | ```
105 | pip uninstall fairseq
106 | git clone https://github.com/LiyuanLucasLiu/Transformer-Clinic.git
107 | cd Transformer-Clinic/fairseq
108 | pip install --editable .
109 | ```
110 |
111 | ### 2.2. Training
112 |
113 | #### 2.2.1. Original Admin
114 | ```
115 | # Before running the training, the original admin requires to do a profilling
116 | # of the network. The profilling result for 100L-100L is included in this repo
117 | # (i.e., example/profile.ratio.init). The command to generate this profilling
118 | # can be found at https://github.com/LiyuanLucasLiu/Transformer-Clinic/blob/master/nmt-experiments/wmt14_en-de.md#100l-100l-admin-without-any-hyper-parameter-tuning
119 |
120 | # regular batch size (4096 x 8)
121 | bash train_wmt_en-de.sh $PATH-to-WMT17 100 $OUTPUT_PATH_REG "--init-type adaptive"
122 |
123 | # huge batch size (4096 x 128)
124 | bash train_wmt_en-de_huge.sh $PATH-to-WMT17 100 $OUTPUT_PATH_HUG "--init-type adaptive"
125 |
126 | # evaluate
127 | bash eval_wmt_en-de.sh $PATH-to-WMT17 none $OUTPUT_PATH_HUG/REG 45 10
128 | ```
129 |
130 | #### 2.2.2. DeepNet
131 | ```
132 | # regular batch size (4096 x 8)
133 | bash train_wmt_en-de.sh $PATH-to-WMT17 100 $OUTPUT_PATH_REG "--init-type deepnet"
134 |
135 | # huge batch size (4096 x 128)
136 | bash train_wmt_en-de_huge.sh $PATH-to-WMT17 100 $OUTPUT_PATH_HUG "--init-type deepnet"
137 |
138 | # evaluate
139 | bash eval_wmt_en-de.sh $PATH-to-WMT17 none $OUTPUT_PATH_HUG/REG 45 10
140 | ```
141 |
142 | ## 3 `torch-admin`
143 |
144 | ### 3.1 Package Install
145 |
146 | ```
147 | pip install admin_torch==0.1.0
148 | pip uninstall fairseq
149 | pip install https://github.com/LiyuanLucasLiu/fairseq/archive/refs/tags/admin-torch.zip
150 | ```
151 |
152 | ### 3.2 Training and Evaluation
153 |
154 | ```
155 | # regular batch size (4096 x 8)
156 | bash train_wmt_en-de.sh $PATH-to-WMT17 100 $OUTPUT_PATH_REG
157 |
158 | # huge batch size (4096 x 128)
159 | bash train_wmt_en-de_huge.sh $PATH-to-WMT17 100 $OUTPUT_PATH_HUG
160 |
161 | # evaluate
162 | bash eval_wmt_en-de.sh $PATH-to-WMT17 none $OUTPUT_PATH_HUG/REG 45 10
163 | ```
--------------------------------------------------------------------------------
/admin_torch/admin.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 |
4 | import torch
5 | import math
6 |
7 | class OmegaResidual(torch.nn.Module):
8 | """
9 | Residual connection module with shortcut connection rescaling.
10 |
11 | Parameters
12 | ----------
13 | init_value: ``float``, required.
14 | The initialization value of the shortcut connection rescalar, omega.
15 | as_parameter: ``bool``, optional (default = False).
16 | Whether to set the rescalar as trainable parameter. Note that, when set as trainable
17 | parameters, the rescalar would be set as a vector (similar to the weight vector in layer
18 | norm), and the embed_dim input is required.
19 | embed_dim: ``int``, optional (default = None).
20 | The hidden state dimension of the shortcut connection. This field is required and only used
21 | when ``as_parameter == True``.
22 | """
23 |
24 | def __init__(self, init_value, as_parameter=False, embed_dim=None):
25 | super().__init__()
26 | if as_parameter:
27 | assert embed_dim is not None, 'embed_dim is required when as_parameter is set as True'
28 | self.omega = torch.nn.Parameter(torch.ones(embed_dim))
29 | self.omega.data.fill_(init_value)
30 | self.forward = self.forward_omega
31 | else:
32 | self.register_buffer('omega', torch.FloatTensor([init_value]))
33 | if 1.0 == init_value:
34 | self.forward = self.forward_original
35 | else:
36 | self.forward = self.forward_omega
37 |
38 | def forward(self, x, f_x):
39 | """
40 | Calculate x * omega + f_x. The output shape would be same with the input shape.
41 |
42 | When omega is set to be a constant 1 (``as buffer`` and ``O(n)`` output change), the
43 | ``OmegaResidual`` would downgrade to the ordinary residual module and x + f_x would be
44 | calculated instead.
45 | """
46 | raise NotImplementedError("Placeholder forward function used in OmegaResidual")
47 |
48 | def forward_original(self, x, f_x):
49 | return x + f_x
50 |
51 | def forward_omega(self, x, f_x):
52 | return x * self.omega + f_x
53 |
54 | def calculate_init(
55 | num_res_layers,
56 | output_change_scale='O(logn)',
57 | ) -> int:
58 | r"""
59 | Calculate initialization for omega.
60 |
61 | Parameters
62 | ----------
63 | num_res_layers: ``int``, required.
64 | The total number of residual layers. Typical n-layer Transformer encoder has 2n residual layers.
65 | output_change_scale: ``str``, optional (default = ``'O(logn)'``).
66 | The desired output change scale at initialization. Only ``'O(n)'``, ``'O(logn)'`` / ``'default'``,
67 | and ``'O(1)'`` are supported.
68 |
69 | Returns
70 | -------
71 | int: It would return the initialization value.
72 | """
73 | if 'O(logn)' == output_change_scale or 'default' == output_change_scale:
74 | omega_value = (num_res_layers + 1) / math.log(num_res_layers + 1) - 1
75 | elif 'O(n)' == output_change_scale:
76 | omega_value = 1.
77 | else:
78 | assert 'O(1)' == output_change_scale, \
79 | 'only O(n), O(logn), and O(1) output changes are supported.'
80 | omega_value = num_res_layers
81 | return omega_value ** 0.5
82 |
83 | def as_module(
84 | num_res_layers,
85 | output_change_scale='default',
86 | as_parameter=False,
87 | embed_dim=None
88 | ) -> OmegaResidual:
89 | r"""
90 | Calculate initialization for omega and return a residual module with the initialized omega.
91 |
92 | Parameters
93 | ----------
94 | num_res_layers: ``int``, required.
95 | The total number of residual layers. Typical n-layer Transformer encoder has 2n residual layers.
96 | output_change_scale: ``str``, optional (default = ``'O(logn)'``).
97 | The desired output change scale at initialization. Only ``'O(n)'``, ``'O(logn)'`` / ``'default'``,
98 | and ``'O(1)'`` are supported.
99 | as_parameter: ``bool``, optional (default = False).
100 | Whether to set the rescalar as trainable parameter. Note that, when set as trainable
101 | parameters, the rescalar would be set as a vector (similar to the weight vector in
102 | layer norm), and the embed_dim input is required.
103 | embed_dim: ``int``, optional (default = None).
104 | The hidden state dimension of the shortcut connection. This field is required and only
105 | used when as_parameter == True.
106 |
107 | Returns
108 | -------
109 | admin_torch.OmegaResidual: It would return a ``OmegaResidual`` module with the properly initialized omega inside.
110 |
111 | Example
112 | -------
113 |
114 | .. highlight:: python
115 | .. code-block:: python
116 |
117 | import torch.nn as nn
118 | import admin_torch
119 |
120 | class TransformerEncoderLayer(nn.Module):
121 |
122 | def __init__(self, cfg):
123 | super().__init__()
124 |
125 | num_layer = 2 * cfg.encoder_layers # number of residual layers
126 |
127 | self.attn = nn.MultiheadAttention(cfg.embed_dim, cfg.num_heads)
128 | self.residual_attn = admin_torch.as_module(num_layer)
129 | self.ln_attn = nn.LayerNorm(cfg.embed_dim)
130 |
131 | self.ffn = nn.Sequential(
132 | nn.Linear(cfg.embed_dim, cfg.feedforward_dim),
133 | nn.ReLU(),
134 | nn.Linear(cfg.feedforward_dim)
135 | )
136 | self.residual_ffn = admin_torch.as_module(num_layer)
137 | self.ln_ffn = nn.LayerNorm(cfg.embed_dim)
138 |
139 | def forward(self, x):
140 |
141 | f_x, _ = self.attn(x)
142 | x = self.residual_attn(x, f_x)
143 | x = self.ln_attn(x)
144 |
145 | f_x = self.ffn(x)
146 | x = self.residual_ffn(x, f_x)
147 | x = self.ln_ffn(x)
148 |
149 | return x
150 | """
151 | omega_value = calculate_init(num_res_layers, output_change_scale)
152 | return OmegaResidual(omega_value, as_parameter=as_parameter, embed_dim=embed_dim)
153 |
154 | def as_buffer(
155 | network,
156 | buffer_name,
157 | num_res_layers,
158 | output_change_scale='default',
159 | ) -> None:
160 | r"""
161 | Calculate initialization for omega and *register* omega as a buffer (not trainable).
162 |
163 | Parameters
164 | ----------
165 | network: ``torch.nn.Module``, required.
166 | The ``torch.nn.Module`` contains the residual network. This is where the omega would
167 | be registered to.
168 | buffer_name: ``str``, required.
169 | The name of omega (as buffer). The omega can be accessed in the network, using the
170 | given name.
171 | num_res_layers: ``int``, required.
172 | The total number of residual layers. Typical n-layer Transformer encoder has 2n residual layers.
173 | output_change_scale: ``str``, optional (default = ``'O(logn)'``).
174 | The desired output change scale at initialization. Only ``'O(n)'``, ``'O(logn)'`` / ``'default'``,
175 | and ``'O(1)'`` are supported.
176 |
177 | Returns
178 | -------
179 | None: No returns. The initialized omega would be registered as a buffer within `network`.
180 |
181 | Example
182 | -------
183 |
184 | .. highlight:: python
185 | .. code-block:: python
186 |
187 | import torch.nn as nn
188 | import admin_torch
189 |
190 | class TransformerEncoderLayer(nn.Module):
191 |
192 | def __init__(self, cfg):
193 | super().__init__()
194 |
195 | num_layer = 2 * cfg.encoder_layers # number of residual layers
196 |
197 | self.attn = nn.MultiheadAttention(cfg.embed_dim, cfg.num_heads)
198 | admin_torch.as_buffer(self, 'attn_omega', num_layer)
199 | self.ln_attn = nn.LayerNorm(cfg.embed_dim)
200 |
201 | self.ffn = nn.Sequential(
202 | nn.Linear(cfg.embed_dim, cfg.feedforward_dim),
203 | nn.ReLU(),
204 | nn.Linear(cfg.feedforward_dim)
205 | )
206 | admin_torch.as_buffer(self, 'ffn_omega', num_layer)
207 | self.ln_ffn = nn.LayerNorm(cfg.embed_dim)
208 |
209 | def forward(self, x):
210 |
211 | f_x, _ = self.attn(x)
212 | x = x * self.attn_omega + f_x
213 | x = self.ln_attn(x)
214 |
215 | f_x = self.ffn(x)
216 | x = x * self.ffn_omega + f_x
217 | x = self.ln_ffn(x)
218 |
219 | return x
220 | """
221 | assert isinstance(network, torch.nn.Module), \
222 | 'the input network has to be a torch.nn.Module object'
223 | omega_value = calculate_init(num_res_layers, output_change_scale)
224 | network.register_buffer(buffer_name, torch.FloatTensor([omega_value]))
225 |
226 | def as_parameter(
227 | network,
228 | parameter_name,
229 | num_res_layers,
230 | embed_dim,
231 | output_change_scale='default',
232 | ) -> None:
233 | r"""
234 | Calculate initialization for omega and *register* omega as a parameter (trainable).
235 |
236 | Parameters
237 | ----------
238 | network: ``torch.nn.Module``, required.
239 | The ``torch.nn.Module`` contains the residual network. This is where the omega would
240 | be registered to.
241 | parameter_name: ``str``, required.
242 | The name of omega (as parameter). The omega can be accessed in the network, using the
243 | given name.
244 | num_res_layers: ``int``, required.
245 | The total number of residual layers. Typical n-layer Transformer encoder has 2n residual layers.
246 | embed_dim: ``int``, required.
247 | The hidden state dimension of the shortcut connection.
248 | output_change_scale: ``str``, optional (default = ``'O(logn)'``).
249 | The desired output change scale at initialization. Only ``'O(n)'``, ``'O(logn)'`` / ``'default'``,
250 | and ``'O(1)'`` are supported.
251 |
252 | Returns
253 | -------
254 | None: No returns. The initialized omega would be registered as a parameter within `network`.
255 |
256 | Example
257 | -------
258 |
259 | .. highlight:: python
260 | .. code-block:: python
261 |
262 | import torch.nn as nn
263 | import admin_torch
264 |
265 | class TransformerEncoderLayer(nn.Module):
266 |
267 | def __init__(self, cfg):
268 | super().__init__()
269 |
270 | num_layer = 2 * cfg.encoder_layers # number of residual layers
271 |
272 | self.attn = nn.MultiheadAttention(cfg.embed_dim, cfg.num_heads)
273 | admin_torch.as_parameter(self, 'attn_omega', num_layer, cfg.embed_dim)
274 | self.ln_attn = nn.LayerNorm(cfg.embed_dim)
275 |
276 | self.ffn = nn.Sequential(
277 | nn.Linear(cfg.embed_dim, cfg.feedforward_dim),
278 | nn.ReLU(),
279 | nn.Linear(cfg.feedforward_dim)
280 | )
281 | admin_torch.as_parameter(self, 'ffn_omega', num_layer, cfg.embed_dim)
282 | self.ln_ffn = nn.LayerNorm(cfg.embed_dim)
283 |
284 | def forward(self, x):
285 |
286 | f_x, _ = self.attn(x)
287 | x = x * self.attn_omega + f_x
288 | x = self.ln_attn(x)
289 |
290 | f_x = self.ffn(x)
291 | x = x * self.ffn_omega + f_x
292 | x = self.ln_ffn(x)
293 |
294 | return x
295 | """
296 | omega_vector = torch.ones(embed_dim)
297 | omega_vector.data.fill_(calculate_init(num_res_layers, output_change_scale))
298 | network.register_parameter(parameter_name,torch.nn.Parameter(omega_vector))
299 |
--------------------------------------------------------------------------------
/example/profile.ratio.init:
--------------------------------------------------------------------------------
1 | 1 1.0
2 | 2 1.3190397024154663
3 | 3 1.4550316333770752
4 | 4 1.5123295783996582
5 | 5 1.6428167819976807
6 | 6 1.695853352546692
7 | 7 1.8099972009658813
8 | 8 1.8667889833450317
9 | 9 1.9713013172149658
10 | 10 2.028479814529419
11 | 11 2.1205849647521973
12 | 12 2.1734628677368164
13 | 13 2.2616868019104004
14 | 14 2.3189287185668945
15 | 15 2.404047966003418
16 | 16 2.456615924835205
17 | 17 2.5324201583862305
18 | 18 2.5882620811462402
19 | 19 2.669269561767578
20 | 20 2.7247939109802246
21 | 21 2.7918922901153564
22 | 22 2.8465681076049805
23 | 23 2.9145007133483887
24 | 24 2.961702585220337
25 | 25 3.027456045150757
26 | 26 3.080919027328491
27 | 27 3.1440911293029785
28 | 28 3.198875904083252
29 | 29 3.2594544887542725
30 | 30 3.306119203567505
31 | 31 3.3605387210845947
32 | 32 3.4074313640594482
33 | 33 3.4670989513397217
34 | 34 3.5158419609069824
35 | 35 3.573145627975464
36 | 36 3.621101140975952
37 | 37 3.6756269931793213
38 | 38 3.723588705062866
39 | 39 3.7751662731170654
40 | 40 3.820997714996338
41 | 41 3.874152183532715
42 | 42 3.919919729232788
43 | 43 3.971492052078247
44 | 44 4.015556335449219
45 | 45 4.062773704528809
46 | 46 4.102989673614502
47 | 47 4.147921562194824
48 | 48 4.188112735748291
49 | 49 4.236110210418701
50 | 50 4.280117988586426
51 | 51 4.3282270431518555
52 | 52 4.3700642585754395
53 | 53 4.413156986236572
54 | 54 4.457295894622803
55 | 55 4.502137184143066
56 | 56 4.543152332305908
57 | 57 4.5831618309021
58 | 58 4.6219892501831055
59 | 59 4.665818214416504
60 | 60 4.704627513885498
61 | 61 4.7454071044921875
62 | 62 4.782839298248291
63 | 63 4.824032783508301
64 | 64 4.865396499633789
65 | 65 4.905492782592773
66 | 66 4.944614410400391
67 | 67 4.986900329589844
68 | 68 5.027820110321045
69 | 69 5.068670272827148
70 | 70 5.105625152587891
71 | 71 5.1475629806518555
72 | 72 5.185201168060303
73 | 73 5.222909450531006
74 | 74 5.258800029754639
75 | 75 5.293256759643555
76 | 76 5.3247480392456055
77 | 77 5.362242221832275
78 | 78 5.398958206176758
79 | 79 5.436041831970215
80 | 80 5.472423076629639
81 | 81 5.50972318649292
82 | 82 5.542351722717285
83 | 83 5.580074310302734
84 | 84 5.614049434661865
85 | 85 5.647655487060547
86 | 86 5.685954570770264
87 | 87 5.72280740737915
88 | 88 5.756107807159424
89 | 89 5.7939252853393555
90 | 90 5.8227972984313965
91 | 91 5.860756874084473
92 | 92 5.893098831176758
93 | 93 5.931689262390137
94 | 94 5.965618133544922
95 | 95 6.000150203704834
96 | 96 6.031996726989746
97 | 97 6.064528465270996
98 | 98 6.096097469329834
99 | 99 6.130829334259033
100 | 100 6.1620001792907715
101 | 101 6.192485809326172
102 | 102 6.223220348358154
103 | 103 6.255505084991455
104 | 104 6.285265922546387
105 | 105 6.317674160003662
106 | 106 6.347247123718262
107 | 107 6.380263805389404
108 | 108 6.4059295654296875
109 | 109 6.4350199699401855
110 | 110 6.463878631591797
111 | 111 6.495331287384033
112 | 112 6.530038356781006
113 | 113 6.560511112213135
114 | 114 6.592663288116455
115 | 115 6.6225266456604
116 | 116 6.649158477783203
117 | 117 6.678005695343018
118 | 118 6.706629753112793
119 | 119 6.735388278961182
120 | 120 6.765207767486572
121 | 121 6.796474933624268
122 | 122 6.829586505889893
123 | 123 6.8563995361328125
124 | 124 6.884836196899414
125 | 125 6.913089752197266
126 | 126 6.945504665374756
127 | 127 6.972911834716797
128 | 128 7.0005574226379395
129 | 129 7.029700756072998
130 | 130 7.057539463043213
131 | 131 7.087018966674805
132 | 132 7.115658283233643
133 | 133 7.145423889160156
134 | 134 7.176324844360352
135 | 135 7.203213214874268
136 | 136 7.2326130867004395
137 | 137 7.263206481933594
138 | 138 7.290313243865967
139 | 139 7.316440105438232
140 | 140 7.342780590057373
141 | 141 7.371157646179199
142 | 142 7.401747226715088
143 | 143 7.428928375244141
144 | 144 7.455725193023682
145 | 145 7.482446670532227
146 | 146 7.509575843811035
147 | 147 7.534206390380859
148 | 148 7.561554431915283
149 | 149 7.5872907638549805
150 | 150 7.610199928283691
151 | 151 7.634483814239502
152 | 152 7.664854526519775
153 | 153 7.689417362213135
154 | 154 7.711843490600586
155 | 155 7.73599910736084
156 | 156 7.759965896606445
157 | 157 7.788935661315918
158 | 158 7.814115524291992
159 | 159 7.835615158081055
160 | 160 7.8606743812561035
161 | 161 7.882746696472168
162 | 162 7.907750606536865
163 | 163 7.9333109855651855
164 | 164 7.958787441253662
165 | 165 7.981550693511963
166 | 166 8.011580467224121
167 | 167 8.03337287902832
168 | 168 8.056806564331055
169 | 169 8.079192161560059
170 | 170 8.103809356689453
171 | 171 8.125045776367188
172 | 172 8.15402889251709
173 | 173 8.181538581848145
174 | 174 8.207011222839355
175 | 175 8.227130889892578
176 | 176 8.252174377441406
177 | 177 8.272038459777832
178 | 178 8.298123359680176
179 | 179 8.317887306213379
180 | 180 8.347379684448242
181 | 181 8.365970611572266
182 | 182 8.39012336730957
183 | 183 8.413267135620117
184 | 184 8.441601753234863
185 | 185 8.469433784484863
186 | 186 8.492582321166992
187 | 187 8.515233993530273
188 | 188 8.537558555603027
189 | 189 8.561620712280273
190 | 190 8.58764362335205
191 | 191 8.611865043640137
192 | 192 8.638022422790527
193 | 193 8.662741661071777
194 | 194 8.689210891723633
195 | 195 8.71304702758789
196 | 196 8.739282608032227
197 | 197 8.76534366607666
198 | 198 8.791733741760254
199 | 199 8.812337875366211
200 | 200 8.836019515991211
201 | 1 1.0
202 | 2 1.3425869941711426
203 | 3 1.4800665378570557
204 | 4 1.610133409500122
205 | 5 1.6861705780029297
206 | 6 1.8158448934555054
207 | 7 1.9169756174087524
208 | 8 1.9898321628570557
209 | 9 2.103006362915039
210 | 10 2.198607921600342
211 | 11 2.2673819065093994
212 | 12 2.379521131515503
213 | 13 2.453937292098999
214 | 14 2.521658420562744
215 | 15 2.6081600189208984
216 | 16 2.6755027770996094
217 | 17 2.7483041286468506
218 | 18 2.8297157287597656
219 | 19 2.905391216278076
220 | 20 2.9680662155151367
221 | 21 3.0330522060394287
222 | 22 3.1018290519714355
223 | 23 3.178995132446289
224 | 24 3.243086099624634
225 | 25 3.304431438446045
226 | 26 3.3670051097869873
227 | 27 3.4263482093811035
228 | 28 3.48586368560791
229 | 29 3.5485644340515137
230 | 30 3.60994553565979
231 | 31 3.658292293548584
232 | 32 3.7145533561706543
233 | 33 3.764017343521118
234 | 34 3.820631742477417
235 | 35 3.8807389736175537
236 | 36 3.927197217941284
237 | 37 3.9778075218200684
238 | 38 4.032394886016846
239 | 39 4.079466342926025
240 | 40 4.121256351470947
241 | 41 4.179348945617676
242 | 42 4.235202789306641
243 | 43 4.276139736175537
244 | 44 4.322049617767334
245 | 45 4.375305652618408
246 | 46 4.421659469604492
247 | 47 4.465145111083984
248 | 48 4.511823654174805
249 | 49 4.558897972106934
250 | 50 4.612059116363525
251 | 51 4.6558332443237305
252 | 52 4.701802730560303
253 | 53 4.749824523925781
254 | 54 4.791557788848877
255 | 55 4.836613655090332
256 | 56 4.882650375366211
257 | 57 4.921323776245117
258 | 58 4.962111949920654
259 | 59 4.998560905456543
260 | 60 5.037952899932861
261 | 61 5.07641077041626
262 | 62 5.117368698120117
263 | 63 5.156766414642334
264 | 64 5.189724445343018
265 | 65 5.232119083404541
266 | 66 5.274285316467285
267 | 67 5.309042453765869
268 | 68 5.348519325256348
269 | 69 5.385103225708008
270 | 70 5.4217658042907715
271 | 71 5.458439350128174
272 | 72 5.49429988861084
273 | 73 5.532592296600342
274 | 74 5.5714898109436035
275 | 75 5.615612030029297
276 | 76 5.6548895835876465
277 | 77 5.69318962097168
278 | 78 5.726015090942383
279 | 79 5.7607831954956055
280 | 80 5.803152084350586
281 | 81 5.840878963470459
282 | 82 5.873714447021484
283 | 83 5.916788578033447
284 | 84 5.954561233520508
285 | 85 5.986266613006592
286 | 86 6.01418924331665
287 | 87 6.048747539520264
288 | 88 6.081315517425537
289 | 89 6.119509220123291
290 | 90 6.158118724822998
291 | 91 6.1867289543151855
292 | 92 6.221379280090332
293 | 93 6.256557464599609
294 | 94 6.288797855377197
295 | 95 6.326879024505615
296 | 96 6.361721038818359
297 | 97 6.393381118774414
298 | 98 6.428704261779785
299 | 99 6.463216304779053
300 | 100 6.492666244506836
301 | 101 6.531671524047852
302 | 102 6.56462287902832
303 | 103 6.595141410827637
304 | 104 6.63072395324707
305 | 105 6.664488792419434
306 | 106 6.691591262817383
307 | 107 6.724743843078613
308 | 108 6.752674102783203
309 | 109 6.781800270080566
310 | 110 6.8150529861450195
311 | 111 6.846014022827148
312 | 112 6.875141620635986
313 | 113 6.913217544555664
314 | 114 6.9485979080200195
315 | 115 6.978128433227539
316 | 116 7.006283283233643
317 | 117 7.036981105804443
318 | 118 7.06494665145874
319 | 119 7.097784519195557
320 | 120 7.124679088592529
321 | 121 7.151909828186035
322 | 122 7.189268112182617
323 | 123 7.214508056640625
324 | 124 7.244059085845947
325 | 125 7.272270202636719
326 | 126 7.302449703216553
327 | 127 7.325904846191406
328 | 128 7.352625370025635
329 | 129 7.382406234741211
330 | 130 7.409889221191406
331 | 131 7.441695213317871
332 | 132 7.465174198150635
333 | 133 7.490511894226074
334 | 134 7.52184534072876
335 | 135 7.552266597747803
336 | 136 7.576979160308838
337 | 137 7.60615873336792
338 | 138 7.631077766418457
339 | 139 7.659272193908691
340 | 140 7.689055442810059
341 | 141 7.714266777038574
342 | 142 7.740371227264404
343 | 143 7.770144462585449
344 | 144 7.797163963317871
345 | 145 7.825152397155762
346 | 146 7.850924015045166
347 | 147 7.8776116371154785
348 | 148 7.905229568481445
349 | 149 7.938175678253174
350 | 150 7.963441371917725
351 | 151 7.989377021789551
352 | 152 8.015571594238281
353 | 153 8.03981876373291
354 | 154 8.061357498168945
355 | 155 8.089569091796875
356 | 156 8.110694885253906
357 | 157 8.13357162475586
358 | 158 8.16172981262207
359 | 159 8.186487197875977
360 | 160 8.212444305419922
361 | 161 8.237144470214844
362 | 162 8.259098052978516
363 | 163 8.284126281738281
364 | 164 8.307903289794922
365 | 165 8.32878303527832
366 | 166 8.352269172668457
367 | 167 8.380142211914062
368 | 168 8.406278610229492
369 | 169 8.428736686706543
370 | 170 8.453876495361328
371 | 171 8.476970672607422
372 | 172 8.50069808959961
373 | 173 8.524378776550293
374 | 174 8.547581672668457
375 | 175 8.568750381469727
376 | 176 8.596118927001953
377 | 177 8.616921424865723
378 | 178 8.64217472076416
379 | 179 8.666587829589844
380 | 180 8.689314842224121
381 | 181 8.712116241455078
382 | 182 8.737107276916504
383 | 183 8.7545166015625
384 | 184 8.781569480895996
385 | 185 8.804463386535645
386 | 186 8.828217506408691
387 | 187 8.84929370880127
388 | 188 8.87789535522461
389 | 189 8.89671516418457
390 | 190 8.919512748718262
391 | 191 8.948515892028809
392 | 192 8.968647956848145
393 | 193 8.989168167114258
394 | 194 9.019471168518066
395 | 195 9.040534019470215
396 | 196 9.059708595275879
397 | 197 9.086166381835938
398 | 198 9.106014251708984
399 | 199 9.12833309173584
400 | 200 9.152287483215332
401 | 201 9.17170524597168
402 | 202 9.192819595336914
403 | 203 9.222025871276855
404 | 204 9.242000579833984
405 | 205 9.262640953063965
406 | 206 9.292340278625488
407 | 207 9.312517166137695
408 | 208 9.332563400268555
409 | 209 9.362652778625488
410 | 210 9.37942886352539
411 | 211 9.403473854064941
412 | 212 9.430370330810547
413 | 213 9.448878288269043
414 | 214 9.467549324035645
415 | 215 9.49483585357666
416 | 216 9.513956069946289
417 | 217 9.536771774291992
418 | 218 9.561524391174316
419 | 219 9.583642959594727
420 | 220 9.608484268188477
421 | 221 9.632624626159668
422 | 222 9.656436920166016
423 | 223 9.676606178283691
424 | 224 9.702705383300781
425 | 225 9.725610733032227
426 | 226 9.74528980255127
427 | 227 9.76707935333252
428 | 228 9.788287162780762
429 | 229 9.808671951293945
430 | 230 9.835151672363281
431 | 231 9.85518741607666
432 | 232 9.870915412902832
433 | 233 9.89185905456543
434 | 234 9.909582138061523
435 | 235 9.925028800964355
436 | 236 9.948447227478027
437 | 237 9.964569091796875
438 | 238 9.98108959197998
439 | 239 10.01058292388916
440 | 240 10.029886245727539
441 | 241 10.049802780151367
442 | 242 10.071462631225586
443 | 243 10.090588569641113
444 | 244 10.113792419433594
445 | 245 10.143721580505371
446 | 246 10.159034729003906
447 | 247 10.17558479309082
448 | 248 10.193281173706055
449 | 249 10.21686840057373
450 | 250 10.241169929504395
451 | 251 10.26410961151123
452 | 252 10.285745620727539
453 | 253 10.306694984436035
454 | 254 10.328060150146484
455 | 255 10.34904956817627
456 | 256 10.371853828430176
457 | 257 10.396636962890625
458 | 258 10.417922019958496
459 | 259 10.433265686035156
460 | 260 10.45438003540039
461 | 261 10.473858833312988
462 | 262 10.496040344238281
463 | 263 10.512752532958984
464 | 264 10.536433219909668
465 | 265 10.55765438079834
466 | 266 10.574774742126465
467 | 267 10.599035263061523
468 | 268 10.620415687561035
469 | 269 10.63953971862793
470 | 270 10.661383628845215
471 | 271 10.681900024414062
472 | 272 10.70407772064209
473 | 273 10.722298622131348
474 | 274 10.737654685974121
475 | 275 10.756811141967773
476 | 276 10.776389122009277
477 | 277 10.792834281921387
478 | 278 10.81033992767334
479 | 279 10.83313274383545
480 | 280 10.854608535766602
481 | 281 10.872381210327148
482 | 282 10.895024299621582
483 | 283 10.916109085083008
484 | 284 10.934562683105469
485 | 285 10.954834938049316
486 | 286 10.973169326782227
487 | 287 11.00400447845459
488 | 288 11.027678489685059
489 | 289 11.048644065856934
490 | 290 11.068979263305664
491 | 291 11.088113784790039
492 | 292 11.103851318359375
493 | 293 11.122899055480957
494 | 294 11.144107818603516
495 | 295 11.157708168029785
496 | 296 11.176496505737305
497 | 297 11.197405815124512
498 | 298 11.212449073791504
499 | 299 11.23056411743164
500 | 300 11.249613761901855
501 |
--------------------------------------------------------------------------------