├── HISTORY.rst
├── doc
│   ├── source
│   │   ├── _static
│   │   │   └── output_change.png
│   │   ├── contents.rst
│   │   ├── index.rst
│   │   └── conf.py
│   └── Makefile
├── admin_torch
│   ├── __init__.py
│   └── admin.py
├── MANIFEST.in
├── CODE_OF_CONDUCT.md
├── .gitignore
├── example
│   ├── eval_wmt_en-de.sh
│   ├── train_wmt_en-de_huge_batch.sh
│   ├── train_big_wmt_en-de_huge_batch.sh
│   ├── train_wmt_en-de.sh
│   ├── average_checkpoints.py
│   ├── README.md
│   └── profile.ratio.init
├── LICENSE
├── setup.py
├── SUPPORT.md
├── .github
│   └── workflows
│       └── codeql-analysis.yml
├── SECURITY.md
└── README.md

--------------------------------------------------------------------------------
/HISTORY.rst:
--------------------------------------------------------------------------------
1 | History
2 | =======
3 | 
4 | 0.1.0 (2022/3/3)
5 | ------------------
6 | * implemented Admin
--------------------------------------------------------------------------------
/doc/source/_static/output_change.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/admin-torch/main/doc/source/_static/output_change.png
--------------------------------------------------------------------------------
/admin_torch/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 | 
4 | __author__ = "Liyuan Liu"
5 | 
6 | __maintainer__ = "Liyuan Liu"
7 | __email__ = "llychinalz@gmail.com"
8 | 
9 | from admin_torch.admin import *
--------------------------------------------------------------------------------
/doc/source/contents.rst:
--------------------------------------------------------------------------------
1 | .. Admin-Torch documentation file.
2 | 
3 | :github_url: https://github.com/microsoft/admin-torch
4 | 
5 | *************************
6 | Admin-Torch documentation
7 | *************************
8 | 
9 | .. toctree::
10 |    :maxdepth: 2
11 | 
12 |    index
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | # Include the README
2 | include *.md
3 | 
4 | # Include the license file
5 | include LICENSE
6 | 
7 | # Include the history
8 | include HISTORY.rst
9 | 
10 | # Include image
11 | include doc/source/_static/output_change.png
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Microsoft Open Source Code of Conduct
2 | 
3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4 | 
5 | Resources:
6 | 
7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
--------------------------------------------------------------------------------
/doc/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 | 
4 | # You can set these variables from the command line.
5 | SPHINXOPTS    =
6 | SPHINXBUILD   = python -msphinx
7 | SPHINXPROJ    = Admin
8 | SOURCEDIR     = source
9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # yaml files 7 | *.yaml 8 | 9 | # aml files 10 | .amltconfig 11 | .amltignore 12 | 13 | # macOS dir files 14 | .DS_Store 15 | 16 | # Distribution / packaging 17 | .Python 18 | env/ 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # PyBuilder 46 | target/ 47 | 48 | # Jupyter Notebook 49 | .ipynb_checkpoints 50 | 51 | # pyenv 52 | .python-version 53 | 54 | # dotenv 55 | .env 56 | 57 | # mypy 58 | .mypy_cache/ 59 | -------------------------------------------------------------------------------- /example/eval_wmt_en-de.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | DATADIR=${1:-"./wmt14_en_de_joined_dict"} 3 | MODELDIR=${2:-"None"} 4 | 5 | SAVEDIR=${3:-"None"} 6 | UPPER_BOUND=${4:-100} 7 | CP_POINT_NUM=${5:-10} 8 | 9 | if [[ $MODELDIR == "None" ]] 10 | then 11 | if [[ $SAVEDIR == "None" ]] 12 | then 13 | echo "SAVEDIR and MODELDIR cannot be None at the same time." 14 | exit 15 | fi 16 | MODELDIR=$SAVEDIR/model_${UPPER_BOUND}_${CP_POINT_NUM}.pt 17 | if [ -f $MODELDIR ]; then 18 | echo $MODELDIR "already exists" 19 | else 20 | echo "Start averaging model" 21 | python average_checkpoints.py --inputs $SAVEDIR --num-epoch-checkpoints ${CP_POINT_NUM} --output $MODELDIR --checkpoint-upper-bound $UPPER_BOUND | grep 'Finish' 22 | echo "End averaging model" 23 | fi 24 | fi 25 | 26 | echo "Model path" $MODELDIR 27 | 28 | CUDA_VISIBLE_DEVICES=0 fairseq-generate $DATADIR \ 29 | --path $MODELDIR \ 30 | --batch-size 128 --beam 4 --lenpen 0.6 --remove-bpe \ 31 | --quiet --fp16 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | from setuptools import setup, find_packages
5 | 
6 | def read_readme():
7 |     with open('README.md') as f:
8 |         return f.read()
9 | 
10 | with open('HISTORY.rst') as history_file:
11 |     history = history_file.read()
12 | 
13 | requirements = [
14 |     'torch'
15 | ]
16 | 
17 | setup(
18 |     name='admin_torch',
19 |     version='0.1.0',
20 |     description='Plug-in-and-Play Toolbox for Stabilizing Transformer Training',
21 |     long_description=read_readme(),
22 |     long_description_content_type="text/markdown",
23 |     author='Lucas Liu',
24 |     author_email='llychinalz@gmail.com',
25 |     url='https://github.com/microsoft/admin-torch',
26 |     packages=find_packages(exclude=['docs']),
27 |     include_package_data=True,
28 |     install_requires=requirements,
29 |     license='MIT',
30 |     zip_safe=False,
31 |     classifiers=[
32 |         'Development Status :: 2 - Pre-Alpha',
33 |         'Intended Audience :: Developers',
34 |         'Natural Language :: English',
35 |         'Programming Language :: Python :: 3.7',
36 |         'Programming Language :: Python :: 3.8',
37 |         'Programming Language :: Python :: 3.9',
38 |     ]
39 | )
40 | 
41 | # python setup.py sdist bdist_wheel --universal
42 | # twine upload dist/*
--------------------------------------------------------------------------------
/SUPPORT.md:
--------------------------------------------------------------------------------
1 | # TODO: The maintainer of this repo has not yet edited this file
2 | 
3 | **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?
4 | 
5 | - **No CSS support:** Fill out this template with information about how to file issues and get help.
6 | - **Yes CSS support:** Fill out an intake form at [aka.ms/spot](https://aka.ms/spot). CSS will work with/help you to determine next steps. More details also available at [aka.ms/onboardsupport](https://aka.ms/onboardsupport).
7 | - **Not sure?** Fill out a SPOT intake as though the answer were "Yes". CSS will help you decide.
8 | 
9 | *Then remove this first heading from this SUPPORT.MD file before publishing your repo.*
10 | 
11 | # Support
12 | 
13 | ## How to file issues and get help
14 | 
15 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing
16 | issues before filing new issues to avoid duplicates. For new issues, file your bug or
17 | feature request as a new Issue.
18 | 
19 | For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE
20 | FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
21 | CHANNEL. WHERE WILL YOU HELP PEOPLE?**.
22 | 
23 | ## Microsoft Support Policy
24 | 
25 | Support for this **PROJECT or PRODUCT** is limited to the resources listed above.
26 | 
--------------------------------------------------------------------------------
/doc/source/index.rst:
--------------------------------------------------------------------------------
1 | .. Admin-Torch documentation file.
2 | 
3 | :github_url: https://github.com/microsoft/admin-torch
4 | 
5 | *************************
6 | Admin-Torch documentation
7 | *************************
8 | 
9 | A plug-in-and-play PyTorch wrapper for `Adaptive model initialization (Admin)`__.
10 | 
11 | For a neural network f, an input x, and a randomly initialized weight w, we describe its stability
12 | (``output_change_scale``) as
13 | 
14 | .. math:: E[|f(x, w) - f(x, w + \delta)|_2^2], \mbox{where } \delta \mbox{ is a random perturbation.}
15 | 
16 | In `our study`__, we show that an original n-layer Transformer's ``output_change_scale`` is ``O(n)``,
17 | which destabilizes its training. Admin stabilizes Transformer training by regulating this scale to
18 | ``O(logn)`` or ``O(1)``. We keep ``O(logn)`` as the ``default`` setting, which handles most scenarios.
19 | If additional stability is needed, set ``output_change_scale`` to ``O(1)`` instead. A short usage sketch is included at the end of this page.
20 | 
21 | __ https://arxiv.org/abs/2004.08249
22 | __ https://arxiv.org/abs/2004.08249
23 | 
24 | 
25 | admin_torch\.as_module()
26 | ===============================
27 | .. autofunction:: admin_torch.as_module
28 | 
29 | 
30 | admin_torch\.as_parameter()
31 | ===============================
32 | .. autofunction:: admin_torch.as_parameter
33 | 
34 | 
35 | admin_torch\.as_buffer()
36 | ===============================
37 | .. autofunction:: admin_torch.as_buffer
38 | 
39 | 
40 | admin_torch\.OmegaResidual
41 | ================================
42 | .. autoclass:: admin_torch.OmegaResidual
43 |    :members:
44 | 
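Usage sketch
===============================

A minimal sketch of the intended wiring (mirroring the ``as_module`` example documented above; the layer count here is an arbitrary placeholder):

.. code-block:: python

    import admin_torch

    num_res_layers = 2 * 6  # a 6-layer encoder has two residual connections per layer
    residual = admin_torch.as_module(num_res_layers)  # O(logn) output change, the default
    residual_stable = admin_torch.as_module(num_res_layers, output_change_scale='O(1)')
    # in forward(): x = residual(x, f_x) replaces x = x + f_x, before the layer norm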
--------------------------------------------------------------------------------
/example/train_wmt_en-de_huge_batch.sh:
--------------------------------------------------------------------------------
1 | DATA_PATH=${1:-"./wmt14_en_de_joined_dict"}
2 | LAYERS=${2:-18}
3 | OUTPUT_PATH=${3:-"./admin_18L_asParameter"}
4 | ADDITIONAL_ARGS=${4:-"--encoder-as-parameter --decoder-as-parameter --share-all-embeddings"}
5 | 
6 | TOKENS=2048
7 | DEVICE_NUMBER=8
8 | FREQ=32
9 | 
10 | NUMBER_OF_GPUS=$(nvidia-smi --list-gpus | wc -l)
11 | if [[ $NUMBER_OF_GPUS != $DEVICE_NUMBER ]]
12 | then
13 | echo "This script is written for $DEVICE_NUMBER GPUs, but only $NUMBER_OF_GPUS were found."
14 | echo "Please modify TOKENS, DEVICE_NUMBER, and FREQ in the script accordingly."
15 | echo
16 | echo "Note that you need to keep device_number * tokens * freq = 524288 (i.e., 128 x 4096)."
17 | exit
18 | fi
19 | 
20 | DEVICE_LIST=$(( DEVICE_NUMBER - 1 ))
21 | DEVICE_LIST=$(seq -s "," 0 $DEVICE_LIST)
22 | 
23 | echo "Using GPUs $DEVICE_LIST for training"
24 | 
25 | CUDA_VISIBLE_DEVICES=$DEVICE_LIST fairseq-train \
26 |     $DATA_PATH $ADDITIONAL_ARGS -s en -t de \
27 |     --arch transformer_wmt_en_de \
28 |     --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
29 |     --lr-scheduler inverse_sqrt --max-update 100000 \
30 |     --warmup-init-lr 1e-07 --warmup-updates 4000 --lr 0.0015 \
31 |     --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
32 |     --weight-decay 0.0001 --dropout 0.4 \
33 |     --max-tokens $TOKENS --update-freq $FREQ \
34 |     --save-dir $OUTPUT_PATH --seed 1111 --restore-file x.pt \
35 |     --log-format simple --log-interval 30 --memory-efficient-fp16 \
36 |     --encoder-layers $LAYERS --decoder-layers $LAYERS \
37 |     --threshold-loss-scale 0.0625 --fp16-scale-window 256 --fp16
--------------------------------------------------------------------------------
/example/train_big_wmt_en-de_huge_batch.sh:
--------------------------------------------------------------------------------
1 | DATA_PATH=${1:-"./wmt14_en_de_joined_dict"}
2 | LAYERS=${2:-18}
3 | OUTPUT_PATH=${3:-"./admin_18L_asParameter"}
4 | ADDITIONAL_ARGS=${4:-"--encoder-as-parameter --decoder-as-parameter --share-all-embeddings --lr 0.0007"}
5 | 
6 | TOKENS=2048
7 | DEVICE_NUMBER=16
8 | FREQ=16
9 | 
10 | NUMBER_OF_GPUS=$(nvidia-smi --list-gpus | wc -l)
11 | if [[ $NUMBER_OF_GPUS != $DEVICE_NUMBER ]]
12 | then
13 | echo "This script is written for $DEVICE_NUMBER GPUs, but only $NUMBER_OF_GPUS were found."
14 | echo "Please modify TOKENS, DEVICE_NUMBER, and FREQ in the script accordingly."
15 | echo
16 | echo "Note that you need to keep device_number * tokens * freq = 524288 (i.e., 128 x 4096)."
17 | exit
18 | fi
19 | 
20 | DEVICE_LIST=$(( DEVICE_NUMBER - 1 ))
21 | DEVICE_LIST=$(seq -s "," 0 $DEVICE_LIST)
22 | 
23 | echo "Using GPUs $DEVICE_LIST for training"
24 | 
25 | CUDA_VISIBLE_DEVICES=$DEVICE_LIST fairseq-train \
26 |     $DATA_PATH $ADDITIONAL_ARGS -s en -t de \
27 |     --arch transformer_vaswani_wmt_en_de_big \
28 |     --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
29 |     --lr-scheduler inverse_sqrt --max-update 100000 \
30 |     --warmup-init-lr 1e-07 --warmup-updates 4000 \
31 |     --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
32 |     --weight-decay 0.0001 --dropout 0.4 \
33 |     --max-tokens $TOKENS --update-freq $FREQ \
34 |     --save-dir $OUTPUT_PATH --seed 1111 --restore-file x.pt \
35 |     --log-format simple --log-interval 30 --memory-efficient-fp16 \
36 |     --encoder-layers $LAYERS --decoder-layers $LAYERS \
37 |     --threshold-loss-scale 0.0625 --fp16-scale-window 256 --fp16
--------------------------------------------------------------------------------
/example/train_wmt_en-de.sh:
--------------------------------------------------------------------------------
1 | DATA_PATH=${1:-"./wmt14_en_de_joined_dict"}
2 | LAYERS=${2:-18}
3 | OUTPUT_PATH=${3:-"./admin_18L_asParameter"}
4 | ADDITIONAL_ARGS=${4:-"--share-all-embeddings --encoder-as-parameter --decoder-as-parameter"}
5 | 
6 | TOKENS=4096
7 | DEVICE_NUMBER=8
8 | FREQ=1
9 | 
10 | NUMBER_OF_GPUS=$(nvidia-smi --list-gpus | wc -l)
11 | if [[ $NUMBER_OF_GPUS != $DEVICE_NUMBER ]]
12 | then
13 | echo "This script is written for $DEVICE_NUMBER GPUs, but only $NUMBER_OF_GPUS were found."
14 | echo "Please modify TOKENS, DEVICE_NUMBER, and FREQ in the script accordingly."
15 | echo 16 | echo "Note that you need to keep device_number * tokens * freq = 32768" 17 | exit 18 | fi 19 | 20 | DEVICE_LIST=$(( DEVICE_NUMBER - 1 )) 21 | DEVICE_LIST=$(seq -s "," 0 $DEVICE_LIST) 22 | 23 | echo "Using GPUs $DEVICE_LIST for training" 24 | 25 | CUDA_VISIBLE_DEVICES=$DEVICE_LIST fairseq-train \ 26 | $DATA_PATH $ADDITIONAL_ARGS -s en -t de \ 27 | --arch transformer_wmt_en_de \ 28 | --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ 29 | --lr-scheduler inverse_sqrt --max-update 1000000 \ 30 | --warmup-init-lr 1e-07 --warmup-updates 8000 --lr 0.001 \ 31 | --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ 32 | --weight-decay 0.0 --attention-dropout 0.1 --relu-dropout 0.1 \ 33 | --max-tokens $TOKENS --update-freq $FREQ \ 34 | --save-dir $OUTPUT_PATH --seed 1111 --restore-file x.pt \ 35 | --log-format simple --log-interval 30 --memory-efficient-fp16 \ 36 | --encoder-layers $LAYERS --decoder-layers $LAYERS \ 37 | --threshold-loss-scale 0.0625 --fp16-scale-window 256 --fp16 38 | 39 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ main ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ main ] 20 | schedule: 21 | - cron: '43 10 * * 6' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Learn more about CodeQL language support at https://git.io/codeql-language-support 38 | 39 | steps: 40 | - name: Checkout repository 41 | uses: actions/checkout@v3 42 | 43 | # Initializes the CodeQL tools for scanning. 44 | - name: Initialize CodeQL 45 | uses: github/codeql-action/init@v2 46 | with: 47 | languages: ${{ matrix.language }} 48 | # If you wish to specify custom queries, you can do so here or in a config file. 49 | # By default, queries listed here will override any specified in a config file. 50 | # Prefix the list here with "+" to use these queries and those in the config file. 51 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 52 | 53 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 54 | # If this step fails, then you should remove it and run the build manually (see below) 55 | - name: Autobuild 56 | uses: github/codeql-action/autobuild@v2 57 | 58 | # ℹ️ Command-line programs to run using the OS shell. 
59 | # 📚 https://git.io/JvXDl 60 | 61 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 62 | # and modify them (or add more) to build your code if your project 63 | # uses a compiled language 64 | 65 | #- run: | 66 | # make bootstrap 67 | # make release 68 | 69 | - name: Perform CodeQL Analysis 70 | uses: github/codeql-action/analyze@v2 71 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 
40 | 41 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/very-deep-transformers-for-neural-machine/machine-translation-on-wmt2014-english-french)](https://paperswithcode.com/sota/machine-translation-on-wmt2014-english-french?p=very-deep-transformers-for-neural-machine) 2 | ![PyTorch](https://img.shields.io/badge/PyTorch-%23EE4C2C.svg?style=flat&logo=PyTorch&logoColor=white) 3 | ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/admin-torch) 4 | ![GitHub](https://img.shields.io/github/license/microsoft/admin-Torch) 5 | [![Maintenance](https://img.shields.io/badge/doc-yes-success.svg)](https://microsoft.github.io/admin-torch/) 6 | ![PyPI](https://img.shields.io/pypi/v/admin-torch) 7 | 8 |

<h1 align="center">Admin-Torch</h1>
9 | <h3 align="center">Transformers Training <b>Stabilized</b></h3>
10 | 
11 | <p align="center">
12 | <a href="#whats-new">What's New?</a> •
13 | <a href="#key-idea">Key Idea</a> •
14 | <a href="#how-to-use">How To Use</a> •
15 | <a href="https://microsoft.github.io/admin-torch/">Docs</a> •
16 | <a href="https://github.com/microsoft/admin-torch/tree/main/example">Examples</a> •
17 | <a href="#citation">Citation</a> •
18 | <a href="https://github.com/microsoft/admin-torch/blob/main/LICENSE">License</a>
19 | </p>
20 | 
21 | Here, we provide a plug-in-and-play implementation of [Admin](https://arxiv.org/abs/2004.08249),
22 | which stabilizes previously-diverged Transformer training and achieves better performance,
23 | **without introducing additional hyper-parameters**. The design of Admin is half-precision
24 | friendly and can be **reparameterized into the original Transformer**.
25 | 
26 | ______________________________________________________________________
27 | ## What's New?
28 | 
29 | Beyond the [original Admin implementation](https://github.com/LiyuanLucasLiu/Transformer-Clinic):
30 | 1. `admin-torch` removes the profiling stage and is **plug-in-and-play**.
31 | 2. `admin-torch`'s implementation is **more robust** (see below).
32 | 
33 | Comparison with the [DeepNet Init](https://arxiv.org/abs/2203.00555) and the [Original Admin Init](https://github.com/LiyuanLucasLiu/Transformer-Clinic)
34 | (on WMT'17 En-De):
35 | 
36 | | | Regular batch size (8x4096) | Huge batch size (128x4096) |
37 | |---------------|--------------------|------------------|
38 | | [Original Admin](https://github.com/LiyuanLucasLiu/Transformer-Clinic)| ✅ | ❌ |
39 | | [DeepNet](https://arxiv.org/abs/2203.00555) | ❌ | ✅ |
40 | | `admin-torch` | ✅ | ✅ |
41 | 
42 | More details can be found in [our example](https://github.com/microsoft/admin-torch/tree/main/example).
43 | 
44 | ## Key Idea
45 | <h5 align="center">What complicates Transformer training?</h5>
46 | 
47 | For a Transformer f, an input x, and a randomly initialized weight w, we describe its stability (``output_change_scale``) as
48 | 
49 | <p align="center">E[ |f(x, w) - f(x, w + δ)|<sub>2</sub><sup>2</sup> ], where δ is a random perturbation</p>
50 | 
51 | 
52 | 
53 | In [our study](https://arxiv.org/abs/2004.08249), we show that an original n-layer Transformer's
54 | ``output_change_scale`` is ``O(n)``, which destabilizes its training. Admin stabilizes Transformer
55 | training by regulating this scale to ``O(logn)`` or ``O(1)``.
56 | 
57 | <p align="center"><img src="https://raw.githubusercontent.com/microsoft/admin-torch/main/doc/source/_static/output_change.png" width="100%"/></p>
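To make the regulation concrete, the snippet below reproduces the omega-initialization rule from `admin_torch/admin.py` (the depths used here are arbitrary examples):

```python
import math

# omega per output-change scale (see calculate_init in admin_torch/admin.py):
#   O(logn): omega^2 = (n + 1) / log(n + 1) - 1
#   O(1):    omega^2 = n
#   O(n):    omega   = 1 (the ordinary residual connection)
for n in (12, 36, 200):  # total residual layers, i.e., 2x the number of encoder layers
    omega_logn = ((n + 1) / math.log(n + 1) - 1) ** 0.5
    omega_o1 = n ** 0.5
    print(f"n={n:3d}  omega[O(logn)]={omega_logn:.2f}  omega[O(1)]={omega_o1:.2f}")
```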
58 | 
59 | More details can be found in our [paper](https://arxiv.org/abs/2004.08249).
60 | 
61 | 
62 | ## How to use?
63 | 
64 | ### install
65 | ```
66 | pip install admin-torch==0.1.0
67 | ```
68 | 
69 | ### import
70 | ```
71 | import admin_torch
72 | ```
73 | 
74 | ### enjoy
75 | 
76 | ```diff
77 |  def __init__(self, ...):
78 |      ...
79 | +    self.residual = admin_torch.as_module(self.number_of_sub_layers)
80 |      ...
81 | 
82 |  def forward(self, ...):
83 |      ...
84 | -    x = x + self.f(x)
85 | +    x = self.residual(x, self.f(x))
86 |      x = self.LN(x)
87 |      ...
88 | ```
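For a fully self-contained version of the pattern above, here is a minimal runnable sketch adapted from the `admin_torch.as_module` docstring (the dimensions and module names below are our placeholders, not part of the API):

```python
import torch
import torch.nn as nn
import admin_torch

class EncoderLayer(nn.Module):
    def __init__(self, embed_dim=512, ffn_dim=2048, num_heads=8, encoder_layers=6):
        super().__init__()
        num_res_layers = 2 * encoder_layers  # two residual connections per encoder layer
        self.attn = nn.MultiheadAttention(embed_dim, num_heads)
        self.residual_attn = admin_torch.as_module(num_res_layers)
        self.ln_attn = nn.LayerNorm(embed_dim)
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, ffn_dim), nn.ReLU(), nn.Linear(ffn_dim, embed_dim),
        )
        self.residual_ffn = admin_torch.as_module(num_res_layers)
        self.ln_ffn = nn.LayerNorm(embed_dim)

    def forward(self, x):
        f_x, _ = self.attn(x, x, x)  # self-attention sub-layer
        x = self.ln_attn(self.residual_attn(x, f_x))
        f_x = self.ffn(x)            # feed-forward sub-layer
        x = self.ln_ffn(self.residual_ffn(x, f_x))
        return x

x = torch.randn(10, 2, 512)  # (seq_len, batch, embed_dim)
print(EncoderLayer()(x).shape)  # torch.Size([10, 2, 512])
```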
89 | 
90 | An elaborated example can be found in [our doc](https://microsoft.github.io/admin-torch/), and a real working example can be found at [LiyuanLucasLiu/fairseq](https://github.com/LiyuanLucasLiu/fairseq/commit/33ad76ae5dc927bc32b9594f9728a367c45680bb) (the training recipe is available in [our example](https://github.com/microsoft/admin-torch/tree/main/example)).
91 | 
92 | ## Citation
93 | Please cite the following papers if you find our model useful. Thanks!
94 | 
95 | >Liyuan Liu, Xiaodong Liu, Jianfeng Gao, Weizhu Chen, and Jiawei Han (2020). Understanding the Difficulty of Training Transformers. Proc. 2020 Conf. on Empirical Methods in Natural Language Processing (EMNLP'20).
96 | ```
97 | @inproceedings{liu2020admin,
98 |   title={Understanding the Difficulty of Training Transformers},
99 |   author = {Liu, Liyuan and Liu, Xiaodong and Gao, Jianfeng and Chen, Weizhu and Han, Jiawei},
100 |   booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP 2020)},
101 |   year={2020}
102 | }
103 | ```
104 | > Xiaodong Liu, Kevin Duh, Liyuan Liu, and Jianfeng Gao (2020). Very Deep Transformers for Neural Machine Translation. arXiv preprint arXiv:2008.07772 (2020).
105 | ```
106 | @inproceedings{liu_deep_2020,
107 |   author = {Liu, Xiaodong and Duh, Kevin and Liu, Liyuan and Gao, Jianfeng},
108 |   booktitle = {arXiv:2008.07772 [cs]},
109 |   title = {Very Deep Transformers for Neural Machine Translation},
110 |   year = {2020}
111 | }
112 | ```
113 | 
--------------------------------------------------------------------------------
/example/average_checkpoints.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Copyright (c) 2017-present, Facebook, Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the license found in the LICENSE file in
6 | # the root directory of this source tree. An additional grant of patent rights
7 | # can be found in the PATENTS file in the same directory.
8 | 
9 | import argparse
10 | import collections
11 | import torch
12 | import os
13 | import re
14 | 
15 | 
16 | def average_checkpoints(inputs):
17 |     """Loads checkpoints from inputs and returns a model with averaged weights.
18 | 
19 |     Args:
20 |       inputs: An iterable of string paths of checkpoints to load from.
21 | 
22 |     Returns:
23 |       A dict of string keys mapping to various values. The 'model' key
24 |       from the returned dict should correspond to an OrderedDict mapping
25 |       string parameter names to torch Tensors.
26 |     """
27 |     params_dict = collections.OrderedDict()
28 |     params_keys = None
29 |     new_state = None
30 |     num_models = len(inputs)
31 | 
32 |     for f in inputs:
33 |         state = torch.load(
34 |             f,
35 |             map_location=(
36 |                 lambda s, _: torch.serialization.default_restore_location(s, 'cpu')
37 |             ),
38 |         )
39 |         # Copies over the settings from the first checkpoint
40 |         if new_state is None:
41 |             new_state = state
42 | 
43 |         model_params = state['model']
44 | 
45 |         model_params_keys = list(model_params.keys())
46 |         if params_keys is None:
47 |             params_keys = model_params_keys
48 |         elif params_keys != model_params_keys:
49 |             raise KeyError(
50 |                 'For checkpoint {}, expected list of params: {}, '
51 |                 'but found: {}'.format(f, params_keys, model_params_keys)
52 |             )
53 | 
54 |         for k in params_keys:
55 |             p = model_params[k]
56 |             if isinstance(p, torch.HalfTensor):
57 |                 p = p.float()
58 |             if k not in params_dict:
59 |                 params_dict[k] = p.clone()
60 |                 # NOTE: clone() is needed in case p is a shared parameter
61 |             else:
62 |                 params_dict[k] += p
63 | 
64 |     averaged_params = collections.OrderedDict()
65 |     for k, v in params_dict.items():
66 |         averaged_params[k] = v
67 |         averaged_params[k].div_(num_models)
68 |     new_state['model'] = averaged_params
69 |     return new_state
70 | 
71 | 
72 | def last_n_checkpoints(paths, n, update_based, upper_bound=None):
73 |     assert len(paths) == 1
74 |     path = paths[0]
75 |     if update_based:
76 |         pt_regexp = re.compile(r'checkpoint_\d+_(\d+)\.pt')
77 |     else:
78 |         pt_regexp = re.compile(r'checkpoint(\d+)\.pt')
79 |     files = os.listdir(path)
80 | 
81 |     entries = []
82 |     for f in files:
83 |         m = pt_regexp.fullmatch(f)
84 |         if m is not None:
85 |             sort_key = int(m.group(1))
86 |             if upper_bound is None or sort_key <= upper_bound:
87 |                 entries.append((sort_key, m.group(0)))
88 |     if len(entries) < n:
89 |         raise Exception('Found {} checkpoint files but need at least {}'.format(len(entries), n))
90 |     return [os.path.join(path, x[1]) for x in sorted(entries, reverse=True)[:n]]
91 | 
92 | 
93 | def main():
94 |     parser = argparse.ArgumentParser(
95 |         description='Tool to average the params of input checkpoints to '
96 |                     'produce a new checkpoint',
97 |     )
98 |     # fmt: off
99 |     parser.add_argument('--inputs', required=True, nargs='+',
100 |                         help='Input checkpoint file paths.')
101 |     parser.add_argument('--output', required=True, metavar='FILE',
102 |                         help='Write the new checkpoint containing the averaged weights to this path.')
103 |     num_group = parser.add_mutually_exclusive_group()
104 |     num_group.add_argument('--num-epoch-checkpoints', type=int,
105 |                            help='if set, will try to find checkpoints with names checkpoint_xx.pt in the path specified by input, '
106 |                                 'and average last this many of them.')
107 |     num_group.add_argument('--num-update-checkpoints', type=int,
108 |                            help='if set, will try to find checkpoints with names checkpoint_ee_xx.pt in the path specified by input, '
109 |                                 'and average last this many of them.')
110 |     parser.add_argument('--checkpoint-upper-bound', type=int,
111 |                         help='when using --num-epoch-checkpoints, this will set an upper bound on which checkpoint to use, '
112 |                              'e.g., with --num-epoch-checkpoints=10 --checkpoint-upper-bound=50, checkpoints 41-50 would be averaged.')
113 |     # fmt: on
114 |     args = parser.parse_args()
115 |     print(args)
116 | 
117 |     num = None
118 |     is_update_based = False
119 |     if args.num_update_checkpoints is not None:
120 |         num = args.num_update_checkpoints
121 |         is_update_based = True
122 |     elif args.num_epoch_checkpoints is not None:
123 |         num = 
args.num_epoch_checkpoints 124 | 125 | assert args.checkpoint_upper_bound is None or args.num_epoch_checkpoints is not None, \ 126 | '--checkpoint-upper-bound requires --num-epoch-checkpoints' 127 | assert args.num_epoch_checkpoints is None or args.num_update_checkpoints is None, \ 128 | 'Cannot combine --num-epoch-checkpoints and --num-update-checkpoints' 129 | 130 | if num is not None: 131 | args.inputs = last_n_checkpoints( 132 | args.inputs, num, is_update_based, upper_bound=args.checkpoint_upper_bound, 133 | ) 134 | print('averaging checkpoints: ', args.inputs) 135 | 136 | new_state = average_checkpoints(args.inputs) 137 | torch.save(new_state, args.output) 138 | print('Finished writing averaged checkpoint to {}.'.format(args.output)) 139 | 140 | 141 | if __name__ == '__main__': 142 | main() 143 | -------------------------------------------------------------------------------- /doc/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Wrapper documentation build configuration file, created by 5 | # sphinx-quickstart on Thu Sep 14 03:49:01 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | 20 | import os 21 | import sys 22 | 23 | sys.path.insert(0, os.path.abspath('../..')) 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [ 35 | 'sphinx.ext.autodoc', 36 | 'sphinx.ext.autosummary', 37 | 'sphinx.ext.doctest', 38 | 'sphinx.ext.intersphinx', 39 | 'sphinx.ext.todo', 40 | 'sphinx.ext.coverage', 41 | 'sphinx.ext.mathjax', 42 | 'sphinx.ext.napoleon', 43 | 'sphinx.ext.viewcode', 44 | 'sphinx.ext.githubpages', 45 | ] 46 | 47 | napoleon_use_ivar = True 48 | 49 | # Add any paths that contain templates here, relative to this directory. 50 | templates_path = ['_templates'] 51 | 52 | # The suffix(es) of source filenames. 53 | # You can specify multiple suffix as a list of string: 54 | # 55 | # source_suffix = ['.rst', '.md'] 56 | source_suffix = '.rst' 57 | 58 | # The master toctree document. 59 | master_doc = 'contents' 60 | 61 | # General information about the project. 62 | project = 'Admin-Torch' 63 | copyright = '2022, Liyuan Liu' 64 | author = 'Liyuan Liu' 65 | 66 | # The version info for the project you're documenting, acts as replacement for 67 | # |version| and |release|, also used in various other places throughout the 68 | # built documents. 69 | # 70 | # The short X.Y version. 71 | version = '' 72 | # The full version, including alpha/beta/rc tags. 73 | release = '' 74 | 75 | # The language for content autogenerated by Sphinx. Refer to documentation 76 | # for a list of supported languages. 
77 | # 78 | # This is also used if you do content translation via gettext catalogs. 79 | # Usually you set "language" from the command line for these cases. 80 | language = None 81 | 82 | # List of patterns, relative to source directory, that match files and 83 | # directories to ignore when looking for source files. 84 | # This patterns also effect to html_static_path and html_extra_path 85 | exclude_patterns = [] 86 | 87 | # The name of the Pygments (syntax highlighting) style to use. 88 | pygments_style = 'sphinx' 89 | 90 | # If true, `todo` and `todoList` produce output, else they produce nothing. 91 | todo_include_todos = False 92 | 93 | # -- Options for HTML output ---------------------------------------------- 94 | 95 | # The theme to use for HTML and HTML Help pages. See the documentation for 96 | # a list of builtin themes. 97 | # 98 | 99 | html_theme = 'sphinx_rtd_theme' 100 | # html_theme = 'sphinx_documatt_theme' 101 | 102 | # Theme options are theme-specific and customize the look and feel of a theme 103 | # further. For a list of options available for each theme, see the 104 | # documentation. 105 | # 106 | # html_theme_options = {} 107 | html_theme_options = { 108 | 'globaltoc_maxdepth': 5, 109 | } 110 | 111 | # Add any paths that contain custom static files (such as style sheets) here, 112 | # relative to this directory. They are copied after the builtin static files, 113 | # so a file named "default.css" will overwrite the builtin "default.css". 114 | html_static_path = ['_static'] 115 | 116 | # Custom sidebar templates, must be a dictionary that maps document names 117 | # to template names. 118 | # 119 | # This is required for the alabaster theme 120 | # # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars 121 | html_sidebars = { 122 | '**': ['globaltoc.html', 'relations.html', 'sourcelink.html', 'searchbox.html'] 123 | } 124 | 125 | # -- Options for HTMLHelp output ------------------------------------------ 126 | 127 | # Output file base name for HTML help builder. 128 | htmlhelp_basename = 'Admin-Torch' 129 | 130 | # -- Options for LaTeX output --------------------------------------------- 131 | 132 | latex_elements = { 133 | # The paper size ('letterpaper' or 'a4paper'). 134 | # 135 | # 'papersize': 'letterpaper', 136 | 137 | # The font size ('10pt', '11pt' or '12pt'). 138 | # 139 | # 'pointsize': '10pt', 140 | 141 | # Additional stuff for the LaTeX preamble. 142 | # 143 | # 'preamble': '', 144 | 145 | # Latex figure (float) alignment 146 | # 147 | # 'figure_align': 'htbp', 148 | } 149 | 150 | # Grouping the document tree into LaTeX files. List of tuples 151 | # (source start file, target name, title, 152 | # author, documentclass [howto, manual, or own class]). 153 | latex_documents = [ 154 | (master_doc, 'admin_torch.tex', 'Admin-Torch Documentation', 155 | 'Admin-Torch', 'manual'), 156 | ] 157 | 158 | # -- Options for manual page output --------------------------------------- 159 | 160 | # One entry per manual page. List of tuples 161 | # (source start file, name, description, authors, manual section). 162 | man_pages = [ 163 | (master_doc, 'Admin-Torch', 'Admin-Torch Documentation', 164 | [author], 1) 165 | ] 166 | 167 | # -- Options for Texinfo output ------------------------------------------- 168 | 169 | # Grouping the document tree into Texinfo files. 
List of tuples
170 | # (source start file, target name, title, author,
171 | #  dir menu entry, description, category)
172 | texinfo_documents = [
173 |     (master_doc, 'Admin-Torch', 'Admin-Torch Documentation',
174 |      author, 'Admin-Torch', 'Adaptive Model Initialization.',
175 |      'Miscellaneous'),
176 | ]
177 | 
178 | autodoc_mock_imports = ['torch']
179 | 
180 | intersphinx_mapping = {
181 |     'python': ('https://docs.python.org/3', None),
182 |     'torch': ('http://pytorch.org/docs/master', None)
183 | }
184 | 
185 | autodoc_member_order = 'bysource'
--------------------------------------------------------------------------------
/example/README.md:
--------------------------------------------------------------------------------
1 | # Table of Contents
2 | 
3 | - [Real example: `admin-torch` on WMT'14 En-De](#real-example-admin-torch-on-wmt14-en-de)
4 | - [Comparison with original Admin and DeepNet](#comparison-with-original-admin-and-deepnet-on-wmt17-en-de)
5 | 
6 | # Real example: `admin-torch` on WMT'14 En-De
7 | 
8 | As an example, we apply `admin_torch` to `fairseq` and train a Transformer on WMT'14 En-De.
9 | 
10 | > Note: the efforts to incorporate `admin-torch` into fairseq are summarized in [this commit](https://github.com/LiyuanLucasLiu/fairseq/commit/33ad76ae5dc927bc32b9594f9728a367c45680bb).
11 | 
12 | ## 1. Pre-processing
13 | 
14 | ### 1.1. Data Preparation
15 | 
16 | Please refer to [the Transformer-Clinic repo](https://github.com/LiyuanLucasLiu/Transformer-Clinic/blob/master/pre-process/wmt14en-de.sh) for data preparation.
17 | 
18 | ### 1.2. Package Install
19 | 
20 | ```
21 | pip install admin_torch==0.1.0
22 | pip uninstall fairseq
23 | pip install https://github.com/LiyuanLucasLiu/fairseq/archive/refs/tags/admin-torch.zip
24 | ```
25 | 
26 | ## 2. Training and Evaluation
27 | 
28 | ### 2.1. Training
29 | ```
30 | bash train_wmt_en-de.sh $PATH-to-WMT14 $NUMBER_LAYER $OUTPUT_PATH
31 | ```
32 | Note that `$PATH-to-WMT14` is the path to the `wmt14_en_de_joined_dict` data
33 | folder from data preparation, `$NUMBER_LAYER` is the encoder/decoder layer number, and
34 | `$OUTPUT_PATH` is the path where you want to save your checkpoints.
35 | 
36 | ### 2.2. Evaluation
37 | ```
38 | bash eval_wmt_en-de.sh $PATH-to-WMT14 None $OUTPUT_PATH
39 | ```
40 | Note that `$PATH-to-WMT14` is the path to the `wmt14_en_de_joined_dict` data folder
41 | from data preparation, and `$OUTPUT_PATH` is the path used in the training step.
42 | 
43 | ## 3. Pre-trained Weights
44 | 
45 | | Layer Number | BLEU | PATH |
46 | |--------------|-------|------|
47 | | 6L-6L | 27.84 | TBD |
48 | | 18L-18L | 28.91 | TBD |
49 | | 100L-100L* | 29.65 | TBD |
50 | 
51 | *: trained with the [huge-batch-size setting](#comparison-with-original-admin-and-deepnet-on-wmt17-en-de),
52 | but only for 40 epochs, due to the huge cost of the training.
53 | 
54 | ## 4. Discussion on the `admin-torch` setting
55 | 
56 | `admin_torch.as_module` can be configured by changing `output_change_scale` and
57 | `as_parameter`. `output_change_scale` can be set to `O(1)` for additional stability, but this
58 | results in a performance drop in our experiments. `as_parameter` can be set to `False` to
59 | make `omega` (the shortcut-connection scaler) a constant (no updates).
Their performance is listed
60 | below:
61 | 
62 | | Layer Number | Output Change | Omega | BLEU |
63 | |-----------------|---------------|-----------------|-------|
64 | | 6L-6L | O(1) | as a constant | 27.71 |
65 | | 6L-6L | O(1) | as a parameter | 27.79 |
66 | | 6L-6L | O(logn) | as a constant | 27.83 |
67 | | 6L-6L | O(logn) | as a parameter | 27.84 |
68 | | 18L-18L | O(1) | as a constant | 28.66 |
69 | | 18L-18L | O(1) | as a parameter | 28.89 |
70 | | 18L-18L | O(logn) | as a constant | 28.78 |
71 | | 18L-18L | O(logn) | as a parameter | 28.91 |
72 | 
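For reference, the four settings in the table correspond to calls like the following (a sketch; the `num_res_layers` and `embed_dim` values are placeholders for an 18L-18L model):

```python
import admin_torch

num_res_layers, embed_dim = 2 * 18, 512  # an 18-layer stack has 2 * 18 residual connections

# O(logn) output change, omega as a (vector) parameter -- the best-BLEU row above
residual = admin_torch.as_module(num_res_layers, output_change_scale='O(logn)',
                                 as_parameter=True, embed_dim=embed_dim)

# O(1) output change, omega as a constant (registered as a buffer, no updates)
residual = admin_torch.as_module(num_res_layers, output_change_scale='O(1)')
```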
73 | # Comparison with original Admin and DeepNet on WMT'17 En-De
74 | 
75 | We choose to make comparisons with DeepNet and the original Admin implementation on WMT'17 En-De,
76 | the dataset used in the DeepNet paper.
77 | 
78 | We noticed that the training configuration in the DeepNet paper is different from the setting used
79 | in the original Admin repo. The major difference is the batch size (i.e., regular batch size vs.
80 | huge batch size). We refer to the setting used in the DeepNet paper as `Huge batch size (128x4096)`,
81 | and to the setting used in the original Admin repo as `Regular batch size (8x4096)`.
82 | 
83 | We find that each of them only works in its own setting.
84 | 
85 | | | Regular batch size (8x4096) | Huge batch size (128x4096) |
86 | |---------------|--------------------|------------------|
87 | | [Original Admin](https://github.com/LiyuanLucasLiu/Transformer-Clinic)| ✅ | ❌ |
88 | | [DeepNet](https://arxiv.org/abs/2203.00555) | ❌ | ✅ |
89 | | `admin-torch` | ✅ | ✅ |
90 | 
91 | Here, we re-implemented Admin as `admin-torch`, and the new `admin-torch`
92 | implementation works well in both settings.
93 | 
94 | All implementations are publicly released (as elaborated below).
95 | 
96 | 
97 | ## 1. Data Preparation
98 | Please refer to the DeepNet paper for data preparation. Here we used the same data shared by the
99 | DeepNet team.
100 | 
101 | ## 2. Original Admin and DeepNet
102 | 
103 | ### 2.1. Implementation Download and Code Install
104 | ```
105 | pip uninstall fairseq
106 | git clone https://github.com/LiyuanLucasLiu/Transformer-Clinic.git
107 | cd Transformer-Clinic/fairseq
108 | pip install --editable .
109 | ```
110 | 
111 | ### 2.2. Training
112 | 
113 | #### 2.2.1. Original Admin
114 | ```
115 | # Before running the training, the original Admin requires profiling
116 | # the network. The profiling result for 100L-100L is included in this repo
117 | # (i.e., example/profile.ratio.init). The command to generate this profile
118 | # can be found at https://github.com/LiyuanLucasLiu/Transformer-Clinic/blob/master/nmt-experiments/wmt14_en-de.md#100l-100l-admin-without-any-hyper-parameter-tuning
119 | 
120 | # regular batch size (4096 x 8)
121 | bash train_wmt_en-de.sh $PATH-to-WMT17 100 $OUTPUT_PATH_REG "--init-type adaptive"
122 | 
123 | # huge batch size (4096 x 128)
124 | bash train_wmt_en-de_huge_batch.sh $PATH-to-WMT17 100 $OUTPUT_PATH_HUG "--init-type adaptive"
125 | 
126 | # evaluate
127 | bash eval_wmt_en-de.sh $PATH-to-WMT17 None $OUTPUT_PATH_HUG/REG 45 10
128 | ```
129 | 
130 | #### 2.2.2. DeepNet
131 | ```
132 | # regular batch size (4096 x 8)
133 | bash train_wmt_en-de.sh $PATH-to-WMT17 100 $OUTPUT_PATH_REG "--init-type deepnet"
134 | 
135 | # huge batch size (4096 x 128)
136 | bash train_wmt_en-de_huge_batch.sh $PATH-to-WMT17 100 $OUTPUT_PATH_HUG "--init-type deepnet"
137 | 
138 | # evaluate
139 | bash eval_wmt_en-de.sh $PATH-to-WMT17 None $OUTPUT_PATH_HUG/REG 45 10
140 | ```
141 | 
142 | ## 3. `admin-torch`
143 | 
144 | ### 3.1. Package Install
145 | 
146 | ```
147 | pip install admin_torch==0.1.0
148 | pip uninstall fairseq
149 | pip install https://github.com/LiyuanLucasLiu/fairseq/archive/refs/tags/admin-torch.zip
150 | ```
151 | 
152 | ### 3.2. Training and Evaluation
153 | 
154 | ```
155 | # regular batch size (4096 x 8)
156 | bash train_wmt_en-de.sh $PATH-to-WMT17 100 $OUTPUT_PATH_REG
157 | 
158 | # huge batch size (4096 x 128)
159 | bash train_wmt_en-de_huge_batch.sh $PATH-to-WMT17 100 $OUTPUT_PATH_HUG
160 | 
161 | # evaluate
162 | bash eval_wmt_en-de.sh $PATH-to-WMT17 None $OUTPUT_PATH_HUG/REG 45 10
163 | ```
--------------------------------------------------------------------------------
/admin_torch/admin.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 | 
4 | import torch
5 | import math
6 | 
7 | class OmegaResidual(torch.nn.Module):
8 |     """
9 |     Residual connection module with shortcut connection rescaling.
10 | 
11 |     Parameters
12 |     ----------
13 |     init_value: ``float``, required.
14 |         The initialization value of the shortcut connection rescalar, omega.
15 |     as_parameter: ``bool``, optional (default = False).
16 |         Whether to set the rescalar as a trainable parameter. Note that, when set as a trainable
17 |         parameter, the rescalar would be set as a vector (similar to the weight vector in layer
18 |         norm), and the embed_dim input is required.
19 |     embed_dim: ``int``, optional (default = None).
20 |         The hidden state dimension of the shortcut connection. This field is required and only used
21 |         when ``as_parameter == True``.
22 |     """
23 | 
24 |     def __init__(self, init_value, as_parameter=False, embed_dim=None):
25 |         super().__init__()
26 |         if as_parameter:
27 |             assert embed_dim is not None, 'embed_dim is required when as_parameter is set as True'
28 |             self.omega = torch.nn.Parameter(torch.ones(embed_dim))
29 |             self.omega.data.fill_(init_value)
30 |             self.forward = self.forward_omega
31 |         else:
32 |             self.register_buffer('omega', torch.FloatTensor([init_value]))
33 |             if 1.0 == init_value:
34 |                 self.forward = self.forward_original
35 |             else:
36 |                 self.forward = self.forward_omega
37 | 
38 |     def forward(self, x, f_x):
39 |         """
40 |         Calculate x * omega + f_x. The output shape is the same as the input shape.
41 | 
42 |         When omega is set to be a constant 1 (``as buffer`` and ``O(n)`` output change), the
43 |         ``OmegaResidual`` would downgrade to the ordinary residual module, and x + f_x would be
44 |         calculated instead.
45 |         """
46 |         raise NotImplementedError("Placeholder forward function used in OmegaResidual")
47 | 
48 |     def forward_original(self, x, f_x):
49 |         return x + f_x
50 | 
51 |     def forward_omega(self, x, f_x):
52 |         return x * self.omega + f_x
53 | 
54 | def calculate_init(
55 |     num_res_layers,
56 |     output_change_scale='O(logn)',
57 | ) -> float:
58 |     r"""
59 |     Calculate initialization for omega.
60 | 
61 |     Parameters
62 |     ----------
63 |     num_res_layers: ``int``, required.
64 |         The total number of residual layers. A typical n-layer Transformer encoder has 2n residual layers.
65 |     output_change_scale: ``str``, optional (default = ``'O(logn)'``).
66 |         The desired output change scale at initialization. Only ``'O(n)'``, ``'O(logn)'`` / ``'default'``,
67 |         and ``'O(1)'`` are supported.
68 | 
69 |     Returns
70 |     -------
71 |     float: The initialization value for omega.
72 |     """
73 |     if 'O(logn)' == output_change_scale or 'default' == output_change_scale:
74 |         omega_value = (num_res_layers + 1) / math.log(num_res_layers + 1) - 1
75 |     elif 'O(n)' == output_change_scale:
76 |         omega_value = 1.
77 |     else:
78 |         assert 'O(1)' == output_change_scale, \
79 |             'only O(n), O(logn), and O(1) output changes are supported.'
80 |         omega_value = num_res_layers
81 |     return omega_value ** 0.5
82 | 
83 | def as_module(
84 |     num_res_layers,
85 |     output_change_scale='default',
86 |     as_parameter=False,
87 |     embed_dim=None
88 | ) -> OmegaResidual:
89 |     r"""
90 |     Calculate initialization for omega and return a residual module with the initialized omega.
91 | 
92 |     Parameters
93 |     ----------
94 |     num_res_layers: ``int``, required.
95 |         The total number of residual layers. A typical n-layer Transformer encoder has 2n residual layers.
96 |     output_change_scale: ``str``, optional (default = ``'default'``, i.e., ``'O(logn)'``).
97 |         The desired output change scale at initialization. Only ``'O(n)'``, ``'O(logn)'`` / ``'default'``,
98 |         and ``'O(1)'`` are supported.
99 |     as_parameter: ``bool``, optional (default = False).
100 |         Whether to set the rescalar as a trainable parameter. Note that, when set as a trainable
101 |         parameter, the rescalar would be set as a vector (similar to the weight vector in
102 |         layer norm), and the embed_dim input is required.
103 |     embed_dim: ``int``, optional (default = None).
104 |         The hidden state dimension of the shortcut connection. This field is required and only
105 |         used when as_parameter == True.
106 | 
107 |     Returns
108 |     -------
109 |     admin_torch.OmegaResidual: An ``OmegaResidual`` module with the properly initialized omega inside.
110 | 
111 |     Example
112 |     -------
113 | 
114 |     .. highlight:: python
115 |     .. code-block:: python
116 | 
117 |         import torch.nn as nn
118 |         import admin_torch
119 | 
120 |         class TransformerEncoderLayer(nn.Module):
121 | 
122 |             def __init__(self, cfg):
123 |                 super().__init__()
124 | 
125 |                 num_layer = 2 * cfg.encoder_layers # number of residual layers
126 | 
127 |                 self.attn = nn.MultiheadAttention(cfg.embed_dim, cfg.num_heads)
128 |                 self.residual_attn = admin_torch.as_module(num_layer)
129 |                 self.ln_attn = nn.LayerNorm(cfg.embed_dim)
130 | 
131 |                 self.ffn = nn.Sequential(
132 |                     nn.Linear(cfg.embed_dim, cfg.feedforward_dim),
133 |                     nn.ReLU(),
134 |                     nn.Linear(cfg.feedforward_dim, cfg.embed_dim)
135 |                 )
136 |                 self.residual_ffn = admin_torch.as_module(num_layer)
137 |                 self.ln_ffn = nn.LayerNorm(cfg.embed_dim)
138 | 
139 |             def forward(self, x):
140 | 
141 |                 f_x, _ = self.attn(x, x, x)
142 |                 x = self.residual_attn(x, f_x)
143 |                 x = self.ln_attn(x)
144 | 
145 |                 f_x = self.ffn(x)
146 |                 x = self.residual_ffn(x, f_x)
147 |                 x = self.ln_ffn(x)
148 | 
149 |                 return x
150 |     """
151 |     omega_value = calculate_init(num_res_layers, output_change_scale)
152 |     return OmegaResidual(omega_value, as_parameter=as_parameter, embed_dim=embed_dim)
153 | 
154 | def as_buffer(
155 |     network,
156 |     buffer_name,
157 |     num_res_layers,
158 |     output_change_scale='default',
159 | ) -> None:
160 |     r"""
161 |     Calculate initialization for omega and *register* omega as a buffer (not trainable).
162 | 
163 |     Parameters
164 |     ----------
165 |     network: ``torch.nn.Module``, required.
166 |         The ``torch.nn.Module`` that contains the residual network. This is where the omega would
167 |         be registered to.
168 |     buffer_name: ``str``, required.
169 |         The name of omega (as a buffer). The omega can be accessed in the network, using the
170 |         given name.
171 |     num_res_layers: ``int``, required.
172 |         The total number of residual layers. A typical n-layer Transformer encoder has 2n residual layers.
173 |     output_change_scale: ``str``, optional (default = ``'default'``, i.e., ``'O(logn)'``).
174 |         The desired output change scale at initialization. Only ``'O(n)'``, ``'O(logn)'`` / ``'default'``,
175 |         and ``'O(1)'`` are supported.
176 | 
177 |     Returns
178 |     -------
179 |     None: No returns. The initialized omega would be registered as a buffer within `network`.
180 | 
181 |     Example
182 |     -------
183 | 
184 |     .. highlight:: python
185 |     .. code-block:: python
186 | 
187 |         import torch.nn as nn
188 |         import admin_torch
189 | 
190 |         class TransformerEncoderLayer(nn.Module):
191 | 
192 |             def __init__(self, cfg):
193 |                 super().__init__()
194 | 
195 |                 num_layer = 2 * cfg.encoder_layers # number of residual layers
196 | 
197 |                 self.attn = nn.MultiheadAttention(cfg.embed_dim, cfg.num_heads)
198 |                 admin_torch.as_buffer(self, 'attn_omega', num_layer)
199 |                 self.ln_attn = nn.LayerNorm(cfg.embed_dim)
200 | 
201 |                 self.ffn = nn.Sequential(
202 |                     nn.Linear(cfg.embed_dim, cfg.feedforward_dim),
203 |                     nn.ReLU(),
204 |                     nn.Linear(cfg.feedforward_dim, cfg.embed_dim)
205 |                 )
206 |                 admin_torch.as_buffer(self, 'ffn_omega', num_layer)
207 |                 self.ln_ffn = nn.LayerNorm(cfg.embed_dim)
208 | 
209 |             def forward(self, x):
210 | 
211 |                 f_x, _ = self.attn(x, x, x)
212 |                 x = x * self.attn_omega + f_x
213 |                 x = self.ln_attn(x)
214 | 
215 |                 f_x = self.ffn(x)
216 |                 x = x * self.ffn_omega + f_x
217 |                 x = self.ln_ffn(x)
218 | 
219 |                 return x
220 |     """
221 |     assert isinstance(network, torch.nn.Module), \
222 |         'the input network has to be a torch.nn.Module object'
223 |     omega_value = calculate_init(num_res_layers, output_change_scale)
224 |     network.register_buffer(buffer_name, torch.FloatTensor([omega_value]))
225 | 
226 | def as_parameter(
227 |     network,
228 |     parameter_name,
229 |     num_res_layers,
230 |     embed_dim,
231 |     output_change_scale='default',
232 | ) -> None:
233 |     r"""
234 |     Calculate initialization for omega and *register* omega as a parameter (trainable).
235 | 
236 |     Parameters
237 |     ----------
238 |     network: ``torch.nn.Module``, required.
239 |         The ``torch.nn.Module`` that contains the residual network. This is where the omega would
240 |         be registered to.
241 |     parameter_name: ``str``, required.
242 |         The name of omega (as a parameter). The omega can be accessed in the network, using the
243 |         given name.
244 |     num_res_layers: ``int``, required.
245 |         The total number of residual layers. A typical n-layer Transformer encoder has 2n residual layers.
246 |     embed_dim: ``int``, required.
247 |         The hidden state dimension of the shortcut connection.
248 |     output_change_scale: ``str``, optional (default = ``'default'``, i.e., ``'O(logn)'``).
249 |         The desired output change scale at initialization. Only ``'O(n)'``, ``'O(logn)'`` / ``'default'``,
250 |         and ``'O(1)'`` are supported.
251 | 
252 |     Returns
253 |     -------
254 |     None: No returns. The initialized omega would be registered as a parameter within `network`.
255 | 
256 |     Example
257 |     -------
258 | 
259 |     .. highlight:: python
260 |     .. code-block:: python
261 | 
262 |         import torch.nn as nn
263 |         import admin_torch
264 | 
265 |         class TransformerEncoderLayer(nn.Module):
266 | 
267 |             def __init__(self, cfg):
268 |                 super().__init__()
269 | 
270 |                 num_layer = 2 * cfg.encoder_layers # number of residual layers
271 | 
272 |                 self.attn = nn.MultiheadAttention(cfg.embed_dim, cfg.num_heads)
273 |                 admin_torch.as_parameter(self, 'attn_omega', num_layer, cfg.embed_dim)
274 |                 self.ln_attn = nn.LayerNorm(cfg.embed_dim)
275 | 
276 |                 self.ffn = nn.Sequential(
277 |                     nn.Linear(cfg.embed_dim, cfg.feedforward_dim),
278 |                     nn.ReLU(),
279 |                     nn.Linear(cfg.feedforward_dim, cfg.embed_dim)
280 |                 )
281 |                 admin_torch.as_parameter(self, 'ffn_omega', num_layer, cfg.embed_dim)
282 |                 self.ln_ffn = nn.LayerNorm(cfg.embed_dim)
283 | 
284 |             def forward(self, x):
285 | 
286 |                 f_x, _ = self.attn(x, x, x)
287 |                 x = x * self.attn_omega + f_x
288 |                 x = self.ln_attn(x)
289 | 
290 |                 f_x = self.ffn(x)
291 |                 x = x * self.ffn_omega + f_x
292 |                 x = self.ln_ffn(x)
293 | 
294 |                 return x
295 |     """
296 |     omega_vector = torch.ones(embed_dim)
297 |     omega_vector.fill_(calculate_init(num_res_layers, output_change_scale))
298 |     network.register_parameter(parameter_name, torch.nn.Parameter(omega_vector))
--------------------------------------------------------------------------------
/example/profile.ratio.init:
--------------------------------------------------------------------------------
1 | 1 1.0
2 | 2 1.3190397024154663
3 | 3 1.4550316333770752
4 | 4 1.5123295783996582
5 | 5 1.6428167819976807
6 | 6 1.695853352546692
7 | 7 1.8099972009658813
8 | 8 1.8667889833450317
9 | 9 1.9713013172149658
10 | 10 2.028479814529419
11 | 11 2.1205849647521973
12 | 12 2.1734628677368164
13 | 13 2.2616868019104004
14 | 14 2.3189287185668945
15 | 15 2.404047966003418
16 | 16 2.456615924835205
17 | 17 2.5324201583862305
18 | 18 2.5882620811462402
19 | 19 2.669269561767578
20 | 20 2.7247939109802246
21 | 21 2.7918922901153564
22 | 22 2.8465681076049805
23 | 23 2.9145007133483887
24 | 24 2.961702585220337
25 | 25 3.027456045150757
26 | 26 3.080919027328491
27 | 27 3.1440911293029785
28 | 28 3.198875904083252
29 | 29 3.2594544887542725
30 | 30 3.306119203567505
31 | 31 3.3605387210845947
32 | 32 3.4074313640594482
33 | 33 3.4670989513397217
34 | 34 3.5158419609069824
35 | 35 3.573145627975464
36 | 36 3.621101140975952
37 | 37 3.6756269931793213
38 | 38 3.723588705062866
39 | 39 3.7751662731170654
40 | 40 3.820997714996338
41 | 41 3.874152183532715
42 | 42 3.919919729232788
43 | 43 3.971492052078247
44 | 44 4.015556335449219
45 | 45 4.062773704528809
46 | 46 4.102989673614502
47 | 47 4.147921562194824
48 | 48 4.188112735748291
49 | 49 4.236110210418701
50 | 50 4.280117988586426
51 | 51 4.3282270431518555
52 | 52 4.3700642585754395
53 | 53 4.413156986236572
54 | 54 4.457295894622803
55 | 55 4.502137184143066
56 | 56 4.543152332305908
57 | 57 4.5831618309021
58 | 58 4.6219892501831055
59 | 59 4.665818214416504
60 | 60 4.704627513885498
61 | 61 4.7454071044921875
62 | 62 4.782839298248291
63 | 63 4.824032783508301
64 | 64 4.865396499633789
65 | 65 4.905492782592773
66 | 66 4.944614410400391
67 | 67 4.986900329589844
68 | 68 5.027820110321045
69 | 69 5.068670272827148
70 | 70 5.105625152587891
71 | 71 5.1475629806518555
72 | 72 5.185201168060303
73 | 73 5.222909450531006
74 | 74 5.258800029754639
75 | 75 5.293256759643555
76 | 76 5.3247480392456055
77 | 77 5.362242221832275
78 | 78 5.398958206176758
79 | 79 5.436041831970215
80 | 80 5.472423076629639
81 | 81 5.50972318649292
82 | 82 5.542351722717285
83 | 83 
5.580074310302734 84 | 84 5.614049434661865 85 | 85 5.647655487060547 86 | 86 5.685954570770264 87 | 87 5.72280740737915 88 | 88 5.756107807159424 89 | 89 5.7939252853393555 90 | 90 5.8227972984313965 91 | 91 5.860756874084473 92 | 92 5.893098831176758 93 | 93 5.931689262390137 94 | 94 5.965618133544922 95 | 95 6.000150203704834 96 | 96 6.031996726989746 97 | 97 6.064528465270996 98 | 98 6.096097469329834 99 | 99 6.130829334259033 100 | 100 6.1620001792907715 101 | 101 6.192485809326172 102 | 102 6.223220348358154 103 | 103 6.255505084991455 104 | 104 6.285265922546387 105 | 105 6.317674160003662 106 | 106 6.347247123718262 107 | 107 6.380263805389404 108 | 108 6.4059295654296875 109 | 109 6.4350199699401855 110 | 110 6.463878631591797 111 | 111 6.495331287384033 112 | 112 6.530038356781006 113 | 113 6.560511112213135 114 | 114 6.592663288116455 115 | 115 6.6225266456604 116 | 116 6.649158477783203 117 | 117 6.678005695343018 118 | 118 6.706629753112793 119 | 119 6.735388278961182 120 | 120 6.765207767486572 121 | 121 6.796474933624268 122 | 122 6.829586505889893 123 | 123 6.8563995361328125 124 | 124 6.884836196899414 125 | 125 6.913089752197266 126 | 126 6.945504665374756 127 | 127 6.972911834716797 128 | 128 7.0005574226379395 129 | 129 7.029700756072998 130 | 130 7.057539463043213 131 | 131 7.087018966674805 132 | 132 7.115658283233643 133 | 133 7.145423889160156 134 | 134 7.176324844360352 135 | 135 7.203213214874268 136 | 136 7.2326130867004395 137 | 137 7.263206481933594 138 | 138 7.290313243865967 139 | 139 7.316440105438232 140 | 140 7.342780590057373 141 | 141 7.371157646179199 142 | 142 7.401747226715088 143 | 143 7.428928375244141 144 | 144 7.455725193023682 145 | 145 7.482446670532227 146 | 146 7.509575843811035 147 | 147 7.534206390380859 148 | 148 7.561554431915283 149 | 149 7.5872907638549805 150 | 150 7.610199928283691 151 | 151 7.634483814239502 152 | 152 7.664854526519775 153 | 153 7.689417362213135 154 | 154 7.711843490600586 155 | 155 7.73599910736084 156 | 156 7.759965896606445 157 | 157 7.788935661315918 158 | 158 7.814115524291992 159 | 159 7.835615158081055 160 | 160 7.8606743812561035 161 | 161 7.882746696472168 162 | 162 7.907750606536865 163 | 163 7.9333109855651855 164 | 164 7.958787441253662 165 | 165 7.981550693511963 166 | 166 8.011580467224121 167 | 167 8.03337287902832 168 | 168 8.056806564331055 169 | 169 8.079192161560059 170 | 170 8.103809356689453 171 | 171 8.125045776367188 172 | 172 8.15402889251709 173 | 173 8.181538581848145 174 | 174 8.207011222839355 175 | 175 8.227130889892578 176 | 176 8.252174377441406 177 | 177 8.272038459777832 178 | 178 8.298123359680176 179 | 179 8.317887306213379 180 | 180 8.347379684448242 181 | 181 8.365970611572266 182 | 182 8.39012336730957 183 | 183 8.413267135620117 184 | 184 8.441601753234863 185 | 185 8.469433784484863 186 | 186 8.492582321166992 187 | 187 8.515233993530273 188 | 188 8.537558555603027 189 | 189 8.561620712280273 190 | 190 8.58764362335205 191 | 191 8.611865043640137 192 | 192 8.638022422790527 193 | 193 8.662741661071777 194 | 194 8.689210891723633 195 | 195 8.71304702758789 196 | 196 8.739282608032227 197 | 197 8.76534366607666 198 | 198 8.791733741760254 199 | 199 8.812337875366211 200 | 200 8.836019515991211 201 | 1 1.0 202 | 2 1.3425869941711426 203 | 3 1.4800665378570557 204 | 4 1.610133409500122 205 | 5 1.6861705780029297 206 | 6 1.8158448934555054 207 | 7 1.9169756174087524 208 | 8 1.9898321628570557 209 | 9 2.103006362915039 210 | 10 2.198607921600342 211 | 11 2.2673819065093994 212 | 12 
2.379521131515503 213 | 13 2.453937292098999 214 | 14 2.521658420562744 215 | 15 2.6081600189208984 216 | 16 2.6755027770996094 217 | 17 2.7483041286468506 218 | 18 2.8297157287597656 219 | 19 2.905391216278076 220 | 20 2.9680662155151367 221 | 21 3.0330522060394287 222 | 22 3.1018290519714355 223 | 23 3.178995132446289 224 | 24 3.243086099624634 225 | 25 3.304431438446045 226 | 26 3.3670051097869873 227 | 27 3.4263482093811035 228 | 28 3.48586368560791 229 | 29 3.5485644340515137 230 | 30 3.60994553565979 231 | 31 3.658292293548584 232 | 32 3.7145533561706543 233 | 33 3.764017343521118 234 | 34 3.820631742477417 235 | 35 3.8807389736175537 236 | 36 3.927197217941284 237 | 37 3.9778075218200684 238 | 38 4.032394886016846 239 | 39 4.079466342926025 240 | 40 4.121256351470947 241 | 41 4.179348945617676 242 | 42 4.235202789306641 243 | 43 4.276139736175537 244 | 44 4.322049617767334 245 | 45 4.375305652618408 246 | 46 4.421659469604492 247 | 47 4.465145111083984 248 | 48 4.511823654174805 249 | 49 4.558897972106934 250 | 50 4.612059116363525 251 | 51 4.6558332443237305 252 | 52 4.701802730560303 253 | 53 4.749824523925781 254 | 54 4.791557788848877 255 | 55 4.836613655090332 256 | 56 4.882650375366211 257 | 57 4.921323776245117 258 | 58 4.962111949920654 259 | 59 4.998560905456543 260 | 60 5.037952899932861 261 | 61 5.07641077041626 262 | 62 5.117368698120117 263 | 63 5.156766414642334 264 | 64 5.189724445343018 265 | 65 5.232119083404541 266 | 66 5.274285316467285 267 | 67 5.309042453765869 268 | 68 5.348519325256348 269 | 69 5.385103225708008 270 | 70 5.4217658042907715 271 | 71 5.458439350128174 272 | 72 5.49429988861084 273 | 73 5.532592296600342 274 | 74 5.5714898109436035 275 | 75 5.615612030029297 276 | 76 5.6548895835876465 277 | 77 5.69318962097168 278 | 78 5.726015090942383 279 | 79 5.7607831954956055 280 | 80 5.803152084350586 281 | 81 5.840878963470459 282 | 82 5.873714447021484 283 | 83 5.916788578033447 284 | 84 5.954561233520508 285 | 85 5.986266613006592 286 | 86 6.01418924331665 287 | 87 6.048747539520264 288 | 88 6.081315517425537 289 | 89 6.119509220123291 290 | 90 6.158118724822998 291 | 91 6.1867289543151855 292 | 92 6.221379280090332 293 | 93 6.256557464599609 294 | 94 6.288797855377197 295 | 95 6.326879024505615 296 | 96 6.361721038818359 297 | 97 6.393381118774414 298 | 98 6.428704261779785 299 | 99 6.463216304779053 300 | 100 6.492666244506836 301 | 101 6.531671524047852 302 | 102 6.56462287902832 303 | 103 6.595141410827637 304 | 104 6.63072395324707 305 | 105 6.664488792419434 306 | 106 6.691591262817383 307 | 107 6.724743843078613 308 | 108 6.752674102783203 309 | 109 6.781800270080566 310 | 110 6.8150529861450195 311 | 111 6.846014022827148 312 | 112 6.875141620635986 313 | 113 6.913217544555664 314 | 114 6.9485979080200195 315 | 115 6.978128433227539 316 | 116 7.006283283233643 317 | 117 7.036981105804443 318 | 118 7.06494665145874 319 | 119 7.097784519195557 320 | 120 7.124679088592529 321 | 121 7.151909828186035 322 | 122 7.189268112182617 323 | 123 7.214508056640625 324 | 124 7.244059085845947 325 | 125 7.272270202636719 326 | 126 7.302449703216553 327 | 127 7.325904846191406 328 | 128 7.352625370025635 329 | 129 7.382406234741211 330 | 130 7.409889221191406 331 | 131 7.441695213317871 332 | 132 7.465174198150635 333 | 133 7.490511894226074 334 | 134 7.52184534072876 335 | 135 7.552266597747803 336 | 136 7.576979160308838 337 | 137 7.60615873336792 338 | 138 7.631077766418457 339 | 139 7.659272193908691 340 | 140 7.689055442810059 341 | 141 7.714266777038574 
342 | 142 7.740371227264404 343 | 143 7.770144462585449 344 | 144 7.797163963317871 345 | 145 7.825152397155762 346 | 146 7.850924015045166 347 | 147 7.8776116371154785 348 | 148 7.905229568481445 349 | 149 7.938175678253174 350 | 150 7.963441371917725 351 | 151 7.989377021789551 352 | 152 8.015571594238281 353 | 153 8.03981876373291 354 | 154 8.061357498168945 355 | 155 8.089569091796875 356 | 156 8.110694885253906 357 | 157 8.13357162475586 358 | 158 8.16172981262207 359 | 159 8.186487197875977 360 | 160 8.212444305419922 361 | 161 8.237144470214844 362 | 162 8.259098052978516 363 | 163 8.284126281738281 364 | 164 8.307903289794922 365 | 165 8.32878303527832 366 | 166 8.352269172668457 367 | 167 8.380142211914062 368 | 168 8.406278610229492 369 | 169 8.428736686706543 370 | 170 8.453876495361328 371 | 171 8.476970672607422 372 | 172 8.50069808959961 373 | 173 8.524378776550293 374 | 174 8.547581672668457 375 | 175 8.568750381469727 376 | 176 8.596118927001953 377 | 177 8.616921424865723 378 | 178 8.64217472076416 379 | 179 8.666587829589844 380 | 180 8.689314842224121 381 | 181 8.712116241455078 382 | 182 8.737107276916504 383 | 183 8.7545166015625 384 | 184 8.781569480895996 385 | 185 8.804463386535645 386 | 186 8.828217506408691 387 | 187 8.84929370880127 388 | 188 8.87789535522461 389 | 189 8.89671516418457 390 | 190 8.919512748718262 391 | 191 8.948515892028809 392 | 192 8.968647956848145 393 | 193 8.989168167114258 394 | 194 9.019471168518066 395 | 195 9.040534019470215 396 | 196 9.059708595275879 397 | 197 9.086166381835938 398 | 198 9.106014251708984 399 | 199 9.12833309173584 400 | 200 9.152287483215332 401 | 201 9.17170524597168 402 | 202 9.192819595336914 403 | 203 9.222025871276855 404 | 204 9.242000579833984 405 | 205 9.262640953063965 406 | 206 9.292340278625488 407 | 207 9.312517166137695 408 | 208 9.332563400268555 409 | 209 9.362652778625488 410 | 210 9.37942886352539 411 | 211 9.403473854064941 412 | 212 9.430370330810547 413 | 213 9.448878288269043 414 | 214 9.467549324035645 415 | 215 9.49483585357666 416 | 216 9.513956069946289 417 | 217 9.536771774291992 418 | 218 9.561524391174316 419 | 219 9.583642959594727 420 | 220 9.608484268188477 421 | 221 9.632624626159668 422 | 222 9.656436920166016 423 | 223 9.676606178283691 424 | 224 9.702705383300781 425 | 225 9.725610733032227 426 | 226 9.74528980255127 427 | 227 9.76707935333252 428 | 228 9.788287162780762 429 | 229 9.808671951293945 430 | 230 9.835151672363281 431 | 231 9.85518741607666 432 | 232 9.870915412902832 433 | 233 9.89185905456543 434 | 234 9.909582138061523 435 | 235 9.925028800964355 436 | 236 9.948447227478027 437 | 237 9.964569091796875 438 | 238 9.98108959197998 439 | 239 10.01058292388916 440 | 240 10.029886245727539 441 | 241 10.049802780151367 442 | 242 10.071462631225586 443 | 243 10.090588569641113 444 | 244 10.113792419433594 445 | 245 10.143721580505371 446 | 246 10.159034729003906 447 | 247 10.17558479309082 448 | 248 10.193281173706055 449 | 249 10.21686840057373 450 | 250 10.241169929504395 451 | 251 10.26410961151123 452 | 252 10.285745620727539 453 | 253 10.306694984436035 454 | 254 10.328060150146484 455 | 255 10.34904956817627 456 | 256 10.371853828430176 457 | 257 10.396636962890625 458 | 258 10.417922019958496 459 | 259 10.433265686035156 460 | 260 10.45438003540039 461 | 261 10.473858833312988 462 | 262 10.496040344238281 463 | 263 10.512752532958984 464 | 264 10.536433219909668 465 | 265 10.55765438079834 466 | 266 10.574774742126465 467 | 267 10.599035263061523 468 | 268 
10.620415687561035 469 | 269 10.63953971862793 470 | 270 10.661383628845215 471 | 271 10.681900024414062 472 | 272 10.70407772064209 473 | 273 10.722298622131348 474 | 274 10.737654685974121 475 | 275 10.756811141967773 476 | 276 10.776389122009277 477 | 277 10.792834281921387 478 | 278 10.81033992767334 479 | 279 10.83313274383545 480 | 280 10.854608535766602 481 | 281 10.872381210327148 482 | 282 10.895024299621582 483 | 283 10.916109085083008 484 | 284 10.934562683105469 485 | 285 10.954834938049316 486 | 286 10.973169326782227 487 | 287 11.00400447845459 488 | 288 11.027678489685059 489 | 289 11.048644065856934 490 | 290 11.068979263305664 491 | 291 11.088113784790039 492 | 292 11.103851318359375 493 | 293 11.122899055480957 494 | 294 11.144107818603516 495 | 295 11.157708168029785 496 | 296 11.176496505737305 497 | 297 11.197405815124512 498 | 298 11.212449073791504 499 | 299 11.23056411743164 500 | 300 11.249613761901855 501 | --------------------------------------------------------------------------------
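A minimal end-to-end sketch of ``as_parameter`` under stated assumptions: ``admin_torch`` is installed, and ``ResidualBlock``, its dimensions, and the parameter name are illustrative inventions, not part of this repository.

.. code-block:: python

    import torch
    import torch.nn as nn
    import admin_torch

    class ResidualBlock(nn.Module):
        """A toy residual block whose shortcut branch is rescaled by omega."""

        def __init__(self, dim, num_res_layers):
            super().__init__()
            self.linear = nn.Linear(dim, dim)
            # Registers a trainable vector 'omega' of size dim on this module.
            admin_torch.as_parameter(self, 'omega', num_res_layers, dim)

        def forward(self, x):
            # Shortcut rescaled element-wise by omega, as in the docstring examples.
            return x * self.omega + self.linear(x)

    block = ResidualBlock(16, num_res_layers=12)
    # Unlike as_buffer, as_parameter makes omega visible to optimizers:
    assert 'omega' in dict(block.named_parameters())
    y = block(torch.randn(4, 16))

The practical difference between the two entry points: ``as_buffer`` freezes omega at its computed initialization, while ``as_parameter`` starts from the same initialization but lets training adjust omega afterwards.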