├── .github └── ISSUE_TEMPLATE │ ├── bug-report.md │ ├── feature_request.md │ └── new-model.md ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README-ZH.md ├── README.md ├── docs ├── Makefile ├── README.md ├── make.bat ├── requirements.txt └── source │ ├── _static │ ├── css │ │ └── custom.css │ ├── images │ │ └── logo.png │ └── js │ │ └── custom.js │ ├── api │ ├── block.rst │ └── module.rst │ ├── conf.py │ ├── index.md │ ├── model │ ├── bert.rst │ ├── cpm1.rst │ ├── cpm2.rst │ ├── gpt2.rst │ ├── gptj.rst │ └── t5.rst │ └── notes │ ├── benchmark.md │ ├── installation.md │ ├── pretrain_data.md │ ├── quickstart.md │ └── write_model.md ├── examples ├── bert │ ├── BoolQ.sh │ ├── CB.sh │ ├── COPA.sh │ ├── RTE.sh │ ├── WiC.sh │ └── finetune_bert.py ├── cpm1 │ ├── finetune_cpm1.py │ ├── finetune_cpm1.sh │ ├── pretrain_cpm1.py │ └── pretrain_cpm1.sh ├── cpm2 │ ├── finetune_cpm2.py │ ├── finetune_cpm2.sh │ ├── pretrain_cpm2.py │ └── pretrain_cpm2.sh ├── gpt2 │ ├── BoolQ.sh │ ├── CB.sh │ ├── COPA.sh │ ├── RTE.sh │ ├── WSC.sh │ ├── WiC.sh │ └── finetune_gpt2.py ├── gptj │ ├── BoolQ.sh │ ├── CB.sh │ ├── COPA.sh │ ├── RTE.sh │ ├── WSC.sh │ ├── WiC.sh │ └── finetune_gptj.py ├── llama │ ├── RTE.sh │ └── finetune_llama.py ├── mt5 │ ├── BoolQ.sh │ ├── CB.sh │ ├── COPA.sh │ ├── RTE.sh │ ├── WSC.sh │ ├── WiC.sh │ └── finetune_mt5.py ├── t5-v1_1 │ ├── BoolQ.sh │ ├── CB.sh │ ├── COPA.sh │ ├── RTE.sh │ ├── WSC.sh │ ├── WiC.sh │ └── finetune_t5-v1_1.py └── t5 │ ├── BoolQ.sh │ ├── CB.sh │ ├── COPA.sh │ ├── RTE.sh │ ├── SQuAD.sh │ ├── WSC.sh │ ├── WiC.sh │ ├── finetune_t5_squad.py │ ├── finetune_t5_superglue.py │ └── squad_metric.py ├── model_center ├── __init__.py ├── arguments.py ├── dataset │ ├── __init__.py │ ├── bertdataset │ │ ├── __init__.py │ │ └── superglue.py │ ├── cpm1 │ │ ├── __init__.py │ │ └── cpm1_dataset.py │ ├── cpm1dataset │ │ ├── __init__.py │ │ └── down_data.py │ ├── cpm2 │ │ ├── __init__.py │ │ └── dataset.py │ ├── cpm2dataset │ │ ├── __init__.py │ │ └── down_data.py │ ├── distributed_dataset.py │ ├── distributed_indexed.py │ ├── distributed_loader.py │ ├── gpt2dataset │ │ ├── __init__.py │ │ └── superglue.py │ ├── indexed.py │ ├── llamadataset │ │ ├── __init__.py │ │ └── superglue.py │ ├── t5dataset │ │ ├── __init__.py │ │ ├── squad.py │ │ └── superglue.py │ └── utils.py ├── generation │ ├── __init__.py │ ├── generation_utils.py │ ├── llama.py │ └── t5.py ├── layer │ ├── __init__.py │ ├── attention.py │ ├── blocks.py │ ├── conv.py │ ├── embedding.py │ ├── feedforward.py │ ├── layernorm.py │ ├── linear.py │ ├── position_embedding.py │ └── transformer.py ├── model │ ├── __init__.py │ ├── basemodel.py │ ├── bert.py │ ├── config │ │ ├── __init__.py │ │ ├── bert_config.py │ │ ├── config.py │ │ ├── cpm1_config.py │ │ ├── cpm2_config.py │ │ ├── cpm3_config.py │ │ ├── glm_config.py │ │ ├── gpt2_config.py │ │ ├── gptj_config.py │ │ ├── llama_config.py │ │ ├── longformer_config.py │ │ ├── opt_config.py │ │ ├── roberta_config.py │ │ ├── t5_config.py │ │ └── vit_config.py │ ├── cpm1.py │ ├── cpm2.py │ ├── cpm3.py │ ├── glm.py │ ├── gpt2.py │ ├── gptj.py │ ├── llama.py │ ├── longformer.py │ ├── opt.py │ ├── roberta.py │ ├── t5.py │ └── vit.py ├── tokenizer │ ├── __init__.py │ ├── base_tokenizer.py │ ├── bert_tokenizer.py │ ├── cpm1_tokenizer.py │ ├── cpm2_tokenizer.py │ ├── glm_tokenizer.py │ ├── gpt2_tokenizer.py │ ├── gptj_tokenizer.py │ ├── llama_tokenizer.py │ ├── opt_tokenizer.py │ ├── roberta_tokenizer.py │ └── t5_tokenizer.py ├── tools │ ├── indexed_dataset.py │ ├── preprocess_cpm1_lm.py │ └── 
run_preprocess.sh └── utils │ ├── __init__.py │ ├── net_utils.py │ └── print_utils.py ├── requirements.txt ├── setup.py ├── tests ├── test.sh ├── test_bert.py ├── test_bert_pkv.py ├── test_flan_t5.py ├── test_glm.py ├── test_gpt2.py ├── test_gpt_pkv.py ├── test_gptj.py ├── test_llama.py ├── test_longformer.py ├── test_mt5.py ├── test_opt.py ├── test_roberta.py ├── test_t5.py ├── test_t5v1_1.py └── test_vit.py └── transfer ├── hugFLANT5_bmtrainFLANT5.py ├── hugGPT2_bmtrainGPT2.py ├── hugGPTj_bmtrainGPTj.py ├── hugLLaMa2_bmtrainLLaMa2.py ├── hugLLaMa_bmtrainLLaMa.py ├── hugLongformer_bmtrainLongformer.py ├── hugMT5_bmtrainMT5.py ├── hugOPT_bmtrainOPT.py ├── hugRoBERTa_bmtrainRoBERTa.py ├── hugT5_bmtrainT5.py ├── hugT5v1_1_bmtrainT5v1_1.py └── run.sh /.github/ISSUE_TEMPLATE/bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[BUG]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | 12 | 13 | **Minimal steps to reproduce** 14 | 15 | 16 | **Expected behavior** 17 | 18 | 19 | **Screenshots** 20 | 21 | 22 | **Environment:** 23 | 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[FEATURE]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | 12 | 13 | **Describe the solution you'd like** 14 | 15 | 16 | **Describe alternatives you've considered** 17 | 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/new-model.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: New model 3 | about: Add a new model to this project 4 | title: "[MODEL] " 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Introduction** 11 | 12 | 13 | **Resources** 14 | 15 | Paper: 16 | 17 | Code: 18 | 19 | Author: 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | **/__pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | .DS_STORE 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | 133 | .vscode/ 134 | pretrain_data/ 135 | small_data/ 136 | large_data/ 137 | raw_data/ 138 | checkpoints/ 139 | results 140 | new_data 141 | down_data 142 | 143 | *.bin 144 | *.idx 145 | *.pt 146 | 147 | pretrain_data_raw/ 148 | data_log 149 | debug 150 | checkpoints/ 151 | 152 | src/BMTrain 153 | src/transformers 154 | logs 155 | debug.sh -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to ModelCenter 2 | 3 | We welcome everyone's effort to make the community and the package better. You are welcomed to propose an issue, make a pull request or help others in the community. All of the efforts are appreciated! 4 | 5 | There are many ways that you can contribute to ModelCenter: 6 | 7 | - ✉️ Submitting an issue. 8 | - ⌨️ Making a pull request. 9 | - 🤝 Serving the community. 10 | 11 | ## Submitting an issue 12 | You can submit an issue if you find bugs or require new features and enhancements. Here are some principles: 13 | 14 | 1. **Language.** It is better to write your issue in English so that more people can understand and help you more conveniently. 15 | 2. **Search.** It is a good habit to search existing issues using the search bar of GitHub. Make sure there are no duplicated or similar issues with yours and if yes, check their solutions first. 16 | 3. **Format.** It is also very helpful to write the issue with a good writing style. We provide templates of common types of issues and everyone is encouraged to use these templates. If the templates do not fit in your issue, feel free to open a blank one. 17 | 4. 
**Writing style.** Write your issues in clear and concise words. It is also important to provide enough details for others to help. For example in a bug report, it is better to provide your running environment and minimal lines of code to reproduce it. 18 | 19 | ## Making a pull request (PR) 20 | You can also write codes to contribute. The codes may include a bug fix, a new enhancement, or a new running example. Here we provide the steps to make a pull request: 21 | 22 | 1. **Combine the PR with an issue.** Make us and others know what you are going to work on. If your codes try to solve an existing issue, you should comment on the issue and make sure there are no others working on it. If you are proposing a new enhancement, submit an issue first and we can discuss it with you before you work on it. 23 | 24 | 2. **Fork the repository.** Fork the repository to your own GitHub space by clicking the "Fork" button. Then clone it on your disk and set the remote repo: 25 | ```git 26 | $ git clone https://github.com//ModelCenter.git 27 | $ cd ModelCenter 28 | $ git remote add upstream https://github.com/OpenBMB/ModelCenter.git 29 | ``` 30 | 31 | 3. **Write your code.** Change to a new branch to work on your modifications. 32 | ```git 33 | $ git checkout -b your-branch-name 34 | ``` 35 | You are encouraged to think up a meaningful and descriptive name for your branch. 36 | 37 | 4. **Make a pull request.** After you finish coding, you should first rebase your code and solve the conflicts with the remote codes: 38 | ```git 39 | $ git fetch upstream 40 | $ git rebase upstream/main 41 | ``` 42 | Then you can push your codes to your own repo: 43 | ```git 44 | $ git push -u origin your-branch-name 45 | ``` 46 | Finally, you can make the pull request from your GitHub repo and merge it with ours. Your codes will be merged into the main repo after our code review. 47 | 48 | 49 | ## Serving the community 50 | 51 | Besides submitting issues and PRs, you can also join our community and help others. Efforts like writing the documents, answering questions as well as discussing new features are appreciated and welcomed. It will also be helpful if you can post your opinions and feelings about using our package on social media. 52 | 53 | We are now developing a reward system and all your contributions will be recorded and rewarded in the future. 54 | 55 | 56 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # ModelCenter Documentation 2 | 3 | To build this doc locally, first 4 | 5 | ``` 6 | pip install -r docs/requirements.txt 7 | ``` 8 | 9 | then, 10 | 11 | ``` 12 | cd docs 13 | make html 14 | ``` 15 | 16 | Then open the generated `docs/build/html/index.html` in your local browser. -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.https://www.sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx>=4.0.0 2 | recommonmark 3 | sphinx_markdown_tables 4 | sphinx_rtd_theme>=0.3.0 5 | torch>=1.10 6 | transformers 7 | jieba -------------------------------------------------------------------------------- /docs/source/_static/css/custom.css: -------------------------------------------------------------------------------- 1 | a, 2 | .wy-menu-vertical header, 3 | .wy-menu-vertical p.caption, 4 | .wy-nav-top .fa-bars, 5 | .wy-menu-vertical a:hover, 6 | 7 | .rst-content code.literal, .rst-content tt.literal 8 | 9 | { 10 | color: #315EFE !important; 11 | } 12 | 13 | /* inspired by sphinx press theme */ 14 | .wy-menu.wy-menu-vertical li.toctree-l1.current > a { 15 | border-left: solid 8px #315EFE !important; 16 | border-top: none; 17 | border-bottom: none; 18 | } 19 | 20 | .wy-menu.wy-menu-vertical li.toctree-l1.current > ul { 21 | border-left: solid 8px #315EFE !important; 22 | } 23 | /* inspired by sphinx press theme */ 24 | 25 | .wy-nav-side { 26 | color: unset !important; 27 | background: unset !important; 28 | border-right: solid 1px #ccc !important; 29 | } 30 | 31 | .wy-side-nav-search, 32 | .wy-nav-top, 33 | .wy-menu-vertical li, 34 | .wy-menu-vertical li a:hover, 35 | .wy-menu-vertical li a 36 | { 37 | background: unset !important; 38 | } 39 | 40 | .wy-menu-vertical li.current a { 41 | border-right: unset !important; 42 | } 43 | 44 | .wy-side-nav-search div, 45 | .wy-menu-vertical a { 46 | color: #404040 !important; 47 | } 48 | 49 | .wy-menu-vertical button.toctree-expand { 50 | color: #333 !important; 51 | } 52 | 53 | .wy-nav-content { 54 | max-width: unset; 55 | } 56 | 57 | .rst-content { 58 | max-width: 900px; 59 | } 60 | 61 | .wy-nav-content .icon-home:before { 62 
| content: "Docs"; 63 | } 64 | 65 | .wy-side-nav-search .icon-home:before { 66 | content: ""; 67 | } 68 | 69 | dl.field-list { 70 | display: block !important; 71 | } 72 | 73 | dl.field-list > dt:after { 74 | content: "" !important; 75 | } 76 | 77 | :root { 78 | --dark-blue: #3260F7; 79 | --light-blue: rgba(194, 233, 248, 0.1) ; 80 | } 81 | 82 | dl.field-list > dt { 83 | display: table; 84 | padding-left: 6px !important; 85 | padding-right: 6px !important; 86 | margin-bottom: 4px !important; 87 | padding-bottom: 1px !important; 88 | background: var(--light-blue); 89 | border-left: solid 2px var(--dark-blue); 90 | } 91 | 92 | 93 | dl.py.class>dt 94 | { 95 | color: rgba(17, 16, 17, 0.822) !important; 96 | background: var(--light-blue) !important; 97 | border-top: solid 2px var(--dark-blue) !important; 98 | } 99 | 100 | dl.py.method>dt 101 | { 102 | background: var(--light-blue) !important; 103 | border-left: solid 2px var(--dark-blue) !important; 104 | } 105 | 106 | dl.py.attribute>dt, 107 | dl.py.property>dt 108 | { 109 | background: var(--light-blue) !important; 110 | border-left: solid 2px var(--dark-blue) !important; 111 | } 112 | 113 | .fa-plus-square-o::before, .wy-menu-vertical li button.toctree-expand::before, 114 | .fa-minus-square-o::before, .wy-menu-vertical li.current > a button.toctree-expand::before, .wy-menu-vertical li.on a button.toctree-expand::before 115 | { 116 | content: ""; 117 | } 118 | 119 | .rst-content .viewcode-back, 120 | .rst-content .viewcode-link 121 | { 122 | color:#58b5cc; 123 | font-size: 120%; 124 | } -------------------------------------------------------------------------------- /docs/source/_static/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ModelCenter/14490451e9a91675ef8816c64cf6304d509bce62/docs/source/_static/images/logo.png -------------------------------------------------------------------------------- /docs/source/_static/js/custom.js: -------------------------------------------------------------------------------- 1 | document.addEventListener("DOMContentLoaded", function(event) { 2 | document.querySelectorAll(".wy-menu.wy-menu-vertical > ul.current > li > a").forEach(a => a.addEventListener("click", e=>{ 3 | f = document.querySelector(".wy-menu.wy-menu-vertical > ul.current > li > ul") 4 | if (f.style.display=='none') { f.style.display='block'; } else f.style.display = 'none' 5 | })); 6 | document.querySelectorAll(".headerlink").forEach(a => a.text="\u{1F517}"); 7 | }); -------------------------------------------------------------------------------- /docs/source/api/block.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | block 3 | ======================= 4 | 5 | Encoder 6 | ------------------------------------ 7 | .. autoclass:: model_center.layer.Encoder 8 | :members: 9 | :show-inheritance: 10 | 11 | Decoder 12 | ------------------------------------ 13 | .. autoclass:: model_center.layer.Decoder 14 | :members: 15 | :show-inheritance: 16 | 17 | TransformerBlock 18 | ------------------------------------ 19 | .. autoclass:: model_center.layer.TransformerBlock 20 | :members: 21 | :show-inheritance: 22 | 23 | FFNBlock 24 | ------------------------------------ 25 | .. autoclass:: model_center.layer.FFNBlock 26 | :members: 27 | :show-inheritance: 28 | 29 | SelfAttentionBlock 30 | ------------------------------------ 31 | .. 
autoclass:: model_center.layer.SelfAttentionBlock 32 | :members: 33 | :show-inheritance: 34 | 35 | CrossAttentionBlock 36 | ------------------------------------ 37 | .. autoclass:: model_center.layer.CrossAttentionBlock 38 | :members: 39 | :show-inheritance: -------------------------------------------------------------------------------- /docs/source/api/module.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | module 3 | ======================= 4 | 5 | Linear 6 | ------------------------------------ 7 | .. autoclass:: model_center.layer.Linear 8 | :members: 9 | :show-inheritance: 10 | 11 | Embedding 12 | ------------------------------------ 13 | .. autoclass:: model_center.layer.Embedding 14 | :members: 15 | :show-inheritance: 16 | 17 | RelativePositionEmbedding 18 | ------------------------------------ 19 | .. autoclass:: model_center.layer.RelativePositionEmbedding 20 | :members: 21 | :show-inheritance: 22 | 23 | RotaryEmbedding 24 | ------------------------------------ 25 | .. autoclass:: model_center.layer.RotaryEmbedding 26 | :members: 27 | :show-inheritance: 28 | 29 | LayerNorm 30 | ------------------------------------ 31 | .. autoclass:: model_center.layer.LayerNorm 32 | :members: 33 | :show-inheritance: 34 | 35 | Attention 36 | ------------------------------------ 37 | .. autoclass:: model_center.layer.Attention 38 | :members: 39 | :show-inheritance: 40 | 41 | FeedForward 42 | ------------------------------------ 43 | .. autoclass:: model_center.layer.FeedForward 44 | :members: 45 | :show-inheritance: -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('../..')) 16 | 17 | import recommonmark 18 | from recommonmark.transform import AutoStructify 19 | from recommonmark.parser import CommonMarkParser 20 | 21 | 22 | 23 | # -- Project information ----------------------------------------------------- 24 | 25 | project = 'ModelCenter' 26 | copyright = '2022, OpenBMB' 27 | author = 'BMTrain Team' 28 | autodoc_mock_imports = ["bmtrain"] 29 | 30 | # -- General configuration --------------------------------------------------- 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones. 35 | extensions = [ 36 | 'sphinx.ext.autodoc', 37 | 'sphinx.ext.napoleon', 38 | 'sphinx.ext.mathjax', 39 | 'recommonmark', 40 | 'sphinx_markdown_tables', 41 | ] 42 | 43 | source_parsers = { 44 | '.md': CommonMarkParser, 45 | } 46 | 47 | source_suffix = ['.rst', '.md'] 48 | 49 | # Add any paths that contain templates here, relative to this directory. 50 | templates_path = ['_templates'] 51 | 52 | # The language for content autogenerated by Sphinx. 
Refer to documentation 53 | # for a list of supported languages. 54 | # 55 | # This is also used if you do content translation via gettext catalogs. 56 | # Usually you set "language" from the command line for these cases. 57 | # language = 'zh_CN' 58 | 59 | # List of patterns, relative to source directory, that match files and 60 | # directories to ignore when looking for source files. 61 | # This pattern also affects html_static_path and html_extra_path. 62 | exclude_patterns = [] 63 | 64 | 65 | # -- Options for HTML output ------------------------------------------------- 66 | 67 | # The theme to use for HTML and HTML Help pages. See the documentation for 68 | # a list of builtin themes. 69 | # 70 | html_theme = 'sphinx_rtd_theme' 71 | 72 | # Add any paths that contain custom static files (such as style sheets) here, 73 | # relative to this directory. They are copied after the builtin static files, 74 | # so a file named "default.css" will overwrite the builtin "default.css". 75 | html_static_path = ['_static'] 76 | #html_stype="css/custom.css" 77 | html_css_files=['css/custom.css' ] 78 | html_js_files= ['js/custom.js' ] 79 | add_module_names = True 80 | 81 | 82 | # At the bottom of conf.py 83 | def setup(app): 84 | app.add_config_value('recommonmark_config', { 85 | #'url_resolver': lambda url: github_doc_root + url, 86 | 'auto_toc_tree_section': 'Contents', 87 | 'enable_math': False, 88 | 'enable_inline_math': False, 89 | 'enable_eval_rst': True, 90 | }, True) 91 | app.add_transform(AutoStructify) 92 | -------------------------------------------------------------------------------- /docs/source/index.md: -------------------------------------------------------------------------------- 1 | # ModelCenter's Documentation 2 | 3 | ModelCenter implements PLMs (Pretrained Language Models) based on [BMTrain](https://bmtrain.readthedocs.io/en/latest/index.html>) backend. 4 | 5 | ## Main Advantages: 6 | 7 | - Low-Resource 8 | - Efficient 9 | - Extendable 10 | 11 | ```eval_rst 12 | .. toctree:: 13 | :maxdepth: 2 14 | :caption: GETTING STARTED 15 | 16 | notes/installation.md 17 | notes/quickstart.md 18 | notes/benchmark.md 19 | notes/write_model.md 20 | notes/pretrain_data.md 21 | 22 | .. toctree:: 23 | :maxdepth: 1 24 | :caption: Models 25 | 26 | model/bert.rst 27 | model/gpt2.rst 28 | model/gptj.rst 29 | model/t5.rst 30 | model/cpm1.rst 31 | model/cpm2.rst 32 | 33 | .. toctree:: 34 | :maxdepth: 2 35 | :caption: PACKAGE REFERENCE 36 | 37 | api/module.rst 38 | api/block.rst 39 | 40 | .. toctree:: 41 | :maxdepth: 2 42 | :caption: Advanced 43 | 44 | 45 | Indices and tables 46 | ================== 47 | 48 | * :ref:`genindex` 49 | 50 | ``` -------------------------------------------------------------------------------- /docs/source/model/bert.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | BERT 3 | ======================= 4 | 5 | `Bert `_ 6 | 7 | We currently support loading the following checkpoint via ``Bert.from_pretrained(identifier)`` 8 | 9 | - bert-base-cased 10 | - bert-base-uncased 11 | - bert-large-cased 12 | - bert-large-uncased 13 | - bert-base-chinese 14 | - bert-base-multilingual-cased 15 | 16 | BertConfig 17 | ------------------------------------ 18 | .. autoclass:: model_center.model.BertConfig 19 | :members: 20 | 21 | BertModel 22 | ------------------------------------ 23 | .. autoclass:: model_center.model.Bert 24 | :members: 25 | 26 | BertTokenizer 27 | ------------------------------------ 28 | .. 
class:: model_center.tokenizer.BertTokenizer 29 | 30 | The current implementation is mainly an alias to BertTokenizer of `Hugging Face Transformers `_. 31 | we will change to our SAM implementation in the future, which will be a more efficient tokenizer. -------------------------------------------------------------------------------- /docs/source/model/cpm1.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | CPM1 3 | ======================= 4 | 5 | `CPM1 `_ 6 | 7 | CPM1Config 8 | ------------------------------------ 9 | .. autoclass:: model_center.model.CPM1Config 10 | :members: 11 | 12 | CPM1Model 13 | ------------------------------------ 14 | .. autoclass:: model_center.model.CPM1 15 | :members: 16 | 17 | CPM1Tokenizer 18 | ------------------------------------ 19 | .. autoclass:: model_center.tokenizer.CPM1Tokenizer 20 | :members: -------------------------------------------------------------------------------- /docs/source/model/cpm2.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | CPM2 3 | ======================= 4 | 5 | `CPM2 `_ 6 | 7 | CPM2Config 8 | ------------------------------------ 9 | .. autoclass:: model_center.model.CPM2Config 10 | :members: 11 | 12 | CPM2Model 13 | ------------------------------------ 14 | .. autoclass:: model_center.model.CPM2 15 | :members: 16 | 17 | CPM2Tokenizer 18 | ------------------------------------ 19 | .. autoclass:: model_center.tokenizer.CPM2Tokenizer 20 | :members: -------------------------------------------------------------------------------- /docs/source/model/gpt2.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | GPT2 3 | ======================= 4 | 5 | `GPT2 `_ 6 | 7 | We currently support loading the following checkpoint via ``GPT2.from_pretrained(identifier)`` 8 | 9 | - gpt2-base 10 | - gpt2-medium 11 | - gpt2-large 12 | - gpt2-xl 13 | 14 | GPT2Config 15 | ------------------------------------ 16 | .. autoclass:: model_center.model.GPT2Config 17 | :members: 18 | 19 | GPT2Model 20 | ------------------------------------ 21 | .. autoclass:: model_center.model.GPT2 22 | :members: 23 | 24 | GPT2Tokenizer 25 | ------------------------------------ 26 | .. class:: model_center.tokenizer.GPT2Tokenizer 27 | 28 | The current implementation is mainly an alias to GPT2Tokenizer of `Hugging Face Transformers `_. 29 | we will change to our SAM implementation in the future, which will be a more efficient tokenizer. -------------------------------------------------------------------------------- /docs/source/model/gptj.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | GPT-j 3 | ======================= 4 | 5 | `GPTj `_ 6 | 7 | We currently support loading the following checkpoint via ``GPTj.from_pretrained(identifier)`` 8 | 9 | - gptj-6b 10 | 11 | GPTjConfig 12 | ------------------------------------ 13 | .. autoclass:: model_center.model.GPTjConfig 14 | :members: 15 | 16 | GPTjModel 17 | ------------------------------------ 18 | .. autoclass:: model_center.model.GPTj 19 | :members: 20 | 21 | GPTjTokenizer 22 | ------------------------------------ 23 | .. class:: model_center.tokenizer.GPTjTokenizer 24 | 25 | The current implementation is mainly an alias to AutoTokenizer of `Hugging Face Transformers `_. 
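For example, a minimal usage sketch (illustrative only; it assumes the tokenizer exposes the same ``from_pretrained(identifier)`` interface as ``GPTj.from_pretrained`` with the ``gptj-6b`` identifier listed above):

.. code-block:: python

    from model_center.tokenizer import GPTjTokenizer

    # Because the class is mainly an alias of Hugging Face's AutoTokenizer,
    # the returned object supports the usual encode/decode methods.
    tokenizer = GPTjTokenizer.from_pretrained("gptj-6b")
    token_ids = tokenizer.encode("ModelCenter makes big models trainable.")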
26 | We will change to our SAM implementation in the future, which will be a more efficient tokenizer. -------------------------------------------------------------------------------- /docs/source/model/t5.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | T5 3 | ======================= 4 | 5 | `T5 `_ 6 | 7 | We currently support loading the following checkpoints via ``T5.from_pretrained(identifier)`` 8 | 9 | - t5-small 10 | - t5-base 11 | - t5-large 12 | - t5-3b 13 | - t5-11b 14 | 15 | T5Config 16 | ------------------------------------ 17 | .. autoclass:: model_center.model.T5Config 18 | :members: 19 | 20 | T5Model 21 | ------------------------------------ 22 | .. autoclass:: model_center.model.T5 23 | :members: 24 | 25 | T5Tokenizer 26 | ------------------------------------ 27 | .. class:: model_center.tokenizer.T5Tokenizer 28 | 29 | The current implementation is mainly an alias to T5Tokenizer of `Hugging Face Transformers `_. 30 | We will change to our SAM implementation in the future, which will be a more efficient tokenizer.
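A minimal loading sketch (illustrative only; it assumes a BMTrain distributed context has already been initialized with ``bmtrain.init_distributed()``, and that the config and tokenizer classes accept the same identifiers as ``T5.from_pretrained``):

.. code-block:: python

    import bmtrain as bmt
    from model_center.model import T5, T5Config
    from model_center.tokenizer import T5Tokenizer

    # Assumed prerequisite: ModelCenter parameters are BMTrain distributed parameters.
    bmt.init_distributed(seed=0)

    config = T5Config.from_pretrained("t5-base")       # assumed to accept the identifiers above
    model = T5.from_pretrained("t5-base")
    tokenizer = T5Tokenizer.from_pretrained("t5-base")

    # The tokenizer is currently an alias of the Hugging Face T5Tokenizer,
    # so the usual encode/decode interface applies.
    token_ids = tokenizer.encode("translate English to German: Hello, world!")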
-------------------------------------------------------------------------------- /docs/source/notes/benchmark.md: -------------------------------------------------------------------------------- 1 | # Benchmark 2 | 3 | ## Comparison with Hugging Face Transformers 4 | 5 | ### Make Big Models trainable on consumer GPUs 6 | 7 | Tested on a 32GB V100 machine using `bert-large-uncased`, we achieve throughput comparable to Hugging Face Transformers with a much smaller GPU memory footprint. 8 | 9 | |repo|**max** batch size (#examples)|time (s)|throughput (#examples/s)| 10 | |-|-|-|-| 11 | |transformers|11|1.11|9.9| 12 | |transformers+fp16|14|0.53|26.4| 13 | |modelcenter|256|10.3|24.9| 14 | 15 | On a **single consumer GPU** (an 11GB 2080Ti), Hugging Face Transformers can no longer fit `bert-large-uncased` for training, but ModelCenter makes it possible. 16 | 17 | |repo|**max** batch size (#examples)| 18 | |-|-| 19 | |transformers|0| 20 | |transformers+fp16|0| 21 | |modelcenter|72| 22 | 23 | ### Make Huge Models train easily 24 | 25 | Tested on a 40GB A100 machine using `T5-11B`, we make it possible to train with a batch size of 16 on two GPUs. 26 | 27 | ## Comparison with DeepSpeed ZeRO 28 | 29 | See also [BMTrain's Performance](https://github.com/OpenBMB/BMTrain#performance) -------------------------------------------------------------------------------- /docs/source/notes/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | ## 1. From PyPI (Recommended) 4 | 5 | ```shell 6 | $ pip install model-center 7 | ``` 8 | 9 | ## 2. From Source 10 | 11 | ```shell 12 | $ git clone https://github.com/OpenBMB/ModelCenter.git 13 | $ cd ModelCenter 14 | $ pip install -r requirements.txt 15 | $ python3 setup.py install 16 | ``` 17 | -------------------------------------------------------------------------------- /docs/source/notes/write_model.md: -------------------------------------------------------------------------------- 1 | # How to write a new model 2 | 3 | ## Model Implementation 4 | 5 | We implement our models in `model_center/model`. 6 | 7 | We provide commonly used [modules](https://bmtrain.readthedocs.io/en/latest/api/module.html) in `model_center/layer`, such as `Linear`, `LayerNorm`, and `Embedding`, 8 | which are implemented based on [bmtrain.DistributedParameter](https://bmtrain.readthedocs.io/en/latest/api/bmtrain.html#bmtrain.DistributedParameter) 9 | and [bmtrain.DistributedModule](https://bmtrain.readthedocs.io/en/latest/api/bmtrain.html#bmtrain.DistributedModule) for distributed training support. 10 | 11 | We have also implemented common ways of combining modules in `model_center/layer`, namely [blocks](https://bmtrain.readthedocs.io/en/latest/api/block.html). 12 | For example, `SelfAttentionBlock` combines LayerNorm, Attention, and Add&Norm. 13 | Each block has diverse options, e.g., `FFNBlock` supports `gated_relu`, `relu`, `gated_gelu`, and `gelu`; blocks support both pre-layernorm and post-layernorm. 14 | 15 | With the help of these commonly used modules, a new model can be written easily and with few special cases: you just add the model-specific features on top of the common structure. 16 | 17 | A classic transformer is implemented in the following structure: 18 | 19 | We use [bmtrain.CheckpointBlock](https://bmtrain.readthedocs.io/en/latest/api/bmtrain.html#bmtrain.CheckpointBlock) and 20 | [bmtrain.TransformerBlockList](https://bmtrain.readthedocs.io/en/latest/api/bmtrain.html#bmtrain.TransformerBlockList) to wrap our transformer blocks. 21 | These reduce GPU memory usage by a great amount without adding much computation time. 22 | For more information, see [BMTrain's Quick Start](https://bmtrain.readthedocs.io/en/latest/notes/quickstart-zh.html) 23 | 24 | ``` 25 | T5( 26 | (input_embedding): Embedding() 27 | (position_bias_enc): RelativePositionEmbedding() 28 | (position_bias_dec): RelativePositionEmbedding() 29 | (encoder): Encoder( 30 | (layers): bmtrain.TransformerBlockList( 31 | (0): bmtrain.CheckpointBlock( 32 | TransformerBlock( 33 | (self_att): SelfAttentionBlock( 34 | (layernorm_before_attention): LayerNorm() 35 | (attention): Attention( 36 | (project_q): Linear() 37 | (project_k): Linear() 38 | (project_v): Linear() 39 | (attention_out): Linear() 40 | ) 41 | ) 42 | (ffn): FFNBlock( 43 | (layernorm_before_ffn): LayerNorm() 44 | (ffn): FeedForward( 45 | (w_in): DenseACT( 46 | (w): Linear() 47 | (act): ReLU() 48 | ) 49 | (w_out): Linear() 50 | ) 51 | ) 52 | ) 53 | ) 54 | (1): bmtrain.CheckpointBlock() 55 | . 56 | . 57 | . 58 | ) 59 | (output_layernorm): LayerNorm() 60 | ) 61 | (decoder): Decoder( 62 | (layers): bmtrain.TransformerBlockList( 63 | (0): bmtrain.CheckpointBlock( 64 | (self_att): SelfAttentionBlock( 65 | (layernorm_before_attention): LayerNorm() 66 | (attention): Attention( 67 | (project_q): Linear() 68 | (project_k): Linear() 69 | (project_v): Linear() 70 | (attention_out): Linear() 71 | ) 72 | ) 73 | (cross_att): CrossAttentionBlock( 74 | (layernorm_before_attention): LayerNorm() 75 | (attention): Attention( 76 | (project_q): Linear() 77 | (project_k): Linear() 78 | (project_v): Linear() 79 | (attention_out): Linear() 80 | ) 81 | ) 82 | (ffn): FFNBlock( 83 | (layernorm_before_ffn): LayerNorm() 84 | (ffn): FeedForward( 85 | (w_in): DenseACT( 86 | (w): Linear() 87 | (act): ReLU() 88 | ) 89 | (w_out): Linear() 90 | ) 91 | ) 92 | ) 93 | (1): bmtrain.CheckpointBlock() 94 | . 95 | . 96 | . 97 | ) 98 | (output_layernorm): LayerNorm() 99 | ) 100 | (output_projection): Linear( 101 | (weight): bmtrain.DistributedParameter() 102 | (bias): bmtrain.DistributedParameter() 103 | ) 104 | ) 105 | ``` 106 | 107 | ## Model Config 108 | 109 | We add model configs in `model_center/model/config`. 110 | 111 | By inheriting from `model_center.config.Config`, a config class can parse JSON files with the `config.from_json_file(path)` method; 112 | the parsed values are then saved to the config object and passed to the model when it is instantiated with `model(config)`.
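Putting the two parts together, here is a minimal sketch of what a new model might look like (illustrative only; the constructor arguments of the `model_center.layer` modules and the config field names are assumptions, so check the classes under `model_center/layer` and `model_center/model/config` for the real signatures):

```python
import bmtrain as bmt
from model_center.layer import Embedding, Encoder, LayerNorm, Linear


class TinyEncoderModel(bmt.DistributedModule):
    """A toy encoder-only model assembled from ModelCenter's common modules."""

    def __init__(self, config):
        super().__init__()
        # Common structure: embedding -> encoder stack -> task-specific head.
        self.input_embedding = Embedding(config.vocab_size, config.dim_model)
        self.encoder = Encoder(
            num_layers=config.num_layers,   # field names assumed for illustration
            dim_model=config.dim_model,
            dim_ff=config.dim_ff,
            num_heads=config.num_heads,
            dim_head=config.dim_head,
        )
        self.output_layernorm = LayerNorm(config.dim_model)
        # Model-specific feature added on top of the common structure.
        self.classifier = Linear(config.dim_model, 2)

    def forward(self, input_ids, attention_mask):
        hidden = self.input_embedding(input_ids)
        hidden = self.encoder(hidden, attention_mask)
        hidden = self.output_layernorm(hidden)
        return self.classifier(hidden[:, 0, :])


# Hypothetical usage, assuming bmtrain.init_distributed() has been called and
# TinyEncoderConfig inherits from the Config class described in Model Config above:
#   config = TinyEncoderConfig.from_json_file("tiny_encoder.json")
#   model = TinyEncoderModel(config)
```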
-------------------------------------------------------------------------------- /examples/bert/BoolQ.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=1 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="bert-large-cased" 17 | DATASET="BoolQ" 18 | 19 | OPTS="" 20 | OPTS+=" --model-config ${VERSION}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --dataset_name ${DATASET}" 23 | OPTS+=" --batch-size 64" 24 | OPTS+=" --lr 0.00001" 25 | OPTS+=" --max-encoder-length 512" 26 | OPTS+=" --train-iters 1400" 27 | OPTS+=" --lr-decay-style constant" 28 | OPTS+=" --weight-decay 1e-2" 29 | OPTS+=" --clip-grad 10.0" 30 | OPTS+=" --loss-scale 128" 31 | 32 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/bert/finetune_bert.py ${OPTS}" 33 | echo ${CMD} 34 | 35 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/bert_superglue/finetune-${VERSION}-${DATASET}.log -------------------------------------------------------------------------------- /examples/bert/CB.sh: -------------------------------------------------------------------------------- 1 | #!
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12347 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=1 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="bert-large-cased" 17 | DATASET="CB" 18 | 19 | OPTS="" 20 | OPTS+=" --model-config ${VERSION}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --dataset_name ${DATASET}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --lr 0.00001" 25 | OPTS+=" --max-encoder-length 512" 26 | OPTS+=" --train-iters 400" 27 | OPTS+=" --lr-decay-style constant" 28 | OPTS+=" --warmup-iters 40" 29 | OPTS+=" --weight-decay 2e-3" 30 | OPTS+=" --clip-grad 1.0" 31 | OPTS+=" --loss-scale 128" 32 | 33 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/bert/finetune_bert.py ${OPTS}" 34 | echo ${CMD} 35 | 36 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/bert_superglue/finetune-${VERSION}-${DATASET}.log -------------------------------------------------------------------------------- /examples/bert/COPA.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=1 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="bert-large-cased" 17 | DATASET="COPA" 18 | 19 | OPTS="" 20 | OPTS+=" --model-config ${VERSION}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --dataset_name ${DATASET}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --lr 0.00001" 25 | OPTS+=" --max-encoder-length 512" 26 | OPTS+=" --train-iters 400" 27 | OPTS+=" --lr-decay-style constant" 28 | OPTS+=" --warmup-iters 40" 29 | OPTS+=" --weight-decay 1e-2" 30 | OPTS+=" --clip-grad 1.0" 31 | OPTS+=" --loss-scale 128" 32 | 33 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/bert/finetune_bert.py ${OPTS}" 34 | echo ${CMD} 35 | 36 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/bert_superglue/finetune-${VERSION}-${DATASET}.log -------------------------------------------------------------------------------- /examples/bert/RTE.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=1 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="bert-large-cased" 17 | DATASET="RTE" 18 | 19 | OPTS="" 20 | OPTS+=" --model-config ${VERSION}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --dataset_name ${DATASET}" 23 | OPTS+=" --batch-size 64" 24 | OPTS+=" --warmup-iters 40" 25 | OPTS+=" --lr 0.00005" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --train-iters 400" 28 | OPTS+=" --lr-decay-style constant" 29 | OPTS+=" --weight-decay 1e-2" 30 | OPTS+=" --clip-grad 10.0" 31 | OPTS+=" --loss-scale 128" 32 | 33 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/bert/finetune_bert.py ${OPTS}" 34 | echo ${CMD} 35 | 36 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/bert_superglue/finetune-${VERSION}-${DATASET}.log 37 | -------------------------------------------------------------------------------- /examples/bert/WiC.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=1 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="bert-large-cased" 17 | DATASET="WiC" 18 | 19 | OPTS="" 20 | OPTS+=" --model-config ${VERSION}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --dataset_name ${DATASET}" 23 | OPTS+=" --batch-size 64" 24 | OPTS+=" --warmup-iters 100" 25 | OPTS+=" --lr 0.00001" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --train-iters 400" 28 | OPTS+=" --lr-decay-style constant" 29 | OPTS+=" --weight-decay 1e-2" 30 | OPTS+=" --clip-grad 10.0" 31 | OPTS+=" --loss-scale 128" 32 | 33 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/bert/finetune_bert.py ${OPTS}" 34 | echo ${CMD} 35 | 36 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/bert_superglue/finetune-${VERSION}-${DATASET}.log -------------------------------------------------------------------------------- /examples/cpm1/finetune_cpm1.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=8 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | DATASET="LCQMC" 17 | 18 | OPTS="" 19 | OPTS+=" --dataset ${DATASET}" 20 | OPTS+=" --base-path ${BASE_PATH}" 21 | OPTS+=" --model-config cpm1-large" 22 | OPTS+=" --batch-size 64" 23 | OPTS+=" --train-iters 3000" 24 | OPTS+=" --save-iters 1000" 25 | OPTS+=" --max-length 256" 26 | OPTS+=" --save ${BASE_PATH}/results" 27 | OPTS+=" --save-name finetune-cpm1-ckpt" 28 | OPTS+=" --lr 0.02" 29 | OPTS+=" --inspect-iters 100" 30 | OPTS+=" --warmup-iters 200" 31 | OPTS+=" --lr-decay-style noam" 32 | OPTS+=" --weight-decay 1e-3" 33 | OPTS+=" --clip-grad 1.0" 34 | OPTS+=" --loss-scale 1048576" 35 | # OPTS+=" --load ${BASE_PATH}/results/cpm1-new.pt" 36 | 37 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/cpm1/finetune_cpm1.py ${OPTS}" 38 | echo ${CMD} 39 | 40 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/cpm1/${DATASET}.log 41 | -------------------------------------------------------------------------------- /examples/cpm1/pretrain_cpm1.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=8 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/mnt/sfs_turbo/hx/ModelCenter" 16 | 17 | OPTS="" 18 | OPTS+=" --base-path ${BASE_PATH}" 19 | OPTS+=" --model-config ${BASE_PATH}/configs/cpm1/cpm1-large" 20 | OPTS+=" --batch-size 64" 21 | OPTS+=" --train-iters 200000" 22 | OPTS+=" --save-iters 1000" 23 | OPTS+=" --save-name noam-1e-3-0.1-checkpoint" 24 | OPTS+=" --max-length 512" 25 | OPTS+=" --save ${BASE_PATH}/results" 26 | OPTS+=" --lr 0.1" 27 | OPTS+=" --inspect-iters 1000" 28 | OPTS+=" --warmup-iters 2000" 29 | OPTS+=" --lr-decay-style noam" 30 | OPTS+=" --weight-decay 0.001" 31 | OPTS+=" --clip-grad 1.0" 32 | OPTS+=" --loss-scale 1048576" 33 | OPTS+=" --start-step 0" 34 | # OPTS+=" --load ${BASE_PATH}/results/noam-1e-3-0.05-checkpoint-1000.pt" 35 | 36 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/cpm1/pretrain_cpm1.py ${OPTS}" 37 | echo ${CMD} 38 | 39 | if [[ $NODE_RANK == 0 ]]; then 40 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/cpm1-new.log 41 | else 42 | ${CMD} 43 | fi 44 | -------------------------------------------------------------------------------- /examples/cpm2/finetune_cpm2.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=8 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | DATASET="LCQMC" 17 | 18 | OPTS="" 19 | OPTS+=" --dataset ${DATASET}" 20 | OPTS+=" --base-path ${BASE_PATH}" 21 | OPTS+=" --model-config cpm2-large" 22 | OPTS+=" --batch-size 64" 23 | OPTS+=" --train-iters 3000" 24 | OPTS+=" --save-iters 1000" 25 | OPTS+=" --max-encoder-length 256" 26 | OPTS+=" --max-decoder-length 2" 27 | OPTS+=" --save ${BASE_PATH}/results" 28 | OPTS+=" --save-name finetune-cpm2-ckpt" 29 | OPTS+=" --lr 0.002" 30 | OPTS+=" --inspect-iters 100" 31 | OPTS+=" --warmup-iters 200" 32 | OPTS+=" --lr-decay-style noam" 33 | OPTS+=" --weight-decay 1e-2" 34 | OPTS+=" --clip-grad 10.0" 35 | OPTS+=" --loss-scale 1048576" 36 | # OPTS+=" --load ${BASE_PATH}/results/CPM2-0.25-0.005-checkpoint-110000.pt" 37 | 38 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/cpm2/finetune_cpm2.py ${OPTS}" 39 | echo ${CMD} 40 | 41 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/cpm2/${DATASET}.log 42 | -------------------------------------------------------------------------------- /examples/cpm2/pretrain_cpm2.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=8 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/mnt/sfs_turbo/ModelCenter" 16 | 17 | OPTS="" 18 | OPTS+=" --base-path ${BASE_PATH}" 19 | OPTS+=" --model-config ${BASE_PATH}/configs/cpm2/cpm2-large" 20 | OPTS+=" --batch-size 4" 21 | OPTS+=" --train-iters 3000" 22 | OPTS+=" --save-iters 500" 23 | OPTS+=" --save-name cpm2-checkpoint" 24 | OPTS+=" --max-encoder-length 512" 25 | OPTS+=" --max-decoder-length 256" 26 | OPTS+=" --save ${BASE_PATH}/results" 27 | OPTS+=" --lr 0.25" 28 | OPTS+=" --inspect-iters 100" 29 | OPTS+=" --warmup-iters 2000" 30 | OPTS+=" --lr-decay-style noam" 31 | OPTS+=" --weight-decay 5e-3" 32 | OPTS+=" --clip-grad 1.0" 33 | OPTS+=" --loss-scale 1048576" 34 | OPTS+=" --start-step 110000" 35 | # OPTS+=" --load ${BASE_PATH}/results/CPM2-0.25-0.005-checkpoint-110000.pt" 36 | 37 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/cpm2/pretrain_cpm2.py ${OPTS}" 38 | echo ${CMD} 39 | 40 | if [[ $NODE_RANK == 0 ]]; then 41 | ${CMD} | tee ${BASE_PATH}/logs/cpm2.log 42 | else 43 | ${CMD} 44 | fi 45 | -------------------------------------------------------------------------------- /examples/gpt2/BoolQ.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=1 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="base" 17 | DATASET="BoolQ" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config gpt2-${VERSION}" 23 | OPTS+=" --batch-size 64" 24 | OPTS+=" --train-iters 1400" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-decoder-length 512" 27 | OPTS+=" --save ${BASE_PATH}/results" 28 | OPTS+=" --save-name finetune-gpt2-ckpt" 29 | OPTS+=" --lr 0.00005" 30 | OPTS+=" --inspect-iters 100" 31 | OPTS+=" --warmup-iters 100" 32 | OPTS+=" --lr-decay-style constant" 33 | OPTS+=" --weight-decay 1e-2" 34 | OPTS+=" --clip-grad 10.0" 35 | OPTS+=" --loss-scale 128" 36 | # OPTS+=" --load ${BASE_PATH}/results/GPT2-${VERSION}.pt" 37 | 38 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/gpt2/finetune_gpt2.py ${OPTS}" 39 | echo ${CMD} 40 | 41 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/gpt2_superglue/finetune-gpt2-${VERSION}-${DATASET}.log 42 | -------------------------------------------------------------------------------- /examples/gpt2/CB.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=1 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="base" 17 | DATASET="CB" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config gpt2-${VERSION}" 23 | OPTS+=" --batch-size 8" 24 | OPTS+=" --train-iters 400" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-decoder-length 512" 27 | OPTS+=" --save ${BASE_PATH}/results" 28 | OPTS+=" --save-name finetune-gpt2-ckpt" 29 | OPTS+=" --lr 0.00005" 30 | OPTS+=" --inspect-iters 100" 31 | OPTS+=" --warmup-iters 100" 32 | OPTS+=" --lr-decay-style constant" 33 | OPTS+=" --weight-decay 1e-2" 34 | OPTS+=" --clip-grad 1.0" 35 | OPTS+=" --loss-scale 128" 36 | # OPTS+=" --load ${BASE_PATH}/results/GPT2-${VERSION}.pt" 37 | 38 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/gpt2/finetune_gpt2.py ${OPTS}" 39 | echo ${CMD} 40 | 41 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/gpt2_superglue/finetune-gpt2-${VERSION}-${DATASET}.log 42 | -------------------------------------------------------------------------------- /examples/gpt2/COPA.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=1 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="base" 17 | DATASET="COPA" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config gpt2-${VERSION}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --train-iters 900" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-decoder-length 512" 27 | OPTS+=" --save ${BASE_PATH}/results" 28 | OPTS+=" --save-name finetune-gpt2-ckpt" 29 | OPTS+=" --lr 0.00005" 30 | OPTS+=" --inspect-iters 100" 31 | OPTS+=" --warmup-iters 100" 32 | OPTS+=" --lr-decay-style constant" 33 | OPTS+=" --weight-decay 1e-2" 34 | OPTS+=" --clip-grad 1.0" 35 | OPTS+=" --loss-scale 128" 36 | # OPTS+=" --load ${BASE_PATH}/results/GPT2-${VERSION}.pt" 37 | 38 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/gpt2/finetune_gpt2.py ${OPTS}" 39 | echo ${CMD} 40 | 41 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/gpt2_superglue/finetune-gpt2-${VERSION}-${DATASET}.log 42 | -------------------------------------------------------------------------------- /examples/gpt2/RTE.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=1 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="base" 17 | DATASET="RTE" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config gpt2-${VERSION}" 23 | OPTS+=" --batch-size 64" 24 | OPTS+=" --train-iters 800" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-decoder-length 512" 27 | OPTS+=" --save ${BASE_PATH}/results" 28 | OPTS+=" --save-name finetune-gpt2-ckpt" 29 | OPTS+=" --lr 0.00005" 30 | OPTS+=" --inspect-iters 100" 31 | OPTS+=" --warmup-iters 100" 32 | OPTS+=" --lr-decay-style constant" 33 | OPTS+=" --weight-decay 1e-3" 34 | OPTS+=" --clip-grad 10.0" 35 | OPTS+=" --loss-scale 128" 36 | # OPTS+=" --load ${BASE_PATH}/results/GPT2-${VERSION}.pt" 37 | 38 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/gpt2/finetune_gpt2.py ${OPTS}" 39 | echo ${CMD} 40 | 41 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/gpt2_superglue/finetune-gpt2-${VERSION}-${DATASET}.log 42 | -------------------------------------------------------------------------------- /examples/gpt2/WSC.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=1 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="base" 17 | DATASET="WSC" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config gpt2-${VERSION}" 23 | OPTS+=" --batch-size 64" 24 | OPTS+=" --train-iters 700" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-decoder-length 512" 27 | OPTS+=" --save ${BASE_PATH}/results" 28 | OPTS+=" --save-name finetune-gpt2-ckpt" 29 | OPTS+=" --lr 0.00005" 30 | OPTS+=" --inspect-iters 100" 31 | OPTS+=" --warmup-iters 100" 32 | OPTS+=" --lr-decay-style constant" 33 | OPTS+=" --weight-decay 1e-2" 34 | OPTS+=" --clip-grad 1.0" 35 | OPTS+=" --loss-scale 128" 36 | # OPTS+=" --load ${BASE_PATH}/results/GPT2-${VERSION}.pt" 37 | 38 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/gpt2/finetune_gpt2.py ${OPTS}" 39 | echo ${CMD} 40 | 41 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/gpt2_superglue/finetune-gpt2-${VERSION}-${DATASET}.log 42 | -------------------------------------------------------------------------------- /examples/gpt2/WiC.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=1 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="base" 17 | DATASET="WiC" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config gpt2-${VERSION}" 23 | OPTS+=" --batch-size 64" 24 | OPTS+=" --train-iters 1500" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-decoder-length 512" 27 | OPTS+=" --save ${BASE_PATH}/results" 28 | OPTS+=" --save-name finetune-gpt2-ckpt" 29 | OPTS+=" --lr 0.00005" 30 | OPTS+=" --inspect-iters 100" 31 | OPTS+=" --warmup-iters 100" 32 | OPTS+=" --lr-decay-style constant" 33 | OPTS+=" --weight-decay 1e-2" 34 | OPTS+=" --clip-grad 1.0" 35 | OPTS+=" --loss-scale 128" 36 | # OPTS+=" --load ${BASE_PATH}/results/GPT2-${VERSION}.pt" 37 | 38 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/gpt2/finetune_gpt2.py ${OPTS}" 39 | echo ${CMD} 40 | 41 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/gpt2_superglue/finetune-gpt2-${VERSION}-${DATASET}.log 42 | -------------------------------------------------------------------------------- /examples/gptj/BoolQ.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=8 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/mnt/sfs_turbo/hx/ModelCenter" 16 | VERSION="6b" 17 | DATASET="BoolQ" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config gptj-${VERSION}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --train-iters 1400" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-decoder-length 512" 27 | OPTS+=" --save ${BASE_PATH}/results" 28 | OPTS+=" --save-name finetune-gptj-ckpt" 29 | OPTS+=" --lr 0.00001" 30 | OPTS+=" --inspect-iters 100" 31 | OPTS+=" --warmup-iters 140" 32 | OPTS+=" --lr-decay-style noam" 33 | OPTS+=" --weight-decay 1e-2" 34 | OPTS+=" --clip-grad 10.0" 35 | OPTS+=" --loss-scale 128" 36 | # OPTS+=" --load ${BASE_PATH}/results/GPTj-${VERSION}.pt" 37 | 38 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/gptj/finetune_gptj.py ${OPTS}" 39 | echo ${CMD} 40 | 41 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/gptj_superglue/finetune-gptj-${VERSION}-${DATASET}.log 42 | -------------------------------------------------------------------------------- /examples/gptj/CB.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=8 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/mnt/sfs_turbo/hx/ModelCenter" 16 | VERSION="6b" 17 | DATASET="CB" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config gptj-${VERSION}" 23 | OPTS+=" --batch-size 8" 24 | OPTS+=" --train-iters 400" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-decoder-length 512" 27 | OPTS+=" --save ${BASE_PATH}/results" 28 | OPTS+=" --save-name finetune-gptj-ckpt" 29 | OPTS+=" --lr 0.00001" 30 | OPTS+=" --inspect-iters 100" 31 | OPTS+=" --warmup-iters 40" 32 | OPTS+=" --lr-decay-style noam" 33 | OPTS+=" --weight-decay 1e-2" 34 | OPTS+=" --clip-grad 10.0" 35 | OPTS+=" --loss-scale 128" 36 | # OPTS+=" --load ${BASE_PATH}/results/GPTj-${VERSION}.pt" 37 | 38 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/gptj/finetune_gptj.py ${OPTS}" 39 | echo ${CMD} 40 | 41 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/gptj_superglue/finetune-gptj-${VERSION}-${DATASET}.log 42 | -------------------------------------------------------------------------------- /examples/gptj/COPA.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="6b" 17 | DATASET="COPA" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config gptj-${VERSION}" 23 | OPTS+=" --batch-size 1" 24 | OPTS+=" --train-iters 900" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-decoder-length 512" 27 | OPTS+=" --save ${BASE_PATH}/results" 28 | OPTS+=" --save-name finetune-gptj-ckpt" 29 | OPTS+=" --lr 0.00001" 30 | OPTS+=" --inspect-iters 100" 31 | OPTS+=" --warmup-iters 40" 32 | OPTS+=" --lr-decay-style noam" 33 | OPTS+=" --weight-decay 1e-2" 34 | OPTS+=" --clip-grad 1.0" 35 | OPTS+=" --loss-scale 128" 36 | # OPTS+=" --load ${BASE_PATH}/results/GPTj-${VERSION}.pt" 37 | 38 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/gptj/finetune_gptj.py ${OPTS}" 39 | echo ${CMD} 40 | 41 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/gptj_superglue/finetune-gptj-${VERSION}-${DATASET}.log 42 | -------------------------------------------------------------------------------- /examples/gptj/RTE.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=8 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/mnt/sfs_turbo/hx/ModelCenter" 16 | VERSION="6b" 17 | DATASET="RTE" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config gptj-${VERSION}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --train-iters 800" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-decoder-length 512" 27 | OPTS+=" --save ${BASE_PATH}/results" 28 | OPTS+=" --save-name finetune-gptj-ckpt" 29 | OPTS+=" --lr 0.00001" 30 | OPTS+=" --inspect-iters 100" 31 | OPTS+=" --warmup-iters 40" 32 | OPTS+=" --lr-decay-style noam" 33 | OPTS+=" --weight-decay 1e-2" 34 | OPTS+=" --clip-grad 10.0" 35 | OPTS+=" --loss-scale 128" 36 | # OPTS+=" --load ${BASE_PATH}/results/GPTj-${VERSION}.pt" 37 | 38 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/gptj/finetune_gptj.py ${OPTS}" 39 | echo ${CMD} 40 | 41 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/gptj_superglue/finetune-gptj-${VERSION}-${DATASET}.log 42 | -------------------------------------------------------------------------------- /examples/gptj/WSC.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=1 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="6b" 17 | DATASET="WSC" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config gptj-${VERSION}" 23 | OPTS+=" --batch-size 4" 24 | OPTS+=" --train-iters 700" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-decoder-length 512" 27 | OPTS+=" --save ${BASE_PATH}/results" 28 | OPTS+=" --save-name finetune-gptj-ckpt" 29 | OPTS+=" --lr 0.0001" 30 | OPTS+=" --inspect-iters 100" 31 | OPTS+=" --warmup-iters 30" 32 | OPTS+=" --lr-decay-style noam" 33 | OPTS+=" --weight-decay 1e-2" 34 | OPTS+=" --clip-grad 1.0" 35 | OPTS+=" --loss-scale 128" 36 | # OPTS+=" --load ${BASE_PATH}/results/GPTj-${VERSION}.pt" 37 | 38 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/gptj/finetune_gptj.py ${OPTS}" 39 | echo ${CMD} 40 | 41 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/gptj_superglue/finetune-gptj-${VERSION}-${DATASET}.log 42 | -------------------------------------------------------------------------------- /examples/gptj/WiC.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=1 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="6b" 17 | DATASET="WiC" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config gptj-${VERSION}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --train-iters 1500" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-decoder-length 512" 27 | OPTS+=" --save ${BASE_PATH}/results" 28 | OPTS+=" --save-name finetune-gptj-ckpt" 29 | OPTS+=" --lr 0.00001" 30 | OPTS+=" --inspect-iters 100" 31 | OPTS+=" --warmup-iters 70" 32 | OPTS+=" --lr-decay-style noam" 33 | OPTS+=" --weight-decay 1e-2" 34 | OPTS+=" --clip-grad 1.0" 35 | OPTS+=" --loss-scale 128" 36 | # OPTS+=" --load ${BASE_PATH}/results/GPTj-${VERSION}.pt" 37 | 38 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/gptj/finetune_gptj.py ${OPTS}" 39 | echo ${CMD} 40 | 41 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/gptj_superglue/finetune-gptj-${VERSION}-${DATASET}.log 42 | -------------------------------------------------------------------------------- /examples/llama/RTE.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=2 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/data/ModelCenter" 16 | VERSION="7b" 17 | DATASET="RTE" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config ${BASE_PATH}/results/llama-${VERSION}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --train-iters 1400" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-length 512" 27 | OPTS+=" --save ${BASE_PATH}/results" 28 | OPTS+=" --save-name finetune-llama-${DATASET}" 29 | OPTS+=" --lr 0.00001" 30 | OPTS+=" --inspect-iters 100" 31 | OPTS+=" --warmup-iters 140" 32 | OPTS+=" --lr-decay-style constant" 33 | OPTS+=" --weight-decay 1e-2" 34 | OPTS+=" --clip-grad 1.0" 35 | OPTS+=" --loss-scale 1048576" 36 | 37 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/llama/finetune_llama.py ${OPTS}" 38 | echo ${CMD} 39 | 40 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/llama_superglue/finetune-llama-${VERSION}-${DATASET}.log 41 | -------------------------------------------------------------------------------- /examples/mt5/BoolQ.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="xxl" 17 | DATASET="BoolQ" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-v1_1-${VERSION}" 23 | OPTS+=" --batch-size 64" 24 | OPTS+=" --train-iters 1400" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-v1_1-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 140" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 10.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/t5-v1_1-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5-v1_1/finetune_t5-v1_1.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5-v1_1_superglue/finetune-t5-v1_1-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/mt5/CB.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="xxl" 17 | DATASET="CB" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-v1_1-${VERSION}" 23 | OPTS+=" --batch-size 8" 24 | OPTS+=" --train-iters 400" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-v1_1-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 40" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 10.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/t5-v1_1-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5-v1_1/finetune_t5-v1_1.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5-v1_1_superglue/finetune-t5-v1_1-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/mt5/COPA.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="xxl" 17 | DATASET="COPA" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-v1_1-${VERSION}" 23 | OPTS+=" --batch-size 4" 24 | OPTS+=" --train-iters 900" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-v1_1-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 40" 33 | OPTS+=" --lr-decay-style noam" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 10.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/T5-v1_1-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5-v1_1/finetune_t5-v1_1.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5-v1_1_superglue/finetune-t5-v1_1-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/mt5/RTE.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="xxl" 17 | DATASET="RTE" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-v1_1-${VERSION}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --train-iters 800" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-v1_1-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 100" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 10.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/t5-v1_1-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5-v1_1/finetune_t5-v1_1.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5-v1_1_superglue/finetune-t5-v1_1-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/mt5/WSC.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="xxl" 17 | DATASET="WSC" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-v1_1-${VERSION}" 23 | OPTS+=" --batch-size 4" 24 | OPTS+=" --train-iters 700" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-v1_1-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 50" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 100.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/t5-v1_1-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5-v1_1/finetune_t5-v1_1.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5-v1_1_superglue/finetune-t5-v1_1-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/mt5/WiC.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="xxl" 17 | DATASET="WiC" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-v1_1-${VERSION}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --train-iters 1500" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-v1_1-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 70" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 100.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/t5-v1_1-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5-v1_1/finetune_t5-v1_1.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5-v1_1_superglue/finetune-t5-v1_1-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/t5-v1_1/BoolQ.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="xxl" 17 | DATASET="BoolQ" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-v1_1-${VERSION}" 23 | OPTS+=" --batch-size 64" 24 | OPTS+=" --train-iters 1400" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-v1_1-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 140" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 10.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/t5-v1_1-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5-v1_1/finetune_t5-v1_1.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5-v1_1_superglue/finetune-t5-v1_1-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/t5-v1_1/CB.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="xxl" 17 | DATASET="CB" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-v1_1-${VERSION}" 23 | OPTS+=" --batch-size 8" 24 | OPTS+=" --train-iters 400" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-v1_1-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 40" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 10.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/t5-v1_1-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5-v1_1/finetune_t5-v1_1.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5-v1_1_superglue/finetune-t5-v1_1-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/t5-v1_1/COPA.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="xxl" 17 | DATASET="COPA" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-v1_1-${VERSION}" 23 | OPTS+=" --batch-size 4" 24 | OPTS+=" --train-iters 900" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-v1_1-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 40" 33 | OPTS+=" --lr-decay-style noam" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 10.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/T5-v1_1-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5-v1_1/finetune_t5-v1_1.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5-v1_1_superglue/finetune-t5-v1_1-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/t5-v1_1/RTE.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="xxl" 17 | DATASET="RTE" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-v1_1-${VERSION}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --train-iters 800" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-v1_1-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 100" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 10.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/t5-v1_1-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5-v1_1/finetune_t5-v1_1.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5-v1_1_superglue/finetune-t5-v1_1-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/t5-v1_1/WSC.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="xxl" 17 | DATASET="WSC" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-v1_1-${VERSION}" 23 | OPTS+=" --batch-size 4" 24 | OPTS+=" --train-iters 700" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-v1_1-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 50" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 100.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/t5-v1_1-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5-v1_1/finetune_t5-v1_1.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5-v1_1_superglue/finetune-t5-v1_1-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/t5-v1_1/WiC.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="xxl" 17 | DATASET="WiC" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-v1_1-${VERSION}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --train-iters 1500" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-v1_1-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 70" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 100.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/t5-v1_1-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5-v1_1/finetune_t5-v1_1.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5-v1_1_superglue/finetune-t5-v1_1-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/t5/BoolQ.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="11b" 17 | DATASET="BoolQ" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-${VERSION}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --train-iters 1400" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 140" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 1.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/T5-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5/finetune_t5_superglue.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5_superglue/finetune-t5-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/t5/CB.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="11b" 17 | DATASET="CB" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-${VERSION}" 23 | OPTS+=" --batch-size 8" 24 | OPTS+=" --train-iters 400" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 40" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 1.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/T5-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5/finetune_t5_superglue.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5_superglue/finetune-t5-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/t5/COPA.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="11b" 17 | DATASET="COPA" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-${VERSION}" 23 | OPTS+=" --batch-size 4" 24 | OPTS+=" --train-iters 900" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 40" 33 | OPTS+=" --lr-decay-style noam" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 1.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/T5-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5/finetune_t5_superglue.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5_superglue/finetune-t5-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/t5/RTE.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="11b" 17 | DATASET="RTE" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-${VERSION}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --train-iters 800" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 100" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 10.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/T5-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5/finetune_t5_superglue.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5_superglue/finetune-t5-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/t5/SQuAD.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=2 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/data/ModelCenter" 16 | VERSION="3b" 17 | DATASET="SQuAD" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config ${BASE_PATH}/results/t5-${VERSION}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --train-iters 1400" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 32" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 140" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 1.0" 36 | OPTS+=" --loss-scale 128" 37 | 38 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5/finetune_t5_squad.py ${OPTS}" 39 | echo ${CMD} 40 | 41 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5_squad/finetune-t5-${VERSION}-${DATASET}.log 42 | -------------------------------------------------------------------------------- /examples/t5/WSC.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="11b" 17 | DATASET="WSC" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-${VERSION}" 23 | OPTS+=" --batch-size 4" 24 | OPTS+=" --train-iters 700" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-ckpt" 30 | OPTS+=" --lr 0.0001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 30" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 1.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/T5-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5/finetune_t5_superglue.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5_superglue/finetune-t5-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/t5/WiC.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="11b" 17 | DATASET="WiC" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-${VERSION}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --train-iters 1500" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 70" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 1.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/T5-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5/finetune_t5_superglue.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5_superglue/finetune-t5-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /model_center/__init__.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from .arguments import get_args 4 | -------------------------------------------------------------------------------- /model_center/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .indexed import MMapIndexedDataset 2 | from .distributed_indexed import DistributedMMapIndexedDataset 3 | from .distributed_loader import DistributedDataLoader 4 | 5 | from .distributed_dataset import DistributedDataset, SimpleDataset, build_dataset 6 | from .utils import shuffle_dataset, compact_dataset, 
mask_dataset 7 | -------------------------------------------------------------------------------- /model_center/dataset/bertdataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .superglue import * 2 | 3 | DATASET = { 4 | "BoolQ": BoolQ_Dataset, 5 | "CB": CB_Dataset, 6 | "COPA": COPA_Dataset, 7 | "RTE": RTE_Dataset, 8 | "WiC": WiC_Dataset, 9 | "WSC": WSC_Dataset, 10 | } -------------------------------------------------------------------------------- /model_center/dataset/cpm1/__init__.py: -------------------------------------------------------------------------------- 1 | from .cpm1_dataset import CPM1_Dataset, CPM1_Dataset_Merge 2 | -------------------------------------------------------------------------------- /model_center/dataset/cpm1/cpm1_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The OpenBMB team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | import torch.utils.data as data 18 | from ..indexed import MMapIndexedDataset 19 | import random 20 | import numpy as np 21 | 22 | 23 | class CPM1_Dataset(data.Dataset): 24 | def __init__(self, ctx : MMapIndexedDataset, 25 | tgt : MMapIndexedDataset, 26 | max_length = 1024): 27 | self.ctx = ctx 28 | self.tgt = tgt 29 | self.max_length = max_length 30 | 31 | def __len__(self): 32 | return len(self.ctx) 33 | 34 | def __get_item_data(self, ctx, tgt): 35 | if ctx.shape[0] > self.max_length or tgt.shape[0] > self.max_length: 36 | return None, None, None 37 | assert len(ctx) == len(tgt) 38 | len_ctx = min(ctx.shape[0], self.max_length) 39 | 40 | ctx = ctx.astype('int64') 41 | tgt = tgt.astype('int64') 42 | 43 | th_ctx = torch.zeros(self.max_length, dtype=torch.long) 44 | th_ctx[:len_ctx] = torch.from_numpy(ctx)[:len_ctx].long() 45 | th_tgt = torch.full((self.max_length,), -100, dtype=torch.long) 46 | th_tgt[:len_ctx] = torch.from_numpy(tgt)[:len_ctx].long() 47 | return th_ctx, len_ctx, th_tgt 48 | 49 | def __getitem__(self, index): 50 | ctx = self.ctx[index] 51 | tgt = self.tgt[index] 52 | 53 | if isinstance(index, int): 54 | th_ctx, len_ctx, th_tgt = self.__get_item_data(ctx, tgt) 55 | return { 56 | "ctx": th_ctx, 57 | "tgt": th_tgt, 58 | "len_ctx": len_ctx, 59 | } 60 | else: 61 | res = {"ctx": [], "tgt": [], "len_ctx": [],} 62 | for _ctx, _tgt in zip(ctx, tgt): 63 | _th_ctx, _len_ctx, _th_tgt = self.__get_item_data(_ctx, _tgt) 64 | if _th_ctx is None: 65 | continue 66 | res["ctx"].append(_th_ctx) 67 | res["tgt"].append(_th_tgt) 68 | res["len_ctx"].append(_len_ctx) 69 | return { 70 | "ctx": torch.stack(res["ctx"]), 71 | "tgt": torch.stack(res["tgt"]), 72 | "len_ctx": torch.LongTensor(res["len_ctx"]), 73 | } 74 | 75 | 76 | class CPM1_Dataset_Merge(data.Dataset): 77 | def __init__(self, ctx : MMapIndexedDataset, max_length = 1024): 78 | self.ctx = ctx 79 | self.max_length = max_length 80 | 81 | def __len__(self): 82 | 
return len(self.ctx) 83 | 84 | def __get_item_data(self, ctx): 85 | if ctx.shape[0] > self.max_length: 86 | return None, None, None, None 87 | len_ctx = min(ctx.shape[0], self.max_length) 88 | lef = random.randint(len_ctx // 8, len_ctx // 4) 89 | rig = random.randint(len_ctx // 4 * 3, len_ctx) 90 | if ctx[len_ctx-1] == 4: 91 | rig = len_ctx 92 | tgt = np.full((len_ctx), -100) 93 | tgt[lef-1:rig-1] = ctx[lef:rig] 94 | context_ctx = np.arange((len_ctx)) 95 | context_ctx = (context_ctx < lef) | (context_ctx >= rig) 96 | return ctx, tgt, len_ctx, context_ctx 97 | 98 | def __getitem__(self, index): 99 | ctx = self.ctx[index] 100 | th_ctx, th_tgt, len_ctx, context_ctx = self.__get_item_data(ctx) 101 | return th_ctx, th_tgt, len_ctx, context_ctx 102 | -------------------------------------------------------------------------------- /model_center/dataset/cpm1dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .down_data import * 2 | 3 | DATASET = { 4 | "LCQMC": LCQMC_Dataset, 5 | } -------------------------------------------------------------------------------- /model_center/dataset/cpm1dataset/down_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The OpenBMB team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
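The CPM1 pretraining datasets above are thin wrappers over memory-mapped token shards. A minimal sketch of how they are typically wired together, assuming bmtrain has already been initialised and that MMapIndexedDataset takes the path prefix of a .bin/.idx pair (the paths and batch size below are placeholders):

    from model_center.dataset import MMapIndexedDataset, DistributedDataLoader
    from model_center.dataset.cpm1 import CPM1_Dataset

    # hypothetical path prefixes for the tokenized context / target shards
    ctx = MMapIndexedDataset("/path/to/cpm1_ctx")
    tgt = MMapIndexedDataset("/path/to/cpm1_tgt")

    dataset = CPM1_Dataset(ctx, tgt, max_length=1024)
    loader = DistributedDataLoader(dataset, batch_size=8, shuffle=True)
    for batch in loader:
        # batch["ctx"], batch["tgt"] and batch["len_ctx"] feed the CPM1 training loop
        pass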
15 | import torch 16 | import csv 17 | import numpy as np 18 | 19 | class LCQMC_Dataset(torch.utils.data.Dataset): 20 | def __init__(self, path, split, rank, world_size, tokenizer, max_length) -> None: 21 | self.data = [] 22 | 23 | path = f"{path}/LCQMC/{split}.tsv" 24 | with open(path, encoding='utf8') as fin: 25 | reader = list(csv.reader(fin, delimiter='\t'))[1:] 26 | for i, row in enumerate(reader): 27 | text_a, text_b, label = row 28 | lef_tokens = [1] + tokenizer.encode(f'"{text_a}"与"{text_b}"的关系是:') 29 | rig_tokens = tokenizer.encode("。") 30 | 31 | input_tokens, input_length, context, input_span = self.make_input(lef_tokens, rig_tokens, 1, max_length) 32 | 33 | index = torch.zeros((max_length,), dtype=torch.int32) 34 | index[len(lef_tokens) - 1] = 1 35 | 36 | target = torch.tensor(int(label), dtype=torch.long) 37 | 38 | self.data.append({ 39 | "input_tokens": input_tokens.cuda(), 40 | "input_length": input_length.cuda(), 41 | "input_context": context.cuda(), 42 | "input_span": input_span.cuda(), 43 | "targets": target.cuda(), 44 | "index": index.cuda(), 45 | }) 46 | 47 | def make_input(self, lef_tokens, rig_tokens, spans, max_length): 48 | input = lef_tokens + [0 for i in range(spans)] + rig_tokens 49 | length = len(input) 50 | 51 | assert length < max_length # TODO 52 | 53 | input_tokens = torch.zeros((max_length,), dtype=torch.int32) 54 | input_tokens[:length] = torch.tensor(input).int() 55 | 56 | input_length = torch.tensor(length, dtype=torch.int32) 57 | 58 | context = np.arange(max_length) 59 | context = (context < len(lef_tokens)) | (context >= len(lef_tokens) + spans) 60 | context = torch.from_numpy(context).bool() 61 | 62 | input_span = torch.zeros((max_length,), dtype=torch.int32) 63 | 64 | return input_tokens, input_length, context, input_span 65 | 66 | def __len__(self): 67 | return len(self.data) 68 | 69 | def __getitem__(self, idx): 70 | return self.data[idx] 71 | 72 | @classmethod 73 | def get_verbalizer(cls, tokenizer): 74 | return [15682, 16357] # 有关,无关 # TODO change to tokenizer.encode(xxx) -------------------------------------------------------------------------------- /model_center/dataset/cpm2/__init__.py: -------------------------------------------------------------------------------- 1 | from .cpm2_dataset import CPM2_Dataset 2 | -------------------------------------------------------------------------------- /model_center/dataset/cpm2/dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | from ..indexed import MMapIndexedDataset 4 | import random 5 | import numpy as np 6 | 7 | class CPM2_Dataset(data.Dataset): 8 | def __init__(self, ctx : MMapIndexedDataset, 9 | tgt : MMapIndexedDataset, 10 | max_source_length = 512, 11 | max_target_length = 256): 12 | self.ctx = ctx 13 | self.tgt = tgt 14 | self.max_target_length = max_target_length 15 | self.max_source_length = max_source_length 16 | 17 | def __len__(self): 18 | return len(self.ctx) 19 | 20 | def __get_item_data(self, ctx, tgt): 21 | # TODO 26240 22 | ctx = ctx - (ctx >= 26240) * 190 23 | tgt = tgt - (tgt >= 26240) * 190 24 | 25 | if ctx.shape[0] > self.max_source_length or tgt.shape[0] > self.max_target_length+1: # TODO 26 | return None, None, None, None 27 | len_ctx = min(ctx.shape[0], self.max_source_length) 28 | len_tgt = min(tgt.shape[0], self.max_target_length) 29 | 30 | # TODO 31 | # ctx.astype('int64') 32 | # tgt.astype('int64') 33 | 34 | th_ctx = torch.zeros(self.max_source_length, dtype=torch.long) 35 | 
th_ctx[:len_ctx] = torch.from_numpy(ctx)[:len_ctx].long() 36 | th_tgt = torch.full((self.max_target_length + 1,), -100, dtype=torch.long) 37 | # th_tgt[0] = 1 38 | # th_tgt[1:1+len_tgt] = torch.from_numpy(tgt)[:len_tgt].long() 39 | th_tgt[:len_tgt] = torch.from_numpy(tgt)[:len_tgt].long() # TODO 40 | return th_ctx, th_tgt, len_ctx, len_tgt 41 | 42 | def __getitem__(self, index): 43 | ctx = self.ctx[index] 44 | tgt = self.tgt[index] 45 | 46 | if isinstance(index, int): 47 | th_ctx, th_tgt, len_ctx, len_tgt = self.__get_item_data(ctx, tgt) 48 | if th_ctx is None: 49 | return None 50 | return { 51 | "ctx": th_ctx, 52 | "tgt": th_tgt, 53 | "len_ctx": len_ctx, 54 | "len_tgt": len_tgt 55 | } 56 | else: 57 | res = {"ctx": [], "tgt": [], "len_ctx": [], "len_tgt":[]} 58 | for _ctx, _tgt in zip(ctx, tgt): 59 | _th_ctx, _th_tgt, _len_ctx, _len_tgt = self.__get_item_data(_ctx, _tgt) 60 | if _th_ctx is None: 61 | continue 62 | res["ctx"].append(_th_ctx) 63 | res["tgt"].append(_th_tgt) 64 | res["len_ctx"].append(_len_ctx) 65 | res["len_tgt"].append(_len_tgt) 66 | return { 67 | "ctx": torch.stack(res["ctx"]), 68 | "tgt": torch.stack(res["tgt"]), 69 | "len_ctx": torch.LongTensor(res["len_ctx"]), 70 | "len_tgt": torch.LongTensor(res["len_tgt"]) 71 | } 72 | 73 | -------------------------------------------------------------------------------- /model_center/dataset/cpm2dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .down_data import * 2 | 3 | DATASET = { 4 | "LCQMC": LCQMC_Dataset, 5 | } -------------------------------------------------------------------------------- /model_center/dataset/cpm2dataset/down_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The OpenBMB team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
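Note that CPM1_Dataset and CPM2_Dataset both fill unsupervised target positions with -100, which is exactly the default ignore_index of PyTorch's cross-entropy loss, so padding and context positions contribute nothing to the gradient. A small illustration with made-up shapes:

    import torch
    import torch.nn.functional as F

    vocab_size = 26048                          # placeholder vocabulary size
    logits = torch.randn(2, 256, vocab_size)    # (batch, seq_len, vocab)
    tgt = torch.full((2, 256), -100, dtype=torch.long)
    tgt[:, :10] = 1                             # only the first 10 positions are supervised

    # positions equal to -100 are skipped (ignore_index defaults to -100)
    loss = F.cross_entropy(logits.view(-1, vocab_size), tgt.view(-1))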
15 | import torch 16 | import csv 17 | import numpy as np 18 | 19 | class LCQMC_Dataset(torch.utils.data.Dataset): 20 | def __init__(self, path, split, rank, world_size, tokenizer, max_encoder_length, max_decoder_length) -> None: 21 | self.data = [] 22 | 23 | path = f"{path}/LCQMC/{split}.tsv" 24 | with open(path, encoding='utf8') as fin: 25 | reader = list(csv.reader(fin, delimiter='\t'))[1:] 26 | for i, row in enumerate(reader): 27 | text_a, text_b, label = row 28 | enc_input = tokenizer.encode(f'“{text_a}”与“{text_b}”是否有关?') 29 | 30 | enc_tokens, enc_length, dec_tokens, dec_length, index = self.make_input(tokenizer, enc_input, max_encoder_length, max_decoder_length) 31 | 32 | target = torch.tensor(int(label), dtype=torch.long) 33 | 34 | self.data.append({ 35 | "enc_input": enc_tokens.cuda(), 36 | "enc_length": enc_length.cuda(), 37 | "dec_input": dec_tokens.cuda(), 38 | "dec_length": dec_length.cuda(), 39 | "targets": target.cuda(), 40 | "index": index.cuda(), 41 | }) 42 | 43 | def make_input(self, tokenizer, input, max_encoder_length, max_decoder_length): 44 | input = input + [tokenizer.get_sentinel_id(0)] 45 | length = len(input) 46 | 47 | assert length < max_encoder_length # TODO 48 | 49 | input_tokens = torch.zeros((max_encoder_length,), dtype=torch.int32) 50 | input_tokens[:length] = torch.tensor(input).int() 51 | 52 | input_length = torch.tensor(length, dtype=torch.int32) 53 | 54 | output = [tokenizer.get_sentinel_id(0)] 55 | length = len(output) 56 | output_tokens = torch.zeros((max_decoder_length,), dtype=torch.int32) 57 | output_tokens[:length] = torch.tensor(output).int() 58 | output_length = torch.tensor(length, dtype=torch.int32) 59 | 60 | index = torch.zeros((max_decoder_length,), dtype=torch.int32) 61 | index[length - 1] = 1 62 | 63 | return input_tokens, input_length, output_tokens, output_length, index 64 | 65 | def __len__(self): 66 | return len(self.data) 67 | 68 | def __getitem__(self, idx): 69 | return self.data[idx] 70 | 71 | @classmethod 72 | def get_verbalizer(cls, tokenizer): 73 | return [1744, 24] # 有关,无关 # TODO change to tokenizer.encode(xxx) -------------------------------------------------------------------------------- /model_center/dataset/distributed_loader.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The OpenBMB team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
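get_verbalizer exposes the token ids of the label words, so LCQMC classification reduces to comparing the decoder logits at the single position flagged by index. A rough sketch of that scoring step, assuming logits came from the model and index / targets came from a batch of this dataset:

    import torch

    verbalizer = LCQMC_Dataset.get_verbalizer(tokenizer)   # e.g. [1744, 24]
    scores = logits[index == 1]          # keep the flagged decoder position: (batch, vocab)
    scores = scores[:, verbalizer]       # logits of the label words only: (batch, 2)
    pred = scores.argmax(dim=-1)
    accuracy = (pred == targets).float().mean()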
15 | import torch.utils.data as data 16 | import bmtrain as bmt 17 | 18 | class DistributedDataLoader: 19 | def __init__(self, dataset, shuffle=False, seed=0, **kwargs): 20 | self.sampler = data.distributed.DistributedSampler(dataset, shuffle=shuffle, seed=seed, rank=bmt.rank(), num_replicas=bmt.world_size()) 21 | self.loader = data.DataLoader(dataset, shuffle=False, sampler=self.sampler, **kwargs) 22 | self.epoch = 0 23 | self.shuffle = shuffle 24 | 25 | def __iter__(self): 26 | if self.shuffle: 27 | self.epoch += 1 28 | self.sampler.set_epoch(self.epoch) 29 | return self.loader.__iter__() 30 | 31 | def __len__(self): 32 | return len(self.loader) 33 | 34 | -------------------------------------------------------------------------------- /model_center/dataset/gpt2dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .superglue import * 2 | 3 | DATASET = { 4 | "BoolQ": BoolQ_Dataset, 5 | "CB": CB_Dataset, 6 | "COPA": COPA_Dataset, 7 | "MultiRC": MultiRC_Dataset, 8 | "ReCoRD": ReCoRD_Dataset, 9 | "RTE": RTE_Dataset, 10 | "WiC": WiC_Dataset, 11 | "WSC": WSC_Dataset, 12 | } -------------------------------------------------------------------------------- /model_center/dataset/llamadataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .superglue import * 2 | 3 | DATASET = { 4 | "BoolQ": BoolQ_Dataset, 5 | "CB": CB_Dataset, 6 | "COPA": COPA_Dataset, 7 | "MultiRC": MultiRC_Dataset, 8 | "ReCoRD": ReCoRD_Dataset, 9 | "RTE": RTE_Dataset, 10 | "WiC": WiC_Dataset, 11 | "WSC": WSC_Dataset, 12 | } -------------------------------------------------------------------------------- /model_center/dataset/t5dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .superglue import * 2 | from .squad import * 3 | 4 | DATASET = { 5 | "BoolQ": BoolQ_Dataset, 6 | "CB": CB_Dataset, 7 | "COPA": COPA_Dataset, 8 | "MultiRC": MultiRC_Dataset, 9 | "ReCoRD": ReCoRD_Dataset, 10 | "RTE": RTE_Dataset, 11 | "WiC": WiC_Dataset, 12 | "WSC": WSC_Dataset, 13 | "SQuAD": SQuAD_Dataset, 14 | } -------------------------------------------------------------------------------- /model_center/dataset/t5dataset/squad.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import json 3 | import random 4 | 5 | class SQuAD_Dataset(torch.utils.data.Dataset): 6 | def __init__(self, path, split, tokenizer, max_encoder_length, max_decoder_length) -> None: 7 | super().__init__() 8 | self.split = split 9 | self.data = [] 10 | 11 | for input, target in self.read_data(path, split): 12 | if split == 'train': 13 | self.make_input(tokenizer, input, target, max_encoder_length, max_decoder_length) 14 | else: 15 | self.data.append({ 16 | "inputs": input, 17 | "targets": target 18 | }) 19 | 20 | def shift_tokens_right(self, input_ids, pad_token_id: int=0, decoder_start_token_id: int=0): 21 | shifted_input_ids = input_ids.new_zeros(input_ids.shape) 22 | shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() 23 | shifted_input_ids[..., 0] = decoder_start_token_id 24 | shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) 25 | return shifted_input_ids 26 | 27 | def make_input(self, tokenizer, inputs, targets, max_encoder_length, max_decoder_length): 28 | model_inputs = tokenizer(inputs, max_length=max_encoder_length, padding="max_length", truncation=True) 29 | labels = tokenizer(targets, max_length=max_decoder_length, 
padding="max_length", truncation=True) 30 | 31 | labels["input_ids"] = torch.LongTensor([l if l != tokenizer.pad_token_id else -100 for l in labels["input_ids"]]) 32 | 33 | model_inputs['input_ids'] = torch.LongTensor(model_inputs['input_ids']) 34 | model_inputs['attention_mask'] = torch.LongTensor(model_inputs['attention_mask']) 35 | model_inputs["decoder_input_ids"] = self.shift_tokens_right(labels["input_ids"]) 36 | model_inputs["targets"] = labels["input_ids"] 37 | model_inputs["decoder_attention_mask"] = torch.LongTensor(labels["attention_mask"]) 38 | 39 | self.data.append(model_inputs) 40 | 41 | def generate_input(self, question, context): 42 | return 43 | 44 | def read_data(self, path, split): 45 | if split == 'test': return 46 | path = f"{path}/{split}-v1.1.json" 47 | with open(path, encoding='utf8') as f: 48 | f = json.load(f) 49 | for data in f["data"]: 50 | for paragraph in data['paragraphs']: 51 | for qa in paragraph['qas']: 52 | input = " ".join(["Question:", qa["question"].lstrip(), "Answer: ", "Context:", paragraph["context"].lstrip()]) 53 | if len(qa["answers"])==0: 54 | qa["answers"] = [{"text": "no answer"}] 55 | if split=='train': 56 | target = " ".join(["", random.choice(qa["answers"])["text"], ""]) 57 | else: 58 | target = {a['text'] for a in qa["answers"]} 59 | yield input, target 60 | 61 | def __len__(self): 62 | return len(self.data) 63 | 64 | def __getitem__(self, idx): 65 | if self.split == 'train': 66 | model_inputs = self.data[idx] 67 | for key, value in model_inputs.items(): 68 | model_inputs[key] = value.cuda() 69 | return model_inputs 70 | else: 71 | return self.data[idx] -------------------------------------------------------------------------------- /model_center/generation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ModelCenter/14490451e9a91675ef8816c64cf6304d509bce62/model_center/generation/__init__.py -------------------------------------------------------------------------------- /model_center/layer/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
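SQuAD_Dataset.shift_tokens_right above implements the usual T5 teacher-forcing shift: decoder inputs start with decoder_start_token_id, are the labels shifted right by one position, and any -100 loss-mask entries are mapped back to the pad id. A tiny self-contained check (token ids are made up for illustration):

    import torch
    from model_center.dataset.t5dataset import SQuAD_Dataset

    labels = torch.tensor([[37, 1566, 10, -100, -100]])       # answer tokens, then loss-masked padding
    ds = SQuAD_Dataset.__new__(SQuAD_Dataset)                  # call the helper without building a dataset
    print(ds.shift_tokens_right(labels, pad_token_id=0, decoder_start_token_id=0))
    # tensor([[   0,   37, 1566,   10,    0]])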
15 | from .conv import Conv2d 16 | from .attention import Attention 17 | from .layernorm import LayerNorm 18 | from .feedforward import FeedForward 19 | from .position_embedding import RelativePositionEmbedding, RotaryEmbedding, SegmentPositionEmbedding, RotaryEmbeddingESM 20 | from .blocks import SelfAttentionBlock, CrossAttentionBlock, FFNBlock, TransformerBlock 21 | from .transformer import Encoder, Decoder 22 | from .embedding import Embedding, PatchEmbedding 23 | from .linear import Linear -------------------------------------------------------------------------------- /model_center/layer/conv.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import bmtrain as bmt 3 | import torch.nn.functional as F 4 | import collections 5 | from itertools import repeat 6 | def _ntuple(n): 7 | def parse(x): 8 | if isinstance(x, collections.abc.Iterable): 9 | return x 10 | return tuple(repeat(x, n)) 11 | return parse 12 | 13 | to_2tuple = _ntuple(2) 14 | class Identity(bmt.DistributedModule): 15 | def __init__(self, *args, **kwargs): 16 | super(Identity, self).__init__() 17 | 18 | def forward(self, input): 19 | return input 20 | class Conv2d(bmt.DistributedModule): 21 | def __init__(self, 22 | in_channels, 23 | out_channels, 24 | kernel_size, 25 | stride=1, 26 | padding=0, 27 | dilation=1, 28 | groups=1, 29 | dtype=torch.float, 30 | int8: bool=False, 31 | init_mean : float=0.0, 32 | init_std : float = 1, 33 | bias : bool=True, 34 | padding_mode='zeros', 35 | ): 36 | super().__init__() 37 | self.in_channels = in_channels 38 | self.out_channels = out_channels 39 | self.kernel_size = kernel_size 40 | self.transposed = None 41 | self.output_padding = None 42 | 43 | self.stride = stride 44 | self.dilation = dilation 45 | self.groups = groups 46 | self.padding = padding 47 | self.padding_mode = padding_mode 48 | 49 | kernel = to_2tuple(kernel_size) 50 | self.weight = bmt.DistributedParameter( 51 | torch.empty((out_channels, int(in_channels/groups), kernel[0], kernel[1]), dtype=dtype), 52 | init_method=bmt.ParameterInitializer(torch.nn.init.normal_, mean=init_mean, std=init_std) 53 | ) 54 | self.bias = bmt.DistributedParameter( 55 | torch.empty((out_channels,), dtype=dtype), 56 | init_method=bmt.ParameterInitializer(torch.nn.init.zeros_) 57 | ) if bias else None 58 | self.int8=int8 59 | def forward(self, x : torch.Tensor): 60 | x = F.conv2d(x, 61 | weight=self.weight, 62 | bias=self.bias, 63 | stride=self.stride, 64 | padding=self.padding, 65 | dilation=self.dilation, 66 | groups=self.groups, 67 | ) 68 | 69 | return x -------------------------------------------------------------------------------- /model_center/layer/layernorm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
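Conv2d above is a bmtrain-distributed counterpart of torch.nn.Conv2d: weight and bias are DistributedParameters and the forward simply calls F.conv2d, so the layer must be constructed after bmtrain is initialized. A minimal sketch (shapes and dtype are illustrative):

    import torch
    import bmtrain as bmt
    from model_center.layer import Conv2d

    bmt.init_distributed()
    conv = Conv2d(in_channels=3, out_channels=16, kernel_size=4, stride=4, dtype=torch.half)
    x = torch.randn(2, 3, 224, 224, dtype=torch.half).cuda()
    print(conv(x).shape)          # torch.Size([2, 16, 56, 56])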
15 | 16 | import torch 17 | import bmtrain as bmt 18 | import torch.nn.functional as F 19 | 20 | @torch.jit.script 21 | def rms_layernorm(hidden : torch.Tensor, weight : torch.Tensor, eps :float): 22 | old_dtype = hidden.dtype 23 | variance = hidden.to(torch.float32).pow(2).mean(dim=-1, keepdim=True) 24 | hidden = (hidden * torch.rsqrt(variance + eps)).to(old_dtype) 25 | return hidden * weight 26 | 27 | 28 | class LayerNorm(bmt.DistributedModule): 29 | r""" 30 | `LayerNorm `_ if bias = True: :math:`y = {x-\text{E}[x]\over \text{Var}[x]+\text{eps}} * w + \text{bias}` 31 | 32 | `RMS LayerNorm `_ if bias = False: :math:`y = {x\over \text{Var}[x]+\text{eps}} * w` 33 | 34 | Args: 35 | dim_norm (int): norm dimesion 36 | dtype (optional): Defaults to torch.half. 37 | bias (bool, optional): whether to add the :math:`\text{bias}` term. Defaults to True. 38 | eps (float, optional): :math:`\text{eps}` term. Defaults to 1e-5. 39 | init_var (float, optional): weight will be all initialized to init_var. Defaults to 1.0. 40 | """ 41 | def __init__(self, dim_norm : int, 42 | dtype=torch.half, 43 | bias=True, 44 | eps : float = 1e-5, 45 | init_var = 1.0 46 | ): 47 | 48 | super().__init__() 49 | 50 | self.eps = eps 51 | self.dim_norm = dim_norm 52 | self.weight = bmt.DistributedParameter( 53 | torch.ones(dim_norm, dtype=dtype) * init_var) 54 | self.bias = bmt.DistributedParameter( 55 | torch.zeros(dim_norm, dtype=dtype)) if bias else None 56 | 57 | def forward(self, x : torch.Tensor): 58 | """ 59 | Args: 60 | x (:obj:`torch.Tensor` of shape ``(batch_size, seq_len, dim_norm)``): Input tensor that need to be normalized. 61 | 62 | Return: 63 | :obj:`torch.Tensor` of shape ``(batch_size, seq_len, dim_norm)``: The layernorm output. 64 | 65 | """ 66 | assert x.size(-1) == self.dim_norm 67 | 68 | if self.bias is not None: 69 | return F.layer_norm(x, (self.dim_norm,), self.weight, self.bias, self.eps) 70 | else: 71 | return rms_layernorm(x, self.weight, self.eps) 72 | -------------------------------------------------------------------------------- /model_center/layer/linear.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | import bmtrain as bmt 18 | import math 19 | import torch.nn.functional as F 20 | 21 | class Linear(bmt.DistributedModule): 22 | r"""A fully connected layer, which performs :math:`\pmb{y} = \mathbf{W} \pmb{x} + \pmb{b}` 23 | 24 | Args: 25 | dim_in (int): input dimension of :math:`\pmb{x}` 26 | dim_out (int): output dimension of :math:`\pmb{y}` 27 | dtype (optional): Defaults to torch.half. 28 | init_mean (float, optional): mean of :math:`\mathbf{W}\sim\mathcal{N}(\text{mean}, \text{std}^2)`. Defaults to 0. 29 | init_std (float, optional): std of :math:`\mathbf{W}\sim\mathcal{N}(\text{mean}, \text{std}^2)`. Defaults to 1. 30 | bias (bool, optional): whether to add bias term :math:`\pmb{b}`. 
Defaults to False. 31 | """ 32 | def __init__(self, 33 | dim_in : int, 34 | dim_out : int, 35 | length_scale : bool = False, 36 | length_scale_before : bool = False, 37 | dtype = torch.half, 38 | int8 : bool = False, 39 | init_mean : float = 0.0, 40 | init_std : float = 1, 41 | bias : bool = False, 42 | ): 43 | super().__init__() 44 | self.dim_in = self.in_features = dim_in 45 | self.dim_out = self.out_features = dim_out 46 | self.weight = bmt.DistributedParameter( 47 | torch.empty((dim_out, dim_in), dtype=dtype), 48 | init_method=bmt.ParameterInitializer(torch.nn.init.normal_, mean=init_mean, std=init_std) 49 | ) 50 | self.bias = bmt.DistributedParameter( 51 | torch.empty((dim_out,), dtype=dtype), 52 | init_method=bmt.ParameterInitializer(torch.nn.init.zeros_) 53 | ) if bias else None 54 | self.length_scale = length_scale 55 | self.length_scale_before = length_scale_before 56 | self.int8 = int8 57 | 58 | def forward(self, x : torch.Tensor): 59 | """ 60 | Args: 61 | x (:obj:`torch.Tensor` of shape ``(batch, seq_len, dim_in)``): The input of linear layer 62 | 63 | Returns: 64 | :obj:`torch.Tensor` of shape ``(batch, seq_len, dim_out)``: The output of the linear transform y. 65 | 66 | """ 67 | if self.length_scale and self.length_scale_before: 68 | x = x / math.sqrt(self.dim_in) 69 | x = F.linear(x, self.weight) 70 | if self.length_scale and not self.length_scale_before: 71 | x = x / math.sqrt(self.dim_in) 72 | if self.bias is not None: 73 | x = x + self.bias 74 | return x 75 | -------------------------------------------------------------------------------- /model_center/model/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | # Model Architecture 18 | from .config import * 19 | from .basemodel import BaseModel, ModelOutput, BaseModelOutput, BaseModelOutputWithPooling, Seq2SeqModelOutput 20 | from .bert import Bert 21 | from .roberta import Roberta 22 | from .gpt2 import GPT2 23 | from .gptj import GPTj 24 | from .t5 import T5 25 | from .cpm1 import CPM1 26 | from .cpm2 import CPM2 27 | from .cpm3 import CPM3 28 | from .glm import GLM 29 | from .longformer import Longformer 30 | from .vit import ViT 31 | from .opt import OPT 32 | from .llama import Llama 33 | -------------------------------------------------------------------------------- /model_center/model/config/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
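Linear above behaves like a bias-optional torch.nn.Linear whose weight is a bmtrain DistributedParameter drawn from N(init_mean, init_std²); the optional length_scale flag additionally divides activations by sqrt(dim_in) before or after the matmul. Combined with the RMS variant of the LayerNorm defined earlier, a pre-norm projection looks like this (illustrative shapes; bmtrain already initialized):

    import torch
    import bmtrain as bmt
    from model_center.layer import LayerNorm, Linear

    bmt.init_distributed()
    norm = LayerNorm(dim_norm=1024, bias=False)             # bias=False selects the RMS LayerNorm branch
    proj = Linear(dim_in=1024, dim_out=4096, init_std=0.02)
    x = torch.randn(2, 8, 1024, dtype=torch.half).cuda()
    print(proj(norm(x)).shape)                               # torch.Size([2, 8, 4096])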
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from .config import Config 17 | from .cpm1_config import CPM1Config 18 | from .cpm2_config import CPM2Config 19 | from .cpm3_config import CPM3Config 20 | from .t5_config import T5Config 21 | from .gpt2_config import GPT2Config 22 | from .gptj_config import GPTjConfig 23 | from .bert_config import BertConfig 24 | from .roberta_config import RobertaConfig 25 | from .vit_config import VitConfig 26 | from .longformer_config import LongformerConfig 27 | from .glm_config import GLMConfig 28 | from .opt_config import OPTConfig 29 | from .llama_config import LlamaConfig 30 | -------------------------------------------------------------------------------- /model_center/model/config/bert_config.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | from .config import Config 18 | 19 | class BertConfig(Config): 20 | """ 21 | This is a configuration class that stores the configuration of the BERT model, which inherits from the Config class. 22 | It is used to instantiate the Bert model according to the specified parameters and define the model architecture. 23 | You can set specific parameters to control the output of the model. 24 | 25 | For example: 26 | [`dim_model`] is used to determine the Dimension of the encoder layers and the pooler layer. 27 | You can choose to use the default value of 768 or customize their dimensions. 
28 | 29 | """ 30 | 31 | def __init__(self, vocab_size = 119547, 32 | type_size = 2, 33 | dim_model = 768, 34 | num_heads = 12, 35 | dim_head = 64, 36 | dim_ff = 3072, 37 | num_layers = 12, 38 | dropout_p = 0.0, 39 | emb_init_mean = 0.0, 40 | emb_init_std = 0.02, 41 | pos_bias_type = "none", 42 | position_size = 512, 43 | norm_init_var = 1.0, 44 | norm_bias = True, 45 | norm_eps = 1e-12, 46 | att_init_mean = 0.0, 47 | att_init_std = 0.02, 48 | att_bias = True, 49 | att_mask_value = float("-1e4"), 50 | ffn_init_mean = 0.0, 51 | ffn_init_std = 0.02, 52 | ffn_bias = True, 53 | ffn_activate_fn = "gelu", 54 | proj_init_mean = 0.0, 55 | proj_init_std = 0.02, 56 | proj_bias = True, 57 | length_scale = False, 58 | attn_scale = True, 59 | half = True, 60 | int8 = False, 61 | tied = True, 62 | cls_head = None, 63 | post_layer_norm = True, 64 | ): 65 | 66 | super().__init__() 67 | 68 | self.vocab_size = vocab_size 69 | self.type_size = type_size 70 | self.position_size = position_size 71 | self.dim_model = dim_model 72 | self.num_heads = num_heads 73 | self.dim_head = dim_head 74 | self.dim_ff = dim_ff 75 | self.num_layers = num_layers 76 | self.dropout_p = dropout_p 77 | self.emb_init_mean = emb_init_mean 78 | self.emb_init_std = emb_init_std 79 | self.pos_bias_type = pos_bias_type 80 | self.norm_init_var = norm_init_var 81 | self.norm_bias = norm_bias 82 | self.norm_eps = norm_eps 83 | self.att_init_mean = att_init_mean 84 | self.att_init_std = att_init_std 85 | self.att_bias = att_bias 86 | self.att_mask_value = att_mask_value 87 | self.ffn_init_mean = ffn_init_mean 88 | self.ffn_init_std = ffn_init_std 89 | self.ffn_bias = ffn_bias 90 | self.ffn_activate_fn = ffn_activate_fn 91 | self.proj_init_mean = proj_init_mean 92 | self.proj_init_std = proj_init_std 93 | self.proj_bias = proj_bias 94 | self.length_scale = length_scale 95 | self.attn_scale = attn_scale 96 | self.int8 = int8 97 | self.tied = tied 98 | if half: 99 | self.dtype = torch.half 100 | else: 101 | self.dtype = torch.float 102 | self.cls_head = cls_head 103 | self.post_layer_norm = post_layer_norm -------------------------------------------------------------------------------- /model_center/model/config/config.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
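Every architectural hyperparameter of BertConfig above is a plain constructor argument, so a scaled-down BERT can be described directly in code; unspecified fields keep the defaults shown in the signature. A short sketch:

    import torch
    from model_center.model import BertConfig

    tiny = BertConfig(dim_model=256, num_heads=4, dim_head=64,
                      dim_ff=1024, num_layers=4, half=False)
    assert tiny.dtype == torch.float        # half=False stores torch.float rather than torch.half
    print(tiny.dim_model, tiny.num_layers)  # 256 4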
15 | 16 | import json 17 | import os 18 | import copy 19 | from typing import Any, Dict, Union 20 | from ...utils import check_web_and_convert_path 21 | 22 | class Config(object): 23 | """ enc_dec model configuration """ 24 | 25 | def __init__(self): 26 | super().__init__() 27 | 28 | @classmethod 29 | def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **args): 30 | path = check_web_and_convert_path(pretrained_model_name_or_path, 'config') 31 | return cls.from_json_file(os.path.join(path, 'config.json'), **args) 32 | 33 | @classmethod 34 | def from_json_file(cls, json_file: Union[str, os.PathLike], **args): 35 | config_dict = cls._dict_from_json_file(json_file, **args) 36 | return cls(**config_dict) 37 | 38 | @classmethod 39 | def _dict_from_json_file(cls, json_file: Union[str, os.PathLike], **args): 40 | with open(json_file, "r", encoding="utf-8") as reader: 41 | text = reader.read() 42 | res = json.loads(text) 43 | for key in args: 44 | res[key] = args[key] 45 | return res 46 | 47 | def to_json_file(self, json_file_path: Union[str, os.PathLike]): 48 | with open(json_file_path, "w", encoding="utf-8") as writer: 49 | writer.write(self.to_json_string()) 50 | 51 | def to_json_string(self) -> str: 52 | config_dict = self.to_dict() 53 | return json.dumps(config_dict, indent=2, sort_keys=True) + "\n" 54 | 55 | def to_dict(self) -> Dict[str, Any]: 56 | output = copy.deepcopy(self.__dict__) 57 | if hasattr(self.__class__, "model_type"): 58 | output["model_type"] = self.__class__.model_type 59 | return output -------------------------------------------------------------------------------- /model_center/model/config/glm_config.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
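Concrete configs are normally obtained through the class methods above: from_pretrained resolves a model identifier to a cached directory (via check_web_and_convert_path) and parses its config.json, and any extra keyword arguments override the values read from the file. The test scripts later in this repository use the same pattern to disable dropout:

    from model_center.model import BertConfig

    config = BertConfig.from_pretrained("bert-base-uncased")                # local dir or ModelCenter identifier
    config.dropout_p = 0                                                     # tweak after loading ...
    config = BertConfig.from_pretrained("bert-base-uncased", dropout_p=0)   # ... or override via kwargs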
15 | 16 | import torch 17 | from .config import Config 18 | 19 | class GLMConfig(Config): 20 | 21 | def __init__(self, vocab_size=50048, 22 | dim_model=1024, 23 | num_heads=16, 24 | dim_head=64, 25 | dim_ff=4096, 26 | num_layers=24, 27 | dropout_p=0.1, 28 | emb_init_mean = 0, 29 | emb_init_std = 0.02, 30 | pos_bias_type = "none", 31 | position_size = 1025, 32 | norm_init_var = 1.0, 33 | norm_bias = True, 34 | norm_eps = 1e-5, 35 | att_init_mean = 0.0, 36 | att_init_std = 0.02, 37 | att_bias = True, 38 | att_mask_value = float("-inf"), 39 | ffn_init_mean = 0.0, 40 | ffn_init_std = 0.02, 41 | ffn_bias = True, 42 | ffn_activate_fn = "gelu", 43 | proj_init_mean = 0.0, 44 | proj_init_std = 0.02, 45 | proj_bias = False, 46 | length_scale = False, 47 | attn_scale = True, 48 | half = True, 49 | int8 = False, 50 | tied = True, 51 | cls_head = None, 52 | post_layer_norm = False, 53 | sop_tok_id = 50006, 54 | eop_tok_id = 50007, 55 | mask_tok_id = 50008, 56 | ): 57 | 58 | super().__init__() 59 | 60 | self.vocab_size = vocab_size 61 | self.dim_model = dim_model 62 | self.num_heads = num_heads 63 | self.dim_head = dim_head 64 | self.dim_ff = dim_ff 65 | self.num_layers = num_layers 66 | self.dropout_p = dropout_p 67 | self.emb_init_mean = emb_init_mean 68 | self.emb_init_std = emb_init_std 69 | self.pos_bias_type = pos_bias_type 70 | self.position_size = position_size 71 | self.norm_init_var = norm_init_var 72 | self.norm_bias = norm_bias 73 | self.norm_eps = norm_eps 74 | self.att_init_mean = att_init_mean 75 | self.att_init_std = att_init_std 76 | self.att_bias = att_bias 77 | self.att_mask_value = att_mask_value 78 | self.ffn_init_mean = ffn_init_mean 79 | self.ffn_init_std = ffn_init_std 80 | self.ffn_bias = ffn_bias 81 | self.ffn_activate_fn = ffn_activate_fn 82 | self.proj_init_mean = proj_init_mean 83 | self.proj_init_std = proj_init_std 84 | self.proj_bias = proj_bias 85 | self.length_scale = length_scale 86 | self.attn_scale = attn_scale 87 | self.int8 = int8 88 | self.tied = tied 89 | if half: 90 | self.dtype = torch.half 91 | else: 92 | self.dtype = torch.float 93 | self.cls_head = cls_head 94 | self.post_layer_norm = post_layer_norm 95 | self.sop_tok_id = sop_tok_id 96 | self.eop_tok_id = eop_tok_id 97 | self.mask_tok_id = mask_tok_id -------------------------------------------------------------------------------- /model_center/model/config/gpt2_config.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | from .config import Config 18 | 19 | class GPT2Config(Config): 20 | """ 21 | This is a configuration class that stores the configuration of the GPT-2 model, which inherits from the Config class. 22 | It is used to instantiate the Bert model according to the specified parameters and define the model architecture. 23 | You can set specific parameters to control the output of the model. 
24 | 25 | For example: 26 | [`dim_model`] is used to determine the Dimension of the encoder layers. 27 | You can choose to use the default value of 768 or customize their dimensions. 28 | 29 | """ 30 | 31 | def __init__(self, vocab_size = 50258, 32 | dim_model = 1024, 33 | num_heads = 16, 34 | dim_head = 64, 35 | dim_ff = 4096, 36 | num_layers = 24, 37 | dropout_p = 0.0, 38 | emb_init_mean = 0.0, 39 | emb_init_std = 0.02, 40 | pos_bias_type = "none", 41 | position_size = 1024, 42 | norm_init_var = 1.0, 43 | norm_bias = True, 44 | norm_eps = 1e-5, 45 | att_init_mean = 0.0, 46 | att_init_std = 0.02, 47 | att_bias = True, 48 | att_mask_value = float("-1e4"), 49 | ffn_init_mean = 0.0, 50 | ffn_init_std = 0.02, 51 | ffn_bias = True, 52 | ffn_activate_fn = "gelu", 53 | proj_init_mean = 0.0, 54 | proj_init_std = 0.02, 55 | proj_bias = True, 56 | length_scale = False, 57 | attn_scale = True, 58 | half = True, 59 | int8 = False, 60 | tied = True, 61 | cls_head = None, 62 | post_layer_norm = False, 63 | ): 64 | 65 | super().__init__() 66 | 67 | self.vocab_size = vocab_size 68 | self.dim_model = dim_model 69 | self.num_heads = num_heads 70 | self.dim_head = dim_head 71 | self.dim_ff = dim_ff 72 | self.num_layers = num_layers 73 | self.dropout_p = dropout_p 74 | self.emb_init_mean = emb_init_mean 75 | self.emb_init_std = emb_init_std 76 | self.pos_bias_type = pos_bias_type 77 | self.position_size = position_size 78 | self.norm_init_var = norm_init_var 79 | self.norm_bias = norm_bias 80 | self.norm_eps = norm_eps 81 | self.att_init_mean = att_init_mean 82 | self.att_init_std = att_init_std 83 | self.att_bias = att_bias 84 | self.att_mask_value = att_mask_value 85 | self.ffn_init_mean = ffn_init_mean 86 | self.ffn_init_std = ffn_init_std 87 | self.ffn_bias = ffn_bias 88 | self.ffn_activate_fn = ffn_activate_fn 89 | self.proj_init_mean = proj_init_mean 90 | self.proj_init_std = proj_init_std 91 | self.proj_bias = proj_bias 92 | self.length_scale = length_scale 93 | self.attn_scale = attn_scale 94 | self.int8 = int8 95 | self.tied = tied 96 | if half: 97 | self.dtype = torch.half 98 | else: 99 | self.dtype = torch.float 100 | self.cls_head = cls_head 101 | self.post_layer_norm = post_layer_norm -------------------------------------------------------------------------------- /model_center/model/config/gptj_config.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | import torch 16 | from .config import Config 17 | 18 | class GPTjConfig(Config): 19 | """ 20 | This is a configuration class that stores the configuration of the GPT-J model, which inherits from the Config class. 21 | It is used to instantiate the Bert model according to the specified parameters and define the model architecture. 22 | You can set specific parameters to control the output of the model. 
23 | 24 | For example: 25 | [`dim_model`] is used to determine the Dimension of the encoder layers. 26 | You can choose to use the default value of 4096 or customize their dimensions. 27 | 28 | """ 29 | def __init__(self, vocab_size=50400, 30 | dim_model=4096, 31 | num_heads=16, 32 | dim_head=256, 33 | dim_ff=16384, 34 | num_layers=28, 35 | dropout_p=0, 36 | emb_init_mean = 0.0, 37 | emb_init_std = 1, 38 | pos_bias_type = "rotary", 39 | pos_rotary_dim = 64, 40 | norm_init_var = 1.0, 41 | norm_bias = True, 42 | norm_eps = 1e-5, 43 | att_init_mean = 0.0, 44 | att_init_std = 0.1, 45 | att_bias = False, 46 | att_mask_value = float("-inf"), 47 | ffn_init_mean = 0.0, 48 | ffn_init_std = 0.1, 49 | ffn_bias = True, 50 | ffn_activate_fn = "gelu", 51 | proj_init_mean = 0.0, 52 | proj_init_std = 1, 53 | proj_bias = True, 54 | length_scale = False, 55 | attn_scale = True, 56 | half = True, 57 | int8 = False, 58 | tied = False, 59 | cls_head = None, 60 | post_layer_norm = False, 61 | ): 62 | 63 | super().__init__() 64 | 65 | self.vocab_size = vocab_size 66 | self.dim_model = dim_model 67 | self.num_heads = num_heads 68 | self.dim_head = dim_head 69 | self.dim_ff = dim_ff 70 | self.num_layers = num_layers 71 | self.dropout_p = dropout_p 72 | self.emb_init_mean = emb_init_mean 73 | self.emb_init_std = emb_init_std 74 | self.pos_bias_type = pos_bias_type 75 | self.pos_rotary_dim = pos_rotary_dim 76 | self.norm_init_var = norm_init_var 77 | self.norm_bias = norm_bias 78 | self.norm_eps = norm_eps 79 | self.att_init_mean = att_init_mean 80 | self.att_init_std = att_init_std 81 | self.att_bias = att_bias 82 | self.att_mask_value = att_mask_value 83 | self.ffn_init_mean = ffn_init_mean 84 | self.ffn_init_std = ffn_init_std 85 | self.ffn_bias = ffn_bias 86 | self.ffn_activate_fn = ffn_activate_fn 87 | self.proj_init_mean = proj_init_mean 88 | self.proj_init_std = proj_init_std 89 | self.proj_bias = proj_bias 90 | self.length_scale = length_scale 91 | self.attn_scale = attn_scale 92 | self.int8 = int8 93 | self.tied = tied 94 | if half: 95 | self.dtype = torch.half 96 | else: 97 | self.dtype = torch.float 98 | self.cls_head = cls_head 99 | self.post_layer_norm = post_layer_norm -------------------------------------------------------------------------------- /model_center/model/config/llama_config.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | import torch 16 | from .config import Config 17 | 18 | class LlamaConfig(Config): 19 | """ 20 | This is a configuration class that stores the configuration of the LLaMa model, which inherits from the Config class. 21 | It is used to instantiate the Bert model according to the specified parameters and define the model architecture. 22 | You can set specific parameters to control the output of the model. 
23 | 24 | For example: 25 | [`dim_model`] is used to determine the Dimension of the encoder layers. 26 | You can choose to use the default value of 4096 or customize their dimensions. 27 | 28 | """ 29 | def __init__(self, vocab_size=32000, 30 | dim_model=4096, 31 | num_heads=32, 32 | num_heads_kv=-1, 33 | dim_head=128, 34 | dim_ff=11008, 35 | num_layers=32, 36 | dropout_p=0, 37 | emb_init_mean = 0.0, 38 | emb_init_std = 0.02, 39 | pos_bias_type = "rotary", 40 | norm_init_var = 1.0, 41 | norm_bias = False, 42 | norm_eps = 1e-6, 43 | att_init_mean = 0.0, 44 | att_init_std = 0.02, 45 | att_bias = False, 46 | att_mask_value = float("-inf"), 47 | ffn_init_mean = 0.0, 48 | ffn_init_std = 0.02, 49 | ffn_bias = False, 50 | ffn_activate_fn = "gated_silu", 51 | proj_init_mean = 0.0, 52 | proj_init_std = 0.02, 53 | proj_bias = False, 54 | length_scale = False, 55 | attn_scale = True, 56 | half = True, 57 | int8 = False, 58 | tied = False, 59 | cls_head = None, 60 | post_layer_norm = False, 61 | ): 62 | 63 | super().__init__() 64 | 65 | self.vocab_size = vocab_size 66 | self.dim_model = dim_model 67 | self.num_heads = num_heads 68 | self.num_heads_kv = num_heads_kv if num_heads_kv != -1 else num_heads 69 | self.dim_head = dim_head 70 | self.dim_ff = dim_ff 71 | self.num_layers = num_layers 72 | self.dropout_p = dropout_p 73 | self.emb_init_mean = emb_init_mean 74 | self.emb_init_std = emb_init_std 75 | self.pos_bias_type = pos_bias_type 76 | self.norm_init_var = norm_init_var 77 | self.norm_bias = norm_bias 78 | self.norm_eps = norm_eps 79 | self.att_init_mean = att_init_mean 80 | self.att_init_std = att_init_std 81 | self.att_bias = att_bias 82 | self.att_mask_value = att_mask_value 83 | self.ffn_init_mean = ffn_init_mean 84 | self.ffn_init_std = ffn_init_std 85 | self.ffn_bias = ffn_bias 86 | self.ffn_activate_fn = ffn_activate_fn 87 | self.proj_init_mean = proj_init_mean 88 | self.proj_init_std = proj_init_std 89 | self.proj_bias = proj_bias 90 | self.length_scale = length_scale 91 | self.attn_scale = attn_scale 92 | self.int8 = int8 93 | self.tied = tied 94 | if half: 95 | self.dtype = torch.half 96 | else: 97 | self.dtype = torch.float 98 | self.cls_head = cls_head 99 | self.post_layer_norm = post_layer_norm -------------------------------------------------------------------------------- /model_center/model/config/opt_config.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | from .config import Config 18 | 19 | class OPTConfig(Config): 20 | """ 21 | This is a configuration class that stores the configuration of the OPT model, which inherits from the Config class. 22 | It is used to instantiate the Bert model according to the specified parameters and define the model architecture. 23 | You can set specific parameters to control the output of the model. 
24 | 25 | For example: 26 | [`dim_model`] is used to determine the Dimension of the encoder layers. 27 | You can choose to use the default value of 768 or customize their dimensions. 28 | 29 | """ 30 | 31 | def __init__(self, vocab_size = 50272, 32 | dim_model = 768, 33 | num_heads = 12, 34 | dim_head = 64, 35 | dim_ff = 3072, 36 | num_layers = 12, 37 | dropout_p = 0.1, 38 | emb_init_mean = 0.0, 39 | emb_init_std = 0.02, 40 | pos_bias_type = "none", 41 | pad_token_id = 1, 42 | prefix = "", 43 | position_size = 2048, 44 | norm_init_var = 1.0, 45 | norm_bias = True, 46 | norm_eps = 1e-5, 47 | att_init_mean = 0.0, 48 | att_init_std = 0.02, 49 | att_bias = True, 50 | att_mask_value = float("-65504"), 51 | ffn_init_mean = 0.0, 52 | ffn_init_std = 0.02, 53 | ffn_bias = True, 54 | ffn_activate_fn = "relu", 55 | length_scale = False, 56 | attn_scale = True, 57 | half = True, 58 | int8 = False, 59 | tied = True, 60 | cls_head = None, 61 | post_layer_norm = False, 62 | ): 63 | 64 | super().__init__() 65 | 66 | self.vocab_size = vocab_size 67 | self.dim_model = dim_model 68 | self.num_heads = num_heads 69 | self.dim_head = dim_head 70 | self.dim_ff = dim_ff 71 | self.num_layers = num_layers 72 | self.dropout_p = dropout_p 73 | self.emb_init_mean = emb_init_mean 74 | self.emb_init_std = emb_init_std 75 | self.pos_bias_type = pos_bias_type 76 | self.pad_token_id = pad_token_id 77 | self.prefix = prefix 78 | self.position_size = position_size 79 | self.norm_init_var = norm_init_var 80 | self.norm_bias = norm_bias 81 | self.norm_eps = norm_eps 82 | self.att_init_mean = att_init_mean 83 | self.att_init_std = att_init_std 84 | self.att_bias = att_bias 85 | self.att_mask_value = att_mask_value 86 | self.ffn_init_mean = ffn_init_mean 87 | self.ffn_init_std = ffn_init_std 88 | self.ffn_bias = ffn_bias 89 | self.ffn_activate_fn = ffn_activate_fn 90 | self.length_scale = length_scale 91 | self.attn_scale = attn_scale 92 | self.int8 = int8 93 | self.tied = tied 94 | if half: 95 | self.dtype = torch.half 96 | else: 97 | self.dtype = torch.float 98 | self.cls_head = cls_head 99 | self.post_layer_norm = post_layer_norm -------------------------------------------------------------------------------- /model_center/model/config/roberta_config.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | from .config import Config 18 | 19 | class RobertaConfig(Config): 20 | """ 21 | This is a configuration class that stores the configuration of the RoBERTa model, which inherits from the Config class. 22 | It is used to instantiate the Bert model according to the specified parameters and define the model architecture. 23 | You can set specific parameters to control the output of the model. 24 | 25 | For example: 26 | [`dim_model`] is used to determine the Dimension of the encoder layers and the pooler layer. 
27 | You can choose to use the default value of 768 or customize their dimensions. 28 | 29 | """ 30 | 31 | def __init__(self, vocab_size = 50265, 32 | type_size = 1, 33 | dim_model = 1024, 34 | num_heads = 16, 35 | dim_head = 64, 36 | dim_ff = 4096, 37 | num_layers = 24, 38 | dropout_p = 0.0, 39 | emb_init_mean = 0.0, 40 | emb_init_std = 0.02, 41 | pos_bias_type = "none", 42 | position_size = 514, 43 | norm_init_var = 1.0, 44 | norm_bias = True, 45 | norm_eps = 1e-05, 46 | att_init_mean = 0.0, 47 | att_init_std = 0.02, 48 | att_bias = True, 49 | att_mask_value = float("-1e4"), 50 | ffn_init_mean = 0.0, 51 | ffn_init_std = 0.02, 52 | ffn_bias = True, 53 | ffn_activate_fn = "gelu", 54 | proj_init_mean = 0.0, 55 | proj_init_std = 0.02, 56 | proj_bias = True, 57 | length_scale = False, 58 | attn_scale = True, 59 | half = True, 60 | int8 = False, 61 | tied = True, 62 | cls_head = None, 63 | post_layer_norm = True, 64 | pad_token_id = 1, 65 | ): 66 | 67 | super().__init__() 68 | 69 | self.vocab_size = vocab_size 70 | self.type_size = type_size 71 | self.position_size = position_size 72 | self.dim_model = dim_model 73 | self.num_heads = num_heads 74 | self.dim_head = dim_head 75 | self.dim_ff = dim_ff 76 | self.num_layers = num_layers 77 | self.dropout_p = dropout_p 78 | self.emb_init_mean = emb_init_mean 79 | self.emb_init_std = emb_init_std 80 | self.pos_bias_type = pos_bias_type 81 | self.norm_init_var = norm_init_var 82 | self.norm_bias = norm_bias 83 | self.norm_eps = norm_eps 84 | self.att_init_mean = att_init_mean 85 | self.att_init_std = att_init_std 86 | self.att_bias = att_bias 87 | self.att_mask_value = att_mask_value 88 | self.ffn_init_mean = ffn_init_mean 89 | self.ffn_init_std = ffn_init_std 90 | self.ffn_bias = ffn_bias 91 | self.ffn_activate_fn = ffn_activate_fn 92 | self.proj_init_mean = proj_init_mean 93 | self.proj_init_std = proj_init_std 94 | self.proj_bias = proj_bias 95 | self.length_scale = length_scale 96 | self.attn_scale = attn_scale 97 | self.int8 = int8 98 | self.tied = tied 99 | if half: 100 | self.dtype = torch.half 101 | else: 102 | self.dtype = torch.float 103 | self.cls_head = cls_head 104 | self.post_layer_norm = post_layer_norm 105 | self.pad_token_id = pad_token_id -------------------------------------------------------------------------------- /model_center/model/config/vit_config.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | from .config import Config 18 | 19 | class VitConfig(Config): 20 | """ 21 | This is a configuration class that stores the configuration of the Vit model, which inherits from the Config class. 22 | It is used to instantiate the vit model according to the specified parameters and define the model architecture. 23 | You can set specific parameters to control the output of the model. 
24 | 25 | For example: 26 | [`hidden_size`] is used to determine the Dimension of the encoder layers. 27 | You can choose to use the default value of 768 or customize their dimensions. 28 | 29 | """ 30 | def __init__(self, img_size=224, 31 | patch_size=16, 32 | channels_in=3, 33 | num_classes=1000, 34 | hidden_size=768, 35 | num_layers=12, 36 | num_heads=12, 37 | mlp_size=3072, 38 | attn_bias=True, 39 | attn_scale=None, 40 | norm_bias=True, 41 | ffn_bias=True, 42 | representation_size=None, 43 | drop=0., 44 | half=True, 45 | dtype=torch.float): 46 | 47 | super().__init__() 48 | 49 | self.img_size = img_size 50 | self.patch_size = patch_size 51 | self.channels_in = channels_in 52 | self.num_classes = num_classes 53 | self.hidden_size = hidden_size 54 | self.num_layers = num_layers 55 | self.num_heads = num_heads 56 | self.mlp_size = mlp_size 57 | self.attn_bias = attn_bias 58 | self.attn_scale = attn_scale 59 | self.norm_bias = norm_bias 60 | self.ffn_bias = ffn_bias 61 | self.representation_size = representation_size 62 | self.drop = drop 63 | if half: 64 | self.dtype = torch.half 65 | else: 66 | self.dtype = torch.float -------------------------------------------------------------------------------- /model_center/model/vit.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
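VitConfig above describes a ViT-Base/16-style encoder by default (224×224 input, 16×16 patches, 12 layers); note that, as in the other config classes, it is the half flag rather than the trailing dtype argument that determines the dtype actually stored. A short sketch:

    import torch
    from model_center.model import VitConfig

    config = VitConfig(img_size=384, num_classes=10, half=False)
    assert config.dtype == torch.float
    print((config.img_size // config.patch_size) ** 2)    # 576 patches at 384x384 with 16x16 patches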
15 | 16 | import torch 17 | from .basemodel import BaseModel 18 | from .config import VitConfig 19 | from ..layer import PatchEmbedding, Encoder, Linear 20 | 21 | class ViT(BaseModel): 22 | 23 | _CONFIG_TYPE = VitConfig 24 | def __init__(self, config: VitConfig): 25 | 26 | super().__init__() 27 | 28 | hidden_size = config.hidden_size 29 | self.num_features = config.hidden_size # num_features for consistency with other models 30 | self.patch_embed = PatchEmbedding( 31 | img_size=config.img_size, 32 | patch_size=config.patch_size, 33 | in_chans=config.channels_in, 34 | embed_dim=hidden_size, dtype=config.dtype) 35 | self.num_patches = self.patch_embed.num_patches 36 | 37 | self.pos_drop = torch.nn.Dropout(p=config.drop) 38 | self.representation_size = config.representation_size 39 | 40 | self.blocks = Encoder(num_layers=config.num_layers, 41 | dim_model=hidden_size,dim_ff=config.mlp_size, 42 | num_heads=config.num_heads, 43 | dim_head=hidden_size//config.num_heads, 44 | att_bias=config.attn_bias, 45 | attn_scale=True, 46 | dropout_p=config.drop, 47 | norm_bias=config.norm_bias, 48 | ffn_bias=config.ffn_bias, 49 | ffn_activate_fn="gelu", 50 | dtype=config.dtype) 51 | 52 | if self.representation_size is not None: 53 | self.representation_layer = Linear(hidden_size,config.representation_size) 54 | hidden_size = config.representation_size 55 | 56 | self.head = Linear(hidden_size, config.num_classes, dtype=config.dtype,bias=True) 57 | 58 | def forward(self, input_seq, register_blk=-1, attention_mask=None): 59 | batch = input_seq.shape[0] 60 | hidden_state = self.patch_embed(input_seq) 61 | device = input_seq.device 62 | if attention_mask is not None: 63 | attention_mask = attention_mask.to(torch.bool) 64 | else: 65 | attention_mask = torch.ones(self.num_patches+1, device=device,dtype=torch.int32)[None, :].repeat(batch, 1) 66 | attention_mask = attention_mask.view(batch, self.num_patches+1, 1) & attention_mask.view(batch, 1, self.num_patches+1) 67 | hidden_state = self.pos_drop(hidden_state) 68 | hidden_state = self.blocks(hidden_state,attention_mask=attention_mask) 69 | if self.representation_size is not None: 70 | hidden_state = self.representation_layer(hidden_state) 71 | hidden_state = torch.tanh(hidden_state) 72 | logits = self.head(hidden_state[:,0]) 73 | return logits 74 | -------------------------------------------------------------------------------- /model_center/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from .cpm1_tokenizer import CPM1Tokenizer 4 | from .cpm2_tokenizer import CPM2Tokenizer 5 | from .bert_tokenizer import BertTokenizer 6 | from .roberta_tokenizer import RobertaTokenizer 7 | from .t5_tokenizer import T5Tokenizer 8 | from .gpt2_tokenizer import GPT2Tokenizer 9 | from .gptj_tokenizer import GPTjTokenizer 10 | from .glm_tokenizer import GLMTokenizer 11 | from .opt_tokenizer import OPTTokenizer 12 | from .llama_tokenizer import LlamaTokenizer -------------------------------------------------------------------------------- /model_center/tokenizer/base_tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import os 4 | from typing import Union 5 | import torch 6 | import bmtrain as bmt 7 | from model_center.utils import check_web_and_convert_path 8 | 9 | class BaseTokenizer: 10 | """ 11 | The current implementation is mainly to adapt the training framework of the Transformers toolkit, 12 | and replace the original model 
implementation. 13 | TODO we will change to our SAM implementation in the future, which will be a more efficient tokenizer 14 | """ 15 | def __init__(self, tokenizer_type): 16 | self.tokenizer_type = tokenizer_type 17 | 18 | def from_pretrained(self, pretrained_model_name_or_path: Union[str, os.PathLike], *args, **kwargs): 19 | pretrained_model_name_or_path = check_web_and_convert_path(pretrained_model_name_or_path, 'tokenizer') 20 | return self.tokenizer_type.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) 21 | -------------------------------------------------------------------------------- /model_center/tokenizer/bert_tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | # The current implementation is mainly to adapt the training framework of the Transformers toolkit, 4 | # and replace the original model implementation. 5 | # TODO we will change to our SAM implementation in the future, which will be a more efficient tokenizer 6 | 7 | from .base_tokenizer import BaseTokenizer 8 | from transformers import BertTokenizer as transformers_BertTokenizer 9 | 10 | BertTokenizer = BaseTokenizer(transformers_BertTokenizer) 11 | -------------------------------------------------------------------------------- /model_center/tokenizer/gpt2_tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | # The current implementation is mainly to adapt the training framework of the Transformers toolkit, 4 | # and replace the original model implementation. 5 | # TODO we will change to our SAM implementation in the future, which will be a more efficient tokenizer 6 | 7 | from .base_tokenizer import BaseTokenizer 8 | from transformers import GPT2Tokenizer as transformers_GPT2Tokenizer 9 | 10 | GPT2Tokenizer = BaseTokenizer(transformers_GPT2Tokenizer) 11 | -------------------------------------------------------------------------------- /model_center/tokenizer/gptj_tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | # The current implementation is mainly to adapt the training framework of the Transformers toolkit, 4 | # and replace the original model implementation. 5 | # TODO we will change to our SAM implementation in the future, which will be a more efficient tokenizer 6 | 7 | from .base_tokenizer import BaseTokenizer 8 | from transformers import AutoTokenizer as transformers_GPTjTokenizer 9 | 10 | GPTjTokenizer = BaseTokenizer(transformers_GPTjTokenizer) 11 | -------------------------------------------------------------------------------- /model_center/tokenizer/llama_tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | # The current implementation is mainly to adapt the training framework of the Transformers toolkit, 4 | # and replace the original model implementation. 
5 | # TODO we will change to our SAM implementation in the future, which will be a more efficient tokenizer 6 | 7 | from .base_tokenizer import BaseTokenizer 8 | from transformers import LlamaTokenizer as LlamaTokenizerTransformers 9 | 10 | class LlamaTokenizerBase(BaseTokenizer): 11 | def from_pretrained(self, pretrained_model_name_or_path, *args, **kwargs): 12 | tokenizer = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs) 13 | tokenizer.bos_token_id = 1 14 | tokenizer.eos_token_id = 2 15 | return tokenizer 16 | 17 | LlamaTokenizer = LlamaTokenizerBase(LlamaTokenizerTransformers) 18 | -------------------------------------------------------------------------------- /model_center/tokenizer/opt_tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | # The current implementation is mainly to adapt the training framework of the Transformers toolkit, 4 | # and replace the original model implementation. 5 | # TODO we will change to our SAM implementation in the future, which will be a more efficient tokenizer 6 | 7 | from .base_tokenizer import BaseTokenizer 8 | from transformers import GPT2Tokenizer as transformers_OPTTokenizer 9 | 10 | OPTTokenizer = BaseTokenizer(transformers_OPTTokenizer) -------------------------------------------------------------------------------- /model_center/tokenizer/roberta_tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | # The current implementation is mainly to adapt the training framework of the Transformers toolkit, 4 | # and replace the original model implementation. 5 | # TODO we will change to our SAM implementation in the future, which will be a more efficient tokenizer 6 | 7 | from .base_tokenizer import BaseTokenizer 8 | from transformers import RobertaTokenizer as transformers_RobertaTokenizer 9 | 10 | RobertaTokenizer = BaseTokenizer(transformers_RobertaTokenizer) 11 | -------------------------------------------------------------------------------- /model_center/tokenizer/t5_tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | # The current implementation is mainly to adapt the training framework of the Transformers toolkit, 4 | # and replace the original model implementation. 
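LlamaTokenizerBase above post-processes the loaded Hugging Face tokenizer so that the bos/eos token ids are pinned to 1 and 2, the values used by the original LLaMA vocabulary, regardless of what the checkpoint's tokenizer config says. Usage matches the other wrappers (the path below is a placeholder):

    from model_center.tokenizer import LlamaTokenizer

    tokenizer = LlamaTokenizer.from_pretrained("path/to/llama-7b")   # local dir or ModelCenter identifier
    assert tokenizer.bos_token_id == 1 and tokenizer.eos_token_id == 2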
5 | # TODO we will change to our SAM implementation in the future, which will be a more efficient tokenizer 6 | 7 | from .base_tokenizer import BaseTokenizer 8 | from transformers import T5Tokenizer as transformers_T5Tokenizer 9 | 10 | T5Tokenizer = BaseTokenizer(transformers_T5Tokenizer) 11 | -------------------------------------------------------------------------------- /model_center/tools/run_preprocess.sh: -------------------------------------------------------------------------------- 1 | for ((i=$1; i<$2; i++)); do 2 | { 3 | python3 /mnt/sfs_turbo/hx/ModelCenter/src/tools/preprocess_cpm1_lm.py --uid $i 4 | } 5 | done 6 | 7 | -------------------------------------------------------------------------------- /model_center/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .print_utils import print_inspect 2 | from .net_utils import check_web_and_convert_path -------------------------------------------------------------------------------- /model_center/utils/net_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
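The tokenizer wrappers collected above are thin shims: each BaseTokenizer instance resolves the path or identifier with check_web_and_convert_path and then defers entirely to the corresponding Hugging Face class, so the object returned by from_pretrained is an ordinary transformers tokenizer. Typical usage, mirroring the test scripts:

    from model_center.tokenizer import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")   # returns a transformers BertTokenizer
    ids = tokenizer.encode("ModelCenter makes big models easy")
    print(tokenizer.decode(ids))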
15 | import os 16 | import requests 17 | import tqdm 18 | import bmtrain as bmt 19 | 20 | file_names = { 21 | 'config': ['config.json'], 22 | 'model': ['pytorch_model.pt'], 23 | 'tokenizer': ['vocab.json', 'vocab.txt', 'merges.txt', 'tokenizer.json', 'added_tokens.json', 'special_tokens_map.json', 'tokenizer_config.json', 'spiece.model', 'vocab.model'], 24 | } 25 | 26 | def download(path, url): 27 | req = requests.get(url, stream=True) 28 | try: 29 | os.makedirs(os.path.dirname(path), exist_ok=True) 30 | file = open(path, "wb") 31 | req.raise_for_status() 32 | print(f"download from web, cache will be save to: {path}") 33 | content_length = req.headers.get("Content-Length") 34 | total = int(content_length) if content_length is not None else None 35 | progress = tqdm.tqdm( 36 | unit="B", 37 | unit_scale=True, 38 | unit_divisor=1024, 39 | total=total, 40 | desc="Downloading", 41 | ) 42 | for chunk in req.iter_content(chunk_size=1024): 43 | if chunk: 44 | progress.update(len(chunk)) 45 | file.write(chunk) 46 | progress.close() 47 | file.close() 48 | except: 49 | file.close() 50 | os.remove(path) 51 | 52 | def check_web_and_convert_path(path, load_type): # TODO add hash 53 | if os.path.isdir(path): 54 | try: 55 | bmt.print_rank(f"load from local file: {path}") 56 | except: 57 | pass 58 | return path 59 | else: 60 | if bmt.rank() == 0: 61 | url = f"https://openbmb.oss-cn-hongkong.aliyuncs.com/model_center/{path}" 62 | try: 63 | requests.get(f'{url}/config.json', stream=True).raise_for_status() # use config.json to check if identifier is valid 64 | except: 65 | raise ValueError(f"'{path}' is not a valid model identifier") 66 | cache_path = os.path.expanduser(f"~/.cache/model_center/{path}") 67 | for name in file_names[load_type]: 68 | p = os.path.join(cache_path, name) 69 | if os.path.exists(p): 70 | bmt.print_rank(f"load from cache: {p}") 71 | else: 72 | if bmt.rank() == 0: 73 | download(p, f"{url}/{name}") 74 | else: 75 | cache_path = os.path.expanduser(f"~/.cache/model_center/{path}") 76 | try: 77 | bmt.synchronize() 78 | except: 79 | pass 80 | return cache_path 81 | 82 | -------------------------------------------------------------------------------- /model_center/utils/print_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The OpenBMB team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | import torch 16 | import bmtrain as bmt 17 | 18 | def print_inspect(model : torch.nn.Module, param_name : str, prefix : str = ''): 19 | """Inspect the model and print the summary of the parameters on rank 0. 20 | 21 | Args: 22 | model (torch.nn.Module): The model to be inspected. 23 | param_name (str): The name of the parameter to be inspected. The wildcard '*' can be used to match multiple parameters. 24 | prefix (str): The prefix of the parameter name. 
25 | 26 | Example: 27 | >>> from model_center.utils import print_inspect 28 | >>> print_inspect(model, "*.linear*") 29 | name shape max min std mean grad_std grad_mean 30 | ... 31 | 32 | """ 33 | bmt.print_rank( 34 | bmt.inspect.format_summary( 35 | bmt.inspect.inspect_model(model, param_name, prefix) 36 | ) 37 | ) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.10 2 | bmtrain 3 | transformers 4 | jieba -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import os 3 | 4 | def main(): 5 | setup( 6 | name='model-center', 7 | version='1.0.3', 8 | description="example codes for big models using bmtrain", 9 | author="Weilin Zhao", 10 | author_email="acha131441373@gmail.com", 11 | packages=find_packages(), 12 | url="https://github.com/OpenBMB/ModelCenter", 13 | install_requires=[ 14 | "bmtrain", 15 | "transformers>=4.28.0", 16 | "jieba", 17 | ], 18 | keywords="CPM, cuda, AI, model, transformer", 19 | license='Apache 2.0', 20 | ) 21 | 22 | if __name__ == '__main__': 23 | main() 24 | -------------------------------------------------------------------------------- /tests/test.sh: -------------------------------------------------------------------------------- 1 | MASTER_ADDR=localhost 2 | MASTER_PORT=12347 3 | NNODES=1 4 | NODE_RANK=0 5 | GPUS_PER_NODE=2 6 | 7 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 8 | --nnodes $NNODES \ 9 | --node_rank $NODE_RANK \ 10 | --master_addr $MASTER_ADDR \ 11 | --master_port $MASTER_PORT" 12 | 13 | # cd ../ 14 | # python3 setup.py install 15 | # cd - 16 | 17 | # python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} test_vit.py 18 | # python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} test_bert_pkv.py 19 | # python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} test_bert.py 20 | # python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} test_roberta.py 21 | # python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} test_t5.py 22 | # python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} test_t5v1_1.py 23 | # python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} test_flan_t5.py 24 | # python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} test_mt5.py 25 | # python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} test_gpt2.py 26 | # python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} test_gptj.py 27 | # python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} test_glm.py 28 | # python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} test_opt.py 29 | python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} test_llama.py 30 | -------------------------------------------------------------------------------- /tests/test_bert.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import torch 4 | import bmtrain as bmt 5 | 6 | from model_center.tokenizer import BertTokenizer 7 | from model_center.model import BertConfig, Bert 8 | 9 | from transformers import BertForMaskedLM as hugBert 10 | 11 | def main(): 12 | bmt.init_distributed() 13 | 14 | path = "bert-base-uncased" 15 | tokenizer = BertTokenizer.from_pretrained(path) 16 | config = BertConfig.from_pretrained(path) 17 | config.dropout_p = 0 18 | bmt_bert = Bert.from_pretrained(path, config=config) 19 | 20 | hug_bert = 
hugBert.from_pretrained(path).cuda().eval().half() 21 | 22 | for i in range(10): 23 | batch = 1 24 | max_encoder_length = 512 25 | input_ids = torch.randint(config.vocab_size, (batch, max_encoder_length,), dtype=torch.int32).cuda() 26 | length = torch.randint(max_encoder_length, (batch, ), dtype=torch.int32).cuda() 27 | attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device)[None, :].repeat(input_ids.shape[0], 1) < length[:, None] 28 | 29 | bmt_logits = bmt_bert(input_ids = input_ids, attention_mask = attention_mask, output_logits=True).logits 30 | hug_logits = hug_bert(input_ids = input_ids, attention_mask = attention_mask).logits 31 | b = bmt_logits*attention_mask[:,:,None] 32 | h = hug_logits*attention_mask[:,:,None] 33 | d = (h - b).abs() 34 | print(d.max()) 35 | b_emb=bmt_bert._modules['input_embedding'] 36 | h_emb=hug_bert._modules['bert']._modules['embeddings']._modules['word_embeddings'] 37 | emb_grad=[] 38 | def hook(name): 39 | def backward_hook(module, grad_input, grad_output): 40 | emb_grad.append(grad_output[0]) 41 | return backward_hook 42 | h_emb.register_full_backward_hook(hook("h")) 43 | b_emb.register_full_backward_hook(hook("b")) 44 | loss_func = torch.nn.CrossEntropyLoss() 45 | labels=torch.randint(config.vocab_size, (batch, max_encoder_length,), dtype=torch.long).cuda() 46 | loss1 = loss_func(b.view(-1,b.shape[-1]), labels.view(-1)) 47 | loss2 = loss_func(h.view(-1,h.shape[-1]), labels.view(-1)) 48 | loss1.backward() 49 | loss2.backward() 50 | if i>0: 51 | d_grad=(emb_grad[0]-emb_grad[1]).abs() 52 | print(d_grad.max()) 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /tests/test_bert_pkv.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import torch 4 | import bmtrain as bmt 5 | 6 | from model_center.tokenizer import BertTokenizer 7 | from model_center.model import BertConfig, Bert 8 | 9 | from transformers import BertModel as hugBert 10 | 11 | def main(): 12 | bmt.init_distributed() 13 | 14 | path = "bert-base-uncased" 15 | config = BertConfig.from_pretrained(path) 16 | config.dropout_p = 0 17 | bmt_bert = Bert.from_pretrained(path, config=config) 18 | 19 | cur_len = 0 20 | add_len = 8 21 | bmt_pkv = None 22 | hug_pkv = None 23 | 24 | input_ids_list = [] 25 | logits_list = [] 26 | attention_mask_all = None 27 | 28 | for _ in range(40): 29 | batch = 2 30 | input_ids = torch.randint(config.vocab_size, (batch, add_len,), dtype=torch.int32).cuda() 31 | attention_mask = torch.randint(2,(batch, add_len, add_len + cur_len), dtype=torch.int32).cuda() 32 | 33 | bmt_res = bmt_bert(input_ids = input_ids, attention_mask = attention_mask, use_cache = True, past_key_values = bmt_pkv) 34 | bmt_pkv = bmt_res.past_key_values 35 | bmt_logits = bmt_res.last_hidden_state 36 | 37 | input_ids_list.append(input_ids) 38 | logits_list.append(bmt_logits) 39 | if attention_mask_all is None: 40 | attention_mask_all = attention_mask 41 | else: 42 | attention_mask_all = torch.cat([attention_mask_all, torch.zeros(batch, cur_len, add_len).cuda()], dim=2) 43 | attention_mask_all = torch.cat([attention_mask_all, attention_mask], dim=1) 44 | 45 | cur_len += add_len 46 | 47 | input_ids = torch.cat(input_ids_list, dim=1) 48 | logits_pkv = torch.cat(logits_list, dim=1) 49 | logits = bmt_bert(input_ids = input_ids, attention_mask = attention_mask_all).last_hidden_state 50 | print((logits - logits_pkv).abs().max()) 51 | 52 | if __name__ == 
"__main__": 53 | main() 54 | -------------------------------------------------------------------------------- /tests/test_flan_t5.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import torch 4 | import bmtrain as bmt 5 | 6 | from model_center.tokenizer import T5Tokenizer 7 | from model_center.model import T5Config, T5 8 | 9 | from transformers import T5ForConditionalGeneration as hugT5 10 | 11 | import sys 12 | 13 | def main(): 14 | bmt.init_distributed() 15 | 16 | ver = "xl" 17 | 18 | path = f"flan-t5-{ver}" 19 | tokenizer = T5Tokenizer.from_pretrained(path) 20 | config = T5Config.from_pretrained(path) 21 | bmt_t5 = T5.from_pretrained(path) 22 | 23 | path = f"google/flan-t5-{ver}" 24 | hug_t5 = hugT5.from_pretrained(path).cuda() 25 | 26 | for _ in range(10): 27 | batch = 1 28 | max_encoder_length = 512 29 | max_decoder_length = 512 30 | input_ids = torch.randint(config.vocab_size, (batch, max_encoder_length,), dtype=torch.int32).cuda() 31 | length = torch.randint(max_encoder_length, (batch, ), dtype=torch.int32).cuda() 32 | decoder_input_ids = torch.randint(config.vocab_size, (batch, max_decoder_length,), dtype=torch.int32).cuda() 33 | decoder_length = torch.randint(max_decoder_length, (batch, ), dtype=torch.int32).cuda() 34 | attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device)[None, :].repeat(input_ids.shape[0], 1) < length[:, None] 35 | decoder_attention_mask = torch.arange(decoder_input_ids.shape[1], device=decoder_input_ids.device)[None, :].repeat(decoder_input_ids.shape[0], 1) < decoder_length[:, None] 36 | 37 | bmt_logits = bmt_t5(input_ids = input_ids, attention_mask = attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, output_logits=True).logits 38 | hug_logits = hug_t5(input_ids = input_ids, attention_mask = attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask).logits 39 | mask = decoder_attention_mask[:,:,None] 40 | b = bmt_logits * mask 41 | h = hug_logits * mask 42 | d = (h - b).abs() 43 | print(d.max()) 44 | 45 | if __name__ == "__main__": 46 | main() 47 | 48 | -------------------------------------------------------------------------------- /tests/test_gpt2.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import torch 4 | import bmtrain as bmt 5 | 6 | from model_center.tokenizer import GPT2Tokenizer 7 | from model_center.model import GPT2Config, GPT2 8 | from transformers import GPT2LMHeadModel as hugGPT2 9 | 10 | def main(): 11 | bmt.init_distributed() 12 | 13 | path = "gpt2-base" 14 | tokenizer = GPT2Tokenizer.from_pretrained(path) 15 | config = GPT2Config.from_pretrained(path) 16 | config.dropout_p = 0 17 | bmt_gpt2 = GPT2.from_pretrained(path, config=config) 18 | 19 | hug_gpt2 = hugGPT2.from_pretrained('gpt2').cuda().eval().half() 20 | def hook(name): 21 | def backward_hook(module, grad_input, grad_output): 22 | emb_grad[name]=grad_output[0] 23 | return backward_hook 24 | for i in range(10): 25 | batch = 1 26 | max_encoder_length = 512 27 | input_ids = torch.randint(config.vocab_size, (batch, max_encoder_length,), dtype=torch.int32).cuda() 28 | length = torch.randint(max_encoder_length, (batch, ), dtype=torch.int32).cuda() 29 | attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device)[None, :].repeat(input_ids.shape[0], 1) < length[:, None] 30 | 31 | bmt_logits = bmt_gpt2(input_ids = input_ids, attention_mask = 
attention_mask, output_logits=True).logits 32 | hug_logits = hug_gpt2(input_ids = input_ids, attention_mask = attention_mask).logits 33 | b = bmt_logits*attention_mask[:,:,None] 34 | h = hug_logits*attention_mask[:,:,None] 35 | d = (h - b).abs() 36 | print(d.max()) 37 | b_emb=bmt_gpt2._modules['input_embedding'] 38 | h_emb=hug_gpt2._modules['transformer']._modules['wte'] 39 | emb_grad={} 40 | h_emb.register_full_backward_hook(hook("h")) 41 | b_emb.register_full_backward_hook(hook("b")) 42 | loss_func = torch.nn.CrossEntropyLoss() 43 | labels=torch.randint(config.vocab_size, (batch, max_encoder_length,), dtype=torch.long).cuda() 44 | loss1 = loss_func(b.view(-1,b.shape[-1]), labels.view(-1)) 45 | loss2 = loss_func(h.view(-1,h.shape[-1]), labels.view(-1)) 46 | loss1.backward() 47 | loss2.backward() 48 | if i>0: 49 | d_grad=(emb_grad["h"]-emb_grad["b"]).abs() 50 | print(d_grad.max()) 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /tests/test_gpt_pkv.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import torch 4 | import bmtrain as bmt 5 | 6 | from model_center.tokenizer import BertTokenizer 7 | from model_center.model import BertConfig, Bert 8 | from transformers import BertModel as hugBert 9 | from model_center.model import GPT2Config, GPT2 10 | from transformers import GPT2Model as hugGPT2 11 | 12 | 13 | def main(): 14 | bmt.init_distributed() 15 | 16 | # path = "bert-base-uncased" 17 | # config = BertConfig.from_pretrained(path) 18 | # config.dropout_p = 0 19 | # bmt_bert = Bert.from_pretrained(path, config=config) 20 | # hug_bert = hugBert.from_pretrained(path).cuda().eval().half() 21 | 22 | path = "gpt2-base" 23 | config = GPT2Config.from_pretrained(path, use_cache = True) 24 | config.dropout_p = 0 25 | bmt_bert = GPT2.from_pretrained(path, config=config) 26 | hug_bert = hugGPT2.from_pretrained('gpt2').cuda().eval().half() 27 | 28 | cur_len = 0 29 | add_len = 1 30 | bmt_pkv = None 31 | hug_pkv = None 32 | 33 | input_ids_list = [] 34 | bmt_logits_list = [] 35 | hug_logits_list = [] 36 | 37 | for _ in range(100): 38 | 39 | batch = 2 40 | input_ids = torch.randint(config.vocab_size, (batch, add_len,), dtype=torch.int32).cuda() 41 | attention_mask = torch.ones((batch, add_len + cur_len, add_len), dtype=torch.int32).cuda() 42 | attention_mask_1 = torch.ones(((batch, add_len + cur_len)), dtype=torch.int32).cuda() 43 | 44 | bmt_res = bmt_bert(input_ids = input_ids, attention_mask = attention_mask, use_cache = True, past_key_values = bmt_pkv) 45 | bmt_pkv = bmt_res.past_key_values 46 | bmt_logits = bmt_res.last_hidden_state 47 | bmt_logits_list.append(bmt_logits) 48 | 49 | input_ids_list.append(input_ids) 50 | hug_res = hug_bert(input_ids = input_ids, attention_mask = attention_mask_1, use_cache = True, past_key_values = hug_pkv) 51 | hug_pkv = hug_res.past_key_values 52 | hug_logits = hug_res.last_hidden_state 53 | hug_logits_list.append(hug_logits) 54 | 55 | cur_len += add_len 56 | 57 | bmt_logits_pkv = torch.cat(bmt_logits_list, dim=1) 58 | hug_logits_pkv = torch.cat(hug_logits_list, dim=1) 59 | print((bmt_logits_pkv - hug_logits_pkv).abs().mean()) 60 | 61 | input_ids = torch.cat(input_ids_list, dim=1) 62 | logits = bmt_bert(input_ids = input_ids, attention_mask = torch.ones((2, cur_len), dtype=torch.int32).cuda()).last_hidden_state 63 | print((logits - bmt_logits_pkv).abs().mean()) 64 | 65 | if __name__ == "__main__": 66 | main() 67 | 
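# Note on test_gpt_pkv.py above: despite the bmt_bert / hug_bert names, both objects hold GPT-2
# models (the BERT setup is commented out). The loop feeds one token per step with use_cache=True
# and accumulates past_key_values; the first printed value compares the incremental ModelCenter
# and HuggingFace outputs, the second compares the incremental ModelCenter outputs against a
# single full-sequence forward pass. Both differences are expected to stay small (up to fp16
# rounding) when the key/value cache is handled consistently.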
-------------------------------------------------------------------------------- /tests/test_gptj.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import torch 4 | import bmtrain as bmt 5 | 6 | from model_center.tokenizer import GPTjTokenizer 7 | from model_center.model import GPTjConfig, GPTj 8 | 9 | from transformers import GPTJForCausalLM as hugGPTj 10 | 11 | def main(): 12 | bmt.init_distributed() 13 | 14 | tokenizer = GPTjTokenizer.from_pretrained("/yinxr/zwl/.cache/model_center/gptj-6b") 15 | config = GPTjConfig.from_pretrained("/yinxr/zwl/.cache/model_center/gptj-6b") 16 | config.dropout_p = 0 17 | bmt_gptj = GPTj.from_pretrained("/yinxr/zwl/.cache/model_center/gptj-6b") 18 | 19 | hug_gptj = hugGPTj.from_pretrained("/yinxr/zwl/.cache/transformer/EleutherAI/gpt-j-6B").cuda().eval().half() 20 | 21 | for _ in range(10): 22 | batch = 1 23 | max_encoder_length = 512 24 | input_ids = torch.randint(config.vocab_size, (batch, max_encoder_length,), dtype=torch.int32).cuda() 25 | length = torch.randint(max_encoder_length, (batch, ), dtype=torch.int32).cuda() 26 | attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device)[None, :].repeat(input_ids.shape[0], 1) < length[:, None] 27 | 28 | bmt_logits = bmt_gptj(input_ids = input_ids, attention_mask = attention_mask, output_logits=True).logits 29 | hug_logits = hug_gptj(input_ids = input_ids, attention_mask = attention_mask).logits 30 | b = (bmt_logits*attention_mask[:,:,None]) 31 | h = hug_logits*attention_mask[:,:,None] 32 | d = (h - b).abs() 33 | print(d.max()) 34 | 35 | if __name__ == "__main__": 36 | main() 37 | -------------------------------------------------------------------------------- /tests/test_llama.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import torch 4 | import bmtrain as bmt 5 | from model_center.model.config import LlamaConfig 6 | from model_center.model import Llama 7 | from model_center.tokenizer import LlamaTokenizer 8 | 9 | from transformers import LlamaForCausalLM 10 | from transformers import LlamaTokenizer as LlamaTokenizerHF 11 | 12 | def main(): 13 | # path = f"../results/llama-7b" 14 | # hf_path = f"../results/llama-7b-hf" 15 | # path = f"../results/llama-2-7b" 16 | # hf_path = f"../results/llama-2-7b-hf" 17 | path = f"../results/llama-2-13b" 18 | hf_path = f"../results/llama-2-13b-hf" 19 | 20 | tokenizer = LlamaTokenizer.from_pretrained(path) 21 | config = LlamaConfig.from_pretrained(path) 22 | bmt_llama = Llama.from_pretrained(path, config=config) 23 | hug_llama = LlamaForCausalLM.from_pretrained(hf_path).half().eval().cuda() 24 | 25 | for ith in range(1, 11): 26 | batch = 1 27 | max_encoder_length = ith * 16 28 | input_ids = torch.randint(config.vocab_size, (batch, max_encoder_length,), dtype=torch.int32).cuda() 29 | length = torch.randint(max_encoder_length, (batch, ), dtype=torch.int32).cuda() 30 | attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device)[None, :].repeat(input_ids.shape[0], 1) < length[:, None] 31 | 32 | bmt_logits = bmt_llama(input_ids = input_ids, attention_mask = attention_mask, output_logits=True).logits 33 | hug_logits = hug_llama(input_ids = input_ids, attention_mask = attention_mask).logits 34 | b = bmt_logits*attention_mask[:,:,None] 35 | h = hug_logits*attention_mask[:,:,None] 36 | d = (h - b).abs() 37 | if bmt.rank() == 0: 38 | print(d.max()) 39 | 40 | def generate(): 41 | # only one GPU is enough 42 | from 
model_center.generation.llama import LlamaBeamSearch, LlamaRandomSampling 43 | path = f"../results/llama-7b" 44 | 45 | tokenizer = LlamaTokenizer.from_pretrained(path) 46 | model = Llama.from_pretrained(path) 47 | 48 | beam_search = LlamaBeamSearch( 49 | model=model, 50 | tokenizer=tokenizer, 51 | ) 52 | random_search = LlamaRandomSampling( 53 | model=model, 54 | tokenizer=tokenizer, 55 | ) 56 | 57 | data_list = [ 58 | "Beijing is the capital of", 59 | "Steven Jobs", 60 | ] 61 | 62 | inference_results = beam_search.generate(data_list, max_length=100) 63 | print("beam search:") 64 | for res in inference_results: 65 | print(res) 66 | print("random sampling:") 67 | inference_results = random_search.generate(data_list, max_length=100) 68 | for res in inference_results: 69 | print(res) 70 | 71 | if __name__ == "__main__": 72 | bmt.init_distributed(seed=2333) 73 | main() 74 | # generate() 75 | -------------------------------------------------------------------------------- /tests/test_longformer.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from cgitb import lookup 4 | import torch 5 | import bmtrain as bmt 6 | from model_center.tokenizer import BertTokenizer 7 | from model_center.model import Longformer 8 | from transformers import LongformerForMaskedLM 9 | from transformers import BertForMaskedLM as hugBert 10 | 11 | import sys 12 | def main(): 13 | bmt.init_distributed() 14 | 15 | bmt_bert = Longformer.from_pretrained("lawformer") 16 | hug_bert = LongformerForMaskedLM.from_pretrained("thunlp/Lawformer").cuda() 17 | bmt_bert.eval() 18 | hug_bert.eval() 19 | b_emb=bmt_bert._modules['input_embedding'] 20 | h_emb=hug_bert._modules['longformer']._modules['embeddings']._modules['word_embeddings'] 21 | for i in range(1): 22 | batch = 1 23 | max_encoder_length = 2048 24 | input_ids = torch.randint(21128, (batch, max_encoder_length,), dtype=torch.int32).cuda() 25 | length = torch.randint(max_encoder_length, (batch, ), dtype=torch.int32).cuda() 26 | attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device)[None, :].repeat(input_ids.shape[0], 1) < length[:, None] 27 | global_attn = torch.zeros(input_ids.shape[1],device=input_ids.device).repeat(input_ids.shape[0], 1) 28 | global_attn[:,:100] = 1 29 | bmt_logits = bmt_bert(input_ids = input_ids, return_logits=True,attention_mask=attention_mask,global_attention_mask=global_attn) 30 | hug_logits = hug_bert(input_ids = input_ids,attention_mask=attention_mask,global_attention_mask=global_attn).logits 31 | b = bmt_logits*attention_mask[:,:,None] 32 | h = hug_logits*attention_mask[:,:,None] 33 | d = (h - b).abs() 34 | emb_grad={} 35 | print(d.max()) 36 | def hook(name): 37 | def backward_hook(module, grad_input, grad_output): 38 | emb_grad[name]=grad_output[0] 39 | return backward_hook 40 | h_emb.register_full_backward_hook(hook("h")) 41 | b_emb.register_full_backward_hook(hook("b")) 42 | loss_func = torch.nn.CrossEntropyLoss() 43 | labels=torch.randint(21128, (batch, max_encoder_length,), dtype=torch.long).cuda() 44 | loss1 = loss_func(b.view(-1,b.shape[-1]), labels.view(-1)) 45 | loss2 = loss_func(h.view(-1,h.shape[-1]), labels.view(-1)) 46 | loss1.backward() 47 | loss2.backward() 48 | if i>0: 49 | d_grad=(emb_grad["h"]-emb_grad["b"]).abs() 50 | print(d_grad.max()) 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /tests/test_mt5.py: 
-------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import torch 4 | import bmtrain as bmt 5 | 6 | from model_center.tokenizer import T5Tokenizer 7 | from model_center.model import T5Config, T5 8 | 9 | from transformers import MT5ForConditionalGeneration as hugT5 10 | 11 | import sys 12 | 13 | def main(): 14 | bmt.init_distributed() 15 | 16 | ver = "large" 17 | 18 | path = f"mt5-{ver}" 19 | tokenizer = T5Tokenizer.from_pretrained(path) 20 | config = T5Config.from_pretrained(path) 21 | bmt_t5 = T5.from_pretrained(path) 22 | 23 | path = f"google/mt5-{ver}" 24 | hug_t5 = hugT5.from_pretrained(path).cuda() 25 | 26 | for _ in range(10): 27 | batch = 1 28 | max_encoder_length = 512 29 | max_decoder_length = 512 30 | input_ids = torch.randint(config.vocab_size, (batch, max_encoder_length,), dtype=torch.int32).cuda() 31 | length = torch.randint(max_encoder_length, (batch, ), dtype=torch.int32).cuda() 32 | decoder_input_ids = torch.randint(config.vocab_size, (batch, max_decoder_length,), dtype=torch.int32).cuda() 33 | decoder_length = torch.randint(max_decoder_length, (batch, ), dtype=torch.int32).cuda() 34 | attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device)[None, :].repeat(input_ids.shape[0], 1) < length[:, None] 35 | decoder_attention_mask = torch.arange(decoder_input_ids.shape[1], device=decoder_input_ids.device)[None, :].repeat(decoder_input_ids.shape[0], 1) < decoder_length[:, None] 36 | 37 | bmt_logits = bmt_t5(input_ids = input_ids, attention_mask = attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, output_logits=True).logits 38 | hug_logits = hug_t5(input_ids = input_ids, attention_mask = attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask).logits 39 | mask = decoder_attention_mask[:,:,None] 40 | b = bmt_logits * mask 41 | h = hug_logits * mask 42 | d = (h - b).abs() 43 | print(d.max()) 44 | 45 | if __name__ == "__main__": 46 | main() 47 | -------------------------------------------------------------------------------- /tests/test_opt.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import torch 4 | import bmtrain as bmt 5 | 6 | from model_center.tokenizer import OPTTokenizer 7 | from model_center.model import OPTConfig, OPT 8 | from transformers import OPTForCausalLM as hugOPT 9 | 10 | def main(): 11 | bmt.init_distributed() 12 | 13 | ver = "2.7b" 14 | path = f"opt-{ver}" 15 | tokenizer = OPTTokenizer.from_pretrained(path) 16 | config = OPTConfig.from_pretrained(path) 17 | config.dropout_p = 0 18 | bmt_opt = OPT.from_pretrained(path, config=config) 19 | 20 | hug_opt = hugOPT.from_pretrained(f'opt-{ver}').cuda().eval().half() 21 | def hook(name): 22 | def backward_hook(module, grad_input, grad_output): 23 | emb_grad[name]=grad_output[0] 24 | return backward_hook 25 | emb_grad={} 26 | for i in range(10): 27 | batch = 1 28 | max_encoder_length = 512 29 | input_ids = torch.randint(config.vocab_size, (batch, max_encoder_length,), dtype=torch.int32).cuda() 30 | length = torch.randint(max_encoder_length, (batch, ), dtype=torch.int32).cuda() 31 | attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device)[None, :].repeat(input_ids.shape[0], 1) < length[:, None] 32 | 33 | bmt_logits = bmt_opt(input_ids = input_ids, attention_mask = attention_mask, output_logits=True).logits 34 | hug_logits = hug_opt(input_ids = input_ids, attention_mask = 
attention_mask).logits 35 | b = bmt_logits*attention_mask[:,:,None] 36 | h = hug_logits*attention_mask[:,:,None] 37 | d = (h - b).abs() 38 | print(d.max()) 39 | if i == 0: 40 | b_emb=bmt_opt._modules['input_embedding'] 41 | h_emb=hug_opt._modules['model']._modules['decoder']._modules['embed_tokens'] 42 | h_emb.register_full_backward_hook(hook("h")) 43 | b_emb.register_full_backward_hook(hook("b")) 44 | else: 45 | emb_grad.clear() 46 | loss_func = torch.nn.CrossEntropyLoss() 47 | labels = torch.randint(config.vocab_size, (batch, max_encoder_length,), dtype=torch.long).cuda() 48 | loss1 = loss_func(b.view(-1,b.shape[-1]), labels.view(-1)) 49 | loss2 = loss_func(h.view(-1,h.shape[-1]), labels.view(-1)) 50 | loss1.backward() 51 | loss2.backward() 52 | if i>0: 53 | d_grad=(emb_grad["h"]-emb_grad["b"]).abs() 54 | print(d_grad.max()) 55 | if __name__ == "__main__": 56 | main() 57 | -------------------------------------------------------------------------------- /tests/test_roberta.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import torch 4 | import bmtrain as bmt 5 | from model_center.model.config import RobertaConfig 6 | from model_center.model import Roberta 7 | from model_center.tokenizer import RobertaTokenizer 8 | 9 | from transformers import BertTokenizer, RobertaForMaskedLM as hugRoberta 10 | 11 | def main(): 12 | bmt.init_distributed() 13 | 14 | # path = "roberta-base" 15 | path = "roberta-large" 16 | tokenizer = RobertaTokenizer.from_pretrained(path) 17 | config = RobertaConfig.from_pretrained(path) 18 | config.dropout_p = 0 19 | bmt_roberta = Roberta.from_pretrained(path, config=config) 20 | 21 | hug_roberta = hugRoberta.from_pretrained(path).cuda().eval().half() 22 | 23 | for _ in range(10): 24 | batch = 1 25 | max_encoder_length = 512 26 | input_ids = torch.randint(config.vocab_size, (batch, max_encoder_length,), dtype=torch.int32).cuda() 27 | length = torch.randint(max_encoder_length, (batch, ), dtype=torch.int32).cuda() 28 | attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device)[None, :].repeat(input_ids.shape[0], 1) < length[:, None] 29 | 30 | bmt_logits = bmt_roberta(input_ids = input_ids, attention_mask = attention_mask, output_logits=True).logits 31 | hug_logits = hug_roberta(input_ids = input_ids, attention_mask = attention_mask).logits 32 | b = bmt_logits*attention_mask[:,:,None] 33 | h = hug_logits*attention_mask[:,:,None] 34 | d = (h - b).abs() 35 | print(d.max()) 36 | 37 | if __name__ == "__main__": 38 | main() 39 | -------------------------------------------------------------------------------- /tests/test_t5.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import torch 4 | import bmtrain as bmt 5 | 6 | from model_center.tokenizer import T5Tokenizer 7 | from model_center.model import T5Config, T5 8 | 9 | from transformers import T5ForConditionalGeneration as hugT5 10 | 11 | def main(): 12 | path = "t5-base" 13 | tokenizer = T5Tokenizer.from_pretrained(path) 14 | config = T5Config.from_pretrained(path) 15 | config.scale = True 16 | bmt_t5 = T5.from_pretrained(path, config=config) 17 | 18 | hug_t5 = hugT5.from_pretrained(path).cuda() 19 | 20 | for _ in range(10): 21 | batch = 1 22 | max_encoder_length = 512 23 | max_decoder_length = 512 24 | input_ids = torch.randint(config.vocab_size, (batch, max_encoder_length,), dtype=torch.int32).cuda() 25 | length = torch.randint(max_encoder_length, (batch, ), dtype=torch.int32).cuda() 26 | 
decoder_input_ids = torch.randint(config.vocab_size, (batch, max_decoder_length,), dtype=torch.int32).cuda() 27 | decoder_length = torch.randint(max_decoder_length, (batch, ), dtype=torch.int32).cuda() 28 | attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device)[None, :].repeat(input_ids.shape[0], 1) < length[:, None] 29 | decoder_attention_mask = torch.arange(decoder_input_ids.shape[1], device=decoder_input_ids.device)[None, :].repeat(decoder_input_ids.shape[0], 1) < decoder_length[:, None] 30 | 31 | bmt_logits = bmt_t5(input_ids = input_ids, attention_mask = attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, output_logits=True).logits 32 | hug_logits = hug_t5(input_ids = input_ids, attention_mask = attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask).logits 33 | b = bmt_logits*decoder_attention_mask[:,:,None] 34 | h = hug_logits*decoder_attention_mask[:,:,None] 35 | d = (h - b).abs() 36 | print(d.max()) 37 | print(h / b) 38 | 39 | def generate(): 40 | # only one GPU is enough 41 | from model_center.generation.t5 import T5BeamSearch, T5RandomSampling 42 | path = f"../results/t5-3b" 43 | 44 | tokenizer = T5Tokenizer.from_pretrained(path) 45 | model = T5.from_pretrained(path) 46 | model.config.scale = True 47 | 48 | beam_search = T5BeamSearch( 49 | model=model, 50 | tokenizer=tokenizer, 51 | ) 52 | random_search = T5RandomSampling( 53 | model=model, 54 | tokenizer=tokenizer, 55 | ) 56 | 57 | data_list = [ 58 | "Beijing is the capital of", 59 | "Steven Jobs is one of the", 60 | "translate English to German. English: The house is wonderful. German:", 61 | ] 62 | 63 | inference_results = beam_search.generate(data_list, max_length=100) 64 | print("beam search:") 65 | for res in inference_results: 66 | print(res) 67 | print("random sampling:") 68 | inference_results = random_search.generate(data_list, max_length=100) 69 | for res in inference_results: 70 | print(res) 71 | 72 | if __name__ == "__main__": 73 | bmt.init_distributed() 74 | # main() 75 | generate() 76 | -------------------------------------------------------------------------------- /tests/test_t5v1_1.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import torch 4 | import bmtrain as bmt 5 | 6 | from model_center.tokenizer import T5Tokenizer 7 | from model_center.model import T5Config, T5 8 | 9 | from transformers import T5ForConditionalGeneration as hugT5 10 | 11 | import sys 12 | 13 | def main(): 14 | bmt.init_distributed() 15 | 16 | ver = "large" 17 | 18 | path = f"t5-v1_1-{ver}" 19 | tokenizer = T5Tokenizer.from_pretrained(path) 20 | config = T5Config.from_pretrained(path) 21 | bmt_t5 = T5.from_pretrained(path) 22 | 23 | path = f"google/t5-v1_1-{ver}" 24 | hug_t5 = hugT5.from_pretrained(path).cuda() 25 | 26 | for _ in range(10): 27 | batch = 1 28 | max_encoder_length = 512 29 | max_decoder_length = 512 30 | input_ids = torch.randint(config.vocab_size, (batch, max_encoder_length,), dtype=torch.int32).cuda() 31 | length = torch.randint(max_encoder_length, (batch, ), dtype=torch.int32).cuda() 32 | decoder_input_ids = torch.randint(config.vocab_size, (batch, max_decoder_length,), dtype=torch.int32).cuda() 33 | decoder_length = torch.randint(max_decoder_length, (batch, ), dtype=torch.int32).cuda() 34 | attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device)[None, :].repeat(input_ids.shape[0], 1) < length[:, None] 35 | decoder_attention_mask = 
torch.arange(decoder_input_ids.shape[1], device=decoder_input_ids.device)[None, :].repeat(decoder_input_ids.shape[0], 1) < decoder_length[:, None] 36 | 37 | bmt_logits = bmt_t5(input_ids = input_ids, attention_mask = attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, output_logits=True).logits 38 | hug_logits = hug_t5(input_ids = input_ids, attention_mask = attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask).logits 39 | mask = decoder_attention_mask[:,:,None] 40 | b = bmt_logits * mask 41 | h = hug_logits * mask 42 | d = (h - b).abs() 43 | print(d.max()) 44 | 45 | if __name__ == "__main__": 46 | main() 47 | -------------------------------------------------------------------------------- /tests/test_vit.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import torch 4 | import bmtrain as bmt 5 | 6 | from model_center.model import ViT,VitConfig 7 | from transformers import ViTForImageClassification 8 | 9 | 10 | 11 | def main(): 12 | bmt.init_distributed() 13 | 14 | path = "vit-base_patch16_224" 15 | config = VitConfig.from_pretrained(path) 16 | config.dropout_p = 0 17 | bmt_vit = ViT.from_pretrained(path, config=config) 18 | hug_vit = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224').cuda().half() 19 | def hook(name): 20 | def backward_hook(module, grad_input, grad_output): 21 | emb_grad[name]=grad_output[0] 22 | return backward_hook 23 | for i in range(10): 24 | with torch.autograd.set_detect_anomaly(True): 25 | batch = 12 26 | # max_encoder_length = 512 27 | patch_size=224 28 | channel_size=3 29 | # inputs = torch.randn((1,3,224,224),dtype = torch.half).cuda().to(memory_format=torch.channels_last) 30 | inputs = torch.randn((batch,channel_size,patch_size,patch_size),dtype = torch.half).cuda().to(memory_format=torch.channels_last) 31 | 32 | bmt_logits = bmt_vit(inputs) 33 | hug_logits = hug_vit(inputs).logits 34 | b = bmt_logits 35 | h = hug_logits 36 | d = (h - b).abs() 37 | print(d.max()) 38 | b_emb=bmt_vit.patch_embed.proj 39 | h_emb=hug_vit.vit.embeddings.patch_embeddings.projection 40 | emb_grad={} 41 | h_emb.register_full_backward_hook(hook("h")) 42 | b_emb.register_full_backward_hook(hook("b")) 43 | loss_func = torch.nn.CrossEntropyLoss(ignore_index=-100) 44 | labels=torch.randint(1000, (batch,), dtype=torch.long).cuda() 45 | loss1 = loss_func(b, labels) 46 | loss2 = loss_func(h, labels) 47 | loss1.backward() 48 | loss2.backward() 49 | if i>0: 50 | d_grad=(emb_grad["h"]-emb_grad["b"]).abs() 51 | print(d_grad.max()) 52 | if __name__ == "__main__": 53 | main() 54 | -------------------------------------------------------------------------------- /transfer/hugGPT2_bmtrainGPT2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The OpenBMB team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from collections import OrderedDict 16 | import torch 17 | from tqdm import tqdm 18 | 19 | def main(): 20 | ver_layernum = [ 21 | ("base", 12), 22 | ("medium", 24), 23 | ("large", 36), 24 | ("xl", 48), 25 | ] 26 | ver, layernum = ver_layernum[0] 27 | inpath = f"../results/gpt2-{ver}-pytorch_model.bin" 28 | outpath = f"../results/GPT2-{ver}.pt" 29 | inp = torch.load(inpath) 30 | out = OrderedDict() 31 | out["input_embedding.weight"] = torch.cat([inp["wte.weight"], torch.zeros((1,inp["wte.weight"].shape[1]))], dim=0).contiguous() # original vocab size is an odd number 32 | out["position_embedding.weight"] = inp["wpe.weight"].contiguous() 33 | out["encoder.output_layernorm.weight"] = inp["ln_f.weight"].contiguous() 34 | out["encoder.output_layernorm.bias"] = inp["ln_f.bias"].contiguous() 35 | for i in range(layernum): 36 | prefix = f"encoder.layers.{i}" 37 | old_prefix = f"h.{i}" 38 | attn_size = inp[f"{old_prefix}.attn.c_attn.weight"].shape[0] 39 | out[f"{prefix}.self_att.layernorm_before_attention.weight"] = inp[f"{old_prefix}.ln_1.weight"].contiguous() 40 | out[f"{prefix}.self_att.layernorm_before_attention.bias"] = inp[f"{old_prefix}.ln_1.bias"].contiguous() 41 | out[f"{prefix}.self_att.self_attention.project_q.weight"] = inp[f"{old_prefix}.attn.c_attn.weight"][:, :attn_size].transpose(0,1).contiguous() 42 | out[f"{prefix}.self_att.self_attention.project_q.bias"] = inp[f"{old_prefix}.attn.c_attn.bias"][:attn_size].contiguous() 43 | out[f"{prefix}.self_att.self_attention.project_k.weight"] = inp[f"{old_prefix}.attn.c_attn.weight"][:, attn_size:2*attn_size].transpose(0,1).contiguous() 44 | out[f"{prefix}.self_att.self_attention.project_k.bias"] = inp[f"{old_prefix}.attn.c_attn.bias"][attn_size:2*attn_size].contiguous() 45 | out[f"{prefix}.self_att.self_attention.project_v.weight"] = inp[f"{old_prefix}.attn.c_attn.weight"][:, 2*attn_size:].transpose(0,1).contiguous() 46 | out[f"{prefix}.self_att.self_attention.project_v.bias"] = inp[f"{old_prefix}.attn.c_attn.bias"][2*attn_size:].contiguous() 47 | out[f"{prefix}.self_att.self_attention.attention_out.weight"] = inp[f"{old_prefix}.attn.c_proj.weight"].transpose(0,1).contiguous() 48 | out[f"{prefix}.self_att.self_attention.attention_out.bias"] = inp[f"{old_prefix}.attn.c_proj.bias"].contiguous() 49 | 50 | out[f"{prefix}.ffn.layernorm_before_ffn.weight"] = inp[f"{old_prefix}.ln_2.weight"].contiguous() 51 | out[f"{prefix}.ffn.layernorm_before_ffn.bias"] = inp[f"{old_prefix}.ln_2.bias"].contiguous() 52 | out[f"{prefix}.ffn.ffn.w_in.w.weight"] = inp[f"{old_prefix}.mlp.c_fc.weight"].transpose(0,1).contiguous() 53 | out[f"{prefix}.ffn.ffn.w_in.w.bias"] = inp[f"{old_prefix}.mlp.c_fc.bias"].contiguous() 54 | out[f"{prefix}.ffn.ffn.w_out.weight"] = inp[f"{old_prefix}.mlp.c_proj.weight"].transpose(0,1).contiguous() 55 | out[f"{prefix}.ffn.ffn.w_out.bias"] = inp[f"{old_prefix}.mlp.c_proj.bias"].contiguous() 56 | 57 | for k, v in out.items(): 58 | out[k] = out[k].half() 59 | 60 | torch.save(out, outpath) 61 | 62 | if __name__ == "__main__": 63 | main() 64 | -------------------------------------------------------------------------------- /transfer/hugGPTj_bmtrainGPTj.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The OpenBMB team. All rights reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from collections import OrderedDict 16 | import torch 17 | from tqdm import tqdm 18 | 19 | def main(): 20 | ver = "6b" 21 | layernum = 28 22 | inpath = f"../results/gptj-{ver}-pytorch_model.bin" 23 | outpath = f"../results/GPTj-{ver}.pt" 24 | inp = torch.load(inpath) 25 | out = OrderedDict() 26 | out["input_embedding.weight"] = inp["transformer.wte.weight"].contiguous() # original vocab size is an odd number 27 | out["output_projection.weight"] = inp["lm_head.weight"].contiguous() 28 | out["output_projection.bias"] = inp["lm_head.bias"].contiguous() 29 | out["encoder.output_layernorm.weight"] = inp["transformer.ln_f.weight"].contiguous() 30 | out["encoder.output_layernorm.bias"] = inp["transformer.ln_f.bias"].contiguous() 31 | for i in range(layernum): 32 | prefix = f"encoder.layers.{i}" 33 | old_prefix = f"transformer.h.{i}" 34 | # parallel, share the same layernorm 35 | out[f"{prefix}.self_att.layernorm_before_attention.weight"] = inp[f"{old_prefix}.ln_1.weight"].contiguous() 36 | out[f"{prefix}.self_att.layernorm_before_attention.bias"] = inp[f"{old_prefix}.ln_1.bias"].contiguous() 37 | out[f"{prefix}.ffn.layernorm_before_ffn.weight"] = inp[f"{old_prefix}.ln_1.weight"].contiguous() 38 | out[f"{prefix}.ffn.layernorm_before_ffn.bias"] = inp[f"{old_prefix}.ln_1.bias"].contiguous() 39 | 40 | out[f"{prefix}.self_att.self_attention.project_q.weight"] = inp[f"{old_prefix}.attn.q_proj.weight"].contiguous() 41 | out[f"{prefix}.self_att.self_attention.project_k.weight"] = inp[f"{old_prefix}.attn.k_proj.weight"].contiguous() 42 | out[f"{prefix}.self_att.self_attention.project_v.weight"] = inp[f"{old_prefix}.attn.v_proj.weight"].contiguous() 43 | out[f"{prefix}.self_att.self_attention.attention_out.weight"] = inp[f"{old_prefix}.attn.out_proj.weight"].contiguous() 44 | 45 | out[f"{prefix}.ffn.ffn.w_in.w.weight"] = inp[f"{old_prefix}.mlp.fc_in.weight"].contiguous() 46 | out[f"{prefix}.ffn.ffn.w_in.w.bias"] = inp[f"{old_prefix}.mlp.fc_in.bias"].contiguous() 47 | out[f"{prefix}.ffn.ffn.w_out.weight"] = inp[f"{old_prefix}.mlp.fc_out.weight"].contiguous() 48 | out[f"{prefix}.ffn.ffn.w_out.bias"] = inp[f"{old_prefix}.mlp.fc_out.bias"].contiguous() 49 | 50 | for k, v in out.items(): 51 | out[k] = out[k].half() 52 | 53 | torch.save(out, outpath) 54 | 55 | if __name__ == "__main__": 56 | main() 57 | -------------------------------------------------------------------------------- /transfer/hugLLaMa2_bmtrainLLaMa2.py: -------------------------------------------------------------------------------- 1 | from transformers import LlamaConfig 2 | import torch, os 3 | import json 4 | from collections import OrderedDict 5 | 6 | ver_layernum = [ 7 | # "7b", 8 | "13b", 9 | ] 10 | 11 | ver = ver_layernum[0] 12 | 13 | inpath = f"../results/llama-2-{ver}-hf" 14 | outpath = f"../results/llama-2-{ver}" 15 | 16 | hf_config = LlamaConfig.from_pretrained(inpath) 17 | config = { 18 | 'dim_model': hf_config.hidden_size, 19 | 'dim_ff': 
hf_config.intermediate_size, 20 | 'num_layers': hf_config.num_hidden_layers, 21 | 'num_heads': hf_config.num_attention_heads, 22 | 'num_heads_kv': hf_config.num_key_value_heads, 23 | 'dim_head': hf_config.hidden_size // hf_config.num_attention_heads, 24 | 'norm_eps': hf_config.rms_norm_eps, 25 | } 26 | with open(os.path.join(outpath, "config.json"), 'w') as f: 27 | json.dump(config, f) 28 | 29 | layernum = config['num_layers'] 30 | 31 | model_hf = OrderedDict() 32 | ckpt_num = None 33 | for name in os.listdir(inpath): 34 | if name.startswith("pytorch_model-") and name.endswith(".bin"): 35 | ckpt_num = int(name[-9:-4]) 36 | for i in range(1, ckpt_num + 1): 37 | part = torch.load(os.path.join(inpath, f"pytorch_model-{i:05d}-of-{ckpt_num:05d}.bin")) 38 | model_hf.update(part) 39 | 40 | out = OrderedDict() 41 | 42 | out["input_embedding.weight"] = model_hf['model.embed_tokens.weight'].contiguous() 43 | out["encoder.output_layernorm.weight"] = model_hf['model.norm.weight'].contiguous() 44 | out['output_projection.weight'] = model_hf['lm_head.weight'].contiguous() 45 | for lnum in range(layernum): 46 | hf_pfx = f"model.layers.{lnum}" 47 | bmt_pfx = f"encoder.layers.{lnum}" 48 | 49 | out[f"{bmt_pfx}.self_att.layernorm_before_attention.weight"] = model_hf[f"{hf_pfx}.input_layernorm.weight"].contiguous() 50 | 51 | out[f"{bmt_pfx}.self_att.self_attention.project_q.weight"] = model_hf[f"{hf_pfx}.self_attn.q_proj.weight"].contiguous() 52 | out[f"{bmt_pfx}.self_att.self_attention.project_k.weight"] = model_hf[f"{hf_pfx}.self_attn.k_proj.weight"].contiguous() 53 | out[f"{bmt_pfx}.self_att.self_attention.project_v.weight"] = model_hf[f"{hf_pfx}.self_attn.v_proj.weight"].contiguous() 54 | out[f"{bmt_pfx}.self_att.self_attention.attention_out.weight"] = model_hf[f"{hf_pfx}.self_attn.o_proj.weight"].contiguous() 55 | 56 | out[f"{bmt_pfx}.ffn.layernorm_before_ffn.weight"] = model_hf[f"{hf_pfx}.post_attention_layernorm.weight"].contiguous() 57 | 58 | out[f"{bmt_pfx}.ffn.ffn.w_in.w_0.weight"] = model_hf[f"{hf_pfx}.mlp.gate_proj.weight"].contiguous() 59 | out[f"{bmt_pfx}.ffn.ffn.w_in.w_1.weight"] = model_hf[f"{hf_pfx}.mlp.up_proj.weight"].contiguous() 60 | 61 | out[f"{bmt_pfx}.ffn.ffn.w_out.weight"] = model_hf[f"{hf_pfx}.mlp.down_proj.weight"].contiguous() 62 | 63 | 64 | for key in out: 65 | out[key] = out[key].half() 66 | 67 | if not os.path.exists(outpath): 68 | os.makedirs(outpath) 69 | torch.save(out, os.path.join(outpath, "pytorch_model.pt")) 70 | -------------------------------------------------------------------------------- /transfer/hugLLaMa_bmtrainLLaMa.py: -------------------------------------------------------------------------------- 1 | from transformers import LlamaConfig 2 | import torch, os 3 | import json 4 | from collections import OrderedDict 5 | 6 | ver_layernum = [ 7 | "7b", 8 | "13b", 9 | "30b", 10 | "65b", 11 | ] 12 | 13 | ver = ver_layernum[0] 14 | 15 | inpath = f"../results/llama-{ver}-hf" 16 | outpath = f"../results/llama-{ver}" 17 | 18 | hf_config = LlamaConfig.from_pretrained(inpath) 19 | config = { 20 | 'dim_model': hf_config.hidden_size, 21 | 'dim_ff': hf_config.intermediate_size, 22 | 'num_layers': hf_config.num_hidden_layers, 23 | 'num_heads': hf_config.num_attention_heads, 24 | 'dim_head': hf_config.hidden_size // hf_config.num_attention_heads, 25 | 'norm_eps': hf_config.rms_norm_eps, 26 | } 27 | with open(os.path.join(outpath, "config.json"), 'w') as f: 28 | json.dump(config, f) 29 | 30 | layernum = config['num_layers'] 31 | 32 | model_hf = OrderedDict() 33 | for i in range(1, 
layernum + 2): 34 | part = torch.load(os.path.join(inpath, f"pytorch_model-{i:05d}-of-000{layernum+1}.bin")) 35 | model_hf.update(part) 36 | 37 | out = OrderedDict() 38 | 39 | out["input_embedding.weight"] = model_hf['model.embed_tokens.weight'].contiguous() 40 | out["encoder.output_layernorm.weight"] = model_hf['model.norm.weight'].contiguous() 41 | out['output_projection.weight'] = model_hf['lm_head.weight'].contiguous() 42 | for lnum in range(layernum): 43 | hf_pfx = f"model.layers.{lnum}" 44 | bmt_pfx = f"encoder.layers.{lnum}" 45 | 46 | out[f"{bmt_pfx}.self_att.layernorm_before_attention.weight"] = model_hf[f"{hf_pfx}.input_layernorm.weight"].contiguous() 47 | 48 | out[f"{bmt_pfx}.self_att.self_attention.project_q.weight"] = model_hf[f"{hf_pfx}.self_attn.q_proj.weight"].contiguous() 49 | out[f"{bmt_pfx}.self_att.self_attention.project_k.weight"] = model_hf[f"{hf_pfx}.self_attn.k_proj.weight"].contiguous() 50 | out[f"{bmt_pfx}.self_att.self_attention.project_v.weight"] = model_hf[f"{hf_pfx}.self_attn.v_proj.weight"].contiguous() 51 | out[f"{bmt_pfx}.self_att.self_attention.attention_out.weight"] = model_hf[f"{hf_pfx}.self_attn.o_proj.weight"].contiguous() 52 | 53 | out[f"{bmt_pfx}.ffn.layernorm_before_ffn.weight"] = model_hf[f"{hf_pfx}.post_attention_layernorm.weight"].contiguous() 54 | 55 | out[f"{bmt_pfx}.ffn.ffn.w_in.w_0.weight"] = model_hf[f"{hf_pfx}.mlp.gate_proj.weight"].contiguous() 56 | out[f"{bmt_pfx}.ffn.ffn.w_in.w_1.weight"] = model_hf[f"{hf_pfx}.mlp.up_proj.weight"].contiguous() 57 | 58 | out[f"{bmt_pfx}.ffn.ffn.w_out.weight"] = model_hf[f"{hf_pfx}.mlp.down_proj.weight"].contiguous() 59 | 60 | 61 | for key in out: 62 | out[key] = out[key].half() 63 | 64 | if not os.path.exists(outpath): 65 | os.makedirs(outpath) 66 | torch.save(out, os.path.join(outpath, "pytorch_model.pt")) 67 | -------------------------------------------------------------------------------- /transfer/run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | WORKING_DIR=/mnt/sfs_turbo/hx/cpm3-pretrain/transfer 3 | cd ${WORKING_DIR} 4 | echo "Current working directory ${WORKING_DIR}" 5 | # python3 cpm1_oldffn2newffn.py 6 | # python3 cpm2_oldffn2newffn.py 7 | # python3 hugGPTj_bmtrainGPTj.py 8 | CMD="python3 cpm1_old2new.py" 9 | echo ${CMD} 10 | 11 | ${CMD} 2>&1 | tee /mnt/sfs_turbo/hx/cpm3-pretrain/logs/test-new.log --------------------------------------------------------------------------------