├── .github └── ISSUE_TEMPLATE │ ├── bug-report.md │ ├── feature_request.md │ └── new-model.md ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README-ZH.md ├── README.md ├── docs ├── Makefile ├── README.md ├── make.bat ├── requirements.txt └── source │ ├── _static │ ├── css │ │ └── custom.css │ ├── images │ │ └── logo.png │ └── js │ │ └── custom.js │ ├── api │ ├── block.rst │ └── module.rst │ ├── conf.py │ ├── index.md │ ├── model │ ├── bert.rst │ ├── cpm1.rst │ ├── cpm2.rst │ ├── gpt2.rst │ ├── gptj.rst │ └── t5.rst │ └── notes │ ├── benchmark.md │ ├── installation.md │ ├── pretrain_data.md │ ├── quickstart.md │ └── write_model.md ├── examples ├── bert │ ├── BoolQ.sh │ ├── CB.sh │ ├── COPA.sh │ ├── RTE.sh │ ├── WiC.sh │ └── finetune_bert.py ├── cpm1 │ ├── finetune_cpm1.py │ ├── finetune_cpm1.sh │ ├── pretrain_cpm1.py │ └── pretrain_cpm1.sh ├── cpm2 │ ├── finetune_cpm2.py │ ├── finetune_cpm2.sh │ ├── pretrain_cpm2.py │ └── pretrain_cpm2.sh ├── gpt2 │ ├── BoolQ.sh │ ├── CB.sh │ ├── COPA.sh │ ├── RTE.sh │ ├── WSC.sh │ ├── WiC.sh │ └── finetune_gpt2.py ├── gptj │ ├── BoolQ.sh │ ├── CB.sh │ ├── COPA.sh │ ├── RTE.sh │ ├── WSC.sh │ ├── WiC.sh │ └── finetune_gptj.py ├── llama │ ├── RTE.sh │ └── finetune_llama.py ├── mt5 │ ├── BoolQ.sh │ ├── CB.sh │ ├── COPA.sh │ ├── RTE.sh │ ├── WSC.sh │ ├── WiC.sh │ └── finetune_mt5.py ├── t5-v1_1 │ ├── BoolQ.sh │ ├── CB.sh │ ├── COPA.sh │ ├── RTE.sh │ ├── WSC.sh │ ├── WiC.sh │ └── finetune_t5-v1_1.py └── t5 │ ├── BoolQ.sh │ ├── CB.sh │ ├── COPA.sh │ ├── RTE.sh │ ├── SQuAD.sh │ ├── WSC.sh │ ├── WiC.sh │ ├── finetune_t5_squad.py │ ├── finetune_t5_superglue.py │ └── squad_metric.py ├── model_center ├── __init__.py ├── arguments.py ├── dataset │ ├── __init__.py │ ├── bertdataset │ │ ├── __init__.py │ │ └── superglue.py │ ├── cpm1 │ │ ├── __init__.py │ │ └── cpm1_dataset.py │ ├── cpm1dataset │ │ ├── __init__.py │ │ └── down_data.py │ ├── cpm2 │ │ ├── __init__.py │ │ └── dataset.py │ ├── cpm2dataset │ │ ├── __init__.py │ │ └── down_data.py │ ├── distributed_dataset.py │ ├── distributed_indexed.py │ ├── distributed_loader.py │ ├── gpt2dataset │ │ ├── __init__.py │ │ └── superglue.py │ ├── indexed.py │ ├── llamadataset │ │ ├── __init__.py │ │ └── superglue.py │ ├── t5dataset │ │ ├── __init__.py │ │ ├── squad.py │ │ └── superglue.py │ └── utils.py ├── generation │ ├── __init__.py │ ├── generation_utils.py │ ├── llama.py │ └── t5.py ├── layer │ ├── __init__.py │ ├── attention.py │ ├── blocks.py │ ├── conv.py │ ├── embedding.py │ ├── feedforward.py │ ├── layernorm.py │ ├── linear.py │ ├── position_embedding.py │ └── transformer.py ├── model │ ├── __init__.py │ ├── basemodel.py │ ├── bert.py │ ├── config │ │ ├── __init__.py │ │ ├── bert_config.py │ │ ├── config.py │ │ ├── cpm1_config.py │ │ ├── cpm2_config.py │ │ ├── cpm3_config.py │ │ ├── glm_config.py │ │ ├── gpt2_config.py │ │ ├── gptj_config.py │ │ ├── llama_config.py │ │ ├── longformer_config.py │ │ ├── opt_config.py │ │ ├── roberta_config.py │ │ ├── t5_config.py │ │ └── vit_config.py │ ├── cpm1.py │ ├── cpm2.py │ ├── cpm3.py │ ├── glm.py │ ├── gpt2.py │ ├── gptj.py │ ├── llama.py │ ├── longformer.py │ ├── opt.py │ ├── roberta.py │ ├── t5.py │ └── vit.py ├── tokenizer │ ├── __init__.py │ ├── base_tokenizer.py │ ├── bert_tokenizer.py │ ├── cpm1_tokenizer.py │ ├── cpm2_tokenizer.py │ ├── glm_tokenizer.py │ ├── gpt2_tokenizer.py │ ├── gptj_tokenizer.py │ ├── llama_tokenizer.py │ ├── opt_tokenizer.py │ ├── roberta_tokenizer.py │ └── t5_tokenizer.py ├── tools │ ├── indexed_dataset.py │ ├── preprocess_cpm1_lm.py │ └── 
run_preprocess.sh └── utils │ ├── __init__.py │ ├── net_utils.py │ └── print_utils.py ├── requirements.txt ├── setup.py ├── tests ├── test.sh ├── test_bert.py ├── test_bert_pkv.py ├── test_flan_t5.py ├── test_glm.py ├── test_gpt2.py ├── test_gpt_pkv.py ├── test_gptj.py ├── test_llama.py ├── test_longformer.py ├── test_mt5.py ├── test_opt.py ├── test_roberta.py ├── test_t5.py ├── test_t5v1_1.py └── test_vit.py └── transfer ├── hugFLANT5_bmtrainFLANT5.py ├── hugGPT2_bmtrainGPT2.py ├── hugGPTj_bmtrainGPTj.py ├── hugLLaMa2_bmtrainLLaMa2.py ├── hugLLaMa_bmtrainLLaMa.py ├── hugLongformer_bmtrainLongformer.py ├── hugMT5_bmtrainMT5.py ├── hugOPT_bmtrainOPT.py ├── hugRoBERTa_bmtrainRoBERTa.py ├── hugT5_bmtrainT5.py ├── hugT5v1_1_bmtrainT5v1_1.py └── run.sh /.github/ISSUE_TEMPLATE/bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[BUG]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | 12 | 13 | **Minimal steps to reproduce** 14 | 15 | 16 | **Expected behavior** 17 | 18 | 19 | **Screenshots** 20 | 21 | 22 | **Environment:** 23 | 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[FEATURE]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | 12 | 13 | **Describe the solution you'd like** 14 | 15 | 16 | **Describe alternatives you've considered** 17 | 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/new-model.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: New model 3 | about: Add a new model to this project 4 | title: "[MODEL] " 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Introduction** 11 | 12 | 13 | **Resources** 14 | 15 | Paper: 16 | 17 | Code: 18 | 19 | Author: 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | **/__pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | .DS_STORE 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | 133 | .vscode/ 134 | pretrain_data/ 135 | small_data/ 136 | large_data/ 137 | raw_data/ 138 | checkpoints/ 139 | results 140 | new_data 141 | down_data 142 | 143 | *.bin 144 | *.idx 145 | *.pt 146 | 147 | pretrain_data_raw/ 148 | data_log 149 | debug 150 | checkpoints/ 151 | 152 | src/BMTrain 153 | src/transformers 154 | logs 155 | debug.sh -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to ModelCenter 2 | 3 | We welcome everyone's effort to make the community and the package better. You are welcomed to propose an issue, make a pull request or help others in the community. All of the efforts are appreciated! 4 | 5 | There are many ways that you can contribute to ModelCenter: 6 | 7 | - ✉️ Submitting an issue. 8 | - ⌨️ Making a pull request. 9 | - 🤝 Serving the community. 10 | 11 | ## Submitting an issue 12 | You can submit an issue if you find bugs or require new features and enhancements. Here are some principles: 13 | 14 | 1. **Language.** It is better to write your issue in English so that more people can understand and help you more conveniently. 15 | 2. **Search.** It is a good habit to search existing issues using the search bar of GitHub. Make sure there are no duplicated or similar issues with yours and if yes, check their solutions first. 16 | 3. **Format.** It is also very helpful to write the issue with a good writing style. We provide templates of common types of issues and everyone is encouraged to use these templates. If the templates do not fit in your issue, feel free to open a blank one. 17 | 4. 
**Writing style.** Write your issues in clear and concise words. It is also important to provide enough details for others to help. For example in a bug report, it is better to provide your running environment and minimal lines of code to reproduce it. 18 | 19 | ## Making a pull request (PR) 20 | You can also write codes to contribute. The codes may include a bug fix, a new enhancement, or a new running example. Here we provide the steps to make a pull request: 21 | 22 | 1. **Combine the PR with an issue.** Make us and others know what you are going to work on. If your codes try to solve an existing issue, you should comment on the issue and make sure there are no others working on it. If you are proposing a new enhancement, submit an issue first and we can discuss it with you before you work on it. 23 | 24 | 2. **Fork the repository.** Fork the repository to your own GitHub space by clicking the "Fork" button. Then clone it on your disk and set the remote repo: 25 | ```git 26 | $ git clone https://github.com//ModelCenter.git 27 | $ cd ModelCenter 28 | $ git remote add upstream https://github.com/OpenBMB/ModelCenter.git 29 | ``` 30 | 31 | 3. **Write your code.** Change to a new branch to work on your modifications. 32 | ```git 33 | $ git checkout -b your-branch-name 34 | ``` 35 | You are encouraged to think up a meaningful and descriptive name for your branch. 36 | 37 | 4. **Make a pull request.** After you finish coding, you should first rebase your code and solve the conflicts with the remote codes: 38 | ```git 39 | $ git fetch upstream 40 | $ git rebase upstream/main 41 | ``` 42 | Then you can push your codes to your own repo: 43 | ```git 44 | $ git push -u origin your-branch-name 45 | ``` 46 | Finally, you can make the pull request from your GitHub repo and merge it with ours. Your codes will be merged into the main repo after our code review. 47 | 48 | 49 | ## Serving the community 50 | 51 | Besides submitting issues and PRs, you can also join our community and help others. Efforts like writing the documents, answering questions as well as discussing new features are appreciated and welcomed. It will also be helpful if you can post your opinions and feelings about using our package on social media. 52 | 53 | We are now developing a reward system and all your contributions will be recorded and rewarded in the future. 54 | 55 | 56 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # ModelCenter Documentation 2 | 3 | To build this doc locally, first 4 | 5 | ``` 6 | pip install -r docs/requirements.txt 7 | ``` 8 | 9 | then, 10 | 11 | ``` 12 | cd docs 13 | make html 14 | ``` 15 | 16 | Then open the generated `docs/build/html/index.html` in your local browser. -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.https://www.sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx>=4.0.0 2 | recommonmark 3 | sphinx_markdown_tables 4 | sphinx_rtd_theme>=0.3.0 5 | torch>=1.10 6 | transformers 7 | jieba -------------------------------------------------------------------------------- /docs/source/_static/css/custom.css: -------------------------------------------------------------------------------- 1 | a, 2 | .wy-menu-vertical header, 3 | .wy-menu-vertical p.caption, 4 | .wy-nav-top .fa-bars, 5 | .wy-menu-vertical a:hover, 6 | 7 | .rst-content code.literal, .rst-content tt.literal 8 | 9 | { 10 | color: #315EFE !important; 11 | } 12 | 13 | /* inspired by sphinx press theme */ 14 | .wy-menu.wy-menu-vertical li.toctree-l1.current > a { 15 | border-left: solid 8px #315EFE !important; 16 | border-top: none; 17 | border-bottom: none; 18 | } 19 | 20 | .wy-menu.wy-menu-vertical li.toctree-l1.current > ul { 21 | border-left: solid 8px #315EFE !important; 22 | } 23 | /* inspired by sphinx press theme */ 24 | 25 | .wy-nav-side { 26 | color: unset !important; 27 | background: unset !important; 28 | border-right: solid 1px #ccc !important; 29 | } 30 | 31 | .wy-side-nav-search, 32 | .wy-nav-top, 33 | .wy-menu-vertical li, 34 | .wy-menu-vertical li a:hover, 35 | .wy-menu-vertical li a 36 | { 37 | background: unset !important; 38 | } 39 | 40 | .wy-menu-vertical li.current a { 41 | border-right: unset !important; 42 | } 43 | 44 | .wy-side-nav-search div, 45 | .wy-menu-vertical a { 46 | color: #404040 !important; 47 | } 48 | 49 | .wy-menu-vertical button.toctree-expand { 50 | color: #333 !important; 51 | } 52 | 53 | .wy-nav-content { 54 | max-width: unset; 55 | } 56 | 57 | .rst-content { 58 | max-width: 900px; 59 | } 60 | 61 | .wy-nav-content .icon-home:before { 62 
| content: "Docs"; 63 | } 64 | 65 | .wy-side-nav-search .icon-home:before { 66 | content: ""; 67 | } 68 | 69 | dl.field-list { 70 | display: block !important; 71 | } 72 | 73 | dl.field-list > dt:after { 74 | content: "" !important; 75 | } 76 | 77 | :root { 78 | --dark-blue: #3260F7; 79 | --light-blue: rgba(194, 233, 248, 0.1) ; 80 | } 81 | 82 | dl.field-list > dt { 83 | display: table; 84 | padding-left: 6px !important; 85 | padding-right: 6px !important; 86 | margin-bottom: 4px !important; 87 | padding-bottom: 1px !important; 88 | background: var(--light-blue); 89 | border-left: solid 2px var(--dark-blue); 90 | } 91 | 92 | 93 | dl.py.class>dt 94 | { 95 | color: rgba(17, 16, 17, 0.822) !important; 96 | background: var(--light-blue) !important; 97 | border-top: solid 2px var(--dark-blue) !important; 98 | } 99 | 100 | dl.py.method>dt 101 | { 102 | background: var(--light-blue) !important; 103 | border-left: solid 2px var(--dark-blue) !important; 104 | } 105 | 106 | dl.py.attribute>dt, 107 | dl.py.property>dt 108 | { 109 | background: var(--light-blue) !important; 110 | border-left: solid 2px var(--dark-blue) !important; 111 | } 112 | 113 | .fa-plus-square-o::before, .wy-menu-vertical li button.toctree-expand::before, 114 | .fa-minus-square-o::before, .wy-menu-vertical li.current > a button.toctree-expand::before, .wy-menu-vertical li.on a button.toctree-expand::before 115 | { 116 | content: ""; 117 | } 118 | 119 | .rst-content .viewcode-back, 120 | .rst-content .viewcode-link 121 | { 122 | color:#58b5cc; 123 | font-size: 120%; 124 | } -------------------------------------------------------------------------------- /docs/source/_static/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ModelCenter/14490451e9a91675ef8816c64cf6304d509bce62/docs/source/_static/images/logo.png -------------------------------------------------------------------------------- /docs/source/_static/js/custom.js: -------------------------------------------------------------------------------- 1 | document.addEventListener("DOMContentLoaded", function(event) { 2 | document.querySelectorAll(".wy-menu.wy-menu-vertical > ul.current > li > a").forEach(a => a.addEventListener("click", e=>{ 3 | f = document.querySelector(".wy-menu.wy-menu-vertical > ul.current > li > ul") 4 | if (f.style.display=='none') { f.style.display='block'; } else f.style.display = 'none' 5 | })); 6 | document.querySelectorAll(".headerlink").forEach(a => a.text="\u{1F517}"); 7 | }); -------------------------------------------------------------------------------- /docs/source/api/block.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | block 3 | ======================= 4 | 5 | Encoder 6 | ------------------------------------ 7 | .. autoclass:: model_center.layer.Encoder 8 | :members: 9 | :show-inheritance: 10 | 11 | Decoder 12 | ------------------------------------ 13 | .. autoclass:: model_center.layer.Decoder 14 | :members: 15 | :show-inheritance: 16 | 17 | TransformerBlock 18 | ------------------------------------ 19 | .. autoclass:: model_center.layer.TransformerBlock 20 | :members: 21 | :show-inheritance: 22 | 23 | FFNBlock 24 | ------------------------------------ 25 | .. autoclass:: model_center.layer.FFNBlock 26 | :members: 27 | :show-inheritance: 28 | 29 | SelfAttentionBlock 30 | ------------------------------------ 31 | .. 
autoclass:: model_center.layer.SelfAttentionBlock 32 | :members: 33 | :show-inheritance: 34 | 35 | CrossAttentionBlock 36 | ------------------------------------ 37 | .. autoclass:: model_center.layer.CrossAttentionBlock 38 | :members: 39 | :show-inheritance: -------------------------------------------------------------------------------- /docs/source/api/module.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | module 3 | ======================= 4 | 5 | Linear 6 | ------------------------------------ 7 | .. autoclass:: model_center.layer.Linear 8 | :members: 9 | :show-inheritance: 10 | 11 | Embedding 12 | ------------------------------------ 13 | .. autoclass:: model_center.layer.Embedding 14 | :members: 15 | :show-inheritance: 16 | 17 | RelativePositionEmbedding 18 | ------------------------------------ 19 | .. autoclass:: model_center.layer.RelativePositionEmbedding 20 | :members: 21 | :show-inheritance: 22 | 23 | RotaryEmbedding 24 | ------------------------------------ 25 | .. autoclass:: model_center.layer.RotaryEmbedding 26 | :members: 27 | :show-inheritance: 28 | 29 | LayerNorm 30 | ------------------------------------ 31 | .. autoclass:: model_center.layer.LayerNorm 32 | :members: 33 | :show-inheritance: 34 | 35 | Attention 36 | ------------------------------------ 37 | .. autoclass:: model_center.layer.Attention 38 | :members: 39 | :show-inheritance: 40 | 41 | FeedForward 42 | ------------------------------------ 43 | .. autoclass:: model_center.layer.FeedForward 44 | :members: 45 | :show-inheritance: -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('../..')) 16 | 17 | import recommonmark 18 | from recommonmark.transform import AutoStructify 19 | from recommonmark.parser import CommonMarkParser 20 | 21 | 22 | 23 | # -- Project information ----------------------------------------------------- 24 | 25 | project = 'ModelCenter' 26 | copyright = '2022, OpenBMB' 27 | author = 'BMTrain Team' 28 | autodoc_mock_imports = ["bmtrain"] 29 | 30 | # -- General configuration --------------------------------------------------- 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones. 35 | extensions = [ 36 | 'sphinx.ext.autodoc', 37 | 'sphinx.ext.napoleon', 38 | 'sphinx.ext.mathjax', 39 | 'recommonmark', 40 | 'sphinx_markdown_tables', 41 | ] 42 | 43 | source_parsers = { 44 | '.md': CommonMarkParser, 45 | } 46 | 47 | source_suffix = ['.rst', '.md'] 48 | 49 | # Add any paths that contain templates here, relative to this directory. 50 | templates_path = ['_templates'] 51 | 52 | # The language for content autogenerated by Sphinx. 
Refer to documentation 53 | # for a list of supported languages. 54 | # 55 | # This is also used if you do content translation via gettext catalogs. 56 | # Usually you set "language" from the command line for these cases. 57 | # language = 'zh_CN' 58 | 59 | # List of patterns, relative to source directory, that match files and 60 | # directories to ignore when looking for source files. 61 | # This pattern also affects html_static_path and html_extra_path. 62 | exclude_patterns = [] 63 | 64 | 65 | # -- Options for HTML output ------------------------------------------------- 66 | 67 | # The theme to use for HTML and HTML Help pages. See the documentation for 68 | # a list of builtin themes. 69 | # 70 | html_theme = 'sphinx_rtd_theme' 71 | 72 | # Add any paths that contain custom static files (such as style sheets) here, 73 | # relative to this directory. They are copied after the builtin static files, 74 | # so a file named "default.css" will overwrite the builtin "default.css". 75 | html_static_path = ['_static'] 76 | #html_stype="css/custom.css" 77 | html_css_files=['css/custom.css' ] 78 | html_js_files= ['js/custom.js' ] 79 | add_module_names = True 80 | 81 | 82 | # At the bottom of conf.py 83 | def setup(app): 84 | app.add_config_value('recommonmark_config', { 85 | #'url_resolver': lambda url: github_doc_root + url, 86 | 'auto_toc_tree_section': 'Contents', 87 | 'enable_math': False, 88 | 'enable_inline_math': False, 89 | 'enable_eval_rst': True, 90 | }, True) 91 | app.add_transform(AutoStructify) 92 | -------------------------------------------------------------------------------- /docs/source/index.md: -------------------------------------------------------------------------------- 1 | # ModelCenter's Documentation 2 | 3 | ModelCenter implements PLMs (Pretrained Language Models) based on [BMTrain](https://bmtrain.readthedocs.io/en/latest/index.html>) backend. 4 | 5 | ## Main Advantages: 6 | 7 | - Low-Resource 8 | - Efficient 9 | - Extendable 10 | 11 | ```eval_rst 12 | .. toctree:: 13 | :maxdepth: 2 14 | :caption: GETTING STARTED 15 | 16 | notes/installation.md 17 | notes/quickstart.md 18 | notes/benchmark.md 19 | notes/write_model.md 20 | notes/pretrain_data.md 21 | 22 | .. toctree:: 23 | :maxdepth: 1 24 | :caption: Models 25 | 26 | model/bert.rst 27 | model/gpt2.rst 28 | model/gptj.rst 29 | model/t5.rst 30 | model/cpm1.rst 31 | model/cpm2.rst 32 | 33 | .. toctree:: 34 | :maxdepth: 2 35 | :caption: PACKAGE REFERENCE 36 | 37 | api/module.rst 38 | api/block.rst 39 | 40 | .. toctree:: 41 | :maxdepth: 2 42 | :caption: Advanced 43 | 44 | 45 | Indices and tables 46 | ================== 47 | 48 | * :ref:`genindex` 49 | 50 | ``` -------------------------------------------------------------------------------- /docs/source/model/bert.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | BERT 3 | ======================= 4 | 5 | `Bert `_ 6 | 7 | We currently support loading the following checkpoint via ``Bert.from_pretrained(identifier)`` 8 | 9 | - bert-base-cased 10 | - bert-base-uncased 11 | - bert-large-cased 12 | - bert-large-uncased 13 | - bert-base-chinese 14 | - bert-base-multilingual-cased 15 | 16 | BertConfig 17 | ------------------------------------ 18 | .. autoclass:: model_center.model.BertConfig 19 | :members: 20 | 21 | BertModel 22 | ------------------------------------ 23 | .. autoclass:: model_center.model.Bert 24 | :members: 25 | 26 | BertTokenizer 27 | ------------------------------------ 28 | .. 
class:: model_center.tokenizer.BertTokenizer 29 | 30 | The current implementation is mainly an alias to BertTokenizer of `Hugging Face Transformers `_. 31 | we will change to our SAM implementation in the future, which will be a more efficient tokenizer. -------------------------------------------------------------------------------- /docs/source/model/cpm1.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | CPM1 3 | ======================= 4 | 5 | `CPM1 `_ 6 | 7 | CPM1Config 8 | ------------------------------------ 9 | .. autoclass:: model_center.model.CPM1Config 10 | :members: 11 | 12 | CPM1Model 13 | ------------------------------------ 14 | .. autoclass:: model_center.model.CPM1 15 | :members: 16 | 17 | CPM1Tokenizer 18 | ------------------------------------ 19 | .. autoclass:: model_center.tokenizer.CPM1Tokenizer 20 | :members: -------------------------------------------------------------------------------- /docs/source/model/cpm2.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | CPM2 3 | ======================= 4 | 5 | `CPM2 `_ 6 | 7 | CPM2Config 8 | ------------------------------------ 9 | .. autoclass:: model_center.model.CPM2Config 10 | :members: 11 | 12 | CPM2Model 13 | ------------------------------------ 14 | .. autoclass:: model_center.model.CPM2 15 | :members: 16 | 17 | CPM2Tokenizer 18 | ------------------------------------ 19 | .. autoclass:: model_center.tokenizer.CPM2Tokenizer 20 | :members: -------------------------------------------------------------------------------- /docs/source/model/gpt2.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | GPT2 3 | ======================= 4 | 5 | `GPT2 `_ 6 | 7 | We currently support loading the following checkpoint via ``GPT2.from_pretrained(identifier)`` 8 | 9 | - gpt2-base 10 | - gpt2-medium 11 | - gpt2-large 12 | - gpt2-xl 13 | 14 | GPT2Config 15 | ------------------------------------ 16 | .. autoclass:: model_center.model.GPT2Config 17 | :members: 18 | 19 | GPT2Model 20 | ------------------------------------ 21 | .. autoclass:: model_center.model.GPT2 22 | :members: 23 | 24 | GPT2Tokenizer 25 | ------------------------------------ 26 | .. class:: model_center.tokenizer.GPT2Tokenizer 27 | 28 | The current implementation is mainly an alias to GPT2Tokenizer of `Hugging Face Transformers `_. 29 | we will change to our SAM implementation in the future, which will be a more efficient tokenizer. -------------------------------------------------------------------------------- /docs/source/model/gptj.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | GPT-j 3 | ======================= 4 | 5 | `GPTj `_ 6 | 7 | We currently support loading the following checkpoint via ``GPTj.from_pretrained(identifier)`` 8 | 9 | - gptj-6b 10 | 11 | GPTjConfig 12 | ------------------------------------ 13 | .. autoclass:: model_center.model.GPTjConfig 14 | :members: 15 | 16 | GPTjModel 17 | ------------------------------------ 18 | .. autoclass:: model_center.model.GPTj 19 | :members: 20 | 21 | GPTjTokenizer 22 | ------------------------------------ 23 | .. class:: model_center.tokenizer.GPTjTokenizer 24 | 25 | The current implementation is mainly an alias to AutoTokenizer of `Hugging Face Transformers `_. 
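For example, a minimal usage sketch (illustrative only; it assumes the tokenizer exposes the same ``from_pretrained(identifier)`` interface as ``GPTj.from_pretrained`` with the ``gptj-6b`` identifier listed above):

.. code-block:: python

    from model_center.tokenizer import GPTjTokenizer

    # Because the class is mainly an alias of Hugging Face's AutoTokenizer,
    # the returned object supports the usual encode/decode methods.
    tokenizer = GPTjTokenizer.from_pretrained("gptj-6b")
    token_ids = tokenizer.encode("ModelCenter makes big models trainable.")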
26 | We will change to our SAM implementation in the future, which will be a more efficient tokenizer. -------------------------------------------------------------------------------- /docs/source/model/t5.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | T5 3 | ======================= 4 | 5 | `T5 `_ 6 | 7 | We currently support loading the following checkpoints via ``T5.from_pretrained(identifier)`` 8 | 9 | - t5-small 10 | - t5-base 11 | - t5-large 12 | - t5-3b 13 | - t5-11b 14 | 15 | T5Config 16 | ------------------------------------ 17 | .. autoclass:: model_center.model.T5Config 18 | :members: 19 | 20 | T5Model 21 | ------------------------------------ 22 | .. autoclass:: model_center.model.T5 23 | :members: 24 | 25 | T5Tokenizer 26 | ------------------------------------ 27 | .. class:: model_center.tokenizer.T5Tokenizer 28 | 29 | The current implementation is mainly an alias to T5Tokenizer of `Hugging Face Transformers `_. 30 | We will change to our SAM implementation in the future, which will be a more efficient tokenizer.
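A minimal loading sketch (illustrative only; it assumes a BMTrain distributed context has already been initialized with ``bmtrain.init_distributed()``, and that the config and tokenizer classes accept the same identifiers as ``T5.from_pretrained``):

.. code-block:: python

    import bmtrain as bmt
    from model_center.model import T5, T5Config
    from model_center.tokenizer import T5Tokenizer

    # Assumed prerequisite: ModelCenter parameters are BMTrain distributed parameters.
    bmt.init_distributed(seed=0)

    config = T5Config.from_pretrained("t5-base")       # assumed to accept the identifiers above
    model = T5.from_pretrained("t5-base")
    tokenizer = T5Tokenizer.from_pretrained("t5-base")

    # The tokenizer is currently an alias of the Hugging Face T5Tokenizer,
    # so the usual encode/decode interface applies.
    token_ids = tokenizer.encode("translate English to German: Hello, world!")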
-------------------------------------------------------------------------------- /docs/source/notes/benchmark.md: -------------------------------------------------------------------------------- 1 | # Benchmark 2 | 3 | ## Comparison with Hugging Face Transformers 4 | 5 | ### Make Big Models trainable on consumer GPUs 6 | 7 | Tested on a 32GB V100 machine using `bert-large-uncased`, we achieve throughput comparable to Hugging Face Transformers with a much smaller GPU memory footprint. 8 | 9 | |repo|**max** batch size (#examples)|time (s)|throughput (#examples/s)| 10 | |-|-|-|-| 11 | |transformers|11|1.11|9.9| 12 | |transformers+fp16|14|0.53|26.4| 13 | |modelcenter|256|10.3|24.9| 14 | 15 | On a **single consumer GPU** (an 11GB 2080Ti), Hugging Face Transformers can no longer fit `bert-large-uncased` for training, but ModelCenter makes it possible. 16 | 17 | |repo|**max** batch size (#examples)| 18 | |-|-| 19 | |transformers|0| 20 | |transformers+fp16|0| 21 | |modelcenter|72| 22 | 23 | ### Make Huge Models train easily 24 | 25 | Tested on a 40GB A100 machine using `T5-11B`, we make it possible to train with a batch size of 16 on two GPUs. 26 | 27 | ## Comparison with DeepSpeed ZeRO 28 | 29 | See also [BMTrain's Performance](https://github.com/OpenBMB/BMTrain#performance) -------------------------------------------------------------------------------- /docs/source/notes/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | ## 1. From PyPI (Recommended) 4 | 5 | ```shell 6 | $ pip install model-center 7 | ``` 8 | 9 | ## 2. From Source 10 | 11 | ```shell 12 | $ git clone https://github.com/OpenBMB/ModelCenter.git 13 | $ cd ModelCenter 14 | $ pip install -r requirements.txt 15 | $ python3 setup.py install 16 | ``` 17 | -------------------------------------------------------------------------------- /docs/source/notes/write_model.md: -------------------------------------------------------------------------------- 1 | # How to write a new model 2 | 3 | ## Model Implementation 4 | 5 | We implement our models in `model_center/model`. 6 | 7 | We provide commonly used [modules](https://bmtrain.readthedocs.io/en/latest/api/module.html) in `model_center/layer`, such as `Linear`, `LayerNorm`, and `Embedding`, 8 | which are implemented based on [bmtrain.DistributedParameter](https://bmtrain.readthedocs.io/en/latest/api/bmtrain.html#bmtrain.DistributedParameter) 9 | and [bmtrain.DistributedModule](https://bmtrain.readthedocs.io/en/latest/api/bmtrain.html#bmtrain.DistributedModule) for distributed training support. 10 | 11 | We have also implemented common ways of combining modules in `model_center/layer`, namely [blocks](https://bmtrain.readthedocs.io/en/latest/api/block.html). 12 | For example, `SelfAttentionBlock` combines LayerNorm, Attention, and Add&Norm. 13 | Each block has diverse options, e.g., `FFNBlock` supports `gated_relu`, `relu`, `gated_gelu`, and `gelu`; blocks support both pre-layernorm and post-layernorm. 14 | 15 | With the help of these commonly used modules, a new model can be written easily and with few special cases: you just add the model-specific features on top of the common structure. 16 | 17 | A classic transformer is implemented in the following structure: 18 | 19 | We use [bmtrain.CheckpointBlock](https://bmtrain.readthedocs.io/en/latest/api/bmtrain.html#bmtrain.CheckpointBlock) and 20 | [bmtrain.TransformerBlockList](https://bmtrain.readthedocs.io/en/latest/api/bmtrain.html#bmtrain.TransformerBlockList) to wrap our transformer blocks. 21 | These reduce GPU memory usage by a great amount without adding much computation time. 22 | For more information, see [BMTrain's Quick Start](https://bmtrain.readthedocs.io/en/latest/notes/quickstart-zh.html) 23 | 24 | ``` 25 | T5( 26 | (input_embedding): Embedding() 27 | (position_bias_enc): RelativePositionEmbedding() 28 | (position_bias_dec): RelativePositionEmbedding() 29 | (encoder): Encoder( 30 | (layers): bmtrain.TransformerBlockList( 31 | (0): bmtrain.CheckpointBlock( 32 | TransformerBlock( 33 | (self_att): SelfAttentionBlock( 34 | (layernorm_before_attention): LayerNorm() 35 | (attention): Attention( 36 | (project_q): Linear() 37 | (project_k): Linear() 38 | (project_v): Linear() 39 | (attention_out): Linear() 40 | ) 41 | ) 42 | (ffn): FFNBlock( 43 | (layernorm_before_ffn): LayerNorm() 44 | (ffn): FeedForward( 45 | (w_in): DenseACT( 46 | (w): Linear() 47 | (act): ReLU() 48 | ) 49 | (w_out): Linear() 50 | ) 51 | ) 52 | ) 53 | ) 54 | (1): bmtrain.CheckpointBlock() 55 | . 56 | . 57 | . 58 | ) 59 | (output_layernorm): LayerNorm() 60 | ) 61 | (decoder): Decoder( 62 | (layers): bmtrain.TransformerBlockList( 63 | (0): bmtrain.CheckpointBlock( 64 | (self_att): SelfAttentionBlock( 65 | (layernorm_before_attention): LayerNorm() 66 | (attention): Attention( 67 | (project_q): Linear() 68 | (project_k): Linear() 69 | (project_v): Linear() 70 | (attention_out): Linear() 71 | ) 72 | ) 73 | (cross_att): CrossAttentionBlock( 74 | (layernorm_before_attention): LayerNorm() 75 | (attention): Attention( 76 | (project_q): Linear() 77 | (project_k): Linear() 78 | (project_v): Linear() 79 | (attention_out): Linear() 80 | ) 81 | ) 82 | (ffn): FFNBlock( 83 | (layernorm_before_ffn): LayerNorm() 84 | (ffn): FeedForward( 85 | (w_in): DenseACT( 86 | (w): Linear() 87 | (act): ReLU() 88 | ) 89 | (w_out): Linear() 90 | ) 91 | ) 92 | ) 93 | (1): bmtrain.CheckpointBlock() 94 | . 95 | . 96 | . 97 | ) 98 | (output_layernorm): LayerNorm() 99 | ) 100 | (output_projection): Linear( 101 | (weight): bmtrain.DistributedParameter() 102 | (bias): bmtrain.DistributedParameter() 103 | ) 104 | ) 105 | ``` 106 | 107 | ## Model Config 108 | 109 | We add model configs in `model_center/model/config`. 110 | 111 | By inheriting from `model_center.config.Config`, a config class can parse JSON files with the `config.from_json_file(path)` method; 112 | the parsed values are then saved to the config object and passed to the model when it is instantiated with `model(config)`.
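Putting the two parts together, here is a minimal sketch of what a new model might look like (illustrative only; the constructor arguments of the `model_center.layer` modules and the config field names are assumptions, so check the classes under `model_center/layer` and `model_center/model/config` for the real signatures):

```python
import bmtrain as bmt
from model_center.layer import Embedding, Encoder, LayerNorm, Linear


class TinyEncoderModel(bmt.DistributedModule):
    """A toy encoder-only model assembled from ModelCenter's common modules."""

    def __init__(self, config):
        super().__init__()
        # Common structure: embedding -> encoder stack -> task-specific head.
        self.input_embedding = Embedding(config.vocab_size, config.dim_model)
        self.encoder = Encoder(
            num_layers=config.num_layers,   # field names assumed for illustration
            dim_model=config.dim_model,
            dim_ff=config.dim_ff,
            num_heads=config.num_heads,
            dim_head=config.dim_head,
        )
        self.output_layernorm = LayerNorm(config.dim_model)
        # Model-specific feature added on top of the common structure.
        self.classifier = Linear(config.dim_model, 2)

    def forward(self, input_ids, attention_mask):
        hidden = self.input_embedding(input_ids)
        hidden = self.encoder(hidden, attention_mask)
        hidden = self.output_layernorm(hidden)
        return self.classifier(hidden[:, 0, :])


# Hypothetical usage, assuming bmtrain.init_distributed() has been called and
# TinyEncoderConfig inherits from the Config class described in Model Config above:
#   config = TinyEncoderConfig.from_json_file("tiny_encoder.json")
#   model = TinyEncoderModel(config)
```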
-------------------------------------------------------------------------------- /examples/bert/BoolQ.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=1 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="bert-large-cased" 17 | DATASET="BoolQ" 18 | 19 | OPTS="" 20 | OPTS+=" --model-config ${VERSION}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --dataset_name ${DATASET}" 23 | OPTS+=" --batch-size 64" 24 | OPTS+=" --lr 0.00001" 25 | OPTS+=" --max-encoder-length 512" 26 | OPTS+=" --train-iters 1400" 27 | OPTS+=" --lr-decay-style constant" 28 | OPTS+=" --weight-decay 1e-2" 29 | OPTS+=" --clip-grad 10.0" 30 | OPTS+=" --loss-scale 128" 31 | 32 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/bert/finetune_bert.py ${OPTS}" 33 | echo ${CMD} 34 | 35 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/bert_superglue/finetune-${VERSION}-${DATASET}.log -------------------------------------------------------------------------------- /examples/bert/CB.sh: -------------------------------------------------------------------------------- 1 | #!
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12347 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=1 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="bert-large-cased" 17 | DATASET="CB" 18 | 19 | OPTS="" 20 | OPTS+=" --model-config ${VERSION}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --dataset_name ${DATASET}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --lr 0.00001" 25 | OPTS+=" --max-encoder-length 512" 26 | OPTS+=" --train-iters 400" 27 | OPTS+=" --lr-decay-style constant" 28 | OPTS+=" --warmup-iters 40" 29 | OPTS+=" --weight-decay 2e-3" 30 | OPTS+=" --clip-grad 1.0" 31 | OPTS+=" --loss-scale 128" 32 | 33 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/bert/finetune_bert.py ${OPTS}" 34 | echo ${CMD} 35 | 36 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/bert_superglue/finetune-${VERSION}-${DATASET}.log -------------------------------------------------------------------------------- /examples/bert/COPA.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=1 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="bert-large-cased" 17 | DATASET="COPA" 18 | 19 | OPTS="" 20 | OPTS+=" --model-config ${VERSION}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --dataset_name ${DATASET}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --lr 0.00001" 25 | OPTS+=" --max-encoder-length 512" 26 | OPTS+=" --train-iters 400" 27 | OPTS+=" --lr-decay-style constant" 28 | OPTS+=" --warmup-iters 40" 29 | OPTS+=" --weight-decay 1e-2" 30 | OPTS+=" --clip-grad 1.0" 31 | OPTS+=" --loss-scale 128" 32 | 33 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/bert/finetune_bert.py ${OPTS}" 34 | echo ${CMD} 35 | 36 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/bert_superglue/finetune-${VERSION}-${DATASET}.log -------------------------------------------------------------------------------- /examples/bert/RTE.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=1 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="bert-large-cased" 17 | DATASET="RTE" 18 | 19 | OPTS="" 20 | OPTS+=" --model-config ${VERSION}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --dataset_name ${DATASET}" 23 | OPTS+=" --batch-size 64" 24 | OPTS+=" --warmup-iters 40" 25 | OPTS+=" --lr 0.00005" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --train-iters 400" 28 | OPTS+=" --lr-decay-style constant" 29 | OPTS+=" --weight-decay 1e-2" 30 | OPTS+=" --clip-grad 10.0" 31 | OPTS+=" --loss-scale 128" 32 | 33 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/bert/finetune_bert.py ${OPTS}" 34 | echo ${CMD} 35 | 36 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/bert_superglue/finetune-${VERSION}-${DATASET}.log 37 | -------------------------------------------------------------------------------- /examples/bert/WiC.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=1 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="bert-large-cased" 17 | DATASET="WiC" 18 | 19 | OPTS="" 20 | OPTS+=" --model-config ${VERSION}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --dataset_name ${DATASET}" 23 | OPTS+=" --batch-size 64" 24 | OPTS+=" --warmup-iters 100" 25 | OPTS+=" --lr 0.00001" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --train-iters 400" 28 | OPTS+=" --lr-decay-style constant" 29 | OPTS+=" --weight-decay 1e-2" 30 | OPTS+=" --clip-grad 10.0" 31 | OPTS+=" --loss-scale 128" 32 | 33 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/bert/finetune_bert.py ${OPTS}" 34 | echo ${CMD} 35 | 36 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/bert_superglue/finetune-${VERSION}-${DATASET}.log -------------------------------------------------------------------------------- /examples/cpm1/finetune_cpm1.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=8 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | DATASET="LCQMC" 17 | 18 | OPTS="" 19 | OPTS+=" --dataset ${DATASET}" 20 | OPTS+=" --base-path ${BASE_PATH}" 21 | OPTS+=" --model-config cpm1-large" 22 | OPTS+=" --batch-size 64" 23 | OPTS+=" --train-iters 3000" 24 | OPTS+=" --save-iters 1000" 25 | OPTS+=" --max-length 256" 26 | OPTS+=" --save ${BASE_PATH}/results" 27 | OPTS+=" --save-name finetune-cpm1-ckpt" 28 | OPTS+=" --lr 0.02" 29 | OPTS+=" --inspect-iters 100" 30 | OPTS+=" --warmup-iters 200" 31 | OPTS+=" --lr-decay-style noam" 32 | OPTS+=" --weight-decay 1e-3" 33 | OPTS+=" --clip-grad 1.0" 34 | OPTS+=" --loss-scale 1048576" 35 | # OPTS+=" --load ${BASE_PATH}/results/cpm1-new.pt" 36 | 37 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/cpm1/finetune_cpm1.py ${OPTS}" 38 | echo ${CMD} 39 | 40 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/cpm1/${DATASET}.log 41 | -------------------------------------------------------------------------------- /examples/cpm1/pretrain_cpm1.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=8 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/mnt/sfs_turbo/hx/ModelCenter" 16 | 17 | OPTS="" 18 | OPTS+=" --base-path ${BASE_PATH}" 19 | OPTS+=" --model-config ${BASE_PATH}/configs/cpm1/cpm1-large" 20 | OPTS+=" --batch-size 64" 21 | OPTS+=" --train-iters 200000" 22 | OPTS+=" --save-iters 1000" 23 | OPTS+=" --save-name noam-1e-3-0.1-checkpoint" 24 | OPTS+=" --max-length 512" 25 | OPTS+=" --save ${BASE_PATH}/results" 26 | OPTS+=" --lr 0.1" 27 | OPTS+=" --inspect-iters 1000" 28 | OPTS+=" --warmup-iters 2000" 29 | OPTS+=" --lr-decay-style noam" 30 | OPTS+=" --weight-decay 0.001" 31 | OPTS+=" --clip-grad 1.0" 32 | OPTS+=" --loss-scale 1048576" 33 | OPTS+=" --start-step 0" 34 | # OPTS+=" --load ${BASE_PATH}/results/noam-1e-3-0.05-checkpoint-1000.pt" 35 | 36 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/cpm1/pretrain_cpm1.py ${OPTS}" 37 | echo ${CMD} 38 | 39 | if [[ $NODE_RANK == 0 ]]; then 40 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/cpm1-new.log 41 | else 42 | ${CMD} 43 | fi 44 | -------------------------------------------------------------------------------- /examples/cpm2/finetune_cpm2.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=8 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | DATASET="LCQMC" 17 | 18 | OPTS="" 19 | OPTS+=" --dataset ${DATASET}" 20 | OPTS+=" --base-path ${BASE_PATH}" 21 | OPTS+=" --model-config cpm2-large" 22 | OPTS+=" --batch-size 64" 23 | OPTS+=" --train-iters 3000" 24 | OPTS+=" --save-iters 1000" 25 | OPTS+=" --max-encoder-length 256" 26 | OPTS+=" --max-decoder-length 2" 27 | OPTS+=" --save ${BASE_PATH}/results" 28 | OPTS+=" --save-name finetune-cpm2-ckpt" 29 | OPTS+=" --lr 0.002" 30 | OPTS+=" --inspect-iters 100" 31 | OPTS+=" --warmup-iters 200" 32 | OPTS+=" --lr-decay-style noam" 33 | OPTS+=" --weight-decay 1e-2" 34 | OPTS+=" --clip-grad 10.0" 35 | OPTS+=" --loss-scale 1048576" 36 | # OPTS+=" --load ${BASE_PATH}/results/CPM2-0.25-0.005-checkpoint-110000.pt" 37 | 38 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/cpm2/finetune_cpm2.py ${OPTS}" 39 | echo ${CMD} 40 | 41 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/cpm2/${DATASET}.log 42 | -------------------------------------------------------------------------------- /examples/cpm2/pretrain_cpm2.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=8 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/mnt/sfs_turbo/ModelCenter" 16 | 17 | OPTS="" 18 | OPTS+=" --base-path ${BASE_PATH}" 19 | OPTS+=" --model-config ${BASE_PATH}/configs/cpm2/cpm2-large" 20 | OPTS+=" --batch-size 4" 21 | OPTS+=" --train-iters 3000" 22 | OPTS+=" --save-iters 500" 23 | OPTS+=" --save-name cpm2-checkpoint" 24 | OPTS+=" --max-encoder-length 512" 25 | OPTS+=" --max-decoder-length 256" 26 | OPTS+=" --save ${BASE_PATH}/results" 27 | OPTS+=" --lr 0.25" 28 | OPTS+=" --inspect-iters 100" 29 | OPTS+=" --warmup-iters 2000" 30 | OPTS+=" --lr-decay-style noam" 31 | OPTS+=" --weight-decay 5e-3" 32 | OPTS+=" --clip-grad 1.0" 33 | OPTS+=" --loss-scale 1048576" 34 | OPTS+=" --start-step 110000" 35 | # OPTS+=" --load ${BASE_PATH}/results/CPM2-0.25-0.005-checkpoint-110000.pt" 36 | 37 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/cpm2/pretrain_cpm2.py ${OPTS}" 38 | echo ${CMD} 39 | 40 | if [[ $NODE_RANK == 0 ]]; then 41 | ${CMD} | tee ${BASE_PATH}/logs/cpm2.log 42 | else 43 | ${CMD} 44 | fi 45 | -------------------------------------------------------------------------------- /examples/gpt2/BoolQ.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=1 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="base" 17 | DATASET="BoolQ" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config gpt2-${VERSION}" 23 | OPTS+=" --batch-size 64" 24 | OPTS+=" --train-iters 1400" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-decoder-length 512" 27 | OPTS+=" --save ${BASE_PATH}/results" 28 | OPTS+=" --save-name finetune-gpt2-ckpt" 29 | OPTS+=" --lr 0.00005" 30 | OPTS+=" --inspect-iters 100" 31 | OPTS+=" --warmup-iters 100" 32 | OPTS+=" --lr-decay-style constant" 33 | OPTS+=" --weight-decay 1e-2" 34 | OPTS+=" --clip-grad 10.0" 35 | OPTS+=" --loss-scale 128" 36 | # OPTS+=" --load ${BASE_PATH}/results/GPT2-${VERSION}.pt" 37 | 38 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/gpt2/finetune_gpt2.py ${OPTS}" 39 | echo ${CMD} 40 | 41 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/gpt2_superglue/finetune-gpt2-${VERSION}-${DATASET}.log 42 | -------------------------------------------------------------------------------- /examples/gpt2/CB.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=1 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="base" 17 | DATASET="CB" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config gpt2-${VERSION}" 23 | OPTS+=" --batch-size 8" 24 | OPTS+=" --train-iters 400" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-decoder-length 512" 27 | OPTS+=" --save ${BASE_PATH}/results" 28 | OPTS+=" --save-name finetune-gpt2-ckpt" 29 | OPTS+=" --lr 0.00005" 30 | OPTS+=" --inspect-iters 100" 31 | OPTS+=" --warmup-iters 100" 32 | OPTS+=" --lr-decay-style constant" 33 | OPTS+=" --weight-decay 1e-2" 34 | OPTS+=" --clip-grad 1.0" 35 | OPTS+=" --loss-scale 128" 36 | # OPTS+=" --load ${BASE_PATH}/results/GPT2-${VERSION}.pt" 37 | 38 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/gpt2/finetune_gpt2.py ${OPTS}" 39 | echo ${CMD} 40 | 41 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/gpt2_superglue/finetune-gpt2-${VERSION}-${DATASET}.log 42 | -------------------------------------------------------------------------------- /examples/gpt2/COPA.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=1 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="base" 17 | DATASET="COPA" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config gpt2-${VERSION}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --train-iters 900" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-decoder-length 512" 27 | OPTS+=" --save ${BASE_PATH}/results" 28 | OPTS+=" --save-name finetune-gpt2-ckpt" 29 | OPTS+=" --lr 0.00005" 30 | OPTS+=" --inspect-iters 100" 31 | OPTS+=" --warmup-iters 100" 32 | OPTS+=" --lr-decay-style constant" 33 | OPTS+=" --weight-decay 1e-2" 34 | OPTS+=" --clip-grad 1.0" 35 | OPTS+=" --loss-scale 128" 36 | # OPTS+=" --load ${BASE_PATH}/results/GPT2-${VERSION}.pt" 37 | 38 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/gpt2/finetune_gpt2.py ${OPTS}" 39 | echo ${CMD} 40 | 41 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/gpt2_superglue/finetune-gpt2-${VERSION}-${DATASET}.log 42 | -------------------------------------------------------------------------------- /examples/gpt2/RTE.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=1 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="base" 17 | DATASET="RTE" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config gpt2-${VERSION}" 23 | OPTS+=" --batch-size 64" 24 | OPTS+=" --train-iters 800" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-decoder-length 512" 27 | OPTS+=" --save ${BASE_PATH}/results" 28 | OPTS+=" --save-name finetune-gpt2-ckpt" 29 | OPTS+=" --lr 0.00005" 30 | OPTS+=" --inspect-iters 100" 31 | OPTS+=" --warmup-iters 100" 32 | OPTS+=" --lr-decay-style constant" 33 | OPTS+=" --weight-decay 1e-3" 34 | OPTS+=" --clip-grad 10.0" 35 | OPTS+=" --loss-scale 128" 36 | # OPTS+=" --load ${BASE_PATH}/results/GPT2-${VERSION}.pt" 37 | 38 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/gpt2/finetune_gpt2.py ${OPTS}" 39 | echo ${CMD} 40 | 41 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/gpt2_superglue/finetune-gpt2-${VERSION}-${DATASET}.log 42 | -------------------------------------------------------------------------------- /examples/gpt2/WSC.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=1 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="base" 17 | DATASET="WSC" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config gpt2-${VERSION}" 23 | OPTS+=" --batch-size 64" 24 | OPTS+=" --train-iters 700" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-decoder-length 512" 27 | OPTS+=" --save ${BASE_PATH}/results" 28 | OPTS+=" --save-name finetune-gpt2-ckpt" 29 | OPTS+=" --lr 0.00005" 30 | OPTS+=" --inspect-iters 100" 31 | OPTS+=" --warmup-iters 100" 32 | OPTS+=" --lr-decay-style constant" 33 | OPTS+=" --weight-decay 1e-2" 34 | OPTS+=" --clip-grad 1.0" 35 | OPTS+=" --loss-scale 128" 36 | # OPTS+=" --load ${BASE_PATH}/results/GPT2-${VERSION}.pt" 37 | 38 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/gpt2/finetune_gpt2.py ${OPTS}" 39 | echo ${CMD} 40 | 41 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/gpt2_superglue/finetune-gpt2-${VERSION}-${DATASET}.log 42 | -------------------------------------------------------------------------------- /examples/gpt2/WiC.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=1 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="base" 17 | DATASET="WiC" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config gpt2-${VERSION}" 23 | OPTS+=" --batch-size 64" 24 | OPTS+=" --train-iters 1500" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-decoder-length 512" 27 | OPTS+=" --save ${BASE_PATH}/results" 28 | OPTS+=" --save-name finetune-gpt2-ckpt" 29 | OPTS+=" --lr 0.00005" 30 | OPTS+=" --inspect-iters 100" 31 | OPTS+=" --warmup-iters 100" 32 | OPTS+=" --lr-decay-style constant" 33 | OPTS+=" --weight-decay 1e-2" 34 | OPTS+=" --clip-grad 1.0" 35 | OPTS+=" --loss-scale 128" 36 | # OPTS+=" --load ${BASE_PATH}/results/GPT2-${VERSION}.pt" 37 | 38 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/gpt2/finetune_gpt2.py ${OPTS}" 39 | echo ${CMD} 40 | 41 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/gpt2_superglue/finetune-gpt2-${VERSION}-${DATASET}.log 42 | -------------------------------------------------------------------------------- /examples/gptj/BoolQ.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=8 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/mnt/sfs_turbo/hx/ModelCenter" 16 | VERSION="6b" 17 | DATASET="BoolQ" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config gptj-${VERSION}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --train-iters 1400" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-decoder-length 512" 27 | OPTS+=" --save ${BASE_PATH}/results" 28 | OPTS+=" --save-name finetune-gptj-ckpt" 29 | OPTS+=" --lr 0.00001" 30 | OPTS+=" --inspect-iters 100" 31 | OPTS+=" --warmup-iters 140" 32 | OPTS+=" --lr-decay-style noam" 33 | OPTS+=" --weight-decay 1e-2" 34 | OPTS+=" --clip-grad 10.0" 35 | OPTS+=" --loss-scale 128" 36 | # OPTS+=" --load ${BASE_PATH}/results/GPTj-${VERSION}.pt" 37 | 38 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/gptj/finetune_gptj.py ${OPTS}" 39 | echo ${CMD} 40 | 41 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/gptj_superglue/finetune-gptj-${VERSION}-${DATASET}.log 42 | -------------------------------------------------------------------------------- /examples/gptj/CB.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=8 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/mnt/sfs_turbo/hx/ModelCenter" 16 | VERSION="6b" 17 | DATASET="CB" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config gptj-${VERSION}" 23 | OPTS+=" --batch-size 8" 24 | OPTS+=" --train-iters 400" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-decoder-length 512" 27 | OPTS+=" --save ${BASE_PATH}/results" 28 | OPTS+=" --save-name finetune-gptj-ckpt" 29 | OPTS+=" --lr 0.00001" 30 | OPTS+=" --inspect-iters 100" 31 | OPTS+=" --warmup-iters 40" 32 | OPTS+=" --lr-decay-style noam" 33 | OPTS+=" --weight-decay 1e-2" 34 | OPTS+=" --clip-grad 10.0" 35 | OPTS+=" --loss-scale 128" 36 | # OPTS+=" --load ${BASE_PATH}/results/GPTj-${VERSION}.pt" 37 | 38 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/gptj/finetune_gptj.py ${OPTS}" 39 | echo ${CMD} 40 | 41 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/gptj_superglue/finetune-gptj-${VERSION}-${DATASET}.log 42 | -------------------------------------------------------------------------------- /examples/gptj/COPA.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="6b" 17 | DATASET="COPA" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config gptj-${VERSION}" 23 | OPTS+=" --batch-size 1" 24 | OPTS+=" --train-iters 900" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-decoder-length 512" 27 | OPTS+=" --save ${BASE_PATH}/results" 28 | OPTS+=" --save-name finetune-gptj-ckpt" 29 | OPTS+=" --lr 0.00001" 30 | OPTS+=" --inspect-iters 100" 31 | OPTS+=" --warmup-iters 40" 32 | OPTS+=" --lr-decay-style noam" 33 | OPTS+=" --weight-decay 1e-2" 34 | OPTS+=" --clip-grad 1.0" 35 | OPTS+=" --loss-scale 128" 36 | # OPTS+=" --load ${BASE_PATH}/results/GPTj-${VERSION}.pt" 37 | 38 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/gptj/finetune_gptj.py ${OPTS}" 39 | echo ${CMD} 40 | 41 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/gptj_superglue/finetune-gptj-${VERSION}-${DATASET}.log 42 | -------------------------------------------------------------------------------- /examples/gptj/RTE.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=8 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/mnt/sfs_turbo/hx/ModelCenter" 16 | VERSION="6b" 17 | DATASET="RTE" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config gptj-${VERSION}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --train-iters 800" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-decoder-length 512" 27 | OPTS+=" --save ${BASE_PATH}/results" 28 | OPTS+=" --save-name finetune-gptj-ckpt" 29 | OPTS+=" --lr 0.00001" 30 | OPTS+=" --inspect-iters 100" 31 | OPTS+=" --warmup-iters 40" 32 | OPTS+=" --lr-decay-style noam" 33 | OPTS+=" --weight-decay 1e-2" 34 | OPTS+=" --clip-grad 10.0" 35 | OPTS+=" --loss-scale 128" 36 | # OPTS+=" --load ${BASE_PATH}/results/GPTj-${VERSION}.pt" 37 | 38 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/gptj/finetune_gptj.py ${OPTS}" 39 | echo ${CMD} 40 | 41 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/gptj_superglue/finetune-gptj-${VERSION}-${DATASET}.log 42 | -------------------------------------------------------------------------------- /examples/gptj/WSC.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=1 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="6b" 17 | DATASET="WSC" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config gptj-${VERSION}" 23 | OPTS+=" --batch-size 4" 24 | OPTS+=" --train-iters 700" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-decoder-length 512" 27 | OPTS+=" --save ${BASE_PATH}/results" 28 | OPTS+=" --save-name finetune-gptj-ckpt" 29 | OPTS+=" --lr 0.0001" 30 | OPTS+=" --inspect-iters 100" 31 | OPTS+=" --warmup-iters 30" 32 | OPTS+=" --lr-decay-style noam" 33 | OPTS+=" --weight-decay 1e-2" 34 | OPTS+=" --clip-grad 1.0" 35 | OPTS+=" --loss-scale 128" 36 | # OPTS+=" --load ${BASE_PATH}/results/GPTj-${VERSION}.pt" 37 | 38 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/gptj/finetune_gptj.py ${OPTS}" 39 | echo ${CMD} 40 | 41 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/gptj_superglue/finetune-gptj-${VERSION}-${DATASET}.log 42 | -------------------------------------------------------------------------------- /examples/gptj/WiC.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=1 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="6b" 17 | DATASET="WiC" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config gptj-${VERSION}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --train-iters 1500" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-decoder-length 512" 27 | OPTS+=" --save ${BASE_PATH}/results" 28 | OPTS+=" --save-name finetune-gptj-ckpt" 29 | OPTS+=" --lr 0.00001" 30 | OPTS+=" --inspect-iters 100" 31 | OPTS+=" --warmup-iters 70" 32 | OPTS+=" --lr-decay-style noam" 33 | OPTS+=" --weight-decay 1e-2" 34 | OPTS+=" --clip-grad 1.0" 35 | OPTS+=" --loss-scale 128" 36 | # OPTS+=" --load ${BASE_PATH}/results/GPTj-${VERSION}.pt" 37 | 38 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/gptj/finetune_gptj.py ${OPTS}" 39 | echo ${CMD} 40 | 41 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/gptj_superglue/finetune-gptj-${VERSION}-${DATASET}.log 42 | -------------------------------------------------------------------------------- /examples/llama/RTE.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=2 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/data/ModelCenter" 16 | VERSION="7b" 17 | DATASET="RTE" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config ${BASE_PATH}/results/llama-${VERSION}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --train-iters 1400" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-length 512" 27 | OPTS+=" --save ${BASE_PATH}/results" 28 | OPTS+=" --save-name finetune-llama-${DATASET}" 29 | OPTS+=" --lr 0.00001" 30 | OPTS+=" --inspect-iters 100" 31 | OPTS+=" --warmup-iters 140" 32 | OPTS+=" --lr-decay-style constant" 33 | OPTS+=" --weight-decay 1e-2" 34 | OPTS+=" --clip-grad 1.0" 35 | OPTS+=" --loss-scale 1048576" 36 | 37 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/llama/finetune_llama.py ${OPTS}" 38 | echo ${CMD} 39 | 40 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/llama_superglue/finetune-llama-${VERSION}-${DATASET}.log 41 | -------------------------------------------------------------------------------- /examples/mt5/BoolQ.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="xxl" 17 | DATASET="BoolQ" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-v1_1-${VERSION}" 23 | OPTS+=" --batch-size 64" 24 | OPTS+=" --train-iters 1400" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-v1_1-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 140" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 10.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/t5-v1_1-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5-v1_1/finetune_t5-v1_1.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5-v1_1_superglue/finetune-t5-v1_1-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/mt5/CB.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="xxl" 17 | DATASET="CB" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-v1_1-${VERSION}" 23 | OPTS+=" --batch-size 8" 24 | OPTS+=" --train-iters 400" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-v1_1-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 40" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 10.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/t5-v1_1-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5-v1_1/finetune_t5-v1_1.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5-v1_1_superglue/finetune-t5-v1_1-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/mt5/COPA.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="xxl" 17 | DATASET="COPA" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-v1_1-${VERSION}" 23 | OPTS+=" --batch-size 4" 24 | OPTS+=" --train-iters 900" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-v1_1-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 40" 33 | OPTS+=" --lr-decay-style noam" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 10.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/T5-v1_1-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5-v1_1/finetune_t5-v1_1.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5-v1_1_superglue/finetune-t5-v1_1-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/mt5/RTE.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="xxl" 17 | DATASET="RTE" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-v1_1-${VERSION}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --train-iters 800" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-v1_1-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 100" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 10.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/t5-v1_1-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5-v1_1/finetune_t5-v1_1.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5-v1_1_superglue/finetune-t5-v1_1-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/mt5/WSC.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="xxl" 17 | DATASET="WSC" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-v1_1-${VERSION}" 23 | OPTS+=" --batch-size 4" 24 | OPTS+=" --train-iters 700" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-v1_1-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 50" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 100.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/t5-v1_1-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5-v1_1/finetune_t5-v1_1.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5-v1_1_superglue/finetune-t5-v1_1-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/mt5/WiC.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="xxl" 17 | DATASET="WiC" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-v1_1-${VERSION}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --train-iters 1500" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-v1_1-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 70" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 100.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/t5-v1_1-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5-v1_1/finetune_t5-v1_1.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5-v1_1_superglue/finetune-t5-v1_1-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/t5-v1_1/BoolQ.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="xxl" 17 | DATASET="BoolQ" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-v1_1-${VERSION}" 23 | OPTS+=" --batch-size 64" 24 | OPTS+=" --train-iters 1400" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-v1_1-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 140" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 10.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/t5-v1_1-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5-v1_1/finetune_t5-v1_1.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5-v1_1_superglue/finetune-t5-v1_1-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/t5-v1_1/CB.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="xxl" 17 | DATASET="CB" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-v1_1-${VERSION}" 23 | OPTS+=" --batch-size 8" 24 | OPTS+=" --train-iters 400" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-v1_1-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 40" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 10.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/t5-v1_1-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5-v1_1/finetune_t5-v1_1.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5-v1_1_superglue/finetune-t5-v1_1-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/t5-v1_1/COPA.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="xxl" 17 | DATASET="COPA" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-v1_1-${VERSION}" 23 | OPTS+=" --batch-size 4" 24 | OPTS+=" --train-iters 900" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-v1_1-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 40" 33 | OPTS+=" --lr-decay-style noam" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 10.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/T5-v1_1-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5-v1_1/finetune_t5-v1_1.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5-v1_1_superglue/finetune-t5-v1_1-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/t5-v1_1/RTE.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="xxl" 17 | DATASET="RTE" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-v1_1-${VERSION}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --train-iters 800" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-v1_1-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 100" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 10.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/t5-v1_1-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5-v1_1/finetune_t5-v1_1.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5-v1_1_superglue/finetune-t5-v1_1-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/t5-v1_1/WSC.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="xxl" 17 | DATASET="WSC" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-v1_1-${VERSION}" 23 | OPTS+=" --batch-size 4" 24 | OPTS+=" --train-iters 700" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-v1_1-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 50" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 100.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/t5-v1_1-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5-v1_1/finetune_t5-v1_1.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5-v1_1_superglue/finetune-t5-v1_1-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/t5-v1_1/WiC.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="xxl" 17 | DATASET="WiC" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-v1_1-${VERSION}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --train-iters 1500" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-v1_1-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 70" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 100.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/t5-v1_1-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5-v1_1/finetune_t5-v1_1.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5-v1_1_superglue/finetune-t5-v1_1-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/t5/BoolQ.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="11b" 17 | DATASET="BoolQ" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-${VERSION}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --train-iters 1400" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 140" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 1.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/T5-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5/finetune_t5_superglue.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5_superglue/finetune-t5-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/t5/CB.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="11b" 17 | DATASET="CB" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-${VERSION}" 23 | OPTS+=" --batch-size 8" 24 | OPTS+=" --train-iters 400" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 40" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 1.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/T5-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5/finetune_t5_superglue.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5_superglue/finetune-t5-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/t5/COPA.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="11b" 17 | DATASET="COPA" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-${VERSION}" 23 | OPTS+=" --batch-size 4" 24 | OPTS+=" --train-iters 900" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 40" 33 | OPTS+=" --lr-decay-style noam" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 1.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/T5-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5/finetune_t5_superglue.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5_superglue/finetune-t5-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/t5/RTE.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="11b" 17 | DATASET="RTE" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-${VERSION}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --train-iters 800" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 100" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 10.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/T5-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5/finetune_t5_superglue.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5_superglue/finetune-t5-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/t5/SQuAD.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=2 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/data/ModelCenter" 16 | VERSION="3b" 17 | DATASET="SQuAD" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config ${BASE_PATH}/results/t5-${VERSION}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --train-iters 1400" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 32" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 140" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 1.0" 36 | OPTS+=" --loss-scale 128" 37 | 38 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5/finetune_t5_squad.py ${OPTS}" 39 | echo ${CMD} 40 | 41 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5_squad/finetune-t5-${VERSION}-${DATASET}.log 42 | -------------------------------------------------------------------------------- /examples/t5/WSC.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="11b" 17 | DATASET="WSC" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-${VERSION}" 23 | OPTS+=" --batch-size 4" 24 | OPTS+=" --train-iters 700" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-ckpt" 30 | OPTS+=" --lr 0.0001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 30" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 1.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/T5-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5/finetune_t5_superglue.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5_superglue/finetune-t5-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /examples/t5/WiC.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | MASTER_ADDR=localhost 4 | MASTER_PORT=12345 5 | NNODES=1 6 | NODE_RANK=0 7 | GPUS_PER_NODE=4 8 | 9 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 10 | --nnodes $NNODES \ 11 | --node_rank $NODE_RANK \ 12 | --master_addr $MASTER_ADDR \ 13 | --master_port $MASTER_PORT" 14 | 15 | BASE_PATH="/home/hx/ModelCenter" 16 | VERSION="11b" 17 | DATASET="WiC" 18 | 19 | OPTS="" 20 | OPTS+=" --dataset ${DATASET}" 21 | OPTS+=" --base-path ${BASE_PATH}" 22 | OPTS+=" --model-config t5-${VERSION}" 23 | OPTS+=" --batch-size 16" 24 | OPTS+=" --train-iters 1500" 25 | OPTS+=" --save-iters 1000" 26 | OPTS+=" --max-encoder-length 512" 27 | OPTS+=" --max-decoder-length 2" 28 | OPTS+=" --save ${BASE_PATH}/results" 29 | OPTS+=" --save-name finetune-t5-ckpt" 30 | OPTS+=" --lr 0.00001" 31 | OPTS+=" --inspect-iters 100" 32 | OPTS+=" --warmup-iters 70" 33 | OPTS+=" --lr-decay-style constant" 34 | OPTS+=" --weight-decay 1e-2" 35 | OPTS+=" --clip-grad 1.0" 36 | OPTS+=" --loss-scale 128" 37 | # OPTS+=" --load ${BASE_PATH}/results/T5-${VERSION}.pt" 38 | 39 | CMD="python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} ${BASE_PATH}/examples/t5/finetune_t5_superglue.py ${OPTS}" 40 | echo ${CMD} 41 | 42 | ${CMD} 2>&1 | tee ${BASE_PATH}/logs/t5_superglue/finetune-t5-${VERSION}-${DATASET}.log 43 | -------------------------------------------------------------------------------- /model_center/__init__.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from .arguments import get_args 4 | -------------------------------------------------------------------------------- /model_center/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .indexed import MMapIndexedDataset 2 | from .distributed_indexed import DistributedMMapIndexedDataset 3 | from .distributed_loader import DistributedDataLoader 4 | 5 | from .distributed_dataset import DistributedDataset, SimpleDataset, build_dataset 6 | from .utils import shuffle_dataset, compact_dataset, 
mask_dataset 7 | -------------------------------------------------------------------------------- /model_center/dataset/bertdataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .superglue import * 2 | 3 | DATASET = { 4 | "BoolQ": BoolQ_Dataset, 5 | "CB": CB_Dataset, 6 | "COPA": COPA_Dataset, 7 | "RTE": RTE_Dataset, 8 | "WiC": WiC_Dataset, 9 | "WSC": WSC_Dataset, 10 | } -------------------------------------------------------------------------------- /model_center/dataset/cpm1/__init__.py: -------------------------------------------------------------------------------- 1 | from .cpm1_dataset import CPM1_Dataset, CPM1_Dataset_Merge 2 | -------------------------------------------------------------------------------- /model_center/dataset/cpm1/cpm1_dataset.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The OpenBMB team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | import torch.utils.data as data 18 | from ..indexed import MMapIndexedDataset 19 | import random 20 | import numpy as np 21 | 22 | 23 | class CPM1_Dataset(data.Dataset): 24 | def __init__(self, ctx : MMapIndexedDataset, 25 | tgt : MMapIndexedDataset, 26 | max_length = 1024): 27 | self.ctx = ctx 28 | self.tgt = tgt 29 | self.max_length = max_length 30 | 31 | def __len__(self): 32 | return len(self.ctx) 33 | 34 | def __get_item_data(self, ctx, tgt): 35 | if ctx.shape[0] > self.max_length or tgt.shape[0] > self.max_length: 36 | return None, None, None 37 | assert len(ctx) == len(tgt) 38 | len_ctx = min(ctx.shape[0], self.max_length) 39 | 40 | ctx = ctx.astype('int64') 41 | tgt = tgt.astype('int64') 42 | 43 | th_ctx = torch.zeros(self.max_length, dtype=torch.long) 44 | th_ctx[:len_ctx] = torch.from_numpy(ctx)[:len_ctx].long() 45 | th_tgt = torch.full((self.max_length,), -100, dtype=torch.long) 46 | th_tgt[:len_ctx] = torch.from_numpy(tgt)[:len_ctx].long() 47 | return th_ctx, len_ctx, th_tgt 48 | 49 | def __getitem__(self, index): 50 | ctx = self.ctx[index] 51 | tgt = self.tgt[index] 52 | 53 | if isinstance(index, int): 54 | th_ctx, len_ctx, th_tgt = self.__get_item_data(ctx, tgt) 55 | return { 56 | "ctx": th_ctx, 57 | "tgt": th_tgt, 58 | "len_ctx": len_ctx, 59 | } 60 | else: 61 | res = {"ctx": [], "tgt": [], "len_ctx": [],} 62 | for _ctx, _tgt in zip(ctx, tgt): 63 | _th_ctx, _len_ctx, _th_tgt = self.__get_item_data(_ctx, _tgt) 64 | if _th_ctx is None: 65 | continue 66 | res["ctx"].append(_th_ctx) 67 | res["tgt"].append(_th_tgt) 68 | res["len_ctx"].append(_len_ctx) 69 | return { 70 | "ctx": torch.stack(res["ctx"]), 71 | "tgt": torch.stack(res["tgt"]), 72 | "len_ctx": torch.LongTensor(res["len_ctx"]), 73 | } 74 | 75 | 76 | class CPM1_Dataset_Merge(data.Dataset): 77 | def __init__(self, ctx : MMapIndexedDataset, max_length = 1024): 78 | self.ctx = ctx 79 | self.max_length = max_length 80 | 81 | def __len__(self): 82 | 
return len(self.ctx) 83 | 84 | def __get_item_data(self, ctx): 85 | if ctx.shape[0] > self.max_length: 86 | return None, None, None, None 87 | len_ctx = min(ctx.shape[0], self.max_length) 88 | lef = random.randint(len_ctx // 8, len_ctx // 4) 89 | rig = random.randint(len_ctx // 4 * 3, len_ctx) 90 | if ctx[len_ctx-1] == 4: 91 | rig = len_ctx 92 | tgt = np.full((len_ctx), -100) 93 | tgt[lef-1:rig-1] = ctx[lef:rig] 94 | context_ctx = np.arange((len_ctx)) 95 | context_ctx = (context_ctx < lef) | (context_ctx >= rig) 96 | return ctx, tgt, len_ctx, context_ctx 97 | 98 | def __getitem__(self, index): 99 | ctx = self.ctx[index] 100 | th_ctx, th_tgt, len_ctx, context_ctx = self.__get_item_data(ctx) 101 | return th_ctx, th_tgt, len_ctx, context_ctx 102 | -------------------------------------------------------------------------------- /model_center/dataset/cpm1dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .down_data import * 2 | 3 | DATASET = { 4 | "LCQMC": LCQMC_Dataset, 5 | } -------------------------------------------------------------------------------- /model_center/dataset/cpm1dataset/down_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The OpenBMB team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
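The CPM1 pretraining datasets above are thin wrappers over memory-mapped token shards. A minimal sketch of how they are typically wired together, assuming bmtrain has already been initialised and that MMapIndexedDataset takes the path prefix of a .bin/.idx pair (the paths and batch size below are placeholders):

    from model_center.dataset import MMapIndexedDataset, DistributedDataLoader
    from model_center.dataset.cpm1 import CPM1_Dataset

    # hypothetical path prefixes for the tokenized context / target shards
    ctx = MMapIndexedDataset("/path/to/cpm1_ctx")
    tgt = MMapIndexedDataset("/path/to/cpm1_tgt")

    dataset = CPM1_Dataset(ctx, tgt, max_length=1024)
    loader = DistributedDataLoader(dataset, batch_size=8, shuffle=True)
    for batch in loader:
        # batch["ctx"], batch["tgt"] and batch["len_ctx"] feed the CPM1 training loop
        pass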
15 | import torch 16 | import csv 17 | import numpy as np 18 | 19 | class LCQMC_Dataset(torch.utils.data.Dataset): 20 | def __init__(self, path, split, rank, world_size, tokenizer, max_length) -> None: 21 | self.data = [] 22 | 23 | path = f"{path}/LCQMC/{split}.tsv" 24 | with open(path, encoding='utf8') as fin: 25 | reader = list(csv.reader(fin, delimiter='\t'))[1:] 26 | for i, row in enumerate(reader): 27 | text_a, text_b, label = row 28 | lef_tokens = [1] + tokenizer.encode(f'"{text_a}"与"{text_b}"的关系是:') 29 | rig_tokens = tokenizer.encode("。") 30 | 31 | input_tokens, input_length, context, input_span = self.make_input(lef_tokens, rig_tokens, 1, max_length) 32 | 33 | index = torch.zeros((max_length,), dtype=torch.int32) 34 | index[len(lef_tokens) - 1] = 1 35 | 36 | target = torch.tensor(int(label), dtype=torch.long) 37 | 38 | self.data.append({ 39 | "input_tokens": input_tokens.cuda(), 40 | "input_length": input_length.cuda(), 41 | "input_context": context.cuda(), 42 | "input_span": input_span.cuda(), 43 | "targets": target.cuda(), 44 | "index": index.cuda(), 45 | }) 46 | 47 | def make_input(self, lef_tokens, rig_tokens, spans, max_length): 48 | input = lef_tokens + [0 for i in range(spans)] + rig_tokens 49 | length = len(input) 50 | 51 | assert length < max_length # TODO 52 | 53 | input_tokens = torch.zeros((max_length,), dtype=torch.int32) 54 | input_tokens[:length] = torch.tensor(input).int() 55 | 56 | input_length = torch.tensor(length, dtype=torch.int32) 57 | 58 | context = np.arange(max_length) 59 | context = (context < len(lef_tokens)) | (context >= len(lef_tokens) + spans) 60 | context = torch.from_numpy(context).bool() 61 | 62 | input_span = torch.zeros((max_length,), dtype=torch.int32) 63 | 64 | return input_tokens, input_length, context, input_span 65 | 66 | def __len__(self): 67 | return len(self.data) 68 | 69 | def __getitem__(self, idx): 70 | return self.data[idx] 71 | 72 | @classmethod 73 | def get_verbalizer(cls, tokenizer): 74 | return [15682, 16357] # 有关,无关 # TODO change to tokenizer.encode(xxx) -------------------------------------------------------------------------------- /model_center/dataset/cpm2/__init__.py: -------------------------------------------------------------------------------- 1 | from .cpm2_dataset import CPM2_Dataset 2 | -------------------------------------------------------------------------------- /model_center/dataset/cpm2/dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.utils.data as data 3 | from ..indexed import MMapIndexedDataset 4 | import random 5 | import numpy as np 6 | 7 | class CPM2_Dataset(data.Dataset): 8 | def __init__(self, ctx : MMapIndexedDataset, 9 | tgt : MMapIndexedDataset, 10 | max_source_length = 512, 11 | max_target_length = 256): 12 | self.ctx = ctx 13 | self.tgt = tgt 14 | self.max_target_length = max_target_length 15 | self.max_source_length = max_source_length 16 | 17 | def __len__(self): 18 | return len(self.ctx) 19 | 20 | def __get_item_data(self, ctx, tgt): 21 | # TODO 26240 22 | ctx = ctx - (ctx >= 26240) * 190 23 | tgt = tgt - (tgt >= 26240) * 190 24 | 25 | if ctx.shape[0] > self.max_source_length or tgt.shape[0] > self.max_target_length+1: # TODO 26 | return None, None, None, None 27 | len_ctx = min(ctx.shape[0], self.max_source_length) 28 | len_tgt = min(tgt.shape[0], self.max_target_length) 29 | 30 | # TODO 31 | # ctx.astype('int64') 32 | # tgt.astype('int64') 33 | 34 | th_ctx = torch.zeros(self.max_source_length, dtype=torch.long) 35 | 
th_ctx[:len_ctx] = torch.from_numpy(ctx)[:len_ctx].long() 36 | th_tgt = torch.full((self.max_target_length + 1,), -100, dtype=torch.long) 37 | # th_tgt[0] = 1 38 | # th_tgt[1:1+len_tgt] = torch.from_numpy(tgt)[:len_tgt].long() 39 | th_tgt[:len_tgt] = torch.from_numpy(tgt)[:len_tgt].long() # TODO 40 | return th_ctx, th_tgt, len_ctx, len_tgt 41 | 42 | def __getitem__(self, index): 43 | ctx = self.ctx[index] 44 | tgt = self.tgt[index] 45 | 46 | if isinstance(index, int): 47 | th_ctx, th_tgt, len_ctx, len_tgt = self.__get_item_data(ctx, tgt) 48 | if th_ctx is None: 49 | return None 50 | return { 51 | "ctx": th_ctx, 52 | "tgt": th_tgt, 53 | "len_ctx": len_ctx, 54 | "len_tgt": len_tgt 55 | } 56 | else: 57 | res = {"ctx": [], "tgt": [], "len_ctx": [], "len_tgt":[]} 58 | for _ctx, _tgt in zip(ctx, tgt): 59 | _th_ctx, _th_tgt, _len_ctx, _len_tgt = self.__get_item_data(_ctx, _tgt) 60 | if _th_ctx is None: 61 | continue 62 | res["ctx"].append(_th_ctx) 63 | res["tgt"].append(_th_tgt) 64 | res["len_ctx"].append(_len_ctx) 65 | res["len_tgt"].append(_len_tgt) 66 | return { 67 | "ctx": torch.stack(res["ctx"]), 68 | "tgt": torch.stack(res["tgt"]), 69 | "len_ctx": torch.LongTensor(res["len_ctx"]), 70 | "len_tgt": torch.LongTensor(res["len_tgt"]) 71 | } 72 | 73 | -------------------------------------------------------------------------------- /model_center/dataset/cpm2dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .down_data import * 2 | 3 | DATASET = { 4 | "LCQMC": LCQMC_Dataset, 5 | } -------------------------------------------------------------------------------- /model_center/dataset/cpm2dataset/down_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The OpenBMB team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
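Note that CPM1_Dataset and CPM2_Dataset both fill unsupervised target positions with -100, which is exactly the default ignore_index of PyTorch's cross-entropy loss, so padding and context positions contribute nothing to the gradient. A small illustration with made-up shapes:

    import torch
    import torch.nn.functional as F

    vocab_size = 26048                          # placeholder vocabulary size
    logits = torch.randn(2, 256, vocab_size)    # (batch, seq_len, vocab)
    tgt = torch.full((2, 256), -100, dtype=torch.long)
    tgt[:, :10] = 1                             # only the first 10 positions are supervised

    # positions equal to -100 are skipped (ignore_index defaults to -100)
    loss = F.cross_entropy(logits.view(-1, vocab_size), tgt.view(-1))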
15 | import torch 16 | import csv 17 | import numpy as np 18 | 19 | class LCQMC_Dataset(torch.utils.data.Dataset): 20 | def __init__(self, path, split, rank, world_size, tokenizer, max_encoder_length, max_decoder_length) -> None: 21 | self.data = [] 22 | 23 | path = f"{path}/LCQMC/{split}.tsv" 24 | with open(path, encoding='utf8') as fin: 25 | reader = list(csv.reader(fin, delimiter='\t'))[1:] 26 | for i, row in enumerate(reader): 27 | text_a, text_b, label = row 28 | enc_input = tokenizer.encode(f'“{text_a}”与“{text_b}”是否有关?') 29 | 30 | enc_tokens, enc_length, dec_tokens, dec_length, index = self.make_input(tokenizer, enc_input, max_encoder_length, max_decoder_length) 31 | 32 | target = torch.tensor(int(label), dtype=torch.long) 33 | 34 | self.data.append({ 35 | "enc_input": enc_tokens.cuda(), 36 | "enc_length": enc_length.cuda(), 37 | "dec_input": dec_tokens.cuda(), 38 | "dec_length": dec_length.cuda(), 39 | "targets": target.cuda(), 40 | "index": index.cuda(), 41 | }) 42 | 43 | def make_input(self, tokenizer, input, max_encoder_length, max_decoder_length): 44 | input = input + [tokenizer.get_sentinel_id(0)] 45 | length = len(input) 46 | 47 | assert length < max_encoder_length # TODO 48 | 49 | input_tokens = torch.zeros((max_encoder_length,), dtype=torch.int32) 50 | input_tokens[:length] = torch.tensor(input).int() 51 | 52 | input_length = torch.tensor(length, dtype=torch.int32) 53 | 54 | output = [tokenizer.get_sentinel_id(0)] 55 | length = len(output) 56 | output_tokens = torch.zeros((max_decoder_length,), dtype=torch.int32) 57 | output_tokens[:length] = torch.tensor(output).int() 58 | output_length = torch.tensor(length, dtype=torch.int32) 59 | 60 | index = torch.zeros((max_decoder_length,), dtype=torch.int32) 61 | index[length - 1] = 1 62 | 63 | return input_tokens, input_length, output_tokens, output_length, index 64 | 65 | def __len__(self): 66 | return len(self.data) 67 | 68 | def __getitem__(self, idx): 69 | return self.data[idx] 70 | 71 | @classmethod 72 | def get_verbalizer(cls, tokenizer): 73 | return [1744, 24] # 有关,无关 # TODO change to tokenizer.encode(xxx) -------------------------------------------------------------------------------- /model_center/dataset/distributed_loader.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The OpenBMB team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
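get_verbalizer exposes the token ids of the label words, so LCQMC classification reduces to comparing the decoder logits at the single position flagged by index. A rough sketch of that scoring step, assuming logits came from the model and index / targets came from a batch of this dataset:

    import torch

    verbalizer = LCQMC_Dataset.get_verbalizer(tokenizer)   # e.g. [1744, 24]
    scores = logits[index == 1]          # keep the flagged decoder position: (batch, vocab)
    scores = scores[:, verbalizer]       # logits of the label words only: (batch, 2)
    pred = scores.argmax(dim=-1)
    accuracy = (pred == targets).float().mean()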
15 | import torch.utils.data as data 16 | import bmtrain as bmt 17 | 18 | class DistributedDataLoader: 19 | def __init__(self, dataset, shuffle=False, seed=0, **kwargs): 20 | self.sampler = data.distributed.DistributedSampler(dataset, shuffle=shuffle, seed=seed, rank=bmt.rank(), num_replicas=bmt.world_size()) 21 | self.loader = data.DataLoader(dataset, shuffle=False, sampler=self.sampler, **kwargs) 22 | self.epoch = 0 23 | self.shuffle = shuffle 24 | 25 | def __iter__(self): 26 | if self.shuffle: 27 | self.epoch += 1 28 | self.sampler.set_epoch(self.epoch) 29 | return self.loader.__iter__() 30 | 31 | def __len__(self): 32 | return len(self.loader) 33 | 34 | -------------------------------------------------------------------------------- /model_center/dataset/gpt2dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .superglue import * 2 | 3 | DATASET = { 4 | "BoolQ": BoolQ_Dataset, 5 | "CB": CB_Dataset, 6 | "COPA": COPA_Dataset, 7 | "MultiRC": MultiRC_Dataset, 8 | "ReCoRD": ReCoRD_Dataset, 9 | "RTE": RTE_Dataset, 10 | "WiC": WiC_Dataset, 11 | "WSC": WSC_Dataset, 12 | } -------------------------------------------------------------------------------- /model_center/dataset/llamadataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .superglue import * 2 | 3 | DATASET = { 4 | "BoolQ": BoolQ_Dataset, 5 | "CB": CB_Dataset, 6 | "COPA": COPA_Dataset, 7 | "MultiRC": MultiRC_Dataset, 8 | "ReCoRD": ReCoRD_Dataset, 9 | "RTE": RTE_Dataset, 10 | "WiC": WiC_Dataset, 11 | "WSC": WSC_Dataset, 12 | } -------------------------------------------------------------------------------- /model_center/dataset/t5dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .superglue import * 2 | from .squad import * 3 | 4 | DATASET = { 5 | "BoolQ": BoolQ_Dataset, 6 | "CB": CB_Dataset, 7 | "COPA": COPA_Dataset, 8 | "MultiRC": MultiRC_Dataset, 9 | "ReCoRD": ReCoRD_Dataset, 10 | "RTE": RTE_Dataset, 11 | "WiC": WiC_Dataset, 12 | "WSC": WSC_Dataset, 13 | "SQuAD": SQuAD_Dataset, 14 | } -------------------------------------------------------------------------------- /model_center/dataset/t5dataset/squad.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import json 3 | import random 4 | 5 | class SQuAD_Dataset(torch.utils.data.Dataset): 6 | def __init__(self, path, split, tokenizer, max_encoder_length, max_decoder_length) -> None: 7 | super().__init__() 8 | self.split = split 9 | self.data = [] 10 | 11 | for input, target in self.read_data(path, split): 12 | if split == 'train': 13 | self.make_input(tokenizer, input, target, max_encoder_length, max_decoder_length) 14 | else: 15 | self.data.append({ 16 | "inputs": input, 17 | "targets": target 18 | }) 19 | 20 | def shift_tokens_right(self, input_ids, pad_token_id: int=0, decoder_start_token_id: int=0): 21 | shifted_input_ids = input_ids.new_zeros(input_ids.shape) 22 | shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() 23 | shifted_input_ids[..., 0] = decoder_start_token_id 24 | shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) 25 | return shifted_input_ids 26 | 27 | def make_input(self, tokenizer, inputs, targets, max_encoder_length, max_decoder_length): 28 | model_inputs = tokenizer(inputs, max_length=max_encoder_length, padding="max_length", truncation=True) 29 | labels = tokenizer(targets, max_length=max_decoder_length, 
padding="max_length", truncation=True) 30 | 31 | labels["input_ids"] = torch.LongTensor([l if l != tokenizer.pad_token_id else -100 for l in labels["input_ids"]]) 32 | 33 | model_inputs['input_ids'] = torch.LongTensor(model_inputs['input_ids']) 34 | model_inputs['attention_mask'] = torch.LongTensor(model_inputs['attention_mask']) 35 | model_inputs["decoder_input_ids"] = self.shift_tokens_right(labels["input_ids"]) 36 | model_inputs["targets"] = labels["input_ids"] 37 | model_inputs["decoder_attention_mask"] = torch.LongTensor(labels["attention_mask"]) 38 | 39 | self.data.append(model_inputs) 40 | 41 | def generate_input(self, question, context): 42 | return 43 | 44 | def read_data(self, path, split): 45 | if split == 'test': return 46 | path = f"{path}/{split}-v1.1.json" 47 | with open(path, encoding='utf8') as f: 48 | f = json.load(f) 49 | for data in f["data"]: 50 | for paragraph in data['paragraphs']: 51 | for qa in paragraph['qas']: 52 | input = " ".join(["Question:", qa["question"].lstrip(), "Answer: ", "Context:", paragraph["context"].lstrip()]) 53 | if len(qa["answers"])==0: 54 | qa["answers"] = [{"text": "no answer"}] 55 | if split=='train': 56 | target = " ".join(["", random.choice(qa["answers"])["text"], ""]) 57 | else: 58 | target = {a['text'] for a in qa["answers"]} 59 | yield input, target 60 | 61 | def __len__(self): 62 | return len(self.data) 63 | 64 | def __getitem__(self, idx): 65 | if self.split == 'train': 66 | model_inputs = self.data[idx] 67 | for key, value in model_inputs.items(): 68 | model_inputs[key] = value.cuda() 69 | return model_inputs 70 | else: 71 | return self.data[idx] -------------------------------------------------------------------------------- /model_center/generation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenBMB/ModelCenter/14490451e9a91675ef8816c64cf6304d509bce62/model_center/generation/__init__.py -------------------------------------------------------------------------------- /model_center/layer/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
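SQuAD_Dataset.shift_tokens_right above implements the usual T5 teacher-forcing shift: decoder inputs start with decoder_start_token_id, are the labels shifted right by one position, and any -100 loss-mask entries are mapped back to the pad id. A tiny self-contained check (token ids are made up for illustration):

    import torch
    from model_center.dataset.t5dataset import SQuAD_Dataset

    labels = torch.tensor([[37, 1566, 10, -100, -100]])       # answer tokens, then loss-masked padding
    ds = SQuAD_Dataset.__new__(SQuAD_Dataset)                  # call the helper without building a dataset
    print(ds.shift_tokens_right(labels, pad_token_id=0, decoder_start_token_id=0))
    # tensor([[   0,   37, 1566,   10,    0]])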
15 | from .conv import Conv2d 16 | from .attention import Attention 17 | from .layernorm import LayerNorm 18 | from .feedforward import FeedForward 19 | from .position_embedding import RelativePositionEmbedding, RotaryEmbedding, SegmentPositionEmbedding, RotaryEmbeddingESM 20 | from .blocks import SelfAttentionBlock, CrossAttentionBlock, FFNBlock, TransformerBlock 21 | from .transformer import Encoder, Decoder 22 | from .embedding import Embedding, PatchEmbedding 23 | from .linear import Linear -------------------------------------------------------------------------------- /model_center/layer/conv.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import bmtrain as bmt 3 | import torch.nn.functional as F 4 | import collections 5 | from itertools import repeat 6 | def _ntuple(n): 7 | def parse(x): 8 | if isinstance(x, collections.abc.Iterable): 9 | return x 10 | return tuple(repeat(x, n)) 11 | return parse 12 | 13 | to_2tuple = _ntuple(2) 14 | class Identity(bmt.DistributedModule): 15 | def __init__(self, *args, **kwargs): 16 | super(Identity, self).__init__() 17 | 18 | def forward(self, input): 19 | return input 20 | class Conv2d(bmt.DistributedModule): 21 | def __init__(self, 22 | in_channels, 23 | out_channels, 24 | kernel_size, 25 | stride=1, 26 | padding=0, 27 | dilation=1, 28 | groups=1, 29 | dtype=torch.float, 30 | int8: bool=False, 31 | init_mean : float=0.0, 32 | init_std : float = 1, 33 | bias : bool=True, 34 | padding_mode='zeros', 35 | ): 36 | super().__init__() 37 | self.in_channels = in_channels 38 | self.out_channels = out_channels 39 | self.kernel_size = kernel_size 40 | self.transposed = None 41 | self.output_padding = None 42 | 43 | self.stride = stride 44 | self.dilation = dilation 45 | self.groups = groups 46 | self.padding = padding 47 | self.padding_mode = padding_mode 48 | 49 | kernel = to_2tuple(kernel_size) 50 | self.weight = bmt.DistributedParameter( 51 | torch.empty((out_channels, int(in_channels/groups), kernel[0], kernel[1]), dtype=dtype), 52 | init_method=bmt.ParameterInitializer(torch.nn.init.normal_, mean=init_mean, std=init_std) 53 | ) 54 | self.bias = bmt.DistributedParameter( 55 | torch.empty((out_channels,), dtype=dtype), 56 | init_method=bmt.ParameterInitializer(torch.nn.init.zeros_) 57 | ) if bias else None 58 | self.int8=int8 59 | def forward(self, x : torch.Tensor): 60 | x = F.conv2d(x, 61 | weight=self.weight, 62 | bias=self.bias, 63 | stride=self.stride, 64 | padding=self.padding, 65 | dilation=self.dilation, 66 | groups=self.groups, 67 | ) 68 | 69 | return x -------------------------------------------------------------------------------- /model_center/layer/layernorm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
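Conv2d above is a bmtrain-distributed counterpart of torch.nn.Conv2d: weight and bias are DistributedParameters and the forward simply calls F.conv2d, so the layer must be constructed after bmtrain is initialized. A minimal sketch (shapes and dtype are illustrative):

    import torch
    import bmtrain as bmt
    from model_center.layer import Conv2d

    bmt.init_distributed()
    conv = Conv2d(in_channels=3, out_channels=16, kernel_size=4, stride=4, dtype=torch.half)
    x = torch.randn(2, 3, 224, 224, dtype=torch.half).cuda()
    print(conv(x).shape)          # torch.Size([2, 16, 56, 56])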
15 | 16 | import torch 17 | import bmtrain as bmt 18 | import torch.nn.functional as F 19 | 20 | @torch.jit.script 21 | def rms_layernorm(hidden : torch.Tensor, weight : torch.Tensor, eps :float): 22 | old_dtype = hidden.dtype 23 | variance = hidden.to(torch.float32).pow(2).mean(dim=-1, keepdim=True) 24 | hidden = (hidden * torch.rsqrt(variance + eps)).to(old_dtype) 25 | return hidden * weight 26 | 27 | 28 | class LayerNorm(bmt.DistributedModule): 29 | r""" 30 | `LayerNorm `_ if bias = True: :math:`y = {x-\text{E}[x]\over \text{Var}[x]+\text{eps}} * w + \text{bias}` 31 | 32 | `RMS LayerNorm `_ if bias = False: :math:`y = {x\over \text{Var}[x]+\text{eps}} * w` 33 | 34 | Args: 35 | dim_norm (int): norm dimesion 36 | dtype (optional): Defaults to torch.half. 37 | bias (bool, optional): whether to add the :math:`\text{bias}` term. Defaults to True. 38 | eps (float, optional): :math:`\text{eps}` term. Defaults to 1e-5. 39 | init_var (float, optional): weight will be all initialized to init_var. Defaults to 1.0. 40 | """ 41 | def __init__(self, dim_norm : int, 42 | dtype=torch.half, 43 | bias=True, 44 | eps : float = 1e-5, 45 | init_var = 1.0 46 | ): 47 | 48 | super().__init__() 49 | 50 | self.eps = eps 51 | self.dim_norm = dim_norm 52 | self.weight = bmt.DistributedParameter( 53 | torch.ones(dim_norm, dtype=dtype) * init_var) 54 | self.bias = bmt.DistributedParameter( 55 | torch.zeros(dim_norm, dtype=dtype)) if bias else None 56 | 57 | def forward(self, x : torch.Tensor): 58 | """ 59 | Args: 60 | x (:obj:`torch.Tensor` of shape ``(batch_size, seq_len, dim_norm)``): Input tensor that need to be normalized. 61 | 62 | Return: 63 | :obj:`torch.Tensor` of shape ``(batch_size, seq_len, dim_norm)``: The layernorm output. 64 | 65 | """ 66 | assert x.size(-1) == self.dim_norm 67 | 68 | if self.bias is not None: 69 | return F.layer_norm(x, (self.dim_norm,), self.weight, self.bias, self.eps) 70 | else: 71 | return rms_layernorm(x, self.weight, self.eps) 72 | -------------------------------------------------------------------------------- /model_center/layer/linear.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | import bmtrain as bmt 18 | import math 19 | import torch.nn.functional as F 20 | 21 | class Linear(bmt.DistributedModule): 22 | r"""A fully connected layer, which performs :math:`\pmb{y} = \mathbf{W} \pmb{x} + \pmb{b}` 23 | 24 | Args: 25 | dim_in (int): input dimension of :math:`\pmb{x}` 26 | dim_out (int): output dimension of :math:`\pmb{y}` 27 | dtype (optional): Defaults to torch.half. 28 | init_mean (float, optional): mean of :math:`\mathbf{W}\sim\mathcal{N}(\text{mean}, \text{std}^2)`. Defaults to 0. 29 | init_std (float, optional): std of :math:`\mathbf{W}\sim\mathcal{N}(\text{mean}, \text{std}^2)`. Defaults to 1. 30 | bias (bool, optional): whether to add bias term :math:`\pmb{b}`. 
Defaults to False. 31 | """ 32 | def __init__(self, 33 | dim_in : int, 34 | dim_out : int, 35 | length_scale : bool = False, 36 | length_scale_before : bool = False, 37 | dtype = torch.half, 38 | int8 : bool = False, 39 | init_mean : float = 0.0, 40 | init_std : float = 1, 41 | bias : bool = False, 42 | ): 43 | super().__init__() 44 | self.dim_in = self.in_features = dim_in 45 | self.dim_out = self.out_features = dim_out 46 | self.weight = bmt.DistributedParameter( 47 | torch.empty((dim_out, dim_in), dtype=dtype), 48 | init_method=bmt.ParameterInitializer(torch.nn.init.normal_, mean=init_mean, std=init_std) 49 | ) 50 | self.bias = bmt.DistributedParameter( 51 | torch.empty((dim_out,), dtype=dtype), 52 | init_method=bmt.ParameterInitializer(torch.nn.init.zeros_) 53 | ) if bias else None 54 | self.length_scale = length_scale 55 | self.length_scale_before = length_scale_before 56 | self.int8 = int8 57 | 58 | def forward(self, x : torch.Tensor): 59 | """ 60 | Args: 61 | x (:obj:`torch.Tensor` of shape ``(batch, seq_len, dim_in)``): The input of linear layer 62 | 63 | Returns: 64 | :obj:`torch.Tensor` of shape ``(batch, seq_len, dim_out)``: The output of the linear transform y. 65 | 66 | """ 67 | if self.length_scale and self.length_scale_before: 68 | x = x / math.sqrt(self.dim_in) 69 | x = F.linear(x, self.weight) 70 | if self.length_scale and not self.length_scale_before: 71 | x = x / math.sqrt(self.dim_in) 72 | if self.bias is not None: 73 | x = x + self.bias 74 | return x 75 | -------------------------------------------------------------------------------- /model_center/model/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | 17 | # Model Architecture 18 | from .config import * 19 | from .basemodel import BaseModel, ModelOutput, BaseModelOutput, BaseModelOutputWithPooling, Seq2SeqModelOutput 20 | from .bert import Bert 21 | from .roberta import Roberta 22 | from .gpt2 import GPT2 23 | from .gptj import GPTj 24 | from .t5 import T5 25 | from .cpm1 import CPM1 26 | from .cpm2 import CPM2 27 | from .cpm3 import CPM3 28 | from .glm import GLM 29 | from .longformer import Longformer 30 | from .vit import ViT 31 | from .opt import OPT 32 | from .llama import Llama 33 | -------------------------------------------------------------------------------- /model_center/model/config/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
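Linear above behaves like a bias-optional torch.nn.Linear whose weight is a bmtrain DistributedParameter drawn from N(init_mean, init_std²); the optional length_scale flag additionally divides activations by sqrt(dim_in) before or after the matmul. Combined with the RMS variant of the LayerNorm defined earlier, a pre-norm projection looks like this (illustrative shapes; bmtrain already initialized):

    import torch
    import bmtrain as bmt
    from model_center.layer import LayerNorm, Linear

    bmt.init_distributed()
    norm = LayerNorm(dim_norm=1024, bias=False)             # bias=False selects the RMS LayerNorm branch
    proj = Linear(dim_in=1024, dim_out=4096, init_std=0.02)
    x = torch.randn(2, 8, 1024, dtype=torch.half).cuda()
    print(proj(norm(x)).shape)                               # torch.Size([2, 8, 4096])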
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from .config import Config 17 | from .cpm1_config import CPM1Config 18 | from .cpm2_config import CPM2Config 19 | from .cpm3_config import CPM3Config 20 | from .t5_config import T5Config 21 | from .gpt2_config import GPT2Config 22 | from .gptj_config import GPTjConfig 23 | from .bert_config import BertConfig 24 | from .roberta_config import RobertaConfig 25 | from .vit_config import VitConfig 26 | from .longformer_config import LongformerConfig 27 | from .glm_config import GLMConfig 28 | from .opt_config import OPTConfig 29 | from .llama_config import LlamaConfig 30 | -------------------------------------------------------------------------------- /model_center/model/config/bert_config.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | from .config import Config 18 | 19 | class BertConfig(Config): 20 | """ 21 | This is a configuration class that stores the configuration of the BERT model, which inherits from the Config class. 22 | It is used to instantiate the Bert model according to the specified parameters and define the model architecture. 23 | You can set specific parameters to control the output of the model. 24 | 25 | For example: 26 | [`dim_model`] is used to determine the Dimension of the encoder layers and the pooler layer. 27 | You can choose to use the default value of 768 or customize their dimensions. 
28 | 29 | """ 30 | 31 | def __init__(self, vocab_size = 119547, 32 | type_size = 2, 33 | dim_model = 768, 34 | num_heads = 12, 35 | dim_head = 64, 36 | dim_ff = 3072, 37 | num_layers = 12, 38 | dropout_p = 0.0, 39 | emb_init_mean = 0.0, 40 | emb_init_std = 0.02, 41 | pos_bias_type = "none", 42 | position_size = 512, 43 | norm_init_var = 1.0, 44 | norm_bias = True, 45 | norm_eps = 1e-12, 46 | att_init_mean = 0.0, 47 | att_init_std = 0.02, 48 | att_bias = True, 49 | att_mask_value = float("-1e4"), 50 | ffn_init_mean = 0.0, 51 | ffn_init_std = 0.02, 52 | ffn_bias = True, 53 | ffn_activate_fn = "gelu", 54 | proj_init_mean = 0.0, 55 | proj_init_std = 0.02, 56 | proj_bias = True, 57 | length_scale = False, 58 | attn_scale = True, 59 | half = True, 60 | int8 = False, 61 | tied = True, 62 | cls_head = None, 63 | post_layer_norm = True, 64 | ): 65 | 66 | super().__init__() 67 | 68 | self.vocab_size = vocab_size 69 | self.type_size = type_size 70 | self.position_size = position_size 71 | self.dim_model = dim_model 72 | self.num_heads = num_heads 73 | self.dim_head = dim_head 74 | self.dim_ff = dim_ff 75 | self.num_layers = num_layers 76 | self.dropout_p = dropout_p 77 | self.emb_init_mean = emb_init_mean 78 | self.emb_init_std = emb_init_std 79 | self.pos_bias_type = pos_bias_type 80 | self.norm_init_var = norm_init_var 81 | self.norm_bias = norm_bias 82 | self.norm_eps = norm_eps 83 | self.att_init_mean = att_init_mean 84 | self.att_init_std = att_init_std 85 | self.att_bias = att_bias 86 | self.att_mask_value = att_mask_value 87 | self.ffn_init_mean = ffn_init_mean 88 | self.ffn_init_std = ffn_init_std 89 | self.ffn_bias = ffn_bias 90 | self.ffn_activate_fn = ffn_activate_fn 91 | self.proj_init_mean = proj_init_mean 92 | self.proj_init_std = proj_init_std 93 | self.proj_bias = proj_bias 94 | self.length_scale = length_scale 95 | self.attn_scale = attn_scale 96 | self.int8 = int8 97 | self.tied = tied 98 | if half: 99 | self.dtype = torch.half 100 | else: 101 | self.dtype = torch.float 102 | self.cls_head = cls_head 103 | self.post_layer_norm = post_layer_norm -------------------------------------------------------------------------------- /model_center/model/config/config.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
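Every architectural hyperparameter of BertConfig above is a plain constructor argument, so a scaled-down BERT can be described directly in code; unspecified fields keep the defaults shown in the signature. A short sketch:

    import torch
    from model_center.model import BertConfig

    tiny = BertConfig(dim_model=256, num_heads=4, dim_head=64,
                      dim_ff=1024, num_layers=4, half=False)
    assert tiny.dtype == torch.float        # half=False stores torch.float rather than torch.half
    print(tiny.dim_model, tiny.num_layers)  # 256 4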
15 | 16 | import json 17 | import os 18 | import copy 19 | from typing import Any, Dict, Union 20 | from ...utils import check_web_and_convert_path 21 | 22 | class Config(object): 23 | """ enc_dec model configuration """ 24 | 25 | def __init__(self): 26 | super().__init__() 27 | 28 | @classmethod 29 | def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **args): 30 | path = check_web_and_convert_path(pretrained_model_name_or_path, 'config') 31 | return cls.from_json_file(os.path.join(path, 'config.json'), **args) 32 | 33 | @classmethod 34 | def from_json_file(cls, json_file: Union[str, os.PathLike], **args): 35 | config_dict = cls._dict_from_json_file(json_file, **args) 36 | return cls(**config_dict) 37 | 38 | @classmethod 39 | def _dict_from_json_file(cls, json_file: Union[str, os.PathLike], **args): 40 | with open(json_file, "r", encoding="utf-8") as reader: 41 | text = reader.read() 42 | res = json.loads(text) 43 | for key in args: 44 | res[key] = args[key] 45 | return res 46 | 47 | def to_json_file(self, json_file_path: Union[str, os.PathLike]): 48 | with open(json_file_path, "w", encoding="utf-8") as writer: 49 | writer.write(self.to_json_string()) 50 | 51 | def to_json_string(self) -> str: 52 | config_dict = self.to_dict() 53 | return json.dumps(config_dict, indent=2, sort_keys=True) + "\n" 54 | 55 | def to_dict(self) -> Dict[str, Any]: 56 | output = copy.deepcopy(self.__dict__) 57 | if hasattr(self.__class__, "model_type"): 58 | output["model_type"] = self.__class__.model_type 59 | return output -------------------------------------------------------------------------------- /model_center/model/config/glm_config.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
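Concrete configs are normally obtained through the class methods above: from_pretrained resolves a model identifier to a cached directory (via check_web_and_convert_path) and parses its config.json, and any extra keyword arguments override the values read from the file. The test scripts later in this repository use the same pattern to disable dropout:

    from model_center.model import BertConfig

    config = BertConfig.from_pretrained("bert-base-uncased")                # local dir or ModelCenter identifier
    config.dropout_p = 0                                                     # tweak after loading ...
    config = BertConfig.from_pretrained("bert-base-uncased", dropout_p=0)   # ... or override via kwargs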
15 | 16 | import torch 17 | from .config import Config 18 | 19 | class GLMConfig(Config): 20 | 21 | def __init__(self, vocab_size=50048, 22 | dim_model=1024, 23 | num_heads=16, 24 | dim_head=64, 25 | dim_ff=4096, 26 | num_layers=24, 27 | dropout_p=0.1, 28 | emb_init_mean = 0, 29 | emb_init_std = 0.02, 30 | pos_bias_type = "none", 31 | position_size = 1025, 32 | norm_init_var = 1.0, 33 | norm_bias = True, 34 | norm_eps = 1e-5, 35 | att_init_mean = 0.0, 36 | att_init_std = 0.02, 37 | att_bias = True, 38 | att_mask_value = float("-inf"), 39 | ffn_init_mean = 0.0, 40 | ffn_init_std = 0.02, 41 | ffn_bias = True, 42 | ffn_activate_fn = "gelu", 43 | proj_init_mean = 0.0, 44 | proj_init_std = 0.02, 45 | proj_bias = False, 46 | length_scale = False, 47 | attn_scale = True, 48 | half = True, 49 | int8 = False, 50 | tied = True, 51 | cls_head = None, 52 | post_layer_norm = False, 53 | sop_tok_id = 50006, 54 | eop_tok_id = 50007, 55 | mask_tok_id = 50008, 56 | ): 57 | 58 | super().__init__() 59 | 60 | self.vocab_size = vocab_size 61 | self.dim_model = dim_model 62 | self.num_heads = num_heads 63 | self.dim_head = dim_head 64 | self.dim_ff = dim_ff 65 | self.num_layers = num_layers 66 | self.dropout_p = dropout_p 67 | self.emb_init_mean = emb_init_mean 68 | self.emb_init_std = emb_init_std 69 | self.pos_bias_type = pos_bias_type 70 | self.position_size = position_size 71 | self.norm_init_var = norm_init_var 72 | self.norm_bias = norm_bias 73 | self.norm_eps = norm_eps 74 | self.att_init_mean = att_init_mean 75 | self.att_init_std = att_init_std 76 | self.att_bias = att_bias 77 | self.att_mask_value = att_mask_value 78 | self.ffn_init_mean = ffn_init_mean 79 | self.ffn_init_std = ffn_init_std 80 | self.ffn_bias = ffn_bias 81 | self.ffn_activate_fn = ffn_activate_fn 82 | self.proj_init_mean = proj_init_mean 83 | self.proj_init_std = proj_init_std 84 | self.proj_bias = proj_bias 85 | self.length_scale = length_scale 86 | self.attn_scale = attn_scale 87 | self.int8 = int8 88 | self.tied = tied 89 | if half: 90 | self.dtype = torch.half 91 | else: 92 | self.dtype = torch.float 93 | self.cls_head = cls_head 94 | self.post_layer_norm = post_layer_norm 95 | self.sop_tok_id = sop_tok_id 96 | self.eop_tok_id = eop_tok_id 97 | self.mask_tok_id = mask_tok_id -------------------------------------------------------------------------------- /model_center/model/config/gpt2_config.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | from .config import Config 18 | 19 | class GPT2Config(Config): 20 | """ 21 | This is a configuration class that stores the configuration of the GPT-2 model, which inherits from the Config class. 22 | It is used to instantiate the Bert model according to the specified parameters and define the model architecture. 23 | You can set specific parameters to control the output of the model. 
24 | 25 | For example: 26 | [`dim_model`] is used to determine the Dimension of the encoder layers. 27 | You can choose to use the default value of 768 or customize their dimensions. 28 | 29 | """ 30 | 31 | def __init__(self, vocab_size = 50258, 32 | dim_model = 1024, 33 | num_heads = 16, 34 | dim_head = 64, 35 | dim_ff = 4096, 36 | num_layers = 24, 37 | dropout_p = 0.0, 38 | emb_init_mean = 0.0, 39 | emb_init_std = 0.02, 40 | pos_bias_type = "none", 41 | position_size = 1024, 42 | norm_init_var = 1.0, 43 | norm_bias = True, 44 | norm_eps = 1e-5, 45 | att_init_mean = 0.0, 46 | att_init_std = 0.02, 47 | att_bias = True, 48 | att_mask_value = float("-1e4"), 49 | ffn_init_mean = 0.0, 50 | ffn_init_std = 0.02, 51 | ffn_bias = True, 52 | ffn_activate_fn = "gelu", 53 | proj_init_mean = 0.0, 54 | proj_init_std = 0.02, 55 | proj_bias = True, 56 | length_scale = False, 57 | attn_scale = True, 58 | half = True, 59 | int8 = False, 60 | tied = True, 61 | cls_head = None, 62 | post_layer_norm = False, 63 | ): 64 | 65 | super().__init__() 66 | 67 | self.vocab_size = vocab_size 68 | self.dim_model = dim_model 69 | self.num_heads = num_heads 70 | self.dim_head = dim_head 71 | self.dim_ff = dim_ff 72 | self.num_layers = num_layers 73 | self.dropout_p = dropout_p 74 | self.emb_init_mean = emb_init_mean 75 | self.emb_init_std = emb_init_std 76 | self.pos_bias_type = pos_bias_type 77 | self.position_size = position_size 78 | self.norm_init_var = norm_init_var 79 | self.norm_bias = norm_bias 80 | self.norm_eps = norm_eps 81 | self.att_init_mean = att_init_mean 82 | self.att_init_std = att_init_std 83 | self.att_bias = att_bias 84 | self.att_mask_value = att_mask_value 85 | self.ffn_init_mean = ffn_init_mean 86 | self.ffn_init_std = ffn_init_std 87 | self.ffn_bias = ffn_bias 88 | self.ffn_activate_fn = ffn_activate_fn 89 | self.proj_init_mean = proj_init_mean 90 | self.proj_init_std = proj_init_std 91 | self.proj_bias = proj_bias 92 | self.length_scale = length_scale 93 | self.attn_scale = attn_scale 94 | self.int8 = int8 95 | self.tied = tied 96 | if half: 97 | self.dtype = torch.half 98 | else: 99 | self.dtype = torch.float 100 | self.cls_head = cls_head 101 | self.post_layer_norm = post_layer_norm -------------------------------------------------------------------------------- /model_center/model/config/gptj_config.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | import torch 16 | from .config import Config 17 | 18 | class GPTjConfig(Config): 19 | """ 20 | This is a configuration class that stores the configuration of the GPT-J model, which inherits from the Config class. 21 | It is used to instantiate the Bert model according to the specified parameters and define the model architecture. 22 | You can set specific parameters to control the output of the model. 
23 | 24 | For example: 25 | [`dim_model`] is used to determine the Dimension of the encoder layers. 26 | You can choose to use the default value of 4096 or customize their dimensions. 27 | 28 | """ 29 | def __init__(self, vocab_size=50400, 30 | dim_model=4096, 31 | num_heads=16, 32 | dim_head=256, 33 | dim_ff=16384, 34 | num_layers=28, 35 | dropout_p=0, 36 | emb_init_mean = 0.0, 37 | emb_init_std = 1, 38 | pos_bias_type = "rotary", 39 | pos_rotary_dim = 64, 40 | norm_init_var = 1.0, 41 | norm_bias = True, 42 | norm_eps = 1e-5, 43 | att_init_mean = 0.0, 44 | att_init_std = 0.1, 45 | att_bias = False, 46 | att_mask_value = float("-inf"), 47 | ffn_init_mean = 0.0, 48 | ffn_init_std = 0.1, 49 | ffn_bias = True, 50 | ffn_activate_fn = "gelu", 51 | proj_init_mean = 0.0, 52 | proj_init_std = 1, 53 | proj_bias = True, 54 | length_scale = False, 55 | attn_scale = True, 56 | half = True, 57 | int8 = False, 58 | tied = False, 59 | cls_head = None, 60 | post_layer_norm = False, 61 | ): 62 | 63 | super().__init__() 64 | 65 | self.vocab_size = vocab_size 66 | self.dim_model = dim_model 67 | self.num_heads = num_heads 68 | self.dim_head = dim_head 69 | self.dim_ff = dim_ff 70 | self.num_layers = num_layers 71 | self.dropout_p = dropout_p 72 | self.emb_init_mean = emb_init_mean 73 | self.emb_init_std = emb_init_std 74 | self.pos_bias_type = pos_bias_type 75 | self.pos_rotary_dim = pos_rotary_dim 76 | self.norm_init_var = norm_init_var 77 | self.norm_bias = norm_bias 78 | self.norm_eps = norm_eps 79 | self.att_init_mean = att_init_mean 80 | self.att_init_std = att_init_std 81 | self.att_bias = att_bias 82 | self.att_mask_value = att_mask_value 83 | self.ffn_init_mean = ffn_init_mean 84 | self.ffn_init_std = ffn_init_std 85 | self.ffn_bias = ffn_bias 86 | self.ffn_activate_fn = ffn_activate_fn 87 | self.proj_init_mean = proj_init_mean 88 | self.proj_init_std = proj_init_std 89 | self.proj_bias = proj_bias 90 | self.length_scale = length_scale 91 | self.attn_scale = attn_scale 92 | self.int8 = int8 93 | self.tied = tied 94 | if half: 95 | self.dtype = torch.half 96 | else: 97 | self.dtype = torch.float 98 | self.cls_head = cls_head 99 | self.post_layer_norm = post_layer_norm -------------------------------------------------------------------------------- /model_center/model/config/llama_config.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | import torch 16 | from .config import Config 17 | 18 | class LlamaConfig(Config): 19 | """ 20 | This is a configuration class that stores the configuration of the LLaMa model, which inherits from the Config class. 21 | It is used to instantiate the Bert model according to the specified parameters and define the model architecture. 22 | You can set specific parameters to control the output of the model. 
23 | 24 | For example: 25 | [`dim_model`] is used to determine the Dimension of the encoder layers. 26 | You can choose to use the default value of 4096 or customize their dimensions. 27 | 28 | """ 29 | def __init__(self, vocab_size=32000, 30 | dim_model=4096, 31 | num_heads=32, 32 | num_heads_kv=-1, 33 | dim_head=128, 34 | dim_ff=11008, 35 | num_layers=32, 36 | dropout_p=0, 37 | emb_init_mean = 0.0, 38 | emb_init_std = 0.02, 39 | pos_bias_type = "rotary", 40 | norm_init_var = 1.0, 41 | norm_bias = False, 42 | norm_eps = 1e-6, 43 | att_init_mean = 0.0, 44 | att_init_std = 0.02, 45 | att_bias = False, 46 | att_mask_value = float("-inf"), 47 | ffn_init_mean = 0.0, 48 | ffn_init_std = 0.02, 49 | ffn_bias = False, 50 | ffn_activate_fn = "gated_silu", 51 | proj_init_mean = 0.0, 52 | proj_init_std = 0.02, 53 | proj_bias = False, 54 | length_scale = False, 55 | attn_scale = True, 56 | half = True, 57 | int8 = False, 58 | tied = False, 59 | cls_head = None, 60 | post_layer_norm = False, 61 | ): 62 | 63 | super().__init__() 64 | 65 | self.vocab_size = vocab_size 66 | self.dim_model = dim_model 67 | self.num_heads = num_heads 68 | self.num_heads_kv = num_heads_kv if num_heads_kv != -1 else num_heads 69 | self.dim_head = dim_head 70 | self.dim_ff = dim_ff 71 | self.num_layers = num_layers 72 | self.dropout_p = dropout_p 73 | self.emb_init_mean = emb_init_mean 74 | self.emb_init_std = emb_init_std 75 | self.pos_bias_type = pos_bias_type 76 | self.norm_init_var = norm_init_var 77 | self.norm_bias = norm_bias 78 | self.norm_eps = norm_eps 79 | self.att_init_mean = att_init_mean 80 | self.att_init_std = att_init_std 81 | self.att_bias = att_bias 82 | self.att_mask_value = att_mask_value 83 | self.ffn_init_mean = ffn_init_mean 84 | self.ffn_init_std = ffn_init_std 85 | self.ffn_bias = ffn_bias 86 | self.ffn_activate_fn = ffn_activate_fn 87 | self.proj_init_mean = proj_init_mean 88 | self.proj_init_std = proj_init_std 89 | self.proj_bias = proj_bias 90 | self.length_scale = length_scale 91 | self.attn_scale = attn_scale 92 | self.int8 = int8 93 | self.tied = tied 94 | if half: 95 | self.dtype = torch.half 96 | else: 97 | self.dtype = torch.float 98 | self.cls_head = cls_head 99 | self.post_layer_norm = post_layer_norm -------------------------------------------------------------------------------- /model_center/model/config/opt_config.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | from .config import Config 18 | 19 | class OPTConfig(Config): 20 | """ 21 | This is a configuration class that stores the configuration of the OPT model, which inherits from the Config class. 22 | It is used to instantiate the Bert model according to the specified parameters and define the model architecture. 23 | You can set specific parameters to control the output of the model. 
24 | 25 | For example: 26 | [`dim_model`] is used to determine the Dimension of the encoder layers. 27 | You can choose to use the default value of 768 or customize their dimensions. 28 | 29 | """ 30 | 31 | def __init__(self, vocab_size = 50272, 32 | dim_model = 768, 33 | num_heads = 12, 34 | dim_head = 64, 35 | dim_ff = 3072, 36 | num_layers = 12, 37 | dropout_p = 0.1, 38 | emb_init_mean = 0.0, 39 | emb_init_std = 0.02, 40 | pos_bias_type = "none", 41 | pad_token_id = 1, 42 | prefix = "", 43 | position_size = 2048, 44 | norm_init_var = 1.0, 45 | norm_bias = True, 46 | norm_eps = 1e-5, 47 | att_init_mean = 0.0, 48 | att_init_std = 0.02, 49 | att_bias = True, 50 | att_mask_value = float("-65504"), 51 | ffn_init_mean = 0.0, 52 | ffn_init_std = 0.02, 53 | ffn_bias = True, 54 | ffn_activate_fn = "relu", 55 | length_scale = False, 56 | attn_scale = True, 57 | half = True, 58 | int8 = False, 59 | tied = True, 60 | cls_head = None, 61 | post_layer_norm = False, 62 | ): 63 | 64 | super().__init__() 65 | 66 | self.vocab_size = vocab_size 67 | self.dim_model = dim_model 68 | self.num_heads = num_heads 69 | self.dim_head = dim_head 70 | self.dim_ff = dim_ff 71 | self.num_layers = num_layers 72 | self.dropout_p = dropout_p 73 | self.emb_init_mean = emb_init_mean 74 | self.emb_init_std = emb_init_std 75 | self.pos_bias_type = pos_bias_type 76 | self.pad_token_id = pad_token_id 77 | self.prefix = prefix 78 | self.position_size = position_size 79 | self.norm_init_var = norm_init_var 80 | self.norm_bias = norm_bias 81 | self.norm_eps = norm_eps 82 | self.att_init_mean = att_init_mean 83 | self.att_init_std = att_init_std 84 | self.att_bias = att_bias 85 | self.att_mask_value = att_mask_value 86 | self.ffn_init_mean = ffn_init_mean 87 | self.ffn_init_std = ffn_init_std 88 | self.ffn_bias = ffn_bias 89 | self.ffn_activate_fn = ffn_activate_fn 90 | self.length_scale = length_scale 91 | self.attn_scale = attn_scale 92 | self.int8 = int8 93 | self.tied = tied 94 | if half: 95 | self.dtype = torch.half 96 | else: 97 | self.dtype = torch.float 98 | self.cls_head = cls_head 99 | self.post_layer_norm = post_layer_norm -------------------------------------------------------------------------------- /model_center/model/config/roberta_config.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | from .config import Config 18 | 19 | class RobertaConfig(Config): 20 | """ 21 | This is a configuration class that stores the configuration of the RoBERTa model, which inherits from the Config class. 22 | It is used to instantiate the Bert model according to the specified parameters and define the model architecture. 23 | You can set specific parameters to control the output of the model. 24 | 25 | For example: 26 | [`dim_model`] is used to determine the Dimension of the encoder layers and the pooler layer. 
27 | You can choose to use the default value of 768 or customize their dimensions. 28 | 29 | """ 30 | 31 | def __init__(self, vocab_size = 50265, 32 | type_size = 1, 33 | dim_model = 1024, 34 | num_heads = 16, 35 | dim_head = 64, 36 | dim_ff = 4096, 37 | num_layers = 24, 38 | dropout_p = 0.0, 39 | emb_init_mean = 0.0, 40 | emb_init_std = 0.02, 41 | pos_bias_type = "none", 42 | position_size = 514, 43 | norm_init_var = 1.0, 44 | norm_bias = True, 45 | norm_eps = 1e-05, 46 | att_init_mean = 0.0, 47 | att_init_std = 0.02, 48 | att_bias = True, 49 | att_mask_value = float("-1e4"), 50 | ffn_init_mean = 0.0, 51 | ffn_init_std = 0.02, 52 | ffn_bias = True, 53 | ffn_activate_fn = "gelu", 54 | proj_init_mean = 0.0, 55 | proj_init_std = 0.02, 56 | proj_bias = True, 57 | length_scale = False, 58 | attn_scale = True, 59 | half = True, 60 | int8 = False, 61 | tied = True, 62 | cls_head = None, 63 | post_layer_norm = True, 64 | pad_token_id = 1, 65 | ): 66 | 67 | super().__init__() 68 | 69 | self.vocab_size = vocab_size 70 | self.type_size = type_size 71 | self.position_size = position_size 72 | self.dim_model = dim_model 73 | self.num_heads = num_heads 74 | self.dim_head = dim_head 75 | self.dim_ff = dim_ff 76 | self.num_layers = num_layers 77 | self.dropout_p = dropout_p 78 | self.emb_init_mean = emb_init_mean 79 | self.emb_init_std = emb_init_std 80 | self.pos_bias_type = pos_bias_type 81 | self.norm_init_var = norm_init_var 82 | self.norm_bias = norm_bias 83 | self.norm_eps = norm_eps 84 | self.att_init_mean = att_init_mean 85 | self.att_init_std = att_init_std 86 | self.att_bias = att_bias 87 | self.att_mask_value = att_mask_value 88 | self.ffn_init_mean = ffn_init_mean 89 | self.ffn_init_std = ffn_init_std 90 | self.ffn_bias = ffn_bias 91 | self.ffn_activate_fn = ffn_activate_fn 92 | self.proj_init_mean = proj_init_mean 93 | self.proj_init_std = proj_init_std 94 | self.proj_bias = proj_bias 95 | self.length_scale = length_scale 96 | self.attn_scale = attn_scale 97 | self.int8 = int8 98 | self.tied = tied 99 | if half: 100 | self.dtype = torch.half 101 | else: 102 | self.dtype = torch.float 103 | self.cls_head = cls_head 104 | self.post_layer_norm = post_layer_norm 105 | self.pad_token_id = pad_token_id -------------------------------------------------------------------------------- /model_center/model/config/vit_config.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import torch 17 | from .config import Config 18 | 19 | class VitConfig(Config): 20 | """ 21 | This is a configuration class that stores the configuration of the Vit model, which inherits from the Config class. 22 | It is used to instantiate the vit model according to the specified parameters and define the model architecture. 23 | You can set specific parameters to control the output of the model. 
24 | 25 | For example: 26 | [`hidden_size`] is used to determine the Dimension of the encoder layers. 27 | You can choose to use the default value of 768 or customize their dimensions. 28 | 29 | """ 30 | def __init__(self, img_size=224, 31 | patch_size=16, 32 | channels_in=3, 33 | num_classes=1000, 34 | hidden_size=768, 35 | num_layers=12, 36 | num_heads=12, 37 | mlp_size=3072, 38 | attn_bias=True, 39 | attn_scale=None, 40 | norm_bias=True, 41 | ffn_bias=True, 42 | representation_size=None, 43 | drop=0., 44 | half=True, 45 | dtype=torch.float): 46 | 47 | super().__init__() 48 | 49 | self.img_size = img_size 50 | self.patch_size = patch_size 51 | self.channels_in = channels_in 52 | self.num_classes = num_classes 53 | self.hidden_size = hidden_size 54 | self.num_layers = num_layers 55 | self.num_heads = num_heads 56 | self.mlp_size = mlp_size 57 | self.attn_bias = attn_bias 58 | self.attn_scale = attn_scale 59 | self.norm_bias = norm_bias 60 | self.ffn_bias = ffn_bias 61 | self.representation_size = representation_size 62 | self.drop = drop 63 | if half: 64 | self.dtype = torch.half 65 | else: 66 | self.dtype = torch.float -------------------------------------------------------------------------------- /model_center/model/vit.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 The OpenBMB team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
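VitConfig above describes a ViT-Base/16-style encoder by default (224×224 input, 16×16 patches, 12 layers); note that, as in the other config classes, it is the half flag rather than the trailing dtype argument that determines the dtype actually stored. A short sketch:

    import torch
    from model_center.model import VitConfig

    config = VitConfig(img_size=384, num_classes=10, half=False)
    assert config.dtype == torch.float
    print((config.img_size // config.patch_size) ** 2)    # 576 patches at 384x384 with 16x16 patches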
15 | 16 | import torch 17 | from .basemodel import BaseModel 18 | from .config import VitConfig 19 | from ..layer import PatchEmbedding, Encoder, Linear 20 | 21 | class ViT(BaseModel): 22 | 23 | _CONFIG_TYPE = VitConfig 24 | def __init__(self, config: VitConfig): 25 | 26 | super().__init__() 27 | 28 | hidden_size = config.hidden_size 29 | self.num_features = config.hidden_size # num_features for consistency with other models 30 | self.patch_embed = PatchEmbedding( 31 | img_size=config.img_size, 32 | patch_size=config.patch_size, 33 | in_chans=config.channels_in, 34 | embed_dim=hidden_size, dtype=config.dtype) 35 | self.num_patches = self.patch_embed.num_patches 36 | 37 | self.pos_drop = torch.nn.Dropout(p=config.drop) 38 | self.representation_size = config.representation_size 39 | 40 | self.blocks = Encoder(num_layers=config.num_layers, 41 | dim_model=hidden_size,dim_ff=config.mlp_size, 42 | num_heads=config.num_heads, 43 | dim_head=hidden_size//config.num_heads, 44 | att_bias=config.attn_bias, 45 | attn_scale=True, 46 | dropout_p=config.drop, 47 | norm_bias=config.norm_bias, 48 | ffn_bias=config.ffn_bias, 49 | ffn_activate_fn="gelu", 50 | dtype=config.dtype) 51 | 52 | if self.representation_size is not None: 53 | self.representation_layer = Linear(hidden_size,config.representation_size) 54 | hidden_size = config.representation_size 55 | 56 | self.head = Linear(hidden_size, config.num_classes, dtype=config.dtype,bias=True) 57 | 58 | def forward(self, input_seq, register_blk=-1, attention_mask=None): 59 | batch = input_seq.shape[0] 60 | hidden_state = self.patch_embed(input_seq) 61 | device = input_seq.device 62 | if attention_mask is not None: 63 | attention_mask = attention_mask.to(torch.bool) 64 | else: 65 | attention_mask = torch.ones(self.num_patches+1, device=device,dtype=torch.int32)[None, :].repeat(batch, 1) 66 | attention_mask = attention_mask.view(batch, self.num_patches+1, 1) & attention_mask.view(batch, 1, self.num_patches+1) 67 | hidden_state = self.pos_drop(hidden_state) 68 | hidden_state = self.blocks(hidden_state,attention_mask=attention_mask) 69 | if self.representation_size is not None: 70 | hidden_state = self.representation_layer(hidden_state) 71 | hidden_state = torch.tanh(hidden_state) 72 | logits = self.head(hidden_state[:,0]) 73 | return logits 74 | -------------------------------------------------------------------------------- /model_center/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from .cpm1_tokenizer import CPM1Tokenizer 4 | from .cpm2_tokenizer import CPM2Tokenizer 5 | from .bert_tokenizer import BertTokenizer 6 | from .roberta_tokenizer import RobertaTokenizer 7 | from .t5_tokenizer import T5Tokenizer 8 | from .gpt2_tokenizer import GPT2Tokenizer 9 | from .gptj_tokenizer import GPTjTokenizer 10 | from .glm_tokenizer import GLMTokenizer 11 | from .opt_tokenizer import OPTTokenizer 12 | from .llama_tokenizer import LlamaTokenizer -------------------------------------------------------------------------------- /model_center/tokenizer/base_tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import os 4 | from typing import Union 5 | import torch 6 | import bmtrain as bmt 7 | from model_center.utils import check_web_and_convert_path 8 | 9 | class BaseTokenizer: 10 | """ 11 | The current implementation is mainly to adapt the training framework of the Transformers toolkit, 12 | and replace the original model 
implementation. 13 | TODO we will change to our SAM implementation in the future, which will be a more efficient tokenizer 14 | """ 15 | def __init__(self, tokenizer_type): 16 | self.tokenizer_type = tokenizer_type 17 | 18 | def from_pretrained(self, pretrained_model_name_or_path: Union[str, os.PathLike], *args, **kwargs): 19 | pretrained_model_name_or_path = check_web_and_convert_path(pretrained_model_name_or_path, 'tokenizer') 20 | return self.tokenizer_type.from_pretrained(pretrained_model_name_or_path, *args, **kwargs) 21 | -------------------------------------------------------------------------------- /model_center/tokenizer/bert_tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | # The current implementation is mainly to adapt the training framework of the Transformers toolkit, 4 | # and replace the original model implementation. 5 | # TODO we will change to our SAM implementation in the future, which will be a more efficient tokenizer 6 | 7 | from .base_tokenizer import BaseTokenizer 8 | from transformers import BertTokenizer as transformers_BertTokenizer 9 | 10 | BertTokenizer = BaseTokenizer(transformers_BertTokenizer) 11 | -------------------------------------------------------------------------------- /model_center/tokenizer/gpt2_tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | # The current implementation is mainly to adapt the training framework of the Transformers toolkit, 4 | # and replace the original model implementation. 5 | # TODO we will change to our SAM implementation in the future, which will be a more efficient tokenizer 6 | 7 | from .base_tokenizer import BaseTokenizer 8 | from transformers import GPT2Tokenizer as transformers_GPT2Tokenizer 9 | 10 | GPT2Tokenizer = BaseTokenizer(transformers_GPT2Tokenizer) 11 | -------------------------------------------------------------------------------- /model_center/tokenizer/gptj_tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | # The current implementation is mainly to adapt the training framework of the Transformers toolkit, 4 | # and replace the original model implementation. 5 | # TODO we will change to our SAM implementation in the future, which will be a more efficient tokenizer 6 | 7 | from .base_tokenizer import BaseTokenizer 8 | from transformers import AutoTokenizer as transformers_GPTjTokenizer 9 | 10 | GPTjTokenizer = BaseTokenizer(transformers_GPTjTokenizer) 11 | -------------------------------------------------------------------------------- /model_center/tokenizer/llama_tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | # The current implementation is mainly to adapt the training framework of the Transformers toolkit, 4 | # and replace the original model implementation. 
5 | # TODO we will change to our SAM implementation in the future, which will be a more efficient tokenizer 6 | 7 | from .base_tokenizer import BaseTokenizer 8 | from transformers import LlamaTokenizer as LlamaTokenizerTransformers 9 | 10 | class LlamaTokenizerBase(BaseTokenizer): 11 | def from_pretrained(self, pretrained_model_name_or_path, *args, **kwargs): 12 | tokenizer = super().from_pretrained(pretrained_model_name_or_path, *args, **kwargs) 13 | tokenizer.bos_token_id = 1 14 | tokenizer.eos_token_id = 2 15 | return tokenizer 16 | 17 | LlamaTokenizer = LlamaTokenizerBase(LlamaTokenizerTransformers) 18 | -------------------------------------------------------------------------------- /model_center/tokenizer/opt_tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | # The current implementation is mainly to adapt the training framework of the Transformers toolkit, 4 | # and replace the original model implementation. 5 | # TODO we will change to our SAM implementation in the future, which will be a more efficient tokenizer 6 | 7 | from .base_tokenizer import BaseTokenizer 8 | from transformers import GPT2Tokenizer as transformers_OPTTokenizer 9 | 10 | OPTTokenizer = BaseTokenizer(transformers_OPTTokenizer) -------------------------------------------------------------------------------- /model_center/tokenizer/roberta_tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | # The current implementation is mainly to adapt the training framework of the Transformers toolkit, 4 | # and replace the original model implementation. 5 | # TODO we will change to our SAM implementation in the future, which will be a more efficient tokenizer 6 | 7 | from .base_tokenizer import BaseTokenizer 8 | from transformers import RobertaTokenizer as transformers_RobertaTokenizer 9 | 10 | RobertaTokenizer = BaseTokenizer(transformers_RobertaTokenizer) 11 | -------------------------------------------------------------------------------- /model_center/tokenizer/t5_tokenizer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | # The current implementation is mainly to adapt the training framework of the Transformers toolkit, 4 | # and replace the original model implementation. 
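LlamaTokenizerBase above post-processes the loaded Hugging Face tokenizer so that the bos/eos token ids are pinned to 1 and 2, the values used by the original LLaMA vocabulary, regardless of what the checkpoint's tokenizer config says. Usage matches the other wrappers (the path below is a placeholder):

    from model_center.tokenizer import LlamaTokenizer

    tokenizer = LlamaTokenizer.from_pretrained("path/to/llama-7b")   # local dir or ModelCenter identifier
    assert tokenizer.bos_token_id == 1 and tokenizer.eos_token_id == 2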
5 | # TODO we will change to our SAM implementation in the future, which will be a more efficient tokenizer 6 | 7 | from .base_tokenizer import BaseTokenizer 8 | from transformers import T5Tokenizer as transformers_T5Tokenizer 9 | 10 | T5Tokenizer = BaseTokenizer(transformers_T5Tokenizer) 11 | -------------------------------------------------------------------------------- /model_center/tools/run_preprocess.sh: -------------------------------------------------------------------------------- 1 | for ((i=$1; i<$2; i++)); do 2 | { 3 | python3 /mnt/sfs_turbo/hx/ModelCenter/src/tools/preprocess_cpm1_lm.py --uid $i 4 | } 5 | done 6 | 7 | -------------------------------------------------------------------------------- /model_center/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .print_utils import print_inspect 2 | from .net_utils import check_web_and_convert_path -------------------------------------------------------------------------------- /model_center/utils/net_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
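The tokenizer wrappers collected above are thin shims: each BaseTokenizer instance resolves the path or identifier with check_web_and_convert_path and then defers entirely to the corresponding Hugging Face class, so the object returned by from_pretrained is an ordinary transformers tokenizer. Typical usage, mirroring the test scripts:

    from model_center.tokenizer import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")   # returns a transformers BertTokenizer
    ids = tokenizer.encode("ModelCenter makes big models easy")
    print(tokenizer.decode(ids))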
15 | import os 16 | import requests 17 | import tqdm 18 | import bmtrain as bmt 19 | 20 | file_names = { 21 | 'config': ['config.json'], 22 | 'model': ['pytorch_model.pt'], 23 | 'tokenizer': ['vocab.json', 'vocab.txt', 'merges.txt', 'tokenizer.json', 'added_tokens.json', 'special_tokens_map.json', 'tokenizer_config.json', 'spiece.model', 'vocab.model'], 24 | } 25 | 26 | def download(path, url): 27 | req = requests.get(url, stream=True) 28 | try: 29 | os.makedirs(os.path.dirname(path), exist_ok=True) 30 | file = open(path, "wb") 31 | req.raise_for_status() 32 | print(f"download from web, cache will be save to: {path}") 33 | content_length = req.headers.get("Content-Length") 34 | total = int(content_length) if content_length is not None else None 35 | progress = tqdm.tqdm( 36 | unit="B", 37 | unit_scale=True, 38 | unit_divisor=1024, 39 | total=total, 40 | desc="Downloading", 41 | ) 42 | for chunk in req.iter_content(chunk_size=1024): 43 | if chunk: 44 | progress.update(len(chunk)) 45 | file.write(chunk) 46 | progress.close() 47 | file.close() 48 | except: 49 | file.close() 50 | os.remove(path) 51 | 52 | def check_web_and_convert_path(path, load_type): # TODO add hash 53 | if os.path.isdir(path): 54 | try: 55 | bmt.print_rank(f"load from local file: {path}") 56 | except: 57 | pass 58 | return path 59 | else: 60 | if bmt.rank() == 0: 61 | url = f"https://openbmb.oss-cn-hongkong.aliyuncs.com/model_center/{path}" 62 | try: 63 | requests.get(f'{url}/config.json', stream=True).raise_for_status() # use config.json to check if identifier is valid 64 | except: 65 | raise ValueError(f"'{path}' is not a valid model identifier") 66 | cache_path = os.path.expanduser(f"~/.cache/model_center/{path}") 67 | for name in file_names[load_type]: 68 | p = os.path.join(cache_path, name) 69 | if os.path.exists(p): 70 | bmt.print_rank(f"load from cache: {p}") 71 | else: 72 | if bmt.rank() == 0: 73 | download(p, f"{url}/{name}") 74 | else: 75 | cache_path = os.path.expanduser(f"~/.cache/model_center/{path}") 76 | try: 77 | bmt.synchronize() 78 | except: 79 | pass 80 | return cache_path 81 | 82 | -------------------------------------------------------------------------------- /model_center/utils/print_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The OpenBMB team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | import torch 16 | import bmtrain as bmt 17 | 18 | def print_inspect(model : torch.nn.Module, param_name : str, prefix : str = ''): 19 | """Inspect the model and print the summary of the parameters on rank 0. 20 | 21 | Args: 22 | model (torch.nn.Module): The model to be inspected. 23 | param_name (str): The name of the parameter to be inspected. The wildcard '*' can be used to match multiple parameters. 24 | prefix (str): The prefix of the parameter name. 
25 | 26 | Example: 27 | >>> from model_center.utils import print_inspect 28 | >>> print_inspect(model, "*.linear*") 29 | name shape max min std mean grad_std grad_mean 30 | ... 31 | 32 | """ 33 | bmt.print_rank( 34 | bmt.inspect.format_summary( 35 | bmt.inspect.inspect_model(model, param_name, prefix) 36 | ) 37 | ) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.10 2 | bmtrain 3 | transformers 4 | jieba -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import os 3 | 4 | def main(): 5 | setup( 6 | name='model-center', 7 | version='1.0.3', 8 | description="example codes for big models using bmtrain", 9 | author="Weilin Zhao", 10 | author_email="acha131441373@gmail.com", 11 | packages=find_packages(), 12 | url="https://github.com/OpenBMB/ModelCenter", 13 | install_requires=[ 14 | "bmtrain", 15 | "transformers>=4.28.0", 16 | "jieba", 17 | ], 18 | keywords="CPM, cuda, AI, model, transformer", 19 | license='Apache 2.0', 20 | ) 21 | 22 | if __name__ == '__main__': 23 | main() 24 | -------------------------------------------------------------------------------- /tests/test.sh: -------------------------------------------------------------------------------- 1 | MASTER_ADDR=localhost 2 | MASTER_PORT=12347 3 | NNODES=1 4 | NODE_RANK=0 5 | GPUS_PER_NODE=2 6 | 7 | DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE \ 8 | --nnodes $NNODES \ 9 | --node_rank $NODE_RANK \ 10 | --master_addr $MASTER_ADDR \ 11 | --master_port $MASTER_PORT" 12 | 13 | # cd ../ 14 | # python3 setup.py install 15 | # cd - 16 | 17 | # python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} test_vit.py 18 | # python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} test_bert_pkv.py 19 | # python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} test_bert.py 20 | # python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} test_roberta.py 21 | # python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} test_t5.py 22 | # python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} test_t5v1_1.py 23 | # python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} test_flan_t5.py 24 | # python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} test_mt5.py 25 | # python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} test_gpt2.py 26 | # python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} test_gptj.py 27 | # python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} test_glm.py 28 | # python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} test_opt.py 29 | python3 -m torch.distributed.launch ${DISTRIBUTED_ARGS} test_llama.py 30 | -------------------------------------------------------------------------------- /tests/test_bert.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import torch 4 | import bmtrain as bmt 5 | 6 | from model_center.tokenizer import BertTokenizer 7 | from model_center.model import BertConfig, Bert 8 | 9 | from transformers import BertForMaskedLM as hugBert 10 | 11 | def main(): 12 | bmt.init_distributed() 13 | 14 | path = "bert-base-uncased" 15 | tokenizer = BertTokenizer.from_pretrained(path) 16 | config = BertConfig.from_pretrained(path) 17 | config.dropout_p = 0 18 | bmt_bert = Bert.from_pretrained(path, config=config) 19 | 20 | hug_bert = 
hugBert.from_pretrained(path).cuda().eval().half() 21 | 22 | for i in range(10): 23 | batch = 1 24 | max_encoder_length = 512 25 | input_ids = torch.randint(config.vocab_size, (batch, max_encoder_length,), dtype=torch.int32).cuda() 26 | length = torch.randint(max_encoder_length, (batch, ), dtype=torch.int32).cuda() 27 | attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device)[None, :].repeat(input_ids.shape[0], 1) < length[:, None] 28 | 29 | bmt_logits = bmt_bert(input_ids = input_ids, attention_mask = attention_mask, output_logits=True).logits 30 | hug_logits = hug_bert(input_ids = input_ids, attention_mask = attention_mask).logits 31 | b = bmt_logits*attention_mask[:,:,None] 32 | h = hug_logits*attention_mask[:,:,None] 33 | d = (h - b).abs() 34 | print(d.max()) 35 | b_emb=bmt_bert._modules['input_embedding'] 36 | h_emb=hug_bert._modules['bert']._modules['embeddings']._modules['word_embeddings'] 37 | emb_grad=[] 38 | def hook(name): 39 | def backward_hook(module, grad_input, grad_output): 40 | emb_grad.append(grad_output[0]) 41 | return backward_hook 42 | h_emb.register_full_backward_hook(hook("h")) 43 | b_emb.register_full_backward_hook(hook("b")) 44 | loss_func = torch.nn.CrossEntropyLoss() 45 | labels=torch.randint(config.vocab_size, (batch, max_encoder_length,), dtype=torch.long).cuda() 46 | loss1 = loss_func(b.view(-1,b.shape[-1]), labels.view(-1)) 47 | loss2 = loss_func(h.view(-1,h.shape[-1]), labels.view(-1)) 48 | loss1.backward() 49 | loss2.backward() 50 | if i>0: 51 | d_grad=(emb_grad[0]-emb_grad[1]).abs() 52 | print(d_grad.max()) 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /tests/test_bert_pkv.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import torch 4 | import bmtrain as bmt 5 | 6 | from model_center.tokenizer import BertTokenizer 7 | from model_center.model import BertConfig, Bert 8 | 9 | from transformers import BertModel as hugBert 10 | 11 | def main(): 12 | bmt.init_distributed() 13 | 14 | path = "bert-base-uncased" 15 | config = BertConfig.from_pretrained(path) 16 | config.dropout_p = 0 17 | bmt_bert = Bert.from_pretrained(path, config=config) 18 | 19 | cur_len = 0 20 | add_len = 8 21 | bmt_pkv = None 22 | hug_pkv = None 23 | 24 | input_ids_list = [] 25 | logits_list = [] 26 | attention_mask_all = None 27 | 28 | for _ in range(40): 29 | batch = 2 30 | input_ids = torch.randint(config.vocab_size, (batch, add_len,), dtype=torch.int32).cuda() 31 | attention_mask = torch.randint(2,(batch, add_len, add_len + cur_len), dtype=torch.int32).cuda() 32 | 33 | bmt_res = bmt_bert(input_ids = input_ids, attention_mask = attention_mask, use_cache = True, past_key_values = bmt_pkv) 34 | bmt_pkv = bmt_res.past_key_values 35 | bmt_logits = bmt_res.last_hidden_state 36 | 37 | input_ids_list.append(input_ids) 38 | logits_list.append(bmt_logits) 39 | if attention_mask_all is None: 40 | attention_mask_all = attention_mask 41 | else: 42 | attention_mask_all = torch.cat([attention_mask_all, torch.zeros(batch, cur_len, add_len).cuda()], dim=2) 43 | attention_mask_all = torch.cat([attention_mask_all, attention_mask], dim=1) 44 | 45 | cur_len += add_len 46 | 47 | input_ids = torch.cat(input_ids_list, dim=1) 48 | logits_pkv = torch.cat(logits_list, dim=1) 49 | logits = bmt_bert(input_ids = input_ids, attention_mask = attention_mask_all).last_hidden_state 50 | print((logits - logits_pkv).abs().max()) 51 | 52 | if __name__ == 
"__main__": 53 | main() 54 | -------------------------------------------------------------------------------- /tests/test_flan_t5.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import torch 4 | import bmtrain as bmt 5 | 6 | from model_center.tokenizer import T5Tokenizer 7 | from model_center.model import T5Config, T5 8 | 9 | from transformers import T5ForConditionalGeneration as hugT5 10 | 11 | import sys 12 | 13 | def main(): 14 | bmt.init_distributed() 15 | 16 | ver = "xl" 17 | 18 | path = f"flan-t5-{ver}" 19 | tokenizer = T5Tokenizer.from_pretrained(path) 20 | config = T5Config.from_pretrained(path) 21 | bmt_t5 = T5.from_pretrained(path) 22 | 23 | path = f"google/flan-t5-{ver}" 24 | hug_t5 = hugT5.from_pretrained(path).cuda() 25 | 26 | for _ in range(10): 27 | batch = 1 28 | max_encoder_length = 512 29 | max_decoder_length = 512 30 | input_ids = torch.randint(config.vocab_size, (batch, max_encoder_length,), dtype=torch.int32).cuda() 31 | length = torch.randint(max_encoder_length, (batch, ), dtype=torch.int32).cuda() 32 | decoder_input_ids = torch.randint(config.vocab_size, (batch, max_decoder_length,), dtype=torch.int32).cuda() 33 | decoder_length = torch.randint(max_decoder_length, (batch, ), dtype=torch.int32).cuda() 34 | attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device)[None, :].repeat(input_ids.shape[0], 1) < length[:, None] 35 | decoder_attention_mask = torch.arange(decoder_input_ids.shape[1], device=decoder_input_ids.device)[None, :].repeat(decoder_input_ids.shape[0], 1) < decoder_length[:, None] 36 | 37 | bmt_logits = bmt_t5(input_ids = input_ids, attention_mask = attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, output_logits=True).logits 38 | hug_logits = hug_t5(input_ids = input_ids, attention_mask = attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask).logits 39 | mask = decoder_attention_mask[:,:,None] 40 | b = bmt_logits * mask 41 | h = hug_logits * mask 42 | d = (h - b).abs() 43 | print(d.max()) 44 | 45 | if __name__ == "__main__": 46 | main() 47 | 48 | -------------------------------------------------------------------------------- /tests/test_gpt2.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import torch 4 | import bmtrain as bmt 5 | 6 | from model_center.tokenizer import GPT2Tokenizer 7 | from model_center.model import GPT2Config, GPT2 8 | from transformers import GPT2LMHeadModel as hugGPT2 9 | 10 | def main(): 11 | bmt.init_distributed() 12 | 13 | path = "gpt2-base" 14 | tokenizer = GPT2Tokenizer.from_pretrained(path) 15 | config = GPT2Config.from_pretrained(path) 16 | config.dropout_p = 0 17 | bmt_gpt2 = GPT2.from_pretrained(path, config=config) 18 | 19 | hug_gpt2 = hugGPT2.from_pretrained('gpt2').cuda().eval().half() 20 | def hook(name): 21 | def backward_hook(module, grad_input, grad_output): 22 | emb_grad[name]=grad_output[0] 23 | return backward_hook 24 | for i in range(10): 25 | batch = 1 26 | max_encoder_length = 512 27 | input_ids = torch.randint(config.vocab_size, (batch, max_encoder_length,), dtype=torch.int32).cuda() 28 | length = torch.randint(max_encoder_length, (batch, ), dtype=torch.int32).cuda() 29 | attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device)[None, :].repeat(input_ids.shape[0], 1) < length[:, None] 30 | 31 | bmt_logits = bmt_gpt2(input_ids = input_ids, attention_mask = 
attention_mask, output_logits=True).logits 32 | hug_logits = hug_gpt2(input_ids = input_ids, attention_mask = attention_mask).logits 33 | b = bmt_logits*attention_mask[:,:,None] 34 | h = hug_logits*attention_mask[:,:,None] 35 | d = (h - b).abs() 36 | print(d.max()) 37 | b_emb=bmt_gpt2._modules['input_embedding'] 38 | h_emb=hug_gpt2._modules['transformer']._modules['wte'] 39 | emb_grad={} 40 | h_emb.register_full_backward_hook(hook("h")) 41 | b_emb.register_full_backward_hook(hook("b")) 42 | loss_func = torch.nn.CrossEntropyLoss() 43 | labels=torch.randint(config.vocab_size, (batch, max_encoder_length,), dtype=torch.long).cuda() 44 | loss1 = loss_func(b.view(-1,b.shape[-1]), labels.view(-1)) 45 | loss2 = loss_func(h.view(-1,h.shape[-1]), labels.view(-1)) 46 | loss1.backward() 47 | loss2.backward() 48 | if i>0: 49 | d_grad=(emb_grad["h"]-emb_grad["b"]).abs() 50 | print(d_grad.max()) 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /tests/test_gpt_pkv.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import torch 4 | import bmtrain as bmt 5 | 6 | from model_center.tokenizer import BertTokenizer 7 | from model_center.model import BertConfig, Bert 8 | from transformers import BertModel as hugBert 9 | from model_center.model import GPT2Config, GPT2 10 | from transformers import GPT2Model as hugGPT2 11 | 12 | 13 | def main(): 14 | bmt.init_distributed() 15 | 16 | # path = "bert-base-uncased" 17 | # config = BertConfig.from_pretrained(path) 18 | # config.dropout_p = 0 19 | # bmt_bert = Bert.from_pretrained(path, config=config) 20 | # hug_bert = hugBert.from_pretrained(path).cuda().eval().half() 21 | 22 | path = "gpt2-base" 23 | config = GPT2Config.from_pretrained(path, use_cache = True) 24 | config.dropout_p = 0 25 | bmt_bert = GPT2.from_pretrained(path, config=config) 26 | hug_bert = hugGPT2.from_pretrained('gpt2').cuda().eval().half() 27 | 28 | cur_len = 0 29 | add_len = 1 30 | bmt_pkv = None 31 | hug_pkv = None 32 | 33 | input_ids_list = [] 34 | bmt_logits_list = [] 35 | hug_logits_list = [] 36 | 37 | for _ in range(100): 38 | 39 | batch = 2 40 | input_ids = torch.randint(config.vocab_size, (batch, add_len,), dtype=torch.int32).cuda() 41 | attention_mask = torch.ones((batch, add_len + cur_len, add_len), dtype=torch.int32).cuda() 42 | attention_mask_1 = torch.ones(((batch, add_len + cur_len)), dtype=torch.int32).cuda() 43 | 44 | bmt_res = bmt_bert(input_ids = input_ids, attention_mask = attention_mask, use_cache = True, past_key_values = bmt_pkv) 45 | bmt_pkv = bmt_res.past_key_values 46 | bmt_logits = bmt_res.last_hidden_state 47 | bmt_logits_list.append(bmt_logits) 48 | 49 | input_ids_list.append(input_ids) 50 | hug_res = hug_bert(input_ids = input_ids, attention_mask = attention_mask_1, use_cache = True, past_key_values = hug_pkv) 51 | hug_pkv = hug_res.past_key_values 52 | hug_logits = hug_res.last_hidden_state 53 | hug_logits_list.append(hug_logits) 54 | 55 | cur_len += add_len 56 | 57 | bmt_logits_pkv = torch.cat(bmt_logits_list, dim=1) 58 | hug_logits_pkv = torch.cat(hug_logits_list, dim=1) 59 | print((bmt_logits_pkv - hug_logits_pkv).abs().mean()) 60 | 61 | input_ids = torch.cat(input_ids_list, dim=1) 62 | logits = bmt_bert(input_ids = input_ids, attention_mask = torch.ones((2, cur_len), dtype=torch.int32).cuda()).last_hidden_state 63 | print((logits - bmt_logits_pkv).abs().mean()) 64 | 65 | if __name__ == "__main__": 66 | main() 67 | 
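# Note on test_gpt_pkv.py above: despite the bmt_bert / hug_bert names, both objects hold GPT-2
# models (the BERT setup is commented out). The loop feeds one token per step with use_cache=True
# and accumulates past_key_values; the first printed value compares the incremental ModelCenter
# and HuggingFace outputs, the second compares the incremental ModelCenter outputs against a
# single full-sequence forward pass. Both differences are expected to stay small (up to fp16
# rounding) when the key/value cache is handled consistently.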
-------------------------------------------------------------------------------- /tests/test_gptj.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import torch 4 | import bmtrain as bmt 5 | 6 | from model_center.tokenizer import GPTjTokenizer 7 | from model_center.model import GPTjConfig, GPTj 8 | 9 | from transformers import GPTJForCausalLM as hugGPTj 10 | 11 | def main(): 12 | bmt.init_distributed() 13 | 14 | tokenizer = GPTjTokenizer.from_pretrained("/yinxr/zwl/.cache/model_center/gptj-6b") 15 | config = GPTjConfig.from_pretrained("/yinxr/zwl/.cache/model_center/gptj-6b") 16 | config.dropout_p = 0 17 | bmt_gptj = GPTj.from_pretrained("/yinxr/zwl/.cache/model_center/gptj-6b") 18 | 19 | hug_gptj = hugGPTj.from_pretrained("/yinxr/zwl/.cache/transformer/EleutherAI/gpt-j-6B").cuda().eval().half() 20 | 21 | for _ in range(10): 22 | batch = 1 23 | max_encoder_length = 512 24 | input_ids = torch.randint(config.vocab_size, (batch, max_encoder_length,), dtype=torch.int32).cuda() 25 | length = torch.randint(max_encoder_length, (batch, ), dtype=torch.int32).cuda() 26 | attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device)[None, :].repeat(input_ids.shape[0], 1) < length[:, None] 27 | 28 | bmt_logits = bmt_gptj(input_ids = input_ids, attention_mask = attention_mask, output_logits=True).logits 29 | hug_logits = hug_gptj(input_ids = input_ids, attention_mask = attention_mask).logits 30 | b = (bmt_logits*attention_mask[:,:,None]) 31 | h = hug_logits*attention_mask[:,:,None] 32 | d = (h - b).abs() 33 | print(d.max()) 34 | 35 | if __name__ == "__main__": 36 | main() 37 | -------------------------------------------------------------------------------- /tests/test_llama.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import torch 4 | import bmtrain as bmt 5 | from model_center.model.config import LlamaConfig 6 | from model_center.model import Llama 7 | from model_center.tokenizer import LlamaTokenizer 8 | 9 | from transformers import LlamaForCausalLM 10 | from transformers import LlamaTokenizer as LlamaTokenizerHF 11 | 12 | def main(): 13 | # path = f"../results/llama-7b" 14 | # hf_path = f"../results/llama-7b-hf" 15 | # path = f"../results/llama-2-7b" 16 | # hf_path = f"../results/llama-2-7b-hf" 17 | path = f"../results/llama-2-13b" 18 | hf_path = f"../results/llama-2-13b-hf" 19 | 20 | tokenizer = LlamaTokenizer.from_pretrained(path) 21 | config = LlamaConfig.from_pretrained(path) 22 | bmt_llama = Llama.from_pretrained(path, config=config) 23 | hug_llama = LlamaForCausalLM.from_pretrained(hf_path).half().eval().cuda() 24 | 25 | for ith in range(1, 11): 26 | batch = 1 27 | max_encoder_length = ith * 16 28 | input_ids = torch.randint(config.vocab_size, (batch, max_encoder_length,), dtype=torch.int32).cuda() 29 | length = torch.randint(max_encoder_length, (batch, ), dtype=torch.int32).cuda() 30 | attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device)[None, :].repeat(input_ids.shape[0], 1) < length[:, None] 31 | 32 | bmt_logits = bmt_llama(input_ids = input_ids, attention_mask = attention_mask, output_logits=True).logits 33 | hug_logits = hug_llama(input_ids = input_ids, attention_mask = attention_mask).logits 34 | b = bmt_logits*attention_mask[:,:,None] 35 | h = hug_logits*attention_mask[:,:,None] 36 | d = (h - b).abs() 37 | if bmt.rank() == 0: 38 | print(d.max()) 39 | 40 | def generate(): 41 | # only one GPU is enough 42 | from 
model_center.generation.llama import LlamaBeamSearch, LlamaRandomSampling 43 | path = f"../results/llama-7b" 44 | 45 | tokenizer = LlamaTokenizer.from_pretrained(path) 46 | model = Llama.from_pretrained(path) 47 | 48 | beam_search = LlamaBeamSearch( 49 | model=model, 50 | tokenizer=tokenizer, 51 | ) 52 | random_search = LlamaRandomSampling( 53 | model=model, 54 | tokenizer=tokenizer, 55 | ) 56 | 57 | data_list = [ 58 | "Beijing is the capital of", 59 | "Steven Jobs", 60 | ] 61 | 62 | inference_results = beam_search.generate(data_list, max_length=100) 63 | print("beam search:") 64 | for res in inference_results: 65 | print(res) 66 | print("random sampling:") 67 | inference_results = random_search.generate(data_list, max_length=100) 68 | for res in inference_results: 69 | print(res) 70 | 71 | if __name__ == "__main__": 72 | bmt.init_distributed(seed=2333) 73 | main() 74 | # generate() 75 | -------------------------------------------------------------------------------- /tests/test_longformer.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | from cgitb import lookup 4 | import torch 5 | import bmtrain as bmt 6 | from model_center.tokenizer import BertTokenizer 7 | from model_center.model import Longformer 8 | from transformers import LongformerForMaskedLM 9 | from transformers import BertForMaskedLM as hugBert 10 | 11 | import sys 12 | def main(): 13 | bmt.init_distributed() 14 | 15 | bmt_bert = Longformer.from_pretrained("lawformer") 16 | hug_bert = LongformerForMaskedLM.from_pretrained("thunlp/Lawformer").cuda() 17 | bmt_bert.eval() 18 | hug_bert.eval() 19 | b_emb=bmt_bert._modules['input_embedding'] 20 | h_emb=hug_bert._modules['longformer']._modules['embeddings']._modules['word_embeddings'] 21 | for i in range(1): 22 | batch = 1 23 | max_encoder_length = 2048 24 | input_ids = torch.randint(21128, (batch, max_encoder_length,), dtype=torch.int32).cuda() 25 | length = torch.randint(max_encoder_length, (batch, ), dtype=torch.int32).cuda() 26 | attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device)[None, :].repeat(input_ids.shape[0], 1) < length[:, None] 27 | global_attn = torch.zeros(input_ids.shape[1],device=input_ids.device).repeat(input_ids.shape[0], 1) 28 | global_attn[:,:100] = 1 29 | bmt_logits = bmt_bert(input_ids = input_ids, return_logits=True,attention_mask=attention_mask,global_attention_mask=global_attn) 30 | hug_logits = hug_bert(input_ids = input_ids,attention_mask=attention_mask,global_attention_mask=global_attn).logits 31 | b = bmt_logits*attention_mask[:,:,None] 32 | h = hug_logits*attention_mask[:,:,None] 33 | d = (h - b).abs() 34 | emb_grad={} 35 | print(d.max()) 36 | def hook(name): 37 | def backward_hook(module, grad_input, grad_output): 38 | emb_grad[name]=grad_output[0] 39 | return backward_hook 40 | h_emb.register_full_backward_hook(hook("h")) 41 | b_emb.register_full_backward_hook(hook("b")) 42 | loss_func = torch.nn.CrossEntropyLoss() 43 | labels=torch.randint(21128, (batch, max_encoder_length,), dtype=torch.long).cuda() 44 | loss1 = loss_func(b.view(-1,b.shape[-1]), labels.view(-1)) 45 | loss2 = loss_func(h.view(-1,h.shape[-1]), labels.view(-1)) 46 | loss1.backward() 47 | loss2.backward() 48 | if i>0: 49 | d_grad=(emb_grad["h"]-emb_grad["b"]).abs() 50 | print(d_grad.max()) 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /tests/test_mt5.py: 
-------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import torch 4 | import bmtrain as bmt 5 | 6 | from model_center.tokenizer import T5Tokenizer 7 | from model_center.model import T5Config, T5 8 | 9 | from transformers import MT5ForConditionalGeneration as hugT5 10 | 11 | import sys 12 | 13 | def main(): 14 | bmt.init_distributed() 15 | 16 | ver = "large" 17 | 18 | path = f"mt5-{ver}" 19 | tokenizer = T5Tokenizer.from_pretrained(path) 20 | config = T5Config.from_pretrained(path) 21 | bmt_t5 = T5.from_pretrained(path) 22 | 23 | path = f"google/mt5-{ver}" 24 | hug_t5 = hugT5.from_pretrained(path).cuda() 25 | 26 | for _ in range(10): 27 | batch = 1 28 | max_encoder_length = 512 29 | max_decoder_length = 512 30 | input_ids = torch.randint(config.vocab_size, (batch, max_encoder_length,), dtype=torch.int32).cuda() 31 | length = torch.randint(max_encoder_length, (batch, ), dtype=torch.int32).cuda() 32 | decoder_input_ids = torch.randint(config.vocab_size, (batch, max_decoder_length,), dtype=torch.int32).cuda() 33 | decoder_length = torch.randint(max_decoder_length, (batch, ), dtype=torch.int32).cuda() 34 | attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device)[None, :].repeat(input_ids.shape[0], 1) < length[:, None] 35 | decoder_attention_mask = torch.arange(decoder_input_ids.shape[1], device=decoder_input_ids.device)[None, :].repeat(decoder_input_ids.shape[0], 1) < decoder_length[:, None] 36 | 37 | bmt_logits = bmt_t5(input_ids = input_ids, attention_mask = attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, output_logits=True).logits 38 | hug_logits = hug_t5(input_ids = input_ids, attention_mask = attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask).logits 39 | mask = decoder_attention_mask[:,:,None] 40 | b = bmt_logits * mask 41 | h = hug_logits * mask 42 | d = (h - b).abs() 43 | print(d.max()) 44 | 45 | if __name__ == "__main__": 46 | main() 47 | -------------------------------------------------------------------------------- /tests/test_opt.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import torch 4 | import bmtrain as bmt 5 | 6 | from model_center.tokenizer import OPTTokenizer 7 | from model_center.model import OPTConfig, OPT 8 | from transformers import OPTForCausalLM as hugOPT 9 | 10 | def main(): 11 | bmt.init_distributed() 12 | 13 | ver = "2.7b" 14 | path = f"opt-{ver}" 15 | tokenizer = OPTTokenizer.from_pretrained(path) 16 | config = OPTConfig.from_pretrained(path) 17 | config.dropout_p = 0 18 | bmt_opt = OPT.from_pretrained(path, config=config) 19 | 20 | hug_opt = hugOPT.from_pretrained(f'opt-{ver}').cuda().eval().half() 21 | def hook(name): 22 | def backward_hook(module, grad_input, grad_output): 23 | emb_grad[name]=grad_output[0] 24 | return backward_hook 25 | emb_grad={} 26 | for i in range(10): 27 | batch = 1 28 | max_encoder_length = 512 29 | input_ids = torch.randint(config.vocab_size, (batch, max_encoder_length,), dtype=torch.int32).cuda() 30 | length = torch.randint(max_encoder_length, (batch, ), dtype=torch.int32).cuda() 31 | attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device)[None, :].repeat(input_ids.shape[0], 1) < length[:, None] 32 | 33 | bmt_logits = bmt_opt(input_ids = input_ids, attention_mask = attention_mask, output_logits=True).logits 34 | hug_logits = hug_opt(input_ids = input_ids, attention_mask = 
attention_mask).logits 35 | b = bmt_logits*attention_mask[:,:,None] 36 | h = hug_logits*attention_mask[:,:,None] 37 | d = (h - b).abs() 38 | print(d.max()) 39 | if i == 0: 40 | b_emb=bmt_opt._modules['input_embedding'] 41 | h_emb=hug_opt._modules['model']._modules['decoder']._modules['embed_tokens'] 42 | h_emb.register_full_backward_hook(hook("h")) 43 | b_emb.register_full_backward_hook(hook("b")) 44 | else: 45 | emb_grad.clear() 46 | loss_func = torch.nn.CrossEntropyLoss() 47 | labels = torch.randint(config.vocab_size, (batch, max_encoder_length,), dtype=torch.long).cuda() 48 | loss1 = loss_func(b.view(-1,b.shape[-1]), labels.view(-1)) 49 | loss2 = loss_func(h.view(-1,h.shape[-1]), labels.view(-1)) 50 | loss1.backward() 51 | loss2.backward() 52 | if i>0: 53 | d_grad=(emb_grad["h"]-emb_grad["b"]).abs() 54 | print(d_grad.max()) 55 | if __name__ == "__main__": 56 | main() 57 | -------------------------------------------------------------------------------- /tests/test_roberta.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import torch 4 | import bmtrain as bmt 5 | from model_center.model.config import RobertaConfig 6 | from model_center.model import Roberta 7 | from model_center.tokenizer import RobertaTokenizer 8 | 9 | from transformers import BertTokenizer, RobertaForMaskedLM as hugRoberta 10 | 11 | def main(): 12 | bmt.init_distributed() 13 | 14 | # path = "roberta-base" 15 | path = "roberta-large" 16 | tokenizer = RobertaTokenizer.from_pretrained(path) 17 | config = RobertaConfig.from_pretrained(path) 18 | config.dropout_p = 0 19 | bmt_roberta = Roberta.from_pretrained(path, config=config) 20 | 21 | hug_roberta = hugRoberta.from_pretrained(path).cuda().eval().half() 22 | 23 | for _ in range(10): 24 | batch = 1 25 | max_encoder_length = 512 26 | input_ids = torch.randint(config.vocab_size, (batch, max_encoder_length,), dtype=torch.int32).cuda() 27 | length = torch.randint(max_encoder_length, (batch, ), dtype=torch.int32).cuda() 28 | attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device)[None, :].repeat(input_ids.shape[0], 1) < length[:, None] 29 | 30 | bmt_logits = bmt_roberta(input_ids = input_ids, attention_mask = attention_mask, output_logits=True).logits 31 | hug_logits = hug_roberta(input_ids = input_ids, attention_mask = attention_mask).logits 32 | b = bmt_logits*attention_mask[:,:,None] 33 | h = hug_logits*attention_mask[:,:,None] 34 | d = (h - b).abs() 35 | print(d.max()) 36 | 37 | if __name__ == "__main__": 38 | main() 39 | -------------------------------------------------------------------------------- /tests/test_t5.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import torch 4 | import bmtrain as bmt 5 | 6 | from model_center.tokenizer import T5Tokenizer 7 | from model_center.model import T5Config, T5 8 | 9 | from transformers import T5ForConditionalGeneration as hugT5 10 | 11 | def main(): 12 | path = "t5-base" 13 | tokenizer = T5Tokenizer.from_pretrained(path) 14 | config = T5Config.from_pretrained(path) 15 | config.scale = True 16 | bmt_t5 = T5.from_pretrained(path, config=config) 17 | 18 | hug_t5 = hugT5.from_pretrained(path).cuda() 19 | 20 | for _ in range(10): 21 | batch = 1 22 | max_encoder_length = 512 23 | max_decoder_length = 512 24 | input_ids = torch.randint(config.vocab_size, (batch, max_encoder_length,), dtype=torch.int32).cuda() 25 | length = torch.randint(max_encoder_length, (batch, ), dtype=torch.int32).cuda() 26 | 
decoder_input_ids = torch.randint(config.vocab_size, (batch, max_decoder_length,), dtype=torch.int32).cuda() 27 | decoder_length = torch.randint(max_decoder_length, (batch, ), dtype=torch.int32).cuda() 28 | attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device)[None, :].repeat(input_ids.shape[0], 1) < length[:, None] 29 | decoder_attention_mask = torch.arange(decoder_input_ids.shape[1], device=decoder_input_ids.device)[None, :].repeat(decoder_input_ids.shape[0], 1) < decoder_length[:, None] 30 | 31 | bmt_logits = bmt_t5(input_ids = input_ids, attention_mask = attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, output_logits=True).logits 32 | hug_logits = hug_t5(input_ids = input_ids, attention_mask = attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask).logits 33 | b = bmt_logits*decoder_attention_mask[:,:,None] 34 | h = hug_logits*decoder_attention_mask[:,:,None] 35 | d = (h - b).abs() 36 | print(d.max()) 37 | print(h / b) 38 | 39 | def generate(): 40 | # only one GPU is enough 41 | from model_center.generation.t5 import T5BeamSearch, T5RandomSampling 42 | path = f"../results/t5-3b" 43 | 44 | tokenizer = T5Tokenizer.from_pretrained(path) 45 | model = T5.from_pretrained(path) 46 | model.config.scale = True 47 | 48 | beam_search = T5BeamSearch( 49 | model=model, 50 | tokenizer=tokenizer, 51 | ) 52 | random_search = T5RandomSampling( 53 | model=model, 54 | tokenizer=tokenizer, 55 | ) 56 | 57 | data_list = [ 58 | "Beijing is the capital of", 59 | "Steven Jobs is one of the", 60 | "translate English to German. English: The house is wonderful. German:", 61 | ] 62 | 63 | inference_results = beam_search.generate(data_list, max_length=100) 64 | print("beam search:") 65 | for res in inference_results: 66 | print(res) 67 | print("random sampling:") 68 | inference_results = random_search.generate(data_list, max_length=100) 69 | for res in inference_results: 70 | print(res) 71 | 72 | if __name__ == "__main__": 73 | bmt.init_distributed() 74 | # main() 75 | generate() 76 | -------------------------------------------------------------------------------- /tests/test_t5v1_1.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import torch 4 | import bmtrain as bmt 5 | 6 | from model_center.tokenizer import T5Tokenizer 7 | from model_center.model import T5Config, T5 8 | 9 | from transformers import T5ForConditionalGeneration as hugT5 10 | 11 | import sys 12 | 13 | def main(): 14 | bmt.init_distributed() 15 | 16 | ver = "large" 17 | 18 | path = f"t5-v1_1-{ver}" 19 | tokenizer = T5Tokenizer.from_pretrained(path) 20 | config = T5Config.from_pretrained(path) 21 | bmt_t5 = T5.from_pretrained(path) 22 | 23 | path = f"google/t5-v1_1-{ver}" 24 | hug_t5 = hugT5.from_pretrained(path).cuda() 25 | 26 | for _ in range(10): 27 | batch = 1 28 | max_encoder_length = 512 29 | max_decoder_length = 512 30 | input_ids = torch.randint(config.vocab_size, (batch, max_encoder_length,), dtype=torch.int32).cuda() 31 | length = torch.randint(max_encoder_length, (batch, ), dtype=torch.int32).cuda() 32 | decoder_input_ids = torch.randint(config.vocab_size, (batch, max_decoder_length,), dtype=torch.int32).cuda() 33 | decoder_length = torch.randint(max_decoder_length, (batch, ), dtype=torch.int32).cuda() 34 | attention_mask = torch.arange(input_ids.shape[1], device=input_ids.device)[None, :].repeat(input_ids.shape[0], 1) < length[:, None] 35 | decoder_attention_mask = 
torch.arange(decoder_input_ids.shape[1], device=decoder_input_ids.device)[None, :].repeat(decoder_input_ids.shape[0], 1) < decoder_length[:, None] 36 | 37 | bmt_logits = bmt_t5(input_ids = input_ids, attention_mask = attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, output_logits=True).logits 38 | hug_logits = hug_t5(input_ids = input_ids, attention_mask = attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask).logits 39 | mask = decoder_attention_mask[:,:,None] 40 | b = bmt_logits * mask 41 | h = hug_logits * mask 42 | d = (h - b).abs() 43 | print(d.max()) 44 | 45 | if __name__ == "__main__": 46 | main() 47 | -------------------------------------------------------------------------------- /tests/test_vit.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | import torch 4 | import bmtrain as bmt 5 | 6 | from model_center.model import ViT,VitConfig 7 | from transformers import ViTForImageClassification 8 | 9 | 10 | 11 | def main(): 12 | bmt.init_distributed() 13 | 14 | path = "vit-base_patch16_224" 15 | config = VitConfig.from_pretrained(path) 16 | config.dropout_p = 0 17 | bmt_vit = ViT.from_pretrained(path, config=config) 18 | hug_vit = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224').cuda().half() 19 | def hook(name): 20 | def backward_hook(module, grad_input, grad_output): 21 | emb_grad[name]=grad_output[0] 22 | return backward_hook 23 | for i in range(10): 24 | with torch.autograd.set_detect_anomaly(True): 25 | batch = 12 26 | # max_encoder_length = 512 27 | patch_size=224 28 | channel_size=3 29 | # inputs = torch.randn((1,3,224,224),dtype = torch.half).cuda().to(memory_format=torch.channels_last) 30 | inputs = torch.randn((batch,channel_size,patch_size,patch_size),dtype = torch.half).cuda().to(memory_format=torch.channels_last) 31 | 32 | bmt_logits = bmt_vit(inputs) 33 | hug_logits = hug_vit(inputs).logits 34 | b = bmt_logits 35 | h = hug_logits 36 | d = (h - b).abs() 37 | print(d.max()) 38 | b_emb=bmt_vit.patch_embed.proj 39 | h_emb=hug_vit.vit.embeddings.patch_embeddings.projection 40 | emb_grad={} 41 | h_emb.register_full_backward_hook(hook("h")) 42 | b_emb.register_full_backward_hook(hook("b")) 43 | loss_func = torch.nn.CrossEntropyLoss(ignore_index=-100) 44 | labels=torch.randint(1000, (batch,), dtype=torch.long).cuda() 45 | loss1 = loss_func(b, labels) 46 | loss2 = loss_func(h, labels) 47 | loss1.backward() 48 | loss2.backward() 49 | if i>0: 50 | d_grad=(emb_grad["h"]-emb_grad["b"]).abs() 51 | print(d_grad.max()) 52 | if __name__ == "__main__": 53 | main() 54 | -------------------------------------------------------------------------------- /transfer/hugGPT2_bmtrainGPT2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The OpenBMB team. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from collections import OrderedDict 16 | import torch 17 | from tqdm import tqdm 18 | 19 | def main(): 20 | ver_layernum = [ 21 | ("base", 12), 22 | ("medium", 24), 23 | ("large", 36), 24 | ("xl", 48), 25 | ] 26 | ver, layernum = ver_layernum[0] 27 | inpath = f"../results/gpt2-{ver}-pytorch_model.bin" 28 | outpath = f"../results/GPT2-{ver}.pt" 29 | inp = torch.load(inpath) 30 | out = OrderedDict() 31 | out["input_embedding.weight"] = torch.cat([inp["wte.weight"], torch.zeros((1,inp["wte.weight"].shape[1]))], dim=0).contiguous() # original vocab size is an odd number 32 | out["position_embedding.weight"] = inp["wpe.weight"].contiguous() 33 | out["encoder.output_layernorm.weight"] = inp["ln_f.weight"].contiguous() 34 | out["encoder.output_layernorm.bias"] = inp["ln_f.bias"].contiguous() 35 | for i in range(layernum): 36 | prefix = f"encoder.layers.{i}" 37 | old_prefix = f"h.{i}" 38 | attn_size = inp[f"{old_prefix}.attn.c_attn.weight"].shape[0] 39 | out[f"{prefix}.self_att.layernorm_before_attention.weight"] = inp[f"{old_prefix}.ln_1.weight"].contiguous() 40 | out[f"{prefix}.self_att.layernorm_before_attention.bias"] = inp[f"{old_prefix}.ln_1.bias"].contiguous() 41 | out[f"{prefix}.self_att.self_attention.project_q.weight"] = inp[f"{old_prefix}.attn.c_attn.weight"][:, :attn_size].transpose(0,1).contiguous() 42 | out[f"{prefix}.self_att.self_attention.project_q.bias"] = inp[f"{old_prefix}.attn.c_attn.bias"][:attn_size].contiguous() 43 | out[f"{prefix}.self_att.self_attention.project_k.weight"] = inp[f"{old_prefix}.attn.c_attn.weight"][:, attn_size:2*attn_size].transpose(0,1).contiguous() 44 | out[f"{prefix}.self_att.self_attention.project_k.bias"] = inp[f"{old_prefix}.attn.c_attn.bias"][attn_size:2*attn_size].contiguous() 45 | out[f"{prefix}.self_att.self_attention.project_v.weight"] = inp[f"{old_prefix}.attn.c_attn.weight"][:, 2*attn_size:].transpose(0,1).contiguous() 46 | out[f"{prefix}.self_att.self_attention.project_v.bias"] = inp[f"{old_prefix}.attn.c_attn.bias"][2*attn_size:].contiguous() 47 | out[f"{prefix}.self_att.self_attention.attention_out.weight"] = inp[f"{old_prefix}.attn.c_proj.weight"].transpose(0,1).contiguous() 48 | out[f"{prefix}.self_att.self_attention.attention_out.bias"] = inp[f"{old_prefix}.attn.c_proj.bias"].contiguous() 49 | 50 | out[f"{prefix}.ffn.layernorm_before_ffn.weight"] = inp[f"{old_prefix}.ln_2.weight"].contiguous() 51 | out[f"{prefix}.ffn.layernorm_before_ffn.bias"] = inp[f"{old_prefix}.ln_2.bias"].contiguous() 52 | out[f"{prefix}.ffn.ffn.w_in.w.weight"] = inp[f"{old_prefix}.mlp.c_fc.weight"].transpose(0,1).contiguous() 53 | out[f"{prefix}.ffn.ffn.w_in.w.bias"] = inp[f"{old_prefix}.mlp.c_fc.bias"].contiguous() 54 | out[f"{prefix}.ffn.ffn.w_out.weight"] = inp[f"{old_prefix}.mlp.c_proj.weight"].transpose(0,1).contiguous() 55 | out[f"{prefix}.ffn.ffn.w_out.bias"] = inp[f"{old_prefix}.mlp.c_proj.bias"].contiguous() 56 | 57 | for k, v in out.items(): 58 | out[k] = out[k].half() 59 | 60 | torch.save(out, outpath) 61 | 62 | if __name__ == "__main__": 63 | main() 64 | -------------------------------------------------------------------------------- /transfer/hugGPTj_bmtrainGPTj.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The OpenBMB team. All rights reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from collections import OrderedDict 16 | import torch 17 | from tqdm import tqdm 18 | 19 | def main(): 20 | ver = "6b" 21 | layernum = 28 22 | inpath = f"../results/gptj-{ver}-pytorch_model.bin" 23 | outpath = f"../results/GPTj-{ver}.pt" 24 | inp = torch.load(inpath) 25 | out = OrderedDict() 26 | out["input_embedding.weight"] = inp["transformer.wte.weight"].contiguous() # original vocab size is an odd number 27 | out["output_projection.weight"] = inp["lm_head.weight"].contiguous() 28 | out["output_projection.bias"] = inp["lm_head.bias"].contiguous() 29 | out["encoder.output_layernorm.weight"] = inp["transformer.ln_f.weight"].contiguous() 30 | out["encoder.output_layernorm.bias"] = inp["transformer.ln_f.bias"].contiguous() 31 | for i in range(layernum): 32 | prefix = f"encoder.layers.{i}" 33 | old_prefix = f"transformer.h.{i}" 34 | # parallel, share the same layernorm 35 | out[f"{prefix}.self_att.layernorm_before_attention.weight"] = inp[f"{old_prefix}.ln_1.weight"].contiguous() 36 | out[f"{prefix}.self_att.layernorm_before_attention.bias"] = inp[f"{old_prefix}.ln_1.bias"].contiguous() 37 | out[f"{prefix}.ffn.layernorm_before_ffn.weight"] = inp[f"{old_prefix}.ln_1.weight"].contiguous() 38 | out[f"{prefix}.ffn.layernorm_before_ffn.bias"] = inp[f"{old_prefix}.ln_1.bias"].contiguous() 39 | 40 | out[f"{prefix}.self_att.self_attention.project_q.weight"] = inp[f"{old_prefix}.attn.q_proj.weight"].contiguous() 41 | out[f"{prefix}.self_att.self_attention.project_k.weight"] = inp[f"{old_prefix}.attn.k_proj.weight"].contiguous() 42 | out[f"{prefix}.self_att.self_attention.project_v.weight"] = inp[f"{old_prefix}.attn.v_proj.weight"].contiguous() 43 | out[f"{prefix}.self_att.self_attention.attention_out.weight"] = inp[f"{old_prefix}.attn.out_proj.weight"].contiguous() 44 | 45 | out[f"{prefix}.ffn.ffn.w_in.w.weight"] = inp[f"{old_prefix}.mlp.fc_in.weight"].contiguous() 46 | out[f"{prefix}.ffn.ffn.w_in.w.bias"] = inp[f"{old_prefix}.mlp.fc_in.bias"].contiguous() 47 | out[f"{prefix}.ffn.ffn.w_out.weight"] = inp[f"{old_prefix}.mlp.fc_out.weight"].contiguous() 48 | out[f"{prefix}.ffn.ffn.w_out.bias"] = inp[f"{old_prefix}.mlp.fc_out.bias"].contiguous() 49 | 50 | for k, v in out.items(): 51 | out[k] = out[k].half() 52 | 53 | torch.save(out, outpath) 54 | 55 | if __name__ == "__main__": 56 | main() 57 | -------------------------------------------------------------------------------- /transfer/hugLLaMa2_bmtrainLLaMa2.py: -------------------------------------------------------------------------------- 1 | from transformers import LlamaConfig 2 | import torch, os 3 | import json 4 | from collections import OrderedDict 5 | 6 | ver_layernum = [ 7 | # "7b", 8 | "13b", 9 | ] 10 | 11 | ver = ver_layernum[0] 12 | 13 | inpath = f"../results/llama-2-{ver}-hf" 14 | outpath = f"../results/llama-2-{ver}" 15 | 16 | hf_config = LlamaConfig.from_pretrained(inpath) 17 | config = { 18 | 'dim_model': hf_config.hidden_size, 19 | 'dim_ff': 
hf_config.intermediate_size, 20 | 'num_layers': hf_config.num_hidden_layers, 21 | 'num_heads': hf_config.num_attention_heads, 22 | 'num_heads_kv': hf_config.num_key_value_heads, 23 | 'dim_head': hf_config.hidden_size // hf_config.num_attention_heads, 24 | 'norm_eps': hf_config.rms_norm_eps, 25 | } 26 | with open(os.path.join(outpath, "config.json"), 'w') as f: 27 | json.dump(config, f) 28 | 29 | layernum = config['num_layers'] 30 | 31 | model_hf = OrderedDict() 32 | ckpt_num = None 33 | for name in os.listdir(inpath): 34 | if name.startswith("pytorch_model-") and name.endswith(".bin"): 35 | ckpt_num = int(name[-9:-4]) 36 | for i in range(1, ckpt_num + 1): 37 | part = torch.load(os.path.join(inpath, f"pytorch_model-{i:05d}-of-{ckpt_num:05d}.bin")) 38 | model_hf.update(part) 39 | 40 | out = OrderedDict() 41 | 42 | out["input_embedding.weight"] = model_hf['model.embed_tokens.weight'].contiguous() 43 | out["encoder.output_layernorm.weight"] = model_hf['model.norm.weight'].contiguous() 44 | out['output_projection.weight'] = model_hf['lm_head.weight'].contiguous() 45 | for lnum in range(layernum): 46 | hf_pfx = f"model.layers.{lnum}" 47 | bmt_pfx = f"encoder.layers.{lnum}" 48 | 49 | out[f"{bmt_pfx}.self_att.layernorm_before_attention.weight"] = model_hf[f"{hf_pfx}.input_layernorm.weight"].contiguous() 50 | 51 | out[f"{bmt_pfx}.self_att.self_attention.project_q.weight"] = model_hf[f"{hf_pfx}.self_attn.q_proj.weight"].contiguous() 52 | out[f"{bmt_pfx}.self_att.self_attention.project_k.weight"] = model_hf[f"{hf_pfx}.self_attn.k_proj.weight"].contiguous() 53 | out[f"{bmt_pfx}.self_att.self_attention.project_v.weight"] = model_hf[f"{hf_pfx}.self_attn.v_proj.weight"].contiguous() 54 | out[f"{bmt_pfx}.self_att.self_attention.attention_out.weight"] = model_hf[f"{hf_pfx}.self_attn.o_proj.weight"].contiguous() 55 | 56 | out[f"{bmt_pfx}.ffn.layernorm_before_ffn.weight"] = model_hf[f"{hf_pfx}.post_attention_layernorm.weight"].contiguous() 57 | 58 | out[f"{bmt_pfx}.ffn.ffn.w_in.w_0.weight"] = model_hf[f"{hf_pfx}.mlp.gate_proj.weight"].contiguous() 59 | out[f"{bmt_pfx}.ffn.ffn.w_in.w_1.weight"] = model_hf[f"{hf_pfx}.mlp.up_proj.weight"].contiguous() 60 | 61 | out[f"{bmt_pfx}.ffn.ffn.w_out.weight"] = model_hf[f"{hf_pfx}.mlp.down_proj.weight"].contiguous() 62 | 63 | 64 | for key in out: 65 | out[key] = out[key].half() 66 | 67 | if not os.path.exists(outpath): 68 | os.makedirs(outpath) 69 | torch.save(out, os.path.join(outpath, "pytorch_model.pt")) 70 | -------------------------------------------------------------------------------- /transfer/hugLLaMa_bmtrainLLaMa.py: -------------------------------------------------------------------------------- 1 | from transformers import LlamaConfig 2 | import torch, os 3 | import json 4 | from collections import OrderedDict 5 | 6 | ver_layernum = [ 7 | "7b", 8 | "13b", 9 | "30b", 10 | "65b", 11 | ] 12 | 13 | ver = ver_layernum[0] 14 | 15 | inpath = f"../results/llama-{ver}-hf" 16 | outpath = f"../results/llama-{ver}" 17 | 18 | hf_config = LlamaConfig.from_pretrained(inpath) 19 | config = { 20 | 'dim_model': hf_config.hidden_size, 21 | 'dim_ff': hf_config.intermediate_size, 22 | 'num_layers': hf_config.num_hidden_layers, 23 | 'num_heads': hf_config.num_attention_heads, 24 | 'dim_head': hf_config.hidden_size // hf_config.num_attention_heads, 25 | 'norm_eps': hf_config.rms_norm_eps, 26 | } 27 | with open(os.path.join(outpath, "config.json"), 'w') as f: 28 | json.dump(config, f) 29 | 30 | layernum = config['num_layers'] 31 | 32 | model_hf = OrderedDict() 33 | for i in range(1, 
layernum + 2): 34 | part = torch.load(os.path.join(inpath, f"pytorch_model-{i:05d}-of-000{layernum+1}.bin")) 35 | model_hf.update(part) 36 | 37 | out = OrderedDict() 38 | 39 | out["input_embedding.weight"] = model_hf['model.embed_tokens.weight'].contiguous() 40 | out["encoder.output_layernorm.weight"] = model_hf['model.norm.weight'].contiguous() 41 | out['output_projection.weight'] = model_hf['lm_head.weight'].contiguous() 42 | for lnum in range(layernum): 43 | hf_pfx = f"model.layers.{lnum}" 44 | bmt_pfx = f"encoder.layers.{lnum}" 45 | 46 | out[f"{bmt_pfx}.self_att.layernorm_before_attention.weight"] = model_hf[f"{hf_pfx}.input_layernorm.weight"].contiguous() 47 | 48 | out[f"{bmt_pfx}.self_att.self_attention.project_q.weight"] = model_hf[f"{hf_pfx}.self_attn.q_proj.weight"].contiguous() 49 | out[f"{bmt_pfx}.self_att.self_attention.project_k.weight"] = model_hf[f"{hf_pfx}.self_attn.k_proj.weight"].contiguous() 50 | out[f"{bmt_pfx}.self_att.self_attention.project_v.weight"] = model_hf[f"{hf_pfx}.self_attn.v_proj.weight"].contiguous() 51 | out[f"{bmt_pfx}.self_att.self_attention.attention_out.weight"] = model_hf[f"{hf_pfx}.self_attn.o_proj.weight"].contiguous() 52 | 53 | out[f"{bmt_pfx}.ffn.layernorm_before_ffn.weight"] = model_hf[f"{hf_pfx}.post_attention_layernorm.weight"].contiguous() 54 | 55 | out[f"{bmt_pfx}.ffn.ffn.w_in.w_0.weight"] = model_hf[f"{hf_pfx}.mlp.gate_proj.weight"].contiguous() 56 | out[f"{bmt_pfx}.ffn.ffn.w_in.w_1.weight"] = model_hf[f"{hf_pfx}.mlp.up_proj.weight"].contiguous() 57 | 58 | out[f"{bmt_pfx}.ffn.ffn.w_out.weight"] = model_hf[f"{hf_pfx}.mlp.down_proj.weight"].contiguous() 59 | 60 | 61 | for key in out: 62 | out[key] = out[key].half() 63 | 64 | if not os.path.exists(outpath): 65 | os.makedirs(outpath) 66 | torch.save(out, os.path.join(outpath, "pytorch_model.pt")) 67 | -------------------------------------------------------------------------------- /transfer/run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | WORKING_DIR=/mnt/sfs_turbo/hx/cpm3-pretrain/transfer 3 | cd ${WORKING_DIR} 4 | echo "Current working directory ${WORKING_DIR}" 5 | # python3 cpm1_oldffn2newffn.py 6 | # python3 cpm2_oldffn2newffn.py 7 | # python3 hugGPTj_bmtrainGPTj.py 8 | CMD="python3 cpm1_old2new.py" 9 | echo ${CMD} 10 | 11 | ${CMD} 2>&1 | tee /mnt/sfs_turbo/hx/cpm3-pretrain/logs/test-new.log --------------------------------------------------------------------------------