├── .github └── workflows │ └── build-and-push.yml ├── .gitignore ├── .isort.cfg ├── .pre-commit-config.yaml ├── .pylintrc ├── .readthedocs.yaml ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── adrs └── 001-resource-estimator-library.md ├── docs ├── Makefile ├── make.bat └── source │ ├── conf.py │ └── index.rst ├── fm_training_estimator ├── __init__.py ├── config │ ├── __init__.py │ ├── arguments.py │ ├── parser.py │ ├── test_configs │ │ ├── config1.json │ │ ├── config2.json │ │ ├── config3.json │ │ └── config4.json │ ├── test_parser.py │ ├── test_utils.py │ └── utils.py ├── data │ ├── README.md │ ├── __init__.py │ └── manager.py ├── memory │ ├── __init__.py │ ├── fsdp │ │ ├── __init__.py │ │ ├── fsdp.py │ │ └── test_fsdp.py │ ├── full │ │ ├── README.md │ │ ├── __init__.py │ │ ├── full.py │ │ └── test_full.py │ ├── hybrid │ │ ├── README.md │ │ ├── __init__.py │ │ ├── hybrid.py │ │ └── hybrid_test.py │ ├── lora │ │ ├── __init__.py │ │ ├── hybrid.py │ │ ├── lora.py │ │ └── test_lora.py │ └── qlora │ │ ├── __init__.py │ │ ├── hybrid.py │ │ ├── qlora.py │ │ └── test_qlora.py ├── regressor │ ├── README.md │ ├── __init__.py │ ├── arise │ │ ├── README.md │ │ ├── __init__.py │ │ ├── arise.py │ │ └── train.py │ ├── dispatch.py │ ├── linear │ │ ├── __init__.py │ │ ├── linear.py │ │ └── train.py │ ├── lookup │ │ ├── __init__.py │ │ ├── lookup.py │ │ └── test_lookup.py │ ├── test_data │ │ ├── data1.csv │ │ ├── data2.csv │ │ └── data3.csv │ └── xgboost │ │ ├── README.md │ │ ├── __init__.py │ │ ├── test_reg.py │ │ ├── train.py │ │ └── xgboost.py ├── sdk │ ├── README.md │ ├── __init__.py │ ├── examples │ │ └── ex1.py │ └── sdk.py ├── throughput │ ├── __init__.py │ ├── hybrid │ │ ├── __init__.py │ │ ├── hybrid.py │ │ └── test_hybrid.py │ └── mock │ │ ├── __init__.py │ │ ├── mock.py │ │ └── test_mock.py ├── time │ ├── README.md │ ├── __init__.py │ └── time.py ├── tokens │ ├── README.md │ ├── __init__.py │ ├── te.py │ ├── te0 │ │ ├── __init__.py │ │ ├── te0.py │ │ ├── te_test1.jsonl │ │ └── test_te0.py │ └── te2 │ │ ├── README.md │ │ ├── __init__.py │ │ ├── gen_contract.py │ │ ├── te2.py │ │ ├── te_test1.jsonl │ │ ├── test1.contract.json │ │ ├── test_axb.contract.json │ │ └── test_te2.py ├── ui │ ├── README.md │ ├── __init__.py │ ├── api.py │ ├── cli.py │ ├── core.py │ ├── model_whitelist.txt │ └── web.py └── utils │ ├── __init__.py │ ├── model.py │ ├── test_model.py │ └── utils.py ├── imgs ├── build-model.png └── demo-cli.gif ├── launch_estimator.py ├── pyproject.toml ├── tox.ini └── tox.sh /.github/workflows/build-and-push.yml: -------------------------------------------------------------------------------- 1 | # Copyright The FM Training Estimator Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | name: Upload Python Package for FM Training Estimator 16 | 17 | on: 18 | release: 19 | types: [published] 20 | 21 | permissions: 22 | contents: read 23 | 24 | jobs: 25 | deploy: 26 | 27 | runs-on: ubuntu-latest 28 | strategy: 29 | matrix: 30 | python-version: 31 | - setup: "3.11" 32 | tox: "py311" 33 | 34 | permissions: 35 | id-token: write # IMPORTANT: this permission is mandatory for trusted publishing 36 | 37 | steps: 38 | - uses: actions/checkout@v4 39 | - name: Set up Python 40 | uses: actions/setup-python@v4 41 | with: 42 | python-version: ${{ matrix.python-version.setup }} 43 | 44 | 45 | - name: Install dependencies 46 | run: | 47 | python -m pip install --upgrade pip 48 | python -m pip install tox 49 | - name: Build and test with tox 50 | run: tox -e ${{ matrix.python-version.tox }} 51 | - name: Build and check wheel package 52 | run: 53 | tox -e build,twinecheck 54 | - name: Publish package 55 | uses: pypa/gh-action-pypi-publish@release/v1 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | wheel/ 30 | fm_training_estimator/_version.py 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/source/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Mac personalization files 116 | *.DS_Store 117 | 118 | # Spyder project settings 119 | .spyderproject 120 | .spyproject 121 | 122 | # Rope project settings 123 | .ropeproject 124 | 125 | # mkdocs documentation 126 | /site 127 | 128 | # mypy 129 | .mypy_cache/ 130 | .dmypy.json 131 | dmypy.json 132 | 133 | # Pyre type checker 134 | .pyre/ 135 | 136 | .*~ 137 | /workdir/ 138 | 139 | .DS_Store 140 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | profile=black 3 | from_first=true 4 | import_heading_future=Future 5 | import_heading_stdlib=Standard 6 | import_heading_thirdparty=Third Party 7 | import_heading_firstparty=First Party 8 | import_heading_localfolder=Local 9 | known_firstparty= 10 | known_localfolder=tuning 11 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 22.3.0 4 | hooks: 5 | - id: black 6 | exclude: imports 7 | - repo: https://github.com/PyCQA/isort 8 | rev: 5.11.5 9 | hooks: 10 | - id: isort 11 | exclude: imports 12 | - repo: https://github.com/compilerla/conventional-pre-commit 13 | rev: v3.2.0 14 | hooks: 15 | - id: conventional-pre-commit 16 | stages: [commit-msg] 17 | args: [] 18 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Configuration version 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.11" 13 | 14 | # Build documentation in the docs/source directory with Sphinx 15 | sphinx: 16 | configuration: docs/source/conf.py 17 | 18 | # Declare the Python requirements required to build your docs 19 | python: 20 | install: 21 | - method: pip 22 | path: . 
23 | extra_requirements: 24 | - all 25 | - dev-docs -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ## Global Args ################################################################# 2 | ARG BASE_UBI_IMAGE_TAG=latest 3 | ARG USER=tuning 4 | ARG USER_UID=1000 5 | ARG PYTHON_VERSION=3.11 6 | ARG WHEEL_VERSION="" 7 | 8 | ## Base Layer ################################################################## 9 | FROM registry.access.redhat.com/ubi9/ubi:${BASE_UBI_IMAGE_TAG} AS base 10 | 11 | ARG PYTHON_VERSION 12 | ARG USER 13 | ARG USER_UID 14 | 15 | # Note this works for 3.9, 3.11, 3.12 16 | RUN dnf remove -y --disableplugin=subscription-manager \ 17 | subscription-manager \ 18 | && dnf install -y python${PYTHON_VERSION} procps g++ python${PYTHON_VERSION}-devel \ 19 | && ln -s /usr/bin/python${PYTHON_VERSION} /bin/python \ 20 | && python -m ensurepip --upgrade \ 21 | && python -m pip install --upgrade pip \ 22 | && python -m pip install --upgrade setuptools \ 23 | && dnf update -y \ 24 | && dnf clean all 25 | 26 | RUN useradd -u $USER_UID ${USER} -m -g 0 --system && \ 27 | chmod g+rx /home/${USER} 28 | 29 | FROM base AS python-installations 30 | 31 | ARG WHEEL_VERSION 32 | ARG USER 33 | ARG USER_UID 34 | 35 | RUN dnf install -y git && \ 36 | # perl-Net-SSLeay.x86_64 and server_key.pem are installed with git as dependencies 37 | # Twistlock detects it as H severity: Private keys stored in image 38 | rm -f /usr/share/doc/perl-Net-SSLeay/examples/server_key.pem && \ 39 | dnf clean all 40 | 41 | USER ${USER} 42 | WORKDIR /tmp 43 | RUN --mount=type=cache,target=/home/${USER}/.cache/pip,uid=${USER_UID} \ 44 | python -m pip install --user build 45 | COPY --chown=${USER}:root fm_training_estimator fm_training_estimator 46 | COPY .git .git 47 | COPY pyproject.toml pyproject.toml 48 | 49 | # Build a wheel if PyPi wheel_version is empty else download the wheel from PyPi 50 | RUN if [[ -z "${WHEEL_VERSION}" ]]; \ 51 | then python -m build --wheel --outdir /tmp; \ 52 | else pip download fm_training_estimator==${WHEEL_VERSION} --dest /tmp --only-binary=:all: --no-deps; \ 53 | fi && \ 54 | ls /tmp/*.whl >/tmp/bdist_name 55 | 56 | # Install from the wheel 57 | RUN --mount=type=cache,target=/home/${USER}/.cache/pip,uid=${USER_UID} \ 58 | python -m pip install --user wheel && \ 59 | python -m pip install --user "$(head bdist_name)" && \ 60 | # Cleanup the bdist whl file 61 | rm $(head bdist_name) /tmp/bdist_name 62 | 63 | ## Final image ################################################ 64 | FROM base AS release 65 | ARG USER 66 | ARG PYTHON_VERSION 67 | 68 | RUN mkdir -p /licenses 69 | COPY LICENSE /licenses/ 70 | 71 | RUN mkdir /app && \ 72 | chown -R $USER:0 /app /tmp && \ 73 | chmod -R g+rwX /app /tmp 74 | 75 | 76 | # Copy scripts and default configs 77 | COPY launch_estimator.py /app/ 78 | RUN chmod +x /app/launch_estimator.py 79 | 80 | WORKDIR /app 81 | USER ${USER} 82 | COPY --from=python-installations /home/${USER}/.local /home/${USER}/.local 83 | ENV PYTHONPATH="/home/${USER}/.local/lib/python${PYTHON_VERSION}/site-packages" 84 | 85 | CMD [ "python", "/app/launch_estimator.py" ] 86 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, 
REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | IMAGE ?= icr.io/ftplatform/fm_training_estimator:latest 2 | 3 | .PHONY: build 4 | build: lint fmt install 5 | tox -e build 6 | 7 | .PHONY: install 8 | install: hook 9 | pip install -e . 10 | 11 | .PHONY: fmt 12 | fmt: 13 | tox -e fmt 14 | 15 | .PHONY: lint 16 | lint: 17 | tox -e lint 18 | 19 | .PHONY: test 20 | test: 21 | tox -e test 22 | 23 | .PHONY: hook 24 | hook: 25 | pre-commit install --hook-type commit-msg 26 | 27 | .PHONY: build-model 28 | build-model: 29 | python -m fm_training_estimator.regressor.xgboost.train ./workdir/data.csv ./workdir/model.zip '["tokens_per_second","memory","memory_act"]' 30 | 31 | .PHONY: build-linear-model 32 | build-linear-model: 33 | python -m fm_training_estimator.regressor.linear.train ./workdir/data.csv ./workdir/model.zip '["tokens_per_second","memory","memory_act"]' 34 | 35 | .PHONY: build-arise-model 36 | build-arise-model: 37 | python -m fm_training_estimator.regressor.arise.train ./workdir/data.csv ./workdir/model.zip ./workdir/arise-config.yaml '["tokens_per_second","memory","memory_act"]' 38 | 39 | .PHONY: run-web-ui 40 | run-web-ui: 41 | python -m fm_training_estimator.ui.web ./workdir/model_whitelist.txt ./workdir/data.csv ./workdir/model.json --enable_api=True 42 | 43 | .PHONY: run-cli 44 | run-cli: 45 | python -m fm_training_estimator.ui.cli --lookup_data_path ./workdir/data.csv -m ./workdir/model.zip $(CONF) 46 | 47 | .PHONY: run-cli-arise 48 | run-cli-arise: 49 | python -m fm_training_estimator.ui.cli --lookup_data_path ./workdir/data.csv -m ./workdir/model.zip $(CONF) 50 | 51 | .PHONY: run-api 52 | run-api: 53 | python -m fm_training_estimator.ui.api ./workdir/data.csv ./workdir/model.json 54 | 55 | .PHONY: cbuild 56 | cbuild: 57 | docker build -t ${IMAGE} -f Dockerfile . 
58 | 59 | .PHONY: cpush 60 | cpush: 61 | docker push ${IMAGE} 62 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FM Training Estimator 2 | 3 | Estimators for Large Language Model Training. 4 | 5 | Estimate resource consumption - memory, tokens, time, etc. - for training and fine-tuning jobs, using a hybrid of theory and learned regression models. 6 | 7 | ## Feature Matrix and Roadmap 8 | 9 | | Technique | Support | 10 | |--------------------|--------------------| 11 | | Full (1 gpu) | :heavy_check_mark: | 12 | | FSDP (multi) | :heavy_check_mark: | 13 | | Lora (1 gpu) | :heavy_check_mark: | 14 | | QLora (1 gpu) | :heavy_check_mark: | 15 | | Speculators | Planned | 16 | | Tensor Parallelism | Planned | 17 | 18 | ### Time 19 | 20 | Fully learned approach. Coverage depends on the availability of training data. 21 | 22 | ### Memory 23 | 24 | Hybrid theory + learned. Coverage of the learned approach is subject to the availability of training data. 25 | 26 | ### Tokens 27 | 28 | Fully theory-based. Simulation-based models available. 29 | 30 | | Technique | Explanation | Availability | 31 | |-----------|------------------------------------------------|--------------------| 32 | | TE0 | Simulation-based - slow but accurate | :heavy_check_mark: | 33 | | TE1 | Statistical | Planned | 34 | | TE2 | Approximate - fast, light, reasonably accurate | :heavy_check_mark: | 35 | 36 | ## Usage 37 | 38 | You can use the `fm_training_estimator` library as a Python package by installing it via pip; see [installation](#install), [build a regression model](#build-a-regression-model-for-learned-prediction-method) and [using the library](#use-the-library-to-get-estimates). If you'd like to run the estimator service with a [Web UI](#make-estimates-via-a-web-ui) via FastAPI or [build a docker image](#build-a-docker-container-image), clone the repository to your local machine before following the instructions in those sections. 39 | 40 | Within your working directory, it is recommended to create a virtual environment to avoid dependency conflicts. 41 | 42 | ``` 43 | python -m venv .venv 44 | source .venv/bin/activate 45 | ``` 46 | 47 | ### Install 48 | ``` 49 | pip install fm_training_estimator 50 | ``` 51 | 52 | ### Build a regression model for learned prediction method 53 | 54 | Now, prepare data in the expected format for lookup and regression. The format to be used to save this data is given [here](https://github.com/foundation-model-stack/fm-training-estimator/tree/main/fm_training_estimator/data/README.md). Save your data file as `./workdir/data.csv`: 55 | 56 | ``` 57 | mkdir workdir 58 | mv /path/to/your/data.csv ./workdir/data.csv 59 | ``` 60 | 61 | Now, build a regression model using this data, using one of the provided make targets (the underlying commands are reproduced below, after the list of usage options). 62 | 63 | ![Building a model](./imgs/build-model.png) 64 | 65 | This will create a model at `./workdir/model.zip`, which you can then use to estimate resource consumption. 66 | 67 | You can now run the estimator library; see below. 68 | 69 | ### Using the Estimator 70 | 71 | There are a few ways to use the Estimator: 72 | 73 | 1. Using the CLI tool, passing in a config in JSON format. 74 | 2. Using the Web UI. 75 | 3. Using the SDK directly from Python code.
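For reference, the model-building make targets mentioned above wrap the regressor training modules. From the repository `Makefile`, the default XGBoost target expands to the command below (the `build-linear-model` and `build-arise-model` targets are analogous, swapping in the linear and ARISE regressors):

```
make build-model
# which runs:
python -m fm_training_estimator.regressor.xgboost.train ./workdir/data.csv ./workdir/model.zip '["tokens_per_second","memory","memory_act"]'
```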
76 | 77 | #### Using the CLI 78 | 79 | ![Demo of using CLI](./imgs/demo-cli.gif) 80 | 81 | #### Make estimates via a Web UI 82 | 83 | To do this, first prepare a text file called `model_whitelist.txt` in `workdir/` with a list of model names, one per line. These are the models for which the estimator will produce resource estimates. You can use the provided [example](https://github.com/foundation-model-stack/fm-training-estimator/blob/main/fm_training_estimator/ui/model_whitelist.txt) and place it in your `workdir`. Modify this list as needed. 84 | 85 | Now, run the UI: 86 | ``` 87 | make run-web-ui 88 | ``` 89 | This will start the UI on port 3000 (`localhost:3000`). 90 | 91 | (The web UI has other options not covered in this simple setup. If you want to skip the model whitelisting or change the port, run the UI directly as shown in the README in the `./fm_training_estimator/ui` folder.) 92 | 93 | #### Use the library to get estimates 94 | 95 | For a full API reference, visit our [readthedocs](link). 96 | 97 | Example code: 98 | ```python 99 | # Standard 100 | import os 101 | 102 | # First Party 103 | from fm_training_estimator.config.arguments import ( 104 | DataArguments, 105 | EstimateInput, 106 | EstimatorMetadata, 107 | FMArguments, 108 | HFTrainingArguments, 109 | InfraArguments, 110 | JobConfig, 111 | ) 112 | from fm_training_estimator.sdk import ( 113 | estimate_cost, 114 | estimate_memory, 115 | estimate_time, 116 | estimate_tokens, 117 | ) 118 | 119 | workdir_path = os.path.join(os.path.abspath(os.curdir), "workdir") 120 | 121 | model_path = os.path.join(workdir_path, "model.json") 122 | lookup_data_path = os.path.join(workdir_path, "data.csv") 123 | 124 | estimator_metadata = EstimatorMetadata(base_data_path=lookup_data_path) 125 | 126 | fm = FMArguments( 127 | base_model_path="ibm-granite/granite-7b-base", 128 | torch_dtype="bfloat16", 129 | block_size=1024, 130 | ) 131 | hf_training = HFTrainingArguments( 132 | per_device_train_batch_size=1, gradient_checkpointing=False 133 | ) 134 | data = DataArguments(dataset="imdb", te_approach=0) 135 | infra = InfraArguments(numGpusPerPod=1) 136 | job_conf = JobConfig(hf_training, fm, data, infra) 137 | est_input = EstimateInput(estimator_metadata=estimator_metadata, job_configs=[job_conf]) 138 | 139 | print("Estimating Memory:....") 140 | 141 | print("With only theory: ", estimate_memory(est_input)) 142 | print("With reg model: ", estimate_memory(est_input, model_path)) 143 | 144 | hf_training.fsdp = "full_shard" 145 | 146 | print("Using fsdp full shard") 147 | print("With only theory: ", estimate_memory(est_input)) 148 | print("With reg model: ", estimate_memory(est_input, model_path)) 149 | 150 | 151 | print("Estimating Time:....") 152 | print("With only theory: ", estimate_time(est_input)) 153 | print("With reg model: ", estimate_time(est_input, model_path)) 154 | 155 | print("Estimating Tokens:....") 156 | print("With only theory: ", estimate_tokens(est_input)) 157 | print("With reg model: ", estimate_tokens(est_input, model_path)) 158 | ``` 159 | 160 | ### Build a Docker Container Image 161 | 162 | To build the estimator container image: 163 | 164 | 1. Make sure both `model.json` and `data.csv` files are present in the `workdir` folder. 165 | 166 | 2. Use this command to build and push the image: 167 | 168 | ```shell 169 | make cbuild 170 | make cpush # If you want to push to the container registry 171 | ``` 172 | 173 | 3. Use this command to run the image: 174 | 175 | ```shell 176 | docker run --rm -it -v "/path/to/input.json:/app/input.json" icr.io/ftplatform/fm_training_estimator:latest 177 | ``` 178 |
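The exact contents expected in the mounted `input.json` are not covered in this simple setup. Assuming the container's `launch_estimator.py` entrypoint reads the `EstimateInput` schema described in the ADR below (this is an assumption, not confirmed here), a minimal sketch of such a file could look like:

```json
{
  "estimator": {
    "base_data_path": "data.csv",
    "method": "theory",
    "token_estimation_version": 0
  },
  "job_configs": [{
    "fm": { "base_model_path": "ibm-granite/granite-3b-code-base" },
    "infra": { "numGpusPerPod": 1 }
  }]
}
```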
-------------------------------------------------------------------------------- /adrs/001-resource-estimator-library.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Resource Estimator Library 3 | --- 4 | 5 | - **Author(s)**: Angel Luu (@aluu317) 6 | - **Signer(s)**: Praveen Jayachandran, Ashok Pon Kumar Sree Prakash @ashokponkumar, Chander Govindarajan @ChanderG 7 | - **Date (YYYY-MM-DD)**: 2024-10-31 8 | - **Obsoletes ADRs**: N/A 9 | - **Modified By ADRs**: N/A 10 | - **Relevant Issues**: N/A 11 | 12 | ## Problem Context 13 | 14 | Users of the tuning/training stack currently have no way of estimating how much memory, time, or cost it takes to run a training job. They often hit OOM errors due to a lack of memory. Users don't have enough information to make trade-off decisions on time vs. cost. Platform admins lack the information needed to better schedule and pack jobs onto GPUs. 15 | 16 | To be useful, the capability of estimating resources must be exposed to tuning/training users. The primary user personas of this service are training users and platform admins. 17 | 18 | This ADR defines a Resource Estimator Python Library that provides an estimate of resource requirements for training runs. 19 | 20 | ## Impact Table 21 | 22 | | AI Functionality | Operational Functionality | 23 | | ---------------- | ------------------------- | 24 | | Tuning Stack | APIs | 25 | 26 | ## Decision 27 | 28 | - We will expose the resource estimator service as a Python library `fm_training_estimator`, hosted as Open Source at the repo [fm-training-estimator](https://github.com/foundation-model-stack/fm-training-estimator) and published to [PyPI](https://pypi.org/). 29 | - This Python library can be installed and plugged into any UI backend or a docker image by a product team. 30 | - The `fm_training_estimator` exposes 4 methods to calculate memory, time, tokens and cost. The method calls allow the user to pass training data as input for the "learned" or "hybrid" models. If training data is missing, the "theory" model is used. 31 | 32 | ### Alternatives to Python library deliverable 33 | We considered the following alternatives: 34 | - Alternative 1: A new docker image which has a FastAPI server with a REST interface defined. When a product team integrates it as a service, they can run this docker image; a server will run on localhost, which can then be queried with GET/POST calls to obtain estimates. 35 | 36 | - Alternative 2: A new docker image with a python script similar to fms-hf-tuning, which accepts a JSON config, calls the necessary python scripts to get an estimate, and saves the results in a file. 37 | 38 | Both alternatives provide more out-of-the-box value to consumers. However, they do not provide the same flexibility in how the library can be integrated and consumed. 39 | 40 | ## Consequences 41 | 42 | - By using this library, users need to supply their own dataset for the estimator to generate a learned model, and they assume responsibility for the security and privacy of that data. They can use the flight service plugin should that be applicable. 43 | - The library can be used as a backend component of a larger UI effort, or as part of a Docker image.
The product teams can consume the library however they see fit and create their own build/update process. 44 | 45 | ## High Level Design 46 | 47 | - The `EstimateInput` data class (not all fields are required) defines the set of configs the library will use to calculate the results. This includes a list of instances of the `JobConfig` data class, which in turn includes different types of configs (hf training args `HFArguments`, fms-hf-tuning additional args `FMArguments`, data args `DataArguments`, infrastructure args `InfraArguments`, peft lora args `PeftLoraConfig` and peft qlora args `PeftQLoraConfig`), and `EstimatorMetadata` with metadata parameters. The input can be read from a JSON file using `--input_file_path` or `-f`. 48 | 49 | Example of an `EstimateInput` with all fields defined: 50 | ```json 51 | { 52 | "estimator": { // EstimatorMetadata 53 | "base_data_path": "data.csv", 54 | "method": "theory", // theory, learned, hybrid 55 | "token_estimation_version": 0 56 | }, 57 | "job_configs": [{ // list of [JobConfig] 58 | "hf_training": { // HFArguments 59 | "output_dir": "./output" 60 | }, 61 | "fm": { // FMArguments 62 | "base_model_path": "ibm-granite/granite-3b-code-base", 63 | "flash_attention_v2": "false", 64 | "lora_config": null, 65 | "max_seq_length": 2048, 66 | "block_size": 2048, 67 | "data_config_file": "data_config.json", 68 | "prompt_tuning_config": null, 69 | "torch_dtype": "float32", 70 | "technique": "full" 71 | }, 72 | "data": { // DataArguments 73 | "te_approach": 0, 74 | "dataset": null, 75 | "dataset_text_field": "text", 76 | "dataset_split": "test", 77 | "dataset_config_name": null 78 | }, 79 | "infra": { // InfraArguments 80 | "numGpusPerPod": 1, 81 | "numPods": 1, 82 | "gpu_memory_in_gb": 80, 83 | "gpuModel": "A100" 84 | }, 85 | "peft_lora": { // PeftLoraConfig 86 | "r": 4, 87 | "lora_alpha": 8, 88 | "lora_dropout": 0.1, 89 | "target_modules": "[q_proj, v_proj]" 90 | }, 91 | "peft_qlora": { // PeftQLoraConfig 92 | "quant_type": "nf4", 93 | "use_double_quant": false 94 | } 95 | }] 96 | } 97 | ``` 98 | 99 | - The API exposes the following functions: 100 | 101 | Function `estimate_memory` returns a `MemoryEstimate`: 102 | ```python 103 | { 104 | "memory": { # MemoryEstimate 105 | "total_mem_estimate": "44.6 GiB", 106 | "activation_memory": "34.7 GiB", 107 | "gradient_memory": "2.5 GiB", 108 | "model_memory": "2.5 GiB", 109 | "optimizer_memory": "4.9 GiB", 110 | "num_gpus": 2 111 | } 112 | } 113 | ``` 114 | 115 | Function `estimate_time` returns a `TimeEstimate`: 116 | ```python 117 | { 118 | "time": { # TimeEstimate 119 | "time": "40s" 120 | } 121 | } 122 | ``` 123 | 124 | Function `estimate_tokens` returns a `TokensEstimate`: 125 | ```python 126 | { 127 | "tokens": { # TokensEstimate 128 | "tps": "5259.07373046875" 129 | } 130 | } 131 | ``` 132 | 133 | Function `estimate_cost` returns a `CostEstimate`: 134 | ```python 135 | { 136 | "cost": { # CostEstimate 137 | "usd": "0.0" 138 | } 139 | } 140 | ``` 141 | 142 | Function `estimate` returns an `Estimate` that includes all 4 types of estimates above: 143 | ```python 144 | { 145 | "estimate": { # Estimate 146 | "memory": { # MemoryEstimate 147 | "total_mem_estimate": "44.6 GiB", 148 | "activation_memory": "34.7 GiB", 149 | "gradient_memory": "2.5 GiB", 150 | "model_memory": "2.5 GiB", 151 | "optimizer_memory": "4.9 GiB", 152 | "num_gpus": 2 153 | }, 154 | "time": { # TimeEstimate 155 | "time": "40s" 156 | }, 157 | "tokens": { # TokensEstimate 158 | "tps": "5259.07373046875" 159 | }, 160 | "cost": { # CostEstimate 161 | "usd": "0.0" 162
| } 163 | } 164 | } 165 | ``` 166 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # Standard 10 | # If extensions (or modules to document with autodoc) are in another directory, 11 | # add these directories to sys.path here. If the directory is relative to the 12 | # documentation root, use os.path.abspath to make it absolute, like shown here. 13 | # 14 | import os 15 | import sys 16 | 17 | sys.path.insert(0, os.path.abspath(os.path.join("..", ".."))) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = "FM Training Estimator" 23 | copyright = "2024, The Training Estimator Authors" 24 | author = "The Training Estimator Authors" 25 | 26 | 27 | # -- General configuration --------------------------------------------------- 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 
32 | extensions = [ 33 | # Generate complete API docs by parsing source code 34 | "autoapi.extension", 35 | # Add links to source from generated docs 36 | "sphinx.ext.viewcode", 37 | ] 38 | 39 | # Add any paths that contain templates here, relative to this directory. 40 | templates_path = ["_templates"] 41 | 42 | # List of patterns, relative to source directory, that match files and 43 | # directories to ignore when looking for source files. 44 | # This pattern also affects html_static_path and html_extra_path. 45 | exclude_patterns = [ 46 | "_build", 47 | "Thumbs.db", 48 | ".DS_Store", 49 | "**ex1**", 50 | "**test**", 51 | "**_version**", 52 | ] 53 | 54 | # -- autoapi configuration --------------------------------------------------- 55 | 56 | # Language of source code to parse 57 | autoapi_type = "python" 58 | 59 | # Source code to parse to generate API docs relative to 'docs/source' directory 60 | autoapi_dirs = [os.path.join("..", "..", "fm_training_estimator")] 61 | 62 | # -- Options for HTML output ------------------------------------------------- 63 | 64 | # The theme to use for HTML and HTML Help pages. See the documentation for 65 | # a list of builtin themes. 66 | # 67 | html_theme = "sphinx_rtd_theme" 68 | 69 | # Add any paths that contain custom static files (such as style sheets) here, 70 | # relative to this directory. They are copied after the builtin static files, 71 | # so a file named "default.css" will overwrite the builtin "default.css". 72 | html_static_path = [] 73 | 74 | # Support external links to specific versions of the files in the Github repo 75 | branch = os.environ.get("READTHEDOCS_VERSION") 76 | if branch is None or branch == "latest": 77 | branch = "main" 78 | 79 | REPO = "foundation-model-stack/fm-training-estimator" 80 | scm_raw_web = "https://raw.githubusercontent.com/" + REPO + "/" + branch 81 | scm_web = "https://github.com/" + REPO + "/blob/" + branch 82 | 83 | # Store variables in the epilogue so they are globally available. 84 | rst_epilog = """ 85 | .. |SCM_WEB| replace:: {s} 86 | .. |SCM_RAW_WEB| replace:: {sr} 87 | .. |SCM_BRANCH| replace:: {b} 88 | """.format( 89 | s=scm_web, sr=scm_raw_web, b=branch 90 | ) 91 | 92 | # used to have links to repo files 93 | extlinks = { 94 | "scm_raw_web": (scm_raw_web + "/%s", "scm_raw_web"), 95 | "scm_web": (scm_web + "/%s", "scm_web"), 96 | } 97 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to FM Training Estimator's API documentation!
2 | ====================================================== 3 | 4 | 5 | Example usage: 6 | 7 | >>> # Standard 8 | import os 9 | # First Party 10 | from fm_training_estimator.config.arguments import ( 11 | DataArguments, 12 | EstimateInput, 13 | EstimatorMetadata, 14 | FMArguments, 15 | HFTrainingArguments, 16 | InfraArguments, 17 | JobConfig, 18 | ) 19 | from fm_training_estimator.sdk import ( 20 | estimate_cost, 21 | estimate_memory, 22 | estimate_time, 23 | estimate_tokens, 24 | ) 25 | workdir_path = os.path.join(os.path.abspath(os.curdir), "workdir") 26 | model_path = os.path.join(workdir_path, "model.json") 27 | lookup_data_path = os.path.join(workdir_path, "data.csv") 28 | estimator_metadata = EstimatorMetadata(base_data_path=lookup_data_path) 29 | fm = FMArguments( 30 | base_model_path="ibm-granite/granite-7b-base", 31 | torch_dtype="bfloat16", 32 | block_size=1024, 33 | ) 34 | hf_training = HFTrainingArguments( 35 | per_device_train_batch_size=1, gradient_checkpointing=False 36 | ) 37 | data = DataArguments(dataset="imdb", te_approach=0) 38 | infra = InfraArguments(numGpusPerPod=1) 39 | job_conf = JobConfig(hf_training, fm, data, infra) 40 | est_input = EstimateInput(estimator_metadata=estimator_metadata, job_configs=[job_conf]) 41 | print("Estimating Memory:....") 42 | print("With only theory: ", estimate_memory(est_input)) 43 | print("With reg model: ", estimate_memory(est_input, model_path)) 44 | hf_training.fsdp = "full_shard" 45 | print("Using fsdp full shard") 46 | print("With only theory: ", estimate_memory(est_input)) 47 | print("With reg model: ", estimate_memory(est_input, model_path)) 48 | print("Estimating Time:....") 49 | print("With only theory: ", estimate_time(est_input)) 50 | print("With reg model: ", estimate_time(est_input, model_path)) 51 | print("Estimating Tokens:....") 52 | print("With only theory: ", estimate_tokens(est_input)) 53 | print("With reg model: ", estimate_tokens(est_input, model_path)) 54 | 55 | ..
toctree:: 56 | :maxdepth: 2 57 | :caption: Contents: 58 | 59 | 60 | Indices and tables 61 | ================== 62 | 63 | * :ref:`genindex` 64 | * :ref:`modindex` 65 | * :ref:`search` -------------------------------------------------------------------------------- /fm_training_estimator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-model-stack/fm-training-estimator/ba3f02ab129877c6fa6dabca46f860ce0f95c62c/fm_training_estimator/__init__.py -------------------------------------------------------------------------------- /fm_training_estimator/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .arguments import ( 3 | DataArguments, 4 | FMArguments, 5 | HFTrainingArguments, 6 | InfraArguments, 7 | PeftLoraConfig, 8 | PeftPromptTuningConfig, 9 | PeftQLoraConfig, 10 | ) 11 | from .parser import parse 12 | from .utils import is_fsdp 13 | 14 | __all__ = [ 15 | "FMArguments", 16 | "PeftPromptTuningConfig", 17 | "PeftLoraConfig", 18 | "PeftQLoraConfig", 19 | "HFTrainingArguments", 20 | "InfraArguments", 21 | "DataArguments", 22 | "parse", 23 | "is_fsdp", 24 | ] 25 | -------------------------------------------------------------------------------- /fm_training_estimator/config/arguments.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | from dataclasses import dataclass, field 3 | from enum import Enum 4 | from typing import List, Optional 5 | 6 | # Third Party 7 | from dataclass_wizard import JSONWizard 8 | from peft.tuners.lora import LoraConfig 9 | from peft.tuners.prompt_tuning import PromptTuningConfig 10 | from transformers import TrainingArguments 11 | 12 | 13 | @dataclass 14 | class PeftPromptTuningConfig(PromptTuningConfig): 15 | """dataclass for prompt tuning config 16 | 17 | Args: 18 | PromptTuningConfig (_type_): imported directly from peft library 19 | """ 20 | 21 | 22 | @dataclass 23 | class PeftLoraConfig: 24 | """Dataclass for LoRA tuning config 25 | 26 | Not directly imported from peft LoraConfig due to complexity. 27 | """ 28 | 29 | r: int = field(default=4, metadata={"help": ("Lora rank parameter")}) 30 | 31 | lora_alpha: int = field(default=8) 32 | lora_dropout: float = field(default=0.1) 33 | target_modules: List[str] = field(default_factory=lambda: ["q_proj", "v_proj"]) 34 | 35 | 36 | @dataclass 37 | class PeftQLoraConfig: 38 | """Dataclass for QLoRA tuning config""" 39 | 40 | quant_type: str = field(default="nf4") 41 | use_double_quant: bool = field(default=False) 42 | 43 | 44 | @dataclass 45 | class HFTrainingArguments(TrainingArguments): 46 | """HF trainer arguments 47 | 48 | Args: 49 | TrainingArguments (_type_): directly imported from transformers library 50 | """ 51 | 52 | output_dir: str = field( 53 | default="./output", metadata={"help": ("temporary output dir for HF")} 54 | ) 55 | 56 | 57 | @dataclass 58 | class InfraArguments: 59 | """dataclass for infrastructure arguments""" 60 | 61 | numGpusPerPod: int = field( 62 | default=0, 63 | metadata={ 64 | "help": ( 65 | "number of gpus requested per pod. Setting to 0 for auto-discover." 
66 | ) 67 | }, 68 | ) 69 | 70 | numPods: int = field( 71 | default=1, 72 | metadata={"help": ("number of pods requested")}, 73 | ) 74 | 75 | gpu_memory_in_gb: int = field(default=80, metadata={"help": ("GPU RAM in GBs")}) 76 | 77 | gpuModel: str = field( 78 | default="A100", 79 | metadata={"help": ("model of gpu used")}, 80 | ) 81 | 82 | 83 | @dataclass 84 | class FMArguments: 85 | """dataclass to store additional args not covered by standard HF argument dataclasses""" 86 | 87 | base_model_path: str = field( 88 | default="ibm-granite/granite-3b-code-base", 89 | metadata={ 90 | "help": ( 91 | "Base Model location. Can be empty if output path has a checkpoint." 92 | ) 93 | }, 94 | ) 95 | 96 | flash_attention_v2: bool = field( 97 | default=False, 98 | metadata={"help": ("Enable flash attention v2 for attention calculation.")}, 99 | ) 100 | 101 | lora_config: str = field( 102 | default=None, metadata={"help": ("LoRA configuration JSON file path.")} 103 | ) 104 | 105 | max_seq_length: int = field( 106 | default=2048, 107 | metadata={"help": ("model max sequence length.")}, 108 | ) 109 | 110 | block_size: int = field( 111 | default=2048, 112 | metadata={"help": ("Sequence length.")}, 113 | ) 114 | 115 | data_config_file: str = field( 116 | default="data_config.json", 117 | metadata={"help": ("Input files in glob format.")}, 118 | ) 119 | 120 | prompt_tuning_config: str = field( 121 | default=None, metadata={"help": ("Prompt tuning config JSON file path")} 122 | ) 123 | 124 | torch_dtype: str = field( 125 | default="float32", 126 | metadata={ 127 | "help": ( 128 | "provide torch dtype for the model precision. \ 129 | Choose one from float16, float32, bfloat16" 130 | ) 131 | }, 132 | ) 133 | 134 | technique: str = field( 135 | default="full", 136 | metadata={"help": ("Fine-tuning technique being used")}, 137 | ) 138 | 139 | 140 | @dataclass 141 | class DataArguments: 142 | """dataclass to define args handling training data as input for estimation.""" 143 | 144 | te_approach: int = field( 145 | default=0, metadata={"help": ("Approach to use for Token Estimation")} 146 | ) 147 | 148 | dataset: str = field( 149 | default=None, metadata={"help": ("name of HF dataset or path to json file")} 150 | ) 151 | 152 | dataset_text_field: str = field( 153 | default="text", metadata={"help": ("field of the dataset to use")} 154 | ) 155 | 156 | dataset_split: str = field( 157 | default="test", 158 | metadata={"help": ("dataset split to use, in case of HF dataset")}, 159 | ) 160 | 161 | dataset_config_name: str = field( 162 | default=None, 163 | metadata={"help": ("dataset configuration to use, in case of HF dataset")}, 164 | ) 165 | 166 | trust_remote_code: bool = field( 167 | default=True, 168 | metadata={"help": ("allow dataset with a loading script")} 169 | ) 170 | 171 | dataset_config_file: str = field( 172 | default=None, 173 | metadata={"help": ("dataset configuration file in case dataset is not available/provided")}, 174 | ) 175 | 176 | class EstimatorMethod(Enum): 177 | """Enumerates the estimation methods the FM Training Estimator can use to make an estimation.""" 178 | 179 | THEORY = "theory" 180 | """Theory model for estimation.""" 181 | 182 | LEARNED = "learned" 183 | """Learned model for estimation, based on user provided training data.""" 184 | 185 | HYBRID = "hybrid" 186 | """Hybrid model for estimation, a combination of theory and learned models.""" 187 | 188 | 189 | @dataclass 190 | class EstimatorMetadata: 191 | """Metadata for the FM Training Estimator.""" 192 | 193 | base_data_path:
str = field( 194 | default=None, metadata={"help": ("path to the training data file")} 195 | ) 196 | method: EstimatorMethod = field( 197 | default=EstimatorMethod.HYBRID, 198 | metadata={"help": ("enum method the estimator should use")}, 199 | ) 200 | token_estimation_version: int = field( 201 | default=0, metadata={"help": ("version of token estimator to use")} 202 | ) 203 | 204 | 205 | @dataclass 206 | class JobConfig: 207 | """Dataclass that represents a set of different configs for a tuning job to make an estimate on.""" 208 | 209 | hf_training: HFTrainingArguments = field(default_factory=HFTrainingArguments) 210 | fm: FMArguments = field(default_factory=FMArguments) 211 | data: DataArguments = field(default_factory=DataArguments) 212 | infra: InfraArguments = field(default_factory=InfraArguments) 213 | peft_lora: PeftLoraConfig = field(default_factory=PeftLoraConfig) 214 | peft_qlora: PeftQLoraConfig = field(default_factory=PeftQLoraConfig) 215 | 216 | 217 | @dataclass 218 | class EstimateInput(JSONWizard): 219 | """ 220 | The dataclass that is an input to an estimate function. 221 | It includes a list of different training job configs and metadata about the estimator. 222 | """ 223 | 224 | job_configs: List[JobConfig] 225 | estimator_metadata: Optional[EstimatorMetadata] = None 226 | 227 | 228 | @dataclass 229 | class TimeEstimate: 230 | """The estimated time response to estimate_time function.""" 231 | 232 | time: str 233 | train_time: str 234 | 235 | 236 | @dataclass 237 | class MemoryEstimate: 238 | """The estimated memory response to estimate_memory function.""" 239 | 240 | total_mem_estimate: str 241 | activation_memory: str 242 | gradient_memory: str 243 | model_memory: str 244 | optimizer_memory: str 245 | num_gpus: int 246 | 247 | 248 | @dataclass 249 | class TokensEstimate: 250 | """The estimated token response to estimate_token function.""" 251 | 252 | tps: float 253 | 254 | 255 | @dataclass 256 | class CostEstimate: 257 | """The estimated cost response to estimate_cost function.""" 258 | 259 | usd: float 260 | 261 | 262 | @dataclass 263 | class Estimate: 264 | """The estimate response to estimate function, including time, memory, tokens and cost.""" 265 | 266 | memory: MemoryEstimate 267 | time: TimeEstimate 268 | tokens: TokensEstimate 269 | cost: CostEstimate 270 | -------------------------------------------------------------------------------- /fm_training_estimator/config/parser.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | from typing import Dict, Tuple, Union 3 | 4 | # Third Party 5 | from transformers import HfArgumentParser 6 | 7 | # Local 8 | from ..utils import logger, unmarshal 9 | from .arguments import ( 10 | DataArguments, 11 | FMArguments, 12 | HFTrainingArguments, 13 | InfraArguments, 14 | PeftLoraConfig, 15 | PeftQLoraConfig, 16 | ) 17 | 18 | 19 | def parse( 20 | config: Union[Dict, str] 21 | ) -> Tuple[ 22 | FMArguments, 23 | HFTrainingArguments, 24 | InfraArguments, 25 | DataArguments, 26 | PeftLoraConfig, 27 | PeftQLoraConfig, 28 | ]: 29 | """parse config and return respective dataclass objects 30 | 31 | Args: 32 | config (Union[Dict, str]): path to a config file, or a config dict 33 | 34 | Returns: 35 | Tuple[FMArguments, HFTrainingArguments, InfraArguments, DataArguments, PeftLoraConfig, PeftQLoraConfig]: 36 | dataclass objects 37 | """ 38 | try: 39 | if not isinstance(config, (str, Dict)): 40 | raise TypeError( 41 | "provided config should be either a path to a config file \ 42 | or a
/fm_training_estimator/config/parser.py:
--------------------------------------------------------------------------------
1 | # Standard
2 | from typing import Dict, Tuple, Union
3 | 
4 | # Third Party
5 | from transformers import HfArgumentParser
6 | 
7 | # Local
8 | from ..utils import logger, unmarshal
9 | from .arguments import (
10 |     DataArguments,
11 |     FMArguments,
12 |     HFTrainingArguments,
13 |     InfraArguments,
14 |     PeftLoraConfig,
15 |     PeftQLoraConfig,
16 | )
17 | 
18 | 
19 | def parse(
20 |     config: Union[Dict, str]
21 | ) -> Tuple[
22 |     FMArguments,
23 |     HFTrainingArguments,
24 |     InfraArguments,
25 |     DataArguments,
26 |     PeftLoraConfig,
27 |     PeftQLoraConfig,
28 | ]:
29 |     """parse config and return respective dataclass objects
30 | 
31 |     Args:
32 |         config (Union[Dict, str]): path to a config file or a config python dict
33 | 
34 |     Returns:
35 |         Tuple[FMArguments, HFTrainingArguments, InfraArguments, DataArguments, PeftLoraConfig, PeftQLoraConfig]:
36 |             dataclass objects
37 |     """
38 |     try:
39 |         if not isinstance(config, (str, Dict)):
40 |             raise TypeError(
41 |                 "provided config should be either a path to a config file \
42 |                 or a python dict, but got {config_type}".format(
43 |                     config_type=type(config)
44 |                 )
45 |             )
46 |         if isinstance(config, str):
47 |             config = unmarshal(config)
48 | 
49 |         arg_parser = HfArgumentParser(
50 |             [
51 |                 FMArguments,
52 |                 HFTrainingArguments,
53 |                 InfraArguments,
54 |                 DataArguments,
55 |                 PeftLoraConfig,
56 |                 PeftQLoraConfig,
57 |             ]
58 |         )
59 | 
60 |         return arg_parser.parse_dict(config)
61 |     except Exception as e:  # pylint: disable=broad-except
62 |         logger.error(
63 |             "failed to parse the provided arguments from config {config}. error: {e}".format(
64 |                 config=config, e=e
65 |             )
66 |         )
67 | 
--------------------------------------------------------------------------------
/fm_training_estimator/config/test_configs/config1.json:
--------------------------------------------------------------------------------
1 | {
2 |     "max_seq_length": 1023
3 | }
4 | 
--------------------------------------------------------------------------------
/fm_training_estimator/config/test_configs/config2.json:
--------------------------------------------------------------------------------
1 | {
2 |     "base_model_path": "ibm-granite/granite-7b-base",
3 |     "torch_dtype": "float16",
4 |     "fsdp": "full_shard",
5 |     "numGpusPerPod": 2,
6 |     "per_device_train_batch_size": 4,
7 |     "block_size": 512
8 | }
9 | 
--------------------------------------------------------------------------------
/fm_training_estimator/config/test_configs/config3.json:
--------------------------------------------------------------------------------
1 | {
2 |     "base_model_path": "ibm-granite/granite-7b-base",
3 |     "torch_dtype": "float16",
4 |     "fsdp": "full_shard",
5 |     "numGpusPerPod": 2,
6 |     "per_device_train_batch_size": 5,
7 |     "block_size": 512
8 | }
9 | 
--------------------------------------------------------------------------------
/fm_training_estimator/config/test_configs/config4.json:
--------------------------------------------------------------------------------
1 | {
2 |     "base_model_path": "ibm-granite/granite-7b-base",
3 |     "torch_dtype": "float16",
4 |     "fsdp": "full_shard",
5 |     "numGpusPerPod": 2,
6 |     "per_device_train_batch_size": 5,
7 |     "block_size": 512,
8 |     "dataset": "imdb",
9 |     "dataset_config_file": "abc.json"
10 | }
11 | 
--------------------------------------------------------------------------------
/fm_training_estimator/config/test_parser.py:
--------------------------------------------------------------------------------
1 | # Standard
2 | from pathlib import Path
3 | 
4 | # Local
5 | from .parser import parse
6 | 
7 | config_file_1 = (Path(__file__).parent / "./test_configs/config4.json").as_posix()
8 | 
9 | 
10 | def test_parse_empty_dict():
11 |     config = {}
12 |     _, _, _, _, _, _ = parse(config)
13 | 
14 | 
15 | def test_parse_dict():
16 |     config = {
17 |         "max_seq_length": 1023,
18 |         "gpu_memory_in_gb": 40,
19 |         "block_size": 1023,
20 |         "per_device_train_batch_size": 2,
21 |         "dataset": "my-dataset",
22 |     }
23 |     fm, ta, ia, da, _, _ = parse(config)
24 | 
25 |     assert fm.max_seq_length == 1023
26 |     assert ia.gpu_memory_in_gb == 40
27 |     assert fm.block_size == 1023
28 |     assert ta.per_device_train_batch_size == 2
29 |     assert da.dataset == "my-dataset"
30 | 
31 | 
32 | def test_parse_file():
33 |     _, _, _, da, _, _ = parse(config_file_1)
34 | 
35 |     assert da.dataset_config_file == "abc.json"
36 | 
--------------------------------------------------------------------------------
/fm_training_estimator/config/test_utils.py:
--------------------------------------------------------------------------------
1 | # Local
2 | from .parser import parse
3 | from .utils import is_fsdp
4 | 
5 | 
6 | def test_fsdp_empty():
7 |     config = {}
8 |     _, ta, _, _, _, _ = parse(config)
9 | 
10 |     assert is_fsdp(ta) is False
11 | 
12 | 
13 | def test_fsdp_enabled():
14 |     config = {"fsdp": "full_shard"}
15 |     _, ta, _, _, _, _ = parse(config)
16 | 
17 |     assert is_fsdp(ta) is True
18 | 
19 |     config = {"fsdp": ["hybrid_shard", "offload"]}
20 |     _, ta, _, _, _, _ = parse(config)
21 | 
22 |     assert is_fsdp(ta) is True
23 | 
--------------------------------------------------------------------------------
/fm_training_estimator/config/utils.py:
--------------------------------------------------------------------------------
1 | # First Party
2 | from fm_training_estimator.config.arguments import HFTrainingArguments
3 | 
4 | 
5 | def is_fsdp(ta: HFTrainingArguments):
6 |     if hasattr(ta, "fsdp") and len(ta.fsdp) != 0:
7 |         return True
8 | 
9 |     return False
10 | 
--------------------------------------------------------------------------------
/fm_training_estimator/data/README.md:
--------------------------------------------------------------------------------
1 | # Data
2 | 
3 | This module is used to standardize and version the supported data formats, used both at train time (for the regression models) and at run time (to structure the data fed to the lookup and regression modules).
4 | 
5 | Since we wish to support an ever-evolving set of dataset features, the data format has been versioned, with names such as "v1", "v2" and so on.
6 | 
7 | There are 3 integration points for this format:
8 | 1. The format of the data in the CSV file for lookup. The names and order of columns, basically.
9 | 2. The feature names (with order) used to train any regression model to be used with the estimator.
10 | 3. The key values (with order) to be used at run time, to query one of the above 2 modules.
11 | 
12 | This module locks, in code, the exact expected format of data under each version name. These names are mainly for human use, to refer to the various formats. The job of this module is to automatically infer data format versions and adjust the data fields, to make it easy for other modules to work with continuously changing data formats.
13 | 
14 | Specifically:
15 | 1. For CSV files used in lookup, this module will check the format version, based on the header, before using the file.
16 | 2. Regression training is expected to use this module to bake the used data format into the model. This way, the model file can be safely shared and re-used. At model load, this format is extracted and used as in 3.
17 | 3. For runtime queries, this module provides helper functions to structure input data to fit the expected data format.
18 | 
19 | In the future, this module can also:
20 | 1. Provide validation functions to check any input data files/models.
21 | 2. Provide correction functions, to coerce input data files to the specified format.
22 | 
23 | ## Formats
24 | 
25 | ### v1: Name based data
26 | 
27 | For an example, look at `../regressor/test_data/data2.csv`. We need the following fields, in order:
28 | ```
29 | model_name,number_gpus,batch_size,seq_len,tokens_per_second,memory,memory_act
30 | ```
31 | 
32 | `model_name` is a HF-compatible model name. All other fields are numbers.
33 | 
34 | `memory` refers to the total memory consumed by that configuration, in bytes.
35 | `memory_act` refers to the activation memory consumed by that configuration, in bytes.
36 | 
37 | ### v2: Feature based data
38 | 
39 | For an example, look at `../regressor/test_data/data3.csv`. We need the following fields, in order:
40 | ```
41 | model_arch,model_hidden_size,model_intermediate_size,model_num_attn_heads,model_num_hidden_layers,model_num_key_value_heads,number_gpus,batch_size,seq_len,tokens_per_second,memory,memory_act
42 | ```
43 | 
44 | Notice how we no longer have the name of the model in the data. Instead, the first 6 fields refer to the model configuration features now being used. All other fields are as in the `Name based data` format.
45 | 
46 | Note: `manager.py` also defines a `v3` format, which extends `v2` with tuning `method` and `gpu_model` fields.
47 | 
--------------------------------------------------------------------------------
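To make the formats above concrete, here is a small illustrative sketch of the helpers this module exports (values are made up):

```python
# Illustrative sketch of the data-format helpers described in this README.
from fm_training_estimator.data import format_query, lookup_format_version

# The CSV header (X columns followed by Y columns) identifies the version.
header = "model_name,number_gpus,batch_size,seq_len,tokens_per_second,memory,memory_act"
assert lookup_format_version(header) == "v1"

# At run time, partial information is coerced into the expected key order.
partials = {
    "model_name": "ibm-granite/granite-7b-base",
    "number_gpus": 2,
    "batch_size": 4,
    "seq_len": 512,
}
query = format_query(partials, "v1")
# query == {"model_name": "ibm-granite/granite-7b-base", "number_gpus": 2,
#           "batch_size": 4, "seq_len": 512}
```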
/fm_training_estimator/data/__init__.py:
--------------------------------------------------------------------------------
1 | # Local
2 | from .manager import format_query, lookup_format_version, get_format_by_version
3 | 
--------------------------------------------------------------------------------
/fm_training_estimator/data/manager.py:
--------------------------------------------------------------------------------
1 | # Local
2 | from ..utils import extract_model_features
3 | 
4 | 
5 | class Format:
6 |     """A class to track the various data formats used for lookup/regressor.
7 | 
8 |     Stores the features used/predicted as strings.
9 |     """
10 | 
11 |     def __init__(self, name, X, Y):
12 |         self.name = name
13 |         self.X = X
14 |         self.Y = Y
15 | 
16 |     def get_all_columns_string(self):
17 |         return self.X + "," + self.Y
18 | 
19 |     def get_empty_key_dict(self):
20 |         res = {}
21 |         for x in self.X.split(","):
22 |             res[x] = None
23 | 
24 |         return res
25 | 
26 | 
27 | """
28 | This is the list of accepted/known data formats.
29 | 
30 | Only one of the following is a valid format for CSV files for lookup and for any trained regression models.
31 | 
32 | When new formats are to be supported, this list is to be updated with a new Format object.
33 | """
34 | formats = [
35 |     Format(
36 |         "v1",
37 |         "model_name,number_gpus,batch_size,seq_len",
38 |         "tokens_per_second,memory,memory_act",
39 |     ),
40 |     Format(
41 |         "v2",
42 |         "model_arch,model_hidden_size,model_intermediate_size,model_num_attn_heads,model_num_hidden_layers,model_num_key_value_heads,number_gpus,batch_size,seq_len",
43 |         "tokens_per_second,memory,memory_act",
44 |     ),
45 |     Format(
46 |         "v3",
47 |         "model_arch,model_hidden_size,model_intermediate_size,model_num_attn_heads,model_num_hidden_layers,model_num_key_value_heads,method,gpu_model,number_gpus,batch_size,seq_len",
48 |         "tokens_per_second,memory,memory_act",
49 |     ),
50 | ]
51 | 
52 | 
53 | def lookup_format_version(data_keys):
54 |     """
55 |     Given a string of comma-separated keys, looks up any matching defined format version.
56 | 
57 |     The input includes both X and Y columns in that order, like the header of
58 |     the CSV used to train/lookup.
59 |     """
60 |     for f in formats:
61 |         if data_keys == f.get_all_columns_string():
62 |             return f.name
63 | 
64 |     return "undefined"
65 | 
66 | 
67 | def get_format_by_version(version):
68 |     """Given a version string, return the relevant Format object."""
69 |     for f in formats:
70 |         if f.name == version:
71 |             return f
72 | 
73 |     return None
74 | 
75 | 
76 | def format_query(partials, version, only_values=False):
77 |     """
78 |     Format a query for a given version using the provided partial information.
79 | 
80 |     If only_values is False, returns a dictionary of key-values according to the format.
81 |     If it is True, returns the values as an array. The former is needed for direct
82 |     lookup in the lookup module, while the latter is used by the regressor.
83 | """ 84 | 85 | vf = get_format_by_version(version) 86 | 87 | # TODO: vf can be None here, if an unsupported format is seen. 88 | 89 | query = vf.get_empty_key_dict() 90 | 91 | # fill in all matching fields from the input, if present in desired version 92 | for k, v in partials.items(): 93 | if k in query: 94 | query[k] = v 95 | 96 | # Handle changes for other model versions here 97 | 98 | if version == "v2" or version == "v3": 99 | model_features = extract_model_features(partials["model_name"]) 100 | for k, v in model_features.items(): 101 | if k in query: 102 | query[k] = v 103 | 104 | # TODO: validate that all fields are filled in here, no None's present 105 | # print(query) 106 | 107 | if not only_values: 108 | return query 109 | else: 110 | return query.values() 111 | -------------------------------------------------------------------------------- /fm_training_estimator/memory/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .fsdp import FSDPEstimator 3 | from .full import FullParameterTuningEstimator 4 | from .hybrid import HybridEstimator 5 | from .lora import HybridLoraEstimator, LoraEstimator 6 | from .qlora import HybridQLoraEstimator, QLoraEstimator 7 | -------------------------------------------------------------------------------- /fm_training_estimator/memory/fsdp/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .fsdp import FSDPEstimator 3 | 4 | __all__ = ["FSDPEstimator"] 5 | -------------------------------------------------------------------------------- /fm_training_estimator/memory/fsdp/fsdp.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | import math 3 | 4 | # Local 5 | from ...config import FMArguments, HFTrainingArguments 6 | from ...utils import fmt_size 7 | from ..full import FullParameterTuningEstimator 8 | 9 | 10 | class FSDPEstimator: 11 | def __init__( 12 | self, 13 | fm_args: FMArguments, 14 | train_args: HFTrainingArguments, 15 | base: FullParameterTuningEstimator, 16 | gpuSize: int, 17 | ) -> None: 18 | self.base = base 19 | self.gpuSize = gpuSize 20 | self.num_of_model_params = self.base.num_of_model_params 21 | self.num_of_trainable_params = self.base.num_of_trainable_params 22 | self.optimizer = self.base.optimizer 23 | self.precision = self.base.precision 24 | self.s = self.base.s 25 | """fsdp options 26 | - `"full_shard"`: Shard parameters, gradients and optimizer states. 27 | - `"shard_grad_op"`: Shard optimizer states and gradients. 28 | - `"hybrid_shard"`: Apply `FULL_SHARD` within a node, and replicate parameters across nodes. 29 | - `"hybrid_shard_zero2"`: Apply `SHARD_GRAD_OP` within a node, and replicate parameters across nodes. 30 | - `"offload"`: Offload parameters and gradients to CPUs (only compatible with `"full_shard"` and 31 | `"shard_grad_op"`). 32 | - `"auto_wrap"`: Automatically recursively wrap layers with FSDP using `default_auto_wrap_policy`. 
33 | """ 34 | self.fsdp_options = train_args.fsdp 35 | # ignores multi node training 36 | self.num_gpus = None 37 | 38 | def set_number_of_gpus(self, num_gpus): 39 | self.num_gpus = num_gpus 40 | 41 | def get_number_of_gpus(self): 42 | if self.num_gpus is None: 43 | self.estimate_number_of_gpus() 44 | 45 | return self.num_gpus 46 | 47 | def estimate_number_of_gpus(self): 48 | base_memory = ( 49 | self.base.calculate_activation_memory(readable=False) 50 | + self.base.calculate_gradient_memory(readable=False) 51 | + self.base.calculate_optimizer_memory(readable=False) 52 | ) 53 | if "shard_grad_op" in self.fsdp_options: 54 | return math.ceil( 55 | base_memory 56 | / ( 57 | self.gpuSize 58 | - ( 59 | self.gpuSize * 0.01 60 | + self.base.calculate_model_memory(readable=False) 61 | ) 62 | ) 63 | ) 64 | # leaving out 1% gap 65 | base_memory = (self.base.calculate_model_memory(readable=False)) + base_memory 66 | self.num_gpus = math.ceil(base_memory / (self.gpuSize - self.gpuSize * 0.01)) 67 | return self.num_gpus 68 | 69 | def get_total_mem_estimate(self, readable: bool = False): 70 | size = ( 71 | self.calculate_activation_memory() 72 | + self.calculate_gradient_memory() 73 | + self.calculate_model_memory() 74 | + self.calculate_optimizer_memory() 75 | ) 76 | if readable: 77 | return fmt_size(size) 78 | return size 79 | 80 | def calculate_activation_memory(self, readable: bool = False): 81 | # activations are not sharded however, they are reduced by the minibatch size 82 | # minibatch is the per device batch size 83 | size = self.base.calculate_activation_memory(readable=False) 84 | if readable: 85 | return fmt_size(size) 86 | return size 87 | 88 | def calculate_gradient_memory(self, readable: bool = False): 89 | size = self.base.calculate_gradient_memory(readable=False) / ( 90 | self.get_number_of_gpus() 91 | ) 92 | if readable: 93 | return fmt_size(size) 94 | return size 95 | 96 | def calculate_optimizer_memory(self, readable: bool = False): 97 | size = self.base.calculate_optimizer_memory(readable=False) / ( 98 | self.get_number_of_gpus() 99 | ) 100 | if readable: 101 | return fmt_size(size) 102 | return size 103 | 104 | def calculate_model_memory(self, readable: bool = False): 105 | # at some point FSDP loads double the sharded model memory 106 | size = self.base.calculate_model_memory(readable=False) 107 | if not "shard_grad_op" in self.fsdp_options: 108 | size = size / (self.get_number_of_gpus()) 109 | if readable: 110 | return fmt_size(size) 111 | return size 112 | -------------------------------------------------------------------------------- /fm_training_estimator/memory/fsdp/test_fsdp.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from ...config import parse 3 | from ..full import FullParameterTuningEstimator 4 | from .fsdp import FSDPEstimator 5 | 6 | 7 | def test_fsdp(): 8 | fm, ta, ia, _, _, _ = parse( 9 | {"base_model_path": "ibm-granite/granite-8b-code-base", "gpu_memory_in_gb": 80} 10 | ) 11 | 12 | base = FullParameterTuningEstimator(fm, ta) 13 | est = FSDPEstimator(fm, ta, base, 1024 * 1024 * 1024 * ia.gpu_memory_in_gb) 14 | 15 | est.set_number_of_gpus(1) 16 | mm1 = est.calculate_model_memory() 17 | 18 | est.set_number_of_gpus(2) 19 | mm2 = est.calculate_model_memory() 20 | 21 | assert mm1 == mm2 * 2 22 | 23 | est.set_number_of_gpus(4) 24 | mm4 = est.calculate_model_memory() 25 | 26 | assert mm1 == mm4 * 4 27 | -------------------------------------------------------------------------------- 
/fm_training_estimator/memory/full/README.md:
--------------------------------------------------------------------------------
1 | # Full Estimator
2 | 
3 | Estimates the memory needed for single-GPU, full-parameter fine-tuning.
4 | 
5 | ## Experimental Features
6 | 
7 | ### Gradient Checkpointing
8 | 
9 | How do we scale down activation memory when gradient checkpointing is enabled? (The other three components are not affected.)
10 | 
11 | By examining Profiler output and looking at the code, we find that the checkpoint function (`torch.utils.checkpoint`) is called for each block; for example, see: https://github.com/huggingface/transformers/blob/f5f1e52f6cf13cdf63ff25c311d33e2f2a842911/src/transformers/models/llama/modeling_llama.py#L984
12 | 
13 | This means that the activations of a single block are stored while they are computed; once we are done with a block, just the inputs (which are stored by the `checkpoint` function) are retained, to recompute the activations for the backward pass.
14 | 
15 | So a simple approximation is used here: scale the total activation memory down to that consumed by a single layer or block of the Transformer architecture.
16 | 
--------------------------------------------------------------------------------
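In code terms, the approximation described in this README is a single division by the number of layers (see `size /= self.l` in `calculate_activation_memory` below). A toy sketch with hypothetical numbers:

```python
# Toy sketch of the gradient-checkpointing approximation (hypothetical numbers):
# retain activations for roughly one transformer block instead of all blocks.
num_layers = 36                       # hypothetical model depth
full_activation_mem = 90 * 1024**3    # activation footprint without checkpointing

checkpointed_activation_mem = full_activation_mem / num_layers
print(checkpointed_activation_mem / 1024**3)  # 2.5 GiB
```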
/fm_training_estimator/memory/full/__init__.py:
--------------------------------------------------------------------------------
1 | # Local
2 | from .full import FullParameterTuningEstimator
3 | 
4 | __all__ = ["FullParameterTuningEstimator"]
5 | 
--------------------------------------------------------------------------------
/fm_training_estimator/memory/full/full.py:
--------------------------------------------------------------------------------
1 | # Third Party
2 | from transformers import AutoConfig, AutoTokenizer
3 | from transformers.training_args import OptimizerNames
4 | 
5 | # Local
6 | from ...config import FMArguments, HFTrainingArguments
7 | from ...utils import fmt_size, get_size_from_precision, logger
8 | 
9 | 
10 | class FullParameterTuningEstimator:
11 |     def __init__(self, fm_args: FMArguments, train_args: HFTrainingArguments) -> None:
12 |         # see https://huggingface.co/docs/transformers/v4.18.0/en/performance
13 |         self.train_args = train_args
14 |         self.fm_args = fm_args
15 |         self.model_path = self.fm_args.base_model_path
16 |         self.config = AutoConfig.from_pretrained(self.model_path)
17 |         # check https://github.com/bigscience-workshop/bigscience/tree/6917a3b5fefcf439d3485ca184b4d9f6ab605150/math#model-sizing
18 |         if hasattr(self.config, "n_embed"):
19 |             self.h = self.config.n_embed
20 |         elif hasattr(self.config, "n_embd"):
21 |             self.h = self.config.n_embd
22 |         elif hasattr(self.config, "hidden_size"):
23 |             self.h = self.config.hidden_size
24 |         h = self.h
25 |         if hasattr(self.config, "n_layer"):
26 |             l = self.config.n_layer
27 |         elif hasattr(self.config, "num_hidden_layers"):
28 |             l = self.config.num_hidden_layers
29 |         self.l = l
30 |         v = self.config.vocab_size
31 |         self.v = v
32 |         if hasattr(self.config, "n_head"):
33 |             a = self.config.n_head
34 |         elif hasattr(self.config, "num_attention_heads"):
35 |             a = self.config.num_attention_heads
36 |         self.a = a
37 |         self.b = self.train_args.per_device_train_batch_size
38 |         tokenizer = AutoTokenizer.from_pretrained(self.model_path)
39 |         n_positions = tokenizer.model_max_length
40 |         if hasattr(self.config, "n_positions"):
41 |             n_positions = self.config.n_positions
42 |         if hasattr(self.config, "max_position_embeddings"):
43 |             n_positions = self.config.max_position_embeddings
44 |         self.model_max_length = n_positions
45 |         self.s = min(self.fm_args.block_size, self.model_max_length)
46 |         # trainable parameters in full parameter tuning
47 |         self.num_of_model_params = l * (12 * h**2 + 13 * h) + v * h + 4 * h
48 |         self.num_of_trainable_params = self.num_of_model_params
49 | 
50 |         # optimizers supported by the transformers library
51 |         self.optimizer = OptimizerNames(self.train_args.optim)
52 |         self.precision = self._get_precision()
53 | 
54 |     def set_trainable_parameters(self, num_params):
55 |         self.num_of_trainable_params = num_params
56 | 
57 |     def set_hidden_size(self, hidden_size):
58 |         self.h = hidden_size
59 | 
60 |     def _get_precision(self) -> str:
61 |         ## TODO: expand support for other precisions mentioned in TrainingArguments
62 |         return self.fm_args.torch_dtype
63 | 
64 |     def calculate_activation_memory(self, readable: bool = False):
65 |         # see https://blog.eleuther.ai/transformer-math/#activations-and-batch-size
66 |         s = self.s
67 |         b = self.b
68 |         # a is the number of attention heads; h is the hidden dimension
69 |         a = self.a
70 |         l = self.l
71 |         h = self.h
72 |         v = self.v
73 |         # neither tensor parallelism nor sequence parallelism is considered at this point
74 |         # activations are assumed to be stored in fp16
75 |         # (https://blog.eleuther.ai/transformer-math/#activations-and-batch-size)
76 |         t = 1
77 |         # TODO: there are variations in mem usage based on activation recomputation
78 |         # we take the worst case scenario
79 |         transformer_block_size = (s * b * h * l) * (
80 |             10 + (24 / t) + (5 * (a * s) / (h * t))
81 |         )
82 |         # input embeddings + last norm + output layer
83 |         # no pipeline parallelism
84 |         v = self.config.vocab_size
85 |         p = 1
86 |         # peripheral_size = ((s*b*h*l) / t) * ((p / l) + ((p * 4 / l) * (1 + (v/h))))
87 |         # print(fmt_size(peripheral_size))
88 |         size = transformer_block_size
89 | 
90 |         if self.train_args.gradient_checkpointing:
91 |             size /= self.l
92 | 
93 |         multiplier = 1
94 |         if self.precision == "float32":
95 |             logger.debug("Memory Full - Using multiplier 2 as precision is float32.")
96 |             multiplier = 2
97 |         elif self.precision == "float16" or self.precision == "bfloat16":
98 |             logger.debug(
99 |                 "Memory Full - Using multiplier 1 as precision is bfloat16 or float16."
100 |             )
101 |             multiplier = 1
102 |         # print(s, b, h, l)
103 |         # print(fmt_size(19 * s * b * h * l))
104 |         size = size * multiplier
105 |         # print(fmt_size(size / l))
106 |         if readable:
107 |             return fmt_size(size)
108 |         return size
109 | 
110 |     def get_total_mem_estimate(self, readable: bool = False):
111 |         # see https://blog.eleuther.ai/transformer-math/#distributed-training
112 |         # TODO: fsdp is considered similar to the Deepspeed ZeRO stages in terms of memory consumption
113 |         # fsdp_sharding_strategy
114 |         # FULL_SHARD (params, optim, and gradient) == deepspeed zero 3
115 |         # SHARD_GRAD_OP (optim, and gradient) == deepspeed zero 2
116 |         # NO_SHARD == DDP / deepspeed zero 0
117 |         # HYBRID_SHARD (full shard in each node, like ddp across nodes) == deepspeed zero++ stage 3
118 | 
119 |         # however, a concrete formulation would be more helpful
120 |         size = (
121 |             self.calculate_activation_memory()
122 |             + self.calculate_gradient_memory()
123 |             + self.calculate_model_memory()
124 |             + self.calculate_optimizer_memory()
125 |         )
126 |         if readable:
127 |             return fmt_size(size)
128 |         return size
129 | 
130 |     def calculate_gradient_memory(self, readable: bool = False):
131 |         # see https://blog.eleuther.ai/transformer-math/#gradients
132 |         multiplier = 0
133 |         # TODO: gradient may not be in the same precision as the model
134 |         # NOTE: there could be mixed precision as well
135 |         # for mixed precision it is still fp32 computation
136 |         if self.precision == "float32":
137 |             multiplier = 4
138 |         elif self.precision == "float16" or self.precision == "bfloat16":
139 |             multiplier = 2
140 |         else:
141 |             raise ValueError("no support for the precision")
142 |         size = self.num_of_trainable_params * multiplier
143 |         if readable:
144 |             return fmt_size(size)
145 |         return size
146 | 
147 |     def calculate_model_memory(self, readable: bool = False):
148 |         # TODO: we did not consider mixed precision here
149 |         # see https://huggingface.co/docs/transformers/v4.25.1/en/perf_train_gpu_one
150 |         size = self.num_of_model_params * get_size_from_precision(self.precision)
151 |         if readable:
152 |             return fmt_size(size)
153 |         return size
154 | 
155 |     def calculate_optimizer_memory(self, readable: bool = False):
156 |         multiplier = 0
157 |         # check https://github.com/huggingface/transformers/issues/22101
158 |         ## check https://blog.eleuther.ai/transformer-math/#optimizer-states
159 |         ## check https://huggingface.co/docs/transformers/v4.25.1/en/perf_train_gpu_one
160 |         ## TODO: should detect 8-bit adamw if being used and compute
161 |         if self.optimizer in (OptimizerNames.ADAMW_TORCH, OptimizerNames.ADAMW_HF):
162 |             # optimizer state is a function of the gradients/parameters dtype
163 |             if self.precision == "float32":
164 |                 multiplier = 8
165 |             elif self.precision == "float16" or self.precision == "bfloat16":
166 |                 multiplier = 4
167 |         elif self.optimizer == OptimizerNames.SGD:
168 |             multiplier = 4
169 |         else:
170 |             raise NotImplementedError("computation for optimizer is not implemented")
171 |         size = self.num_of_trainable_params * multiplier
172 |         if readable:
173 |             return fmt_size(size)
174 |         return size
175 | 
--------------------------------------------------------------------------------
/fm_training_estimator/memory/full/test_full.py:
--------------------------------------------------------------------------------
1 | # Standard
2 | 
3 | # Local
4 | from ...config import parse
5 | from .full import FullParameterTuningEstimator
6 | 
7 | 
8 | def test_full():
9 |     fm, ta, _, _, _, _ = parse({})
10 |     est = FullParameterTuningEstimator(fm, ta)
11 | 
12 |     mm = 
est.calculate_model_memory() 13 | assert mm > 5 * 1_000_000_000 14 | assert mm < 15 * 1_000_000_000 15 | 16 | 17 | def test_custom_model(): 18 | fm, ta, _, _, _, _ = parse({"base_model_path": "ibm-granite/granite-8b-code-base"}) 19 | est = FullParameterTuningEstimator(fm, ta) 20 | 21 | mm = est.calculate_model_memory() 22 | assert mm > 25 * 1_000_000_000 23 | assert mm < 35 * 1_000_000_000 24 | 25 | 26 | def test_half_precision(): 27 | fm, ta, _, _, _, _ = parse( 28 | { 29 | "base_model_path": "ibm-granite/granite-8b-code-base", 30 | "torch_dtype": "float16", 31 | } 32 | ) 33 | est = FullParameterTuningEstimator(fm, ta) 34 | 35 | mm = est.calculate_model_memory() 36 | assert mm > 10 * 1_000_000_000 37 | assert mm < 20 * 1_000_000_000 38 | 39 | 40 | def test_gradient_checkpointing(): 41 | fm, ta, _, _, _, _ = parse( 42 | { 43 | "base_model_path": "ibm-granite/granite-8b-code-base", 44 | } 45 | ) 46 | est1 = FullParameterTuningEstimator(fm, ta) 47 | mm1 = est1.calculate_activation_memory() 48 | 49 | ta.gradient_checkpointing = True 50 | est2 = FullParameterTuningEstimator(fm, ta) 51 | mm2 = est2.calculate_activation_memory() 52 | 53 | assert mm2 * 10 < mm1 54 | -------------------------------------------------------------------------------- /fm_training_estimator/memory/hybrid/README.md: -------------------------------------------------------------------------------- 1 | # Hybrid Memory Estimator 2 | 3 | Uses a mix of theory, lookup and regressor, as follows. 4 | 5 | ```mermaid 6 | flowchart TD 7 | A[Input config] --> B{Is it FSDP?}; 8 | B -- No --> C[Report breakup and total from Theory]; 9 | B -- Yes --> D{Is Lookup DB available?}; 10 | D -- No --> H; 11 | D -- Yes --> E[Try Lookup]; 12 | E --> F{Data point present?}; 13 | F -- Yes --> G[Return full memory]; 14 | F -- No --> H{Is ML Model available?}; 15 | H -- No --> I[Failure]; 16 | H -- Yes --> J[Predict Activation Memory from model]; 17 | J --> K[Calculate other components from Theory]; 18 | K --> L[Report total]; 19 | ``` 20 | -------------------------------------------------------------------------------- /fm_training_estimator/memory/hybrid/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .hybrid import HybridEstimator 3 | 4 | __all__ = ["HybridEstimator"] 5 | -------------------------------------------------------------------------------- /fm_training_estimator/memory/hybrid/hybrid.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from ...config import FMArguments, HFTrainingArguments, InfraArguments, is_fsdp 3 | from ...data import format_query 4 | from ...regressor import LookupRegressor, GetRegressor 5 | from ...utils import logger 6 | from ..fsdp import FSDPEstimator 7 | from ..full import FullParameterTuningEstimator 8 | 9 | 10 | class HybridEstimator: 11 | def __init__( 12 | self, 13 | fm_args: FMArguments, 14 | train_args: HFTrainingArguments, 15 | infra_args: InfraArguments, 16 | lookup_data_path, 17 | model_path, 18 | ): 19 | 20 | logger.info("Memory Hybrid: Initializing") 21 | 22 | self.fm = fm_args 23 | self.ta = train_args 24 | self.ia = infra_args 25 | 26 | # if fsdp param is not set, set it to default 27 | if self.ia.numGpusPerPod != 1: 28 | if self.ta.fsdp == []: 29 | self.ta.fsdp = ["full_shard"] 30 | 31 | self.full_est = FullParameterTuningEstimator(fm_args, train_args) 32 | 33 | if not is_fsdp(self.ta): 34 | self.fsdp_enabled = False 35 | return 36 | 37 | # FSDP related logic 38 | self.fsdp_enabled 
= True
39 |         self.fsdp_est = FSDPEstimator(
40 |             fm_args,
41 |             train_args,
42 |             self.full_est,
43 |             infra_args.gpu_memory_in_gb * 1024 * 1024 * 1024,
44 |         )
45 | 
46 |         self.fsdp_est.set_number_of_gpus(self.ia.numGpusPerPod)
47 | 
48 |         # Lookup based estimator
49 |         if lookup_data_path is not None:
50 |             self.lookup_est = LookupRegressor(lookup_data_path)
51 |         else:
52 |             self.lookup_est = None
53 | 
54 |         # Model based estimator
55 |         if model_path is not None:
56 |             self.reg_est = GetRegressor(model_path)
57 |         else:
58 |             self.reg_est = None
59 | 
60 |         # auto-discover?
61 |         if self.ia.numGpusPerPod == 0:
62 |             self.auto_discover_num_gpus()
63 | 
64 |     def auto_discover_num_gpus(self):
65 |         """Discover the number of GPUs needed, by guessing and empirically validating."""
66 |         logger.info("Memory Hybrid - Attempting auto discovery of num gpus...")
67 | 
68 |         guess = self.fsdp_est.estimate_number_of_gpus()
69 |         trials = 10
70 | 
71 |         while trials > 0:
72 |             self.fsdp_est.set_number_of_gpus(guess)
73 |             mem = self.get_total_mem_estimate()
74 | 
75 |             # acceptable memory configuration found
76 |             if mem < self.ia.gpu_memory_in_gb * 1024**3:
77 |                 logger.debug(
78 |                     "Memory Hybrid - finalized num of gpus to: {}".format(guess)
79 |                 )
80 |                 return
81 | 
82 |             guess += 1
83 |             trials -= 1
84 | 
85 |         logger.warning("Memory Hybrid - No suitable num gpus found!")
86 |         self.fsdp_est.set_number_of_gpus(-1)
87 | 
88 |     def lookup_mem(self):
89 |         lookup_query = {
90 |             "model_name": self.fm.base_model_path,
91 |             "number_gpus": self.fsdp_est.num_gpus,
92 |             "batch_size": self.ta.per_device_train_batch_size,
93 |             "seq_len": self.fm.block_size,
94 |             "gpu_model": self.ia.gpuModel,
95 |             "method": self.fm.technique,
96 |         }
97 | 
98 |         lookup_query = format_query(lookup_query, self.lookup_est.get_data_format())
99 | 
100 |         res = self.lookup_est.run(lookup_query)
101 | 
102 |         if res.empty:
103 |             return None
104 | 
105 |         return res["memory"][0:1].item()
106 | 
107 |     def calculate_activation_memory(self):
108 |         if not self.fsdp_enabled:
109 |             return self.full_est.calculate_activation_memory()
110 | 
111 |         if self.reg_est is None:
112 |             logger.debug("Memory Hybrid - Skipping Regression")
113 |             return self.fsdp_est.calculate_activation_memory()
114 | 
115 |         logger.debug("Memory Hybrid - Attempting Regression")
116 | 
117 |         lookup_query = {
118 |             "model_name": self.fm.base_model_path,
119 |             "number_gpus": self.fsdp_est.num_gpus,
120 |             "batch_size": self.ta.per_device_train_batch_size,
121 |             "seq_len": self.fm.block_size,
122 |             "gpu_model": self.ia.gpuModel,
123 |             "method": self.fm.technique,
124 |         }
125 | 
126 |         params = format_query(
127 |             lookup_query, self.reg_est.get_data_format(), only_values=True
128 |         )
129 | 
130 |         act = self.reg_est.run(params, "memory_act")
131 | 
132 |         logger.info(
133 |             "Memory Hybrid - Activation, from regression: {}, from theory: {}".format(
134 |                 act, self.fsdp_est.calculate_activation_memory()
135 |             )
136 |         )
137 | 
138 |         return act
139 | 
140 |     def calculate_gradient_memory(self):
141 |         if not self.fsdp_enabled:
142 |             return self.full_est.calculate_gradient_memory()
143 | 
144 |         return self.fsdp_est.calculate_gradient_memory()
145 | 
146 |     def calculate_model_memory(self):
147 |         if not self.fsdp_enabled:
148 |             return self.full_est.calculate_model_memory()
149 | 
150 |         return self.fsdp_est.calculate_model_memory()
151 | 
152 |     def calculate_optimizer_memory(self):
153 |         if not self.fsdp_enabled:
154 |             return self.full_est.calculate_optimizer_memory()
155 | 
156 |         return self.fsdp_est.calculate_optimizer_memory()
157 | 
158 |     def 
get_total_mem_estimate(self): 159 | if not self.fsdp_enabled: 160 | return self.full_est.get_total_mem_estimate() 161 | 162 | # simple lookup 163 | if self.lookup_est is not None: 164 | logger.debug("Memory Hybrid - attempting lookup") 165 | lookup_mem = self.lookup_mem() 166 | if lookup_mem is not None: 167 | logger.debug("Memory Hybrid - match found") 168 | return lookup_mem 169 | 170 | logger.info("Memory Hybrid - lookup failed") 171 | 172 | size = ( 173 | self.calculate_activation_memory() 174 | + self.fsdp_est.calculate_gradient_memory() 175 | + self.fsdp_est.calculate_model_memory() 176 | + self.fsdp_est.calculate_optimizer_memory() 177 | ) 178 | 179 | return size 180 | -------------------------------------------------------------------------------- /fm_training_estimator/memory/hybrid/hybrid_test.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | from pathlib import Path 3 | 4 | # Local 5 | from ...config import parse 6 | from ...regressor import XGBoostRegressor 7 | from .hybrid import HybridEstimator 8 | 9 | test_data2 = (Path(__file__).parent / "../../regressor/test_data/data2.csv").as_posix() 10 | test_data3 = (Path(__file__).parent / "../../regressor/test_data/data3.csv").as_posix() 11 | 12 | 13 | def test_hybrid(tmp_path): 14 | 15 | model_path = tmp_path / "test.model.json" 16 | reg = XGBoostRegressor() 17 | reg.train(test_data2, model_path, ["tokens_per_second", "memory", "memory_act"]) 18 | 19 | fm, ta, ia, _, _, _ = parse( 20 | { 21 | "base_model_path": "ibm-granite/granite-7b-base", 22 | "gpu_memory_in_gb": 80, 23 | "fsdp": "full_shard", 24 | "per_device_train_batch_size": 4, 25 | "block_size": 512, 26 | "numGpusPerPod": 2, 27 | } 28 | ) 29 | 30 | est = HybridEstimator(fm, ta, ia, test_data2, model_path) 31 | # Direct lookup example 32 | assert est.get_total_mem_estimate() == 20 33 | 34 | fm, ta, ia, _, _, _ = parse( 35 | { 36 | "base_model_path": "ibm-granite/granite-7b-base", 37 | "gpu_memory_in_gb": 80, 38 | "fsdp": "full_shard", 39 | "per_device_train_batch_size": 4, 40 | "block_size": 512, 41 | "numGpusPerPod": 3, 42 | } 43 | ) 44 | 45 | est = HybridEstimator(fm, ta, ia, test_data2, model_path) 46 | # Lookup fails - uses Reg based approach 47 | assert est.get_total_mem_estimate() >= 10 * 1024 * 1024 * 1024 48 | 49 | grad_mem = est.calculate_gradient_memory() 50 | model_mem = est.calculate_model_memory() 51 | assert grad_mem >= 7 * 1024 * 1024 * 1024 52 | assert model_mem >= 7 * 1024 * 1024 * 1024 53 | assert grad_mem == model_mem 54 | 55 | 56 | def test_use_model_features(tmp_path): 57 | 58 | model_path = tmp_path / "test.model.json" 59 | reg = XGBoostRegressor() 60 | reg.train(test_data3, model_path, ["tokens_per_second", "memory", "memory_act"]) 61 | 62 | fm, ta, ia, _, _, _ = parse( 63 | { 64 | "base_model_path": "ibm-granite/granite-7b-base", 65 | "gpu_memory_in_gb": 80, 66 | "fsdp": "full_shard", 67 | "per_device_train_batch_size": 16, 68 | "block_size": 1024, 69 | "numGpusPerPod": 4, 70 | } 71 | ) 72 | 73 | est = HybridEstimator(fm, ta, ia, test_data3, None) 74 | 75 | # Direct lookup example should work as before 76 | assert est.get_total_mem_estimate() == 20 77 | 78 | fm, ta, ia, _, _, _ = parse( 79 | { 80 | "base_model_path": "ibm-granite/granite-8b-code-base", 81 | "gpu_memory_in_gb": 80, 82 | "fsdp": "full_shard", 83 | "per_device_train_batch_size": 16, 84 | "block_size": 1024, 85 | "numGpusPerPod": 4, 86 | } 87 | ) 88 | 89 | est = HybridEstimator(fm, ta, ia, test_data3, model_path) 90 | 91 | # Regression 
- based on model params
92 |     # though we only provide the model name as input here, we get predictions based on its features
93 |     assert est.calculate_activation_memory() < 30
94 | 
--------------------------------------------------------------------------------
/fm_training_estimator/memory/lora/__init__.py:
--------------------------------------------------------------------------------
1 | # Local
2 | from .hybrid import HybridLoraEstimator
3 | from .lora import LoraEstimator
4 | 
5 | __all__ = ["LoraEstimator", "HybridLoraEstimator"]
6 | 
--------------------------------------------------------------------------------
/fm_training_estimator/memory/lora/hybrid.py:
--------------------------------------------------------------------------------
1 | # Local
2 | from ...config import FMArguments, HFTrainingArguments, InfraArguments, PeftLoraConfig
3 | from ...data import format_query
4 | from ...regressor import LookupRegressor, GetRegressor
5 | from ...utils import logger
6 | from .lora import LoraEstimator
7 | 
8 | 
9 | class HybridLoraEstimator:
10 |     def __init__(
11 |         self,
12 |         fm_args: FMArguments,
13 |         train_args: HFTrainingArguments,
14 |         infra_args: InfraArguments,
15 |         lora_args: PeftLoraConfig,
16 |         lookup_data_path,
17 |         model_path,
18 |     ):
19 | 
20 |         logger.info("Memory Lora Hybrid - Initializing")
21 | 
22 |         self.fm = fm_args
23 |         self.ta = train_args
24 |         self.ia = infra_args
25 | 
26 |         self.lora_est = LoraEstimator(fm_args, train_args, lora_args)
27 | 
28 |         # Lookup based estimator
29 |         if lookup_data_path is not None:
30 |             self.lookup_est = LookupRegressor(lookup_data_path)
31 |         else:
32 |             self.lookup_est = None
33 | 
34 |         # Model based estimator
35 |         if model_path is not None:
36 |             self.reg_est = GetRegressor(model_path)
37 |         else:
38 |             self.reg_est = None
39 | 
40 |         if self.ia.numGpusPerPod == 0:
41 |             # discover the number of gpus
42 |             self.auto_discover_num_gpus()
43 |         else:
44 |             self.num_gpus = self.ia.numGpusPerPod
45 | 
46 |     def auto_discover_num_gpus(self):
47 |         num = self.lora_est.calculate_model_memory() / (
48 |             self.ia.gpu_memory_in_gb * 1024**3
49 |         )
50 |         self.num_gpus = int(num) if num > 1 else 1
51 | 
52 |         trials = 10
53 |         while trials > 0:
54 |             mem = self.get_total_mem_estimate()
55 |             if mem < self.ia.gpu_memory_in_gb * 1024**3:
56 |                 logger.info(
57 |                     "Memory Lora Hybrid - Discovered num gpus: {0}".format(
58 |                         self.num_gpus
59 |                     )
60 |                 )
61 |                 return
62 | 
63 |             trials -= 1
64 |             self.num_gpus += 1
65 | 
66 |         logger.warning("Memory Lora Hybrid - No suitable num gpus found!")
67 | 
68 |     def calculate_model_memory(self):
69 |         return self.lora_est.calculate_model_memory() / self.num_gpus
70 | 
71 |     def calculate_gradient_memory(self):
72 |         return self.lora_est.calculate_gradient_memory() / self.num_gpus
73 | 
74 |     def calculate_optimizer_memory(self):
75 |         return self.lora_est.calculate_optimizer_memory() / self.num_gpus
76 | 
77 |     def calculate_activation_memory(self):
78 |         return self.lora_est.calculate_activation_memory() / self.num_gpus
79 | 
80 |     def get_total_mem_estimate(self):
81 | 
82 |         lookup_query_base = {
83 |             "model_name": self.fm.base_model_path,
84 |             "number_gpus": self.num_gpus,
85 |             "batch_size": self.ta.per_device_train_batch_size,
86 |             "seq_len": self.fm.block_size,
87 |             "gpu_model": self.ia.gpuModel,
88 |             "method": self.fm.technique,
89 |         }
90 | 
91 |         if self.lookup_est is not None:
92 |             logger.debug("Memory Lora Hybrid - Attempting lookup")
93 |             lookup_query = format_query(
94 |                 lookup_query_base, self.lookup_est.get_data_format()
95 |             )
96 |             logger.debug("Memory Lora Hybrid 
- Lookup query for lookup_est is: %s", lookup_query) 97 | res = self.lookup_est.run(lookup_query) 98 | if res.empty: 99 | lookup_mem = None 100 | logger.debug( 101 | "Memory Lora Hybrid - No match was found by lookup, trying reg_est" 102 | ) 103 | else: 104 | lookup_mem = res["memory"][0:1].item() 105 | if lookup_mem is not None: 106 | logger.info("Memory Lora Hybrid - Lookup: match found") 107 | return lookup_mem 108 | 109 | if self.reg_est is not None: 110 | params = format_query( 111 | lookup_query_base, self.reg_est.get_data_format(), only_values=True 112 | ) 113 | logger.debug("Memory Lora Hybrid - Lookup query for reg_est is: %s", params) 114 | act = self.reg_est.run(params, "memory") 115 | logger.debug("Memory Lora Hybrid - Lookup query result for reg_est is: %s", act) 116 | 117 | return act 118 | 119 | # If we reach here, we are falling back on theory 120 | size = ( 121 | self.calculate_activation_memory() 122 | + self.lora_est.calculate_gradient_memory() 123 | + self.lora_est.calculate_model_memory() 124 | + self.lora_est.calculate_optimizer_memory() 125 | ) 126 | 127 | return size 128 | -------------------------------------------------------------------------------- /fm_training_estimator/memory/lora/lora.py: -------------------------------------------------------------------------------- 1 | # Third Party 2 | from accelerate import init_empty_weights 3 | from peft import LoraConfig, get_peft_model 4 | from transformers import AutoConfig, AutoModelForCausalLM 5 | 6 | # Local 7 | from ...config import FMArguments, HFTrainingArguments, PeftLoraConfig 8 | from ...utils import fmt_size, get_size_from_precision, logger 9 | from ..full import FullParameterTuningEstimator 10 | 11 | 12 | class LoraEstimator(FullParameterTuningEstimator): 13 | def __init__( 14 | self, 15 | fm_args: FMArguments, 16 | train_args: HFTrainingArguments, 17 | lora_args: PeftLoraConfig, 18 | ): 19 | super().__init__(fm_args, train_args) 20 | 21 | self.train_args = train_args 22 | self.fm_args = fm_args 23 | self.lora_args = lora_args 24 | 25 | with init_empty_weights(): 26 | modelc = AutoConfig.from_pretrained(self.fm_args.base_model_path) 27 | model = AutoModelForCausalLM.from_config(modelc) 28 | 29 | logger.info("Initializing LoraEstimator with lora args %s", self.lora_args) 30 | self.peft_model = get_peft_model( 31 | model, 32 | LoraConfig( 33 | r=self.lora_args.r, 34 | lora_alpha=self.lora_args.lora_alpha, 35 | lora_dropout=self.lora_args.lora_dropout, 36 | target_modules=self.lora_args.target_modules, 37 | ), 38 | ) 39 | 40 | self.num_of_trainable_params = self.peft_model.num_parameters( 41 | only_trainable=True 42 | ) 43 | self.num_of_model_params = self.peft_model.num_parameters() 44 | 45 | self.precision = self._get_precision() 46 | 47 | def calculate_activation_memory(self, readable=False): 48 | # tensors created during forward pass that are needed for gradient computation 49 | # outputs have to be stored which will be used during backward pass 50 | peft_model_state = self.peft_model.state_dict() 51 | lora_a = [] 52 | lora_b = [] 53 | lora_dropout = [] 54 | either_q_k_v_present = False 55 | for k in peft_model_state: 56 | if "lora_A" in k: 57 | lora_a.append(peft_model_state[k]) 58 | if "lora_B" in k: 59 | lora_b.append(peft_model_state[k]) 60 | if "lora_dropout" in k: 61 | lora_dropout.append(peft_model_state[k]) 62 | if "self_attn" in k: 63 | either_q_k_v_present = True 64 | # for each trainable linear layer 65 | # input_features * batch_size * seq_length elements needed for each layer 66 | 
lora_a_size = 0 67 | lora_b_size = 0 68 | lora_dropout_size = 0 69 | # single shared input for Q K V matrices 70 | input_size = 0 71 | if either_q_k_v_present: 72 | input_size = ( 73 | self.h * self.b * self.s * get_size_from_precision(self.precision) 74 | ) 75 | for lora_a_i in lora_a: 76 | lora_a_size += ( 77 | lora_a_i.size()[1] 78 | * self.b 79 | * self.s 80 | * get_size_from_precision(self.precision) 81 | ) 82 | for lora_b_i in lora_b: 83 | lora_b_size += ( 84 | lora_b_i.size()[1] 85 | * self.b 86 | * self.s 87 | * get_size_from_precision(self.precision) 88 | ) 89 | for lora_dropout_i in lora_dropout: 90 | lora_dropout_size += lora_dropout_i.size()[1] * self.b * self.s 91 | # ignored 2 layer normalization layers and softmax 92 | size = input_size + lora_a_size + lora_b_size + lora_dropout_size 93 | if readable: 94 | return fmt_size(size) 95 | return size 96 | -------------------------------------------------------------------------------- /fm_training_estimator/memory/lora/test_lora.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | 3 | # Local 4 | from ...config import parse 5 | from ...utils import fmt_size 6 | from .lora import LoraEstimator 7 | 8 | 9 | def test_lora(): 10 | fm, ta, _, _, la, _ = parse( 11 | { 12 | "base_model_path": "codellama/CodeLlama-13b-hf", 13 | "per_device_train_batch_size": 1, 14 | "torch_dtype": "bfloat16", 15 | "r": 8, 16 | } 17 | ) 18 | est = LoraEstimator(fm, ta, la) 19 | 20 | assert est.calculate_optimizer_memory() < 100 * 1_000_000 21 | -------------------------------------------------------------------------------- /fm_training_estimator/memory/qlora/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .hybrid import HybridQLoraEstimator 3 | from .qlora import QLoraEstimator 4 | 5 | __all__ = ["QLoraEstimator", "HybridQLoraEstimator"] 6 | -------------------------------------------------------------------------------- /fm_training_estimator/memory/qlora/hybrid.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from ...config import ( 3 | FMArguments, 4 | HFTrainingArguments, 5 | InfraArguments, 6 | PeftLoraConfig, 7 | PeftQLoraConfig, 8 | ) 9 | from ...data import format_query 10 | from ...regressor import LookupRegressor, GetRegressor 11 | from ...utils import logger 12 | from .qlora import QLoraEstimator 13 | 14 | 15 | class HybridQLoraEstimator: 16 | def __init__( 17 | self, 18 | fm_args: FMArguments, 19 | train_args: HFTrainingArguments, 20 | infra_args: InfraArguments, 21 | lora_args: PeftLoraConfig, 22 | qlora_args: PeftQLoraConfig, 23 | lookup_data_path, 24 | model_path, 25 | ): 26 | 27 | logger.info("Memory QLoRA Hybrid - Initializing") 28 | 29 | self.fm = fm_args 30 | self.ta = train_args 31 | self.ia = infra_args 32 | 33 | self.qlora_est = QLoraEstimator(fm_args, train_args, lora_args, qlora_args) 34 | 35 | # Lookup based estimator 36 | if lookup_data_path is not None: 37 | self.lookup_est = LookupRegressor(lookup_data_path) 38 | else: 39 | self.lookup_est = None 40 | 41 | # Model based estimator 42 | if model_path is not None: 43 | self.reg_est = GetRegressor(model_path) 44 | else: 45 | self.reg_est = None 46 | 47 | if self.ia.numGpusPerPod == 0: 48 | # discover number of gpus 49 | self.auto_discover_num_gpus() 50 | else: 51 | self.num_gpus = self.ia.numGpusPerPod 52 | 53 | def auto_discover_num_gpus(self): 54 | num = self.qlora_est.calculate_model_memory() / ( 55 | 
self.ia.gpu_memory_in_gb * 1024**3 56 | ) 57 | self.num_gpus = int(num) if num > 1 else 1 58 | 59 | trials = 10 60 | while trials > 0: 61 | mem = self.get_total_mem_estimate() 62 | if mem < self.ia.gpu_memory_in_gb * 1024**3: 63 | logger.debug( 64 | "Memory QLoRA Hybrid - Discovered num gpus: {0}".format( 65 | self.num_gpus 66 | ) 67 | ) 68 | return 69 | 70 | trials -= 1 71 | self.num_gpus += 1 72 | 73 | logger.warning("Memory QLoRA Hybrid - No suitable num gpus found!") 74 | 75 | def calculate_model_memory(self): 76 | return self.qlora_est.calculate_model_memory() / self.num_gpus 77 | 78 | def calculate_gradient_memory(self): 79 | return self.qlora_est.calculate_gradient_memory() / self.num_gpus 80 | 81 | def calculate_optimizer_memory(self): 82 | return self.qlora_est.calculate_optimizer_memory() / self.num_gpus 83 | 84 | def calculate_activation_memory(self): 85 | return self.qlora_est.calculate_activation_memory() / self.num_gpus 86 | 87 | def get_total_mem_estimate(self): 88 | 89 | lookup_query_base = { 90 | "model_name": self.fm.base_model_path, 91 | "number_gpus": self.num_gpus, 92 | "batch_size": self.ta.per_device_train_batch_size, 93 | "seq_len": self.fm.block_size, 94 | "gpu_model": self.ia.gpuModel, 95 | "method": self.fm.technique, 96 | } 97 | 98 | if self.lookup_est is not None: 99 | logger.debug("Memory QLoRA Hybrid - attempting lookup") 100 | lookup_query = format_query( 101 | lookup_query_base, self.lookup_est.get_data_format() 102 | ) 103 | res = self.lookup_est.run(lookup_query) 104 | if res.empty: 105 | lookup_mem = None 106 | else: 107 | lookup_mem = res["memory"][0:1].item() 108 | if lookup_mem is not None: 109 | logger.debug("Memory QLoRA Hybrid - match found") 110 | return lookup_mem 111 | 112 | if self.reg_est is not None: 113 | params = format_query( 114 | lookup_query_base, self.reg_est.get_data_format(), only_values=True 115 | ) 116 | act = self.reg_est.run(params, "memory") 117 | 118 | return act 119 | 120 | # No fall back here 121 | -------------------------------------------------------------------------------- /fm_training_estimator/memory/qlora/qlora.py: -------------------------------------------------------------------------------- 1 | # Third Party 2 | from accelerate import init_empty_weights 3 | from peft import LoraConfig, get_peft_model 4 | from transformers import AutoConfig, AutoModelForCausalLM 5 | 6 | # Local 7 | from ...config import FMArguments, HFTrainingArguments, PeftLoraConfig, PeftQLoraConfig 8 | from ...utils import fmt_size, get_size_from_precision 9 | from ..full import FullParameterTuningEstimator 10 | 11 | 12 | class QLoraEstimator(FullParameterTuningEstimator): 13 | def __init__( 14 | self, 15 | fm_args: FMArguments, 16 | train_args: HFTrainingArguments, 17 | lora_args: PeftLoraConfig, 18 | qlora_args: PeftQLoraConfig, 19 | ): 20 | super().__init__(fm_args, train_args) 21 | 22 | self.train_args = train_args 23 | self.fm_args = fm_args 24 | self.lora_args = lora_args 25 | self.qlora_args = qlora_args 26 | 27 | with init_empty_weights(): 28 | modelc = AutoConfig.from_pretrained(self.fm_args.base_model_path) 29 | model = AutoModelForCausalLM.from_config(modelc) 30 | 31 | # cast our lora config dataclass instance into the real peft dataclass fmt 32 | self.peft_model = get_peft_model(model, LoraConfig(**self.lora_args.__dict__)) 33 | 34 | self.num_of_trainable_params = self.peft_model.num_parameters( 35 | only_trainable=True 36 | ) 37 | self.num_of_model_params = self.peft_model.num_parameters() 38 | 39 | self.precision = 
self._get_precision() 40 | 41 | def calculate_model_memory(self, readable=False): 42 | # See QLora paper https://arxiv.org/pdf/2305.14314 43 | # Quantization overhead for each model parameter of 0.5 bits or 0.0625 bytes. 44 | # If double quantization is enabled this can be further brought down to 0.127 bits or 0.015875 45 | 46 | if self.qlora_args.use_double_quant: 47 | size = self.num_of_model_params * ( 48 | get_size_from_precision(self.qlora_args.quant_type) + 0.015875 49 | ) 50 | else: 51 | size = self.num_of_model_params * ( 52 | get_size_from_precision(self.qlora_args.quant_type) + 0.0625 53 | ) 54 | 55 | if readable: 56 | return fmt_size(size) 57 | return size 58 | 59 | def calculate_activation_memory(self, readable=False): 60 | # tensors created during forward pass that are needed for gradient computation 61 | # outputs have to be stored which will be used during backward pass 62 | 63 | # TODO: this is currently same as LoRA, since theoretically tensors created during the forward pass are the same 64 | peft_model_state = self.peft_model.state_dict() 65 | lora_a = [] 66 | lora_b = [] 67 | lora_dropout = [] 68 | either_q_k_v_present = False 69 | for k in peft_model_state: 70 | if "lora_A" in k: 71 | lora_a.append(peft_model_state[k]) 72 | if "lora_B" in k: 73 | lora_b.append(peft_model_state[k]) 74 | if "lora_dropout" in k: 75 | lora_dropout.append(peft_model_state[k]) 76 | if "self_attn" in k: 77 | either_q_k_v_present = True 78 | # for each trainable linear layer 79 | # input_features * batch_size * seq_length elements needed for each layer 80 | lora_a_size = 0 81 | lora_b_size = 0 82 | lora_dropout_size = 0 83 | # single shared input for Q K V matrices 84 | input_size = 0 85 | if either_q_k_v_present: 86 | input_size = ( 87 | self.h * self.b * self.s * get_size_from_precision(self.precision) 88 | ) 89 | for lora_a_i in lora_a: 90 | lora_a_size += ( 91 | lora_a_i.size()[1] 92 | * self.b 93 | * self.s 94 | * get_size_from_precision(self.precision) 95 | ) 96 | for lora_b_i in lora_b: 97 | lora_b_size += ( 98 | lora_b_i.size()[1] 99 | * self.b 100 | * self.s 101 | * get_size_from_precision(self.precision) 102 | ) 103 | for lora_dropout_i in lora_dropout: 104 | lora_dropout_size += lora_dropout_i.size()[1] * self.b * self.s 105 | # ignored 2 layer normalization layers and softmax 106 | size = input_size + lora_a_size + lora_b_size + lora_dropout_size 107 | if readable: 108 | return fmt_size(size) 109 | return size 110 | -------------------------------------------------------------------------------- /fm_training_estimator/memory/qlora/test_qlora.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | 3 | # Local 4 | from ...config import parse 5 | from ...utils import fmt_size 6 | from .qlora import QLoraEstimator 7 | 8 | def test_qlora(): 9 | fm, ta, _, _, la, qla = parse( 10 | { 11 | "base_model_path": "codellama/CodeLlama-13b-hf", 12 | "per_device_train_batch_size": 1, 13 | "torch_dtype": "bfloat16", 14 | "r": 8, 15 | "use_double_quant": False, 16 | } 17 | ) 18 | est = QLoraEstimator(fm, ta, la, qla) 19 | 20 | assert est.calculate_optimizer_memory() < 100 * 1_000_000 21 | -------------------------------------------------------------------------------- /fm_training_estimator/regressor/README.md: -------------------------------------------------------------------------------- 1 | # regressor 2 | 3 | This folder contains a few important pieces. 4 | 5 | 1. The format of data expected 6 | 2. 
Lookup module: to directly lookup an input configuration from existing dataset 7 | 3. XGBoost module: XGBoost based regressor 8 | 9 | More regression modules can be added to this folder and then used in the various estimator modules. For example, the current modules are used in `memory/hybrid` and `throughput/hybrid`. 10 | 11 | As more ML based modules are added here, the interfaces will be locked in. For now, the example to follow is the XGBoost module. 12 | 13 | ## Data formats 14 | 15 | Data has to be in a common format for training regression modules and for runtime invocations. This is needed so that at runtime, we are able to correctly format the query to the lookup and regression modules. 16 | 17 | Refer to the `data/` module for details on the data formats. 18 | -------------------------------------------------------------------------------- /fm_training_estimator/regressor/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .lookup import LookupRegressor 3 | from .xgboost import XGBoostRegressor 4 | from .linear import LinearRegressor 5 | from .arise import AriseRegressor 6 | 7 | from .dispatch import GetRegressor 8 | -------------------------------------------------------------------------------- /fm_training_estimator/regressor/arise/README.md: -------------------------------------------------------------------------------- 1 | # Arise Regressor 2 | 3 | Train the regressor: 4 | ``` 5 | python -m fm_training_estimator.regressor.arise.train 6 | ``` 7 | -------------------------------------------------------------------------------- /fm_training_estimator/regressor/arise/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .arise import AriseRegressor 3 | 4 | __all__ = ["AriseRegressor"] 5 | -------------------------------------------------------------------------------- /fm_training_estimator/regressor/arise/arise.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import shutil 4 | import pandas 5 | import yaml 6 | import zipfile 7 | from functools import lru_cache 8 | 9 | from arise_predictions.preprocessing import job_parser 10 | from arise_predictions.utils import constants, utils 11 | from arise_predictions.auto_model.build_models import auto_build_models, get_estimators_config 12 | from arise_predictions.perform_predict.predict import demo_predict, get_predict_config_from_dict 13 | 14 | from ...data import lookup_format_version, get_format_by_version 15 | 16 | class AriseRegressor: 17 | def __init__(self, model_path=None): 18 | self.model_path = model_path 19 | 20 | def preprocess(self, workdir, job_spec): 21 | inputs = sorted(list(job_spec[0])) 22 | outputs = sorted(list(job_spec[1])) 23 | start_time_field_name = job_spec[2] 24 | end_time_field_name = job_spec[3] 25 | job_parser_class_name = job_spec[4] 26 | job_entry_filter = job_spec[5] 27 | feature_engineering = job_spec[6] if len(job_spec) > 6 else None 28 | metadata_parser_class_name = job_spec[7] if len(job_spec) > 7 else None 29 | history_file = os.path.join(workdir, constants.JOB_HISTORY_FILE_NAME + ".csv") 30 | 31 | history_data, history_file = job_parser.collect_jobs_history( 32 | os.path.join(workdir, constants.JOB_DATA_DIR), workdir, inputs, outputs, 33 | start_time_field_name, end_time_field_name, None, job_parser_class_name, 34 | job_entry_filter, feature_engineering, metadata_parser_class_name, 35 | workdir) 36 | 
return history_data, history_file 37 | 38 | def execute_build(self, workdir, js, config_path): 39 | 40 | history_data, history_file = self.preprocess(workdir, js) 41 | outputs = sorted(list(js[1])) 42 | output_path = os.path.join(workdir, constants.AM_OUTPUT_PATH_SUFFIX) 43 | 44 | auto_build_models(raw_data=history_data, 45 | config=get_estimators_config(config_path, num_jobs=-1), 46 | target_variables=outputs, 47 | output_path=output_path, 48 | leave_one_out_cv=None, 49 | feature_col=None, 50 | low_threshold=None, 51 | high_threshold=None, 52 | single_output_file=True, 53 | randomized_hpo=False, 54 | n_random_iter=False) 55 | 56 | def train(self, data_path: str, model_path: str, config_path: str, y_headers: list[str]): 57 | with tempfile.TemporaryDirectory() as workdir: 58 | print(workdir) 59 | 60 | datadir = os.path.join(workdir, "data") 61 | os.mkdir(datadir) 62 | shutil.copy2(data_path, datadir) 63 | 64 | # we only need the headers, so we read just a single row 65 | data = pandas.read_csv(data_path, nrows=1) 66 | # these 2 lines are for calc data version needed by manager module 67 | data_keys = ",".join(list(data.columns.values)) 68 | data_version = lookup_format_version(data_keys) 69 | # this is used for arise 70 | x_headers = list(set(data.columns.values) - set(y_headers)) 71 | 72 | # prepare the job spec file 73 | job_spec = {"job-metadata-inputs": {}, "job-metadata-outputs": y_headers} 74 | for h in x_headers: 75 | job_spec["job-metadata-inputs"][h] = 0 76 | 77 | js = job_parser.parse_job_spec(job_spec) 78 | 79 | # pre-emptively create output dir for arise 80 | output_path = os.path.join(workdir, constants.AM_OUTPUT_PATH_SUFFIX) 81 | utils.mkdirs(output_path) 82 | # normally, arise saves job spec into the model file 83 | # but, we skip it here 84 | # save the data version into a file in arise model 85 | with open(os.path.join(output_path, "estimator_data_version"), "w") as f: 86 | f.write(data_version) 87 | 88 | # save the model type also here 89 | with open(os.path.join(output_path, "model_type"), "w") as f: 90 | f.write("arise") 91 | 92 | self.execute_build(workdir, js, config_path) 93 | 94 | # copy the model to required destination 95 | shutil.copy2(os.path.join(workdir, "ARISE-auto-models.zip"), model_path) 96 | 97 | def get_columns(self): 98 | col_str = get_format_by_version(self.get_data_format()).X 99 | return col_str.split(",") 100 | 101 | def execute_predict(self, workdir, js, predict_config, model_file_name): 102 | return demo_predict( 103 | original_data=None, 104 | config=get_predict_config_from_dict(predict_config), 105 | estimator_path=model_file_name, 106 | feature_engineering=js[6], 107 | metadata_parser_class_name=js[7], 108 | metadata_path=model_file_name, 109 | output_path=os.path.join(workdir, constants.PRED_OUTPUT_PATH_SUFFIX)) 110 | 111 | def run(self, X, y): 112 | cols = self.get_columns() 113 | input_vars = [] 114 | for k, v in zip(cols, X): 115 | input_vars.append({k: v}) 116 | 117 | with tempfile.TemporaryDirectory() as workdir: 118 | # TODO: get the right value for the "greater is better" variable 119 | est_config = {"target_variable": y, "greater_is_better": True} 120 | predict_config = {"fixed_values": input_vars, 121 | "variable_values": [], 122 | "estimators": [est_config]} 123 | 124 | job_spec = {"job-metadata-inputs": {}, "job-metadata-outputs": [y]} 125 | for h in cols: 126 | job_spec["job-metadata-inputs"][h] = 0 127 | 128 | shutil.copy2(self.model_path, workdir) 129 | model_file_name = os.path.join(workdir, os.path.basename(self.model_path)) 130 
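            # the model zip is now in the scratch dir; build the ARISE job spec and run the prediction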
131 |             js = job_parser.parse_job_spec(job_spec)
132 |             self.execute_predict(workdir, js, predict_config, model_file_name)
133 |
134 |             # now read the result
135 |             res_file = os.path.join(workdir, "ARISE-predictions", "all-predictions.csv")
136 |             res = pandas.read_csv(res_file)
137 |
138 |             return res[y][0]
139 |
140 |     @lru_cache
141 |     def get_data_format(self):
142 |         with zipfile.ZipFile(self.model_path) as model_zip:
143 |             with model_zip.open("estimator_data_version", 'r') as edv:
144 |                 dv = edv.read().decode()
145 |
146 |         return dv
147 |
-------------------------------------------------------------------------------- /fm_training_estimator/regressor/arise/train.py: --------------------------------------------------------------------------------
1 | # Third Party
2 | import fire
3 |
4 | # Local
5 | from .arise import AriseRegressor
6 |
7 | def train(data_path: str, model_path: str, config_path: str, y_headers: list[str]):
8 |     """Train an AriseRegressor model that can be used by this estimator library.
9 |
10 |     Args:
11 |         data_path (str): the path to training data
12 |         model_path (str): the output path of the trained model. Must end with .zip.
13 |         config_path (str): the path to an ARISE training config file. See: https://github.com/arise-insights/arise-predictions/tree/main/config for examples.
14 |         y_headers (list[str]): list of column names to drop from data
15 |
16 |     """
17 |     model = AriseRegressor()
18 |
19 |     if not model_path.endswith(".zip"):
20 |         print("model_path must have a .zip extension.")
21 |         print("Refusing to continue!!")
22 |         return
23 |
24 |     print("Training model...")
25 |     model.train(data_path, model_path, config_path, y_headers)
26 |     print("...successfully wrote model to file: ", model_path)
27 |
28 |
29 | if __name__ == "__main__":
30 |     fire.Fire(train)
31 |
-------------------------------------------------------------------------------- /fm_training_estimator/regressor/dispatch.py: --------------------------------------------------------------------------------
1 | import zipfile
2 |
3 | from .xgboost import XGBoostRegressor
4 | from .linear import LinearRegressor
5 | from .arise import AriseRegressor
6 |
7 | def GetRegressor(model_path):
8 |     with zipfile.ZipFile(model_path, mode='r') as model_zip:
9 |         mt = model_zip.read("model_type").decode()
10 |
11 |     if mt == "linear":
12 |         return LinearRegressor(model_path)
13 |     elif mt == "xgboost":
14 |         return XGBoostRegressor(model_path)
15 |     elif mt == "arise":
16 |         return AriseRegressor(model_path)
17 |     else:
18 |         raise ValueError("Unknown model type found", mt)
19 |
-------------------------------------------------------------------------------- /fm_training_estimator/regressor/linear/__init__.py: --------------------------------------------------------------------------------
1 | # Local
2 | from .linear import LinearRegressor
3 |
4 | __all__ = ["LinearRegressor"]
5 |
-------------------------------------------------------------------------------- /fm_training_estimator/regressor/linear/linear.py: --------------------------------------------------------------------------------
1 | import os
2 | import zipfile
3 | import tempfile
4 |
5 | # Third Party
6 | import pandas
7 | from sklearn.ensemble import RandomForestRegressor
8 | from sklearn.preprocessing import OneHotEncoder
9 | import joblib
10 |
11 | # Local
12 | from ...data import lookup_format_version, get_format_by_version
13 |
14 |
15 | class LinearRegressor:
16 |     def __init__(self, model_path=None):
        # NOTE: despite the class name, this regressor is currently backed by a
        # RandomForestRegressor; its saved bundle is tagged model_type "linear",
        # which is what dispatch.py keys on
17 |         self.model = RandomForestRegressor()
18 |         self.cat_enc =
OneHotEncoder(sparse_output=False).set_output(transform="pandas") 19 | 20 | if model_path is not None: 21 | self.load(model_path) 22 | 23 | def load(self, model_path): 24 | with tempfile.TemporaryDirectory() as mdir: 25 | with zipfile.ZipFile(model_path) as model_zip: 26 | model_zip.extractall(mdir) 27 | 28 | path_m = os.path.join(mdir, "model.json") 29 | self.model = joblib.load(path_m) 30 | 31 | path_e = os.path.join(mdir, "cat_enc.json") 32 | self.cat_enc = joblib.load(path_e) 33 | 34 | def train(self, data_path: str, model_path: str, y_headers: list[str]): 35 | data = pandas.read_csv(data_path) 36 | 37 | # prepare the pure list of X for saving as metadata later 38 | X_cols = list(data.drop(columns=y_headers).columns) 39 | 40 | # obtain the data format metadata 41 | data_keys = ",".join(list(data.columns.values)) 42 | 43 | # encode category columns 44 | cat_feats = data.dtypes[data.dtypes=='object'].index.values.tolist() 45 | ecats = self.cat_enc.fit_transform(data[cat_feats]) 46 | 47 | data = data.drop(columns=cat_feats) 48 | data = pandas.concat([data, ecats], axis=1) 49 | 50 | X = data.drop(columns=y_headers) 51 | Y = data[y_headers] 52 | 53 | self.model.fit(X, Y) 54 | 55 | # save the feature names into the model 56 | self.model.metadata = {} 57 | self.model.metadata['feature_names'] = X_cols 58 | # save the data format 59 | self.model.metadata['data_format_version']=lookup_format_version(data_keys) 60 | 61 | with ( tempfile.NamedTemporaryFile(suffix='.json', mode='w') as buf_m, 62 | tempfile.NamedTemporaryFile(suffix='.json', mode='w') as buf_e, 63 | tempfile.NamedTemporaryFile(mode='w') as buf_mt, 64 | zipfile.ZipFile(model_path, mode='w') as model_zip 65 | ): 66 | 67 | # save model to tmp buffer 68 | joblib.dump(self.model, buf_m.name) 69 | # save encoder into tmp buffer 70 | joblib.dump(self.cat_enc, buf_e.name) 71 | # save model type to file 72 | with open(buf_mt.name, 'w') as f: 73 | f.write("linear") 74 | 75 | # now move the files into the zip file 76 | model_zip.write(buf_m.name, 'model.json') 77 | model_zip.write(buf_e.name, 'cat_enc.json') 78 | model_zip.write(buf_mt.name, 'model_type') 79 | 80 | def run(self, X, y): 81 | # convert input data array into form suitable to feed in 82 | 83 | # add column names 84 | data = pandas.DataFrame([X], columns=self.model.metadata['feature_names']) 85 | 86 | # encode category columns 87 | cat_feats = data.dtypes[data.dtypes=='object'].index.values.tolist() 88 | ecats = self.cat_enc.transform(data[cat_feats]) 89 | 90 | data = data.drop(columns=cat_feats) 91 | data = pandas.concat([data, ecats], axis=1) 92 | 93 | res = self.model.predict(data) 94 | col_idx = get_format_by_version(self.get_data_format()).Y.split(",").index(y) 95 | return res[0][col_idx] 96 | 97 | def get_data_format(self): 98 | return self.model.metadata["data_format_version"] 99 | -------------------------------------------------------------------------------- /fm_training_estimator/regressor/linear/train.py: -------------------------------------------------------------------------------- 1 | # Third Party 2 | import fire 3 | 4 | # Local 5 | from .linear import LinearRegressor 6 | 7 | 8 | def train(data_path: str, model_path: str, y_headers: list[str]): 9 | """Train a LinearRegressor model that can be used by this estimator library. 10 | 11 | Args: 12 | data_path (str): the path to training data 13 | model_path (str): the output path of trained model. Must end with .zip. 
14 | y_headers (list[str]): list of column names to drop from data 15 | 16 | """ 17 | model = LinearRegressor() 18 | 19 | if not model_path.endswith(".zip"): 20 | print("model_path must be a zip extension.") 21 | print("Refusing to continue!!") 22 | return 23 | 24 | print("Training model...") 25 | model.train(data_path, model_path, y_headers) 26 | print("...successfully wrote model to file: ", model_path) 27 | 28 | 29 | if __name__ == "__main__": 30 | fire.Fire(train) 31 | -------------------------------------------------------------------------------- /fm_training_estimator/regressor/lookup/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .lookup import LookupRegressor 3 | -------------------------------------------------------------------------------- /fm_training_estimator/regressor/lookup/lookup.py: -------------------------------------------------------------------------------- 1 | # Third Party 2 | import pandas 3 | 4 | # Local 5 | from ...data import lookup_format_version 6 | 7 | 8 | class LookupRegressor: 9 | def __init__(self, data_path=None): 10 | self.data = None 11 | 12 | if data_path is not None: 13 | self.load(data_path) 14 | 15 | def load(self, data_path): 16 | self.data = pandas.read_csv(data_path) 17 | 18 | def get_data_format(self): 19 | keys = ",".join(list(self.data.columns.values)) 20 | return lookup_format_version(keys) 21 | 22 | def run(self, X: dict): 23 | query = "" 24 | for key, val in X.items(): 25 | if isinstance(val, str): 26 | query += f' and {key} == "{val}"' 27 | else: 28 | query += f" and {key} == {val}" 29 | query = query[5:] 30 | 31 | res = self.data.query(query) 32 | res = res.drop(columns=X.keys()) 33 | 34 | return res 35 | -------------------------------------------------------------------------------- /fm_training_estimator/regressor/lookup/test_lookup.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | from pathlib import Path 3 | 4 | # Local 5 | from .lookup import LookupRegressor 6 | 7 | test_data1 = (Path(__file__).parent / "../test_data/data1.csv").as_posix() 8 | test_data2 = (Path(__file__).parent / "../test_data/data2.csv").as_posix() 9 | 10 | 11 | def test_lookup(): 12 | reg = LookupRegressor() 13 | 14 | reg.load(test_data1) 15 | 16 | res = reg.run( 17 | { 18 | "model_name": "mercury-12b", 19 | "gpu_model": "X100", 20 | "number_gpus": 2, 21 | "batch_size": 4, 22 | "seq_len": 512, 23 | } 24 | ) 25 | 26 | assert res.shape[0] == 1 27 | assert res["tokens_per_second"][0] == 500 28 | 29 | # should return multiple entries 30 | res = reg.run( 31 | { 32 | "model_name": "mercury-12b", 33 | "gpu_model": "X100", 34 | "number_gpus": 2, 35 | "batch_size": 4, 36 | } 37 | ) 38 | 39 | assert res.shape[0] == 3 40 | 41 | reg.load(test_data2) 42 | 43 | res = reg.run( 44 | { 45 | "model_name": "ibm-granite/granite-7b-base", 46 | "number_gpus": 2, 47 | "batch_size": 4, 48 | "seq_len": 512, 49 | } 50 | ) 51 | 52 | assert res.shape[0] == 1 53 | assert res[0:1]["tokens_per_second"].item() == 500 54 | 55 | res = reg.run( 56 | { 57 | "model_name": "ibm-granite/granite-7b-base", 58 | "number_gpus": 2, 59 | "batch_size": 4, 60 | "seq_len": 1024, 61 | } 62 | ) 63 | 64 | assert res.shape[0] == 1 65 | assert res[0:1]["tokens_per_second"].item() == 1000 66 | -------------------------------------------------------------------------------- /fm_training_estimator/regressor/test_data/data1.csv: 
--------------------------------------------------------------------------------
1 | model_name,gpu_model,number_gpus,batch_size,seq_len,tokens_per_second
2 | mercury-12b,X100,2,4,512,500
3 | mercury-12b,X100,2,4,1024,1000
4 | mercury-12b,X100,2,4,4096,4000
5 | mercury-12b,X100,2,8,512,1000
6 | mercury-12b,X100,2,8,1024,2000
7 | mercury-12b,X100,2,8,4096,8000
8 | pluto-13b,X100,2,4,512,500
9 | pluto-13b,X100,2,4,4096,4000
10 |
-------------------------------------------------------------------------------- /fm_training_estimator/regressor/test_data/data2.csv: --------------------------------------------------------------------------------
1 | model_name,number_gpus,batch_size,seq_len,tokens_per_second,memory,memory_act
2 | ibm-granite/granite-7b-base,2,4,512,500,20,10
3 | ibm-granite/granite-7b-base,2,4,1024,1000,20,10
4 | ibm-granite/granite-7b-base,2,8,512,500,20,10
5 | ibm-granite/granite-7b-base,2,16,512,500,20,10
6 | ibm-granite/granite-7b-base,4,4,1024,500,20,10
7 | ibm-granite/granite-7b-base,4,8,1024,500,20,10
8 | ibm-granite/granite-7b-base,4,16,1024,500,20,10
9 |
-------------------------------------------------------------------------------- /fm_training_estimator/regressor/test_data/data3.csv: --------------------------------------------------------------------------------
1 | model_arch,model_hidden_size,model_intermediate_size,model_num_attn_heads,model_num_hidden_layers,model_num_key_value_heads,number_gpus,batch_size,seq_len,tokens_per_second,memory,memory_act
2 | LlamaForCausalLM,2560,10240,32,32,32,4,16,1024,500,20,10
3 | LlamaForCausalLM,4096,11008,32,32,32,4,16,1024,500,20,10
4 |
5 |
-------------------------------------------------------------------------------- /fm_training_estimator/regressor/xgboost/README.md: --------------------------------------------------------------------------------
1 | # XGBoost Regressor
2 |
3 | Train the regressor:
4 | ```
5 | python -m fm_training_estimator.regressor.xgboost.train
6 | ```
7 |
8 | Here is an example - run from the top level folder:
9 | ```
10 | python -m fm_training_estimator.regressor.xgboost.train ./fm_training_estimator/regressor/test_data/data1.csv ./test.model.zip ["tokens_per_second"]
11 | ```
12 |
13 | This command will fail if the passed-in y fields are not found in the input data.
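
Once trained, a model can also be loaded and queried from Python, the same way the hybrid estimators use it. Here is a minimal sketch (the model path is hypothetical), assuming a model trained on `test_data/data2.csv` with all three y columns, and using the `run(X, y)` signature defined in `xgboost.py`:
```
from fm_training_estimator.regressor import XGBoostRegressor

# load a previously trained zip bundle
reg = XGBoostRegressor("./test.model.zip")

# feature values follow the X column order of the training data
tps = reg.run(["ibm-granite/granite-7b-base", 2, 4, 512], "tokens_per_second")
```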
14 |
-------------------------------------------------------------------------------- /fm_training_estimator/regressor/xgboost/__init__.py: --------------------------------------------------------------------------------
1 | # Local
2 | from .xgboost import XGBoostRegressor
3 |
4 | __all__ = ["XGBoostRegressor"]
5 |
-------------------------------------------------------------------------------- /fm_training_estimator/regressor/xgboost/test_reg.py: --------------------------------------------------------------------------------
1 | # Standard
2 | from pathlib import Path
3 |
4 | # Local
5 | from .xgboost import XGBoostRegressor
6 |
7 | test_data1 = (Path(__file__).parent / "../test_data/data1.csv").as_posix()
8 | test_data2 = (Path(__file__).parent / "../test_data/data2.csv").as_posix()
9 |
10 |
11 | def test_reg_lifecycle(tmp_path):
12 |     model_path = tmp_path / "test.model.json"
13 |
14 |     reg = XGBoostRegressor()
15 |
16 |     # train the model - and use it directly
17 |     # includes saving to file
18 |     reg.train(test_data1, model_path, ["tokens_per_second"])
19 |     out = reg.run(["mercury-12b", "X100", 2, 4, 512])
20 |     assert int(out[0]) < 1000
21 |
22 |     # load the model from file
23 |     reg1 = XGBoostRegressor()
24 |     reg1.load(model_path)
25 |     out1 = reg1.run(["mercury-12b", "X100", 2, 4, 512])
26 |     assert int(out[0]) == int(out1[0])
27 |
28 |
29 | def test_reg_multi(tmp_path):
30 |     model_path = tmp_path / "test2.model.json"
31 |
32 |     reg = XGBoostRegressor()
33 |
34 |     # train the model - and use it directly
35 |     # includes saving to file
36 |     reg.train(test_data2, model_path, ["tokens_per_second", "memory", "memory_act"])
37 |     out = reg.run(["ibm-granite/granite-7b-base", 2, 4, 512])
38 |
39 |     assert len(out) == 1
40 |     assert len(out[0]) == 3
41 |
-------------------------------------------------------------------------------- /fm_training_estimator/regressor/xgboost/train.py: --------------------------------------------------------------------------------
1 | # Third Party
2 | import fire
3 |
4 | # Local
5 | from .xgboost import XGBoostRegressor
6 |
7 |
8 | def train(data_path: str, model_path: str, y_headers: list[str]):
9 |     """Train an XGBoostRegressor model that can be used by this estimator library.
10 |
11 |     Args:
12 |         data_path (str): the path to training data
13 |         model_path (str): the output path of the trained model. Must end with .zip.
14 |         y_headers (list[str]): list of column names to drop from data
15 |
16 |     """
17 |     model = XGBoostRegressor()
18 |
19 |     if not model_path.endswith(".zip"):
20 |         print("model_path must have a .zip extension.")
21 |         print("Refusing to continue!!")
22 |         return
23 |
24 |     print("Training model...")
25 |     model.train(data_path, model_path, y_headers)
26 |     print("...successfully wrote model to file: ", model_path)
27 |
28 |
29 | if __name__ == "__main__":
30 |     fire.Fire(train)
31 |
-------------------------------------------------------------------------------- /fm_training_estimator/regressor/xgboost/xgboost.py: --------------------------------------------------------------------------------
1 | import os
2 | import zipfile
3 | import tempfile
4 |
5 | # Third Party
6 | from xgboost import XGBRegressor
7 | import pandas
8 | from sklearn.preprocessing import OrdinalEncoder
9 | import joblib
10 |
11 | # Local
12 | from ...data import lookup_format_version, get_format_by_version
13 |
14 |
15 | class XGBoostRegressor:
16 |     def __init__(self, model_path=None):
17 |         self.model = XGBRegressor(
18 |             n_estimators=400,
19 |             max_depth=7,
20 |             eta=0.1,
21 |             subsample=0.7,
22 |             colsample_bytree=0.8,
23 |             enable_categorical=True,
24 |         )
25 |         self.cat_enc = OrdinalEncoder()
26 |
27 |         if model_path is not None:
28 |             self.load(model_path)
29 |
30 |
31 |     def load(self, model_path):
32 |         with tempfile.TemporaryDirectory() as mdir:
33 |             with zipfile.ZipFile(model_path) as model_zip:
34 |                 model_zip.extractall(mdir)
35 |
36 |             path_m = os.path.join(mdir, "model.json")
37 |             self.model.load_model(path_m)
38 |
39 |             path_e = os.path.join(mdir, "cat_enc.json")
40 |             self.cat_enc = joblib.load(path_e)
41 |
42 |     def train(self, data_path: str, model_path: str, y_headers: list[str]):
43 |         data = pandas.read_csv(data_path)
44 |
45 |         # obtain the data format metadata
46 |         data_keys = ",".join(list(data.columns.values))
47 |
48 |         # ordinal encode all "object" type columns, which are actually categories
49 |         cat_feats = data.dtypes[data.dtypes=='object'].index.values.tolist()
50 |         data[cat_feats] = self.cat_enc.fit_transform(data[cat_feats])
51 |
52 |         # now mark these as categorical feats
53 |         for cf in cat_feats:
54 |             data[cf] = data[cf].astype("category")
55 |
56 |         X = data.drop(columns=y_headers)
57 |         Y = data[y_headers]
58 |
59 |         self.model.fit(X, Y)
60 |
61 |         # save the feature names into the model
62 |         self.model.get_booster().feature_names = list(X.columns)
63 |         # save the data format
64 |         self.model.get_booster().set_attr(
65 |             data_format_version=lookup_format_version(data_keys)
66 |         )
67 |
68 |         with ( tempfile.NamedTemporaryFile(suffix='.json', mode='w') as buf_m,
69 |                tempfile.NamedTemporaryFile(suffix='.json', mode='w') as buf_e,
70 |                tempfile.NamedTemporaryFile(mode='w') as buf_mt,
71 |                zipfile.ZipFile(model_path, mode='w') as model_zip
72 |             ):
73 |
74 |             # save model to tmp buffer
75 |             self.model.save_model(buf_m.name)
76 |             # save encoder into tmp buffer
77 |             joblib.dump(self.cat_enc, buf_e.name)
78 |             # save model type to file
79 |             with open(buf_mt.name, 'w') as f:
80 |                 f.write("xgboost")
81 |
82 |             # now move the files into the zip file
83 |             model_zip.write(buf_m.name, 'model.json')
84 |             model_zip.write(buf_e.name, 'cat_enc.json')
85 |             model_zip.write(buf_mt.name, 'model_type')
86 |
87 |
88 |     def run(self, X, y):
89 |         # convert input data array into form suitable to feed in
90 |
91 |         # add column names
92 |         data = pandas.DataFrame([X], columns=self.model.get_booster().feature_names)
93 |
94 |         # encode category columns
95 |         cat_feats = data.dtypes[data.dtypes=='object'].index.values.tolist()
96 |         data[cat_feats] = self.cat_enc.transform(data[cat_feats])
97 |
98 |         # now mark these as categorical feats
99 |         for cf in cat_feats:
100 |             data[cf] = data[cf].astype("category")
101 |
102 |         res = self.model.predict(data)
103 |         col_idx = get_format_by_version(self.get_data_format()).Y.split(",").index(y)
104 |         return res[0][col_idx]
105 |
106 |     def get_data_format(self):
107 |         return self.model.get_booster().attr("data_format_version")
108 |
-------------------------------------------------------------------------------- /fm_training_estimator/sdk/README.md: --------------------------------------------------------------------------------
1 | # SDK
2 |
3 | To use the estimator directly, refer to the `../ui` folder.
4 |
5 | This SDK is meant to be used from other Python programs to get estimates. Refer to the examples in the `examples/` folder to learn more.
-------------------------------------------------------------------------------- /fm_training_estimator/sdk/__init__.py: --------------------------------------------------------------------------------
1 | # Local
2 | from .sdk import estimate_cost, estimate_memory, estimate_time, estimate_tokens
3 |
-------------------------------------------------------------------------------- /fm_training_estimator/sdk/examples/ex1.py: --------------------------------------------------------------------------------
1 | # Standard
2 | import os
3 |
4 | # First Party
5 | from fm_training_estimator.config.arguments import (
6 |     DataArguments,
7 |     EstimateInput,
8 |     EstimatorMetadata,
9 |     FMArguments,
10 |     HFTrainingArguments,
11 |     InfraArguments,
12 |     JobConfig,
13 | )
14 | from fm_training_estimator.sdk import (
15 |     estimate_cost,
16 |     estimate_memory,
17 |     estimate_time,
18 |     estimate_tokens,
19 | )
20 |
21 | workdir_path = os.path.join(os.path.abspath(os.curdir), "workdir")
22 |
23 | model_path = os.path.join(workdir_path, "model.json")
24 | lookup_data_path = os.path.join(workdir_path, "data.csv")
25 |
26 | estimator_metadata = EstimatorMetadata(base_data_path=lookup_data_path)
27 |
28 | fm = FMArguments(
29 |     base_model_path="ibm-granite/granite-7b-base",
30 |     torch_dtype="bfloat16",
31 |     block_size=1024,
32 | )
33 | hf_training = HFTrainingArguments(
34 |     per_device_train_batch_size=1, gradient_checkpointing=False
35 | )
36 | data = DataArguments(dataset="imdb", te_approach=0)
37 | infra = InfraArguments(numGpusPerPod=1)
38 | job_conf = JobConfig(hf_training, fm, data, infra)
39 | est_input = EstimateInput(estimator_metadata=estimator_metadata, job_configs=[job_conf])
40 |
41 | print("Estimating Memory:....")
42 |
43 | print("With only theory: ", estimate_memory(est_input))
44 | print("With reg model: ", estimate_memory(est_input, model_path))
45 |
46 | hf_training.fsdp = "full_shard"
47 |
48 | print("Using fsdp full shard")
49 | print("With only theory: ", estimate_memory(est_input))
50 | # print("With reg model: ", estimate_memory(est_input, model_path))
51 |
52 |
53 | print("Estimating Time:....")
54 | print("With only theory: ", estimate_time(est_input))
55 | # print("With reg model: ", estimate_time(est_input, model_path))
56 |
57 | print("Estimating Tokens:....")
58 | print("With only theory: ", estimate_tokens(est_input))
59 | # print("With reg model: ", estimate_tokens(est_input, model_path))
-------------------------------------------------------------------------------- /fm_training_estimator/sdk/sdk.py:
--------------------------------------------------------------------------------
1 | # First Party
2 | from fm_training_estimator.config.arguments import (
3 |     CostEstimate,
4 |     EstimateInput,
5 |     JobConfig,
6 |     MemoryEstimate,
7 |     TimeEstimate,
8 |     TokensEstimate,
9 | )
10 | from fm_training_estimator.memory.hybrid.hybrid import HybridEstimator
11 | from fm_training_estimator.memory.lora.hybrid import HybridLoraEstimator
12 | from fm_training_estimator.memory.qlora.hybrid import HybridQLoraEstimator
13 | from fm_training_estimator.throughput.hybrid.hybrid import HybridSpeedEstimator
14 | from fm_training_estimator.time import get_total_time
15 | from fm_training_estimator.tokens.te0.te0 import TokenEstimator0
from fm_training_estimator.tokens.te2.te2 import TokenEstimator2
16 |
17 | # Local
18 | from ..config import is_fsdp
19 | from ..utils import fmt_size, logger
20 |
21 |
22 | def _get_hybrid_estimator(
23 |     conf: JobConfig, model_path: str = None, lookup_data_path: str = None
24 | ):
25 |     if conf.fm.technique == "lora":
26 |         return HybridLoraEstimator(
27 |             conf.fm,
28 |             conf.hf_training,
29 |             conf.infra,
30 |             conf.peft_lora,
31 |             lookup_data_path,
32 |             model_path,
33 |         )
34 |     elif conf.fm.technique == "qlora":
35 |         return HybridQLoraEstimator(
36 |             conf.fm,
37 |             conf.hf_training,
38 |             conf.infra,
39 |             conf.peft_lora,
40 |             conf.peft_qlora,
41 |             None,
42 |             model_path,
43 |         )
44 |     else:
45 |         return HybridEstimator(
46 |             conf.fm, conf.hf_training, conf.infra, lookup_data_path, model_path
47 |         )
48 |
49 | def _update_seq_width(
50 |     conf: JobConfig
51 | ) -> JobConfig:
52 |     """
53 |     Update the seq width based on the input dataset characteristics.
54 |
55 |     This is only needed for memory and should not impact tps/tokens since those
56 |     functions anyway operate on the input dataset.
57 |     """
58 |
59 |     token_est = None
60 |     if conf.data.te_approach == 0:
61 |         token_est = TokenEstimator0(conf.data)
62 |     if conf.data.te_approach == 2:
63 |         token_est = TokenEstimator2(conf.data)
64 |
65 |     if token_est is not None:
66 |         data_max_width = token_est.get_max_sample_length()
67 |         if data_max_width < conf.fm.block_size:
68 |             conf.fm.block_size = data_max_width
69 |
70 |     return conf
71 |
72 | def estimate_memory(
73 |     estimate_input: EstimateInput, model_path: str = None
74 | ) -> MemoryEstimate:
75 |     """Estimate memory needed for training. This method uses the hybrid model by default.
76 |
77 |     Args:
78 |         estimate_input (fm_training_estimator.config.arguments.EstimateInput): the input for this estimation
79 |             This input includes training job configs and optionally, metadata about this estimate run.
80 |         model_path (str, optional): path to a trained regression model for the estimator to use for this run.
81 |
82 |     Returns:
83 |         fm_training_estimator.config.arguments.MemoryEstimate: the memory estimate of this run.
84 |
85 |     """
86 |
87 |     if estimate_input.job_configs is None or len(estimate_input.job_configs) == 0:
88 |         raise ValueError("Did not receive a training job config")
89 |
90 |     # Only going to process first job_config for now
91 |     job_config = estimate_input.job_configs[0]
92 |
93 |     # Update expected max width based on data
94 |     job_config = _update_seq_width(job_config)
95 |
    lookup_data_path = None  # default when no estimator metadata is provided
96 |     if estimate_input.estimator_metadata:
97 |         lookup_data_path = estimate_input.estimator_metadata.base_data_path
98 |         if lookup_data_path is None:
99 |             logger.warning(
100 |                 "SDK - No lookup data path given. Set it via estimator_metadata.base_data_path in input json. Proceeding with estimator with limited lookup ability."
101 |             )
102 |
103 |     est = _get_hybrid_estimator(job_config, model_path, lookup_data_path)
104 |
105 |     total_mem_estimate = fmt_size(est.get_total_mem_estimate())
106 |     activation_memory = fmt_size(est.calculate_activation_memory())
107 |     gradient_memory = fmt_size(est.calculate_gradient_memory())
108 |     model_memory = fmt_size(est.calculate_model_memory())
109 |     optimizer_memory = fmt_size(est.calculate_optimizer_memory())
110 |
111 |     num_gpus = job_config.infra.numGpusPerPod
112 |
113 |     if num_gpus == 0:
114 |         if job_config.fm.technique == "full" and is_fsdp(job_config.hf_training):
115 |             num_gpus = est.fsdp_est.get_number_of_gpus()
116 |         elif job_config.fm.technique == "lora" or job_config.fm.technique == "qlora":
117 |             num_gpus = est.num_gpus
118 |         else:
119 |             num_gpus = 1
120 |
121 |     job_config.infra.numGpusPerPod = num_gpus
122 |
123 |     # No suitable configuration found
124 |     if num_gpus == -1:
125 |         raise ValueError("Input configuration is infeasible!")
126 |
127 |     return MemoryEstimate(
128 |         total_mem_estimate,
129 |         activation_memory,
130 |         gradient_memory,
131 |         model_memory,
132 |         optimizer_memory,
133 |         num_gpus,
134 |     )
135 |
136 |
137 | def _estimate_tokens_and_time(
138 |     conf: JobConfig,
139 |     model_path: str = None,
140 |     lookup_data_path: str = None,
141 | ) -> tuple[float, float]:
142 |     token_est = None
143 |     if conf.data.te_approach == 0:
144 |         token_est = TokenEstimator0(conf.data)
145 |     if conf.data.te_approach == 2:
146 |         token_est = TokenEstimator2(conf.data)
147 |
148 |     speed_est = HybridSpeedEstimator(
149 |         conf.fm, conf.hf_training, conf.infra, lookup_data_path, model_path
150 |     )
151 |
152 |     estimated_tps = speed_est.get_tps()
153 |     if estimated_tps is not None:
154 |         tps = float(estimated_tps)
155 |         logger.info("SDK - Initial estimated tps is %f", tps)
156 |     else:
157 |         logger.info("SDK - Could not calculate tps initially, defaulting to 1.")
158 |         tps = 1
159 |
160 |     if token_est is not None:
161 |         tokens_per_sample = int(
162 |             token_est.get_estimated_batch_width(
163 |                 conf.hf_training.per_device_train_batch_size
164 |             )
165 |         )
166 |         total_tokens = int(token_est.get_total_tokens())
167 |
168 |         # get the updated tps for this estimated token width
169 |         estimated_tps = speed_est.get_tps(tokens_per_sample)
170 |         if estimated_tps is not None:
171 |             tps = float(estimated_tps)
172 |             logger.info("SDK - Updated estimated tps after token width is %f", tps)
173 |         else:
174 |             logger.info(
175 |                 "SDK - Could not calculate tps after token width, defaulting to 1."
176 |             )
177 |             tps = 1
178 |
179 |         # calculate full time here
180 |         time = get_total_time(
181 |             conf.hf_training, conf.infra, token_est, tps, total_tokens
182 |         )
183 |     else:
184 |         time = (0, 0)
185 |         logger.info(
186 |             "SDK - Could not get total tokens to calculate time, setting time to 0."
187 |         )
188 |     return (tps, time)
189 |
190 |
191 | def estimate_time(
192 |     estimate_input: EstimateInput, model_path: str = None
193 | ) -> TimeEstimate:
194 |     """Estimate time needed for training. This method uses the hybrid model by default.
195 |
196 |     Args:
197 |         estimate_input (fm_training_estimator.config.arguments.EstimateInput): the input for this estimation
198 |             This input includes training job configs and optionally, metadata about this estimate run.
199 |         model_path (str, optional): path to a trained regression model for the estimator to use for this run.
200 |
201 |     Returns:
202 |         fm_training_estimator.config.arguments.TimeEstimate: the time estimate of this run.
203 |
204 |     """
205 |     if estimate_input.job_configs is None or len(estimate_input.job_configs) == 0:
206 |         raise ValueError("Did not receive a training job config")
207 |
208 |     # Only going to process first job_config for now
209 |     job_config = estimate_input.job_configs[0]
210 |
    lookup_data_path = None
211 |     if estimate_input.estimator_metadata:
212 |         lookup_data_path = estimate_input.estimator_metadata.base_data_path
213 |         if lookup_data_path is None:
214 |             logger.warning(
215 |                 "SDK - No lookup data path given. Set it via estimator_metadata.base_data_path in input json. Proceeding with estimator with limited lookup ability."
216 |             )
217 |
218 |     _, (time, train_time) = _estimate_tokens_and_time(
219 |         job_config, model_path, lookup_data_path
220 |     )
221 |
222 |     return TimeEstimate(time, train_time)
223 |
224 |
225 | def estimate_tokens(
226 |     estimate_input: EstimateInput, model_path: str = None
227 | ) -> TokensEstimate:
228 |     """Estimate token throughput for a training job. This method uses the hybrid model by default.
229 |
230 |     Args:
231 |         estimate_input (fm_training_estimator.config.arguments.EstimateInput): the input for this estimation
232 |             This input includes training job configs and optionally, metadata about this estimate run.
233 |         model_path (str, optional): path to a trained regression model for the estimator to use for this run.
234 |
235 |     Returns:
236 |         fm_training_estimator.config.arguments.TokensEstimate: the tokens throughput estimate of this run.
237 |
238 |     """
239 |     if estimate_input.job_configs is None or len(estimate_input.job_configs) == 0:
240 |         raise ValueError("Did not receive a training job config")
241 |
242 |     # Only going to process first job_config for now
243 |     job_config = estimate_input.job_configs[0]
244 |
    lookup_data_path = None
245 |     if estimate_input.estimator_metadata:
246 |         lookup_data_path = estimate_input.estimator_metadata.base_data_path
247 |         if lookup_data_path is None:
248 |             logger.warning(
249 |                 "SDK - No lookup data path given. Set it via estimator_metadata.base_data_path in input json. Proceeding with estimator with limited lookup ability."
250 |             )
251 |
252 |     tps, _ = _estimate_tokens_and_time(
253 |         job_config, model_path, lookup_data_path
254 |     )
255 |
256 |     return TokensEstimate(tps)
257 |
258 |
259 | def estimate_cost(
260 |     estimate_input: EstimateInput, model_path: str = None
261 | ) -> CostEstimate:
262 |     """Estimate cost for a training job. This method uses the hybrid model by default. (Not yet supported)
263 |
264 |     Args:
265 |         estimate_input (fm_training_estimator.config.arguments.EstimateInput): the input for this estimation
266 |             This input includes training job configs and optionally, metadata about this estimate run.
267 |         model_path (str, optional): path to a trained regression model for the estimator to use for this run.
268 |
269 |     Returns:
270 |         fm_training_estimator.config.arguments.CostEstimate: the cost estimate of this run.
271 | 272 | """ 273 | raise NotImplementedError("Not supported in this version.") 274 | -------------------------------------------------------------------------------- /fm_training_estimator/throughput/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .hybrid import HybridSpeedEstimator 3 | from .mock import MockSpeedEstimator 4 | -------------------------------------------------------------------------------- /fm_training_estimator/throughput/hybrid/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .hybrid import HybridSpeedEstimator 3 | -------------------------------------------------------------------------------- /fm_training_estimator/throughput/hybrid/hybrid.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from ...config import FMArguments, HFTrainingArguments, InfraArguments 3 | from ...data import format_query 4 | from ...regressor import LookupRegressor, GetRegressor 5 | from ...utils import logger 6 | 7 | 8 | class HybridSpeedEstimator: 9 | def __init__( 10 | self, 11 | fm_args: FMArguments, 12 | train_args: HFTrainingArguments, 13 | infra_args: InfraArguments, 14 | lookup_data_path, 15 | model_path, 16 | ): 17 | 18 | self.fm = fm_args 19 | self.ta = train_args 20 | self.ia = infra_args 21 | self.lookup_est = None 22 | self.reg_est = None 23 | 24 | # Lookup based estimator 25 | if lookup_data_path is not None: 26 | self.lookup_est = LookupRegressor(lookup_data_path) 27 | 28 | # Model based estimator 29 | if model_path is not None: 30 | self.reg_est = GetRegressor(model_path) 31 | 32 | if lookup_data_path is None and model_path is None: 33 | raise RuntimeError("HybridSpeedEstimator not properly initialized") 34 | 35 | def check_lookup(self, seqlen): 36 | lookup_query = { 37 | "model_name": self.fm.base_model_path, 38 | "number_gpus": self.ia.numGpusPerPod, 39 | "batch_size": self.ta.per_device_train_batch_size, 40 | "seq_len": seqlen, 41 | "gpu_model": self.ia.gpuModel, 42 | "method": self.fm.technique, 43 | } 44 | 45 | lookup_query = format_query(lookup_query, self.lookup_est.get_data_format()) 46 | 47 | res = self.lookup_est.run(lookup_query) 48 | 49 | if res.empty: 50 | return None 51 | 52 | logger.debug(f"Throughput Hybrid - Lookup result: {res}") 53 | return res[0:1]["tokens_per_second"].item() 54 | 55 | def get_tps(self, seqlen=None): 56 | if seqlen is None: 57 | seqlen = self.fm.block_size 58 | 59 | res = None 60 | 61 | # attempt lookup 62 | if self.lookup_est is not None: 63 | res = self.check_lookup(seqlen) 64 | if res is not None: 65 | return res 66 | if self.reg_est is None: 67 | return res 68 | 69 | # attempt reg approach 70 | lookup_query = { 71 | "model_name": self.fm.base_model_path, 72 | "number_gpus": self.ia.numGpusPerPod, 73 | "batch_size": self.ta.per_device_train_batch_size, 74 | "seq_len": int(seqlen), 75 | "gpu_model": self.ia.gpuModel, 76 | "method": self.fm.technique, 77 | } 78 | params = format_query( 79 | lookup_query, self.reg_est.get_data_format(), only_values=True 80 | ) 81 | 82 | res = self.reg_est.run(params, "tokens_per_second") 83 | return res 84 | -------------------------------------------------------------------------------- /fm_training_estimator/throughput/hybrid/test_hybrid.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | from pathlib import Path 3 | 4 | # Third Party 5 | from pytest import raises 6 | 7 | # 
Local 8 | from ...config import parse 9 | from ...regressor import XGBoostRegressor 10 | from .hybrid import HybridSpeedEstimator 11 | 12 | test_data2 = (Path(__file__).parent / "../../regressor/test_data/data2.csv").as_posix() 13 | test_data3 = (Path(__file__).parent / "../../regressor/test_data/data3.csv").as_posix() 14 | 15 | 16 | def test_hybrid_empty(): 17 | fm, ta, ia, _, _, _ = parse({}) 18 | 19 | with raises(RuntimeError): 20 | _ = HybridSpeedEstimator(fm, ta, ia, None, None) 21 | 22 | 23 | def test_hybrid_lookup(): 24 | fm, ta, ia, _, _, _ = parse( 25 | { 26 | "base_model_path": "ibm-granite/granite-7b-base", 27 | "per_device_train_batch_size": 4, 28 | "block_size": 512, 29 | "numGpusPerPod": 2, 30 | } 31 | ) 32 | 33 | est = HybridSpeedEstimator(fm, ta, ia, test_data2, None) 34 | 35 | assert est.get_tps() == 500 36 | # test lookup approach 37 | assert est.get_tps(1024) == 1000 38 | 39 | 40 | def test_hybrid_reg(tmp_path): 41 | model_path = tmp_path / "test.model.json" 42 | reg = XGBoostRegressor() 43 | reg.train(test_data2, model_path, ["tokens_per_second", "memory", "memory_act"]) 44 | 45 | fm, ta, ia, _, _, _ = parse( 46 | { 47 | "base_model_path": "ibm-granite/granite-7b-base", 48 | "per_device_train_batch_size": 4, 49 | "block_size": 512, 50 | "numGpusPerPod": 4, 51 | } 52 | ) 53 | 54 | est = HybridSpeedEstimator(fm, ta, ia, test_data2, model_path) 55 | 56 | assert est.get_tps() > 300 57 | 58 | 59 | def test_hybrid_model_features(tmp_path): 60 | model_path = tmp_path / "test.model.json" 61 | reg = XGBoostRegressor() 62 | reg.train(test_data3, model_path, ["tokens_per_second", "memory", "memory_act"]) 63 | 64 | fm, ta, ia, _, _, _ = parse( 65 | { 66 | "base_model_path": "ibm-granite/granite-8b-code-base", 67 | "per_device_train_batch_size": 16, 68 | "block_size": 1024, 69 | "numGpusPerPod": 4, 70 | } 71 | ) 72 | 73 | est = HybridSpeedEstimator(fm, ta, ia, test_data3, model_path) 74 | 75 | assert est.get_tps() > 400 76 | -------------------------------------------------------------------------------- /fm_training_estimator/throughput/mock/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .mock import MockSpeedEstimator 3 | -------------------------------------------------------------------------------- /fm_training_estimator/throughput/mock/mock.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | import random 3 | import time 4 | 5 | # Local 6 | from ...config import FMArguments 7 | 8 | 9 | class MockSpeedEstimator: 10 | def __init__(self, fm_args: FMArguments, seed=None): 11 | self.fm = fm_args 12 | 13 | if seed is not None: 14 | self.seed = seed 15 | else: 16 | self.seed = time.time() 17 | 18 | def get_tps(self, seqlen=None): 19 | if seqlen is None: 20 | seqlen = self.fm.block_size 21 | random.seed(self.seed + seqlen) 22 | return random.randint(100, 10000) 23 | -------------------------------------------------------------------------------- /fm_training_estimator/throughput/mock/test_mock.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from ...config import parse 3 | from .mock import MockSpeedEstimator 4 | 5 | 6 | def test_mock_1(): 7 | fm, _, _, _, _, _ = parse({"block_size": 512}) 8 | est = MockSpeedEstimator(fm, seed=10) 9 | 10 | tps = est.get_tps() 11 | assert tps == 1355 12 | 13 | 14 | def test_mock_2(): 15 | fm, _, _, _, _, _ = parse({"block_size": 1024}) 16 | est = MockSpeedEstimator(fm, seed=10) 17 | 
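    # same seed as test_mock_1, but a different block size: get_tps() reseeds with
    # seed + seqlen, so the value changes with sequence length yet stays deterministic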
18 |     tps = est.get_tps()
19 |     assert tps == 719
20 |
-------------------------------------------------------------------------------- /fm_training_estimator/time/README.md: --------------------------------------------------------------------------------
1 | # Time
2 |
3 | Time taken for a training job consists of two main subcomponents:
4 |
5 | 1. Training time: actual time spent in the training process - forward pass, backward pass and so on.
6 | 2. Non-training time: other significant sources of time, such as model load and model save.
7 |
8 | ## Training time
9 |
10 | Training time is calculated in the estimator as a simple combination of two inputs:
11 |
12 | 1. `throughput`: the number of tokens per second achieved by the training script.
13 | 2. `tokens`: the number of tokens to be processed for the given dataset under the given conditions.
14 |
15 | Refer to the subcomponents in `../throughput` and `../tokens` for these calculations. Once we have both of these, a simple division gives us the training time - albeit for a single epoch. For example, 10 million tokens at 2,000 tokens per second is about 5,000 seconds per epoch.
16 |
17 | ## Non-training time
18 |
19 | This is made up of the following components.
20 |
21 | ### Model load
22 |
23 | In the beginning, a model must be loaded from disk, usually from files in PyTorch model formats or the Hugging Face SafeTensors format. The model may be fetched from the Hugging Face Hub, or may already be available on disk, either cached or in a local checkpoint format.
24 |
25 | ### Dataload time
26 |
27 | Time taken to load a dataset from files (typically json, jsonl or parquet) on disk.
28 |
29 | ### Checkpoint time
30 |
31 | Every k steps or l epochs, a checkpoint may be saved to disk. There is a lot of research on making this process faster.
32 |
-------------------------------------------------------------------------------- /fm_training_estimator/time/__init__.py: --------------------------------------------------------------------------------
1 | # Local
2 | from .time import get_total_time
3 |
-------------------------------------------------------------------------------- /fm_training_estimator/time/time.py: --------------------------------------------------------------------------------
1 | # Standard
2 | import logging
3 |
4 | # Local
5 | from ..config import HFTrainingArguments, InfraArguments
6 | from ..tokens import TokenEstimator
7 |
8 | MODEL_LOAD_TIME = 5 * 60
9 | CHECKPOINT_TIME = 60
10 |
11 |
12 | def get_total_time(
13 |     hf: HFTrainingArguments, ia: InfraArguments, te: TokenEstimator, tps, tokens
14 | ):
15 |     """
16 |     Returns a tuple of (time_total, time_train).
17 |     The first is the second plus the time taken for model loading/checkpoint saving etc.
18 |     """
19 |     train_time_per_epoch = tokens / tps
20 |
21 |     num_epochs = hf.num_train_epochs
22 |
23 |     # one checkpoint at the very end
24 |     num_checkpoints = 1
25 |     ss = hf.save_strategy
26 |     if ss == "epoch":
27 |         num_checkpoints += num_epochs
28 |     elif ss == "steps":
29 |         steps_in_epoch = te.get_num_samples() / (
30 |             hf.per_device_train_batch_size * ia.numGpusPerPod
31 |         )
32 |         num_checkpoints += num_epochs * steps_in_epoch / hf.save_steps
33 |     elif ss == "best":
34 |         logging.warning(
35 |             "Unable to guess number of checkpoints due to use of `best` saving strategy."
36 |         )
37 |
38 |     time_train = train_time_per_epoch * num_epochs
39 |     time_total = MODEL_LOAD_TIME + time_train + (num_checkpoints * CHECKPOINT_TIME)
40 |     return (time_total, time_train)
41 |
-------------------------------------------------------------------------------- /fm_training_estimator/tokens/README.md: --------------------------------------------------------------------------------
1 | # Tokens
2 |
3 | This module is meant to predict the number of tokens that will be processed in a training run. This is not directly the number of tokens in the data, for a few reasons:
4 |
5 | 1. Data is formatted into a template using various fields.
6 | 2. Data is then grouped into batches (of size 4, 8, etc) and padded into rectangular tensors. This can add a number of so-called "padding" tokens, which are not real data but are nevertheless processed during training at various stages.
7 | 3. etc
8 |
9 | ## Mechanism
10 |
11 | We have 2 mechanisms for token prediction:
12 |
13 | 1. TE0: Empirical sampling of data into batches.
14 | 2. TE2: Offline generation of statistical information and approximate calculations.
15 |
16 | ## TE0
17 |
18 | This is highly accurate, since it does exactly what the real training process would do, with the real data. But this can be slow, since the whole dataset has to be walked through.
19 |
20 | Use this technique if you have small data sizes, or publicly available datasets such as those on the HF Hub.
21 |
22 | ## TE2
23 |
24 | In this approach, we first derive some statistical information from the dataset in a one-time pass. This information is stored in a json file, called a TE2 Contract file.
25 |
26 | For prediction, this TE2 contract is provided as input, and an approximate calculation is done to estimate the real token sizes.
27 |
28 | Use this technique for large datasets and when privacy of the data is important.
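
To make this concrete, here is a sketch of what a contract might look like for a dataset with a single `text` field (the numbers are purely illustrative; the exact fields written out are defined in `te2.py`):
```
{
  "text": {
    "len": 1104,
    "total": 52000,
    "min": 4,
    "max": 210,
    "mean": 47.1,
    "std": 22.8,
    "bs1": 47.1,
    "bs2": 61.0,
    "bs4": 74.5,
    "bs8": 86.2,
    "bs16": 95.0
  }
}
```
The `bsN` entries record the mean width of the longest `1/N` fraction of samples; at prediction time, these anchor a per-field linear fit used to estimate padded batch widths.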
29 | -------------------------------------------------------------------------------- /fm_training_estimator/tokens/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .te import TokenEstimator 3 | from .te0 import TokenEstimator0 4 | from .te2 import TokenEstimator2 5 | -------------------------------------------------------------------------------- /fm_training_estimator/tokens/te.py: -------------------------------------------------------------------------------- 1 | class TokenEstimator: 2 | def get_total_tokens(self): 3 | pass 4 | 5 | def get_estimated_batch_width(self, batch_size: int): 6 | pass 7 | 8 | def get_num_samples(self): 9 | pass 10 | 11 | def get_max_sample_length(self): 12 | pass 13 | -------------------------------------------------------------------------------- /fm_training_estimator/tokens/te0/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .te0 import TokenEstimator0 3 | -------------------------------------------------------------------------------- /fm_training_estimator/tokens/te0/te0.py: -------------------------------------------------------------------------------- 1 | # Third Party 2 | from datasets import load_dataset 3 | from tqdm import tqdm 4 | import numpy as np 5 | 6 | # Local 7 | from ...config import DataArguments 8 | from ...utils import logger 9 | from ..te import TokenEstimator 10 | 11 | RUNS = 5 12 | SEED = 42 13 | 14 | np.random.seed(SEED) 15 | 16 | 17 | class TokenEstimator0(TokenEstimator): 18 | def __init__(self, da: DataArguments): 19 | if da.dataset is None: 20 | raise RuntimeError("Dataset argument has to be filled in for TE0!") 21 | 22 | if da.dataset.endswith(".json") or da.dataset.endswith(".jsonl"): 23 | logger.debug("Tokens TE0 - Parsing dataset as local json file") 24 | dataset = load_dataset("json", data_files={"train": da.dataset})["train"] 25 | else: 26 | dataset = load_dataset( 27 | da.dataset, 28 | name=da.dataset_config_name, 29 | split=da.dataset_split, 30 | trust_remote_code=da.trust_remote_code 31 | ) 32 | 33 | tokens = [] 34 | logger.info("Tokens TE0 - Loading data in dataset...") 35 | for item in tqdm(dataset): 36 | txt = da.dataset_text_field.format_map(item) 37 | tokens.append(int(len(txt) / 3.6)) 38 | 39 | self.tokens = tokens 40 | 41 | def get_total_tokens(self): 42 | return np.sum(self.tokens) 43 | 44 | def get_estimated_batch_width(self, batch_size, runs=RUNS): 45 | widths = [ 46 | self.get_estimated_batch_width_random_shuffle(batch_size) 47 | for i in range(runs) 48 | ] 49 | return np.mean(widths) 50 | 51 | def get_num_samples(self): 52 | return len(self.tokens) 53 | 54 | def get_estimated_batch_width_random_shuffle(self, bs): 55 | tokens = np.array(self.tokens) 56 | np.random.shuffle(tokens) 57 | if len(tokens) % bs != 0: 58 | tokens = np.concatenate( 59 | [tokens, np.zeros(bs - len(tokens) % bs)] 60 | ) # simulating drop_last=False 61 | return np.mean(np.max(np.split(tokens, len(tokens) / bs), axis=1)) 62 | 63 | def get_max_sample_length(self): 64 | return np.max(self.tokens) 65 | -------------------------------------------------------------------------------- /fm_training_estimator/tokens/te0/te_test1.jsonl: -------------------------------------------------------------------------------- 1 | {"text": "Mercury is the first planet from the Sun and the smallest in the Solar System"} 2 | {"text": "Venus is notable for having the densest atmosphere of the terrestrial planets, composed mostly 
of carbon dioxide with a thick, global sulfuric acid cloud cover."}
3 | {"text": "Earth is the third planet from the Sun and the only astronomical object known to harbor life. This is enabled by Earth being a water world, the only one in the Solar System sustaining liquid surface water."}
4 | {"text": "The surface of Mars is orange-red because it is covered in iron(III) oxide dust, giving it the nickname 'the Red Planet'. Mars is among the brightest objects in Earth's sky and its high-contrast albedo features have made it a common subject for telescope viewing."}
5 | {"text": "Jupiter is the fifth planet from the Sun and the largest in the Solar System. A gas giant, Jupiter's mass is more than two and a half times that of all the other planets in the Solar System combined and slightly less than one one-thousandth the mass of the Sun. Jupiter orbits the Sun at a distance of 5.20 AU (778.5 Gm) with an orbital period of 11.86 years. It is the third brightest natural object in the Earth's night sky after the Moon and Venus and has been observed since prehistoric times. Its name derives from Jupiter, the chief deity of ancient Roman religion. "}
6 | {"text": "Saturn is a gas giant with an average radius of about nine-and-a-half times that of Earth, prominently known for its rings."}
7 | {"text": "Uranus is the seventh planet from the Sun. It is a gaseous cyan-coloured ice giant. Most of the planet is made of water, ammonia, and methane in a supercritical phase of matter, which in astronomy is called 'ice' or volatiles."}
8 | {"text": "Neptune is the eighth and farthest known planet from the Sun. It is the fourth-largest planet in the Solar System by diameter, the third-most-massive planet, and the densest giant planet. It is 17 times the mass of Earth."}
9 | {"text": "Pluto is a dwarf planet in the Kuiper belt, a ring of bodies beyond the orbit of Neptune."}
10 |
-------------------------------------------------------------------------------- /fm_training_estimator/tokens/te0/test_te0.py: --------------------------------------------------------------------------------
1 | # Standard
2 | from pathlib import Path
3 |
4 | # Local
5 | from ...config import parse
6 | from .te0 import TokenEstimator0
7 |
8 | # trick to ensure running this with pytest works from root dir
9 | test_data = (Path(__file__).parent / "te_test1.jsonl").as_posix()
10 |
11 |
12 | def test_te_raw_hf_dataset():
13 |     _, _, _, da, _, _ = parse(
14 |         {
15 |             "base_model_path": "ibm-granite/granite-8b-code-base",
16 |             "gpu_memory_in_gb": 80,
17 |             "dataset": "super_glue",
18 |             "dataset_config_name": "axb",
19 |             "dataset_split": "test",
20 |             "dataset_text_field": "Input###:\n {sentence1}",
21 |         }
22 |     )
23 |
24 |     te = TokenEstimator0(da)
25 |
26 |     assert te.get_num_samples() == 1104, "Number of samples in data should match"
27 |     assert te.get_estimated_batch_width(4) < te.get_estimated_batch_width(
28 |         8
29 |     ), "Larger batches should have more padding"
30 |     assert (
31 |         abs(te.get_estimated_batch_width(1) - (te.get_total_tokens() / 1104)) < 1e-6
32 |     ), "Estimated batch width for BS=1 should be equal to mean of tokens"
33 |     assert te.get_estimated_batch_width(1104) == max(
34 |         te.tokens
35 |     ), "For batch size equal to dataset length, estimated batch width should be equal to max length of tokens"
36 |
37 |
38 | def test_te_raw_json():
39 |     _, _, _, da, _, _ = parse(
40 |         {
41 |             "base_model_path": "ibm-granite/granite-8b-code-base",
42 |             "gpu_memory_in_gb": 80,
43 |             "dataset": test_data,
44 |             "dataset_text_field": "{text}",
45 |         }
46
| ) 47 | 48 | te = TokenEstimator0(da) 49 | 50 | assert te.get_num_samples() == 9 51 | assert te.get_estimated_batch_width(1) < te.get_estimated_batch_width(2) 52 | -------------------------------------------------------------------------------- /fm_training_estimator/tokens/te2/README.md: -------------------------------------------------------------------------------- 1 | # TE2 2 | 3 | ## Contract Generation 4 | 5 | Examples: 6 | ``` 7 | python -m fm_training_estimator.tokens.te2.gen_contract --dataset imdb --output out1.contract.json 8 | ``` 9 | or 10 | ``` 11 | python -m fm_training_estimator.tokens.te2.gen_contract --dataset ./fm_training_estimator/tokens/te2/te_test1.jsonl --output out1.contract.json 12 | ``` 13 | 14 | This will output a single small contract file. This file should be later used with the estimator. 15 | -------------------------------------------------------------------------------- /fm_training_estimator/tokens/te2/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .te2 import TokenEstimator2 3 | -------------------------------------------------------------------------------- /fm_training_estimator/tokens/te2/gen_contract.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | # Third Party 4 | import fire 5 | 6 | # Local 7 | from .te2 import GenerateTokenEstimator2Contract 8 | 9 | def gen(dataset: str, output: str, ds_config_name: str = None, ds_split: str = "test", sample_percent: int = None): 10 | """ 11 | Inputs: 12 | dataset: the path to a json/jsonl file, or the name of HF dataset on the HF hub 13 | output: the path to output the contract file 14 | ds_config_name: For HF datasets, optional name of config to use 15 | ds_split: for HF datasets, the name of the split of the data to use 16 | sample_percent: an optional integer between (0-100], indicating what percent of the dataset we should sample. Default (if nothing specified), is no sampling, which means 100% of the data is used. 17 | """ 18 | 19 | print("Generating contract...") 20 | contract = GenerateTokenEstimator2Contract(dataset, ds_config_name, ds_split, sample_percent) 21 | 22 | with open(output, "w") as f: 23 | json.dump(contract, f) 24 | 25 | print("...successfully wrote contract to file: ", output) 26 | 27 | if __name__ == "__main__": 28 | fire.Fire(gen) 29 | -------------------------------------------------------------------------------- /fm_training_estimator/tokens/te2/te2.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | 4 | # Third Party 5 | from datasets import load_dataset 6 | from tqdm import tqdm 7 | import numpy as np 8 | from sklearn.linear_model import LinearRegression 9 | 10 | # Local 11 | from ...config import DataArguments 12 | from ...utils import logger 13 | from ..te import TokenEstimator 14 | 15 | 16 | def load_dataset_config_from_json(json_file_path): 17 | try: 18 | with open(json_file_path, "r") as file: 19 | config = json.load(file) 20 | print("Dataset configuration loaded successfully.") 21 | return config 22 | except FileNotFoundError: 23 | print(f"Error: The file {json_file_path} was not found.") 24 | return None 25 | except json.JSONDecodeError as e: 26 | print(f"Error: Failed to parse JSON. 
{e}") 27 | return None 28 | except Exception as e: 29 | print(f"An unexpected error occurred: {e}") 30 | return None 31 | 32 | 33 | class TokenEstimator2(TokenEstimator): 34 | def __init__(self, da: DataArguments): 35 | if da.dataset_config_file is None: 36 | raise RuntimeError("Dataset configuration file has to be uploaded for TE2!") 37 | 38 | if da.dataset_config_file.endswith(".json"): 39 | logger.info("Parsing dataset configuration as local json file") 40 | contracts = load_dataset_config_from_json(da.dataset_config_file) 41 | else: 42 | raise RuntimeError("Please upload dataset configuration in correct JSON format!") 43 | 44 | baseline, fields = self.process_sample_format(da.dataset_text_field) 45 | 46 | self.baseline = baseline 47 | 48 | self.contract = {} 49 | self.m = {} 50 | self.reg = {} 51 | 52 | # For each field found in the input format, extract info from contract 53 | for field in fields: 54 | contract = contracts[field] 55 | 56 | m = {} 57 | m[1] = contract["bs1"] 58 | batch_sizes = [2**i for i in range(1, 5) if 2**i <= contract["len"]] 59 | for bs in batch_sizes: 60 | m[bs] = contract[f"bs{bs}"] 61 | 62 | X = np.array([[i] for i in m.keys()]) 63 | y = np.array(list(m.values())) 64 | 65 | self.contract[field] = contract 66 | self.m[field] = m 67 | self.reg[field] = LinearRegression().fit(X, y) 68 | 69 | def process_sample_format(self, format_str): 70 | """ 71 | Convert an input format string, into the constant baseline part and the fields used from the dataset. 72 | 73 | The baseline part is the number of tokens used in the static string part of the format. 74 | The fields are simply a list of matches of words in {}. 75 | 76 | For example, input format string maybe: 77 | 'Below is a an instruction.... 78 | ### Instruction 79 | {instruction} 80 | ### Input: 81 | {input} 82 | ### Response:' 83 | 84 | In the original data, we have contract information about "instruction" and "input" stored. 85 | In this function, we need to extract out how many tokens make the static portion and 86 | what fields are left over. 87 | """ 88 | matches = re.findall(r'\{(.*?)\}', format_str) 89 | 90 | total = len(format_str) 91 | 92 | slot_len = 0 93 | for m in matches: 94 | # add 2 for the curly braces 95 | slot_len += 2 + len(matches) 96 | 97 | # number of tokens 98 | baseline = (total - slot_len) / 3.6 99 | 100 | return (total - slot_len, matches) 101 | 102 | def get_total_tokens(self): 103 | """ 104 | Since each entry is also formatted with the fmt_string, we need to add the static portions. 105 | """ 106 | total = 0 107 | num_samples = self.get_num_samples() 108 | 109 | # add all the common static tokens, one full set of baseline for each entey 110 | total += self.baseline * num_samples 111 | # now add the full set of tokens for fields that are present in here 112 | for con in self.contract.values(): 113 | total += con["total"] 114 | 115 | return total 116 | 117 | def get_estimated_batch_width(self, bs): 118 | """ 119 | Since multiple fields make up a single entry, we predict average size of each and 120 | also add the baseline width to it. 
121 | """ 122 | width = self.baseline 123 | 124 | for field in self.contract.keys(): 125 | m = self.m[field] 126 | reg = self.reg[field] 127 | if bs in m.keys(): 128 | width += m[bs] 129 | else: 130 | width += reg.predict([[bs]])[0] 131 | 132 | return width 133 | 134 | def get_num_samples(self): 135 | # length of all contracts will be same 136 | con = list(self.contract.values())[0] 137 | return con["len"] 138 | 139 | def get_max_sample_length(self): 140 | res = self.baseline 141 | # this is a very pessimistic view, and not the actual worst case 142 | for con in self.contract.values(): 143 | res += con["max"] 144 | 145 | return res 146 | 147 | 148 | # TODO: generate for all configs and splits 149 | def GenerateTokenEstimator2Contract(dataset, config_name=None, split=None, sample_percent=None): 150 | if dataset.endswith(".json") or dataset.endswith(".jsonl"): 151 | logger.info("Parsing dataset as local json file") 152 | dataset = load_dataset("json", data_files={"train": dataset})["train"] 153 | else: 154 | dataset = load_dataset(dataset, name=config_name, split=split) 155 | 156 | print("Loading data in dataset...") 157 | 158 | feat_tokens = {} 159 | # TODO: run sampling instead of going through it all 160 | num_items = len(dataset) 161 | if sample_percent != None: 162 | if sample_percent > 0 and sample_percent <= 100: 163 | num_items = int(num_items * sample_percent/100) 164 | 165 | # mark all string features to generate contracts for 166 | for feat, f_val in dataset.features.items(): 167 | if f_val.dtype == 'string': 168 | feat_tokens[feat] = [] 169 | 170 | seen_items = 0 171 | for item in tqdm(dataset): 172 | # loop over needed features 173 | for feat in feat_tokens.keys(): 174 | feat_tokens[feat].append(int(len(item[feat]) / 3.6)) 175 | 176 | seen_items += 1 177 | if seen_items >= num_items: 178 | break 179 | 180 | contracts = {} 181 | for feat in feat_tokens.keys(): 182 | tokens = np.sort(feat_tokens[feat])[::-1] 183 | 184 | contract = {} 185 | contract["len"] = len(tokens) 186 | contract["total"] = int(np.sum(tokens)) 187 | contract["min"] = int(np.min(tokens)) 188 | contract["max"] = int(np.max(tokens)) 189 | contract["mean"] = round(np.mean(tokens), 2) 190 | contract["std"] = round(np.std(tokens), 2) 191 | 192 | contract["bs1"] = contract["mean"] 193 | 194 | # for bs = 2, 4, 6, 8, 16 195 | batch_sizes = [2**i for i in range(1, 5) if 2**i <= contract["len"]] 196 | for bs in batch_sizes: 197 | contract[f"bs{bs}"] = np.mean(tokens[:int(len(tokens)/bs)]) 198 | 199 | contracts[feat] = contract 200 | 201 | # if we are in sampling mode, rescale the stats 202 | if num_items != len(dataset): 203 | contract["len"] = len(dataset) 204 | contract["total"] = int(contract["total"]*len(dataset)/num_items) 205 | 206 | return contracts 207 | -------------------------------------------------------------------------------- /fm_training_estimator/tokens/te2/te_test1.jsonl: -------------------------------------------------------------------------------- 1 | {"text": "Mercury is the first planet from the Sun and the smallest in the Solar System"} 2 | {"text": "Venus is notable for having the densest atmosphere of the terrestrial planets, composed mostly of carbon dioxide with a thick, global sulfuric acid cloud cover."} 3 | {"text": "Earth is the third planet from the Sun and the only astronomical object known to harbor life. 
This is enabled by Earth being a water world, the only one in the Solar System sustaining liquid surface water."} 4 | {"text": "The surface of Mars is orange-red because it is covered in iron(III) oxide dust, giving it the nickname 'the Red Planet'. Mars is among the brightest objects in Earth's sky and its high-contrast albedo features have made it a common subject for telescope viewing."} 5 | {"text": "Jupiter is the fifth planet from the Sun and the largest in the Solar System. A gas giant, Jupiter's mass is more than two and a half times that of all the other planets in the Solar System combined and slightly less than one one-thousandth the mass of the Sun. Jupiter orbits the Sun at a distance of 5.20 AU (778.5 Gm) with an orbital period of 11.86 years. It is the third brightest natural object in the Earth's night sky after the Moon and Venus and has been observed since prehistoric times. Its name derives from Jupiter, the chief deity of ancient Roman religion. "} 6 | {"text": "Saturn is a gas giant with an average radius of about nine-and-a-half times that of Earth, prominently known for its rings."} 7 | {"text": "Uranus is the seventh planet from the Sun. It is a gaseous cyan-coloured ice giant. Most of the planet is made of water, ammonia, and methane in a supercritical phase of matter, which in astronomy is called 'ice' or volatiles."} 8 | {"text": "Neptune is the eighth and farthest known planet from the Sun. It is the fourth-largest planet in the Solar System by diameter, the third-most-massive planet, and the densest giant planet. It is 17 times the mass of Earth."} 9 | {"text": "Pluto is a dwarf planet in the Kuiper belt, a ring of bodies beyond the orbit of Neptune."} 10 | -------------------------------------------------------------------------------- /fm_training_estimator/tokens/te2/test1.contract.json: -------------------------------------------------------------------------------- 1 | {"text": {"len": 9, "total": 533, "min": 21, "max": 158, "mean": 59.22, "std": 38.78, "bs1": 59.22, "bs2": 88.5, "bs4": 115.5, "bs8": 158.0}} -------------------------------------------------------------------------------- /fm_training_estimator/tokens/te2/test_axb.contract.json: -------------------------------------------------------------------------------- 1 | {"sentence1": {"len": 1104, "total": 29400, "min": 3, "max": 82, "mean": 26.63, "std": 15.51, "bs1": 26.63, "bs2": 38.96376811594203, "bs4": 48.56884057971015, "bs8": 56.07971014492754, "bs16": 62.231884057971016}, "sentence2": {"len": 1104, "total": 29400, "min": 3, "max": 82, "mean": 26.63, "std": 15.51, "bs1": 26.63, "bs2": 38.96376811594203, "bs4": 48.56884057971015, "bs8": 56.07971014492754, "bs16": 62.231884057971016}} -------------------------------------------------------------------------------- /fm_training_estimator/tokens/te2/test_te2.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | from pathlib import Path 3 | 4 | # Local 5 | from ...config import parse 6 | from .te2 import TokenEstimator2, GenerateTokenEstimator2Contract 7 | from ..te0 import TokenEstimator0 8 | 9 | # trick to ensure running this with pytest works from root dir 10 | test_data = (Path(__file__).parent / "te_test1.jsonl").as_posix() 11 | contract_axb = (Path(__file__).parent / "test_axb.contract.json").as_posix() 12 | contract_test1 = (Path(__file__).parent / "test1.contract.json").as_posix() 13 | 14 | def test_te_raw_hf_dataset(): 15 | _, _, _, da, _, _ = parse( 16 | { 17 | "base_model_path": 
"ibm-granite/granite-8b-code-base", 18 | "gpu_memory_in_gb": 80, 19 | "dataset": "super_glue", 20 | "dataset_config_name": "axb", 21 | "dataset_split": "test", 22 | "dataset_text_field": "{sentence1}", 23 | "dataset_config_file": contract_axb 24 | } 25 | ) 26 | 27 | te = TokenEstimator2(da) 28 | te0 = TokenEstimator0(da) 29 | 30 | 31 | assert te.get_num_samples() == 1104, "Number of samples in data should match" 32 | assert te.get_estimated_batch_width(4) < te.get_estimated_batch_width( 33 | 8 34 | ), "Larger batches should have more padding" 35 | assert ( 36 | abs(te.get_estimated_batch_width(1) - (te.get_total_tokens() / 1104)) < 1e6 37 | ), "Estimated batch width for BS=1 should be equal to mean of tokens" 38 | assert ( 39 | te0.get_estimated_batch_width(4) < te.get_estimated_batch_width(4) 40 | ), "TE0 must be less than TE2 for same batch size" 41 | 42 | 43 | def test_te_raw_json(): 44 | _, _, _, da, _, _ = parse( 45 | { 46 | "base_model_path": "ibm-granite/granite-8b-code-base", 47 | "gpu_memory_in_gb": 80, 48 | "dataset": test_data, 49 | "dataset_text_field": "{text}", 50 | "dataset_config_file": contract_test1 51 | } 52 | ) 53 | 54 | te = TokenEstimator2(da) 55 | te0 = TokenEstimator0(da) 56 | 57 | assert te.get_num_samples() == 9 58 | assert te.get_estimated_batch_width(1) < te.get_estimated_batch_width(2) 59 | assert te0.get_estimated_batch_width(2) < te.get_estimated_batch_width(2) 60 | 61 | def test_te2_contract(): 62 | _, _, _, da, _, _ = parse( 63 | { 64 | "base_model_path": "ibm-granite/granite-8b-code-base", 65 | "gpu_memory_in_gb": 80, 66 | "dataset": test_data, 67 | "dataset_text_field": "{text}", 68 | } 69 | ) 70 | 71 | contract = GenerateTokenEstimator2Contract(da.dataset) 72 | 73 | assert contract["text"]["len"] == 9 74 | -------------------------------------------------------------------------------- /fm_training_estimator/ui/README.md: -------------------------------------------------------------------------------- 1 | # ui 2 | 3 | ## UI configuration options 4 | 5 | ### Lookup path 6 | 7 | Path to file with raw CSV data. Look into the `regressor` folder to learn more about data formats. 8 | 9 | ### Model path 10 | 11 | Path to model built using the `regressor` module. 12 | 13 | ## cli 14 | 15 | To use the cli: 16 | ``` 17 | python -m fm_training_estimator.ui.cli -l -m 18 | ``` 19 | Lookup file and model file are optional and can be left out. 20 | 21 | First train a memory model: 22 | ``` 23 | python -m fm_training_estimator.regressor.xgboost.train ./fm_training_estimator/regressor/test_data/data2.csv ./test.model.json '["tokens_per_second","memory","memory_act"]' 24 | ``` 25 | 26 | Run with all inputs: 27 | ``` 28 | python -m fm_training_estimator.ui.cli \ 29 | ./fm_training_estimator/config/test_configs/config2.json \ 30 | -l ./fm_training_estimator/regressor/test_data/data2.csv \ 31 | -m ./test.model.json 32 | ``` 33 | `config2.json` is an example of the setup where Lookup would work. `config3.json` is an example where lookup will fail and the system will fall back to regression. 34 | 35 | ## api 36 | 37 | Run the api: 38 | ``` 39 | make run-api 40 | ``` 41 | 42 | Now, you can get an estimate for the config using something like the following: 43 | ``` 44 | curl localhost:3000/api/estimate -d@ 45 | ``` 46 | Notice that the request is a POST, since we need to pass in config json as a request body. 
47 | 48 | ## web 49 | 50 | To use the web ui: 51 | ``` 52 | python -m fm_training_estimator.ui.web 53 | ``` 54 | 55 | To enable whitelisting of models, you can pass in the path of a txt file with one model per line. See the file `model_whitelist.txt` for an example. Use as: 56 | ``` 57 | python -m fm_training_estimator.ui.web ./model_whitelist.txt 58 | ``` 59 | 60 | To enable the lookup and regression based hybrid estimator: 61 | ``` 62 | python -m fm_training_estimator.ui.web ./model_whitelist.txt \ 63 | ../regressor/test_data/data2.csv \ 64 | ../../test.model.json 65 | ``` 66 | 67 | As with the cli version, first train the model to use. 68 | -------------------------------------------------------------------------------- /fm_training_estimator/ui/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .core import run 3 | -------------------------------------------------------------------------------- /fm_training_estimator/ui/api.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | from typing import Any 3 | import json 4 | import logging 5 | 6 | # Third Party 7 | from fastapi import Body, FastAPI 8 | import fire 9 | import uvicorn 10 | 11 | # Local 12 | from .core import run 13 | 14 | 15 | def api(data_path, model_path): 16 | app = FastAPI() 17 | 18 | @app.post("/api/estimate") 19 | def estimate(config: Any = Body()): 20 | conf = json.loads(config) 21 | output = run(conf, data_path, model_path) 22 | # this default float business is needed to deal with numpy.float32 23 | # types present in the output json which don't serialize out of the box 24 | return json.dumps(output, default=float) 25 | 26 | return app 27 | 28 | 29 | def run_api(data_path=None, model_path=None, port=3000): 30 | 31 | app = api(data_path, model_path) 32 | uvicorn.run(app, host="0.0.0.0", port=port) 33 | 34 | 35 | if __name__ == "__main__": 36 | logging.basicConfig(level=logging.INFO) 37 | fire.Fire(run_api) 38 | -------------------------------------------------------------------------------- /fm_training_estimator/ui/cli.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | from pathlib import Path 3 | from typing import Optional 4 | import json 5 | import logging 6 | 7 | # Third Party 8 | import fire 9 | 10 | # Local 11 | from .core import run 12 | 13 | 14 | def run_cli( 15 | config: str, 16 | output_path: str = "", 17 | log_level: str = "INFO", 18 | lookup_data_path: Optional[str] = None, 19 | model_path: Optional[str] = None, 20 | ): 21 | """Run the CLI.""" 22 | log_level = log_level.upper() 23 | logging.basicConfig(level=log_level) 24 | output = run( 25 | config=config, 26 | lookup_data_path=lookup_data_path, 27 | model_path=model_path, 28 | ) 29 | output_json = json.dumps(output, indent=4) 30 | if output_path == "": 31 | # use print instead of logging so that 32 | # the output can be parsed as valid json 33 | print(output_json) 34 | return 35 | output_path: Path = Path(output_path) 36 | output_path.parent.mkdir(mode=0o755, parents=True, exist_ok=True) 37 | logging.info("writing the output to a file at %s", output_path) 38 | with open(output_path, "w", encoding="utf-8") as f: 39 | f.write(output_json) 40 | 41 | 42 | if __name__ == "__main__": 43 | fire.Fire(run_cli) 44 | -------------------------------------------------------------------------------- /fm_training_estimator/ui/core.py:
-------------------------------------------------------------------------------- 1 | # Local 2 | from ..config import is_fsdp, parse 3 | from ..memory import HybridEstimator, HybridLoraEstimator, HybridQLoraEstimator 4 | from ..throughput import HybridSpeedEstimator 5 | from ..time import get_total_time 6 | from ..tokens import TokenEstimator0, TokenEstimator2 7 | from ..utils import fmt_size 8 | 9 | 10 | def run(config, lookup_data_path=None, model_path=None): 11 | 12 | res = {} 13 | fm, ta, ia, da, la, qla = parse(config) 14 | 15 | token_est = None 16 | if da.te_approach == 0: 17 | token_est = TokenEstimator0(da) 18 | elif da.te_approach == 2: 19 | token_est = TokenEstimator2(da) 20 | 21 | if token_est is not None: 22 | data_max_width = token_est.get_max_sample_length() 23 | if data_max_width < fm.block_size: 24 | fm.block_size = data_max_width 25 | 26 | if fm.technique == "lora": 27 | est = HybridLoraEstimator(fm, ta, ia, la, lookup_data_path, model_path) 28 | elif fm.technique == "qlora": 29 | est = HybridQLoraEstimator(fm, ta, ia, la, qla, lookup_data_path, model_path) 30 | else: 31 | est = HybridEstimator(fm, ta, ia, lookup_data_path, model_path) 32 | 33 | res["total_mem_estimate_og"] = float(est.get_total_mem_estimate()) 34 | res["activation_memory_og"] = float(est.calculate_activation_memory()) 35 | res["gradient_memory_og"] = float(est.calculate_gradient_memory()) 36 | res["model_memory_og"] = float(est.calculate_model_memory()) 37 | res["optimizer_memory_og"] = float(est.calculate_optimizer_memory()) 38 | 39 | res["total_mem_estimate"] = fmt_size(res["total_mem_estimate_og"]) 40 | res["activation_memory"] = fmt_size(res["activation_memory_og"]) 41 | res["gradient_memory"] = fmt_size(res["gradient_memory_og"]) 42 | res["model_memory"] = fmt_size(res["model_memory_og"]) 43 | res["optimizer_memory"] = fmt_size(res["optimizer_memory_og"]) 44 | 45 | res["num_gpus"] = ia.numGpusPerPod 46 | 47 | if ia.numGpusPerPod == 0: 48 | if fm.technique == "full" and is_fsdp(ta): 49 | res["num_gpus"] = est.fsdp_est.get_number_of_gpus() 50 | elif fm.technique == "lora" or fm.technique == "qlora": 51 | res["num_gpus"] = est.num_gpus 52 | else: 53 | res["num_gpus"] = 1 54 | 55 | ia.numGpusPerPod = res["num_gpus"] 56 | 57 | # No suitable configuration found 58 | if res["num_gpus"] == -1: 59 | return {"error": "Input configuration is infeasible!"} 60 | 61 | speed_est = HybridSpeedEstimator(fm, ta, ia, lookup_data_path, model_path) 62 | res["tps"] = float(speed_est.get_tps()) 63 | 64 | if token_est is not None: 65 | res["tokens_per_sample"] = int( 66 | token_est.get_estimated_batch_width(ta.per_device_train_batch_size) 67 | ) 68 | res["total_tokens"] = int(token_est.get_total_tokens()) 69 | 70 | # get the updated tps for this estimated token width 71 | res["tps"] = float(speed_est.get_tps(res["tokens_per_sample"])) 72 | 73 | time_total, time_train = get_total_time(ta, ia, token_est, res["tps"], res["total_tokens"]) 74 | res["time"] = time_total 75 | res["time_train"] = time_train 76 | 77 | return res 78 | -------------------------------------------------------------------------------- /fm_training_estimator/ui/model_whitelist.txt: -------------------------------------------------------------------------------- 1 | ibm-granite/granite-3b-code-base 2 | ibm-granite/granite-3b-code-instruct 3 | ibm-granite/granite-7b-base 4 | ibm-granite/granite-7b-instruct 5 | ibm-granite/granite-8b-code-base 6 | ibm-granite/granite-8b-code-instruct 7 | ibm-granite/granite-20b-code-base 8 | ibm-granite/granite-20b-code-instruct 9
| ibm-granite/granite-34b-code-base 10 | ibm-granite/granite-34b-code-instruct 11 | instructlab/merlinite-7b-lab 12 | instructlab/granite-7b-lab 13 | -------------------------------------------------------------------------------- /fm_training_estimator/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .model import extract_model_features, get_model_max_length 3 | from .utils import ( 4 | fmt_size, 5 | get_human_readable_number, 6 | get_size_from_precision, 7 | logger, 8 | unmarshal, 9 | ) 10 | 11 | __all__ = [ 12 | "unmarshal", 13 | "get_size_from_precision", 14 | "get_human_readable_number", 15 | "fmt_size", 16 | "get_model_max_length", 17 | "logger", 18 | "extract_model_features", 19 | ] 20 | -------------------------------------------------------------------------------- /fm_training_estimator/utils/model.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | import logging 3 | 4 | # Third Party 5 | from transformers import AutoConfig 6 | 7 | 8 | def get_model_max_length(model_path: str) -> int: 9 | """return model's max sequence length by looking up its config 10 | 11 | Args: 12 | model_path (str): model path on filesystem or hugging face id 13 | 14 | Returns: 15 | int: max sequence length 16 | """ 17 | config = AutoConfig.from_pretrained(model_path) 18 | n_positions = 4096 19 | if hasattr(config, "n_positions"): 20 | n_positions = config.n_positions 21 | elif hasattr(config, "max_position_embeddings"): 22 | n_positions = config.max_position_embeddings 23 | return n_positions 24 | 25 | 26 | def extract_model_features(model, fmt="dict"): 27 | """Given a model name in HF format, extract the params of the model. 28 | 29 | Can return the data in one of the supported formats.
30 | "dict": return a dictionary 31 | "list": return a list 32 | "csv": return a comma separated string of values 33 | """ 34 | try: 35 | conf = AutoConfig.from_pretrained(model) 36 | conf = conf.to_dict() 37 | res = {} 38 | 39 | # TODO: include number of params here 40 | # need to refactor code out from Full Memory Estimator class 41 | 42 | res["model_arch"] = conf["architectures"][0] 43 | 44 | if "hidden_size" in conf: 45 | res["model_hidden_size"] = conf["hidden_size"] 46 | elif "n_embd" in conf: 47 | res["model_hidden_size"] = conf["n_embd"] 48 | elif "n_embed" in conf: 49 | res["model_hidden_size"] = conf["n_embed"] 50 | else: 51 | res["model_hidden_size"] = -1 52 | 53 | if "intermediate_size" in conf: 54 | res["model_intermediate_size"] = conf["intermediate_size"] 55 | elif "n_inner" in conf: 56 | res["model_intermediate_size"] = conf["n_inner"] 57 | else: 58 | res["model_intermediate_size"] = -1 59 | 60 | if "num_attention_heads" in conf: 61 | res["model_num_attn_heads"] = conf["num_attention_heads"] 62 | elif "n_head" in conf: 63 | res["model_num_attn_heads"] = conf["n_head"] 64 | else: 65 | res["model_num_attn_heads"] = -1 66 | 67 | if "num_hidden_layers" in conf: 68 | res["model_num_hidden_layers"] = conf["num_hidden_layers"] 69 | elif "n_layer" in conf: 70 | res["model_num_hidden_layers"] = conf["n_layer"] 71 | else: 72 | res["model_num_hidden_layers"] = -1 73 | 74 | if "num_key_value_heads" in conf: 75 | res["model_num_key_value_heads"] = conf["num_key_value_heads"] 76 | else: 77 | res["model_num_key_value_heads"] = res["model_num_attn_heads"] 78 | 79 | except Exception as e: 80 | logging.error(e) 81 | logging.warning("Returning empty response!") 82 | res = {} 83 | 84 | if fmt == "dict": 85 | return res 86 | 87 | if fmt == "list": 88 | return list(res.values()) 89 | 90 | if fmt == "csv": 91 | out = "" 92 | for v in res.values(): 93 | out += f"{v}," 94 | return out[:-1] 95 | 96 | logging.warning("Unknown format selected: ", fmt) 97 | return res 98 | -------------------------------------------------------------------------------- /fm_training_estimator/utils/test_model.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .model import extract_model_features 3 | 4 | 5 | def test_extract_model_features(): 6 | 7 | # default dict format 8 | res = extract_model_features("ibm-granite/granite-3b-code-base") 9 | assert res["model_num_hidden_layers"] == 32 10 | 11 | res = extract_model_features("ibm-granite/granite-3b-code-base", fmt="list") 12 | assert res == ["LlamaForCausalLM", 2560, 10240, 32, 32, 32] 13 | 14 | # extract in csv format 15 | res = extract_model_features("ibm-granite/granite-3b-code-base", fmt="csv") 16 | assert res == "LlamaForCausalLM,2560,10240,32,32,32" 17 | 18 | # example from different format 19 | res = extract_model_features("ibm-granite/granite-20b-code-base", fmt="list") 20 | assert res == ["GPTBigCodeForCausalLM", 6144, 24576, 48, 52, 48] 21 | -------------------------------------------------------------------------------- /fm_training_estimator/utils/utils.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | from typing import Dict 3 | import json 4 | import logging 5 | import math 6 | import os 7 | 8 | # Third Party 9 | import yaml 10 | 11 | logger = logging.getLogger("fm_training_estimator") 12 | log_level = os.getenv("LOG_LEVEL", "INFO").upper() 13 | 14 | # Validate the log level 15 | numeric_level = getattr(logging, log_level, None) 16 | if not 
isinstance(numeric_level, int): 17 | raise ValueError(f"Invalid log level: {log_level}") 18 | 19 | logging.basicConfig(level=numeric_level) 20 | logging.info( 21 | f"FM Training Estimator utils: Set central logging config to level {log_level}." 22 | ) 23 | 24 | 25 | def unmarshal(path: str) -> Dict: 26 | """load data from the given filesystem path and return python dict 27 | 28 | Args: 29 | path (str): path to json or yaml file 30 | 31 | Returns: 32 | Dict: loaded data as python dictionary 33 | """ 34 | if not path.endswith((".json", ".yaml", ".yml")): 35 | raise ValueError( 36 | "path to unmarshal should have either json or yaml extension, but got {path}".format( 37 | path=path 38 | ) 39 | ) 40 | with open(path, "r", encoding="utf8") as f: 41 | if path.endswith(".json"): 42 | return json.load(f) 43 | return yaml.safe_load(f) 44 | 45 | 46 | def get_size_from_precision(precision: str) -> float: 47 | """return multiplier based on the precision 48 | 49 | Args: 50 | precision (str): parameter precision 51 | 52 | Returns: 53 | float: multiplier 54 | """ 55 | if precision in ("float16", "bfloat16"): 56 | return 2 57 | if precision == "float32": 58 | return 4 59 | if precision == "nf4": 60 | return 0.5 61 | return 4 62 | 63 | 64 | # https://stackoverflow.com/a/3155023 65 | def get_human_readable_number(number: int) -> str: 66 | """return human readable format with denomination for the given number 67 | 68 | Args: 69 | number (int): number 70 | 71 | Returns: 72 | str: human readable string for number with denomination 73 | """ 74 | denominations = ["", " Thousand", " Million", " Billion", " Trillion"] 75 | number = float(number) 76 | millidx = max( 77 | 0, 78 | min( 79 | len(denominations) - 1, 80 | int(math.floor(0 if number == 0 else math.log10(abs(number)) / 3)), 81 | ), 82 | ) 83 | 84 | return "{:.0f}{}".format(number / 10 ** (3 * millidx), denominations[millidx]) 85 | 86 | 87 | # https://stackoverflow.com/a/1094933 88 | def fmt_size(size: int) -> str: 89 | """returns human readable format for the given size in bytes 90 | 91 | Args: 92 | size (int): number of bytes 93 | 94 | Returns: 95 | str: human readable string format for the bytes 96 | """ 97 | for unit in ("", "Ki", "Mi", "Gi", "Ti"): 98 | if abs(size) < 1024.0: 99 | return f"{size:3.1f} {unit}B" 100 | size /= 1024.0 101 | 102 | return f"{size:.1f} PiB" 103 | -------------------------------------------------------------------------------- /imgs/build-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-model-stack/fm-training-estimator/ba3f02ab129877c6fa6dabca46f860ce0f95c62c/imgs/build-model.png -------------------------------------------------------------------------------- /imgs/demo-cli.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-model-stack/fm-training-estimator/ba3f02ab129877c6fa6dabca46f860ce0f95c62c/imgs/demo-cli.gif -------------------------------------------------------------------------------- /launch_estimator.py: -------------------------------------------------------------------------------- 1 | # Copyright The FM Training Estimator Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Script wraps fm_training_estimator to run with user provided training configs. 15 | The script will read configuration via environment variable `ESTIMATOR_INPUT_JSON_PATH` 16 | for the path to the JSON config file or `ESTIMATOR_INPUT_JSON_ENV_VAR` 17 | for the encoded config string to parse. 18 | """ 19 | 20 | # Standard 21 | from pathlib import Path 22 | import base64 23 | import json 24 | import logging 25 | import os 26 | import pickle 27 | import subprocess 28 | import sys 29 | import traceback 30 | 31 | # First Party 32 | from fm_training_estimator.config.arguments import EstimateInput, MemoryEstimate 33 | from fm_training_estimator.sdk import estimate_memory, estimate_time, estimate_tokens 34 | 35 | logging.basicConfig(level=logging.INFO) 36 | 37 | 38 | def main(): 39 | ########## 40 | # 41 | # Parse arguments 42 | # 43 | ########## 44 | try: 45 | input_dict = get_input_dict() 46 | logging.info("estimator launch parsed input json: %s", input_dict) 47 | if not input_dict: 48 | raise ValueError( 49 | "Must set environment variable 'ESTIMATOR_INPUT_JSON_PATH'\ 50 | or 'ESTIMATOR_INPUT_JSON_ENV_VAR'." 51 | ) 52 | 53 | except FileNotFoundError as e: 54 | logging.error(traceback.format_exc()) 55 | sys.exit(1) 56 | except (TypeError, ValueError, EnvironmentError) as e: 57 | logging.error(traceback.format_exc()) 58 | sys.exit(1) 59 | except Exception as e: # pylint: disable=broad-except 60 | logging.error(traceback.format_exc()) 61 | sys.exit(1) 62 | 63 | ########## 64 | # 65 | # Run the estimator 66 | # 67 | ########## 68 | model_path = os.getenv("ESTIMATOR_MODEL_PATH") 69 | estimator_input = EstimateInput.from_dict(input_dict) 70 | 71 | out_path = os.getenv("ESTIMATOR_OUTPUT_PATH", "estimator_output") 72 | if not os.path.exists(out_path): 73 | os.makedirs(out_path) 74 | 75 | out_content = "Input parsed for this estimate: " + str(estimator_input) + "\n\n" 76 | 77 | ############ Memory ############ 78 | out_content += "Estimating Memory:....\n" 79 | 80 | memory_output = estimate_memory(estimator_input) 81 | f = open(os.path.join(out_path, "memory_theory.json"), "w") 82 | f.write(json.dumps(memory_output.__dict__)) 83 | f.close() 84 | 85 | out_content += "With only theory: " + str(memory_output) + "\n" 86 | if model_path: 87 | memory_output = estimate_memory(estimator_input, model_path) 88 | out_content += "With reg model: " + str(memory_output) + "\n" 89 | f = open(os.path.join(out_path, "memory_hybrid.json"), "w") 90 | f.write(json.dumps(memory_output.__dict__)) 91 | f.close() 92 | 93 | ############ Time ############ 94 | out_content += "\n" * 3 95 | out_content += "Estimating Time:....\n" 96 | 97 | time_output = estimate_time(estimator_input) 98 | f = open(os.path.join(out_path, "time_theory.json"), "w") 99 | f.write(json.dumps(time_output.__dict__)) 100 | f.close() 101 | 102 | out_content += "With only theory: " + str(time_output) + "\n" 103 | if model_path: 104 | time_output = estimate_time(estimator_input, model_path) 105 | out_content += "With reg model: " + str(time_output) + "\n" 106 | f = open(os.path.join(out_path, "time_hybrid.json"), 
"w") 107 | f.write(json.dumps(time_output.__dict__)) 108 | f.close() 109 | 110 | ############ Tps ############ 111 | out_content += "\n" * 3 112 | out_content += "Estimating tps:....\n" 113 | 114 | tps_output = estimate_tokens(estimator_input) 115 | f = open(os.path.join(out_path, "tps_theory.json"), "w") 116 | f.write(json.dumps(tps_output.__dict__)) 117 | f.close() 118 | 119 | out_content += "With only theory: " + str(tps_output) + "\n" 120 | if model_path: 121 | tps_output = estimate_tokens(estimator_input, model_path) 122 | out_content += "With reg model: " + str(tps_output) + "\n" 123 | f = open(os.path.join(out_path, "tps_hybrid.json"), "w") 124 | f.write(json.dumps(tps_output.__dict__)) 125 | f.close() 126 | 127 | print(out_content) 128 | 129 | f = open(os.path.join(out_path, "output.txt"), "w") 130 | f.write(out_content) 131 | f.close() 132 | return 0 133 | 134 | 135 | def get_input_dict(): 136 | """Parses JSON configuration if provided via environment variables 137 | ESTIMATOR_INPUT_JSON_ENV_VAR or ESTIMATOR_INPUT_JSON_PATH. 138 | 139 | ESTIMATOR_INPUT_JSON_ENV_VAR is the base64 encoded JSON. 140 | ESTIMATOR_INPUT_JSON_PATH is the path to the JSON config file. 141 | 142 | Returns: dict or {} 143 | """ 144 | json_env_var = os.getenv("ESTIMATOR_INPUT_JSON_ENV_VAR") 145 | json_path = os.getenv("ESTIMATOR_INPUT_JSON_PATH") 146 | 147 | # accepts either path to JSON file or encoded string config 148 | # env var takes precedent 149 | input_dict = {} 150 | if json_env_var: 151 | input_dict = txt_to_obj(json_env_var) 152 | elif json_path: 153 | with open(json_path, "r", encoding="utf-8") as f: 154 | input_dict = json.load(f) 155 | 156 | return input_dict 157 | 158 | 159 | def txt_to_obj(txt): 160 | """Given encoded byte string, converts to base64 decoded dict. 161 | 162 | Args: 163 | txt: str 164 | Returns: dict[str, Any] 165 | """ 166 | base64_bytes = txt.encode("ascii") 167 | message_bytes = base64.b64decode(base64_bytes) 168 | try: 169 | # If the bytes represent JSON string 170 | return json.loads(message_bytes) 171 | except UnicodeDecodeError: 172 | # Otherwise the bytes are a pickled python dictionary 173 | return pickle.loads(message_bytes) 174 | 175 | 176 | if __name__ == "__main__": 177 | main() 178 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0", "setuptools-scm>=8.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "fm_training_estimator" 7 | dynamic = ["version"] 8 | authors = [ 9 | { name="Angel Luu", email="angel.luu@us.ibm.com" }, 10 | { name="Chander Govindarajan", email="mail@chandergovind.org" }, 11 | ] 12 | description = "A package of Estimators for Large Language Model Training." 
13 | license = {text = "Apache-2.0"} 14 | readme = "README.md" 15 | requires-python = ">=3.8" 16 | classifiers = [ 17 | "Programming Language :: Python :: 3", 18 | "License :: OSI Approved :: Apache Software License", 19 | "Operating System :: OS Independent", 20 | ] 21 | dependencies = [ 22 | "tox", 23 | "pre-commit", 24 | "transformers", 25 | "peft", 26 | "setuptools", 27 | "fire", 28 | "pandas", 29 | "xgboost", 30 | "scikit-learn<1.6.0", 31 | "gradio", 32 | "datasets", 33 | "dataclass-wizard", 34 | "uvicorn", 35 | "arise-predictions==1.0.2" 36 | ] 37 | 38 | [project.urls] 39 | Source = "https://github.com/foundation-model-stack/fm-training-estimator" 40 | 41 | 42 | [project.optional-dependencies] 43 | dev-docs = [ 44 | "sphinx>=4.0.2,<8.0", 45 | "sphinx-autoapi>=2.1.0", 46 | "sphinx-rtd-theme>=1.2.1,<2.1.0", 47 | ] 48 | 49 | [tool.setuptools.packages.find] 50 | exclude = ["tests", "tests.*", "test_*.py"] 51 | namespaces = false 52 | 53 | [tool.setuptools_scm] 54 | version_file = "fm_training_estimator/_version.py" -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = lint, fmt, test 3 | 4 | [testenv:fmt] 5 | description = format with pre-commit 6 | deps = 7 | pre-commit 8 | tox 9 | commands = bash ./tox.sh 10 | allowlist_externals = bash,./tox.sh 11 | 12 | [testenv:lint] 13 | description = lint with pylint 14 | deps = 15 | pylint>=2.16.2,<=3.1.0 16 | pytest 17 | commands = pylint fm_training_estimator 18 | allowlist_externals = pylint 19 | 20 | [testenv:docs] 21 | recreate = True 22 | extras = dev-docs 23 | changedir = docs/source 24 | 25 | ; Disabled '-W' flag due to warnings in the files 26 | ; TODO: Add back in once build warnings fixed 27 | commands = 28 | sphinx-build -E -a -b html -T . _build/html 29 | 30 | [testenv:test] 31 | description = test with pytest 32 | deps = 33 | pytest 34 | -r requirements.txt 35 | commands = pytest 36 | allowlist_externals = pytest 37 | 38 | [testenv:build] 39 | description = build wheel 40 | deps = 41 | build 42 | commands = python -m build -w 43 | skip_install = True 44 | 45 | [testenv:twinecheck] 46 | description = check wheel 47 | deps = 48 | twine 49 | commands = twine check dist/* 50 | skip_install = True 51 | -------------------------------------------------------------------------------- /tox.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | pre-commit run --all-files 4 | RETURN_CODE=$? 5 | 6 | function echoWarning() { 7 | LIGHT_YELLOW='\033[1;33m' 8 | NC='\033[0m' # No Color 9 | echo -e "${LIGHT_YELLOW}${1}${NC}" 10 | } 11 | 12 | if [ "$RETURN_CODE" -ne 0 ]; then 13 | if [ "${CI}" != "true" ]; then 14 | echoWarning "☝️ This appears to have failed, but actually your files have been formatted." 15 | echoWarning "Make a new commit with these changes before making a pull request." 16 | else 17 | echoWarning "This test failed because your code isn't formatted correctly." 18 | echoWarning 'Locally, run `make fmt`, it will appear to fail, but change files.' 19 | echoWarning "Add the changed files to your commit and this stage will pass." 20 | fi 21 | 22 | exit $RETURN_CODE 23 | fi 24 | --------------------------------------------------------------------------------