├── .github └── workflows │ └── build-and-push.yml ├── .gitignore ├── .isort.cfg ├── .pre-commit-config.yaml ├── .pylintrc ├── .readthedocs.yaml ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── adrs └── 001-resource-estimator-library.md ├── docs ├── Makefile ├── make.bat └── source │ ├── conf.py │ └── index.rst ├── fm_training_estimator ├── __init__.py ├── config │ ├── __init__.py │ ├── arguments.py │ ├── parser.py │ ├── test_configs │ │ ├── config1.json │ │ ├── config2.json │ │ ├── config3.json │ │ └── config4.json │ ├── test_parser.py │ ├── test_utils.py │ └── utils.py ├── data │ ├── README.md │ ├── __init__.py │ └── manager.py ├── memory │ ├── __init__.py │ ├── fsdp │ │ ├── __init__.py │ │ ├── fsdp.py │ │ └── test_fsdp.py │ ├── full │ │ ├── README.md │ │ ├── __init__.py │ │ ├── full.py │ │ └── test_full.py │ ├── hybrid │ │ ├── README.md │ │ ├── __init__.py │ │ ├── hybrid.py │ │ └── hybrid_test.py │ ├── lora │ │ ├── __init__.py │ │ ├── hybrid.py │ │ ├── lora.py │ │ └── test_lora.py │ └── qlora │ │ ├── __init__.py │ │ ├── hybrid.py │ │ ├── qlora.py │ │ └── test_qlora.py ├── regressor │ ├── README.md │ ├── __init__.py │ ├── arise │ │ ├── README.md │ │ ├── __init__.py │ │ ├── arise.py │ │ └── train.py │ ├── dispatch.py │ ├── linear │ │ ├── __init__.py │ │ ├── linear.py │ │ └── train.py │ ├── lookup │ │ ├── __init__.py │ │ ├── lookup.py │ │ └── test_lookup.py │ ├── test_data │ │ ├── data1.csv │ │ ├── data2.csv │ │ └── data3.csv │ └── xgboost │ │ ├── README.md │ │ ├── __init__.py │ │ ├── test_reg.py │ │ ├── train.py │ │ └── xgboost.py ├── sdk │ ├── README.md │ ├── __init__.py │ ├── examples │ │ └── ex1.py │ └── sdk.py ├── throughput │ ├── __init__.py │ ├── hybrid │ │ ├── __init__.py │ │ ├── hybrid.py │ │ └── test_hybrid.py │ └── mock │ │ ├── __init__.py │ │ ├── mock.py │ │ └── test_mock.py ├── time │ ├── README.md │ ├── __init__.py │ └── time.py ├── tokens │ ├── README.md │ ├── __init__.py │ ├── te.py │ ├── te0 │ │ ├── __init__.py │ │ ├── te0.py │ │ ├── te_test1.jsonl │ │ └── test_te0.py │ └── te2 │ │ ├── README.md │ │ ├── __init__.py │ │ ├── gen_contract.py │ │ ├── te2.py │ │ ├── te_test1.jsonl │ │ ├── test1.contract.json │ │ ├── test_axb.contract.json │ │ └── test_te2.py ├── ui │ ├── README.md │ ├── __init__.py │ ├── api.py │ ├── cli.py │ ├── core.py │ ├── model_whitelist.txt │ └── web.py └── utils │ ├── __init__.py │ ├── model.py │ ├── test_model.py │ └── utils.py ├── imgs ├── build-model.png └── demo-cli.gif ├── launch_estimator.py ├── pyproject.toml ├── tox.ini └── tox.sh /.github/workflows/build-and-push.yml: -------------------------------------------------------------------------------- 1 | # Copyright The FM Training Estimator Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | name: Upload Python Package for FM Training Estimator 16 | 17 | on: 18 | release: 19 | types: [published] 20 | 21 | permissions: 22 | contents: read 23 | 24 | jobs: 25 | deploy: 26 | 27 | runs-on: ubuntu-latest 28 | strategy: 29 | matrix: 30 | python-version: 31 | - setup: "3.11" 32 | tox: "py311" 33 | 34 | permissions: 35 | id-token: write # IMPORTANT: this permission is mandatory for trusted publishing 36 | 37 | steps: 38 | - uses: actions/checkout@v4 39 | - name: Set up Python 40 | uses: actions/setup-python@v4 41 | with: 42 | python-version: ${{ matrix.python-version.setup }} 43 | 44 | 45 | - name: Install dependencies 46 | run: | 47 | python -m pip install --upgrade pip 48 | python -m pip install tox 49 | - name: Build and test with tox 50 | run: tox -e ${{ matrix.python-version.tox }} 51 | - name: Build and check wheel package 52 | run: 53 | tox -e build,twinecheck 54 | - name: Publish package 55 | uses: pypa/gh-action-pypi-publish@release/v1 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | wheel/ 30 | fm_training_estimator/_version.py 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/source/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Mac personalization files 116 | *.DS_Store 117 | 118 | # Spyder project settings 119 | .spyderproject 120 | .spyproject 121 | 122 | # Rope project settings 123 | .ropeproject 124 | 125 | # mkdocs documentation 126 | /site 127 | 128 | # mypy 129 | .mypy_cache/ 130 | .dmypy.json 131 | dmypy.json 132 | 133 | # Pyre type checker 134 | .pyre/ 135 | 136 | .*~ 137 | /workdir/ 138 | 139 | .DS_Store 140 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | profile=black 3 | from_first=true 4 | import_heading_future=Future 5 | import_heading_stdlib=Standard 6 | import_heading_thirdparty=Third Party 7 | import_heading_firstparty=First Party 8 | import_heading_localfolder=Local 9 | known_firstparty= 10 | known_localfolder=tuning 11 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 22.3.0 4 | hooks: 5 | - id: black 6 | exclude: imports 7 | - repo: https://github.com/PyCQA/isort 8 | rev: 5.11.5 9 | hooks: 10 | - id: isort 11 | exclude: imports 12 | - repo: https://github.com/compilerla/conventional-pre-commit 13 | rev: v3.2.0 14 | hooks: 15 | - id: conventional-pre-commit 16 | stages: [commit-msg] 17 | args: [] 18 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Configuration version 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.11" 13 | 14 | # Build documentation in the docs/source directory with Sphinx 15 | sphinx: 16 | configuration: docs/source/conf.py 17 | 18 | # Declare the Python requirements required to build your docs 19 | python: 20 | install: 21 | - method: pip 22 | path: . 
23 | extra_requirements: 24 | - all 25 | - dev-docs -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ## Global Args ################################################################# 2 | ARG BASE_UBI_IMAGE_TAG=latest 3 | ARG USER=tuning 4 | ARG USER_UID=1000 5 | ARG PYTHON_VERSION=3.11 6 | ARG WHEEL_VERSION="" 7 | 8 | ## Base Layer ################################################################## 9 | FROM registry.access.redhat.com/ubi9/ubi:${BASE_UBI_IMAGE_TAG} AS base 10 | 11 | ARG PYTHON_VERSION 12 | ARG USER 13 | ARG USER_UID 14 | 15 | # Note this works for 3.9, 3.11, 3.12 16 | RUN dnf remove -y --disableplugin=subscription-manager \ 17 | subscription-manager \ 18 | && dnf install -y python${PYTHON_VERSION} procps g++ python${PYTHON_VERSION}-devel \ 19 | && ln -s /usr/bin/python${PYTHON_VERSION} /bin/python \ 20 | && python -m ensurepip --upgrade \ 21 | && python -m pip install --upgrade pip \ 22 | && python -m pip install --upgrade setuptools \ 23 | && dnf update -y \ 24 | && dnf clean all 25 | 26 | RUN useradd -u $USER_UID ${USER} -m -g 0 --system && \ 27 | chmod g+rx /home/${USER} 28 | 29 | FROM base AS python-installations 30 | 31 | ARG WHEEL_VERSION 32 | ARG USER 33 | ARG USER_UID 34 | 35 | RUN dnf install -y git && \ 36 | # perl-Net-SSLeay.x86_64 and server_key.pem are installed with git as dependencies 37 | # Twistlock detects it as H severity: Private keys stored in image 38 | rm -f /usr/share/doc/perl-Net-SSLeay/examples/server_key.pem && \ 39 | dnf clean all 40 | 41 | USER ${USER} 42 | WORKDIR /tmp 43 | RUN --mount=type=cache,target=/home/${USER}/.cache/pip,uid=${USER_UID} \ 44 | python -m pip install --user build 45 | COPY --chown=${USER}:root fm_training_estimator fm_training_estimator 46 | COPY .git .git 47 | COPY pyproject.toml pyproject.toml 48 | 49 | # Build a wheel if PyPi wheel_version is empty else download the wheel from PyPi 50 | RUN if [[ -z "${WHEEL_VERSION}" ]]; \ 51 | then python -m build --wheel --outdir /tmp; \ 52 | else pip download fm_training_estimator==${WHEEL_VERSION} --dest /tmp --only-binary=:all: --no-deps; \ 53 | fi && \ 54 | ls /tmp/*.whl >/tmp/bdist_name 55 | 56 | # Install from the wheel 57 | RUN --mount=type=cache,target=/home/${USER}/.cache/pip,uid=${USER_UID} \ 58 | python -m pip install --user wheel && \ 59 | python -m pip install --user "$(head bdist_name)" && \ 60 | # Cleanup the bdist whl file 61 | rm $(head bdist_name) /tmp/bdist_name 62 | 63 | ## Final image ################################################ 64 | FROM base AS release 65 | ARG USER 66 | ARG PYTHON_VERSION 67 | 68 | RUN mkdir -p /licenses 69 | COPY LICENSE /licenses/ 70 | 71 | RUN mkdir /app && \ 72 | chown -R $USER:0 /app /tmp && \ 73 | chmod -R g+rwX /app /tmp 74 | 75 | 76 | # Copy scripts and default configs 77 | COPY launch_estimator.py /app/ 78 | RUN chmod +x /app/launch_estimator.py 79 | 80 | WORKDIR /app 81 | USER ${USER} 82 | COPY --from=python-installations /home/${USER}/.local /home/${USER}/.local 83 | ENV PYTHONPATH="/home/${USER}/.local/lib/python${PYTHON_VERSION}/site-packages" 84 | 85 | CMD [ "python", "/app/launch_estimator.py" ] 86 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, 
REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | IMAGE ?= icr.io/ftplatform/fm_training_estimator:latest 2 | 3 | .PHONY: build 4 | build: lint fmt install 5 | tox -e build 6 | 7 | .PHONY: install 8 | install: hook 9 | pip install -e . 10 | 11 | .PHONY: fmt 12 | fmt: 13 | tox -e fmt 14 | 15 | .PHONY: lint 16 | lint: 17 | tox -e lint 18 | 19 | .PHONY: test 20 | test: 21 | tox -e test 22 | 23 | .PHONY: hook 24 | hook: 25 | pre-commit install --hook-type commit-msg 26 | 27 | .PHONY: build-model 28 | build-model: 29 | python -m fm_training_estimator.regressor.xgboost.train ./workdir/data.csv ./workdir/model.zip '["tokens_per_second","memory","memory_act"]' 30 | 31 | .PHONY: build-linear-model 32 | build-linear-model: 33 | python -m fm_training_estimator.regressor.linear.train ./workdir/data.csv ./workdir/model.zip '["tokens_per_second","memory","memory_act"]' 34 | 35 | .PHONY: build-arise-model 36 | build-arise-model: 37 | python -m fm_training_estimator.regressor.arise.train ./workdir/data.csv ./workdir/model.zip ./workdir/arise-config.yaml '["tokens_per_second","memory","memory_act"]' 38 | 39 | .PHONY: run-web-ui 40 | run-web-ui: 41 | python -m fm_training_estimator.ui.web ./workdir/model_whitelist.txt ./workdir/data.csv ./workdir/model.json --enable_api=True 42 | 43 | .PHONY: run-cli 44 | run-cli: 45 | python -m fm_training_estimator.ui.cli --lookup_data_path ./workdir/data.csv -m ./workdir/model.zip $(CONF) 46 | 47 | .PHONY: run-cli-arise 48 | run-cli-arise: 49 | python -m fm_training_estimator.ui.cli --lookup_data_path ./workdir/data.csv -m ./workdir/model.zip $(CONF) 50 | 51 | .PHONY: run-api 52 | run-api: 53 | python -m fm_training_estimator.ui.api ./workdir/data.csv ./workdir/model.json 54 | 55 | .PHONY: cbuild 56 | cbuild: 57 | docker build -t ${IMAGE} -f Dockerfile . 
58 | 59 | .PHONY: cpush 60 | cpush: 61 | docker push ${IMAGE} 62 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FM Training Estimator 2 | 3 | Estimators for Large Language Model Training. 4 | 5 | Estimate resource consumption - memory, tokens, time, etc. - for training and fine-tuning jobs, using a hybrid of theory and learned regression models. 6 | 7 | ## Feature Matrix and Roadmap 8 | 9 | | Technique | Support | 10 | |--------------------|--------------------| 11 | | Full (1 gpu) | :heavy_check_mark: | 12 | | FSDP (multi) | :heavy_check_mark: | 13 | | Lora (1 gpu) | :heavy_check_mark: | 14 | | QLora (1 gpu) | :heavy_check_mark: | 15 | | Speculators | Planned | 16 | | Tensor Parallelism | Planned | 17 | 18 | ### Time 19 | 20 | Fully learned approach. Coverage depends on the availability of training data. 21 | 22 | ### Memory 23 | 24 | Hybrid theory + learned. Coverage of the learned approach is subject to the availability of training data. 25 | 26 | ### Tokens 27 | 28 | Fully theory-based. Simulation-based models available. 29 | 30 | | Technique | Explanation | Availability | 31 | |-----------|------------------------------------------------|--------------------| 32 | | TE0 | Simulation-based - slow but accurate | :heavy_check_mark: | 33 | | TE1 | Statistical | Planned | 34 | | TE2 | Approximate - fast, light, reasonably accurate | :heavy_check_mark: | 35 | 36 | ## Usage 37 | 38 | You can use the `fm_training_estimator` library as a Python package by installing it via pip; see [installation](#install), [build a regression model](#build-a-regression-model-for-learned-prediction-method) and [using the library](#use-the-library-to-get-estimates). If you'd like to run the estimator service with a [Web UI](#make-estimates-via-a-web-ui) via FastAPI or [build a docker image](#build-a-docker-container-image), clone the repository to your local machine before following the instructions in those sections. 39 | 40 | Within your working directory, it is recommended to create a virtual environment to avoid dependency conflicts. 41 | 42 | ``` 43 | python -m venv .venv 44 | source .venv/bin/activate 45 | ``` 46 | 47 | ### Install 48 | ``` 49 | pip install fm_training_estimator 50 | ``` 51 | 52 | ### Build a regression model for learned prediction method 53 | 54 | Now, prepare data in the expected format for lookup and regression. The format to be used to save this data is given [here](https://github.com/foundation-model-stack/fm-training-estimator/tree/main/fm_training_estimator/data/README.md). Save your data file as `./workdir/data.csv`: 55 | 56 | ``` 57 | mkdir workdir 58 | mv /path/to/your/data.csv ./workdir/data.csv 59 | ``` 60 | 61 | Now, build a regression model using this data, using one of the provided make targets (the underlying commands are reproduced below, after the list of usage options). 62 | 63 | ![Building a model](./imgs/build-model.png) 64 | 65 | This will create a model at `./workdir/model.zip`, which you can then use to estimate resource consumption. 66 | 67 | You can now run the estimator library; see below. 68 | 69 | ### Using the Estimator 70 | 71 | There are a few ways to use the Estimator: 72 | 73 | 1. Using the CLI tool, passing in a config in JSON format. 74 | 2. Using the Web UI. 75 | 3. Using the SDK directly from Python code.
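For reference, the model-building make targets mentioned above wrap the regressor training modules. From the repository `Makefile`, the default XGBoost target expands to the command below (the `build-linear-model` and `build-arise-model` targets are analogous, swapping in the linear and ARISE regressors):

```
make build-model
# which runs:
python -m fm_training_estimator.regressor.xgboost.train ./workdir/data.csv ./workdir/model.zip '["tokens_per_second","memory","memory_act"]'
```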
76 | 77 | #### Using the CLI 78 | 79 | ![Demo of using CLI](./imgs/demo-cli.gif) 80 | 81 | #### Make estimates via a Web UI 82 | 83 | To do this, first prepare a text file called `model_whitelist.txt` in `workdir/` with a list of model names, one per line. These are the models for which the estimator will produce resource estimates. You can use the provided [example](https://github.com/foundation-model-stack/fm-training-estimator/blob/main/fm_training_estimator/ui/model_whitelist.txt) and place it in your `workdir`. Modify this list as needed. 84 | 85 | Now, run the UI: 86 | ``` 87 | make run-web-ui 88 | ``` 89 | This will start the UI on port 3000 (`localhost:3000`). 90 | 91 | (The web UI has other options not covered in this simple setup. If you want to skip the model whitelisting or change the port, run the UI directly as shown in the README in the `./fm_training_estimator/ui` folder.) 92 | 93 | #### Use the library to get estimates 94 | 95 | For a full API reference, visit our [readthedocs](link). 96 | 97 | Example code: 98 | ```python 99 | # Standard 100 | import os 101 | 102 | # First Party 103 | from fm_training_estimator.config.arguments import ( 104 | DataArguments, 105 | EstimateInput, 106 | EstimatorMetadata, 107 | FMArguments, 108 | HFTrainingArguments, 109 | InfraArguments, 110 | JobConfig, 111 | ) 112 | from fm_training_estimator.sdk import ( 113 | estimate_cost, 114 | estimate_memory, 115 | estimate_time, 116 | estimate_tokens, 117 | ) 118 | 119 | workdir_path = os.path.join(os.path.abspath(os.curdir), "workdir") 120 | 121 | model_path = os.path.join(workdir_path, "model.json") 122 | lookup_data_path = os.path.join(workdir_path, "data.csv") 123 | 124 | estimator_metadata = EstimatorMetadata(base_data_path=lookup_data_path) 125 | 126 | fm = FMArguments( 127 | base_model_path="ibm-granite/granite-7b-base", 128 | torch_dtype="bfloat16", 129 | block_size=1024, 130 | ) 131 | hf_training = HFTrainingArguments( 132 | per_device_train_batch_size=1, gradient_checkpointing=False 133 | ) 134 | data = DataArguments(dataset="imdb", te_approach=0) 135 | infra = InfraArguments(numGpusPerPod=1) 136 | job_conf = JobConfig(hf_training, fm, data, infra) 137 | est_input = EstimateInput(estimator_metadata=estimator_metadata, job_configs=[job_conf]) 138 | 139 | print("Estimating Memory:....") 140 | 141 | print("With only theory: ", estimate_memory(est_input)) 142 | print("With reg model: ", estimate_memory(est_input, model_path)) 143 | 144 | hf_training.fsdp = "full_shard" 145 | 146 | print("Using fsdp full shard") 147 | print("With only theory: ", estimate_memory(est_input)) 148 | print("With reg model: ", estimate_memory(est_input, model_path)) 149 | 150 | 151 | print("Estimating Time:....") 152 | print("With only theory: ", estimate_time(est_input)) 153 | print("With reg model: ", estimate_time(est_input, model_path)) 154 | 155 | print("Estimating Tokens:....") 156 | print("With only theory: ", estimate_tokens(est_input)) 157 | print("With reg model: ", estimate_tokens(est_input, model_path)) 158 | ``` 159 | 160 | ### Build a Docker Container Image 161 | 162 | To build the estimator container image: 163 | 164 | 1. Make sure both `model.json` and `data.csv` files are present in the `workdir` folder. 165 | 166 | 2. Use this command to build and push the image: 167 | 168 | ```shell 169 | make cbuild 170 | make cpush # If you want to push to the container registry 171 | ``` 172 | 173 | 3. Use this command to run the image: 174 | 175 | ```shell 176 | docker run --rm -it -v "/path/to/input.json:/app/input.json" icr.io/ftplatform/fm_training_estimator:latest 177 | ``` 178 |
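The exact contents expected in the mounted `input.json` are not covered in this simple setup. Assuming the container's `launch_estimator.py` entrypoint reads the `EstimateInput` schema described in the ADR below (this is an assumption, not confirmed here), a minimal sketch of such a file could look like:

```json
{
  "estimator": {
    "base_data_path": "data.csv",
    "method": "theory",
    "token_estimation_version": 0
  },
  "job_configs": [{
    "fm": { "base_model_path": "ibm-granite/granite-3b-code-base" },
    "infra": { "numGpusPerPod": 1 }
  }]
}
```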
-------------------------------------------------------------------------------- /adrs/001-resource-estimator-library.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Resource Estimator Library 3 | --- 4 | 5 | - **Author(s)**: Angel Luu (@aluu317) 6 | - **Signer(s)**: Praveen Jayachandran, Ashok Pon Kumar Sree Prakash @ashokponkumar, Chander Govindarajan @ChanderG 7 | - **Date (YYYY-MM-DD)**: 2024-10-31 8 | - **Obsoletes ADRs**: N/A 9 | - **Modified By ADRs**: N/A 10 | - **Relevant Issues**: N/A 11 | 12 | ## Problem Context 13 | 14 | Users of the tuning/training stack currently have no way of estimating how much memory, time, or cost it takes to run a training job. They often hit OOM errors due to a lack of memory. Users don't have enough information to make trade-off decisions on time vs. cost. Platform admins lack the information needed to better schedule and pack jobs onto GPUs. 15 | 16 | To be useful, the capability of estimating resources must be exposed to tuning/training users. The primary user personas of this service are training users and platform admins. 17 | 18 | This ADR defines a Resource Estimator Python Library that provides an estimate of resource requirements for training runs. 19 | 20 | ## Impact Table 21 | 22 | | AI Functionality | Operational Functionality | 23 | | ---------------- | ------------------------- | 24 | | Tuning Stack | APIs | 25 | 26 | ## Decision 27 | 28 | - We will expose the resource estimator service as a Python library `fm_training_estimator`, hosted as Open Source at the repo [fm-training-estimator](https://github.com/foundation-model-stack/fm-training-estimator) and published to [PyPI](https://pypi.org/). 29 | - This Python library can be installed and plugged into any UI backend or a docker image by a product team. 30 | - The `fm_training_estimator` exposes 4 methods to calculate memory, time, tokens and cost. The method calls allow the user to pass training data as input for the "learned" or "hybrid" models. If training data is missing, the "theory" model is used. 31 | 32 | ### Alternatives to Python library deliverable 33 | We considered the following alternatives: 34 | - Alternative 1: A new docker image which has a FastAPI server with a REST interface defined. When a product team integrates it as a service, they can run this docker image; a server will run on localhost, which can then be queried with GET/POST calls to obtain estimates. 35 | 36 | - Alternative 2: A new docker image with a python script similar to fms-hf-tuning, which accepts a JSON config, calls the necessary python scripts to get an estimate, and saves the results in a file. 37 | 38 | Both alternatives provide more out-of-the-box value to consumers. However, they do not provide the same flexibility in how the library can be integrated and consumed. 39 | 40 | ## Consequences 41 | 42 | - By using this library, users need to supply their own dataset for the estimator to generate a learned model, and they assume responsibility for the security and privacy of that data. They can use the flight service plugin should that be applicable. 43 | - The library can be used as a backend component of a larger UI effort, or as part of a Docker image.
The product teams can consume the library however they see fit and create their own build/update process. 44 | 45 | ## High Level Design 46 | 47 | - The `EstimateInput` data class (not all fields are required) defines the set of configs the library will use to calculate the results. This includes a list of instances of the `JobConfig` data class, which in turn includes different types of configs (hf training args `HFArguments`, fms-hf-tuning additional args `FMArguments`, data args `DataArguments`, infrastructure args `InfraArguments`, peft lora args `PeftLoraConfig` and peft qlora args `PeftQLoraConfig`), and `EstimatorMetadata` with metadata parameters. The input can be read from a JSON file using `--input_file_path` or `-f`. 48 | 49 | Example of an `EstimateInput` with all fields defined: 50 | ```json 51 | { 52 | "estimator": { // EstimatorMetadata 53 | "base_data_path": "data.csv", 54 | "method": "theory", // theory, learned, hybrid 55 | "token_estimation_version": 0 56 | }, 57 | "job_configs": [{ // list of [JobConfig] 58 | "hf_training": { // HFArguments 59 | "output_dir": "./output" 60 | }, 61 | "fm": { // FMArguments 62 | "base_model_path": "ibm-granite/granite-3b-code-base", 63 | "flash_attention_v2": "false", 64 | "lora_config": null, 65 | "max_seq_length": 2048, 66 | "block_size": 2048, 67 | "data_config_file": "data_config.json", 68 | "prompt_tuning_config": null, 69 | "torch_dtype": "float32", 70 | "technique": "full" 71 | }, 72 | "data": { // DataArguments 73 | "te_approach": 0, 74 | "dataset": null, 75 | "dataset_text_field": "text", 76 | "dataset_split": "test", 77 | "dataset_config_name": null 78 | }, 79 | "infra": { // InfraArguments 80 | "numGpusPerPod": 1, 81 | "numPods": 1, 82 | "gpu_memory_in_gb": 80, 83 | "gpuModel": "A100" 84 | }, 85 | "peft_lora": { // PeftLoraConfig 86 | "r": 4, 87 | "lora_alpha": 8, 88 | "lora_dropout": 0.1, 89 | "target_modules": "[q_proj, v_proj]" 90 | }, 91 | "peft_qlora": { // PeftQLoraConfig 92 | "quant_type": "nf4", 93 | "use_double_quant": false 94 | } 95 | }] 96 | } 97 | ``` 98 | 99 | - The API exposes the following functions: 100 | 101 | Function `estimate_memory` returns a `MemoryEstimate`: 102 | ```python 103 | { 104 | "memory": { # MemoryEstimate 105 | "total_mem_estimate": "44.6 GiB", 106 | "activation_memory": "34.7 GiB", 107 | "gradient_memory": "2.5 GiB", 108 | "model_memory": "2.5 GiB", 109 | "optimizer_memory": "4.9 GiB", 110 | "num_gpus": 2 111 | } 112 | } 113 | ``` 114 | 115 | Function `estimate_time` returns a `TimeEstimate`: 116 | ```python 117 | { 118 | "time": { # TimeEstimate 119 | "time": "40s" 120 | } 121 | } 122 | ``` 123 | 124 | Function `estimate_tokens` returns a `TokensEstimate`: 125 | ```python 126 | { 127 | "tokens": { # TokensEstimate 128 | "tps": "5259.07373046875" 129 | } 130 | } 131 | ``` 132 | 133 | Function `estimate_cost` returns a `CostEstimate`: 134 | ```python 135 | { 136 | "cost": { # CostEstimate 137 | "usd": "0.0" 138 | } 139 | } 140 | ``` 141 | 142 | Function `estimate` returns an `Estimate` that includes all 4 types of estimates above: 143 | ```python 144 | { 145 | "estimate": { # Estimate 146 | "memory": { # MemoryEstimate 147 | "total_mem_estimate": "44.6 GiB", 148 | "activation_memory": "34.7 GiB", 149 | "gradient_memory": "2.5 GiB", 150 | "model_memory": "2.5 GiB", 151 | "optimizer_memory": "4.9 GiB", 152 | "num_gpus": 2 153 | }, 154 | "time": { # TimeEstimate 155 | "time": "40s" 156 | }, 157 | "tokens": { # TokensEstimate 158 | "tps": "5259.07373046875" 159 | }, 160 | "cost": { # CostEstimate 161 | "usd": "0.0" 162
| } 163 | } 164 | } 165 | ``` 166 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # Standard 10 | # If extensions (or modules to document with autodoc) are in another directory, 11 | # add these directories to sys.path here. If the directory is relative to the 12 | # documentation root, use os.path.abspath to make it absolute, like shown here. 13 | # 14 | import os 15 | import sys 16 | 17 | sys.path.insert(0, os.path.abspath(os.path.join("..", ".."))) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = "FM Training Estimator" 23 | copyright = "2024, The Training Estimator Authors" 24 | author = "The Training Estimator Authors" 25 | 26 | 27 | # -- General configuration --------------------------------------------------- 28 | 29 | # Add any Sphinx extension module names here, as strings. They can be 30 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 31 | # ones. 
32 | extensions = [ 33 | # Generate complete API docs by parsing source code 34 | "autoapi.extension", 35 | # Add links to source from generated docs 36 | "sphinx.ext.viewcode", 37 | ] 38 | 39 | # Add any paths that contain templates here, relative to this directory. 40 | templates_path = ["_templates"] 41 | 42 | # List of patterns, relative to source directory, that match files and 43 | # directories to ignore when looking for source files. 44 | # This pattern also affects html_static_path and html_extra_path. 45 | exclude_patterns = [ 46 | "_build", 47 | "Thumbs.db", 48 | ".DS_Store", 49 | "**ex1**", 50 | "**test**", 51 | "**_version**", 52 | ] 53 | 54 | # -- autoapi configuration --------------------------------------------------- 55 | 56 | # Language of source code to parse 57 | autoapi_type = "python" 58 | 59 | # Source code to parse to generate API docs relative to 'docs/source' directory 60 | autoapi_dirs = [os.path.join("..", "..", "fm_training_estimator")] 61 | 62 | # -- Options for HTML output ------------------------------------------------- 63 | 64 | # The theme to use for HTML and HTML Help pages. See the documentation for 65 | # a list of builtin themes. 66 | # 67 | html_theme = "sphinx_rtd_theme" 68 | 69 | # Add any paths that contain custom static files (such as style sheets) here, 70 | # relative to this directory. They are copied after the builtin static files, 71 | # so a file named "default.css" will overwrite the builtin "default.css". 72 | html_static_path = [] 73 | 74 | # Support external links to specific versions of the files in the Github repo 75 | branch = os.environ.get("READTHEDOCS_VERSION") 76 | if branch is None or branch == "latest": 77 | branch = "main" 78 | 79 | REPO = "foundation-model-stack/fm-training-estimator" 80 | scm_raw_web = "https://raw.githubusercontent.com/" + REPO + "/" + branch 81 | scm_web = "https://github.com/" + REPO + "/blob/" + branch 82 | 83 | # Store variables in the epilogue so they are globally available. 84 | rst_epilog = """ 85 | .. |SCM_WEB| replace:: {s} 86 | .. |SCM_RAW_WEB| replace:: {sr} 87 | .. |SCM_BRANCH| replace:: {b} 88 | """.format( 89 | s=scm_web, sr=scm_raw_web, b=branch 90 | ) 91 | 92 | # used to have links to repo files 93 | extlinks = { 94 | "scm_raw_web": (scm_raw_web + "/%s", "scm_raw_web"), 95 | "scm_web": (scm_web + "/%s", "scm_web"), 96 | } 97 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to FM Training Estimator's API documentation!
2 | ====================================================== 3 | 4 | 5 | Example usage: 6 | 7 | >>> # Standard 8 | import os 9 | # First Party 10 | from fm_training_estimator.config.arguments import ( 11 | DataArguments, 12 | EstimateInput, 13 | EstimatorMetadata, 14 | FMArguments, 15 | HFTrainingArguments, 16 | InfraArguments, 17 | JobConfig, 18 | ) 19 | from fm_training_estimator.sdk import ( 20 | estimate_cost, 21 | estimate_memory, 22 | estimate_time, 23 | estimate_tokens, 24 | ) 25 | workdir_path = os.path.join(os.path.abspath(os.curdir), "workdir") 26 | model_path = os.path.join(workdir_path, "model.json") 27 | lookup_data_path = os.path.join(workdir_path, "data.csv") 28 | estimator_metadata = EstimatorMetadata(base_data_path=lookup_data_path) 29 | fm = FMArguments( 30 | base_model_path="ibm-granite/granite-7b-base", 31 | torch_dtype="bfloat16", 32 | block_size=1024, 33 | ) 34 | hf_training = HFTrainingArguments( 35 | per_device_train_batch_size=1, gradient_checkpointing=False 36 | ) 37 | data = DataArguments(dataset="imdb", te_approach=0) 38 | infra = InfraArguments(numGpusPerPod=1) 39 | job_conf = JobConfig(hf_training, fm, data, infra) 40 | est_input = EstimateInput(estimator_metadata=estimator_metadata, job_configs=[job_conf]) 41 | print("Estimating Memory:....") 42 | print("With only theory: ", estimate_memory(est_input)) 43 | print("With reg model: ", estimate_memory(est_input, model_path)) 44 | hf_training.fsdp = "full_shard" 45 | print("Using fsdp full shard") 46 | print("With only theory: ", estimate_memory(est_input)) 47 | print("With reg model: ", estimate_memory(est_input, model_path)) 48 | print("Estimating Time:....") 49 | print("With only theory: ", estimate_time(est_input)) 50 | print("With reg model: ", estimate_time(est_input, model_path)) 51 | print("Estimating Tokens:....") 52 | print("With only theory: ", estimate_tokens(est_input)) 53 | print("With reg model: ", estimate_tokens(est_input, model_path)) 54 | 55 | ..
toctree:: 56 | :maxdepth: 2 57 | :caption: Contents: 58 | 59 | 60 | Indices and tables 61 | ================== 62 | 63 | * :ref:`genindex` 64 | * :ref:`modindex` 65 | * :ref:`search` -------------------------------------------------------------------------------- /fm_training_estimator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-model-stack/fm-training-estimator/ba3f02ab129877c6fa6dabca46f860ce0f95c62c/fm_training_estimator/__init__.py -------------------------------------------------------------------------------- /fm_training_estimator/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .arguments import ( 3 | DataArguments, 4 | FMArguments, 5 | HFTrainingArguments, 6 | InfraArguments, 7 | PeftLoraConfig, 8 | PeftPromptTuningConfig, 9 | PeftQLoraConfig, 10 | ) 11 | from .parser import parse 12 | from .utils import is_fsdp 13 | 14 | __all__ = [ 15 | "FMArguments", 16 | "PeftPromptTuningConfig", 17 | "PeftLoraConfig", 18 | "PeftQLoraConfig", 19 | "HFTrainingArguments", 20 | "InfraArguments", 21 | "DataArguments", 22 | "parse", 23 | "is_fsdp", 24 | ] 25 | -------------------------------------------------------------------------------- /fm_training_estimator/config/arguments.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | from dataclasses import dataclass, field 3 | from enum import Enum 4 | from typing import List, Optional 5 | 6 | # Third Party 7 | from dataclass_wizard import JSONWizard 8 | from peft.tuners.lora import LoraConfig 9 | from peft.tuners.prompt_tuning import PromptTuningConfig 10 | from transformers import TrainingArguments 11 | 12 | 13 | @dataclass 14 | class PeftPromptTuningConfig(PromptTuningConfig): 15 | """dataclass for prompt tuning config 16 | 17 | Args: 18 | PromptTuningConfig (_type_): imported directly from peft library 19 | """ 20 | 21 | 22 | @dataclass 23 | class PeftLoraConfig: 24 | """Dataclass for LoRA tuning config 25 | 26 | Not directly imported from peft LoraConfig due to complexity. 27 | """ 28 | 29 | r: int = field(default=4, metadata={"help": ("Lora rank parameter")}) 30 | 31 | lora_alpha: int = field(default=8) 32 | lora_dropout: float = field(default=0.1) 33 | target_modules: List[str] = field(default_factory=lambda: ["q_proj", "v_proj"]) 34 | 35 | 36 | @dataclass 37 | class PeftQLoraConfig: 38 | """Dataclass for QLoRA tuning config""" 39 | 40 | quant_type: str = field(default="nf4") 41 | use_double_quant: bool = field(default=False) 42 | 43 | 44 | @dataclass 45 | class HFTrainingArguments(TrainingArguments): 46 | """HF trainer arguments 47 | 48 | Args: 49 | TrainingArguments (_type_): directly imported from transformers library 50 | """ 51 | 52 | output_dir: str = field( 53 | default="./output", metadata={"help": ("temporary output dir for HF")} 54 | ) 55 | 56 | 57 | @dataclass 58 | class InfraArguments: 59 | """dataclass for infrastructure arguments""" 60 | 61 | numGpusPerPod: int = field( 62 | default=0, 63 | metadata={ 64 | "help": ( 65 | "number of gpus requested per pod. Setting to 0 for auto-discover." 
66 | ) 67 | }, 68 | ) 69 | 70 | numPods: int = field( 71 | default=1, 72 | metadata={"help": ("number of pods requested")}, 73 | ) 74 | 75 | gpu_memory_in_gb: int = field(default=80, metadata={"help": ("GPU RAM in GBs")}) 76 | 77 | gpuModel: str = field( 78 | default="A100", 79 | metadata={"help": ("model of gpu used")}, 80 | ) 81 | 82 | 83 | @dataclass 84 | class FMArguments: 85 | """dataclass to store additional args not covered by standard HF argument dataclasses""" 86 | 87 | base_model_path: str = field( 88 | default="ibm-granite/granite-3b-code-base", 89 | metadata={ 90 | "help": ( 91 | "Base Model location. Can be empty if output path has a checkpoint." 92 | ) 93 | }, 94 | ) 95 | 96 | flash_attention_v2: bool = field( 97 | default=False, 98 | metadata={"help": ("Enable flash attention v2 for attention calculation.")}, 99 | ) 100 | 101 | lora_config: str = field( 102 | default=None, metadata={"help": ("LoRA configuration JSON file path.")} 103 | ) 104 | 105 | max_seq_length: int = field( 106 | default=2048, 107 | metadata={"help": ("model max sequence length.")}, 108 | ) 109 | 110 | block_size: int = field( 111 | default=2048, 112 | metadata={"help": ("Sequence length.")}, 113 | ) 114 | 115 | data_config_file: str = field( 116 | default="data_config.json", 117 | metadata={"help": ("Input files in glob format.")}, 118 | ) 119 | 120 | prompt_tuning_config: str = field( 121 | default=None, metadata={"help": ("Prompt tuning config JSON file path")} 122 | ) 123 | 124 | torch_dtype: str = field( 125 | default="float32", 126 | metadata={ 127 | "help": ( 128 | "provide torch dtype for the model precision. \ 129 | Choose one from float16, float32, bfloat16" 130 | ) 131 | }, 132 | ) 133 | 134 | technique: str = field( 135 | default="full", 136 | metadata={"help": ("Fine-tuning technique being used")}, 137 | ) 138 | 139 | 140 | @dataclass 141 | class DataArguments: 142 | """dataclass to define args handling training data as input for estimation.""" 143 | 144 | te_approach: int = field( 145 | default=0, metadata={"help": ("Approach to use for Token Estimation")} 146 | ) 147 | 148 | dataset: str = field( 149 | default=None, metadata={"help": ("name of HF dataset or path to json file")} 150 | ) 151 | 152 | dataset_text_field: str = field( 153 | default="text", metadata={"help": ("field of the dataset to use")} 154 | ) 155 | 156 | dataset_split: str = field( 157 | default="test", 158 | metadata={"help": ("dataset split to use, in case of HF dataset")}, 159 | ) 160 | 161 | dataset_config_name: str = field( 162 | default=None, 163 | metadata={"help": ("dataset configuration to use, in case of HF dataset")}, 164 | ) 165 | 166 | trust_remote_code: bool = field( 167 | default=True, 168 | metadata={"help": ("allow dataset with a loading script")} 169 | ) 170 | 171 | dataset_config_file: str = field( 172 | default=None, 173 | metadata={"help": ("dataset configuration file in case dataset is not available/provided")}, 174 | ) 175 | 176 | class EstimatorMethod(Enum): 177 | """Enumerates the estimation methods the FM Training Estimator can use to make an estimation.""" 178 | 179 | THEORY = "theory" 180 | """Theory model for estimation.""" 181 | 182 | LEARNED = "learned" 183 | """Learned model for estimation, based on user provided training data.""" 184 | 185 | HYBRID = "hybrid" 186 | """Hybrid model for estimation, a combination of theory and learned models.""" 187 | 188 | 189 | @dataclass 190 | class EstimatorMetadata: 191 | """Metadata for the FM Training Estimator.""" 192 | 193 | base_data_path:
str = field( 194 | default=None, metadata={"help": ("path to the training data file")} 195 | ) 196 | method: EstimatorMethod = field( 197 | default=EstimatorMethod.HYBRID, 198 | metadata={"help": ("enum method the estimator should use")}, 199 | ) 200 | token_estimation_version: int = field( 201 | default=0, metadata={"help": ("version of token estimator to use")} 202 | ) 203 | 204 | 205 | @dataclass 206 | class JobConfig: 207 | """Dataclass that represents a set of different configs for a tuning job to make an estimate on.""" 208 | 209 | hf_training: HFTrainingArguments = field(default_factory=HFTrainingArguments) 210 | fm: FMArguments = field(default_factory=FMArguments) 211 | data: DataArguments = field(default_factory=DataArguments) 212 | infra: InfraArguments = field(default_factory=InfraArguments) 213 | peft_lora: PeftLoraConfig = field(default_factory=PeftLoraConfig) 214 | peft_qlora: PeftQLoraConfig = field(default_factory=PeftQLoraConfig) 215 | 216 | 217 | @dataclass 218 | class EstimateInput(JSONWizard): 219 | """ 220 | The dataclass that is an input to an estimate function. 221 | It includes a list of different training job configs and metadata about the estimator. 222 | """ 223 | 224 | job_configs: List[JobConfig] 225 | estimator_metadata: Optional[EstimatorMetadata] = None 226 | 227 | 228 | @dataclass 229 | class TimeEstimate: 230 | """The estimated time response to estimate_time function.""" 231 | 232 | time: str 233 | train_time: str 234 | 235 | 236 | @dataclass 237 | class MemoryEstimate: 238 | """The estimated memory response to estimate_memory function.""" 239 | 240 | total_mem_estimate: str 241 | activation_memory: str 242 | gradient_memory: str 243 | model_memory: str 244 | optimizer_memory: str 245 | num_gpus: int 246 | 247 | 248 | @dataclass 249 | class TokensEstimate: 250 | """The estimated token response to estimate_token function.""" 251 | 252 | tps: float 253 | 254 | 255 | @dataclass 256 | class CostEstimate: 257 | """The estimated cost response to estimate_cost function.""" 258 | 259 | usd: float 260 | 261 | 262 | @dataclass 263 | class Estimate: 264 | """The estimate response to estimate function, including time, memory, tokens and cost.""" 265 | 266 | memory: MemoryEstimate 267 | time: TimeEstimate 268 | tokens: TokensEstimate 269 | cost: CostEstimate 270 | -------------------------------------------------------------------------------- /fm_training_estimator/config/parser.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | from typing import Dict, Tuple, Union 3 | 4 | # Third Party 5 | from transformers import HfArgumentParser 6 | 7 | # Local 8 | from ..utils import logger, unmarshal 9 | from .arguments import ( 10 | DataArguments, 11 | FMArguments, 12 | HFTrainingArguments, 13 | InfraArguments, 14 | PeftLoraConfig, 15 | PeftQLoraConfig, 16 | ) 17 | 18 | 19 | def parse( 20 | config: Union[Dict, str] 21 | ) -> Tuple[ 22 | FMArguments, 23 | HFTrainingArguments, 24 | InfraArguments, 25 | DataArguments, 26 | PeftLoraConfig, 27 | PeftQLoraConfig, 28 | ]: 29 | """parse config and return respective dataclass objects 30 | 31 | Args: 32 | config (Union[Dict, str]): path to a config file, or a config dict 33 | 34 | Returns: 35 | Tuple[FMArguments, HFTrainingArguments, InfraArguments, DataArguments, PeftLoraConfig, PeftQLoraConfig]: 36 | dataclass objects 37 | """ 38 | try: 39 | if not isinstance(config, (str, Dict)): 40 | raise TypeError( 41 | "provided config should be either a path to a config file \ 42 | or a
/fm_training_estimator/config/parser.py:
--------------------------------------------------------------------------------
1 | # Standard
2 | from typing import Dict, Tuple, Union
3 | 
4 | # Third Party
5 | from transformers import HfArgumentParser
6 | 
7 | # Local
8 | from ..utils import logger, unmarshal
9 | from .arguments import (
10 |     DataArguments,
11 |     FMArguments,
12 |     HFTrainingArguments,
13 |     InfraArguments,
14 |     PeftLoraConfig,
15 |     PeftQLoraConfig,
16 | )
17 | 
18 | 
19 | def parse(
20 |     config: Union[Dict, str]
21 | ) -> Tuple[
22 |     FMArguments,
23 |     HFTrainingArguments,
24 |     InfraArguments,
25 |     DataArguments,
26 |     PeftLoraConfig,
27 |     PeftQLoraConfig,
28 | ]:
29 |     """parse config and return respective dataclass objects
30 | 
31 |     Args:
32 |         config (Union[Dict, str]): path to a config file or a config python dict
33 | 
34 |     Returns:
35 |         Tuple[FMArguments, HFTrainingArguments, InfraArguments, DataArguments, PeftLoraConfig, PeftQLoraConfig]:
36 |             dataclass objects
37 |     """
38 |     try:
39 |         if not isinstance(config, (str, Dict)):
40 |             raise TypeError(
41 |                 "provided config should be either a path to a config file \
42 |                 or a python dict, but got {config_type}".format(
43 |                     config_type=type(config)
44 |                 )
45 |             )
46 |         if isinstance(config, str):
47 |             config = unmarshal(config)
48 | 
49 |         arg_parser = HfArgumentParser(
50 |             [
51 |                 FMArguments,
52 |                 HFTrainingArguments,
53 |                 InfraArguments,
54 |                 DataArguments,
55 |                 PeftLoraConfig,
56 |                 PeftQLoraConfig,
57 |             ]
58 |         )
59 | 
60 |         return arg_parser.parse_dict(config)
61 |     except Exception as e:  # pylint: disable=broad-except
62 |         logger.error(
63 |             "failed to parse the provided arguments from config {config}. error: {e}".format(
64 |                 config=config, e=e
65 |             )
66 |         )
67 | 
--------------------------------------------------------------------------------
/fm_training_estimator/config/test_configs/config1.json:
--------------------------------------------------------------------------------
1 | {
2 |     "max_seq_length": 1023
3 | }
4 | 
--------------------------------------------------------------------------------
/fm_training_estimator/config/test_configs/config2.json:
--------------------------------------------------------------------------------
1 | {
2 |     "base_model_path": "ibm-granite/granite-7b-base",
3 |     "torch_dtype": "float16",
4 |     "fsdp": "full_shard",
5 |     "numGpusPerPod": 2,
6 |     "per_device_train_batch_size": 4,
7 |     "block_size": 512
8 | }
9 | 
--------------------------------------------------------------------------------
/fm_training_estimator/config/test_configs/config3.json:
--------------------------------------------------------------------------------
1 | {
2 |     "base_model_path": "ibm-granite/granite-7b-base",
3 |     "torch_dtype": "float16",
4 |     "fsdp": "full_shard",
5 |     "numGpusPerPod": 2,
6 |     "per_device_train_batch_size": 5,
7 |     "block_size": 512
8 | }
9 | 
--------------------------------------------------------------------------------
/fm_training_estimator/config/test_configs/config4.json:
--------------------------------------------------------------------------------
1 | {
2 |     "base_model_path": "ibm-granite/granite-7b-base",
3 |     "torch_dtype": "float16",
4 |     "fsdp": "full_shard",
5 |     "numGpusPerPod": 2,
6 |     "per_device_train_batch_size": 5,
7 |     "block_size": 512,
8 |     "dataset": "imdb",
9 |     "dataset_config_file": "abc.json"
10 | }
11 | 
--------------------------------------------------------------------------------
/fm_training_estimator/config/test_parser.py:
--------------------------------------------------------------------------------
1 | # Standard
2 | from pathlib import Path
3 | 
4 | # Local
5 | from .parser import parse
6 | 
7 | config_file_1 = (Path(__file__).parent / "./test_configs/config4.json").as_posix()
8 | 
9 | 
10 | def test_parse_empty_dict():
11 |     config = {}
12 |     _, _, _, _, _, _ = parse(config)
13 | 
14 | 
15 | def test_parse_dict():
16 |     config = {
17 |         "max_seq_length": 1023,
18 |         "gpu_memory_in_gb": 40,
19 |         "block_size": 1023,
20 |         "per_device_train_batch_size": 2,
21 |         "dataset": "my-dataset",
22 |     }
23 |     fm, ta, ia, da, _, _ = parse(config)
24 | 
25 |     assert fm.max_seq_length == 1023
26 |     assert ia.gpu_memory_in_gb == 40
27 |     assert fm.block_size == 1023
28 |     assert ta.per_device_train_batch_size == 2
29 |     assert da.dataset == "my-dataset"
30 | 
31 | 
32 | def test_parse_file():
33 |     _, _, _, da, _, _ = parse(config_file_1)
34 | 
35 |     assert da.dataset_config_file == "abc.json"
36 | 
--------------------------------------------------------------------------------
/fm_training_estimator/config/test_utils.py:
--------------------------------------------------------------------------------
1 | # Local
2 | from .parser import parse
3 | from .utils import is_fsdp
4 | 
5 | 
6 | def test_fsdp_empty():
7 |     config = {}
8 |     _, ta, _, _, _, _ = parse(config)
9 | 
10 |     assert is_fsdp(ta) is False
11 | 
12 | 
13 | def test_fsdp_enabled():
14 |     config = {"fsdp": "full_shard"}
15 |     _, ta, _, _, _, _ = parse(config)
16 | 
17 |     assert is_fsdp(ta) is True
18 | 
19 |     config = {"fsdp": ["hybrid_shard", "offload"]}
20 |     _, ta, _, _, _, _ = parse(config)
21 | 
22 |     assert is_fsdp(ta) is True
23 | 
--------------------------------------------------------------------------------
/fm_training_estimator/config/utils.py:
--------------------------------------------------------------------------------
1 | # First Party
2 | from fm_training_estimator.config.arguments import HFTrainingArguments
3 | 
4 | 
5 | def is_fsdp(ta: HFTrainingArguments):
6 |     if hasattr(ta, "fsdp") and len(ta.fsdp) != 0:
7 |         return True
8 | 
9 |     return False
10 | 
--------------------------------------------------------------------------------
/fm_training_estimator/data/README.md:
--------------------------------------------------------------------------------
1 | # Data
2 | 
3 | This module is used to standardize and version the supported data formats, used both at train time (for the regression models) and at run time (to structure the data fed to the lookup and regression modules).
4 | 
5 | Since we wish to support an ever-evolving set of dataset features, the data format has been versioned, with names such as "v1", "v2" and so on.
6 | 
7 | There are 3 integration points for this format:
8 | 1. The format of the data in the CSV file for lookup. The names and order of columns, basically.
9 | 2. The feature names (with order) used to train any regression model to be used with the estimator.
10 | 3. The key values (with order) to be used at run time, to query one of the above 2 modules.
11 | 
12 | This module locks, in code, the exact expected format of data under each version name. These names are mainly for human use, to refer to the various formats. The job of this module is to automatically infer data format versions and adjust the data fields, to make it easy for other modules to work with continuously changing data formats.
13 | 
14 | Specifically:
15 | 1. For CSV files used in lookup, this module will check the format version, based on the header, before using the file.
16 | 2. Regression training is expected to use this module to bake the used data format into the model. This way, the model file can be safely shared and re-used. At model load, this format is extracted and used as in 3.
17 | 3. For runtime queries, this module provides helper functions to structure input data to fit the expected data format.
18 | 
19 | In the future, this module can also:
20 | 1. Provide validation functions to check any input data files/models.
21 | 2. Provide correction functions, to coerce input data files to the specified format.
22 | 
23 | ## Formats
24 | 
25 | ### v1: Name based data
26 | 
27 | For an example, look at `../regressor/test_data/data2.csv`. We need the following fields, in order:
28 | ```
29 | model_name,number_gpus,batch_size,seq_len,tokens_per_second,memory,memory_act
30 | ```
31 | 
32 | `model_name` is a HF-compatible model name. All other fields are numbers.
33 | 
34 | `memory` refers to the total memory consumed by that configuration, in bytes.
35 | `memory_act` refers to the activation memory consumed by that configuration, in bytes.
36 | 
37 | ### v2: Feature based data
38 | 
39 | For an example, look at `../regressor/test_data/data3.csv`. We need the following fields, in order:
40 | ```
41 | model_arch,model_hidden_size,model_intermediate_size,model_num_attn_heads,model_num_hidden_layers,model_num_key_value_heads,number_gpus,batch_size,seq_len,tokens_per_second,memory,memory_act
42 | ```
43 | 
44 | Notice how we no longer have the name of the model in the data. Instead, the first 6 fields refer to the model configuration features now being used. All other fields are as in the `Name based data` format.
45 | 
46 | Note: `manager.py` also defines a `v3` format, which extends `v2` with tuning `method` and `gpu_model` fields.
47 | 
--------------------------------------------------------------------------------
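To make the formats above concrete, here is a small illustrative sketch of the helpers this module exports (values are made up):

```python
# Illustrative sketch of the data-format helpers described in this README.
from fm_training_estimator.data import format_query, lookup_format_version

# The CSV header (X columns followed by Y columns) identifies the version.
header = "model_name,number_gpus,batch_size,seq_len,tokens_per_second,memory,memory_act"
assert lookup_format_version(header) == "v1"

# At run time, partial information is coerced into the expected key order.
partials = {
    "model_name": "ibm-granite/granite-7b-base",
    "number_gpus": 2,
    "batch_size": 4,
    "seq_len": 512,
}
query = format_query(partials, "v1")
# query == {"model_name": "ibm-granite/granite-7b-base", "number_gpus": 2,
#           "batch_size": 4, "seq_len": 512}
```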
/fm_training_estimator/data/__init__.py:
--------------------------------------------------------------------------------
1 | # Local
2 | from .manager import format_query, lookup_format_version, get_format_by_version
3 | 
--------------------------------------------------------------------------------
/fm_training_estimator/data/manager.py:
--------------------------------------------------------------------------------
1 | # Local
2 | from ..utils import extract_model_features
3 | 
4 | 
5 | class Format:
6 |     """A class to track the various data formats used for lookup/regressor.
7 | 
8 |     Stores the features used/predicted as strings.
9 |     """
10 | 
11 |     def __init__(self, name, X, Y):
12 |         self.name = name
13 |         self.X = X
14 |         self.Y = Y
15 | 
16 |     def get_all_columns_string(self):
17 |         return self.X + "," + self.Y
18 | 
19 |     def get_empty_key_dict(self):
20 |         res = {}
21 |         for x in self.X.split(","):
22 |             res[x] = None
23 | 
24 |         return res
25 | 
26 | 
27 | """
28 | This is the list of accepted/known data formats.
29 | 
30 | Only one of the following is a valid format for CSV files for lookup and for any trained regression models.
31 | 
32 | When new formats are to be supported, this list is to be updated with a new Format object.
33 | """
34 | formats = [
35 |     Format(
36 |         "v1",
37 |         "model_name,number_gpus,batch_size,seq_len",
38 |         "tokens_per_second,memory,memory_act",
39 |     ),
40 |     Format(
41 |         "v2",
42 |         "model_arch,model_hidden_size,model_intermediate_size,model_num_attn_heads,model_num_hidden_layers,model_num_key_value_heads,number_gpus,batch_size,seq_len",
43 |         "tokens_per_second,memory,memory_act",
44 |     ),
45 |     Format(
46 |         "v3",
47 |         "model_arch,model_hidden_size,model_intermediate_size,model_num_attn_heads,model_num_hidden_layers,model_num_key_value_heads,method,gpu_model,number_gpus,batch_size,seq_len",
48 |         "tokens_per_second,memory,memory_act",
49 |     ),
50 | ]
51 | 
52 | 
53 | def lookup_format_version(data_keys):
54 |     """
55 |     Given a string of comma-separated keys, looks up any matching defined format version.
56 | 
57 |     The input includes both X and Y columns in that order, like the header of
58 |     the CSV used to train/lookup.
59 |     """
60 |     for f in formats:
61 |         if data_keys == f.get_all_columns_string():
62 |             return f.name
63 | 
64 |     return "undefined"
65 | 
66 | 
67 | def get_format_by_version(version):
68 |     """Given a version string, return the relevant Format object."""
69 |     for f in formats:
70 |         if f.name == version:
71 |             return f
72 | 
73 |     return None
74 | 
75 | 
76 | def format_query(partials, version, only_values=False):
77 |     """
78 |     Format a query for a given version using the provided partial information.
79 | 
80 |     If only_values is False, returns a dictionary of key-values according to the format.
81 |     If it is True, returns the values as an array. The former is needed for direct
82 |     lookup in the lookup module, while the latter is used by the regressor.
83 | """ 84 | 85 | vf = get_format_by_version(version) 86 | 87 | # TODO: vf can be None here, if an unsupported format is seen. 88 | 89 | query = vf.get_empty_key_dict() 90 | 91 | # fill in all matching fields from the input, if present in desired version 92 | for k, v in partials.items(): 93 | if k in query: 94 | query[k] = v 95 | 96 | # Handle changes for other model versions here 97 | 98 | if version == "v2" or version == "v3": 99 | model_features = extract_model_features(partials["model_name"]) 100 | for k, v in model_features.items(): 101 | if k in query: 102 | query[k] = v 103 | 104 | # TODO: validate that all fields are filled in here, no None's present 105 | # print(query) 106 | 107 | if not only_values: 108 | return query 109 | else: 110 | return query.values() 111 | -------------------------------------------------------------------------------- /fm_training_estimator/memory/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .fsdp import FSDPEstimator 3 | from .full import FullParameterTuningEstimator 4 | from .hybrid import HybridEstimator 5 | from .lora import HybridLoraEstimator, LoraEstimator 6 | from .qlora import HybridQLoraEstimator, QLoraEstimator 7 | -------------------------------------------------------------------------------- /fm_training_estimator/memory/fsdp/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .fsdp import FSDPEstimator 3 | 4 | __all__ = ["FSDPEstimator"] 5 | -------------------------------------------------------------------------------- /fm_training_estimator/memory/fsdp/fsdp.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | import math 3 | 4 | # Local 5 | from ...config import FMArguments, HFTrainingArguments 6 | from ...utils import fmt_size 7 | from ..full import FullParameterTuningEstimator 8 | 9 | 10 | class FSDPEstimator: 11 | def __init__( 12 | self, 13 | fm_args: FMArguments, 14 | train_args: HFTrainingArguments, 15 | base: FullParameterTuningEstimator, 16 | gpuSize: int, 17 | ) -> None: 18 | self.base = base 19 | self.gpuSize = gpuSize 20 | self.num_of_model_params = self.base.num_of_model_params 21 | self.num_of_trainable_params = self.base.num_of_trainable_params 22 | self.optimizer = self.base.optimizer 23 | self.precision = self.base.precision 24 | self.s = self.base.s 25 | """fsdp options 26 | - `"full_shard"`: Shard parameters, gradients and optimizer states. 27 | - `"shard_grad_op"`: Shard optimizer states and gradients. 28 | - `"hybrid_shard"`: Apply `FULL_SHARD` within a node, and replicate parameters across nodes. 29 | - `"hybrid_shard_zero2"`: Apply `SHARD_GRAD_OP` within a node, and replicate parameters across nodes. 30 | - `"offload"`: Offload parameters and gradients to CPUs (only compatible with `"full_shard"` and 31 | `"shard_grad_op"`). 32 | - `"auto_wrap"`: Automatically recursively wrap layers with FSDP using `default_auto_wrap_policy`. 
33 | """ 34 | self.fsdp_options = train_args.fsdp 35 | # ignores multi node training 36 | self.num_gpus = None 37 | 38 | def set_number_of_gpus(self, num_gpus): 39 | self.num_gpus = num_gpus 40 | 41 | def get_number_of_gpus(self): 42 | if self.num_gpus is None: 43 | self.estimate_number_of_gpus() 44 | 45 | return self.num_gpus 46 | 47 | def estimate_number_of_gpus(self): 48 | base_memory = ( 49 | self.base.calculate_activation_memory(readable=False) 50 | + self.base.calculate_gradient_memory(readable=False) 51 | + self.base.calculate_optimizer_memory(readable=False) 52 | ) 53 | if "shard_grad_op" in self.fsdp_options: 54 | return math.ceil( 55 | base_memory 56 | / ( 57 | self.gpuSize 58 | - ( 59 | self.gpuSize * 0.01 60 | + self.base.calculate_model_memory(readable=False) 61 | ) 62 | ) 63 | ) 64 | # leaving out 1% gap 65 | base_memory = (self.base.calculate_model_memory(readable=False)) + base_memory 66 | self.num_gpus = math.ceil(base_memory / (self.gpuSize - self.gpuSize * 0.01)) 67 | return self.num_gpus 68 | 69 | def get_total_mem_estimate(self, readable: bool = False): 70 | size = ( 71 | self.calculate_activation_memory() 72 | + self.calculate_gradient_memory() 73 | + self.calculate_model_memory() 74 | + self.calculate_optimizer_memory() 75 | ) 76 | if readable: 77 | return fmt_size(size) 78 | return size 79 | 80 | def calculate_activation_memory(self, readable: bool = False): 81 | # activations are not sharded however, they are reduced by the minibatch size 82 | # minibatch is the per device batch size 83 | size = self.base.calculate_activation_memory(readable=False) 84 | if readable: 85 | return fmt_size(size) 86 | return size 87 | 88 | def calculate_gradient_memory(self, readable: bool = False): 89 | size = self.base.calculate_gradient_memory(readable=False) / ( 90 | self.get_number_of_gpus() 91 | ) 92 | if readable: 93 | return fmt_size(size) 94 | return size 95 | 96 | def calculate_optimizer_memory(self, readable: bool = False): 97 | size = self.base.calculate_optimizer_memory(readable=False) / ( 98 | self.get_number_of_gpus() 99 | ) 100 | if readable: 101 | return fmt_size(size) 102 | return size 103 | 104 | def calculate_model_memory(self, readable: bool = False): 105 | # at some point FSDP loads double the sharded model memory 106 | size = self.base.calculate_model_memory(readable=False) 107 | if not "shard_grad_op" in self.fsdp_options: 108 | size = size / (self.get_number_of_gpus()) 109 | if readable: 110 | return fmt_size(size) 111 | return size 112 | -------------------------------------------------------------------------------- /fm_training_estimator/memory/fsdp/test_fsdp.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from ...config import parse 3 | from ..full import FullParameterTuningEstimator 4 | from .fsdp import FSDPEstimator 5 | 6 | 7 | def test_fsdp(): 8 | fm, ta, ia, _, _, _ = parse( 9 | {"base_model_path": "ibm-granite/granite-8b-code-base", "gpu_memory_in_gb": 80} 10 | ) 11 | 12 | base = FullParameterTuningEstimator(fm, ta) 13 | est = FSDPEstimator(fm, ta, base, 1024 * 1024 * 1024 * ia.gpu_memory_in_gb) 14 | 15 | est.set_number_of_gpus(1) 16 | mm1 = est.calculate_model_memory() 17 | 18 | est.set_number_of_gpus(2) 19 | mm2 = est.calculate_model_memory() 20 | 21 | assert mm1 == mm2 * 2 22 | 23 | est.set_number_of_gpus(4) 24 | mm4 = est.calculate_model_memory() 25 | 26 | assert mm1 == mm4 * 4 27 | -------------------------------------------------------------------------------- 
/fm_training_estimator/memory/full/README.md:
--------------------------------------------------------------------------------
1 | # Full Estimator
2 | 
3 | Estimates the memory needed for single-GPU, full-parameter fine-tuning.
4 | 
5 | ## Experimental Features
6 | 
7 | ### Gradient Checkpointing
8 | 
9 | How do we scale down activation memory when gradient checkpointing is enabled? (The other three components are not affected.)
10 | 
11 | By examining Profiler output and looking at the code, we find that the checkpoint function (`torch.utils.checkpoint`) is called for each block; for example, see: https://github.com/huggingface/transformers/blob/f5f1e52f6cf13cdf63ff25c311d33e2f2a842911/src/transformers/models/llama/modeling_llama.py#L984
12 | 
13 | This means that the activations of a single block are stored while they are computed; once we are done with a block, just the inputs (which are stored by the `checkpoint` function) are retained, to recompute the activations for the backward pass.
14 | 
15 | So a simple approximation is used here: scale the total activation memory down to that consumed by a single layer or block of the Transformer architecture.
16 | 
--------------------------------------------------------------------------------
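In code terms, the approximation described in this README is a single division by the number of layers (see `size /= self.l` in `calculate_activation_memory` below). A toy sketch with hypothetical numbers:

```python
# Toy sketch of the gradient-checkpointing approximation (hypothetical numbers):
# retain activations for roughly one transformer block instead of all blocks.
num_layers = 36                       # hypothetical model depth
full_activation_mem = 90 * 1024**3    # activation footprint without checkpointing

checkpointed_activation_mem = full_activation_mem / num_layers
print(checkpointed_activation_mem / 1024**3)  # 2.5 GiB
```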
/fm_training_estimator/memory/full/__init__.py:
--------------------------------------------------------------------------------
1 | # Local
2 | from .full import FullParameterTuningEstimator
3 | 
4 | __all__ = ["FullParameterTuningEstimator"]
5 | 
--------------------------------------------------------------------------------
/fm_training_estimator/memory/full/full.py:
--------------------------------------------------------------------------------
1 | # Third Party
2 | from transformers import AutoConfig, AutoTokenizer
3 | from transformers.training_args import OptimizerNames
4 | 
5 | # Local
6 | from ...config import FMArguments, HFTrainingArguments
7 | from ...utils import fmt_size, get_size_from_precision, logger
8 | 
9 | 
10 | class FullParameterTuningEstimator:
11 |     def __init__(self, fm_args: FMArguments, train_args: HFTrainingArguments) -> None:
12 |         # see https://huggingface.co/docs/transformers/v4.18.0/en/performance
13 |         self.train_args = train_args
14 |         self.fm_args = fm_args
15 |         self.model_path = self.fm_args.base_model_path
16 |         self.config = AutoConfig.from_pretrained(self.model_path)
17 |         # check https://github.com/bigscience-workshop/bigscience/tree/6917a3b5fefcf439d3485ca184b4d9f6ab605150/math#model-sizing
18 |         if hasattr(self.config, "n_embed"):
19 |             self.h = self.config.n_embed
20 |         elif hasattr(self.config, "n_embd"):
21 |             self.h = self.config.n_embd
22 |         elif hasattr(self.config, "hidden_size"):
23 |             self.h = self.config.hidden_size
24 |         h = self.h
25 |         if hasattr(self.config, "n_layer"):
26 |             l = self.config.n_layer
27 |         elif hasattr(self.config, "num_hidden_layers"):
28 |             l = self.config.num_hidden_layers
29 |         self.l = l
30 |         v = self.config.vocab_size
31 |         self.v = v
32 |         if hasattr(self.config, "n_head"):
33 |             a = self.config.n_head
34 |         elif hasattr(self.config, "num_attention_heads"):
35 |             a = self.config.num_attention_heads
36 |         self.a = a
37 |         self.b = self.train_args.per_device_train_batch_size
38 |         tokenizer = AutoTokenizer.from_pretrained(self.model_path)
39 |         n_positions = tokenizer.model_max_length
40 |         if hasattr(self.config, "n_positions"):
41 |             n_positions = self.config.n_positions
42 |         if hasattr(self.config, "max_position_embeddings"):
43 |             n_positions = self.config.max_position_embeddings
44 |         self.model_max_length = n_positions
45 |         self.s = min(self.fm_args.block_size, self.model_max_length)
46 |         # trainable parameters in full parameter tuning
47 |         self.num_of_model_params = l * (12 * h**2 + 13 * h) + v * h + 4 * h
48 |         self.num_of_trainable_params = self.num_of_model_params
49 | 
50 |         # optimizers supported by the transformers library
51 |         self.optimizer = OptimizerNames(self.train_args.optim)
52 |         self.precision = self._get_precision()
53 | 
54 |     def set_trainable_parameters(self, num_params):
55 |         self.num_of_trainable_params = num_params
56 | 
57 |     def set_hidden_size(self, hidden_size):
58 |         self.h = hidden_size
59 | 
60 |     def _get_precision(self) -> str:
61 |         ## TODO: expand support for other precisions mentioned in TrainingArguments
62 |         return self.fm_args.torch_dtype
63 | 
64 |     def calculate_activation_memory(self, readable: bool = False):
65 |         # see https://blog.eleuther.ai/transformer-math/#activations-and-batch-size
66 |         s = self.s
67 |         b = self.b
68 |         # a is the number of attention heads; h is the hidden dimension
69 |         a = self.a
70 |         l = self.l
71 |         h = self.h
72 |         v = self.v
73 |         # neither tensor parallelism nor sequence parallelism is considered at this point
74 |         # activations are assumed to be stored in fp16
75 |         # (https://blog.eleuther.ai/transformer-math/#activations-and-batch-size)
76 |         t = 1
77 |         # TODO: there are variations in mem usage based on activation recomputation
78 |         # we take the worst case scenario
79 |         transformer_block_size = (s * b * h * l) * (
80 |             10 + (24 / t) + (5 * (a * s) / (h * t))
81 |         )
82 |         # input embeddings + last norm + output layer
83 |         # no pipeline parallelism
84 |         v = self.config.vocab_size
85 |         p = 1
86 |         # peripheral_size = ((s*b*h*l) / t) * ((p / l) + ((p * 4 / l) * (1 + (v/h))))
87 |         # print(fmt_size(peripheral_size))
88 |         size = transformer_block_size
89 | 
90 |         if self.train_args.gradient_checkpointing:
91 |             size /= self.l
92 | 
93 |         multiplier = 1
94 |         if self.precision == "float32":
95 |             logger.debug("Memory Full - Using multiplier 2 as precision is float32.")
96 |             multiplier = 2
97 |         elif self.precision == "float16" or self.precision == "bfloat16":
98 |             logger.debug(
99 |                 "Memory Full - Using multiplier 1 as precision is bfloat16 or float16."
100 |             )
101 |             multiplier = 1
102 |         # print(s, b, h, l)
103 |         # print(fmt_size(19 * s * b * h * l))
104 |         size = size * multiplier
105 |         # print(fmt_size(size / l))
106 |         if readable:
107 |             return fmt_size(size)
108 |         return size
109 | 
110 |     def get_total_mem_estimate(self, readable: bool = False):
111 |         # see https://blog.eleuther.ai/transformer-math/#distributed-training
112 |         # TODO: fsdp is considered similar to the Deepspeed ZeRO stages in terms of memory consumption
113 |         # fsdp_sharding_strategy
114 |         # FULL_SHARD (params, optim, and gradient) == deepspeed zero 3
115 |         # SHARD_GRAD_OP (optim, and gradient) == deepspeed zero 2
116 |         # NO_SHARD == DDP / deepspeed zero 0
117 |         # HYBRID_SHARD (full shard in each node, like ddp across nodes) == deepspeed zero++ stage 3
118 | 
119 |         # however, a concrete formulation would be more helpful
120 |         size = (
121 |             self.calculate_activation_memory()
122 |             + self.calculate_gradient_memory()
123 |             + self.calculate_model_memory()
124 |             + self.calculate_optimizer_memory()
125 |         )
126 |         if readable:
127 |             return fmt_size(size)
128 |         return size
129 | 
130 |     def calculate_gradient_memory(self, readable: bool = False):
131 |         # see https://blog.eleuther.ai/transformer-math/#gradients
132 |         multiplier = 0
133 |         # TODO: gradient may not be in the same precision as the model
134 |         # NOTE: there could be mixed precision as well
135 |         # for mixed precision it is still fp32 computation
136 |         if self.precision == "float32":
137 |             multiplier = 4
138 |         elif self.precision == "float16" or self.precision == "bfloat16":
139 |             multiplier = 2
140 |         else:
141 |             raise ValueError("no support for the precision")
142 |         size = self.num_of_trainable_params * multiplier
143 |         if readable:
144 |             return fmt_size(size)
145 |         return size
146 | 
147 |     def calculate_model_memory(self, readable: bool = False):
148 |         # TODO: we did not consider mixed precision here
149 |         # see https://huggingface.co/docs/transformers/v4.25.1/en/perf_train_gpu_one
150 |         size = self.num_of_model_params * get_size_from_precision(self.precision)
151 |         if readable:
152 |             return fmt_size(size)
153 |         return size
154 | 
155 |     def calculate_optimizer_memory(self, readable: bool = False):
156 |         multiplier = 0
157 |         # check https://github.com/huggingface/transformers/issues/22101
158 |         ## check https://blog.eleuther.ai/transformer-math/#optimizer-states
159 |         ## check https://huggingface.co/docs/transformers/v4.25.1/en/perf_train_gpu_one
160 |         ## TODO: should detect 8-bit adamw if being used and compute
161 |         if self.optimizer in (OptimizerNames.ADAMW_TORCH, OptimizerNames.ADAMW_HF):
162 |             # optimizer state is a function of the gradients/parameters dtype
163 |             if self.precision == "float32":
164 |                 multiplier = 8
165 |             elif self.precision == "float16" or self.precision == "bfloat16":
166 |                 multiplier = 4
167 |         elif self.optimizer == OptimizerNames.SGD:
168 |             multiplier = 4
169 |         else:
170 |             raise NotImplementedError("computation for optimizer is not implemented")
171 |         size = self.num_of_trainable_params * multiplier
172 |         if readable:
173 |             return fmt_size(size)
174 |         return size
175 | 
--------------------------------------------------------------------------------
/fm_training_estimator/memory/full/test_full.py:
--------------------------------------------------------------------------------
1 | # Standard
2 | 
3 | # Local
4 | from ...config import parse
5 | from .full import FullParameterTuningEstimator
6 | 
7 | 
8 | def test_full():
9 |     fm, ta, _, _, _, _ = parse({})
10 |     est = FullParameterTuningEstimator(fm, ta)
11 | 
12 |     mm = 
est.calculate_model_memory() 13 | assert mm > 5 * 1_000_000_000 14 | assert mm < 15 * 1_000_000_000 15 | 16 | 17 | def test_custom_model(): 18 | fm, ta, _, _, _, _ = parse({"base_model_path": "ibm-granite/granite-8b-code-base"}) 19 | est = FullParameterTuningEstimator(fm, ta) 20 | 21 | mm = est.calculate_model_memory() 22 | assert mm > 25 * 1_000_000_000 23 | assert mm < 35 * 1_000_000_000 24 | 25 | 26 | def test_half_precision(): 27 | fm, ta, _, _, _, _ = parse( 28 | { 29 | "base_model_path": "ibm-granite/granite-8b-code-base", 30 | "torch_dtype": "float16", 31 | } 32 | ) 33 | est = FullParameterTuningEstimator(fm, ta) 34 | 35 | mm = est.calculate_model_memory() 36 | assert mm > 10 * 1_000_000_000 37 | assert mm < 20 * 1_000_000_000 38 | 39 | 40 | def test_gradient_checkpointing(): 41 | fm, ta, _, _, _, _ = parse( 42 | { 43 | "base_model_path": "ibm-granite/granite-8b-code-base", 44 | } 45 | ) 46 | est1 = FullParameterTuningEstimator(fm, ta) 47 | mm1 = est1.calculate_activation_memory() 48 | 49 | ta.gradient_checkpointing = True 50 | est2 = FullParameterTuningEstimator(fm, ta) 51 | mm2 = est2.calculate_activation_memory() 52 | 53 | assert mm2 * 10 < mm1 54 | -------------------------------------------------------------------------------- /fm_training_estimator/memory/hybrid/README.md: -------------------------------------------------------------------------------- 1 | # Hybrid Memory Estimator 2 | 3 | Uses a mix of theory, lookup and regressor, as follows. 4 | 5 | ```mermaid 6 | flowchart TD 7 | A[Input config] --> B{Is it FSDP?}; 8 | B -- No --> C[Report breakup and total from Theory]; 9 | B -- Yes --> D{Is Lookup DB available?}; 10 | D -- No --> H; 11 | D -- Yes --> E[Try Lookup]; 12 | E --> F{Data point present?}; 13 | F -- Yes --> G[Return full memory]; 14 | F -- No --> H{Is ML Model available?}; 15 | H -- No --> I[Failure]; 16 | H -- Yes --> J[Predict Activation Memory from model]; 17 | J --> K[Calculate other components from Theory]; 18 | K --> L[Report total]; 19 | ``` 20 | -------------------------------------------------------------------------------- /fm_training_estimator/memory/hybrid/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .hybrid import HybridEstimator 3 | 4 | __all__ = ["HybridEstimator"] 5 | -------------------------------------------------------------------------------- /fm_training_estimator/memory/hybrid/hybrid.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from ...config import FMArguments, HFTrainingArguments, InfraArguments, is_fsdp 3 | from ...data import format_query 4 | from ...regressor import LookupRegressor, GetRegressor 5 | from ...utils import logger 6 | from ..fsdp import FSDPEstimator 7 | from ..full import FullParameterTuningEstimator 8 | 9 | 10 | class HybridEstimator: 11 | def __init__( 12 | self, 13 | fm_args: FMArguments, 14 | train_args: HFTrainingArguments, 15 | infra_args: InfraArguments, 16 | lookup_data_path, 17 | model_path, 18 | ): 19 | 20 | logger.info("Memory Hybrid: Initializing") 21 | 22 | self.fm = fm_args 23 | self.ta = train_args 24 | self.ia = infra_args 25 | 26 | # if fsdp param is not set, set it to default 27 | if self.ia.numGpusPerPod != 1: 28 | if self.ta.fsdp == []: 29 | self.ta.fsdp = ["full_shard"] 30 | 31 | self.full_est = FullParameterTuningEstimator(fm_args, train_args) 32 | 33 | if not is_fsdp(self.ta): 34 | self.fsdp_enabled = False 35 | return 36 | 37 | # FSDP related logic 38 | self.fsdp_enabled 
= True
39 |         self.fsdp_est = FSDPEstimator(
40 |             fm_args,
41 |             train_args,
42 |             self.full_est,
43 |             infra_args.gpu_memory_in_gb * 1024 * 1024 * 1024,
44 |         )
45 | 
46 |         self.fsdp_est.set_number_of_gpus(self.ia.numGpusPerPod)
47 | 
48 |         # Lookup based estimator
49 |         if lookup_data_path is not None:
50 |             self.lookup_est = LookupRegressor(lookup_data_path)
51 |         else:
52 |             self.lookup_est = None
53 | 
54 |         # Model based estimator
55 |         if model_path is not None:
56 |             self.reg_est = GetRegressor(model_path)
57 |         else:
58 |             self.reg_est = None
59 | 
60 |         # auto-discover?
61 |         if self.ia.numGpusPerPod == 0:
62 |             self.auto_discover_num_gpus()
63 | 
64 |     def auto_discover_num_gpus(self):
65 |         """Discover the number of GPUs needed, by guessing and empirically validating."""
66 |         logger.info("Memory Hybrid - Attempting auto discovery of num gpus...")
67 | 
68 |         guess = self.fsdp_est.estimate_number_of_gpus()
69 |         trials = 10
70 | 
71 |         while trials > 0:
72 |             self.fsdp_est.set_number_of_gpus(guess)
73 |             mem = self.get_total_mem_estimate()
74 | 
75 |             # acceptable memory configuration found
76 |             if mem < self.ia.gpu_memory_in_gb * 1024**3:
77 |                 logger.debug(
78 |                     "Memory Hybrid - finalized num of gpus to: {}".format(guess)
79 |                 )
80 |                 return
81 | 
82 |             guess += 1
83 |             trials -= 1
84 | 
85 |         logger.warning("Memory Hybrid - No suitable num gpus found!")
86 |         self.fsdp_est.set_number_of_gpus(-1)
87 | 
88 |     def lookup_mem(self):
89 |         lookup_query = {
90 |             "model_name": self.fm.base_model_path,
91 |             "number_gpus": self.fsdp_est.num_gpus,
92 |             "batch_size": self.ta.per_device_train_batch_size,
93 |             "seq_len": self.fm.block_size,
94 |             "gpu_model": self.ia.gpuModel,
95 |             "method": self.fm.technique,
96 |         }
97 | 
98 |         lookup_query = format_query(lookup_query, self.lookup_est.get_data_format())
99 | 
100 |         res = self.lookup_est.run(lookup_query)
101 | 
102 |         if res.empty:
103 |             return None
104 | 
105 |         return res["memory"][0:1].item()
106 | 
107 |     def calculate_activation_memory(self):
108 |         if not self.fsdp_enabled:
109 |             return self.full_est.calculate_activation_memory()
110 | 
111 |         if self.reg_est is None:
112 |             logger.debug("Memory Hybrid - Skipping Regression")
113 |             return self.fsdp_est.calculate_activation_memory()
114 | 
115 |         logger.debug("Memory Hybrid - Attempting Regression")
116 | 
117 |         lookup_query = {
118 |             "model_name": self.fm.base_model_path,
119 |             "number_gpus": self.fsdp_est.num_gpus,
120 |             "batch_size": self.ta.per_device_train_batch_size,
121 |             "seq_len": self.fm.block_size,
122 |             "gpu_model": self.ia.gpuModel,
123 |             "method": self.fm.technique,
124 |         }
125 | 
126 |         params = format_query(
127 |             lookup_query, self.reg_est.get_data_format(), only_values=True
128 |         )
129 | 
130 |         act = self.reg_est.run(params, "memory_act")
131 | 
132 |         logger.info(
133 |             "Memory Hybrid - Activation, from regression: {}, from theory: {}".format(
134 |                 act, self.fsdp_est.calculate_activation_memory()
135 |             )
136 |         )
137 | 
138 |         return act
139 | 
140 |     def calculate_gradient_memory(self):
141 |         if not self.fsdp_enabled:
142 |             return self.full_est.calculate_gradient_memory()
143 | 
144 |         return self.fsdp_est.calculate_gradient_memory()
145 | 
146 |     def calculate_model_memory(self):
147 |         if not self.fsdp_enabled:
148 |             return self.full_est.calculate_model_memory()
149 | 
150 |         return self.fsdp_est.calculate_model_memory()
151 | 
152 |     def calculate_optimizer_memory(self):
153 |         if not self.fsdp_enabled:
154 |             return self.full_est.calculate_optimizer_memory()
155 | 
156 |         return self.fsdp_est.calculate_optimizer_memory()
157 | 
158 |     def 
get_total_mem_estimate(self): 159 | if not self.fsdp_enabled: 160 | return self.full_est.get_total_mem_estimate() 161 | 162 | # simple lookup 163 | if self.lookup_est is not None: 164 | logger.debug("Memory Hybrid - attempting lookup") 165 | lookup_mem = self.lookup_mem() 166 | if lookup_mem is not None: 167 | logger.debug("Memory Hybrid - match found") 168 | return lookup_mem 169 | 170 | logger.info("Memory Hybrid - lookup failed") 171 | 172 | size = ( 173 | self.calculate_activation_memory() 174 | + self.fsdp_est.calculate_gradient_memory() 175 | + self.fsdp_est.calculate_model_memory() 176 | + self.fsdp_est.calculate_optimizer_memory() 177 | ) 178 | 179 | return size 180 | -------------------------------------------------------------------------------- /fm_training_estimator/memory/hybrid/hybrid_test.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | from pathlib import Path 3 | 4 | # Local 5 | from ...config import parse 6 | from ...regressor import XGBoostRegressor 7 | from .hybrid import HybridEstimator 8 | 9 | test_data2 = (Path(__file__).parent / "../../regressor/test_data/data2.csv").as_posix() 10 | test_data3 = (Path(__file__).parent / "../../regressor/test_data/data3.csv").as_posix() 11 | 12 | 13 | def test_hybrid(tmp_path): 14 | 15 | model_path = tmp_path / "test.model.json" 16 | reg = XGBoostRegressor() 17 | reg.train(test_data2, model_path, ["tokens_per_second", "memory", "memory_act"]) 18 | 19 | fm, ta, ia, _, _, _ = parse( 20 | { 21 | "base_model_path": "ibm-granite/granite-7b-base", 22 | "gpu_memory_in_gb": 80, 23 | "fsdp": "full_shard", 24 | "per_device_train_batch_size": 4, 25 | "block_size": 512, 26 | "numGpusPerPod": 2, 27 | } 28 | ) 29 | 30 | est = HybridEstimator(fm, ta, ia, test_data2, model_path) 31 | # Direct lookup example 32 | assert est.get_total_mem_estimate() == 20 33 | 34 | fm, ta, ia, _, _, _ = parse( 35 | { 36 | "base_model_path": "ibm-granite/granite-7b-base", 37 | "gpu_memory_in_gb": 80, 38 | "fsdp": "full_shard", 39 | "per_device_train_batch_size": 4, 40 | "block_size": 512, 41 | "numGpusPerPod": 3, 42 | } 43 | ) 44 | 45 | est = HybridEstimator(fm, ta, ia, test_data2, model_path) 46 | # Lookup fails - uses Reg based approach 47 | assert est.get_total_mem_estimate() >= 10 * 1024 * 1024 * 1024 48 | 49 | grad_mem = est.calculate_gradient_memory() 50 | model_mem = est.calculate_model_memory() 51 | assert grad_mem >= 7 * 1024 * 1024 * 1024 52 | assert model_mem >= 7 * 1024 * 1024 * 1024 53 | assert grad_mem == model_mem 54 | 55 | 56 | def test_use_model_features(tmp_path): 57 | 58 | model_path = tmp_path / "test.model.json" 59 | reg = XGBoostRegressor() 60 | reg.train(test_data3, model_path, ["tokens_per_second", "memory", "memory_act"]) 61 | 62 | fm, ta, ia, _, _, _ = parse( 63 | { 64 | "base_model_path": "ibm-granite/granite-7b-base", 65 | "gpu_memory_in_gb": 80, 66 | "fsdp": "full_shard", 67 | "per_device_train_batch_size": 16, 68 | "block_size": 1024, 69 | "numGpusPerPod": 4, 70 | } 71 | ) 72 | 73 | est = HybridEstimator(fm, ta, ia, test_data3, None) 74 | 75 | # Direct lookup example should work as before 76 | assert est.get_total_mem_estimate() == 20 77 | 78 | fm, ta, ia, _, _, _ = parse( 79 | { 80 | "base_model_path": "ibm-granite/granite-8b-code-base", 81 | "gpu_memory_in_gb": 80, 82 | "fsdp": "full_shard", 83 | "per_device_train_batch_size": 16, 84 | "block_size": 1024, 85 | "numGpusPerPod": 4, 86 | } 87 | ) 88 | 89 | est = HybridEstimator(fm, ta, ia, test_data3, model_path) 90 | 91 | # Regression 
- based on model params
92 |     # though we only provide the model name as input here, we get predictions based on its features
93 |     assert est.calculate_activation_memory() < 30
94 | 
--------------------------------------------------------------------------------
/fm_training_estimator/memory/lora/__init__.py:
--------------------------------------------------------------------------------
1 | # Local
2 | from .hybrid import HybridLoraEstimator
3 | from .lora import LoraEstimator
4 | 
5 | __all__ = ["LoraEstimator", "HybridLoraEstimator"]
6 | 
--------------------------------------------------------------------------------
/fm_training_estimator/memory/lora/hybrid.py:
--------------------------------------------------------------------------------
1 | # Local
2 | from ...config import FMArguments, HFTrainingArguments, InfraArguments, PeftLoraConfig
3 | from ...data import format_query
4 | from ...regressor import LookupRegressor, GetRegressor
5 | from ...utils import logger
6 | from .lora import LoraEstimator
7 | 
8 | 
9 | class HybridLoraEstimator:
10 |     def __init__(
11 |         self,
12 |         fm_args: FMArguments,
13 |         train_args: HFTrainingArguments,
14 |         infra_args: InfraArguments,
15 |         lora_args: PeftLoraConfig,
16 |         lookup_data_path,
17 |         model_path,
18 |     ):
19 | 
20 |         logger.info("Memory Lora Hybrid - Initializing")
21 | 
22 |         self.fm = fm_args
23 |         self.ta = train_args
24 |         self.ia = infra_args
25 | 
26 |         self.lora_est = LoraEstimator(fm_args, train_args, lora_args)
27 | 
28 |         # Lookup based estimator
29 |         if lookup_data_path is not None:
30 |             self.lookup_est = LookupRegressor(lookup_data_path)
31 |         else:
32 |             self.lookup_est = None
33 | 
34 |         # Model based estimator
35 |         if model_path is not None:
36 |             self.reg_est = GetRegressor(model_path)
37 |         else:
38 |             self.reg_est = None
39 | 
40 |         if self.ia.numGpusPerPod == 0:
41 |             # discover the number of gpus
42 |             self.auto_discover_num_gpus()
43 |         else:
44 |             self.num_gpus = self.ia.numGpusPerPod
45 | 
46 |     def auto_discover_num_gpus(self):
47 |         num = self.lora_est.calculate_model_memory() / (
48 |             self.ia.gpu_memory_in_gb * 1024**3
49 |         )
50 |         self.num_gpus = int(num) if num > 1 else 1
51 | 
52 |         trials = 10
53 |         while trials > 0:
54 |             mem = self.get_total_mem_estimate()
55 |             if mem < self.ia.gpu_memory_in_gb * 1024**3:
56 |                 logger.info(
57 |                     "Memory Lora Hybrid - Discovered num gpus: {0}".format(
58 |                         self.num_gpus
59 |                     )
60 |                 )
61 |                 return
62 | 
63 |             trials -= 1
64 |             self.num_gpus += 1
65 | 
66 |         logger.warning("Memory Lora Hybrid - No suitable num gpus found!")
67 | 
68 |     def calculate_model_memory(self):
69 |         return self.lora_est.calculate_model_memory() / self.num_gpus
70 | 
71 |     def calculate_gradient_memory(self):
72 |         return self.lora_est.calculate_gradient_memory() / self.num_gpus
73 | 
74 |     def calculate_optimizer_memory(self):
75 |         return self.lora_est.calculate_optimizer_memory() / self.num_gpus
76 | 
77 |     def calculate_activation_memory(self):
78 |         return self.lora_est.calculate_activation_memory() / self.num_gpus
79 | 
80 |     def get_total_mem_estimate(self):
81 | 
82 |         lookup_query_base = {
83 |             "model_name": self.fm.base_model_path,
84 |             "number_gpus": self.num_gpus,
85 |             "batch_size": self.ta.per_device_train_batch_size,
86 |             "seq_len": self.fm.block_size,
87 |             "gpu_model": self.ia.gpuModel,
88 |             "method": self.fm.technique,
89 |         }
90 | 
91 |         if self.lookup_est is not None:
92 |             logger.debug("Memory Lora Hybrid - Attempting lookup")
93 |             lookup_query = format_query(
94 |                 lookup_query_base, self.lookup_est.get_data_format()
95 |             )
96 |             logger.debug("Memory Lora Hybrid 
- Lookup query for lookup_est is: %s", lookup_query) 97 | res = self.lookup_est.run(lookup_query) 98 | if res.empty: 99 | lookup_mem = None 100 | logger.debug( 101 | "Memory Lora Hybrid - No match was found by lookup, trying reg_est" 102 | ) 103 | else: 104 | lookup_mem = res["memory"][0:1].item() 105 | if lookup_mem is not None: 106 | logger.info("Memory Lora Hybrid - Lookup: match found") 107 | return lookup_mem 108 | 109 | if self.reg_est is not None: 110 | params = format_query( 111 | lookup_query_base, self.reg_est.get_data_format(), only_values=True 112 | ) 113 | logger.debug("Memory Lora Hybrid - Lookup query for reg_est is: %s", params) 114 | act = self.reg_est.run(params, "memory") 115 | logger.debug("Memory Lora Hybrid - Lookup query result for reg_est is: %s", act) 116 | 117 | return act 118 | 119 | # If we reach here, we are falling back on theory 120 | size = ( 121 | self.calculate_activation_memory() 122 | + self.lora_est.calculate_gradient_memory() 123 | + self.lora_est.calculate_model_memory() 124 | + self.lora_est.calculate_optimizer_memory() 125 | ) 126 | 127 | return size 128 | -------------------------------------------------------------------------------- /fm_training_estimator/memory/lora/lora.py: -------------------------------------------------------------------------------- 1 | # Third Party 2 | from accelerate import init_empty_weights 3 | from peft import LoraConfig, get_peft_model 4 | from transformers import AutoConfig, AutoModelForCausalLM 5 | 6 | # Local 7 | from ...config import FMArguments, HFTrainingArguments, PeftLoraConfig 8 | from ...utils import fmt_size, get_size_from_precision, logger 9 | from ..full import FullParameterTuningEstimator 10 | 11 | 12 | class LoraEstimator(FullParameterTuningEstimator): 13 | def __init__( 14 | self, 15 | fm_args: FMArguments, 16 | train_args: HFTrainingArguments, 17 | lora_args: PeftLoraConfig, 18 | ): 19 | super().__init__(fm_args, train_args) 20 | 21 | self.train_args = train_args 22 | self.fm_args = fm_args 23 | self.lora_args = lora_args 24 | 25 | with init_empty_weights(): 26 | modelc = AutoConfig.from_pretrained(self.fm_args.base_model_path) 27 | model = AutoModelForCausalLM.from_config(modelc) 28 | 29 | logger.info("Initializing LoraEstimator with lora args %s", self.lora_args) 30 | self.peft_model = get_peft_model( 31 | model, 32 | LoraConfig( 33 | r=self.lora_args.r, 34 | lora_alpha=self.lora_args.lora_alpha, 35 | lora_dropout=self.lora_args.lora_dropout, 36 | target_modules=self.lora_args.target_modules, 37 | ), 38 | ) 39 | 40 | self.num_of_trainable_params = self.peft_model.num_parameters( 41 | only_trainable=True 42 | ) 43 | self.num_of_model_params = self.peft_model.num_parameters() 44 | 45 | self.precision = self._get_precision() 46 | 47 | def calculate_activation_memory(self, readable=False): 48 | # tensors created during forward pass that are needed for gradient computation 49 | # outputs have to be stored which will be used during backward pass 50 | peft_model_state = self.peft_model.state_dict() 51 | lora_a = [] 52 | lora_b = [] 53 | lora_dropout = [] 54 | either_q_k_v_present = False 55 | for k in peft_model_state: 56 | if "lora_A" in k: 57 | lora_a.append(peft_model_state[k]) 58 | if "lora_B" in k: 59 | lora_b.append(peft_model_state[k]) 60 | if "lora_dropout" in k: 61 | lora_dropout.append(peft_model_state[k]) 62 | if "self_attn" in k: 63 | either_q_k_v_present = True 64 | # for each trainable linear layer 65 | # input_features * batch_size * seq_length elements needed for each layer 66 | 
lora_a_size = 0 67 | lora_b_size = 0 68 | lora_dropout_size = 0 69 | # single shared input for Q K V matrices 70 | input_size = 0 71 | if either_q_k_v_present: 72 | input_size = ( 73 | self.h * self.b * self.s * get_size_from_precision(self.precision) 74 | ) 75 | for lora_a_i in lora_a: 76 | lora_a_size += ( 77 | lora_a_i.size()[1] 78 | * self.b 79 | * self.s 80 | * get_size_from_precision(self.precision) 81 | ) 82 | for lora_b_i in lora_b: 83 | lora_b_size += ( 84 | lora_b_i.size()[1] 85 | * self.b 86 | * self.s 87 | * get_size_from_precision(self.precision) 88 | ) 89 | for lora_dropout_i in lora_dropout: 90 | lora_dropout_size += lora_dropout_i.size()[1] * self.b * self.s 91 | # ignored 2 layer normalization layers and softmax 92 | size = input_size + lora_a_size + lora_b_size + lora_dropout_size 93 | if readable: 94 | return fmt_size(size) 95 | return size 96 | -------------------------------------------------------------------------------- /fm_training_estimator/memory/lora/test_lora.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | 3 | # Local 4 | from ...config import parse 5 | from ...utils import fmt_size 6 | from .lora import LoraEstimator 7 | 8 | 9 | def test_lora(): 10 | fm, ta, _, _, la, _ = parse( 11 | { 12 | "base_model_path": "codellama/CodeLlama-13b-hf", 13 | "per_device_train_batch_size": 1, 14 | "torch_dtype": "bfloat16", 15 | "r": 8, 16 | } 17 | ) 18 | est = LoraEstimator(fm, ta, la) 19 | 20 | assert est.calculate_optimizer_memory() < 100 * 1_000_000 21 | -------------------------------------------------------------------------------- /fm_training_estimator/memory/qlora/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .hybrid import HybridQLoraEstimator 3 | from .qlora import QLoraEstimator 4 | 5 | __all__ = ["QLoraEstimator", "HybridQLoraEstimator"] 6 | -------------------------------------------------------------------------------- /fm_training_estimator/memory/qlora/hybrid.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from ...config import ( 3 | FMArguments, 4 | HFTrainingArguments, 5 | InfraArguments, 6 | PeftLoraConfig, 7 | PeftQLoraConfig, 8 | ) 9 | from ...data import format_query 10 | from ...regressor import LookupRegressor, GetRegressor 11 | from ...utils import logger 12 | from .qlora import QLoraEstimator 13 | 14 | 15 | class HybridQLoraEstimator: 16 | def __init__( 17 | self, 18 | fm_args: FMArguments, 19 | train_args: HFTrainingArguments, 20 | infra_args: InfraArguments, 21 | lora_args: PeftLoraConfig, 22 | qlora_args: PeftQLoraConfig, 23 | lookup_data_path, 24 | model_path, 25 | ): 26 | 27 | logger.info("Memory QLoRA Hybrid - Initializing") 28 | 29 | self.fm = fm_args 30 | self.ta = train_args 31 | self.ia = infra_args 32 | 33 | self.qlora_est = QLoraEstimator(fm_args, train_args, lora_args, qlora_args) 34 | 35 | # Lookup based estimator 36 | if lookup_data_path is not None: 37 | self.lookup_est = LookupRegressor(lookup_data_path) 38 | else: 39 | self.lookup_est = None 40 | 41 | # Model based estimator 42 | if model_path is not None: 43 | self.reg_est = GetRegressor(model_path) 44 | else: 45 | self.reg_est = None 46 | 47 | if self.ia.numGpusPerPod == 0: 48 | # discover number of gpus 49 | self.auto_discover_num_gpus() 50 | else: 51 | self.num_gpus = self.ia.numGpusPerPod 52 | 53 | def auto_discover_num_gpus(self): 54 | num = self.qlora_est.calculate_model_memory() / ( 55 | 
self.ia.gpu_memory_in_gb * 1024**3 56 | ) 57 | self.num_gpus = int(num) if num > 1 else 1 58 | 59 | trials = 10 60 | while trials > 0: 61 | mem = self.get_total_mem_estimate() 62 | if mem < self.ia.gpu_memory_in_gb * 1024**3: 63 | logger.debug( 64 | "Memory QLoRA Hybrid - Discovered num gpus: {0}".format( 65 | self.num_gpus 66 | ) 67 | ) 68 | return 69 | 70 | trials -= 1 71 | self.num_gpus += 1 72 | 73 | logger.warning("Memory QLoRA Hybrid - No suitable num gpus found!") 74 | 75 | def calculate_model_memory(self): 76 | return self.qlora_est.calculate_model_memory() / self.num_gpus 77 | 78 | def calculate_gradient_memory(self): 79 | return self.qlora_est.calculate_gradient_memory() / self.num_gpus 80 | 81 | def calculate_optimizer_memory(self): 82 | return self.qlora_est.calculate_optimizer_memory() / self.num_gpus 83 | 84 | def calculate_activation_memory(self): 85 | return self.qlora_est.calculate_activation_memory() / self.num_gpus 86 | 87 | def get_total_mem_estimate(self): 88 | 89 | lookup_query_base = { 90 | "model_name": self.fm.base_model_path, 91 | "number_gpus": self.num_gpus, 92 | "batch_size": self.ta.per_device_train_batch_size, 93 | "seq_len": self.fm.block_size, 94 | "gpu_model": self.ia.gpuModel, 95 | "method": self.fm.technique, 96 | } 97 | 98 | if self.lookup_est is not None: 99 | logger.debug("Memory QLoRA Hybrid - attempting lookup") 100 | lookup_query = format_query( 101 | lookup_query_base, self.lookup_est.get_data_format() 102 | ) 103 | res = self.lookup_est.run(lookup_query) 104 | if res.empty: 105 | lookup_mem = None 106 | else: 107 | lookup_mem = res["memory"][0:1].item() 108 | if lookup_mem is not None: 109 | logger.debug("Memory QLoRA Hybrid - match found") 110 | return lookup_mem 111 | 112 | if self.reg_est is not None: 113 | params = format_query( 114 | lookup_query_base, self.reg_est.get_data_format(), only_values=True 115 | ) 116 | act = self.reg_est.run(params, "memory") 117 | 118 | return act 119 | 120 | # No fall back here 121 | -------------------------------------------------------------------------------- /fm_training_estimator/memory/qlora/qlora.py: -------------------------------------------------------------------------------- 1 | # Third Party 2 | from accelerate import init_empty_weights 3 | from peft import LoraConfig, get_peft_model 4 | from transformers import AutoConfig, AutoModelForCausalLM 5 | 6 | # Local 7 | from ...config import FMArguments, HFTrainingArguments, PeftLoraConfig, PeftQLoraConfig 8 | from ...utils import fmt_size, get_size_from_precision 9 | from ..full import FullParameterTuningEstimator 10 | 11 | 12 | class QLoraEstimator(FullParameterTuningEstimator): 13 | def __init__( 14 | self, 15 | fm_args: FMArguments, 16 | train_args: HFTrainingArguments, 17 | lora_args: PeftLoraConfig, 18 | qlora_args: PeftQLoraConfig, 19 | ): 20 | super().__init__(fm_args, train_args) 21 | 22 | self.train_args = train_args 23 | self.fm_args = fm_args 24 | self.lora_args = lora_args 25 | self.qlora_args = qlora_args 26 | 27 | with init_empty_weights(): 28 | modelc = AutoConfig.from_pretrained(self.fm_args.base_model_path) 29 | model = AutoModelForCausalLM.from_config(modelc) 30 | 31 | # cast our lora config dataclass instance into the real peft dataclass fmt 32 | self.peft_model = get_peft_model(model, LoraConfig(**self.lora_args.__dict__)) 33 | 34 | self.num_of_trainable_params = self.peft_model.num_parameters( 35 | only_trainable=True 36 | ) 37 | self.num_of_model_params = self.peft_model.num_parameters() 38 | 39 | self.precision = 
self._get_precision() 40 | 41 | def calculate_model_memory(self, readable=False): 42 | # See QLora paper https://arxiv.org/pdf/2305.14314 43 | # Quantization overhead for each model parameter of 0.5 bits or 0.0625 bytes. 44 | # If double quantization is enabled this can be further brought down to 0.127 bits or 0.015875 45 | 46 | if self.qlora_args.use_double_quant: 47 | size = self.num_of_model_params * ( 48 | get_size_from_precision(self.qlora_args.quant_type) + 0.015875 49 | ) 50 | else: 51 | size = self.num_of_model_params * ( 52 | get_size_from_precision(self.qlora_args.quant_type) + 0.0625 53 | ) 54 | 55 | if readable: 56 | return fmt_size(size) 57 | return size 58 | 59 | def calculate_activation_memory(self, readable=False): 60 | # tensors created during forward pass that are needed for gradient computation 61 | # outputs have to be stored which will be used during backward pass 62 | 63 | # TODO: this is currently same as LoRA, since theoretically tensors created during the forward pass are the same 64 | peft_model_state = self.peft_model.state_dict() 65 | lora_a = [] 66 | lora_b = [] 67 | lora_dropout = [] 68 | either_q_k_v_present = False 69 | for k in peft_model_state: 70 | if "lora_A" in k: 71 | lora_a.append(peft_model_state[k]) 72 | if "lora_B" in k: 73 | lora_b.append(peft_model_state[k]) 74 | if "lora_dropout" in k: 75 | lora_dropout.append(peft_model_state[k]) 76 | if "self_attn" in k: 77 | either_q_k_v_present = True 78 | # for each trainable linear layer 79 | # input_features * batch_size * seq_length elements needed for each layer 80 | lora_a_size = 0 81 | lora_b_size = 0 82 | lora_dropout_size = 0 83 | # single shared input for Q K V matrices 84 | input_size = 0 85 | if either_q_k_v_present: 86 | input_size = ( 87 | self.h * self.b * self.s * get_size_from_precision(self.precision) 88 | ) 89 | for lora_a_i in lora_a: 90 | lora_a_size += ( 91 | lora_a_i.size()[1] 92 | * self.b 93 | * self.s 94 | * get_size_from_precision(self.precision) 95 | ) 96 | for lora_b_i in lora_b: 97 | lora_b_size += ( 98 | lora_b_i.size()[1] 99 | * self.b 100 | * self.s 101 | * get_size_from_precision(self.precision) 102 | ) 103 | for lora_dropout_i in lora_dropout: 104 | lora_dropout_size += lora_dropout_i.size()[1] * self.b * self.s 105 | # ignored 2 layer normalization layers and softmax 106 | size = input_size + lora_a_size + lora_b_size + lora_dropout_size 107 | if readable: 108 | return fmt_size(size) 109 | return size 110 | -------------------------------------------------------------------------------- /fm_training_estimator/memory/qlora/test_qlora.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | 3 | # Local 4 | from ...config import parse 5 | from ...utils import fmt_size 6 | from .qlora import QLoraEstimator 7 | 8 | def test_qlora(): 9 | fm, ta, _, _, la, qla = parse( 10 | { 11 | "base_model_path": "codellama/CodeLlama-13b-hf", 12 | "per_device_train_batch_size": 1, 13 | "torch_dtype": "bfloat16", 14 | "r": 8, 15 | "use_double_quant": False, 16 | } 17 | ) 18 | est = QLoraEstimator(fm, ta, la, qla) 19 | 20 | assert est.calculate_optimizer_memory() < 100 * 1_000_000 21 | -------------------------------------------------------------------------------- /fm_training_estimator/regressor/README.md: -------------------------------------------------------------------------------- 1 | # regressor 2 | 3 | This folder contains a few important pieces. 4 | 5 | 1. The format of data expected 6 | 2. 
Lookup module: to directly lookup an input configuration from existing dataset 7 | 3. XGBoost module: XGBoost based regressor 8 | 9 | More regression modules can be added to this folder and then used in the various estimator modules. For example, the current modules are used in `memory/hybrid` and `throughput/hybrid`. 10 | 11 | As more ML based modules are added here, the interfaces will be locked in. For now, the example to follow is the XGBoost module. 12 | 13 | ## Data formats 14 | 15 | Data has to be in a common format for training regression modules and for runtime invocations. This is needed so that at runtime, we are able to correctly format the query to the lookup and regression modules. 16 | 17 | Refer to the `data/` module for details on the data formats. 18 | -------------------------------------------------------------------------------- /fm_training_estimator/regressor/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .lookup import LookupRegressor 3 | from .xgboost import XGBoostRegressor 4 | from .linear import LinearRegressor 5 | from .arise import AriseRegressor 6 | 7 | from .dispatch import GetRegressor 8 | -------------------------------------------------------------------------------- /fm_training_estimator/regressor/arise/README.md: -------------------------------------------------------------------------------- 1 | # Arise Regressor 2 | 3 | Train the regressor: 4 | ``` 5 | python -m fm_training_estimator.regressor.arise.train 6 | ``` 7 | -------------------------------------------------------------------------------- /fm_training_estimator/regressor/arise/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .arise import AriseRegressor 3 | 4 | __all__ = ["AriseRegressor"] 5 | -------------------------------------------------------------------------------- /fm_training_estimator/regressor/arise/arise.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import shutil 4 | import pandas 5 | import yaml 6 | import zipfile 7 | from functools import lru_cache 8 | 9 | from arise_predictions.preprocessing import job_parser 10 | from arise_predictions.utils import constants, utils 11 | from arise_predictions.auto_model.build_models import auto_build_models, get_estimators_config 12 | from arise_predictions.perform_predict.predict import demo_predict, get_predict_config_from_dict 13 | 14 | from ...data import lookup_format_version, get_format_by_version 15 | 16 | class AriseRegressor: 17 | def __init__(self, model_path=None): 18 | self.model_path = model_path 19 | 20 | def preprocess(self, workdir, job_spec): 21 | inputs = sorted(list(job_spec[0])) 22 | outputs = sorted(list(job_spec[1])) 23 | start_time_field_name = job_spec[2] 24 | end_time_field_name = job_spec[3] 25 | job_parser_class_name = job_spec[4] 26 | job_entry_filter = job_spec[5] 27 | feature_engineering = job_spec[6] if len(job_spec) > 6 else None 28 | metadata_parser_class_name = job_spec[7] if len(job_spec) > 7 else None 29 | history_file = os.path.join(workdir, constants.JOB_HISTORY_FILE_NAME + ".csv") 30 | 31 | history_data, history_file = job_parser.collect_jobs_history( 32 | os.path.join(workdir, constants.JOB_DATA_DIR), workdir, inputs, outputs, 33 | start_time_field_name, end_time_field_name, None, job_parser_class_name, 34 | job_entry_filter, feature_engineering, metadata_parser_class_name, 35 | workdir) 36 | 
return history_data, history_file 37 | 38 | def execute_build(self, workdir, js, config_path): 39 | 40 | history_data, history_file = self.preprocess(workdir, js) 41 | outputs = sorted(list(js[1])) 42 | output_path = os.path.join(workdir, constants.AM_OUTPUT_PATH_SUFFIX) 43 | 44 | auto_build_models(raw_data=history_data, 45 | config=get_estimators_config(config_path, num_jobs=-1), 46 | target_variables=outputs, 47 | output_path=output_path, 48 | leave_one_out_cv=None, 49 | feature_col=None, 50 | low_threshold=None, 51 | high_threshold=None, 52 | single_output_file=True, 53 | randomized_hpo=False, 54 | n_random_iter=False) 55 | 56 | def train(self, data_path: str, model_path: str, config_path: str, y_headers: list[str]): 57 | with tempfile.TemporaryDirectory() as workdir: 58 | print(workdir) 59 | 60 | datadir = os.path.join(workdir, "data") 61 | os.mkdir(datadir) 62 | shutil.copy2(data_path, datadir) 63 | 64 | # we only need the headers, so we read just a single row 65 | data = pandas.read_csv(data_path, nrows=1) 66 | # these 2 lines are for calc data version needed by manager module 67 | data_keys = ",".join(list(data.columns.values)) 68 | data_version = lookup_format_version(data_keys) 69 | # this is used for arise 70 | x_headers = list(set(data.columns.values) - set(y_headers)) 71 | 72 | # prepare the job spec file 73 | job_spec = {"job-metadata-inputs": {}, "job-metadata-outputs": y_headers} 74 | for h in x_headers: 75 | job_spec["job-metadata-inputs"][h] = 0 76 | 77 | js = job_parser.parse_job_spec(job_spec) 78 | 79 | # pre-emptively create output dir for arise 80 | output_path = os.path.join(workdir, constants.AM_OUTPUT_PATH_SUFFIX) 81 | utils.mkdirs(output_path) 82 | # normally, arise saves job spec into the model file 83 | # but, we skip it here 84 | # save the data version into a file in arise model 85 | with open(os.path.join(output_path, "estimator_data_version"), "w") as f: 86 | f.write(data_version) 87 | 88 | # save the model type also here 89 | with open(os.path.join(output_path, "model_type"), "w") as f: 90 | f.write("arise") 91 | 92 | self.execute_build(workdir, js, config_path) 93 | 94 | # copy the model to required destination 95 | shutil.copy2(os.path.join(workdir, "ARISE-auto-models.zip"), model_path) 96 | 97 | def get_columns(self): 98 | col_str = get_format_by_version(self.get_data_format()).X 99 | return col_str.split(",") 100 | 101 | def execute_predict(self, workdir, js, predict_config, model_file_name): 102 | return demo_predict( 103 | original_data=None, 104 | config=get_predict_config_from_dict(predict_config), 105 | estimator_path=model_file_name, 106 | feature_engineering=js[6], 107 | metadata_parser_class_name=js[7], 108 | metadata_path=model_file_name, 109 | output_path=os.path.join(workdir, constants.PRED_OUTPUT_PATH_SUFFIX)) 110 | 111 | def run(self, X, y): 112 | cols = self.get_columns() 113 | input_vars = [] 114 | for k, v in zip(cols, X): 115 | input_vars.append({k: v}) 116 | 117 | with tempfile.TemporaryDirectory() as workdir: 118 | # TODO: get the right value for the "greater is better" variable 119 | est_config = {"target_variable": y, "greater_is_better": True} 120 | predict_config = {"fixed_values": input_vars, 121 | "variable_values": [], 122 | "estimators": [est_config]} 123 | 124 | job_spec = {"job-metadata-inputs": {}, "job-metadata-outputs": [y]} 125 | for h in cols: 126 | job_spec["job-metadata-inputs"][h] = 0 127 | 128 | shutil.copy2(self.model_path, workdir) 129 | model_file_name = os.path.join(workdir, os.path.basename(self.model_path)) 130 
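            # the model zip is now in the scratch dir; build the ARISE job spec and run the prediction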
131 |             js = job_parser.parse_job_spec(job_spec)
132 |             self.execute_predict(workdir, js, predict_config, model_file_name)
133 |
134 |             # now read the result
135 |             res_file = os.path.join(workdir, "ARISE-predictions", "all-predictions.csv")
136 |             res = pandas.read_csv(res_file)
137 |
138 |             return res[y][0]
139 |
140 |     @lru_cache
141 |     def get_data_format(self):
142 |         with zipfile.ZipFile(self.model_path) as model_zip:
143 |             with model_zip.open("estimator_data_version", 'r') as edv:
144 |                 dv = edv.read().decode()
145 |
146 |         return dv
147 |
-------------------------------------------------------------------------------- /fm_training_estimator/regressor/arise/train.py: --------------------------------------------------------------------------------
1 | # Third Party
2 | import fire
3 |
4 | # Local
5 | from .arise import AriseRegressor
6 |
7 | def train(data_path: str, model_path: str, config_path: str, y_headers: list[str]):
8 |     """Train an AriseRegressor model that can be used by this estimator library.
9 |
10 |     Args:
11 |         data_path (str): the path to training data
12 |         model_path (str): the output path of the trained model. Must end with .zip.
13 |         config_path (str): the path to an ARISE training config file. See: https://github.com/arise-insights/arise-predictions/tree/main/config for examples.
14 |         y_headers (list[str]): list of column names to drop from data
15 |
16 |     """
17 |     model = AriseRegressor()
18 |
19 |     if not model_path.endswith(".zip"):
20 |         print("model_path must have a .zip extension.")
21 |         print("Refusing to continue!!")
22 |         return
23 |
24 |     print("Training model...")
25 |     model.train(data_path, model_path, config_path, y_headers)
26 |     print("...successfully wrote model to file: ", model_path)
27 |
28 |
29 | if __name__ == "__main__":
30 |     fire.Fire(train)
31 |
-------------------------------------------------------------------------------- /fm_training_estimator/regressor/dispatch.py: --------------------------------------------------------------------------------
1 | import zipfile
2 |
3 | from .xgboost import XGBoostRegressor
4 | from .linear import LinearRegressor
5 | from .arise import AriseRegressor
6 |
7 | def GetRegressor(model_path):
8 |     with zipfile.ZipFile(model_path, mode='r') as model_zip:
9 |         mt = model_zip.read("model_type").decode()
10 |
11 |     if mt == "linear":
12 |         return LinearRegressor(model_path)
13 |     elif mt == "xgboost":
14 |         return XGBoostRegressor(model_path)
15 |     elif mt == "arise":
16 |         return AriseRegressor(model_path)
17 |     else:
18 |         raise ValueError("Unknown model type found", mt)
19 |
-------------------------------------------------------------------------------- /fm_training_estimator/regressor/linear/__init__.py: --------------------------------------------------------------------------------
1 | # Local
2 | from .linear import LinearRegressor
3 |
4 | __all__ = ["LinearRegressor"]
5 |
-------------------------------------------------------------------------------- /fm_training_estimator/regressor/linear/linear.py: --------------------------------------------------------------------------------
1 | import os
2 | import zipfile
3 | import tempfile
4 |
5 | # Third Party
6 | import pandas
7 | from sklearn.ensemble import RandomForestRegressor
8 | from sklearn.preprocessing import OneHotEncoder
9 | import joblib
10 |
11 | # Local
12 | from ...data import lookup_format_version, get_format_by_version
13 |
14 |
15 | class LinearRegressor:
16 |     def __init__(self, model_path=None):
        # NOTE: despite the class name, this regressor is currently backed by a
        # RandomForestRegressor; its saved bundle is tagged model_type "linear",
        # which is what dispatch.py keys on
17 |         self.model = RandomForestRegressor()
18 |         self.cat_enc =
OneHotEncoder(sparse_output=False).set_output(transform="pandas") 19 | 20 | if model_path is not None: 21 | self.load(model_path) 22 | 23 | def load(self, model_path): 24 | with tempfile.TemporaryDirectory() as mdir: 25 | with zipfile.ZipFile(model_path) as model_zip: 26 | model_zip.extractall(mdir) 27 | 28 | path_m = os.path.join(mdir, "model.json") 29 | self.model = joblib.load(path_m) 30 | 31 | path_e = os.path.join(mdir, "cat_enc.json") 32 | self.cat_enc = joblib.load(path_e) 33 | 34 | def train(self, data_path: str, model_path: str, y_headers: list[str]): 35 | data = pandas.read_csv(data_path) 36 | 37 | # prepare the pure list of X for saving as metadata later 38 | X_cols = list(data.drop(columns=y_headers).columns) 39 | 40 | # obtain the data format metadata 41 | data_keys = ",".join(list(data.columns.values)) 42 | 43 | # encode category columns 44 | cat_feats = data.dtypes[data.dtypes=='object'].index.values.tolist() 45 | ecats = self.cat_enc.fit_transform(data[cat_feats]) 46 | 47 | data = data.drop(columns=cat_feats) 48 | data = pandas.concat([data, ecats], axis=1) 49 | 50 | X = data.drop(columns=y_headers) 51 | Y = data[y_headers] 52 | 53 | self.model.fit(X, Y) 54 | 55 | # save the feature names into the model 56 | self.model.metadata = {} 57 | self.model.metadata['feature_names'] = X_cols 58 | # save the data format 59 | self.model.metadata['data_format_version']=lookup_format_version(data_keys) 60 | 61 | with ( tempfile.NamedTemporaryFile(suffix='.json', mode='w') as buf_m, 62 | tempfile.NamedTemporaryFile(suffix='.json', mode='w') as buf_e, 63 | tempfile.NamedTemporaryFile(mode='w') as buf_mt, 64 | zipfile.ZipFile(model_path, mode='w') as model_zip 65 | ): 66 | 67 | # save model to tmp buffer 68 | joblib.dump(self.model, buf_m.name) 69 | # save encoder into tmp buffer 70 | joblib.dump(self.cat_enc, buf_e.name) 71 | # save model type to file 72 | with open(buf_mt.name, 'w') as f: 73 | f.write("linear") 74 | 75 | # now move the files into the zip file 76 | model_zip.write(buf_m.name, 'model.json') 77 | model_zip.write(buf_e.name, 'cat_enc.json') 78 | model_zip.write(buf_mt.name, 'model_type') 79 | 80 | def run(self, X, y): 81 | # convert input data array into form suitable to feed in 82 | 83 | # add column names 84 | data = pandas.DataFrame([X], columns=self.model.metadata['feature_names']) 85 | 86 | # encode category columns 87 | cat_feats = data.dtypes[data.dtypes=='object'].index.values.tolist() 88 | ecats = self.cat_enc.transform(data[cat_feats]) 89 | 90 | data = data.drop(columns=cat_feats) 91 | data = pandas.concat([data, ecats], axis=1) 92 | 93 | res = self.model.predict(data) 94 | col_idx = get_format_by_version(self.get_data_format()).Y.split(",").index(y) 95 | return res[0][col_idx] 96 | 97 | def get_data_format(self): 98 | return self.model.metadata["data_format_version"] 99 | -------------------------------------------------------------------------------- /fm_training_estimator/regressor/linear/train.py: -------------------------------------------------------------------------------- 1 | # Third Party 2 | import fire 3 | 4 | # Local 5 | from .linear import LinearRegressor 6 | 7 | 8 | def train(data_path: str, model_path: str, y_headers: list[str]): 9 | """Train a LinearRegressor model that can be used by this estimator library. 10 | 11 | Args: 12 | data_path (str): the path to training data 13 | model_path (str): the output path of trained model. Must end with .zip. 
14 | y_headers (list[str]): list of column names to drop from data 15 | 16 | """ 17 | model = LinearRegressor() 18 | 19 | if not model_path.endswith(".zip"): 20 | print("model_path must be a zip extension.") 21 | print("Refusing to continue!!") 22 | return 23 | 24 | print("Training model...") 25 | model.train(data_path, model_path, y_headers) 26 | print("...successfully wrote model to file: ", model_path) 27 | 28 | 29 | if __name__ == "__main__": 30 | fire.Fire(train) 31 | -------------------------------------------------------------------------------- /fm_training_estimator/regressor/lookup/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .lookup import LookupRegressor 3 | -------------------------------------------------------------------------------- /fm_training_estimator/regressor/lookup/lookup.py: -------------------------------------------------------------------------------- 1 | # Third Party 2 | import pandas 3 | 4 | # Local 5 | from ...data import lookup_format_version 6 | 7 | 8 | class LookupRegressor: 9 | def __init__(self, data_path=None): 10 | self.data = None 11 | 12 | if data_path is not None: 13 | self.load(data_path) 14 | 15 | def load(self, data_path): 16 | self.data = pandas.read_csv(data_path) 17 | 18 | def get_data_format(self): 19 | keys = ",".join(list(self.data.columns.values)) 20 | return lookup_format_version(keys) 21 | 22 | def run(self, X: dict): 23 | query = "" 24 | for key, val in X.items(): 25 | if isinstance(val, str): 26 | query += f' and {key} == "{val}"' 27 | else: 28 | query += f" and {key} == {val}" 29 | query = query[5:] 30 | 31 | res = self.data.query(query) 32 | res = res.drop(columns=X.keys()) 33 | 34 | return res 35 | -------------------------------------------------------------------------------- /fm_training_estimator/regressor/lookup/test_lookup.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | from pathlib import Path 3 | 4 | # Local 5 | from .lookup import LookupRegressor 6 | 7 | test_data1 = (Path(__file__).parent / "../test_data/data1.csv").as_posix() 8 | test_data2 = (Path(__file__).parent / "../test_data/data2.csv").as_posix() 9 | 10 | 11 | def test_lookup(): 12 | reg = LookupRegressor() 13 | 14 | reg.load(test_data1) 15 | 16 | res = reg.run( 17 | { 18 | "model_name": "mercury-12b", 19 | "gpu_model": "X100", 20 | "number_gpus": 2, 21 | "batch_size": 4, 22 | "seq_len": 512, 23 | } 24 | ) 25 | 26 | assert res.shape[0] == 1 27 | assert res["tokens_per_second"][0] == 500 28 | 29 | # should return multiple entries 30 | res = reg.run( 31 | { 32 | "model_name": "mercury-12b", 33 | "gpu_model": "X100", 34 | "number_gpus": 2, 35 | "batch_size": 4, 36 | } 37 | ) 38 | 39 | assert res.shape[0] == 3 40 | 41 | reg.load(test_data2) 42 | 43 | res = reg.run( 44 | { 45 | "model_name": "ibm-granite/granite-7b-base", 46 | "number_gpus": 2, 47 | "batch_size": 4, 48 | "seq_len": 512, 49 | } 50 | ) 51 | 52 | assert res.shape[0] == 1 53 | assert res[0:1]["tokens_per_second"].item() == 500 54 | 55 | res = reg.run( 56 | { 57 | "model_name": "ibm-granite/granite-7b-base", 58 | "number_gpus": 2, 59 | "batch_size": 4, 60 | "seq_len": 1024, 61 | } 62 | ) 63 | 64 | assert res.shape[0] == 1 65 | assert res[0:1]["tokens_per_second"].item() == 1000 66 | -------------------------------------------------------------------------------- /fm_training_estimator/regressor/test_data/data1.csv: 
--------------------------------------------------------------------------------
1 | model_name,gpu_model,number_gpus,batch_size,seq_len,tokens_per_second
2 | mercury-12b,X100,2,4,512,500
3 | mercury-12b,X100,2,4,1024,1000
4 | mercury-12b,X100,2,4,4096,4000
5 | mercury-12b,X100,2,8,512,1000
6 | mercury-12b,X100,2,8,1024,2000
7 | mercury-12b,X100,2,8,4096,8000
8 | pluto-13b,X100,2,4,512,500
9 | pluto-13b,X100,2,4,4096,4000
10 |
-------------------------------------------------------------------------------- /fm_training_estimator/regressor/test_data/data2.csv: --------------------------------------------------------------------------------
1 | model_name,number_gpus,batch_size,seq_len,tokens_per_second,memory,memory_act
2 | ibm-granite/granite-7b-base,2,4,512,500,20,10
3 | ibm-granite/granite-7b-base,2,4,1024,1000,20,10
4 | ibm-granite/granite-7b-base,2,8,512,500,20,10
5 | ibm-granite/granite-7b-base,2,16,512,500,20,10
6 | ibm-granite/granite-7b-base,4,4,1024,500,20,10
7 | ibm-granite/granite-7b-base,4,8,1024,500,20,10
8 | ibm-granite/granite-7b-base,4,16,1024,500,20,10
9 |
-------------------------------------------------------------------------------- /fm_training_estimator/regressor/test_data/data3.csv: --------------------------------------------------------------------------------
1 | model_arch,model_hidden_size,model_intermediate_size,model_num_attn_heads,model_num_hidden_layers,model_num_key_value_heads,number_gpus,batch_size,seq_len,tokens_per_second,memory,memory_act
2 | LlamaForCausalLM,2560,10240,32,32,32,4,16,1024,500,20,10
3 | LlamaForCausalLM,4096,11008,32,32,32,4,16,1024,500,20,10
4 |
5 |
-------------------------------------------------------------------------------- /fm_training_estimator/regressor/xgboost/README.md: --------------------------------------------------------------------------------
1 | # XGBoost Regressor
2 |
3 | Train the regressor:
4 | ```
5 | python -m fm_training_estimator.regressor.xgboost.train
6 | ```
7 |
8 | Here is an example - run from the top level folder:
9 | ```
10 | python -m fm_training_estimator.regressor.xgboost.train ./fm_training_estimator/regressor/test_data/data1.csv ./test.model.zip ["tokens_per_second"]
11 | ```
12 |
13 | This command will fail if the passed-in y fields are not found in the input data.
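
Once trained, a model can also be loaded and queried from Python, the same way the hybrid estimators use it. Here is a minimal sketch (the model path is hypothetical), assuming a model trained on `test_data/data2.csv` with all three y columns, and using the `run(X, y)` signature defined in `xgboost.py`:
```
from fm_training_estimator.regressor import XGBoostRegressor

# load a previously trained zip bundle
reg = XGBoostRegressor("./test.model.zip")

# feature values follow the X column order of the training data
tps = reg.run(["ibm-granite/granite-7b-base", 2, 4, 512], "tokens_per_second")
```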
14 |
-------------------------------------------------------------------------------- /fm_training_estimator/regressor/xgboost/__init__.py: --------------------------------------------------------------------------------
1 | # Local
2 | from .xgboost import XGBoostRegressor
3 |
4 | __all__ = ["XGBoostRegressor"]
5 |
-------------------------------------------------------------------------------- /fm_training_estimator/regressor/xgboost/test_reg.py: --------------------------------------------------------------------------------
1 | # Standard
2 | from pathlib import Path
3 |
4 | # Local
5 | from .xgboost import XGBoostRegressor
6 |
7 | test_data1 = (Path(__file__).parent / "../test_data/data1.csv").as_posix()
8 | test_data2 = (Path(__file__).parent / "../test_data/data2.csv").as_posix()
9 |
10 |
11 | def test_reg_lifecycle(tmp_path):
12 |     model_path = tmp_path / "test.model.json"
13 |
14 |     reg = XGBoostRegressor()
15 |
16 |     # train the model - and use it directly
17 |     # includes saving to file
18 |     reg.train(test_data1, model_path, ["tokens_per_second"])
19 |     out = reg.run(["mercury-12b", "X100", 2, 4, 512])
20 |     assert int(out[0]) < 1000
21 |
22 |     # load the model from file
23 |     reg1 = XGBoostRegressor()
24 |     reg1.load(model_path)
25 |     out1 = reg1.run(["mercury-12b", "X100", 2, 4, 512])
26 |     assert int(out[0]) == int(out1[0])
27 |
28 |
29 | def test_reg_multi(tmp_path):
30 |     model_path = tmp_path / "test2.model.json"
31 |
32 |     reg = XGBoostRegressor()
33 |
34 |     # train the model - and use it directly
35 |     # includes saving to file
36 |     reg.train(test_data2, model_path, ["tokens_per_second", "memory", "memory_act"])
37 |     out = reg.run(["ibm-granite/granite-7b-base", 2, 4, 512])
38 |
39 |     assert len(out) == 1
40 |     assert len(out[0]) == 3
41 |
-------------------------------------------------------------------------------- /fm_training_estimator/regressor/xgboost/train.py: --------------------------------------------------------------------------------
1 | # Third Party
2 | import fire
3 |
4 | # Local
5 | from .xgboost import XGBoostRegressor
6 |
7 |
8 | def train(data_path: str, model_path: str, y_headers: list[str]):
9 |     """Train an XGBoostRegressor model that can be used by this estimator library.
10 |
11 |     Args:
12 |         data_path (str): the path to training data
13 |         model_path (str): the output path of the trained model. Must end with .zip.
14 |         y_headers (list[str]): list of column names to drop from data
15 |
16 |     """
17 |     model = XGBoostRegressor()
18 |
19 |     if not model_path.endswith(".zip"):
20 |         print("model_path must have a .zip extension.")
21 |         print("Refusing to continue!!")
22 |         return
23 |
24 |     print("Training model...")
25 |     model.train(data_path, model_path, y_headers)
26 |     print("...successfully wrote model to file: ", model_path)
27 |
28 |
29 | if __name__ == "__main__":
30 |     fire.Fire(train)
31 |
-------------------------------------------------------------------------------- /fm_training_estimator/regressor/xgboost/xgboost.py: --------------------------------------------------------------------------------
1 | import os
2 | import zipfile
3 | import tempfile
4 |
5 | # Third Party
6 | from xgboost import XGBRegressor
7 | import pandas
8 | from sklearn.preprocessing import OrdinalEncoder
9 | import joblib
10 |
11 | # Local
12 | from ...data import lookup_format_version, get_format_by_version
13 |
14 |
15 | class XGBoostRegressor:
16 |     def __init__(self, model_path=None):
17 |         self.model = XGBRegressor(
18 |             n_estimators=400,
19 |             max_depth=7,
20 |             eta=0.1,
21 |             subsample=0.7,
22 |             colsample_bytree=0.8,
23 |             enable_categorical=True,
24 |         )
25 |         self.cat_enc = OrdinalEncoder()
26 |
27 |         if model_path is not None:
28 |             self.load(model_path)
29 |
30 |
31 |     def load(self, model_path):
32 |         with tempfile.TemporaryDirectory() as mdir:
33 |             with zipfile.ZipFile(model_path) as model_zip:
34 |                 model_zip.extractall(mdir)
35 |
36 |             path_m = os.path.join(mdir, "model.json")
37 |             self.model.load_model(path_m)
38 |
39 |             path_e = os.path.join(mdir, "cat_enc.json")
40 |             self.cat_enc = joblib.load(path_e)
41 |
42 |     def train(self, data_path: str, model_path: str, y_headers: list[str]):
43 |         data = pandas.read_csv(data_path)
44 |
45 |         # obtain the data format metadata
46 |         data_keys = ",".join(list(data.columns.values))
47 |
48 |         # ordinal encode all "object" type columns, which are actually categories
49 |         cat_feats = data.dtypes[data.dtypes=='object'].index.values.tolist()
50 |         data[cat_feats] = self.cat_enc.fit_transform(data[cat_feats])
51 |
52 |         # now mark these as categorical feats
53 |         for cf in cat_feats:
54 |             data[cf] = data[cf].astype("category")
55 |
56 |         X = data.drop(columns=y_headers)
57 |         Y = data[y_headers]
58 |
59 |         self.model.fit(X, Y)
60 |
61 |         # save the feature names into the model
62 |         self.model.get_booster().feature_names = list(X.columns)
63 |         # save the data format
64 |         self.model.get_booster().set_attr(
65 |             data_format_version=lookup_format_version(data_keys)
66 |         )
67 |
68 |         with ( tempfile.NamedTemporaryFile(suffix='.json', mode='w') as buf_m,
69 |                tempfile.NamedTemporaryFile(suffix='.json', mode='w') as buf_e,
70 |                tempfile.NamedTemporaryFile(mode='w') as buf_mt,
71 |                zipfile.ZipFile(model_path, mode='w') as model_zip
72 |             ):
73 |
74 |             # save model to tmp buffer
75 |             self.model.save_model(buf_m.name)
76 |             # save encoder into tmp buffer
77 |             joblib.dump(self.cat_enc, buf_e.name)
78 |             # save model type to file
79 |             with open(buf_mt.name, 'w') as f:
80 |                 f.write("xgboost")
81 |
82 |             # now move the files into the zip file
83 |             model_zip.write(buf_m.name, 'model.json')
84 |             model_zip.write(buf_e.name, 'cat_enc.json')
85 |             model_zip.write(buf_mt.name, 'model_type')
86 |
87 |
88 |     def run(self, X, y):
89 |         # convert input data array into form suitable to feed in
90 |
91 |         # add column names
92 |         data = pandas.DataFrame([X], columns=self.model.get_booster().feature_names)
93 |
94 |         # encode category columns
95 |         cat_feats = data.dtypes[data.dtypes=='object'].index.values.tolist()
96 |         data[cat_feats] = self.cat_enc.transform(data[cat_feats])
97 |
98 |         # now mark these as categorical feats
99 |         for cf in cat_feats:
100 |             data[cf] = data[cf].astype("category")
101 |
102 |         res = self.model.predict(data)
103 |         col_idx = get_format_by_version(self.get_data_format()).Y.split(",").index(y)
104 |         return res[0][col_idx]
105 |
106 |     def get_data_format(self):
107 |         return self.model.get_booster().attr("data_format_version")
108 |
-------------------------------------------------------------------------------- /fm_training_estimator/sdk/README.md: --------------------------------------------------------------------------------
1 | # SDK
2 |
3 | To use the estimator directly, refer to the `../ui` folder.
4 |
5 | This SDK is meant to be used from other Python programs to get estimates. Refer to the examples in the `examples/` folder to learn more.
-------------------------------------------------------------------------------- /fm_training_estimator/sdk/__init__.py: --------------------------------------------------------------------------------
1 | # Local
2 | from .sdk import estimate_cost, estimate_memory, estimate_time, estimate_tokens
3 |
-------------------------------------------------------------------------------- /fm_training_estimator/sdk/examples/ex1.py: --------------------------------------------------------------------------------
1 | # Standard
2 | import os
3 |
4 | # First Party
5 | from fm_training_estimator.config.arguments import (
6 |     DataArguments,
7 |     EstimateInput,
8 |     EstimatorMetadata,
9 |     FMArguments,
10 |     HFTrainingArguments,
11 |     InfraArguments,
12 |     JobConfig,
13 | )
14 | from fm_training_estimator.sdk import (
15 |     estimate_cost,
16 |     estimate_memory,
17 |     estimate_time,
18 |     estimate_tokens,
19 | )
20 |
21 | workdir_path = os.path.join(os.path.abspath(os.curdir), "workdir")
22 |
23 | model_path = os.path.join(workdir_path, "model.json")
24 | lookup_data_path = os.path.join(workdir_path, "data.csv")
25 |
26 | estimator_metadata = EstimatorMetadata(base_data_path=lookup_data_path)
27 |
28 | fm = FMArguments(
29 |     base_model_path="ibm-granite/granite-7b-base",
30 |     torch_dtype="bfloat16",
31 |     block_size=1024,
32 | )
33 | hf_training = HFTrainingArguments(
34 |     per_device_train_batch_size=1, gradient_checkpointing=False
35 | )
36 | data = DataArguments(dataset="imdb", te_approach=0)
37 | infra = InfraArguments(numGpusPerPod=1)
38 | job_conf = JobConfig(hf_training, fm, data, infra)
39 | est_input = EstimateInput(estimator_metadata=estimator_metadata, job_configs=[job_conf])
40 |
41 | print("Estimating Memory:....")
42 |
43 | print("With only theory: ", estimate_memory(est_input))
44 | print("With reg model: ", estimate_memory(est_input, model_path))
45 |
46 | hf_training.fsdp = "full_shard"
47 |
48 | print("Using fsdp full shard")
49 | print("With only theory: ", estimate_memory(est_input))
50 | # print("With reg model: ", estimate_memory(est_input, model_path))
51 |
52 |
53 | print("Estimating Time:....")
54 | print("With only theory: ", estimate_time(est_input))
55 | # print("With reg model: ", estimate_time(est_input, model_path))
56 |
57 | print("Estimating Tokens:....")
58 | print("With only theory: ", estimate_tokens(est_input))
59 | # print("With reg model: ", estimate_tokens(est_input, model_path))
-------------------------------------------------------------------------------- /fm_training_estimator/sdk/sdk.py:
--------------------------------------------------------------------------------
1 | # First Party
2 | from fm_training_estimator.config.arguments import (
3 |     CostEstimate,
4 |     EstimateInput,
5 |     JobConfig,
6 |     MemoryEstimate,
7 |     TimeEstimate,
8 |     TokensEstimate,
9 | )
10 | from fm_training_estimator.memory.hybrid.hybrid import HybridEstimator
11 | from fm_training_estimator.memory.lora.hybrid import HybridLoraEstimator
12 | from fm_training_estimator.memory.qlora.hybrid import HybridQLoraEstimator
13 | from fm_training_estimator.throughput.hybrid.hybrid import HybridSpeedEstimator
14 | from fm_training_estimator.time import get_total_time
15 | from fm_training_estimator.tokens.te0.te0 import TokenEstimator0
from fm_training_estimator.tokens.te2.te2 import TokenEstimator2
16 |
17 | # Local
18 | from ..config import is_fsdp
19 | from ..utils import fmt_size, logger
20 |
21 |
22 | def _get_hybrid_estimator(
23 |     conf: JobConfig, model_path: str = None, lookup_data_path: str = None
24 | ):
25 |     if conf.fm.technique == "lora":
26 |         return HybridLoraEstimator(
27 |             conf.fm,
28 |             conf.hf_training,
29 |             conf.infra,
30 |             conf.peft_lora,
31 |             lookup_data_path,
32 |             model_path,
33 |         )
34 |     elif conf.fm.technique == "qlora":
35 |         return HybridQLoraEstimator(
36 |             conf.fm,
37 |             conf.hf_training,
38 |             conf.infra,
39 |             conf.peft_lora,
40 |             conf.peft_qlora,
41 |             None,
42 |             model_path,
43 |         )
44 |     else:
45 |         return HybridEstimator(
46 |             conf.fm, conf.hf_training, conf.infra, lookup_data_path, model_path
47 |         )
48 |
49 | def _update_seq_width(
50 |     conf: JobConfig
51 | ) -> JobConfig:
52 |     """
53 |     Update the seq width based on the input dataset characteristics.
54 |
55 |     This is only needed for memory and should not impact tps/tokens since those
56 |     functions anyway operate on the input dataset.
57 |     """
58 |
59 |     token_est = None
60 |     if conf.data.te_approach == 0:
61 |         token_est = TokenEstimator0(conf.data)
62 |     if conf.data.te_approach == 2:
63 |         token_est = TokenEstimator2(conf.data)
64 |
65 |     if token_est is not None:
66 |         data_max_width = token_est.get_max_sample_length()
67 |         if data_max_width < conf.fm.block_size:
68 |             conf.fm.block_size = data_max_width
69 |
70 |     return conf
71 |
72 | def estimate_memory(
73 |     estimate_input: EstimateInput, model_path: str = None
74 | ) -> MemoryEstimate:
75 |     """Estimate memory needed for training. This method uses the hybrid model by default.
76 |
77 |     Args:
78 |         estimate_input (fm_training_estimator.config.arguments.EstimateInput): the input for this estimation
79 |             This input includes training job configs and optionally, metadata about this estimate run.
80 |         model_path (str, optional): path to a trained regression model for the estimator to use for this run.
81 |
82 |     Returns:
83 |         fm_training_estimator.config.arguments.MemoryEstimate: the memory estimate of this run.
84 |
85 |     """
86 |
87 |     if estimate_input.job_configs is None or len(estimate_input.job_configs) == 0:
88 |         raise ValueError("Did not receive a training job config")
89 |
90 |     # Only going to process first job_config for now
91 |     job_config = estimate_input.job_configs[0]
92 |
93 |     # Update expected max width based on data
94 |     job_config = _update_seq_width(job_config)
95 |
    lookup_data_path = None  # default when no estimator metadata is provided
96 |     if estimate_input.estimator_metadata:
97 |         lookup_data_path = estimate_input.estimator_metadata.base_data_path
98 |         if lookup_data_path is None:
99 |             logger.warning(
100 |                 "SDK - No lookup data path given. Set it via estimator_metadata.base_data_path in input json. Proceeding with estimator with limited lookup ability."
101 |             )
102 |
103 |     est = _get_hybrid_estimator(job_config, model_path, lookup_data_path)
104 |
105 |     total_mem_estimate = fmt_size(est.get_total_mem_estimate())
106 |     activation_memory = fmt_size(est.calculate_activation_memory())
107 |     gradient_memory = fmt_size(est.calculate_gradient_memory())
108 |     model_memory = fmt_size(est.calculate_model_memory())
109 |     optimizer_memory = fmt_size(est.calculate_optimizer_memory())
110 |
111 |     num_gpus = job_config.infra.numGpusPerPod
112 |
113 |     if num_gpus == 0:
114 |         if job_config.fm.technique == "full" and is_fsdp(job_config.hf_training):
115 |             num_gpus = est.fsdp_est.get_number_of_gpus()
116 |         elif job_config.fm.technique == "lora" or job_config.fm.technique == "qlora":
117 |             num_gpus = est.num_gpus
118 |         else:
119 |             num_gpus = 1
120 |
121 |     job_config.infra.numGpusPerPod = num_gpus
122 |
123 |     # No suitable configuration found
124 |     if num_gpus == -1:
125 |         raise ValueError("Input configuration is infeasible!")
126 |
127 |     return MemoryEstimate(
128 |         total_mem_estimate,
129 |         activation_memory,
130 |         gradient_memory,
131 |         model_memory,
132 |         optimizer_memory,
133 |         num_gpus,
134 |     )
135 |
136 |
137 | def _estimate_tokens_and_time(
138 |     conf: JobConfig,
139 |     model_path: str = None,
140 |     lookup_data_path: str = None,
141 | ) -> tuple[float, float]:
142 |     token_est = None
143 |     if conf.data.te_approach == 0:
144 |         token_est = TokenEstimator0(conf.data)
145 |     if conf.data.te_approach == 2:
146 |         token_est = TokenEstimator2(conf.data)
147 |
148 |     speed_est = HybridSpeedEstimator(
149 |         conf.fm, conf.hf_training, conf.infra, lookup_data_path, model_path
150 |     )
151 |
152 |     estimated_tps = speed_est.get_tps()
153 |     if estimated_tps is not None:
154 |         tps = float(estimated_tps)
155 |         logger.info("SDK - Initial estimated tps is %f", tps)
156 |     else:
157 |         logger.info("SDK - Could not calculate tps initially, defaulting to 1.")
158 |         tps = 1
159 |
160 |     if token_est is not None:
161 |         tokens_per_sample = int(
162 |             token_est.get_estimated_batch_width(
163 |                 conf.hf_training.per_device_train_batch_size
164 |             )
165 |         )
166 |         total_tokens = int(token_est.get_total_tokens())
167 |
168 |         # get the updated tps for this estimated token width
169 |         estimated_tps = speed_est.get_tps(tokens_per_sample)
170 |         if estimated_tps is not None:
171 |             tps = float(estimated_tps)
172 |             logger.info("SDK - Updated estimated tps after token width is %f", tps)
173 |         else:
174 |             logger.info(
175 |                 "SDK - Could not calculate tps after token width, defaulting to 1."
176 |             )
177 |             tps = 1
178 |
179 |         # calculate full time here
180 |         time = get_total_time(
181 |             conf.hf_training, conf.infra, token_est, tps, total_tokens
182 |         )
183 |     else:
184 |         time = (0, 0)
185 |         logger.info(
186 |             "SDK - Could not get total tokens to calculate time, setting time to 0."
187 |         )
188 |     return (tps, time)
189 |
190 |
191 | def estimate_time(
192 |     estimate_input: EstimateInput, model_path: str = None
193 | ) -> TimeEstimate:
194 |     """Estimate time needed for training. This method uses the hybrid model by default.
195 |
196 |     Args:
197 |         estimate_input (fm_training_estimator.config.arguments.EstimateInput): the input for this estimation
198 |             This input includes training job configs and optionally, metadata about this estimate run.
199 |         model_path (str, optional): path to a trained regression model for the estimator to use for this run.
200 |
201 |     Returns:
202 |         fm_training_estimator.config.arguments.TimeEstimate: the time estimate of this run.
203 |
204 |     """
205 |     if estimate_input.job_configs is None or len(estimate_input.job_configs) == 0:
206 |         raise ValueError("Did not receive a training job config")
207 |
208 |     # Only going to process first job_config for now
209 |     job_config = estimate_input.job_configs[0]
210 |
    lookup_data_path = None
211 |     if estimate_input.estimator_metadata:
212 |         lookup_data_path = estimate_input.estimator_metadata.base_data_path
213 |         if lookup_data_path is None:
214 |             logger.warning(
215 |                 "SDK - No lookup data path given. Set it via estimator_metadata.base_data_path in input json. Proceeding with estimator with limited lookup ability."
216 |             )
217 |
218 |     _, (time, train_time) = _estimate_tokens_and_time(
219 |         job_config, model_path, lookup_data_path
220 |     )
221 |
222 |     return TimeEstimate(time, train_time)
223 |
224 |
225 | def estimate_tokens(
226 |     estimate_input: EstimateInput, model_path: str = None
227 | ) -> TokensEstimate:
228 |     """Estimate token throughput for a training job. This method uses the hybrid model by default.
229 |
230 |     Args:
231 |         estimate_input (fm_training_estimator.config.arguments.EstimateInput): the input for this estimation
232 |             This input includes training job configs and optionally, metadata about this estimate run.
233 |         model_path (str, optional): path to a trained regression model for the estimator to use for this run.
234 |
235 |     Returns:
236 |         fm_training_estimator.config.arguments.TokensEstimate: the tokens throughput estimate of this run.
237 |
238 |     """
239 |     if estimate_input.job_configs is None or len(estimate_input.job_configs) == 0:
240 |         raise ValueError("Did not receive a training job config")
241 |
242 |     # Only going to process first job_config for now
243 |     job_config = estimate_input.job_configs[0]
244 |
    lookup_data_path = None
245 |     if estimate_input.estimator_metadata:
246 |         lookup_data_path = estimate_input.estimator_metadata.base_data_path
247 |         if lookup_data_path is None:
248 |             logger.warning(
249 |                 "SDK - No lookup data path given. Set it via estimator_metadata.base_data_path in input json. Proceeding with estimator with limited lookup ability."
250 |             )
251 |
252 |     tps, _ = _estimate_tokens_and_time(
253 |         job_config, model_path, lookup_data_path
254 |     )
255 |
256 |     return TokensEstimate(tps)
257 |
258 |
259 | def estimate_cost(
260 |     estimate_input: EstimateInput, model_path: str = None
261 | ) -> CostEstimate:
262 |     """Estimate cost for a training job. This method uses the hybrid model by default. (Not yet supported)
263 |
264 |     Args:
265 |         estimate_input (fm_training_estimator.config.arguments.EstimateInput): the input for this estimation
266 |             This input includes training job configs and optionally, metadata about this estimate run.
267 |         model_path (str, optional): path to a trained regression model for the estimator to use for this run.
268 |
269 |     Returns:
270 |         fm_training_estimator.config.arguments.CostEstimate: the cost estimate of this run.
271 | 272 | """ 273 | raise NotImplementedError("Not supported in this version.") 274 | -------------------------------------------------------------------------------- /fm_training_estimator/throughput/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .hybrid import HybridSpeedEstimator 3 | from .mock import MockSpeedEstimator 4 | -------------------------------------------------------------------------------- /fm_training_estimator/throughput/hybrid/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .hybrid import HybridSpeedEstimator 3 | -------------------------------------------------------------------------------- /fm_training_estimator/throughput/hybrid/hybrid.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from ...config import FMArguments, HFTrainingArguments, InfraArguments 3 | from ...data import format_query 4 | from ...regressor import LookupRegressor, GetRegressor 5 | from ...utils import logger 6 | 7 | 8 | class HybridSpeedEstimator: 9 | def __init__( 10 | self, 11 | fm_args: FMArguments, 12 | train_args: HFTrainingArguments, 13 | infra_args: InfraArguments, 14 | lookup_data_path, 15 | model_path, 16 | ): 17 | 18 | self.fm = fm_args 19 | self.ta = train_args 20 | self.ia = infra_args 21 | self.lookup_est = None 22 | self.reg_est = None 23 | 24 | # Lookup based estimator 25 | if lookup_data_path is not None: 26 | self.lookup_est = LookupRegressor(lookup_data_path) 27 | 28 | # Model based estimator 29 | if model_path is not None: 30 | self.reg_est = GetRegressor(model_path) 31 | 32 | if lookup_data_path is None and model_path is None: 33 | raise RuntimeError("HybridSpeedEstimator not properly initialized") 34 | 35 | def check_lookup(self, seqlen): 36 | lookup_query = { 37 | "model_name": self.fm.base_model_path, 38 | "number_gpus": self.ia.numGpusPerPod, 39 | "batch_size": self.ta.per_device_train_batch_size, 40 | "seq_len": seqlen, 41 | "gpu_model": self.ia.gpuModel, 42 | "method": self.fm.technique, 43 | } 44 | 45 | lookup_query = format_query(lookup_query, self.lookup_est.get_data_format()) 46 | 47 | res = self.lookup_est.run(lookup_query) 48 | 49 | if res.empty: 50 | return None 51 | 52 | logger.debug(f"Throughput Hybrid - Lookup result: {res}") 53 | return res[0:1]["tokens_per_second"].item() 54 | 55 | def get_tps(self, seqlen=None): 56 | if seqlen is None: 57 | seqlen = self.fm.block_size 58 | 59 | res = None 60 | 61 | # attempt lookup 62 | if self.lookup_est is not None: 63 | res = self.check_lookup(seqlen) 64 | if res is not None: 65 | return res 66 | if self.reg_est is None: 67 | return res 68 | 69 | # attempt reg approach 70 | lookup_query = { 71 | "model_name": self.fm.base_model_path, 72 | "number_gpus": self.ia.numGpusPerPod, 73 | "batch_size": self.ta.per_device_train_batch_size, 74 | "seq_len": int(seqlen), 75 | "gpu_model": self.ia.gpuModel, 76 | "method": self.fm.technique, 77 | } 78 | params = format_query( 79 | lookup_query, self.reg_est.get_data_format(), only_values=True 80 | ) 81 | 82 | res = self.reg_est.run(params, "tokens_per_second") 83 | return res 84 | -------------------------------------------------------------------------------- /fm_training_estimator/throughput/hybrid/test_hybrid.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | from pathlib import Path 3 | 4 | # Third Party 5 | from pytest import raises 6 | 7 | # 
Local 8 | from ...config import parse 9 | from ...regressor import XGBoostRegressor 10 | from .hybrid import HybridSpeedEstimator 11 | 12 | test_data2 = (Path(__file__).parent / "../../regressor/test_data/data2.csv").as_posix() 13 | test_data3 = (Path(__file__).parent / "../../regressor/test_data/data3.csv").as_posix() 14 | 15 | 16 | def test_hybrid_empty(): 17 | fm, ta, ia, _, _, _ = parse({}) 18 | 19 | with raises(RuntimeError): 20 | _ = HybridSpeedEstimator(fm, ta, ia, None, None) 21 | 22 | 23 | def test_hybrid_lookup(): 24 | fm, ta, ia, _, _, _ = parse( 25 | { 26 | "base_model_path": "ibm-granite/granite-7b-base", 27 | "per_device_train_batch_size": 4, 28 | "block_size": 512, 29 | "numGpusPerPod": 2, 30 | } 31 | ) 32 | 33 | est = HybridSpeedEstimator(fm, ta, ia, test_data2, None) 34 | 35 | assert est.get_tps() == 500 36 | # test lookup approach 37 | assert est.get_tps(1024) == 1000 38 | 39 | 40 | def test_hybrid_reg(tmp_path): 41 | model_path = tmp_path / "test.model.json" 42 | reg = XGBoostRegressor() 43 | reg.train(test_data2, model_path, ["tokens_per_second", "memory", "memory_act"]) 44 | 45 | fm, ta, ia, _, _, _ = parse( 46 | { 47 | "base_model_path": "ibm-granite/granite-7b-base", 48 | "per_device_train_batch_size": 4, 49 | "block_size": 512, 50 | "numGpusPerPod": 4, 51 | } 52 | ) 53 | 54 | est = HybridSpeedEstimator(fm, ta, ia, test_data2, model_path) 55 | 56 | assert est.get_tps() > 300 57 | 58 | 59 | def test_hybrid_model_features(tmp_path): 60 | model_path = tmp_path / "test.model.json" 61 | reg = XGBoostRegressor() 62 | reg.train(test_data3, model_path, ["tokens_per_second", "memory", "memory_act"]) 63 | 64 | fm, ta, ia, _, _, _ = parse( 65 | { 66 | "base_model_path": "ibm-granite/granite-8b-code-base", 67 | "per_device_train_batch_size": 16, 68 | "block_size": 1024, 69 | "numGpusPerPod": 4, 70 | } 71 | ) 72 | 73 | est = HybridSpeedEstimator(fm, ta, ia, test_data3, model_path) 74 | 75 | assert est.get_tps() > 400 76 | -------------------------------------------------------------------------------- /fm_training_estimator/throughput/mock/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .mock import MockSpeedEstimator 3 | -------------------------------------------------------------------------------- /fm_training_estimator/throughput/mock/mock.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | import random 3 | import time 4 | 5 | # Local 6 | from ...config import FMArguments 7 | 8 | 9 | class MockSpeedEstimator: 10 | def __init__(self, fm_args: FMArguments, seed=None): 11 | self.fm = fm_args 12 | 13 | if seed is not None: 14 | self.seed = seed 15 | else: 16 | self.seed = time.time() 17 | 18 | def get_tps(self, seqlen=None): 19 | if seqlen is None: 20 | seqlen = self.fm.block_size 21 | random.seed(self.seed + seqlen) 22 | return random.randint(100, 10000) 23 | -------------------------------------------------------------------------------- /fm_training_estimator/throughput/mock/test_mock.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from ...config import parse 3 | from .mock import MockSpeedEstimator 4 | 5 | 6 | def test_mock_1(): 7 | fm, _, _, _, _, _ = parse({"block_size": 512}) 8 | est = MockSpeedEstimator(fm, seed=10) 9 | 10 | tps = est.get_tps() 11 | assert tps == 1355 12 | 13 | 14 | def test_mock_2(): 15 | fm, _, _, _, _, _ = parse({"block_size": 1024}) 16 | est = MockSpeedEstimator(fm, seed=10) 17 | 
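    # same seed as test_mock_1, but a different block size: get_tps() reseeds with
    # seed + seqlen, so the value changes with sequence length yet stays deterministic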
18 |     tps = est.get_tps()
19 |     assert tps == 719
20 |
-------------------------------------------------------------------------------- /fm_training_estimator/time/README.md: --------------------------------------------------------------------------------
1 | # Time
2 |
3 | Time taken for a training job consists of two main subcomponents:
4 |
5 | 1. Training time: actual time spent in the training process - forward pass, backward pass and so on.
6 | 2. Non-training time: other significant sources of time, such as model load and model save.
7 |
8 | ## Training time
9 |
10 | Training time is calculated in the estimator as a simple combination of two inputs:
11 |
12 | 1. `throughput`: the number of tokens per second achieved by the training script.
13 | 2. `tokens`: the number of tokens to be processed for the given dataset under the given conditions.
14 |
15 | Refer to the subcomponents in `../throughput` and `../tokens` for these calculations. Once we have both of these, a simple division gives us the training time - albeit for a single epoch. For example, 10 million tokens at 2,000 tokens per second is about 5,000 seconds per epoch.
16 |
17 | ## Non-training time
18 |
19 | This is made up of the following components.
20 |
21 | ### Model load
22 |
23 | In the beginning, a model must be loaded from disk, usually from files in PyTorch model formats or the Hugging Face SafeTensors format. The model may be fetched from the Hugging Face Hub, or may already be available on disk, either cached or in a local checkpoint format.
24 |
25 | ### Dataload time
26 |
27 | Time taken to load a dataset from files (typically json, jsonl or parquet) on disk.
28 |
29 | ### Checkpoint time
30 |
31 | Every k steps or l epochs, a checkpoint may be saved to disk. There is a lot of research on making this process faster.
32 |
-------------------------------------------------------------------------------- /fm_training_estimator/time/__init__.py: --------------------------------------------------------------------------------
1 | # Local
2 | from .time import get_total_time
3 |
-------------------------------------------------------------------------------- /fm_training_estimator/time/time.py: --------------------------------------------------------------------------------
1 | # Standard
2 | import logging
3 |
4 | # Local
5 | from ..config import HFTrainingArguments, InfraArguments
6 | from ..tokens import TokenEstimator
7 |
8 | MODEL_LOAD_TIME = 5 * 60
9 | CHECKPOINT_TIME = 60
10 |
11 |
12 | def get_total_time(
13 |     hf: HFTrainingArguments, ia: InfraArguments, te: TokenEstimator, tps, tokens
14 | ):
15 |     """
16 |     Returns a tuple of (time_total, time_train).
17 |     The first is the second plus the time taken for model loading/checkpoint saving etc.
18 |     """
19 |     train_time_per_epoch = tokens / tps
20 |
21 |     num_epochs = hf.num_train_epochs
22 |
23 |     # one checkpoint at the very end
24 |     num_checkpoints = 1
25 |     ss = hf.save_strategy
26 |     if ss == "epoch":
27 |         num_checkpoints += num_epochs
28 |     elif ss == "steps":
29 |         steps_in_epoch = te.get_num_samples() / (
30 |             hf.per_device_train_batch_size * ia.numGpusPerPod
31 |         )
32 |         num_checkpoints += num_epochs * steps_in_epoch / hf.save_steps
33 |     elif ss == "best":
34 |         logging.warning(
35 |             "Unable to guess number of checkpoints due to use of `best` saving strategy."
36 |         )
37 |
38 |     time_train = train_time_per_epoch * num_epochs
39 |     time_total = MODEL_LOAD_TIME + time_train + (num_checkpoints * CHECKPOINT_TIME)
40 |     return (time_total, time_train)
41 |
-------------------------------------------------------------------------------- /fm_training_estimator/tokens/README.md: --------------------------------------------------------------------------------
1 | # Tokens
2 |
3 | This module is meant to predict the number of tokens that will be processed in a training run. This is not directly the number of tokens in the data, for a few reasons:
4 |
5 | 1. Data is formatted into a template using various fields.
6 | 2. Data is then grouped into batches (of size 4, 8, etc) and padded into rectangular tensors. This can add a number of so-called "padding" tokens, which are not real data but are nevertheless processed during training at various stages.
7 | 3. etc
8 |
9 | ## Mechanism
10 |
11 | We have 2 mechanisms for token prediction:
12 |
13 | 1. TE0: Empirical sampling of data into batches.
14 | 2. TE2: Offline generation of statistical information and approximate calculations.
15 |
16 | ## TE0
17 |
18 | This is highly accurate, since it does exactly what the real training process would do, with the real data. But this can be slow, since the whole dataset has to be walked through.
19 |
20 | Use this technique if you have small data sizes, or publicly available datasets such as those on the HF Hub.
21 |
22 | ## TE2
23 |
24 | In this approach, we first derive some statistical information from the dataset in a one-time pass. This information is stored in a json file, called a TE2 Contract file.
25 |
26 | For prediction, this TE2 contract is provided as input, and an approximate calculation is done to estimate the real token sizes.
27 |
28 | Use this technique for large datasets and when privacy of the data is important.
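
To make this concrete, here is a sketch of what a contract might look like for a dataset with a single `text` field (the numbers are purely illustrative; the exact fields written out are defined in `te2.py`):
```
{
  "text": {
    "len": 1104,
    "total": 52000,
    "min": 4,
    "max": 210,
    "mean": 47.1,
    "std": 22.8,
    "bs1": 47.1,
    "bs2": 61.0,
    "bs4": 74.5,
    "bs8": 86.2,
    "bs16": 95.0
  }
}
```
The `bsN` entries record the mean width of the longest `1/N` fraction of samples; at prediction time, these anchor a per-field linear fit used to estimate padded batch widths.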
29 | -------------------------------------------------------------------------------- /fm_training_estimator/tokens/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .te import TokenEstimator 3 | from .te0 import TokenEstimator0 4 | from .te2 import TokenEstimator2 5 | -------------------------------------------------------------------------------- /fm_training_estimator/tokens/te.py: -------------------------------------------------------------------------------- 1 | class TokenEstimator: 2 | def get_total_tokens(self): 3 | pass 4 | 5 | def get_estimated_batch_width(self, batch_size: int): 6 | pass 7 | 8 | def get_num_samples(self): 9 | pass 10 | 11 | def get_max_sample_length(self): 12 | pass 13 | -------------------------------------------------------------------------------- /fm_training_estimator/tokens/te0/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .te0 import TokenEstimator0 3 | -------------------------------------------------------------------------------- /fm_training_estimator/tokens/te0/te0.py: -------------------------------------------------------------------------------- 1 | # Third Party 2 | from datasets import load_dataset 3 | from tqdm import tqdm 4 | import numpy as np 5 | 6 | # Local 7 | from ...config import DataArguments 8 | from ...utils import logger 9 | from ..te import TokenEstimator 10 | 11 | RUNS = 5 12 | SEED = 42 13 | 14 | np.random.seed(SEED) 15 | 16 | 17 | class TokenEstimator0(TokenEstimator): 18 | def __init__(self, da: DataArguments): 19 | if da.dataset is None: 20 | raise RuntimeError("Dataset argument has to be filled in for TE0!") 21 | 22 | if da.dataset.endswith(".json") or da.dataset.endswith(".jsonl"): 23 | logger.debug("Tokens TE0 - Parsing dataset as local json file") 24 | dataset = load_dataset("json", data_files={"train": da.dataset})["train"] 25 | else: 26 | dataset = load_dataset( 27 | da.dataset, 28 | name=da.dataset_config_name, 29 | split=da.dataset_split, 30 | trust_remote_code=da.trust_remote_code 31 | ) 32 | 33 | tokens = [] 34 | logger.info("Tokens TE0 - Loading data in dataset...") 35 | for item in tqdm(dataset): 36 | txt = da.dataset_text_field.format_map(item) 37 | tokens.append(int(len(txt) / 3.6)) 38 | 39 | self.tokens = tokens 40 | 41 | def get_total_tokens(self): 42 | return np.sum(self.tokens) 43 | 44 | def get_estimated_batch_width(self, batch_size, runs=RUNS): 45 | widths = [ 46 | self.get_estimated_batch_width_random_shuffle(batch_size) 47 | for i in range(runs) 48 | ] 49 | return np.mean(widths) 50 | 51 | def get_num_samples(self): 52 | return len(self.tokens) 53 | 54 | def get_estimated_batch_width_random_shuffle(self, bs): 55 | tokens = np.array(self.tokens) 56 | np.random.shuffle(tokens) 57 | if len(tokens) % bs != 0: 58 | tokens = np.concatenate( 59 | [tokens, np.zeros(bs - len(tokens) % bs)] 60 | ) # simulating drop_last=False 61 | return np.mean(np.max(np.split(tokens, len(tokens) / bs), axis=1)) 62 | 63 | def get_max_sample_length(self): 64 | return np.max(self.tokens) 65 | -------------------------------------------------------------------------------- /fm_training_estimator/tokens/te0/te_test1.jsonl: -------------------------------------------------------------------------------- 1 | {"text": "Mercury is the first planet from the Sun and the smallest in the Solar System"} 2 | {"text": "Venus is notable for having the densest atmosphere of the terrestrial planets, composed mostly 
of carbon dioxide with a thick, global sulfuric acid cloud cover."}
3 | {"text": "Earth is the third planet from the Sun and the only astronomical object known to harbor life. This is enabled by Earth being a water world, the only one in the Solar System sustaining liquid surface water."}
4 | {"text": "The surface of Mars is orange-red because it is covered in iron(III) oxide dust, giving it the nickname 'the Red Planet'. Mars is among the brightest objects in Earth's sky and its high-contrast albedo features have made it a common subject for telescope viewing."}
5 | {"text": "Jupiter is the fifth planet from the Sun and the largest in the Solar System. A gas giant, Jupiter's mass is more than two and a half times that of all the other planets in the Solar System combined and slightly less than one one-thousandth the mass of the Sun. Jupiter orbits the Sun at a distance of 5.20 AU (778.5 Gm) with an orbital period of 11.86 years. It is the third brightest natural object in the Earth's night sky after the Moon and Venus and has been observed since prehistoric times. Its name derives from Jupiter, the chief deity of ancient Roman religion. "}
6 | {"text": "Saturn is a gas giant with an average radius of about nine-and-a-half times that of Earth, prominently known for its rings."}
7 | {"text": "Uranus is the seventh planet from the Sun. It is a gaseous cyan-coloured ice giant. Most of the planet is made of water, ammonia, and methane in a supercritical phase of matter, which in astronomy is called 'ice' or volatiles."}
8 | {"text": "Neptune is the eighth and farthest known planet from the Sun. It is the fourth-largest planet in the Solar System by diameter, the third-most-massive planet, and the densest giant planet. It is 17 times the mass of Earth."}
9 | {"text": "Pluto is a dwarf planet in the Kuiper belt, a ring of bodies beyond the orbit of Neptune."}
10 |
-------------------------------------------------------------------------------- /fm_training_estimator/tokens/te0/test_te0.py: --------------------------------------------------------------------------------
1 | # Standard
2 | from pathlib import Path
3 |
4 | # Local
5 | from ...config import parse
6 | from .te0 import TokenEstimator0
7 |
8 | # trick to ensure running this with pytest works from root dir
9 | test_data = (Path(__file__).parent / "te_test1.jsonl").as_posix()
10 |
11 |
12 | def test_te_raw_hf_dataset():
13 |     _, _, _, da, _, _ = parse(
14 |         {
15 |             "base_model_path": "ibm-granite/granite-8b-code-base",
16 |             "gpu_memory_in_gb": 80,
17 |             "dataset": "super_glue",
18 |             "dataset_config_name": "axb",
19 |             "dataset_split": "test",
20 |             "dataset_text_field": "Input###:\n {sentence1}",
21 |         }
22 |     )
23 |
24 |     te = TokenEstimator0(da)
25 |
26 |     assert te.get_num_samples() == 1104, "Number of samples in data should match"
27 |     assert te.get_estimated_batch_width(4) < te.get_estimated_batch_width(
28 |         8
29 |     ), "Larger batches should have more padding"
30 |     assert (
31 |         abs(te.get_estimated_batch_width(1) - (te.get_total_tokens() / 1104)) < 1e-6
32 |     ), "Estimated batch width for BS=1 should be equal to mean of tokens"
33 |     assert te.get_estimated_batch_width(1104) == max(
34 |         te.tokens
35 |     ), "For batch size equal to dataset length, estimated batch width should be equal to max length of tokens"
36 |
37 |
38 | def test_te_raw_json():
39 |     _, _, _, da, _, _ = parse(
40 |         {
41 |             "base_model_path": "ibm-granite/granite-8b-code-base",
42 |             "gpu_memory_in_gb": 80,
43 |             "dataset": test_data,
44 |             "dataset_text_field": "{text}",
45 |         }
46
| ) 47 | 48 | te = TokenEstimator0(da) 49 | 50 | assert te.get_num_samples() == 9 51 | assert te.get_estimated_batch_width(1) < te.get_estimated_batch_width(2) 52 | -------------------------------------------------------------------------------- /fm_training_estimator/tokens/te2/README.md: -------------------------------------------------------------------------------- 1 | # TE2 2 | 3 | ## Contract Generation 4 | 5 | Examples: 6 | ``` 7 | python -m fm_training_estimator.tokens.te2.gen_contract --dataset imdb --output out1.contract.json 8 | ``` 9 | or 10 | ``` 11 | python -m fm_training_estimator.tokens.te2.gen_contract --dataset ./fm_training_estimator/tokens/te2/te_test1.jsonl --output out1.contract.json 12 | ``` 13 | 14 | This will output a single small contract file. This file should be later used with the estimator. 15 | -------------------------------------------------------------------------------- /fm_training_estimator/tokens/te2/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .te2 import TokenEstimator2 3 | -------------------------------------------------------------------------------- /fm_training_estimator/tokens/te2/gen_contract.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | # Third Party 4 | import fire 5 | 6 | # Local 7 | from .te2 import GenerateTokenEstimator2Contract 8 | 9 | def gen(dataset: str, output: str, ds_config_name: str = None, ds_split: str = "test", sample_percent: int = None): 10 | """ 11 | Inputs: 12 | dataset: the path to a json/jsonl file, or the name of HF dataset on the HF hub 13 | output: the path to output the contract file 14 | ds_config_name: For HF datasets, optional name of config to use 15 | ds_split: for HF datasets, the name of the split of the data to use 16 | sample_percent: an optional integer between (0-100], indicating what percent of the dataset we should sample. Default (if nothing specified), is no sampling, which means 100% of the data is used. 17 | """ 18 | 19 | print("Generating contract...") 20 | contract = GenerateTokenEstimator2Contract(dataset, ds_config_name, ds_split, sample_percent) 21 | 22 | with open(output, "w") as f: 23 | json.dump(contract, f) 24 | 25 | print("...successfully wrote contract to file: ", output) 26 | 27 | if __name__ == "__main__": 28 | fire.Fire(gen) 29 | -------------------------------------------------------------------------------- /fm_training_estimator/tokens/te2/te2.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | 4 | # Third Party 5 | from datasets import load_dataset 6 | from tqdm import tqdm 7 | import numpy as np 8 | from sklearn.linear_model import LinearRegression 9 | 10 | # Local 11 | from ...config import DataArguments 12 | from ...utils import logger 13 | from ..te import TokenEstimator 14 | 15 | 16 | def load_dataset_config_from_json(json_file_path): 17 | try: 18 | with open(json_file_path, "r") as file: 19 | config = json.load(file) 20 | print("Dataset configuration loaded successfully.") 21 | return config 22 | except FileNotFoundError: 23 | print(f"Error: The file {json_file_path} was not found.") 24 | return None 25 | except json.JSONDecodeError as e: 26 | print(f"Error: Failed to parse JSON. 
{e}") 27 | return None 28 | except Exception as e: 29 | print(f"An unexpected error occurred: {e}") 30 | return None 31 | 32 | 33 | class TokenEstimator2(TokenEstimator): 34 | def __init__(self, da: DataArguments): 35 | if da.dataset_config_file is None: 36 | raise RuntimeError("Dataset configuration file has to be uploaded for TE2!") 37 | 38 | if da.dataset_config_file.endswith(".json"): 39 | logger.info("Parsing dataset configuration as local json file") 40 | contracts = load_dataset_config_from_json(da.dataset_config_file) 41 | else: 42 | raise RuntimeError("Please upload dataset configuration in correct JSON format!") 43 | 44 | baseline, fields = self.process_sample_format(da.dataset_text_field) 45 | 46 | self.baseline = baseline 47 | 48 | self.contract = {} 49 | self.m = {} 50 | self.reg = {} 51 | 52 | # For each field found in the input format, extract info from contract 53 | for field in fields: 54 | contract = contracts[field] 55 | 56 | m = {} 57 | m[1] = contract["bs1"] 58 | batch_sizes = [2**i for i in range(1, 5) if 2**i <= contract["len"]] 59 | for bs in batch_sizes: 60 | m[bs] = contract[f"bs{bs}"] 61 | 62 | X = np.array([[i] for i in m.keys()]) 63 | y = np.array(list(m.values())) 64 | 65 | self.contract[field] = contract 66 | self.m[field] = m 67 | self.reg[field] = LinearRegression().fit(X, y) 68 | 69 | def process_sample_format(self, format_str): 70 | """ 71 | Convert an input format string, into the constant baseline part and the fields used from the dataset. 72 | 73 | The baseline part is the number of tokens used in the static string part of the format. 74 | The fields are simply a list of matches of words in {}. 75 | 76 | For example, input format string maybe: 77 | 'Below is a an instruction.... 78 | ### Instruction 79 | {instruction} 80 | ### Input: 81 | {input} 82 | ### Response:' 83 | 84 | In the original data, we have contract information about "instruction" and "input" stored. 85 | In this function, we need to extract out how many tokens make the static portion and 86 | what fields are left over. 87 | """ 88 | matches = re.findall(r'\{(.*?)\}', format_str) 89 | 90 | total = len(format_str) 91 | 92 | slot_len = 0 93 | for m in matches: 94 | # add 2 for the curly braces 95 | slot_len += 2 + len(matches) 96 | 97 | # number of tokens 98 | baseline = (total - slot_len) / 3.6 99 | 100 | return (total - slot_len, matches) 101 | 102 | def get_total_tokens(self): 103 | """ 104 | Since each entry is also formatted with the fmt_string, we need to add the static portions. 105 | """ 106 | total = 0 107 | num_samples = self.get_num_samples() 108 | 109 | # add all the common static tokens, one full set of baseline for each entey 110 | total += self.baseline * num_samples 111 | # now add the full set of tokens for fields that are present in here 112 | for con in self.contract.values(): 113 | total += con["total"] 114 | 115 | return total 116 | 117 | def get_estimated_batch_width(self, bs): 118 | """ 119 | Since multiple fields make up a single entry, we predict average size of each and 120 | also add the baseline width to it. 
121 | """ 122 | width = self.baseline 123 | 124 | for field in self.contract.keys(): 125 | m = self.m[field] 126 | reg = self.reg[field] 127 | if bs in m.keys(): 128 | width += m[bs] 129 | else: 130 | width += reg.predict([[bs]])[0] 131 | 132 | return width 133 | 134 | def get_num_samples(self): 135 | # length of all contracts will be same 136 | con = list(self.contract.values())[0] 137 | return con["len"] 138 | 139 | def get_max_sample_length(self): 140 | res = self.baseline 141 | # this is a very pessimistic view, and not the actual worst case 142 | for con in self.contract.values(): 143 | res += con["max"] 144 | 145 | return res 146 | 147 | 148 | # TODO: generate for all configs and splits 149 | def GenerateTokenEstimator2Contract(dataset, config_name=None, split=None, sample_percent=None): 150 | if dataset.endswith(".json") or dataset.endswith(".jsonl"): 151 | logger.info("Parsing dataset as local json file") 152 | dataset = load_dataset("json", data_files={"train": dataset})["train"] 153 | else: 154 | dataset = load_dataset(dataset, name=config_name, split=split) 155 | 156 | print("Loading data in dataset...") 157 | 158 | feat_tokens = {} 159 | # TODO: run sampling instead of going through it all 160 | num_items = len(dataset) 161 | if sample_percent != None: 162 | if sample_percent > 0 and sample_percent <= 100: 163 | num_items = int(num_items * sample_percent/100) 164 | 165 | # mark all string features to generate contracts for 166 | for feat, f_val in dataset.features.items(): 167 | if f_val.dtype == 'string': 168 | feat_tokens[feat] = [] 169 | 170 | seen_items = 0 171 | for item in tqdm(dataset): 172 | # loop over needed features 173 | for feat in feat_tokens.keys(): 174 | feat_tokens[feat].append(int(len(item[feat]) / 3.6)) 175 | 176 | seen_items += 1 177 | if seen_items >= num_items: 178 | break 179 | 180 | contracts = {} 181 | for feat in feat_tokens.keys(): 182 | tokens = np.sort(feat_tokens[feat])[::-1] 183 | 184 | contract = {} 185 | contract["len"] = len(tokens) 186 | contract["total"] = int(np.sum(tokens)) 187 | contract["min"] = int(np.min(tokens)) 188 | contract["max"] = int(np.max(tokens)) 189 | contract["mean"] = round(np.mean(tokens), 2) 190 | contract["std"] = round(np.std(tokens), 2) 191 | 192 | contract["bs1"] = contract["mean"] 193 | 194 | # for bs = 2, 4, 6, 8, 16 195 | batch_sizes = [2**i for i in range(1, 5) if 2**i <= contract["len"]] 196 | for bs in batch_sizes: 197 | contract[f"bs{bs}"] = np.mean(tokens[:int(len(tokens)/bs)]) 198 | 199 | contracts[feat] = contract 200 | 201 | # if we are in sampling mode, rescale the stats 202 | if num_items != len(dataset): 203 | contract["len"] = len(dataset) 204 | contract["total"] = int(contract["total"]*len(dataset)/num_items) 205 | 206 | return contracts 207 | -------------------------------------------------------------------------------- /fm_training_estimator/tokens/te2/te_test1.jsonl: -------------------------------------------------------------------------------- 1 | {"text": "Mercury is the first planet from the Sun and the smallest in the Solar System"} 2 | {"text": "Venus is notable for having the densest atmosphere of the terrestrial planets, composed mostly of carbon dioxide with a thick, global sulfuric acid cloud cover."} 3 | {"text": "Earth is the third planet from the Sun and the only astronomical object known to harbor life. 
This is enabled by Earth being a water world, the only one in the Solar System sustaining liquid surface water."} 4 | {"text": "The surface of Mars is orange-red because it is covered in iron(III) oxide dust, giving it the nickname 'the Red Planet'. Mars is among the brightest objects in Earth's sky and its high-contrast albedo features have made it a common subject for telescope viewing."} 5 | {"text": "Jupiter is the fifth planet from the Sun and the largest in the Solar System. A gas giant, Jupiter's mass is more than two and a half times that of all the other planets in the Solar System combined and slightly less than one one-thousandth the mass of the Sun. Jupiter orbits the Sun at a distance of 5.20 AU (778.5 Gm) with an orbital period of 11.86 years. It is the third brightest natural object in the Earth's night sky after the Moon and Venus and has been observed since prehistoric times. Its name derives from Jupiter, the chief deity of ancient Roman religion. "} 6 | {"text": "Saturn is a gas giant with an average radius of about nine-and-a-half times that of Earth, prominently known for its rings."} 7 | {"text": "Uranus is the seventh planet from the Sun. It is a gaseous cyan-coloured ice giant. Most of the planet is made of water, ammonia, and methane in a supercritical phase of matter, which in astronomy is called 'ice' or volatiles."} 8 | {"text": "Neptune is the eighth and farthest known planet from the Sun. It is the fourth-largest planet in the Solar System by diameter, the third-most-massive planet, and the densest giant planet. It is 17 times the mass of Earth."} 9 | {"text": "Pluto is a dwarf planet in the Kuiper belt, a ring of bodies beyond the orbit of Neptune."} 10 | -------------------------------------------------------------------------------- /fm_training_estimator/tokens/te2/test1.contract.json: -------------------------------------------------------------------------------- 1 | {"text": {"len": 9, "total": 533, "min": 21, "max": 158, "mean": 59.22, "std": 38.78, "bs1": 59.22, "bs2": 88.5, "bs4": 115.5, "bs8": 158.0}} -------------------------------------------------------------------------------- /fm_training_estimator/tokens/te2/test_axb.contract.json: -------------------------------------------------------------------------------- 1 | {"sentence1": {"len": 1104, "total": 29400, "min": 3, "max": 82, "mean": 26.63, "std": 15.51, "bs1": 26.63, "bs2": 38.96376811594203, "bs4": 48.56884057971015, "bs8": 56.07971014492754, "bs16": 62.231884057971016}, "sentence2": {"len": 1104, "total": 29400, "min": 3, "max": 82, "mean": 26.63, "std": 15.51, "bs1": 26.63, "bs2": 38.96376811594203, "bs4": 48.56884057971015, "bs8": 56.07971014492754, "bs16": 62.231884057971016}} -------------------------------------------------------------------------------- /fm_training_estimator/tokens/te2/test_te2.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | from pathlib import Path 3 | 4 | # Local 5 | from ...config import parse 6 | from .te2 import TokenEstimator2, GenerateTokenEstimator2Contract 7 | from ..te0 import TokenEstimator0 8 | 9 | # trick to ensure running this with pytest works from root dir 10 | test_data = (Path(__file__).parent / "te_test1.jsonl").as_posix() 11 | contract_axb = (Path(__file__).parent / "test_axb.contract.json").as_posix() 12 | contract_test1 = (Path(__file__).parent / "test1.contract.json").as_posix() 13 | 14 | def test_te_raw_hf_dataset(): 15 | _, _, _, da, _, _ = parse( 16 | { 17 | "base_model_path": 
"ibm-granite/granite-8b-code-base", 18 | "gpu_memory_in_gb": 80, 19 | "dataset": "super_glue", 20 | "dataset_config_name": "axb", 21 | "dataset_split": "test", 22 | "dataset_text_field": "{sentence1}", 23 | "dataset_config_file": contract_axb 24 | } 25 | ) 26 | 27 | te = TokenEstimator2(da) 28 | te0 = TokenEstimator0(da) 29 | 30 | 31 | assert te.get_num_samples() == 1104, "Number of samples in data should match" 32 | assert te.get_estimated_batch_width(4) < te.get_estimated_batch_width( 33 | 8 34 | ), "Larger batches should have more padding" 35 | assert ( 36 | abs(te.get_estimated_batch_width(1) - (te.get_total_tokens() / 1104)) < 1e6 37 | ), "Estimated batch width for BS=1 should be equal to mean of tokens" 38 | assert ( 39 | te0.get_estimated_batch_width(4) < te.get_estimated_batch_width(4) 40 | ), "TE0 must be less than TE2 for same batch size" 41 | 42 | 43 | def test_te_raw_json(): 44 | _, _, _, da, _, _ = parse( 45 | { 46 | "base_model_path": "ibm-granite/granite-8b-code-base", 47 | "gpu_memory_in_gb": 80, 48 | "dataset": test_data, 49 | "dataset_text_field": "{text}", 50 | "dataset_config_file": contract_test1 51 | } 52 | ) 53 | 54 | te = TokenEstimator2(da) 55 | te0 = TokenEstimator0(da) 56 | 57 | assert te.get_num_samples() == 9 58 | assert te.get_estimated_batch_width(1) < te.get_estimated_batch_width(2) 59 | assert te0.get_estimated_batch_width(2) < te.get_estimated_batch_width(2) 60 | 61 | def test_te2_contract(): 62 | _, _, _, da, _, _ = parse( 63 | { 64 | "base_model_path": "ibm-granite/granite-8b-code-base", 65 | "gpu_memory_in_gb": 80, 66 | "dataset": test_data, 67 | "dataset_text_field": "{text}", 68 | } 69 | ) 70 | 71 | contract = GenerateTokenEstimator2Contract(da.dataset) 72 | 73 | assert contract["text"]["len"] == 9 74 | -------------------------------------------------------------------------------- /fm_training_estimator/ui/README.md: -------------------------------------------------------------------------------- 1 | # ui 2 | 3 | ## UI configuration options 4 | 5 | ### Lookup path 6 | 7 | Path to file with raw CSV data. Look into the `regressor` folder to learn more about data formats. 8 | 9 | ### Model path 10 | 11 | Path to model built using the `regressor` module. 12 | 13 | ## cli 14 | 15 | To use the cli: 16 | ``` 17 | python -m fm_training_estimator.ui.cli -l -m 18 | ``` 19 | Lookup file and model file are optional and can be left out. 20 | 21 | First train a memory model: 22 | ``` 23 | python -m fm_training_estimator.regressor.xgboost.train ./fm_training_estimator/regressor/test_data/data2.csv ./test.model.json '["tokens_per_second","memory","memory_act"]' 24 | ``` 25 | 26 | Run with all inputs: 27 | ``` 28 | python -m fm_training_estimator.ui.cli \ 29 | ./fm_training_estimator/config/test_configs/config2.json \ 30 | -l ./fm_training_estimator/regressor/test_data/data2.csv \ 31 | -m ./test.model.json 32 | ``` 33 | `config2.json` is an example of the setup where Lookup would work. `config3.json` is an example where lookup will fail and the system will fall back to regression. 34 | 35 | ## api 36 | 37 | Run the api: 38 | ``` 39 | make run-api 40 | ``` 41 | 42 | Now, you can get an estimate for the config using something like the following: 43 | ``` 44 | curl localhost:3000/api/estimate -d@ 45 | ``` 46 | Notice that the request is a POST, since we need to pass in config json as a request body. 
47 | 48 | ## web 49 | 50 | To use the web ui: 51 | ``` 52 | python -m fm_training_estimator.ui.web 53 | ``` 54 | 55 | To enable whitelisting of models, you can pass in the path of a txt file with one model per line. See the file `model_whitelist.txt` for an example. Use as: 56 | ``` 57 | python -m fm_training_estimator.ui.web ./model_whitelist.txt 58 | ``` 59 | 60 | To enable the lookup and regression based hybrid estimator: 61 | ``` 62 | python -m fm_training_estimator.ui.web ./model_whitelist.txt \ 63 | ../regressor/test_data/data2.csv \ 64 | ../../test.model.json 65 | ``` 66 | 67 | As with the cli version, first train the model to use. 68 | -------------------------------------------------------------------------------- /fm_training_estimator/ui/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .core import run 3 | -------------------------------------------------------------------------------- /fm_training_estimator/ui/api.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | from typing import Any 3 | import json 4 | import logging 5 | 6 | # Third Party 7 | from fastapi import Body, FastAPI 8 | import fire 9 | import uvicorn 10 | 11 | # Local 12 | from .core import run 13 | 14 | 15 | def api(data_path, model_path): 16 | app = FastAPI() 17 | 18 | @app.post("/api/estimate") 19 | def estimate(config: Any = Body()): 20 | conf = json.loads(config) 21 | output = run(conf, data_path, model_path) 22 | # this default float business is needed to deal with numpy.float32 23 | # types present in the output json which don't serialize out of the box 24 | return json.dumps(output, default=float) 25 | 26 | return app 27 | 28 | 29 | def run_api(data_path=None, model_path=None, port=3000): 30 | 31 | app = api(data_path, model_path) 32 | uvicorn.run(app, host="0.0.0.0", port=port) 33 | 34 | 35 | if __name__ == "__main__": 36 | logging.basicConfig(level=logging.INFO) 37 | fire.Fire(run_api) 38 | -------------------------------------------------------------------------------- /fm_training_estimator/ui/cli.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | from pathlib import Path 3 | from typing import Optional 4 | import json 5 | import logging 6 | 7 | # Third Party 8 | import fire 9 | 10 | # Local 11 | from .core import run 12 | 13 | 14 | def run_cli( 15 | config: str, 16 | output_path: str = "", 17 | log_level: str = "INFO", 18 | lookup_data_path: Optional[str] = None, 19 | model_path: Optional[str] = None, 20 | ): 21 | """Run the CLI.""" 22 | log_level = log_level.upper() 23 | logging.basicConfig(level=log_level) 24 | output = run( 25 | config=config, 26 | lookup_data_path=lookup_data_path, 27 | model_path=model_path, 28 | ) 29 | output_json = json.dumps(output, indent=4) 30 | if output_path == "": 31 | # use print instead of logging so that 32 | # the output can be parsed as valid json 33 | print(output_json) 34 | return 35 | output_path: Path = Path(output_path) 36 | output_path.parent.mkdir(mode=0o755, parents=True, exist_ok=True) 37 | logging.info("writing the output to a file at %s", output_path) 38 | with open(output_path, "w", encoding="utf-8") as f: 39 | f.write(output_json) 40 | 41 | 42 | if __name__ == "__main__": 43 | fire.Fire(run_cli) 44 | -------------------------------------------------------------------------------- /fm_training_estimator/ui/core.py:
-------------------------------------------------------------------------------- 1 | # Local 2 | from ..config import is_fsdp, parse 3 | from ..memory import HybridEstimator, HybridLoraEstimator, HybridQLoraEstimator 4 | from ..throughput import HybridSpeedEstimator 5 | from ..time import get_total_time 6 | from ..tokens import TokenEstimator0, TokenEstimator2 7 | from ..utils import fmt_size 8 | 9 | 10 | def run(config, lookup_data_path=None, model_path=None): 11 | 12 | res = {} 13 | fm, ta, ia, da, la, qla = parse(config) 14 | 15 | token_est = None 16 | if da.te_approach == 0: 17 | token_est = TokenEstimator0(da) 18 | elif da.te_approach == 2: 19 | token_est = TokenEstimator2(da) 20 | 21 | if token_est is not None: 22 | data_max_width = token_est.get_max_sample_length() 23 | if data_max_width < fm.block_size: 24 | fm.block_size = data_max_width 25 | 26 | if fm.technique == "lora": 27 | est = HybridLoraEstimator(fm, ta, ia, la, lookup_data_path, model_path) 28 | elif fm.technique == "qlora": 29 | est = HybridQLoraEstimator(fm, ta, ia, la, qla, lookup_data_path, model_path) 30 | else: 31 | est = HybridEstimator(fm, ta, ia, lookup_data_path, model_path) 32 | 33 | res["total_mem_estimate_og"] = float(est.get_total_mem_estimate()) 34 | res["activation_memory_og"] = float(est.calculate_activation_memory()) 35 | res["gradient_memory_og"] = float(est.calculate_gradient_memory()) 36 | res["model_memory_og"] = float(est.calculate_model_memory()) 37 | res["optimizer_memory_og"] = float(est.calculate_optimizer_memory()) 38 | 39 | res["total_mem_estimate"] = fmt_size(res["total_mem_estimate_og"]) 40 | res["activation_memory"] = fmt_size(res["activation_memory_og"]) 41 | res["gradient_memory"] = fmt_size(res["gradient_memory_og"]) 42 | res["model_memory"] = fmt_size(res["model_memory_og"]) 43 | res["optimizer_memory"] = fmt_size(res["optimizer_memory_og"]) 44 | 45 | res["num_gpus"] = ia.numGpusPerPod 46 | 47 | if ia.numGpusPerPod == 0: 48 | if fm.technique == "full" and is_fsdp(ta): 49 | res["num_gpus"] = est.fsdp_est.get_number_of_gpus() 50 | elif fm.technique == "lora" or fm.technique == "qlora": 51 | res["num_gpus"] = est.num_gpus 52 | else: 53 | res["num_gpus"] = 1 54 | 55 | ia.numGpusPerPod = res["num_gpus"] 56 | 57 | # No suitable configuration found 58 | if res["num_gpus"] == -1: 59 | return {"error": "Input configuration is infeasible!"} 60 | 61 | speed_est = HybridSpeedEstimator(fm, ta, ia, lookup_data_path, model_path) 62 | res["tps"] = float(speed_est.get_tps()) 63 | 64 | if token_est is not None: 65 | res["tokens_per_sample"] = int( 66 | token_est.get_estimated_batch_width(ta.per_device_train_batch_size) 67 | ) 68 | res["total_tokens"] = int(token_est.get_total_tokens()) 69 | 70 | # get the updated tps for this estimated token width 71 | res["tps"] = float(speed_est.get_tps(res["tokens_per_sample"])) 72 | 73 | time_total, time_train = get_total_time(ta, ia, token_est, res["tps"], res["total_tokens"]) 74 | res["time"] = time_total 75 | res["time_train"] = time_train 76 | 77 | return res 78 | -------------------------------------------------------------------------------- /fm_training_estimator/ui/model_whitelist.txt: -------------------------------------------------------------------------------- 1 | ibm-granite/granite-3b-code-base 2 | ibm-granite/granite-3b-code-instruct 3 | ibm-granite/granite-7b-base 4 | ibm-granite/granite-7b-instruct 5 | ibm-granite/granite-8b-code-base 6 | ibm-granite/granite-8b-code-instruct 7 | ibm-granite/granite-20b-code-base 8 | ibm-granite/granite-20b-code-instruct 9
| ibm-granite/granite-34b-code-base 10 | ibm-granite/granite-34b-code-instruct 11 | instructlab/merlinite-7b-lab 12 | instructlab/granite-7b-lab 13 | -------------------------------------------------------------------------------- /fm_training_estimator/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .model import extract_model_features, get_model_max_length 3 | from .utils import ( 4 | fmt_size, 5 | get_human_readable_number, 6 | get_size_from_precision, 7 | logger, 8 | unmarshal, 9 | ) 10 | 11 | __all__ = [ 12 | "unmarshal", 13 | "get_size_from_precision", 14 | "get_human_readable_number", 15 | "fmt_size", 16 | "get_model_max_length", 17 | "logger", 18 | "extract_model_features", 19 | ] 20 | -------------------------------------------------------------------------------- /fm_training_estimator/utils/model.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | import logging 3 | 4 | # Third Party 5 | from transformers import AutoConfig 6 | 7 | 8 | def get_model_max_length(model_path: str) -> int: 9 | """return model's max sequence length by looking up its config 10 | 11 | Args: 12 | model_path (str): model path on filesystem or hugging face id 13 | 14 | Returns: 15 | int: max sequence length 16 | """ 17 | config = AutoConfig.from_pretrained(model_path) 18 | n_positions = 4096 19 | if hasattr(config, "n_positions"): 20 | n_positions = config.n_positions 21 | elif hasattr(config, "max_position_embeddings"): 22 | n_positions = config.max_position_embeddings 23 | return n_positions 24 | 25 | 26 | def extract_model_features(model, fmt="dict"): 27 | """Given a model name in HF format, extract the params of the model. 28 | 29 | Can return the data in one of the supported formats.
30 | "dict": return a dictionary 31 | "list": return a list 32 | "csv": return a comma separated string of values 33 | """ 34 | try: 35 | conf = AutoConfig.from_pretrained(model) 36 | conf = conf.to_dict() 37 | res = {} 38 | 39 | # TODO: include number of params here 40 | # need to refactor code out from Full Memory Estimator class 41 | 42 | res["model_arch"] = conf["architectures"][0] 43 | 44 | if "hidden_size" in conf: 45 | res["model_hidden_size"] = conf["hidden_size"] 46 | elif "n_embd" in conf: 47 | res["model_hidden_size"] = conf["n_embd"] 48 | elif "n_embed" in conf: 49 | res["model_hidden_size"] = conf["n_embed"] 50 | else: 51 | res["model_hidden_size"] = -1 52 | 53 | if "intermediate_size" in conf: 54 | res["model_intermediate_size"] = conf["intermediate_size"] 55 | elif "n_inner" in conf: 56 | res["model_intermediate_size"] = conf["n_inner"] 57 | else: 58 | res["model_intermediate_size"] = -1 59 | 60 | if "num_attention_heads" in conf: 61 | res["model_num_attn_heads"] = conf["num_attention_heads"] 62 | elif "n_head" in conf: 63 | res["model_num_attn_heads"] = conf["n_head"] 64 | else: 65 | res["model_num_attn_heads"] = -1 66 | 67 | if "num_hidden_layers" in conf: 68 | res["model_num_hidden_layers"] = conf["num_hidden_layers"] 69 | elif "n_layer" in conf: 70 | res["model_num_hidden_layers"] = conf["n_layer"] 71 | else: 72 | res["model_num_hidden_layers"] = -1 73 | 74 | if "num_key_value_heads" in conf: 75 | res["model_num_key_value_heads"] = conf["num_key_value_heads"] 76 | else: 77 | res["model_num_key_value_heads"] = res["model_num_attn_heads"] 78 | 79 | except Exception as e: 80 | logging.error(e) 81 | logging.warning("Returning empty response!") 82 | res = {} 83 | 84 | if fmt == "dict": 85 | return res 86 | 87 | if fmt == "list": 88 | return list(res.values()) 89 | 90 | if fmt == "csv": 91 | out = "" 92 | for v in res.values(): 93 | out += f"{v}," 94 | return out[:-1] 95 | 96 | logging.warning("Unknown format selected: ", fmt) 97 | return res 98 | -------------------------------------------------------------------------------- /fm_training_estimator/utils/test_model.py: -------------------------------------------------------------------------------- 1 | # Local 2 | from .model import extract_model_features 3 | 4 | 5 | def test_extract_model_features(): 6 | 7 | # default dict format 8 | res = extract_model_features("ibm-granite/granite-3b-code-base") 9 | assert res["model_num_hidden_layers"] == 32 10 | 11 | res = extract_model_features("ibm-granite/granite-3b-code-base", fmt="list") 12 | assert res == ["LlamaForCausalLM", 2560, 10240, 32, 32, 32] 13 | 14 | # extract in csv format 15 | res = extract_model_features("ibm-granite/granite-3b-code-base", fmt="csv") 16 | assert res == "LlamaForCausalLM,2560,10240,32,32,32" 17 | 18 | # example from different format 19 | res = extract_model_features("ibm-granite/granite-20b-code-base", fmt="list") 20 | assert res == ["GPTBigCodeForCausalLM", 6144, 24576, 48, 52, 48] 21 | -------------------------------------------------------------------------------- /fm_training_estimator/utils/utils.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | from typing import Dict 3 | import json 4 | import logging 5 | import math 6 | import os 7 | 8 | # Third Party 9 | import yaml 10 | 11 | logger = logging.getLogger("fm_training_estimator") 12 | log_level = os.getenv("LOG_LEVEL", "INFO").upper() 13 | 14 | # Validate the log level 15 | numeric_level = getattr(logging, log_level, None) 16 | if not 
isinstance(numeric_level, int): 17 | raise ValueError(f"Invalid log level: {log_level}") 18 | 19 | logging.basicConfig(level=numeric_level) 20 | logging.info( 21 | f"FM Training Estimator utils: Set central logging config to level {log_level}." 22 | ) 23 | 24 | 25 | def unmarshal(path: str) -> Dict: 26 | """load data from the given filesystem path and return python dict 27 | 28 | Args: 29 | path (str): path to json or yaml file 30 | 31 | Returns: 32 | Dict: loaded data as python dictionary 33 | """ 34 | if not path.endswith((".json", ".yaml", ".yml")): 35 | raise ValueError( 36 | "path to unmarshal should have either json or yaml extension, but got {path}".format( 37 | path=path 38 | ) 39 | ) 40 | with open(path, "r", encoding="utf8") as f: 41 | if path.endswith(".json"): 42 | return json.load(f) 43 | return yaml.safe_load(f) 44 | 45 | 46 | def get_size_from_precision(precision: str) -> float: 47 | """return multiplier based on the precision 48 | 49 | Args: 50 | precision (str): parameter precision 51 | 52 | Returns: 53 | float: multiplier 54 | """ 55 | if precision in ("float16", "bfloat16"): 56 | return 2 57 | if precision == "float32": 58 | return 4 59 | if precision == "nf4": 60 | return 0.5 61 | return 4 62 | 63 | 64 | # https://stackoverflow.com/a/3155023 65 | def get_human_readable_number(number: int) -> str: 66 | """return human readable format with denomination for the given number 67 | 68 | Args: 69 | number (int): number 70 | 71 | Returns: 72 | str: human readable string for number with denomination 73 | """ 74 | denominations = ["", " Thousand", " Million", " Billion", " Trillion"] 75 | number = float(number) 76 | millidx = max( 77 | 0, 78 | min( 79 | len(denominations) - 1, 80 | int(math.floor(0 if number == 0 else math.log10(abs(number)) / 3)), 81 | ), 82 | ) 83 | 84 | return "{:.0f}{}".format(number / 10 ** (3 * millidx), denominations[millidx]) 85 | 86 | 87 | # https://stackoverflow.com/a/1094933 88 | def fmt_size(size: int) -> str: 89 | """returns human readable format for the given size in bytes 90 | 91 | Args: 92 | size (int): number of bytes 93 | 94 | Returns: 95 | str: human readable string format for the bytes 96 | """ 97 | for unit in ("", "Ki", "Mi", "Gi", "Ti"): 98 | if abs(size) < 1024.0: 99 | return f"{size:3.1f} {unit}B" 100 | size /= 1024.0 101 | 102 | return f"{size:.1f} PiB" 103 | -------------------------------------------------------------------------------- /imgs/build-model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-model-stack/fm-training-estimator/ba3f02ab129877c6fa6dabca46f860ce0f95c62c/imgs/build-model.png -------------------------------------------------------------------------------- /imgs/demo-cli.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/foundation-model-stack/fm-training-estimator/ba3f02ab129877c6fa6dabca46f860ce0f95c62c/imgs/demo-cli.gif -------------------------------------------------------------------------------- /launch_estimator.py: -------------------------------------------------------------------------------- 1 | # Copyright The FM Training Estimator Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Script wraps fm_training_estimator to run with user provided training configs. 15 | The script will read configuration via environment variable `ESTIMATOR_INPUT_JSON_PATH` 16 | for the path to the JSON config file or `ESTIMATOR_INPUT_JSON_ENV_VAR` 17 | for the encoded config string to parse. 18 | """ 19 | 20 | # Standard 21 | from pathlib import Path 22 | import base64 23 | import json 24 | import logging 25 | import os 26 | import pickle 27 | import subprocess 28 | import sys 29 | import traceback 30 | 31 | # First Party 32 | from fm_training_estimator.config.arguments import EstimateInput, MemoryEstimate 33 | from fm_training_estimator.sdk import estimate_memory, estimate_time, estimate_tokens 34 | 35 | logging.basicConfig(level=logging.INFO) 36 | 37 | 38 | def main(): 39 | ########## 40 | # 41 | # Parse arguments 42 | # 43 | ########## 44 | try: 45 | input_dict = get_input_dict() 46 | logging.info("estimator launch parsed input json: %s", input_dict) 47 | if not input_dict: 48 | raise ValueError( 49 | "Must set environment variable 'ESTIMATOR_INPUT_JSON_PATH'\ 50 | or 'ESTIMATOR_INPUT_JSON_ENV_VAR'." 51 | ) 52 | 53 | except FileNotFoundError as e: 54 | logging.error(traceback.format_exc()) 55 | sys.exit(1) 56 | except (TypeError, ValueError, EnvironmentError) as e: 57 | logging.error(traceback.format_exc()) 58 | sys.exit(1) 59 | except Exception as e: # pylint: disable=broad-except 60 | logging.error(traceback.format_exc()) 61 | sys.exit(1) 62 | 63 | ########## 64 | # 65 | # Run the estimator 66 | # 67 | ########## 68 | model_path = os.getenv("ESTIMATOR_MODEL_PATH") 69 | estimator_input = EstimateInput.from_dict(input_dict) 70 | 71 | out_path = os.getenv("ESTIMATOR_OUTPUT_PATH", "estimator_output") 72 | if not os.path.exists(out_path): 73 | os.makedirs(out_path) 74 | 75 | out_content = "Input parsed for this estimate: " + str(estimator_input) + "\n\n" 76 | 77 | ############ Memory ############ 78 | out_content += "Estimating Memory:....\n" 79 | 80 | memory_output = estimate_memory(estimator_input) 81 | f = open(os.path.join(out_path, "memory_theory.json"), "w") 82 | f.write(json.dumps(memory_output.__dict__)) 83 | f.close() 84 | 85 | out_content += "With only theory: " + str(memory_output) + "\n" 86 | if model_path: 87 | memory_output = estimate_memory(estimator_input, model_path) 88 | out_content += "With reg model: " + str(memory_output) + "\n" 89 | f = open(os.path.join(out_path, "memory_hybrid.json"), "w") 90 | f.write(json.dumps(memory_output.__dict__)) 91 | f.close() 92 | 93 | ############ Time ############ 94 | out_content += "\n" * 3 95 | out_content += "Estimating Time:....\n" 96 | 97 | time_output = estimate_time(estimator_input) 98 | f = open(os.path.join(out_path, "time_theory.json"), "w") 99 | f.write(json.dumps(time_output.__dict__)) 100 | f.close() 101 | 102 | out_content += "With only theory: " + str(time_output) + "\n" 103 | if model_path: 104 | time_output = estimate_time(estimator_input, model_path) 105 | out_content += "With reg model: " + str(time_output) + "\n" 106 | f = open(os.path.join(out_path, "time_hybrid.json"), 
"w") 107 | f.write(json.dumps(time_output.__dict__)) 108 | f.close() 109 | 110 | ############ Tps ############ 111 | out_content += "\n" * 3 112 | out_content += "Estimating tps:....\n" 113 | 114 | tps_output = estimate_tokens(estimator_input) 115 | f = open(os.path.join(out_path, "tps_theory.json"), "w") 116 | f.write(json.dumps(tps_output.__dict__)) 117 | f.close() 118 | 119 | out_content += "With only theory: " + str(tps_output) + "\n" 120 | if model_path: 121 | tps_output = estimate_tokens(estimator_input, model_path) 122 | out_content += "With reg model: " + str(tps_output) + "\n" 123 | f = open(os.path.join(out_path, "tps_hybrid.json"), "w") 124 | f.write(json.dumps(tps_output.__dict__)) 125 | f.close() 126 | 127 | print(out_content) 128 | 129 | f = open(os.path.join(out_path, "output.txt"), "w") 130 | f.write(out_content) 131 | f.close() 132 | return 0 133 | 134 | 135 | def get_input_dict(): 136 | """Parses JSON configuration if provided via environment variables 137 | ESTIMATOR_INPUT_JSON_ENV_VAR or ESTIMATOR_INPUT_JSON_PATH. 138 | 139 | ESTIMATOR_INPUT_JSON_ENV_VAR is the base64 encoded JSON. 140 | ESTIMATOR_INPUT_JSON_PATH is the path to the JSON config file. 141 | 142 | Returns: dict or {} 143 | """ 144 | json_env_var = os.getenv("ESTIMATOR_INPUT_JSON_ENV_VAR") 145 | json_path = os.getenv("ESTIMATOR_INPUT_JSON_PATH") 146 | 147 | # accepts either path to JSON file or encoded string config 148 | # env var takes precedent 149 | input_dict = {} 150 | if json_env_var: 151 | input_dict = txt_to_obj(json_env_var) 152 | elif json_path: 153 | with open(json_path, "r", encoding="utf-8") as f: 154 | input_dict = json.load(f) 155 | 156 | return input_dict 157 | 158 | 159 | def txt_to_obj(txt): 160 | """Given encoded byte string, converts to base64 decoded dict. 161 | 162 | Args: 163 | txt: str 164 | Returns: dict[str, Any] 165 | """ 166 | base64_bytes = txt.encode("ascii") 167 | message_bytes = base64.b64decode(base64_bytes) 168 | try: 169 | # If the bytes represent JSON string 170 | return json.loads(message_bytes) 171 | except UnicodeDecodeError: 172 | # Otherwise the bytes are a pickled python dictionary 173 | return pickle.loads(message_bytes) 174 | 175 | 176 | if __name__ == "__main__": 177 | main() 178 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0", "setuptools-scm>=8.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "fm_training_estimator" 7 | dynamic = ["version"] 8 | authors = [ 9 | { name="Angel Luu", email="angel.luu@us.ibm.com" }, 10 | { name="Chander Govindarajan", email="mail@chandergovind.org" }, 11 | ] 12 | description = "A package of Estimators for Large Language Model Training." 
13 | license = {text = "Apache-2.0"} 14 | readme = "README.md" 15 | requires-python = ">=3.8" 16 | classifiers = [ 17 | "Programming Language :: Python :: 3", 18 | "License :: OSI Approved :: Apache Software License", 19 | "Operating System :: OS Independent", 20 | ] 21 | dependencies = [ 22 | "tox", 23 | "pre-commit", 24 | "transformers", 25 | "peft", 26 | "setuptools", 27 | "fire", 28 | "pandas", 29 | "xgboost", 30 | "scikit-learn<1.6.0", 31 | "gradio", 32 | "datasets", 33 | "dataclass-wizard", 34 | "uvicorn", 35 | "arise-predictions==1.0.2" 36 | ] 37 | 38 | [project.urls] 39 | Source = "https://github.com/foundation-model-stack/fm-training-estimator" 40 | 41 | 42 | [project.optional-dependencies] 43 | dev-docs = [ 44 | "sphinx>=4.0.2,<8.0", 45 | "sphinx-autoapi>=2.1.0", 46 | "sphinx-rtd-theme>=1.2.1,<2.1.0", 47 | ] 48 | 49 | [tool.setuptools.packages.find] 50 | exclude = ["tests", "tests.*", "test_*.py"] 51 | namespaces = false 52 | 53 | [tool.setuptools_scm] 54 | version_file = "fm_training_estimator/_version.py" -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = lint, fmt, test 3 | 4 | [testenv:fmt] 5 | description = format with pre-commit 6 | deps = 7 | pre-commit 8 | tox 9 | commands = bash ./tox.sh 10 | allowlist_externals = bash,./tox.sh 11 | 12 | [testenv:lint] 13 | description = lint with pylint 14 | deps = 15 | pylint>=2.16.2,<=3.1.0 16 | pytest 17 | commands = pylint fm_training_estimator 18 | allowlist_externals = pylint 19 | 20 | [testenv:docs] 21 | recreate = True 22 | extras = dev-docs 23 | changedir = docs/source 24 | 25 | ; Disabled '-W' flag due to warnings in the files 26 | ; TODO: Add back in once build warnings fixed 27 | commands = 28 | sphinx-build -E -a -b html -T . _build/html 29 | 30 | [testenv:test] 31 | description = test with pytest 32 | deps = 33 | pytest 34 | -r requirements.txt 35 | commands = pytest 36 | allowlist_externals = pytest 37 | 38 | [testenv:build] 39 | description = build wheel 40 | deps = 41 | build 42 | commands = python -m build -w 43 | skip_install = True 44 | 45 | [testenv:twinecheck] 46 | description = check wheel 47 | deps = 48 | twine 49 | commands = twine check dist/* 50 | skip_install = True 51 | -------------------------------------------------------------------------------- /tox.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | pre-commit run --all-files 4 | RETURN_CODE=$? 5 | 6 | function echoWarning() { 7 | LIGHT_YELLOW='\033[1;33m' 8 | NC='\033[0m' # No Color 9 | echo -e "${LIGHT_YELLOW}${1}${NC}" 10 | } 11 | 12 | if [ "$RETURN_CODE" -ne 0 ]; then 13 | if [ "${CI}" != "true" ]; then 14 | echoWarning "☝️ This appears to have failed, but actually your files have been formatted." 15 | echoWarning "Make a new commit with these changes before making a pull request." 16 | else 17 | echoWarning "This test failed because your code isn't formatted correctly." 18 | echoWarning 'Locally, run `make fmt`, it will appear to fail, but change files.' 19 | echoWarning "Add the changed files to your commit and this stage will pass." 20 | fi 21 | 22 | exit $RETURN_CODE 23 | fi 24 | --------------------------------------------------------------------------------