├── .github ├── actionlint.yaml ├── actions │ └── free-disk-space │ │ └── action.yml ├── dependabot.yml ├── mergify.yml ├── stale_bot.yml └── workflows │ ├── actionlint.dockerfile │ ├── actionlint.yml │ ├── constraints-update.yml │ ├── docs.yml │ ├── e2e-nvidia-l4-x1.yml │ ├── e2e-nvidia-l40s-x4.yml │ ├── lint.yml │ ├── matchers │ ├── actionlint.json │ └── pylint.json │ ├── pypi.yaml │ ├── spellcheck.yml │ └── test.yml ├── .gitignore ├── .isort.cfg ├── .markdownlint-cli2.yaml ├── .pre-commit-config.yaml ├── .pylintrc ├── .spellcheck-en-custom.txt ├── .spellcheck.yml ├── CHANGELOG.md ├── DCO.txt ├── LICENSE ├── Makefile ├── README.md ├── constraints-dev.txt ├── constraints-dev.txt.in ├── docs ├── ci.md └── release-strategy.md ├── pyproject.toml ├── requirements-dev.txt ├── requirements-files.in ├── requirements-leaderboard.txt ├── requirements-ruler.txt ├── requirements.txt ├── scripts ├── evaluate_best_checkpoint.py ├── functional-tests.sh ├── ruff.sh ├── test_branch_gen_answers.py ├── test_branch_generator.py ├── test_branch_judge_answers.py ├── test_gen_answers.py ├── test_judge_answers.py ├── test_leaderboard.py ├── test_mmlu.py └── test_mmlu_branch.py ├── src └── instructlab │ ├── __init__.py │ └── eval │ ├── __init__.py │ ├── data │ ├── mt_bench │ │ ├── judge_prompts.jsonl │ │ ├── question.jsonl │ │ └── reference_answer │ │ │ └── gpt-4.jsonl │ └── mt_bench_branch │ │ └── judge_prompts.jsonl │ ├── evaluator.py │ ├── exceptions.py │ ├── leaderboard.py │ ├── logger_config.py │ ├── mmlu.py │ ├── mt_bench.py │ ├── mt_bench_answers.py │ ├── mt_bench_branch_generator.py │ ├── mt_bench_common.py │ ├── mt_bench_conversation.py │ ├── mt_bench_judgment.py │ ├── mt_bench_model_adapter.py │ ├── ragas.py │ └── ruler.py ├── tests ├── test_mmlu.py ├── test_mt_bench.py ├── test_mt_bench_answers.py ├── test_mt_bench_common.py ├── test_mt_bench_judgment.py ├── test_mt_bench_model_adapter.py ├── test_project.py ├── test_ragas.py └── testdata │ └── sdg │ ├── _default_template_yaml │ ├── tonsil_data.jsonl │ └── tonsil_task.yaml └── tox.ini /.github/actionlint.yaml: -------------------------------------------------------------------------------- 1 | self-hosted-runner: 2 | labels: 3 | - ubuntu-gpu 4 | -------------------------------------------------------------------------------- /.github/actions/free-disk-space/action.yml: -------------------------------------------------------------------------------- 1 | name: 'Free Disk Space' 2 | description: 'Frees disk space on the runner' 3 | runs: 4 | using: "composite" 5 | steps: 6 | - run: | 7 | df -h 8 | sudo docker rmi "$(docker image ls -aq)" >/dev/null 2>&1 || true 9 | sudo rm -rf \ 10 | /usr/share/dotnet /usr/local/lib/android /opt/ghc \ 11 | /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup \ 12 | /usr/lib/jvm || true 13 | sudo apt install aptitude -y >/dev/null 2>&1 14 | sudo aptitude purge '~n ^mysql' -f -y >/dev/null 2>&1 15 | sudo aptitude purge '~n ^dotnet' -f -y >/dev/null 2>&1 16 | sudo apt-get autoremove -y >/dev/null 2>&1 17 | sudo apt-get autoclean -y >/dev/null 2>&1 18 | df -h 19 | shell: bash 20 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | # GitHub Dependabot configuration file 4 | version: 2 5 | updates: 6 | 7 | # Maintain dependencies for GitHub Actions 8 | - package-ecosystem: "github-actions" 9 | directory: "/" 10 | schedule: 11 | 
interval: "daily" 12 | - package-ecosystem: "docker" 13 | directory: "/.github/workflows" 14 | schedule: 15 | interval: "daily" 16 | -------------------------------------------------------------------------------- /.github/mergify.yml: -------------------------------------------------------------------------------- 1 | pull_request_rules: 2 | - name: auto-merge 3 | description: automatic merge for main with >= 2 approved reviews, all requested reviews have given feedback, not held, and CI is successful 4 | conditions: 5 | - "#approved-reviews-by>=2" 6 | - "#review-requested=0" 7 | - "#changes-requested-reviews-by=0" 8 | - or: 9 | - base=main 10 | - base~=^release- 11 | - label!=hold 12 | - label!=do-not-merge 13 | - label!=needs-rebase 14 | 15 | # The files conditions regex should match the globs in workflow files 16 | # If workflow configuration files in .github/ are changed, the actionlint check must pass 17 | - or: 18 | - and: 19 | - check-success=actionlint 20 | - or: 21 | - files~=^\.github/(actions|workflows)/.*\.ya?ml$ 22 | - files~=^\.github/workflows/actionlint\. 23 | - and: 24 | - -files~=^\.github/(actions|workflows)/.*\.ya?ml$ 25 | - -files~=^\.github/workflows/actionlint\. 26 | 27 | # e2e medium workflow 28 | - or: 29 | - and: 30 | # note this should match the triggering criteria in 'e2e-nvidia-l4-x1.yml' 31 | - check-success~=e2e-medium-workflow-complete 32 | - or: 33 | - files~=\.py$ 34 | - files=pyproject.toml 35 | - files~=^requirements.*\.txt$ 36 | - files=.github/workflows/e2e-nvidia-l4-x1.yml 37 | - and: 38 | - -files~=\.py$ 39 | - -files=pyproject.toml 40 | - -files~=^requirements.*\.txt$ 41 | - -files=.github/workflows/e2e-nvidia-l4-x1.yml 42 | 43 | # lint must pass if files change that would trigger this job 44 | - or: 45 | - and: 46 | - check-success=lint-workflow-complete 47 | - or: 48 | # see .github/workflows/lint.yml and test.yml 49 | - files~=\.py$ 50 | - files=pyproject.toml 51 | - files~=^requirements.*\.txt$ 52 | - files=tox.ini 53 | - files~=^scripts/[^/]+\.sh$ 54 | - files=.github/workflows/lint.yml 55 | - and: 56 | - -files~=\.py$ 57 | - -files=pyproject.toml 58 | - -files~=^requirements.*\.txt$ 59 | - -files=tox.ini 60 | - -files~=^scripts/[^/]+\.sh$ 61 | - -files=.github/workflows/lint.yml 62 | 63 | - or: 64 | - and: 65 | - check-success=markdown-lint 66 | - or: 67 | - files~=\.md$ 68 | - files=.markdownlint-cli2.yaml 69 | - files=.github/workflows/docs.yml 70 | - and: 71 | - -files~=\.md$ 72 | - -files=.markdownlint-cli2.yaml 73 | - -files=.github/workflows/docs.yml 74 | 75 | - or: 76 | - and: 77 | - check-success=spellcheck 78 | - or: 79 | - files~=\.md$ 80 | - files=tox.ini 81 | - files~=^\.spellcheck[^/]+$ 82 | - files=.github/workflows/spellcheck.yml 83 | - and: 84 | - -files~=\.md$ 85 | - -files=tox.ini 86 | - -files~=^\.spellcheck[^/]+$ 87 | - -files=.github/workflows/spellcheck.yml 88 | 89 | actions: 90 | merge: 91 | method: merge 92 | delete_head_branch: 93 | 94 | - name: label-cicd 95 | description: Automatically apply CI/CD label 96 | conditions: 97 | - or: 98 | - files=.github/mergify.yml 99 | - files~=^\.github/(actions|workflows)/ 100 | - files=scripts/ruff.sh 101 | - files=.pre-commit-config.yaml 102 | - files=.pylintrc 103 | - files~=^\.spellcheck[^/]+$ 104 | - files=tox.ini 105 | - files=.markdownlint-cli2.yaml 106 | actions: 107 | label: 108 | add: 109 | - CI/CD 110 | 111 | - name: label-documentation 112 | description: Automatically apply documentation label 113 | conditions: 114 | - or: 115 | - files~=^[^/]+\.md$ 116 | actions: 117 | 
label: 118 | add: 119 | - documentation 120 | 121 | - name: label-testing 122 | description: Automatically apply testing label 123 | conditions: 124 | - or: 125 | - files~=^tests/ 126 | - files=tox.ini 127 | actions: 128 | label: 129 | add: 130 | - testing 131 | 132 | - name: ping author on conflicts and add 'needs-rebase' label 133 | conditions: 134 | - conflict 135 | - -closed 136 | actions: 137 | label: 138 | add: 139 | - needs-rebase 140 | comment: 141 | message: | 142 | This pull request has merge conflicts that must be resolved before it can be 143 | merged. @{{author}} please rebase it. https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork 144 | 145 | - name: remove 'needs-rebase' label when conflict is resolved 146 | conditions: 147 | - -conflict 148 | - -closed 149 | actions: 150 | label: 151 | remove: 152 | - needs-rebase 153 | 154 | - name: release-branch-label 155 | description: Automatically apply the release-branch label to release branch PRs 156 | conditions: 157 | - base~=^release- 158 | actions: 159 | label: 160 | add: 161 | - release-branch 162 | 163 | - name: Apply ci-failure label if any CI checks have failed 164 | conditions: 165 | - "#check-failure>0" 166 | actions: 167 | label: 168 | add: 169 | - ci-failure 170 | 171 | - name: Remove ci-failure label if no failures are present 172 | conditions: 173 | - "#check-failure=0" 174 | actions: 175 | label: 176 | remove: 177 | - ci-failure 178 | 179 | - name: Apply 'one-approval' label if one of the maintainer approved the PR 180 | conditions: 181 | - "#approved-reviews-by=1" 182 | actions: 183 | label: 184 | add: 185 | - one-approval 186 | 187 | - name: Remove 'one-approval' label if the approval was reset 188 | conditions: 189 | - "#approved-reviews-by!=1" 190 | actions: 191 | label: 192 | remove: 193 | - one-approval 194 | 195 | - name: label-dependencies 196 | description: Automatically apply dependencies label 197 | conditions: 198 | - or: 199 | - files~=^requirements.*\.txt$ 200 | - files~=^requirements/ 201 | actions: 202 | label: 203 | add: 204 | - dependencies 205 | -------------------------------------------------------------------------------- /.github/stale_bot.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | name: 'Close stale issues and PRs' 4 | 5 | on: 6 | schedule: 7 | - cron: '30 1 * * *' 8 | 9 | env: 10 | LC_ALL: en_US.UTF-8 11 | 12 | defaults: 13 | run: 14 | shell: bash 15 | 16 | permissions: 17 | contents: read 18 | 19 | jobs: 20 | stale: 21 | permissions: 22 | issues: write 23 | pull-requests: write 24 | runs-on: ubuntu-latest 25 | steps: 26 | - name: "Harden Runner" 27 | uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 28 | with: 29 | egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs 30 | 31 | - name: "Stale Action" 32 | uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0 33 | with: 34 | stale-issue-label: 'stale' 35 | stale-issue-message: > 36 | This issue has been automatically marked as stale because it has not had activity within 90 days. 37 | It will be automatically closed if no further activity occurs within 30 days. 38 | close-issue-message: > 39 | This issue has been automatically closed due to inactivity. Please feel free to reopen if you feel it is still relevant! 
40 | days-before-issue-stale: 90 41 | days-before-issue-close: 30 42 | stale-pr-label: 'stale' 43 | stale-pr-message: > 44 | This pull request has been automatically marked as stale because it has not had activity within 90 days. 45 | It will be automatically closed if no further activity occurs within 30 days. 46 | close-pr-message: > 47 | This pull request has been automatically closed due to inactivity. Please feel free to reopen if you intend to continue working on it! 48 | days-before-pr-stale: 90 49 | days-before-pr-close: 30 50 | -------------------------------------------------------------------------------- /.github/workflows/actionlint.dockerfile: -------------------------------------------------------------------------------- 1 | # Since dependabot cannot update workflows using docker, 2 | # we use this indirection since dependabot can update this file. 3 | FROM rhysd/actionlint:1.7.7@sha256:887a259a5a534f3c4f36cb02dca341673c6089431057242cdc931e9f133147e9 4 | -------------------------------------------------------------------------------- /.github/workflows/actionlint.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | name: Lint GitHub Actions workflows 4 | on: 5 | push: 6 | branches: 7 | - "main" 8 | paths: 9 | - '.github/workflows/*.ya?ml' 10 | - '.github/workflows/actionlint.*' # This workflow 11 | pull_request: 12 | branches: 13 | - "main" 14 | paths: 15 | - '.github/workflows/*.ya?ml' 16 | - '.github/workflows/actionlint.*' # This workflow 17 | 18 | env: 19 | LC_ALL: en_US.UTF-8 20 | 21 | defaults: 22 | run: 23 | shell: bash 24 | 25 | permissions: 26 | contents: read 27 | 28 | jobs: 29 | actionlint: 30 | runs-on: ubuntu-latest 31 | steps: 32 | - name: "Harden Runner" 33 | uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0 34 | with: 35 | egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs 36 | 37 | - name: "Checkout" 38 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 39 | with: 40 | fetch-depth: 0 41 | 42 | - name: "Download actionlint" 43 | run: | 44 | docker build --tag actionlint - < .github/workflows/actionlint.dockerfile 45 | - name: "Check workflow files" 46 | run: | 47 | echo "::add-matcher::.github/workflows/matchers/actionlint.json" 48 | docker run --volume="${PWD}:/repo" --workdir=/repo actionlint -color 49 | -------------------------------------------------------------------------------- /.github/workflows/constraints-update.yml: -------------------------------------------------------------------------------- 1 | name: Update constraints-dev.txt 2 | 3 | on: 4 | schedule: 5 | - cron: '0 3 * * 1' # Every Monday at 03:00 UTC 6 | workflow_dispatch: 7 | 8 | jobs: 9 | update-constraints: 10 | runs-on: ubuntu-latest 11 | permissions: 12 | contents: write 13 | pull-requests: write 14 | 15 | steps: 16 | - name: Checkout repository 17 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 18 | 19 | - name: Checkout "update-constraints" in-house CI action 20 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 21 | with: 22 | repository: instructlab/ci-actions 23 | path: ci-actions 24 | # no tag that includes https://github.com/instructlab/ci-actions/pull/26, yet 25 | ref: 88641ccaf122964eacdc1a82b18bda369b6f99bd # main 26 | sparse-checkout: | 27 | actions/update-constraints 28 | 29 | - name: Update constraints 30 | id: update-constraints 31 | uses: 
./ci-actions/actions/update-constraints 32 | with: 33 | gh-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} 34 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | name: Lint Markdown documents 4 | 5 | on: 6 | push: 7 | branches: 8 | - "main" 9 | paths: 10 | - '**/*.md' 11 | - '.markdownlint-cli2.yaml' 12 | - '.github/workflows/docs.yml' # This workflow 13 | pull_request: 14 | branches: 15 | - "main" 16 | paths: 17 | - '**/*.md' 18 | - '.markdownlint-cli2.yaml' 19 | - '.github/workflows/docs.yml' # This workflow 20 | 21 | env: 22 | LC_ALL: en_US.UTF-8 23 | 24 | defaults: 25 | run: 26 | shell: bash 27 | 28 | permissions: 29 | contents: read 30 | 31 | jobs: 32 | markdown-lint: 33 | runs-on: ubuntu-latest 34 | steps: 35 | - name: "Harden Runner" 36 | uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0 37 | with: 38 | egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs 39 | - name: "Checkout" 40 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 41 | with: 42 | fetch-depth: 0 43 | - name: "Check Markdown documents" 44 | uses: DavidAnson/markdownlint-cli2-action@992badcdf24e3b8eb7e87ff9287fe931bcb00c6e # v20.0.0 45 | with: 46 | globs: '**/*.md' 47 | -------------------------------------------------------------------------------- /.github/workflows/e2e-nvidia-l4-x1.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | name: E2E (NVIDIA L4 x1) 4 | 5 | on: 6 | # run against every merge commit to 'main' and release branches 7 | push: 8 | branches: 9 | - main 10 | - release-* 11 | # only run on PRs that touch certain regex paths 12 | pull_request_target: 13 | branches: 14 | - main 15 | - release-* 16 | paths: 17 | # note this should match the merging criteria in 'mergify.yml' 18 | - '**.py' 19 | - 'pyproject.toml' 20 | - 'requirements**.txt' 21 | - 'constraints-dev.txt' 22 | - '.github/workflows/e2e-nvidia-l4-x1.yml' # This workflow 23 | workflow_dispatch: 24 | 25 | concurrency: 26 | group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} 27 | cancel-in-progress: true 28 | 29 | env: 30 | LC_ALL: en_US.UTF-8 31 | TMPDIR: /home/tmp 32 | 33 | defaults: 34 | run: 35 | shell: bash 36 | 37 | permissions: 38 | contents: read 39 | 40 | jobs: 41 | start-medium-ec2-runner: 42 | runs-on: ubuntu-latest 43 | outputs: 44 | label: ${{ steps.start-ec2-runner.outputs.label }} 45 | ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} 46 | steps: 47 | - name: Configure AWS credentials 48 | uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1 49 | with: 50 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 51 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 52 | aws-region: ${{ vars.AWS_REGION }} 53 | 54 | - name: Start EC2 runner 55 | id: start-ec2-runner 56 | uses: machulav/ec2-github-runner@fb91019e71385fb10dfcbec812b4de8c61589f7b # v2.4.1 57 | with: 58 | mode: start 59 | github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} 60 | ec2-image-id: ${{ vars.AWS_EC2_AMI }} 61 | ec2-instance-type: g6.8xlarge 62 | subnet-id: subnet-02d230cffd9385bd4 63 | security-group-id: sg-06300447c4a5fbef3 64 | iam-role-name: instructlab-ci-runner 65 | aws-resource-tags: > 66 | [ 67 | 
{"Key": "Name", "Value": "instructlab-ci-github-medium-runner"}, 68 | {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}, 69 | {"Key": "GitHubRef", "Value": "${{ github.ref }}"}, 70 | {"Key": "GitHubPR", "Value": "${{ github.event.number }}"} 71 | ] 72 | 73 | e2e-medium-test: 74 | needs: 75 | - start-medium-ec2-runner 76 | runs-on: ${{ needs.start-medium-ec2-runner.outputs.label }} 77 | 78 | # It is important that this job has no write permissions and has 79 | # no access to any secrets. This part (e2e) is where we are running 80 | # untrusted code from PRs. 81 | permissions: {} 82 | 83 | steps: 84 | - name: Install Packages 85 | run: | 86 | cat /etc/os-release 87 | mkdir -p "${TMPDIR}" 88 | sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel 89 | 90 | - name: Checkout instructlab/instructlab 91 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 92 | with: 93 | repository: "instructlab/instructlab" 94 | path: "instructlab" 95 | fetch-depth: 0 96 | 97 | - name: Checkout instructlab/eval 98 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 99 | with: 100 | repository: "instructlab/eval" 101 | path: "eval" 102 | # https://github.com/actions/checkout/issues/249 103 | fetch-depth: 0 104 | 105 | - name: Fetch and checkout PR 106 | id: fetch_pr 107 | if: github.event_name == 'pull_request_target' 108 | working-directory: ./eval 109 | run: | 110 | git fetch origin pull/${{ github.event.pull_request.number }}/head:pr-${{ github.event.pull_request.number }} 111 | git checkout pr-${{ github.event.pull_request.number }} 112 | 113 | - name: Install ilab 114 | working-directory: ./instructlab 115 | run: | 116 | PYTHON=python3.11 ./scripts/install-ilab-with-cuda.sh 117 | 118 | - name: Update instructlab-eval library 119 | working-directory: ./eval 120 | run: | 121 | . ../instructlab/venv/bin/activate 122 | # Patch out our own pin from the ilab repo constraints file 123 | ilab_constraints=../instructlab/constraints-dev.txt 124 | sed -i '/instructlab-eval==/d' $ilab_constraints 125 | 126 | # Since we reuse the virtual environment prepared using ilab 127 | # constraints, we should stick to the same constraints when 128 | # installing latest eval. 129 | # 130 | # FIX: this is not ideal; a proper fix would require decoupling the 131 | # two repos in CI: either by removing the job completely and relying 132 | # on "sdk" (no ilab) test runs; or by preparing a separate 133 | # constraints file that would consider both the requirements files 134 | # for the eval library AND for the ilab - so that they are 135 | # consistent. 136 | pip_install="pip install -c $ilab_constraints" 137 | $pip_install . 138 | $pip_install .[cuda] 139 | 140 | - name: Run e2e test 141 | working-directory: ./instructlab 142 | run: | 143 | . 
venv/bin/activate 144 | ./scripts/e2e-ci.sh -m 145 | 146 | stop-medium-ec2-runner: 147 | needs: 148 | - start-medium-ec2-runner 149 | - e2e-medium-test 150 | runs-on: ubuntu-latest 151 | if: ${{ always() }} 152 | steps: 153 | - name: Configure AWS credentials 154 | uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1 155 | with: 156 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 157 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 158 | aws-region: ${{ vars.AWS_REGION }} 159 | - name: Stop EC2 runner 160 | uses: machulav/ec2-github-runner@fb91019e71385fb10dfcbec812b4de8c61589f7b # v2.4.1 161 | with: 162 | mode: stop 163 | github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} 164 | label: ${{ needs.start-medium-ec2-runner.outputs.label }} 165 | ec2-instance-id: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }} 166 | 167 | e2e-medium-workflow-complete: 168 | # we don't want to block PRs on failed EC2 cleanup 169 | # so not requiring "stop-runner" as well 170 | needs: ["start-medium-ec2-runner", "e2e-medium-test"] 171 | runs-on: ubuntu-latest 172 | steps: 173 | - name: E2E Workflow Complete 174 | run: echo "E2E Workflow Complete" 175 | -------------------------------------------------------------------------------- /.github/workflows/e2e-nvidia-l40s-x4.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | name: E2E (NVIDIA L40S x4) 4 | 5 | on: 6 | schedule: 7 | - cron: '0 16 * * *' # Runs at 4PM UTC every day 8 | workflow_dispatch: 9 | inputs: 10 | pr_or_branch: 11 | description: 'pull request number or branch name' 12 | required: true 13 | default: 'main' 14 | 15 | env: 16 | TMPDIR: /home/tmp 17 | 18 | jobs: 19 | start-large-ec2-runner: 20 | runs-on: ubuntu-latest 21 | outputs: 22 | label: ${{ steps.start-ec2-runner.outputs.label }} 23 | ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} 24 | steps: 25 | - name: Configure AWS credentials 26 | uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1 27 | with: 28 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 29 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 30 | aws-region: ${{ vars.AWS_REGION }} 31 | 32 | - name: Start EC2 runner 33 | id: start-ec2-runner 34 | uses: machulav/ec2-github-runner@fb91019e71385fb10dfcbec812b4de8c61589f7b # v2.4.1 35 | with: 36 | mode: start 37 | github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} 38 | ec2-image-id: ${{ vars.AWS_EC2_AMI }} 39 | ec2-instance-type: g6e.12xlarge 40 | subnet-id: subnet-024298cefa3bedd61 41 | security-group-id: sg-06300447c4a5fbef3 42 | iam-role-name: instructlab-ci-runner 43 | aws-resource-tags: > 44 | [ 45 | {"Key": "Name", "Value": "instructlab-ci-github-large-runner"}, 46 | {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}, 47 | {"Key": "GitHubRef", "Value": "${{ github.ref }}"}, 48 | {"Key": "GitHubPR", "Value": "${{ github.event.number }}"} 49 | ] 50 | 51 | e2e-large-test: 52 | needs: 53 | - start-large-ec2-runner 54 | runs-on: ${{ needs.start-large-ec2-runner.outputs.label }} 55 | 56 | permissions: 57 | pull-requests: write 58 | 59 | steps: 60 | - name: Install Packages 61 | run: | 62 | cat /etc/os-release 63 | mkdir -p "${TMPDIR}" 64 | sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel 65 | 66 | - name: Checkout instructlab/instructlab 67 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 
68 | with: 69 | repository: "instructlab/instructlab" 70 | path: "instructlab" 71 | # https://github.com/actions/checkout/issues/249 72 | fetch-depth: 0 73 | 74 | - name: Checkout instructlab/eval 75 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 76 | with: 77 | repository: "instructlab/eval" 78 | path: "eval" 79 | # https://github.com/actions/checkout/issues/249 80 | fetch-depth: 0 81 | 82 | - name: Determine if pr_or_branch is a PR number 83 | id: check_pr 84 | run: | 85 | PR_OR_BRANCH=${{ github.event.inputs.pr_or_branch || 'main' }} # Default to 'main' if not set 86 | if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then 87 | echo "is_pr=true" >> "$GITHUB_OUTPUT" 88 | else 89 | echo "is_pr=false" >> "$GITHUB_OUTPUT" 90 | fi 91 | echo "pr_or_branch=$PR_OR_BRANCH" >> "$GITHUB_OUTPUT" 92 | 93 | - name: Check if gh cli is installed 94 | id: gh_cli 95 | run: | 96 | if command -v gh &> /dev/null ; then 97 | echo "gh_cli_installed=true" >> "$GITHUB_OUTPUT" 98 | else 99 | echo "gh_cli_installed=false" >> "$GITHUB_OUTPUT" 100 | fi 101 | 102 | - name: Install gh CLI 103 | if: steps.gh_cli.outputs.gh_cli_installed == 'false' 104 | run: | 105 | sudo dnf install 'dnf-command(config-manager)' -y 106 | sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo 107 | sudo dnf install gh --repo gh-cli -y 108 | 109 | - name: test gh CLI 110 | run: | 111 | gh --version 112 | 113 | - name: set default repo 114 | working-directory: ./eval 115 | run: | 116 | gh repo set-default ${{ github.server_url }}/${{ github.repository }} 117 | env: 118 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 119 | 120 | - name: Add comment to PR 121 | if: steps.check_pr.outputs.is_pr == 'true' 122 | working-directory: ./eval 123 | run: | 124 | gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})" 125 | env: 126 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 127 | 128 | - name: Fetch and checkout PR 129 | if: steps.check_pr.outputs.is_pr == 'true' 130 | working-directory: ./eval 131 | run: | 132 | gh pr checkout ${{ steps.check_pr.outputs.pr_or_branch }} 133 | env: 134 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 135 | 136 | - name: Checkout branch 137 | if: steps.check_pr.outputs.is_pr == 'false' 138 | working-directory: ./eval 139 | run: | 140 | git checkout ${{ steps.check_pr.outputs.pr_or_branch }} 141 | 142 | - name: Install ilab 143 | working-directory: ./instructlab 144 | run: | 145 | PYTHON=python3.11 ./scripts/install-ilab-with-cuda.sh 146 | 147 | - name: Update instructlab-eval library 148 | working-directory: ./eval 149 | run: | 150 | . ../instructlab/venv/bin/activate 151 | # Patch out our own pin from the ilab repo constraints file 152 | ilab_constraints=../instructlab/constraints-dev.txt 153 | sed -i '/instructlab-eval==/d' $ilab_constraints 154 | 155 | # Since we reuse the virtual environment prepared using ilab 156 | # constraints, we should stick to the same constraints when 157 | # installing latest eval. 158 | # 159 | # FIX: this is not ideal; a proper fix would require decoupling the 160 | # two repos in CI: either by removing the job completely and relying 161 | # on "sdk" (no ilab) test runs; or by preparing a separate 162 | # constraints file that would consider both the requirements files 163 | # for the eval library AND for the ilab - so that they are 164 | # consistent. 
165 | pip_install="pip install -c $ilab_constraints" 166 | $pip_install . 167 | $pip_install .[cuda] 168 | 169 | - name: Check disk before tests 170 | run: | 171 | df -h 172 | 173 | - name: Run e2e test 174 | working-directory: ./instructlab 175 | env: 176 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 177 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 178 | run: | 179 | . venv/bin/activate 180 | ./scripts/e2e-ci.sh -l 181 | 182 | - name: Check disk after tests 183 | run: | 184 | df -h 185 | 186 | - name: Add comment to PR if the workflow failed 187 | if: failure() && steps.check_pr.outputs.is_pr == 'true' 188 | working-directory: ./eval 189 | run: | 190 | gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate." 191 | env: 192 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 193 | 194 | - name: Add comment to PR if the workflow succeeded 195 | if: success() && steps.check_pr.outputs.is_pr == 'true' 196 | working-directory: ./eval 197 | run: | 198 | gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!" 199 | env: 200 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 201 | 202 | - name: Send Discord notification for failure 203 | if: failure() && steps.check_pr.outputs.is_pr == 'false' 204 | uses: sarisia/actions-status-discord@5ddd3b114a98457dd80a39b2f00b6a998cd69008 # v1.15.3 205 | with: 206 | webhook: ${{ secrets.SON_OF_JEEVES_DISCORD_WEBHOOK }} 207 | status: ${{ job.status }} 208 | title: "e2e-nvidia-l40s-x4" 209 | description: | 210 | Job in **${{ github.repository }}** running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed **with failures** ❌ 211 | Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details. 212 | color: 0xCB2431 # Red color for failure 213 | 214 | - name: Send Discord notification for success 215 | if: success() && steps.check_pr.outputs.is_pr == 'false' 216 | uses: sarisia/actions-status-discord@5ddd3b114a98457dd80a39b2f00b6a998cd69008 # v1.15.3 217 | with: 218 | webhook: ${{ secrets.SON_OF_JEEVES_DISCORD_WEBHOOK }} 219 | status: ${{ job.status }} 220 | title: "e2e-nvidia-l40s-x4" 221 | description: | 222 | Job in **${{ github.repository }}** running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed **successfully** ✅ 223 | Click [here](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) to view details. 
224 | color: 0x28A745 # Green color for success 225 | 226 | stop-large-ec2-runner: 227 | needs: 228 | - start-large-ec2-runner 229 | - e2e-large-test 230 | runs-on: ubuntu-latest 231 | if: ${{ always() }} 232 | steps: 233 | - name: Configure AWS credentials 234 | uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1 235 | with: 236 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 237 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 238 | aws-region: ${{ vars.AWS_REGION }} 239 | 240 | - name: Stop EC2 runner 241 | uses: machulav/ec2-github-runner@fb91019e71385fb10dfcbec812b4de8c61589f7b # v2.4.1 242 | with: 243 | mode: stop 244 | github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} 245 | label: ${{ needs.start-large-ec2-runner.outputs.label }} 246 | ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }} 247 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | name: Lint, Format, and MyPy 4 | 5 | on: 6 | push: 7 | branches: 8 | - "main" 9 | - "release-**" 10 | paths: 11 | - '**.py' 12 | - 'pyproject.toml' 13 | - 'requirements*.txt' 14 | - 'constraints-dev.txt' 15 | - 'tox.ini' 16 | - '.pylintrc' 17 | - 'scripts/*.sh' # Used by this workflow 18 | - '.github/workflows/lint.yml' # This workflow 19 | pull_request: 20 | branches: 21 | - "main" 22 | - "release-**" 23 | paths: 24 | - '**.py' 25 | - 'pyproject.toml' 26 | - 'requirements*.txt' 27 | - 'constraints-dev.txt' 28 | - 'tox.ini' 29 | - '.pylintrc' 30 | - 'scripts/*.sh' # Used by this workflow 31 | - '.github/workflows/lint.yml' # This workflow 32 | 33 | env: 34 | LC_ALL: en_US.UTF-8 35 | 36 | defaults: 37 | run: 38 | shell: bash 39 | 40 | permissions: 41 | contents: read 42 | 43 | jobs: 44 | lint: 45 | runs-on: ubuntu-latest 46 | name: "${{ matrix.lint.name }}" 47 | strategy: 48 | fail-fast: false 49 | matrix: 50 | lint: 51 | - name: "ruff" 52 | commands: | 53 | tox -e ruff -- check 54 | - name: "pylint" 55 | commands: | 56 | echo "::add-matcher::.github/workflows/matchers/pylint.json" 57 | tox -e lint 58 | - name: "mypy" 59 | commands: | 60 | tox -e mypy 61 | steps: 62 | - name: "Harden Runner" 63 | uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0 64 | with: 65 | egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs 66 | 67 | - name: "Checkout" 68 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 69 | with: 70 | # https://github.com/actions/checkout/issues/249 71 | fetch-depth: 0 72 | 73 | - name: Free disk space 74 | uses: ./.github/actions/free-disk-space 75 | 76 | - name: Setup Python 3.11 77 | uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 78 | with: 79 | python-version: 3.11 80 | cache: pip 81 | cache-dependency-path: | 82 | **/pyproject.toml 83 | **/requirements*.txt 84 | 85 | - name: Install tox 86 | run: | 87 | pip_install="python -m pip install -c constraints-dev.txt" 88 | $pip_install --upgrade pip 89 | $pip_install tox tox-gh 90 | 91 | - name: "${{ matrix.lint.name }}" 92 | run: | 93 | ${{ matrix.lint.commands }} 94 | env: 95 | RUFF_OUTPUT_FORMAT: github 96 | 97 | lint-workflow-complete: 98 | needs: ["lint"] 99 | runs-on: ubuntu-latest 100 | steps: 101 | - name: Lint Workflow Complete 102 | run: echo "Lint Workflow Complete" 103 | 
-------------------------------------------------------------------------------- /.github/workflows/matchers/actionlint.json: -------------------------------------------------------------------------------- 1 | { 2 | "problemMatcher": [ 3 | { 4 | "owner": "actionlint", 5 | "pattern": [ 6 | { 7 | "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$", 8 | "file": 1, 9 | "line": 2, 10 | "column": 3, 11 | "message": 4, 12 | "code": 5 13 | } 14 | ] 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /.github/workflows/matchers/pylint.json: -------------------------------------------------------------------------------- 1 | { 2 | "problemMatcher": [ 3 | { 4 | "owner": "pylint-error", 5 | "severity": "error", 6 | "pattern": [ 7 | { 8 | "regexp": "^(.+):(\\d+):(\\d+):\\s(([EF]\\d{4}):\\s.+)$", 9 | "file": 1, 10 | "line": 2, 11 | "column": 3, 12 | "message": 4, 13 | "code": 5 14 | } 15 | ] 16 | }, 17 | { 18 | "owner": "pylint-warning", 19 | "severity": "warning", 20 | "pattern": [ 21 | { 22 | "regexp": "^(.+):(\\d+):(\\d+):\\s(([CRW]\\d{4}):\\s.+)$", 23 | "file": 1, 24 | "line": 2, 25 | "column": 3, 26 | "message": 4, 27 | "code": 5 28 | } 29 | ] 30 | } 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /.github/workflows/pypi.yaml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | name: Build, test, and upload PyPI package 4 | 5 | on: 6 | push: 7 | branches: 8 | - "main" 9 | - "release-**" 10 | tags: 11 | - "v*" 12 | pull_request: 13 | branches: 14 | - "main" 15 | - "release-**" 16 | release: 17 | types: 18 | - published 19 | 20 | env: 21 | LC_ALL: en_US.UTF-8 22 | 23 | defaults: 24 | run: 25 | shell: bash 26 | 27 | permissions: 28 | contents: read 29 | 30 | jobs: 31 | # Create and verify release artifacts 32 | # - build source dist (tar ball) and wheel 33 | # - validate artifacts with various tools 34 | # - upload artifacts to GHA 35 | build-package: 36 | name: Build and check packages 37 | runs-on: ubuntu-latest 38 | steps: 39 | - name: "Harden Runner" 40 | uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0 41 | with: 42 | egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs 43 | 44 | - name: "Checkout" 45 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 46 | with: 47 | # for setuptools-scm 48 | fetch-depth: 0 49 | 50 | - name: "Build and Inspect" 51 | uses: hynek/build-and-inspect-python-package@b5076c307dc91924a82ad150cdd1533b444d3310 # v2.12.0 52 | 53 | # push to Test PyPI on 54 | # - a new GitHub release is published 55 | # - a PR is merged into main branch 56 | publish-test-pypi: 57 | name: Publish packages to test.pypi.org 58 | # environment: publish-test-pypi 59 | if: ${{ (github.repository_owner == 'instructlab') && ((github.event.action == 'published') || ((github.event_name == 'push') && (github.ref == 'refs/heads/main'))) }} 60 | permissions: 61 | contents: read 62 | # see https://docs.pypi.org/trusted-publishers/ 63 | id-token: write 64 | runs-on: ubuntu-latest 65 | needs: build-package 66 | 67 | steps: 68 | - name: "Harden Runner" 69 | uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0 70 | with: 71 | egress-policy: audit # TODO: change to 
'egress-policy: block' after couple of runs 72 | 73 | - name: "Download build artifacts" 74 | uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 75 | with: 76 | name: Packages 77 | path: dist 78 | 79 | - name: "Upload to Test PyPI" 80 | uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4 81 | with: 82 | repository-url: https://test.pypi.org/legacy/ 83 | 84 | # push to Production PyPI on 85 | # - a new GitHub release is published 86 | publish-pypi: 87 | name: Publish release to pypi.org 88 | # environment: publish-pypi 89 | if: ${{ (github.repository_owner == 'instructlab') && (github.event.action == 'published') }} 90 | permissions: 91 | # see https://docs.pypi.org/trusted-publishers/ 92 | id-token: write 93 | # allow gh release upload 94 | contents: write 95 | 96 | runs-on: ubuntu-latest 97 | needs: build-package 98 | 99 | steps: 100 | - name: "Harden Runner" 101 | uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0 102 | with: 103 | egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs 104 | 105 | - name: "Download build artifacts" 106 | uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 107 | with: 108 | name: Packages 109 | path: dist 110 | 111 | - name: "Sigstore sign package" 112 | uses: sigstore/gh-action-sigstore-python@f514d46b907ebcd5bedc05145c03b69c1edd8b46 # v3.0.0 113 | with: 114 | release-signing-artifacts: false 115 | inputs: | 116 | ./dist/*.tar.gz 117 | ./dist/*.whl 118 | 119 | - name: "Upload artifacts and signatures to GitHub release" 120 | run: | 121 | gh release upload '${{ github.ref_name }}' dist/* --repo '${{ github.repository }}' 122 | env: 123 | GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} 124 | 125 | # PyPI does not accept .sigstore artifacts and 126 | # gh-action-pypi-publish has no option to ignore them. 
127 | - name: "Remove sigstore signatures before uploading to PyPI" 128 | run: | 129 | rm ./dist/*.sigstore.json 130 | 131 | - name: "Upload to PyPI" 132 | uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4 133 | -------------------------------------------------------------------------------- /.github/workflows/spellcheck.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | name: Spellcheck 4 | 5 | on: 6 | push: 7 | branches: 8 | - "main" 9 | paths: 10 | - '**.md' 11 | - '.github/workflows/spellcheck.yml' # This workflow 12 | pull_request: 13 | branches: 14 | - "main" 15 | paths: 16 | - '**.md' 17 | - '.github/workflows/spellcheck.yml' # This workflow 18 | 19 | env: 20 | LC_ALL: en_US.UTF-8 21 | 22 | defaults: 23 | run: 24 | shell: bash 25 | 26 | permissions: 27 | contents: read 28 | 29 | jobs: 30 | spellcheck: 31 | runs-on: ubuntu-latest 32 | steps: 33 | - name: "Harden Runner" 34 | uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0 35 | with: 36 | egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs 37 | 38 | - name: "Checkout" 39 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 40 | with: 41 | fetch-depth: 0 42 | 43 | - name: Spellcheck 44 | uses: rojopolis/spellcheck-github-actions@584b2ae95998967a53af7fbfb7f5b15352c38748 # v0.49.0 45 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | name: Test 4 | 5 | on: 6 | workflow_dispatch: 7 | push: 8 | branches: 9 | - "main" 10 | - "release-**" 11 | paths: 12 | - '**.py' 13 | - 'pyproject.toml' 14 | - 'requirements**.txt' 15 | - 'constraints-dev.txt' 16 | - 'tox.ini' 17 | - 'scripts/*.sh' # Used by this workflow 18 | - '.github/workflows/test.yml' # This workflow 19 | pull_request: 20 | branches: 21 | - "main" 22 | - "release-**" 23 | paths: 24 | - '**.py' 25 | - 'pyproject.toml' 26 | - 'requirements**.txt' 27 | - 'constraints-dev.txt' 28 | - 'tox.ini' 29 | - 'scripts/*.sh' # Used by this workflow 30 | - '.github/workflows/test.yml' # This workflow 31 | 32 | env: 33 | LC_ALL: en_US.UTF-8 34 | 35 | defaults: 36 | run: 37 | shell: bash 38 | 39 | permissions: 40 | contents: read 41 | 42 | jobs: 43 | test: 44 | name: "test: ${{ matrix.python }} on ${{ matrix.platform }}" 45 | runs-on: "${{ matrix.platform }}" 46 | strategy: 47 | fail-fast: false 48 | matrix: 49 | python: 50 | - "3.11" 51 | platform: 52 | - "ubuntu-latest" 53 | include: 54 | - python: "3.11" 55 | platform: "macos-latest" 56 | steps: 57 | - name: "Harden Runner" 58 | uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0 59 | with: 60 | egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs 61 | 62 | - name: Checkout 63 | uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 64 | with: 65 | # https://github.com/actions/checkout/issues/249 66 | fetch-depth: 0 67 | 68 | - name: Free disk space 69 | if: matrix.platform != 'macos-latest' 70 | uses: ./.github/actions/free-disk-space 71 | 72 | - name: Install the expect package 73 | if: startsWith(matrix.platform, 'ubuntu') 74 | run: | 75 | sudo apt-get install -y expect 76 | 77 | - name: Install tools on MacOS 78 | if: startsWith(matrix.platform, 'macos') 79 | run: | 80 | 
brew install expect coreutils bash 81 | 82 | - name: Setup Python ${{ matrix.python }} 83 | uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 84 | with: 85 | python-version: ${{ matrix.python }} 86 | cache: pip 87 | cache-dependency-path: | 88 | **/pyproject.toml 89 | **/requirements*.txt 90 | 91 | - name: Remove llama-cpp-python from cache 92 | run: | 93 | pip cache remove llama_cpp_python 94 | 95 | - name: Cache huggingface 96 | uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 97 | with: 98 | path: ~/.cache/huggingface 99 | # config contains DEFAULT_MODEL 100 | key: huggingface-${{ hashFiles('src/instructlab/configuration.py') }} 101 | 102 | - name: Install dependencies 103 | run: | 104 | pip_install="python -m pip install -c constraints-dev.txt" 105 | $pip_install --upgrade pip 106 | $pip_install tox tox-gh>=1.2 107 | 108 | - name: Run unit and functional tests with tox 109 | run: | 110 | tox 111 | 112 | - name: Remove llama-cpp-python from cache 113 | if: always() 114 | run: | 115 | pip cache remove llama_cpp_python 116 | 117 | test-workflow-complete: 118 | needs: ["test"] 119 | runs-on: ubuntu-latest 120 | steps: 121 | - name: Test Workflow Complete 122 | run: echo "Test Workflow Complete" 123 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Auto generated 2 | src/instructlab/eval/_version.py 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | coverage-py3-* 52 | coverage 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | cover/ 58 | durations/* 59 | 60 | # Functional tests 61 | mt_bench_branch_generator/* 62 | eval_output/* 63 | 64 | # Translations 65 | *.mo 66 | *.pot 67 | 68 | # Django stuff: 69 | *.log 70 | local_settings.py 71 | db.sqlite3 72 | db.sqlite3-journal 73 | 74 | # Flask stuff: 75 | instance/ 76 | .webassets-cache 77 | 78 | # Scrapy stuff: 79 | .scrapy 80 | 81 | # Sphinx documentation 82 | docs/_build/ 83 | 84 | # PyBuilder 85 | .pybuilder/ 86 | target/ 87 | 88 | # Jupyter Notebook 89 | .ipynb_checkpoints 90 | 91 | # IPython 92 | profile_default/ 93 | ipython_config.py 94 | 95 | # pyenv 96 | # For a library or package, you might want to ignore these files since the code is 97 | # intended to run in multiple environments; otherwise, check them in: 98 | # .python-version 99 | 100 | # pipenv 101 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
102 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 103 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 104 | # install all needed dependencies. 105 | #Pipfile.lock 106 | 107 | # poetry 108 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 109 | # This is especially recommended for binary packages to ensure reproducibility, and is more 110 | # commonly ignored for libraries. 111 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 112 | #poetry.lock 113 | 114 | # pdm 115 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 116 | #pdm.lock 117 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 118 | # in version control. 119 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 120 | .pdm.toml 121 | .pdm-python 122 | .pdm-build/ 123 | 124 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 125 | __pypackages__/ 126 | 127 | # Celery stuff 128 | celerybeat-schedule 129 | celerybeat.pid 130 | 131 | # SageMath parsed files 132 | *.sage.py 133 | 134 | # Environments 135 | .env 136 | .venv 137 | env/ 138 | venv/ 139 | ENV/ 140 | env.bak/ 141 | venv.bak/ 142 | dictionary.dic 143 | 144 | # Spyder project settings 145 | .spyderproject 146 | .spyproject 147 | 148 | # Rope project settings 149 | .ropeproject 150 | 151 | # mkdocs documentation 152 | /site 153 | 154 | # mypy 155 | .mypy_cache/ 156 | .dmypy.json 157 | dmypy.json 158 | 159 | # Pyre type checker 160 | .pyre/ 161 | 162 | # pytype static type analyzer 163 | .pytype/ 164 | 165 | # Cython debug symbols 166 | cython_debug/ 167 | 168 | # PyCharm 169 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 170 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 171 | # and can be added to the global gitignore or merged into this file. For a more nuclear 172 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
173 | #.idea/ 174 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | profile=black 3 | from_first=true 4 | import_heading_future=Future 5 | import_heading_stdlib=Standard 6 | import_heading_thirdparty=Third Party 7 | import_heading_firstparty=First Party 8 | import_heading_localfolder=Local 9 | known_firstparty= 10 | known_localfolder=tuning 11 | -------------------------------------------------------------------------------- /.markdownlint-cli2.yaml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | config: 4 | line-length: false 5 | no-emphasis-as-header: false 6 | first-line-heading: false 7 | code-block-style: false 8 | no-duplicate-header: false 9 | single-trailing-newline: false 10 | descriptive-link-text: false 11 | globs: 12 | - "**/*.md" 13 | ignores: 14 | - ".github/**" 15 | - "venv/**" 16 | - ".venv/**" 17 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | repos: 4 | - repo: https://github.com/PyCQA/isort 5 | rev: 5.11.5 6 | hooks: 7 | - id: isort 8 | exclude: imports 9 | - repo: https://github.com/astral-sh/ruff-pre-commit 10 | # Ruff version. 11 | rev: v0.3.4 12 | hooks: 13 | # Run the linter (most fixers are disabled for now). 14 | - id: ruff 15 | # Run the formatter. 16 | - id: ruff-format 17 | -------------------------------------------------------------------------------- /.spellcheck-en-custom.txt: -------------------------------------------------------------------------------- 1 | 2 | # make spellcheck-sort 3 | # Please keep this file sorted: 4 | # SPDX-License-Identifier: Apache-2.0 5 | Backport 6 | backported 7 | benchmarking 8 | codebase 9 | cli 10 | dev 11 | dr 12 | eval 13 | gpt 14 | hoc 15 | http 16 | instructlab 17 | jsonl 18 | justfile 19 | MMLU 20 | openai 21 | pre 22 | SDG 23 | Tatsu 24 | tl 25 | TODO 26 | tox 27 | venv 28 | vllm 29 | barebones 30 | LM 31 | -------------------------------------------------------------------------------- /.spellcheck.yml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | matrix: 4 | - name: markdown 5 | aspell: 6 | lang: en 7 | d: en_US 8 | camel-case: true 9 | mode: markdown 10 | sources: 11 | - "**/*.md|!.tox/**|!venv/**" 12 | dictionary: 13 | wordlists: 14 | - .spellcheck-en-custom.txt 15 | pipeline: 16 | - pyspelling.filters.context: 17 | context_visible_first: true 18 | escapes: '\\[\\`~]' 19 | delimiters: 20 | # Ignore multiline content between fences (fences can have 3 or more back ticks) 21 | # ```language 22 | # content 23 | # ``` 24 | - open: '(?s)^(?P *`{3,}).*?$' 25 | close: '^(?P=open)$' 26 | # Ignore text between inline back ticks 27 | - open: '(?P`+)' 28 | close: '(?P=open)' 29 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 0.5.0 2 | 3 | * Introduces Ragas as a supported evaluation framework. This integration only supports the `RubricsScore` metric and OpenAI models. 
Users can pass in either a dataset with a pre-computed `user_input`, `reference` and `response` fields or they can provide a dataset containing `user_input` and `reference` along with information about a model endpoint that will be used for computing the `response` field. 4 | 5 | ## 0.4.2 6 | 7 | * Adds the ability to provide a custom system prompt to the MMLU-based evaluators. When a system prompt is provided, LM-eval applies the chat template under the hood, else it will pass the model a barebones prompt. 8 | * Adds an `extra_args` parameter to the `.run` method of all MMLU-based evaluators. This way, consumers are able to directly pass any additional arguments they want through to the `lm_eval.evaluators.simple_evaluate` function. 9 | 10 | ## 0.4 11 | 12 | * Added ability to specify a custom http client to MT-Bench 13 | 14 | ## v0.2 15 | -------------------------------------------------------------------------------- /DCO.txt: -------------------------------------------------------------------------------- 1 | Developer Certificate of Origin 2 | Version 1.1 3 | 4 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 5 | 6 | Everyone is permitted to copy and distribute verbatim copies of this 7 | license document, but changing it is not allowed. 8 | 9 | 10 | Developer's Certificate of Origin 1.1 11 | 12 | By making a contribution to this project, I certify that: 13 | 14 | (a) The contribution was created in whole or in part by me and I 15 | have the right to submit it under the open source license 16 | indicated in the file; or 17 | 18 | (b) The contribution is based upon previous work that, to the best 19 | of my knowledge, is covered under an appropriate open source 20 | license and I have the right under that license to submit that 21 | work with modifications, whether created in whole or in part 22 | by me, under the same open source license (unless I am 23 | permitted to submit under a different license), as indicated 24 | in the file; or 25 | 26 | (c) The contribution was provided directly to me by some other 27 | person who certified (a), (b) or (c) and I have not modified 28 | it. 29 | 30 | (d) I understand and agree that this project and the contribution 31 | are public and that a record of the contribution (including all 32 | personal information I submit with it, including my sign-off) is 33 | maintained indefinitely and may be redistributed consistent with 34 | this project or the open source license(s) involved. 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | # 4 | # If you want to see the full commands, run: 5 | # NOISY_BUILD=y make 6 | # 7 | ifeq ($(NOISY_BUILD),) 8 | ECHO_PREFIX=@ 9 | CMD_PREFIX=@ 10 | PIPE_DEV_NULL=> /dev/null 2> /dev/null 11 | else 12 | ECHO_PREFIX=@\# 13 | CMD_PREFIX= 14 | PIPE_DEV_NULL= 15 | endif 16 | 17 | .PHONY: help 18 | help: 19 | @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[a-zA-Z_0-9-]+:.*?##/ { printf " \033[36m%-18s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) 20 | 21 | .PHONY: action-lint actionlint 22 | action-lint: actionlint 23 | actionlint: ## Lint GitHub Action workflows 24 | $(ECHO_PREFIX) printf " %-12s .github/...\n" "[ACTION LINT]" 25 | $(CMD_PREFIX) if ! command -v actionlint $(PIPE_DEV_NULL) ; then \ 26 | echo "Please install actionlint." ; \ 27 | echo "go install github.com/rhysd/actionlint/cmd/actionlint@latest" ; \ 28 | exit 1 ; \ 29 | fi 30 | $(CMD_PREFIX) if ! command -v shellcheck $(PIPE_DEV_NULL) ; then \ 31 | echo "Please install shellcheck." ; \ 32 | echo "https://github.com/koalaman/shellcheck#user-content-installing" ; \ 33 | exit 1 ; \ 34 | fi 35 | $(CMD_PREFIX) actionlint -color 36 | 37 | .PHONY: check-tox 38 | check-tox: 39 | @command -v tox &> /dev/null || (echo "'tox' is not installed" && exit 1) 40 | 41 | .PHONY: md-lint 42 | md-lint: ## Lint markdown files 43 | $(ECHO_PREFIX) printf " %-12s ./...\n" "[MD LINT]" 44 | $(CMD_PREFIX) podman run --rm -v $(CURDIR):/workdir --security-opt label=disable docker.io/davidanson/markdownlint-cli2:latest > /dev/null 45 | 46 | .PHONY: spellcheck 47 | spellcheck: ## Spellcheck markdown files 48 | tox p -e spellcheck 49 | 50 | .PHONY: spellcheck-sort 51 | spellcheck-sort: .spellcheck-en-custom.txt ## Sort spellcheck directory 52 | sort -d -f -o $< $< 53 | 54 | .PHONY: verify 55 | verify: check-tox ## Run linting, typing, and formatting checks via tox 56 | tox p -e fastlint,mypy,ruff 57 | 58 | ##@ Development 59 | 60 | .PHONY: tests 61 | tests: check-tox ## Run unit and type checks 62 | tox -e py3-unit,mypy 63 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # eval 2 | 3 | ![Lint](https://github.com/instructlab/eval/actions/workflows/lint.yml/badge.svg?branch=main) 4 | ![Tests](https://github.com/instructlab/eval/actions/workflows/test.yml/badge.svg?branch=main) 5 | ![Build](https://github.com/instructlab/eval/actions/workflows/pypi.yaml/badge.svg?branch=main) 6 | ![Release](https://img.shields.io/github/v/release/instructlab/eval) 7 | ![License](https://img.shields.io/github/license/instructlab/eval) 8 | 9 | ![`e2e-nvidia-l4-x1.yml` on `main`](https://github.com/instructlab/eval/actions/workflows/e2e-nvidia-l4-x1.yml/badge.svg?branch=main) 10 | ![`e2e-nvidia-l40s-x4.yml` on 
`main`](https://github.com/instructlab/eval/actions/workflows/e2e-nvidia-l40s-x4.yml/badge.svg?branch=main) 11 | 12 | Python Library for Evaluation 13 | 14 | ## What is Evaluation? 15 | 16 | Evaluation allows us to assess how a given model is performing against a set of specific tasks. This is done by running a set of standardized benchmark tests against 17 | the model. Running evaluation produces numerical scores across these various benchmarks, as well as logs excerpts/samples of the outputs the model produced during these 18 | benchmarks. Using a combination of these artifacts as reference, along with a manual smoke test, allows us to get the best idea about whether or not a model has learned 19 | and improved on something we are trying to teach it. There are 2 stages of model evaluation in the InstructLab process: 20 | 21 | ### Inter-checkpoint Evaluation 22 | 23 | This step occurs during multi-phase training. Each phase of training produces multiple different “checkpoints” of the model that are taken at various stages during 24 | the phase. At the end of each phase, we evaluate all the checkpoints in order to find the one that provides the best results. This is done as part of the 25 | [InstructLab Training](https://github.com/instructlab/training) library. 26 | 27 | ### Full-scale final Evaluation 28 | 29 | Once training is complete, and we have picked the best checkpoint from the output of the final phase, we can run full-scale evaluation suite which runs MT-Bench, MMLU, 30 | MT-Bench Branch and MMLU Branch. 31 | 32 | ### Leaderboard Evaluation 33 | 34 | For cases when you want to run the full Open LLM Leaderboard v2 evaluation suite, we provide an optional dependency package for the leaderboard tasks. This includes additional benchmarks like GPQA, IFEVAL, BBH, MMLU-PRO, MUSR, and MATH-HARD. 35 | 36 | To install the optional leaderboard dependencies, use: 37 | 38 | ```bash 39 | pip install instructlab-eval[leaderboard] 40 | ``` 41 | 42 | ## Methods of Evaluation 43 | 44 | Below are more in-depth explanations of the suite of benchmarks we are using as methods for evaluation of models. 45 | 46 | ### Multi-turn benchmark (MT-Bench) 47 | 48 | **tl;dr** Full model evaluation of performance on **skills** 49 | 50 | MT-Bench is a type of benchmarking that involves asking a model 80 multi-turn questions - i.e. 51 | 52 | ```text 53 | 54 | ``` 55 | 56 | A “judge” model reviews the given multi-turn question, the provided model answer, and rate the answer with a score out of 10. The scores are then averaged out 57 | and the final score produced is the “MT-bench score” for that model. This benchmark assumes no factual knowledge on the model’s part. The questions are static, but do not get obsolete with time. 58 | 59 | You can read more about MT-Bench [here](https://arxiv.org/abs/2306.05685) 60 | 61 | ### MT-Bench Branch 62 | 63 | MT-Bench Branch is an adaptation of MT-Bench that is designed to test custom skills that are added to the model with the InstructLab project. These new skills 64 | come in the form of question/answer pairs in a Git branch of the [taxonomy](https://github.com/instructlab/taxonomy). 65 | 66 | MT-Bench Branch uses the user supplied seed questions to have the candidate model generate answers to, which are then judged by the judge model using the user supplied 67 | seed answers as a reference. 
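
As a rough sketch of what this flow looks like in code (mirroring `scripts/test_branch_gen_answers.py` and `scripts/test_branch_judge_answers.py` in this repository), the `MTBenchBranchEvaluator` first has the candidate model generate answers and then judges them against the seed reference answers. The model names, taxonomy path, branch, and endpoint URL below are illustrative placeholders and assume an OpenAI-compatible server (for example vLLM) is already serving the models:

```python
# Illustrative sketch only -- model names, paths, and the endpoint URL are placeholders.
# Assumes an OpenAI-compatible server (e.g. vLLM) is serving the candidate and judge models.
from instructlab.eval.mt_bench import MTBenchBranchEvaluator

evaluator = MTBenchBranchEvaluator(
    "instructlab/granite-7b-lab",  # candidate model being evaluated
    "instructlab/granite-7b-lab",  # judge model
    "../taxonomy",                 # local clone of the taxonomy repository
    "main",                        # taxonomy branch holding the seed question/answer pairs
)

# Have the candidate model answer the seed questions from the taxonomy branch.
evaluator.gen_answers("http://localhost:8000/v1")

# Judge the generated answers against the user-supplied seed answers.
overall_score, qa_pairs, error_rate = evaluator.judge_answers("http://localhost:8000/v1")
print(f"Overall Score: {overall_score}")
print(f"Error Rate: {error_rate}")
```

Each entry in `qa_pairs` carries the per-question score along with the question, answer, category, and originating `qna_file`, which is useful when tracing a low score back to a specific taxonomy entry.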
68 | 69 | ### Massive Multitask Language Understanding (MMLU) 70 | 71 | **tl;dr** Full model evaluation of performance on **knowledge** 72 | 73 | MMLU is a type of benchmarking that involves a series of fact-based multiple choice questions, along with 4 options for answers. It tests if a model is able to interpret 74 | the questions correctly, along the answers, formulate its own answer, then selects the correct option out of the provided ones. The questions are designed as a set 75 | of 57 “tasks”, and each task has a given domain. The domains cover a number of topics ranging from Chemistry and Biology to US History and Math. 76 | 77 | The performance number is then compared against the set of known correct answers for each question to determine how many the model got right. The final MMLU score is the 78 | average of its scores. This benchmark does not involve any reference/critic model, and is a completely objective benchmark. This benchmark does assume factual knowledge 79 | on the model’s part. The questions are static, therefore MMLU cannot be used to gauge the model’s knowledge on more recent topics. 80 | 81 | InstructLab uses an implementation found [here](https://github.com/EleutherAI/lm-evaluation-harness) for running MMLU. 82 | 83 | You can read more about MMLU [here](https://arxiv.org/abs/2306.05685) 84 | 85 | ### MMLU Branch 86 | 87 | MMLU Branch is an adaptation of MMLU that is designed to test custom knowledge that is being added to the model via a Git branch of the [taxonomy](https://github.com/instructlab/taxonomy). 88 | 89 | A teacher model is used to generate new multiple choice questions based on the knowledge document included in the taxonomy Git branch. A “task” is then constructed that references the newly generated answer choices. These tasks are then used to score the model’s grasp on new knowledge the same way MMLU works. Generation of these tasks are done as part of the [InstructLab SDG](https://github.com/instructlab/sdg) library. 90 | 91 | ## Development 92 | 93 | > **⚠️ Note:** Must use Python version 3.11 or later. 94 | 95 | ### Set up your dev environment 96 | 97 | The following tools are required: 98 | 99 | - [`git`](https://git-scm.com) 100 | - [`python`](https://www.python.org) (v3.11) 101 | - [`pip`](https://pypi.org/project/pip/) (v23.0+) 102 | - [`bash`](https://www.gnu.org/software/bash/) (v5+, for functional tests) 103 | 104 | #### Optional: Use [cloud-instance.sh](https://github.com/instructlab/instructlab/tree/main/scripts/infra) to launch and setup an instance 105 | 106 | ```shell 107 | scripts/infra/cloud-instance.sh ec2 launch -t g6.2xlarge 108 | scripts/infra/cloud-instance.sh ec2 setup-rh-devenv 109 | scripts/infra/cloud-instance.sh ec2 install-rh-nvidia-drivers 110 | scripts/infra/cloud-instance.sh ec2 ssh sudo reboot 111 | scripts/infra/cloud-instance.sh ec2 ssh 112 | ``` 113 | 114 | #### Regardless of how you setup your instance 115 | 116 | ```shell 117 | git clone https://github.com/instructlab/taxonomy.git && pushd taxonomy && git branch rc && popd 118 | git clone --bare https://github.com/instructlab/eval.git && git clone eval.git/ && cd eval && git remote add syncrepo ../eval.git 119 | python3 -m venv venv 120 | source venv/bin/activate 121 | pip install -r requirements.txt 122 | pip install -r requirements-dev.txt 123 | pip install -e . 124 | pip install vllm 125 | ``` 126 | 127 | ### Testing 128 | 129 | Before pushing changes to GitHub, you need to run the tests as shown below. 
They can be run individually as shown in each sub-section 130 | or can be run with the one command: 131 | 132 | ```shell 133 | tox 134 | ``` 135 | 136 | #### Unit tests 137 | 138 | Unit tests are enforced by the CI system using [`pytest`](https://docs.pytest.org/). When making changes, run these tests before pushing the changes to avoid CI issues. 139 | 140 | Running unit tests can be done with: 141 | 142 | ```shell 143 | tox -e py3-unit 144 | ``` 145 | 146 | By default, all tests found within the `tests` directory are run. However, specific unit tests can run by passing filenames, classes and/or methods to `pytest` using tox positional arguments. The following example invokes a single test method `test_mt_bench` that is declared in the `tests/test_mt_bench.py` file: 147 | 148 | ```shell 149 | tox -e py3-unit -- tests/test_mt_bench.py::test_mt_bench 150 | ``` 151 | 152 | #### Functional tests 153 | 154 | Functional tests are enforced by the CI system. When making changes, run the tests before pushing the changes to avoid CI issues. 155 | 156 | Running functional tests can be done with: 157 | 158 | ```shell 159 | tox -e py3-functional 160 | ``` 161 | 162 | #### Coding style 163 | 164 | Cli follows the python [`pep8`](https://peps.python.org/pep-0008/) coding style. The coding style is enforced by the CI system, and your PR will fail until the style has been applied correctly. 165 | 166 | We use [pre-commit](https://pre-commit.com/) to enforce coding style using [`black`](https://github.com/psf/black), and [`isort`](https://pycqa.github.io/isort/). 167 | 168 | You can invoke formatting with: 169 | 170 | ```shell 171 | tox -e ruff 172 | ``` 173 | 174 | In addition, we use [`pylint`](https://www.pylint.org) to perform static code analysis of the code. 175 | 176 | You can invoke the linting with the following command 177 | 178 | ```shell 179 | tox -e lint 180 | ``` 181 | 182 | ### MT-Bench / MT-Bench Branch Example Usage 183 | 184 | Launch vllm serving granite-7b-lab 185 | 186 | ```shell 187 | python -m vllm.entrypoints.openai.api_server --model instructlab/granite-7b-lab --tensor-parallel-size 1 188 | ``` 189 | 190 | In another shell window 191 | 192 | ```shell 193 | export INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS=10 # Optional if you want to shorten run times 194 | # Commands relative to eval directory 195 | python3 scripts/test_gen_answers.py 196 | python3 scripts/test_branch_gen_answers.py 197 | ``` 198 | 199 | Example output tree 200 | 201 | ```shell 202 | eval_output/ 203 | ├── mt_bench 204 | │   └── model_answer 205 | │   └── instructlab 206 | │   └── granite-7b-lab.jsonl 207 | └── mt_bench_branch 208 | ├── main 209 | │   ├── model_answer 210 | │   │   └── instructlab 211 | │   │   └── granite-7b-lab.jsonl 212 | │   ├── question.jsonl 213 | │   └── reference_answer 214 | │   └── instructlab 215 | │   └── granite-7b-lab.jsonl 216 | └── rc 217 | ├── model_answer 218 | │   └── instructlab 219 | │   └── granite-7b-lab.jsonl 220 | ├── question.jsonl 221 | └── reference_answer 222 | └── instructlab 223 | └── granite-7b-lab.jsonl 224 | ``` 225 | 226 | ```shell 227 | python3 scripts/test_judge_answers.py 228 | python3 scripts/test_branch_judge_answers.py 229 | ``` 230 | 231 | Example output tree 232 | 233 | ```shell 234 | eval_output/ 235 | ├── mt_bench 236 | │   ├── model_answer 237 | │   │   └── instructlab 238 | │   │   └── granite-7b-lab.jsonl 239 | │   └── model_judgment 240 | │   └── instructlab 241 | │   └── granite-7b-lab_single.jsonl 242 | └── mt_bench_branch 243 | ├── main 244 | │   ├── 
model_answer 245 | │   │   └── instructlab 246 | │   │   └── granite-7b-lab.jsonl 247 | │   ├── model_judgment 248 | │   │   └── instructlab 249 | │   │   └── granite-7b-lab_single.jsonl 250 | │   ├── question.jsonl 251 | │   └── reference_answer 252 | │   └── instructlab 253 | │   └── granite-7b-lab.jsonl 254 | └── rc 255 | ├── model_answer 256 | │   └── instructlab 257 | │   └── granite-7b-lab.jsonl 258 | ├── model_judgment 259 | │   └── instructlab 260 | │   └── granite-7b-lab_single.jsonl 261 | ├── question.jsonl 262 | └── reference_answer 263 | └── instructlab 264 | └── granite-7b-lab.jsonl 265 | ``` 266 | 267 | ## Developer Certificate of Origin 268 | 269 | When you make a contribution to InstructLab eval, you implicitly agree to the Developer Certificate of Origin terms as set in `DCO.txt` at the root of this repository. 270 | -------------------------------------------------------------------------------- /constraints-dev.txt.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/instructlab/eval/11683ac841df1df6ee65ea6478d1066bb744833d/constraints-dev.txt.in -------------------------------------------------------------------------------- /docs/ci.md: -------------------------------------------------------------------------------- 1 | # CI for InstructLab Eval 2 | 3 | Before running any testing locally, ensure you have run `pip install -r requirements-dev.txt` in your environment. 4 | 5 | ## Unit tests 6 | 7 | Unit tests are designed to test specific Eval components or features in isolation. Generally, new code should be adding or modifying unit tests. 8 | 9 | All unit tests currently live in the `tests/` directory and are run with [pytest](https://docs.pytest.org/) via [tox](https://tox.wiki/). 10 | 11 | To run the unit tests, you can run `tox -e unit` or `tox -e unitcov` if you want to generate coverage metrics as well. 12 | 13 | In CI, the tests are run with Python 3.11 on Ubuntu and MacOS runners - you can see the details [here](https://github.com/instructlab/eval/blob/main/.github/workflows/test.yml) 14 | 15 | ## Functional tests 16 | 17 | Functional tests are designed to test Eval components or features in tandem, but not necessarily as part of a complex workflow. New code may or may not need a functional test but should strive to implement one if possible. 18 | 19 | The functional test script is Shell-based and can be found at `scripts/functional-tests.sh`. 20 | 21 | To run the functional tests, you can run `tox -e functional`. 22 | 23 | In CI, the tests are run with Python 3.11 on Ubuntu and MacOS runners - you can see the details [here](https://github.com/instructlab/eval/blob/main/.github/workflows/test.yml) 24 | 25 | ## End-to-end (E2E) tests 26 | 27 | InstructLab Eval has several end-to-end jobs that run to ensure compatibility with the [InstructLab Core](https://github.com/instructlab/instructlab) project. 28 | You can see details about the types of jobs being run in the matrix below. 29 | 30 | For more details about the E2E scripts themselves, see [the InstructLab Core documentation](https://github.com/instructlab/instructlab/blob/main/docs/maintainers/ci.md#end-to-end-e2e-tests). 31 | 32 | ### Current E2E Jobs 33 | 34 | | Name | T-Shirt Size | Runner Host | Instance Type | OS | GPU Type | Script | Flags | Runs when? | Slack/Discord reporting? 
| 35 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | 36 | | [`e2e-nvidia-l4-x1.yml`](https://github.com/instructlab/sdg/blob/main/.github/workflows/e2e-nvidia-l4-x1.yml) | Medium | AWS |[`g6.8xlarge`](https://aws.amazon.com/ec2/instance-types/g5/) | CentOS Stream 9 | 1 x NVIDIA L4 w/ 24 GB VRAM | `e2e-ci.sh` | `m` | Pull Requests, Push to `main` or `release-*` branch | No | 37 | | [`e2e-nvidia-l40s-x4.yml`](https://github.com/instructlab/sdg/blob/main/.github/workflows/e2e-nvidia-l40s-x4.yml) | Large | AWS |[`g6e.12xlarge`](https://aws.amazon.com/ec2/instance-types/g6e/) | CentOS Stream 9 | 4 x NVIDIA L40S w/ 48 GB VRAM (192 GB) | `e2e-ci.sh` | `l` | Manually by Maintainers, Automatically against `main` branch at 4PM UTC | Yes | 38 | 39 | ### Discord/Slack reporting 40 | 41 | Some E2E jobs send their results to the channel `#e2e-ci-results` via the `Son of Jeeves` bot in both Discord and Slack. You can see which jobs currently have reporting via the "Current E2E Jobs" table above. 42 | 43 | In Slack, this has been implemented via [the official Slack GitHub Action](https://github.com/slackapi/slack-github-action?tab=readme-ov-file#technique-2-slack-app). 44 | In Discord, we use [actions/actions-status-discord](https://github.com/sarisia/actions-status-discord) and the built-in channel webhooks feature. 45 | 46 | ### Triggering an E2E job via GitHub Web UI 47 | 48 | For the E2E jobs that can be launched manually, they take an input field that 49 | specifies the PR number or git branch to run them against. If you run them 50 | against a PR, they will automatically post a comment to the PR when the tests 51 | begin and end so it's easier for those involved in the PR to follow the results. 52 | 53 | 1. Visit the [Actions tab](https://github.com/instructlab/eval/actions). 54 | 2. Click on one of the E2E workflows on the left side of the page. 55 | 3. Click on the `Run workflow` button on the right side of the page. 56 | 4. Enter a branch name or a PR number in the input field. 57 | 5. Click the green `Run workflow` button. 58 | 59 | > [!NOTE] 60 | > Only users with "Write" permissions to the repo can run CI jobs manually 61 | -------------------------------------------------------------------------------- /docs/release-strategy.md: -------------------------------------------------------------------------------- 1 | # InstructLab Eval Release Strategy 2 | 3 | This document discusses the release strategy and processes for the 4 | `instructlab-eval` Python package built from the 5 | git repository. 6 | 7 | ## Versioning Scheme 8 | 9 | Releases use a `X.Y.Z` numbering scheme. 10 | 11 | X-stream release are for major releases. At this stage in the project a major release has not been cut and we expect each release to be a new Y-stream. 12 | 13 | Z-stream releases are meant for critical bug and documentation fixes. Z-stream releases are cut as maintainers see fit. 14 | 15 | ## Schedule 16 | 17 | The project currently operates on an ad-hoc release schedule based on the discretion of the maintainers team. 18 | 19 | The cadence for major releases starting from 1.0 onward will be determined as the project matures. 20 | 21 | A schedule will be updated in a markdown file on the GitHub repository. 22 | 23 | ## Release Tracking 24 | 25 | Currently there is no formal process of release tracking. GitHub Issues are used for tracking individual work items. 26 | 27 | In the future, the project may use Milestones or Project Boards for more formal release planning. 
At that time this document will be updated. 28 | 29 | ## Git Branches and Tags 30 | 31 | Every `X.Y` release stream gets a new branch. 32 | 33 | Each release, `X.Y.Z`, exists as a tag named `vX.Y.Z`. 34 | 35 | ## Release Branch Maintenance 36 | 37 | Maintenance efforts are only on the most recent Y-stream. 38 | Critical bug fixes are backported to the most recent release branch. 39 | 40 | ## Release Mechanics 41 | 42 | Release mechanics are done by a Release Manager identified for that release. 43 | The Release Manager is a member of the Eval Maintainers team that has agreed to take on these responsibilities. 44 | The Release Manager can change on a per-release basis. 45 | 46 | The following are the steps for how Y-stream and Z-stream releases gets cut. 47 | 48 | ### Y-Stream 49 | 50 | 1. Determine a commit on the main branch that will serve as the basis for the next release - most of the time this should be the latest commit. 51 | 1. Create a new release branch in the format `release-vX.Y` off of the determined commit (will match `main` if the latest commit is chosen). 52 | 1. Create a new release on GitHub targeting the release branch and using the latest Y-Stream tag as the previous release (e.g. `0.15.1` precedes `0.16.0`). 53 | 1. Announce release via the following: 54 | - The `#eval` channel on Slack 55 | - The `#eval` channel on Discord 56 | - The `dev` mailing list 57 | 58 | ### Z-Stream 59 | 60 | 1. Backport all relevant commits from `main` to the `release-vX.Y` branch. 61 | - It may also be the case you wish to update release branch first - if this approach is taken, ensure any relevant commits are subsequently backported to `main` 62 | 1. Create a new release on GitHub targeting the release branch and using the previous Z-Stream tag as the previous release (e.g. `0.15.0` precedes `0.15.1`). 63 | 1. Announce release via the following: 64 | - The `#eval` channel on Slack 65 | - The `#eval` channel on Discord 66 | - The `dev` mailing list 67 | 68 | ## Release Notes 69 | 70 | The project maintains a single `CHANGELOG.md` file that documents all releases. To ensure our users 71 | are well-informed about new features, improvements, and breaking changes, we maintain a 72 | `CHANGELOG.md` file. This file serves as a centralized place to document changes that will be 73 | included in the next (X) or (Y) release. Given that the project is in its early stages, we are 74 | currently focused on documenting changes for the next (Y) release. 75 | 76 | ### Editing Release Notes 77 | 78 | When submitting a Pull Request (PR) that introduces notable features or breaking changes, committers 79 | need to update the `CHANGELOG.md` file. Clearly describe the changes, their impact, and 80 | any actions users might need to take. We want clear, concise, and user-friendly notes. 81 | 82 | ### Branching for a New Release 83 | 84 | Each time we prepare for a new (X) or (Y) release, we branch out from the main codebase. 85 | As part of this branching process, the contents of `CHANGELOG.md` are reviewed and 86 | finalized. 
87 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | [build-system] 4 | requires = ["setuptools>=64", "setuptools_scm>=8"] 5 | build-backend = "setuptools.build_meta" 6 | 7 | [project] 8 | name = "instructlab-eval" 9 | authors = [ 10 | { name="InstructLab", email="dev@instructlab.ai" }, 11 | ] 12 | description = "Evaluation" 13 | readme = "README.md" 14 | license = {text = "Apache-2.0"} 15 | requires-python = ">=3.11" 16 | classifiers = [ 17 | "Development Status :: 3 - Alpha", 18 | "Environment :: Console", 19 | "License :: OSI Approved :: Apache Software License", 20 | "License :: OSI Approved :: MIT License", 21 | "Operating System :: MacOS :: MacOS X", 22 | "Operating System :: POSIX :: Linux", 23 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 24 | "Programming Language :: Python :: 3", 25 | "Programming Language :: Python :: 3.11", 26 | "Programming Language :: Python :: 3.12", 27 | "Programming Language :: Python :: Implementation :: CPython", 28 | ] 29 | dynamic = ["dependencies", "optional-dependencies", "version"] 30 | 31 | [project.scripts] 32 | 33 | [project.urls] 34 | homepage = "https://instructlab.ai" 35 | source = "https://github.com/instructlab/eval" 36 | issues = "https://github.com/instructlab/eval/issues" 37 | 38 | [project.entry-points."instructlab.eval.evaluator"] 39 | "mmlu" = "instructlab.eval.mmlu:MMLUEvaluator" 40 | "mmlu_branch" = "instructlab.eval.mmlu:MMLUBranchEvaluator" 41 | "mt_bench" = "instructlab.eval.mt_bench:MTBenchEvaluator" 42 | "mt_bench_branch" = "instructlab.eval.mt_bench:MTBenchBranchEvaluator" 43 | "leaderboard_v2" = "instructlab.eval.leaderboard:LeaderboardV2Evaluator" 44 | "ruler" = "instructlab.eval.ruler:RulerEvaluator" 45 | 46 | [tool.setuptools_scm] 47 | version_file = "src/instructlab/eval/_version.py" 48 | # do not include +gREV local version, required for Test PyPI upload 49 | local_scheme = "no-local-version" 50 | 51 | [tool.setuptools] 52 | package-dir = {"" = "src"} 53 | 54 | [tool.setuptools.dynamic] 55 | dependencies = {file = ["requirements.txt"]} 56 | optional-dependencies = {leaderboard = {file = ["requirements-leaderboard.txt"]}} 57 | 58 | [tool.setuptools.packages.find] 59 | where = ["src"] 60 | include = ["instructlab.eval"] 61 | 62 | [tool.ruff] 63 | target-version = "py39" 64 | # same as black's default line length 65 | line-length = 88 66 | 67 | [tool.ruff.lint] 68 | # Allow fix for all enabled rules (when `--fix`) is provided. 69 | fixable = ["ALL"] 70 | unfixable = [] 71 | 72 | # Fixers will be enabled gradually. 73 | select = [ 74 | # "B", # flake8-bugbear 75 | # "E", # pycodestyle 76 | # "F", # Pyflakes 77 | "Q", # flake8-quotes 78 | # Ruff does not support isort's import_headings feature, yet. 
79 | # "I", # isort 80 | # "UP", # pyupgrade 81 | # "SIM", # flake8-simplify 82 | "TID", # flake8-tidy-imports 83 | ] 84 | ignore = [ 85 | # some embedded strings are longer than 88 characters 86 | "E501", # line too long 87 | "TID252", # Prefer absolute imports over relative imports from parent modules 88 | ] 89 | 90 | [tool.ruff.lint.isort] 91 | # same as .isort.cfg 92 | from-first = true 93 | # not supported yet 94 | # import-heading-future=Future 95 | # import-heading-stdlib=Standard 96 | # import-heading-thirdparty=Third Party 97 | # import-heading-firstparty=First Party 98 | # import-heading-localfolder=Local 99 | known-local-folder = ["tuning"] 100 | 101 | [tool.mypy] 102 | ignore_missing_imports = true 103 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | -r requirements.txt 4 | 5 | pre-commit>=3.0.4 6 | pylint>=2.16.2 7 | pylint-pydantic 8 | tox>=4.4.2 9 | 10 | pytest 11 | pytest-asyncio 12 | pytest-cov 13 | pytest-html 14 | 15 | ruff 16 | isort 17 | pyspelling 18 | 19 | mypy>=1.10.0 20 | types-tqdm 21 | types-PyYAML 22 | -------------------------------------------------------------------------------- /requirements-files.in: -------------------------------------------------------------------------------- 1 | requirements.txt 2 | requirements-dev.txt 3 | requirements-leaderboard.txt 4 | -------------------------------------------------------------------------------- /requirements-leaderboard.txt: -------------------------------------------------------------------------------- 1 | lm-eval[ifeval,vllm,math,sentencepiece]>=0.4.4 2 | 3 | # vLLM 0.8.3 + torch 2.6.0 doesn't work when running vLLM on granite-3.1-8b-instruct 4 | vllm<=0.7.3 5 | torch<=2.5.1 6 | -------------------------------------------------------------------------------- /requirements-ruler.txt: -------------------------------------------------------------------------------- 1 | lm-eval[ruler]>=0.4.8 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | GitPython>=3.1.42 3 | shortuuid 4 | openai>=1.13.3 5 | psutil 6 | torch 7 | transformers 8 | accelerate 9 | pandas 10 | pandas-stubs 11 | # Base lm-eval dependency 12 | lm-eval>=0.4.4 13 | httpx 14 | ragas>=0.2.11 15 | -------------------------------------------------------------------------------- /scripts/evaluate_best_checkpoint.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Example usage: 5 | python scripts/evaluate_best_checkpoint.py \ 6 | /path/to/checkpoint_dir \ 7 | --output-file /path/to/output_file 8 | """ 9 | 10 | # Standard 11 | from pathlib import Path 12 | from typing import Optional 13 | import json 14 | 15 | # Third Party 16 | import typer 17 | 18 | app = typer.Typer() 19 | 20 | 21 | @app.command() 22 | def main( 23 | input_dir: Path = typer.Argument(..., help="Input directory to process"), 24 | output_file: Optional[Path] = typer.Option(None, help="Optional output file path"), 25 | ): 26 | """ 27 | Process files in the input directory and optionally save results to an output file. 
28 | """ 29 | if not input_dir.exists(): 30 | typer.echo(f"Error: Input directory '{input_dir}' does not exist") 31 | raise typer.Exit(1) 32 | 33 | if not input_dir.is_dir(): 34 | typer.echo(f"Error: '{input_dir}' is not a directory") 35 | raise typer.Exit(1) 36 | 37 | checkpoint_dirs = list(input_dir.glob("hf_format/samples_*")) 38 | typer.echo(f"Found {len(checkpoint_dirs)} samples files") 39 | 40 | if not checkpoint_dirs: 41 | typer.echo( 42 | f"No checkpoint directories found in the input directory: {input_dir}" 43 | ) 44 | raise typer.Exit(1) 45 | 46 | typer.echo("importing LeaderboardV2Evaluator, this may take a while...") 47 | # First Party 48 | from instructlab.eval.leaderboard import LeaderboardV2Evaluator 49 | 50 | checkpoint_results = {} 51 | for checkpoint in checkpoint_dirs: 52 | typer.echo(f"Processing checkpoint: {checkpoint}") 53 | ckpt_output_file = checkpoint / "leaderboard_results.json" 54 | evaluator = LeaderboardV2Evaluator( 55 | model_path=str(checkpoint), output_file=ckpt_output_file, num_gpus=8 56 | ) 57 | result = evaluator.run() 58 | checkpoint_results[checkpoint.name] = result 59 | typer.echo(f"Checkpoint {checkpoint.name} results: {result['overall_score']}") 60 | 61 | # Sort checkpoints by score 62 | sorted_checkpoints = sorted( 63 | checkpoint_results.items(), key=lambda x: x[1]["overall_score"], reverse=True 64 | ) 65 | typer.echo("Sorted checkpoints by score:") 66 | for checkpoint_name, result in sorted_checkpoints: 67 | typer.echo(f"{'=' * 100}") 68 | typer.echo(json.dumps(result, indent=2)) 69 | 70 | typer.echo(f"{'=' * 100}") 71 | typer.echo(f"Best checkpoint: {sorted_checkpoints[0][0]}") 72 | 73 | if output_file: 74 | typer.echo(f"Output will be saved to: {output_file}") 75 | with open(output_file, "w") as f: 76 | json.dump(checkpoint_results, f, indent=2) 77 | 78 | # Add your processing logic here 79 | 80 | typer.echo("Processing complete!") 81 | 82 | 83 | if __name__ == "__main__": 84 | app() 85 | -------------------------------------------------------------------------------- /scripts/functional-tests.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # SPDX-License-Identifier: Apache-2.0 3 | # 4 | # This test script is laid out as follows: 5 | # - UTILITIES: utility functions 6 | # - TESTS: test functions 7 | # - SETUP: environment setup steps 8 | # - MAIN: test execution steps 9 | # 10 | # If you are running locally and calling the script multiple times you may want to run like this: 11 | # 12 | # TEST_DIR=/tmp/foo ./scripts/functional-tests.sh 13 | 14 | set -ex 15 | 16 | ############# 17 | # UTILITIES # 18 | ############# 19 | 20 | clone_taxonomy(){ 21 | if [ ! 
-d taxonomy ]; then 22 | git clone https://github.com/instructlab/taxonomy.git 23 | fi 24 | } 25 | 26 | ######### 27 | # TESTS # 28 | ######### 29 | 30 | test_branch_generator(){ 31 | python3 ${SCRIPTDIR}/test_branch_generator.py --test-dir "${TEST_DIR}" 32 | } 33 | 34 | ######### 35 | # SETUP # 36 | ######### 37 | 38 | # shellcheck disable=SC2155 39 | export SCRIPTDIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 40 | # build a prompt string that includes the time, source file, line number, and function name 41 | export PS4='+$(date +"%Y-%m-%d %T") ${BASH_VERSION}:${BASH_SOURCE}:${LINENO}: ${FUNCNAME[0]:+${FUNCNAME[0]}(): }' 42 | 43 | # Support overriding the test directory for local testing otherwise creates a temporary directory 44 | TEST_DIR=${TEST_DIR:-$(mktemp -d)} 45 | 46 | export TEST_DIR 47 | export PACKAGE_NAME='instructlab-eval' 48 | 49 | 50 | ######## 51 | # MAIN # 52 | ######## 53 | 54 | pushd $TEST_DIR 55 | 56 | clone_taxonomy 57 | 58 | test_branch_generator 59 | 60 | 61 | popd 62 | exit 0 63 | -------------------------------------------------------------------------------- /scripts/ruff.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # SPDX-License-Identifier: Apache-2.0 3 | set -e 4 | 5 | # wrapper to combine ruff check, ruff format, and isort 6 | # 7 | # "ruff.sh fix" runs fixes and reformats the code 8 | # "ruff.sh check" checks style, format, and isort 9 | # "ruff.sh " passes abitrary args to ruff 10 | 11 | if [ -z "$1" ]; then 12 | echo "USAGE: $0 [check|fix|]" >&2 13 | exit 2 14 | fi 15 | 16 | run() { 17 | declare -i err 18 | 19 | echo "RUN: '$*'" 20 | "$@" 21 | err=$? 22 | echo 23 | return $err 24 | } 25 | 26 | case $1 in 27 | "check") 28 | declare -i exitcode=0 29 | 30 | set +e 31 | run ruff check . 32 | exitcode=$(( exitcode + $? )) 33 | 34 | run ruff format --diff . 35 | exitcode=$(( exitcode + $? )) 36 | 37 | run isort --check --diff . 38 | exitcode=$(( exitcode + $? )) 39 | set -e 40 | 41 | if [ $exitcode -ne 0 ]; then 42 | echo "ERROR: one or more checks have failed." >&2 43 | echo "Run 'tox -e ruff' to auto-correct all fixable errors." >&2 44 | exit 3 45 | fi 46 | ;; 47 | "fix") 48 | run ruff check --fix . 49 | run ruff format . 50 | run isort . 
51 | ;; 52 | *) 53 | ruff "$@" 54 | esac 55 | -------------------------------------------------------------------------------- /scripts/test_branch_gen_answers.py: -------------------------------------------------------------------------------- 1 | # Third Party 2 | import httpx 3 | 4 | # First Party 5 | from instructlab.eval.mt_bench import MTBenchBranchEvaluator 6 | 7 | mt_bench_branch = MTBenchBranchEvaluator( 8 | "instructlab/granite-7b-lab", 9 | "instructlab/granite-7b-lab", 10 | "../taxonomy", 11 | "main", 12 | ) 13 | mt_bench_branch.gen_answers( 14 | "http://localhost:8000/v1", 15 | http_client=httpx.Client(verify=False), 16 | ) 17 | -------------------------------------------------------------------------------- /scripts/test_branch_generator.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | import argparse 3 | import os 4 | 5 | # First Party 6 | from instructlab.eval import mt_bench_branch_generator 7 | 8 | 9 | def test_mt_bench_branch_generator(test_dir): 10 | output_dir = os.path.join(test_dir, "mt_bench_branch_generator") 11 | mt_bench_branch_generator.generate( 12 | "prometheus-eval/prometheus-8x7b-v2.0", 13 | "main", 14 | "taxonomy", 15 | output_dir, 16 | ) 17 | main_dir = os.path.join(output_dir, "mt_bench_branch", "main") 18 | assert os.path.isfile(os.path.join(main_dir, "question.jsonl")) 19 | assert os.path.isfile( 20 | os.path.join( 21 | main_dir, 22 | "reference_answer", 23 | "prometheus-eval", 24 | "prometheus-8x7b-v2.0.jsonl", 25 | ) 26 | ) 27 | 28 | 29 | if __name__ == "__main__": 30 | parser = argparse.ArgumentParser(description="Test Branch Generator") 31 | parser.add_argument("--test-dir", help="Base test working directory") 32 | args = parser.parse_args() 33 | test_dir = args.test_dir 34 | 35 | test_mt_bench_branch_generator(test_dir) 36 | -------------------------------------------------------------------------------- /scripts/test_branch_judge_answers.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | import pprint 3 | 4 | # First Party 5 | from instructlab.eval.mt_bench import MTBenchBranchEvaluator 6 | 7 | mt_bench_branch = MTBenchBranchEvaluator( 8 | "instructlab/granite-7b-lab", 9 | "instructlab/granite-7b-lab", 10 | "../taxonomy", 11 | "main", 12 | ) 13 | overall_score, qa_pairs, error_rate = mt_bench_branch.judge_answers( 14 | "http://localhost:8000/v1" 15 | ) 16 | 17 | print(f"Overall Score: {overall_score}") 18 | print(f"Error Rate: {error_rate}") 19 | print(f"QA Pair 0:") 20 | pprint.pprint(qa_pairs[0]) 21 | 22 | print(f"qa_pairs length: {len(qa_pairs)}") 23 | 24 | for qa_pair in qa_pairs: 25 | question_id = qa_pair.get("question_id") 26 | assert question_id is not None 27 | assert qa_pair.get("score") is not None 28 | assert qa_pair.get("category") is not None 29 | assert qa_pair.get("question") is not None 30 | assert qa_pair.get("answer") is not None 31 | assert qa_pair.get("qna_file") is not None 32 | -------------------------------------------------------------------------------- /scripts/test_gen_answers.py: -------------------------------------------------------------------------------- 1 | # First Party 2 | from instructlab.eval.mt_bench import MTBenchEvaluator 3 | 4 | mt_bench = MTBenchEvaluator("instructlab/granite-7b-lab", "instructlab/granite-7b-lab") 5 | mt_bench.gen_answers("http://localhost:8000/v1") 6 | -------------------------------------------------------------------------------- /scripts/test_judge_answers.py: 
-------------------------------------------------------------------------------- 1 | # Standard 2 | import pprint 3 | 4 | # First Party 5 | from instructlab.eval.mt_bench import MTBenchEvaluator 6 | 7 | mt_bench = MTBenchEvaluator("instructlab/granite-7b-lab", "instructlab/granite-7b-lab") 8 | overall_score, qa_pairs, turn_scores, error_rate = mt_bench.judge_answers( 9 | "http://localhost:8000/v1" 10 | ) 11 | 12 | print(f"Overall Score: {overall_score}") 13 | print(f"Turn 1 Score: {turn_scores[0]}") 14 | print(f"Turn 2 Score: {turn_scores[1]}") 15 | print(f"Error Rate: {error_rate}") 16 | print(f"QA Pair 0:") 17 | pprint.pprint(qa_pairs[0]) 18 | 19 | print(f"qa_pairs length: {len(qa_pairs)}") 20 | 21 | for qa_pair in qa_pairs: 22 | assert qa_pair.get("question_id") is not None 23 | assert qa_pair.get("score") is not None 24 | assert qa_pair.get("category") is not None 25 | assert qa_pair.get("question") is not None 26 | assert qa_pair.get("answer") is not None 27 | -------------------------------------------------------------------------------- /scripts/test_leaderboard.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # NOTE: This script requires the leaderboard optional dependencies. 5 | # Install with: pip install instructlab-eval[leaderboard] 6 | 7 | # Standard 8 | import json 9 | 10 | # First Party 11 | from instructlab.eval.leaderboard import LeaderboardV2Evaluator 12 | 13 | if __name__ == "__main__": 14 | evaluator = LeaderboardV2Evaluator( 15 | model_path="ibm-granite/granite-3.1-8b-base", 16 | eval_config={ 17 | "apply_chat_template": False, 18 | }, 19 | ) 20 | results = evaluator.run() 21 | print("got results from leaderboard v2") 22 | print(json.dumps(results, indent=2)) 23 | -------------------------------------------------------------------------------- /scripts/test_mmlu.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | from typing import Dict, List, Tuple, TypedDict 3 | 4 | # First Party 5 | from instructlab.eval.mmlu import MMLUEvaluator 6 | 7 | SYSTEM_PROMPT = """I am, Red Hat® Instruct Model based on Granite 7B, an AI language model developed by Red Hat and IBM Research, based on the Granite-7b-base language model. My primary function is to be a chat assistant.""" 8 | 9 | 10 | class MMLUSample(TypedDict): 11 | """ 12 | Example of a single sample returned from lm_eval when running MMLU. 13 | This is not a comprehensive type, just the subset of fields we care about for this test. 14 | """ 15 | 16 | # Arguments is the list of (prompt, answer) pairs passed to MMLU as few-shot samples. 
17 | # They will not be present with few_shot=0 18 | arguments: List[Tuple[str, str]] 19 | 20 | 21 | def all_samples_contain_system_prompt( 22 | samples: Dict[str, List[MMLUSample]], prompt: str 23 | ) -> bool: 24 | """ 25 | Given a mapping of evaluation --> list of results, validates that all few-shot examples 26 | included the system prompt 27 | """ 28 | for topic, samples_set in samples.items(): 29 | for sample in samples_set: 30 | for mmlu_prompt, _ in sample["arguments"]: 31 | if prompt not in mmlu_prompt: 32 | # we are looking for the exact system prompt, so no need to convert to normalize to lowercase 33 | print(f"found a sample in the '{topic}' MMLU topic set") 34 | return False 35 | 36 | return True 37 | 38 | 39 | def test_minimal_mmlu(): 40 | print("===> Executing 'test_minimal_mmlu'...") 41 | try: 42 | model_path = "instructlab/granite-7b-lab" 43 | tasks = ["mmlu_anatomy", "mmlu_astronomy"] 44 | mmlu = MMLUEvaluator( 45 | model_path=model_path, 46 | tasks=tasks, 47 | system_prompt=SYSTEM_PROMPT, 48 | ) 49 | overall_score, individual_scores = mmlu.run( 50 | extra_args={"log_samples": True, "write_out": True} 51 | ) 52 | samples = mmlu.results["samples"] 53 | 54 | print(overall_score) 55 | print(individual_scores) 56 | 57 | # we need n-shots > 1 to be able to validate the inclusion of the system prompt 58 | eligible_samples = { 59 | topic: samples[topic] 60 | for topic, shot in mmlu.results["n-shot"].items() 61 | if shot > 1 62 | } 63 | if eligible_samples: 64 | if not all_samples_contain_system_prompt(eligible_samples, SYSTEM_PROMPT): 65 | return False 66 | else: 67 | print( 68 | "MMLU was run in zero-shot mode, cannot confirm that system prompt was included, skipping check..." 69 | ) 70 | 71 | except Exception as exc: 72 | print(f"'test_minimal_mmlu' failed: {exc}") 73 | return False 74 | return True 75 | 76 | 77 | if __name__ == "__main__": 78 | assert test_minimal_mmlu() == True 79 | -------------------------------------------------------------------------------- /scripts/test_mmlu_branch.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | import os 3 | 4 | # First Party 5 | from instructlab.eval.mmlu import MMLUBranchEvaluator 6 | 7 | 8 | def test_mmlu_branch(): 9 | print("===> Executing 'test_mmlu_branch'...") 10 | try: 11 | model_path = "instructlab/granite-7b-lab" 12 | tasks_dir = ( 13 | f"{os.path.dirname(os.path.realpath(__file__))}/../tests/testdata/sdg" 14 | ) 15 | tasks = ["mmlu_pr"] 16 | mmlu = MMLUBranchEvaluator( 17 | model_path=model_path, tasks_dir=tasks_dir, tasks=tasks 18 | ) 19 | overall_score, individual_scores = mmlu.run() 20 | print(overall_score) 21 | print(individual_scores) 22 | except Exception as exc: 23 | print(f"'test_mmlu_branch' failed: {exc}") 24 | return False 25 | return True 26 | 27 | 28 | if __name__ == "__main__": 29 | assert test_mmlu_branch() == True 30 | -------------------------------------------------------------------------------- /src/instructlab/__init__.py: -------------------------------------------------------------------------------- 1 | __path__ = __import__("pkgutil").extend_path(__path__, __name__) 2 | -------------------------------------------------------------------------------- /src/instructlab/eval/__init__.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | import os 3 | 4 | # Inherit logging from caller rather than from vLLM 5 | os.environ["VLLM_CONFIGURE_LOGGING"] = "0" 6 | 
-------------------------------------------------------------------------------- /src/instructlab/eval/data/mt_bench/judge_prompts.jsonl: -------------------------------------------------------------------------------- 1 | {"name": "pair-v2", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. You should choose the assistant that follows the user's instructions and answers the user's question better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. Begin your evaluation by comparing the two responses and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "[User Question]\n{question}\n\n[The Start of Assistant A's Answer]\n{answer_a}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\n{answer_b}\n[The End of Assistant B's Answer]", "description": "Prompt for general questions", "category": "general", "output_format": "[[A]]"} 2 | {"name": "pair-v2-multi-turn", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user questions. You should choose the assistant that follows the user's instructions and answers the user's questions better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. You should focus on who provides a better answer to the second user question. Begin your evaluation by comparing the responses of the two assistants and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_a_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_a_2}\n\n<|The End of Assistant A's Conversation with User|>\n\n\n<|The Start of Assistant B's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant B:\n{answer_b_1}\n\n### User:\n{question_2}\n\n### Assistant B:\n{answer_b_2}\n\n<|The End of Assistant B's Conversation with User|>", "description": "Prompt for multi-turn general questions", "category": "general", "output_format": "[[A]]"} 3 | {"name": "pair-math-v1", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. Your evaluation should consider correctness and helpfulness. 
You will be given a reference answer, assistant A's answer, and assistant B's answer. Your job is to evaluate which assistant's answer is better. Begin your evaluation by comparing both assistants' answers with the reference answer. Identify and correct any mistakes. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "[User Question]\n{question}\n\n[The Start of Reference Answer]\n{ref_answer_1}\n[The End of Reference Answer]\n\n[The Start of Assistant A's Answer]\n{answer_a}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\n{answer_b}\n[The End of Assistant B's Answer]", "description": "Prompt for math questions", "category": "math", "output_format": "[[A]]"} 4 | {"name": "pair-math-v1-multi-turn", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user questions. Your evaluation should consider correctness and helpfulness. You will be given reference answers, the assistant A's answers, the assistant B's answers. Your job is to determine which assistant provides correct and helpful answers to the second user question. Begin your evaluation by comparing both assistants' answers with the reference answers. Identify and correct any mistakes. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_a_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_a_2}\n\n<|The End of Assistant A's Conversation with User|>\n\n\n<|The Start of Assistant B's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant B:\n{answer_b_1}\n\n### User:\n{question_2}\n\n### Assistant B:\n{answer_b_2}\n\n<|The End of Assistant B's Conversation with User|>", "description": "Prompt for multi-turn general questions", "category": "general", "output_format": "[[A]]"} 5 | {"name": "single-v1", "type": "single", "system_prompt": "You are a helpful assistant.", "prompt_template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Begin your evaluation by providing a short explanation. Be as objective as possible. 
After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"} 6 | {"name": "single-math-v1", "type": "single", "system_prompt": "You are a helpful assistant.", "prompt_template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Reference Answer]\n{ref_answer_1}\n[The End of Reference Answer]\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"} 7 | {"name": "single-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. You evaluation should focus on the assistant's answer to the second user question. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"} 8 | {"name": "single-math-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. You evaluation should focus on the assistant's answer to the second question. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. 
After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"} 9 | -------------------------------------------------------------------------------- /src/instructlab/eval/data/mt_bench_branch/judge_prompts.jsonl: -------------------------------------------------------------------------------- 1 | {"name": "pair-v2", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. You should choose the assistant that follows the user's instructions and answers the user's question better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. Begin your evaluation by comparing the two responses and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "[User Question]\n{question}\n\n[The Start of Assistant A's Answer]\n{answer_a}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\n{answer_b}\n[The End of Assistant B's Answer]", "description": "Prompt for general questions", "category": "general", "output_format": "[[A]]"} 2 | {"name": "pair-v2-multi-turn", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user questions. You should choose the assistant that follows the user's instructions and answers the user's questions better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. You should focus on who provides a better answer to the second user question. Begin your evaluation by comparing the responses of the two assistants and provide a short explanation. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. 
After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_a_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_a_2}\n\n<|The End of Assistant A's Conversation with User|>\n\n\n<|The Start of Assistant B's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant B:\n{answer_b_1}\n\n### User:\n{question_2}\n\n### Assistant B:\n{answer_b_2}\n\n<|The End of Assistant B's Conversation with User|>", "description": "Prompt for multi-turn general questions", "category": "general", "output_format": "[[A]]"} 3 | {"name": "pair-math-v1", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. Your evaluation should consider correctness and helpfulness. You will be given a reference answer, assistant A's answer, and assistant B's answer. Your job is to evaluate which assistant's answer is better. Begin your evaluation by comparing both assistants' answers with the reference answer. Identify and correct any mistakes. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "[User Question]\n{question}\n\n[The Start of Reference Answer]\n{ref_answer_1}\n[The End of Reference Answer]\n\n[The Start of Assistant A's Answer]\n{answer_a}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\n{answer_b}\n[The End of Assistant B's Answer]", "description": "Prompt for math questions", "category": "math", "output_format": "[[A]]"} 4 | {"name": "pair-math-v1-multi-turn", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user questions. Your evaluation should consider correctness and helpfulness. You will be given reference answers, the assistant A's answers, the assistant B's answers. Your job is to determine which assistant provides correct and helpful answers to the second user question. Begin your evaluation by comparing both assistants' answers with the reference answers. Identify and correct any mistakes. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. 
After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_a_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_a_2}\n\n<|The End of Assistant A's Conversation with User|>\n\n\n<|The Start of Assistant B's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant B:\n{answer_b_1}\n\n### User:\n{question_2}\n\n### Assistant B:\n{answer_b_2}\n\n<|The End of Assistant B's Conversation with User|>", "description": "Prompt for multi-turn general questions", "category": "general", "output_format": "[[A]]"} 5 | {"name": "single-v1", "type": "single", "system_prompt": "You are a helpful assistant.", "prompt_template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"} 6 | {"name": "single-math-v1", "type": "single", "system_prompt": "You are a helpful assistant.", "prompt_template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. If correct, an assistant's answer that follows a similar style of the reference answer is preferable. Do not bias to any particular style that does not appear in the reference answer. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Reference Answer]\n{ref_answer_1}\n[The End of Reference Answer]\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"} 7 | {"name": "single-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. You evaluation should focus on the assistant's answer to the second user question. Begin your evaluation by providing a short explanation. 
Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"} 8 | {"name": "single-math-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. You evaluation should focus on the assistant's answer to the second question. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "math", "output_format": "[[rating]]"} 9 | -------------------------------------------------------------------------------- /src/instructlab/eval/evaluator.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | 4 | class Evaluator: 5 | """ 6 | Parent class for Evaluators 7 | """ 8 | 9 | name: str 10 | 11 | def __init__(self) -> None: 12 | pass 13 | -------------------------------------------------------------------------------- /src/instructlab/eval/exceptions.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | 4 | class EvalError(Exception): 5 | """ 6 | Parent class for all of instructlab-eval exceptions 7 | """ 8 | 9 | 10 | class ModelNotFoundError(EvalError): 11 | """ 12 | Error raised when model is not able to be found 13 | 14 | Attributes 15 | message error message to be printed on raise 16 | path filepath of model location 17 | """ 18 | 19 | def __init__(self, path) -> None: 20 | super().__init__() 21 | self.path = path 22 | self.message = f"Model could not be found at {path}" 23 | 24 | 25 | class InvalidModelError(EvalError): 26 | """ 27 | Error raised when model can be found but is invalid 28 | 29 | Attributes 30 | message error message to be printed on raise 31 | path filepath of model location 32 | reason root cause for model invalidity 33 | """ 34 | 35 | def __init__(self, path, reason) -> None: 36 | super().__init__() 37 | self.path = path 38 | self.reason = reason 39 | self.message = f"Model found at {path} but was invalid due to: {reason}" 40 | 41 | 42 | class InvalidMaxWorkersError(EvalError): 43 | """ 44 | Error raised when max_workers isn't an int or 
"auto" 45 | 46 | Attributes 47 | message error message to be printed on raise 48 | max_workers max_workers specified 49 | """ 50 | 51 | def __init__(self, max_workers) -> None: 52 | super().__init__() 53 | self.max_workers = max_workers 54 | self.message = f"Invalid max_workers '{max_workers}' specified. Valid values are positive integers or 'auto'." 55 | 56 | 57 | class InvalidGitRepoError(EvalError): 58 | """ 59 | Error raised when taxonomy dir provided isn't a valid git repo 60 | Attributes 61 | message error message to be printed on raise 62 | taxonomy_dir supplied taxonomy directory 63 | """ 64 | 65 | def __init__(self, taxonomy_dir) -> None: 66 | super().__init__() 67 | self.taxonomy_dir = taxonomy_dir 68 | self.message = f"Invalid git repo: {taxonomy_dir}" 69 | 70 | 71 | class GitRepoNotFoundError(EvalError): 72 | """ 73 | Error raised when taxonomy dir provided does not exist 74 | Attributes 75 | message error message to be printed on raise 76 | taxonomy_dir supplied taxonomy directory 77 | """ 78 | 79 | def __init__(self, taxonomy_dir) -> None: 80 | super().__init__() 81 | self.taxonomy_dir = taxonomy_dir 82 | self.message = f"Taxonomy git repo not found: {taxonomy_dir}" 83 | 84 | 85 | class InvalidGitBranchError(EvalError): 86 | """ 87 | Error raised when branch provided is invalid 88 | Attributes 89 | message error message to be printed on raise 90 | branch supplied branch 91 | """ 92 | 93 | def __init__(self, branch) -> None: 94 | super().__init__() 95 | self.branch = branch 96 | self.message = f"Invalid git branch: {branch}" 97 | 98 | 99 | class TasksDirNotFoundError(EvalError): 100 | """ 101 | Error raised when the tasks dir doesn't exist 102 | Attributes 103 | message error message to be printed on raise 104 | tasks_dir tasks dir 105 | """ 106 | 107 | def __init__(self, tasks_dir) -> None: 108 | super().__init__() 109 | self.tasks_dir = tasks_dir 110 | self.message = f"Tasks dir not found: {tasks_dir}" 111 | 112 | 113 | class InvalidTasksDirError(EvalError): 114 | """ 115 | Error raised when the tasks dir is invalid 116 | Attributes 117 | message error message to be printed on raise 118 | tasks_dir tasks dir 119 | """ 120 | 121 | def __init__(self, tasks_dir) -> None: 122 | super().__init__() 123 | self.tasks_dir = tasks_dir 124 | self.message = f"Invalid Tasks Dir: {tasks_dir}" 125 | 126 | 127 | class InvalidEvaluationResult(EvalError): 128 | """ 129 | Error raised for invalid eval results 130 | Attributes 131 | message error message to be printed on raise 132 | """ 133 | 134 | def __init__(self, message) -> None: 135 | super().__init__() 136 | self.message = message 137 | 138 | 139 | class ModelServingAPIError(EvalError): 140 | """ 141 | Error raised when reply retrieval from model serving fails. 142 | Attributes 143 | message error message to be printed on raise 144 | """ 145 | 146 | def __init__(self) -> None: 147 | super().__init__() 148 | self.message = "Failed to receive a reply from model serving API." 
149 | 150 | 151 | class EmptyTaxonomyError(EvalError): 152 | """ 153 | Error raised when taxonomy doesn't contain any skill QNAs 154 | Attributes 155 | message error message to be printed on raise 156 | """ 157 | 158 | def __init__(self) -> None: 159 | super().__init__() 160 | self.message = "Provided taxonomy doesn't contain any skill qna.yaml files" 161 | -------------------------------------------------------------------------------- /src/instructlab/eval/logger_config.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # Standard 3 | import logging 4 | 5 | 6 | def setup_logger(name): 7 | # Set up the logger 8 | logger = logging.getLogger(name) 9 | return logger 10 | -------------------------------------------------------------------------------- /src/instructlab/eval/mmlu.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | """ 4 | MMLU - Massive Multitask Language Understanding 5 | https://en.wikipedia.org/wiki/MMLU 6 | https://arxiv.org/abs/2009.03300 7 | """ 8 | 9 | # Standard 10 | from typing import Any, Dict, Optional, Union 11 | import os 12 | 13 | # Third Party 14 | from lm_eval.evaluator import simple_evaluate 15 | from lm_eval.tasks import TaskManager 16 | import torch 17 | 18 | # First Party 19 | from instructlab.eval.evaluator import Evaluator 20 | from instructlab.eval.exceptions import ( 21 | InvalidModelError, 22 | InvalidTasksDirError, 23 | ModelNotFoundError, 24 | TasksDirNotFoundError, 25 | ) 26 | 27 | # Local 28 | from .logger_config import setup_logger 29 | 30 | logger = setup_logger(__name__) 31 | 32 | MMLU_TASKS = [ 33 | "mmlu_abstract_algebra", 34 | "mmlu_anatomy", 35 | "mmlu_astronomy", 36 | "mmlu_business_ethics", 37 | "mmlu_clinical_knowledge", 38 | "mmlu_college_biology", 39 | "mmlu_college_chemistry", 40 | "mmlu_college_computer_science", 41 | "mmlu_college_mathematics", 42 | "mmlu_college_medicine", 43 | "mmlu_college_physics", 44 | "mmlu_computer_security", 45 | "mmlu_conceptual_physics", 46 | "mmlu_econometrics", 47 | "mmlu_electrical_engineering", 48 | "mmlu_elementary_mathematics", 49 | "mmlu_formal_logic", 50 | "mmlu_global_facts", 51 | "mmlu_high_school_biology", 52 | "mmlu_high_school_chemistry", 53 | "mmlu_high_school_computer_science", 54 | "mmlu_high_school_european_history", 55 | "mmlu_high_school_geography", 56 | "mmlu_high_school_government_and_politics", 57 | "mmlu_high_school_macroeconomics", 58 | "mmlu_high_school_mathematics", 59 | "mmlu_high_school_microeconomics", 60 | "mmlu_high_school_physics", 61 | "mmlu_high_school_psychology", 62 | "mmlu_high_school_statistics", 63 | "mmlu_high_school_us_history", 64 | "mmlu_high_school_world_history", 65 | "mmlu_human_aging", 66 | "mmlu_human_sexuality", 67 | "mmlu_international_law", 68 | "mmlu_jurisprudence", 69 | "mmlu_logical_fallacies", 70 | "mmlu_machine_learning", 71 | "mmlu_management", 72 | "mmlu_marketing", 73 | "mmlu_medical_genetics", 74 | "mmlu_miscellaneous", 75 | "mmlu_moral_disputes", 76 | "mmlu_moral_scenarios", 77 | "mmlu_nutrition", 78 | "mmlu_philosophy", 79 | "mmlu_prehistory", 80 | "mmlu_professional_accounting", 81 | "mmlu_professional_law", 82 | "mmlu_professional_medicine", 83 | "mmlu_professional_psychology", 84 | "mmlu_public_relations", 85 | "mmlu_security_studies", 86 | "mmlu_sociology", 87 | "mmlu_us_foreign_policy", 88 | "mmlu_virology", 89 | "mmlu_world_religions", 90 | ] 91 | 92 | 93 | class 
AbstractMMLUEvaluator(Evaluator): 94 | """ 95 | Abstract child class of an Evaluator for Massive Multitask Language Understanding Branch 96 | 97 | Attributes: 98 | model_path absolute path to or name of a huggingface model 99 | tasks_dir path where the .jsonl and _task.yaml files for the branches being evaluated are stored 100 | tasks list of tasks for MMLU to test the model with 101 | model_dtype dtype of model when served 102 | few_shots number of examples 103 | batch_size batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'. 104 | device PyTorch device (e.g. "cpu" or "cuda:0") for running models 105 | system_prompt system prompt to be used when applying the chat template 106 | results full output from the `lm_eval.evaluator.simple_evaluate` function after MMLU has run. 107 | """ 108 | 109 | def __init__( 110 | self, 111 | model_path, 112 | tasks_dir: Optional[str], 113 | tasks: list[str], 114 | model_dtype="bfloat16", 115 | few_shots: int = 5, 116 | batch_size: Optional[Union[int, str]] = "auto", 117 | device: str = ("cuda" if torch.cuda.is_available() else "cpu"), 118 | system_prompt: Optional[str] = None, 119 | ) -> None: 120 | self.model_path = model_path 121 | self.system_prompt = system_prompt 122 | self.tasks_dir = tasks_dir 123 | self.tasks = tasks 124 | self.model_dtype = model_dtype 125 | self.few_shots = few_shots 126 | self.batch_size = batch_size 127 | self.device = device 128 | self._results = None 129 | 130 | @property 131 | def results(self) -> Dict[str, Any] | None: 132 | """ 133 | Returns the results of the last MMLU evaluation, if one has taken place. 134 | 135 | Returns: 136 | Dict[str, Any] | None: The output from `lm_eval.evaluator.simple_evaluate` 137 | """ 138 | return self._results 139 | 140 | def run( 141 | self, server_url: str | None = None, extra_args: Dict[str, Any] | None = None 142 | ) -> tuple: 143 | """ 144 | Runs evaluation 145 | 146 | Attributes 147 | server_url Model server endpoint (Ex: http://localhost:8000/v1) for the model being evaluated 148 | extra_args Dictionary containing any extra arguments to be passed into the lm_eval `lm_eval.evaluator.simple_evaluate` function. 149 | 150 | Returns: 151 | overall_score Average score for the task group 152 | individual_scores Individual scores for each task in the task group 153 | """ 154 | extra_args = {} if not extra_args else extra_args 155 | logger.debug(locals()) 156 | 157 | # TODO: make this a parameter for class? 
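        # The remainder of run() delegates to _run_mmlu(), which wraps
        # lm_eval's simple_evaluate(), and then averages the per-task
        # "acc,none" values into a single overall score. A hypothetical call,
        # with the model path and returned numbers shown for illustration only:
        #
        #     evaluator = MMLUEvaluator(model_path="instructlab/granite-7b-lab")
        #     overall_score, individual_scores = evaluator.run()
        #     # overall_score                      -> e.g. 0.52
        #     # individual_scores["mmlu_anatomy"]  -> {"score": ..., "stderr": ...}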
158 | os.environ["TOKENIZERS_PARALLELISM"] = "true" 159 | 160 | individual_scores: dict = {} 161 | agg_score: float = 0.0 162 | 163 | results = self._run_mmlu(server_url) 164 | for task, result in results.items(): 165 | agg_score += float(result["acc,none"]) 166 | individual_scores[task] = { 167 | "score": float(result["acc,none"]), 168 | "stderr": float(result["acc_stderr,none"]), 169 | } 170 | 171 | overall_score = float(agg_score / len(individual_scores)) 172 | 173 | return overall_score, individual_scores 174 | 175 | def _run_mmlu( 176 | self, server_url: str | None = None, extra_args: Dict[str, Any] | None = None 177 | ) -> dict: 178 | extra_args = {} if not extra_args else extra_args 179 | if server_url is not None: 180 | # Requires lm_eval >= 0.4.4 181 | model_args = f"base_url={server_url}/completions,model={self.model_path},tokenizer_backend=huggingface" 182 | model = "local-completions" 183 | else: 184 | model_args = f"pretrained={self.model_path},dtype={self.model_dtype}" 185 | model = "hf" 186 | tm = None 187 | if self.tasks_dir is not None: 188 | if not os.path.exists(self.tasks_dir): 189 | raise TasksDirNotFoundError(self.tasks_dir) 190 | if not os.access(self.tasks_dir, os.R_OK): 191 | raise InvalidTasksDirError(self.tasks_dir) 192 | tm = TaskManager(verbosity="DEBUG", include_path=self.tasks_dir) 193 | should_apply_chat_template = self.system_prompt is not None 194 | 195 | # configure the args here so users can override them as necessary 196 | simple_evaluate_kwargs = { 197 | "model": model, 198 | "model_args": model_args, 199 | "tasks": self.tasks, 200 | "num_fewshot": self.few_shots, 201 | "batch_size": self.batch_size, 202 | "device": self.device, 203 | "task_manager": tm, 204 | "system_instruction": self.system_prompt, 205 | "apply_chat_template": should_apply_chat_template, 206 | } 207 | simple_evaluate_kwargs.update(extra_args) 208 | 209 | results = self._simple_evaluate_with_error_handling(**simple_evaluate_kwargs) 210 | self._results = results 211 | return results["results"] 212 | 213 | # This method converts general errors from simple_evaluate 214 | # into a more user-understandable error 215 | def _simple_evaluate_with_error_handling(self, **kwargs): 216 | try: 217 | return simple_evaluate(**kwargs) 218 | except KeyError as ke: 219 | # If the first task key file cannot be found in tasks_dir, simple_evaluate() will return 220 | # an obscure KeyError(first task key) 221 | if ( 222 | self.tasks_dir is not None 223 | and len(self.tasks) > 0 224 | and ke.args[0] == self.tasks[0] 225 | ): 226 | raise InvalidTasksDirError(self.tasks_dir) from ke 227 | raise 228 | except OSError as ose: 229 | # If a model can not be found, simple_evaluate() will return 230 | # an obscure OSError with a message 231 | if "is not a valid model" in str( 232 | ose 233 | ) or "does not appear to have a file named" in str(ose): 234 | raise ModelNotFoundError(self.model_path) from ose 235 | if "is not a valid JSON file" in str(ose): 236 | reason = "Looked for valid JSON file but couldn't find one - are you pointing at a directory with a 'config.json'?" 
237 | raise InvalidModelError(self.model_path, reason) from ose 238 | raise 239 | 240 | 241 | class MMLUEvaluator(AbstractMMLUEvaluator): 242 | """ 243 | Evaluator for Massive Multitask Language Understanding (MMLU) 244 | 245 | Attributes: 246 | model_path absolute path to or name of a huggingface model 247 | tasks list of tasks for MMLU to test the model with 248 | model_dtype dtype of model when served 249 | few_shots number of examples 250 | batch_size batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'. 251 | device PyTorch device (e.g. "cpu" or "cuda:0") for running models 252 | system_prompt system prompt to be used when applying the chat template 253 | """ 254 | 255 | name = "mmlu" 256 | 257 | def __init__( 258 | self, 259 | model_path, 260 | tasks: list[str] = MMLU_TASKS, 261 | model_dtype="bfloat16", 262 | few_shots: int = 5, 263 | batch_size: Optional[Union[int, str]] = "auto", 264 | device: str = ("cuda" if torch.cuda.is_available() else "cpu"), 265 | system_prompt: Optional[str] = None, 266 | ) -> None: 267 | super().__init__( 268 | model_path, 269 | None, 270 | tasks, 271 | model_dtype, 272 | few_shots, 273 | batch_size, 274 | device, 275 | system_prompt=system_prompt, 276 | ) 277 | 278 | 279 | class MMLUBranchEvaluator(AbstractMMLUEvaluator): 280 | """ 281 | Evaluator for Massive Multitask Language Understanding Branch (MMLUBranch) 282 | 283 | Attributes: 284 | model_path absolute path to or name of a huggingface model 285 | system_prompt system prompt to be used when applying the chat template 286 | tasks_dir path where the .jsonl and _task.yaml files for the branches being evaluated are stored 287 | tasks group name that is shared by all the MMLUBranch tasks 288 | model_dtype dtype of model when served 289 | few_shots number of examples 290 | batch_size batch size for evaluation. Valid values are a positive integer or 'auto' to select the largest batch size that will fit in memory, or 'auto:N' to reselect the largest batch size N times'. 291 | device PyTorch device (e.g. 
"cpu" or "cuda:0") for running models 292 | """ 293 | 294 | name = "mmlu_branch" 295 | -------------------------------------------------------------------------------- /src/instructlab/eval/mt_bench_answers.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # Standard 3 | import concurrent.futures 4 | import json 5 | import os 6 | import time 7 | 8 | # Third Party 9 | import shortuuid 10 | import tqdm 11 | 12 | # Local 13 | from .logger_config import setup_logger 14 | from .mt_bench_common import ( 15 | bench_dir, 16 | chat_completion_openai, 17 | get_openai_client, 18 | load_questions, 19 | temperature_config, 20 | ) 21 | from .mt_bench_model_adapter import get_conversation_template # type: ignore 22 | 23 | logger = setup_logger(__name__) 24 | 25 | 26 | def reorg_answer_file(answer_file): 27 | """Sort by question id and de-duplication""" 28 | logger.debug(locals()) 29 | with open(answer_file, "r+", encoding="utf-8") as f: 30 | answers = {} 31 | for l in f: 32 | qid = json.loads(l)["question_id"] 33 | answers[qid] = l 34 | 35 | # Reset to the beginning of the file and clear it 36 | f.seek(0) 37 | f.truncate() 38 | 39 | qids = sorted(list(answers.keys())) 40 | for qid in qids: 41 | f.write(answers[qid]) 42 | 43 | 44 | def get_answer( 45 | question: dict, 46 | model: str, 47 | num_choices: int, 48 | max_tokens: int, 49 | answer_file: str, 50 | force_temperature: float, 51 | openai_client, 52 | ): 53 | """Answer a question with the model""" 54 | assert force_temperature is None or question.get("required_temperature") is None 55 | if force_temperature is not None: 56 | temperature = force_temperature 57 | elif "required_temperature" in question.keys(): 58 | temperature = question["required_temperature"] 59 | elif question["category"] in temperature_config: 60 | temperature = temperature_config[question["category"]] 61 | else: 62 | temperature = 0.7 63 | 64 | choices = [] 65 | for i in range(num_choices): 66 | conv = get_conversation_template(model, "granite") 67 | 68 | turns = [] 69 | for j in range(len(question["turns"])): 70 | conv.append_message(conv.roles[0], question["turns"][j]) 71 | conv.append_message(conv.roles[1], None) 72 | 73 | output = chat_completion_openai( 74 | openai_client, 75 | model, 76 | conv, 77 | temperature, 78 | max_tokens, 79 | ) 80 | 81 | conv.update_last_message(output) 82 | turns.append(output) 83 | 84 | choices.append({"index": i, "turns": turns}) 85 | 86 | # Dump answers 87 | ans = { 88 | "question_id": question["question_id"], 89 | "answer_id": shortuuid.uuid(), 90 | "model_id": model, 91 | "choices": choices, 92 | "tstamp": time.time(), 93 | } 94 | 95 | os.makedirs(os.path.dirname(answer_file), exist_ok=True) 96 | with open(answer_file, "a", encoding="utf-8") as fout: 97 | fout.write(json.dumps(ans) + "\n") 98 | 99 | 100 | def generate_answers( 101 | model_name, 102 | model_api_base, 103 | api_key=None, 104 | branch=None, 105 | output_dir="eval_output", 106 | data_dir=None, 107 | question_begin=None, 108 | question_end=None, 109 | force_temperature=None, 110 | num_choices=1, 111 | max_tokens=1024, 112 | max_workers=1, 113 | bench_name="mt_bench", 114 | http_client=None, 115 | ): 116 | """Generate model answers to be judged""" 117 | logger.debug(locals()) 118 | 119 | openai_client = get_openai_client(model_api_base, api_key, http_client) 120 | 121 | if data_dir is None: 122 | data_dir = os.path.join(os.path.dirname(__file__), "data") 123 | 124 | data_base_dir = bench_dir(data_dir, 
bench_name, branch) 125 | output_base_dir = bench_dir(output_dir, bench_name, branch) 126 | 127 | question_file = os.path.join(data_base_dir, "question.jsonl") 128 | questions = load_questions(question_file, question_begin, question_end) 129 | 130 | answer_file = os.path.join(output_base_dir, "model_answer", f"{model_name}.jsonl") 131 | if os.path.isfile(answer_file): 132 | os.remove(answer_file) 133 | logger.debug("Removing previous answer file: %s", answer_file) 134 | 135 | first_n = None 136 | first_n_env = os.environ.get("INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS") 137 | if first_n_env: 138 | first_n = int(first_n_env) 139 | logger.debug("INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS=%s", first_n) 140 | 141 | with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: 142 | futures = [] 143 | for i, question in enumerate(questions): 144 | if first_n is not None and i >= first_n: 145 | break 146 | 147 | future = executor.submit( 148 | get_answer, 149 | question, 150 | model_name, 151 | num_choices, 152 | max_tokens, 153 | answer_file, 154 | force_temperature, 155 | openai_client, 156 | ) 157 | futures.append(future) 158 | 159 | for future in tqdm.tqdm( 160 | concurrent.futures.as_completed(futures), total=len(futures) 161 | ): 162 | future.result() 163 | 164 | reorg_answer_file(answer_file) 165 | -------------------------------------------------------------------------------- /src/instructlab/eval/mt_bench_branch_generator.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # Standard 3 | from pathlib import Path 4 | import hashlib 5 | import json 6 | import os 7 | import time 8 | 9 | # Third Party 10 | from tqdm import tqdm 11 | import git 12 | import shortuuid 13 | import yaml 14 | 15 | # Local 16 | from .exceptions import ( 17 | EmptyTaxonomyError, 18 | GitRepoNotFoundError, 19 | InvalidGitBranchError, 20 | InvalidGitRepoError, 21 | ) 22 | from .logger_config import setup_logger 23 | from .mt_bench_common import bench_dir 24 | 25 | logger = setup_logger(__name__) 26 | 27 | 28 | def get_file_paths(directory): 29 | logger.debug(locals()) 30 | file_paths = [] 31 | root_paths = [ 32 | entry 33 | for entry in Path(directory).iterdir() 34 | if entry.is_dir() 35 | if not entry.name.startswith(".") 36 | if entry.name != "knowledge" 37 | if entry.name != "docs" 38 | if entry.name != "scripts" 39 | ] 40 | for basedir in root_paths: 41 | for root, _, files in os.walk(basedir): 42 | file_paths.extend( 43 | [os.path.join(root, file) for file in files if file == "qna.yaml"] 44 | ) 45 | return file_paths 46 | 47 | 48 | def read_qna(fn): 49 | with open(fn, "r", encoding="utf-8") as file: 50 | contents = yaml.safe_load(file) 51 | return contents.get("seed_examples") 52 | 53 | 54 | def generate(judge_model_name, branch, taxonomy_dir, output_dir): 55 | """Create questions and reference answers from taxonomy""" 56 | logger.debug(locals()) 57 | restore_branch = None 58 | try: 59 | if branch is not None: 60 | taxonomy_repo = git.Repo(taxonomy_dir) 61 | restore_branch = taxonomy_repo.active_branch 62 | taxonomy_repo.git.checkout(branch) 63 | 64 | qna_file_list = get_file_paths(taxonomy_dir) 65 | if len(qna_file_list) == 0: 66 | raise EmptyTaxonomyError 67 | 68 | question_lst = [] 69 | reference_answers = [] 70 | for qna_file_path in tqdm(qna_file_list): 71 | examples = read_qna(qna_file_path) 72 | qna_file = qna_file_path[len(taxonomy_dir) + 1 :] 73 | if examples is None: 74 | logger.warning("failed to load %s. 
skipping...", qna_file) 75 | continue 76 | for ex in examples: 77 | q, a = ex.get("question"), ex.get("answer") 78 | if q is None or a is None: 79 | logger.warning("Skipping malformed file %s", qna_file) 80 | continue 81 | 82 | c = ex.get("context") 83 | if c is not None: 84 | t_1 = ( 85 | "Given the context below:\n" 86 | + c 87 | + "\n" 88 | + "Answer the following question: " 89 | + q 90 | ) 91 | else: 92 | t_1 = q 93 | 94 | # Generate a consistent hash to have consistent question_id across qna_files from different runs 95 | str_bytes = bytes(q, "UTF-8") 96 | m = hashlib.md5(str_bytes) 97 | question_id = str(int(m.hexdigest(), base=16)) 98 | question_lst.append( 99 | { 100 | "qna_file": qna_file, 101 | "question_id": question_id, 102 | "category": "taxonomy", 103 | "turns": [t_1], 104 | "reference": [a], 105 | } 106 | ) 107 | 108 | reference_answers.append( 109 | { 110 | "question_id": question_id, 111 | "answer_id": shortuuid.uuid(), 112 | "model_id": judge_model_name, 113 | "choices": [{"index": 0, "turns": [a]}], 114 | "tstamp": time.time(), 115 | } 116 | ) 117 | 118 | logger.debug("Generated %s questions", len(question_lst)) 119 | 120 | output_base_dir = bench_dir(output_dir, "mt_bench_branch", branch) 121 | os.makedirs(output_base_dir, exist_ok=True) 122 | question_file = os.path.join(output_base_dir, "question.jsonl") 123 | logger.debug("Generating question file: %s", question_file) 124 | with open(question_file, "w", encoding="utf-8") as outfile: 125 | for entry in question_lst: 126 | json.dump(entry, outfile) 127 | outfile.write("\n") 128 | 129 | answer_file = os.path.join( 130 | output_base_dir, "reference_answer", f"{judge_model_name}.jsonl" 131 | ) 132 | logger.debug("Generating answer file: %s", answer_file) 133 | os.makedirs(os.path.dirname(answer_file), exist_ok=True) 134 | with open( 135 | answer_file, 136 | "w", 137 | encoding="utf-8", 138 | ) as outfile: 139 | for entry in reference_answers: 140 | json.dump(entry, outfile) 141 | outfile.write("\n") 142 | except git.exc.NoSuchPathError as nspe: 143 | raise GitRepoNotFoundError(taxonomy_dir) from nspe 144 | except git.exc.GitCommandError as gce: 145 | raise InvalidGitBranchError(branch) from gce 146 | except (git.exc.InvalidGitRepositoryError, git.exc.GitError) as ge: 147 | raise InvalidGitRepoError(taxonomy_dir) from ge 148 | finally: 149 | if restore_branch is not None: 150 | taxonomy_repo.git.checkout(restore_branch) 151 | -------------------------------------------------------------------------------- /src/instructlab/eval/mt_bench_conversation.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | """ 3 | Conversation prompt templates. 
4 | """ 5 | 6 | # Standard 7 | from enum import IntEnum, auto 8 | from typing import Dict, List, Tuple, Union 9 | import dataclasses 10 | 11 | 12 | class SeparatorStyle(IntEnum): 13 | """Separator styles.""" 14 | 15 | ADD_COLON_SINGLE = auto() 16 | ADD_COLON_TWO = auto() 17 | ADD_COLON_SPACE_SINGLE = auto() 18 | NO_COLON_SINGLE = auto() 19 | NO_COLON_TWO = auto() 20 | ADD_NEW_LINE_SINGLE = auto() 21 | LLAMA2 = auto() 22 | DEFAULT = auto() 23 | 24 | 25 | @dataclasses.dataclass 26 | class Conversation: 27 | # pylint: disable=too-many-instance-attributes 28 | """A class that manages prompt templates and keeps all conversation history.""" 29 | 30 | # The name of this template 31 | name: str 32 | # The template of the system prompt 33 | system_template: str = "{system_message}" 34 | # The system message 35 | system_message: str = "" 36 | # The names of two roles 37 | roles: Tuple[str, str] = ("USER", "ASSISTANT") 38 | # All messages. Each item is (role, message). 39 | # Each message is either a string or a tuple of (string, List[image_url]). 40 | messages: List[List[str | None]] = dataclasses.field(default_factory=list) 41 | # The number of few shot examples 42 | offset: int = 0 43 | # The separator style and configurations 44 | sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE 45 | sep: str | None = "\n" 46 | sep2: str | None = None 47 | # Stop criteria (the default one is EOS token) 48 | stop_str: Union[str, List[str]] | None = None 49 | # Stops generation if meeting any token in this list 50 | stop_token_ids: List[int] | None = None 51 | 52 | def set_system_message(self, system_message: str): 53 | """Set the system message.""" 54 | self.system_message = system_message 55 | 56 | def get_system_message(self): 57 | """return the system message.""" 58 | return self.system_message 59 | 60 | def append_message(self, role: str, message: str | None): 61 | """Append a new message.""" 62 | self.messages.append([role, message]) 63 | 64 | def update_last_message(self, message: str): 65 | """Update the last output. 66 | 67 | The last message is typically set to be None when constructing the prompt, 68 | so we need to update it in-place after getting the response from a model. 
69 | """ 70 | self.messages[-1][1] = message 71 | 72 | def to_openai_api_messages(self): 73 | """Convert the conversation to OpenAI chat completion format.""" 74 | if self.system_message == "": 75 | ret = [] 76 | else: 77 | ret = [{"role": "system", "content": self.system_message}] 78 | 79 | for i, (_, msg) in enumerate(self.messages[self.offset :]): 80 | if i % 2 == 0: 81 | ret.append({"role": "user", "content": msg}) 82 | else: 83 | if msg is not None: 84 | ret.append({"role": "assistant", "content": msg}) 85 | return ret 86 | 87 | def copy(self): 88 | return Conversation( 89 | name=self.name, 90 | system_template=self.system_template, 91 | system_message=self.system_message, 92 | roles=self.roles, 93 | messages=[[x, y] for x, y in self.messages], 94 | offset=self.offset, 95 | sep_style=self.sep_style, 96 | sep=self.sep, 97 | sep2=self.sep2, 98 | stop_str=self.stop_str, 99 | stop_token_ids=self.stop_token_ids, 100 | ) 101 | 102 | def dict(self): 103 | return { 104 | "template_name": self.name, 105 | "system_message": self.system_message, 106 | "roles": self.roles, 107 | "messages": self.extract_text_from_messages(), 108 | "offset": self.offset, 109 | } 110 | 111 | 112 | # A global registry for all conversation templates 113 | conv_templates: Dict[str, Conversation] = {} 114 | 115 | 116 | def register_conv_template(template: Conversation, override: bool = False): 117 | """Register a new conversation template.""" 118 | if not override: 119 | assert template.name not in conv_templates, ( 120 | f"{template.name} has been registered." 121 | ) 122 | 123 | conv_templates[template.name] = template 124 | 125 | 126 | def get_conv_template(name: str) -> Conversation: 127 | """Get a conversation template.""" 128 | return conv_templates[name].copy() 129 | 130 | 131 | # An empty template for raw conversation. 132 | register_conv_template( 133 | Conversation( 134 | name="raw", 135 | system_message="", 136 | roles=("", ""), 137 | sep_style=SeparatorStyle.NO_COLON_SINGLE, 138 | sep="", 139 | ) 140 | ) 141 | 142 | 143 | # api-based default template 144 | register_conv_template( 145 | Conversation( 146 | name="api_based_default", 147 | system_message="", 148 | roles=("user", "assistant"), 149 | sep_style=SeparatorStyle.DEFAULT, 150 | sep=None, 151 | ) 152 | ) 153 | 154 | 155 | # ChatGPT default template 156 | register_conv_template( 157 | Conversation( 158 | name="chatgpt", 159 | system_message="You are a helpful assistant.", 160 | roles=("user", "assistant"), 161 | sep_style=SeparatorStyle.DEFAULT, 162 | sep=None, 163 | ) 164 | ) 165 | 166 | # Mistral template 167 | # source: https://docs.mistral.ai/llm/mistral-instruct-v0.1#chat-template 168 | register_conv_template( 169 | Conversation( 170 | name="mistral", 171 | system_template="[INST] {system_message}\n", 172 | roles=("[INST]", "[/INST]"), 173 | sep_style=SeparatorStyle.LLAMA2, 174 | sep=" ", 175 | sep2="", 176 | ) 177 | ) 178 | 179 | register_conv_template( 180 | Conversation( 181 | name="labrador-chat", 182 | system_template="<|system|>\n{system_message}", 183 | system_message="""You are Labrador, an AI language model developed by IBM DMF (Data Model Factory) Alignment Team. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior. You always respond to greetings (for example, hi, hello, g'day, morning, afternoon, evening, night, what's up, nice to meet you, sup, etc) with "Hello! I am Labrador, created by the IBM DMF Alignment Team. 
How can I help you today?". Please do not say anything else and do not start a conversation.""", 184 | roles=("<|user|>", "<|assistant|>"), 185 | sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE, 186 | sep="\n", 187 | stop_str="<|endoftext|>", 188 | ) 189 | ) 190 | 191 | register_conv_template( 192 | Conversation( 193 | name="ibm-generic", 194 | system_template="<|system|>\n{system_message}", 195 | system_message="""You are an AI language model developed by IBM Research. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior.""", 196 | roles=("<|user|>", "<|assistant|>"), 197 | sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE, 198 | sep="\n", 199 | stop_str="<|endoftext|>", 200 | ) 201 | ) 202 | 203 | register_conv_template( 204 | Conversation( 205 | name="granite-chat", 206 | system_template="<|system|>\n{system_message}", 207 | system_message="""You are Granite Chat, an AI language model developed by IBM. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior.""", 208 | roles=("<|user|>", "<|assistant|>"), 209 | sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE, 210 | sep="\n", 211 | stop_str="<|endoftext|>", 212 | ) 213 | ) 214 | -------------------------------------------------------------------------------- /src/instructlab/eval/mt_bench_judgment.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # Standard 3 | from concurrent.futures import ThreadPoolExecutor 4 | import os 5 | 6 | # Third Party 7 | from tqdm import tqdm 8 | import numpy as np 9 | import pandas as pd 10 | 11 | # First Party 12 | from instructlab.eval import exceptions 13 | 14 | # Local 15 | from .logger_config import setup_logger 16 | from .mt_bench_common import ( 17 | NEED_REF_CATS, 18 | Judge, 19 | MatchSingle, 20 | bench_dir, 21 | check_data, 22 | get_model_list, 23 | get_openai_client, 24 | load_judge_prompts, 25 | load_model_answers, 26 | load_questions, 27 | play_a_match_single, 28 | ) 29 | 30 | logger = setup_logger(__name__) 31 | 32 | 33 | def make_match_single( 34 | questions, 35 | models, 36 | model_answers, 37 | judge, 38 | ref_answers=None, 39 | multi_turn=False, 40 | ): 41 | """Setup a match""" 42 | matches = [] 43 | for q in questions: 44 | if multi_turn and len(q["turns"]) != 2: 45 | continue 46 | q_id = q["question_id"] 47 | for m in models: 48 | a = model_answers[m][q_id] 49 | if ref_answers is not None: 50 | ref = ref_answers[judge.model_name][q_id] 51 | matches.append( 52 | MatchSingle( 53 | dict(q), m, a, judge, ref_answer=ref, multi_turn=multi_turn 54 | ) 55 | ) 56 | else: 57 | matches.append(MatchSingle(dict(q), m, a, judge, multi_turn=multi_turn)) 58 | return matches 59 | 60 | 61 | def make_judge_single(judge_model_name, judge_prompts) -> dict: 62 | """Setup the judge""" 63 | judges = {} 64 | judges["default"] = Judge(judge_model_name, judge_prompts["single-v1"]) 65 | judges["math"] = Judge( 66 | judge_model_name, judge_prompts["single-math-v1"], ref_based=True 67 | ) 68 | judges["default-mt"] = Judge( 69 | judge_model_name, judge_prompts["single-v1-multi-turn"], multi_turn=True 70 | ) 71 | judges["math-mt"] = Judge( 72 | judge_model_name, 73 | judge_prompts["single-math-v1-multi-turn"], 74 | ref_based=True, 75 | multi_turn=True, 76 | ) 77 | return judges 78 | 79 | 80 | def make_judgment( 81 | question_file, 82 | 
judgment_file, 83 | answer_file, 84 | bench_name="mt_bench", 85 | ): 86 | """Create judgment output""" 87 | logger.debug(locals()) 88 | judgment_df_all = pd.read_json( 89 | judgment_file, lines=True, dtype={"question_id": str} 90 | ) 91 | judgment_df = judgment_df_all[["model", "score", "turn"]] 92 | judgments_len = len(judgment_df) 93 | judgment_df = judgment_df[judgment_df["score"] != -1] 94 | error_free_judgments_len = len(judgment_df) 95 | error_rate = (judgments_len - error_free_judgments_len) / judgments_len 96 | logger.debug("#judgments: %s", judgments_len) 97 | logger.debug("#error free judgments: %s", error_free_judgments_len) 98 | logger.debug("error rate: %s", error_rate) 99 | 100 | turn_scores = [] 101 | # First turn 102 | df_1 = judgment_df[judgment_df["turn"] == 1].groupby(["model", "turn"]).mean() 103 | if len(df_1.index) > 0: 104 | overall_score = df_1["score"].iloc[0] 105 | turn_scores.append(overall_score) 106 | else: 107 | raise exceptions.InvalidEvaluationResult( 108 | "Evaluation provided no result. See logs for more details." 109 | ) 110 | 111 | if bench_name == "mt_bench": 112 | # Second turn 113 | df_2 = judgment_df[judgment_df["turn"] == 2].groupby(["model", "turn"]).mean() 114 | if len(df_2.index) > 0: 115 | turn2_score = df_2["score"].iloc[0] 116 | turn_scores.append(turn2_score) 117 | 118 | # Average 119 | df_3 = judgment_df[["model", "score"]].groupby(["model"]).mean() 120 | overall_score = df_3["score"].iloc[0] 121 | else: 122 | turn_scores.append("N/A") 123 | 124 | question_df = pd.read_json(question_file, lines=True, dtype={"question_id": str}) 125 | 126 | answer_df = pd.read_json(answer_file, lines=True, dtype={"question_id": str}) 127 | 128 | # Join to get questions with answers 129 | join_columns = ["question_id", "choices", "turns", "category"] 130 | if bench_name == "mt_bench_branch": 131 | join_columns.append("qna_file") 132 | 133 | joined_df = question_df.join( 134 | answer_df.set_index("question_id"), on="question_id", rsuffix="_answer" 135 | )[join_columns] 136 | # Join to get scores 137 | join_columns.append("score") 138 | joined_df = judgment_df_all.join( 139 | joined_df.set_index("question_id"), on="question_id", lsuffix="_judgment" 140 | )[join_columns] 141 | joined_df = joined_df[joined_df["score"] != -1] 142 | 143 | qa_pairs = [] 144 | for _, row in joined_df.iterrows(): 145 | qa_pair = { 146 | "question_id": row["question_id"], 147 | "score": row["score"], 148 | "category": row["category"], 149 | "question": row["turns"], 150 | "answer": row["choices"], 151 | } 152 | if bench_name == "mt_bench_branch": 153 | qa_pair["qna_file"] = row["qna_file"] 154 | qa_pairs.append(qa_pair) 155 | return overall_score, qa_pairs, turn_scores, error_rate 156 | 157 | 158 | def judge_model( 159 | model_name, 160 | judge_model_name, 161 | openai_client, 162 | branch=None, 163 | bench_name="mt_bench", 164 | output_dir="eval_output", 165 | data_dir=None, 166 | max_workers=1, 167 | first_n=None, 168 | merge_system_user_message=False, 169 | ): 170 | """Judge the model based on questions and reference answers""" 171 | logger.debug(locals()) 172 | package_data_dir = os.path.join(os.path.dirname(__file__), "data") 173 | use_builtin_ref_answers = False 174 | if data_dir is None: 175 | use_builtin_ref_answers = True 176 | data_dir = package_data_dir 177 | 178 | data_base_dir = bench_dir(data_dir, bench_name, branch) 179 | output_base_dir = bench_dir(output_dir, bench_name, branch) 180 | 181 | judge_file = os.path.join(package_data_dir, bench_name, "judge_prompts.jsonl") 
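    # Hypothetical layout of the files resolved below, assuming
    # bench_name="mt_bench", branch=None, and the built-in package data dir
    # (paths are shown only to illustrate how the pieces fit together):
    #
    #   <data_dir>/mt_bench/question.jsonl                          questions
    #   <data_dir>/mt_bench/reference_answer/gpt-4.jsonl            built-in reference answers
    #   <output_dir>/mt_bench/model_answer/<model_name>.jsonl       answers being judged
    #   <output_dir>/mt_bench/model_judgment/<judge>_single.jsonl   judgments written by this run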
182 | 183 | question_file = os.path.join(data_base_dir, "question.jsonl") 184 | answer_file = os.path.join(output_base_dir, "model_answer", f"{model_name}.jsonl") 185 | if use_builtin_ref_answers: 186 | ref_answer_file = os.path.join(data_base_dir, "reference_answer", "gpt-4.jsonl") 187 | else: 188 | ref_answer_file = os.path.join( 189 | data_base_dir, "reference_answer", f"{judge_model_name}.jsonl" 190 | ) 191 | 192 | # Load questions 193 | questions = load_questions(question_file, None, None) 194 | 195 | # Load answers 196 | model_answers = load_model_answers(answer_file) 197 | ref_answers = load_model_answers(ref_answer_file, judge_model_name) 198 | 199 | # Load judge 200 | judge_prompts = load_judge_prompts(judge_file) 201 | 202 | if first_n: 203 | questions = questions[:first_n] 204 | 205 | models = get_model_list(answer_file) 206 | 207 | judges = make_judge_single(judge_model_name, judge_prompts) 208 | output_file = os.path.join( 209 | output_base_dir, "model_judgment", f"{judge_model_name}_single.jsonl" 210 | ) 211 | if os.path.isfile(output_file): 212 | os.remove(output_file) 213 | logger.debug("Removing previous judgment file: %s", output_file) 214 | 215 | check_data(questions, model_answers, ref_answers, models, judges) 216 | 217 | question_math = [q for q in questions if q["category"] in NEED_REF_CATS] 218 | question_default = [q for q in questions if q["category"] not in NEED_REF_CATS] 219 | 220 | # Make matches 221 | matches = [] 222 | matches += make_match_single( 223 | question_default, models, model_answers, judges["default"] 224 | ) 225 | matches += make_match_single( 226 | question_math, 227 | models, 228 | model_answers, 229 | judges["math"], 230 | ref_answers, 231 | ) 232 | matches += make_match_single( 233 | question_default, 234 | models, 235 | model_answers, 236 | judges["default-mt"], 237 | multi_turn=True, 238 | ) 239 | matches += make_match_single( 240 | question_math, 241 | models, 242 | model_answers, 243 | judges["math-mt"], 244 | ref_answers, 245 | multi_turn=True, 246 | ) 247 | 248 | logger.debug("bench_name=%s", bench_name) 249 | logger.debug("judge=%s", judge_model_name) 250 | logger.debug("model_list=%s", models) 251 | logger.debug("total_num_questions=%s", len(questions)) 252 | logger.debug("total_num_matches=%s", len(matches)) 253 | 254 | # Play matches 255 | if max_workers == 1: 256 | for match in tqdm(matches): 257 | play_a_match_single( 258 | openai_client, 259 | match, 260 | output_file=output_file, 261 | merge_system_user_message=merge_system_user_message, 262 | ) 263 | else: 264 | 265 | def play_a_match_wrapper(match): 266 | play_a_match_single( 267 | openai_client, 268 | match, 269 | output_file=output_file, 270 | merge_system_user_message=merge_system_user_message, 271 | ) 272 | 273 | np.random.seed(0) 274 | np.random.shuffle(matches) 275 | 276 | with ThreadPoolExecutor(max_workers) as executor: 277 | for match in tqdm( 278 | executor.map(play_a_match_wrapper, matches), total=len(matches) 279 | ): 280 | pass 281 | 282 | return question_file, output_file, answer_file 283 | 284 | 285 | def generate_judgment( 286 | model_name, 287 | judge_model_name, 288 | model_api_base, 289 | api_key=None, 290 | bench_name="mt_bench", 291 | output_dir="eval_output", 292 | data_dir=None, 293 | branch=None, 294 | max_workers=1, 295 | first_n=None, 296 | merge_system_user_message=False, 297 | http_client=None, 298 | ): 299 | """Generate judgment with scores and qa_pairs for a model""" 300 | logger.debug(locals()) 301 | 302 | openai_client = 
get_openai_client(model_api_base, api_key, http_client) 303 | 304 | first_n_env = os.environ.get("INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS") 305 | if first_n_env is not None and first_n is None: 306 | first_n = int(first_n_env) 307 | logger.debug("INSTRUCTLAB_EVAL_FIRST_N_QUESTIONS=%s", first_n) 308 | 309 | question_file, judgment_file, answer_file = judge_model( 310 | model_name, 311 | judge_model_name, 312 | openai_client, 313 | bench_name=bench_name, 314 | output_dir=output_dir, 315 | data_dir=data_dir, 316 | branch=branch, 317 | max_workers=max_workers, 318 | first_n=first_n, 319 | merge_system_user_message=merge_system_user_message, 320 | ) 321 | 322 | return make_judgment( 323 | question_file, 324 | judgment_file, 325 | answer_file, 326 | bench_name=bench_name, 327 | ) 328 | -------------------------------------------------------------------------------- /src/instructlab/eval/mt_bench_model_adapter.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | """Model adapter registration.""" 3 | 4 | # Standard 5 | from functools import cache 6 | from typing import List 7 | import abc 8 | import os 9 | 10 | # Local 11 | from .logger_config import setup_logger 12 | from .mt_bench_conversation import Conversation, get_conv_template 13 | 14 | OPENAI_MODEL_LIST = ("gpt-4",) 15 | 16 | logger = setup_logger(__name__) 17 | 18 | 19 | class BaseModelAdapter: 20 | """The base and the default model adapter.""" 21 | 22 | @abc.abstractmethod 23 | def match(self, model_path: str) -> bool: 24 | pass 25 | 26 | @abc.abstractmethod 27 | def get_default_conv_template(self, model_path: str) -> Conversation: 28 | pass 29 | 30 | 31 | # A global registry for all model adapters 32 | model_adapters: List[BaseModelAdapter] = [] 33 | 34 | 35 | def register_model_adapter(cls): 36 | """Register a model adapter.""" 37 | model_adapters.append(cls()) 38 | 39 | 40 | @cache 41 | def get_model_adapter(model_path: str, default_adapter_name: str) -> BaseModelAdapter: 42 | """Get a model adapter for a model_path.""" 43 | model_path_basename = os.path.basename(os.path.normpath(model_path)) 44 | 45 | default_adapter = None 46 | 47 | # Try the basename of model_path at first 48 | for adapter in model_adapters: 49 | if adapter.match(model_path_basename): 50 | return adapter 51 | if adapter.match(default_adapter_name) and default_adapter is None: 52 | default_adapter = adapter 53 | 54 | # Then try the full path 55 | for adapter in model_adapters: 56 | if adapter.match(model_path): 57 | return adapter 58 | 59 | if default_adapter is not None: 60 | logger.warning( 61 | "No valid model adapter for %s, defaulting to %s adapter", 62 | model_path, 63 | default_adapter_name, 64 | ) 65 | return default_adapter 66 | raise ValueError(f"No valid model adapter for {model_path}") 67 | 68 | 69 | def get_conversation_template( 70 | model_path: str, default_adapter_name: str 71 | ) -> Conversation: 72 | """Get the default conversation template.""" 73 | adapter = get_model_adapter(model_path, default_adapter_name) 74 | return adapter.get_default_conv_template(model_path) 75 | 76 | 77 | class ChatGPTAdapter(BaseModelAdapter): 78 | """The model adapter for ChatGPT""" 79 | 80 | def match(self, model_path: str): 81 | return model_path in OPENAI_MODEL_LIST 82 | 83 | def get_default_conv_template(self, model_path: str) -> Conversation: 84 | if "browsing" in model_path: 85 | return get_conv_template("api_based_default") 86 | return get_conv_template("chatgpt") 87 | 88 | 89 | class 
MistralAdapter(BaseModelAdapter): 90 | """The model adapter for Mistral AI models""" 91 | 92 | def match(self, model_path: str): 93 | model_path = model_path.lower() 94 | return ( 95 | "mistral" in model_path 96 | or "mixtral" in model_path 97 | or "prometheus" in model_path 98 | ) 99 | 100 | def get_default_conv_template(self, model_path: str) -> Conversation: 101 | return get_conv_template("mistral") 102 | 103 | 104 | class LabradoriteAdapter(BaseModelAdapter): 105 | """The model adapter for ibm/labradorite-13b""" 106 | 107 | def match(self, model_path: str): 108 | return "labradorite" in model_path.lower() 109 | 110 | def get_default_conv_template(self, model_path: str) -> Conversation: 111 | return get_conv_template("labrador-chat") 112 | 113 | 114 | class MerliniteAdapter(BaseModelAdapter): 115 | """The model adapter for ibm/merlinite-7b and instructlab/merlinite-7b-lab""" 116 | 117 | def match(self, model_path: str): 118 | return "merlinite" in model_path.lower() 119 | 120 | def get_default_conv_template(self, model_path: str) -> Conversation: 121 | return get_conv_template("ibm-generic") 122 | 123 | 124 | class GraniteAdapter(BaseModelAdapter): 125 | """The model adapter for instructlab/granite-7b-lab""" 126 | 127 | def match(self, model_path: str): 128 | model_path = model_path.lower() 129 | return ( 130 | "granite" in model_path 131 | and "granite-old" not in model_path 132 | and "granite-chat" not in model_path 133 | and "granite-code" not in model_path 134 | ) 135 | 136 | def get_default_conv_template(self, model_path: str) -> Conversation: 137 | return get_conv_template("ibm-generic") 138 | 139 | 140 | class LabradorAdapter(BaseModelAdapter): 141 | """The model adapter for ibm/labradorite-13b""" 142 | 143 | def match(self, model_path: str): 144 | model_path = model_path.lower() 145 | return ("granite-chat" in model_path) or ( 146 | "labrador" in model_path and "labradorite" not in model_path 147 | ) 148 | 149 | def get_default_conv_template(self, model_path: str) -> Conversation: 150 | return get_conv_template("granite-chat") 151 | 152 | 153 | # Note: the registration order matters. 154 | # The one registered earlier has a higher matching priority. 155 | register_model_adapter(MistralAdapter) 156 | register_model_adapter(LabradoriteAdapter) 157 | register_model_adapter(MerliniteAdapter) 158 | register_model_adapter(GraniteAdapter) 159 | register_model_adapter(LabradorAdapter) 160 | register_model_adapter(ChatGPTAdapter) 161 | -------------------------------------------------------------------------------- /src/instructlab/eval/ruler.py: -------------------------------------------------------------------------------- 1 | # Standard 2 | from typing import Any, Dict, List, Optional 3 | import json 4 | import os 5 | import pathlib 6 | 7 | # Third Party 8 | from lm_eval.evaluator import simple_evaluate 9 | 10 | # First Party 11 | from instructlab.eval.evaluator import Evaluator 12 | 13 | RULER_TASKS = [ 14 | "niah_single_1", 15 | "niah_single_2", 16 | "niah_single_3", 17 | "niah_multikey_1", 18 | "niah_multikey_2", 19 | "niah_multikey_3", 20 | "niah_multiquery", 21 | "niah_multivalue", 22 | "ruler_vt", 23 | "ruler_cwe", 24 | "ruler_fwe", 25 | "ruler_qa_hotpot", 26 | "ruler_qa_squad", 27 | ] 28 | 29 | DEFAULT_MAX_LENGTH = 4096 30 | 31 | 32 | class RulerEvaluator(Evaluator): 33 | """ 34 | Class definition for running RULER benchmarking tasks. 
35 | """ 36 | 37 | name = "ruler" 38 | 39 | def __init__( 40 | self, 41 | model_path: Optional[str] = None, 42 | output_file: Optional[str] = None, 43 | tasks: list[str] = RULER_TASKS, 44 | api_endpoint: Optional[str] = None, 45 | max_length: Optional[int] = None, 46 | ) -> None: 47 | self.model_path = model_path 48 | self.tasks = tasks 49 | self.results: Dict[Any, Any] = {} 50 | self.output_file = output_file 51 | 52 | self.api_endpoint = api_endpoint or None 53 | self.max_length = max_length or 4096 54 | 55 | def save_to_file(self, output_file: Optional[str] = None) -> None: 56 | """Save results to a JSON file""" 57 | output_file = output_file if output_file else self.output_file 58 | if not output_file: 59 | raise ValueError("Output file path cannot be empty") 60 | 61 | os.makedirs(os.path.dirname(output_file), exist_ok=True) 62 | with open(output_file, "w", encoding="utf-8") as f: 63 | json.dump(self.results, f, indent=2) 64 | 65 | def process_lm_eval_results( 66 | self, 67 | fpath: Optional[pathlib.Path] = None, 68 | raw_results: Optional[dict[str, Any]] = None, 69 | ) -> dict[str, float]: 70 | """ 71 | Process the evaluation results from lm_eval for the given file path and extract 72 | aggregarted scores for each context length 73 | Args: 74 | fpath (pathlib.Path): The file path to the evaluation results. 75 | 76 | """ 77 | unqiue_metrics_dict: dict[str, Any] = {} 78 | 79 | # This is required because the lm_eval results are nested under 'ruler' if 80 | # that is the supplied task to it. The output contains a nested dictionary 81 | # in this case, using RULER tasks as the key. Each context length is a further subkey 82 | # in the dictionary. There is an additional key per context length which also 83 | # contains score adjusted for stderr, which we are ignoring here. 84 | def extract_metrics(results: dict, unqiue_metrics_dict: dict = {}): 85 | for k, v in results.items(): 86 | if isinstance(v, dict): 87 | extract_metrics(v, unqiue_metrics_dict) 88 | else: 89 | if "stderr" not in k: 90 | metric = k.split(",")[0] 91 | if metric not in unqiue_metrics_dict: 92 | unqiue_metrics_dict[metric] = [] 93 | unqiue_metrics_dict[metric].append(v) 94 | 95 | return unqiue_metrics_dict 96 | 97 | if fpath: 98 | with open(fpath, "r", encoding="utf-8") as f: 99 | raw_results = json.load(f) 100 | 101 | if raw_results is not None: 102 | extract_metrics(raw_results["results"], unqiue_metrics_dict) 103 | unique_float_metrics = {} 104 | # if value is list of floats, average the list 105 | for k, v in unqiue_metrics_dict.items(): 106 | if isinstance(v, list) and all(isinstance(i, float) for i in v): 107 | unique_float_metrics[k] = sum(v) / len(v) 108 | 109 | # find average of all float values in dict 110 | float_values = [ 111 | v for v in unique_float_metrics.values() if isinstance(v, float) 112 | ] 113 | if float_values: 114 | unique_float_metrics["avg"] = sum(float_values) / len(float_values) 115 | else: 116 | unique_float_metrics["avg"] = 0.0 117 | 118 | # result format 119 | # {'8192': 0.90, '32768': 0.82, '65536': 0.77, '131072': 0.71, 'avg': 0.80} 120 | return unique_float_metrics 121 | 122 | def run( 123 | self, 124 | model_path: Optional[str] = None, 125 | tasks: Optional[List[str]] = None, 126 | output_file: Optional[str] = None, 127 | api_endpoint: Optional[str] = None, 128 | max_length: Optional[int] = DEFAULT_MAX_LENGTH, 129 | ) -> None: 130 | """ 131 | Run the RULER evaluation using the specified model and tasks. 
132 | """ 133 | 134 | model_path = self.model_path if model_path is None else model_path 135 | tasks = self.tasks if not tasks else tasks 136 | output_file = self.output_file if not output_file else output_file 137 | 138 | # validate above params are not none and output file can be written to 139 | if not model_path: 140 | raise ValueError("Model path cannot be empty") 141 | if not output_file: 142 | raise ValueError("Output file path cannot be empty") 143 | if not api_endpoint: 144 | raise ValueError("API endpoint cannot be empty") 145 | 146 | # Prepare model_args 147 | model_args = { 148 | "pretrained": model_path, 149 | "base_url": api_endpoint, 150 | "max_length": max_length, 151 | } 152 | 153 | self.lm_eval_results = simple_evaluate( 154 | model="local-completions", 155 | model_args=model_args, 156 | tasks=tasks, 157 | ) 158 | 159 | self.result = self.process_lm_eval_results( 160 | raw_results=self.lm_eval_results, 161 | ) 162 | 163 | # write results to file 164 | if output_file: 165 | try: 166 | with open(output_file, "w", encoding="utf-8") as f: 167 | json.dump(self.result, f, indent=2) 168 | except (OSError, IOError) as e: 169 | raise ValueError(f"Failed to write to output file: {e}") from e 170 | -------------------------------------------------------------------------------- /tests/test_mmlu.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | # Standard 4 | from unittest import mock 5 | from unittest.mock import patch 6 | import os 7 | 8 | # First Party 9 | from instructlab.eval.mmlu import MMLUBranchEvaluator, MMLUEvaluator 10 | 11 | MMLU_EXAMPLE_OUTPUT = { 12 | "results": { 13 | "mmlu_astronomy": { 14 | "alias": "astronomy", 15 | "acc,none": 0.5592105263157895, 16 | "acc_stderr,none": 0.04040311062490436, 17 | }, 18 | "mmlu_anatomy": { 19 | "alias": "anatomy", 20 | "acc,none": 0.4444444444444444, 21 | "acc_stderr,none": 0.04292596718256981, 22 | }, 23 | "mmlu_abstract_algebra": { 24 | "alias": "abstract_algebra", 25 | "acc,none": 0.35, 26 | "acc_stderr,none": 0.047937248544110196, 27 | }, 28 | }, 29 | } 30 | 31 | MODEL_EXAMPLE = "instructlab/granite-7b-lab" 32 | 33 | 34 | def assert_example_mmlu_individual_scores(overall_score, individual_scores): 35 | assert round(overall_score, 2) == 0.45 36 | assert individual_scores == { 37 | "mmlu_abstract_algebra": {"score": 0.35, "stderr": 0.047937248544110196}, 38 | "mmlu_anatomy": {"score": 0.4444444444444444, "stderr": 0.04292596718256981}, 39 | "mmlu_astronomy": {"score": 0.5592105263157895, "stderr": 0.04040311062490436}, 40 | } 41 | 42 | 43 | @patch( 44 | "instructlab.eval.mmlu.AbstractMMLUEvaluator._simple_evaluate_with_error_handling", 45 | return_value=MMLU_EXAMPLE_OUTPUT, 46 | ) 47 | def test_mmlu_branch(eval_mock): 48 | tasks_dir = f"{os.path.dirname(os.path.realpath(__file__))}/testdata/sdg" 49 | tasks = ["mmlu_pr"] 50 | mmlu = MMLUBranchEvaluator( 51 | model_path=MODEL_EXAMPLE, 52 | tasks_dir=tasks_dir, 53 | tasks=tasks, 54 | system_prompt="You are an intelligent AI language model.", 55 | ) 56 | overall_score, individual_scores = mmlu.run() 57 | 58 | assert_example_mmlu_individual_scores(overall_score, individual_scores) 59 | eval_mock.assert_called() 60 | 61 | 62 | @patch( 63 | "instructlab.eval.mmlu.AbstractMMLUEvaluator._simple_evaluate_with_error_handling", 64 | return_value=MMLU_EXAMPLE_OUTPUT, 65 | ) 66 | def test_mmlu(eval_mock): 67 | tasks = ["mmlu_anatomy", "mmlu_astronomy", "mmlu_algebra"] 68 | mmlu = MMLUEvaluator( 69 | 
model_path=MODEL_EXAMPLE, 70 | tasks=tasks, 71 | system_prompt="You are an intelligent AI language model.", 72 | ) 73 | overall_score, individual_scores = mmlu.run() 74 | 75 | eval_mock.assert_called() 76 | assert_example_mmlu_individual_scores(overall_score, individual_scores) 77 | -------------------------------------------------------------------------------- /tests/test_mt_bench.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | # Standard 4 | from unittest import mock 5 | from unittest.mock import patch 6 | 7 | # First Party 8 | from instructlab.eval.mt_bench import MTBenchBranchEvaluator, MTBenchEvaluator 9 | 10 | 11 | def gen_qa_pairs(odd): 12 | i = 1 13 | qa_pairs = [] 14 | score = 0 15 | while i < 5: 16 | if i % 2: 17 | if odd: 18 | score = 0.2 19 | else: 20 | score = 0.1 21 | elif not i % 2: 22 | if odd: 23 | score = 0.3 24 | else: 25 | score = 0.4 26 | qa_pairs.append( 27 | { 28 | "question_id": i, 29 | "score": score, 30 | "qna_file": f"category{i}/qna.yaml", 31 | } 32 | ) 33 | i = i + 1 34 | qa_pairs.append( 35 | { 36 | "question_id": i, 37 | "score": 0.5, 38 | "qna_file": f"category{i}/qna.yaml", 39 | } 40 | ) 41 | if odd: 42 | qa_pairs.append( 43 | { 44 | "question_id": i + 1, 45 | "score": 0.6, 46 | "qna_file": f"category{i + 1}/qna.yaml", 47 | } 48 | ) 49 | return qa_pairs 50 | 51 | 52 | @patch("instructlab.eval.mt_bench_branch_generator.generate") 53 | @patch("instructlab.eval.mt_bench_answers.generate_answers") 54 | @patch( 55 | "instructlab.eval.mt_bench_judgment.generate_judgment", 56 | return_value=(0, gen_qa_pairs(True), None, 0), 57 | ) 58 | def test_mt_bench_branch(gen_judgment_mock, gen_answers_mock, generate_mock): 59 | mt_bench_branch = MTBenchBranchEvaluator( 60 | "instructlab/granite-7b-lab", 61 | "prometheus-eval/prometheus-8x7b-v2.0", 62 | "../taxonomy", 63 | "main", 64 | ) 65 | mt_bench_branch.gen_answers( 66 | "http://localhost:8000/v1", 67 | ) 68 | overall_score, qa_pairs, error_rate = mt_bench_branch.judge_answers( 69 | "http://localhost:8000/v1", 70 | ) 71 | assert overall_score == 0 72 | assert qa_pairs == gen_qa_pairs(True) 73 | assert error_rate == 0 74 | 75 | gen_judgment_mock.assert_called() 76 | gen_answers_mock.assert_called() 77 | generate_mock.assert_called() 78 | 79 | 80 | @patch("instructlab.eval.mt_bench_answers.generate_answers") 81 | @patch( 82 | "instructlab.eval.mt_bench_judgment.generate_judgment", 83 | return_value=(1.5001, [{}, {}], [1.002, 2], 0), 84 | ) 85 | def test_mt_bench(gen_judgment_mock, gen_answers_mock): 86 | mt_bench = MTBenchEvaluator( 87 | "instructlab/granite-7b-lab", 88 | "prometheus-eval/prometheus-8x7b-v2.0", 89 | ) 90 | mt_bench.gen_answers( 91 | "http://localhost:8000/v1", 92 | ) 93 | overall_score, qa_pairs, turn_scores, error_rate = mt_bench.judge_answers( 94 | "http://localhost:8000/v1", 95 | ) 96 | 97 | assert overall_score == 1.5001 98 | assert qa_pairs == [{}, {}] 99 | assert turn_scores == [1.002, 2] 100 | assert error_rate == 0 101 | 102 | gen_judgment_mock.assert_called() 103 | gen_answers_mock.assert_called() 104 | -------------------------------------------------------------------------------- /tests/test_mt_bench_answers.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | # Standard 4 | import json 5 | import os 6 | import random 7 | import shutil 8 | import tempfile 9 | 10 | # First Party 11 | from instructlab.eval.mt_bench_answers import 
reorg_answer_file 12 | 13 | 14 | def test_reorg_answer_file(): 15 | answer_file = os.path.join( 16 | os.path.dirname(__file__), 17 | "..", 18 | "src", 19 | "instructlab", 20 | "eval", 21 | "data", 22 | "mt_bench", 23 | "reference_answer", 24 | "gpt-4.jsonl", 25 | ) 26 | 27 | # Create a temporary file 28 | with tempfile.NamedTemporaryFile(delete=True) as temp_file: 29 | temp_answer_file = temp_file.name 30 | 31 | # Copy the original file to the temp file 32 | shutil.copy(answer_file, temp_answer_file) 33 | 34 | orig_length = 0 35 | with open(temp_answer_file, "r+", encoding="utf-8") as f: 36 | answers = {} 37 | for l in f: 38 | orig_length += 1 39 | qid = json.loads(l)["question_id"] 40 | answers[qid] = l 41 | 42 | # Reset to the beginning of the file and clear it 43 | f.seek(0) 44 | f.truncate() 45 | 46 | # Randomize the values 47 | qids = sorted(list(answers.keys()), key=lambda answer: random.random()) 48 | for qid in qids: 49 | f.write(answers[qid]) 50 | # Write each answer twice 51 | f.write(answers[qid]) 52 | 53 | # Run the reorg which should sort and dedup the file in place 54 | reorg_answer_file(temp_answer_file) 55 | 56 | new_length = 0 57 | with open(temp_answer_file, "r", encoding="utf-8") as fin: 58 | previous_question_id = -1 59 | for l in fin: 60 | new_length += 1 61 | qid = json.loads(l)["question_id"] 62 | assert qid > previous_question_id 63 | previous_question_id = qid 64 | 65 | assert new_length == orig_length 66 | -------------------------------------------------------------------------------- /tests/test_mt_bench_common.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | # Standard 4 | from unittest import mock 5 | 6 | # First Party 7 | from instructlab.eval.mt_bench_common import Judge, check_data 8 | 9 | CHECK_DATA_EXAMPLE_QUESTIONS = [ 10 | { 11 | "question_id": 81, 12 | "category": "writing", 13 | "turns": [ 14 | "Fake question", 15 | "Fake question", 16 | ], 17 | }, 18 | { 19 | "question_id": 101, 20 | "category": "reasoning", 21 | "turns": [ 22 | "Fake question", 23 | "Fake question", 24 | ], 25 | }, 26 | ] 27 | CHECK_DATA_EXAMPLE_MODEL_ANSWERS = { 28 | "granite-7b-lab": { 29 | 81: { 30 | "question_id": 81, 31 | "answer_id": "c4j9vPyHM8w3JHPGohrJQG", 32 | "model_id": "granite-7b-lab", 33 | "choices": [ 34 | { 35 | "index": 0, 36 | "turns": [ 37 | "Fake answer", 38 | "Fake answer", 39 | ], 40 | } 41 | ], 42 | "tstamp": 1730816201.883507, 43 | }, 44 | 101: { 45 | "question_id": 101, 46 | "answer_id": "kaQw7Fj2SDeE2VfvU25FJ4", 47 | "model_id": "granite-7b-lab", 48 | "choices": [ 49 | { 50 | "index": 0, 51 | "turns": [ 52 | "Fake answer", 53 | "Fake answer", 54 | ], 55 | } 56 | ], 57 | "tstamp": 1730816166.3719094, 58 | }, 59 | } 60 | } 61 | CHECK_DATA_EXAMPLE_REFERENCE_ANSWERS = { 62 | "merlinite-7b-lab": { 63 | 101: { 64 | "question_id": 101, 65 | "answer_id": "TFomieEmmAgdeCkvmuvwbc", 66 | "model_id": "gpt-4", 67 | "choices": [ 68 | { 69 | "index": 0, 70 | "turns": [ 71 | "Fake answer", 72 | "Fake answer", 73 | ], 74 | } 75 | ], 76 | "tstamp": 1686286924.844282, 77 | }, 78 | 102: { 79 | "question_id": 102, 80 | "answer_id": "hLH8WozvaB88bb5vV224H4", 81 | "model_id": "gpt-4", 82 | "choices": [ 83 | { 84 | "index": 0, 85 | "turns": [ 86 | "Fake answer", 87 | "Fake answer", 88 | ], 89 | } 90 | ], 91 | "tstamp": 1686286937.7164738, 92 | }, 93 | } 94 | } 95 | 96 | CHECK_DATA_EXAMPLE_MODELS = ["granite-7b-lab"] 97 | CHECK_DATA_EXAMPLE_JUDGES = { 98 | "default": Judge( 99 | 
model_name="merlinite-7b-lab", 100 | prompt_template={ 101 | "name": "single-v1", 102 | "type": "single", 103 | "system_prompt": "Fake prompt", 104 | "prompt_template": "Fake prompt", 105 | "description": "Prompt for general questions", 106 | "category": "general", 107 | "output_format": "[[rating]]", 108 | }, 109 | ref_based=False, 110 | multi_turn=False, 111 | ), 112 | "math": Judge( 113 | model_name="merlinite-7b-lab", 114 | prompt_template={ 115 | "name": "single-math-v1", 116 | "type": "single", 117 | "system_prompt": "Fake prompt", 118 | "prompt_template": "Fake prompt", 119 | "description": "Prompt for general questions", 120 | "category": "math", 121 | "output_format": "[[rating]]", 122 | }, 123 | ref_based=True, 124 | multi_turn=False, 125 | ), 126 | "default-mt": Judge( 127 | model_name="merlinite-7b-lab", 128 | prompt_template={ 129 | "name": "single-v1-multi-turn", 130 | "type": "single", 131 | "system_prompt": "Fake prompt", 132 | "prompt_template": "Fake prompt", 133 | "description": "Prompt for general questions", 134 | "category": "general", 135 | "output_format": "[[rating]]", 136 | }, 137 | ref_based=False, 138 | multi_turn=True, 139 | ), 140 | "math-mt": Judge( 141 | model_name="merlinite-7b-lab", 142 | prompt_template={ 143 | "name": "single-math-v1-multi-turn", 144 | "type": "single", 145 | "system_prompt": "Fake prompt", 146 | "prompt_template": "Fake prompt", 147 | "description": "Prompt for general questions", 148 | "category": "math", 149 | "output_format": "[[rating]]", 150 | }, 151 | ref_based=True, 152 | multi_turn=True, 153 | ), 154 | } 155 | 156 | 157 | def test_check_data(): 158 | check_data( 159 | CHECK_DATA_EXAMPLE_QUESTIONS, 160 | CHECK_DATA_EXAMPLE_MODEL_ANSWERS, 161 | CHECK_DATA_EXAMPLE_REFERENCE_ANSWERS, 162 | CHECK_DATA_EXAMPLE_MODELS, 163 | CHECK_DATA_EXAMPLE_JUDGES, 164 | ) 165 | 166 | try: 167 | check_data( 168 | CHECK_DATA_EXAMPLE_QUESTIONS, 169 | {"granite-7b-lab": {}}, 170 | CHECK_DATA_EXAMPLE_REFERENCE_ANSWERS, 171 | CHECK_DATA_EXAMPLE_MODELS, 172 | CHECK_DATA_EXAMPLE_JUDGES, 173 | ) 174 | except Exception as e: 175 | assert "Missing model granite-7b-lab's answer to Question" in str(e) 176 | else: 177 | assert False, "Didn't fail with missing model answer" 178 | 179 | try: 180 | check_data( 181 | CHECK_DATA_EXAMPLE_QUESTIONS, 182 | CHECK_DATA_EXAMPLE_MODEL_ANSWERS, 183 | {"merlinite-7b-lab": {}}, 184 | CHECK_DATA_EXAMPLE_MODELS, 185 | CHECK_DATA_EXAMPLE_JUDGES, 186 | ) 187 | except Exception as e: 188 | assert "Missing reference answer to Question" in str(e) 189 | else: 190 | assert False, "Didn't fail with missing reference answer" 191 | -------------------------------------------------------------------------------- /tests/test_mt_bench_judgment.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | # Standard 4 | import os 5 | 6 | # First Party 7 | from instructlab.eval.mt_bench_common import Judge 8 | from instructlab.eval.mt_bench_judgment import load_judge_prompts, make_judge_single 9 | 10 | 11 | def test_make_judge_single(): 12 | judge_file = os.path.join( 13 | os.path.dirname(__file__), 14 | "..", 15 | "src", 16 | "instructlab", 17 | "eval", 18 | "data", 19 | "mt_bench", 20 | "judge_prompts.jsonl", 21 | ) 22 | judge_prompts = load_judge_prompts(judge_file) 23 | judges = make_judge_single("prometheus-8x7b-v2-0", judge_prompts) 24 | assert len(judges) == 4 25 | assert isinstance(judges["default"], Judge) 26 | assert isinstance(judges["math"], Judge) 27 | assert 
judges["math"].ref_based 28 | assert isinstance(judges["default-mt"], Judge) 29 | assert judges["default-mt"].multi_turn 30 | assert isinstance(judges["math-mt"], Judge) 31 | assert judges["math-mt"].ref_based 32 | assert judges["math-mt"].multi_turn 33 | -------------------------------------------------------------------------------- /tests/test_mt_bench_model_adapter.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | # Third Party 4 | import pytest 5 | 6 | # First Party 7 | from instructlab.eval.mt_bench_model_adapter import ( 8 | GraniteAdapter, 9 | MistralAdapter, 10 | get_conversation_template, 11 | get_model_adapter, 12 | ) 13 | 14 | MISTRAL_DEFAULT_MODEL_NAME = "mistral" 15 | EXAMPLE_MISTRAL_MODEL_PATHS = [ 16 | "mistral", 17 | "mistralai/Mixtral-8x7B-Instruct-v0.1", 18 | "/cache/instructlab/models/mistral-7b-instruct-v0.2.Q4_K_M.gguf", 19 | "prometheus-eval/prometheus-8x7b-v2.0", 20 | "/cache/instructlab/models/prometheus-eval/prometheus-8x7b-v2.0", 21 | ] 22 | 23 | GRANITE_DEFAULT_MODEL_NAME = "granite" 24 | EXAMPLE_GRANITE_MODEL_PATHS = [ 25 | "granite", 26 | "instructlab/granite-7b-lab", 27 | "/cache/instructlab/models/instructlab/granite-7b-lab.gguf", 28 | "instructlab/granite-8b-lab", 29 | ] 30 | 31 | TEST_TUPLES = [ 32 | ( 33 | MISTRAL_DEFAULT_MODEL_NAME, 34 | EXAMPLE_MISTRAL_MODEL_PATHS, 35 | MistralAdapter, 36 | MISTRAL_DEFAULT_MODEL_NAME, 37 | ), 38 | ( 39 | GRANITE_DEFAULT_MODEL_NAME, 40 | EXAMPLE_GRANITE_MODEL_PATHS, 41 | GraniteAdapter, 42 | "ibm-generic", 43 | ), 44 | ] 45 | 46 | 47 | def test_get_model_adapter(): 48 | for model, model_paths, adapter, _ in TEST_TUPLES: 49 | for model_path in model_paths: 50 | assert isinstance(get_model_adapter(model_path, model), adapter) 51 | 52 | # Test default adapter overrides as expected 53 | assert isinstance(get_model_adapter("", MISTRAL_DEFAULT_MODEL_NAME), MistralAdapter) 54 | 55 | 56 | def test_get_model_adapter_not_found(): 57 | with pytest.raises(ValueError): 58 | get_model_adapter("unknown", "unknown") 59 | 60 | 61 | def test_get_conversation_template(): 62 | for model, model_paths, _, conv_template_name in TEST_TUPLES: 63 | for model_path in model_paths: 64 | assert ( 65 | conv_template_name == get_conversation_template(model_path, model).name 66 | ) 67 | -------------------------------------------------------------------------------- /tests/test_project.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # Standard 3 | from importlib.metadata import entry_points 4 | 5 | # First Party 6 | from instructlab.eval.evaluator import Evaluator 7 | from instructlab.eval.leaderboard import LeaderboardV2Evaluator 8 | from instructlab.eval.mmlu import MMLUBranchEvaluator, MMLUEvaluator 9 | from instructlab.eval.mt_bench import MTBenchBranchEvaluator, MTBenchEvaluator 10 | from instructlab.eval.ruler import RulerEvaluator 11 | 12 | 13 | def test_evaluator_eps(): 14 | expected = { 15 | "mmlu": MMLUEvaluator, 16 | "mmlu_branch": MMLUBranchEvaluator, 17 | "mt_bench": MTBenchEvaluator, 18 | "mt_bench_branch": MTBenchBranchEvaluator, 19 | "leaderboard_v2": LeaderboardV2Evaluator, 20 | "ruler": RulerEvaluator, 21 | } 22 | eps = entry_points(group="instructlab.eval.evaluator") 23 | found = {} 24 | for ep in eps: 25 | # different project 26 | if not ep.module.startswith("instructlab.eval"): 27 | continue 28 | evaluator = ep.load() 29 | assert issubclass(evaluator, Evaluator) 30 | 
assert evaluator.name == ep.name 31 | found[ep.name] = evaluator 32 | 33 | assert found == expected 34 | -------------------------------------------------------------------------------- /tests/test_ragas.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | # Standard 3 | from pathlib import Path 4 | from unittest.mock import MagicMock, patch 5 | import unittest 6 | 7 | # Third Party 8 | from pandas import DataFrame 9 | from ragas.callbacks import ChainRun 10 | from ragas.dataset_schema import EvaluationDataset, EvaluationResult 11 | 12 | # First Party 13 | from instructlab.eval.ragas import ModelConfig, RagasEvaluator, RunConfig 14 | 15 | 16 | class TestRagasEvaluator(unittest.TestCase): 17 | def setUp(self): 18 | # Common setup data for all tests 19 | self.student_model_response = "Paris" 20 | self.user_question = "What is the capital of France?" 21 | self.golden_answer = "The capital of France is Paris." 22 | self.metric = "mocked-metric" 23 | self.metric_score = 4.0 24 | self.base_ds = [ 25 | { 26 | "user_input": self.user_question, 27 | "reference": self.golden_answer, 28 | } 29 | ] 30 | self.student_model = ModelConfig( 31 | model_name="super-jeeves-8x700B", 32 | ) 33 | self.run_config = RunConfig(max_retries=3, max_wait=60, seed=42, timeout=30) 34 | 35 | @patch("instructlab.eval.ragas.ChatOpenAI") 36 | @patch("instructlab.eval.ragas.evaluate") 37 | @patch.object(RagasEvaluator, "_generate_answers_from_model") 38 | @patch.object(RagasEvaluator, "_get_metrics") 39 | def test_run_with_dataset( 40 | self, 41 | mock_get_metrics: MagicMock, 42 | mock_generate_answers_from_model: MagicMock, 43 | mock_evaluate: MagicMock, 44 | mock_ChatOpenAI: MagicMock, 45 | ): 46 | """ 47 | Test case 1: Directly passing a Python list/dict dataset to `RagasEvaluator.run()`. 
48 | """ 49 | # Prepare mocks 50 | mock_get_metrics.return_value = [self.metric] 51 | interim_df = DataFrame( 52 | { 53 | "user_input": [self.user_question], 54 | "response": [self.student_model_response], 55 | "reference": [self.golden_answer], 56 | } 57 | ) 58 | mock_generate_answers_from_model.return_value = interim_df 59 | mocked_evaluation_ds = EvaluationDataset.from_pandas(interim_df) 60 | _unimportant_ragas_traces = { 61 | "default": ChainRun( 62 | run_id="42", 63 | parent_run_id=None, 64 | name="root", 65 | inputs={"system": "null", "user": "null"}, 66 | outputs={"assistant": "null"}, 67 | metadata={"user_id": 1337}, 68 | ) 69 | } 70 | mock_evaluate.return_value = EvaluationResult( 71 | scores=[{self.metric: self.metric_score}], 72 | dataset=mocked_evaluation_ds, 73 | ragas_traces=_unimportant_ragas_traces, 74 | ) 75 | 76 | # Instantiate evaluator 77 | evaluator = RagasEvaluator() 78 | 79 | # Run test 80 | result = evaluator.run( 81 | dataset=self.base_ds, 82 | student_model=self.student_model, 83 | run_config=self.run_config, 84 | student_openai_client=MagicMock(), # We pass a mock client 85 | ) 86 | 87 | # Assertions 88 | self.assertIsInstance(result, EvaluationResult) 89 | mock_generate_answers_from_model.assert_called_once() 90 | mock_evaluate.assert_called_once() 91 | # we didn't provide an API key, so it expects to get `api_key=None` 92 | mock_ChatOpenAI.assert_called_once_with(model="gpt-4o", api_key=None) 93 | 94 | @patch("instructlab.eval.ragas.ChatOpenAI") 95 | @patch("instructlab.eval.ragas.read_json") 96 | @patch("instructlab.eval.ragas.evaluate") 97 | @patch.object(RagasEvaluator, "_generate_answers_from_model") 98 | @patch.object(RagasEvaluator, "_get_metrics") 99 | def test_run_with_dataset_via_path( 100 | self, 101 | mock_get_metrics: MagicMock, 102 | mock_generate_answers_from_model: MagicMock, 103 | mock_evaluate: MagicMock, 104 | mock_read_json: MagicMock, 105 | mock_ChatOpenAI: MagicMock, 106 | ): 107 | """ 108 | Test case 2: Passing a Path to a JSONL file (containing the dataset) to `RagasEvaluator.run()`. 
109 | """ 110 | # Prepare mocks 111 | mock_get_metrics.return_value = [self.metric] 112 | interim_df = DataFrame( 113 | { 114 | "user_input": [self.user_question], 115 | "response": [self.student_model_response], 116 | "reference": [self.golden_answer], 117 | } 118 | ) 119 | mock_generate_answers_from_model.return_value = interim_df 120 | mocked_evaluation_ds = EvaluationDataset.from_pandas(interim_df) 121 | _unimportant_ragas_traces = { 122 | "default": ChainRun( 123 | run_id="42", 124 | parent_run_id=None, 125 | name="root", 126 | inputs={"system": "null", "user": "null"}, 127 | outputs={"assistant": "null"}, 128 | metadata={"user_id": 1337}, 129 | ) 130 | } 131 | mock_evaluate.return_value = EvaluationResult( 132 | scores=[{self.metric: self.metric_score}], 133 | dataset=mocked_evaluation_ds, 134 | ragas_traces=_unimportant_ragas_traces, 135 | ) 136 | 137 | mock_read_json.return_value = DataFrame(self.base_ds) 138 | 139 | # Instantiate evaluator 140 | evaluator = RagasEvaluator() 141 | 142 | # Run test 143 | result = evaluator.run( 144 | dataset=Path("dummy_path.jsonl"), 145 | student_model=self.student_model, 146 | run_config=self.run_config, 147 | student_openai_client=MagicMock(), 148 | ) 149 | 150 | # Assertions 151 | self.assertIsInstance(result, EvaluationResult) 152 | mock_read_json.assert_called_once_with( 153 | Path("dummy_path.jsonl"), orient="records", lines=True 154 | ) 155 | mock_generate_answers_from_model.assert_called() 156 | mock_evaluate.assert_called() 157 | 158 | @patch("instructlab.eval.ragas.ChatOpenAI") 159 | @patch("instructlab.eval.ragas.read_json") 160 | @patch("instructlab.eval.ragas.evaluate") 161 | @patch.object(RagasEvaluator, "_generate_answers_from_model") 162 | @patch.object(RagasEvaluator, "_get_metrics") 163 | def test_run_with_instance_attributes( 164 | self, 165 | mock_get_metrics: MagicMock, 166 | mock_generate_answers_from_model: MagicMock, 167 | mock_evaluate: MagicMock, 168 | mock_read_json: MagicMock, 169 | mock_ChatOpenAI: MagicMock, 170 | ): 171 | """ 172 | Test case 3: Using `RagasEvaluator` instance attributes for `student_model`, `run_config`, 173 | and `student_openai_client` instead of passing them explicitly. 
174 | """ 175 | # Prepare mocks 176 | mock_get_metrics.return_value = [self.metric] 177 | interim_df = DataFrame( 178 | { 179 | "user_input": [self.user_question], 180 | "response": [self.student_model_response], 181 | "reference": [self.golden_answer], 182 | } 183 | ) 184 | mock_generate_answers_from_model.return_value = interim_df 185 | mocked_evaluation_ds = EvaluationDataset.from_pandas(interim_df) 186 | _unimportant_ragas_traces = { 187 | "default": ChainRun( 188 | run_id="42", 189 | parent_run_id=None, 190 | name="root", 191 | inputs={"system": "null", "user": "null"}, 192 | outputs={"assistant": "null"}, 193 | metadata={"user_id": 1337}, 194 | ) 195 | } 196 | mock_evaluate.return_value = EvaluationResult( 197 | scores=[{self.metric: self.metric_score}], 198 | dataset=mocked_evaluation_ds, 199 | ragas_traces=_unimportant_ragas_traces, 200 | ) 201 | 202 | mock_read_json.return_value = DataFrame(self.base_ds) 203 | 204 | # Instantiate evaluator with instance-level configs 205 | evaluator = RagasEvaluator( 206 | student_model=self.student_model, 207 | student_openai_client=MagicMock(), 208 | run_config=self.run_config, 209 | ) 210 | 211 | # Run test 212 | result = evaluator.run(dataset=Path("dummy_path.jsonl")) 213 | 214 | # Assertions 215 | self.assertIsInstance(result, EvaluationResult) 216 | mock_read_json.assert_called_with( 217 | Path("dummy_path.jsonl"), orient="records", lines=True 218 | ) 219 | mock_generate_answers_from_model.assert_called() 220 | mock_evaluate.assert_called() 221 | 222 | 223 | if __name__ == "__main__": 224 | unittest.main() 225 | -------------------------------------------------------------------------------- /tests/testdata/sdg/_default_template_yaml: -------------------------------------------------------------------------------- 1 | task: mmlu_pr 2 | dataset_path: json 3 | dataset_name: null 4 | test_split: test 5 | doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:" 6 | doc_to_choice: ["A", "B", "C", "D"] 7 | doc_to_target: answer 8 | output_type: multiple_choice 9 | metric_list: 10 | - metric: acc 11 | aggregation: mean 12 | higher_is_better: true 13 | -------------------------------------------------------------------------------- /tests/testdata/sdg/tonsil_task.yaml: -------------------------------------------------------------------------------- 1 | dataset_kwargs: 2 | data_files: 3 | test: tests/testdata/sdg/tonsil_data.jsonl 4 | group: mmlu_pr 5 | include: _default_template_yaml 6 | task: tonsils 7 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | [tox] 4 | # py3-unit runs unit tests with 'python3' 5 | # py311-unit runs the same tests with 'python3.11' 6 | envlist = ruff, lint, mypy, spellcheck, py3-{unit, functional} 7 | minversion = 4.4 8 | 9 | [testenv] 10 | description = run tests (unit, unitcov, functional) 11 | passenv = 12 | CMAKE_ARGS 13 | # Use PyTorch CPU build instead of CUDA build in test envs. CUDA dependencies 14 | # are huge. This reduces venv from 5.7 GB to 1.5 GB. 
15 | setenv = 16 | PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu 17 | CMAKE_ARGS={env:CMAKE_ARGS:-DLLAMA_NATIVE=off} 18 | ILAB_MAX_STABLE_VRAM_WAIT=0 19 | package = wheel 20 | wheel_build_env = pkg 21 | install_command = pip install \ 22 | --use-feature fast-deps \ 23 | -c constraints-dev.txt \ 24 | {opts} {packages} 25 | # equivalent to `pip install instructlab[cpu]` 26 | extras = 27 | cpu 28 | leaderboard 29 | deps = 30 | pytest 31 | pytest-asyncio 32 | pytest-cov 33 | pytest-html 34 | commands = 35 | unit: {envpython} -m pytest {posargs:tests} 36 | unitcov: {envpython} -W error::UserWarning -m pytest --cov=instructlab.eval --cov-report term --cov-report=html:coverage-{env_name} --cov-report=xml:coverage-{env_name}.xml --html=durations/{env_name}.html {posargs:tests -m "not (examples or slow)"} 37 | functional: ./scripts/functional-tests.sh 38 | allowlist_externals = 39 | functional: ./scripts/functional-tests.sh 40 | 41 | # format, check, and linting targets don't build and install the project to 42 | # speed up testing. 43 | [testenv:lint] 44 | description = lint with pylint 45 | skip_install = true 46 | skipsdist = true 47 | deps = -r requirements-dev.txt 48 | commands = 49 | {envpython} -m pylint --load-plugins pylint_pydantic src/instructlab/eval/ 50 | 51 | [testenv:fastlint] 52 | description = fast lint with pylint (without 3rd party modules) 53 | skip_install = true 54 | skipsdist = true 55 | deps = 56 | pylint 57 | pylint-pydantic 58 | commands = 59 | {envpython} -m pylint --load-plugins pylint_pydantic {posargs:--disable=import-error src/instructlab/eval/} 60 | 61 | [testenv:ruff] 62 | description = reformat and fix code with Ruff (and isort) 63 | skip_install = True 64 | skipsdist = true 65 | # keep in sync with .pre-commit-config.yaml 66 | deps = 67 | ruff 68 | isort 69 | # supports 'fix', 'check', or abitrary args to 'ruff' command 70 | commands = 71 | ./scripts/ruff.sh {posargs:fix} 72 | allowlist_externals = ./scripts/ruff.sh 73 | 74 | [testenv:spellcheck] 75 | description = spell check (needs 'aspell' command) 76 | skip_install = true 77 | skipsdist = true 78 | deps = 79 | pyspelling 80 | commands = 81 | sh -c 'command -v aspell || (echo "aspell is not installed. Please install it." && exit 1)' 82 | {envpython} -m pyspelling --config {toxinidir}/.spellcheck.yml --spellchecker aspell 83 | allowlist_externals = sh 84 | 85 | [testenv:mypy] 86 | description = Python type checking with mypy 87 | deps = 88 | mypy 89 | types-tqdm 90 | types-PyYAML 91 | pytest 92 | commands = 93 | mypy src 94 | 95 | [testenv:py3] 96 | basepython = python3.11 97 | 98 | [testenv:py3-unit] 99 | basepython = {[testenv:py3]basepython} 100 | 101 | [testenv:py3-functional] 102 | basepython = {[testenv:py3]basepython} 103 | passenv = 104 | {[testenv]passenv} 105 | TEST_DIR 106 | 107 | [gh] 108 | python = 109 | 3.11 = py311-{unitcov, functional} 110 | 111 | [testenv:constraints] 112 | description = Generate new constraints file(s) 113 | basepython = {[testenv:py3]basepython} 114 | skip_install = True 115 | skipsdist = true 116 | deps = 117 | uv==0.7.8 118 | commands = {posargs} 119 | allowlist_externals = * 120 | --------------------------------------------------------------------------------
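The note at the end of src/instructlab/eval/mt_bench_model_adapter.py says that registration order decides matching priority. A minimal sketch of what that means in practice, using only behavior that tests/test_mt_bench_model_adapter.py also exercises (it assumes the instructlab-eval package is importable in the current environment):

    # Sketch: adapter resolution follows registration order in
    # mt_bench_model_adapter.py; MistralAdapter is registered first, so a
    # Prometheus judge path matches it before any later adapter is tried.
    from instructlab.eval.mt_bench_model_adapter import (
        get_conversation_template,
        get_model_adapter,
    )

    adapter = get_model_adapter("prometheus-eval/prometheus-8x7b-v2.0", "mistral")
    print(type(adapter).__name__)  # MistralAdapter

    # Granite student models fall through to GraniteAdapter and use the
    # "ibm-generic" conversation template.
    conv = get_conversation_template("instructlab/granite-7b-lab", "granite")
    print(conv.name)  # ibm-generic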
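The comments in RulerEvaluator.process_lm_eval_results describe lm_eval results nested by RULER task and context length. A small sketch of that flattening, using a hypothetical payload whose key layout is only inferred from those comments (real lm_eval output may differ), and assuming the package and its lm_eval dependency are installed:

    # Sketch: a made-up nested payload in the shape described in
    # src/instructlab/eval/ruler.py, flattened into per-context-length
    # averages plus an overall "avg" score.
    from instructlab.eval.ruler import RulerEvaluator

    raw_results = {
        "results": {
            "ruler": {
                "niah_single_1": {"4096,none": 0.90, "8192,none": 0.80},
                "niah_single_2": {"4096,none": 0.70, "8192,none": 0.60},
            }
        }
    }

    scores = RulerEvaluator().process_lm_eval_results(raw_results=raw_results)
    # roughly {'4096': 0.8, '8192': 0.7, 'avg': 0.75}, up to float rounding
    print(scores)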
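The expected values in tests/test_mmlu.py are consistent with the overall MMLU score being the plain mean of the per-task accuracies in MMLU_EXAMPLE_OUTPUT; a quick arithmetic check of the 0.45 assertion:

    # Sketch: averaging the three mocked accuracies reproduces the asserted
    # overall score of 0.45 (rounded to two decimals).
    accs = [0.5592105263157895, 0.4444444444444444, 0.35]
    overall = sum(accs) / len(accs)
    print(round(overall, 2))  # 0.45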